class CNYES: link_url = 'http://news.cnyes.com/Ajax.aspx?Module=GetRollNews' def __init__(self): self.web = WEB() def fetchListDOM(self, datum): the_page = self.web.getRawData(self.link_url, datum) # 將網頁轉成結構化資料 parser = etree.XMLParser() root = etree.parse(StringIO.StringIO(the_page), parser) # 抓指定位置的連結 return root.xpath('.//Table1') def fetchContent(self, url): the_page = self.web.getRawData(url) parser = etree.HTMLParser() root = etree.parse(StringIO.StringIO(the_page), parser) contents = root.xpath('//*[@id="newsText"]//text()') content = [] for c in contents: part = c.strip() if part != "": content.append(part) info = root.xpath("//*[contains(@class, 'info')]")[0].text.strip() return {"content": content, "info": info} def fetchNews(self, data, target_dt, limit=0): rows = self.fetchListDOM(data) rows_cnt = len(rows) i = 0 news = [] for row in rows: try: datum = {} datum['Title'] = row.xpath('.//NEWSTITLE')[0].text.strip() datum['Link'] = 'http://news.cnyes.com' + row.xpath( './/SNewsSavePath')[0].text.strip() datum['ClassCN'] = row.xpath('.//ClassCName')[0].text.strip() datum['ClassEN'] = row.xpath('.//CLASSENAME')[0].text.strip() #datum['NewsTime'] = t.strftime("%Y-%m-%d ")+row.xpath('.//NewsTime')[0].text.strip() datum['NewsTime'] = datetime.datetime.strptime( target_dt.strftime("%Y-%m-%d ") + row.xpath('.//NewsTime')[0].text.strip(), "%Y-%m-%d %H:%M:%S") #datum['CreateDate'] = row.xpath('.//CreateDate')[0].text.strip().replace("T"," ").replace("+08:00","") datum['CreateDate'] = datetime.datetime.strptime( row.xpath('.//CreateDate')[0].text.strip().replace( "T", " ").replace("+08:00", ""), "%Y-%m-%d %H:%M:%S") news.append(datum) except AttributeError: print 'there exists a None object ' if limit > 0 and i >= limit: break return news
class CNYES: link_url = 'http://news.cnyes.com/Ajax.aspx?Module=GetRollNews' def __init__(self): self.web = WEB() def fetchListDOM(self,datum): the_page = self.web.getRawData(self.link_url,datum) # 將網頁轉成結構化資料 parser = etree.XMLParser() root = etree.parse(StringIO.StringIO(the_page),parser) # 抓指定位置的連結 return root.xpath('.//Table1') def fetchContent(self,url): the_page = self.web.getRawData(url) parser = etree.HTMLParser() root = etree.parse(StringIO.StringIO(the_page),parser) contents = root.xpath('//*[@id="newsText"]//text()') content = [] for c in contents: part = c.strip() if part != "": content.append(part) info = root.xpath("//*[contains(@class, 'info')]")[0].text.strip() return {"content":content,"info":info} def fetchNews(self,data,limit=0): rows = self.fetchListDOM(data) rows_cnt = len(rows) i = 0 news = [] for row in rows: #計數器 i = i+1 print "\r[%d/%d] (%.2f%%)"%(i,rows_cnt,float(i)/rows_cnt*100.0), datum ={} datum['Title'] = row.xpath('.//NEWSTITLE')[0].text.strip() datum['Link'] = 'http://news.cnyes.com'+row.xpath('.//SNewsSavePath')[0].text.strip() datum['ClassCN'] = row.xpath('.//ClassCName')[0].text.strip() datum['ClassEN'] = row.xpath('.//CLASSENAME')[0].text.strip() #datum['NewsTime'] = t.strftime("%Y-%m-%d ")+row.xpath('.//NewsTime')[0].text.strip() datum['NewsTime'] = datetime.datetime.strptime(t.strftime("%Y-%m-%d ")+row.xpath('.//NewsTime')[0].text.strip(),"%Y-%m-%d %H:%M:%S") #datum['CreateDate'] = row.xpath('.//CreateDate')[0].text.strip().replace("T"," ").replace("+08:00","") datum['CreateDate'] = datetime.datetime.strptime(row.xpath('.//CreateDate')[0].text.strip().replace("T"," ").replace("+08:00",""),"%Y-%m-%d %H:%M:%S") try: d = self.fetchContent(datum['Link']) datum['Content'],datum["Info"] = d["content"],d["info"] except: print "Error at getContent" news.append(datum) if limit>0 and i >=limit: break return news
def __init__(self, init_link):
    """Remember the crawler's entry URL and set up the HTTP helper."""
    self.init_link = init_link
    # Project-local HTTP helper shared by this object's fetch methods.
    self.web = WEB()
def __init__(self):
    # Create the HTTP helper used by this object's fetch methods.
    # NOTE(review): `WEB` is project-local and opaque here -- presumably a
    # request wrapper; confirm against its definition.
    self.web = WEB()
def __init__(self, init_link, diaster_type):
    """Store the entry URL and disaster category; create the HTTP helper.

    NOTE(review): `diaster_type` looks like a typo for `disaster_type`,
    but the parameter and attribute names may be read by external code,
    so they are kept as-is -- confirm before renaming.
    """
    self.diaster_type = diaster_type
    self.init_link = init_link
    # Project-local HTTP helper shared by this object's fetch methods.
    self.web = WEB()