Beispiel #1
0
class CNYES:
    """Scraper for the cnyes.com rolling-news AJAX feed.

    HTTP access goes through ``self.web`` (a ``WEB`` instance; ``WEB`` is
    not defined in this snippet) and parsing through ``etree``.
    """

    link_url = 'http://news.cnyes.com/Ajax.aspx?Module=GetRollNews'

    def __init__(self):
        self.web = WEB()

    def fetchListDOM(self, datum):
        """Fetch the news list and return its ``Table1`` nodes.

        ``datum`` is passed unchanged to ``self.web.getRawData`` together
        with ``link_url``; the response is parsed as XML.
        """
        the_page = self.web.getRawData(self.link_url, datum)
        # Turn the raw response into a structured document.
        parser = etree.XMLParser()
        root = etree.parse(StringIO.StringIO(the_page), parser)
        # Pick out the nodes at the expected location.
        return root.xpath('.//Table1')

    def fetchContent(self, url):
        """Download one article page and extract its text.

        Returns a dict with two keys:

        * ``"content"`` -- list of the non-empty, stripped text fragments
          found below the element whose id is ``newsText``;
        * ``"info"`` -- stripped text of the first element whose ``class``
          attribute contains ``info``.

        No error handling is done here: indexing ``[0]`` fails when no
        "info" element matches, and ``.strip()`` fails when it has no text.
        """
        the_page = self.web.getRawData(url)
        parser = etree.HTMLParser()
        root = etree.parse(StringIO.StringIO(the_page), parser)
        fragments = root.xpath('//*[@id="newsText"]//text()')
        stripped = (fragment.strip() for fragment in fragments)
        content = [part for part in stripped if part != ""]
        info = root.xpath("//*[contains(@class, 'info')]")[0].text.strip()
        return {"content": content, "info": info}

    def _field_text(self, row, tag):
        """Return the stripped text of the first ``tag`` node below ``row``."""
        return row.xpath('.//' + tag)[0].text.strip()

    def fetchNews(self, data, target_dt, limit=0):
        """Return the news list as a list of dicts.

        Parameters:
            data: passed unchanged to ``fetchListDOM``.
            target_dt: object with ``strftime``; its ``%Y-%m-%d`` date is
                combined with each row's ``NewsTime`` (``%H:%M:%S``).
            limit: maximum number of rows to process; ``0`` (the default)
                or a negative value means no limit.

        Each returned dict has the keys ``Title``, ``Link``, ``ClassCN``,
        ``ClassEN``, ``NewsTime`` and ``CreateDate``; the last two are
        ``datetime.datetime`` objects.  A row in which one of the fields
        has no text is reported on stdout and left out of the result.
        """
        rows = self.fetchListDOM(data)
        news = []
        # Count rows from 1 so that ``limit`` is honoured.  The counter used
        # to stay at 0 forever, which silently disabled the limit.
        for i, row in enumerate(rows, 1):
            try:
                datum = {}
                datum['Title'] = self._field_text(row, 'NEWSTITLE')
                datum['Link'] = ('http://news.cnyes.com' +
                                 self._field_text(row, 'SNewsSavePath'))
                datum['ClassCN'] = self._field_text(row, 'ClassCName')
                datum['ClassEN'] = self._field_text(row, 'CLASSENAME')
                # The feed only carries a time of day; the date part is
                # taken from ``target_dt``.
                datum['NewsTime'] = datetime.datetime.strptime(
                    target_dt.strftime("%Y-%m-%d ") +
                    self._field_text(row, 'NewsTime'),
                    "%Y-%m-%d %H:%M:%S")
                # Drop the ISO "T" separator and the fixed "+08:00" offset
                # so the value parses as a naive datetime.
                created = self._field_text(row, 'CreateDate')
                created = created.replace("T", " ").replace("+08:00", "")
                datum['CreateDate'] = datetime.datetime.strptime(
                    created, "%Y-%m-%d %H:%M:%S")
                news.append(datum)
            except AttributeError:
                # ``.text`` is None for a node without text, so ``.strip()``
                # raises AttributeError; skip such rows.
                print('there exists a None object ')

            if limit > 0 and i >= limit:
                break
        return news
Beispiel #2
0
class CNYES:
  """Scraper for the cnyes.com rolling-news AJAX feed.

  HTTP access goes through ``self.web`` (a ``WEB`` instance; ``WEB`` is
  not defined in this snippet) and parsing through ``etree``.
  """

  link_url = 'http://news.cnyes.com/Ajax.aspx?Module=GetRollNews'

  def __init__(self):
    self.web = WEB()

  def fetchListDOM(self, datum):
    """Fetch the news list and return its ``Table1`` nodes.

    ``datum`` is passed unchanged to ``self.web.getRawData`` together
    with ``link_url``; the response is parsed as XML.
    """
    the_page = self.web.getRawData(self.link_url, datum)
    # Turn the raw response into a structured document.
    parser = etree.XMLParser()
    root = etree.parse(StringIO.StringIO(the_page), parser)
    # Pick out the nodes at the expected location.
    return root.xpath('.//Table1')

  def fetchContent(self, url):
    """Download one article page and extract its text.

    Returns a dict with ``"content"`` (list of the non-empty, stripped
    text fragments below the element whose id is ``newsText``) and
    ``"info"`` (stripped text of the first element whose ``class``
    attribute contains ``info``).  Nothing is caught here; a page
    without a matching "info" element makes the ``[0]`` lookup fail.
    """
    the_page = self.web.getRawData(url)
    parser = etree.HTMLParser()
    root = etree.parse(StringIO.StringIO(the_page), parser)
    fragments = root.xpath('//*[@id="newsText"]//text()')
    stripped = (fragment.strip() for fragment in fragments)
    content = [part for part in stripped if part != ""]
    info = root.xpath("//*[contains(@class, 'info')]")[0].text.strip()
    return {"content": content, "info": info}

  def fetchNews(self, data, limit=0, target_dt=None):
    """Return the news list, including article bodies, as a list of dicts.

    Parameters:
      data: passed unchanged to ``fetchListDOM``.
      limit: maximum number of rows to process; ``0`` (the default) or a
        negative value means no limit.
      target_dt: object with ``strftime`` whose ``%Y-%m-%d`` date is
        combined with each row's ``NewsTime``.  When omitted, the
        module-level name ``t`` is used as before.
        NOTE(review): ``t`` is not defined in the visible source --
        confirm it exists, or always pass ``target_dt``.

    Each dict has the keys ``Title``, ``Link``, ``ClassCN``, ``ClassEN``,
    ``NewsTime`` and ``CreateDate`` (the last two as
    ``datetime.datetime``), plus ``Content`` and ``Info`` when the
    article page could be fetched and parsed.  Progress is written to
    stdout.
    """
    rows = self.fetchListDOM(data)
    rows_cnt = len(rows)
    news = []
    # Row counter starts at 1; it drives the progress line and ``limit``.
    for i, row in enumerate(rows, 1):
      # The trailing comma keeps the Python 2 print statement from
      # emitting a newline, so "\r" rewrites the progress line in place.
      print("\r[%d/%d] (%.2f%%)" % (i, rows_cnt, float(i) / rows_cnt * 100.0)),
      # Resolved per row so an empty list never touches the global ``t``.
      day = target_dt if target_dt is not None else t
      datum = {}
      datum['Title'] = row.xpath('.//NEWSTITLE')[0].text.strip()
      datum['Link'] = 'http://news.cnyes.com' + row.xpath('.//SNewsSavePath')[0].text.strip()
      datum['ClassCN'] = row.xpath('.//ClassCName')[0].text.strip()
      datum['ClassEN'] = row.xpath('.//CLASSENAME')[0].text.strip()
      # The feed only carries a time of day; the date part comes from ``day``.
      datum['NewsTime'] = datetime.datetime.strptime(
          day.strftime("%Y-%m-%d ") + row.xpath('.//NewsTime')[0].text.strip(),
          "%Y-%m-%d %H:%M:%S")
      # Drop the ISO "T" separator and the fixed "+08:00" offset so the
      # value parses as a naive datetime.
      created = row.xpath('.//CreateDate')[0].text.strip()
      created = created.replace("T", " ").replace("+08:00", "")
      datum['CreateDate'] = datetime.datetime.strptime(created, "%Y-%m-%d %H:%M:%S")
      try:
        d = self.fetchContent(datum['Link'])
        datum['Content'], datum["Info"] = d["content"], d["info"]
      except Exception:
        # Best effort: keep the list entry even if the article page fails.
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit still stop the crawl.
        print("Error at getContent")
      news.append(datum)
      if limit > 0 and i >= limit:
        break
    return news
Beispiel #3
0
 def __init__(self, init_link):
     """Create the ``WEB`` helper and remember the starting link.

     ``init_link`` is stored unchanged on ``self.init_link``; how it is
     used is not visible in this excerpt.
     """
     # WEB is not defined in this excerpt.
     self.web = WEB()
     self.init_link = init_link
Beispiel #4
0
 def __init__(self):
     """Create the ``WEB`` helper and keep it on ``self.web``."""
     # WEB is not defined in this excerpt.
     self.web = WEB()
Beispiel #5
0
 def __init__(self, init_link, diaster_type):
     """Create the ``WEB`` helper and store both arguments on the instance.

     ``init_link`` and ``diaster_type`` are kept unchanged as attributes
     of the same name; how they are used is not visible in this excerpt.
     NOTE(review): ``diaster_type`` is presumably a misspelling of
     "disaster_type" -- renaming it would change the public interface.
     """
     # WEB is not defined in this excerpt.
     self.web = WEB()
     self.init_link = init_link
     self.diaster_type = diaster_type
Beispiel #6
0
 def __init__(self):
   """Create the ``WEB`` helper and keep it on ``self.web``."""
   # WEB is not defined in this excerpt.
   self.web = WEB()