Ejemplo n.º 1
0
 def getContent(self,url):
     """Download a news article and return its body text with HTML tags removed.

     Two sites are recognised by substring match on ``url``: ``163.com``
     (NetEase News) and ``qq.com`` (Tencent News). The page bytes are decoded
     as GBK and re-encoded as UTF-8 before the site-specific body pattern is
     applied. The first captured body is passed through
     ``Toolkit.filterHtmlTag``; for 163.com the result is also stripped of
     surrounding whitespace.

     :param url: address of the article page.
     :return: the cleaned article text, or ``None`` when the URL belongs to
         neither site or the body pattern does not match the page.
     """
     # (domain marker, log label, body pattern, strip surrounding whitespace?)
     # Order matters: 163.com is tested first, as in an if/elif chain.
     site_rules = (
         ('163.com', '[+]网易新闻:',
          r'<div id="endText" class="end-text">([\s\S]*?)<div class="sharecommend-wrap clearfix">',
          True),
         ('qq.com', "[+]腾讯新闻:",
          r'<P align=center>([\s\S]*?)</div>',
          False),
     )
     for marker, label, pattern, strip_text in site_rules:
         if marker not in url:
             continue
         print(label + url)
         response = urllib.urlopen(url)
         try:
             raw = response.read()
         finally:
             # Release the connection even if the read fails.
             response.close()
         html = raw.decode('gbk').encode('utf-8')
         match = re.search(pattern, html)
         if match is None:
             # Recognised site but unexpected page layout: no fallback to other rules.
             return None
         # Clean once and reuse the value for both the log line and the return.
         text = Toolkit.filterHtmlTag(match.group(1))
         if strip_text:
             text = text.strip()
         print(text)
         return text
     return None
Ejemplo n.º 2
0
     print '[+] 网易国内新闻...数据插入:',len(results)
     for result in results:               
         title = result[1].strip()
         coverurl = Toolkit.getImageUrl(result[2])
         summary = Toolkit.filterHtmlTag(result[3]).strip()
         source = "网易国内新闻"
         timeStr = result[4]
         website = result[0]
         md5Str = hashlib.md5(website).hexdigest()
         try:
             cur.execute("insert ignore into news_domestic(title,summary,coverurl,time,source,website,md5) values('%s','%s','%s','%s','%s','%s','%s')" % (title,summary,coverurl,timeStr,source,website,md5Str))
         except Exception,e:
             print "[-] "+website+"插入失败",e
 elif index == 3:
      print '[+] 网易社会新闻...数据插入:',len(results)
      for result in results:               
         title = result[1].strip()
         coverurl = Toolkit.getImageUrl(result[2])
         summary = Toolkit.filterHtmlTag(result[3]).strip()
         source = "网易社会新闻"
         timeStr = result[4]
         website = result[0]
         md5Str = hashlib.md5(website).hexdigest()
         try:
             cur.execute("insert ignore into news_social(title,summary,coverurl,time,source,website,md5) values('%s','%s','%s','%s','%s','%s','%s')" % (title,summary,coverurl,timeStr,source,website,md5Str))
         except Exception,e:
             print "[-] "+website+"插入失败",e
 conn.commit();
 cur.close();
 conn.close()  
 print '[+] 网易科技新闻...结束'