def insertDb(self,index,results): print '[+] 新浪科技新闻...开始插入:',len(results) conn = MySQLdb.connect(host='localhost',user='******',passwd='',port=3306,charset='utf8') cur = conn.cursor() conn.select_db('newslab') if index == 1: for result in results: title = result[1] summery = result[3] coverurl = Toolkit.getImageUrl(result[2]) timeStr = Toolkit.timeStrConv(result[4]); source = u"新浪科技" website = result[0] md5Str = hashlib.md5(website).hexdigest() try: cur.execute("insert ignore into news(title,summary,coverurl,time,source,website,md5) values('%s','%s','%s','%s','%s','%s','%s')" % (title,summery,coverurl,timeStr,source,website,md5Str)) except Exception,e: print "[-] "+website+"插入失败",e
def getContent(self,url): if url.find('163.com') >= 0: ##网易新闻 print '[+]网易新闻:'+url html = urllib.urlopen(url).read().decode('gbk').encode('utf-8') netEaseReg = re.compile(r'<div id="endText" class="end-text">([\s\S]*?)<div class="sharecommend-wrap clearfix">'); result = netEaseReg.findall(html) if len(result)>0: print Toolkit.filterHtmlTag(result[0]).strip() return Toolkit.filterHtmlTag(result[0]).strip() elif url.find('qq.com') >= 0: print "[+]腾讯新闻:"+url html = urllib.urlopen(url).read().decode("gbk").encode("utf-8") tencentReg = re.compile(r'<P align=center>([\s\S]*?)</div>'); result = tencentReg.findall(html) if len(result)>0: print Toolkit.filterHtmlTag(result[0]) return Toolkit.filterHtmlTag(result[0])
# NOTE(review): fragment of an insertDb-style method for NetEase news — the
# enclosing `def` and the preceding `if index == ...:` branch lie outside this
# view. Rows are (website, title, cover_html, summary_html, time_string).
# NOTE(review): the SQL below is built by % string interpolation — breaks on
# quotes in titles and is SQL-injectable; should use parameterized queries.
        # Branch presumably guarded by `elif index == 2:` upstream — confirm.
        print '[+] 网易国内新闻...数据插入:',len(results)
        for result in results:
            title = result[1].strip()
            coverurl = Toolkit.getImageUrl(result[2])
            summary = Toolkit.filterHtmlTag(result[3]).strip()
            source = "网易国内新闻"
            timeStr = result[4]
            website = result[0]
            # md5 of the article URL acts as the dedup key for insert ignore.
            md5Str = hashlib.md5(website).hexdigest()
            try:
                cur.execute("insert ignore into news_domestic(title,summary,coverurl,time,source,website,md5) values('%s','%s','%s','%s','%s','%s','%s')" % (title,summary,coverurl,timeStr,source,website,md5Str))
            except Exception,e:
                # Best-effort: log the failing row and continue the loop.
                print "[-] "+website+"插入失败",e
    elif index == 3:
        # Same insert logic, but targeting the social-news table.
        print '[+] 网易社会新闻...数据插入:',len(results)
        for result in results:
            title = result[1].strip()
            coverurl = Toolkit.getImageUrl(result[2])
            summary = Toolkit.filterHtmlTag(result[3]).strip()
            source = "网易社会新闻"
            timeStr = result[4]
            website = result[0]
            md5Str = hashlib.md5(website).hexdigest()
            try:
                cur.execute("insert ignore into news_social(title,summary,coverurl,time,source,website,md5) values('%s','%s','%s','%s','%s','%s','%s')" % (title,summary,coverurl,timeStr,source,website,md5Str))
            except Exception,e:
                print "[-] "+website+"插入失败",e
    # Commit and release resources after all branches.
    conn.commit(); cur.close(); conn.close()
    # NOTE(review): log line says "科技" (tech) but this block handles
    # domestic/social news — likely a copy-paste leftover; verify.
    print '[+] 网易科技新闻...结束'
# NOTE(review): fragment of an insertDb-style method for Tencent news — the
# enclosing `def`, the connection setup, and the opening of this for-loop /
# `elif index == 2:` branch lie outside this view.
# Rows appear to be (cover_html, relative_url, title, summary_html) — confirm.
# NOTE(review): the SQL below is built by % string interpolation — breaks on
# quotes in titles and is SQL-injectable; should use parameterized queries.
            title = result[2]
            summary = Toolkit.filterHtmlTag(result[3])
            coverurl = str(Toolkit.getImageUrl(result[0]))
            source = u'腾讯国内'
            # result[1] is a site-relative path; prepend the site root.
            website = "http://news.qq.com"+result[1]
            # md5 of the full URL acts as the dedup key for insert ignore.
            md5Str = hashlib.md5(website).hexdigest()
            try:
                cur.execute("insert ignore into news_domestic(title,summary,coverurl,source,website,md5) values('%s','%s','%s','%s','%s','%s')" % (title,summary,coverurl,source,website,md5Str))
            except Exception,e:
                # Best-effort: log the failing row and continue the loop.
                print "[-] "+website+" "+title+summary+" 插入失败",e
        print '[+] 腾讯国内新闻...插入结束'
    elif index == 3:
        # Social-news branch: same logic, different target table.
        print '[+] 腾讯社会新闻...开始插入:',len(results)
        for result in results:
            title = result[2]
            summary = Toolkit.filterHtmlTag(result[3])
            coverurl = str(Toolkit.getImageUrl(result[0]))
            # NOTE(review): source is u'腾讯国内' (domestic) inside the
            # social-news branch — looks like a copy-paste bug; verify
            # whether it should be u'腾讯社会'.
            source = u'腾讯国内'
            website = "http://news.qq.com"+result[1]
            md5Str = hashlib.md5(website).hexdigest()
            try:
                cur.execute("insert ignore into news_social(title,summary,coverurl,source,website,md5) values('%s','%s','%s','%s','%s','%s')" % (title,summary,coverurl,source,website,md5Str))
            except Exception,e:
                print "[-] "+website+" "+title+summary+" 插入失败",e
        print '[+] 腾讯社会新闻...插入结束'
    # Commit and release resources after all branches.
    conn.commit(); cur.close(); conn.close()
    print '[+] 腾讯新闻...结束'