def __startToMix__(self):  # Each call generates one mixed article
    datahelper = DB()
    allHtml = datahelper.__randomP__()
    title, header, tail, id = datahelper.__randomHandT__()  # id falls back to 0 when nothing is found
    mixP = title + header + allHtml + tail
    if title != "" and mixP != "":
        datahelper.insertMixP(title, mixP)  # store the generated article
        datahelper.updateMixState(id)       # the helper commits this update itself
        return True
    else:
        print("Mixing failed: no url with a non-empty title was found")
        # datahelper.deleteUrl()
        return False
def job(self, name):  # Main scheduler: one child process per news source
    dbhelper = DB()
    # TODO: each child process should handle its own exceptions; later the parent may need
    # to supervise the children instead of relying on them never to crash.
    print("Crawling today's news")
    print('Process ID: %s, parent process ID: %s' % (os.getpid(), os.getppid()))
    p1 = multiprocessing.Process(target=worker_1, args=(6, ))
    p2 = multiprocessing.Process(target=worker_2, args=(3, ))  # Tencent
    p3 = multiprocessing.Process(target=worker_3, args=(4, ))
    p1.daemon = True
    p2.daemon = True
    p3.daemon = True
    # p1.start()
    p2.start()
    p3.start()
    print("The number of CPUs is: " + str(multiprocessing.cpu_count()))
    for p in multiprocessing.active_children():
        print("child p.name:" + p.name + "\tp.pid:" + str(p.pid))
    # p1 is never started, so joining it would raise an error; only join the running workers.
    p2.join()
    p3.join()
    # Note: once join() returns the children are gone, so active_children() would be empty here.
    print("Today's work is done!")
    print("all over!")
    print("Deduplicating...")
    dbhelper.quchong()  # run the de-duplication pass
    print("Waiting for tomorrow...")
    dbhelper.getAllTitle()
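# A possible approach to the TODO above, sketched but not wired in: wrap each worker so a
# crash inside a child process gets logged instead of dying silently. safeWorker is a made-up
# name; worker_2 / worker_3 are the targets already used in job().
def safeWorker(worker, *args):
    try:
        worker(*args)
    except Exception:
        import traceback
        traceback.print_exc()  # log the child's failure instead of letting it vanish

# usage sketch:
#   p2 = multiprocessing.Process(target=safeWorker, args=(worker_2, 3))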
def wangyiFill(wangyiUrls):
    datehelper = DB()
    flag = 1
    for url in wangyiUrls:
        flag += 1
        if flag % 50 == 0:  # rest after every 50 pages
            time.sleep(60 * 3)
            datehelper.refreshConnection()  # reopen the database connection after each sleep
        title, Hcontent, Tcontent, Acontent = fillWangyi.getPageContent(url[0])
        # an empty title usually means the page layout is not supported
        if title != "" and Hcontent != "":
            # updateContent only makes sense when the title is non-empty
            datehelper.updateContent(url[0], title, Hcontent, Tcontent, Acontent)
            mixNews = MixNews()
            state = mixNews.__startToMix__()
            if state:  # the mix routine already writes its result to the database
                datehelper.updateState(url[0])
def fenghuangFill(tengxunUrls):
    datehelper = DB()
    flag = 1
    for url in tengxunUrls:
        flag += 1
        if flag % 50 == 0:  # rest after every 50 pages opened
            time.sleep(60 * 3)
            datehelper.refreshConnection()  # reopen the database connection after each sleep
        title, Hcontent, Tcontent, Acontent = fillFenghaung.getPageContent(url[0])
        if title != "" and Hcontent != "":
            datehelper.updateContent(url[0], title, Hcontent, Tcontent, Acontent)
            mixNews = MixNews()
            state = mixNews.__startToMix__()
            if state:  # the mix routine already writes its result to the database
                datehelper.updateState(url[0])
def tengxunFill(tengxunUrls):
    datehelper = DB()
    flag = 1
    for url in tengxunUrls:
        print(url[0])
        flag += 1
        if flag % 50 == 0:  # rest after every 50 pages
            print("Going to sleep")
            time.sleep(60 * 2)
            # datehelper.refreshConnection()
        title, Hcontent, Tcontent, Acontent = fillTengxun.getPageContent(url[0])
        # fill in the content, then update the row
        if title != "" and Hcontent != "":
            datehelper.updateContent(url[0], title, Hcontent, Tcontent, Acontent)
            mixNews = MixNews()
            state = mixNews.__startToMix__()  # returns whether the mix was generated
            if state:  # the mix routine already writes its result to the database
                print("Mixed article generated successfully!")
                datehelper.updateState(url[0])
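# The three fill functions above differ only in the page helper they call; this is a sketch of a
# shared version under that assumption (fillFromSource is a made-up name and is not called anywhere).
def fillFromSource(urls, contentHelper, restEvery=50, restSeconds=120):
    datehelper = DB()
    for count, url in enumerate(urls, start=1):
        if count % restEvery == 0:  # rest periodically and reopen the connection
            time.sleep(restSeconds)
            datehelper.refreshConnection()
        title, Hcontent, Tcontent, Acontent = contentHelper.getPageContent(url[0])
        if title != "" and Hcontent != "":
            datehelper.updateContent(url[0], title, Hcontent, Tcontent, Acontent)
            if MixNews().__startToMix__():  # the mix routine writes to the database itself
                datehelper.updateState(url[0])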
class wangyiPageContent:
    def __init__(self):
        self.dbhelper = DB()

    def stripImgUrl(self, replacedSrc):
        # Flatten an image URL into a filesystem-safe name (same replacement order as before).
        for old, new in ((":", "_"), (":", "_"), (".", "_"), ("/", "_"), ("-", "_")):
            replacedSrc = replacedSrc.replace(old, new)
        replacedSrc = replacedSrc.split("?")[0]  # drop any remaining query string
        for old, new in (("?", "_"), ("!", "_"), ("\"", "_"), (" ", ""),
                         ("“", ""), ("”", ""), (":", ""), ("|", "_")):
            replacedSrc = replacedSrc.replace(old, new)
        return replacedSrc

    def fixCssdot(self, pContent):
        if pContent.find("'") != -1:  # single quotes usually come from inline styles
            return pContent.replace("'", '"')
        elif pContent.find("\n") != -1:  # the original truthiness check misfired when "\n" was absent
            return pContent.replace("\n", "")
        else:
            return pContent
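    # Illustration only (the url below is invented): stripImgUrl flattens an image url into a
    # name that is safe to use as a local file name.
    #   wangyiPageContent().stripImgUrl("https://nimg.ws.126.net/photo/a-b.jpg?imageView")
    #   -> "https___nimg_ws_126_net_photo_a_b_jpg"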
    # ------------------------------ new implementation ------------------------------
    def getNewsContent(self, url):  # handles both photo-gallery pages and normal articles
        title, Hcontent, Tcontent, Acontent = "", "", "", ""
        allP = []          # every kept paragraph, in order
        simplePList = []   # the middle paragraphs
        downloadTool = Download(r'/home/default/images')  # local image download path
        cooker = makeBS()
        soup = cooker.makesoup(url)  # may already be None
        if soup == None:
            print("Got an empty page for url ", url)
            return title, Hcontent, Tcontent, Acontent
        try:
            title = soup.find("head")  # the <head> element; its text carries the title
        except Exception as e:
            print(e)
            traceback.print_exc()
        if title != None:
            title = title.text.split("_")[0]
            print(title)
            if self.dbhelper.ifExists(title):
                return title, Hcontent, Tcontent, Acontent  # already stored: skip parsing and image downloads
        else:
            print("No title found, please check the url " + url)
            return title, Hcontent, Tcontent, Acontent  # give back empty results

        if url.find("photoview") != -1:  # ------ photo-gallery pages ------
            print("This is a photo-gallery page")
            for span in soup.find_all("img", attrs={"src": True}):
                print(span)
            # galleries are skipped; return the empty tuple so callers can still unpack it
            return title, Hcontent, Tcontent, Acontent
        else:  # ------ ordinary text-and-image news pages ------
            pList = soup.find_all("p")
            for p in pList:
                if p.img != None:  # paragraph containing an image
                    try:
                        imgSrc = p.img['src']
                        imgName = imgSrc.replace("https://inews.gtimg.com/", "").replace("/", "_")
                        now = time.strftime('%Y%m%d', time.localtime(time.time()))
                        now_date = now + "/"  # the downloaded file name itself must not contain the slash
                        imgName = now_date + self.stripImgUrl(imgName)  # date prefix added here
                        # download the image to the configured path on the server
                        downloadTool.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now)
                        imgPScr = "/images/" + imgName + ".jpg"  # the src used in the stored html
                        HtmlImg = '<p><img src="' + imgPScr + '"/></p>'
                        allP.append(HtmlImg)
                    except Exception as e:
                        print(e)
                        print("Could not read the image address")
                        print(p)
                else:  # plain text paragraph
                    if p.text != "用微信扫码二维码" and p.text != "分享至好友和朋友圈" and p.text != "" and p.text != "\n":
                        if p.a == None:  # drop link-only paragraphs; keep the text without styling
                            allP.append('<p>' + p.text + '</p>')

            # finally decide which paragraphs become the head and the tail
            for p in allP:
                if len(allP) >= 2:
                    Acontent = Acontent + str(p)
                    if allP.index(p) == 0:  # first paragraph -> head
                        Hcontent = p
                    elif allP.index(p) == len(allP) - 1:  # last paragraph -> tail
                        Tcontent = p
                    else:  # everything in between goes into the loose paragraphs
                        simplePList.append(p)
                else:  # the whole article is a single paragraph
                    Acontent = Acontent + str(p)
                    try:
                        Tcontent = "<p>" + p.split("。")[-2] + "</p>"  # last full sentence becomes the tail
                    except Exception as e:
                        Tcontent = "<p>" + p.split("。")[0] + "</p>"   # a one-sentence article: head and tail coincide
                    Hcontent = "<p>" + p.split("。")[0] + "</p>"       # first sentence becomes the head
                    simplePList.append(p)

        return self.fixCssdot(title), self.fixCssdot(Hcontent), self.fixCssdot(
            Tcontent), self.fixCssdot(Acontent)
# print("合成的html文件在这儿") mixP = title+header+allHtml+tail # print("本文总长"+str(len(new_news))) # print(title+header+allHtml+tail) # f= open(r"D:\pyfile\ProxySpider\newMission\new.html","w") # f.write(new_news) ##多媒体文件要是用conctent哦! # f.close()/ # print("生成的标题是 "+title ) # print("生成的内容是 "+mixP) if(title!="" and mixP !=""): datahelper.insertMixP(title,mixP) #生成成功的话就是这样 datahelper.updateMixState(id) #这儿无法更新的 ,这儿自带更新了 # print("生成成功") return True else: print("生成混合失败,没有找到title不为空的url") # datahelper.deleteUrl() return False # print(len(datahelper.__query__("select * from tengxun where title!='';"))) if __name__ == "__main__": #这个就是url的东西 dbhelper = DB() # ddd = MixNews() # ddd.__startToMix__() title, header, tail, id = dbhelper.__randomHandT__() print(title)
def getEveryDayWangyi(self): dbhelper = DB() dateurl = DateUrl() oneContent = wangyiPageContent() print("共提取到新闻url的数量有") now_date = (date.today() + timedelta(days=-1)).strftime( "%Y-%m-%d") # 昨天日期 print(now_date) # print(dateurl.getOneDayNewUrl("2018-07-03")) #1.页面新闻url写入数据库 dateurl.getRollUrlList( now_date) # 1.这个就是当天的,和凤凰一样,老样子啊,获得了链接后直接可以写入数据库中去了 todayNewUrl = dbhelper.__query__( "select url from tengxun where urlState='False' and fromWhere='wangyi';" ) # 只要数据库中取出需要读取的url # print(type(todayNewUrl)) print(len(todayNewUrl)) #这个才是打开来的东西 urlNumer = len(todayNewUrl) print("正在打开网易的新闻的东西") print(todayNewUrl) # print("正在打开网易的新闻的东西") # print(todayNewUrl) # for newUrl in todayNewUrl: # 2.然后把内容段落写入数据库 # title, Hcontent, Tcontent, Acontent = oneContent.getPageContent(newUrl) # if (title != "网易没找到标题" and title != None and Hcontent != ""): # 有内容的时候就更新这条数据 # dbhelper.updateContent(newUrl, title, Hcontent, Tcontent, Acontent) # # print("正在生成新混合新闻。。。") # 3. 然后是把页面页写入数据库,再然后是随机生成相同数量的 # mixNews = MixNews() # mixNews.__startToMix__() # 调用一次就执行一次,可以修改返回的状态 # # else: # print("更新失败,标题提取失败,为空") count = 1 #计数,每100个就休息1分钟 for dic in todayNewUrl: url = dic['url'] #2.把写入数据库的这几个新闻url的内容提取出来 if count % 200 == 0: #突然的中断应该是因为这边连接不上那儿,所以应该问题不大,每两百条休息一分钟 time.sleep(60) count += 1 #这儿的url是未转换成xw。电脑原版页面的url,所以,存的是这种url title, Hcontent, Tcontent, Acontent = oneContent.getNewsContent( url) #这儿漏了更新到url中去 ,自动转换成xw的然后再下载 time.sleep(1) # print(title, Hcontent, Tcontent, Acontent) if (title != "腾讯没找到标题" and title != None and Hcontent != ""): #有内容的时候就更新这条数据 # print("要更新的url是 "+url) resultState = dbhelper.updateContent(url, title, Hcontent, Tcontent, Acontent) #要删除的是更新失败的那个 if resultState == False: #更新成功 print("更新失败,正在删除这个重复的url") print(url) # dbhelper.deleteUrl(url) # 按url把这条记录删除掉咯,生成失败也不需要删除这个拉, print() else: # print("正在生成新混合新闻。。。") # 3. 然后是把页面页写入数据库,再然后是随机生成相同数量的 mixNews = MixNews() if mixNews.__startToMix__() != True: # 调用一次就执行一次,可以修改返回的状态 print("生成失败,已经没有刚填满的未用过的文章了") print(url) dbhelper.deleteUrl( url) # 如何这个内容为空也要删除,(可能前面一个步骤更新的时候发现相同的标题,所以插入不了), # print() else: print(True) else: print("打开页面提取失败,可能是页面为404网易,正在删除这条url " + url) #为空的话,那么就删除这条把 dbhelper.deleteUrl( url) #按url把这条记录删除掉咯 todo don't delete it first
def getPageContent(self, url): #输入一个url获得这个页面的本地化后的文章 ,其实分这个应该是不需要那么麻烦的把,,,, t = time.time() timeStamp = str(int(round(t * 1000))) # 毫秒级时间戳 time.sleep(1) downhelp = Download(r'/home/default/images') #设置下载路径 dbhelper = DB() title, Hcontent, Tcontent, Acontent = "", "", "", "" simpleP = [] soup = None try: soup = makeBS().makesoup(url) # print(soup) title = soup.find("head").title # print(chardet.detect(title.text)) 测不准的这个东西 # print(title) if title != None: title = title.text print("标题是" + str(title)) main_content = soup.find("div", attrs={"id": "main_content"}) flag = 1 if (main_content != None): allPP = main_content.find_all("p") for p in range(0, len(allPP)): # print() # print(allPP[p]) # print(p) if allPP[p].find_all("a") != None: # print("找到了有链接的东西") # print(allPP[p]) allPP[p].a.extract() # print(allPP[p]) localImgList = allPP[p].find_all( "img", attrs={"src": True}) # 每个p标签内的img提取和修改链接本地化 if (localImgList != None): # 找到有的话就遍历,并且本地化还有修改src的东西 for img in localImgList: if img != None: # print("发现图片") # print(img) # print(img['src']) if (img['src'].find("//") == 0 ): # 什么都没有,协议路径改成https imgSrc = "https:" + img['src'] # print(imgSrc) now = time.strftime( '%Y%m%d', time.localtime(time.time())) now_date = now + "/" # 后面下载的文件名是不需要带杠的,后面就不需要带杠杠 imgName = now_date + self.stripImgUrl( imgSrc) # print("文件名是" + imgName) # print(imgName) downhelp.downloadImg( imgSrc, imgName=imgName, referer=None, now_date=now) # 下载这个是没问题的 img['src'] = "/images/" + imgName + ".jpg" #修改完后的img # print(img['src']) simpleP.append(allPP[p]) Acontent += str(allPP[p]) # Acontent += str(allcontent[i]) elif (img['src'].find("https:") == 0 ): # 本来就有找到有https协议 imgSrc = img['src'] # print(imgSrc) now = time.strftime( '%Y%m%d', time.localtime(time.time())) now_date = now + "/" # 后面下载的文件名是不需要带杠的,后面就不需要带杠杠 imgName = now_date + self.stripImgUrl( imgSrc) downhelp.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now) img['src'] = "/images/" + imgName + ".jpg" # print(img['src']) simpleP.append(allPP[p]) Acontent += str(allPP[p]) else: # 那这个就是http协议了 imgSrc = img['src'] # print(imgName) now = time.strftime( '%Y%m%d', time.localtime(time.time())) now_date = now + "/" # 后面下载的文件名是不需要带杠的,后面就不需要带杠杠 imgName = now_date + self.stripImgUrl( imgSrc) downhelp.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now) img['src'] = "/images/" + imgName + ".jpg" # print(img['src']) simpleP.append(allPP[p]) Acontent += str(allPP[p]) if (p == 0): #这儿是判断是首段还是尾段 Hcontent = allPP[p] #这个是找标题的,这个是修改后的了 elif (p == len(allPP) - 1): print("找到尾段了拉") Tcontent = allPP[p] # for p in simpleP: # dbhelper.insertSimpleP(p) #这儿这个是一样的 todo 记得改回来这儿的地方 return title, Hcontent, Tcontent, Acontent else: title = "凤凰没有找到标题" return title, Hcontent, Tcontent, Acontent except Exception as e: print(e) print("现在网页是:" + url) return title, Hcontent, Tcontent, Acontent
def getEveryFenghuang(self): dbhelper = DB() dateurl = fenghuangDateUrls() oneContent = fenghuangPageContent() print("共提取到新闻url的数量有") now_date = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d") # 昨天日期 print(now_date) # print(dateurl.getOneDayNewUrl("2018-07-03")) #1.页面新闻url写入数据库 todayNewUrl = dateurl.getUrlLists(now_date) #1.这个就是当天的 urlNumer = len(todayNewUrl) todayNewUrl = dbhelper.__query__("select url from tengxun where urlState='False' and fromWhere='fenghuang'") #只要数据库中未填补内容的url print(type(todayNewUrl)) print(len(todayNewUrl)) # for dic in todayNewUrl: # dic['url'] print("") # 这儿才是把东西提取出来 count = 1 #计数,每100个就休息1分钟 print(todayNewUrl) flagNumber = 1 mixNumber = 0 for dic in todayNewUrl: newUrl = dic['url'] #2.把写入数据库的这几个新闻url的内容提取出来 if newUrl.find("pl.ifeng.com")!=-1: title, Hcontent, Tcontent, Acontent = oneContent.getPlContent(newUrl) if (title != "凤凰没有找到标题" and title != None and Hcontent != ""): # 有内容的时候就更新这条数据 dbhelper.updateContent(newUrl, title, Hcontent, Tcontent, Acontent) print("正在生成新混合新闻。。。") # 3. 然后是把页面页写入数据库,再然后是随机生成相同数量的 mixNews = MixNews() if mixNews.__startToMix__() != True: # 调用一次就执行一次,可以修改返回的状态 print("生成失败,已经没有刚填满的未用过的文章了") print(newUrl) dbhelper.deleteUrl(newUrl) # 如何这个内容为空也要删除,(可能前面一个步骤更新的时候发现相同的标题,所以插入不了), else: mixNumber+=1 #成功就生成一个累加 else: print("更新失败,标题提取失败,为空") dbhelper.deleteUrl(newUrl) # 按url把这条记录删除掉咯 else: #这个就是默认的那个新闻news.ifeng.com title, Hcontent, Tcontent, Acontent =oneContent.getNewsContent(newUrl) if (title != "凤凰没有找到标题" and title != None and Hcontent != ""): # 有内容的时候就更新这条数据 dbhelper.updateContent(newUrl, title, Hcontent, Tcontent, Acontent) print("正在生成新混合新闻。。。") # 3. 然后是把页面页写入数据库,再然后是随机生成相同数量的 mixNews = MixNews() if mixNews.__startToMix__() != True: # 调用一次就执行一次,可以修改返回的状态 print("生成失败,已经没有刚填满的未用过的文章了") print(newUrl) dbhelper.deleteUrl(newUrl) # 如何这个内容为空也要删除,(可能前面一个步骤更新的时候发现相同的标题,所以插入不了), else: mixNumber+=1 #成功就生成一个累加 else: print("更新失败,标题提取失败,为空") dbhelper.deleteUrl(newUrl) # 按url把这条记录删除掉咯 print("目前生成了 共有那么多个混合的新闻 "+str(mixNumber)) #普遍存在
#coding=utf-8
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from DBcontrol import DB

d = path.dirname(__file__)

# Read the whole text.
# text = open(path.join(d, 'test.text'), encoding='utf-8').read()
chak = DB()
allTogether = chak.getAllTitle()

wordlist = jieba.cut(allTogether, cut_all=True)  # full-mode segmentation of every stored title
wl = " ".join(wordlist)
print(wl)  # the segmented text

coloring = np.array(Image.open(path.join(d, "test.jpg")))

# Optional stop words
# stopwords = set(STOPWORDS)
# stopwords.add("said")
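# A sketch of the remaining steps, assuming test.jpg is meant to be the shape/colour template;
# on some systems WordCloud also needs font_path pointed at a CJK font (e.g. msyh.ttc) to
# render the segmented Chinese words.
wc = WordCloud(background_color="white", max_words=2000, mask=coloring,
               max_font_size=60, random_state=42)
wc.generate(wl)  # build the cloud from the segmented titles
image_colors = ImageColorGenerator(coloring)  # take the colours from the template image
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.show()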
class DateUrl: def __init__(self): self.dbhelper = DB() #默认就给你创建好了, #这个是按天来的,组合一下就行了,获取滚动列表的新闻url def getDateUrlList(self, startDate, endDate): #返回这两个日期区间的url,顺便就写入数据库了 urlList = [] timehelper = TimeHelper() datelist = [] if (startDate != endDate): #不相等的时候就算差值 datelist = timehelper.getTimeList(startDate, endDate) else: datelist.append(startDate) for oneDay in datelist: time.sleep(1.5) #500毫秒一次,那我设置成800毫秒请求一次 onedatelist = [] try: onedatelist = self.getOneDayNewUrl(oneDay) except Exception: time.sleep(30) onedatelist = self.getOneDayNewUrl(oneDay) urlList = urlList + onedatelist # todo 这样并不好,耦合性太高了,不方便平时的调试排错,融合进去了这些东西 # self.saveListToMysql(onedatelist,oneDay,"tengxun") #存到数据库里面去,把每个都插入进去 return urlList def getOneDayNewUrl(self, date): date = parse.quote_plus("" + date) oneDayUrlList = [] print(str(date)) # date = "2018-07-26" appid = "3639833dae924cb9efb6ba30e6c5a6fa" url = "https://api.shenjian.io/?appid=" + appid + "&date=" + date # print(url) request = urllib.request.Request(url, headers={ "Accept-Encoding": "gzip", }) response = urllib.request.urlopen(request) gzipFile = gzip.GzipFile(fileobj=response) # print(gzipFile.read().decode('UTF-8')) jsonResult = json.loads(str(gzipFile.read().decode('UTF-8'))) if "data" in jsonResult: print(jsonResult['data']) print("共有多少个新闻" + str(len(jsonResult['data']))) if (len(jsonResult['data']) == 4): oneDayUrlList.append(jsonResult['data']['url']) return oneDayUrlList else: for i in jsonResult['data']: # print(i['url']) oneDayUrlList.append(i['url']) return oneDayUrlList else: print("检测到腾讯的api 中无 data key 10分钟后再试") time.sleep(60 * 10) #如果一下子那个api没有反应的话,那就这样操作咯,用进程把,多个cpu哦 return self.getOneDayNewUrl(date) #采用递归的方式来处理,, # # def saveListToMysql(self,lists,date,fromWhere): # connect = DB() # lists = list(set(lists)) # for i in lists: # connect.insertTenxun(i,date,fromWhere) # print(fromWhere+"插入完毕") # connect.__close__() def tengxunGundong(self): url = 'http://news.qq.com/articleList/rolls/' cooker = makeBS() soup = cooker.makesoup(url, "computer") print(soup) # -----------------------------------------------------下面开始是新的提取出页面的url的----------------------------------- def returnThemeCode(self, theme): ent_Theme = 1537876288634 sport_Theme = 1537877689177 finance_Theme = 1537878365483 tech_Theme = 1537879684280 auto_Theme = 1537887032223 house_Theme = 1537887128904 news_Theme = 1537874915062 if theme == 'news': return news_Theme if theme == 'ent': return ent_Theme if theme == 'sports': return sport_Theme if theme == 'tech': return tech_Theme if theme == 'auto': return auto_Theme if theme == 'house': return house_Theme if theme == 'finance': return finance_Theme def getThemeUrl(self, theme, today, pageNumber): rawUrl = "http://roll.news.qq.com/interface/cpcroll.php" rawReferer = '.qq.com/articleList/rolls/' # 'http://news 前面还有这个东西 my_headers = [ 'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30', 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)', 'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; 
.NET4.0C)' ] headers = { "User-Agent": random.choice(my_headers), 'Referer': 'http://' + theme + rawReferer } # 默认值 rawUrl = rawUrl + "?callback=rollback&mode=1&cata=&_=" + str( self.returnThemeCode(theme)) + "&site=" + theme + "&page=" + str( pageNumber) + "&date=" + today try: rawhtml = requests.get( rawUrl, headers=headers, allow_redirects=False, timeout=30) # 一般提取文本的话,那就用text,如果是文件就content rawhtml.encoding = chardet.detect(rawhtml.content)['encoding'] print(rawhtml.url) print("状态码" + str(rawhtml.status_code)) if rawhtml.status_code == 504: print(504) return print("页面的读取结果为") print(rawhtml.text) if rawhtml.text.find('rollback') == 0: jsonString = rawhtml.text.split("rollback")[1] # 把js提取出来就可以了 else: jsonString = rawhtml.text print(jsonString) dicData = eval(jsonString) print(type(jsonString)) print(jsonString) # print(dicData['data']['article_info']) print(len(dicData['data']['article_info'])) if dicData['data'] == "": print("超过了最大页数了,跳出了就可以了") return urllist = [] for one in dicData['data']['article_info']: # print(one['url']) print(one['url'].replace("\\", "/")) # 还需要检查一下这个和之前的那种野蛮是不是一样的 urllist.append(one['url'].replace("\\", "/")) return urllist except Exception as e: # print(e) return [] # 没有东西诶 def pageUrlMain(self, date): #汇总到这儿来,输入日期就可以返回这个了 # todo 这些都还可能是 page2的情况 # url ="http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=news&mode=1&cata=&date=2018-09-25&page=1&_=1537850539512" # 这个是时政新闻 urlNew = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=news&mode=1&cata=&date=2018-09-25&page=1&_=1537874915062" # 这儿是国际标题, referer = http://news.qq.com/articleList/rolls/ urlEnt = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=ent&mode=1&cata=&date=2018-09-25&page=1&_=1537876288634" # referer = http://ent.qq.com/articleList/rolls/ # referer = http://ent.qq.com/articleList/rolls/ urlSport = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=sports&mode=1&cata=&date=2018-09-25&page=1&_=1537877689177" # r这个好像而是动态加载出来的,真是的 # todo 这个要重新解析,可能是动态的 referer = http://sports.qq.com/articleList/rolls/ 有些page可能不止一个的,都有 体育的是动态的,待会回来再分析,有好多页,很厉害的样子呢 # 不同的theme要切换不同的referer urlFinance = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=finance&mode=1&cata=&date=2018-09-25&page=1&_=1537878365483" # referer http://finance.qq.com/articleList/rolls/ # todo 默认的解析可以,但是好慢,使用代理的情况下 http://173.255.210.215:8000/?count=6&country=%E5%9B%BD%E5%86%85 urlTech = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=tech&mode=1&cata=&date=2018-09-25&page=2&_=1537879684280" # referer = http://tech.qq.com/articleList/rolls/ # todo 这个也是偶尔有点慢的样子, 使用代理 下 urlAuto = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=auto&mode=1&cata=&date=2018-09-25&page=1&_=1537887032223" # referer 这个是汽车的 http://auto.qq.com/articleList/rolls/dai'li # todo 这个汽车的应该使用另外的解析来解析才可以 urlHouse = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=house&mode=1&cata=&date=2018-09-25&page=1&_=1537887128904" # referer http://house.qq.com/articleList/rolls/ 这个是房产的 可以解析 # getThemeUrl(urlSport,"http://sports.qq.com/articleList/rolls/") urlRaw = "http://roll.news.qq.com/interface/cpcroll.php?" 
# 带参数进行post请求才对把 # print(jsonDic) # print(rawhtml.json()) oneUrl = {} # 单单一个运动的就够了 themeList = [ 'news', 'ent', 'tech', 'auto', 'house', 'finance', 'sports' ] #一共有7个主题,其实不止这7个的,真好。 tempList = [] #想想用什么数据类型。二维数组?url,theme, 都是字符串 for theme in themeList: print("第一个主题是") for i in range(1, 11): print("第" + str(i) + "页") responseList = self.getThemeUrl(theme, date, i) # 这儿只是测试最大页数有多少而已 if len(responseList) == 0: print("最大页数为" + str(i - 1) + "页") break else: tempList = tempList + responseList oneUrl[theme] = responseList #这样不就可以了,分开来 print(oneUrl) resultUrl = oneUrl tempList = set(tempList) from pprint import pprint pprint(resultUrl) print(len(resultUrl)) # 目前可以返回653 昨天的 完整的1140,数量是很可观的 #写一个函数分类存就可以了 ,本来是一起存的,现在 self.dbhelper.saveDicToMysql(oneUrl, date, "tengxun") #这边不需要分类,可能是因为这个切换了又没放回去的原因 # resultUrl = self.dbhelper.saveListToMysql(resultUrl,date,"tengxun") #这儿没啥去重的 #保存到mysql中去先,这样就可以重复使用拉 #经过去重,去掉已经插入过的东西,然后再返回去 ,这儿还得转会list才可以。 return tempList #直接这儿去重后
class pageContent: def __init__(self): self.dbhelper = DB() def stripImgUrl(self,replacedSrc): if(replacedSrc.find(":")!=-1): replacedSrc = replacedSrc.replace(":","") if(replacedSrc.find(":")!=-1): replacedSrc = replacedSrc.replace(":","_") if(replacedSrc.find(".")!=-1): replacedSrc=replacedSrc.replace(".", "_") if(replacedSrc.find("/")!=-1): replacedSrc = replacedSrc.replace("/","_") if(replacedSrc.find("-")!=-1): replacedSrc = replacedSrc.replace("-","_") if(replacedSrc.find("?")!=-1): replacedSrc =replacedSrc.split("?")[0] if (replacedSrc.find("?") != -1): replacedSrc = replacedSrc.replace("?","_") if (replacedSrc.find("!") != -1): replacedSrc = replacedSrc.replace("!", "_") if (replacedSrc.find("\"") != -1): replacedSrc = replacedSrc.replace("\"","_") if (replacedSrc.find(" ") != -1): replacedSrc = replacedSrc.replace(" ","") if (replacedSrc.find("“") != -1): replacedSrc = replacedSrc.replace("“","") if (replacedSrc.find("”") != -1): replacedSrc = replacedSrc.replace("”","") if (replacedSrc.find(":") != -1): replacedSrc = replacedSrc.replace(":", "") if (replacedSrc.find("|") != -1): replacedSrc = replacedSrc.replace("|", "_") return replacedSrc # def getPageContent(self,url): #输入一个url获得这个页面的本地化后的文章, 这个好像是没有保存东西进来这里面 # # time.sleep(1) # # t = time.time() # Acontent = "" # Hcontent,Tcontent = "","" # timeStamp =str(int(round(t * 1000))) # 毫秒级时间戳 # # dbhelper = DB() # simpleP = [] # # soup =makeBS().makesoup(url) # if soup==None: # title ="这个网页打开超时了" # return title, Hcontent, Tcontent, Acontent # # print(soup.prettify()) # title = soup.find("h1") # if(title!=None): # title = title.text # # print("标题 ",title) #找不到标题的话说明是 # # if (title.find(":")!=-1): # # title = title.replace(":","") # downloadTool = Download(r'/home/default/images') #设置下载路径 # # totalLong = 0 # mainDiv = soup.find("div",attrs={"class":"qq_innerMain clearfix"}) # # print(mainDiv) # if(mainDiv==None): # dbhelper.deleteUrl(url) #如果无法找到标题,那么就是图文新闻,舍弃因为文字太少,不要了 # title = "腾讯没找到标题" #直接就全部置空,当作没访问过这个就可以了 # print("错误,此网页是图文网页 "+url) # return title, Hcontent, Tcontent, Acontent # allcontent = mainDiv.find_all("p") # # print(allcontent) # for i in range(len(allcontent)): #这里面这个是p # # print(i) # if(i==0): # Hcontent =allcontent[i] # localImgList = allcontent[i].find_all("img",attrs={"src":True}) #每个p标签内的img提取和修改链接本地化 # if(localImgList!=None): #找到有的话就遍历 # for img in localImgList: # if img!=None: # # print(img['src']) # if(img['src'].find("//")==0): #什么都没有,协议路径改成https # imgSrc = "https:"+img['src'] # # filename = os.path.basename(imgSrc) # # print(imgSrc) # imgName = imgSrc.replace("https://inews.gtimg.com/","").replace("/","_") # # now = time.strftime('%Y%m%d', time.localtime(time.time())) # now_date = now+"/" #后面下载的文件名是不需要带杠的,后面就不需要带杠杠 # imgName = now_date+self.stripImgUrl(imgName) # # # print("文件名是 " + imgName) # #这儿这个是图片的格式 # newImgName = downloadTool.downloadImg(imgSrc,imgName=imgName,referer=None, now_date=now) #下载这个是没问题的 # img['src']="/images/"+imgName+".jpg" #以这个文件名下载,以这个文件名src # # print(img['src']) # # print("图片的链接有"+imgSrc) # # print(allcontent[i]) # simpleP.append(allcontent[i]) # Acontent += str(allcontent[i]) # elif(img['src'].find("https:")==0): #本来就有找到有https协议, 3选1 而不是反复操作 # imgSrc = img['src'] # # filename = os.path.basename(imgSrc) # # print(imgSrc) # imgName = imgSrc.replace("https://inews.gtimg.com/", "").replace("/", "_") # # now = time.strftime('%Y%m%d', time.localtime(time.time())) # now_date = now+"/" #后面下载的文件名是不需要带杠的,后面就不需要带杠杠 # imgName = now_date+self.stripImgUrl(imgName) # # print("文件名是" + imgName) # 
downloadTool.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now) # img['src'] = "/images/" + imgSrc.replace("https://inews.gtimg.com/", "").replace("/","_") + ".jpg" # # print(img['src']) # # print("图片的链接有"+imgSrc) # # print(allcontent[i]) # simpleP.append(allcontent[i]) # Acontent += str(allcontent[i]) # else: #那这个就是http协议了 # imgSrc = img['src'] # # filename = os.path.basename(imgSrc) # # print(imgSrc) # imgName = imgSrc.replace("http://inews.gtimg.com/", "").replace("/", "_") # # # now = time.strftime('%Y%m%d', time.localtime(time.time())) # now_date = now+"/" #后面下载的文件名是不需要带杠的,后面就不需要带杠杠 # imgName = now_date+self.stripImgUrl(imgName) # # # print("文件名是" + imgName) # downloadTool.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now) # img['src'] = "/images/" + imgSrc.replace("http://inews.gtimg.com/", "").replace("/","_") + ".jpg" # # print(img['src']) # # print("图片的链接有"+imgSrc) # # print(allcontent[i]) # simpleP.append(allcontent[i]) # Acontent += str(allcontent[i]) # # # # if (allcontent[i].string=="更多新闻" or allcontent[i].string =="扫一扫,用手机看新闻!"): # Tcontent =allcontent[i-1] # break # else: # Tcontent = allcontent[-1 ] #要么是倒数第一位要么是截断的那一位 # if (allcontent[i].string!=None and allcontent[i].string!="扫一扫,用手机看新闻!"): # # print(allcontent[i]) # Acontent += str(allcontent[i]) # simpleP.append(allcontent[i]) # totalLong = len(allcontent[i].string) + totalLong # # # for p in simpleP: # # # # dbhelper.insertSimpleP(p) #插入段落,但是没有更新标题还有那些东西,发现没有,对啊,这个东西要修改好,这儿转移出去处理把,逻辑分明一点而 # # # print("此文章总共多少个字"+str(totalLong)) # return title,Hcontent,Tcontent,Acontent # # #——————————————下面开始是全新的使用转化成wx 手机端页面的爬取,和前面的区别开来---------------------------前面的没有用对吧。 def findVideo(self,dic): #传入解析到一半的 if dic.find("ext_data: ")!=-1: #有视频的时候才来解析这个东西 try: if dic.find("vid")!=-1: #找到这个字段 # dic = dic.replace('"',"'").replace(",","") dic = dic.split("ext_data: ")[1].split("fisoriginal")[0] vid = dic.split("vid")[1].split(",")[0] vid = vid.split('"')[2] # print(vid) return vid except Exception as e: print(e) return return dic else: return def fixUrl(self,url): url = url.replace("////","//") # print(url) return url def changeNewToWx(self,url): #把链接转化成手机端的,微信的,然后返回合成的微信的链接,或者直接通过请求头来自动跳到手机端 #提取到的两杠的要换成一杠才可以的。 # print(url) if url.find("////")!=-1: url = self.fixUrl (url) # print(url) rawWxUrl = "https://xw.qq.com/" if url.find("qq.com/")!=-1: splitUrl = url.split(".qq.com/") tailUrl = splitUrl[1].replace(".htm","") headUrl = splitUrl[0] #为了提取出主题theme if headUrl.find("//")!=-1: #提取主题 theme=headUrl.split("//")[1] tailUrl = tailUrl.split("//")[0]+"/"+theme+"/"+tailUrl.split("//")[1]+tailUrl.split("//")[2] else: return #如果不是的话,那就返回空值把 # print("转化后url为 -"+rawWxUrl+tailUrl) sumUrl = rawWxUrl+tailUrl # print(sumUrl) # print(rawWxUrl) # print(tailUrl) #为了让这个不影响之前的,那就先检查一下第二个位置是否为“//”是的话就改成/就可以了 if len(sumUrl.split("//"))>1: #为1就没问题 tempUrl = sumUrl.split("//") totalUrl = tempUrl[0]+"//"+ "/".join(tempUrl[1:]) # print(totalUrl) return(totalUrl) # tempWxUrl = rawWxUrl + tailUrl # print(tempWxUrl) return sumUrl def getWxContentNew(self,url): wxUrl = self.changeNewToWx(url) Cooker = makeBS() title, Hcontent, Tcontent, Acontent = "", "", "", "" # 最后一个参数好像没什么用 downloadTool = Download(r'/home/default/images') # 设置下载路径 BS = Cooker.makesoup(wxUrl) #传进来是微信的才可以一 def getWxContent(self,wxUrl):# 先提取出里面的那个,然后再看看什么情况 ----这个是主要解析的函数 time.sleep(1) title, Hcontent, Tcontent, Acontent = "", "", "", "" # 最后一个参数好像没什么用 downloadTool = Download(r'/home/default/images') # 设置下载路径 simplePList = [] Cooker = makeBS() BS = Cooker.makesoup(wxUrl) 
#传进来是微信的才可以一 # print(BS) if BS==None: return title, Hcontent, Tcontent, Acontent # print(BS) try: title = BS.find("head").title #直接选用head这儿的title标签里面的东西 except Exception as e: print(e) traceback.print_exc() # 貌似这个,一个错,各个错。 return title, Hcontent, Tcontent, Acontent # print("输出title") #todo 这儿插入一个检查一下标题是否存在的东西的 # print(title.text) #todo 改变成手机的ua,让服务器发来手机端页面来给我, 我怀疑现在ua打开的还是电脑版本的 if(title!=None and title.text!="404 - 腾讯网"): title=title.text if self.dbhelper.ifExists(title): #如果标题为空那就直接删除这条了。 ,所以就是因为已经存在,所以后面都不进行读取了对吧 print("已经存在了这个") #这儿也是返回空值的。 return title,Hcontent,Tcontent,Acontent #存在的话,就不用再解析和下载图片了 ,如果只有标题没有别的,很可能是这个新闻标题已经存在 else: print("此新闻可能已经被删除,提取失败") return title,Hcontent,Tcontent,Acontent dicString = "" ''' 测试区域 ''' print(BS) for script in BS.find_all("script", attrs={"async": False}): # 这边应该找内容不为空的 if script.text!=None and script.text!="": print(script.text) dicString = script.text break # dicString = BS.find("script", attrs={"async": False}).text #这边应该找内容不为空的 # print(dicString) print(dicString) dic = dicString.replace("var globalConfig =", "").replace(";", "") # print(dic) tempDic = dic print("解析的文章的部分-》") # 这边已经是空的了//todo 把腾讯的这个分析的东西再搞一下,应该还是解析页面里面除到问题就是这个文件里面的。 print(tempDic) if dic.find("contents: ") != -1: datalist = dic.split("contents: ")[1].split("ext_data")[0].replace("[", "").replace("],", "") # print("这边开始这样") #这个是新加的。 print(datalist) try: dic = eval("(" + datalist + ")") #因为加了这个才能转化成那样,这个应该也是没问题才对。 # print(dic) except Exception as e: print("转化成json出错") print(e) traceback.print_exc() # 貌似这个,一个错,各个错。 return title,Hcontent,Tcontent,Acontent #存在的话,就不用再解析和下载图片了 #return #返回空内容给他们咯,自动舍弃这个东西,然后那边要处理了一下,空内容的情况,这个不对啊。 checkLen = len(dic) pprint.pprint(dic) print(checkLen) #这儿需要考虑只有一个句子的情况,这个情况下是长度为2,这个情况下不是列表了,只是一个字典 if(checkLen>2): for p in dic: #遍历每一个,然后尽心判断应该也是没问题才对的。 try: #traceback.print_exc() if p['type'] == 1: # 这个是一个句子,那就直接插进去就可以了 # 当成句子插入 pContent = p['value'] phtml='<p>'+pContent+"</p>" Acontent =Acontent+phtml if dic.index(p)==0: #如果发现索引值是第一个的话,那就是开头了 Hcontent= phtml # print("find the header p") # print(phtml) elif dic.index(p)==checkLen-1 : #同理如果是最后一个句子,那么你就是结尾了 Tcontent = phtml else: # 不是首段和尾端的端口才加入到零散的段落中去 simplePList.append(phtml) # print(phtml) if p['type'] == 2: imgSrc = p['value'].replace("\/", "/") #图片的真实下载地址 # print(imgSrc) imgName = imgSrc.replace("https://inews.gtimg.com/", "").replace("/", "_") now = time.strftime('%Y%m%d', time.localtime(time.time())) now_date = now + "/" # 后面下载的文件名是不需要带杠的,后面就不需要带杠杠 imgName = now_date + self.stripImgUrl(imgName) #这儿已经是添加了时间的 了 # print("文件名是" + imgName) # 这儿下载这个图片到服务器指定的地址上 # 这儿下载这个图片到服务器指定的地址上 downloadTool.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now) # now_date = now + "/" # 后面下载的文件名是不需要带杠的,后面就不需要带杠杠 # imgName = now_date + self.stripImgUrl(imgName) #已经处理好了 imgPScr = "/images/" + imgName + ".jpg" # 这个html格式图片地址 HtmlImg = '<p><img src="'+imgPScr+'"/></p>' # print(imgPScr) Acontent = Acontent + HtmlImg # print("这个是图片了 "+HtmlImg) if dic.index(p)==0: #如果发现索引值是第一个的话,那就是开头了 # print("find the header p") # print(HtmlImg) Hcontent= HtmlImg elif dic.index(p)==checkLen-1 : #如果是最后一个句子,那么你就是结尾了 Tcontent = HtmlImg else: # 不是首段和尾端的端口才加入到零散的段落中去 simplePList.append(HtmlImg) if p['type'] == 3: # 这个是视频的情况,直接提取出来当成句子好了,这儿还有图片的,先不管了 try: pContent = p['value']['desc'] pContent = "<p>"+ pContent+"</p>" # 解析视频 vid = self.findVideo(tempDic) rawVideoString = "" if vid != None: rawVideoString = '<p><iframe frameborder="0" src="https://v.qq.com/txp/iframe/player.html?vid=' + vid + '" allowFullScreen="true"></iframe></p>' if 
dic.index(p) == 0: # 如果发现索引值是第一个的话,那就是开头了 # print("find the header p") # print(pContent) Hcontent = pContent+rawVideoString elif dic.index(p) == checkLen - 1: # 如果是最后一个句子,那么你就是结尾了 Tcontent =pContent+rawVideoString else: #不是首段和尾端的端口才加入到零散的段落中去 simplePList.append(pContent) except Exception as e: pass #舍弃这个段落 except Exception as e: print(e) traceback.print_exc() #貌似这个,一个错,各个错。 #插入数据库先 # for p in simplePList: # self.dbhelper.insertSimpleP(p) #插入段落,但是没有更新标题还有那些东西 Tcontent = "".join(BeautifulSoup(Acontent, 'lxml').text) return title, Hcontent, Tcontent, Acontent else : #这儿是只有一个句子的网页的情况下 ,这边的一个大段落的,首段尾段的拆分是没问题的 p= dic #这样转换一下就可以了 # print(type(dic)) # print(dic) # print(p) if type(p)==tuple: print("是tuple") try: # print("长度是") # print(len(p)) if(len(p)==1): #又加了一种,怕不够长的处理 p=p[0] # 如果是里面只有一个的话,那就提取出里面的一个元素就可以了, if(len(p)==2): p=p[1] # 如果是里面只有一个的话,那就提取出里面的一个元素就可以了, else: #长度为空,里面除了标题,连文字内容都没有的这种。。。 p={'type':3} #开头结尾都是自己了,标题就也当成是这个来处理了 except Exception as e: print(e) #https://xw.qq.com/a/house/20180928003713 对付这个网页的情况 title,Hcontent,Tcontent,Acontent="","","","" #这样就可以拉 return title, Hcontent, Tcontent, Acontent#直接不要这个url的内容了 # print(p) if p['type'] == 1: # 这个是一个句子,那就直接插进去就可以了 # 当成句子插入 pContent = p['value'] #提取出句子来作为首段尾端还有 中间的段落 # print("长度有") # print(pContent.split("。")) # print(len(pContent.split("。"))) try: Tcontent = "<p>"+pContent.split("。")[-2] +"</p>" #最后一句作为 结尾的句子,句号前面那个才是 except Exception as e: Tcontent="<p>"+pContent.split("。")[0] +"</p>" #无法分的话,比如一句话,那就头尾都一样把 Hcontent = "<p>"+pContent.split("。")[0] +"</p>" #这儿是开头的第一句的句子 , simplePList.append(pContent) #整个把,没办法了饿 phtml = '<p>' + pContent + "</p>" Acontent = Acontent + phtml # print(phtml) if p['type'] == 2: imgSrc = p['value'].replace("\/", "/") # 图片的真实下载地址 # print(imgSrc) imgName = imgSrc.replace("https://inews.gtimg.com/", "").replace("/", "_") now = time.strftime('%Y%m%d', time.localtime(time.time())) now_date = now + "/" # 后面下载的文件名是不需要带杠的,后面就不需要带杠杠 imgName = now_date + self.stripImgUrl(imgName) # print("文件名是" + imgName) # 这儿下载这个图片到服务器指定的地址上 # 这儿下载这个图片到服务器指定的地址上 downloadTool.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now) # now_date = now + "/" # 后面下载的文件名是不需要带杠的,后面就不需要带杠杠 # imgName = now_date + self.stripImgUrl(imgName) imgPScr = "/images/" + imgName + ".jpg" # 这个html格式图片地址 # print(imgPScr) HtmlImg = '<p><img src="' + imgPScr + '"/></p>' Acontent = Acontent + HtmlImg # print("这个是图片了 " + HtmlImg) if p['type'] == 3 : #只有一个视频的时候,解析视频 pContent = title # print(pContent) #解析视频 vid = self.findVideo(tempDic) rawVideoString ="" if vid!=None: rawVideoString='<p><iframe frameborder="0" src="https://v.qq.com/txp/iframe/player.html?vid='+vid+'" allowFullScreen="true"></iframe></p>' #只有一个句子的东西是没有这个索引值的 # if dic.index(p) == 0: # 这种情况就是只有一个视频的网页,标题就是唯一的开头了,那结尾呢,已经不能分了把,视频又没找到 # print("find the header p") # print(pContent) Hcontent = pContent #头是描述 Tcontent = rawVideoString #尾是视频作为一个独立的一个段落 #插入数据库先 for p in simplePList: self.dbhelper.insertSimpleP(p) #插入段落,但是没有更新标题还有那些东西 Tcontent = "".join(BeautifulSoup(Acontent, 'lxml').text) return title, Hcontent, Tcontent, Acontent #返回了后就可以写入数据库了把 # else: #这儿是真的只有一个句子的时候 # pass def getPageContentMain(self,sportsUrl): print(sportsUrl) title, Hcontent, Tcontent, Acontent = self.getWxContent(self.changeNewToWx(sportsUrl)) #直接打开手机端的getWxContent这儿 print(title) # print(Hcontent) # print(Tcontent) # print(Acontent) return title, Hcontent, Tcontent, Acontent
# Classifies the crawled rows into the per-category django tables.
# newssentimentanalysis_homenews shows the naming pattern; each row is dispatched to the matching table.
from DBcontrol import DB

chak = DB()
# chak.getAllTitle()
# chak.saveDicToMysql(testDic, "2019-03-18", "tengxun")
# chak.insertTengxunTheme("www", "2018-232", "test", "auto")
# todo run de-duplication once the crawl has finished.
resultDic = chak.__query__(
    "select url,title,urlState,Hcontent,Mcontent,Tcontent,Acontent,newdate,fromWhere from tengxun where urlState='True'"
)
print(resultDic)
print(len(resultDic))
# The rows are not updated here; everydaynews already wrote the content and urlState together.
print("Starting classification")
for rowDic in resultDic:
    print(rowDic)
    # seven categories, e.g. newssentimentanalysis_caranalysis_comment
    sql = ""
    sqlHead = "insert into newssentimentanalysis_"  # sql for the per-category news table
    sqlTail = "news (url,Title,UrlState,Hcontent,Mcontent,Tcontent,Acontent,Date,fromWhere)values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    sql2 = ""
    # sql2Head = "insert into newssentimentanalysis_"  # sql for the article sentiment-score table
    sql2Tail = "analysis_news(Pos_Score,Neg_score,Sentiment,News_id_id)values (%s,%s,%s,%s)"
    if rowDic['url'].find('auto') != -1:  # 'auto' means a car story; the middle part is the table name, e.g. newssentimentanalysis_entertainmentanalysis_news
        sql = sqlHead + "car" + sqlTail
        sql2 = sqlHead + "car" + sql2Tail
        pass
    if rowDic['url'].find('tech') != -1:  # 'tech' means a technology story
        url[0])
    if title != "" and Hcontent != "":
        datehelper.updateContent(url[0], title, Hcontent, Tcontent, Acontent)
        mixNews = MixNews()
        state = mixNews.__startToMix__()
        if state:  # the mix routine already writes its result to the database
            datehelper.updateState(url[0])


if __name__ == "__main__":  # fill in article bodies for the stored urls
    fillTengxun = pageContent()
    fillFenghaung = fenghuangPageContent()
    fillWangyi = wangyiPageContent()
    datehelper = DB()
    # tengxunUrls = datehelper.__query__('select url from tengxun where fromWhere ="tengxun" and url!="" and isNull(title) and urlState="False";')
    # fenghuangUrls = datehelper.__query__('select url from tengxun where fromWhere ="fenghuang" and url!="" and isNull(title) and urlState="False" ;')
    wangyiUrls = datehelper.__query__(
        'select url from tengxun where fromWhere ="wangyi" and url!="" and isNull(title) and urlState="False" ;'
    )
    # print(len(tengxunUrls))
    # print(len(fenghuangUrls))
    print(len(wangyiUrls))
    # tengxun = myThread(tengxunFill, tengxunUrls, "tengxun")
    # tengxun.run()
    # wangyi = myThread(wangyiFill, wangyiUrls, "wangyi")
# Some of the fields already stored across the three source tables still need to be fixed up by hand.
from DBcontrol import DB

dbhelp = DB()
def saveListToMysql(self, lists, date, fromWhere):
    connect = DB()
    lists = list(set(lists))  # de-duplicate before inserting
    for i in lists:
        connect.insertTenxun(i, date, fromWhere)
    print("All rows inserted")
def getEveryTengxun(self): #这个其实就是封装在对象里面的一个主函数而已 #开始之前,先检查有没有序列化的文件在这儿 dbhelper = DB() # todo 提取页面失败得这几个可以研究一下 是tuple 网址不同,(网址不同,发布时间的不同,) # {'type': 2, 'value': 'http:\\/\\/inews.gtimg.com\\/newsapp_match\\/0\\/5261922136\\/0'} # 纪念品牌20周年 smart Forease官图发布 # 打开页面提取失败,可能是页面为404腾讯,删除这条url # 删除成功哈 http:////auto.qq.com//a//20181005//001598.htm # http:////news.qq.com//a//20181005//002590.htm # qqnews # 2 pcontent = pageContent() # print("共提取到新闻url的数量有") # now_date = time.strftime('%Y-%m-%d', time.localtime(time.time())) # 获取当前日期,每次执行操作的时候都这样 now_date = (date.today() + timedelta(days=-1)).strftime( "%Y-%m-%d") # 昨天日期 # time.localtime(time.time()) #暂时是这样,以后的话 print("昨天的日期是" + now_date + "现在正在爬取昨天的新闻!d😀") #1.获取一天的新url #爬取昨晚的 dateUrl = DateUrl( ) # 2018-09-27 日编辑 todo 这儿区分开来,不用通过这儿返回的,另外那儿只需要把那些urlState="False"的提取出来就可以 dateUrl.pageUrlMain(now_date) #获得今天的,并且写入数据库 todo 待会要把这儿的这个调回去 todayNewUrl = dbhelper.__query__( "select url from tengxun where urlState='False' and fromWhere='tengxun'" ) #只要数据库中取出需要读取的url # print(type(todayNewUrl)) print(len(todayNewUrl)) print("") # 这儿才是把东西提取出来 count = 1 #计数,每100个就休息1分钟 for dic in todayNewUrl: #这儿还是需要把 url = dic['url'] #2.把写入数据库的这几个新闻url的内容提取出来 if count % 200 == 0: #突然的中断应该是因为这边连接不上那儿,所以应该问题不大 time.sleep(60 * 2) #每两百个休息4分钟好了 count += 1 #这儿的url是未转换成xw。电脑原版页面的url,所以,存的是这种url #还是得把这个url打开,才知道是否是title已经存在的 title, Hcontent, Tcontent, Acontent = pcontent.getPageContentMain( url) #这儿漏了更新到url中去 ,自动转换成xw的然后再下载 time.sleep(1) # print(title, Hcontent, Tcontent, Acontent) if (title != "腾讯没找到标题" and title != None and Hcontent != ""): #有内容的时候就更新这条数据 # print("要更新的url是 "+url) resultState = dbhelper.updateContent(url, title, Hcontent, Tcontent, Acontent) #要删除的是更新失败的那个 if resultState == False: #更新成功 print("更新失败,正在删除这个url不同,但是标题相同的新闻") print(url) dbhelper.deleteUrl(url) # 按url把这条记录删除掉咯,生成失败也不需要删除这个拉, print() else: # print("正在生成新混合新闻。。。") # 3. 然后是把页面页写入数据库,再然后是随机生成相同数量的 mixNews = MixNews() if mixNews.__startToMix__() != True: # 调用一次就执行一次,可以修改返回的状态 print("生成失败,已经没有刚填满的未用过的文章了") print(url) dbhelper.deleteUrl( url) # 如何这个内容为空也要删除,(可能前面一个步骤更新的时候发现相同的标题,所以插入不了), # print() else: print("打开页面提取失败,可能是页面为404腾讯,删除这条url") #为空的话,那么就删除这条把 dbhelper.deleteUrl(url) #按url把这条记录删除掉咯
#coding=utf-8
'''
1. Pull the urls whose content still has to be crawled out of the database.
2. Use pageContent to extract each article and localise its images automatically.
3. Randomly combine the stored paragraphs into articles of at least 800 characters.
   (This script is only used to seed the base library.)
'''
from DBcontrol import DB
from fenghuang.fenghuangPageContent import fenghuangPageContent

datahelper = DB()
URLandDate = datahelper.getLimitUrl(0, 1000, "fenghuang")  # later this range can be walked in fixed-size slices
print(URLandDate)
# From here on it works like the regular crawler, just with a fixed request rate.
page = fenghuangPageContent()
for url, date in URLandDate:
    title, Hcontent, Tcontent, Acontent = page.getPageContent(url)
    print(Acontent)
    # stores the full article plus the separated head and tail paragraphs
    datahelper.updateContent(url, title, Hcontent, Tcontent, Acontent)
    # time.sleep(0.5)
print("Finished writing this batch of content")  # only after this can mixed articles be generated
        return False
    except:
        print("Check process ERROR!!!")
        return False


def readfile(tfile):
    with open(tfile, 'r') as f:
        lines = f.readlines()
        return lines[-20:]


if __name__ == "__main__":
    dbhelper = DB()
    tempNumber = len(dbhelper.__query__("select * from c_title"))  # baseline row count, to measure growth
    email = EMail()
    while (1):
        if isRunning("python everydaynews.py"):
            print("The crawler is still running; checking again in 12 hours")
            nonNumber = len(dbhelper.__query__("select * from c_title"))  # current row count
            print(str(nonNumber - tempNumber))
            face = "🤔 Hard to say: if it has not crashed it is just idle, I will check again in 6 hours"
            if (nonNumber - tempNumber) > 800:
                face = "\n🤣 A good haul today 😘"
            elif (nonNumber - tempNumber) > 600 and (nonNumber - tempNumber) <= 800:
                face = "🤗 Today's volume looks normal"
def __init__(self):
    self.dbhelper = DB()
class pageContent: def __init__(self): self.dbhelper = DB() def stripImgUrl(self, replacedSrc): if (replacedSrc.find(":") != -1): replacedSrc = replacedSrc.replace(":", "") if (replacedSrc.find(":") != -1): replacedSrc = replacedSrc.replace(":", "_") if (replacedSrc.find(".") != -1): replacedSrc = replacedSrc.replace(".", "_") if (replacedSrc.find("/") != -1): replacedSrc = replacedSrc.replace("/", "_") if (replacedSrc.find("-") != -1): replacedSrc = replacedSrc.replace("-", "_") if (replacedSrc.find("?") != -1): replacedSrc = replacedSrc.split("?")[0] if (replacedSrc.find("?") != -1): replacedSrc = replacedSrc.replace("?", "_") if (replacedSrc.find("!") != -1): replacedSrc = replacedSrc.replace("!", "_") if (replacedSrc.find("\"") != -1): replacedSrc = replacedSrc.replace("\"", "_") if (replacedSrc.find(" ") != -1): replacedSrc = replacedSrc.replace(" ", "") if (replacedSrc.find("“") != -1): replacedSrc = replacedSrc.replace("“", "") if (replacedSrc.find("”") != -1): replacedSrc = replacedSrc.replace("”", "") if (replacedSrc.find(":") != -1): replacedSrc = replacedSrc.replace(":", "") if (replacedSrc.find("|") != -1): replacedSrc = replacedSrc.replace("|", "_") return replacedSrc def findVideo(self, dic): #传入解析到一半的 if dic.find("ext_data: ") != -1: #有视频的时候才来解析这个东西 try: if dic.find("vid") != -1: #找到这个字段 # dic = dic.replace('"',"'").replace(",","") dic = dic.split("ext_data: ")[1].split("fisoriginal")[0] vid = dic.split("vid")[1].split(",")[0] vid = vid.split('"')[2] # print(vid) return vid except Exception as e: print(e) return return dic else: return def fixUrl(self, url): url = url.replace("////", "//") # print(url) return url def changeNewToWx(self, url): #把链接转化成手机端的,微信的,然后返回合成的微信的链接,或者直接通过请求头来自动跳到手机端 #提取到的两杠的要换成一杠才可以的。 # print(url) if url.find("////") != -1: url = self.fixUrl(url) # print(url) rawWxUrl = "https://xw.qq.com/" if url.find("qq.com/") != -1: splitUrl = url.split(".qq.com/") tailUrl = splitUrl[1].replace(".htm", "") headUrl = splitUrl[0] #为了提取出主题theme if headUrl.find("//") != -1: #提取主题 theme = headUrl.split("//")[1] tailUrl = tailUrl.split( "//")[0] + "/" + theme + "/" + tailUrl.split( "//")[1] + tailUrl.split("//")[2] else: return sumUrl = rawWxUrl + tailUrl # print(sumUrl) # print(rawWxUrl) # print(tailUrl) if len(sumUrl.split("//")) > 1: #为1就没问题 tempUrl = sumUrl.split("//") totalUrl = tempUrl[0] + "//" + "/".join(tempUrl[1:]) return (totalUrl) return sumUrl def getWxContent(self, wxUrl, crawlDate): # 先提取出里面的那个,然后再看看什么情况 ----这个是主要解析的函数 #把前面传过来的时间字符串作为时间就好了,这样好一点。 now = crawlDate.replace("-", "") time.sleep(1) title, Hcontent, Tcontent, Acontent = "", "", "", "" # 最后一个参数好像没什么用 # downloadTool = Download(r'/home/default/images') # 也可以手动设置新下载路径。 downloadTool = Download(None) # 设置下载路径,设置为None就是使用默认值 simplePList = [] Cooker = makeBS() BS = Cooker.makesoup(wxUrl) #传进来是微信的才可以一 if BS == None: return title, Hcontent, Tcontent, Acontent # print(BS) try: title = BS.find("head").title #直接选用head这儿的title标签里面的东西 except Exception as e: print(e) traceback.print_exc() # 貌似这个,一个错,各个错。 return title, Hcontent, Tcontent, Acontent if (title != None and title.text != "404 - 腾讯网"): title = title.text if self.dbhelper.ifExists( title): #如果标题为空那就直接删除这条了。所以就是因为已经存在,所以后面都不进行读取了对吧 print("已经存在了这个") return title, Hcontent, Tcontent, Acontent #存在的话,就不用再解析和下载图片了 ,如果只有标题没有别的,很可能是这个新闻标题已经存在 else: print("此新闻可能已经被删除,提取失败") return title, Hcontent, Tcontent, Acontent dicString = "" for script in BS.find_all("script", attrs={"async": False}): # 这边应该找内容不为空的 if script.text != None and script.text != "": 
print(script.text) dicString = script.text break # print(dicString) dic = dicString.replace("var globalConfig =", "").replace(";", "") # print(dic) tempDic = dic print("解析的文章的部分-》") # print(tempDic) if dic.find("contents: ") != -1: datalist = dic.split("contents: ")[1].split("ext_data")[0].replace( "[", "").replace("],", "") # print("这边开始这样") #这个是新加的。 # print(datalist) try: dic = eval("(" + datalist + ")") #因为加了这个才能转化成那样,这个应该也是没问题才对。 # print(dic) except Exception as e: print("转化成json出错") print(e) traceback.print_exc() # 貌似这个,一个错,各个错。 return title, Hcontent, Tcontent, Acontent #存在的话,就不用再解析和下载图片了 checkLen = len(dic) # pprint.pprint(dic) # print(checkLen) #这儿需要考虑只有一个句子的情况,这个情况下是长度为2,这个情况下不是列表了,只是一个字典 if (checkLen > 2): for p in dic: #遍历每一个,然后尽心判断应该也是没问题才对的。 try: #traceback.print_exc() if p['type'] == 1: # 这个是一个句子,那就直接插进去就可以了 # 当成句子插入 pContent = p['value'] phtml = '<p>' + pContent + "</p>" Acontent = Acontent + phtml if dic.index(p) == 0: #如果发现索引值是第一个的话,那就是开头了 Hcontent = phtml elif dic.index( p) == checkLen - 1: #同理如果是最后一个句子,那么你就是结尾了 Tcontent = phtml else: # 不是首段和尾端的端口才加入到零散的段落中去 simplePList.append(phtml) if p['type'] == 2: imgSrc = p['value'].replace("\/", "/") #图片的真实下载地址 # print(imgSrc) imgName = imgSrc.replace( "https://inews.gtimg.com/", "").replace("/", "_") # now = time.strftime('%Y%m%d', time.localtime(time.time())) #默认就是字符串,时间需要和url中的时间对应上才可以啊。 imgName = self.stripImgUrl( imgName) #这个就只传文件名好了,如果出了问题的话就用上面的那个。 # print("文件名是" + imgName) # 这儿下载这个图片到服务器指定的地址上 # 这儿下载这个图片到服务器指定的地址上 downloadTool.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now) # imgName = now +"/"+ self.stripImgUrl(imgName) #这儿已经是添加了时间的 了 # src 参考这儿 /static/images/3.jpg /static/images/20190309/3.jpg imgPScr = "/static/images/" + now + "/" + imgName + ".jpg" # 这个html格式图片地址 HtmlImg = '<p><img src="' + imgPScr + '"/></p>' # time.sleep(60) # print(imgPScr) Acontent = Acontent + HtmlImg # print("这个是图片了 "+HtmlImg) if dic.index(p) == 0: #如果发现索引值是第一个的话,那就是开头了 # print("find the header p") # print(HtmlImg) Hcontent = HtmlImg elif dic.index( p) == checkLen - 1: #如果是最后一个句子,那么你就是结尾了 Tcontent = HtmlImg else: # 不是首段和尾端的端口才加入到零散的段落中去 simplePList.append(HtmlImg) if p['type'] == 3: # 这个是视频的情况,直接提取出来当成句子好了,这儿还有图片的,先不管了 try: pContent = p['value']['desc'] pContent = "<p>" + pContent + "</p>" # 解析视频 vid = self.findVideo(tempDic) rawVideoString = "" if vid != None: rawVideoString = '<p><iframe frameborder="0" src="https://v.qq.com/txp/iframe/player.html?vid=' + vid + '" allowFullScreen="true"></iframe></p>' if dic.index(p) == 0: # 如果发现索引值是第一个的话,那就是开头了 # print("find the header p") # print(pContent) Hcontent = pContent + rawVideoString elif dic.index( p ) == checkLen - 1: # 如果是最后一个句子,那么你就是结尾了 Tcontent = pContent + rawVideoString else: #不是首段和尾端的端口才加入到零散的段落中去 simplePList.append(pContent) except Exception as e: pass #舍弃这个段落 except Exception as e: print(e) traceback.print_exc() #貌似这个,一个错,各个错。 #插入数据库先 # for p in simplePList: # self.dbhelper.insertSimpleP(p) #插入段落,但是没有更新标题还有那些东西 Tcontent = "".join(BeautifulSoup(Acontent, 'lxml').text) return title, Hcontent, Tcontent, Acontent else: #这儿是只有一个句子的网页的情况下 ,这边的一个大段落的,首段尾段的拆分是没问题的 p = dic #这样转换一下就可以了 # print(type(dic)) # print(dic) # print(p) if type(p) == tuple: print("是tuple") try: # print("长度是") # print(len(p)) if (len(p) == 1): #又加了一种,怕不够长的处理 p = p[0] # 如果是里面只有一个的话,那就提取出里面的一个元素就可以了, if (len(p) == 2): p = p[1] # 如果是里面只有一个的话,那就提取出里面的一个元素就可以了, else: #长度为空,里面除了标题,连文字内容都没有的这种。。。 p = {'type': 3} #开头结尾都是自己了,标题就也当成是这个来处理了 except Exception as e: print( e ) #https://xw.qq.com/a/house/20180928003713 
对付这个网页的情况 title, Hcontent, Tcontent, Acontent = "", "", "", "" #这样就可以拉 return title, Hcontent, Tcontent, Acontent #直接不要这个url的内容了 # print(p) if p['type'] == 1: # 这个是一个句子,那就直接插进去就可以了 # 当成句子插入 pContent = p['value'] #提取出句子来作为首段尾端还有 中间的段落 # print("长度有") # print(pContent.split("。")) # print(len(pContent.split("。"))) try: Tcontent = "<p>" + pContent.split( "。")[-2] + "</p>" #最后一句作为 结尾的句子,句号前面那个才是 except Exception as e: Tcontent = "<p>" + pContent.split( "。")[0] + "</p>" #无法分的话,比如一句话,那就头尾都一样把 Hcontent = "<p>" + pContent.split( "。")[0] + "</p>" #这儿是开头的第一句的句子 , simplePList.append(pContent) #整个把,没办法了饿 phtml = '<p>' + pContent + "</p>" Acontent = Acontent + phtml # print(phtml) if p['type'] == 2: imgSrc = p['value'].replace("\/", "/") # 图片的真实下载地址 # print(imgSrc) imgName = imgSrc.replace("https://inews.gtimg.com/", "").replace("/", "_") # now = time.strftime('%Y%m%d', time.localtime(time.time()))# todo 这个时间要减少一天才对。 now_date = now + "/" # 后面下载的文件名是不需要带杠的,后面就不需要带杠杠 # imgName = now_date + self.stripImgUrl(imgName) #这儿已经是添加了时间的 了 imgName = self.stripImgUrl( imgName) # 这个就只传文件名好了,如果出了问题的话就用上面的那个。 # print("文件名是" + imgName) # 这儿下载这个图片到服务器指定的地址上 # 这儿下载这个图片到服务器指定的地址上 downloadTool.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now) # now_date = now + "/" # 后面下载的文件名是不需要带杠的,后面就不需要带杠杠 # imgName = now_date + self.stripImgUrl(imgName) imgPScr = "/static/images/" + now + "/" + imgName + ".jpg" # 这个html格式图片地址 # print(imgPScr) HtmlImg = '<p><img src="' + imgPScr + '"/></p>' Acontent = Acontent + HtmlImg # print("这个是图片了 " + HtmlImg) if p['type'] == 3: #只有一个视频的时候,解析视频 pContent = title # print(pContent) #解析视频 vid = self.findVideo(tempDic) rawVideoString = "" if vid != None: rawVideoString = '<p><iframe frameborder="0" src="https://v.qq.com/txp/iframe/player.html?vid=' + vid + '" allowFullScreen="true"></iframe></p>' #只有一个句子的东西是没有这个索引值的 # if dic.index(p) == 0: # 这种情况就是只有一个视频的网页,标题就是唯一的开头了,那结尾呢,已经不能分了把,视频又没找到 # print("find the header p") # print(pContent) Hcontent = pContent #头是描述 Tcontent = rawVideoString #尾是视频作为一个独立的一个段落 #插入数据库先 # for p in simplePList: # self.dbhelper.insertSimpleP(p) #插入段落,但是没有更新标题还有那些东西 Tcontent = "".join(BeautifulSoup(Acontent, 'lxml').text) return title, Hcontent, Tcontent, Acontent #返回了后就可以写入数据库了把 # else: #这儿是真的只有一个句子的时候 # pass def getPageContentMain(self, sportsUrl, crawlDate): print(sportsUrl) title, Hcontent, Tcontent, Acontent = self.getWxContent( self.changeNewToWx(sportsUrl), crawlDate) #直接打开手机端的getWxContent这儿 print(title) print(Hcontent) print(Tcontent) print(Acontent) return title, Hcontent, Tcontent, Acontent
def __init__(self):
    self.dbhelper = DB()  # a default helper connection is created up front
def getEveryTengxun(self): dbhelper = DB() #处理数据库用 pcontent = pageContent() #处理页面详情用 now_date = (date.today() + timedelta(days=-1)).strftime( "%Y-%m-%d") # 昨天日期 print("昨天的日期是" + now_date + "现在正在爬取昨天的新闻!d😀") #应该是获得昨天才对 #------------------------------------------------爬取昨晚的----------------------------------------------------- print("开始执行写入所有的url") dateUrl = DateUrl( ) # 2018-09-27 日编辑 todo 这儿区分开来,不用通过这儿返回的,另外那儿只需要把那些urlState="False"的提取出来就可以 dateUrl.pageUrlMain(now_date) #获得今天的,并且写入数据库 ,所以这儿返回什么都没关系,不需要返回都可以的 #-------------------------------------------------打开内容------------------------------------------------------ print("开始执行读取页面") todayNewUrl = dbhelper.__query__( "select url from tengxun where urlState='False' and fromWhere='tengxun'" ) print("读取出 " + str(len(todayNewUrl)) + " 条") print("") #每100个就休息1分钟,慢是有原因的#每两百个休息2分钟好了 count = 1 delCount = 0 for dic in todayNewUrl: url = dic['url'] if count % 200 == 0: time.sleep(60 * 2) print("休息2分钟") count += 1 # 爬取的当前时间写入进去。 title, Hcontent, Tcontent, Acontent = pcontent.getPageContentMain( url, now_date) #这儿漏了更新到url中去 ,自动转换成xw的然后再下载 time.sleep(1) if (title != "腾讯没找到标题" and title != None and Hcontent != ""): #有内容的时候就更新这条数据 # todo 这儿加上生成云图保存本地,并且把路径合并成src生成字符串合并到Acontent就可以了。 # 生成img标签 News_Id = url.replace("$", "").replace("/", "").replace( ":", "_").replace(".", "_") imgTag = "<img src=" + Gen_WordCloud( Newsid=News_Id, text=Acontent) + " />" #不能使用单引号,否则会让sql语句中断开的 print(imgTag) Acontent = imgTag + Acontent print("更新的结果有") print(title) print(Tcontent) print(url) print(Acontent) print("显示完毕") resultState = dbhelper.updateContent(url, title, Hcontent, Tcontent, Acontent) #要删除的是更新失败的那个 if resultState == False: #更新成功 print("更新失败,正在删除这个url不同,但是标题相同的新闻") print(url) dbhelper.deleteUrl(url) #删除提取失败的那些 print() else: pass #更新成功什么都不干 else: delCount += 1 print("打开页面提取失败,可能是页面为404腾讯,删除这条url") #为空的话,那么就删除这条把 dbhelper.deleteUrl(url) #按url把这条记录删除掉咯 dbhelper.classifyDB() # 执行完了后就进行分类到django的数据库 comment = CommentCrawl() comment = CommentCrawl() comment.getCommentMain() #执行了爬取评论并且分类到django数据库 print("共删除了 " + str(delCount)) print("原来有 " + str(len(todayNewUrl)) + " 条") print("今天爬取完毕,蟹蟹使用")
            return False
    except:
        print("Check process ERROR!!!")
        return False


def readfile(tfile):
    with open(tfile, 'r') as f:
        lines = f.readlines()
        return lines[-50:]  # only the last 50 lines of the log are needed


# todo: every error message should include enough context to locate where the error happened
if __name__ == "__main__":
    dbhelper = DB()
    tempNumber = len(
        dbhelper.__query__("select * from c_title"))  # record how many rows there are now, so changes can be detected
    email = EMail()
    timeSleep = 60 * 60 * 6  # sleep interval between status e-mails
    while (1):
        now_date = (date.today() + timedelta(days=-1)).strftime(
            "%Y-%m-%d")  # yesterday's date, used for the per-source counts below
        tengxunNumber = len(
            dbhelper.__query__(
                "select * from tengxun where newdate='%s' and fromwhere='%s'"
                % (now_date, "tengxun")))  # how many rows were crawled yesterday from tengxun
        wangyiNumber = len(
            dbhelper.__query__(
                "select * from tengxun where newdate='%s' and fromwhere='%s'"
                % (now_date, "wangyi")))
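# A minimal sketch of the per-source daily count check performed in the loop above,
# assuming the same DB.__query__ interface. count_yesterday is a hypothetical
# helper, not an existing method.
from datetime import date, timedelta

def count_yesterday(dbhelper, source):
    now_date = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")
    rows = dbhelper.__query__(
        "select * from tengxun where newdate='%s' and fromwhere='%s'" % (now_date, source))
    return len(rows)

# usage sketch (a count(*) query would be lighter, but this mirrors the code above):
# tengxunNumber = count_yesterday(dbhelper, "tengxun")
# wangyiNumber = count_yesterday(dbhelper, "wangyi")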
class DateUrl: def __init__(self): self.dbhelper = DB() #默认就给你创建好了, def getDateUrlList(self, startDate, endDate): #返回这两个日期区间的url,顺便就写入数据库了 urlList = [] timehelper = TimeHelper() datelist = [] if (startDate != endDate): #不相等的时候就算差值 datelist = timehelper.getTimeList(startDate, endDate) else: datelist.append(startDate) for oneDay in datelist: #这儿也设置了休眠的 time.sleep(1.5) #500毫秒一次,那我设置成800毫秒请求一次 onedatelist = [] try: onedatelist = self.getOneDayNewUrl(oneDay) except Exception: time.sleep(30) onedatelist = self.getOneDayNewUrl(oneDay) urlList = urlList + onedatelist # self.saveListToMysql(onedatelist,oneDay,"tengxun") #存到数据库里面去,把每个都插入进去 return urlList def getOneDayNewUrl(self, date): date = parse.quote_plus("" + date) oneDayUrlList = [] print(str(date)) # date = "2018-07-26" appid = "3639833dae924cb9efb6ba30e6c5a6fa" url = "https://api.shenjian.io/?appid=" + appid + "&date=" + date # print(url) request = urllib.request.Request(url, headers={ "Accept-Encoding": "gzip", }) response = urllib.request.urlopen(request) gzipFile = gzip.GzipFile(fileobj=response) # print(gzipFile.read().decode('UTF-8')) jsonResult = json.loads(str(gzipFile.read().decode('UTF-8'))) if "data" in jsonResult: print(jsonResult['data']) print("共有多少个新闻" + str(len(jsonResult['data']))) if (len(jsonResult['data']) == 4): oneDayUrlList.append(jsonResult['data']['url']) return oneDayUrlList else: for i in jsonResult['data']: # print(i['url']) oneDayUrlList.append(i['url']) return oneDayUrlList else: print("检测到腾讯的api 中无 data key 10分钟后再试") time.sleep(60 * 10) #如果一下子那个api没有反应的话,那就这样操作咯,用进程把,多个cpu哦 return self.getOneDayNewUrl(date) #采用递归的方式来处理,, # -----------------------------------------------------下面开始是新的提取出页面的url的----------------------------------- def returnThemeCode(self, theme): #这个是有用的,用来组合主题代码url的 ent_Theme = 1537876288634 sport_Theme = 1537877689177 finance_Theme = 1537878365483 tech_Theme = 1537879684280 auto_Theme = 1537887032223 house_Theme = 1537887128904 news_Theme = 1537874915062 if theme == 'news': return news_Theme if theme == 'ent': return ent_Theme if theme == 'sports': return sport_Theme if theme == 'tech': return tech_Theme if theme == 'auto': return auto_Theme if theme == 'house': return house_Theme if theme == 'finance': return finance_Theme def getThemeUrl(self, theme, today, pageNumber): rawUrl = "http://roll.news.qq.com/interface/cpcroll.php" rawReferer = '.qq.com/articleList/rolls/' # 'http://news 前面还有这个东西 my_headers = [ 'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30', 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)', 'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)' ] headers = { "User-Agent": random.choice(my_headers), 'Referer': 'http://' + theme + rawReferer } # 默认值 rawUrl = rawUrl + "?callback=rollback&mode=1&cata=&_=" + str( self.returnThemeCode(theme)) + "&site=" + theme + "&page=" + str( pageNumber) + "&date=" + today try: rawhtml = requests.get( rawUrl, headers=headers, allow_redirects=False, timeout=30) # 
一般提取文本的话,那就用text,如果是文件就content rawhtml.encoding = chardet.detect(rawhtml.content)['encoding'] # print(rawhtml.url) print("状态码" + str(rawhtml.status_code)) if rawhtml.status_code == 504: print(504) return print("页面的读取结果为") # print(rawhtml.text) if rawhtml.text.find('rollback') == 0: jsonString = rawhtml.text.split("rollback")[1] # 把js提取出来就可以了 else: jsonString = rawhtml.text print(jsonString) dicData = eval(jsonString) print(type(jsonString)) print(jsonString) # print(dicData['data']['article_info']) print(len(dicData['data']['article_info'])) if dicData['data'] == "": print("超过了最大页数了,跳出了就可以了") return urllist = [] for one in dicData['data']['article_info']: # print(one['url']) print(one['url'].replace("\\", "/")) # 还需要检查一下这个和之前的那种野蛮是不是一样的 urllist.append(one['url'].replace("\\", "/")) return urllist except Exception as e: # print(e) return [] def pageUrlMain(self, date): # 写入url进入数据库,并且写入分类 # url ="http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=news&mode=1&cata=&date=2018-09-25&page=1&_=1537850539512" urlNew = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=news&mode=1&cata=&date=2018-09-25&page=1&_=1537874915062" urlEnt = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=ent&mode=1&cata=&date=2018-09-25&page=1&_=1537876288634" # referer = http://ent.qq.com/articleList/rolls/ urlSport = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=sports&mode=1&cata=&date=2018-09-25&page=1&_=1537877689177" # r这个好像而是动态加载出来的,真是的 urlFinance = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=finance&mode=1&cata=&date=2018-09-25&page=1&_=1537878365483" urlTech = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=tech&mode=1&cata=&date=2018-09-25&page=2&_=1537879684280" urlAuto = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=auto&mode=1&cata=&date=2018-09-25&page=1&_=1537887032223" urlHouse = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=house&mode=1&cata=&date=2018-09-25&page=1&_=1537887128904" resultUrlDic = {} #写入数据库使用这个 tempList = [] themeList = [ 'news', 'ent', 'tech', 'auto', 'house', 'finance', 'sports' ] #一共有7个主题,其实不止这7个的 for theme in themeList: print("第一个主题是") tempDList = [] for i in range(1, 12): # 一般是10页就很多的了。10页以内 print("第" + str(i) + "页") responseList = self.getThemeUrl(theme, date, i) if len(responseList) == 0: print("最大页数为" + str(i - 1) + "页") break else: tempList = tempList + responseList tempDList += responseList resultUrlDic[theme] = tempDList print(resultUrlDic) tempList = set(tempList) count = 0 print("列表的url数量有:" + str(len(tempList))) for key in resultUrlDic: count += len(resultUrlDic[key]) print("url总共有" + str(count)) print("这个是PageUrls内的提取到的url") pprint(resultUrlDic) print(len(resultUrlDic)) print("这个开始是list类型的结果") print(tempList) self.dbhelper.saveDicToMysql(resultUrlDic, date, "tengxun") #参数,字典结果集,时间,分类 return tempList #直接这儿去重后
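# A sketch of decoding the "rollback(...)" JSONP payload returned by cpcroll.php
# without eval, assuming the same response shape used above
# (data -> article_info -> url). parse_rollback is an illustrative helper name.
import ast
import json

def parse_rollback(raw_text):
    """Strip the rollback(...) wrapper and return the list of article urls."""
    payload = raw_text
    if payload.startswith("rollback"):
        payload = payload[len("rollback"):].strip().strip("();")
    try:
        dic = json.loads(payload)        # normal case: the payload is plain JSON
    except ValueError:
        dic = ast.literal_eval(payload)  # dict-literal style payloads; still safer than eval
    if not dic.get('data'):
        return []                        # past the last page, nothing left to collect
    return [one['url'].replace("\\", "/") for one in dic['data']['article_info']]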
def getNewsContent( self, url): # 打开news.ifenghxxx的东西,只要打开文章后,然后解析出 首段,尾段,还有中间的普通段落就可以了 title, Hcontent, Tcontent, Acontent = "", "", "", "" t = time.time() timeStamp = str(int(round(t * 1000))) # 毫秒级时间戳 time.sleep(1) print("现在网页是:" + url) downloadTool = Download(r'/home/default/images') # 设置下载路径 dbhelper = DB() title, Hcontent, Tcontent, Acontent = "", "", "", "" simpleP = [] soup = makeBS().makesoup(url) if soup == None: return title, Hcontent, Tcontent, Acontent try: title = soup.find("head").title if dbhelper.ifExists(title): #如果找到已经存在那就不用再写的拉 return title, Hcontent, Tcontent, Acontent #存在的话,就不用再解析和下载图片了 except Exception as e: #一般提示没找到都是 return title, Hcontent, Tcontent, Acontent #也当成没有处理掉就可以了 # print(title) # print("标题是") if title != None: # 如果找到的话 title = title.text.split("_")[0] # title=self.fixCssdot(title) # print(title) else: print("没能找到标题,请检查网址 " + url) # print(soup) return title, Hcontent, Tcontent, Acontent # 不行就返回空的回去 flag = False # 遇到之前先不写入 pList = [] simplePList = [] # 这个是 for p in soup.find_all( "p" ): #找到p就输出来就可以了 你好啊,大哥,我是你的第一个机械硬盘,感觉还行哈,灯光效果怎么样,不怎么会 # print(p.text) try: if p['class'][0] == "p_time": flag = True continue #跳过这个的下面的,继续走 if p['class'][0] == "detailPic": #这儿找到后就下载图片,并且,修改src然后写入进去,就是那个模块了, todo 有空变成模块 imgSrc = p.img['src'] imgName = imgSrc.replace("https://inews.gtimg.com/", "").replace("/", "_") now = time.strftime('%Y%m%d', time.localtime(time.time())) now_date = now + "/" # 后面下载的文件名是不需要带杠的,后面就不需要带杠杠 imgName = now_date + self.stripImgUrl( imgName) # 这儿已经是添加了时间的 了 # print("文件名是" + imgName) # 这儿下载这个图片到服务器指定的地址上 # 这儿下载这个图片到服务器指定的地址上 downloadTool.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now) # now_date = now + "/" # 后面下载的文件名是不需要带杠的,后面就不需要带杠杠 # imgName = now_date + self.stripImgUrl(imgName) #已经处理好了 imgPScr = "/images/" + imgName + ".jpg" # 这个html格式图片地址 #传进来是img的url下载并且修改后当成普通的段落进行处理 pList.append('<img src=' + imgPScr + '/>') # 统一起来把,后面会有加东西 # print("<p><img src='"+p.img['src']+"'/></p>") if p.text == "用微信扫描二维码分享至好友和朋友圈": flag = False #找到了这个就是到结尾了 pass except Exception as e: pass if flag: # print("正在添加") pList.append(p) else: # print("停止添加这一条") break # print(len(pList)) # 这个最后一个句子是尾句, Plen = len(pList) if (len(pList) == 1): # 一个句子的话,首段和尾端都统一起来好了 print("这个文章只有一个句子 " + url) return title, pList[0], pList[0], pList[0] # 这样的话,就不插入 PPlist = [] continueFlag = False for pOne in pList: try: p = pOne.text except Exception: #那就是图片的那个了,我知道是什么来的 p = pOne # print(pOne) if (p != ""): # print(p.strip("\n")) if p.strip( "\n" ) == "用微信扫描二维码分享至好友和朋友圈" and continueFlag == False: #有些有有些没有的 # print("找到第一个这个东西") # continue continueFlag = True elif p.strip( "\n" ) == "用微信扫描二维码分享至好友和朋友圈" and continueFlag == True: #有些有有些没有的 # print("这儿是结束的地方") continueFlag = "break" if continueFlag == True: if (p != "用微信扫描二维码分享至好友和朋友圈"): p = "<p>" + p + "</p>" #这儿是把"'" 这个东西替换成'"' 这个东西 # if p.find("'")!=-1: # print("找到了这个东西") PPlist.append(p) if continueFlag == 'break': break else: pass #图片都会在前面进行处理才对的 #检查一下是不是图片的 # print("文字卫空空那") # if pOne.find("img")!=None: #因为图片是处理过的了,那么就直接加入进去就可以了 # print(p) # print(PPlist) if (len(PPlist)) == 1 and len(PPlist) == 2: #还有2都是这种情况 Hcontent = PPlist[0] Tcontent = PPlist[0] Acontent = PPlist[0] if (len(PPlist)) > 2: Hcontent = PPlist[0] Tcontent = PPlist[-1] for i in PPlist: Acontent = Acontent + i # print(i) # 你好啊 # Acontent = PPlist # print("普通的段落有") simplePList = PPlist[1:-1] # print(p) # print("开头结尾是这个") # print(Hcontent) # print(Tcontent) # print("各个分句子在这儿") for simpleP in simplePList: # 这儿直接写入进去就可以了 dbhelper.insertSimpleP(simpleP) # 这儿这个是一样的 # 
print(simpleP) return title, Hcontent, Tcontent, Acontent
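# A compact sketch of the sentinel-based slicing that getNewsContent performs:
# start collecting <p> tags after the p_time marker and stop at the WeChat share
# footer. collect_between is a hypothetical helper operating on BeautifulSoup tags.
def collect_between(paragraphs, start_class="p_time",
                    sentinel="用微信扫描二维码分享至好友和朋友圈"):
    collected, started = [], False
    for p in paragraphs:
        cls = (p.get("class") or [""])[0]
        if cls == start_class:  # the timestamp paragraph marks the start of the article body
            started = True
            continue
        if p.get_text(strip=True) == sentinel:
            break               # the share footer marks the end of the article body
        if started:
            collected.append(p)
    return collected

# usage sketch:
# pList = collect_between(soup.find_all("p"))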
def saveListToMysql(self, urlList, date):  # write a list of urls into the database, tagged as coming from "wangyi"
    connect = DB()
    for i in urlList:
        connect.insertTenxun(i, date, "wangyi")
def getPlContent(self, url): #打开pl.ifenghxxx的东西 title, Hcontent, Tcontent, Acontent = "", "", "", "" t = time.time() timeStamp = str(int(round(t * 1000))) # 毫秒级时间戳 time.sleep(1) print("现在网页是:" + url) downloadTool = Download(r'/home/default/images') # 设置下载路径 dbhelper = DB() title, Hcontent, Tcontent, Acontent = "", "", "", "" simpleP = [] soup = None soup = makeBS().makesoup(url) if soup != None: title = soup.find("head") else: return title, Hcontent, Tcontent, Acontent # print(title) # print("标题是") if title != None: # 如果找到的话 title = title.text.split("_")[0] # title=self.fixCssdot(title) # print(title) else: print("没能找到标题,请检查网址 " + url) # print(soup) # print() return title, Hcontent, Tcontent, Acontent # 不行就返回空的回去 flag = False #遇到之前先不写入 pList = [] simplePList = [] #这个是 for p in soup.find_all("p"): # print(type(p)) # print(p) try: # print(p['class']) if p['class'][0] == "f14": # print(p) pList.append(p) flag = True if p['class'][0] == "IphoneNone": # print(p['class']) flag = False #结束跳出 except Exception as e: if flag: # print("正在添加") pList.append(p) else: # print("停止添加这一条") break print(len(pList)) #这个最后一个句子是尾句, Plen = len(pList) if (len(pList) == 1): #一个句子的话,首段和尾端都统一起来好了 print("这个文章只有一个句子 " + url) return title, pList[0], pList[0], pList[0] #这样的话,就不插入 for p in pList: # print(p) if p.text != "": pHtml = "<p>" + p.text + "</p>" if pList.index(p) == 0: # 如果发现索引值是第一个的话,那就是开头了 Hcontent = pHtml Acontent = Acontent + pHtml # print("find the header p") # print(phtml) elif pList.index(p) == len(pList) - 1: # 同理如果是最后一个句子,那么你就是结尾了 Acontent = Acontent + pHtml Tcontent = pHtml else: # 不是首段和尾端的端口才加入到零散的段落中去 Acontent = Acontent + pHtml simplePList.append(pHtml) else: #可能是有图片的这个东西,如果有图片那就这百年这样操作 if p.find("img") != None: print("发现图片的段落") for img in p.find_all("img"): #修改图片路径和下载图片这两个操作 #修改里面的图片的地址的东西,然后还有别的什么的 imgSrc = img['src'] now = time.strftime('%Y%m%d', time.localtime(time.time())) now_date = now + "/" # 后面下载的文件名是不需要带杠的,后面就不需要带杠杠 imgName = self.stripImgUrl(imgSrc) print(imgName) imgName = now_date + self.stripImgUrl( imgName) # 这儿已经是添加了时间的 了 # print("文件名是" + imgName) # 这儿下载这个图片到服务器指定的地址上 # 这儿下载这个图片到服务器指定的地址上 downloadTool.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now) imgPScr = "/images/" + imgName + ".jpg" # 这个html格式图片地址 HtmlImg = '<p><img src="' + imgPScr + '"/></p>' if pList.index(p) == 0: # 如果发现索引值是第一个的话,那就是开头了 Hcontent = HtmlImg Acontent = Acontent + HtmlImg elif pList.index( p) == len(pList) - 1: # 同理如果是最后一个句子,那么你就是结尾了 Acontent = Acontent + HtmlImg Tcontent = HtmlImg else: # 不是首段和尾端的端口才加入到零散的段落中去 Acontent = Acontent + HtmlImg simplePList.append(HtmlImg) # print("开头结尾是这个") # print(Hcontent) # print(Tcontent) # print("各个分句子在这儿") for simpleP in simplePList: #这儿直接写入进去就可以了 dbhelper.insertSimpleP(simpleP) # 这儿这个是一样的 # print(simpleP) return title, Hcontent, Tcontent, Acontent
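# A sketch of the image-localisation step shared by getPlContent and
# getNewsContent: derive the dated local file name from the remote src and emit
# the rewritten <img> paragraph. localise_img is an illustrative helper;
# strip_img_url stands in for self.stripImgUrl, and no download happens here.
import time

def localise_img(img_src, strip_img_url, img_root="/images/"):
    now = time.strftime('%Y%m%d', time.localtime(time.time()))
    img_name = now + "/" + strip_img_url(img_src)  # dated sub-folder + sanitised file name
    html_img = '<p><img src="' + img_root + img_name + '.jpg"/></p>'
    return now, img_name, html_img

# usage sketch, mirroring the calls above:
# now, img_name, html_img = localise_img(img['src'], self.stripImgUrl)
# downloadTool.downloadImg(img['src'], imgName=img_name, referer=None, now_date=now)
# Acontent = Acontent + html_img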
class CommentCrawl(object): def __init__(self): self.dbHelper = DB() def changTimeToDate(self,dateString): timeStamp = dateString timeArray = time.localtime(timeStamp) print(timeArray) otherStyleTime = time.strftime("%Y-%m-%d", timeArray) # print(otherStyleTime) return otherStyleTime def getNewsIdAndUrl(self): #提取出新闻的id和url # dbHelper = DB() themeWord = ['car','technology','home','entertainment','house','finance','sports'] #类别新闻 resultDic = {} sqlHead = "select News_id,url from newssentimentanalysis_" sqlTail = "news" # 插入 for theme in themeWord: print(sqlHead+theme+sqlTail) resultDic[theme] = self.dbHelper.__query__(sqlHead+theme+sqlTail)# 返回 return resultDic #返回格式{'car':[{'id':xx,'url':xx},.....,'home'...] def getAwriteCommentJson(self,id,url): #这个是评论专用的请求返回成字典的。 time.sleep(1) cooker = makeBS() commentRawUrl = "http://coral.qq.com/article/" cmt_id = cooker.getCmt_id(url) #去掉空格 if cmt_id==None: return if cmt_id.find("'")!=-1: cmt_id = cmt_id.replace("'","") else : cmt_id = cmt_id.strip() # print( cmt_id.strip() ) #这个用来拼接用到。 try: allUrl = commentRawUrl + str(cmt_id) + "/comment/#" print(allUrl) responseDic = cooker.makeBSjson(allUrl) # if # print() print(responseDic) commentList = responseDic['data']['commentid'] print(commentList) from pprint import pprint for comment in commentList: pprint(type(comment['id'])) print(comment['id']) comment['content'] = emoji.demojize(comment['content']) #过滤emoji comment['userinfo']['nick'] = emoji.demojize(comment['userinfo']['nick']) comment['time']=self.changTimeToDate(comment['time']) #时间戳改成日期字符串 print("新闻id "+ str(id)) print("新闻的url是 "+ url) self.dbHelper.classifyDBComment(url=url,id=id,comment=comment) #插入数据库。 print("") #-----------------------这儿可以合成sql语句的话就可以执行插入的操作了。----------------------- # 通过url来合成插入的sql语句,DBcontrol的方法中来做这些东西 except Exception as e: print("提取此条评论出错,正在跳过") print(e) def getCommentMain(self): resultDic = self.getNewsIdAndUrl() print(resultDic) from pprint import pprint resultList = [] count = 0 for theme in resultDic: print("现在是",theme) for oneNews in resultDic[theme]: count+=1 #这个累加,然后如果是到了一定的数量那就休眠一下 if count%100==0: #每100条 time.sleep(60*2) #休息两分钟。 print(oneNews) #已经提取出来了 self.getAwriteCommentJson(id=oneNews['News_id'],url=oneNews['url']) #逐条插入,进行,这个不需要返回 # resultList.append(oneNews) # 添加进入 print("finish comments crawl!")
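# changTimeToDate above turns a unix timestamp into a "%Y-%m-%d" string via
# time.localtime; an equivalent formulation with datetime, shown only as a sketch.
from datetime import datetime

def timestamp_to_date(ts):
    return datetime.fromtimestamp(ts).strftime("%Y-%m-%d")

# usage sketch:
# timestamp_to_date(1537874915)  -> "2018-09-25" when the local timezone is UTC+8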