def dealItem(self, title, pageTitle, memberInfo, descInfo, result):
    """Build a record for one scraped entry and persist it via insertItem.

    descInfo is machine-translated to Chinese (descInfoCN) when non-empty;
    the record "_id" is "<part>_<url>".
    """
    # Translate the description text; empty input stays empty.
    descInfoCN = mainTranslate(descInfo) if descInfo else ""
    record = {
        "_id": "{}_{}".format(result["part"], result["url"]),
        "url": result["url"],
        "keyWord": result["keyWord"],
        "language": result["language"],
        "name": result["name"],
        "part": result["part"],
        "station": result["station"],
        "title": title,
        "pageTitle": pageTitle,
        "memberInfo": memberInfo,
        "descInfo": descInfo,
        "descInfoCN": descInfoCN,
        "country": "",
    }
    self.insertItem(record)
def parsePageUser(self, response, url, part, name):
    """Parse a YouTube channel response into a user item dict.

    Returns None when the channel URL cannot be resolved, is already stored
    in either Mongo collection, or is already known to the CMMS/MMS service;
    returns {} on unexpected parse errors; otherwise the populated dict.
    """
    try:
        responseBody = json.loads(response)
        try:
            responseText = responseBody[1]
        except Exception:
            # Response does not have the expected [.., payload] shape.
            return None
        # Resolve the canonical channel URL from ownerUrls.
        try:
            urlList = jsonpath.jsonpath(responseBody, "$..ownerUrls")[0]
        except Exception as e:
            logging.error(e)
            urlList = [url]
        userurl = ""
        for candidate in urlList:
            if "www.youtube.com" not in candidate:
                continue
            # Normalize the scheme to https; last matching URL wins.
            userurl = "https://" + candidate.split("://")[-1]
        if not userurl:
            return None
        # Skip channels already stored in either collection.
        result = collection.find_one({
            "part": part,
            "url": userurl,
            "platId": platId
        })
        if result:
            # logging.warn is deprecated; logging.warning is the supported name.
            logging.warning("存在库中part:{},name:{},url:{}".format(
                part, name, userurl))
            return None
        result = formeryoutubecollection.find_one({
            "part": part,
            "url": url,
            "platId": platId
        })
        if result:
            logging.warning("存在库中part:{},name:{},url:{}".format(
                part, name, url))
            return None
        if part == "clothes":
            # Already registered in the CMMS service?
            if checkUrl(userurl, "http://cmms.gloapi.com/"):
                return None
        elif part == "GB":
            # Already registered in the MMS service?
            if checkUrl(userurl, "http://mms.gloapi.com/"):
                return None
        subscriberCount = self.dealSubscriberCount(responseText)
        viewCount = self.dealViewCont(responseText)
        description, descriptionLong = self.dealDescription(responseText)
        blackListall = blackList if part == "GB" else clothesblackList
        # Translate the description to Chinese for black-word matching.
        descriptionChinese = mainTranslate(description) if description.strip() else ""
        blackWord = ""
        blackWordCount = 0
        for word in blackListall:
            if word in description or word in descriptionChinese:
                # BUGFIX: was `word + ""` (no separator), which glued all
                # matched words together and made the later strip() a no-op;
                # sibling functions join matches with a single space.
                blackWord += word + " "
                blackWordCount += 1
        blackWord = blackWord.strip()
        # First e-mail address found in the long description, if any.
        pattern = re.compile(
            r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\b')
        try:
            emailAddress = pattern.search(descriptionLong).group()
        except Exception:
            emailAddress = ""
        country = self.dealCountry(responseText)
        upTitle = self.dealTitle(responseText)
        Facebook, Youtube, Instagram = self.dealLinks(responseText)
        businessEmail = self.dealMail(responseText)
        relateChannel = self.relateChannel(responseText)
        UserItem = {
            "subscriberCount": subscriberCount,
            "description": descriptionChinese,
            "descriptionUn": descriptionLong,
            "country": country,
            "viewCount": viewCount,
            "upTitle": upTitle,
            "Facebook": Facebook,
            "Youtube": Youtube,
            "Instagram": Instagram,
            "emailAddress": emailAddress,
            "isMail": businessEmail,
            "relateChannel": relateChannel.strip(),
            "url": userurl,
            "blackWord": blackWord,
            "blackWordCount": blackWordCount
        }
    except Exception:
        logging.error(traceback.format_exc())
        UserItem = {}
    return UserItem
def parsePageVideo(self, response, videoUrl, part, station, userUrl):
    """Parse a channel's video grid: titles, view counts and white-word score.

    Takes up to self.videoNum videos. Returns {} when the response cannot
    be parsed (any exception falls through to the outer handler).
    """
    def _parse_view_count(text):
        # "1,234次观看" / "1,234人正在观看" -> 1234; None when unparseable.
        try:
            return int(
                text.replace("次观看", "").replace("人正在观看", "").replace(
                    ",", "").strip())
        except Exception:
            return None

    try:
        response = json.loads(response)
        titleList = jsonpath.jsonpath(
            response, "$..gridRenderer.items..title..simpleText")
        # Keep only the first videoNum entries of each list.
        titleList = titleList[:self.videoNum]
        lastUpdateTimeList = jsonpath.jsonpath(
            response, "$..gridRenderer.items..publishedTimeText.simpleText")
        lastUpdateTimeList = lastUpdateTimeList[:self.videoNum]
        viewCountTextList = jsonpath.jsonpath(
            response, "$..gridRenderer.items..viewCountText.simpleText")
        viewCountTextList = viewCountTextList[:self.videoNum]
        viewCountList = []
        totalViewCount = 0
        for viewCountText in viewCountTextList:
            count = _parse_view_count(viewCountText)
            if count is None:
                continue
            viewCountList.append(count)
            totalViewCount += count
        # zip truncates to the shortest of the three lists (original behavior).
        videoTittle = "".join(
            t + "\n" for t, _time, _views in zip(titleList, lastUpdateTimeList,
                                                 viewCountList))
        videoTittleChinese = mainTranslate(videoTittle) if videoTittle.strip() else ""
        whiteListall = whiteList if part == "GB" else clotheswhiteList
        if station == "Zaful":
            whiteListall = zafulWhiltList
        VideoTitleCount = 0
        whiteWord = ""
        # Hoist the loop-invariant lowercasing out of the word loop.
        titlesLower = videoTittle.lower()
        titlesCnLower = videoTittleChinese.lower()
        for word in whiteListall:
            lowered = word.lower()
            if lowered in titlesLower or lowered in titlesCnLower:
                VideoTitleCount += 1
                whiteWord += word + " "
        logging.error("part:{},匹配度等于{}分,videoUrl:{},匹配单词:{}".format(
            part, VideoTitleCount, userUrl, whiteWord.strip()))
        # str.split never raises; no try/except needed here.
        titleFirst = videoTittleChinese.split("\n")[0]
        viewCountFirst = _parse_view_count(
            viewCountTextList[0]) if viewCountTextList else None
        if viewCountFirst is None:
            viewCountFirst = 0
        item = {
            "videoTittle": videoTittleChinese,
            "videotitleUn": videoTittle,
            # Raises ZeroDivisionError on an empty titleList -> caught below,
            # returning {} exactly as the original did.
            "viewCountAvg": int(totalViewCount / len(titleList)),
            "titleLastUpdateTime": lastUpdateTimeList[0],
            "whiteWord": whiteWord.strip(),
            "VideoTitleCount": VideoTitleCount,
            "titleFirst": titleFirst,
            "viewCountFirst": viewCountFirst
        }
    except Exception:
        item = {}
    return item
def dealHeaderFooterInfo(selector):
    """Extract page header/footer text, translate it, count black-list words.

    Returns (headerStr, footerStr, headerZH, footerZH, fhBlackWord,
    fhBlackWordCount).

    NOTE(review): a later definition with the same name in this file replaces
    this one at import time, so this version is effectively dead code —
    confirm before relying on it.
    """
    def _join_texts(textList):
        # De-duplicate, drop newlines/blank fragments, join with commas.
        # NOTE: set() makes fragment order nondeterministic — same as the
        # original implementation.
        parts = []
        for text in set(textList):
            text = text.replace("\n", "").strip()
            if text:
                parts.append(text)
        return ",".join(parts)

    # ----- header text: <header>, then header-ish class/id, then <head> -----
    textList = selector.xpath("//header//text()")
    if not textList:
        node = selector.xpath(
            "//*[contains(@class,'header')]") + selector.xpath(
                "//*[contains(@id,'header')]") + selector.xpath(
                    "//*[contains(@class,'Header')]") + selector.xpath(
                        "//*[contains(@id,'Header')]")
        if node:
            textList = node[0].xpath(".//text()")
        else:
            node = selector.xpath("//head")
            textList = node[0].xpath(".//text()") if node else []
    headerStr = _join_texts(textList)

    # ----- footer text: <footer>, then footer-ish class/id nodes -----
    textList = selector.xpath("//footer//text()")
    if not textList:
        node = selector.xpath(
            "//*[contains(@class,'footer')]") + selector.xpath(
                "//*[contains(@id,'footer')]") + selector.xpath(
                    "//*[contains(@class,'Footer')]") + selector.xpath(
                        "//*[contains(@id,'Footer')]")
        textList = node[0].xpath(".//text()") if node else []
    footerStr = _join_texts(textList)

    # Translate the non-empty parts, capped at the first 4000 characters.
    headerZH = mainTranslate(headerStr[:4000]) if headerStr else ""
    footerZH = mainTranslate(footerStr[:4000]) if footerStr else ""

    # Count black-list words across original and translated text.
    fhBlackWord = ""
    fhBlackWordCount = 0
    for word in blackWordList:
        if word in footerStr or word in headerZH or word in footerZH or word in headerStr:
            fhBlackWord += word + " "
            fhBlackWordCount += 1
    fhBlackWord = fhBlackWord.strip()
    return headerStr, footerStr, headerZH, footerZH, fhBlackWord, fhBlackWordCount
def dealResponse(responseBody, mongoUrl):
    """Extract title, meta description, social links and word-list stats
    from an HTML page body.

    Returns a 19-tuple; note blackStr appears twice (positions 3 and 19) —
    kept for caller compatibility.
    """
    whiteNum = 0
    blackNum = 0
    blackStr = ""
    whiteStr = ""
    headerStr, footerStr = "", ""
    headerZH, footerZH = "", ""
    fhBlackWord, fhBlackWordCount = "", 0
    facebook, instagram, youtube, twitter, title, desc, titleChinese, emailStr = "", "", "", "", "", "", "", ""
    try:
        selector = etree.HTML(responseBody)
    except Exception as e:
        logging.error(e)
        # Retry with decoded bytes when lxml rejects the raw body.
        selector = etree.HTML(responseBody.decode())
    # Collect e-mail addresses found anywhere on the page.
    emailStr = getMailPage(responseBody, selector)
    # Page <title>.
    # NOTE(review): replace(' ', ' ') renders as space->space here —
    # presumably the first argument was a non-breaking space; verify
    # against the original file.
    try:
        title = selector.xpath('//title/text()')[0].replace('\n', '').replace(
            ' ', ' ').strip()
    except Exception:
        logging.error("url:{}".format(mongoUrl))
        title = ""
    # Meta description, trying both capitalizations of the name attribute.
    try:
        desc = selector.xpath(
            '//meta[@name="description"]/@content')[0].replace('\n', '').replace(
                ' ', ' ')
    except Exception:
        try:
            desc = selector.xpath(
                '//meta[@name="Description"]/@content')[0].replace(
                    '\n', '').replace(' ', ' ')
        except Exception:
            desc = ""
    # Title + description, translated and matched against the word lists.
    titleDesc = title + "\n" + desc
    try:
        # NOTE(review): always truthy — it contains "\n" even when both
        # parts are empty, so the else branch is unreachable.
        if titleDesc:
            titleChinese = mainTranslate(titleDesc)
            for bd in black:
                if bd in titleChinese:
                    blackNum += 1
                    blackStr += bd + " "
                    logging.error("存在黑名单,word:{},url:{}".format(
                        bd, mongoUrl))
            blackStr = blackStr.strip()
            for td in white:
                if td in titleChinese:
                    whiteNum += 1
                    whiteStr += td + " "
            whiteStr = whiteStr.strip()
        else:
            titleChinese = ""
    except Exception:
        titleChinese = ""
    # First matching social link of each kind wins.
    for lid in li:
        for keys in selector.xpath(lid[0]):
            if ('facebook' in str(keys)) and (facebook == ''):
                facebook = lid[1] + str(keys)
            elif ('instagram' in str(keys)) and (instagram == ''):
                instagram = lid[1] + str(keys)
            elif ('twitter' in str(keys)) and (twitter == ''):
                twitter = lid[1] + str(keys)
            elif ('youtube' in str(keys)) and (youtube == ''):
                youtube = lid[1] + str(keys)
    headerStr, footerStr, headerZH, footerZH, fhBlackWord, fhBlackWordCount = dealHeaderFooterInfo(
        selector)
    return fhBlackWord, fhBlackWordCount, blackStr, headerZH, footerZH, headerStr, footerStr, blackNum, whiteNum, whiteStr, title.strip(
    ), desc.strip(), titleChinese.strip(), emailStr, facebook.strip(
    ), instagram.strip(), youtube.strip(), twitter.strip(), blackStr.strip()
def dealHeaderFooterInfo(selector):
    """Extract header/footer text (broader sweep than the earlier version of
    this function, which this definition replaces), translate it, and count
    black-list words.

    Returns (headerStr, footerStr, headerZH, footerZH, fhBlackWord,
    fhBlackWordCount); on any unexpected error the defaults collected so far
    are returned.
    """
    headerStr, footerStr, headerZH, footerZH, fhBlackWord, fhBlackWordCount = "", "", "", "", "", 0
    try:
        def _join_texts(textList):
            # De-duplicate, drop newlines/blank fragments, join with commas.
            # NOTE: set() makes fragment order nondeterministic — same as
            # the original implementation.
            parts = []
            for text in set(textList):
                text = text.replace("\n", "").strip()
                if text:
                    parts.append(text)
            return ",".join(parts)

        # --- header: <header> text + every header-ish class/id node + <head> ---
        textList = selector.xpath("//header//text()")
        node = selector.xpath(
            "//*[contains(@class,'header')]") + selector.xpath(
                "//*[contains(@id,'header')]") + selector.xpath(
                    "//*[contains(@class,'Header')]") + selector.xpath(
                        "//*[contains(@id,'Header')]")
        for i in node:
            try:
                textList += i.xpath(".//text()")
            except Exception:
                # BUGFIX: previously reset textList to [] here, discarding
                # text already collected from earlier nodes; skip the
                # failing node instead.
                continue
        textList += selector.xpath("//head//text()")
        if not textList:
            # Last resort: all text in the document.
            try:
                textList = selector.xpath("//html//text()")
            except Exception:
                textList = []
        headerStr = _join_texts(textList)

        # --- footer: <footer> text + every footer-ish class/id node ---
        try:
            textList = selector.xpath("//footer//text()")
        except Exception:
            textList = []
        node = selector.xpath(
            "//*[contains(@class,'footer')]") + selector.xpath(
                "//*[contains(@id,'footer')]") + selector.xpath(
                    "//*[contains(@class,'Footer')]") + selector.xpath(
                        "//*[contains(@id,'Footer')]")
        for i in node:
            try:
                textList += i.xpath(".//text()")
            except Exception:
                # BUGFIX: same as above — do not wipe collected text.
                continue
        if not textList:
            # Fall back to the header text when no footer exists
            # (original behavior).
            footerStr = headerStr
        else:
            footerStr = _join_texts(textList)

        # Translate the non-empty parts, capped at 4000 characters.
        headerZH = mainTranslate(headerStr[:4000]) if headerStr else ""
        # NOTE: the footer uses the LAST 4000 characters, unlike the header.
        footerZH = mainTranslate(footerStr[-4000:]) if footerStr else ""

        # Count black-list words across original and translated text.
        fhBlackWord = ""
        fhBlackWordCount = 0
        for word in blackWordList:
            if word in footerStr or word in headerZH or word in footerZH or word in headerStr:
                fhBlackWord += word + " "
                fhBlackWordCount += 1
        fhBlackWord = fhBlackWord.strip()
    except Exception:
        logging.error(traceback.format_exc())
    return headerStr, footerStr, headerZH, footerZH, fhBlackWord, fhBlackWordCount