コード例 #1
0
ファイル: telegram.py プロジェクト: sheng2333/new_resources
    def dealItem(self, title, pageTitle, memberInfo, descInfo, result):
        """Assemble the storage document for one crawled page and insert it.

        The identifying fields are copied from ``result``; the scraped
        ``title``, ``pageTitle``, ``memberInfo`` and ``descInfo`` values are
        attached as-is. ``descInfoCN`` holds ``mainTranslate(descInfo)``, or
        an empty string when ``descInfo`` is falsy. The finished document is
        handed to ``self.insertItem``; nothing is returned.
        """
        # Translate the description only when there is one.
        translated = mainTranslate(descInfo) if descInfo else ""

        # The document id is the part and the url joined by an underscore.
        record = {"_id": result["part"] + "_" + result["url"]}
        for key in ("url", "keyWord", "language", "name", "part", "station"):
            record[key] = result[key]
        record.update({
            "title": title,
            "pageTitle": pageTitle,
            "memberInfo": memberInfo,
            "descInfo": descInfo,
            "descInfoCN": translated,
            "country": "",
        })
        self.insertItem(record)
コード例 #2
0
    def parsePageUser(self, response, url, part, name):
        """Parse a channel JSON response into a user-profile dict.

        Args:
            response: JSON text. Element ``[1]`` of the decoded value is the
                payload passed to the ``deal*`` helper methods.
            url: URL the response was fetched for; used as the fallback
                candidate when no ``ownerUrls`` entry is found, and for the
                lookup in ``formeryoutubecollection``.
            part: Business line. ``"clothes"`` and ``"GB"`` trigger a remote
                ``checkUrl`` lookup; ``"GB"`` selects ``blackList``, every
                other value selects ``clothesblackList``.
            name: Only used in log messages.

        Returns:
            ``None`` when the payload element is missing, when no candidate
            URL contains ``www.youtube.com``, or when the channel is already
            known (either collection, or the remote check is truthy).
            ``{}`` when an unexpected error occurs (the traceback is logged).
            Otherwise a dict describing the channel.
        """
        try:
            responseBody = json.loads(response)
            try:
                responseText = responseBody[1]
            except (IndexError, KeyError, TypeError):
                # No payload element: nothing to parse.
                return None

            # Resolve the channel URL.
            try:
                urlList = jsonpath.jsonpath(responseBody, "$..ownerUrls")[0]
            except Exception as e:
                logging.error(e)
                urlList = [url]
            userurl = ""
            for candidate in urlList:
                if "www.youtube.com" in candidate:
                    # The last matching candidate wins; the scheme is forced
                    # to https.
                    userurl = "https://" + candidate.split("://")[-1]
            if not userurl:
                return None

            # Skip channels that are already stored in the database.
            result = collection.find_one({
                "part": part,
                "url": userurl,
                "platId": platId
            })
            if result:
                logging.warning("存在库中part:{},name:{},url:{}".format(
                    part, name, userurl))
                return None
            # NOTE(review): this lookup uses the requested ``url`` rather than
            # the resolved ``userurl`` -- kept as in the original; confirm it
            # is intentional.
            result = formeryoutubecollection.find_one({
                "part": part,
                "url": url,
                "platId": platId
            })
            if result:
                logging.warning("存在库中part:{},name:{},url:{}".format(
                    part, name, url))
                return None

            # Skip channels the remote service (cmms for "clothes", mms for
            # "GB") already reports as existing.
            remoteDomains = {
                "clothes": "http://cmms.gloapi.com/",
                "GB": "http://mms.gloapi.com/",
            }
            domain = remoteDomains.get(part)
            if domain and checkUrl(userurl, domain):
                return None

            # Subscriber count
            subscriberCount = self.dealSubscriberCount(responseText)

            # View count
            viewCount = self.dealViewCont(responseText)

            # Channel description (short and long form)
            description, descriptionLong = self.dealDescription(responseText)
            blackListall = blackList if part == "GB" else clothesblackList
            # Translate the description to Chinese (skipped when blank).
            if description.strip():
                descriptionChinese = mainTranslate(description)
            else:
                descriptionChinese = ""
            matchedWords = [
                word for word in blackListall
                if word in description or word in descriptionChinese
            ]
            # Space-separated, matching how the white-word lists elsewhere in
            # this code base are stored (the previous code glued the words
            # together without any separator).
            blackWord = " ".join(matchedWords).strip()
            blackWordCount = len(matchedWords)

            # First e-mail address found in the long description, if any.
            pattern = re.compile(
                r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\b')
            try:
                emailAddress = pattern.search(descriptionLong).group()
            except (AttributeError, TypeError):
                # No match (search returned None) or a non-text description.
                emailAddress = ""

            # Country
            country = self.dealCountry(responseText)

            # Channel title
            upTitle = self.dealTitle(responseText)

            # Social links
            Facebook, Youtube, Instagram = self.dealLinks(responseText)

            # Business e-mail
            businessEmail = self.dealMail(responseText)

            # Related channels
            relateChannel = self.relateChannel(responseText)

            UserItem = {
                "subscriberCount": subscriberCount,
                "description": descriptionChinese,
                "descriptionUn": descriptionLong,
                "country": country,
                "viewCount": viewCount,
                "upTitle": upTitle,
                "Facebook": Facebook,
                "Youtube": Youtube,
                "Instagram": Instagram,
                "emailAddress": emailAddress,
                "isMail": businessEmail,
                "relateChannel": relateChannel.strip(),
                "url": userurl,
                "blackWord": blackWord,
                "blackWordCount": blackWordCount
            }
        except Exception:
            logging.error(traceback.format_exc())
            UserItem = {}
        return UserItem
コード例 #3
0
    def parsePageVideo(self, response, videoUrl, part, station, userUrl):
        """Summarise the most recent videos listed in a channel JSON response.

        Args:
            response: JSON text of the channel's video grid.
            videoUrl: Not used by this method.
            part: ``"GB"`` selects ``whiteList``; any other value selects
                ``clotheswhiteList``, or ``zafulWhiltList`` when ``station``
                is ``"Zaful"``.
            station: See ``part``.
            userUrl: Only used in log messages.

        Returns:
            A dict with the joined titles (original and translated), the
            average and first view counts, the first publish-time text and
            the matched white-list words. ``{}`` when titles, publish times
            or view counts cannot be found, or when any error occurs (the
            error is logged).
        """

        def parseViewCount(text):
            """Turn a Chinese view-count label into an int."""
            cleaned = text.replace("次观看", "").replace("人正在观看", "")
            return int(cleaned.replace(",", "").strip())

        try:
            response = json.loads(response)
            titleList = jsonpath.jsonpath(
                response, "$..gridRenderer.items..title..simpleText")
            lastUpdateTimeList = jsonpath.jsonpath(
                response,
                "$..gridRenderer.items..publishedTimeText.simpleText")
            viewCountTextList = jsonpath.jsonpath(
                response, "$..gridRenderer.items..viewCountText.simpleText")
            # jsonpath yields a falsy value when nothing matches; a channel
            # without videos is expected, so return quietly instead of
            # relying on an exception.
            if not (titleList and lastUpdateTimeList and viewCountTextList):
                return {}

            # Only the first ``self.videoNum`` entries are considered.
            titleList = titleList[:self.videoNum]
            lastUpdateTimeList = lastUpdateTimeList[:self.videoNum]
            viewCountTextList = viewCountTextList[:self.videoNum]

            viewCountList = []
            for viewCountText in viewCountTextList:
                try:
                    viewCountList.append(parseViewCount(viewCountText))
                except (AttributeError, TypeError, ValueError):
                    # Labels that do not parse as a number are skipped.
                    continue
            totalViewCount = sum(viewCountList)

            # zip() stops at the shortest list, so only as many titles are
            # kept as there are publish times and parsed view counts.
            videoTittle = "".join(
                title + "\n" for title, _, _ in zip(
                    titleList, lastUpdateTimeList, viewCountList))
            if videoTittle.strip():
                videoTittleChinese = mainTranslate(videoTittle)
            else:
                videoTittleChinese = ""

            if part == "GB":
                whiteListall = whiteList
            elif station == "Zaful":
                whiteListall = zafulWhiltList
            else:
                whiteListall = clotheswhiteList

            # Case-insensitive match against original and translated titles.
            titlesLower = videoTittle.lower()
            titlesChineseLower = videoTittleChinese.lower()
            matchedWords = [
                word for word in whiteListall
                if word.lower() in titlesLower
                or word.lower() in titlesChineseLower
            ]
            VideoTitleCount = len(matchedWords)
            whiteWord = " ".join(matchedWords).strip()

            logging.error("part:{},匹配度等于{}分,videoUrl:{},匹配单词:{}".format(
                part, VideoTitleCount, userUrl, whiteWord))

            try:
                titleFirst = videoTittleChinese.split("\n")[0]
            except (AttributeError, TypeError):
                titleFirst = ""

            try:
                viewCountFirst = parseViewCount(viewCountTextList[0])
            except (AttributeError, IndexError, TypeError, ValueError):
                viewCountFirst = 0

            item = {
                "videoTittle": videoTittleChinese,
                "videotitleUn": videoTittle,
                "viewCountAvg": int(totalViewCount / len(titleList)),
                "titleLastUpdateTime": lastUpdateTimeList[0],
                "whiteWord": whiteWord,
                "VideoTitleCount": VideoTitleCount,
                "titleFirst": titleFirst,
                "viewCountFirst": viewCountFirst
            }
        except Exception:
            # Previously swallowed silently; keep the best-effort "{}" result
            # but leave a trace so failures can be diagnosed.
            logging.exception(
                "parsePageVideo failed, userUrl:{}".format(userUrl))
            item = {}
        return item
コード例 #4
0
def dealHeaderFooterInfo(selector):
    """Extract header/footer text, translate it and flag black-listed words.

    Header text comes from ``<header>``. If that yields nothing, it comes
    from the first element whose class or id contains ``header``/``Header``,
    and failing that from ``<head>``. Footer text is looked up the same way
    (``<footer>``, then class/id ``footer``/``Footer``) without the
    ``<head>`` fallback.

    Args:
        selector: Parsed document exposing ``xpath()``.

    Returns:
        Tuple ``(headerStr, footerStr, headerZH, footerZH, fhBlackWord,
        fhBlackWordCount)``: the comma-joined header and footer texts, the
        results of ``mainTranslate`` on their first 4000 characters (empty
        string when the source text is empty), the space-separated words of
        ``blackWordList`` found in any of those four strings, and how many
        were found.
    """

    def findNodes(expressions):
        """Concatenate the matches of every XPath expression, in order."""
        nodes = []
        for expression in expressions:
            nodes += selector.xpath(expression)
        return nodes

    def joinTexts(textList):
        """Join the distinct, non-blank text nodes with commas."""
        # dict.fromkeys de-duplicates while keeping document order; a set
        # made the order (and therefore what the 4000-character cut keeps)
        # vary from run to run.
        cleaned = (
            text.replace("\n", "").strip() for text in dict.fromkeys(textList))
        return ",".join(text for text in cleaned if text)

    textList = selector.xpath("//header//text()")
    if not textList:
        nodes = findNodes((
            "//*[contains(@class,'header')]",
            "//*[contains(@id,'header')]",
            "//*[contains(@class,'Header')]",
            "//*[contains(@id,'Header')]",
        )) or selector.xpath("//head")
        # Only the first matching node is read.
        textList = nodes[0].xpath(".//text()") if nodes else []
    headerStr = joinTexts(textList)

    textList = selector.xpath("//footer//text()")
    if not textList:
        nodes = findNodes((
            "//*[contains(@class,'footer')]",
            "//*[contains(@id,'footer')]",
            "//*[contains(@class,'Footer')]",
            "//*[contains(@id,'Footer')]",
        ))
        textList = nodes[0].xpath(".//text()") if nodes else []
    footerStr = joinTexts(textList)

    # Each side is translated on its own, and only when it has content.
    headerZH = mainTranslate(headerStr[:4000]) if headerStr else ""
    footerZH = mainTranslate(footerStr[:4000]) if footerStr else ""

    matchedWords = [
        word for word in blackWordList
        if any(word in text
               for text in (footerStr, headerZH, footerZH, headerStr))
    ]
    fhBlackWord = " ".join(matchedWords).strip()
    fhBlackWordCount = len(matchedWords)

    return headerStr, footerStr, headerZH, footerZH, fhBlackWord, fhBlackWordCount
コード例 #5
0
def dealResponse(responseBody, mongoUrl):
    """Extract title, description, contact and keyword data from a page.

    Args:
        responseBody: Page content passed to ``etree.HTML``; if parsing
            raises, ``responseBody.decode()`` is parsed instead.
        mongoUrl: URL of the page; only used in log messages.

    Returns:
        A 19-element tuple, in this order: ``fhBlackWord``,
        ``fhBlackWordCount``, ``blackStr``, ``headerZH``, ``footerZH``,
        ``headerStr``, ``footerStr``, ``blackNum``, ``whiteNum``,
        ``whiteStr``, ``title``, ``desc``, ``titleChinese``, ``emailStr``,
        ``facebook``, ``instagram``, ``youtube``, ``twitter`` and
        ``blackStr`` again (callers rely on the positions, so the duplicate
        is kept).
    """
    whiteNum = 0
    blackNum = 0
    blackStr = ""
    whiteStr = ""
    facebook = instagram = youtube = twitter = ""
    try:
        selector = etree.HTML(responseBody)
    except Exception as e:
        # Presumably bytes that the parser refused; retry with decoded text.
        logging.error(e)
        selector = etree.HTML(responseBody.decode())

    # E-mail addresses found on the page
    emailStr = getMailPage(responseBody, selector)

    # Page title
    try:
        title = selector.xpath('//title/text()')[0].replace('\n', '').replace(
            '  ', ' ').strip()
    except Exception:
        logging.error("url:{}".format(mongoUrl))
        title = ""

    # Meta description; the capitalised attribute value is tried second.
    desc = ""
    for metaExpression in ('//meta[@name="description"]/@content',
                           '//meta[@name="Description"]/@content'):
        try:
            desc = selector.xpath(metaExpression)[0].replace(
                '\n', '').replace('  ', ' ')
            break
        except Exception:
            continue

    # Title and description are translated together.
    titkeDesc = title + "\n" + desc
    titleChinese = ""
    # The joined string always contains "\n", so test the stripped value;
    # otherwise the translator is called even when both parts are empty.
    if titkeDesc.strip():
        try:
            titleChinese = mainTranslate(titkeDesc)
            for bd in black:
                if bd in titleChinese:
                    blackNum += 1
                    blackStr += bd + " "
                    logging.error("存在黑名单,word:{},url:{}".format(
                        bd, mongoUrl))
            blackStr = blackStr.strip()
            for td in white:
                if td in titleChinese:
                    whiteNum += 1
                    whiteStr += td + " "
            whiteStr = whiteStr.strip()
        except Exception:
            # Best effort: keep going without a translation, but leave a
            # trace instead of swallowing the error silently.
            logging.exception("title translation failed, url:{}".format(
                mongoUrl))
            titleChinese = ""

    # Social links: each entry of ``li`` pairs an XPath expression with a
    # prefix; the first hit per network wins.
    for lid in li:
        for keys in selector.xpath(lid[0]):
            link = str(keys)
            if 'facebook' in link and facebook == '':
                facebook = lid[1] + link
            elif 'instagram' in link and instagram == '':
                instagram = lid[1] + link
            elif 'twitter' in link and twitter == '':
                twitter = lid[1] + link
            elif 'youtube' in link and youtube == '':
                youtube = lid[1] + link

    headerStr, footerStr, headerZH, footerZH, fhBlackWord, fhBlackWordCount = dealHeaderFooterInfo(
        selector)
    return (fhBlackWord, fhBlackWordCount, blackStr, headerZH, footerZH,
            headerStr, footerStr, blackNum, whiteNum, whiteStr,
            title.strip(), desc.strip(), titleChinese.strip(), emailStr,
            facebook.strip(), instagram.strip(), youtube.strip(),
            twitter.strip(), blackStr.strip())
コード例 #6
0
ファイル: webspider.py プロジェクト: sheng2333/new_resources
def dealHeaderFooterInfo(selector):
    """Extract header/footer text, translate it and flag black-listed words.

    Header text is the union of ``<header>`` text, the text of every element
    whose class or id contains ``header``/``Header``, and ``<head>`` text.
    If that yields nothing, the text of the whole ``<html>`` element is used.
    Footer text is the union of ``<footer>`` text and the text of every
    element whose class or id contains ``footer``/``Footer``. When no footer
    text node is found at all, the header string is reused as the footer
    string.

    Any exception is logged with its traceback and whatever was computed up
    to that point is returned; values not yet computed keep their defaults
    (empty strings and ``0``).

    Args:
        selector: Parsed document exposing ``xpath()``.

    Returns:
        Tuple ``(headerStr, footerStr, headerZH, footerZH, fhBlackWord,
        fhBlackWordCount)``. ``headerZH`` is ``mainTranslate`` applied to the
        first 4000 characters of ``headerStr``; ``footerZH`` is it applied
        to the last 4000 characters of ``footerStr``.
    """
    headerStr, footerStr, headerZH, footerZH, fhBlackWord, fhBlackWordCount = "", "", "", "", "", 0

    def addKeywordNodeTexts(texts, expressions):
        """Append the text nodes of every element matched by *expressions*."""
        nodes = []
        for expression in expressions:
            nodes += selector.xpath(expression)
        for node in nodes:
            try:
                texts += node.xpath(".//text()")
            except Exception:
                # Skip just this node; the previous code reset the whole
                # list here and lost everything collected so far.
                continue
        return texts

    def joinTexts(textList):
        """Join the distinct, non-blank text nodes with commas."""
        # dict.fromkeys de-duplicates while keeping document order; a set
        # made the order (and therefore what the 4000-character cut keeps)
        # vary from run to run.
        cleaned = (
            text.replace("\n", "").strip() for text in dict.fromkeys(textList))
        return ",".join(text for text in cleaned if text)

    try:
        textList = addKeywordNodeTexts(
            list(selector.xpath("//header//text()")), (
                "//*[contains(@class,'header')]",
                "//*[contains(@id,'header')]",
                "//*[contains(@class,'Header')]",
                "//*[contains(@id,'Header')]",
            ))
        textList += selector.xpath("//head//text()")

        if not textList:
            # Last resort: every text node of the document.
            try:
                textList = selector.xpath("//html//text()")
            except Exception:
                textList = []
        headerStr = joinTexts(textList)

        try:
            textList = list(selector.xpath("//footer//text()"))
        except Exception:
            textList = []
        textList = addKeywordNodeTexts(textList, (
            "//*[contains(@class,'footer')]",
            "//*[contains(@id,'footer')]",
            "//*[contains(@class,'Footer')]",
            "//*[contains(@id,'Footer')]",
        ))
        # No footer text node at all: fall back to the header string.
        footerStr = joinTexts(textList) if textList else headerStr

        # Each side is translated on its own, and only when it has content;
        # the footer keeps its tail rather than its head.
        headerZH = mainTranslate(headerStr[:4000]) if headerStr else ""
        footerZH = mainTranslate(footerStr[-4000:]) if footerStr else ""

        matchedWords = [
            word for word in blackWordList
            if any(word in text
                   for text in (footerStr, headerZH, footerZH, headerStr))
        ]
        fhBlackWord = " ".join(matchedWords).strip()
        fhBlackWordCount = len(matchedWords)
    except Exception:
        logging.error(traceback.format_exc())

    return headerStr, footerStr, headerZH, footerZH, fhBlackWord, fhBlackWordCount