Example #1
    def parse_body(self, response):
        # content = response.xpath("//body//text()").extract()
        # content1 = response.xpath("//body//script//text()").extract()
        # content1 += response.xpath("//body//style//text()").extract()
        # content = [x for x in content if x not in content1]
        
        # content = list(map(lambda x: re.sub(r'[\\n \\r \\t]','',x.strip()), content))
        # content = list(filter(None, content))
        # contentItem['content'] = ';'.join(content)

        contentItem = linkBodyItem()
        # body is bytes, text is str; the text still contains ' \r \n \t \
        tmp = response.text
        tmp = re.sub(r'\n', '', tmp)
        tmp = re.sub(r'\r', '', tmp)
        tmp = re.sub(r'\t', '', tmp)
        tmp = re.sub(r'"', "'", tmp)
        tmp = html.unescape(tmp)
        tmp = re.sub(r'gb2312', 'utf-8', tmp)

        pattern = re.compile(r'<script.*?</script>')
        tmp = pattern.sub(r'', tmp)
        pattern = re.compile(r'<scip.*?</scip>')
        tmp = pattern.sub(r'', tmp)

        contentItem['content'] = tmp
        contentItem['address'] = response.url

        str2 = ""
        if "baike.baidu.com" in response.url:
            contentItem['intro'] = ''.join(response.xpath('//div[@class="lemma-summary"]/div[@class="para"]/text()[1]').extract())
        elif "wikipedia.org" in response.url:                   
            contentItem['intro'] = ''.join(response.xpath('//div[@id="mw-content-text"]/div/p[1]//text()').extract())
        elif 'facebook.com' in response.url:
            str1 = response.url
            k = str1.find('/', 25)  # slash that ends the page name
            m = str1.find('/', 8)   # first slash after the scheme and host
            # e.g. https://www.facebook.com/<page>/... -> https://www.facebook.com/pg/<page>/about/
            str2 = str1[:m+1] + 'pg' + str1[m:k+1] + 'about/'
            contentItem['intro'] = str2
        elif 'twitter.com' in response.url:
            # //*[@id="react-root"]/div/div/div/main/div/div[2]/div/div/div/div/div[2]/div/div/div[1]/div/div[3]/div//text()  登录
            # //*[@id="page-container"]/div[2]/div/div/div[1]/div/div/div/div[1]/p//text()   未登录
            contentItem['intro'] = ''.join(response.xpath('//*[@id="page-container"]/div[2]/div/div/div[1]/div/div/div/div[1]/p//text()').extract())
        else:
            contentItem['intro'] = " "

        yield contentItem
        if str2 !="":
            yield Request(url = str2, callback=lambda response, address = str1: self.parse_facebook_intro(response, address), dont_filter=True)
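A note on the cleanup above: the chained re.sub calls plus the <script.*?</script> pattern amount to a hand-rolled tag stripper, and the script regex only works here because newlines were already removed. Scrapy already depends on w3lib, whose HTML helpers do the same job more robustly; a minimal sketch of the equivalent cleanup (the clean_body name is mine, not from the listing):

    import html

    from w3lib.html import remove_tags, remove_tags_with_content

    def clean_body(text):
        # Drop <script>/<style> elements together with their contents,
        # then strip the remaining tags and decode HTML entities.
        text = remove_tags_with_content(text, which_ones=('script', 'style'))
        text = remove_tags(text)
        text = html.unescape(text)
        return ' '.join(text.split())  # collapses \r, \n, \t and runs of spaces

Unescaping after tag removal also sidesteps a subtle ordering bug in the original: calling html.unescape before stripping tags can turn &lt;...&gt; entities into fresh tags that the later regexes then eat.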
Example #2
    def parse_body(self, response, key, title):
        contentItem = linkBodyItem()
        # body is bytes, text is str; the text still contains ' \r \n \t \
        tmp = response.text

        # tmp = re.sub(r'\n', '', tmp)
        tmp = re.sub(r'\r', '', tmp)
        tmp = re.sub(r'\t', '', tmp)
        tmp = re.sub(r'"', "'", tmp)
        tmp = html.unescape(tmp)
        # tmp = re.sub(r'gb2312', 'utf-8', tmp)

        pattern = re.compile(r'<script.*?</script>', re.DOTALL)
        tmp = pattern.sub(r'', tmp)
        pattern = re.compile(r'<scip.*?</scip>', re.DOTALL)
        tmp = pattern.sub(r'', tmp)
        pattern = re.compile(r'<style.*?</style>', re.DOTALL)
        tmp = pattern.sub(r'', tmp)
        # pattern = re.compile(r'<.*?</[\w\W]*?>', re.DOTALL)
        # tmp = pattern.sub(r'', tmp)
        pattern = re.compile(r'<[\w\W]*?>')
        tmp = pattern.sub(r'', tmp)

        tmp = re.sub(r'\n+', '\n', tmp)
        tmp = " ".join(tmp.split())

        contentItem['content'] = tmp
        contentItem['key'] = key
        contentItem['title'] = title
        contentItem['address'] = response.url

        str2 = ""
        if "baike.baidu.com" in response.url:
            contentItem['intro'] = ''.join(
                response.xpath(
                    '//div[@class="lemma-summary"]/div[@class="para"]/text()[1]'
                ).extract())
        elif "wikipedia.org" in response.url:
            contentItem['intro'] = ''.join(
                response.xpath(
                    '//div[@id="mw-content-text"]/div/p[1]//text()').extract())
        elif 'facebook.com' in response.url:
            str1 = response.url
            k = str1.find('/', 25)
            m = str1.find('/', 8)
            str2 = str1[:m + 1] + 'pg' + str1[m:k + 1] + 'about/'
            contentItem['intro'] = str2
        elif 'twitter.com' in response.url:
            # //*[@id="react-root"]/div/div/div/main/div/div[2]/div/div/div/div/div[2]/div/div/div[1]/div/div[3]/div//text()  登录
            # //*[@id="page-container"]/div[2]/div/div/div[1]/div/div/div/div[1]/p//text()   未登录
            contentItem['intro'] = ''.join(
                response.xpath(
                    '//*[@id="page-container"]/div[2]/div/div/div[1]/div/div/div/div[1]/p//text()'
                ).extract())
        else:
            contentItem['intro'] = " "

        yield contentItem
        if str2 != "":
            yield Request(
                url=str2,
                callback=lambda response, key=key, title=title, address=str1:
                self.parse_facebook_intro(response, key, title, address),
                dont_filter=True)
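The lambda callback with default-argument bindings (key=key, title=title) is one way to smuggle extra state into the callback; since Scrapy 1.8, Request.cb_kwargs does this directly and keeps the callback a plain method. A sketch of the same handoff, assuming parse_facebook_intro has the signature (self, response, key, title, address):

    from scrapy import Request

    # ...at the end of parse_body, replacing the lambda-based Request:
    if str2 != "":
        yield Request(
            url=str2,
            callback=self.parse_facebook_intro,
            cb_kwargs={'key': key, 'title': title, 'address': str1},
            dont_filter=True)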
Example #3
    def parse_body(self, response, key, title):
        contentItem = linkBodyItem()
        # body is bytes, text is str; the text still contains ' \r \n \t \
        tmp = response.text

        tmp = self.processInfo(tmp)

        contentItem['content'] = tmp
        contentItem['key'] = key
        contentItem['title'] = title
        contentItem['address'] = response.url
        contentItem['intro'] = ' '

        str2 = ""
        if "baike.baidu.com" in response.url:
            contentItem['category'] = '百度百科'  # Baidu Baike
            contentItem['intro'] = self.processInfo(''.join(
                response.xpath(
                    '//div[@class="lemma-summary"]//div//text()').extract()))
        elif "wikipedia.org" in response.url:
            contentItem['category'] = '维基百科'  # Wikipedia
            contentItem['intro'] = self.processInfo(''.join(
                response.xpath('//div[@id="mw-content-text"]/div/p[1]//text()'
                               ).extract()))
        elif 'facebook.com' in response.url:
            str1 = response.url
            k = str1.find('/', 25)
            m = str1.find('/', 8)
            str2 = str1[:m + 1] + 'pg' + str1[m:k + 1] + 'about/'
            contentItem['category'] = '脸书'  # Facebook
            contentItem['intro'] = str2
        elif 'twitter.com' in response.url:
            # //*[@id="react-root"]/div/div/div/main/div/div[2]/div/div/div/div/div[2]/div/div/div[1]/div/div[3]/div//text()  登录
            # //*[@id="page-container"]/div[2]/div/div/div[1]/div/div/div/div[1]/p//text()   未登录
            contentItem['category'] = '推特'  # Twitter
            contentItem['intro'] = self.processInfo(''.join(
                response.xpath(
                    '//*[@id="page-container"]/div[2]/div/div/div[1]/div/div/div/div[1]/p//text()'
                ).extract()))
        elif 'linkedin' in response.url:
            contentItem['category'] = '领英'  # LinkedIn
        elif 'weibo' in response.url:
            contentItem['category'] = '微博'  # Weibo
        elif 'blog' in response.url:
            contentItem['category'] = '博客'  # blog
        elif self.isContain(['zhaopin', 'kanzhun'], response.url):
            contentItem['category'] = '招聘'  # job listings
        elif self.isContain(['youku', 'tudou'], response.url):
            contentItem['category'] = '视频'  # video
        elif self.isContain(
                ['ifeng', 'finance.sina', 'business.sohu', 'new.qq', 'dooo'],
                response.url):
            contentItem['category'] = '新闻'  # news
        else:
            contentItem['category'] = '其它'  # other

        yield contentItem
        if str2 != "":
            yield Request(
                url=str2,
                callback=lambda response, key=key, title=title, address=str1:
                self.parse_facebook_intro(response, key, title, address),
                dont_filter=True)
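Example #3 factors the cleanup into self.processInfo and a self.isContain membership test, neither of which appears in the listing. Judging from the inline pipeline in Example #2, plausible reconstructions (hypothetical, not the author's code) might look like:

    import html
    import re

    class LinkSpiderHelpers:
        # Hypothetical reconstructions; the listing omits these helpers.
        def processInfo(self, tmp):
            tmp = re.sub(r'[\r\t]', '', tmp)      # drop carriage returns and tabs
            tmp = re.sub(r'"', "'", tmp)          # normalize quotes
            tmp = html.unescape(tmp)              # decode HTML entities
            for tag in ('script', 'scip', 'style'):
                tmp = re.sub(r'<{0}.*?</{0}>'.format(tag), '', tmp, flags=re.DOTALL)
            tmp = re.sub(r'<[\w\W]*?>', '', tmp)  # strip the remaining tags
            return ' '.join(tmp.split())          # collapse whitespace runs

        def isContain(self, needles, url):
            # True if any of the substrings occurs in the URL.
            return any(n in url for n in needles)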
Example #4
    def parse_body(self, response, key, title):
        # content = response.xpath("//body//text()").extract()
        # content1 = response.xpath("//body//script//text()").extract()
        # content1 += response.xpath("//body//style//text()").extract()
        # content = [x for x in content if x not in content1]

        # content = list(map(lambda x: re.sub(r'[\\n \\r \\t]','',x.strip()), content))
        # content = list(filter(None, content))
        # contentItem['content'] = ';'.join(content)

        contentItem = linkBodyItem()
        # body is bytes, text is str; the text still contains ' \r \n \t \
        tmp = response.text

        # tmp = re.sub(r'\n', '', tmp)
        tmp = re.sub(r'\r', '', tmp)
        tmp = re.sub(r'\t', '', tmp)
        tmp = re.sub(r'"', "'", tmp)
        tmp = html.unescape(tmp)
        # tmp = re.sub(r'gb2312', 'utf-8', tmp)

        pattern = re.compile(r'<script.*?</script>', re.DOTALL)
        tmp = pattern.sub(r'', tmp)
        pattern = re.compile(r'<scip.*?</scip>', re.DOTALL)
        tmp = pattern.sub(r'', tmp)
        # pattern = re.compile(r'<.*?</[\w\W]*?>', re.DOTALL)
        # tmp = pattern.sub(r'', tmp)
        pattern = re.compile(r'<[\w\W]*?>')
        tmp = pattern.sub(r'', tmp)

        # r = response.text
        # con = html.unescape(r)
        # pattern = re.compile(r'<script.*?</script>')
        # con = pattern.sub(r'', con)
        # pattern = re.compile(r'<scip.*?</scip>')
        # con = pattern.sub(r'', con)
        # def remove_rub(string):
        #     rub = re.findall(r'<[\w\W]*?>',string)  # strip every <...> tag, since the data is not inside the tag brackets
        #     for r in rub:
        #         string = string.replace(r,'')
        #     return(string.replace('\t','').replace('\r',''))  # strip \t and carriage returns
        # all_data = remove_rub(con)
        # all_data = "".join([s for s in all_data.splitlines(True) if s.strip()])

        contentItem['content'] = tmp
        contentItem['key'] = key
        contentItem['title'] = title
        contentItem['address'] = response.url

        str2 = ""
        if "baike.baidu.com" in response.url:
            contentItem['intro'] = ''.join(
                response.xpath(
                    '//div[@class="lemma-summary"]/div[@class="para"]/text()[1]'
                ).extract())
        elif "wikipedia.org" in response.url:
            contentItem['intro'] = ''.join(
                response.xpath(
                    '//div[@id="mw-content-text"]/div/p[1]//text()').extract())
        elif 'facebook.com' in response.url:
            str1 = response.url
            k = str1.find('/', 25)
            m = str1.find('/', 8)
            str2 = str1[:m + 1] + 'pg' + str1[m:k + 1] + 'about/'
            contentItem['intro'] = str2
        elif 'twitter.com' in response.url:
            # //*[@id="react-root"]/div/div/div/main/div/div[2]/div/div/div/div/div[2]/div/div/div[1]/div/div[3]/div//text()  登录
            # //*[@id="page-container"]/div[2]/div/div/div[1]/div/div/div/div[1]/p//text()   未登录
            contentItem['intro'] = ''.join(
                response.xpath(
                    '//*[@id="page-container"]/div[2]/div/div/div[1]/div/div/div/div[1]/p//text()'
                ).extract())
        else:
            contentItem['intro'] = " "

        yield contentItem
        if str2 != "":
            yield Request(
                url=str2,
                callback=lambda response, key=key, title=title, address=str1:
                self.parse_facebook_intro(response, key, title, address),
                dont_filter=True)
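All four examples derive the Facebook about URL with hard-coded find() offsets: 8 starts the search just past the https:// scheme, and 25 starts just past the trailing slash of www.facebook.com, so the slicing silently breaks for other hosts or schemes. urllib.parse makes the rewrite offset-free; a sketch assuming the page name is the first path segment of the profile URL:

    from urllib.parse import urlsplit, urlunsplit

    def facebook_about_url(url):
        # https://www.facebook.com/<page>/... -> https://www.facebook.com/pg/<page>/about/
        parts = urlsplit(url)
        page = parts.path.strip('/').split('/')[0]
        return urlunsplit((parts.scheme, parts.netloc, '/pg/%s/about/' % page, '', ''))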