Example #1
0
    def parse_article_contents(self, response):
        """Extract one article from ``response`` and yield it as an item.

        The title is read from the ``content-article`` header and falls back
        to the ``intro-content clearfix`` header when the first lookup finds
        nothing. Paragraph text and image URLs are collected from the
        ``article`` div and passed through ``self.content_filter``.

        Args:
            response: Page response exposing ``xpath()`` and ``url``
                (presumably a Scrapy response -- confirm against the caller).

        Yields:
            AllscrapyItem: Populated with ``url``, ``time``, ``title``,
            ``category``, ``content`` and ``time_decline``.
        """
        title = response.xpath(
            '//div[@class="content-article"]/h1/text()').get()
        if title is None:
            # Alternative page layout: the headline lives in the intro block.
            title = response.xpath(
                '//div[@class="intro-content clearfix"]/h1/text()').get()

        contents = response.xpath(
            '//div[@class="article"]/p//descendant::text()|//div[@class="article"]//img/@src'
        ).getall()
        contents = self.content_filter(contents)
        print(
            f'=================={title}==================\n{response.url}\n{contents}'
        )

        # Take a single clock reading so the date and time halves of the
        # timestamp can never disagree (e.g. when crossing midnight).
        now = datetime.datetime.now()
        current_time = now.strftime("%H:%M:%S")

        item = AllscrapyItem()
        item["url"] = response.url
        item["time"] = f"{now.date()}_{current_time}"
        item["title"] = title
        item["category"] = '攻略汇总'
        item["content"] = contents
        # Naive UTC timestamp, kept as-is for whatever consumes this field.
        item['time_decline'] = datetime.datetime.utcnow()
        yield item
Example #2
0
    def parse_article_contents(self, response):
        """Extract one article from ``response`` and yield it as an item.

        Title, category, and the mixed text / image-URL content are read from
        the page. Content fragments are whitespace-stripped and empty ones
        are dropped before being stored on the item.

        Args:
            response: Page response exposing ``xpath()`` and ``url``
                (presumably a Scrapy response -- confirm against the caller).

        Yields:
            AllscrapyItem: Populated with ``url``, ``time``, ``title``,
            ``category`` and ``content``.
        """
        title = response.xpath("//h1[@class='entry-title']//text()").get()
        contents = response.xpath(
            "//div[@class='single-content']//img/@src|//div[@class='single-content']//text()"
        ).getall()
        category = response.xpath("//div[@class='single-cat']/a/text()").get()
        print(
            f'==============={category}==={title}==================\n{response.url}'
        )

        # Strip every fragment and keep only the non-empty ones.
        _contents = [
            text for text in (raw.strip() for raw in contents) if text
        ]
        for text in _contents:
            print(text)

        # Take a single clock reading so the date and time halves of the
        # timestamp can never disagree (e.g. when crossing midnight).
        now = datetime.datetime.now()
        current_time = now.strftime("%H:%M:%S")

        item = AllscrapyItem()
        item["url"] = response.url
        item["time"] = f"{now.date()}_{current_time}"
        item["title"] = title
        item["category"] = category
        item["content"] = _contents
        # item['time_decline'] = datetime.datetime.utcnow()
        yield item
Example #3
0
 def parse_article_contents(self, response):
     """Extract one article from ``response`` and yield it as an item.

     Text is taken from the paragraphs of the ``page-single`` article that
     do not contain a ``single-mid`` span, together with every image URL in
     the article. The image URLs are also collected on their own and both
     lists are handed to ``self.content_filter``.

     Args:
         response: Page response exposing ``xpath()``, ``url`` and a
             ``meta`` mapping with a ``'category'`` entry (presumably a
             Scrapy response -- confirm against the caller).

     Yields:
         AllscrapyItem: Populated with ``url``, ``time``, ``title``,
         ``category``, ``content`` and ``time_decline``.

     Raises:
         KeyError: If ``response.meta`` has no ``'category'`` entry.
     """
     title = response.xpath(
         '//article[@class="page-single"]/header/h1/text()').get()
     contents = response.xpath(
         '//article[@class="page-single"]/p[not(.//span[@class="single-mid"])]//text()'
         '|//article[@class="page-single"]//img/@src'
     ).getall()
     imgs = response.xpath(
         '//article[@class="page-single"]//img/@src').getall()
     contents = self.content_filter(contents, imgs)
     print(
         f'=================={title}==================\n{response.url}\n{contents}'
     )

     # Take a single clock reading so the date and time halves of the
     # timestamp can never disagree (e.g. when crossing midnight).
     now = datetime.datetime.now()
     current_time = now.strftime("%H:%M:%S")

     item = AllscrapyItem()
     item["url"] = response.url
     item["time"] = f"{now.date()}_{current_time}"
     item["title"] = title
     item["category"] = response.meta['category']
     item["content"] = contents
     # Naive UTC timestamp, kept as-is for whatever consumes this field.
     item['time_decline'] = datetime.datetime.utcnow()
     yield item
Example #4
0
    def parse_content(self, response):
        """Extract one article from ``response`` and yield it as an item.

        The title comes from the ``c-title`` header; paragraph text and
        paragraph-level image URLs come from the ``new_conts`` div and are
        passed through ``self.content_filter``.

        Args:
            response: Page response exposing ``xpath()``, ``url`` and a
                ``meta`` mapping with a ``'category'`` entry (presumably a
                Scrapy response -- confirm against the caller).

        Yields:
            AllscrapyItem: Populated with ``url``, ``time``, ``title``,
            ``category`` and ``content``.

        Raises:
            KeyError: If ``response.meta`` has no ``'category'`` entry.
        """
        # Take a single clock reading so the date and time halves of the
        # timestamp can never disagree (e.g. when crossing midnight).
        now = datetime.datetime.now()
        current_time = now.strftime("%H:%M:%S")
        category = response.meta['category']

        title = response.xpath("//div[@class='c-title']/h1/text()").get()
        contents = response.xpath(
            "//div[@class='new_conts']/p//text()|//div[@class='new_conts']/p/img/@src"
        ).getall()
        contents = self.content_filter(contents)

        print(f'{title}\n{contents}')

        item = AllscrapyItem()
        item["url"] = response.url
        item["time"] = f"{now.date()}_{current_time}"
        item["title"] = title
        item["category"] = category
        item["content"] = contents
        yield item
Example #5
0
    def parse_get_article_content(self, response):
        """Extract one article from ``response`` and yield it as an item.

        The headline is whitespace-stripped when present. Text and image
        URLs are collected from the ``artical-content-read`` div and passed
        through ``self.content_filter``.

        Args:
            response: Page response exposing ``xpath()``, ``url`` and a
                ``meta`` mapping with a ``"category"`` entry (presumably a
                Scrapy response -- confirm against the caller).

        Yields:
            AllscrapyItem: Populated with ``url``, ``title``, ``time``,
            ``category`` and ``content``. ``title`` is ``None`` when the
            page has no matching headline.

        Raises:
            KeyError: If ``response.meta`` has no ``"category"`` entry.
        """
        # Take a single clock reading so the date and time halves of the
        # timestamp can never disagree (e.g. when crossing midnight).
        now = datetime.datetime.now()
        current_time = now.strftime("%H:%M:%S")

        # ``get()`` yields None when nothing matches; guard before stripping
        # so a page without a headline does not raise AttributeError.
        raw_title = response.xpath("//h1[@class='headline']/text()").get()
        title = raw_title.strip() if raw_title is not None else None

        contents = response.xpath(
            "//div[@class='artical-content-read']//text()|//div[@class='artical-content-read']//img//@src"
        ).getall()
        contents = self.content_filter(contents)
        print(f'{title}\n{contents}')

        item = AllscrapyItem()

        item["url"] = response.url
        item["title"] = title
        item["time"] = f"{now.date()}_{current_time}"
        item["category"] = response.meta["category"]
        item["content"] = contents
        yield item
Example #6
0
    def parse_article_contents(self, response):
        """Extract one article from ``response`` and yield it as an item.

        The title is whitespace-stripped when present. Direct paragraph text
        and image URLs from the ``detail_content`` div are stored unfiltered.

        Args:
            response: Page response exposing ``xpath()``, ``url`` and a
                ``meta`` mapping with a ``'category'`` entry (presumably a
                Scrapy response -- confirm against the caller).

        Yields:
            AllscrapyItem: Populated with ``url``, ``time``, ``title``,
            ``category``, ``content`` and ``time_decline``. ``title`` is
            ``None`` when the page has no matching heading.

        Raises:
            KeyError: If ``response.meta`` has no ``'category'`` entry.
        """
        # ``get()`` yields None when nothing matches; guard before stripping
        # so a page without a heading does not raise AttributeError.
        raw_title = response.xpath("//h1[@class='title']/text()").get()
        title = raw_title.strip() if raw_title is not None else None

        contents = response.xpath(
            "//div[@class='detail_content']/p/text()|//div[@class='detail_content']//img/@src"
        ).getall()

        print(
            f'=================={title}==================\n{response.url}\n{contents}'
        )

        # Take a single clock reading so the date and time halves of the
        # timestamp can never disagree (e.g. when crossing midnight).
        now = datetime.datetime.now()
        current_time = now.strftime("%H:%M:%S")

        item = AllscrapyItem()
        item["url"] = response.url
        item["time"] = f"{now.date()}_{current_time}"
        item["title"] = title
        item["category"] = response.meta['category']
        item["content"] = contents
        # Naive UTC timestamp, kept as-is for whatever consumes this field.
        item['time_decline'] = datetime.datetime.utcnow()
        yield item
Example #7
0
    def parse_article_contents(self, response):
        """Extract one article from ``response`` and yield it as an item.

        The article body is read from the ``article-left`` container when it
        holds any text node, otherwise from the ``article-center`` container.
        Text plus image URLs, and the image URLs on their own, are handed to
        ``self.content_filter``.

        Args:
            response: Page response exposing ``xpath()``, ``url`` and a
                ``meta`` mapping with a ``'category'`` entry (presumably a
                Scrapy response -- confirm against the caller).

        Yields:
            AllscrapyItem: Populated with ``url``, ``time``, ``title``,
            ``category`` and ``content``. ``title`` is ``None`` when the
            page has no matching heading.

        Raises:
            KeyError: If ``response.meta`` has no ``'category'`` entry.
        """
        # ``get()`` yields None when nothing matches; guard before stripping
        # so a page without a heading does not raise AttributeError.
        raw_title = response.xpath(
            "//div[@class='article-banner-title']/h1/text()").get()
        title = raw_title.strip() if raw_title is not None else None

        left = "//article[@class='article-detail-content article-left']"
        center = "//article[@class='article-detail-content article-center']"
        # Prefer the left-aligned layout; fall back to the centered one when
        # the left container yields no text nodes at all.
        base = left if response.xpath(f"{left}//text()") else center

        contents = response.xpath(
            f"{base}//text()|{base}//img/@src").getall()
        imgs = response.xpath(f"{base}//img/@src").getall()

        contents = self.content_filter(contents, imgs)
        print(f'=================={title}==================\n{response.url}')
        for content in contents:
            print(content)

        # Take a single clock reading so the date and time halves of the
        # timestamp can never disagree (e.g. when crossing midnight).
        now = datetime.datetime.now()
        current_time = now.strftime("%H:%M:%S")

        item = AllscrapyItem()
        item["url"] = response.url
        item["time"] = f"{now.date()}_{current_time}"
        item["title"] = title
        item["category"] = response.meta['category']
        item["content"] = contents
        # item['time_decline'] = datetime.datetime.utcnow()
        yield item
Example #8
0
    def main_parse(self, response):
        """Extract one post from ``response`` and yield it as an item.

        Paragraph text, paragraph image URLs, and the ``src`` of iframes that
        are not ``embed-responsive-item`` are collected from the
        ``articleBody`` container and passed through ``self.content_filter``.

        Args:
            response: Page response exposing ``xpath()`` and ``url``
                (presumably a Scrapy response -- confirm against the caller).

        Yields:
            AllscrapyItem: Populated with ``url``, ``time``, ``title``,
            ``category`` (always ``"all"``) and ``content``.
        """
        # Take a single clock reading so the date and time halves of the
        # timestamp can never disagree (e.g. when crossing midnight).
        now = datetime.datetime.now()
        current_time = now.strftime("%H:%M:%S")

        title = response.xpath("//h1[@class='post-title']/text()").get()
        contents = response.xpath(
            "//div[@itemprop='articleBody']/article//p//text()|//div[@itemprop='articleBody']/article//p/img/@src|//div[@itemprop='articleBody']//iframe[not(@class='embed-responsive-item')]/@src"
        ).getall()
        contents = self.content_filter(contents)

        img_list = response.xpath(
            "//div[@itemprop='articleBody']/article//p/img/@src|//div[@itemprop='articleBody']//iframe[not(@class='embed-responsive-item')]/@src"
        ).getall()
        # NOTE(review): the filtered list is not stored on the item (the
        # 'image_urls' assignment below is disabled). The call is retained in
        # case img_filter has side effects -- TODO confirm and remove if not.
        self.img_filter(img_list)
        print(f'{title}\n{contents}')

        item = AllscrapyItem()
        item["url"] = response.url
        item["time"] = f"{now.date()}_{current_time}"
        item["title"] = title
        item["category"] = "all"
        item["content"] = contents
        # item['image_urls'] = img_list
        yield item
Example #9
0
    def parse_article_contents(self, response):
        """Extract one forum post from ``response`` and yield it as an item.

        Text is taken from the direct ``div`` children of ``#read_tpc`` when
        any exist, otherwise from every text node below ``#read_tpc``; image
        URLs are always included. Fragments are stripped, scanning stops at
        the first fragment containing ``'種子連結'``, and fragments containing
        any of the download/seed-related markers are skipped.

        Args:
            response: Page response exposing ``xpath()`` and ``url``
                (presumably a Scrapy response -- confirm against the caller).

        Yields:
            AllscrapyItem: Populated with ``url``, ``time``, ``title``,
            ``category``, ``content``, ``imgs`` and ``cover``. ``title`` is
            ``None`` when the page has no matching heading and ``cover`` is
            ``None`` when the post contains no images.
        """
        # Fragments containing any of these substrings are dropped.
        skip_markers = (
            "验证编码", "高速上傳", "全码", "全碼", "作种", "做种", "做種",
            "種子", "种子", "特 征 码", "哈希", "期限", "下載", "下载",
            "編碼", "download",
        )

        # ``get()`` yields None when nothing matches; guard before stripping
        # so a page without a heading does not raise AttributeError.
        raw_title = response.xpath("//h1[@id='subject_tpc']/text()").get()
        title = raw_title.strip() if raw_title is not None else None

        imgs = response.xpath("//div[@id='read_tpc']//img/@src").getall()

        # Use the direct <div> children's text when the post has any;
        # otherwise fall back to every descendant text node.
        if response.xpath("//div[@id='read_tpc']/div/text()").getall():
            text_xpath = "//div[@id='read_tpc'][.//*[not(contains(@href,'download'))]]/div/text()"
        else:
            text_xpath = "//div[@id='read_tpc'][.//*[not(contains(@href,'download'))]]//text()"
        contents = response.xpath(
            text_xpath + "|//div[@id='read_tpc']//img/@src").getall()

        print(f'=================={title}==================\n{response.url}')
        _contents = []
        for raw in contents:
            text = raw.strip()

            # Must be tested before the generic markers: this string also
            # contains '種子', which would otherwise only skip the fragment.
            if '種子連結' in text:
                break
            if not text or any(marker in text for marker in skip_markers):
                continue
            print(text)
            _contents.append(text)

        # Take a single clock reading so the date and time halves of the
        # timestamp can never disagree (e.g. when crossing midnight).
        now = datetime.datetime.now()
        current_time = now.strftime("%H:%M:%S")

        item = AllscrapyItem()
        item["url"] = response.url
        item["time"] = f"{now.date()}_{current_time}"
        item["title"] = title
        item["category"] = '國內原創'
        item["content"] = _contents
        item["imgs"] = imgs
        # Posts without images previously raised IndexError here.
        item["cover"] = imgs[0] if imgs else None
        # item['time_decline'] = datetime.datetime.utcnow()
        yield item