def parse_article_contents(self, response):
    """Parse one article page and yield a populated AllscrapyItem.

    The title is read from the primary layout first; if that node is
    absent, the intro-content layout is used as a fallback.  Body text
    and inline image URLs are collected in one XPath union and cleaned
    by self.content_filter.
    """
    title = response.xpath(
        '//div[@class="content-article"]/h1/text()').get()
    # BUG FIX: None comparison must use identity (`is None`), not `== None`.
    if title is None:
        title = response.xpath(
            '//div[@class="intro-content clearfix"]/h1/text()').get()
    contents = response.xpath(
        '//div[@class="article"]/p//descendant::text()|//div[@class="article"]//img/@src'
    ).getall()
    contents = self.content_filter(contents)
    print(
        f'=================={title}==================\n{response.url}\n{contents}'
    )
    now = datetime.datetime.now()
    current_time = now.strftime("%H:%M:%S")
    item = AllscrapyItem()
    item["url"] = response.url
    item["time"] = f"{datetime.date.today()}_{current_time}"
    item["title"] = title
    item["category"] = '攻略汇总'  # fixed category for this spider
    item["content"] = contents
    item['time_decline'] = datetime.datetime.utcnow()
    yield item
def parse_article_contents(self, response):
    """Parse an entry-title article page, drop blank fragments, yield an item."""
    title = response.xpath("//h1[@class='entry-title']//text()").get()
    contents = response.xpath(
        "//div[@class='single-content']//img/@src|//div[@class='single-content']//text()"
    ).getall()
    category = response.xpath("//div[@class='single-cat']/a/text()").get()
    print(
        f'==============={category}==={title}==================\n{response.url}'
    )
    # Keep only non-empty fragments, echoing each one as it is accepted.
    cleaned = []
    for raw in contents:
        stripped = raw.strip()
        if not stripped:
            continue
        cleaned.append(stripped)
        print(stripped)
    stamp = datetime.datetime.now().strftime("%H:%M:%S")
    item = AllscrapyItem()
    item["url"] = response.url
    item["time"] = f"{datetime.date.today()}_{stamp}"
    item["title"] = title
    item["category"] = category
    item["content"] = cleaned
    yield item
def parse_article_contents(self, response):
    """Parse a 'page-single' article, filter its body, and yield an item."""
    title = response.xpath('//article[@class="page-single"]/header/h1/text()').get()
    raw_parts = response.xpath('//article[@class="page-single"]/p[not(.//span[@class="single-mid"])]//text()|//article[@class="page-single"]//img/@src').getall()
    image_urls = response.xpath('//article[@class="page-single"]//img/@src').getall()
    body = self.content_filter(raw_parts, image_urls)
    print(f'=================={title}==================\n{response.url}\n{body}')
    stamp = datetime.datetime.now().strftime("%H:%M:%S")
    item = AllscrapyItem()
    item["url"] = response.url
    item["time"] = f"{datetime.date.today()}_{stamp}"
    item["title"] = title
    # Category is forwarded by the requesting callback via request meta.
    item["category"] = response.meta['category']
    item["content"] = body
    item['time_decline'] = datetime.datetime.utcnow()
    yield item
def parse_content(self, response):
    """Parse a 'new_conts' article page and yield a cleaned AllscrapyItem."""
    stamp = datetime.datetime.now().strftime("%H:%M:%S")
    # Category travels with the request that scheduled this page.
    category = response.meta['category']
    title = response.xpath("//div[@class='c-title']/h1/text()").get()
    raw_parts = response.xpath(
        "//div[@class='new_conts']/p//text()|//div[@class='new_conts']/p/img/@src"
    ).getall()
    body = self.content_filter(raw_parts)
    print(f'{title}\n{body}')
    item = AllscrapyItem()
    item["url"] = response.url
    item["time"] = f"{datetime.date.today()}_{stamp}"
    item["title"] = title
    item["category"] = category
    item["content"] = body
    yield item
def parse_get_article_content(self, response):
    """Parse an article detail page and yield an AllscrapyItem.

    ROBUSTNESS FIX: the original called .strip() directly on the XPath
    result, which raises AttributeError whenever the headline node is
    missing (get() returns None).  Strip only when a title was found.
    """
    now = datetime.datetime.now()
    current_time = now.strftime("%H:%M:%S")
    title = response.xpath("//h1[@class='headline']/text()").get()
    if title is not None:
        title = title.strip()
    contents = response.xpath(
        "//div[@class='artical-content-read']//text()|//div[@class='artical-content-read']//img//@src"
    ).getall()
    contents = self.content_filter(contents)
    print(f'{title}\n{contents}')
    item = AllscrapyItem()
    item["url"] = response.url
    item["title"] = title
    item["time"] = f"{datetime.date.today()}_{current_time}"
    # Category is forwarded by the requesting callback via request meta.
    item["category"] = response.meta["category"]
    item["content"] = contents
    yield item
def parse_article_contents(self, response):
    """Parse a 'detail_content' article page and yield an AllscrapyItem.

    ROBUSTNESS FIX: the original chained .get().strip(), which raises
    AttributeError when the title node is absent; get(default='') makes
    the strip safe and yields an empty title instead of crashing.
    """
    title = response.xpath("//h1[@class='title']/text()").get(default='').strip()
    contents = response.xpath(
        "//div[@class='detail_content']/p/text()|//div[@class='detail_content']//img/@src"
    ).getall()
    print(
        f'=================={title}==================\n{response.url}\n{contents}'
    )
    now = datetime.datetime.now()
    current_time = now.strftime("%H:%M:%S")
    item = AllscrapyItem()
    item["url"] = response.url
    item["time"] = f"{datetime.date.today()}_{current_time}"
    item["title"] = title
    # Category is forwarded by the requesting callback via request meta.
    item["category"] = response.meta['category']
    item["content"] = contents
    item['time_decline'] = datetime.datetime.utcnow()
    yield item
def parse_article_contents(self, response):
    """Parse an article page that uses either the left- or center-aligned
    detail layout, clean its contents, and yield an AllscrapyItem.

    FIXES:
    - the layout probe compared a SelectorList with `!= []`; an empty
      SelectorList equals [], so the idiomatic truthiness test is
      equivalent and clearer.
    - .get().strip() raised AttributeError on pages without the banner
      title node; get(default='') makes the strip safe.
    """
    title = response.xpath(
        "//div[@class='article-banner-title']/h1/text()").get(default='').strip()
    imgs = response.xpath(
        "//article[@class='article-detail-content article-left']//img/@src"
    ).getall()
    # Prefer the left-aligned layout; fall back to the centered one.
    if response.xpath(
            "//article[@class='article-detail-content article-left']//text()"):
        contents = response.xpath(
            "//article[@class='article-detail-content article-left']//text()|//article[@class='article-detail-content article-left']//img/@src"
        ).getall()
    else:
        contents = response.xpath(
            "//article[@class='article-detail-content article-center']//text()|//article[@class='article-detail-content article-center']//img/@src"
        ).getall()
        imgs = response.xpath(
            "//article[@class='article-detail-content article-center']//img/@src"
        ).getall()
    contents = self.content_filter(contents, imgs)
    print(f'=================={title}==================\n{response.url}')
    for content in contents:
        print(content)
    now = datetime.datetime.now()
    current_time = now.strftime("%H:%M:%S")
    item = AllscrapyItem()
    item["url"] = response.url
    item["time"] = f"{datetime.date.today()}_{current_time}"
    item["title"] = title
    # Category is forwarded by the requesting callback via request meta.
    item["category"] = response.meta['category']
    item["content"] = contents
    yield item
def main_parse(self, response):
    """Parse a post page (articleBody schema), clean text/images, yield an item."""
    stamp = datetime.datetime.now().strftime("%H:%M:%S")
    title = response.xpath("//h1[@class='post-title']/text()").get()
    body = self.content_filter(
        response.xpath(
            "//div[@itemprop='articleBody']/article//p//text()|//div[@itemprop='articleBody']/article//p/img/@src|//div[@itemprop='articleBody']//iframe[not(@class='embed-responsive-item')]/@src"
        ).getall()
    )
    item = AllscrapyItem()
    # Collected for the (currently disabled) image pipeline below.
    image_sources = self.img_filter(
        response.xpath(
            "//div[@itemprop='articleBody']/article//p/img/@src|//div[@itemprop='articleBody']//iframe[not(@class='embed-responsive-item')]/@src"
        ).getall()
    )
    print(f'{title}\n{body}')
    item["url"] = response.url
    item["time"] = f"{datetime.date.today()}_{stamp}"
    item["title"] = title
    item["category"] = "all"
    item["content"] = body
    # item['image_urls'] = image_sources
    yield item
def parse_article_contents(self, response):
    """Parse a forum-style post, strip torrent/download boilerplate lines,
    and yield an AllscrapyItem with its text, images, and cover.

    FIXES:
    - the 16-branch `elif "kw" in content: continue` ladder is replaced
      by a keyword tuple checked with any(); the terminating '種子連結'
      break is still tested first, preserving the original order.
    - `imgs[0]` raised IndexError on posts with no images; the cover now
      falls back to None.
    """
    # Lines containing any of these markers are dropped (torrent metadata,
    # hashes, download/upload boilerplate in simplified + traditional Chinese).
    skip_keywords = (
        "验证编码", "高速上傳", "全码", "全碼", "作种", "做种", "做種",
        "種子", "种子", "特 征 码", "哈希", "期限", "下載", "下载",
        "編碼", "download",
    )
    title = response.xpath("//h1[@id='subject_tpc']/text()").get().strip()
    contents = response.xpath(
        "//div[@id='read_tpc'][.//*[not(contains(@href,'download'))]]/div/text()|//div[@id='read_tpc']//img/@src"
    ).getall()
    imgs = response.xpath("//div[@id='read_tpc']//img/@src").getall()
    # Some posts keep their text directly under read_tpc instead of a child div.
    if response.xpath("//div[@id='read_tpc']/div/text()").getall() == []:
        contents = response.xpath(
            "//div[@id='read_tpc'][.//*[not(contains(@href,'download'))]]//text()|//div[@id='read_tpc']//img/@src"
        ).getall()
    print(f'=================={title}==================\n{response.url}')
    _contents = []
    for content in contents:
        content = content.strip()
        # Everything after the torrent-link marker is boilerplate: stop.
        if '種子連結' in content:
            break
        if any(keyword in content for keyword in skip_keywords):
            continue
        if content != '':
            print(content)
            _contents.append(content)
    now = datetime.datetime.now()
    current_time = now.strftime("%H:%M:%S")
    item = AllscrapyItem()
    item["url"] = response.url
    item["time"] = f"{datetime.date.today()}_{current_time}"
    item["title"] = title
    item["category"] = '國內原創'  # fixed category for this spider
    item["content"] = _contents
    item["imgs"] = imgs
    item["cover"] = imgs[0] if imgs else None
    yield item