def parse_item(self, response):
    # Assumes YdNewsCrawlerItem, get_time_now and get_id are imported from
    # the project's items/utils modules (not shown here).
    item = YdNewsCrawlerItem()
    item["create_at"] = get_time_now()
    item["update_at"] = get_time_now()
    item["topic_id"] = get_id("100")
    item["topic_type"] = "101"
    item["origin_url"] = response.meta["origin_url"]
    for sel in response.xpath("//div[contains(@id, 'container')]"):
        # Relative paths (".//") keep each field scoped to this container div
        # instead of matching against the whole document.
        item["topic_title"] = sel.xpath(".//div//h1/text()").extract_first()
        item["content"] = sel.xpath(".//div[contains(@id, 'contentText')]//div//p/text()").extract()
        item["image_urls"] = sel.xpath(".//td//img/@src").extract()
        # Keep only image file names ending in jpg/png/gif; the non-capturing
        # group makes .re() return the full match rather than the extension.
        item["images"] = sel.xpath(".//td//img/@src").re(r"[^/]*\.(?:jpg|png|gif)$")
        # Only yield items that have at least one image.
        if item["image_urls"]:
            yield item
def parse_item(self, response):
    item = YdNewsCrawlerItem()
    item["create_at"] = get_time_now()
    item["update_at"] = get_time_now()
    item["topic_id"] = get_id("100")
    item["topic_type"] = "102"
    item["topic_title"] = response.meta["topic_title"]
    item["origin_url"] = response.meta["origin_url"]
    # Iterate over the article body div; 'end-text' must be quoted as an XPath
    # string literal, and the loop selects the div itself (not its text nodes)
    # so the relative ".//p" expressions below have an element to work from.
    for sel in response.xpath("//div[contains(@class, 'end-text')]"):
        item["content"] = sel.xpath(".//p/text()").extract()
        item["image_urls"] = sel.xpath(".//p//img/@src").extract()
        item["images"] = sel.xpath(".//p//img/@src").re(r"[^/]*\.(?:jpg|png|gif)$")
        if item["image_urls"]:
            yield item
def parse_item(self, response):
    item = YdNewsCrawlerItem()
    item['create_at'] = get_time_now()
    item['update_at'] = get_time_now()
    item['topic_id'] = get_id('100')
    item['topic_type'] = '103'
    item['topic_title'] = response.meta['topic_title']
    item['origin_url'] = response.meta['origin_url']
    for sel in response.xpath("//div[contains(@class, 'end-text')]"):
        item['content'] = sel.xpath(".//p/text()").extract()
        item['image_urls'] = sel.xpath(".//p//img/@src").extract()
        # Escaped dot and alternation group, matching the pattern used in the
        # other callbacks.
        item['images'] = sel.xpath(".//p//img/@src").re(r'[^/]*\.(?:jpg|png|gif)$')
        if item['image_urls']:
            yield item
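# A minimal, standalone check of the corrected image-name pattern shared by the
# three callbacks above, run against hypothetical URLs. The original pattern
# r'[^/]*.[jpg|png|gif]$' used a character class, which matches any single
# character from "jpg|nif" rather than a whole extension; the sketch below
# shows the escaped dot and non-capturing alternation behaving as intended.
import re

pattern = re.compile(r"[^/]*\.(?:jpg|png|gif)$")

print(pattern.search("http://example.com/pics/photo_01.jpg").group())  # -> photo_01.jpg
print(pattern.search("http://example.com/pics/clip.mp4"))              # -> None (not an image)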