def parse(self, response, **kwargs):
    """Build a ``NewsItem`` from a hurriyet article page.

    Each field is read from a fixed absolute XPath; all matches for a field
    are extracted and joined with single spaces.

    :param response: crawler response of the article url
    :return: the populated ``NewsItem``
    """
    selector = Selector(response)

    def joined(xpath):
        # Extract every match of ``xpath`` and glue the pieces with spaces.
        return " ".join(selector.xpath(xpath).extract())

    item = NewsItem()
    item["link"] = response.request.url
    item["lang"] = "tr"
    item["source"] = "hurriyet"

    item["intro"] = joined(
        "/html/body/article/div[12]/div/section[3]/div/h2")
    item["title"] = joined(
        "/html/body/article/div[12]/div/section[1]/header/div[2]/div/h1")

    # Runs of two or more whitespace characters in the body collapse to one space.
    body = joined("/html/body/article/div[12]/div/section[3]/div/div[4]")
    item["content"] = re.sub(r'\s{2,}', ' ', body)

    item["date_time"] = joined(
        "/html/body/article/div[12]/div/section[1]/header/div[1]/div[2]/div[2]/span[2]/time")
    item["author"] = joined(
        "/html/body/article/div[12]/div/section[1]/header/section[1]/div[1]/div/div[2]/a[1]/h6")
    return item
def parse_item(self, response):
    """Parse an article page (``qq_article`` markup) into a ``NewsItem``.

    Pages without a title or a source are recorded in redis under
    ``redis_invalid_url_key`` and skipped (the callback returns ``None``).

    :param response: crawler response of the article url
    :return: a ``NewsItem`` with title, content, source, published and url,
        or ``None`` when the page is not well-formed
    """
    r = response
    title_nodes = r.xpath("//div[@class='qq_article']//h1/text()").extract()
    source_nodes = r.xpath(
        "//div[@class='qq_article']//span[@class='a_source']/text()").extract()
    title = title_nodes[0] if title_nodes else None
    source = source_nodes[0] if source_nodes else None

    # The page must be well-formed: both a title and a source are required.
    if not title or not source:
        redis_conn.hset(redis_invalid_url_key, response.url, 0)
        return

    content = ''.join(
        r.xpath('//div[@id="Cnt-Main-Article-QQ"]/p/text()').extract())

    # A missing time element is treated like an unparsable one (published=0)
    # instead of raising IndexError.
    time_nodes = r.xpath(
        "//div[@class='qq_article']//span[@class='a_time']/text()").extract()
    stamp = None
    if time_nodes:
        stamp = re.search(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}", time_nodes[0])
    if stamp:
        # time.mktime interprets the parsed struct_time as local time.
        ts = time.mktime(time.strptime(stamp.group(0), '%Y-%m-%d %H:%M'))
    else:
        ts = 0

    return NewsItem(
        title=title,
        content=content,
        source=source,
        published=ts,
        url=r.url,
    )
def parse(self, response):
    """Yield one ``NewsItem`` per table row matched on the page.

    For every ``tr`` selected by the table XPath, the text of the link in the
    second cell is loaded into the item's ``news`` field.

    :param response: crawler response of the listing page
    :return: generator of loaded ``NewsItem`` objects
    """
    rows = response.xpath("//table[@class='table table-small'][1]/tbody/tr")
    for row in rows:
        loader = ItemLoader(item=NewsItem(), selector=row)
        loader.add_xpath("news", 'td[2]/a/text()')
        yield loader.load_item()
def parse(self, response, **kwargs):
    """Build a ``NewsItem`` from a kizlarsoruyor page.

    Author and content each have a primary XPath and a fallback XPath that is
    used only when the primary one matches nothing.

    :param response: crawler response of the article url
    :return: the populated ``NewsItem``
    """
    hxs = Selector(response)
    item = NewsItem()
    item["link"] = response.request.url
    item["lang"] = "tr"
    item["source"] = "kizlarsoruyor"
    item["category"] = " ".join(
        hxs.xpath("//a[@class='no-posting tgec']/text()").extract())

    date_time = hxs.xpath("//span[@class='posted-on']/text()").extract()

    author = hxs.xpath("//span[@class='name']/text()").extract()
    if not author:
        author = hxs.xpath(
            "//a[@class='username profile-hover']/text()").extract()
    item["author"] = " ".join(author)

    title = hxs.xpath("//h1/text()").extract()

    content_parts = hxs.xpath("//div[@class='detail-body']/text()").extract()
    if not content_parts:
        # The fallback must be extracted to strings as well; joining the raw
        # selector list raises TypeError as soon as it matches anything.
        content_parts = hxs.xpath(
            "//div[@class='article-body post-body clearfix']/text()").extract()
    new_content = ' '.join(content_parts)

    item["title"] = ' '.join(title)
    # Runs of two or more whitespace characters collapse to one space.
    item["content"] = re.sub(r'\s{2,}', ' ', new_content)
    item["date_time"] = " ".join(date_time)
    return item
def makeNewsItem(url, docId, title, createTime, content, source):
    """Create a ``NewsItem`` and fill its six fields from the arguments.

    Each argument is stored under the item key of the same name.

    :return: the populated ``NewsItem``
    """
    field_values = (
        ('url', url),
        ('docId', docId),
        ('title', title),
        ('createTime', createTime),
        ('content', content),
        ('source', source),
    )
    item = NewsItem()
    for key, value in field_values:
        item[key] = value
    return item
def parse_item(self, response):
    """Parse an idnes.cz article page into a ``NewsItem``.

    Fills ``title``, ``date``, ``article``, ``keywords`` and ``server``.
    Raises ``IndexError`` when the title, date or keywords selector matches
    nothing.

    :param response: crawler response of the article url
    :return: the populated ``NewsItem``
    """
    # create new article from our defined item
    article = NewsItem()

    # NOTE(review): the previous selectors ('h1.title:og:title' and
    # 'div.counters:article:published_time') were not valid CSS -- ':og' and
    # ':article' parse as unknown pseudo-classes, so the callback always
    # failed. They are assumed to have meant the Open Graph / article meta
    # tags; confirm against a live page and against the input format that
    # self.transform_date expects.
    article['title'] = response.css(
        'meta[property="og:title"]::attr(content)').extract()[0]
    date = response.css(
        'meta[property="article:published_time"]::attr(content)').extract()[0]
    article['date'] = self.transform_date(date)

    found_article = response.css('div.bbtext p::text').extract()
    article['article'] = self.transform_article(found_article)
    article['keywords'] = response.css(
        'meta[name=keywords]::attr(content)').extract()[0]
    article['server'] = 'idnes.cz'
    return article
def parse_item(self, response):
    """Parse a novinky.cz article page into a ``NewsItem``.

    The title is the first ``h1`` text node, the date is taken from
    ``self.current_date``, the paragraph texts go through
    ``self.transform_article`` and the keywords come from the keywords meta
    tag. Raises ``IndexError`` when the title or keywords are missing.

    :param response: crawler response of the article url
    :return: the populated ``NewsItem``
    """
    article = NewsItem()
    select = response.css

    headings = select('h1::text').extract()
    article['title'] = headings[0]

    article['date'] = self.current_date

    paragraphs = select('#contentArticleBox p::text').extract()
    article['article'] = self.transform_article(paragraphs)

    keyword_values = select('meta[name=keywords]::attr(content)').extract()
    article['keywords'] = keyword_values[0]

    article['server'] = 'novinky.cz'
    return article
def parse_item(self, response):
    """Scrape one article page unless its URL is already in ``self.OLD_URLS``.

    For title, date and text, the XPaths configured in ``self.CONT_PATHS`` are
    tried in order and the first non-empty result is kept. The URL of every
    scraped page is appended to ``self.URLS_FILE``.

    :param response: crawler response of the article url
    :return: generator yielding at most one ``NewsItem``
    """
    page_url = str(response.url)
    if page_url in self.OLD_URLS:
        return

    self.log("Scraping: %s" % response.url, level=log.INFO)
    hxs = HtmlXPathSelector(response)

    item = NewsItem()
    item['url'] = response.url
    item['source'] = self.MY_SETTINGS["source"]

    # ``a or b`` short-circuits, so once a path has produced a non-empty
    # result the remaining paths are not evaluated.
    title_parts = None
    for title_path in self.CONT_PATHS["title"]:
        title_parts = title_parts or hxs.xpath(title_path).extract()

    date_parts = None
    for date_path in self.CONT_PATHS["date"]:
        date_parts = date_parts or hxs.xpath(date_path).extract()

    body_nodes = None
    for div_path in self.CONT_PATHS["text"]:
        body_nodes = body_nodes or hxs.xpath(div_path)

    # Collapse all whitespace runs and replace double quotes with single ones.
    body_text = ' '.join(body_nodes.extract())
    body_text = re.sub(r'\s+', ' ', body_text).strip().replace("\"", "'")

    # Final item entry
    # NOTE(review): .encode() followed by str-argument .replace()/.join()
    # only works on Python 2 byte strings; on Python 3 these lines raise
    # TypeError. Left unchanged because the block appears to target the
    # legacy Scrapy API (HtmlXPathSelector, log.INFO) -- confirm before porting.
    title_ascii = ' '.join(title_parts).encode('ascii', 'ignore')
    item['title'] = title_ascii.replace("\\", "")

    date_ascii = ' '.join(date_parts).encode('ascii', 'ignore')
    item['date'] = get_date(' '.join(date_ascii.split()))

    item['content'] = body_text
    item['company'] = self.MY_SETTINGS["company"]
    item['isClean'] = False

    self.URLS_FILE.write(page_url + '\n')
    yield item
def parse_item(self, response):
    """Parse a parlamentnilisty.cz article page into a ``NewsItem``.

    Prints the page URL, then fills ``title``, ``date``, ``article``,
    ``keywords`` and ``server``. Date, paragraphs and tag texts are passed
    through ``self.transform_date``, ``self.transform_article`` and
    ``self.transform_keywords``. Raises ``IndexError`` when the title or the
    time element is missing.

    :param response: crawler response of the article url
    :return: the populated ``NewsItem``
    """
    article = NewsItem()
    print(response.url)
    select = response.css

    headings = select('section.article-header h1::text').extract()
    article['title'] = headings[0]

    raw_dates = select('div.time::text').extract()
    article['date'] = self.transform_date(raw_dates[0])

    paragraphs = select('section.article-content p::text').extract()
    article['article'] = self.transform_article(paragraphs)

    tags = select('section.article-tags a::text').extract()
    article['keywords'] = self.transform_keywords(tags)

    article['server'] = 'parlamentnilisty.cz'
    return article
def parse_item(self, response):
    """Parse an article page into a ``NewsItem``.

    Title comes from the document ``<title>``, source from the
    ``ne_article_source`` link and content from the paragraphs of the
    ``endText`` div. Raises ``IndexError`` when the title or source is missing.

    :param response: crawler response of the article url
    :return: a ``NewsItem`` with title, content, source, published and url
    """
    r = response
    title = r.xpath('/html/head/title/text()').extract()[0].strip()
    source = r.xpath(
        "//a[@id='ne_article_source']/text()").extract()[0].strip()
    content = "".join(
        r.xpath("//div[@id='endText']/p/text()").extract()).strip()

    # A missing time element is treated like an unparsable one (published=0)
    # instead of raising IndexError.
    time_nodes = r.xpath("//div[@class='post_time_source']/text()").extract()
    stamp = None
    if time_nodes:
        stamp = re.search(
            r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", time_nodes[0])
    if stamp:
        # time.mktime interprets the parsed struct_time as local time.
        ts = time.mktime(time.strptime(stamp.group(0), '%Y-%m-%d %H:%M:%S'))
    else:
        ts = 0

    return NewsItem(
        title=title,
        content=content,
        source=source,
        published=ts,
        url=r.url,
    )
def parse_item(self, response):
    """Parse an aktualne.cz article page into a ``NewsItem``.

    Fills ``title``, ``date``, ``article``, ``keywords`` and ``server``.
    Raises ``IndexError`` when the title, the published-time meta tag or the
    keywords meta tag is missing.

    :param response: crawler response of the article url
    :return: the populated ``NewsItem``
    """
    # create new article from our defined item
    article = NewsItem()

    # parse the data from the website
    article['title'] = response.css(
        'div.titulek-clanku h1::text').extract()[0]

    # Raw string: the backslash escapes the colon for the CSS parser and must
    # reach it verbatim; in a normal literal "\:" is an invalid escape sequence.
    date = response.css(
        r'meta[property=article\:published_time]::attr(content)').extract()[0]
    article['date'] = self.transform_date(date)

    found_article = response.css('div.clanek-telo p::text').extract()
    article['article'] = self.transform_article(found_article)
    article['keywords'] = response.css(
        'meta[name=keywords]::attr(content)').extract()[0]
    article['server'] = 'aktualne.cz'
    return article
def genaNewsItem(dicN):
    """Build a ``NewsItem`` that keeps only the needed fields of ``dicN``.

    ``dicN`` must provide the keys ``url``, ``docId``, ``content`` and
    ``info``; ``info`` must provide ``setname`` (stored as the title),
    ``lmodify`` (stored as the creation time) and ``source``.

    The resulting item has this shape::

        {
            "url": "http:xxxxx",
            "docId": "xxxxx",
            "title": "<headline>",
            "createTime": "2017-08-18 17:15:50",
            "content": "<article text>",
            "source": "",
        }

    :param dicN: mapping describing one news document
    :return: the populated ``NewsItem``
    :raises KeyError: if one of the keys listed above is missing
    """
    info = dicN['info']

    item = NewsItem()
    item['url'] = dicN['url']
    item['docId'] = dicN['docId']
    item['title'] = info['setname']
    item['createTime'] = info['lmodify']
    item['content'] = dicN['content']
    item['source'] = info['source']
    return item