def parse(self, response): news_list = response.xpath("//div[@class='list']") for info_item in news_list: news_item = NewsSpiderItem() news_item['title'] = info_item.xpath( ".//h2/a/text()").extract_first() news_item['origin_website'] = 'SEMI大导体产业网' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = info_item.xpath( ".//h2/a/@href").extract_first() news_item['section'] = 'SEMI大导体产业网 > 汽车电子应用' news_item['abstract'] = info_item.xpath( ".//div[@class='abstract']/text()").extract_first().strip() news_item['created_at'] = int(datetime.datetime.now().timestamp()) published_at = info_item.xpath( ".//div[@class='inputdate']/text()").extract_first() news_item['published_at'] = self.parse_timestamp(published_at) if self.deadline > news_item['published_at']: return yield news_item self.start_page = self.start_page + 1 yield scrapy.Request('http://ecar.semi.org.cn/indexLoading_' + str(self.start_page) + '.html', callback=self.parse)
def parse(self, response):
    news_list = json.loads(response.body_as_unicode())['result']
    if len(news_list) == 0:
        return
    for info_item in news_list:
        # Create a fresh item per entry; the original reused one item across
        # all requests, so concurrent detail_parse callbacks saw each other's
        # overwritten fields.
        news_item = NewsSpiderItem()
        news_item['title'] = info_item['title']
        news_item['origin_website'] = '智东西'
        news_item['created_at'] = int(datetime.datetime.now().timestamp())
        news_item['origin_host'] = self.allowed_domains[0]
        news_item['origin_url'] = info_item['link']
        news_item['section'] = ''
        news_item['abstract'] = info_item['desp']
        yield scrapy.Request(news_item['origin_url'], meta={'item': news_item},
                             callback=self.detail_parse, dont_filter=True)
    self.start_page = self.start_page + 1
    yield scrapy.FormRequest(dont_filter=True,
                             url=self.start_urls[0],
                             formdata={'action': 'category_list',
                                       'page': str(self.start_page)},
                             callback=self.parse)
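# detail_parse is referenced here (and by other spiders below) but not shown in
# this section; each spider presumably has its own. A minimal sketch, assuming
# it only fills the publish time in from the article page before yielding (the
# XPath, date format, and field name are assumptions, not the actual code):
def detail_parse(self, response):
    news_item = response.meta['item']
    published_at = response.xpath("//span[@class='time']/text()").extract_first()
    if published_at:
        news_item['published_at'] = int(
            datetime.datetime.strptime(published_at.strip(), "%Y-%m-%d %H:%M:%S").timestamp())
    yield news_item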
def parse(self, response): news_list = response.xpath("//div[@class='ArticleList']/table/tbody/tr") news_list.pop() # 去除最后一个空行 relative_path = response.url.split('index')[0] for info_item in news_list: news_item = NewsSpiderItem() news_item['title'] = info_item.xpath(".//td[@class='fw_t']/a/text()").extract_first().strip() news_item['origin_website'] = '中国科学院半导体研究所' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = relative_path + info_item.xpath(".//td[@class='fw_t']/a/@href").extract_first()[2:] news_item['section'] = response.xpath("string(//div[@class='Position'])").extract_first() news_item['abstract'] = '' news_item['created_at'] = int(datetime.datetime.now().timestamp()) published_at = info_item.xpath(".//td[@class='fw_s']/text()").extract_first() if published_at: news_item['published_at'] = int(datetime.datetime.strptime('20' + published_at.strip(), "%Y-%m-%d").timestamp()) if self.deadline > news_item['published_at']: return yield news_item else: continue next_link = response.xpath("//div[@class='t_page ColorLink']/a[contains(text(),'下一页')]/@href").extract_first() if next_link: yield scrapy.Request(relative_path + next_link, callback=self.parse, errback=self.err_callback)
def parse(self, response):
    news_list = json.loads(response.body_as_unicode())['info']['list']
    if len(news_list) == 0:
        return
    for info_item in news_list:
        # instantiate per entry so each yielded item is independent
        news_item = NewsSpiderItem()
        news_item['title'] = info_item['Title']
        news_item['origin_website'] = '人工智能科技'
        news_item['created_at'] = int(datetime.datetime.now().timestamp())
        news_item['published_at'] = int(
            datetime.datetime.strptime(info_item['CreateTime'], "%Y-%m-%d").timestamp())
        if self.deadline > news_item['published_at']:
            return
        news_item['origin_host'] = self.allowed_domains[0]
        news_item['origin_url'] = 'http://www.aistudyblog.com' + info_item['Url']
        news_item['section'] = '人工智能科技' + ' > ' + info_item['TypeName']
        news_item['abstract'] = info_item['Description']
        yield news_item
    self.start_page = self.start_page + 1
    next_link = ('http://www.aistudyblog.com/handler/CMSList.ashx'
                 '?ActionType=InformationAllTypeList&InformationPage=' + str(self.start_page))
    yield scrapy.Request(next_link, callback=self.parse)
def parse(self, response): news_list = response.xpath( "//ul[@class='list_left_ul']/li[not(@class='dashed_line')]") for info_item in news_list: news_item = NewsSpiderItem() news_item['title'] = info_item.xpath(".//a/@title").extract_first() news_item['origin_website'] = '智能电网市场' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = info_item.xpath( ".//a/@href").extract_first() news_item['section'] = '北极星智能电网在线 > 市场' news_item['abstract'] = '' news_item['created_at'] = int(datetime.datetime.now().timestamp()) published_at = info_item.xpath( ".//span/text()").extract_first().strip() news_item['published_at'] = int( datetime.datetime.strptime(published_at, "%Y-%m-%d").timestamp()) if self.deadline > news_item['published_at']: return yield news_item next_link = response.xpath( "//div[@class='list_page']/div[@class='page']/a[@title='下一页']/@href" ).extract_first() if next_link: yield scrapy.Request('http://www.chinasmartgrid.com.cn/' + next_link, callback=self.parse)
def parse(self, response): news_list = response.xpath("//ul[@class='list_jc']/li") for info_item in news_list: news_item = NewsSpiderItem() news_item['title'] = info_item.xpath( ".//a[1]/@title").extract_first() news_item['origin_website'] = '人工智能实验室' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = info_item.xpath( ".//a[1]/@href").extract_first() news_item['section'] = '首页 > 热点信息' news_item['abstract'] = info_item.xpath( ".//p[@class='cn']/text()").extract_first() news_item['created_at'] = int(datetime.datetime.now().timestamp()) published_at = info_item.xpath( ".//p[@class='xx']/span[@class='rq']/text()").extract_first( ).strip() news_item['published_at'] = int( datetime.datetime.strptime(published_at, "%Y-%m-%d").timestamp()) print(self.deadline, news_item['published_at']) if self.deadline > news_item['published_at']: return yield news_item next_link = response.xpath( "//div[@class='col-left box mt10']/div[@class='pg']/a[@class='nxt']/@href" ).extract_first() if next_link: yield scrapy.Request(next_link, callback=self.parse)
def parse(self, response): news_list = response.xpath("//div[@class='article-list']") for info_item in news_list: news_item = NewsSpiderItem() published_at = info_item.xpath( ".//div[@class='a-content']/p[@class='one-more clearfix']/span[@class='time']/text()" ).extract_first().strip() news_item['published_at'] = int( datetime.datetime.strptime(published_at, "%Y-%m-%d").timestamp()) if self.deadline > news_item['published_at']: return news_item['title'] = info_item.xpath( ".//div[@class='a-content']/h3[@class='a-title']/a/text()" ).extract_first() news_item['origin_website'] = '电子发烧友网' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = info_item.xpath( ".//div[@class='a-content']/h3[@class='a-title']/a/@href" ).extract_first() news_item['section'] = '电子发烧友网 > 人工智能' news_item['created_at'] = int(datetime.datetime.now().timestamp()) news_item['abstract'] = info_item.xpath( ".//div[@class='a-content']/p[@class='a-summary']/text()" ).extract_first().strip() yield news_item next_link = response.xpath( "//div[@class='pagn1']/a[@class='page-next']/@href").extract_first( ) if next_link: yield scrapy.Request('http://www.elecfans.com/rengongzhineng/' + next_link, callback=self.parse)
def parse(self, response): news_list = response.xpath("//table[@class='gongzuo']/tr") for info_item in news_list: news_item = NewsSpiderItem() news_item['title'] = info_item.xpath(".//td[1]//div[@class='jishu01']/a//span[@class='hei1']").extract_first().strip() news_item['origin_website'] = 'SEMI大导体产业网' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = info_item.xpath(".//td[1]//div[@class='jishu01']/a/@href").extract_first().replace('..', self.domain) news_item['section'] = 'SEMI大导体产业网 > 热点新闻' news_item['abstract'] = info_item.xpath(".//td[1]//div[@class='jishu01']/a/text()").extract_first().strip() news_item['created_at'] = int(datetime.datetime.now().timestamp()) yield scrapy.Request(news_item['origin_url'], meta={'item': news_item}, callback=self.detail_parse) next_link = response.xpath("//table[@id='AspNetPager1']//tr/td/a[contains('下一页',text())]/@href").extract_first() if next_link: self.start_page = self.start_page + 1 print('正在爬第几页', self.start_page) yield scrapy.FormRequest( url=self.start_urls[0], formdata={'__EVENTTARGET': 'AspNetPager1', '__EVENTARGUMENT': str(self.start_page), '__VIEWSTATE': self.param_viewstate, '__VIEWSTATEGENERATOR': self.param_viewstategenerator}, callback=self.parse )
def parseNewsHref(self, response):
    category_type = response.meta['category_type']
    newsList = response.xpath(".//li[@class='cfix']")
    for news in newsList:
        item = NewsSpiderItem()
        item['type'] = category_type
        item['title'] = news.xpath('.//h2/a/text()').extract_first()
        item['url'] = news.xpath('.//h2/a/@href').extract_first()
        item['summary'] = news.xpath('.//p/text()').extract_first()
        item['time'] = news.xpath(".//em[@class='fRight']/text()").extract_first()
        item['content'] = []
        yield scrapy.Request(url=item['url'], meta={'item': item}, callback=self.parseNews)
def parse_article(self, response):
    detail = response.xpath('//div[@class="article-wrap"]')
    item = NewsSpiderItem()
    item['title'] = detail.xpath('./h1[@class="t-h1"]/text()')[0].extract()
    item['auth'] = u"作者:" + detail.xpath('./div/span[@class="author-name"]/a/text()')[0].extract()
    item['post_time'] = u"发表时间:" + detail.xpath(
        './div/div[@class="column-link-box"]/span[@class="article-time pull-left"]/text()')[0].extract()
    # known issue: the abstract can be wrong, since self.desc is shared spider
    # state (see the sketch after the listing parse() below)
    item['descr'] = u"简述:" + self.desc + "\n"
    all_pars = detail.xpath('//div[@class="article-content-wrap"]//p/text()').extract()
    content = ''
    for par in all_pars:
        content = content + par + "\n"
    # item is freshly created here, so 'main_news' cannot already be set;
    # the original's append-if-present branch was dead code
    item['main_news'] = content
    yield item
def parse_news(self, response):
    url = response.url
    title = response.xpath('//article//h1/text()').extract_first()
    post_time = response.xpath('//article//p[@class="update-time"]/text()').extract_first()
    content = response.xpath(
        '//*[@id="body-text"]//*[contains(@class, "zn-body__paragraph")]//text()')
    content = ' '.join(content.extract())
    item = NewsSpiderItem()
    item['url'] = url
    item['title'] = title
    item['report_time'] = post_time
    item['content'] = content
    item['crawl_time'] = time.time()
    yield item
def parse(self, response): print "Start............................" self.desc = '' for sel in response.xpath('//div[@class="mod-b mod-art clearfix "]'): item = NewsSpiderItem() item['title'] = sel.xpath( './div/h2/a[@class="transition msubstr-row2"]/text()' )[0].extract() self.desc = sel.xpath( './div[@class="mob-ctt index-article-list-yh"]/div[@class="mob-sub"]/text()' )[0].extract() link = sel.xpath('./div/h2/a/@href')[0].extract() url = response.urljoin(link) yield scrapy.Request(url, callback=self.parse_article)
def parse(self, response): news_list = response.xpath("//div[@id='divArticleList']/div[contains(@class,'Article-box-cont')]/div[@class='Article-content']") for info_item in news_list: news_item = NewsSpiderItem() news_item['title'] = info_item.xpath(".//h3/a/text()").extract_first().strip() news_item['origin_website'] = '全球半导体观察' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = 'https://www.dramx.com' + info_item.xpath(".//h3/a/@href").extract_first() news_item['section'] = '全球半导体观察 > ' + response.xpath("//a[@class='Article-boxtitle-active']/text()").extract_first() news_item['abstract'] = info_item.xpath(".//p[@class='Article-essay']/text()").extract_first() news_item['created_at'] = int(datetime.datetime.now().timestamp()) yield scrapy.Request(news_item['origin_url'], meta={'item': news_item}, callback=self.detail_parse, dont_filter=True) next_link = response.xpath("(//div[@class='jogger']/a)[last()]/@href").extract_first() if next_link: yield scrapy.Request('https://www.dramx.com' + next_link, callback=self.parse, errback=self.err_callback, dont_filter=True)
def parse_news(self, response): url = response.url title = response.xpath('//h1[@class="story-body__h1"]/text()').extract_first() post_time = response.xpath( '//div[@class="date date--v2"]/text()').extract_first() content = response.xpath( '//div[@class="story-body__inner"]//p//text()' ) content = ' '.join(content.extract()) if title: item = NewsSpiderItem() item['url'] = url item['title'] = title item['report_time'] = post_time item['content'] = content item['crawl_time'] = time.time() yield item
def parse(self, response):
    array_split_url = response.request.url.split('-')
    category = ''
    if len(array_split_url) > 1:
        if '/' in array_split_url[1]:
            category_key = array_split_url[1].split('/')[0]
        else:
            category_key = array_split_url[1]
        category = project_items.get(category_key)
    content_list = response.xpath(".//*[@class='viewpointListWrap contentWrap perspective']/ul/li")
    for content in content_list:
        url = content.xpath(".//a/@href").extract()
        if len(url) > 0:
            url = url[0]
        title = content.xpath(".//a/p[@class='perspectiveTitle']/text()").extract()
        if len(title) > 0:
            title = title[0]
        time = content.xpath(".//p[@class='researchInfo']/span[@class='time']/text()").extract()
        if len(time) > 0:
            time = time[0]
        author = content.xpath(".//*[@class='researchInfo-author']/span[@class='author']/text()").extract()
        if len(author) > 0:
            author = author[0]
        item = NewsSpiderItem(url=url, title=title, time=time, author=author,
                              source=self.__source, category=category,
                              create_time=datetime.datetime.now())
        request = scrapy.Request(url=url, callback=self.parse_body)
        request.meta['item'] = item  # stash the item on the request
        yield request
    next_page = response.xpath(
        ".//*[@id='page']/ul/li[@class='active']/following-sibling::*[1]/a/@href").extract()
    if len(next_page) > 0:
        yield scrapy.Request(url=next_page[0], callback=self.parse)
def parseNews(self, response): data = response.xpath("//div[@id='C-Main-Article-QQ']") item = NewsSpiderItem() timee = data.xpath("//span[@class='article-time']/text()").extract() title = data.xpath("//div[@class='hd']//h1/text()").extract() content = data.xpath("//p/text()").extract() time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}") if len(timee) != 0 and len(title) != 0 and len(content) != 0: tm = time_pattern.findall(timee[0])[0] item['time'] = int(time.mktime(time.strptime(tm, '%Y-%m-%d %H:%M'))) item['title'] = title[0] item['url'] = response.url cc = '' if len(content) != 0: for c in content: cc = cc + c + '\n' item['content'] = cc yield item
def parse_raw_html(self, response):
    item = NewsSpiderItem()
    item['news_url'] = response.url
    # decoding response.body via response.encoding was tried and abandoned;
    # the raw bytes are stored as-is
    item['raw_html'] = response.body
    try:
        g = Goose({'stopwords_class': StopWordsChinese})
        extr = g.extract(raw_html=item['raw_html'])
        cleaned_text = extr.cleaned_text
        title = extr.title
        if cleaned_text:
            # extract weighted keywords from title and body, then merge:
            # a term present in both lists gets half its title weight added;
            # title-only terms are appended
            title_pair = jieba.analyse.extract_tags(title, topK=20, withWeight=True)
            cleaned_text_pair = jieba.analyse.extract_tags(cleaned_text, topK=20, withWeight=True)
            title_pair_list = [[k[0], k[1]] for k in title_pair]
            cleaned_text_pair_list = [[k[0], k[1]] for k in cleaned_text_pair]
            for ti_va in title_pair_list:
                flag = True
                for te_va in cleaned_text_pair_list:
                    if ti_va[0] == te_va[0]:
                        te_va[1] += ti_va[1] * 0.5
                        flag = False
                if flag:
                    cleaned_text_pair_list.append(ti_va)
            cleaned_text_pair_list.sort(key=self.takeSecond, reverse=True)
            simhash = MySimHash().get_simhash(cleaned_text_pair_list[:20])
            item['title'] = title
            item['simhash'] = str(simhash)
            item['cleaned_text'] = cleaned_text
            item['tags'] = cleaned_text_pair_list[:20]
            yield item
    except UnicodeDecodeError as e:
        logger.error(e)
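# takeSecond, used as the sort key above, is not shown in this section; it
# presumably just selects the weight from a [term, weight] pair:
def takeSecond(self, pair):
    return pair[1]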
def parseNews(self,response): data = response.xpath("//div[@class='post_content_main']") item = NewsSpiderItem() timee = data.xpath("//div[@class='post_time_source']/text()").extract() title = data.xpath("//h1/text()").extract() content = data.xpath("//div[@class='post_text']/p/text()").extract() time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}") if(len(timee)!=0 and len(title)!=0 and len(content)!=0): tm = time_pattern.findall(timee[0])[0] item['time'] = int(time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M'))) item['title'] = title[0] item['url'] = response.url item['origin'] = 'netease' cc='' if(len(content)!=0): for c in content: cc = cc+c+'\n' item['content'] = cc yield item
def parse(self, response): news_list = response.xpath("//table[@class='gongzuo']/tr") for info_item in news_list: news_item = NewsSpiderItem() published_at_text = info_item.xpath( ".//td[2]/text()").extract_first().strip() news_item['title'] = info_item.xpath( ".//td[@class='zuobian']/a/text()").extract_first() news_item['origin_website'] = 'SEMI大导体产业网' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = self.domain + info_item.xpath( ".//td[@class='zuobian']/a/@href").extract_first() news_item['section'] = '大导体产业网 > IC设计与制造' news_item['abstract'] = '' news_item['published_at'] = int( datetime.datetime.strptime(published_at_text, "%Y-%m-%d").timestamp()) news_item['created_at'] = int(datetime.datetime.now().timestamp()) if self.deadline > news_item['published_at']: return yield news_item next_link = response.xpath( "//table[@id='AspNetPager1']//tr/td/a[contains('下一页',text())]/@href" ).extract_first() if next_link: self.start_page = self.start_page + 1 yield scrapy.FormRequest(url=self.start_urls[0], dont_filter=True, formdata={ '__EVENTTARGET': 'AspNetPager1', '__EVENTARGUMENT': str(self.start_page), '__VIEWSTATE': self.param_viewstate, '__VIEWSTATEGENERATOR': self.param_viewstategenerator }, callback=self.parse)
def parseNews(self, response): articles = response.xpath("//div[@id='pagelet-article']") item = NewsSpiderItem() title = articles.xpath( "//div[@class='article-header']/h1/text()").extract()[0] tm = articles.xpath( "//div[@id='pagelet-article']//span[@class='time']/text()" ).extract()[0] content = articles.xpath( "//div[@class='article-content']//p/text()").extract() if (len(title) != 0 and len(tm) != 0 and len(content) != 0): item['title'] = title item['time'] = int(time.mktime(time.strptime(tm, '%Y-%m-%d %H:%M'))) item['url'] = response.url cc = '' if (len(content) != 0): for c in content: cc = cc + c + '\n' item['content'] = cc yield item
def parse(self, response):
    news_list = json.loads(response.body_as_unicode())['newsList']
    for info_item in news_list:
        news_item = NewsSpiderItem()
        news_item['title'] = info_item['title']
        news_item['origin_website'] = 'OFweek人工智能网'
        news_item['origin_host'] = self.allowed_domains[0]
        news_item['origin_url'] = info_item['htmlpath']
        news_item['section'] = 'OFweek人工智能网 > 自然语言处理'
        news_item['abstract'] = info_item['summery']  # 'summery' is the API's own field name
        news_item['created_at'] = int(datetime.datetime.now().timestamp())
        news_item['published_at'] = int(
            datetime.datetime.strptime(info_item['addtimeStr'], "%Y-%m-%d %H:%M:%S").timestamp())
        if self.deadline > news_item['published_at']:
            return
        yield news_item
    self.start_page = self.start_page + 1
    next_link = 'https://ai.ofweek.com/CAT-201718-nlp-' + str(self.start_page) + '.html'
    yield scrapy.Request(next_link, callback=self.parse)
def parse(self, response): news_list = response.xpath("//td[@id='ArticleBody']/ul/li") for info_item in news_list: news_item = NewsSpiderItem() news_item['title'] = info_item.xpath( ".//p/a/span/text()").extract_first() news_item['origin_website'] = '中国半导体行业协会' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = info_item.xpath( ".//p/a/@href").extract_first() news_item['section'] = '中国半导体行业协会 > 行业要闻' news_item['abstract'] = '' news_item['created_at'] = int(datetime.datetime.now().timestamp()) published_at_text = info_item.xpath( ".//p/span[2]/text()").extract_first() if published_at_text: print(published_at_text) published_at = re.sub(u"[(\()(\))]", "", published_at_text.strip()) news_item['published_at'] = int( datetime.datetime.strptime( published_at, "%Y-%m-%d %H:%M:%S").timestamp()) if self.deadline > news_item['published_at']: return else: continue # print(news_item) yield news_item next_link = response.xpath( "//div[@class='showpage']/form/a[contains(text(),'下一页')]/@href" ).extract_first() if next_link: yield scrapy.Request('http://www.csia.net.cn/Article/' + next_link, callback=self.parse, errback=self.err_callback)
def parseNews(self, response): logging.info("--------------parsing news--------------") startparseSingleNews = datetime.datetime.now() logging.info("$$$$$$$$$$$$$$$$$$$$startparseSingleNews at : " + str(startparseSingleNews)) data = response.xpath("//div[@id='Cnt-Main-Article-QQ']") item = NewsSpiderItem() timee = data.xpath("//span[@class='article-time']/text()").extract() # 修改抽取内容 条件或 content = response.xpath( "//div[@id='Cnt-Main-Article-QQ']/p[@style='TEXT-INDENT: 2em']/text()" ).extract() cc = '' if len(content) > 0: self.fileName = response.url[-10:-4] + ".txt" scripts = response.xpath("//script/text()").extract() url = response.url title = response.xpath("//title/text()").extract() for c in content: cc = cc + c + '\n' content = cc.strip() logging.info("--------------pre url --------------" + response.url) logging.info("--------------pre content --------------" + content) for scriptCnt in scripts: if (scriptCnt.find('pubtime') > 0): time = self.getTimeStr(scriptCnt) logging.info("--------------time--------------" + time) break logging.info("--------------questions urls--------------" + response.url) title = u''.join(title[0]).encode('utf-8') logging.info("--------------questions title--------------" + title) content = u''.join(content).encode('utf-8') logging.info("--------------content title--------------" + content) if (len(content) > 0): if (url.find("sports.qq.com") >= 0): self.save("tencent/sports/", url, time, title, content) elif (url.find("finance.qq.com") >= 0 or url.find("money.qq.com") >= 0 or url.find("stock.qq.com") >= 0): self.save("tencent/finance/", url, time, title, content) elif (url.find("ent.qq.com") >= 0): self.save("tencent/ent/", url, time, title, content) elif (url.find("tech.qq.com") >= 0): self.save("tencent/tech/", url, time, title, content) elif (url.find("auto.qq.com") >= 0): self.save("tencent/auto/", url, time, title, content) elif (url.find("house.qq.com") >= 0): self.save("tencent/house/", url, time, title, content) elif (url.find("fashion.qq.com") >= 0): self.save("tencent/fashion/", url, time, title, content) elif (url.find("cul.qq.com") >= 0): self.save("tencent/cul/", url, time, title, content) #=============================================================== # if(url.find("finance.qq.com") >= 0 or url.find("money.qq.com") >= 0 or url.find("stock.qq.com") >= 0): # self.save("tencent/finance/", url, time, title, content) #=============================================================== endparseSingleNews = datetime.datetime.now() logging.info("$$$$$$$$$$$$$$$$$$$$ endparseSingleNews at : " + str(endparseSingleNews)) logging.info("$$$$$$$$$$$$$$$$$$$$ single news cost time : " + str(endparseSingleNews - startparseSingleNews))