def parse(self, response):
    self.l = ItemLoader(item=NewsItem(), response=response)
    print(self.start_urls[0])
    # Only the international sections are crawled; the other categories are
    # kept below as comments.
    # yield scrapy.Request(url=self.start_urls[0], callback=self.thehindumain)
    # yield scrapy.Request(url=self.start_urls[1], callback=self.thetimesmain)
    # business:
    # yield scrapy.Request(url='http://www.thehindu.com/business/', callback=self.thehindumain)
    # yield scrapy.Request(url='https://timesofindia.indiatimes.com/business', callback=self.thetimestopic)
    # india:
    # yield scrapy.Request(url='http://www.thehindu.com/news/national/', callback=self.thehindumain)
    # yield scrapy.Request(url='https://timesofindia.indiatimes.com/india', callback=self.thetimestopic)
    # sports:
    # yield scrapy.Request(url='http://www.thehindu.com/news/sport/', callback=self.thehindumain)
    # yield scrapy.Request(url='https://timesofindia.indiatimes.com/sports', callback=self.thetimestopic)
    # tech:
    # yield scrapy.Request(url='http://www.thehindu.com/news/technology/', callback=self.thehindumain)
    # yield scrapy.Request(url='https://www.gadgetsnow.com?utm_source=toiweb&utm_medium=referral&utm_campaign=toiweb_hptopnav', callback=self.thetimestopic)
    # sci:
    # yield scrapy.Request(url='http://www.thehindu.com/news/sci-tech/', callback=self.thehindumain)
    # international:
    yield scrapy.Request(url='http://www.thehindu.com/news/international/',
                         callback=self.thehindumain)
    yield scrapy.Request(url='https://timesofindia.indiatimes.com/world/',
                         callback=self.thetimestopic)
def News(self, sel, xpath, pri):
    """Build a NewsItem for each headline under `xpath` and follow its link."""
    # `timeNow` is assumed to be set at module level by the caller.
    headlines = sel.xpath(xpath)
    for h in headlines:
        tt = h.xpath('.//a/text()').extract()
        hh = h.xpath('.//a/@href').extract()
        if tt and len(tt) == 1:
            item = NewsItem()
            item['title'] = tt[0].strip()
            item['href'] = hh[0]
            item['uptime'] = timeNow
            item['pri'] = pri
            item['site'] = self.siteName
            request = Request(item['href'], callback=self.parsePeopleContent)
            request.meta['item'] = item
            yield request
        else:
            # Several anchors under one node: pair each title with its href
            # rather than repeating the first one.
            for title, href in zip(tt, hh):
                item = NewsItem()
                item['title'] = title.strip()
                item['href'] = href
                item['uptime'] = timeNow
                item['pri'] = pri
                item['site'] = self.siteName
                request = Request(item['href'], callback=self.parsePeopleContent)
                request.meta['item'] = item
                yield request
def parse(self, response): """ """ timeNow = datetime.now().strftime("%Y-%m-%d %H:%M:%S") hxs = Selector(response) xpath_headline = "/html/body/div[7]/div/div/div/div[3]/h1/a" xpath_mainnews = "/html/body/div[7]/div/div/div/div[3]/ul/li" headline = hxs.xpath(xpath_headline) mainNews = hxs.xpath(xpath_mainnews) items = [] item = NewsItem() item['title'] = Get(headline, "text()") item['href'] = Get(headline, "@href") item['uptime'] = timeNow item['pri'] = 0 item['site'] = self.site_name items.append(item) for mainNewsItem in mainNews: item = NewsItem() item['title'] = Get(mainNewsItem, "a/text()") item['href'] = Get(mainNewsItem, "a/@href") item['uptime'] = timeNow item['pri'] = 3 item['site'] = self.site_name request = Request(item['href'], callback=parseIfengContent) request.meta['item'] = item yield request
def parse(self, response):
    href = response.url
    title = response.xpath('//header/h1/text()').extract()[0]
    pubtime = response.xpath(
        '//div[@class="reporter"]/time/text()').extract()[0]
    # Dates look like "2018年1月2日 03:04" (year/month/day hour:minute).
    r = re.findall(u'(\d+)年(\d+)月(\d+)日 (\d+):(\d+)', pubtime)
    if not r:
        raise Exception(u'failed to parse the publication time')
    r = [int(x) for x in r[0]]
    pubtime = datetime.datetime(year=r[0], month=r[1], day=r[2],
                                hour=r[3], minute=r[4])
    htmlcontent = response.xpath(
        '//div[contains(@class, "page_container")]/article/article'
    ).extract()[0]
    keywords = response.xpath('//div[@class="a_k"]/a/text()').extract()
    source = u'中時電子報'  # China Times
    item = NewsItem(title=title, pubtime=pubtime, htmlcontent=htmlcontent,
                    href=href, keywords=keywords, source=source)
    yield item
def parse_news(self, response):
    news = NewsItem()
    news['url'] = response.url

    temp = response.xpath("//h1[@id='artibodyTitle']//text()").extract()
    news['title'] = temp[0] if temp else ''

    # The source byline appears under one of two markers depending on the template.
    temp = response.xpath("//span[@id='media_name']//a//text()").extract()
    news['source'] = temp[0] if temp else ''
    if news['source'] == '':
        temp = response.xpath(
            "//span[@data-sudaclick='media_name']//a//text()").extract()
        news['source'] = temp[0] if temp else ''

    # Likewise, the publication time has two possible locations.
    temp = response.xpath("//span[@id='pub_date']//text()").extract()
    news['public_time'] = temp[0] if temp else ''
    if news['public_time'] == '':
        temp = response.xpath(
            "//span[@class='time-source']//text()").extract()
        news['public_time'] = temp[0] if temp else ''
    if news['public_time'] != '':
        news['public_time'] = self.get_datetime(news['public_time'])

    temp = response.xpath("//div[@id='artibody']//p//text()").extract()
    news['content'] = '\n'.join(temp) if temp else ''

    # e.g. http://sports.sina.com.cn/nba/... -> cat='sports', sub_cat='nba'
    cat = response.url.split('//')[1].split('.')[0]
    sub_cat = response.url.split('//')[1].split('/')[1]
    news['category'] = self.get_category(cat, sub_cat)
    return news
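# parse_news above relies on self.get_datetime and self.get_category, which
# are not included in this excerpt. A rough sketch of get_category under
# assumptions (the real lookup table is not shown; the entries below are
# illustrative only):
CATEGORY_MAP = {
    ('news', 'c'): 'domestic',
    ('sports', 'nba'): 'sports',
}

def get_category(self, cat, sub_cat):
    # Map the sina subdomain and first path segment to a category label,
    # falling back to the bare subdomain.
    return CATEGORY_MAP.get((cat, sub_cat), cat)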
def parse(self, response):
    href = response.url
    title = response.xpath(
        '//h3[contains(@class, "post-title")]/text()').extract()[0]
    title = title.replace("\n", "")
    pubtime = response.xpath(
        '//abbr[@class="date-header"]/span/text()').extract()[0]
    # Dates look like "2018年1月2日" (year/month/day).
    r = re.compile(u"(\d+)年(\d+)月(\d+)日")
    r = re.findall(r, pubtime)
    if not r:
        raise Exception(u"failed to parse the publication time")
    r = [int(x) for x in r[0]]
    pubtime = datetime.datetime(year=r[0], month=r[1], day=r[2])
    htmlcontent = response.xpath(
        '//div[contains(@class, "post-body")]').extract()[0]
    keywords = response.xpath(
        '//span[@class="post-labels"]/a[@rel="tag"]/text()').extract()
    source = u"KKday"
    item = NewsItem(title=title, pubtime=pubtime, htmlcontent=htmlcontent,
                    href=href, keywords=keywords, source=source)
    yield item
def parse_item(self, response):
    try:
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item['link'] = response.url
        item['title'] = hxs.select("//head/title/text()").extract()[0]
        time_raw = hxs.select(
            "//div[@class='about']/span/text()").extract()[0].split()
        item['date'] = time_raw[0]
        (province, city) = get_province_city(item['title'])
        item["province"] = province
        item["city"] = city
        item['content'] = ''
        content_list = hxs.select("//dl[@id='content']/dd//p/text()").extract()
        # Fallback selector kept for reference:
        # content_list = hxs.select("//div[@id='endtext']//FONT/text()").extract()
        if content_list:
            item['content'] = ''.join(content_list)
        else:
            print('None')
        if item['content'] == '':
            item['content'] = "null"
        yield item
        time.sleep(random())  # polite random delay between pages
    except Exception:
        # Swallowing errors hides parse failures; at least keep the
        # narrower Exception so KeyboardInterrupt still propagates.
        pass
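# This snippet (and the similar parse_item further down) depends on a
# get_province_city(title) helper that is not shown. A minimal sketch,
# assuming it scans the title for known region names; the lookup data here
# is illustrative only, not from the source:
PROVINCE_CITIES = {u'北京': u'北京', u'广东': u'广州'}

def get_province_city(title):
    # Return the first (province, city) pair whose province name appears
    # in the title, or placeholders when none matches.
    for province, city in PROVINCE_CITIES.items():
        if province in title:
            return (province, city)
    return ('null', 'null')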
def saveFile(self, response):
    item = NewsItem()
    tieuDe = response.xpath(
        "//*[@id='ArticleHolder']/div[2]/h1/text()").extract_first()
    ndTomTat = response.xpath(
        "//*[@id='ArticleContent']/div[1]/h2/p").extract()
    nd = response.xpath("//*[@id='ArticleContent']").extract()
    tacgia = response.xpath(
        "//*[@id='ArticleContent']/p[last()]/span/text()").extract_first()
    if tacgia is None:
        tacgia = "admin"
    print("summary =", ndTomTat)
    item['tieuDe'] = tieuDe  # title
    item['ndTomTat'] = ndTomTat  # summary
    self.a.pop()  # drop the queue entry consumed for this article
    item['nd'] = nd  # body
    item['tieuDeKhongDau'] = self.tieudekhongdau(tieuDe)  # title without diacritics
    item['anh'] = self.anh.pop()  # take the most recent image URL
    item['view'] = 1
    item['tacGia'] = tacgia  # author
    item['create_at'] = datetime.now()
    item['update_at'] = datetime.now()
    item['id_lt_id'] = 1
    item['id_tl_id'] = 1
    yield item
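# saveFile above calls self.tieudekhongdau(tieuDe), Vietnamese for "title
# without diacritics". A sketch assuming it strips accents to build a slug
# (the actual implementation is not shown; note this simple NFD approach
# does not cover the Vietnamese letter 'đ'):
import unicodedata

def tieudekhongdau(self, title):
    # Decompose accented characters and drop the combining marks,
    # so 'Tin tức' becomes 'Tin tuc'.
    normalized = unicodedata.normalize('NFD', title)
    return ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')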
def parseDetail(self, response):
    item = NewsItem()
    item['link'] = response.meta['link']
    item['title'] = response.meta['title']
    data = response.xpath("//div[@id='ContentBody']")
    # Use a relative path: "//p" would select every <p> in the document,
    # not just those inside ContentBody.
    item['detail'] = data.xpath(".//p/text()").extract()
    yield item
def parseContent(self, response):
    item = NewsItem()
    item['leadingTitle'] = ''
    item['subTitle'] = ''
    item['source'] = '搜狐新闻'  # Sohu News
    item['edition'] = ''
    item['anthor'] = ''
    item['url'] = response.url

    title = response.xpath(
        '//div[@class="text-title"]/h1/text()').extract_first()
    item['mainTitle'] = '' if title is None else title.replace('\r', '').replace('\n', '').\
        replace('\xa0', ' ').replace('\u3000', ' ').strip()

    datetext = response.xpath(
        '//div[@class="article-info"]/span/text()').extract_first()
    date = ''
    if datetext is not None:
        datetext = datetext.replace('\r', '').replace('\n', '').replace(
            '\xa0', ' ').replace('\u3000', ' ').strip()
        try:
            date = datetime.datetime.strptime(datetext.strip(),
                                              "%Y-%m-%d %H:%M")
        except ValueError:
            print('failed to convert the publication time')
    item['date'] = date

    content = response.xpath('//*[@id="mp-editor"]').xpath(
        'string()').extract_first()
    item['content'] = '' if content is None else content.replace('\r', '').replace('\n', '').replace('\xa0', ' ').\
        replace('\u3000', ' ').strip()
    yield item
def parse_item(self, response):
    try:
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item['link'] = response.url
        item['title'] = hxs.select("//head/title/text()").extract()[0]
        # The year and month are encoded in the URL path.
        time_raw = item['link'].split('/')
        item['date'] = time_raw[-3] + "-" + time_raw[-2]
        (province, city) = get_province_city(item['title'])
        item["province"] = province
        item["city"] = city
        item['content'] = ''
        content_list = hxs.select("//p/text()").extract()
        # Fallback selector kept for reference:
        # content_list = hxs.select("//div[@class='yjl_fx168_article_zhengwen']/div[@class='TRS_Editor']//p/text()").extract()
        if content_list:
            item['content'] = ''.join(content_list)
        else:
            print('None')
        if item['content'] == '':
            item['content'] = "null"
        yield item
        time.sleep(0.5)
    except Exception:
        pass
def parse(self, response):
    youbi_cd = date.today().weekday()
    for idx, bangumi_list in enumerate(response.css('td[valign="top"]')):
        # Skip the programme columns for every weekday except today.
        if idx != youbi_cd:
            continue
        # Collect the synopsis text only for the target programmes.
        for bangumi in bangumi_list.css('table.new_day'):
            bangumi_name = bangumi.css(
                'span.prog_name a.bangumiDetailOpen::text').extract_first()
            if bangumi_name is None:
                continue
            target_name = get_target_bangumi_name(bangumi_name.strip(),
                                                  TARGET_BANGUMI_DICT)
            if target_name is None:
                continue
            print(bangumi_name)
            text = bangumi.css(
                'span.expo_org a.bangumiDetailOpen::text').extract_first()
            if text is None:
                continue
            text = remove_words(text,
                                TARGET_BANGUMI_DICT[target_name]['rm_word'],
                                KYOKU_SEP_DICT['asahi'])
            if text != '':
                # A fresh item per programme, so yielded items are not
                # mutated afterwards.
                item = NewsItem()
                item['text'] = text
                yield item
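# The Asahi schedule parser above depends on get_target_bangumi_name and
# remove_words, which are not included here. Minimal sketches under
# assumptions (the real matching and cleanup rules are not shown):
def get_target_bangumi_name(name, target_dict):
    # Return the canonical programme name whose key occurs in `name`.
    for target in target_dict:
        if target in name:
            return target
    return None

def remove_words(text, rm_words, separators):
    # Strip per-programme noise words and station separators from the blurb.
    for w in list(rm_words) + list(separators):
        text = text.replace(w, '')
    return text.strip()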
def parse(self, response):
    href = response.url
    title = response.xpath(
        '//div[contains(@class, "post-heading")]/h1/text()').extract()[0]
    pubtime = response.xpath(
        '//div[contains(@class, "post-heading")]/div[@class="date"]').extract()[0]
    # Try "Y-m-d H:M" first, then the date-only form.
    r = re.compile("(\d+)-(\d+)-(\d+) (\d+):(\d+)")
    r = re.findall(r, pubtime)
    if not r:
        r = re.compile("(\d+)-(\d+)-(\d+)")
        r = re.findall(r, pubtime)
        if not r:
            raise Exception(u"failed to parse the publication time")
    r = [int(x) for x in r[0]]
    if len(r) == 5:
        pubtime = datetime.datetime(year=r[0], month=r[1], day=r[2],
                                    hour=r[3], minute=r[4])
    else:
        pubtime = datetime.datetime(year=r[0], month=r[1], day=r[2])
    htmlcontent = response.xpath('//div[@class="post-content"]').extract()[0]
    # Turn <a class="image" href=...> wrappers into real <img> tags.
    soup = BeautifulSoup(htmlcontent, "lxml")
    for a in soup.find_all("a", class_="image"):
        a.name = "img"
        a["src"] = a["href"]
        a["alt"] = a["title"]
    htmlcontent = str(soup.body.contents[0])  # unicode() under Python 2
    keywords = []
    source = u"星島日報"  # Sing Tao Daily
    item = NewsItem(title=title, pubtime=pubtime, htmlcontent=htmlcontent,
                    href=href, keywords=keywords, source=source)
    yield item
def read_story(self, response):
    storyItem = NewsItem()
    storyItem['url'] = response.request.url
    storyItem['title'] = response.xpath('//h1/text()').extract()[0]
    story = ""
    try:
        date = response.xpath(
            '//meta[@itemprop="datePublished"]/@content').extract()[0]
        # date = response.xpath('//time[@class="dateline"]/text()')[0].extract()
        storyItem['date'] = date
        for i in response.xpath(
                '//body//article[@id="story"]//p[@class="story-body-text story-content"]'):
            for j in i.xpath('.//text()').extract():
                story = ''.join([story, ' ', j.strip()])
        storyItem['text'] = story
    except Exception:
        storyItem['text'] = ''
    # Only keep stories with a reasonable amount of body text.
    if len(storyItem['text']) > 1000:
        yield storyItem
    # Follow in-article links, skipping anchors, index pages, author pages,
    # news-event hubs and newsletter links.
    skip = ('#', 'index.html', '/by/', 'news-event', 'newsletter')
    for link in response.xpath('//article[@id="story"]//a/@href').extract():
        if all(s not in link for s in skip):
            yield scrapy.Request(url=link, callback=self.read_story)
def parse_item(self, response):
    title = response.meta['title']
    pic_url = response.meta['pic_url']
    pic_more_url = response.meta['pic_more_url']
    publishedDate = response.meta['publishedDate']
    category = response.meta['category']
    describe = ''
    # The body is a JS assignment: strip the prefix, then parse the JSON.
    data = response.text.replace('var XinhuammNews =', '')
    data = json.loads(data)
    content = data['content']
    publishedDate = data['releasedate']
    # Strip whitespace and markup, keeping only the text between tags.
    content = content.replace('\t', '').replace('\n', '').replace('\r', '').replace(' ', '')
    content = ''.join(re.findall('>(.*?)<', content))
    home_url = response.meta['home_url']
    crawlTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
    app_name = '新华社'  # Xinhua News Agency
    author = ''
    print("app name:", app_name)
    print("main image url:", pic_url)
    print("extra image urls:", pic_more_url)
    print("author:", author)
    print("detail page url:", response.url)
    print("category:", category)
    print("title:", title)
    print("description:", describe)
    print("content:", content)
    print("home url:", home_url)
    print("published:", publishedDate)
    print("crawled:", crawlTime)
    item = NewsItem()
    item['count'] = self.count
    item['app_name'] = app_name
    item['pic_url'] = pic_url
    item['pic_more_url'] = pic_more_url
    item['author'] = author
    item['url'] = response.url
    item['category'] = category
    item['title'] = title
    item['describe'] = describe
    item['content'] = content
    item['home_url'] = home_url
    item['publishedDate'] = publishedDate
    item['crawlTime'] = crawlTime
    # The release date comes in several formats; try each in turn.
    for fmt in ("%Y-%m-%d %H:%M", "%Y-%m-%d", "%Y-%m-%d %H:%M:%S"):
        try:
            publishedDate_stamp = int(time.mktime(time.strptime(publishedDate, fmt)))
            break
        except ValueError:
            continue
    else:
        raise ValueError('unrecognised release date: %s' % publishedDate)
    # Only emit items newer than the cut-off timestamp.
    if publishedDate_stamp > self.timeStamp:
        item['publishedDate'] = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(float(publishedDate_stamp)))
        self.count += 1
        yield item
def page_content(self, response):
    item = NewsItem()
    item["category"] = response.xpath(
        '//*[@id="cSub"]/div/div[2]/div[1]/h3/text()').extract()
    # The lead story sits in its own div; stories 2-5 are <li> entries.
    lead = '//*[@id="cSub"]/div/div[1]/div[1]/div/strong/a'
    item["title1"] = response.xpath(lead + '/text()').extract()
    item["link1"] = response.xpath(lead + '/@href').extract()
    for n, li in enumerate(range(1, 5), start=2):
        base = '//*[@id="cSub"]/div/div[1]/ul[1]/li[%d]/div[1]/strong/a' % li
        item["title%d" % n] = response.xpath(base + '/text()').extract()
        item["link%d" % n] = response.xpath(base + '/@href').extract()
    yield item
def get_news(self, response):
    item = NewsItem()
    title = response.css('h1[class ="content__headline "]').css(
        'h1[itemprop="headline"]::text').extract()
    if len(title) == 0:
        # Guardian articles come in two layouts; if the first selector
        # misses, fall back to the meta description.
        title = response.css('meta[itemprop="description"]').xpath(
            '@content').extract()
    time = response.css(
        'time[itemprop = "datePublished"]::text').extract_first()
    category = response.css(
        'a[class ="subnav-link subnav-link--current-section"]::text'
    ).extract_first()
    tags = response.css('a[class = "submeta__link"]::text').extract()
    content = response.css('div[itemprop = "articleBody"]').css(
        'p::text').extract()
    if len(content) == 0:
        # Some pieces are video or other non-text formats; use the title
        # as the content in that case.
        content = title
    item['title'] = title
    item['time'] = time
    item['category'] = category
    item['tags'] = tags
    item['content'] = content
    yield item
def parseContent(self, response):
    item = NewsItem()
    item['leadingTitle'] = ''
    item['mainTitle'] = response.meta['title'].replace('\r', '').replace('\n', '').replace('\xa0', ' ').\
        replace('\u3000', ' ').strip()
    item['subTitle'] = ''
    item['source'] = '人民网'  # People's Daily Online

    date = ''
    datetext = response.xpath(
        '//div[@class="box01"]/div[1]/text()').extract_first()
    if datetext is not None:
        datetext = datetext.replace('\r', '').replace('\n', '').replace(
            '\xa0', ' ').replace('\u3000', ' ').strip()
        # Cut off everything from the "来源" ("source") marker onwards.
        index = datetext.find('来源')
        if index > -1:
            datetext = datetext[:index]
        try:
            date = datetime.datetime.strptime(datetext.strip(),
                                              "%Y年%m月%d日%H:%M")
        except ValueError:
            pass
    item['date'] = date
    item['edition'] = ''
    item['anthor'] = ''

    content = ''
    for p in response.xpath('//div[@class="box_con"]/p'):
        c = p.xpath('./text()').extract_first()
        if c is not None:
            content += c
    item['content'] = content.replace('\r', '').replace('\n', '').\
        replace('\xa0', ' ').replace('\u3000', ' ').strip()
    item['url'] = response.url
    yield item
def parse(self, response):
    try:
        # The response is JSON; the article list is embedded as an HTML fragment.
        jsonresponse = json.loads(response.body_as_unicode())
        try:
            info = jsonresponse['data']['article_info']
            news_list = Selector(text=info).xpath('//div')
            for news in news_list:
                cat = news.xpath('span[@class="t-tit"]/text()').extract()[0]
                if '图片' in cat:  # '图片' = picture
                    print('skipping a picture-only story')
                    continue
                time = ''.join(news.xpath('dl/dt/span/text()').extract())
                title = ''.join(news.xpath('dl/dt/a/text()').extract())
                url = ''.join(news.xpath('dl/dt/a/@href').extract())
                summary = ''.join(news.xpath('dl/dd/text()').extract())
                item = NewsItem()
                item['time'] = time
                item['title'] = title
                item['url'] = url
                item['summary'] = summary
                item['cata'] = response.meta['name']
                yield scrapy.Request(url, meta={'item': item},
                                     callback=self.parse_content)
        except TypeError:
            pass
    except Exception:
        pass
def parse(self, response):
    news = response.xpath('//div[@class="news-box clearfix"]')
    for sel in news:
        title = sel.xpath('./div/h4/a/text()').extract_first()
        abstract = sel.xpath('./div/div[1]/text()').extract_first()
        article = sel.xpath('./div/div[2]/span[3]/a/text()').extract_first()
        if article:
            article = article.strip()
        cover_img_src = sel.xpath('./a/img/@src').extract_first()
        news_url = sel.xpath('./div/h4/a/@href').extract_first()
        news_date = sel.xpath('./div/div[2]/span[2]/text()').extract_first()
        view_count = sel.xpath('./div/div[2]/span[4]/text()').extract_first()

        item = NewsItem()
        item['title'] = title
        item['abstract'] = abstract
        item['article'] = article
        item['cover_img_src'] = cover_img_src
        item['news_url'] = news_url
        item['news_date'] = news_date
        item['view_count'] = view_count
        yield item

    # Page through the listing ten entries at a time.
    if self.offset < 3000:
        self.offset += 10
        yield scrapy.Request(url=self.base_url + str(self.offset),
                             callback=self.parse)
def parse(self, response):
    news = Selector(response).xpath('//tr[@class="athing"]/td/a')
    for news_item in news:
        item = NewsItem()
        item['title'] = news_item.xpath('text()').extract()[0]
        item['url'] = news_item.xpath('@href').extract()[0]
        yield item
def parse(self, response):
    item = NewsItem()
    sel = Selector(response)
    item['name'] = sel.xpath('//a/text()').extract()
    item['image_urls'] = sel.xpath(
        '//a/img/@src[contains(.,".jpg")]').extract()
    yield item
def parse_detail(self, response):
    print("crawling article detail...")
    item = NewsItem()
    news = response.meta['news']
    item['title'] = news['title']
    item['stitle'] = news['stitle']
    item['ctime'] = news['ctime']
    item['url'] = news['url']
    item['wap_url'] = news['wapurl']
    item['summary'] = news['summary']
    item['wap_summary'] = news['wapsummary']
    item['intro'] = news['intro']
    item['keywords'] = news['keywords']

    def clean(paragraphs):
        # Normalise full-width spaces and line endings, escape quotes.
        return ''.join(paragraphs).replace(u'\u3000\u3000', u'\n').\
            replace(u'\xa0\xa0', u'\n').replace(u'\r\n', u'\n').\
            replace(u'\n\r', u'\n').replace(u'\'', u'\\\'').strip()

    item['content'] = clean(
        response.xpath('//*[@id="artibody"]/p/text()').extract())
    if item['content'] == '':
        item['content'] = clean(
            response.xpath('//*[@id="article"]/p/text()').extract())
    item['category'] = self.category[self.lid]
    yield item
def parse(self, response):
    for sel in response.xpath("//div[@class='article-item-warp']"):
        item = NewsItem()
        item['title'] = sel.xpath(
            ".//div[@class='article-item__body']/h3/a/text()").get()
        if item['title'] is not None:
            item['title'] = item['title'].strip()
        item['source_url'] = response.urljoin(sel.xpath(
            ".//div[@class='article-item__body']/h3/a/@href").get())
        if item['source_url'] is not None:
            item['source_url'] = item['source_url'].strip()
        item['excerpt'] = sel.xpath(
            ".//div[@class='article-item__content']/text()").get()
        if item['excerpt'] is not None:
            item['excerpt'] = item['excerpt'].strip()
        # NOTE: this reuses the article href as the cover URL; a cover image
        # would normally come from an <img>'s @src in the same card.
        item['cover_url'] = response.urljoin(sel.xpath(
            ".//div[@class='article-item__body']/h3/a/@href").get())
        if item['cover_url'] is not None:
            item['cover_url'] = item['cover_url'].strip()
        item['author'] = sel.xpath(
            ".//div[@class='article-item__author']/a[2]/text()").get()
        if item['author'] is not None:
            item['author'] = item['author'].strip()

        # Skip posts that have already been crawled.
        crawlInfo = self.mysqlObj.get_post_info(title=item['title'],
                                                author=item['author'])
        if crawlInfo is not None:
            logging.info("author: %s title: %s is exists" %
                         (item['author'], item['title']))
            continue
        yield scrapy.Request(url=item['source_url'],
                             meta={"item": item, "useSel": False},
                             callback=self.parse_content)
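# The dedup check above goes through self.mysqlObj.get_post_info. A sketch of
# the query it presumably issues (the table and column names here are
# assumptions, not confirmed by the source):
def get_post_info(self, title, author):
    # Return the stored row for this post, or None if it was never crawled.
    self.cursor.execute(
        "SELECT id FROM posts WHERE title = %s AND author = %s LIMIT 1",
        (title, author))
    return self.cursor.fetchone()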
def parse(self, response):
    headlines = Selector(response).xpath(
        '//h3[@class="lx-stream-post__header-title gel-great-primer-bold qa-post-title gs-u-mt0 gs-u-mb-"]')
    for headline in headlines:
        # A fresh item per headline, instead of mutating one shared
        # instance across yields.
        item = NewsItem()
        item['title'] = headline.xpath('a/span/text()').extract()[0]
        item['url'] = 'https://www.bbc.com' + headline.xpath('a/@href').extract()[0]
        yield item
def parse(self, response):
    # news list
    # for sel in response.xpath("//div[@class='left_content left']/a/@href"):
    #     if sel:
    #         url = 'https://www.yidaiyilu.gov.cn%s' % sel.extract()
    #         yield scrapy.Request(url, callback=self.parse)

    # scroll news
    for sel in response.xpath("//div[@class='bd']/ul/li/a/@href"):
        if sel:
            url = 'https://www.yidaiyilu.gov.cn%s' % sel.extract()
            yield scrapy.Request(url, callback=self.parse)

    item = NewsItem()
    item['time'] = response.xpath(
        "//div[@class='szty']/span[1]/text()").extract()
    # item['time'] = datetime.datetime.strftime(time, "%Y-%m-%d %H:%M:%S")
    item['url'] = response.url
    # Drop the fixed 10-character site suffix from the <title>.
    item['title'] = response.xpath('//title/text()').extract()[0][:-10]
    item['content'] = response.xpath(
        "//p[@style='text-indent:2em;']/text()").extract()
    item['img_url'] = response.xpath(
        "//div[@class='info_content']/p/img/@src").extract()
    item['source'] = response.xpath(
        "//div[@class='szty']/span[2]/text()").extract()
    item['type'] = response.xpath(
        "//ul[@class='local_ul']/li[5]/a/text()").extract()
    item['desc'] = response.xpath(
        "//meta[@name='description']/@content").extract()
    yield item
def parseContent(self, response):
    item = NewsItem()
    item['leadingTitle'] = ''
    item['source'] = '中国长安网'  # China Chang'an Net
    item['edition'] = response.meta['channel']
    item['date'] = response.meta['date']
    item['url'] = response.url
    item['site'] = 'a76304d3-3ee0-47f2-8077-a2d08e2b7333'

    title = response.xpath(
        '//div[@class="content-l fl"]/h1/text()').extract_first()
    item['mainTitle'] = '' if title is None else title. \
        replace('\r', '').replace('\n', '').replace('\xa0', ' ').replace('\u3000', ' ').strip()

    source = response.xpath('//div[@class="source"]/text()').extract_first()\
        .replace('\n', '').replace('\xa0', ' ').replace('\u3000', ' ').strip()
    # Everything after the "责任编辑" ("editor in charge") marker is the editor's name.
    index = source.find('责任编辑')
    if index > 0:
        item['anthor'] = source[index + 5:].replace('\n', '').replace(
            '\xa0', ' ').replace('\u3000', ' ').strip()
    # The date could also be recovered from the source line, before the
    # "来源" ("source") marker:
    # index = source.find('来源')
    # if index > 0:
    #     date = source[:index].replace('\n', '').replace('\xa0', ' ').replace('\u3000', ' ').strip()
    #     try:
    #         item['date'] = datetime.datetime.strptime(date, '%Y-%m-%d %H:%M')
    #     except ValueError:
    #         pass

    content = response.xpath('//div[@class="content-main"]').xpath(
        'string()').extract_first()
    item['content'] = '' if content is None else content.\
        replace('\r', '').replace('\n', '').replace('\xa0', ' ').replace('\u3000', ' ').strip()
    yield item
def parse(self, response):
    if 'news.ycombinator.com' in response.url:
        soup = bs(response.body)
        items = [(x[0].text, x[0].get('href')) for x in filter(None, [
            x.findChildren() for x in soup.findAll('td', {'class': 'title'})
        ])]
        for item in items:
            print(item)
            news_item = NewsItem()
            news_item['title'] = item[0]
            news_item['url'] = item[1]
            try:
                yield Request(item[1], callback=self.parse)
            except ValueError:
                # Relative links need the site prefix.
                yield Request('http://news.ycombinator.com/' + item[1],
                              callback=self.parse)
            yield news_item
    else:
        # Mirror the linked page into a folder named after its URL hash.
        sha1_response = hashlib.sha1(response.url.encode('utf-8')).hexdigest()
        folder = PATH + '/' + sha1_response
        if not os.path.exists(folder):
            os.makedirs(folder)
        with open(folder + '/index.html', 'wb') as file_obj:
            file_obj.write(response.body)
def parse_data_list(self, response):
    if 'public-api' in response.url:
        # News-type API.
        data_list = json.loads(response.text)
    elif 'integration-api' in response.url:
        data_list = json.loads(response.text)['data']
    else:
        return
    for data in data_list:
        if ('integration-api' in response.url) and (data['resourceType'] == 3):
            continue
        if data['type'] == 3:
            # Photo gallery; skip.
            continue
        items = NewsItem()
        try:
            items['title'] = data['title']
            # items['source_url'] = data['originalSource']
            # public-api exposes a source url, integration-api does not;
            # collect it from the page instead (it is in the page source,
            # but not visible after JS rendering).
            items['url'] = 'http://www.sohu.com/a/' + str(
                data['id']) + '_' + str(data['authorId'])
            items['date'] = parse_time(str(data['publicTime'])[0:10])
            items['source'] = data['authorName']
        except KeyError:
            # One known cause: the entry is a collection rather than an
            # article, so it has no authorId/authorName.
            print(data_list.index(data))
            print(response.url)
            print(data)
            return
        items['attribute'] = get_attribute(response.url)
        yield scrapy.Request(items['url'], self.parse_article1,
                             meta={'items': items})
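# parse_data_list above uses parse_time and get_attribute, neither of which
# is part of this excerpt. Sketches under assumptions: publicTime looks like
# epoch milliseconds (the [0:10] slice keeps the seconds part), and the
# attribute distinguishes the two Sohu APIs:
import time

def parse_time(epoch_seconds):
    # Render an epoch-seconds string as "YYYY-MM-DD HH:MM:SS".
    return time.strftime("%Y-%m-%d %H:%M:%S",
                         time.localtime(int(epoch_seconds)))

def get_attribute(url):
    # Label the item by which API the listing came from.
    return 'news' if 'public-api' in url else 'integration'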
def parse(self, response):
    headlines = Selector(response).xpath(
        '//div[@class="col-12 col-sm-8 col-md-8 col-lg-8 col-xl-8 pd-left-ultimas-noticias"]')
    for headline in headlines:
        # A fresh item per headline, instead of mutating one shared
        # instance across yields.
        item = NewsItem()
        item['title'] = headline.xpath('a/h2/text()').extract()[0]
        item['url'] = headline.xpath('a/@href').extract()[0]
        yield item