def parse(self, response):
    '''Fetch every platform's pinyin abbreviation from the JSON and join it into a profile URL'''
    # print "[url: %s || status: %s]" % (response.url, response.status)
    prefix = 'http://www.wdzj.com/dangan/'
    '''
    filtered = []
    for p in eval(response.body):
        if p['platName'] not in self.all_names:
            filtered.append(p)
    self.allPlatDocEntryUrls = {}
    for t in filtered:'''
    self.allPlatDocEntryUrls = {}  # profile URL -> [platform name, platform pinyin]
    for t in eval(response.body):
        self.allPlatDocEntryUrls[prefix + t['platPin'] + '/'] = [t['platName'], t['platPin']]
    # print len(self.allentryUrls)
    for i in self.allPlatDocEntryUrls.keys():
        r = Request.Request(i,
                            headers=self.common_header,
                            callback=self.parse2,
                            meta={
                                "m_platname": self.allPlatDocEntryUrls[i][0],
                                "m_platpin": self.allPlatDocEntryUrls[i][1]
                            })
        yield r

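# A safer variant of the JSON handling above (a sketch, not the original author's code):
# json.loads parses the payload without executing it, whereas eval(response.body) will run
# whatever the server sends. build_entry_urls is a hypothetical helper name; it assumes
# each record carries 'platPin' and 'platName' keys, as in the snippet above.
import json

def build_entry_urls(body, prefix='http://www.wdzj.com/dangan/'):
    # Map each platform's profile URL to its [name, pinyin] pair.
    return {prefix + t['platPin'] + '/': [t['platName'], t['platPin']]
            for t in json.loads(body)}
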
def parse(self, response):
    url_nodes = response.css('.feedBox li')
    for url in url_nodes:
        post_url = url.css(".rbox-inner .title-box a::attr(href)").extract_first()
        title = url.css(".rbox-inner .title-box a::text").extract_first()
        front_image_url = url.css(".lbox .img-wrap img::attr(src)").extract_first()
        come_from_url = url.css(".footer .y-left .media-avatar::attr(href)").extract_first()
        come_from_name = url.css(".footer .y-left .source::text").extract_first()
        come_from_image_url = url.css(".footer .y-left .media-avatar img::attr(src)").extract_first()
        comment = url.css(".footer .y-left .comment::text").extract_first()
        print(post_url)
        yield request.Request(url=parse.urljoin(response.url, post_url),
                              callback=self.parse_detail,
                              meta={
                                  'front_image_url': front_image_url,
                                  "title": title,
                                  "come_from_url": come_from_url,
                                  'come_from_name': come_from_name,
                                  'come_from_image_url': come_from_image_url,
                                  'comment': comment
                              },
                              headers=self.headers,
                              dont_filter=True)

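# A hypothetical parse_detail skeleton (the real callback is not part of this snippet),
# showing how the meta dict attached above arrives on the detail response:
def parse_detail(self, response):
    title = response.meta.get('title')                      # carried over from the list page
    front_image_url = response.meta.get('front_image_url')  # cover image scraped earlier
    # ... extract the article body from the detail page and yield an item ...
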
def parse(self, response):
    '''make requests for every single page of news'''
    pagecount = int(
        response.xpath(
            "//div[@class='fy_big']/a[last()-1]/text()").extract()[0])
    common_suffix = response.xpath(
        "//div[@class='fy_big']/a[last()-1]/@href").extract()[0]
    common_suffixes = common_suffix.split('-')
    if self.is_empty_table:
        pagelimit = pagecount // 4
    else:
        pagelimit = self.top_page_count  # 10; for 金融之家 this should be larger, as it updates frequently
    # we assume that newly added news will not surpass the first few pages
    for i in range(1, min(pagelimit, pagecount) + 1):
        r = Request.Request(response.url + common_suffixes[0] + "-" +
                            common_suffixes[1] + "-" + str(i) + "-" +
                            common_suffixes[3],
                            callback=self.parse2)
        # r.meta['grandfather'] = response.meta['grandfather']
        # r.meta['father'] = essay
        yield r

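# Worked example (hypothetical href; the real pager format is site-specific): if the
# second-to-last pager link is 'list-17-86-1.html', common_suffixes is
# ['list', '17', '86', '1.html'], and page i is rebuilt as 'list-17-' + str(i) + '-1.html',
# i.e. the third '-'-separated field is the page number.
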
def parse(self, response):
    '''get first level entries'''
    sub_entries = response.xpath("//div[@class='box_body']/table").xpath(
        ".//a/@href").extract()
    for i in sub_entries:
        r = Request.Request(i, callback=self.parse2)
        yield r

def parse2(self, response):
    '''make requests for every single page of news'''
    pagecount = int(
        response.xpath(
            "//div[@class='pages']/label/span/text()").extract()[0])
    if self.is_empty_table:
        pagelimit = min(5, pagecount)
    else:
        pagelimit = self.top_page_count
    for i in range(1, min(pagelimit, pagecount) + 1):
        r = Request.Request(response.url + str(i) + ".html", self.parse3)
        yield r

def parse(self, response):
    '''Fetch every platform's pinyin abbreviation from the JSON and join it into a profile URL'''
    prefix = 'http://www.wdzj.com/dangan/'
    '''
    filtered = []
    for p in eval(response.body):
        if p['platName'] not in self.all_names:
            filtered.append(p)
    self.allPlatDocEntryUrls = {}
    for t in filtered:'''
    self.allPlatDocEntryUrls = {}  # profile URL -> full platform record
    for t in eval(response.body):
        self.allPlatDocEntryUrls[prefix + t['platPin'] + '/'] = t
    # print len(self.allentryUrls)
    for i in self.allPlatDocEntryUrls.keys():
        r = Request.Request(i,
                            headers=self.common_header,
                            callback=self.parse2,
                            meta=self.allPlatDocEntryUrls[i])
        yield r

def parse(self, response):
    '''Extract the total page count of each category and dispatch a request for every page'''
    # print "[url: %s || status: %s]" % (response.url, response.status)
    pagecount = int(
        response.xpath(
            "//div[@class='page-list-others scroll-style']/a[last()]/@href"
        ).extract()[0].split("/")[-2])
    if self.is_empty_table:
        pagelimit = pagecount // 4
    else:
        pagelimit = self.top_page_count
    for i in range(1, min(pagelimit, pagecount) + 1):  # this site does not update quickly either
        r = Request.Request(response.url + r"/page/" + str(i),
                            headers=self.common_header,
                            callback=self.parse2)
        yield r

def process_start_requests(self, start_requests, spider):
    # Called with the start requests of the spider, and works
    # similarly to the process_spider_output() method, except
    # that it doesn’t have a response associated.
    # Must return only requests (not items).
    self.f = open(r"./JinRiTouTiao/sig.js", 'r', encoding='UTF-8')
    htmlstr = self.f.read()
    self.f.close()
    ctx = execjs.compile(htmlstr)
    Honey = json.loads(ctx.call('get_as_cp_signature'))
    eas = Honey['as']
    ecp = Honey['cp']
    signature = Honey['_signature']
    print('start_request finished')
    yield request.Request(
        url='https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao'
            '&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true'
            '&as={}&cp={}&_signature={}'.format(eas, ecp, signature))

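# Sketch of the contract assumed above (hypothetical; sig.js itself is not shown): the JS
# file must define get_as_cp_signature() returning a JSON string, which PyExecJS exposes
# through compile()/call():
import json
import execjs

ctx = execjs.compile(
    'function get_as_cp_signature() {'
    '  return JSON.stringify({as: "A1", cp: "C1", _signature: "S1"});'
    '}')
honey = json.loads(ctx.call('get_as_cp_signature'))
print(honey['as'], honey['cp'], honey['_signature'])  # A1 C1 S1
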
def parse(self, response):
    # Parse the list page
    url_nodes = response.css(".col_w660 .main #leftContent .ecoA9805_con02 ")
    for url in url_nodes:
        post_url = url.css("h3 .l a::attr(href)").extract_first()
        title = url.css("h3 .l a::text").extract_first()
        image_url = url.css(".text_box .l a img::attr(src)").extract_first()
        summary = url.css(".text_box p::text").extract_first()
        label = url.css(".text_box h4 a::text").extract()
        release_time = url.css(".text_box h5 i::text").extract_first()
        print(post_url)
        yield request.Request(url=parse.urljoin(response.url, post_url),
                              callback=self.parse_detail,
                              meta={
                                  'front_image_url': image_url,
                                  "title": title,
                                  "summary": summary,
                                  'label': label,
                                  'release_time': release_time
                              },
                              headers=self.headers,
                              dont_filter=True)

def start_requests(self):
    for u in self.start_urls:
        yield Request.Request(u, headers=self.common_header, callback=self.parse)

def start_requests(self):
    for u in self.start_urls:
        yield Request.Request(u, callback=self.parse)

def start_requests(self):
    for u in self.start_urls:
        yield Request.Request(u,
                              headers=self.common_header,
                              callback=self.parse,
                              meta={"m_cat": u.split("/")[-1]})