Example 1
    def parse(self, response):
        '''Extract each platform's pinyin abbreviation from the JSON and build its entry URL'''
        # print("[url: %s || status: %s]" % (response.url, response.status))

        prefix = 'http://www.wdzj.com/dangan/'
        '''
        filtered=[]
        for p in eval(response.body):
            if p['platName'] not in self.all_names:
                filtered.append(p)

        self.allPlatDocEntryUrls= {}
        for t in filtered:'''
        self.allPlatDocEntryUrls = {}  # initialize here; the original init sat inside the commented-out block
        for t in json.loads(response.text):  # assumes `import json`; safer than eval on the raw body
            self.allPlatDocEntryUrls[prefix + t['platPin'] +
                                     '/'] = [t['platName'], t['platPin']]

        # print(len(self.allPlatDocEntryUrls))
        for i in self.allPlatDocEntryUrls.keys():
            r = Request.Request(i,
                                headers=self.common_header,
                                callback=self.parse2,
                                meta={
                                    "m_platname":
                                    self.allPlatDocEntryUrls[i][0],
                                    "m_platpin": self.allPlatDocEntryUrls[i][1]
                                })
            yield r
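For context, a minimal sketch of the scaffolding this snippet appears to assume. The import alias behind Request.Request, the class name, and the attribute values below are inferred, not shown in the source:

import json
import scrapy
from scrapy.http import request as Request  # makes the Request.Request(...) calls valid


class WdzjDanganSpider(scrapy.Spider):
    name = 'wdzj_dangan'                           # assumed name
    start_urls = ['http://www.wdzj.com/dangan/']   # placeholder start URL
    common_header = {'User-Agent': 'Mozilla/5.0'}  # placeholder header
    allPlatDocEntryUrls = {}

    def parse2(self, response):
        # The real parse2 is not shown; it would scrape each platform page,
        # reading the names attached above back from response.meta.
        plat_name = response.meta['m_platname']
        plat_pin = response.meta['m_platpin']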
Example 2
 def parse(self, response):
     url_nodes = response.css('.feedBox li')
     for url in url_nodes:
         post_url = url.css(
             ".rbox-inner .title-box a::attr(href)").extract_first()
         title = url.css(".rbox-inner .title-box a::text").extract_first()
         front_image_url = url.css(
             ".lbox .img-wrap img::attr(src)").extract_first()
         come_from_url = url.css(
             ".footer .y-left .media-avatar::attr(href)").extract_first()
         come_from_name = url.css(
             ".footer .y-left .source::text").extract_first()
         come_from_image_url = url.css(
             ".footer .y-left .media-avatar img::attr(src)").extract_first()
         comment = url.css(".footer .y-left .comment::text").extract_first()
         print(post_url)
         yield request.Request(url=parse.urljoin(response.url, post_url),
                               callback=self.parse_detail,
                               meta={
                                   'front_image_url': front_image_url,
                                   "title": title,
                                   "come_from_url": come_from_url,
                                   'come_from_name': come_from_name,
                                   'come_from_image_url':
                                   come_from_image_url,
                                   'comment': comment
                               },
                               headers=self.headers,
                               dont_filter=True)
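The imports this snippet relies on are not shown; a plausible reconstruction (both modules are standard, but their use here is an assumption), plus a hypothetical parse_detail that reads the metadata back:

from urllib import parse         # provides parse.urljoin(...)
from scrapy.http import request  # provides request.Request(...)

# inside the same spider class:
def parse_detail(self, response):
    '''Hypothetical callback sketch: collect the fields attached via meta.'''
    item = {k: response.meta.get(k)
            for k in ('title', 'front_image_url', 'come_from_url',
                      'come_from_name', 'come_from_image_url', 'comment')}
    item['url'] = response.url
    yield item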
Example 3
    def parse(self, response):
        '''make requests for every single page of news'''
        pagecount = int(
            response.xpath(
                "//div[@class='fy_big']/a[last()-1]/text()").extract()[0])

        common_suffix = response.xpath(
            "//div[@class='fy_big']/a[last()-1]/@href").extract()[0]
        common_suffixes = common_suffix.split('-')

        if self.is_empty_table:
            pagelimit = pagecount // 4  # integer division so range() receives an int
        else:
            pagelimit = self.top_page_count  # 10; for 金融之家 this should be a bit larger, it updates frequently

        for i in range(
                1,
                min(pagelimit, pagecount) + 1
        ):  # min(5, int(pagecount)+1): we assume newly added news will not surpass 5 pages
            r = Request.Request(response.url + common_suffixes[0] + "-" +
                                common_suffixes[1] + "-" + str(i) + "-" +
                                common_suffixes[3],
                                callback=self.parse2)
            #r.meta['grandfather']=response.meta['grandfather']
            #r.meta['father']=essay
            yield r
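To make the string surgery above concrete, a worked example with a hypothetical suffix (the real URL shape is not shown in the source):

common_suffix = 'list-12-5-1.html'      # assumed shape: <list>-<category>-<page>-<tail>
parts = common_suffix.split('-')        # ['list', '12', '5', '1.html']
i = 3                                   # target page number
print(parts[0] + "-" + parts[1] + "-" + str(i) + "-" + parts[3])  # -> list-12-3-1.html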
Example 4
    def parse(self, response):
        '''get first level entries'''

        sub_entries = response.xpath("//div[@class='box_body']/table").xpath(
            ".//a/@href").extract()

        for i in sub_entries:
            r = Request.Request(i, callback=self.parse2)
            yield r
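The chained .xpath() call evaluates the second expression relative to each matched table node; a single combined expression yields the same result set:

sub_entries = response.xpath(
    "//div[@class='box_body']/table//a/@href").extract()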
Example 5
    def parse2(self, response):
        '''make requests for every single page of news'''
        pagecount = int(
            response.xpath(
                "//div[@class='pages']/label/span/text()").extract()[0])

        if self.is_empty_table:
            pagelimit = min(5, pagecount)
        else:
            pagelimit = self.top_page_count

        for i in range(1, min(pagelimit, pagecount) + 1):
            r = Request.Request(response.url + str(i) + ".html", self.parse3)
            yield r
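Note that the callback is passed positionally here: callback is the second positional parameter of scrapy's Request, so the call is equivalent to the explicit form:

r = Request.Request(response.url + str(i) + ".html", callback=self.parse3)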
Example 6
    def parse(self, response):
        '''Extract each platform's pinyin abbreviation from the JSON and build its entry URL'''
        prefix = 'http://www.wdzj.com/dangan/'
        '''
        filtered=[]
        for p in eval(response.body):
            if p['platName'] not in self.all_names:
                filtered.append(p)

        self.allPlatDocEntryUrls= {}
        for t in filtered:'''
        self.allPlatDocEntryUrls = {}  # as in Example 1, the original init only lives in the commented-out block
        for t in json.loads(response.text):  # assumes `import json`; safer than eval
            self.allPlatDocEntryUrls[prefix + t['platPin'] + '/'] = t

        # print(len(self.allPlatDocEntryUrls))
        for i in self.allPlatDocEntryUrls.keys():
            r = Request.Request(i,
                                headers=self.common_header,
                                callback=self.parse2,
                                meta=self.allPlatDocEntryUrls[i])
            yield r
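Unlike Example 1, the entire platform record t is attached as meta, so a hypothetical parse2 would read the JSON fields straight off response.meta (which also carries Scrapy's own bookkeeping keys such as depth):

def parse2(self, response):
    # Sketch only; the field names follow the JSON keys used above.
    plat_name = response.meta.get('platName')
    plat_pin = response.meta.get('platPin')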
Example 7
    def parse(self, response):
        '''Extract the total page count for each category and dispatch a request for every page'''
        # print("[url: %s || status: %s]" % (response.url, response.status))

        pagecount = int(
            response.xpath(
                "//div[@class='page-list-others scroll-style']/a[last()]/@href"
            ).extract()[0].split("/")[-2])

        if self.is_empty_table:
            pagelimit = pagecount // 4  # integer division so range() receives an int
        else:
            pagelimit = self.top_page_count

        for i in range(1,
                       min(pagelimit, pagecount) +
                       1):  # min(3, int(pagecount)+1): this site does not update often either
            r = Request.Request(response.url + r"/page/" + str(i),
                                headers=self.common_header,
                                callback=self.parse2)
            yield r
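The pagecount extraction leans on the trailing slash of the last pagination link; with a hypothetical href the steps look like this:

href = '/dangan/page/37/'              # assumed shape of the last <a>'s href
pagecount = int(href.split("/")[-2])   # ['', 'dangan', 'page', '37', ''] -> 37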
Example 8
    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).

        # Read the signing script in one go; `with` guarantees the file handle is closed.
        with open(r"./JinRiTouTiao/sig.js", 'r', encoding='UTF-8') as f:
            htmlstr = f.read()
        ctx = execjs.compile(htmlstr)
        Honey = json.loads(ctx.call('get_as_cp_signature'))
        eas = Honey['as']
        ecp = Honey['cp']
        signature = Honey['_signature']
        print('start_requests finished')
        yield request.Request(
            url='https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao'
                '&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true'
                '&as={}&cp={}&_signature={}'.format(eas, ecp, signature))
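The PyExecJS pattern used above (compile a JS source string, then call a function in it by name) in isolation; the toy function is illustrative, the real sig.js is not shown:

import execjs

ctx = execjs.compile("function add(a, b) { return a + b; }")
print(ctx.call("add", 1, 2))  # -> 3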
Example 9
 def parse(self, response):
     # parse the article list page
     url_nodes = response.css(
         ".col_w660 .main #leftContent .ecoA9805_con02 ")
     for url in url_nodes:
         post_url = url.css("h3 .l a::attr(href)").extract_first()
         title = url.css("h3 .l a::text").extract_first()
         image_url = url.css(
             ".text_box .l a img::attr(src)").extract_first()
         summary = url.css(".text_box p::text").extract_first()
         label = url.css(".text_box h4 a::text").extract()
         release_time = url.css(".text_box h5 i::text").extract_first()
         print(post_url)
         yield request.Request(url=parse.urljoin(response.url, post_url),
                               callback=self.parse_detail,
                               meta={
                                   'front_image_url': image_url,
                                   "title": title,
                                   "summary": summary,
                                   'label': label,
                                   'release_time': release_time
                               },
                               headers=self.headers,
                               dont_filter=True)
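One shape difference worth noting: label uses .extract() and yields a list of every match, while the other fields use .extract_first(), which returns a single string or None. The example values below are hypothetical:

label = url.css(".text_box h4 a::text").extract()               # e.g. ['tag1', 'tag2']
release_time = url.css(".text_box h5 i::text").extract_first()  # e.g. '2018-01-01', or None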
Example 10
 def start_requests(self):
     for u in self.start_urls:
         yield Request.Request(u, headers=self.common_header, callback=self.parse)
Example 11
 def start_requests(self):
     for u in self.start_urls:
         yield Request.Request(u, callback=self.parse)
Example 12
 def start_requests(self):
     for u in self.start_urls:
         yield Request.Request(u,
                               headers=self.common_header,
                               callback=self.parse,
                               meta={"m_cat": u.split("/")[-1]})