Example No. 1
 def parse(self, response):
     '''The response body is JavaScript code'''
     # Execute the JS with the js2py module to get at the data.
     # Inspecting the site shows the payload is GBK-encoded, so decode it first.
     ret = js2py.eval_js(response.body.decode("gbk"))
     for news in ret.list:
         yield Request(news["url"],
                       headers=self.headers,
                       parse='parse_detail',
                       meta={"type": news["channel"]["title"]})
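For context, a minimal, self-contained sketch of what js2py.eval_js does with such a response: it evaluates the JS and returns a wrapper object whose fields can be read from Python. The payload below is invented for illustration; only the eval_js call and the access pattern mirror the spider above.

 import js2py

 # Hypothetical decoded payload shaped like the real response body.
 js_text = 'var jsonData = {list: [{url: "http://example.com/a", channel: {title: "Tech"}}]}; jsonData'
 ret = js2py.eval_js(js_text)   # returns a wrapper around the JS object
 for news in ret.list:          # same attribute/key access as in parse() above
     print(news["url"], news["channel"]["title"])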
Example No. 2
 def parse(self, response):
     data = response.xpath("//*[@class='job-list']/ul//li")
     print(len(data))
     # Columns on the list page: job category / headcount / location / posted date
     for i in data:
         # full path: "//*[@class='job-list']/ul//li/div/div[2]/div/h3/a/text()"
         title = i.xpath("./div/div[2]/div/h3/a/text()")[0]
         # print(title)
         # The job description lives on the detail page: take the relative href,
         # join it with the domain, and send a request handled by parse_detali.
         # meta carries data between the different parse callbacks.
         urls = 'https://www.zhipin.com/' + i.xpath(
             "./div/div[2]/div/h3/a/@href")[0]
         # print(urls)
         yield Request(url=urls, parse='parse_detali', meta=title)
     try:
         next_ = response.xpath('//div[@class="page"]/a[5]/@href')[0]
         if next_ != "javascript:;":  # "javascript:;" means there is no next page
             url_next = 'https://www.zhipin.com/' + next_
             yield Request(url=url_next, parse='parse')
     except Exception:
         traceback.print_exc()  # requires `import traceback` at module level
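A hypothetical companion callback for the request above, showing how the title travels via meta. It assumes, as in Scrapy, that the framework exposes the originating request's meta on the response object; the detail-page selector is also an assumption.

 def parse_detali(self, response):
     # Hypothetical: read the job title passed from parse() through meta.
     title = response.meta
     # Assumed selector for the job description on the detail page.
     duty = response.xpath("//*[@class='job-sec']//div[@class='text']/text()")
     yield {"title": title, "duty": duty}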
Example No. 3
 def start_requests(self):
     '''Build and yield the initial request objects'''
     for url in self.start_urls:
         # filter=False: let the request through even if the dedup filter has seen it
         yield Request(url, headers=HEADERS, filter=False)
Example No. 4
 def parse(self, response):
     # Re-request the same start URL until it has been fetched 10 times;
     # filter=False keeps the duplicate filter from dropping the repeated request.
     self.total += 1
     if self.total > 10:
         return
     yield Request(self.start_urls[0], filter=False, parse='parse')
Example No. 5
 def start_requests(self):
     # Override start_requests to yield multiple requests.
     base_url = 'http://movie.douban.com/top250?start='
     for i in range(0, 250, 25):    # yield the request objects for pages 1-10, one per offset
         url = base_url + str(i)
         yield Request(url)
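As a quick check of the arithmetic, the snippet below (same base_url and range as above) lists the ten URLs that start_requests yields, one per Top-250 page of 25 movies:

 base_url = 'http://movie.douban.com/top250?start='
 urls = [base_url + str(i) for i in range(0, 250, 25)]
 print(len(urls))   # 10
 print(urls[0])     # http://movie.douban.com/top250?start=0
 print(urls[-1])    # http://movie.douban.com/top250?start=225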
Example No. 6
 def start_requests(self):
     while True:
         # This request returns the list-page data; the response body is a JS statement.
         url = "http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=89&spec=&type=&ch=&k=&offset_page=0&offset_num=0&num=120&asc=&page=1&r=0.5559616678192825"
         yield Request(url, parse='parse', filter=False)
         time.sleep(60)  # issue one request every 60 seconds (requires `import time`)
Example No. 7
 def start_requests(self):
     while True:
         for url in self.start_urls:
             yield Request(url, parse='parse',
                           filter=False)  # note: parse takes the callback name as a string
             time.sleep(6)  # issue requests on a timer; the generator never stops!
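Pulling the examples together, this is a minimal sketch of the Request constructor as inferred purely from the call sites above; it is not the framework's official signature, and the real class almost certainly does more:

 class Request:
     # Inferred parameters: url to fetch, optional headers, the spider callback
     # name as a string, arbitrary meta data forwarded to that callback, and a
     # filter flag (False = bypass the duplicate-request filter).
     def __init__(self, url, headers=None, parse='parse', meta=None, filter=True):
         self.url = url
         self.headers = headers
         self.parse = parse
         self.meta = meta
         self.filter = filter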