コード例 #1
0
    def parse_list(self, response):
        """Parse a position-list JSON response and yield follow-up requests.

        Yields, in order:
          1. optionally one request for another list page, when
             ``self.judgeDays(response)[0]`` is truthy;
          2. one detail-page request per ``positionId`` found in the JSON.

        When ``judgeDays`` reports a falsy first element, its second element
        is used as the upper bound on how many positions are followed.

        Args:
            response: response object exposing ``content`` (JSON text),
                ``url`` and ``meta``.
        """
        lg_data = {
            "first": "false",
            "pn": "1",
            "kd": "python",
        }
        # Decode the JSON payload into Python objects.
        res = json.loads(response.content)
        # Collect every positionId in the document. jsonpath.jsonpath returns
        # False (not an empty list) when nothing matches, so normalise that to
        # an empty list before slicing below.
        position_ids = jsonpath.jsonpath(res, "$..positionId") or []
        # Detail-page URL template.
        origin_url = "https://www.lagou.com/jobs/{positionId}.html"
        # Default number of positions to follow from this page
        # (presumably the list page size -- TODO confirm).
        index = 15
        result = self.judgeDays(response)
        if result[0]:
            lg_data['pn'] = str(response.meta['next'])
            # NOTE(review): this request sets neither `method` nor a new
            # meta['next']; confirm that Request / the framework supplies
            # them, otherwise parsing the next page would miss meta['next'].
            yield Request(response.url,
                          data=lg_data,
                          headers=self.headers,
                          parse="parse_list")
        else:
            index = result[1]

        for position_id in position_ids[:index]:
            url = origin_url.format(positionId=position_id)
            yield Request(url, headers=self.headers, parse="parse_detail")
コード例 #2
0
    def parse_list(self, response):
        """Yield detail-page requests, then a request for the next list page.

        Every href taken from the result table that ends in ``.htm`` becomes a
        ``parse_detail`` request. If the pager XPath matches, its first href
        becomes a ``parse_list`` request.
        """
        tree = etree.HTML(response.content)
        hrefs = tree.xpath('//div[@class = "newlist_list_content"]/table/tr/td/div/a/@href')
        # Only links ending in '.htm' are followed as detail pages.
        for href in (candidate for candidate in hrefs if candidate.endswith('.htm')):
            yield Request(href, headers=self.headers, parse="parse_detail")

        # Absolute path to the pager link; an empty result means no next page
        # was found.
        pager = tree.xpath("/html/body/div[3]/div[3]/div[3]/form/div[1]/div[1]/div[3]/ul/li[12]/a/@href")
        if not pager:
            return
        yield Request(pager[0], headers=self.headers, parse="parse_list")
コード例 #3
0
    def parse_list(self, response):
        """Yield a request per detail link, then one for the next list page.

        Detail hrefs are read from the ``resultList`` element and dispatched
        to ``parse_detail``. The first href matched by the pager XPath, if
        any, is dispatched back to ``parse_list``.
        """
        document = etree.HTML(response.content)

        # Detail-page links.
        for detail_url in document.xpath('//*[@id="resultList"]/div/p/span/a/@href'):
            yield Request(detail_url, headers=self.headers, parse="parse_detail")

        # Next-page link: nothing is yielded when the XPath matches nothing.
        following = document.xpath("//li[@class='bk'][2]/a/@href")
        if following:
            next_url = following[0]
            yield Request(next_url, headers=self.headers, parse="parse_list")
コード例 #4
0
    def start_requests(self):
        """Yield one initial list-page POST request per (city, keyword) pair.

        Iterates over ``CITY`` and ``KEYWORDS``. For each pair the city is
        mapped through ``self.city_code`` and the keyword is UTF-8 encoded
        and URL-quoted.

        Each request receives its own freshly built ``params`` and ``data``
        dicts. Sharing one dict across iterations would let a later iteration
        overwrite the city/keyword of requests that were already yielded but
        not yet sent.
        """
        for city in CITY:
            for keyword in KEYWORDS:
                # Passed to Request as `params`.
                lg_params = {
                    "px": "new",
                    "city": self.city_code[city],
                    "needAddtionalResult": "false",
                }
                # Passed to Request as `data`; "pn" is the page number.
                lg_data = {
                    "first": "false",
                    "pn": "1",
                    "kd": urllib.quote(keyword.encode("utf-8")),
                }

                yield Request(self.baseurl,
                              # Page number the list parser should ask for next.
                              meta={"next": int(lg_data["pn"]) + 1},
                              method="POST",
                              data=lg_data,
                              params=lg_params,
                              headers=self.headers,
                              parse="parse_list")
コード例 #5
0
    def start_requests(self):
        """Yield one initial list-page request per (city, keyword) pair.

        For every combination of ``CITY`` and ``KEYWORDS`` the shared
        ``self.params`` mapping is updated with the publish-time code for
        ``'7d'``, the city code and the URL-quoted keyword. A copy of that
        mapping is then attached to the request.

        ``self.params`` is still updated in place, as before, so any other
        reader of it sees the same values. Only the object handed to
        ``Request`` is a per-request snapshot, so later iterations cannot
        alter requests that were already yielded.
        """
        # Shape of the query parameters (illustrative values):
        # zl_params = {
        #     "pd": "7",  # publish time
        #     "jl": "北京",  # region
        #     "kw": "python",  # search keyword
        #     "sm": "0",
        #     "p": "1",
        #     "sf": "0",
        #     "st": "99999",
        #     "isadv": "1"
        # }

        for city in CITY:
            for keyword in KEYWORDS:
                self.params['pd'] = self.update_time_code['7d']
                self.params['jl'] = self.city_code[city]
                self.params['kw'] = urllib.quote(keyword.encode("utf-8"))

                # Snapshot the params so this request is not affected when
                # the next iteration mutates self.params.
                yield Request(self.baseURL,
                              params=self.params.copy(),
                              headers=self.headers,
                              parse="parse_list")
コード例 #6
0
 def start_requests(self):
     """Yield one 51job search-list request per (city, keyword) pair.

     The search URL is built from a fixed template. The location code comes
     from ``self.city_code``, the publish-date code from
     ``self.update_time_code['7d']``, the keyword is UTF-8 encoded and
     URL-quoted, and the work-year filter is the constant ``"99"``.
     """
     # URL template; the placeholders are filled in per request.
     url_template = u'http://search.51job.com/list/{location},000000,0000,00,{pub_date},99,{keyword},2,1.html?lang=c&stype=1&postchannel=0000&workyear={workyear}&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=5&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
     work_year = "99"
     for city in CITY:
         for keyword in KEYWORDS:
             pub_date = self.update_time_code['7d']
             location = self.city_code[city]
             quoted_keyword = urllib.quote(keyword.encode("utf-8"))
             # Assemble the concrete search URL for this pair.
             search_url = url_template.format(location=location,
                                              pub_date=pub_date,
                                              keyword=quoted_keyword,
                                              workyear=work_year)
             yield Request(search_url, headers=self.headers, parse="parse_list")