def parse_list(self, response):
    """Parse a Lagou position-list JSON response.

    While ``judgeDays`` reports the page is still inside the wanted
    posting-date window, re-requests the next page; in either case yields
    detail-page requests for the positions found on this page.
    """
    lg_data = {
        "first": "false",
        "pn": "1",
        "kd": "python",
    }
    # Deserialize the JSON response body into Python objects.
    res = json.loads(response.content)
    # Collect every positionId; these build the detail-page links.
    position_ids = jsonpath.jsonpath(res, "$..positionId")
    # Detail-page URL template.
    origin_url = "https://www.lagou.com/jobs/{positionId}.html"
    # Default: take all 15 positions of a full page.
    index = 15
    result = self.judgeDays(response)
    if result[0]:
        # Still inside the date window: request the next page.
        next_pn = response.meta['next']
        lg_data['pn'] = str(next_pn)
        # BUG FIX: propagate the incremented page counter via meta (this
        # callback reads response.meta['next'], so the follow-up request
        # must set it), and keep the POST method that start_requests uses
        # for this endpoint.
        yield Request(response.url, meta={"next": next_pn + 1},
                      method="POST", data=lg_data,
                      headers=self.headers, parse="parse_list")
    else:
        # Only the first result[1] entries are recent enough to keep.
        index = result[1]
    for position_id in position_ids[:index]:
        url = origin_url.format(positionId=position_id)
        yield Request(url, headers=self.headers, parse="parse_detail")
def parse_list(self, response):
    """Parse a Zhaopin (zhilian) search-result page.

    Yields a detail-page request for every ``.htm`` job link on the page
    and, when a "next page" anchor exists, a request to keep paginating.
    """
    doc = etree.HTML(response.content)
    # Detail links live inside the result-list table cells.
    for href in doc.xpath('//div[@class = "newlist_list_content"]/table/tr/td/div/a/@href'):
        # Skip non-detail anchors (only .htm links point at job pages).
        if href.endswith('.htm'):
            yield Request(href, headers=self.headers, parse="parse_detail")
    # Absolute path to the pagination "next" anchor; empty on the last page.
    pager = doc.xpath("/html/body/div[3]/div[3]/div[3]/form/div[1]/div[1]/div[3]/ul/li[12]/a/@href")
    if pager:
        yield Request(pager[0], headers=self.headers, parse="parse_list")
def parse_list(self, response):
    """Parse a 51job search-result page: yield detail requests, then paginate."""
    doc = etree.HTML(response.content)
    # Every job-title anchor inside #resultList points at a detail page.
    for href in doc.xpath('//*[@id="resultList"]/div/p/span/a/@href'):
        yield Request(href, headers=self.headers, parse="parse_detail")
    # The second .bk list item holds the "next page" link (absent on the
    # last page, in which case the xpath returns an empty list).
    next_hrefs = doc.xpath("//li[@class='bk'][2]/a/@href")
    if next_hrefs:
        yield Request(next_hrefs[0], headers=self.headers, parse="parse_list")
def start_requests(self):
    """Emit the initial Lagou search request for every (city, keyword) pair.

    Yields POST Requests whose ``meta['next']`` carries the page number the
    list callback should fetch next.
    """
    for city in CITY:
        for keyword in KEYWORDS:
            # BUG FIX: build fresh dicts per request. The original mutated
            # one shared lg_params/lg_data dict across iterations, so any
            # yielded Request holding a reference would end up seeing the
            # LAST iteration's city/keyword.
            lg_params = {
                "px": "new",                     # sort by newest postings
                "city": self.city_code[city],
                "needAddtionalResult": "false",
            }
            lg_data = {
                "first": "false",
                "pn": "1",                       # page number, 1-based
                "kd": urllib.quote(keyword.encode("utf-8")),
            }
            yield Request(self.baseurl,
                          meta={"next": int(lg_data["pn"]) + 1},
                          method="POST",
                          data=lg_data,
                          params=lg_params,
                          headers=self.headers,
                          parse="parse_list")
def start_requests(self):
    """Emit the initial Zhaopin (zhilian) search request per city/keyword pair."""
    # The posting-date filter is identical for every request, so set it once
    # outside the loops: postings from the last 7 days.
    self.params['pd'] = self.update_time_code['7d']
    # NOTE(review): self.params is one shared dict mutated across all yielded
    # requests; if Request keeps a reference rather than copying, every
    # request ends up with the last city/keyword — verify Request's behavior.
    for city in CITY:
        for keyword in KEYWORDS:
            self.params['jl'] = self.city_code[city]                      # region
            self.params['kw'] = urllib.quote(keyword.encode("utf-8"))     # search term
            yield Request(self.baseURL, params=self.params,
                          headers=self.headers, parse="parse_list")
def start_requests(self):
    """Emit the initial 51job search request for every (city, keyword) pair."""
    # URL template, hoisted out of the loop since it never changes.
    # BUG FIX: the original literal contained "cotype=99°reefrom=99" — the
    # "&deg" of "&degreefrom" had been HTML-entity-decoded into the degree
    # sign, corrupting the query string. Restored to "&degreefrom=99".
    origin_url = (u'http://search.51job.com/list/{location},000000,0000,00,'
                  u'{pub_date},99,{keyword},2,1.html?lang=c&stype=1'
                  u'&postchannel=0000&workyear={workyear}&cotype=99'
                  u'&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0'
                  u'&radius=-1&ord_field=0&confirmdate=9&fromType=5'
                  u'&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=')
    workyear = "99"                           # 99 = any work experience
    pub_date = self.update_time_code['7d']    # postings from the last 7 days
    for city in CITY:
        for keyword in KEYWORDS:
            url = origin_url.format(
                location=self.city_code[city],
                pub_date=pub_date,
                keyword=urllib.quote(keyword.encode("utf-8")),
                workyear=workyear)
            yield Request(url, headers=self.headers, parse="parse_list")