def parse_business_district(self, response):
    """
    Parse the business-district page and collect the category URLs under it.
    :param response:
    :return:
    """
    city_name = Selector(response).xpath(
        '//*[@id="logo-input"]/div/a[2]/span/text()').extract()[0]
    business_district = response.meta.get('business_district')
    adname = response.meta.get('item')
    all_url = Selector(response=response).xpath(
        '/html/body/div[2]/div[2]/div[1]/div/div/div/a/@href').extract()
    all_text = Selector(response=response).xpath(
        '/html/body/div[2]/div[2]/div[1]/div/div/div/a/span/text()'
    ).extract()
    for c_url in range(len(all_url)):
        create_sleep()
        dic = {
            'item': adname,
            'business_district': business_district,
            'category_type': all_text[c_url],
            'city_name': city_name
        }
        yield scrapy.Request(url=all_url[c_url],
                             meta=dic,
                             callback=self.parse_category_type,
                             headers=header1)
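# `create_sleep()` is called before every request in this section but is not
# defined here. A minimal sketch of such a throttling helper (the 1-3 second
# range is an assumption, not the original value):
import random
import time


def create_sleep(low=1, high=3):
    """Sleep for a random interval to avoid hitting the site too quickly."""
    time.sleep(random.uniform(low, high))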
def parse(self, response):
    """
    Parse the start page and collect the district URLs.
    :param response:
    :return:
    """
    try:
        city_name = Selector(response).xpath(
            '//*[@id="logo-input"]/div/a[2]/span/text()').extract()[0]
    except Exception:
        city_name = None
    all_area_url = Selector(response).xpath(
        '//*[@id="region-nav"]/a/@href').extract()
    all_area = Selector(response).xpath(
        '//*[@id="region-nav"]/a/span/text()').extract()
    for i in range(len(all_area_url)):
        area_url = all_area_url[i]
        adname = all_area[i]
        create_sleep()
        yield scrapy.Request(url=area_url,
                             meta={
                                 "city_name": city_name,
                                 "adname": adname
                             },
                             callback=self.parse_page,
                             headers=header)
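# `header`, `header1`, `header2` ... and `create_header()` are request-header
# helpers defined outside this section. A minimal sketch of a rotating
# User-Agent helper (the UA strings are placeholders, not the originals):
import random

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
]


def create_header():
    """Build request headers with a randomly chosen User-Agent."""
    return {'User-Agent': random.choice(USER_AGENTS)}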
def parse(self, response):
    """
    Parse the district page and collect the business-district URLs.
    :param response:
    :return:
    """
    try:
        all_area = Selector(response).xpath(
            '//*[@id="top"]/div[6]/div/div[2]/dl[1]')
    except Exception:
        all_area = []
    for c_area in all_area:
        adname = c_area.xpath('./dt/a/text()').extract()[0]
        all_url = c_area.xpath('./dd/ul/li/a/@href').extract()
        all_text = c_area.xpath('./dd/ul/li/a/text()').extract()
        for c_url in range(len(all_url)):
            n_url = self.header_url + all_url[c_url]
            create_sleep()
            yield scrapy.Request(url=n_url,
                                 meta={
                                     'item': adname,
                                     'business_district': all_text[c_url]
                                 },
                                 callback=self.parse_business_district,
                                 headers=header)
def parse_page(self, response):
    """
    Parse a listing page and collect the shop URLs.
    :param response:
    :return:
    """
    city_name = response.meta.get('city_name')
    category_type = response.meta.get('category_type')
    business_district = response.meta.get('business_district')
    adname = response.meta.get('item')
    all_url = Selector(response=response).xpath(
        '//*[@id="shop-all-list"]/ul/li/div[2]/div[1]/a/@href').extract()
    # header["User-Agent"] = get_user_agent()
    dic = {
        'item': adname,
        'business_district': business_district,
        'category_type': category_type,
        'city_name': city_name
    }
    for c_url in all_url:
        store_id = c_url.split('/')[-1]
        # Only request shops that have not been stored yet.
        ret = spider_service.select_id(store_id, self.data_table)
        if ret:
            create_sleep()
            # Copy the meta dict per request so queued requests keep their
            # own 'execute' value instead of sharing one mutated dict.
            yield scrapy.Request(url=c_url,
                                 meta=dict(dic, execute=ret),
                                 callback=self.parse_brand,
                                 headers=create_header())
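# `spider_service.select_id` is an external service module that is not shown
# here. Judging by the call sites, it returns a truthy value when the id has
# not been stored in the given table yet, so only new shops get requested.
# A minimal sketch of one possible MySQL-backed implementation (the column
# name `store_id` and the pymysql backend are assumptions):
import pymysql


def select_id(store_id, data_table, conn_kwargs=None):
    """Return True if `store_id` is not yet present in `data_table`."""
    conn = pymysql.connect(**(conn_kwargs or {}))
    try:
        with conn.cursor() as cur:
            cur.execute(
                "SELECT 1 FROM {} WHERE store_id = %s LIMIT 1".format(data_table),
                (store_id,))
            return cur.fetchone() is None
    finally:
        conn.close()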
def parse_category_type(self, response):
    """
    Parse the category page and generate a request for every result page.
    :param response:
    :return:
    """
    city_name = response.meta.get('city_name')
    category_type = response.meta.get('category_type')
    business_district = response.meta.get('business_district')
    adname = response.meta.get('item')
    dic = {
        'item': adname,
        'business_district': business_district,
        'category_type': category_type,
        'city_name': city_name
    }
    try:
        # href of the second-to-last pager link, i.e. the last page number.
        all_page = PyQuery(
            response.body).find('.page').find('a').eq(-2).attr('href')
    except Exception:
        all_page = None
    if all_page:
        num = all_page.split('/')[-1].split('p')[-1]
        try:
            c_num, aid = num.split('?')
        except ValueError:
            c_num = num
            aid = ""
        head, head1, mid, end = all_page.split('p')
        for c_page in range(1, int(c_num) + 1):
            n_page = head + 'p' + head1 + 'p' + mid + 'p{}'.format(c_page)
            if aid:
                n_page = n_page + '?' + aid
            md5_url = self.md5(n_page)
            ret = spider_service.select_url(md5_url, self.md5_table)
            if not ret:
                create_sleep()
                spider_service.update_into(md5_url, self.md5_table)
                yield scrapy.Request(url=n_page,
                                     meta=dic,
                                     callback=self.parse_page,
                                     headers=header2)
            else:
                time.sleep(3)
    else:
        # No pager found: treat the current URL as the single result page.
        md5_url = self.md5(response.url)
        spider_service.update_into(md5_url, self.md5_table)
        yield scrapy.Request(url=response.url,
                             meta=dic,
                             callback=self.parse_page,
                             headers=header3)
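# The pager href above is rebuilt by splitting on the letter 'p'. For an
# assumed last-page href such as
# 'http://www.dianping.com/guangzhou/ch10/r28p15?aid=0' (a hypothetical
# example, not taken from this section):
#   num         -> '15?aid=0', so c_num = '15' and aid = 'aid=0'
#   split('p')  -> ['htt', '://www.dian', 'ing.com/guangzhou/ch10/r28', '15?aid=0']
#   and the loop re-assembles '...r28p{n}?aid=0' for n = 1..15.
#
# `self.md5` is not defined in this section; it is presumably the hashing
# helper behind the `md5_table` dedup lookups. A minimal sketch:
import hashlib


def md5(url):
    """Hex MD5 digest of a page URL, used as its dedup key."""
    return hashlib.md5(url.encode('utf-8')).hexdigest()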
def parse(self, response):
    """
    Entry point: collect the company detail URLs from the result list.
    :param response:
    :return:
    """
    city = response.xpath(
        '/html/body/div[2]/div[3]/div/p/text()').extract()[0]
    all_company = response.xpath(
        '//*[@id="resultList"]/div/span[1]/a/@href').extract()
    for c_company in all_company:
        create_sleep()
        yield scrapy.Request(url=c_company,
                             meta={'city': city},
                             callback=self.parse_company)
def parse_page(self, response):
    """
    Collect the residential-community URLs on the current page.
    :param response:
    :return:
    """
    dic = response.meta
    # city_name = Selector(response).xpath('//*[@id="city"]/a/span/text()').extract()[0]
    all_url = Selector(response).xpath(
        '/html/body/div[4]/div[4]/div[1]/div[2]/dl/dd[1]/p/a/@href'
    ).extract()
    head_url = response.url.split('//')[-1].split('/')[0]
    for c_url in all_url:
        n_url = "http://" + head_url + c_url
        create_sleep()
        yield scrapy.Request(url=n_url, meta=dic, callback=self.parse_name)
def click_end_page(self):
    """
    Jump to the last page of the pager.
    :return:
    """
    area_html = self.browser.page_source
    tag_page = '//*[@id="sojob"]/div[2]/div/div[1]/div[1]/div/div/a[{}]'
    c_page_num = len(
        Selector(text=area_html).xpath(
            '//*[@id="sojob"]/div[2]/div/div[1]/div[1]/div/div/a'))
    c_tag_page = tag_page.format(c_page_num)
    n_tag_page = self.browser.find_element_by_xpath(c_tag_page)
    webdriver.ActionChains(
        self.browser).move_to_element(n_tag_page).perform()
    create_sleep()
    webdriver.ActionChains(
        self.browser).move_to_element(n_tag_page).click().perform()
    create_sleep()
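# The Selenium-based methods in this section assume `self.browser` is an
# already-initialised WebDriver. A minimal sketch of how such a driver might
# be created (Chrome and the headless flag are assumptions, not taken from
# this code):
from selenium import webdriver


def make_browser():
    """Create a Chrome WebDriver for the Selenium-driven spiders."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    return webdriver.Chrome(options=options)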
def parse(self, response):
    """
    Collect the district URLs.
    :param response:
    :return:
    """
    province, city_name = get_info(response.url)
    dic = {"province": province, "city_name": city_name}
    all_area = Selector(response).xpath(
        '/html/body/div[4]/div[3]/div/dl[1]/dd[1]/a')
    for i in range(1, len(all_area)):
        c_area = all_area[i].xpath('./text()').extract()[0]
        c_url = response.url + '/' + all_area[i].xpath(
            './@href').extract()[0].split('/')[-1]
        create_sleep()
        # Copy the meta dict per request so the 'area' of queued requests is
        # not overwritten by later iterations.
        yield scrapy.Request(url=c_url,
                             meta=dict(dic, area=c_area),
                             callback=self.parse_all_page)
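# `get_info` is imported from elsewhere and is not shown in this section; from
# the call above it returns (province, city_name) derived from the start URL.
# A minimal, purely hypothetical sketch assuming the province and city appear
# as the first two path segments of the URL:
def get_info(url):
    """Return (province, city_name) parsed from a start URL."""
    parts = url.split('//')[-1].rstrip('/').split('/')
    province = parts[1] if len(parts) > 1 else None
    city_name = parts[2] if len(parts) > 2 else None
    return province, city_name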
def brower_get(self):
    """
    Open the target search page and click the desired filter link.
    :return:
    """
    self.browser.get(
        'https://www.liepin.com/zhaopin/?d_sfrom=search_fp_nvbar&init=1')
    create_sleep()
    try:
        WebDriverWait(
            self.browser, 10
        ).until(lambda the_brower: the_brower.find_element_by_xpath(
            '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[3]/dd/a[14]'
        ))
    except Exception:
        pass
    other = self.browser.find_element_by_xpath(
        '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[3]/dd/a[14]')
    webdriver.ActionChains(
        self.browser).move_to_element(other).click().perform()
    create_sleep()
def parse_page(self, response):
    """
    Parse the pager and generate one request per result page.
    :param response:
    :return:
    """
    city_name = response.meta.get('city_name')
    adname = response.meta.get('adname')
    end_page = PyQuery(
        response.body).find('.page').find('a').eq(-2).attr('href')
    dic = {"city_name": city_name, "adname": adname}
    if end_page:
        num = end_page.split('/')[-1].split('p')[-1]
        try:
            c_num, aid = num.split('?')
        except ValueError:
            c_num = num
            aid = ""
        head, head1, mid, end = end_page.split('p')
        for c_page in range(1, int(c_num) + 1):
            n_page = head + 'p' + head1 + 'p' + mid + 'p{}'.format(c_page)
            if aid:
                n_page = n_page + '?' + aid
            create_sleep()
            yield scrapy.Request(url=n_page,
                                 meta=dic,
                                 callback=self.parse_area,
                                 headers=header1)
    else:
        create_sleep()
        yield scrapy.Request(url=response.url,
                             meta=dic,
                             callback=self.parse_area,
                             headers=header1)
def login(self, url):
    """
    Log in.
    :param url:
    :return:
    """
    self.browser.get(url)
    self.browser.find_element_by_xpath(
        '//*[@id="home"]/div[2]/div[1]/div[2]/div/form[2]/div[5]/p/a'
    ).click()
    self.browser.find_element_by_xpath(
        '//*[@id="home"]/div[2]/div[1]/div[2]/div/form[1]/div[1]/input'
    ).send_keys('15736755067')
    create_sleep()
    self.browser.find_element_by_xpath(
        '//*[@id="home"]/div[2]/div[1]/div[2]/div/form[1]/div[2]/input'
    ).send_keys('taochen123')
    create_sleep()
    self.browser.find_element_by_xpath(
        '//*[@id="home"]/div[2]/div[1]/div[2]/div/form[1]/input[3]').click()
    create_sleep()
def parse_all_page(self, response):
    """
    Find the last-page number and generate a request for every page.
    :param response:
    :return:
    """
    dic = response.meta
    try:
        max_page_li = Selector(response).xpath(
            '/html/body/div[4]/div[4]/div[1]/div[3]/ul/li')[-3]
    except Exception:
        max_page_li = None
    if max_page_li is not None:
        try:
            max_page = max_page_li.xpath('./a/text()').extract()[0]
        except Exception:
            max_page = None
        if max_page:
            head_url = response.url
            for i in range(1, int(max_page) + 1):
                n_page_url = head_url + "-bl{}".format(i)
                create_sleep()
                yield scrapy.Request(url=n_page_url,
                                     meta=dic,
                                     callback=self.parse_page)
        else:
            create_sleep()
            yield scrapy.Request(url=response.url,
                                 meta=dic,
                                 callback=self.parse_page)
    else:
        create_sleep()
        yield scrapy.Request(url=response.url,
                             meta=dic,
                             callback=self.parse_page)
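# The "-bl{n}" suffix above builds the paginated community-list URLs. For an
# assumed district URL such as 'https://example.com/xiaoqu/xihu' with
# max_page = 3 (hypothetical values), the generated pages are
# '.../xihu-bl1', '.../xihu-bl2' and '.../xihu-bl3'.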
def parse(self, response):
    """
    Entry point: log in, then walk every province, city and district filter.
    :param response:
    :return:
    """
    login_url = response.url
    self.login(login_url)
    self.brower_get()
    tag_province = '//*[@class="data-list"]/ul/li[{}]/a'
    html = self.browser.page_source
    all_num_tag_province = len(
        Selector(
            text=html).xpath('//*[@class="data-list"]/ul/li/a').extract())
    for i in range(1, all_num_tag_province + 1):
        self.brower_get()
        c_tag_province = tag_province.format(i)
        n_tag_province = self.browser.find_element_by_xpath(c_tag_province)
        webdriver.ActionChains(self.browser).move_to_element(
            n_tag_province).click().perform()
        create_sleep()
        c_html = self.browser.page_source
        tag_city = '//*[@class="data-list"]/ul/li[{}]/a'
        all_num_tag_city = len(
            Selector(text=c_html).xpath(
                '//*[@class="data-list"]/ul/li/a').extract())
        for k in range(1, all_num_tag_city + 1):
            # Navigate back to the filter page and re-select the province
            # before picking the k-th city.
            self.brower_get()
            create_sleep()
            cc_tag_province = self.browser.find_element_by_xpath(
                c_tag_province)
            webdriver.ActionChains(
                self.browser).move_to_element(cc_tag_province).perform()
            create_sleep()
            webdriver.ActionChains(self.browser).move_to_element(
                cc_tag_province).click().perform()
            create_sleep()
            c_tag_city = tag_city.format(k)
            c_tag_city = self.browser.find_element_by_xpath(c_tag_city)
            webdriver.ActionChains(
                self.browser).move_to_element(c_tag_city).perform()
            create_sleep()
            webdriver.ActionChains(self.browser).move_to_element(
                c_tag_city).click().perform()
            create_sleep()
            self.browser.find_element_by_xpath(
                '//*[@id="sojob"]/div[10]/div[3]/a[2]').click()
            create_sleep()
            city_html = self.browser.page_source
            key_tag = Selector(text=city_html).xpath(
                '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl/dt/text()'
            ).extract()
            # The position of the "地区:" (district) filter block varies, so
            # probe both possible dl indexes.
            if key_tag[3] == '地区:':
                all_num_tag_area = len(
                    Selector(text=city_html).xpath(
                        '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[4]/dd/a'
                    ).extract())
            elif key_tag[2] == '地区:':
                tag_area = '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[3]/dd/a[{}]'
                all_num_tag_area = len(
                    Selector(text=city_html).xpath(
                        '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[3]/dd/a'
                    ).extract())
            else:
                self.click_end_page()
                self.parse_page()
                continue
            for m in range(1, all_num_tag_area + 1):
                c_area_html = self.browser.page_source
                c_key_tag = Selector(text=c_area_html).xpath(
                    '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl/dt/text()'
                ).extract()
                if c_key_tag[3] == '地区:':
                    try:
                        tag_area = '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[4]/dd/a[{}]'
                        c_tag_area = tag_area.format(m)
                        n_tag_area = self.browser.find_element_by_xpath(
                            c_tag_area)
                    except Exception:
                        tag_area = '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[3]/dd/a[{}]'
                        c_tag_area = tag_area.format(m)
                        n_tag_area = self.browser.find_element_by_xpath(
                            c_tag_area)
                elif c_key_tag[2] == '地区:':
                    try:
                        tag_area = '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[3]/dd/a[{}]'
                        c_tag_area = tag_area.format(m)
                        n_tag_area = self.browser.find_element_by_xpath(
                            c_tag_area)
                    except Exception:
                        tag_area = '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[4]/dd/a[{}]'
                        c_tag_area = tag_area.format(m)
                        n_tag_area = self.browser.find_element_by_xpath(
                            c_tag_area)
                webdriver.ActionChains(self.browser).move_to_element(
                    n_tag_area).click().perform()
                create_sleep()
                self.click_end_page()
                self.parse_page()
def parse(self, response):
    """
    Pop a page URL from Redis, visit each job posting on it and extract the
    company information.
    :param response:
    :return:
    """
    login_url = response.url
    self.login(login_url)
    url = r.brpop('liepin', 0)[1]
    self.browser.get(url)
    create_sleep()
    try:
        WebDriverWait(
            self.browser, 10
        ).until(lambda the_brower: the_brower.find_element_by_xpath(
            '//*[@id="sojob"]/div[2]/div/div[1]/div[1]/ul/li/div/div[1]/h3/a'
        ))
    except Exception:
        pass
    c_page_html = self.browser.page_source
    all_job_url = Selector(text=c_page_html).xpath(
        '//*[@id="sojob"]/div[2]/div/div[1]/div[1]/ul/li/div/div[1]/h3/a/@href'
    ).extract()
    for c_job_url in all_job_url:
        if c_job_url.startswith('http'):
            self.browser.get(c_job_url)
        else:
            n_job_url = 'https://www.liepin.com' + c_job_url
            self.browser.get(n_job_url)
        create_sleep()
        try:
            WebDriverWait(self.browser, 10).until(
                lambda the_browser: the_browser.find_element_by_xpath(
                    '//*[@class="job-qualifications"]/span'))
        except Exception:
            pass
        company_html = self.browser.page_source
        try:
            city = Selector(text=company_html).xpath(
                '//*[@class="basic-infor"]/span/a/text()').extract()[0]
        except Exception:
            city = Selector(text=company_html).xpath(
                '//*[@class="basic-infor"]/span').extract()[0].split(
                    '</i>')[-1].split('</span>')[0].strip()
        if city:
            city = city.split('-')[0]
        try:
            name = Selector(text=company_html).xpath(
                '//*[@class="company-logo"]/p/a/text()').extract()[0]
        except Exception:
            name = Selector(text=company_html).xpath(
                '//*[@class="company-name"]/text()').extract()[0]
        try:
            size = Selector(text=company_html).xpath(
                '//*[@class="new-compintro"]/li[2]/text()').extract(
                )[0].split(':')[-1]
        except Exception:
            try:
                size = Selector(text=company_html).xpath(
                    '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[4]/div/ul/li[6]'
                ).extract()[0].split('</span>')[-1].split('</li>')[0]
            except Exception:
                size = None
        try:
            nature = Selector(text=company_html).xpath(
                '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[4]/div/ul/li[5]'
            ).extract()[0].split('</span>')[-1].split('</li>')[0]
        except Exception:
            nature = None
        try:
            industry = Selector(text=company_html).xpath(
                '//*[@class="new-compintro"]/li[1]/a/text()').extract()[0]
        except Exception:
            try:
                industry = Selector(text=company_html).xpath(
                    '//*[@class="new-compintro"]/li[1]/text()').extract(
                    )[0].split(':')[-1]
            except Exception:
                try:
                    industry = Selector(text=company_html).xpath(
                        '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[4]/div/ul/li[3]/a/@title'
                    ).extract()[0]
                except Exception:
                    industry = None
        # website
        try:
            address = Selector(text=company_html).xpath(
                '//*[@class="new-compintro"]/li[3]/text()').extract(
                )[0].split(':')[-1]
        except Exception:
            address = None
        # job_qualifications = Selector(text=company_html).xpath(
        #     '//*[@class="job-qualifications"]/span/text()').extract()
        # all_job_qualifications = ''
        # for c_job_qualification in job_qualifications:
        #     all_job_qualifications = all_job_qualifications + '|' + c_job_qualification
        dic = {
            'city': city,
            'name': name,
            'nature': nature,
            'size': size,
            'industry': industry,
            'address': address,
        }
        item = LiepinreceiveItem(dic=dic)
        yield item
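# The URLs consumed above with `r.brpop('liepin', 0)` have to be pushed into
# the same Redis list by a producer elsewhere in the project. A minimal sketch
# of such a producer (connection parameters are assumptions):
import redis


def push_list_urls(urls, host='localhost', port=6379):
    """Push page URLs onto the 'liepin' list for the spider to consume."""
    client = redis.Redis(host=host, port=port)
    for url in urls:
        client.lpush('liepin', url)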
def parse_area(self, response):
    """
    Parse a district listing page: collect each shop's URL together with its
    list-level fields (rating, review count, address, and so on).
    :param response:
    :return:
    """
    city_name = response.meta.get('city_name')
    adname = response.meta.get('adname')
    shop_type = Selector(response).xpath(
        '//*[@id="classfy"]/a[@class="cur"]/span/text()').extract()[0]
    all_store_url = Selector(response).xpath(
        '//*[@id="shop-all-list"]/ul/li/div[2]/div[1]/a[1]/@href').extract()
    all_comments_count = Selector(response).xpath(
        '//*[@id="shop-all-list"]/ul/li/div[2]/div[2]/a[1]')
    all_comments_count_list = []
    for comments_count_i in all_comments_count:
        try:
            comments_count = comments_count_i.xpath(
                './b/text()').extract()[0]
        except Exception:
            comments_count = None
        all_comments_count_list.append(comments_count)
    all_address = Selector(response).xpath(
        '//*[@id="shop-all-list"]/ul/li/div[2]/div[3]')
    all_address_list = []
    for c_address in all_address:
        try:
            cc_address = c_address.xpath('./span/text()').extract()[0]
        except Exception:
            cc_address = None
        all_address_list.append(cc_address)
    all_consumption_amt = Selector(response).xpath(
        '//*[@id="shop-all-list"]/ul/li/div[2]/div[2]/a[2]')
    all_consumption_amt_list = []
    for c_consumption_amt in all_consumption_amt:
        try:
            cc_consumption_amt = c_consumption_amt.xpath(
                './b/text()').extract()[0].split('¥')[-1]
        except Exception:
            cc_consumption_amt = None
        all_consumption_amt_list.append(cc_consumption_amt)
    all_business_district = Selector(response).xpath(
        '//*[@id="shop-all-list"]/ul/li/div[2]/div[3]/a[2]')
    all_business_district_list = []
    for c_business_district in all_business_district:
        try:
            cc_business_district = c_business_district.xpath(
                './span/text()').extract()[0]
        except Exception:
            cc_business_district = None
        all_business_district_list.append(cc_business_district)
    all_place_name = Selector(response).xpath(
        '//*[@id="shop-all-list"]/ul/li/div[2]/div[1]/a')
    all_place_name_list = []
    for c_place_name in all_place_name:
        try:
            cc_place_name = c_place_name.xpath('./h4/text()').extract()[0]
        except Exception:
            cc_place_name = None
        all_place_name_list.append(cc_place_name)
    all_star = Selector(response).xpath(
        '//*[@id="shop-all-list"]/ul/li/div[2]/div[2]')
    all_star_list = []
    for c_star in all_star:
        try:
            cc_star = c_star.xpath('./span/@title').extract()[0]
        except Exception:
            cc_star = "该商户暂无星级"
        all_star_list.append(cc_star)
    all_con = Selector(response).xpath(
        '//*[@id="shop-all-list"]/ul/li/div[@class="txt"]')
    all_quality_list = []
    all_environment_list = []
    all_service_list = []
    for c_con in all_con:
        try:
            c_quality = c_con.xpath(
                './span[@class="comment-list"]/span[1]/b/text()').extract(
                )[0]
        except Exception:
            c_quality = ""
        all_quality_list.append(c_quality)
        try:
            c_environment = c_con.xpath(
                './span[@class="comment-list"]/span[2]/b/text()').extract(
                )[0]
        except Exception:
            c_environment = ""
        all_environment_list.append(c_environment)
        try:
            c_service = c_con.xpath(
                './span[@class="comment-list"]/span[3]/b/text()').extract(
                )[0]
        except Exception:
            c_service = ""
        all_service_list.append(c_service)
    for i in range(len(all_store_url)):
        c_store_url = all_store_url[i]
        store_id = c_store_url.split('/')[-1]
        ret = spider_service.select_id(store_id, self.table_name)
        c_comments_count = all_comments_count_list[i]
        address = all_address_list[i]
        consumption_amt = all_consumption_amt_list[i]
        business_district = all_business_district_list[i]
        place_name = all_place_name_list[i]
        star = all_star_list[i]
        quality = all_quality_list[i]
        environment = all_environment_list[i]
        service = all_service_list[i]
        dic = {
            "city_name": city_name,
            "adname": adname,
            "comments_count": c_comments_count,
            "execute": ret,
            "type": shop_type,
            "address": address,
            "consumption_amt": consumption_amt,
            "id": store_id,
            "business_district": business_district,
            "place_name": place_name,
            "star": star,
            "quality": quality,
            "environment": environment,
            "service": service
        }
        create_sleep()
        if ret:
            # item = DazhongguangchangItem(dic=dic)
            # yield item
            yield scrapy.Request(url=c_store_url,
                                 meta=dic,
                                 callback=self.parse_store,
                                 headers=header4)