Example #1
 def parse_business_district(self, response):
     """
     Parse a business-district URL and collect the category-type URLs it contains.
     :param response:
     :return:
     """
     city_name = Selector(response).xpath(
         '//*[@id="logo-input"]/div/a[2]/span/text()').extract()[0]
     business_district = response.meta.get('business_district')
     adname = response.meta.get('item')
     all_url = Selector(response=response).xpath(
         '/html/body/div[2]/div[2]/div[1]/div/div/div/a/@href').extract()
     all_text = Selector(response=response).xpath(
         '/html/body/div[2]/div[2]/div[1]/div/div/div/a/span/text()'
     ).extract()
     for c_url, c_text in zip(all_url, all_text):
         create_sleep()
         dic = {
             'item': adname,
             'business_district': business_district,
             'category_type': c_text,
             'city_name': city_name
         }
         yield scrapy.Request(url=c_url,
                              meta=dic,
                              callback=self.parse_category_type,
                              headers=header1)
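These examples lean on two helpers that are never shown, create_sleep() and create_header(). A minimal sketch of what they might look like, assuming create_sleep() is a random throttling delay and create_header() rotates the User-Agent (both are assumptions, not the original implementations):

import random
import time


def create_sleep():
    # Assumed throttling helper: pause a short random interval between
    # requests so the target site is not hammered.
    time.sleep(random.uniform(1, 3))


def create_header():
    # Assumed header factory: pick a User-Agent per request.
    # The user_agents pool below is hypothetical.
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
    ]
    return {'User-Agent': random.choice(user_agents)}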
Example #2
 def parse(self, response):
     """
     Parse the initial page.
     :param response:
     :return:
     """
     try:
         city_name = Selector(response).xpath(
             '//*[@id="logo-input"]/div/a[2]/span/text()').extract()[0]
     except IndexError:
         city_name = None
     all_area_url = Selector(response).xpath(
         '//*[@id="region-nav"]/a/@href').extract()
     all_area = Selector(response).xpath(
         '//*[@id="region-nav"]/a/span/text()').extract()
     for area_url, adname in zip(all_area_url, all_area):
         create_sleep()
         yield scrapy.Request(url=area_url,
                              meta={
                                  "city_name": city_name,
                                  "adname": adname
                              },
                              callback=self.parse_page,
                              headers=header)
Example #3
    def parse(self, response):
        """
        Parse the page and collect the business-district URLs.
        :param response:
        :return:
        """

        try:
            all_area = Selector(response).xpath(
                '//*[@id="top"]/div[6]/div/div[2]/dl[1]')
        except Exception:
            all_area = []
        for c_area in all_area:
            adname = c_area.xpath('./dt/a/text()').extract()[0]
            all_url = c_area.xpath('./dd/ul/li/a/@href').extract()
            all_text = c_area.xpath('./dd/ul/li/a/text()').extract()
            for c_url, c_text in zip(all_url, all_text):
                n_url = self.header_url + c_url
                create_sleep()
                yield scrapy.Request(url=n_url,
                                     meta={
                                         'item': adname,
                                         'business_district': c_text
                                     },
                                     callback=self.parse_business_district,
                                     headers=header)
Example #4
    def parse_page(self, response):
        """
        Parse a listing page and collect the shop URLs.
        :param response:
        :return:
        """
        city_name = response.meta.get('city_name')
        category_type = response.meta.get('category_type')
        business_district = response.meta.get('business_district')
        adname = response.meta.get('item')
        all_url = Selector(response=response).xpath(
            '//*[@id="shop-all-list"]/ul/li/div[2]/div[1]/a/@href').extract()
        # header["User-Agent"] = get_user_agent()
        for c_url in all_url:
            store_id = c_url.split('/')[-1]
            ret = spider_service.select_id(store_id, self.data_table)
            if ret:
                # Build a fresh meta dict per request; mutating one shared
                # dict would also change the meta of requests already
                # yielded but not yet scheduled.
                dic = {
                    'item': adname,
                    'business_district': business_district,
                    'category_type': category_type,
                    'city_name': city_name,
                    'execute': ret
                }
                create_sleep()
                yield scrapy.Request(url=c_url,
                                     meta=dic,
                                     callback=self.parse_brand,
                                     headers=create_header())
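spider_service is likewise external. Judging from the call sites (select_id gates whether a store is crawled; select_url/update_into deduplicate page URLs), a minimal sqlite-backed sketch could look like this; the schema and semantics are inferred from usage, not taken from the source:

import sqlite3


class SpiderService:
    def __init__(self, db_path='spider.db'):
        self.conn = sqlite3.connect(db_path)

    def select_id(self, store_id, table):
        # Truthy when the store has NOT been crawled yet; callers only
        # yield a Request when this returns True.
        cur = self.conn.execute(
            'SELECT 1 FROM {} WHERE id = ?'.format(table), (store_id,))
        return cur.fetchone() is None

    def select_url(self, md5_url, table):
        # Truthy when the page URL was already visited.
        cur = self.conn.execute(
            'SELECT 1 FROM {} WHERE md5 = ?'.format(table), (md5_url,))
        return cur.fetchone() is not None

    def update_into(self, md5_url, table):
        # Record a visited page URL. Table names are interpolated for
        # brevity; they are trusted constants in this sketch.
        self.conn.execute(
            'INSERT INTO {} (md5) VALUES (?)'.format(table), (md5_url,))
        self.conn.commit()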
Example #5
    def parse_category_type(self, response):
        """
        Parse a category-type URL and generate the paginated page URLs.
        :param response:
        :return:
        """
        city_name = response.meta.get('city_name')
        category_type = response.meta.get('category_type')
        business_district = response.meta.get('business_district')
        adname = response.meta.get('item')
        dic = {
            'item': adname,
            'business_district': business_district,
            'category_type': category_type,
            'city_name': city_name
        }
        try:
            all_page = PyQuery(
                response.body).find('.page').find('a').eq(-2).attr('href')
        except Exception:
            all_page = None
        if all_page:
            num = all_page.split('/')[-1].split('p')[-1]
            try:
                c_num, aid = num.split('?')
            except ValueError:
                c_num = num
                aid = ""
            head, head1, mid, end = all_page.split('p')

            for c_page in range(1, int(c_num) + 1):
                if aid == '':
                    n_page = head + 'p' + head1 + 'p' + mid + 'p{}'.format(
                        c_page)
                else:
                    n_page = head + 'p' + head1 + 'p' + mid + 'p{}'.format(
                        c_page) + '?' + aid
                md5_url = self.md5(n_page)
                ret = spider_service.select_url(md5_url, self.md5_table)
                if not ret:
                    create_sleep()
                    spider_service.update_into(md5_url, self.md5_table)
                    yield scrapy.Request(url=n_page,
                                         meta=dic,
                                         callback=self.parse_page,
                                         headers=header2)

        else:
            time.sleep(3)
            md5_url = self.md5(response.url)
            spider_service.update_into(md5_url, self.md5_table)
            yield scrapy.Request(url=response.url,
                                 meta=dic,
                                 callback=self.parse_page,
                                 headers=header3)
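The split('p')-based reconstruction above relies on the shape of Dianping pagination hrefs: exactly three 'p' characters occur ('http', 'dianping', and the trailing page marker 'pN'), so the four-way unpack isolates the page number and the prefix can be reassembled for any page. A worked sketch with an illustrative URL:

# Hypothetical last-page href in the expected shape:
all_page = 'http://www.dianping.com/shanghai/ch10/g110r801p50'

# The last path segment ends in 'p<max page>':
num = all_page.split('/')[-1].split('p')[-1]   # -> '50'

# Three 'p' characters -> four parts:
head, head1, mid, end = all_page.split('p')
template = head + 'p' + head1 + 'p' + mid + 'p{}'
print(template.format(3))
# -> http://www.dianping.com/shanghai/ch10/g110r801p3

self.md5 is also not shown; hashing the page URL for the md5_table lookup is the obvious reading, e.g. (an assumption):

import hashlib


def md5(self, url):
    # Assumed dedup helper: hash the URL so visited pages can be looked
    # up in self.md5_table.
    return hashlib.md5(url.encode('utf-8')).hexdigest()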
Example #6
 def parse(self, response):
     """
     Entry point.
     :param response:
     :return:
     """
     city = response.xpath(
         '/html/body/div[2]/div[3]/div/p/text()').extract()[0]
     all_company = response.xpath(
         '//*[@id="resultList"]/div/span[1]/a/@href').extract()
     for c_company in all_company:
         create_sleep()
         yield scrapy.Request(url=c_company,
                              meta={'city': city},
                              callback=self.parse_company)
Example #7
 def parse_page(self, response):
     """
     Collect the residential-community URLs on the current page.
     :param response:
     :return:
     """
     dic = response.meta
     # city_name = Selector(response).xpath('//*[@id="city"]/a/span/text()').extract()[0]
     all_url = Selector(response).xpath(
         '/html/body/div[4]/div[4]/div[1]/div[2]/dl/dd[1]/p/a/@href'
     ).extract()
     host = response.url.split('//')[-1].split('/')[0]
     for c_url in all_url:
         n_url = "http://" + host + c_url
         create_sleep()
         yield scrapy.Request(url=n_url, meta=dic, callback=self.parse_name)
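The chained split() host extraction works for absolute http(s) URLs; the standard library states the same intent more directly (an equivalent drop-in sketch):

from urllib.parse import urlparse

# Equivalent to response.url.split('//')[-1].split('/')[0]:
host = urlparse(response.url).netloc
n_url = 'http://' + host + c_url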
Example #8
 def click_end_page(self):
     """
     Jump to the last page of results.
     :return:
     """
     area_html = self.browser.page_source
     # The last <a> in the pager jumps to the final page.
     tag_page = '//*[@id="sojob"]/div[2]/div/div[1]/div[1]/div/div/a[{}]'
     c_page_num = len(
         Selector(text=area_html).xpath(
             '//*[@id="sojob"]/div[2]/div/div[1]/div[1]/div/div/a'))
     c_tag_page = tag_page.format(c_page_num)
     n_tag_page = self.browser.find_element_by_xpath(c_tag_page)
     webdriver.ActionChains(
         self.browser).move_to_element(n_tag_page).perform()
     create_sleep()
     webdriver.ActionChains(
         self.browser).move_to_element(n_tag_page).click().perform()
     create_sleep()
Example #9
 def parse(self, response):
     """
     Collect the district URLs.
     :param response:
     :return:
     """
     province, city_name = get_info(response.url)
     dic = {"province": province, "city_name": city_name}
     all_area = Selector(response).xpath(
         '/html/body/div[4]/div[3]/div/dl[1]/dd[1]/a')
     for i in range(1, len(all_area)):
         c_area = all_area[i].xpath('./text()').extract()[0]
         dic['area'] = c_area
         c_url = response.url + '/' + all_area[i].xpath(
             './@href').extract()[0].split('/')[-1]
         create_sleep()
         yield scrapy.Request(url=c_url,
                              meta=dic,
                              callback=self.parse_all_page)
Example #10
 def brower_get(self):
     """
     Open the target listing page.
     :return:
     """
     self.browser.get(
         'https://www.liepin.com/zhaopin/?d_sfrom=search_fp_nvbar&init=1')
     create_sleep()
     try:
         WebDriverWait(
             self.browser, 10
         ).until(lambda the_browser: the_browser.find_element_by_xpath(
             '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[3]/dd/a[14]'
         ))
     except Exception:
         # Timed out waiting for the filter link; continue regardless.
         pass
     other = self.browser.find_element_by_xpath(
         '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[3]/dd/a[14]')
     webdriver.ActionChains(
         self.browser).move_to_element(other).click().perform()
     create_sleep()
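Selenium's expected_conditions module expresses the same wait without a hand-rolled lambda, and until() returns the located element directly, so the follow-up find_element call becomes unnecessary. A sketch of how it could look inside brower_get(), with the same XPath and timeout:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

xpath = '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[3]/dd/a[14]'
other = WebDriverWait(self.browser, 10).until(
    EC.presence_of_element_located((By.XPATH, xpath)))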
Example #11
    def parse_page(self, response):
        """
        Parse the pagination and generate per-page requests.
        :param response:
        :return:
        """
        city_name = response.meta.get('city_name')
        adname = response.meta.get('adname')
        end_page = PyQuery(
            response.body).find('.page').find('a').eq(-2).attr('href')
        dic = {"city_name": city_name, "adname": adname}
        if end_page:
            num = end_page.split('/')[-1].split('p')[-1]
            try:
                c_num, aid = num.split('?')
            except ValueError:
                c_num = num
                aid = ""
            head, head1, mid, end = end_page.split('p')

            for c_page in range(1, int(c_num) + 1):
                if aid == '':
                    n_page = head + 'p' + head1 + 'p' + mid + 'p{}'.format(
                        c_page)
                else:
                    n_page = head + 'p' + head1 + 'p' + mid + 'p{}'.format(
                        c_page) + '?' + aid
                create_sleep()
                yield scrapy.Request(url=n_page,
                                     meta=dic,
                                     callback=self.parse_area,
                                     headers=header1)
        else:
            create_sleep()
            yield scrapy.Request(url=response.url,
                                 meta=dic,
                                 callback=self.parse_area,
                                 headers=header1)
Example #12
 def login(self, url):
     """
     Log in.
     :param url:
     :return:
     """
     self.browser.get(url)
     self.browser.find_element_by_xpath(
         '//*[@id="home"]/div[2]/div[1]/div[2]/div/form[2]/div[5]/p/a'
     ).click()
     self.browser.find_element_by_xpath(
         '//*[@id="home"]/div[2]/div[1]/div[2]/div/form[1]/div[1]/input'
     ).send_keys('15736755067')
     create_sleep()
     self.browser.find_element_by_xpath(
         '//*[@id="home"]/div[2]/div[1]/div[2]/div/form[1]/div[2]/input'
     ).send_keys('taochen123')
     create_sleep()
     self.browser.find_element_by_xpath(
         '//*[@id="home"]/div[2]/div[1]/div[2]/div/form[1]/input[3]'
     ).click()
     create_sleep()
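The account and password above are hardcoded in the source; if reworked, they could come from the environment instead (a sketch; the variable names are hypothetical):

import os

account = os.environ.get('LIEPIN_ACCOUNT', '')
password = os.environ.get('LIEPIN_PASSWORD', '')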
Example #13
 def parse_all_page(self, response):
     """
     Find the max-page URL and generate the URL for every page.
     :param response:
     :return:
     """
     dic = response.meta
     max_page = None
     try:
         # The third-from-last pagination <li> holds the max-page link.
         max_page_li = Selector(response).xpath(
             '/html/body/div[4]/div[4]/div[1]/div[3]/ul/li')[-3]
         max_page = max_page_li.xpath('./a/text()').extract()[0]
     except Exception:
         pass
     if max_page:
         for i in range(1, int(max_page) + 1):
             n_page_url = response.url + "-bl{}".format(i)
             create_sleep()
             yield scrapy.Request(url=n_page_url,
                                  meta=dic,
                                  callback=self.parse_page)
     else:
         # No pagination found: crawl this single page as-is.
         create_sleep()
         yield scrapy.Request(url=response.url,
                              meta=dic,
                              callback=self.parse_page)
Example #14
 def parse(self, response):
     """
     Entry point: iterate over every province, city, and district filter.
     :param response:
     :return:
     """
     login_url = response.url
     self.login(login_url)
     self.brower_get()
     tag_province = '//*[@class="data-list"]/ul/li[{}]/a'
     html = self.browser.page_source
     all_num_tag_province = len(
         Selector(
             text=html).xpath('//*[@class="data-list"]/ul/li/a').extract())
     for i in range(1, all_num_tag_province + 1):
         self.brower_get()
         c_tag_province = tag_province.format(i)
         n_tag_province = self.browser.find_element_by_xpath(c_tag_province)
         webdriver.ActionChains(self.browser).move_to_element(
             n_tag_province).click().perform()
         create_sleep()
         c_html = self.browser.page_source
         tag_city = '//*[@class="data-list"]/ul/li[{}]/a'
         all_num_tag_city = len(
             Selector(text=c_html).xpath(
                 '//*[@class="data-list"]/ul/li/a').extract())
         for k in range(1, all_num_tag_city + 1):
             self.brower_get()
             create_sleep()
             cc_tag_province = self.browser.find_element_by_xpath(
                 c_tag_province)
             webdriver.ActionChains(
                 self.browser).move_to_element(cc_tag_province).perform()
             create_sleep()
             webdriver.ActionChains(self.browser).move_to_element(
                 cc_tag_province).click().perform()
             create_sleep()
             c_tag_city = tag_city.format(k)
             c_tag_city = self.browser.find_element_by_xpath(c_tag_city)
             webdriver.ActionChains(
                 self.browser).move_to_element(c_tag_city).perform()
             create_sleep()
             webdriver.ActionChains(self.browser).move_to_element(
                 c_tag_city).click().perform()
             create_sleep()
             self.browser.find_element_by_xpath(
                 '//*[@id="sojob"]/div[10]/div[3]/a[2]').click()
             create_sleep()
             city_html = self.browser.page_source
             key_tag = Selector(text=city_html).xpath(
                 '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl/dt/text()'
             ).extract()
              # '地区:' is the "Region:" label of the page's filter list.
              if key_tag[3] == '地区:':
                 all_num_tag_area = len(
                     Selector(text=city_html).xpath(
                         '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[4]/dd/a'
                     ).extract())
             elif key_tag[2] == '地区:':
                 tag_area = '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[3]/dd/a[{}]'
                 all_num_tag_area = len(
                     Selector(text=city_html).xpath(
                         '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[3]/dd/a'
                     ).extract())
             else:
                 self.click_end_page()
                 self.parse_page()
                 continue
             for m in range(1, all_num_tag_area + 1):
                 c_area_html = self.browser.page_source
                 c_key_tag = Selector(text=c_area_html).xpath(
                     '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl/dt/text()'
                 ).extract()
                  if c_key_tag[3] == '地区:':
                      try:
                          tag_area = '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[4]/dd/a[{}]'
                          n_tag_area = self.browser.find_element_by_xpath(
                              tag_area.format(m))
                      except Exception:
                          tag_area = '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[3]/dd/a[{}]'
                          n_tag_area = self.browser.find_element_by_xpath(
                              tag_area.format(m))
                  elif c_key_tag[2] == '地区:':
                      try:
                          tag_area = '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[3]/dd/a[{}]'
                          n_tag_area = self.browser.find_element_by_xpath(
                              tag_area.format(m))
                      except Exception:
                          tag_area = '//*[@id="sojob"]/div[1]/form/div[2]/div/div[1]/dl[4]/dd/a[{}]'
                          n_tag_area = self.browser.find_element_by_xpath(
                              tag_area.format(m))
                 webdriver.ActionChains(self.browser).move_to_element(
                     n_tag_area).click().perform()
                 create_sleep()
                 self.click_end_page()
                 self.parse_page()
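Each inner iteration re-runs brower_get() and re-locates elements because every click reloads the filter page, which invalidates previously found WebElements. A hedged helper for that re-locate-then-click pattern (click_fresh is hypothetical, not from the source):

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException


def click_fresh(browser, xpath, retries=3):
    # Re-locate the element right before each click so a page reload
    # between lookup and click does not leave a stale reference.
    for _ in range(retries):
        try:
            element = browser.find_element_by_xpath(xpath)
            webdriver.ActionChains(browser).move_to_element(
                element).click().perform()
            return
        except StaleElementReferenceException:
            create_sleep()  # assumed throttling helper used throughout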
Example #15
    def parse(self, response):
        """
        Pop a listing-page URL from Redis and scrape company info from each job posting.
        :param response:
        :return:
        """
        login_url = response.url
        self.login(login_url)
        # Blocking pop: wait until a listing-page URL is queued in Redis.
        url = r.brpop('liepin', 0)[1]
        self.browser.get(url)
        create_sleep()
        try:
            WebDriverWait(
                self.browser, 10
            ).until(lambda the_browser: the_browser.find_element_by_xpath(
                '//*[@id="sojob"]/div[2]/div/div[1]/div[1]/ul/li/div/div[1]/h3/a'
            ))
        except Exception:
            # Timed out waiting for the result list; continue regardless.
            pass
        c_page_html = self.browser.page_source
        all_job_url = Selector(text=c_page_html).xpath(
            '//*[@id="sojob"]/div[2]/div/div[1]/div[1]/ul/li/div/div[1]/h3/a/@href'
        ).extract()

        for c_job_url in all_job_url:
            if c_job_url.startswith('http'):
                self.browser.get(c_job_url)
            else:
                n_job_url = 'https://www.liepin.com' + c_job_url
                self.browser.get(n_job_url)
            create_sleep()
            try:
                WebDriverWait(self.browser, 10).until(
                    lambda the_browser: the_browser.find_element_by_xpath(
                        '//*[@class="job-qualifications"]/span'))
            except Exception:
                pass
            company_html = self.browser.page_source
            try:
                city = Selector(text=company_html).xpath(
                    '//*[@class="basic-infor"]/span/a/text()').extract()[0]
            except Exception:
                city = Selector(text=company_html).xpath(
                    '//*[@class="basic-infor"]/span').extract()[0].split(
                        '</i>')[-1].split('</span>')[0].strip()
            if city:
                city = city.split('-')[0]
            try:
                name = Selector(text=company_html).xpath(
                    '//*[@class="company-logo"]/p/a/text()').extract()[0]
            except Exception:
                name = Selector(text=company_html).xpath(
                    '//*[@class="company-name"]/text()').extract()[0]
            try:
                size = Selector(text=company_html).xpath(
                    '//*[@class="new-compintro"]/li[2]/text()').extract(
                    )[0].split(':')[-1]
            except Exception:
                try:
                    size = Selector(text=company_html).xpath(
                        '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[4]/div/ul/li[6]'
                    ).extract()[0].split('</span>')[-1].split('</li>')[0]
                except Exception:
                    size = None
            try:
                nature = Selector(text=company_html).xpath(
                    '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[4]/div/ul/li[5]'
                ).extract()[0].split('</span>')[-1].split('</li>')[0]
            except Exception:
                nature = None
            try:
                industry = Selector(text=company_html).xpath(
                    '//*[@class="new-compintro"]/li[1]/a/text()').extract()[0]
            except Exception:
                try:
                    industry = Selector(text=company_html).xpath(
                        '//*[@class="new-compintro"]/li[1]/text()').extract(
                        )[0].split(':')[-1]
                except Exception:
                    try:
                        industry = Selector(text=company_html).xpath(
                            '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[4]/div/ul/li[3]/a/@title'
                        ).extract()[0]
                    except Exception:
                        industry = None
            # website
            try:
                address = Selector(text=company_html).xpath(
                    '//*[@class="new-compintro"]/li[3]/text()').extract(
                    )[0].split(':')[-1]
            except Exception:
                address = None
            # job_qualifications = Selector(text=company_html).xpath(
            #     '//*[@class="job-qualifications"]/span/text()').extract()
            # all_job_qualifications = ''
            # for c_job_qualification in job_qualifications:
            #     all_job_qualifications = all_job_qualifications + '|' + c_job_qualification
            dic = {
                'city': city,
                'name': name,
                'nature': nature,
                'size': size,
                'industry': industry,
                'address': address,
            }
            item = LiepinreceiveItem(dic=dic)
            yield item
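LiepinreceiveItem is constructed with a single dic keyword, so the item class presumably declares one dic field; a minimal matching definition (an assumption, since the real items.py is not shown):

import scrapy


class LiepinreceiveItem(scrapy.Item):
    # The whole scraped record travels in one field.
    dic = scrapy.Field()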
Example #16
    def parse_area(self, response):
        """
        Parse an administrative-district listing page.
        :param response:
        :return:
        """

        city_name = response.meta.get('city_name')
        adname = response.meta.get('adname')
        # 'shop_type' avoids shadowing the built-in type(); the meta key
        # below stays "type" for downstream compatibility.
        shop_type = Selector(response).xpath(
            '//*[@id="classfy"]/a[@class="cur"]/span/text()').extract()[0]
        all_store_url = Selector(response).xpath(
            '//*[@id="shop-all-list"]/ul/li/div[2]/div[1]/a[1]/@href'
        ).extract()
        # extract_first() returns the given default when a node is
        # missing, replacing the original try/except around extract()[0].
        all_comments_count_list = [
            node.xpath('./b/text()').extract_first()
            for node in Selector(response).xpath(
                '//*[@id="shop-all-list"]/ul/li/div[2]/div[2]/a[1]')
        ]
        all_address_list = [
            node.xpath('./span/text()').extract_first()
            for node in Selector(response).xpath(
                '//*[@id="shop-all-list"]/ul/li/div[2]/div[3]')
        ]
        all_consumption_amt_list = []
        for node in Selector(response).xpath(
                '//*[@id="shop-all-list"]/ul/li/div[2]/div[2]/a[2]'):
            amt = node.xpath('./b/text()').extract_first()
            all_consumption_amt_list.append(
                amt.split('¥')[-1] if amt else None)
        all_business_district_list = [
            node.xpath('./span/text()').extract_first()
            for node in Selector(response).xpath(
                '//*[@id="shop-all-list"]/ul/li/div[2]/div[3]/a[2]')
        ]
        all_place_name_list = [
            node.xpath('./h4/text()').extract_first()
            for node in Selector(response).xpath(
                '//*[@id="shop-all-list"]/ul/li/div[2]/div[1]/a')
        ]
        # The default star text means "this merchant has no star rating
        # yet" and is written to the output as-is.
        all_star_list = [
            node.xpath('./span/@title').extract_first('该商户暂无星级')
            for node in Selector(response).xpath(
                '//*[@id="shop-all-list"]/ul/li/div[2]/div[2]')
        ]
        all_quality_list = []
        all_environment_list = []
        all_service_list = []
        for c_con in Selector(response).xpath(
                '//*[@id="shop-all-list"]/ul/li/div[@class="txt"]'):
            all_quality_list.append(c_con.xpath(
                './span[@class="comment-list"]/span[1]/b/text()'
            ).extract_first(''))
            all_environment_list.append(c_con.xpath(
                './span[@class="comment-list"]/span[2]/b/text()'
            ).extract_first(''))
            all_service_list.append(c_con.xpath(
                './span[@class="comment-list"]/span[3]/b/text()'
            ).extract_first(''))
        # The lists above are assumed parallel, one entry per shop.
        for i in range(len(all_store_url)):
            c_store_url = all_store_url[i]
            # 'store_id' avoids shadowing the built-in id(); the meta key
            # below stays "id".
            store_id = c_store_url.split('/')[-1]
            ret = spider_service.select_id(store_id, self.table_name)
            dic = {
                "city_name": city_name,
                "adname": adname,
                "comments_count": all_comments_count_list[i],
                "execute": ret,
                "type": shop_type,
                "address": all_address_list[i],
                "consumption_amt": all_consumption_amt_list[i],
                "id": store_id,
                "business_district": all_business_district_list[i],
                "place_name": all_place_name_list[i],
                "star": all_star_list[i],
                "quality": all_quality_list[i],
                "environment": all_environment_list[i],
                "service": all_service_list[i]
            }
            create_sleep()
            if ret:
                # item = DazhongguangchangItem(dic=dic)
                # yield item
                yield scrapy.Request(url=c_store_url,
                                     meta=dic,
                                     callback=self.parse_store,
                                     headers=header4)