Esempio n. 1
0
 def get_position_detail(self, url, position, cookies=None):
     """Fetch a position's detail page, merge the parsed fields into
     *position*, then persist it via ``save_infos``.

     :param url: detail-page URL of the position
     :param position: dict of already-collected fields; mutated in place
     :param cookies: optional cookies forwarded to the HTTP request
     """
     response = request.get(url, cookies=cookies)
     self.request_count += 1
     if response:
         html = etree.HTML(response.content)
         # Evaluate the title XPath once instead of twice.
         title = html.xpath('//title/text()')
         self.logger.info(title[0] if title else 'title error ')
         job_nature = html.xpath(
             "//dd[@class='job_request']/p[1]/span[5]/text()")
         job_nature = job_nature[0].strip() if job_nature else ''
         job_detail = html.xpath("//dd[@class='job_bt']/div//text()")
         job_detail = [item.strip() for item in job_detail if item.strip()]
         job_detail = '\n'.join(job_detail).strip()
         job_address = html.xpath("//div[@class='work_addr']//text()")
         job_address = [item.strip() for item in job_address]
         # Drop the last two text nodes (assumed map-link boilerplate —
         # TODO confirm against the page markup).
         job_address = ''.join(job_address[:-2])
         district = html.xpath("//div[@class='work_addr']/a[2]/text()")
         district = district[0].strip() if district else ''
         position['job_nature'] = job_nature
         position['job_detail'] = job_detail
         position['job_address'] = job_address
         position['district'] = district
     else:
         self.except_count += 1
     # Save even on request failure so the partially-filled item is kept.
     self.save_infos(position)
Esempio n. 2
0
 def get_positons_list(self, url, item, cookies):
     """Request *url* and hand the result over to ``get_new_list``.

     Each downstream task receives its own deepcopy of *item* so
     concurrent handlers never share a dict. A failed request is
     counted but still forwarded, since ``get_new_list`` tolerates a
     falsy response.
     """
     self.request_count += 1
     response = request.get(url, cookies=cookies)
     if not response:
         self.except_count += 1
     else:
         cookies = response.cookies
     self.get_new_list(response, copy.deepcopy(item), cookies)
Esempio n. 3
0
 def get_positions_urls(self, list_url, item, cookies=None):
     """Parse one listing page and spawn a detail-fetch task for every
     position URL not seen before.

     :param list_url: URL of the paginated listing page
     :param item: dict of fields shared by all positions on this page;
         reused across iterations, so each spawned task gets a deepcopy
     :param cookies: optional cookies; refreshed from the response
     """
     self.logger.debug(type(cookies))
     response = request.get(list_url, cookies=cookies)
     self.request_count += 1
     if response:
         cookies = response.cookies
         html = etree.HTML(response.content)
         # Evaluate the title XPath once instead of twice.
         title = html.xpath('//title/text()')
         self.logger.info(title[0] if title else 'title error')
         item_list = html.xpath("//ul[@class='item_con_list']/li")
         for position in item_list:
             publish_date = position.xpath(
                 ".//span[@class='format-time']/text()")[0]
             publish_date = self.switch_publish_date(publish_date)
             url = position.xpath(".//a[@class='position_link']/@href")[0]
             # Skip URLs already queued in this run or stored in the DB.
             if url not in self.urls and not self.lagou_db.isexist_url(url):
                 self.urls.append(url)
                 position_name = position.xpath("@data-positionname")[0]
                 salary = position.xpath("@data-salary")[0]
                 other = position.xpath(
                     ".//div[@class='li_b_l']/text()")[2].strip()
                 # Split once and reuse both halves; work_year strips a
                 # two-char prefix (presumably "经验" — confirm).
                 other_parts = other.split('/')
                 add = position.xpath(".//span[@class='add']/em/text()")[0]
                 city = add.split('·')[0]
                 company_name = position.xpath("@data-company")[0]
                 item['position_name'] = position_name.strip()
                 item['publish_date'] = publish_date
                 item['salary'] = salary.strip()
                 item['education'] = other_parts[1].strip()
                 item['work_year'] = other_parts[0][2:].strip()
                 item['city'] = city.strip()
                 item['company_name'] = company_name.strip()
                 item['url'] = url.strip()
                 item['job_nature'] = ''
                 item['job_detail'] = ''
                 item['job_address'] = ''
                 item['district'] = ''
                 g = gevent.spawn(self.get_position_detail,
                                  url,
                                  copy.deepcopy(item),
                                  cookies=cookies)
                 self.pool.add(g)
             else:
                 self.logger.debug('此url %s 已经存在!' % url)
     else:
         self.except_count += 1
Esempio n. 4
0
def update_data():
    """Re-crawl and update ``job_address``/``job_detail`` for recent
    positions whose stored city/district disagree with the address
    string parsed from ``job_address``.
    """
    db = dbmysql.DB()
    sql = "select * from positions where publish_date > '2018-04-06'"
    positions = db.fetchall(sql)
    # Hoisted out of the loop; also avoids shadowing the select above.
    update_sql = ('update positions set job_address=:job_address,'
                  'job_detail=:job_detail where url=:url')
    for position in positions:
        try:
            city, district = position['job_address'].split('-')[0:2]
        except Exception:
            # Malformed or missing address: force a re-crawl below.
            city = ''
            district = ''
        if city == position['city'] and district == position['district']:
            # Stored fields already consistent; nothing to update.
            continue
        response = request.get(position['url'])
        html = etree.HTML(response.content)
        job_detail = html.xpath("//dd[@class='job_bt']/div//text()")
        job_detail = [item.strip() for item in job_detail if item.strip()]
        job_detail = '\n'.join(job_detail).strip()
        job_address = html.xpath("//div[@class='work_addr']//text()")
        job_address = [item.strip() for item in job_address]
        # Drop the last two text nodes (assumed map-link boilerplate).
        job_address = ''.join(job_address[:-2])
        # Copy the DB row so it is a plain, JSON-loggable dict.
        position = dict(position)
        position['publish_date'] = str(position['publish_date'])
        print_log(position['url'], position)
        db.edit(
            update_sql, {
                'job_detail': job_detail,
                'job_address': job_address,
                'url': position['url']
            })
Esempio n. 5
0
 def get_new_list(self, response, item, cookies):
     """Follow the per-category listing URL, read the total page count
     and spawn one ``get_positions_list`` task per page.

     :param response: response of the previous step; a falsy value means
         that request already failed and nothing is done here
     :param item: category fields; each spawned task gets its own deepcopy
     :param cookies: cookies forwarded to the listing request
     """
     if response:
         new_url = self.second_url % (item['first_type'])
         response = request.get(url=new_url, cookies=cookies)
         self.request_count += 1
         if response:
             referer = response.url
             html = etree.HTML(response.content)
             page_num = html.xpath("//span[@class='span totalNum']/text()")
             # A missing counter node means a single, unpaginated page.
             page_num = int(page_num[0]) if page_num else 1
             for num in range(1, page_num + 1):
                 g = gevent.spawn(self.get_positions_list, num,
                                  copy.deepcopy(item), referer, cookies)
                 self.pool.add(g)
         else:
             self.except_count += 1
Esempio n. 6
0
    def get_position_detail(self, url, position, cookies=None):
        """Fetch the detail page for *position*, fill in ``job_detail``
        and ``job_address``, then save it.

        :param url: detail-page URL
        :param position: dict mutated in place; always saved afterwards
        :param cookies: optional cookies forwarded to the HTTP request
        """
        response = request.get(url=url, cookies=cookies)
        self.request_count += 1
        if response:
            html = etree.HTML(response.content)
            # Evaluate the title XPath once instead of twice.
            title = html.xpath('//title/text()')
            self.logger.info(title[0] if title else 'title error ')

            job_detail = html.xpath("//dd[@class='job_bt']/div//text()")
            job_detail = [item.strip() for item in job_detail if item.strip()]
            job_detail = '\n'.join(job_detail).strip()

            job_address = html.xpath("//div[@class='work_addr']//text()")
            job_address = [item.strip() for item in job_address]
            # Drop the last two text nodes (assumed map-link boilerplate).
            job_address = ''.join(job_address[:-2])

            position['job_detail'] = job_detail
            position['job_address'] = job_address
        else:
            self.except_count += 1
        # Persist even when the request failed, keeping partial data.
        self.save_infos(position)
Esempio n. 7
0
 def start_spider(self):
     """Entry point of the spider.

     Crawls the start page, walks the category menu and spawns one
     ``get_positons_list`` task per selected position, then waits for
     the pool to drain and mails a run summary.
     """
     self.count = 0
     self.request_count = 0
     self.except_count = 0
     self.error_count = 0
     self.urls = []
     start_time = time.time()
     response = request.get(self.start_url)
     self.request_count += 1
     if response:
         cookies = response.cookies
         html = etree.HTML(response.content)
         # Log via the spider logger for consistency with other methods
         # (was a bare print).
         self.logger.info(html.xpath("//title/text()")[0])
         menu = html.xpath("//div[@class='menu_sub dn']")[0]
         positions_dict = {}
         types = menu.xpath("dl")
         for item in types:
             type_name = item.xpath("dt/span/text()")[0]
             positions_dict[type_name] = {}
             positions = item.xpath("dd/a")
             # NOTE(review): only the first position of every category is
             # crawled ([0:1]) — looks like a debug limit; confirm.
             for position in positions[0:1]:
                 position_name = position.text
                 position_url = position.xpath('@href')[0]
                 positions_dict[type_name][position_name] = position_url
                 position_data = {
                     'first_type': position_name,
                     'second_type': type_name,
                 }
                 g = gevent.spawn(self.get_positons_list, position_url,
                                  position_data, cookies)
                 self.pool.add(g)
     else:
         self.except_count += 1
     self.pool.join()
     self.send_email(start_time)
Esempio n. 8
0
 def get_positons_list(self, url, item, cookies):
     """Open a category page, read its total page count and spawn one
     ``get_positions_urls`` task per page.

     Bails out when the page title is missing or is the generic search
     page, which indicates the request was redirected or blocked.
     """
     response = request.get(url, cookies=cookies)
     self.request_count += 1
     if not response:
         self.except_count += 1
         return
     cookies = response.cookies
     # Parse once (the original parsed the same content twice).
     html = etree.HTML(response.content)
     title = html.xpath('//title/text()')
     if not title or title[0] == '找工作-互联网招聘求职网-拉勾网':
         self.logger.error(url + '  error ')
         return
     page_num = html.xpath("//span[@class='span totalNum']/text()")
     # A missing counter node means a single, unpaginated page, so
     # page_num is always >= 1 and needs no extra > 0 guard.
     page_num = int(page_num[0]) if page_num else 1
     for num in range(1, page_num + 1):
         list_url = '%s%d/' % (url, num)
         g = gevent.spawn(self.get_positions_urls,
                          list_url,
                          copy.deepcopy(item),
                          cookies=cookies)
         self.pool.add(g)