def process_one_cat(url, cat_list):
    """Crawl every listing page of one job category and store the jobs.

    Pages are requested as ``url + <page number> + "/"``. The current page
    number and the "last page" flag are read from ``global_var``; this
    function only initialises them (page 1, not last page).
    NOTE(review): presumably ``Lagou.getJobListPerPage`` advances
    ``PAGE_NUM_PROCESSING`` and sets ``isLastPage`` -- confirm, otherwise
    this loop never terminates.

    Args:
        url: Category URL prefix to which the page number is appended.
        cat_list: Category path passed through unchanged to ``db.insert``
            together with each job.
    """
    first_page = 1
    global_var._init()
    global_var.set_value("PAGE_NUM_PROCESSING", first_page)
    global_var.set_value("isLastPage", False)

    while True:
        page_num = global_var.get_value('PAGE_NUM_PROCESSING')
        tmp_url = url + str(page_num) + "/"
        print("while True main:" + tmp_url)

        # A fresh HTTP session is used per page, as before, but it is now
        # closed deterministically (also when an exception escapes) instead
        # of leaking its pooled connections.
        with requests.Session() as s:
            lg = Lagou()
            pagegen = lg.getJobListPerPage(tmp_url, s)
            for item in pagegen:
                # Random pause of 1.05 to 6.0 seconds before storing each
                # batch yielded by the generator.
                time_wait = 1 + float(random.randint(1, 100)) / 20
                time.sleep(time_wait)
                print("休息时间:" + str(time_wait))
                for job in item:
                    db.insert(job, cat_list)

        print("跳出生成器")
        print("休息5秒钟")
        time.sleep(5)

        if global_var.get_value("isLastPage"):
            print("----------爬取结束---------,共" + str(global_var.get_value('PAGE_NUM_PROCESSING')) + "页")
            break
def do_task(city, query):
    """Run the Boss, Lagou and Zhilian scrapers one after another.

    Each scraper is constructed with ``(city, query)`` and its ``open_url``
    method is called before the next scraper is constructed.

    Args:
        city: City argument forwarded to every scraper constructor.
        query: Search query forwarded to every scraper constructor.
    """
    # Keep every instance referenced until the function returns, exactly as
    # the separate local variables did before.
    opened_scrapers = []
    for scraper_cls in (Boss, Lagou, Zhilian):
        scraper = scraper_cls(city, query)
        scraper.open_url()
        opened_scrapers.append(scraper)
def __init__(self):
    """Create the task queue and scraper objects and set per-site settings.

    For each site (``boss`` and ``lagou``) three attributes are stored: the
    queue key, the task level (2) and the URL level (1).
    """
    # Collaborators, created in the same order as before.
    self.mrq = MyRedisQueue()
    self.boss_obj = Boss()
    self.lagou_obj = Lagou()

    # Per-site settings: (queue key, task level, URL level).
    self.boss_key, self.boss_task_level, self.boss_url_level = 'boss', 2, 1
    self.lagou_key, self.lagou_task_level, self.lagou_url_level = 'lagou', 2, 1
def get_lagou():
    """Interactively collect a keyword and a city, then run the Lagou crawler.

    The keyword prompt is repeated until the confirmation answer is exactly
    ``"y"``. The crawler is then built with the entered city and keyword
    (``username``, ``password`` and ``n`` are passed as ``None``;
    ``rootfile`` comes from the enclosing module) and its ``main`` method is
    invoked.
    """
    keyword = input("【拉勾网】 请输入搜索关键词: ")
    confirmation = input("是否确定?y/n: ")
    # Any answer other than "y" asks for the keyword again.
    while confirmation != "y":
        keyword = input(">> 【拉勾网】 请输入搜索关键词: ")
        confirmation = input("是否确定?y/n: ")

    city = input(">> 【拉勾网】 请输入城市: ")
    print("Sucessfully! 开始采集【拉勾网 %s-%s】 数据..." % (keyword, city))

    crawler = Lagou(
        username=None,
        password=None,
        city=city,
        keywords=keyword,
        n=None,
        rootfile=rootfile,
    )
    # NOTE: removing previously stored rows for this city/keyword
    # (crawler.dbtable.remove) is currently disabled.
    crawler.main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Dongyouyuan
# @Software: PyCharm
# @File: main.py
# @Time: 17-11-23 11:57 AM

from lagou import Lagou
from db import init_db

if __name__ == "__main__":
    # Initialise the database (needed the first time the script is run).
    init_db()

    # NOTE(review): this is a hard-coded browser session cookie; consider
    # loading it from configuration instead of keeping it in the source.
    session_cookie = "user_trace_token=20171120113056-ba4eb409-b665-4fc9-8cbf-92ea7e4b1520; LGUID=20171120113057-38d05b37-cda3-11e7-996a-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; X_MIDDLE_TOKEN=8f7c1b487db070b59111650e8396d3d3; X_HTTP_TOKEN=9cfa44c75b49564bf4bb24410edeb4e7; TG-TRACK-CODE=index_hotsearch; JSESSIONID=ABAAABAACDBAAIAFDC183EB010548C17203563788B54991; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511148657,1511172863,1511258038; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511328820; LGSID=20171122133340-b23fe104-cf46-11e7-9986-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fzhaopin%2FPHP%2F%3FlabelWords%3Dlabel%3FlabelWords%3Dhot; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_PHP%3Fpx%3Ddefault%26city%3D%25E5%25B9%25BF%25E5%25B7%259E; LGRID=20171122133340-b23fe39c-cf46-11e7-9986-5254005c3644; _ga=GA1.2.525386246.1511148657; _gid=GA1.2.2044927823.1511148657; hibext_instdsigdip=1; SEARCH_ID=107bd6aa53a042e5955ff22f624fe3c5"

    # Search for "python" positions in the city given below.
    crawler = Lagou(cookie=session_cookie, city="杭州", lang="python")
    crawler.get_data(page_sum=20)
    print(crawler.list_dict_results)
    # Persisting the results is currently disabled: crawler.insert_to_db()
time_wait = 1 + float(random.randint(1, 100)) / 20 time.sleep(time_wait) print("休息时间:" + str(time_wait)) for job in item: db.insert(job, cat_list) print("跳出生成器") print("休息5秒钟") time.sleep(5) if global_var.get_value("isLastPage"): print("----------爬取结束---------,共" + str(global_var.get_value('PAGE_NUM_PROCESSING')) + "页") break lg = Lagou() top_cat_dict = lg.get_all_positions() for top_cat_key in top_cat_dict: # print(top_cat_key + ":" + top_cat_dict[top_cat_key]) grade2_cat_dict = top_cat_dict[top_cat_key] for grade2_key in grade2_cat_dict: # print(grade2_key + ":" + grade2_cat_dict[grade2_key]) grade3_cat_dict = grade2_cat_dict[grade2_key] for grade3_key in grade3_cat_dict: url = grade3_cat_dict[grade3_key] print(top_cat_key) print(grade2_key) print(grade3_key) cat_list = [top_cat_key, grade2_key, grade3_key]
class Main:
    """Queue-driven crawler for the Boss and Lagou job sites.

    Listing-page URLs are pushed to a ``MyRedisQueue`` at the site's "task"
    level by ``boss_task`` / ``lagou_task``. The worker loops pop tasks,
    turn listing pages into detail URLs (pushed back at the "url" level) and
    fetch the details of each position.
    """

    # Seconds to pause after handling each queued task.
    REQUEST_INTERVAL_SECONDS = 15

    def __init__(self):
        """Create the queue and scraper objects and set per-site settings."""
        self.mrq = MyRedisQueue()
        self.boss_obj = Boss()
        self.lagou_obj = Lagou()
        self.boss_key = 'boss'
        self.boss_task_level = 2
        self.boss_url_level = 1
        self.lagou_key = 'lagou'
        self.lagou_task_level = 2
        self.lagou_url_level = 1

    def _process_next_task(self, key, scraper, task_level, url_level):
        """Pop one task for ``key`` from the queue and handle it.

        The level is the text after the last ``-`` in the popped task type.
        A task-level entry is expanded into detail URLs that are pushed back
        at ``url_level``; a URL-level entry has its details fetched. Any
        other level is ignored. The method then sleeps for
        ``REQUEST_INTERVAL_SECONDS``.
        """
        task_type, task = self.mrq.pop_task(keys=[key])
        # Single-argument form prints the same text on Python 2 and 3.
        print("%s %s" % (task_type, task))
        level = task_type.split('-')[-1]
        if level == str(task_level):
            # Listing page: collect the URLs of the individual positions.
            detail_urls = scraper.get_position_url(task)
            self.mrq.push_task(key, detail_urls, url_level)
        elif level == str(url_level):
            # Detail page: fetch the information of one position.
            scraper.position_detail(task)
        # Throttle requests to the site.
        time.sleep(self.REQUEST_INTERVAL_SECONDS)

    def boss_worker(self):
        """Process queued Boss tasks forever (never returns)."""
        while True:
            self._process_next_task(self.boss_key, self.boss_obj,
                                    self.boss_task_level,
                                    self.boss_url_level)

    def lagou_worker(self):
        """Process queued Lagou tasks forever (never returns)."""
        while True:
            self._process_next_task(self.lagou_key, self.lagou_obj,
                                    self.lagou_task_level,
                                    self.lagou_url_level)

    def boss_task(self):
        """Queue the first result page of every Boss search term."""
        pos_lst = ['Python', 'Python爬虫', 'Python数据分析', '机器学习', '数据挖掘', '大数据']
        url_str = 'https://www.zhipin.com/c100010000/h_100010000/?query={pos}&page={page}&ka=page-{page}'
        url = [
            url_str.format(page=str(i + 1), pos=p)
            for p in pos_lst
            for i in range(1)
        ]
        self.mrq.push_task(self.boss_key, url, level=self.boss_task_level)

    def lagou_task(self):
        """Queue result pages 1-5 of every Lagou search term."""
        pos_lst = ['Python', 'shujuwajue']
        url = [
            'https://www.lagou.com/zhaopin/{pos}/{page}/'.format(
                pos=p, page=str(i + 1))
            for p in pos_lst
            for i in range(5)
        ]
        self.mrq.push_task(self.lagou_key, url, level=self.lagou_task_level)