#         'spec.recover': True, 'sch.recover': False, 'score': 750,
#         'detail.parser': False, 'thcnt': 4}
#
# ]

# Crawl detail pages for one province, running a separate FullJobRunner for
# each account in turn with the proxy disabled.
# NOTE(review): the original indentation was lost; this layout assumes both
# the runner construction and runner.run() belong to the per-account loop.
proxies = []
am = AccountManager()

# Read one proxy per line; a trailing None entry is appended afterwards.
with open('proxy_r') as f:
    for l in f:
        proxies.append(l.strip())
proxies.append(None)

idx = [22]
count = len(idx)
name = '青海'
ac = {
    'accounts': [],
    'name': name,
    'prefix': provinces[name],
}
acsa = am.get(name, 8)
i = 0

# Process the accounts in the reverse of the order the manager returned them.
acs = []
for a in acsa:
    acs.append(a)
acs.reverse()

for a in acs:
    a.proxy = None
    runner = FullJobRunner(
        gen_jobs_params(
            [a.gen_run_param()],
            name,
            provinces[name],
            spec=False,
            detail=True,
            school=False,
            spec_parser=False,
            detail_parser=False,
            spec_r=False,
            sch_parser=False,
        )
    )
    runner.run()
#     {
#         'accounts': [
#             {'username': '******', 'proxy': '101.200.178.46:3128', 'password': '******', 'ua': ua[2]},
#         ],
#         'prefix': 'tj', 'sleep': 1.0, 'school': False, 'sch.parser': True, 'kldms': ['5', '1'], 'bkccs': ['2', '1'],
#         'spec.parser': True, 'spec': True, 'detail': False, 'name': 'tianjin', 'detail.recover': True,
#         'spec.recover': True, 'sch.recover': False, 'score': 750,
#         'detail.parser': False, 'thcnt': 4}
#
# ]

# Run the same single-account job for two provinces, one after the other,
# routing every account through the same fixed proxy.
am = AccountManager()

for name in ('宁夏', '青海'):
    ac = {
        'accounts': [],
        'name': name,
        'prefix': provinces[name],
    }
    acs = am.get(name, 1)
    i = 0
    for a in acs:
        a.proxy = '192.168.1.39:3428'
        ac['accounts'].append(a.gen_run_param())
        i += 1
    run_fun(ac)
#
# ]
# jobs = [
#     {
#         'accounts': [
#             {'username': '******', 'proxy': '101.200.178.46:3128', 'password': '******', 'ua': ua[2]},
#         ],
#         'prefix': 'tj', 'sleep': 1.0, 'school': False, 'sch.parser': True, 'kldms': ['5', '1'], 'bkccs': ['2', '1'],
#         'spec.parser': True, 'spec': True, 'detail': False, 'name': 'tianjin', 'detail.recover': True,
#         'spec.recover': True, 'sch.recover': False, 'score': 750,
#         'detail.parser': False, 'thcnt': 4}
#
# ]

# Run one job for a single province, giving each account its own proxy
# picked from the proxy file by the positions listed in ``idx``.
proxies = []
am = AccountManager()

# One proxy per line of the file, surrounding whitespace stripped.
with open('proxy_r') as f:
    for l in f:
        proxies.append(l.strip())

idx = [1, 2, 3, 4, 5, 6, 7]
count = len(idx)
name = '广东'
ac = {
    'accounts': [],
    'name': name,
    'prefix': provinces[name],
}
acs = am.get(name, count)

# ``i`` walks ``idx`` in step with the accounts.
i = 0
for a in acs:
    a.proxy = proxies[idx[i]]
    ac['accounts'].append(a.gen_run_param())
    i += 1

run_fun(ac)
# a.proxy = proxies[idx[i]] # ac['accounts'].append(a.gen_run_param()) # i += 1 # run_fun3(ac) proxies = [] am = AccountManager() with open('proxy_r') as f: for l in f: proxies.append(l.strip()) proxies.append(None) idx = [5, 6] count = len(idx) name = '贵州' ac = {'accounts': [], 'name': name, 'prefix': provinces[name]} acsa = am.get(name, count) i = 0 acs = [] for a in acsa: acs.insert(0, a) for a in acs: a.proxy = proxies[idx[0]] ac['accounts'].append(a.gen_run_param()) runner = FullJobRunner( gen_jobs_params(ac['accounts'], name, provinces[name], spec=False, detail=True, school=False, spec_parser=False,
# ], # 'prefix': 'tj', 'sleep': 1.0, 'school': False, 'sch.parser': True, 'kldms': ['5', '1'], 'bkccs': ['2', '1'], # 'spec.parser': True, 'spec': True, 'detail': False, 'name': 'tianjin', 'detail.recover': True, # 'spec.recover': True, 'sch.recover': False, 'score': 750, # 'detail.parser': False, 'thcnt': 4} # # ] proxies = [] am = AccountManager() with open('proxy_r') as f: for l in f: proxies.append(l.strip()) idx = [16] count = len(idx) acsa = am.get('天津', 8) acs = [] for a in acsa: acs.insert(0, a) name = '天津' for a in acs: a.proxy = '192.168.1.39:3428' runner = FullJobRunner( gen_jobs_params([a.gen_run_param()], name, provinces[name], spec=False, detail=True, school=False, spec_parser=False, detail_parser=False,
# a.proxy = proxies[idx[i]] # ac['accounts'].append(a.gen_run_param()) # i += 1 # run_fun3(ac) proxies = [] am = AccountManager() with open('proxy_r') as f: for l in f: proxies.append(l.strip()) proxies.append(None) idx = [13] count = len(idx) name = '甘肃' ac = {'accounts': [], 'name': name, 'prefix': provinces[name]} acs = am.get(name, 8) i = 0 for a in acs: a.proxy = proxies[idx[0]] runner = FullJobRunner( gen_jobs_params([a.gen_run_param()], name, provinces[name], spec=False, detail=True, school=False, spec_parser=False, detail_parser=False, spec_r=False, detail_r=False,
class JobManager():
    """Run every configured job in its own thread.

    ``jobs`` is a list of dicts; the code reads ``job['name']`` (a key of the
    module-level ``provinces`` mapping) and ``job['count']`` (how many
    accounts to request from the account manager).
    """

    def __init__(self, jobs, proxy='proxy_r'):
        """Create the account/proxy helpers and load the proxy source.

        :param jobs: list of job dicts (see the class docstring).
        :param proxy: value handed to ``load`` on both proxy helpers.
        """
        self.ac = AccountManager()
        self.pm = ProxyManager()
        self.pq = ProxyQueue()
        self.pq.load(proxy)
        self.pm.load(proxy)
        self.jobs = jobs
        self.threads = []
        self.running = False

    def init(self):
        """Assign a proxy and a random user agent to the accounts of each job.

        The first two accounts of every job get ``get_good_proxy(7)``; any
        remaining account that still has no proxy gets ``get_good_proxy(1)``.
        NOTE(review): the meaning of the numeric argument is defined by
        ProxyManager and is not visible here.
        """
        random.seed(int(time.time()))
        for job in self.jobs:
            for ac in self.ac.get(job['name'], 2):
                ac.proxy = self.pm.get_good_proxy(7)
                # random.choice is uniform; the previous
                # ``randint(0, len(ua)) % len(ua)`` picked index 0 twice as
                # often because randint includes its upper bound.
                ac.user_agent = random.choice(ua)
        for job in self.jobs:
            for ac in self.ac.get(job['name'], job['count']):
                if ac.proxy is None:
                    ac.proxy = self.pm.get_good_proxy(1)
                ac.user_agent = random.choice(ua)

    def run(self):
        """Prepare the accounts, run ``runner`` once per job, then save.

        Returns immediately (``None``) when a run is already in progress.
        """
        if self.running:
            return
        self.running = True
        try:
            self.init()
            # Start from a fresh list: a Thread object cannot be started
            # twice, so threads of an earlier run must not be reused.
            self.threads = [
                threading.Thread(target=self.runner, args=(tid,))
                for tid in range(len(self.jobs))
            ]
            for t in self.threads:
                t.start()
                time.sleep(2)  # stagger the thread starts
            self._join_and_save()
        finally:
            self.running = False

    def random_run(self):
        """Run ``rand_runner`` once per job in daemon threads, then save.

        Unlike ``run`` this does not call ``init``; proxies are leased from
        the proxy queue inside each worker.  Returns immediately (``None``)
        when a run is already in progress.
        """
        if self.running:
            return
        self.running = True
        try:
            self.threads = []
            for tid in range(len(self.jobs)):
                t = threading.Thread(target=self.rand_runner, args=(tid,))
                # The daemon flag has to be set before start(); setting it on
                # a running thread raises RuntimeError.
                t.daemon = True
                t.start()
                time.sleep(1)
                self.threads.append(t)
            time.sleep(2)
            self._join_and_save()
        finally:
            self.running = False

    def rand_runner(self, tid):
        """Worker for ``random_run``: crawl job ``tid`` with leased proxies.

        Does nothing when the account manager returns no accounts.  Every
        proxy leased from the queue is released again, and the account's
        ``proxy`` reset to ``None``, even when the crawl raises.
        """
        job = self.jobs[tid]
        ac = {
            'accounts': [],
            'name': job['name'],
            'prefix': provinces[job['name']],
        }
        acs = self.ac.get(job['name'], job['count'])
        # NOTE(review): the original indentation was lost; this assumes the
        # whole crawl was guarded by the "has accounts" check.
        if not acs:
            return
        leased = []
        try:
            for a in acs:
                a.proxy = self.pq.get_good_proxy()
                leased.append(a)
                ac['accounts'].append(a.gen_run_param())
            self._crawl(ac)
        finally:
            for a in leased:
                self.pq.release(a.proxy)
                a.proxy = None

    def runner(self, tid):
        """Worker for ``run``: crawl job ``tid`` with the accounts' own proxies."""
        job = self.jobs[tid]
        ac = {
            'accounts': [],
            'name': job['name'],
            'prefix': provinces[job['name']],
        }
        for a in self.ac.get(job['name'], job['count']):
            ac['accounts'].append(a.gen_run_param())
        self._crawl(ac)

    def _crawl(self, ac):
        """Call ``smart_full_job`` repeatedly until it reports level >= 3."""
        level = 0
        print('%s start ' % ac['name'])
        while level < 3:
            if ac['name'] != '海南':
                level = smart_full_job(ac['accounts'], ac['name'], ac['prefix'], level=level)
            else:
                # This one province passes an extra positional value of 900.
                level = smart_full_job(ac['accounts'], ac['name'], ac['prefix'], 900, level=level)

    def _join_and_save(self):
        """Wait for all threads of the current run, then save the accounts."""
        for t in self.threads:
            t.join()
        self.ac.save()
# 'prefix': 'tj', 'sleep': 1.0, 'school': False, 'sch.parser': True, 'kldms': ['5', '1'], 'bkccs': ['2', '1'], # 'spec.parser': True, 'spec': True, 'detail': False, 'name': 'tianjin', 'detail.recover': True, # 'spec.recover': True, 'sch.recover': False, 'score': 750, # 'detail.parser': False, 'thcnt': 4} # # ] # proxies = [] am = AccountManager() # with open('proxy_r') as f: # for l in f: # proxies.append(l.strip()) # idx = [28] # count = len(idx) name = '江苏' acsa = am.get(name, 3) acs = [] for a in acsa: acs.insert(0, a) for a in acs: a.proxy = '101.226.249.237:80' runner = FullJobRunner( gen_jobs_params([a.gen_run_param()], name, provinces[name], spec=False, detail=True, school=False, spec_parser=False, detail_parser=False, spec_r=False,
class QueueJobManager():
    """Work through the jobs with a fixed pool of threads, one proxy each.

    Every queued job is passed to ``recruit_jobs`` ``times`` times and then
    to ``recheck`` until that reports level >= 3.  ``jobs`` is a list of
    dicts; the code reads ``job['name']`` and ``job['count']``.
    """

    # Per-province score; every other province uses _DEFAULT_SCORE.
    _SCORES = {'海南': 900, '上海': 600, '江苏': 500}
    _DEFAULT_SCORE = 750
    # How long an idle worker waits on the queue before re-checking whether
    # all jobs are finished.
    _POLL_SECONDS = 1

    def __init__(self, jobs, thcnt=2, proxy='proxy_s', times=3):
        """Create the helpers and load the proxy source.

        :param jobs: list of job dicts (see the class docstring).
        :param thcnt: maximum number of worker threads.
        :param proxy: value handed to ``ProxyManager.load``.
        :param times: number of ``recruit_jobs`` rounds per job.
        """
        self.ac = AccountManager()
        self.pq = ProxyManager()
        self.pq.load(proxy)
        self.jobs = jobs
        self.threads = []
        self.running = False
        self.thread_cnt = thcnt
        self.times = times
        self.job_queue = Queue.Queue()
        self.done_job = 0
        # Number of jobs actually put on the queue; this can be smaller than
        # len(jobs) because jobs without accounts are not queued.
        self.queued_jobs = 0
        self.job_lock = threading.RLock()

    def distpatch(self):
        """Build the parameter dict of every job and queue it.

        NOTE(review): the original indentation was lost; this assumes a job
        is only queued when the account manager returned accounts for it.
        """
        for job in self.jobs:
            name = job['name']
            ac = {
                'accounts': [],
                'name': name,
                'prefix': provinces[name],
                'score': self._SCORES.get(name, self._DEFAULT_SCORE),
                'times': 0,
            }
            acs = self.ac.get(name, job['count'])
            if len(acs) > 0:
                for a in acs:
                    ac['accounts'].append(a.gen_run_param())
                self.job_queue.put(ac)
                self.queued_jobs += 1

    def run(self):
        """Queue all jobs, run the workers to completion, save the accounts.

        At most ``thread_cnt`` workers are created; fewer when
        ``get_good_proxy`` returns ``None`` first.  Returns immediately
        (``None``) when ``running`` is already set; the flag is not cleared
        afterwards, so an instance runs once.
        """
        if self.running:
            return
        self.running = True
        self.distpatch()
        for tid in range(self.thread_cnt):
            proxy = self.pq.get_good_proxy()
            if proxy is None:
                break
            t = threading.Thread(target=self.runner, args=(proxy,))
            self.threads.append(t)
        for t in self.threads:
            t.start()
            time.sleep(2)  # stagger the thread starts
        for t in self.threads:
            t.join()
        self.ac.save()
        self.threads = []

    def runner(self, proxy):
        """Worker loop: process queued jobs with ``proxy`` until all are done.

        The worker re-checks the finished-job counter before every queue
        read and never waits on the queue indefinitely, so all workers end
        once the last job is finished.  Previously only the worker that
        finished the last job left the loop and the others blocked forever
        in ``get()``.
        """
        while True:
            with self.job_lock:
                if self.done_job >= self.queued_jobs:
                    break
            try:
                ac = self.job_queue.get(timeout=self._POLL_SECONDS)
            except Queue.Empty:
                # Another worker currently holds the remaining job(s).
                continue
            # A job whose processing raises is not re-queued, so it is
            # counted as finished to keep the other workers from waiting.
            finished = True
            try:
                finished = self._advance(ac, proxy)
            finally:
                if finished:
                    with self.job_lock:
                        self.done_job += 1

    def _advance(self, ac, proxy):
        """Run the next stage of job ``ac``; return True once it is complete."""
        ac['proxy'] = proxy
        if ac.get('times', 0) < self.times:
            print('%s start %d' % (ac['name'], ac['times']))
            recruit_jobs(ac)
            ac['times'] += 1
            ac['proxy'] = None
            self.job_queue.put(ac)
            return False
        print('%s start check crawling' % ac['name'])
        level = 0
        while level < 3:
            level = recheck(ac, level=level)
        return True
class JobManager():
    """Run every configured job in its own thread via ``recruit_jobs``.

    ``jobs`` is a list of dicts; the code reads ``job['name']`` (a key of the
    module-level ``provinces`` mapping) and ``job['count']``, and ``init``
    stores the chosen proxy under ``job['proxy']``.
    """

    # Per-province score; every other province uses _DEFAULT_SCORE.
    _SCORES = {'海南': 900, '上海': 600, '江苏': 500}
    _DEFAULT_SCORE = 750

    def __init__(self, jobs, proxy='proxy_s'):
        """Create the account/proxy helpers and load the proxy source.

        :param jobs: list of job dicts (see the class docstring).
        :param proxy: value handed to ``load`` on both proxy helpers.
        """
        self.ac = AccountManager()
        self.pm = ProxyManager()
        self.pq = ProxyQueue()
        self.pq.load(proxy)
        self.pm.load(proxy)
        self.jobs = jobs
        self.threads = []
        self.running = False
        self.job_queue = Queue.Queue()

    def init(self):
        """Pick one proxy per job and a random user agent per account."""
        random.seed(int(time.time()))
        for job in self.jobs:
            job['proxy'] = self.pm.get_good_proxy(1)
            for ac in self.ac.get(job['name'], job['count']):
                # random.choice is uniform; the previous
                # ``randint(0, len(ua)) % len(ua)`` picked index 0 twice as
                # often because randint includes its upper bound.
                ac.user_agent = random.choice(ua)

    def run(self):
        """Prepare the jobs, run ``runner`` once per job, then save.

        Returns immediately (``None``) when a run is already in progress.
        """
        if self.running:
            return
        self.running = True
        try:
            self.init()
            # Start from a fresh list: a Thread object cannot be started
            # twice, so threads of an earlier run must not be reused.
            self.threads = [
                threading.Thread(target=self.runner, args=(tid,))
                for tid in range(len(self.jobs))
            ]
            for t in self.threads:
                t.start()
                time.sleep(2)  # stagger the thread starts
            self._join_and_save()
        finally:
            self.running = False

    def random_run(self):
        """Run ``rand_runner`` once per job, then save.

        Unlike ``run`` this does not call ``init``; each worker leases its
        proxy from the proxy queue.  Returns immediately (``None``) when a
        run is already in progress.
        """
        if self.running:
            return
        self.running = True
        try:
            self.threads = []
            for tid in range(len(self.jobs)):
                t = threading.Thread(target=self.rand_runner, args=(tid,))
                t.start()
                time.sleep(1)
                self.threads.append(t)
            time.sleep(2)
            self._join_and_save()
        finally:
            self.running = False

    def rand_runner(self, tid):
        """Worker for ``random_run``: crawl job ``tid`` with a leased proxy.

        Does nothing when the account manager returns no accounts.  The
        leased proxy is now stored in the job parameters (it was previously
        acquired but never passed on, leaving ``'proxy'`` as ``None``) and is
        released even when ``recruit_jobs`` raises.
        """
        ac = self._job_params(self.jobs[tid], None, skip_empty=True)
        if ac is None:
            return
        proxy = self.pq.get_good_proxy()
        ac['proxy'] = proxy
        try:
            print('%s start ' % ac['name'])
            recruit_jobs(ac)
        finally:
            self.pq.release(proxy)

    def runner(self, tid):
        """Worker for ``run``: crawl job ``tid`` with the proxy set by ``init``."""
        job = self.jobs[tid]
        ac = self._job_params(job, job['proxy'])
        print('%s start ' % ac['name'])
        recruit_jobs(ac)

    def _job_params(self, job, proxy, skip_empty=False):
        """Build the dict handed to ``recruit_jobs`` for ``job``.

        Returns ``None`` instead when ``skip_empty`` is set and the account
        manager returns no accounts.
        """
        name = job['name']
        ac = {
            'accounts': [],
            'name': name,
            'prefix': provinces[name],
            'score': self._SCORES.get(name, self._DEFAULT_SCORE),
            'proxy': proxy,
        }
        acs = self.ac.get(name, job['count'])
        if skip_empty and not acs:
            return None
        for a in acs:
            ac['accounts'].append(a.gen_run_param())
        return ac

    def _join_and_save(self):
        """Wait for all threads of the current run, then save the accounts."""
        for t in self.threads:
            t.join()
        self.ac.save()