# NOTE: the stdlib imports below are what this code actually uses; Spider,
# BinReader, FileSaver, PageStoreJobUI and AccountErrors come from the
# project's internal framework, whose module paths are not shown in this file,
# so those imports are left as commented placeholders.
import re
import sys
import time
import random

import spider.util
# from <internal framework> import Spider, BinReader, FileSaver, AccountErrors
# from <internal pagestore module> import PageStoreJobUI


class JobuiBin2DB(Spider):
    """ Read the .bin dump files and write their records into the page store. """

    def __init__(self, thcnt):
        Spider.__init__(self, thcnt)
        self.num_count = 0
        self.page_store = PageStoreJobUI()
        self.page_store.testmode = False
        self.bin_list = [
            'jobui_job_data1.bin',
            'jobui_job_bu.bin',
            'jobui_job_data2.bin'
        ]

    def wait_q_breakable(self):
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        for binname in self.bin_list:
            reader = BinReader("./jobdata/" + binname)
            while True:
                # was "(a, b) = self.bs.readone()": self.bs is never defined,
                # so read from the BinReader opened just above
                (a, b) = reader.readone()
                if a is None:
                    break
                job = {"index": a, "html": b}
                self.add_job(job, True)
        self.wait_q_breakable()
        self.add_job(None, True)

    def get_fail_cnt(self, addv):
        fc = getattr(self._curltls, 'failcount', 0)
        if addv:
            fc += addv
            setattr(self._curltls, 'failcount', fc)
        return fc

    def run_job(self, jobid):
        # index looks like: jobui_job.131298184.1451893899
        # page url:         http://www.jobui.com/job/131298184/
        id = jobid.get("index").split(".")[1]
        url = "http://www.jobui.com/job/%s/" % (id)
        html = jobid.get("html")
        self.page_store.save(int(time.time()), id, url, html)
        self.num_count += 1

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)
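
# Hedged sketch: the real BinReader (and its on-disk .bin layout) is not shown
# in this file, so this in-memory stand-in only mirrors the readone() contract
# JobuiBin2DB.dispatch relies on: it yields (index, html) tuples and returns
# (None, None) once the data is exhausted. Useful for exercising dispatch()
# without the real jobdata files.
class FakeBinReader(object):
    def __init__(self, records):
        # records: list of (index, html) tuples,
        # e.g. ("jobui_job.131298184.1451893899", "<html>...</html>")
        self._records = list(records)
        self._pos = 0

    def readone(self):
        if self._pos >= len(self._records):
            return (None, None)
        record = self._records[self._pos]
        self._pos += 1
        return record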
class JobuiSpider(Spider):
    """ jobui incremental crawl: fetch pages and store them directly into the database. """

    def __init__(self):
        self.proxies_dict = []
        self.read_proxy("../spider/proxy/proxy.txt")
        Spider.__init__(self, len(self.proxies_dict))
        self.success_count = 0
        self.request_count = 0
        self.__fail_ids = FileSaver("fail_ids.txt")
        self.start_time = time.time()
        self.page_store = PageStoreJobUI()
        self.page_store.testmode = True

    def wait_q_breakable(self):
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        i = 130000000
        while i < 140000000:
            job = {"id": i}
            self.add_job(job, True)
            i += 1
        self.wait_q_breakable()
        self.add_job(None, True)

    def get_fail_cnt(self, addv):
        fc = getattr(self._curltls, 'failcount', 0)
        if addv:
            fc += addv
            setattr(self._curltls, 'failcount', fc)
        return fc

    def run_job(self, jobid):
        jobid_int = jobid.get("id")
        url = "http://www.jobui.com/job/%d/" % (jobid_int)
        tid = self.get_tid()
        proxies = self.proxies_dict[tid]
        res = self.request_url(url, proxies=proxies)
        self.request_count += 1
        if res is None:
            if self.get_fail_cnt(1) < 10:
                self.add_job(jobid)
            else:
                print "thread [ %s ]: proxy [ %s ] will be closed and dropped." % (tid, proxies)
                self.__fail_ids.append(str(jobid_int))
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy [ %s ] is invalid, failcount = [ %d ]" % (proxies, self.get_fail_cnt(0)))
            return
        else:
            setattr(self._curltls, 'failcount', 0)

        if res.code == 404:
            print "%d ======> 404" % jobid_int
            return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%d ------> %d " % (jobid_int, res.code)
            self.add_job(jobid)
            time.sleep(1)
            return
        elif res.code == 200:
            print "%d ----> will be saved into the database ..." % jobid_int
            self.page_store.save(int(time.time()), str(jobid_int), url, res.text)
            self.success_count += 1
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code
            self.__fail_ids.append(str(jobid_int))
            #raise AccountErrors.NoAccountError('fatal error')
        #if self.request_count % 10000 == range(0,9):
        print "request_count:{},success_count:{},request_speed:{}".format(
            self.request_count, self.success_count,
            self.request_count / (time.time() - self.start_time))

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "request_count:{},success_count:{},request_speed:{}".format(
                self.request_count, self.success_count,
                self.request_count / (time.time() - self.start_time))
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)

    def read_proxy(self, fn):
        with open(fn, 'r') as f:
            for line in f:
                line = line.strip()
                m = re.match(r'(\d+\.\d+\.\d+\.\d+:\d+)', line, re.I)
                if m:
                    prstr = m.group(1)
                    proxies = {
                        'http': 'http://' + prstr + "/",
                        'https': 'https://' + prstr + "/"
                    }
                    self.proxies_dict.append(proxies)
                elif re.match(r'\s*#', line):
                    continue
        print " loaded [ %d ] proxies " % len(self.proxies_dict)
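
# Hedged illustration of the ../spider/proxy/proxy.txt format that
# JobuiSpider.read_proxy above accepts (derived from its regex): one "ip:port"
# entry per line, with "#"-prefixed lines skipped. The addresses below are
# placeholders, not real proxies.
#
#   # primary pool
#   121.40.199.105:8080
#   222.85.39.29:3128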
class JobuiSpiderUrlAdd(Spider):
    def __init__(self):
        self.proxies_dict = []
        self.read_proxy("../spider/proxy/proxy.txt")
        Spider.__init__(self, len(self.proxies_dict))
        self.success_count = 0
        self.request_count = 0
        self.__fail_add_url = FileSaver("fail_add_url.txt")
        self.start_time = time.time()
        self.domain = self.read_domain()
        self.domain_file = FileSaver("domains.txt")
        self.page_store = PageStoreJobUI()
        self.page_store.testmode = True

    def wait_q_breakable(self):
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        with open('job_url.txt', 'r') as f:
            while True:
                line = f.readline()
                # readline() returns '' at EOF (never None), so test for the empty string
                if not line:
                    break
                job = {"url": line.strip()}
                self.add_job(job, True)
        self.wait_q_breakable()
        self.add_job(None, True)

    def get_fail_cnt(self, addv):
        fc = getattr(self._curltls, 'failcount', 0)
        if addv:
            fc += addv
            setattr(self._curltls, 'failcount', fc)
        return fc

    def run_job(self, jobid):
        url = jobid.get("url")
        tid = self.get_tid()
        proxies = self.proxies_dict[tid]
        res = self.request_url(url, proxies=proxies)
        self.request_count += 1
        if res is None:
            if self.get_fail_cnt(1) < 3:
                self.add_job(jobid)
            else:
                print "thread [ %s ]: proxy [ %s ] will be closed and dropped." % (tid, proxies)
                self.__fail_add_url.append(url)
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy [ %s ] is invalid, failcount = [ %d ]" % (proxies, self.get_fail_cnt(0)))
            return
        else:
            setattr(self._curltls, 'failcount', 0)

        if res.code == 404:
            print "%s ======> 404" % url
            return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%s ------> %d " % (url, res.code)
            self.add_job(jobid)
            time.sleep(1)
            return
        elif res.code == 200:
            print "%s ----> will be saved into the database ..." % url
            # example: http://www.jobui.com/job/92336088/
            m = re.search(ur'http://www.jobui.com/job/(\d+)/', url)
            if m:
                jid = m.group(1)
                self.page_store.save(int(time.time()), jid, url, res.text)
                self.success_count += 1
            self.parseDomain(res.text)
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code
            self.__fail_add_url.append(url)
            #raise AccountErrors.NoAccountError('fatal error')
        #if self.request_count % 10000 == range(0,9):
        print "request_count:{},success_count:{},request_speed:{}".format(
            self.request_count, self.success_count,
            self.request_count / (time.time() - self.start_time))

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "jobui_add_url.py -- execute result : success_count : {} ,request_count:{}".format(
                self.success_count, self.request_count)
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)
            #execfile("./genq_add.py")
        if evt == 'STARTED':
            msg = 'jobui_add_url.py starting execution...'
            spider.util.sendmail('*****@*****.**', '%s STARTED' % sys.argv[0], msg)

    def parseDomain(self, content):
        m = re.search(ur'<em class="sourceWeb common-icon"></em>(.*)</dd>', content)
        if m:
            dm_str = m.group(1)
            m1 = re.search(
                ur'<a class="no-style fwb " rel="nofllow" target="_blank" href="(.*)" onclick="_hmt.push\(\[\'_trackEvent\', \'jobInfo\', \'jobInfo_info\',\'jobInfo_info_jobSourceWeb\'\]\);">(.*)</a>',
                dm_str)
            if m1:
                dm_str = m1.group(2)
                dm = '"' + dm_str + '"'
                if dm in self.domain:
                    print '[%s] already in domains...' % dm_str
                else:
                    self.domain.append(dm)
                    self.domain_file.append(dm)
                    print '[%s] added to domains...' % dm_str
        else:
            print 'no match domain...'

    def read_domain(self):
        domain = []
        with open('domains.txt', 'r') as f:
            for line in f:
                domain.append(line.strip())
        return domain

    def read_proxy(self, fn):
        with open(fn, 'r') as f:
            for line in f:
                line = line.strip()
                m = re.match(r'(\d+\.\d+\.\d+\.\d+:\d+)', line, re.I)
                if m:
                    prstr = m.group(1)
                    proxies = {
                        'http': 'http://' + prstr + "/",
                        'https': 'https://' + prstr + "/"
                    }
                    self.proxies_dict.append(proxies)
                elif re.match(r'\s*#', line):
                    continue
        print " loaded [ %d ] proxies " % len(self.proxies_dict)
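
# Hedged sketch of the append-only helper these spiders expect from the
# project's FileSaver class (construct with a path, append() one record per
# line). The real implementation is not shown here and may buffer, lock, or
# encode differently; this stand-in only documents the interface used above.
class FileSaverSketch(object):
    def __init__(self, filename):
        # append mode, so a restarted crawl does not clobber earlier records
        self._fp = open(filename, 'a')

    def append(self, line):
        self._fp.write(line + '\n')
        self._fp.flush()   # flush per record so progress survives a crash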
# Module-level cache of URLs that have already been stored; the definition is
# not shown in the original source, but the unqualified uses below require it.
already_url = set()


class JobuiSpider(Spider):
    """ jobui re-crawl (2016-04-27): fetch pages and store them directly into the database. """

    def __init__(self):
        self.is_debug = True
        if self.is_debug:
            Spider.__init__(self, 1)
        else:
            self.proxies_dict = []
            self.read_proxy("../spider/proxy/proxy.txt")
            Spider.__init__(self, len(self.proxies_dict))
        self.success_count = 0
        self.request_count = 0
        self.__fail_ids = FileSaver("fail_url.txt")
        self.start_time = time.time()
        self.page_store = PageStoreJobUI()
        self.page_store.testmode = True
        self.init_time = time.time()
        self.already_url = FileSaver("already_url.txt")
        self.init_already_url()

    def wait_q_breakable(self):
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def init_already_url(self):
        i = 0
        with open("already_url.txt") as f:
            for line in f:
                i += 1
                url = line.strip()
                already_url.add(url)
        print "init already url :", i

    def dispatch(self):
        cnt = 0
        with open("db_export_jobui_url.csv") as f:
            for line in f:
                cnt += 1
                url = line.strip()
                if url in already_url:
                    print "already crawled ...... ignoring ..."
                    continue
                m = re.search("http://www\.jobui\.com/job/(\d+)/", url)
                if m:
                    id = int(m.group(1))
                    job = {"cnt": cnt, "id": id, "url": url, "retry": 0}
                    self.add_job(job, True)
                else:
                    print "url error:", line
                    continue
        self.wait_q_breakable()
        self.add_job(None, True)

    def get_fail_cnt(self, addv, type):
        fc = getattr(self._curltls, type, 0)
        if addv:
            fc += addv
            setattr(self._curltls, type, fc)
        return fc

    def run_job(self, job):
        url = job.get("url")
        id = job.get("id")
        cnt = job.get("cnt")
        retry = job.get("retry")
        tid = self.get_tid()
        proxies = None
        if self.is_debug:
            proxies = {
                'http': 'http://*****:*****@192.168.1.39:3428',
                'https': 'https://*****:*****@192.168.1.39:3428'
            }
        else:
            proxies = self.proxies_dict[tid]
        res = self.request_url(url, proxies=proxies)
        self.request_count += 1
        if res is None or res.code != 200:
            #print "%d --- %d ---> error:", "RES NONE " if res is None else "res.code = %d" % res.code
            self.re_add_job({"id": id, "cnt": cnt, "url": url, "retry": (retry + 1)})
        else:
            if self.page_store.save(int(time.time()), str(id), url, res.text):
                self.success_count += 1
                already_url.add(url)
                self.already_url.append(url)
                print "%d ### %d ### saved into the database successfully ..." % (cnt, id)
            else:
                print "%d === %d === failed to save into the database ..." % (cnt, id)
                self.re_add_job({"id": id, "cnt": cnt, "url": url, "retry": (retry + 1)})
        if time.time() - self.init_time > 20:
            print "request_count:{},success_count:{},request_speed:{}".format(
                self.request_count, self.success_count,
                self.request_count / (time.time() - self.start_time))
            self.init_time = time.time()
            self.request_count = 0
            self.success_count = 0

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "jobui re_url.py is over !"
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)

    def read_proxy(self, fn):
        with open(fn, 'r') as f:
            for line in f:
                line = line.strip()
                self._match_proxy(line)
        print " loaded [ %d ] proxies " % len(self.proxies_dict)

    def _match_proxy(self, line):
        m = re.match('([0-9.]+):(\d+):([a-z0-9]+):([a-z0-9._-]+)$', line, re.I)
        m1 = re.match('([0-9.]+):(\d+):([a-z0-9]+)$', line, re.I)
        if m:
            prstr = '%s:%s@%s:%s' % (m.group(3), m.group(4), m.group(1), m.group(2))
            proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr}
        elif m1:
            prstr = '%s:%s' % (m1.group(1), m1.group(2))
            proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr}
        else:
            proxies = {'http': 'http://' + line, 'https': 'https://' + line}
        self.proxies_dict.append(proxies)
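
# Hedged illustration of the proxy line formats _match_proxy above accepts
# (derived from its regexes); the addresses and credentials are placeholders:
#
#   192.168.1.39:3428:myuser:mypass   -> http://myuser:mypass@192.168.1.39:3428
#   121.40.199.105:8080:sometoken     -> http://121.40.199.105:8080 (third field dropped)
#   121.40.199.105:8080               -> http://121.40.199.105:8080 (passed through unchanged)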
class JobuiSpider(Spider):
    """ jobui incremental crawl (2016-04-18): fetch pages and store them directly into the database. """

    def __init__(self):
        self.proxies_dict = []
        self.read_proxy("../spider/proxy/proxy.txt")
        Spider.__init__(self, len(self.proxies_dict))
        self.success_count = 0
        self.request_count = 0
        self.__fail_ids = FileSaver("fail_ids.txt")
        self.start_time = time.time()
        self.page_store = PageStoreJobUI()
        self.page_store.testmode = True
        self.serial_num = 0

    def wait_q_breakable(self):
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        i = 136848805  # 134111700
        while i < 150000000:
            i += 1
            job = {"id": i, "retry": 0}
            self.add_job(job, True)
        self.wait_q_breakable()
        self.add_job(None, True)

    def get_fail_cnt(self, addv, type):
        fc = getattr(self._curltls, type, 0)
        if addv:
            fc += addv
            setattr(self._curltls, type, fc)
        return fc

    def run_job(self, jobid):
        jobid_int = jobid.get("id")
        retry = jobid.get("retry")
        url = "http://www.jobui.com/job/%d/" % (jobid_int)
        tid = self.get_tid()
        proxies = self.proxies_dict[tid]
        res = self.request_url(url, proxies=proxies)
        self.request_count += 1
        self.serial_num += 1
        if res is None:
            if self.get_fail_cnt(1, 'failcount-none') < 10:
                self.re_add_job(jobid)
            else:
                print "thread [ %s ]: proxy [ %s ] will be closed and dropped." % (tid, proxies)
                self.__fail_ids.append(str(jobid_int))
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy [ %s ] is invalid, failcount = [ %d ]" % (proxies, self.get_fail_cnt(0, 'failcount-none')))
            # the original commented this return out, which let a None response
            # fall through to the res.code checks below and raise AttributeError
            return
        else:
            setattr(self._curltls, 'failcount-none', 0)

        if res.code == 407:
            if self.get_fail_cnt(1, 'failcount-407') < 10:
                self.re_add_job(jobid)
            else:
                print "thread [ %s ]: proxy [ %s ] will be closed and dropped." % (tid, proxies)
                self.__fail_ids.append(str(jobid_int))
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy [ %s ] is invalid, failcount = [ %d ]" % (proxies, self.get_fail_cnt(0, 'failcount-407')))
            # likewise restored, so a 407 is not also counted by the
            # unknown-error branch below
            return
        else:
            setattr(self._curltls, 'failcount-407', 0)

        if res.code == 404:
            print "%d ======> 404 ---> retry:%d" % (jobid_int, retry)
            if retry < 3:
                self.re_add_job({"id": jobid_int, "retry": (retry + 1)})
            else:
                self.__fail_ids.append(str(jobid_int))
            #return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%d ------> %d " % (jobid_int, res.code)
            self.re_add_job(jobid)
            time.sleep(random.randrange(1, 3, 1))
            #return
        elif res.code == 200:
            self.serial_num = 0
            print "%d ----> will be saved into the database ..." % jobid_int
            self.page_store.save(int(time.time()), str(jobid_int), url, res.text)
            self.success_count += 1
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code
            if retry < 3:
                self.re_add_job({"id": jobid_int, "retry": (retry + 1)})
            else:
                self.__fail_ids.append(str(jobid_int))
        print "serial_number:{},request_count:{},success_count:{},request_speed:{}".format(
            self.serial_num, self.request_count, self.success_count,
            self.request_count / (time.time() - self.start_time))

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)

    def read_proxy(self, fn):
        with open(fn, 'r') as f:
            for line in f:
                line = line.strip()
                self._match_proxy(line)
        print " loaded [ %d ] proxies " % len(self.proxies_dict)

    def _match_proxy(self, line):
        m = re.match('([0-9.]+):(\d+):([a-z0-9]+):([a-z0-9._-]+)$', line, re.I)
        m1 = re.match('([0-9.]+):(\d+):([a-z0-9]+)$', line, re.I)
        if m:
            prstr = '%s:%s@%s:%s' % (m.group(3), m.group(4), m.group(1), m.group(2))
            proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr}
        elif m1:
            prstr = '%s:%s' % (m1.group(1), m1.group(2))
            proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr}
        else:
            proxies = {'http': 'http://' + line, 'https': 'https://' + line}
        self.proxies_dict.append(proxies)
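
# Small self-contained aside (not part of the spider): the hyphenated counter
# names used above ('failcount-none', 'failcount-407') are not valid Python
# identifiers, but they still work because getattr/setattr accept any string
# as an attribute name, including on thread-local objects.
import threading

_tls_demo = threading.local()
setattr(_tls_demo, 'failcount-407', 1)
print getattr(_tls_demo, 'failcount-407', 0)   # prints 1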
class JobuiSpider(Spider):
    """ Re-crawl old job IDs read out of the jobui database. """

    def __init__(self):
        self.proxies_dict = []
        self.read_proxy("proxy_030814.txt")
        Spider.__init__(self, len(self.proxies_dict))
        self.success_count = 0
        self.request_count = 0
        self.__fail_urls = FileSaver("fail_urls.txt")
        self.start_time = time.time()
        self.page_store = PageStoreJobUI()
        self.page_store.testmode = False

    def wait_q_breakable(self):
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        with open("db_jobid.txt") as f:
            for url in f:
                job = {"url": url.strip(), "retry": 0}
                self.add_job(job, True)
        self.wait_q_breakable()
        self.add_job(None, True)

    def get_fail_cnt(self, addv, type):
        fc = getattr(self._curltls, type, 0)
        if addv:
            fc += addv
            setattr(self._curltls, type, fc)
        return fc

    def run_job(self, jobid):
        url = jobid.get("url")
        retry = jobid.get("retry")
        tid = self.get_tid()
        proxies = self.proxies_dict[tid]
        res = self.request_url(url, proxies=proxies)
        self.request_count += 1
        if res is None:
            if self.get_fail_cnt(1, 'failcount-none') < 10:
                self.re_add_job(jobid)
            else:
                print "thread [ %s ]: proxy [ %s ] will be closed and dropped." % (tid, proxies)
                #self.__fail_urls.append(url)
                self.re_add_job(jobid)
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy [ %s ] is invalid, failcount = [ %d ]" % (proxies, self.get_fail_cnt(0, 'failcount-none')))
            return
        else:
            # was 'failcount', which never reset the 'failcount-none' counter checked above
            setattr(self._curltls, 'failcount-none', 0)

        if res.code == 407:
            if self.get_fail_cnt(1, 'failcount-407') < 10:
                print "%s ======> 407 , retry:%d" % (url, retry)
                self.re_add_job(jobid)
            else:
                print "thread [ %s ]: proxy [ %s ] will be closed and dropped." % (tid, proxies)
                self.re_add_job(jobid)
                #self.__fail_urls.append(url)
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy [ %s ] is invalid, failcount = [ %d ]" % (proxies, self.get_fail_cnt(0, 'failcount-407')))
            return
        else:
            setattr(self._curltls, 'failcount-407', 0)

        if res.code == 404:
            print "%s ======> 404 , retry:%d" % (url, retry)
            if retry < 3:
                # was {"id": jobid_int, ...}: jobid_int is undefined in this class, so re-queue by url
                self.re_add_job({"url": url, "retry": (retry + 1)})
            else:
                self.__fail_urls.append(url)
            return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%s ------> %d " % (url, res.code)
            self.add_job(jobid)
            time.sleep(random.randrange(1, 3, 1))
            return
        elif res.code == 200:
            print "%s ----> will be saved into the database ..." % url
            m = re.search(ur'http://www.jobui.com/job/(\d+)/', url)
            if m:
                jid = m.group(1)
                self.page_store.save(int(time.time()), jid, url, res.text)
                self.success_count += 1
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ] retry:%d" % (res.code, retry)
            if retry < 3:
                # same fix as the 404 branch: re-queue by url
                self.re_add_job({"url": url, "retry": (retry + 1)})
            else:
                self.__fail_urls.append(url)
            #raise AccountErrors.NoAccountError('fatal error')
        print "request_count:{},success_count:{},request_speed:{}".format(
            self.request_count, self.success_count,
            self.request_count / (time.time() - self.start_time))

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "request_count:{},success_count:{},request_speed:{}".format(
                self.request_count, self.success_count,
                self.request_count / (time.time() - self.start_time))
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)

    def read_proxy(self, fn):
        with open(fn, 'r') as f:
            for line in f:
                line = line.strip()
                self._match_proxy(line)
                # previous ip:port-only parser, kept for reference:
                # m = re.match(r'(\d+\.\d+\.\d+\.\d+:\d+)', line, re.I)
                # if m:
                #     prstr = m.group(1)
                #     proxies = {'http': 'http://' + prstr + "/", 'https': 'https://' + prstr + "/"}
                #     self.proxies_dict.append(proxies)
                # elif re.match('\s*#', line):
                #     continue
        print " loaded [ %d ] proxies " % len(self.proxies_dict)

    def _match_proxy(self, line):
        m = re.match('([0-9.]+):(\d+):([a-z0-9]+):([a-z0-9._-]+)$', line, re.I)
        m1 = re.match('([0-9.]+):(\d+):([a-z0-9]+)$', line, re.I)
        if m:
            prstr = '%s:%s@%s:%s' % (m.group(3), m.group(4), m.group(1), m.group(2))
            proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr}
        elif m1:
            prstr = '%s:%s' % (m1.group(1), m1.group(2))
            proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr}
        else:
            proxies = {'http': 'http://' + line, 'https': 'https://' + line}
        self.proxies_dict.append(proxies)
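
# Hedged illustration of the db_jobid.txt input the dispatcher above expects:
# one full job URL per line, matching the re.search pattern used in run_job.
#
#   http://www.jobui.com/job/131298184/
#   http://www.jobui.com/job/92336088/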