def __validProxy(self):
    """
    Re-validate every proxy in the useful queue, forever.

    Each pass selects ``useful_proxy_queue`` and checks every stored proxy
    with ``validUsefulProxy``. On success a truthy counter below 1 is
    incremented. On failure the proxy is deleted once its counter is below
    -5, otherwise the counter is decremented. The loop logs a heartbeat and
    sleeps 60 seconds after each pass.

    :return: never returns
    """
    while True:
        self.db.changeTable(self.useful_proxy_queue)
        for candidate in self.db.getAll():
            # Py3 compatibility: keys may come back as bytes.
            if isinstance(candidate, bytes):
                candidate = candidate.decode('utf-8')

            stored = self.db.getvalue(candidate)
            passed = validUsefulProxy(candidate)
            # A falsy stored value (None, 0, '') is treated as "no counter".
            score = int(stored) if stored else None

            if not passed:
                # Failure: drop the proxy once it is below -5, else count down.
                if score is not None and score < -5:
                    self.db.delete(candidate)
                else:
                    self.db.inckey(candidate, -1)
                self.log.info('ProxyValidSchedule: {} validation fail'.format(candidate))
                continue

            # Success: counters are only raised while they are below 1.
            if score is not None and score < 1:
                self.db.inckey(candidate, 1)
            self.log.info('ProxyValidSchedule: {} validation pass'.format(candidate))

        self.log.info('ProxyValidSchedule running normal')
        sleep(60 * 1)
def run(self):
    """
    Drain and re-check the useful proxy queue in an endless loop.

    Items are popped from ``useful_proxy_queue`` until ``pop`` returns a
    falsy value, then the loop sleeps five minutes and starts over. A proxy
    that passes ``validUsefulProxy`` is written back (with its counter raised
    by one while the counter is below 1). A proxy that fails is deleted when
    its counter is at or below ``FAIL_COUNT``, otherwise written back with
    the counter lowered by one.

    :return: never returns
    """
    self.db.changeTable(self.useful_proxy_queue)
    while True:
        proxy_item = self.db.pop()
        while proxy_item:
            proxy = proxy_item.get('proxy')
            raw_counter = proxy_item.get('value', 1)
            # Normalise once; a stored None/'' used to crash int() on the
            # failure path.
            counter = int(raw_counter) if raw_counter else 0

            if validUsefulProxy(proxy):
                # Validation passed: raise the counter while it is below 1.
                if raw_counter and counter < 1:
                    self.db.put(proxy, num=counter + 1)
                else:
                    self.db.put(proxy)
                self.log.info('ProxyCheck: {} validation pass'.format(proxy))
            else:
                self.log.info('ProxyCheck: {} validation fail'.format(proxy))
                # NOTE(review): threshold is FAIL_COUNT exactly as written;
                # confirm its sign convention, since failures only lower the
                # counter.
                if raw_counter and counter <= FAIL_COUNT:
                    self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy))
                    self.db.delete(proxy)
                else:
                    # Validation failed: lower the counter by one.
                    self.db.put(proxy, num=counter - 1)

            proxy_item = self.db.pop()
        sleep(60 * 5)
def validProxy(self):
    """
    Validate proxies from the raw queue and store the usable ones.

    Pops ``(proxy, value)`` pairs from ``raw_proxy_queue`` until the popped
    proxy is falsy. For every proxy accepted by ``validUsefulProxy`` the
    popped ``value`` is written to ``useful_proxy_queue``. Any exception ends
    the run early; it is logged rather than silently discarded.

    :return: None
    """
    self.db.changeTable(self.raw_proxy_queue)
    try:
        raw_proxy, value = self.db.pop()
        self.log.info('ProxyRefreshSchedule: %s 爬取ip 检验 start' % time.ctime())
        while raw_proxy:
            if validUsefulProxy(raw_proxy):
                self.db.changeTable(self.useful_proxy_queue)
                # The popped value, not the key, is what gets stored.
                self.db.put(value)
                self.log.info('ProxyRefreshSchedule: %s 爬取ip 检验 pass' % raw_proxy)
            else:
                self.log.info('ProxyRefreshSchedule: %s 爬取ip 检验 fail' % raw_proxy)
            # Switch back before popping the next raw candidate.
            self.db.changeTable(self.raw_proxy_queue)
            raw_proxy, value = self.db.pop()
        self.log.info('ProxyRefreshSchedule: %s 爬取ip 检验 complete' % time.ctime())
    except Exception as e:
        # Still best-effort (the run simply stops), but leave a trace of why.
        self.log.warning('ProxyRefreshSchedule: validProxy aborted: %r' % (e,))
def run(self):
    """
    Drain and re-check the useful proxy queue in an endless loop.

    Items are popped from ``useful_proxy_queue`` until ``pop`` returns a
    falsy value, then the loop sleeps five minutes. A passing proxy is
    written back (counter raised by one while it is below 1); a failing proxy
    is deleted when its counter is at or below ``-FAIL_COUNT``, otherwise
    written back with the counter lowered by one.

    :return: never returns
    """
    self.db.changeTable(self.useful_proxy_queue)
    while True:
        proxy_item = self.db.pop()
        while proxy_item:
            proxy = proxy_item.get('proxy')
            raw_counter = proxy_item.get('value')
            # A missing counter used to crash int(None) on the failure path;
            # treat it as zero instead.
            counter = int(raw_counter) if raw_counter else 0

            if validUsefulProxy(proxy):
                # Validation passed: raise the counter while it is below 1.
                if raw_counter and counter < 1:
                    self.db.put(proxy, num=counter + 1)
                else:
                    self.db.put(proxy)
            elif raw_counter and counter <= -FAIL_COUNT:
                self.log.warning(
                    'ProxyCheck: {} fail too many, delete!'.format(proxy))
                self.db.delete(proxy)
            else:
                # Validation failed: lower the counter by one.
                self.db.put(proxy, num=counter - 1)

            proxy_item = self.db.pop()
        sleep(60 * 5)
def run(self):
    """
    Check every proxy waiting in ``self.queue`` and record the outcome.

    Each proxy is validated with ``validUsefulProxy``; only the first element
    of its result (the HTTP result) decides success. Success, failure and
    total tick methods are called per proxy, and a summary with the counts is
    logged when the queue is exhausted.

    :return: None
    """
    # Function-scope import so the non-blocking drain works on Python 2 and 3.
    try:
        from queue import Empty
    except ImportError:
        from Queue import Empty

    self.db.changeTable(self.useful_proxy_queue)
    thread_id = threading.current_thread().ident
    log.info("thread_id:{thread_id} useful_proxy proxy check start".format(
        thread_id=thread_id))

    total = 0
    succ = 0
    fail = 0
    while True:
        # qsize() followed by a blocking get() can hang forever when another
        # worker takes the last item in between; ask without blocking.
        try:
            proxy = self.queue.get(block=False)
        except Empty:
            break

        (http_result, _) = validUsefulProxy(proxy)
        if http_result:
            self.tickUsefulProxyVaildSucc(proxy)
            succ += 1
            log.debug(
                "ProxyCheck: {proxy} validation pass".format(proxy=proxy))
        else:
            self.tickUsefulProxyVaildFail(proxy)
            fail += 1
            log.debug(
                "ProxyCheck: {proxy} validation fail".format(proxy=proxy))

        self.queue.task_done()
        total += 1
        self.tickUsefulProxyVaildTotal(proxy)

    log.info(
        'thread_id:{thread_id} proxy check end, total:{total}, succ:{succ}, fail:{fail}'
        .format(thread_id=thread_id, total=total, succ=succ, fail=fail))
def __validProxy(self):
    """
    Re-validate every proxy in the useful queue, forever.

    A passing proxy has its counter incremented, a failing one decremented.
    After the update the counter is re-read and the proxy is deleted once it
    is below -5. A heartbeat is logged after every pass, and the loop backs
    off for a minute when there was nothing to validate.

    :return: never returns
    """
    import time  # function-scope: only needed for the empty-queue back-off

    while True:
        self.db.changeTable(self.useful_proxy_queue)
        proxies = self.db.getAll()
        for each_proxy in proxies:
            if isinstance(each_proxy, bytes):
                each_proxy = each_proxy.decode('utf-8')

            if validUsefulProxy(each_proxy):
                # Success: counter + 1.
                self.db.inckey(each_proxy, 1)
                self.log.debug('validProxy_b: {} validation pass'.format(each_proxy))
            else:
                # Failure: counter - 1.
                self.db.inckey(each_proxy, -1)
                self.log.info('validProxy_b: {} validation fail'.format(each_proxy))

            value = self.db.getvalue(each_proxy)
            # int() is required: the raw value may be a str/bytes, and
            # comparing that to -5 is a TypeError on Python 3 and always
            # False on Python 2.
            if value and int(value) < -5:
                self.db.delete(each_proxy)

        # Inside the loop so the heartbeat is actually reachable.
        self.log.info('validProxy_a running normal')
        if not proxies:
            # Nothing to check: avoid spinning (and flooding the log).
            time.sleep(60)
def run(self):
    """
    Drain and re-check the useful proxy queue in an endless loop.

    Per the original author's note this overrides ``threading``'s ``run``,
    so it executes when the instance is started with ``.start()``.

    Items are popped from ``useful_proxy_queue`` until ``pop`` returns a
    falsy value, then the loop sleeps five minutes. A passing proxy is
    written back (counter raised by one while it is below 1); a failing proxy
    is deleted when its counter is at or below ``-FAIL_COUNT``, otherwise
    written back with the counter lowered by one.

    :return: never returns
    """
    self.db.changeTable(self.useful_proxy_queue)
    while True:
        proxy_item = self.db.pop()
        # Keep validating until useful_proxy_queue has been emptied.
        while proxy_item:
            proxy = proxy_item.get('proxy')
            raw_counter = proxy_item.get('value')
            # A missing counter used to crash int(None) on the failure path;
            # treat it as zero instead.
            counter = int(raw_counter) if raw_counter else 0

            if validUsefulProxy(proxy):
                # Validation passed: raise the counter while it is below 1.
                if raw_counter and counter < 1:
                    self.db.put(proxy, num=counter + 1)
                else:
                    self.db.put(proxy)
                self.log.info(
                    'ProxyCheck: {} validation pass'.format(proxy))
            else:
                self.log.info(
                    'ProxyCheck: {} validation fail'.format(proxy))
                if raw_counter and counter <= -FAIL_COUNT:
                    self.log.info(
                        'ProxyCheck: {} fail too many, delete!'.format(
                            proxy))
                    self.db.delete(proxy)
                else:
                    # Validation failed: lower the counter by one.
                    self.db.put(proxy, num=counter - 1)

            proxy_item = self.db.pop()
        sleep(60 * 5)
def validProxy(self):
    """
    Move usable proxies from ``raw_proxy_queue`` to ``useful_proxy_queue``.

    Raw items are popped one at a time until ``pop`` returns a falsy value.
    A proxy is stored as useful only when it is not already in the result of
    ``self.getAll()`` and ``validUsefulProxy`` accepts it. Anything else,
    including duplicates, is logged as a validation failure.

    :return: None
    """
    self.db.changeTable(self.raw_proxy_queue)
    item = self.db.pop()
    self.log.info('ProxyRefreshSchedule: %s start validProxy' % time.ctime())

    # Already-known proxies, fetched to avoid validating them again.
    known = self.getAll()
    while item:
        candidate = item.get('proxy')
        if isinstance(candidate, bytes):
            # Py3 compatibility
            candidate = candidate.decode('utf8')

        is_new = candidate not in known
        # Short-circuit keeps the validation call off for known proxies.
        if is_new and validUsefulProxy(candidate):
            self.db.changeTable(self.useful_proxy_queue)
            self.db.put(candidate)
            self.log.info('ProxyRefreshSchedule: %s validation pass' % candidate)
        else:
            self.log.info('ProxyRefreshSchedule: %s validation fail' % candidate)

        # Pop the next raw item first, then refresh the known set.
        self.db.changeTable(self.raw_proxy_queue)
        item = self.db.pop()
        known = self.getAll()

    self.log.info('ProxyRefreshSchedule: %s validProxy complete' % time.ctime())
def validProxy(self):
    """
    Validate the raw proxies waiting in ``self.queue``.

    A proxy already listed in ``self.remaining_proxies`` is deleted from the
    raw store and skipped. Otherwise it is validated: when the HTTP result is
    truthy it is saved as useful (together with the HTTPS result), removed
    from the raw store and remembered in ``self.remaining_proxies``; when it
    fails, the raw failure tick is recorded. Totals are logged at the end.

    :return: None
    """
    # Function-scope import so the non-blocking drain works on Python 2 and 3.
    try:
        from queue import Empty
    except ImportError:
        from Queue import Empty

    thread_id = threading.current_thread().ident
    log.info("thread_id:{thread_id}, Start ValidProxy `raw_proxy_queue`".format(thread_id=thread_id))

    total = 0
    succ = 0
    fail = 0
    while True:
        # qsize() followed by a blocking get() can hang forever when another
        # worker takes the last item in between; ask without blocking.
        try:
            proxy = self.queue.get(block=False)
        except Empty:
            break

        if proxy not in self.remaining_proxies:
            (http_result, https_result) = validUsefulProxy(proxy)
            if http_result:
                self.saveUsefulProxy(proxy, https_result)
                self.deleteRawProxy(proxy)
                self.remaining_proxies.append(proxy)
                succ += 1
                # The "pass" message belongs to the success path only; it
                # used to be emitted for failed proxies.
                log.debug('ProxyRefreshSchedule: %s validation pass' % proxy)
            else:
                self.tickRawProxyVaildFail(proxy)
                fail += 1
                log.debug('ProxyRefreshSchedule: %s validation fail' % proxy)
        else:
            self.deleteRawProxy(proxy)
            log.debug('ProxyRefreshSchedule: %s repetition, skip!' % proxy)

        self.queue.task_done()
        self.tickRawProxyVaildTotal(proxy)
        total += 1

    log.info('thread_id:{thread_id}, ValidProxy Complete `raw_proxy_queue`, total:{total}, succ:{succ}, fail:{fail}'.format(thread_id=thread_id, total=total, succ=succ, fail=fail))
def __validProxy(self):
    """
    Re-validate every proxy in the useful queue, forever.

    A passing proxy has its counter incremented. On failure a counter that is
    zero or positive is reset to zero, and a negative counter is decremented.
    After the update the counter is re-read and the proxy is deleted when it
    is negative. A heartbeat is logged after every pass, and the loop backs
    off for a minute when there was nothing to validate.

    :return: never returns
    """
    # The factor 0 makes this start-up jitter a no-op sleep, as written.
    time.sleep(60 * 0 * random.random())
    while True:
        self.db.changeTable(self.useful_proxy_queue)
        proxies = self.db.getAll()
        for each_proxy in proxies:
            if isinstance(each_proxy, bytes):
                each_proxy = each_proxy.decode('utf-8')

            # Strict comparison with True kept exactly as written.
            # NOTE(review): confirm validUsefulProxy returns a bool here.
            if validUsefulProxy(each_proxy) == True:  # noqa: E712
                # Success: counter + 1.
                self.db.inckey(each_proxy, 1)
                self.log.debug('validProxy_b: {} validation pass'.format(each_proxy))
            else:
                before = self.db.getvalue(each_proxy)
                self.log.debug(u'原有value %s' % (before,))
                # Compare as int: a raw str/bytes against 0 is a TypeError on
                # Python 3 and always True on Python 2.
                current = int(before) if before is not None else None
                if current is not None and current >= 0:
                    # NOTE(review): a counter at 0 stays at 0 here, so it is
                    # never driven negative by failures alone; confirm.
                    self.db.inckey(each_proxy, -current)
                else:
                    self.db.inckey(each_proxy, -1)
                self.log.info('validProxy_b: {} validation fail'.format(each_proxy))

            value = self.db.getvalue(each_proxy)
            self.log.debug('validProxy_b: {} value {}'.format(each_proxy, value))
            if value is not None and int(value) < 0:
                # Negative counter: drop the proxy.
                self.log.debug(u'删除%s' % (each_proxy,))
                self.db.delete(each_proxy)

        # Inside the loop so the heartbeat is actually reachable.
        self.log.info('validProxy_a running normal')
        if not proxies:
            # Nothing to check: avoid spinning (and flooding the log).
            time.sleep(60)
def run(self):
    """
    Check every proxy waiting in ``self.queue`` using ``self.mode``.

    The stored count is read from ``self.item_dict``. A passing proxy with a
    positive count is written back with the count lowered by one. A failing
    proxy is deleted when its count plus one reaches ``FAIL_COUNT``,
    otherwise written back with the count raised by one.

    :return: None
    """
    # Function-scope import so the non-blocking drain works on Python 2 and 3.
    try:
        from queue import Empty
    except ImportError:
        from Queue import Empty

    self.db.changeTable(self.useful_proxy_queue)
    while True:
        # qsize() followed by a blocking get() can hang forever when another
        # worker takes the last item in between; ask without blocking.
        try:
            proxy = self.queue.get(block=False)
        except Empty:
            break

        try:
            raw_count = self.item_dict[proxy]
            # A stored None used to crash int() on the failure path.
            count = int(raw_count) if raw_count else 0

            if validUsefulProxy(proxy, self.mode):
                # Validation passed: lower the count by one while positive.
                if count > 0:
                    self.db.put(proxy, num=count - 1)
                self.log.info('Mode:{} ProxyCheck: {} validation pass'.format(
                    self.mode, proxy))
            else:
                self.log.info('Mode:{} ProxyCheck: {} validation fail'.format(
                    self.mode, proxy))
                if raw_count and count + 1 >= FAIL_COUNT:
                    self.log.info(
                        'Mode:{} ProxyCheck: {} fail too many, delete!'.format(
                            self.mode, proxy))
                    self.db.delete(proxy)
                else:
                    self.db.put(proxy, num=count + 1)
        finally:
            # Always acknowledge the item so queue.join() cannot hang on an
            # error raised while checking it.
            self.queue.task_done()
def validProxy(self):
    """
    Validate the proxies in ``raw_proxy_queue`` and keep the usable ones.

    Pops raw items until ``pop`` yields a falsy value. A proxy is written to
    ``useful_proxy_queue`` only if it is absent from ``self.getAll()`` and
    passes ``validUsefulProxy``; every other proxy (duplicates included) is
    logged as a validation failure.

    :return: None
    """
    log = self.log
    self.db.changeTable(self.raw_proxy_queue)
    pending = self.db.pop()
    log.info('ProxyRefreshSchedule: %s start validProxy' % time.ctime())

    # Proxies already stored, used to skip repeated validation work.
    existing = self.getAll()
    while pending:
        address = pending.get('proxy')
        if isinstance(address, bytes):
            address = address.decode('utf8')  # Py3 compatibility

        accepted = (address not in existing) and validUsefulProxy(address)
        if not accepted:
            log.info('ProxyRefreshSchedule: %s validation fail' % address)
        else:
            self.db.changeTable(self.useful_proxy_queue)
            self.db.put(address)
            log.info('ProxyRefreshSchedule: %s validation pass' % address)

        # Next raw item is popped before the stored set is re-read.
        self.db.changeTable(self.raw_proxy_queue)
        pending = self.db.pop()
        existing = self.getAll()

    log.info('ProxyRefreshSchedule: %s validProxy complete' % time.ctime())
def run(self):
    """
    Check one popped proxy record every 20 seconds, forever.

    The record's ``ip`` and ``port`` are joined into ``ip:port`` and passed
    to ``validUsefulProxy``. A failing record is deleted by its ``ip``; a
    passing one is only logged.

    :return: never returns
    """
    self.db.changeTable(self.useful_proxy_queue)
    while True:
        record = self.db.pop()
        if record:
            host = record.get('ip')
            port = record.get('port')
            address = "%s:%s" % (host, port)
            if not validUsefulProxy(address):
                self.log.info('ProxyCheck: {} validation fail'.format(address))
                self.db.delete(record['ip'])
            else:
                self.log.info('ProxyCheck: {} validation pass'.format(address))
        # Pause between checks, whether or not a record was available.
        sleep(20)
def __validProxy__(self):
    """
    Re-validate every proxy in the useful queue, forever.

    Each pass checks all stored proxies with ``validUsefulProxy`` and deletes
    the ones that fail. A heartbeat is logged after every pass, and the loop
    backs off for a minute when there was nothing to validate.

    :return: never returns
    """
    import time  # function-scope: only needed for the empty-queue back-off

    while 1:
        self.db.changeTable(self.useful_proxy_queue)
        proxies = self.db.getAll()
        for each_proxy in proxies:
            # Py3 compatibility: keys may come back as bytes.
            if isinstance(each_proxy, bytes):
                each_proxy = each_proxy.decode('utf-8')

            if validUsefulProxy(each_proxy):
                self.log.debug('proxy: {} validation pass'.format(each_proxy))
            else:
                self.db.delete(each_proxy)
                self.log.info('proxy: {} validation fail'.format(each_proxy))

        # Inside the loop so the heartbeat is actually reachable.
        self.log.info(u'代理验证程序运行正常')
        if not proxies:
            # Nothing to check: avoid spinning (and flooding the log).
            time.sleep(60)
def __validProxy__(self):
    """
    Keep the useful queue clean by re-validating it in an endless loop.

    Every stored proxy is checked with ``validUsefulProxy``; failures are
    deleted, successes are only logged at debug level. Each pass ends with a
    heartbeat log, followed by a one-minute pause if the store was empty.

    :return: never returns
    """
    import time  # function-scope: only needed for the empty-queue back-off

    while 1:
        self.db.changeTable(self.useful_proxy_queue)
        stored = self.db.getAll()
        for each_proxy in stored:
            # Py3 compatibility: keys may come back as bytes.
            if isinstance(each_proxy, bytes):
                each_proxy = each_proxy.decode('utf-8')

            if not validUsefulProxy(each_proxy):
                self.db.delete(each_proxy)
                self.log.info(
                    'proxy: {} validation fail'.format(each_proxy))
                continue
            self.log.debug(
                'proxy: {} validation pass'.format(each_proxy))

        # Inside the loop so the heartbeat is actually reachable.
        self.log.info(u'代理验证程序运行正常')
        if not stored:
            # Nothing to check: avoid spinning (and flooding the log).
            time.sleep(60)
def run(self):
    """
    Check every proxy waiting in ``self.queue``.

    A proxy that fails ``validUsefulProxy`` is removed through
    ``ProxyManager.delete_proxy``; a passing proxy is only logged. The method
    returns once the queue is empty.

    :return: None
    """
    self.db.changeTable(self.useful_proxy_queue)
    while True:
        # A blocking get() never raises Empty, so the handler was dead code
        # and a worker could hang when another one took the last item.
        try:
            proxy = self.queue.get(block=False)
        except Empty:
            break

        if validUsefulProxy(proxy):
            self.log.info(f'ProxyCheck: {proxy} validation pass')
        else:
            self.log.info(
                f'ProxyCheck: {proxy} validation fail, delete it from useful_proxy!'
            )
            ProxyManager.delete_proxy(proxy)
        self.queue.task_done()
def __validProxy(self):
    """
    Re-validate every proxy in the useful queue, forever.

    Each pass checks all stored proxies with ``validUsefulProxy`` and deletes
    the ones that fail. A heartbeat is logged after every pass, and the loop
    backs off for a minute when there was nothing to validate.

    :return: never returns
    """
    import time  # function-scope: only needed for the empty-queue back-off

    while True:
        self.db.changeTable(self.useful_proxy_queue)
        proxies = self.db.getAll()
        for each_proxy in proxies:
            if isinstance(each_proxy, bytes):
                each_proxy = each_proxy.decode('utf-8')

            if validUsefulProxy(each_proxy):
                self.log.debug('validProxy_b: {} validation pass'.format(each_proxy))
            else:
                self.db.delete(each_proxy)
                self.log.info('validProxy_b: {} validation fail'.format(each_proxy))

        # Inside the loop so the heartbeat is actually reachable.
        self.log.info('validProxy_a running normal')
        if not proxies:
            # Nothing to check: avoid spinning (and flooding the log).
            time.sleep(60)
def run(self):
    """
    Drain and re-check the useful proxy queue in an endless loop.

    Items are popped from ``useful_proxy_queue`` until ``pop`` returns a
    falsy value, then the loop sleeps 30 seconds. A proxy that passes
    ``validUsefulProxy`` is written back; one that fails is deleted.

    :return: never returns
    """
    self.db.changeTable(self.useful_proxy_queue)
    while True:
        proxy_item = self.db.pop()
        while proxy_item:
            proxy = proxy_item.get('proxy')
            # The stored counter was read into an unused local; this variant
            # keeps or drops a proxy purely on the current check.
            if validUsefulProxy(proxy):
                self.log.info(
                    'ProxyCheck: {} validation pass'.format(proxy))
                self.db.put(proxy)
            else:
                self.log.info(
                    'ProxyCheck: {} validation fail'.format(proxy))
                self.db.delete(proxy)
            proxy_item = self.db.pop()
        sleep(30)
def valid_proxy(self):
    """
    Move usable proxies from ``raw_proxy_queue`` to ``useful_proxy_queue``.

    Raw proxies are popped until ``pop`` returns a falsy value; each one that
    passes ``validUsefulProxy`` is stored in the useful queue.

    :return: None
    """
    self.db.changeTable(self.raw_proxy_queue)
    candidate = self.db.pop()
    self.log.info('%s start valid proxy' % time.ctime())

    while candidate:
        usable = validUsefulProxy(candidate)
        if not usable:
            self.log.debug('proxy: %s validation fail' % candidate)
        else:
            self.db.changeTable(self.useful_proxy_queue)
            self.db.put(candidate)
            self.log.debug('proxy: %s validation passes' % candidate)

        # Switch back to the raw queue before taking the next candidate.
        self.db.changeTable(self.raw_proxy_queue)
        candidate = self.db.pop()

    self.log.info('%s valid proxy complete' % time.ctime())
def validProxy(self):
    """
    Validate proxies from ``raw_proxy_queue`` and store the usable ones.

    Raw proxies are popped until ``pop`` returns a falsy value. A proxy is
    written to ``useful_proxy_queue`` when it is not in the snapshot taken
    before the loop and ``validUsefulProxy`` accepts it.

    :return: None
    """
    self.db.changeTable(self.raw_proxy_queue)
    raw_proxy = self.db.pop()
    self.log.info('%s start validProxy_a' % time.ctime())
    # NOTE(review): this snapshot is read while raw_proxy_queue is selected
    # and is never refreshed inside the loop; confirm that is intended.
    exist_proxy = self.db.getAll()
    while raw_proxy:
        # Cheap membership test first, so known proxies skip the validation
        # call; the overall condition is unchanged.
        if (raw_proxy not in exist_proxy) and validUsefulProxy(raw_proxy):
            self.db.changeTable(self.useful_proxy_queue)
            self.db.put(raw_proxy)
            self.log.info('validProxy_a: %s validation pass' % raw_proxy)
        else:
            self.log.debug('validProxy_a: %s validation fail' % raw_proxy)
        self.db.changeTable(self.raw_proxy_queue)
        raw_proxy = self.db.pop()
    self.log.info('%s validProxy_a complete' % time.ctime())
def run(self):
    """
    Check every proxy waiting in ``self.queue``.

    The stored count is read from ``self.item_dict``. A passing proxy with a
    positive count is written back with the count lowered by one. A failing
    proxy is deleted when its count plus one reaches ``FAIL_COUNT``,
    otherwise written back with the count raised by one.

    :return: None
    """
    # Function-scope import so the non-blocking drain works on Python 2 and 3.
    try:
        from queue import Empty
    except ImportError:
        from Queue import Empty

    self.db.changeTable(self.useful_proxy_queue)
    while True:
        # qsize() followed by a blocking get() can hang forever when another
        # worker takes the last item in between; ask without blocking.
        try:
            proxy = self.queue.get(block=False)
        except Empty:
            break

        try:
            raw_count = self.item_dict[proxy]
            # A stored None used to crash int() on the failure path.
            count = int(raw_count) if raw_count else 0

            if validUsefulProxy(proxy):
                # Validation passed: lower the count by one while positive.
                if count > 0:
                    self.db.put(proxy, num=count - 1)
                self.log.info('ProxyCheck: {} validation pass'.format(proxy))
            else:
                self.log.info('ProxyCheck: {} validation fail'.format(proxy))
                if raw_count and count + 1 >= FAIL_COUNT:
                    self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy))
                    self.db.delete(proxy)
                else:
                    self.db.put(proxy, num=count + 1)
        finally:
            # Always acknowledge the item so queue.join() cannot hang on an
            # error raised while checking it.
            self.queue.task_done()
def validProxy(self):
    """
    Validate proxies from ``raw_proxy_queue`` and store the usable ones.

    Raw records are popped until ``pop`` returns a falsy value. Each record's
    ``ip`` and ``port`` are joined into ``ip:port`` for ``validUsefulProxy``;
    a record that passes is written unchanged to ``useful_proxy_queue``.

    :return: None
    """
    self.db.changeTable(self.raw_proxy_queue)
    raw_proxy = self.db.pop()
    # Start and complete markers are logged once per run; the start marker
    # used to be repeated for every popped record.
    self.log.info('ProxyRefreshSchedule: %s start validProxy' % time.ctime())
    while raw_proxy:
        addr = "%s:%s" % (raw_proxy.get('ip'), raw_proxy.get('port'))
        if validUsefulProxy(addr):
            self.db.changeTable(self.useful_proxy_queue)
            self.db.put(raw_proxy)
            self.log.info('ProxyRefreshSchedule: %s validation pass' % addr)
        else:
            self.log.info('ProxyRefreshSchedule: %s validation fail' % addr)
        # Switch back to the raw queue before taking the next record.
        self.db.changeTable(self.raw_proxy_queue)
        raw_proxy = self.db.pop()
    self.log.info('ProxyRefreshSchedule: %s validProxy complete' % time.ctime())
def validProxy(self):
    """
    Re-validate every proxy in the useful queue, forever.

    A passing proxy has its counter incremented, a failing one decremented.
    After the update the counter is re-read and the proxy is deleted once it
    is below -1. When the store is empty the loop sleeps 100 seconds. A
    heartbeat is logged after every pass.

    :return: never returns
    """
    while True:
        self.db.changeTable(self.useful_proxy_queue)
        each_proxys = self.db.getAll()
        # Debug output goes through the logger; the former print statements
        # were Python-2-only syntax.
        self.log.debug(u'验证所有ip %s' % (each_proxys,))
        if not each_proxys:
            time.sleep(100)

        for each_proxy in each_proxys:
            if isinstance(each_proxy, bytes):
                each_proxy = each_proxy.decode('utf-8')
            self.log.debug(u'验证ip: %s' % (each_proxy,))

            if validUsefulProxy(each_proxy):
                # Success: counter + 1.
                self.db.inckey(each_proxy, 1)
                self.log.debug(
                    'validProxy_b: {} validation pass'.format(each_proxy))
            else:
                # Failure: counter - 1.
                self.db.inckey(each_proxy, -1)
                self.log.info(
                    'validProxy_b: {} validation fail'.format(each_proxy))

            value = self.db.getvalue(each_proxy)
            if value and int(value) < -1:
                # Counter fell below -1: drop the proxy.
                self.db.delete(each_proxy)

        self.log.info('validProxy_a running normal')
def run(self):
    """
    Check every proxy waiting in ``self.queue``.

    Items are taken without blocking until the queue raises ``Empty``. The
    stored count is read from ``self.item_dict``. A passing proxy with a
    positive count is written back with the count lowered by one. A failing
    proxy is deleted when its count plus one reaches ``FAIL_COUNT``,
    otherwise written back with the count raised by one.

    :return: None
    """
    self.db.changeTable(self.useful_proxy_queue)
    while True:
        try:
            proxy = self.queue.get(block=False)
        except Empty:
            break

        try:
            raw_count = self.item_dict[proxy]
            # A stored None used to crash int() on the failure path.
            count = int(raw_count) if raw_count else 0

            if validUsefulProxy(proxy):
                # Validation passed: lower the count by one while positive.
                if count > 0:
                    self.db.put(proxy, num=count - 1)
                self.log.info('ProxyCheck: {} validation pass'.format(proxy))
            else:
                self.log.info('ProxyCheck: {} validation fail'.format(proxy))
                if raw_count and count + 1 >= FAIL_COUNT:
                    self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy))
                    self.db.delete(proxy)
                else:
                    self.db.put(proxy, num=count + 1)
        finally:
            # Always acknowledge the item so queue.join() cannot hang on an
            # error raised while checking it.
            self.queue.task_done()
def run(self):
    """
    Re-check every stored proxy every five minutes, forever.

    Each pass iterates the ``proxy -> count`` mapping returned by
    ``self.db.getAll()``. A passing proxy with a positive count is written
    back with the count lowered by one. A failing proxy is deleted when its
    count exceeds ``FAIL_COUNT``, otherwise written back with the count
    raised by one.

    :return: never returns
    """
    self.db.changeTable(self.useful_proxy_queue)
    while True:
        for proxy, raw_count in self.db.getAll().items():
            # A stored None used to crash int() on the failure path.
            count = int(raw_count) if raw_count else 0

            if validUsefulProxy(proxy):
                # Validation passed: lower the count by one while positive.
                if count > 0:
                    self.db.put(proxy, num=count - 1)
                self.log.info(
                    'ProxyCheck: {} validation pass'.format(proxy))
            else:
                self.log.info(
                    'ProxyCheck: {} validation fail'.format(proxy))
                if raw_count and count > FAIL_COUNT:
                    self.log.info(
                        'ProxyCheck: {} fail too many, delete!'.format(
                            proxy))
                    self.db.delete(proxy)
                else:
                    self.db.put(proxy, num=count + 1)
        sleep(60 * 5)
def validateProxy(self, proxy):
    """
    Check a single proxy by delegating to ``validUsefulProxy``.

    :param proxy: the proxy to check, passed through unchanged
    :return: whatever ``validUsefulProxy`` returns for ``proxy``
    """
    outcome = validUsefulProxy(proxy)
    return outcome