def run(self):
    while True:
        self.proxies.clear()
        msg = 'IPProxyPool----->>>>>>>>beginning'
        sys.stdout.write(msg + "\r\n")
        sys.stdout.flush()
        # re-check every proxy already in the DB, one greenlet per proxy
        proxylist = sqlhelper.select()
        myip = getMyIP()
        spawns = []
        for proxy in proxylist:
            spawns.append(gevent.spawn(detect_from_db, myip, proxy, self.proxies))
        gevent.joinall(spawns)
        self.db_proxy_num.value = len(self.proxies)
        msg = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)
        if len(self.proxies) < MINNUM:
            msg += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM, start crawling...'
            sys.stdout.write(msg + "\r\n")
            sys.stdout.flush()
            # too few live proxies: run every parser in parserList through the crawl pool
            self.crawl_pool.map(self.crawl, parserList)
        else:
            msg += '\r\nIPProxyPool----->>>>>>>>now ip num meets the requirement, wait UPDATE_TIME...'
            sys.stdout.write(msg + "\r\n")
            sys.stdout.flush()
            time.sleep(UPDATE_TIME)
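# run() fans the re-validation out with gevent greenlets. Below is a minimal,
# self-contained sketch of the same spawn/joinall pattern; the check function
# and data are illustrative stand-ins, not part of the project:
import gevent

def check(item, results):
    # stand-in for detect_from_db: keep items that pass a test
    if item % 2 == 0:
        results.append(item)

results = []
jobs = [gevent.spawn(check, n, results) for n in range(10)]
gevent.joinall(jobs)   # block until every greenlet has finished
print(results)         # [0, 2, 4, 6, 8]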
def update(self, conditions=None, value=None):
    if conditions and value:
        from validator.Validator import getMyIP, checkSped
        selfip = getMyIP()
        speeds = checkSped(selfip, conditions)
        if speeds:
            if speeds[0] < 50 or speeds[1] < 50:
                # map the caller's keys onto ORM columns, then apply the update
                condition_list = []
                for key in list(conditions.keys()):
                    if self.params.get(key, None):
                        condition_list.append(self.params.get(key) == conditions.get(key))
                conditions = condition_list
                query = self.session.query(Proxy)
                for condition in conditions:
                    query = query.filter(condition)
                updatevalue = {}
                for key in list(value.keys()):
                    if self.params.get(key, None):
                        updatevalue[self.params.get(key, None)] = value.get(key)
                updateNum = query.update(updatevalue)
                self.session.commit()
            else:
                # both measured speeds are >= 50, i.e. too slow: drop the proxy instead
                self.delete(conditions)
                updateNum = None
        else:
            # the proxy no longer answers the speed check: drop it
            self.delete(conditions)
            updateNum = None
    else:
        updateNum = 0
    return {'updateNum': updateNum}
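# A hedged usage sketch for update() above. It assumes `sqlhelper` is an
# instance of the class defining it, and that 'ip', 'port' and 'speed' are
# keys present in self.params; the values here are illustrative only:
result = sqlhelper.update(
    conditions={'ip': '127.0.0.1', 'port': 8080},  # which rows to match
    value={'speed': 5},                            # columns to overwrite
)
print(result)  # e.g. {'updateNum': 1}; {'updateNum': None} means the proxy was deleted instead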
def store_data(queue2, db_proxy_num):
    successNum = 0
    failNum = 0
    while True:
        try:
            proxy = queue2.get(timeout=300)
            if proxy:
                from validator.Validator import getMyIP, checkSped
                selfip = getMyIP()
                speeds = checkSped(selfip, proxy)
                value2 = [speeds, config.GOAL_HTTPS_LIST]
                print(value2)
                if speeds:
                    sqlhelper.insert(proxy, value2)
                    successNum += 1
                else:
                    failNum += 1
                msg = 'IPProxyPool----->>>>>>>>Success ip num :%d, Fail ip num:%d' % (
                    successNum, failNum)
                sys.stdout.write(msg + "\r")
                sys.stdout.flush()
        except BaseException:
            # queue2.get timed out (or another error): fold the crawler's DB count
            # into the tally, report it, and reset the counters
            if db_proxy_num.value != 0:
                successNum += db_proxy_num.value
                db_proxy_num.value = 0
            msg = 'IPProxyPool----->>>>>>>>Success ip num :%d, Fail ip num:%d' % (
                successNum, failNum)
            sys.stdout.write(msg + "\r")
            sys.stdout.flush()
            successNum = 0
            failNum = 0
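# store_data() depends on queue2.get(timeout=300) raising queue.Empty when
# nothing arrives in time. A minimal sketch of that timeout behavior (the
# 1-second timeout is illustrative):
from multiprocessing import Queue
import queue

q = Queue()
try:
    item = q.get(timeout=1)   # blocks for up to 1 s, then raises queue.Empty
except queue.Empty:
    print('no item within the timeout')  # the branch store_data uses to flush its tally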
def main():
    myip = getMyIP()
    DB_PROXY_NUM = Value('i', 0)
    q1 = Queue(maxsize=TASK_QUEUE_SIZE)   # crawler -> validator
    q2 = Queue()                          # validator -> store_data
    p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM, myip, 'FREE'))
    p2 = Process(target=validator, args=(q1, q2, myip, 'FREE'))
    p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM, 'FREE'))
    p1.start()
    p2.start()
    p3.start()
    p1.join()
    p2.join()
    p3.join()
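# Data flow implied by the Process/Queue wiring above (a sketch inferred from
# this code, not project documentation):
#
#   startProxyCrawl --q1--> validator --q2--> store_data --> DB via sqlhelper
#
# DB_PROXY_NUM is a shared multiprocessing.Value('i'): the crawler's run()
# loop stores the number of proxies still alive in the DB, and store_data
# folds that count into its success tally when its queue read times out.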
# coding:utf-8
from multiprocessing import Value, Queue, Process
from api.apiServer import start_api_server
from db.DataStore import store_data
from validator.Validator import validator, getMyIP
from spider.ProxyCrawl import startProxyCrawl
from config import TASK_QUEUE_SIZE

if __name__ == "__main__":
    myip = getMyIP()
    DB_PROXY_NUM = Value('i', 0)
    q1 = Queue(maxsize=TASK_QUEUE_SIZE)
    q2 = Queue()
    #p0 = Process(target=start_api_server)
    p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM, myip))
    p2 = Process(target=validator, args=(q1, q2, myip))
    p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM))
    #p0.start()
    p1.start()
    p2.start()
    p3.start()
    #p0.join()
    p1.join()
    p2.join()
    p3.join()
from gevent import monkey  # needed for monkey.patch_all() below; missing from the snippet
from util.watcher import Watcher
monkey.patch_all()

from db.DataStore import store_data
from multiprocessing import Value, Queue, Process
from api.apiServer import start_api_server
from spider.ProxyCrawl import startProxyCrawl
from validator.Validator import validator, getMyIP
from loguru import logger

if __name__ == "__main__":
    #Watcher()
    my_current_ip = getMyIP()
    logger.info("My Current IP:{}", my_current_ip)
    DB_PROXY_NUM = Value('i', 0)
    q1 = Queue()
    q2 = Queue()
    # control dict for the internal processes that validate ip/port
    process_pool = {}
    p0 = Process(target=start_api_server)
    p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM))
    p2 = Process(target=validator, args=(q1, q2, my_current_ip, process_pool))
    p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM))
    try:
        p0.start()
        # the snippet was truncated here; the remaining start/join calls are an
        # assumed completion mirroring the other entry points in this section
        p1.start()
        p2.start()
        p3.start()
        p0.join()
        p1.join()
        p2.join()
        p3.join()
    except KeyboardInterrupt:
        for p in (p0, p1, p2, p3):
            p.terminate()
# coding:utf-8
from multiprocessing import Value, Queue, Process
from api.apiServer import start_api_server
from db.DataStore import store_data
from validator.Validator import validator, getMyIP
from spider.ProxyCrawl import startProxyCrawl
from config import TASK_QUEUE_SIZE

if __name__ == "__main__":
    myip = getMyIP()
    DB_PROXY_NUM = Value('i', 0)
    q1 = Queue(maxsize=TASK_QUEUE_SIZE)
    q2 = Queue()
    p0 = Process(target=start_api_server)
    p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM, myip))
    p2 = Process(target=validator, args=(q1, q2, myip))
    p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM))
    p0.start()
    p1.start()
    p2.start()
    p3.start()
    p0.join()
    p1.join()
    p2.join()
    p3.join()