예제 #1
0
 def __init__(self, queue: Queue):
     """Bind the work queue and select the storage backend.

     ``DATA_STORE_TYPE`` is read from the Flask config inside the
     scheduler app's context. ``'mysql'`` selects the ``sql`` handle; any
     other value selects a ``RedisHelper`` on ``jobs.PROXY_VALID_KEY``.

     :param queue: queue stored on the instance as ``self.queue``.
     """
     self.queue = queue
     # current_app is only usable while an application context is pushed.
     with schedule.app.app_context():
         self.store_type = current_app.config.get('DATA_STORE_TYPE')
     uses_mysql = self.store_type == 'mysql'
     self.db = sql if uses_mysql else RedisHelper(jobs.PROXY_VALID_KEY)
예제 #2
0
def FetchJob():
    """Collect proxies from every fetcher on ``FetchFreeProxy`` and store them.

    Each plain function found on ``FetchFreeProxy`` is called without
    arguments and iterated. Every yielded value is stripped, checked with
    ``verifyProxyFormat`` and de-duplicated within this run. Accepted
    proxies are written to the ``ProxyRaw`` table when ``DATA_STORE_TYPE``
    is ``'mysql'``, otherwise to the ``RedisHelper`` store keyed by
    ``jobs.PROXY_RAW_KEY``.

    A fetcher that raises is logged (including the exception) and skipped
    so the remaining fetchers still run.
    """
    import inspect

    # Reach the Flask app through the scheduler; current_app needs its context.
    app = schedule.app
    with app.app_context():
        store_type = current_app.config.get('DATA_STORE_TYPE')
    use_mysql = store_type == 'mysql'
    db = sql if use_mysql else RedisHelper(jobs.PROXY_RAW_KEY)

    member_list = inspect.getmembers(FetchFreeProxy,
                                     predicate=inspect.isfunction)
    # Proxy *strings* accepted so far. The membership test below compares
    # strings, so strings (not Proxy objects) must be stored here for the
    # duplicate check to ever match.
    proxy_set = set()
    for func_name, func in member_list:
        log.debug(u"开始获取代理: {}".format(func_name))
        try:
            for proxy in func():
                proxy = proxy.strip()
                if not proxy or not verifyProxyFormat(proxy):
                    log.error('ProxyFetch - {func}: '
                              '{proxy} illegal'.format(func=func_name,
                                                       proxy=proxy.ljust(20)))
                    continue
                if proxy in proxy_set:
                    log.debug('ProxyFetch - {func}: '
                              '{proxy} exist'.format(func=func_name,
                                                     proxy=proxy.ljust(20)))
                    continue
                log.debug('ProxyFetch - {func}: '
                          '{proxy} success'.format(func=func_name,
                                                   proxy=proxy.ljust(20)))
                last_time = datetime.datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S")
                # Persist the accepted proxy in the configured backend.
                if use_mysql:
                    record = ProxyRaw(name=func_name,
                                      proxy=proxy,
                                      https=False,
                                      gmt_create=last_time,
                                      gmt_modified=last_time)
                    with app.app_context():
                        db.session.add(record)
                        db.session.commit()
                else:
                    p = Proxy(name=func_name, proxy=proxy, last_time=last_time)
                    db.add(p.proxy, p.Json)
                # Remember it so later fetchers do not insert it again.
                proxy_set.add(proxy)
        except Exception as e:
            # Best effort: one broken fetcher must not stop the others,
            # but the cause is reported instead of being dropped.
            log.error(u"代理获取函数 {} 运行出错!".format(func_name))
            log.error('ProxyFetch - {func}: {err!r}'.format(func=func_name,
                                                           err=e))
    # Summary for this run.
    log.debug('本次插入代理总数: %s', len(proxy_set))
예제 #3
0
def checkRawProxyJob():
    """Validate every stored raw proxy using worker processes.

    Loads all raw proxies (``ProxyRaw`` rows when ``DATA_STORE_TYPE`` is
    ``'mysql'``, otherwise everything in the ``RedisHelper`` store keyed by
    ``jobs.PROXY_RAW_KEY``), pushes them onto a queue, clears the raw
    store, and then runs one ``CheckProcess`` worker per CPU reported by
    ``getCpuNumber()`` against that queue. Returns without doing anything
    when there are no raw proxies.
    """
    app = schedule.app
    with app.app_context():
        store_type = current_app.config.get('DATA_STORE_TYPE')
    use_mysql = store_type == 'mysql'
    db = sql if use_mysql else RedisHelper(jobs.PROXY_RAW_KEY)

    if use_mysql:
        with app.app_context():
            rows = db.session.query(ProxyRaw).all()
            # Missing counters are normalised to 0.
            data = [Proxy(name=row.name, proxy=row.proxy, https=row.https,
                          success=row.success or 0, fail=row.fail or 0,
                          total=row.total or 0, quality=row.quality or 0,
                          last_time=row.gmt_modified) for row in rows]
    else:
        data = db.getAll()
    if not data:
        return

    # Fill the queue up front; the workers only consume from it.
    proxy_queue = Queue()
    for proxy in data:
        proxy_queue.put(proxy)
        # The original author noted a "BrokenPipeError: [Errno 32] Broken
        # pipe" that made the queue exit at this point; the short pause is
        # presumably the workaround for it.
        time.sleep(0.01)

    # Clear the temporary raw store now that its content is queued.
    if use_mysql:
        with app.app_context():
            db.session.query(ProxyRaw).delete()
            # Commit explicitly, as the other writers in this module do;
            # otherwise the delete is discarded with the session.
            db.session.commit()
    else:
        db.clear()

    # Start the worker processes that validate the queued proxies.
    process_list = []
    for _ in range(getCpuNumber()):
        # Pass the bound method itself. Calling run() here would execute
        # the whole check in this process and hand Process a None target.
        process = Process(target=CheckProcess(proxy_queue).run)
        process.daemon = True
        process_list.append(process)
    for work in process_list:
        work.start()
    # One None sentinel per worker tells it to stop.
    for _ in process_list:
        proxy_queue.put(None)

    for work in process_list:
        work.join()
    log.debug("RawProxyCheck - 本次验证执行结束,验证数量: {}".format(len(data)))
예제 #4
0
 def get(self):
     """Return every valid proxy as a JSON response.

     Rows come from the ``ProxyValid`` table when ``DATA_STORE_TYPE`` is
     ``'mysql'`` and are serialized with ``serialize()``; otherwise they
     come from the ``RedisHelper`` store keyed by ``jobs.PROXY_VALID_KEY``
     and are converted with ``Proxy.fromJson``. When nothing is stored the
     response code is set to ``WebResponseCode.NO_RECORD``.

     :return: the result of ``WebResponse.tojson()``.
     """
     response = WebResponse()
     use_mysql = current_app.config.get('DATA_STORE_TYPE') == 'mysql'
     if use_mysql:
         rows = sql.session.query(ProxyValid).all()
     else:
         rows = RedisHelper(jobs.PROXY_VALID_KEY).getAll()

     if not rows:
         response.code = WebResponseCode.NO_RECORD
     elif use_mysql:
         response.data = [row.serialize() for row in rows]
     else:
         response.data = [Proxy.fromJson(row) for row in rows]
     return response.tojson()
예제 #5
0
class CheckProcess(object):
    """Queue consumer that validates raw proxies and stores those that pass."""

    def __init__(self, queue: Queue):
        """Bind the work queue and select the storage backend.

        ``DATA_STORE_TYPE`` is read from the Flask config inside the
        scheduler app's context. ``'mysql'`` selects the ``sql`` handle;
        any other value selects a ``RedisHelper`` on
        ``jobs.PROXY_VALID_KEY``.

        :param queue: queue of proxies to validate.
        """
        self.queue = queue
        with schedule.app.app_context():
            self.store_type = current_app.config.get('DATA_STORE_TYPE')
        uses_mysql = self.store_type == 'mysql'
        self.db = sql if uses_mysql else RedisHelper(jobs.PROXY_VALID_KEY)

    def run(self):
        """Drain the queue, validating each proxy and saving the valid ones.

        Stops when the queue reports empty or a ``None`` sentinel is read.
        Proxies that pass ``validateProxy()`` are written to the valid
        store; failures are only logged.
        """
        uses_mysql = self.store_type == 'mysql'
        if not uses_mysql:
            self.db.change(jobs.PROXY_VALID_KEY)

        while not self.queue.empty():
            proxy_data = self.queue.get()
            if proxy_data is None:
                break
            # Non-mysql entries are converted with Proxy.fromJson; mysql
            # entries are used as queued.
            candidate = proxy_data if uses_mysql else Proxy.fromJson(proxy_data)
            proxy, status = candidate.validateProxy()
            if not status:
                log.error(
                    'RawProxyCheck - {}  : {}, into time: {} validation fail'.format(proxy.name, proxy.proxy.ljust(20),
                                                                                     proxy.last_time))
                continue

            # Save the validated proxy to the configured store.
            if uses_mysql:
                proxy_valid = ProxyValid(name=proxy.name, proxy=proxy.proxy, https=proxy.https,
                                         proxy_type=proxy.type, china=proxy.china, location=proxy.location,
                                         success=proxy.success, fail=proxy.fail, total=proxy.total,
                                         quality=proxy.quality,
                                         last_status=proxy.last_status, gmt_modified=proxy.last_time)
                with schedule.app.app_context():
                    self.db.session.add(proxy_valid)
                    self.db.session.commit()
            else:
                self.db.add(proxy.proxy, proxy.Json)
            log.debug('RawProxyCheck - {}  : {} validation pass'.format(proxy.name, proxy.proxy.ljust(20)))
class CheckProcess(object):
    """Queue consumer that re-validates stored proxies.

    Proxies that still validate, or whose ``fail`` counter is below
    ``FAIL_COUNT``, are kept and their statistics refreshed; the others
    are removed from the valid store.
    """

    def __init__(self, queue: Queue):
        """Bind the work queue and select the storage backend.

        ``DATA_STORE_TYPE`` is read from the Flask config inside the
        scheduler app's context. ``'mysql'`` selects the ``sql`` handle;
        any other value selects a ``RedisHelper`` on
        ``jobs.PROXY_VALID_KEY``.

        :param queue: queue of proxies to re-validate.
        """
        self.queue = queue
        app = schedule.app
        with app.app_context():
            self.store_type = current_app.config.get('DATA_STORE_TYPE')
        if self.store_type == 'mysql':
            self.db = sql
        else:
            self.db = RedisHelper(jobs.PROXY_VALID_KEY)

    def run(self):
        """Drain the queue, keeping or dropping each proxy.

        Stops when the queue reports empty or a ``None`` sentinel is read.
        """
        uses_mysql = self.store_type == 'mysql'
        if not uses_mysql:
            self.db.change(jobs.PROXY_VALID_KEY)
        while True:
            if self.queue.empty():
                break
            proxy_data = self.queue.get()
            if proxy_data is None:
                break
            # Non-mysql entries are converted with Proxy.fromJson; mysql
            # entries are used as queued.
            proxyObj = proxy_data if uses_mysql else Proxy.fromJson(proxy_data)
            proxy, status = proxyObj.validateProxy()
            if status or proxy.fail < FAIL_COUNT:
                self._keep(proxy)
            else:
                self._drop(proxy)

    def _keep(self, proxy):
        """Write the refreshed statistics of a retained proxy to the store."""
        if self.store_type != 'mysql':
            if self.db.exists(proxy.proxy):
                log.debug('ValidProxyCheck - {}  : {} validation exists'.format(proxy.name,
                                                                               proxy.proxy.ljust(20)))
            self.db.add(proxy.proxy, proxy.Json)
        else:
            with schedule.app.app_context():
                proxy_info = self.db.session.query(ProxyValid).filter(
                    ProxyValid.proxy == proxy.proxy).first()
                if proxy_info is None:
                    # first() returns None when no row matches; there is
                    # nothing to update, so skip instead of raising
                    # AttributeError and ending the worker loop.
                    log.debug('ValidProxyCheck - {}  : {} not found, update skipped'.format(
                        proxy.name, proxy.proxy.ljust(20)))
                else:
                    proxy_info.success = proxy.success
                    proxy_info.fail = proxy.fail
                    proxy_info.total = proxy.total
                    proxy_info.quality = proxy.quality
                    proxy_info.last_status = proxy.last_status
                    proxy_info.gmt_modified = proxy.last_time
                    self.db.session.commit()
        log.debug('ValidProxyCheck - {}  : {} validation pass'.format(proxy.name, proxy.proxy.ljust(20)))

    def _drop(self, proxy):
        """Remove a proxy that failed validation from the store."""
        log.debug('ValidProxyCheck - {}  : {} validation fail'.format(proxy.name, proxy.proxy.ljust(20)))
        if self.store_type != 'mysql':
            self.db.delete(proxy.proxy)
        else:
            with schedule.app.app_context():
                self.db.session.query(ProxyValid).filter(
                    ProxyValid.proxy == proxy.proxy).delete()
                # Commit explicitly, like the update path; otherwise the
                # delete is discarded with the session.
                self.db.session.commit()
예제 #7
0
 def get(self):
     """Return one randomly chosen proxy as a JSON response.

     Sample requests::

         /get
         /get?type=valid
         /get?type=raw

     The ``type`` query argument defaults to ``'valid'``; any other value
     selects the raw (unprocessed) proxies. With ``'mysql'`` storage, valid
     proxies are limited to ``quality == 100`` and ordered by ``total``
     descending, and raw proxies are ordered by ``gmt_modified``
     descending. When nothing is stored the response code is set to
     ``WebResponseCode.NO_RECORD``.

     :return: the result of ``WebResponse.tojson()``.
     """
     # Decide whether unprocessed (raw) data was requested.
     wanted = request.args.get('type') or 'valid'
     want_valid = wanted == 'valid'
     response = WebResponse()
     use_mysql = current_app.config.get('DATA_STORE_TYPE') == 'mysql'

     if use_mysql:
         if want_valid:
             best = sql.session.query(ProxyValid).filter(
                 ProxyValid.quality == 100)
             data = best.order_by(desc(ProxyValid.total)).all()
         else:
             newest = sql.session.query(ProxyRaw).order_by(
                 desc(ProxyRaw.gmt_modified))
             data = newest.all()
     else:
         key = jobs.PROXY_VALID_KEY if want_valid else jobs.PROXY_RAW_KEY
         data = RedisHelper(key).getAll()

     if not data:
         response.code = WebResponseCode.NO_RECORD
         return response.tojson()

     picked = random.choice(data)
     response.data = picked.serialize() if use_mysql else json.loads(picked)
     return response.tojson()
예제 #8
0
 def get(self):
     """Return one random valid proxy whose ``total`` and ``success`` are 1.

     Records are read from the ``ProxyValid`` table when
     ``DATA_STORE_TYPE`` is ``'mysql'``, otherwise from the
     ``RedisHelper`` store keyed by ``jobs.PROXY_VALID_KEY``. Each record
     is converted to its serialized form first (``serialize()`` for
     database rows, ``json.loads`` for Redis entries), so the filter works
     for both backends. When no record matches, or nothing is stored, the
     response code is set to ``WebResponseCode.NO_RECORD``.

     :return: the result of ``WebResponse.tojson()``.
     """
     response = WebResponse()
     store_type = current_app.config.get('DATA_STORE_TYPE')
     if store_type == 'mysql':
         rows = sql.session.query(ProxyValid).all()
         # Serialize each row once instead of twice inside the filter.
         records = [row.serialize() for row in rows]
     else:
         raw_items = RedisHelper(jobs.PROXY_VALID_KEY).getAll() or []
         # Redis entries are JSON text and have no serialize() method;
         # presumably each decodes to a dict - confirm against Proxy.Json.
         records = [json.loads(raw) for raw in raw_items]

     candidates = [
         record for record in records
         if record.get('total') == 1 and record.get('success') == 1
     ]
     if candidates:
         response.data = random.choice(candidates)
     else:
         # Also covers "records exist but none match": random.choice
         # raises IndexError on an empty sequence.
         response.code = WebResponseCode.NO_RECORD
     return response.tojson()
예제 #9
0
 def get(self):
     """Report how many raw and valid proxies are stored.

     Counts come from the ``ProxyValid`` and ``ProxyRaw`` tables when
     ``DATA_STORE_TYPE`` is ``'mysql'``. Otherwise a single
     ``RedisHelper`` is measured under ``jobs.PROXY_VALID_KEY`` and then,
     after ``change()``, under ``jobs.PROXY_RAW_KEY``.

     :return: the result of ``WebResponse.tojson()`` with data
         ``{'raw': ..., 'valid': ...}``.
     """
     response = WebResponse()
     if current_app.config.get('DATA_STORE_TYPE') == 'mysql':
         valid_number = sql.session.query(ProxyValid).count()
         raw_number = sql.session.query(ProxyRaw).count()
     else:
         store = RedisHelper(jobs.PROXY_VALID_KEY)
         valid_number = store.len()
         # Re-point the same helper at the raw key before counting again.
         store.change(jobs.PROXY_RAW_KEY)
         raw_number = store.len()
     response.data = {'raw': raw_number, 'valid': valid_number}
     return response.tojson()