def validProxy(self):
        """Validate every proxy currently waiting in ``self.queue``.

        For each queued proxy that is not already in
        ``self.remaining_proxies`` the proxy is checked with
        ``validUsefulProxy``. When the HTTP result is truthy the proxy is
        saved through ``saveUsefulProxy`` (together with the HTTPS result),
        removed from the raw store and remembered in
        ``self.remaining_proxies``; otherwise its failure counter is ticked.
        A proxy that is already in ``self.remaining_proxies`` is only removed
        from the raw store. Every processed proxy also ticks the total
        counter, and a summary line is logged once the queue is empty.
        """
        thread_id = threading.current_thread().ident
        log.info("thread_id:{thread_id}, Start ValidProxy `raw_proxy_queue`".format(thread_id=thread_id))

        total = 0
        succ = 0
        fail = 0

        # NOTE(review): qsize() followed by a blocking get() can stall if
        # another consumer empties the queue in between -- confirm how many
        # workers share this queue.
        while self.queue.qsize():
            proxy = self.queue.get()
            if proxy not in self.remaining_proxies:
                (http_result, https_result) = validUsefulProxy(proxy)
                if http_result:
                    self.saveUsefulProxy(proxy, https_result)
                    self.deleteRawProxy(proxy)
                    self.remaining_proxies.append(proxy)

                    succ += 1
                    # Only report "pass" for proxies that really passed; this
                    # line used to run after the failure branch as well.
                    log.debug('ProxyRefreshSchedule: %s validation pass' % proxy)
                else:
                    self.tickRawProxyVaildFail(proxy)

                    fail += 1
                    log.debug('ProxyRefreshSchedule: %s validation fail' % proxy)
            else:
                # Already handled earlier in this run: just drop the raw entry.
                self.deleteRawProxy(proxy)

                log.debug('ProxyRefreshSchedule: %s repetition, skip!' % proxy)

            self.queue.task_done()
            self.tickRawProxyVaildTotal(proxy)
            total += 1

        log.info('thread_id:{thread_id}, ValidProxy Complete `raw_proxy_queue`, total:{total}, succ:{succ}, fail:{fail}'.format(thread_id=thread_id, total=total, succ=succ, fail=fail))
Beispiel #2
0
    def run(self):
        """Re-check every proxy waiting in ``self.queue``.

        Switches ``self.db`` to the ``self.useful_proxy_queue`` table, then
        validates each queued proxy with ``validUsefulProxy`` and ticks the
        success or failure counter according to the HTTP result. The total
        counter is ticked for every proxy, and start/end summary lines are
        logged with the current thread ident.
        """
        self.db.changeTable(self.useful_proxy_queue)
        worker_id = threading.currentThread().ident
        start_msg = "thread_id:{thread_id} useful_proxy proxy check start"
        log.info(start_msg.format(thread_id=worker_id))

        checked, passed, failed = 0, 0, 0
        while self.queue.qsize():
            candidate = self.queue.get()
            is_alive, _ignored = validUsefulProxy(candidate)
            if not is_alive:
                self.tickUsefulProxyVaildFail(candidate)
                failed += 1
                log.debug(
                    "ProxyCheck: {proxy} validation fail".format(
                        proxy=candidate))
            else:
                self.tickUsefulProxyVaildSucc(candidate)
                passed += 1
                log.debug(
                    "ProxyCheck: {proxy} validation pass".format(
                        proxy=candidate))

            self.queue.task_done()
            checked += 1
            self.tickUsefulProxyVaildTotal(candidate)

        end_msg = 'thread_id:{thread_id} proxy check end, total:{total}, succ:{succ}, fail:{fail}'
        log.info(end_msg.format(thread_id=worker_id,
                                total=checked,
                                succ=passed,
                                fail=failed))
Beispiel #3
0
    def start(self):
        """Verify the queued useful proxies with a bounded number of greenlets.

        Resets ``self.stat``, spawns ``self.run`` in as many greenlets as the
        smaller of the ``verify_useful_proxy_concurrency`` setting and the
        current queue size, waits for all of them and logs the counters held
        in ``self.stat`` together with the elapsed wall-clock seconds.
        """
        began_at = time.time()
        log.debug("useful_proxy proxy verify start")

        # NOTE(review): presumably updated by the workers; nothing in this
        # method changes it after the reset -- confirm against ``run``.
        self.stat = {"total": 0, "succ": 0, "fail": 0}

        limit = ConfigManager.setting_config.setting.get(
            "verify_useful_proxy_concurrency")
        pending = self.queue.qsize()
        # Never spawn more workers than there are queued items.
        worker_count = pending if limit > pending else limit

        workers = [gevent.spawn(self.run) for _ in range(worker_count)]
        gevent.joinall(workers)

        elapsed_time = int(time.time() - began_at)
        summary = 'useful_proxy verify proxy finish, total:{total}, succ:{succ}, fail:{fail}, elapsed_time:{elapsed_time}s'
        log.info(summary.format(total=self.stat["total"],
                                succ=self.stat["succ"],
                                fail=self.stat["fail"],
                                elapsed_time=elapsed_time))
Beispiel #4
0
    def update_job_interval(self, **kwargs):
        """Re-schedule a job with the interval stored in the settings.

        Reads ``job_name`` from ``kwargs``, looks up the setting of the same
        name and passes it as the ``minutes`` argument of an ``'interval'``
        trigger to ``self._update_job``.

        Returns:
            Whatever ``self._update_job`` returns for the job.
        """
        job_name = kwargs.get("job_name")
        minutes = ConfigManager.setting_config.setting.get(job_name)

        job = self._update_job(job_name, 'interval', minutes=minutes)
        message = "update_job_interval: {job_name}, {job}"
        log.info(message.format(job_name=job_name, job=job))
        return job
Beispiel #5
0
    def fetch(self):
        """Run one fetcher taken from ``self.queue`` and record its statistics.

        The queue item is a mapping that provides at least ``"name"`` and
        ``"interval"``. Every proxy yielded by the fetcher is stripped; a
        non-empty, well-formed proxy that is not yet known as a useful proxy
        is saved, anything else is counted as skipped. An exception raised
        while fetching is logged and counted as one failure. Afterwards the
        fetcher record is updated with the counters and the next fetch time,
        and a summary line is logged.
        """
        start_time = time.time()
        total = 0
        succ = 0
        fail = 0
        skip = 0

        fetcher = self.queue.get()
        name = fetcher["name"]

        fetcher_class = FetcherManager.getFetcherClass(name)
        log.debug("fetch [{name}] proxy start".format(name=name))
        try:
            f = fetcher_class()
            for proxy in f.run():
                proxy = proxy.strip()
                if proxy and verifyProxyFormat(proxy) and \
                        not ProxyManager.proxy_manager.checkUsefulProxyExists(proxy):

                    ProxyManager.proxy_manager.saveUsefulProxy(proxy)
                    succ += 1
                    log.debug("fetch [{name}] proxy {proxy} succ".format(
                        name=name, proxy=proxy))
                else:
                    skip += 1
                    log.debug("fetch [{name}] proxy {proxy} skip".format(
                        name=name, proxy=proxy))

                total += 1
        except Exception as e:
            # Best effort: one broken fetcher must not abort the whole batch.
            log.error("fetch [{name}] proxy fail: {error}".format(name=name,
                                                                  error=e))
            fail += 1

        self.queue.task_done()

        now = int(time.time())
        elapsed_time = int(now - start_time)

        # NOTE(review): based on ``self.start_time`` (set outside this
        # method), not on the local ``start_time`` -- confirm this is intended.
        next_fetch_time = self.start_time + (fetcher["interval"] * 60)

        data = {
            "$inc": {
                "succ": succ,
                "fail": fail,
                "skip": skip,
                "total": total,
            },
            "$set": {
                "next_fetch_time": next_fetch_time,
            }
        }

        ProxyManager.proxy_manager.updateFetcher(name, data)
        # Adjacent literals instead of a backslash continuation inside the
        # string, which used to embed the next line's indentation in the
        # logged message.
        log.info(
            "fetch [{name:^15}] proxy finish, "
            "total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, "
            "elapsed_time:{elapsed_time}s".format(
                name=name, total=total, succ=succ, fail=fail, skip=skip,
                elapsed_time=elapsed_time))
Beispiel #6
0
    def fetch(self):
        """Run one fetcher named by the next ``self.queue`` item.

        Each proxy yielded by the fetcher is stripped; a non-empty,
        well-formed proxy that exists neither as a raw nor as a useful proxy
        is stored as a raw proxy, anything else is counted as skipped. An
        exception raised while fetching is logged and counted as one failure.
        The counters are then handed to
        ``ConfigManager.fetcher_config.update_stat`` and logged.
        """
        began_at = time.time()
        counters = {"total": 0, "succ": 0, "fail": 0, "skip": 0}

        fetcher_name = self.queue.get()
        fetcher_class = FetcherManager.get_class(fetcher_name)
        log.debug("fetch [{fetcher_name}] proxy start".format(
            fetcher_name=fetcher_name))
        try:
            fetcher = fetcher_class()
            for raw in fetcher.run():
                proxy = raw.strip()
                is_new = proxy and verifyProxyFormat(proxy) and not (
                    proxy_manager.checkRawProxyExists(proxy)
                    or proxy_manager.checkUsefulProxyExists(proxy))
                if is_new:
                    proxy_manager.saveRawProxy(proxy)
                    counters["succ"] += 1
                    outcome = "fetch [{fetcher_name}] proxy {proxy} succ"
                else:
                    counters["skip"] += 1
                    outcome = "fetch [{fetcher_name}] proxy {proxy} skip"
                log.debug(outcome.format(fetcher_name=fetcher_name,
                                         proxy=proxy))

                counters["total"] += 1
        except Exception as e:
            log.error("fetch [{fetcher_name}] proxy fail: {error}".format(
                fetcher_name=fetcher_name, error=e))
            counters["fail"] += 1

        elapsed_time = int(time.time() - began_at)

        self.queue.task_done()

        # Hand over a copy so the values logged below cannot be affected by
        # whatever update_stat does with its argument.
        ConfigManager.fetcher_config.update_stat(fetcher_name, dict(counters))

        summary = "fetch [{fetcher_name}] proxy finish, total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, elapsed_time:{elapsed_time}s"
        log.info(summary.format(fetcher_name=fetcher_name,
                                total=counters["total"],
                                succ=counters["succ"],
                                fail=counters["fail"],
                                skip=counters["skip"],
                                elapsed_time=elapsed_time))
 def main(self):
     """Validate useful proxies forever, refilling the queue between rounds.

     Fills the queue once, then loops: while the queue has items the
     validation routine is run; once it is empty the method sleeps for
     ``config.BASE.verify_useful_proxy_interval`` minutes and refills the
     queue. This method never returns.
     """
     self.putQueue()
     while True:
         if not self.queue.empty():
             log.info("Start Valid useful_proxy proxy")
             self.__validProxy()
         else:
             # Report the configured interval; the message used to claim a
             # fixed 5 minutes regardless of the setting.
             interval = config.BASE.verify_useful_proxy_interval
             log.info('Valid Complete, Sleep {interval} Min!'.format(
                 interval=interval))
             time.sleep(60 * interval)
             self.putQueue()
Beispiel #8
0
    def run(self):
        """Clean the raw proxy store and log how many entries were removed.

        The count is read before cleaning; the remaining number that is
        logged is that count minus the number reported by ``cleanRawProxy``.
        """
        before = proxy_manager.getRawProxyNumber()
        removed = proxy_manager.cleanRawProxy()

        report = "clean raw_proxy, total_number:{total_number}, clean_number:{clean_number}, remain_number:{remain_number}"
        log.info(report.format(total_number=before,
                               clean_number=removed,
                               remain_number=before - removed))
Beispiel #9
0
    def get(self):
        """Update the useful proxy named by the ``proxy`` request argument.

        Calls ``proxy_manager.updateUsefulProxy`` with the proxy and a
        ``quality`` of ``-1`` plus an empty info dict.

        Returns:
            A dict whose ``"data"`` entry is the value returned by
            ``updateUsefulProxy``.
        """
        params = {"proxy": self.args.get('proxy'), "quality": -1}
        log.info("receive params: {}".format(params))

        item = proxy_manager.updateUsefulProxy(params, {})
        log.info("delete {}".format(item))

        return {"data": item}
Beispiel #10
0
    def run(self):
        """Clean the useful proxy store and log the result.

        Passes the ``hold_useful_proxy_number`` setting to
        ``cleanUsefulProxy`` as ``hold_number``; the total is read before
        cleaning.
        """
        keep = ConfigManager.setting_config.setting.get(
            "hold_useful_proxy_number")
        before = proxy_manager.getUsefulProxyNumber()
        removed = proxy_manager.cleanUsefulProxy(hold_number=keep)

        report = "clean useful, total_number:{total_number}, clean_number:{clean_number}, hold_number:{hold_number}"
        log.info(report.format(total_number=before,
                               clean_number=removed,
                               hold_number=keep))
Beispiel #11
0
    def start(self):
        """Spawn one ``self.fetch`` greenlet per queued item and wait for them.

        The greenlets are spawned through a pool sized by the
        ``fetch_new_proxy_concurrency`` setting. When the queue is empty only
        an informational line is logged.
        """
        concurrency = ConfigManager.setting_config.setting.get(
            "fetch_new_proxy_concurrency")
        task_pool = pool.Pool(concurrency)

        pending = self.queue.qsize()
        if pending <= 0:
            log.info("Not Have Fetcher Of Now, skip!")
            return

        workers = [task_pool.spawn(self.fetch) for _ in range(pending)]
        gevent.joinall(workers)
Beispiel #12
0
    def refresh(self):
        """Run every configured proxy getter and store the new raw proxies.

        The getter names are the options of the ``ProxyGetter`` config
        section; each one is looked up on ``GetFreeProxy`` and called. A
        stripped, non-empty, well-formed proxy that is not yet a raw proxy is
        saved and counted as a success, anything else is logged as an error
        and counted as a failure. An exception raised by one getter is logged
        and the next getter is tried.
        """
        for getter_name in config.cf.options("ProxyGetter"):
            try:
                log.info(
                    "Fetch Proxy Start, func:{func}".format(func=getter_name))

                succ = 0
                fail = 0
                getter = getattr(GetFreeProxy, getter_name.strip())
                for raw in getter():
                    proxy = raw.strip()
                    accepted = (proxy and verifyProxyFormat(proxy)
                                and not self.checkRawProxyExists(proxy))
                    if accepted:
                        self.saveRawProxy(proxy)
                        succ += 1
                        log.debug('{func}: fetch proxy {proxy}'.format(
                            func=getter_name, proxy=proxy))
                    else:
                        fail += 1
                        log.error('{func}: fetch proxy {proxy} error'.format(
                            func=getter_name, proxy=proxy))

                # Every yielded proxy lands in exactly one of the two counters.
                summary = "fetch proxy end, func:{func}, total:{total}, succ:{succ} fail:{fail}"
                log.info(summary.format(func=getter_name,
                                        total=succ + fail,
                                        succ=succ,
                                        fail=fail))

            except Exception as e:
                log.error(
                    "func_name:{func_name} fetch proxy fail, error:{error}".
                    format(func_name=getter_name, error=e))