Example #1
0
    def verify(self):
        """Verify one queued useful proxy and record the outcome.

        Takes one item from ``self.queue``, checks its ``proxy`` with the
        custom verifier when ``custom_verify_url`` is configured (otherwise
        with the default verifier), ticks the success/fail/total counters on
        the proxy manager, updates ``self.stat`` and schedules the proxy's
        next verification relative to ``self.start_time``.
        """
        entry = self.queue.get()
        proxy = entry.get("proxy")

        # A configured custom URL switches the whole check to the custom verifier.
        use_custom = ConfigManager.setting_config.setting.get("custom_verify_url")
        checker = self.customVerifyProxy if use_custom else self.defaultVerifyProxy
        passed = checker(proxy)

        if not passed:
            ProxyManager.proxy_manager.tickUsefulProxyVaildFail(proxy)
            self.stat["fail"] += 1
            log.debug("useful_proxy:{proxy} verify fail".format(proxy=proxy))
        else:
            # Only rewrite the stored record when checkProxyInfo asks for it.
            if self.checkProxyInfo(entry):
                self.updateUsefulProxy(entry)

            ProxyManager.proxy_manager.tickUsefulProxyVaildSucc(proxy)
            self.stat["succ"] += 1
            log.debug("useful_proxy:{proxy} verify succ".format(proxy=proxy))

        self.queue.task_done()

        manager = ProxyManager.proxy_manager
        manager.tickUsefulProxyVaildTotal(proxy)
        manager.updateUsefulProxyNextVerifyTime(proxy, self.start_time)
        self.stat["total"] += 1
Example #2
0
    def defaultVerifyProxy(self, proxy):
        """Check *proxy* by fetching ``http://httpbin.org/ip`` through it.

        Returns:
            ``True`` when the response has status 200 and its JSON body
            contains ``"origin"``; ``False`` when the request or the JSON
            decoding raises; ``None`` when a response was parsed but one of
            the two checks did not hold.
        """
        if isinstance(proxy, bytes):
            proxy = proxy.decode('utf8')

        try:
            response = requests.get(
                "http://httpbin.org/ip",
                proxies={"http": proxy},
                timeout=10,
                verify=False,
            )
            payload = response.json()

            # Both checks are evaluated unconditionally so that an odd
            # payload type still surfaces as an exception (-> False).
            ok_status = response.status_code == 200
            has_origin = "origin" in payload
            if ok_status and has_origin:
                return True

        except Exception as e:
            log.debug(
                "proxy:{proxy} http verify proxy fail, error:{error}".format(
                    proxy=proxy, error=e))
            return False

        return None
Example #3
0
    def getQualityUsefulProxy(self, **kwargs):
        """Return useful proxies ranked by their success rate.

        Builds an aggregation pipeline that keeps documents whose ``total``
        is non-zero and whose ``last_status`` equals
        ``ProxyManager.PROXY_LAST_STATUS["SUCC"]``, projects ``proxy``,
        ``total`` and ``succ_rate`` (``succ / total``), and sorts by
        ``succ_rate`` then ``total``, both descending.

        Keyword Args:
            https: when truthy, only documents whose ``https`` equals it.
            type: when truthy, only documents whose ``type`` equals it.
            region: when truthy, converted by ``parse_regin_to_mongo`` and
                applied as the ``region_list`` condition.

        Returns:
            Whatever ``self.mc.aggregate`` returns for the pipeline.
        """
        https = kwargs.get("https", None)
        region = kwargs.get("region", None)
        type_ = kwargs.get("type", None)

        match_filter = {
            # Excluding total == 0 also keeps the $divide below from
            # dividing by zero.
            "total": {"$ne": 0},
            "last_status": {"$eq": ProxyManager.PROXY_LAST_STATUS["SUCC"]},
        }

        if https:
            match_filter["https"] = {"$eq": https}

        if type_:
            match_filter["type"] = {"$eq": type_}

        if region:
            match_filter["region_list"] = parse_regin_to_mongo(region)

        operation_list = [
            {
                "$match": match_filter,
            },
            {
                "$project": {"proxy": 1, "total": 1, "succ_rate": {"$divide": ["$succ", "$total"]}},
            },
            {
                "$sort": {"succ_rate": -1, "total": -1},
            },
        ]

        # Log under this method's own name so the pipeline can be attributed
        # correctly when reading debug output.
        log.debug("getQualityUsefulProxy, operation_list:{operation_list}, ".format(operation_list=str(operation_list)))
        return self.mc.aggregate(operation_list)
Example #4
0
    def customVerifyProxy(self, proxy):
        """Check *proxy* against the configured ``custom_verify_url``.

        The URL is fetched through the proxy for both http and https. When
        ``custom_verify_content`` is configured, the UTF-8 decoded response
        body must also match it as a regular expression.

        Returns:
            ``True`` when the status is 200 and the optional content check
            passes; ``False`` when anything inside the request/check raises;
            ``None`` when a response arrived but a check did not hold.
        """
        if isinstance(proxy, bytes):
            proxy = proxy.decode('utf8')

        target_url = ConfigManager.setting_config.setting.get("custom_verify_url")
        proxy_map = {"http": proxy, "https": proxy}

        try:
            response = requests.get(target_url, proxies=proxy_map, timeout=10, verify=False)

            # Without a configured pattern the content check passes trivially.
            body_matches = True
            expected = ConfigManager.setting_config.setting.get("custom_verify_content")
            if expected:
                body = response.content.decode('utf-8')
                body_matches = re.search(expected, body) is not None

            if response.status_code == 200 and body_matches:
                return True

        except Exception as e:
            log.debug("proxy:{proxy} http verify proxy fail, error:{error}".format(proxy=proxy, error=e))
            return False

        return None
Example #5
0
    def getSampleUsefulProxy(self, **kwargs):
        """Return one randomly sampled useful proxy document, or ``None``.

        The pipeline keeps documents with a non-zero ``total`` whose
        ``last_status`` equals ``ProxyManager.PROXY_LAST_STATUS["SUCC"]``,
        optionally narrowed by the truthy ``https``, ``type`` and ``region``
        keyword arguments, then applies ``$sample`` with size 1.
        """
        criteria = {
            "total": {"$ne": 0},
            "last_status": {"$eq": ProxyManager.PROXY_LAST_STATUS["SUCC"]},
        }

        https = kwargs.get("https", None)
        if https:
            criteria["https"] = {"$eq": https}

        type_ = kwargs.get("type", None)
        if type_:
            criteria["type"] = {"$eq": type_}

        region = kwargs.get("region", None)
        if region:
            criteria["region_list"] = parse_regin_to_mongo(region)

        pipeline = [
            {"$match": criteria},
            {"$sample": {"size": 1}},
        ]

        log.debug("getSampleUsefulProxy, operation_list:{operation_list}, ".format(operation_list=str(pipeline)))
        rows = self.mc.aggregate(pipeline)

        # An empty/falsy aggregate result means nothing matched.
        return rows[0] if rows else None
Example #6
0
    def getQualityUsefulProxy(self, **kwargs):
        """Return useful proxies ordered by their stored ``quality``.

        Builds an aggregation pipeline that keeps documents whose ``total``
        is non-zero and sorts them by ``quality`` then ``total``, both
        descending.

        Keyword Args:
            https: when truthy, only documents whose ``https`` equals it.
            type: when truthy, only documents whose ``type`` equals it.
            region: when truthy, converted by ``parse_regin_to_mongo`` and
                applied as the ``region_list`` condition.

        Returns:
            Whatever ``self.mc.aggregate`` returns for the pipeline.
        """
        https = kwargs.get("https", None)
        region = kwargs.get("region", None)
        type_ = kwargs.get("type", None)

        match_filter = {
            "total": {"$ne": 0},
        }

        if https:
            match_filter["https"] = {"$eq": https}

        if type_:
            match_filter["type"] = {"$eq": type_}

        if region:
            match_filter["region_list"] = parse_regin_to_mongo(region)

        operation_list = [
            {
                "$match": match_filter,
            },
            {
                "$sort": {"quality": -1, "total": -1},
            },
        ]

        # Log under this method's own name so the pipeline can be attributed
        # correctly when reading debug output.
        log.debug("getQualityUsefulProxy, operation_list:{operation_list}, ".format(operation_list=str(operation_list)))
        return self.mc.aggregate(operation_list)
Example #7
0
    def getAllValidUsefulProxy(self, **kwargs):
        """Return every useful proxy document with a non-zero ``total``.

        The truthy ``https``, ``type`` and ``region`` keyword arguments add
        further conditions to the ``$match`` stage. The return value is
        whatever ``self.mc.aggregate`` yields for that single-stage pipeline.
        """
        criteria = {"total": {"$ne": 0}}

        https = kwargs.get("https", None)
        if https:
            criteria["https"] = {"$eq": https}

        type_ = kwargs.get("type", None)
        if type_:
            criteria["type"] = {"$eq": type_}

        region = kwargs.get("region", None)
        if region:
            criteria["region_list"] = parse_regin_to_mongo(region)

        pipeline = [{"$match": criteria}]

        log.debug("getAllValidUsefulProxy, operation_list:{operation_list}, ".format(operation_list=str(pipeline)))
        return self.mc.aggregate(pipeline)
Example #8
0
    def start(self):
        """Run the useful-proxy verification pass and log a summary.

        Resets ``self.stat``, spawns greenlets running ``self.run`` (no more
        than the configured ``verify_useful_proxy_concurrency`` and no more
        than the number of queued items), waits for all of them, then logs
        the totals and the elapsed wall-clock seconds.
        """
        began = time.time()
        log.debug("useful_proxy proxy verify start")

        self.stat = {"total": 0, "succ": 0, "fail": 0}

        concurrency = ConfigManager.setting_config.setting.get(
            "verify_useful_proxy_concurrency")
        pending = self.queue.qsize()
        # Never spawn more workers than there are queued items.
        worker_count = pending if concurrency > pending else concurrency

        workers = [gevent.spawn(self.run) for _ in range(worker_count)]
        gevent.joinall(workers)

        elapsed_time = int(time.time() - began)
        summary = 'useful_proxy verify proxy finish, total:{total}, succ:{succ}, fail:{fail}, elapsed_time:{elapsed_time}s'
        log.info(summary.format(total=self.stat["total"],
                                succ=self.stat["succ"],
                                fail=self.stat["fail"],
                                elapsed_time=elapsed_time))
Example #9
0
    def run(self):
        """Validate every proxy currently in ``self.queue``.

        Switches the db to ``self.useful_proxy_queue``, then drains the queue:
        each proxy is checked with ``validUsefulProxy`` and its success or
        failure tick plus its total tick are recorded. Start and end are
        logged with the current thread id and the local counters.
        """
        self.db.changeTable(self.useful_proxy_queue)
        # current_thread() is the supported spelling; the camelCase
        # currentThread() alias is deprecated since Python 3.10.
        thread_id = threading.current_thread().ident
        log.info("thread_id:{thread_id} useful_proxy proxy check start".format(
            thread_id=thread_id))

        total = 0
        succ = 0
        fail = 0
        while self.queue.qsize():
            proxy = self.queue.get()
            # Only the HTTP result decides pass/fail here; the second
            # element of the tuple is ignored.
            (http_result, _) = validUsefulProxy(proxy)
            if http_result:
                self.tickUsefulProxyVaildSucc(proxy)
                succ += 1
                log.debug(
                    "ProxyCheck: {proxy} validation pass".format(proxy=proxy))
            else:
                self.tickUsefulProxyVaildFail(proxy)
                fail += 1
                log.debug(
                    "ProxyCheck: {proxy} validation fail".format(proxy=proxy))

            self.queue.task_done()
            total += 1
            self.tickUsefulProxyVaildTotal(proxy)

        log.info(
            'thread_id:{thread_id} proxy check end, total:{total}, succ:{succ}, fail:{fail}'
            .format(thread_id=thread_id, total=total, succ=succ, fail=fail))
Example #10
0
    def getSampleProxy(self, **kwargs):
        """Return the ``proxy`` field of one sampled useful proxy, or ``None``.

        All keyword arguments are forwarded to
        ``self.db.getSampleUsefulProxy``.
        """
        sampled = self.db.getSampleUsefulProxy(**kwargs)
        address = sampled["proxy"] if sampled else None

        log.debug("getSampleUsefulProxy, item:{item}".format(item=str(sampled)))

        return address
Example #11
0
    def getProxyInfo(self, proxy):
        """Probe *proxy* against httpbin.org and describe what it supports.

        Args:
            proxy: proxy address; it is split on ``:`` into ip and port.

        Returns:
            dict with the keys ``ip``, ``port``, ``address``, ``https`` and
            ``type``. ``https`` and ``type`` start as the ``UNKNOWN`` members
            of ``ProxyManager.PROXY_HTTPS`` / ``ProxyManager.PROXY_TYPE``.
            ``type`` is set from the HTTP probe whenever its JSON body has an
            ``origin`` field. ``https`` is only probed after a successful
            HTTP probe: ``ENABLE`` when the HTTPS response has status 200 and
            its own JSON body contains ``origin``, ``DISABLE`` when the HTTPS
            probe raises.
        """
        info = {}

        parts = proxy.split(':')
        info["ip"] = parts[0]
        info["port"] = parts[1]
        info["address"] = proxy

        proxies = {
            "http": proxy,
            "https": proxy,
        }
        http_url = "http://httpbin.org/ip"
        https_url = "https://httpbin.org/ip"

        http_ok = False

        info["https"] = ProxyManager.PROXY_HTTPS["UNKNOWN"]
        info["type"] = ProxyManager.PROXY_TYPE["UNKNOWN"]

        # http verify
        try:
            r = requests.get(http_url, proxies=proxies, timeout=10, verify=False)
            http_data = r.json()
            # A missing "origin" raises KeyError here and is handled below as
            # a failed probe.
            ip_list = http_data["origin"].split(", ")

            if r.status_code == 200 and "origin" in http_data:
                http_ok = True

            # More than one address in "origin" is classified as CLEAR, a
            # single address as ANONYMOUS.
            if len(ip_list) > 1:
                info["type"] = ProxyManager.PROXY_TYPE["CLEAR"]
            else:
                info["type"] = ProxyManager.PROXY_TYPE["ANONYMOUS"]

        except Exception as e:
            log.debug("proxy:[{proxy}] http verify fail, error:{error}".format(proxy=proxy, error=e))
            http_ok = False

        if http_ok:

            # https verify
            try:
                r = requests.get(https_url, proxies=proxies, timeout=10, verify=False)
                # Parse the HTTPS response itself instead of reusing the HTTP
                # payload, so the content check reflects this request.
                https_data = r.json()
                if r.status_code == 200 and "origin" in https_data:
                    info["https"] = ProxyManager.PROXY_HTTPS["ENABLE"]

            except Exception as e:
                log.debug("proxy [{proxy}] https verify fail, error:{error}".format(proxy=proxy, error=e))
                info["https"] = ProxyManager.PROXY_HTTPS["DISABLE"]

        return info
Example #12
0
    def check_fetch_new_proxy(self):
        """Decide whether new raw proxies should be fetched.

        Returns ``True`` when the stored raw-proxy count is below the
        configured ``hold_raw_proxy_number`` or when that setting is ``-1``;
        otherwise ``False``. The decision is logged at debug level.
        """
        total_number = ProxyManager.proxy_manager.getRawProxyNumber()
        hold_number = ConfigManager.setting_config.setting.get("hold_raw_proxy_number")

        if not (total_number < hold_number or hold_number == -1):
            log.debug("fetch new proxy skip, exist raw_proxy total_number:{total_number}, hold_number:{hold_number}".format(total_number=total_number, hold_number=hold_number))
            return False

        log.debug("fetch new proxy start, exist raw_proxy total_number:{total_number}, hold_number:{hold_number}".format(total_number=total_number, hold_number=hold_number))
        return True
Example #13
0
    def getSampleUsefulProxy(self, **kwargs):
        """Return the ``proxy`` field of one sampled useful proxy, or ``None``.

        All keyword arguments are forwarded to
        ``self.db.getSampleUsefulProxy``. When a proxy is found and a truthy
        ``token`` keyword was supplied, the pair is recorded through
        ``self.db.addProxyUsedToken``.
        """
        sampled = self.db.getSampleUsefulProxy(**kwargs)

        if sampled:
            address = sampled["proxy"]
            token = kwargs.get("token", None)
            if token:
                self.db.addProxyUsedToken(address, token)
        else:
            address = None

        log.debug("getSampleUsefulProxy, item:{item}".format(item=str(sampled)))

        return address
Example #14
0
    def get(self):
        """Return ``{"data": item}`` for one sampled useful proxy.

        The ``https``, ``type`` and ``region`` request arguments are passed
        on as filters. When an item is found its ``_id`` key is removed
        before it is returned; otherwise ``data`` carries the falsy result.
        """
        options = {key: self.args.get(key) for key in ("https", "type", "region")}
        log.debug("receive params: {}".format(options))

        item = proxy_manager.getSampleUsefulProxy(**options)
        if item:
            del item["_id"]

        return {"data": item}
Example #15
0
    def get(self):
        """Return a random key from the useful-proxy table, or ``None``.

        Switches the db to ``self.useful_proxy_queue``, reads everything with
        ``getAll`` and picks one key at random when any exist.
        """
        self.db.changeTable(self.useful_proxy_queue)
        stored = self.db.getAll()

        candidates = []
        if stored:
            # On Python 3 the key view is materialised as a list.
            candidates = list(stored.keys()) if EnvUtil.PY3 else stored.keys()

        chosen = random.choice(candidates) if candidates else None

        log.debug('Get Random Proxy {item} of {total}'.format(
            item=chosen, total=len(candidates)))
        return chosen
    def validProxy(self):
        """Validate every raw proxy currently in ``self.queue``.

        For each queued proxy not yet in ``self.remaining_proxies`` the proxy
        is checked with ``validUsefulProxy``: on HTTP success it is saved as
        useful, removed from the raw store and remembered in
        ``self.remaining_proxies``; on failure its raw fail tick is recorded.
        Proxies already in ``self.remaining_proxies`` are removed from the raw
        store and skipped. The total tick is recorded for every item.
        """
        # current_thread() is the supported spelling; the camelCase
        # currentThread() alias is deprecated since Python 3.10.
        thread_id = threading.current_thread().ident
        log.info("thread_id:{thread_id}, Start ValidProxy `raw_proxy_queue`".format(thread_id=thread_id))

        total = 0
        succ = 0
        fail = 0

        while self.queue.qsize():
            proxy = self.queue.get()
            if proxy not in self.remaining_proxies:
                (http_result, https_result) = validUsefulProxy(proxy)
                if http_result:
                    self.saveUsefulProxy(proxy, https_result)
                    self.deleteRawProxy(proxy)
                    self.remaining_proxies.append(proxy)

                    succ += 1
                    # Logged only on success, so a failed proxy is not
                    # reported as both "fail" and "pass".
                    log.debug('ProxyRefreshSchedule: %s validation pass' % proxy)
                else:
                    self.tickRawProxyVaildFail(proxy)

                    fail += 1
                    log.debug('ProxyRefreshSchedule: %s validation fail' % proxy)
            else:
                self.deleteRawProxy(proxy)

                log.debug('ProxyRefreshSchedule: %s repetition, skip!' % proxy)

            self.queue.task_done()
            self.tickRawProxyVaildTotal(proxy)
            total += 1

        log.info('thread_id:{thread_id}, ValidProxy Complete `raw_proxy_queue`, total:{total}, succ:{succ}, fail:{fail}'.format(thread_id=thread_id, total=total, succ=succ, fail=fail))
Example #17
0
    def verify(self):
        """Verify one queued raw proxy and promote it when it works.

        Takes one item from ``self.queue``. A proxy already present in
        ``self.useful_proxies`` is deleted from the raw store and counted as
        skipped. Otherwise it is checked with the custom verifier when
        ``custom_verify_url`` is configured (else the default verifier): on
        success it is saved as useful, deleted from the raw store and
        remembered; on failure its raw fail tick is recorded.
        ``self.stat`` is updated in every case.
        """
        queued = self.queue.get()
        raw_proxy = queued.get("proxy")
        if isinstance(raw_proxy, bytes):
            raw_proxy = raw_proxy.decode('utf8')

        if raw_proxy in self.useful_proxies:
            # Already known as useful: drop the raw duplicate.
            ProxyManager.proxy_manager.deleteRawProxy(raw_proxy)

            self.stat["skip"] += 1
            log.debug("raw_proxy:{raw_proxy} verify repetition".format(
                raw_proxy=raw_proxy))
        else:
            use_custom = ConfigManager.setting_config.setting.get("custom_verify_url")
            checker = self.customVerifyProxy if use_custom else self.defaultVerifyProxy

            if checker(raw_proxy):
                ProxyManager.proxy_manager.saveUsefulProxy(raw_proxy)
                ProxyManager.proxy_manager.deleteRawProxy(raw_proxy)
                self.useful_proxies[raw_proxy] = True

                self.stat["succ"] += 1
                log.debug("raw_proxy:{raw_proxy} verify succ".format(
                    raw_proxy=raw_proxy))
            else:
                ProxyManager.proxy_manager.tickRawProxyVaildFail(raw_proxy)

                self.stat["fail"] += 1
                log.debug("raw_proxy:{raw_proxy} verify fail".format(
                    raw_proxy=raw_proxy))

        self.queue.task_done()
        self.stat["total"] += 1
Example #18
0
    def refresh(self):
        """Run every configured proxy getter and store the new raw proxies.

        Getter names come from the ``ProxyGetter`` section of ``config.cf``
        and are resolved as attributes of ``GetFreeProxy``. Each yielded
        proxy is stripped; it is saved when it is non-empty, well-formed and
        not already stored as raw, otherwise it is counted as a failure. An
        exception raised by one getter is logged and the next getter runs.
        """
        for getter_name in config.cf.options("ProxyGetter"):
            try:
                log.info(
                    "Fetch Proxy Start, func:{func}".format(func=getter_name))

                total = succ = fail = 0
                fetch_func = getattr(GetFreeProxy, getter_name.strip())
                for candidate in fetch_func():
                    candidate = candidate.strip()
                    is_new = (candidate
                              and verifyProxyFormat(candidate)
                              and not self.checkRawProxyExists(candidate))
                    if is_new:
                        self.saveRawProxy(candidate)
                        succ += 1
                        log.debug('{func}: fetch proxy {proxy}'.format(
                            func=getter_name, proxy=candidate))
                    else:
                        fail += 1
                        log.error('{func}: fetch proxy {proxy} error'.format(
                            func=getter_name, proxy=candidate))

                    total += 1

                summary = "fetch proxy end, func:{func}, total:{total}, succ:{succ} fail:{fail}"
                log.info(summary.format(func=getter_name,
                                        total=total,
                                        succ=succ,
                                        fail=fail))

            except Exception as e:
                # One broken getter must not stop the remaining ones.
                log.error(
                    "func_name:{func_name} fetch proxy fail, error:{error}".
                    format(func_name=getter_name, error=e))
Example #19
0
    def fetch(self):
        """Run one queued fetcher and store the proxies it yields.

        Takes a fetcher record from ``self.queue`` (its ``name`` and
        ``interval`` keys are read), instantiates the class returned by
        ``FetcherManager.getFetcherClass`` and iterates ``run()``. Each
        stripped proxy that is non-empty, well-formed and not yet stored as
        useful is saved; anything else is counted as skipped. An exception
        while fetching is logged and counted as one failure.

        Afterwards the fetcher's counters are incremented and its
        ``next_fetch_time`` is set to ``self.start_time`` plus
        ``interval * 60`` through ``updateFetcher``.
        """
        start_time = time.time()
        total = 0
        succ = 0
        fail = 0
        skip = 0

        fetcher = self.queue.get()
        name = fetcher["name"]

        fetcher_class = FetcherManager.getFetcherClass(name)
        log.debug("fetch [{name}] proxy start".format(name=name))
        try:
            f = fetcher_class()
            for proxy in f.run():
                proxy = proxy.strip()
                if proxy and verifyProxyFormat(proxy) and \
                        not ProxyManager.proxy_manager.checkUsefulProxyExists(proxy):

                    ProxyManager.proxy_manager.saveUsefulProxy(proxy)
                    succ += 1
                    log.debug("fetch [{name}] proxy {proxy} succ".format(
                        name=name, proxy=proxy))
                else:
                    skip += 1
                    log.debug("fetch [{name}] proxy {proxy} skip".format(
                        name=name, proxy=proxy))

                total += 1
        except Exception as e:
            log.error("fetch [{name}] proxy fail: {error}".format(name=name,
                                                                  error=e))
            fail += 1

        self.queue.task_done()

        now = int(time.time())
        elapsed_time = int(now - start_time)

        next_fetch_time = self.start_time + (fetcher["interval"] * 60)

        data = {
            "$inc": {
                "succ": succ,
                "fail": fail,
                "skip": skip,
                "total": total,
            },
            "$set": {
                "next_fetch_time": next_fetch_time,
            }
        }

        ProxyManager.proxy_manager.updateFetcher(name, data)
        # Adjacent literals are joined at compile time; a backslash
        # continuation inside the string would embed the source indentation
        # in the emitted message.
        log.info(
            "fetch [{name:^15}] proxy finish, "
            "total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, elapsed_time:{elapsed_time}s"
            .format(name=name, total=total, succ=succ, fail=fail, skip=skip, elapsed_time=elapsed_time))
Example #20
0
    def fetch(self):
        """Run one queued fetcher and store the raw proxies it yields.

        Takes a fetcher name from ``self.queue``, instantiates the class
        returned by ``FetcherManager.get_class`` and iterates ``run()``. Each
        stripped proxy that is non-empty, well-formed and stored neither as
        raw nor as useful is saved as raw; anything else is counted as
        skipped. An exception while fetching is logged and counted as one
        failure. The counters are passed to
        ``ConfigManager.fetcher_config.update_stat`` and logged.
        """
        began = time.time()
        total = succ = fail = skip = 0

        fetcher_name = self.queue.get()
        fetcher_class = FetcherManager.get_class(fetcher_name)
        log.debug("fetch [{fetcher_name}] proxy start".format(
            fetcher_name=fetcher_name))
        try:
            source = fetcher_class()
            for proxy in source.run():
                proxy = proxy.strip()
                # Same short-circuit order as checking "valid and unknown".
                rejected = (not proxy
                            or not verifyProxyFormat(proxy)
                            or proxy_manager.checkRawProxyExists(proxy)
                            or proxy_manager.checkUsefulProxyExists(proxy))
                if rejected:
                    skip += 1
                    log.debug(
                        "fetch [{fetcher_name}] proxy {proxy} skip".format(
                            fetcher_name=fetcher_name, proxy=proxy))
                else:
                    proxy_manager.saveRawProxy(proxy)
                    succ += 1
                    log.debug(
                        "fetch [{fetcher_name}] proxy {proxy} succ".format(
                            fetcher_name=fetcher_name, proxy=proxy))

                total += 1
        except Exception as e:
            log.error("fetch [{fetcher_name}] proxy fail: {error}".format(
                fetcher_name=fetcher_name, error=e))
            fail += 1

        elapsed_time = int(time.time() - began)

        self.queue.task_done()

        stat = {"total": total, "succ": succ, "fail": fail, "skip": skip}
        ConfigManager.fetcher_config.update_stat(fetcher_name, stat)

        summary = "fetch [{fetcher_name}] proxy finish, total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, elapsed_time:{elapsed_time}s"
        log.info(summary.format(fetcher_name=fetcher_name,
                                elapsed_time=elapsed_time,
                                **stat))