Beispiel #1
0
def random_proxy():
    """Return one proxy chosen by ``LocalDict.random()``.

    A fresh ``LocalDict`` is created on every call, and the chosen value is
    logged at info level before it is handed back.

    :return: whatever ``LocalDict().random()`` yields (presumably an
        ``ip:port`` string -- confirm against ``LocalDict``).
    """
    picked = LocalDict().random()
    Log.info(f"ip: {picked}")
    return picked
Beispiel #2
0
def get_proxy():
    """Return the proxy selected by ``LocalDict.max()``.

    A fresh ``LocalDict`` is created on every call, and the selected value is
    logged at info level before it is handed back.

    :return: whatever ``LocalDict().max()`` yields (presumably the
        best-scored proxy -- confirm against ``LocalDict``).
    """
    best = LocalDict().max()
    Log.info(f"ip: {best}")
    return best
Beispiel #3
0
 def get_proxies(self, callback):
     """Run one crawl method by name and store the proxies it produces.

     :param callback: name of a zero-argument method on ``self`` that
         returns an iterable of proxies.
     :return: None. On success the collected list is passed to
         ``self.save_proxies``; if the crawl raises, the traceback is
         logged and nothing is saved.
     """
     proxies = []
     try:
         # Look the method up by name rather than building source text for
         # eval(): same result for a plain method name, but no arbitrary
         # expression can be executed through ``callback``.
         crawl = getattr(self, callback)
         for proxy in crawl():
             Log.debug(f'getter:成功获取到代理: {proxy}')
             proxies.append(proxy)
     except Exception:
         # Best-effort: a failing source must not take the getter down, but
         # KeyboardInterrupt/SystemExit are deliberately left to propagate.
         Log.error(f'getter:抓取代理异常,{traceback.format_exc()}')
         return
     self.save_proxies(proxies)
Beispiel #4
0
def decrease_proxy():
    """Decrease the stored score of the proxy named in the request.

    Reads the ``proxy`` query parameter from the current request, passes it
    together with ``MAX_SCORE`` to ``LocalDict.decrease`` and logs the proxy.
    NOTE(review): the log message describes this as a deletion; whether
    ``decrease(proxy, MAX_SCORE)`` actually removes the entry depends on
    ``LocalDict`` -- confirm.

    :return: the literal string ``"ok"``.
    """
    target = request.args.get("proxy")
    LocalDict().decrease(target, MAX_SCORE)
    Log.info(f"删除的ip为{target}")
    return "ok"
Beispiel #5
0
 def clear(self):
     """Remove every proxy whose stored value is zero or below.

     The whole operation runs while holding ``self.mutex``. Any exception
     raised during the clean-up is logged and swallowed.

     :return: None
     """
     with self.mutex:
         try:
             # Collect the keys first: popping from the dict while iterating
             # over it would raise RuntimeError.
             useless = [key for key, value in self.proxys.items() if value <= 0]
             for key in useless:
                 Log.info(f"清理值为0的无效ip:{key}")
                 self.proxys.pop(key)
         except Exception as e:
             # Report through the logger used by the rest of this method
             # instead of a bare print().
             Log.error(f"清理 ip 异常 {e}")
Beispiel #6
0
    def run(self):
        """Run one full test round over the stored proxies.

        Resets ``self._minus_count``, schedules one
        ``test_single_proxy`` task per (proxy, test URL) pair on
        ``self.factory``, calls ``self.local.clear()``, then waits for all
        scheduled tasks and logs a summary. Returns early when the store
        reports zero proxies.

        :return: None
        """
        pending = set()
        self._minus_count = 0
        count = self.local.count()
        if count == 0:
            Log.info("Tester:无代理")
            return

        Log.info(f'Tester:开始运行, 当前容量:{count}')
        try:
            batch = self.local.batch(0, max(0, count))
            # One task per proxy/URL combination, proxies in the outer loop.
            pairs = ((proxy, url) for proxy in batch for url in TEST_URLS)
            for proxy, url in pairs:
                task = self.factory.add(self.test_single_proxy, url, proxy)
                pending.add(task)

            # Runs right after scheduling, i.e. before the tasks are awaited.
            self.local.clear()
        except Exception as e:
            Log.error(f'Tester:发生错误 {e.args}')

        # Tasks scheduled before a failure are still awaited.
        self.factory.wait(pending)
        Log.info(f'Tester:执行结束, 测试前容量:{count}, 剩余:{count-self._minus_count}')
Beispiel #7
0
 def test_single_proxy(self, url, proxy):
     """Probe one URL through one proxy and adjust the proxy's score.

     Sends a HEAD request (15 s timeout, no redirects, no TLS verification)
     through ``proxy``. A status in ``VALID_STATUS_CODES`` leaves the score
     untouched; a status in ``FORBIDEN_STATUS_CODES`` or a
     timeout/connection/proxy error calls ``self._minus()`` and
     ``decrease(proxy, -MAX_SCORE)``; any other status, or a
     TypeError/AttributeError, calls ``decrease(proxy)``.

     :param url: address to request.
     :param proxy: proxy address without scheme; ``http://`` is prepended.
     :return: None
     """
     proxy_map = {"http": "http://" + proxy}
     try:
         status_code = requests.head(
             url,
             headers=base_headers,
             proxies=proxy_map,
             timeout=15,
             allow_redirects=False,
             verify=False,
         ).status_code

         # Happy path first: nothing to adjust for a valid response.
         if status_code in VALID_STATUS_CODES:
             Log.debug(f'Tester:代理可用 {proxy}')
             return

         if status_code in FORBIDEN_STATUS_CODES:
             self._minus()
             self.local.decrease(proxy, -MAX_SCORE)
         else:
             self.local.decrease(proxy)
         Log.error(f'Tester:请求响应码不合法 {status_code} ,IP {proxy}, URL: {url}')
     except (ReadTimeout, HTTPError, ProxyError, ConnectionError):
         self._minus()
         self.local.decrease(proxy, -MAX_SCORE)
         Log.warning(f'Tester:无用ip,直接删掉, ip: {proxy}')
     except (TypeError, AttributeError) as e:
         self.local.decrease(proxy)
         Log.error(f'Tester:代理请求失败 {proxy} ERROR: {e}')
Beispiel #8
0
def schedule_getter():
    """Run the proxy getter forever, one round every ``GETTER_CYCLE`` seconds.

    After each round (successful or failed) a garbage collection is forced
    and the loop sleeps for ``GETTER_CYCLE``. If a round raises, the
    traceback is logged and the ``Getter`` instance is replaced by a fresh
    one before the next round.

    This function never returns normally; it ends only when
    KeyboardInterrupt, SystemExit or another non-``Exception`` error
    propagates out of it.
    """
    getter = Getter()
    while True:
        try:
            getter.run()
        except Exception:
            # Catch Exception rather than everything so that Ctrl-C and
            # interpreter shutdown can actually stop the loop. Log first so
            # the traceback is not lost if rebuilding the getter fails too.
            Log.error(f'getter: 抓取代理异常, {traceback.format_exc()}')
            getter = Getter()
        gc.collect()
        time.sleep(GETTER_CYCLE)
Beispiel #9
0
def schedule_tester():
    """Run the proxy tester forever, one round every ``TESTER_CYCLE`` seconds.

    After each round (successful or failed) a garbage collection is forced
    and the loop sleeps for ``TESTER_CYCLE``. If a round raises, the
    traceback is logged and the ``Tester`` instance is replaced by a fresh
    one before the next round.

    This function never returns normally; it ends only when
    KeyboardInterrupt, SystemExit or another non-``Exception`` error
    propagates out of it.
    """
    tester = Tester()
    while True:
        try:
            tester.run()
        except Exception:
            # Catch Exception rather than everything so that Ctrl-C and
            # interpreter shutdown can actually stop the loop. Log first so
            # the traceback is not lost if rebuilding the tester fails too.
            Log.error(f'Tester: 测试代理异常, {traceback.format_exc()}')
            tester = Tester()
        gc.collect()
        time.sleep(TESTER_CYCLE)
Beispiel #10
0
 def shutdown(self):
     """Block until every worker reports the ``"fired"`` state.

     Workers are checked one after another; while a worker's ``wf`` is not
     ``"fired"`` the method polls every ``CHECK_FACTORY_STATUS`` seconds.

     :return: always ``True`` once all workers have left.
     """
     for worker in self.workers:
         # Poll this worker until it has left its run loop.
         while worker.wf != "fired":
             Log.debug(f"正在赶打工人{worker.wid}号离开")
             time.sleep(CHECK_FACTORY_STATUS)
         Log.debug(f"打工人{worker.wid}号离开")

     Log.info("正常关闭工厂")
     return True
Beispiel #11
0
    def run(self):
        """Run one crawl round: schedule every crawl function and wait.

        Does nothing when ``self.is_over_threshold()`` is true. Otherwise
        one ``self.crawler.get_proxies`` task is queued on ``self.factory``
        for each of the first ``__CrawlFuncCount__`` entries of
        ``self.crawler.__CrawlFunc__``; the method then waits for all
        queued tasks and logs the capacity before and after.

        :return: None
        """
        pending = set()
        count = self.local.count()
        if self.is_over_threshold():
            Log.info("Getter:此时容量已达上限,不获取ip")
            return
        Log.info(f'Getter:开始执行, 当前容量:{count}')
        for callback_label in range(self.crawler.__CrawlFuncCount__):
            try:
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Queue the crawl; the proxies are fetched by a worker.
                pending.add(self.factory.add(self.crawler.get_proxies, callback))
                sys.stdout.flush()
            except Exception:
                # One source failing to schedule must not stop the others,
                # but KeyboardInterrupt/SystemExit are no longer swallowed.
                traceback.print_exc()

        self.factory.wait(pending)
        Log.info(f'Getter:执行结束, 获取前容量:{count}, 当前:{self.local.count()}')
Beispiel #12
0
 def run(self):
     """Worker main loop: pull tasks from the factory and execute them.

     Sets ``self.wf`` to ``"doing"`` on entry and to ``"fired"`` on exit.
     The loop continues while both ``self.factory.factory_status`` and
     ``self.status`` are truthy. When no task is available the worker
     sleeps via ``self._sleep``; otherwise it runs the task, logs any
     exception, always reports the task as finished to the factory, and
     then waits on the internal running flag.

     :return: None
     """
     self.wf = "doing"
     while self.factory.factory_status and self.status:
         if not self.__running.is_set():
             Log.info("咋瓦鲁多")

         # Fetch the next task, if any.
         task = self.factory.get_task()
         if task:
             try:
                 self.sleep_count = 0
                 task.run()
             except Exception as err:
                 Log.error(f"打工人工作异常: {err}")
             finally:
                 self.factory.minus_active(task.name)
             # Returns at once when the flag is set; otherwise blocks until
             # it becomes set.
             self.__running.wait()
         else:
             # Idle path: sleep and go straight to the next iteration
             # without waiting on the running flag.
             self._sleep(self.wid)

     Log.debug(f"打工人{self.wid}你被炒鱿鱼了")
     self.wf = "fired"
Beispiel #13
0
def get_page(url, options=None, proxy=None):
    """Fetch a page and return its body text.

    The request is sent with ``base_headers`` merged with ``options``, TLS
    verification disabled and a 10 second timeout.

    :param url: address to fetch.
    :param options: optional mapping of extra request headers; entries
        override those in ``base_headers``. ``None`` means no extras.
    :param proxy: optional value passed as ``proxies`` to ``requests.get``.
    :return: the response text when the status code is 200; ``None`` for
        any other status or when a connection error / read timeout occurs.
    """
    # ``None`` instead of a shared ``{}`` default avoids the mutable-default
    # pitfall; an omitted or empty ``options`` behaves exactly as before.
    headers = dict(base_headers, **(options or {}))
    Log.debug(f'getter:正在抓取 {url}')
    try:
        response = requests.get(url,
                                headers=headers,
                                verify=False,
                                proxies=proxy,
                                timeout=10)
        Log.debug(f'getter:抓取成功 {url} {response.status_code}')
        if response.status_code == 200:
            return response.text
    except (ConnectionError, requests.exceptions.ReadTimeout):
        Log.error(f'getter:抓取失败 {url}')
        return None
    # Non-200 response: make the "no content" result explicit.
    return None
Beispiel #14
0
 def _sleep(self, wid):
     """Record one more idle round and pause for ``WORKER_SLEEP`` seconds.

     :param wid: worker id, used only in the debug log message.
     :return: None
     """
     self.sleep_count = self.sleep_count + 1
     Log.debug(f"打工人{wid}号,沉睡第{self.sleep_count}次")
     time.sleep(WORKER_SLEEP)