Example #1
 def get_raw_proxies(self, callback):
     proxies = []
     Logger.log_normal('Callback %s' % callback)
     # resolve the crawl method by name instead of eval-ing a string
     for proxy in getattr(self, callback)():
         Logger.log_normal('Getting %s from %s' % (proxy, callback))
         proxies.append(proxy)
     return proxies
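
Here callback is simply the name of one of the crawler's own crawl_* methods (Example #5 iterates those names via __CrawlFunc__). A hedged sketch of how such names can be registered and resolved, assuming a metaclass-based crawler; the class and method names below are illustrative, not the project's confirmed API:

class ProxyMetaclass(type):
    # Collect every crawl_* method name into __CrawlFunc__ at class creation,
    # so get_raw_proxies(callback) can resolve it by name with getattr.
    def __new__(mcs, name, bases, attrs):
        attrs['__CrawlFunc__'] = [k for k, v in attrs.items()
                                  if k.startswith('crawl_') and callable(v)]
        attrs['__CrawlFuncCount__'] = len(attrs['__CrawlFunc__'])
        return super().__new__(mcs, name, bases, attrs)


class FreeProxyGetter(metaclass=ProxyMetaclass):
    def crawl_example_source(self):
        # stand-in source; a real method would parse a proxy listing page
        yield '127.0.0.1:8080'

    def get_raw_proxies(self, callback):
        # same lookup as above, minus the logging
        return list(getattr(self, callback)())


crawler = FreeProxyGetter()
for name in crawler.__CrawlFunc__:
    print(name, crawler.get_raw_proxies(name))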
Example #2
 def test(self):
     """
     Asynchronously test every proxy in _raw_proxies.
     """
     Logger.log_normal('VaildityTester is working')
     try:
         loop = asyncio.get_event_loop()
         tasks = [
             self.test_single_proxy(proxy) for proxy in self._raw_proxies
         ]
         loop.run_until_complete(asyncio.wait(tasks))
     except ValueError:
         Logger.log_fail('Async Error')
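
The except ValueError branch matters because asyncio.wait() raises ValueError when it is handed an empty task set, i.e. when _raw_proxies is empty. A minimal, self-contained sketch of the same pattern; the check coroutine is a placeholder rather than the project's code:

import asyncio


async def check(proxy):
    await asyncio.sleep(0)              # stand-in for a real request through the proxy
    print('checked', proxy)


async def run_all(proxies):
    tasks = [asyncio.ensure_future(check(p)) for p in proxies]
    try:
        await asyncio.wait(tasks)
    except ValueError:
        # asyncio.wait() rejects an empty task set, which is exactly what the
        # handler in test() guards against
        print('nothing to check')


asyncio.run(run_all(['127.0.0.1:8080']))
asyncio.run(run_all([]))                # exercises the ValueError branch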
Example #3
 def valid_proxy(cycle=VALID_CHECK_CYCLE):
     """从redis里面获取一半的代理
     """
     conn = RedisClient()
     tester = VaildityTester()
     while True:
         Logger.log_high('Refreshing ip')
         count = int(0.5 * conn.queue_len)
         if count == 0:
             Logger.log_normal('Waiting for adding')
             time.sleep(cycle)
             continue
         raw_proxies = conn.get(count)
         tester.set_raw_proxies(raw_proxies)
         tester.test()
         time.sleep(cycle)
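
The loop relies on two parts of the RedisClient surface: a queue_len property and get(count). A hedged sketch of a compatible implementation backed by a Redis list; the key name and internals are assumptions, not necessarily the project's real ones:

import redis


class RedisClient:
    def __init__(self, host='localhost', port=6379, key='proxies'):
        self._db = redis.StrictRedis(host=host, port=port)
        self._key = key

    @property
    def queue_len(self):
        # number of proxies currently waiting to be validated
        return self._db.llen(self._key)

    def get(self, count=1):
        # pop `count` proxies from the head of the list; values come back as
        # bytes, which is why Example #4 decodes them before use
        proxies = self._db.lrange(self._key, 0, count - 1)
        self._db.ltrim(self._key, count, -1)
        return proxies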
Example #4
 async def test_single_proxy(self, proxy):
     """
     Test a single proxy; if it is usable, add it to _usable_proxies.
     """
     async with aiohttp.ClientSession() as session:
         try:
             if isinstance(proxy, bytes):
                 proxy = proxy.decode('utf-8')
             real_proxy = 'http://' + proxy
             Logger.log_normal('Testing %s' % proxy)
             async with session.get(self.test_api,
                                    proxy=real_proxy,
                                    timeout=15) as resp:
                 if resp.status == 200:
                     self._conn.put(proxy)
                     Logger.log_high('Valid proxy %s' % proxy)
         except Exception:
             # any failure (timeout, refused connection, bad response) simply
             # means the proxy is discarded
             pass
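
To try the same check outside the class, a self-contained sketch; the test URL, the sample proxy, and the helper name check_proxy are placeholders:

import asyncio

import aiohttp


async def check_proxy(proxy, test_api='http://httpbin.org/get'):
    # Mirrors test_single_proxy without the Redis side effect: True means the
    # proxy answered the test request with HTTP 200 within the timeout.
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(test_api,
                                   proxy='http://' + proxy,
                                   timeout=15) as resp:
                return resp.status == 200
        except Exception:
            return False


print(asyncio.run(check_proxy('127.0.0.1:8080')))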
Example #5
    def add_to_queue(self):
        """
        Ask the crawlers for a batch of untested proxies, test them, and add
        the ones that pass to the proxy pool.
        """

        Logger.log_normal('PoolAdder is working')
        proxy_count = 0
        while not self.is_over_threshold():
            for callback_label in range(self._crawler.__CrawlFuncCount__):
                callback = self._crawler.__CrawlFunc__[callback_label]
                raw_proxies = self._crawler.get_raw_proxies(callback)

                # test crawled proxies
                self._tester.set_raw_proxies(raw_proxies)
                self._tester.test()

                proxy_count += len(raw_proxies)
                if self.is_over_threshold():
                    Logger.log_high('IP is enough, waiting to be used')
                    break
            # if a full pass over every crawl source produced nothing, give up
            if proxy_count == 0:
                raise ResourceDepletionError
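
Neither is_over_threshold() nor ResourceDepletionError is defined in these snippets. A hedged sketch of plausible definitions, using an assumed pool cap and the RedisClient sketched after Example #3:

POOL_UPPER_THRESHOLD = 10000            # assumed upper limit on pool size


class ResourceDepletionError(Exception):
    """Raised when a full pass over every crawl source yields no proxies."""


class PoolAdder:
    def __init__(self, threshold=POOL_UPPER_THRESHOLD):
        self._threshold = threshold
        self._conn = RedisClient()      # as sketched after Example #3
        # self._crawler and self._tester omitted here for brevity

    def is_over_threshold(self):
        # stop crawling once the pool already holds enough proxies
        return self._conn.queue_len >= self._threshold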
Example #6
def get_page(url, options=None):
    # merge per-call headers over the shared defaults without a mutable default argument
    headers = dict(base_header, **(options or {}))
    Logger.log_normal('Getting %s' % url)
    try:
        r = requests.get(url, headers=headers)
        Logger.log_high('Getting result %s %s' % (url, r.status_code))
        if r.status_code == 200:
            return r.text
    except requests.ConnectionError:
        Logger.log_fail('Crawling Failed %s' % url)
        return None
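
base_header is assumed to be a module-level dict of default request headers that each call's options is merged over. A small usage sketch with assumed values:

import requests                          # get_page above relies on this import as well

# Assumed defaults; the real base_header lives elsewhere in the project.
base_header = {
    'User-Agent': 'Mozilla/5.0',
    'Accept-Encoding': 'gzip, deflate',
}

html = get_page('http://www.example.com', options={'Accept-Language': 'en-US'})
if html is None:
    print('request failed or did not return HTTP 200')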
Example #7
 def run(self):
     Logger.log_high('Ip processing running')
     valid_process = Process(target=Schedule.valid_proxy)
     check_process = Process(target=Schedule.check_pool)
     valid_process.start()
     check_process.start()
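
Only valid_proxy appears above; check_pool is assumed to be a sibling static method that tops the pool up (e.g. via the PoolAdder of Example #5). A hedged sketch of how run() might sit on the Schedule class, with the workers joined so the parent process stays alive; the cycle constants are assumed values:

from multiprocessing import Process

VALID_CHECK_CYCLE = 60                  # seconds; assumed value
POOL_LEN_CHECK_CYCLE = 20               # seconds; assumed name and value


class Schedule:
    @staticmethod
    def valid_proxy(cycle=VALID_CHECK_CYCLE):
        ...                             # as in Example #3

    @staticmethod
    def check_pool(cycle=POOL_LEN_CHECK_CYCLE):
        ...                             # assumed: calls PoolAdder when the pool runs low

    def run(self):
        valid_process = Process(target=Schedule.valid_proxy)
        check_process = Process(target=Schedule.check_pool)
        valid_process.start()
        check_process.start()
        # optional: block the parent so both workers keep the program alive
        valid_process.join()
        check_process.join()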
Example #8
 def __init__(self):
     self.logger = Logger()
     self.que = PriorityQueue()
     self.rdb = redis.StrictRedis(host=DB_HOST, port=DB_PORT, db=DB_ID)
     self.pipe = self.rdb.pipeline(transaction=True)
Example #9
class Scheduler:
    logger = Logger()

    def __init__(self):
        self.logger = Logger()
        self.que = PriorityQueue()
        self.rdb = redis.StrictRedis(host=DB_HOST, port=DB_PORT, db=DB_ID)
        self.pipe = self.rdb.pipeline(transaction=True)

    def start_spider(self):
        # prepare_task_items()
        settings = get_project_settings()
        process = CrawlerProcess(settings)
        process.crawl('common_spider')
        process.start()
        # configure_logging(settings)
        # runner = CrawlerRunner(settings)
        # print(all_spiders)
        # #for spider in all_spiders:
        # #d=runner.crawl(all_spiders[0])
        # d = runner.join()
        # d.addBoth(lambda _: reactor.stop())
        # reactor.run()

    def start_validator(self):
        settings = get_project_settings()
        process = CrawlerProcess(settings)
        process.crawl('baidu_validator')
        process.start()

    def schedule(self):
        self.logger.log('start schedule')
        # drop any tasks left over from a previous run
        while not self.que.empty():
            self.que.get()
        time_start = time.time()
        for taskname, task in TASKS.items():
            self.que.put(Task_dict(
                taskname,
                task,
                time_start,
            ))

        while not self.que.empty():
            time_now = time.time()
            task_dict = self.que.get()
            self.logger.log('now waiting for ' + task_dict.taskname)
            # not due yet: sleep until the task's scheduled start time
            if time_now < task_dict.start_time:
                time.sleep(task_dict.start_time - time_now)

            self.start_processing(task_dict.taskname, task_dict.task)

            time_now = time.time()
            # re-enqueue the task to run again after its interval (in minutes)
            self.que.put(
                Task_dict(
                    task_dict.taskname,
                    task_dict.task,
                    time_now + task_dict.task['interval'] * 60,
                ))
            # print(time_now+task['interval']*60, task)

    def start_processing(self, taskname, task):
        self.logger.log('\n' + "*" * 54)
        self.logger.log('%-20s%s' % (taskname, 'start'))

        task_queue = [taskname + DB_SPLIT_SYMBOL + x for x in task["resource"]]
        self.rdb.delete(DB_RAW_IPPOOL_NAME)
        self.pipe.lpush(DB_TASK_QUEUE_NAME, *task_queue)
        self.pipe.execute()

        self.logger.log('%-20s%s' % (taskname, 'crawling'))
        process = Process(target=self.start_spider)
        process.start()
        process.join()
        ippool_size = self.ippool_turn_raw()

        self.logger.log('%-20s%s' % (taskname, 'validating'))
        process = Process(target=self.start_validator)
        process.start()
        process.join()

        ippool_size_now = self.rdb.zcard(DB_IPPOOL_NAME)
        self.logger.log(
            '%-20s%s %03d\n' %
            (taskname, 'contribution', ippool_size_now - ippool_size))
        self.logger.log("*" * 54)

    def ippool_turn_raw(self):
        # move the whole validated pool into the raw set so it gets re-validated
        ippool = self.rdb.zrange(DB_IPPOOL_NAME, 0, -1)
        ippool_size = len(ippool)
        if ippool_size > 0:
            self.pipe.sadd(DB_RAW_IPPOOL_NAME, *ippool)
            self.pipe.delete(DB_IPPOOL_NAME)
        self.pipe.execute()
        return ippool_size
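
Task_dict and TASKS are used but never defined here. Since tasks flow through a PriorityQueue, Task_dict presumably orders by start_time; a hedged sketch of compatible definitions, where the field names follow the attribute accesses above and everything else is assumed:

from dataclasses import dataclass, field


@dataclass(order=True)
class Task_dict:
    # Only start_time takes part in ordering, so the PriorityQueue always
    # returns the task that is due next.
    taskname: str = field(compare=False)
    task: dict = field(compare=False)
    start_time: float


# Assumed shape of the task table: each task lists its crawl sources and a
# re-run interval in minutes, matching task['resource'] and task['interval'].
TASKS = {
    'free_sources': {
        'resource': ['source_a', 'source_b'],
        'interval': 30,
    },
}

# Usage mirroring schedule():
# que.put(Task_dict('free_sources', TASKS['free_sources'], time.time()))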