Exemple #1
0
from IPython import embed
import time
from storage_manager import StorageManager, RedisManager, RedisDetailQueue

# Interactive scratch script: registers a few proxies and two target queues,
# clones every detail currently held in queue 'q_1' into both targets,
# syncs to the database and then opens an IPython shell for inspection.
sm = StorageManager()

# Register three HTTP proxies on port 8080; new_proxy ends up bound to the
# last one created.
for proxy_host in ('1.1.1.3', '1.1.1.4', '1.1.1.5'):
    new_proxy = sm.create_proxy(proxy_host, 8080, 'http')

# Source queue for the cloning below, opened with active=False.
rdq1 = RedisDetailQueue(queue_key='q_1', active=False)

tq1 = sm.create_queue('https://www.google.com')
tq2 = sm.create_queue('https://www.bing.com')

# Detail-queue view over the first target queue (default `active` setting).
rdq2 = RedisDetailQueue(queue_key=tq1.queue_key)

# The length is read once up front, so exactly that many details are
# dequeued even if the queue changes while the loop runs.
pending = rdq1.length()
for i in range(pending):
    detail_to_clone = rdq1.dequeue()
    # Each dequeued detail is cloned into both target queues, tq1 first.
    for target_queue in (tq1, tq2):
        clone = sm.clone_detail(detail_to_clone, target_queue)

sm.sync_to_db()

# Drop into an IPython shell with the names defined above in scope.
embed()
Exemple #2
0
class ProxyManager(object):
    """Select proxies for request URLs, topping queues up from the seed queue.

    Every domain has a queue record (looked up through
    ``storage_mgr.redis_mgr``) whose ``queue_key`` addresses two
    ``RedisDetailQueue`` views: one opened with ``active=True`` and one with
    ``active=False``. The queue with id ``SEED_QUEUE_ID`` is the source from
    which details are cloned into the per-domain queues.
    """

    def __init__(self):
        """Create the manager with its own ``StorageManager`` instance."""
        self.storage_mgr = StorageManager()

    def load_from_seed_queue(self, queue, num=None):
        """Clone proxy details from the seed queue into ``queue``.

        Args:
            queue: Destination queue record. Nothing is done when its
                ``queue_id`` equals ``SEED_QUEUE_ID``.
            num: When not ``None``, overrides both counts: ``num`` details
                are taken from the active seed queue and ``num`` from the
                inactive seed queue. Otherwise up to
                ``ACTIVE_PROXIES_PER_QUEUE`` active details are taken
                (bounded by the active seed queue's length) and
                ``INACTIVE_PROXIES_PER_QUEUE`` minus that number of inactive
                details.

        Failures while cloning an *active* seed detail are logged and
        skipped; failures while cloning an *inactive* seed detail propagate
        to the caller.
        """
        if queue.queue_id == SEED_QUEUE_ID:
            return

        seed_queue = self.storage_mgr.redis_mgr.get_queue_by_id(SEED_QUEUE_ID)
        seed_rdq_active = RedisDetailQueue(queue_key=seed_queue.queue_key,
                                           active=True)
        seed_rdq_inactive = RedisDetailQueue(queue_key=seed_queue.queue_key,
                                             active=False)

        if num is not None:
            # Explicit count requested: the seed queue length is irrelevant,
            # so it is not looked up at all.
            active_seeds_to_dequeue = num
            inactive_seeds_to_dequeue = num
        else:
            active_seeds_to_dequeue = min(ACTIVE_PROXIES_PER_QUEUE,
                                          seed_rdq_active.length())
            # May be zero or negative, in which case the inactive loop below
            # simply does not run.
            inactive_seeds_to_dequeue = (INACTIVE_PROXIES_PER_QUEUE
                                         - active_seeds_to_dequeue)

        for _ in range(active_seeds_to_dequeue):
            try:
                self.storage_mgr.clone_detail(seed_rdq_active.dequeue(), queue)
            except Exception as e:
                # Best effort: keep going with the remaining seeds, but leave
                # a trace instead of discarding the error silently.
                logging.warning("failed to clone active seed detail: %s", e)

        for _ in range(inactive_seeds_to_dequeue):
            self.storage_mgr.clone_detail(seed_rdq_inactive.dequeue(), queue)

    def get_seed_proxy(self, queue):
        """Clone one detail dequeued from the seed queue into ``queue``.

        Logs a warning and does nothing when ``queue`` is the seed queue.
        """
        # NOTE(review): this method compares ``queue.id`` while
        # load_from_seed_queue compares ``queue.queue_id``, and it calls
        # ``dequeue()`` on the queue record itself rather than on a
        # RedisDetailQueue -- confirm which attribute/API is correct.
        if queue.id == SEED_QUEUE_ID:
            logging.warning("trying to copy seed proxy to seed_queue")
            return

        seed_queue = self.storage_mgr.redis_mgr.get_queue_by_id(SEED_QUEUE_ID)
        self.storage_mgr.clone_detail(seed_queue.dequeue(), queue)

    def get_proxy(self, request_url):
        """Return a dispatched ``ProxyObject`` for the domain of ``request_url``.

        The domain's queue is refilled from the seed queue when its active
        queue holds fewer than ``MIN_ACTIVE`` details; otherwise a single
        seed is cloned in when ``flip_coin(SEED_FREQUENCY)`` is true. The
        detail is then drawn from the active queue unless the active queue
        is still below ``MIN_ACTIVE``, ``flip_coin(INACTIVE_PCT)`` is true,
        or the active queue is empty -- in those cases the inactive queue is
        used.

        Args:
            request_url: URL whose domain (via ``parse_domain``) selects the
                queue.

        Returns:
            The ``ProxyObject`` built from the dequeued detail, after its
            ``dispatch(rdq_active, rdq_inactive)`` has been called.
        """
        domain = parse_domain(request_url)
        queue = self.storage_mgr.redis_mgr.get_queue_by_domain(domain)
        rdq_active = RedisDetailQueue(queue_key=queue.queue_key, active=True)
        rdq_inactive = RedisDetailQueue(queue_key=queue.queue_key,
                                        active=False)

        logging.info("active queue count: %s", rdq_active.length())
        logging.info("inactive queue count: %s", rdq_inactive.length())

        use_active = True
        # Evaluated up front, before the refill decision, exactly as before.
        clone_seed = flip_coin(SEED_FREQUENCY)

        if rdq_active.length() < MIN_ACTIVE:
            self.load_from_seed_queue(queue)
        elif clone_seed:
            self.load_from_seed_queue(queue, num=1)

        # The length is re-read on purpose: the refill above may have changed
        # it.
        if rdq_active.length() < MIN_ACTIVE:
            use_active = False

        if flip_coin(INACTIVE_PCT):
            use_active = False

        if use_active and rdq_active.length() > 0:
            logging.info("using active queue")
            draw_queue = rdq_active
        else:
            logging.info("using inactive queue")
            draw_queue = rdq_inactive

        detail = draw_queue.dequeue(requeue=False)
        proxy = ProxyObject(self.storage_mgr, detail)
        proxy.dispatch(rdq_active, rdq_inactive)
        return proxy