Example #1
0
class ProxyManager(object):
    """
    Thin facade over the proxy store: gathers raw proxies from the
    configured getter functions and serves entries from the useful pool.
    """

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.useful_proxy_queue = 'useful_proxy_queue'

    def refresh(self):
        """
        run every configured getter and push its proxies into the raw queue
        :return:
        """
        for getter_name in self.config.proxy_getter_functions:
            fetch = getattr(GetFreeProxy, getter_name.strip())
            # trim each entry, drop the blank ones, de-duplicate per getter
            trimmed = (raw.strip() for raw in fetch())
            unique_proxies = {candidate for candidate in trimmed if candidate}

            # the raw table is selected only after the getter finished
            self.db.changeTable(self.raw_proxy_queue)
            for candidate in unique_proxies:
                self.db.put(candidate)

    def get(self):
        """
        select the useful table and return what the store's get() yields
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        useful_proxy = self.db.get()
        return useful_proxy

    def delete(self, proxy):
        """
        remove the given proxy from the useful table
        :param proxy:
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        select the useful table and return everything the store holds in it
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        every_proxy = self.db.getAll()
        return every_proxy

    def get_status(self):
        """
        report the store's status value for the raw and the useful table
        :return: dict with keys 'raw_proxy' and 'useful_proxy_queue'
        """
        self.db.changeTable(self.raw_proxy_queue)
        raw_total = self.db.get_status()
        self.db.changeTable(self.useful_proxy_queue)
        useful_total = self.db.get_status()
        return dict(raw_proxy=raw_total, useful_proxy_queue=useful_total)
Example #2
0
class ProxyManager(object):
    """
    ProxyManager

    Collects raw proxies from the configured getter functions into the raw
    table and exposes read/delete access to the useful table of the store.
    """
    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter

        Each name in ``config.proxy_getter_functions`` is looked up on
        ``GetFreeProxy`` and called; the proxies it yields are stripped,
        de-duplicated and written to the raw table.
        :return:
        """
        for proxyGetter in self.config.proxy_getter_functions:
            proxy_set = set()
            # fetch raw proxy
            for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                # Strip BEFORE testing for emptiness: a whitespace-only entry
                # is truthy, and would otherwise be stored as ''.
                proxy = proxy.strip() if proxy else ''
                if not proxy:
                    continue
                self.log.info('{func}: fetch proxy {proxy}'.format(
                    func=proxyGetter, proxy=proxy))
                proxy_set.add(proxy)

            # store raw proxy
            self.db.changeTable(self.raw_proxy_queue)
            for proxy in proxy_set:
                self.db.put(proxy)

    def get(self):
        """
        return a useful proxy
        :return: whatever the store's ``get()`` returns for the useful table
        """
        self.db.changeTable(self.useful_proxy_queue)
        return self.db.get()
        # return self.db.pop()

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy: the entry to remove from the useful table
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool
        :return: whatever the store's ``getAll()`` returns for the useful table
        """
        self.db.changeTable(self.useful_proxy_queue)
        return self.db.getAll()

    def get_status(self):
        """
        report the store's status value for both tables
        :return: dict with keys 'raw_proxy' and 'useful_proxy'
        """
        # TODO rename get_count..
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.get_status()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.get_status()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
Example #3
0
class MongoSsdbUrlManager(object):
    """URL queue manager.

    URL records live in the ``spider`` Mongo database (collections
    ``lazada``, ``lazada_error`` and ``record``); ``ssdb_client`` holds a
    per-URL counter keyed by the MD5 hex digest of the URL.
    NOTE(review): ``DbClient`` is presumably SSDB-backed, judging by the
    attribute name -- confirm.
    """

    def __init__(self, host="localhost", client=None):
        """
        :param host: Mongo host name, used only when ``client`` is None
        :param client: an already constructed Mongo client to reuse
        """
        self.config = GetConfig()
        if client is None:
            # Honour ``host``; with the default this is the same URI the
            # code previously hard-coded.
            client = MongoClient('mongodb://%s:27017/' % host)
        self.client = client
        self.ssdb_client = DbClient()
        self.db = self.client.spider
        # Compare by value: ``is 0`` relied on small-int identity.
        if self.db.lazada.count() == 0:
            self.db.lazada.create_index([("status", ASCENDING),
                                         ("pr", DESCENDING)])

    @staticmethod
    def _url_digest(url):
        """Return the MD5 hex digest of ``url``; text is encoded as UTF-8 first."""
        if not isinstance(url, bytes):
            url = url.encode('utf-8')
        return hashlib.md5(url).hexdigest()

    def enqueue_url(self, url, status, depth):
        """Register ``url``; store a new ``lazada`` record unless it was seen before.

        If the counter store already has the URL's digest, its counter is
        incremented and nothing is written to Mongo. Otherwise the digest is
        put into the counter store and a record is saved. Counter-store
        errors are printed and retried once; if both attempts fail the
        record is still saved (best effort).

        :param url: the URL (text or bytes)
        :param status: status value stored on the record
        :param depth: depth value stored on the record
        """
        md5 = self._url_digest(url)
        for _attempt in range(2):
            try:
                num = self.ssdb_client.get(md5)
                if num is not None:
                    self.ssdb_client.update(key=md5, value=int(num) + 1)
                    return
                self.ssdb_client.put(md5)
                break
            except Exception as error:
                # Deliberately broad: a counter-store failure must not stop
                # the URL from being queued.
                print(error)
        self.db.lazada.save({
            '_id': md5,
            'url': url,
            'status': status,
            'queue_time': datetime.utcnow(),
            'depth': depth,
            'pr': 0
        })

    def dequeue_url(self, depth):
        """Return one ``lazada`` record with status 'downloading' at ``depth``.

        :param depth: depth value to match
        :return: the matching record, or None when there is none
        """
        # NOTE(review): an atomic variant was left disabled here; the active
        # query neither sorts by ``pr`` nor changes the record's status.
        # record = self.db.lazada.find_one_and_update(
        #     {'status': 'downloading', 'depth': depth},
        #     {'$set': {'status': 'downloading'}},
        #     upsert=False,
        #     sort=[('pr', DESCENDING)],  # sort by pr in descending
        #     returnNewDocument=False
        # )
        record = self.db.lazada.find_one(
            {'status': 'downloading', 'depth': depth}
        )
        return record if record else None

    def finish_url(self, url):
        """Mark the ``lazada`` record of ``url`` as 'done' with the current UTC time."""
        record = {'status': 'done', 'done_time': datetime.utcnow()}
        self.db.lazada.update({'_id': self._url_digest(url)}, {'$set': record}, upsert=False)

    def clear(self):
        """Call ``clear()`` on the counter store."""
        self.ssdb_client.clear()

    def save_error(self, url, status, depth):
        """Save a record for ``url`` into the ``lazada_error`` collection.

        :param url: the URL (text or bytes)
        :param status: status value stored on the record
        :param depth: depth value stored on the record
        """
        md5 = self._url_digest(url)
        self.db.lazada_error.save({
            '_id': md5,
            'url': url,
            'status': status,
            'queue_time': datetime.utcnow(),
            'depth': depth,
            'pr': 0
        })

    def update(self):
        """Reset every 'downloading' record in ``lazada`` back to 'new'."""
        self.db.lazada.update({'status': 'downloading'}, {'$set': {'status': 'new'}}, multi=True)

    def save(self, record):
        """Save ``record`` into the ``record`` collection."""
        self.db.record.save(record)
Example #4
0
class ProxyManager(object):
    """
    Proxy pool manager: feeds the raw table from the configured getters
    and reads from / deletes in the useful table.
    """
    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.useful_proxy_queue = 'useful_proxy_queue'

    def refresh(self):
        """
        call each configured getter and store its proxies in the raw table
        :return:
        """
        for getter_name in self.config.proxy_getter_functions:
            fetcher = getattr(GetFreeProxy, getter_name.strip())

            # collect the non-blank, trimmed entries of this getter
            cleaned = set()
            for raw in fetcher():
                trimmed = raw.strip()
                if not trimmed:
                    continue
                cleaned.add(trimmed)

            # write them out once the getter is exhausted
            self.db.changeTable(self.raw_proxy_queue)
            for entry in cleaned:
                self.db.put(entry)

    def get(self):
        """
        switch to the useful table and hand back the store's get() result
        :return:
        """
        store = self.db
        store.changeTable(self.useful_proxy_queue)
        return store.get()

    def delete(self, proxy):
        """
        switch to the useful table and delete the given proxy
        :param proxy:
        :return:
        """
        store = self.db
        store.changeTable(self.useful_proxy_queue)
        store.delete(proxy)

    def getAll(self):
        """
        switch to the useful table and hand back the store's getAll() result
        :return:
        """
        store = self.db
        store.changeTable(self.useful_proxy_queue)
        return store.getAll()

    def get_status(self):
        """
        collect the store's status value for the raw and the useful table
        :return: dict with keys 'raw_proxy' and 'useful_proxy_queue'
        """
        status = {}
        self.db.changeTable(self.raw_proxy_queue)
        status['raw_proxy'] = self.db.get_status()
        self.db.changeTable(self.useful_proxy_queue)
        status['useful_proxy_queue'] = self.db.get_status()
        return status