import base64
import logging
import re
from datetime import datetime

import scrapy

# StorageManager, RedisDetailQueue, ProxyObject, parse_domain, flip_coin, and the
# SEED_QUEUE_ID / MIN_QUEUE_SIZE / SEED_FREQUENCY / TARGET_ACTIVE_COUNT /
# PROXY_INTERVAL constants are project-internal and assumed to be imported from
# elsewhere in this package.


class ProxylistySpider(scrapy.Spider):
    name = 'proxylisty'
    allowed_domains = ['proxylisty.com']
    start_urls = ['http://www.proxylisty.com/ip-proxylist']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = int(kwargs.get('count', 1))
        self.storage_mgr = StorageManager()

    def start_requests(self):
        # Re-queue the listing page `count` times; dont_filter bypasses the
        # duplicate-request filter so every copy is actually fetched.
        for i in range(self.count):
            request = scrapy.Request(
                url='http://www.proxylisty.com/ip-proxylist', dont_filter=True)
            logging.info("GET %s", request.url)
            yield request

    def parse_proxies(self, trs):
        # Each row holds address, port, and protocol in its first three cells.
        for tr in trs:
            address = tr.xpath('td[1]/text()').extract_first()
            port = tr.xpath('td[2]/a/text()').extract_first()
            protocol = tr.xpath('td[3]/text()').extract_first()
            if address is not None and port is not None and protocol is not None:
                self.storage_mgr.new_proxy(address, port, protocol)

    def parse2(self, response):
        trs = response.xpath('//div[@id="content"]//table[1]/tr[position()>1]')
        if trs:
            self.parse_proxies(trs)
        # Follow the paginated listing via its "Next" link, if present.
        next_link = response.xpath(
            '//div[@id="content"]//table[1]/tr/td[@colspan="9"]/ul/li/a[text()="Next"]/@href'
        ).extract_first()
        if next_link is not None:
            yield scrapy.Request(url=response.urljoin(next_link),
                                 callback=self.parse2, dont_filter=True)

    def parse(self, response):
        if 'ip-proxylist' in str(response.request.url):
            # Category pages (by protocol, country, etc.) linked from the nav menu.
            additional_links = response.xpath(
                '//li[@class="has-sub"][2]/div[@class="wideblock"][1]/div[1]/ul//a/@href'
            ).extract()
            trs = response.xpath('//div[@id="content"]//table[1]/tr[position()>1]')
            if trs:
                self.parse_proxies(trs)
            for link in additional_links:
                yield scrapy.Request(url=link, callback=self.parse2)
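# Usage note (not part of the original source): Scrapy passes -a arguments
# through to the spider's __init__, so the `count` kwarg above maps to e.g.
#
#   scrapy crawl proxylisty -a count=3
#
# dont_filter=True in start_requests is what lets the same URL be fetched
# `count` times without being dropped by the dupe filter.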
class IpAdressSpider(scrapy.Spider):
    name = 'ip-adress'
    allowed_domains = ['ip-adress.com']
    start_urls = ['https://www.ip-adress.com/proxy-list']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = int(kwargs.get('count', 1))
        self.storage_mgr = StorageManager()

    def start_requests(self):
        for i in range(self.count):
            request = scrapy.Request(
                url='https://www.ip-adress.com/proxy-list', dont_filter=True)
            logging.info("GET %s", request.url)
            yield request

    def parse(self, response):
        # The address sits in an anchor inside the first cell and the port in
        # that cell's trailing text, so the port is pulled out with a regex.
        trs = response.xpath(
            '//table[contains(@class,"proxylist")]//tr[position() > 1]')
        for tr in trs:
            address = tr.xpath('td[1]/a/text()').extract_first()
            port_text = tr.xpath('td[1]/text()').extract_first()
            port_match = re.search(r'(\d+)', port_text or '')
            if address is None or port_match is None:
                continue
            self.storage_mgr.new_proxy(address, port_match.group(1))
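# A minimal sketch of driving one of these spiders programmatically rather than
# via the CLI (an assumption: the project may launch them differently, but
# CrawlerProcess is stock Scrapy and the `count` kwarg is the one read above).
def run_ip_adress_spider_demo():
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(IpAdressSpider, count=2)
    process.start()  # blocks until the crawl finishes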
class ProxyManager(object):
    def __init__(self):
        self.storage_mgr = StorageManager()
        self.logger = logging.getLogger(__name__)

    def get_proxy(self, request_url):
        is_seed = False
        domain = parse_domain(request_url)
        # Get the queue for the request URL's domain; one is created if it
        # doesn't exist yet.
        queue = self.storage_mgr.redis_mgr.get_queue_by_domain(domain)
        if queue.id() == SEED_QUEUE_ID:
            is_seed = True
        # Scope the logger name to the request URL's domain.
        self.logger = logging.getLogger(queue.domain)

        # First get all details that may already be in redis.
        # TODO: change this to a simple count.
        num_details = self.storage_mgr.redis_mgr.get_queue_count(queue)
        if num_details == 0 and is_seed:
            self.storage_mgr.initialize_seed_queue()
        if num_details == 0 and not is_seed:
            self.storage_mgr.redis_mgr.initialize_queue(queue=queue)

        rdq_active = RedisDetailQueue(queue, active=True)
        rdq_inactive = RedisDetailQueue(queue, active=False)
        num_enqueued = rdq_active.length() + rdq_inactive.length()
        not_enqueued = num_details - num_enqueued

        logging.info(
            "Cached total: %s | Not enqueued: %s | Active RDQ: %s | Inactive RDQ: %s",
            num_details, not_enqueued, rdq_active.length(), rdq_inactive.length())

        if rdq_inactive.length() < MIN_QUEUE_SIZE and not is_seed:
            self.logger.info(
                "Inactive RDQ is below the minimum queue size; creating new details...")
            # Returns new seed details that have not yet been used for this queue.
            self.storage_mgr.create_new_details(queue=queue)
        elif flip_coin(SEED_FREQUENCY) and not is_seed:
            self.storage_mgr.create_new_details(queue=queue, count=1)

        # Draw from the active queue with probability proportional to how close
        # it is to the target size, but never while it is under the minimum.
        use_active = False
        active_pct_chance = rdq_active.length() / TARGET_ACTIVE_COUNT
        if flip_coin(active_pct_chance):
            use_active = True
        if rdq_active.length() < MIN_QUEUE_SIZE:
            use_active = False

        if use_active:
            self.logger.info("using active RDQ")
            draw_queue = rdq_active
        else:
            self.logger.info("using inactive RDQ")
            draw_queue = rdq_inactive

        detail = draw_queue.dequeue()
        proxy = ProxyObject(detail, StorageManager(), draw_queue)

        # Enforce a per-domain cooldown between uses of the same proxy, drawing
        # again if this one was used too recently. total_seconds() is used
        # rather than .seconds, which wraps around at one day.
        now = datetime.utcnow()
        elapsed_time = now - proxy.detail.last_used
        if elapsed_time.total_seconds() < PROXY_INTERVAL:
            self.logger.warning(
                "Proxy %s was last used against %s %s seconds ago; using a different proxy.",
                proxy.address, domain, elapsed_time.total_seconds())
            return self.get_proxy(request_url)

        proxy.dispatch()
        return proxy

    def new_proxy(self, address, port, protocol='http'):
        return self.storage_mgr.new_proxy(address, port, protocol)
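# A minimal usage sketch (assumption: ProxyObject exposes .port and .protocol
# alongside the .address attribute referenced above) showing how a caller might
# turn a dispatched proxy into a requests-style proxy URL.
def get_proxy_url_demo(target_url='https://example.com/some/page'):
    manager = ProxyManager()
    proxy = manager.get_proxy(target_url)
    proxy_url = '%s://%s:%s' % (proxy.protocol, proxy.address, proxy.port)
    # e.g. requests.get(target_url, proxies={'http': proxy_url, 'https': proxy_url})
    return proxy_url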
# `ctx` is used below as a persistent JavaScript evaluation context but is never
# defined in the original snippet; py_mini_racer's stateful eval matches the
# ctx.eval(...) usage here, so it is assumed.
from py_mini_racer import MiniRacer
ctx = MiniRacer()


class ProxydbSpider(scrapy.Spider):
    name = 'proxydb'
    allowed_domains = ['proxydb.net']
    start_urls = ['http://proxydb.net/']
    handle_httpstatus_list = [403, 404]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = int(kwargs.get('count', 1))
        self.storage_mgr = StorageManager()

    def start_requests(self):
        for i in range(self.count):
            request = scrapy.Request(url='http://proxydb.net/', dont_filter=True)
            logging.info("GET %s", request.url)
            yield request

    def deobfuscate(self, resp):
        # proxydb.net hides each address behind inline JavaScript: two string
        # fragments (the second base64-encoded) plus a port expression that
        # reads a random data-* attribute elsewhere on the page. Re-running
        # that JS in a persistent context recovers the real values.
        proxies = []
        trs = resp.xpath('//div[@class="table-responsive"]'
                         '/table[contains(@class,"table-hover")]/tbody/tr')
        for tr in trs:
            try:
                script = tr.xpath('td[1]/script/text()').extract_first()
                # Find the data-* attribute the script reads, look up its value
                # on the page, and inline it as a plain JS variable so the
                # snippet no longer needs a DOM.
                rnnum_var_full_search = re.search(r'getAttribute\(\'(data\-(\w+))\'\)', script)
                rnnum_var_full = rnnum_var_full_search.group(1)
                rnnum_var = rnnum_var_full_search.group(2)
                rnnum = resp.xpath('//div[@%s]/@%s' % (rnnum_var_full, rnnum_var_full)).extract_first()
                string_to_replace = ("(+document.querySelector('[%s]').getAttribute('%s'))"
                                     % (rnnum_var_full, rnnum_var_full))
                ctx.eval(" var %s = %s " % (rnnum_var, rnnum))
                script = script.replace(string_to_replace, " %s " % rnnum_var)
                # The first three statements declare the two address fragments
                # and the port expression.
                scripts = script.split(';')[0:3]
                var_re = r'var\s+(\w+)\s*\='
                addr1_var = re.search(var_re, scripts[0]).group(1)
                addr2_var = re.search(var_re, scripts[1]).group(1)
                port_var = re.search(var_re, scripts[2]).group(1)
                for js in scripts:
                    ctx.eval(js)
                addr1 = ctx.eval(addr1_var)
                addr2 = base64.b64decode(ctx.eval(addr2_var)).decode('utf-8')
                port = int(ctx.eval(port_var))
                address = "%s%s" % (addr1, addr2)
                protocol = tr.xpath('td[5]/text()').extract_first().strip().lower()
                logging.info("successfully deobfuscated proxy: address=%s port=%s protocol=%s",
                             address, port, protocol)
                proxies.append({'address': address, 'port': port, 'protocol': protocol})
            except Exception as e:
                # A single malformed row shouldn't abort the rest of the batch.
                logging.warning(e)
        return proxies

    def parse(self, response):
        for pdata in self.deobfuscate(response):
            self.storage_mgr.new_proxy(pdata['address'], pdata['port'], pdata['protocol'])
        # Also follow the protocol/country shortcut pages in the navbar dropdown.
        proxies_by_dropdown_urls = response.xpath(
            '//div[@aria-labelledby="navbar_dropdown_shortcuts"]/a/@href').extract()
        for url in proxies_by_dropdown_urls:
            yield scrapy.Request(url=response.urljoin(url),
                                 callback=self.parse_dropdown, dont_filter=True)

    def parse_dropdown(self, response):
        logging.info("parsing category link %s", response.url)
        for pdata in self.deobfuscate(response):
            self.storage_mgr.new_proxy(pdata['address'], pdata['port'], pdata['protocol'])
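# A standalone sketch of the stateful-eval pattern deobfuscate() depends on
# (py_mini_racer is an assumption, as noted above; any JS engine whose context
# persists between eval calls would do).
def js_context_demo():
    from py_mini_racer import MiniRacer

    ctx = MiniRacer()
    ctx.eval("var n = 42")          # a var defined in one call...
    assert ctx.eval("n + 1") == 43  # ...is still visible in the next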