import base64
import logging
import re
from datetime import datetime

import scrapy

# StorageManager, RedisDetailQueue, ProxyObject, parse_domain, flip_coin, and the
# SEED_QUEUE_ID / MIN_QUEUE_SIZE / SEED_FREQUENCY / TARGET_ACTIVE_COUNT /
# PROXY_INTERVAL constants are project-internal and assumed to be imported from
# elsewhere in this package.


class ProxylistySpider(scrapy.Spider):
    name = 'proxylisty'
    allowed_domains = ['proxylisty.com']
    start_urls = ['http://www.proxylisty.com/ip-proxylist']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = int(kwargs.get('count', 1))
        self.storage_mgr = StorageManager()

    def start_requests(self):
        # Re-queue the listing page `count` times; dont_filter bypasses the
        # duplicate-request filter so every copy is actually fetched.
        for i in range(self.count):
            request = scrapy.Request(
                url='http://www.proxylisty.com/ip-proxylist', dont_filter=True)
            logging.info("GET %s", request.url)
            yield request

    def parse_proxies(self, trs):
        # Each row holds address, port, and protocol in its first three cells.
        for tr in trs:
            address = tr.xpath('td[1]/text()').extract_first()
            port = tr.xpath('td[2]/a/text()').extract_first()
            protocol = tr.xpath('td[3]/text()').extract_first()
            if address is not None and port is not None and protocol is not None:
                self.storage_mgr.new_proxy(address, port, protocol)

    def parse2(self, response):
        trs = response.xpath('//div[@id="content"]//table[1]/tr[position()>1]')
        if trs:
            self.parse_proxies(trs)
        # Follow the paginated listing via its "Next" link, if present.
        next_link = response.xpath(
            '//div[@id="content"]//table[1]/tr/td[@colspan="9"]/ul/li/a[text()="Next"]/@href'
        ).extract_first()
        if next_link is not None:
            yield scrapy.Request(url=response.urljoin(next_link),
                                 callback=self.parse2, dont_filter=True)

    def parse(self, response):
        if 'ip-proxylist' in str(response.request.url):
            # Category pages (by protocol, country, etc.) linked from the nav menu.
            additional_links = response.xpath(
                '//li[@class="has-sub"][2]/div[@class="wideblock"][1]/div[1]/ul//a/@href'
            ).extract()
            trs = response.xpath('//div[@id="content"]//table[1]/tr[position()>1]')
            if trs:
                self.parse_proxies(trs)
            for link in additional_links:
                yield scrapy.Request(url=link, callback=self.parse2)
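# Usage note (not part of the original source): Scrapy passes -a arguments
# through to the spider's __init__, so the `count` kwarg above maps to e.g.
#
#   scrapy crawl proxylisty -a count=3
#
# dont_filter=True in start_requests is what lets the same URL be fetched
# `count` times without being dropped by the dupe filter.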
class IpAdressSpider(scrapy.Spider):
    name = 'ip-adress'
    allowed_domains = ['ip-adress.com']
    start_urls = ['https://www.ip-adress.com/proxy-list']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = int(kwargs.get('count', 1))
        self.storage_mgr = StorageManager()

    def start_requests(self):
        for i in range(self.count):
            request = scrapy.Request(
                url='https://www.ip-adress.com/proxy-list', dont_filter=True)
            logging.info("GET %s", request.url)
            yield request

    def parse(self, response):
        # The address sits in an anchor inside the first cell and the port in
        # that cell's trailing text, so the port is pulled out with a regex.
        trs = response.xpath(
            '//table[contains(@class,"proxylist")]//tr[position() > 1]')
        for tr in trs:
            address = tr.xpath('td[1]/a/text()').extract_first()
            port_text = tr.xpath('td[1]/text()').extract_first()
            port_match = re.search(r'(\d+)', port_text or '')
            if address is None or port_match is None:
                continue
            self.storage_mgr.new_proxy(address, port_match.group(1))
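# A minimal sketch of driving one of these spiders programmatically rather than
# via the CLI (an assumption: the project may launch them differently, but
# CrawlerProcess is stock Scrapy and the `count` kwarg is the one read above).
def run_ip_adress_spider_demo():
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(IpAdressSpider, count=2)
    process.start()  # blocks until the crawl finishes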
class ProxyManager(object):
    def __init__(self):
        self.storage_mgr = StorageManager()
        self.logger = logging.getLogger(__name__)

    def get_proxy(self, request_url):
        is_seed = False
        domain = parse_domain(request_url)
        # Get the queue for the request URL's domain; one is created if it
        # doesn't exist yet.
        queue = self.storage_mgr.redis_mgr.get_queue_by_domain(domain)
        if queue.id() == SEED_QUEUE_ID:
            is_seed = True
        # Scope the logger name to the request URL's domain.
        self.logger = logging.getLogger(queue.domain)

        # First get all details that may already be in redis.
        # TODO: change this to a simple count.
        num_details = self.storage_mgr.redis_mgr.get_queue_count(queue)
        if num_details == 0 and is_seed:
            self.storage_mgr.initialize_seed_queue()
        if num_details == 0 and not is_seed:
            self.storage_mgr.redis_mgr.initialize_queue(queue=queue)

        rdq_active = RedisDetailQueue(queue, active=True)
        rdq_inactive = RedisDetailQueue(queue, active=False)
        num_enqueued = rdq_active.length() + rdq_inactive.length()
        not_enqueued = num_details - num_enqueued

        logging.info(
            "Cached total: %s | Not enqueued: %s | Active RDQ: %s | Inactive RDQ: %s",
            num_details, not_enqueued, rdq_active.length(), rdq_inactive.length())

        if rdq_inactive.length() < MIN_QUEUE_SIZE and not is_seed:
            self.logger.info(
                "Inactive RDQ is below the minimum queue size; creating new details...")
            # Returns new seed details that have not yet been used for this queue.
            self.storage_mgr.create_new_details(queue=queue)
        elif flip_coin(SEED_FREQUENCY) and not is_seed:
            self.storage_mgr.create_new_details(queue=queue, count=1)

        # Draw from the active queue with probability proportional to how close
        # it is to the target size, but never while it is under the minimum.
        use_active = False
        active_pct_chance = rdq_active.length() / TARGET_ACTIVE_COUNT
        if flip_coin(active_pct_chance):
            use_active = True
        if rdq_active.length() < MIN_QUEUE_SIZE:
            use_active = False

        if use_active:
            self.logger.info("using active RDQ")
            draw_queue = rdq_active
        else:
            self.logger.info("using inactive RDQ")
            draw_queue = rdq_inactive

        detail = draw_queue.dequeue()
        proxy = ProxyObject(detail, StorageManager(), draw_queue)

        # Enforce a per-domain cooldown between uses of the same proxy, drawing
        # again if this one was used too recently. total_seconds() is used
        # rather than .seconds, which wraps around at one day.
        now = datetime.utcnow()
        elapsed_time = now - proxy.detail.last_used
        if elapsed_time.total_seconds() < PROXY_INTERVAL:
            self.logger.warning(
                "Proxy %s was last used against %s %s seconds ago; using a different proxy.",
                proxy.address, domain, elapsed_time.total_seconds())
            return self.get_proxy(request_url)

        proxy.dispatch()
        return proxy

    def new_proxy(self, address, port, protocol='http'):
        return self.storage_mgr.new_proxy(address, port, protocol)
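# A minimal usage sketch (assumption: ProxyObject exposes .port and .protocol
# alongside the .address attribute referenced above) showing how a caller might
# turn a dispatched proxy into a requests-style proxy URL.
def get_proxy_url_demo(target_url='https://example.com/some/page'):
    manager = ProxyManager()
    proxy = manager.get_proxy(target_url)
    proxy_url = '%s://%s:%s' % (proxy.protocol, proxy.address, proxy.port)
    # e.g. requests.get(target_url, proxies={'http': proxy_url, 'https': proxy_url})
    return proxy_url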
# `ctx` is used below as a persistent JavaScript evaluation context but is never
# defined in the original snippet; py_mini_racer's stateful eval matches the
# ctx.eval(...) usage here, so it is assumed.
from py_mini_racer import MiniRacer
ctx = MiniRacer()


class ProxydbSpider(scrapy.Spider):
    name = 'proxydb'
    allowed_domains = ['proxydb.net']
    start_urls = ['http://proxydb.net/']
    handle_httpstatus_list = [403, 404]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = int(kwargs.get('count', 1))
        self.storage_mgr = StorageManager()

    def start_requests(self):
        for i in range(self.count):
            request = scrapy.Request(url='http://proxydb.net/', dont_filter=True)
            logging.info("GET %s", request.url)
            yield request

    def deobfuscate(self, resp):
        # proxydb.net hides each address behind inline JavaScript: two string
        # fragments (the second base64-encoded) plus a port expression that
        # reads a random data-* attribute elsewhere on the page. Re-running
        # that JS in a persistent context recovers the real values.
        proxies = []
        trs = resp.xpath('//div[@class="table-responsive"]'
                         '/table[contains(@class,"table-hover")]/tbody/tr')
        for tr in trs:
            try:
                script = tr.xpath('td[1]/script/text()').extract_first()
                # Find the data-* attribute the script reads, look up its value
                # on the page, and inline it as a plain JS variable so the
                # snippet no longer needs a DOM.
                rnnum_var_full_search = re.search(r'getAttribute\(\'(data\-(\w+))\'\)', script)
                rnnum_var_full = rnnum_var_full_search.group(1)
                rnnum_var = rnnum_var_full_search.group(2)
                rnnum = resp.xpath('//div[@%s]/@%s' % (rnnum_var_full, rnnum_var_full)).extract_first()
                string_to_replace = ("(+document.querySelector('[%s]').getAttribute('%s'))"
                                     % (rnnum_var_full, rnnum_var_full))
                ctx.eval(" var %s = %s " % (rnnum_var, rnnum))
                script = script.replace(string_to_replace, " %s " % rnnum_var)
                # The first three statements declare the two address fragments
                # and the port expression.
                scripts = script.split(';')[0:3]
                var_re = r'var\s+(\w+)\s*\='
                addr1_var = re.search(var_re, scripts[0]).group(1)
                addr2_var = re.search(var_re, scripts[1]).group(1)
                port_var = re.search(var_re, scripts[2]).group(1)
                for js in scripts:
                    ctx.eval(js)
                addr1 = ctx.eval(addr1_var)
                addr2 = base64.b64decode(ctx.eval(addr2_var)).decode('utf-8')
                port = int(ctx.eval(port_var))
                address = "%s%s" % (addr1, addr2)
                protocol = tr.xpath('td[5]/text()').extract_first().strip().lower()
                logging.info("successfully deobfuscated proxy: address=%s port=%s protocol=%s",
                             address, port, protocol)
                proxies.append({'address': address, 'port': port, 'protocol': protocol})
            except Exception as e:
                # A single malformed row shouldn't abort the rest of the batch.
                logging.warning(e)
        return proxies

    def parse(self, response):
        for pdata in self.deobfuscate(response):
            self.storage_mgr.new_proxy(pdata['address'], pdata['port'], pdata['protocol'])
        # Also follow the protocol/country shortcut pages in the navbar dropdown.
        proxies_by_dropdown_urls = response.xpath(
            '//div[@aria-labelledby="navbar_dropdown_shortcuts"]/a/@href').extract()
        for url in proxies_by_dropdown_urls:
            yield scrapy.Request(url=response.urljoin(url),
                                 callback=self.parse_dropdown, dont_filter=True)

    def parse_dropdown(self, response):
        logging.info("parsing category link %s", response.url)
        for pdata in self.deobfuscate(response):
            self.storage_mgr.new_proxy(pdata['address'], pdata['port'], pdata['protocol'])
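# A standalone sketch of the stateful-eval pattern deobfuscate() depends on
# (py_mini_racer is an assumption, as noted above; any JS engine whose context
# persists between eval calls would do).
def js_context_demo():
    from py_mini_racer import MiniRacer

    ctx = MiniRacer()
    ctx.eval("var n = 42")          # a var defined in one call...
    assert ctx.eval("n + 1") == 43  # ...is still visible in the next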