Example #1
    def do_sync():
        # Wait for the scheduler to drain, cancelling any jobs that have timed out.
        while len(scheduler.active_jobs()) > 0:
            logging.info('waiting for the jobs to stop')
            logging.info("there are %s active jobs" %
                         len(scheduler.active_jobs()))
            timed_out_jobs = scheduler.get_timed_out_jobs()
            if len(timed_out_jobs) > 0:
                logging.warning("There are %s timed out jobs" %
                                len(timed_out_jobs))
                for toj in timed_out_jobs:
                    to_project = toj['project']
                    to_jid = toj['id']
                    to_spider = toj['spider']
                    logging.info("Terminating %s job with id %s" %
                                 (to_spider, to_jid))
                    ScrapydApi.cancel_job(to_project, to_jid)

            time.sleep(5)

        # Once no jobs remain, flush the cached results to the database and
        # re-open the scheduler for new jobs.
        logging.info("STARTING SYNC...")
        storage_mgr = StorageManager()
        storage_mgr.sync_to_db()
        logging.info("SYNC COMPLETE")
        now = datetime.datetime.now()
        scheduler.start_time = now
        scheduler.allow_new_jobs = True
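
do_sync() leans on a scheduler object and a ScrapydApi wrapper that are defined elsewhere in the project. The sketch below is a hypothetical, minimal version of just the scheduler attributes and methods do_sync() touches, not the project's actual implementation.

# Hypothetical scheduler assumed by do_sync() above; only the pieces that
# do_sync() uses are sketched here.
import datetime

class Scheduler:
    def __init__(self, job_timeout=600):
        self.jobs = {}                 # job id -> {'project', 'spider', 'id', 'started'}
        self.job_timeout = job_timeout
        self.start_time = datetime.datetime.now()
        self.allow_new_jobs = True

    def active_jobs(self):
        # Every job that has not finished yet.
        return list(self.jobs.values())

    def get_timed_out_jobs(self):
        # Jobs that have been running longer than the allowed timeout.
        now = datetime.datetime.now()
        return [j for j in self.jobs.values()
                if (now - j['started']).total_seconds() > self.job_timeout]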
Example #2
class ProxylistySpider(scrapy.Spider):
    name = 'proxylisty'
    allowed_domains = ['proxylisty.com']
    start_urls = ['http://www.proxylisty.com/ip-proxylist']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = int(kwargs.get('count', 1))
        self.storage_mgr = StorageManager()

    def start_requests(self):
        for i in range(self.count):
            request = scrapy.Request(
                url='http://www.proxylisty.com/ip-proxylist', dont_filter=True)
            logging.info("GET %s" % request.url)
            yield request

    def parse_proxies(self, trs):
        # Each table row holds an address, port, and protocol column.
        for tr in trs:
            address = tr.xpath('td[1]/text()').extract_first()
            port = tr.xpath('td[2]/a/text()').extract_first()
            protocol = tr.xpath('td[3]/text()').extract_first()

            if address is not None and port is not None and protocol is not None:
                self.storage_mgr.new_proxy(address, port, protocol)

    def parse2(self, response):
        trs = response.xpath('//div[@id="content"]//table[1]/tr[position()>1]')
        if trs:
            self.parse_proxies(trs)

        # Follow the "Next" pagination link, if present.
        next_link = response.xpath(
            '//div[@id="content"]//table[1]/tr/td[@colspan="9"]/ul/li/a[text()="Next"]/@href'
        ).extract_first()

        if next_link is not None:
            yield scrapy.Request(url=response.urljoin(next_link),
                                 callback=self.parse2,
                                 dont_filter=True)

    def parse(self, response):
        # Collect the per-category listing links from the menu; default to an
        # empty list so the loop below never hits an unbound name.
        additional_links = []
        if 'ip-proxylist' in str(response.request.url):
            additional_links = response.xpath(
                '//li[@class="has-sub"][2]/div[@class="wideblock"][1]/div[1]/ul//a/@href'
            ).extract()

        trs = response.xpath('//div[@id="content"]//table[1]/tr[position()>1]')
        if trs:
            self.parse_proxies(trs)

        for link in additional_links:
            yield scrapy.Request(url=link, callback=self.parse2)
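
The count argument read in __init__ is passed through Scrapy's normal spider-argument mechanism (for example, scrapy crawl proxylisty -a count=3). A minimal sketch of launching it programmatically instead; the settings object is left at its defaults here:

# Sketch: run the spider in-process and pass the `count` spider argument.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(ProxylistySpider, count=3)   # keyword args end up in __init__'s **kwargs
process.start()                            # blocks until the crawl finishes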
Example #3
def daemon():
    print(threading.current_thread().name, 'Starting daemon.')
    for w in workers:
        w.start()
    time.sleep(15)
    while True:
        scoreboard()
        # Only the main thread is left once every worker has finished.
        if getRunningThreads() == 1:
            break
        time.sleep(5)
    sm = StorageManager()
    sm.sync_to_db()
    return scoreboard()
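
workers, getRunningThreads(), and scoreboard() are defined elsewhere in the module. Assuming workers is a list of threading.Thread objects, the two helpers could plausibly look like the sketch below (hypothetical, not the original code):

# Hypothetical helpers assumed by daemon() above.
import threading
import logging

def getRunningThreads():
    # Count threads that are still alive; this includes the main thread,
    # which is why daemon() stops when the count drops to 1.
    return threading.active_count()

def scoreboard():
    # Log a snapshot of which workers are still running.
    stats = {w.name: w.is_alive() for w in workers}
    logging.info("scoreboard: %s", stats)
    return stats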
Example #4
class IpAdressSpider(scrapy.Spider):
    name = 'ip-adress'
    allowed_domains = ['ip-adress.com']
    start_urls = ['https://www.ip-adress.com/proxy-list']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = int(kwargs.get('count', 1))
        self.storage_mgr = StorageManager()

    def start_requests(self):
        for i in range(self.count):
            request = scrapy.Request(
                url='https://www.ip-adress.com/proxy-list', dont_filter=True)
            logging.info("GET %s" % request.url)
            yield request

    def parse(self, response):
        trs = response.xpath(
            '//table[contains(@class,"proxylist")]//tr[position() > 1]')
        for tr in trs:
            # The first cell holds the address as a link followed by ":<port>" text.
            address = tr.xpath('td[1]/a/text()').extract_first()
            port = tr.xpath('td[1]/text()').extract_first()
            if address is None or port is None:
                continue
            port_match = re.search(r'(\d+)', port)
            if port_match is None:
                continue
            self.storage_mgr.new_proxy(address, port_match.group(1))
Example #5
    def get_proxy(self, request_url):
        is_seed = False
        domain = parse_domain(request_url)
        # Get the queue for the request url's domain. If a queue doesn't exist, one will be created.
        queue = self.storage_mgr.redis_mgr.get_queue_by_domain(domain)

        if queue.id() == SEED_QUEUE_ID:
            is_seed = True

        # Set the logger name to the request url's domain.
        self.logger = logging.getLogger(queue.domain)

        # First get all details that may already be in redis.
        # TODO: change this to a simple count.
        num_details = self.storage_mgr.redis_mgr.get_queue_count(queue)

        if num_details == 0 and is_seed:
            self.storage_mgr.initialize_seed_queue()

        if num_details == 0 and not is_seed:
            self.storage_mgr.redis_mgr.initialize_queue(queue=queue)

        rdq_active = RedisDetailQueue(queue, active=True)
        rdq_inactive = RedisDetailQueue(queue, active=False)
        num_enqueued = rdq_active.length() + rdq_inactive.length()
        not_enqueued = num_details - num_enqueued

        logging.info("""
        ------------------------------------|
        --------------| Cached total   : %s |
        --------------| Not enqueued   : %s |
        --------------| Active RDQ     : %s |
        --------------| Inactive RDQ   : %s |
        ------------------------------------|
        """ % (num_details, not_enqueued, rdq_active.length(), rdq_inactive.length()))

        if rdq_inactive.length() < MIN_QUEUE_SIZE and not is_seed:
            self.logger.info("rdq is less than the min queue size, creating some new details...")
            # Will return a list of new seed details that have not yet been used for this queue.
            self.storage_mgr.create_new_details(queue=queue)

        elif flip_coin(SEED_FREQUENCY) and not is_seed:
            self.storage_mgr.create_new_details(queue=queue, count=1)

        # Choose between the active and inactive queues, favouring the active
        # queue in proportion to how close it is to TARGET_ACTIVE_COUNT.
        use_active = False
        active_pct_chance = rdq_active.length() / TARGET_ACTIVE_COUNT

        if flip_coin(active_pct_chance):
            use_active = True

        if rdq_active.length() < MIN_QUEUE_SIZE:
            use_active = False

        if use_active:
            self.logger.info("using active RDQ")
            draw_queue = rdq_active
        else:
            self.logger.info("using inactive RDQ")
            draw_queue = rdq_inactive

        detail = draw_queue.dequeue()
        proxy = ProxyObject(detail, StorageManager(), draw_queue)

        # Don't reuse a proxy against the same domain too soon.
        now = datetime.utcnow()
        elapsed_time = now - proxy.detail.last_used
        if elapsed_time.seconds < PROXY_INTERVAL:
            self.logger.warning(
                "Proxy %s was last used against %s %s seconds ago, using a different proxy."
                % (proxy.address, domain, elapsed_time.seconds))
            return self.get_proxy(request_url)

        proxy.dispatch()
        return proxy
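
flip_coin(), along with SEED_FREQUENCY, TARGET_ACTIVE_COUNT, MIN_QUEUE_SIZE, and PROXY_INTERVAL, comes from the project's config/util modules and is not shown in these examples. It is presumably a probability gate along these lines (a sketch, not the actual implementation):

# Sketch of the flip_coin() probability gate used by get_proxy() above.
import random

def flip_coin(probability):
    # Returns True with the given probability; values outside [0, 1] are clamped,
    # so an active_pct_chance greater than 1 always selects the active queue.
    return random.random() < max(0.0, min(1.0, probability))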
Example #6
    def __init__(self):
        self.storage_mgr = StorageManager()
        self.logger = logging.getLogger(__name__)
Example #7
    def __init__(self, *args, **kwargs):
        self.count = int(kwargs.get('count', 1))
        self.storage_mgr = StorageManager()
Example #8
from scrapy_autoproxy.config import configuration
from scrapy_autoproxy.storage_manager import StorageManager, Redis
from scrapy_autoproxy.proxy_manager import ProxyManager
import random
import time

redis = Redis(**configuration.redis_config)

sm = StorageManager()
pm = ProxyManager()

# Fetch every cached detail for the 'qt_1' queue and split it by active flag.
details = sm.redis_mgr.get_all_queue_details('qt_1')

active_details = []
inactive_details = []
for detail in details:
    if detail.active:
        active_details.append(detail)
    else:
        inactive_details.append(detail)

print("active details: %s" % len(active_details))
print("inactive details: %s" % len(inactive_details))
Example #9
class ProxydbSpider(scrapy.Spider):
    name = 'proxydb'
    allowed_domains = ['proxydb.net']
    start_urls = ['http://proxydb.net/']
    handle_httpstatus_list = [403, 404]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = int(kwargs.get('count', 1))
        self.storage_mgr = StorageManager()

    def start_requests(self):
        for i in range(self.count):
            request = scrapy.Request(url='http://proxydb.net/', dont_filter=True)
            logging.info("GET %s" % request.url)
            yield request

    def deobfuscate(self, resp):
        # proxydb.net hides each address behind a small inline script; rebuild
        # the address by evaluating that script with the page's data-* attribute.
        proxies = []
        try:
            trs = resp.xpath('//div[@class="table-responsive"]/table[contains(@class,"table-hover")]/tbody/tr')

            for tr in trs:
                script = tr.xpath('td[1]/script/text()').extract_first()
                rnnum_var_full_search = re.search(r'getAttribute\(\'(data\-(\w+))\'\)', script)

                rnnum_var_full = rnnum_var_full_search.group(1)
                rnnum_var = rnnum_var_full_search.group(2)

                # Pull the obfuscation number out of the matching data-* attribute
                # and substitute it for the DOM lookup inside the script.
                rnnum = resp.xpath('//div[@%s]/@%s' % (rnnum_var_full, rnnum_var_full)).extract_first()
                string_to_replace = "(+document.querySelector('[%s]').getAttribute('%s'))" % (rnnum_var_full, rnnum_var_full)

                ctx.eval(" var %s = %s " % (rnnum_var, rnnum))
                script = script.replace(string_to_replace, " %s " % rnnum_var)

                # The first three statements define the two address halves and the port.
                scripts = script.split(';')[0:3]
                var_re = r'var\s+(\w+)\s*\='

                addr1_var = re.search(var_re, scripts[0]).group(1)
                addr2_var = re.search(var_re, scripts[1]).group(1)
                port_var = re.search(var_re, scripts[2]).group(1)

                for js in scripts:
                    ctx.eval(js)

                addr1 = ctx.eval(addr1_var)
                addr2 = base64.b64decode(ctx.eval(addr2_var)).decode('utf-8')
                port = int(ctx.eval(port_var))

                address = "%s%s" % (addr1, addr2)
                protocol = tr.xpath('td[5]/text()').extract_first().strip().lower()
                logging.info("successfully deobfuscated proxy:\naddress=%s port=%s protocol=%s" % (address, port, protocol))
                proxies.append({'address': address, 'port': port, 'protocol': protocol})

        except Exception as e:
            logging.warning(e)

        return proxies

    def parse(self, response):
        proxies = self.deobfuscate(response)
        for pdata in proxies:
            self.storage_mgr.new_proxy(pdata['address'], pdata['port'], pdata['protocol'])

        # Follow the "proxies by ..." shortcut links in the navbar dropdown.
        proxies_by_dropdown_urls = response.xpath('//div[@aria-labelledby="navbar_dropdown_shortcuts"]/a/@href').extract()
        for url in proxies_by_dropdown_urls:
            url = response.urljoin(url)
            req = scrapy.Request(url=url, callback=self.parse_dropdown, dont_filter=True)
            yield req

    def parse_dropdown(self, response):
        proxies = self.deobfuscate(response)
        for pdata in proxies:
            self.storage_mgr.new_proxy(pdata['address'], pdata['port'], pdata['protocol'])
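
ctx is a module-level JavaScript context whose setup is not shown; deobfuscate() relies on it keeping variables alive between eval() calls. One way to build such a context, assuming the py_mini_racer package (a guess, not necessarily what the project uses):

# Hypothetical setup for the module-level `ctx` used by deobfuscate() above;
# the original import is not shown. MiniRacer keeps state between eval()
# calls, which the deobfuscation depends on.
from py_mini_racer import MiniRacer

ctx = MiniRacer()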