Example #1
class Proxies:
    """"
    a class object to get proxies from kuaidaili
    """
    baseurl1 = 'http://www.kuaidaili.com/free/inha/%d/'
    baseurl2 = 'http://www.kuaidaili.com/free/intr/%d/'

    def __init__(self):
        self.driver = init_phantomjs_driver()
        self.sql = SqlManager()
        self.sql.init_proxy_table(config.free_ipproxy_table)
        # self.urls = [Proxies.baseurl1 % i for i in range(1,5)]
        self.urls = [Proxies.baseurl1 % i for i in range(1, 11)
                     ] + [Proxies.baseurl2 % i for i in range(1, 11)]

    def run(self):
        for url in self.urls:
            self.get_proxy(url)

    def get_proxy(self, url):
        """
        Scrape one listing page with the shared PhantomJS driver and store
        every proxy it contains via add_proxy().
        :param url: URL of the listing page
        """

        self.driver.get(url)
        sleep(2)
        if 'HTTP' not in self.driver.title:
            # the page did not render a proxy table, nothing to collect
            return
        else:
            tbody = self.driver.find_element_by_tag_name('tbody')
            content = tbody.text.split('\n')
            proxies = []
            for line in content:
                # each row: ip, port, anonymity, type, then the location words
                tt = line.split()
                tmp = tt[0:4]
                tmp.append(''.join(tt[4:7]))
                proxies.append(tmp)

            for proxy in proxies:
                tmp = Proxy()
                tmp.set_value(
                    ip=proxy[0],
                    port=proxy[1],
                    country=proxy[4],
                    anonymity=proxy[2],
                    source='kuaidaili',
                )
                self.add_proxy(tmp)

    def add_proxy(self, proxy):
        """if in the testing mode, the spider will print out the proxy instead of inserting to the database"""
        if not config.TestMode:
            self.sql.insert_proxy(config.free_ipproxy_table, proxy)
        else:
            print(proxy)
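
A minimal way to run this crawler (a sketch only; it assumes init_phantomjs_driver, SqlManager, Proxy and config are importable from the project, just as the class itself does):

if __name__ == '__main__':
    crawler = Proxies()
    crawler.run()
    crawler.sql.commit()     # persist the inserts, as the Scrapy-based spiders do in close()
    crawler.driver.quit()    # release the PhantomJS process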
Example #2
class BaseSpider(Spider):
    name = 'basespider'

    def __init__(self, *a, **kw):
        super(BaseSpider, self).__init__(*a, **kw)

        self.urls = []
        self.headers = {}
        self.timeout = 10
        self.is_record_web_page = False

        self.sql = SqlManager()

    def init(self):
        self.meta = {
            'download_timeout': self.timeout,
        }

        self.dir_log = 'log/proxy/%s' % self.name
        utils.make_dir(self.dir_log)
        self.sql.init_proxy_table(config.free_ipproxy_table)

    def start_requests(self):
        for i, url in enumerate(self.urls):
            yield Request(
                url=url,
                headers=self.headers,
                meta=self.meta,
                dont_filter=True,
                callback=self.parse_page,
                errback=self.error_parse,
            )

    def parse_page(self, response):
        self.write(response.body)

    def error_parse(self, failure):
        request = failure.request
        pass

    def add_proxy(self, proxy):
        self.sql.insert_proxy(config.free_ipproxy_table, proxy)

    def write(self, data):
        if self.is_record_web_page:
            filename = '%s/%s.html' % (
                self.dir_log,
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'))
            # the file is closed automatically when the with-block exits
            with open(filename, 'w') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
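
Concrete spiders in this project subclass BaseSpider, fill in self.urls and override parse_page. The sketch below only illustrates that pattern; the spider name, source URL and column layout are hypothetical, not taken from the project:

class ExampleFreeProxySpider(BaseSpider):
    name = 'examplefreeproxy'

    def __init__(self, *a, **kw):
        super(ExampleFreeProxySpider, self).__init__(*a, **kw)
        # hypothetical source site; each real spider targets one proxy list
        self.urls = ['http://www.example.com/free-proxy-list/%d/' % page
                     for page in range(1, 6)]
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.init()

    def parse_page(self, response):
        self.write(response.body)
        # assumed column order: ip in the first cell, port in the second
        for row in response.xpath('//table//tr'):
            ip = row.xpath('td[1]/text()').extract_first()
            port = row.xpath('td[2]/text()').extract_first()
            if ip and port:
                proxy = Proxy()
                proxy.set_value(ip=ip, port=port, country='', anonymity='',
                                source=self.name)
                self.add_proxy(proxy)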
Example #3
class Validator(Spider):
    name = 'base'
    concurrent_requests = 16
    retry_enabled = False

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)

        self.urls = []
        self.headers = None
        self.timeout = 10
        self.is_record_web_page = False

        self.sql = SqlManager()

    def init(self):
        self.dir_log = 'log/validator/%s' % self.name
        utils.make_dir(self.dir_log)

        self.sql.init_proxy_table(self.name)

    @classmethod
    def update_settings(cls, settings):
        # fall back to the class-level defaults when no custom_settings is given
        settings.setdict(cls.custom_settings or {
            'CONCURRENT_REQUESTS': cls.concurrent_requests,
            'RETRY_ENABLED': cls.retry_enabled,
        }, priority='spider')

    def start_requests(self):
        count = self.sql.get_proxy_count(self.name)
        count_free = self.sql.get_proxy_count(config.httpbin_table)

        ids = self.sql.get_proxy_ids(self.name)
        ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)

        # walk the validator's own table first, then the free-proxy (httpbin) table
        for i in range(0, count + count_free):
            table = self.name if (i < count) else config.httpbin_table
            id = ids[i] if i < count else ids_httpbin[i - len(ids)]

            proxy = self.sql.get_proxy_with_id(table, id)
            if proxy is None:
                continue

            url = random.choice(self.urls)
            cur_time = time.time()
            yield Request(
                url=url,
                headers=self.headers,
                meta={
                    'cur_time': cur_time,
                    'download_timeout': self.timeout,
                    'proxy_info': proxy,
                    'table': table,
                    'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
                },
                dont_filter=True,
                callback=self.success_parse,
                errback=self.error_parse,
            )

    def success_parse(self, response):
        proxy = response.meta.get('proxy_info')
        table = response.meta.get('table')

        self.save_page(proxy.ip, response.body)
        self.log('success_parse speed:%s meta:%s' %
                 (time.time() - response.meta.get('cur_time'), response.meta))

        proxy.vali_count += 1
        proxy.speed = time.time() - response.meta.get('cur_time')
        if self.success_content_parse(response):
            if table == self.name:
                if proxy.speed > self.timeout:
                    self.sql.del_proxy_with_id(table, proxy.id)
                else:
                    self.sql.update_proxy(table, proxy)
            else:
                if proxy.speed < self.timeout:
                    self.sql.insert_proxy(table_name=self.name, proxy=proxy)
        else:
            if table == self.name:
                self.sql.del_proxy_with_id(table_name=table, id=proxy.id)

        self.sql.commit()

    def success_content_parse(self, response):
        # hook for subclasses: decide whether the response proves the proxy works
        return True

    def error_parse(self, failure):
        request = failure.request
        self.log('error_parse value:%s url:%s meta:%s' %
                 (failure.value, request.url, request.meta))

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')

        if table == self.name:
            self.sql.del_proxy_with_id(table_name=table, id=proxy.id)
        else:
            # TODO: when validation of a free proxy fails, handle it according to the specific error type
            pass

            #
            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # log all errback failures,
            # # in case you want to do something special for some errors,
            # # you may need the failure's type
            # self.logger.error(repr(failure))
            #
            # #if isinstance(failure.value, HttpError):
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            #
            # #elif isinstance(failure.value, DNSLookupError):
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            #
            # #elif isinstance(failure.value, TimeoutError):
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, ip, data):
        filename = '{time} {ip}'.format(
            time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'),
            ip=ip)

        if self.is_record_web_page:
            with open('%s/%s.html' % (self.dir_log, filename), 'wb') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
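
A validator for a concrete target site then only needs to supply its URLs and a content check; everything else (iterating both proxy tables, timing requests, pruning slow or dead proxies) comes from this base class. The subclass below is a sketch with a hypothetical name and target URL:

class ExampleSiteValidator(Validator):
    name = 'examplesite'

    def __init__(self, name=None, **kwargs):
        super(ExampleSiteValidator, self).__init__(name, **kwargs)
        self.timeout = 10
        self.urls = ['http://www.example.com/']
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.init()

    def success_content_parse(self, response):
        # the proxy only counts as working if the expected page really came back
        return b'Example Domain' in response.body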
Example #4
class BaseSpider(Spider):
    name = 'basespider'

    def __init__(self, *a, **kw):
        super(BaseSpider, self).__init__(*a, **kw)

        self.urls = []
        self.headers = {}
        self.timeout = 10
        self.is_record_web_page = False
        self.proxy = None
        self.method = 'GET'
        self.formdata = {}

        self.sql = SqlManager()

    def init(self):
        self.meta = {
            'download_timeout': self.timeout,
        }
        if self.proxy:
            self.meta['proxy'] = self.proxy

        self.dir_log = 'log/proxy/%s' % self.name
        utils.make_dir(self.dir_log)
        self.sql.init_proxy_table(config.free_ipproxy_table)

    def start_requests(self):
        for i, url in enumerate(self.urls):
            if self.method == 'POST':
                yield FormRequest(
                    url=url,
                    # headers=self.headers,
                    formdata=self.formdata,
                    meta=self.meta,
                    dont_filter=True,
                    callback=self.parse_page,
                    errback=self.error_parse,
                )
            else:
                yield Request(
                    url=url,
                    # headers=self.headers,
                    method=self.method,
                    meta=self.meta,
                    dont_filter=True,
                    callback=self.parse_page,
                    errback=self.error_parse,
                )

    def parse_page(self, response):
        self.write(response.body)

    def error_parse(self, failure):
        request = failure.request
        pass

    def add_proxy(self, proxy):
        # query = {
        #     'ip': proxy.ip,
        # }
        # update_set = {
        #     '$set': proxy.get_dict()
        # }
        # self.freeproxy.find_one_and_update(query, update_set, upsert=True)
        self.sql.insert_proxy(config.free_ipproxy_table, proxy)

    def write(self, data):
        if self.is_record_web_page:
            filename = '%s/%s.html' % (
                self.dir_log,
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'))
            # the file is closed automatically when the with-block exits
            with open(filename, 'w') as f:
                f.write(data)
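
The extra self.method and self.formdata fields let a subclass scrape sources whose listings sit behind a POST form. A sketch of such a spider (the endpoint and form fields are hypothetical):

class ExamplePostProxySpider(BaseSpider):
    name = 'examplepostproxy'

    def __init__(self, *a, **kw):
        super(ExamplePostProxySpider, self).__init__(*a, **kw)
        self.urls = ['http://www.example.com/proxylist/search']
        self.method = 'POST'
        self.formdata = {'type': 'elite', 'page': '1'}  # hypothetical form fields
        self.init()

    def parse_page(self, response):
        self.write(response.body)
        # parse the POSTed search result here and hand each entry to add_proxy()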
Example #5
class Validator(object):
    name = 'base'
    concurrent_requests = 16
    retry_enabled = False

    def __init__(self, name=None, **kwargs):

        self.urls = []
        self.headers = None
        self.timeout = 10
        self.success_status = [200]
        self.is_record_web_page = False
        self.query = {}

        self.sql = SqlManager()

        self.threadpool = SimplePool.ThreadPool(config.thread_num)

    def init(self):
        self.dir_log = 'log/validator/%s' % self.name
        utils.make_dir(self.dir_log)

        self.sql.init_proxy_table(self.name)

    def start_requests(self):
        count = self.sql.get_proxy_count(config.free_ipproxy_table)
        count_free = self.sql.get_proxy_count(config.httpbin_table)

        # ids = self.sql.get_proxy_ids(config.free_ipproxy_table)
        # ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)

        logging.info('init data...')
        for data in self.sql.db[config.free_ipproxy_table].find(self.query):
            url = random.choice(self.urls)
            cur_time = time.time()

            proxy = Proxy()
            proxy.set_value(ip=data.get('ip'),
                            port=data.get('port'),
                            country=data.get('country'),
                            anonymity=data.get('anonymity'),
                            https=data.get('https'),
                            speed=data.get('speed'),
                            source=data.get('source'),
                            vali_count=data.get('vali_count'),
                            err_count=data.get('err_count'))
            proxy.id = data.get('_id')

            args = (cur_time, data, 'http://%s:%s' % (proxy.ip, proxy.port))

            j = SimplePool.ThreadJob(self.valid, args)

            self.threadpool.add_job(j)

        result = ValidThread(self.threadpool)
        result.start()
        self.threadpool.start()
        self.threadpool.finish()

    def valid(self, cur_time, proxy_info, proxy):
        # hook for subclasses: actually test the proxy here
        print(proxy)

    def success_parse(self, response):
        proxy = response.meta.get('proxy_info')
        table = response.meta.get('table')

        self.save_page(proxy.ip, response.body)
        self.log('success_parse speed:%s meta:%s' %
                 (time.time() - response.meta.get('cur_time'), response.meta))

        proxy.vali_count += 1
        proxy.speed = time.time() - response.meta.get('cur_time')
        if self.success_content_parse(response):
            if table == self.name:
                if proxy.speed > self.timeout:
                    self.sql.del_proxy_with_id(table, proxy.id)
                else:
                    self.sql.update_proxy(table, proxy)
            else:
                if proxy.speed < self.timeout:
                    self.sql.insert_proxy(table_name=self.name, proxy=proxy)
        else:
            if table == self.name:
                self.sql.del_proxy_with_id(table_name=table, id=proxy.id)

        self.sql.commit()

    def success_content_parse(self, response):
        if response.status not in self.success_status:
            return False
        return True

    def error_parse(self, failure):
        request = failure.request
        self.log('error_parse value:%s url:%s meta:%s' %
                 (failure.value, request.url, request.meta))

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')

        if table == self.name:
            self.sql.del_proxy_with_id(table_name=table, id=proxy.id)
        else:
            # TODO: when validation of a free proxy fails, handle it according to the specific error type
            pass

            #
            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # log all errback failures,
            # # in case you want to do something special for some errors,
            # # you may need the failure's type
            # self.logger.error(repr(failure))
            #
            # #if isinstance(failure.value, HttpError):
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            #
            # #elif isinstance(failure.value, DNSLookupError):
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            #
            # #elif isinstance(failure.value, TimeoutError):
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, ip, data):
        filename = '{time} {ip}'.format(
            time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'),
            ip=ip)

        if self.is_record_web_page:
            with open('%s/%s.html' % (self.dir_log, filename), 'wb') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
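
Because this variant bypasses Scrapy, a subclass has to perform the HTTP check itself inside valid(). The sketch below does that with the requests library (an assumption; the project may use a different HTTP client), and the subclass name and test URL are illustrative:

import requests

class ExampleThreadedValidator(Validator):
    name = 'examplethreaded'

    def __init__(self, name=None, **kwargs):
        super(ExampleThreadedValidator, self).__init__(name, **kwargs)
        self.urls = ['http://httpbin.org/get']
        self.init()

    def valid(self, cur_time, proxy_info, proxy):
        # 'proxy' arrives as the 'http://ip:port' string built in start_requests()
        try:
            response = requests.get(random.choice(self.urls),
                                    proxies={'http': proxy, 'https': proxy},
                                    timeout=self.timeout)
            speed = time.time() - cur_time
            if response.status_code in self.success_status and speed < self.timeout:
                logging.info('proxy %s ok, speed %.2fs' % (proxy, speed))
            else:
                logging.info('proxy %s rejected' % proxy)
        except requests.RequestException as e:
            logging.info('proxy %s failed: %s' % (proxy, e))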