class Proxies:
    """A class that scrapes free proxies from kuaidaili."""

    baseurl1 = 'http://www.kuaidaili.com/free/inha/%d/'
    baseurl2 = 'http://www.kuaidaili.com/free/intr/%d/'

    def __init__(self):
        self.driver = init_phantomjs_driver()
        self.sql = SqlManager()
        self.sql.init_proxy_table(config.free_ipproxy_table)

        # self.urls = [Proxies.baseurl1 % i for i in range(1, 5)]
        self.urls = [Proxies.baseurl1 % i for i in range(1, 11)] + \
                    [Proxies.baseurl2 % i for i in range(1, 11)]

    def run(self):
        for url in self.urls:
            self.get_proxy(url)

    def get_proxy(self, url):
        """Scrape the proxies listed on one page using PhantomJS.

        :param url: url of the page to scrape
        :return: an empty list when the page does not look like a proxy list;
                 otherwise the parsed proxies are handed to add_proxy()
        """
        self.driver.get(url)
        sleep(2)

        if 'HTTP' not in self.driver.title:
            return []

        tbody = self.driver.find_element_by_tag_name('tbody')
        content = tbody.text.split('\n')
        proxies = []
        for line in content:
            tt = line.split()
            tmp = tt[0:4]
            tmp.append(''.join(tt[4:7]))
            proxies.append(tmp)

        for proxy in proxies:
            tmp = Proxy()
            tmp.set_value(
                ip=proxy[0],
                port=proxy[1],
                country=proxy[4],
                anonymity=proxy[2],
                source='kuaidaili',
            )
            self.add_proxy(tmp)

    def add_proxy(self, proxy):
        """In testing mode, print the proxy instead of inserting it into the database."""
        if not config.TestMode:
            self.sql.insert_proxy(config.free_ipproxy_table, proxy)
        else:
            print(proxy)
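
# Usage sketch (not part of the original module): one way the kuaidaili
# crawler above could be driven. It assumes Proxies and its dependencies
# (init_phantomjs_driver, SqlManager, config) are importable as shown above;
# the helper name crawl_kuaidaili is illustrative only.
def crawl_kuaidaili():
    crawler = Proxies()
    try:
        crawler.run()
    finally:
        # Always shut down the PhantomJS process, even if a page fails to load.
        crawler.driver.quit()
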
class BaseSpider(Spider):
    name = 'basespider'

    def __init__(self, *a, **kw):
        super(BaseSpider, self).__init__(*a, **kw)

        self.urls = []
        self.headers = {}
        self.timeout = 10
        self.is_record_web_page = False
        self.sql = SqlManager()

    def init(self):
        self.meta = {
            'download_timeout': self.timeout,
        }

        self.dir_log = 'log/proxy/%s' % self.name
        utils.make_dir(self.dir_log)
        self.sql.init_proxy_table(config.free_ipproxy_table)

    def start_requests(self):
        for i, url in enumerate(self.urls):
            yield Request(
                url=url,
                headers=self.headers,
                meta=self.meta,
                dont_filter=True,
                callback=self.parse_page,
                errback=self.error_parse,
            )

    def parse_page(self, response):
        self.write(response.body)

    def error_parse(self, failure):
        request = failure.request

    def add_proxy(self, proxy):
        self.sql.insert_proxy(config.free_ipproxy_table, proxy)

    def write(self, data):
        if self.is_record_web_page:
            filename = '%s/%s.html' % (
                self.dir_log,
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'))
            with open(filename, 'w') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
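
# Hypothetical subclass sketch (not part of the original module) showing how a
# concrete free-proxy spider is expected to plug into BaseSpider: fill in
# `urls` in __init__, call init(), then turn each scraped row into a Proxy and
# hand it to add_proxy(). The class name, URL, XPath and cell layout below are
# illustrative assumptions, not a real site; BaseSpider, Proxy, add_proxy and
# write come from the code above.
class ExampleProxySpider(BaseSpider):
    name = 'examplespider'

    def __init__(self, *a, **kw):
        super(ExampleProxySpider, self).__init__(*a, **kw)
        self.urls = ['http://example.com/free-proxy-list/%d' % page
                     for page in range(1, 3)]
        self.init()

    def parse_page(self, response):
        self.write(response.body)
        for row in response.xpath('//table//tr'):
            cells = row.xpath('td/text()').extract()
            if len(cells) < 2:
                continue
            proxy = Proxy()
            proxy.set_value(
                ip=cells[0],
                port=cells[1],
                country='',
                anonymity='',
                source=self.name,
            )
            self.add_proxy(proxy)
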
class Validator(Spider):
    name = 'base'
    concurrent_requests = 16
    retry_enabled = False

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)
        self.urls = []
        self.headers = None
        self.timeout = 10
        self.is_record_web_page = False
        self.sql = SqlManager()

    def init(self):
        self.dir_log = 'log/validator/%s' % self.name
        utils.make_dir(self.dir_log)
        self.sql.init_proxy_table(self.name)

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {
            'CONCURRENT_REQUESTS': cls.concurrent_requests,
            'RETRY_ENABLED': cls.retry_enabled,
        }, priority='spider')

    def start_requests(self):
        count = self.sql.get_proxy_count(self.name)
        count_free = self.sql.get_proxy_count(config.httpbin_table)

        ids = self.sql.get_proxy_ids(self.name)
        ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)

        for i in range(0, count + count_free):
            table = self.name if i < count else config.httpbin_table
            id = ids[i] if i < count else ids_httpbin[i - len(ids)]

            proxy = self.sql.get_proxy_with_id(table, id)
            if proxy is None:
                continue

            url = random.choice(self.urls)
            cur_time = time.time()
            yield Request(
                url=url,
                headers=self.headers,
                meta={
                    'cur_time': cur_time,
                    'download_timeout': self.timeout,
                    'proxy_info': proxy,
                    'table': table,
                    'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
                },
                dont_filter=True,
                callback=self.success_parse,
                errback=self.error_parse,
            )

    def success_parse(self, response):
        proxy = response.meta.get('proxy_info')
        table = response.meta.get('table')

        self.save_page(proxy.ip, response.body)
        self.log('success_parse speed:%s meta:%s' %
                 (time.time() - response.meta.get('cur_time'), response.meta))

        proxy.vali_count += 1
        proxy.speed = time.time() - response.meta.get('cur_time')
        if self.success_content_parse(response):
            if table == self.name:
                if proxy.speed > self.timeout:
                    self.sql.del_proxy_with_id(table, proxy.id)
                else:
                    self.sql.update_proxy(table, proxy)
            else:
                if proxy.speed < self.timeout:
                    self.sql.insert_proxy(table_name=self.name, proxy=proxy)
        else:
            if table == self.name:
                self.sql.del_proxy_with_id(table_name=table, id=proxy.id)

        self.sql.commit()

    def success_content_parse(self, response):
        return True

    def error_parse(self, failure):
        request = failure.request
        self.log('error_parse value:%s url:%s meta:%s' %
                 (failure.value, request.url, request.meta))

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')

        if table == self.name:
            self.sql.del_proxy_with_id(table_name=table, id=proxy.id)
        else:
            # TODO: when validating an ip fails, handle the failure according
            # to the specific error type.
            pass

            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))

            # log all errback failures,
            # in case you want to do something special for some errors,
            # you may need the failure's type
            # self.logger.error(repr(failure))

            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, ip, data):
        filename = '{time} {ip}'.format(
            time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'),
            ip=ip)
        if self.is_record_web_page:
            with open('%s/%s.html' % (self.dir_log, filename), 'wb') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
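
# Hypothetical concrete validator sketch (not part of the original module):
# a subclass only needs to pick the target urls and decide in
# success_content_parse() whether the proxied response is acceptable.
# The class name, the httpbin.org URL and the 'origin' check are illustrative
# assumptions; Validator, init() and success_content_parse() come from the
# code above.
class HttpBinValidator(Validator):
    name = 'httpbin'

    def __init__(self, name=None, **kwargs):
        super(HttpBinValidator, self).__init__(name, **kwargs)
        self.urls = ['http://httpbin.org/get']
        self.init()

    def success_content_parse(self, response):
        # httpbin.org/get echoes the caller's address in an "origin" field;
        # requiring it rejects truncated or hijacked responses.
        return 'origin' in response.body
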
class BaseSpider(Spider):
    name = 'basespider'

    def __init__(self, *a, **kw):
        super(BaseSpider, self).__init__(*a, **kw)

        self.urls = []
        self.headers = {}
        self.timeout = 10
        self.is_record_web_page = False
        self.proxy = None
        self.method = 'GET'
        self.formdata = {}
        self.sql = SqlManager()

    def init(self):
        self.meta = {
            'download_timeout': self.timeout,
        }
        if self.proxy:
            self.meta['proxy'] = self.proxy

        self.dir_log = 'log/proxy/%s' % self.name
        utils.make_dir(self.dir_log)
        self.sql.init_proxy_table(config.free_ipproxy_table)

    def start_requests(self):
        for i, url in enumerate(self.urls):
            if self.method == 'POST':
                yield FormRequest(
                    url=url,
                    # headers=self.headers,
                    formdata=self.formdata,
                    meta=self.meta,
                    dont_filter=True,
                    callback=self.parse_page,
                    errback=self.error_parse,
                )
            else:
                yield Request(
                    url=url,
                    # headers=self.headers,
                    method=self.method,
                    meta=self.meta,
                    dont_filter=True,
                    callback=self.parse_page,
                    errback=self.error_parse,
                )

    def parse_page(self, response):
        self.write(response.body)

    def error_parse(self, failure):
        request = failure.request

    def add_proxy(self, proxy):
        # query = {
        #     'ip': proxy.ip,
        # }
        # update_set = {
        #     '$set': proxy.get_dict()
        # }
        # self.freeproxy.find_one_and_update(query, update_set, upsert=True)
        self.sql.insert_proxy(config.free_ipproxy_table, proxy)

    def write(self, data):
        if self.is_record_web_page:
            filename = '%s/%s.html' % (
                self.dir_log,
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'))
            with open(filename, 'w') as f:
                f.write(data)
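
# Hypothetical POST-based subclass sketch (not part of the original module):
# this BaseSpider variant switches to FormRequest when self.method == 'POST',
# so a concrete spider only needs to supply the form fields. The class name,
# endpoint and field names below are illustrative assumptions.
class ExamplePostSpider(BaseSpider):
    name = 'examplepostspider'

    def __init__(self, *a, **kw):
        super(ExamplePostSpider, self).__init__(*a, **kw)
        self.urls = ['http://example.com/proxy/search']
        self.method = 'POST'
        self.formdata = {'page': '1', 'type': 'free'}
        self.init()

    def parse_page(self, response):
        self.write(response.body)
        # Parse the POSTed search results here and call self.add_proxy(...)
        # for every proxy found.
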
class Validator(object):
    name = 'base'
    concurrent_requests = 16
    retry_enabled = False

    def __init__(self, name=None, **kwargs):
        self.urls = []
        self.headers = None
        self.timeout = 10
        self.success_status = [200]
        self.is_record_web_page = False
        self.query = {}

        self.sql = SqlManager()
        self.threadpool = SimplePool.ThreadPool(config.thread_num)

    def init(self):
        self.dir_log = 'log/validator/%s' % self.name
        utils.make_dir(self.dir_log)
        self.sql.init_proxy_table(self.name)

    def start_requests(self):
        count = self.sql.get_proxy_count(config.free_ipproxy_table)
        count_free = self.sql.get_proxy_count(config.httpbin_table)
        # ids = self.sql.get_proxy_ids(config.free_ipproxy_table)
        # ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)

        logging.info('init data...')
        for data in self.sql.db[config.free_ipproxy_table].find(self.query):
            url = random.choice(self.urls)
            cur_time = time.time()

            proxy = Proxy()
            proxy.set_value(
                ip=data.get('ip'),
                port=data.get('port'),
                country=data.get('country'),
                # original read data.get('country') here; assumed to be a copy-paste slip
                anonymity=data.get('anonymity'),
                https=data.get('https'),
                speed=data.get('speed'),
                source=data.get('source'),
                vali_count=data.get('vali_count'),
                err_count=data.get('err_count'))
            proxy.id = data.get('_id')

            args = (cur_time, data, 'http://%s:%s' % (proxy.ip, proxy.port))
            j = SimplePool.ThreadJob(self.valid, args)
            self.threadpool.add_job(j)

        result = ValidThread(self.threadpool)
        result.start()

        self.threadpool.start()
        self.threadpool.finish()

    def valid(self, cur_time, proxy_info, proxy):
        print(proxy)

    def success_parse(self, response):
        proxy = response.meta.get('proxy_info')
        table = response.meta.get('table')

        self.save_page(proxy.ip, response.body)
        self.log('success_parse speed:%s meta:%s' %
                 (time.time() - response.meta.get('cur_time'), response.meta))

        proxy.vali_count += 1
        proxy.speed = time.time() - response.meta.get('cur_time')
        if self.success_content_parse(response):
            if table == self.name:
                if proxy.speed > self.timeout:
                    self.sql.del_proxy_with_id(table, proxy.id)
                else:
                    self.sql.update_proxy(table, proxy)
            else:
                if proxy.speed < self.timeout:
                    self.sql.insert_proxy(table_name=self.name, proxy=proxy)
        else:
            if table == self.name:
                self.sql.del_proxy_with_id(table_name=table, id=proxy.id)

        self.sql.commit()

    def success_content_parse(self, response):
        if response.status not in self.success_status:
            return False
        return True

    def error_parse(self, failure):
        request = failure.request
        self.log('error_parse value:%s url:%s meta:%s' %
                 (failure.value, request.url, request.meta))

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')

        if table == self.name:
            self.sql.del_proxy_with_id(table_name=table, id=proxy.id)
        else:
            # TODO: when validating an ip fails, handle the failure according
            # to the specific error type.
            pass

            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))

            # log all errback failures,
            # in case you want to do something special for some errors,
            # you may need the failure's type
            # self.logger.error(repr(failure))

            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, ip, data):
        filename = '{time} {ip}'.format(
            time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'),
            ip=ip)
        if self.is_record_web_page:
            with open('%s/%s.html' % (self.dir_log, filename), 'wb') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
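
# A minimal sketch (not part of the original module) of what a thread-pool job
# in this Validator could do instead of just printing the proxy: fetch a test
# url through the proxy and report whether the round trip succeeded and how
# long it took. It uses the third-party `requests` library as an assumption;
# the helper name check_proxy and the example arguments are illustrative only.
import time

import requests


def check_proxy(url, proxy_url, timeout=10, success_status=(200,)):
    """Return (ok, speed) for one blocking validation attempt through proxy_url."""
    start = time.time()
    try:
        resp = requests.get(
            url,
            proxies={'http': proxy_url, 'https': proxy_url},
            timeout=timeout)
        ok = resp.status_code in success_status
    except requests.RequestException:
        # Any connection, proxy or timeout error counts as a failed validation.
        ok = False
    return ok, time.time() - start

# Example: ok, speed = check_proxy('http://httpbin.org/get',
#                                  'http://1.2.3.4:8080')
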