def GET(self):
    try:
        sql = SqlManager()
        inputs = web.input()
        name = inputs.get('name')
        anonymity = inputs.get('anonymity', '%')
        https = inputs.get('https', '%')
        order = inputs.get('order', 'speed')
        sort = inputs.get('sort', 'asc')
        count = inputs.get('count', 100)

        kwargs = {
            'anonymity': anonymity,
            'https': https,
            'order': order,
            'sort': sort,
            'count': count,
        }

        result = sql.select_proxy(name, **kwargs)
        data = [{
            'id': item[0],
            'ip': item[1],
            'port': item[2],
            'anonymity': item[4],
            'https': item[5],
            'speed': item[6],
            'save_time': str(item[8]),
        } for item in result]

        data = json.dumps(data, indent=4)
        return data
    except Exception, e:
        logging.exception('select exception msg:%s' % e)
def __init__(self):
    self.driver = init_phantomjs_driver()
    self.sql = SqlManager()
    self.sql.init_proxy_table(config.free_ipproxy_table)

    # self.urls = [Proxies.baseurl1 % i for i in range(1, 5)]
    self.urls = [Proxies.baseurl1 % i for i in range(1, 11)] + \
                [Proxies.baseurl2 % i for i in range(1, 11)]
def select():
    sql = SqlManager()
    name = request.args.get('name')
    anonymity = request.args.get('anonymity', '')
    https = request.args.get('https', '')
    order = request.args.get('order', 'speed')
    sort = request.args.get('sort', 'asc')
    count = request.args.get('count', 100)

    kwargs = {
        'anonymity': anonymity,
        'https': https,
        'order': order,
        'sort': sort,
        'count': count,
    }

    result = sql.select_proxy(name, **kwargs)
    data = [{
        'ip': item.get('ip'),
        'port': item.get('port'),
        'anonymity': item.get('anonymity'),
        'https': item.get('https'),
        'speed': item.get('speed'),
        'save_time': item.get('save_time', ''),
    } for item in result]

    return json.dumps(data, indent=4)
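# A minimal client-side sketch for the select endpoint above. The host, port
# and '/select' route are assumptions about how the Flask app is mounted;
# adjust them to the real deployment.
import json

import requests

resp = requests.get('http://127.0.0.1:8000/select',
                    params={'name': 'httpbin', 'https': 'yes', 'count': 10})
for item in json.loads(resp.text):
    print('%s:%s speed:%s' % (item['ip'], item['port'], item['speed']))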
def delete():
    sql = SqlManager()
    name = request.args.get('name')
    ip = request.args.get('ip')
    result = sql.del_proxy_with_ip(name, ip)
    data = {'result': result}
    return json.dumps(data, indent=4)
def __init__(self, file, verbose=False):
    self.file = file
    self.verbose = verbose
    self.sql = SqlManager()

    if not self.verbose:
        self._init_log()
    else:
        logging.disable(sys.maxint)
class Proxies:
    """A crawler that collects free proxies from kuaidaili."""

    baseurl1 = 'http://www.kuaidaili.com/free/inha/%d/'
    baseurl2 = 'http://www.kuaidaili.com/free/intr/%d/'

    def __init__(self):
        self.driver = init_phantomjs_driver()
        self.sql = SqlManager()
        self.sql.init_proxy_table(config.free_ipproxy_table)

        # self.urls = [Proxies.baseurl1 % i for i in range(1, 5)]
        self.urls = [Proxies.baseurl1 % i for i in range(1, 11)] + \
                    [Proxies.baseurl2 % i for i in range(1, 11)]

    def run(self):
        for url in self.urls:
            self.get_proxy(url)

    def get_proxy(self, url):
        """Scrape the proxy table at `url` with phantomjs and store each entry.

        :param url: url of the page to scrape
        :return: an empty list if the page does not look like a proxy listing
        """
        self.driver.get(url)
        sleep(2)

        if 'HTTP' not in self.driver.title:
            return []

        tbody = self.driver.find_element_by_tag_name('tbody')
        content = tbody.text.split('\n')
        proxies = []
        for line in content:
            tt = line.split()
            tmp = tt[0:4]
            tmp.append(''.join(tt[4:7]))
            proxies.append(tmp)

        for proxy in proxies:
            tmp = Proxy()
            tmp.set_value(
                ip=proxy[0],
                port=proxy[1],
                country=proxy[4],
                anonymity=proxy[2],
                source='kuaidaili',
            )
            self.add_proxy(tmp)

    def add_proxy(self, proxy):
        """In testing mode, print the proxy instead of inserting it into the database."""
        if not config.TestMode:
            self.sql.insert_proxy(config.free_ipproxy_table, proxy)
        else:
            print(proxy)
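# A minimal usage sketch for the kuaidaili crawler above, assuming
# init_phantomjs_driver(), SqlManager and config are importable the same way
# they are inside the class; how the module is actually launched is an
# assumption.
if __name__ == '__main__':
    p = Proxies()
    try:
        p.run()          # crawl every page listed in p.urls
    finally:
        p.driver.quit()  # always release the phantomjs process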
def __init__(self, *a, **kw):
    super(BaseSpider, self).__init__(*a, **kw)
    self.urls = []
    self.headers = {}
    self.timeout = 10
    self.is_record_web_page = False
    self.sql = SqlManager()
def GET(self):
    try:
        sql = SqlManager()
        inputs = web.input()
        name = inputs.get('name')
        ip = inputs.get('ip')
        return sql.del_proxy_with_ip(name, ip)
    except Exception, e:
        logging.exception('delete exception msg:%s' % e)
def __init__(self, name=None, **kwargs):
    super(Validator, self).__init__(name, **kwargs)
    self.urls = []
    self.headers = None
    self.timeout = 10
    self.is_record_web_page = False
    self.sql = SqlManager()
class BaseSpider(Spider):
    name = 'basespider'

    def __init__(self, *a, **kw):
        super(BaseSpider, self).__init__(*a, **kw)
        self.urls = []
        self.headers = {}
        self.timeout = 10
        self.is_record_web_page = False
        self.sql = SqlManager()

    def init(self):
        self.meta = {
            'download_timeout': self.timeout,
        }

        self.dir_log = 'log/proxy/%s' % self.name
        utils.make_dir(self.dir_log)
        self.sql.init_proxy_table(config.free_ipproxy_table)

    def start_requests(self):
        for i, url in enumerate(self.urls):
            yield Request(
                url=url,
                headers=self.headers,
                meta=self.meta,
                dont_filter=True,
                callback=self.parse_page,
                errback=self.error_parse,
            )

    def parse_page(self, response):
        self.write(response.body)

    def error_parse(self, failure):
        request = failure.request

    def add_proxy(self, proxy):
        self.sql.insert_proxy(config.free_ipproxy_table, proxy)

    def write(self, data):
        if self.is_record_web_page:
            filename = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f')
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
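# A sketch of how a concrete source spider could build on BaseSpider, assuming
# subclasses only fill in `urls` and override parse_page(); the site, xpath
# selectors and column order here are invented for illustration.
class ExampleFreeProxySpider(BaseSpider):
    name = 'examplefreeproxy'

    def __init__(self, *a, **kw):
        super(ExampleFreeProxySpider, self).__init__(*a, **kw)
        self.urls = ['http://free-proxy.example.com/list/%d' % page for page in range(1, 6)]
        self.init()

    def parse_page(self, response):
        self.write(response.body)
        for row in response.xpath('//table//tr')[1:]:
            fields = row.xpath('td/text()').extract()
            if len(fields) < 2:
                continue

            proxy = Proxy()
            proxy.set_value(
                ip=fields[0],
                port=fields[1],
                country=None,
                anonymity=None,
                source=self.name,
            )
            self.add_proxy(proxy)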
def __init__(self, *a, **kw):
    super(BaseSpider, self).__init__(*a, **kw)
    self.urls = []
    self.headers = {}
    self.timeout = 10
    self.is_record_web_page = False
    self.proxy = None
    self.method = 'GET'
    self.formdata = {}
    self.sql = SqlManager()
def __init__(self, name=None, **kwargs):
    self.urls = []
    self.headers = None
    self.timeout = 10
    self.success_status = [200]
    self.is_record_web_page = False
    self.query = {}
    self.sql = SqlManager()
    self.threadpool = SimplePool.ThreadPool(config.thread_num)
def query():
    sql = SqlManager()
    start_id = request.args.get('sid')
    limit = int(request.args.get('limit', '100'))
    proxies = sql.get_proxies_info(config.httpbin_table, start_id=start_id, limit=limit)

    data = [{
        'id': proxy[0],
        'ip': proxy[1],
        'port': proxy[2],
        'https': proxy[3],
    } for proxy in proxies]

    return json.dumps(data, indent=4)
def insert():
    sql = SqlManager()
    name = request.args.get('name')

    proxy = Proxy()
    proxy.set_value(
        ip=request.args.get('ip'),
        port=request.args.get('port'),
        country=request.args.get('country', None),
        anonymity=request.args.get('anonymity', None),
        https=request.args.get('https', 'no'),
        speed=request.args.get('speed', -1),
        source=request.args.get('source', name),
    )

    result = sql.insert_proxy(name, proxy)
    data = {'result': result}
    return json.dumps(data, indent=4)
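# A hedged client sketch for the insert endpoint above; the host, port,
# '/insert' route and the 'free_ipproxy' table name are assumptions about how
# the Flask app and database are configured.
import requests

resp = requests.get('http://127.0.0.1:8000/insert', params={
    'name': 'free_ipproxy',  # target table name (assumed value)
    'ip': '1.2.3.4',
    'port': '8080',
    'https': 'yes',
})
print(resp.text)  # expected shape: {"result": ...}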
def GET(self):
    try:
        sql = SqlManager()
        inputs = web.input()
        name = inputs.get('name')

        proxy = Proxy()
        proxy.set_value(
            ip=inputs.get('ip'),
            port=inputs.get('port'),
            country=inputs.get('country', None),
            anonymity=inputs.get('anonymity', None),
            https=inputs.get('https', 'no'),
            speed=inputs.get('speed', -1),
            source=inputs.get('source', name),
        )

        sql.insert_proxy(name, proxy)
    except Exception, e:
        logging.exception('insert exception msg:%s' % e)
def proxy_list():
    sql = SqlManager()
    # query parameters arrive as strings, so cast before doing arithmetic
    page_size = int(request.args.get('page_size', 50))
    page = int(request.args.get('page', 1))
    skip = (page - 1) * page_size
    result = sql.db[config.free_ipproxy_table].find().limit(page_size).skip(skip)

    data = [{
        'ip': item.get('ip'),
        'port': item.get('port'),
        'country': item.get('country', ''),
        'anonymity': item.get('anonymity'),
        'https': item.get('https'),
        'speed': item.get('speed'),
        'save_time': item.get('save_time', ''),
    } for item in result]

    return json.dumps(data, indent=4)
class Validator(object):
    name = 'base'
    concurrent_requests = 16
    retry_enabled = False

    def __init__(self, name=None, **kwargs):
        self.urls = []
        self.headers = None
        self.timeout = 10
        self.success_status = [200]
        self.is_record_web_page = False
        self.query = {}
        self.sql = SqlManager()
        self.threadpool = SimplePool.ThreadPool(config.thread_num)

    def init(self):
        self.dir_log = 'log/validator/%s' % self.name
        utils.make_dir(self.dir_log)
        self.sql.init_proxy_table(self.name)

    def start_requests(self):
        count = self.sql.get_proxy_count(config.free_ipproxy_table)
        count_free = self.sql.get_proxy_count(config.httpbin_table)

        # ids = self.sql.get_proxy_ids(config.free_ipproxy_table)
        # ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)

        logging.info('init data...')
        for data in self.sql.db[config.free_ipproxy_table].find(self.query):
            url = random.choice(self.urls)
            cur_time = time.time()

            proxy = Proxy()
            proxy.set_value(
                ip=data.get('ip'),
                port=data.get('port'),
                country=data.get('country'),
                anonymity=data.get('anonymity'),
                https=data.get('https'),
                speed=data.get('speed'),
                source=data.get('source'),
                vali_count=data.get('vali_count'),
                err_count=data.get('err_count'))
            proxy.id = data.get('_id')

            args = (cur_time, data, 'http://%s:%s' % (proxy.ip, proxy.port))
            j = SimplePool.ThreadJob(self.valid, args)
            self.threadpool.add_job(j)

        result = ValidThread(self.threadpool)
        result.start()

        self.threadpool.start()
        self.threadpool.finish()

    def valid(self, cur_time, proxy_info, proxy):
        print proxy

    def success_parse(self, response):
        proxy = response.meta.get('proxy_info')
        table = response.meta.get('table')

        self.save_page(proxy.ip, response.body)
        self.log('success_parse speed:%s meta:%s' %
                 (time.time() - response.meta.get('cur_time'), response.meta))

        proxy.vali_count += 1
        proxy.speed = time.time() - response.meta.get('cur_time')
        if self.success_content_parse(response):
            if table == self.name:
                if proxy.speed > self.timeout:
                    self.sql.del_proxy_with_id(table, proxy.id)
                else:
                    self.sql.update_proxy(table, proxy)
            else:
                if proxy.speed < self.timeout:
                    self.sql.insert_proxy(table_name=self.name, proxy=proxy)
        else:
            if table == self.name:
                self.sql.del_proxy_with_id(table_name=table, id=proxy.id)

        self.sql.commit()

    def success_content_parse(self, response):
        if response.status not in self.success_status:
            return False
        return True

    def error_parse(self, failure):
        request = failure.request
        self.log('error_parse value:%s url:%s meta:%s' %
                 (failure.value, request.url, request.meta))

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')

        if table == self.name:
            self.sql.del_proxy_with_id(table_name=table, id=proxy.id)
        else:
            # TODO: when validation through a proxy fails, handle it according
            # to the specific error type.
            pass

            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # log all errback failures,
            # # in case you want to do something special for some errors,
            # # you may need the failure's type
            # self.logger.error(repr(failure))
            #
            # # if isinstance(failure.value, HttpError):
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            #
            # # elif isinstance(failure.value, DNSLookupError):
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            #
            # # elif isinstance(failure.value, TimeoutError):
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, ip, data):
        filename = '{time} {ip}'.format(
            time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'), ip=ip)

        if self.is_record_web_page:
            with open('%s/%s.html' % (self.dir_log, filename), 'wb') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
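# A usage sketch for the thread-pool Validator above, assuming a subclass only
# needs to provide the validation urls and a table name; the 'httpbin' name,
# the url and the timeout are illustrative values, not taken from the project.
class ExampleHttpBinValidator(Validator):
    name = 'httpbin'

    def __init__(self, name=None, **kwargs):
        super(ExampleHttpBinValidator, self).__init__(name, **kwargs)
        self.urls = ['http://httpbin.org/get']
        self.timeout = 10
        self.init()

if __name__ == '__main__':
    # start_requests() queues one ThreadJob per stored proxy and runs the pool
    ExampleHttpBinValidator().start_requests()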
class BaseSpider(Spider):
    name = 'basespider'

    def __init__(self, *a, **kw):
        super(BaseSpider, self).__init__(*a, **kw)
        self.urls = []
        self.headers = {}
        self.timeout = 10
        self.is_record_web_page = False
        self.proxy = None
        self.method = 'GET'
        self.formdata = {}
        self.sql = SqlManager()

    def init(self):
        self.meta = {
            'download_timeout': self.timeout,
        }
        if self.proxy:
            self.meta['proxy'] = self.proxy

        self.dir_log = 'log/proxy/%s' % self.name
        utils.make_dir(self.dir_log)
        self.sql.init_proxy_table(config.free_ipproxy_table)

    def start_requests(self):
        for i, url in enumerate(self.urls):
            if self.method == 'POST':
                yield FormRequest(
                    url=url,
                    # headers=self.headers,
                    formdata=self.formdata,
                    meta=self.meta,
                    dont_filter=True,
                    callback=self.parse_page,
                    errback=self.error_parse,
                )
            else:
                yield Request(
                    url=url,
                    # headers=self.headers,
                    method=self.method,
                    meta=self.meta,
                    dont_filter=True,
                    callback=self.parse_page,
                    errback=self.error_parse,
                )

    def parse_page(self, response):
        self.write(response.body)

    def error_parse(self, failure):
        request = failure.request

    def add_proxy(self, proxy):
        # query = {
        #     'ip': proxy.ip,
        # }
        # update_set = {
        #     '$set': proxy.get_dict()
        # }
        # self.freeproxy.find_one_and_update(query, update_set, upsert=True)
        self.sql.insert_proxy(config.free_ipproxy_table, proxy)

    def write(self, data):
        if self.is_record_web_page:
            filename = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f')
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)
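# A short sketch of configuring the POST path of this BaseSpider variant; the
# endpoint and form fields are invented for illustration and FormRequest
# expects string values in formdata.
class ExamplePostSpider(BaseSpider):
    name = 'examplepost'

    def __init__(self, *a, **kw):
        super(ExamplePostSpider, self).__init__(*a, **kw)
        self.urls = ['http://proxy-source.example.com/search']
        self.method = 'POST'
        self.formdata = {'country': 'all', 'page': '1'}
        self.init()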
def rand():
    sql = SqlManager()
    result = sql.get_rand_proxy()
    print result
    return result[0]
from ipproxytool.spiders.proxy.proxydb import ProxyDBSpider

scrapydo.setup()

if __name__ == '__main__':
    os.chdir(sys.path[0])

    if not os.path.exists('log'):
        os.makedirs('log')

    logging.basicConfig(
        filename='log/crawl_proxy.log',
        format='%(levelname)s %(asctime)s: %(message)s',
        level=config.log_level
    )

    sql = SqlManager()

    spiders = [
        XiCiDaiLiSpider,
        SixSixIpSpider,
        IpOneEightOneSpider,
        # KuaiDaiLiSpider,  # crawled with a different method instead
        # GatherproxySpider,
        HidemySpider,
        ProxylistplusSpider,
        # FreeProxyListsSpider,
        # PeulandSpider,  # target site is no longer available
        # UsProxySpider,
        ProxyDBSpider,
        # ProxyRoxSpider,
    ]
class Validator(Spider):
    name = 'base'
    concurrent_requests = 16
    retry_enabled = False

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)
        self.urls = []
        self.headers = None
        self.timeout = 10
        self.is_record_web_page = False
        self.sql = SqlManager()

    def init(self):
        self.dir_log = 'log/validator/%s' % self.name
        utils.make_dir(self.dir_log)
        self.sql.init_proxy_table(self.name)

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {
            'CONCURRENT_REQUESTS': cls.concurrent_requests,
            'RETRY_ENABLED': cls.retry_enabled,
        }, priority='spider')

    def start_requests(self):
        count = self.sql.get_proxy_count(self.name)
        count_free = self.sql.get_proxy_count(config.httpbin_table)

        ids = self.sql.get_proxy_ids(self.name)
        ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)

        for i in range(0, count + count_free):
            table = self.name if (i < count) else config.httpbin_table
            id = ids[i] if i < count else ids_httpbin[i - len(ids)]

            proxy = self.sql.get_proxy_with_id(table, id)
            if proxy is None:
                continue

            url = random.choice(self.urls)
            cur_time = time.time()
            yield Request(
                url=url,
                headers=self.headers,
                meta={
                    'cur_time': cur_time,
                    'download_timeout': self.timeout,
                    'proxy_info': proxy,
                    'table': table,
                    'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
                },
                dont_filter=True,
                callback=self.success_parse,
                errback=self.error_parse,
            )

    def success_parse(self, response):
        proxy = response.meta.get('proxy_info')
        table = response.meta.get('table')

        self.save_page(proxy.ip, response.body)
        self.log('success_parse speed:%s meta:%s' %
                 (time.time() - response.meta.get('cur_time'), response.meta))

        proxy.vali_count += 1
        proxy.speed = time.time() - response.meta.get('cur_time')
        if self.success_content_parse(response):
            if table == self.name:
                if proxy.speed > self.timeout:
                    self.sql.del_proxy_with_id(table, proxy.id)
                else:
                    self.sql.update_proxy(table, proxy)
            else:
                if proxy.speed < self.timeout:
                    self.sql.insert_proxy(table_name=self.name, proxy=proxy)
        else:
            if table == self.name:
                self.sql.del_proxy_with_id(table_name=table, id=proxy.id)

        self.sql.commit()

    def success_content_parse(self, response):
        return True

    def error_parse(self, failure):
        request = failure.request
        self.log('error_parse value:%s url:%s meta:%s' %
                 (failure.value, request.url, request.meta))

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')

        if table == self.name:
            self.sql.del_proxy_with_id(table_name=table, id=proxy.id)
        else:
            # TODO: when validation through a proxy fails, handle it according
            # to the specific error type.
            pass

            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # log all errback failures,
            # # in case you want to do something special for some errors,
            # # you may need the failure's type
            # self.logger.error(repr(failure))
            #
            # # if isinstance(failure.value, HttpError):
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            #
            # # elif isinstance(failure.value, DNSLookupError):
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            #
            # # elif isinstance(failure.value, TimeoutError):
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, ip, data):
        filename = '{time} {ip}'.format(
            time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'), ip=ip)

        if self.is_record_web_page:
            with open('%s/%s.html' % (self.dir_log, filename), 'wb') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
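# A sketch of a concrete Scrapy-based validator built on the class above; it
# assumes a subclass only needs to supply the target urls and, optionally, a
# stricter success_content_parse(). The table name, url and check are
# illustrative.
class ExampleBaiduValidator(Validator):
    name = 'baidu'

    def __init__(self, name=None, **kwargs):
        super(ExampleBaiduValidator, self).__init__(name, **kwargs)
        self.urls = ['https://www.baidu.com']
        self.timeout = 5
        self.init()

    def success_content_parse(self, response):
        # treat the proxy as usable only if the page body looks complete
        return 'baidu' in response.body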
class DataProces(object):
    def __init__(self, file, verbose=False):
        self.file = file
        self.verbose = verbose
        self.sql = SqlManager()

        if not self.verbose:
            self._init_log()
        else:
            logging.disable(sys.maxint)

    def _init_log(self):
        # init logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            datefmt='%a, %d %b %Y %H:%M:%S',
            filename='log/export.log',
            filemode='w')

        # define a StreamHandler that writes INFO-or-higher records to stderr
        # and attach it to the root logger
        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        formatter = logging.Formatter(
            '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
        console.setFormatter(formatter)
        logging.getLogger('').addHandler(console)

    def doimport(self):
        infile = open(self.file, 'r')
        lines = infile.readlines()
        total = len(lines)
        index = 0
        for line in lines:
            index = index + 1
            ipport = line.strip().split(':')
            if len(ipport) != 2:
                continue

            ip = ipport[0]
            port = ipport[1]

            proxy = Proxy()
            proxy.set_value(
                ip=ip,
                port=port,
                country='None',
                anonymity='None',
                source='dataproces',
            )
            logging.info('[%d/%d]proxy:%s' % (index, total, proxy.get_dict()))
            self.sql.insert_proxy(config.free_ipproxy_table, proxy)

    def doexport(self):
        out = open(self.file, 'w')
        query = {'httpbin': True}
        proxies = self.sql.db[config.free_ipproxy_table].find(query).sort(
            'httpbin_vali_time', -1)
        for proxy in proxies:
            logging.info(proxy)
            p = "%s:%s\n" % (proxy.get('ip'), proxy['port'])
            out.write(p)
            out.flush()
        out.close()
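# A hedged sketch of driving DataProces from the command line; the --file,
# --export and --verbose flags are assumptions, not the project's actual CLI.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--file', required=True, help='ip:port list, one entry per line')
    parser.add_argument('--export', action='store_true', help='export instead of import')
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    proces = DataProces(args.file, verbose=args.verbose)
    if args.export:
        proces.doexport()
    else:
        proces.doimport()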
def handle_msg(self, cs, data):
    self.__mutex.acquire()
    self.__input_stream.reset_stream(data)
    js = json.loads(self.__input_stream.get_data_bytes())
    conn, c = SqlManager.connect_sql()

    if js["type"] == "login":
        username = js["username"]
        pwd = js["pwd"]
        cursor = c.execute(
            "select * from user where user_name=? and user_pwd=?",
            (username, pwd))
        size = len(cursor.fetchall())
        self.__output_stream.push_char(1)
        if size > 0:
            self.__output_stream.push_string(
                json.dumps({
                    "type": "login",
                    "msg": username,
                    "data": True
                }))
            cs.sendall(self.__output_stream.flush_stream())
        else:
            self.__output_stream.push_string(
                json.dumps({
                    "type": "login",
                    "msg": "account not found",
                    "data": False
                }))
            cs.sendall(self.__output_stream.flush_stream())
    elif js["type"] == "register":
        username = js["username"]
        pwd = js["pwd"]
        cursor = c.execute("select * from user where user_name=?", (username, ))
        exist_account = len(cursor.fetchall())
        self.__output_stream.push_char(2)
        if exist_account > 0:
            self.__output_stream.push_string(
                json.dumps({
                    "type": "register",
                    "msg": "account exist",
                    "data": False
                }))
            cs.sendall(self.__output_stream.flush_stream())
        else:
            c.execute("insert into user (user_name, user_pwd) values (?, ?)",
                      (username, pwd))
            conn.commit()
            self.__output_stream.push_string(
                json.dumps({
                    "type": "register",
                    "msg": username,
                    "data": True
                }))
            cs.sendall(self.__output_stream.flush_stream())
    elif js["type"] == "match":
        name = js["name"]
        room, client_id = self.__battle_room_handler.get_room(name, cs)
        self.__output_stream.push_char(3)
        self.__output_stream.push_string(
            json.dumps({
                "roomId": room.room_id,
                "clientId": client_id
            }))
        cs.sendall(self.__output_stream.flush_stream())

        if self.__battle_room_handler.is_full(room.room_id):
            print "room full", room.room_id, client_id
            self.__output_stream.push_char(6)
            self.__output_stream.push_string(json.dumps({"msg": "matched"}))
            room.broadcast(self.__output_stream.flush_stream())
    elif js["type"] == "ready":
        client_id = js["clientId"]
        room_id = js["roomId"]
        ret = self.__battle_room_handler.ready(room_id, client_id)
        print "ready ", room_id, client_id
        if not ret:
            self.__output_stream.push_char(4)
            self.__output_stream.push_string(
                json.dumps({"error": "Room Not Found"}))
            cs.sendall(self.__output_stream.flush_stream())

        # if all clients are ready, start the game
        if self.__battle_room_handler.is_all_ready(room_id):
            server = self.__get_server()
            addr = server.get_bind_addr()
            self.__output_stream.push_char(5)
            self.__output_stream.push_string(
                json.dumps({
                    "ip": addr[0],
                    "port": addr[1]
                }))
            self.__battle_room_handler.broadcast_room(
                room_id, self.__output_stream.flush_stream())
    else:
        self.__output_stream.push_string(
            json.dumps({
                "type": "unknown",
                "msg": "unknown type",
                "data": False
            }))
        cs.sendall(self.__output_stream.flush_stream())

    conn.close()
    self.__mutex.release()
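# The JSON payloads this handler dispatches on, shown as a hedged sketch; the
# usernames and ids are illustrative, and the byte framing done by
# __input_stream/__output_stream is project-specific and not reproduced here.
import json

login_msg = json.dumps({"type": "login", "username": "alice", "pwd": "secret"})
register_msg = json.dumps({"type": "register", "username": "alice", "pwd": "secret"})
match_msg = json.dumps({"type": "match", "name": "alice"})
ready_msg = json.dumps({"type": "ready", "clientId": 1, "roomId": 1})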
from ipproxytool.spiders.proxy.freeproxylists import FreeProxyListsSpider
from ipproxytool.spiders.proxy.usproxy import UsProxySpider
from ipproxytool.spiders.proxy.proxydb import ProxyDBSpider

scrapydo.setup()

if __name__ == '__main__':
    os.chdir(sys.path[0])

    if not os.path.exists('log'):
        os.makedirs('log')

    logging.basicConfig(filename='log/crawl_proxy.log',
                        format='%(levelname)s %(asctime)s: %(message)s',
                        level=logging.DEBUG)

    sql = SqlManager()

    spiders = [
        XiCiDaiLiSpider,
        SixSixIpSpider,
        IpOneEightOneSpider,
        KuaiDaiLiSpider,  # the site runs a JS check before serving pages (anti-crawling)
        GatherproxySpider,
        # HidemySpider,  # no longer working
        ProxylistplusSpider,
        FreeProxyListsSpider,
        # PeulandSpider,  # target site is no longer available
        UsProxySpider,
        ProxyDBSpider,
    ]

    while True:
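# The script above is cut off right at its crawl loop. A plausible shape for
# that loop, given scrapydo.setup() and the Scrapy spider classes, is sketched
# here; the scrapydo.run_spider call and the sleep interval are assumptions.
import time

import scrapydo

def crawl_forever(spiders):
    while True:
        for spider in spiders:
            scrapydo.run_spider(spider)
        time.sleep(10 * 60)  # pause before the next crawl round (assumed interval)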