class WebSavestart(multiprocessing.Process):
    """Process that prepares and launches the web_save engine for one task.

    The constructor opens the MySQL and Mongo handles and immediately loads
    the task's URL lists via ``read_task_info``.  ``run`` then hands those
    lists to a ``WebSave`` engine and starts the reactor.
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        """Store the callbacks, open the database handles and load the task.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are callables supplied by the caller; they are
        only stored here and invoked later from ``run`` and
        ``update_finished_state``.
        """
        super(WebSavestart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # Initial state, filled in by read_task_info().
        self.task_start_time = ''
        self.user_id = ''
        self.protected_urls = []
        self.counterfeit_urls = []
        self.gray_urls = []
        self.monitor_urls = []
        self.url_num = 0
        self.gary_objectid = ''
        self.file_context = ''
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()

    def _read_url_list(self, table_name, id_string):
        """Return the UTF-8 encoded URLs for the '-'-separated ids.

        ``id_string`` may be ``None`` or empty, in which case no query is
        made.  Ids whose row cannot be found are skipped.
        """
        urls = []
        if id_string is None or id_string == '':
            return urls
        for one_id in id_string.split('-'):
            if not one_id.strip():
                # A stray separator ('1-2-' or '1--2') leaves an empty
                # segment; int('') would raise and abort the constructor.
                continue
            wheres = {'id': [int(one_id), 'd']}
            row = self.mysql_handle.require_get(
                table_name, ['url'], wheres, 'select', 'one')
            if row is False:
                continue
            urls.append(row['url'].encode('utf-8'))
        return urls

    def read_task_info(self):
        """Load the task row, its URL lists and the filtered gray list.

        Populates ``task_start_time``, ``user_id``, the four ``*_urls``
        lists, ``get_gray_iter``, ``gray_url_num`` and ``url_num``.

        NOTE(review): when the task row is missing this calls
        ``os._exit(0)``.  The method runs from ``__init__``, so it exits
        whichever process constructs the object -- confirm that is intended.
        """
        fields = ['last_time', 'user_id', 'protected_id', 'gray_id',
                  'counterfeit_id', 'monitor_id']
        wheres = {'task_id': [self.task_id, 'd']}
        task_info = self.mysql_handle.require_get(
            'task_info', fields, wheres, 'select', 'one')
        if task_info is False:
            sys.stderr.write('%s task no exist, task_id: %s\n' %
                             (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']

        # Same query order as before: protected, counterfeit, gray, monitor.
        self.protected_urls.extend(
            self._read_url_list('protected_list', task_info['protected_id']))
        self.counterfeit_urls.extend(
            self._read_url_list('counterfeit_list',
                                task_info['counterfeit_id']))
        self.gray_urls.extend(
            self._read_url_list('gray_list', task_info['gray_id']))
        self.monitor_urls.extend(
            self._read_url_list('monitor_list', task_info['monitor_id']))

        # Suspected URLs stored in Mongo, referenced from task_result.
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        select_result = self.mysql_handle.require_get(
            'task_result', ['filtrate_objectid'], wheres, 'select', 'one')
        self.get_gray_iter = iter([])
        self.gray_url_num = 0
        if select_result is not False:
            self.gary_objectid = select_result['filtrate_objectid']
            if self.gary_objectid is not None:
                self.gray_url_num = self.mongo_operate.get_gray_num(
                    self.gary_objectid)
                self.gary_objectid = self.mongo_operate.expand_gray_list(
                    self.gary_objectid)
                self.get_gray_iter = self.mongo_operate.get_gray_list(
                    self.gary_objectid)

        self.url_num = (self.gray_url_num + len(self.protected_urls) +
                        len(self.gray_urls) + len(self.counterfeit_urls) +
                        len(self.monitor_urls))

    def update_running_state(self, saved_num, request_num):
        """Write the in-progress save/request counters to ``task_result``."""
        fields = {'web_save_num': [saved_num, 'd'],
                  'web_request_num': [request_num, 'd']}
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        self.mysql_handle.require_post(
            'task_result', fields, wheres, 'update')

    def _store_saved_urls(self, urls, gray_name):
        """Store ``urls`` as a new Mongo gray list and return its id.

        Returns ``''`` without touching Mongo when ``urls`` is empty.
        """
        if not urls:
            return ''
        object_id = self.mongo_operate.create_gray(
            gray_name=gray_name, gray_type='websave', usr_id=self.user_id)
        self.mongo_operate.add_gray_list(urls, object_id)
        return object_id

    def add_saved_ulr_mongo(self, ulist):
        """Split the saved URLs by category and store each group in Mongo.

        ``ulist`` holds ``[url, category]`` pairs; pairs whose category is
        not gray/protected/counterfeit/monitor are ignored.  The resulting
        object ids (or ``''`` for an empty group) are kept on
        ``self.save_*_objectID``.
        """
        saved = {'gray': [], 'protected': [], 'counterfeit': [],
                 'monitor': []}
        for url in ulist:
            bucket = saved.get(url[1])
            if bucket is not None:
                bucket.append(url[0])
        # Store order is unchanged: gray, protected, counterfeit, monitor.
        self.save_gray_objectID = self._store_saved_urls(
            saved['gray'], 'save_gray_urls')
        self.save_protected_objectID = self._store_saved_urls(
            saved['protected'], 'saved_protected_urls')
        self.save_counterfeit_objectID = self._store_saved_urls(
            saved['counterfeit'], 'saved_counterfeit_urls')
        self.save_monitor_objectID = self._store_saved_urls(
            saved['monitor'], 'saved_monitor_urls')

    def update_finished_state(self, ulist, run_time, request_num):
        """Record the final result in MySQL and notify engine ``'00'``.

        ``ulist`` is modified in place (see the note below).  If the
        notification returns ``False`` the task state is set to 0.  The
        process pid is removed in every case.
        """
        # NOTE(review): these three URLs are always injected into the result.
        # It looks like leftover test data, and it makes the ``ulist == []``
        # branch below unreachable.  Kept to preserve behaviour -- confirm
        # with the owner before removing.
        for forced_url in ('http://cpuzt.cc/', 'http://www.138.gg/',
                           'http://www.bjstkc.com/'):
            entry = [forced_url, 'gray']
            if entry not in ulist:
                ulist.append(entry)
        self.add_saved_ulr_mongo(ulist)
        saved_num = len(ulist)
        fields = {
            # Was the Python-2-only octal literal ``03``; same value.
            'e_web_save_state': [3, 'd'],
            'web_save_num': [saved_num, 'd'],
            'web_request_num': [request_num, 'd'],
            'web_save_run_time': [run_time, 's'],
            'save_protected_objectid': [self.save_protected_objectID, 's'],
            'save_counterfeit_objectid': [self.save_counterfeit_objectID,
                                          's'],
            'save_monitor_objectid': [self.save_monitor_objectID, 's'],
            'save_gray_objectid': [self.save_gray_objectID, 's'],
        }
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        self.mysql_handle.require_post(
            'task_result', fields, wheres, 'update')
        # Message 9 when nothing was saved, otherwise message 3.
        message_type = 9 if ulist == [] else 3
        send_result = self.message_other_engine(
            message_type, ['00'], self.task_id)
        if send_result is False:
            # Control engine did not respond: stop the task.
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|web_save engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def run(self):
        """Register the pid, mark the engine state 2 and start downloading."""
        # Write the child process pid to the engine pids.
        self.write_process_pid(self.task_id)
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'web_save', 2)
        engine = WebSave(self.task_id, self.protected_urls,
                         self.get_gray_iter, self.gray_urls,
                         self.counterfeit_urls, self.monitor_urls,
                         self.url_num, self.update_running_state,
                         self.update_finished_state, self.mongo_operate)
        engine.download()
        reactor.run(installSignalHandlers=0)
class WebSavestart(multiprocessing.Process):
    """Process that prepares and launches the web_save engine for one task.

    The constructor opens the MySQL and Mongo handles and immediately loads
    the task's URL lists via ``read_task_info``.  ``run`` then hands those
    lists to a ``WebSave`` engine and starts the reactor.
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        """Store the callbacks, open the database handles and load the task.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are callables supplied by the caller; they are
        only stored here and invoked later from ``run`` and
        ``update_finished_state``.
        """
        super(WebSavestart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # Initial state, filled in by read_task_info().
        self.task_start_time = ''
        self.user_id = ''
        self.protected_urls = []
        self.counterfeit_urls = []
        self.gray_urls = []
        self.monitor_urls = []
        self.url_num = 0
        self.gary_objectid = ''
        self.file_context = ''
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()

    def _read_url_list(self, table_name, id_string):
        """Return the UTF-8 encoded URLs for the '-'-separated ids.

        ``id_string`` may be ``None`` or empty, in which case no query is
        made.  Ids whose row cannot be found are skipped.
        """
        urls = []
        if id_string is None or id_string == '':
            return urls
        for one_id in id_string.split('-'):
            if not one_id.strip():
                # A stray separator ('1-2-' or '1--2') leaves an empty
                # segment; int('') would raise and abort the constructor.
                continue
            wheres = {'id': [int(one_id), 'd']}
            row = self.mysql_handle.require_get(
                table_name, ['url'], wheres, 'select', 'one')
            if row is False:
                continue
            urls.append(row['url'].encode('utf-8'))
        return urls

    def read_task_info(self):
        """Load the task row, its URL lists and the filtered gray list.

        Populates ``task_start_time``, ``user_id``, the four ``*_urls``
        lists, ``get_gray_iter``, ``gray_url_num`` and ``url_num``.

        NOTE(review): when the task row is missing this calls
        ``os._exit(0)``.  The method runs from ``__init__``, so it exits
        whichever process constructs the object -- confirm that is intended.
        """
        fields = ['last_time', 'user_id', 'protected_id', 'gray_id',
                  'counterfeit_id', 'monitor_id']
        wheres = {'task_id': [self.task_id, 'd']}
        task_info = self.mysql_handle.require_get(
            'task_info', fields, wheres, 'select', 'one')
        if task_info is False:
            sys.stderr.write('%s task no exist, task_id: %s\n' %
                             (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']

        # Same query order as before: protected, counterfeit, gray, monitor.
        self.protected_urls.extend(
            self._read_url_list('protected_list', task_info['protected_id']))
        self.counterfeit_urls.extend(
            self._read_url_list('counterfeit_list',
                                task_info['counterfeit_id']))
        self.gray_urls.extend(
            self._read_url_list('gray_list', task_info['gray_id']))
        self.monitor_urls.extend(
            self._read_url_list('monitor_list', task_info['monitor_id']))

        # Suspected URLs stored in Mongo, referenced from task_result.
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        select_result = self.mysql_handle.require_get(
            'task_result', ['filtrate_objectid'], wheres, 'select', 'one')
        self.get_gray_iter = iter([])
        self.gray_url_num = 0
        if select_result is not False:
            self.gary_objectid = select_result['filtrate_objectid']
            if self.gary_objectid is not None:
                self.gray_url_num = self.mongo_operate.get_gray_num(
                    self.gary_objectid)
                self.gary_objectid = self.mongo_operate.expand_gray_list(
                    self.gary_objectid)
                self.get_gray_iter = self.mongo_operate.get_gray_list(
                    self.gary_objectid)

        self.url_num = (self.gray_url_num + len(self.protected_urls) +
                        len(self.gray_urls) + len(self.counterfeit_urls) +
                        len(self.monitor_urls))

    def update_running_state(self, saved_num, request_num):
        """Write the in-progress save/request counters to ``task_result``."""
        fields = {'web_save_num': [saved_num, 'd'],
                  'web_request_num': [request_num, 'd']}
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        self.mysql_handle.require_post(
            'task_result', fields, wheres, 'update')

    def _store_saved_urls(self, urls, gray_name):
        """Store ``urls`` as a new Mongo gray list and return its id.

        Returns ``''`` without touching Mongo when ``urls`` is empty.
        """
        if not urls:
            return ''
        object_id = self.mongo_operate.create_gray(
            gray_name=gray_name, gray_type='websave', usr_id=self.user_id)
        self.mongo_operate.add_gray_list(urls, object_id)
        return object_id

    def add_saved_ulr_mongo(self, ulist):
        """Split the saved URLs by category and store each group in Mongo.

        ``ulist`` holds ``[url, category]`` pairs; pairs whose category is
        not gray/protected/counterfeit/monitor are ignored.  The resulting
        object ids (or ``''`` for an empty group) are kept on
        ``self.save_*_objectID``.
        """
        saved = {'gray': [], 'protected': [], 'counterfeit': [],
                 'monitor': []}
        for url in ulist:
            bucket = saved.get(url[1])
            if bucket is not None:
                bucket.append(url[0])
        # Store order is unchanged: gray, protected, counterfeit, monitor.
        self.save_gray_objectID = self._store_saved_urls(
            saved['gray'], 'save_gray_urls')
        self.save_protected_objectID = self._store_saved_urls(
            saved['protected'], 'saved_protected_urls')
        self.save_counterfeit_objectID = self._store_saved_urls(
            saved['counterfeit'], 'saved_counterfeit_urls')
        self.save_monitor_objectID = self._store_saved_urls(
            saved['monitor'], 'saved_monitor_urls')

    def update_finished_state(self, ulist, run_time, request_num):
        """Record the final result in MySQL and notify engine ``'00'``.

        ``ulist`` is modified in place (see the note below).  If the
        notification returns ``False`` the task state is set to 0.  The
        process pid is removed in every case.
        """
        # NOTE(review): these three URLs are always injected into the result.
        # It looks like leftover test data, and it makes the ``ulist == []``
        # branch below unreachable.  Kept to preserve behaviour -- confirm
        # with the owner before removing.
        for forced_url in ('http://cpuzt.cc/', 'http://www.138.gg/',
                           'http://www.bjstkc.com/'):
            entry = [forced_url, 'gray']
            if entry not in ulist:
                ulist.append(entry)
        self.add_saved_ulr_mongo(ulist)
        saved_num = len(ulist)
        fields = {
            # Was the Python-2-only octal literal ``03``; same value.
            'e_web_save_state': [3, 'd'],
            'web_save_num': [saved_num, 'd'],
            'web_request_num': [request_num, 'd'],
            'web_save_run_time': [run_time, 's'],
            'save_protected_objectid': [self.save_protected_objectID, 's'],
            'save_counterfeit_objectid': [self.save_counterfeit_objectID,
                                          's'],
            'save_monitor_objectid': [self.save_monitor_objectID, 's'],
            'save_gray_objectid': [self.save_gray_objectID, 's'],
        }
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        self.mysql_handle.require_post(
            'task_result', fields, wheres, 'update')
        # Message 9 when nothing was saved, otherwise message 3.
        message_type = 9 if ulist == [] else 3
        send_result = self.message_other_engine(
            message_type, ['00'], self.task_id)
        if send_result is False:
            # Control engine did not respond: stop the task.
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|web_save engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def run(self):
        """Register the pid, mark the engine state 2 and start downloading."""
        # Write the child process pid to the engine pids.
        self.write_process_pid(self.task_id)
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'web_save', 2)
        engine = WebSave(self.task_id, self.protected_urls,
                         self.get_gray_iter, self.gray_urls,
                         self.counterfeit_urls, self.monitor_urls,
                         self.url_num, self.update_running_state,
                         self.update_finished_state, self.mongo_operate)
        engine.download()
        reactor.run(installSignalHandlers=0)
class FiltrateStart(multiprocessing.Process):
    """Process that filters a task's gray URLs against two MySQL lists.

    Each gray URL is looked up in ``trusted_list`` and then in
    ``counterfeit_list``.  URLs found in neither are kept as "suspect".  The
    three groups are stored in Mongo and the counts written to
    ``task_result``.
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        """Store the callbacks, open the database handles and load the task.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are callables supplied by the caller; they are
        only stored here and invoked later.
        """
        super(FiltrateStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # Initial state, filled in by read_task_info() and run().
        self.user_id = ''
        self.gray_urls = []
        self.task_start_time = ''
        self.run_start_time = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()

    def read_task_info(self):
        """Load the task row, its gray URLs and the detected-URL iterator.

        Populates ``task_start_time``, ``user_id``, ``gray_urls`` and
        ``get_gray_iter``.

        NOTE(review): when the task row is missing this calls
        ``os._exit(0)``.  The method runs from ``__init__``, so it exits
        whichever process constructs the object -- confirm that is intended.
        """
        fields = ['last_time', 'user_id', 'gray_id']
        wheres = {'task_id': [self.task_id, 'd']}
        task_info = self.mysql_handle.require_get(
            'task_info', fields, wheres, 'select', 'one')
        if task_info is False:
            sys.stderr.write('%s task no exist, task_id: %s\n' %
                             (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']

        # Gray URLs listed directly on the task ('-'-separated row ids).
        gray_id = task_info['gray_id']
        if gray_id is not None and gray_id != '':
            for once_gray_id in gray_id.split('-'):
                if not once_gray_id.strip():
                    # A stray separator leaves an empty segment; int('')
                    # would raise and abort the constructor.
                    continue
                wheres = {'id': [int(once_gray_id), 'd']}
                select_result = self.mysql_handle.require_get(
                    'gray_list', ['url'], wheres, 'select', 'one')
                if select_result is False:
                    continue
                self.gray_urls.append(select_result['url'].encode('utf-8'))

        # Detected URLs stored in Mongo, referenced from task_result.
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        select_result = self.mysql_handle.require_get(
            'task_result', ['original_grayid'], wheres, 'select', 'one')
        self.get_gray_iter = iter([])
        # require_get returns False when no row matches.  Subscripting that
        # used to raise TypeError; treat it as "no detected URLs" instead.
        if select_result is not False:
            gary_objectid = select_result['original_grayid']
            if gary_objectid is not None and gary_objectid != '':
                gary_objectid = self.mongo_operate.expand_gray_list(
                    gary_objectid)
                self.get_gray_iter = self.mongo_operate.get_gray_list(
                    gary_objectid)

    def update_finish_state(self, trusted_filtrate_num,
                            counterfeit_filtrate_num, filtrate_objectid,
                            filtrate_trusted_objectid,
                            filtrate_counterfeit_objectid):
        """Record the final result in MySQL and notify engine ``'00'``.

        Sends message 2.  If that returns ``False`` the task state is set to
        0.  The process pid is removed in every case.
        """
        run_time = int(time.time() - self.run_start_time)
        fields = {
            # Was the Python-2-only octal literal ``03``; same value.
            'e_filtrate_state': [3, 'd'],
            'filtrate_trusted_num': [trusted_filtrate_num, 'd'],
            'filtrate_counterfeit_num': [counterfeit_filtrate_num, 'd'],
            'filtrate_run_time': [run_time, 's'],
            'filtrate_objectid': [filtrate_objectid, 's'],
            'filtrate_trusted_objectid': [filtrate_trusted_objectid, 's'],
            'filtrate_counterfeit_objectid': [filtrate_counterfeit_objectid,
                                              's'],
        }
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        self.mysql_handle.require_post(
            'task_result', fields, wheres, 'update')
        # Message to control.
        send_result = self.message_other_engine(2, ['00'], self.task_id)
        if send_result is False:
            # Control engine did not respond: stop the task.
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def trusted_select(self, gray_url):
        """Look ``gray_url`` up in ``trusted_list``; return the query result."""
        wheres = {'url': [gray_url, 's']}
        return self.mysql_handle.require_get(
            'trusted_list', ['*'], wheres, 'select', 'one', 0)

    def counterfeit_select(self, gray_url):
        """Look ``gray_url`` up in ``counterfeit_list``; return the result."""
        wheres = {'url': [gray_url, 's']}
        return self.mysql_handle.require_get(
            'counterfeit_list', ['*'], wheres, 'select', 'one', 0)

    def _iter_gray_urls(self):
        """Yield the URLs from ``get_gray_iter``, then drain ``gray_urls``.

        ``gray_urls`` is consumed from its end with ``pop()``, so it is
        empty afterwards and its URLs come out in reverse order.
        """
        # A plain ``for`` works on Python 2 and 3, unlike ``iterator.next()``.
        for gray_url in self.get_gray_iter:
            yield gray_url
        while self.gray_urls:
            yield self.gray_urls.pop()

    def run(self):
        """Classify every gray URL and store the three resulting groups."""
        # Write the child process pid to the engine pids.
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'filtrate', 2)
        suspect_grays = []  # URLs found in neither list
        trusted_grays = []
        counterfeit_grays = []
        for gray_url in self._iter_gray_urls():
            # Check the trusted list first, then the counterfeit list.
            if self.trusted_select(gray_url) is not False:
                trusted_grays.append(gray_url)
            elif self.counterfeit_select(gray_url) is not False:
                counterfeit_grays.append(gray_url)
            else:
                suspect_grays.append(gray_url)

        # Store each group as its own gray list in Mongo.
        filtrate_objectid = self.mongo_operate.create_gray(
            gray_name='suspect_grays', gray_type='filtrate',
            usr_id=self.user_id, task_id=self.task_id)
        self.mongo_operate.add_gray_list(suspect_grays, filtrate_objectid)
        filtrate_trusted_objectid = self.mongo_operate.create_gray(
            gray_name='trusted_grays', gray_type='filtrate',
            usr_id=self.user_id, task_id=self.task_id)
        self.mongo_operate.add_gray_list(
            trusted_grays, filtrate_trusted_objectid)
        filtrate_counterfeit_objectid = self.mongo_operate.create_gray(
            gray_name='counterfeit_grays', gray_type='filtrate',
            usr_id=self.user_id, task_id=self.task_id)
        self.mongo_operate.add_gray_list(
            counterfeit_grays, filtrate_counterfeit_objectid)
        self.update_finish_state(
            len(trusted_grays), len(counterfeit_grays), filtrate_objectid,
            filtrate_trusted_objectid, filtrate_counterfeit_objectid)
class MainControl(ServerBase):
    """Control server that starts engines for a task and chains them.

    ``web_request_start`` / ``web_request_stop`` answer front-end requests.
    The ``*_to_control`` methods answer the "finished" messages sent by the
    individual engines, and either start the next engine(s) or mark the task
    as finished.
    """

    def __init__(self):
        super(MainControl, self).__init__('control')
        self.mysql_handle = MysqlOperate(self.mysql_db, self.mysql_host,
                                         self.mysql_user, self.mysql_password)

    def read_task_info(self, task_id):
        """Return ``(task_type, task_engines)`` for ``task_id``.

        ``task_engines`` is the '-'-separated ``task_engine`` column split
        into a list of engine codes.

        NOTE(review): an unknown ``task_id`` terminates the whole process
        via ``os._exit(0)`` -- confirm that is intended for a server.
        """
        fields = ['task_type', 'task_engine']
        wheres = {'task_id': [task_id, 'd']}
        task_info = self.mysql_handle.require_get(
            'task_info', fields, wheres, 'select', 'one')
        if task_info is False:
            sys.stderr.write('%s task no exist, task_id: %s\n' %
                             (time.ctime(), task_id))
            os._exit(0)
        task_type = task_info['task_type']
        # A NULL task_engine column used to crash on ``None.split``.
        task_engines = (task_info['task_engine'] or '').split('-')
        return task_type, task_engines

    def read_running_engine(self, task_id):
        """Return the engine numbers whose state column is 2.

        Returns ``False`` when the task has no ``task_result`` row.
        """
        task_start_time = self.mysql_handle.get_task_last_time(task_id)
        fields = ['e_domain_state', 'e_search_state', 'e_filtrate_state',
                  'e_web_save_state', 'e_qt_crawler_state',
                  'e_feature_save_state', 'e_whois_search_state',
                  'e_title_state', 'e_structure_state',
                  'e_view_collect_state', 'e_view_emd_state']
        wheres = {'task_id': [task_id, 'd'],
                  'start_time': [task_start_time, 's']}
        select_result = self.mysql_handle.require_get(
            'task_result', fields, wheres, 'select', 'one')
        if select_result is False:
            return False
        running_engine_list = []
        for engine in select_result:
            if select_result[engine] == 2:
                # 'e_<name>_state'[2:-6] -> '<name>', the engine_list key.
                running_engine_list.append(engine_list[engine[2:-6]])
        # Same text as the former ``print`` statement, valid on 2 and 3.
        sys.stdout.write('running_engine_list %s\n' % (running_engine_list,))
        return running_engine_list

    def update_start_state(self, task_id):
        """Set the task state to 2 for the task's latest start time."""
        task_start_time = self.mysql_handle.get_task_last_time(task_id)
        self.mysql_handle.update_task_state(task_id, task_start_time, 2)

    def update_finished_state(self, task_id, task_state=3):
        """Write the final state, run time and stop time to ``task_result``.

        The run time is the number of seconds between the task's
        ``last_time`` (parsed as local time) and now.  Returns whatever
        ``require_post`` returns.
        """
        task_start_time = self.mysql_handle.get_task_last_time(task_id)
        task_start_time_stamp = time.mktime(
            time.strptime(str(task_start_time), "%Y-%m-%d %H:%M:%S"))
        task_stop_time_stamp = time.time()
        task_run_time = task_stop_time_stamp - task_start_time_stamp
        task_stop_time = time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(task_stop_time_stamp))
        fields = {'task_state': [task_state, 'd'],
                  'task_run_time': [task_run_time, 'd'],
                  'task_stop_time': [task_stop_time, 's']}
        wheres = {'task_id': [task_id, 'd'],
                  'start_time': [task_start_time, 's']}
        result = self.mysql_handle.require_post(
            'task_result', fields, wheres, post_type='update')
        sys.stdout.write(
            '%s |*|task win over|*|, task_id: %s, task_state: %s\n' %
            (time.ctime(), task_id, task_state))
        return result

    def check_engine_state(self, task_id, task_type, engines):
        """Return True when every required check engine has state 3.

        Engines '08' (title), '09' (structure) and '10' (view_emd) are
        required when listed in ``engines``, or always for task type 5.
        Returns ``False`` when the ``task_result`` row cannot be read.
        """
        task_start_time = self.mysql_handle.get_task_last_time(task_id)
        fields = ['e_title_state', 'e_structure_state', 'e_view_emd_state']
        wheres = {'task_id': [task_id, 'd'],
                  'start_time': [task_start_time, 's']}
        task_result = self.mysql_handle.require_get(
            'task_result', fields, wheres, 'select', 'one')
        if task_result is False:
            # No row to inspect: subscripting False used to raise TypeError.
            return False
        required = (('08', 'e_title_state'),
                    ('09', 'e_structure_state'),
                    ('10', 'e_view_emd_state'))
        for engine_code, state_field in required:
            if engine_code in engines or task_type == 5:
                if task_result[state_field] != 3:
                    return False
        return True

    # Overridden response handlers.

    def web_request_start(self, task_id):
        """Start the first engine(s) of a task; return the last send result.

        Task types 1 and 3 start whichever of engines '01', '02', '13' are
        configured.  Type 2 starts '04', type 4 starts '05' and type 5
        starts '13'.  Any other type is logged and ``False`` is returned.
        """
        sys.stdout.write('%s control receive task_id start request: %s\n' %
                         (time.ctime(), task_id))
        message_result = False
        task_type, task_engines = self.read_task_info(task_id)
        # 04: filtrate engine, 05: web save engine, 13: whois search engine.
        first_engine = {2: '04', 4: '05', 5: '13'}
        if task_type == 1 or task_type == 3:
            for engine_code in ('01', '02', '13'):
                if engine_code in task_engines:
                    message_result = self.message_other_engine(
                        0, [engine_code], task_id)
                    self.update_start_state(task_id)
        elif task_type in first_engine:
            message_result = self.message_other_engine(
                0, [first_engine[task_type]], task_id)
            self.update_start_state(task_id)
        else:
            # %s instead of %d so a non-numeric task_type cannot raise here;
            # trailing newline added to match every other log line.
            sys.stderr.write(
                '%s task_type error, task_id: %s, task_type: %s\n' %
                (time.ctime(), task_id, task_type))
        return message_result

    def web_request_stop(self, task_id):
        """Stop the task's running engines and mark the task as finished.

        Returns the result of ``update_finished_state`` on success, or
        ``False`` when the engines could not be read or notified.
        """
        sys.stdout.write('%s control receive task_id stop request: %s\n' %
                         (time.ctime(), task_id))
        running_engine_list = self.read_running_engine(task_id)
        if running_engine_list is False:
            # No task_result row: there is no engine list to forward.
            return False
        message_result = self.message_other_engine(
            1, running_engine_list, task_id)
        if message_result is True:
            return self.update_finished_state(task_id)
        return False

    def filtrate_to_control(self, task_id):
        """Message 2: filtrate engine finished; start web_save ('05')."""
        sys.stdout.write(
            '%s control receive from filtrate engine task_id: %s\n' %
            (time.ctime(), task_id))
        self.message_other_engine(0, ['05'], task_id)

    def web_save_to_control(self, task_id):
        """Message 3: web_save finished; start '06' and optionally '08'.

        '06' is the qt_crawler engine.  '08' (title engine) is started when
        configured for the task or when the task type is 5.
        """
        sys.stdout.write(
            '%s control receive from web_save engine task_id: %s\n' %
            (time.ctime(), task_id))
        task_type, task_engines = self.read_task_info(task_id)
        self.message_other_engine(0, ['06'], task_id)
        if '08' in task_engines or task_type == 5:
            self.message_other_engine(0, ['08'], task_id)

    def qt_crawler_to_control(self, task_id):
        """Message 4: qt_crawler finished; start '12', '07', maybe '09'.

        '09' (structure engine) is started when configured for the task or
        when the task type is 5.
        """
        sys.stdout.write(
            '%s control receive from qt_crawler engine task_id: %s\n' %
            (time.ctime(), task_id))
        task_type, task_engines = self.read_task_info(task_id)
        self.message_other_engine(0, ['12'], task_id)
        self.message_other_engine(0, ['07'], task_id)
        if task_type == 5 or '09' in task_engines:
            self.message_other_engine(0, ['09'], task_id)

    def detect_to_control(self, task_id):
        """Message 5: a detect engine (domain, search or whois) finished.

        Task types 3 and 5 continue with the filtrate engine ('04').  Task
        type 1 is complete at this point.
        """
        sys.stdout.write(
            '%s control receive from detect engine task_id: %s\n' %
            (time.ctime(), task_id))
        task_type, task_engines = self.read_task_info(task_id)
        if task_type == 3 or task_type == 5:
            self.message_other_engine(0, ['04'], task_id)
        elif task_type == 1:
            # Task over.
            self.update_finished_state(task_id)

    def check_to_control(self, task_id):
        """Message 6: a check engine (title, structure or view) finished.

        The task is marked finished once all required check engines are
        done.
        """
        task_type, task_engines = self.read_task_info(task_id)
        if self.check_engine_state(task_id, task_type, task_engines) is True:
            self.update_finished_state(task_id)

    def feature_save_to_control(self, task_id):
        """Message 7: feature_save finished; this ends task type 4 only."""
        task_type, task_engines = self.read_task_info(task_id)
        if task_type == 4:
            self.update_finished_state(task_id)

    def engine_failure_to_control(self, task_id):
        """Message 8: an engine failed; finish the task with state 0."""
        self.update_finished_state(task_id, 0)

    def engine_win_over_to_control(self, task_id):
        """Message 9: an engine finished with nothing to start after it."""
        self.update_finished_state(task_id)

    def view_collect_to_control(self, task_id):
        """Message 10: view_collect finished; start view_emd ('10').

        '10' is started for task type 5 or when configured for the task.
        Otherwise a type 4 task is marked finished.
        """
        task_type, task_engines = self.read_task_info(task_id)
        if task_type == 5 or '10' in task_engines:
            self.message_other_engine(0, ['10'], task_id)
        elif task_type == 4:
            self.update_finished_state(task_id)
class FiltrateStart(multiprocessing.Process):
    '''
    Filtrate engine process for a single task.

    Every gray URL of the task is looked up in the ``trusted_list`` and
    ``counterfeit_list`` MySQL tables.  The three resulting groups
    (suspect / trusted / counterfeit) are stored as gray lists in MongoDB,
    the counters are written to ``task_result`` and the control engine is
    notified.
    '''

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        '''
        Open the MySQL / MongoDB handles and load the task description.

        message_other_engine is called as
        ``message_other_engine(message_type, engine_ids, task_id)``;
        write_process_pid and remove_process_pid are called with the task id.
        '''
        super(FiltrateStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # initial state, filled in by read_task_info() and run()
        self.user_id = ''
        self.gray_urls = []
        self.task_start_time = ''
        self.run_start_time = 0
        self.get_gray_iter = iter([])
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()

    def read_task_info(self):
        '''
        Read the task information.

        Loads start time, user id and the gray URLs referenced by
        ``task_info.gray_id`` (ids joined with '-'), then prepares
        ``self.get_gray_iter`` from the MongoDB gray list referenced by
        ``task_result.original_grayid``.  The process exits when the task
        does not exist.
        '''
        task_info = self.mysql_handle.require_get(
            'task_info', ['last_time', 'user_id', 'gray_id'],
            {'task_id': [self.task_id, 'd']}, 'select', 'one')
        if task_info is False:
            sys.stderr.write('%s task no exist, task_id: %s\n' %
                             (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']

        # read gray url
        gray_id = task_info['gray_id']
        if gray_id:
            for once_gray_id in gray_id.split('-'):
                select_result = self.mysql_handle.require_get(
                    'gray_list', ['url'],
                    {'id': [int(once_gray_id), 'd']}, 'select', 'one')
                if select_result is False:
                    continue
                self.gray_urls.append(select_result['url'].encode('utf-8'))

        # read detected url
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        select_result = self.mysql_handle.require_get(
            'task_result', ['original_grayid'], wheres, 'select', 'one')
        # A missing task_result row is reported as False like every other
        # lookup; treat it as "no detected urls" instead of subscripting it.
        gary_objectid = None
        if select_result is not False:
            gary_objectid = select_result['original_grayid']
        if gary_objectid is not None and gary_objectid != '':
            gary_objectid = self.mongo_operate.expand_gray_list(gary_objectid)
            self.get_gray_iter = self.mongo_operate.get_gray_list(
                gary_objectid)
        else:
            self.get_gray_iter = iter([])

    def update_finish_state(self, trusted_filtrate_num,
                            counterfeit_filtrate_num, filtrate_objectid,
                            filtrate_trusted_objectid,
                            filtrate_counterfeit_objectid):
        '''
        task run over, update information in mysql

        Writes the counters, run time and the three gray-list object ids to
        ``task_result``, sends message 2 to the control engine ('00') and
        removes this process from the engine pid registry.
        '''
        run_time = int(time.time() - self.run_start_time)
        fields = {
            'e_filtrate_state': [3, 'd'],
            'filtrate_trusted_num': [trusted_filtrate_num, 'd'],
            'filtrate_counterfeit_num': [counterfeit_filtrate_num, 'd'],
            'filtrate_run_time': [run_time, 's'],
            'filtrate_objectid': [filtrate_objectid, 's'],
            'filtrate_trusted_objectid': [filtrate_trusted_objectid, 's'],
            'filtrate_counterfeit_objectid':
                [filtrate_counterfeit_objectid, 's']
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        self.mysql_handle.require_post('task_result', fields, wheres,
                                       'update')
        # message to control
        send_result = self.message_other_engine(2, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def trusted_select(self, gray_url):
        '''
        Look gray_url up in the trusted list; returns the require_get result.
        '''
        return self.mysql_handle.require_get(
            'trusted_list', ['*'], {'url': [gray_url, 's']},
            'select', 'one', 0)

    def counterfeit_select(self, gray_url):
        '''
        Look gray_url up in the counterfeit list; returns the require_get
        result.
        '''
        return self.mysql_handle.require_get(
            'counterfeit_list', ['*'], {'url': [gray_url, 's']},
            'select', 'one', 0)

    def _store_gray_group(self, gray_name, urls):
        ''' Create a 'filtrate' gray list in mongo, fill it, return its id. '''
        objectid = self.mongo_operate.create_gray(
            gray_name=gray_name, gray_type='filtrate',
            usr_id=self.user_id, task_id=self.task_id)
        self.mongo_operate.add_gray_list(urls, objectid)
        return objectid

    def run(self):
        '''
        Process entry point: classify all gray URLs and store the result.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time,
                                              'filtrate', 2)
        suspect_grays = []  # not filtrate url
        trusted_grays = []
        counterfeit_grays = []
        while True:
            # the mongo iterator is drained first, then the mysql gray urls
            try:
                gray_url = next(self.get_gray_iter)
            except StopIteration:
                if not self.gray_urls:
                    break
                gray_url = self.gray_urls.pop()
            # Compare gray_url with the trusted and counterfeit lists; a hit
            # goes to the matching group, everything else is suspect.
            if self.trusted_select(gray_url) is not False:
                trusted_grays.append(gray_url)
            elif self.counterfeit_select(gray_url) is not False:
                counterfeit_grays.append(gray_url)
            else:
                suspect_grays.append(gray_url)

        # not filtrate url add gray_list in mongo
        filtrate_objectid = self._store_gray_group('suspect_grays',
                                                   suspect_grays)
        filtrate_trusted_objectid = self._store_gray_group('trusted_grays',
                                                           trusted_grays)
        filtrate_counterfeit_objectid = self._store_gray_group(
            'counterfeit_grays', counterfeit_grays)
        self.update_finish_state(len(trusted_grays), len(counterfeit_grays),
                                 filtrate_objectid,
                                 filtrate_trusted_objectid,
                                 filtrate_counterfeit_objectid)
class WhoisReverse(object):
    '''
    Reverse whois lookup.

    For a url, the registered contacts (admin / tech / registrant) of its
    top-level host are read from MySQL, whois.chinaz.com is queried for
    other domains registered with the same e-mail address or name, and the
    found domains are stored in the ``whois_reverse`` table.
    '''

    # Public suffixes recognised by get_top_host().
    _TOP_HOST_POSTFIX = (
        '.com', '.la', '.io', '.co', '.info', '.net', '.org', '.me', '.mobi',
        '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn',
        '.org.cn', '.mx', '.tv', '.ws', '.ag', '.com.ag', '.net.ag',
        '.org.ag', '.am', '.asia', '.at', '.be', '.com.br', '.net.br', '.bz',
        '.com.bz', '.net.bz', '.cc', '.com.co', '.net.co', '.nom.co', '.de',
        '.es', '.com.es', '.nom.es', '.org.es', '.eu', '.fm', '.fr', '.gs',
        '.in', '.co.in', '.firm.in', '.gen.in', '.ind.in', '.net.in',
        '.org.in', '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl', '.nu',
        '.co.nz', '.net.nz', '.org.nz', '.se', '.tc', '.tk', '.tw', '.com.tw',
        '.idv.tw', '.org.tw', '.hk', '.co.uk', '.me.uk', '.org.uk', '.vg',
        '.br')
    # "<label><suffix>" anchored at the end of the host; built once.
    _TOP_HOST_PATTERN = re.compile(
        r'[^\.]+(' + '|'.join(map(re.escape, _TOP_HOST_POSTFIX)) + ')$',
        re.IGNORECASE)

    def __init__(self, mysql_host, mysql_db, mysql_user, mysql_password):
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        # index 0: admin contact, 1: tech contact, 2: registrant contact
        self.email = ['', '', '']
        self.name = ['', '', '']
        self.domain = []

    def get_top_host(self, url):
        '''
        Return the registrable part of the host of url (for example
        ``example.co.uk`` for ``http://shop.example.co.uk/x``).  The plain
        host is returned when it ends in none of the known suffixes.
        '''
        # Only a real scheme prefix counts; a bare host that merely starts
        # with "http" (httpbin.org) still needs one to be parsed as a host.
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        host = urlparse.urlparse(url).netloc
        m = self._TOP_HOST_PATTERN.search(host)
        return m.group() if m else host

    def get_reverse_whois(self, url):
        '''
        Return the domains found by the reverse whois search for url.

        Returns False for an empty url and [] when the domain has no row in
        ``whois_domain``.
        '''
        if not url:
            return False
        self.original_domain = self.get_top_host(url)
        self.contactid = self.mysql_handle.require_get(
            'whois_domain', ['admin', 'tech', 'registrant'],
            {'name': [self.original_domain, 's']},
            get_type='select', fetch_type='one')
        if self.contactid is False:
            return []
        return self.get_source()

    def _contact_name_email(self, contact_id):
        ''' Return (name, email) of a contact, ('', '') when it is unknown. '''
        row = self.mysql_handle.require_get(
            'whois_contacts', ['name', 'email'],
            {'contacts_id': [contact_id, 'd']},
            get_type='select', fetch_type='one')
        if row is False:
            return '', ''
        return row['name'], row['email']

    def get_source(self):
        '''
        Collect the distinct e-mail addresses and names of the contacts in
        self.contactid, search each of them and store what was found.

        Returns self.domain.
        NOTE(review): search() replaces self.domain on every call, so the
        returned list holds the result of the last search only - confirm
        this is intended.
        '''
        admin = self.contactid['admin']
        tech = self.contactid['tech']
        registrant = self.contactid['registrant']

        # start from a clean slate so a reused instance cannot leak the
        # contacts or domains of a previous lookup
        self.email = ['', '', '']
        self.name = ['', '', '']
        self.domain = []

        self.name[0], self.email[0] = self._contact_name_email(admin)
        if tech != admin:
            self.name[1], self.email[1] = self._contact_name_email(tech)
        if registrant != admin and registrant != tech:
            self.name[2], self.email[2] = self._contact_name_email(registrant)

        # blank out repeated values so every source is searched once
        for values in (self.email, self.name):
            if values[2] == values[1] or values[2] == values[0]:
                values[2] = ''
            if values[1] == values[0]:
                values[1] = ''

        # search mode 1: by e-mail, mode 2: by name; empty / NULL is skipped
        for address in self.email:
            if address:
                self.search(address, 1)
                self.write_todb()
        for contact_name in self.name:
            if contact_name:
                self.search(contact_name, 2)
                self.write_todb()
        return self.domain

    def search(self, source, search_mod):
        '''
        Query whois.chinaz.com for source and store the listed domains in
        self.domain.  search_mod 1 searches by e-mail, 2 by name (spaces are
        sent as '+').  Raises ValueError for any other search_mod.
        '''
        self.source = source
        self.search_mod = search_mod
        if search_mod == 2:
            self.source = self.source.replace(' ', '+')
        elif search_mod != 1:
            raise ValueError('unknown search_mod: %r' % (search_mod,))
        target_url = ('http://whois.chinaz.com/reverse?host=' + self.source +
                      '&ddlSearchMode=%d' % search_mod)
        response = urllib2.urlopen(target_url)
        try:
            find = response.read()
        finally:
            response.close()
        if chardet.detect(find)['encoding'] == 'GB2312':
            find = find.decode('gb2312').encode('utf-8')
        page = etree.HTML(find)
        hrefs = page.xpath('//*[@id="detail"]/table/tbody/tr/td[1]/a/@href')
        # each href is "/<domain>"; keep the path without the leading slash
        self.domain = [urlparse.urlparse(href).path[1:] for href in hrefs]

    def write_todb(self):
        '''
        Insert every (domain, original_domain) pair of self.domain that is
        not yet present into the ``whois_reverse`` table.
        '''
        table_name = 'whois_reverse'
        print('whois_reverse domain %s' % (self.domain,))
        for domain in self.domain:
            pair = {
                'domain': [domain, 's'],
                'original_domain': [self.original_domain, 's']
            }
            flag = self.mysql_handle.require_get(
                table_name, ['id'], pair,
                get_type='select', fetch_type='one')
            if flag is False:
                self.mysql_handle.require_post(table_name, dict(pair),
                                               post_type='insert')
class WhoisReverse(object):
    '''
    Reverse whois lookup.

    For a url, the registered contacts (admin / tech / registrant) of its
    top-level host are read from MySQL, whois.chinaz.com is queried for
    other domains registered with the same e-mail address or name, and the
    found domains are stored in the ``whois_reverse`` table.
    '''

    # Public suffixes recognised by get_top_host().
    _TOP_HOST_POSTFIX = (
        '.com', '.la', '.io', '.co', '.info', '.net', '.org', '.me', '.mobi',
        '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn',
        '.org.cn', '.mx', '.tv', '.ws', '.ag', '.com.ag', '.net.ag',
        '.org.ag', '.am', '.asia', '.at', '.be', '.com.br', '.net.br', '.bz',
        '.com.bz', '.net.bz', '.cc', '.com.co', '.net.co', '.nom.co', '.de',
        '.es', '.com.es', '.nom.es', '.org.es', '.eu', '.fm', '.fr', '.gs',
        '.in', '.co.in', '.firm.in', '.gen.in', '.ind.in', '.net.in',
        '.org.in', '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl', '.nu',
        '.co.nz', '.net.nz', '.org.nz', '.se', '.tc', '.tk', '.tw', '.com.tw',
        '.idv.tw', '.org.tw', '.hk', '.co.uk', '.me.uk', '.org.uk', '.vg',
        '.br')
    # "<label><suffix>" anchored at the end of the host; built once.
    _TOP_HOST_PATTERN = re.compile(
        r'[^\.]+(' + '|'.join(map(re.escape, _TOP_HOST_POSTFIX)) + ')$',
        re.IGNORECASE)

    def __init__(self, mysql_host, mysql_db, mysql_user, mysql_password):
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        # index 0: admin contact, 1: tech contact, 2: registrant contact
        self.email = ['', '', '']
        self.name = ['', '', '']
        self.domain = []

    def get_top_host(self, url):
        '''
        Return the registrable part of the host of url (for example
        ``example.co.uk`` for ``http://shop.example.co.uk/x``).  The plain
        host is returned when it ends in none of the known suffixes.
        '''
        # Only a real scheme prefix counts; a bare host that merely starts
        # with "http" (httpbin.org) still needs one to be parsed as a host.
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        host = urlparse.urlparse(url).netloc
        m = self._TOP_HOST_PATTERN.search(host)
        return m.group() if m else host

    def get_reverse_whois(self, url):
        '''
        Return the domains found by the reverse whois search for url.

        Returns False for an empty url and [] when the domain has no row in
        ``whois_domain``.
        '''
        if not url:
            return False
        self.original_domain = self.get_top_host(url)
        self.contactid = self.mysql_handle.require_get(
            'whois_domain', ['admin', 'tech', 'registrant'],
            {'name': [self.original_domain, 's']},
            get_type='select', fetch_type='one')
        if self.contactid is False:
            return []
        return self.get_source()

    def _contact_name_email(self, contact_id):
        ''' Return (name, email) of a contact, ('', '') when it is unknown. '''
        row = self.mysql_handle.require_get(
            'whois_contacts', ['name', 'email'],
            {'contacts_id': [contact_id, 'd']},
            get_type='select', fetch_type='one')
        if row is False:
            return '', ''
        return row['name'], row['email']

    def get_source(self):
        '''
        Collect the distinct e-mail addresses and names of the contacts in
        self.contactid, search each of them and store what was found.

        Returns self.domain.
        NOTE(review): search() replaces self.domain on every call, so the
        returned list holds the result of the last search only - confirm
        this is intended.
        '''
        admin = self.contactid['admin']
        tech = self.contactid['tech']
        registrant = self.contactid['registrant']

        # start from a clean slate so a reused instance cannot leak the
        # contacts or domains of a previous lookup
        self.email = ['', '', '']
        self.name = ['', '', '']
        self.domain = []

        self.name[0], self.email[0] = self._contact_name_email(admin)
        if tech != admin:
            self.name[1], self.email[1] = self._contact_name_email(tech)
        if registrant != admin and registrant != tech:
            self.name[2], self.email[2] = self._contact_name_email(registrant)

        # blank out repeated values so every source is searched once
        for values in (self.email, self.name):
            if values[2] == values[1] or values[2] == values[0]:
                values[2] = ''
            if values[1] == values[0]:
                values[1] = ''

        # search mode 1: by e-mail, mode 2: by name; empty / NULL is skipped
        for address in self.email:
            if address:
                self.search(address, 1)
                self.write_todb()
        for contact_name in self.name:
            if contact_name:
                self.search(contact_name, 2)
                self.write_todb()
        return self.domain

    def search(self, source, search_mod):
        '''
        Query whois.chinaz.com for source and store the listed domains in
        self.domain.  search_mod 1 searches by e-mail, 2 by name (spaces are
        sent as '+').  Raises ValueError for any other search_mod.
        '''
        self.source = source
        self.search_mod = search_mod
        if search_mod == 2:
            self.source = self.source.replace(' ', '+')
        elif search_mod != 1:
            raise ValueError('unknown search_mod: %r' % (search_mod,))
        target_url = ('http://whois.chinaz.com/reverse?host=' + self.source +
                      '&ddlSearchMode=%d' % search_mod)
        response = urllib2.urlopen(target_url)
        try:
            find = response.read()
        finally:
            response.close()
        if chardet.detect(find)['encoding'] == 'GB2312':
            find = find.decode('gb2312').encode('utf-8')
        page = etree.HTML(find)
        hrefs = page.xpath('//*[@id="detail"]/table/tbody/tr/td[1]/a/@href')
        # each href is "/<domain>"; keep the path without the leading slash
        self.domain = [urlparse.urlparse(href).path[1:] for href in hrefs]

    def write_todb(self):
        '''
        Insert every (domain, original_domain) pair of self.domain that is
        not yet present into the ``whois_reverse`` table.
        '''
        table_name = 'whois_reverse'
        print('whois_reverse domain %s' % (self.domain,))
        for domain in self.domain:
            pair = {
                'domain': [domain, 's'],
                'original_domain': [self.original_domain, 's']
            }
            flag = self.mysql_handle.require_get(
                table_name, ['id'], pair,
                get_type='select', fetch_type='one')
            if flag is False:
                self.mysql_handle.require_post(table_name, dict(pair),
                                               post_type='insert')
class _ServerSession(object): def __init__(self, server_ip='127.0.0.1', server_port='1234', server_type='default', server_num=5, mysql_host='127.0.0.1', mysql_user='******', mysql_password='', mysql_db='test', message_len=''): self.server_ip = server_ip # 守护进程的ip self.server_port = server_port # 监听的端口号 self.server_type = server_type # 服务类型 self.server_num = server_num # 能接受链接的服务数量,socket listen 数量 # 服务之间通信协议定义,由short int和int构成,分别代表消息类型和任务ID, self.message_len = message_len self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user, mysql_password) self.try_send_message_num = 3 def register_sever(self): ''' 守护进程(服务)在数据库server_live表中注册信息 ''' current_time = time.strftime( '%Y-%m-%d %H:%M', time.localtime(time.time())) table_name = 'server_live' fields = ['*'] # wait to select fields # select condition wheres={field:[value,field_type]} wheres = { 'ip': [self.server_ip, 's'], 'port': [self.server_port, 's']} result = self.mysql_handle.require_get( table_name, fields, wheres, 'select', 'one', 0) table_name = 'server_live' fields = {'ip': [self.server_ip, 's'], 'port': [self.server_port, 's'], 'type': [self.server_type, 's'], 'status': [1, 'd'], 'time': [current_time, 's']} if result is None: result = self.mysql_handle.require_post( table_name, fields, {}, 'insert') else: table_name_del = 'server_live' wheres_del = { 'ip': [self.server_ip, 's'], 'port': [self.server_port, 's']} self.mysql_handle.require_post( table_name_del, {}, wheres_del, 'delete') result = self.mysql_handle.require_post( table_name, fields, {}, 'insert') if result is True: sys.stdout.write('%s: server register\n' % (time.ctime(),)) def update_sever(self): ''' 执行线程工作,定时更新数据库,记录服务存活 ''' table_name = 'server_live' wheres = { 'ip': [self.server_ip, 's'], 'port': [self.server_port, 's']} while True: time.sleep(60) current_time = time.strftime( '%Y-%m-%d %H:%M', time.localtime(time.time())) fields = {'time': [current_time, 's']} self.mysql_handle.require_post( table_name, fields, wheres, 'update') def 
start_update_state(self): ''' 开启子线程,定期检查服务是否存活 ''' t1 = threading.Thread(target=self.update_sever) t1.start() def over_sever(self): ''' 守护进程(服务)将之前注册在数据库server_live表中信息删除。 ''' table_name = 'server_live' wheres = { 'ip': [self.server_ip, 's'], 'port': [self.server_port, 's']} result = self.mysql_handle.require_post( table_name, {}, wheres, 'delete') if result is True: sys.stdout.write('%s: server logout\n' % (time.ctime(),)) def run_server(self): ''' 运行相应服务,建立socket连接,监听端口 ''' try: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) except socket.error, msg: sys.stderr.write( '%s\n' % SocketError(msg, 'run_server socket create')) sys.exit() # port re run sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) try: sock.bind((self.server_ip, self.server_port)) # 绑定于端口和ip sock.listen(self.server_num) except socket.error, msg: sys.stderr.write('%s' % SocketError(msg, 'run_server bind listen')) sys.stderr.write(' ip: %s port: %s \n' % (self.server_ip, self.server_port)) sys.exit()
class DomainStart(multiprocessing.Process):
    '''
    Domain-change engine process for a single task.

    URLs produced by the URL generator from the protected / counterfeit
    sites and the change rules are requested with getPage; the ones that
    answer are collected, stored as a gray list and reported to the
    control engine.
    '''

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        '''
        Load the task, build the URL generator and open the two progress
        files under /tmp.

        message_other_engine is called as
        ``message_other_engine(message_type, engine_ids, task_id)``;
        write_process_pid and remove_process_pid are called with the task id.
        '''
        super(DomainStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # initial state
        self.user_id = ''
        # Sites waiting for domain mutation: known counterfeit sites and
        # protected sites. Both are mutated the same way, so they share
        # one list.
        self.wait_change_url_list = []
        self.original_host_rules = []
        self.original_top_rules = []
        self.original_path_rules = []
        self.exist_list = []  # urls found to exist
        self.task_start_time = ''
        self.run_start_time = 0
        self.url_create_list = []
        self.protect_url = ''
        self.deferreds = []
        self.read_task_info()
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        # create the domain generator object
        self.url_gen = URLGenerator(self.task_id, self.mongo_operate,
                                    self.update_running_state,
                                    self.wait_change_url_list,
                                    self.original_host_rules,
                                    self.original_top_rules,
                                    self.original_path_rules)
        self.domain_change_url = self.url_gen.URL_Generator()  # the generator
        self.domain_save_path = '/tmp/' + \
            str(task_id) + '_domain_request_urls.txt'
        self.domain_live_path = '/tmp/' + \
            str(task_id) + '_domain_live.txt'
        self.file_request_urls = open(self.domain_save_path, 'w')
        self.file_live_url = open(self.domain_live_path, 'w')

    def read_task_info(self):
        '''
        Read the task information and the rule configuration it refers to.
        The process exits when the task does not exist.
        '''
        fields = [
            'last_time', 'user_id', 'protected_id', 'counterfeit_id',
            'host_rule_id', 'top_rule_id', 'path_rule_id'
        ]
        task_info = self.mysql_handle.require_get(
            'task_info', fields, {'task_id': [self.task_id, 'd']},
            'select', 'one')
        if task_info is False:
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']
        self.read_rule_config(task_info['protected_id'],
                              task_info['counterfeit_id'],
                              task_info['host_rule_id'],
                              task_info['top_rule_id'],
                              task_info['path_rule_id'])

    def _read_column(self, joined_ids, table_name, column):
        '''
        Return `column` of every existing row of `table_name` whose id is
        listed in the '-'-joined string `joined_ids` (None / '' -> []).
        '''
        values = []
        if joined_ids is None or joined_ids == '':
            return values
        for row_id in joined_ids.split('-'):
            row = self.mysql_handle.require_get(
                table_name, [column], {'id': [int(row_id), 'd']},
                'select', 'one')
            if row is False:
                continue
            values.append(row[column])
        return values

    def read_rule_config(self, protected_list_id, counterfeit_list_id,
                         host_rule_id, top_rule_id, path_rule_id):
        '''
        Read the change rules and the protected / counterfeit urls from
        mysql.  Every argument is a '-'-joined id list (or None / '').
        The target lists are extended in place.
        '''
        # protected list
        self.wait_change_url_list.extend(
            self._read_column(protected_list_id, 'protected_list', 'url'))
        # known counterfeit sites waiting for mutation
        self.wait_change_url_list.extend(
            self._read_column(counterfeit_list_id, 'counterfeit_list', 'url'))
        # host change rules: one row holds several rules separated by '|'
        for rules in self._read_column(host_rule_id, 'host_change_rule',
                                       'change_rule'):
            self.original_host_rules.extend(
                str(rule) for rule in rules.split('|'))
        # top-level domain change rules
        self.original_top_rules.extend(
            str(rule) for rule in self._read_column(
                top_rule_id, 'top_change_rule', 'change_rule'))
        # path change rules
        self.original_path_rules.extend(
            str(rule) for rule in self._read_column(
                path_rule_id, 'path_change_rule', 'change_rule'))

    def update_running_state(self,
                             all_change_num=0,
                             all_exist_change_num=0,
                             changed_num=0,
                             gray_exist_num=0,
                             update_type=0):
        '''
        Update the detection state and result in mysql.
        update_type=0: domain change update
        update_type=1: url exist check update

        Every counter defaults to 0 so a caller can pass only the counters
        of the update type it uses.
        '''
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        if update_type == 0:
            fields = {
                'domain_changed_all_num': [all_change_num, 'd'],
                'domain_changed_exist_num': [all_exist_change_num, 'd'],
                'domain_detected_num': [changed_num, 'd']
            }
        elif update_type == 1:
            fields = {'domain_gray_url_num': [gray_exist_num, 'd']}
        else:
            return
        self.mysql_handle.require_post('task_result', fields, wheres,
                                       'update')

    def create_gray_mongo(self, exist_list):
        ''' Store exist_list as a 'domain_change' gray list; return its id. '''
        gray_name = 'NO.' + str(self.task_id) + ' task domian'
        detect_objectID = self.mongo_operate.create_gray(
            gray_name=gray_name,
            gray_type='domain_change',
            usr_id=self.user_id,
            task_id=self.task_id)
        self.mongo_operate.add_gray_list(exist_list, detect_objectID)
        return detect_objectID

    def update_finish_state(self, exist_list, run_time):
        '''
        task run over, update information in mysql

        Sends message 9 to the control engine ('00') when nothing was found,
        otherwise records the gray list and sends message 5.
        '''
        detect_objectID = self.create_gray_mongo(exist_list)
        exist_url_num = len(exist_list)
        # save domain engine result in mysql task_result
        fields = {
            'e_domain_state': [3, 'd'],
            'domain_gray_url_num': [exist_url_num, 'd'],
            'original_grayid': [detect_objectID, 's'],
            'domain_run_time': [run_time, 's']
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        self.mysql_handle.require_post('task_result', fields, wheres,
                                       'update')
        if exist_list == []:
            send_result = self.message_other_engine(9, ['00'], self.task_id)
        else:
            # save gray_list info in mysql
            self.mysql_handle.insert_suspect_list(detect_objectID,
                                                  self.user_id,
                                                  self.task_id,
                                                  'domain_change',
                                                  exist_url_num,
                                                  suspect_type=2)
            self.mysql_handle.insert_gray_list(exist_list,
                                               source='domain_change')
            # message to control
            send_result = self.message_other_engine(5, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def _write_live_url(self, url):
        ''' Overwrite the live file with the last handled url and our pid. '''
        self.file_live_url.seek(0)
        self.file_live_url.truncate(0)
        self.file_live_url.write(url + ' ' + str(self.engine_pid))
        self.file_live_url.flush()

    def pageCallback(self, result, url, protect):
        '''
        getPage success callback: the page exists.

        Pages whose title contains "Redirect" are not counted as existing.
        '''
        global _NUM
        match = re.search(r"<title>(.*?)</title>", result)
        title = match.group(1) if match else 'None'
        if "Redirect" not in title:
            self.exist_list.append(url)
            self.update_running_state(gray_exist_num=len(self.exist_list),
                                      update_type=1)
            self.file_request_urls.write(url + '\n')
            self.file_request_urls.flush()
        self._write_live_url(url)
        _NUM += 1  # this request is done, free its slot
        self.download()

    def finish(self, ign):
        '''
        Called once all deferreds are handled: stop the reactor loop and
        remove the progress files (best effort).
        '''
        try:
            reactor.stop()
            os.remove(self.domain_save_path)
            os.remove(self.domain_live_path)
        except Exception:
            # finish() can fire more than once; a second stop/remove fails
            pass

    def fetch_error(self, error, url, protect):
        '''
        getPage failure callback: the page does not exist.  A user timeout
        is retried, any other error finishes the url.
        '''
        global _NUM
        if error.getErrorMessage().find(
                'User timeout caused connection failure') != -1:
            d = getPage(url)
            d.addCallback(self.pageCallback, url, protect)
            d.addErrback(self.fetch_error, url, protect)
        else:
            self._write_live_url(url)
            _NUM += 1  # this request is done, free its slot
            self.download()

    def download(self):
        '''
        Start requests while free slots (_NUM) remain; when the generator is
        exhausted, arrange for finish() to run after the pending requests.
        '''
        global _NUM
        while _NUM > 0:
            try:
                url = self.url_create_list.pop(0)
            except IndexError:
                # current batch used up: fetch the next one, whose first
                # element is the protected url the batch was derived from
                try:
                    self.url_create_list = []
                    self.url_create_list = next(self.domain_change_url)
                    self.protect_url = self.url_create_list[0]
                    self.url_create_list = self.url_create_list[1:]
                except StopIteration:
                    dl = defer.DeferredList(self.deferreds)
                    dl.addCallback(self.finish)
                    break
                continue
            target = url.encode('utf-8')
            d = getPage(target)
            d.addCallback(self.pageCallback, target, self.protect_url)
            d.addErrback(self.fetch_error, target, self.protect_url)
            _NUM -= 1
            self.deferreds.append(d)

    def run(self):
        '''
        Process entry point.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.engine_pid = os.getpid()
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time, 'domain',
                                              2)
        try:
            self.url_create_list = next(self.domain_change_url)
            self.protect_url = self.url_create_list[0]
            self.url_create_list = self.url_create_list[1:]
        except StopIteration:
            pass
        self.download()
        # start
        reactor.run()
        # finish
        run_time = int(time.time()) - int(self.run_start_time)
        self.update_finish_state(self.exist_list, run_time)
class WhoisSearchStart(multiprocessing.Process):
    '''
    Whois engine process for a single task.

    Runs the whois search and the reverse whois lookup for the task's
    urls, stores the domains found by the reverse lookups as a gray list
    and reports to the control engine.
    '''

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        '''
        Open the MySQL / MongoDB handles and load the task description.

        message_other_engine is called as
        ``message_other_engine(message_type, engine_ids, task_id)``;
        write_process_pid and remove_process_pid are called with the task id.
        '''
        super(WhoisSearchStart, self).__init__()
        self.task_id = task_id
        # kept so the whois helpers can open their own connections
        self.mysql_host = mysql_host
        self.mysql_db = mysql_db
        self.mysql_user = mysql_user
        self.mysql_password = mysql_password
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        # initial state
        self.task_start_time = ''
        self.user_id = ''
        self.whois_search_url = ''
        self.whois_reverse_url = ''
        self.counterfeit_urls = []
        self.task_state = 0
        self.read_task_info()

    def read_task_info(self):
        '''
        Read the task information, including the counterfeit urls listed in
        ``task_info.counterfeit_id`` (ids joined with '-').  The process
        exits when the task does not exist.
        '''
        fields = ['last_time', 'user_id', 'counterfeit_id',
                  'whois_search_url', 'whois_reverse_url']
        task_info = self.mysql_handle.require_get(
            'task_info', fields, {'task_id': [self.task_id, 'd']},
            'select', 'one')
        if task_info is False:
            sys.stderr.write(
                '%s task no exist, task_id: %s\n' %
                (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']
        self.whois_search_url = task_info['whois_search_url']
        self.whois_reverse_url = task_info['whois_reverse_url']
        # get counterfeit url in mysql counterfeit_list
        original_counterfeit_list = task_info['counterfeit_id']
        if original_counterfeit_list:
            for counterfeit_id in original_counterfeit_list.split('-'):
                select_result = self.mysql_handle.require_get(
                    'counterfeit_list', ['url'],
                    {'id': [int(counterfeit_id), 'd']}, 'select', 'one')
                if select_result is False:
                    continue
                self.counterfeit_urls.append(
                    select_result['url'].encode('utf-8'))

    def add_gray_list(self, url_list):
        '''
        Store url_list as a 'whois_reverse' gray list in mongo and record
        it in mysql.  Returns False (and does nothing) for an empty list.
        '''
        if url_list == []:
            return False
        gray_objectid = self.mongo_operate.create_gray(
            gray_name='whois_reverse_gray', gray_type='whois_reverse',
            usr_id=self.user_id, task_id=self.task_id)
        self.mongo_operate.add_gray_list(url_list, gray_objectid)
        fields = {'original_grayid': [gray_objectid, 's']}
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        self.mysql_handle.require_post('task_result', fields, wheres,
                                       'update')
        # save gray_list info in mysql suspect_list
        self.mysql_handle.insert_suspect_list(gray_objectid, self.user_id,
                                              self.task_id, 'whois_reverse',
                                              len(url_list), suspect_type=2)
        self.mysql_handle.insert_gray_list(url_list, source='whois_reverse')

    def update_finish_state(self, new_gray_lsit):
        '''
        Record the engine result in ``task_result`` and notify the control
        engine ('00'): message 9 when no new gray url was found, otherwise
        the gray list is stored and message 5 is sent.
        '''
        run_time = int(time.time() - self.run_start_time)
        fields = {'e_whois_search_state': [3, 'd'],
                  'whois_search_run_time': [run_time, 's']}
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        self.mysql_handle.require_post('task_result', fields, wheres,
                                       'update')
        # message to control
        if new_gray_lsit == []:
            send_result = self.message_other_engine(9, ['00'], self.task_id)
        else:
            self.add_gray_list(new_gray_lsit)
            send_result = self.message_other_engine(5, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write(
            '%s |*|engine win over|*|, task_id: %s\n' %
            (time.ctime(), self.task_id))

    def run_whois_reverse(self, url):
        '''
        Return 'http://<domain>/' for every domain the reverse whois lookup
        finds for url; [] when nothing is found or the lookup fails.
        '''
        whois_reverse = WhoisReverse(self.mysql_host, self.mysql_db,
                                     self.mysql_user, self.mysql_password)
        try:
            reverse_domain_list = whois_reverse.get_reverse_whois(url)
            # the lookup answers False for an empty url
            if not reverse_domain_list:
                return []
            return ['http://' + domian + '/'
                    for domian in reverse_domain_list]
        except Exception:
            # best effort: a failed lookup must not stop the task
            traceback.print_exc()
            return []

    def run_whois_search(self, url):
        '''
        Run the whois query module for url; failures are printed and
        ignored so they do not affect the rest of the task.
        '''
        url_analysis = Urlanalysis(1, self.mysql_host, self.mysql_user,
                                   self.mysql_password, self.mysql_db)
        try:
            url_analysis.getUrllist_list([url])
        except Exception:
            traceback.print_exc()

    def web_save_transfer(self, url):
        ''' Move the saved web data of url from 'gray' to 'counterfeit'. '''
        self.mongo_operate.transfer_web_save(
            url, source_type='gray', goal_type='counterfeit')
        h = WebSavePath()
        source_file_path, target_file_path = h.get_transfer_path(
            url, 'gray', 'counterfeit')
        web_info_transfer(source_file_path, target_file_path)

    def whois_operation(self):
        '''
        Run the configured whois search / reverse lookup, then both for
        every counterfeit url, and finish the task.
        '''
        if self.whois_search_url != '' and self.whois_search_url is not None:
            self.run_whois_search(self.whois_search_url)
        if self.whois_reverse_url != '' and \
                self.whois_reverse_url is not None:
            # NOTE(review): the urls returned here are not added to the new
            # gray list - confirm this is intended.
            self.run_whois_reverse(self.whois_reverse_url)
        new_gray_lsit = []
        while self.counterfeit_urls:
            url = self.counterfeit_urls.pop()
            self.mysql_handle.update_counterfeit_list_statistic(url)
            self.run_whois_search(url)
            new_gray_lsit.extend(self.run_whois_reverse(url))
        self.update_finish_state(new_gray_lsit)

    def run(self):
        '''
        Process entry point.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'whois_search', 2)
        self.whois_operation()
class _ServerSession(object): def __init__(self, server_ip='127.0.0.1', server_port='1234', server_type='default', server_num=5, mysql_host='127.0.0.1', mysql_user='******', mysql_password='', mysql_db='test', message_len=''): self.server_ip = server_ip # 守护进程的ip self.server_port = server_port # 监听的端口号 self.server_type = server_type # 服务类型 self.server_num = server_num # 能接受链接的服务数量,socket listen 数量 # 服务之间通信协议定义,由short int和int构成,分别代表消息类型和任务ID, self.message_len = message_len self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user, mysql_password) self.try_send_message_num = 3 def register_sever(self): ''' 守护进程(服务)在数据库server_live表中注册信息 ''' current_time = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time())) table_name = 'server_live' fields = ['*'] # wait to select fields # select condition wheres={field:[value,field_type]} wheres = {'ip': [self.server_ip, 's'], 'port': [self.server_port, 's']} result = self.mysql_handle.require_get(table_name, fields, wheres, 'select', 'one', 0) table_name = 'server_live' fields = { 'ip': [self.server_ip, 's'], 'port': [self.server_port, 's'], 'type': [self.server_type, 's'], 'status': [1, 'd'], 'time': [current_time, 's'] } if result is None: result = self.mysql_handle.require_post(table_name, fields, {}, 'insert') else: table_name_del = 'server_live' wheres_del = { 'ip': [self.server_ip, 's'], 'port': [self.server_port, 's'] } self.mysql_handle.require_post(table_name_del, {}, wheres_del, 'delete') result = self.mysql_handle.require_post(table_name, fields, {}, 'insert') if result is True: sys.stdout.write('%s: server register\n' % (time.ctime(), )) def update_sever(self): ''' 执行线程工作,定时更新数据库,记录服务存活 ''' table_name = 'server_live' wheres = {'ip': [self.server_ip, 's'], 'port': [self.server_port, 's']} while True: time.sleep(60) current_time = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time())) fields = {'time': [current_time, 's']} self.mysql_handle.require_post(table_name, fields, wheres, 'update') def 
start_update_state(self): ''' 开启子线程,定期检查服务是否存活 ''' t1 = threading.Thread(target=self.update_sever) t1.start() def over_sever(self): ''' 守护进程(服务)将之前注册在数据库server_live表中信息删除。 ''' table_name = 'server_live' wheres = {'ip': [self.server_ip, 's'], 'port': [self.server_port, 's']} result = self.mysql_handle.require_post(table_name, {}, wheres, 'delete') if result is True: sys.stdout.write('%s: server logout\n' % (time.ctime(), )) def run_server(self): ''' 运行相应服务,建立socket连接,监听端口 ''' try: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) except socket.error, msg: sys.stderr.write('%s\n' % SocketError(msg, 'run_server socket create')) sys.exit() # port re run sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) try: sock.bind((self.server_ip, self.server_port)) # 绑定于端口和ip sock.listen(self.server_num) except socket.error, msg: sys.stderr.write('%s' % SocketError(msg, 'run_server bind listen')) sys.stderr.write(' ip: %s port: %s \n' % (self.server_ip, self.server_port)) sys.exit()