class View_Emd_start(multiprocessing.Process):
    """Worker process that runs the view-EMD engine over a task's gray URLs.

    The constructor opens the MySQL/Mongo handles and loads the task's saved
    URL iterators and title dictionaries; ``run()`` feeds every gray URL to
    ``ViewEmd.emdcalculate`` and records progress in ``task_result``.

    NOTE(review): a class with this name appears twice in this source; if both
    copies live in one module the later definition wins.
    """

    def __init__(self, task_id, current_path, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        """Store the callbacks, open the database handles and load task data.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are callables supplied by the caller.
        """
        super(View_Emd_start, self).__init__()
        self.task_id = task_id
        self.task_start_time = ''
        self.user_id = ''
        self.view_protected_objectid = ''
        self.view_gray_objectid = ''
        self.view_counterfeit_objectid = ''
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        # NOTE(review): the ``current_path`` argument is accepted but ignored;
        # sys.path[0] is stored instead. Kept as-is -- confirm intent.
        self.current_path = sys.path[0]
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        self.read_task_info()

    def read_task_info(self):
        """Read the task start time, saved-URL iterators and title dicts."""
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']
        self.protected_title_dict = \
            self.mysql_handle.get_all_protected_feature(
                self.mongo_operate.get_web_title)
        self.counterfeit_title_dict = \
            self.mysql_handle.get_all_counterfeit_feature(
                self.mongo_operate.get_web_title)

    def update_running_state(self, finish_num, view_find_num):
        """Write the in-progress counters to the task's ``task_result`` row.

        finish_num    -- number of gray URLs checked so far
        view_find_num -- running sum of the values returned by emdcalculate
        """
        fields = {
            'view_check_num': [finish_num, 'd'],
            'view_find_num': [view_find_num, 'd'],
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields, wheres, 'update')

    def engine_over_handle(self):
        """Notify the control engine (message code 6) that this engine ended.

        NOTE(review): the original layout of this method was lost; it is
        assumed that only the task-state update is conditional and that the
        pid removal and log line always run -- confirm.
        """
        send_result = self.message_other_engine(6, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def update_finished_state(self, run_time, finish_num):
        """Mark the engine as finished (state 3) and record its run time.

        ``finish_num`` is accepted for interface compatibility but is not
        written anywhere by this method.
        """
        fields = {
            # Plain 3: the Python-2 octal spelling ``03`` has the same value
            # but is a syntax error on Python 3.
            'e_view_emd_state': [3, 'd'],
            'view_emd_run_time': [run_time, 's'],
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields, wheres, 'update')
        self.engine_over_handle()

    def run(self):
        """Process entry point: run emdcalculate on every gray URL.

        NOTE(review): unlike the other engine processes in this source, this
        one neither calls ``write_process_pid`` nor ``update_engine_state``
        before starting -- confirm whether that is intentional.
        """
        finish_num = 0
        view_find_num = 0
        start_time = time.time()
        view_emd = ViewEmd(self.mysql_handle, self.mongo_operate,
                           self.task_id, self.task_start_time,
                           self.protected_title_dict,
                           self.counterfeit_title_dict)
        while True:
            # Only the fetch is guarded, so a StopIteration escaping from the
            # EMD computation is no longer mistaken for "no more URLs".
            try:
                gray_url = next(self.get_gray_iter)
            except StopIteration:
                break
            view_find_flags = view_emd.emdcalculate(gray_url)
            finish_num += 1
            view_find_num += view_find_flags
            self.update_running_state(finish_num, view_find_num)
        run_time = int(time.time()) - int(start_time)
        self.update_finished_state(run_time, finish_num)
class WebSavestart(multiprocessing.Process):
    """Worker process that drives the web-save engine for one task.

    The constructor collects every URL the task has to download (protected,
    counterfeit, gray and monitor lists from MySQL plus the filtered gray list
    from Mongo); ``run()`` hands them to ``WebSave`` and starts the reactor.

    NOTE(review): a class with this name appears twice in this source; if both
    copies live in one module the later definition wins.
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        """Store the callbacks, open the database handles and load task data."""
        super(WebSavestart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # initial state
        self.task_start_time = ''
        self.user_id = ''
        self.protected_urls = []
        self.counterfeit_urls = []
        self.gray_urls = []
        self.monitor_urls = []
        self.url_num = 0
        self.gary_objectid = ''
        self.file_context = ''
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()

    def _read_list_urls(self, table_name, id_string):
        """Return the UTF-8 encoded ``url`` of each row id in ``id_string``.

        ``id_string`` is a '-'-separated list of integer row ids (or
        None/''); ids whose lookup returns False are skipped.
        """
        urls = []
        if id_string is None or id_string == '':
            return urls
        for row_id in id_string.split('-'):
            wheres = {'id': [int(row_id), 'd']}
            select_result = self.mysql_handle.require_get(
                table_name, ['url'], wheres, 'select', 'one')
            if select_result is False:
                continue
            urls.append(select_result['url'].encode('utf-8'))
        return urls

    def read_task_info(self):
        """Read the task row and build every URL source used by ``run()``.

        Terminates the process via ``os._exit(0)`` when the task row is
        missing.
        """
        fields = ['last_time', 'user_id', 'protected_id', 'gray_id',
                  'counterfeit_id', 'monitor_id']
        wheres = {'task_id': [self.task_id, 'd']}
        task_info = self.mysql_handle.require_get('task_info', fields, wheres,
                                                  'select', 'one')
        if task_info is False:
            sys.stderr.write('%s task no exist, task_id: %s\n' %
                             (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']

        # Same query order as before: protected, counterfeit, gray, monitor.
        self.protected_urls.extend(
            self._read_list_urls('protected_list', task_info['protected_id']))
        self.counterfeit_urls.extend(
            self._read_list_urls('counterfeit_list',
                                 task_info['counterfeit_id']))
        self.gray_urls.extend(
            self._read_list_urls('gray_list', task_info['gray_id']))
        self.monitor_urls.extend(
            self._read_list_urls('monitor_list', task_info['monitor_id']))

        # get suspected url
        self.get_gray_iter = iter([])
        self.gray_url_num = 0
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }
        select_result = self.mysql_handle.require_get(
            'task_result', ['filtrate_objectid'], wheres, 'select', 'one')
        if select_result is not False:
            self.gary_objectid = select_result['filtrate_objectid']
            if self.gary_objectid is not None:
                # The count is taken before the id is replaced by the
                # expanded one, as in the original code.
                self.gray_url_num = self.mongo_operate.get_gray_num(
                    self.gary_objectid)
                self.gary_objectid = self.mongo_operate.expand_gray_list(
                    self.gary_objectid)
                self.get_gray_iter = self.mongo_operate.get_gray_list(
                    self.gary_objectid)
        self.url_num = (self.gray_url_num + len(self.protected_urls) +
                        len(self.gray_urls) + len(self.counterfeit_urls) +
                        len(self.monitor_urls))

    def update_running_state(self, saved_num, request_num):
        """Write the in-progress save/request counters to ``task_result``."""
        fields = {
            'web_save_num': [saved_num, 'd'],
            'web_request_num': [request_num, 'd'],
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields, wheres, 'update')

    def _store_saved_urls(self, urls, gray_name):
        """Store ``urls`` as a new 'websave' list; return its id or ''."""
        if not urls:
            return ''
        object_id = self.mongo_operate.create_gray(
            gray_name=gray_name, gray_type='websave', usr_id=self.user_id)
        self.mongo_operate.add_gray_list(urls, object_id)
        return object_id

    def add_saved_ulr_mongo(self, ulist):
        """Store the saved URLs in Mongo, one list per URL category.

        ``ulist`` holds ``[url, category]`` pairs; entries whose category is
        not gray/protected/counterfeit/monitor are ignored. The resulting
        object ids ('' for an empty category) are kept on
        ``self.save_*_objectID``.
        """
        grouped = {'gray': [], 'protected': [], 'counterfeit': [],
                   'monitor': []}
        for url in ulist:
            bucket = grouped.get(url[1])
            if bucket is not None:
                bucket.append(url[0])
        # Creation order (gray, protected, counterfeit, monitor) and the
        # list names are unchanged.
        self.save_gray_objectID = self._store_saved_urls(
            grouped['gray'], 'save_gray_urls')
        self.save_protected_objectID = self._store_saved_urls(
            grouped['protected'], 'saved_protected_urls')
        self.save_counterfeit_objectID = self._store_saved_urls(
            grouped['counterfeit'], 'saved_counterfeit_urls')
        self.save_monitor_objectID = self._store_saved_urls(
            grouped['monitor'], 'saved_monitor_urls')

    def update_finished_state(self, ulist, run_time, request_num):
        """Persist the final web-save result and notify the control engine.

        ``ulist`` is the list of ``[url, category]`` pairs that were saved;
        it is modified in place (see the note below).
        """
        # NOTE(review): three hard-coded gray URLs are always appended to the
        # caller's list. As a consequence ``ulist`` is never empty below, so
        # the message-code-9 branch cannot be reached. Behaviour is kept
        # unchanged -- confirm whether these entries are leftover test data.
        for forced_url in ('http://cpuzt.cc/', 'http://www.138.gg/',
                           'http://www.bjstkc.com/'):
            entry = [forced_url, 'gray']
            if entry not in ulist:
                ulist.append(entry)
        self.add_saved_ulr_mongo(ulist)
        saved_num = len(ulist)
        fields = {
            # Plain 3: the Python-2 octal spelling ``03`` has the same value
            # but is a syntax error on Python 3.
            'e_web_save_state': [3, 'd'],
            'web_save_num': [saved_num, 'd'],
            'web_request_num': [request_num, 'd'],
            'web_save_run_time': [run_time, 's'],
            'save_protected_objectid': [self.save_protected_objectID, 's'],
            'save_counterfeit_objectid': [self.save_counterfeit_objectID, 's'],
            'save_monitor_objectid': [self.save_monitor_objectID, 's'],
            'save_gray_objectid': [self.save_gray_objectID, 's'],
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields, wheres, 'update')
        if ulist == []:
            send_result = self.message_other_engine(9, ['00'], self.task_id)
        else:
            send_result = self.message_other_engine(3, ['00'], self.task_id)
        # NOTE(review): original layout was lost; it is assumed that only the
        # task-state update is conditional -- confirm.
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|web_save engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def run(self):
        """Process entry point: register the pid and start the download."""
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time,
                                              'web_save', 2)
        engine = WebSave(self.task_id, self.protected_urls,
                         self.get_gray_iter, self.gray_urls,
                         self.counterfeit_urls, self.monitor_urls,
                         self.url_num, self.update_running_state,
                         self.update_finished_state, self.mongo_operate)
        engine.download()
        reactor.run(installSignalHandlers=0)
class FeatureSaveStart(multiprocessing.Process):
    """Worker process that stores web features for a task's saved URLs.

    The constructor opens the MySQL/Mongo handles and loads the four
    saved-URL iterators; ``run()`` inserts a feature row for every URL and
    records progress in ``task_result``.
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        """Store the callbacks, open the database handles and load task data."""
        super(FeatureSaveStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # initial state
        self.task_start_time = ''
        self.run_start_time = 0
        self.save_num = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']

    def update_running_state(self, save_num):
        """Write the in-progress saved-feature count to ``task_result``."""
        fields = {'feature_save_num': [save_num, 'd']}
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields, wheres, 'update')

    def update_finish_state(self, save_num):
        """Mark the engine as finished (state 3) with its count and run time."""
        run_time = int(time.time() - self.run_start_time)
        fields = {
            # Plain 3: the Python-2 octal spelling ``03`` has the same value
            # but is a syntax error on Python 3.
            'e_feature_save_state': [3, 'd'],
            'feature_save_num': [save_num, 'd'],
            'feature_save_run_time': [run_time, 's'],
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields, wheres, 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        """Notify the control engine (message code 7) that this engine ended.

        NOTE(review): the original layout of this method was lost; it is
        assumed that only the task-state update is conditional and that the
        pid removal and log line always run -- confirm.
        """
        send_result = self.message_other_engine(7, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def save_web_feature(self):
        """Insert a feature row for every saved URL, one category at a time.

        The iterators are drained in the order protected, gray, counterfeit,
        monitor; each URL goes to the ``<category>_feature`` table and bumps
        ``self.save_num``.
        """
        # Draining the iterators one after another visits the URLs in the
        # same order as the former nested try/except chain, assuming an
        # exhausted iterator stays exhausted (the iterator protocol contract).
        sources = (
            ('protected', self.get_protected_iter),
            ('gray', self.get_gray_iter),
            ('counterfeit', self.get_counterfeit_iter),
            ('monitor', self.get_monitor_iter),
        )
        for url_type, url_iter in sources:
            table_name = url_type + '_feature'
            while True:
                try:
                    url = next(url_iter)
                except StopIteration:
                    break
                self.mysql_handle.insert_web_feature(url, url_type, table_name,
                                                     update_sign=True)
                self.save_num += 1
                self.update_running_state(self.save_num)

    def run(self):
        """Process entry point: register the pid, save features, finish."""
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time,
                                              'feature_save', 2)
        self.save_web_feature()
        self.update_finish_state(self.save_num)
class View_Emd_start(multiprocessing.Process):
    """Worker process that runs the view-EMD engine over a task's gray URLs.

    The constructor opens the MySQL/Mongo handles and loads the task's saved
    URL iterators and title dictionaries; ``run()`` feeds every gray URL to
    ``ViewEmd.emdcalculate`` and records progress in ``task_result``.

    NOTE(review): a class with this name appears twice in this source; if both
    copies live in one module the later definition wins.
    """

    def __init__(self, task_id, current_path, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        """Store the callbacks, open the database handles and load task data.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are callables supplied by the caller.
        """
        super(View_Emd_start, self).__init__()
        self.task_id = task_id
        self.task_start_time = ''
        self.user_id = ''
        self.view_protected_objectid = ''
        self.view_gray_objectid = ''
        self.view_counterfeit_objectid = ''
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        # NOTE(review): the ``current_path`` argument is accepted but ignored;
        # sys.path[0] is stored instead. Kept as-is -- confirm intent.
        self.current_path = sys.path[0]
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        self.read_task_info()

    def read_task_info(self):
        """Read the task start time, saved-URL iterators and title dicts."""
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']
        self.protected_title_dict = \
            self.mysql_handle.get_all_protected_feature(
                self.mongo_operate.get_web_title)
        self.counterfeit_title_dict = \
            self.mysql_handle.get_all_counterfeit_feature(
                self.mongo_operate.get_web_title)

    def update_running_state(self, finish_num, view_find_num):
        """Write the in-progress counters to the task's ``task_result`` row.

        finish_num    -- number of gray URLs checked so far
        view_find_num -- running sum of the values returned by emdcalculate
        """
        fields = {
            'view_check_num': [finish_num, 'd'],
            'view_find_num': [view_find_num, 'd'],
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields, wheres, 'update')

    def engine_over_handle(self):
        """Notify the control engine (message code 6) that this engine ended.

        NOTE(review): the original layout of this method was lost; it is
        assumed that only the task-state update is conditional and that the
        pid removal and log line always run -- confirm.
        """
        send_result = self.message_other_engine(6, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def update_finished_state(self, run_time, finish_num):
        """Mark the engine as finished (state 3) and record its run time.

        ``finish_num`` is accepted for interface compatibility but is not
        written anywhere by this method.
        """
        fields = {
            # Plain 3: the Python-2 octal spelling ``03`` has the same value
            # but is a syntax error on Python 3.
            'e_view_emd_state': [3, 'd'],
            'view_emd_run_time': [run_time, 's'],
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields, wheres, 'update')
        self.engine_over_handle()

    def run(self):
        """Process entry point: run emdcalculate on every gray URL.

        NOTE(review): unlike the other engine processes in this source, this
        one neither calls ``write_process_pid`` nor ``update_engine_state``
        before starting -- confirm whether that is intentional.
        """
        finish_num = 0
        view_find_num = 0
        start_time = time.time()
        view_emd = ViewEmd(self.mysql_handle, self.mongo_operate,
                           self.task_id, self.task_start_time,
                           self.protected_title_dict,
                           self.counterfeit_title_dict)
        while True:
            # Only the fetch is guarded, so a StopIteration escaping from the
            # EMD computation is no longer mistaken for "no more URLs".
            try:
                gray_url = next(self.get_gray_iter)
            except StopIteration:
                break
            view_find_flags = view_emd.emdcalculate(gray_url)
            finish_num += 1
            view_find_num += view_find_flags
            self.update_running_state(finish_num, view_find_num)
        run_time = int(time.time()) - int(start_time)
        self.update_finished_state(run_time, finish_num)
class WebSavestart(multiprocessing.Process):
    """Worker process that drives the web-save engine for one task.

    The constructor collects every URL the task has to download (protected,
    counterfeit, gray and monitor lists from MySQL plus the filtered gray list
    from Mongo); ``run()`` hands them to ``WebSave`` and starts the reactor.

    NOTE(review): a class with this name appears twice in this source; if both
    copies live in one module the later definition wins.
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        """Store the callbacks, open the database handles and load task data."""
        super(WebSavestart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # initial state
        self.task_start_time = ''
        self.user_id = ''
        self.protected_urls = []
        self.counterfeit_urls = []
        self.gray_urls = []
        self.monitor_urls = []
        self.url_num = 0
        self.gary_objectid = ''
        self.file_context = ''
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()

    def _read_list_urls(self, table_name, id_string):
        """Return the UTF-8 encoded ``url`` of each row id in ``id_string``.

        ``id_string`` is a '-'-separated list of integer row ids (or
        None/''); ids whose lookup returns False are skipped.
        """
        urls = []
        if id_string is None or id_string == '':
            return urls
        for row_id in id_string.split('-'):
            wheres = {'id': [int(row_id), 'd']}
            select_result = self.mysql_handle.require_get(
                table_name, ['url'], wheres, 'select', 'one')
            if select_result is False:
                continue
            urls.append(select_result['url'].encode('utf-8'))
        return urls

    def read_task_info(self):
        """Read the task row and build every URL source used by ``run()``.

        Terminates the process via ``os._exit(0)`` when the task row is
        missing.
        """
        fields = ['last_time', 'user_id', 'protected_id', 'gray_id',
                  'counterfeit_id', 'monitor_id']
        wheres = {'task_id': [self.task_id, 'd']}
        task_info = self.mysql_handle.require_get('task_info', fields, wheres,
                                                  'select', 'one')
        if task_info is False:
            sys.stderr.write('%s task no exist, task_id: %s\n' %
                             (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']

        # Same query order as before: protected, counterfeit, gray, monitor.
        self.protected_urls.extend(
            self._read_list_urls('protected_list', task_info['protected_id']))
        self.counterfeit_urls.extend(
            self._read_list_urls('counterfeit_list',
                                 task_info['counterfeit_id']))
        self.gray_urls.extend(
            self._read_list_urls('gray_list', task_info['gray_id']))
        self.monitor_urls.extend(
            self._read_list_urls('monitor_list', task_info['monitor_id']))

        # get suspected url
        self.get_gray_iter = iter([])
        self.gray_url_num = 0
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }
        select_result = self.mysql_handle.require_get(
            'task_result', ['filtrate_objectid'], wheres, 'select', 'one')
        if select_result is not False:
            self.gary_objectid = select_result['filtrate_objectid']
            if self.gary_objectid is not None:
                # The count is taken before the id is replaced by the
                # expanded one, as in the original code.
                self.gray_url_num = self.mongo_operate.get_gray_num(
                    self.gary_objectid)
                self.gary_objectid = self.mongo_operate.expand_gray_list(
                    self.gary_objectid)
                self.get_gray_iter = self.mongo_operate.get_gray_list(
                    self.gary_objectid)
        self.url_num = (self.gray_url_num + len(self.protected_urls) +
                        len(self.gray_urls) + len(self.counterfeit_urls) +
                        len(self.monitor_urls))

    def update_running_state(self, saved_num, request_num):
        """Write the in-progress save/request counters to ``task_result``."""
        fields = {
            'web_save_num': [saved_num, 'd'],
            'web_request_num': [request_num, 'd'],
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields, wheres, 'update')

    def _store_saved_urls(self, urls, gray_name):
        """Store ``urls`` as a new 'websave' list; return its id or ''."""
        if not urls:
            return ''
        object_id = self.mongo_operate.create_gray(
            gray_name=gray_name, gray_type='websave', usr_id=self.user_id)
        self.mongo_operate.add_gray_list(urls, object_id)
        return object_id

    def add_saved_ulr_mongo(self, ulist):
        """Store the saved URLs in Mongo, one list per URL category.

        ``ulist`` holds ``[url, category]`` pairs; entries whose category is
        not gray/protected/counterfeit/monitor are ignored. The resulting
        object ids ('' for an empty category) are kept on
        ``self.save_*_objectID``.
        """
        grouped = {'gray': [], 'protected': [], 'counterfeit': [],
                   'monitor': []}
        for url in ulist:
            bucket = grouped.get(url[1])
            if bucket is not None:
                bucket.append(url[0])
        # Creation order (gray, protected, counterfeit, monitor) and the
        # list names are unchanged.
        self.save_gray_objectID = self._store_saved_urls(
            grouped['gray'], 'save_gray_urls')
        self.save_protected_objectID = self._store_saved_urls(
            grouped['protected'], 'saved_protected_urls')
        self.save_counterfeit_objectID = self._store_saved_urls(
            grouped['counterfeit'], 'saved_counterfeit_urls')
        self.save_monitor_objectID = self._store_saved_urls(
            grouped['monitor'], 'saved_monitor_urls')

    def update_finished_state(self, ulist, run_time, request_num):
        """Persist the final web-save result and notify the control engine.

        ``ulist`` is the list of ``[url, category]`` pairs that were saved;
        it is modified in place (see the note below).
        """
        # NOTE(review): three hard-coded gray URLs are always appended to the
        # caller's list. As a consequence ``ulist`` is never empty below, so
        # the message-code-9 branch cannot be reached. Behaviour is kept
        # unchanged -- confirm whether these entries are leftover test data.
        for forced_url in ('http://cpuzt.cc/', 'http://www.138.gg/',
                           'http://www.bjstkc.com/'):
            entry = [forced_url, 'gray']
            if entry not in ulist:
                ulist.append(entry)
        self.add_saved_ulr_mongo(ulist)
        saved_num = len(ulist)
        fields = {
            # Plain 3: the Python-2 octal spelling ``03`` has the same value
            # but is a syntax error on Python 3.
            'e_web_save_state': [3, 'd'],
            'web_save_num': [saved_num, 'd'],
            'web_request_num': [request_num, 'd'],
            'web_save_run_time': [run_time, 's'],
            'save_protected_objectid': [self.save_protected_objectID, 's'],
            'save_counterfeit_objectid': [self.save_counterfeit_objectID, 's'],
            'save_monitor_objectid': [self.save_monitor_objectID, 's'],
            'save_gray_objectid': [self.save_gray_objectID, 's'],
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields, wheres, 'update')
        if ulist == []:
            send_result = self.message_other_engine(9, ['00'], self.task_id)
        else:
            send_result = self.message_other_engine(3, ['00'], self.task_id)
        # NOTE(review): original layout was lost; it is assumed that only the
        # task-state update is conditional -- confirm.
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|web_save engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def run(self):
        """Process entry point: register the pid and start the download."""
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time,
                                              'web_save', 2)
        engine = WebSave(self.task_id, self.protected_urls,
                         self.get_gray_iter, self.gray_urls,
                         self.counterfeit_urls, self.monitor_urls,
                         self.url_num, self.update_running_state,
                         self.update_finished_state, self.mongo_operate)
        engine.download()
        reactor.run(installSignalHandlers=0)
class FiltrateStart(multiprocessing.Process):
    """Worker process that filters a task's gray URLs against known lists.

    Every gray URL is looked up in ``trusted_list`` and then
    ``counterfeit_list``; matches are counted and the URLs are written to
    three Mongo lists (suspect, trusted, counterfeit).
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        """Store the callbacks, open the database handles and load task data."""
        super(FiltrateStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # initial state
        self.user_id = ''
        self.gray_urls = []
        self.task_start_time = ''
        self.run_start_time = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()

    def read_task_info(self):
        """Read the task row, the user's gray URLs and the detected gray list.

        Terminates the process via ``os._exit(0)`` when the task row is
        missing. A missing ``task_result`` row is treated as "no detected
        gray list" instead of raising.
        """
        fields = ['last_time', 'user_id', 'gray_id']
        wheres = {'task_id': [self.task_id, 'd']}
        task_info = self.mysql_handle.require_get('task_info', fields, wheres,
                                                  'select', 'one')
        if task_info is False:
            sys.stderr.write('%s task no exist, task_id: %s\n' %
                             (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']
        gray_id = task_info['gray_id']

        # read gray url
        if gray_id is not None and gray_id != '':
            for once_gray_id in gray_id.split('-'):
                wheres = {'id': [int(once_gray_id), 'd']}
                select_result = self.mysql_handle.require_get(
                    'gray_list', ['url'], wheres, 'select', 'one')
                if select_result is False:
                    continue
                self.gray_urls.append(select_result['url'].encode('utf-8'))

        # read detected url
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }
        select_result = self.mysql_handle.require_get(
            'task_result', ['original_grayid'], wheres, 'select', 'one')
        # require_get returns False when nothing is found; subscripting that
        # used to raise TypeError here.
        gary_objectid = None
        if select_result is not False:
            gary_objectid = select_result['original_grayid']
        if gary_objectid is not None and gary_objectid != '':
            gary_objectid = self.mongo_operate.expand_gray_list(gary_objectid)
            self.get_gray_iter = self.mongo_operate.get_gray_list(
                gary_objectid)
        else:
            self.get_gray_iter = iter([])

    def update_finish_state(self, trusted_filtrate_num,
                            counterfeit_filtrate_num, filtrate_objectid,
                            filtrate_trusted_objectid,
                            filtrate_counterfeit_objectid):
        """Task run over: store the result in MySQL and message the control.

        NOTE(review): the original layout of the tail of this method was
        lost; it is assumed that only the task-state update is conditional
        and that the pid removal and log line always run -- confirm.
        """
        run_time = int(time.time() - self.run_start_time)
        fields = {
            # Plain 3: the Python-2 octal spelling ``03`` has the same value
            # but is a syntax error on Python 3.
            'e_filtrate_state': [3, 'd'],
            'filtrate_trusted_num': [trusted_filtrate_num, 'd'],
            'filtrate_counterfeit_num': [counterfeit_filtrate_num, 'd'],
            'filtrate_run_time': [run_time, 's'],
            'filtrate_objectid': [filtrate_objectid, 's'],
            'filtrate_trusted_objectid': [filtrate_trusted_objectid, 's'],
            'filtrate_counterfeit_objectid':
                [filtrate_counterfeit_objectid, 's'],
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields, wheres, 'update')
        # message to control
        send_result = self.message_other_engine(2, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def trusted_select(self, gray_url):
        """Look ``gray_url`` up in ``trusted_list``; return the query result."""
        wheres = {'url': [gray_url, 's']}
        return self.mysql_handle.require_get('trusted_list', ['*'], wheres,
                                             'select', 'one', 0)

    def counterfeit_select(self, gray_url):
        """Look ``gray_url`` up in ``counterfeit_list``; return the result."""
        wheres = {'url': [gray_url, 's']}
        return self.mysql_handle.require_get('counterfeit_list', ['*'], wheres,
                                             'select', 'one', 0)

    def run(self):
        """Process entry point: classify every gray URL and store the result."""
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time,
                                              'filtrate', 2)
        trusted_filtrate_num = 0
        counterfeit_filtrate_num = 0
        suspect_grays = []  # not filtrate url
        trusted_grays = []
        counterfeit_grays = []
        while True:
            # Detected URLs first; once they run out, the user's gray URLs
            # are consumed from the end of self.gray_urls.
            try:
                gray_url = next(self.get_gray_iter)
            except StopIteration:
                try:
                    gray_url = self.gray_urls.pop()
                except IndexError:
                    break
            # Compare gray_url against the trusted and counterfeit lists; a
            # hit bumps the matching counter, anything else is a suspect.
            if self.trusted_select(gray_url) is not False:
                trusted_filtrate_num += 1
                trusted_grays.append(gray_url)
                continue
            if self.counterfeit_select(gray_url) is not False:
                counterfeit_filtrate_num += 1
                counterfeit_grays.append(gray_url)
                continue
            suspect_grays.append(gray_url)

        # not filtrate url add gray_list in mongo
        filtrate_objectid = self.mongo_operate.create_gray(
            gray_name='suspect_grays', gray_type='filtrate',
            usr_id=self.user_id, task_id=self.task_id)
        self.mongo_operate.add_gray_list(suspect_grays, filtrate_objectid)
        filtrate_trusted_objectid = self.mongo_operate.create_gray(
            gray_name='trusted_grays', gray_type='filtrate',
            usr_id=self.user_id, task_id=self.task_id)
        self.mongo_operate.add_gray_list(trusted_grays,
                                         filtrate_trusted_objectid)
        filtrate_counterfeit_objectid = self.mongo_operate.create_gray(
            gray_name='counterfeit_grays', gray_type='filtrate',
            usr_id=self.user_id, task_id=self.task_id)
        self.mongo_operate.add_gray_list(counterfeit_grays,
                                         filtrate_counterfeit_objectid)
        self.update_finish_state(trusted_filtrate_num,
                                 counterfeit_filtrate_num, filtrate_objectid,
                                 filtrate_trusted_objectid,
                                 filtrate_counterfeit_objectid)
class MainControl(ServerBase):
    """Control engine of the detection pipeline.

    Handles the start/stop requests for a task and the "finished" messages
    sent by the other engines. It reacts either by messaging the next
    engine(s) through ``self.message_other_engine`` or by recording the end
    of the task in the ``task_result`` table.

    NOTE(review): ``message_other_engine`` and the ``mysql_*`` settings are
    presumably provided by ``ServerBase``; ``engine_list`` is a module-level
    mapping defined outside this class -- confirm both.
    """

    def __init__(self):
        super(MainControl, self).__init__('control')
        self.mysql_handle = MysqlOperate(self.mysql_db, self.mysql_host,
                                         self.mysql_user, self.mysql_password)

    def read_task_info(self, task_id):
        """Read the task type and the engines configured for ``task_id``.

        Returns a ``(task_type, task_engines)`` tuple where ``task_engines``
        is the ``task_engine`` column split on ``'-'``.

        NOTE(review): a missing task terminates the whole process through
        ``os._exit(0)``; confirm that is intended for the control engine.
        """
        table_name = 'task_info'
        fields = ['task_type', 'task_engine']
        wheres = {'task_id': [task_id, 'd']}
        task_info = self.mysql_handle.require_get(
            table_name, fields, wheres, 'select', 'one')
        if task_info is False:
            sys.stderr.write(
                '%s task no exist, task_id: %s\n' % (time.ctime(), task_id))
            os._exit(0)
        task_type = task_info['task_type']
        task_engines = task_info['task_engine'].split('-')
        return task_type, task_engines

    def read_running_engine(self, task_id):
        """Return the engine numbers whose state column equals 2.

        Returns ``False`` when the task has no ``task_result`` row for its
        last start time.
        """
        task_start_time = self.mysql_handle.get_task_last_time(task_id)
        table_name = 'task_result'
        fields = ['e_domain_state', 'e_search_state', 'e_filtrate_state',
                  'e_web_save_state', 'e_qt_crawler_state',
                  'e_feature_save_state', 'e_whois_search_state',
                  'e_title_state', 'e_structure_state',
                  'e_view_collect_state', 'e_view_emd_state']
        wheres = {'task_id': [task_id, 'd'],
                  'start_time': [task_start_time, 's']}
        select_result = self.mysql_handle.require_get(
            table_name, fields, wheres, 'select', 'one')
        if select_result is False:
            return False
        # A column is named 'e_<engine>_state'; [2:-6] strips the 'e_' prefix
        # and the '_state' suffix to get the key used in engine_list.
        running_engine_list = [engine_list[field[2:-6]]
                               for field in select_result
                               if select_result[field] == 2]
        print('running_engine_list %s' % (running_engine_list,))
        return running_engine_list

    def update_start_state(self, task_id):
        """Set the task state to 2 for the task's last start time."""
        task_start_time = self.mysql_handle.get_task_last_time(task_id)
        self.mysql_handle.update_task_state(task_id, task_start_time, 2)

    def update_finished_state(self, task_id, task_state=3):
        """Record task_state, task_run_time and task_stop_time in task_result.

        ``task_state`` defaults to 3; ``engine_failure_to_control`` passes 0.
        Returns whatever ``require_post`` returns.
        """
        task_start_time = self.mysql_handle.get_task_last_time(task_id)
        # The run time is measured from the stored start time, which is
        # parsed from its 'YYYY-mm-dd HH:MM:SS' string form.
        started_at = time.mktime(
            time.strptime(str(task_start_time), "%Y-%m-%d %H:%M:%S"))
        stopped_at = time.time()
        task_run_time = stopped_at - started_at
        task_stop_time = time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(stopped_at))
        table_name = 'task_result'
        fields = {'task_state': [task_state, 'd'],
                  'task_run_time': [task_run_time, 'd'],
                  'task_stop_time': [task_stop_time, 's']}
        wheres = {'task_id': [task_id, 'd'],
                  'start_time': [task_start_time, 's']}
        result = self.mysql_handle.require_post(
            table_name, fields, wheres, post_type='update')
        sys.stdout.write(
            '%s |*|task win over|*|, task_id: %s, task_state: %s\n' %
            (time.ctime(), task_id, task_state))
        return result

    def check_engine_state(self, task_id, task_type, engines):
        """Tell whether every required detection engine has finished.

        An engine is required when its code ('08', '09', '10') is listed in
        ``engines`` or when ``task_type`` is 5; it counts as finished when
        its state column equals 3. Returns ``False`` when the task has no
        ``task_result`` row, since completion cannot be confirmed then.
        """
        task_start_time = self.mysql_handle.get_task_last_time(task_id)
        table_name = 'task_result'
        fields = ['e_title_state', 'e_structure_state', 'e_view_emd_state']
        wheres = {'task_id': [task_id, 'd'],
                  'start_time': [task_start_time, 's']}
        task_result = self.mysql_handle.require_get(
            table_name, fields, wheres, 'select', 'one')
        if task_result is False:
            # require_get signals "no row" with False; indexing it would
            # raise TypeError.
            return False
        required_states = (('08', 'e_title_state'),
                           ('09', 'e_structure_state'),
                           ('10', 'e_view_emd_state'))
        for engine_code, state_field in required_states:
            if engine_code in engines or task_type == 5:
                if task_result[state_field] != 3:
                    return False
        return True

    # Overridden response handlers.

    def web_request_start(self, task_id):
        """Handle a task start request by messaging the first engine(s).

        Returns the result of the last ``message_other_engine`` call, or
        ``False`` when no engine was messaged.
        """
        sys.stdout.write('%s control receive task_id start request: %s\n' %
                         (time.ctime(), task_id))
        message_result = False
        task_type, task_engines = self.read_task_info(task_id)
        # Task types that always begin with one fixed engine:
        # 04 filtrate, 05 web save, 13 whois search.
        single_first_engine = {2: '04', 4: '05', 5: '13'}
        if task_type == 1 or task_type == 3:
            for engine_code in ('01', '02', '13'):
                if engine_code in task_engines:
                    message_result = self.message_other_engine(
                        0, [engine_code], task_id)
                    self.update_start_state(task_id)
        elif task_type in single_first_engine:
            message_result = self.message_other_engine(
                0, [single_first_engine[task_type]], task_id)
            self.update_start_state(task_id)
        else:
            # %s instead of %d: a non-integer task_type must be reported,
            # not raise a formatting TypeError.
            sys.stderr.write(
                '%s task_type error, task_id: %s, task_type: %s\n' %
                (time.ctime(), task_id, task_type))
        return message_result

    def web_request_stop(self, task_id):
        """Handle a task stop request from the front-end client.

        Messages every running engine and, when that succeeds, records the
        task as finished. Returns ``False`` when the engines could not be
        messaged or when the task has no ``task_result`` row.
        """
        sys.stdout.write('%s control receive task_id stop request: %s\n' %
                         (time.ctime(), task_id))
        running_engine_list = self.read_running_engine(task_id)
        if running_engine_list is False:
            # No result row: there is no engine list to forward.
            sys.stderr.write(
                '%s no task_result row, cannot stop task_id: %s\n' %
                (time.ctime(), task_id))
            return False
        message_result = self.message_other_engine(
            1, running_engine_list, task_id)
        if message_result is True:
            return self.update_finished_state(task_id)
        return False

    def filtrate_to_control(self, task_id):
        """Message 2: the filtrate engine finished; start web_save (05)."""
        sys.stdout.write(
            '%s control receive from filtrate engine task_id: %s\n' %
            (time.ctime(), task_id))
        self.message_other_engine(0, ['05'], task_id)

    def web_save_to_control(self, task_id):
        """Message 3: the web_save engine finished.

        Starts qt_crawler (06) and, when the task uses it or is of type 5,
        the title engine (08).
        """
        sys.stdout.write(
            '%s control receive from web_save engine task_id: %s\n' %
            (time.ctime(), task_id))
        task_type, task_engines = self.read_task_info(task_id)
        self.message_other_engine(0, ['06'], task_id)
        if '08' in task_engines or task_type == 5:
            self.message_other_engine(0, ['08'], task_id)

    def qt_crawler_to_control(self, task_id):
        """Message 4: the qt_crawler engine finished.

        Starts engines 12 and 07 and, when the task uses it or is of type 5,
        the structure engine (09).
        """
        sys.stdout.write(
            '%s control receive from qt_crawler engine task_id: %s\n' %
            (time.ctime(), task_id))
        task_type, task_engines = self.read_task_info(task_id)
        self.message_other_engine(0, ['12'], task_id)
        self.message_other_engine(0, ['07'], task_id)
        if task_type == 5 or '09' in task_engines:
            self.message_other_engine(0, ['09'], task_id)

    def detect_to_control(self, task_id):
        """Message 5: a detect engine (domain, search or whois) finished.

        Task types 3 and 5 continue with the filtrate engine (04); task
        type 1 is over.
        """
        sys.stdout.write(
            '%s control receive from detect engine task_id: %s\n' %
            (time.ctime(), task_id))
        task_type, task_engines = self.read_task_info(task_id)
        if task_type == 3 or task_type == 5:
            self.message_other_engine(0, ['04'], task_id)
        elif task_type == 1:
            self.update_finished_state(task_id)

    def check_to_control(self, task_id):
        """Message 6: a check engine (title, structure or view) finished.

        The task is finished once every required check engine is done.
        """
        task_type, task_engines = self.read_task_info(task_id)
        if self.check_engine_state(task_id, task_type, task_engines) is True:
            self.update_finished_state(task_id)

    def feature_save_to_control(self, task_id):
        """Message 7: feature_save finished; it ends tasks of type 4 only."""
        task_type, task_engines = self.read_task_info(task_id)
        if task_type == 4:
            self.update_finished_state(task_id)

    def engine_failure_to_control(self, task_id):
        """Message 8: an engine failed; finish the task with state 0."""
        self.update_finished_state(task_id, 0)

    def engine_win_over_to_control(self, task_id):
        """Message 9: an engine with no successor finished; end the task."""
        self.update_finished_state(task_id)

    def view_collect_to_control(self, task_id):
        """Message 10: view_collect finished.

        Starts view_emd (10) when the task uses it or is of type 5;
        otherwise a task of type 4 is over.
        """
        task_type, task_engines = self.read_task_info(task_id)
        if task_type == 5 or '10' in task_engines:
            self.message_other_engine(0, ['10'], task_id)
        elif task_type == 4:
            self.update_finished_state(task_id)
class FiltrateStart(multiprocessing.Process):
    """Filtrate engine process.

    Looks every gray URL of a task up in the MySQL ``trusted_list`` and
    ``counterfeit_list`` tables and stores the URLs as three gray lists in
    mongo: trusted, counterfeit and suspect (found in neither table).
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        super(FiltrateStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # Initial state, filled in by read_task_info().
        self.user_id = ''
        self.gray_urls = []
        self.task_start_time = ''
        self.run_start_time = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()

    def read_task_info(self):
        """Load the task row and the gray URLs to filter.

        Fills ``task_start_time``, ``user_id``, ``gray_urls`` (URLs named by
        the task's ``gray_id`` column) and ``get_gray_iter`` (URLs of the
        mongo gray list named by ``task_result.original_grayid``, or an
        empty iterator when there is none). Exits the process when the task
        does not exist.
        """
        table_name = 'task_info'
        fields = ['last_time', 'user_id', 'gray_id']
        wheres = {'task_id': [self.task_id, 'd']}
        task_info = self.mysql_handle.require_get(table_name, fields, wheres,
                                                  'select', 'one')
        if task_info is False:
            sys.stderr.write('%s task no exist, task_id: %s\n' %
                             (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']
        gray_id = task_info['gray_id']

        # Gray URLs selected explicitly for the task: ids joined by '-'.
        if gray_id is not None and gray_id != '':
            table_name = 'gray_list'
            fields = ['url']
            for once_gray_id in gray_id.split('-'):
                wheres = {'id': [int(once_gray_id), 'd']}
                select_result = self.mysql_handle.require_get(
                    table_name, fields, wheres, 'select', 'one')
                if select_result is False:
                    continue
                self.gray_urls.append(select_result['url'].encode('utf-8'))

        # Gray URLs produced by an earlier detect engine of the same run.
        table_name = 'task_result'
        fields = ['original_grayid']
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        select_result = self.mysql_handle.require_get(table_name, fields,
                                                      wheres, 'select', 'one')
        # require_get signals "no row" with False; indexing it would raise
        # TypeError, so a missing row is treated as "no detected URLs".
        if select_result is False:
            gary_objectid = None
        else:
            gary_objectid = select_result['original_grayid']
        if gary_objectid is not None and gary_objectid != '':
            gary_objectid = self.mongo_operate.expand_gray_list(gary_objectid)
            self.get_gray_iter = self.mongo_operate.get_gray_list(
                gary_objectid)
        else:
            self.get_gray_iter = iter([])

    def update_finish_state(self, trusted_filtrate_num,
                            counterfeit_filtrate_num, filtrate_objectid,
                            filtrate_trusted_objectid,
                            filtrate_counterfeit_objectid):
        """Record the finished run in mysql and notify the control engine.

        Sets ``e_filtrate_state`` to 3 together with the counters, the run
        time and the three mongo object ids, then sends message 2 to engine
        '00'. When that returns ``False`` the task state is set to 0.
        """
        run_time = int(time.time() - self.run_start_time)
        table_name = 'task_result'
        fields = {
            'e_filtrate_state': [3, 'd'],
            'filtrate_trusted_num': [trusted_filtrate_num, 'd'],
            'filtrate_counterfeit_num': [counterfeit_filtrate_num, 'd'],
            'filtrate_run_time': [run_time, 's'],
            'filtrate_objectid': [filtrate_objectid, 's'],
            'filtrate_trusted_objectid': [filtrate_trusted_objectid, 's'],
            'filtrate_counterfeit_objectid':
            [filtrate_counterfeit_objectid, 's']
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        self.mysql_handle.require_post(table_name, fields, wheres, 'update')
        # message to control
        send_result = self.message_other_engine(2, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def trusted_select(self, gray_url):
        """Look ``gray_url`` up in the trusted list; ``False`` if absent."""
        wheres = {'url': [gray_url, 's']}
        return self.mysql_handle.require_get('trusted_list', ['*'], wheres,
                                             'select', 'one', 0)

    def counterfeit_select(self, gray_url):
        """Look ``gray_url`` up in the counterfeit list; ``False`` if absent."""
        wheres = {'url': [gray_url, 's']}
        return self.mysql_handle.require_get('counterfeit_list', ['*'], wheres,
                                             'select', 'one', 0)

    def _store_gray_list(self, gray_name, urls):
        """Create a 'filtrate' gray list in mongo, fill it, return its id."""
        objectid = self.mongo_operate.create_gray(
            gray_name=gray_name,
            gray_type='filtrate',
            usr_id=self.user_id,
            task_id=self.task_id)
        self.mongo_operate.add_gray_list(urls, objectid)
        return objectid

    def run(self):
        """Classify every gray URL, store the three lists, report the end."""
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time,
                                              'filtrate', 2)
        suspect_grays = []  # URLs found in neither list
        trusted_grays = []
        counterfeit_grays = []
        while True:
            # Drain the detected URLs first, then the explicitly chosen ones.
            try:
                gray_url = next(self.get_gray_iter)
            except StopIteration:
                try:
                    gray_url = self.gray_urls.pop()
                except IndexError:
                    break
            # The trusted list takes precedence over the counterfeit list.
            if self.trusted_select(gray_url) is not False:
                trusted_grays.append(gray_url)
            elif self.counterfeit_select(gray_url) is not False:
                counterfeit_grays.append(gray_url)
            else:
                suspect_grays.append(gray_url)

        filtrate_objectid = self._store_gray_list('suspect_grays',
                                                  suspect_grays)
        filtrate_trusted_objectid = self._store_gray_list('trusted_grays',
                                                          trusted_grays)
        filtrate_counterfeit_objectid = self._store_gray_list(
            'counterfeit_grays', counterfeit_grays)
        self.update_finish_state(len(trusted_grays), len(counterfeit_grays),
                                 filtrate_objectid, filtrate_trusted_objectid,
                                 filtrate_counterfeit_objectid)
class Title_start(multiprocessing.Process):
    """Title engine process.

    Runs every gray URL of the task through a ``TitleMain`` checker built on
    the protected features, then runs the same URLs through a second checker
    built on the counterfeit features, keeping ``task_result`` updated.

    NOTE(review): unlike the other engine processes in this file, ``run``
    never calls ``mysql_handle.update_engine_state`` -- confirm the running
    state is set elsewhere.
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        super(Title_start, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.task_start_time = ''
        self.user_id = ''
        self.gary_objectid = ''
        self.protected_list_id = []
        self.get_protect_dict = {}
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()
        self.run_start_time = 0
        self.title_check_num = 0  # number of checks performed
        self.title_find_num = 0  # number of phishing URLs detected
        # mysql is updated after this many checks
        self.once_update_num = 1

    def read_task_info(self):
        """Load the task start time, URL iterators and feature dicts."""
        mysql = self.mysql_handle
        mongo = self.mongo_operate
        self.task_start_time = mysql.get_task_last_time(self.task_id)
        saved_urls_iters = mysql.read_saved_urls(self.task_id, mongo)
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']
        self.protected_title_dict = mysql.get_all_protected_feature(
            mongo.get_web_title)
        self.protected_text_dict = mysql.get_all_protected_feature(
            mongo.get_web_text)
        self.counterfeit_title_dict = mysql.get_all_counterfeit_feature(
            mongo.get_web_title)
        self.counterfeit_text_dict = mysql.get_all_counterfeit_feature(
            mongo.get_web_text)

    def _task_result_wheres(self):
        """Build the WHERE clause selecting this run's task_result row."""
        return {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }

    def update_running_state(self, title_check_num, title_find_num):
        """Write the in-progress check/find counters to mysql."""
        counters = {
            'title_check_num': [title_check_num, 'd'],
            'title_find_num': [title_find_num, 'd']
        }
        self.mysql_handle.require_post('task_result', counters,
                                       self._task_result_wheres(), 'update')

    def update_finished_state(self):
        """Write the final state (3), run time and counters, then notify."""
        run_time = int(time.time()) - int(self.run_start_time)
        final_fields = {
            'e_title_state': [3, 'd'],
            'title_run_time': [run_time, 's'],
            'title_check_num': [self.title_check_num, 'd'],
            'title_find_num': [self.title_find_num, 'd']
        }
        self.mysql_handle.require_post('task_result', final_fields,
                                       self._task_result_wheres(), 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        """Send message 6 to the control engine and drop this process' pid.

        When the control engine does not answer, the task state is set to 0.
        """
        answered = self.message_other_engine(6, ['00'], self.task_id)
        if answered is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))
        self.remove_process_pid(self.task_id)

    def _check_one(self, checker, gray_url, update_count):
        """Check one URL, bump the counters, return the new update count."""
        check_result = checker.title_run(gray_url)
        self.title_find_num += check_result
        self.title_check_num += 1
        update_count += 1
        if update_count >= self.once_update_num:
            update_count = 0
            self.update_running_state(self.title_check_num,
                                      self.title_find_num)
        return update_count

    def run(self):
        """Check all gray URLs against protected, then counterfeit features."""
        self.run_start_time = time.time()
        self.write_process_pid(self.task_id)
        sys.stdout.write('%s |*|title engine start|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

        protected_checker = TitleMain(
            self.task_id, self.task_start_time, self.protected_title_dict,
            self.protected_text_dict, self.mongo_operate, self.mysql_handle)
        pending_updates = 0
        checked_urls = []  # replayed below for the counterfeit pass
        while True:
            # The whole step stays inside the try, as in the original: a
            # StopIteration raised while checking also ends the pass.
            try:
                gray_url = next(self.get_gray_iter)
                checked_urls.append(gray_url)
                pending_updates = self._check_one(protected_checker, gray_url,
                                                  pending_updates)
            except StopIteration:
                break

        counterfeit_checker = TitleMain(
            self.task_id, self.task_start_time, self.counterfeit_title_dict,
            self.counterfeit_text_dict, self.mongo_operate, self.mysql_handle,
            'counterfeit')
        while True:
            # Same scope rule: an IndexError raised while checking ends the
            # pass exactly like an empty list does.
            try:
                gray_url = checked_urls.pop()
                pending_updates = self._check_one(counterfeit_checker,
                                                  gray_url, pending_updates)
            except IndexError:
                break
        self.update_finished_state()
class QtCrawler(multiprocessing.Process):
    """Qt crawler engine process.

    Hands the task's saved URL iterators to a ``Browser`` running inside a
    ``QApplication`` and reports progress and completion through mysql and
    the control engine.
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        super(QtCrawler, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # Initial state.
        self.run_start_time = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']

    def update_running_state(self, crawler_num):
        """Write the number of crawled pages to mysql while running."""
        table_name = 'task_result'
        fields = {'qt_crawler_num': [crawler_num, 'd']}
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        self.mysql_handle.require_post(table_name, fields, wheres, 'update')

    def update_finish_state(self, crawler_num, run_time):
        """Write the final state (3), count and run time, then notify."""
        table_name = 'task_result'
        fields = {
            'e_qt_crawler_state': [3, 'd'],
            'qt_crawler_num': [crawler_num, 'd'],
            'qt_crawler_run_time': [run_time, 's']
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        self.mysql_handle.require_post(table_name, fields, wheres, 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        """Send message 4 to the control engine and drop this process' pid.

        When the control engine does not answer, the task state is set to 0.
        """
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))
        send_result = self.message_other_engine(4, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        # self.page_shot()

    def _next_shot_url(self):
        """Return the next ``(url, url_type)`` to shoot, or ``None``.

        The iterators are drained in the order protected, gray, counterfeit,
        monitor.
        """
        sources = (('protected', self.get_protected_iter),
                   ('gray', self.get_gray_iter),
                   ('counterfeit', self.get_counterfeit_iter),
                   ('monitor', self.get_monitor_iter))
        for url_type, url_iter in sources:
            try:
                return next(url_iter), url_type
            except StopIteration:
                continue
        return None

    def page_shot(self):
        """Take a 'webpage' screenshot of every saved page lacking one.

        For each URL whose save directory has ``main.html`` but no
        ``webpage.jpeg``, a ``CallPageShot`` is started and this method
        waits for the ``shot_over_sign`` file it leaves, then removes it.
        """
        # read_crawler_config is not defined on this class (its call in
        # run() is commented out), so calling it unconditionally raised
        # AttributeError; only invoke it when a subclass provides it.
        read_config = getattr(self, 'read_crawler_config', None)
        if callable(read_config):
            read_config()
        img_type = 'webpage'  # image name: webpage.jpeg
        while True:
            next_item = self._next_shot_url()
            if next_item is None:
                break
            url, url_type = next_item
            print('shot:  %s' % (url,))
            web_save_path = WebSavePath()
            local_html, local_time = web_save_path.get_html_path_abs(
                url, url_type)
            if local_time is None:
                sys.stderr.write('%s insert_web_info, web not be saved: %s\n' %
                                 (time.ctime(), url))
                continue
            webpage_path = local_time + '/webpage.jpeg'
            if os.path.exists(webpage_path):
                continue
            main_html_path = local_time + '/main.html'
            if not os.path.exists(main_html_path):
                sys.stderr.write(
                    '%s insert_web_info, main.html not be exist: %s\n' %
                    (time.ctime(), url))
                continue
            call_page_shot = CallPageShot(main_html_path, local_time,
                                          img_type)
            call_page_shot.start()
            # NOTE(review): this wait has no timeout; it blocks forever if
            # the sign file is never written.
            shot_sign_path = local_time + '/shot_over_sign'
            while not os.path.exists(shot_sign_path):
                time.sleep(0.5)
            os.remove(shot_sign_path)
            print('shot over')

    def run(self):
        """Mark the engine as running and start the Qt browser loop."""
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time,
                                              'qt_crawler', 2)
        self.app = QApplication(sys.argv)
        self.br = Browser(self.task_id, self.get_protected_iter,
                          self.get_gray_iter, self.get_counterfeit_iter,
                          self.get_monitor_iter, self.mongo_operate,
                          self.update_running_state, self.update_finish_state,
                          self.mysql_handle, self.run_start_time)
        # self.br.showMaximized()  # show web
        # self.br.show()
        sys.exit(self.app.exec_())
class QtCrawler(multiprocessing.Process):
    """Qt crawler engine process.

    Hands the task's saved URL iterators to a ``Browser`` running inside a
    ``QApplication`` and reports progress and completion through mysql and
    the control engine.
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        super(QtCrawler, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # Initial state.
        self.run_start_time = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']

    def update_running_state(self, crawler_num):
        """Write the number of crawled pages to mysql while running."""
        table_name = 'task_result'
        fields = {'qt_crawler_num': [crawler_num, 'd']}
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        self.mysql_handle.require_post(table_name, fields, wheres, 'update')

    def update_finish_state(self, crawler_num, run_time):
        """Write the final state (3), count and run time, then notify."""
        table_name = 'task_result'
        fields = {
            'e_qt_crawler_state': [3, 'd'],
            'qt_crawler_num': [crawler_num, 'd'],
            'qt_crawler_run_time': [run_time, 's']
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        self.mysql_handle.require_post(table_name, fields, wheres, 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        """Send message 4 to the control engine and drop this process' pid.

        When the control engine does not answer, the task state is set to 0.
        """
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))
        send_result = self.message_other_engine(4, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        # self.page_shot()

    def _next_shot_url(self):
        """Return the next ``(url, url_type)`` to shoot, or ``None``.

        The iterators are drained in the order protected, gray, counterfeit,
        monitor.
        """
        sources = (('protected', self.get_protected_iter),
                   ('gray', self.get_gray_iter),
                   ('counterfeit', self.get_counterfeit_iter),
                   ('monitor', self.get_monitor_iter))
        for url_type, url_iter in sources:
            try:
                return next(url_iter), url_type
            except StopIteration:
                continue
        return None

    def page_shot(self):
        """Take a 'webpage' screenshot of every saved page lacking one.

        For each URL whose save directory has ``main.html`` but no
        ``webpage.jpeg``, a ``CallPageShot`` is started and this method
        waits for the ``shot_over_sign`` file it leaves, then removes it.
        """
        # read_crawler_config is not defined on this class (its call in
        # run() is commented out), so calling it unconditionally raised
        # AttributeError; only invoke it when a subclass provides it.
        read_config = getattr(self, 'read_crawler_config', None)
        if callable(read_config):
            read_config()
        img_type = 'webpage'  # image name: webpage.jpeg
        while True:
            next_item = self._next_shot_url()
            if next_item is None:
                break
            url, url_type = next_item
            print('shot:  %s' % (url,))
            web_save_path = WebSavePath()
            local_html, local_time = web_save_path.get_html_path_abs(
                url, url_type)
            if local_time is None:
                sys.stderr.write('%s insert_web_info, web not be saved: %s\n' %
                                 (time.ctime(), url))
                continue
            webpage_path = local_time + '/webpage.jpeg'
            if os.path.exists(webpage_path):
                continue
            main_html_path = local_time + '/main.html'
            if not os.path.exists(main_html_path):
                sys.stderr.write(
                    '%s insert_web_info, main.html not be exist: %s\n' %
                    (time.ctime(), url))
                continue
            call_page_shot = CallPageShot(main_html_path, local_time,
                                          img_type)
            call_page_shot.start()
            # NOTE(review): this wait has no timeout; it blocks forever if
            # the sign file is never written.
            shot_sign_path = local_time + '/shot_over_sign'
            while not os.path.exists(shot_sign_path):
                time.sleep(0.5)
            os.remove(shot_sign_path)
            print('shot over')

    def run(self):
        """Mark the engine as running and start the Qt browser loop."""
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time,
                                              'qt_crawler', 2)
        self.app = QApplication(sys.argv)
        self.br = Browser(self.task_id, self.get_protected_iter,
                          self.get_gray_iter, self.get_counterfeit_iter,
                          self.get_monitor_iter, self.mongo_operate,
                          self.update_running_state, self.update_finish_state,
                          self.mysql_handle, self.run_start_time)
        # self.br.showMaximized()  # show web
        # self.br.show()
        sys.exit(self.app.exec_())
class StructureStart(multiprocessing.Process):
    """Structure engine process.

    Compares the stored web tree of every gray URL with the trees of the
    protected and the counterfeit sites through ``StructureCompare`` and
    records every match in mysql.
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid, structure_num_compare_k,
                 structure_num_compare_b, structure_area_compare_k,
                 structure_area_compare_b):
        super(StructureStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # Parameters handed to StructureCompare.
        self.structure_num_compare_k = structure_num_compare_k
        self.structure_num_compare_b = structure_num_compare_b
        self.structure_area_compare_k = structure_area_compare_k
        self.structure_area_compare_b = structure_area_compare_b
        # Mongo connection settings are kept on the instance as well.
        self.mongo_db = mongo_db
        self.mongo_host = mongo_host
        self.mongo_port = mongo_port
        self.mongo_user = mongo_user
        self.mongo_password = mongo_password
        # Initial state.
        self.run_start_time = 0
        self.structure_check_num = 0  # number of checks performed
        self.structure_find_num = 0  # number of phishing URLs detected
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()

    def read_task_info(self):
        """Load the task start time, URL iterators and web-tree dicts."""
        mysql = self.mysql_handle
        self.task_start_time = mysql.get_task_last_time(self.task_id)
        saved_urls_iters = mysql.read_saved_urls(self.task_id,
                                                 self.mongo_operate)
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']
        self.protected_dict = mysql.get_all_protected_feature(
            self.mongo_operate.get_web_tree)
        self.counterfeit_dict = mysql.get_all_counterfeit_feature(
            self.mongo_operate.get_web_tree)

    def _task_result_wheres(self):
        """Build the WHERE clause selecting this run's task_result row."""
        return {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }

    def update_running_state(self):
        """Write the in-progress check/find counters to mysql."""
        counters = {
            'structure_check_num': [self.structure_check_num, 'd'],
            'structure_find_num': [self.structure_find_num, 'd'],
        }
        self.mysql_handle.require_post('task_result', counters,
                                       self._task_result_wheres(), 'update')

    def update_finished_state(self):
        """Write the final state (3), run time and counters, then notify."""
        run_time = int(time.time()) - int(self.run_start_time)
        final_fields = {
            'e_structure_state': [3, 'd'],
            'structure_run_time': [run_time, 's'],
            'structure_check_num': [self.structure_check_num, 'd'],
            'structure_find_num': [self.structure_find_num, 'd']
        }
        self.mysql_handle.require_post('task_result', final_fields,
                                       self._task_result_wheres(), 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        """Send message 6 to the control engine and drop this process' pid.

        When the control engine does not answer, the task state is set to 0.
        """
        answered = self.message_other_engine(6, ['00'], self.task_id)
        if answered is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))
        self.remove_process_pid(self.task_id)

    def _compare_with_known(self, comparer, gray_url, gray_block_list,
                            known_dict, url_keyword):
        """Compare one gray tree with known trees; record the first match.

        ``url_keyword`` is the keyword under which the matching known URL is
        passed to ``undate_gray_list_check_result``.
        """
        for known_url, known_block_list in known_dict.items():
            if known_block_list == []:
                continue
            if comparer.once_compare(known_block_list, gray_block_list) != 1:
                continue
            self.structure_find_num += 1
            self.mysql_handle.undate_gray_list_check_result(
                gray_url, 'structure', **{url_keyword: known_url})
            self.mysql_handle.undate_task_result_check_result(
                self.task_id, self.task_start_time, gray_url, 'structure')
            break

    def run_structure_compare(self):
        """Compare every gray URL with the protected and counterfeit sites.

        URLs without a stored web tree are skipped without being counted.

        NOTE(review): a URL matching both a protected and a counterfeit site
        increments ``structure_find_num`` twice -- confirm this is intended.
        """
        comparer = StructureCompare(
            self.structure_num_compare_k, self.structure_num_compare_b,
            self.structure_area_compare_k, self.structure_area_compare_b)
        while True:
            # The whole step stays inside the try, as in the original.
            try:
                gray_url = next(self.get_gray_iter)
                gray_block_list = self.mongo_operate.get_web_tree(
                    gray_url, 'gray')
                # mongo has no tree for this url
                if gray_block_list is False or gray_block_list == []:
                    continue
                self._compare_with_known(comparer, gray_url, gray_block_list,
                                         self.protected_dict, 'source_url')
                self._compare_with_known(comparer, gray_url, gray_block_list,
                                         self.counterfeit_dict,
                                         'counterfeit_url')
                self.structure_check_num += 1
                self.update_running_state()
            except StopIteration:
                break

    def run(self):
        """Mark the engine as running, compare all URLs, report the end."""
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time,
                                              'structure', 2)
        self.run_structure_compare()
        self.update_finished_state()
class DomainStart(multiprocessing.Process):
    '''
    Domain-variation engine, run as a child process.

    Reads the task configuration from mysql, lets a URLGenerator produce
    candidate urls, probes every candidate with twisted's getPage and stores
    the urls that answer as a new gray list (mongo + mysql).  The module
    global `_NUM` holds the number of request slots still free.
    '''

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        '''
        Load the task configuration and open the two scratch files.

        message_other_engine, write_process_pid and remove_process_pid are
        callables supplied by the caller; the remaining arguments are the
        task id and the mysql / mongo connection settings.
        '''
        super(DomainStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # initial state
        self.user_id = ''
        # urls to mutate: protected sites and known counterfeit sites are
        # mutated the same way, so they share one list
        self.wait_change_url_list = []
        self.original_host_rules = []
        self.original_top_rules = []
        self.original_path_rules = []
        self.exist_list = []  # urls that turned out to exist
        self.task_start_time = ''
        self.run_start_time = 0
        self.url_create_list = []
        self.protect_url = ''
        self.deferreds = []
        self.read_task_info()
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        # url generator object and the generator it hands out
        self.url_gen = URLGenerator(self.task_id, self.mongo_operate,
                                    self.update_running_state,
                                    self.wait_change_url_list,
                                    self.original_host_rules,
                                    self.original_top_rules,
                                    self.original_path_rules)
        self.domain_change_url = self.url_gen.URL_Generator()
        self.domain_save_path = '/tmp/' + \
            str(task_id) + '_domain_request_urls.txt'
        self.domain_live_path = '/tmp/' + \
            str(task_id) + '_domain_live.txt'
        self.file_request_urls = open(self.domain_save_path, 'w')
        self.file_live_url = open(self.domain_live_path, 'w')

    def read_task_info(self):
        '''
        Read this task's row from mysql `task_info` and load the rule lists.

        Terminates the process with os._exit(0) when the task row is
        missing.
        '''
        table_name = 'task_info'
        fields = [
            'last_time', 'user_id', 'protected_id', 'counterfeit_id',
            'host_rule_id', 'top_rule_id', 'path_rule_id'
        ]
        wheres = {'task_id': [self.task_id, 'd']}
        task_info = self.mysql_handle.require_get(table_name, fields, wheres,
                                                  'select', 'one')
        if task_info is False:
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']
        self.read_rule_config(task_info['protected_id'],
                              task_info['counterfeit_id'],
                              task_info['host_rule_id'],
                              task_info['top_rule_id'],
                              task_info['path_rule_id'])

    def _read_column_by_ids(self, table_name, column, joined_ids):
        '''
        Return `column` of every `table_name` row whose id is listed in the
        '-'-joined string `joined_ids`; ids without a row are skipped.
        None or '' yields an empty list.
        '''
        if joined_ids is None or joined_ids == '':
            return []
        values = []
        for row_id in joined_ids.split('-'):
            row = self.mysql_handle.require_get(
                table_name, [column], {'id': [int(row_id), 'd']},
                'select', 'one')
            if row is False:
                continue
            values.append(row[column])
        return values

    def read_rule_config(self, protected_list_id, counterfeit_list_id,
                         host_rule_id, top_rule_id, path_rule_id):
        '''
        Read the protected / counterfeit urls and the mutation rules from
        mysql.

        Every argument is a '-'-joined string of row ids (or None / '').
        Urls are appended to wait_change_url_list; host rules are split on
        '|' before being stored, top and path rules are stored whole.
        '''
        # protected sites
        self.wait_change_url_list.extend(self._read_column_by_ids(
            'protected_list', 'url', protected_list_id))
        # known counterfeit sites that should be mutated as well
        self.wait_change_url_list.extend(self._read_column_by_ids(
            'counterfeit_list', 'url', counterfeit_list_id))
        # host-name mutation rules, several per row
        for joined_rules in self._read_column_by_ids(
                'host_change_rule', 'change_rule', host_rule_id):
            for once_result in joined_rules.split('|'):
                self.original_host_rules.append(str(once_result))
        # top-level-domain mutation rules
        for rule in self._read_column_by_ids(
                'top_change_rule', 'change_rule', top_rule_id):
            self.original_top_rules.append(str(rule))
        # path mutation rules
        for rule in self._read_column_by_ids(
                'path_change_rule', 'change_rule', path_rule_id):
            self.original_path_rules.append(str(rule))

    def update_running_state(self, all_change_num=0, all_exist_change_num=0,
                             changed_num=0, gray_exist_num=0, update_type=0):
        '''
        Update the progress counters of this task run in mysql.

        update_type=0: domain change update (writes the three change
                       counters)
        update_type=1: url exist check update (writes gray_exist_num only)

        All counters default to 0 so that a caller may pass only the ones
        its update_type needs; pageCallback does exactly that and used to
        fail with TypeError.
        '''
        table_name = 'task_result'
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        if update_type == 0:
            fields = {
                'domain_changed_all_num': [all_change_num, 'd'],
                'domain_changed_exist_num': [all_exist_change_num, 'd'],
                'domain_detected_num': [changed_num, 'd']
            }
            self.mysql_handle.require_post(table_name, fields, wheres,
                                           'update')
        elif update_type == 1:
            fields = {'domain_gray_url_num': [gray_exist_num, 'd']}
            self.mysql_handle.require_post(table_name, fields, wheres,
                                           'update')

    def create_gray_mongo(self, exist_list):
        '''
        Create a gray list for this task in mongo, fill it with
        `exist_list` and return the id handed back by create_gray.
        '''
        gray_name = 'NO.' + str(self.task_id) + ' task domian'
        detect_objectID = self.mongo_operate.create_gray(
            gray_name=gray_name,
            gray_type='domain_change',
            usr_id=self.user_id,
            task_id=self.task_id)
        self.mongo_operate.add_gray_list(exist_list, detect_objectID)
        return detect_objectID

    def update_finish_state(self, exist_list, run_time):
        '''
        task run over, update information in mysql

        Stores the gray list, writes the final figures to `task_result`,
        messages the control engine (code 9 when nothing was found, code 5
        otherwise) and unregisters the pid.
        '''
        detect_objectID = self.create_gray_mongo(exist_list)
        exist_url_num = len(exist_list)
        # save domain engine result in mysql task_result
        table_name = 'task_result'
        fields = {
            # plain decimal 3: the former spelling `03` is an octal literal
            # that only Python 2 accepts
            'e_domain_state': [3, 'd'],
            'domain_gray_url_num': [exist_url_num, 'd'],
            'original_grayid': [detect_objectID, 's'],
            'domain_run_time': [run_time, 's']
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        self.mysql_handle.require_post(table_name, fields, wheres, 'update')
        if exist_list == []:
            send_result = self.message_other_engine(9, ['00'], self.task_id)
        else:
            # save gray_list info in mysql
            self.mysql_handle.insert_suspect_list(detect_objectID,
                                                  self.user_id,
                                                  self.task_id,
                                                  'domain_change',
                                                  exist_url_num,
                                                  suspect_type=2)
            self.mysql_handle.insert_gray_list(exist_list,
                                               source='domain_change')
            # message to control
            send_result = self.message_other_engine(5, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def _write_live_marker(self, url):
        '''Overwrite the live file with the url just handled and our pid.'''
        self.file_live_url.seek(0)
        self.file_live_url.truncate(0)
        self.file_live_url.write(url + ' ' + str(self.engine_pid))
        self.file_live_url.flush()

    def pageCallback(self, result, url, protect):
        '''
        getPage success callback: the page exists.

        A page whose <title> does not contain "Redirect" (pages without a
        title included) is recorded as existing.  In every case the request
        slot is released and the next requests are scheduled.
        '''
        global _NUM
        match = re.search(r"<title>(.*?)</title>", result)
        title = match.group(1) if match is not None else 'None'
        if title.find("Redirect") == -1:
            self.exist_list.append(url)
            self.update_running_state(gray_exist_num=len(self.exist_list),
                                      update_type=1)
            self.file_request_urls.write(url + '\n')
            self.file_request_urls.flush()
        self._write_live_marker(url)
        _NUM += 1
        self.download()

    def _remove_scratch_files(self):
        '''Delete both /tmp scratch files; a missing file is not an error.'''
        for path in (self.domain_save_path, self.domain_live_path):
            try:
                os.remove(path)
            except OSError:
                pass

    def _close_scratch_files(self):
        '''Close both scratch file handles, ignoring I/O errors.'''
        for handle in (self.file_request_urls, self.file_live_url):
            try:
                handle.close()
            except (IOError, OSError):
                pass

    def finish(self, ign):
        '''
        Called once all deferreds are done: stop the reactor loop and delete
        the scratch files.
        '''
        try:
            reactor.stop()
        except Exception:
            # download() can attach this callback more than once, and it may
            # also fire before the reactor was started; stopping a reactor
            # that is not running raises, which is harmless here.
            pass
        # deliberately independent of reactor.stop() succeeding
        self._remove_scratch_files()

    def fetch_error(self, error, url, protect):
        '''
        getPage failure callback: the page could not be fetched.

        A user timeout is retried; any other failure releases the request
        slot and schedules the next requests.
        '''
        global _NUM
        if error.getErrorMessage().find(
                'User timeout caused connection failure') != -1:
            # NOTE(review): the retry is unbounded and its deferred is not
            # added to self.deferreds -- confirm this is acceptable.
            d = getPage(url)
            d.addCallback(self.pageCallback, url, protect)
            d.addErrback(self.fetch_error, url, protect)
        else:
            self._write_live_marker(url)
            _NUM += 1
            self.download()

    def download(self):
        '''
        Issue getPage requests while request slots (`_NUM`) are free.

        When the current batch is used up the next batch is pulled from the
        url generator; once the generator is exhausted a DeferredList over
        all issued requests is armed to call finish().
        '''
        global _NUM
        while _NUM > 0:
            try:
                url = self.url_create_list.pop(0)
            except IndexError:
                # current batch exhausted: fetch the next one
                try:
                    batch = next(self.domain_change_url)
                except StopIteration:
                    dl = defer.DeferredList(self.deferreds)
                    dl.addCallback(self.finish)
                    break
                # first element is the source url, the rest are candidates
                self.protect_url = batch[0]
                self.url_create_list = batch[1:]
                continue
            encoded_url = url.encode('utf-8')
            d = getPage(encoded_url)
            d.addCallback(self.pageCallback, encoded_url, self.protect_url)
            d.addErrback(self.fetch_error, encoded_url, self.protect_url)
            _NUM -= 1
            self.deferreds.append(d)

    def run(self):
        '''
        Process entry point: probe every generated url, then store the
        result.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.engine_pid = os.getpid()
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time, 'domain',
                                              2)
        try:
            first_batch = next(self.domain_change_url)
        except StopIteration:
            pass
        else:
            self.protect_url = first_batch[0]
            self.url_create_list = first_batch[1:]
        self.download()
        # start
        if self.deferreds:
            reactor.run()
        # else: no request was issued, so nothing could ever stop the
        # reactor again (an empty DeferredList fires finish() immediately,
        # before the reactor runs) and reactor.run() would block forever.

        # the reactor is stopped: no callback can touch the files any more
        self._close_scratch_files()
        self._remove_scratch_files()
        # finish
        run_time = int(time.time()) - int(self.run_start_time)
        self.update_finish_state(self.exist_list, run_time)
class WhoisSearchStart(multiprocessing.Process):
    '''
    Whois engine, run as a child process.

    Runs a whois search and a reverse-whois lookup for the urls configured
    on the task and for every known counterfeit url, and stores the domains
    returned by the reverse lookups as a new gray list.
    '''

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        '''
        Keep the connection settings and load the task configuration.

        message_other_engine, write_process_pid and remove_process_pid are
        callables supplied by the caller.
        '''
        super(WhoisSearchStart, self).__init__()
        self.task_id = task_id
        self.mysql_host = mysql_host
        self.mysql_db = mysql_db
        self.mysql_user = mysql_user
        self.mysql_password = mysql_password
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        # initial state
        self.task_start_time = ''
        self.user_id = ''
        self.whois_search_url = ''
        self.whois_reverse_url = ''
        self.counterfeit_urls = []
        self.task_state = 0
        self.read_task_info()

    def read_task_info(self):
        '''
        Read this task's row from mysql `task_info` and collect the urls of
        the counterfeit entries it references.

        Terminates the process with os._exit(0) when the task row is
        missing.
        '''
        table_name = 'task_info'
        fields = ['last_time', 'user_id', 'counterfeit_id',
                  'whois_search_url', 'whois_reverse_url']
        wheres = {'task_id': [self.task_id, 'd']}
        task_info = self.mysql_handle.require_get(
            table_name, fields, wheres, 'select', 'one')
        if task_info is False:
            sys.stderr.write(
                '%s  task no exist, task_id: %s\n' %
                (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']
        self.whois_search_url = task_info['whois_search_url']
        self.whois_reverse_url = task_info['whois_reverse_url']
        original_counterfeit_list = task_info['counterfeit_id']
        # get counterfeit url in mysql counterfeit_list
        if original_counterfeit_list is None or \
                original_counterfeit_list == '':
            return
        table_name = 'counterfeit_list'
        fields = ['url']
        for counterfeit_id in original_counterfeit_list.split('-'):
            wheres = {'id': [int(counterfeit_id), 'd']}
            select_result = self.mysql_handle.require_get(
                table_name, fields, wheres, 'select', 'one')
            if select_result is False:
                continue
            self.counterfeit_urls.append(
                select_result['url'].encode('utf-8'))

    def add_gray_list(self, url_list):
        '''
        Store `url_list` as a new 'whois_reverse' gray list in mongo and
        register it in mysql.

        Returns False without doing anything when `url_list` is empty,
        otherwise None.
        '''
        if url_list == []:
            return False
        gray_objectid = self.mongo_operate.create_gray(
            gray_name='whois_reverse_gray', gray_type='whois_reverse',
            usr_id=self.user_id, task_id=self.task_id)
        self.mongo_operate.add_gray_list(url_list, gray_objectid)
        table_name = 'task_result'
        fields = {'original_grayid': [gray_objectid, 's']}
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        self.mysql_handle.require_post(
            table_name, fields, wheres, 'update')
        # save gray_list info in mysql suspect_list
        url_num = len(url_list)
        self.mysql_handle.insert_suspect_list(gray_objectid, self.user_id,
                                              self.task_id, 'whois_reverse',
                                              url_num, suspect_type=2)
        self.mysql_handle.insert_gray_list(url_list, source='whois_reverse')

    def update_finish_state(self, new_gray_lsit):
        '''
        Write the final engine state and run time to mysql, message the
        control engine (code 9 when no gray url was found, code 5 after
        storing the gray list otherwise) and unregister the pid.
        '''
        run_time = int(time.time() - self.run_start_time)
        table_name = 'task_result'
        # plain decimal 3: the former spelling `03` is an octal literal that
        # only Python 2 accepts
        fields = {'e_whois_search_state': [3, 'd'],
                  'whois_search_run_time': [run_time, 's']}
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        self.mysql_handle.require_post(
            table_name, fields, wheres, 'update')
        # message to control
        if new_gray_lsit == []:
            send_result = self.message_other_engine(9, ['00'], self.task_id)
        else:
            self.add_gray_list(new_gray_lsit)
            send_result = self.message_other_engine(5, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write(
            '%s  |*|engine win over|*|, task_id: %s\n' %
            (time.ctime(), self.task_id))

    def run_whois_reverse(self, url):
        '''
        Return 'http://<domain>/' for every domain the reverse-whois lookup
        of `url` yields.  Any failure is printed and yields an empty list.
        '''
        whois_reverse = WhoisReverse(self.mysql_host, self.mysql_db,
                                     self.mysql_user, self.mysql_password)
        try:
            reverse_domain_list = whois_reverse.get_reverse_whois(url)
            return ['http://' + domain + '/'
                    for domain in reverse_domain_list]
        except Exception:
            # best effort: one failed lookup must not abort the engine
            traceback.print_exc()
            return []

    def run_whois_search(self, url):
        '''
        Run the whois search module for `url`.  Any failure is printed and
        otherwise ignored so that it cannot abort the engine.
        '''
        url_analysis = Urlanalysis(1, self.mysql_host, self.mysql_user,
                                   self.mysql_password, self.mysql_db)
        try:
            url_analysis.getUrllist_list([url])
        except Exception:
            traceback.print_exc()

    def web_save_transfer(self, url):
        '''
        Move the saved copy of `url` from the gray store to the counterfeit
        store, both in mongo and on disk.
        '''
        self.mongo_operate.transfer_web_save(
            url, source_type='gray', goal_type='counterfeit')
        h = WebSavePath()
        source_file_path, target_file_path = h.get_transfer_path(
            url, 'gray', 'counterfeit')
        web_info_transfer(source_file_path, target_file_path)

    def whois_operation(self):
        '''
        Run all whois lookups of the task and store the outcome.

        The domains found by every reverse lookup -- the task-level
        whois_reverse_url as well as each counterfeit url -- are collected
        into one gray list and handed to update_finish_state.
        '''
        new_gray_lsit = []
        if self.whois_search_url != '' and self.whois_search_url is not None:
            self.run_whois_search(self.whois_search_url)
        if self.whois_reverse_url != '' and \
                self.whois_reverse_url is not None:
            # the lookup result used to be thrown away here
            # NOTE(review): assumes WhoisReverse does not persist these
            # results by itself -- confirm.
            new_gray_lsit.extend(
                self.run_whois_reverse(self.whois_reverse_url))
        # consume the counterfeit urls from the end of the list
        while self.counterfeit_urls:
            url = self.counterfeit_urls.pop()
            self.mysql_handle.update_counterfeit_list_statistic(url)
            self.run_whois_search(url)
            new_gray_lsit.extend(self.run_whois_reverse(url))
        self.update_finish_state(new_gray_lsit)

    def run(self):
        '''
        Process entry point: register the pid, mark the whois engine with
        state 2 in mysql and run the lookups.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'whois_search', 2)
        self.whois_operation()