class View_Emd_start(multiprocessing.Process):
    '''
    Worker process that runs the view EMD check over the gray URLs of a task.

    The task data is loaded in the constructor. run() feeds every gray URL to
    ViewEmd, records the progress in the task_result table and finally
    reports to the other engine through message_other_engine.
    '''

    def __init__(self, task_id, current_path, mysql_host, mysql_db,
                 mysql_user, mysql_password, mongo_db, mongo_host, mongo_port,
                 mongo_user, mongo_password, message_other_engine,
                 write_process_pid, remove_process_pid):
        '''
        Open the MySQL / Mongo handles and load the task information.

        message_other_engine, write_process_pid and remove_process_pid are
        callables supplied by the caller and stored for later use.
        '''
        super(View_Emd_start, self).__init__()
        self.task_id = task_id
        self.task_start_time = ''
        self.user_id = ''
        self.view_protected_objectid = ''
        self.view_gray_objectid = ''
        self.view_counterfeit_objectid = ''
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        # NOTE(review): the current_path argument is accepted but not used;
        # sys.path[0] is stored instead -- confirm this is intended.
        self.current_path = sys.path[0]
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        self.read_task_info()

    def read_task_info(self):
        '''
        Read the task information: start time, saved URL iterators and the
        title features of the protected and counterfeit sites.
        '''
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']
        self.protected_title_dict = \
            self.mysql_handle.get_all_protected_feature(
                self.mongo_operate.get_web_title)
        self.counterfeit_title_dict = \
            self.mysql_handle.get_all_counterfeit_feature(
                self.mongo_operate.get_web_title)

    def _task_wheres(self):
        '''Return the WHERE description selecting this task's result row.'''
        return {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }

    def update_running_state(self, finish_num, view_find_num):
        '''
        Update the running counters (checked / found) in task_result.
        '''
        fields = {
            'view_check_num': [finish_num, 'd'],
            'view_find_num': [view_find_num, 'd'],
        }
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_wheres(), 'update')

    def engine_over_handle(self):
        '''
        Send message 6 to engine '00' and unregister this process.

        When the message cannot be delivered (the call returns False) the
        task state is set to 0.
        '''
        send_result = self.message_other_engine(6, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def update_finished_state(self, run_time, finish_num):
        '''
        Store the finished state (3) and the run time in task_result, then
        run the end-of-engine handling.

        finish_num is accepted for interface compatibility; it is not
        written by this method.
        '''
        fields = {
            'e_view_emd_state': [3, 'd'],
            'view_emd_run_time': [run_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_wheres(), 'update')
        self.engine_over_handle()

    def run(self):
        '''
        Process entry point: check every gray URL and report the result.
        '''
        # NOTE(review): write_process_pid is stored but never called here,
        # although engine_over_handle() calls remove_process_pid -- confirm
        # whether the pid should be registered when the run starts.
        finish_num = 0
        view_find_num = 0
        start_time = time.time()
        view_emd = ViewEmd(self.mysql_handle, self.mongo_operate,
                           self.task_id, self.task_start_time,
                           self.protected_title_dict,
                           self.counterfeit_title_dict)
        while True:
            # Only the iterator advance is guarded, so a StopIteration raised
            # inside the check itself is not mistaken for "no more URLs".
            try:
                gray_url = next(self.get_gray_iter)
            except StopIteration:
                break
            view_find_flags = view_emd.emdcalculate(gray_url)
            finish_num += 1
            view_find_num += view_find_flags
            self.update_running_state(finish_num, view_find_num)
        run_time = int(time.time()) - int(start_time)
        self.update_finished_state(run_time, finish_num)
class View_Emd_start(multiprocessing.Process):
    '''
    Worker process that runs the view EMD check over the gray URLs of a task.

    The task data is loaded in the constructor. run() feeds every gray URL to
    ViewEmd, records the progress in the task_result table and finally
    reports to the other engine through message_other_engine.
    '''

    def __init__(self, task_id, current_path, mysql_host, mysql_db,
                 mysql_user, mysql_password, mongo_db, mongo_host, mongo_port,
                 mongo_user, mongo_password, message_other_engine,
                 write_process_pid, remove_process_pid):
        '''
        Open the MySQL / Mongo handles and load the task information.

        message_other_engine, write_process_pid and remove_process_pid are
        callables supplied by the caller and stored for later use.
        '''
        super(View_Emd_start, self).__init__()
        self.task_id = task_id
        self.task_start_time = ''
        self.user_id = ''
        self.view_protected_objectid = ''
        self.view_gray_objectid = ''
        self.view_counterfeit_objectid = ''
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        # NOTE(review): the current_path argument is accepted but not used;
        # sys.path[0] is stored instead -- confirm this is intended.
        self.current_path = sys.path[0]
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        self.read_task_info()

    def read_task_info(self):
        '''
        Read the task information: start time, saved URL iterators and the
        title features of the protected and counterfeit sites.
        '''
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']
        self.protected_title_dict = \
            self.mysql_handle.get_all_protected_feature(
                self.mongo_operate.get_web_title)
        self.counterfeit_title_dict = \
            self.mysql_handle.get_all_counterfeit_feature(
                self.mongo_operate.get_web_title)

    def _task_wheres(self):
        '''Return the WHERE description selecting this task's result row.'''
        return {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }

    def update_running_state(self, finish_num, view_find_num):
        '''
        Update the running counters (checked / found) in task_result.
        '''
        fields = {
            'view_check_num': [finish_num, 'd'],
            'view_find_num': [view_find_num, 'd'],
        }
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_wheres(), 'update')

    def engine_over_handle(self):
        '''
        Send message 6 to engine '00' and unregister this process.

        When the message cannot be delivered (the call returns False) the
        task state is set to 0.
        '''
        send_result = self.message_other_engine(6, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def update_finished_state(self, run_time, finish_num):
        '''
        Store the finished state (3) and the run time in task_result, then
        run the end-of-engine handling.

        finish_num is accepted for interface compatibility; it is not
        written by this method.
        '''
        fields = {
            'e_view_emd_state': [3, 'd'],
            'view_emd_run_time': [run_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_wheres(), 'update')
        self.engine_over_handle()

    def run(self):
        '''
        Process entry point: check every gray URL and report the result.
        '''
        # NOTE(review): write_process_pid is stored but never called here,
        # although engine_over_handle() calls remove_process_pid -- confirm
        # whether the pid should be registered when the run starts.
        finish_num = 0
        view_find_num = 0
        start_time = time.time()
        view_emd = ViewEmd(self.mysql_handle, self.mongo_operate,
                           self.task_id, self.task_start_time,
                           self.protected_title_dict,
                           self.counterfeit_title_dict)
        while True:
            # Only the iterator advance is guarded, so a StopIteration raised
            # inside the check itself is not mistaken for "no more URLs".
            try:
                gray_url = next(self.get_gray_iter)
            except StopIteration:
                break
            view_find_flags = view_emd.emdcalculate(gray_url)
            finish_num += 1
            view_find_num += view_find_flags
            self.update_running_state(finish_num, view_find_num)
        run_time = int(time.time()) - int(start_time)
        self.update_finished_state(run_time, finish_num)
class FeatureSaveStart(multiprocessing.Process):
    '''
    Worker process that stores the web features of every saved URL of a task
    (protected, gray, counterfeit and monitor URLs, in that order).
    '''

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        '''
        Open the MySQL / Mongo handles and load the task start time and the
        saved URL iterators.

        message_other_engine, write_process_pid and remove_process_pid are
        callables supplied by the caller and stored for later use.
        '''
        super(FeatureSaveStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # initialisation
        self.run_start_time = 0
        self.save_num = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']

    def _task_wheres(self):
        '''Return the WHERE description selecting this task's result row.'''
        return {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }

    def update_running_state(self, save_num):
        '''
        Update the number of saved features in task_result.
        '''
        fields = {'feature_save_num': [save_num, 'd']}
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_wheres(), 'update')

    def update_finish_state(self, save_num):
        '''
        Store the finished state (3), the final count and the run time in
        task_result, then run the end-of-engine handling.
        '''
        run_time = int(time.time() - self.run_start_time)
        fields = {
            'e_feature_save_state': [3, 'd'],
            'feature_save_num': [save_num, 'd'],
            'feature_save_run_time': [run_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_wheres(), 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        '''
        Send message 7 to engine '00' and unregister this process.

        When the message cannot be delivered (the call returns False) the
        task state is set to 0.
        '''
        send_result = self.message_other_engine(7, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def _next_saved_url(self):
        '''
        Return (url, url_type) from the first iterator that still yields a
        URL, or None when all four iterators are exhausted.
        '''
        sources = (
            ('protected', self.get_protected_iter),
            ('gray', self.get_gray_iter),
            ('counterfeit', self.get_counterfeit_iter),
            ('monitor', self.get_monitor_iter),
        )
        for url_type, url_iter in sources:
            try:
                return next(url_iter), url_type
            except StopIteration:
                continue
        return None

    def save_web_feature(self):
        '''
        Insert the feature of every saved URL into '<url_type>_feature' and
        update the running counter after each insert.
        '''
        while True:
            found = self._next_saved_url()
            if found is None:
                break
            url, url_type = found
            table_name = url_type + '_feature'
            self.mysql_handle.insert_web_feature(url, url_type, table_name,
                                                 update_sign=True)
            self.save_num += 1
            self.update_running_state(self.save_num)

    def run(self):
        '''
        Process entry point: register the pid, mark the engine state as 2,
        save all features and store the finished state.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'feature_save', 2)
        self.save_web_feature()
        self.update_finish_state(self.save_num)
class MainControl(ServerBase):
    '''
    Control server: receives start / stop requests and the "finished"
    messages of the other engines, starts the next engine of the task
    pipeline and records the final task state in MySQL.
    '''

    def __init__(self):
        super(MainControl, self).__init__('control')
        self.mysql_handle = MysqlOperate(self.mysql_db, self.mysql_host,
                                         self.mysql_user, self.mysql_password)

    def read_task_info(self, task_id):
        '''
        Read the task type and the engines to run from task_info.

        Returns (task_type, task_engines) where task_engines is the
        '-'-separated task_engine column split into a list.
        NOTE: when the task does not exist the whole process is terminated
        with os._exit(0).
        '''
        table_name = 'task_info'
        fields = ['task_type', 'task_engine']
        wheres = {'task_id': [task_id, 'd']}
        task_info = self.mysql_handle.require_get(
            table_name, fields, wheres, 'select', 'one')
        if task_info is False:
            sys.stderr.write(
                '%s task no exist, task_id: %s\n' % (time.ctime(), task_id))
            os._exit(0)
        task_type = task_info['task_type']
        task_engines = task_info['task_engine'].split('-')
        return task_type, task_engines

    def read_running_engine(self, task_id):
        '''
        Return the numbers of the engines whose state column is 2, or False
        when the task_result row cannot be read.

        The engine name is taken from the column name ('e_<name>_state') and
        translated through the module-level engine_list mapping.
        '''
        task_start_time = self.mysql_handle.get_task_last_time(task_id)
        table_name = 'task_result'
        fields = ['e_domain_state', 'e_search_state', 'e_filtrate_state',
                  'e_web_save_state', 'e_qt_crawler_state',
                  'e_feature_save_state', 'e_whois_search_state',
                  'e_title_state', 'e_structure_state',
                  'e_view_collect_state', 'e_view_emd_state']
        wheres = {'task_id': [task_id, 'd'],
                  'start_time': [task_start_time, 's']}
        select_result = self.mysql_handle.require_get(
            table_name, fields, wheres, 'select', 'one')
        if select_result is False:
            return False
        running_engine_list = []
        for column in select_result:
            if select_result[column] == 2:
                # strip the 'e_' prefix and the '_state' suffix
                running_engine_list.append(engine_list[column[2:-6]])
        print('running_engine_list %s' % (running_engine_list,))
        return running_engine_list

    def update_start_state(self, task_id):
        '''Set the task state of the latest run of task_id to 2.'''
        task_start_time = self.mysql_handle.get_task_last_time(task_id)
        self.mysql_handle.update_task_state(task_id, task_start_time, 2)

    def update_finished_state(self, task_id, task_state=3):
        '''
        Write the final task_state, task_run_time and task_stop_time of the
        latest run of task_id into task_result.

        The run time is the difference between now and the stored start time
        (parsed as '%Y-%m-%d %H:%M:%S'). Returns the result of the update.
        '''
        # get task last_time in task_info
        task_start_time = self.mysql_handle.get_task_last_time(task_id)
        task_start_time_stamp = time.mktime(
            time.strptime(str(task_start_time), "%Y-%m-%d %H:%M:%S"))
        task_stop_time_stamp = time.time()
        task_run_time = task_stop_time_stamp - task_start_time_stamp
        task_stop_time = time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(task_stop_time_stamp))
        table_name = 'task_result'
        fields = {'task_state': [task_state, 'd'],
                  'task_run_time': [task_run_time, 'd'],
                  'task_stop_time': [task_stop_time, 's']}
        wheres = {'task_id': [task_id, 'd'],
                  'start_time': [task_start_time, 's']}
        result = self.mysql_handle.require_post(
            table_name, fields, wheres, post_type='update')
        sys.stdout.write(
            '%s |*|task win over|*|, task_id: %s, task_state: %s\n' %
            (time.ctime(), task_id, task_state))
        return result

    def check_engine_state(self, task_id, task_type, engines):
        '''
        Determine whether all the detection engines of the task have run
        over (state 3).

        An engine is required when its code ('08', '09', '10') is in engines
        or when task_type is 5. Returns False when any required engine is
        not in state 3, or when the task_result row cannot be read.
        '''
        task_start_time = self.mysql_handle.get_task_last_time(task_id)
        table_name = 'task_result'
        fields = ['e_title_state', 'e_structure_state', 'e_view_emd_state']
        wheres = {'task_id': [task_id, 'd'],
                  'start_time': [task_start_time, 's']}
        task_result = self.mysql_handle.require_get(
            table_name, fields, wheres, 'select', 'one')
        if task_result is False:
            # The row could not be read, so completion cannot be confirmed.
            return False
        required_states = (
            ('08', task_result['e_title_state']),
            ('09', task_result['e_structure_state']),
            ('10', task_result['e_view_emd_state']),
        )
        for engine_code, engine_state in required_states:
            if (engine_code in engines or task_type == 5) \
                    and engine_state != 3:
                return False
        return True

    # --- overridden response functions of the daemon base class ---

    def web_request_start(self, task_id):
        '''
        Respond to a task start request: start the first engine(s) of the
        task and set the task state to started.

        Returns the result of the last message_other_engine call, or False
        when nothing was started.
        '''
        sys.stdout.write('%s control receive task_id start request: %s\n' %
                         (time.ctime(), task_id))
        message_result = False
        task_type, task_engines = self.read_task_info(task_id)
        # 04: filtrate engine, 05: web save engine, 13: whois search engine
        first_engine = {2: '04', 4: '05', 5: '13'}
        if task_type == 1 or task_type == 3:
            for engine_code in ('01', '02', '13'):
                if engine_code in task_engines:
                    message_result = self.message_other_engine(
                        0, [engine_code], task_id)
                    self.update_start_state(task_id)
        elif task_type in first_engine:
            message_result = self.message_other_engine(
                0, [first_engine[task_type]], task_id)
            self.update_start_state(task_id)
        else:
            # %s instead of %d: an unexpected task_type may not be an int.
            sys.stderr.write(
                '%s task_type error, task_id: %s, task_type: %s\n' %
                (time.ctime(), task_id, task_type))
        return message_result

    def web_request_stop(self, task_id):
        '''
        Respond to a task stop request from the front-end client: send
        message 1 to every running engine and, on success, store the
        finished state. Returns False when the engines could not be read or
        notified.
        '''
        sys.stdout.write('%s control receive task_id stop request: %s\n' %
                         (time.ctime(), task_id))
        running_engine_list = self.read_running_engine(task_id)
        if running_engine_list is False:
            # The engine states could not be read; do not forward False as
            # an engine list.
            sys.stderr.write(
                '%s read running engine failed, task_id: %s\n' %
                (time.ctime(), task_id))
            return False
        message_result = self.message_other_engine(
            1, running_engine_list, task_id)
        if message_result is True:
            return self.update_finished_state(task_id)
        return False

    def filtrate_to_control(self, task_id):
        '''
        message 2: the filtrate engine finished; start the web_save engine.
        '''
        sys.stdout.write(
            '%s control receive from filtrate engine task_id: %s\n' %
            (time.ctime(), task_id))
        # 05: web_save engine
        self.message_other_engine(0, ['05'], task_id)

    def web_save_to_control(self, task_id):
        '''
        message 3: the web_save engine finished; start the qt_crawler engine
        and, when requested or for task_type 5, the title engine.
        '''
        sys.stdout.write(
            '%s control receive from web_save engine task_id: %s\n' %
            (time.ctime(), task_id))
        # 06: qt_crawler engine, 08: title engine
        task_type, task_engines = self.read_task_info(task_id)
        self.message_other_engine(0, ['06'], task_id)
        if '08' in task_engines or task_type == 5:
            self.message_other_engine(0, ['08'], task_id)

    def qt_crawler_to_control(self, task_id):
        '''
        message 4: the qt_crawler engine finished; start engines 12 and 07
        and, when requested or for task_type 5, the structure engine (09).
        '''
        sys.stdout.write(
            '%s control receive from qt_crawler engine task_id: %s\n' %
            (time.ctime(), task_id))
        # NOTE(review): 12 and 07 are presumably the view_collect and
        # feature_save engines -- confirm against the engine table.
        task_type, task_engines = self.read_task_info(task_id)
        self.message_other_engine(0, ['12'], task_id)
        self.message_other_engine(0, ['07'], task_id)
        if task_type == 5 or '09' in task_engines:
            self.message_other_engine(0, ['09'], task_id)

    def detect_to_control(self, task_id):
        '''
        message 5: a detect engine (domain, search or whois) finished; start
        the filtrate engine for task_type 3 / 5, finish the task for
        task_type 1.
        '''
        sys.stdout.write(
            '%s control receive from detect engine task_id: %s\n' %
            (time.ctime(), task_id))
        task_type, task_engines = self.read_task_info(task_id)
        if task_type == 3 or task_type == 5:
            self.message_other_engine(0, ['04'], task_id)
        elif task_type == 1:
            # task over
            self.update_finished_state(task_id)

    def check_to_control(self, task_id):
        '''
        message 6: a check engine (title, structure or view) finished;
        finish the task once all required check engines are over.
        '''
        task_type, task_engines = self.read_task_info(task_id)
        if self.check_engine_state(task_id, task_type, task_engines) is True:
            # all check engines over, task over
            self.update_finished_state(task_id)

    def feature_save_to_control(self, task_id):
        '''
        message 7: feature_save finished; it is the last engine only for
        task_type 4, so only then is the task finished.
        '''
        task_type, task_engines = self.read_task_info(task_id)
        if task_type == 4:
            self.update_finished_state(task_id)

    def engine_failure_to_control(self, task_id):
        '''
        message 8: an engine failed; finish the task with state 0.
        '''
        self.update_finished_state(task_id, 0)

    def engine_win_over_to_control(self, task_id):
        '''
        message 9: an engine is over and no further engine has to be
        started; finish the task.
        '''
        self.update_finished_state(task_id)

    def view_collect_to_control(self, task_id):
        '''
        message 10: the view_collect engine is over; start view_emd (10)
        when requested or for task_type 5, finish the task for task_type 4.
        '''
        task_type, task_engines = self.read_task_info(task_id)
        if task_type == 5 or '10' in task_engines:
            self.message_other_engine(0, ['10'], task_id)
        elif task_type == 4:
            self.update_finished_state(task_id)
class Title_start(multiprocessing.Process):
    '''
    Worker process that runs the title check over the gray URLs of a task,
    first against the protected features and then against the counterfeit
    features.
    '''

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        '''
        Open the MySQL / Mongo handles and load the task information.

        message_other_engine, write_process_pid and remove_process_pid are
        callables supplied by the caller and stored for later use.
        '''
        super(Title_start, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.task_start_time = ''
        self.user_id = ''
        self.gary_objectid = ''
        self.protected_list_id = []
        self.get_protect_dict = {}
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()
        self.run_start_time = 0
        self.title_check_num = 0  # number of checks performed
        self.title_find_num = 0  # number of phishing URLs found
        # number of checks between two progress updates in the database
        self.once_update_num = 1

    def read_task_info(self):
        '''
        Read the task information: start time, gray / monitor URL iterators
        and the title and text features of the protected and counterfeit
        sites.
        '''
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']
        self.protected_title_dict = \
            self.mysql_handle.get_all_protected_feature(
                self.mongo_operate.get_web_title)
        self.protected_text_dict = \
            self.mysql_handle.get_all_protected_feature(
                self.mongo_operate.get_web_text)
        self.counterfeit_title_dict = \
            self.mysql_handle.get_all_counterfeit_feature(
                self.mongo_operate.get_web_title)
        self.counterfeit_text_dict = \
            self.mysql_handle.get_all_counterfeit_feature(
                self.mongo_operate.get_web_text)

    def _task_wheres(self):
        '''Return the WHERE description selecting this task's result row.'''
        return {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }

    def update_running_state(self, title_check_num, title_find_num):
        '''
        Update the running counters (checked / found) in task_result.
        '''
        fields = {
            'title_check_num': [title_check_num, 'd'],
            'title_find_num': [title_find_num, 'd'],
        }
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_wheres(), 'update')

    def update_finished_state(self):
        '''
        Store the finished state (3), the run time and the final counters in
        task_result, then run the end-of-engine handling.
        '''
        run_time = int(time.time()) - int(self.run_start_time)
        fields = {
            'e_title_state': [3, 'd'],
            'title_run_time': [run_time, 's'],
            'title_check_num': [self.title_check_num, 'd'],
            'title_find_num': [self.title_find_num, 'd'],
        }
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_wheres(), 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        '''
        Send message 6 to engine '00' and unregister this process.

        When the message cannot be delivered (the call returns False) the
        task state is set to 0.
        '''
        # message to control
        send_result = self.message_other_engine(6, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))
        self.remove_process_pid(self.task_id)

    def _check_url(self, title_main, gray_url, update_count):
        '''
        Run one title check, update the counters and flush the progress to
        the database every once_update_num checks.

        Returns the number of checks still pending a progress update.
        '''
        check_result = title_main.title_run(gray_url)
        self.title_find_num += check_result
        self.title_check_num += 1
        update_count += 1
        if update_count >= self.once_update_num:
            update_count = 0
            self.update_running_state(self.title_check_num,
                                      self.title_find_num)
        return update_count

    def run(self):
        '''
        Process entry point: check every gray URL against the protected
        features, then check the same URLs (in reverse order) against the
        counterfeit features, and store the finished state.
        '''
        self.run_start_time = time.time()
        self.write_process_pid(self.task_id)
        sys.stdout.write('%s |*|title engine start|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))
        title_main = TitleMain(self.task_id, self.task_start_time,
                               self.protected_title_dict,
                               self.protected_text_dict,
                               self.mongo_operate, self.mysql_handle)
        update_count = 0
        # The iterator can only be consumed once, so the URLs are kept for
        # the second (counterfeit) pass.
        pending_urls = []
        while True:
            # Only the iterator advance is guarded, so a StopIteration raised
            # inside the check itself is not mistaken for "no more URLs".
            try:
                gray_url = next(self.get_gray_iter)
            except StopIteration:
                break
            pending_urls.append(gray_url)
            update_count = self._check_url(title_main, gray_url,
                                           update_count)
        title_main2 = TitleMain(self.task_id, self.task_start_time,
                                self.counterfeit_title_dict,
                                self.counterfeit_text_dict,
                                self.mongo_operate, self.mysql_handle,
                                'counterfeit')
        # NOTE(review): every URL is counted in title_check_num once per
        # pass, i.e. twice in total -- confirm this is intended.
        while pending_urls:
            update_count = self._check_url(title_main2, pending_urls.pop(),
                                           update_count)
        self.update_finished_state()
class QtCrawler(multiprocessing.Process):
    '''
    Worker process that runs the Qt based crawler (Browser) over the saved
    URLs of a task.
    '''

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        '''
        Open the MySQL / Mongo handles and load the task start time and the
        saved URL iterators.

        message_other_engine, write_process_pid and remove_process_pid are
        callables supplied by the caller and stored for later use.
        '''
        super(QtCrawler, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # initialisation
        self.run_start_time = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']

    def _task_wheres(self):
        '''Return the WHERE description selecting this task's result row.'''
        return {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }

    def update_running_state(self, crawler_num):
        '''
        Update the number of crawled pages in task_result.
        '''
        fields = {'qt_crawler_num': [crawler_num, 'd']}
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_wheres(), 'update')

    def update_finish_state(self, crawler_num, run_time):
        '''
        Store the finished state (3), the final count and the run time in
        task_result, then run the end-of-engine handling.
        '''
        fields = {
            'e_qt_crawler_state': [3, 'd'],
            'qt_crawler_num': [crawler_num, 'd'],
            'qt_crawler_run_time': [run_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_wheres(), 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        '''
        Send message 4 to engine '00' and unregister this process.

        When the message cannot be delivered (the call returns False) the
        task state is set to 0.
        '''
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))
        send_result = self.message_other_engine(4, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)

    def _next_shot_url(self):
        '''
        Return (url, url_type) from the first iterator that still yields a
        URL, or None when all four iterators are exhausted.
        '''
        sources = (
            ('protected', self.get_protected_iter),
            ('gray', self.get_gray_iter),
            ('counterfeit', self.get_counterfeit_iter),
            ('monitor', self.get_monitor_iter),
        )
        for url_type, url_iter in sources:
            try:
                return next(url_iter), url_type
            except StopIteration:
                continue
        return None

    def page_shot(self):
        '''
        Take a page screenshot (webpage.jpeg) for every saved URL that has a
        saved main.html but no screenshot yet.

        Not called by run() in this class.
        '''
        # This class does not define read_crawler_config; call it only when
        # a subclass provides it instead of failing with AttributeError.
        read_config = getattr(self, 'read_crawler_config', None)
        if callable(read_config):
            read_config()
        while True:
            found = self._next_shot_url()
            if found is None:
                break
            url, url_type = found
            print('shot:  %s' % (url,))
            web_save_path = WebSavePath()
            local_html, local_time = web_save_path.get_html_path_abs(
                url, url_type)
            if local_time is None:
                sys.stderr.write(
                    '%s insert_web_info, web not be saved: %s\n' %
                    (time.ctime(), url))
                continue
            webpage_path = local_time + '/webpage.jpeg'
            img_type = 'webpage'  # img name : webpage.jpeg
            if os.path.exists(webpage_path):
                continue
            main_html_path = local_time + '/main.html'
            if not os.path.exists(main_html_path):
                sys.stderr.write(
                    '%s insert_web_info, main.html not be exist: %s\n' %
                    (time.ctime(), url))
                continue
            call_page_shot = CallPageShot(main_html_path, local_time,
                                          img_type)
            call_page_shot.start()
            # NOTE(review): waits without a timeout until the sign file
            # shows up; this blocks forever if it is never created.
            shot_over_sign = local_time + '/shot_over_sign'
            while not os.path.exists(shot_over_sign):
                time.sleep(0.5)
            os.remove(shot_over_sign)
            print('shot over')

    def run(self):
        '''
        Process entry point: register the pid, mark the engine state as 2
        and run the Qt event loop with the Browser; the process exits with
        the event loop's return code.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'qt_crawler', 2)
        self.app = QApplication(sys.argv)
        self.br = Browser(self.task_id, self.get_protected_iter,
                          self.get_gray_iter, self.get_counterfeit_iter,
                          self.get_monitor_iter, self.mongo_operate,
                          self.update_running_state,
                          self.update_finish_state, self.mysql_handle,
                          self.run_start_time)
        sys.exit(self.app.exec_())
class QtCrawler(multiprocessing.Process):
    '''
    Worker process that runs the Qt based crawler (Browser) over the saved
    URLs of a task.
    '''

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        '''
        Open the MySQL / Mongo handles and load the task start time and the
        saved URL iterators.

        message_other_engine, write_process_pid and remove_process_pid are
        callables supplied by the caller and stored for later use.
        '''
        super(QtCrawler, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        # initialisation
        self.run_start_time = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']

    def _task_wheres(self):
        '''Return the WHERE description selecting this task's result row.'''
        return {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }

    def update_running_state(self, crawler_num):
        '''
        Update the number of crawled pages in task_result.
        '''
        fields = {'qt_crawler_num': [crawler_num, 'd']}
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_wheres(), 'update')

    def update_finish_state(self, crawler_num, run_time):
        '''
        Store the finished state (3), the final count and the run time in
        task_result, then run the end-of-engine handling.
        '''
        fields = {
            'e_qt_crawler_state': [3, 'd'],
            'qt_crawler_num': [crawler_num, 'd'],
            'qt_crawler_run_time': [run_time, 's'],
        }
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_wheres(), 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        '''
        Send message 4 to engine '00' and unregister this process.

        When the message cannot be delivered (the call returns False) the
        task state is set to 0.
        '''
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))
        send_result = self.message_other_engine(4, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)

    def _next_shot_url(self):
        '''
        Return (url, url_type) from the first iterator that still yields a
        URL, or None when all four iterators are exhausted.
        '''
        sources = (
            ('protected', self.get_protected_iter),
            ('gray', self.get_gray_iter),
            ('counterfeit', self.get_counterfeit_iter),
            ('monitor', self.get_monitor_iter),
        )
        for url_type, url_iter in sources:
            try:
                return next(url_iter), url_type
            except StopIteration:
                continue
        return None

    def page_shot(self):
        '''
        Take a page screenshot (webpage.jpeg) for every saved URL that has a
        saved main.html but no screenshot yet.

        Not called by run() in this class.
        '''
        # This class does not define read_crawler_config; call it only when
        # a subclass provides it instead of failing with AttributeError.
        read_config = getattr(self, 'read_crawler_config', None)
        if callable(read_config):
            read_config()
        while True:
            found = self._next_shot_url()
            if found is None:
                break
            url, url_type = found
            print('shot:  %s' % (url,))
            web_save_path = WebSavePath()
            local_html, local_time = web_save_path.get_html_path_abs(
                url, url_type)
            if local_time is None:
                sys.stderr.write(
                    '%s insert_web_info, web not be saved: %s\n' %
                    (time.ctime(), url))
                continue
            webpage_path = local_time + '/webpage.jpeg'
            img_type = 'webpage'  # img name : webpage.jpeg
            if os.path.exists(webpage_path):
                continue
            main_html_path = local_time + '/main.html'
            if not os.path.exists(main_html_path):
                sys.stderr.write(
                    '%s insert_web_info, main.html not be exist: %s\n' %
                    (time.ctime(), url))
                continue
            call_page_shot = CallPageShot(main_html_path, local_time,
                                          img_type)
            call_page_shot.start()
            # NOTE(review): waits without a timeout until the sign file
            # shows up; this blocks forever if it is never created.
            shot_over_sign = local_time + '/shot_over_sign'
            while not os.path.exists(shot_over_sign):
                time.sleep(0.5)
            os.remove(shot_over_sign)
            print('shot over')

    def run(self):
        '''
        Process entry point: register the pid, mark the engine state as 2
        and run the Qt event loop with the Browser; the process exits with
        the event loop's return code.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'qt_crawler', 2)
        self.app = QApplication(sys.argv)
        self.br = Browser(self.task_id, self.get_protected_iter,
                          self.get_gray_iter, self.get_counterfeit_iter,
                          self.get_monitor_iter, self.mongo_operate,
                          self.update_running_state,
                          self.update_finish_state, self.mysql_handle,
                          self.run_start_time)
        sys.exit(self.app.exec_())
class StructureStart(multiprocessing.Process):
    '''
    Worker process that compares the page structure (web tree) of every gray
    URL of a task with the protected and counterfeit structures.
    '''

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid, structure_num_compare_k,
                 structure_num_compare_b, structure_area_compare_k,
                 structure_area_compare_b):
        '''
        Open the MySQL / Mongo handles, store the comparison parameters and
        load the task information.

        The four structure_*_compare_* values are passed unchanged to
        StructureCompare. message_other_engine, write_process_pid and
        remove_process_pid are callables supplied by the caller.
        '''
        super(StructureStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        self.structure_num_compare_k = structure_num_compare_k
        self.structure_num_compare_b = structure_num_compare_b
        self.structure_area_compare_k = structure_area_compare_k
        self.structure_area_compare_b = structure_area_compare_b
        self.mongo_db = mongo_db
        self.mongo_host = mongo_host
        self.mongo_port = mongo_port
        self.mongo_user = mongo_user
        self.mongo_password = mongo_password
        # initialisation
        self.run_start_time = 0
        self.structure_check_num = 0  # number of URLs checked
        self.structure_find_num = 0  # number of phishing URLs found
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()

    def read_task_info(self):
        '''
        Read the task information: start time, gray / monitor URL iterators
        and the web trees of the protected and counterfeit sites.
        '''
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']
        self.protected_dict = self.mysql_handle.get_all_protected_feature(
            self.mongo_operate.get_web_tree)
        self.counterfeit_dict = self.mysql_handle.get_all_counterfeit_feature(
            self.mongo_operate.get_web_tree)

    def _task_wheres(self):
        '''Return the WHERE description selecting this task's result row.'''
        return {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's'],
        }

    def update_running_state(self):
        '''
        Update the running counters (checked / found) in task_result.
        '''
        fields = {
            'structure_check_num': [self.structure_check_num, 'd'],
            'structure_find_num': [self.structure_find_num, 'd'],
        }
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_wheres(), 'update')

    def update_finished_state(self):
        '''
        Store the finished state (3), the run time and the final counters in
        task_result, then run the end-of-engine handling.
        '''
        run_time = int(time.time()) - int(self.run_start_time)
        fields = {
            'e_structure_state': [3, 'd'],
            'structure_run_time': [run_time, 's'],
            'structure_check_num': [self.structure_check_num, 'd'],
            'structure_find_num': [self.structure_find_num, 'd'],
        }
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_wheres(), 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        '''
        Send message 6 to engine '00' and unregister this process.

        When the message cannot be delivered (the call returns False) the
        task state is set to 0.
        '''
        # message to control
        send_result = self.message_other_engine(6, ['00'], self.task_id)
        if send_result is False:
            # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))
        self.remove_process_pid(self.task_id)

    def _match_reference(self, structure_compare, gray_url, gray_block_list,
                         reference_dict, url_keyword):
        '''
        Compare one gray page with every reference structure and record the
        first match.

        url_keyword is the keyword argument ('source_url' or
        'counterfeit_url') under which the matching reference URL is passed
        to undate_gray_list_check_result. Returns True when a match was
        recorded.
        '''
        for reference_url, reference_blocks in reference_dict.items():
            if reference_blocks == []:
                continue
            check_result = structure_compare.once_compare(
                reference_blocks, gray_block_list)
            if check_result == 1:
                self.structure_find_num += 1
                self.mysql_handle.undate_gray_list_check_result(
                    gray_url, 'structure', **{url_keyword: reference_url})
                self.mysql_handle.undate_task_result_check_result(
                    self.task_id, self.task_start_time, gray_url,
                    'structure')
                return True
        return False

    def run_structure_compare(self):
        '''
        Compare every gray URL with the protected and the counterfeit
        structures and update the running counters after each URL.

        Gray URLs without a stored web tree are skipped and not counted.
        '''
        structure_compare = StructureCompare(self.structure_num_compare_k,
                                             self.structure_num_compare_b,
                                             self.structure_area_compare_k,
                                             self.structure_area_compare_b)
        while True:
            # Only the iterator advance is guarded, so a StopIteration raised
            # inside the comparison is not mistaken for "no more URLs".
            try:
                gray_url = next(self.get_gray_iter)
            except StopIteration:
                break
            gray_block_list = self.mongo_operate.get_web_tree(
                gray_url, 'gray')
            # mongo does not have a tree for this url
            if gray_block_list is False or gray_block_list == []:
                continue
            # NOTE(review): a gray URL matching both a protected and a
            # counterfeit page increments structure_find_num twice --
            # confirm this is intended.
            self._match_reference(structure_compare, gray_url,
                                  gray_block_list, self.protected_dict,
                                  'source_url')
            self._match_reference(structure_compare, gray_url,
                                  gray_block_list, self.counterfeit_dict,
                                  'counterfeit_url')
            self.structure_check_num += 1
            self.update_running_state()

    def run(self):
        '''
        Process entry point: register the pid, mark the engine state as 2,
        run the comparison and store the finished state.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'structure', 2)
        self.run_structure_compare()
        self.update_finished_state()