Exemple #1
0
class FeatureSaveStart(multiprocessing.Process):
    '''
    Child process that stores web features for every saved URL of a task.

    Each URL produced by the task's protected, gray, counterfeit and monitor
    iterators is passed to ``mysql_handle.insert_web_feature``; the running
    count is written to the ``task_result`` table after every URL.
    '''

    def __init__(self, task_id, mysql_host, mysql_db,
                 mysql_user, mysql_password, mongo_db, mongo_host, mongo_port,
                 mongo_user, mongo_password, message_other_engine,
                 write_process_pid, remove_process_pid):
        '''
        Open the MySQL and Mongo handles and load the task's URL iterators.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are callables supplied by the caller; they are
        stored here and invoked from run() / engine_over_handle().
        '''
        super(FeatureSaveStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid

        # Run-time bookkeeping.
        self.run_start_time = 0
        self.save_num = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host,
                                           mongo_port, mongo_user,
                                           mongo_password)
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']

    def _task_row_filter(self):
        '''Return the ``wheres`` mapping selecting this task's result row.'''
        return {'task_id': [self.task_id, 'd'],
                'start_time': [self.task_start_time, 's']}

    def update_running_state(self, save_num):
        '''
        Write the number of features saved so far to ``task_result``.
        '''
        fields = {'feature_save_num': [save_num, 'd']}
        self.mysql_handle.require_post(
            'task_result', fields, self._task_row_filter(), 'update')

    def update_finish_state(self, save_num):
        '''
        Write the final state, count and elapsed seconds, then notify.

        The elapsed time is measured from ``run_start_time`` (set in run()).
        '''
        run_time = int(time.time() - self.run_start_time)
        # State value 3 was written as the octal literal ``03`` originally;
        # presumably it means "finished" -- confirm against the schema.
        fields = {'e_feature_save_state': [3, 'd'],
                  'feature_save_num': [save_num, 'd'],
                  'feature_save_run_time': [run_time, 's']}
        self.mysql_handle.require_post(
            'task_result', fields, self._task_row_filter(), 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        '''
        Report completion through ``message_other_engine`` and drop the pid.

        When the message callable returns exactly ``False`` the task state is
        updated with value 0 via ``mysql_handle.update_task_state``.
        '''
        send_result = self.message_other_engine(7, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write(
            '%s |*|engine win over|*|, task_id: %s\n' % (time.ctime(), self.task_id))

    def _iter_saved_urls(self):
        '''
        Yield ``(url, url_type)`` pairs, draining each URL iterator in turn.

        Order is protected, gray, counterfeit, monitor.  ``next()`` is used
        rather than a ``for`` loop so the sources only need to support the
        iterator ``next`` protocol.
        '''
        sources = (
            ('protected', self.get_protected_iter),
            ('gray', self.get_gray_iter),
            ('counterfeit', self.get_counterfeit_iter),
            ('monitor', self.get_monitor_iter),
        )
        for url_type, source in sources:
            while True:
                try:
                    url = next(source)
                except StopIteration:
                    break
                yield url, url_type

    def save_web_feature(self):
        '''
        Insert the feature row for every saved URL and update the counter.

        The target table is ``<url_type>_feature``.
        '''
        for url, url_type in self._iter_saved_urls():
            table_name = url_type + '_feature'
            self.mysql_handle.insert_web_feature(
                url, url_type, table_name, update_sign=True)
            self.save_num += 1
            self.update_running_state(self.save_num)

    def run(self):
        '''
        Process entry point: register the pid, save features, report finish.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'feature_save', 2)
        self.save_web_feature()
        self.update_finish_state(self.save_num)
Exemple #2
0
class View_Emd_start(multiprocessing.Process):
    '''
    Child process that runs the ViewEmd check over a task's gray URLs.

    Progress and the final state are written to the ``task_result`` table
    through ``mysql_handle``.
    '''

    def __init__(self, task_id, current_path, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        '''
        Open the MySQL and Mongo handles and load the task information.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are callables supplied by the caller.
        '''
        super(View_Emd_start, self).__init__()
        self.task_id = task_id
        self.task_start_time = ''
        self.user_id = ''
        self.view_protected_objectid = ''
        self.view_gray_objectid = ''
        self.view_counterfeit_objectid = ''
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)

        # NOTE(review): the ``current_path`` argument is ignored and
        # sys.path[0] is used instead -- confirm this is intended.
        self.current_path = sys.path[0]
        self.message_other_engine = message_other_engine
        # NOTE(review): write_process_pid is stored but never called by this
        # class, although remove_process_pid is -- confirm with the caller.
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        self.read_task_info()

    def read_task_info(self):
        '''
        Load the task start time, the URL iterators and the title features.
        '''
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']

        self.protected_title_dict = self.mysql_handle.get_all_protected_feature(
            self.mongo_operate.get_web_title)

        self.counterfeit_title_dict = self.mysql_handle.get_all_counterfeit_feature(
            self.mongo_operate.get_web_title)

    def _task_row_filter(self):
        '''Return the ``wheres`` mapping selecting this task's result row.'''
        return {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }

    def update_running_state(self, finish_num, view_find_num):
        '''
        Write the checked and found counters to ``task_result``.
        '''
        fields = {
            'view_check_num': [finish_num, 'd'],
            'view_find_num': [view_find_num, 'd']
        }
        self.mysql_handle.require_post(
            'task_result', fields, self._task_row_filter(), 'update')

    def engine_over_handle(self):
        '''
        Report completion through ``message_other_engine`` and drop the pid.

        When the message callable returns exactly ``False`` the task state is
        updated with value 0 via ``mysql_handle.update_task_state``.
        '''
        send_result = self.message_other_engine(6, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def update_finished_state(self, run_time, finish_num):
        '''
        Write the final state and run time to ``task_result``, then notify.

        ``finish_num`` is accepted for interface compatibility; it is not
        written by this method.
        '''
        # State value 3 was written as the octal literal ``03`` originally;
        # presumably it means "finished" -- confirm against the schema.
        fields = {
            'e_view_emd_state': [3, 'd'],
            'view_emd_run_time': [run_time, 's']
        }
        self.mysql_handle.require_post(
            'task_result', fields, self._task_row_filter(), 'update')
        self.engine_over_handle()

    def run(self):
        '''
        Process entry point: check every gray URL and report the result.
        '''
        finish_num = 0
        view_find_num = 0
        start_time = time.time()

        view_emd = ViewEmd(self.mysql_handle, self.mongo_operate, self.task_id,
                           self.task_start_time, self.protected_title_dict,
                           self.counterfeit_title_dict)
        while True:
            # Only exhaustion of the URL iterator ends the loop; a
            # StopIteration escaping from the checker is not swallowed.
            try:
                gray_url = next(self.get_gray_iter)
            except StopIteration:
                break
            view_find_num += view_emd.emdcalculate(gray_url)
            finish_num += 1
            self.update_running_state(finish_num, view_find_num)
        run_time = int(time.time()) - int(start_time)

        self.update_finished_state(run_time, finish_num)
Exemple #3
0
class View_Emd_start(multiprocessing.Process):
    '''
    Child process that runs the ViewEmd check over a task's gray URLs.

    Progress and the final state are written to the ``task_result`` table
    through ``mysql_handle``.
    '''

    def __init__(self, task_id, current_path, mysql_host, mysql_db, mysql_user, mysql_password,
                 mongo_db, mongo_host, mongo_port, mongo_user, mongo_password, message_other_engine,
                 write_process_pid, remove_process_pid):
        '''
        Open the MySQL and Mongo handles and load the task information.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are callables supplied by the caller.
        '''
        super(View_Emd_start, self).__init__()
        self.task_id = task_id
        self.task_start_time = ''
        self.user_id = ''
        self.view_protected_objectid = ''
        self.view_gray_objectid = ''
        self.view_counterfeit_objectid = ''
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.mongo_operate = Mongo_Operate(
            mongo_db, mongo_host, mongo_port, mongo_user, mongo_password)

        # NOTE(review): the ``current_path`` argument is ignored and
        # sys.path[0] is used instead -- confirm this is intended.
        self.current_path = sys.path[0]
        self.message_other_engine = message_other_engine
        # NOTE(review): write_process_pid is stored but never called by this
        # class, although remove_process_pid is -- confirm with the caller.
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        self.read_task_info()

    def read_task_info(self):
        '''
        Load the task start time, the URL iterators and the title features.
        '''
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']

        self.protected_title_dict = self.mysql_handle.get_all_protected_feature(
            self.mongo_operate.get_web_title)

        self.counterfeit_title_dict = self.mysql_handle.get_all_counterfeit_feature(
            self.mongo_operate.get_web_title)

    def _task_row_filter(self):
        '''Return the ``wheres`` mapping selecting this task's result row.'''
        return {'task_id': [self.task_id, 'd'],
                'start_time': [self.task_start_time, 's']}

    def update_running_state(self, finish_num, view_find_num):
        '''
        Write the checked and found counters to ``task_result``.
        '''
        fields = {
            'view_check_num': [finish_num, 'd'], 'view_find_num': [view_find_num, 'd']}
        self.mysql_handle.require_post(
            'task_result', fields, self._task_row_filter(), 'update')

    def engine_over_handle(self):
        '''
        Report completion through ``message_other_engine`` and drop the pid.

        When the message callable returns exactly ``False`` the task state is
        updated with value 0 via ``mysql_handle.update_task_state``.
        '''
        send_result = self.message_other_engine(6, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write(
            '%s |*|engine win over|*|, task_id: %s\n' % (time.ctime(), self.task_id))

    def update_finished_state(self, run_time, finish_num):
        '''
        Write the final state and run time to ``task_result``, then notify.

        ``finish_num`` is accepted for interface compatibility; it is not
        written by this method.
        '''
        # State value 3 was written as the octal literal ``03`` originally;
        # presumably it means "finished" -- confirm against the schema.
        fields = {'e_view_emd_state': [3, 'd'],
                  'view_emd_run_time': [run_time, 's']}
        self.mysql_handle.require_post(
            'task_result', fields, self._task_row_filter(), 'update')
        self.engine_over_handle()

    def run(self):
        '''
        Process entry point: check every gray URL and report the result.
        '''
        finish_num = 0
        view_find_num = 0
        start_time = time.time()

        view_emd = ViewEmd(self.mysql_handle, self.mongo_operate, self.task_id, self.task_start_time,
                           self.protected_title_dict, self.counterfeit_title_dict)
        while True:
            # Only exhaustion of the URL iterator ends the loop; a
            # StopIteration escaping from the checker is not swallowed.
            try:
                gray_url = next(self.get_gray_iter)
            except StopIteration:
                break
            view_find_num += view_emd.emdcalculate(gray_url)
            finish_num += 1
            self.update_running_state(finish_num, view_find_num)
        run_time = int(time.time()) - int(start_time)

        self.update_finished_state(run_time, finish_num)
Exemple #4
0
class Title_start(multiprocessing.Process):
    '''
    Child process that runs the title check over a task's gray URLs.

    Every gray URL is checked first with a TitleMain built from the protected
    features and then with one built from the counterfeit features; counters
    are written to the ``task_result`` table through ``mysql_handle``.
    '''

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        '''
        Open the MySQL and Mongo handles and load the task information.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are callables supplied by the caller.
        '''
        super(Title_start, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.task_start_time = ''
        self.user_id = ''
        self.gary_objectid = ''
        self.protected_list_id = []
        self.get_protect_dict = {}
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()
        self.run_start_time = 0
        self.title_check_num = 0  # number of checks performed
        self.title_find_num = 0  # sum of the values returned by title_run

        # Number of checks between two progress writes to the database.
        self.once_update_num = 1

    def read_task_info(self):
        '''
        Load the task start time, the URL iterators and the feature dicts.
        '''
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']
        self.protected_title_dict = self.mysql_handle.get_all_protected_feature(
            self.mongo_operate.get_web_title)
        self.protected_text_dict = self.mysql_handle.get_all_protected_feature(
            self.mongo_operate.get_web_text)

        self.counterfeit_title_dict = self.mysql_handle.get_all_counterfeit_feature(
            self.mongo_operate.get_web_title)
        self.counterfeit_text_dict = self.mysql_handle.get_all_counterfeit_feature(
            self.mongo_operate.get_web_text)

    def _task_row_filter(self):
        '''Return the ``wheres`` mapping selecting this task's result row.'''
        return {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }

    def update_running_state(self, title_check_num, title_find_num):
        '''
        Write the checked and found counters to ``task_result``.
        '''
        fields = {
            'title_check_num': [title_check_num, 'd'],
            'title_find_num': [title_find_num, 'd']
        }
        self.mysql_handle.require_post(
            'task_result', fields, self._task_row_filter(), 'update')

    def update_finished_state(self):
        '''
        Write the final state, run time and counters, then notify.

        The run time is measured in whole seconds from ``run_start_time``.
        '''
        run_time = int(time.time()) - int(self.run_start_time)
        # State value 3 was written as the octal literal ``03`` originally;
        # presumably it means "finished" -- confirm against the schema.
        fields = {
            'e_title_state': [3, 'd'],
            'title_run_time': [run_time, 's'],
            'title_check_num': [self.title_check_num, 'd'],
            'title_find_num': [self.title_find_num, 'd']
        }
        self.mysql_handle.require_post(
            'task_result', fields, self._task_row_filter(), 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        '''
        Report completion through ``message_other_engine`` and drop the pid.

        When the message callable returns exactly ``False`` the task state is
        updated with value 0 via ``mysql_handle.update_task_state``.
        '''
        # message to control
        send_result = self.message_other_engine(6, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))
        self.remove_process_pid(self.task_id)

    def _record_check(self, check_result, pending):
        '''
        Fold one check result into the counters and return the new backlog.

        ``pending`` is the number of checks since the last progress write;
        once it reaches ``once_update_num`` the counters are written and the
        backlog restarts at 0.
        '''
        self.title_find_num += check_result
        self.title_check_num += 1
        pending += 1
        if pending >= self.once_update_num:
            self.update_running_state(self.title_check_num,
                                      self.title_find_num)
            return 0
        return pending

    def run(self):
        '''
        Process entry point: run both title passes and report the result.
        '''
        self.run_start_time = time.time()
        self.write_process_pid(self.task_id)
        sys.stdout.write('%s  |*|title engine start|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

        title_main = TitleMain(self.task_id, self.task_start_time,
                               self.protected_title_dict,
                               self.protected_text_dict, self.mongo_operate,
                               self.mysql_handle)
        pending = 0
        seen_gray_urls = []  # kept so the second pass can revisit them
        while True:
            # Only exhaustion of the URL iterator ends the loop; a
            # StopIteration escaping from the checker is not swallowed.
            try:
                gray_url = next(self.get_gray_iter)
            except StopIteration:
                break
            seen_gray_urls.append(gray_url)
            pending = self._record_check(title_main.title_run(gray_url),
                                         pending)
        title_main2 = TitleMain(self.task_id, self.task_start_time,
                                self.counterfeit_title_dict,
                                self.counterfeit_text_dict, self.mongo_operate,
                                self.mysql_handle, 'counterfeit')
        # Last-seen first, matching the previous pop()-based loop, but
        # without an ``except IndexError`` that could hide checker errors.
        for gray_url in reversed(seen_gray_urls):
            pending = self._record_check(title_main2.title_run(gray_url),
                                         pending)
        self.update_finished_state()
Exemple #5
0
class QtCrawler(multiprocessing.Process):
    '''
    Child process that drives the Qt ``Browser`` crawler for one task.

    Progress and the final state are written to the ``task_result`` table
    through ``mysql_handle``.
    '''

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        '''
        Open the MySQL and Mongo handles and load the task's URL iterators.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are callables supplied by the caller.
        '''
        super(QtCrawler, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid

        # Run-time bookkeeping.
        self.run_start_time = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']

    def _task_row_filter(self):
        '''Return the ``wheres`` mapping selecting this task's result row.'''
        return {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }

    def update_running_state(self, crawler_num):
        '''
        Write the number of crawled pages so far to ``task_result``.
        '''
        fields = {'qt_crawler_num': [crawler_num, 'd']}
        self.mysql_handle.require_post(
            'task_result', fields, self._task_row_filter(), 'update')

    def update_finish_state(self, crawler_num, run_time):
        '''
        Write the final state, count and run time, then notify.
        '''
        # State value 3 was written as the octal literal ``03`` originally;
        # presumably it means "finished" -- confirm against the schema.
        fields = {
            'e_qt_crawler_state': [3, 'd'],
            'qt_crawler_num': [crawler_num, 'd'],
            'qt_crawler_run_time': [run_time, 's']
        }
        self.mysql_handle.require_post(
            'task_result', fields, self._task_row_filter(), 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        '''
        Report completion through ``message_other_engine`` and drop the pid.

        When the message callable returns exactly ``False`` the task state is
        updated with value 0 via ``mysql_handle.update_task_state``.
        '''
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))
        send_result = self.message_other_engine(4, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)

    def _iter_saved_urls(self):
        '''
        Yield ``(url, url_type)`` pairs, draining each URL iterator in turn.

        Order is protected, gray, counterfeit, monitor.  ``next()`` is used
        rather than a ``for`` loop so the sources only need to support the
        iterator ``next`` protocol.
        '''
        sources = (
            ('protected', self.get_protected_iter),
            ('gray', self.get_gray_iter),
            ('counterfeit', self.get_counterfeit_iter),
            ('monitor', self.get_monitor_iter),
        )
        for url_type, source in sources:
            while True:
                try:
                    url = next(source)
                except StopIteration:
                    break
                yield url, url_type

    def page_shot(self):
        '''
        Take a ``webpage`` screenshot for every saved URL lacking one.

        URLs whose save directory or ``main.html`` is missing are reported on
        stderr and skipped.
        '''
        # NOTE(review): read_crawler_config is not defined in this class, so
        # this call raises AttributeError unless it is provided elsewhere;
        # page_shot is currently not called from this class.
        self.read_crawler_config()
        for url, url_type in self._iter_saved_urls():
            sys.stdout.write('shot:  %s\n' % (url,))

            web_save_path = WebSavePath()
            local_html, local_time = web_save_path.get_html_path_abs(
                url, url_type)
            if local_time is None:
                sys.stderr.write(
                    '%s  insert_web_info, web not be saved: %s\n' %
                    (time.ctime(), url))
                continue
            # webpage blockpage
            webpage_path = local_time + '/webpage.jpeg'
            img_type = 'webpage'  # img name : webpage.jpeg
            if os.path.exists(webpage_path):
                continue
            main_html_path = local_time + '/main.html'
            if not os.path.exists(main_html_path):
                sys.stderr.write(
                    '%s  insert_web_info, main.html not be exist: %s\n' %
                    (time.ctime(), url))
                continue
            call_page_shot = CallPageShot(main_html_path, local_time,
                                          img_type)
            call_page_shot.start()
            # Poll for the marker file, then remove it.
            # NOTE(review): there is no timeout; if the marker never appears
            # this loop does not end.
            over_sign = local_time + '/shot_over_sign'
            while not os.path.exists(over_sign):
                time.sleep(0.5)
            os.remove(over_sign)
        sys.stdout.write('shot over\n')

    def run(self):
        '''
        Process entry point: register the pid and start the Qt browser.

        Exits the process with the return value of the Qt event loop.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()

        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time,
                                              'qt_crawler', 2)
        self.app = QApplication(sys.argv)
        self.br = Browser(self.task_id, self.get_protected_iter,
                          self.get_gray_iter, self.get_counterfeit_iter,
                          self.get_monitor_iter, self.mongo_operate,
                          self.update_running_state, self.update_finish_state,
                          self.mysql_handle, self.run_start_time)
        sys.exit(self.app.exec_())
Exemple #6
0
class QtCrawler(multiprocessing.Process):
    '''
    Child process that drives the Qt ``Browser`` crawler for one task.

    Progress and the final state are written to the ``task_result`` table
    through ``mysql_handle``.
    '''

    def __init__(self, task_id, mysql_host, mysql_db,
                 mysql_user, mysql_password, mongo_db, mongo_host, mongo_port,
                 mongo_user, mongo_password, message_other_engine,
                 write_process_pid, remove_process_pid):
        '''
        Open the MySQL and Mongo handles and load the task's URL iterators.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are callables supplied by the caller.
        '''
        super(QtCrawler, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid

        # Run-time bookkeeping.
        self.run_start_time = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host,
                                           mongo_port, mongo_user,
                                           mongo_password)
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']

    def _task_row_filter(self):
        '''Return the ``wheres`` mapping selecting this task's result row.'''
        return {'task_id': [self.task_id, 'd'],
                'start_time': [self.task_start_time, 's']}

    def update_running_state(self, crawler_num):
        '''
        Write the number of crawled pages so far to ``task_result``.
        '''
        fields = {'qt_crawler_num': [crawler_num, 'd']}
        self.mysql_handle.require_post(
            'task_result', fields, self._task_row_filter(), 'update')

    def update_finish_state(self, crawler_num, run_time):
        '''
        Write the final state, count and run time, then notify.
        '''
        # State value 3 was written as the octal literal ``03`` originally;
        # presumably it means "finished" -- confirm against the schema.
        fields = {'e_qt_crawler_state': [3, 'd'],
                  'qt_crawler_num': [crawler_num, 'd'],
                  'qt_crawler_run_time': [run_time, 's']}
        self.mysql_handle.require_post(
            'task_result', fields, self._task_row_filter(), 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        '''
        Report completion through ``message_other_engine`` and drop the pid.

        When the message callable returns exactly ``False`` the task state is
        updated with value 0 via ``mysql_handle.update_task_state``.
        '''
        sys.stdout.write(
            '%s |*|engine win over|*|, task_id: %s\n' % (time.ctime(), self.task_id))
        send_result = self.message_other_engine(4, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)

    def _iter_saved_urls(self):
        '''
        Yield ``(url, url_type)`` pairs, draining each URL iterator in turn.

        Order is protected, gray, counterfeit, monitor.  ``next()`` is used
        rather than a ``for`` loop so the sources only need to support the
        iterator ``next`` protocol.
        '''
        sources = (
            ('protected', self.get_protected_iter),
            ('gray', self.get_gray_iter),
            ('counterfeit', self.get_counterfeit_iter),
            ('monitor', self.get_monitor_iter),
        )
        for url_type, source in sources:
            while True:
                try:
                    url = next(source)
                except StopIteration:
                    break
                yield url, url_type

    def page_shot(self):
        '''
        Take a ``webpage`` screenshot for every saved URL lacking one.

        URLs whose save directory or ``main.html`` is missing are reported on
        stderr and skipped.
        '''
        # NOTE(review): read_crawler_config is not defined in this class, so
        # this call raises AttributeError unless it is provided elsewhere;
        # page_shot is currently not called from this class.
        self.read_crawler_config()
        for url, url_type in self._iter_saved_urls():
            sys.stdout.write('shot:  %s\n' % (url,))

            web_save_path = WebSavePath()
            local_html, local_time = web_save_path.get_html_path_abs(
                url, url_type)
            if local_time is None:
                sys.stderr.write('%s  insert_web_info, web not be saved: %s\n' %
                                 (time.ctime(), url))
                continue
            # webpage blockpage
            webpage_path = local_time + '/webpage.jpeg'
            img_type = 'webpage'  # img name : webpage.jpeg
            if os.path.exists(webpage_path):
                continue
            main_html_path = local_time + '/main.html'
            if not os.path.exists(main_html_path):
                sys.stderr.write('%s  insert_web_info, main.html not be exist: %s\n' %
                                 (time.ctime(), url))
                continue
            call_page_shot = CallPageShot(
                main_html_path, local_time, img_type)
            call_page_shot.start()
            # Poll for the marker file, then remove it.
            # NOTE(review): there is no timeout; if the marker never appears
            # this loop does not end.
            over_sign = local_time + '/shot_over_sign'
            while not os.path.exists(over_sign):
                time.sleep(0.5)
            os.remove(over_sign)
        sys.stdout.write('shot over\n')

    def run(self):
        '''
        Process entry point: register the pid and start the Qt browser.

        Exits the process with the return value of the Qt event loop.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()

        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'qt_crawler', 2)
        self.app = QApplication(sys.argv)
        self.br = Browser(self.task_id, self.get_protected_iter, self.get_gray_iter,
                          self.get_counterfeit_iter, self.get_monitor_iter,
                          self.mongo_operate, self.update_running_state,
                          self.update_finish_state, self.mysql_handle, self.run_start_time)
        sys.exit(self.app.exec_())
Exemple #7
0
class StructureStart(multiprocessing.Process):
    """Child process that runs the page-structure comparison for one task.

    Each gray URL of the task is compared, through ``StructureCompare``,
    with the block trees of every protected and counterfeit page. Matches
    and progress counters are written to the ``task_result`` table in MySQL.
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid, structure_num_compare_k,
                 structure_num_compare_b, structure_area_compare_k,
                 structure_area_compare_b):
        """Store the configuration, open the DB handles and load task data.

        Args:
            task_id: id of the task this process works on.
            mysql_host, mysql_db, mysql_user, mysql_password: connection
                settings handed to ``MysqlOperate``.
            mongo_db, mongo_host, mongo_port, mongo_user, mongo_password:
                connection settings handed to ``Mongo_Operate`` (also kept
                as attributes).
            message_other_engine: callable used to notify the control
                engine; invoked as ``message_other_engine(6, ['00'],
                task_id)`` when the engine is done.
            write_process_pid: callable invoked with ``task_id`` at the
                start of ``run``.
            remove_process_pid: callable invoked with ``task_id`` once the
                engine has finished.
            structure_num_compare_k, structure_num_compare_b,
            structure_area_compare_k, structure_area_compare_b: forwarded
                unchanged to ``StructureCompare``.
                NOTE(review): presumably linear threshold coefficients --
                confirm against StructureCompare.
        """
        super(StructureStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        self.structure_num_compare_k = structure_num_compare_k
        self.structure_num_compare_b = structure_num_compare_b
        self.structure_area_compare_k = structure_area_compare_k
        self.structure_area_compare_b = structure_area_compare_b
        self.mongo_db = mongo_db
        self.mongo_host = mongo_host
        self.mongo_port = mongo_port
        self.mongo_user = mongo_user
        self.mongo_password = mongo_password

        # Initial state.
        self.run_start_time = 0
        self.structure_check_num = 0  # number of gray URLs checked
        self.structure_find_num = 0  # number of phishing URLs detected
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()

    def read_task_info(self):
        """Load the task start time, URL iterators and reference features."""
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']
        self.protected_dict = self.mysql_handle.get_all_protected_feature(
            self.mongo_operate.get_web_tree)
        self.counterfeit_dict = self.mysql_handle.get_all_counterfeit_feature(
            self.mongo_operate.get_web_tree)

    def update_running_state(self):
        """Write the current check/find counters to MySQL while running."""
        table_name = 'task_result'
        fields = {
            'structure_check_num': [self.structure_check_num, 'd'],
            'structure_find_num': [self.structure_find_num, 'd'],
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        self.mysql_handle.require_post(table_name, fields, wheres, 'update')

    def update_finished_state(self):
        """Write the final state, run time and counters, then wrap up.

        After the MySQL update, ``engine_over_handle`` notifies the control
        engine and unregisters this process.
        """
        run_time = int(time.time()) - int(self.run_start_time)
        table_name = 'task_result'
        fields = {
            # Plain ``3`` instead of the old ``03``: same value, but the
            # leading-zero form is a syntax error on Python 3.
            # NOTE(review): presumably 3 means "finished" -- confirm.
            'e_structure_state': [3, 'd'],
            'structure_run_time': [run_time, 's'],
            'structure_check_num': [self.structure_check_num, 'd'],
            'structure_find_num': [self.structure_find_num, 'd']
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        self.mysql_handle.require_post(table_name, fields, wheres, 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        """Notify the control engine and remove this process's pid entry."""
        # Message to the control engine.
        send_result = self.message_other_engine(6, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))
        self.remove_process_pid(self.task_id)

    def _match_gray_against(self, structure_compare, gray_url,
                            gray_block_list, known_features, url_keyword):
        """Compare one gray page with a set of known pages; record a hit.

        Stops at the first known page for which ``once_compare`` returns 1.
        On a hit, ``structure_find_num`` is incremented and both the gray
        list and the task result are updated in MySQL.

        Args:
            structure_compare: object providing ``once_compare(known, gray)``.
            gray_url: URL of the gray page being checked.
            gray_block_list: block tree of the gray page.
            known_features: mapping of known URL -> block tree.
            url_keyword: keyword under which the matching known URL is
                passed to ``undate_gray_list_check_result``
                ('source_url' or 'counterfeit_url').

        Returns:
            True if a match was found and recorded, otherwise False.
        """
        for known_url, known_block_list in known_features.items():
            if known_block_list == []:
                continue
            if structure_compare.once_compare(
                    known_block_list, gray_block_list) != 1:
                continue
            self.structure_find_num += 1
            self.mysql_handle.undate_gray_list_check_result(
                gray_url, 'structure', **{url_keyword: known_url})
            self.mysql_handle.undate_task_result_check_result(
                self.task_id, self.task_start_time, gray_url, 'structure')
            return True
        return False

    def run_structure_compare(self):
        """Check every gray URL against protected and counterfeit pages.

        Gray URLs without a stored tree (``False`` or ``[]``) are skipped
        and do not count as checked. After each checked URL the running
        counters are pushed to MySQL.
        """
        structure_compare = StructureCompare(self.structure_num_compare_k,
                                             self.structure_num_compare_b,
                                             self.structure_area_compare_k,
                                             self.structure_area_compare_b)
        while True:
            # Only the iterator advance is guarded, so a StopIteration
            # leaking from a DB/compare call is not mistaken for "done".
            try:
                gray_url = next(self.get_gray_iter)
            except StopIteration:
                break

            gray_block_list = self.mongo_operate.get_web_tree(
                gray_url, 'gray')
            # Mongo does not have a tree for this URL.
            if gray_block_list is False or gray_block_list == []:
                continue

            # NOTE(review): a gray URL matching both a protected and a
            # counterfeit page increments structure_find_num twice --
            # confirm this is intended.
            self._match_gray_against(structure_compare, gray_url,
                                     gray_block_list, self.protected_dict,
                                     'source_url')
            self._match_gray_against(structure_compare, gray_url,
                                     gray_block_list, self.counterfeit_dict,
                                     'counterfeit_url')

            self.structure_check_num += 1
            self.update_running_state()

    def run(self):
        """Register the process, run the comparison and store the result."""
        # Write the child process pid to the engine pids.
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time,
                                              'structure', 2)
        self.run_structure_compare()
        self.update_finished_state()