コード例 #1
0
ファイル: feature_save_start.py プロジェクト: wyl-hit/job
class FeatureSaveStart(multiprocessing.Process):
    """Worker process that stores a web feature for every URL saved by a task.

    The constructor opens the MySQL/Mongo helpers and fetches four URL
    iterators (protected, gray, counterfeit, monitor) for the task; ``run``
    drains them in that order, inserting one feature row per URL and
    updating the task's progress counter after each insert.
    """

    def __init__(self, task_id, mysql_host, mysql_db,
                 mysql_user, mysql_password, mongo_db, mongo_host, mongo_port,
                 mongo_user, mongo_password, message_other_engine,
                 write_process_pid, remove_process_pid):
        """Create the DB helpers and load the task's saved-URL iterators.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are callables supplied by the caller; they are
        stored and invoked later from ``run`` / ``engine_over_handle``.
        """
        super(FeatureSaveStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid

        # Initial state
        self.task_start_time = ''
        self.run_start_time = 0
        self.save_num = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host,
                                           mongo_port, mongo_user,
                                           mongo_password)
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        # read_saved_urls is indexed below as a mapping holding one iterator
        # per URL category.
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']

    def _task_result_wheres(self):
        """Return the WHERE spec selecting this task run's task_result row."""
        return {'task_id': [self.task_id, 'd'],
                'start_time': [self.task_start_time, 's']}

    def update_running_state(self, save_num):
        """Write the running count of saved features to ``task_result``."""
        fields = {'feature_save_num': [save_num, 'd']}
        self.mysql_handle.require_post(
            'task_result', fields, self._task_result_wheres(), 'update')

    def update_finish_state(self, save_num):
        """Record the final state, count and run time, then notify control."""
        run_time = int(time.time() - self.run_start_time)
        # Was the octal literal ``03``: same value (3) on Python 2, but a
        # SyntaxError on Python 3.
        fields = {'e_feature_save_state': [3, 'd'],
                  'feature_save_num': [save_num, 'd'],
                  'feature_save_run_time': [run_time, 's']}
        self.mysql_handle.require_post(
            'task_result', fields, self._task_result_wheres(), 'update')
        self.engine_over_handle()

    def engine_over_handle(self):
        """Tell the control engine this engine finished and drop its pid.

        NOTE(review): the meaning of message code 7 and payload ``['00']``
        is defined by ``message_other_engine`` outside this view.
        """
        send_result = self.message_other_engine(7, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write(
            '%s |*|engine win over|*|, task_id: %s\n' % (time.ctime(), self.task_id))

    def _next_saved_url(self):
        """Return ``(url, url_type)`` for the next pending URL, or ``None``.

        The iterators are tried in the fixed order protected, gray,
        counterfeit, monitor on every call, exactly as the original nested
        try/except chain did.
        """
        sources = (
            ('protected', self.get_protected_iter),
            ('gray', self.get_gray_iter),
            ('counterfeit', self.get_counterfeit_iter),
            ('monitor', self.get_monitor_iter),
        )
        for url_type, url_iter in sources:
            try:
                # Builtin next() works on Python 2.6+ and Python 3.
                return next(url_iter), url_type
            except StopIteration:
                continue
        return None

    def save_web_feature(self):
        """Insert a feature row for every saved URL and track progress."""
        while True:
            entry = self._next_saved_url()
            if entry is None:
                break
            url, url_type = entry
            table_name = url_type + '_feature'
            self.mysql_handle.insert_web_feature(
                url, url_type, table_name, update_sign=True)
            self.save_num += 1
            self.update_running_state(self.save_num)

    def run(self):
        """Process entry point: register pid, save features, report finish."""
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'feature_save', 2)
        self.save_web_feature()
        self.update_finish_state(self.save_num)
コード例 #2
0
ファイル: web_save_start.py プロジェクト: wyl-hit/job
class WebSavestart(multiprocessing.Process):
    """Worker process that drives the ``WebSave`` download engine for a task.

    The constructor reads the task description from MySQL (URL lists per
    category plus the filtered gray list stored in Mongo); ``run`` hands
    those to ``WebSave`` together with the two progress callbacks defined
    here.
    """

    # URLs that update_finished_state always reports as saved gray URLs.
    # NOTE(review): these look like leftover demo/debug data; because they
    # are always appended, the "nothing saved" branch below never runs.
    # Confirm whether they should be removed.
    _FORCED_GRAY_URLS = ('http://cpuzt.cc/',
                         'http://www.138.gg/',
                         'http://www.bjstkc.com/')

    def __init__(self, task_id, mysql_host, mysql_db,
                 mysql_user, mysql_password, mongo_db, mongo_host, mongo_port,
                 mongo_user, mongo_password, message_other_engine,
                 write_process_pid, remove_process_pid):
        """Create the DB helpers and load the task information.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are caller-supplied callables used later.
        """
        super(WebSavestart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid

        # Initial state
        self.task_start_time = ''
        self.user_id = ''
        self.protected_urls = []
        self.counterfeit_urls = []
        self.gray_urls = []
        self.monitor_urls = []
        self.url_num = 0
        self.gary_objectid = ''
        self.file_context = ''
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host,
                                           mongo_port, mongo_user,
                                           mongo_password)
        self.read_task_info()

    def _task_result_wheres(self):
        """Return the WHERE spec selecting this task run's task_result row."""
        return {'task_id': [self.task_id, 'd'],
                'start_time': [self.task_start_time, 's']}

    def _read_list_urls(self, table_name, id_list):
        """Return the UTF-8 encoded URLs of a '-'-separated id string.

        ``None`` or ``''`` yields an empty list; ids whose row lookup
        returns ``False`` are skipped.
        """
        urls = []
        if id_list is None or id_list == '':
            return urls
        fields = ['url']
        for record_id in id_list.split('-'):
            wheres = {'id': [int(record_id), 'd']}
            row = self.mysql_handle.require_get(
                table_name, fields, wheres, 'select', 'one')
            if row is False:
                continue
            urls.append(row['url'].encode('utf-8'))
        return urls

    def read_task_info(self):
        """Read the task row, its URL lists and the filtered gray list.

        Terminates the current process via ``os._exit(0)`` when the task
        row does not exist.
        """
        fields = ['last_time', 'user_id', 'protected_id', 'gray_id',
                  'counterfeit_id', 'monitor_id']
        wheres = {'task_id': [self.task_id, 'd']}
        task_info = self.mysql_handle.require_get(
            'task_info', fields, wheres, 'select', 'one')
        if task_info is False:
            sys.stderr.write(
                '%s  task no exist, task_id: %s\n' % (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']

        # Same lookup order as before: protected, counterfeit, gray, monitor.
        self.protected_urls.extend(self._read_list_urls(
            'protected_list', task_info['protected_id']))
        self.counterfeit_urls.extend(self._read_list_urls(
            'counterfeit_list', task_info['counterfeit_id']))
        self.gray_urls.extend(self._read_list_urls(
            'gray_list', task_info['gray_id']))
        self.monitor_urls.extend(self._read_list_urls(
            'monitor_list', task_info['monitor_id']))

        # Suspected URLs left over by the filtrate step; default to none.
        self.get_gray_iter = iter([])
        self.gray_url_num = 0
        row = self.mysql_handle.require_get(
            'task_result', ['filtrate_objectid'], self._task_result_wheres(),
            'select', 'one')
        if row is not False:
            self.gary_objectid = row['filtrate_objectid']
            if self.gary_objectid is not None:
                self.gray_url_num = self.mongo_operate.get_gray_num(
                    self.gary_objectid)
                self.gary_objectid = self.mongo_operate.expand_gray_list(
                    self.gary_objectid)
                self.get_gray_iter = self.mongo_operate.get_gray_list(
                    self.gary_objectid)
        self.url_num = (self.gray_url_num +
                        len(self.protected_urls) + len(self.gray_urls) +
                        len(self.counterfeit_urls) + len(self.monitor_urls))

    def update_running_state(self, saved_num, request_num):
        """Write the running saved/requested counters to ``task_result``."""
        fields = {'web_save_num': [saved_num, 'd'],
                  'web_request_num': [request_num, 'd']}
        self.mysql_handle.require_post(
            'task_result', fields, self._task_result_wheres(), 'update')

    def _store_saved_urls(self, urls, gray_name):
        """Store ``urls`` as a new Mongo list; return its id, '' if empty."""
        if not urls:
            return ''
        object_id = self.mongo_operate.create_gray(
            gray_name=gray_name, gray_type='websave', usr_id=self.user_id)
        self.mongo_operate.add_gray_list(urls, object_id)
        return object_id

    def add_saved_ulr_mongo(self, ulist):
        """Store the saved URLs in Mongo, grouped by category.

        Each item of ``ulist`` is indexed as ``[url, category]``; items
        whose category is not one of gray/protected/counterfeit/monitor are
        ignored.  The resulting object ids (or '') are kept on ``self``.
        """
        buckets = {'gray': [], 'protected': [],
                   'counterfeit': [], 'monitor': []}
        for item in ulist:
            bucket = buckets.get(item[1])
            if bucket is not None:
                bucket.append(item[0])
        self.save_gray_objectID = self._store_saved_urls(
            buckets['gray'], 'save_gray_urls')
        self.save_protected_objectID = self._store_saved_urls(
            buckets['protected'], 'saved_protected_urls')
        self.save_counterfeit_objectID = self._store_saved_urls(
            buckets['counterfeit'], 'saved_counterfeit_urls')
        self.save_monitor_objectID = self._store_saved_urls(
            buckets['monitor'], 'saved_monitor_urls')

    def update_finished_state(self, ulist, run_time, request_num):
        """Record the final result in MySQL and notify the control engine.

        ``ulist`` is mutated: the forced gray URLs are appended when absent.
        """
        for forced_url in self._FORCED_GRAY_URLS:
            entry = [forced_url, 'gray']
            if entry not in ulist:
                ulist.append(entry)
        self.add_saved_ulr_mongo(ulist)
        saved_num = len(ulist)
        # Was the octal literal ``03``: same value (3) on Python 2, but a
        # SyntaxError on Python 3.
        fields = {'e_web_save_state': [3, 'd'],
                  'web_save_num': [saved_num, 'd'],
                  'web_request_num': [request_num, 'd'],
                  'web_save_run_time': [run_time, 's'],
                  'save_protected_objectid': [self.save_protected_objectID, 's'],
                  'save_counterfeit_objectid': [self.save_counterfeit_objectID, 's'],
                  'save_monitor_objectid': [self.save_monitor_objectID, 's'],
                  'save_gray_objectid': [self.save_gray_objectID, 's']}
        self.mysql_handle.require_post(
            'task_result', fields, self._task_result_wheres(), 'update')

        # NOTE(review): unreachable while the forced URLs are appended above.
        if not ulist:
            send_result = self.message_other_engine(9, ['00'], self.task_id)
        else:
            send_result = self.message_other_engine(3, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write(
            '%s |*|web_save engine win over|*|, task_id: %s\n' % (time.ctime(), self.task_id))

    def run(self):
        """Process entry point: register pid and start the download engine."""
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'web_save', 2)
        engine = WebSave(self.task_id, self.protected_urls, self.get_gray_iter, self.gray_urls,
                         self.counterfeit_urls, self.monitor_urls, self.url_num,
                         self.update_running_state, self.update_finished_state,
                         self.mongo_operate)
        engine.download()
        reactor.run(installSignalHandlers=0)
コード例 #3
0
class WebSavestart(multiprocessing.Process):
    """Worker process that drives the ``WebSave`` download engine for a task.

    The constructor reads the task description from MySQL (URL lists per
    category plus the filtered gray list stored in Mongo); ``run`` hands
    those to ``WebSave`` together with the two progress callbacks defined
    here.
    """

    # URLs that update_finished_state always reports as saved gray URLs.
    # NOTE(review): these look like leftover demo/debug data; because they
    # are always appended, the "nothing saved" branch below never runs.
    # Confirm whether they should be removed.
    _FORCED_GRAY_URLS = ('http://cpuzt.cc/',
                         'http://www.138.gg/',
                         'http://www.bjstkc.com/')

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        """Create the DB helpers and load the task information.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are caller-supplied callables used later.
        """
        super(WebSavestart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid

        # Initial state
        self.task_start_time = ''
        self.user_id = ''
        self.protected_urls = []
        self.counterfeit_urls = []
        self.gray_urls = []
        self.monitor_urls = []
        self.url_num = 0
        self.gary_objectid = ''
        self.file_context = ''
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()

    def _task_result_wheres(self):
        """Return the WHERE spec selecting this task run's task_result row."""
        return {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }

    def _read_list_urls(self, table_name, id_list):
        """Return the UTF-8 encoded URLs of a '-'-separated id string.

        ``None`` or ``''`` yields an empty list; ids whose row lookup
        returns ``False`` are skipped.
        """
        urls = []
        if id_list is None or id_list == '':
            return urls
        fields = ['url']
        for record_id in id_list.split('-'):
            wheres = {'id': [int(record_id), 'd']}
            row = self.mysql_handle.require_get(table_name, fields, wheres,
                                                'select', 'one')
            if row is False:
                continue
            urls.append(row['url'].encode('utf-8'))
        return urls

    def read_task_info(self):
        """Read the task row, its URL lists and the filtered gray list.

        Terminates the current process via ``os._exit(0)`` when the task
        row does not exist.
        """
        fields = [
            'last_time', 'user_id', 'protected_id', 'gray_id',
            'counterfeit_id', 'monitor_id'
        ]
        wheres = {'task_id': [self.task_id, 'd']}
        task_info = self.mysql_handle.require_get('task_info', fields, wheres,
                                                  'select', 'one')
        if task_info is False:
            sys.stderr.write('%s  task no exist, task_id: %s\n' %
                             (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']

        # Same lookup order as before: protected, counterfeit, gray, monitor.
        self.protected_urls.extend(
            self._read_list_urls('protected_list', task_info['protected_id']))
        self.counterfeit_urls.extend(
            self._read_list_urls('counterfeit_list',
                                 task_info['counterfeit_id']))
        self.gray_urls.extend(
            self._read_list_urls('gray_list', task_info['gray_id']))
        self.monitor_urls.extend(
            self._read_list_urls('monitor_list', task_info['monitor_id']))

        # Suspected URLs left over by the filtrate step; default to none.
        self.get_gray_iter = iter([])
        self.gray_url_num = 0
        row = self.mysql_handle.require_get('task_result',
                                            ['filtrate_objectid'],
                                            self._task_result_wheres(),
                                            'select', 'one')
        if row is not False:
            self.gary_objectid = row['filtrate_objectid']
            if self.gary_objectid is not None:
                self.gray_url_num = self.mongo_operate.get_gray_num(
                    self.gary_objectid)
                self.gary_objectid = self.mongo_operate.expand_gray_list(
                    self.gary_objectid)
                self.get_gray_iter = self.mongo_operate.get_gray_list(
                    self.gary_objectid)
        self.url_num = (self.gray_url_num +
                        len(self.protected_urls) + len(self.gray_urls) +
                        len(self.counterfeit_urls) + len(self.monitor_urls))

    def update_running_state(self, saved_num, request_num):
        """Write the running saved/requested counters to ``task_result``."""
        fields = {
            'web_save_num': [saved_num, 'd'],
            'web_request_num': [request_num, 'd']
        }
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_result_wheres(), 'update')

    def _store_saved_urls(self, urls, gray_name):
        """Store ``urls`` as a new Mongo list; return its id, '' if empty."""
        if not urls:
            return ''
        object_id = self.mongo_operate.create_gray(
            gray_name=gray_name,
            gray_type='websave',
            usr_id=self.user_id)
        self.mongo_operate.add_gray_list(urls, object_id)
        return object_id

    def add_saved_ulr_mongo(self, ulist):
        """Store the saved URLs in Mongo, grouped by category.

        Each item of ``ulist`` is indexed as ``[url, category]``; items
        whose category is not one of gray/protected/counterfeit/monitor are
        ignored.  The resulting object ids (or '') are kept on ``self``.
        """
        buckets = {
            'gray': [],
            'protected': [],
            'counterfeit': [],
            'monitor': []
        }
        for item in ulist:
            bucket = buckets.get(item[1])
            if bucket is not None:
                bucket.append(item[0])
        self.save_gray_objectID = self._store_saved_urls(
            buckets['gray'], 'save_gray_urls')
        self.save_protected_objectID = self._store_saved_urls(
            buckets['protected'], 'saved_protected_urls')
        self.save_counterfeit_objectID = self._store_saved_urls(
            buckets['counterfeit'], 'saved_counterfeit_urls')
        self.save_monitor_objectID = self._store_saved_urls(
            buckets['monitor'], 'saved_monitor_urls')

    def update_finished_state(self, ulist, run_time, request_num):
        """Record the final result in MySQL and notify the control engine.

        ``ulist`` is mutated: the forced gray URLs are appended when absent.
        """
        for forced_url in self._FORCED_GRAY_URLS:
            entry = [forced_url, 'gray']
            if entry not in ulist:
                ulist.append(entry)
        self.add_saved_ulr_mongo(ulist)
        saved_num = len(ulist)
        # Was the octal literal ``03``: same value (3) on Python 2, but a
        # SyntaxError on Python 3.
        fields = {
            'e_web_save_state': [3, 'd'],
            'web_save_num': [saved_num, 'd'],
            'web_request_num': [request_num, 'd'],
            'web_save_run_time': [run_time, 's'],
            'save_protected_objectid': [self.save_protected_objectID, 's'],
            'save_counterfeit_objectid': [self.save_counterfeit_objectID, 's'],
            'save_monitor_objectid': [self.save_monitor_objectID, 's'],
            'save_gray_objectid': [self.save_gray_objectID, 's']
        }
        self.mysql_handle.require_post('task_result', fields,
                                       self._task_result_wheres(), 'update')

        # NOTE(review): unreachable while the forced URLs are appended above.
        if not ulist:
            send_result = self.message_other_engine(9, ['00'], self.task_id)
        else:
            send_result = self.message_other_engine(3, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|web_save engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def run(self):
        """Process entry point: register pid and start the download engine."""
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time, 'web_save',
                                              2)
        engine = WebSave(self.task_id, self.protected_urls, self.get_gray_iter,
                         self.gray_urls, self.counterfeit_urls,
                         self.monitor_urls, self.url_num,
                         self.update_running_state, self.update_finished_state,
                         self.mongo_operate)
        engine.download()
        reactor.run(installSignalHandlers=0)
コード例 #4
0
ファイル: filtrate_start.py プロジェクト: wyl-hit/job
class FiltrateStart(multiprocessing.Process):
    """Worker process that filters a task's gray URLs against known lists.

    Every gray URL is looked up in ``trusted_list`` and then
    ``counterfeit_list``; the URLs are split into trusted, counterfeit and
    still-suspect groups, each stored as a Mongo list, and the counts and
    list ids are written back to ``task_result``.
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        """Create the DB helpers and load the task information.

        ``message_other_engine``, ``write_process_pid`` and
        ``remove_process_pid`` are caller-supplied callables used later.
        """
        super(FiltrateStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid

        # Initial state
        self.user_id = ''
        self.gray_urls = []
        self.task_start_time = ''
        self.run_start_time = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()

    def read_task_info(self):
        """Read the task row, its gray URL list and the detected gray list.

        Terminates the current process via ``os._exit(0)`` when the task
        row does not exist.
        """
        table_name = 'task_info'
        fields = ['last_time', 'user_id', 'gray_id']
        wheres = {'task_id': [self.task_id, 'd']}
        task_info = self.mysql_handle.require_get(table_name, fields, wheres,
                                                  'select', 'one')
        if task_info is False:
            sys.stderr.write('%s  task no exist, task_id: %s\n' %
                             (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']
        gray_id = task_info['gray_id']
        # read gray url
        if gray_id is not None and gray_id != '':
            table_name = 'gray_list'
            fields = ['url']
            for once_gray_id in gray_id.split('-'):
                wheres = {'id': [int(once_gray_id), 'd']}
                select_result = self.mysql_handle.require_get(
                    table_name, fields, wheres, 'select', 'one')
                if select_result is False:
                    continue
                self.gray_urls.append(select_result['url'].encode('utf-8'))
        # read detected url
        table_name = 'task_result'
        fields = ['original_grayid']
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        select_result = self.mysql_handle.require_get(table_name, fields,
                                                      wheres, 'select', 'one')
        # require_get signals "no row" with False (see the checks above);
        # indexing it unguarded raised TypeError, so treat a missing
        # task_result row like an empty detected list.
        gary_objectid = None
        if select_result is not False:
            gary_objectid = select_result['original_grayid']
        if gary_objectid is not None and gary_objectid != '':
            gary_objectid = self.mongo_operate.expand_gray_list(gary_objectid)
            self.get_gray_iter = self.mongo_operate.get_gray_list(
                gary_objectid)
        else:
            self.get_gray_iter = iter([])

    def update_finish_state(self, trusted_filtrate_num,
                            counterfeit_filtrate_num, filtrate_objectid,
                            filtrate_trusted_objectid,
                            filtrate_counterfeit_objectid):
        """Write the final filtrate result to MySQL and notify control.

        NOTE(review): the meaning of message code 2 and payload ``['00']``
        is defined by ``message_other_engine`` outside this view.
        """
        run_time = int(time.time() - self.run_start_time)
        table_name = 'task_result'
        # e_filtrate_state was the octal literal ``03``: same value (3) on
        # Python 2, but a SyntaxError on Python 3.
        fields = {
            'e_filtrate_state': [3, 'd'],
            'filtrate_trusted_num': [trusted_filtrate_num, 'd'],
            'filtrate_counterfeit_num': [counterfeit_filtrate_num, 'd'],
            'filtrate_run_time': [run_time, 's'],
            'filtrate_objectid': [filtrate_objectid, 's'],
            'filtrate_trusted_objectid': [filtrate_trusted_objectid, 's'],
            'filtrate_counterfeit_objectid':
            [filtrate_counterfeit_objectid, 's']
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        self.mysql_handle.require_post(table_name, fields, wheres, 'update')
        # message to control
        send_result = self.message_other_engine(2, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def trusted_select(self, gray_url):
        """Look ``gray_url`` up in ``trusted_list``; return the query result."""
        table_name = 'trusted_list'
        fields = ['*']
        wheres = {'url': [gray_url, 's']}
        return self.mysql_handle.require_get(table_name, fields, wheres,
                                             'select', 'one', 0)

    def counterfeit_select(self, gray_url):
        """Look ``gray_url`` up in ``counterfeit_list``; return the result."""
        table_name = 'counterfeit_list'
        fields = ['*']
        wheres = {'url': [gray_url, 's']}
        return self.mysql_handle.require_get(table_name, fields, wheres,
                                             'select', 'one', 0)

    def run(self):
        """Process entry point: classify every gray URL and store results."""
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time, 'filtrate',
                                              2)

        trusted_filtrate_num = 0
        counterfeit_filtrate_num = 0
        suspect_grays = []  # not filtrate url
        trusted_grays = []
        counterfeit_grays = []
        while True:
            # Drain the Mongo-backed iterator first, then the MySQL list
            # (popped from the end).
            try:
                gray_url = next(self.get_gray_iter)
            except StopIteration:
                try:
                    gray_url = self.gray_urls.pop()
                except IndexError:
                    break
            # Compare gray_url against the trusted and counterfeit lists:
            # a hit updates the matching counter, otherwise the URL goes
            # to suspect_grays.
            if self.trusted_select(gray_url) is not False:
                trusted_filtrate_num += 1
                trusted_grays.append(gray_url)
            elif self.counterfeit_select(gray_url) is not False:
                counterfeit_filtrate_num += 1
                counterfeit_grays.append(gray_url)
            else:
                suspect_grays.append(gray_url)
        # not filtrate url add gray_list in mongo
        filtrate_objectid = self.mongo_operate.create_gray(
            gray_name='suspect_grays',
            gray_type='filtrate',
            usr_id=self.user_id,
            task_id=self.task_id)
        self.mongo_operate.add_gray_list(suspect_grays, filtrate_objectid)
        filtrate_trusted_objectid = self.mongo_operate.create_gray(
            gray_name='trusted_grays',
            gray_type='filtrate',
            usr_id=self.user_id,
            task_id=self.task_id)
        self.mongo_operate.add_gray_list(trusted_grays,
                                         filtrate_trusted_objectid)
        filtrate_counterfeit_objectid = self.mongo_operate.create_gray(
            gray_name='counterfeit_grays',
            gray_type='filtrate',
            usr_id=self.user_id,
            task_id=self.task_id)
        self.mongo_operate.add_gray_list(counterfeit_grays,
                                         filtrate_counterfeit_objectid)
        self.update_finish_state(trusted_filtrate_num,
                                 counterfeit_filtrate_num, filtrate_objectid,
                                 filtrate_trusted_objectid,
                                 filtrate_counterfeit_objectid)
コード例 #5
0
ファイル: filtrate_start.py プロジェクト: wyl-hit/job
class FiltrateStart(multiprocessing.Process):
    '''
    Filter engine process for one task.

    Every gray URL of the task is looked up in the ``trusted_list`` and
    ``counterfeit_list`` MySQL tables.  The URLs are then stored as three
    gray lists through ``mongo_operate`` (suspect / trusted / counterfeit)
    and the counters and list ids are written to ``task_result``.
    '''

    def __init__(self, task_id, mysql_host, mysql_db,
                 mysql_user, mysql_password, mongo_db, mongo_host, mongo_port,
                 mongo_user, mongo_password, message_other_engine,
                 write_process_pid, remove_process_pid):
        '''
        Open the MySQL and Mongo handles and load the task information.

        message_other_engine, write_process_pid and remove_process_pid are
        callables supplied by the caller; they are only stored here and are
        invoked from run() / update_finish_state().
        '''
        super(FiltrateStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid

        # Initial state; filled in by read_task_info() and run().
        self.user_id = ''
        self.gray_urls = []
        self.task_start_time = ''
        self.run_start_time = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host,
                                           mongo_port, mongo_user,
                                           mongo_password)
        self.read_task_info()

    def read_task_info(self):
        '''
        Read the task information.

        Sets task_start_time and user_id from the task_info row, appends
        the URLs referenced by task_info.gray_id (ids joined with '-') to
        self.gray_urls, and exposes the gray list stored under
        task_result.original_grayid as self.get_gray_iter.  The process
        exits when the task_info row does not exist.
        '''
        task_info = self.mysql_handle.require_get(
            'task_info', ['last_time', 'user_id', 'gray_id'],
            {'task_id': [self.task_id, 'd']}, 'select', 'one')
        if task_info is False:
            sys.stderr.write(
                '%s  task no exist, task_id: %s\n' % (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']

        # Gray URLs listed explicitly on the task.
        gray_id = task_info['gray_id']
        if gray_id is not None and gray_id != '':
            for once_gray_id in gray_id.split('-'):
                gray_row = self.mysql_handle.require_get(
                    'gray_list', ['url'],
                    {'id': [int(once_gray_id), 'd']}, 'select', 'one')
                if gray_row is False:  # id without a gray_list row
                    continue
                self.gray_urls.append(gray_row['url'].encode('utf-8'))

        # Gray URLs detected earlier and stored for this run.
        result_row = self.mysql_handle.require_get(
            'task_result', ['original_grayid'],
            {'task_id': [self.task_id, 'd'],
             'start_time': [self.task_start_time, 's']},
            'select', 'one')
        # The lookups above treat False as "no row"; apply the same rule
        # here instead of failing with a TypeError on False['...'].
        gray_objectid = None
        if result_row is not False:
            gray_objectid = result_row['original_grayid']
        if gray_objectid is not None and gray_objectid != '':
            gray_objectid = self.mongo_operate.expand_gray_list(
                gray_objectid)
            self.get_gray_iter = self.mongo_operate.get_gray_list(
                gray_objectid)
        else:
            self.get_gray_iter = iter([])

    def update_finish_state(self, trusted_filtrate_num, counterfeit_filtrate_num, filtrate_objectid,
                            filtrate_trusted_objectid, filtrate_counterfeit_objectid):
        '''
        Task run is over: update the result in mysql and notify control.

        Writes the counters, the run time and the three gray-list ids to
        the task_result row, messages the control engine, and removes this
        process from the engine pid list.
        '''
        run_time = int(time.time() - self.run_start_time)
        # State code 3 was spelled as the legacy octal literal 03, which is
        # the same value but a SyntaxError on Python 3.
        fields = {'e_filtrate_state': [3, 'd'],
                  'filtrate_trusted_num': [trusted_filtrate_num, 'd'],
                  'filtrate_counterfeit_num': [counterfeit_filtrate_num, 'd'],
                  'filtrate_run_time': [run_time, 's'],
                  'filtrate_objectid': [filtrate_objectid, 's'],
                  'filtrate_trusted_objectid': [filtrate_trusted_objectid, 's'],
                  'filtrate_counterfeit_objectid': [filtrate_counterfeit_objectid, 's']}
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        self.mysql_handle.require_post(
            'task_result', fields, wheres, 'update')
        # message to control
        send_result = self.message_other_engine(2, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write(
            '%s |*|engine win over|*|, task_id: %s\n' % (time.ctime(), self.task_id))

    def _select_by_url(self, table_name, gray_url):
        '''
        Look gray_url up in table_name and return require_get's result.
        '''
        return self.mysql_handle.require_get(
            table_name, ['*'], {'url': [gray_url, 's']}, 'select', 'one', 0)

    def trusted_select(self, gray_url):
        '''
        Query the trusted list for gray_url.
        '''
        return self._select_by_url('trusted_list', gray_url)

    def counterfeit_select(self, gray_url):
        '''
        Query the counterfeit list for gray_url.
        '''
        return self._select_by_url('counterfeit_list', gray_url)

    def _iter_gray_urls(self):
        '''
        Yield the URLs of get_gray_iter, then drain gray_urls from its end.
        '''
        for gray_url in self.get_gray_iter:
            yield gray_url
        while self.gray_urls:
            yield self.gray_urls.pop()

    def _store_gray_list(self, gray_name, urls):
        '''
        Create a 'filtrate' gray list in mongo, fill it and return its id.
        '''
        objectid = self.mongo_operate.create_gray(
            gray_name=gray_name, gray_type='filtrate',
            usr_id=self.user_id, task_id=self.task_id)
        self.mongo_operate.add_gray_list(urls, objectid)
        return objectid

    def run(self):
        '''
        Classify every gray URL and store the three resulting lists.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'filtrate', 2)

        suspect_grays = []  # URLs found in neither list
        trusted_grays = []
        counterfeit_grays = []
        # Compare each gray URL with the trusted and counterfeit lists; a
        # URL found in neither list is kept as a suspect.
        for gray_url in self._iter_gray_urls():
            if self.trusted_select(gray_url) is not False:
                trusted_grays.append(gray_url)
            elif self.counterfeit_select(gray_url) is not False:
                counterfeit_grays.append(gray_url)
            else:
                suspect_grays.append(gray_url)

        filtrate_objectid = self._store_gray_list(
            'suspect_grays', suspect_grays)
        filtrate_trusted_objectid = self._store_gray_list(
            'trusted_grays', trusted_grays)
        filtrate_counterfeit_objectid = self._store_gray_list(
            'counterfeit_grays', counterfeit_grays)
        self.update_finish_state(
            len(trusted_grays), len(counterfeit_grays), filtrate_objectid,
            filtrate_trusted_objectid, filtrate_counterfeit_objectid)
コード例 #6
0
ファイル: qtcrawler_start.py プロジェクト: wyl-hit/job
class QtCrawler(multiprocessing.Process):
    '''
    Engine process that crawls the saved URLs of one task in a Qt browser.

    The URL iterators are read once in __init__; run() hands them to a
    Browser together with the progress callbacks defined on this class.
    '''

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        '''
        Open the MySQL and Mongo handles and read the task's saved URLs.

        message_other_engine, write_process_pid and remove_process_pid are
        callables supplied by the caller; they are only stored here.
        '''
        super(QtCrawler, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid

        # Initial state.
        self.run_start_time = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']

    def _update_task_result(self, fields):
        '''
        Update the task_result row selected by task_id and start_time.
        '''
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        self.mysql_handle.require_post('task_result', fields, wheres, 'update')

    def update_running_state(self, crawler_num):
        '''
        Update the crawl progress in mysql while the task is running.
        '''
        self._update_task_result({'qt_crawler_num': [crawler_num, 'd']})

    def update_finish_state(self, crawler_num, run_time):
        '''
        Store the final crawl count and run time, then finish the engine.
        '''
        # State code 3 was spelled as the legacy octal literal 03, which is
        # the same value but a SyntaxError on Python 3.
        self._update_task_result({
            'e_qt_crawler_state': [3, 'd'],
            'qt_crawler_num': [crawler_num, 'd'],
            'qt_crawler_run_time': [run_time, 's']
        })
        self.engine_over_handle()

    def engine_over_handle(self):
        '''
        Log the end of the engine, message control and drop the pid entry.
        '''
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))
        send_result = self.message_other_engine(4, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        # self.page_shot()

    def page_shot(self):
        '''
        Take a page shot of every saved URL that has none yet.

        URLs are taken from the protected, gray, counterfeit and monitor
        iterators, in that order.
        '''
        # read_crawler_config is not defined on this class (its call in
        # run() is commented out), so only call it when a subclass
        # provides it instead of failing with AttributeError.
        read_crawler_config = getattr(self, 'read_crawler_config', None)
        if read_crawler_config is not None:
            read_crawler_config()
        url_sources = (
            ('protected', self.get_protected_iter),
            ('gray', self.get_gray_iter),
            ('counterfeit', self.get_counterfeit_iter),
            ('monitor', self.get_monitor_iter),
        )
        for url_type, url_iter in url_sources:
            for url in url_iter:
                self._shot_one_page(url, url_type)
        sys.stdout.write('shot over\n')

    def _shot_one_page(self, url, url_type):
        '''
        Create webpage.jpeg for one saved URL unless it already exists.
        '''
        # Same bytes as the former "print 'shot: ', url" statement.
        sys.stdout.write('shot:  %s\n' % (url,))

        web_save_path = WebSavePath()
        _, local_time = web_save_path.get_html_path_abs(url, url_type)
        if local_time is None:
            sys.stderr.write(
                '%s  insert_web_info, web not be saved: %s\n' %
                (time.ctime(), url))
            return
        # webpage blockpage
        if os.path.exists(local_time + '/webpage.jpeg'):
            return
        main_html_path = local_time + '/main.html'
        if not os.path.exists(main_html_path):
            sys.stderr.write(
                '%s  insert_web_info, main.html not be exist: %s\n' %
                (time.ctime(), url))
            return
        img_type = 'webpage'  # img name : webpage.jpeg
        call_page_shot = CallPageShot(main_html_path, local_time, img_type)
        call_page_shot.start()
        # Poll for the marker file (presumably written by CallPageShot when
        # it is done -- verify); there is no timeout.
        shot_over_sign = local_time + '/shot_over_sign'
        while not os.path.exists(shot_over_sign):
            time.sleep(0.5)
        os.remove(shot_over_sign)

    def run(self):
        '''
        Start the Qt application and the Browser that does the crawling.

        The process exits with the return value of the Qt event loop.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()

        # self.read_task_info()
        # self.read_crawler_config()
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time,
                                              'qt_crawler', 2)
        self.app = QApplication(sys.argv)
        self.br = Browser(self.task_id, self.get_protected_iter,
                          self.get_gray_iter, self.get_counterfeit_iter,
                          self.get_monitor_iter, self.mongo_operate,
                          self.update_running_state, self.update_finish_state,
                          self.mysql_handle, self.run_start_time)
        # self.br.showMaximized() # show web
        # self.br.show()
        sys.exit(self.app.exec_())
コード例 #7
0
ファイル: qtcrawler_start.py プロジェクト: wyl-hit/job
class QtCrawler(multiprocessing.Process):
    '''
    Engine process that crawls the saved URLs of one task in a Qt browser.

    The URL iterators are read once in __init__; run() hands them to a
    Browser together with the progress callbacks defined on this class.
    '''

    def __init__(self, task_id, mysql_host, mysql_db,
                 mysql_user, mysql_password, mongo_db, mongo_host, mongo_port,
                 mongo_user, mongo_password, message_other_engine,
                 write_process_pid, remove_process_pid):
        '''
        Open the MySQL and Mongo handles and read the task's saved URLs.

        message_other_engine, write_process_pid and remove_process_pid are
        callables supplied by the caller; they are only stored here.
        '''
        super(QtCrawler, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid

        # Initial state.
        self.run_start_time = 0
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host,
                                           mongo_port, mongo_user,
                                           mongo_password)
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_protected_iter = saved_urls_iters['get_protected_iter']
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        self.get_counterfeit_iter = saved_urls_iters['get_counterfeit_iter']
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']

    def _update_task_result(self, fields):
        '''
        Update the task_result row selected by task_id and start_time.
        '''
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        self.mysql_handle.require_post(
            'task_result', fields, wheres, 'update')

    def update_running_state(self, crawler_num):
        '''
        Update the crawl progress in mysql while the task is running.
        '''
        self._update_task_result({'qt_crawler_num': [crawler_num, 'd']})

    def update_finish_state(self, crawler_num, run_time):
        '''
        Store the final crawl count and run time, then finish the engine.
        '''
        # State code 3 was spelled as the legacy octal literal 03, which is
        # the same value but a SyntaxError on Python 3.
        self._update_task_result(
            {'e_qt_crawler_state': [3, 'd'],
             'qt_crawler_num': [crawler_num, 'd'],
             'qt_crawler_run_time': [run_time, 's']})
        self.engine_over_handle()

    def engine_over_handle(self):
        '''
        Log the end of the engine, message control and drop the pid entry.
        '''
        sys.stdout.write(
            '%s |*|engine win over|*|, task_id: %s\n' % (time.ctime(), self.task_id))
        send_result = self.message_other_engine(4, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        # self.page_shot()

    def page_shot(self):
        '''
        Take a page shot of every saved URL that has none yet.

        URLs are taken from the protected, gray, counterfeit and monitor
        iterators, in that order.
        '''
        # read_crawler_config is not defined on this class (its call in
        # run() is commented out), so only call it when a subclass
        # provides it instead of failing with AttributeError.
        read_crawler_config = getattr(self, 'read_crawler_config', None)
        if read_crawler_config is not None:
            read_crawler_config()
        url_sources = (
            ('protected', self.get_protected_iter),
            ('gray', self.get_gray_iter),
            ('counterfeit', self.get_counterfeit_iter),
            ('monitor', self.get_monitor_iter),
        )
        for url_type, url_iter in url_sources:
            for url in url_iter:
                self._shot_one_page(url, url_type)
        sys.stdout.write('shot over\n')

    def _shot_one_page(self, url, url_type):
        '''
        Create webpage.jpeg for one saved URL unless it already exists.
        '''
        # Same bytes as the former "print 'shot: ', url" statement.
        sys.stdout.write('shot:  %s\n' % (url,))

        web_save_path = WebSavePath()
        _, local_time = web_save_path.get_html_path_abs(
            url, url_type)
        if local_time is None:
            sys.stderr.write('%s  insert_web_info, web not be saved: %s\n' %
                             (time.ctime(), url))
            return
        # webpage blockpage
        if os.path.exists(local_time + '/webpage.jpeg'):
            return
        main_html_path = local_time + '/main.html'
        if not os.path.exists(main_html_path):
            sys.stderr.write('%s  insert_web_info, main.html not be exist: %s\n' %
                             (time.ctime(), url))
            return
        img_type = 'webpage'  # img name : webpage.jpeg
        call_page_shot = CallPageShot(
            main_html_path, local_time, img_type)
        call_page_shot.start()
        # Poll for the marker file (presumably written by CallPageShot when
        # it is done -- verify); there is no timeout.
        shot_over_sign = local_time + '/shot_over_sign'
        while not os.path.exists(shot_over_sign):
            time.sleep(0.5)
        os.remove(shot_over_sign)

    def run(self):
        '''
        Start the Qt application and the Browser that does the crawling.

        The process exits with the return value of the Qt event loop.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()

        # self.read_task_info()
        # self.read_crawler_config()
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'qt_crawler', 2)
        self.app = QApplication(sys.argv)
        self.br = Browser(self.task_id, self.get_protected_iter, self.get_gray_iter,
                          self.get_counterfeit_iter, self.get_monitor_iter,
                          self.mongo_operate, self.update_running_state,
                          self.update_finish_state, self.mysql_handle, self.run_start_time)
        # self.br.showMaximized() # show web
        # self.br.show()
        sys.exit(self.app.exec_())
コード例 #8
0
class StructureStart(multiprocessing.Process):
    '''
    Structure engine process for one task.

    Compares the stored web tree of every gray URL with the trees of the
    protected and counterfeit pages and records the matches in mysql.
    '''

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid, structure_num_compare_k,
                 structure_num_compare_b, structure_area_compare_k,
                 structure_area_compare_b):
        '''
        Open the MySQL and Mongo handles and load the task information.

        The four structure_*_compare_* values are passed unchanged to
        StructureCompare in run_structure_compare().  message_other_engine,
        write_process_pid and remove_process_pid are callables supplied by
        the caller; they are only stored here.
        '''
        super(StructureStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid
        self.structure_num_compare_k = structure_num_compare_k
        self.structure_num_compare_b = structure_num_compare_b
        self.structure_area_compare_k = structure_area_compare_k
        self.structure_area_compare_b = structure_area_compare_b
        self.mongo_db = mongo_db
        self.mongo_host = mongo_host
        self.mongo_port = mongo_port
        self.mongo_user = mongo_user
        self.mongo_password = mongo_password

        # Initial state.
        self.run_start_time = 0
        self.structure_check_num = 0  # number of gray URLs checked
        self.structure_find_num = 0  # number of phishing URLs found
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        self.read_task_info()

    def read_task_info(self):
        '''
        Read the task start time, the URL iterators and the known trees.
        '''
        self.task_start_time = self.mysql_handle.get_task_last_time(
            self.task_id)
        saved_urls_iters = self.mysql_handle.read_saved_urls(
            self.task_id, self.mongo_operate)
        self.get_gray_iter = saved_urls_iters['get_gray_iter']
        # NOTE(review): get_monitor_iter is stored but never read in this
        # class.
        self.get_monitor_iter = saved_urls_iters['get_monitor_iter']
        self.protected_dict = self.mysql_handle.get_all_protected_feature(
            self.mongo_operate.get_web_tree)
        self.counterfeit_dict = self.mysql_handle.get_all_counterfeit_feature(
            self.mongo_operate.get_web_tree)

    def _update_task_result(self, fields):
        '''
        Update the task_result row selected by task_id and start_time.
        '''
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        self.mysql_handle.require_post('task_result', fields, wheres, 'update')

    def update_running_state(self):
        '''
        Update the check progress in mysql while the task is running.
        '''
        self._update_task_result({
            'structure_check_num': [self.structure_check_num, 'd'],
            'structure_find_num': [self.structure_find_num, 'd'],
        })

    def update_finished_state(self):
        '''
        Store the final counters and run time, then finish the engine.
        '''
        run_time = int(time.time()) - int(self.run_start_time)
        # State code 3 was spelled as the legacy octal literal 03, which is
        # the same value but a SyntaxError on Python 3.
        self._update_task_result({
            'e_structure_state': [3, 'd'],
            'structure_run_time': [run_time, 's'],
            'structure_check_num': [self.structure_check_num, 'd'],
            'structure_find_num': [self.structure_find_num, 'd']
        })
        self.engine_over_handle()

    def engine_over_handle(self):
        '''
        Message control, log the end of the engine and drop the pid entry.
        '''
        # message to control
        send_result = self.message_other_engine(6, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))
        self.remove_process_pid(self.task_id)

    @staticmethod
    def _first_structure_match(structure_compare, known_dict,
                               gray_block_list):
        '''
        Return the first URL of known_dict whose block list matches.

        Entries with an empty block list are skipped; None is returned when
        once_compare() reports 1 for no entry.
        '''
        for known_url, known_block_list in known_dict.items():
            if known_block_list == []:
                continue
            check_result = structure_compare.once_compare(
                known_block_list, gray_block_list)
            if check_result == 1:
                return known_url
        return None

    def run_structure_compare(self):
        '''
        Compare every gray URL with the protected and counterfeit trees.

        A gray URL without a stored web tree is skipped and not counted.
        A matching URL is counted once in structure_find_num even when it
        matches both a protected and a counterfeit page, so that counter
        never exceeds structure_check_num.
        '''
        structure_compare = StructureCompare(self.structure_num_compare_k,
                                             self.structure_num_compare_b,
                                             self.structure_area_compare_k,
                                             self.structure_area_compare_b)
        # Iterate directly: only exhaustion of the gray iterator ends the
        # loop, a StopIteration escaping from the body is not swallowed.
        for gray_url in self.get_gray_iter:
            gray_block_list = self.mongo_operate.get_web_tree(
                gray_url, 'gray')
            # mongo has no tree for this url
            if gray_block_list is False or gray_block_list == []:
                continue
            # check against protected pages
            protected_url = self._first_structure_match(
                structure_compare, self.protected_dict, gray_block_list)
            if protected_url is not None:
                self.mysql_handle.undate_gray_list_check_result(
                    gray_url, 'structure', source_url=protected_url)
            # check against counterfeit pages
            counterfeit_url = self._first_structure_match(
                structure_compare, self.counterfeit_dict, gray_block_list)
            if counterfeit_url is not None:
                self.mysql_handle.undate_gray_list_check_result(
                    gray_url, 'structure', counterfeit_url=counterfeit_url)
            if protected_url is not None or counterfeit_url is not None:
                self.structure_find_num += 1
                self.mysql_handle.undate_task_result_check_result(
                    self.task_id, self.task_start_time, gray_url,
                    'structure')
            self.structure_check_num += 1
            self.update_running_state()

    def run(self):
        '''
        Run the structure comparison and store the final state.
        '''
        # write child process pid to engine pids
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time,
                                              'structure', 2)
        self.run_structure_compare()
        self.update_finished_state()
コード例 #9
0
class DomainStart(multiprocessing.Process):
    """Worker process that probes generated domain variants for one task.

    The constructor loads the task configuration from MySQL and builds the
    URL generator. ``run`` then requests every generated URL through the
    Twisted reactor, collects the ones that answer in ``exist_list`` and
    finally stores the outcome in MongoDB/MySQL and reports it through
    ``message_other_engine``.
    """

    def __init__(self, task_id, mysql_host, mysql_db, mysql_user,
                 mysql_password, mongo_db, mongo_host, mongo_port, mongo_user,
                 mongo_password, message_other_engine, write_process_pid,
                 remove_process_pid):
        """Load the task configuration and prepare the generator and files.

        NOTE(review): ``read_task_info`` calls ``os._exit(0)`` when the task
        row is missing. Because it is invoked from this constructor, that
        terminates whichever process is constructing the object.
        """
        super(DomainStart, self).__init__()
        self.task_id = task_id
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host, mysql_user,
                                         mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid

        # Initial state.
        self.user_id = ''
        # URLs waiting to be transformed. Known counterfeit sites and
        # protected sites are transformed the same way, so they share a list.
        self.wait_change_url_list = []
        self.original_host_rules = []
        self.original_top_rules = []
        self.original_path_rules = []
        self.exist_list = []  # URLs that turned out to exist
        self.task_start_time = ''
        self.run_start_time = 0
        self.url_create_list = []
        self.protect_url = ''
        self.deferreds = []
        # Fills the four lists above in place; must run before the generator
        # is built because the generator receives those same list objects.
        self.read_task_info()
        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host, mongo_port,
                                           mongo_user, mongo_password)
        # Build the domain generator object, then the generator itself.
        self.url_gen = URLGenerator(self.task_id, self.mongo_operate,
                                    self.update_running_state,
                                    self.wait_change_url_list,
                                    self.original_host_rules,
                                    self.original_top_rules,
                                    self.original_path_rules)
        self.domain_change_url = self.url_gen.URL_Generator()

        self.domain_save_path = '/tmp/' + \
            str(task_id) + '_domain_request_urls.txt'
        self.domain_live_path = '/tmp/' + \
            str(task_id) + '_domain_live.txt'
        self.file_request_urls = open(self.domain_save_path, 'w')
        self.file_live_url = open(self.domain_live_path, 'w')

    def read_task_info(self):
        '''
        Read the task row from ``task_info`` and load the lists it refers to.

        Exits the process (status 0) when the task row cannot be read.
        '''
        table_name = 'task_info'
        fields = [
            'last_time', 'user_id', 'protected_id', 'counterfeit_id',
            'host_rule_id', 'top_rule_id', 'path_rule_id'
        ]
        wheres = {'task_id': [self.task_id, 'd']}
        task_info = self.mysql_handle.require_get(table_name, fields, wheres,
                                                  'select', 'one')
        if task_info is False:
            # Same diagnostic as WhoisSearchStart.read_task_info, so a missing
            # task no longer disappears silently.
            sys.stderr.write('%s  task no exist, task_id: %s\n' %
                             (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']
        self.read_rule_config(task_info['protected_id'],
                              task_info['counterfeit_id'],
                              task_info['host_rule_id'],
                              task_info['top_rule_id'],
                              task_info['path_rule_id'])

    def _select_column(self, table_name, column, id_string):
        """Return ``column`` for every row of ``table_name`` listed in ``id_string``.

        ``id_string`` is a '-'-separated list of integer ids; ``None`` or an
        empty string yields an empty list. Ids whose lookup returns ``False``
        are skipped.
        """
        values = []
        if id_string is None or id_string == '':
            return values
        for row_id in id_string.split('-'):
            wheres = {'id': [int(row_id), 'd']}
            row = self.mysql_handle.require_get(
                table_name, [column], wheres, 'select', 'one')
            if row is False:
                continue
            values.append(row[column])
        return values

    def read_rule_config(self, protected_list_id, counterfeit_list_id,
                         host_rule_id, top_rule_id, path_rule_id):
        '''
        Read the protected/counterfeit URLs and the transformation rules
        from MySQL.

        Every argument is a '-'-separated id string (or ``None``/''). The
        results are appended in place to ``wait_change_url_list`` and the
        three ``original_*_rules`` lists.
        '''
        # Protected sites first, then known counterfeit sites.
        self.wait_change_url_list.extend(
            self._select_column('protected_list', 'url', protected_list_id))
        self.wait_change_url_list.extend(
            self._select_column('counterfeit_list', 'url',
                                counterfeit_list_id))
        # A host rule row holds several rules separated by '|'.
        for rule in self._select_column('host_change_rule', 'change_rule',
                                        host_rule_id):
            self.original_host_rules.extend(
                str(piece) for piece in rule.split('|'))
        for rule in self._select_column('top_change_rule', 'change_rule',
                                        top_rule_id):
            self.original_top_rules.append(str(rule))
        for rule in self._select_column('path_change_rule', 'change_rule',
                                        path_rule_id):
            self.original_path_rules.append(str(rule))

    def update_running_state(self,
                             all_change_num=0,
                             all_exist_change_num=0,
                             changed_num=0,
                             gray_exist_num=0,
                             update_type=0):
        '''
        Update the progress counters of this run in ``task_result``.

        update_type=0: domain change update (uses the first three counters)
        update_type=1: url exist check update (uses ``gray_exist_num`` only)

        The counters default to 0 so a caller can pass only the ones its
        update type needs; ``pageCallback`` passes just ``gray_exist_num``.
        '''
        table_name = 'task_result'
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        if update_type == 0:
            fields = {
                'domain_changed_all_num': [all_change_num, 'd'],
                'domain_changed_exist_num': [all_exist_change_num, 'd'],
                'domain_detected_num': [changed_num, 'd']
            }
        elif update_type == 1:
            fields = {'domain_gray_url_num': [gray_exist_num, 'd']}
        else:
            # Unknown update types are ignored.
            return
        self.mysql_handle.require_post(table_name, fields, wheres, 'update')

    def create_gray_mongo(self, exist_list):
        """Create a gray list in MongoDB for ``exist_list`` and return its id."""
        # NOTE(review): 'domian' looks like a typo of 'domain'; it is left
        # untouched because the name is handed to the store as data.
        gray_name = 'NO.' + str(self.task_id) + ' task domian'
        detect_objectID = self.mongo_operate.create_gray(
            gray_name=gray_name,
            gray_type='domain_change',
            usr_id=self.user_id,
            task_id=self.task_id)
        self.mongo_operate.add_gray_list(exist_list, detect_objectID)
        return detect_objectID

    def update_finish_state(self, exist_list, run_time):
        '''
        Task run is over: persist the result and notify the control engine.

        exist_list: URLs that were found to exist.
        run_time: elapsed run time, stored in ``domain_run_time``.
        '''
        detect_objectID = self.create_gray_mongo(exist_list)
        exist_url_num = len(exist_list)
        # Save the domain engine result in mysql task_result.
        table_name = 'task_result'
        fields = {
            # Was written as the octal literal 03 (same value, but a syntax
            # error on Python 3).
            'e_domain_state': [3, 'd'],
            'domain_gray_url_num': [exist_url_num, 'd'],
            'original_grayid': [detect_objectID, 's'],
            'domain_run_time': [run_time, 's']
        }
        wheres = {
            'task_id': [self.task_id, 'd'],
            'start_time': [self.task_start_time, 's']
        }
        self.mysql_handle.require_post(table_name, fields, wheres, 'update')
        if exist_list == []:
            send_result = self.message_other_engine(9, ['00'], self.task_id)
        else:
            # Save the gray list info in mysql.
            self.mysql_handle.insert_suspect_list(detect_objectID,
                                                  self.user_id,
                                                  self.task_id,
                                                  'domain_change',
                                                  exist_url_num,
                                                  suspect_type=2)
            self.mysql_handle.insert_gray_list(exist_list,
                                               source='domain_change')
            # Message to the control engine.
            send_result = self.message_other_engine(5, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(self.task_id,
                                                self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write('%s |*|engine win over|*|, task_id: %s\n' %
                         (time.ctime(), self.task_id))

    def _record_live_url(self, url):
        """Overwrite the live file with the URL handled last and this pid."""
        live_file = self.file_live_url
        live_file.seek(0)
        live_file.truncate(0)
        live_file.write(url + ' ' + str(self.engine_pid))
        live_file.flush()

    def pageCallback(self, result, url, protect):
        '''
        Callback fired by getPage when the page exists.

        result: page body; url: requested URL; protect: the protected URL the
        request was generated from (not used here).
        '''
        global _NUM
        match = re.search(r"<title>(.*?)</title>", result)
        # A page without a <title> counts as title 'None'.
        title = match.group(1) if match else 'None'
        if title.find("Redirect") == -1:
            self.exist_list.append(url)
            self.update_running_state(gray_exist_num=len(self.exist_list),
                                      update_type=1)

        self.file_request_urls.write(url + '\n')
        self.file_request_urls.flush()

        self._record_live_url(url)
        # Give the request slot back and keep the pipeline filled.
        _NUM += 1
        self.download()

    def finish(self, ign):
        '''
        Called once all tracked deferreds are done: stop the reactor loop
        and delete the two temporary files.
        '''
        try:
            reactor.stop()
        except Exception:
            # download() attaches finish to a new DeferredList every time it
            # finds the generator exhausted, so this can run again after the
            # reactor has already been stopped.
            pass
        for path in (self.domain_save_path, self.domain_live_path):
            try:
                os.remove(path)
            except OSError:
                pass

    def fetch_error(self, error, url, protect):
        '''
        Errback fired by getPage when the page could not be fetched.

        A user timeout is retried with a fresh request; any other failure
        frees the request slot and continues with the next URL.
        '''
        global _NUM
        timed_out = error.getErrorMessage().find(
            'User timeout caused connection failure') != -1
        if timed_out:
            # NOTE(review): the retry deferred is not added to
            # self.deferreds and there is no retry limit.
            retry = getPage(url)
            retry.addCallback(self.pageCallback, url, protect)
            retry.addErrback(self.fetch_error, url, protect)
            return
        self._record_live_url(url)
        _NUM += 1
        self.download()

    def _load_next_batch(self):
        """Pull the next batch from the generator; return False when exhausted.

        The first element of a batch is the protected URL, the rest are the
        URLs to request. An empty batch leaves ``protect_url`` unchanged.
        """
        self.url_create_list = []
        try:
            batch = next(self.domain_change_url)
        except StopIteration:
            return False
        if batch:
            self.protect_url = batch[0]
        self.url_create_list = batch[1:]
        return True

    def download(self):
        """Issue requests while the module-level ``_NUM`` is positive.

        When the current batch is used up the next one is pulled from the
        generator; once the generator is exhausted, ``finish`` is attached to
        a DeferredList over every request issued so far.
        """
        global _NUM
        while _NUM > 0:
            if not self.url_create_list:
                if self._load_next_batch():
                    continue
                dl = defer.DeferredList(self.deferreds)
                dl.addCallback(self.finish)
                break
            url = self.url_create_list.pop(0).encode('utf-8')
            d = getPage(url)
            d.addCallback(self.pageCallback, url, self.protect_url)
            d.addErrback(self.fetch_error, url, self.protect_url)
            _NUM -= 1
            self.deferreds.append(d)

    def run(self):
        '''
        Process entry point.
        '''
        # Write the child process pid to the engine pids.
        self.write_process_pid(self.task_id)
        self.engine_pid = os.getpid()
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(self.task_id,
                                              self.task_start_time, 'domain',
                                              2)
        self._load_next_batch()
        # Start downloading only once the reactor is running. If the
        # generator yields nothing, finish() fires straight away, and
        # reactor.stop() on a reactor that has not started fails and would
        # leave reactor.run() blocking forever.
        reactor.callWhenRunning(self.download)
        reactor.run()
        # The reactor has stopped, so no callback writes to the files anymore.
        for handle in (self.file_request_urls, self.file_live_url):
            try:
                handle.close()
            except (IOError, OSError):
                pass
        run_time = int(time.time()) - int(self.run_start_time)
        self.update_finish_state(self.exist_list, run_time)
コード例 #10
0
ファイル: whois_search_start.py プロジェクト: wyl-hit/job
class WhoisSearchStart(multiprocessing.Process):
    """Worker process that runs whois and reverse-whois lookups for one task.

    The constructor reads the task row and the counterfeit URLs from MySQL.
    ``run`` performs the lookups, collects the reverse-whois domains of every
    counterfeit URL as a new gray list and reports the outcome through
    ``message_other_engine``.
    """

    def __init__(self, task_id, mysql_host, mysql_db,
                 mysql_user, mysql_password, mongo_db, mongo_host, mongo_port,
                 mongo_user, mongo_password, message_other_engine,
                 write_process_pid, remove_process_pid):
        """Store the connection settings and load the task information.

        NOTE(review): ``read_task_info`` calls ``os._exit(0)`` when the task
        row is missing. Because it is invoked from this constructor, that
        terminates whichever process is constructing the object.
        """
        super(WhoisSearchStart, self).__init__()
        self.task_id = task_id
        # The raw MySQL settings are kept because the whois helpers open
        # their own connections with them.
        self.mysql_host = mysql_host
        self.mysql_db = mysql_db
        self.mysql_user = mysql_user
        self.mysql_password = mysql_password
        self.mysql_handle = MysqlOperate(mysql_db, mysql_host,
                                         mysql_user, mysql_password)
        self.message_other_engine = message_other_engine
        self.write_process_pid = write_process_pid
        self.remove_process_pid = remove_process_pid

        self.mongo_operate = Mongo_Operate(mongo_db, mongo_host,
                                           mongo_port, mongo_user,
                                           mongo_password)

        # Initial state.
        self.task_start_time = ''
        # Set for real in run(); initialised here so update_finish_state
        # never hits a missing attribute.
        self.run_start_time = 0
        self.user_id = ''
        self.whois_search_url = ''
        self.whois_reverse_url = ''
        self.counterfeit_urls = []
        self.task_state = 0

        self.read_task_info()

    def read_task_info(self):
        '''
        Read the task row from ``task_info`` and the counterfeit URLs it
        refers to.

        Writes a message to stderr and exits the process (status 0) when the
        task row cannot be read.
        '''
        table_name = 'task_info'
        fields = ['last_time', 'user_id', 'counterfeit_id',
                  'whois_search_url', 'whois_reverse_url']
        wheres = {'task_id': [self.task_id, 'd']}
        task_info = self.mysql_handle.require_get(
            table_name, fields, wheres, 'select', 'one')
        if task_info is False:
            sys.stderr.write(
                '%s  task no exist, task_id: %s\n' % (time.ctime(), self.task_id))
            os._exit(0)
        self.task_start_time = task_info['last_time']
        self.user_id = task_info['user_id']
        self.whois_search_url = task_info['whois_search_url']
        self.whois_reverse_url = task_info['whois_reverse_url']
        original_counterfeit_list = task_info['counterfeit_id']
        if original_counterfeit_list is None or original_counterfeit_list == '':
            return
        # 'counterfeit_id' is a '-'-separated list of ids; resolve each one
        # to its URL in counterfeit_list, skipping ids that are not found.
        table_name = 'counterfeit_list'
        fields = ['url']
        for counterfeit_id in original_counterfeit_list.split('-'):
            wheres = {'id': [int(counterfeit_id), 'd']}
            select_result = self.mysql_handle.require_get(
                table_name, fields, wheres, 'select', 'one')
            if select_result is False:
                continue
            self.counterfeit_urls.append(select_result['url'].encode('utf-8'))

    def add_gray_list(self, url_list):
        """Store ``url_list`` as a new 'whois_reverse' gray list.

        Creates the list in MongoDB, records its id in ``task_result`` and
        inserts the suspect/gray rows in MySQL. Returns ``False`` without
        doing anything when ``url_list`` is empty, otherwise ``None``.
        """
        if url_list == []:
            return False
        gray_objectid = self.mongo_operate.create_gray(
            gray_name='whois_reverse_gray', gray_type='whois_reverse',
            usr_id=self.user_id, task_id=self.task_id)
        self.mongo_operate.add_gray_list(
            url_list, gray_objectid)
        table_name = 'task_result'
        fields = {'original_grayid': [gray_objectid, 's']}
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        self.mysql_handle.require_post(
            table_name, fields, wheres, 'update')
        # Save the gray list info in mysql suspect_list.
        url_num = len(url_list)
        self.mysql_handle.insert_suspect_list(gray_objectid, self.user_id, self.task_id,
                                              'whois_reverse', url_num, suspect_type=2)
        self.mysql_handle.insert_gray_list(url_list, source='whois_reverse')

    def update_finish_state(self, new_gray_lsit):
        """Persist the end-of-run state and notify the control engine.

        new_gray_lsit: URLs found by the reverse lookups. An empty list sends
        message code 9; otherwise the list is stored and code 5 is sent.
        """
        run_time = int(time.time() - self.run_start_time)
        table_name = 'task_result'
        # The state was written as the octal literal 03 (same value, but a
        # syntax error on Python 3).
        fields = {'e_whois_search_state': [3, 'd'],
                  'whois_search_run_time': [run_time, 's']}
        wheres = {'task_id': [self.task_id, 'd'],
                  'start_time': [self.task_start_time, 's']}
        self.mysql_handle.require_post(
            table_name, fields, wheres, 'update')
        # Message to the control engine.
        if new_gray_lsit == []:
            send_result = self.message_other_engine(9, ['00'], self.task_id)
        else:
            self.add_gray_list(new_gray_lsit)
            send_result = self.message_other_engine(5, ['00'], self.task_id)
        if send_result is False:  # control engine no response, stop task
            self.mysql_handle.update_task_state(
                self.task_id, self.task_start_time, 0)
        self.remove_process_pid(self.task_id)
        sys.stdout.write(
            '%s |*|engine win over|*|, task_id: %s\n' % (time.ctime(), self.task_id))

    def run_whois_reverse(self, url):
        """Return 'http://<domain>/' for every reverse-whois domain of ``url``.

        Any error during the lookup is printed and an empty list is returned.
        """
        whois_reverse = WhoisReverse(self.mysql_host, self.mysql_db,
                                     self.mysql_user, self.mysql_password)
        try:
            return ['http://' + domain + '/'
                    for domain in whois_reverse.get_reverse_whois(url)]
        except Exception:
            traceback.print_exc()
            return []

    def run_whois_search(self, url):
        '''
        Run the whois query module for a single URL.

        Errors are printed and swallowed so one failing lookup does not
        abort the task.
        NOTE(review): the original comment said the query runs in a child
        thread; no thread is started here, so that would have to happen
        inside Urlanalysis - confirm.
        '''
        url_analysis = Urlanalysis(1, self.mysql_host, self.mysql_user,
                                   self.mysql_password, self.mysql_db)
        try:
            url_analysis.getUrllist_list([url])
        except Exception:
            traceback.print_exc()

    def web_save_transfer(self, url):
        """Move the saved page of ``url`` from the gray store to counterfeit.

        Transfers the MongoDB record first and then the files on disk.
        """
        self.mongo_operate.transfer_web_save(
            url, source_type='gray', goal_type='counterfeit')
        h = WebSavePath()
        source_file_path, target_file_path = h.get_transfer_path(
            url, 'gray', 'counterfeit')
        web_info_transfer(source_file_path, target_file_path)

    def whois_operation(self):
        """Run all lookups of the task and then record the finish state."""
        if self.whois_search_url != '' and self.whois_search_url is not None:
            self.run_whois_search(self.whois_search_url)
        if self.whois_reverse_url != '' and self.whois_reverse_url is not None:
            # NOTE(review): the returned URL list is discarded here, unlike
            # the per-counterfeit lookups below - confirm this is intended.
            self.run_whois_reverse(self.whois_reverse_url)
        new_gray_lsit = []
        # Drain the list from the end. The explicit emptiness test replaces
        # a try/except IndexError around the whole body, which would also
        # have hidden an IndexError raised by the calls below.
        while self.counterfeit_urls:
            url = self.counterfeit_urls.pop()
            # The web_save_transfer(url) call is currently disabled.
            self.mysql_handle.update_counterfeit_list_statistic(url)
            self.run_whois_search(url)
            new_gray_lsit.extend(self.run_whois_reverse(url))
        self.update_finish_state(new_gray_lsit)

    def run(self):
        """Process entry point: register the pid, flag the engine as running
        (state code 2) and perform the whois work."""
        # Write the child process pid to the engine pids.
        self.write_process_pid(self.task_id)
        self.run_start_time = time.time()
        self.mysql_handle.update_engine_state(
            self.task_id, self.task_start_time, 'whois_search', 2)
        self.whois_operation()