コード例 #1
0
def set_state(**kwargs):
    """Upsert the state record of a session into the ``state`` collection.

    Keyword Args:
        sid: Session id; used as the filter key and stored in the document.
        state: Value stored under ``state``.
        pre_state: Value stored under ``pre_state``.
        info: Value stored under ``info``.
        need_parameters: Value stored under ``need_parameters``.
        receive: Optional, stored under ``receive``; defaults to ``False``.

    Raises:
        KeyError: If one of the required keywords is missing.

    The write is attempted up to three times, 0.1s apart, until the server
    reports ``ok == 1``. Database errors are logged to ``mongo_log`` and not
    re-raised; nothing is returned.
    """
    sid = kwargs['sid']
    state = kwargs['state']
    pre_state = kwargs['pre_state']
    info = kwargs['info']
    need_parameters = kwargs['need_parameters']
    receive = kwargs.get('receive', False)

    filter_data = {'sid': sid}
    update_data = {
        'sid': sid,
        'state': state,
        'receive': receive,
        'pre_state': pre_state,
        'need_parameters': need_parameters,
        'info': info,
    }
    try:
        for _ in xrange(3):
            result = db['state'].update_one(filter_data, {'$set': update_data},
                                            upsert=True)
            if result.raw_result.get('ok') == 1:
                break
            time.sleep(0.1)
        else:
            # All three attempts came back without ok == 1.
            logger('mongo_log', 'ERROR', 'update state error', **update_data)
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt / SystemExit
        # must still be able to stop the process.
        logger('mongo_log', 'ERROR', traceback.format_exc(), **update_data)
コード例 #2
0
def reset_parameter(**kwargs):
    """Set ``receive = True`` on the ``params`` document of a session.

    Keyword Args:
        state: Object exposing ``sid`` and ``state_name`` attributes.

    Raises:
        KeyError: If ``state`` is not supplied.

    The upsert is attempted up to three times, 0.1s apart. Failures are
    logged to ``mongo_log`` and not re-raised; nothing is returned.
    """
    sid = kwargs['state'].sid
    state_name = kwargs['state'].state_name
    log_context = {'sid': sid, 'state': state_name}
    filter_data = {'sid': sid}
    update_data = {'$set': {'receive': True}}

    try:
        for _ in xrange(3):
            result = db['params'].update_one(filter_data,
                                             update_data,
                                             upsert=True)
            # ``modified_count`` is 0 both when the upsert inserts a new
            # document and when ``receive`` is already True, so it cannot be
            # used as the success signal: a matched or upserted document
            # means the write went through.
            if result.matched_count or result.upserted_id is not None:
                break
            time.sleep(0.1)
        else:
            logger('mongo_log', 'ERROR', 'update params receive error',
                   **log_context)
    except Exception:
        logger('mongo_log', 'ERROR', traceback.format_exc(), **log_context)
コード例 #3
0
    def log(self, level='ERROR', message=None, state=None, missing_dict=None):
        """
        Write one state-machine entry to the ``state_log`` log.

        level: log level handed to ``logger``.
        message: log message; when empty, it is built from
            ``execute_message`` and ``state_name`` and the current state
            fields are attached to the entry.
        state: optional mapping; its ``state_name``, ``execute_status`` and
            ``execute_message`` override the corresponding log fields.
        missing_dict: optional mapping merged into the logged entry.

        usage:
            self.log('ERROR', message)
        """
        log_name = 'state_log'
        state_log = {}

        if self.crawler is not None:
            crawler = self.crawler.__module__.replace("worker.crawler.",
                                                      "").replace(".main", "")
        else:
            crawler = 'None'

        if not message:
            message = '{}:{}'.format(self.execute_message, self.state_name)

            state_log = {
                'state_name': self.state_name,
                'execute_status': self.execute_status,
                'execute_msg': self.execute_message,
                'next_action': self.next_action,
                'state_flag': self.state_flag,
            }

        if self.state_name in ('UnderVerify', 'UnderLogin'):
            # Work on a copy: logging must not strip 'sid' / 'crawler' from
            # the parameters held by the instance.
            parameters = dict(self.parameters)
            parameters.pop('sid', None)
            parameters.pop('crawler', None)
            state_log['parameters'] = parameters

        if missing_dict:
            state_log.update(missing_dict)

        if state:
            state_log['state_name'] = state.get('state_name', '')
            state_log['execute_status'] = state.get('execute_status', '')
            state_log['execute_msg'] = state.get('execute_message', '')

        data = {
            'sid': self.sid,
            'crawler': crawler,
            'state_log': state_log,
        }
        logger(log_name, level, message, **data)
コード例 #4
0
def update_db_data(collection, filter_data, update_data, upsert=True):
    """Apply ``{'$set': update_data}`` to one document of ``collection``.

    Args:
        collection: Name of the collection in ``db``.
        filter_data: Query selecting the document; also passed to the logger
            as keyword arguments when the update fails.
        update_data: Fields to set.
        upsert: Insert the document when nothing matches (default ``True``).

    Returns:
        Always ``True``; failures are only reported through ``mongo_log``.

    The update is attempted up to three times, 0.1s apart.
    """
    try:
        for _ in xrange(3):
            res = db[collection].update_one(filter_data, {'$set': update_data},
                                            upsert=upsert)
            # A matched or upserted document means the write went through;
            # ``modified_count`` alone is 0 for inserts and for no-op updates.
            if res.matched_count or res.upserted_id is not None:
                break
            time.sleep(0.1)
        else:
            logger('mongo_log', 'ERROR', 'update data error', **filter_data)
    except Exception:
        logger('mongo_log', 'ERROR', traceback.format_exc(), **filter_data)

    # NOTE(review): the return value does not reflect failure; kept as-is
    # because callers may rely on it.
    return True
コード例 #5
0
    def log(self, code, msg, resp):
        """
        Crawler log entry point.

        code (error category):
            crawler: crawler error
            user: user error
            website: official-website error
            *network: network error
            *system: system error
        msg:
            status_key: 'crawl_error'
            try...except: 'crawl_error:{}'.format(msg)
        usage:
            self.log('crawler', 'crawl_error:{}'.format(msg), resp)
        """
        session_id = self.kwargs.get('sid', '')
        crawler_name = self.kwargs.get('crawler', '')
        crawler_name = crawler_name.replace('worker.crawler.', '')
        crawler_name = crawler_name.replace('.main', '')

        # Request/response details are only captured for real HTTP responses.
        detail = {}
        if isinstance(resp, requests.models.Response):
            # inspect.stack() must be called from this frame so that index 1
            # is the caller of log().
            detail['func_name'] = inspect.stack()[1][3]
            sent = resp.request
            detail['req_url'] = str(sent.url)
            detail['req_params'] = str(sent.body)
            detail['req_header'] = str(sent.headers)
            detail['res_status_code'] = str(resp.status_code)
            detail['res_header'] = str(resp.headers)
            detail['res_body'] = str(resp.text)

        # Console dump, active only on the host named 'w219'.
        if socket.gethostname() == 'w219':
            import pprint
            print('\n')
            print(msg)
            encoded = {k: v.encode('utf-8') for k, v in detail.items()}
            print(str(encoded).decode('string-escape'))
            print('line: ')
            print(inspect.stack()[1][2])
            print('\n')

        payload = {
            'sid': session_id,
            'crawler': crawler_name,
            'crawler_log': detail,
        }
        logger('crawler_log', code, msg, **payload)
コード例 #6
0
def insert_db_data(table, data):
    """Insert one document (mapping) or several documents into ``db[table]``.

    Args:
        table: Collection name.
        data: A dict (or dict subclass) for a single insert, otherwise an
            iterable of documents for a bulk insert.

    Returns:
        * ``True`` when ``data`` is empty (an error is logged, nothing is
          written) or when the bulk insert did not raise.
        * The ``inserted_id`` of the new document for a single insert.
        * ``False`` when the bulk insert raised; the traceback is logged.

    Exceptions from the single-document insert are not caught here.
    """
    if not data:
        logger('mongo_log', 'ERROR', 'There is no data to insert',
               **{'insert_data': data})
        return True

    # isinstance() so that dict subclasses (e.g. OrderedDict) take the
    # single-document path instead of failing inside insert_many().
    if isinstance(data, dict):
        result = db[table].insert_one(data)
        return result.inserted_id

    try:
        # NOTE(review): a pymongo InsertManyResult is presumably always
        # truthy, so this loop is expected to stop after the first call.
        for _ in xrange(3):
            if db[table].insert_many(data):
                break
            time.sleep(0.1)
    except Exception:
        logger('mongo_log', 'ERROR', traceback.format_exc(),
               **{'insert_data': data})
        return False

    return True
コード例 #7
0
def save_status(sid, status, message, cache_time=None):
    """Store the final status of a session in the ``sid_info`` collection.

    Args:
        sid: Session id used as the filter key.
        status: Value stored under ``status``.
        message: Value stored under ``message``.
        cache_time: Optional; stored under ``cache_time`` when truthy.

    Returns:
        ``False`` when the update raised (the traceback is logged),
        otherwise ``True``, including when the record is skipped because it
        carries a ``crawler_channel`` field and when all three attempts
        report no matching or upserted document (an error is logged).
    """
    timestamp = int(time.mktime(time.localtime()))
    filter_data = {'sid': sid}
    update_data = {'status': status, 'message': message, 'end_time': timestamp}
    if cache_time:
        update_data['cache_time'] = cache_time
    log_context = {'sid': sid, 'update_data': update_data}

    # limu channel: records that carry a crawler_channel field are left
    # untouched.
    sid_info = db['sid_info'].find_one({
        'sid': sid,
        'crawler_channel': {
            '$exists': True
        }
    })
    if sid_info:
        return True

    try:
        for _ in xrange(3):
            result = db['sid_info'].update_one(filter_data,
                                               {'$set': update_data},
                                               upsert=True)
            # ``modified_count`` is 0 when the upsert creates the document,
            # which would be misreported as a failure; a matched or upserted
            # document is the real success signal.
            if result.matched_count or result.upserted_id is not None:
                break
            time.sleep(0.1)
        else:
            logger('mongo_log', 'ERROR', 'save status error', **log_context)
    except Exception:
        logger('mongo_log', 'ERROR', traceback.format_exc(), **log_context)
        return False
    return True
コード例 #8
0
def data_fusion(**kwargs):
    """Fuse freshly crawled bill logs with the cached bill data of a number.

    Keyword Args:
        final_bill_logs: Crawled bill logs; returned unchanged on failure.
        missing_month_list: Months reported as missing by the crawl.
        tel: Phone number; key for the cache lookup via ``get_tel_data``.
        sid: Session id, only forwarded to ``register_other_bill_log``.
        pad_code: Only forwarded to ``register_other_bill_log``.

    Returns:
        tuple: ``(call_log_list, missing_month_list, cache_hit_month,
        fusion_cost_time)`` on success, where ``call_log_list`` holds the
        fused values whose key is six characters long (presumably ``YYYYMM``
        month keys - confirm against ``data_cut``).
        ``(final_bill_logs, missing_month_list, [], 0.00)`` when
        standardisation/fusion or the cache write raises.

    Raises:
        KeyError: If one of the required keywords is missing.
    """
    fusion_start_time = time.time()
    final_bill_logs = kwargs['final_bill_logs']
    missing_month_list = kwargs['missing_month_list']
    tel = kwargs['tel']
    sid = kwargs['sid']
    pad_code = kwargs['pad_code']
    bill_log_cache = get_tel_data(tel)
    cache_hit_month = []
    all_miss_list = list(set(missing_month_list))

    if bill_log_cache is None:
        # No cache entry yet: start a fresh one stamped with the current time.
        bill_log_cache = {'tel': tel, 'uti': str(int(time.time()))}

    # Best-effort registration of the raw inputs; a failure here must not
    # abort the fusion itself.
    try:
        from datetime import datetime
        ori_dict = {
            'sid': sid,
            'tel': tel,
            'pad_code': pad_code,
            'final_bill_logs': final_bill_logs,
            'missing_month_list': missing_month_list,
            'bill_log_cache': bill_log_cache,
            'expiretime': datetime.utcnow(),
        }
        register_other_bill_log(ori_dict)
    except Exception:
        print(traceback.format_exc())

    try:
        # Standardise the crawled data.
        craw_data, craw_month_list = craw_data_std(tel, final_bill_logs,
                                                   missing_month_list)
        craw_month_list = clean_month_list(craw_month_list, all_miss_list)
        all_month = list(set(craw_month_list + all_miss_list))
        if craw_month_list or all_miss_list:
            (craw_data, bill_log_cache, missing_month_list,
             cache_hit_month) = data_fusion_kernel(
                 bill_log_cache, craw_data, craw_month_list, all_miss_list)

        craw_data = cross_key_name(craw_data)
        # Prune both data sets down to the months involved in this run.
        bill_log_cache = data_cut(bill_log_cache, all_month)
        craw_data = data_cut(craw_data, all_month)
    except Exception:
        print(traceback.format_exc())
        return final_bill_logs, missing_month_list, [], 0.00

    try:
        insert_data(bill_log_cache)
    except Exception:
        print(traceback.format_exc())
        return final_bill_logs, missing_month_list, [], 0.00

    call_log_list = [
        value for key, value in craw_data.items() if len(key) == 6
    ]
    cache_hit_month = list(set(cache_hit_month))
    missing_month_list = list(set(missing_month_list))
    fusion_end_time = time.time()
    fusion_cost_time = fusion_end_time - fusion_start_time

    log_data = {
        'tel': tel,
        'bill_cache_hit_month': cache_hit_month,
        'bill_missing_month_list': missing_month_list,
        'bill_fusion_cost_time': fusion_cost_time,
        'bill_fusion_end_time': fusion_end_time,
    }
    logger('bill_data_fusion', 'INFO', '', **log_data)
    return call_log_list, missing_month_list, cache_hit_month, fusion_cost_time
コード例 #9
0
def multiprocess_dama(sid, img_data, code_type):
    """Run ``dama_proccess`` in staggered child processes and return the first answer.

    A first worker is started immediately; a second one is added after 20s
    and a third after 40s if no usable answer has arrived. All workers write
    to the same pipe. The function polls the pipe about once per second and
    gives up after 60s.

    Args:
        sid: Session id, forwarded to the workers and attached to the log.
        img_data: Forwarded to ``dama_proccess``.
        code_type: Forwarded to ``dama_proccess``.

    Returns:
        The first message received from a worker whose length is greater
        than 2, or ``('crawl_error', u"打码平台超时", "", None)`` on timeout.
    """
    log_name = 'multiprocess_dama'
    data = {'sid': sid}
    parent_conn, child_conn = Pipe()
    begin_time = time.time()

    def _start_worker(flag):
        # Launch one worker process; ``flag`` is the stage value handed to
        # dama_proccess (0, 1, 2 for the first, second and third worker).
        worker = Process(target=dama_proccess,
                         args=(sid, img_data, code_type, flag, child_conn))
        # ``daemon`` has to be spelled correctly and set before start(),
        # otherwise the flag silently has no effect.
        worker.daemon = True
        worker.start()

    try:
        _start_worker(0)
        proccess_flag = 1
        print('A进程启动完毕')
        data['p1'] = 'p1'
        data['A进程启动完毕'] = 'A进程启动完毕'
        while True:
            elapsed = time.time() - begin_time
            if elapsed > 20 and proccess_flag == 1:
                _start_worker(proccess_flag)
                proccess_flag = 2
                print('B进程启动完毕')
                data['B进程启动完毕'] = 'B进程启动完毕'
                data['p2'] = 'p2'
            if elapsed > 40 and proccess_flag == 2:
                _start_worker(proccess_flag)
                proccess_flag = 3
                print('C进程启动完毕')
                data['C进程启动完毕'] = 'C进程启动完毕'
                data['p3'] = 'p3'

            data['proccess_flag'] = proccess_flag
            if parent_conn.poll():
                task = parent_conn.recv()
                # Messages with two or fewer items are ignored.
                if len(task) > 2:
                    # {!r} keeps the message printable whatever it contains.
                    print('收到管道信息: {!r}'.format(task))
                    data['收到管道信息'] = task
                    logger(log_name, task[0], task[1], **data)
                    return task
            if elapsed > 60:
                break
            time.sleep(1)
            print(elapsed)
    finally:
        # Release this process's pipe ends on every exit path, including
        # the timeout.
        parent_conn.close()
        child_conn.close()

    logger(log_name, 'crawl_error', '打码平台超时', **data)
    return 'crawl_error', u"打码平台超时", "", None
コード例 #10
0
def data_fusion(**kwargs):
    """Fuse freshly crawled call logs with the cached call-log data of a number.

    Keyword Args:
        final_call_logs: Crawled call logs; returned unchanged on failure.
        missing_month_list: Months reported as missing.
        possibly_missing_list: Months reported as possibly missing.
        part_missing_list: Months reported as partially missing.
        tel: Phone number; key for the cache lookup via ``get_tel_data``.

    Returns:
        tuple: ``(call_log_list, missing_month_list, possibly_missing_list,
        part_missing_list, cache_hit_month, fusion_cost_time)`` on success.
        When there is neither a cache entry nor crawled data, or when
        fusion or the cache write raises, the tuple is
        ``(final_call_logs, missing_month_list, possibly_missing_list,
        part_missing_list, [], 0)``.

    Raises:
        KeyError: If one of the required keywords is missing.
    """
    # Expected kwargs: final_call_logs, missing_month_list,
    # possibly_missing_list, part_missing_list, tel

    fusion_start_time = time.time()
    final_call_logs = kwargs['final_call_logs']
    missing_month_list = kwargs['missing_month_list']
    possibly_missing_list = kwargs['possibly_missing_list']
    part_missing_list = kwargs['part_missing_list']
    tel = kwargs['tel']
    call_log_cache = get_tel_data(tel)
    cache_hit_month = []
    all_move_miss = []
    all_miss_list = list(
        set(missing_month_list + possibly_missing_list + part_missing_list))
    # Nothing cached and nothing crawled: there is nothing to fuse.
    if call_log_cache == None and len(final_call_logs) < 1:
        return final_call_logs,missing_month_list,possibly_missing_list,part_missing_list,[],0
    try:
        # Standardise the crawled data.
        craw_data, craw_month_list = craw_data_std(tel, final_call_logs,
                                                   all_miss_list)

        # NOTE(review): same expression as the initial all_miss_list; it only
        # differs if craw_data_std mutated one of the three lists in place -
        # confirm whether this recomputation is needed.
        all_miss_list = list(
            set(missing_month_list + possibly_missing_list +
                part_missing_list))
        craw_month_list = clean_month_list(craw_month_list, all_miss_list)
        all_month = list(set(craw_month_list + all_miss_list))
        # One kernel pass per month list. Each pass rebinds craw_data,
        # call_log_cache and the list it processed, so the order of these
        # passes matters. The last argument (1 / 0 / 3 / 2) is a code
        # interpreted by data_fusion_kernel - presumably the same codes as
        # the move_miss values handled below; TODO confirm.
        if len(craw_month_list) > 0:
            craw_data, call_log_cache, craw_month_list, cache_hit_list, move_miss = data_fusion_kernel(
                call_log_cache, craw_data, craw_month_list, 1)
            cache_hit_month.extend(cache_hit_list)
            all_move_miss.append(move_miss)
        if len(missing_month_list) > 0:
            craw_data, call_log_cache, missing_month_list, cache_hit_list, move_miss = data_fusion_kernel(
                call_log_cache, craw_data, missing_month_list, 0)
            cache_hit_month.extend(cache_hit_list)
            all_move_miss.append(move_miss)
        if len(possibly_missing_list) > 0:
            craw_data, call_log_cache, possibly_missing_list, cache_hit_list, move_miss = data_fusion_kernel(
                call_log_cache, craw_data, possibly_missing_list, 3)
            cache_hit_month.extend(cache_hit_list)
            all_move_miss.append(move_miss)
        if len(part_missing_list) > 0:
            craw_data, call_log_cache, part_missing_list, cache_hit_list, move_miss = data_fusion_kernel(
                call_log_cache, craw_data, part_missing_list, 2)
            cache_hit_month.extend(cache_hit_list)
            all_move_miss.append(move_miss)


        # Re-file the months reported in each move_miss mapping:
        # value 0 -> missing, 2 -> partially missing, 3 -> possibly missing.
        for move_ in all_move_miss:
            for key, value in move_.items():
                if value == 0:
                    missing_month_list.append(key)
                elif value == 2:
                    part_missing_list.append(key)
                elif value == 3:
                    possibly_missing_list.append(key)
        craw_data = cross_key_name(craw_data)
        # Prune both data sets down to the months involved in this run.
        call_log_cache = data_cut(call_log_cache, all_month)
        craw_data = data_cut(craw_data, all_month)
    except:
        error_msg = traceback.format_exc()
        print error_msg
        return final_call_logs,missing_month_list,possibly_missing_list,part_missing_list,[],0

    try:
        # Write the updated cache back; the return value is not used.
        ret = insert_data(call_log_cache)
        pass
    except:
        message = traceback.format_exc()
        print message
        return final_call_logs,missing_month_list,possibly_missing_list,part_missing_list,[],0
    # Flatten the fused data: every record under 'det' of a dict entry that
    # has a 'status' key is tagged with its key as 'month' and collected.
    call_log_list = []
    for key, value in craw_data.items():
        if type(value).__name__ == 'dict':
            if value.has_key('status'):
                for x in value['det']:
                    x['month'] = key
                    call_log_list.append(x)
    # De-duplicate the month lists before reporting them.
    cache_hit_month = list(set(cache_hit_month))
    possibly_missing_list = list(set(possibly_missing_list))
    part_missing_list = list(set(part_missing_list))
    missing_month_list = list(set(missing_month_list))
    fusion_end_time = time.time()
    fusion_cost_time = fusion_end_time - fusion_start_time

    log_data = {
        'tel': tel,
        'cache_hit_month': cache_hit_month,
        'possibly_missing_list': possibly_missing_list,
        'part_missing_list': part_missing_list,
        'missing_month_list': missing_month_list,
        'fusion_cost_time': fusion_cost_time,
        'fusion_end_time': fusion_end_time,
        'all_move_miss': all_move_miss
    }
    logger('data_fusion', 'INFO', '', **log_data)
    return call_log_list, missing_month_list, possibly_missing_list, part_missing_list, cache_hit_month, fusion_cost_time