Example #1
0
def import_data(date, side, path):
    """
    Download the PV and srcid files for one side and store the derived
    srcid/query statistics in the database.

    History: 2016-07-04 - the main-search PV data can be unavailable, so the
    logic was rewritten to continue without it.

    A failed PV download is tolerated (``pv`` is then passed on as ``None``);
    a failed srcid download aborts the import without touching the database.

    :param date: date passed to ``convert_path`` and ``db.SaveDataBase``
    :param side: key into ``source_config.SPO_SRC`` / ``source_config.SIDE_NAME``
    :param path: local directory the two files are downloaded into
    :return: None
    """
    pv_ftp = convert_path(source_config.SPO_SRC[side]["pv"], date)
    srcid_ftp = convert_path(source_config.SPO_SRC[side]["srcid"], date)
    side_name = source_config.SIDE_NAME[side]
    pv_path = os.path.join(path, "pv.%s" % side_name)
    srcid_path = os.path.join(path, "srcid.%s" % side_name)

    # The PV file is optional: remember whether it arrived and carry on.
    pv_success = True
    try:
        tools.wget(pv_ftp, pv_path)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logging.info(u"下载PV数据失败!")
        pv_success = False

    # The srcid file is mandatory: without it there is nothing to import.
    try:
        tools.wget(srcid_ftp, srcid_path)
    except Exception:
        logging.info(u"下载srcid数据失败!")
        return

    pv = None
    if pv_success:
        # The file holds a single number; close the handle deterministically.
        with open(pv_path) as pv_file:
            pv = float(pv_file.read().rstrip("\r\n"))

    srcid_query_map, srcid_pv_map, srcid_effect_map = build_srcid_map(
        srcid_path, pv)

    stat_db = db.SaveDataBase(date, side)
    try:
        # Per-srcid statistics: wipe the old rows, then write the new ones.
        stat_db.clear_spo_srcid_stat()
        stat_db.save_spo_srcid_stat("srcid_pv", srcid_pv_map.items())
        logging.info("srcid_pv number:%s" % len(srcid_pv_map))
        if srcid_effect_map:
            stat_db.save_spo_srcid_stat(
                "srcid_effect", srcid_effect_map.items())
            logging.info("srcid_effect number:%s" % len(srcid_effect_map))
        else:
            logging.info("there is no srcid_effect data")

        # Per-query statistics, flattened to [srcid, query, pv] rows.
        stat_db.clear_spo_query_stat()
        query_stat_list = []
        for srcid, query_pv_list in srcid_query_map.items():
            for query_pv in query_pv_list:
                query_stat_list.append(
                    [srcid, query_pv["query"], query_pv["pv"]])
        logging.info("query stat number:%s" % len(query_stat_list))
        stat_db.save_spo_query_stat(query_stat_list)
    finally:
        # Release the database handle even when one of the writes fails.
        stat_db.close()
Example #2
0
def import_data(side, date, path, source, product_id):
    """
    Download a sectioned statistics file and store its contents in the
    mid-page statistics tables for one product.

    The file is split into sections by header lines of the form ``[name]``.
    Lines inside the ``index``, ``position`` and ``detail`` sections are
    handed to ``index_parse``, ``position_parse`` and ``detail_parse``;
    lines before the first header or inside any other section are ignored.

    :param side: side passed to ``db.SaveDataBase``
    :param date: date passed to ``db.SaveDataBase``
    :param path: local file the source is downloaded to and then read from
    :param source: remote location handed to ``tools.wget``
    :param product_id: product whose statistics are cleared and rewritten
    :return: None
    """
    try:
        tools.wget(source, path)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logging.info(u"下载失败!")
        return

    stat_map = {}
    position_list = []
    url_list = []

    section_header = re.compile(r"^\[(\w+)\]$")
    this_mod = None
    with open(path) as fp:
        for line in fp:
            line = line.rstrip("\r\n").decode("utf-8")
            match = section_header.match(line)
            if match:
                # A header switches the current section; unknown sections
                # are reported once here and their lines skipped below.
                this_mod = match.group(1)
                if this_mod not in ("index", "position", "detail"):
                    logging.info("unkown mod: %s" % this_mod)
                continue
            if this_mod == "index":
                index_parse(line, stat_map)
            elif this_mod == "position":
                position_parse(line, position_list)
            elif this_mod == "detail":
                detail_parse(line, url_list)

    # Keep only the first 50 positions, ordered by their first field.
    position_list.sort(key=lambda x: x[0])
    position_list = position_list[:50]

    # NOTE: url_list is stored as parsed; no de-duplication is applied.

    # Write everything to the database, replacing the product's old rows.
    stat_db = db.SaveDataBase(date, side)
    try:
        logging.info("stat number:%s" % len(stat_map))
        stat_db.clear_midpage_stat(product_id)
        stat_db.save_midpage_stat(product_id, stat_map.items())
        logging.info("position number:%s" % len(position_list))
        stat_db.clear_midpage_position_stat(product_id)
        stat_db.save_midpage_position_stat(product_id, position_list)
        logging.info("url number:%s" % len(url_list))
        stat_db.clear_midpage_url_stat(product_id)
        stat_db.save_midpage_url_stat(product_id, url_list)
    finally:
        # Release the database handle even when one of the writes fails.
        stat_db.close()
Example #3
0
def import_spo_data(date):
    """
    Import the daily SPO KPI rows into the database.

    Indices handled:
        -- pv: total daily PV
        -- pv_influence: search PV coverage
        -- accuracy: result accuracy
        -- se_coverage: demand coverage
        -- data_amount: read from the ``datamount`` field of the manual data

    For every side, one row is produced per (product, index) pair that has
    a value, followed by one aggregated row per index with ``pid`` 0.
    Existing rows for the day are cleared before the new ones are saved.

    :param date: day to import, formatted as ``%Y%m%d``
    :return: None
    """
    index_list = ['pv', 'pv_influence', 'accuracy', 'se_coverage', 'data_amount']
    # Indices taken from the manual KPI data:
    # index name -> (field name in that data, whether amount_ratio is copied).
    manual_fields = {
        'accuracy': ('accuracy', True),
        'se_coverage': ('se_coverage', True),
        'data_amount': ('datamount', False),
    }
    spo_product = get_spo_product(date)
    manual_kpi = get_manual_file(date, spo_product)
    # From here on the date is used in its dashed form.
    date = time.strftime("%Y-%m-%d", time.strptime(date, "%Y%m%d"))

    data = []
    for side in (spo_product or ()):
        index_dict = init_index_dict(index_list)
        for category in spo_product[side]:
            srcdict = spo_product[side][category]['srcids']
            pid = spo_product[side][category]['id']

            for val in index_list:
                stats = None
                if val == 'pv':
                    stats = get_pv(srcdict, side, date)
                elif val == 'pv_influence':
                    stats = get_pv_influence(srcdict, side, date)
                elif val in manual_fields:
                    field, needs_ratio = manual_fields[val]
                    side_kpi = manual_kpi[side]
                    # '-' marks a value that has not been filled in.
                    if category in side_kpi and side_kpi[category][field] != '-':
                        entry = side_kpi[category]
                        stats = {'value': float(entry[field])}
                        if needs_ratio:
                            stats['amount_ratio'] = entry['amount_ratio']
                        stats['last_modify_date'] = entry['last_modify_date']

                # Skip indices for which no value is available.
                if stats is None or 'value' not in stats:
                    continue

                row = {
                    'side': side,
                    'product': category,
                    'date': date,
                    'pid': pid,
                    'index': val,
                    'value': stats['value'],
                    'last_modify_date': stats['last_modify_date'],
                }

                # Fold the value into the per-side totals.
                total = index_dict[val]
                if val == 'accuracy' or val == 'se_coverage':
                    # Weighted by amount_ratio (assumed numeric - TODO confirm).
                    total['value'] += stats['value'] * stats['amount_ratio']
                else:
                    total['value'] += stats['value']
                if total['last_modify_date'] < stats['last_modify_date']:
                    total['last_modify_date'] = stats['last_modify_date']
                data.append(row)

        # Aggregated rows for this side.
        for idx in index_dict:
            data.append({
                'side': side,
                'product': u'总体',
                'index': idx,
                'date': date,
                'pid': 0,
                'value': index_dict[idx]['value'],
                'last_modify_date': index_dict[idx]['last_modify_date'],
            })

    stat_db = db.SaveDataBase(date)
    try:
        print("remove data of %s..." % (date))
        stat_db.clear_spo_daily_summary(date, index_list)
        print("insert data into db")
        stat_db.save_spo_index_info(data)
    finally:
        # Release the database handle even when clearing or saving fails.
        stat_db.close()
Example #4
0
def import_spo_data(date, delta=1):
    """
    Import the SPO quality indices; only aggregated rows (``pid`` 0) are
    written:
        -- module_sla: module stability
        -- fe_sla: front-end interaction stability

    For ``date`` and each of the 15 days before it, the kgqc statistics API
    is queried once for all module ids. For every day whose response
    contains ``data``, the stored rows of each index are cleared and one
    row per side (``pc`` / ``wise``) is saved. The value is
    ``(check_num - alarm_num) / check_num`` rounded to 6 places, or 0 when
    the module is missing from the response or ``check_num`` is not
    positive.

    :param date: last day to import, formatted as ``%Y%m%d``
    :param delta: not used by this function; kept so existing calls work
    :return: None
    """
    start_time = time.mktime(time.strptime(date, "%Y%m%d"))
    # index name -> side -> kgqc module id
    index_dict = {}
    index_dict['module_sla'] = {'pc': '9', 'wise': '33'}
    index_dict['fe_sla'] = {'pc': '27', 'wise': '34'}
    kgqc = {
        'host': 'kgqc.baidu.com',
        'port': '80',
        'url': "/statistics/api/getStatisticsForKgdc/"
    }

    # Request all module ids (module_sla and fe_sla, both sides) at once.
    module_ids = []
    for idx in index_dict:
        for side in index_dict[idx]:
            module_ids.append('%s' % index_dict[idx][side])
    data = {'moduleId': ','.join(module_ids)}

    client = Request()
    # Walk from 15 days back up to the requested day itself.
    # NOTE(review): fixed 24h steps assume no DST shift in local time.
    for i in range(15, -1, -1):
        search_date = time.strftime(
            '%Y%m%d', time.localtime(start_time - i * 24 * 60 * 60))
        data['start'] = search_date
        data['end'] = search_date
        kgqc_data = client.get(kgqc['host'], kgqc['port'], kgqc['url'], data)

        if 'data' not in kgqc_data:
            continue
        modules = kgqc_data['data']

        for idx_key in index_dict:
            stat_db = db.SaveDataBase(search_date)
            try:
                print("remove %s data of %s..." % (idx_key, search_date))
                stat_db.clear_spo_daily_summary(search_date, [idx_key])

                for cli_key in index_dict[idx_key]:
                    value = 0
                    print(cli_key)
                    module_id = index_dict[idx_key][cli_key]
                    if module_id in modules:
                        kgqc_item = modules[module_id]['data']
                        if kgqc_item['check_num'] > 0:
                            # Share of checks that did not raise an alarm.
                            passed = (kgqc_item['check_num'] -
                                      kgqc_item['alarm_num'])
                            value = round(
                                float(passed) / kgqc_item['check_num'], 6)

                    indata = {
                        'side': cli_key,
                        'product': u'总体',
                        'pid': 0,
                        'index': idx_key,
                        'value': value,
                        'date': search_date,
                        'last_modify_date': search_date,
                    }
                    print("insert data into db")
                    stat_db.save_spo_index_info([indata])
            finally:
                # One handle is opened per index and day; always release it.
                stat_db.close()