Ejemplo n.º 1
0
def event_type_map_eval_ml_metrics(config, logger, result):
    """Log event type MAP and MAP@k computed with ``ml_metrics.mapk``.

    The overall MAP is logged first (``mapk`` called without an explicit
    ``k``), then MAP@k for every distinct non-zero ground-truth chain length
    found in ``result['gt_all_event_type']``.

    Args:
        config: Experiment configuration; not read by this function.
        logger: Logger handed to ``print_and_log``.
        result: Dict holding ``'chain_name'``, ``'gt_all_event_id'``,
            ``'pred_all_event_id'`` and ``'gt_all_event_type'``, indexed per
            chain.

    Returns:
        The ``result`` dict, unmodified.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type MAP evaluation:")

    gt_ids = result['gt_all_event_id']
    pred_ids = result['pred_all_event_id']

    print_and_log(
        logger, "event type MAP: {}".format(ml_metrics.mapk(gt_ids, pred_ids)))

    # Candidate cut-offs are the distinct ground-truth chain lengths.
    chain_lengths = {
        len(result['gt_all_event_type'][i])
        for i in range(len(result['chain_name']))
    }
    # k == 0 is not a usable cut-off. discard() (unlike list.remove) does not
    # raise when no chain has an empty ground truth.
    chain_lengths.discard(0)
    k_list = sorted(chain_lengths)
    print_and_log(logger, "all possible k: {}".format(k_list))

    for k in k_list:
        map_at_k = ml_metrics.mapk(gt_ids, pred_ids, k)
        print_and_log(logger, "event type MAP@{}: {}".format(int(k), map_at_k))

    return result
Ejemplo n.º 2
0
def event_type_map_eval_given_gt(config, logger, result):
    """Score event type predictions per chain and log their mean.

    For each (ground truth, prediction) pair of event id sequences the
    macro-averaged ``precision_score`` is stored in ``result['et_ap']``; the
    mean of those scores is stored in ``result['et_map']`` and logged.

    Args:
        config: Experiment configuration; not read by this function.
        logger: Logger handed to ``print_and_log``.
        result: Dict holding ``'gt_all_event_id'`` and
            ``'pred_all_event_id'``.

    Returns:
        ``result`` with ``'et_ap'`` and ``'et_map'`` added.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type MAP evaluation:")

    per_chain_scores = []
    result['et_ap'] = per_chain_scores

    chain_pairs = zip(result['gt_all_event_id'], result['pred_all_event_id'])
    per_chain_scores.extend(
        precision_score(gt_ids, pred_ids, average='macro')
        for gt_ids, pred_ids in chain_pairs)

    result['et_map'] = np.mean(per_chain_scores)

    print_and_log(
        logger, "event type MAP: {}".format(round(result['et_map'], 4)))

    return result
Ejemplo n.º 3
0
def event_type_map_eval(config, logger, result):
    """Compute per-chain AP for event types and log the mean (MAP).

    ``compute_AP`` is applied to every (ground truth, prediction) pair of
    event id sequences. The individual values go to ``result['et_ap']`` and
    their mean to ``result['et_map']``.

    Args:
        config: Experiment configuration; not read by this function.
        logger: Logger handed to ``print_and_log``.
        result: Dict holding ``'gt_all_event_id'`` and
            ``'pred_all_event_id'``.

    Returns:
        ``result`` with ``'et_ap'`` and ``'et_map'`` added.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type MAP evaluation:")

    ap_values = result['et_ap'] = []
    gt_chains = result['gt_all_event_id']
    pred_chains = result['pred_all_event_id']
    for gt_ids, pred_ids in zip(gt_chains, pred_chains):
        ap_values.append(compute_AP(gt_ids, pred_ids))

    mean_ap = np.mean(ap_values)
    result['et_map'] = mean_ap

    print_and_log(logger, "event type MAP: {}".format(round(mean_ap, 4)))

    return result
Ejemplo n.º 4
0
def user_cluster_map_eval(config, logger, result):
    """Compute per-chain AP for user clusters and log the mean (MAP).

    ``compute_AP`` is applied to every (ground truth, prediction) pair of
    user cluster sequences. The individual values go to ``result['uc_ap']``
    and their mean to ``result['uc_map']``.

    Args:
        config: Experiment configuration; not read by this function.
        logger: Logger handed to ``print_and_log``.
        result: Dict holding ``'gt_all_user_cluster'`` and
            ``'pred_all_user_cluster'``.

    Returns:
        ``result`` with ``'uc_ap'`` and ``'uc_map'`` added.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "user cluster MAP evaluation:")

    ap_values = result['uc_ap'] = []
    gt_chains = result['gt_all_user_cluster']
    pred_chains = result['pred_all_user_cluster']
    for gt_clusters, pred_clusters in zip(gt_chains, pred_chains):
        ap_values.append(compute_AP(gt_clusters, pred_clusters))

    mean_ap = np.mean(ap_values)
    result['uc_map'] = mean_ap

    print_and_log(logger, "user cluster MAP: {}".format(round(mean_ap, 4)))

    return result
Ejemplo n.º 5
0
def event_type_categorical_accuracy_eval_given_gt(config, logger, result):
    """Log the accuracy of event id predictions pooled over all chains.

    Ground-truth and predicted event ids of every chain are concatenated
    into two flat label lists and scored with ``accuracy_score``. The score
    is stored in ``result['et_cate']``.

    Args:
        config: Experiment configuration; not read by this function.
        logger: Logger handed to ``print_and_log``.
        result: Dict holding ``'gt_all_event_id'`` and
            ``'pred_all_event_id'``.

    Returns:
        ``result`` with ``'et_cate'`` added.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type categorical accuracy evaluation:")

    flat_gt, flat_pred = [], []
    chain_pairs = zip(result['gt_all_event_id'], result['pred_all_event_id'])
    for gt_ids, pred_ids in chain_pairs:
        flat_gt.extend(gt_ids)
        flat_pred.extend(pred_ids)

    accuracy = accuracy_score(flat_gt, flat_pred)
    result['et_cate'] = accuracy

    message = "event type categorical accuracy: {}".format(round(accuracy, 4))
    print_and_log(logger, message)

    return result
Ejemplo n.º 6
0
def user_cluster_categorical_accuracy_eval_given_gt(config, logger, result):
    """Log the accuracy of user cluster predictions pooled over all chains.

    Ground-truth and predicted user clusters of every chain are concatenated
    into two flat label lists and scored with ``accuracy_score``. The score
    is stored in ``result['uc_cate']``.

    Args:
        config: Experiment configuration; not read by this function.
        logger: Logger handed to ``print_and_log``.
        result: Dict holding ``'gt_all_user_cluster'`` and
            ``'pred_all_user_cluster'``.

    Returns:
        ``result`` with ``'uc_cate'`` added.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "user cluster categorical accuracy evaluation:")

    flat_gt, flat_pred = [], []
    chain_pairs = zip(result['gt_all_user_cluster'],
                      result['pred_all_user_cluster'])
    for gt_clusters, pred_clusters in chain_pairs:
        flat_gt.extend(gt_clusters)
        flat_pred.extend(pred_clusters)

    accuracy = accuracy_score(flat_gt, flat_pred)
    result['uc_cate'] = accuracy

    message = "user cluster categorical accuracy: {}".format(
        round(accuracy, 4))
    print_and_log(logger, message)

    return result
Ejemplo n.º 7
0
def time_delay_overall_evaluation(config,
                                  logger,
                                  result,
                                  result_save_path,
                                  plot_ts=True,
                                  chain_length_eval=True):
    """Evaluate predicted time delays against ground truth and log the stats.

    Per chain, the DTW distance (``fastdtw``) between ground truth and
    prediction is appended to ``result["td_DTW"]``. The MSE is appended to
    ``result["td_MSE"]`` when both sequences have the same length, otherwise
    the string ``'null'`` is appended. An empty sequence is replaced by
    ``[-1]`` for these per-chain metrics only; the pooled MAX/MIN/MEAN/STD
    statistics use the real values.

    Optionally plots every chain's time series and logs chain-length
    statistics (length MAE and counts bucketed by length difference).

    Args:
        config: Dict; reads ``config['given_gt']`` and
            ``config['sim_period']['start'/'end']`` (the part before ``'T'``
            is passed to the plotting helper).
        logger: Logger handed to ``print_and_log``.
        result: Dict holding ``'chain_name'``, ``'pred_all_time_delay'`` and
            ``'gt_all_time_delay'``; optionally ``'chains_applied_keep_pred'``.
        result_save_path: Directory under which ``time_delay_plot`` is
            created when ``plot_ts`` is true.
        plot_ts: Whether to call ``plot_time_delay_ts_for_one_chain`` for
            every chain.
        chain_length_eval: Whether to compute and log the chain-length
            statistics.

    Returns:
        ``result`` with ``"td_DTW"`` and ``"td_MSE"`` added.

    Raises:
        ValueError: From ``max``/``min`` when no chain contributes any
            predicted or any ground-truth time delay.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "time delay evaluation:")

    num_chains = len(result['chain_name'])

    # statistics
    pred_all = []
    gt_all = []
    avg_dtw = []
    avg_mse = []
    result["td_DTW"] = list()
    result["td_MSE"] = list()
    for i in range(num_chains):
        raw_pred = result['pred_all_time_delay'][i]
        raw_gt = result['gt_all_time_delay'][i]
        # Empty chains get a [-1] stand-in so the distance metrics can run.
        pred_time_delay = raw_pred if len(raw_pred) != 0 else [-1]
        gt_time_delay = raw_gt if len(raw_gt) != 0 else [-1]

        dtw_distance = fastdtw(gt_time_delay, pred_time_delay)[0]
        avg_dtw.append(dtw_distance)
        result["td_DTW"].append(dtw_distance)

        if len(gt_time_delay) == len(pred_time_delay):
            mse = mean_squared_error(gt_time_delay, pred_time_delay)
            avg_mse.append(mse)
            result["td_MSE"].append(mse)
        else:
            result["td_MSE"].append('null')

        # Pool the real values only; the [-1] stand-ins are never pooled
        # (extending by an empty sequence is a no-op).
        pred_all += raw_pred
        gt_all += raw_gt

    print_and_log(logger, "Average DTW: {}".format(round(np.mean(avg_dtw), 4)))
    if config['given_gt']:
        print_and_log(logger, "Average MSE: {}".format(np.mean(avg_mse)))
    print_and_log(
        logger,
        "MAX predicted: {}, ground truth: {}".format(round(max(pred_all), 4),
                                                     round(max(gt_all), 4)))
    print_and_log(
        logger,
        "MIN predicted: {}, ground truth: {}".format(round(min(pred_all), 4),
                                                     round(min(gt_all), 4)))
    print_and_log(
        logger, "MEAN predicted: {}, ground truth: {}".format(
            round(np.mean(pred_all), 4), round(np.mean(gt_all), 4)))
    print_and_log(
        logger, "STD predicted: {}, ground truth: {}".format(
            round(np.std(pred_all), 4), round(np.std(gt_all), 4)))

    # chain length evaluation
    if chain_length_eval:
        length_mae = []
        length_stat = {
            "gt_chain_0": 0,
            "gt_chain_1": 0,
            "Same_as_gt": 0,
            "diff_1_to_10": 0,
            "diff_10_to_100": 0,
            "diff_100+": 0,
        }
        # length_stat only exists when chain_length_eval is set, so this
        # lookup must stay inside the guard (it raised NameError otherwise).
        if 'chains_applied_keep_pred' in result:
            length_stat["applied_threshold"] = len(
                result["chains_applied_keep_pred"])

    sim_start = config['sim_period']['start'].split('T')[0]
    sim_end = config['sim_period']['end'].split('T')[0]

    if plot_ts:
        time_delay_plot_save_path = os.path.join(result_save_path,
                                                 "time_delay_plot")
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(time_delay_plot_save_path, exist_ok=True)

    if chain_length_eval or plot_ts:
        for i in range(num_chains):
            chain = result['chain_name'][i]
            pred_time_delay = result['pred_all_time_delay'][i]
            gt_time_delay = result['gt_all_time_delay'][i]

            if plot_ts:
                plot_time_delay_ts_for_one_chain(chain,
                                                 time_delay_plot_save_path,
                                                 pred_time_delay,
                                                 gt_time_delay, sim_start,
                                                 sim_end)

            if chain_length_eval:
                gt_length = len(gt_time_delay)
                length_diff = abs(len(pred_time_delay) - gt_length)
                length_mae.append(length_diff)

                if gt_length == 0:
                    length_stat["gt_chain_0"] += 1
                elif gt_length == 1:
                    length_stat["gt_chain_1"] += 1

                # Buckets: 0, [1, 10), [10, 100), [100, inf).
                if length_diff == 0:
                    length_stat["Same_as_gt"] += 1
                elif length_diff < 10:
                    length_stat["diff_1_to_10"] += 1
                elif length_diff < 100:
                    length_stat["diff_10_to_100"] += 1
                else:
                    length_stat["diff_100+"] += 1

    if chain_length_eval:
        length_mae = np.mean(length_mae)

        def share_of_chains(count):
            # Fraction of all simulated chains, rounded for the log output.
            return round(count / num_chains, 4)

        print_and_log(logger, "====================================")
        print_and_log(logger, "chain length evaluation:")

        print_and_log(logger, "MAE: {}".format(round(length_mae, 4)))

        print_and_log(
            logger, "Count of number of simulated "
            "chains: {}".format(num_chains))

        print_and_log(
            logger, "Count of number of chains whose "
            "ground truth length is 0: {}".format(length_stat["gt_chain_0"]))

        print_and_log(
            logger, "Count of number of chains whose "
            "ground truth length is 1: {}".format(length_stat["gt_chain_1"]))

        if 'chains_applied_keep_pred' in result:
            print_and_log(
                logger, "Count of number of predicted chains that "
                "length needed threshold to be applied: {}, "
                "percentage: {} ".format(
                    length_stat["applied_threshold"],
                    share_of_chains(length_stat["applied_threshold"])))

        print_and_log(
            logger, "Count of number of predicted "
            "chains that has "
            "same length as ground truth"
            ": {}, percentage: {}".format(
                length_stat["Same_as_gt"],
                share_of_chains(length_stat["Same_as_gt"])))
        print_and_log(
            logger, "Count of number of predicted chains that "
            "length difference is 1 to 10: {},"
            "percentage: {}".format(
                length_stat["diff_1_to_10"],
                share_of_chains(length_stat["diff_1_to_10"])))
        print_and_log(
            logger, "Count of number of predicted chains that "
            "length difference is 10 to 100: {}, "
            "percentage: {}".format(
                length_stat["diff_10_to_100"],
                share_of_chains(length_stat["diff_10_to_100"])))
        print_and_log(
            logger, "Count of number of predicted chains that "
            "length difference is 100 and above: {}, "
            "percentage: {}".format(
                length_stat["diff_100+"],
                share_of_chains(length_stat["diff_100+"])))

    return result
Ejemplo n.º 8
0
def user_cluster_nlg_eval(config, logger, result):
    """Compute per-chain BLEU-1..4 on user cluster sequences and log averages.

    Each chain's ground-truth and predicted user clusters are joined into
    space-separated strings (an empty chain becomes the single token
    ``'no_event_in_simperiod'``) and scored with
    ``nlgeval.compute_individual_metrics``. BLEU-1 is always recorded;
    BLEU-n for n >= 2 is recorded only when the ground-truth chain has at
    least n elements, otherwise ``'null'`` is stored for that chain and the
    chain is left out of the average.

    Requires the nlg-eval package (https://github.com/Maluuba/nlg-eval),
    imported from ``config['nlgeval_repo_dir']``. NOTE(review): the call
    passes ``no_overlap`` as a tuple, so this presumably relies on a locally
    modified copy of the package -- confirm.

    Args:
        config: Dict; reads ``config['nlgeval_repo_dir']``.
        logger: Logger handed to ``print_and_log``.
        result: Dict holding ``'chain_name'``, ``'gt_all_user_cluster'`` and
            ``'pred_all_user_cluster'``.

    Returns:
        ``result`` with ``'uc_bleu1'`` .. ``'uc_bleu4'`` added.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "user cluster average bleu scores:")

    # Register the repo directory once; appending on every call would grow
    # sys.path with duplicates.
    nlgeval_dir = config['nlgeval_repo_dir']
    if nlgeval_dir not in sys.path:
        sys.path.append(nlgeval_dir)
    from nlgeval import compute_individual_metrics

    bleu_orders = (1, 2, 3, 4)
    empty_chain_token = 'no_event_in_simperiod'

    # avg bleu: only the values that count towards each logged average
    avg_bleu = {'Bleu_{}'.format(n): [] for n in bleu_orders}
    for n in bleu_orders:
        result['uc_bleu{}'.format(n)] = []

    for i in range(len(result['chain_name'])):
        gt_clusters = result['gt_all_user_cluster'][i]
        pred_clusters = result['pred_all_user_cluster'][i]

        if len(gt_clusters) == 0:
            gt_chain = empty_chain_token
        else:
            gt_chain = " ".join(str(ele) for ele in gt_clusters)
        if len(pred_clusters) == 0:
            hy_chain = empty_chain_token
        else:
            hy_chain = " ".join(str(ele) for ele in pred_clusters)

        metrics_dict = compute_individual_metrics(gt_chain,
                                                  hy_chain,
                                                  no_overlap=(False, True),
                                                  no_skipthoughts=True,
                                                  no_glove=True)

        for n in bleu_orders:
            metric = 'Bleu_{}'.format(n)
            per_chain = result['uc_bleu{}'.format(n)]
            # BLEU-1 is always kept; higher orders need a ground-truth chain
            # of at least n elements.
            if n == 1 or len(gt_clusters) >= n:
                per_chain.append(metrics_dict[metric])
                avg_bleu[metric].append(metrics_dict[metric])
            else:
                per_chain.append('null')

    for metric in avg_bleu:
        print_and_log(
            logger, "{}: {}".format(metric,
                                    round(np.average(avg_bleu[metric]), 4)))

    return result
Ejemplo n.º 9
0
def result_evaluation_given_gt(config, result_save_path, only_has_event=False):
    """Load ``result.pickle`` and run the configured given-gt evaluations.

    The pickled result dict is read from ``result_save_path``. With
    ``only_has_event`` set, it is reduced to the chains whose ground-truth
    event id list is non-empty, and only the nine per-chain keys listed
    below survive the reduction. A timestamped log file is opened in
    ``result_save_path``, the evaluations enabled in ``config`` are run in a
    fixed order (the two categorical-accuracy evaluations always run), and
    the result is passed to ``write_result_to_file``.

    Args:
        config: Dict of boolean switches (``'event_type_nlg_eval'``,
            ``'event_type_map_eval'``, ``'event_type_percentage_eval'``,
            ``'user_cluster_nlg_eval'``, ``'user_cluster_map_eval'``,
            ``'user_cluster_percentage_eval'``,
            ``'time_delay_overall_evaluation'``) plus ``'plot_ts'``.
        result_save_path: Directory holding ``result.pickle``; the log file
            is written there too.
        only_has_event: Evaluate only chains with a non-empty ground truth;
            also disables time series plotting.

    Returns:
        None.
    """
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # result_save_path must only point at trusted output.
    pickle_path = os.path.join(result_save_path, 'result.pickle')
    with open(pickle_path, 'rb') as handle:
        result = pickle.load(handle)
    print('result.pickle loaded!')
    print("result.keys: {}\n\n".format(result.keys()))

    if only_has_event:
        kept_keys = (
            'chain_name',
            'pred_all_event_id',
            'pred_all_event_type',
            'pred_all_time_delay',
            'pred_all_user_cluster',
            'gt_all_event_id',
            'gt_all_event_type',
            'gt_all_time_delay',
            'gt_all_user_cluster',
        )
        # Positions of the chains that have at least one ground-truth event.
        kept_positions = [
            i for i in range(len(result['chain_name']))
            if len(result['gt_all_event_id'][i]) != 0
        ]
        result = {
            key: [result[key][i] for i in kept_positions]
            for key in kept_keys
        }

    # logger
    if only_has_event:
        log_prefix = 'evaluate_only_has_event_given_gt_'
    else:
        log_prefix = 'evaluate_all_given_gt_'
    log_name = log_prefix + dt.now().strftime("%Y-%m-%dT%H-%M-%SZ") + '.log'
    logger = set_logger(os.path.join(result_save_path, log_name))

    chain_count = len(result['chain_name'])
    print_and_log(
        logger, "Evaluation over {} simulated chains...".format(chain_count))

    # evaluation processes: event type
    if config['event_type_nlg_eval']:
        result = event_type_nlg_eval(config, logger, result)
    if config['event_type_map_eval']:
        result = event_type_map_eval_given_gt(config, logger, result)
    result = event_type_categorical_accuracy_eval_given_gt(
        config, logger, result)
    if config['event_type_percentage_eval']:
        result = event_type_percentage_eval(config, logger, result)

    # evaluation processes: user cluster
    if config['user_cluster_nlg_eval']:
        result = user_cluster_nlg_eval(config, logger, result)
    if config['user_cluster_map_eval']:
        result = user_cluster_map_eval_given_gt(config, logger, result)
    result = user_cluster_categorical_accuracy_eval_given_gt(
        config, logger, result)
    if config['user_cluster_percentage_eval']:
        result = user_cluster_percentage_eval(config, logger, result)

    # evaluation processes: time delay
    if config['time_delay_overall_evaluation']:
        # Plotting is switched off for the filtered run; config['plot_ts']
        # is only consulted for the full run.
        plot_ts = False if only_has_event else config['plot_ts']
        result = time_delay_overall_evaluation(config,
                                               logger,
                                               result,
                                               result_save_path,
                                               plot_ts=plot_ts,
                                               chain_length_eval=False)

    write_result_to_file(config, result, logger)
Ejemplo n.º 10
0
def user_cluster_percentage_eval(config, logger, result):
    """Log the ground-truth and predicted user cluster distributions.

    User clusters of all chains are pooled and counted. The counts for
    cluster ids 0..100 are passed to ``cal_distribution`` and the resulting
    value for each id is logged, ground truth first and prediction second.

    Args:
        config: Experiment configuration; not read by this function.
        logger: Logger handed to ``print_and_log``.
        result: Dict holding ``'chain_name'``, ``'gt_all_user_cluster'`` and
            ``'pred_all_user_cluster'``.

    Returns:
        The ``result`` dict, unmodified.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "user cluster distribution evaluation:")

    pooled_gt = []
    pooled_pred = []
    for chain_idx in range(len(result['chain_name'])):
        pooled_gt.extend(result['gt_all_user_cluster'][chain_idx])
        pooled_pred.extend(result['pred_all_user_cluster'][chain_idx])

    gt_counts = Counter(pooled_gt)
    pred_counts = Counter(pooled_pred)

    # Cluster ids 0..100 inclusive.
    cluster_ids = range(100 + 1)

    gt_distribution = cal_distribution(
        [gt_counts[cluster] for cluster in cluster_ids])
    pred_distribution = cal_distribution(
        [pred_counts[cluster] for cluster in cluster_ids])

    print_and_log(logger, "!!!!  ground truth distribution: ")
    for cluster in cluster_ids:
        share = round(gt_distribution[cluster], 4)
        print_and_log(logger, "{}: {}".format(cluster, share))

    print_and_log(logger, "!!!!  prediction distribution: ")
    for cluster in cluster_ids:
        share = round(pred_distribution[cluster], 4)
        print_and_log(logger, "{}: {}".format(cluster, share))
    return result
Ejemplo n.º 11
0
def event_type_percentage_eval(config, logger, result):
    """Log the ground-truth and predicted event type distributions.

    Event types of all chains are pooled and counted. The event types are
    ordered by their id from ``config['eventtype_2_id']`` shifted down by
    one, the counts in that order are passed to ``cal_distribution``, and
    the resulting value for each type is logged, ground truth first and
    prediction second.

    Args:
        config: Dict; reads the ``'eventtype_2_id'`` mapping. The shifted ids
            must cover 0..len-1, otherwise the lookup raises ``KeyError``.
        logger: Logger handed to ``print_and_log``.
        result: Dict holding ``'chain_name'``, ``'gt_all_event_type'`` and
            ``'pred_all_event_type'``.

    Returns:
        The ``result`` dict, unmodified.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type distribution evaluation:")

    pooled_gt = []
    pooled_pred = []
    for chain_idx in range(len(result['chain_name'])):
        pooled_gt.extend(result['gt_all_event_type'][chain_idx])
        pooled_pred.extend(result['pred_all_event_type'][chain_idx])

    gt_counts = Counter(pooled_gt)
    pred_counts = Counter(pooled_pred)

    # Invert the mapping with ids shifted down by one.
    id_2_eventtype = {
        config['eventtype_2_id'][name] - 1: name
        for name in config['eventtype_2_id']
    }
    ordered_types = [
        id_2_eventtype[type_id] for type_id in range(len(id_2_eventtype))
    ]

    gt_distribution = cal_distribution(
        [gt_counts[name] for name in ordered_types])
    pred_distribution = cal_distribution(
        [pred_counts[name] for name in ordered_types])

    print_and_log(logger, "!!!!  ground truth distribution: ")
    for type_id, name in enumerate(ordered_types):
        share = round(gt_distribution[type_id], 4)
        print_and_log(logger, "{}: {}".format(name, share))

    print_and_log(logger, "!!!!  prediction distribution: ")
    for type_id, name in enumerate(ordered_types):
        share = round(pred_distribution[type_id], 4)
        print_and_log(logger, "{}: {}".format(name, share))
    return result
Ejemplo n.º 12
0
def testset_creation_given_gt(config, GT_avail=True):  # not implemented!
    create_dataset_start = time.time()

    logger = set_logger(
        os.path.join(
            config['exp_save_dir'], 'testset_creation_given_gt_' +
            dt.now().strftime("%Y-%m-%dT%H-%M-%SZ") + '.log'))
    print_and_log(logger, '{}'.format('testset creation given gt...'))

    testset = dict()
    testset['X_test'] = list()
    testset['Y_test'] = list()
    testset['repo_list'] = list()
    testset['repo2sampleid'] = list()

    eventtype_2_id = config['eventtype_2_id']
    # id_2_eventtype = dict(zip(eventtype_2_id.values(),
    #                           eventtype_2_id.keys()))

    train_start = utc_timestamp(config['train_period']['start'])
    sim_start = utc_timestamp(config['sim_period']['start'])
    sim_end = utc_timestamp(config['sim_period']['end'])

    with open(
            os.path.join(config['exp_save_dir'], "dataset",
                         'unique_repo_train_vali.json'), 'r') as f:
        unique_repo_train_vali = json.load(f)
    repo_list = list(unique_repo_train_vali.keys())

    testset['repo_list'] = repo_list

    with open(config['github_event_repos_path'], 'rb') as f:
        github_event_repos = pickle.load(f)
    github_event_repos_set = set(
        [repo[:22] + '-' + repo[23:] for repo in github_event_repos])

    x_dim = config['x_dim']
    if config['use_repo_embedding']:
        with open(config['embed_vec_path'], 'rb') as f:
            embed_vec_all = pickle.load(f)
    if config['use_repo_profile_features']:
        with open(config['repo_profile_feat_path'], 'r') as f:
            repo_profile_raw = json.load(f)
        repo_profile_feat = dict()
        for repo in repo_profile_raw:
            this_repo_profile_feat = []
            this_repo_profile_feat.append(
                time_delay_normalization(repo_profile_raw[repo]['language'][0],
                                         '10log10_xplus1'))
            this_repo_profile_feat += repo_profile_raw[repo]['user_type']
            repo_profile_feat[repo] = this_repo_profile_feat
    if config['use_repo_idx_features']:
        with open(config['load_repo_path'], 'rb') as f:
            load_repo_train_vali = pickle.load(f)
    if config['use_repo_activity_features']:
        max_sum_act_feat = 0
        max_sum_act_feat_vec = None
        with open(config['repo_activity_feat_path'], 'rb') as f:
            repo_act_feat = pickle.load(f)
        for repo in repo_act_feat:
            for cluster in repo_act_feat[repo]:
                this_repo_cluster_feat = []
                for feat_name in repo_act_feat[repo][cluster]:
                    this_repo_cluster_feat += repo_act_feat[repo][cluster][
                        feat_name]
                repo_act_feat[repo][cluster] = this_repo_cluster_feat
                if sum(this_repo_cluster_feat) > max_sum_act_feat:
                    max_sum_act_feat = sum(this_repo_cluster_feat)
                    max_sum_act_feat_vec = this_repo_cluster_feat
        print_and_log(logger, "max_sum_act_feat: {}".format(max_sum_act_feat))
        print_and_log(logger,
                      "max_sum_act_feat_vec: {}".format(max_sum_act_feat_vec))
    if config['use_user_profile_features']:
        with open(config['user_profile_feat_path'], 'r') as f:
            user_profile_feat = json.load(f)
    if config['use_user_activity_features']:
        with open(config['user_activity_feat_path'], 'r') as f:
            user_act_feat = json.load(f)
    if config['use_user_cluster_one_hot']:
        with open(config['user_cluster_path'], 'r') as f:
            user_clusters = json.load(f)
        empty_no_event_user_cluster = max(user_clusters.values()) + 1
        user_clusters[config['empty_user']] = empty_no_event_user_cluster
        user_clusters['no_event_user'] = empty_no_event_user_cluster
    if config['use_user_cluster_minmax']:
        with open(config['user_cluster_path'], 'r') as f:
            user_clusters = json.load(f)
        max_cluster_id = max(user_clusters.values())
        min_cluster_id = min(user_clusters.values())
        empty_no_event_user_cluster = max_cluster_id + 1
        max_cluster_id = empty_no_event_user_cluster
        max_minus_min = max_cluster_id - min_cluster_id
        if min_cluster_id != 0:
            print("min cluster id is not 0! Need to examine code!")
            pdb.set_trace()
        user_clusters[config['empty_user']] = empty_no_event_user_cluster
        user_clusters['no_event_user'] = empty_no_event_user_cluster
    if config['use_cluster_profile_features']:
        max_sum_profile_feat = 0
        max_sum_profile_feat_vec = None
        with open(config['cluster_profile_feat_path'], 'rb') as f:
            cluster_profile_feat = pickle.load(f)
        for cluster in cluster_profile_feat:
            this_cluster_feat = []
            for feat_name in cluster_profile_feat[cluster]:
                # if feat_name == "geolocation" or feat_name == "user_type":
                #     continue
                this_cluster_feat += cluster_profile_feat[cluster][feat_name]
            cluster_profile_feat[cluster] = this_cluster_feat
            if sum(this_cluster_feat) > max_sum_profile_feat:
                max_sum_profile_feat = sum(this_cluster_feat)
                max_sum_profile_feat_vec = this_cluster_feat
        cluster_profile_feat[empty_no_event_user_cluster] = [
            0
        ] * config['dim_cluster_profile_features']
        print_and_log(logger,
                      "max_sum_profile_feat: {}".format(max_sum_profile_feat))
        print_and_log(
            logger,
            "max_sum_profile_feat_vec: {}".format(max_sum_profile_feat_vec))
    if config['use_cluster_activity_features']:
        max_sum_act_feat = 0
        max_sum_act_feat_vec = None
        with open(config['cluster_activity_feat_path'], 'rb') as f:
            cluster_act_feat = pickle.load(f)
        for cluster in cluster_act_feat:
            for repo in cluster_act_feat[cluster]:
                this_cluster_repo_feat = []
                for feat_name in cluster_act_feat[cluster][repo]:
                    this_cluster_repo_feat += cluster_act_feat[cluster][repo][
                        feat_name]
                cluster_act_feat[cluster][repo] = this_cluster_repo_feat
                if sum(this_cluster_repo_feat) > max_sum_act_feat:
                    max_sum_act_feat = sum(this_cluster_repo_feat)
                    max_sum_act_feat_vec = this_cluster_repo_feat
        print_and_log(logger, "max_sum_act_feat: {}".format(max_sum_act_feat))
        print_and_log(logger,
                      "max_sum_act_feat_vec: {}".format(max_sum_act_feat_vec))

    print_and_log(logger, "x_dim: {}".format(x_dim))

    user_has_no_cluster = set()

    sample_id = 0

    repo_y_et = []
    repo_y_td = []
    repo_y_uc = []

    print_and_log(logger, "gather testing data...")
    for repo in repo_list:  # for each cascade chain
        if (repo in config['repos_to_ignore']) or (repo
                                                   in github_event_repos_set):
            # it is a event repo or repo should be ignore
            continue

        repo_X = []
        repo2sampleid = []

        one_chain_pd = load_jsongz_2_dataframe(
            os.path.join(config['cascade_dir'], repo + '.json.gz'))

        one_chain_pd = one_chain_pd.loc[
            (one_chain_pd['nodeTime'] >= train_start)
            & (one_chain_pd['nodeTime'] <= sim_end)]
        one_chain_pd = one_chain_pd.sort_values(by=['nodeTime'])

        one_chain_event = []
        one_chain_time = []
        one_chain_user = []
        # padding event
        for i in range(config['window_size']):
            one_chain_event.append(config['empty_event_type'])
            one_chain_time.append(config['empty_time_delay'])
            one_chain_user.append(config['empty_user'])
        # <soc>
        one_chain_event.append(eventtype_2_id['<soc>'])
        one_chain_time.append(config['empty_time_delay'])
        one_chain_user.append(config['empty_user'])

        # event sequence
        one_chain_event += [
            eventtype_2_id[event] for event in one_chain_pd['actionType']
        ]
        one_chain_time += [time for time in one_chain_pd['nodeTime']]
        one_chain_user += [user for user in one_chain_pd['nodeUserID']]

        (one_chain_event_new, one_chain_time_new, one_chain_user_new) = \
            insert_no_event_for_a_chain_new(config,
                                            one_chain_event, one_chain_time,
                                            one_chain_user, sim_end+1)
        # if one_chain_event_new != one_chain_event:
        #     pdb.set_trace()

        one_chain_event = one_chain_event_new
        one_chain_time = one_chain_time_new
        one_chain_user = one_chain_user_new

        # calculate time delay sequence
        one_chain_time_delay = []
        for i in range(len(one_chain_time)):
            if (one_chain_event[i] == config['empty_event_type']
                    or one_chain_event[i] == eventtype_2_id['<soc>']):
                one_chain_time_delay.append(config['empty_time_delay'])
            elif one_chain_event[i - 1] == eventtype_2_id['<soc>']:
                one_chain_time_delay.append(config['empty_time_delay'])
            else:
                time_delay = get_time_delay(one_chain_time[i - 1],
                                            one_chain_time[i], 'float')[1]
                if config['time_delay_normalization_func'] is not None:
                    time_delay = time_delay_normalization(
                        time_delay, config['time_delay_normalization_func'])
                one_chain_time_delay.append(time_delay)

        # for each event sample in simulation period
        for i in range(config['window_size'], len(one_chain_event)):
            time_sample_outputevent = one_chain_time[i]
            event_sample_outputevent = one_chain_event[i]

            # if time_sample_outputevent in simulation period:
            # add this sample to testset
            if event_sample_outputevent == config['empty_event_type'] or (
                    event_sample_outputevent == eventtype_2_id['<soc>']):
                continue
            if one_chain_event[i - 1] == eventtype_2_id['<soc>']:
                continue
            if not ((time_sample_outputevent >= sim_start) and
                    (time_sample_outputevent <= sim_end)):
                continue

            input_event_type = \
                one_chain_event[i-config['window_size']:i]
            # input_time = one_chain_time[i-config['window_size']:i]
            input_time_delay = \
                one_chain_time_delay[i-config['window_size']:i]
            input_user = one_chain_user[i - config['window_size']:i]
            input_cluster = []
            for user in input_user:
                try:
                    input_cluster.append(user_clusters[user])
                except KeyError:
                    user_has_no_cluster.add(user)
                    input_cluster.append(user_clusters['no_event_user'])

            output_event_type = one_chain_event[i]
            output_time_delay = one_chain_time_delay[i]
            output_user = one_chain_user[i]
            try:
                output_cluster = user_clusters[output_user]
            except KeyError:
                user_has_no_cluster.add(output_user)
                output_cluster = user_clusters['no_event_user']

            # initialize input vector, and output vector for this sample
            x_vec = []

            # load repo embeding vector
            if config['use_repo_embedding']:
                try:
                    embed_vec = np.array(embed_vec_all[repo[:22] + '/' +
                                                       repo[23:]])
                except KeyError:
                    print_and_log(
                        logger, "Could not find "
                        "embedding vector for {}!".format(repo[:22] + '/' +
                                                          repo[23:]))
                    pdb.set_trace()

            # input feature vector
            for j in range(config['window_size']):  # for each event node
                x_j = []

                if config['use_repo_embedding']:
                    x_j += list(embed_vec)

                if config['use_repo_profile_features']:
                    try:
                        x_j += repo_profile_feat[repo]
                    except KeyError:
                        x_j += [0] * config['dim_repo_profile_features']

                if config['use_repo_idx_features']:
                    try:
                        x_j += load_repo_train_vali[repo]
                    except KeyError:
                        x_j += [0]

                if config['use_repo_activity_features']:
                    if input_cluster[j] == empty_no_event_user_cluster:
                        x_j += [0] * config['dim_repo_activity_features']
                    else:
                        try:
                            repo_thiscluster_act_feat = repo_act_feat[repo][
                                input_cluster[j]]
                            repo_allcluster_act_feat = repo_act_feat[repo][
                                'all_cluster']
                            x_j += repo_thiscluster_act_feat
                            x_j += repo_allcluster_act_feat
                        except KeyError:
                            x_j += [0] * config['dim_repo_activity_features']

                if config['use_user_profile_features']:
                    if input_user[j] == config['empty_user'] or (
                            input_user[j] == 'no_event_user'):
                        x_j += [0] * config['dim_user_profile_features']
                    else:
                        try:
                            x_j += user_profile_feat[input_user[j]]
                        except KeyError:
                            x_j += [0] * config['dim_user_profile_features']

                if config['use_user_activity_features']:
                    if input_user[j] == config['empty_user'] or (
                            input_user[j] == 'no_event_user'):
                        x_j += [0] * config['dim_user_activity_features']
                    else:
                        try:
                            thisrepo_feat = \
                                user_act_feat[input_user[j]][repo]
                        except KeyError:
                            # this user-repo no event in training period
                            thisrepo_feat = \
                                [0] * int(config[
                                        'dim_user_activity_features']/2)
                        allrepo_feat = \
                            user_act_feat[input_user[j]]['all']
                        x_j += thisrepo_feat + allrepo_feat

                if config['use_event_type_one_hot']:
                    event_type_one_hot = \
                        [0] * len(config['eventtype_2_id'])
                    if input_event_type[j] != config['empty_event_type']:
                        event_type_one_hot[input_event_type[j] - 1] = 1
                    x_j += event_type_one_hot

                if config['use_time_delay_features']:
                    x_j += [input_time_delay[j]]

                if config['use_user_cluster_one_hot']:
                    user_cluster_one_hot = \
                        [0] * config['dim_user_cluster_one_hot']
                    user_cluster_one_hot[input_cluster[j]] = 1

                    x_j += user_cluster_one_hot

                if config['use_user_cluster_minmax']:
                    use_user_cluster_minmax = (input_cluster[j] /
                                               max_minus_min)

                    x_j += [use_user_cluster_minmax]

                if config['use_cluster_profile_features']:
                    this_cluster_profile_feat = cluster_profile_feat[
                        input_cluster[j]]
                    x_j += this_cluster_profile_feat

                if config['use_cluster_activity_features']:
                    if input_cluster[j] == empty_no_event_user_cluster:
                        x_j += [0] * config['dim_cluster_activity_features']
                    else:
                        try:
                            cluster_thisrepo_act_feat = cluster_act_feat[
                                input_cluster[j]][repo]
                            cluster_allrepo_act_feat = cluster_act_feat[
                                input_cluster[j]]['all_repo']
                            x_j += cluster_thisrepo_act_feat
                            x_j += cluster_allrepo_act_feat
                        except KeyError:
                            x_j += [0
                                    ] * config['dim_cluster_activity_features']

                if len(x_j) != x_dim:
                    print("len(x_j) != x_dim")
                    pdb.set_trace()
                x_vec.append(x_j)
            if len(x_vec) != config['window_size']:
                print("len(x_vec) != config['window_size']")
                pdb.set_trace()

            repo_X.append(x_vec)

            # output vec
            repo_y_et.append([output_event_type])
            repo_y_td.append([output_time_delay])
            repo_y_uc.append([output_cluster])

            repo2sampleid.append(sample_id)
            sample_id += 1
            # pdb.set_trace()

        # finish gathering test data for this repo
        testset['X_test'] += repo_X
        testset['repo2sampleid'].append(repo2sampleid)
        # pdb.set_trace()

    testset['X_test'] = np.array(testset['X_test'])
    testset['Y_test_et'] = np.array(repo_y_et)
    testset['Y_test_td'] = np.array(repo_y_td)
    testset['Y_test_uc'] = np.array(repo_y_uc)

    print_and_log(logger, "X_test.shape: {}".format(testset['X_test'].shape))
    print_and_log(logger,
                  "Y_test_et.shape: {}".format(testset['Y_test_et'].shape))
    print_and_log(logger,
                  "Y_test_td.shape: {}".format(testset['Y_test_td'].shape))
    print_and_log(logger,
                  "Y_test_uc.shape: {}".format(testset['Y_test_uc'].shape))

    print_and_log(logger,
                  "number of chains for testing: {}".format(len(repo_list)))

    print_and_log(
        logger, "could not find cluster for {} users.".format(
            len(user_has_no_cluster)))

    if len(user_has_no_cluster) > 0:
        with open(
                os.path.join(config['exp_save_dir'], "dataset",
                             'user_has_no_cluster_given_gt.pickle'),
                'wb') as f:
            pickle.dump(user_has_no_cluster, f)

    print_and_log(
        logger,
        "the number of testing samples: {}".format(len(testset['X_test'])))

    with open(
            os.path.join(config['exp_save_dir'], "dataset",
                         'testset_given_gt.pickle'), 'wb') as f:
        pickle.dump(testset, f)

    print_and_log(
        logger,
        "{} took {} min".format("testset creation given gt",
                                (time.time() - create_dataset_start) / 60))
    # pdb.set_trace()
    return testset
Ejemplo n.º 13
0
def testset_creation(config, GT_avail=True):
    """Build the simulation test set and save it under ``exp_save_dir``.

    For every repo listed in ``config['unique_repo_train_vali_path']`` that
    is neither in ``config['repos_to_ignore']`` nor a github event repo, one
    input sample is created from the events that happened before the start
    of the simulation period. When ``GT_avail`` is true, the events inside
    the simulation period are gathered as ground truth for each sample.

    Files written to ``<exp_save_dir>/dataset``: ``testset.pickle``,
    ``unique_repo_test.json`` and ``unique_user_test.json``. A log file
    ``testset_creation_<timestamp>.log`` is created in ``exp_save_dir``.

    Args:
        config: experiment configuration dict. Keys read here include
            ``exp_save_dir``, ``eventtype_2_id``, ``sim_period``,
            ``unique_repo_train_vali_path``, ``github_event_repos_path``,
            ``user_cluster_path``, ``cascade_dir``, ``repos_to_ignore``,
            ``window_size``, ``empty_event_type``, ``empty_time_delay``,
            ``empty_user`` and ``time_delay_normalization_func``.
        GT_avail: whether cascade files contain events of the simulation
            period that should be collected as ground truth.

    Returns:
        dict with keys ``X_test``, ``repo``, ``input_last_event_time`` and
        the ``gt_*`` entries (``gt_event_type``, ``gt_event_id``,
        ``gt_time_delay``, ``gt_user``, ``gt_user_cluster``). The ``gt_*``
        entries stay ``None`` when ``GT_avail`` is false.
    """
    create_dataset_start = time.time()

    logger = set_logger(
        os.path.join(
            config['exp_save_dir'], 'testset_creation_' +
            dt.now().strftime("%Y-%m-%dT%H-%M-%SZ") + '.log'))
    print_and_log(logger, '{}'.format('testset creation...'))

    dataset_dir = os.path.join(config['exp_save_dir'], "dataset")
    os.makedirs(dataset_dir, exist_ok=True)

    testset = {
        'X_test': [],
        'repo': [],
        'input_last_event_time': [],
        'gt_event_type': None,
        'gt_event_id': None,
        'gt_time_delay': None,
        'gt_user': None,
        'gt_user_cluster': None,
    }
    X_test = []

    window_size = config['window_size']
    eventtype_2_id = config['eventtype_2_id']
    id_2_eventtype = {
        event_id: event_type
        for event_type, event_id in eventtype_2_id.items()
    }

    sim_start = utc_timestamp(config['sim_period']['start'])
    sim_end = utc_timestamp(config['sim_period']['end'])

    with open(config['unique_repo_train_vali_path'], 'r') as f:
        unique_repo_train_vali = json.load(f)
    repo_list = list(unique_repo_train_vali)

    with open(config['github_event_repos_path'], 'rb') as f:
        github_event_repos = pickle.load(f)
    # Same id with the character at index 22 replaced by '-'.
    # NOTE(review): presumably converts "<owner>/<name>" ids into the
    # "<owner>-<name>" form used by repo_list -- confirm.
    github_event_repos_set = {
        name[:22] + '-' + name[23:] for name in github_event_repos
    }

    with open(config['user_cluster_path'], 'r') as f:
        user_clusters = json.load(f)
    # One extra cluster id shared by the padding user and 'no_event_user'.
    empty_no_event_user_cluster = max(user_clusters.values()) + 1
    user_clusters[config['empty_user']] = empty_no_event_user_cluster
    user_clusters['no_event_user'] = empty_no_event_user_cluster

    unique_repo_test = dict()
    unique_user_test = dict()

    user_has_no_cluster = set()

    def cluster_of(user):
        """Return the cluster of *user*, recording users without one."""
        try:
            return user_clusters[user]
        except KeyError:
            user_has_no_cluster.add(user)
            return user_clusters['no_event_user']

    # Every sample starts from the same constant input: the
    # 'no_event_for_1month' event id, the normalized value of a 720 time
    # delay and the 'no_event_user' cluster. It does not depend on the
    # repo, so it is computed once.
    last_et = config['eventtype_2_id']['no_event_for_1month']
    last_td = time_delay_normalization(
        720, config['time_delay_normalization_func'])
    last_uc = user_clusters['no_event_user']

    # gather simulation sample input
    print_and_log(logger, "gather simulation sample input...")
    for repo in repo_list:  # for each cascade chain
        if (repo in config['repos_to_ignore']) or (
                repo in github_event_repos_set):
            # it is an event repo or a repo that should be ignored
            continue

        one_chain_pd = load_jsongz_2_dataframe(
            os.path.join(config['cascade_dir'], repo + '.json.gz'))
        # keep only the events before the simulation period
        one_chain_pd = one_chain_pd.loc[one_chain_pd['nodeTime'] < sim_start]
        one_chain_pd = one_chain_pd.sort_values(by=['nodeTime'])

        # window_size padding slots, then the <soc> marker, then the events
        one_chain_event = [config['empty_event_type']] * window_size
        one_chain_event.append(eventtype_2_id['<soc>'])
        one_chain_time = [config['empty_time_delay']] * (window_size + 1)
        one_chain_user = [config['empty_user']] * (window_size + 1)

        one_chain_event += [
            eventtype_2_id[event] for event in one_chain_pd['actionType']
        ]
        one_chain_time += list(one_chain_pd['nodeTime'])
        one_chain_user += list(one_chain_pd['nodeUserID'])

        (one_chain_event, one_chain_time, one_chain_user) = \
            insert_no_event_for_a_chain_new(config, one_chain_event,
                                            one_chain_time, one_chain_user,
                                            sim_start)

        # input_last_event_time
        testset['input_last_event_time'].append(one_chain_time[-1])
        testset['repo'].append(repo)

        # The last window_size entries of the chain. The user window is
        # taken with the same slice as the event window so both describe
        # the same events.
        input_event_type = one_chain_event[-window_size:]
        input_user = one_chain_user[-window_size:]
        for user in input_user:
            if user not in user_clusters:
                user_has_no_cluster.add(user)

        if len(input_event_type) < window_size:
            # Interactive debug stop; not expected unless
            # insert_no_event_for_a_chain_new shortens the chain.
            print("len(input_event_type) < config['window_size']")
            pdb.set_trace()

        X_test.append([last_et, last_td, last_uc])

    testset['X_test'] = X_test

    print_and_log(logger, "X_test length: {}".format(len(testset['X_test'])))

    # gather simulation sample output
    if GT_avail:
        print_and_log(
            logger, "ground truth available. \n"
            "gather simulation sample output...")
        testset['gt_event_type'] = []
        testset['gt_time_delay'] = []
        testset['gt_event_id'] = []
        testset['gt_user'] = []
        testset['gt_user_cluster'] = []

        # for each gathered simulation sample input
        for repo_this_sample, input_last_event_time_this_chain in zip(
                testset['repo'], testset['input_last_event_time']):
            one_chain_pd = load_jsongz_2_dataframe(
                os.path.join(config['cascade_dir'],
                             repo_this_sample + '.json.gz'))
            # keep only the events inside the simulation period
            one_chain_pd = one_chain_pd.loc[
                (one_chain_pd['nodeTime'] >= sim_start)
                & (one_chain_pd['nodeTime'] <= sim_end)]
            one_chain_pd = one_chain_pd.sort_values(by=['nodeTime'])

            one_chain_event_id = []
            one_chain_time = []
            one_chain_user = []

            unique_repo_test[repo_this_sample] = []

            if len(one_chain_pd) == 0:
                # this repo has no events in testing period
                # need to insert no event till end of testing
                (one_chain_event_id, one_chain_time, one_chain_user) = \
                    insert_no_event_for_a_GTchain_who_has_no_event_at_all(
                        config, one_chain_event_id, one_chain_time,
                        one_chain_user, input_last_event_time_this_chain,
                        sim_end)
            else:
                # this sample chain has events in testing period
                for user in one_chain_pd['nodeUserID']:
                    unique_user_test[user] = []

                one_chain_event_id += [
                    eventtype_2_id[event]
                    for event in one_chain_pd['actionType']
                ]
                one_chain_time += list(one_chain_pd['nodeTime'])
                one_chain_user += list(one_chain_pd['nodeUserID'])

                if min(one_chain_time) < sim_start:
                    # Interactive debug stop; the filter above should make
                    # this impossible.
                    print("min(one_chain_time) < sim_start")
                    pdb.set_trace()

                (one_chain_event_id, one_chain_time, one_chain_user) = \
                    insert_no_event_for_a_sim_GTchain(
                        config, one_chain_event_id, one_chain_time,
                        one_chain_user, input_last_event_time_this_chain,
                        sim_end)

            one_chain_event_type = [
                id_2_eventtype[event] for event in one_chain_event_id
            ]

            # no matter one-hot or normalized, here need true cluster
            one_chain_user_cluster = [
                cluster_of(user) for user in one_chain_user
            ]

            # Time delay sequence: the first delay is measured from the last
            # input event, each later one from the previous ground truth
            # event. Ground truth delays are deliberately NOT normalized.
            one_chain_time_delay = []
            previous_time = input_last_event_time_this_chain
            for event_time in one_chain_time:
                one_chain_time_delay.append(
                    get_time_delay(previous_time, event_time, 'float')[1])
                previous_time = event_time

            testset['gt_event_id'].append(one_chain_event_id)
            testset['gt_event_type'].append(one_chain_event_type)
            testset['gt_time_delay'].append(one_chain_time_delay)
            testset['gt_user'].append(one_chain_user)
            testset['gt_user_cluster'].append(one_chain_user_cluster)

    print_and_log(
        logger, "could not find cluster for {} users.".format(
            len(user_has_no_cluster)))

    # check num of 0.0 hours in the ground truth output
    # 'gt_time_delay' is None when no ground truth was gathered, and the
    # ratio is undefined when there are no delay values at all.
    gt_zero_hours_count = 0
    gt_hours_count = 0
    for chain_delays in testset['gt_time_delay'] or []:
        gt_hours_count += len(chain_delays)
        gt_zero_hours_count += sum(1 for delay in chain_delays if delay == 0)
    if gt_hours_count > 0:
        print_and_log(
            logger, "Out of {} ground truth time delay values that the model "
            "needs to predict, {} of them are 0.0 time delay hour.".format(
                gt_hours_count,
                round(gt_zero_hours_count / gt_hours_count, 2)))
    else:
        print_and_log(
            logger, "no ground truth time delay values to check.")

    # save testset
    print_and_log(logger, "save testset ...")

    testset_save_path = os.path.join(dataset_dir, 'testset.pickle')
    with open(testset_save_path, 'wb') as handle:
        pickle.dump(testset, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print_and_log(
        logger, "testset.pickle save in {}".format(dataset_dir))

    with open(os.path.join(dataset_dir, 'unique_repo_test.json'), 'w') as f:
        json.dump(unique_repo_test, f)
    with open(os.path.join(dataset_dir, 'unique_user_test.json'), 'w') as f:
        json.dump(unique_user_test, f)

    print_and_log(
        logger, "the number of unique repos in "
        "testing samples: {}".format(len(unique_repo_test)))
    print_and_log(
        logger, "the number of unique users in "
        "testing samples: {}".format(len(unique_user_test)))

    print_and_log(
        logger, "{} took {}".format("testset creation",
                                    time.time() - create_dataset_start))

    return testset