Esempio n. 1
0
def get_all_combination_withCoverage_best_graph_Cand_boost(KB_terms, performance_runs, Ques_terms, Ans_terms, subgraph_size):
    """Return the best-scoring combination of runs, boosting answer-term coverage.

    Every combination of 2 up to ``subgraph_size - 1`` keys of
    ``performance_runs`` is a candidate subgraph.  Each candidate is scored as

        avg_score * (1 + 12 * answer_coverage) * (1 + query_coverage)

    where ``avg_score`` is the mean ``performance_runs`` value of the scored
    members and each coverage is ``len(get_intersection(covered, terms))``
    divided by ``max(1, len(terms))``, with ``covered`` built by folding
    ``get_union`` over the scored members' ``KB_terms`` entries.

    Per the original author's note, the question/answer terms play the role
    of gold labels and ``KB_terms`` holds the justification terms.

    Args:
        KB_terms: mapping from run key to that run's terms (handed to
            ``get_union`` / ``get_intersection``).
        performance_runs: mapping from run key to a numeric score.
        Ques_terms: sized collection of question terms.
        Ans_terms: sized collection of answer terms.
        subgraph_size: exclusive upper bound on the candidate size.

    Returns:
        The winning tuple of run keys (the first one on ties), or the string
        ``"Crashed"`` when there is no candidate to rank.
    """
    runs = list(performance_runs.keys())

    meta_subgraphs = []
    for size in range(2, subgraph_size):
        meta_subgraphs.extend(combinations(runs, size))

    if not meta_subgraphs:
        # Historical sentinel for "nothing to rank"; callers compare against it.
        return "Crashed"

    meta_graph_scores = []
    for subgraph in meta_subgraphs:
        # NOTE(review): the last member of every candidate is sliced off, so
        # it contributes neither to the average score nor to the coverage
        # even though it is part of the returned tuple.  Kept as-is because
        # changing it alters which subgraph wins -- confirm the intent.
        scored_members = subgraph[:-1]

        prediction_coverage = KB_terms[scored_members[0]]
        member_scores = []
        for position, rk1 in enumerate(scored_members):
            member_scores.append(performance_runs[rk1])
            # Same sequence of get_union calls as the original nested loop.
            for rk2 in scored_members[position + 1:]:
                prediction_coverage = get_union(prediction_coverage, KB_terms[rk2])

        avg_score = sum(member_scores) / float(len(member_scores))
        final_query_coverage = len(get_intersection(prediction_coverage, Ques_terms)) / max(1, float(len(Ques_terms)))
        final_ans_coverage = len(get_intersection(prediction_coverage, Ans_terms)) / max(1, float(len(Ans_terms)))

        # Answer coverage is weighted 12x relative to query coverage.
        meta_graph_scores.append(avg_score * (1 + 12 * final_ans_coverage) * (1 + final_query_coverage))

    best_sub_graph_index = meta_graph_scores.index(max(meta_graph_scores))
    return meta_subgraphs[best_sub_graph_index]
Esempio n. 2
0
def get_all_combination_withCoverage_best_graph(
    KB_terms, performance_runs, Ques_terms, Ans_terms, subgraph_size=4
):
    """Return the combination of runs with the best score-times-coverage value.

    Every combination of ``subgraph_size`` keys of ``performance_runs`` is a
    candidate.  A candidate's score is ``avg_score * final_coverage`` where
    ``avg_score`` is the mean ``performance_runs`` value of the scored
    members and ``final_coverage`` is
    ``len(get_intersection(covered, gold_labels)) / max(1, len(gold_labels))``
    with ``gold_labels = Ques_terms + Ans_terms`` and ``covered`` built by
    folding ``get_union`` over the scored members' ``KB_terms`` entries.

    Per the original author's note, the question/answer terms play the role
    of gold labels and ``KB_terms`` holds the justification terms.

    Args:
        KB_terms: mapping from run key to that run's terms.
        performance_runs: mapping from run key to a numeric score.
        Ques_terms: question terms; concatenated with ``Ans_terms`` via ``+``.
        Ans_terms: answer terms.
        subgraph_size: number of runs per candidate; defaults to 4, the value
            that used to be hard-coded.

    Returns:
        The winning tuple of run keys (the first one on ties), or the string
        ``"Crashed"`` when there are fewer runs than ``subgraph_size``.

    Raises:
        ValueError: if ``subgraph_size`` is smaller than 2, because such a
            candidate has no scored member.
    """
    if subgraph_size < 2:
        raise ValueError("subgraph_size must be at least 2, got %r" % (subgraph_size,))

    runs = list(performance_runs.keys())
    gold_labels = Ques_terms + Ans_terms

    meta_subgraphs = list(combinations(runs, subgraph_size))
    if not meta_subgraphs:
        # Historical sentinel for "nothing to rank"; callers compare against it.
        return "Crashed"

    # Guarded so that empty question and answer term lists give coverage 0
    # instead of an uncaught ZeroDivisionError.
    gold_count = max(1, float(len(gold_labels)))

    meta_graph_scores = []
    for subgraph in meta_subgraphs:
        # NOTE(review): the last member of every candidate is sliced off, so
        # it contributes neither to the average score nor to the coverage
        # even though it is part of the returned tuple.  Kept as-is because
        # changing it alters which subgraph wins -- confirm the intent.
        scored_members = subgraph[:-1]

        prediction_coverage = KB_terms[scored_members[0]]
        member_scores = []
        for position, rk1 in enumerate(scored_members):
            member_scores.append(performance_runs[rk1])
            # Same sequence of get_union calls as the original nested loop.
            for rk2 in scored_members[position + 1:]:
                prediction_coverage = get_union(prediction_coverage, KB_terms[rk2])

        avg_score = sum(member_scores) / float(len(member_scores))
        final_coverage = len(get_intersection(prediction_coverage, gold_labels)) / gold_count
        meta_graph_scores.append(avg_score * final_coverage)

    best_sub_graph_index = meta_graph_scores.index(max(meta_graph_scores))
    return meta_subgraphs[best_sub_graph_index]
Esempio n. 3
0
def get_95percent_overlap_interval(pred_labels_over_runs, subgraph_size=2):
    """Pool pairwise overlaps between runs and summarise them.

    For every combination of ``subgraph_size`` run keys, ``calculate_overlap``
    is evaluated on each unordered pair of runs inside that combination.  All
    values are pooled into one list and handed to ``mean_confidence_interval``
    (presumably a mean with its 95% confidence interval, going by the name --
    the helper is defined elsewhere).

    Args:
        pred_labels_over_runs: mapping from run key to that run's predictions.
        subgraph_size: number of runs per combination (default 2).

    Returns:
        A pair ``(summary, pair_overlaps)``: the result of
        ``mean_confidence_interval`` on the pooled overlaps, and a dict keyed
        by ``str(run_a) + str(run_b)`` holding the most recent overlap
        computed for that pair.
    """
    run_keys = list(pred_labels_over_runs.keys())

    pooled_overlaps = []
    pair_overlaps = {}

    for subgraph in combinations(run_keys, subgraph_size):
        # combinations(subgraph, 2) walks the pairs in the same positional
        # order as the classic "i, then everything after i" double loop.
        for run_a, run_b in combinations(subgraph, 2):
            overlap = calculate_overlap(pred_labels_over_runs[run_a], pred_labels_over_runs[run_b])
            pooled_overlaps.append(overlap)
            # Keys are plain string concatenations, so distinct pairs whose
            # concatenations coincide overwrite each other.
            pair_overlaps[str(run_a) + str(run_b)] = overlap

    return mean_confidence_interval(pooled_overlaps), pair_overlaps
Esempio n. 4
0
def get_best_linear_subgraph(pred_labels_over_runs, performance_runs, gold_labels_list, A1=1, A2=1, A3=1):
    """Return the combination of runs maximising a linear score.

    Every combination of 2 up to ``len(runs)`` run keys is a candidate.  Each
    candidate is scored as

        A1 * avg_score + A2 * avg_overlap + A3 * final_coverage

    where ``avg_overlap`` is the mean ``calculate_overlap`` over all pairs of
    members, and ``final_coverage`` is the share of gold labels found (via
    ``get_intersection``) in the ``get_union`` of the members' predictions.
    The gold labels are the indices ``0 .. len(gold_labels_list) - 1``, not
    the list's contents.

    Args:
        pred_labels_over_runs: mapping from run key to that run's predictions.
        performance_runs: mapping from run key to a numeric score.
        gold_labels_list: sized collection; only its length is used.
        A1: weight of the average performance term.
        A2: weight of the average overlap term.
        A3: weight of the coverage term.

    Returns:
        The winning tuple of run keys (the first one on ties).

    Raises:
        ValueError: if fewer than two runs are supplied, so no candidate
            subgraph exists.
    """
    runs = list(pred_labels_over_runs.keys())
    gold_labels = list(range(len(gold_labels_list)))
    print("the gold_labels list looks like: ", gold_labels)

    meta_subgraphs = []
    for size in range(2, len(runs) + 1):
        meta_subgraphs.extend(combinations(runs, size))

    # Guarded so an empty gold list gives coverage 0 rather than a
    # ZeroDivisionError, matching the sibling scoring functions.
    gold_count = float(max(1, len(gold_labels)))

    meta_graph_scores = []
    meta_graph_coverage_scores = []

    for subgraph in meta_subgraphs:
        pair_overlaps = []
        member_scores = []
        prediction_coverage = pred_labels_over_runs[subgraph[0]]

        # NOTE(review): the outer loop stops before the last member, so that
        # member's performance is left out of avg_score although it does take
        # part in the overlap and coverage terms.  Kept as-is because changing
        # it alters which subgraph wins -- confirm the intent.
        for position, rk1 in enumerate(subgraph[:-1]):
            member_scores.append(performance_runs[rk1])
            for rk2 in subgraph[position + 1:]:
                pair_overlaps.append(float(calculate_overlap(pred_labels_over_runs[rk1], pred_labels_over_runs[rk2])))
                prediction_coverage = get_union(prediction_coverage, pred_labels_over_runs[rk2])

        avg_score = sum(member_scores) / float(len(member_scores))
        avg_overlap = sum(pair_overlaps) / float(len(pair_overlaps))
        final_coverage = len(get_intersection(prediction_coverage, gold_labels)) / gold_count

        meta_graph_coverage_scores.append(final_coverage)
        meta_graph_scores.append((A1 * avg_score + A2 * float(avg_overlap)) + A3 * final_coverage)

    print ("the len of meta graph scores are : ", len(meta_graph_scores))
    if not meta_graph_scores:
        # Same exception type max() used to raise here, with a usable message.
        raise ValueError("need at least two runs to build a subgraph, got %d" % len(runs))

    best_score = max(meta_graph_scores)
    best_sub_graph_index = meta_graph_scores.index(best_score)
    print ("best subgraph from linear regression is: ", meta_subgraphs[best_sub_graph_index], best_score, meta_graph_coverage_scores[best_sub_graph_index])

    return meta_subgraphs[best_sub_graph_index]
Esempio n. 5
0
def get_all_combination_withCoverage_best_graph_Cand_boost_withIDF_forLR(KB_terms, performance_runs, Ques_terms, Ans_terms, subgraph_size, IDF_vals, All_x_features, All_y_features, gold_labels):
    """Rank candidate subgraphs with fixed regression weights and log LR training rows.

    Every combination of 2 up to ``subgraph_size`` keys of
    ``performance_runs`` is a candidate.  For each candidate four features are
    computed -- average performance, average pairwise ``calculate_overlap``,
    answer coverage and query coverage (both via ``get_intersection_withIDF``
    divided by ``max(1, len(terms))``) -- and combined as

        0.1042 * avg_score - 0.0352 * avg_overlap
        + 0.0414 * answer_coverage + 0.0571 * query_coverage

    As a side effect one row per candidate is appended to the caller's
    ``All_x_features`` (the four features) and ``All_y_features`` (the F1 of
    the candidate's run keys against ``gold_labels``).

    Args:
        KB_terms: mapping from run key to that run's terms.
        performance_runs: mapping from run key to a numeric score.
        Ques_terms: sized collection of question terms.
        Ans_terms: sized collection of answer terms.
        subgraph_size: inclusive upper bound on the candidate size.
        IDF_vals: passed through to ``get_intersection_withIDF``.
        All_x_features: list extended in place with feature rows.
        All_y_features: list extended in place with F1 targets.
        gold_labels: collection of hashable gold run keys.

    Returns:
        ``(best_subgraph, avg_overlap, query_coverage, answer_coverage,
        All_x_features, All_y_features)`` for the highest-scoring candidate
        (the first one on ties), or the string ``"Crashed"`` when there is no
        candidate to rank.
    """
    runs = list(performance_runs.keys())

    meta_subgraphs = []
    for size in range(2, subgraph_size + 1):
        meta_subgraphs.extend(combinations(runs, size))

    if not meta_subgraphs:
        # Historical sentinel for "nothing to rank"; callers compare against it.
        return "Crashed"

    gold_label_set = set(gold_labels)
    gold_count = float(max(1, len(gold_labels)))  # guard against empty gold labels

    meta_graph_scores = []
    meta_graph_coverage_scores = []
    meta_graph_ans_coverage_scores = []
    meta_graph_overlap_scores = []

    for subgraph in meta_subgraphs:
        # NOTE(review): the last member of every candidate is sliced off, so
        # it contributes to none of the four features even though it is part
        # of the returned tuple and of the precision/recall below.  Kept as-is
        # because changing it alters which subgraph wins -- confirm the intent.
        scored_members = subgraph[:-1]

        prediction_coverage = KB_terms[scored_members[0]]
        pair_overlaps = []
        member_scores = []
        for position, rk1 in enumerate(scored_members):
            member_scores.append(performance_runs[rk1])
            for rk2 in scored_members[position + 1:]:
                pair_overlaps.append(float(calculate_overlap(KB_terms[rk1], KB_terms[rk2])))
                prediction_coverage = get_union(prediction_coverage, KB_terms[rk2])

        avg_score = sum(member_scores) / float(len(member_scores))
        avg_overlap = sum(pair_overlaps) / float(max(1, len(pair_overlaps)))
        final_query_coverage = get_intersection_withIDF(prediction_coverage, Ques_terms, IDF_vals) / max(1, float(len(Ques_terms)))
        final_ans_coverage = get_intersection_withIDF(prediction_coverage, Ans_terms, IDF_vals) / max(1, float(len(Ans_terms)))

        meta_graph_coverage_scores.append(final_query_coverage)
        meta_graph_ans_coverage_scores.append(final_ans_coverage)
        meta_graph_overlap_scores.append(avg_overlap)

        # Linear-regression training row: F1 of the candidate against gold.
        matched = len(gold_label_set.intersection(subgraph))
        final_precision = matched / float(max(1, len(subgraph)))
        final_recall = matched / gold_count
        # F1 = 2PR / (P + R).  The denominator must be P + R itself; clamping
        # it to at least 1 understated F1 whenever P + R < 1.
        precision_recall_sum = final_precision + final_recall
        if precision_recall_sum > 0:
            fscore1 = (2 * final_precision * final_recall) / precision_recall_sum
        else:
            fscore1 = 0.0
        All_y_features.append(fscore1)
        All_x_features.append([avg_score, avg_overlap, final_ans_coverage, final_query_coverage])

        # Weights come from the previously fitted linear regression.
        meta_graph_scores.append(0.1042 * avg_score - 0.0352 * avg_overlap + 0.0414 * final_ans_coverage + 0.0571 * final_query_coverage)

    best_sub_graph_index = meta_graph_scores.index(max(meta_graph_scores))
    return meta_subgraphs[best_sub_graph_index], meta_graph_overlap_scores[best_sub_graph_index], meta_graph_coverage_scores[best_sub_graph_index], meta_graph_ans_coverage_scores[best_sub_graph_index], All_x_features, All_y_features
Esempio n. 6
0
    json_file = open(json_file_name, "r")
    predictions_each_epoch = []
    epoch_num = []

    for line in json_file:
        json_data = json.loads(line)
        for key2 in json_data.keys(
        ):  ## there will be only 1 key, i.e. current epoch number
            predictions_each_epoch.append(json_data[key2])
            epoch_num.append(key2)

## Optional diagnostic: print the prediction overlap between consecutive
## stored epochs.  Only runs when overlap_analysis == 1.
if overlap_analysis == 1:
    # The last six entries of epoch_num are never used as a starting point.
    for index1, epoch in enumerate(epoch_num[:-6]):

        # NOTE(review): the overlap is computed against entry index1 + 1, but
        # the printed label below is epoch_num[index1 + 4] -- confirm which
        # offset is intended.
        overlap_vals = calculate_overlap(predictions_each_epoch[index1],
                                         predictions_each_epoch[index1 + 1])
        print(epoch_num[index1], epoch_num[index1 + 4], overlap_vals)

## Ensemble window: the loop that follows only uses epochs with
## start_epoch <= epoch < end_epoch.
start_epoch = 160
end_epoch = 200

# Per-label accumulator filled from predictions_each_epoch for the epochs
# inside the window.
All_scores_counted_over_epochs = {}

for ind1, epoch_nums1 in enumerate(epoch_num):
    if int(epoch_nums1) >= start_epoch and int(epoch_nums1) < end_epoch:
        for label1 in predictions_each_epoch[ind1]:
            if label1 in All_scores_counted_over_epochs.keys():
                All_scores_counted_over_epochs[label1] += (
                    predictions_each_epoch[ind1][label1]
                )  ## we want to maintain a single list for the collection counter to work in a single line