Esempio n. 1
0
def main():
    """Sweep Indri query window sizes and log running-average metrics.

    For each of the first 50 regular files in the query-configuration
    folder, the whitespace-separated sentence ids on line index 20 select
    the training examples from the feature JSON; one extra example merging
    every feature dict in the JSON is appended, and the list is handed to
    ``transform_data_and_train``. For every window size one query is built
    with ``prepare_indri_query_top_k_features``, written to
    ``indri_lr_query_file.xml`` and executed with ``IndriRunQuery``; the
    output is scored by ``get_metric_values``.

    After each configuration file the running averages (over the files
    processed so far) of the four tracked metrics are printed and appended
    to ``window_size_output_file.tsv``.

    Raises:
        IndexError: if a configuration file has fewer than 21 lines.
        Whatever ``sp.check_output`` raises when ``IndriRunQuery`` fails
        (``sp`` is assumed to be the ``subprocess`` module).
    """
    # Load the training feature data; the handle is closed right away.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    with open(dir_path + "/../../../feats_all_ng_dep_prior.json") as feature_file:
        data = json.load(feature_file)

    query_folder = "/home/smsarwar/PycharmProjects/civilian_killing/data/query_configurations/iterative_query_lr_dir"
    candidate_paths = [
        os.path.join(query_folder, name) for name in os.listdir(query_folder)
    ]
    query_files = [path for path in candidate_paths if os.path.isfile(path)]

    number_of_files = 50
    window_sizes = np.asarray([2, 3, 5, 10, 15, 20, 25, 30])
    # Rows: the three precision values and the civilian count returned by
    # get_metric_values; columns: one per window size. Holds running sums.
    res = np.zeros((4, len(window_sizes)))

    # The merged "prior" example depends only on `data`, so build it once
    # instead of once per configuration file.
    prior_feats = {}
    for item in data:
        prior_feats = merge_two_dicts(prior_feats, data[item])

    with open("window_size_output_file.tsv", "w") as output_file:
        for idx, query_path in enumerate(query_files[:number_of_files]):
            print(query_path)
            with open(query_path) as config_file:
                lines = [line.strip() for line in config_file]
            # A set makes the per-item membership test below O(1).
            sentence_ids = set(lines[20].split())

            # One binary feature dict per selected training sentence.
            allfeats = [
                dict.fromkeys(data[item], 1)
                for item in data
                if item in sentence_ids
            ]
            # Hand over a copy so the shared prior cannot be altered by the
            # training helper between iterations.
            allfeats.append(dict(prior_feats))

            features, weights = transform_data_and_train(allfeats)
            name_set = load_gold()
            id_dict, train_id_dict = load_dictionary()
            number_of_query_terms = min(len(features), 200)

            p10 = []
            p20 = []
            p30 = []
            num_civilian = []
            for query_number, window_size in enumerate(window_sizes):
                with open("indri_lr_query_file.xml", "w") as query_file:
                    query_file.write(get_query_prefix())
                    query = prepare_indri_query_top_k_features(
                        features,
                        weights,
                        query_number,
                        window_size=window_size,
                        topk=number_of_query_terms,
                        prf=False)
                    query_file.write(query + '\n')
                    query_file.write(get_query_suffix())

                print("now querying")
                output = sp.check_output(
                    ["IndriRunQuery", "indri_lr_query_file.xml"])
                precs10, precs20, precs30, number_of_civilians_found, _ = get_metric_values(
                    output.split("\n"), id_dict, name_set, train_id_dict)
                p10.append(precs10)
                p20.append(precs20)
                p30.append(precs30)
                num_civilian.append(number_of_civilians_found)

            res[0] += np.asarray(p10)
            res[1] += np.asarray(p20)
            res[2] += np.asarray(p30)
            res[3] += np.asarray(num_civilian)

            # Running averages over the idx + 1 files processed so far, one
            # tab-separated row per metric.
            averages = res / (idx + 1)
            rows = ["\t".join(str(value) for value in row) for row in averages]

            print(window_sizes)
            for row in rows:
                print(row)

            output_file.write(str(window_sizes) + "\n")
            for row in rows:
                output_file.write(row + "\n")
            output_file.write(
                "-----------------------------------------------------\n")
Esempio n. 2
0
def main():
    """Write an Indri query file from each query-configuration file.

    For every regular file in the query-configuration folder,
    ``indri_lr_query_file.xml`` is rewritten with the query prefix, one
    query per line of the configuration file, and the query suffix. Each
    line holds whitespace-separated sentence ids that select training
    examples from the feature JSON; one extra example merging every feature
    dict in the JSON is appended before calling
    ``transform_data_and_train``. Queries are numbered from 1 within each
    configuration file.

    The same output path is reused for every configuration file, so only
    the queries of the last file processed remain on disk. Running the
    queries and scoring them is currently disabled, so the ``results``
    array printed at the end is all zeros.
    """
    # Load the training feature data; the handle is closed right away.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    with open(dir_path + "/../../../feats_all_ng_dep_prior.json") as feature_file:
        data = json.load(feature_file)

    query_folder = "/home/smsarwar/PycharmProjects/civilian_killing/data/query_configurations/iterative_query_lr_dir"
    candidate_paths = [
        os.path.join(query_folder, name) for name in os.listdir(query_folder)
    ]
    query_files = [path for path in candidate_paths if os.path.isfile(path)]

    # Placeholder for per-query metrics; never filled while the evaluation
    # step is disabled.
    results = np.zeros((30, 4))

    # The merged "prior" example depends only on `data`, so build it once
    # instead of once per line of every configuration file.
    prior_feats = {}
    for item in data:
        prior_feats = merge_two_dicts(prior_feats, data[item])

    for query_path in query_files:
        with open("indri_lr_query_file.xml", "w") as query_file:
            query_file.write(get_query_prefix())
            with open(query_path) as config_file:
                for query_number, line in enumerate(config_file, 1):
                    # A set makes the membership test below O(1).
                    sentence_ids = set(line.split())

                    # One binary feature dict per selected sentence.
                    allfeats = [
                        dict.fromkeys(data[item], 1)
                        for item in data
                        if item in sentence_ids
                    ]
                    # Hand over a copy so the shared prior cannot be altered
                    # by the training helper between iterations.
                    allfeats.append(dict(prior_feats))

                    features, weights = transform_data_and_train(allfeats)
                    # NOTE(review): these results are only needed by the
                    # disabled evaluation step; the calls are kept in place
                    # in case the loaders have side effects -- confirm and
                    # hoist or drop them.
                    name_set = load_gold()
                    id_dict, train_id_dict = load_dictionary()
                    number_of_query_terms = min(len(features), 200)

                    query = prepare_indri_query_top_k_features(
                        features,
                        weights,
                        query_number,
                        topk=number_of_query_terms,
                        prf=False)
                    query_file.write(query + '\n')
            query_file.write(get_query_suffix())

        # Disabled evaluation step: run "IndriRunQuery indri_lr_query_file.xml"
        # via sp.check_output, save the output as a .run file named after the
        # configuration file, and accumulate the get_metric_values() results
        # into `results`.

    print(results)
Esempio n. 3
0
def main():
    """Sweep the number of query features and plot the retrieval metrics.

    Every entry of the feature JSON becomes one binary training example, and
    one extra example merging all feature dicts is appended before calling
    ``transform_data_and_train``. For each feature count in
    ``np.arange(1, 1002, 100)`` a query is written to
    ``indri_lr_query_file.xml``, run with ``IndriRunQuery`` and scored by
    ``get_metric_values``. The four metric series are printed and then
    plotted against the feature count in a 2x2 grid shown with
    ``plt.show()``.

    Raises:
        Whatever ``sp.check_output`` raises when ``IndriRunQuery`` fails
        (``sp`` is assumed to be the ``subprocess`` module).
    """
    # Load the training feature data; the handle is closed right away.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    with open(dir_path + "/../../../feats_all_ng_dep_prior.json") as feature_file:
        data = json.load(feature_file)

    print(len(data))
    # One binary feature dict per training item.
    allfeats = [dict.fromkeys(data[item], 1) for item in data]

    # prior_feats merges the feature dicts of every training item.
    prior_feats = {}
    for item in data:
        prior_feats = merge_two_dicts(prior_feats, data[item])

    print('length of the feature set ' + str(len(prior_feats)))

    allfeats.append(prior_feats)
    features, weights = transform_data_and_train(allfeats)
    name_set = load_gold()
    id_dict, train_id_dict = load_dictionary()
    query_terms = np.arange(1, 1002, 100)

    terms = []
    p10 = []
    p20 = []
    p30 = []
    num_civilian = []

    for number_of_query_terms in query_terms:
        with open("indri_lr_query_file.xml", "w+") as query_file:
            query_file.write(prepare_indri_query_top_k_features(
                features, weights, topk=number_of_query_terms))

        print("now querying")
        output = sp.check_output(["IndriRunQuery", "indri_lr_query_file.xml"])
        precs10, precs20, precs30, number_of_civilians_found, _ = get_metric_values(
            output.split("\n"), id_dict, name_set, train_id_dict)
        terms.append(number_of_query_terms)
        p10.append(precs10)
        p20.append(precs20)
        p30.append(precs30)
        num_civilian.append(number_of_civilians_found)

    print(p10)
    print(p20)
    print(p30)
    print(num_civilian)

    f, axarr = plt.subplots(2, 2)
    # (axis, series, title, per-panel line style). The last panel used to
    # carry the copy-pasted label 'P@30'; it now matches its own title.
    panels = [
        (axarr[0, 0], p10, 'Precision@10',
         dict(marker='o', markerfacecolor='black', color='darkgray',
              label='P@10', linestyle='-.')),
        (axarr[0, 1], p20, 'Precision@20',
         dict(marker='v', markerfacecolor='blue', color='gray',
              label='P@20', linestyle='--')),
        (axarr[1, 0], p30, 'Precision@30',
         dict(marker='x', markerfacecolor='red', color='black',
              label='P@30', linestyle=':')),
        (axarr[1, 1], num_civilian, 'Relevant entities',
         dict(marker='s', markerfacecolor='red', color='black',
              label='Relevant entities', linestyle=':')),
    ]
    for axis, series, title, style in panels:
        axis.plot(terms, series, markersize=6, linewidth=2, **style)
        axis.set_title(title)

    # Shared x-axis caption centred under the whole figure.
    f.text(0.5, 0.005 , 'Number of Features in Query', ha='center', fontsize=10)

    plt.show()