Example #1
0
    def training_set_for_learn_to_rank_from_feature_file(config_learning, config):
        """Derive a pairwise learn-to-rank training set from existing files.

        Reads the feature rows from ``config_learning["x_train"]`` and the
        labels from ``config_learning["y_train"]`` (both tab-separated). Each
        feature row is split in two with ``RankingTask.split_list`` and turned
        into a difference vector:

        * label ``2``  -> ``part[0] - part[1]``, new label ``1``
        * any other    -> ``part[1] - part[0]``, new label ``0``

        The difference vectors are written to
        ``<output_dir>/x_<dataset>.learn_rank.tsv`` and the new labels to
        ``<output_dir>/y_<dataset>.learn_rank.tsv``, where ``output_dir`` and
        ``dataset`` come from the ``WMT`` section of ``config``.

        :param config_learning: mapping providing ``x_train`` and ``y_train``.
        :param config: config parser providing ``WMT/dataset`` and
            ``WMT/output_dir``.
        """
        data_set_name = config.get('WMT', 'dataset')

        feature_values = read_features_file(config_learning.get("x_train", None), '\t')
        human_rankings = read_reference_file(config_learning.get("y_train", None), '\t')

        output_dir = os.path.expanduser(config.get('WMT', 'output_dir'))
        path_features = output_dir + '/' + 'x_' + data_set_name + ".learn_rank.tsv"
        path_objective = output_dir + '/' + 'y_' + data_set_name + ".learn_rank.tsv"

        new_features = []
        new_labels = []

        for i, label in enumerate(human_rankings):
            # Split once per row instead of twice.
            # NOTE(review): assumes split_list has no side effects - confirm.
            parts = RankingTask.split_list(feature_values[i])

            if label == 2:
                features = np.subtract(parts[0], parts[1])
                new_label = 1
            else:
                features = np.subtract(parts[1], parts[0])
                new_label = 0

            new_features.append(features)
            new_labels.append(new_label)

        # Features belong in the x_ file and labels in the y_ file; the two
        # destination paths were previously swapped.
        write_feature_file(path_features, new_features)
        write_reference_file(path_objective, new_labels)
Example #2
0
    def round_robin(self, config_path_learning, config_path, feature_set, lps):
        """Run a leave-one-language-pair-out evaluation and log the results.

        For every language pair in ``lps`` (in sorted order) the per-pair
        feature/reference files of all pairs returned by
        ``ScoringTask.get_train_lps`` are concatenated into temporary
        ``train`` files, the held-out pair is copied into temporary ``test``
        files, ``ScoringTask.train_predict`` is run, and the value returned by
        ``ScoringTask.evaluate_predicted`` is appended to ``results.txt`` in
        the current working directory. The four temporary files are removed
        at the end of each iteration.

        All data files live in ``self.config``'s ``Data/output_dir`` and are
        named ``<x|y>_<Settings/dataset>.<...>.tsv``.

        :param config_path_learning: path of the YAML learning configuration;
            also handed to ``ScoringTask.train_predict``.
        :param config_path: path of an INI-style configuration file.
        :param feature_set: feature-set name used in the per-pair file names.
        :param lps: iterable of language-pair identifiers (strings).
        """

        def data_file(prefix, *parts):
            # Builds "<output_dir>/<prefix>_<dataset>.<part>....tsv".
            out_dir = os.path.expanduser(self.config.get('Data', 'output_dir'))
            dataset = self.config.get("Settings", "dataset")
            return out_dir + "/" + prefix + "_" + dataset + "." + ".".join(parts + ("tsv",))

        # `config` and `config_learning` are only needed by the commented-out
        # recursive_feature_elimination alternative below; they are still
        # loaded so that unreadable configuration files fail early, as before.
        config = ConfigParser()
        with open(config_path) as ini_file:
            # NOTE(review): readfp() is deprecated in Python 3 in favour of
            # read_file(); kept because the targeted Python version is not
            # visible from here.
            config.readfp(ini_file)

        with open(config_path_learning, "r") as cfg_file:
            # NOTE(review): yaml.load without an explicit Loader can construct
            # arbitrary objects; prefer yaml.safe_load if this file may come
            # from an untrusted source.
            config_learning = yaml.load(cfg_file.read())

        # The context manager guarantees results are flushed and the handle
        # is closed even if an iteration raises.
        with open("results.txt", "w") as f_results:

            for test_lp in sorted(lps):

                x_train = data_file("x", "train")
                y_train = data_file("y", "train")
                x_test = data_file("x", "test")
                y_test = data_file("y", "test")

                train_lps = ScoringTask.get_train_lps(lps, test_lp)

                train_feature_values = []
                train_reference_values = []

                test_feature_values = read_features_file(data_file("x", feature_set, test_lp), "\t")
                test_reference_values = read_reference_file(data_file("y", feature_set, test_lp), "\t")

                for train_lp in sorted(train_lps):
                    feature_values = read_features_file(data_file("x", feature_set, train_lp), "\t")
                    reference_values = read_reference_file(data_file("y", feature_set, train_lp), "\t")

                    train_feature_values += list(feature_values)
                    train_reference_values += list(reference_values)

                write_feature_file(x_train, train_feature_values)
                write_reference_file(y_train, train_reference_values)
                write_feature_file(x_test, test_feature_values)
                write_reference_file(y_test, test_reference_values)

                gold_standard = test_reference_values
                predictions = ScoringTask.train_predict(config_path_learning)
                # predictions = ScoringTask.recursive_feature_elimination(config_learning, config, 50)

                correlation = ScoringTask.evaluate_predicted(predictions, gold_standard)

                f_results.write(test_lp + " " + str(correlation) + " with " + feature_set + "\n")

                # Temporary train/test files are only needed for this fold.
                os.remove(x_train)
                os.remove(x_test)
                os.remove(y_train)
                os.remove(y_test)
Example #3
0
def learn_to_rank(feature_values, human_comparisons, path_x, path_y):
    """Write a pairwise training set built from human comparisons.

    Iterates the ``(dataset, lp)`` keys of ``human_comparisons`` in sorted
    order and skips every comparison whose ``sign`` is ``'='``. Each
    remaining comparison contributes two instances: one built by
    ``make_instance`` from (winner, loser) features with target ``1`` and
    one from (loser, winner) features with target ``0``.

    :param feature_values: sequence indexed by the positions returned from
        ``find_winner_loser_index``.
    :param human_comparisons: mapping from ``(dataset, lp)`` to an iterable
        of comparison objects exposing ``sign``.
    :param path_x: destination passed to ``write_feature_file``.
    :param path_y: destination passed to ``write_reference_file``.
    """
    instances, targets = [], []

    for dataset, lp in sorted(human_comparisons.keys()):
        for judgement in human_comparisons[dataset, lp]:

            if judgement.sign != '=':
                winner, loser = find_winner_loser_index(judgement)
                winner_features = feature_values[winner]
                loser_features = feature_values[loser]

                # Emit the pair in both orders so both targets are represented.
                instances.append(make_instance(winner_features, loser_features))
                instances.append(make_instance(loser_features, winner_features))
                targets.extend((1, 0))

    write_feature_file(path_x, instances)
    write_reference_file(path_y, targets)
Example #4
0
    def clean_dataset(config_learning, human_comparisons):
        """Write a cleaned copy of the training set using de-duplicated judgements.

        Reads the tab-separated feature rows and labels from the ``x_train``
        and ``y_train`` paths in ``config_learning``. Ties are removed from
        ``human_comparisons`` and the remaining comparisons are walked in
        sorted ``(dataset, lang_pair)`` order. For each comparison whose
        ``[phrase, sys1, sys2]`` triple appears in the output of
        ``HumanRanking.deduplicate``:

        * if the de-duplicated sign is ``None`` the row is dropped;
        * otherwise the row's label is replaced by
          ``RankingTask.signs_to_labels(sign)``.

        Rows whose triple is not found keep their original label. Results
        are written to ``<x_train>.clean`` and ``<y_train>.clean``.

        :param config_learning: mapping providing ``x_train`` and ``y_train``.
        :param human_comparisons: mapping from ``(dataset, lang_pair)`` to
            comparison objects exposing ``sign``, ``phrase``, ``sys1``,
            ``sys2``.
        """
        feature_values = read_features_file(config_learning.get('x_train'), '\t')
        labels = read_reference_file(config_learning.get('y_train'), '\t')
        human_comparisons = RankingTask.eliminate_ties(human_comparisons)

        deduplicated_phrases, deduplicated_signs = HumanRanking.deduplicate(human_comparisons)

        new_feature_values = []
        new_labels = []

        # Position in the flat feature/label files. It must keep counting
        # across (dataset, lang_pair) groups; restarting at 0 for every group
        # would pair later groups with the first group's rows.
        # NOTE(review): assumes x_train/y_train hold one row per untied
        # comparison in this same iteration order - confirm against the code
        # that produced those files.
        row = -1

        for dataset, lang_pair in sorted(human_comparisons.keys()):

            for comparison in human_comparisons[dataset, lang_pair]:

                # Defensive: ties should already be gone after eliminate_ties.
                if comparison.sign == "=":
                    continue

                row += 1
                features = feature_values[row]
                label = labels[row]

                triple = [comparison.phrase, comparison.sys1, comparison.sys2]

                # One linear search instead of `in` followed by two .index().
                try:
                    position = deduplicated_phrases[dataset, lang_pair].index(triple)
                except ValueError:
                    position = None

                if position is not None:
                    sign = deduplicated_signs[dataset, lang_pair][position]

                    if sign is None:
                        continue

                    label = RankingTask.signs_to_labels(sign)

                new_feature_values.append(features)
                new_labels.append(label)

        write_feature_file(config_learning.get('x_train') + "." + "clean", new_feature_values)
        write_reference_file(config_learning.get('y_train') + "." + "clean", new_labels)
def feature_extraction(config_features_path):
    """Extract features for a ranking dataset and write the training files.

    Steps, all driven by the INI-style file at ``config_features_path``:

    1. Create ``WMT/working_directory`` if it does not exist.
    2. Load the data (``RankingData``), run the processors (``Process``) and
       compute feature values with ``FeatureExtractor``.
    3. Write all feature rows to ``<wd>/x_<first dataset name>.tsv``.
    4. Write the same rows split into one tab-separated file per consecutive
       ``(dataset, lp)`` run of ``data.plain``:
       ``<wd>/x_<dataset>_<lp>.tsv``.
    5. Load the human judgements from ``WMT/human_ranking`` and call
       ``learn_to_rank`` to write ``<wd>/x_learn_to_rank.tsv`` and
       ``<wd>/y_learn_to_rank.tsv``.

    :param config_features_path: path of the feature-extraction config file.
    """
    config = ConfigParser()
    with open(config_features_path) as cfg_file:
        # NOTE(review): readfp() is deprecated in Python 3 in favour of
        # read_file(); kept because the targeted Python version is not
        # visible from here.
        config.readfp(cfg_file)
    wd = config.get('WMT', 'working_directory')
    if not os.path.exists(wd):
        os.mkdir(wd)

    data = RankingData(config)
    data.read_dataset()

    process = Process(config)
    sentences_tgt, sentences_ref = process.run_processors()

    feature_names = FeatureExtractor.read_feature_names(config)
    feature_values = FeatureExtractor.extract_features_static(feature_names, sentences_tgt, sentences_ref)
    write_feature_file(wd + '/' + 'x' + '_' + data.datasets[0].name + '.tsv', feature_values)

    current_group = None
    f_file = None
    try:
        for i, instance in enumerate(data.plain):
            group = (instance.dataset, instance.lp)

            if group != current_group:
                # A new (dataset, lp) run starts: switch output files.
                # Files are opened with 'w', so a group that re-appears later
                # in data.plain overwrites its earlier file (as before).
                if f_file is not None:
                    f_file.close()
                current_group = group
                f_file = open(wd + '/' + 'x' + '_' + instance.dataset + '_' + instance.lp + '.tsv', 'w')

            # Written for every instance, including the first one of each
            # run; that row used to be skipped when the file was switched.
            f_file.write('\t'.join(str(x) for x in feature_values[i]) + "\n")
    finally:
        # The last per-group file was previously left open.
        if f_file is not None:
            f_file.close()

    f_judgements = config.get('WMT', 'human_ranking')
    human_rankings = HumanRanking()
    human_rankings.add_human_data(f_judgements, config)
    human_rankings.get_sentence_ids(data)

    learn_to_rank(feature_values, human_rankings, wd + '/' + 'x_learn_to_rank.tsv', wd + '/' + 'y_learn_to_rank.tsv')