    def test_set_for_rank_to_scores(self, data_structure, feature_values, config_path_learning):
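        """Build the test set used to turn pairwise ranks into scores.

        Systems are grouped per (data set, language pair, segment); for every
        pair of systems the per-feature values are merged with the configured
        combination methods and written to the x_<dataset> file. Confidence
        scores obtained via config_path_learning are then attached to each
        system pair and returned keyed by (data set, language pair, segment).
        """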

        sentences_systems = defaultdict(list)

        combination_methods = FeatureExtractor.get_combinations_from_config_file(self.config)
        data_set_name = self.config.get('WMT', 'dataset')
        output_dir = os.path.expanduser(self.config.get('WMT', 'output_dir'))
        f_features = open(os.path.join(output_dir, 'x_' + data_set_name), 'w')
        meta_data = defaultdict(list)

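        # Group the system names that translated each segment, keyed by
        # (data set, language pair, segment id).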
        for data_set, lang_pair, system_name, phrase_number in data_structure:
            sentences_systems[data_set, lang_pair, phrase_number].append(system_name)

        for data_set, lang_pair, phrase_number in sorted(sentences_systems.keys()):

            system_pairs = list(combinations(sentences_systems[data_set, lang_pair, phrase_number], 2))

            for sys1, sys2 in sorted(system_pairs):

                idx_sys1, idx_sys2 = self.get_sentence_idx(data_set, lang_pair, data_structure, phrase_number, sys1, sys2)

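                # Combine the two systems' values for each feature using the
                # combination method configured for that feature.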
                combined_features = []
                for i in range(len(feature_values[0])):
                    combined_feature = self.combine_feature_values(combination_methods[i], feature_values[idx_sys1][i],
                                                                   feature_values[idx_sys2][i])
                    combined_features.append(combined_feature)

                f_features.write('\t'.join(str(val) for val in combined_features) + '\n')
                meta_data[data_set, lang_pair, phrase_number].append([sys1, sys2])

        f_features.close()

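        # Re-read the pairs in the same sorted order they were written so that
        # the i-th confidence score lines up with the i-th feature row.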
        results = defaultdict(list)
        confidence_scores = self.get_confidence_scores(config_path_learning)
        count = 0
        for data_set, lang_pair, phrase_number in sorted(meta_data.keys()):
            for sys1, sys2 in sorted(meta_data[data_set, lang_pair, phrase_number]):
                results[data_set, lang_pair, phrase_number].append([sys1, sys2, confidence_scores[count]])
                count += 1

        return results

    def training_set_for_rank_direct(self, data_structure, human_rankings, feature_values, ignore_ties=True):
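        """Write the training set for direct pairwise ranking.

        For each language pair in human_rankings this produces three .tsv files:
        combined feature rows (x_*), labels derived from the human comparison
        signs (y_*) and the feature-matrix indices of the two compared systems
        (meta_*). Comparisons are skipped when signs_to_labels returns None
        (e.g. ties with ignore_ties=True).
        """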

        combination_methods = FeatureExtractor.get_combinations_from_config_file(self.config)
        data_set_name = self.config.get('WMT', 'dataset')
        feature_set_name = os.path.basename(self.config.get('Features', 'feature_set')).replace(".txt", "")

        for dataset, lang_pair in sorted(human_rankings.keys()):

            output_dir = os.path.expanduser(self.config.get('WMT', 'output_dir'))
            file_suffix = data_set_name + '.' + feature_set_name + '.' + lang_pair + '.tsv'
            f_features = open(os.path.join(output_dir, 'x_' + file_suffix), 'w')
            f_objective = open(os.path.join(output_dir, 'y_' + file_suffix), 'w')
            f_meta_data = open(os.path.join(output_dir, 'meta_' + file_suffix), 'w')

            for human_comparison in human_rankings[dataset, lang_pair]:

                label = self.signs_to_labels(human_comparison.sign, ignore_ties=ignore_ties)
                if label is None:
                    continue

                f_objective.write(label + '\n')

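                # Locate the feature-matrix rows of the two compared systems and
                # record their indices as metadata.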
                seg_id = human_comparison.phrase
                sys1 = human_comparison.sys1
                sys2 = human_comparison.sys2
                idx_sys1, idx_sys2 = self.get_sentence_idx(dataset, lang_pair, data_structure, seg_id, sys1, sys2)
                f_meta_data.write(str(idx_sys1) + '\t' + str(idx_sys2) + '\n')

                combined_features = []
                for i in range(len(feature_values[0])):
                    combined_feature = self.combine_feature_values(combination_methods[i], feature_values[idx_sys1][i],
                                                                   feature_values[idx_sys2][i])
                    combined_features.append(combined_feature)

                f_features.write('\t'.join(str(val) for val in combined_features) + '\n')

            f_features.close()
            f_objective.close()
            f_meta_data.close()
Example #3
# Imports assumed by this example; FeatureExtractor and read_features_file are
# project-specific helpers and keep their original names here.
import os

import numpy as np
from configparser import ConfigParser


def average_feature_values():
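    """Print the mean of every feature column in a combined-feature file,
    paired with the feature names read from the WMT config."""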

    config_path = os.path.join(os.getcwd(), "config", "wmt.cfg")
    config = ConfigParser()
    with open(config_path) as config_file:
        config.read_file(config_file)

    my_dir = os.path.expanduser("~/Dropbox/experiments_fluency/test_learn_to_rank")
    feature_file = os.path.join(my_dir, "x_newstest2015.cobalt_comb_min_fluency_features_all.cs-en.tsv")
    feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config)
    strategies = FeatureExtractor.get_combinations_from_config_file(config)

    feature_values = read_features_file(feature_file, "\t")
    averages = np.mean(feature_values, axis=0)

    feature_list = []
    for i, feature_name in enumerate(feature_names):
        # if strategies[i] == 'both':
        #     feature_list.append(feature_name)
        #     feature_list.append(feature_name)
        # else:
        feature_list.append(feature_name)

    for i, name in enumerate(feature_list):
        print(name + "\t" + str(averages[i]))
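

# Illustrative entry point (not part of the original example): running the
# module as a script prints the average value of each configured feature.
if __name__ == '__main__':
    average_feature_values()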