Example #1
    def get_data(self):

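        # Build the WMT data structures and print the plain data set
        # (plus a parsed version if a parser is configured)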
        process_wmt = PrepareWmt()
        data_structure1 = process_wmt.get_data_structure(self.config)
        data_structure2 = process_wmt.get_data_structure2(self.config)
        process_wmt.print_data_set(self.config, data_structure1)

        if 'Parse' in loads(self.config.get("Resources", "processors")):
            process_wmt_parse = PrepareWmt(data_type='parse')
            data_structure_parse = process_wmt_parse.get_data_structure(self.config)
            process_wmt_parse.print_data_set(self.config, data_structure_parse)

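        # Load the human ranking judgements, capped at the configured maximum number of comparisons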
        f_judgements = self.config.get('WMT', 'human_ranking')
        maximum_comparisons = int(self.config.get('WMT', 'maximum_comparisons'))
        human_rankings = HumanRanking()
        human_rankings.add_human_data(f_judgements, self.config, max_comparisons=maximum_comparisons)

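        # Run the configured processors to obtain target and reference sentences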
        process = Process(self.config)
        sents_tgt, sents_ref = process.run_processors()

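        # Extract the configured feature set for every target/reference sentence pair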
        extractor = FeatureExtractor(self.config)
        features_to_extract = FeatureExtractor.read_feature_names(self.config)

        extractor.extract_features(features_to_extract, sents_tgt, sents_ref)

        return data_structure2, human_rankings, extractor.vals
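A minimal usage sketch for Example #1 (not part of the original listing): assuming get_data() belongs to a RankingTask-like class constructed from a configuration path, its return values feed the learn-to-rank training step shown in Example #3.

# Hypothetical driver code; the class name and configuration path are assumptions.
task = RankingTask(config_path)
data_structure, human_rankings, feature_values = task.get_data()
task.training_set_for_learn_to_rank(data_structure, human_rankings, feature_values)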
Example #2
def feature_extraction(config_features_path):

    config = ConfigParser()
    with open(config_features_path) as config_file:
        config.read_file(config_file)  # read_file() replaces the deprecated readfp(); the with-block closes the handle
    wd = config.get('WMT', 'working_directory')
    if not os.path.exists(wd):
        os.mkdir(wd)

    data = RankingData(config)
    data.read_dataset()

    process = Process(config)
    sentences_tgt, sentences_ref = process.run_processors()

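    # Extract the configured features and write one feature file covering the whole data set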
    feature_names = FeatureExtractor.read_feature_names(config)
    feature_values = FeatureExtractor.extract_features_static(feature_names, sentences_tgt, sentences_ref)
    write_feature_file(wd + '/' + 'x' + '_' + data.datasets[0].name + '.tsv', feature_values)

    # Split the feature values into one file per (dataset, language pair) group.
    my_dataset = data.plain[0].dataset
    my_lp = data.plain[0].lp
    f_path = wd + '/' + 'x' + '_' + my_dataset + '_' + my_lp + '.tsv'
    f_file = open(f_path, 'w')

    for i, instance in enumerate(data.plain):
        if instance.dataset != my_dataset or instance.lp != my_lp:
            # A new group starts: close the current file and open the next one.
            f_file.close()
            my_dataset = instance.dataset
            my_lp = instance.lp
            f_path = wd + '/' + 'x' + '_' + my_dataset + '_' + my_lp + '.tsv'
            f_file = open(f_path, 'w')
        f_file.write('\t'.join([str(x) for x in feature_values[i]]) + "\n")

    f_file.close()

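    # Load the human judgements and map them to sentence ids in the data set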
    f_judgements = config.get('WMT', 'human_ranking')
    human_rankings = HumanRanking()
    human_rankings.add_human_data(f_judgements, config)
    human_rankings.get_sentence_ids(data)

    learn_to_rank(feature_values, human_rankings, wd + '/' + 'x_learn_to_rank.tsv', wd + '/' + 'y_learn_to_rank.tsv')
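A hedged invocation sketch for Example #2; the configuration path below is a placeholder, not taken from the original project.

if __name__ == '__main__':
    # Hypothetical path to the feature-extraction configuration file.
    feature_extraction('config/features.cfg')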
Example #3
    @staticmethod
    def clean_dataset(config_learning, human_comparisons):

        feature_values = read_features_file(config_learning.get('x_train'), '\t')
        labels = read_reference_file(config_learning.get('y_train'), '\t')
        new_feature_values = []
        new_labels = []
        human_comparisons = RankingTask.eliminate_ties(human_comparisons)
        comparisons_untied_phrases = defaultdict(list)
        comparisons_untied_signs = defaultdict(list)

        deduplicated_phrases, deduplicated_signs = HumanRanking.deduplicate(human_comparisons)

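        # Keep only the non-tied comparisons, grouped by dataset and language pair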
        for dataset, lang_pair in sorted(human_comparisons.keys()):

            for comparison in human_comparisons[dataset, lang_pair]:

                if comparison.sign == "=":
                    continue
                else:
                    comparisons_untied_phrases[dataset, lang_pair].append([comparison.phrase, comparison.sys1, comparison.sys2])
                    comparisons_untied_signs[dataset, lang_pair].append(comparison.sign)

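        # Re-label each untied comparison with its deduplicated sign; contradictory duplicates (sign None) are dropped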
        for dataset, lang_pair in sorted(human_comparisons.keys()):

            for i, comparison in enumerate(comparisons_untied_phrases[dataset, lang_pair]):

                features = feature_values[i]
                label = labels[i]

                if comparison in deduplicated_phrases[dataset, lang_pair]:

                    dedup_index = deduplicated_phrases[dataset, lang_pair].index(comparison)
                    dedup_sign = deduplicated_signs[dataset, lang_pair][dedup_index]

                    # Contradictory duplicates carry a None sign and are skipped.
                    if dedup_sign is None:
                        continue

                    label = RankingTask.signs_to_labels(dedup_sign)

                new_feature_values.append(features)
                new_labels.append(label)

        write_feature_file(config_learning.get('x_train') + "." + "clean", new_feature_values)
        write_reference_file(config_learning.get('y_train') + "." + "clean", new_labels)
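A minimal sketch of how clean_dataset might be called (hypothetical; it assumes the learning config and the human judgements have been loaded as in the script below).

# Hypothetical call; config_learning and human_rankings are assumed to be loaded
# as shown in the script that follows.
RankingTask.clean_dataset(config_learning, human_rankings)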
    config_learning = yaml.safe_load(cfg_file)  # safe_load avoids arbitrary object construction when reading the learning config

# Prepare feature files
# This needs to be done for both training and testing data, changing the names of
# the datasets in the configuration file (see the configuration sketch after this script)

prepare_wmt = PrepareWmt()
ranking_task = RankingTask(config_path)
ranking_task.prepare_feature_files()

# Create training set for learn to rank
# (When running this step, comment out the prepare_feature_files() call above.)

dataset_for_all = config.get('WMT', 'dataset')
feature_set_name = os.path.basename(config.get('Features', 'feature_set')).replace(".txt", "")
data_structure2 = prepare_wmt.get_data_structure2(config)

f_judgements = config.get('WMT', 'human_ranking')
human_rankings = HumanRanking()
human_rankings.add_human_data(f_judgements, config)

feature_values = read_features_file(os.path.expanduser(config.get('WMT', 'output_dir')) + '/' + 'x_' + dataset_for_all + '.' + feature_set_name + '.' + 'all' + '.tsv', "\t")

ranking_task.training_set_for_learn_to_rank(data_structure2, human_rankings, feature_values)
ranking_task.train_save(config_learning, config)

# Run the trained model on the test feature file and produce the output in WMT format

predictions = ranking_task.test_learn_to_rank_coefficients(config_learning, config)
data_structure = prepare_wmt.get_data_structure(config)
prepare_wmt.wmt_format(config, feature_set_name, dataset_for_all, predictions, data_structure)
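The script above reads a number of options from the ConfigParser-based feature configuration and from the YAML learning configuration. A hedged illustration of the keys involved follows; the option names mirror the config.get() calls above, but the values are placeholders and not taken from the original project.

# Placeholder feature-extraction configuration (INI format read by ConfigParser).
EXAMPLE_FEATURE_CONFIG = """
[WMT]
working_directory = experiments/wmt
dataset = newstest
output_dir = experiments/wmt/output
human_ranking = data/human_judgements.csv
maximum_comparisons = 10000

[Features]
feature_set = config/feature_set.txt

[Resources]
processors = ["Parse"]
"""

# Placeholder YAML learning configuration (keys used by clean_dataset above).
EXAMPLE_LEARNING_CONFIG = """
x_train: experiments/wmt/x_learn_to_rank.tsv
y_train: experiments/wmt/y_learn_to_rank.tsv
"""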