def get_data(self):
        """Assemble everything needed for WMT ranking experiments.

        Builds the WMT data structures, loads the human ranking
        judgements, runs the configured processors, and extracts the
        configured features.

        Returns:
            A tuple ``(data_structure2, human_rankings, feature_values)``.
        """
        preparer = PrepareWmt()
        structure_main = preparer.get_data_structure(self.config)
        structure_meta = preparer.get_data_structure2(self.config)
        preparer.print_data_set(self.config, structure_main)

        # A separate pass is required when a parser is listed among the
        # configured processors.
        if 'Parse' in loads(self.config.get("Resources", "processors")):
            parse_preparer = PrepareWmt(data_type='parse')
            parse_structure = parse_preparer.get_data_structure(self.config)
            parse_preparer.print_data_set(self.config, parse_structure)

        judgements_path = self.config.get('WMT', 'human_ranking')
        comparison_cap = int(self.config.get('WMT', 'maximum_comparisons'))
        rankings = HumanRanking()
        rankings.add_human_data(judgements_path, self.config, max_comparisons=comparison_cap)

        processor = Process(self.config)
        target_sentences, reference_sentences = processor.run_processors()

        feature_extractor = FeatureExtractor(self.config)
        feature_names = FeatureExtractor.read_feature_names(self.config)
        feature_extractor.extract_features(feature_names, target_sentences, reference_sentences)

        return structure_meta, rankings, feature_extractor.vals
    def get_data(self):

        human_scores = read_reference_file(os.path.expanduser(self.config.get('Data', 'human_scores')), '\t')
        process = Process(self.config)
        sents_tgt, sents_ref = process.run_processors()

        extractor = FeatureExtractor(self.config)
        features_to_extract = FeatureExtractor.read_feature_names(self.config)
        extractor.extract_features(features_to_extract, sents_tgt, sents_ref)

        return extractor.vals, human_scores
    def prepare_feature_files(self):

        process_wmt = PrepareWmt()
        data_structure1 = process_wmt.get_data_structure(self.config)
        data_structure2 = process_wmt.get_data_structure2(self.config)
        process_wmt.print_data_set(self.config, data_structure1)

        if 'Parse' in loads(self.config.get("Resources", "processors")):
            process_wmt_parse = PrepareWmt(data_type='parse')
            data_structure_parse = process_wmt_parse.get_data_structure(self.config)
            process_wmt_parse.print_data_set(self.config, data_structure_parse)

        process = Process(self.config)
        sents_tgt, sents_ref = process.run_processors()

        extractor = FeatureExtractor(self.config)
        features_to_extract = FeatureExtractor.read_feature_names(self.config)
        extractor.extract_features(features_to_extract, sents_tgt, sents_ref)
        feature_values = extractor.vals

        datasets_language_pairs = set((x[0], x[1]) for x in data_structure2)

        dataset_for_all = self.config.get('WMT', 'dataset')
        feature_set_name = os.path.basename(self.config.get('Features', 'feature_set')).replace(".txt", "")
        f_features_all = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'x_' + dataset_for_all + '.' + feature_set_name + '.' + 'all' + '.tsv', 'w')
        f_meta_data_all = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'meta_' + dataset_for_all + '.' + feature_set_name + '.' + 'all' + '.tsv', 'w')

        for dataset, lp in sorted(datasets_language_pairs):

            f_features = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'x_' + dataset + '.' + feature_set_name + '.' + lp + '.tsv', 'w')

            for i, sentence_data in enumerate(data_structure2):

                if dataset in sentence_data and lp in sentence_data:
                    f_features_all.write('\t'.join([str(x) for x in feature_values[i]]) + "\n")
                    f_meta_data_all.write('\t'.join([str(x) for x in sentence_data]) + "\n")
                    f_features.write('\t'.join([str(x) for x in feature_values[i]]) + "\n")

            f_features.close()

        f_features_all.close()
def feature_extraction(config_features_path):
    """Run feature extraction for a ranking experiment.

    Reads the feature-extraction config, extracts features for the
    dataset, writes one feature file per (dataset, language pair), and
    produces learn-to-rank training files from the human rankings.

    Args:
        config_features_path: path to the feature-extraction config file.
    """
    config = ConfigParser()
    # BUG FIX: the original leaked the config file handle
    # (config.readfp(open(...))); close it deterministically.
    with open(config_features_path) as config_file:
        config.readfp(config_file)
    wd = config.get('WMT', 'working_directory')
    if not os.path.exists(wd):
        os.mkdir(wd)

    data = RankingData(config)
    data.read_dataset()

    process = Process(config)
    sentences_tgt, sentences_ref = process.run_processors()

    feature_names = FeatureExtractor.read_feature_names(config)
    feature_values = FeatureExtractor.extract_features_static(feature_names, sentences_tgt, sentences_ref)
    write_feature_file(wd + '/' + 'x' + '_' + data.datasets[0].name + '.tsv', feature_values)

    # Split the feature rows into one file per (dataset, language pair).
    # data.plain is assumed to be grouped by dataset/lp — TODO confirm.
    my_dataset = data.plain[0].dataset
    my_lp = data.plain[0].lp
    f_path = wd + '/' + 'x' + '_' + my_dataset + '_' + my_lp + '.tsv'
    f_file = open(f_path, 'w')

    for i, instance in enumerate(data.plain):
        if instance.dataset != my_dataset or instance.lp != my_lp:
            # Group boundary: switch to the file for the new group.
            f_file.close()
            my_dataset = instance.dataset
            my_lp = instance.lp
            f_path = wd + '/' + 'x' + '_' + my_dataset + '_' + my_lp + '.tsv'
            f_file = open(f_path, 'w')
        # BUG FIX: the original only wrote rows in the matching branch, so
        # the first row of every group after the first was silently dropped.
        f_file.write('\t'.join([str(x) for x in feature_values[i]]) + "\n")

    # BUG FIX: the original never closed the last per-group file.
    f_file.close()

    f_judgements = config.get('WMT', 'human_ranking')
    human_rankings = HumanRanking()
    human_rankings.add_human_data(f_judgements, config)
    human_rankings.get_sentence_ids(data)

    learn_to_rank(feature_values, human_rankings, wd + '/' + 'x_learn_to_rank.tsv', wd + '/' + 'y_learn_to_rank.tsv')
def test_feature_sets():
    """Evaluate a feature group and write a correlation summary file.

    Computes the correlation for the full feature set, for each feature
    alone, and for each feature excluded, writing one tab-separated line
    per experiment. Skips the run if the summary file already exists.
    """
    cfg = ConfigParser()
    # BUG FIX: the original leaked the config file handle.
    with open(os.getcwd() + '/config/system.cfg') as cfg_file:
        cfg.readfp(cfg_file)

    group_name = FE.get_features_group_name(cfg)
    features_to_test = FE.read_feature_names(cfg)

    summary_path = cfg.get('Data', 'output') + '/' + group_name + '.' + 'summary'
    if os.path.exists(summary_path):
        # BUG FIX: the original had a bare string statement ("Path exists!")
        # which evaluated to nothing; the message was clearly meant to be shown.
        print("Path exists!")
        return

    # `with` guarantees the summary file is closed even if a correlation
    # run raises (the original only closed it on the success path).
    with open(summary_path, 'w') as output_file:

        name0 = group_name + '_' + 'all'
        corr0 = corr_feature_set(features_to_test, name0)
        output_file.write(name0 + '\t' + str(corr0) + '\n')

        for feat in features_to_test:

            name1 = group_name + '_' + feat + '_' + 'only'
            # BUG FIX: sibling calls pass a list of feature names; the original
            # passed the bare string here, which a list-consuming
            # corr_feature_set would iterate character by character.
            corr1 = corr_feature_set([feat], name1)
            output_file.write(name1 + '\t' + str(corr1) + '\n')

            name2 = group_name + '_' + feat + '_' + 'excluded'
            excluding = [other for other in features_to_test if other != feat]
            corr2 = corr_feature_set(excluding, name2)
            output_file.write(name2 + '\t' + str(corr2) + '\n')