    def get_data(self):
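        """Prepare the WMT data set, load human ranking judgements, and extract feature values."""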

        process_wmt = PrepareWmt()
        data_structure1 = process_wmt.get_data_structure(self.config)
        data_structure2 = process_wmt.get_data_structure2(self.config)
        process_wmt.print_data_set(self.config, data_structure1)

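        # Optionally prepare a parsed copy of the data set when 'Parse'
        # appears in the configured processor list.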
        if 'Parse' in loads(self.config.get("Resources", "processors")):
            process_wmt_parse = PrepareWmt(data_type='parse')
            data_structure_parse = process_wmt_parse.get_data_structure(self.config)
            process_wmt_parse.print_data_set(self.config, data_structure_parse)

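        # Load the WMT human ranking judgements, capped at the configured
        # maximum number of pairwise comparisons.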
        f_judgements = self.config.get('WMT', 'human_ranking')
        maximum_comparisons = int(self.config.get('WMT', 'maximum_comparisons'))
        human_rankings = HumanRanking()
        human_rankings.add_human_data(f_judgements, self.config, max_comparisons=maximum_comparisons)

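        # Run the configured preprocessors over the target and reference sentences.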
        process = Process(self.config)
        sents_tgt, sents_ref = process.run_processors()

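        # Extract the configured feature set for every target/reference sentence pair.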
        extractor = FeatureExtractor(self.config)
        features_to_extract = FeatureExtractor.read_feature_names(self.config)

        extractor.extract_features(features_to_extract, sents_tgt, sents_ref)

        return data_structure2, human_rankings, extractor.vals

    def get_data(self):
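        """Return extracted feature values together with the corresponding human scores."""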

        human_scores = read_reference_file(os.path.expanduser(self.config.get('Data', 'human_scores')), '\t')
        process = Process(self.config)
        sents_tgt, sents_ref = process.run_processors()

        extractor = FeatureExtractor(self.config)
        features_to_extract = FeatureExtractor.read_feature_names(self.config)
        extractor.extract_features(features_to_extract, sents_tgt, sents_ref)

        return extractor.vals, human_scores

    def prepare_feature_files(self):
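        """Extract features for the WMT data and write combined and per-language-pair feature files."""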

        process_wmt = PrepareWmt()
        data_structure1 = process_wmt.get_data_structure(self.config)
        data_structure2 = process_wmt.get_data_structure2(self.config)
        process_wmt.print_data_set(self.config, data_structure1)

        if 'Parse' in loads(self.config.get("Resources", "processors")):
            process_wmt_parse = PrepareWmt(data_type='parse')
            data_structure_parse = process_wmt_parse.get_data_structure(self.config)
            process_wmt_parse.print_data_set(self.config, data_structure_parse)

        process = Process(self.config)
        sents_tgt, sents_ref = process.run_processors()

        extractor = FeatureExtractor(self.config)
        features_to_extract = FeatureExtractor.read_feature_names(self.config)
        extractor.extract_features(features_to_extract, sents_tgt, sents_ref)
        feature_values = extractor.vals

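        # Collect the unique (dataset, language pair) combinations present in the data.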
        datasets_language_pairs = set((x[0], x[1]) for x in data_structure2)

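        # Open combined feature and metadata files covering all datasets and language pairs.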
        dataset_for_all = self.config.get('WMT', 'dataset')
        feature_set_name = os.path.basename(self.config.get('Features', 'feature_set')).replace(".txt", "")
        output_dir = os.path.expanduser(self.config.get('WMT', 'output_dir'))
        f_features_all = open(output_dir + '/x_' + dataset_for_all + '.' + feature_set_name + '.all.tsv', 'w')
        f_meta_data_all = open(output_dir + '/meta_' + dataset_for_all + '.' + feature_set_name + '.all.tsv', 'w')

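        # Write a per-pair feature file for every (dataset, language pair);
        # matching rows are also appended to the combined files.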
        for dataset, lp in sorted(datasets_language_pairs):

            f_features = open(output_dir + '/x_' + dataset + '.' + feature_set_name + '.' + lp + '.tsv', 'w')

            for i, sentence_data in enumerate(data_structure2):

                if dataset in sentence_data and lp in sentence_data:
                    f_features_all.write('\t'.join([str(x) for x in feature_values[i]]) + "\n")
                    f_meta_data_all.write('\t'.join([str(x) for x in sentence_data]) + "\n")
                    f_features.write('\t'.join([str(x) for x in feature_values[i]]) + "\n")

            f_features.close()

        f_features_all.close()
        f_meta_data_all.close()

def extract_features(txt_io, feat_io, cword_io, train=False, factor_files={}):
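    # Build the source/target confusion set pair and the feature extractor
    # from the global configuration.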
    csets = CSetPair(config['source-cset'], config['target-cset'])
    extractor = FeatureExtractor(csets, config['features'], config['costs'])

    check_factor_requirements(extractor.required_factors(), factor_files)

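    # Set up the confusion-word finder; when null n-grams are configured,
    # an extra finder for them is attached.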
    finder = CWordFinder(csets, train)
    if config['nulls-ngrams']:
        null_finder = NullFinder(csets.src, config['nulls-ngrams'])
        finder.add_extra_finder(null_finder)
    reader = CWordReader(cword_io)

    log.info("Extract features from {}".format(txt_io.name))

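    # For each factorized input sentence, extract a feature string for every
    # confusion word found and record the word through the CWord reader.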
    count = 0
    for sid, line, fact_sent in each_factorized_input(txt_io, factor_files):
        for cword in finder.find_confusion_words(line, fact_sent):
            feat_str = extractor.extract_features(cword, fact_sent)
            feat_io.write(feat_str)

            reader.format(sid, cword)
            count += 1

    log.info("Found {} confusion words".format(count))