def _write_features_per_language_pair(wd, instances, feature_values):
    """Write one feature TSV per contiguous (dataset, language pair) run.

    ``instances`` and ``feature_values`` are parallel sequences: row ``i`` of
    ``feature_values`` holds the features of ``instances[i]``.  Every
    contiguous run of instances sharing the same ``dataset`` and ``lp``
    attributes is written to ``<wd>/x_<dataset>_<lp>.tsv``, one tab-separated
    row per instance.

    The file is opened in ``'w'`` mode for each run, so if the same
    (dataset, lp) pair re-appears later in a non-contiguous run the earlier
    content is overwritten.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    from itertools import groupby

    def run_key(indexed_instance):
        _, instance = indexed_instance
        return instance.dataset, instance.lp

    for (dataset, lp), run in groupby(enumerate(instances), key=run_key):
        f_path = wd + '/' + 'x' + '_' + dataset + '_' + lp + '.tsv'
        # ``with`` guarantees every file (including the last one) is closed,
        # and every instance of the run -- including the first -- is written.
        with open(f_path, 'w') as f_file:
            for i, _ in run:
                f_file.write('\t'.join(str(x) for x in feature_values[i]) + "\n")


def feature_extraction(config_features_path):
    """Extract features for a ranking dataset and write them to disk.

    Reads the configuration at ``config_features_path``, makes sure the
    working directory named by option ``working_directory`` in section
    ``WMT`` exists, runs the processors and the feature extractor, and then
    writes:

    * ``<wd>/x_<first dataset name>.tsv`` with all feature values
      (via ``write_feature_file``);
    * ``<wd>/x_<dataset>_<lp>.tsv`` for each contiguous (dataset, lp) run of
      ``data.plain``;
    * ``<wd>/x_learn_to_rank.tsv`` and ``<wd>/y_learn_to_rank.tsv``
      (via ``learn_to_rank``), using the human rankings file named by option
      ``human_ranking`` in section ``WMT``.

    Args:
        config_features_path: Path of the configuration file to read.

    Returns:
        None.  All results are written to files in the working directory.
    """
    config = ConfigParser()
    # ``readfp`` was removed in Python 3.12; ``read_file`` is the replacement.
    # The context manager closes the handle instead of leaking it.
    with open(config_features_path) as config_file:
        config.read_file(config_file)

    wd = config.get('WMT', 'working_directory')
    # Creates missing parent directories too and tolerates an existing one.
    os.makedirs(wd, exist_ok=True)

    data = RankingData(config)
    data.read_dataset()

    process = Process(config)
    sentences_tgt, sentences_ref = process.run_processors()

    feature_names = FeatureExtractor.read_feature_names(config)
    feature_values = FeatureExtractor.extract_features_static(
        feature_names, sentences_tgt, sentences_ref
    )
    write_feature_file(
        wd + '/' + 'x' + '_' + data.datasets[0].name + '.tsv', feature_values
    )

    _write_features_per_language_pair(wd, data.plain, feature_values)

    f_judgements = config.get('WMT', 'human_ranking')
    human_rankings = HumanRanking()
    human_rankings.add_human_data(f_judgements, config)
    human_rankings.get_sentence_ids(data)

    learn_to_rank(
        feature_values,
        human_rankings,
        wd + '/' + 'x_learn_to_rank.tsv',
        wd + '/' + 'y_learn_to_rank.tsv',
    )
import sys
import os
from configparser import ConfigParser

from utils.ranking_data import RankingData
from utils.write_parsed import write_parsed
from utils.human_ranking import HumanRanking
from processors.process import Process
from features.feature_extractor import FeatureExtractor
from utils.wmt import write_wmt_format
from utils.process_semeval import process_semeval
from nltk.corpus import stopwords

# Script: read 'test.cfg', prepare the ranking dataset, run the processors,
# extract the 'cobalt' feature and write the scores to 'output/cobalt.scores'.

# Read configuration file.
# ``readfp`` was removed in Python 3.12; ``read_file`` replaces it, and the
# context manager closes the file handle instead of leaking it.
config = ConfigParser()
with open('test.cfg') as config_file:
    config.read_file(config_file)

# Prepare dataset
ranking_data = RankingData(config)
ranking_data.read_dataset()
ranking_data.write_dataset()

# The first argument is the configured input directory with 'plain' replaced
# by 'parse'; presumably that is where the parsed data lives -- TODO confirm.
write_parsed(
    config.get('Data', 'input_dir').replace('plain', 'parse'),
    config.get('Data', 'working_dir'),
    ['cs-en'],
)

# Process dataset
process = Process(config)
sentences_target, sentences_reference = process.run_processors()

cobalt_scores = FeatureExtractor.extract_features_static(
    ['cobalt'], sentences_target, sentences_reference
)
# Debug aid: print(str(cobalt_scores[-1][0])) shows the first value of the
# last entry.

ranking_data.write_scores_wmt_format(
    cobalt_scores, metric='cobalt', output_path='output/cobalt.scores'
)