def round_robin(self, config_path_learning, config_path, feature_set, lps):
    """Evaluate each language pair with a model trained on all the others.

    For every language pair in ``lps`` (visited in sorted order) the
    per-pair feature/reference files of the training pairs returned by
    ``ScoringTask.get_train_lps`` are concatenated into temporary train
    files, the held-out pair is copied into temporary test files,
    ``ScoringTask.train_predict`` is run and the result of
    ``ScoringTask.evaluate_predicted`` is written as one line to
    ``results.txt`` in the current working directory. The temporary files
    are removed at the end of each round.

    All data files live in the ``Data/output_dir`` directory of
    ``self.config`` and are named after its ``Settings/dataset`` value.

    :param config_path_learning: path of the YAML learning configuration;
        also handed to ``ScoringTask.train_predict``.
    :param config_path: path of the INI-style data configuration.
    :param feature_set: feature-set name embedded in the per-pair file names.
    :param lps: iterable of language-pair identifiers.
    """
    # Both configuration files are still parsed up front, as before, so an
    # unreadable or malformed file stops the run before any data is touched.
    # The parsed objects are only needed by the disabled
    # recursive-feature-elimination variant further down.
    # NOTE(review): readfp() is the legacy spelling of read_file() -- confirm
    # which interpreter versions this module has to support.
    config = ConfigParser()
    with open(config_path) as data_cfg_file:
        config.readfp(data_cfg_file)
    with open(config_path_learning, "r") as cfg_file:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; prefer yaml.safe_load if the file is plain data.
        config_learning = yaml.load(cfg_file.read())

    output_dir = os.path.expanduser(self.config.get('Data', 'output_dir'))
    dataset = self.config.get("Settings", "dataset")

    def data_path(prefix, *parts):
        # <output_dir>/<prefix>_<dataset>.<part>[.<part>...].tsv
        return output_dir + "/" + prefix + "_" + dataset + "." + ".".join(parts) + ".tsv"

    # The temporary train/test files do not depend on the held-out pair.
    x_train = data_path("x", "train")
    y_train = data_path("y", "train")
    x_test = data_path("x", "test")
    y_test = data_path("y", "test")

    # The context manager guarantees the results file is flushed and closed,
    # even when one of the rounds raises.
    with open("results.txt", "w") as f_results:
        for test_lp in sorted(lps):
            train_lps = ScoringTask.get_train_lps(lps, test_lp)

            test_feature_values = read_features_file(data_path("x", feature_set, test_lp), "\t")
            test_reference_values = read_reference_file(data_path("y", feature_set, test_lp), "\t")

            train_feature_values = []
            train_reference_values = []
            for train_lp in sorted(train_lps):
                train_feature_values.extend(
                    read_features_file(data_path("x", feature_set, train_lp), "\t"))
                train_reference_values.extend(
                    read_reference_file(data_path("y", feature_set, train_lp), "\t"))

            write_feature_file(x_train, train_feature_values)
            write_reference_file(y_train, train_reference_values)
            write_feature_file(x_test, test_feature_values)
            write_reference_file(y_test, test_reference_values)

            gold_standard = test_reference_values
            predictions = ScoringTask.train_predict(config_path_learning)
            # predictions = ScoringTask.recursive_feature_elimination(config_learning, config, 50)

            correlation = ScoringTask.evaluate_predicted(predictions, gold_standard)
            f_results.write(test_lp + " " + str(correlation) + " with " + feature_set + "\n")

            os.remove(x_train)
            os.remove(x_test)
            os.remove(y_train)
            os.remove(y_test)
def recursive_feature_elimination(config_learning, config_data, number_features):
    """Rank features with recursive feature elimination and predict the test set.

    The estimator returned by ``learn_model.set_learning_method`` is wrapped
    in an ``RFE`` selector that keeps ``number_features`` features and drops
    one feature per step. One ``<name>\\t<rank>`` line per entry of the
    feature-name list is written to ``feature_ranks.txt`` inside the
    ``Learner/models`` directory of ``config_data`` and echoed to stdout.

    :param config_learning: mapping with ``x_train``, ``y_train``, ``x_test``
        (tab-separated file paths) and an optional ``scale`` flag, which
        defaults to True.
    :param config_data: data configuration exposing ``get(section, option)``.
    :param number_features: number of features the selector should keep.
    :return: predictions of the fitted selector for the test features.
    """
    ranks_path = os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt"

    # Opened first (as before) so an invalid models directory fails fast;
    # the context manager closes the file even if training raises.
    with open(ranks_path, "w") as output:
        # NOTE(review): the names come from the *combinations* getter, whereas
        # the cross-validated variant of this routine reads names from the
        # *features* getter -- confirm this is the intended source.
        feature_names = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')

        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        if config_learning.get("scale", True):
            x_train, x_test = scale_datasets(x_train, x_test)

        # Pass n_features_to_select by keyword: newer scikit-learn releases
        # only accept the estimator positionally.
        rfe = RFE(estimator, n_features_to_select=number_features, step=1)
        rfe.fit(x_train, y_train)

        for i, name in enumerate(feature_names):
            rank_line = name + "\t" + str(rfe.ranking_[i])
            output.write(rank_line + "\n")
            print(rank_line)

        predictions = rfe.predict(x_test)

    return predictions
def training_set_for_learn_to_rank_from_feature_file(config_learning, config):
    """Convert a pairwise-ranking training set into difference-vector form.

    Every row of the ``x_train`` file is split in two with
    ``RankingTask.split_list``. When the corresponding human ranking equals
    2 the new feature vector is ``first - second`` and the new label is 1;
    otherwise it is ``second - first`` and the new label is 0.

    The results are written to ``x_<dataset>.learn_rank.tsv`` (features) and
    ``y_<dataset>.learn_rank.tsv`` (labels) in the ``WMT/output_dir``
    directory of ``config``.

    :param config_learning: mapping with the ``x_train`` and ``y_train``
        tab-separated file paths.
    :param config: data configuration exposing ``get(section, option)``.
    """
    data_set_name = config.get('WMT', 'dataset')
    feature_values = read_features_file(config_learning.get("x_train", None), '\t')
    human_rankings = read_reference_file(config_learning.get("y_train", None), '\t')

    output_dir = os.path.expanduser(config.get('WMT', 'output_dir'))
    path_features = output_dir + '/' + 'x_' + data_set_name + ".learn_rank.tsv"
    path_objective = output_dir + '/' + 'y_' + data_set_name + ".learn_rank.tsv"

    new_features = []
    new_labels = []
    for i, label in enumerate(human_rankings):
        # Split once per row instead of once per operand.
        halves = RankingTask.split_list(feature_values[i])
        if label == 2:
            features = np.subtract(halves[0], halves[1])
            new_label = 1
        else:
            features = np.subtract(halves[1], halves[0])
            new_label = 0
        new_features.append(features)
        new_labels.append(new_label)

    # The two target paths used to be swapped here, which put the feature
    # vectors into the y_ file and the labels into the x_ file.
    write_feature_file(path_features, new_features)
    write_reference_file(path_objective, new_labels)
def open_datasets(train_path, train_ref_path, test_path, test_ref_path, delim, labels_path=None, tostring=False):
    """Load train/test features and references after validating the inputs.

    :param train_path: path of the training features file.
    :param train_ref_path: path of the training references file.
    :param test_path: path of the test features file.
    :param test_ref_path: path of the test references file.
    :param delim: field delimiter handed to the file readers.
    :param labels_path: optional path of a labels file; when omitted the
        returned ``labels`` is an empty list.
    :param tostring: forwarded unchanged to the feature/reference readers.
    :return: the tuple ``(X_train, y_train, X_test, y_test, labels)``.
    :raises IOError: if any given path is not an existing file, if a feature
        set is not two-dimensional, if feature and reference row counts
        differ, or if train and test have a different number of columns.
    """
    # Checked in this order so the first offending path is the one reported.
    mandatory_files = (
        (train_path, "training dataset path is not valid: %s"),
        (train_ref_path, "training references path is not valid: %s"),
        (test_path, "test dataset path is not valid: %s"),
        (test_ref_path, "test references path is not valid: %s"),
    )
    for candidate, message in mandatory_files:
        if not os.path.isfile(os.path.abspath(candidate)):
            raise IOError(message % candidate)

    if labels_path is None:
        labels = []
    else:
        if not os.path.isfile(os.path.abspath(labels_path)):
            raise IOError("labels file is not valid: %s" % labels_path)
        labels = read_labels_file(labels_path, delim)

    X_train = read_features_file(train_path, delim, tostring=tostring)
    y_train = read_reference_file(train_ref_path, delim, tostring=tostring)
    X_test = read_features_file(test_path, delim, tostring=tostring)
    y_test = read_reference_file(test_ref_path, delim, tostring=tostring)

    # Both feature sets have to be two-dimensional.
    if len(X_train.shape) != 2:
        raise IOError("the training dataset must be in the format of a matrix with M lines and N columns.")
    if len(X_test.shape) != 2:
        raise IOError("the test dataset must be in the format of a matrix with M lines and N columns.")

    # One reference per instance, for train and test alike.
    train_rows, train_reference_rows = X_train.shape[0], y_train.shape[0]
    if train_rows != train_reference_rows:
        print(train_rows, train_reference_rows)
        raise IOError("the number of instances in the train features file does not match the number of references given.")
    if X_test.shape[0] != y_test.shape[0]:
        raise IOError("the number of instances in the test features file does not match the number of references given.")

    # Train and test must describe the same features.
    if X_train.shape[1] != X_test.shape[1]:
        raise IOError("the number of features in train and test datasets is different.")

    return X_train, y_train, X_test, y_test, labels
def load_predict(config_learning, config_data):
    """Load a previously saved estimator and predict the test set.

    The estimator is read with ``joblib.load`` from ``<method>.pkl`` inside
    the ``Learner/models`` directory of ``config_data``, where ``<method>``
    is the ``learning/method`` entry of ``config_learning``.

    :param config_learning: mapping with ``learning`` (itself a mapping with
        ``method``), ``x_train`` and ``x_test`` (tab-separated file paths)
        and an optional ``scale`` flag, which defaults to True.
    :param config_data: data configuration exposing ``get(section, option)``.
    :return: whatever the loaded estimator's ``predict`` returns for the
        test features.
    """
    learning_config = config_learning.get("learning", None)
    method_name = learning_config.get("method", None)

    # Only the feature files are needed here. The train/test reference files
    # used to be read as well but were never used, which forced callers to
    # provide references just to obtain predictions.
    x_train = read_features_file(config_learning.get('x_train'), '\t')
    x_test = read_features_file(config_learning.get('x_test'), '\t')

    # The training features are still loaded because scale_datasets takes
    # the train and test sets together.
    if config_learning.get("scale", True):
        x_train, x_test = scale_datasets(x_train, x_test)

    model_path = os.path.expanduser(config_data.get("Learner", "models")) + "/" + method_name + ".pkl"
    estimator = joblib.load(model_path)
    return estimator.predict(x_test)
def get_data(self):
    """Return the extracted feature values together with the human scores.

    Human scores are read from the tab-separated file named by the
    ``Data/human_scores`` option of ``self.config``. The sentences returned
    by ``Process.run_processors`` are passed to a ``FeatureExtractor``
    together with the feature names it reads from the same configuration.

    :return: the tuple ``(extractor.vals, human_scores)``.
    """
    scores_path = os.path.expanduser(self.config.get('Data', 'human_scores'))
    human_scores = read_reference_file(scores_path, '\t')

    sents_tgt, sents_ref = Process(self.config).run_processors()

    extractor = FeatureExtractor(self.config)
    extractor.extract_features(
        FeatureExtractor.read_feature_names(self.config),
        sents_tgt,
        sents_ref,
    )

    return extractor.vals, human_scores
def train_model(cfg, model_path):
    """Fit the configured estimator on the training data and dump it to disk.

    :param cfg: mapping with ``x_train``, ``y_train`` and ``x_test``
        (tab-separated file paths) and an optional ``scale`` flag, which
        defaults to True.
    :param model_path: destination handed to ``joblib.dump``.
    """
    train_features = read_features_file(cfg.get('x_train'), '\t')
    train_targets = read_reference_file(cfg.get('y_train'), '\t')
    # The test features are loaded only because scale_datasets takes the
    # train and test sets together.
    test_features = read_features_file(cfg.get('x_test'), '\t')

    if cfg.get("scale", True):
        train_features, test_features = scale_datasets(train_features, test_features)

    model, _scorers = learn_model.set_learning_method(cfg, train_features, train_targets)
    model.fit(train_features, train_targets)
    joblib.dump(model, model_path)
def train_save(config_learning, config_data):
    """Fit the configured estimator and save it under the method's name.

    The fitted estimator is dumped with ``joblib.dump`` to ``<method>.pkl``
    inside the ``Learner/models`` directory of ``config_data``, where
    ``<method>`` is the ``learning/method`` entry of ``config_learning``.

    :param config_learning: mapping with ``learning`` (itself a mapping with
        ``method``), ``x_train``, ``y_train`` and ``x_test`` (tab-separated
        file paths) and an optional ``scale`` flag, which defaults to True.
    :param config_data: data configuration exposing ``get(section, option)``.
    """
    method_name = config_learning.get("learning", None).get("method", None)

    train_features = read_features_file(config_learning.get('x_train'), '\t')
    train_targets = read_reference_file(config_learning.get('y_train'), '\t')
    # The test features are loaded only because scale_datasets takes the
    # train and test sets together.
    test_features = read_features_file(config_learning.get('x_test'), '\t')

    if config_learning.get("scale", True):
        train_features, test_features = scale_datasets(train_features, test_features)

    model, _scorers = learn_model.set_learning_method(config_learning, train_features, train_targets)
    model.fit(train_features, train_targets)

    target_path = os.path.expanduser(config_data.get('Learner', 'models')) + '/' + method_name + '.pkl'
    joblib.dump(model, target_path)
def clean_dataset(config_learning, human_comparisons):
    """Write a cleaned copy of the pairwise training set.

    Ties are removed from ``human_comparisons`` with
    ``RankingTask.eliminate_ties`` and the remainder is deduplicated with
    ``HumanRanking.deduplicate``. The untied comparisons are then walked in
    sorted ``(dataset, lang_pair)`` order, each one consuming the next row of
    the ``x_train`` / ``y_train`` files:

    * if the comparison appears in the deduplicated list and its
      deduplicated sign is ``None``, the row is dropped;
    * if it appears with a sign, the row is kept and its label is replaced
      by ``RankingTask.signs_to_labels(sign)``;
    * otherwise the row is kept with its original label.

    The surviving rows are written to ``<x_train>.clean`` and
    ``<y_train>.clean``.

    :param config_learning: mapping with the ``x_train`` and ``y_train``
        tab-separated file paths.
    :param human_comparisons: mapping from ``(dataset, lang_pair)`` to a
        sequence of comparison objects with ``phrase``, ``sys1``, ``sys2``
        and ``sign`` attributes.
    """
    feature_values = read_features_file(config_learning.get('x_train'), '\t')
    labels = read_reference_file(config_learning.get('y_train'), '\t')

    human_comparisons = RankingTask.eliminate_ties(human_comparisons)
    deduplicated_phrases, deduplicated_signs = HumanRanking.deduplicate(human_comparisons)

    new_feature_values = []
    new_labels = []

    # Running row index across ALL groups. The index used to restart at 0
    # for every (dataset, lang_pair), so with more than one group the first
    # rows of the single feature/label file were reused for each of them.
    # NOTE(review): assumes the rows of x_train / y_train follow the sorted
    # (dataset, lang_pair) order used here -- confirm against the code that
    # produces those files.
    row = 0
    for dataset, lang_pair in sorted(human_comparisons.keys()):
        for comparison in human_comparisons[dataset, lang_pair]:
            # Defensive: ties should already be gone after eliminate_ties.
            if comparison.sign == "=":
                continue

            features = feature_values[row]
            label = labels[row]
            row += 1

            untied = [comparison.phrase, comparison.sys1, comparison.sys2]
            # A single scan replaces the former membership test followed by
            # two separate .index() calls.
            try:
                position = deduplicated_phrases[dataset, lang_pair].index(untied)
            except ValueError:
                position = None

            if position is not None:
                sign = deduplicated_signs[dataset, lang_pair][position]
                if sign is None:
                    continue
                label = RankingTask.signs_to_labels(sign)

            new_feature_values.append(features)
            new_labels.append(label)

    write_feature_file(config_learning.get('x_train') + "." + "clean", new_feature_values)
    write_reference_file(config_learning.get('y_train') + "." + "clean", new_labels)
def recursive_feature_elimination_cv(config_learning, config_data):
    """Rank features with cross-validated RFE and predict the test set.

    The estimator returned by ``learn_model.set_learning_method`` is wrapped
    in an ``RFECV`` selector (one feature dropped per step, 2-fold
    stratified split, ``'accuracy'`` scoring). One ``<name>\\t<rank>`` line
    per feature column is written to ``feature_ranks.txt`` inside the
    ``Learner/models`` directory of ``config_data``; a feature whose
    combination method is ``'both'`` is listed twice.

    :param config_learning: mapping with ``x_train``, ``y_train``, ``x_test``
        (tab-separated file paths) and an optional ``scale`` flag, which
        defaults to True.
    :param config_data: data configuration exposing ``get(section, option)``.
    :return: predictions of the fitted selector for the test features.
    """
    ranks_path = os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt"

    # Opened first (as before) so an invalid models directory fails fast;
    # the context manager closes the file even if training raises.
    with open(ranks_path, "w") as output:
        feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config_data)
        combination_methods = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')

        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        if config_learning.get("scale", True):
            x_train, x_test = scale_datasets(x_train, x_test)

        # NOTE(review): StratifiedKFold(y, n_folds) looks like the legacy
        # sklearn.cross_validation signature -- confirm against the import
        # used by this module before upgrading scikit-learn.
        rfecv = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(y_train, 2), scoring='accuracy')
        rfecv.fit(x_train, y_train)

        # A feature combined with 'both' is repeated so that it takes two
        # consecutive entries of the ranking.
        feature_list = []
        for i, feature_name in enumerate(feature_names):
            copies = 2 if combination_methods[i] == 'both' else 1
            feature_list.extend([feature_name] * copies)

        for i, name in enumerate(feature_list):
            output.write(name + "\t" + str(rfecv.ranking_[i]) + "\n")

    predictions = rfecv.predict(x_test)
    return predictions