Example 1
    def recursive_feature_elimination(config_learning, config_data, number_features):

        output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt", "w")

        feature_names = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')
        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        # n_features_to_select must be passed as a keyword in current scikit-learn releases
        rfe = RFE(estimator, n_features_to_select=number_features, step=1)
        rfe.fit(x_train, y_train)

        for i, name in enumerate(feature_names):
            output.write(name + "\t" + str(rfe.ranking_[i]) + "\n")
            print(name + "\t" + str(rfe.ranking_[i]))

        predictions = rfe.predict(x_test)

        output.close()

        return predictions
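The ranking_ attribute written above assigns rank 1 to every feature RFE kept. If the plain list of selected feature names is also wanted, a small optional addition (a sketch, placed right after the write loop and using RFE's support_ mask):

        # Optional: collect the names of the features RFE actually kept (ranking_ == 1).
        selected = [name for name, keep in zip(feature_names, rfe.support_) if keep]
        print("selected features:", selected)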
Example 2
    def test_learn_to_rank_coefficients(config_learning, config):

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        learner = config_learning.get("learning", None)
        method_name = learner.get("method", None)

        estimator = joblib.load(os.path.expanduser(config.get('Learner', 'models')) + '/' + method_name + '.pkl')

        coefficients = estimator.coef_[0]

        predictions = []

        for instance in x_test:
            result = 0.0
            for k, val in enumerate(instance):
                result += val * coefficients[k]

            predictions.append(result)

        return predictions
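The loop above computes a manual dot product between each test instance and the linear model's coefficients, deliberately ignoring the intercept. Assuming numpy is imported as np (as in the other examples) and x_test converts cleanly to a 2-D array, an equivalent vectorized sketch would be:

        # Equivalent vectorized form; the intercept is omitted here, exactly as in the explicit loop.
        predictions = np.dot(np.asarray(x_test), coefficients).tolist()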
Example 3
    def round_robin(self, config_path_learning, config_path, feature_set, lps):

        config = ConfigParser()
        config.read(config_path)

        with open(config_path_learning, "r") as cfg_file:
            config_learning = yaml.safe_load(cfg_file)

        f_results = open("results.txt", "w")

        for test_lp in sorted(lps):

            x_train = os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + "x_" + self.config.get("Settings", "dataset") + "." + "train" + "." + "tsv"
            y_train = os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + "y_" + self.config.get("Settings", "dataset") + "." + "train" + "." + "tsv"
            x_test = os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + "x_" + self.config.get("Settings", "dataset") + "." + "test" + "." + "tsv"
            y_test = os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + "y_" + self.config.get("Settings", "dataset") + "." + "test" + "." + "tsv"

            train_lps = ScoringTask.get_train_lps(lps, test_lp)

            train_feature_values = []
            train_reference_values = []

            test_feature_values = read_features_file(os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + \
                                                    "x_" + self.config.get("Settings", "dataset") + "." + feature_set + "." + test_lp + "." + "tsv", "\t")
            test_reference_values = read_reference_file(os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + \
                                                    "y_" + self.config.get("Settings", "dataset") + "." + feature_set + "." + test_lp + "." + "tsv", "\t")

            for train_lp in sorted(train_lps):
                feature_values = read_features_file(os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + \
                                                    "x_" + self.config.get("Settings", "dataset") + "." + feature_set + "." + train_lp + "." + "tsv", "\t")
                reference_values = read_reference_file(os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + \
                                                    "y_" + self.config.get("Settings", "dataset") + "." + feature_set + "." + train_lp + "." + "tsv", "\t")

                train_feature_values += list(feature_values)
                train_reference_values += list(reference_values)

            write_feature_file(x_train, train_feature_values)
            write_reference_file(y_train, train_reference_values)
            write_feature_file(x_test, test_feature_values)
            write_reference_file(y_test, test_reference_values)

            gold_standard = test_reference_values
            predictions = ScoringTask.train_predict(config_path_learning)
            # predictions = ScoringTask.recursive_feature_elimination(config_learning, config, 50)

            correlation = ScoringTask.evaluate_predicted(predictions, gold_standard)

            f_results.write(test_lp + " " + str(correlation) + " with " + feature_set + "\n")
            os.remove(x_train)
            os.remove(x_test)
            os.remove(y_train)
            os.remove(y_test)

        f_results.close()
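ScoringTask.get_train_lps is not shown in this example; presumably it implements the leave-one-language-pair-out split that drives the round robin. A minimal sketch under that assumption:

    @staticmethod
    def get_train_lps(lps, test_lp):
        # Hypothetical helper: train on every language pair except the held-out one.
        return [lp for lp in lps if lp != test_lp]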
Example 4
def train_model(cfg, model_path):

    x_train = read_features_file(cfg.get('x_train'), '\t')
    y_train = read_reference_file(cfg.get('y_train'), '\t')
    x_test = read_features_file(cfg.get('x_test'), '\t')
    scale = cfg.get("scale", True)

    if scale:
        x_train, x_test = scale_datasets(x_train, x_test)

    estimator, scorers = learn_model.set_learning_method(cfg, x_train, y_train)
    estimator.fit(x_train, y_train)
    joblib.dump(estimator, model_path)
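train_model only relies on dictionary-style access to the configuration, so a plain dict (or a YAML file read with yaml.safe_load) is enough. A hypothetical configuration sketch, with purely illustrative paths:

# Hypothetical configuration; the keys mirror the cfg.get(...) calls in train_model.
cfg = {
    "x_train": "data/x_train.tsv",   # tab-separated feature matrix
    "y_train": "data/y_train.tsv",   # one reference value per instance
    "x_test": "data/x_test.tsv",
    "scale": True,                   # scale train and test sets together
    "learning": {"method": "SVC"},   # presumably consumed by learn_model.set_learning_method
}
train_model(cfg, "models/SVC.pkl")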
Example 5
def predict(cfg, model_path, probabilities=True):
    x_train = read_features_file(cfg.get('x_train'), '\t')
    x_test = read_features_file(cfg.get('x_test'), '\t')
    scale = cfg.get("scale", True)

    if scale:
        x_train, x_test = scale_datasets(x_train, x_test)

    estimator = joblib.load(model_path)
    if probabilities:
        return [x[0] for x in estimator.predict_proba(x_test)]
    else:
        return estimator.predict(x_test)
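With a configuration like the one sketched after Example 4, a typical sequence is to train and persist the model, then score the test set with it:

# Hypothetical end-to-end call; the model path is purely illustrative.
model_path = "models/SVC.pkl"
train_model(cfg, model_path)
probas = predict(cfg, model_path)                       # probability of the first class per instance
labels = predict(cfg, model_path, probabilities=False)  # hard class predictions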
Example 6
    def test_learn_to_rank(config_learning, config):

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        learner = config_learning.get("learning", None)
        method_name = learner.get("method", None)

        estimator = joblib.load(os.path.expanduser(config.get('Learner', 'models')) + '/' + method_name + '.pkl')

        return [x[0] for x in estimator.predict_proba(x_test)]
Example 7
    def training_set_for_learn_to_rank_from_feature_file(config_learning, config):

        data_set_name = config.get('WMT', 'dataset')

        feature_values = read_features_file(config_learning.get("x_train", None), '\t')
        human_rankings = read_reference_file(config_learning.get("y_train", None), '\t')

        new_features = []
        new_labels = []

        path_features = os.path.expanduser(config.get('WMT', 'output_dir')) + '/' + 'x_' + data_set_name + ".learn_rank.tsv"
        path_objective = os.path.expanduser(config.get('WMT', 'output_dir')) + '/' + 'y_' + data_set_name + ".learn_rank.tsv"

        for i, label in enumerate(human_rankings):
            if label == 2:
                features = np.subtract(RankingTask.split_list(feature_values[i])[0], RankingTask.split_list(feature_values[i])[1])
                new_label = 1
            else:
                features = np.subtract(RankingTask.split_list(feature_values[i])[1], RankingTask.split_list(feature_values[i])[0])
                new_label = 0

            new_features.append(features)
            new_labels.append(new_label)

        write_feature_file(path_features, new_features)
        write_reference_file(path_objective, new_labels)
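RankingTask.split_list is not included here; presumably it splits the concatenated feature vector of the two compared translations into two equal halves so they can be subtracted. A sketch under that assumption:

    @staticmethod
    def split_list(feature_vector):
        # Hypothetical helper: first half = features of system 1, second half = features of system 2.
        half = len(feature_vector) // 2
        return feature_vector[:half], feature_vector[half:]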
Example 8
def open_datasets(train_path, train_ref_path, test_path,
                  test_ref_path, delim, labels_path=None, tostring=False):
    
    if not os.path.isfile(os.path.abspath(train_path)):
        raise IOError("training dataset path is not valid: %s" % train_path)
    
    if not os.path.isfile(os.path.abspath(train_ref_path)):
        raise IOError("training references path is not valid: %s" % train_ref_path)
    
    if not os.path.isfile(os.path.abspath(test_path)):
        raise IOError("test dataset path is not valid: %s" % test_path)
    
    if not os.path.isfile(os.path.abspath(test_ref_path)):
        raise IOError("test references path is not valid: %s" % test_ref_path)

    labels = []
    if labels_path is not None:
        if not os.path.isfile(os.path.abspath(labels_path)):
            raise IOError("labels file is not valid: %s" % labels_path)

        labels = read_labels_file(labels_path, delim)

    X_train = read_features_file(train_path, delim, tostring=tostring)
    y_train = read_reference_file(train_ref_path, delim, tostring=tostring)
    
    X_test = read_features_file(test_path, delim, tostring=tostring)
    y_test = read_reference_file(test_ref_path, delim, tostring=tostring)
    
    if len(X_train.shape) != 2:
        raise IOError("the training dataset must be in the format of a matrix with M lines and N columns.")

    if len(X_test.shape) != 2:
        raise IOError("the test dataset must be in the format of a matrix with M lines and N columns.")
        
    if X_train.shape[0] != y_train.shape[0]:
        print(X_train.shape[0],  y_train.shape[0])
        raise IOError("the number of instances in the train features file does not match the number of references given.")
        
    if X_test.shape[0] != y_test.shape[0]:
        raise IOError("the number of instances in the test features file does not match the number of references given.")

    if X_train.shape[1] != X_test.shape[1]:
        raise IOError("the number of features in train and test datasets is different.")

    return X_train, y_train, X_test, y_test, labels
Example 9
    def train_save(config_learning, config_data):

        learning_config = config_learning.get("learning", None)
        method_name = learning_config.get("method", None)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        estimator.fit(x_train, y_train)
        joblib.dump(estimator, os.path.expanduser(config_data.get('Learner', 'models')) + '/' + method_name + '.pkl')
Example 10
    def load_predict(config_learning, config_data):

        learning_config = config_learning.get("learning", None)
        method_name = learning_config.get("method", None)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')
        y_test = read_reference_file(config_learning.get('y_test'), '\t')

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        estimator = joblib.load(os.path.expanduser(config_data.get("Learner", "models")) + "/" + method_name + ".pkl")
        predictions = estimator.predict(x_test)

        return predictions
Example 11
    def get_confidence_scores(config_path):

        with open(config_path, 'r') as cfg_file:
            config = yaml.safe_load(cfg_file)

        learning_config = config.get("learning", None)
        method_name = learning_config.get("method", None)
        x_test = read_features_file(config.get("x_test", None), '\t')
        estimator = joblib.load(config.get("save", None) + '/' + method_name + '.pkl')
        confidence_scores = estimator.decision_function(x_test)
        return confidence_scores
Example 12
    def recursive_feature_elimination_cv(config_learning, config_data):

        output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt", "w")

        feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config_data)
        combination_methods = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')
        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        # Current scikit-learn API: the splitter only takes n_splits; RFECV passes y_train to it internally.
        rfecv = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(n_splits=2), scoring='accuracy')
        rfecv.fit(x_train, y_train)

        feature_list = []

        for i, feature_name in enumerate(feature_names):
            if combination_methods[i] == 'both':
                feature_list.append(feature_name)
                feature_list.append(feature_name)
            else:
                feature_list.append(feature_name)

        for i, name in enumerate(feature_list):
            output.write(name + "\t" + str(rfecv.ranking_[i]) + "\n")

        output.close()

        predictions = rfecv.predict(x_test)

        return predictions
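After fitting, scikit-learn's RFECV also reports how many features the cross-validation retained, which can be worth logging next to the ranks; a short optional addition one might place right after rfecv.fit(x_train, y_train):

        # Optional inspection of the cross-validated selection.
        print("features retained by RFECV: %d" % rfecv.n_features_)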
Example 13
    def clean_dataset(config_learning, human_comparisons):

        feature_values = read_features_file(config_learning.get('x_train'), '\t')
        labels = read_reference_file(config_learning.get('y_train'), '\t')
        new_feature_values = []
        new_labels = []
        human_comparisons = RankingTask.eliminate_ties(human_comparisons)
        comparisons_untied_phrases = defaultdict(list)
        comparisons_untied_signs = defaultdict(list)

        deduplicated_phrases, deduplicated_signs = HumanRanking.deduplicate(human_comparisons)

        for dataset, lang_pair in sorted(human_comparisons.keys()):

            for comparison in human_comparisons[dataset, lang_pair]:

                if comparison.sign == "=":
                    continue
                else:
                    comparisons_untied_phrases[dataset, lang_pair].append([comparison.phrase, comparison.sys1, comparison.sys2])
                    comparisons_untied_signs[dataset, lang_pair].append(comparison.sign)

        for dataset, lang_pair in sorted(human_comparisons.keys()):

            for i, comparison in enumerate(comparisons_untied_phrases[dataset, lang_pair]):

                features = feature_values[i]
                label = labels[i]

                if comparison in deduplicated_phrases[dataset, lang_pair]:

                    if deduplicated_signs[dataset, lang_pair][deduplicated_phrases[dataset, lang_pair].index(comparison)] is None:
                        continue

                    label = RankingTask.signs_to_labels(deduplicated_signs[dataset, lang_pair][deduplicated_phrases[dataset, lang_pair].index(comparison)])

                new_feature_values.append(features)
                new_labels.append(label)

        write_feature_file(config_learning.get('x_train') + "." + "clean", new_feature_values)
        write_reference_file(config_learning.get('y_train') + "." + "clean", new_labels)
Example 14
# config_path_learning is assumed to point to the learning configuration (YAML), as in the examples above.
with open(config_path_learning, "r") as cfg_file:
    config_learning = yaml.safe_load(cfg_file)

# Prepare feature files
# This needs to be done for both training and test data, changing the names of the datasets in the configuration file

prepare_wmt = PrepareWmt()
ranking_task = RankingTask(config_path)
ranking_task.prepare_feature_files()

# Create training set for learn to rank
# Comment out the prepare_feature_files() call above when running this step

dataset_for_all = config.get('WMT', 'dataset')
feature_set_name = os.path.basename(config.get('Features', 'feature_set')).replace(".txt", "")
data_structure2 = prepare_wmt.get_data_structure2(config)

f_judgements = config.get('WMT', 'human_ranking')
human_rankings = HumanRanking()
human_rankings.add_human_data(f_judgements, config)

feature_values = read_features_file(os.path.expanduser(config.get('WMT', 'output_dir')) + '/' + 'x_' + dataset_for_all + '.' + feature_set_name + '.' + 'all' + '.tsv', "\t")

ranking_task.training_set_for_learn_to_rank(data_structure2, human_rankings, feature_values)
ranking_task.train_save(config_learning, config)

# Run the trained model on the test feature file and produce the output in WMT format

predictions = ranking_task.test_learn_to_rank_coefficients(config_learning, config)
data_structure = prepare_wmt.get_data_structure(config)
prepare_wmt.wmt_format(config, feature_set_name, dataset_for_all, predictions, data_structure)
Example 15
def get_confidence_scores(model_path, features_path):
    x_test = read_features_file(features_path, '\t')
    estimator = joblib.load(model_path)
    return estimator.decision_function(x_test)
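decision_function is only available on estimators that expose a margin (linear models, SVMs, and similar); a defensive variant, sketched as an assumption rather than part of the original code:

# Hypothetical fallback: use predict_proba when the loaded estimator has no decision_function.
def get_confidence_scores_safe(model_path, features_path):
    x_test = read_features_file(features_path, '\t')
    estimator = joblib.load(model_path)
    if hasattr(estimator, "decision_function"):
        return estimator.decision_function(x_test)
    return [p[1] for p in estimator.predict_proba(x_test)]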