Beispiel #1
0
    def round_robin(self, config_path_learning, config_path, feature_set, lps):

        config = ConfigParser()
        config.readfp(open(config_path))

        with open(config_path_learning, "r") as cfg_file:
            config_learning = yaml.load(cfg_file.read())

        f_results = open("results.txt", "w")

        for test_lp in sorted(lps):

            x_train = os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + "x_" + self.config.get("Settings", "dataset") + "." + "train" + "." + "tsv"
            y_train = os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + "y_" + self.config.get("Settings", "dataset") + "." + "train" + "." + "tsv"
            x_test = os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + "x_" + self.config.get("Settings", "dataset") + "." + "test" + "." + "tsv"
            y_test = os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + "y_" + self.config.get("Settings", "dataset") + "." + "test" + "." + "tsv"

            train_lps = ScoringTask.get_train_lps(lps, test_lp)

            train_feature_values = []
            train_reference_values = []

            test_feature_values = read_features_file(os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + \
                                                    "x_" + self.config.get("Settings", "dataset") + "." + feature_set + "." + test_lp + "." + "tsv", "\t")
            test_reference_values = read_reference_file(os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + \
                                                    "y_" + self.config.get("Settings", "dataset") + "." + feature_set + "." + test_lp + "." + "tsv", "\t")

            for train_lp in sorted(train_lps):
                feature_values = read_features_file(os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + \
                                                    "x_" + self.config.get("Settings", "dataset") + "." + feature_set + "." + train_lp + "." + "tsv", "\t")
                reference_values = read_reference_file(os.path.expanduser(self.config.get('Data', 'output_dir')) + "/" + \
                                                    "y_" + self.config.get("Settings", "dataset") + "." + feature_set + "." + train_lp + "." + "tsv", "\t")

                train_feature_values += list(feature_values)
                train_reference_values += list(reference_values)

            write_feature_file(x_train, train_feature_values)
            write_reference_file(y_train, train_reference_values)
            write_feature_file(x_test, test_feature_values)
            write_reference_file(y_test, test_reference_values)

            gold_standard = test_reference_values
            predictions = ScoringTask.train_predict(config_path_learning)
            # predictions = ScoringTask.recursive_feature_elimination(config_learning, config, 50)

            correlation = ScoringTask.evaluate_predicted(predictions, gold_standard)

            f_results.write(test_lp + " " + str(correlation) + " with " + feature_set + "\n")
            os.remove(x_train)
            os.remove(x_test)
            os.remove(y_train)
            os.remove(y_test)
Beispiel #2
0
    def recursive_feature_elimination(config_learning, config_data, number_features):

        output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt", "w")

        feature_names = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')
        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        rfe = RFE(estimator, number_features, step=1)
        rfe.fit(x_train, y_train)

        for i, name in enumerate(feature_names):
            output.write(name + "\t" + str(rfe.ranking_[i]) + "\n")
            print(name + "\t" + str(rfe.ranking_[i]))

        predictions = rfe.predict(x_test)

        output.close()

        return predictions
Beispiel #3
0
    def training_set_for_learn_to_rank_from_feature_file(config_learning, config):

        data_set_name = config.get('WMT', 'dataset')

        feature_values = read_features_file(config_learning.get("x_train", None), '\t')
        human_rankings = read_reference_file(config_learning.get("y_train", None), '\t')

        new_features = []
        new_labels = []

        path_features = os.path.expanduser(config.get('WMT', 'output_dir')) + '/' + 'x_' + data_set_name + ".learn_rank.tsv"
        path_objective = os.path.expanduser(config.get('WMT', 'output_dir')) + '/' + 'y_' + data_set_name + ".learn_rank.tsv"

        for i, label in enumerate(human_rankings):
            if label == 2:
                features = np.subtract(RankingTask.split_list(feature_values[i])[0], RankingTask.split_list(feature_values[i])[1])
                new_label = 1
            else:
                features = np.subtract(RankingTask.split_list(feature_values[i])[1], RankingTask.split_list(feature_values[i])[0])
                new_label = 0

            new_features.append(features)
            new_labels.append(new_label)

        write_feature_file(path_objective, new_features)
        write_reference_file(path_features, new_labels)
Beispiel #4
0
def open_datasets(train_path, train_ref_path, test_path,
                  test_ref_path, delim, labels_path=None, tostring=False):
    
    if not os.path.isfile(os.path.abspath(train_path)):
        raise IOError("training dataset path is not valid: %s" % train_path)
    
    if not os.path.isfile(os.path.abspath(train_ref_path)):
        raise IOError("training references path is not valid: %s" % train_ref_path)
    
    if not os.path.isfile(os.path.abspath(test_path)):
        raise IOError("test dataset path is not valid: %s" % test_path)
    
    if not os.path.isfile(os.path.abspath(test_ref_path)):
        raise IOError("test references path is not valid: %s" % test_ref_path)

    labels = []
    if labels_path is not None:
        if not os.path.isfile(os.path.abspath(labels_path)):
            raise IOError("labels file is not valid: %s" % labels_path)

        labels = read_labels_file(labels_path, delim)

    X_train = read_features_file(train_path, delim, tostring=tostring)
    y_train = read_reference_file(train_ref_path, delim, tostring=tostring)
    
    X_test = read_features_file(test_path, delim, tostring=tostring)
    y_test = read_reference_file(test_ref_path, delim, tostring=tostring)
    
    if len(X_train.shape) != 2:
        raise IOError("the training dataset must be in the format of a matrix with M lines and N columns.")

    if len(X_test.shape) != 2:
        raise IOError("the test dataset must be in the format of a matrix with M lines and N columns.")
        
    if X_train.shape[0] != y_train.shape[0]:
        print(X_train.shape[0],  y_train.shape[0])
        raise IOError("the number of instances in the train features file does not match the number of references given.")
        
    if X_test.shape[0] != y_test.shape[0]:
        raise IOError("the number of instances in the test features file does not match the number of references given.")

    if X_train.shape[1] != X_test.shape[1]:
        raise IOError("the number of features in train and test datasets is different.")

    return X_train, y_train, X_test, y_test, labels
Beispiel #5
0
    def load_predict(config_learning, config_data):

        learning_config = config_learning.get("learning", None)
        method_name = learning_config.get("method", None)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')
        y_test = read_reference_file(config_learning.get('y_test'), '\t')

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        estimator = joblib.load(os.path.expanduser(config_data.get("Learner", "models")) + "/" + method_name + ".pkl")
        predictions = estimator.predict(x_test)

        return predictions
Beispiel #6
0
    def get_data(self):

        human_scores = read_reference_file(os.path.expanduser(self.config.get('Data', 'human_scores')), '\t')
        process = Process(self.config)
        sents_tgt, sents_ref = process.run_processors()

        extractor = FeatureExtractor(self.config)
        features_to_extract = FeatureExtractor.read_feature_names(self.config)
        extractor.extract_features(features_to_extract, sents_tgt, sents_ref)

        return extractor.vals, human_scores
Beispiel #7
0
def train_model(cfg, model_path):

    x_train = read_features_file(cfg.get('x_train'), '\t')
    y_train = read_reference_file(cfg.get('y_train'), '\t')
    x_test = read_features_file(cfg.get('x_test'), '\t')
    scale = cfg.get("scale", True)

    if scale:
        x_train, x_test = scale_datasets(x_train, x_test)

    estimator, scorers = learn_model.set_learning_method(cfg, x_train, y_train)
    estimator.fit(x_train, y_train)
    joblib.dump(estimator, model_path)
Beispiel #8
0
    def train_save(config_learning, config_data):

        learning_config = config_learning.get("learning", None)
        method_name = learning_config.get("method", None)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        estimator.fit(x_train, y_train)
        joblib.dump(estimator, os.path.expanduser(config_data.get('Learner', 'models')) + '/' + method_name + '.pkl')
Beispiel #9
0
    def clean_dataset(config_learning, human_comparisons):

        feature_values = read_features_file(config_learning.get('x_train'), '\t')
        labels = read_reference_file(config_learning.get('y_train'), '\t')
        new_feature_values = []
        new_labels = []
        human_comparisons = RankingTask.eliminate_ties(human_comparisons)
        comparisons_untied_phrases = defaultdict(list)
        comparisons_untied_signs = defaultdict(list)

        deduplicated_phrases, deduplicated_signs = HumanRanking.deduplicate(human_comparisons)

        for dataset, lang_pair in sorted(human_comparisons.keys()):

            for comparison in human_comparisons[dataset, lang_pair]:

                if comparison.sign == "=":
                    continue
                else:
                    comparisons_untied_phrases[dataset, lang_pair].append([comparison.phrase, comparison.sys1, comparison.sys2])
                    comparisons_untied_signs[dataset, lang_pair].append(comparison.sign)

        for dataset, lang_pair in sorted(human_comparisons.keys()):

            for i, comparison in enumerate(comparisons_untied_phrases[dataset, lang_pair]):

                features = feature_values[i]
                label = labels[i]

                if comparison in deduplicated_phrases[dataset, lang_pair]:

                    if deduplicated_signs[dataset, lang_pair][deduplicated_phrases[dataset, lang_pair].index(comparison)] is None:
                        continue

                    label = RankingTask.signs_to_labels(deduplicated_signs[dataset, lang_pair][deduplicated_phrases[dataset, lang_pair].index(comparison)])

                new_feature_values.append(features)
                new_labels.append(label)

        write_feature_file(config_learning.get('x_train') + "." + "clean", new_feature_values)
        write_reference_file(config_learning.get('y_train') + "." + "clean", new_labels)
Beispiel #10
0
    def recursive_feature_elimination_cv(config_learning, config_data):

        output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt", "w")

        feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config_data)
        combination_methods = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')
        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        rfecv = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(y_train, 2), scoring='accuracy')
        rfecv.fit(x_train, y_train)

        feature_list = []

        for i, feature_name in enumerate(feature_names):
             if combination_methods[i] == 'both':
                 feature_list.append(feature_name)
                 feature_list.append(feature_name)
             else:
                 feature_list.append(feature_name)

        for i, name in enumerate(feature_list):
            output.write(name + "\t" + str(rfecv.ranking_[i]) + "\n")

        output.close()

        predictions = rfecv.predict(x_test)

        return predictions