Code Example #1
File: model_runner.py    Project: dwanev/SEL
    def get_salience_by_entity_by_doc_id(self, feature_filename, model, docid_set, feature_names, dexterDataset, wikipediaDataset, show_tree = False, filter_for_interesting = True):
        salience_by_entity_by_doc_id = {}

        X_sel, y_sel, docid_array_sel, entity_id_array_sel = load_feature_matrix(
            feature_filename=feature_filename,
            feature_names=feature_names,
            entity_id_index=1,
            y_feature_index=2, first_feature_index=4, number_features_per_line=len(feature_names) + 4,
            tmp_filename='/tmp/temp_conversion_file.txt'
        )

        self.logger.info('Filtering only golden rows')
        # remove any rows that are not in the golden set

        fg = FilterGolden()
        X_sel, y_sel, docid_array_sel, entity_id_array_sel = fg.get_only_golden_rows(X_sel, y_sel, docid_array_sel,
                                                                                     entity_id_array_sel, dexterDataset,
                                                                                     wikipediaDataset)
        self.logger.info('After filtering only golden rows:')
        self.logger.info('X Shape = %s', X_sel.shape)
        self.logger.info('y Shape = %s', y_sel.shape)

        if filter_for_interesting:
            X_sel, y_sel, docid_array_sel, entity_id_array_sel = fg.get_only_rows_with_entity_salience_variation(
                X_sel, y_sel, docid_array_sel, entity_id_array_sel)

            self.logger.info('After filtering only interesting rows:')
            self.logger.info('X Shape = %s', X_sel.shape)
            self.logger.info('y Shape = %s', y_sel.shape)

        if model is not None:
            t2 = model.predict(X_sel)
        else:
            t2 = np.zeros(shape=y_sel.shape)
            self.logger.warning('No model, returning all 0 predictions')

        if show_tree and model is not None:
            # show details of the first tree in the ensemble
            self.show_tree_info(model.estimators_[0], X_sel)
            # for e in model.estimators_:
            #     self.show_tree_info(e, X_sel)

        for i in range(len(docid_array_sel)):
            docid = int(docid_array_sel[i])
            if docid_set is None or docid in docid_set:
                entity_id = int(entity_id_array_sel[i])
                p2 = t2[i]
                if docid not in salience_by_entity_by_doc_id:
                    salience_by_entity_by_doc_id[docid] = {}
                salience_by_entity_by_doc_id[docid][entity_id] = p2

        return salience_by_entity_by_doc_id
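The method returns a nested dict keyed first by document id and then by entity id. A minimal, self-contained sketch of walking that structure (the values below are illustrative, not taken from the project):

salience_by_entity_by_doc_id = {101: {331: 2.5, 482: 0.0}}  # toy values for illustration

for docid, salience_by_entity in salience_by_entity_by_doc_id.items():
    for entity_id, salience in salience_by_entity.items():
        print('doc %d  entity %d  predicted salience %.2f' % (docid, entity_id, salience))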
Code Example #2
    def train_model(self, feature_filename, feature_names, dexter_dataset,
                    wikipedia_dataset, model_filename):
        X_sel, y_sel, docid_array_sel, entity_id_array_sel = load_feature_matrix(
            feature_filename=feature_filename,
            feature_names=feature_names,
            entity_id_index=1,
            y_feature_index=2,
            first_feature_index=4,
            number_features_per_line=len(feature_names) + 4,
            tmp_filename='/tmp/temp_conversion_file_tf.txt')

        assert (X_sel.shape[1] == len(feature_names))

        # train only on records for which we have a golden salience
        fg = FilterGolden()
        X2_sel, y2_sel, docid2_sel, entityid2_sel = fg.get_only_golden_rows(
            X_sel, y_sel, docid_array_sel, entity_id_array_sel, dexter_dataset,
            wikipedia_dataset)

        # split into test and train
        splitter = DataSplitter()
        # X_train, X_test, y_train, y_test, in_train_set_by_id = splitter.get_test_train_datasets(X2_sel, y2_sel,
        #                                                                                         docid2_sel, 7,
        #                                                                                         train_split=0.90)

        X_train, X_test, y_train, y_test = splitter.get_test_train_datasets_deterministic(
            X2_sel, y2_sel, docid2_sel, Const.TRAINSET_DOCID_LIST)
        half_features = int((len(feature_names)) / 2.0)
        forest = RandomForestRegressor(bootstrap=True,
                                       criterion='mse',
                                       max_depth=16,
                                       max_features=half_features,
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1,
                                       min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=50,
                                       n_jobs=4,
                                       oob_score=True,
                                       random_state=None,
                                       verbose=0,
                                       warm_start=False)

        forest = forest.fit(X_train, y_train)

        print('oob score ' + str(forest.oob_score_))
        with open(model_filename, 'wb') as handle:
            pickle.dump(forest, handle, protocol=pickle.HIGHEST_PROTOCOL)
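A model pickled this way can later be reloaded and passed to a prediction routine such as get_salience_by_entity_by_doc_id in Code Example #1. A minimal sketch, assuming the file written above exists (the path is illustrative):

import pickle

MODEL_FILENAME = '/tmp/sel_random_forest.pickle'  # illustrative; use the model_filename passed to train_model()

with open(MODEL_FILENAME, 'rb') as handle:
    forest = pickle.load(handle)

# The reloaded RandomForestRegressor scores a feature matrix whose columns match
# the ones used for training, e.g. predictions = forest.predict(X_test).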
Code Example #3
File: hack14_filter_golden.py    Project: dwanev/SEL
def train_model():
    X, y, docid_array, entity_id_array = load_feature_matrix(
        feature_filename=INTERMEDIATE_PATH +
        'dexter_all_heavy_catted_8_7_2018.txt',
        feature_names=feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=40,
        tmp_filename='/tmp/temp_conversion_file.txt')

    # train only on records for which we have a golden salience
    fg = FilterGolden()
    logger.info('X Shape = %s', X.shape)
    logger.info('y Shape = %s', y.shape)

    dexter_dataset = DatasetDexter()
    wikipedia_dataset = WikipediaDataset()

    X2, y2, docid2, entityid2 = fg.get_only_golden_rows(
        X, y, docid_array, entity_id_array, dexter_dataset, wikipedia_dataset)

    logger.info('X2 Shape = %s', X2.shape)
    logger.info('y2 Shape = %s', y2.shape)

    wrapper = GBRTWrapper()
    gbrt = wrapper.train_model_no_split(X2, y2, n_estimators=40)
    logger.info('trained')
    # gbrt.save_model()

    # from https://shankarmsy.github.io/stories/gbrt-sklearn.html
    # One of the benefits of growing trees is that we can understand how important each of the features is
    print("Feature Importances")
    print(gbrt.feature_importances_)
    print()
    # Let's print the R-squared value for train/test. This shows how much of the variance in the data the model is
    # able to explain.
    print("R-squared for Train: %.2f" % gbrt.score(X2, y2))
    # print ("R-squared for Test: %.2f" %gbrt.score(X_test, y_test) )
    # - See more at: https://shankarmsy.github.io/stories/gbrt-sklearn.html#sthash.JNZQbnph.dpuf
    return gbrt, X2, y2, docid2, entityid2
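The raw importance array printed above is easier to read when paired with the feature names. A small sketch, assuming the return values of train_model() and the module-level feature_names list used in this file:

gbrt, X2, y2, docid2, entityid2 = train_model()

# sort features by importance, most important first
for name, importance in sorted(zip(feature_names, gbrt.feature_importances_),
                               key=lambda pair: pair[1], reverse=True):
    print('%-40s %.4f' % (name, importance))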
Code Example #4
    def go(self, filename, feature_names, filter_only_golden):
        X, y, docid_array, entity_id_array = load_feature_matrix(feature_filename=filename,
                                                                 feature_names=feature_names,
                                                                 entity_id_index=1,
                                                                 y_feature_index=2,
                                                                 first_feature_index=4,
                                                                 number_features_per_line=len(feature_names) + 4,
                                                                 tmp_filename='/tmp/temp_conversion_file.txt'
                                                                 )

        # use only records for which we have a golden salience
        self.logger.info('__________________________')
        self.logger.info('File %s', filename)
        self.logger.info('X Shape = %s', X.shape)
        self.logger.info('y Shape = %s', y.shape)

        if filter_only_golden:
            dexterDataset = DatasetDexter()
            wikipediaDataset = WikipediaDataset()
            fg = sellibrary.filter_only_golden.FilterGolden()
            X, y, docid_array, entity_id_array = fg.get_only_golden_rows(X, y, docid_array, entity_id_array, dexterDataset, wikipediaDataset)
            self.logger.info('After filtering only golden rows:')
            self.logger.info('X Shape = %s', X.shape)
            self.logger.info('y Shape = %s', y.shape)

        self.logger.info('y[1:10] %s', y[1:10])
        self.logger.info('y[y > 0] %s', y[y > 0.0])

        # binarise salience: scores below 2.0 become 0 (not salient), scores of 2.0 and above become 1 (salient)
        y[y < 2.0] = 0
        y[y >= 2.0] = 1

        ig = self.information_gain_v2(X, y)
        self.logger.info('ig %s', ig)
        self.logger.info('ig shape %s', ig.shape)

        d = {}
        for i in range(len(feature_names)):
            d[feature_names[i]] = ig[i]

        self.sort_and_print(d)
        return d
Code Example #5
    const = Const()
    dropbox_intermediate_path = FileLocations.get_dropbox_intermediate_path()

    file_A_feature_names = const.get_sel_feature_names()
    filename_A = dropbox_intermediate_path + 'wp/wp.txt'

    file_B_feature_names = const.sent_feature_names
    filename_B = dropbox_intermediate_path + 'wp_sentiment_simple.txt'  #'base_tf_simple_v2.txt'

    output_filename = dropbox_intermediate_path + 'wp_joined.txt'  #'joined_sel_sent_and_tf.txt'

    # Load File A
    X1, y1, docid_array1, entity_id_array1 = load_feature_matrix(
        feature_filename=filename_A,
        feature_names=file_A_feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=len(file_A_feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file.txt')

    print(y1.shape)
    dexter_dataset = DatasetDexter()
    wikipedia_dataset = WikipediaDataset()
    # fg = FilterGolden()
    # X1, y1, docid_array1, entity_id_array1 = fg.get_only_golden_rows(X1, y1, docid_array1, entity_id_array1, dexter_dataset,
    #                                                     wikipedia_dataset)

    document_list = dexter_dataset.get_dexter_dataset(
        path=FileLocations.get_dropbox_dexter_path())
    golden_saliency_by_entid_by_docid = dexter_dataset.get_golden_saliency_by_entid_by_docid(
        document_list, wikipedia_dataset)
Code Example #6
File: hack_18_docids.py    Project: dwanev/SEL
from sellibrary.sel.dexter_dataset import DatasetDexter

from sellibrary.util.test_train_splitter import DataSplitter

INTERMEDIATE_PATH = FileLocations.get_dropbox_intermediate_path()

sent_feature_names = [
    'title_sentiment_ngram_20', 'title_neg_sent_ngram_20',
    'title_pos_sent_ngram_20', 'body_sentiment_ngram_20',
    'body_neg_sent_ngram_20', 'body_pos_sent_ngram_20'
]

X_sent, y_sent, docid_array_sent, entity_id_array_sent = load_feature_matrix(
    feature_filename=INTERMEDIATE_PATH + 'sentiment_simple.txt',
    feature_names=sent_feature_names,
    entity_id_index=1,
    y_feature_index=2,
    first_feature_index=4,
    number_features_per_line=10,
    tmp_filename='/tmp/temp_conversion_file.txt')

splitter = DataSplitter()

X_train, X_test, y_train, y_test, in_train_set_by_id = splitter.get_test_train_datasets(
    X_sent, y_sent, docid_array_sent, 7, train_split=0.50)

in_train_set_by_id

ids_in_trainset = []
ids_in_testset = []

for i in in_train_set_by_id.keys():
Code Example #7
File: hack_22.1_split_files.py    Project: dwanev/SEL
if __name__ == "__main__":

    const = Const()
    dropbox_intermediate_path = FileLocations.get_dropbox_intermediate_path()

    input_filename = dropbox_intermediate_path + 'joined_sel_sent_and_tf.txt'
    input_feature_names = const.get_joined_sel_sent_and_tf_feature_names()

    output_filename = dropbox_intermediate_path + 'efficient_2_features.txt'
    output_features_names = const.efficient_2_feature_names

    X1, y1, docid_array1, entity_id_array1 = load_feature_matrix(
        feature_filename=input_filename,
        feature_names=input_feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=len(input_feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file.txt')

    fg = FilterGolden()
    dexter_dataset = DatasetDexter()
    wikipedia_dataset = WikipediaDataset()
    X1, y1, docid_array1, entity_id_array1 = fg.get_only_golden_rows(
        X1, y1, docid_array1, entity_id_array1, dexter_dataset,
        wikipedia_dataset)
    document_list = dexter_dataset.get_dexter_dataset(
        path=FileLocations.get_dropbox_dexter_path())
    golden_saliency_by_entid_by_docid = dexter_dataset.get_golden_saliency_by_entid_by_docid(
        document_list, wikipedia_dataset)
Code Example #8
File: hack_23_ablation.py    Project: dwanev/SEL
def mask_feature_get_ndcg(feature_number_set, feature_filename, feature_names, dexter_dataset, wikipedia_dataset):
    X_sel, y_sel, docid_array_sel, entity_id_array_sel = load_feature_matrix(
        feature_filename=feature_filename,
        feature_names=feature_names,
        entity_id_index=1,
        y_feature_index=2, first_feature_index=4, number_features_per_line=len(feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file_ablation.txt'
    )

    assert (X_sel.shape[1] == len(feature_names))
    # zero out the selected feature columns in both test and training data so they cannot be used by the model
    for feature_number in feature_number_set:
        if 0 <= feature_number < len(feature_names):
            X_sel[:, feature_number] = 0

    # train only on records for which we have a golden salience
    fg = FilterGolden()
    X2_sel, y2_sel, docid2_sel, entityid2_sel = fg.get_only_golden_rows(
        X_sel, y_sel, docid_array_sel, entity_id_array_sel, dexter_dataset, wikipedia_dataset)
    logger.info('Shape only golden %s', str(X2_sel.shape))
    # train only on records we have salience across multiple documents
    X2_sel, y2_sel, docid2_sel, entityid2_sel = fg.get_only_rows_with_entity_salience_variation(
        X2_sel, y2_sel, docid2_sel, entityid2_sel)

    logger.info('Shape only entities with multiple saliences %s', str(X2_sel.shape))
    # split into test and train
    splitter = DataSplitter()
    # X_train, X_test, y_train, y_test, in_train_set_by_id = splitter.get_test_train_datasets(X2_sel, y2_sel,
    #                                                                                         docid2_sel, 7,
    #                                                                                         train_split=0.90)

    X_train, X_test, y_train, y_test = splitter.get_test_train_datasets_deterministic(X2_sel, y2_sel,
                                                                                      docid2_sel,
                                                                                      Const.TRAINSET_DOCID_LIST)
    half_features = int((len(feature_names) - len(feature_number_set)) / 2.0)
    # forest = ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=16,
    #                              max_features=half_features, max_leaf_nodes=None,
    #                              min_impurity_decrease=0.0, min_impurity_split=None,
    #                              min_samples_leaf=1, min_samples_split=2,
    #                              min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=4,
    #                              oob_score=False, random_state=None, verbose=0, warm_start=False)
    #
    forest = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=16,
                                   max_features=half_features, max_leaf_nodes=None,
                                   min_impurity_decrease=0.0, min_impurity_split=None,
                                   min_samples_leaf=1, min_samples_split=2,
                                   min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=4,
                                   oob_score=True, random_state=None, verbose=0, warm_start=False)

    forest = forest.fit(X_train, y_train)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    print("oob score :" + str(forest.oob_score_))
    test_score = forest.score(X_test, y_test)
    print("oob score (on test data):" + str(test_score))
    # test_score = 0.0

    # Print the feature ranking
    print("Feature ranking:")

    for f in range(X2_sel.shape[1]):
        print(
            "%d, feature, %d, %s, %f " % (
                f + 1, indices[f], const.get_joined_feature_names()[indices[f]], importances[indices[f]]))

    model_filename = Const.TEMP_PATH + 'ablation.pickle'
    with open(model_filename, 'wb') as handle:
        pickle.dump(forest, handle, protocol=pickle.HIGHEST_PROTOCOL)

    docid_set = set(Const.TESTSET_DOCID_LIST)

    model_runner = ModelRunner()
    overall_ndcg, ndcg_by_docid, overall_trec_val_by_name, trec_val_by_name_by_docid = model_runner.get_ndcg_and_trec_eval(
        feature_filename, model_filename, feature_names, docid_set,
        wikipedia_dataset, dexter_dataset, per_document_ndcg=False)
    return overall_ndcg, test_score, overall_trec_val_by_name
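A typical driver for this ablation helper masks one feature at a time and records the resulting NDCG. A minimal sketch, assuming the module-level INTERMEDIATE_PATH, const, DatasetDexter and WikipediaDataset objects used elsewhere in these examples; the input file name is illustrative:

feature_filename = INTERMEDIATE_PATH + 'joined_sel_sent_and_tf.txt'  # illustrative path
feature_names = const.get_joined_feature_names()
dexter_dataset = DatasetDexter()
wikipedia_dataset = WikipediaDataset()

ndcg_by_feature = {}
for feature_number in range(len(feature_names)):
    overall_ndcg, test_score, overall_trec_val_by_name = mask_feature_get_ndcg(
        {feature_number}, feature_filename, feature_names,
        dexter_dataset, wikipedia_dataset)
    ndcg_by_feature[feature_names[feature_number]] = overall_ndcg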