def get_salience_by_entity_by_doc_id(self, feature_filename, model, docid_set, feature_names,
                                     dexterDataset, wikipediaDataset, show_tree=False,
                                     filter_for_interesting=True):
    salience_by_entity_by_doc_id = {}
    X_sel, y_sel, docid_array_sel, entity_id_array_sel = load_feature_matrix(
        feature_filename=feature_filename,
        feature_names=feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=len(feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file.txt'
    )

    self.logger.info('Filtering only golden rows')
    # remove any rows that are not in the golden set
    fg = FilterGolden()
    X_sel, y_sel, docid_array_sel, entity_id_array_sel = fg.get_only_golden_rows(
        X_sel, y_sel, docid_array_sel, entity_id_array_sel, dexterDataset, wikipediaDataset)

    self.logger.info('After filtering only golden rows:')
    self.logger.info('X Shape = %s', X_sel.shape)
    self.logger.info('y Shape = %s', y_sel.shape)

    if filter_for_interesting:
        X_sel, y_sel, docid_array_sel, entity_id_array_sel = fg.get_only_rows_with_entity_salience_variation(
            X_sel, y_sel, docid_array_sel, entity_id_array_sel)
        self.logger.info('After filtering only interesting rows:')
        self.logger.info('X Shape = %s', X_sel.shape)
        self.logger.info('y Shape = %s', y_sel.shape)

    if model is not None:
        t2 = model.predict(X_sel)
    else:
        t2 = np.zeros(shape=y_sel.shape)
        self.logger.warning('No model, returning all 0 predictions')

    if show_tree:
        self.show_tree_info(model.estimators_[0], X_sel)
        # for e in model.estimators_:
        #     self.show_tree_info(e, X_sel)

    for i in range(len(docid_array_sel)):
        docid = int(docid_array_sel[i])
        if docid_set is None or docid in docid_set:
            entity_id = int(entity_id_array_sel[i])
            p2 = t2[i]
            if docid not in salience_by_entity_by_doc_id:
                salience_by_entity_by_doc_id[docid] = {}
            salience_by_entity_by_doc_id[docid][entity_id] = p2
    return salience_by_entity_by_doc_id
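# Usage sketch (an assumption, not from the original source): load a previously pickled regressor
# and score the test documents. `runner` stands for an instance of the enclosing class and the
# model path is a placeholder.
#
#   with open('/tmp/sel_forest.pickle', 'rb') as handle:
#       model = pickle.load(handle)
#   salience_by_entity_by_docid = runner.get_salience_by_entity_by_doc_id(
#       feature_filename=FileLocations.get_dropbox_intermediate_path() + 'wp/wp.txt',
#       model=model,
#       docid_set=set(Const.TESTSET_DOCID_LIST),
#       feature_names=Const().get_sel_feature_names(),
#       dexterDataset=DatasetDexter(),
#       wikipediaDataset=WikipediaDataset())
#   # salience_by_entity_by_docid[docid][entity_id] -> predicted salience score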
def train_model(self, feature_filename, feature_names, dexter_dataset, wikipedia_dataset, model_filename):
    X_sel, y_sel, docid_array_sel, entity_id_array_sel = load_feature_matrix(
        feature_filename=feature_filename,
        feature_names=feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=len(feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file_tf.txt')

    assert (X_sel.shape[1] == len(feature_names))

    # train only on records we have a golden salience for
    fg = FilterGolden()
    X2_sel, y2_sel, docid2_sel, entityid2_sel = fg.get_only_golden_rows(
        X_sel, y_sel, docid_array_sel, entity_id_array_sel, dexter_dataset, wikipedia_dataset)

    # split into test and train
    splitter = DataSplitter()
    # X_train, X_test, y_train, y_test, in_train_set_by_id = splitter.get_test_train_datasets(X2_sel, y2_sel,
    #                                                                                         docid2_sel, 7,
    #                                                                                         train_split=0.90)
    X_train, X_test, y_train, y_test = splitter.get_test_train_datasets_deterministic(
        X2_sel, y2_sel, docid2_sel, Const.TRAINSET_DOCID_LIST)

    half_features = int(len(feature_names) / 2.0)
    forest = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=16,
                                   max_features=half_features, max_leaf_nodes=None,
                                   min_impurity_decrease=0.0, min_impurity_split=None,
                                   min_samples_leaf=1, min_samples_split=2,
                                   min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=4,
                                   oob_score=True, random_state=None, verbose=0, warm_start=False)

    forest = forest.fit(X_train, y_train)
    print('oob score ' + str(forest.oob_score_))

    with open(model_filename, 'wb') as handle:
        pickle.dump(forest, handle, protocol=pickle.HIGHEST_PROTOCOL)
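# Usage sketch (an assumption, not from the original source): train on the deterministic
# Const.TRAINSET_DOCID_LIST split, persist the forest, then reload it for prediction. `runner`
# again stands for an instance of the enclosing class and the filenames are placeholders.
#
#   const = Const()
#   runner.train_model(
#       feature_filename=FileLocations.get_dropbox_intermediate_path() + 'wp/wp.txt',
#       feature_names=const.get_sel_feature_names(),
#       dexter_dataset=DatasetDexter(),
#       wikipedia_dataset=WikipediaDataset(),
#       model_filename='/tmp/sel_forest.pickle')
#   with open('/tmp/sel_forest.pickle', 'rb') as handle:
#       forest = pickle.load(handle)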
def train_model():
    X, y, docid_array, entity_id_array = load_feature_matrix(
        feature_filename=INTERMEDIATE_PATH + 'dexter_all_heavy_catted_8_7_2018.txt',
        feature_names=feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=40,
        tmp_filename='/tmp/temp_conversion_file.txt')

    # train only on records we have a golden salience for
    fg = FilterGolden()
    logger.info('X Shape = %s', X.shape)
    logger.info('y Shape = %s', y.shape)

    dexter_dataset = DatasetDexter()
    wikipedia_dataset = WikipediaDataset()

    X2, y2, docid2, entityid2 = fg.get_only_golden_rows(
        X, y, docid_array, entity_id_array, dexter_dataset, wikipedia_dataset)

    logger.info('X2 Shape = %s', X2.shape)
    logger.info('y2 Shape = %s', y2.shape)

    wrapper = GBRTWrapper()
    gbrt = wrapper.train_model_no_split(X2, y2, n_estimators=40)
    logger.info('trained')
    # gbrt.save_model()

    # from https://shankarmsy.github.io/stories/gbrt-sklearn.html
    # One of the benefits of growing trees is that we can see how important each feature is
    print("Feature Importances")
    print(gbrt.feature_importances_)
    print()

    # R-squared for train/test: how much of the variance in the data the model explains
    print("R-squared for Train: %.2f" % gbrt.score(X2, y2))
    # print("R-squared for Test: %.2f" % gbrt.score(X_test, y_test))
    # See more at: https://shankarmsy.github.io/stories/gbrt-sklearn.html#sthash.JNZQbnph.dpuf
    return gbrt, X2, y2, docid2, entityid2
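# Usage sketch (an assumption): train the GBRT and persist it with pickle, mirroring how the
# random-forest models are saved elsewhere in this project. The output filename is a placeholder.
#
#   if __name__ == "__main__":
#       gbrt, X2, y2, docid2, entityid2 = train_model()
#       with open(INTERMEDIATE_PATH + 'gbrt_model.pickle', 'wb') as handle:
#           pickle.dump(gbrt, handle, protocol=pickle.HIGHEST_PROTOCOL)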
def go(self, filename, feature_names, filter_only_golden):
    X, y, docid_array, entity_id_array = load_feature_matrix(
        feature_filename=filename,
        feature_names=feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=len(feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file.txt'
    )

    self.logger.info('__________________________')
    self.logger.info('File %s', filename)
    self.logger.info('X Shape = %s', X.shape)
    self.logger.info('y Shape = %s', y.shape)

    if filter_only_golden:
        # keep only records we have a golden salience for
        dexterDataset = DatasetDexter()
        wikipediaDataset = WikipediaDataset()
        fg = sellibrary.filter_only_golden.FilterGolden()
        X, y, docid_array, entity_id_array = fg.get_only_golden_rows(
            X, y, docid_array, entity_id_array, dexterDataset, wikipediaDataset)
        self.logger.info('After filtering only golden rows:')
        self.logger.info('X Shape = %s', X.shape)
        self.logger.info('y Shape = %s', y.shape)

    self.logger.info('y [1:10] %s', y[1:10])
    self.logger.info('y > 0 %s', y[y > 0.0])

    # binarise the golden salience: scores below 2.0 count as not salient
    y[y < 2.0] = 0
    y[y >= 2.0] = 1

    ig = self.information_gain_v2(X, y)
    self.logger.info('ig %s', ig)
    self.logger.info('ig shape %s', ig.shape)

    d = {}
    for i in range(len(feature_names)):
        d[feature_names[i]] = ig[i]
    self.sort_and_print(d)
    return d
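# Usage sketch (an assumption, not from the original source): rank the SEL features by
# information gain against the binarised golden salience. `ig_calculator` stands for an
# instance of the enclosing class.
#
#   const = Const()
#   ig_by_feature_name = ig_calculator.go(
#       filename=FileLocations.get_dropbox_intermediate_path() + 'wp/wp.txt',
#       feature_names=const.get_sel_feature_names(),
#       filter_only_golden=True)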
const = Const()
dropbox_intermediate_path = FileLocations.get_dropbox_intermediate_path()

file_A_feature_names = const.get_sel_feature_names()
filename_A = dropbox_intermediate_path + 'wp/wp.txt'

file_B_feature_names = const.sent_feature_names
filename_B = dropbox_intermediate_path + 'wp_sentiment_simple.txt'  # 'base_tf_simple_v2.txt'

output_filename = dropbox_intermediate_path + 'wp_joined.txt'  # 'joined_sel_sent_and_tf.txt'

# Load File A
X1, y1, docid_array1, entity_id_array1 = load_feature_matrix(
    feature_filename=filename_A,
    feature_names=file_A_feature_names,
    entity_id_index=1,
    y_feature_index=2,
    first_feature_index=4,
    number_features_per_line=len(file_A_feature_names) + 4,
    tmp_filename='/tmp/temp_conversion_file.txt')

print(y1.shape)

dexter_dataset = DatasetDexter()
wikipedia_dataset = WikipediaDataset()

# fg = FilterGolden()
# X1, y1, docid_array1, entity_id_array1 = fg.get_only_golden_rows(X1, y1, docid_array1, entity_id_array1,
#                                                                  dexter_dataset, wikipedia_dataset)

document_list = dexter_dataset.get_dexter_dataset(path=FileLocations.get_dropbox_dexter_path())
golden_saliency_by_entid_by_docid = dexter_dataset.get_golden_saliency_by_entid_by_docid(
    document_list, wikipedia_dataset)
from sellibrary.sel.dexter_dataset import DatasetDexter
from sellibrary.util.test_train_splitter import DataSplitter

INTERMEDIATE_PATH = FileLocations.get_dropbox_intermediate_path()

sent_feature_names = [
    'title_sentiment_ngram_20',
    'title_neg_sent_ngram_20',
    'title_pos_sent_ngram_20',
    'body_sentiment_ngram_20',
    'body_neg_sent_ngram_20',
    'body_pos_sent_ngram_20'
]

X_sent, y_sent, docid_array_sent, entity_id_array_sent = load_feature_matrix(
    feature_filename=INTERMEDIATE_PATH + 'sentiment_simple.txt',
    feature_names=sent_feature_names,
    entity_id_index=1,
    y_feature_index=2,
    first_feature_index=4,
    number_features_per_line=10,
    tmp_filename='/tmp/temp_conversion_file.txt')

splitter = DataSplitter()
X_train, X_test, y_train, y_test, in_train_set_by_id = splitter.get_test_train_datasets(
    X_sent, y_sent, docid_array_sent, 7, train_split=0.50)

# partition the document ids by the in-train flag
ids_in_trainset = []
ids_in_testset = []
for i in in_train_set_by_id.keys():
    if in_train_set_by_id[i]:
        ids_in_trainset.append(i)
    else:
        ids_in_testset.append(i)
if __name__ == "__main__":
    const = Const()
    dropbox_intermediate_path = FileLocations.get_dropbox_intermediate_path()

    input_filename = dropbox_intermediate_path + 'joined_sel_sent_and_tf.txt'
    input_feature_names = const.get_joined_sel_sent_and_tf_feature_names()

    output_filename = dropbox_intermediate_path + 'efficient_2_features.txt'
    output_features_names = const.efficient_2_feature_names

    X1, y1, docid_array1, entity_id_array1 = load_feature_matrix(
        feature_filename=input_filename,
        feature_names=input_feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=len(input_feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file.txt')

    fg = FilterGolden()
    dexter_dataset = DatasetDexter()
    wikipedia_dataset = WikipediaDataset()

    X1, y1, docid_array1, entity_id_array1 = fg.get_only_golden_rows(
        X1, y1, docid_array1, entity_id_array1, dexter_dataset, wikipedia_dataset)

    document_list = dexter_dataset.get_dexter_dataset(path=FileLocations.get_dropbox_dexter_path())
    golden_saliency_by_entid_by_docid = dexter_dataset.get_golden_saliency_by_entid_by_docid(
        document_list, wikipedia_dataset)
def mask_feature_get_ndcg(feature_number_set, feature_filename, feature_names, dexter_dataset, wikipedia_dataset):
    X_sel, y_sel, docid_array_sel, entity_id_array_sel = load_feature_matrix(
        feature_filename=feature_filename,
        feature_names=feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=len(feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file_ablation.txt'
    )
    assert (X_sel.shape[1] == len(feature_names))

    # zero out the masked feature columns in both train and test data so they cannot be used
    for feature_number in feature_number_set:
        if 0 <= feature_number < len(feature_names):
            X_sel[:, feature_number] = 0

    # train only on records we have a golden salience for
    fg = FilterGolden()
    X2_sel, y2_sel, docid2_sel, entityid2_sel = fg.get_only_golden_rows(
        X_sel, y_sel, docid_array_sel, entity_id_array_sel, dexter_dataset, wikipedia_dataset)
    logger.info('Shape only golden %s', str(X2_sel.shape))

    # keep only entities whose salience varies across documents
    X2_sel, y2_sel, docid2_sel, entityid2_sel = fg.get_only_rows_with_entity_salience_variation(
        X2_sel, y2_sel, docid2_sel, entityid2_sel)
    logger.info('Shape only entities with multiple saliences %s', str(X2_sel.shape))

    # split into test and train
    splitter = DataSplitter()
    # X_train, X_test, y_train, y_test, in_train_set_by_id = splitter.get_test_train_datasets(X2_sel, y2_sel,
    #                                                                                         docid2_sel, 7,
    #                                                                                         train_split=0.90)
    X_train, X_test, y_train, y_test = splitter.get_test_train_datasets_deterministic(
        X2_sel, y2_sel, docid2_sel, Const.TRAINSET_DOCID_LIST)

    half_features = int((len(feature_names) - len(feature_number_set)) / 2.0)
    # forest = ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=16,
    #                              max_features=half_features, max_leaf_nodes=None,
    #                              min_impurity_decrease=0.0, min_impurity_split=None,
    #                              min_samples_leaf=1, min_samples_split=2,
    #                              min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=4,
    #                              oob_score=False, random_state=None, verbose=0, warm_start=False)
    forest = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=16,
                                   max_features=half_features, max_leaf_nodes=None,
                                   min_impurity_decrease=0.0, min_impurity_split=None,
                                   min_samples_leaf=1, min_samples_split=2,
                                   min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=4,
                                   oob_score=True, random_state=None, verbose=0, warm_start=False)

    forest = forest.fit(X_train, y_train)

    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]

    print("oob score :" + str(forest.oob_score_))
    test_score = forest.score(X_test, y_test)
    print("R-squared (on test data):" + str(test_score))
    # test_score = 0.0

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(X2_sel.shape[1]):
        print("%d, feature, %d, %s, %f " % (
            f + 1, indices[f], const.get_joined_feature_names()[indices[f]], importances[indices[f]]))

    model_filename = Const.TEMP_PATH + 'abalation.pickle'
    with open(model_filename, 'wb') as handle:
        pickle.dump(forest, handle, protocol=pickle.HIGHEST_PROTOCOL)

    docid_set = set(Const.TESTSET_DOCID_LIST)
    model_runner = ModelRunner()
    overall_ndcg, ndcg_by_docid, overall_trec_val_by_name, trec_val_by_name_by_docid = model_runner.get_ndcg_and_trec_eval(
        feature_filename, model_filename, feature_names, docid_set, wikipedia_dataset, dexter_dataset,
        per_document_ndcg=False)

    return overall_ndcg, test_score, overall_trec_val_by_name
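# Usage sketch (an assumption, not from the original source): a simple ablation loop that masks
# one feature at a time and records the resulting overall NDCG so features can be compared.
# The joined feature filename is a placeholder; the dataset wiring follows the patterns used above.
#
#   dexter_dataset = DatasetDexter()
#   wikipedia_dataset = WikipediaDataset()
#   feature_names = const.get_joined_feature_names()
#   feature_filename = dropbox_intermediate_path + 'joined_sel_sent_and_tf.txt'
#   ndcg_by_masked_feature = {}
#   for feature_number in range(len(feature_names)):
#       overall_ndcg, test_score, trec_vals = mask_feature_get_ndcg(
#           {feature_number}, feature_filename, feature_names, dexter_dataset, wikipedia_dataset)
#       ndcg_by_masked_feature[feature_names[feature_number]] = overall_ndcg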