# Shared imports assumed by the snippets below (TensorFlow 1.x API); util,
# turney, densifier, aicyber, li_regressor, the my_model_* objects and the
# SETTINGS list are project-local and defined elsewhere in the repository.
import os

import numpy as np
import pandas as pd
import sklearn.linear_model
import tensorflow as tf
from sklearn.model_selection import KFold

import util


def train(session, features, labels, training_steps, batch_size,
          dropout_hidden, dropout_embedding, batch_gen=util.Batch_Gen,
          report_at=0):
    '''
    Standard training function using dropout (on embedding and hidden layers).
    '''
    sess = session
    loss = tf.get_collection(tf.GraphKeys.LOSSES)[-1]
    training_objective = tf.get_default_graph().\
        get_operation_by_name('training_objective')
    batch_gen = batch_gen(features, labels, batch_size)
    for i_step in range(training_steps):
        curr_features, curr_labels = batch_gen.next()
        # monitor training
        if report_at > 0 and i_step % report_at == 0:
            curr_loss, preds = sess.run(
                [loss, 'output_layer:0'],
                feed_dict={'input_layer:0': curr_features,
                           'dropout_embedding:0': .0,
                           'dropout_hidden:0': .0,
                           'actual_values:0': curr_labels})
            preds = pd.DataFrame(data=preds,
                                 columns=curr_labels.columns,
                                 index=curr_labels.index)
            perf = util.eval(true=curr_labels, prediction=preds)
            print(i_step, curr_loss, np.mean(perf))
        # actual training step
        sess.run(training_objective,
                 feed_dict={'input_layer:0': curr_features,
                            'dropout_embedding:0': dropout_embedding,
                            'dropout_hidden:0': dropout_hidden,
                            'actual_values:0': curr_labels})
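# train() above (and test() below) fetch tensors and ops from the default
# graph purely by name. The following is a minimal graph-construction sketch
# showing where those names ('input_layer:0', 'actual_values:0',
# 'dropout_embedding:0', 'dropout_hidden:0', 'output_layer:0',
# 'training_objective') could come from; the single hidden layer, its size,
# the MSE loss and the Adam optimizer are assumptions for illustration, not
# the repository's actual architecture. The dropout placeholders are treated
# as drop rates, matching the .0 fed at evaluation time.
def build_graph_sketch(n_features, n_targets, n_hidden=256):
    inputs = tf.placeholder(tf.float32, [None, n_features], name='input_layer')
    targets = tf.placeholder(tf.float32, [None, n_targets],
                             name='actual_values')
    drop_emb = tf.placeholder(tf.float32, shape=[], name='dropout_embedding')
    drop_hid = tf.placeholder(tf.float32, shape=[], name='dropout_hidden')

    x = tf.nn.dropout(inputs, keep_prob=1. - drop_emb)
    hidden = tf.layers.dense(x, n_hidden, activation=tf.nn.relu)
    hidden = tf.nn.dropout(hidden, keep_prob=1. - drop_hid)
    outputs = tf.identity(tf.layers.dense(hidden, n_targets),
                          name='output_layer')

    # mean_squared_error registers itself in tf.GraphKeys.LOSSES, which is
    # where train() picks the loss up again
    loss = tf.losses.mean_squared_error(labels=targets, predictions=outputs)
    tf.train.AdamOptimizer().minimize(loss, name='training_objective')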
def test(session, features, labels):
    sess = session
    preds = sess.run('output_layer:0',
                     feed_dict={'input_layer:0': features,
                                'dropout_embedding:0': .0,
                                'dropout_hidden:0': .0})
    preds = pd.DataFrame(data=preds, columns=labels.columns,
                         index=labels.index)
    return util.eval(labels, preds)
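# Hypothetical end-to-end use of build_graph_sketch(), train() and test();
# the step count, batch size and dropout rates are placeholders chosen for
# illustration. Features and labels are assumed to be pandas DataFrames
# indexed by word, as in the rest of the code.
def run_once_sketch(train_features, train_labels, test_features, test_labels):
    tf.reset_default_graph()
    build_graph_sketch(n_features=train_features.shape[1],
                       n_targets=train_labels.shape[1])
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        train(sess, train_features, train_labels,
              training_steps=10000, batch_size=128,
              dropout_hidden=.5, dropout_embedding=.2,
              report_at=1000)
        return test(sess, test_features, test_labels)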
def eval(self, gold_lex):
    return util.eval(gold_lex, self.predict(gold_lex.index))
def main(results_path='results', metric='r'):
    RESULTS = results_path + '/'
    if not os.path.exists(RESULTS):
        os.makedirs(RESULTS)
    ### settings
    for setting in SETTINGS:
        print('Now processing {}'.format(setting.name))
        ### check if this setting has already been processed
        if os.path.isdir(RESULTS + setting.name):
            print('\t{} has already been processed!'.format(setting.name))
        else:
            labels = setting.load_data()
            embs = setting.load_embeddings()
            models = {
                'turney': turney.Bootstrapper(embs),
                'densifier': densifier.Densifier(embs),
                'my_model_relu': my_model_relu,
                'my_model_sigmoid': my_model_sigmoid,
                'aicyber': aicyber.mlp_ensemble(),
                'li_regressor': li_regressor(),
                'linear_model': li_regressor(
                    init_fun=sklearn.linear_model.LinearRegression)
            }
            results_setting = {key: pd.DataFrame(columns=labels.columns)
                               for key in list(models)}
            ### Crossvalidation
            k = 0
            for train_index, test_index in KFold(n_splits=10, shuffle=True).\
                    split(labels):
                k += 1
                train = labels.iloc[train_index]
                test = labels.iloc[test_index]
                print(k)
                train_features = util.feature_extraction(train.index, embs)
                test_features = util.feature_extraction(test.index, embs)
                ### methods
                for model_name in list(models):
                    model = models[model_name]
                    print(model_name)
                    ### case distinction because the models do not share the
                    ### same interface
                    tf.reset_default_graph()
                    preds = None
                    if model_name in ['aicyber', 'li_regressor',
                                      'linear_model']:
                        model.fit(train_features.copy(), train.copy())
                        preds = model.predict(test_features.copy())
                    elif model_name in ['my_model_relu', 'my_model_sigmoid']:
                        session = model.fit(train_features.copy(),
                                            train.copy())
                        preds = model.predict(test_features.copy(), session,
                                              var_names=train.columns)
                        del session
                    else:
                        model.fit(train.copy())
                        preds = model.predict(test.index.copy())
                    print(test)
                    print(preds)
                    perf = util.eval(test, preds, metric)
                    print(perf)
                    results_setting[model_name].loc[k] = perf
                    print(results_setting[model_name])
            os.makedirs(RESULTS + setting.name)
            ### after cv, average each model's results data frame and save it
            for model_name in list(models):
                curr_results = results_setting[model_name]
                curr_results = util.average_results_df(curr_results)
                fname = '{}{}/{}.tsv'.format(RESULTS, setting.name, model_name)
                util.save_tsv(curr_results, fname)
            print('\tFinished processing {}'.format(setting.name))
            ### delete respective setting to free up memory
            del setting
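# util.eval() is used throughout but not shown in this excerpt. Below is a
# guess at a minimal compatible implementation, assuming metric 'r' means
# per-column Pearson correlation and that one score per emotion dimension is
# returned; the actual util module may differ.
import scipy.stats


def eval_sketch(true, prediction, metric='r'):
    if metric != 'r':
        raise NotImplementedError(metric)
    return np.array([scipy.stats.pearsonr(true[col], prediction[col])[0]
                     for col in true.columns])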
### crossvalidation; the fold-loop header was cut off in this excerpt and is
### restored here following the same KFold pattern as in main() above
for train_index, test_index in KFold(n_splits=10, shuffle=True).\
        split(labels):
    k += 1
    train = labels.iloc[train_index]
    test = labels.iloc[test_index]
    print(k)
    for config in configs:
        print(config)
        threshold = config[0]
        alpha = config[1]
        ds.fit(seed_lexicon=train,
               binarization_threshold=threshold,
               alpha=alpha)
        prediction = ds.predict(words=test.index)
        performance = util.eval(test, prediction)
        print(performance)
        results_config[str(config)].loc[k] = performance

### average the per-fold results of each config and save them together with a
### meta table mapping result files to their hyperparameters
meta_df = pd.DataFrame(columns=['threshold', 'alpha'])
for config in configs:
    results_df = results_config[str(config)]
    results_df = util.average_results_df(results_df)
    fname = 'results/{}.tsv'.format(str(config))
    util.save_tsv(results_df, fname)
    meta_df.loc[fname] = config
util.save_tsv(meta_df, 'results/meta.tsv')
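# The grid-search bookkeeping referenced above (configs, results_config, ds
# and k) is not part of this excerpt; it would precede the fold loop. One
# plausible construction (the threshold and alpha values are purely
# illustrative, and labels/embs are loaded as in main() above):
import itertools

thresholds = [0.0, 0.25, 0.5]
alphas = [0.1, 0.5, 0.9]
configs = list(itertools.product(thresholds, alphas))
results_config = {str(config): pd.DataFrame(columns=labels.columns)
                  for config in configs}
ds = densifier.Densifier(embs)  # project-local Densifier implementation
k = 0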
def eval(self, gold_lex):
    if self.induced_lexicon is None:
        raise ValueError(
            'Embeddings need to be transformed first! Run "fit"!')
    else:
        return util.eval(gold_lex, self.predict(gold_lex.index))