def objective_function_veracity_branchLSTM_fullPHEME(params):
    """Hyperparameter-search objective evaluated on the PHEME branch data.

    Trains ``heteroscedastic_model`` on seven event folders under
    ``saved_data_fullPHEME`` and scores it on the held-out
    ``charliehebdo`` folder.

    Args:
        params: Hyperparameter dict. This function reads ``'mb_size'`` and
            ``'num_epochs'``; the whole dict is also forwarded to
            ``heteroscedastic_model`` and echoed back in the result.

    Returns:
        dict with keys ``'loss'`` (``1 - macro F1`` computed on the labels
        returned by ``branch2treelabels``), ``'Params'`` (the input
        ``params``) and ``'status'`` (``STATUS_OK``).
    """
    path = 'saved_data_fullPHEME'
    train = [
        'ebola-essien',
        'ferguson',
        'gurlitt',
        'ottawashooting',
        'prince-toronto',
        'putinmissing',
        'sydneysiege',
    ]
    test = 'charliehebdo'
    max_branch_len = 25
    # Single source of truth for the label-space size: the one-hot width
    # and the model's output width must agree.
    num_classes = 3

    x_train = []
    y_train = []
    for t in train:
        temp_x_train = np.load(os.path.join(path, t, 'train_array.npy'))
        temp_y_train = np.load(os.path.join(path, t, 'labels.npy'))
        # Pad/truncate every event to the same number of time steps so the
        # per-event arrays can be stacked into one training array.
        temp_x_train = pad_sequences(temp_x_train,
                                     maxlen=max_branch_len,
                                     dtype='float32',
                                     padding='post',
                                     truncating='post',
                                     value=0.)
        x_train.extend(temp_x_train)
        y_train.extend(temp_y_train)
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    # NOTE(review): x_test is not padded here, unlike x_train; presumably
    # the saved arrays already have max_branch_len steps -- confirm.
    x_test = np.load(os.path.join(path, test, 'train_array.npy'))
    y_test = np.load(os.path.join(path, test, 'labels.npy'))
    ids_test = np.load(os.path.join(path, test, 'ids.npy'))

    # Fix the one-hot width explicitly instead of inferring it from the
    # largest label present, which breaks if a class is absent from train.
    y_train = to_categorical(y_train, num_classes=num_classes)

    model = heteroscedastic_model(x_train, y_train, params,
                                  output_classes=num_classes)
    mb_size = params['mb_size']
    num_epochs = params['num_epochs']
    # The same targets are supplied for both model outputs.
    model.fit(x_train, [y_train, y_train],
              batch_size=mb_size,
              epochs=num_epochs,
              shuffle=False,
              class_weight=None)

    predictions_test = model.predict(x_test, batch_size=mb_size,
                                     verbose=False)
    # The second model output is used as the class-probability head.
    softmax_test = predictions_test[1]
    y_pred = np.argmax(softmax_test, axis=1)

    _, tree_prediction, tree_label = branch2treelabels(
        ids_test, y_test, y_pred)
    mactest_F = f1_score(tree_label, tree_prediction, average='macro')

    # The search minimises 'loss', so report 1 - F1.
    output = {'loss': 1 - mactest_F, 'Params': params, 'status': STATUS_OK}
    return output
def objective_function_branchLSTM_Twitter15(params):
    """Hyperparameter-search objective evaluated on fold 0 of Twitter15.

    Trains ``heteroscedastic_model`` on ``preprocessing/saved_data_15/0/train``
    and scores it on ``preprocessing/saved_data_15/0/test``.

    Args:
        params: Hyperparameter dict. This function reads ``'mb_size'`` and
            ``'num_epochs'``; the whole dict is also forwarded to
            ``heteroscedastic_model`` and echoed back in the result.

    Returns:
        dict with keys ``'loss'`` (``1 - macro F1`` computed on the labels
        returned by ``branch2treelabels``), ``'Params'`` (the input
        ``params``) and ``'status'`` (``STATUS_OK``).
    """
    path = 'preprocessing/saved_data_15'
    # Fold 0 is the development set.
    train = '0/train'
    test = '0/test'
    max_branch_len = 25
    # Single source of truth for the label-space size: the one-hot width
    # and the model's output width must agree.
    num_classes = 4

    temp_x_train = np.load(os.path.join(path, train, 'train_array.npy'))
    y_train = np.load(os.path.join(path, train, 'labels.npy'))
    # Pad/truncate every sequence to max_branch_len time steps.
    x_train = pad_sequences(temp_x_train,
                            maxlen=max_branch_len,
                            dtype='float32',
                            padding='post',
                            truncating='post',
                            value=0.)

    # NOTE(review): x_test is not padded here, unlike x_train; presumably
    # the saved arrays already have max_branch_len steps -- confirm.
    x_test = np.load(os.path.join(path, test, 'train_array.npy'))
    y_test = np.load(os.path.join(path, test, 'labels.npy'))
    ids_test = np.load(os.path.join(path, test, 'ids.npy'))

    # Fix the one-hot width explicitly instead of inferring it from the
    # largest label present, which breaks if a class is absent from train.
    y_train = to_categorical(y_train, num_classes=num_classes)

    model = heteroscedastic_model(x_train, y_train, params,
                                  output_classes=num_classes)
    mb_size = params['mb_size']
    num_epochs = params['num_epochs']
    # The same targets are supplied for both model outputs.
    model.fit(x_train, [y_train, y_train],
              batch_size=mb_size,
              epochs=num_epochs,
              shuffle=False,
              class_weight=None)

    predictions_test = model.predict(x_test, batch_size=mb_size,
                                     verbose=False)
    # The second model output is used as the class-probability head.
    softmax_test = predictions_test[1]
    y_pred = np.argmax(softmax_test, axis=1)

    _, tree_prediction, tree_label = branch2treelabels(
        ids_test, y_test, y_pred)
    mactest_F = f1_score(tree_label, tree_prediction, average='macro')

    # The search minimises 'loss', so report 1 - F1.
    output = {'loss': 1 - mactest_F, 'Params': params, 'status': STATUS_OK}
    return output
def eval_veracity_LSTM_CV(params, dataset='15'):
    """Run 5-fold evaluation on ``preprocessing/saved_data_<dataset>``.

    For each fold ``'0'``..``'4'`` a fresh model is built and trained on
    ``<fold>/train``, predictions are produced for train and test data, and
    the model, its architecture/weights, the predictions and the test
    evaluation info are written under ``output/``.

    NOTE(review): another function with this exact name is defined later
    in this file; at import time that later definition rebinds the name,
    so this version is only reachable if the later one is renamed.

    Args:
        params: Hyperparameter dict. This function reads ``'num_epochs'``
            and ``'mb_size'``; the whole dict is also forwarded to
            ``heteroscedastic_model`` and ``models.predict_on_data``.
        dataset: Suffix appended to ``'preprocessing/saved_data_'`` to
            locate the data directory.

    Returns:
        None. Results are written to files in ``output/``.
    """
    # One constant for the label-space size. The original passed 3 to
    # predict_on_data while encoding labels and building the model with 4.
    num_classes = 4

    path = 'preprocessing/saved_data_' + dataset
    folds = ['0', '1', '2', '3', '4']
    num_epochs = params['num_epochs']
    mb_size = params['mb_size']

    for f in folds:
        print(f)
        test = f + '/test'
        train = f + '/train'

        x_test = np.load(os.path.join(path, test, 'train_array.npy'))
        y_test = np.load(os.path.join(path, test, 'labels.npy'))
        ids_test = np.load(os.path.join(path, test, 'ids.npy'))

        predictions_train = []
        x_train = np.load(os.path.join(path, train, 'train_array.npy'))
        y_train = np.load(os.path.join(path, train, 'labels.npy'))
        y_train = to_categorical(y_train, num_classes=num_classes)
        ids_train = np.load(os.path.join(path, train, 'ids.npy'))

        model = heteroscedastic_model(x_train, y_train, params,
                                      output_classes=num_classes)
        # The same targets are supplied for both model outputs.
        model.fit(x_train, [y_train, y_train],
                  batch_size=mb_size,
                  epochs=num_epochs,
                  shuffle=False,
                  class_weight=None)

        predictions = models.predict_on_data(model, params,
                                             x_train, y_train,
                                             x_test, y_test,
                                             num_classes=num_classes,
                                             verbose=True)
        tree_results_train = branch2tree(ids_train, predictions['train'])
        predictions['train']['tree_results'] = tree_results_train
        predictions_train.append(predictions['train'])

        # Persist the full model, then architecture and weights separately.
        model.save('output/model' + f + '.h5')
        json_string = model.to_json()
        with open('output/my_model_architecture' + f + '.h5', 'w') as fout:
            json.dump(json_string, fout)
        model.save_weights('output/my_model_weights' + f + '.h5')

        tree_results_test = branch2tree(ids_test, predictions['test'])
        predictions['test']['tree_results'] = tree_results_test
        # Kept from the original ("I need to improve this"): the train
        # tree results are replaced by the list of per-set train
        # predictions, which contains predictions['train'] itself.
        predictions['train']['tree_results'] = predictions_train

        # Context managers close the files even if pickling raises.
        with open('output/predictions' + f + '.pkl', 'wb') as fout:
            pickle.dump(predictions, fout)

        eval_info = {'test': eval_branches(tree_results_test)}
        with open('output/eval_info' + f + '.pkl', 'wb') as fout:
            pickle.dump(eval_info, fout)
def eval_veracity_LSTM_CV(params, branch=True):
    """Run leave-one-event-out evaluation on the PHEME data.

    Each of the nine event folders is held out in turn. A fresh model is
    built for the fold and then fitted on the remaining events one after
    another; after each fit, predictions are produced for that training
    event and for the held-out event. The model, its architecture/weights,
    the predictions and the test evaluation info are written under
    ``output/``, with the held-out event name in each file name.

    NOTE(review): an earlier function with this exact name exists in this
    file; this definition replaces it at import time.

    Args:
        params: Hyperparameter dict. This function reads ``'num_epochs'``
            and ``'mb_size'``; the whole dict is also forwarded to
            ``heteroscedastic_model`` and ``models.predict_on_data``.
        branch: If true, data is read from ``saved_data_fullPHEME``;
            otherwise from ``saved_data_timelinefullPHEME``.

    Returns:
        None. Results are written to files in ``output/``.
    """
    if branch:
        path = 'saved_data_fullPHEME'
    else:
        path = 'saved_data_timelinefullPHEME'

    folds = [
        'ebola-essien',
        'ferguson',
        'gurlitt',
        'ottawashooting',
        'prince-toronto',
        'putinmissing',
        'sydneysiege',
        'charliehebdo',
        'germanwings-crash',
    ]
    num_epochs = params['num_epochs']
    mb_size = params['mb_size']

    for number, test in enumerate(folds):
        # A new model is built per fold from the 'ebola-essien' arrays,
        # presumably only to size the network -- TODO confirm.
        x_temp = np.load(os.path.join(path, 'ebola-essien', 'train_array.npy'))
        y_temp = np.load(os.path.join(path, 'ebola-essien', 'labels.npy'))
        model = heteroscedastic_model(x_temp, y_temp, params,
                                      output_classes=3)
        print(number)

        # Every event except the held-out one, in the original order.
        train = folds[:number] + folds[number + 1:]

        x_test = np.load(os.path.join(path, test, 'train_array.npy'))
        y_test = np.load(os.path.join(path, test, 'labels.npy'))
        ids_test = np.load(os.path.join(path, test, 'ids.npy'))

        predictions_train = []
        for t in train:
            x_train = np.load(os.path.join(path, t, 'train_array.npy'))
            y_train = np.load(os.path.join(path, t, 'labels.npy'))
            y_train = to_categorical(y_train, num_classes=3)
            ids_train = np.load(os.path.join(path, t, 'ids.npy'))

            # The same model keeps training across events; the same
            # targets are supplied for both model outputs.
            model.fit(x_train, [y_train, y_train],
                      batch_size=mb_size,
                      epochs=num_epochs,
                      shuffle=False,
                      class_weight=None)
            predictions = models.predict_on_data(model, params,
                                                 x_train, y_train,
                                                 x_test, y_test,
                                                 num_classes=3,
                                                 verbose=True)
            tree_results_train = branch2tree(ids_train, predictions['train'])
            predictions['train']['tree_results'] = tree_results_train
            predictions_train.append(predictions['train'])

        # Persist the full model, then architecture and weights separately.
        model.save('output/model' + test + '.h5')
        json_string = model.to_json()
        with open('output/my_model_architecture' + test + '.h5', 'w') as f:
            json.dump(json_string, f)
        model.save_weights('output/my_model_weights' + test + '.h5')

        # 'predictions' here is the result from the last training event.
        tree_results_test = branch2tree(ids_test, predictions['test'])
        predictions['test']['tree_results'] = tree_results_test
        # Kept from the original ("I need to improve this"): the train
        # tree results are replaced by the list of per-event train
        # predictions, whose last entry is predictions['train'] itself.
        predictions['train']['tree_results'] = predictions_train

        # Context managers close the files even if pickling raises.
        with open('output/predictions' + test + '.pkl', 'wb') as f:
            pickle.dump(predictions, f)

        eval_info = {'test': eval_branches(tree_results_test)}
        with open('output/eval_info' + test + '.pkl', 'wb') as f:
            pickle.dump(eval_info, f)