def train_LSTM(X, Y, model, train_split=0.8, epochs=10, batch_size=32):
    """Fit ``model`` on ``(X, Y)`` and then run it on the clinical data set.

    The clinical samples are loaded inside this function through
    ``get_input(sample_type=4, ...)``.  The training path is chosen by the
    local ``which_model`` switch, which is currently hard-coded to 2:

    * ``which_model == 2``: pass ``(X, Y)`` to ``custom_fit``, then call
      ``custom_fit`` again on the clinical data with ``train_split=1``.
    * ``which_model == 1``: split with ``split_data``, call ``model.fit``
      using the held-out part as validation data, then evaluate on the
      clinical data truncated along axis 1 to ``X_train.shape[1]`` and print
      the value returned by ``helper.windiff_metric_NUMPY``.

    Args:
        X: Input samples, forwarded to ``custom_fit`` / ``split_data``.
        Y: Targets matching ``X``.
        model: Object exposing ``fit``, ``evaluate`` and ``predict``
            (presumably a Keras model -- confirm against the caller).
        train_split: Value forwarded as ``train_split`` to the helpers.
        epochs: Number of training epochs.
        batch_size: Batch size; only used on the ``which_model == 1`` path.

    Returns:
        None.
    """
    # Clinical
    SAMPLE_TYPE_cli, X_cli, Y_cli = get_input(
        sample_type=4, shuffle_documents=False, pad=False)

    # Single-argument print(...) is used below because it produces the same
    # output under the Python 2 print statement and the Python 3 function.
    which_model = 2
    if which_model == 2:
        custom_fit(X, Y, train_split=train_split, model=model, epochs=epochs)
        print("Clinical Data")
        custom_fit(X_cli, Y_cli, train_split=1, model=model)  # Test clinical

    # Fixed: this condition used to test the misspelled name ``which_modle``,
    # which raises NameError as soon as the first branch is not taken.
    elif which_model == 1:
        # Works for TYPE2 but check for others
        # Both these lines work for which_model == 1
        X_train, Y_train, X_test, Y_test = split_data(
            X, Y, train_split=train_split)
        model.fit(X_train, Y_train, shuffle=False, nb_epoch=epochs,
                  batch_size=batch_size, validation_data=(X_test, Y_test))

        # Wikipedia
        #model.evaluate(X_test, Y_test, batch_size=batch_size)
        #pred = model.predict(X_test)
        #rounded = np.round(pred)
        #result = helper.windiff_metric_NUMPY(Y_test, rounded)
        #print result

        # Clinical
        # Temporary TRUNCATION: cut the clinical data along axis 1 so it
        # matches the length the model was fitted on.
        TRUNCATE_LEN = X_train.shape[1]
        print("NOTE: Truncating the Test dataset(clinical) from %d sentences to %d sentences." % (
            X_cli.shape[1], TRUNCATE_LEN))
        X_cli, Y_cli = X_cli[:, :TRUNCATE_LEN, :], Y_cli[:, :TRUNCATE_LEN, :]
        model.evaluate(X_cli, Y_cli, batch_size=batch_size)
        pred = model.predict(X_cli)
        rounded = np.round(pred)
        result = helper.windiff_metric_NUMPY(
            Y_cli, rounded, win_size=10, rounded=True)
        print(result)

        # NOTE(review): debugging breakpoint left in the code.  It is kept in
        # this branch because it follows the evaluation whose locals (pred,
        # rounded, result) it would inspect -- confirm the intended scope.
        pdb.set_trace()
# _, result = helper.windiff_metric_NUMPY(Y_cli, rounded, win_size=10, rounded=True) # print result #pdb.set_trace() #rounded = [round(x) for x in pred] if __name__ == "__main__": # Print parameters print "=== SCALE_LOSS_FUN: %d, ONE_SIDE_CONTEXT_SIZE: %d ===" % ( int(SCALE_LOSS_FUN), ONE_SIDE_CONTEXT_SIZE) print "NOTE: Make sure you have MIN_SENTENCES_IN_DOCUMENT >= 2*context_size + 1" # For which_model == 2 SAMPLE_TYPE_wiki, X_wiki, Y_wiki, trained_sample_handler = get_input( sample_type=2, shuffle_documents=True, pad=False) NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X_wiki.shape[ 0], -1, X_wiki[0].shape[1] #MAX_SEQUENCE_LENGTH is is already padded # For which_model == 2 # Biography data for training #SAMPLE_TYPE_bio, X_bio, Y_bio, trained_sample_handler = get_input(sample_type=5, shuffle_documents=False, pad=False, trained_sent2vec_model=trained_sample_handler) #NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X_bio.shape[0], -1, X_bio[0].shape[1] #MAX_SEQUENCE_LENGTH is is already padded # Clinical, Fiction, Wikipedia - Only for testing SAMPLE_TYPE_cli, X_cli, Y_cli, trained_sample_handler = get_input( sample_type=4, shuffle_documents=False, pad=False, trained_sent2vec_model=trained_sample_handler) SAMPLE_TYPE_fic, X_fic, Y_fic, trained_sample_handler = get_input(
# _, result = helper.windiff_metric_NUMPY(Y_cli, rounded, win_size=10, rounded=True) # print result pdb.set_trace() #rounded = [round(x) for x in pred] if __name__ == "__main__": # Print parameters print "=== SCALE_LOSS_FUN: %d, ONE_SIDE_CONTEXT_SIZE: %d ===" % ( int(SCALE_LOSS_FUN), ONE_SIDE_CONTEXT_SIZE) print "NOTE: Make sure you have MIN_SENTENCES_IN_DOCUMENT >= 2*context_size + 1" # For which_model == 2 SAMPLE_TYPE_wiki, X_wiki, Y_wiki, trained_sample_handler = get_input( sample_type=2, shuffle_documents=True, pad=False) NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X_wiki.shape[ 0], -1, X_wiki[0].shape[1] #MAX_SEQUENCE_LENGTH is is already padded print "X_wiki[0].shape: ", X_wiki[0].shape # For which_model == 2 # Biography data for training #SAMPLE_TYPE_bio, X_bio, Y_bio, trained_sample_handler = get_input(sample_type=5, shuffle_documents=False, pad=False, trained_sent2vec_model=trained_sample_handler) #NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X_bio.shape[0], -1, X_bio[0].shape[1] #MAX_SEQUENCE_LENGTH is is already padded # Clinical - Only for testing #SAMPLE_TYPE_cli, X_cli, Y_cli, trained_sample_handler = get_input(sample_type=4, shuffle_documents=False, pad=False, trained_sent2vec_model=trained_sample_handler) # Fiction - Only for testing #SAMPLE_TYPE_fic, X_fic, Y_fic, trained_sample_handler = get_input(sample_type=6, shuffle_documents=False, pad=False, trained_sent2vec_model=trained_sample_handler) dictionary_object = trained_sample_handler.dictionary
pred = model.predict(X_cli) rounded = np.round(pred) result = helper.windiff_metric_NUMPY(Y_cli, rounded, win_size=10, rounded=True) print result pdb.set_trace() #rounded = [round(x) for x in pred] if __name__ == "__main__": # For which_model == 2 SAMPLE_TYPE_wiki, X_wiki, Y_wiki, trained_sample_handler = get_input( sample_type=2, shuffle_documents=True, pad=True) NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X_wiki.shape[ 0], -1, X_wiki[0].shape[1] #MAX_SEQUENCE_LENGTH is is already padded # For which_model == 2 # Biography data for training SAMPLE_TYPE_bio, X_bio, Y_bio, trained_sample_handler = get_input( sample_type=5, shuffle_documents=False, pad=True, trained_sent2vec_model=trained_sample_handler) #NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X_bio.shape[0], -1, X_bio[0].shape[1] #MAX_SEQUENCE_LENGTH is is already padded # Clinical - Only for testing SAMPLE_TYPE_cli, X_cli, Y_cli, trained_sample_handler = get_input( sample_type=4,
# # # print "macro results are" # print "average precision is %f" %(p/10) # print "average recall is %f" %(r/10) # print "average f1 is %f" %(f1/10) # # print "micro results are" # print "average precision is %f" %(p1/10) # print "average recall is %f" %(r1/10) # print "average f1 is %f" %(f11/10) if __name__ == "__main__": # For which_model == 2 SAMPLE_TYPE, X, Y = get_input(sample_type=2, shuffle_documents=True, pad=False) NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X.shape[0], -1, X[ 0].shape[1] #MAX_SEQUENCE_LENGTH is is already padded # For which_model == 1 #SAMPLE_TYPE, X, Y = get_input(sample_type=2, shuffle_documents=True, pad=True) #NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X.shape[0], X.shape[1], X.shape[2] #MAX_SEQUENCE_LENGTH is is already padded #if SAMPLE_TYPE == 1: # Y = Y[:,-1].reshape((NO_OF_SAMPLES, 1)) # For LSTM #elif SAMPLE_TYPE == 2: # # because of TimeDistributed layer :/ # Y = Y.reshape((NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, 1)) #else: # print "INVALID SAMPLE TYPE!"
# NOTE(review): the two statements below close a function whose header is
# not visible here; their nesting level was reconstructed -- confirm.
    print helper.windiff_metric_NUMPY(Y_test, rounded)
    pdb.set_trace()
    #rounded = [round(x) for x in predictions] # round predictions
    #print(predictions)
    #pdb.set_trace()


def sample_data():
    """Load the Pima Indians diabetes CSV and split it into inputs and labels.

    Returns:
        A ``(X, Y)`` pair where ``X`` holds columns 0-7 of the loaded array
        and ``Y`` holds column 8.
    """
    # load pima indians dataset
    # NOTE(review): absolute, user-specific path; this only works on a
    # machine that has the file at exactly this location.
    dataset = np.loadtxt(
        "/home/pinkesh/DATASETS/PIMA_DATASET/pima-indians-diabetes.data",
        delimiter=",")
    X = dataset[:, 0:8]
    Y = dataset[:, 8]
    return X, Y


if __name__ == "__main__":
    # Script entry point: load the samples, split them into a leading
    # training part and a trailing test part, and hand both to run_neural_net.
    #X, Y = sample_data()
    SAMPLE_TYPE, X, Y = get_input(shuffle=False)

    # Split test-train data
    train_ratio = 0.8
    # The two counts printed here are the un-rounded products, not the sizes
    # of the slices actually passed below.
    print 'X(train)=', X.shape[0] * train_ratio
    print 'X(test)=', X.shape[0] * (1 - train_ratio)
    train_samples = int(train_ratio * X.shape[0])
    #pdb.set_trace()
    # NOTE(review): the slices use train_samples + 1, so the training part
    # holds one row more than int(train_ratio * X.shape[0]) -- confirm this
    # is intended.
    run_neural_net(X[:train_samples + 1, :], Y[:train_samples + 1],
                   X[train_samples + 1:, :], Y[train_samples + 1:])