def multitask():
    """Train the multitask model on depression + bipolar-disorder data.

    Both tasks are vectorized with pretrained GloVe embeddings; the
    depression split is truncated so both tasks contribute the same
    number of samples.
    """
    dep_x_train, dep_x_test, dep_y_train, dep_y_test = load_data.get_depression_data()
    bip_x_train, bip_x_test, bip_y_train, bip_y_test = load_data.get_bipolar_disorder_data()

    # Align dataset sizes: clip the depression split to the bipolar split.
    dep_x_train = dep_x_train[:len(bip_x_train)]
    dep_x_test = dep_x_test[:len(bip_x_test)]
    dep_y_train = dep_y_train[:len(bip_y_train)]
    dep_y_test = dep_y_test[:len(bip_y_test)]

    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()

    dep_x_train = preprocessing.add_features_and_vectorize(
        dep_x_train, vectorize_function, embedding_index)
    dep_x_test = preprocessing.add_features_and_vectorize(
        dep_x_test, vectorize_function, embedding_index)
    bip_x_train = preprocessing.add_features_and_vectorize(
        bip_x_train, vectorize_function, embedding_index)
    bip_x_test = preprocessing.add_features_and_vectorize(
        bip_x_test, vectorize_function, embedding_index)

    # Model input shape comes from the vectorized training tensor.
    model = multitask1.get_multitask_model(
        (dep_x_train.shape[1], dep_x_train.shape[2]))
    multitask1.run_multitask(dep_x_train, dep_x_test, dep_y_train, dep_y_test,
                             bip_x_train, bip_x_test, bip_y_train, bip_y_test,
                             model)
def rf_with_glove():
    """Run the baseline classifier on depression data with 1-D GloVe features."""
    x_train, x_test, y_train, y_test = load_data.get_depression_data()
    embeddings = preprocessing.get_embeddings_index()
    vectorizer = preprocessing.vectorize_data_1d_glove

    features_train = preprocessing.add_features_and_vectorize(
        x_train, vectorizer, embeddings)
    features_test = preprocessing.add_features_and_vectorize(
        x_test, vectorizer, embeddings)

    # Report the baseline's predictions on stdout.
    print(baseline(features_train, features_test, y_train, y_test))
def lstm_with_embedding_layer():
    """Train the LSTM whose first layer is an embedding seeded from a matrix.

    Text is converted to integer sequences with a fitted tokenizer; the
    embedding matrix + word index are handed to the model builder.
    """
    x_train, x_test, y_train, y_test = load_data.get_depression_data()
    labels_one_hot = preprocessing.class_one_hot(y_train)

    embedding_matrix, word_index, tokenizer = preprocessing.get_embedding_matrix(x_train)
    train_sequences = preprocessing.vectorize_with_tokenizer(x_train, tokenizer)
    test_sequences = preprocessing.vectorize_with_tokenizer(x_test, tokenizer)

    # True => use an embedding layer inside the model.
    model = get_lstm_model(True, word_index, embedding_matrix)
    run(train_sequences, test_sequences, labels_one_hot, y_test, model)
def multitask2():
    """Train the two-embedding multitask model on depression + bipolar data.

    Unlike :func:`multitask`, inputs are tokenizer integer sequences and the
    embedding matrix is injected into the model itself.
    """
    dep_x_train, dep_x_test, dep_y_train, dep_y_test = load_data.get_depression_data()
    bip_x_train, bip_x_test, bip_y_train, bip_y_test = load_data.get_bipolar_disorder_data()

    # Clip the depression split so both tasks have equal sample counts.
    dep_x_train = dep_x_train[:len(bip_x_train)]
    dep_x_test = dep_x_test[:len(bip_x_test)]
    dep_y_train = dep_y_train[:len(bip_y_train)]
    dep_y_test = dep_y_test[:len(bip_y_test)]

    # Tokenizer/matrix are fitted on the depression training text only.
    embedding_matrix, word_index, tokenizer = preprocessing.get_embedding_matrix(dep_x_train)
    dep_x_train, bip_x_train, dep_x_test, bip_x_test = (
        preprocessing.vectorize_with_tokenizer(split, tokenizer)
        for split in (dep_x_train, bip_x_train, dep_x_test, bip_x_test)
    )

    model = multitask1.get_multitask_model_2_embeddings(
        (dep_x_train.shape[1], ), word_index, embedding_matrix)
    multitask1.run_multitask(dep_x_train, dep_x_test, dep_y_train, dep_y_test,
                             bip_x_train, bip_x_test, bip_y_train, bip_y_test,
                             model)
def multitask_memory_efficient():
    """Train the multitask model without holding the full training set in RAM.

    The training data is loaded in BATCH_SIZE-sized slices; each slice is
    vectorized, one-hot encoded, and persisted to ``.npy`` files whose names
    are handed to ``run_multitask`` for generator-based (streaming) training.
    Only the (small) test split is kept in memory to size the model.

    Fix vs. original: each filename is built exactly once (f-string) and
    reused for both ``np.save`` and the filename list, so the saved file and
    the recorded name can never drift apart.
    """
    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()
    data_per_iteration = BATCH_SIZE
    num_of_batches = TRAIN_SET_SIZE // data_per_iteration
    num_of_train_batches = 0  # total generator steps per epoch, summed below
    x_train1_filenames = []
    y_train1_filenames = []
    x_train2_filenames = []
    y_train2_filenames = []
    for i in range(num_of_batches):
        # test_size=0 => everything returned is training data.
        x_train1, y_train1 = load_data.get_depression_data(
            start_index=i * data_per_iteration,
            end_index=(i + 1) * data_per_iteration,
            test_size=0)
        # NOTE(review): the halved indices and the 10**7 skiprows window are
        # dataset-layout specific — presumably the bipolar CSV interleaves two
        # row groups; confirm against load_data.get_bipolar_disorder_data.
        x_train2, y_train2 = load_data.get_bipolar_disorder_data(
            start_index=i * data_per_iteration // 2,
            skiprows_start=(i + 1) * data_per_iteration // 2,
            skiprows_end=(i + 1) * data_per_iteration // 2 + 10**7,
            nrows=data_per_iteration,
            test_size=0)
        x_train1 = preprocessing.add_features_and_vectorize(
            x_train1, vectorize_function, embedding_index)
        x_train2 = preprocessing.add_features_and_vectorize(
            x_train2, vectorize_function, embedding_index)
        # Keep the two tasks the same length within this slice.
        x_train1 = x_train1[:len(x_train2)]
        y_train1 = y_train1[:len(y_train2)]

        # Build each filename once; save and record the same string.
        x1_name = f"x_train1_{i}.npy"
        y1_name = f"y_train1_{i}.npy"
        x2_name = f"x_train2_{i}.npy"
        y2_name = f"y_train2_{i}.npy"
        np.save(x1_name, x_train1)
        np.save(y1_name, preprocessing.class_one_hot(y_train1, 2))
        np.save(x2_name, x_train2)
        np.save(y2_name, preprocessing.class_one_hot(y_train2, 2))
        x_train1_filenames.append(x1_name)
        y_train1_filenames.append(y1_name)
        x_train2_filenames.append(x2_name)
        y_train2_filenames.append(y2_name)
        num_of_train_batches += len(x_train1) // BATCH_SIZE

    # Test split stays in memory; it also defines the model's input shape.
    x_test1, y_test1 = load_data.get_depression_data(start_index=0,
                                                     end_index=0,
                                                     test_size=500)
    x_test2, y_test2 = load_data.get_bipolar_disorder_data(
        start_index=num_of_batches * data_per_iteration // 2,
        skiprows_start=(num_of_batches + 1) * data_per_iteration // 2 + 250,
        skiprows_end=(num_of_batches + 1) * data_per_iteration // 2 + 10**7 + 250,
        nrows=data_per_iteration,
        test_size=1)
    x_test1 = preprocessing.add_features_and_vectorize(
        x_test1, vectorize_function, embedding_index)
    x_test2 = preprocessing.add_features_and_vectorize(
        x_test2, vectorize_function, embedding_index)
    x_test1 = x_test1[:len(x_test2)]
    y_test1 = y_test1[:len(y_test2)]

    model = multitask1.get_multitask_model(
        (x_test1.shape[1], x_test1.shape[2]))
    multitask1.run_multitask(x_train1_filenames, x_test1,
                             y_train1_filenames, y_test1,
                             x_train2_filenames, x_test2,
                             y_train2_filenames, y_test2,
                             model,
                             fit_generator=True,
                             steps_per_epoch=num_of_train_batches)
def ff():
    """Train the feed-forward model on TF-IDF features of the depression data."""
    x_train, x_test, y_train, y_test = load_data.get_depression_data()
    # TF-IDF vectorizer is fitted on train and applied to both splits at once.
    x_train, x_test = preprocessing.vectorize_data_tfidf(x_train, x_test)
    model = get_ff_model((x_train.shape[1], ))
    run(x_train, x_test, y_train, y_test, model)