# Imports assumed by this excerpt; the original module header is not shown.
# BATCH_SIZE, TRAIN_SET_SIZE, EMBEDDING_DIM, get_lstm_model and run are
# expected to be defined elsewhere in the module.
from random import randint, random
from time import time

import numpy as np
from keras.layers import Activation, Bidirectional, Dense, LSTM
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn import metrics
from sklearn.model_selection import train_test_split

import load_data
import load_smhd_datasets
import multitask1
import preprocessing


def lstm_with_embedding_layer():
    """Train an LSTM with a Keras embedding layer built from a pretrained embedding matrix."""
    x_train, x_test, y_train, y_test = load_data.get_depression_data()
    y_train_one_hot = preprocessing.class_one_hot(y_train)
    embedding_matrix, word_index, tokenizer = preprocessing.get_embedding_matrix(
        x_train)
    x_train = preprocessing.vectorize_with_tokenizer(x_train, tokenizer)
    x_test = preprocessing.vectorize_with_tokenizer(x_test, tokenizer)
    model = get_lstm_model(True, word_index, embedding_matrix)
    run(x_train, x_test, y_train_one_hot, y_test, model)

def lstm():
    """Train a single-task LSTM on GloVe-vectorized SMHD data."""
    # Alternative data sources, kept for reference:
    # x_train, x_test, y_train, y_test = load_data.get_depression_data()
    # x_train, x_test, y_train, y_test = load_data.get_bipolar_disorder_data()
    # x_train, y_train = load_data.get_rsdd_data(set_="train")
    # x_test, y_test = load_data.get_rsdd_data(end_index=5, set_="validation")
    x_train, y_train = load_data.get_smhd_data(set_="train")
    x_test, y_test = load_data.get_smhd_data(end_index=5, set_="validation")
    y_train_one_hot = preprocessing.class_one_hot(y_train)
    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()
    print(x_train[0])  # debug: inspect one raw sample before vectorization
    x_train = preprocessing.add_features_and_vectorize(x_train,
                                                       vectorize_function,
                                                       embedding_index)
    x_test = preprocessing.add_features_and_vectorize(x_test,
                                                      vectorize_function,
                                                      embedding_index)
    model = get_lstm_model(use_embedding_layer=False,
                           input_shape=(x_train.shape[1], x_train.shape[2]))
    run(x_train, x_test, y_train_one_hot, y_test, model)

def save_data(vectorize_function, embedding_index, data_per_iteration=2,
              num_of_load_iterations=2):
    """Vectorize SMHD training data in chunks and persist each chunk as .npy files."""
    x_train_filenames = []
    y_train_filenames = []
    num_of_train_batches = 0
    for i in range(num_of_load_iterations):
        start = time()
        # x_train, y_train = load_data.get_rsdd_data(start_index=i * data_per_iteration,
        #                                            end_index=(i + 1) * data_per_iteration, set_="train")
        x0, x1, _, _, _ = load_smhd_datasets.get_smhd_data_user_level(
            start_index=i * data_per_iteration,
            end_index=(i + 1) * data_per_iteration)
        t1 = time()
        print("load:", t1 - start)
        x_train, y_train = load_smhd_datasets.prepare_binary_data(x0, x1)
        t2 = time()
        print("prepare:", t2 - t1)
        x_train = preprocessing.add_features_and_vectorize(
            x_train, vectorize_function, embedding_index)
        t3 = time()
        print("vectorize:", t3 - t2)
        y_train_one_hot = preprocessing.class_one_hot(y_train, 2)
        # print(x_train.shape)
        # print(y_train_one_hot.shape)
        np.save("x_train" + str(i) + ".npy", x_train)
        np.save("y_train" + str(i) + ".npy", y_train_one_hot)
        x_train_filenames.append("x_train" + str(i) + ".npy")
        y_train_filenames.append("y_train" + str(i) + ".npy")
        num_of_train_batches += len(x_train) // BATCH_SIZE
        end = time()
        print("save:", end - t3)
    # Persist the batch count so later runs can skip the expensive pass above.
    with open("num_of_train_batches.txt", "w") as f:
        f.write(str(num_of_train_batches))
    return x_train_filenames, y_train_filenames, num_of_train_batches

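# Illustrative usage of save_data (a sketch, not part of the original module):
# the GloVe embedding index is expensive to build, so it is constructed once
# and passed in. The sizes below are placeholder values.
#
#     vectorize_function = preprocessing.vectorize_data_glove
#     embedding_index = preprocessing.get_embeddings_index()
#     save_data(vectorize_function, embedding_index,
#               data_per_iteration=64, num_of_load_iterations=10)
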
def multitask_memory_efficient():
    """Jointly train on depression and bipolar data, persisting each vectorized training chunk to disk."""
    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()
    data_per_iteration = BATCH_SIZE
    num_of_batches = TRAIN_SET_SIZE // data_per_iteration
    num_of_train_batches = 0
    x_train1_filenames = []
    y_train1_filenames = []
    x_train2_filenames = []
    y_train2_filenames = []
    for i in range(num_of_batches):
        x_train1, y_train1 = load_data.get_depression_data(
            start_index=i * data_per_iteration,
            end_index=(i + 1) * data_per_iteration, test_size=0)
        x_train2, y_train2 = load_data.get_bipolar_disorder_data(
            start_index=i * data_per_iteration // 2,
            skiprows_start=(i + 1) * data_per_iteration // 2,
            skiprows_end=(i + 1) * data_per_iteration // 2 + 10**7,
            nrows=data_per_iteration, test_size=0)
        x_train1 = preprocessing.add_features_and_vectorize(
            x_train1, vectorize_function, embedding_index)
        x_train2 = preprocessing.add_features_and_vectorize(
            x_train2, vectorize_function, embedding_index)
        # Truncate task 1 so both tasks see the same number of examples.
        x_train1 = x_train1[:len(x_train2)]
        y_train1 = y_train1[:len(y_train2)]
        np.save("x_train1_" + str(i) + ".npy", x_train1)
        y_train_one_hot1 = preprocessing.class_one_hot(y_train1, 2)
        np.save("y_train1_" + str(i) + ".npy", y_train_one_hot1)
        np.save("x_train2_" + str(i) + ".npy", x_train2)
        y_train_one_hot2 = preprocessing.class_one_hot(y_train2, 2)
        np.save("y_train2_" + str(i) + ".npy", y_train_one_hot2)
        x_train1_filenames.append("x_train1_" + str(i) + ".npy")
        y_train1_filenames.append("y_train1_" + str(i) + ".npy")
        x_train2_filenames.append("x_train2_" + str(i) + ".npy")
        y_train2_filenames.append("y_train2_" + str(i) + ".npy")
        num_of_train_batches += len(x_train1) // BATCH_SIZE
    x_test1, y_test1 = load_data.get_depression_data(start_index=0,
                                                     end_index=0,
                                                     test_size=500)
    x_test2, y_test2 = load_data.get_bipolar_disorder_data(
        start_index=num_of_batches * data_per_iteration // 2,
        skiprows_start=(num_of_batches + 1) * data_per_iteration // 2 + 250,
        skiprows_end=(num_of_batches + 1) * data_per_iteration // 2 + 10**7 + 250,
        nrows=data_per_iteration, test_size=1)
    x_test1 = preprocessing.add_features_and_vectorize(x_test1,
                                                       vectorize_function,
                                                       embedding_index)
    x_test2 = preprocessing.add_features_and_vectorize(x_test2,
                                                       vectorize_function,
                                                       embedding_index)
    x_test1 = x_test1[:len(x_test2)]
    y_test1 = y_test1[:len(y_test2)]
    model = multitask1.get_multitask_model((x_test1.shape[1], x_test1.shape[2]))
    multitask1.run_multitask(x_train1_filenames, x_test1, y_train1_filenames,
                             y_test1, x_train2_filenames, x_test2,
                             y_train2_filenames, y_test2, model,
                             fit_generator=True,
                             steps_per_epoch=num_of_train_batches)

def multitask_smhd_memory_efficient(reload_data=True, data_per_iteration=2,
                                    num_of_load_iterations=2,
                                    num_of_train_batches=None,
                                    user_level=True):
    """Multitask training on SMHD with on-disk caching of vectorized chunks.

    When reload_data is False, previously saved .npy files are reused and
    num_of_train_batches may be an int or the path of the file storing it.
    """
    x_train1_filenames = []
    y_train1_filenames = []
    x_train2_filenames = []
    y_train2_filenames = []
    if reload_data:
        vectorize_function = preprocessing.vectorize_data_glove
        embedding_index = preprocessing.get_embeddings_index()
        num_of_train_batches = 0
        for i in range(num_of_load_iterations):
            if user_level:
                x0, x1, x2, x3, _ = load_smhd_datasets.get_smhd_data_user_level(
                    start_index=i * data_per_iteration,
                    end_index=(i + 1) * data_per_iteration)
            else:
                x0, x1, x2, x3, _ = load_smhd_datasets.get_smhd_data(
                    start_index=i * data_per_iteration,
                    end_index=(i + 1) * data_per_iteration)
            # Split the control users (x0) into thirds, one third per task.
            x_train1, y_train1 = load_smhd_datasets.prepare_binary_data(
                x0[:len(x0) // 3], x1)
            x_train2, y_train2 = load_smhd_datasets.prepare_binary_data(
                x0[len(x0) // 3:2 * len(x0) // 3], x2)
            x_train3, y_train3 = load_smhd_datasets.prepare_binary_data(
                x0[2 * len(x0) // 3:], x3)
            x_train1 = preprocessing.add_features_and_vectorize(
                x_train1, vectorize_function, embedding_index)
            x_train2 = preprocessing.add_features_and_vectorize(
                x_train2, vectorize_function, embedding_index)
            x_train3 = preprocessing.add_features_and_vectorize(
                x_train3, vectorize_function, embedding_index)
            np.save("x_train1_" + str(i) + ".npy", x_train1)
            y_train_one_hot1 = preprocessing.class_one_hot(y_train1, 2)
            np.save("y_train1_" + str(i) + ".npy", y_train_one_hot1)
            np.save("x_train2_" + str(i) + ".npy", x_train2)
            y_train_one_hot2 = preprocessing.class_one_hot(y_train2, 2)
            np.save("y_train2_" + str(i) + ".npy", y_train_one_hot2)
            # The third task is saved for later experiments; the two-task
            # model below does not consume it.
            np.save("x_train3_" + str(i) + ".npy", x_train3)
            y_train_one_hot3 = preprocessing.class_one_hot(y_train3, 2)
            np.save("y_train3_" + str(i) + ".npy", y_train_one_hot3)
            x_train1_filenames.append("x_train1_" + str(i) + ".npy")
            y_train1_filenames.append("y_train1_" + str(i) + ".npy")
            x_train2_filenames.append("x_train2_" + str(i) + ".npy")
            y_train2_filenames.append("y_train2_" + str(i) + ".npy")
            num_of_train_batches += len(x_train1) // BATCH_SIZE
        with open("num_of_train_batches.txt", "w") as f:
            f.write(str(num_of_train_batches))
        if user_level:
            x0, x1, x2, _, _ = load_smhd_datasets.get_smhd_data_user_level(
                set_='validation')
        else:
            x0, x1, x2, _, _ = load_smhd_datasets.get_smhd_data(
                set_='validation')
        # Split the validation controls into halves, one half per task.
        x_test1, y_test1 = load_smhd_datasets.prepare_binary_data(
            x0[:len(x0) // 2], x1)
        x_test2, y_test2 = load_smhd_datasets.prepare_binary_data(
            x0[len(x0) // 2:], x2)
        x_test1 = preprocessing.add_features_and_vectorize(
            x_test1, vectorize_function, embedding_index)
        x_test2 = preprocessing.add_features_and_vectorize(
            x_test2, vectorize_function, embedding_index)
        # Truncate both test sets to a common length.
        x_test1 = x_test1[:len(x_test2)]
        y_test1 = y_test1[:len(y_test2)]
        x_test2 = x_test2[:len(x_test1)]
        y_test2 = y_test2[:len(y_test1)]
        print(len(y_test1))
        print(len(y_test2))
        np.save("x_test1.npy", x_test1)
        np.save("y_test1.npy", y_test1)
        np.save("x_test2.npy", x_test2)
        np.save("y_test2.npy", y_test2)
    else:
        for i in range(num_of_load_iterations):
            x_train1_filenames.append("x_train1_" + str(i) + ".npy")
            y_train1_filenames.append("y_train1_" + str(i) + ".npy")
            x_train2_filenames.append("x_train2_" + str(i) + ".npy")
            y_train2_filenames.append("y_train2_" + str(i) + ".npy")
        if isinstance(num_of_train_batches, str):
            with open(num_of_train_batches, "r") as f:
                num_of_train_batches = int(f.read())
        x_test1 = np.load("x_test1.npy")
        y_test1 = np.load("y_test1.npy")
        x_test2 = np.load("x_test2.npy")
        y_test2 = np.load("y_test2.npy")
    model = multitask1.get_multitask_model((x_test1.shape[1], x_test1.shape[2]))
    acc1, f11, acc2, f12 = multitask1.run_multitask(
        x_train1_filenames, x_test1, y_train1_filenames, y_test1,
        x_train2_filenames, x_test2, y_train2_filenames, y_test2, model,
        fit_generator=True, steps_per_epoch=num_of_train_batches)
    return acc1, f11, acc2, f12

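# Illustrative call sequence (a sketch; sizes are placeholders): run once with
# reload_data=True to materialise the .npy shards and the batch count, then
# rerun cheaply from the cached files.
#
#     multitask_smhd_memory_efficient(reload_data=True,
#                                     data_per_iteration=64,
#                                     num_of_load_iterations=10)
#     acc1, f11, acc2, f12 = multitask_smhd_memory_efficient(
#         reload_data=False, num_of_load_iterations=10,
#         num_of_train_batches="num_of_train_batches.txt")
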
def lstm_memory_efficient():
    """Train the LSTM from fixed-size .npy batch files instead of holding the whole set in memory."""
    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()
    data_per_iteration = 5  # BATCH_SIZE * 10
    count = 0
    i = 0
    x_train_filenames = []
    y_train_filenames = []
    while count < TRAIN_SET_SIZE // BATCH_SIZE:
        # Alternative data sources, kept for reference:
        # x_train, y_train = load_data.get_depression_data(start_index=i * data_per_iteration,
        #                                                  end_index=(i + 1) * data_per_iteration, test_size=0)
        # x_train, y_train = load_data.get_bipolar_disorder_data(start_index=i * data_per_iteration // 2,
        #                                                        skiprows_start=(i + 1) * data_per_iteration // 2,
        #                                                        skiprows_end=(i + 1) * data_per_iteration // 2 + 10**7,
        #                                                        nrows=data_per_iteration, test_size=0)
        x_train, y_train = load_data.get_rsdd_data(
            start_index=i * data_per_iteration,
            end_index=(i + 1) * data_per_iteration, set_="train")
        i += 1  # advance to the next chunk after loading, so chunk 0 is used
        x_train = preprocessing.add_features_and_vectorize(
            x_train, vectorize_function, embedding_index)
        y_train_one_hot = preprocessing.class_one_hot(y_train, 2)
        print(x_train.shape)
        print(y_train_one_hot.shape)
        # Slice the vectorized chunk into BATCH_SIZE-sized shard files.
        for j in range(len(x_train) // BATCH_SIZE):
            np.save("x_train" + str(count) + ".npy",
                    x_train[j * BATCH_SIZE:(j + 1) * BATCH_SIZE])
            np.save("y_train" + str(count) + ".npy",
                    y_train_one_hot[j * BATCH_SIZE:(j + 1) * BATCH_SIZE])
            x_train_filenames.append("x_train" + str(count) + ".npy")
            y_train_filenames.append("y_train" + str(count) + ".npy")
            count += 1
    # x_test, y_test = load_data.get_bipolar_disorder_data(start_index=num_of_batches * data_per_iteration // 2,
    #                                                      skiprows_start=(num_of_batches + 1) * data_per_iteration // 2,
    #                                                      skiprows_end=(num_of_batches + 1) * data_per_iteration // 2 + 10**7,
    #                                                      nrows=data_per_iteration, test_size=1)
    # x_test, y_test = load_data.get_depression_data(start_index=0, end_index=0, test_size=500)
    x_test, y_test = load_data.get_rsdd_data(end_index=5, set_="validation")
    x_test = preprocessing.add_features_and_vectorize(x_test,
                                                      vectorize_function,
                                                      embedding_index)
    np.save("x_test.npy", x_test)
    np.save("y_test.npy", y_test)
    x_test = np.load("x_test.npy")
    y_test = np.load("y_test.npy")
    model = get_lstm_model(use_embedding_layer=False,
                           input_shape=(x_test.shape[1], x_test.shape[2]))
    return run(x_train_filenames, x_test, y_train_filenames, y_test, model,
               True, steps_per_epoch=20)  # True: train via fit_generator

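# The functions above hand run()/run_multitask() lists of .npy filenames when
# training with a generator. Below is a minimal sketch of a generator
# compatible with that contract; this is an assumption about how the training
# loop consumes the filename lists (the project's actual generator is defined
# elsewhere), included here to document the expected shard format.
def npy_batch_generator(x_filenames, y_filenames):
    # Cycle endlessly over the saved (x, y) shards, one batch per file,
    # as Keras fit_generator expects.
    while True:
        for x_file, y_file in zip(x_filenames, y_filenames):
            yield np.load(x_file), np.load(y_file)
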
def lstm_model_hyperparameters_random_search(x, y, n_iterations=100):
    """Random search over LSTM hyperparameters; returns the model with the
    best average of accuracy and macro F1 on a held-out validation split."""
    x_train, x_validation, y_train, y_validation = train_test_split(
        x, y, test_size=0.2)
    y_train = preprocessing.class_one_hot(y_train)
    best_acc = 0
    best_f1 = 0
    best_avg = 0
    best_model = None

    def print_trial():
        # Log the scores and hyperparameters of a trial that improved a
        # best-so-far metric.
        print('acc_score: ', acc_score)
        print('f1_score: ', f1_score)
        print('avg_score: ', avg_score)
        print('broj_neurona: ', broj_neurona_u_sloju)  # units per LSTM layer
        print('lr: ', lr)
        print('nb_epoch: ', nb_epoch)
        print('recurrent_dropout: ', recurrent_dropout)
        print('bidirectional: ', bidirectional)
        print('dropout_rate: ', dropout_rate)
        print('activation: ', activation)
        print()

    for _ in range(n_iterations):
        if random() < 0.5:
            nb_epoch = randint(5, 50)
        else:
            # bias toward fewer epochs so a single trial does not take too long
            nb_epoch = randint(5, 30)
        if random() < 0.5:
            lr = random() * 10**(randint(0, 3) - 4)
        else:
            lr = random() * 10**-3
        r = random()
        if r < 0.2:
            broj_neurona_u_sloju = randint(100, 500)
        elif r < 0.4:
            broj_neurona_u_sloju = 2**randint(2, 10)
        else:
            broj_neurona_u_sloju = EMBEDDING_DIM
        r = random()
        if r < 0.3:
            dropout_rate = r
        elif r < 0.5:
            dropout_rate = random() / 10
        elif r < 0.7:
            dropout_rate = 0.2
        else:
            dropout_rate = 0
        r = random()
        if r < 0.2:
            recurrent_dropout = r
        elif r < 0.4:
            recurrent_dropout = random() / 10
        elif r < 0.6:
            recurrent_dropout = 0.2
        elif r < 0.8:
            recurrent_dropout = 0
        else:
            recurrent_dropout = dropout_rate
        bidirectional = random() < 0.5
        r = random()
        if r < 0.3:
            activation = "sigmoid"
        elif r < 0.6:
            activation = "relu"
        else:
            activation = None
        model = Sequential()
        if bidirectional:
            model.add(
                Bidirectional(LSTM(units=broj_neurona_u_sloju,
                                   dropout=dropout_rate,
                                   recurrent_dropout=recurrent_dropout),
                              input_shape=(x_train.shape[1], x_train.shape[2])))
        else:
            model.add(
                LSTM(units=broj_neurona_u_sloju,
                     dropout=dropout_rate,
                     recurrent_dropout=recurrent_dropout,
                     input_shape=(x_train.shape[1], x_train.shape[2])))
        if activation == "sigmoid":
            model.add(Activation('sigmoid'))
        elif activation == "relu":
            model.add(Activation('relu'))
        model.add(Dense(2))
        model.add(Activation('softmax'))
        adam = Adam(lr=lr)
        model.compile(loss='categorical_crossentropy', optimizer=adam,
                      metrics=['accuracy'])
        model.fit(x_train, y_train, epochs=nb_epoch, batch_size=100, verbose=0)
        pred = model.predict_classes(x_validation, 100)
        acc_score = metrics.accuracy_score(y_validation, pred)
        f1_score = metrics.f1_score(y_validation, pred, average='macro')
        avg_score = (acc_score + f1_score) / 2
        if avg_score > best_avg:
            print_trial()
            best_avg = avg_score
            best_model = model
            best_acc = max(best_acc, acc_score)
            best_f1 = max(best_f1, f1_score)
        elif acc_score > best_acc:
            print_trial()
            best_acc = acc_score
            best_f1 = max(best_f1, f1_score)
        if f1_score > best_f1:
            print_trial()
            best_f1 = f1_score
    return best_model

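# Minimal entry point (illustrative; the original excerpt does not include one).
if __name__ == "__main__":
    lstm()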