def multitask():
    """Train and evaluate the multitask model on the depression and
    bipolar-disorder datasets using GloVe-based vectorization."""
    x_train1, x_test1, y_train1, y_test1 = load_data.get_depression_data()
    x_train2, x_test2, y_train2, y_test2 = load_data.get_bipolar_disorder_data()

    # Trim the depression splits so both tasks see the same number of samples.
    x_train1 = x_train1[:len(x_train2)]
    x_test1 = x_test1[:len(x_test2)]
    y_train1 = y_train1[:len(y_train2)]
    y_test1 = y_test1[:len(y_test2)]

    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()

    # Vectorize every split with the shared GloVe embedding index.
    x_train1, x_test1, x_train2, x_test2 = (
        preprocessing.add_features_and_vectorize(split, vectorize_function,
                                                 embedding_index)
        for split in (x_train1, x_test1, x_train2, x_test2))

    model = multitask1.get_multitask_model(
        (x_train1.shape[1], x_train1.shape[2]))

    multitask1.run_multitask(x_train1, x_test1, y_train1, y_test1, x_train2,
                             x_test2, y_train2, y_test2, model)
# Example #2 (scraped separator line; score: 0)
def rf_with_glove():
    """Run the baseline (random-forest) classifier on the depression data,
    vectorized with 1-D GloVe features, and print its predictions."""
    x_train, x_test, y_train, y_test = load_data.get_depression_data()
    embedding_index = preprocessing.get_embeddings_index()
    vectorizer = preprocessing.vectorize_data_1d_glove

    x_train = preprocessing.add_features_and_vectorize(x_train, vectorizer,
                                                       embedding_index)
    x_test = preprocessing.add_features_and_vectorize(x_test, vectorizer,
                                                      embedding_index)

    print(baseline(x_train, x_test, y_train, y_test))
# Example #3 (scraped separator line; score: 0)
def lstm_with_embedding_layer():
    """Train an LSTM whose embedding layer is built from a tokenizer
    fitted on the depression training split."""
    x_train, x_test, y_train, y_test = load_data.get_depression_data()
    y_train_one_hot = preprocessing.class_one_hot(y_train)

    embedding_matrix, word_index, tokenizer = \
        preprocessing.get_embedding_matrix(x_train)

    # Both splits must use the tokenizer fitted on x_train.
    x_train = preprocessing.vectorize_with_tokenizer(x_train, tokenizer)
    x_test = preprocessing.vectorize_with_tokenizer(x_test, tokenizer)

    model = get_lstm_model(True, word_index, embedding_matrix)
    run(x_train, x_test, y_train_one_hot, y_test, model)
def multitask2():
    """Train the dual-embedding multitask model on the depression and
    bipolar-disorder datasets, tokenized with a shared tokenizer."""
    x_train1, x_test1, y_train1, y_test1 = load_data.get_depression_data()
    x_train2, x_test2, y_train2, y_test2 = load_data.get_bipolar_disorder_data()

    # Align dataset sizes: trim the depression splits to the bipolar sizes.
    x_train1 = x_train1[:len(x_train2)]
    x_test1 = x_test1[:len(x_test2)]
    y_train1 = y_train1[:len(y_train2)]
    y_test1 = y_test1[:len(y_test2)]

    embedding_matrix, word_index, tokenizer = \
        preprocessing.get_embedding_matrix(x_train1)

    # The tokenizer fitted on x_train1 is reused for every split.
    x_train1 = preprocessing.vectorize_with_tokenizer(x_train1, tokenizer)
    x_train2 = preprocessing.vectorize_with_tokenizer(x_train2, tokenizer)
    x_test1 = preprocessing.vectorize_with_tokenizer(x_test1, tokenizer)
    x_test2 = preprocessing.vectorize_with_tokenizer(x_test2, tokenizer)

    model = multitask1.get_multitask_model_2_embeddings(
        (x_train1.shape[1], ), word_index, embedding_matrix)

    multitask1.run_multitask(x_train1, x_test1, y_train1, y_test1, x_train2,
                             x_test2, y_train2, y_test2, model)
def multitask_memory_efficient():
    """Train the multitask model without holding the full training set in RAM.

    Training data is loaded, vectorized and dumped to per-batch .npy files;
    only the file names are kept, and run_multitask streams them from disk
    via fit_generator=True.
    """
    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()

    data_per_iteration = BATCH_SIZE
    num_of_batches = TRAIN_SET_SIZE // data_per_iteration
    num_of_train_batches = 0
    x_train1_filenames = []
    y_train1_filenames = []
    x_train2_filenames = []
    y_train2_filenames = []
    for i in range(num_of_batches):
        # Depression data: sliding window of data_per_iteration rows.
        x_train1, y_train1 = load_data.get_depression_data(
            start_index=i * data_per_iteration,
            end_index=(i + 1) * data_per_iteration,
            test_size=0)
        # Bipolar data: half-sized start index plus skiprows bounds.
        # NOTE(review): the // 2 offsets and the 10**7 skiprows cap appear to
        # position a window inside a very large CSV — confirm against
        # load_data.get_bipolar_disorder_data before changing.
        x_train2, y_train2 = load_data.get_bipolar_disorder_data(
            start_index=i * data_per_iteration // 2,
            skiprows_start=(i + 1) * data_per_iteration // 2,
            skiprows_end=(i + 1) * data_per_iteration // 2 + 10**7,
            nrows=data_per_iteration,
            test_size=0)

        x_train1 = preprocessing.add_features_and_vectorize(
            x_train1, vectorize_function, embedding_index)
        x_train2 = preprocessing.add_features_and_vectorize(
            x_train2, vectorize_function, embedding_index)

        # Keep the two tasks' batches the same length.
        x_train1 = x_train1[:len(x_train2)]
        y_train1 = y_train1[:len(y_train2)]

        # Persist each vectorized batch; only file names stay in memory.
        np.save("x_train1_" + str(i) + ".npy", x_train1)
        y_train_one_hot1 = preprocessing.class_one_hot(y_train1, 2)
        np.save("y_train1_" + str(i) + ".npy", y_train_one_hot1)

        np.save("x_train2_" + str(i) + ".npy", x_train2)
        y_train_one_hot2 = preprocessing.class_one_hot(y_train2, 2)
        np.save("y_train2_" + str(i) + ".npy", y_train_one_hot2)

        x_train1_filenames.append("x_train1_" + str(i) + ".npy")
        y_train1_filenames.append("y_train1_" + str(i) + ".npy")
        x_train2_filenames.append("x_train2_" + str(i) + ".npy")
        y_train2_filenames.append("y_train2_" + str(i) + ".npy")
        num_of_train_batches += len(x_train1) // BATCH_SIZE

    # Test splits are small enough to keep in memory.
    x_test1, y_test1 = load_data.get_depression_data(start_index=0,
                                                     end_index=0,
                                                     test_size=500)

    # NOTE(review): the +250 offsets presumably push the bipolar test window
    # past the last training rows — magic constants, verify against loader.
    x_test2, y_test2 = load_data.get_bipolar_disorder_data(
        start_index=num_of_batches * data_per_iteration // 2,
        skiprows_start=(num_of_batches + 1) * data_per_iteration // 2 + 250,
        skiprows_end=(num_of_batches + 1) * data_per_iteration // 2 + 10**7 +
        250,
        nrows=data_per_iteration,
        test_size=1)

    x_test1 = preprocessing.add_features_and_vectorize(x_test1,
                                                       vectorize_function,
                                                       embedding_index)
    x_test2 = preprocessing.add_features_and_vectorize(x_test2,
                                                       vectorize_function,
                                                       embedding_index)
    x_test1 = x_test1[:len(x_test2)]
    y_test1 = y_test1[:len(y_test2)]

    model = multitask1.get_multitask_model(
        (x_test1.shape[1], x_test1.shape[2]))
    # Train splits are passed as file-name lists; fit_generator=True tells
    # run_multitask to stream batches from disk.
    multitask1.run_multitask(x_train1_filenames,
                             x_test1,
                             y_train1_filenames,
                             y_test1,
                             x_train2_filenames,
                             x_test2,
                             y_train2_filenames,
                             y_test2,
                             model,
                             fit_generator=True,
                             steps_per_epoch=num_of_train_batches)
# Example #6 (scraped separator line; score: 0)
def ff():
    """Train the feed-forward model on TF-IDF vectors of the depression data."""
    x_train, x_test, y_train, y_test = load_data.get_depression_data()
    x_train, x_test = preprocessing.vectorize_data_tfidf(x_train, x_test)
    ff_model = get_ff_model((x_train.shape[1], ))
    run(x_train, x_test, y_train, y_test, ff_model)