Example #1
These examples come from a project whose modules (load_smhd_datasets, preprocessing, multitask1, numpy as np, and the helpers baseline, run, get_lstm_model, BATCH_SIZE) are assumed to be importable; imports are omitted on this page.
def main(train_set_size=1000):
    """Train and evaluate a TF-IDF baseline on SMHD user-level data."""
    # x_train, x_test, y_train, y_test = load_data.get_depression_data(start_index=0, end_index=train_set_size//2,
    #                                                                   test_size=500)
    # Training split.
    x0, x1, _, _, _ = load_smhd_datasets.get_smhd_data_user_level(start_index=0, end_index=train_set_size)
    x_train, y_train = load_smhd_datasets.prepare_binary_data(x0, x1)

    # Validation split, used here as the test set.
    x0, x1, _, _, _ = load_smhd_datasets.get_smhd_data_user_level(end_index=100, set_="validation")
    x_test, y_test = load_smhd_datasets.prepare_binary_data(x0, x1)

    vectorize_function = preprocessing.vectorize_data_tfidf
    vectorizer = preprocessing.get_tfidf_vectorizer(x_train)
    # x_train = preprocessing.add_features_and_vectorize(x_train, vectorize_function, vectorizer)
    # x_test = preprocessing.add_features_and_vectorize(x_test, vectorize_function, vectorizer)
    x_train = vectorize_function(x_train, vectorizer)
    x_test = vectorize_function(x_test, vectorizer)
    test_acc, f1_score = baseline(x_train, x_test, y_train, y_test)
    return test_acc, f1_score
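The preprocessing helpers used above are project code and are not shown on this page. As a rough sketch of what they likely do, assuming scikit-learn (the signatures here are inferred from how main calls them, not taken from the project):

# Hypothetical stand-ins for preprocessing.get_tfidf_vectorizer and
# preprocessing.vectorize_data_tfidf, sketched with scikit-learn.
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf_vectorizer(texts, max_features=10000):
    # Fit the TF-IDF vocabulary on the training texts only, so the test
    # split is transformed into the same feature space.
    vectorizer = TfidfVectorizer(max_features=max_features)
    vectorizer.fit(texts)
    return vectorizer

def vectorize_data_tfidf(texts, vectorizer):
    # Dense (n_samples, n_features) TF-IDF matrix.
    return vectorizer.transform(texts).toarray()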
Example #2
def multitask_smhd():
    x0, _, x2, x1, _ = load_smhd_datasets.get_smhd_data()
    x_train1, y_train1 = load_smhd_datasets.prepare_binary_data(
        x0[:len(x0) // 2], x1)
    x_train2, y_train2 = load_smhd_datasets.prepare_binary_data(
        x0[len(x0) // 2:], x2)
    x0, _, x2, x1, _ = load_smhd_datasets.get_smhd_data(set_='validation')
    x_test1, y_test1 = load_smhd_datasets.prepare_binary_data(
        x0[:len(x0) // 2], x1)
    x_test2, y_test2 = load_smhd_datasets.prepare_binary_data(
        x0[len(x0) // 2:], x2)
    # Truncate both tasks to the same number of examples so their batches align.
    x_train1 = x_train1[:len(x_train2)]
    x_test1 = x_test1[:len(x_test2)]
    y_train1 = y_train1[:len(y_train2)]
    y_test1 = y_test1[:len(y_test2)]

    x_train2 = x_train2[:len(x_train1)]
    x_test2 = x_test2[:len(x_test1)]
    y_train2 = y_train2[:len(y_train1)]
    y_test2 = y_test2[:len(y_test1)]

    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()

    x_train1 = preprocessing.add_features_and_vectorize(
        x_train1, vectorize_function, embedding_index)
    x_test1 = preprocessing.add_features_and_vectorize(x_test1,
                                                       vectorize_function,
                                                       embedding_index)
    x_train2 = preprocessing.add_features_and_vectorize(
        x_train2, vectorize_function, embedding_index)
    x_test2 = preprocessing.add_features_and_vectorize(x_test2,
                                                       vectorize_function,
                                                       embedding_index)

    model = multitask1.get_multitask_model(
        (x_test1.shape[1], x_test1.shape[2]))
    return multitask1.run_multitask(x_train1, x_test1, y_train1, y_test1,
                                    x_train2, x_test2, y_train2, y_test2,
                                    model)
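multitask1.get_multitask_model is defined elsewhere in the project. A minimal sketch consistent with the (timesteps, features) input shape and the length alignment above, assuming Keras hard parameter sharing (the layer sizes and losses are guesses, not the project's actual architecture):

# Hypothetical two-task model: one input per task, a shared LSTM trunk,
# and a binary softmax head per task.
from tensorflow.keras import layers, models

def get_multitask_model(input_shape):
    in1 = layers.Input(shape=input_shape, name="task1_input")
    in2 = layers.Input(shape=input_shape, name="task2_input")
    shared_lstm = layers.LSTM(128)  # weights shared across both tasks
    out1 = layers.Dense(2, activation="softmax", name="task1")(shared_lstm(in1))
    out2 = layers.Dense(2, activation="softmax", name="task2")(shared_lstm(in2))
    model = models.Model(inputs=[in1, in2], outputs=[out1, out2])
    model.compile(optimizer="adam",
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model

Under this reading, the truncation to equal lengths in multitask_smhd is what lets Keras fit a multi-input model, since all input arrays must contain the same number of samples.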
Example #3
def lstm_memory_efficient_simple(reload_data=True,
                                 data_per_iteration=2,
                                 num_of_load_iterations=2,
                                 num_of_train_batches=None):
    """Train an LSTM from .npy shards on disk instead of holding all data in memory."""
    if reload_data:
        vectorize_function = preprocessing.vectorize_data_glove
        embedding_index = preprocessing.get_embeddings_index()
        x_train_filenames, y_train_filenames, num_of_train_batches = save_data(
            vectorize_function, embedding_index, data_per_iteration,
            num_of_load_iterations)
        x0, x1, _, _, _ = load_smhd_datasets.get_smhd_data_user_level(
            end_index=100, set_="validation")
        x_test, y_test = load_smhd_datasets.prepare_binary_data(x0, x1)

        x_test = preprocessing.add_features_and_vectorize(
            x_test, vectorize_function, embedding_index)
        np.save("x_test.npy", x_test)
        np.save("y_test.npy", y_test)
    else:
        x_train_filenames = []
        y_train_filenames = []
        for i in range(num_of_load_iterations):
            x_train_filenames.append("x_train" + str(i) + ".npy")
            y_train_filenames.append("y_train" + str(i) + ".npy")
        if isinstance(num_of_train_batches, str):
            # The argument is a path to the count written by save_data;
            # parse it as an integer instead of eval-ing file contents.
            with open(num_of_train_batches) as f:
                num_of_train_batches = int(f.read().strip())

    x_test = np.load("x_test.npy")
    y_test = np.load("y_test.npy")

    model = get_lstm_model(use_embedding_layer=False,
                           input_shape=(x_test.shape[1], x_test.shape[2]))
    return run(x_train_filenames,
               x_test,
               y_train_filenames,
               y_test,
               model,
               fit_generator=True,
               epochs=5,
               steps_per_epoch=num_of_train_batches)
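run(..., fit_generator=True) is project code; it presumably streams the shards written by save_data (Example #4 below). A minimal sketch of such a generator, assuming a global BATCH_SIZE (the value below is illustrative):

import numpy as np

BATCH_SIZE = 32  # assumed; the project defines its own value

def npy_batch_generator(x_filenames, y_filenames, batch_size=BATCH_SIZE):
    # Cycle over the shards forever, as Keras expects from a training
    # generator, yielding full batches only. Dropping each shard's partial
    # tail batch matches the len(x_train) // BATCH_SIZE count in save_data.
    while True:
        for x_file, y_file in zip(x_filenames, y_filenames):
            x = np.load(x_file)
            y = np.load(y_file)
            for start in range(0, len(x) - batch_size + 1, batch_size):
                yield x[start:start + batch_size], y[start:start + batch_size]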
Example #4
def save_data(vectorize_function,
              embedding_index,
              data_per_iteration=2,
              num_of_load_iterations=2):
    """Vectorize the training data in chunks and save each chunk to .npy files."""
    x_train_filenames = []
    y_train_filenames = []

    num_of_train_batches = 0
    for i in range(num_of_load_iterations):
        start = time()
        # x_train, y_train = load_data.get_rsdd_data(start_index=i * data_per_iteration,
        #                                            end_index=(i + 1) * data_per_iteration, set_="train")
        x0, x1, _, _, _ = load_smhd_datasets.get_smhd_data_user_level(
            start_index=i * data_per_iteration,
            end_index=(i + 1) * data_per_iteration)
        t1 = time()
        print("load:", t1 - start)
        x_train, y_train = load_smhd_datasets.prepare_binary_data(x0, x1)
        t2 = time()
        print("prepare:", t2 - t1)
        x_train = preprocessing.add_features_and_vectorize(
            x_train, vectorize_function, embedding_index)
        t3 = time()
        print("vectorize:", t3 - t2)
        y_train_one_hot = preprocessing.class_one_hot(y_train, 2)
        # print(x_train.shape)
        # print(y_train_one_hot.shape)
        np.save("x_train" + str(i) + ".npy", x_train)
        np.save("y_train" + str(i) + ".npy", y_train_one_hot)
        x_train_filenames.append("x_train" + str(i) + ".npy")
        y_train_filenames.append("y_train" + str(i) + ".npy")
        num_of_train_batches += len(x_train) // BATCH_SIZE
        end = time()
        print("save:", end - t3)
    with open("num_of_train_batches.txt", "w") as f:
        f.write(str(num_of_train_batches))

    return x_train_filenames, y_train_filenames, num_of_train_batches
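A hypothetical invocation, showing how the returned batch count lines up with steps_per_epoch in the training functions above (the parameter values are illustrative):

vectorize_function = preprocessing.vectorize_data_glove
embedding_index = preprocessing.get_embeddings_index()
x_files, y_files, n_batches = save_data(vectorize_function,
                                        embedding_index,
                                        data_per_iteration=2,
                                        num_of_load_iterations=2)
# n_batches counts full batches only, so it can be passed straight
# to steps_per_epoch, as lstm_memory_efficient_simple does above.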
Example #5
def multitask_smhd_memory_efficient(reload_data=True,
                                    data_per_iteration=2,
                                    num_of_load_iterations=2,
                                    num_of_train_batches=None,
                                    user_level=True):
    x_train1_filenames = []
    y_train1_filenames = []
    x_train2_filenames = []
    y_train2_filenames = []
    if reload_data:
        vectorize_function = preprocessing.vectorize_data_glove
        embedding_index = preprocessing.get_embeddings_index()

        num_of_train_batches = 0
        for i in range(num_of_load_iterations):
            if user_level:
                x0, x1, x2, x3, _ = load_smhd_datasets.get_smhd_data_user_level(
                    start_index=i * data_per_iteration,
                    end_index=(i + 1) * data_per_iteration)
            else:
                x0, x1, x2, x3, _ = load_smhd_datasets.get_smhd_data(
                    start_index=i * data_per_iteration,
                    end_index=(i + 1) * data_per_iteration)

            x_train1, y_train1 = load_smhd_datasets.prepare_binary_data(
                x0[:len(x0) // 3], x1)
            x_train2, y_train2 = load_smhd_datasets.prepare_binary_data(
                x0[len(x0) // 3:2 * len(x0) // 3], x2)
            x_train3, y_train3 = load_smhd_datasets.prepare_binary_data(
                x0[2 * len(x0) // 3:], x3)

            x_train1 = preprocessing.add_features_and_vectorize(
                x_train1, vectorize_function, embedding_index)
            x_train2 = preprocessing.add_features_and_vectorize(
                x_train2, vectorize_function, embedding_index)
            x_train3 = preprocessing.add_features_and_vectorize(
                x_train3, vectorize_function, embedding_index)

            np.save("x_train1_" + str(i) + ".npy", x_train1)
            y_train_one_hot1 = preprocessing.class_one_hot(y_train1, 2)
            np.save("y_train1_" + str(i) + ".npy", y_train_one_hot1)

            np.save("x_train2_" + str(i) + ".npy", x_train2)
            y_train_one_hot2 = preprocessing.class_one_hot(y_train2, 2)
            np.save("y_train2_" + str(i) + ".npy", y_train_one_hot2)

            np.save("x_train3_" + str(i) + ".npy", x_train3)
            y_train_one_hot3 = preprocessing.class_one_hot(y_train3, 2)
            np.save("y_train3_" + str(i) + ".npy", y_train_one_hot3)

            x_train1_filenames.append("x_train1_" + str(i) + ".npy")
            y_train1_filenames.append("y_train1_" + str(i) + ".npy")
            x_train2_filenames.append("x_train2_" + str(i) + ".npy")
            y_train2_filenames.append("y_train2_" + str(i) + ".npy")
            # Note: the task-3 shards are saved above but never collected;
            # only tasks 1 and 2 are passed to run_multitask below.
            num_of_train_batches += len(x_train1) // BATCH_SIZE

        with open("num_of_train_batches.txt", "w") as f:
            f.write(str(num_of_train_batches))
        if user_level:
            x0, x1, x2, _, _ = load_smhd_datasets.get_smhd_data_user_level(
                set_='validation')
        else:
            x0, x1, x2, _, _ = load_smhd_datasets.get_smhd_data(
                set_='validation')

        x_test1, y_test1 = load_smhd_datasets.prepare_binary_data(
            x0[:len(x0) // 2], x1)
        x_test2, y_test2 = load_smhd_datasets.prepare_binary_data(
            x0[len(x0) // 2:], x2)

        x_test1 = preprocessing.add_features_and_vectorize(
            x_test1, vectorize_function, embedding_index)
        x_test2 = preprocessing.add_features_and_vectorize(
            x_test2, vectorize_function, embedding_index)
        # Truncate both tasks to the same number of validation examples.
        x_test1 = x_test1[:len(x_test2)]
        y_test1 = y_test1[:len(y_test2)]
        x_test2 = x_test2[:len(x_test1)]
        y_test2 = y_test2[:len(y_test1)]
        print("test set size per task:", len(y_test1), len(y_test2))
        np.save("x_test1.npy", x_test1)
        np.save("y_test1.npy", y_test1)
        np.save("x_test2.npy", x_test2)
        np.save("y_test2.npy", y_test2)

    else:
        for i in range(num_of_load_iterations):
            x_train1_filenames.append("x_train1_" + str(i) + ".npy")
            y_train1_filenames.append("y_train1_" + str(i) + ".npy")
            x_train2_filenames.append("x_train2_" + str(i) + ".npy")
            y_train2_filenames.append("y_train2_" + str(i) + ".npy")
        if isinstance(num_of_train_batches, str):
            # Parse the saved batch count as an integer rather than eval-ing it.
            with open(num_of_train_batches) as f:
                num_of_train_batches = int(f.read().strip())

    x_test1 = np.load("x_test1.npy")
    y_test1 = np.load("y_test1.npy")
    x_test2 = np.load("x_test2.npy")
    y_test2 = np.load("y_test2.npy")

    model = multitask1.get_multitask_model(
        (x_test1.shape[1], x_test1.shape[2]))
    acc1, f11, acc2, f12 = multitask1.run_multitask(
        x_train1_filenames,
        x_test1,
        y_train1_filenames,
        y_test1,
        x_train2_filenames,
        x_test2,
        y_train2_filenames,
        y_test2,
        model,
        fit_generator=True,
        steps_per_epoch=num_of_train_batches)

    return acc1, f11, acc2, f12
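The fit_generator=True path of multitask1.run_multitask is not shown on this page. Under the two-input, two-output model sketched after Example #2, a generator pairing the two tasks' shards might look like this (a hypothetical helper, not the project's actual code):

import numpy as np

def multitask_batch_generator(x1_files, y1_files, x2_files, y2_files,
                              batch_size=32):
    # Yield ([task1_inputs, task2_inputs], [task1_labels, task2_labels])
    # tuples for a two-input, two-output model; assumes both tasks' shards
    # were truncated to matching lengths, as the code above arranges.
    while True:
        for x1f, y1f, x2f, y2f in zip(x1_files, y1_files, x2_files, y2_files):
            x1, y1 = np.load(x1f), np.load(y1f)
            x2, y2 = np.load(x2f), np.load(y2f)
            n = min(len(x1), len(x2))
            for s in range(0, n - batch_size + 1, batch_size):
                b = slice(s, s + batch_size)
                yield [x1[b], x2[b]], [y1[b], y2[b]]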