Example #1
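# Imports this snippet relies on (the exact Keras import paths are assumptions;
# `utils` is the project's own helper module and is not shown here)
import os

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

import utils
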
def prepare_data(shuffle=False, labels_to_categorical=True):
    path = os.getcwd()[:os.getcwd().rfind("/")]
    to_write_filename = path + "/stats/data_prep_for_lstm_visualization.txt"
    utils.initialize_writer(to_write_filename)

    train_filename = "train.txt"
    test_filename = "test.txt"
    tokens_filename = "clean_original_"  # other token variants to experiment with are available in /res/tokens/
    data_path = path + "/res/tokens/tokens_"

    # Load the data
    train_data = utils.load_file(data_path + tokens_filename + train_filename)
    test_data = utils.load_file(data_path + tokens_filename + test_filename)

    if shuffle:
        train_data = utils.shuffle_words(train_data)
        test_data = utils.shuffle_words(test_data)
        print("DATA IS SHUFFLED")

    # Load the labels
    train_labels = [
        int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" +
                                        train_filename)
    ]
    test_labels = [
        int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" +
                                        test_filename)
    ]

    # Get the max length of the train tweets
    max_tweet_length = utils.get_max_len_info(train_data)

    # Convert all tweets into sequences of word indices
    tokenizer, train_indices, test_indices = utils.encode_text_as_word_indexes(
        train_data, test_data, lower=True)
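    # The tokenizer's word indices start at 1, so the +1 below leaves index 0 free for padding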
    vocab_size = len(tokenizer.word_counts) + 1
    word_to_index = tokenizer.word_index
    print("There are %s unique tokens." % len(word_to_index))

    # Pad sequences with 0s (padding can be "pre" or "post"; "post" works better here)
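    # e.g. with maxlen=5, the sequence [4, 8, 15] becomes [4, 8, 15, 0, 0]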
    x_train = pad_sequences(train_indices,
                            maxlen=max_tweet_length,
                            padding="post",
                            truncating="post",
                            value=0.)
    x_test = pad_sequences(test_indices,
                           maxlen=max_tweet_length,
                           padding="post",
                           truncating="post",
                           value=0.)

    # Convert the labels to one-hot categorical vectors, or keep them as a plain numpy array
    if labels_to_categorical:
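        # e.g. integer labels [0, 1, 1] become one-hot rows [[1, 0], [0, 1], [0, 1]]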
        train_labels = to_categorical(np.asarray(train_labels))
        test_labels = to_categorical(np.asarray(test_labels))
    else:
        train_labels = np.array(train_labels)
        test_labels = np.array(test_labels)
    return x_train, train_labels, x_test, test_labels, vocab_size, tokenizer, max_tweet_length
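
Below is a minimal sketch of how the values returned by prepare_data could feed a Keras
LSTM classifier; the layer sizes, optimizer, and training settings are illustrative
assumptions, not the project's actual configuration.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

x_train, y_train, x_test, y_test, vocab_size, tokenizer, max_len = prepare_data()

# Illustrative architecture; the dimensions here are assumptions
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    LSTM(64),
    Dense(y_train.shape[1], activation="softmax"),
])
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=5, batch_size=32)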
Example #2
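# Note: the indented block below comes from inside a loop over DNN model names in the
# original script. A plausible surrounding structure (the model list and the training
# helper are assumptions, not the project's actual code) would be:
#
#     for dnn_model in ["lstm", "bi_lstm"]:                    # hypothetical model list
#         start = time.time()
#         model = train_model(dnn_model, x_train, y_train)     # hypothetical helper
#         predict(model, x_test, y_test)
#         ...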
        predict(model, x_test, y_test)
        end = time.time()
        print(
            "==================================================================\n"
        )
        print("%s model analysis completion time: %.3f s = %.3f min" %
              (dnn_model, (end - start), (end - start) / 60.0))
        print(
            "==================================================================\n"
        )


if __name__ == "__main__":
    path = os.getcwd()[:os.getcwd().rfind('/')]
    to_write_filename = path + '/stats/dnn_models_analysis.txt'
    utils.initialize_writer(to_write_filename)

    # Load the train and test sets for the selected dataset
    dataset = "ghosh"
    train_data, _, train_labels, test_data, _, test_labels = data_proc.get_dataset(
        dataset)

    # Alternatively, to run other experiments on Ghosh's dataset,
    # load a different set of tokens (grammatical, strict, filtered, etc.) and train on those
    """
    train_filename = "train_sample.txt"
    test_filename = "test_sample.txt"
    
    train_data = utils.load_file(path + "/res/tokens/tokens_clean_original_" + train_filename)
    test_data = utils.load_file(path + "/res/tokens/tokens_clean_original_" + test_filename)