Example #1
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import data_helpers
# `config`, `categories`, and `sequence_length` are module-level globals
# in the surrounding project.


def load_sentences(load_sentences_from, input_dir):
    # The input file must be named 'sentences.txt' or 'sentences.csv'
    if load_sentences_from == 'manually':
        # Just write them below
        sentences = ['I am a boy.', 'I am a boy', 'I am a girl']
    elif load_sentences_from == 'txt':
        # Load from txt
        with open(input_dir + 'sentences.txt', 'r') as f:
            sentences = f.read().splitlines()
    elif load_sentences_from == 'csv':
        # Load from csv.
        sentences = np.array(
            pd.read_csv(input_dir + 'sentences.csv', header=None)).flatten()
    else:
        raise ValueError("load_sentences_from must be 'manually', 'txt', or 'csv'")
    # Preprocess/clean text
    sentences = [
        data_helpers.clean_str_new(sentence) for sentence in sentences
    ]
    # Turn words into IDs based on training data
    Xtrain, Ytrain = data_helpers.load_all_data(
        config.train_path, config.validation_path, categories,
        shuffle=False)  # I changed this so it combines train and validation
    Xtest, Ytest = data_helpers.load_data(config.test_path, categories)
    tokenizer = Tokenizer(
        filters='')  # depending on the word embedding, you may need lower=False
    tokenizer.fit_on_texts(np.append(np.array(Xtrain), np.array(Xtest)))
    sequences = tokenizer.texts_to_sequences(sentences)
    sentences_encoded = pad_sequences(sequences,
                                      maxlen=sequence_length,
                                      padding='post')
    return sentences, sentences_encoded
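
A minimal usage sketch (the input directory is hypothetical; `config`, `categories`, `sequence_length`, and the `data_helpers` module must already be defined, as in the function above):

sentences, sentences_encoded = load_sentences('txt', './data/')
print(sentences_encoded.shape)  # (num_sentences, sequence_length)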
Example #2
                'word21': pd.DataFrame(layer_output_21),
                'word27': pd.DataFrame(layer_output_27),
                'word33': pd.DataFrame(layer_output_33)
            }
            return layer_output_last, timestep_outputs
        else:
            return layer_output_last
    else:
        layer_output_all = pd.DataFrame(layer_output_all)
        return layer_output_all
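
The fragment above packages per-timestep activations into DataFrames. A minimal sketch of how such activations can be pulled from a trained Keras model, reusing the 'kmaxpool_1' layer name from Example #4's commented-out code (`model` and `sentences_encoded` are assumptions):

from keras.models import Model
import pandas as pd

intermediate_model = Model(inputs=model.input,
                           outputs=model.get_layer('kmaxpool_1').output)
layer_output = intermediate_model.predict(sentences_encoded)
# For a 3-D (batch, timestep, features) output, slice one timestep
# before wrapping it in a DataFrame, as the fragment above does:
layer_output_21 = pd.DataFrame(layer_output[:, 21, :])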


# Load data
##============================================================================================
Xtrain, Ytrain = data_helpers.load_all_data(
    config.train_path, config.validation_path, categories,
    shuffle=False)  # I changed this so it combines train and validation
Xtest, Ytest = data_helpers.load_data(config.test_path, categories)
Xtest_raw, Ytest_raw = data_helpers.load_data_raw(config.test_path, categories)
X, y = data_helpers.load_whole_dataset(config.train_path,
                                       config.validation_path,
                                       config.test_path,
                                       categories,
                                       load_all=True,
                                       shuffle=False,
                                       one_hot=False)

import importlib
importlib.reload(data_helpers)  # pick up any edits made to data_helpers

## Encode Ytrain
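
The snippet cuts off at the encoding step; a minimal sketch of it, mirroring Example #4 below:

from keras.utils import np_utils
import numpy as np

Ytrain_encoded = np_utils.to_categorical(Ytrain)  # one-hot labels
Ytrain_integer = np.array(Ytrain)                 # integer labels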
Example #3
import tensorflow as tf
from autoencoder import StackedAutoencoderClassifier
from cnn import CNNLayer
import pandas as pd
from gensim.models import Word2Vec
import os
import data_helpers
import datetime
import time
from numeric_cnn import NumericCNN
from cnn_rnn import CNN_RNN
import sys


text_features_train, numeric_features_train, labels_train = data_helpers.load_all_data('../data/trainingset.txt')

text_features_test, numeric_features_test, labels_test = data_helpers.load_all_data('../data/testset.txt')

max_sentence_length = max([len(x.split(' ')) for x in (text_features_train + text_features_test)])
numeric_feature_num = len(numeric_features_train[0])

init_words_embedded_model = Word2Vec.load('../data/word2vec.model')
num_classes = len(labels_train[0])
l2_reg_lambda = 0.6

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

def train(x_train_cnn, x_train_cnn_rnn, x_train_ncnn, y_train, x_test_cnn, x_test_cnn_rnn, x_test_ncnn, y_test):

    cnn_max_sentence_length = max([len(x.split(' ')) for x in (x_train_cnn + x_test_cnn)])
    ncnn_feature_num = len(x_train_ncnn[0])
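
The `train` function is truncated above. One step such a pipeline typically needs is turning the loaded Word2Vec model into an embedding matrix for the CNN's embedding layer; a minimal sketch under that assumption (`word_index`, a {word: row} mapping built from the corpus, is hypothetical):

import numpy as np

def build_embedding_matrix(w2v, word_index):
    # Row 0 is reserved for padding; out-of-vocabulary words stay all-zero.
    matrix = np.zeros((len(word_index) + 1, w2v.vector_size))
    for word, idx in word_index.items():
        if word in w2v.wv:
            matrix[idx] = w2v.wv[word]
    return matrix

# embedding_matrix = build_embedding_matrix(init_words_embedded_model, word_index)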
Example #4
from keras.utils import np_utils
import numpy as np

# layer_name = 'kmaxpool_1'
# intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)
# Load data
##============================================================================================
#

# importlib.reload(data_helpers)

# Xtrain, Ytrain = data_helpers.load_all_data(config.train_path,config.validation_path, categories, shuffle=False) # I changed this so it combines train and validation
# Xvalidation, Yvalidation = data_helpers.load_data(config.validation_path, categories)
# Xvalidation_raw, Yvalidation_raw = data_helpers.load_data_raw(config.validation_path, categories)

# Xtrain, Ytrain = data_helpers.load_data(config.train_path, categories)
Xtrain, Ytrain = data_helpers.load_all_data(config.train_path,
                                            config.validation_path,
                                            categories,
                                            shuffle=False)
Xvalidation, Yvalidation = data_helpers.load_data(config.validation_path,
                                                  categories)
Xvalidation_raw, Yvalidation_raw = data_helpers.load_data_raw(
    config.validation_path, categories)

## Encode Ytrain
# =====================================================================================
# one-hot encode and integer encode
Ytrain_encoded = np_utils.to_categorical(Ytrain)
Ytrain_integer = np.array(Ytrain)
Yvalidation_encoded = np_utils.to_categorical(Yvalidation)
Yvalidation_integer = np.array(Yvalidation)

# Zero pad (encode) Xtrain and Xvalidation
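
A minimal sketch of the padding step this comment introduces, mirroring Example #1 (`sequence_length` is assumed to be defined as there):

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(np.append(np.array(Xtrain), np.array(Xvalidation)))
Xtrain_encoded = pad_sequences(tokenizer.texts_to_sequences(Xtrain),
                               maxlen=sequence_length, padding='post')
Xvalidation_encoded = pad_sequences(tokenizer.texts_to_sequences(Xvalidation),
                                    maxlen=sequence_length, padding='post')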