Example #1
import numpy as np
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

def load_data():
    # Load the reports, one-hot encode the texts, pad them to a fixed
    # length, and split into train/test sets.
    Data = loadReports()  # project-local helper returning {"txt": ..., "label": ...}
    txt = Data["txt"]
    labels = np.array(Data["label"])
    vocab_size = 10000
    encoded_docs = [one_hot(t, vocab_size) for t in txt]
    max_length = 600
    padded_docs = pad_sequences(encoded_docs,
                                maxlen=max_length,
                                padding='post')
    print("X:", padded_docs.shape)
    print("Labels:", labels.shape)

    # RANDOMIZE (shuffling is disabled here, so the split below is deterministic):
    randArray = list(zip(padded_docs, labels))
    #random.shuffle(randArray)

    X, labels = zip(*randArray)
    X = np.array(X)
    labels = np.array(labels)
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        labels,
                                                        test_size=0.33,
                                                        shuffle=False)

    return x_train, x_test, y_train, y_test
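A minimal usage sketch, assuming loadReports() returns a dict with "txt" (a list of report strings) and "label" (a list of class labels), as the code above implies:

    # Hypothetical call; shapes depend on how many reports loadReports() yields.
    x_train, x_test, y_train, y_test = load_data()
    print(x_train.shape)  # (n_train, 600) after padding to max_length=600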
Example #2
import numpy as np
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

def load_data(vocab_size, max_length):
    # So numpy doesn't use an ellipsis when printing large arrays
    np.set_printoptions(threshold=np.inf)

    # Loading data and padding it
    Data = loadReports()  # project-local helper returning {"txt": ..., "label": ...}
    txt = Data["txt"]
    labels = np.array(Data["label"])

    encoded_docs = [one_hot(t, vocab_size) for t in txt]

    padded_docs = pad_sequences(encoded_docs,
                                maxlen=max_length,
                                padding='post')
    print("X:", padded_docs.shape)
    print("Labels:", labels.shape)

    # RANDOMIZE (shuffling is disabled here, so the split below is deterministic):
    randArray = list(zip(padded_docs, labels))
    #random.shuffle(randArray)

    X, labels = zip(*randArray)
    X = np.array(X)
    labels = np.array(labels)
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        labels,
                                                        test_size=0.2,
                                                        shuffle=False)

    return x_train, x_test, y_train, y_test
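This variant parameterizes the vocabulary size and padding length and holds out 20% of the data instead of 33%. A usage sketch reusing Example #1's constants (the values are illustrative, not prescribed by the snippet):

    # Hypothetical call mirroring Example #1's hard-coded settings.
    x_train, x_test, y_train, y_test = load_data(vocab_size=10000, max_length=600)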
Example #3
import numpy as np
from keras.preprocessing.text import Tokenizer

def load_Glove(vocab_size):
    Data = loadReports()  # project-local helper returning {"txt": ...}
    txt = Data["txt"]
    # Parse the GloVe file: each line holds a word followed by its 300-d vector.
    embeddings_index = dict()
    with open('./glove.42B.300d.txt', encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Loaded %s word vectors.' % len(embeddings_index))
    # Create a weight matrix for the words in the training docs.
    t = Tokenizer()
    t.fit_on_texts(txt)
    vocab_size = len(t.word_index) + 1  # note: this overwrites the vocab_size argument
    embedding_matrix = np.zeros((vocab_size, 300))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
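The returned matrix is shaped for a Keras Embedding layer seeded with frozen GloVe weights. A sketch of how it might be wired in (the Sequential model here is an assumption, not part of the original snippet); note that the vocab_size argument is recomputed inside the function from the fitted Tokenizer, so the passed value is effectively ignored:

    from keras.models import Sequential
    from keras.layers.embeddings import Embedding

    embedding_matrix = load_Glove(vocab_size=10000)  # argument is recomputed internally
    model = Sequential()
    model.add(Embedding(input_dim=embedding_matrix.shape[0],  # Tokenizer vocab size + 1
                        output_dim=300,                       # glove.42B.300d dimensionality
                        weights=[embedding_matrix],
                        trainable=False))                     # keep pretrained vectors frozen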
Example #4
import tensorflow as tf
from keras.layers import Dense, Flatten, Reshape, BatchNormalization, CuDNNLSTM, CuDNNGRU
from keras.models import Sequential
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.models import load_model
import numpy as np
import random
import matplotlib.pyplot as plt

# So numpy doesn't use an ellipsis when printing large arrays
np.set_printoptions(threshold=np.inf)

# Loading data and padding it
Data = loadReports()  # project-local helper returning {"txt": ..., "label": ...}
txt = Data["txt"]
labels = np.array(Data["label"])

vocab_size = 10000
encoded_docs = [one_hot(t, vocab_size) for t in txt]
max_length = 300
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

print("X:", padded_docs.shape)
print("Labels:", labels.shape)

#RANDOMIZE:
randArray = list(zip(padded_docs, labels))
random.shuffle(randArray)
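The snippet ends right after the shuffle. A plausible continuation, mirroring the unzip-and-split steps of Examples #1 and #2 (an assumption, since the original is cut off here):

    # Assumed continuation, based on the pattern in Examples #1 and #2.
    X, labels = zip(*randArray)
    X = np.array(X)
    labels = np.array(labels)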