def load_data():
    # Load the report texts and their labels
    Data = loadReports()
    txt = Data["txt"]
    labels = np.array(Data["label"])

    # Hash-encode each document into integer word indices and pad to a fixed length
    vocab_size = 10000
    encoded_docs = [one_hot(t, vocab_size) for t in txt]
    max_length = 600
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    print("X:", padded_docs.shape)
    print("Labels:", labels.shape)

    # RANDOMIZE:
    randArray = list(zip(padded_docs, labels))
    #random.shuffle(randArray)
    X, labels = zip(*randArray)
    X = np.array(X)
    labels = np.array(labels)

    # Keep the original order (shuffle=False) and hold out 33% for testing
    x_train, x_test, y_train, y_test = train_test_split(X, labels, test_size=0.33, shuffle=False)
    return x_train, x_test, y_train, y_test
def load_data(vocab_size, max_length):
    # so numpy doesn't use ellipses when printing
    np.set_printoptions(threshold=np.inf)

    # Loading data and padding it
    Data = loadReports()
    txt = Data["txt"]
    labels = np.array(Data["label"])
    encoded_docs = [one_hot(t, vocab_size) for t in txt]
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    print("X:", padded_docs.shape)
    print("Labels:", labels.shape)

    # RANDOMIZE:
    randArray = list(zip(padded_docs, labels))
    #random.shuffle(randArray)
    X, labels = zip(*randArray)
    X = np.array(X)
    labels = np.array(labels)

    x_train, x_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, shuffle=False)
    return x_train, x_test, y_train, y_test
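# Example call (a minimal sketch, not part of the original file): the
# parameterised loader above would typically be invoked with the same
# settings as the hard-coded version, e.g.
# x_train, x_test, y_train, y_test = load_data(vocab_size=10000, max_length=600)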
def load_Glove(vocab_size):
    Data = loadReports()
    txt = Data["txt"]

    # Read the pre-trained GloVe vectors into a word -> vector dictionary
    embeddings_index = dict()
    f = open('./glove.42B.300d.txt', encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Loaded %s word vectors.' % len(embeddings_index))

    # create a weight matrix for words in training docs
    # (note: the vocab_size argument is replaced by the tokenizer's actual vocabulary size)
    t = Tokenizer()
    t.fit_on_texts(txt)
    vocab_size = len(t.word_index) + 1
    embedding_matrix = np.zeros((vocab_size, 300))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
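# A minimal sketch (not part of the original file; the helper name and the
# default max_length are illustrative assumptions) of how the GloVe weight
# matrix returned by load_Glove() is typically used: it is passed to a frozen
# Keras Embedding layer so the pre-trained 300-d vectors act as fixed word
# representations.
def build_glove_embedding(embedding_matrix, max_length=300):
    # input_dim must match the first dimension of the weight matrix;
    # trainable=False keeps the GloVe vectors fixed during training.
    return Embedding(input_dim=embedding_matrix.shape[0],
                     output_dim=embedding_matrix.shape[1],
                     weights=[embedding_matrix],
                     input_length=max_length,
                     trainable=False)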
import tensorflow as tf
from keras.layers import Dense, Flatten, Reshape, BatchNormalization, CuDNNLSTM, CuDNNGRU
from keras.models import Sequential
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.models import load_model
from sklearn.model_selection import train_test_split
import numpy as np
import random
import matplotlib.pyplot as plt
# loadReports is assumed to be provided by a local helper module of this project

# so numpy doesn't use ellipses when printing
np.set_printoptions(threshold=np.inf)

# Loading data and padding it
Data = loadReports()
txt = Data["txt"]
labels = np.array(Data["label"])
vocab_size = 10000
encoded_docs = [one_hot(t, vocab_size) for t in txt]
max_length = 300
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print("X:", padded_docs.shape)
print("Labels:", labels.shape)

# RANDOMIZE:
randArray = list(zip(padded_docs, labels))
random.shuffle(randArray)