import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

(X_train, y_train), (X_test, y_test) = imdb.load_data()
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=500)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)

index = imdb.get_word_index()
reverse_index = dict([(value, key) for (key, value) in index.items()])
decoded = " ".join([reverse_index.get(i - 3, "#") for i in data[0]])
print(decoded)


def plot_loss(loss, v_loss):
    plt.figure(1, figsize=(8, 5))
    plt.plot(loss, 'b', label='train')
    plt.plot(v_loss, 'r', label='validation')
    plt.title('Loss')
    plt.ylabel('loss')
    plt.xlabel('epochs')
    plt.legend()
    plt.show()
    plt.clf()
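# Sequential, Dense, Dropout, Adam and regularizers are imported above but not yet
# used, and plot_loss is never called. A minimal sketch of how they might fit
# together on the 500-word vocabulary loaded above; the vectorisation helper, layer
# sizes and epoch count are assumptions for illustration, not part of the original.
def vectorize(sequences, dimension=500):
    out = np.zeros((len(sequences), dimension))
    for i, seq in enumerate(sequences):
        out[i, [w for w in seq if w < dimension]] = 1.0  # multi-hot encode each review
    return out

x = vectorize(data)
model = Sequential([
    Dense(16, activation='relu', input_shape=(500,),
          kernel_regularizer=regularizers.l2(1e-4)),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer=Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(x, targets, epochs=5, batch_size=512, validation_split=0.2)
plot_loss(history.history['loss'], history.history['val_loss'])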
embeddings_index = dict()
f = open('glove.6B.100d.txt', encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

# To create the embedding matrix:
vocabulary_size = 10000
word2id = imdb.get_word_index()  # dictionary from words to integers (the id of the word in the vocab)
id2word = {i: word for word, i in word2id.items()}

# Embedding matrix holds the vector representation for the words.
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in word2id.items():
    if index > vocabulary_size - 1:
        continue
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

# for i in range(0, 3):
#     print("The glove embedding for '{}' is {} ".format(list(word2id.keys())[i], embedding_matrix[i]))

(X_train_full, y_train_full), (X_test, y_test) = imdb.load_data(num_words=vocabulary_size)
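# The embedding_matrix built above is never wired into a model in this snippet.
# A minimal sketch of how it could seed a Keras Embedding layer; freezing the
# pretrained GloVe vectors (trainable=False) is an assumption for illustration.
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(vocabulary_size, 100,
                            embeddings_initializer=Constant(embedding_matrix),
                            trainable=False)  # keep the pretrained GloVe vectors fixed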
# Wait, so Keras' official data-loading helper function gives us a *numpy array
# of python lists?* I realize that performance is not super important, but why
# not just a 2D array?
from tensorflow.keras.datasets import imdb

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

# Sanity check our training data
train_data[0][0:10]
train_labels[0]

# Since we restricted ourselves to the top 10,000 most frequent words,
# no word index will exceed 10,000:
max([max(sequence) for sequence in train_data])

# For kicks, here's how you can quickly decode one of these reviews back to English words:
# word_index is a dictionary mapping words to an integer index
word_index = imdb.get_word_index()
# We reverse it, mapping integer indices to words
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# We decode the review; note that our indices were offset by 3
# because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
decoded_review = ' '.join([reverse_word_index.get(i - 3, '�') for i in train_data[0]])
decoded_review


def decode_review(arg):
    # We decode the review; note that our indices were offset by 3
    # because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
    if isinstance(arg, int):
        return ' '.join([reverse_word_index.get(i - 3, '�') for i in train_data[arg]])
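# As for the question above: the reviews all have different lengths, so load_data
# cannot hand back a rectangular 2D array; it returns an object array of Python
# lists instead. A quick illustrative check:
print(train_data.dtype)                        # object
print(len(train_data[0]), len(train_data[1]))  # typically different lengths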
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import layers, metrics, models, optimizers
from tensorflow.keras.datasets import imdb


def run_imdb():
    # Extract useful data from dataset
    print('Extracting the IMDB dataset')
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

    # Illustration of the input data
    print(
        f'In this dataset a label of 1 indicates a positive review, 0 a negative review.\nHaving taken the top 10,000'
        f' most-used words no word index will exceed 10,000.\nMax Index = '
        f'{max([max(sequence) for sequence in train_data])}')
    print(
        f"For the sake of illustration, let's decode a review back to English (not being printed for easier reading)")
    word_index = imdb.get_word_index()
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
    # print(decoded_review)

    # Encoding the inputs
    print("In order to pass these lists of integers into a neural network we must first encode them as tensors of "
          "uniform length.\nIn this example we'll use one-hot encoding, done manually for the sake of understanding.")

    def vectorise_sequences(sequences, dimension=10000):
        ret = np.zeros((len(sequences), dimension))
        for i, sequence in enumerate(sequences):
            ret[i, sequence] = 1
            if i < 1:
                print(f"\n{sequence} => {ret[i]}\n")
        return ret

    x_train = vectorise_sequences(train_data)
    y_train = np.asarray(train_labels).astype('float32')
    x_test = vectorise_sequences(test_data)
    y_test = np.asarray(test_labels).astype('float32')

    # Design and compile the model
    print("Now to build the network, this time using parameters with greater configurability")
    model = models.Sequential()
    model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=[metrics.binary_accuracy])

    # Divide the training data
    print("Creating a validation set for greater insight during training")
    x_val = x_train[:10000]            # Taking the 1st 10000 samples for validation
    partial_x_train = x_train[10000:]  # Leaving everything from 10000 onwards for training
    y_val = y_train[:10000]            # Taking the 1st 10000 labels for validation
    partial_y_train = y_train[10000:]  # Leaving everything from 10000 onwards for training

    # Train the model
    print("Begin training the model:")
    history = model.fit(partial_x_train, partial_y_train, epochs=20, batch_size=512,
                        validation_data=(x_val, y_val))
    history_dict = history.history
    print(f"\nNote that the history returned by the fit function has a 'history' member which is a dictionary. "
          f"The keys are: {history_dict.keys()}")
    # ['loss', 'binary_accuracy', 'val_loss', 'val_binary_accuracy']

    # Prepare to plot the training and validation information
    loss_values = history_dict['loss']
    val_loss_values = history_dict['val_loss']
    acc_values = history_dict['binary_accuracy']
    val_acc_values = history_dict['val_binary_accuracy']
    epochs = range(1, len(history_dict['binary_accuracy']) + 1)

    plt.plot(epochs, loss_values, 'bo', label='Training Loss')
    plt.plot(epochs, val_loss_values, 'b', label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
    plt.clf()

    plt.plot(epochs, acc_values, 'bo', label='Training Accuracy')
    plt.plot(epochs, val_acc_values, 'b', label='Validation Accuracy')
    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

    # Evaluate the model
    print("\nAfter reviewing each plot, evaluate the performance of the model on new data")
    results = model.evaluate(x_test, y_test)
    print(f"Evaluation Results: Loss = {results[0]} Accuracy = {results[1] * 100}%")
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence

# Preprocessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters
min_word_count = 1
context = 10
embedding_dim = 50  # not set in the original snippet; value assumed for illustration

imdb = tf.keras.datasets.imdb
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000, start_char=None,
                                                      oov_char=None, index_from=None)
x_train = sequence.pad_sequences(x_train, maxlen=sequence_length, padding="post", truncating="post")
x_test = sequence.pad_sequences(x_test, maxlen=sequence_length, padding="post", truncating="post")

vocabulary = imdb.get_word_index()
vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
vocabulary_inv[0] = "<PAD/>"

embedding_weights = train_word2vec(np.vstack((x_train, x_test)), vocabulary_inv,
                                   num_features=embedding_dim,
                                   min_word_count=min_word_count, context=context)

# teacher-predictions
# teacher = 'lstm'
# def get_teacher_predictions(teacher):
#     if teacher == 'lstm':
#         return y_train
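# train_word2vec is not defined in this snippet. A minimal sketch of what such a
# helper might look like, assuming gensim (>= 4.0) is available; the names and
# defaults here are illustrative assumptions, not the original implementation.
import numpy as np
from gensim.models import Word2Vec


def train_word2vec(sequences, vocabulary_inv, num_features=50, min_word_count=1, context=10):
    # Turn the padded integer sequences back into lists of word tokens.
    sentences = [[vocabulary_inv[idx] for idx in seq] for seq in sequences]
    model = Word2Vec(sentences, vector_size=num_features, min_count=min_word_count,
                     window=context, workers=2, seed=1)
    # Map each word id to its learned vector; ids the model never saw get small
    # random vectors so an Embedding layer can still be initialised from the result.
    return {idx: model.wv[word] if word in model.wv
            else np.random.uniform(-0.25, 0.25, num_features).astype('float32')
            for idx, word in vocabulary_inv.items()}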
import numpy
import pandas
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz", num_words=None, skip_top=0,
                                                      maxlen=666, seed=113, start_char=1,
                                                      oov_char=2, index_from=3)
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=666)

word_to_id = imdb.get_word_index()
word_to_id = {k: (v + 3) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
id_to_word = {value: key for key, value in word_to_id.items()}
print(' '.join(id_to_word[id] for id in x_train[0]))

model = keras.Sequential()
model.add(Dense(500, input_dim=666, activation='tanh'))
model.add(Dense(200, activation='tanh'))
model.add(Dense(1, name="output_layer", activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adadelta',  # 'AdaDelta' is not a recognised identifier string
              metrics=['accuracy'])  # metrics argument assumed; the original snippet is truncated here
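# Embedding, LSTM and Bidirectional are imported above but never used in the visible
# part of the snippet. A minimal sketch of how they are typically wired together for
# this dataset; the layer sizes and embedding dimension are assumptions.
vocab_size = max(word_to_id.values()) + 1  # largest word id plus one
rnn_model = keras.Sequential([
    Embedding(vocab_size, 64),        # map id sequences to dense vectors
    Bidirectional(LSTM(32)),          # read each review in both directions
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])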
return pre_proc_text  # tail of a text-preprocessing helper; its body is not part of this snippet

import numpy as np
from tensorflow.keras.datasets import imdb

# The IMDB data contains 88,584 distinct words in total (the vocabulary size).
# Keep only the 6,000 most frequent words and mark everything else as OOV.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=6000, start_char=0,
                                                      oov_char=0, index_from=0)

# Combine the train and test data; they can be re-split later if needed.
text = np.hstack([x_train, x_test])
label = np.hstack([y_train, y_test])

# Fetch the vocabulary.
word2idx = imdb.get_word_index()
idx2word = dict((v, k) for k, v in word2idx.items())

# start_char and oov_char are represented as '.'; they are removed later during preprocessing.
idx2word[0] = '.'


# Convert the integer-encoded x_train back into actual words.
def decode(review):
    x = [idx2word[s] for s in review]
    return ' '.join(x)


# Preprocess the review documents.
reviews = []
for i, review in enumerate(text):
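# The loop above is cut off before its body. An illustrative sketch of the kind of
# preprocessing step it might perform (decode to text, drop the '.' placeholders);
# clean_review is a hypothetical helper, not the snippet's own pre_proc_text,
# whose body is not shown.
import re

def clean_review(doc):
    return re.sub(r'\s*\.\s*', ' ', doc).strip()  # remove the '.' placeholder tokens

# for i, review in enumerate(text):
#     reviews.append(clean_review(decode(review)))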
Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1zs--3ULwCynfqLilHDzTUEoPb51c_qTT
"""

import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import imdb
from tensorflow.keras import models, layers
from tensorflow import keras
import tensorflow as tf

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)  # Keep only the 10,000 most frequent words

word_index = imdb.get_word_index()  # word_index is a dictionary mapping words to an integer index.
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])  # Reverse it, mapping integer indices to words

# Decode the review to read the content. Note that the indices are offset by 3,
# because 0, 1, 2 are reserved indices for "padding", "start of sequence", and "unknown".
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[100]])
print(decoded_review)


# one-hot encoding
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results


x_train = vectorize_sequences(train_data)
# The training and test data are loaded via Keras.
# Alternatively, you can download the file directly as a pickle file.
(X_train, y_train), (X_test, y_test) = imdb.load_data(path="imdb.npz",
                                                      num_words=VOCABULARY_SIZE,
                                                      skip_top=0,
                                                      maxlen=None,
                                                      seed=113,
                                                      start_char=START_CHAR,
                                                      oov_char=2,
                                                      index_from=INDEX_FROM)

# The vocabulary file imdb_word_index.json is downloaded here.
word_to_id = imdb.get_word_index(path="./imdb_word_index.json")

# Store the correct index for each word: there is an index shift of +3 (see the explanation at
# https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification)
# From: https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = START_CHAR  # 1
word_to_id["<UNK>"] = 2
id_to_word = {value: key for key, value in word_to_id.items()}

# Show the content of one review (selected by REVIEW_INDEX).
REVIEW_INDEX = 2
print(X_train[REVIEW_INDEX])
print("---- Review text --------- ")
print(' '.join(id_to_word[id] for id in X_train[REVIEW_INDEX]))
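# A quick sanity check of the index shift described above: the raw word index maps
# "the" to 1, so after adding INDEX_FROM it should decode from id 1 + INDEX_FROM.
# Illustrative only, assuming the Keras default offset of 3.
assert word_to_id["the"] == 1 + INDEX_FROM
assert id_to_word[1 + INDEX_FROM] == "the"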
import os

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D
from tensorflow.keras.datasets import imdb

# load_data() below uses the default index_from=3, so the raw word indices
# are shifted by 3 before the reserved tokens are added.
word_indexes = imdb.get_word_index()
word_indexes = {k: (v + 3) for k, v in word_indexes.items()}
word_indexes["<PAD>"] = 0
word_indexes["<START>"] = 1
word_indexes["<UNK>"] = 2
reverse_word_indexes = {v: k for k, v in word_indexes.items()}


def load_data():
    (X1, y1), (X2, y2) = imdb.load_data()
    return X1, y1, X2, y2


def preprocess_data(train_data, test_data):
    train_data = keras.preprocessing.sequence.pad_sequences(train_data, maxlen=512,
                                                            value=0, padding='post')
    test_data = keras.preprocessing.sequence.pad_sequences(test_data, maxlen=512,
                                                           value=0, padding='post')
    return train_data, test_data
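# The imports above suggest the usual Embedding + GlobalAveragePooling1D classifier,
# but the visible snippet stops at the helpers. A minimal usage sketch; the layer
# sizes, epoch count and vocabulary size are assumptions, not the original values.
if __name__ == "__main__":
    X_train, y_train, X_test, y_test = load_data()
    X_train, X_test = preprocess_data(X_train, X_test)

    vocab_size = max(word_indexes.values()) + 1  # largest word id plus one
    model = Sequential([
        Embedding(vocab_size, 16),
        GlobalAveragePooling1D(),        # average the word vectors over each review
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=512, validation_split=0.2)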
import tensorflow as tf

# Only enable memory growth if a GPU is actually present.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from flask import Flask, jsonify, make_response, request

import google.cloud.logging
import logging

client = google.cloud.logging.Client()
client.setup_logging()

app = Flask(__name__)

encoder_dict = imdb.get_word_index(path="imdb_word_index.json")


def encode(sent):
    lst = []
    for i in sent.lower().split():
        if i in encoder_dict.keys():
            if encoder_dict[i] < 50000:
                lst.append(encoder_dict[i])
    return lst


dec_d = {v: k for k, v in encoder_dict.items()}


def decode(sent):
    out = ''
    for i in sent:
        if i in dec_d.keys():
            out = out + " " + dec_d[i]
    return out
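# Flask and pad_sequences are imported above but no route appears in the visible
# snippet. A minimal sketch of a prediction endpoint; the model file name, maxlen
# and route name are assumptions for illustration, not the original service code.
model = tf.keras.models.load_model('imdb_model.h5')  # assumed model file


@app.route('/predict', methods=['POST'])
def predict():
    review = request.get_json().get('review', '')
    encoded = sequence.pad_sequences([encode(review)], maxlen=400)  # maxlen assumed
    score = float(model.predict(encoded)[0][0])
    return jsonify({'positive_probability': score})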
import subprocess
import requests
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.datasets import imdb

### load data and preprocess
max_len = 32
trunc_type = 'post'
padding_type = 'post'

(train_x, train_y), (test_x, test_y) = imdb.load_data()

word2idx_dict = imdb.get_word_index()
# idx2word_dict = {idx: word for word, idx in word2idx_dict.items()}
# print(train_x.shape, test_x.shape)

# def idxs2sentence(idxs):
#     return [idx2word_dict[idx] for idx in idxs]

# print(train_x[1])
# print(idxs2sentence(train_x[1]))

vocab_size = len(word2idx_dict)

train_x = pad_sequences(train_x, maxlen=max_len, truncating=trunc_type, padding=padding_type)
test_x = pad_sequences(test_x, maxlen=max_len, truncating=trunc_type,
                       padding=padding_type)  # arguments assumed to mirror train_x; the original is cut off here