def data_process():
    # data_utils.create_vocabulary(vocabulary_path=vocabulary_Path, data_path=comment_train_set_Path,
    #                              max_vocabulary_size=40000, tokenizer=False)
    # data_utils.data_to_token_ids(data_path=comment_set_Path, target_path=token_comment_set,
    #                              vocabulary_path=vocabulary_Path, tokenizer=False)
    # data_utils.data_to_token_ids(data_path='comment_test_set.txt', target_path='testX',
    #                              vocabulary_path='vocab40000', tokenizer=False)
    # print('end')

    # Amazon dataset loading (stored in the tflearn imdb.pkl format)
    train, test, _ = imdb.load_data(path=PKL_PATH, n_words=40000, valid_portion=0.1)
    trainX, trainY = train
    testX, testY = test

    # with open(file_train_X, 'w') as f:
    #     for strs in trainX:
    #         f.write(' '.join(str(e) for e in strs) + "\n")
    # with open(file_train_Y, 'w') as f:
    #     f.write(' '.join(str(e) + "\n" for e in trainY))

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=300, value=0.)
    testX = pad_sequences(testX, maxlen=300, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=6)
    testY = to_categorical(testY, nb_classes=6)
    return trainX, trainY, testX, testY
def get_login_pages_imdb(keywords):
    import gensim
    import re
    from tflearn.datasets import imdb

    train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000, valid_portion=0.1)
    trainX, trainY = train
    sentences = trainX
    print(len(sentences))
    print(sentences)
    model = gensim.models.Word2Vec(sentences, size=200, window=3, min_count=1,
                                   workers=4, iter=50)
    for key in keywords:
        print("[%s] most_similar:" % key)
        results = model.most_similar(positive=[key], topn=10)
        for i in results:
            print(i)
def get_LSTM(activation='softmax', valid_portion=0.1, learning_rate=0.001):
    '''
    :param activation: 'softmax', 'linear', 'tanh', 'sigmoid', 'softplus', 'relu', 'prelu'
    :param valid_portion:
    :param learning_rate: 1, 0.1, 0.001, 0.0001
    :return:
    '''
    # IMDB Dataset loading
    train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                    valid_portion=valid_portion)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY)
    testY = to_categorical(testY)

    # Network building (the documented parameters are wired through instead of hardcoded)
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=10000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation=activation)
    net = tflearn.regression(net, optimizer='adam', learning_rate=learning_rate,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=32)
def __init__(self, datapath='data/imdb/imdb.pkl'):
    (trainX, trainY), (testX, testY), _ = imdb.load_data(path=datapath, n_words=30000)
    self.trainX = pad_sequences(trainX, maxlen=250, value=0.)
    self.trainY = to_categorical(trainY, nb_classes=2)
    self.testX = pad_sequences(testX, maxlen=250, value=0.)
    self.testY = to_categorical(testY, nb_classes=2)
    self.num_examples = len(trainX)
    self.ptr = 0
def tflearn_imdb():
    """
    Text sentiment analysis
    :return:
    """
    (X_train, Y_train), (X_test, Y_test), _ = imdb.load_data()

    # Pad the inputs (not the labels) and one-hot encode the labels
    X_train, Y_train = pad_sequences(X_train, maxlen=100), to_categorical(Y_train, nb_classes=2)
    X_test, Y_test = pad_sequences(X_test, maxlen=100), to_categorical(Y_test, nb_classes=2)

    network = input_data([None, 100], name="input")
    network = tflearn.embedding(network, input_dim=10000, output_dim=128)

    branch1 = tflearn.conv_1d(network, 128, 3, padding="valid", activation="relu", regularizer="L2")
    branch2 = tflearn.conv_1d(network, 128, 4, padding="valid", activation="relu", regularizer="L2")
    branch3 = tflearn.conv_1d(network, 128, 5, padding="valid", activation="relu", regularizer="L2")

    network = tflearn.merge([branch1, branch2, branch3], mode="concat", axis=1)
    network = tf.expand_dims(network, 2)
    network = tflearn.global_avg_pool(network)
    network = tflearn.dropout(network, 0.5)
    network = tflearn.fully_connected(network, 2, activation="softmax")
    network = tflearn.regression(network, optimizer="adam", learning_rate=0.001,
                                 loss="categorical_crossentropy", name="target")

    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(X_train, Y_train, n_epoch=5, shuffle=True,
              validation_set=(X_test, Y_test), show_metric=True, batch_size=32)
def FitModel(model):
    train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000, valid_portion=0.1)
    trainX, trainY = train
    testX, testY = test

    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=32)
    model.save("MyModel.pkl")
def main():
    (X_train, y_train), (X_test, y_test), _ = imdb.load_data()
    X_train = np.array(pad_sequences(X_train, maxlen=100))
    X_test = np.array(pad_sequences(X_test, maxlen=100))
    vocab_size = X_train.max() + 1
    print('vocab size: {}'.format(vocab_size))
    y_train = to_categorical(np.array(y_train), 2)
    y_test = np.array(y_test)
    cnn = Discriminator(vocab_size, 100, 100, [2, 3], 50, 2)
    cnn.train(X_train, y_train, 5)
def main():
    (X_train, y_train), (X_test, y_test), _ = imdb.load_data()
    X_train = np.array(pad_sequences(X_train, maxlen=100))
    print(X_train.shape)
    X_test = np.array(pad_sequences(X_test, maxlen=100))
    vocab_size = X_train.max() + 1
    print('vocab size: {}'.format(vocab_size))
    y_train = X_train.copy()
    print(y_train)
    y_test = np.array(y_test)
    cnn = Generator(100, vocab_size, 50, 100)
    cnn.train(X_train, y_train, 20)
def __init__(self):
    train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000, valid_portion=0.1)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    self.trainX = pad_sequences(trainX, maxlen=200, value=0.)
    self.testX = pad_sequences(testX, maxlen=200, value=0.)
    # Converting labels to binary vectors
    self.trainY = to_categorical(trainY, nb_classes=2)
    self.testY = to_categorical(testY, nb_classes=2)
def getData_imdb():
    from tflearn.datasets import imdb

    train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000, valid_portion=0.1)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    print(trainX.shape)
    print(trainY)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)
    return trainX, testX, trainY, testY
def run_on_imdb():
    # IMDB Dataset loading
    train, test, _ = imdb.load_data(path=imdb_dataset_path, n_words=10000,
                                    valid_portion=0.1)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=10000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    if check_file_exist(imdb_model_path):
        model.load(imdb_model_path)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=32)
    if save_model:
        print("Saving model as 'imdb_model.tfl'")
        model.save(imdb_model_path)
    return 0
def MNISTRNN():
    # Note: despite the name, this trains an LSTM on the IMDB dataset
    train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000, valid_portion=0.1)
    X_train, Y_train = train
    X_test, Y_test = test

    X_train = pad_sequences(X_train, maxlen=100, value=0.)
    X_test = pad_sequences(X_test, maxlen=100, value=0.)
    Y_train = to_categorical(Y_train, nb_classes=2)
    Y_test = to_categorical(Y_test, nb_classes=2)

    # LSTM
    RNN = tflearn.input_data([None, 100])
    RNN = tflearn.embedding(RNN, input_dim=10000, output_dim=128)
    RNN = tflearn.lstm(RNN, 128, dropout=0.8)
    RNN = tflearn.fully_connected(RNN, 2, activation='softmax')
    RNN = tflearn.regression(RNN, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Train
    model = tflearn.DNN(RNN, tensorboard_verbose=0,
                        tensorboard_dir='MINST_tflearn_board_RNN/')
    model.fit(X_train, Y_train, validation_set=(X_test, Y_test),
              show_metric=True, batch_size=32)
def preprocess_database(database_path: str, maxlen: int, valid_portion: float, n_words):
    train_dataset, valid_dataset, test_dataset = imdb.load_data(
        path=os.path.join(database_path, "imdb.pkl"),
        n_words=n_words,
        valid_portion=valid_portion,
        maxlen=maxlen)

    train_dataset = preprocess_dataset(train_dataset, maxlen, PAD_VALUE, NB_CLASSES)
    valid_dataset = preprocess_dataset(valid_dataset, maxlen, PAD_VALUE, NB_CLASSES)
    test_dataset = preprocess_dataset(test_dataset, maxlen, PAD_VALUE, NB_CLASSES)

    convert_dataset_to_tfrecords(train_dataset, os.path.join(database_path, "train"))
    convert_dataset_to_tfrecords(valid_dataset, os.path.join(database_path, "valid"))
    convert_dataset_to_tfrecords(test_dataset, os.path.join(database_path, "test"))
def create_datasets(file_path, vocab_size=30000, val_fraction=0.0):
    # IMDB Dataset loading
    train, test, _ = imdb.load_data(path=file_path, n_words=vocab_size,
                                    valid_portion=val_fraction, sort_by_len=False)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=FLAGS.max_len, value=0.)
    testX = pad_sequences(testX, maxlen=FLAGS.max_len, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    train_dataset = DataSet(trainX, trainY)
    return train_dataset
def main():
    # Load the IMDB dataset
    train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000)
    trainX, trainY = train
    testX, testY = test
    print(len(trainX))
    print(len(trainX[0]))

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    for i in range(0, 20):
        print(trainX[i])
    for i in range(0, 20):
        print(trainY[i])

    # Network building
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=10000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=32)
""" Author: gonsoomoon DAte: Sep 10, 2017 Ref: Nikhil Buduma (2017). Fundamentals of deep learning. Sebastopol, CA: O’Reilly Media """ from tflearn.data_utils import to_categorical, pad_sequences from tflearn.datasets import imdb import numpy as np # IMDB Dataset loading train, test, _ = imdb.load_data(path='data/imdb.pkl', n_words=30000,valid_portion=0.1) # TrainX is a list of 22,500 reviews which consists of indices of words # The length of each review of the list varies like 25, 52, and 2500 # TrainY is a list of 22,500 sentimantal that is in the form of being negative as 0 or positive as 1 # TestX and TestY are lists of 2,500 that are the same properties as the TrainX and TrainY trainX, trainY = train testX, testY = test #print ("type of trainX: ", type(trainX)) #print ("type of trainY: ", type(trainY)) # type of trainX: <class 'list'> print ("shape of trainX: ", np.shape(trainX)) print ("length of trainX: ", len(trainX)) #print ("trainX[0]: ", len(trainX[0])) #print ("trainX[0]: ", len(trainX[1])) #print ("trainX[0]: ", len(trainX[100])) #print ("trainX[0]: ", len(trainX[1000])) print ("shape of trainY: ", np.shape(trainY))
print("{}={}".format(attr.upper(), value)) print("") maxlen = FLAGS.maxlen vocab_size = FLAGS.vocab_size embedding_dim = FLAGS.embedding_dim rnn_hidden_size = FLAGS.rnn_hidden_size num_filters = FLAGS.num_filters dropout_prob = FLAGS.dropout_prob learning_rate = FLAGS.learning_rate batch_size = FLAGS.batch_size num_epochs = FLAGS.num_epochs # IMDB Dataset loading train, test, _ = imdb.load_data(path='imdb.pkl', n_words=vocab_size, valid_portion=0.1) trainX, trainY = train testX, testY = test # Sequence padding trainX = pad_sequences(trainX, maxlen=maxlen, value=0.) testX = pad_sequences(testX, maxlen=maxlen, value=0.) # Converting labels to binary vectors trainY = to_categorical(trainY, nb_classes=2) testY = to_categorical(testY, nb_classes=2) # Building network network = input_data(shape=[None, maxlen], name='input')
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# Load IMDB
train, test, _ = imdb.load_data(path="imdb.pkl", n_words=10000, valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data processing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Build Network
net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.0001,
                         loss='categorical_crossentropy')

# Train
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# Load pre-processed IMDB movie reviews from the tflearn datasets.
# IMDB Dataset loading: the data ships as a pickled file (imdb.pkl), which is
# easy to operate on in Python. n_words=10000 keeps the 10k most frequent
# words from the reviews; valid_portion=0.1 holds out 10% of the training
# data as a validation set.
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)

trainX, trainY = train  # training data: input (X) and target (Y) vectors
testX, testY = test     # test data: input (X) and target (Y) vectors

# Data preprocessing
# Sequence padding: reviews have variable length, so we pad them to a fixed
# length using tflearn's pad_sequences function.
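# Toy illustration of the padding step (added; not from the original source).
# padding/truncating are passed explicitly rather than relying on defaults:
from tflearn.data_utils import pad_sequences

toy = [[3, 7], [1, 2, 3, 4, 5]]
print(pad_sequences(toy, maxlen=4, value=0., padding='pre', truncating='pre'))
# expected:
# [[0 0 3 7]
#  [2 3 4 5]]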
""" Simple example using a Dynamic RNN (LSTM) to classify IMDB sentiment dataset. Dynamic computation are performed over sequences with variable length. """ import tflearn from tflearn.data_utils import to_categorical, pad_sequences from tflearn.datasets import imdb #IMDB Dataset loading train, valid, test = imdb.load_data(path='data/imdb.pk1', n_words=10000, valid_portion=0.1) trainX, trainY = train validX, validY = valid #Data preprocessing #Sequence padding trainX = pad_sequences(trainX, maxlen=100, value=0.) # Returns:x: `numpy array` with dimensions (number_of_sequences, maxlen) validX = pad_sequences(validX, maxlen=100, value=0.) #Converting labels to binary vectors trainY = to_categorical(trainY) """ def to_categorical(y, nb_classes=None) Convert class vector (integers from 0 to nb_classes) to binary class matrix, for use with categorical_crossentropy.
import tflearn
import os
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

train, test, _ = imdb.load_data(path=os.path.join(os.path.dirname(__file__), 'data', 'imdb.pkl'),
                                n_words=10000, valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# pad_sequences pads each review with zeros (or truncates it) to a fixed
# length of 100 word indices
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.0001,
                         loss='categorical_crossentropy')

model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
          batch_size=32)
# Movie reviews and their corresponding sentiment classification
import tflearn
from tflearn.datasets import imdb  # IMDB is a movie-review database
from tflearn.data_utils import to_categorical, pad_sequences

train, valid, _ = imdb.load_data()
X, Y = train
X_test, Y_test = valid

# Sequence processing
# 1. Pad the sequences
# pad_sequences(sequences, maxlen, dtype, padding, truncating, value)
# converts a list of nb_samples sequences (of scalars) into a 2-D matrix of
# shape (nb_samples, nb_timesteps).
# If maxlen is given, nb_timesteps = maxlen; otherwise nb_timesteps is the
# length of the longest sequence.
# Sequences shorter than nb_timesteps are filled with `value`;
# sequences longer than nb_timesteps are truncated, and where the cut
# happens depends on the padding and truncating arguments.
train_X = pad_sequences(X, maxlen=100, value=0.)
test_X = pad_sequences(X_test, maxlen=100, value=0.)

# 2. Convert the labels to a binary matrix
train_Y = to_categorical(Y, 2)
test_Y = to_categorical(Y_test, 2)

# Build the network
net = tflearn.input_data([None, 100])
# Use an embedding layer because the external data has no word2vec representation:
# the input dimension is the vocabulary space, embedded into a 128-dim vector space
net = tflearn.embedding(net, input_dim=100000, output_dim=128)
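# Toy check of the fill/truncate rules described above (added for
# illustration; not part of the original snippet): post-padding fills short
# sequences with `value`, post-truncating cuts long ones at the end.
from tflearn.data_utils import pad_sequences

short_seq, long_seq = [1, 2], [1, 2, 3, 4, 5, 6]
print(pad_sequences([short_seq, long_seq], maxlen=4, value=0.,
                    padding='post', truncating='post'))
# expected:
# [[1 2 0 0]
#  [1 2 3 4]]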
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# Load datasets (load_data returns three splits; the third is unused here)
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=1000, valid_portion=0.01)
trainX, trainY = train
testX, testY = test

# Preprocessing
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.0001,
                         loss='categorical_crossentropy')

model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY,
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# Reset the default graph (e.g. after a kernel restart in a notebook)
import tensorflow as tf
print('Kernel Restarting..')
tf.reset_default_graph()
print('Kernel Restarted..')

input_max_row = 10000
input_max_col = 100
nb_classes = 2

# Dataset load
train, valid, test = imdb.load_data(path='imdb.pkl', n_words=input_max_row,
                                    valid_portion=0.1)
print('Data Loaded..')
# imdb_review = imdb.load_data(path='imdb.pkl')
# train, test = imdb_review
trainX, trainY = train
testX, testY = test
valX, valY = valid

# Data preprocessing
trainX = pad_sequences(trainX, maxlen=input_max_col)
testX = pad_sequences(testX, maxlen=input_max_col)
valX = pad_sequences(valX, maxlen=input_max_col)

# Binary label vectors
trainY = to_categorical(trainY, nb_classes=nb_classes)
testY = to_categorical(testY, nb_classes=nb_classes)
valY = to_categorical(valY, nb_classes=nb_classes)
import numpy as np
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='data/imdb.pkl', n_words=30000)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=500, value=0.)
testX = pad_sequences(testX, maxlen=500, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)


class IMDBDataset():
    def __init__(self, X, Y):
        self.num_examples = len(X)
        self.inputs = X
        self.tags = Y
        self.ptr = 0

    def minibatch(self, size):
        ret = None
        if self.ptr + size < len(self.inputs):
            x = self.inputs[self.ptr:self.ptr + size]
            y = self.tags[self.ptr:self.ptr + size]
        else:
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
import pandas as pd

train, test, _ = imdb.load_data(
    path='imdb.pkl',
    n_words=10000,
    valid_portion=0.1)  # 10% of data as "validation set"
trainX, trainY = train
testX, testY = test

pd.Series(trainX).tail()
print(list(pd.Series(trainX).iloc[5555]))
pd.Series(trainX).shape

pd.Series(trainY).tail()
pd.Series(trainY).shape
pd.Series(trainY).value_counts()
pd.Series(trainY).value_counts().index.tolist()
len(pd.Series(trainY).value_counts().index.tolist())

# # Data Preprocessing
# ### Sequence Padding
#
# Pad each sequence to the same length: the length of the longest sequence.
# If maxlen is provided, any sequence longer than maxlen is truncated to
# maxlen. Truncation happens off either the beginning (default) or the
# end of the sequence. Supports post-padding and pre-padding (default).
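# Continuing sketch (added; reuses the trainX/testX lists loaded in the
# snippet above): applying the padding just described and checking the shape.
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
print(trainX.shape)  # expected: (22500, 100) with valid_portion=0.1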
        if self.ptr + size < self.num_examples:
            xbatch = self.xdata[self.ptr:self.ptr + size]
            ybatch = self.ydata[self.ptr:self.ptr + size]
        else:
            # Wrap around: take the remaining tail, then fill from the start
            xbatch = np.concatenate(
                (self.xdata[self.ptr:],
                 self.xdata[:size - len(self.xdata[self.ptr:])]))
            ybatch = np.concatenate(
                (self.ydata[self.ptr:],
                 self.ydata[:size - len(self.ydata[self.ptr:])]))
        self.ptr = (self.ptr + size) % self.num_examples
        return xbatch, ybatch

    @staticmethod
    def pad_sequences(X, maxlen):
        # Zero-pad (or truncate) each sequence to exactly maxlen entries
        new_seqs = np.zeros((len(X), maxlen))
        for i, seq in enumerate(X):
            if len(seq) <= maxlen:
                new_seqs[i, :len(seq)] = seq
            else:
                new_seqs[i, :] = seq[:maxlen]
        return new_seqs


(x_train, y_train), (x_val, y_val), _ = imdb.load_data('imdb.pkl', n_words=10000,
                                                       valid_portion=0.1)
train_data = IMDBDataset(x_train, y_train, 100)
val_data = IMDBDataset(x_val, y_val, 100)
- http://ai.stanford.edu/~amaas/data/sentiment/
"""
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.embedding_ops import embedding
from tflearn.layers.recurrent import bidirectional_rnn, BasicLSTMCell
from tflearn.layers.estimator import regression

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=200, value=0.)
testX = pad_sequences(testX, maxlen=200, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
net = input_data(shape=[None, 200])
net = embedding(net, input_dim=20000, output_dim=128)
net = bidirectional_rnn(net, BasicLSTMCell(128), BasicLSTMCell(128))
Analysis. The 49th Annual Meeting of the Association for Computational
Linguistics (ACL 2011).

Links:
    - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
    - http://ai.stanford.edu/~amaas/data/sentiment/
"""
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading
train, val, test = imdb.load_data(path='imdb.pkl', maxlen=200, n_words=20000)
trainX, trainY = train
valX, valY = val
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=200, value=0.)
valX = pad_sequences(valX, maxlen=200, value=0.)
testX = pad_sequences(testX, maxlen=200, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
valY = to_categorical(valY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading
train, val, test = imdb.load_data(path='imdb.pkl', maxlen=200, n_words=20000)
trainX, trainY = train
valX, valY = val
testX, testY = test

# Data preprocessing
# Sequence padding (the validation split is padded too, so it can be used below)
trainX = pad_sequences(trainX, maxlen=200, value=0.)
valX = pad_sequences(valX, maxlen=200, value=0.)
testX = pad_sequences(testX, maxlen=200, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
valY = to_categorical(valY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
net = tflearn.input_data([None, 200])
net = tflearn.embedding(net, input_dim=20000, output_dim=128)
net = tflearn.lstm(net, 128)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
Implemented with the popular tflearn library to build an LSTM.
The dataset is preprocessed: every sentence is converted to numbers
according to the frequency of each word.
'''
import tflearn
from tflearn.data_utils import pad_sequences
from tflearn.datasets import imdb
import numpy as np
import keras

batch_size = 32

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=500, valid_portion=0.2)
trainX, trainY = train
testX, testY = test
# print(trainX)

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = keras.utils.to_categorical(trainY, 2)
testY = keras.utils.to_categorical(testY, 2)

# Network building
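# Illustrative sketch (added; toy code, not the dataset's actual builder):
# one way "numbers according to word frequency" can be assigned, with id 1
# going to the most frequent word.
from collections import Counter

docs = [["good", "movie"], ["bad", "movie"]]
counts = Counter(w for d in docs for w in d)
rank = {w: i + 1 for i, (w, _) in enumerate(counts.most_common())}
print([[rank[w] for w in d] for d in docs])  # e.g. [[2, 1], [3, 1]]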
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading
train, test, actTest = imdb.load_data(path='own.pkl', valid_portion=0.05)
trainX, trainY = train
testX, testY = test
actTestX, actTestY = actTest

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
actTestX = pad_sequences(actTestX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
actTestY = to_categorical(actTestY, nb_classes=2)

# Network building
net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')