Example #1
def data_process():
    # Amazon dataset loading (imdb-style pickle, 6 label classes)
    train, test, _ = imdb.load_data(path=PKL_PATH, n_words=40000,
                                    valid_portion=0.1)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=300, value=0.)
    testX = pad_sequences(testX, maxlen=300, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=6)
    testY = to_categorical(testY, nb_classes=6)
    return trainX, trainY, testX, testY
Example #2
def get_login_pages_imdb(keywords):

    import gensim
    from tflearn.datasets import imdb

    train, test, _ = imdb.load_data(path='imdb.pkl',
                                    n_words=10000,
                                    valid_portion=0.1)

    trainX, trainY = train
    sentences = trainX
    print(len(sentences))
    print(sentences)

    # NOTE: these are the gensim < 4.0 argument names; gensim 4+ renamed
    # size -> vector_size and iter -> epochs, and moved most_similar to model.wv.
    model = gensim.models.Word2Vec(sentences,
                                   size=200,
                                   window=3,
                                   min_count=1,
                                   workers=4,
                                   iter=50)

    for key in keywords:
        print("[%s] most_similar:" % key)
        results = model.most_similar(positive=[key], topn=10)
        for i in results:
            print(i)
Example #3
def get_LSTM(activation='softmax', valid_portion=0.1, learning_rate=0.001):
    '''
    :param activation: one of 'softmax', 'linear', 'tanh', 'sigmoid', 'softplus', 'relu', 'prelu'
    :param valid_portion: fraction of the training data held out for validation
    :param learning_rate: e.g. 1, 0.1, 0.001, 0.0001
    :return: the trained tflearn.DNN model
    '''
    # IMDB Dataset loading
    train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                    valid_portion=valid_portion)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY)
    testY = to_categorical(testY)

    # Network building
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=10000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation=activation)
    net = tflearn.regression(net, optimizer='adam', learning_rate=learning_rate,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=32)
    return model
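
A hedged usage sketch for get_LSTM above, sweeping the learning rates its docstring suggests (assumes the tflearn imports used throughout these examples):

# Hypothetical hyper-parameter sweep; each call trains a fresh model.
for lr in (0.001, 0.0001):
    model = get_LSTM(activation='softmax', valid_portion=0.1, learning_rate=lr)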
Example #4
    def __init__(self, datapath='data/imdb/imdb.pkl'):
        (trainX, trainY), (testX, testY), _ = imdb.load_data(path=datapath,
                                                             n_words=30000)
        self.trainX = pad_sequences(trainX, maxlen=250, value=0.)
        self.trainY = to_categorical(trainY, nb_classes=2)
        self.testX = pad_sequences(testX, maxlen=250, value=0.)
        self.testY = to_categorical(testY, nb_classes=2)
        self.num_examples = len(trainX)
        self.ptr = 0
Example #5
def tflearn_imdb():
    """
    Text sentiment analysis.
    :return:
    """
    # Limit the vocabulary to match the embedding input_dim below
    (X_train, Y_train), (X_test, Y_test), _ = imdb.load_data(n_words=10000)

    # Pad the input sequences (not the labels) to length 100,
    # and one-hot encode the labels
    X_train = pad_sequences(X_train, maxlen=100)
    X_test = pad_sequences(X_test, maxlen=100)
    Y_train = to_categorical(Y_train, nb_classes=2)
    Y_test = to_categorical(Y_test, nb_classes=2)

    network = input_data([None, 100], name="input")
    network = tflearn.embedding(network, input_dim=10000, output_dim=128)

    branch1 = tflearn.conv_1d(network,
                              128,
                              3,
                              padding="valid",
                              activation="relu",
                              regularizer="L2")
    branch2 = tflearn.conv_1d(network,
                              128,
                              4,
                              padding="valid",
                              activation="relu",
                              regularizer="L2")
    branch3 = tflearn.conv_1d(network,
                              128,
                              5,
                              padding="valid",
                              activation="relu",
                              regularizer="L2")

    network = tflearn.merge([branch1, branch2, branch3], mode="concat", axis=1)
    network = tf.expand_dims(network, 2)
    network = tflearn.global_avg_pool(network)
    network = tflearn.dropout(network, 0.5)
    network = tflearn.fully_connected(network, 2, activation="softmax")

    network = tflearn.regression(network,
                                 optimizer="adam",
                                 learning_rate=0.001,
                                 loss="categorical_crossentropy",
                                 name="target")

    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(X_train,
              Y_train,
              n_epoch=5,
              shuffle=True,
              validation_set=(X_test, Y_test),
              show_metric=True,
              batch_size=32)
Example #6
def FitModel(model):
    train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                    valid_portion=0.1)
    trainX, trainY = train
    testX, testY = test
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True, batch_size=32)
    model.save("MyModel.pkl")
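
FitModel expects an already-assembled tflearn.DNN whose input matches the 100-step padded sequences; a minimal construction sketch, mirroring the LSTM network used elsewhere on this page (assumes the usual tflearn imports):

net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')
FitModel(tflearn.DNN(net, tensorboard_verbose=0))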
Example #7
def main():
    (X_train, y_train), (X_test, y_test), _ = imdb.load_data()
    X_train = np.array(pad_sequences(X_train, maxlen=100))

    X_test = np.array(pad_sequences(X_test, maxlen=100))

    vocab_size = X_train.max() + 1
    print('vocab size: {}'.format(vocab_size))
    y_train = to_categorical(np.array(y_train), 2)
    y_test = np.array(y_test)
    cnn = Discriminator(vocab_size, 100, 100, [2, 3], 50, 2)
    cnn.train(X_train, y_train, 5)
Example #8
def main():
    (X_train, y_train), (X_test, y_test), _ = imdb.load_data()
    X_train = np.array(pad_sequences(X_train, maxlen=100))
    print(X_train.shape)
    X_test = np.array(pad_sequences(X_test, maxlen=100))
    vocab_size = X_train.max() + 1
    print('vocab size: {}'.format(vocab_size))
    y_train = X_train.copy()
    print(y_train)

    y_test = np.array(y_test)
    cnn = Generator(100, vocab_size, 50, 100)
    cnn.train(X_train, y_train, 20)
Example #9
    def __init__(self):
        train, test, _ = imdb.load_data(path='imdb.pkl',
                                        n_words=10000,
                                        valid_portion=0.1)
        trainX, trainY = train
        testX, testY = test

        # Data preprocessing
        # Sequence padding
        self.trainX = pad_sequences(trainX, maxlen=200, value=0.)
        self.testX = pad_sequences(testX, maxlen=200, value=0.)
        # Converting labels to binary vectors
        self.trainY = to_categorical(trainY, nb_classes=2)
        self.testY = to_categorical(testY, nb_classes=2)
Example #10
def getData_imdb():
    from tflearn.datasets import imdb

    train, test, _ = imdb.load_data(path='imdb.pkl',
                                    n_words=10000,
                                    valid_portion=0.1)
    trainX, trainY = train
    testX, testY = test
    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    print(trainX.shape)
    print(trainY)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)
    return trainX, testX, trainY, testY
Example #11
def run_on_imdb():
    # IMDB Dataset loading
    train, test, _ = imdb.load_data(path=imdb_dataset_path,
                                    n_words=10000,
                                    valid_portion=0.1)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)

    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=10000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)

    if check_file_exist(imdb_model_path):
        model.load(imdb_model_path)

    model.fit(trainX,
              trainY,
              validation_set=(testX, testY),
              show_metric=True,
              batch_size=32)

    if save_model:
        print("Saving model as 'imdb_model.tfl'")
        model.save(imdb_model_path)

    return 0
Example #12
def MNISTRNN():
    # Note: despite the name, this trains an LSTM on the IMDB sentiment data.
    train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000, valid_portion=0.1)
    X_train, Y_train = train
    X_test, Y_test = test

    X_train = pad_sequences(X_train, maxlen=100, value=0.)
    X_test = pad_sequences(X_test, maxlen=100, value=0.)
    Y_train = to_categorical(Y_train, nb_classes=2)
    Y_test = to_categorical(Y_test, nb_classes=2)

    # LSTM
    RNN = tflearn.input_data([None, 100])
    RNN = tflearn.embedding(RNN, input_dim=10000, output_dim=128)
    RNN = tflearn.lstm(RNN, 128, dropout=0.8)
    RNN = tflearn.fully_connected(RNN, 2, activation='softmax')
    RNN = tflearn.regression(RNN, optimizer='adam', learning_rate=0.001, loss='categorical_crossentropy')

    # train
    model = tflearn.DNN(RNN, tensorboard_verbose=0, tensorboard_dir='MINST_tflearn_board_RNN/')
    model.fit(X_train, Y_train, validation_set=(X_test, Y_test), show_metric=True, batch_size=32)
Example #13
def preprocess_database(database_path: str, maxlen: int, valid_portion: float,
                        n_words: int):
    train_dataset, valid_dataset, test_dataset = imdb.load_data(
        path=os.path.join(database_path, "imdb.pkl"),
        n_words=n_words,
        valid_portion=valid_portion,
        maxlen=maxlen)

    train_dataset = preprocess_dataset(train_dataset, maxlen, PAD_VALUE,
                                       NB_CLASSES)
    valid_dataset = preprocess_dataset(valid_dataset, maxlen, PAD_VALUE,
                                       NB_CLASSES)
    test_dataset = preprocess_dataset(test_dataset, maxlen, PAD_VALUE,
                                      NB_CLASSES)

    convert_dataset_to_tfrecords(train_dataset,
                                 os.path.join(database_path, "train"))
    convert_dataset_to_tfrecords(valid_dataset,
                                 os.path.join(database_path, "valid"))
    convert_dataset_to_tfrecords(test_dataset,
                                 os.path.join(database_path, "test"))
Example #14
def create_datasets(file_path, vocab_size=30000, val_fraction=0.0):

    # IMDB Dataset loading
    train, test, _ = imdb.load_data(path=file_path,
                                    n_words=vocab_size,
                                    valid_portion=val_fraction,
                                    sort_by_len=False)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=FLAGS.max_len, value=0.)
    testX = pad_sequences(testX, maxlen=FLAGS.max_len, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    train_dataset = DataSet(trainX, trainY)

    return train_dataset
Example #17
def main():
    # load IMDB Dataset

    train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000)

    trainX, trainY = train
    testX, testY = test

    print(len(trainX))
    print(len(trainX[0]))

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)

    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    for i in range(0, 20):
        print(trainX[i])

    for i in range(0, 20):
        print(trainY[i])

    # network building
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=10000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001, loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True, batch_size=32)
Example #18
""" 
Author: gonsoomoon
DAte: Sep 10, 2017
Ref: Nikhil Buduma (2017). Fundamentals of deep learning. Sebastopol, CA: O’Reilly Media
"""
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
import numpy as np

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='data/imdb.pkl', n_words=30000,valid_portion=0.1)

# TrainX is a list of 22,500 reviews which consists of indices of words
# The length of each review of the list varies like 25, 52, and 2500
# TrainY is a list of 22,500 sentimantal that is in the form of being negative as 0 or positive as 1
# TestX and TestY are lists of 2,500 that are the same properties as the TrainX and TrainY

trainX, trainY = train
testX, testY = test

#print ("type of trainX: ", type(trainX))
#print ("type of trainY: ", type(trainY))
# type of trainX:  <class 'list'>
print ("shape of trainX: ", np.shape(trainX))
print ("length of trainX: ", len(trainX))
#print ("trainX[0]: ", len(trainX[0]))
#print ("trainX[0]: ", len(trainX[1]))
#print ("trainX[0]: ", len(trainX[100]))
#print ("trainX[0]: ", len(trainX[1000]))

print ("shape of trainY: ", np.shape(trainY))
Example #19
    print("{}={}".format(attr.upper(), value))
print("")

maxlen = FLAGS.maxlen
vocab_size = FLAGS.vocab_size
embedding_dim = FLAGS.embedding_dim
rnn_hidden_size = FLAGS.rnn_hidden_size
num_filters = FLAGS.num_filters
dropout_prob = FLAGS.dropout_prob
learning_rate = FLAGS.learning_rate
batch_size = FLAGS.batch_size
num_epochs = FLAGS.num_epochs

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl',
                                n_words=vocab_size,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Sequence padding
trainX = pad_sequences(trainX, maxlen=maxlen, value=0.)
testX = pad_sequences(testX, maxlen=maxlen, value=0.)

# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Building network
network = input_data(shape=[None, maxlen], name='input')
Example #20
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# Load IMDB
train, test, _ = imdb.load_data(path="imdb.pkl",
                                n_words=10000,
                                valid_portion=0.1)

trainX, trainY = train
testX, testY = test

# Data processing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)

trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Build Network
net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         learning_rate=0.0001,
                         loss='categorical_crossentropy')

# Train
Example #21
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb  # pre-processed IMDB movie reviews from the tflearn datasets


# IMDB Dataset loading
# The data ships as a .pkl (pickle) file, which is easy to operate on in Python.
# n_words=10000 keeps only the 10,000 most frequent words from the reviews;
# valid_portion=0.1 holds out 10% of the training data as a validation set.
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)

trainX, trainY = train  # training data as input (X) and target (Y) vectors
testX, testY = test     # test data as input (X) and target (Y) vectors


# Data preprocessing

# Sequence padding: the network needs equal-length inputs, so sequences are
# padded using the pad_sequences function of tflearn.
"""
Simple example using a Dynamic RNN (LSTM) to classify IMDB sentiment dataset.
Dynamic computation are performed over sequences with variable length.
"""
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

#IMDB Dataset loading
train, valid, test = imdb.load_data(path='data/imdb.pkl',
                                    n_words=10000,
                                    valid_portion=0.1)

trainX, trainY = train
validX, validY = valid

#Data preprocessing
#Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
# Returns: a numpy array with dimensions (number_of_sequences, maxlen)

validX = pad_sequences(validX, maxlen=100, value=0.)

#Converting labels to binary vectors
trainY = to_categorical(trainY)
"""
def to_categorical(y, nb_classes=None)

Convert class vector (integers from 0 to nb_classes)
to binary class matrix, for use with categorical_crossentropy.
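
A quick hedged sanity check of the to_categorical behaviour quoted above (toy labels, run standalone):

from tflearn.data_utils import to_categorical

# One row per label, with a 1 in the label's column:
# [[1, 0], [0, 1], [0, 1], [1, 0]]
print(to_categorical([0, 1, 1, 0], nb_classes=2))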
Example #23
import tflearn
import os
from tflearn.data_utils import  to_categorical, pad_sequences
from tflearn.datasets import imdb

train, test, _ = imdb.load_data(path=os.path.join(os.path.dirname(__file__), 'data', 'imdb.pkl'), n_words=10000,
                                valid_portion=0.1)

trainX, trainY = train
testX, testY = test

# pad_sequences pads or truncates each review to exactly 100 word ids
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)

trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.0001,
                         loss='categorical_crossentropy')

model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True, batch_size=32)
Example #24
# Movie reviews and the sentiment classification of each review
import tflearn
from tflearn.datasets import imdb  # IMDB is a database of movie reviews
from tflearn.data_utils import to_categorical, pad_sequences

train, valid, _ = imdb.load_data()

X, Y = train
X_test, Y_test = valid

# Sequence handling
# 1. Pad the sequences
# pad_sequences(sequences, maxlen, dtype, padding, truncating, value)
# converts a list of nb_samples sequences (of scalars) into a 2-D matrix
# of shape (nb_samples, nb_timesteps).
# If maxlen is given, nb_timesteps = maxlen;
# otherwise nb_timesteps is the length of the longest sequence.
# Sequences shorter than nb_timesteps are filled up with value,
# and sequences longer than nb_timesteps are truncated;
# where the padding and truncation happen depends on the
# padding and truncating arguments.
train_X = pad_sequences(X, maxlen=100, value=0.)
test_X = pad_sequences(X_test, maxlen=100, value=0.)

# 2. Convert the label data to a binary (one-hot) matrix
train_Y = to_categorical(Y, 2)
test_Y = to_categorical(Y_test, 2)

# Build the network
net = tflearn.input_data([None, 100])
# Embedding layer, since the external data comes without word2vec vectors:
# the input dimension is the vocabulary size, embedded into a
# 128-dimensional vector space
net = tflearn.embedding(net, input_dim=100000, output_dim=128)
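
To make the padding rules above concrete, a tiny hedged sketch on toy sequences:

from tflearn.data_utils import pad_sequences

toy = [[7, 8], [1, 2, 3, 4, 5]]
padded = pad_sequences(toy, maxlen=4, value=0.)
print(padded.shape)  # (2, 4), i.e. (nb_samples, nb_timesteps)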
Example #25
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

#load datasets
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=1000, valid_portion=0.01)

trainX, trainY = train
testX, testY = test

#preproc

trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)

trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)

net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         learning_rate=0.0001,
                         loss='categorical_crossentropy')

model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX,
          trainY,
Example #26
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
# Reset the default graph (restart the kernel state)
import tensorflow as tf
print('Kernel Restarting..')
tf.reset_default_graph()
print('Kernel Restarted..')

input_max_row = 10000
input_max_col = 100
nb_classes = 2

# Dataset load:
train, valid, test = imdb.load_data(path='imdb.pkl', n_words=input_max_row,
                                    valid_portion=0.1)
print('Data Loaded..')
trainX, trainY = train
testX, testY = test
valX, valY = valid

# Data preprocessing
trainX = pad_sequences(trainX, maxlen=input_max_col)
testX = pad_sequences(testX, maxlen=input_max_col)
valX = pad_sequences(valX, maxlen=input_max_col)

# Binary label vectors
trainY = to_categorical(trainY, nb_classes=nb_classes)
testY = to_categorical(testY, nb_classes=nb_classes)
valY = to_categorical(valY, nb_classes=nb_classes)
Example #27
import numpy as np
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='data/imdb.pkl', n_words=30000)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=500, value=0.)
testX = pad_sequences(testX, maxlen=500, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)


class IMDBDataset():
    def __init__(self, X, Y):
        self.num_examples = len(X)
        self.inputs = X
        self.tags = Y
        self.ptr = 0

    def minibatch(self, size):
        ret = None
        if self.ptr + size < len(self.inputs):
            x = self.inputs[self.ptr:self.ptr + size]
            y = self.tags[self.ptr:self.ptr + size]
        else:
Example #28
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
import pandas as pd

train, test, _ = imdb.load_data(
    path='imdb.pkl', n_words=10000,
    valid_portion=0.1)  # 10% of data as "validation set"
trainX, trainY = train
testX, testY = test

pd.Series(trainX).tail()
print(list(pd.Series(trainX).iloc[5555]))
pd.Series(trainX).shape

pd.Series(trainY).tail()
pd.Series(trainY).shape
pd.Series(trainY).value_counts()
pd.Series(trainY).value_counts().index.tolist()

len(pd.Series(trainY).value_counts().index.tolist())

# # Data Preprocessing

# ### Sequence Padding
#
# Pad each sequence to the same length: the length of the longest sequence.
# If maxlen is provided, any sequence longer than maxlen is truncated to
# maxlen. Truncation happens off either the beginning (default) or the
# end of the sequence. Supports post-padding and pre-padding (default).
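
The padding and truncation sides described above can be pinned down explicitly rather than relying on the defaults; a small hedged sketch with made-up sequences:

from tflearn.data_utils import pad_sequences

seqs = [[1, 2, 3], [4, 5, 6, 7, 8, 9]]
# Pad and truncate at the front of each sequence:
print(pad_sequences(seqs, maxlen=4, padding='pre', truncating='pre'))    # [[0 1 2 3], [6 7 8 9]]
# Pad and truncate at the end of each sequence:
print(pad_sequences(seqs, maxlen=4, padding='post', truncating='post'))  # [[1 2 3 0], [4 5 6 7]]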
Example #29
        if self.ptr + size < self.num_examples:
            xbatch = self.xdata[self.ptr:self.ptr + size]
            ybatch = self.ydata[self.ptr:self.ptr + size]
        else:
            xbatch = np.concatenate(
                (self.xdata[:self.ptr],
                 self.xdata[:size - len(self.xdata[self.ptr:])]))
            ybatch = np.concatenate(
                (self.ydata[:self.ptr],
                 self.ydata[:size - len(self.ydata[self.ptr:])]))
        self.ptr = (self.ptr + size) % self.num_examples

        return xbatch, ybatch

    @staticmethod
    def pad_sequences(X, maxlen):
        new_seqs = np.zeros((len(X), maxlen))
        for i, seq in enumerate(X):
            if len(seq) <= maxlen:
                new_seqs[i, :len(seq)] = seq
            else:
                new_seqs[i, :] = seq[:maxlen]
        return new_seqs


(x_train, y_train), (x_val, y_val), _ = imdb.load_data('imdb.pkl',
                                                       n_words=10000,
                                                       valid_portion=0.1)
train_data = IMDBDataset(x_train, y_train, 100)
val_data = IMDBDataset(x_val, y_val, 100)
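
A short hedged sketch of consuming the wrap-around minibatcher defined above (assumes this example's IMDBDataset, whose third constructor argument appears to be the pad length):

# Each call returns a fixed-size batch; the read pointer wraps past the
# end of the data instead of raising.
for _ in range(3):
    xbatch, ybatch = train_data.minibatch(64)
    print(len(xbatch), len(ybatch))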
Example #30
    - http://ai.stanford.edu/~amaas/data/sentiment/

"""

from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.embedding_ops import embedding
from tflearn.layers.recurrent import bidirectional_rnn, BasicLSTMCell
from tflearn.layers.estimator import regression

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=200, value=0.)
testX = pad_sequences(testX, maxlen=200, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
net = input_data(shape=[None, 200])
net = embedding(net, input_dim=20000, output_dim=128)
net = bidirectional_rnn(net, BasicLSTMCell(128), BasicLSTMCell(128))
Example #31
    Analysis. The 49th Annual Meeting of the Association for Computational
    Linguistics (ACL 2011).

Links:
    - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
    - http://ai.stanford.edu/~amaas/data/sentiment/

"""
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading
train, val, test = imdb.load_data(path='imdb.pkl', maxlen=200,
                                  n_words=20000)
trainX, trainY = train
valX, valY = val
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=200, value=0.)
valX = pad_sequences(valX, maxlen=200, value=0.)
testX = pad_sequences(testX, maxlen=200, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
valY = to_categorical(valY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
Example #32
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading
train, val, test = imdb.load_data(path='imdb.pkl', maxlen=200, n_words=20000)
trainX, trainY = train
valX, valY = val
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=200, value=0.)
valX = pad_sequences(valX, maxlen=200, value=0.)
testX = pad_sequences(testX, maxlen=200, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
valY = to_categorical(valY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
net = tflearn.input_data([None, 200])
net = tflearn.embedding(net, input_dim=20000, output_dim=128)
net = tflearn.lstm(net, 128)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         loss='categorical_crossentropy')
Example #33
Implemented with the popular tflearn library.
The dataset is preprocessed so that every sentence is turned into numbers
according to the frequency of each word.
'''

import tflearn
from tflearn.data_utils import pad_sequences
from tflearn.datasets import imdb
import numpy as np
import keras

batch_size = 32

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl',
                                n_words=500,
                                valid_portion=0.2)
trainX, trainY = train
testX, testY = test
#print(trainX)

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors

trainY = keras.utils.to_categorical(trainY, 2)
testY = keras.utils.to_categorical(testY, 2)

# Network building
Example #34
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading
train1, test1, actTest = imdb.load_data(path='own.pkl', valid_portion=0.05)
trainX, trainY = train1
testX, testY = test1
actTestX, actTestY = actTest
# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
actTestX = pad_sequences(actTestX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
actTestY = to_categorical(actTestY, nb_classes=2)

# Network building
net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         learning_rate=0.001,
                         loss='categorical_crossentropy')