Example no. 1
def build_model(vocabFile, model_type='bilstm'):

    # restore the fitted vocabulary to size the embedding layer
    processor = VocabularyProcessor.restore(vocabFile)
    n_words = len(processor.vocabulary_)

    # input sequences are word-ID vectors of length 300
    net = tflearn.input_data([None, 300])
    net = tflearn.embedding(net, input_dim=n_words, output_dim=200)

    if model_type == 'bilstm':
        net = tflearn.bidirectional_rnn(net, tflearn.BasicLSTMCell(200),
                                        tflearn.BasicLSTMCell(200))
        net = tflearn.dropout(net, 0.5)
    elif model_type == 'lstm':
        net = tflearn.lstm(net, 200, dropout=0.5)
        # additional dropout on the LSTM output
        net = tflearn.dropout(net, 0.5)
    elif model_type == 'cnn':
        net = conv_model(net)

    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=0.05,
                             loss='categorical_crossentropy')

    return net
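
A minimal usage sketch for build_model, wrapping the returned graph in tflearn.DNN and training it; the vocabulary file and the training arrays here are hypothetical placeholders.

# hypothetical usage (vocab.pickle, trainX, trainY are assumed to exist)
net = build_model('vocab.pickle', model_type='bilstm')
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, n_epoch=5, validation_set=0.1, show_metric=True)
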
Example no. 2

class SentimentLookup:
    net = tflearn.input_data     ([None, 40])
    net = tflearn.embedding      (net, input_dim=12495, output_dim=128)
    net = tflearn.lstm           (net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression     (net, optimizer='adam', learning_rate=0.001,
                                       loss='categorical_crossentropy')
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.load(MODEL)

    vp = VocabularyProcessor.restore(VOCAB)
    
    def _process_tweet(self, tweet=""):
        # normalize a tweet to uppercase A-Z words, e.g.
        # "@user I love it! <3" -> "USERNAME I LOVE IT"
        cleaned = str(tweet).upper()
        cleaned = re.sub(r'&\w+;',   '',          cleaned)  # drop HTML entities
        cleaned = re.sub(r'\'',      '',          cleaned)  # drop apostrophes
        cleaned = re.sub(r'@\w+ ',   'USERNAME ', cleaned)  # unify @-mentions
        cleaned = re.sub(r'[^A-Z ]', '',          cleaned)  # keep letters and spaces only
        cleaned = re.sub(r'[ ]+',    ' ',         cleaned)  # collapse whitespace
        return cleaned.strip()

    def sentiment(self, data):
        if isinstance(data, str):
            query = [x for x in SentimentLookup.vp.transform([self._process_tweet(data)])]
            bad, good = SentimentLookup.model.predict(query).tolist()[0]
            return good

        data  = map(self._process_tweet, data)
        query = [x for x in SentimentLookup.vp.transform(data)]
        return SentimentLookup.model.predict(query)[:,1]
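
Because net, model, and vp are class attributes, the graph and vocabulary are loaded once when the class is first defined and shared by every instance. A hypothetical usage sketch:

lookup = SentimentLookup()
print(lookup.sentiment("I really enjoyed this movie!"))    # single float: P(positive)
print(lookup.sentiment(["great stuff", "utter garbage"]))  # array of P(positive) values
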
Example no. 3

def load_vocab_processor(name, max_length, min_frequency):
    '''
    Load a saved VocabularyProcessor from disk.
    '''
    print('Loading vocabulary model from {}'.format(name))
    vp = VocabularyProcessor(max_length, min_frequency=min_frequency)
    vp = vp.restore(name)
    return vp
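
A short usage sketch; the file name and parameters are hypothetical and must match the values the processor was originally fitted with:

vp = load_vocab_processor('vocab.pickle', max_length=40, min_frequency=2)
ids = list(vp.transform(['some raw text']))  # list of fixed-length word-ID arrays
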
Example no. 4
    def __init__(self):
        # build the vocabulary on the first run; reuse the saved one afterwards
        if not exists(VOCABULARY_PATH):
            self._vocab = self._create_vocab()
            self._vocab.save(VOCABULARY_PATH)
        else:
            self._vocab = VocabularyProcessor.restore(VOCABULARY_PATH)

        self._model = self._create_model()
        # a TensorFlow checkpoint writes a .meta graph file, so its presence
        # means trained weights exist; True loads weights only
        if exists(MODEL_PATH + '.meta'):
            self._model.load(MODEL_PATH, True)
Example no. 5
def text2npy(inFile, outFile, vocabFile, dtype):

    processor = VocabularyProcessor.restore(vocabFile)
    doc = MySentences(inFile, dtype, 'get_content')
    train_doc = list(processor.transform(doc))

    # to_categorical could be used here for one-hot encoding:
    # to_categorical(np.array(label), NB_CLASSES)

    if dtype == 'train':
        # convert the labels to an integer array
        label = []
        for y in MySentences(inFile, dtype, 'get_info'):
            label.append(int(y))
        y = np.array(label)
        # save features and labels together, labels as the last column
        np.save(outFile, np.column_stack([train_doc, y]))
    elif dtype == 'test':
        np.save(outFile, train_doc)

        fw = open(outFile + "_doc.txt", 'w', encoding='utf8')
        for y in MySentences(inFile, dtype, 'get_info'):
            fw.write(y + "\n")
        fw.close()
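
Since the training output stacks the label as the last column, a hypothetical consumer would split the saved matrix back apart like this (the file name is assumed):

# load the saved training matrix and separate features from labels
data = np.load('train.npy')
X, y = data[:, :-1], data[:, -1].astype(int)
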
Example no. 6
        # convert traditional Chinese characters to simplified
        text = HanziConv.toSimplified(document)
        # lowercase any English text
        text = text.lower()
        # segment into words
        yield list(cut(text))


# pad or truncate each sequence to length 100; remove words with frequency <= 2
vocab = VocabularyProcessor(100, 2, tokenizer_fn=chinese_tokenizer)

# build the vocabulary; it cannot be modified after fitting
vocab.fit(DOCUMENTS)

# save and reload the vocabulary
vocab.save('vocab.pickle')
vocab = VocabularyProcessor.restore('vocab.pickle')

# convert text to word-ID sequences; unknown and padding words map to ID 0
id_documents = list(vocab.transform(DOCUMENTS))
for id_document in id_documents:
    print(id_document)
# [2 3 1 0 0 0 ... 0 0 0]   (length-100 ID vector; printed identically for the first three documents)
# [2 0 1 0 0 0 ...          (fourth document; output truncated here in the source)
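
VocabularyProcessor also provides a reverse() method that maps ID sequences back into tokens; a quick sketch reusing the vocab object from this example:

# map the ID sequences back to tokens to sanity-check the vocabulary
for tokens in vocab.reverse(id_documents):
    print(tokens)  # space-joined tokens; ID 0 comes back as <UNK>
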
Example no. 7

def restore(self):
    self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(self.model_dir))
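
Note that VocabularyProcessor.restore is a classmethod that returns a new instance, so the assignment above is what actually keeps the loaded vocabulary; calling restore on an existing instance and discarding the return value leaves that instance empty.
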
Example no. 8

class CNN(object):
    def __init__(self, batch_size=64):
        self.batch_size = batch_size
        self.number_of_classes = 2
        self.X_train = []
        self.X_test = []
        self.Y_train = []
        self.Y_test = []
        self.max_words = None
        self.vocabProcessor = None
        self.cnn_model = None
        self.model = None
        self.test_x = []
        self.test_y = []

    def load_dataset_training(self,
                              vocab_name,
                              filename='datasetWithoutNeutral'):
        """ Load the dataset """
        X, Y = load_csv('datasets/' + filename,
                        target_column=2,
                        columns_to_ignore=[0])
        """ Count max words from the longest sentence """
        self.max_words = max([len(x[0].split(" ")) for x in X])
        """ Get vocabulare size from longest sentence """
        self.vocabProcessor = VocabularyProcessor(self.max_words)
        """ Encode pos, neu and neg to numbers """
        labelEncoder = LabelEncoder()
        labelEncoder.fit(Y)
        Y = labelEncoder.transform(Y)
        """ Change the list of sentences to a list of sequence of words """
        X = np.array(list(self.vocabProcessor.fit_transform([x[0]
                                                             for x in X])))
        """ Split the datasets to training set and test test """
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X, Y, test_size=0.10, random_state=7)
        """ Pad the sequences to fit the longest sentence """
        self.X_train = pad_sequences(self.X_train,
                                     maxlen=self.max_words,
                                     value=0.)
        self.X_test = pad_sequences(self.X_test,
                                    maxlen=self.max_words,
                                    value=0.)
        """ Convert labels to binary vector """
        self.Y_train = to_categorical(self.Y_train,
                                      nb_classes=self.number_of_classes)
        self.Y_test = to_categorical(self.Y_test,
                                     nb_classes=self.number_of_classes)
        self.vocabProcessor.save(vocab_name)

    def create_cnn_architecture_two_layers(
            self,
            model_name,
            outputDim=300,
            number_of_filters=60,
            filterSize=[3, 4],
            padding='same',
            activation_function_convLayer='relu',
            regularizer='L2',
            dropouts=0.5,
            activation_function_fc='softmax',
            optimizer='adam',
            learning_rate=0.001,
            loss_function='categorical_crossentropy'):
        if len(filterSize) == 0:
            filterSize = [3, 4]
        """ Define input shape and create word embedding """
        self.cnn_model = input_data(shape=[None, self.max_words], name='input')
        self.cnn_model = tflearn.embedding(
            self.cnn_model,
            input_dim=len(self.vocabProcessor.vocabulary_),
            output_dim=outputDim)
        """ Add three/two convolutional layer. Set number of filters and filter sizes and then merge together """
        conv1 = conv_1d(self.cnn_model,
                        nb_filter=number_of_filters,
                        filter_size=filterSize[0],
                        padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        conv2 = conv_1d(self.cnn_model,
                        nb_filter=number_of_filters,
                        filter_size=filterSize[1],
                        padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        #conv3 = conv_1d(cnn_model, nb_filter = 128,  filter_size = 5, padding = 'same',
        #                 activation = 'relu', regularizer = 'L2')
        self.cnn_model = merge([conv1, conv2], mode='concat', axis=1)
        """ Expand one dimension to fit the max_pooling layer """
        self.cnn_model = tf.expand_dims(self.cnn_model, 1)
        self.cnn_model = global_max_pool(self.cnn_model)
        """ Instantiate dropout layer and specify dropout parameter """
        self.cnn_model = dropout(self.cnn_model, dropouts)
        """ Instantiate fully connected layer and regression layer. """
        self.cnn_model = fully_connected(self.cnn_model,
                                         self.number_of_classes,
                                         activation=activation_function_fc)
        self.cnn_model = regression(self.cnn_model,
                                    optimizer=optimizer,
                                    learning_rate=learning_rate,
                                    loss=loss_function,
                                    name='models/' + model_name)

    def train_and_save(self,
                       model_name,
                       tensorboard_verbose=0,
                       tensorboard_dir='/logs/',
                       nb_epochs=5,
                       shuffle=True,
                       show_metric=True):
        """ Instantiate Deep neural network model and start the training """
        self.model = tflearn.DNN(self.cnn_model,
                                 tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.fit(self.X_train,
                       self.Y_train,
                       n_epoch=nb_epochs,
                       validation_set=(self.X_test, self.Y_test),
                       shuffle=shuffle,
                       show_metric=show_metric,
                       batch_size=self.batch_size,
                       run_id=model_name)
        """ Save the model """
        self.model.save('models/' + model_name)

    def load_model(self,
                   model_name,
                   outputDim=300,
                   number_of_filters=60,
                   filterSize=[3, 4],
                   padding='same',
                   activation_function_convLayer='relu',
                   regularizer='L2',
                   dropouts=0.5,
                   activation_function_fc='softmax',
                   optimizer='adam',
                   learning_rate=0.001,
                   loss_function='categorical_crossentropy',
                   tensorboard_verbose=0,
                   tensorboard_dir='/logs/'):
        """
            Has to pass the same values that the models were trained with. If the
            model was trained on default values, the parameters will pass it automatically.
        """

        self.create_cnn_architecture_two_layers(
            model_name, outputDim, number_of_filters, filterSize, padding,
            activation_function_convLayer, regularizer, dropouts,
            activation_function_fc, optimizer, learning_rate, loss_function)

        self.model = tflearn.DNN(self.cnn_model,
                                 tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.load('models/' + model_name)

    def load_test_dataset(self,
                          filename='testDatasetWithOutNeuTwo',
                          vocab_name='vocabProc'):
        """
            Something is wrong with this function. Does not get the same result
            as before when loading in the new data...
        """
        """ Load test dataset """
        self.test_x, self.test_y = load_csv('datasets/' + filename,
                                            target_column=1)
        """ Get restored vocabulary processor """
        # restore is a classmethod returning a new instance; assign its result
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)
        """ Encode pos, neu and neg to numbers  """
        labelEncoder = LabelEncoder()
        labelEncoder.fit(self.test_y)
        self.test_y = labelEncoder.transform(self.test_y)
        """ Change the list of sentences to a list of sequence of words """
        self.test_x = np.array(
            list(self.vocabProcessor.transform([x[0] for x in self.test_x])))
        """ Pad the sequences to fit the longest sentence """
        self.test_x = pad_sequences(self.test_x,
                                    maxlen=self.max_words,
                                    value=0.)
        """ Convert labels to binary vector """
        self.test_y = to_categorical(self.test_y,
                                     nb_classes=self.number_of_classes)

    def evaluate_model_performance(self):
        metric_score = self.model.evaluate(self.test_x,
                                           self.test_y,
                                           batch_size=self.batch_size)
        return metric_score

    def predict_one_sentence(self, sentence=[['']], vocab_name='vocabProc'):
        """ Load vocabulary processor """
        # restore is a classmethod returning a new instance; assign its result
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)
        """ Transform sentence to matrix of numbers """
        sentence = np.array(
            list(self.vocabProcessor.transform([x[0] for x in sentence])))
        sentence = pad_sequences(
            sentence,
            maxlen=self.vocabProcessor.max_document_length,
            value=0.)
        """ Predict sentence """
        pred_score = self.model.predict(sentence)
        return pred_score

    def predict_list(self,
                     list_of_sentences=[[''], ['']],
                     vocab_name='vocabProc'):
        """ Load vocabulary processor """
        # restore is a classmethod returning a new instance; assign its result
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)
        """ Transform sentences to a matrix of numbers """
        sentences = np.array(
            list(self.vocabProcessor.transform([x[0]
                                                for x in list_of_sentences])))
        sentences = pad_sequences(
            sentences,
            maxlen=self.vocabProcessor.max_document_length,
            value=0.)
        """ Predict sentences """
        pred_score = self.model.predict(sentences)
        return pred_score
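
A hypothetical end-to-end run of the class above, following the call order its methods imply; the model name is a placeholder and the file names are the method defaults:

cnn = CNN(batch_size=64)
cnn.load_dataset_training('vocabProc', filename='datasetWithoutNeutral')
cnn.create_cnn_architecture_two_layers('sentiment_cnn')
cnn.train_and_save('sentiment_cnn', nb_epochs=5)
print(cnn.predict_one_sentence([['what a wonderful game']]))
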
Example no. 9
"""
    Load dataset and load the model for evaluation
"""

tf.reset_default_graph()
""" Load the dataset """
#test_x, test_y = load_csv('testDatasetWithNeuOne', target_column = 1)
#test_x, test_y = load_csv('testDatasetWithNeuTwo', target_column = 1)
#test_x, test_y = load_csv('testDatasetWithOutNeuOne', target_column = 1)
test_x, test_y = load_csv('testDatasetWithOutNeuTwo', target_column=1)
""" Count max words from the longest sentence """
#max_words = max([len(x[0].split(" ")) for x in test_x])
max_words = 2132
""" Get vocabulare size from longest sentence """
vocab = VocabularyProcessor(max_words)
vocab = vocab.restore('vocabProc')
""" Encode pos, neu and neg to numbers """
labelEncoder = LabelEncoder()
labelEncoder.fit(test_y)
test_y = labelEncoder.transform(test_y)
""" Change the list of sentences to a list of sequence of words """
test_x = np.array(list(vocab.transform([x[0] for x in test_x])))
""" Pad the sequences to fit the longest sentence """
test_x = pad_sequences(test_x, maxlen=max_words, value=0.)
""" Convert labels to binary vector """
test_y = to_categorical(test_y, nb_classes=2)
#test_y= to_categorical(test_y, nb_classes = 3)
""" Create the same neural network as the one that is going to be loaded. """
cnn_model = input_data(shape=[None, max_words], name='input')
cnn_model = tflearn.embedding(cnn_model,
                              input_dim=len(vocab.vocabulary_),
Example no. 10
import tflearn
from tflearn.data_utils import VocabularyProcessor
import sys

# get all arguments
games_to_predict = sys.argv[1:]

if len(games_to_predict) == 0:
    print("Type games to predict as arguments when you run the script")
    exit()

# load the saved vocabulary processor
# (restore is a classmethod returning a new instance; assign its result)
word_processor = VocabularyProcessor.restore("wordprocessor")

# create and load ML model
net = tflearn.input_data([None, 15])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=.8)
net = tflearn.fully_connected(net, 11, activation='softmax')
net = tflearn.regression(net)  # defaults: optimizer='adam', learning_rate=0.001

model = tflearn.DNN(net, tensorboard_verbose=0)
model.load("model.tfl")

# use labels for output
ratings = [
    'Unbearable',  # 72
    'Disaster', # 4
    'Awful', # 664
    'Painful', # 340