Example #1
def build_nce_model(num_words, num_docs, doc_embedding_size=doc_embedding_size, word_embedding_size=word_embedding_size):
    X1 = tflearn.input_data(shape=[None, 1])
    X2 = tflearn.input_data(shape=[None, 3])
    
    Y = tf.placeholder(tf.float32, [None, 1])

    d1, = tflearn.embedding(X1, input_dim=num_docs, output_dim=doc_embedding_size)
    w1, w2, w3 = tflearn.embedding(X2, input_dim=num_words, output_dim=word_embedding_size)

    embedding_layer = tflearn.merge([d1, w1, w2, w3], mode='concat')

    num_classes = num_words
    dim = doc_embedding_size + 3*word_embedding_size
        
    with tf.variable_scope("NCELoss"):
        weights = tflearn.variables.variable('W', [num_classes, dim])
        biases  = tflearn.variables.variable('b', [num_classes])

        batch_loss = tf.nn.nce_loss(weights, biases, embedding_layer, Y, num_sampled=100, num_classes=num_classes)
        loss = tf.reduce_mean(batch_loss)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    
    trainop = tflearn.TrainOp(loss=loss, optimizer=optimizer,
                          metric=None, batch_size=32)

    trainer = tflearn.Trainer(train_ops=trainop, tensorboard_verbose=0, checkpoint_path='embedding_model_nce')
    return trainer, X1, X2, Y
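A minimal usage sketch (not part of the original example), assuming arrays `doc_ids` of shape [N, 1], `context_words` of shape [N, 3], and `target_words` of shape [N, 1]; tflearn's Trainer is fed with a feed dict over the returned placeholders:

trainer, X1, X2, Y = build_nce_model(num_words, num_docs)
trainer.fit({X1: doc_ids, X2: context_words, Y: target_words},
            n_epoch=10, run_id='doc2vec_nce')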
Example #2
def do_cnn_doc2vec(trainX, testX, trainY, testY):
    global max_features
    print "CNN and doc2vec"

    #trainX = pad_sequences(trainX, maxlen=max_features, value=0.)
    #testX = pad_sequences(testX, maxlen=max_features, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None,max_features], name='input')
    network = tflearn.embedding(network, input_dim=1000000, output_dim=128,validate_indices=False)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')
    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=100,run_id="review")
Example #3
def do_rnn(trainX, testX, trainY, testY):
    global n_words
    # Data preprocessing
    # Sequence padding
    print "GET n_words embedding %d" % n_words


    trainX = pad_sequences(trainX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    testX = pad_sequences(testX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, MAX_DOCUMENT_LENGTH])
    net = tflearn.embedding(net, input_dim=n_words, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training



    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
             batch_size=32,run_id="maidou")
Example #4
def do_rnn(trainX, testX, trainY, testY):
    max_document_length=64
    y_test=testY
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=64)
    net = tflearn.lstm(net, 64, dropout=0.1)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0,tensorboard_dir="dga_log")
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=10,run_id="dga",n_epoch=1)

    y_predict_list = model.predict(testX)
    #print y_predict_list

    y_predict = []
    for i in y_predict_list:
        print  i[0]
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(y_test, y_predict))
    print metrics.confusion_matrix(y_test, y_predict)
Example #5
def do_rnn(x,y):
    global max_document_length
    print "RNN"
    trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.4, random_state=0)
    y_test=testY

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=0.1, show_metric=True,
              batch_size=10,run_id="webshell",n_epoch=5)

    y_predict_list=model.predict(testX)
    y_predict=[]
    for i in y_predict_list:
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    do_metrics(y_test, y_predict)
Example #6
def  do_cnn(trainX, trainY,testX, testY):
    global n_words
    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    testX = pad_sequences(testX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None, MAX_DOCUMENT_LENGTH], name='input')
    network = tflearn.embedding(network, input_dim=n_words+1, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.5)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')
    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY, n_epoch = 20, shuffle=True, validation_set=(testX, testY), show_metric=True, batch_size=32)
def generate_net(embedding):
    net = tflearn.input_data([None, 200])
    net = tflearn.embedding(net, input_dim=300000, output_dim=128)
    net = tflearn.lstm(net, 128)
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam',
                             loss='categorical_crossentropy')
    return net
Example #8
def build_model(num_words, num_docs,
		doc_embedding_size=doc_embedding_size, word_embedding_size=word_embedding_size):
    input_layer1 = tflearn.input_data(shape=[None, 1])
    input_layer2 = tflearn.input_data(shape=[None, 3])

    d1, = tflearn.embedding(input_layer1, input_dim=num_docs, output_dim=doc_embedding_size)
    w1, w2, w3 = tflearn.embedding(input_layer2, input_dim=num_words, output_dim=word_embedding_size)

    embedding_layer = tflearn.merge([d1, w1, w2, w3], mode='concat')
    softmax = tflearn.fully_connected(embedding_layer, num_words, activation='softmax')

    optimizer = tflearn.optimizers.Adam(learning_rate=0.001)
    # optimizer = tflearn.optimizers.SGD(learning_rate=0.1)

    metric = tflearn.metrics.Accuracy()
    net = tflearn.regression(softmax, optimizer=optimizer, metric=metric, batch_size=16,
                             loss='categorical_crossentropy')

    model = tflearn.DNN(net, tensorboard_verbose=0, checkpoint_path='embedding_model')
    return model
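A possible training call for this two-input network (a sketch, not from the source): tflearn's DNN.fit accepts one array per input_data layer, so the document ids and the three context-word ids go in as a list, with one-hot targets of width num_words; `doc_ids`, `context_words`, and `target_one_hot` are assumed arrays.

model = build_model(num_words, num_docs)
model.fit([doc_ids, context_words], target_one_hot, n_epoch=10, show_metric=True)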
def build(embedding_size=(400000, 50), train_embedding=False, hidden_dims=128,
          learning_rate=0.001):
    net = tflearn.input_data([None, 200])
    net = tflearn.embedding(net, input_dim=embedding_size[0],
                            output_dim=embedding_size[1],
                            trainable=train_embedding, name='EmbeddingLayer')
    net = tflearn.lstm(net, hidden_dims)
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=learning_rate,
                             loss='categorical_crossentropy')
    return net
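Because the embedding layer here is frozen by default (trainable=train_embedding) and named 'EmbeddingLayer', it is presumably meant to be filled with pretrained vectors (e.g. a 400000 x 50 GloVe matrix). A hedged sketch of one way to inject them once the DNN exists; `glove_matrix` is an assumed NumPy array of shape embedding_size:

model = tflearn.DNN(build(), tensorboard_verbose=0)
# look up the embedding weights by the layer name given above
emb_weights = tflearn.variables.get_layer_variables_by_name('EmbeddingLayer')[0]
model.set_weights(emb_weights, glove_matrix)  # copy pretrained vectors into the frozen layer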
Example #10
def do_cnn(x,y):
    global max_document_length
    print "CNN and tf"
    trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.4, random_state=0)
    y_test=testY

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None,max_document_length], name='input')
    network = tflearn.embedding(network, input_dim=1000000, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')

    model = tflearn.DNN(network, tensorboard_verbose=0)
    #if not os.path.exists(pkl_file):
        # Training
    model.fit(trainX, trainY,
                  n_epoch=5, shuffle=True, validation_set=0.1,
                  show_metric=True, batch_size=100,run_id="webshell")
    #    model.save(pkl_file)
    #else:
    #    model.load(pkl_file)

    y_predict_list=model.predict(testX)
    #y_predict = list(model.predict(testX,as_iterable=True))

    y_predict=[]
    for i in y_predict_list:
        print  i[0]
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)
    print 'y_predict_list:'
    print y_predict_list
    print 'y_predict:'
    print  y_predict
    #print  y_test

    do_metrics(y_test, y_predict)
def run():
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=20000, output_dim=128)
    net = tflearn.bidirectional_rnn(
        net, tflearn.BasicLSTMCell(128), tflearn.BasicLSTMCell(128))
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(
        net, optimizer='adam', loss='categorical_crossentropy')

    m = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=2)
    m.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=64)
    m.save('models/bidirectional_rnn.tfl')
Example #12
def build():
    network = input_data([None, Meta.max_string_len])
    network = embedding(network, input_dim=Meta.max_one_hot, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = dropout(network, 0.5)
    network = lstm(network, 128)
    # network = fully_connected(network, 20)
    network = fully_connected(network, 2, activation='softmax')
    network = tflearn.regression(network, optimizer='adam', learning_rate=0.001, loss='categorical_crossentropy')
    model = tflearn.DNN(network, tensorboard_verbose=0)
    return model
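A possible way to call this builder (a sketch, assuming X holds id sequences already padded to Meta.max_string_len and Y holds one-hot labels for the 2 classes):

model = build()
model.fit(X, Y, n_epoch=10, validation_set=0.1, show_metric=True, batch_size=32)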
Example #13
    def test_recurrent_layers(self):

        X = [[1, 3, 5, 7], [2, 4, 8, 10], [1, 5, 9, 11], [2, 6, 8, 0]]
        Y = [[0., 1.], [1., 0.], [0., 1.], [1., 0.]]

        with tf.Graph().as_default():
            g = tflearn.input_data(shape=[None, 4])
            g = tflearn.embedding(g, input_dim=12, output_dim=4)
            g = tflearn.lstm(g, 6)
            g = tflearn.fully_connected(g, 2, activation='softmax')
            g = tflearn.regression(g, optimizer='sgd', learning_rate=1.)

            m = tflearn.DNN(g)
            m.fit(X, Y, n_epoch=300, snapshot_epoch=False)
            self.assertGreater(m.predict([[5, 9, 11, 1]])[0][1], 0.9)
Example #14
def do_rnn(trainX, testX, trainY, testY):
    global max_sequences_len
    global max_sys_call
    # Data preprocessing
    # Sequence padding

    trainX = pad_sequences(trainX, maxlen=max_sequences_len, value=0.)
    testX = pad_sequences(testX, maxlen=max_sequences_len, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY_old=testY
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    print "GET max_sequences_len embedding %d" % max_sequences_len
    print "GET max_sys_call embedding %d" % max_sys_call

    net = tflearn.input_data([None, max_sequences_len])
    net = tflearn.embedding(net, input_dim=max_sys_call+1, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.3)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.1,
                             loss='categorical_crossentropy')

    # Training



    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
             batch_size=32,run_id="maidou")

    y_predict_list = model.predict(testX)
    #print y_predict_list

    y_predict = []
    for i in y_predict_list:
        #print  i[0]
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    #y_predict=to_categorical(y_predict, nb_classes=2)

    print(classification_report(testY_old, y_predict))
    print metrics.confusion_matrix(testY_old, y_predict)
Example #15
def bi_lstm(trainX, trainY,testX, testY):
    trainX = pad_sequences(trainX, maxlen=200, value=0.)
    testX = pad_sequences(testX, maxlen=200, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data(shape=[None, 200])
    net = tflearn.embedding(net, input_dim=20000, output_dim=128)
    net = tflearn.bidirectional_rnn(net, BasicLSTMCell(128), BasicLSTMCell(128))
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=2)
    model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=64,run_id="rnn-bilstm")
Example #16
def run():
    net = tflearn.input_data([None, 100])
    # embed int vector to compact real vector
    net = tflearn.embedding(net, input_dim=10000, output_dim=128)
    # the magic of the rnn
    # if dynamic lstm, backprop thru time till the seq ends,
    # but padding is needed to feed input dim; tail not used
    net = tflearn.lstm(net, 128, dropout=0.8, dynamic=True)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam',
                             learning_rate=0.001,
                             loss='categorical_crossentropy')

    m = tflearn.DNN(net)
    m.fit(trainX, trainY, validation_set=(testX, testY),
          show_metric=True, batch_size=32)
    m.save('models/lstm.tfl')

run()
Example #17
def do_cnn_word2vec_2d_345(trainX, testX, trainY, testY):
    global max_features
    global max_document_length
    print "CNN and word2vec_2d_345"
    y_test = testY

    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None,max_document_length,max_features,1], name='input')
    network = tflearn.embedding(network, input_dim=1, output_dim=128,validate_indices=False)
    branch1 = conv_2d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_2d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_2d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool_2d(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')
    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=100,run_id="sms")

    y_predict_list = model.predict(testX)
    print y_predict_list

    y_predict = []
    for i in y_predict_list:
        print  i[0]
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(y_test, y_predict))
    print metrics.confusion_matrix(y_test, y_predict)
Example #18
def lstm(trainX, trainY,testX, testY):
    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=10000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=32,run_id="rnn-lstm")
Example #19
def do_rnn_wordbag(trainX, testX, trainY, testY):
    global max_document_length
    print "RNN and wordbag"

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=10,run_id="review",n_epoch=5)
Example #20
    yTr = numpy.reshape(yTr, (yTr.shape[0], 1))
    yTe = numpy.reshape(yTe, (yTe.shape[0], 1))

    print(xTr.shape, xTe.shape, yTr.shape, yTe.shape)

    x = tf.placeholder(shape=(None, 9), dtype=tf.float32)
    y_ = tf.placeholder(shape=(None, 1), dtype=tf.float32)
    keep_prob = tf.placeholder(tf.float32)

    batch_size = 75
    epochs = 800
    lr = 0.00001

    net = tflearn.input_data(placeholder=x)
    net = tflearn.embedding(net,
                            input_dim=21,
                            output_dim=32,
                            weights_init='xavier')
    net = tflearn.fully_connected(net, 100, activation='prelu')
    net = tflearn.layers.normalization.batch_normalization(net)
    net = tflearn.dropout(net, 0.1)
    net = tflearn.fully_connected(net, 1, activation='sigmoid')

    loss = tf.reduce_mean(tf.square(net - y_))
    train_op = tf.train.RMSPropOptimizer(lr).minimize(loss)
    accuracy = tf.contrib.metrics.streaming_root_mean_squared_error(net, y_)

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        sess.run(tf.initialize_local_variables())
        tflearn.is_training(True, session=sess)
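        # --- the original snippet is cut off here; a minimal training-loop sketch ---
        # --- it assumes xTr / yTr are the reshaped training arrays printed above ---
        for epoch in range(epochs):
            for start in range(0, len(xTr), batch_size):
                xb = xTr[start:start + batch_size]
                yb = yTr[start:start + batch_size]
                sess.run(train_op, feed_dict={x: xb, y_: yb})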
# Create train, validation, and test sets
train_x, train_y = features[train_index], labels[train_index]
val_x, val_y = features[test_index], labels[test_index]
test_x, test_y = features[validation_index], labels[validation_index]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))
'''

# Create Recurrent Neural Network Model
# input dim = vocab size
# fully connected size = no of unique y labels
net = tflearn.input_data([None, max_length])
net = tflearn.embedding(net, input_dim=vocab_size, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, no_of_unique_y_labels, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')
model = tflearn.DNN(net, tensorboard_verbose=0)

# Train the RNN with train and test data
model.fit(train_x, train_y, validation_set=(test_x, test_y), show_metric=True)

'''
# Manually save the model
# need to make folder SavedModels
model.save('SavedModels/model.tfl')
print(colored('Model Saved!', 'red'))
Example #22
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
# basically one-hot encoding (OHE).

# Network building
"""Another confusing part. A normal FFNN will not suffice for this problem, so a
recurrent neural network is employed in order to capture the sequential order of text (Like in a movie review!)
and is programmed below. An important piece of a RNN is the LSTM which helps "remember" the sequence. 

Will be further covered....
"""

net = tflearn.input_data([None,
                          100])  #input layer, [batch_size, size of input]
net = tflearn.embedding(
    net, input_dim=10000, output_dim=128
)  # we use the previous layer's output as the next layer's input.
# In an embedding, words are represented by dense vectors, where
# a vector is the projection of the word into a continuous
# vector space. There are 10k distinct words in these reviews, so input_dim is 10000.

#The position of a word in the learned vector space is referred to as its embedding.

# a word embedding can be learned as part of a deep learning model.
#This can be a slower approach, but tailors the model to a specific
#training dataset.

net = tflearn.lstm(net, 128, dropout=0.8)
"""Quick summary: 

Every review is padded so that it has 100 dimensions, and the words in each review (there are 10k total words)
doc=doc.split('\n')
doc=map(lambda s: s.split('.'),doc)
doc=[item for sublist in doc for item in sublist]
splitDoc = map(makeWindows,doc)
docX=[item for sublist in splitDoc for item in sublist]

X=map(lambda t: t[0],docX)
Y=map(lambda t: t[1],docX)


# TEST. DELETE AFTER THIS LINE

net = tflearn.input_data([None, WINDOW_SIZE -1])
# Masking is not required for embedding, sequence length is computed prior to
# the embedding op and assigned as 'seq_length' attribute to the returned Tensor.
net = tflearn.embedding(net, input_dim=3, output_dim=3)
net = tflearn.lstm(net, 12, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,loss='categorical_crossentropy')


X=np.array([
	    [0,0,1],
	    [0,0,1],
	    [0,1,1],
	    [0,1,0],
	    [1,0,0]])

Y=np.array([1,1,0,0,0])

trainY


# In[19]:

pd.DataFrame(trainY).tail()


# # Network Building

# In[20]:

# The first element is the "batch size" which we set to "None"
# The second element is set to "100" because we set the max sequence length to "100"
net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128) # input_dim: Vocabulary size (number of ids)
net = tflearn.lstm(net, 128, dropout=0.8) # Long Short Term Memory Recurrent Layer
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, 
                         optimizer='adam', 
                         learning_rate=1e-4,
                         loss='categorical_crossentropy')


# # Training

# In[21]:

model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True, batch_size=32)
vocab_size = len(word_dict)

Xtrain = []  # input word
Ytrain = []  # output word
for data_word in data:
    tempx = to_one_hot(word_dict[data_word[0]], vocab_size)
    tempy = to_one_hot(word_dict[data_word[1]], vocab_size)
    Xtrain.append(tempx)
    Ytrain.append(tempy)
# convert them to numpy arrays
Xtrain = np.asarray(Xtrain)
Ytrain = np.asarray(Ytrain)

g = tflearn.input_data(shape=[None, max_len_seq])
g = tflearn.embedding(g, input_dim=len(word_dict), output_dim=128)
g = tflearn.lstm(g, 512, return_seq=True)
g = tflearn.dropout(g, 0.5)
g = tflearn.lstm(g, 512)
g = tflearn.dropout(g, 0.5)
g = tflearn.fully_connected(g, len(word_dict), activation='softmax')
g = tflearn.regression(g,
                       optimizer='adam',
                       loss='categorical_crossentropy',
                       learning_rate=0.001)

m = tflearn.SequenceGenerator(g,
                              dictionary=word_dict,
                              seq_maxlen=max_len_seq,
                              clip_gradients=5.0,
                              checkpoint_path='./checkpoints/sayton_model')
Example #26
n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)

# pickle.dump (X_train, open ("xtrain.p", b))
# pickle.dump (X_test, open ("xtest.p", b))

# X_train = pickle.load (open ("xtrain.p", rb))
# X_test = pickle.load (open ("xtest.p", rb))

### Models

print('Build model')

net = tflearn.input_data([None, model_size])
net = tflearn.embedding(net, input_dim=n_words, output_dim=lstm_size[0])
for i in range(len(lstm_size)):
    if i < len(lstm_size) - 1:
        net = tflearn.gru(net,
                          lstm_size[i],
                          activation=activation_function,
                          return_seq=True)
        net = tflearn.dropout(net, dropout_ratio)
    else:
        net = tflearn.gru(net, lstm_size[i], activation=activation_function)
        net = tflearn.dropout(net, dropout_ratio)
net = tflearn.fully_connected(net, len(qualities), activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         learning_rate=0.001,
                         loss='categorical_crossentropy')
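The snippet stops after the regression layer; a minimal continuation sketch, assuming the padded X_train/Y_train, X_test/Y_test arrays and nb_epochs used in the similar snippet in Example #33 below:

model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(X_train, Y_train, validation_set=(X_test, Y_test), show_metric=True,
          batch_size=32, n_epoch=nb_epochs)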
# print(train_x[0:2], train_y[0:2])

# Pad to fixed-length vectors; the fixed length here is 100
train_x = pad_sequences(train_x, maxlen=100, value=0)
# print(train_x[0:2])
test_x = pad_sequences(test_x, maxlen=100, value=0)

# One-hot (binary) label vectors
train_y = to_categorical(train_y, nb_classes=2)  # [0]=>[1. 0.]
test_y = to_categorical(test_y, nb_classes=2)   # [1]=>[0. 1.]
# print(train_y[0:2])

# Build a convolutional network using 1-D convolutions

input = input_data(shape=[None, 100], name='input')
embedding = tflearn.embedding(input, input_dim=10000, output_dim=128)
branch1 = conv_1d(embedding, 128, 3, padding='valid', activation='relu', regularizer='L2')
branch2 = conv_1d(embedding, 128, 4, padding='valid', activation='relu', regularizer='L2')
branch3 = conv_1d(embedding, 128, 5, padding='valid', activation='relu', regularizer='L2')
network = merge([branch1, branch2, branch3], mode='concat', axis=1)
network = tf.expand_dims(network, 2)
network = global_max_pool(network)
network = dropout(network, 0.5)
network = fully_connected(network, 2, activation='softmax')
network = regression(network, optimizer='adam', learning_rate=0.001,
                     loss='categorical_crossentropy', name='target')
"""
Start training
"""
model = tflearn.DNN(network, tensorboard_verbose=0, tensorboard_dir='logs')
model.fit(train_x, train_y, n_epoch=1, shuffle=True, validation_set=(test_x, test_y), show_metric=True, batch_size=32)
Example #28
# import our chat-bot
with open('dataset_chatbot_helptree.json') as json_data:
    intents = json.load(json_data)

# clean up all of our data structures
data = pickle.load(open("training_data", "rb"))
words_data = data['words']
classes = data['classes']
x_train = data['train_x']
y_train = data['train_y']

# reset underlying graph data
ops.reset_default_graph()
# Build neural network
layer = tflearn.input_data(shape=[None, len(x_train[0])])
layer = tflearn.embedding(layer, input_dim=10000, output_dim=128)
layer = tflearn.lstm(layer, 256, dropout=0.8)
layer = tflearn.fully_connected(layer, len(y_train[0]), activation='softmax')
layer = tflearn.regression(layer,
                           optimizer='adam',
                           learning_rate=0.001,
                           loss='categorical_crossentropy')

# Define model and setup tensorboard
model = tflearn.DNN(layer, tensorboard_dir='tflearn_logs')
# load our saved model
model.load('./model.tflearn')


def clean_up_sentence(sentence):
    # tokenize the pattern
from __future__ import division, print_function, absolute_import


import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb


train, test, _= imdb.load_data(path='imdb.pkl', n_words=10000, valid_portion=0.1)

trainX, trainY = train
testX, testY = test

trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)

trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

net = tflearn.input_data([None,100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001, loss='categorical_crossentropy')

model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True, batch_size=32)

    train_split_proportion = 0.8
    validation_split_proportion = 0.5
    split = int(len(features) * train_split_proportion)
    trainX, testX = features[:split], features[split:]
    trainY, testY = labels[:split], labels[split:]
    #print(trainY)
    trainY = to_categorical(trainY, 2)
    testY = to_categorical(testY, 2)

tf.reset_default_graph()
tflearn.config.init_training_mode()

net = tflearn.input_data([None, MAX_SENTENCE_LENGHT])
net = tflearn.embedding(net,
                        input_dim=NUM_DICTIONARY_WORDS + 1,
                        output_dim=100)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         learning_rate=0.001,
                         loss='categorical_crossentropy')

SubjectivityLSTM_model = tflearn.DNN(net, tensorboard_verbose=0)

if TRAIN:
    SubjectivityLSTM_model.fit(trainX,
                               trainY,
                               validation_set=(testX, testY),
                               n_epoch=8,
Example #31
# valid_y = [y[s] for s in sidx[n_train:]]

train_x = x[:(n_samples - 8000)]  # last 8k is all travel
train_y = y[:(n_samples - 8000)]
valid_x = x[(n_samples - 8000):]
valid_y = y[(n_samples - 8000):]

trainX = pad_sequences(train_x, maxlen=120, value=0.)
validX = pad_sequences(valid_x, maxlen=120, value=0.)

trainY = pad_sequences(train_y, maxlen=120, value=0.)
validY = pad_sequences(valid_y, maxlen=120, value=0.)

print("generating model...")
g = tflearn.input_data([None, 120])
g = tflearn.embedding(g, input_dim=10000, output_dim=256)

g = tflearn.lstm(g, 256, activation='tanh')
g = tflearn.dropout(g, 0.3)

# g = tflearn.lstm(g, 128, dynamic=True)
# g = tflearn.dropout(g, 0.3)

g = tflearn.fully_connected(g, 120, activation='softmax')
g = tflearn.regression(g,
                       optimizer='adam',
                       loss='categorical_crossentropy',
                       learning_rate=0.001)

m = tflearn.DNN(g, clip_gradients=5.0)
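The fit call is missing from this example; a hedged sketch of how training might proceed with the padded arrays prepared above:

m.fit(trainX, trainY, validation_set=(validX, validY), show_metric=True,
      batch_size=64)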
val = len(vp.vocabulary_)
print(val)
tweets_parsed = vp.transform(tweets)
vp.save('my_dictionary')
print(vp)

trainX = tweets_parsed
trainY = tflearn.data_utils.to_categorical(content1, nb_classes=0)

filtered_gen = (item for item in trainX)
gen_to_list = list(filtered_gen)

trainX1 = pad_sequences(gen_to_list, maxlen=120, value=0.)
#print(trainX1)



# Network building
net = tflearn.input_data([None, 120])
net = tflearn.embedding(net, input_dim=val, output_dim=64)
net = tflearn.lstm(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam',loss='binary_crossentropy')


# Training
model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=0)
model.fit(trainX1, trainY, show_metric=True, batch_size=64)

Example #33
X_train = pad_sequences(X_train, maxlen=model_size, value=0.)
X_test = pad_sequences(X_test, maxlen=model_size, value=0.)

n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)

# pickle.dump (X_train, open ("xtrain.p", b))
# pickle.dump (X_test, open ("xtest.p", b))

# X_train = pickle.load (open ("xtrain.p", rb))
# X_test = pickle.load (open ("xtest.p", rb))

### Models

print('Build model')

net = tflearn.input_data([None, model_size])
net = tflearn.embedding(net, input_dim=n_words, output_dim=128)
# net = tflearn.lstm(net, 512, dropout=0.5, return_seq=True)
net = tflearn.lstm(net, 512, dropout=0.5)
net = tflearn.fully_connected(net, len (qualities), activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')

print ('Train model')

model = tflearn.DNN(net, tensorboard_verbose=3, tensorboard_dir = "logdir/lstm")

print ('Predict')
model.fit(X_train, Y_train, validation_set=(X_test, Y_test), show_metric=True,
          batch_size=32, n_epoch = nb_epochs)
test3_protein = prepare_data(data_dir, "./data/test3_sps", vocab_size_protein,
                             vocab_protein, protein_MAX_size, 1)
test3_compound = prepare_data(data_dir, "./data/test3_smile",
                              vocab_size_compound, vocab_compound,
                              comp_MAX_size, 0)
test3_IC50 = read_labels("./data/test3_ic50")

## separating train,dev, test data
compound_train, compound_dev, IC50_train, IC50_dev, protein_train, protein_dev = train_dev_split(
    train_protein, train_compound, train_IC50, dev_perc, comp_MAX_size,
    protein_MAX_size, batch_size)

## RNN for protein
prot_data = input_data(shape=[None, protein_MAX_size])
prot_embd = tflearn.embedding(prot_data,
                              input_dim=vocab_size_protein,
                              output_dim=GRU_size_prot)
prot_gru_1 = tflearn.gru(prot_embd,
                         GRU_size_prot,
                         initial_state=prot_init_state_1,
                         trainable=True,
                         return_seq=True,
                         restore=False)
prot_gru_1 = tf.stack(prot_gru_1, axis=1)
prot_gru_2 = tflearn.gru(prot_gru_1,
                         GRU_size_prot,
                         initial_state=prot_init_state_2,
                         trainable=True,
                         return_seq=True,
                         restore=False)
prot_gru_2 = tf.stack(prot_gru_2, axis=1)
# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY)
testY = to_categorical(testY)

# Building convolutional network
network = input_data(shape=[None, 100], name='input')
network = tflearn.embedding(network, input_dim=10000, output_dim=128)
branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
network = merge([branch1, branch2, branch3], mode='concat', axis=1)
network = tf.expand_dims(network, 2)
network = global_max_pool(network)
network = dropout(network, 0.5)
network = fully_connected(network, 2, activation='softmax')
network = regression(network, optimizer='adam', learning_rate=0.001,
                     loss='categorical_crossentropy', name='target')
# Training
model = tflearn.DNN(network, tensorboard_verbose=0)
model.fit(trainX, trainY, n_epoch = 5, shuffle=True, validation_set=(testX, testY), show_metric=True, batch_size=32)
Example #36
    def create_model(self,
                     l,
                     tN,
                     N=100000,
                     d=10,
                     K=5,
                     H=1000,
                     m=0.05,
                     reuse=False):
        '''
    N = 1000000 (Paper)
    d = Unknown
    '''
        with tf.variable_scope('TagSpace', reuse=reuse):
            lr = tf.placeholder('float32', shape=[1], name='lr')
            doc = tf.placeholder('float32', shape=[None, l], name='doc')
            tag_flag = tf.placeholder('float32',
                                      shape=[None, tN],
                                      name='tag_flag')

            doc_embed = tflearn.embedding(doc, input_dim=N, output_dim=d)
            self.lt_embed = lt_embed = tf.Variable(
                tf.random_normal([tN, d], stddev=0.1))

            net = tflearn.conv_1d(doc_embed, H, K, activation='tanh')
            net = tflearn.max_pool_1d(net, K)
            net = tflearn.tanh(net)
            self.logit = logit = tflearn.fully_connected(net,
                                                         d,
                                                         activation=None)

            zero_vector = tf.zeros(shape=(1, 1), dtype=tf.float32)

            logit = tf.expand_dims(logit, 1)
            logit_set = tf.concat([logit for i in range(tN)], axis=1)

            tag_flag_ex = tf.expand_dims(tag_flag, 2)
            tg = tf.concat([tag_flag_ex for i in range(d)], axis=2)

            self.tag_logit = tf.reduce_sum(tf.multiply(
                logit_set, tf.multiply(tf.ones_like(tg), lt_embed)),
                                           axis=2)

            self.positive_logit = positive_logit = tf.reduce_sum(tf.multiply(
                logit_set, tf.multiply(tg, lt_embed)),
                                                                 axis=2)
            self.f_positive = f_positive = tf.map_fn(
                lambda x: (tf.boolean_mask(x[0], x[1]), True),
                (positive_logit, tf.not_equal(positive_logit, zero_vector)))
            positive = tf.reduce_min(f_positive[0], axis=1)
            self.positive = positive

            tag_flag_ex = tf.expand_dims(1 - tag_flag, 2)
            tg = tf.concat([tag_flag_ex for i in range(d)], axis=2)
            negative_logit = tf.reduce_sum(tf.multiply(
                logit_set, tf.multiply(tg, lt_embed)),
                                           axis=2)

            self.f_negative = f_negative = tf.map_fn(
                lambda x: (tf.boolean_mask(x[0], x[1]), True),
                (negative_logit, tf.not_equal(negative_logit, zero_vector)))
            self.negative = negative = tf.reduce_max(f_negative[0], axis=1)

            self.f_loss = f_loss = tf.reduce_mean(
                tf.reduce_max([
                    tf.reduce_min([
                        tf.expand_dims(m - positive + negative, 1),
                        tf.expand_dims(tf.fill([tf.shape(doc)[0]], 10e7), 1)
                    ],
                                  axis=0),
                    tf.zeros([tf.shape(doc)[0], 1])
                ],
                              axis=0))
            params = tf.trainable_variables()

            opt = tf.train.AdamOptimizer(learning_rate=lr[0])
            gradients = tf.gradients(f_loss, params)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
            self.op = opt.apply_gradients(zip(clipped_gradients, params))
Algorithm code in Python 2.7
from __future__ import division, print_function, absolute_import
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000, valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')

# Training
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
          batch_size=32)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=number_classes) #y as one hot
testY = to_categorical(testY, nb_classes=number_classes)   #y as one hot
print("end padding & transform to one hot...")
#--------------------------------------------------------------------------------------------------
    # cache trainX,trainY,testX,testY for next time use.
#    with open(f_cache, 'w') as f:
#        pickle.dump((trainX,trainY,testX,testY,vocab_size),f)
#else:
#    print("traning data exists in cache. going to use it.")

# 3.Building convolutional network
######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC##############################################################################################
#(shape=None, placeholder=None, dtype=tf.float32,data_preprocessing=None, data_augmentation=None,name="InputData")
network = input_data(shape=[None, 100], name='input') #[None, 100] `input_data` is used as a data entry (placeholder) of a network. This placeholder will be fed with data when training
network = tflearn.embedding(network, input_dim=vocab_size, output_dim=128) #TODO [None, 100,128].embedding layer for a sequence of ids. network: Incoming 2-D Tensor. input_dim: vocabulary size, output_dim: embedding size
         #conv_1d(incoming,nb_filter,filter_size)
branch1 = conv_1d(network, 128, 1, padding='valid', activation='relu', regularizer="L2")
branch2 = conv_1d(network, 128, 2, padding='valid', activation='relu', regularizer="L2")
branch3 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps1, nb_filters]. padding:"VALID",only ever drops the right-most columns
branch4 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps2, nb_filters]
branch5 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps3, nb_filters]
network = merge([branch1, branch2, branch3,branch4,branch5], mode='concat', axis=1) # merge a list of `Tensor` into a single one.===>[batch_size, new steps1+new step2+new step3, nb_filters]
network = tf.expand_dims(network, 2) #[batch_size, new steps1+new step2+new step3,1, nb_filters] Inserts a dimension of 1 into a tensor's shape
network = global_max_pool(network) #[batch_size, pooled dim]
network = dropout(network, 0.5) #[batch_size, pooled dim]
network = fully_connected(network, number_classes, activation='softmax') #matmul([batch_size, pooled_dim],[pooled_dim,2])---->[batch_size,number_classes]
top5 = tflearn.metrics.Top_k(k=5)
network = regression(network, optimizer='adam', learning_rate=0.001,loss='categorical_crossentropy', name='target') #metric=top5
######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC################################################################################################
# 4.Training
Example #39
# IMDB Dataset loading
train, test, _ = imdb.load_data(path="imdb.pkl", n_words=10000, valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# NOTE: Padding is required for dimension consistency. This will pad sequences
# with 0 at the end, until it reaches the max sequence length. 0 is used as a
# masking value by dynamic RNNs in TFLearn; a sequence length will be
# retrieved by counting non zero elements in a sequence. Then dynamic RNN step
# computation is performed according to that length.
trainX = pad_sequences(trainX, maxlen=100, value=0.0)
testX = pad_sequences(testX, maxlen=100, value=0.0)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
net = tflearn.input_data([None, 100])
# Masking is not required for embedding, sequence length is computed prior to
# the embedding op and assigned as 'seq_length' attribute to the returned Tensor.
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8, dynamic=True)
net = tflearn.fully_connected(net, 2, activation="softmax")
net = tflearn.regression(net, optimizer="adam", learning_rate=0.001, loss="categorical_crossentropy")

# Training
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True, batch_size=32)
# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY)
testY = to_categorical(testY)

# Building convolutional network
network = input_data(shape=[None, 100], name='input')
network = tflearn.embedding(network, input_dim=10000, output_dim=128)
branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
network = merge([branch1, branch2, branch3], mode='concat', axis=1)
network = tf.expand_dims(network, 2)
network = global_max_pool(network)
network = dropout(network, 0.5)
network = fully_connected(network, 2, activation='softmax')
network = regression(network, optimizer='adam', learning_rate=0.001,
                     loss='categorical_crossentropy', name='target')
# Training
model = tflearn.DNN(network, tensorboard_verbose=0)
model.fit(trainX, trainY, n_epoch = 5, shuffle=True, validation_set=(testX, testY), show_metric=True, batch_size=32)
Example #41
    train_X.append(current_mixture)
    train_Y.append(int(row['is_linked']))

train_X = pad_sequences(train_X)
train_Y = to_categorical(train_Y, 2)

test_X = train_X[:testing_set]
train_X = train_X[testing_set:]

test_Y = train_Y[:testing_set]
train_Y = train_Y[testing_set:]

# Building convolutional network
network = input_data(shape=[None, max_mixture_size], name='input')
network = tflearn.embedding(network, input_dim=len(corpora), output_dim=128)
branch1 = conv_1d(network,
                  128,
                  3,
                  padding='valid',
                  activation='relu',
                  regularizer="L2")
branch2 = conv_1d(network,
                  128,
                  4,
                  padding='valid',
                  activation='relu',
                  regularizer="L2")
branch3 = conv_1d(network,
                  128,
                  5,
                  padding='valid',
                  activation='relu',
                  regularizer="L2")
Example #42
def createCombined1dConvNetNeuralNetworkModelForFutureResourceUtilisation(
        input_size_states, output_size_actions, learningRate,
        rowsFutureResourceUtilisationMatrix,
        columnsFutureResourceUtilisationMatrix):
    # Specify the log directory
    logdir = 'log/1d_combined/' + datetime.now().strftime("%Y%m%d-%H%M%S")

    # tflearn.init_graph(num_cores=1, gpu_memory_fraction=0.8)

    #### 1d-convolutional layers for currentState ####
    convnetCurrentState = input_data(shape=[None, input_size_states],
                                     name='input_currentState')
    convnetCurrentState = tflearn.embedding(convnetCurrentState,
                                            input_dim=input_size_states,
                                            output_dim=2)

    convnetCurrentState = conv_1d(convnetCurrentState,
                                  nb_filter=16,
                                  filter_size=5,
                                  strides=1,
                                  padding='valid',
                                  activation='relu')
    convnetCurrentState = max_pool_1d(convnetCurrentState,
                                      kernel_size=2,
                                      strides=2,
                                      padding='valid')

    convnetCurrentState = conv_1d(convnetCurrentState,
                                  nb_filter=16,
                                  filter_size=3,
                                  strides=1,
                                  padding='valid',
                                  activation='relu')
    convnetCurrentState = max_pool_1d(convnetCurrentState,
                                      kernel_size=2,
                                      strides=2,
                                      padding='valid')

    convnetCurrentState = conv_1d(convnetCurrentState,
                                  nb_filter=32,
                                  filter_size=3,
                                  strides=1,
                                  padding='valid',
                                  activation='relu')
    convnetCurrentState = max_pool_1d(convnetCurrentState,
                                      kernel_size=2,
                                      strides=2,
                                      padding='valid')

    convnetCurrentState = flatten(convnetCurrentState)

    #### 2d-convolutional layers for FutureResourceUtilisationMatrix ####
    # How to configure input_data: https://stackoverflow.com/questions/48482746/tflearn-what-is-input-data
    convnetResourceUtilisation = input_data(
        shape=[
            None, rowsFutureResourceUtilisationMatrix,
            columnsFutureResourceUtilisationMatrix, 1
        ],
        name='input_futureResourceUtilisationMatrix')

    convnetResourceUtilisation = conv_2d(convnetResourceUtilisation,
                                         nb_filter=16,
                                         filter_size=[4, 1],
                                         strides=1,
                                         padding='same',
                                         activation='relu')
    convnetResourceUtilisation = max_pool_2d(convnetResourceUtilisation,
                                             kernel_size=[1, 10],
                                             strides=1,
                                             padding='valid')

    convnetResourceUtilisation = flatten(convnetResourceUtilisation)

    # merging the outputs of both convolutional nets
    finalNet = merge_outputs([convnetResourceUtilisation, convnetCurrentState],
                             'concat')  # axis=0 is concatenation

    #### final fully connected layers ####
    finalNet = fully_connected(finalNet,
                               n_units=256,
                               weights_init='truncated_normal',
                               activation='relu')
    finalNet = dropout(finalNet, 0.5)

    finalNet = fully_connected(finalNet,
                               n_units=128,
                               weights_init='truncated_normal',
                               activation='relu')
    finalNet = dropout(finalNet, 0.5)

    finalNet = fully_connected(finalNet,
                               n_units=output_size_actions,
                               activation='softmax')
    finalNet = regression(finalNet,
                          optimizer='adam',
                          learning_rate=learningRate,
                          loss='categorical_crossentropy',
                          name='targets')

    model = tflearn.DNN(finalNet,
                        tensorboard_dir=logdir,
                        tensorboard_verbose=0)

    return model
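A possible call into this builder (sketch only; state_batch, utilisation_batch, and action_targets are assumed arrays matching the declared input shapes): tflearn lets multi-input models be fed with dicts keyed by the input and target layer names used above.

model = createCombined1dConvNetNeuralNetworkModelForFutureResourceUtilisation(
    input_size_states, output_size_actions, learningRate,
    rowsFutureResourceUtilisationMatrix, columnsFutureResourceUtilisationMatrix)
model.fit({'input_currentState': state_batch,
           'input_futureResourceUtilisationMatrix': utilisation_batch},
          {'targets': action_targets},
          n_epoch=10, show_metric=True)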
Example #43
# loading data
train, test, _ = imdb.load_data(path="datasets/imdb/imdb.pkl",
                                n_words=10000,
                               valid_portion=0.1)
x_train, y_train = train
x_test, y_test = test

# sequence padding and converting labels to binary vectors
x_train = pad_sequences(x_train, maxlen=100, value=0.)
y_train = to_categorical(y_train, nb_classes=2)
x_test = pad_sequences(x_test, maxlen=100, value=0.)
y_test = to_categorical(y_test, nb_classes=2)

# building LSTM network
RNN = tflearn.input_data([None, 100])
RNN = tflearn.embedding(RNN, input_dim=10000, output_dim=128)

RNN = tflearn.lstm(RNN, 128, dropout=0.8)
RNN = tflearn.fully_connected(RNN, 2, activation='softmax')
RNN = tflearn.regression(RNN, optimizer='adam', learning_rate=0.001,
                        loss='categorical_crossentropy')

# traning the network
model = tflearn.DNN(RNN, tensorboard_verbose=0)

model.fit(x_train, y_train, validation_set=(x_test,y_test),
         show_metric=True, batch_size=32)


# %% [markdown]
# ## Keras
Example #44
def comment_predict(data):
    """
	Predict comment sentiment using the existing model
	:return:
	"""
    # Comment data used when the model was built
    predict_data = pd.read_csv("courses.csv")

    def chinese_word_cut(text):
        """
		Use jieba word segmentation to split the Chinese text into individual words
		:param text: the full comment
		:return: the segmented comment
		"""
        return " ".join(jieba.cut(text))

    # Segment the text and store the result in a new column
    predict_data["cut_name"] = predict_data.name.apply(chinese_word_cut)

    # Determine the comment part (X) and the label part (y)
    X = predict_data["cut_name"]
    y = predict_data["type"]

    # Split the dataset into a training set (train) and a test set (test)
    # The random seed here must match the one used when the model was built
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=42)

    def get_custom_stopwords(stop_words_file):
        """
		Load the stop-word list
		:param stop_words_file:
		:return: the stop-word list
		"""
        with open(stop_words_file, encoding="utf-8") as f:
            stopwords = f.read()

        stopwords_list = stopwords.split("\n")
        custom_stopwords_list = [i for i in stopwords_list]
        return custom_stopwords_list

    # Load the stop-word list
    stop_words_file = "./stop_words/哈工大停用词表.txt"
    stopwords = get_custom_stopwords(stop_words_file)

    # Compute the feature values
    vect = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b',
                           stop_words=frozenset(stopwords))
    vect.fit(X_train)
    vocab = vect.vocabulary_

    def convert_X_to_X_word_ids(X):
        """
		Convert comments (the text part) into id sets (numeric sequences)
		:param X: the collection of comments
		:return: numeric sequences
		"""
        return X.apply(lambda x: [
            vocab[w] for w in [w.lower().strip() for w in x.split()]
            if w in vocab
        ])

    # Pad all sequences to a uniform length of 20 so comment sequences share the same format; missing positions are filled with 0
    X_train_word_ids = convert_X_to_X_word_ids(X_train)
    X_train_padded_seqs = pad_sequences(X_train_word_ids, maxlen=20, value=0)

    # Process the label set
    unique_y_labels = list(y_train.value_counts().index)
    le = preprocessing.LabelEncoder()
    le.fit(unique_y_labels)

    # Build the network
    size_of_each_vector = X_train_padded_seqs.shape[1]
    vocab_size = len(vocab)
    no_of_unique_y_labels = len(unique_y_labels)

    net = tflearn.input_data([None, size_of_each_vector])
    net = tflearn.embedding(net, input_dim=vocab_size, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.6)
    net = tflearn.fully_connected(net,
                                  no_of_unique_y_labels,
                                  activation='softmax')
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=1e-4,
                             loss='categorical_crossentropy')

    # Initialize the model
    model = tflearn.DNN(net,
                        tensorboard_verbose=0,
                        tensorboard_dir="./tflearn_data/tflearn_logs/")

    # Load the saved model
    model.load(
        "./tflearn_data/tflearn_models/2019-07-08 11.51.40.170202(200, 42)/model"
    )

    # ----------------------------------- Prediction -----------------------------------
    # Comment data to be predicted
    predict_data = data

    # Segment the comment data
    predict_data["cut_name"] = predict_data.name.apply(chinese_word_cut)

    # Set up the prediction set
    predict_X = predict_data["cut_name"]
    vect.fit(predict_X)

    # Convert to numeric id sequences
    predict_X_word_ids = convert_X_to_X_word_ids(predict_X)
    predict_X_padded_seqs = pad_sequences(predict_X_word_ids,
                                          maxlen=20,
                                          value=0)

    # Run the prediction and get the results
    predict_Y = model.predict(predict_X_padded_seqs)

    # Output the results
    # print(predict_Y)
    get_type(predict_Y)