Example #1
def train_without_pretrained_embedding():
    x, y, vocab, vocab_inv = data_helpers.load_data()
    vocab_size = len(vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/dev set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('dev shape:', x_dev.shape)
    print('vocab_size', vocab_size)
   
    batch_size = 50
    num_embed = 300
    sentence_size = x_train.shape[1]

    print('batch size', batch_size)
    print('sentence max words', sentence_size)
    print('embedding size', num_embed)

    cnn_model = setup_cnn_model(mx.gpu(0), batch_size, sentence_size, num_embed, vocab_size, dropout=0.5, with_embedding=False)
    train_cnn(cnn_model, x_train, y_train, x_dev, y_dev, batch_size)
def load_data(data_source):
    assert data_source in ["keras_data_set", "local_dir"], "Unknown data source"
    if data_source == "keras_data_set":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words, start_char=None,
                                                              oov_char=None, index_from=None)

        x_train = sequence.pad_sequences(x_train, maxlen=sequence_length, padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=sequence_length, padding="post", truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"
    else:
        x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
        vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
        y = y.argmax(axis=1)

        # Shuffle data
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x = x[shuffle_indices]
        y = y[shuffle_indices]
        train_len = int(len(x) * 0.9)
        x_train = x[:train_len]
        y_train = y[:train_len]
        x_test = x[train_len:]
        y_test = y[train_len:]

    return x_train, y_train, x_test, y_test, vocabulary_inv
Example #3
def data_iter(batch_size, num_embed, pre_trained_word2vec=False):
    """Construct data iter

    Parameters
    ----------
    batch_size: int
    num_embed: int
    pre_trained_word2vec: boolean
                        identify the pre-trained layers or not
    Returns
    ----------
    train_set: DataIter
                Train DataIter
    valid: DataIter
                Valid DataIter
    sentences_size: int
                array dimensions
    embedded_size: int
                array dimensions
    vocab_size: int
                array dimensions
    """
    print('Loading data...')
    if pre_trained_word2vec:
        word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
        x, y = data_helpers.load_data_with_word2vec(word2vec)
        # reshape for convolution input
        x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2]))
        embedded_size = x.shape[-1]
        sentences_size = x.shape[2]
        vocabulary_size = -1
    else:
        x, y, vocab, vocab_inv = data_helpers.load_data()
        embedded_size = num_embed
        sentences_size = x.shape[1]
        vocabulary_size = len(vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/valid set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Valid split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('valid shape:', x_dev.shape)
    print('sentence max words', sentences_size)
    print('embedding size', embedded_size)
    print('vocab size', vocabulary_size)

    train_set = mx.io.NDArrayIter(
        x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(
        x_dev, y_dev, batch_size)

    return train_set, valid, sentences_size, embedded_size, vocabulary_size
    def run(self):
        input_data, label_data = data_helpers.load_data()
        n_steps = 3
        n_input = 1
        n_classes = 1
        n_hidden = self.n_hidden
        batch_size = self.batch_size
        training_iters = self.training_iters
        display_step = self.display_step
        checkpoint_step = self.checkpoint_step
        # batches = data_helpers.batch_gen(zip(input_data, label_data), 2)
        # for batch in batches:
        #     x_batch, y_batch = zip(*batch)
        #     print('-' * 50)
        #     print(x_batch)
        #     print(y_batch)

        new_rnn = RNN(n_steps=n_steps, n_input=n_input,
                      n_hidden=self.n_hidden, n_classes=n_classes)
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(new_rnn.cost, global_step=global_step) # Adam Optimizer
        # optimizer = tf.train.AdamOptimizer(self.learning_rate)
        # grads_and_vars = optimizer.compute_gradients(new_rnn.cost)
        # train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.join(self.out_dir, "checkpoints")
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=1)

        init = tf.initialize_all_variables()
        with tf.Session() as sess:
            sess.run(init)

            def train_step(_x_batch, _y_batch):
                feed_dict = {new_rnn.x: _x_batch, new_rnn.y: _y_batch,
                             new_rnn.istate: np.zeros((batch_size, 2*n_hidden))}
                _, step = sess.run([optimizer, global_step], feed_dict=feed_dict)
                step = tf.train.global_step(sess, global_step)

                if step % display_step == 0:
                    # Calculate batch accuracy
                    acc = sess.run(new_rnn.accuracy, feed_dict=feed_dict)
                    # Calculate batch loss
                    loss = sess.run(new_rnn.cost, feed_dict=feed_dict)
                    # print "Iter " + str(step*batch_size) + ", Minibatch Loss= " + "{:.6f}".format(loss) + \
                    #       ", Training Accuracy= " + "{:.5f}".format(acc)
                    print "Iter " + str(step*batch_size) + ", Minibatch Loss= " + str(loss) + \
                          ", Training Accuracy= " + str(acc)
                if step % checkpoint_step == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=step)
                    print("Saved model checkpoint to {}\n".format(path))
            batches = data_helpers.batch_iter(zip(input_data, label_data), batch_size, training_iters)
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                x_batch = np.array(x_batch).reshape((batch_size, n_steps, n_input))
                train_step(x_batch, y_batch)
Example #5
def read_from_dataset(dataset_path, word2vec_model_path, n_classes, max_seq_len_cutoff):
    print("Loading data...")
    x, y, seq_max_len, vocabulary, vocabulary_inv, word2vec_vocab, word2vec_vec = data_helpers.load_data(
            dataset_path, word2vec_model_path, n_classes, max_seq_len_cutoff)

    myvocab_size = len(vocabulary)

    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.3, random_state=42)


    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    print("max_seq_len is: ", seq_max_len)

    return x_train, x_dev, y_train, y_dev, seq_max_len, vocabulary, vocabulary_inv, word2vec_vocab, word2vec_vec
Example #6
def read_from_dataset(dataset_path, word2vec_model_path, n_classes, max_seq_len_cutoff):
    print("Loading data...")
    x, y, seq_max_len, vocabulary, vocabulary_inv, word2vec_vocab, word2vec_vec = data_helpers.load_data(dataset_path, word2vec_model_path, n_classes, max_seq_len_cutoff)


    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.3, random_state=42)


    if FLAGS.is_evaluation:
        x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=0.2, random_state=27)
        FLAGS.evaluate_every = 1

    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    print("max_seq_len is: ", seq_max_len)

    return x_train, x_dev, y_train, y_dev, seq_max_len, vocabulary, vocabulary_inv, word2vec_vocab, word2vec_vec
Example #7
def data_iter(batch_size, num_embed, pre_trained_word2vec=False):
    print('Loading data...')
    if pre_trained_word2vec:
        word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
        x, y = data_helpers.load_data_with_word2vec(word2vec)
        # reshape for convolution input
        x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2]))
        embed_size = x.shape[-1]
        sentence_size = x.shape[2]
        vocab_size = -1
    else:
        x, y, vocab, vocab_inv = data_helpers.load_data()
        embed_size = num_embed
        sentence_size = x.shape[1]
        vocab_size = len(vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/valid set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Valid split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('valid shape:', x_dev.shape)
    print('sentence max words', sentence_size)
    print('embedding size', embed_size)
    print('vocab size', vocab_size)

    train = mx.io.NDArrayIter(
        x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(
        x_dev, y_dev, batch_size)
    
    return (train, valid, sentence_size, embed_size, vocab_size)
def do_inference(hostport, work_dir, concurrency, num_tests):
    """Tests PredictionService with concurrent requests.

  Args:
    hostport: Host:port address of the PredictionService.
    work_dir: The full path of working directory for test data set.
    concurrency: Maximum number of concurrent requests.
    num_tests: Number of test examples to use.

  Returns:
    The classification error rate.

  Raises:
    IOError: An error occurred processing test data set.
  """
    # print(load_data(1))
    # return
    x, y, vocabulary, vocabulary_inv = load_data(1)
    # print(x)  # ['2' '144' '1073' ..., '0' '0' '0']
    # print(y)  # [0 1]
    # print(vocabulary)  # 'breakpoints': '169715', 'shrill': '61929', '1day': '22983'
    # print(vocabulary_inv)  # ['krystyn'], ['litracey'], ['failbringer']
    # return
    # Randomly shuffle data
    np.random.seed(123)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    text_percent = FLAGS.test_data_ratio / 100.0
    test_index = int(len(x) * text_percent)
    x_train, x_test = x_shuffled[:-test_index], x_shuffled[-test_index:]
    y_train, y_test = y_shuffled[:-test_index], y_shuffled[-test_index:]
    # batches = batch_iter(zip(x_train, y_train), FLAGS.batch_size, FLAGS.epochs)
    test_batches = list(batch_iter(zip(x_test, y_test), FLAGS.batch_size, 1))
    host, port = hostport.split(':')
    channel = implementations.insecure_channel(host, int(port))
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
    result_counter = _ResultCounter(num_tests, concurrency)

    print("Testing start................: batch size: " +
          str(len(test_batches)))
    for batch in test_batches:
        request = predict_pb2.PredictRequest()
        request.model_spec.name = 'twitter-sentiment'
        request.model_spec.signature_name = 'predict_text'
        test_data, label = zip(*batch)
        desired_array = [
            int(numeric_string) for numeric_string in test_data[0]
        ]
        desired_array = np.array(desired_array, dtype=np.int64)
        print('test_data[0]:')
        print(test_data[0])
        print('converted test data (desired_array):')
        print(desired_array)
        request.inputs['text'].CopyFrom(
            tf.contrib.util.make_tensor_proto(desired_array,
                                              shape=[1, test_data[0].size]))
        # print('request: inputs(text):')
        # print(request.inputs['text'])
        request.inputs['dropout'].CopyFrom(
            tf.contrib.util.make_tensor_proto(1.0, shape=[1, 1]))
        result_counter.throttle()
        result_future = stub.Predict.future(request, 5.0)  # 5 seconds
        result_future.add_done_callback(
            _create_rpc_callback(label[0], result_counter))
    return result_counter.get_error_rate()
import time
import numpy as np
import tensorflow as tf
import data_helpers

beginTime = time.time()

# Parameter definitions
batch_size = 100
learning_rate = 0.005
max_steps = 1000

# Uncommenting this line removes randomness
# You'll get exactly the same result on each run
# np.random.seed(1)

# Prepare data
data_sets = data_helpers.load_data()

# -----------------------------------------------------------------------------
# Prepare the TensorFlow graph
# (We're only defining the graph here, no actual calculations taking place)
# -----------------------------------------------------------------------------

# Define input placeholders
images_placeholder = tf.placeholder(tf.float32, shape=[None, 3072])
labels_placeholder = tf.placeholder(tf.int64, shape=[None])

# Define variables (these are the values we want to optimize)
weights = tf.Variable(tf.zeros([3072, 10]))
biases = tf.Variable(tf.zeros([10]))

# Define the classifier's result
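# The snippet is truncated here; what follows is a hedged sketch (not the original
# code) of what a softmax classifier on these placeholders typically defines next:
# logits, loss, a gradient-descent training step, and accuracy.
logits = tf.matmul(images_placeholder, weights) + biases
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=labels_placeholder, logits=logits))
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
correct_prediction = tf.equal(tf.argmax(logits, 1), labels_placeholder)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))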
Example #10
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load data. Load your own data here
print("Loading data...")
x_test, y_test, vocabulary, vocabulary_inv = data_helpers.load_data()
y_test = np.argmax(y_test, axis=1)
print("Vocabulary size: {:d}".format(len(vocabulary)))
print("Test set size {:d}".format(len(y_test)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
OUT_DIR = os.path.abspath(os.path.join(os.path.curdir, 'output'))
RUN_ID = time.strftime('run%Y%m%d-%H%M%S')
RUN_DIR = os.path.abspath(os.path.join(OUT_DIR, RUN_ID))
LOG_FILE_PATH = os.path.abspath(os.path.join(RUN_DIR, 'log.log'))
if FLAGS.load is not None:
    CHECKPOINT_FILE_PATH = os.path.abspath(
        os.path.join(FLAGS.load, 'ckpt.ckpt'))
else:
    CHECKPOINT_FILE_PATH = os.path.abspath(os.path.join(RUN_DIR, 'ckpt.ckpt'))
os.mkdir(RUN_DIR)
SUMMARY_DIR = os.path.join(RUN_DIR, 'summaries')
LOG_FILE = open(LOG_FILE_PATH, 'a', buffering=1)

log('======================= START! ========================')
# Load data
x, y, vocabulary, vocabulary_inv = load_data(FLAGS.dataset_fraction)

# Randomly shuffle data
np.random.seed(123)
shuffle_indices = np.random.permutation(np.arange(len(y)))
# x_shuffled = x[shuffle_indices.tolist()]
x_shuffled = [x[idx] for idx in shuffle_indices.tolist()]
y_shuffled = y[shuffle_indices]

# Split train/test set
text_percent = FLAGS.test_data_ratio
test_index = int(len(x) * text_percent)
x_train, x_test = np.array(x_shuffled[:-test_index]), np.array(
    x_shuffled[-test_index:])
y_train, y_test = y_shuffled[:-test_index], y_shuffled[-test_index:]
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import train_test_split
from data_helpers import load_data

print('Loading data')
x, y, vocabulary, vocabulary_inv = load_data()

# x.shape -> (10662, 56)
# y.shape -> (10662, 2)
# len(vocabulary) -> 18765
# len(vocabulary_inv) -> 18765

X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=42)

# X_train.shape -> (8529, 56)
# y_train.shape -> (8529, 2)
# X_test.shape -> (2133, 56)
# y_test.shape -> (2133, 2)


sequence_length = x.shape[1] # 56
vocabulary_size = len(vocabulary_inv) # 18765
embedding_dim = 256
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5
Example #13
#!/usr/bin/python
# -*- coding:utf-8 -*-
import os
import time
import datetime
import numpy as np
import data_helpers
import tensorflow as tf
from vgg_model import vgg16

batch_size, num_epochs = 12, 50
num_classes = 2
pos_path, neg_path = "data/positive/", "data/negative/"
x_train, y_train, x_dev, y_dev = data_helpers.load_data(pos_path, neg_path)
print("load data is ok...")


with tf.Graph().as_default():
    gpu_options = tf.GPUOptions(allow_growth=True)
    session_conf = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        gpu_options=gpu_options)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        vggmodel = vgg16(
            height=data_helpers.h,
            width=data_helpers.w,
            channel=data_helpers.c,
            number_classes=num_classes
        )
Exemple #14
0
    def train(self,
              data_dir,
              fquery,
              freply,
              batch_size=128,
              steps_per_checkpoint=100):
        queries = data_helpers.load_data(data_dir, fquery, self.qmax_length)
        replies = data_helpers.load_data(data_dir, freply, self.rmax_length)

        validation_queries = data_helpers.load_data("data/validation_ADEM",
                                                    "queries.txt",
                                                    self.qmax_length)
        validation_replies = data_helpers.load_data("data/validation_ADEM",
                                                    "hred_replies.txt",
                                                    self.rmax_length)
        scores = data_helpers.load_file("data/validation_ADEM",
                                        "hred_scores.txt")
        scores = [float(score) for score in scores]
        #TODO - calculate MSE against these scores?
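        # Hedged sketch for the TODO above (illustration only, not original code):
        # mean squared error between model-predicted scores and the human ADEM
        # scores, assuming `predicted` is a list of floats aligned with `scores`.
        def mse_against_adem(predicted, target=scores):
            return sum((p - t) ** 2 for p, t in zip(predicted, target)) / len(target)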

        data_size = len(queries)
        print_score = tf.print(self.score)
        with self.session.as_default():
            self.init_model()

            checkpoint_path = os.path.join(self.train_dir, "unref.model")
            loss = 0.0
            validation_loss = 0.0
            best_validation_loss = 1000
            prev_losses = [1.0]
            impatience = 0.0
            while True:
                step, l = self.train_step(queries, replies, data_size,
                                          batch_size)
                # KEVIN DOES THIS TRAIN THE MODEL ON THE VALIDATION SET :(
                _, validation_l = self.get_validation_loss(
                    validation_queries, validation_replies,
                    len(validation_queries), batch_size)

                loss += l
                validation_loss += validation_l
                print(validation_loss)
                # save checkpoint
                if step % steps_per_checkpoint == 0:
                    loss /= steps_per_checkpoint
                    validation_loss /= steps_per_checkpoint
                    print ("global_step %d, loss %f, learning rate %f"  \
                            %(step, loss, self.learning_rate.eval()))

                    if validation_loss < best_validation_loss:
                        best_validation_loss = validation_loss
                        impatience = 0.0
                        self.saver.save(self.session,
                                        checkpoint_path,
                                        global_step=self.global_step)
                    else:
                        impatience += 1

                    print("Validation loss is %f. The best loss thus far has been %f. Impatience: %f" \
                        %(validation_loss, best_validation_loss, impatience))

                    if loss > max(prev_losses):
                        self.session.run(self.learning_rate_decay_op)
                    prev_losses = (prev_losses + [loss])[-5:]
                    loss = 0.0

                    self.log_writer.add_summary(self.summary, step)

                    #                    """ Debug
                    query_batch, query_sizes, idx = self.get_batch(
                        queries, data_size, 10)
                    reply_batch, reply_sizes, idx = self.get_batch(
                        replies, data_size, 10, idx)
                    input_feed = self.make_input_feed(query_batch,
                                                      query_sizes,
                                                      reply_batch,
                                                      reply_sizes,
                                                      training=False)
                    score, tests = self.session.run(
                        [self.pos_score, self.test], input_feed)
                    print('-------------')
                    for s, t in zip(score[:10], tests[:10]):
                        print(s, t)
Example #15
def train(neg_file, pos_file, checkpoint, epoch):
    """
    Examples:
        python3 model.py --neg-file=/var/www/src/github.com/nlp/ChineseNlpCorpus/format_datasets/total_train_neg.txt --pos=/var/www/src/github.com/nlp/ChineseNlpCorpus/format_datasets/total_train_pos.txt
    """
    click.echo(click.style('Loading data...', fg='green'))
    x, y, vocabulary, vocabulary_inv = load_data(pos_file, neg_file)
    click.echo(click.style('Loading data over.', fg='green'))

    # x.shape -> (10662, 56)
    # y.shape -> (10662, 2)
    # len(vocabulary) -> 18765
    # len(vocabulary_inv) -> 18765

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    # X_train.shape -> (8529, 56)
    # y_train.shape -> (8529, 2)
    # X_test.shape -> (2133, 56)
    # y_test.shape -> (2133, 2)


    sequence_length = x.shape[1] # 56
    vocabulary_size = len(vocabulary_inv) # 18765
    embedding_dim = 256
    filter_sizes = [3,4,5]
    num_filters = 512
    drop = 0.5

    epochs = 100
    batch_size = 500 # initial 30

    if checkpoint:
        click.echo(click.style('Loading model %s...' % checkpoint, fg='green'))
        model = load_model(checkpoint)
    else:
        # this returns a tensor
        click.echo(click.style('Creating new model...', fg='green'))
        inputs = Input(shape=(sequence_length,), dtype='int32')
        embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=sequence_length)(inputs)
        reshape = Reshape((sequence_length,embedding_dim,1))(embedding)

        conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
        conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
        conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

        maxpool_0 = MaxPool2D(pool_size=(sequence_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
        maxpool_1 = MaxPool2D(pool_size=(sequence_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
        maxpool_2 = MaxPool2D(pool_size=(sequence_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

        concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
        flatten = Flatten()(concatenated_tensor)
        dropout = Dropout(drop)(flatten)
        output = Dense(units=2, activation='softmax')(dropout)

        # this creates a model that includes
        model = Model(inputs=inputs, outputs=output)
        adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
        model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    # tensorboard callback
    cb_tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=True)
    cb_checkpoint = ModelCheckpoint('./checkpoints/model.epoch.{epoch:03d}.vacc{val_acc:.4f}.hdf5',
                                    monitor='val_acc', verbose=1, save_weights_only=False, save_best_only=True, mode='auto')

    click.echo(click.style('Training model...', fg='green'))

    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, initial_epoch=epoch,
        verbose=1, callbacks=[cb_checkpoint, cb_tensorboard], validation_data=(X_test, y_test)
    )
Example #16
        group['lr'] = lr


# save Loss and Epochs in csv
def save_loss(Loss, path):
    num_epochs = len(Loss)
    Epochs = list(range(1, num_epochs + 1))
    Loss_data = {'Epochs': Epochs, 'Loss': Loss}

    import pandas as pd
    df = pd.DataFrame(Loss_data, columns=['Epochs', 'Loss'])
    df.to_csv(path)


for FOLD in range(1, 6):
    X_trn, Y_trn, Y_trn_o, X_tst, Y_tst, Y_tst_o, vocabulary, vocabulary_inv = data_helpers.load_data(
        fold=FOLD)

    X_trn = X_trn.astype(np.int32)
    X_tst = X_tst.astype(np.int32)
    Y_trn = Y_trn.astype(np.int32)
    Y_tst = Y_tst.astype(np.int32)

    args.num_classes = Y_trn.shape[1]
    embedding_weights = load_word2vec('glove',
                                      vocabulary_inv,
                                      num_features=300)

    capsule_net = CapsNet_Text(args, embedding_weights)
    current_lr = args.learning_rate
    optimizer = Adam(capsule_net.parameters(), lr=current_lr)
    # capsule_net = nn.DataParallel(capsule_net).cuda()
    optimizers = ['adam', 'nadam']
    epochs = [10, 20]
    batch_sizes = [200, 500]
    schedules = [lambda1, lambda2, step_decay]

    save_dir = './multi'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Train
    for d in databases:
        print(d)

        (x_train,
         y_train), (x_dev, y_dev), (x_test,
                                    y_test), vocab_size, max_len = load_data(d)

        for o in optimizers:
            for e in epochs:
                for bz in batch_sizes:
                    for s in schedules:

                        model = CapsNet(input_shape=x_train.shape[1:],
                                        n_class=len(
                                            np.unique(np.argmax(y_train, 1))),
                                        num_routing=3,
                                        vocab_size=vocab_size,
                                        embed_dim=50,
                                        max_len=max_len)

                        model.summary()
logging.root.setLevel(level=logging.INFO)
logger = logging.getLogger()

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
logger.info("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    logger.info("{}={}".format(attr.upper(), value))
logger.info("")

# Data Preparation
# ==================================================

# Load data
logger.info("Loading data...")
train_size, train_data, train_label = data_helpers.load_data(
    FLAGS.train_data_file, FLAGS.raw_data_file)
x_text = train_data
y = np.array(train_label)

# Build vocabulary
max_document_length = max([len(x.split()) for x in train_data])
word2vec_helpers = Word2VecHelper()
x = word2vec_helpers.SentencesIndex(x_text, max_document_length)
#vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
#x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
Example #19
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool1D, Conv1D, Convolution2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from data_helpers import load_data
from livelossplot import PlotLossesKeras
from matplotlib import pyplot

print('Loading data')
x, y, max_sentace_length = load_data()
# x, y, vocabulary, vocabulary_inv = load_data()
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# X_train.shape -> (44958, 220)
# y_train.shape -> (44958, 1)
# X_test.shape -> (9992, 220)
# y_test.shape -> (9992, 1)

# X_train.shape -> (8529, 56)
# y_train.shape -> (8529, 2)
# X_test.shape -> (2133, 56)
# y_test.shape -> (2133, 2)

sequence_length = (max_sentace_length, 100)  # number of characters per sentence
#vocabulary_size = X_test.max()
embedding_dim = 256
    print("--pos_file and --neg_file must be specified")
    sys.exit(1)

print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x, y, word2id, id2word = data_helpers.load_data(FLAGS.vocab_file, 
    FLAGS.pos_file, FLAGS.neg_file, FLAGS.sequence_length, 
    FLAGS.train_size + FLAGS.dev_size)
# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
# Split train/test set
x_train, x_dev = x_shuffled[:-FLAGS.dev_size], x_shuffled[-FLAGS.dev_size:]
y_train, y_dev = y_shuffled[:-FLAGS.dev_size], y_shuffled[-FLAGS.dev_size:]
print("Vocabulary Size: {:d}".format(len(word2id)))
print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev)))


# Training
# ==================================================
except:
	print_red("Failed to open file.")
	quit()

def log(*string):
	output = ' '.join(string)
	print(output)
	LOG_FILE.write(''.join(['\n', output]))

log("=======================================================")
log("======================= START! ========================")
log("=======================================================")
log("Preprocessing...")

### Load data ###
x, y, vocabulary, vocabulary_inv = data_helpers.load_data(FLAGS.reduced_dataset)
maxLengthInX = max(len(i) for i in x)

# Randomly shuffle data
np.random.seed(123)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
# Split train/test set
text_percent = FLAGS.test_data_ratio / 100.0
test_index = int(len(x)*text_percent)
x_train, x_test = x_shuffled[:-test_index], x_shuffled[-test_index:]
y_train, y_test = y_shuffled[:-test_index], y_shuffled[-test_index:]

### Derived parameters ###
sequence_length = x_train.shape[1]
Example #22
'''
In the CNN, the LSTM and the convolutional filters use 100 hidden units each,
the convolutional filter length is 3, the dropout rate is 0.5, and the
mini-batch size is 16. These hyperparameters were selected by grid search on
the SST-2 dev set.
'''

use_cuda = torch.cuda.is_available()
EMBEDDING_DIM = 300
HIDDEN_DIM = 100
EPOCH = 10
BATCH_SIZE = 16
LR = 1e-3

# Get the word indices
X, Y, word_to_ix, ix_to_word = data_helpers.load_data()

vocab_size = len(word_to_ix)
max_sent_len = X.shape[1]
num_classes = Y.shape[1]

print('vocab size       = {}'.format(vocab_size))
print('max sentence len = {}'.format(max_sent_len))
print('num of classes   = {}'.format(num_classes))


class MyModel(nn.Module):
    """
        1,将句子矩阵,先做卷积,通过avarage pool生成一个特征列向量,长度为句子长度
        2,将句子矩阵做LSTM,得到每一个时刻输出的矩阵
        3,将1的结果的每个元素点乘到2结果的向量
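# Hedged sketch (not part of the original example): a self-contained PyTorch
# module implementing the three steps described in the docstring above. All
# layer names, shapes, and the sigmoid gating are assumptions for illustration.
import torch
import torch.nn as nn

class ConvGatedLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=300, hidden_dim=100, num_classes=2):
        super(ConvGatedLSTM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, padding=1)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):                          # x: (batch, seq_len) word indices
        e = self.embed(x)                          # (batch, seq_len, embed_dim)
        # step 1: convolution + average pool over channels -> one weight per position
        gate = torch.sigmoid(self.conv(e.transpose(1, 2)).mean(dim=1, keepdim=True))
        # step 2: LSTM over the sentence matrix
        h, _ = self.lstm(e)                        # (batch, seq_len, hidden_dim)
        # step 3: element-wise multiply the step-1 vector into each time step of step 2
        gated = h * gate.transpose(1, 2)
        return self.fc(gated.mean(dim=1))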
Example #23
        }
        x_raw = [
            "The number of reported cases of gonorrhea in Colorado increased",
            "I am in the market for a 24-bit graphics card for a PC"
        ]
        y_test = None
    elif dataset_name == "financenews":
        datasets = {
            "target_names": [
                'strong_neg_examples', 'weak_neg_examples', 'neutral_examples',
                'weak_pos_examples', 'strong_pos_examples'
            ]
        }
        datasets = data_helpers.get_datasets_financenews_test(
            cfg["datasets"][dataset_name]["test_path"])
        x_raw = data_helpers.load_data(datasets)
        y_test = None
        # datasets = {"target_names": ['strong_neg_examples', 'weak_neg_examples', 'neutral_examples', 'weak_pos_examples', 'strong_pos_examples']}
        # x_raw = ["这是什么垃圾股票", "我赚翻了"]
        # y_test = None

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nPredicting...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
Example #24
def train_rnn():
    # Data Preparation
    # ==================================================

    if FLAGS.init_embedding_path is not None:
        embedding = np.load(FLAGS.init_embedding_path)
        print("Using pre-trained word embedding which shape is {}\n".format(embedding.shape))
        FLAGS.vocab_size = embedding.shape[0]
        FLAGS.embedding_size = embedding.shape[1]

    if FLAGS.init_model_path is not None:
        assert os.path.isdir(FLAGS.init_model_path), "init_model_path must be a directory\n"
        ckpt = tf.train.get_checkpoint_state(FLAGS.init_model_path)
        assert ckpt, "No checkpoint found in {}\n".format(FLAGS.init_model_path)
        assert ckpt.model_checkpoint_path, "No model_checkpoint_path found in checkpoint\n"

    # Create root directory
    timestamp = str(int(time.time()))
    root_dir = os.path.join(os.path.curdir, 'runs', 'textrnn', 'trained_result_' + timestamp)
    os.makedirs(root_dir)

    # Load data
    print("Loading data...\n")
    x, y = data_helpers.load_data(FLAGS.data_file, FLAGS.sequence_length, FLAGS.vocab_size, root_dir=root_dir)
    FLAGS.num_classes = len(y[0])

    # Split dataset
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=FLAGS.test_size, stratify=y, random_state=0)
    x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=0)

    # Training
    # ==================================================
    with tf.Graph().as_default():
        tf_config = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        tf_config.gpu_options.allow_growth = FLAGS.gpu_allow_growth

        with tf.Session(config=tf_config).as_default() as sess:
            rnn = TextRNN(
                vocab_size=FLAGS.vocab_size,
                embedding_size=FLAGS.embedding_size,
                sequence_length=FLAGS.sequence_length,
                rnn_size=FLAGS.rnn_size,
                num_layers=FLAGS.num_layers,
                attention_size=FLAGS.attention_size,
                num_classes=FLAGS.num_classes,
                learning_rate=FLAGS.learning_rate,
                grad_clip=FLAGS.grad_clip)

            # Output directory for models and summaries
            out_dir = os.path.abspath(root_dir)
            print("Writing to {}...\n".format(out_dir))

            # Summaries for loss and accuracy
            tf.summary.scalar("loss", rnn.loss)
            tf.summary.scalar("accuracy", rnn.accuracy)
            merged_summary = tf.summary.merge_all()

            # Summaries dictionary
            train_summary_dir = os.path.join(out_dir, 'summaries', 'train')
            val_summary_dir = os.path.join(out_dir, 'summaries', 'val')
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
            val_summary_writer = tf.summary.FileWriter(val_summary_dir, sess.graph)

            # Checkpoint directory, will not create itself
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, 'checkpoints'))
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model.ckpt')
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Using pre-trained word embedding
            if FLAGS.init_embedding_path is not None:
                sess.run(rnn.embedding.assign(embedding))
                del embedding

            # Continue training from saved model
            if FLAGS.init_model_path is not None:
                saver.restore(sess, ckpt.model_checkpoint_path)

            # Training start
            print("Start training...\n")
            best_at_step = 0
            best_val_accuracy = 0
            for epoch in range(FLAGS.num_epochs):
                # Generate train batches
                train_batches = data_helpers.batch_iter(list(zip(x_train, y_train)), FLAGS.batch_size)
                start = time.time()
                for batch in train_batches:
                    # Training model on x_batch and y_batch
                    x_batch, y_batch = zip(*batch)
                    seq_len_train = data_helpers.real_len(x_batch)
                    feed_dict = {rnn.input_x: x_batch, rnn.input_y: y_batch, rnn.seq_len: seq_len_train, rnn.keep_prob: FLAGS.dropout_keep_prob}
                    _, global_step, train_summaries, train_loss, train_accuracy = sess.run([rnn.train_op, rnn.global_step,
                        merged_summary, rnn.loss, rnn.accuracy], feed_dict=feed_dict)

                    # Evaluates model on val set
                    if global_step % FLAGS.evaluate_every == 0:
                        end = time.time()
                        train_summary_writer.add_summary(train_summaries, global_step)
                        seq_len_val = data_helpers.real_len(x_val)
                        feed_dict = {rnn.input_x: x_val, rnn.input_y: y_val, rnn.seq_len: seq_len_val, rnn.keep_prob: 1.0}
                        val_summaries, val_loss, val_accuracy = sess.run([merged_summary, rnn.loss, rnn.accuracy], feed_dict=feed_dict)
                        val_summary_writer.add_summary(val_summaries, global_step)
                        print("Epoch: {}, global step: {}, training speed: {:.3f}sec/batch".format(epoch,
                            global_step, (end - start) / FLAGS.evaluate_every))
                        print("train loss: {:.3f}, train accuracy: {:.3f}, val loss: {:.3f}, val accuracy: {:.3f}\n".format(train_loss,
                            train_accuracy, val_loss, val_accuracy))
                        # If improved, save the model
                        if val_accuracy > best_val_accuracy:
                            print("Get a best val accuracy at step {}, model saving...\n".format(global_step))
                            saver.save(sess, checkpoint_prefix, global_step=global_step)
                            best_val_accuracy = val_accuracy
                            best_at_step = global_step
                        start = time.time()

            # Rename the checkpoint
            best_model_prefix = checkpoint_prefix + '-' + str(best_at_step)
            os.rename(best_model_prefix + '.index', os.path.join(checkpoint_dir, 'best_model.index'))
            os.rename(best_model_prefix + '.meta', os.path.join(checkpoint_dir, 'best_model.meta'))
            os.rename(best_model_prefix + '.data-00000-of-00001', os.path.join(checkpoint_dir, 'best_model.data-00000-of-00001'))

            # Testing on test set
            print("\nTraining complete, testing the best model on test set...\n")
            saver.restore(sess, os.path.join(checkpoint_dir, 'best_model'))
            seq_len_test = data_helpers.real_len(x_test)
            feed_dict = {rnn.input_x: x_test, rnn.input_y: y_test, rnn.seq_len: seq_len_test, rnn.keep_prob: 1.0}
            y_logits, test_accuracy = sess.run([rnn.logits, rnn.accuracy], feed_dict=feed_dict)
            print("Testing Accuracy: {:.3f}\n".format(test_accuracy))
            label_transformer = joblib.load(os.path.join(out_dir, 'label_transformer.pkl'))
            y_test_original = label_transformer.inverse_transform(y_test)
            y_logits_original = label_transformer.inverse_transform(y_logits)
            print("Precision, Recall and F1-Score:\n\n", classification_report(y_test_original, y_logits_original))

            # Save parameters
            print("Parameters saving...\n")
            params = {}
            for param, value in FLAGS.__flags.items():
                params[param] = value
            with open(os.path.join(out_dir, 'parameters.json'), 'w') as outfile:
                json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)

            # Save word embedding
            print("Word embedding saving...\n")
            np.save(os.path.join(out_dir, 'embedding.npy'), sess.run(rnn.embedding))
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if (not FLAGS.pos_file or not FLAGS.neg_file or not FLAGS.vocab_file 
    or not FLAGS.checkpoint_dir):
    print("--pos_file, --neg_file, --vocab_file and "
        "--checkpoint_dir must be specified")
    sys.exit(1)

# Load data. Load your own data here
print("Loading data...")
x_test, y_test, word2id, id2word = data_helpers.load_data(FLAGS.vocab_file, 
    FLAGS.pos_file, FLAGS.neg_file, FLAGS.sequence_length, 1000)
y_test = np.argmax(y_test, axis=1)
print("Vocabulary size: {:d}".format(len(word2id)))
print("Test set size {:d}".format(len(y_test)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
def main(_):

    # cluster specification
    #  parameter_servers = ["sgs-gpu-02:2222", "sgs-gpu-02:2223", "sgs-gpu-03:2222", "sgs-gpu-03:2223"]
    #  workers = ["sgs-gpu-02:2224", "sgs-gpu-02:2225", "sgs-gpu-03:2224", "sgs-gpu-03:2225"]
    parameter_servers = [
        "spaceml1:2222", "spaceml1:2223", "spaceml1:2224", "spaceml1:2225"
    ]
    workers = [
        "spaceml1:2226", "spaceml1:2227", "spaceml1:2228", "spaceml1:2229"
    ]

    num_ps = len(parameter_servers)
    num_worker = num_ps

    cluster = tf.train.ClusterSpec({
        "ps": parameter_servers,
        "worker": workers
    })

    #local server, either ps or worker
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    data_sets = data_helpers.load_data()

    W1 = [0, 0, 0, 0]
    b1 = [0, 0, 0, 0]
    W2 = [0, 0, 0, 0]
    b2 = [0, 0, 0, 0]

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        with tf.device("/job:ps/task:0"):
            W1[0] = tf.get_variable(
                name='w10',
                shape=[3072, 240],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(3072))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            #        W1[0] = tf.Variable(tf.random_normal([3072,240]))
            b1[0] = tf.Variable(tf.zeros([240]))
            W2[0] = tf.get_variable(
                name='w20',
                shape=[240, 10],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(120))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            #W2[0] = tf.Variable(tf.random_normal([240,10]))
            b2[0] = tf.Variable(tf.zeros([10]))
        with tf.device("/job:ps/task:1"):
            W1[1] = tf.get_variable(
                name='w11',
                shape=[3072, 240],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(3072))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            #W1[1] = tf.Variable(tf.random_normal([3072,240]))
            b1[1] = tf.Variable(tf.zeros([240]))
            W2[1] = tf.get_variable(
                name='w21',
                shape=[240, 10],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(120))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            # W2[1] = tf.Variable(tf.random_normal([240,10]))
            b2[1] = tf.Variable(tf.zeros([10]))
        with tf.device("/job:ps/task:2"):
            W1[2] = tf.get_variable(
                name='w12',
                shape=[3072, 240],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(3072))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))

            #W1[2] = tf.Variable(tf.random_normal([3072,240]))
            b1[2] = tf.Variable(tf.zeros([240]))
            W2[2] = tf.get_variable(
                name='w22',
                shape=[240, 10],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(120))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))

            #W2[2] = tf.Variable(tf.random_normal([240,10]))
            b2[2] = tf.Variable(tf.zeros([10]))
        with tf.device("/job:ps/task:3"):
            W1[3] = tf.get_variable(
                name='w13',
                shape=[3072, 240],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(3072))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            # W1[3] = tf.Variable(tf.random_normal([3072,240]))
            b1[3] = tf.Variable(tf.zeros([240]))
            W2[3] = tf.get_variable(
                name='w23',
                shape=[240, 10],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(120))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            #W2[3] = tf.Variable(tf.random_normal([240,10]))
            b2[3] = tf.Variable(tf.zeros([10]))

        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % FLAGS.task_index,
                    cluster=cluster)):

            # Create the model
            x = tf.placeholder(tf.float32, shape=[None, 3072])
            y_ = tf.placeholder(tf.int64, shape=[None])

            h1 = tf.nn.relu(
                tf.matmul(x, W1[FLAGS.task_index]) + b1[FLAGS.task_index])

            y = tf.matmul(h1, W2[FLAGS.task_index]) + b2[FLAGS.task_index]

            cross_entropy = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_,
                                                               logits=y))

            opt = tf.train.GradientDescentOptimizer(FLAGS.lr)

            grads_and_vars = opt.compute_gradients(cross_entropy, [
                W1[FLAGS.task_index], b1[FLAGS.task_index],
                W2[FLAGS.task_index], b2[FLAGS.task_index]
            ])

            #	w = W2[FLAGS.task_index]
            #	b = b2[FLAGS.task_index]
            new_gv0 = (grads_and_vars[0][0] -
                       (W1[(FLAGS.task_index - 1) % num_ps] + W1[
                           (FLAGS.task_index + 1) % num_ps] -
                        2 * W1[FLAGS.task_index]) / (3 * FLAGS.lr * 1.0),
                       grads_and_vars[0][1])
            new_gv1 = (grads_and_vars[1][0] -
                       (b1[(FLAGS.task_index - 1) % num_ps] + b1[
                           (FLAGS.task_index + 1) % num_ps] -
                        2 * b1[FLAGS.task_index]) / (3 * FLAGS.lr * 1.0),
                       grads_and_vars[1][1])
            new_gv2 = (grads_and_vars[2][0] -
                       (W2[(FLAGS.task_index - 1) % num_ps] + W2[
                           (FLAGS.task_index + 1) % num_ps] -
                        2 * W2[FLAGS.task_index]) / (3 * FLAGS.lr * 1.0),
                       grads_and_vars[2][1])
            new_gv3 = (grads_and_vars[3][0] -
                       (b2[(FLAGS.task_index - 1) % num_ps] + b2[
                           (FLAGS.task_index + 1) % num_ps] -
                        2 * b2[FLAGS.task_index]) / (3 * FLAGS.lr * 1.0),
                       grads_and_vars[3][1])

            #print b1[FLAGS.task_index]
            g = grads_and_vars[1][0]
            new_gv = list()
            new_gv.append(new_gv0)
            new_gv.append(new_gv1)
            new_gv.append(new_gv2)
            new_gv.append(new_gv3)

            train_step = opt.apply_gradients(new_gv)

            correct_prediction = tf.equal(tf.argmax(y, 1), y_)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

            saver = tf.train.Saver()
            init_op = tf.global_variables_initializer()

            sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                     logdir="/mnt/ds3lab/litian/logs",
                                     init_op=init_op,
                                     saver=saver)

            zipped_data = zip(data_sets['images_train'],
                              data_sets['labels_train'])
            batches = data_helpers.gen_batch(list(zipped_data), 128, 50000)

            with sv.managed_session(server.target) as sess:
                begin = time.time()
                for i in range(50000):
                    batch = next(batches)
                    image_batch, label_batch = zip(*batch)
                    sess.run(train_step,
                             feed_dict={
                                 x: image_batch,
                                 y_: label_batch
                             })

                    if i % 50 == 0:
                        train_accuracy = sess.run(accuracy,
                                                  feed_dict={
                                                      x: image_batch,
                                                      y_: label_batch
                                                  })
                        train_loss = sess.run(cross_entropy,
                                              feed_dict={
                                                  x: image_batch,
                                                  y_: label_batch
                                              })
                        localtime = time.asctime(time.localtime(time.time()))
                        print(localtime)
                        tmp = time.time()
                        print((tmp - begin) / 60.0)

                        print(
                            "step %d, training accuracy %g, training loss %g" %
                            (i, train_accuracy, train_loss))
            sv.stop()
Example #27
print("")


# Data Preparation
# ==================================================

# Load data
print("Loading data...")

# Custom data loading:
id_file = open('./data/id.csv').read()        ## Loads IDs
ids = id_file.splitlines()

print('yay')

x, y, vocabulary, vocabulary_inv, real_codes, dictionary_of_codes = data_helpers.load_data()


# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
# Split train/test set
# TODO: This is very crude, should use cross-validation
x_train, x_dev, x_full = x_shuffled[:-10000], x_shuffled[-10000:], x      ##NOTE: Added full set
y_train, y_dev, y_full = y_shuffled[:-10000], y_shuffled[-10000:], y
print("Vocabulary Size: {:d}".format(len(vocabulary)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
print("Full set size: {:d}".format(len(y_full)))
tf.flags.DEFINE_string("test_data_path", "./data/train.txt", "Data path to evaluation")
tf.flags.DEFINE_string("checkpoint_dir", "/media/jb/DATA/OMGEmotionChallenge/text_cnn/runs/1525115737/checkpoints", "Checkpoint directory from training run")
tf.flags.DEFINE_string("calculateEvaluationCCC", "./data/calculateEvaluationCCC.py", "path to ccc script")
tf.flags.DEFINE_string("validationCSV", "./data/omg_TrainVideos.csv", "path to ccc script")
tf.flags.DEFINE_boolean("compute_ccc", True, "compute_ccc ?")


# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

x_test, y_test, arousals, valences, video_ids, utterances, vocabulary, vocabulary_inv, onehot_label, max_sequence_length = data_helpers.load_data(FLAGS.test_data_path, FLAGS.checkpoint_dir)


# Evaluation
# ==================================================

checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
        print ("FLAGS.checkpoint_dir %s" % FLAGS.checkpoint_dir)
Example #29
# Load data
print("Loading data...")
# train data: each line is a note; the first word is the class, the rest is the note text
# output data: the engineer name list
# train again with other data
output = "support_engineers.csv"
# vector: glove & word2vec
train = "dataset1_clean_1"
vector = "w2v.txt"
# seq_size: maximum sequence length (number of tokens) to keep per note
seq_size = 350
# number of iterations through the data
num_of_round = 110

x, y, W, vocabulary = data_helpers.load_data(train,output,vector,seq_size)

# Training
# ==================================================

with tf.Graph().as_default():
    # A Graph contains operations and tensors.
    # Session is the environment you are executing graph operations in, and it contains state about Variables and queues.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options))
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x.shape[1],
            # the length of the sentence
            num_classes=FLAGS.num_classes,
            # how many classes as output
Example #30
              (params.num_features))

        # dictionary mapping each word to its word vector
        embedding_model = {}
        for line in open(model_name, 'r'):
            tmp = line.strip().split()
            word, vec = tmp[0], list(map(float, tmp[1:]))
            assert (len(vec) == params.num_features)
            if word not in embedding_model:
                embedding_model[word] = vec
        assert (len(embedding_model) == 400000)

    else:
        raise ValueError('Unknown pretrain model type: %s!' %
                         (params.model_type))

    embedding_weights = [
        embedding_model[w] if w in embedding_model else np.random.uniform(
            -0.25, 0.25, params.num_features) for w in params.vocabulary_inv
    ]
    embedding_weights = np.array(embedding_weights).astype('float32')

    return embedding_weights


if __name__ == '__main__':
    import data_helpers
    print("Loading data...")
    x, _, _, params.vocabulary_inv = data_helpers.load_data()
    w = train_word2vec(x, params.vocabulary_inv)
Example #31
        print('Loading existing Word2Vec model (Glove.6B.%dd)' %
              (num_features))

        # dictionary, where key is word, value is word vectors
        embedding_model = {}
        for line in open(model_name, 'r'):
            tmp = line.strip().split()
            word, vec = tmp[0], list(map(float, tmp[1:]))  # list() so len() works on Python 3
            assert (len(vec) == num_features)
            if word not in embedding_model:
                embedding_model[word] = vec
        assert (len(embedding_model) == 400000)

    else:
        raise ValueError('Unknown pretrain model type: %s!' % (model_type))

    embedding_weights = [
        embedding_model[w] if w in embedding_model else np.random.uniform(
            -0.25, 0.25, num_features) for w in vocabulary_inv
    ]
    embedding_weights = np.array(embedding_weights).astype('float32')

    return embedding_weights


if __name__ == '__main__':
    import data_helpers
    print("Loading data...")
    x, _, _, vocabulary_inv = data_helpers.load_data()
    w = train_word2vec(x, vocabulary_inv)
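A hedged sketch of an alternative way to read the GloVe file with numpy; it reuses model_name and num_features from the snippet above and is an illustration, not the snippet's own code.

# Alternative loader sketch: parse each GloVe line into a numpy array instead of
# converting element by element; model_name and num_features come from the code above.
import numpy as np

embedding_model = {}
with open(model_name, 'r') as f:
    for line in f:
        parts = line.rstrip().split(' ')
        vec = np.asarray(parts[1:], dtype='float32')
        assert len(vec) == num_features
        embedding_model.setdefault(parts[0], vec)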
Exemple #32
0
from data_helpers import _load_data as load_data
from data_helpers import _load_vocab as load_vocab
from data_helpers import batch_iter
import numpy as np
import datetime
import os
import json
import time
import config


if __name__ == '__main__':

    # Load the training data.
    qid, que_word, que_char, pos_rel_name, pos_rel_word, pos_rel_char, \
    neg_rel_name, neg_rel_word, neg_rel_char = load_data(config.TRAIN_PATH)
    qid_dev, que_word_dev, que_char_dev, pos_rel_name_dev, pos_rel_word_dev, pos_rel_char_dev, \
    neg_rel_name_dev, neg_rel_word_dev, neg_rel_char_dev = load_data(config.TEST_PATH)
    # Create the word2id dictionaries of questions and relations.
    que_vocab, rel_vocab = load_vocab(config.DICT_DIR)

    print('Size of question vocab : {}'.format(len(que_vocab)))
    print('Size of relation vocab : {}'.format(len(rel_vocab)))

    # Change to pytorch Variable.
    que_word = prepare_sequence(que_word, config.MAX_QUESTION_LENGTH, que_vocab)
    que_char = prepare_sequence(que_char, config.MAX_QUESTION_CHAR_LEVEL_LENGTH, que_vocab)
    pos_rel_name = prepare_sequence(pos_rel_name, config.MAX_RELATION_LEVEL_LENGTH, rel_vocab)
    neg_rel_name = prepare_sequence(neg_rel_name, config.MAX_RELATION_LEVEL_LENGTH, rel_vocab)
    pos_rel_word = prepare_sequence(pos_rel_word, config.MAX_WORD_LEVEL_LENGTH, rel_vocab)
    neg_rel_word = prepare_sequence(neg_rel_word, config.MAX_WORD_LEVEL_LENGTH, rel_vocab)
                        "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):  # .items() for Python 3
    print("{:25s}={}".format(attr.upper(), value.value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x, y, vocabulary, vocabulary_inv = data_helpers.load_data(
    '../data/dftt/shuf.txt')

## Randomly shuffle data
#np.random.seed(10)
#shuffle_indices = np.random.permutation(np.arange(len(y)))
#x_shuffled = x[shuffle_indices]
#y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation (see the stratified K-fold sketch below)

x_shuffled, y_shuffled = x, y
num_test = int(np.round(x.shape[0] * 0.1))
x_train, x_dev = x_shuffled[:-num_test], x_shuffled[-num_test:]
y_train, y_dev = y_shuffled[:-num_test], y_shuffled[-num_test:]
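The TODO above calls the tail-slice split crude; here is a hedged sketch of a stratified K-fold alternative, assuming scikit-learn is available and that y is one-hot encoded (neither is stated in the snippet).

# Hedged alternative to the crude tail split (scikit-learn and one-hot y assumed).
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
for fold, (train_idx, dev_idx) in enumerate(skf.split(x, y.argmax(axis=1))):
    x_train, x_dev = x[train_idx], x[dev_idx]
    y_train, y_dev = y[train_idx], y[dev_idx]
    # train and evaluate one model per fold here, then average the dev metrics
    break  # or keep a single fold if full cross-validation is too expensive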
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load data. Load your own data here
print("Loading data...")
x_test, y_test, vocabulary, vocabulary_inv = data_helpers.load_data(datfile = FLAGS.eval_filename)
y_test = np.argmax(y_test, axis=1)
print("Vocabulary size: {:d}".format(len(vocabulary)))
print("Test set size {:d}".format(len(y_test)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
Exemple #35
0
@author: Petrus
"""

from keras.layers import Input, Dense, Embedding, merge, Convolution2D, MaxPooling2D, Dropout
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn
from keras.layers.core import Reshape, Flatten
from keras.callbacks import ModelCheckpoint, TensorBoard
from data_helpers import load_data
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Conv1D, LSTM, Bidirectional
from sklearn.manifold import TSNE

print('Loading data')
x, y, vocabulary, vocabulary_inv = load_data()

X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

sequence_length = x.shape[1]
vocabulary_size = len(vocabulary_inv)
embedding_dim = 256
filter_sizes = [3, 4, 5]
num_filters = 512
drop = 0.5

nb_epoch = 100
batch_size = 30
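The snippet above stops right after the hyper-parameters, so here is a minimal sketch of how these values are typically wired into a Kim-style CNN with the Keras functional API. It uses Keras 2 layer names (which differ from the Keras 1 imports in the snippet) and assumes y_train is one-hot encoded; it is not the snippet's own model.

# Minimal sketch: Kim-style CNN built from the hyper-parameters above (Keras 2 names).
from keras.layers import Input, Embedding, Reshape, Conv2D, MaxPool2D, Concatenate, Flatten, Dropout, Dense
from keras.models import Model
from keras.optimizers import Adam

inputs = Input(shape=(sequence_length,), dtype='int32')
embedded = Embedding(vocabulary_size, embedding_dim, input_length=sequence_length)(inputs)
reshaped = Reshape((sequence_length, embedding_dim, 1))(embedded)

pooled = []
for fsz in filter_sizes:
    conv = Conv2D(num_filters, kernel_size=(fsz, embedding_dim), activation='relu')(reshaped)
    pooled.append(MaxPool2D(pool_size=(sequence_length - fsz + 1, 1))(conv))

merged = Flatten()(Concatenate(axis=1)(pooled))
outputs = Dense(y_train.shape[1], activation='softmax')(Dropout(drop)(merged))

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch,
          validation_data=(X_test, y_test))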
Exemple #36
0
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
FLAGS.batch_size
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):  # .items() for Python 3
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparatopn
# Data Preparation
# ==================================================
# Load data
print("Loading data...")
x_, y_, vocabulary, vocabulary_inv, test_size = data_helpers.load_data(FLAGS.vn)

print("Loading pre-trained vectors...")
trained_vecs = data_helpers.load_trained_vecs(
    FLAGS.vn, FLAGS.vn_embeddings, FLAGS.en_embeddings, vocabulary)

# Create embedding lookup table
count = data_helpers.add_unknown_words(trained_vecs, vocabulary)
embedding_mat = [trained_vecs[p] for i, p in enumerate(vocabulary_inv)]
embedding_mat = np.array(embedding_mat, dtype = np.float32)

# Randomly shuffle data
x, x_test = x_[:-test_size], x_[-test_size:]
y, y_test = y_[:-test_size], y_[-test_size:]
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
Exemple #37
0
from keras.callbacks import EarlyStopping, TensorBoard
from sklearn.utils import shuffle
from cnn_radical_two.attention_layer import Attention_layer
import numpy as np
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

config = tf.ConfigProto()
config.gpu_options.allocator_type = 'BFC'
config.gpu_options.per_process_gpu_memory_fraction = 0.3
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

print('Loading data')

x, y, embeddings_matrix, x_eval, y_eval, x_eval_raw = load_data()
x_pinyin, y_pinyin, embeddings_matrix_3, x_eval_pinyin, y_eval_pinyin = load_pinyin_data(
)
x_radical, y, embeddings_matrix_2 = load_pianpang_data()

# x_pinyin = x_pinyin + x_radical
x, x_pinyin, x_radical, y = shuffle(x, x_pinyin, x_radical, y, random_state=0)

dev_sample_index = -1 * int(0.1 * float(len(y)))
X_train, X_test = x[:dev_sample_index], x[dev_sample_index:]
y_train, y_test = y[:dev_sample_index], y[dev_sample_index:]

x_train_radical, X_test_radical = x_radical[:dev_sample_index], x_radical[
    dev_sample_index:]

x_train_pinyin, X_test_pinyin = x_pinyin[:dev_sample_index], x_pinyin[
    dev_sample_index:]
            self.wfile.write("once" + pipe_name + self.windex)
            self.wfile.flush()
            os.write(self.pipeout, serialized.encode('string_escape') + "\n")
         else:
            self.wfile.write("once" + serialized + "end")
            self.wfile.flush()
         print("[%s] Written and sent..." % self.windex, file=dump)
         dump.flush()
      except Exception as e:
         print("[%s] Error sending. Attempt to remake connection..." % self.windex, file=dump)
         print(e, file=dump)
         self.iteration += 1
         self.make_connection(req=True)
         
   def wait_msg(self):
      rcv = ""
      while True:
         rcv = self.sock.recv(10)
         if rcv is not "" and rcv is not None:
            break
      
if __name__ == "__main__":
   # TODO: Gather test data not from the ends of the files, which are always positive instances (see the shuffle sketch after this snippet).
   worker_index = int(sys.argv[len(sys.argv) - 1])
   tr_data = data_helpers.load_data(sys.argv[1:len(sys.argv)-1])
   psize = len(tr_data)//num_workers
   tr_data_split = tr_data[psize*worker_index:psize*worker_index+psize]
   del tr_data
   client = TensorSlurmWorker(batch_sz, websocket_port, worker_index)
   client.train_partition(tr_data_split)
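The TODO at the top of this __main__ block notes that data taken from the ends of the files is always positive. One hedged way to address it (an assumption, not the author's fix) is to shuffle the loaded examples before slicing the per-worker partitions:

# Sketch: shuffle with a fixed seed before computing psize / tr_data_split above,
# so no worker or held-out slice sees only positive instances.
import random
random.seed(0)
random.shuffle(tr_data)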
Exemple #39
0
sequence_length = 56
embedding_dim = 20
filter_sizes = (3, 4)
num_filters = 3
dropout_prob = (0.25, 0.5)
hidden_dims = 100
batch_size = 32
num_epochs = 200
val_split = 0.1
min_word_count = 1
contwi = 15  # context window size
Act = "prediction"
weights = "model/weight_file"

x, y, vocab, vocab_inv = data_helpers.load_data()

if model_variation == 'CNN-non-static':
    embedding_weights = train_word2vec(x, vocab_inv, embedding_dim,
                                       min_word_count, contwi)
elif model_variation == 'CNN-static':
    embedding_weights = train_word2vec(x, vocab_inv, embedding_dim,
                                       min_word_count, contwi)
    x = embedding_weights[0][x]
elif model_variation == 'CNN-rand':
    embedding_weights = None
else:
    print('No choice selected')
print(model_variation)

data_in = Input(shape=(sequence_length, embedding_dim))
Exemple #40
0
def main(_):

    #  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
    #  list_ = []
    #  for line in open("/mnt/ds3lab/litian/input_data/cifar10/label3.txt"):
    #      list_.append(['a', line.strip('\n')])
    #  classes = np.array(list_)
    #  print (len(classes))

    #  train_dataset, mean, std = create_train_datasets(classes[:, 1], num_samples=NUM_IMAGES)
    #  val_dataset = create_test_datasets(classes[:, 1], mean, std, num_samples=NUM_IMAGES)

    #  val_images, val_labels = val_dataset.next_batch(20)

    #  num_classes = len(classes)
    #  print (num_classes)
    data_sets = data_helpers.load_data()

    #  with tf.device('/gpu:0'):

    # Create the model
    x = tf.placeholder(tf.float32, shape=[None, 3072])
    y_ = tf.placeholder(tf.int64, shape=[None])

    w1 = tf.get_variable(name='w1',
                         shape=[3072, 240],
                         initializer=tf.truncated_normal_initializer(
                             stddev=1.0 / np.sqrt(float(3072))),
                         regularizer=tf.contrib.layers.l2_regularizer(0.1))
    b1 = tf.Variable(tf.zeros([240]))
    h1 = tf.nn.relu(tf.matmul(x, w1) + b1)

    w2 = tf.get_variable(name='w2',
                         shape=[240, 10],
                         initializer=tf.truncated_normal_initializer(
                             stddev=1.0 / np.sqrt(float(240))),
                         regularizer=tf.contrib.layers.l2_regularizer(0.1))
    b2 = tf.Variable(tf.zeros([10]))
    y = tf.matmul(h1, w2) + b2

    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_, logits=y))

    train_step = tf.train.GradientDescentOptimizer(0.0005).minimize(
        cross_entropy)

    correct_prediction = tf.equal(tf.argmax(y, 1), y_)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    sess = tf.Session()
    sess.run(tf.initialize_all_variables())

    zipped_data = zip(data_sets['images_train'], data_sets['labels_train'])
    batches = data_helpers.gen_batch(list(zipped_data), 128, 50000)
    for i in range(50000):
        #              batch_xs, batch_ys = mnist.train.next_batch(100)
        #   image_batch, label_batch = train_dataset.next_batch(60, random_crop=True)
        batch = next(batches)
        image_batch, label_batch = zip(*batch)
        sess.run(train_step, feed_dict={x: image_batch, y_: label_batch})

        if i % 50 == 0:
            train_accuracy = sess.run(accuracy,
                                      feed_dict={
                                          x: image_batch,
                                          y_: label_batch
                                      })
            train_loss = sess.run(cross_entropy,
                                  feed_dict={
                                      x: image_batch,
                                      y_: label_batch
                                  })
            localtime = time.asctime(time.localtime(time.time()))
            print(localtime)
            print("step %d, training accuracy %g, training loss %g" %
                  (i, train_accuracy, train_loss))
        if i % 500 == 0:
            val_accuracy = sess.run(accuracy,
                                    feed_dict={
                                        x: data_sets['images_test'],
                                        y_: data_sets['labels_test']
                                    })
            print("validation set accuracy %g" % val_accuracy)
        print('Training Word2Vec model...')
        sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix]
        embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                            size=num_features, min_count=min_word_count,
                                            window=context, sample=downsampling)

        # If we don't plan to train the model any further, calling 
        # init_sims will make the model much more memory-efficient.
        embedding_model.init_sims(replace=True)

        # Saving the model for later use. You can load it later using Word2Vec.load()
        if not exists(model_dir):
            os.mkdir(model_dir)
        print('Saving Word2Vec model \'%s\'' % split(model_name)[-1])
        embedding_model.save(model_name)

    # add unknown words
    embedding_weights = {key: embedding_model[word] if word in embedding_model else
                              np.random.uniform(-0.25, 0.25, embedding_model.vector_size)
                         for key, word in vocabulary_inv.items()}
    return embedding_weights


if __name__ == '__main__':
    import data_helpers

    print("Loading data...")
    x, _, _, vocabulary_inv_list = data_helpers.load_data()
    vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
    w = train_word2vec(x, vocabulary_inv)
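A short usage sketch, not shown in the snippet: the {index: vector} dictionary returned above is usually stacked into a matrix and handed to an Embedding layer as its initial weights (the Embedding call in the comment is illustrative only).

# Sketch: build an embedding matrix from the dict returned by train_word2vec.
import numpy as np

embedding_matrix = np.stack([w[i] for i in range(len(vocabulary_inv))])
# e.g. keras.layers.Embedding(input_dim=len(vocabulary_inv),
#                             output_dim=embedding_matrix.shape[1],
#                             weights=[embedding_matrix])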
Exemple #42
0
parser.add_argument('--ts_batch_size', type=int, default=32, help='Batch size for training')
parser.add_argument('--learning_rate', type=float, default=1e-3, help='Learning rate for training')
parser.add_argument('--start_from', type=str, default='save', help='')

parser.add_argument('--num_compressed_capsule', type=int, default=128, help='The number of compact capsules')
parser.add_argument('--dim_capsule', type=int, default=16, help='The number of dimensions for capsules')

parser.add_argument('--re_ranking', type=int, default=200, help='The number of re-ranking size')

import json
args = parser.parse_args()
params = vars(args)
print(json.dumps(params, indent = 2))

X_trn, Y_trn, Y_trn_o, X_tst, Y_tst, Y_tst_o, vocabulary, vocabulary_inv = data_helpers.load_data(args.dataset,
                                                                                max_length=args.sequence_length,
                                                                                vocab_size=args.vocab_size)
Y_trn = Y_trn.toarray()
Y_tst = Y_tst.toarray()

X_trn = X_trn.astype(np.int32)
X_tst = X_tst.astype(np.int32)
Y_trn = Y_trn.astype(np.int32)
Y_tst = Y_tst.astype(np.int32)


embedding_weights = load_word2vec('glove', vocabulary_inv, args.vec_size)
args.num_classes = Y_trn.shape[1]

capsule_net = CapsNet_Text(args, embedding_weights)
capsule_net = nn.DataParallel(capsule_net).cuda()
Exemple #43
0
    num_epochs = 7
    val_split = 0.1

    # Word2Vec parameters, see train_word2vec
    min_word_count = 1  # Minimum word count
    context = 10        # Context window size

    # Data Preparation
    # ==================================================
    #
    # Load data
    print("Loading data...")
    neg_train_path = './data/imdb_train.neg'
    pos_train_path = './data/imdb_train.pos'

    x, y, vocabulary, vocabulary_inv = data_helpers.load_data(pos_train_path,neg_train_path)

    if model_variation=='CNN-non-static' or model_variation=='CNN-static':
        embedding_weights = train_word2vec(x, vocabulary_inv, model_variation, embedding_dim, min_word_count, context)
        if model_variation=='CNN-static':
            x = embedding_weights[0][x]
    elif model_variation=='CNN-rand':
        embedding_weights = None
    else:
        raise ValueError('Unknown model variation')

    # Shuffle data
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices].argmax(axis=1)
Exemple #44
0
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    sys.stdout = old_stdout
    log_file.close()


if __name__ == '__main__':
    os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

    # Data Preparation
    print("Load data...")
    x_train, y_train, x_test, y_test, vocabulary_inv = load_data()

    if sequence_length != x_test.shape[1]:
        print("Adjusting sequence length for actual size")
        sequence_length = x_test.shape[1]

    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)
    print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

    model = create_model(x_train=x_train,
                         x_test=x_test,
                         vocabulary_inv=vocabulary_inv,
                         model_type=model_type,
                         embedding_dim=embedding_dim,
                         min_word_count=min_word_count,
Exemple #45
0
from __future__ import print_function

import numpy as np
import tensorflow as tf
import time
import data_helpers

beginTime = time.time()
# Parameter definitions

batch_size = 100
learning_rate = 0.005
max_steps = 1000

# Prepare data
data_sets = data_helpers.load_data()

# Define input placeholders
images_placeholder = tf.placeholder(tf.float32, shape=[None, 3072])
labels_placeholder = tf.placeholder(tf.int64, shape=[None])

# Define variables (these are the values we want to optimize)
weights = tf.Variable(tf.zeros([3072, 10]))
biases = tf.Variable(tf.zeros([10]))

# Define the classifier's result
logits = tf.matmul(images_placeholder, weights) + biases

# Define the loss function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels_placeholder, logits=logits))
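The snippet cuts off after the loss, so here is a minimal sketch of how such a TF 1.x softmax classifier is usually finished; the optimizer, accuracy node, and the use of data_helpers.gen_batch are assumptions modelled on the longer CIFAR example earlier on this page.

# Sketch only: optimizer, accuracy, and a small training loop for the classifier above.
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
correct = tf.equal(tf.argmax(logits, 1), labels_placeholder)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    zipped = list(zip(data_sets['images_train'], data_sets['labels_train']))
    batches = data_helpers.gen_batch(zipped, batch_size, max_steps)  # assumed to mirror the earlier example
    for step in range(max_steps):
        image_batch, label_batch = zip(*next(batches))
        sess.run(train_step, feed_dict={images_placeholder: image_batch,
                                        labels_placeholder: label_batch})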
Exemple #46
0
    word_index = dict()
    for line in open(filename, "r"):
        datas = line.strip().split("\t")
        if len(datas) < 2:
            continue
        index = int(datas[0])
        word = datas[1]
        word_index[word] = index
    return word_index


word_index = load_wordindex("./data/word.tsv")
sent_sos_id = word_index["<s>"]
sent_eos_id = word_index["</s>"]

train_data = data_helpers.load_data(
    open(FLAGS.traindata_file, "r").readlines(), word_index)
test_data = data_helpers.load_data(
    open(FLAGS.testdata_file, "r").readlines(), word_index)

# Training
logging.info("logging test")

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        model = Seq2seq(
            max_sequence_len=FLAGS.max_sequence_len,
            embedding_size=FLAGS.embedding_dim,
Exemple #47
0
import numpy as np
np.random.seed(1337)
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import train_test_split
from data_helpers import load_data
import pandas as pd

print('Loading data')
x, y, vocabulary, vocabulary_inv, sentences_padded = load_data()

X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)
#X_train, X_val, y_train, y_val = train_test_split( X_train, y_train, test_size=0.2, random_state=42)
df_x = pd.DataFrame(X_test)
df_y = pd.DataFrame(y_test)
df_x.to_csv(
    r'C:\Users\varsha.vishwakarma\Documents\CNN-text-classification\test_input_dataset_jd.csv',
    header=False,
    index=False)
df_y.to_csv(
    r'C:\Users\varsha.vishwakarma\Documents\CNN-text-classification\test_output_dataset_jd.csv',
    header=False,
    index=False)
sequence_length = x.shape[1]  # 56
vocabulary_size = len(vocabulary_inv)  # 18765
Exemple #48
0
eps user interface system
system human system eps
user response time
trees
graph trees
graph minors trees
graph minors survey
I like graph and stuff
I like trees and stuff
Sometimes I build a graph
Sometimes I build trees""").split("\n")

corpus = [sent.split() for sent in test_corpus]
'''


print("Loading data...")
x, y = data_helpers.load_data()
corpus = x + y
# Split train/test set


model = glove_optimizer.GloVeModel(embedding_size=10, context_size=3, min_occurrences=0,
                            learning_rate=0.05, batch_size=2)
model.fit_to_corpus(corpus)

model.train(num_epochs=1, log_dir="log/example", summary_batch_interval=10000, tsne_epoch_interval=1)

#print model.embedding_for("graph")
#print model.embeddings()
#print(model.embeddings()[model.id_for_word('graph')])
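Assuming the commented-out calls above reflect the real GloVeModel API (embedding_for, embeddings, id_for_word), a small sketch of how the learned vectors could be inspected:

# Sketch (GloVeModel methods assumed from the comments above): rank words by
# cosine similarity to "graph" in the learned embedding space.
import numpy as np

emb = np.asarray(model.embeddings())
query = emb[model.id_for_word("graph")]
sims = np.dot(emb, query) / (np.linalg.norm(emb, axis=1) * np.linalg.norm(query) + 1e-8)
print(sims.argsort()[::-1][:5])  # ids of the five nearest neighbours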
Exemple #49
0
import numpy as np
import keras
from keras.models import Sequential, Graph
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.recurrent import LSTM
from keras.utils import np_utils, generic_utils
import data_helpers
from w2v import train_word2vec
from sklearn.cross_validation import StratifiedKFold

cnn_train,lstm_train,Y_train,cnn_vocabulary,cnn_vocabulary_inv,lstm_vocabulary,lstm_vocabulary_inv = data_helpers.load_data()
cnn_embedding_weights,lstm_embedding_weights = train_word2vec(cnn_train, cnn_vocabulary_inv,lstm_train,lstm_vocabulary_inv)
#cnn_train=cnn_embedding_weights[0][cnn_train]
#lstm_train=lstm_embedding_weights[0][lstm_train]

shuffle_indices = np.random.permutation(np.arange(len(Y_train)))
cnn_shuffled = cnn_train[shuffle_indices]
lstm_shuffled = lstm_train[shuffle_indices]
Y_train = Y_train[shuffle_indices]
#Y_train_f=np_utils.to_categorical(Y_train,27)

filter_sizes = (3, 4)
num_filters = 150
hidden_dims = 150

cnn_graph = Graph()
cnn_graph.add_input(name='input', input_shape=(32, 300))
for fsz in filter_sizes:
	conv = Convolution1D(nb_filter=num_filters,filter_length=fsz,border_mode='valid',activation='relu',subsample_length=1)
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x, y, vocabulary, vocabulary_inv = data_helpers.load_data()
# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
data_size = len(y_shuffled)
# Split train/test set
# TODO: This is very crude, should use cross-validation
# integer division keeps the slice index an int on Python 3
x_train, x_dev = x_shuffled[:-data_size // 4], x_shuffled[-data_size // 4:]
y_train, y_dev = y_shuffled[:-data_size // 4], y_shuffled[-data_size // 4:]
print("Vocabulary Size: {:d}".format(len(vocabulary)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))


# Training
def initiate(args):
    # define output directory
    time_str = datetime.datetime.now().isoformat()
    out_dir = os.path.abspath(os.path.join(os.path.curdir, args.save_dir, time_str))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # initiate logger
    log_file_path = os.path.join(out_dir, 'log')
    logger = Logger(log_file_path)
    analysis_file_path = os.path.join(out_dir, 'analysis')
    analysis_logger = Logger(analysis_file_path)

    # report parameters
    logger.write("\nParameters:")
    for arg in args.__dict__:
        logger.write("{}={}".format(arg.upper(), args.__dict__[arg]))
    logger.write("")

    # load data
    logger.write("Loading data...")
    if args.data == 'gameforum':
        x_train, y_train, x_dev, y_dev, vocabulary, vocabulary_inv, vocabulary_embedding = data_helpers.load_data_gameforum_only(args.use_pretrained_embedding);
    elif args.data == 'semeval':
        x_train, y_train, x_dev, y_dev, vocabulary, vocabulary_inv, vocabulary_embedding = data_helpers.load_data_semeval_only(args.use_pretrained_embedding)
    else:
        x_train, y_train, x_dev, y_dev, vocabulary, vocabulary_inv, vocabulary_embedding = data_helpers.load_data(args.use_pretrained_embedding)
    num_classes = len(y_train[0])

    # report
    logger.write("Vocabulary Size: {:d}".format(len(vocabulary)))
    logger.write("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

    # fill out missing arg values
    args.seq_length = x_train.shape[1]
    args.vocab_size = len(vocabulary)
    args.filter_sizes = list(map(int, args.filter_sizes.split(",")))  # list() so it can be reused on Python 3
    args.vocabulary_embedding = vocabulary_embedding
    args.num_classes = num_classes

    # initialize a model
    if args.model == 'deep':
        model = DeepCNN(args)
    elif args.model == 'basic':
        model = BasicCNN(args)
    else:
        logger.write("Invalid model")
        sys.exit()

    # for train summary
    grad_summaries = []
    for g, v in model.grads_and_vars:
        if g is not None:
            grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
            sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
            grad_summaries.append(grad_hist_summary)
            grad_summaries.append(sparsity_summary)
    grad_summaries_merged = tf.merge_summary(grad_summaries)

    loss_summary = tf.scalar_summary("loss", model.loss)
    acc_summary = tf.scalar_summary("accuracy", model.accuracy)

    train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")

    # prepare saver
    checkpoint_dir = os.path.join(out_dir, 'checkpoints')
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    saver = tf.train.Saver(tf.all_variables())

    # generate batches
    batches = data_helpers.batch_iter(x_train, y_train, args.batch_size, args.num_epochs)

    # define train / test methods
    def train_model(x, y, dropout_prob, writer, log=False):
        feed_dict = {
          model.input_x: x,
          model.input_y: y,
          model.dropout_keep_prob: dropout_prob
        }
        _, step, loss, accuracy, summaries = sess.run(
            [model.train_op, model.global_step, model.loss, model.accuracy, train_summary_op],
            feed_dict)
        sess.run(model.weight_rescaling_op)  # l2 norm rescaling
        writer.add_summary(summaries, step)
        if log:
            time_str = datetime.datetime.now().isoformat()
            logger.write("{}: step {}, loss {:g}, acc {:g}".format(time_str, step-1, loss, accuracy))

    def test_model(x, y):
        logger.write("\nEvaluate:")
        feed_dict = {
          model.input_x: x,
          model.input_y: y,
          model.dropout_keep_prob: 1.0
        }
        step, loss, accuracy, predictions, targets = sess.run(
                [model.global_step, model.loss, model.accuracy, model.predictions, model.targets],
                feed_dict)
        time_str = datetime.datetime.now().isoformat()
        logger.write("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
        logger.write("")
        return accuracy, predictions, targets

    # start a session
    sess_conf = tf.ConfigProto(
            allow_soft_placement=args.allow_soft_placement,
            log_device_placement=args.log_device_placement)
    sess = tf.Session(config=sess_conf)
    with sess.as_default():
        # initialize
        tf.initialize_all_variables().run()
        train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph_def)
        current_step = 0

        if args.train:  # train the model from scratch
            best_test_accuracy = 0.0
            for x_batch, y_batch in batches:
                # train
                train_model(x_batch, y_batch, args.dropout_keep_prob, train_summary_writer,
                            current_step % (args.evaluate_every/4) == 0)
                current_step = tf.train.global_step(sess, model.global_step)

                # evaluate with dev set
                if current_step % args.evaluate_every == 0:
                    accuracy, predictions, targets = test_model(x_dev, y_dev)

                    # Conduct analysis if the current model is the best so far
                    if accuracy > best_test_accuracy:
                        best_test_accuracy = accuracy
                        analysis_logger.write("Analysis at {}: acc={}".format(current_step, accuracy), begin=True)
                        analysis_logger.write("Tweet\tPred\tTrue (0=Positive, 1=Neutral, 2=Negative)")
                        for i in range(len(x_dev)):
                            tweet_idx = x_dev[i]
                            prediction, true_label = predictions[i], targets[i]
                            try:
                                tweet = " ".join([vocabulary_inv[word_idx] for word_idx in tweet_idx if word_idx != 0])
                                analysis_logger.write("{}\t{}\t{}".format(tweet, prediction, true_label))
                            except UnicodeEncodeError:
                                analysis_logger.write("{}\t{}\t{}".format("ENCODING ERROR", prediction, true_label))
                        analysis_logger.write("\n")

                # save model
                if current_step % args.checkpoint_every == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    logger.write("Saved model checkpoint to {}\n".format(path))

        else:  # load the model
            logger.write("Loading the model...")
Exemple #52
0
def main(warmup_iterations, num_epochs, files, testfile):
   tr_data = data_helpers.load_data(files)
   test_data = data_helpers.load_data([testfile])
   warmup_data = None
   start_parameter_server(model=model, num_epochs=num_epochs, warmup_data=warmup_data, test_data=test_data)
Exemple #53
0
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nПараметры:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load data. Load your own data here
print("Loading data...")
x_test, y_test, vocabulary, vocabulary_inv = data_helpers.load_data()
y_test = np.argmax(y_test, axis=1)
print("Размер словаря: {:d}".format(len(vocabulary)))
print("Размер тестового набора {:d}".format(len(y_test)))

print("\nОценка...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
Exemple #54
0
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
FLAGS.batch_size
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):  # .items() for Python 3
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================
# Load data
print("Loading data...")
x_, y_, vocabulary, vocabulary_inv, test_size = data_helpers.load_data(
    FLAGS.vn)

print("Loading pre-trained vectors...")
trained_vecs = data_helpers.load_trained_vecs(FLAGS.vn, FLAGS.vn_embeddings,
                                              FLAGS.en_embeddings, vocabulary)

# Create embedding lookup table
count = data_helpers.add_unknown_words(trained_vecs, vocabulary)
embedding_mat = [trained_vecs[p] for i, p in enumerate(vocabulary_inv)]
embedding_mat = np.array(embedding_mat, dtype=np.float32)

# Randomly shuffle data
x, x_test = x_[:-test_size], x_[-test_size:]
y, y_test = y_[:-test_size], y_[-test_size:]
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
Exemple #55
0
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
FLAGS.batch_size
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):  # .items() for Python 3
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================
# Load data
print("Loading data...")
x_, y_, vocabulary, vocabulary_inv, test_size,all_label = data_helpers.load_data()

print("Loading pre-trained vectors...")
trained_vecs = data_helpers.load_trained_vecs(FLAGS.en_embeddings, vocabulary)

# Create embedding lookup table
count = data_helpers.add_unknown_words(trained_vecs, vocabulary)
embedding_mat = [trained_vecs[p] for i, p in enumerate(vocabulary_inv)]
embedding_mat = np.array(embedding_mat, dtype = np.float32)

# Randomly shuffle data
x, x_test = x_[:-test_size], x_[-test_size:]
y, y_test = y_[:-test_size], y_[-test_size:]
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
Exemple #56
0
import data_helpers
from text_cnn import TextCNN
import math

import re
import itertools
from collections import Counter

import random
import nltk
import collections
import word2vec
from text_rnn import TextRNN

# Load training data
x_train, y_train, vocabulary, vocabulary_inv = data_helpers.load_data()
#y_test = np.argmax(y_test, axis=1)
vocab_size = len(vocabulary)
print("Vocabulary size: {:d}".format(vocab_size))
print("Test set size {:d}".format(len(y_train)))

# Generate word embeddings
data = x_train.flatten()
iterations = 1000
data = data[data != 468]  # drop token id 468 (presumably a padding or filler index)
w2v = word2vec.word2vec(data,vocabulary,vocabulary_inv,vocab_size,iterations)
final_embeddings = w2v.runWord2Vec()

x_train = np.fliplr(x_train)
fullTrain = np.concatenate((x_train,y_train),axis=1)
shuffledTrain = np.random.permutation(fullTrain)
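A hedged sketch of the natural next step (an assumption, not shown in the snippet): split the shuffled concatenation back into inputs and labels, using x_train's width as the split point.

# Sketch: recover inputs and labels from the shuffled concatenation above.
n_features = x_train.shape[1]
x_shuffled = shuffledTrain[:, :n_features]
y_shuffled = shuffledTrain[:, n_features:]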