def train_without_pretrained_embedding():
    x, y, vocab, vocab_inv = data_helpers.load_data()
    vocab_size = len(vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/dev set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('dev shape:', x_dev.shape)
    print('vocab_size', vocab_size)

    batch_size = 50
    num_embed = 300
    sentence_size = x_train.shape[1]
    print('batch size', batch_size)
    print('sentence max words', sentence_size)
    print('embedding size', num_embed)

    cnn_model = setup_cnn_model(mx.gpu(0), batch_size, sentence_size, num_embed,
                                vocab_size, dropout=0.5, with_embedding=False)
    train_cnn(cnn_model, x_train, y_train, x_dev, y_dev, batch_size)
def load_data(data_source):
    assert data_source in ["keras_data_set", "local_dir"], "Unknown data source"
    if data_source == "keras_data_set":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words,
                                                              start_char=None,
                                                              oov_char=None,
                                                              index_from=None)

        x_train = sequence.pad_sequences(x_train, maxlen=sequence_length,
                                         padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=sequence_length,
                                        padding="post", truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"
    else:
        x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
        vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
        y = y.argmax(axis=1)

        # Shuffle data
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x = x[shuffle_indices]
        y = y[shuffle_indices]
        train_len = int(len(x) * 0.9)
        x_train = x[:train_len]
        y_train = y[:train_len]
        x_test = x[train_len:]
        y_test = y[train_len:]

    return x_train, y_train, x_test, y_test, vocabulary_inv
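# Hedged sketch (not from the original repo): a minimal driver for the load_data()
# above, assuming max_words and sequence_length are module-level constants defined
# elsewhere in the script.
if __name__ == "__main__":
    x_train, y_train, x_test, y_test, vocabulary_inv = load_data("keras_data_set")
    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)
    print("Vocabulary size:", len(vocabulary_inv))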
def data_iter(batch_size, num_embed, pre_trained_word2vec=False):
    """Construct data iter

    Parameters
    ----------
    batch_size: int
    num_embed: int
    pre_trained_word2vec: boolean
        identify the pre-trained layers or not
    Returns
    ----------
    train_set: DataIter
        Train DataIter
    valid: DataIter
        Valid DataIter
    sentences_size: int
        array dimensions
    embedded_size: int
        array dimensions
    vocab_size: int
        array dimensions
    """
    print('Loading data...')
    if pre_trained_word2vec:
        word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
        x, y = data_helpers.load_data_with_word2vec(word2vec)
        # reshape for convolution input
        x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2]))
        embedded_size = x.shape[-1]
        sentences_size = x.shape[2]
        vocabulary_size = -1
    else:
        x, y, vocab, vocab_inv = data_helpers.load_data()
        embedded_size = num_embed
        sentences_size = x.shape[1]
        vocabulary_size = len(vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/valid set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Valid split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('valid shape:', x_dev.shape)
    print('sentence max words', sentences_size)
    print('embedding size', embedded_size)
    print('vocab size', vocabulary_size)

    train_set = mx.io.NDArrayIter(
        x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(
        x_dev, y_dev, batch_size)
    return train_set, valid, sentences_size, embedded_size, vocabulary_size
def run(self):
    input_data, label_data = data_helpers.load_data()

    n_steps = 3
    n_input = 1
    n_classes = 1
    n_hidden = self.n_hidden
    batch_size = self.batch_size
    training_iters = self.training_iters
    display_step = self.display_step
    checkpoint_step = self.checkpoint_step

    # batches = data_helpers.batch_gen(zip(input_data, label_data), 2)
    # for batch in batches:
    #     x_batch, y_batch = zip(*batch)
    #     print('-' * 50)
    #     print(x_batch)
    #     print(y_batch)

    new_rnn = RNN(n_steps=n_steps, n_input=n_input, n_hidden=self.n_hidden, n_classes=n_classes)

    global_step = tf.Variable(0, name="global_step", trainable=False)
    # Adam Optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(
        new_rnn.cost, global_step=global_step)
    # optimizer = tf.train.AdamOptimizer(self.learning_rate)
    # grads_and_vars = optimizer.compute_gradients(new_rnn.cost)
    # train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    # Checkpoint directory. TensorFlow assumes this directory already exists, so we need to create it.
    checkpoint_dir = os.path.join(self.out_dir, "checkpoints")
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver = tf.train.Saver(tf.all_variables(), max_to_keep=1)

    init = tf.initialize_all_variables()
    with tf.Session() as sess:
        sess.run(init)

        def train_step(_x_batch, _y_batch):
            feed_dict = {new_rnn.x: _x_batch,
                         new_rnn.y: _y_batch,
                         new_rnn.istate: np.zeros((batch_size, 2 * n_hidden))}
            _, step = sess.run([optimizer, global_step], feed_dict=feed_dict)
            step = tf.train.global_step(sess, global_step)
            if step % display_step == 0:
                # Calculate batch accuracy
                acc = sess.run(new_rnn.accuracy, feed_dict=feed_dict)
                # Calculate batch loss
                loss = sess.run(new_rnn.cost, feed_dict=feed_dict)
                # print("Iter " + str(step * batch_size) + ", Minibatch Loss= " + "{:.6f}".format(loss) +
                #       ", Training Accuracy= " + "{:.5f}".format(acc))
                print("Iter " + str(step * batch_size) + ", Minibatch Loss= " + str(loss) +
                      ", Training Accuracy= " + str(acc))
            if step % checkpoint_step == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=step)
                print("Saved model checkpoint to {}\n".format(path))

        batches = data_helpers.batch_iter(zip(input_data, label_data), batch_size, training_iters)
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            x_batch = np.array(x_batch).reshape((batch_size, n_steps, n_input))
            train_step(x_batch, y_batch)
def read_from_dataset(dataset_path, word2vec_model_path, n_classes, max_seq_len_cutoff):
    print("Loading data...")
    x, y, seq_max_len, vocabulary, vocabulary_inv, word2vec_vocab, word2vec_vec = data_helpers.load_data(
        dataset_path, word2vec_model_path, n_classes, max_seq_len_cutoff)
    myvocab_size = len(vocabulary)

    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.3, random_state=42)

    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    print("max_seq_len is: ", seq_max_len)

    return x_train, x_dev, y_train, y_dev, seq_max_len, vocabulary, vocabulary_inv, word2vec_vocab, word2vec_vec
def read_from_dataset(dataset_path, word2vec_model_path, n_classes, max_seq_len_cutoff):
    print("Loading data...")
    x, y, seq_max_len, vocabulary, vocabulary_inv, word2vec_vocab, word2vec_vec = data_helpers.load_data(
        dataset_path, word2vec_model_path, n_classes, max_seq_len_cutoff)

    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.3, random_state=42)
    if FLAGS.is_evaluation:
        x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=0.2, random_state=27)
        FLAGS.evaluate_every = 1

    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    print("max_seq_len is: ", seq_max_len)

    return x_train, x_dev, y_train, y_dev, seq_max_len, vocabulary, vocabulary_inv, word2vec_vocab, word2vec_vec
def data_iter(batch_size, num_embed, pre_trained_word2vec=False):
    print('Loading data...')
    if pre_trained_word2vec:
        word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
        x, y = data_helpers.load_data_with_word2vec(word2vec)
        # reshape for convolution input
        x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2]))
        embed_size = x.shape[-1]
        sentence_size = x.shape[2]
        vocab_size = -1
    else:
        x, y, vocab, vocab_inv = data_helpers.load_data()
        embed_size = num_embed
        sentence_size = x.shape[1]
        vocab_size = len(vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/valid set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Valid split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('valid shape:', x_dev.shape)
    print('sentence max words', sentence_size)
    print('embedding size', embed_size)
    print('vocab size', vocab_size)

    train = mx.io.NDArrayIter(
        x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(
        x_dev, y_dev, batch_size)
    return (train, valid, sentence_size, embed_size, vocab_size)
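# Hedged sketch (assumption, not part of the original source): one way the iterators
# returned by data_iter() could be consumed with MXNet's symbolic Module API;
# 'cnn_symbol' is a hypothetical network symbol and the optimizer settings are illustrative.
def fit_example(cnn_symbol, batch_size=50, num_embed=300, num_epoch=10):
    train, valid, sentence_size, embed_size, vocab_size = data_iter(batch_size, num_embed)
    module = mx.mod.Module(symbol=cnn_symbol, context=mx.cpu())
    module.fit(train_data=train,
               eval_data=valid,
               eval_metric='acc',
               optimizer='rmsprop',
               optimizer_params={'learning_rate': 0.0005},
               num_epoch=num_epoch)
    return module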
def do_inference(hostport, work_dir, concurrency, num_tests):
    """Tests PredictionService with concurrent requests.

    Args:
        hostport: Host:port address of the PredictionService.
        work_dir: The full path of working directory for test data set.
        concurrency: Maximum number of concurrent requests.
        num_tests: Number of test images to use.

    Returns:
        The classification error rate.

    Raises:
        IOError: An error occurred processing test data set.
    """
    # print(load_data(1))
    # return
    x, y, vocabulary, vocabulary_inv = load_data(1)
    # print(x)               # ['2' '144' '1073' ..., '0' '0' '0']
    # print(y)               # [0 1]
    # print(vocabulary)      # 'breakpoints': '169715', 'shrill': '61929', '1day': '22983'
    # print(vocabulary_inv)  # ['krystyn'], ['litracey'], ['failbringer']
    # return

    # Randomly shuffle data
    np.random.seed(123)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    text_percent = FLAGS.test_data_ratio / 100.0
    test_index = int(len(x) * text_percent)
    x_train, x_test = x_shuffled[:-test_index], x_shuffled[-test_index:]
    y_train, y_test = y_shuffled[:-test_index], y_shuffled[-test_index:]

    # batches = batch_iter(zip(x_train, y_train), FLAGS.batch_size, FLAGS.epochs)
    test_batches = list(batch_iter(zip(x_test, y_test), FLAGS.batch_size, 1))

    host, port = hostport.split(':')
    channel = implementations.insecure_channel(host, int(port))
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
    result_counter = _ResultCounter(num_tests, concurrency)

    print("Testing start................: batch size: " + str(len(test_batches)))
    for batch in test_batches:
        request = predict_pb2.PredictRequest()
        request.model_spec.name = 'twitter-sentiment'
        request.model_spec.signature_name = 'predict_text'
        test_data, label = zip(*batch)

        desired_array = [int(numeric_string) for numeric_string in test_data[0]]
        desired_array = np.array(desired_array, dtype=np.int64)
        print('test_data[0]:')
        print(test_data[0])
        print('converted test data (desired_array):')
        print(desired_array)

        request.inputs['text'].CopyFrom(
            tf.contrib.util.make_tensor_proto(desired_array, shape=[1, test_data[0].size]))
        # print('request: inputs(text):')
        # print(request.inputs['text'])
        request.inputs['dropout'].CopyFrom(
            tf.contrib.util.make_tensor_proto(1.0, shape=[1, 1]))

        result_counter.throttle()
        result_future = stub.Predict.future(request, 5.0)  # 5 seconds
        result_future.add_done_callback(
            _create_rpc_callback(label[0], result_counter))
    return result_counter.get_error_rate()
import time

import numpy as np
import tensorflow as tf

import data_helpers

beginTime = time.time()

# Parameter definitions
batch_size = 100
learning_rate = 0.005
max_steps = 1000

# Uncommenting this line removes randomness
# You'll get exactly the same result on each run
# np.random.seed(1)

# Prepare data
data_sets = data_helpers.load_data()

# -----------------------------------------------------------------------------
# Prepare the TensorFlow graph
# (We're only defining the graph here, no actual calculations taking place)
# -----------------------------------------------------------------------------

# Define input placeholders
images_placeholder = tf.placeholder(tf.float32, shape=[None, 3072])
labels_placeholder = tf.placeholder(tf.int64, shape=[None])

# Define variables (these are the values we want to optimize)
weights = tf.Variable(tf.zeros([3072, 10]))
biases = tf.Variable(tf.zeros([10]))

# Define the classifier's result
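# Hedged sketch (assumption): the usual continuation of this setup -- a linear softmax
# classifier with loss, training step, and accuracy, mirroring the other CIFAR-10
# softmax examples in this collection; it continues the placeholders and variables above.
logits = tf.matmul(images_placeholder, weights) + biases

# Define the loss function and a single gradient-descent training step
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels_placeholder,
                                                   logits=logits))
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

# Compare predictions against the integer labels
correct_prediction = tf.equal(tf.argmax(logits, 1), labels_placeholder)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))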
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load data. Load your own data here
print("Loading data...")
x_test, y_test, vocabulary, vocabulary_inv = data_helpers.load_data()
y_test = np.argmax(y_test, axis=1)
print("Vocabulary size: {:d}".format(len(vocabulary)))
print("Test set size {:d}".format(len(y_test)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
OUT_DIR = os.path.abspath(os.path.join(os.path.curdir, 'output'))
RUN_ID = time.strftime('run%Y%m%d-%H%M%S')
RUN_DIR = os.path.abspath(os.path.join(OUT_DIR, RUN_ID))
LOG_FILE_PATH = os.path.abspath(os.path.join(RUN_DIR, 'log.log'))
if FLAGS.load is not None:
    CHECKPOINT_FILE_PATH = os.path.abspath(os.path.join(FLAGS.load, 'ckpt.ckpt'))
else:
    CHECKPOINT_FILE_PATH = os.path.abspath(os.path.join(RUN_DIR, 'ckpt.ckpt'))
os.mkdir(RUN_DIR)
SUMMARY_DIR = os.path.join(RUN_DIR, 'summaries')

LOG_FILE = open(LOG_FILE_PATH, 'a', buffering=1)

log('======================= START! ========================')

# Load data
x, y, vocabulary, vocabulary_inv = load_data(FLAGS.dataset_fraction)

# Randomly shuffle data
np.random.seed(123)
shuffle_indices = np.random.permutation(np.arange(len(y)))
# x_shuffled = x[shuffle_indices.tolist()]
x_shuffled = [x[idx] for idx in shuffle_indices.tolist()]
y_shuffled = y[shuffle_indices]

# Split train/test set
text_percent = FLAGS.test_data_ratio
test_index = int(len(x) * text_percent)
x_train, x_test = np.array(x_shuffled[:-test_index]), np.array(x_shuffled[-test_index:])
y_train, y_test = y_shuffled[:-test_index], y_shuffled[-test_index:]
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import train_test_split
from data_helpers import load_data

print('Loading data')
x, y, vocabulary, vocabulary_inv = load_data()
# x.shape -> (10662, 56)
# y.shape -> (10662, 2)
# len(vocabulary) -> 18765
# len(vocabulary_inv) -> 18765

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# X_train.shape -> (8529, 56)
# y_train.shape -> (8529, 2)
# X_test.shape -> (2133, 56)
# y_test.shape -> (2133, 2)

sequence_length = x.shape[1]  # 56
vocabulary_size = len(vocabulary_inv)  # 18765
embedding_dim = 256
filter_sizes = [3, 4, 5]
num_filters = 512
drop = 0.5
#!/usr/bin/python
# -*- coding:utf-8 -*-
import os
import time
import datetime

import numpy as np
import data_helpers
import tensorflow as tf
from vgg_model import vgg16

batch_size, num_epochs = 12, 50
num_classes = 2
pos_path, neg_path = "data/positive/", "data/negative/"
x_train, y_train, x_dev, y_dev = data_helpers.load_data(pos_path, neg_path)
print("load data is ok...")

with tf.Graph().as_default():
    gpu_options = tf.GPUOptions(allow_growth=True)
    session_conf = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        gpu_options=gpu_options)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        vggmodel = vgg16(
            height=data_helpers.h,
            width=data_helpers.w,
            channel=data_helpers.c,
            number_classes=num_classes
        )
def train(self, data_dir, fquery, freply, batch_size=128, steps_per_checkpoint=100):
    queries = data_helpers.load_data(data_dir, fquery, self.qmax_length)
    replies = data_helpers.load_data(data_dir, freply, self.rmax_length)

    validation_queries = data_helpers.load_data("data/validation_ADEM", "queries.txt", self.qmax_length)
    validation_replies = data_helpers.load_data("data/validation_ADEM", "hred_replies.txt", self.rmax_length)
    scores = data_helpers.load_file("data/validation_ADEM", "hred_scores.txt")
    scores = [float(score) for score in scores]
    # TODO - calculate MSE against these scores?

    data_size = len(queries)
    print_score = tf.print(self.score)

    with self.session.as_default():
        self.init_model()

        checkpoint_path = os.path.join(self.train_dir, "unref.model")
        loss = 0.0
        validation_loss = 0.0
        best_validation_loss = 1000
        prev_losses = [1.0]
        impatience = 0.0
        while True:
            step, l = self.train_step(queries, replies, data_size, batch_size)
            # KEVIN: does this train the model on the validation set? :(
            _, validation_l = self.get_validation_loss(
                validation_queries, validation_replies, len(validation_queries), batch_size)
            loss += l
            validation_loss += validation_l
            print(validation_loss)

            # save checkpoint
            if step % steps_per_checkpoint == 0:
                loss /= steps_per_checkpoint
                validation_loss /= steps_per_checkpoint
                print("global_step %d, loss %f, learning rate %f"
                      % (step, loss, self.learning_rate.eval()))

                if validation_loss < best_validation_loss:
                    best_validation_loss = validation_loss
                    impatience = 0.0
                    self.saver.save(self.session, checkpoint_path, global_step=self.global_step)
                else:
                    impatience += 1
                print("Validation loss is %f. The best loss thus far has been %f. Impatience: %f"
                      % (validation_loss, best_validation_loss, impatience))

                if loss > max(prev_losses):
                    self.session.run(self.learning_rate_decay_op)
                prev_losses = (prev_losses + [loss])[-5:]
                loss = 0.0

                self.log_writer.add_summary(self.summary, step)

                # """ Debug
                query_batch, query_sizes, idx = self.get_batch(queries, data_size, 10)
                reply_batch, reply_sizes, idx = self.get_batch(replies, data_size, 10, idx)
                input_feed = self.make_input_feed(query_batch, query_sizes,
                                                  reply_batch, reply_sizes, training=False)
                score, tests = self.session.run([self.pos_score, self.test], input_feed)
                print('-------------')
                for s, t in zip(score[:10], tests[:10]):
                    print(s, t)
def train(neg_file, pos_file, checkpoint, epoch):
    """
    Examples:
        python3 model.py --neg-file=/var/www/src/github.com/nlp/ChineseNlpCorpus/format_datasets/total_train_neg.txt --pos=/var/www/src/github.com/nlp/ChineseNlpCorpus/format_datasets/total_train_pos.txt
    """
    click.echo(click.style('Loading data...', fg='green'))
    x, y, vocabulary, vocabulary_inv = load_data(pos_file, neg_file)
    click.echo(click.style('Loading data over.', fg='green'))
    # x.shape -> (10662, 56)
    # y.shape -> (10662, 2)
    # len(vocabulary) -> 18765
    # len(vocabulary_inv) -> 18765

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    # X_train.shape -> (8529, 56)
    # y_train.shape -> (8529, 2)
    # X_test.shape -> (2133, 56)
    # y_test.shape -> (2133, 2)

    sequence_length = x.shape[1]  # 56
    vocabulary_size = len(vocabulary_inv)  # 18765
    embedding_dim = 256
    filter_sizes = [3, 4, 5]
    num_filters = 512
    drop = 0.5

    epochs = 100
    batch_size = 500  # initial 30

    if checkpoint:
        click.echo(click.style('Loading model %s...' % checkpoint, fg='green'))
        model = load_model(checkpoint)
    else:
        click.echo(click.style('Creating new model...', fg='green'))
        # this returns a tensor
        inputs = Input(shape=(sequence_length,), dtype='int32')
        embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim,
                              input_length=sequence_length)(inputs)
        reshape = Reshape((sequence_length, embedding_dim, 1))(embedding)

        conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim),
                        padding='valid', kernel_initializer='normal', activation='relu')(reshape)
        conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim),
                        padding='valid', kernel_initializer='normal', activation='relu')(reshape)
        conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim),
                        padding='valid', kernel_initializer='normal', activation='relu')(reshape)

        maxpool_0 = MaxPool2D(pool_size=(sequence_length - filter_sizes[0] + 1, 1),
                              strides=(1, 1), padding='valid')(conv_0)
        maxpool_1 = MaxPool2D(pool_size=(sequence_length - filter_sizes[1] + 1, 1),
                              strides=(1, 1), padding='valid')(conv_1)
        maxpool_2 = MaxPool2D(pool_size=(sequence_length - filter_sizes[2] + 1, 1),
                              strides=(1, 1), padding='valid')(conv_2)

        concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
        flatten = Flatten()(concatenated_tensor)
        dropout = Dropout(drop)(flatten)
        output = Dense(units=2, activation='softmax')(dropout)

        # this creates a model that includes the Input layer and the layers above
        model = Model(inputs=inputs, outputs=output)

        adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
        model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    # tensorboard callback
    cb_tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0,
                                 write_graph=True, write_images=True)
    cb_checkpoint = ModelCheckpoint('./checkpoints/model.epoch.{epoch:03d}.vacc{val_acc:.4f}.hdf5',
                                    monitor='val_acc', verbose=1, save_weights_only=False,
                                    save_best_only=True, mode='auto')

    click.echo(click.style('Training model...', fg='green'))
    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              initial_epoch=epoch,
              verbose=1,
              callbacks=[cb_checkpoint, cb_tensorboard],
              validation_data=(X_test, y_test))
        group['lr'] = lr


# save Loss and Epochs in csv
def save_loss(Loss, path):
    num_epochs = len(Loss)
    Epochs = list(range(1, num_epochs + 1))
    Loss_data = {'Epochs': Epochs, 'Loss': Loss}
    import pandas as pd
    df = pd.DataFrame(Loss_data, columns=['Epochs', 'Loss'])
    df.to_csv(path)


for FOLD in range(1, 6):
    X_trn, Y_trn, Y_trn_o, X_tst, Y_tst, Y_tst_o, vocabulary, vocabulary_inv = data_helpers.load_data(
        fold=FOLD)
    X_trn = X_trn.astype(np.int32)
    X_tst = X_tst.astype(np.int32)
    Y_trn = Y_trn.astype(np.int32)
    Y_tst = Y_tst.astype(np.int32)

    args.num_classes = Y_trn.shape[1]
    embedding_weights = load_word2vec('glove', vocabulary_inv, num_features=300)
    capsule_net = CapsNet_Text(args, embedding_weights)
    current_lr = args.learning_rate
    optimizer = Adam(capsule_net.parameters(), lr=current_lr)
    # capsule_net = nn.DataParallel(capsule_net).cuda()
optimizers = ['adam', 'nadam']
epochs = [10, 20]
batch_sizes = [200, 500]
schedules = [lambda1, lambda2, step_decay]

save_dir = './multi'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Train
for d in databases:
    print(d)
    (x_train, y_train), (x_dev, y_dev), (x_test, y_test), vocab_size, max_len = load_data(d)
    for o in optimizers:
        for e in epochs:
            for bz in batch_sizes:
                for s in schedules:
                    model = CapsNet(input_shape=x_train.shape[1:],
                                    n_class=len(np.unique(np.argmax(y_train, 1))),
                                    num_routing=3,
                                    vocab_size=vocab_size,
                                    embed_dim=50,
                                    max_len=max_len)
                    model.summary()
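                    # Hedged sketch (assumption, not from the original script): the innermost
                    # loop presumably compiles and fits the freshly built model next; the loss,
                    # metrics, and LearningRateScheduler callback below are illustrative and
                    # assume `from keras.callbacks import LearningRateScheduler`.
                    model.compile(optimizer=o,
                                  loss='categorical_crossentropy',
                                  metrics=['accuracy'])
                    model.fit(x_train, y_train,
                              batch_size=bz,
                              epochs=e,
                              callbacks=[LearningRateScheduler(s)],
                              validation_data=(x_dev, y_dev))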
logging.root.setLevel(level=logging.INFO)
logger = logging.getLogger()

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
logger.info("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    logger.info("{}={}".format(attr.upper(), value))
logger.info("")

# Data Preparation
# ==================================================

# Load data
logger.info("Loading data...")
train_size, train_data, train_label = data_helpers.load_data(
    FLAGS.train_data_file, FLAGS.raw_data_file)
x_text = train_data
y = np.array(train_label)

# Build vocabulary
max_document_length = max([len(x.split()) for x in train_data])
word2vec_helpers = Word2VecHelper()
x = word2vec_helpers.SentencesIndex(x_text, max_document_length)

# vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
# x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool1D, Conv1D, Convolution2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from data_helpers import load_data
from livelossplot import PlotLossesKeras
from matplotlib import pyplot

print('Loading data')
x, y, max_sentace_length = load_data()
# x, y, vocabulary, vocabulary_inv = load_data()

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# X_train.shape -> (44958, 220)
# y_train.shape -> (44958, 1)
# X_test.shape -> (9992, 220)
# y_test.shape -> (9992, 1)

# X_train.shape -> (8529, 56)
# y_train.shape -> (8529, 2)
# X_test.shape -> (2133, 56)
# y_test.shape -> (2133, 2)

sequence_length = (max_sentace_length, 100)  # number of signs in a sentence
# vocabulary_size = X_test.max()
embedding_dim = 256
print("--pos_file and --neg_file must be specified") sys.exit(1) print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # Data Preparatopn # ================================================== # Load data print("Loading data...") x, y, word2id, id2word = data_helpers.load_data(FLAGS.vocab_file, FLAGS.pos_file, FLAGS.neg_file, FLAGS.sequence_length, FLAGS.train_size + FLAGS.dev_size) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set x_train, x_dev = x_shuffled[:-FLAGS.dev_size], x_shuffled[-FLAGS.dev_size:] y_train, y_dev = y_shuffled[:-FLAGS.dev_size], y_shuffled[-FLAGS.dev_size:] print("Vocabulary Size: {:d}".format(len(word2id))) print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev))) # Training # ==================================================
except:
    print_red("Failed to open file.")
    quit()


def log(*string):
    output = ' '.join(string)
    print(output)
    LOG_FILE.write(''.join(['\n', output]))


log("=======================================================")
log("======================= START! ========================")
log("=======================================================")
log("Preprocessing...")

### Load data ###
x, y, vocabulary, vocabulary_inv = data_helpers.load_data(FLAGS.reduced_dataset)
maxLengthInX = max(len(i) for i in x)

# Randomly shuffle data
np.random.seed(123)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
text_percent = FLAGS.test_data_ratio / 100.0
test_index = int(len(x) * text_percent)
x_train, x_test = x_shuffled[:-test_index], x_shuffled[-test_index:]
y_train, y_test = y_shuffled[:-test_index], y_shuffled[-test_index:]

### Derived parameters ###
sequence_length = x_train.shape[1]
'''
In the CNN, the LSTM and the convolutional filters use 100 hidden units each;
the convolutional filter length is 3, the dropout rate is 0.5, and the
mini-batch size is 16. These hyperparameters were selected on the dev set via
grid search on SST-2.
'''
use_cuda = torch.cuda.is_available()

EMBEDDING_DIM = 300
HIDDEN_DIM = 100
EPOCH = 10
BATCH_SIZE = 16
LR = 1e-3

# build the word index
X, Y, word_to_ix, ix_to_word = data_helpers.load_data()

vocab_size = len(word_to_ix)
max_sent_len = X.shape[1]
num_classes = Y.shape[1]

print('vocab size = {}'.format(vocab_size))
print('max sentence len = {}'.format(max_sent_len))
print('num of classes = {}'.format(num_classes))


class MyModel(nn.Module):
    """
    1. Convolve the sentence matrix and average-pool the result into a feature
       column vector whose length equals the sentence length.
    2. Run the sentence matrix through an LSTM and keep the output matrix for
       every time step.
    3. Element-wise multiply each element of the result of (1) into the
       corresponding vector of (2).
    }
    x_raw = [
        "The number of reported cases of gonorrhea in Colorado increased",
        "I am in the market for a 24-bit graphics card for a PC"
    ]
    y_test = None
elif dataset_name == "financenews":
    datasets = {
        "target_names": [
            'strong_neg_examples', 'weak_neg_examples', 'neutral_examples',
            'weak_pos_examples', 'strong_pos_examples'
        ]
    }
    datasets = data_helpers.get_datasets_financenews_test(
        cfg["datasets"][dataset_name]["test_path"])
    x_raw = data_helpers.load_data(datasets)
    y_test = None
    # datasets = {"target_names": ['strong_neg_examples', 'weak_neg_examples', 'neutral_examples', 'weak_pos_examples', 'strong_pos_examples']}
    # x_raw = ["这是什么垃圾股票", "我赚翻了"]
    # y_test = None

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nPredicting...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
def train_rnn():
    # Data Preparation
    # ==================================================
    if FLAGS.init_embedding_path is not None:
        embedding = np.load(FLAGS.init_embedding_path)
        print("Using pre-trained word embedding which shape is {}\n".format(embedding.shape))
        FLAGS.vocab_size = embedding.shape[0]
        FLAGS.embedding_size = embedding.shape[1]

    if FLAGS.init_model_path is not None:
        assert os.path.isdir(FLAGS.init_model_path), "init_model_path must be a directory\n"
        ckpt = tf.train.get_checkpoint_state(FLAGS.init_model_path)
        assert ckpt, "No checkpoint found in {}\n".format(FLAGS.init_model_path)
        assert ckpt.model_checkpoint_path, "No model_checkpoint_path found in checkpoint\n"

    # Create root directory
    timestamp = str(int(time.time()))
    root_dir = os.path.join(os.path.curdir, 'runs', 'textrnn', 'trained_result_' + timestamp)
    os.makedirs(root_dir)

    # Load data
    print("Loading data...\n")
    x, y = data_helpers.load_data(FLAGS.data_file, FLAGS.sequence_length, FLAGS.vocab_size, root_dir=root_dir)
    FLAGS.num_classes = len(y[0])

    # Split dataset
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=FLAGS.test_size,
                                                        stratify=y, random_state=0)
    x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=0)

    # Training
    # ==================================================
    with tf.Graph().as_default():
        tf_config = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        tf_config.gpu_options.allow_growth = FLAGS.gpu_allow_growth

        with tf.Session(config=tf_config).as_default() as sess:
            rnn = TextRNN(
                vocab_size=FLAGS.vocab_size,
                embedding_size=FLAGS.embedding_size,
                sequence_length=FLAGS.sequence_length,
                rnn_size=FLAGS.rnn_size,
                num_layers=FLAGS.num_layers,
                attention_size=FLAGS.attention_size,
                num_classes=FLAGS.num_classes,
                learning_rate=FLAGS.learning_rate,
                grad_clip=FLAGS.grad_clip)

            # Output directory for models and summaries
            out_dir = os.path.abspath(root_dir)
            print("Writing to {}...\n".format(out_dir))

            # Summaries for loss and accuracy
            tf.summary.scalar("loss", rnn.loss)
            tf.summary.scalar("accuracy", rnn.accuracy)
            merged_summary = tf.summary.merge_all()

            # Summaries dictionary
            train_summary_dir = os.path.join(out_dir, 'summaries', 'train')
            val_summary_dir = os.path.join(out_dir, 'summaries', 'val')
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
            val_summary_writer = tf.summary.FileWriter(val_summary_dir, sess.graph)

            # Checkpoint directory, will not create itself
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, 'checkpoints'))
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model.ckpt')
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Using pre-trained word embedding
            if FLAGS.init_embedding_path is not None:
                sess.run(rnn.embedding.assign(embedding))
                del embedding

            # Continue training from saved model
            if FLAGS.init_model_path is not None:
                saver.restore(sess, ckpt.model_checkpoint_path)

            # Training start
            print("Start training...\n")
            best_at_step = 0
            best_val_accuracy = 0

            for epoch in range(FLAGS.num_epochs):
                # Generate train batches
                train_batches = data_helpers.batch_iter(list(zip(x_train, y_train)), FLAGS.batch_size)
                start = time.time()

                for batch in train_batches:
                    # Training model on x_batch and y_batch
                    x_batch, y_batch = zip(*batch)
                    seq_len_train = data_helpers.real_len(x_batch)
                    feed_dict = {rnn.input_x: x_batch, rnn.input_y: y_batch,
                                 rnn.seq_len: seq_len_train, rnn.keep_prob: FLAGS.dropout_keep_prob}
                    _, global_step, train_summaries, train_loss, train_accuracy = sess.run(
                        [rnn.train_op, rnn.global_step, merged_summary, rnn.loss, rnn.accuracy],
                        feed_dict=feed_dict)

                    # Evaluates model on val set
                    if global_step % FLAGS.evaluate_every == 0:
                        end = time.time()
                        train_summary_writer.add_summary(train_summaries, global_step)
                        seq_len_val = data_helpers.real_len(x_val)
                        feed_dict = {rnn.input_x: x_val, rnn.input_y: y_val,
                                     rnn.seq_len: seq_len_val, rnn.keep_prob: 1.0}
                        val_summaries, val_loss, val_accuracy = sess.run(
                            [merged_summary, rnn.loss, rnn.accuracy], feed_dict=feed_dict)
                        val_summary_writer.add_summary(val_summaries, global_step)
                        print("Epoch: {}, global step: {}, training speed: {:.3f}sec/batch".format(
                            epoch, global_step, (end - start) / FLAGS.evaluate_every))
                        print("train loss: {:.3f}, train accuracy: {:.3f}, val loss: {:.3f}, val accuracy: {:.3f}\n".format(
                            train_loss, train_accuracy, val_loss, val_accuracy))

                        # If improved, save the model
                        if val_accuracy > best_val_accuracy:
                            print("Get a best val accuracy at step {}, model saving...\n".format(global_step))
                            saver.save(sess, checkpoint_prefix, global_step=global_step)
                            best_val_accuracy = val_accuracy
                            best_at_step = global_step
                        start = time.time()

            # Rename the checkpoint
            best_model_prefix = checkpoint_prefix + '-' + str(best_at_step)
            os.rename(best_model_prefix + '.index', os.path.join(checkpoint_dir, 'best_model.index'))
            os.rename(best_model_prefix + '.meta', os.path.join(checkpoint_dir, 'best_model.meta'))
            os.rename(best_model_prefix + '.data-00000-of-00001',
                      os.path.join(checkpoint_dir, 'best_model.data-00000-of-00001'))

            # Testing on test set
            print("\nTraining complete, testing the best model on test set...\n")
            saver.restore(sess, os.path.join(checkpoint_dir, 'best_model'))
            seq_len_test = data_helpers.real_len(x_test)
            feed_dict = {rnn.input_x: x_test, rnn.input_y: y_test,
                         rnn.seq_len: seq_len_test, rnn.keep_prob: 1.0}
            y_logits, test_accuracy = sess.run([rnn.logits, rnn.accuracy], feed_dict=feed_dict)
            print("Testing Accuracy: {:.3f}\n".format(test_accuracy))

            label_transformer = joblib.load(os.path.join(out_dir, 'label_transformer.pkl'))
            y_test_original = label_transformer.inverse_transform(y_test)
            y_logits_original = label_transformer.inverse_transform(y_logits)
            print("Precision, Recall and F1-Score:\n\n",
                  classification_report(y_test_original, y_logits_original))

            # Save parameters
            print("Parameters saving...\n")
            params = {}
            for param, value in FLAGS.__flags.items():
                params[param] = value
            with open(os.path.join(out_dir, 'parameters.json'), 'w') as outfile:
                json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)

            # Save word embedding
            print("Word embedding saving...\n")
            np.save(os.path.join(out_dir, 'embedding.npy'), sess.run(rnn.embedding))
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if (not FLAGS.pos_file or not FLAGS.neg_file or not FLAGS.vocab_file
        or not FLAGS.checkpoint_dir):
    print("--pos_file, --neg_file, --vocab_file and "
          "--checkpoint_dir must be specified")
    sys.exit(1)

# Load data. Load your own data here
print("Loading data...")
x_test, y_test, word2id, id2word = data_helpers.load_data(FLAGS.vocab_file, FLAGS.pos_file,
                                                          FLAGS.neg_file, FLAGS.sequence_length, 1000)
y_test = np.argmax(y_test, axis=1)
print("Vocabulary size: {:d}".format(len(word2id)))
print("Test set size {:d}".format(len(y_test)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
def main(_):
    # cluster specification
    # parameter_servers = ["sgs-gpu-02:2222", "sgs-gpu-02:2223", "sgs-gpu-03:2222", "sgs-gpu-03:2223"]
    # workers = ["sgs-gpu-02:2224", "sgs-gpu-02:2225", "sgs-gpu-03:2224", "sgs-gpu-03:2225"]
    parameter_servers = [
        "spaceml1:2222", "spaceml1:2223", "spaceml1:2224", "spaceml1:2225"
    ]
    workers = [
        "spaceml1:2226", "spaceml1:2227", "spaceml1:2228", "spaceml1:2229"
    ]
    num_ps = len(parameter_servers)
    num_worker = num_ps
    cluster = tf.train.ClusterSpec({
        "ps": parameter_servers,
        "worker": workers
    })

    # local server, either ps or worker
    server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)

    data_sets = data_helpers.load_data()

    W1 = [0, 0, 0, 0]
    b1 = [0, 0, 0, 0]
    W2 = [0, 0, 0, 0]
    b2 = [0, 0, 0, 0]

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        with tf.device("/job:ps/task:0"):
            W1[0] = tf.get_variable(
                name='w10',
                shape=[3072, 240],
                initializer=tf.truncated_normal_initializer(stddev=1.0 / np.sqrt(float(3072))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            # W1[0] = tf.Variable(tf.random_normal([3072, 240]))
            b1[0] = tf.Variable(tf.zeros([240]))
            W2[0] = tf.get_variable(
                name='w20',
                shape=[240, 10],
                initializer=tf.truncated_normal_initializer(stddev=1.0 / np.sqrt(float(120))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            # W2[0] = tf.Variable(tf.random_normal([240, 10]))
            b2[0] = tf.Variable(tf.zeros([10]))

        with tf.device("/job:ps/task:1"):
            W1[1] = tf.get_variable(
                name='w11',
                shape=[3072, 240],
                initializer=tf.truncated_normal_initializer(stddev=1.0 / np.sqrt(float(3072))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            # W1[1] = tf.Variable(tf.random_normal([3072, 240]))
            b1[1] = tf.Variable(tf.zeros([240]))
            W2[1] = tf.get_variable(
                name='w21',
                shape=[240, 10],
                initializer=tf.truncated_normal_initializer(stddev=1.0 / np.sqrt(float(120))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            # W2[1] = tf.Variable(tf.random_normal([240, 10]))
            b2[1] = tf.Variable(tf.zeros([10]))

        with tf.device("/job:ps/task:2"):
            W1[2] = tf.get_variable(
                name='w12',
                shape=[3072, 240],
                initializer=tf.truncated_normal_initializer(stddev=1.0 / np.sqrt(float(3072))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            # W1[2] = tf.Variable(tf.random_normal([3072, 240]))
            b1[2] = tf.Variable(tf.zeros([240]))
            W2[2] = tf.get_variable(
                name='w22',
                shape=[240, 10],
                initializer=tf.truncated_normal_initializer(stddev=1.0 / np.sqrt(float(120))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            # W2[2] = tf.Variable(tf.random_normal([240, 10]))
            b2[2] = tf.Variable(tf.zeros([10]))

        with tf.device("/job:ps/task:3"):
            W1[3] = tf.get_variable(
                name='w13',
                shape=[3072, 240],
                initializer=tf.truncated_normal_initializer(stddev=1.0 / np.sqrt(float(3072))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            # W1[3] = tf.Variable(tf.random_normal([3072, 240]))
            b1[3] = tf.Variable(tf.zeros([240]))
            W2[3] = tf.get_variable(
                name='w23',
                shape=[240, 10],
                initializer=tf.truncated_normal_initializer(stddev=1.0 / np.sqrt(float(120))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            # W2[3] = tf.Variable(tf.random_normal([240, 10]))
            b2[3] = tf.Variable(tf.zeros([10]))

        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % FLAGS.task_index,
                    cluster=cluster)):
            # Create the model
            x = tf.placeholder(tf.float32, shape=[None, 3072])
            y_ = tf.placeholder(tf.int64, shape=[None])
            h1 = tf.nn.relu(tf.matmul(x, W1[FLAGS.task_index]) + b1[FLAGS.task_index])
            y = tf.matmul(h1, W2[FLAGS.task_index]) + b2[FLAGS.task_index]

            cross_entropy = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_, logits=y))

            opt = tf.train.GradientDescentOptimizer(FLAGS.lr)
            grads_and_vars = opt.compute_gradients(cross_entropy, [
                W1[FLAGS.task_index], b1[FLAGS.task_index],
                W2[FLAGS.task_index], b2[FLAGS.task_index]
            ])
            # w = W2[FLAGS.task_index]
            # b = b2[FLAGS.task_index]
            new_gv0 = (grads_and_vars[0][0] -
                       (W1[(FLAGS.task_index - 1) % num_ps] + W1[(FLAGS.task_index + 1) % num_ps] -
                        2 * W1[FLAGS.task_index]) / (3 * FLAGS.lr * 1.0),
                       grads_and_vars[0][1])
            new_gv1 = (grads_and_vars[1][0] -
                       (b1[(FLAGS.task_index - 1) % num_ps] + b1[(FLAGS.task_index + 1) % num_ps] -
                        2 * b1[FLAGS.task_index]) / (3 * FLAGS.lr * 1.0),
                       grads_and_vars[1][1])
            new_gv2 = (grads_and_vars[2][0] -
                       (W2[(FLAGS.task_index - 1) % num_ps] + W2[(FLAGS.task_index + 1) % num_ps] -
                        2 * W2[FLAGS.task_index]) / (3 * FLAGS.lr * 1.0),
                       grads_and_vars[2][1])
            new_gv3 = (grads_and_vars[3][0] -
                       (b2[(FLAGS.task_index - 1) % num_ps] + b2[(FLAGS.task_index + 1) % num_ps] -
                        2 * b2[FLAGS.task_index]) / (3 * FLAGS.lr * 1.0),
                       grads_and_vars[3][1])
            # print(b1[FLAGS.task_index])
            g = grads_and_vars[1][0]
            new_gv = list()
            new_gv.append(new_gv0)
            new_gv.append(new_gv1)
            new_gv.append(new_gv2)
            new_gv.append(new_gv3)
            train_step = opt.apply_gradients(new_gv)

            correct_prediction = tf.equal(tf.argmax(y, 1), y_)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()
        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 logdir="/mnt/ds3lab/litian/logs",
                                 init_op=init_op,
                                 saver=saver)

        zipped_data = zip(data_sets['images_train'], data_sets['labels_train'])
        batches = data_helpers.gen_batch(list(zipped_data), 128, 50000)

        with sv.managed_session(server.target) as sess:
            begin = time.time()
            for i in range(50000):
                batch = next(batches)
                image_batch, label_batch = zip(*batch)
                sess.run(train_step, feed_dict={x: image_batch, y_: label_batch})
                if i % 50 == 0:
                    train_accuracy = sess.run(accuracy,
                                              feed_dict={x: image_batch, y_: label_batch})
                    train_loss = sess.run(cross_entropy,
                                          feed_dict={x: image_batch, y_: label_batch})
                    localtime = time.asctime(time.localtime(time.time()))
                    print(localtime)
                    tmp = time.time()
                    print((tmp - begin) / 60.0)
                    print("step %d, training accuracy %g, training loss %g"
                          % (i, train_accuracy, train_loss))
            sv.stop()
print("") # Data Preparatopn # ================================================== # Load data print("Loading data...") # Costum load data: id_file = open('./data/id.csv').read() ## Loads IDs ids = id_file.splitlines() print('yay') x, y, vocabulary, vocabulary_inv, real_codes, dictionary_of_codes = data_helpers.load_data() # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set # TODO: This is very crude, should use cross-validation x_train, x_dev, x_full = x_shuffled[:-10000], x_shuffled[-10000:], x ##NOTE: Added full set y_train, y_dev, y_full = y_shuffled[:-10000], y_shuffled[-10000:], y print("Vocabulary Size: {:d}".format(len(vocabulary))) print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) print("Full set size: {:d}".format(len(y_full)))
tf.flags.DEFINE_string("test_data_path", "./data/train.txt", "Data path to evaluation") tf.flags.DEFINE_string("checkpoint_dir", "/media/jb/DATA/OMGEmotionChallenge/text_cnn/runs/1525115737/checkpoints", "Checkpoint directory from training run") tf.flags.DEFINE_string("calculateEvaluationCCC", "./data/calculateEvaluationCCC.py", "path to ccc script") tf.flags.DEFINE_string("validationCSV", "./data/omg_TrainVideos.csv", "path to ccc script") tf.flags.DEFINE_boolean("compute_ccc", True, "compute_ccc ?") # Misc Parameters tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() x_test, y_test, arousals, valences, video_ids, utterances, vocabulary, vocabulary_inv, onehot_label, max_sequence_length = data_helpers.load_data(FLAGS.test_data_path, FLAGS.checkpoint_dir) # Evaluation # ================================================== checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): # Load the saved meta graph and restore variables print ("FLAGS.checkpoint_dir %s" % FLAGS.checkpoint_dir)
# Load data
print("Loading data...")

# train data: the note; the first word is the class, the rest is the note text
# output data: the engineer name list
# train again with other data
output = "support_engineers.csv"
# vector: glove & word2vec
train = "dataset1_clean_1"
vector = "w2v.txt"
# seq_size: how much of the data to keep
seq_size = 350
# number of iterations through the data
num_of_round = 110
x, y, W, vocabulary = data_helpers.load_data(train, output, vector, seq_size)

# Training
# ==================================================
with tf.Graph().as_default():
    # A Graph contains operations and tensors.
    # A Session is the environment in which graph operations are executed;
    # it holds state about Variables and queues.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options))
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x.shape[1],  # the length of the sentence
            num_classes=FLAGS.num_classes,  # how many classes as output
              (params.num_features))
        # dictionary, where key is word, value is word vectors
        embedding_model = {}
        for line in open(model_name, 'r'):
            tmp = line.strip().split()
            word, vec = tmp[0], list(map(float, tmp[1:]))
            assert (len(vec) == params.num_features)
            if word not in embedding_model:
                embedding_model[word] = vec
        assert (len(embedding_model) == 400000)
    else:
        raise ValueError('Unknown pretrain model type: %s!' % (params.model_type))

    embedding_weights = [
        embedding_model[w] if w in embedding_model
        else np.random.uniform(-0.25, 0.25, params.num_features)
        for w in params.vocabulary_inv
    ]
    embedding_weights = np.array(embedding_weights).astype('float32')
    return embedding_weights


if __name__ == '__main__':
    import data_helpers
    print("Loading data...")
    x, _, _, params.vocabulary_inv = data_helpers.load_data()
    w = train_word2vec(x, params.vocabulary_inv)
        print('Loading existing Word2Vec model (Glove.6B.%dd)' % (num_features))
        # dictionary, where key is word, value is word vectors
        embedding_model = {}
        for line in open(model_name, 'r'):
            tmp = line.strip().split()
            word, vec = tmp[0], list(map(float, tmp[1:]))
            assert (len(vec) == num_features)
            if word not in embedding_model:
                embedding_model[word] = vec
        assert (len(embedding_model) == 400000)
    else:
        raise ValueError('Unknown pretrain model type: %s!' % (model_type))

    embedding_weights = [
        embedding_model[w] if w in embedding_model
        else np.random.uniform(-0.25, 0.25, num_features)
        for w in vocabulary_inv
    ]
    embedding_weights = np.array(embedding_weights).astype('float32')
    return embedding_weights


if __name__ == '__main__':
    import data_helpers
    print("Loading data...")
    x, _, _, vocabulary_inv = data_helpers.load_data()
    w = train_word2vec(x, vocabulary_inv)
from data_helpers import _load_data as load_data
from data_helpers import _load_vocab as load_vocab
from data_helpers import batch_iter
import numpy as np
import datetime
import os
import json
import time
import config

if __name__ == '__main__':
    # Load the training data.
    qid, que_word, que_char, pos_rel_name, pos_rel_word, pos_rel_char, \
        neg_rel_name, neg_rel_word, neg_rel_char = load_data(config.TRAIN_PATH)
    qid_dev, que_word_dev, que_char_dev, pos_rel_name_dev, pos_rel_word_dev, pos_rel_char_dev, \
        neg_rel_name_dev, neg_rel_word_dev, neg_rel_char_dev = load_data(config.TEST_PATH)

    # Create the word2id dictionaries of questions and relations.
    que_vocab, rel_vocab = load_vocab(config.DICT_DIR)
    print('Size of question vocab : {}'.format(len(que_vocab)))
    print('Size of relation vocab : {}'.format(len(rel_vocab)))

    # Change to PyTorch Variable.
    que_word = prepare_sequence(que_word, config.MAX_QUESTION_LENGTH, que_vocab)
    que_char = prepare_sequence(que_char, config.MAX_QUESTION_CHAR_LEVEL_LENGTH, que_vocab)
    pos_rel_name = prepare_sequence(pos_rel_name, config.MAX_RELATION_LEVEL_LENGTH, rel_vocab)
    neg_rel_name = prepare_sequence(neg_rel_name, config.MAX_RELATION_LEVEL_LENGTH, rel_vocab)
    pos_rel_word = prepare_sequence(pos_rel_word, config.MAX_WORD_LEVEL_LENGTH, rel_vocab)
    neg_rel_word = prepare_sequence(neg_rel_word, config.MAX_WORD_LEVEL_LENGTH, rel_vocab)
"Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS print("\nParameters:") for attr, value in sorted(FLAGS.__flags.iteritems()): print("{:25s}={}".format(attr.upper(), value.value)) print("") # Data Preparatopn # ================================================== # Load data print("Loading data...") x, y, vocabulary, vocabulary_inv = data_helpers.load_data( '../data/dftt/shuf.txt') ## Randomly shuffle data #np.random.seed(10) #shuffle_indices = np.random.permutation(np.arange(len(y))) #x_shuffled = x[shuffle_indices] #y_shuffled = y[shuffle_indices] # Split train/test set # TODO: This is very crude, should use cross-validation x_shuffled, y_shuffled = x, y num_test = int(np.round(x.shape[0] * 0.1)) x_train, x_dev = x_shuffled[:-num_test], x_shuffled[-num_test:] y_train, y_dev = y_shuffled[:-num_test], y_shuffled[-num_test:]
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load data. Load your own data here
print("Loading data...")
x_test, y_test, vocabulary, vocabulary_inv = data_helpers.load_data(datfile=FLAGS.eval_filename)
y_test = np.argmax(y_test, axis=1)
print("Vocabulary size: {:d}".format(len(vocabulary)))
print("Test set size {:d}".format(len(y_test)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
@author: Petrus
"""
from keras.layers import Input, Dense, Embedding, merge, Convolution2D, MaxPooling2D, Dropout
from sklearn.cross_validation import train_test_split
from keras.layers.core import Reshape, Flatten
from keras.callbacks import ModelCheckpoint, TensorBoard
from data_helpers import load_data
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Conv1D, LSTM, Bidirectional
from sklearn.manifold import TSNE

print('Loading data')
x, y, vocabulary, vocabulary_inv = load_data()

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

sequence_length = x.shape[1]
vocabulary_size = len(vocabulary_inv)
embedding_dim = 256
filter_sizes = [3, 4, 5]
num_filters = 512
drop = 0.5

nb_epoch = 100
batch_size = 30
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS.batch_size
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.iteritems()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_, y_, vocabulary, vocabulary_inv, test_size = data_helpers.load_data(FLAGS.vn)

print("Loading pre-trained vectors...")
trained_vecs = data_helpers.load_trained_vecs(
    FLAGS.vn, FLAGS.vn_embeddings, FLAGS.en_embeddings, vocabulary)

# Create embedding lookup table
count = data_helpers.add_unknown_words(trained_vecs, vocabulary)
embedding_mat = [trained_vecs[p] for i, p in enumerate(vocabulary_inv)]
embedding_mat = np.array(embedding_mat, dtype=np.float32)

# Randomly shuffle data
x, x_test = x_[:-test_size], x_[-test_size:]
y, y_test = y_[:-test_size], y_[-test_size:]
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
from keras.callbacks import EarlyStopping, TensorBoard
from sklearn.utils import shuffle
from cnn_radical_two.attention_layer import Attention_layer
import numpy as np
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

config = tf.ConfigProto()
config.gpu_options.allocator_type = 'BFC'
config.gpu_options.per_process_gpu_memory_fraction = 0.3
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

print('Loading data')
x, y, embeddings_matrix, x_eval, y_eval, x_eval_raw = load_data()
x_pinyin, y_pinyin, embeddings_matrix_3, x_eval_pinyin, y_eval_pinyin = load_pinyin_data()
x_radical, y, embeddings_matrix_2 = load_pianpang_data()
# x_pinyin = x_pinyin + x_radical

x, x_pinyin, x_radical, y = shuffle(x, x_pinyin, x_radical, y, random_state=0)

dev_sample_index = -1 * int(0.1 * float(len(y)))
X_train, X_test = x[:dev_sample_index], x[dev_sample_index:]
y_train, y_test = y[:dev_sample_index], y[dev_sample_index:]
x_train_radical, X_test_radical = x_radical[:dev_sample_index], x_radical[dev_sample_index:]
x_train_pinyin, X_test_pinyin = x_pinyin[:dev_sample_index], x_pinyin[dev_sample_index:]
self.wfile.write("once" + pipe_name + self.windex) self.wfile.flush() os.write(self.pipeout, serialized.encode('string_escape') + "\n") else: self.wfile.write("once" + serialized + "end") self.wfile.flush() print("[%s] Written and sent..." % self.windex, file=dump) dump.flush() except Exception,e: print("[%s] Error sending. Attempt to remake connection..." % self.windex, file=dump) print(e, file=dump) self.iteration += 1 self.make_connection(req=True) def wait_msg(self): rcv = "" while True: rcv = self.sock.recv(10) if rcv is not "" and rcv is not None: break if __name__ == "__main__": # TODO: Gather test data not from end of files which are always positive instances. worker_index = int(sys.argv[len(sys.argv) - 1]) tr_data = data_helpers.load_data(sys.argv[1:len(sys.argv)-1]) psize = len(tr_data)//num_workers tr_data_split = tr_data[psize*worker_index:psize*worker_index+psize] del tr_data client = TensorSlurmWorker(batch_sz, websocket_port, worker_index) client.train_partition(tr_data_split)
sequence_length = 56
embedding_dim = 20
filter_sizes = (3, 4)
num_filters = 3
dropout_prob = (0.25, 0.5)
hidden_dims = 100
batch_size = 32
num_epochs = 200
val_split = 0.1
min_word_count = 1
contwi = 15  # context window size
Act = "prediction"
weights = "model/weight_file"

x, y, vocab, vocab_inv = data_helpers.load_data()

if model_variation == 'CNN-non-static':
    embedding_weights = train_word2vec(x, vocab_inv, embedding_dim, min_word_count, contwi)
if model_variation == 'CNN-static':
    embedding_weights = train_word2vec(x, vocab_inv, embedding_dim, min_word_count, contwi)
    x = embedding_weights[0][x]
elif model_variation == 'CNN-rand':
    embedding_weights = None
else:
    print('No choice selected')
print(model_variation)

data_in = Input(shape=(sequence_length, embedding_dim))
def main(_):
    # mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
    # list_ = []
    # for line in open("/mnt/ds3lab/litian/input_data/cifar10/label3.txt"):
    #     list_.append(['a', line.strip('\n')])
    # classes = np.array(list_)
    # print(len(classes))
    # train_dataset, mean, std = create_train_datasets(classes[:, 1], num_samples=NUM_IMAGES)
    # val_dataset = create_test_datasets(classes[:, 1], mean, std, num_samples=NUM_IMAGES)
    # val_images, val_labels = val_dataset.next_batch(20)
    # num_classes = len(classes)
    # print(num_classes)

    data_sets = data_helpers.load_data()

    # with tf.device('/gpu:0'):
    # Create the model
    x = tf.placeholder(tf.float32, shape=[None, 3072])
    y_ = tf.placeholder(tf.int64, shape=[None])
    w1 = tf.get_variable(name='w1',
                         shape=[3072, 240],
                         initializer=tf.truncated_normal_initializer(
                             stddev=1.0 / np.sqrt(float(3072))),
                         regularizer=tf.contrib.layers.l2_regularizer(0.1))
    b1 = tf.Variable(tf.zeros([240]))
    h1 = tf.nn.relu(tf.matmul(x, w1) + b1)
    w2 = tf.get_variable(name='w2',
                         shape=[240, 10],
                         initializer=tf.truncated_normal_initializer(
                             stddev=1.0 / np.sqrt(float(240))),
                         regularizer=tf.contrib.layers.l2_regularizer(0.1))
    b2 = tf.Variable(tf.zeros([10]))
    y = tf.matmul(h1, w2) + b2

    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_, logits=y))
    train_step = tf.train.GradientDescentOptimizer(0.0005).minimize(cross_entropy)

    correct_prediction = tf.equal(tf.argmax(y, 1), y_)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    sess = tf.Session()
    sess.run(tf.initialize_all_variables())

    zipped_data = zip(data_sets['images_train'], data_sets['labels_train'])
    batches = data_helpers.gen_batch(list(zipped_data), 128, 50000)

    for i in range(50000):
        # batch_xs, batch_ys = mnist.train.next_batch(100)
        # image_batch, label_batch = train_dataset.next_batch(60, random_crop=True)
        batch = next(batches)
        image_batch, label_batch = zip(*batch)
        sess.run(train_step, feed_dict={x: image_batch, y_: label_batch})

        if i % 50 == 0:
            train_accuracy = sess.run(accuracy, feed_dict={x: image_batch, y_: label_batch})
            train_loss = sess.run(cross_entropy, feed_dict={x: image_batch, y_: label_batch})
            localtime = time.asctime(time.localtime(time.time()))
            print(localtime)
            print("step %d, training accuracy %g, training loss %g"
                  % (i, train_accuracy, train_loss))

        if i % 500 == 0:
            val_accuracy = sess.run(accuracy,
                                    feed_dict={x: data_sets['images_test'],
                                               y_: data_sets['labels_test']})
            print("validation set accuracy %g" % val_accuracy)
    print('Training Word2Vec model...')
    sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix]
    embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                        size=num_features, min_count=min_word_count,
                                        window=context, sample=downsampling)

    # If we don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    embedding_model.init_sims(replace=True)

    # Saving the model for later use. You can load it later using Word2Vec.load()
    if not exists(model_dir):
        os.mkdir(model_dir)
    print('Saving Word2Vec model \'%s\'' % split(model_name)[-1])
    embedding_model.save(model_name)

    # add unknown words
    embedding_weights = {key: embedding_model[word] if word in embedding_model
                              else np.random.uniform(-0.25, 0.25, embedding_model.vector_size)
                         for key, word in vocabulary_inv.items()}
    return embedding_weights


if __name__ == '__main__':
    import data_helpers

    print("Loading data...")
    x, _, _, vocabulary_inv_list = data_helpers.load_data()
    vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
    w = train_word2vec(x, vocabulary_inv)
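The returned `embedding_weights` maps vocabulary indices to vectors. Before it can initialize a Keras Embedding layer it has to be stacked into a single matrix in index order; a minimal sketch of that conversion, assuming the dict keys are the contiguous indices 0..N-1 (the layer configuration shown is illustrative, not taken from the snippet):

import numpy as np
from keras.layers import Embedding

# Stack the per-index vectors into a (vocab_size, embedding_dim) matrix.
weight_matrix = np.vstack([embedding_weights[i] for i in range(len(embedding_weights))])

embedding_layer = Embedding(input_dim=weight_matrix.shape[0],
                            output_dim=weight_matrix.shape[1],
                            weights=[weight_matrix],
                            trainable=True)  # set False for a frozen ("static") variant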
parser.add_argument('--ts_batch_size', type=int, default=32,
                    help='Batch size for training')
parser.add_argument('--learning_rate', type=float, default=1e-3,
                    help='Learning rate for training')
parser.add_argument('--start_from', type=str, default='save', help='')
parser.add_argument('--num_compressed_capsule', type=int, default=128,
                    help='The number of compact capsules')
parser.add_argument('--dim_capsule', type=int, default=16,
                    help='The number of dimensions for capsules')
parser.add_argument('--re_ranking', type=int, default=200,
                    help='Re-ranking size')

import json

args = parser.parse_args()
params = vars(args)
print(json.dumps(params, indent=2))

X_trn, Y_trn, Y_trn_o, X_tst, Y_tst, Y_tst_o, vocabulary, vocabulary_inv = data_helpers.load_data(
    args.dataset, max_length=args.sequence_length, vocab_size=args.vocab_size)
Y_trn = Y_trn.toarray()
Y_tst = Y_tst.toarray()

X_trn = X_trn.astype(np.int32)
X_tst = X_tst.astype(np.int32)
Y_trn = Y_trn.astype(np.int32)
Y_tst = Y_tst.astype(np.int32)

embedding_weights = load_word2vec('glove', vocabulary_inv, args.vec_size)
args.num_classes = Y_trn.shape[1]

capsule_net = CapsNet_Text(args, embedding_weights)
capsule_net = nn.DataParallel(capsule_net).cuda()
num_epochs = 7
val_split = 0.1

# Word2Vec parameters, see train_word2vec
min_word_count = 1  # Minimum word count
context = 10        # Context window size

# Data Preparation
# ==================================================
#
# Load data
print("Loading data...")
neg_train_path = './data/imdb_train.neg'
pos_train_path = './data/imdb_train.pos'
x, y, vocabulary, vocabulary_inv = data_helpers.load_data(pos_train_path, neg_train_path)

if model_variation == 'CNN-non-static' or model_variation == 'CNN-static':
    embedding_weights = train_word2vec(x, vocabulary_inv, model_variation,
                                       embedding_dim, min_word_count, context)
    if model_variation == 'CNN-static':
        x = embedding_weights[0][x]
elif model_variation == 'CNN-rand':
    embedding_weights = None
else:
    raise ValueError('Unknown model variation')

# Shuffle data
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices].argmax(axis=1)
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    sys.stdout = old_stdout
    log_file.close()


if __name__ == '__main__':
    os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

    # Data Preparation
    print("Load data...")
    x_train, y_train, x_test, y_test, vocabulary_inv = load_data()

    if sequence_length != x_test.shape[1]:
        print("Adjusting sequence length for actual size")
        sequence_length = x_test.shape[1]

    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)
    print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

    model = create_model(x_train=x_train, x_test=x_test,
                         vocabulary_inv=vocabulary_inv,
                         model_type=model_type,
                         embedding_dim=embedding_dim,
                         min_word_count=min_word_count,
from __future__ import print_function

import numpy as np
import tensorflow as tf
import time

import data_helpers

beginTime = time.time()

# Parameter definitions
batch_size = 100
learning_rate = 0.005
max_steps = 1000

# Prepare data
data_sets = data_helpers.load_data()

# Define input placeholders
images_placeholder = tf.placeholder(tf.float32, shape=[None, 3072])
labels_placeholder = tf.placeholder(tf.int64, shape=[None])

# Define variables (these are the values we want to optimize)
weights = tf.Variable(tf.zeros([3072, 10]))
biases = tf.Variable(tf.zeros([10]))

# Define the classifier's result
logits = tf.matmul(images_placeholder, weights) + biases

# Define the loss function (keyword arguments are required from TF 1.0 on)
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=labels_placeholder, logits=logits))
    word_index = dict()
    for line in open(filename, "r"):
        datas = line.strip().split("\t")
        if len(datas) < 2:
            continue
        index = int(datas[0])
        word = datas[1]
        word_index[word] = index
    return word_index


word_index = load_wordindex("./data/word.tsv")
sent_sos_id = word_index["<s>"]
sent_eos_id = word_index["</s>"]
train_data = data_helpers.load_data(
    open(FLAGS.traindata_file, "r").readlines(), word_index)
test_data = data_helpers.load_data(
    open(FLAGS.testdata_file, "r").readlines(), word_index)

# Training
logging.info("logging test")
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        model = Seq2seq(
            max_sequence_len=FLAGS.max_sequence_len,
            embedding_size=FLAGS.embedding_dim,
import numpy as np
np.random.seed(1337)
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import train_test_split
from data_helpers import load_data
import pandas as pd

print('Loading data')
x, y, vocabulary, vocabulary_inv, sentences_padded = load_data()

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

df_x = pd.DataFrame(X_test)
df_y = pd.DataFrame(y_test)
df_x.to_csv(r'C:\Users\varsha.vishwakarma\Documents\CNN-text-classification\test_input_dataset_jd.csv',
            header=False, index=False)
df_y.to_csv(r'C:\Users\varsha.vishwakarma\Documents\CNN-text-classification\test_output_dataset_jd.csv',
            header=False, index=False)

sequence_length = x.shape[1]  # 56
vocabulary_size = len(vocabulary_inv)  # 18765
eps user interface system
system human system eps
user response time
trees
graph trees
graph minors trees
graph minors survey
I like graph and stuff
I like trees and stuff
Sometimes I build a graph
Sometimes I build trees""").split("\n")
corpus = [sent.split() for sent in test_corpus]
'''

print("Loading data...")
x, y = data_helpers.load_data()
corpus = x + y

# Split train/test set
model = glove_optimizer.GloVeModel(embedding_size=10, context_size=3,
                                   min_occurrences=0, learning_rate=0.05,
                                   batch_size=2)
model.fit_to_corpus(corpus)
model.train(num_epochs=1, log_dir="log/example", summary_batch_interval=10000,
            tsne_epoch_interval=1)
# print model.embedding_for("graph")
# print model.embeddings()
# print(model.embeddings()[model.id_for_word('graph')])
import numpy as np
import keras
from keras.models import Sequential, Graph
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.recurrent import LSTM
from keras.utils import np_utils, generic_utils
import data_helpers
from w2v import train_word2vec
from sklearn.cross_validation import StratifiedKFold

cnn_train, lstm_train, Y_train, cnn_vocabulary, cnn_vocabulary_inv, lstm_vocabulary, lstm_vocabulary_inv = data_helpers.load_data()
cnn_embedding_weights, lstm_embedding_weights = train_word2vec(cnn_train, cnn_vocabulary_inv,
                                                               lstm_train, lstm_vocabulary_inv)
# cnn_train = cnn_embedding_weights[0][cnn_train]
# lstm_train = lstm_embedding_weights[0][lstm_train]

shuffle_indices = np.random.permutation(np.arange(len(Y_train)))
cnn_shuffled = cnn_train[shuffle_indices]
lstm_shuffled = lstm_train[shuffle_indices]
Y_train = Y_train[shuffle_indices]
# Y_train_f = np_utils.to_categorical(Y_train, 27)

filter_sizes = (3, 4)
num_filters = 150
hidden_dims = 150

cnn_graph = Graph()
cnn_graph.add_input(name='input', input_shape=(32, 300))
for fsz in filter_sizes:
    conv = Convolution1D(nb_filter=num_filters, filter_length=fsz, border_mode='valid',
                         activation='relu', subsample_length=1)
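The `Graph` container and the `Convolution1D` argument names above only exist in very old Keras releases. For reference, a rough functional-API sketch of the same multi-filter convolution block under Keras 2-style names; the filter sizes and counts are copied from the snippet, while the pooling and output head are assumptions:

from keras.layers import Input, Conv1D, GlobalMaxPooling1D, Concatenate, Dense
from keras.models import Model

filter_sizes = (3, 4)
num_filters = 150

inputs = Input(shape=(32, 300))  # (sequence length, embedding dim), as in the Graph input above
branches = []
for fsz in filter_sizes:
    conv = Conv1D(filters=num_filters, kernel_size=fsz, padding='valid',
                  activation='relu', strides=1)(inputs)
    branches.append(GlobalMaxPooling1D()(conv))
merged = Concatenate()(branches)
outputs = Dense(1, activation='sigmoid')(merged)  # output head is an assumption
model = Model(inputs, outputs)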
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # Data Preparatopn # ================================================== # Load data print("Loading data...") x, y, vocabulary, vocabulary_inv = data_helpers.load_data() # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] data_size = len(y_shuffled) # Split train/test set # TODO: This is very crude, should use cross-validation x_train, x_dev = x_shuffled[:-data_size/4], x_shuffled[-data_size/4:] y_train, y_dev = y_shuffled[:-data_size/4], y_shuffled[-data_size/4:] print("Vocabulary Size: {:d}".format(len(vocabulary))) print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) # Training
def initiate(args):
    # define output directory
    time_str = datetime.datetime.now().isoformat()
    out_dir = os.path.abspath(os.path.join(os.path.curdir, args.save_dir, time_str))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # initiate logger
    log_file_path = os.path.join(out_dir, 'log')
    logger = Logger(log_file_path)
    analysis_file_path = os.path.join(out_dir, 'analysis')
    analysis_logger = Logger(analysis_file_path)

    # report parameters
    logger.write("\nParameters:")
    for arg in args.__dict__:
        logger.write("{}={}".format(arg.upper(), args.__dict__[arg]))
    logger.write("")

    # load data
    logger.write("Loading data...")
    if args.data == 'gameforum':
        x_train, y_train, x_dev, y_dev, vocabulary, vocabulary_inv, vocabulary_embedding = data_helpers.load_data_gameforum_only(args.use_pretrained_embedding)
    elif args.data == 'semeval':
        x_train, y_train, x_dev, y_dev, vocabulary, vocabulary_inv, vocabulary_embedding = data_helpers.load_data_semeval_only(args.use_pretrained_embedding)
    else:
        x_train, y_train, x_dev, y_dev, vocabulary, vocabulary_inv, vocabulary_embedding = data_helpers.load_data(args.use_pretrained_embedding)
    num_classes = len(y_train[0])

    # report
    logger.write("Vocabulary Size: {:d}".format(len(vocabulary)))
    logger.write("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

    # fill out missing arg values
    args.seq_length = x_train.shape[1]
    args.vocab_size = len(vocabulary)
    args.filter_sizes = list(map(int, args.filter_sizes.split(",")))  # list() keeps it reusable under Python 3
    args.vocabulary_embedding = vocabulary_embedding
    args.num_classes = num_classes

    # initialize a model
    if args.model == 'deep':
        model = DeepCNN(args)
    elif args.model == 'basic':
        model = BasicCNN(args)
    else:
        logger.write("Invalid model")
        sys.exit()

    # for train summary
    grad_summaries = []
    for g, v in model.grads_and_vars:
        if g is not None:
            grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
            sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
            grad_summaries.append(grad_hist_summary)
            grad_summaries.append(sparsity_summary)
    grad_summaries_merged = tf.merge_summary(grad_summaries)

    loss_summary = tf.scalar_summary("loss", model.loss)
    acc_summary = tf.scalar_summary("accuracy", model.accuracy)
    train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")

    # prepare saver
    checkpoint_dir = os.path.join(out_dir, 'checkpoints')
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    saver = tf.train.Saver(tf.all_variables())

    # generate batches
    batches = data_helpers.batch_iter(x_train, y_train, args.batch_size, args.num_epochs)

    # define train / test methods
    def train_model(x, y, dropout_prob, writer, log=False):
        feed_dict = {
            model.input_x: x,
            model.input_y: y,
            model.dropout_keep_prob: dropout_prob
        }
        _, step, loss, accuracy, summaries = sess.run(
            [model.train_op, model.global_step, model.loss, model.accuracy, train_summary_op],
            feed_dict)
        sess.run(model.weight_rescaling_op)  # l2 norm rescaling
        writer.add_summary(summaries, step)
        if log:
            time_str = datetime.datetime.now().isoformat()
            logger.write("{}: step {}, loss {:g}, acc {:g}".format(time_str, step - 1, loss, accuracy))

    def test_model(x, y):
        logger.write("\nEvaluate:")
        feed_dict = {
            model.input_x: x,
            model.input_y: y,
            model.dropout_keep_prob: 1.0
        }
        step, loss, accuracy, predictions, targets = sess.run(
            [model.global_step, model.loss, model.accuracy, model.predictions, model.targets],
            feed_dict)
        time_str = datetime.datetime.now().isoformat()
        logger.write("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
        logger.write("")
        return accuracy, predictions, targets

    # start a session
    sess_conf = tf.ConfigProto(
        allow_soft_placement=args.allow_soft_placement,
        log_device_placement=args.log_device_placement)
    sess = tf.Session(config=sess_conf)
    with sess.as_default():
        # initialize
        tf.initialize_all_variables().run()
        train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph_def)
        current_step = 0

        if args.train:  # train the model from scratch
            best_test_accuracy = 0.0
            for x_batch, y_batch in batches:
                # train
                train_model(x_batch, y_batch, args.dropout_keep_prob, train_summary_writer,
                            current_step % (args.evaluate_every / 4) == 0)
                current_step = tf.train.global_step(sess, model.global_step)

                # evaluate with dev set
                if current_step % args.evaluate_every == 0:
                    accuracy, predictions, targets = test_model(x_dev, y_dev)

                    # Conduct analysis if the current model is the best so far
                    if accuracy > best_test_accuracy:
                        best_test_accuracy = accuracy
                        analysis_logger.write("Analysis at {}: acc={}".format(current_step, accuracy), begin=True)
                        analysis_logger.write("Tweet\tPred\tTrue (0=Positive, 1=Neutral, 2=Negative)")
                        for i in range(len(x_dev)):
                            tweet_idx = x_dev[i]
                            prediction, true_label = predictions[i], targets[i]
                            try:
                                tweet = " ".join([vocabulary_inv[word_idx] for word_idx in tweet_idx if word_idx != 0])
                                analysis_logger.write("{}\t{}\t{}".format(tweet, prediction, true_label))
                            except UnicodeEncodeError:
                                analysis_logger.write("{}\t{}\t{}".format("ENCODING ERROR", prediction, true_label))
                        analysis_logger.write("\n")

                # save model
                if current_step % args.checkpoint_every == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    logger.write("Saved model checkpoint to {}\n".format(path))
        else:  # load the model
            logger.write("Loading the model...")
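The `data_helpers.batch_iter` call in the training setup above is only referenced, not shown. A minimal sketch of a batch iterator with that signature, assuming it shuffles once per epoch and yields `(x_batch, y_batch)` tuples; the real helper may differ:

import numpy as np


def batch_iter(x, y, batch_size, num_epochs, shuffle=True):
    """Hypothetical generator yielding (x_batch, y_batch) pairs for num_epochs passes."""
    data_size = len(y)
    num_batches = int(np.ceil(data_size / float(batch_size)))
    for epoch in range(num_epochs):
        indices = np.random.permutation(data_size) if shuffle else np.arange(data_size)
        for b in range(num_batches):
            start = b * batch_size
            end = min(start + batch_size, data_size)
            batch_idx = indices[start:end]
            yield x[batch_idx], y[batch_idx]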
def main(warmup_iterations, num_epochs, files, testfile):
    tr_data = data_helpers.load_data(files)
    test_data = data_helpers.load_data([testfile])
    warmup_data = None
    start_parameter_server(model=model, num_epochs=num_epochs,
                           warmup_data=warmup_data, test_data=test_data)
# Misc parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load data. Your own data can be plugged in here
print("Loading data...")
x_test, y_test, vocabulary, vocabulary_inv = data_helpers.load_data()
y_test = np.argmax(y_test, axis=1)

print("Vocabulary size: {:d}".format(len(vocabulary)))
print("Test set size: {:d}".format(len(y_test)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS.batch_size print("\nParameters:") for attr, value in sorted(FLAGS.__flags.iteritems()): print("{}={}".format(attr.upper(), value)) print("") # Data Preparatopn # ================================================== # Load data print("Loading data...") x_, y_, vocabulary, vocabulary_inv, test_size = data_helpers.load_data( FLAGS.vn) print("Loading pre-trained vectors...") trained_vecs = data_helpers.load_trained_vecs(FLAGS.vn, FLAGS.vn_embeddings, FLAGS.en_embeddings, vocabulary) # Create embedding lookup table count = data_helpers.add_unknown_words(trained_vecs, vocabulary) embedding_mat = [trained_vecs[p] for i, p in enumerate(vocabulary_inv)] embedding_mat = np.array(embedding_mat, dtype=np.float32) # Randomly shuffle data x, x_test = x_[:-test_size], x_[-test_size:] y, y_test = y_[:-test_size], y_[-test_size:] shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices]
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS.batch_size  # touching a flag forces parsing under the older tf.flags behavior
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.iteritems()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_, y_, vocabulary, vocabulary_inv, test_size, all_label = data_helpers.load_data()
print("Loading pre-trained vectors...")
trained_vecs = data_helpers.load_trained_vecs(FLAGS.en_embeddings, vocabulary)

# Create embedding lookup table
count = data_helpers.add_unknown_words(trained_vecs, vocabulary)
embedding_mat = [trained_vecs[p] for i, p in enumerate(vocabulary_inv)]
embedding_mat = np.array(embedding_mat, dtype=np.float32)

# Randomly shuffle data
x, x_test = x_[:-test_size], x_[-test_size:]
y, y_test = y_[:-test_size], y_[-test_size:]
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
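The `data_helpers.add_unknown_words` helper used in the last two snippets is not shown. A plausible sketch, assuming it assigns a small random vector to every vocabulary word missing from the pre-trained embeddings and returns how many were added; the dimension default and value range are assumptions modeled on common CNN-for-text preprocessing:

import numpy as np


def add_unknown_words(word_vecs, vocabulary, k=300):
    """Hypothetical helper: give words without a pre-trained vector a random one.

    word_vecs: dict mapping word -> vector of size k (modified in place)
    vocabulary: mapping whose keys are the vocabulary words
    Returns the number of words that received a random vector.
    """
    count = 0
    for word in vocabulary:
        if word not in word_vecs:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
            count += 1
    return count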
import data_helpers
from text_cnn import TextCNN
import math
import re
import itertools
from collections import Counter
import random
import nltk
import collections
import word2vec
from text_rnn import TextRNN

# Load training data
x_train, y_train, vocabulary, vocabulary_inv = data_helpers.load_data()
# y_test = np.argmax(y_test, axis=1)
vocab_size = len(vocabulary)
print("Vocabulary size: {:d}".format(vocab_size))
print("Training set size: {:d}".format(len(y_train)))

# Generate word embeddings
data = x_train.flatten()
iterations = 1000
data = data[data != 468]
w2v = word2vec.word2vec(data, vocabulary, vocabulary_inv, vocab_size, iterations)
final_embeddings = w2v.runWord2Vec()

x_train = np.fliplr(x_train)
fullTrain = np.concatenate((x_train, y_train), axis=1)
shuffledTrain = np.random.permutation(fullTrain)