# Imports shared by the train() variants below. The module paths are an
# assumption based on the word-rnn-tensorflow layout (TextLoader in utils.py,
# Model in model.py); all variants target TensorFlow 1.x.
import os
import subprocess
import time

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from six.moves import cPickle

from model import Model
from utils import TextLoader


def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length,
                             args.input_encoding)
    args.vocab_size = data_loader.vocab_size

    # check compatibility if training is continued from a previously saved model
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(args.init_from), \
            "%s must be a path" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "words_vocab.pkl")), \
            "words_vocab.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                "Command line argument and saved model disagree on '%s'" % checkme

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'words_vocab.pkl'), 'rb') as f:
            saved_words, saved_vocab = cPickle.load(f)
        assert saved_words == data_loader.words, \
            "Data and loaded model disagree on word set!"
        assert saved_vocab == data_loader.vocab, \
            "Data and loaded model disagree on dictionary mappings!"

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.words, data_loader.vocab), f)

    model = Model(args)

    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(args.log_dir)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_mem)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_writer.add_graph(sess.graph)
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())

        # restore model
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)

        for e in range(model.epoch_pointer.eval(), args.num_epochs):
            sess.run(tf.assign(model.lr,
                               args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            speed = 0
            if args.init_from is None:
                sess.run(model.epoch_pointer.assign(e))
            if args.init_from is not None:
                data_loader.pointer = model.batch_pointer.eval()
                args.init_from = None

            for b in range(data_loader.pointer, data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x,
                        model.targets: y,
                        model.initial_state: state,
                        model.batch_time: speed}
                summary, train_loss, state, _, _ = sess.run(
                    [merged, model.cost, model.final_state,
                     model.train_op, model.inc_batch_pointer_op], feed)
                train_writer.add_summary(summary,
                                         e * data_loader.num_batches + b)
                speed = time.time() - start
                if (e * data_loader.num_batches + b) % args.batch_size == 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                          .format(e * data_loader.num_batches + b,
                                  args.num_epochs * data_loader.num_batches,
                                  e, train_loss, speed))
                # save every save_every steps, and for the last result
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1
                            and b == data_loader.num_batches - 1):
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path,
                               global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
        train_writer.close()
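# For context, a minimal sketch of a driver for the train() variant above.
# The flag names mirror the attributes the function reads off args; the
# defaults are illustrative assumptions, not values taken from this code.
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='data/tinyshakespeare')
    parser.add_argument('--save_dir', type=str, default='save')
    parser.add_argument('--log_dir', type=str, default='logs')
    parser.add_argument('--input_encoding', type=str, default=None)
    parser.add_argument('--model', type=str, default='lstm')
    parser.add_argument('--rnn_size', type=int, default=256)
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--batch_size', type=int, default=50)
    parser.add_argument('--seq_length', type=int, default=25)
    parser.add_argument('--num_epochs', type=int, default=50)
    parser.add_argument('--learning_rate', type=float, default=0.002)
    parser.add_argument('--decay_rate', type=float, default=0.97)
    parser.add_argument('--save_every', type=int, default=1000)
    parser.add_argument('--gpu_mem', type=float, default=0.66)
    parser.add_argument('--init_from', type=str, default=None)
    train(parser.parse_args())


if __name__ == '__main__':
    main()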
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length,
                             args.input_encoding)
    args.vocab_size = data_loader.vocab_size

    # check compatibility if training is continued from a previously saved model
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(args.init_from), \
            "%s must be a path" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "words_vocab.pkl")), \
            "words_vocab.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                "Command line argument and saved model disagree on '%s'" % checkme

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'words_vocab.pkl'), 'rb') as f:
            saved_words, saved_vocab = cPickle.load(f)
        assert saved_words == data_loader.words, \
            "Data and loaded model disagree on word set!"
        assert saved_vocab == data_loader.vocab, \
            "Data and loaded model disagree on dictionary mappings!"

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.words, data_loader.vocab), f)

    model = Model(args)

    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(args.log_dir)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_mem)

    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=True)) as sess:
        # fareed: was tf.ConfigProto(gpu_options=gpu_options)
        train_writer.add_graph(sess.graph)
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())

        # fareed: dump the graph as a Graphviz dot file for offline inspection
        dot_rep = graph_to_dot(sess.graph)
        # s = Source(dot_rep, filename="test.gv", format="PNG")
        with open('./profs/rnn.dot', 'w') as fwr:
            fwr.write(str(dot_rep))

        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()

        # record the static byte size of each op's first output tensor
        operations_tensors = {}
        operations_names = tf.get_default_graph().get_operations()
        count1 = 0  # ops whose size could not be computed (unknown rank/dtype)
        count2 = 0  # ops with no output tensors
        for operation in operations_names:
            operation_name = operation.name
            operations_info = tf.get_default_graph() \
                .get_operation_by_name(operation_name).values()
            if len(operations_info) > 0:
                if operations_info[0].shape.ndims is not None:
                    operation_shape = operations_info[0].shape.as_list()
                    operation_dtype_size = operations_info[0].dtype.size
                    if operation_dtype_size is not None:
                        operation_no_of_elements = 1
                        for dim in operation_shape:
                            if dim is not None:
                                operation_no_of_elements *= dim
                        operations_tensors[operation_name] = (
                            operation_no_of_elements * operation_dtype_size)
                    else:
                        count1 += 1
                else:
                    count1 += 1
                    operations_tensors[operation_name] = -1
            else:
                count2 += 1
                operations_tensors[operation_name] = -1
        print(count1)
        print(count2)
        with open('./profs/tensors_sz_32.txt', 'w') as f:
            for tensor, size in operations_tensors.items():
                f.write('"' + tensor + '"::' + str(size) + '\n')
        # end fareed

        # restore model
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)

        for e in range(model.epoch_pointer.eval(), args.num_epochs):
            sess.run(tf.assign(model.lr,
                               args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            speed = 0
            if args.init_from is None:
                sess.run(model.epoch_pointer.assign(e))
            if args.init_from is not None:
                data_loader.pointer = model.batch_pointer.eval()
                args.init_from = None

            for b in range(data_loader.pointer, data_loader.num_batches):
                x, y = data_loader.next_batch()
                feed = {model.input_data: x,
                        model.targets: y,
                        model.initial_state: state}
                start = time.time()
                if b % 10 == 7:
                    # traced step: collect full step stats for profiling
                    summary, train_loss, state, _, _ = sess.run(
                        [merged, model.cost, model.final_state,
                         model.train_op, model.inc_batch_pointer_op],
                        feed, run_metadata=run_metadata, options=options)
                    profile(run_metadata, b)
                    if b == 7:
                        options_mem = \
                            tf.profiler.ProfileOptionBuilder.time_and_memory()
                        options_mem["min_bytes"] = 0
                        options_mem["min_micros"] = 0
                        options_mem["output"] = 'file:outfile=./profs/mem.txt'
                        options_mem["select"] = ("bytes", "peak_bytes",
                                                 "output_bytes",
                                                 "residual_bytes")
                        mem = tf.profiler.profile(tf.get_default_graph(),
                                                  run_meta=run_metadata,
                                                  cmd="scope",
                                                  options=options_mem)
                else:
                    summary, train_loss, state, _, _ = sess.run(
                        [merged, model.cost, model.final_state,
                         model.train_op, model.inc_batch_pointer_op], feed)
                speed = time.time() - start
                train_writer.add_summary(summary,
                                         e * data_loader.num_batches + b)
                if (e * data_loader.num_batches + b) \
                        % int(args.batch_size / 10) == 0 and b % 10 != 7:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                          .format(e * data_loader.num_batches + b,
                                  args.num_epochs * data_loader.num_batches,
                                  e, train_loss, speed))
                # checkpoint saving is disabled in this profiling variant:
                # if (e * data_loader.num_batches + b) % args.save_every == 0 \
                #         or (e == args.num_epochs - 1
                #             and b == data_loader.num_batches - 1):
                #     checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                #     saver.save(sess, checkpoint_path,
                #                global_step=e * data_loader.num_batches + b)
                #     print("model saved to {}".format(checkpoint_path))
        train_writer.close()
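# The profiling variant above calls two helpers, graph_to_dot and profile,
# that are not defined in this file. Below is a plausible minimal sketch of
# profile, assuming it dumps one Chrome trace per traced step via TF1's
# tensorflow.python.client.timeline; the ./profs paths mirror the ones used
# above, and the file naming is a guess.
from tensorflow.python.client import timeline


def profile(run_metadata, step):
    # Convert the step stats collected under FULL_TRACE into a Chrome trace
    # that can be inspected at chrome://tracing.
    fetched_timeline = timeline.Timeline(run_metadata.step_stats)
    chrome_trace = fetched_timeline.generate_chrome_trace_format(
        show_memory=True)
    if not os.path.isdir('./profs'):
        os.makedirs('./profs')
    with open('./profs/timeline_step_{}.json'.format(step), 'w') as f:
        f.write(chrome_trace)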
def train(args):
    tf.reset_default_graph()
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length,
                             args.input_encoding)
    args.vocab_size = data_loader.vocab_size

    # check compatibility if training is continued from a previously saved model
    if args.init_from is not None:
        try:
            # check if all necessary files exist
            assert os.path.isdir(args.init_from), \
                "%s must be a path" % args.init_from
            assert os.path.isfile(
                os.path.join(args.init_from, "config.pkl")), \
                "config.pkl file does not exist in path %s" % args.init_from
            assert os.path.isfile(
                os.path.join(args.init_from, "words_vocab.pkl")), \
                "words_vocab.pkl file does not exist in path %s" % args.init_from
            ckpt = tf.train.get_checkpoint_state(args.init_from)
            assert ckpt, "No checkpoint found"
            assert ckpt.model_checkpoint_path, \
                "No model path found in checkpoint"

            # open old config and check if models are compatible
            with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
                saved_model_args = cPickle.load(f)
            need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
            for checkme in need_be_same:
                assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                    "Command line argument and saved model disagree on '%s'" % checkme

            # open saved vocab/dict and check if vocabs/dicts are compatible
            with open(os.path.join(args.init_from, 'words_vocab.pkl'),
                      'rb') as f:
                saved_words, saved_vocab = cPickle.load(f)
            assert saved_words == data_loader.words, \
                "Data and loaded model disagree on word set!"
            assert saved_vocab == data_loader.vocab, \
                "Data and loaded model disagree on dictionary mappings!"
        except Exception:
            print("Could not init from old file")

    # dump new config and vocab
    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.words, data_loader.vocab), f)

    model = Model(args)

    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(args.log_dir)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_mem)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        # bundle the pieces needed by the (commented-out) in-process sampler
        model_dict = {
            "model": model,
            "words": data_loader.words,
            "vocab": data_loader.vocab,
            "sess": sess,
        }
        train_writer.add_graph(sess.graph)

        # write the graph out immediately
        writer = tf.summary.FileWriter(os.path.join(args.save_dir, "graph"),
                                       sess.graph)
        writer.close()

        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())

        # restore model
        if args.init_from is not None:
            try:
                saver.restore(sess, ckpt.model_checkpoint_path)
            except Exception:
                print("Could not restore")

        # epoch loop
        for e in range(model.epoch_pointer.eval(), args.num_epochs):
            sess.run(tf.assign(model.lr,
                               args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            speed = 0
            if args.init_from is None:
                sess.run(model.epoch_pointer.assign(e))
            if args.init_from is not None:
                try:
                    data_loader.pointer = model.batch_pointer.eval()
                    args.init_from = None
                except Exception:
                    pass

            # batch step loop
            for b in range(data_loader.pointer, data_loader.num_batches):
                start = time.time()
                x, y, last_words, syllables, topic_words = \
                    data_loader.next_batch()
                # concatenate inputs
                # x = tf.concat([x[:, :, None], last_words[:, :, None]], 2)
                if args.end_word_training:
                    feed = {model.input_data: x,
                            model.targets: last_words,
                            model.bonus_features: last_words,
                            model.initial_state: state,
                            model.syllables: syllables,
                            model.topic_words: topic_words,
                            model.batch_time: speed}
                elif args.syllable_training:
                    # note: identical to the end_word_training feed above
                    feed = {model.input_data: x,
                            model.targets: last_words,
                            model.bonus_features: last_words,
                            model.initial_state: state,
                            model.syllables: syllables,
                            model.topic_words: topic_words,
                            model.batch_time: speed}
                else:
                    feed = {model.input_data: x,
                            model.targets: y,
                            model.bonus_features: last_words,
                            model.initial_state: state,
                            model.syllables: syllables,
                            model.topic_words: topic_words,
                            model.batch_time: speed}
                summary, train_loss, state, _, _ = sess.run(
                    [merged, model.cost, model.final_state,
                     model.train_op, model.inc_batch_pointer_op], feed)
                train_writer.add_summary(summary,
                                         e * data_loader.num_batches + b)
                speed = time.time() - start
                if (e * data_loader.num_batches + b) % args.batch_size == 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                          .format(e * data_loader.num_batches + b,
                                  args.num_epochs * data_loader.num_batches,
                                  e, train_loss, speed))
                # if b % 1000 in [1, 100] \
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1
                            and b == data_loader.num_batches - 1):
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path,
                               global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
                    # sample.main(save_dir=args.save_dir,
                    #             output_path="sample.txt",
                    #             internal_call=True, model=model_dict)
                    python_path = "python"
                    # python_path = r"/usr/bin/python2.6/python"
                    if args.sample:
                        subprocess.call(
                            "python sample.py -e turtle -o sample.txt -s {}"
                            .format(args.save_dir).split(),
                            shell=False)
        train_writer.close()
def run(self):
    data_loader = TextLoader(
        self.config.data_dir,
        self.config.batch_size,
        self.config.seq_length,
        self.config.input_encoding,
    )
    self.config.vocab_size = data_loader.vocab_size

    # check compatibility if training is continued from a previously
    # saved model
    if self.config.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(self.config.init_from), (
            '{} must be a path'.format(self.config.init_from))
        assert os.path.isfile(
            os.path.join(self.config.init_from, 'config.pkl')), (
                'config.pkl file does not exist in path {}'.format(
                    self.config.init_from))
        assert os.path.isfile(
            os.path.join(self.config.init_from, 'words_vocab.pkl')), (
                'words_vocab.pkl file does not exist in path {}'.format(
                    self.config.init_from))
        ckpt = tf.train.get_checkpoint_state(self.config.init_from)
        assert ckpt, 'No checkpoint found'
        assert ckpt.model_checkpoint_path, (
            'No model path found in checkpoint')

        # open old config and check if models are compatible
        with open(os.path.join(self.config.init_from, 'config.pkl'),
                  'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ['model', 'rnn_size', 'num_layers', 'seq_length']
        for checkme in need_be_same:
            assert (vars(saved_model_args)[checkme]
                    == vars(self.config)[checkme]), (
                'Command line argument and saved model disagree '
                'on "{}".'.format(checkme))

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(self.config.init_from, 'words_vocab.pkl'),
                  'rb') as f:
            saved_words, saved_vocab = cPickle.load(f)
        assert saved_words == data_loader.words, (
            'Data and loaded model disagree on word set!')
        assert saved_vocab == data_loader.vocab, (
            'Data and loaded model disagree on dictionary mappings!')

    with open(os.path.join(self.config.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(self.config, f)
    with open(os.path.join(self.config.save_dir, 'words_vocab.pkl'),
              'wb') as f:
        cPickle.dump((data_loader.words, data_loader.vocab), f)

    model = Model(self.config)

    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(self.config.log_dir)
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=self.config.gpu_mem)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_writer.add_graph(sess.graph)
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())

        # restore model
        if self.config.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)

        for e in range(model.epoch_pointer.eval(), self.config.num_epochs):
            sess.run(tf.assign(
                model.lr,
                self.config.learning_rate * (self.config.decay_rate ** e),
            ))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            speed = 0
            if self.config.init_from is None:
                sess.run(model.epoch_pointer.assign(e))
            if self.config.init_from is not None:
                data_loader.pointer = model.batch_pointer.eval()
                self.config.init_from = None

            for b in range(data_loader.pointer, data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {
                    model.input_data: x,
                    model.targets: y,
                    model.initial_state: state,
                    model.batch_time: speed,
                }
                summary, train_loss, state, _, _ = sess.run([
                    merged,
                    model.cost,
                    model.final_state,
                    model.train_op,
                    model.inc_batch_pointer_op,
                ], feed)
                train_writer.add_summary(summary,
                                         e * data_loader.num_batches + b)
                speed = time.time() - start
                if ((e * data_loader.num_batches + b)
                        % self.config.batch_size == 0):
                    print('{}/{} (epoch {}), train_loss = {:.3f}, '
                          'time/batch = {:.3f}'.format(
                              e * data_loader.num_batches + b,
                              self.config.num_epochs
                              * data_loader.num_batches,
                              e, train_loss, speed))
                # save every save_every steps and for the last result;
                # this variant stamps the checkpoint name with the loss
                if ((e * data_loader.num_batches + b)
                        % self.config.save_every == 0
                        or (e == self.config.num_epochs - 1
                            and b == data_loader.num_batches - 1)):
                    checkpoint_path = os.path.join(
                        self.config.save_dir,
                        'model-{:.3f}.ckpt'.format(train_loss),
                    )
                    saver.save(sess, checkpoint_path,
                               global_step=e * data_loader.num_batches + b)
                    print('model saved to {}'.format(checkpoint_path))
        train_writer.close()
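# Because the variant above stamps checkpoint names with the training loss
# (e.g. model-1.234.ckpt-500), the lowest-loss checkpoint can be recovered
# later by parsing the names. A hedged sketch, not part of the original code;
# best_checkpoint is a hypothetical helper name.
import re


def best_checkpoint(save_dir):
    # Return the retained checkpoint path with the smallest loss in its name.
    ckpt = tf.train.get_checkpoint_state(save_dir)
    if not ckpt or not ckpt.all_model_checkpoint_paths:
        return None

    def loss_of(path):
        m = re.search(r'model-([0-9.]+)\.ckpt', path)
        return float(m.group(1)) if m else float('inf')

    return min(ckpt.all_model_checkpoint_paths, key=loss_of)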
def train(args):
    # start by getting the data_loader object
    data_loader = TextLoader(args.reverse, args.data_dir, args.test_split,
                             args.batch_size, args.seq_length,
                             args.input_encoding)

    # some informative prints
    args.vocab_size = data_loader.vocab_size
    print("Train size: ", data_loader.num_batches * args.batch_size)
    if args.test_split > 0:
        print("Test size: ", data_loader.test_num_batches * args.batch_size)
    print("Vocab size: ", args.vocab_size)

    # check compatibility if training is continued from a previously saved model
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(args.init_from), \
            "%s must be a path" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "words_vocab.pkl")), \
            "words_vocab.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                "Command line argument and saved model disagree on '%s'" % checkme

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'words_vocab.pkl'), 'rb') as f:
            saved_words, saved_vocab = cPickle.load(f)
        assert saved_words == data_loader.words, \
            "Data and loaded model disagree on word set!"
        assert saved_vocab == data_loader.vocab, \
            "Data and loaded model disagree on dictionary mappings!"

    # save the config and vocab so the model can be rebuilt when sampling
    # or resuming
    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.words, data_loader.vocab), f)

    # start up the model
    model = Model(args)

    # if a test split is requested, get it
    if args.test_split > 0:
        test_x = data_loader.test_x
        test_y = data_loader.test_y

    # summary logging setup
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(args.log_dir)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_mem)

    # begin the session for training
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        # take a look at the learning rate schedule, if so desired
        # (the parameters here should match the ones used for training)
        plot = False
        if plot:
            n = args.num_epochs * data_loader.num_batches
            n = 150000  # override with a fixed horizon
            x = np.arange(n)
            y = cosine_decay_restarts(
                args.learning_rate,
                x,
                50000,  # shift down every epoch
                # check out this sweet graph:
                # https://github.com/tensorflow/tensorflow/pull/11749
                .9,
                .1,  # doesn't hurt to look at the tf docs too
                1e-12).eval()
            plt.figure()
            plt.plot(x, y)
            plt.title("Learning rate schedule")
            plt.show()

        # log the graph and initialize variables
        train_writer.add_graph(sess.graph)
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())

        # initialize from a previous model OR start from scratch, which
        # means grabbing GloVe embeddings
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Loading my knowledge of the English language...")
            embeddings = data_loader.get_embeddings()
            sess.run([model.embedding_init],
                     {model.embedding_placeholder: embeddings})

        # iterate over the range of epochs specified
        for e in range(model.epoch_pointer.eval(), args.num_epochs):
            # vanilla exponential decay would be:
            # sess.run(tf.assign(model.lr,
            #                    args.learning_rate * (args.decay_rate ** e)))
            # learning rate decay is cosine annealing instead
            sess.run(tf.assign(
                model.lr,
                cosine_decay_restarts(
                    args.learning_rate,
                    e * data_loader.num_batches,
                    20000,  # shift down every epoch
                    # check out this sweet graph:
                    # https://github.com/tensorflow/tensorflow/pull/11749
                    1,
                    .1,  # doesn't hurt to look at the tf docs too
                    1e-12)))

            # reset the pointer to start from the beginning
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            speed = 0
            if args.init_from is None:
                sess.run(model.epoch_pointer.assign(e))
            if args.init_from is not None:
                data_loader.pointer = model.batch_pointer.eval()
                args.init_from = None

            # iterate over the batches in the dataset
            for b in range(data_loader.pointer, data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                # the feed dictionary gets passed to the model when
                # tensorflow variables are computed
                feed = {model.input_data: x,
                        model.targets: y,
                        model.initial_state: state,
                        model.batch_time: speed,
                        model.dropout: args.dropout}
                # ops to run, either with or without training the embeddings
                run_list_full = [merged, model.cost, model.final_state,
                                 model.train_op, model.inc_batch_pointer_op]
                run_list_no_W = [merged, model.cost, model.final_state,
                                 model.train_op_no_W,
                                 model.inc_batch_pointer_op]
                if args.trainable_embeddings == 1:
                    # yes, train the embeddings
                    summary, train_loss, state, _, _ = sess.run(
                        run_list_full, feed)
                elif args.trainable_embeddings == 0:
                    # no, do not train the embeddings (train_op_no_W)
                    summary, train_loss, state, _, _ = sess.run(
                        run_list_no_W, feed)
                elif e > args.trainable_embeddings:
                    # it's been e epochs, so start training the embeddings
                    summary, train_loss, state, _, _ = sess.run(
                        run_list_full, feed)
                else:
                    # it hasn't been e epochs, don't train the embeddings
                    summary, train_loss, state, _, _ = sess.run(
                        run_list_no_W, feed)

                # some diagnostics get printed, and the model is saved here
                train_writer.add_summary(summary,
                                         e * data_loader.num_batches + b)
                speed = time.time() - start
                if (e * data_loader.num_batches + b) % args.batch_size == 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                          .format(e * data_loader.num_batches + b,
                                  args.num_epochs * data_loader.num_batches,
                                  e, train_loss, speed))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1
                            and b == data_loader.num_batches - 1):
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path,
                               global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
                    print("learning rate: ", model.lr.eval())

                    # test loss eval: evaluates batch by batch with the same
                    # batch size as for training; the RNN state is saved and
                    # restored so training can continue afterwards
                    if args.test_split > 0:
                        test_loss = 0
                        batches_in_test = len(test_x)
                        save_state = state
                        state = sess.run(model.initial_state)
                        for i in range(batches_in_test):
                            feed = {model.test_x: test_x[i],
                                    model.test_y: test_y[i],
                                    model.initial_state: state}
                            loss, state, _ = sess.run(
                                [model.test_cost, model.test_final_state,
                                 model.inc_batch_pointer_op], feed)
                            test_loss += loss
                        test_loss = test_loss / batches_in_test
                        state = save_state
                        print("test_loss = {:.3f}".format(test_loss))

        # one final evaluation of the entire dataset to check the loss
        data_loader.reset_batch_pointer()
        state = sess.run(model.initial_state)
        ovr_loss = 0
        start = time.time()
        for b in range(data_loader.pointer, data_loader.num_batches):
            x, y = data_loader.next_batch()
            feed = {model.input_data: x,
                    model.targets: y,
                    model.initial_state: state}
            train_loss, state, _ = sess.run(
                [model.cost, model.final_state, model.inc_batch_pointer_op],
                feed)
            ovr_loss += train_loss
        speed = time.time() - start
        print("ovr_train_loss = {:.3f}, time_to_eval = {:.3f}".format(
            ovr_loss / data_loader.num_batches, speed))

        # lets you initialize (and save) a model without training it
        if args.num_epochs == 0:
            checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=0)
            print("model saved to {}".format(checkpoint_path))
        train_writer.close()
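# The cosine_decay_restarts used above is presumably
# tf.train.cosine_decay_restarts (added to TF via the PR linked in the
# comments). A quick sanity check of the annealing schedule outside the
# training loop, assuming that function; the step values are illustrative.
def print_lr_schedule():
    with tf.Session() as sess:
        for step in range(0, 60001, 10000):
            lr = sess.run(tf.train.cosine_decay_restarts(
                0.002,        # initial learning rate
                step,         # global step
                20000,        # first_decay_steps: steps until the first restart
                t_mul=1.0,    # every restart period has the same length
                m_mul=0.1,    # each restart peaks at 10% of the previous peak
                alpha=1e-12)) # floor, as a fraction of the initial rate
            print("step {:6d}: lr = {:.6g}".format(step, lr))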
def run_model(args, test=True):
    data_loader = TextLoader(args.data_train_dir, args.batch_size,
                             args.seq_length, args.input_encoding)
    args.vocab_size = data_loader.vocab_size

    # check compatibility if training is continued from a previously saved model
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(args.init_from), \
            "%s must be a path" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "words_vocab.pkl")), \
            "words_vocab.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                "Command line argument and saved model disagree on '%s'" % checkme

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'words_vocab.pkl'), 'rb') as f:
            saved_words, saved_vocab = cPickle.load(f)
        assert saved_words == data_loader.words, \
            "Data and loaded model disagree on word set!"
        assert saved_vocab == data_loader.vocab, \
            "Data and loaded model disagree on dictionary mappings!"

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.words, data_loader.vocab), f)

    model = Model(args)

    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(args.log_dir)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_mem)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_writer.add_graph(sess.graph)
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())

        # restore model
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)

        for e in range(model.epoch_pointer.eval(), args.num_epochs):
            sess.run(tf.assign(model.lr,
                               args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            speed = 0
            if args.init_from is None:
                sess.run(model.epoch_pointer.assign(e))
            if args.init_from is not None:
                data_loader.pointer = model.batch_pointer.eval()
                args.init_from = None

            for b in range(data_loader.pointer, data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x,
                        model.targets: y,
                        model.initial_state: state,
                        model.batch_time: speed}
                summary, train_loss, state, _, _ = sess.run(
                    [merged, model.cost, model.final_state,
                     model.train_op, model.inc_batch_pointer_op], feed)
                train_writer.add_summary(summary,
                                         e * data_loader.num_batches + b)
                speed = time.time() - start
                if (e * data_loader.num_batches + b) % args.batch_size == 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                          .format(e * data_loader.num_batches + b,
                                  args.num_epochs * data_loader.num_batches,
                                  e, train_loss, speed))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1
                            and b == data_loader.num_batches - 1):
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path,
                               global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
        train_writer.close()

        if test:
            # todo: this is very hacky... change it later
            test_data_loader = TextLoader(args.data_test_dir, args.batch_size,
                                          args.seq_length, args.input_encoding)
            # loop over the entire data set and generate the probabilities of
            # the next word (note this still fetches train_op, so the model
            # keeps updating while it is being evaluated)
            first_batch = True
            for b in range(test_data_loader.pointer,
                           test_data_loader.num_batches):
                x, y = test_data_loader.next_batch()
                feed = {model.input_data: x,
                        model.targets: y,
                        model.initial_state: state,
                        model.batch_time: speed}
                summary, train_loss, state, probs, _, _ = sess.run(
                    [merged, model.cost, model.final_state, model.probs,
                     model.train_op, model.inc_batch_pointer_op], feed)
                # save probability vectors along with the text
                # print(np.shape(probs))
                # get probability and indices for the top k predictions
                k = 100
                prob_table_top_k, sorting_idx_table_top_k = \
                    get_top_k_probs_and_indices(probs, k)
                # collect info
                if first_batch:
                    PROBS = prob_table_top_k
                    IDX = sorting_idx_table_top_k
                    first_batch = False
                else:
                    PROBS = np.vstack([PROBS, prob_table_top_k])
                    IDX = np.vstack([IDX, sorting_idx_table_top_k])

            # save the probability table and indices
            print(np.shape(PROBS))
            print('whole seq length = %d' % test_data_loader.full_text_len)
            np.savez(os.path.join(args.data_test_dir, 'probs'),
                     prob_table=PROBS, idx_table=IDX)
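# get_top_k_probs_and_indices is called above but not defined in this file.
# A hypothetical implementation under the assumption that probs has shape
# (batch * seq_length, vocab_size) and that the helper returns the top-k
# probabilities per row together with their vocabulary indices:
def get_top_k_probs_and_indices(probs, k):
    # Sort each row descending, keep the k largest probabilities and the
    # vocabulary indices that produced them.
    probs = np.asarray(probs).reshape(-1, np.shape(probs)[-1])
    idx = np.argsort(probs, axis=1)[:, ::-1][:, :k]
    top = probs[np.arange(probs.shape[0])[:, None], idx]
    return top, idx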
def train(args):
    # parse text data and record statistics
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    # check compatibility if training is continued from a previously saved model
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(args.init_from), \
            "%s must be a path" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "words_vocab.pkl")), \
            "words_vocab.pkl file does not exist in path %s" % args.init_from

        # load checkpoint
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                "Command line argument and saved model disagree on '%s'" % checkme

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'words_vocab.pkl'), 'rb') as f:
            saved_words, saved_vocab = cPickle.load(f)
        assert saved_words == data_loader.words, \
            "Data and loaded model disagree on word set!"
        assert saved_vocab == data_loader.vocab, \
            "Data and loaded model disagree on dictionary mappings!"

    # save arguments to config.pkl
    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    # save words parsed by data_loader to words_vocab.pkl
    with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.words, data_loader.vocab), f)

    # input/output placeholders, loss op, optimizer, train_op etc. are
    # defined in model.py
    model = Model(args)

    # tf.summary.merge_all(key=tf.GraphKeys.SUMMARIES) merges all summaries
    # collected in the default graph.
    #   key: GraphKey used to collect the summaries;
    #        defaults to GraphKeys.SUMMARIES.
    #   Returns: None if no summaries were collected, otherwise a scalar
    #   Tensor of type string containing the serialized Summary protocol
    #   buffer resulting from the merging.
    merged = tf.summary.merge_all()

    # the FileWriter class provides a mechanism to create an event file in a
    # given directory and add summaries and events to it
    train_writer = tf.summary.FileWriter(args.log_dir)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_mem)

    # launch the graph in a session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        # add the graph to the event file
        train_writer.add_graph(sess.graph)
        # run the op that initializes global variables
        tf.global_variables_initializer().run()
        # tf.global_variables() returns all global variables (a list of
        # Variable objects); the Saver class adds ops to save and restore
        # variables to and from checkpoints
        saver = tf.train.Saver(tf.global_variables())

        # restore model
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)

        # e: epoch number
        for e in range(model.epoch_pointer.eval(), args.num_epochs):
            # model.lr <- args.learning_rate * (args.decay_rate ** e)
            sess.run(tf.assign(model.lr,
                               args.learning_rate * (args.decay_rate ** e)))
            # pointer <- 0
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            speed = 0
            if args.init_from is None:
                # assign 0 to batch_pointer
                sess.run(model.batch_pointer.assign(0))
                # assign e to epoch_pointer
                sess.run(model.epoch_pointer.assign(e))
            if args.init_from is not None:
                data_loader.pointer = model.batch_pointer.eval()
                args.init_from = None

            for b in range(data_loader.pointer, data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x,
                        model.targets: y,
                        model.initial_state: state,
                        model.batch_time: speed}
                summary, train_loss, state, _, _ = sess.run(
                    [merged, model.cost, model.final_state,
                     model.train_op, model.inc_batch_pointer_op], feed)
                # wrap the summary in an Event protocol buffer and add it
                # to the event file
                train_writer.add_summary(summary,
                                         e * data_loader.num_batches + b)
                speed = time.time() - start
                if (e * data_loader.num_batches + b) % args.batch_size == 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                          .format(e * data_loader.num_batches + b,
                                  args.num_epochs * data_loader.num_batches,
                                  e, train_loss, speed))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1
                            and b == data_loader.num_batches - 1):
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path,
                               global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
        train_writer.close()
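# All of these variants share the same resume protocol: pass the previous
# save directory as init_from, and the epoch_pointer / batch_pointer stored
# in the checkpoint decide where training picks up. A sketch of a resume
# invocation, assuming the illustrative argparse driver shown earlier; the
# model/rnn_size/num_layers values must match the saved config or the
# compatibility asserts will fire.
import argparse

resume_args = argparse.Namespace(
    data_dir='data/tinyshakespeare', save_dir='save', log_dir='logs',
    input_encoding=None, batch_size=50, seq_length=25, num_epochs=50,
    learning_rate=0.002, decay_rate=0.97, save_every=1000, gpu_mem=0.66,
    model='lstm', rnn_size=256, num_layers=2,
    init_from='save',  # resume from the checkpoints written to save_dir
)
# train(resume_args)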