def __init__(self, action):
    '''Preprocess the IMDB dataset.'''
    self.paths = prjPaths()
    self.ROOT_DATA_DIR = self.paths.ROOT_DATA_DIR
    self.DATASET = 'imdb'
    self.CSVFILENAME = os.path.join(self.ROOT_DATA_DIR, self.DATASET,
                                    '{}.csv'.format(self.DATASET))

    assert action in ['create', 'fetch'], 'invalid action'

    if action == 'create':

        # if creating a new csv, remove the old one if it exists
        if os.path.exists(self.CSVFILENAME):
            print('removing existing csv file from {}'.format(self.CSVFILENAME))
            os.remove(self.CSVFILENAME)

        # directory structure
        train_dir = os.path.join(self.ROOT_DATA_DIR, self.DATASET, 'aclImdb', 'train')
        test_dir = os.path.join(self.ROOT_DATA_DIR, self.DATASET, 'aclImdb', 'test')
        trainPos_dir = os.path.join(train_dir, 'pos')
        trainNeg_dir = os.path.join(train_dir, 'neg')
        testPos_dir = os.path.join(test_dir, 'pos')
        testNeg_dir = os.path.join(test_dir, 'neg')

        self.data = {
            'trainPos': self._getDirContents(trainPos_dir),
            'trainNeg': self._getDirContents(trainNeg_dir),
            'testPos': self._getDirContents(testPos_dir),
            'testNeg': self._getDirContents(testNeg_dir)
        }
def __init__(self, action): """ desc: this class is used to process the imdb dataset args: action: specify whether to create or fetch the data using the IMDB class """ self.paths = prjPaths() self.ROOT_DATA_DIR = self.paths.ROOT_DATA_DIR self.DATASET = "imdb" self.CSVFILENAME = os.path.join(self.ROOT_DATA_DIR, self.DATASET, "{}.csv".format(self.DATASET)) assert(action in ["create", "fetch"]), "invalid action" if action == "create": # if creating new csv remove old if one exists if os.path.exists(self.CSVFILENAME): print("removing existing csv file from {}".format(self.CSVFILENAME)) os.remove(self.CSVFILENAME) # directory structure train_dir = os.path.join(self.ROOT_DATA_DIR, self.DATASET, "aclImdb", "train") test_dir = os.path.join(self.ROOT_DATA_DIR, self.DATASET, "aclImdb", "test") trainPos_dir = os.path.join(train_dir, "pos") trainNeg_dir = os.path.join(train_dir, "neg") testPos_dir = os.path.join(test_dir, "pos") testNeg_dir = os.path.join(test_dir, "neg") self.data = {"trainPos": self._getDirContents(trainPos_dir), "trainNeg": self._getDirContents(trainNeg_dir), "testPos": self._getDirContents(testPos_dir), "testNeg": self._getDirContents(testNeg_dir)}
def __init__(self, action=None):
    self.paths = prjPaths()
    self.ROOT_DATA_DIR = self.paths.ROOT_DATA_DIR

    assert action in ["create", None], "invalid action"

    if action == "create":

        # directory structure
        train_dir = os.path.join(self.ROOT_DATA_DIR, "train")
        test_dir = os.path.join(self.ROOT_DATA_DIR, "test")
        trainPos_dir = os.path.join(train_dir, "pos")
        trainNeg_dir = os.path.join(train_dir, "neg")
        testPos_dir = os.path.join(test_dir, "pos")
        testNeg_dir = os.path.join(test_dir, "neg")

        self.data = {
            "trainPos": self._getDirContents(trainPos_dir),
            "trainNeg": self._getDirContents(trainNeg_dir),
            "testPos": self._getDirContents(testPos_dir),
            "testNeg": self._getDirContents(testNeg_dir)
        }
def main():
    args = get_args()
    prjPaths_ = prjPaths()

    # determine if a gpu is present
    gpu_available = torch.cuda.device_count() > 0

    if args.run_type == "train":
        train(gpu_available=gpu_available,
              prjPaths=prjPaths_,
              n=args.n,
              training_steps=args.training_steps,
              batch_size=args.batch_size,
              learning_rate=args.learning_rate,
              show_every_n_steps=args.show_every_n_steps,
              checkpoint_every_n_steps=args.checkpoint_every_n_steps,
              verbose=args.verbose,
              clip_value=args.clip_value)
    elif args.run_type == "inference":
        inference(gpu_available, prjPaths=prjPaths_)
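# get_args() is defined elsewhere in the repo; a minimal argparse sketch that is
# merely consistent with the flags main() reads might look like the following
# (types and defaults here are illustrative assumptions, not the repo's values).
import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--run_type", type=str, default="train", choices=["train", "inference"])
    parser.add_argument("--n", type=int, default=2)
    parser.add_argument("--training_steps", type=int, default=100000)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--learning_rate", type=float, default=1e-3)
    parser.add_argument("--show_every_n_steps", type=int, default=100)
    parser.add_argument("--checkpoint_every_n_steps", type=int, default=1000)
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--clip_value", type=float, default=1.0)
    return parser.parse_args()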
"test_sent_size.npy") _write_binaryfile(nparray=x_train, filename=train_bin_filename_x) _write_binaryfile(nparray=y_train, filename=train_bin_filename_y) _write_binaryfile(nparray=docsize_train, filename=train_bin_filename_docsize) _write_binaryfile(nparray=sent_size_train, filename=train_bin_filename_sent_size) _write_binaryfile(nparray=x_val, filename=val_bin_filename_x) _write_binaryfile(nparray=y_val, filename=val_bin_filename_y) _write_binaryfile(nparray=docsize_val, filename=val_bin_filename_docsize) _write_binaryfile(nparray=sent_size_val, filename=val_bin_filename_sent_size) _write_binaryfile(nparray=x_test, filename=test_bin_filename_x) _write_binaryfile(nparray=y_test, filename=test_bin_filename_y) _write_binaryfile(nparray=docsize_test, filename=test_bin_filename_docsize) _write_binaryfile(nparray=sent_size_test, filename=test_bin_filename_sent_size) # end if __name__ == "__main__": paths = prjPaths() args = get_args() serialize_data(paths, args=args)
def csvExist():
    paths = prjPaths()
    csvExists = "imdb.csv" in os.listdir(paths.ROOT_DATA_DIR)
    return csvExists
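# Hedged wiring sketch: csvExist() can gate the one-time csv creation step so
# repeated runs skip the expensive parse (this glue is an assumption, not code
# taken from the repo).
if not csvExist():
    IMDB(action="create")    # process the raw dataset on the first run
imdb = IMDB(action="fetch")  # subsequent runs only read the processed data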
def train():

    paths = prjPaths()

    with open(os.path.join(paths.LIB_DIR, CONFIG['dataset'], 'persisted_vars.p'), 'rb') as handle:
        persisted_vars = pickle.load(handle)

    persisted_vars['embedding_dim'] = CONFIG['embedding_dim']
    persisted_vars['max_grad_norm'] = CONFIG['max_grad_norm']
    persisted_vars['dropout_keep_proba'] = CONFIG['dropout_keep_proba']
    persisted_vars['learning_rate'] = CONFIG['learning_rate']

    pickle.dump(persisted_vars,
                open(os.path.join(paths.LIB_DIR, CONFIG['dataset'], 'persisted_vars.p'), 'wb'))

    with tf.Graph().as_default():

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=CONFIG['per_process_gpu_memory_fraction'])
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False,
                                      gpu_options=gpu_options)
        session_conf.gpu_options.allocator_type = 'BFC'

        with tf.Session(config=session_conf) as sess:

            han = HAN(max_seq_len=persisted_vars['max_seq_len'],
                      max_sent_len=persisted_vars['max_sent_len'],
                      num_classes=persisted_vars['num_classes'],
                      vocab_size=persisted_vars['vocab_size'],
                      embedding_size=persisted_vars['embedding_dim'],
                      max_grad_norm=persisted_vars['max_grad_norm'],
                      dropout_keep_proba=persisted_vars['dropout_keep_proba'],
                      learning_rate=persisted_vars['learning_rate'])

            global_step = tf.Variable(0, name='global_step', trainable=False)

            # gradient clipping needs the trainable variables
            tvars = tf.trainable_variables()
            grads, global_norm = tf.clip_by_global_norm(tf.gradients(han.loss, tvars),
                                                        han.max_grad_norm)

            optimizer = tf.train.AdamOptimizer(han.learning_rate)  # TODO: try other optimizer hyperparameters
            train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                 name='train_op',
                                                 global_step=global_step)

            merge_summary_op = tf.summary.merge_all()
            train_summary_writer = tf.summary.FileWriter(
                os.path.join(paths.SUMMARY_DIR, CONFIG['run_type']), sess.graph)

            # TODO: consider saving the session object here instead
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=CONFIG['num_checkpoint'])

            sess.run(tf.global_variables_initializer())

            # _________train__________
            def train_step(epoch, x_batch, y_batch, docsize, sent_size, is_training):

                tic = time.time()

                feed_dict = {
                    han.input_x: x_batch,
                    han.input_y: y_batch,
                    han.sentence_lengths: docsize,
                    han.word_lengths: sent_size,
                    han.is_training: is_training
                }

                _, step, loss, accuracy, summaries = sess.run(
                    [train_op, global_step, han.loss, han.accuracy, merge_summary_op],
                    feed_dict=feed_dict)

                time_elapsed = time.time() - tic

                if is_training:
                    print('Training || CurrentEpoch: {} || GlobalStep: {} || ({} sec/step) || Loss {:g} || Accuracy {:g}'.format(
                        epoch + 1, step, time_elapsed, loss, accuracy))

                if step % CONFIG['log_summaries_every'] == 0:
                    train_summary_writer.add_summary(summaries, step)
                    print(f'Saved model summaries to {os.path.join(paths.SUMMARY_DIR, CONFIG["run_type"])} \n')

                if step % CONFIG['checkpoint_every'] == 0:
                    chkpt_path = saver.save(sess,
                                            os.path.join(paths.CHECKPOINT_DIR, 'han'),
                                            global_step=step)
                    print('Saved model checkpoint to {} \n'.format(chkpt_path))

            imdb = IMDB(action='fetch')
            x_train, y_train, docsize_train, sent_size_train = imdb.get_data(type=CONFIG['run_type'])

            for epoch, batch in imdb.get_batch(data=list(zip(x_train, y_train, docsize_train, sent_size_train)),
                                               batch_size=CONFIG['batch_size'],
                                               num_epoch=CONFIG['num_epochs']):
                x_batch, y_batch, docsize, sent_size = zip(*batch)

                train_step(epoch=epoch,
                           x_batch=x_batch,
                           y_batch=y_batch,
                           docsize=docsize,
                           sent_size=sent_size,
                           is_training=True)
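# CONFIG is assembled elsewhere (e.g. from command-line flags); the dictionary
# below only illustrates the keys train() reads -- every value is an assumption,
# not the repo's default.
CONFIG_EXAMPLE = {
    'dataset': 'imdb',
    'run_type': 'train',
    'embedding_dim': 100,
    'max_grad_norm': 5.0,
    'dropout_keep_proba': 0.5,
    'learning_rate': 1e-3,
    'per_process_gpu_memory_fraction': 0.9,
    'num_checkpoint': 5,
    'log_summaries_every': 50,
    'checkpoint_every': 100,
    'batch_size': 64,
    'num_epochs': 5,
}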
def test():

    MINUTE = 60

    paths = prjPaths()

    print('loading persisted variables...')
    with open(os.path.join(paths.LIB_DIR, CONFIG['dataset'], 'persisted_vars.p'), 'rb') as handle:
        persisted_vars = pickle.load(handle)

    graph = tf.Graph()
    with graph.as_default():

        # set GPU options
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=CONFIG['per_process_gpu_memory_fraction'])
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False,
                                      gpu_options=gpu_options)
        session_conf.gpu_options.allocator_type = 'BFC'

        with tf.Session(config=session_conf) as sess:

            # instantiate model
            han = HAN(max_seq_len=persisted_vars['max_seq_len'],
                      max_sent_len=persisted_vars['max_sent_len'],
                      num_classes=persisted_vars['num_classes'],
                      vocab_size=persisted_vars['vocab_size'],
                      embedding_size=persisted_vars['embedding_dim'],
                      max_grad_norm=persisted_vars['max_grad_norm'],
                      dropout_keep_proba=persisted_vars['dropout_keep_proba'],
                      learning_rate=persisted_vars['learning_rate'])

            global_step = tf.Variable(0, name='global_step', trainable=False)

            tvars = tf.trainable_variables()
            # clip_by_global_norm returns the clipped gradients and the global norm
            grads, global_norm = tf.clip_by_global_norm(tf.gradients(han.loss, tvars),
                                                        han.max_grad_norm)

            optimizer = tf.train.AdamOptimizer(han.learning_rate)
            test_op = optimizer.apply_gradients(zip(grads, tvars),
                                                name=f'{CONFIG["run_type"]}_op',
                                                global_step=global_step)

            merge_summary_op = tf.summary.merge_all()
            test_summary_writer = tf.summary.FileWriter(
                os.path.join(paths.SUMMARY_DIR, CONFIG['run_type']), sess.graph)

            meta_file = get_most_recently_create_file(
                [os.path.join(paths.CHECKPOINT_DIR, file)
                 for file in os.listdir(paths.CHECKPOINT_DIR) if file.endswith('.meta')])
            saver = tf.train.import_meta_graph(meta_file)

            sess.run(tf.global_variables_initializer())

            def test_step(sample_num, x_batch, y_batch, docsize, sent_size, is_training):

                feed_dict = {
                    han.input_x: x_batch,
                    han.input_y: y_batch,
                    han.sentence_lengths: docsize,
                    han.word_lengths: sent_size,
                    han.is_training: is_training
                }

                loss, accuracy = sess.run([han.loss, han.accuracy], feed_dict=feed_dict)
                return loss, accuracy

            if CONFIG['dataset'] == 'imdb':
                dataset_controller = IMDB(action='fetch')
            else:
                exit('set dataset flag to appropriate dataset')

            x, y, docsize, sent_size = dataset_controller.get_data(type=CONFIG['run_type'])

            all_evaluated_chkpts = []

            while True:

                if CONFIG['wait_for_checkpoint_files']:
                    time.sleep(2 * MINUTE)  # wait for a new checkpoint file to be created
                else:
                    time.sleep(0 * MINUTE)

                if tf.train.latest_checkpoint(paths.CHECKPOINT_DIR) in all_evaluated_chkpts:
                    continue

                saver.restore(sess, tf.train.latest_checkpoint(paths.CHECKPOINT_DIR))
                all_evaluated_chkpts.append(tf.train.latest_checkpoint(paths.CHECKPOINT_DIR))

                losses = []
                accuracies = []

                tic = time.time()
                for i, batch in enumerate(tqdm(list(zip(x, y, docsize, sent_size)))):
                    x_batch, y_batch, docsize_batch, sent_size_batch = batch
                    x_batch = np.expand_dims(x_batch, axis=0)
                    y_batch = np.expand_dims(y_batch, axis=0)
                    docsize_batch = np.expand_dims(docsize_batch, axis=0)
                    sent_size_batch = np.expand_dims(sent_size_batch, axis=0)

                    loss, accuracy = test_step(sample_num=i,
                                               x_batch=x_batch,
                                               y_batch=y_batch,
                                               docsize=docsize_batch,
                                               sent_size=sent_size_batch,
                                               is_training=False)

                    losses.append(loss)
                    accuracies.append(accuracy)
                time_elapsed = time.time() - tic

                losses_accuracy_vars = {'losses': losses, 'accuracy': accuracies}

                print('Time taken to complete {} evaluation of {} checkpoint: {}'.format(
                    CONFIG['run_type'], all_evaluated_chkpts[-1], time_elapsed))

                for k in losses_accuracy_vars.keys():
                    print('stats for {}: {}'.format(k, stats.describe(losses_accuracy_vars[k])))
                    print(Counter(losses_accuracy_vars[k]))

                filename, ext = os.path.splitext(all_evaluated_chkpts[-1])
                pickle.dump(losses_accuracy_vars,
                            open(os.path.join(paths.LIB_DIR,
                                              CONFIG['dataset'],
                                              'losses_accuracies_vars_{}.p'.format(filename.split('/')[-1])),
                                 'wb'))
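# get_most_recently_create_file() comes from the repo's utilities; a minimal
# stand-in consistent with how test() uses it (return the newest path from a
# list of checkpoint .meta files) could be:
import os

def get_most_recently_create_file(files):
    return max(files, key=os.path.getctime)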