def create_csv(paths, args):
    """
    desc: This function creates a csv file from a downloaded dataset.
          Currently this process works on the imdb dataset, but other
          datasets can easily be added.
    args:
        args: dictionary of cli arguments
        paths: project paths
    """
    if args.dataset == "imdb":
        print("creating {} csv".format(args.dataset))
        imdb = IMDB(action="create")
        imdb.createManager(args.binary)
        print("{} csv created".format(args.dataset))
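
# Example invocation of create_csv from the command line. This argparse
# wiring is an illustrative assumption: only the attributes read above
# (args.dataset, args.binary) and the project's prjPaths helper used
# elsewhere in this repo are taken from the surrounding code; the flag
# defaults and help text are hypothetical.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", type=str, default="imdb",
                        help="name of the dataset to process")
    parser.add_argument("--binary", action="store_true",
                        help="create the binary (two-class) variant of the csv")
    cli_args = parser.parse_args()

    create_csv(paths=prjPaths(), args=cli_args)
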
def serialize_data(paths, args):
    """
    desc: fetch a dataset, partition it into train, validation, and test
          sets, and write each partition to binary (.npy) files.
    args:
        args: dictionary of cli arguments
        paths: project paths
    """
    if args.dataset == "imdb":
        # fetch imdb dataset
        imdb = IMDB(action="fetch")

        tic = time.time()  # start time of data fetch
        x_train, y_train, x_test, y_test = imdb.partitionManager(args.dataset)
        toc = time.time()  # end time of data fetch
        print("time taken to fetch {} dataset: {}(sec)".format(
            args.dataset, toc - tic))

        # kill if shapes don't make sense
        assert len(x_train) == len(y_train), \
            "x_train length does not match y_train length"
        assert len(x_test) == len(y_test), \
            "x_test length does not match y_test length"

        # combine datasets
        x_all = x_train + x_test
        y_all = np.concatenate((y_train, y_test), axis=0)

        # create slice limits
        train_slice_lim = int(round(len(x_all) * args.train_data_percentage))
        validation_slice_lim = int(
            round(train_slice_lim +
                  len(x_all) * args.validation_data_percentage))

        # partition dataset into train, validation, and test sets
        # (slices start at the previous limit, not limit + 1, so no
        # samples are dropped at the partition boundaries)
        x_all, docsize, sent_size = imdb.hanformater(inputs=x_all)

        x_train = x_all[:train_slice_lim]
        y_train = y_all[:train_slice_lim]
        docsize_train = docsize[:train_slice_lim]
        sent_size_train = sent_size[:train_slice_lim]

        x_val = x_all[train_slice_lim:validation_slice_lim]
        y_val = y_all[train_slice_lim:validation_slice_lim]
        docsize_val = docsize[train_slice_lim:validation_slice_lim]
        sent_size_val = sent_size[train_slice_lim:validation_slice_lim]

        x_test = x_all[validation_slice_lim:]
        y_test = y_all[validation_slice_lim:]
        docsize_test = docsize[validation_slice_lim:]
        sent_size_test = sent_size[validation_slice_lim:]

        # build output filenames for each partition
        train_bin_filename_x = os.path.join(paths.ROOT_DATA_DIR, args.dataset,
                                            "train_x.npy")
        train_bin_filename_y = os.path.join(paths.ROOT_DATA_DIR, args.dataset,
                                            "train_y.npy")
        train_bin_filename_docsize = os.path.join(paths.ROOT_DATA_DIR,
                                                  args.dataset,
                                                  "train_docsize.npy")
        train_bin_filename_sent_size = os.path.join(paths.ROOT_DATA_DIR,
                                                    args.dataset,
                                                    "train_sent_size.npy")

        val_bin_filename_x = os.path.join(paths.ROOT_DATA_DIR, args.dataset,
                                          "val_x.npy")
        val_bin_filename_y = os.path.join(paths.ROOT_DATA_DIR, args.dataset,
                                          "val_y.npy")
        val_bin_filename_docsize = os.path.join(paths.ROOT_DATA_DIR,
                                                args.dataset,
                                                "val_docsize.npy")
        val_bin_filename_sent_size = os.path.join(paths.ROOT_DATA_DIR,
                                                  args.dataset,
                                                  "val_sent_size.npy")

        test_bin_filename_x = os.path.join(paths.ROOT_DATA_DIR, args.dataset,
                                           "test_x.npy")
        test_bin_filename_y = os.path.join(paths.ROOT_DATA_DIR, args.dataset,
                                           "test_y.npy")
        test_bin_filename_docsize = os.path.join(paths.ROOT_DATA_DIR,
                                                 args.dataset,
                                                 "test_docsize.npy")
        test_bin_filename_sent_size = os.path.join(paths.ROOT_DATA_DIR,
                                                   args.dataset,
                                                   "test_sent_size.npy")

        # write each partition to disk
        _write_binaryfile(nparray=x_train, filename=train_bin_filename_x)
        _write_binaryfile(nparray=y_train, filename=train_bin_filename_y)
        _write_binaryfile(nparray=docsize_train,
                          filename=train_bin_filename_docsize)
        _write_binaryfile(nparray=sent_size_train,
                          filename=train_bin_filename_sent_size)

        _write_binaryfile(nparray=x_val, filename=val_bin_filename_x)
        _write_binaryfile(nparray=y_val, filename=val_bin_filename_y)
        _write_binaryfile(nparray=docsize_val,
                          filename=val_bin_filename_docsize)
        _write_binaryfile(nparray=sent_size_val,
                          filename=val_bin_filename_sent_size)

        _write_binaryfile(nparray=x_test, filename=test_bin_filename_x)
        _write_binaryfile(nparray=y_test, filename=test_bin_filename_y)
        _write_binaryfile(nparray=docsize_test,
                          filename=test_bin_filename_docsize)
        _write_binaryfile(nparray=sent_size_test,
                          filename=test_bin_filename_sent_size)
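
# serialize_data depends on a private helper, _write_binaryfile, that is
# not shown in this snippet. Below is a minimal sketch of a compatible
# helper, assuming it simply persists the array in numpy's native .npy
# format (the filenames built above already carry the .npy extension);
# the real project helper may differ.
def _write_binaryfile(nparray, filename):
    """
    desc: write dataset partition to binary file
    args:
        nparray: dataset partition as numpy array to write to binary file
        filename: name of file to write dataset partition to
    """
    np.save(filename, np.asarray(nparray))
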
                han.input_x: x_batch,
                han.input_y: y_batch,
                han.sentence_lengths: docsize,
                han.word_lengths: sent_size,
                han.is_training: is_training
            }
            loss, accuracy = sess.run([han.loss, han.accuracy],
                                      feed_dict=feed_dict)
            return loss, accuracy
        # end

        # generate batches on the imdb dataset, else quit
        if FLAGS.dataset == "imdb":
            dataset_controller = IMDB(action="fetch")
        else:
            exit("set dataset flag to appropriate dataset")

        x, y, docsize, sent_size = dataset_controller.get_data(
            type_=FLAGS.run_type)  # fetch dataset

        all_evaluated_chkpts = []  # list of all checkpoint files previously evaluated

        # testing loop
        while True:
            if FLAGS.wait_for_checkpoint_files:
                # wait to allow for creation of a new checkpoint file
                time.sleep(2 * MINUTE)
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s:%(name)s:%(message)s")
currentTime = str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
logFileName = os.path.join(paths.LOGS_DIR,
                           "HAN_TxtClassification_{}.log".format(currentTime))
fileHandler = logging.FileHandler(logFileName)
fileHandler.setLevel(logging.ERROR)
fileHandler.setFormatter(formatter)
logger.addHandler(fileHandler)

print("Loading data...\n")
if not IMDB.csvExist():
    imdb = IMDB(action="create")
    imdb.createManager()
    x_train, y_train, x_test, y_test = imdb.partitionManager(type="han")
else:
    imdb = IMDB()
    x_train, y_train, x_test, y_test = imdb.partitionManager(type="han")

if FLAGS.run_type == "train":
    print("Training...\n")

    # create a new graph and set it as the default
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        session_conf.gpu_options.allocator_type = "BFC"

        # create a new session and set it as the default
def train():
    paths = prjPaths()

    # load persisted hyperparameters and overwrite them with this run's config
    with open(
            os.path.join(paths.LIB_DIR, CONFIG['dataset'],
                         'persisted_vars.p'), 'rb') as handle:
        persisted_vars = pickle.load(handle)

    persisted_vars['embedding_dim'] = CONFIG['embedding_dim']
    persisted_vars['max_grad_norm'] = CONFIG['max_grad_norm']
    persisted_vars['dropout_keep_proba'] = CONFIG['dropout_keep_proba']
    persisted_vars['learning_rate'] = CONFIG['learning_rate']
    pickle.dump(
        persisted_vars,
        open(
            os.path.join(paths.LIB_DIR, CONFIG['dataset'],
                         'persisted_vars.p'), 'wb'))

    with tf.Graph().as_default():
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=CONFIG[
                'per_process_gpu_memory_fraction'])
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False,
                                      gpu_options=gpu_options)
        session_conf.gpu_options.allocator_type = 'BFC'

        with tf.Session(config=session_conf) as sess:
            han = HAN(max_seq_len=persisted_vars['max_seq_len'],
                      max_sent_len=persisted_vars['max_sent_len'],
                      num_classes=persisted_vars['num_classes'],
                      vocab_size=persisted_vars['vocab_size'],
                      embedding_size=persisted_vars['embedding_dim'],
                      max_grad_norm=persisted_vars['max_grad_norm'],
                      dropout_keep_proba=persisted_vars['dropout_keep_proba'],
                      learning_rate=persisted_vars['learning_rate'])

            global_step = tf.Variable(0, name='global_step', trainable=False)

            # gradient clipping needs the trainable variables
            tvars = tf.trainable_variables()
            grads, global_norm = tf.clip_by_global_norm(
                tf.gradients(han.loss, tvars), han.max_grad_norm)
            optimizer = tf.train.AdamOptimizer(
                han.learning_rate)  # todo: try other optimizer settings
            train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                 name='train_op',
                                                 global_step=global_step)

            merge_summary_op = tf.summary.merge_all()
            train_summary_writer = tf.summary.FileWriter(
                os.path.join(paths.SUMMARY_DIR, CONFIG['run_type']),
                sess.graph)

            # todo: switch the object being saved here to sess
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=CONFIG['num_checkpoint'])

            sess.run(tf.global_variables_initializer())

            # _________train__________
            def train_step(epoch, x_batch, y_batch, docsize, sent_size,
                           is_training):
                tic = time.time()
                feed_dict = {
                    han.input_x: x_batch,
                    han.input_y: y_batch,
                    han.sentence_lengths: docsize,
                    han.word_lengths: sent_size,
                    han.is_training: is_training
                }
                _, step, loss, accuracy, summaries = sess.run(
                    [
                        train_op, global_step, han.loss, han.accuracy,
                        merge_summary_op
                    ],
                    feed_dict=feed_dict)
                time_elapsed = time.time() - tic

                if is_training:
                    print(
                        'Training || CurrentEpoch: {} || GlobalStep: {} || ({} sec/step) || Loss {:g} || Accuracy {:g}'
                        .format(epoch + 1, step, time_elapsed, loss, accuracy))

                if step % CONFIG['log_summaries_every'] == 0:
                    train_summary_writer.add_summary(summaries, step)
                    print(
                        f'Saved model summaries to {os.path.join(paths.SUMMARY_DIR, CONFIG["run_type"])} \n'
                    )

                if step % CONFIG['checkpoint_every'] == 0:
                    chkpt_path = saver.save(sess,
                                            os.path.join(
                                                paths.CHECKPOINT_DIR, 'han'),
                                            global_step=step)
                    print('Saved model checkpoint to {} \n'.format(chkpt_path))

            imdb = IMDB(action='fetch')
            x_train, y_train, docsize_train, sent_size_train = imdb.get_data(
                type=CONFIG['run_type'])

            for epoch, batch in imdb.get_batch(
                    data=list(
                        zip(x_train, y_train, docsize_train,
                            sent_size_train)),
                    batch_size=CONFIG['batch_size'],
                    num_epoch=CONFIG['num_epochs']):
                x_batch, y_batch, docsize, sent_size = zip(*batch)
                train_step(epoch=epoch,
                           x_batch=x_batch,
                           y_batch=y_batch,
                           docsize=docsize,
                           sent_size=sent_size,
                           is_training=True)
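
# imdb.get_batch is consumed above as a generator of (epoch, batch)
# pairs, but its implementation is not part of this snippet. The sketch
# below is an illustrative, standalone equivalent written under the
# assumption that batches are drawn sequentially from a per-epoch
# shuffle; the IMDB class's actual method may differ.
import random


def get_batch(data, batch_size, num_epoch, shuffle=True):
    """Yield (epoch, batch) pairs compatible with the training loop above."""
    data = list(data)
    for epoch in range(num_epoch):
        if shuffle:
            random.shuffle(data)
        for start in range(0, len(data), batch_size):
            yield epoch, data[start:start + batch_size]
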
def test():
    MINUTE = 60
    paths = prjPaths()

    print('loading persisted variables...')
    with open(
            os.path.join(paths.LIB_DIR, CONFIG['dataset'],
                         'persisted_vars.p'), 'rb') as handle:
        persisted_vars = pickle.load(handle)

    graph = tf.Graph()
    with graph.as_default():

        # Set GPU options
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=CONFIG[
                'per_process_gpu_memory_fraction'])
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False,
                                      gpu_options=gpu_options)
        session_conf.gpu_options.allocator_type = 'BFC'

        with tf.Session(config=session_conf) as sess:

            # Instantiate model
            han = HAN(max_seq_len=persisted_vars['max_seq_len'],
                      max_sent_len=persisted_vars['max_sent_len'],
                      num_classes=persisted_vars['num_classes'],
                      vocab_size=persisted_vars['vocab_size'],
                      embedding_size=persisted_vars['embedding_dim'],
                      max_grad_norm=persisted_vars['max_grad_norm'],
                      dropout_keep_proba=persisted_vars['dropout_keep_proba'],
                      learning_rate=persisted_vars['learning_rate'])

            global_step = tf.Variable(0, name='global_step', trainable=False)

            tvars = tf.trainable_variables()
            # todo: what does this method return?
            grads, global_norm = tf.clip_by_global_norm(
                tf.gradients(han.loss, tvars), han.max_grad_norm)
            optimizer = tf.train.AdamOptimizer(han.learning_rate)
            test_op = optimizer.apply_gradients(
                zip(grads, tvars),
                name=f'{CONFIG["run_type"]}_op',
                global_step=global_step)

            merge_summary_op = tf.summary.merge_all()
            test_summary_writer = tf.summary.FileWriter(
                os.path.join(paths.SUMMARY_DIR, CONFIG['run_type']),
                sess.graph)

            meta_file = get_most_recently_create_file([
                os.path.join(paths.CHECKPOINT_DIR, file)
                for file in os.listdir(paths.CHECKPOINT_DIR)
                if file.endswith('.meta')
            ])
            saver = tf.train.import_meta_graph(meta_file)

            sess.run(tf.global_variables_initializer())

            def test_step(sample_num, x_batch, y_batch, docsize, sent_size,
                          is_training):
                feed_dict = {
                    han.input_x: x_batch,
                    han.input_y: y_batch,
                    han.sentence_lengths: docsize,
                    han.word_lengths: sent_size,
                    han.is_training: is_training
                }
                loss, accuracy = sess.run([han.loss, han.accuracy],
                                          feed_dict=feed_dict)
                return loss, accuracy

            if CONFIG['dataset'] == 'imdb':
                dataset_controller = IMDB(action='fetch')
            else:
                exit('set dataset flag to appropriate dataset')

            x, y, docsize, sent_size = dataset_controller.get_data(
                type=CONFIG['run_type'])

            all_evaluated_chkpts = []  # checkpoint files already evaluated

            # testing loop
            while True:
                if CONFIG['wait_for_checkpoint_files']:
                    # wait to allow a new checkpoint file to be created
                    time.sleep(2 * MINUTE)
                else:
                    time.sleep(0 * MINUTE)

                # skip checkpoints that have already been evaluated
                if tf.train.latest_checkpoint(
                        paths.CHECKPOINT_DIR) in all_evaluated_chkpts:
                    continue

                saver.restore(sess,
                              tf.train.latest_checkpoint(paths.CHECKPOINT_DIR))
                all_evaluated_chkpts.append(
                    tf.train.latest_checkpoint(paths.CHECKPOINT_DIR))

                losses = []
                accuracies = []

                tic = time.time()
                for i, batch in enumerate(
                        tqdm(list(zip(x, y, docsize, sent_size)))):
                    x_batch, y_batch, docsize_batch, sent_size_batch = batch
                    x_batch = np.expand_dims(x_batch, axis=0)
                    y_batch = np.expand_dims(y_batch, axis=0)
                    docsize_batch = np.expand_dims(docsize_batch, axis=0)
                    sent_size_batch = np.expand_dims(sent_size_batch, axis=0)

                    loss, accuracy = test_step(sample_num=1,
                                               x_batch=x_batch,
                                               y_batch=y_batch,
                                               docsize=docsize_batch,
                                               sent_size=sent_size_batch,
                                               is_training=False)
                    losses.append(loss)
                    accuracies.append(accuracy)
                time_elapsed = time.time() - tic

                losses_accuracy_vars = {
                    'losses': losses,
                    'accuracy': accuracies
                }
                print(
                    'Time taken to complete {} evaluation of {} checkpoint: {}'
                    .format(CONFIG['run_type'], all_evaluated_chkpts[-1],
                            time_elapsed))
                for k in losses_accuracy_vars.keys():
                    print('stats for {}: {}'.format(
                        k, stats.describe(losses_accuracy_vars[k])))
                    print(Counter(losses_accuracy_vars[k]))

                filename, ext = os.path.splitext(all_evaluated_chkpts[-1])
                pickle.dump(
                    losses_accuracy_vars,
                    open(
                        os.path.join(
                            paths.LIB_DIR, CONFIG['dataset'],
                            'losses_accuracies_vars_{}.p'.format(
                                filename.split('/')[-1])), 'wb'))
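
# get_most_recently_create_file is referenced in test() but not defined
# in this snippet. A minimal sketch of a compatible helper is given
# below, assuming "most recently created" is decided by filesystem
# creation time; the real project helper may differ.
def get_most_recently_create_file(files):
    """Return the path in `files` with the newest creation time."""
    return max(files, key=os.path.getctime)
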