# NOTE: the functions below assume these imports at module level, along with
# repo-local helpers such as Timer, file2str, tokenization, str2ids, file2list,
# obj2pickle, etc.
import collections
import glob
import os

import numpy as np
import tensorflow as tf


def vocab_tally_from_files(data_dir):
    """ Given the root directory containing the data, returns a
        collections.Counter mapping each token to its count across the
        entire dataset.
    """
    ext = "txt"
    datasets = ["train", "test"]
    tally = collections.Counter()
    for dataset in datasets:
        timer = Timer()
        for subdir in ["neg", "pos", "unsup"]:
            timer.start()
            print("Processing {} {} data".format(dataset, subdir), end="")
            dir = os.path.join(data_dir, dataset, subdir)
            files = glob.glob(os.path.join(dir, "*.{}".format(ext)))

            # ITERATE THROUGH EACH FILE IN THE SUBDIR - and update the tally
            for file in files:
                text = file2str(file)
                text = tokenization(text)
                tally.update(text)
            print("-- DONE in {}".format(timer.elapsed_string()))
    return tally
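
# Usage sketch (not part of the original module): builds the token tally from a
# hypothetical IMDB-style directory tree with train/test splits and
# neg/pos/unsup subdirectories.
#
#     tally = vocab_tally_from_files("data/aclImdb")
#     print("vocab size:", len(tally))
#     print("most common tokens:", tally.most_common(10))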

def eval_model(model, data, char2id, seq_length=200, batch_size=32):
    """ Evaluates the model on the desired evaluation data.

        Returns a tuple: (avg_loss, total_time)
    """
    data_len = len(data)
    n_steps = int(data_len / seq_length / batch_size)
    timer = Timer()
    timer.start()
    total_loss = 0
    for step in range(n_steps):
        # Create an evaluation batch of sequences
        i_start = step * seq_length * batch_size
        X, Y = create_eval_batch(data, char2id, start_i=i_start,
                                 seq_length=seq_length, batch_size=batch_size)

        # Perform an evaluation step on the batch of sequences
        loss = eval_sequence(model, X, Y)
        total_loss += loss

    # Return the average loss and total time
    avg_loss = total_loss / n_steps
    return avg_loss, timer.elapsed()
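
# Usage sketch (hypothetical names): `valid_text` would be the evaluation text
# and `char2id` the character-to-id mapping used elsewhere in this repo.
#
#     valid_loss, valid_time = eval_model(model, valid_text, char2id,
#                                         seq_length=200, batch_size=32)
#     print("validation loss: {:.3f} ({:.1f}s)".format(valid_loss, valid_time))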

def train_n_steps(model, data, hyper, evals, n_steps, batch_size=128,
                  print_every=100, eval_every=1000):
    """ Trains the model for `n_steps` steps, printing feedback every
        `print_every` steps, and running an evaluation and snapshot every
        `eval_every` steps.

        Relies on the module-level globals MODEL_NAME, SNAPSHOTS_DIR,
        EVALS_FILE and HYPERPARAMS_FILE (see the TODOs below).
    """
    # TODO: Start the epoch timer at the last epoch time from `evals` (but take
    #       eval time into account)
    # TODO: Do not use globals; pass a paths dictionary or object instead.
    epoch_loss = 0
    snapshot_count = len(evals["loss"])
    start_step = evals["step"][-1] if snapshot_count > 0 else 0
    start_step += 1
    epoch_timer = Timer()
    step_timer = Timer()
    eval_timer = Timer()
    epoch_timer.start()  # start now, so elapsed() is valid in the first printout
    for step in range(1, n_steps + 1):
        step_timer.start()
        X, Y = create_random_batch(data["xtrain"], data["ytrain"],
                                   batchsize=batch_size,
                                   maxlen=hyper["SAMPLE_LENGTH"])
        loss = train_step(model, X, Y)
        epoch_loss += loss

        # PRINTOUTS
        if step % print_every == 0:
            progress = 100 * float(step) / n_steps
            print_train_feedback(start_step + step,
                                 loss=loss,
                                 progress=progress,
                                 elapsed_time=epoch_timer.elapsed(),
                                 avg_time_ms=step_timer.elapsed() / batch_size)

        # EVALUATIONS AND SNAPSHOTS
        if step % eval_every == 0:
            epoch_time = epoch_timer.elapsed()
            print("=" * 60)
            snapshot_count += 1
            epoch_loss /= eval_every

            # EVALUATE - on train and validation data
            eval_timer.start()
            train_acc = evaluate_model(model, data["xtrain"], data["ytrain"],
                                       seq_maxlen=100)
            eval_time = eval_timer.elapsed()
            valid_acc = evaluate_model(model, data["xvalid"], data["yvalid"],
                                       seq_maxlen=100)
            print_epoch_feedback(train_acc, valid_acc, epoch_loss)

            # UPDATE EVALS
            update_evals(evals,
                         loss=epoch_loss,
                         train_acc=train_acc,
                         valid_acc=valid_acc,
                         train_time=epoch_time,
                         eval_time=eval_time,
                         alpha=model.alpha,
                         step=start_step + step)

            # SAVE SNAPSHOTS - of model parameters, and the evals dict
            epoch_snapshot(model, snapshot_count, accuracy=valid_acc,
                           name=MODEL_NAME, dir=SNAPSHOTS_DIR)
            obj2pickle(evals, EVALS_FILE)
            save_hyper_params(hyper, HYPERPARAMS_FILE)

            # RESET
            epoch_loss = 0
            epoch_timer.start()
    print("=" * 60)
    print("DONE")
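
# Usage sketch (hypothetical): `data` would be the dict returned by load_data()
# below, and `evals` a dict of lists; only the "loss" and "step" keys are read
# directly here, any remaining keys depend on what update_evals() expects.
#
#     evals = {"loss": [], "step": []}
#     train_n_steps(model, data, hyper, evals, n_steps=10000,
#                   batch_size=128, print_every=100, eval_every=1000)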

def load_data(data_dir, vocab_file, classes=["neg", "pos"], valid_ratio=0.3,
              seed=0):
    """ Given the root directory containing the IMDB data, returns a dictionary
        with separate keys for the train, test, and validation datasets:

            {"xtrain": [...], "ytrain": [...],
             "xtest":  [...], "ytest":  [...],
             "xvalid": [...], "yvalid": [...]}

        Each "x" value is a list of lists, where each inner list contains the
        sequence of token ids for one review. Each "y" value is the list of
        corresponding class ids.

    Args:
        data_dir:    (str) Path to the root directory containing the data
        vocab_file:  (str) File containing the vocab, one token per line
        classes:     (list of str) Subdirectory names to use as class labels,
                     in order of class id
        valid_ratio: (float) Fraction of the train data to hold out for
                     validation
        seed:        (int) Random seed used to shuffle the data

    Returns:
        (dict)
    """
    id2word = file2list(vocab_file)
    word2id = {word: id for id, word in enumerate(id2word)}
    ext = "txt"  # file extension to look for
    data = {
        "xtrain": [], "ytrain": [],
        "xtest": [], "ytest": [],
        "xvalid": [], "yvalid": []
    }

    # ITERATE THROUGH EACH OF THE DATASETS
    datasets = ["train", "test"]
    for dataset in datasets:
        timer = Timer()
        # ITERATE THROUGH EACH CLASS LABEL
        for class_id, class_name in enumerate(classes):
            print("Processing {} {} ({}) data".format(dataset, class_name,
                                                      class_id), end="")
            timer.start()

            # MAKE LIST OF FILES - for current subdirectory
            dir = os.path.join(data_dir, dataset, class_name)
            files = glob.glob(os.path.join(dir, "*.{}".format(ext)))

            # ITERATE THROUGH EACH FILE
            for file in files:
                # Create input features and labels
                text = file2str(file)
                text = str2ids(text, word2id=word2id, unknown_id=1)
                data["x" + dataset].append(text)
                data["y" + dataset].append(class_id)
            print("-- DONE in {}".format(timer.elapsed_string()))

        # RANDOMIZE THE ORDER OF THE DATA
        # TODO: Consider using a different method that shuffles in place
        n = len(data["y" + dataset])
        np.random.seed(seed=seed)
        ids = np.random.permutation(n)
        data["x" + dataset] = [data["x" + dataset][id] for id in ids]
        data["y" + dataset] = [data["y" + dataset][id] for id in ids]

    # VALIDATION DATA - Split a portion of the train data for validation
    n_valid = int(len(data["ytrain"]) * valid_ratio)
    data["xvalid"] = data["xtrain"][:n_valid]
    data["yvalid"] = data["ytrain"][:n_valid]
    data["xtrain"] = data["xtrain"][n_valid:]
    data["ytrain"] = data["ytrain"][n_valid:]

    return data
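
# Usage sketch (hypothetical paths): "data/aclImdb" is an IMDB-style root and
# "vocab.txt" is assumed to contain one token per line.
#
#     data = load_data("data/aclImdb", "vocab.txt", classes=["neg", "pos"],
#                      valid_ratio=0.3, seed=42)
#     print(len(data["xtrain"]), len(data["xvalid"]), len(data["xtest"]))
#     print(data["xtrain"][0][:10], data["ytrain"][0])  # first 10 token ids, label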

def train_n_epochs(model, hyper, data_train, data_valid, evals, n_epochs,
                   feedbacks_per_epoch=10, alpha_decay=1.0):
    """ Trains the model for the desired number of epochs. Automatically takes
        snapshots of the parameters after each epoch, and monitors the progress.

    Args:
        model:               The model to train.
        hyper:               (dict) Hyperparameters dictionary.
        data_train:          (str) Training data.
        data_valid:          (str) Validation data.
        evals:               (dict of lists) Stores the losses and times for
                             each epoch.
        n_epochs:            (int) Number of epochs to train for.
        feedbacks_per_epoch: (int) Max number of progress printouts per epoch.
        alpha_decay:         (float)(default=1.0) How much to decay alpha by
                             after each epoch.

    Returns:
        (dict) evals - the dictionary that monitors the losses and times.
    """
    timer = Timer()
    timer.start()

    # CALCULATE NUMBER OF STEPS NEEDED
    # Technically the following calculation of `samples_per_epoch` is incorrect,
    # since we are randomly sampling windows rather than dividing the data into
    # an even number of chunks. But it is still a useful approximation, and it
    # allows for more variation in the training data.
    samples_per_epoch = int(len(data_train) // hyper["SAMPLE_LENGTH"])
    steps_per_epoch = int(samples_per_epoch / hyper["BATCH_SIZE"])
    feedback_every = int(steps_per_epoch / feedbacks_per_epoch)

    try:
        for i in range(n_epochs):
            print()
            print("=" * 60)
            print("EPOCH {}/{} ({:0.2f}%) alpha={}".format(
                i + 1, n_epochs, 100.0 * i / n_epochs, model.alpha))
            print("=" * 60)

            # TRAIN OVER A SINGLE EPOCH
            train_loss, epoch_time = train_n_steps(
                model, hyper, data_train,
                n_steps=steps_per_epoch,
                batch_size=hyper["BATCH_SIZE"],
                feedback_every=feedback_every)
            evals["train_loss"].append(train_loss)
            evals["train_time"].append(epoch_time)
            evals["alpha"].append(model.alpha)

            # EVALUATE ON VALIDATION DATA
            eval_loss, eval_time = eval_model(
                model, data_valid, char2id,
                seq_length=hyper["SAMPLE_LENGTH"],
                batch_size=hyper["BATCH_SIZE"])
            evals["valid_loss"].append(eval_loss)
            evals["valid_time"].append(eval_time)

            # PREPARE MODEL FOR NEXT EPOCH
            model.update_learning_rate(model.alpha * alpha_decay)
            hyper["LAST_ALPHA"] = model.alpha

            # TAKE SNAPSHOTS - of parameters and the evaluation dictionary
            global_epoch = len(evals["train_loss"])
            epoch_snapshot(model, epoch=global_epoch, loss=eval_loss,
                           name=MODEL_NAME, dir=SNAPSHOTS_DIR)
            obj2pickle(evals, EVALS_FILE)
            save_hyper_params(hyper, HYPERPARAMS_FILE)

            # FEEDBACK PRINTOUTS
            # TODO: Save several generated sample strings to files at each epoch
            # Print a sample of generated text
            print_sample_generation(model, char2id, exploration=0.85)
            epoch_template = "({}) TRAIN_LOSS={: 7.3f} VALID_LOSS={: 7.3f}"
            print(epoch_template.format(timer.elapsed_string(), train_loss,
                                        eval_loss))

            # UPDATE LEARNING CURVE PLOT
            plot_learning_curves(evals, file=LEARNING_CURVES_FILE,
                                 model_name=MODEL_NAME)

        print("- DONE")
        return evals

    # HANDLE EARLY TERMINATION
    except KeyboardInterrupt:
        print("\nA keyboard interrupt was triggered at", timer.elapsed_string())

        # Save parameters as a recovery file
        print("Storing recovery parameters")
        file = os.path.join(SNAPSHOTS_DIR, MODEL_NAME + ".recovery_params")
        take_snapshot(model, file)

        # Save evals as a recovery file
        print("Storing recovery evals")
        file = os.path.join(MODELS_DIR, MODEL_NAME + ".recovery_evals")
        obj2pickle(evals, file)

        # Save hyper parameters
        print("Saving hyper params")
        hyper["LAST_ALPHA"] = model.alpha
        save_hyper_params(hyper, HYPERPARAMS_FILE)

        print("OK DONE")
        return evals
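
# Usage sketch (hypothetical values): assumes `model`, `train_text`,
# `valid_text`, and the module-level `char2id`, MODEL_NAME, SNAPSHOTS_DIR, etc.
# are already set up. `evals` holds one list per metric appended to above.
#
#     hyper = {"SAMPLE_LENGTH": 200, "BATCH_SIZE": 32}
#     evals = {"train_loss": [], "train_time": [], "valid_loss": [],
#              "valid_time": [], "alpha": []}
#     evals = train_n_epochs(model, hyper, train_text, valid_text, evals,
#                            n_epochs=10, alpha_decay=0.95)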

def train_n_steps(model, hyper, train_data, n_steps=1000, batch_size=32,
                  feedback_every=1000):
    """ Trains the model for `n_steps` training steps.

        Returns a tuple: (avg_loss, total_train_time)
    """
    total_timer = Timer()
    feedback_timer = Timer()
    total_timer.start()
    feedback_timer.start()
    total_loss = 0
    feedback_loss = 0
    for step in range(1, n_steps + 1):
        # Perform a training step
        X, Y = random_training_batch(train_data, char2id,
                                     hyper["SAMPLE_LENGTH"], batch_size)
        loss = train(model, X, Y)

        # Increment losses
        total_loss += loss
        feedback_loss += loss

        # Print feedback
        if step % feedback_every == 0:
            # Average loss over the feedback steps, and avg train time per sample
            avg_feedback_loss = feedback_loss / feedback_every
            avg_train_time = feedback_timer.elapsed() / feedback_every / batch_size
            print_train_feedback(step,
                                 total_steps=n_steps,
                                 loss=avg_feedback_loss,
                                 elapsed_time=total_timer.elapsed(),
                                 avg_time=avg_train_time)

            # Reset timer and loss for the feedback cycle
            feedback_timer.start()
            feedback_loss = 0

    # Return the average loss and total time
    avg_loss = total_loss / n_steps
    return avg_loss, total_timer.elapsed()
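
# Usage sketch (hypothetical): `train_text` would be the training text and
# `char2id` the module-level character-to-id mapping assumed by this function.
#
#     avg_loss, train_time = train_n_steps(model, hyper, train_text,
#                                          n_steps=5000, batch_size=32,
#                                          feedback_every=500)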

def epoch_evaluations(s, evals, loss, alpha, tr_time, data, paths):
    """ Performs evaluation metrics on the validation data and on the subset of
        the training data used for evaluation. It places the results in the
        `evals` object, and creates visualisations of predictions vs reality
        for bounding boxes and digit recognition. It saves a snapshot of the
        evals file, and also creates a plot of the training curves.

    Args:
        s:       (tensorflow session) The current open session.
        evals:   (evals object)
        loss:    (float) The last loss value in training.
        alpha:   (float) The current learning rate.
        tr_time: (float) How long it took (in milliseconds) to train a single
                 sample (on average) for the last batch of data.
        data:    (DataObjects object) Should contain the following attributes:
                 - valid
                 - train_eval
        paths:   (Paths object) Should contain the following attributes with
                 filepaths as values:
                 - learning_curves
                 - evals
                 - evals_max
                 - epoch_vis
    """
    # EVALS ON TRAIN DATA
    digit_preds_tr, bbox_preds_tr = in_session_predictions(s=s,
                                                           data=data.train_eval)
    pda_tr = 100 * per_element_accuracy(digit_preds_tr, data.train_eval.Y)
    wna_tr = 100 * full_row_accuracy(digit_preds_tr, data.train_eval.Y)
    ious_tr = avg_multi_column_iou(bbox_preds_tr, data.train_eval.BBOX, axis=0)
    w_iou_tr = ious_tr[0]
    digit_iou_tr = ious_tr[1:].mean()

    # EVALS ON VALID DATA
    timer = Timer()
    timer.start()
    digit_preds_va, bbox_preds_va = in_session_predictions(s=s, data=data.valid)
    avg_pred_time = 1000 * timer.stop() / float(data.valid.n_samples)
    pda_va = 100 * per_element_accuracy(digit_preds_va, data.valid.Y)
    wna_va = 100 * full_row_accuracy(digit_preds_va, data.valid.Y)
    ious_va = avg_multi_column_iou(bbox_preds_va, data.valid.BBOX, axis=0)
    w_iou_va = ious_va[0]
    digit_iou_va = ious_va[1:].mean()

    # ADD NEW SET OF EVALUATION ENTRIES TO EVALS OBJECT
    evals.append(loss=loss,
                 alpha=alpha,
                 pda_train=pda_tr,
                 pda_valid=pda_va,
                 wna_train=wna_tr,
                 wna_valid=wna_va,
                 iou_train=digit_iou_tr,
                 iou_valid=digit_iou_va,
                 time_train=tr_time,
                 time_pred=avg_pred_time,
                 )

    # SAVE SNAPSHOT OF EVALS
    evals.save_dict_pickle(f=paths.evals)
    if evals.newest_is_max():
        evals.save_dict_pickle(f=paths.evals_max)
        printout_end = " *\n"  # Indicate the current max in the evals printout
    else:
        printout_end = "\n"

    # PRINTOUTS AND VISUALISATIONS
    evals.print_line(end=printout_end)
    epoch_visualisations(path=paths.epoch_vis,
                         epoch=evals.epochs,
                         data=data.valid,
                         bboxes_pred=bbox_preds_va,
                         digits_pred=digit_preds_va)
    plot_training_curves(evals, crop=(None, None), saveto=paths.learning_curves)

def run_session(graph, data, paths, alpha=0.001, epochs=1, verbose=True):
    """ Runs a training session.

    Args:
        graph:   (Tensorflow graph) The graph that contains the model.
        data:    (DataObjects) DataObjects object with the following attributes:
                 - train (DataObj) containing the training data
                 - valid (DataObj) containing the validation data
                 - train_eval (DataObj) containing the portion of train data to
                   be used for evaluation.
        paths:   (Paths object) The paths, containing the following attributes:
                 - checkpoint
                 - checkpoint_max
                 - evals
                 - evals_max
                 - learning_curves
                 - epoch_vis
        alpha:   (float) Learning rate.
        epochs:  (int) Number of epochs to run.
        verbose: (bool) Whether to print feedback while restoring variables.
    """
    assert_these_attributes(paths, "Paths Object", [
        "checkpoint", "checkpoint_max", "evals", "evals_max",
        "learning_curves", "epoch_vis"
    ])
    assert_these_attributes(data, "DataObjects object",
                            ["train", "valid", "train_eval"])

    # PREPARE SESSION
    timer = Timer()
    print_headers("SESSION", border="=", width=PRINT_WIDTH)
    print("Training on {} samples in batches of {}".format(
        data.train.n_samples, data.train.batchsize))
    print("Alpha: ", alpha)
    evals = Evals(pickle=paths.evals, verbose=True)

    with tf.Session(graph=graph) as sess:
        # GET IMPORTANT OPERATIONS AND TENSORS FROM GRAPH
        g = GraphOps(graph, "X", "Y", "BBOX", "is_training", "alpha", "train",
                     "loss", "digit_logits", "bbox_logits")
        g.update_moving_avgs = tf.group(
            *tf.get_collection("update_moving_averages"))

        # INITIALIZE VARIABLES
        saver = tf.train.Saver(name="saver")
        tf_initialize_vars_from_file(f=paths.checkpoint, s=sess, saver=saver,
                                     verbose=verbose)

        # TRAIN FOR SEVERAL EPOCHS
        evals.print_header()
        for epoch in range(epochs):
            # PREPARE FOR A NEW EPOCH
            timer.start()
            data.train.shuffle()

            # TRAIN IN BATCHES
            for batch_n in range(data.train.n_batches):
                batch = data.train.create_batch(batch_n=batch_n, augment=True)
                loss = train_step(s=sess, g=g, data=batch, alpha=alpha)

                # INTERMITTENT FEEDBACK ON PROGRESS
                n_feedback_steps = 4
                # max() guards against a zero divisor when there are fewer
                # than n_feedback_steps batches
                feedback_steps = max(1, data.train.n_batches // n_feedback_steps)
                if batch_n % feedback_steps == 0:
                    evals.print_loss(loss=loss)

            # EVALUATIONS AT END OF EACH EPOCH
            avg_train_time = 1000 * timer.stop() / float(data.train.n_samples)
            epoch_evaluations(s=sess,
                              evals=evals,
                              loss=loss,
                              alpha=alpha,
                              tr_time=avg_train_time,
                              data=data,
                              paths=paths)

            # SAVE CHECKPOINTS
            saver.save(sess, paths.checkpoint)
            if evals.newest_is_max():
                saver.save(sess, paths.checkpoint_max)
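
# Usage sketch (hypothetical): assumes a graph-building function and the
# DataObjects/Paths helpers from this repo; the constructor arguments shown
# here are illustrative, not the actual signatures.
#
#     graph = build_graph()   # tf.Graph containing the named ops listed above
#     data = DataObjects(train=train_data, valid=valid_data,
#                        train_eval=train_eval_data)
#     paths = Paths(checkpoint="snapshots/model.ckpt",
#                   checkpoint_max="snapshots/model_max.ckpt",
#                   evals="evals/evals.pickle",
#                   evals_max="evals/evals_max.pickle",
#                   learning_curves="plots/learning_curves.png",
#                   epoch_vis="plots/epoch_vis")
#     run_session(graph, data, paths, alpha=0.001, epochs=10)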