Exemple #1
0
def vocab_tally_from_files(data_dir):
    """ Count tokens across every text file of the dataset.

        Looks for `*.txt` files under `data_dir/<dataset>/<subdir>` for
        each dataset in ("train", "test") and each subdir in
        ("neg", "pos", "unsup"), tokenizes every file and accumulates the
        result into a single counter. Progress and timing are printed for
        each subdirectory.

    Args:
        data_dir: (str) Path to the root directory containing the data

    Returns:
        (collections.Counter) The accumulated tally over all files.
        Presumably maps token -> count, assuming `tokenization` returns
        an iterable of tokens.
    """
    file_pattern = "*.{}".format("txt")
    token_counts = collections.Counter()

    for dataset in ("train", "test"):
        # One timer per dataset, restarted for every subdirectory
        timer = Timer()
        for subdir in ("neg", "pos", "unsup"):
            timer.start()
            print("Processing {} {} data".format(dataset, subdir), end="")

            subdir_path = os.path.join(data_dir, dataset, subdir)
            matches = glob.glob(os.path.join(subdir_path, file_pattern))

            # Fold every file of this subdirectory into the running tally
            for path in matches:
                token_counts.update(tokenization(file2str(path)))

            print("-- DONE in {}".format(timer.elapsed_string()))

    return token_counts
Exemple #2
0
def eval_model(model, data, char2id, seq_length=200, batch_size=32):
    """ Runs the model over consecutive evaluation batches taken from `data`
        and averages the loss.

        The data is consumed in strides of `seq_length * batch_size`
        elements; any remainder that does not fill a whole batch is not
        evaluated.

    Args:
        model:      The model to evaluate (passed through to `eval_sequence`)
        data:       The evaluation data (only its length is used here, the
                    batching itself is done by `create_eval_batch`)
        char2id:    Passed through to `create_eval_batch`
        seq_length: (int) Length of each sequence in a batch
        batch_size: (int) Number of sequences per batch

    Returns: (tuple)
        avg_loss, total_time
    """
    # Number of whole batches that fit in the data
    n_steps = int(len(data) / seq_length / batch_size)

    timer = Timer()
    timer.start()

    total_loss = 0
    for step in range(n_steps):
        # Each step starts right after the region covered by the last one
        X, Y = create_eval_batch(data,
                                 char2id,
                                 start_i=step * seq_length * batch_size,
                                 seq_length=seq_length,
                                 batch_size=batch_size)
        total_loss += eval_sequence(model, X, Y)

    # Average loss over all evaluated batches, and total elapsed time
    return total_loss / n_steps, timer.elapsed()
Exemple #3
0
def train_n_steps(model,
                  data,
                  hyper,
                  evals,
                  n_steps,
                  batch_size=128,
                  print_every=100,
                  eval_every=1000):
    """ Trains the model for `n_steps` steps on randomly created batches.

        Every `print_every` steps a progress line is printed. Every
        `eval_every` steps the model is evaluated on the train and
        validation data, `evals` is updated, and snapshots of the model,
        the evals and the hyperparameters are saved.

        Relies on the module-level names MODEL_NAME, SNAPSHOTS_DIR,
        EVALS_FILE and HYPERPARAMS_FILE for the snapshot destinations.

    Args:
        model:       The model to train. Its `alpha` attribute is recorded
                     in `evals` at each evaluation.
        data:        (dict) Must contain the keys "xtrain", "ytrain",
                     "xvalid" and "yvalid".
        hyper:       (dict) Hyperparameters. "SAMPLE_LENGTH" is used as the
                     max length when creating batches.
        evals:       (dict of lists) Must contain the keys "loss" and
                     "step". Updated in place via `update_evals`.
        n_steps:     (int) Number of training steps to run
        batch_size:  (int) Number of samples per training batch
        print_every: (int) Print progress every this many steps
        eval_every:  (int) Evaluate and snapshot every this many steps

    Returns:
        None
    """
    # TODO: Start epoch timer at last epoch time from evals (but take into
    # account eval time)
    # TODO: Do not use globals, feed paths dictionary or object.
    epoch_loss = 0
    snapshot_count = len(evals["loss"])
    start_step = evals["step"][-1] if snapshot_count > 0 else 0
    # NOTE(review): `step` below already starts at 1, so together with this
    # increment the reported/recorded step is one higher than the number of
    # steps actually taken. Left unchanged because previously saved evals
    # use the same numbering - confirm before changing.
    start_step += 1

    epoch_timer = Timer()
    step_timer = Timer()
    eval_timer = Timer()

    # Start the epoch timer before the first step. Previously it was only
    # started after the first evaluation, so the elapsed time reported
    # before that point came from a timer that had never been started.
    epoch_timer.start()

    for step in range(1, n_steps + 1):
        step_timer.start()
        X, Y = create_random_batch(data["xtrain"],
                                   data["ytrain"],
                                   batchsize=batch_size,
                                   maxlen=hyper["SAMPLE_LENGTH"])
        loss = train_step(model, X, Y)
        epoch_loss += loss

        # PRINTOUTS
        if step % print_every == 0:
            progress = 100 * float(step) / n_steps
            # avg_time_ms is the duration of this last step divided by the
            # batch size, i.e. an approximate per-sample time.
            print_train_feedback(start_step + step,
                                 loss=loss,
                                 progress=progress,
                                 elapsed_time=epoch_timer.elapsed(),
                                 avg_time_ms=step_timer.elapsed() / batch_size)

        # EVALUATIONS AND SNAPSHOTS
        if (step % eval_every == 0):
            epoch_time = epoch_timer.elapsed()

            print("=" * 60)
            snapshot_count += 1
            # Average training loss since the last evaluation
            epoch_loss /= eval_every

            # EVALUATE - on train and validation data
            eval_timer.start()
            train_acc = evaluate_model(model,
                                       data["xtrain"],
                                       data["ytrain"],
                                       seq_maxlen=100)
            # NOTE(review): eval_time is read here, so it only covers the
            # evaluation on the train data, not the validation data below.
            eval_time = eval_timer.elapsed()
            valid_acc = evaluate_model(model,
                                       data["xvalid"],
                                       data["yvalid"],
                                       seq_maxlen=100)
            print_epoch_feedback(train_acc, valid_acc, epoch_loss)

            # UPDATE EVALS
            update_evals(evals,
                         loss=epoch_loss,
                         train_acc=train_acc,
                         valid_acc=valid_acc,
                         train_time=epoch_time,
                         eval_time=eval_time,
                         alpha=model.alpha,
                         step=start_step + step)

            # SAVE SNAPSHOTS - of model parameters, and evals dict
            epoch_snapshot(model,
                           snapshot_count,
                           accuracy=valid_acc,
                           name=MODEL_NAME,
                           dir=SNAPSHOTS_DIR)
            obj2pickle(evals, EVALS_FILE)
            save_hyper_params(hyper, HYPERPARAMS_FILE)

            # RESET - loss accumulator and timer for the next cycle
            epoch_loss = 0
            epoch_timer.start()
            print("=" * 60)
    print("DONE")
Exemple #4
0
def load_data(data_dir,
              vocab_file,
              classes=("neg", "pos"),
              valid_ratio=0.3,
              seed=0):
    """ Given the root directory containing the IMDB data. It returns a
        dictionary with separate keys for the train, validation and test
        datasets.
        {"xtrain": [...]
         "ytrain": [...]
         "xvalid": [...]
         "yvalid": [...]
         "xtest": [...]
         "ytest": [...]
         }

        Each "x" value is a list of the per-review outputs of `str2ids`
        (presumably a sequence of token ids per review), and each "y" value
        is a list of the matching class ids.

        The validation data is split off the front of the shuffled train
        data, exactly once, after all datasets have been loaded.

    Args:
        data_dir:    (str) Path to the root directory containing the data.
                     Files are looked up as
                     `data_dir/<train|test>/<class_name>/*.txt`
        vocab_file:  (str) File containing the vocab. The position of a
                     word in the file is used as its id.
        classes:     (sequence of str) Subdirectory name for each class.
                     The position of a name in this sequence is used as
                     the class id.
        valid_ratio: (float) Portion of the train data to move to the
                     validation set.
        seed:        (int) Seed for the shuffling of each dataset. Note that
                     this seeds the global numpy random state.

    Returns:
        (dict)
    """
    id2word = file2list(vocab_file)
    word2id = {word: idx for idx, word in enumerate(id2word)}

    ext = "txt"  # file extensions to look for
    data = {
        "xtrain": [],
        "ytrain": [],
        "xtest": [],
        "ytest": [],
        "xvalid": [],
        "yvalid": []
    }

    # ITERATE THROUGH EACH OF THE DATASETS
    datasets = ["train", "test"]
    for dataset in datasets:
        timer = Timer()
        xkey = "x" + dataset
        ykey = "y" + dataset

        # ITERATE THROUGH EACH CLASS LABEL
        for class_id, class_name in enumerate(classes):
            print("Processing {} {} ({}) data".format(dataset, class_name,
                                                      class_id),
                  end="")
            timer.start()

            # MAKE LIST OF FILES - for current subdirectory
            class_dir = os.path.join(data_dir, dataset, class_name)
            files = glob.glob(os.path.join(class_dir, "*.{}".format(ext)))

            # ITERATE THROUGH EACH FILE
            for path in files:
                # Create input features and labels
                text = file2str(path)
                ids = str2ids(text, word2id=word2id, unknown_id=1)
                data[xkey].append(ids)
                data[ykey].append(class_id)

            print("-- DONE in {}".format(timer.elapsed_string()))

        # RANDOMIZE THE ORDER OF THE data
        # Built as real lists (not lazy `map` objects) so that they can be
        # sliced below, and so that x and y are both reordered from the
        # original, not-yet-rebound lists.
        n = len(data[ykey])
        np.random.seed(seed=seed)
        order = np.random.permutation(n)
        data[xkey] = [data[xkey][i] for i in order]
        data[ykey] = [data[ykey][i] for i in order]

    # VALIDATION DATA - Split a portion of train data for validation
    # Done once, after the loop. Doing it per dataset would overwrite the
    # validation set and shrink the train set a second time.
    n_valid = int(len(data["ytrain"]) * valid_ratio)
    data["xvalid"] = data["xtrain"][:n_valid]
    data["yvalid"] = data["ytrain"][:n_valid]
    data["xtrain"] = data["xtrain"][n_valid:]
    data["ytrain"] = data["ytrain"][n_valid:]

    return data
Exemple #5
0
def train_n_epochs(model,
                   hyper,
                   data,
                   data_valid,
                   evals,
                   n_epochs,
                   feedbacks_per_epoch=10,
                   alpha_decay=1.0):
    """ Train the model for a desired amount of epochs.
        Automatically takes snapshots of the parameters after each epoch, and
        monitors the progress.

        On a keyboard interrupt, recovery copies of the parameters and the
        evals are stored, the hyperparameters are saved, and `evals` is
        returned as usual.

        Relies on the module-level names `char2id`, MODEL_NAME,
        SNAPSHOTS_DIR, MODELS_DIR, EVALS_FILE, HYPERPARAMS_FILE and
        LEARNING_CURVES_FILE.

    Args:
        model:      The model to train. Must expose `alpha` and
                    `update_learning_rate()`.
        hyper:      (dict) hyperparameters dictionary. "SAMPLE_LENGTH" and
                    "BATCH_SIZE" are read, "LAST_ALPHA" is written.
        data:       (str) Training data
        data_valid: (str) Validation data
        evals:      (dict of lists)
                    The dict that stores the losses and times for each epoch.
                    The keys "train_loss", "train_time", "alpha",
                    "valid_loss" and "valid_time" are appended to.
        n_epochs:   (int) Number of epochs to train for
        feedbacks_per_epoch: (int) Max number of progress printouts per epoch
        alpha_decay: (float)(default=1.0)
                    How much to decay the alpha by after each epoch.

    Returns: (dict)
        - evals - the dictionary that monitors the losses, and times
    """
    timer = Timer()
    timer.start()

    # CALCULATE NUMBER OF STEPS NEEDED
    # Technically the following calculation for `samples_per_epoch` is incorrect,
    # since we are randomly sampling windows, and not dividing the data into an
    # even number of chunks.
    # But it is still a useful approximation, that allows us to have more variation
    # in the training data.
    samples_per_epoch = int(len(data) // hyper["SAMPLE_LENGTH"])
    steps_per_epoch = int(samples_per_epoch / hyper["BATCH_SIZE"])
    # Never let this drop to 0 (happens when there are fewer steps than
    # requested feedbacks), since it is used as a modulus by `train_n_steps`.
    feedback_every = max(1, int(steps_per_epoch / feedbacks_per_epoch))

    try:
        for i in range(n_epochs):
            print()
            print("=" * 60)
            print("EPOCH {}/{} ({:0.2f}%) alpha={}".format(
                i + 1, n_epochs, 100 * (i / n_epochs), model.alpha))
            print("=" * 60)

            # TRAIN OVER A SINGLE EPOCH
            train_loss, epoch_time = train_n_steps(
                model,
                hyper,
                data,
                n_steps=steps_per_epoch,
                batch_size=hyper["BATCH_SIZE"],
                feedback_every=feedback_every)

            evals["train_loss"].append(train_loss)
            evals["train_time"].append(epoch_time)
            evals["alpha"].append(model.alpha)

            # EVALUATE ON VALIDATION DATA
            eval_loss, eval_time = eval_model(
                model,
                data_valid,
                char2id,
                seq_length=hyper["SAMPLE_LENGTH"],
                batch_size=hyper["BATCH_SIZE"])
            evals["valid_loss"].append(eval_loss)
            evals["valid_time"].append(eval_time)

            # PREPARE MODEL FOR NEXT EPOCH
            model.update_learning_rate(model.alpha * alpha_decay)
            hyper["LAST_ALPHA"] = model.alpha

            # TAKE SNAPSHOTS - of parameters and evaluation dictionary
            # The epoch number counts all epochs recorded in evals, not
            # just the ones run in this call.
            global_epoch = len(evals["train_loss"])
            epoch_snapshot(model,
                           epoch=global_epoch,
                           loss=eval_loss,
                           name=MODEL_NAME,
                           dir=SNAPSHOTS_DIR)
            obj2pickle(evals, EVALS_FILE)
            save_hyper_params(hyper, HYPERPARAMS_FILE)

            # FEEDBACK PRINTOUTS
            # TODO: Save a sample numerous generated strings to files at each epoch
            # Print a sample of generated text
            print_sample_generation(model, char2id, exploration=0.85)
            epoch_template = "({}) TRAIN_LOSS={: 7.3f} VALID_LOSS={: 7.3f}"
            print(
                epoch_template.format(timer.elapsed_string(), train_loss,
                                      eval_loss))

            # UPDATE LEARNING CURVE PLOT
            plot_learning_curves(evals,
                                 file=LEARNING_CURVES_FILE,
                                 model_name=MODEL_NAME)

        print("- DONE")
        return evals

    # HANDLE EARLY TERMINATION
    except KeyboardInterrupt:
        print("\n A keyboard interrupt was triggered at",
              timer.elapsed_string())

        # Save parameters as a recovery file
        print("Storing Recovery parameters")
        file = os.path.join(SNAPSHOTS_DIR, MODEL_NAME + ".recovery_params")
        take_snapshot(model, file)

        # Save evals as a recovery file
        # NOTE(review): this goes to MODELS_DIR while the recovery params go
        # to SNAPSHOTS_DIR - confirm that this is intended.
        print("Storing Recovery evals")
        file = os.path.join(MODELS_DIR, MODEL_NAME + ".recovery_evals")
        obj2pickle(evals, file)

        # Save hyper parameters
        print("Saving Hyper Params")
        hyper["LAST_ALPHA"] = model.alpha
        save_hyper_params(hyper, HYPERPARAMS_FILE)

        print("OK DONE")
        return evals
Exemple #6
0
def train_n_steps(model,
                  hyper,
                  train_data,
                  n_steps=1000,
                  batch_size=32,
                  feedback_every=1000):
    """ Runs `n_steps` training steps on random batches of `train_data`,
        printing feedback every `feedback_every` steps.

        Relies on the module-level name `char2id` when creating batches.

    Args:
        model:          The model to train (passed through to `train`)
        hyper:          (dict) "SAMPLE_LENGTH" is read for batch creation
        train_data:     The training data to sample batches from
        n_steps:        (int) Number of training steps to run
        batch_size:     (int) Number of samples per batch
        feedback_every: (int) Number of steps between feedback printouts

    Returns: (tuple)
        avg_loss, total_train_time
    """
    total_timer = Timer()
    feedback_timer = Timer()
    total_timer.start()
    feedback_timer.start()

    total_loss = 0
    feedback_loss = 0
    for step in range(1, n_steps + 1):
        # One training step on a freshly sampled batch
        X, Y = random_training_batch(train_data, char2id,
                                     hyper["SAMPLE_LENGTH"], batch_size)
        loss = train(model, X, Y)

        # Accumulate for the overall average and for the feedback window
        total_loss += loss
        feedback_loss += loss

        # Nothing more to do unless this step closes a feedback window
        if step % feedback_every != 0:
            continue

        # Mean loss over the window, and mean train time per sample
        window_loss = feedback_loss / feedback_every
        window_time = feedback_timer.elapsed()
        time_per_sample = window_time / feedback_every / batch_size

        print_train_feedback(step,
                             total_steps=n_steps,
                             loss=window_loss,
                             elapsed_time=total_timer.elapsed(),
                             avg_time=time_per_sample)

        # Begin a new feedback window
        feedback_timer.start()
        feedback_loss = 0

    # Average loss over all steps, and total training time
    return total_loss / n_steps, total_timer.elapsed()
Exemple #7
0
def epoch_evaluations(s, evals, loss, alpha, tr_time, data, paths):
    """ Evaluate the model on the train-evaluation subset and on the
        validation data, and record the outcome.

        The metrics are appended to the `evals` object, which is then
        pickled (and pickled a second time to the "max" file if the newest
        entry is the best so far). A line is printed for the epoch,
        visualisations of the validation predictions are created, and the
        training curves are re-plotted.

    Args:
        s:          (tensorflow session) The current open session.
        evals:      (evals object)
        loss:       (float) The last loss value in training.
        alpha:      (float) The current learning rate
        tr_time:    (float) in milliseconds, how long did it take to train a
                    single sample (on average) for the last batch of data.
        data:       (DataObjects object) should contain the following attributes
                    - valid
                    - train_eval
        paths:      (Paths object) should contain the following attributes with
                    filepaths as values:
                    - learning_curves
                    - evals
                    - evals_max
                    - epoch_vis
    """
    train_set = data.train_eval
    valid_set = data.valid

    # METRICS ON THE TRAIN-EVALUATION SUBSET
    digits_tr, bboxes_tr = in_session_predictions(s=s, data=train_set)
    pda_tr = 100 * per_element_accuracy(digits_tr, train_set.Y)
    wna_tr = 100 * full_row_accuracy(digits_tr, train_set.Y)
    ious_tr = avg_multi_column_iou(bboxes_tr, train_set.BBOX, axis=0)
    # First entry is read but not reported; the rest are averaged
    w_iou_tr = ious_tr[0]
    digit_iou_tr = ious_tr[1:].mean()

    # METRICS ON THE VALIDATION DATA - only the prediction call is timed
    timer = Timer()
    timer.start()
    digits_va, bboxes_va = in_session_predictions(s=s, data=valid_set)
    # Time per validation sample, scaled by 1000
    avg_pred_time = 1000 * timer.stop() / float(valid_set.n_samples)

    pda_va = 100 * per_element_accuracy(digits_va, valid_set.Y)
    wna_va = 100 * full_row_accuracy(digits_va, valid_set.Y)
    ious_va = avg_multi_column_iou(bboxes_va, valid_set.BBOX, axis=0)
    # First entry is read but not reported; the rest are averaged
    w_iou_va = ious_va[0]
    digit_iou_va = ious_va[1:].mean()

    # RECORD THE NEW ENTRIES
    evals.append(loss=loss,
                 alpha=alpha,
                 pda_train=pda_tr,
                 pda_valid=pda_va,
                 wna_train=wna_tr,
                 wna_valid=wna_va,
                 iou_train=digit_iou_tr,
                 iou_valid=digit_iou_va,
                 time_train=tr_time,
                 time_pred=avg_pred_time)

    # PERSIST THE EVALS - plus a separate copy when this is the best so far
    evals.save_dict_pickle(f=paths.evals)
    is_new_max = evals.newest_is_max()
    if is_new_max:
        evals.save_dict_pickle(f=paths.evals_max)

    # PRINTOUT - a trailing star marks the current max
    line_end = " *\n" if is_new_max else "\n"
    evals.print_line(end=line_end)

    # VISUALISATIONS
    epoch_visualisations(path=paths.epoch_vis,
                         epoch=evals.epochs,
                         data=valid_set,
                         bboxes_pred=bboxes_va,
                         digits_pred=digits_va)
    plot_training_curves(evals,
                         crop=(None, None),
                         saveto=paths.learning_curves)
Exemple #8
0
def run_session(graph, data, paths, alpha=0.001, epochs=1, verbose=True):
    """ Runs a training session.

        For each epoch the training data is shuffled and trained on in
        batches, with the loss printed a few times along the way. At the
        end of each epoch the evaluations are run and checkpoints are
        saved (plus a separate "max" checkpoint when the newest evaluation
        is the best so far).

    Args:
        graph: (Tensorflow graph) The graph that contains the model
        data:  (DataObjects) DataObjects object with the following attributes:
               - train (DataObj) containing the training data
               - valid (DataObj) containing the validation data
               - train_eval (DataObj) containing the portion of train data to be
                 used for evaluation.
        paths: (Paths object) The paths, containing the following attributes:
                - checkpoint
                - checkpoint_max
                - evals
                - evals_max
                - learning_curves
                - epoch_vis

        alpha:  (float) learning rate
        epochs: (int) number of epochs to run.
        verbose: (bool) Passed on to `tf_initialize_vars_from_file` when
                the variables are initialized.
    """
    assert_these_attributes(paths, "Paths Object", [
        "checkpoint", "checkpoint_max", "evals", "evals_max",
        "learning_curves", "epoch_vis"
    ])
    assert_these_attributes(data, "DataObjects object",
                            ["train", "valid", "train_eval"])

    # PREPARE SESSION
    timer = Timer()
    print_headers("SESSION", border="=", width=PRINT_WIDTH)
    print("Training on {} samples in batches of {}".format(
        data.train.n_samples, data.train.batchsize))
    print("Alpha: ", alpha)
    evals = Evals(pickle=paths.evals, verbose=True)

    with tf.Session(graph=graph) as sess:
        # GET IMPORTANT OPERATIONS AND TENSORS FROM GRAPH
        g = GraphOps(graph, "X", "Y", "BBOX", "is_training", "alpha", "train",
                     "loss", "digit_logits", "bbox_logits")
        g.update_moving_avgs = tf.group(
            *tf.get_collection("update_moving_averages"))

        # INITIALIZE VARIABLES
        saver = tf.train.Saver(name="saver")
        tf_initialize_vars_from_file(f=paths.checkpoint,
                                     s=sess,
                                     saver=saver,
                                     verbose=verbose)

        # INTERMITTENT FEEDBACK SETTINGS
        # Number of batches between loss printouts. Computed once up front,
        # and kept at 1 or more so that it is always a valid modulus, even
        # when there are fewer batches than feedback steps.
        n_feedback_steps = 4
        feedback_steps = max(1, int(data.train.n_batches / n_feedback_steps))

        # TRAIN FOR SEVERAL EPOCHS
        evals.print_header()
        for epoch in range(epochs):
            # PREPARE FOR A NEW EPOCH
            timer.start()
            data.train.shuffle()

            # TRAIN IN BATCHES
            # NOTE: `loss` is only bound inside this loop, so at least one
            # batch is needed for the evaluations below.
            for batch_n in range(data.train.n_batches):
                batch = data.train.create_batch(batch_n=batch_n, augment=True)
                loss = train_step(s=sess, g=g, data=batch, alpha=alpha)

                # INTERMITTENT FEEDBACK ON PROGRESS
                if batch_n % feedback_steps == 0:
                    evals.print_loss(loss=loss)

            # EVALUATIONS AT END OF EACH EPOCH
            # Train time per sample, scaled by 1000
            avg_train_time = 1000 * timer.stop() / float(data.train.n_samples)
            epoch_evaluations(s=sess,
                              evals=evals,
                              loss=loss,
                              alpha=alpha,
                              tr_time=avg_train_time,
                              data=data,
                              paths=paths)

            # SAVE CHECKPOINTS
            saver.save(sess, paths.checkpoint)
            if evals.newest_is_max():
                saver.save(sess, paths.checkpoint_max)