Example #1
def get_data(data_dir, cached_data, vocab_file):
    """ Loads cached data (as sequences of word ids) if it exists, otherwise it
        creates the dataset from the raw IMDB text files and caches the
        processed dataset.
    
    Args:
        data_dir:       (str) The IMDB root directory containing the "train"
                        and "test" subdirectories.
        cached_data:    (str) The path to the pickle file containing the
                        cached data.
        vocab_file:     (str) The file that contains the vocabulary, one
                        token per line in order from most frequent to
                        least frequent.

    Returns:
        (dict)
    """
    if os.path.exists(cached_data):
        print("LOADING CACHED DATA")
        data = pickle2obj(cached_data)
    else:
        print("PROCESSING RAW DATA")
        data = load_data(data_dir=data_dir,
                         vocab_file=vocab_file,
                         valid_ratio=0.3,
                         seed=45)
        print("CACHING DATA")
        obj2pickle(data, cached_data)

    return data
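The `pickle2obj` and `obj2pickle` helpers used throughout these examples are not shown in the listing. A minimal sketch of what they could look like, assuming they are thin wrappers around the standard `pickle` module (the `verbose` keyword mirrors the calls in Examples #4 and #5):

import pickle

def obj2pickle(obj, file, verbose=False):
    # Hypothetical helper: serialize `obj` to `file` using the standard pickle module.
    if verbose:
        print("Saving pickle:", file)
    with open(file, mode="wb") as fileobj:
        pickle.dump(obj, fileobj, protocol=pickle.HIGHEST_PROTOCOL)

def pickle2obj(file):
    # Hypothetical helper: load and return the object stored in `file`.
    with open(file, mode="rb") as fileobj:
        return pickle.load(fileobj)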
Example #2
def extract_word2vec_embeddings(file,
                                n_words,
                                embed_size,
                                id2word,
                                datadir=None):
    """ Tries to load pretrained word2vec weights from a file. If it does
        not exist, then it trains from scratch and caches the trained
        embeddings to that file.

        Returns a numpy array of the trained embeddings according to the
        word order from `id2word`
    """
    if not os.path.isfile(file):
        print("Training word2vec embeddings from scratch")
        embeddings = create_word2vec_vectors(datadir, embed_size=embed_size)
        print("Caching word2vec embeddings")
        obj2pickle(embeddings, file)
    else:
        print("Loading cached word2vec embedings")
        embeddings = pickle2obj(file)

    # Reorder the embeddings to match the word order in `id2word`
    weights = initialize_embeddings(n_words, embed_size)
    for idx, word in enumerate(id2word):
        vector = embeddings.get(word, None)
        if vector is not None:
            weights[idx] = vector

    return weights
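The `initialize_embeddings` helper only needs to return an `(n_words, embed_size)` weight matrix that the pretrained vectors then overwrite row by row. A minimal sketch, assuming a small random-uniform initialization (the real initializer is not part of the listing):

import numpy as np

def initialize_embeddings(n_words, embed_size):
    # Hypothetical helper: random-uniform fallback weights for words that
    # have no pretrained word2vec vector.
    limit = 1.0 / np.sqrt(embed_size)
    weights = np.random.uniform(low=-limit, high=limit,
                                size=(n_words, embed_size))
    return weights.astype(np.float32)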
Example #3
def merge_train_extra(pickles_dir, shuffle=True):
    """ Merges the train and extra datasets. Optionally shuffles them as well.
        then saves the merged data as two pickle files:
            X_train_extra_cropped64.pickle
            Y_train_extra.pickle

    Args:
        pickles_dir: (str) directory containing the pickle files
        shuffle:     (bool) Whether to shuffle the data (default is True)
    """
    print("#" * 60)
    print((" " * 34) + "MERGE TRAIN AND EXTRA DATA")
    print("#" * 60)

    # OPEN TRAIN
    X_train = pickle2obj(os.path.join(pickles_dir, "X_train_cropped64.pickle"))
    Y_train = pickle2obj(os.path.join(pickles_dir, "Y_train.pickle"))

    # OPEN EXTRA
    X_extra = pickle2obj(os.path.join(pickles_dir, "X_extra_cropped64.pickle"))
    Y_extra = pickle2obj(os.path.join(pickles_dir, "Y_extra.pickle"))

    # CONCATENATE
    X_merged = np.append(X_train, X_extra, axis=0)
    Y_merged = {}
    for key in Y_train.keys():
        Y_merged[key] = np.append(Y_train[key], Y_extra[key], axis=0)

    # SHUFFLE
    if shuffle:
        random_indices = np.random.permutation(Y_merged["N"].shape[0])
        X_merged = X_merged[random_indices]
        for key in Y_merged.keys():
            Y_merged[key] = Y_merged[key][random_indices]

    # SAVE AS:
    obj2pickle(X_merged,
               file=os.path.join(pickles_dir,
                                 "X_train_extra_cropped64.pickle"))
    obj2pickle(Y_merged,
               file=os.path.join(pickles_dir, "Y_train_extra.pickle"))

    # FEEDBACK
    print()
    print("X")
    print("Train Shape : ", X_train.shape)
    print("Extra Shape : ", X_extra.shape)
    print("Merged Shape: ", X_merged.shape)
    print()
    print("Y")
    for key in Y_merged.keys():
        print("{} : {}".format(key.ljust(10, " "), Y_merged[key].shape))
Example #4
def create_increased_representation_data(pickles_dir):
    """ Loads the merged train+extra data, increases the representation of
        under-represented samples, and saves the augmented arrays to new
        pickle files.
    """
    print("-" * 60)
    print((" " * 35) + "INCREASING REPRESENTATION")
    print("-" * 60)

    # LOAD DATA
    X = pickle2obj(os.path.join(pickles_dir, "X_train_extra_cropped64.pickle"))
    Y = pickle2obj(os.path.join(pickles_dir, "Y_train_extra.pickle"))

    # INCREASE REPRESENTATION
    X, Y = increase_representation(X, Y, min_samples=5000)

    # SAVE PICKLES
    obj2pickle(X,
               os.path.join(pickles_dir, "X_aug_train_extra_cropped64.pickle"),
               verbose=True)
    obj2pickle(Y,
               os.path.join(pickles_dir, "Y_aug_train_extra.pickle"),
               verbose=True)

    # EXPLORATORY PRINTOUT
    explore_data(X=X, labels=Y)
Example #5
def process_the_data(data, data_dir, out_dir, limit=None):
    """Cleans the data labels, creates images array of 64x64 images, and
       saves the data to pickle files.

    Args:
        data: (str) "train", "test", "extra"
        data_dir: (str) the base directory where the train, test, and extra
                        direcotories will all be found.
        out_dir: (str) the directory where you want to save the pickle files.
        limit = (None or int) Only proccess a subset of the data
    Returns:

    """
    print("#" * 60)
    print((" " * 47) + "{} DATA".format(data.upper()))
    print("#" * 60)

    # CREATE CLEANED DATA
    mat_file = os.path.join(data_dir, data, "digitStruct.mat")
    images_dir = os.path.join(data_dir, data)
    X, Y = data_cleaning(matfile=mat_file,
                         images_dir=images_dir,
                         dataset=data,
                         limit=limit)

    # SAVE THE DATA
    obj2pickle(Y,
               file=os.path.join(out_dir, "Y_{}.pickle".format(data)),
               verbose=True)
    obj2pickle(X,
               file=os.path.join(out_dir,
                                 "X_{}_cropped64.pickle".format(data)),
               verbose=True)

    # EXPLORATORY PRINTOUT
    explore_data(X=X, labels=Y)
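A plausible driver for `process_the_data` would simply loop over the three subsets; the snippet below is illustrative only, and the paths are hypothetical:

for subset in ["train", "extra", "test"]:
    process_the_data(data=subset,
                     data_dir="/path/to/raw_data",   # hypothetical location of train/test/extra
                     out_dir="/path/to/pickles",     # hypothetical output directory
                     limit=None)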
Example #6
def save_pickle(obj, f):
    print("Saving: ", f)
    obj2pickle(obj, file=f)
    print("--DONE")
Example #7
def train_n_steps(model,
                  data,
                  hyper,
                  evals,
                  n_steps,
                  batch_size=128,
                  print_every=100,
                  eval_every=1000):
    # TODO: Start the epoch timer at the last epoch time from evals (but take
    # into account eval time)
    # TODO: Do not use globals; feed a paths dictionary or object.
    epoch_loss = 0
    snapshot_count = len(evals["loss"])
    start_step = evals["step"][-1] if snapshot_count > 0 else 0
    start_step += 1

    epoch_timer = Timer()
    step_timer = Timer()
    eval_timer = Timer()

    for step in range(1, n_steps + 1):
        step_timer.start()
        X, Y = create_random_batch(data["xtrain"],
                                   data["ytrain"],
                                   batchsize=batch_size,
                                   maxlen=hyper["SAMPLE_LENGTH"])
        loss = train_step(model, X, Y)
        epoch_loss += loss

        # PRINTOUTS
        if step % print_every == 0:
            progress = 100 * float(step) / n_steps
            print_train_feedback(start_step + step,
                                 loss=loss,
                                 progress=progress,
                                 elapsed_time=epoch_timer.elapsed(),
                                 avg_time_ms=step_timer.elapsed() / batch_size)

        # EVALUATIONS AND SNAPSHOTS
        if (step % eval_every == 0):
            epoch_time = epoch_timer.elapsed()

            print("=" * 60)
            snapshot_count += 1
            epoch_loss /= eval_every

            # EVALUATE - on train and validation data
            eval_timer.start()
            train_acc = evaluate_model(model,
                                       data["xtrain"],
                                       data["ytrain"],
                                       seq_maxlen=100)
            eval_time = eval_timer.elapsed()
            valid_acc = evaluate_model(model,
                                       data["xvalid"],
                                       data["yvalid"],
                                       seq_maxlen=100)
            print_epoch_feedback(train_acc, valid_acc, epoch_loss)

            # UPDATE EVALS
            update_evals(evals,
                         loss=epoch_loss,
                         train_acc=train_acc,
                         valid_acc=valid_acc,
                         train_time=epoch_time,
                         eval_time=eval_time,
                         alpha=model.alpha,
                         step=start_step + step)

            # SAVE SNAPSHOTS - of model parameters, and evals dict
            epoch_snapshot(model,
                           snapshot_count,
                           accuracy=valid_acc,
                           name=MODEL_NAME,
                           dir=SNAPSHOTS_DIR)
            obj2pickle(evals, EVALS_FILE)
            save_hyper_params(hyper, HYPERPARAMS_FILE)

            # RESET
            epoch_loss = 0
            epoch_timer.start()
            print("=" * 60)
    print("DONE")
Example #8
#                                                                          TRAIN
################################################################################
# Load evaluations dictionary
evals = get_evals_dict(EVALS_FILE)

# Calculate important steps
steps_per_epoch = int(np.ceil(n_samples / hyper["BATCH_SIZE"]))
n_steps = n_epochs * steps_per_epoch

# TRAIN - and handle early termination through keyboard interrupt
try:
    print("#" * 60)
    print(MODEL_NAME.upper())
    print("#" * 60)
    train_n_steps(model,
                  data,
                  hyper,
                  evals,
                  n_steps=n_steps,
                  batch_size=hyper["BATCH_SIZE"],
                  print_every=int(steps_per_epoch / 5),
                  eval_every=steps_per_epoch)
except KeyboardInterrupt:
    print("\nEARLY TERMINATION INITIATED - saving evals and hypers")
    obj2pickle(evals, EVALS_FILE)
    save_hyper_params(hyper, HYPERPARAMS_FILE)

# TODO: Pad the data with zeroes at the end, not the beginning.
# TODO: Create plots and save them at each epoch
# TODO: Custom weight initialization - See https://discuss.pytorch.org/t/weight-initilzation/157/9
# TODO: Time in the training printout is based on the epoch, not the total time since train_n_steps was called
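`get_evals_dict` presumably follows the same load-or-create caching pattern as Example #1: return the pickled evals dictionary if `EVALS_FILE` exists, otherwise start a fresh one with the keys that `train_n_steps` in Example #7 expects. A hedged sketch (the exact keys are an assumption):

import os

def get_evals_dict(evals_file):
    # Hypothetical helper: load cached evals, or create an empty template.
    if os.path.exists(evals_file):
        return pickle2obj(evals_file)
    return {"step": [], "loss": [], "train_acc": [], "valid_acc": [],
            "train_time": [], "eval_time": [], "alpha": []}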
Example #9
    # LIMIT TRAIN DATA - e.g. during development and debugging
    limit = opts.data_size
    data.train.limit_samples(n=limit, verbose=verbose)

    # PORTION OF THE TRAINING DATA USED FOR EVALUATION
    data.set_train_eval_data(n=1024,
                             random=False,
                             random_transforms=True,
                             batchsize=128,
                             verbose=verbose)

    # CREATE TENSORFLOW GRAPH
    print("USING MODEL: ", opts.model)
    models = {"a": model_a, "b": model_b, "c": model_c, "d": model_d}
    graph = create_graph(logit_func=models[opts.model], settings=opts)

    # RUN TENSORFLOW TRAINING SESSION
    run_session(graph=graph,
                data=data,
                paths=paths,
                alpha=opts.alpha,
                epochs=opts.epochs)

    # SAVE SETTINGS TO TEXT AND PICKLE FILES
    save_dict_as_text_file(d=vars(opts), f=paths.settings_text_file)
    obj2pickle(opts, paths.settings_pickle_file, verbose=verbose)

    # CREATE A COMPARISONS FILE
    comparisons_file(opts, paths)
Example #10
def save_dict_pickle(self, f, verbose=False):
    short_path = limit_string(f, front=10, tail=31)
    verbose_print("Saving Evals to " + short_path, verbose, end="")
    obj2pickle(self.stuff, file=f)
    verbose_print_done(verbose)
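The `verbose_print` helpers in Example #10 are also not shown. A minimal sketch, assuming they just gate printing on the `verbose` flag:

def verbose_print(message, verbose, end="\n"):
    # Hypothetical helper: print only when verbose is True.
    if verbose:
        print(message, end=end)

def verbose_print_done(verbose):
    # Hypothetical helper: finish the progress line started by verbose_print.
    if verbose:
        print(" -- DONE")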