def get_data(data_dir, cached_data, vocab_file):
    """ Loads cached data (as sequences of word ids) if it exists, otherwise
        it creates the dataset from the raw IMDB text files and caches the
        processed dataset.

    Args:
        data_dir:    (str) The IMDB root directory containing the "train"
                     and "test" subdirectories.
        cached_data: (str) The path to the pickle file containing the cached data.
        vocab_file:  (str) The file that contains the vocabulary, one token
                     per line in order from most frequent to least frequent.

    Returns: (dict)
    """
    if os.path.exists(cached_data):
        print("LOADING CACHED DATA")
        data = pickle2obj(cached_data)
    else:
        print("PROCESSING RAW DATA")
        data = load_data(data_dir=data_dir, vocab_file=vocab_file,
                         valid_ratio=0.3, seed=45)
        print("CACHING DATA")
        obj2pickle(data, cached_data)
    return data
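# ------------------------------------------------------------------------------
# NOTE: The functions in this section rely on a pair of pickle helpers
# (`obj2pickle` and `pickle2obj`) defined elsewhere in the repo. The sketch
# below is only a minimal stand-in built on the standard `pickle` module, to
# clarify the assumed interface; the real helpers may differ (e.g. protocol,
# verbosity handling). The `_sketch` names are hypothetical.
# ------------------------------------------------------------------------------
import os
import pickle

def _obj2pickle_sketch(obj, file, verbose=False):
    """ Serializes `obj` to the path `file`, creating parent dirs if needed. """
    if verbose:
        print("Saving pickle to", file)
    parent = os.path.dirname(file)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)
    with open(file, mode="wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def _pickle2obj_sketch(file):
    """ Loads and returns the object stored in the pickle file `file`. """
    with open(file, mode="rb") as f:
        return pickle.load(f)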
def extract_word2vec_embeddings(file, n_words, embed_size, id2word, datadir=None):
    """ Tries to load pretrained word2vec weights from a file. If it does not
        exist, then it trains the embeddings from scratch and caches them to
        that file.

        Returns a numpy array of the trained embeddings, ordered according to
        the word order from `id2word`.
    """
    if not os.path.isfile(file):
        print("Training word2vec embeddings from scratch")
        embeddings = create_word2vec_vectors(datadir, embed_size=embed_size)
        print("Caching word2vec embeddings")
        obj2pickle(embeddings, file)
    else:
        print("Loading cached word2vec embeddings")
        embeddings = pickle2obj(file)

    # Reorder the embeddings so that row i corresponds to id2word[i]
    weights = initialize_embeddings(n_words, embed_size)
    for id, word in enumerate(id2word):
        vector = embeddings.get(word, None)
        if vector is not None:
            weights[id] = vector

    return weights
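# ------------------------------------------------------------------------------
# NOTE: `initialize_embeddings(n_words, embed_size)` is assumed to return a
# randomly initialized [n_words, embed_size] matrix, which the loop above then
# overwrites row-by-row with any pretrained word2vec vectors that exist. A
# minimal sketch of that assumption (small random uniform values, a common
# default for embedding layers); the `_sketch` name is hypothetical.
# ------------------------------------------------------------------------------
import numpy as np

def _initialize_embeddings_sketch(n_words, embed_size, scale=0.05, seed=None):
    """ Returns a [n_words, embed_size] float32 matrix of small random values. """
    rng = np.random.RandomState(seed)
    weights = rng.uniform(low=-scale, high=scale, size=(n_words, embed_size))
    return weights.astype(np.float32)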
def merge_train_extra(pickles_dir, shuffle=True):
    """ Merges the train and extra datasets, optionally shuffling them as well,
        then saves the merged data as two pickle files:

            X_train_extra_cropped64.pickle
            Y_train_extra.pickle

    Args:
        pickles_dir: (str) directory containing the pickle files
        shuffle:     (bool) Should it shuffle the data? (default is True)
    """
    print("#" * 60)
    print((" " * 34) + "MERGE TRAIN AND EXTRA DATA")
    print("#" * 60)

    # OPEN TRAIN
    X_train = pickle2obj(os.path.join(pickles_dir, "X_train_cropped64.pickle"))
    Y_train = pickle2obj(os.path.join(pickles_dir, "Y_train.pickle"))

    # OPEN EXTRA
    X_extra = pickle2obj(os.path.join(pickles_dir, "X_extra_cropped64.pickle"))
    Y_extra = pickle2obj(os.path.join(pickles_dir, "Y_extra.pickle"))

    # CONCATENATE
    X_merged = np.append(X_train, X_extra, axis=0)
    Y_merged = {}
    for key in Y_train.keys():
        Y_merged[key] = np.append(Y_train[key], Y_extra[key], axis=0)

    # SHUFFLE
    if shuffle:
        random_indices = np.random.permutation(Y_merged["N"].shape[0])
        X_merged = X_merged[random_indices]
        for key in Y_merged.keys():
            Y_merged[key] = Y_merged[key][random_indices]

    # SAVE AS:
    obj2pickle(X_merged, file=os.path.join(pickles_dir, "X_train_extra_cropped64.pickle"))
    obj2pickle(Y_merged, file=os.path.join(pickles_dir, "Y_train_extra.pickle"))

    # FEEDBACK
    print()
    print("X")
    print("Train Shape : ", X_train.shape)
    print("Extra Shape : ", X_extra.shape)
    print("Merged Shape: ", X_merged.shape)
    print()
    print("Y")
    for key in Y_merged.keys():
        print("{} : {}".format(key.ljust(10, " "), Y_merged[key].shape))
def create_increased_representation_data(pickles_dir):
    print("-" * 60)
    print((" " * 35) + "INCREASING REPRESENTATION")
    print("-" * 60)

    # LOAD DATA
    X = pickle2obj(os.path.join(pickles_dir, "X_train_extra_cropped64.pickle"))
    Y = pickle2obj(os.path.join(pickles_dir, "Y_train_extra.pickle"))

    # INCREASE REPRESENTATION
    X, Y = increase_representation(X, Y, min_samples=5000)

    # SAVE PICKLES
    obj2pickle(X, os.path.join(pickles_dir, "X_aug_train_extra_cropped64.pickle"), verbose=True)
    obj2pickle(Y, os.path.join(pickles_dir, "Y_aug_train_extra.pickle"), verbose=True)

    # EXPLORATORY PRINTOUT
    explore_data(X=X, labels=Y)
def process_the_data(data, data_dir, out_dir, limit=None):
    """ Cleans the data labels, creates an array of 64x64 images, and saves
        the data to pickle files.

    Args:
        data:     (str) "train", "test", or "extra"
        data_dir: (str) the base directory where the train, test, and extra
                  directories will all be found.
        out_dir:  (str) the directory where you want to save the pickle files.
        limit:    (None or int) Only process a subset of the data.
    """
    print("#" * 60)
    print((" " * 47) + "{} DATA".format(data.upper()))
    print("#" * 60)

    # CREATE CLEANED DATA
    mat_file = os.path.join(data_dir, data, "digitStruct.mat")
    images_dir = os.path.join(data_dir, data)
    X, Y = data_cleaning(matfile=mat_file, images_dir=images_dir, dataset=data, limit=limit)

    # SAVE THE DATA
    obj2pickle(Y, file=os.path.join(out_dir, "Y_{}.pickle".format(data)), verbose=True)
    obj2pickle(X, file=os.path.join(out_dir, "X_{}_cropped64.pickle".format(data)), verbose=True)

    # EXPLORATORY PRINTOUT
    explore_data(X=X, labels=Y)
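# ------------------------------------------------------------------------------
# NOTE: Hypothetical usage sketch for `process_the_data()`, covering the three
# splits named in its docstring. The directory names "data" and "pickles" are
# illustrative placeholders, not paths taken from this repo.
#
#     for split in ["train", "test", "extra"]:
#         process_the_data(data=split, data_dir="data", out_dir="pickles", limit=None)
# ------------------------------------------------------------------------------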
def save_pickle(obj, f):
    print("Saving: ", f)
    obj2pickle(obj, file=f)
    print("--DONE")
def train_n_steps(model, data, hyper, evals, n_steps, batch_size=128, print_every=100, eval_every=1000):
    # TODO: Start epoch timer at last epoch time from evals (but take into
    #       account eval time)
    # TODO: Do not use globals, feed paths dictionary or object.
    epoch_loss = 0
    snapshot_count = len(evals["loss"])
    start_step = evals["step"][-1] if snapshot_count > 0 else 0
    start_step += 1
    epoch_timer = Timer()
    step_timer = Timer()
    eval_timer = Timer()

    for step in range(1, n_steps + 1):
        step_timer.start()
        X, Y = create_random_batch(data["xtrain"], data["ytrain"],
                                   batchsize=batch_size,
                                   maxlen=hyper["SAMPLE_LENGTH"])
        loss = train_step(model, X, Y)
        epoch_loss += loss

        # PRINTOUTS
        if step % print_every == 0:
            progress = 100 * float(step) / n_steps
            print_train_feedback(start_step + step,
                                 loss=loss,
                                 progress=progress,
                                 elapsed_time=epoch_timer.elapsed(),
                                 avg_time_ms=step_timer.elapsed() / batch_size)

        # EVALUATIONS AND SNAPSHOTS
        if (step % eval_every == 0):
            epoch_time = epoch_timer.elapsed()
            print("=" * 60)
            snapshot_count += 1
            epoch_loss /= eval_every

            # EVALUATE - on train and validation data
            eval_timer.start()
            train_acc = evaluate_model(model, data["xtrain"], data["ytrain"], seq_maxlen=100)
            eval_time = eval_timer.elapsed()
            valid_acc = evaluate_model(model, data["xvalid"], data["yvalid"], seq_maxlen=100)
            print_epoch_feedback(train_acc, valid_acc, epoch_loss)

            # UPDATE EVALS
            update_evals(evals,
                         loss=epoch_loss,
                         train_acc=train_acc,
                         valid_acc=valid_acc,
                         train_time=epoch_time,
                         eval_time=eval_time,
                         alpha=model.alpha,
                         step=start_step + step)

            # SAVE SNAPSHOTS - of model parameters, and evals dict
            epoch_snapshot(model, snapshot_count, accuracy=valid_acc, name=MODEL_NAME, dir=SNAPSHOTS_DIR)
            obj2pickle(evals, EVALS_FILE)
            save_hyper_params(hyper, HYPERPARAMS_FILE)

            # RESET
            epoch_loss = 0
            epoch_timer.start()

    print("=" * 60)
    print("DONE")
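# ------------------------------------------------------------------------------
# NOTE: `train_n_steps()` assumes a small `Timer` utility with `start()` and
# `elapsed()` methods (elapsed seconds since the last start). The real class is
# defined elsewhere in the repo; this is only a minimal sketch of that assumed
# interface, under a hypothetical `_TimerSketch` name.
# ------------------------------------------------------------------------------
import time

class _TimerSketch(object):
    """ Tracks elapsed wall-clock time in seconds since `start()` was called. """
    def __init__(self):
        self._start_time = time.time()

    def start(self):
        # Restart the timer from now.
        self._start_time = time.time()

    def elapsed(self):
        # Seconds since the last call to start() (or since construction).
        return time.time() - self._start_time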
# TRAIN
################################################################################
# Load evaluations dictionary
evals = get_evals_dict(EVALS_FILE)

# Calculate important steps
steps_per_epoch = int(np.ceil(n_samples / hyper["BATCH_SIZE"]))
n_steps = n_epochs * steps_per_epoch

# TRAIN - and handle early termination through keyboard interrupt
try:
    print("#" * 60)
    print(MODEL_NAME.upper())
    print("#" * 60)
    train_n_steps(model, data, hyper, evals,
                  n_steps=n_steps,
                  batch_size=hyper["BATCH_SIZE"],
                  print_every=int(steps_per_epoch / 5),
                  eval_every=steps_per_epoch)
except KeyboardInterrupt:
    print("\nEARLY TERMINATION INITIATED - saving evals and hypers")
    obj2pickle(evals, EVALS_FILE)
    save_hyper_params(hyper, HYPERPARAMS_FILE)

# TODO: Pad the data with zeroes at the end, not the beginning.
# TODO: Create plots and save them in each epoch
# TODO: Custom weight initializations - See https://discuss.pytorch.org/t/weight-initilzation/157/9
# TODO: Time in training printout is based on epoch, not total time since train_n_steps was called
# LIMIT TRAIN DATA - eg during development and debugging
limit = opts.data_size
data.train.limit_samples(n=limit, verbose=verbose)

# PORTION OF THE TRAINING DATA USED FOR EVALUATION
data.set_train_eval_data(n=1024,
                         random=False,
                         random_transforms=True,
                         batchsize=128,
                         verbose=verbose)

# CREATE TENSORFLOW GRAPH
print("USING MODEL: ", opts.model)
models = {"a": model_a, "b": model_b, "c": model_c, "d": model_d}
graph = create_graph(logit_func=models[opts.model], settings=opts)

# RUN TENSORFLOW TRAINING SESSION
run_session(graph=graph, data=data, paths=paths, alpha=opts.alpha, epochs=opts.epochs)

# SAVE SETTINGS TO TEXT AND PICKLE FILES
save_dict_as_text_file(d=vars(opts), f=paths.settings_text_file)
obj2pickle(opts, paths.settings_pickle_file, verbose=verbose)

# CREATE A COMPARISONS FILE
comparisons_file(opts, paths)
def save_dict_pickle(self, f, verbose=False):
    short_path = limit_string(f, front=10, tail=31)
    verbose_print("Saving Evals to " + short_path, verbose, end="")
    obj2pickle(self.stuff, file=f)
    verbose_print_done(verbose)