def __init__(self, train_data_path, val_data_path=None, additional_data_paths=None,
             hyperparams=None, preprocessing_function=None, log_dir="./",
             use_gpu=False, verbose=True, remove_extra_labels=True):
    """
    Train a supervised fasttext model
    :param train_data_path: str, path to train.txt file
    :param val_data_path: str, path to val.txt file. If val_data_path is None,
        the score won't be kept in history.json
    :param additional_data_paths: list of str, paths of fasttext-format additional
        data to concatenate with the train file
    :param hyperparams: dict, all hyperparams for train_supervised
    :param preprocessing_function: function, function to apply on text data before
        feeding it into the network
    :param log_dir: str, directory to save the training files and the model
    :param use_gpu: bool, use gpu for training
    :param verbose: bool
    :param remove_extra_labels: bool, remove datapoints with labels which appear in
        additional_data_paths but not in train_data_path. Ignored if
        additional_data_paths is None
    :return: object, the trained model
    """
    log_dir = validate(log_dir)
    self.hyperparams = {
        "train_path": handle_space_paths("./train.txt"),
        "validation_path": handle_space_paths(""),
        "min_word_count": 1,
        "min_label_count": 1,
        "label_prefix": "__label__",
        "dim": 100,
        "n_epochs": 10,
        "word_ngrams": 1,
        "sort_ngrams": 0,
        "batch_size": 1024,
        "batch_size_inference": 1024,
        "batch_norm": 0,
        "seed": 17,
        "top_k": 5,
        "learning_rate": 0.3,
        "learning_rate_multiplier": 0.8,
        "dropout": 0.5,
        "l2_reg_weight": 1e-06,
        "data_fraction": 1,
        "save_models": 0,
        "use_validation": 0,
        "use_gpu": 0,
        "gpu_fraction": 0.5,
        "force": 0,
        "cache_dir": handle_space_paths(os.path.abspath(os.path.join(log_dir, "cache"))),
        "result_dir": handle_space_paths(os.path.abspath(os.path.join(log_dir, "results"))),
        "flush": 1,
    }
    assert os.path.exists(train_data_path), "train_data_path is incorrect"
    if val_data_path:
        assert os.path.exists(val_data_path), "val_data_path is incorrect"
        self.hyperparams["use_validation"] = 1
        self.hyperparams["validation_path"] = val_data_path

    # avoid a mutable default argument
    if hyperparams is None:
        hyperparams = {}
    to_restore = {}
    for k, v in hyperparams.items():
        if k not in self.hyperparams:
            to_restore[k] = v
            if k != "split_and_train_params":
                print("WARNING! {} not in hyperparams, ignoring it".format(k))
        elif k in ["train_path", "validation_path", "cache_dir", "result_dir"]:
            self.hyperparams[k] = handle_space_paths(v)
        else:
            self.hyperparams[k] = v

    train_data_path = os.path.abspath(train_data_path)
    if additional_data_paths:
        assert isinstance(additional_data_paths, list), \
            "type of additional_data_paths should be list"
        data_to_save = []
        paths_joined_hashed = hash_(" ".join(additional_data_paths))
        concat_path = "/tmp/tmp.txt"
        joined_path = "/tmp/{}.txt".format(paths_joined_hashed)
        _, all_labels = parse_txt(train_data_path)
        unique_labels = set(all_labels)
        for additional_data_path in additional_data_paths:
            assert os.path.exists(additional_data_path), \
                "{} in additional_data_paths is incorrect".format(additional_data_path)
            current_data, current_labels = parse_txt(additional_data_path,
                                                     join_desc=True)
            if remove_extra_labels:
                needed_inds = [i for i, j in enumerate(current_labels)
                               if j in unique_labels]
                current_data = [current_data[i] for i in needed_inds]
                current_labels = [current_labels[i] for i in needed_inds]
            data_to_save.extend(
                ["{}{} {}".format(self.hyperparams["label_prefix"], i, j)
                 for i, j in zip(current_labels, current_data)])
        with open(concat_path, "w+") as outfile:
            outfile.write("\n".join(data_to_save))
        os.system("cat {} {} > {}".format(concat_path, train_data_path, joined_path))
        self.hyperparams["train_path"] = joined_path
        to_restore["original_train_path"] = train_data_path
        to_restore["additional_data_paths"] = additional_data_paths
    else:
        self.hyperparams["train_path"] = train_data_path

    if use_gpu:
        self.hyperparams["use_gpu"] = 1

    command = self._get_command()
    process = Popen(command, stdout=PIPE, shell=True, stderr=STDOUT,
                    bufsize=1, close_fds=True)
    log_dir_line = ""  # initialized so the split below can't hit an unbound name
    for line in iter(process.stdout.readline, b""):
        line = line.rstrip().decode("utf-8")
        if "stored at" in line:
            log_dir_line = line
        if "accuracy" in line:
            line_split = line.split()
            if "val" in line:
                self.top_1_accuracy = float(line_split[-4][:-1])
                self.top_k_accuracy = float(line_split[-1])
            else:
                if str(1) in line_split:
                    self.top_1_accuracy = float(line_split[-1])
                if str(self.hyperparams["top_k"]) in line_split:
                    self.top_k_accuracy = float(line_split[-1])
        if verbose:
            print(line)
    process.stdout.close()
    log_dir_split = log_dir_line.split("at ")

    for k, v in to_restore.items():
        self.hyperparams[k] = v

    super(train_supervised, self).__init__(
        model_path=os.path.join(log_dir_split[-1],
                                "model_ep{}.pb".format(self.hyperparams["n_epochs"])),
        model_params_path=os.path.join(log_dir_split[-1], "model_params.json"),
        use_gpu=use_gpu,
        label_prefix=self.hyperparams["label_prefix"],
        preprocessing_function=preprocessing_function,
        hyperparams=self.hyperparams)
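# Example usage (a minimal sketch; assumes this __init__ belongs to the
# train_supervised class, as the super() call suggests, and that "train.txt" /
# "val.txt" are hypothetical fasttext-format files with one
# "__label__<label> <text>" line per datapoint):
#
#     model = train_supervised(train_data_path="./train.txt",
#                              val_data_path="./val.txt",
#                              hyperparams={"n_epochs": 20, "dim": 100},
#                              use_gpu=False)
#     print(model.top_1_accuracy, model.top_k_accuracy)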
def run_train(data, train_specific, train_params, data_specific, train_history,
              train_history_path):
    """
    Run training with the given data, parameters and hyperparameters
    :param data: dict, data
    :param train_specific: dict, train hyper-parameters
    :param train_params: dict, train parameters
    :param data_specific: dict, data-specific parameters
    :param train_history: dict, train history
    :param train_history_path: str, path to train history
    :return: None, prints the training outputs
    """
    seed = train_specific["seed"]
    learning_rate = train_specific["learning_rate"]
    embedding_dim = train_specific["embedding_dim"]
    use_batch_norm = train_specific["use_batch_norm"]
    l2_reg_weight = train_specific["l2_reg_weight"]
    num_epochs = train_specific["num_epochs"]
    batch_size = train_specific["batch_size"]
    train_dropout_keep_rate = train_specific["dropout"]
    learning_rate_multiplier = train_specific["learning_rate_multiplier"]
    cache_dir = train_specific["cache_dir"]
    train_path = train_specific["train_path"]
    del train_specific["train_path"]

    train_description_hashes = data["train_description_hashes"]
    train_labels = data["train_labels"]
    test_description_hashes = data["test_description_hashes"]
    test_labels = data["test_labels"]
    label_vocab = data["label_vocab"]
    cache = data["cache"]
    num_words_in_train = data["num_words_in_train"]
    test_path = data["test_path"]
    initial_test_len = data["initial_test_len"]
    num_labels = len(label_vocab)

    use_gpu = train_params["use_gpu"]
    gpu_fraction = train_params["gpu_fraction"]
    use_tensorboard = train_params["use_tensorboard"]
    top_k = train_params["top_k"]
    save_all_models = train_params["save_all_models"]
    compare_top_k = train_params["compare_top_k"]
    use_test = train_params["use_test"]
    log_dir = train_params["log_dir"]
    batch_size_inference = train_params["batch_size_inference"]
    progress_bar = train_params["progress_bar"]
    flush = train_params["flush"]

    hyperparameter_hash = hash_("".join(
        [str(hyperparam) for hyperparam in train_specific.values()]))

    if use_gpu:
        device = "/gpu:0"
        config = tf.ConfigProto(
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction,
                                      allow_growth=True))
    else:
        device = "/cpu:0"
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
        config = tf.ConfigProto(allow_soft_placement=True)

    with tf.device(device):
        with tf.Session(config=config) as sess:
            input_placeholder = tf.placeholder(tf.int32, shape=[None, None],
                                               name="input")
            weights_placeholder = tf.placeholder(tf.float32, shape=[None, None],
                                                 name="input_weights")
            labels_placeholder = tf.placeholder(tf.int32, shape=[None], name="label")
            learning_rate_placeholder = tf.placeholder_with_default(
                learning_rate, shape=[], name="learning_rate")
            dropout_drop_rate_placeholder = tf.placeholder_with_default(
                0., shape=[], name="dropout_rate")
            is_training = tf.placeholder_with_default(False, shape=[],
                                                      name="do_dropout")
            tf.set_random_seed(seed)

            with tf.name_scope("embeddings"):
                token_embeddings = tf.Variable(
                    tf.random.uniform([num_words_in_train, embedding_dim]),
                    name="embedding_matrix")

            with tf.name_scope("mean_sentence_embedding"):
                gathered_embeddings = tf.gather(token_embeddings, input_placeholder)
                weights_broadcasted = tf.expand_dims(weights_placeholder, axis=2)
                mean_embedding = tf.reduce_sum(
                    tf.multiply(weights_broadcasted, gathered_embeddings),
                    axis=1, name="sentence_embedding")

            if use_batch_norm:
                mean_embedding = tf.layers.batch_normalization(mean_embedding,
                                                               training=is_training)
            mean_embedding_dropout = tf.layers.dropout(
                mean_embedding, rate=dropout_drop_rate_placeholder,
                training=is_training)
            logits = tf.layers.dense(
                mean_embedding_dropout, num_labels, use_bias=False,
                kernel_initializer=tf.truncated_normal_initializer(), name="logits")
            # not used during training, but needed for inference
            output = tf.nn.softmax(logits, name="prediction")

            with tf.name_scope("Accuracy"):
                correctly_predicted = tf.nn.in_top_k(logits, labels_placeholder, 1,
                                                     name="Top_1")
                correctly_predicted_top_k = tf.nn.in_top_k(logits, labels_placeholder,
                                                           top_k, name="Top_k")

            if use_tensorboard:
                train_writer = tf.summary.FileWriter(os.path.join(log_dir, "Train"),
                                                     sess.graph)
                train_end_writer = tf.summary.FileWriter(
                    os.path.join(log_dir, "End_epoch_train"))
            if use_test:
                batch_counter = 0
                if use_tensorboard:
                    test_writer = tf.summary.FileWriter(os.path.join(log_dir, "Test"))
                    test_end_writer = tf.summary.FileWriter(
                        os.path.join(log_dir, "End_epoch_test"))

            ce_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels_placeholder, logits=logits),
                name="CE_loss")
            l2_vars = tf.trainable_variables()
            l2_loss = tf.multiply(tf.add_n([tf.nn.l2_loss(v) for v in l2_vars]),
                                  l2_reg_weight, name="L2_loss")
            total_loss = tf.add(ce_loss, l2_loss, name="Total_loss")

            if use_tensorboard:
                tf.summary.scalar("Cross_entropy_loss", ce_loss)
                summary_op = tf.summary.merge_all()
            else:
                summary_op = tf.constant(0)

            # make sure batch-norm statistics are updated before each train step
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = tf.train.AdamOptimizer(
                    learning_rate_placeholder).minimize(total_loss)

            sess.run(tf.global_variables_initializer())
            iteration = 0
            train_start = time.time()
            best_score, best_scores = -1, {1: None, top_k: None}
            logs = {1: [], top_k: [], "best": -1}

            for epoch in range(1, num_epochs + 1):
                start_iteration = iteration
                print("\n\nEpoch {}".format(epoch), flush=flush)
                end_epoch_accuracy, end_epoch_accuracy_k = [], []
                end_epoch_loss, end_epoch_l2_loss, losses = [], [], []
                for batch, batch_weights, batch_labels in \
                        batch_generator(train_description_hashes, train_labels,
                                        batch_size, label_vocab, cache, shuffle=True,
                                        show_progress=progress_bar,
                                        progress_desc="Fit train"):
                    _, train_summary, _loss, correct, correct_k, batch_loss, batch_l2 = \
                        sess.run([train_op, summary_op, total_loss,
                                  correctly_predicted, correctly_predicted_top_k,
                                  ce_loss, l2_loss],
                                 feed_dict={
                                     input_placeholder: batch,
                                     weights_placeholder: batch_weights,
                                     labels_placeholder: batch_labels,
                                     learning_rate_placeholder: learning_rate,
                                     dropout_drop_rate_placeholder:
                                         1 - train_dropout_keep_rate,
                                     is_training: True})
                    if use_tensorboard:
                        train_writer.add_summary(train_summary, iteration)
                    losses.append(_loss)
                    end_epoch_accuracy.extend(correct)
                    end_epoch_accuracy_k.extend(correct_k)
                    end_epoch_loss.append(batch_loss)
                    end_epoch_l2_loss.append(batch_l2)
                    iteration += 1
                end_iteration = iteration

                print("Current learning rate: {}".format(round(learning_rate, 7)),
                      flush=flush)
                learning_rate *= learning_rate_multiplier

                mean_loss = percent_array(losses)
                if np.isnan(mean_loss):
                    print("Loss is NaN. Try using a smaller learning rate")
                    exit()
                print("Moving mean loss: {}".format(mean_loss), flush=flush)
                mean_accuracy = percent_array(end_epoch_accuracy)
                mean_accuracy_k = percent_array(end_epoch_accuracy_k)
                if use_tensorboard:
                    write_summaries(end_epoch_loss, mean_accuracy, mean_accuracy_k,
                                    top_k, train_end_writer, epoch)
                    summary_loss_l2 = tf.Summary(value=[
                        tf.Summary.Value(tag="L2",
                                         simple_value=np.mean(end_epoch_l2_loss))])
                    train_end_writer.add_summary(summary_loss_l2, epoch)
                print("Train moving accuracy: {}, top {}: {}".format(
                    mean_accuracy, top_k, mean_accuracy_k), flush=flush)

                if use_test:
                    num_test_iterations = int(
                        np.ceil(len(test_labels) / batch_size_inference))
                    test_iterations = np.linspace(start_iteration, end_iteration,
                                                  num_test_iterations)
                    end_epoch_accuracy, end_epoch_accuracy_k, end_epoch_loss = [], [], []
                    for index, (batch, batch_weights, batch_labels) in enumerate(
                            batch_generator(test_description_hashes, test_labels,
                                            batch_size_inference, label_vocab, cache,
                                            show_progress=progress_bar,
                                            progress_desc="Test")):
                        correct, correct_k, batch_loss, test_summary = sess.run(
                            [correctly_predicted, correctly_predicted_top_k, ce_loss,
                             summary_op],
                            feed_dict={input_placeholder: batch,
                                       weights_placeholder: batch_weights,
                                       labels_placeholder: batch_labels})
                        if use_tensorboard:
                            test_writer.add_summary(test_summary,
                                                    int(test_iterations[index]))
                        end_epoch_accuracy.extend(correct)
                        end_epoch_accuracy_k.extend(correct_k)
                        end_epoch_loss.append(batch_loss)
                        batch_counter += 1

                    mean_accuracy = np.round(
                        100 * np.sum(end_epoch_accuracy) / initial_test_len, 2)
                    mean_accuracy_k = np.round(
                        100 * np.sum(end_epoch_accuracy_k) / initial_test_len, 2)
                    if use_tensorboard:
                        write_summaries(end_epoch_loss, mean_accuracy, mean_accuracy_k,
                                        top_k, test_end_writer, epoch)
                    print("Test accuracy: {}, top {}: {}".format(
                        mean_accuracy, top_k, mean_accuracy_k), flush=flush)

                    logs[1].append(mean_accuracy)
                    logs[top_k].append(mean_accuracy_k)
                    comparable = mean_accuracy_k if compare_top_k else mean_accuracy
                    if comparable > best_score:
                        best_score = comparable
                        best_scores[1] = mean_accuracy
                        best_scores[top_k] = mean_accuracy_k
                        freeze_save_graph(sess, log_dir, "model_best.pb", "prediction")
                        logs["best"] = epoch

                if save_all_models:
                    freeze_save_graph(sess, log_dir, "model_ep{}.pb".format(epoch),
                                      "prediction")
                elif epoch == num_epochs:
                    freeze_save_graph(sess, log_dir, "model_ep{}.pb".format(epoch),
                                      "prediction")
                iteration += 1

            print("Best model mean test accuracy: {}, top {}: {}".format(
                logs[1][logs["best"] - 1], top_k, logs[top_k][logs["best"] - 1]),
                flush=flush)
            print("The model is stored at {}".format(log_dir), flush=flush)

            if use_test:
                results = {"hyperparams": train_specific,
                           "scores": {test_path: best_scores}}
            else:
                results = {"hyperparams": train_specific,
                           "scores": {train_path: best_scores}}
            train_history[hyperparameter_hash] = results
            with open(os.path.join(log_dir, "results.json"), "w+") as outfile:
                json.dump(results, outfile)
            with open(os.path.join(cache_dir, "details.json"), "w+") as outfile:
                json.dump(data_specific, outfile)
            with open(train_history_path, "w+") as outfile:
                json.dump(train_history, outfile)
            with open(os.path.join(log_dir, "accuracy_logs.json"), "w+") as outfile:
                json.dump(logs, outfile)
            print("The training took {} seconds".format(
                round(time.time() - train_start, 0)), flush=flush)
            print("Peak memory usage: {}".format(
                round(tracemalloc.get_traced_memory()[1] / 1e6, 0)), flush=flush)
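# The core of the graph built above is a fastText-style classifier: a weighted mean
# of token embeddings followed by a single linear layer. A minimal NumPy sketch of
# that forward pass (names and shapes are illustrative, not the trained graph):
import numpy as np


def sentence_logits(embedding_matrix, dense_kernel, token_ids, token_weights):
    """embedding_matrix: [vocab, dim]; dense_kernel: [dim, num_labels];
    token_ids: [batch, seq]; token_weights: [batch, seq], 1/num_tokens per token."""
    gathered = embedding_matrix[token_ids]                   # [batch, seq, dim]
    mean_emb = (token_weights[..., None] * gathered).sum(1)  # [batch, dim]
    return mean_emb @ dense_kernel                           # [batch, num_labels]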
def cache_data(descriptions, labels, word_vocab, label_vocab, word_ngrams, sort_ngrams,
               cache=None, is_test_data=False, show_progress=True, progress_desc=None,
               print_postfix="\n", flush=False):
    """
    Cache data in order not to do repetitive work
    :param descriptions: list, tokenized input texts; each is hashed for the cache key
    :param labels: list
    :param word_vocab: dict, mapping of words and n-grams to their indices
    :param label_vocab: dict, mapping of labels to their indices
    :param word_ngrams: int
    :param sort_ngrams: bool
    :param cache: dict
    :param is_test_data: bool
    :param show_progress: bool, show progress bar
    :param progress_desc: str, description for progress bar
    :param print_postfix: str
    :param flush: bool, flush after printing
    :return: tuple, (description hashes, labels, cache)
    """
    if cache is None:
        cache = dict()
    description_hashes, kept_labels = [], []
    descriptions_thrown, labels_thrown = 0, 0
    disable_progressbar = not show_progress
    if disable_progressbar and progress_desc:
        print(progress_desc, flush=flush)

    for description, label in zip(tqdm(descriptions, disable=disable_progressbar,
                                       desc=progress_desc, file=sys.stdout), labels):
        phrase_indices = [0] + [
            word_vocab[phrase]["id"]
            for phrase in get_all(description, word_ngrams, sort_ngrams)
            if phrase in word_vocab
        ]
        if len(phrase_indices) == 1:
            descriptions_thrown += 1
            continue
        if label not in label_vocab:
            if is_test_data:
                labels_thrown += 1
                continue
        tmp_hash = hash_(str(description))
        if tmp_hash not in cache:
            desc_weights = [1. / len(phrase_indices)] * len(phrase_indices)
            cache[tmp_hash] = {"i": phrase_indices, "w": desc_weights}
        kept_labels.append(label)
        description_hashes.append(tmp_hash)

    if labels_thrown > 0:
        print("{} datapoints thrown because of empty description".format(
            descriptions_thrown), flush=flush)
        print("{} datapoints thrown because of label {}".format(
            labels_thrown, print_postfix), flush=flush)
    else:
        print("{} datapoints thrown because of empty description {}".format(
            descriptions_thrown, print_postfix), flush=flush)
    return description_hashes, kept_labels, cache
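# Example of the cache entries this produces (a minimal sketch; the vocab dicts are
# hypothetical stand-ins for what get_word_label_vocabs builds, and get_all is
# assumed to return the unigrams here since word_ngrams=1):
#
#     word_vocab = {"hello": {"id": 1}, "world": {"id": 2}}
#     label_vocab = {"greeting": 0}
#     hashes, labels, cache = cache_data([["hello", "world"]], ["greeting"],
#                                        word_vocab, label_vocab,
#                                        word_ngrams=1, sort_ngrams=False)
#     # cache[hashes[0]] == {"i": [0, 1, 2], "w": [1/3, 1/3, 1/3]}
#     # index 0 is the reserved padding id; weights sum to 1 for the mean embedding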
def password(self, pass_):
    self._password = hash_(pass_)
def authenticate(self, pass_):
    return self.password == hash_(pass_)
def __init__(self, *args, **kwargs):
    # hash any password passed to the constructor before storing it
    for p in ["password", "_password"]:
        if p in kwargs:
            self._password = hash_(kwargs[p])
            del kwargs[p]
    super(User, self).__init__(*args, **kwargs)
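# Example usage (a minimal sketch; assumes the three methods above belong to a User
# model class whose `password` property setter is the first method, and that hash_
# is the same hashing helper used elsewhere in this repo):
#
#     user = User(username="alice", password="s3cret")  # stored as hash_("s3cret")
#     user.authenticate("s3cret")  # True
#     user.authenticate("wrong")   # False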
def __init__(self, train_path, test_path=None, additional_data_paths=None,
             hyperparams=None, preprocessing_function=None, log_dir="./",
             use_gpu=False, gpu_fraction=0.5, verbose=True,
             remove_extra_labels=True, force=False):
    """
    Train a supervised fasttext model
    :param train_path: str, path to train file
    :param test_path: str or None, path to test file; if None, training will be done
        without test
    :param additional_data_paths: list of str, paths of fasttext-format additional
        data to concatenate with the train file
    :param hyperparams: dict, all hyperparams for train_supervised
    :param preprocessing_function: function, function to apply on text data before
        feeding it into the network
    :param log_dir: str, directory to save the training files and the model
    :param use_gpu: bool, use gpu for training
    :param gpu_fraction: float, gpu fraction to allocate
    :param verbose: bool
    :param remove_extra_labels: bool, remove datapoints with labels which appear in
        additional_data_paths but not in train_path. Ignored if
        additional_data_paths is None
    :param force: bool, force retraining
    :return: object, the trained model
    """
    log_dir = validate(log_dir)
    # default hyperparams
    self.hyperparams = {
        "train_path": "",
        "test_path": "",
        "label_prefix": "__label__",
        "data_fraction": 1,
        "seed": 17,
        "embedding_dim": 100,
        "num_epochs": 10,
        "word_ngrams": 1,
        "sort_ngrams": 0,
        "batch_size": 4096,
        "use_batch_norm": 0,
        "min_word_count": 1,
        "learning_rate": 0.1,
        "learning_rate_multiplier": 0.8,
        "dropout": 0.5,
        "l2_reg_weight": 1e-06,
        "batch_size_inference": 4096,
        "top_k": 3,
        "compare_top_k": 0,
        "save_all_models": 0,
        "use_test": 0,
        "use_gpu": 0,
        "gpu_fraction": 0.5,
        "cache_dir": handle_space_paths(os.path.abspath(os.path.join(log_dir, "cache"))),
        "log_dir": handle_space_paths(os.path.abspath(os.path.join(log_dir, "results"))),
        "force": 0,
        "progress_bar": 1,
        "flush": 1,
    }
    if not os.path.exists(train_path):
        raise FileNotFoundError("train_path is incorrect")
    if test_path and not os.path.exists(test_path):
        raise FileNotFoundError("test_path is incorrect")

    if preprocessing_function and verbose:
        print("Preprocessing train data ...")

    to_restore = dict()
    if hyperparams is None:
        hyperparams = dict()
    do_preprocessing = preprocessing_function is not None

    for key, value in hyperparams.items():
        if key not in self.hyperparams:
            to_restore[key] = value
            print("WARNING! {} not in hyperparams, ignoring it".format(key))
        elif key in ["cache_dir", "log_dir"]:
            self.hyperparams[key] = handle_space_paths(value)
        else:
            self.hyperparams[key] = value

    train_path = os.path.abspath(train_path)
    if additional_data_paths:
        if not isinstance(additional_data_paths, list):
            raise ValueError("Type of additional_data_paths should be list")
        data_to_save = []
        paths_joined_hashed = hash_(" ".join(additional_data_paths))
        concat_path = "./tmp.txt"
        joined_path = "./{}.txt".format(paths_joined_hashed)
        _, all_labels = parse_txt(train_path)
        unique_labels = np.unique(all_labels)
        for additional_data_path in additional_data_paths:
            if not os.path.isfile(additional_data_path):
                raise FileNotFoundError(
                    "{} in additional data paths doesn't exist".format(
                        additional_data_path))
            current_data, current_labels = parse_txt(additional_data_path)
            if remove_extra_labels:
                needed_mask = np.in1d(current_labels, unique_labels)
                current_data = [data for data, needed in
                                zip(current_data, needed_mask) if needed]
                current_labels = [label for label, needed in
                                  zip(current_labels, needed_mask) if needed]
            if do_preprocessing:
                data_to_save.extend([
                    "{}{} {}".format(self.hyperparams["label_prefix"], label,
                                     preprocessing_function(data))
                    for label, data in zip(current_labels, current_data)])
            else:
                data_to_save.extend([
                    "{}{} {}".format(self.hyperparams["label_prefix"], label, data)
                    for label, data in zip(current_labels, current_data)])
        np.savetxt(concat_path, data_to_save, fmt="%s")

        if do_preprocessing:
            prep_train_path = preprocess_data(train_path, preprocessing_function)
            os.system("cat {} {} > {}".format(concat_path, prep_train_path,
                                              joined_path))
            to_restore["original_train_path"] = prep_train_path
        else:
            os.system("cat {} {} > {}".format(concat_path, train_path, joined_path))
            to_restore["original_train_path"] = train_path
        self.hyperparams["train_path"] = joined_path
        to_restore["additional_data_paths"] = additional_data_paths
    else:
        if do_preprocessing:
            prep_train_path = preprocess_data(train_path, preprocessing_function)
            self.hyperparams["train_path"] = prep_train_path
        else:
            self.hyperparams["train_path"] = train_path

    if preprocessing_function and verbose:
        print("Done!")

    if test_path is not None:
        test_path = os.path.abspath(test_path)
        self.hyperparams["use_test"] = 1
        if do_preprocessing:
            prep_test_path = preprocess_data(test_path, preprocessing_function)
            to_restore["original_test_path"] = test_path
            self.hyperparams["test_path"] = prep_test_path
        else:
            self.hyperparams["test_path"] = test_path

    if use_gpu:
        self.hyperparams["use_gpu"] = 1
        self.hyperparams["gpu_fraction"] = gpu_fraction
    if force:
        self.hyperparams["force"] = 1

    # using Popen, as calling the command from Jupyter doesn't deallocate GPU memory
    train_command = self._get_train_command()
    process = Popen(train_command, stdout=PIPE, shell=True, stderr=STDOUT,
                    bufsize=1, close_fds=True)
    self.top_1_accuracy, self.top_k_accuracy, log_dir = \
        get_accuracy_log_dir(process, self.hyperparams["top_k"], verbose)

    for key, value in to_restore.items():
        self.hyperparams[key] = value
    super(train_supervised, self).__init__(
        model_path=os.path.join(log_dir, "model_best.pb"),
        model_params_path=os.path.join(log_dir, "model_params.json"),
        use_gpu=use_gpu,
        gpu_fraction=gpu_fraction,
        hyperparams=self.hyperparams,
        label_prefix=self.hyperparams["label_prefix"],
        preprocessing_function=preprocessing_function)
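# Example usage of this newer constructor (a minimal sketch; the file paths and
# the lowercasing preprocessor are hypothetical):
#
#     model = train_supervised(train_path="./train.txt",
#                              test_path="./test.txt",
#                              hyperparams={"num_epochs": 20, "word_ngrams": 2},
#                              preprocessing_function=lambda text: text.lower(),
#                              use_gpu=True, gpu_fraction=0.4)
#     print(model.top_1_accuracy, model.top_k_accuracy)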
def main():
    main_start = time.time()
    tracemalloc.start()
    parser = argparse.ArgumentParser()
    # data-specific parameters
    parser.add_argument("-trp", "--train_path", type=str, required=True,
                        help="path to train file", default="")
    parser.add_argument("-tp", "--test_path", type=str, help="path to test file",
                        default="")
    parser.add_argument("-lp", "--label_prefix", type=str, help="label prefix",
                        default="__label__")
    parser.add_argument("-df", "--data_fraction", type=float, default=1,
                        help="data fraction")
    parser.add_argument("-seed", "--seed", type=int, default=17)
    # hyper-parameters
    parser.add_argument("-dim", "--embedding_dim", type=int, default=100,
                        help="length of embedding vector")
    parser.add_argument("-nep", "--num_epochs", type=int, default=5,
                        help="number of epochs")
    parser.add_argument("-wng", "--word_ngrams", type=int, default=1,
                        help="word ngrams")
    parser.add_argument("-sng", "--sort_ngrams", type=int, default=0,
                        help="sort n-grams alphabetically")
    parser.add_argument("-bs", "--batch_size", type=int, default=4096,
                        help="batch size for train")
    parser.add_argument("-bn", "--use_batch_norm", type=int, default=0,
                        help="use batch norm")
    parser.add_argument("-mwc", "--min_word_count", type=int, default=1,
                        help="discard words which appear less than this number")
    parser.add_argument("-lr", "--learning_rate", type=float, default=0.3,
                        help="learning rate")
    parser.add_argument("-lrm", "--learning_rate_multiplier", type=float, default=0.8,
                        help="learning rate multiplier")
    parser.add_argument("-dr", "--dropout", type=float, default=0.5,
                        help="train dropout keep rate")
    parser.add_argument("-l2", "--l2_reg_weight", type=float, default=1e-6,
                        help="regularization weight")
    # parameters
    parser.add_argument("-bsi", "--batch_size_inference", type=int, default=4096,
                        help="batch size for test")
    parser.add_argument("-k", "--top_k", type=int, default=3,
                        help="report results for top k predictions")
    parser.add_argument("-ck", "--compare_top_k", type=int, default=0,
                        help="compare top k accuracies for determining the best model")
    parser.add_argument("-sm", "--save_all_models", type=int, default=0,
                        help="save model after each epoch")
    parser.add_argument("-ut", "--use_test", type=int, default=1,
                        help="evaluate on test data")
    parser.add_argument("-gpu", "--use_gpu", type=int, default=0,
                        help="use gpu for training")
    parser.add_argument("-gpu_fr", "--gpu_fraction", type=float, default=0.5,
                        help="what fraction of gpu to allocate")
    parser.add_argument("-utb", "--use_tensorboard", type=int, default=0,
                        help="use tensorboard")
    parser.add_argument("-cd", "--cache_dir", type=str, help="cache directory",
                        default="./cache/")
    parser.add_argument("-ld", "--log_dir", type=str, help="log directory",
                        default="./results/")
    parser.add_argument("-f", "--force", type=int, default=0, help="force retraining")
    parser.add_argument("-pb", "--progress_bar", type=int, default=1,
                        help="show progress bar")
    parser.add_argument("-fl", "--flush", type=int, default=0,
                        help="flush after print")
    args = parser.parse_args()

    for bool_param in [args.use_batch_norm, args.save_all_models, args.use_test,
                       args.sort_ngrams, args.use_gpu, args.use_tensorboard,
                       args.force, args.flush, args.compare_top_k, args.progress_bar]:
        if bool_param not in [0, 1]:
            raise ValueError("{} should be 0 or 1.".format(bool_param))

    train_path = os.path.abspath(args.train_path)
    sort_ngrams = bool(args.sort_ngrams)
    progress_bar = bool(args.progress_bar)
    flush = bool(args.flush)
    use_test = False
    if args.test_path:
        args.test_path = os.path.abspath(args.test_path)
        if bool(args.use_test):
            use_test = True
    print("\n\nTraining with arguments:\n{}\n".format(args))

    cache_dir = validate(args.cache_dir)
    log_dir = validate(args.log_dir)
    train_history_path = os.path.join(log_dir, "history.json")
    np.random.seed(args.seed)

    train_descriptions, train_labels, max_words = \
        parse_txt(train_path, as_tokens=True, return_max_len=True,
                  fraction=args.data_fraction, seed=args.seed,
                  label_prefix=args.label_prefix)

    data_specific = {
        "seed": args.seed,
        "data_fraction": args.data_fraction,
        "min_word_count": args.min_word_count,
        "word_ngrams": args.word_ngrams,
        "sort_ngrams": sort_ngrams,
    }
    data_hash = get_cache_hash(list_of_texts=train_descriptions,
                               data_specific_params=data_specific)
    cache_dir = os.path.abspath(validate(os.path.join(cache_dir, data_hash)))

    train_specific = {
        "embedding_dim": args.embedding_dim,
        "num_epochs": args.num_epochs,
        "batch_size": args.batch_size,
        "learning_rate": args.learning_rate,
        "learning_rate_multiplier": args.learning_rate_multiplier,
        "use_batch_norm": bool(args.use_batch_norm),
        "l2_reg_weight": args.l2_reg_weight,
        "dropout": args.dropout,
        "cache_dir": cache_dir,
    }
    for k, v in data_specific.items():
        train_specific[k] = v

    model_params = {
        "word_ngrams": args.word_ngrams,
        "sort_ngrams": sort_ngrams,
        "word_dict_path": os.path.abspath(os.path.join(cache_dir, "word_dict.json")),
        "label_dict_path": os.path.abspath(os.path.join(cache_dir, "label_dict.json")),
    }
    hyperparams_hashed = hash_("".join([str(i) for i in train_specific.values()]))
    current_log_dir = validate(os.path.join(log_dir, hyperparams_hashed))
    data_specific["train_path"], train_specific["train_path"] = train_path, train_path

    train_params = {
        "use_gpu": bool(args.use_gpu),
        "gpu_fraction": args.gpu_fraction,
        "use_tensorboard": bool(args.use_tensorboard),
        "top_k": args.top_k,
        "save_all_models": bool(args.save_all_models),
        "compare_top_k": bool(args.compare_top_k),
        "use_test": use_test,
        "log_dir": current_log_dir,
        "batch_size_inference": args.batch_size_inference,
        "progress_bar": progress_bar,
        "flush": flush,
    }

    if os.path.exists(train_history_path):
        with open(train_history_path) as infile:
            train_history = json.load(infile)
        if hyperparams_hashed in train_history and \
                check_model_presence(current_log_dir):
            if not bool(args.force):
                # model already trained with these hyperparams: just report accuracy
                if args.test_path:
                    get_accuracy(current_log_dir, train_params, train_history_path,
                                 hyperparams_hashed, train_history, args.test_path,
                                 args.label_prefix)
                else:
                    get_accuracy(current_log_dir, train_params, train_history_path,
                                 hyperparams_hashed, train_history, train_path,
                                 args.label_prefix)
                print("The model is stored at {}".format(current_log_dir))
                exit()
            else:
                print("Forced retraining")
                print("Training hyper-parameters hashed: {}".format(
                    hyperparams_hashed))
        else:
            print("Training hyper-parameters hashed: {}".format(hyperparams_hashed))
    else:
        train_history = dict()

    clean_directory(current_log_dir)
    max_words_with_ng = get_max_words_with_ngrams(max_words, args.word_ngrams)
    print("Preparing dataset")
    print("Total number of datapoints: {}".format(len(train_descriptions)))
    print("Max number of words in description: {}".format(max_words))
    print("Max number of words with n-grams in description: {}".format(
        max_words_with_ng))

    word_vocab, label_vocab = get_word_label_vocabs(
        train_descriptions, train_labels, args.word_ngrams, args.min_word_count,
        sort_ngrams, cache_dir, bool(args.force), show_progress=progress_bar,
        flush=flush)
    with open(os.path.join(current_log_dir, "model_params.json"), "w+") as outfile:
        json.dump(model_params, outfile)
    num_words_in_train = len(word_vocab)

    train_description_hashes, train_labels, cache = \
        cache_data(train_descriptions, train_labels, word_vocab, label_vocab,
                   args.word_ngrams, sort_ngrams, show_progress=progress_bar,
                   progress_desc="Cache train descriptions", flush=flush)
    del train_descriptions

    test_description_hashes, test_labels = [], []
    initial_test_len = 0
    if use_test:
        test_descriptions, test_labels, max_words_test = parse_txt(
            args.test_path, as_tokens=True, return_max_len=True,
            label_prefix=args.label_prefix)
        initial_test_len = len(test_descriptions)
        print("Total number of test datapoints: {}".format(len(test_descriptions)))
        test_description_hashes, test_labels, cache = \
            cache_data(test_descriptions, test_labels, word_vocab, label_vocab,
                       args.word_ngrams, sort_ngrams, cache=cache, is_test_data=True,
                       show_progress=progress_bar,
                       progress_desc="Cache test descriptions", flush=flush)
        del test_descriptions

    data = {
        "train_description_hashes": train_description_hashes,
        "train_labels": train_labels,
        "test_description_hashes": test_description_hashes,
        "test_labels": test_labels,
        "cache": cache,
        "label_vocab": label_vocab,
        "num_words_in_train": num_words_in_train,
        "test_path": args.test_path,
        "initial_test_len": initial_test_len,
    }
    run_train(data, train_specific, train_params, data_specific, train_history,
              train_history_path)
    print("The whole process took {} seconds".format(
        round(time.time() - main_start, 0)), flush=flush)
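# Example invocation (a minimal sketch; assumes this module is run as a script, e.g.
# via an `if __name__ == "__main__": main()` guard, and that "main.py" and the data
# paths are hypothetical; the flags match the argparse definitions above):
#
#     python main.py --train_path ./train.txt --test_path ./test.txt \
#         --num_epochs 10 --word_ngrams 2 --top_k 3 --use_gpu 1 --gpu_fraction 0.5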