def run(arguments) -> None:
    vocab_nodes, vocab_actions = build_vocab_from_data_dir(
        [arguments["DATA_DIR"]],
        vocab_size=500,
        max_num_files=arguments.get("--max-num-files"),
    )
    tensorised_nodes, tensorised_actions, _ = load_data_from_dir(
        vocab_nodes,
        vocab_actions,
        length=50,
        data_dirs=[arguments["DATA_DIR"]],
        max_num_files=arguments.get("--max-num-files"),
    )
    for idx in range(min(5, len(tensorised_actions))):
        token_ids = tensorised_actions[idx]
        length = find_first(
            vocab_actions.get_id_or_unk(vocab_actions.get_pad()), token_ids
        )
        tokens = [vocab_actions.get_name_for_id(tok_id) for tok_id in token_ids]
        print("Sample %i:" % idx)
        print(" Real length: %i" % length)
        print(" Tensor length: %i" % len(token_ids))
        print(" Raw tensor: %s (truncated)" % str(token_ids[:length + 2]))
        print(" Interpreted tensor: %s (truncated)" % str(tokens[:length + 2]))
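# `find_first` is used above but not defined in this file. A minimal sketch of
# what it is assumed to do: return the index of the first occurrence of a token
# id (here, the first PAD), or the full sequence length if the token never
# appears. The real helper may differ.
def find_first(token_id, token_ids) -> int:
    """Return the index of the first occurrence of `token_id` in `token_ids`."""
    for idx, tok in enumerate(token_ids):
        if tok == token_id:
            return idx
    return len(token_ids)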
def run(arguments) -> None:
    vocab = build_vocab_from_data_dir(
        arguments["DATA_DIR"],
        vocab_size=500,
        max_num_files=arguments.get("--max-num-files"),
    )
    print("Loaded vocabulary for dataset: ")
    print(" %s [...]" % str(vocab)[:100])
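# The vocabulary objects returned by build_vocab_from_data_dir are used across
# these scripts via get_id_or_unk, get_pad, get_name_for_id and len(); the
# concrete class is defined elsewhere. A minimal stand-in capturing the assumed
# interface (the real implementation may differ):
from typing import List

class SimpleVocabulary:
    PAD = "%PAD%"
    UNK = "%UNK%"

    def __init__(self, tokens: List[str]):
        self._id_to_token = [self.PAD, self.UNK] + tokens
        self._token_to_id = {tok: i for i, tok in enumerate(self._id_to_token)}

    def get_pad(self) -> str:
        return self.PAD

    def get_id_or_unk(self, token: str) -> int:
        return self._token_to_id.get(token, self._token_to_id[self.UNK])

    def get_name_for_id(self, token_id: int) -> str:
        return self._id_to_token[token_id]

    def __len__(self) -> int:
        return len(self._id_to_token)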
def run(arguments) -> None:
    hyperparameters = LanguageModel.get_default_hyperparameters()
    hyperparameters["run_id"] = make_run_id(arguments)
    max_epochs = int(arguments.get("--max-num-epochs"))
    patience = int(arguments.get("--patience"))
    max_num_files = arguments.get("--max-num-files")

    # Override hyperparameters if the flag is passed.
    hypers_override = arguments.get("--hypers-override")
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))

    save_model_dir = arguments["SAVE_DIR"]
    os.makedirs(save_model_dir, exist_ok=True)
    save_file = os.path.join(
        save_model_dir, f"{hyperparameters['run_id']}_best_model.bin"
    )

    print("Loading data ...")
    vocab = build_vocab_from_data_dir(
        data_dir=arguments["TRAIN_DATA_DIR"],
        vocab_size=hyperparameters["max_vocab_size"],
        max_num_files=max_num_files,
    )
    print(f" Built vocabulary of {len(vocab)} entries.")
    train_data = load_data_from_dir(
        vocab,
        length=hyperparameters["max_seq_length"],
        data_dir=arguments["TRAIN_DATA_DIR"],
        max_num_files=max_num_files,
    )
    print(
        f" Loaded {train_data.shape[0]} training samples from {arguments['TRAIN_DATA_DIR']}."
    )
    valid_data = load_data_from_dir(
        vocab,
        length=hyperparameters["max_seq_length"],
        data_dir=arguments["VALID_DATA_DIR"],
        max_num_files=max_num_files,
    )
    print(
        f" Loaded {valid_data.shape[0]} validation samples from {arguments['VALID_DATA_DIR']}."
    )

    model = LanguageModel(hyperparameters, vocab)
    model.build([None, hyperparameters["max_seq_length"]])
    print(
        f"Constructed model, using the following hyperparameters: {json.dumps(hyperparameters)}"
    )

    train(
        model,
        train_data,
        valid_data,
        batch_size=hyperparameters["batch_size"],
        max_epochs=max_epochs,
        patience=patience,
        save_file=save_file,
    )
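# `make_run_id` is referenced above but not shown here. A minimal sketch of one
# plausible implementation, assuming the run id combines a model name with a
# timestamp; the actual helper in this repository may build the id differently.
import time

def make_run_id(arguments) -> str:
    """Build a unique run id, e.g. 'v1-2021-01-01-12-00-00' (format assumed)."""
    model_name = arguments.get("--model", "model")
    timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    return f"{model_name}-{timestamp}"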
train_data_dir = args['--train-data-dir']
with open('./test_outputs/training_data_info.txt', 'w') as output:
    print("Loading data ...")
    # Write the per-corpus summary table to the report file.
    print("%18s %11s %15s %15s %11s %15s"
          % ("Folder name", "File count", "Vocab actions", "Vocab nodes", "Methods", "Time_process"),
          file=output)
    start_time = datetime.now()
    for corpus_dir in training_dirs:
        corpus_path = os.path.join(train_data_dir, corpus_dir)
        file_count = len(get_data_files_from_directory(corpus_path))
        vocab_nodes, vocab_actions = build_vocab_from_data_dir(
            data_dir=corpus_path,
            vocab_size=1000,
            max_num_files=100,
        )
        train_data = load_data_from_dir(
            vocab_nodes,
            vocab_actions,
            length=50,
            data_dir=args["--train-data-dir"],
            max_num_files=1000,
        )
        # train_data = [np.zeros(2)]
        print("%18s %11s %15s %15s %11s %15s"
              % (corpus_dir, file_count,
                 len(vocab_actions), len(vocab_nodes),
                 train_data[0].shape[0], datetime.now() - start_time),
              file=output)
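# `get_data_files_from_directory` is used above and in the training script's
# log output, but is defined elsewhere. A rough sketch of what it is assumed to
# do (collect data files under a directory, optionally capped at
# `max_num_files`); the '.json.gz' suffix and the exact signature are guesses.
import os
from typing import List, Optional

def get_data_files_from_directory(data_dir: str, max_num_files: Optional[int] = None) -> List[str]:
    """Collect data files under `data_dir`, optionally capped at `max_num_files`."""
    files = []
    for root, _, filenames in os.walk(data_dir):
        for filename in filenames:
            if filename.endswith(".json.gz"):  # assumed data-file extension
                files.append(os.path.join(root, filename))
    files = sorted(files)
    if max_num_files is not None:
        files = files[:int(max_num_files)]
    return files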
*_DATA_DIR are directories filled with files that we use as data.

Options:
    -h --help              show this message and exit.
    --max-num-files INT    maximum number of files to consider
    -v --verbose           show unnecessary extra information.
    -f --is_file           the path is a file (and not a folder)
    --debug                debug mode [default: False]
"""
from docopt import docopt
import os
import sys

# Add parent directory dynamically
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from dataset import build_vocab_from_data_dir

if __name__ == '__main__':
    args = docopt(__doc__)
    vocab_nodes, vocab_actions = build_vocab_from_data_dir(
        args["CORPUS_DATA_DIR"], 500, args["--max-num-files"])

    with open('./test_outputs/vocabulary.txt', 'w') as output:
        output.write("Nodes vocabulary has size %d\n" % len(vocab_nodes))
        output.write(" %s [...]\n\n" % str(vocab_nodes)[:1000])
        output.write("Actions vocabulary has size %d\n" % len(vocab_actions))
        output.write(" %s [...]\n\n" % str(vocab_actions)[:1000])
def run(args) -> None:
    hyperparameters = BaseModel.get_default_hyperparameters()
    if args['--model'] == 'v1':
        hyperparameters = SyntacticModelv1.get_default_hyperparameters()
    elif args['--model'] == 'v2':
        hyperparameters = SyntacticModelv2.get_default_hyperparameters()
    elif args['--model'] == 'v3':
        hyperparameters = SyntacticModelv3.get_default_hyperparameters()
    hyperparameters["run_id"] = make_run_id(args)
    max_epochs = int(args.get("--max-num-epochs"))
    patience = int(args.get("--patience"))
    max_num_files = args.get("--max-num-files")

    # Override hyperparameters if the flag is passed.
    hypers_override = args.get("--hypers-override")
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))

    if args['--compute-data']:
        # Create and save the training data.
        if not os.path.isdir(f'./data/{args["--max-num-files"]}'):
            os.mkdir(f'./data/{args["--max-num-files"]}')

        #### Make list of all training data directories
        data_dirs = [os.path.join(args["--train-data-dir"], data_dir)
                     for data_dir in training_dirs]

        logging.info("Computing train/valid/test data ...")
        vocab_nodes, vocab_actions = build_vocab_from_data_dir(
            data_dirs=data_dirs,
            vocab_size=hyperparameters["max_vocab_size"],
            max_num_files=max_num_files,
        )
        logging.info(f" Built vocabulary of {len(vocab_actions)} entries.")

        ### Build Train/Valid/SeenTest datasets
        all_nodes, all_actions, all_fathers = load_data_from_dir(
            vocab_nodes,
            vocab_actions,
            length=hyperparameters["max_seq_length"],
            data_dirs=data_dirs,
            max_num_files=max_num_files,
        )
        logging.info(f" Built dataset of {all_nodes.shape[0]} training/valid/test examples.")

        # Save the vocabularies.
        with open(f'data/{args["--max-num-files"]}/vocab_nodes', 'wb') as output:
            pickle.dump(vocab_nodes, output)
        with open(f'data/{args["--max-num-files"]}/vocab_actions', 'wb') as output:
            pickle.dump(vocab_actions, output)

        # Shuffle and split data into train/valid/test.
        indices = np.arange(all_nodes.shape[0])
        np.random.shuffle(indices)
        ind_len = indices.shape[0]
        train_indices = indices[:int(0.8 * ind_len)]
        valid_indices = indices[int(0.8 * ind_len):int(0.9 * ind_len)]
        seen_test_indices = indices[int(0.9 * ind_len):]

        # Make the datasets.
        train_data = (all_nodes[train_indices], all_actions[train_indices], all_fathers[train_indices])
        valid_data = (all_nodes[valid_indices], all_actions[valid_indices], all_fathers[valid_indices])
        seen_test_data = (all_nodes[seen_test_indices], all_actions[seen_test_indices], all_fathers[seen_test_indices])

        with open(f'data/{args["--max-num-files"]}/train_data', 'wb') as output:
            pickle.dump(train_data, output)
        with open(f'data/{args["--max-num-files"]}/valid_data', 'wb') as output:
            pickle.dump(valid_data, output)
        with open(f'data/{args["--max-num-files"]}/seen_test_data', 'wb') as output:
            pickle.dump(seen_test_data, output)

        ### Build UnseenTest dataset
        unseen_test_data_dirs = [os.path.join(args["--train-data-dir"], data_dir)
                                 for data_dir in unseen_test_dirs]
        logging.info("Computing data ...")
        unseen_test_data = load_data_from_dir(
            vocab_nodes,
            vocab_actions,
            length=hyperparameters["max_seq_length"],
            data_dirs=unseen_test_data_dirs,
            max_num_files=max_num_files,
        )
        with open(f'data/{args["--max-num-files"]}/unseen_test_data', 'wb') as output:
            pickle.dump(unseen_test_data, output)
        logging.info("Finished computing data ...")
        logging.info("Now exiting program. Rerun with loading the data from memory...")
        with open(f'data/{args["--max-num-files"]}/log.txt', "w") as compute_data_log:
            compute_data_log.write("Train/Valid/Seen_test directories are:\n")
            for data_dir in training_dirs:
                compute_data_log.write(f" {data_dir}\n")
            compute_data_log.write("Unseen test directories are:\n")
            for data_dir in unseen_test_dirs:
                compute_data_log.write(f" {data_dir}\n")
            compute_data_log.write(
                f"This has {len(get_data_files_from_directory(data_dirs, max_num_files))} files, from which I "
                f"extracted samples (of length {hyperparameters['max_seq_length']}):\n"
                f" {len(train_indices)} training samples\n"
                f" {len(valid_indices)} validation samples\n"
                f" {len(seen_test_indices)} seen test samples\n"
                f" {len(unseen_test_data[0])} unseen test samples\n")
            compute_data_log.write(f"Nodes vocabulary has size {len(vocab_nodes)}\n")
            compute_data_log.write(f"Actions vocabulary has size {len(vocab_actions)}\n")
        exit(0)

    logging.info("Loading data into memory...")

    ### Load data
    with open(os.path.join(args['--saved-data-dir'], 'vocab_nodes'), 'rb') as input_file:
        vocab_nodes = pickle.load(input_file)
    with open(os.path.join(args['--saved-data-dir'], 'vocab_actions'), 'rb') as input_file:
        vocab_actions = pickle.load(input_file)
    with open(os.path.join(args['--saved-data-dir'], 'train_data'), 'rb') as input_file:
        train_data = pickle.load(input_file)
    with open(os.path.join(args['--saved-data-dir'], 'valid_data'), 'rb') as input_file:
        valid_data = pickle.load(input_file)
    logging.info(f" Loaded {train_data[0].shape[0]} training samples.")
    logging.info(f" Loaded {valid_data[0].shape[0]} validation samples.")

    # Construct model
    if args['--model'] == 'v1':
        model = SyntacticModelv1(hyperparameters, vocab_nodes, vocab_actions)
    elif args['--model'] == 'v2':
        model = SyntacticModelv2(hyperparameters, vocab_nodes, vocab_actions)
    elif args['--model'] == 'v3':
        model = SyntacticModelv3(hyperparameters, vocab_nodes, vocab_actions)
    model.build([None, hyperparameters["max_seq_length"], 3])
    logging.info("Constructed model, using the following hyperparameters:")
    logging.info(json.dumps(hyperparameters))

    # Path for saving the model
    save_model_dir = args["--save-dir"]
    os.makedirs(save_model_dir, exist_ok=True)
    save_file = os.path.join(
        save_model_dir, f"{hyperparameters['run_id']}_best_model.bin"
    )

    train(
        model,
        train_data,
        valid_data,
        batch_size=hyperparameters["batch_size"],
        max_epochs=max_epochs,
        patience=patience,
        save_file=save_file,
        args=args,
    )
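# The `train` helper called above is defined elsewhere. Below is a minimal
# sketch of the assumed behaviour: an epoch loop with patience-based early
# stopping that checkpoints the best model to `save_file`. The `run_one_epoch`
# and `save` methods are assumed names, and the real training loop in this
# repository is more involved.
import logging

def train(model, train_data, valid_data, batch_size, max_epochs, patience, save_file, args=None):
    best_valid_loss = float("inf")
    epochs_without_improvement = 0
    for epoch in range(1, max_epochs + 1):
        train_loss = model.run_one_epoch(train_data, batch_size, training=True)   # assumed API
        valid_loss = model.run_one_epoch(valid_data, batch_size, training=False)  # assumed API
        logging.info(f"Epoch {epoch}: train loss {train_loss:.4f}, valid loss {valid_loss:.4f}")
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            epochs_without_improvement = 0
            model.save(save_file)  # assumed API for checkpointing the best weights
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                logging.info(f"Stopping early after {patience} epochs without improvement.")
                break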