def run(arguments) -> None:
    vocab_nodes, vocab_actions = build_vocab_from_data_dir(
        [arguments["DATA_DIR"]],
        vocab_size=500,
        max_num_files=arguments.get("--max-num-files"),
    )
    tensorised_nodes, tensorised_actions, _ = load_data_from_dir(
        vocab_nodes,
        vocab_actions,
        length=50,
        data_dirs=[arguments["DATA_DIR"]],
        max_num_files=arguments.get("--max-num-files"),
    )

    for idx in range(min(5, len(tensorised_actions))):
        token_ids = tensorised_actions[idx]
        length = find_first(
            vocab_actions.get_id_or_unk(vocab_actions.get_pad()), token_ids)
        tokens = [
            vocab_actions.get_name_for_id(tok_id) for tok_id in token_ids
        ]
        print("Sample %i:" % (idx))
        print(" Real length: %i" % (length))
        print(" Tensor length: %i" % (len(token_ids)))
        print(" Raw tensor: %s (truncated)" % (str(token_ids[:length + 2])))
        print(" Interpreted tensor: %s (truncated)" %
              (str(tokens[:length + 2])))
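The example above calls a find_first helper that is not defined in the snippet; a minimal sketch, assuming it returns the index of the first occurrence of a value in a sequence (or the full length when the value never appears, i.e. an unpadded sample), could look like this:

def find_first(value, sequence):
    # Index of the first element equal to `value` (here: the PAD token id);
    # if there is no padding, the whole tensor counts as real content.
    for idx, element in enumerate(sequence):
        if element == value:
            return idx
    return len(sequence)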
Example #2
def run(arguments) -> None:
    vocab = build_vocab_from_data_dir(
        arguments["DATA_DIR"],
        vocab_size=500,
        max_num_files=arguments.get("--max-num-files"))

    print("Loaded vocabulary for dataset: ")
    print(" %s [...]" % (str(vocab)[:100]))
def run(arguments) -> None:
    hyperparameters = LanguageModel.get_default_hyperparameters()
    hyperparameters["run_id"] = make_run_id(arguments)
    max_epochs = int(arguments.get("--max-num-epochs"))
    patience = int(arguments.get("--patience"))
    max_num_files = arguments.get("--max-num-files")

    # override hyperparams if flag is passed
    hypers_override = arguments.get("--hypers-override")
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))

    save_model_dir = arguments["SAVE_DIR"]
    os.makedirs(save_model_dir, exist_ok=True)
    save_file = os.path.join(save_model_dir,
                             f"{hyperparameters['run_id']}_best_model.bin")

    print("Loading data ...")
    vocab = build_vocab_from_data_dir(
        data_dir=arguments["TRAIN_DATA_DIR"],
        vocab_size=hyperparameters["max_vocab_size"],
        max_num_files=max_num_files,
    )
    print(f"  Built vocabulary of {len(vocab)} entries.")
    train_data = load_data_from_dir(
        vocab,
        length=hyperparameters["max_seq_length"],
        data_dir=arguments["TRAIN_DATA_DIR"],
        max_num_files=max_num_files,
    )
    print(
        f"  Loaded {train_data.shape[0]} training samples from {arguments['TRAIN_DATA_DIR']}."
    )
    valid_data = load_data_from_dir(
        vocab,
        length=hyperparameters["max_seq_length"],
        data_dir=arguments["VALID_DATA_DIR"],
        max_num_files=max_num_files,
    )
    print(
        f"  Loaded {valid_data.shape[0]} validation samples from {arguments['VALID_DATA_DIR']}."
    )
    model = LanguageModel(hyperparameters, vocab)
    model.build([None, hyperparameters["max_seq_length"]])
    print(
        f"Constructed model, using the following hyperparameters: {json.dumps(hyperparameters)}"
    )

    train(
        model,
        train_data,
        valid_data,
        batch_size=hyperparameters["batch_size"],
        max_epochs=max_epochs,
        patience=patience,
        save_file=save_file,
    )
    train_data_dir = args['--train-data-dir']

    os.makedirs('./test_outputs', exist_ok=True)  # ensure the output directory exists
    with open('./test_outputs/training_data_info.txt', 'w') as output:
        print("Loading data ...")
        print("%18s %11s %15s %15s %11s %15s" %
              ("Folder name", "File count", "Vocab actions", "Vocab nodes",
               "Methods", "Elapsed time"))
        start_time = datetime.now()
        for corpus_dir in training_dirs:
            corpus_path = os.path.join(train_data_dir, corpus_dir)

            file_count = len(get_data_files_from_directory(corpus_path))
            vocab_nodes, vocab_actions = build_vocab_from_data_dir(
                data_dir=corpus_path,
                vocab_size=1000,
                max_num_files=100,
            )

            train_data = load_data_from_dir(
                vocab_nodes,
                vocab_actions,
                length=50,
                data_dir=corpus_path,  # load this corpus, not the whole train dir, since the stats are per folder
                max_num_files=1000,
            )
            # train_data = [np.zeros(2)]
            print("%18s %11s %15s %15s %11s %15s" % (corpus_dir, file_count,
                                                     len(vocab_actions), len(vocab_nodes),
                                                     train_data[0].shape[0],
                                                     datetime.now() - start_time))
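get_data_files_from_directory is called here (and again in the training script further below) but is not defined in these examples; a minimal sketch, assuming the corpus is stored as gzipped JSON files under each folder, might be:

import os
from glob import iglob
from typing import List, Optional

def get_data_files_from_directory(
        data_dir: str, max_num_files: Optional[int] = None) -> List[str]:
    # Recursively collect data files; the "*.json.gz" pattern is only an
    # assumption about how the corpus is stored on disk.
    files = sorted(iglob(os.path.join(data_dir, "**", "*.json.gz"), recursive=True))
    if max_num_files is not None:
        files = files[:int(max_num_files)]
    return files

Note that the later training script passes a list of directories (and a file limit) to this helper, so the real implementation presumably accepts those as well; the sketch above only covers the single-directory call used here.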
Example #5
*_DATA_DIR are directories containing the files used as data.

Options:
    -h --help            show this message and exit.
    --max-num-files INT  maximum number of files to consider
    -v --verbose         show additional diagnostic information.
    -f --is_file         treat the given path as a single file rather than a folder
    --debug              debug mode [default: False]
"""
from docopt import docopt

import os, sys
# Add parent directory dynamically
sys.path.insert(1, os.path.join(sys.path[0], '..'))

from dataset import build_vocab_from_data_dir

if __name__ == '__main__':
    args = docopt(__doc__)

    vocab_nodes, vocab_actions = build_vocab_from_data_dir(
        args["CORPUS_DATA_DIR"], 500, args["--max-num-files"])

    os.makedirs('./test_outputs', exist_ok=True)  # ensure the output directory exists
    with open('./test_outputs/vocabulary.txt', 'w') as output:
        output.write("Nodes vocabulary has size %d\n" % len(vocab_nodes))
        output.write(" %s [...]\n\n" % (str(vocab_nodes)[:1000]))

        output.write("Actions vocabulary has size %d\n" % len(vocab_actions))
        output.write(" %s [...]\n\n" % (str(vocab_actions)[:1000]))
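For reference, docopt turns the (truncated) usage string above into a plain dictionary, so the exact keys below are an assumption; an invocation with a corpus directory and --max-num-files 100 would yield roughly:

# Illustrative docopt result only; real keys depend on the full usage string.
args = {
    "CORPUS_DATA_DIR": "./corpus",   # hypothetical positional argument
    "--max-num-files": "100",        # docopt keeps option values as strings
    "--help": False,
    "--verbose": False,
    "--is_file": False,
    "--debug": False,
}

Because option values arrive as strings, the training scripts wrap numeric options such as --max-num-epochs and --patience in int(...).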
Example #6
def run(args) -> None:
    hyperparameters = BaseModel.get_default_hyperparameters()
    if args['--model'] == 'v1':
        hyperparameters = SyntacticModelv1.get_default_hyperparameters()
    elif args['--model'] == 'v2':
        hyperparameters = SyntacticModelv2.get_default_hyperparameters()
    elif args['--model'] == 'v3':
        hyperparameters = SyntacticModelv3.get_default_hyperparameters()

    hyperparameters["run_id"] = make_run_id(args)
    max_epochs = int(args.get("--max-num-epochs"))
    patience = int(args.get("--patience"))
    max_num_files = args.get("--max-num-files")

    # override hyperparams if flag is passed
    hypers_override = args.get("--hypers-override")
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))

    if args['--compute-data']:
        # Create and save the training data.
        if not os.path.isdir(f'./data/{args["--max-num-files"]}'):
            os.mkdir(f'./data/{args["--max-num-files"]}')

        #### Make list of all training data directories
        data_dirs = [os.path.join(args["--train-data-dir"], data_dir) for data_dir in training_dirs]
        logging.info("Computing train/valid/test data ...")
        vocab_nodes, vocab_actions = build_vocab_from_data_dir(
            data_dirs=data_dirs,
            vocab_size=hyperparameters["max_vocab_size"],
            max_num_files=max_num_files,
        )
        logging.info(f"  Built vocabulary of {len(vocab_actions)} entries.")

        ### Build Train/Valid/SeenTest datasets
        all_nodes, all_actions, all_fathers = load_data_from_dir(
            vocab_nodes,
            vocab_actions,
            length=hyperparameters["max_seq_length"],
            data_dirs=data_dirs,
            max_num_files=max_num_files,
        )
        logging.info(f"  Built dataset of {all_nodes.shape[0]} training/valid/test examples.")
        # Save data
        with open(f'data/{args["--max-num-files"]}/vocab_nodes', 'wb') as output:
            pickle.dump(vocab_nodes, output)
        with open(f'data/{args["--max-num-files"]}/vocab_actions', 'wb') as output:
            pickle.dump(vocab_actions, output)

        # Shuffle and split data into train/valid/test
        indices = np.arange(all_nodes.shape[0])
        np.random.shuffle(indices)
        ind_len = indices.shape[0]

        train_indices = indices[:int(0.8 * ind_len)]
        valid_indices = indices[int(0.8 * ind_len):int(0.9 * ind_len)]
        seen_test_indices = indices[int(0.9 * ind_len):]

        # Make the datasets
        train_data = (all_nodes[train_indices],
                      all_actions[train_indices],
                      all_fathers[train_indices])
        valid_data = (all_nodes[valid_indices],
                      all_actions[valid_indices],
                      all_fathers[valid_indices])
        seen_test_data = (all_nodes[seen_test_indices],
                          all_actions[seen_test_indices],
                          all_fathers[seen_test_indices])

        with open(f'data/{args["--max-num-files"]}/train_data', 'wb') as output:
            pickle.dump(train_data, output)
        with open(f'data/{args["--max-num-files"]}/valid_data', 'wb') as output:
            pickle.dump(valid_data, output)
        with open(f'data/{args["--max-num-files"]}/seen_test_data', 'wb') as output:
            pickle.dump(seen_test_data, output)

        ### Build UnseenTest dataset
        unseen_test_data_dirs = [os.path.join(args["--train-data-dir"], data_dir) for data_dir in unseen_test_dirs]
        logging.info("Computing unseen test data ...")

        unseen_test_data = load_data_from_dir(
            vocab_nodes,
            vocab_actions,
            length=hyperparameters["max_seq_length"],
            data_dirs=unseen_test_data_dirs,
            max_num_files=max_num_files,
        )
        with open(f'data/{args["--max-num-files"]}/unseen_test_data', 'wb') as output:
            pickle.dump(unseen_test_data, output)

        logging.info("Finished computing data.")
        logging.info("Now exiting; rerun without --compute-data to load the saved data from disk.")

        with open(f'data/{args["--max-num-files"]}/log.txt', "w") as compute_data_log:
            compute_data_log.write("Train/Valid/Seen_test directories are:\n")
            for data_dir in training_dirs:
                compute_data_log.write(f"   {data_dir}\n")

            compute_data_log.write("Unseen test directories are:\n")
            for data_dir in unseen_test_dirs:
                compute_data_log.write(f"   {data_dir}\n")

            compute_data_log.write(
                f"This has {len(get_data_files_from_directory(data_dirs, max_num_files))} files, from which I "
                f"extracted samples (of length {hyperparameters['max_seq_length']}):\n "
                f"   {len(train_indices)} training samples\n"
                f"   {len(valid_indices)} validation samples\n"
                f"   {len(seen_test_indices)} seen test samples\n"
                f"   {len(unseen_test_data[0])} unseen test samples\n")

            compute_data_log.write(f"Nodes vocabulary has size {len(vocab_nodes)}\n")
            compute_data_log.write(f"Actions vocabulary has size {len(vocab_actions)}\n")

        exit(0)

    logging.info("Loading data into memory...")
    ### Load data
    with open(os.path.join(args['--saved-data-dir'], 'vocab_nodes'), 'rb') as data_file:
        vocab_nodes = pickle.load(data_file)
    with open(os.path.join(args['--saved-data-dir'], 'vocab_actions'), 'rb') as data_file:
        vocab_actions = pickle.load(data_file)
    with open(os.path.join(args['--saved-data-dir'], 'train_data'), 'rb') as data_file:
        train_data = pickle.load(data_file)
    with open(os.path.join(args['--saved-data-dir'], 'valid_data'), 'rb') as data_file:
        valid_data = pickle.load(data_file)

    logging.info(f"  Loaded {train_data[0].shape[0]} training samples.")
    logging.info(f"  Loaded {valid_data[0].shape[0]} validation samples.")

    # Construct model
    if args['--model'] == 'v1':
        model = SyntacticModelv1(hyperparameters, vocab_nodes, vocab_actions)
    elif args['--model'] == 'v2':
        model = SyntacticModelv2(hyperparameters, vocab_nodes, vocab_actions)
    elif args['--model'] == 'v3':
        model = SyntacticModelv3(hyperparameters, vocab_nodes, vocab_actions)
    else:
        raise ValueError(f"Unknown model variant: {args['--model']}")
    model.build([None, hyperparameters["max_seq_length"], 3])
    logging.info("Constructed model, using the following hyperparameters:")
    logging.info(json.dumps(hyperparameters))

    # Path for saving the model
    save_model_dir = args["--save-dir"]
    os.makedirs(save_model_dir, exist_ok=True)
    save_file = os.path.join(
        save_model_dir, f"{hyperparameters['run_id']}_best_model.bin"
    )

    train(
        model,
        train_data,
        valid_data,
        batch_size=hyperparameters["batch_size"],
        max_epochs=max_epochs,
        patience=patience,
        save_file=save_file,
        args=args
    )
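make_run_id is used by both training entry points but never shown in these examples; a plausible minimal sketch, assuming it simply combines the chosen model variant with a timestamp so that checkpoint names do not collide, would be:

import time

def make_run_id(args) -> str:
    # The exact format is an assumption; any unique, filesystem-safe string works.
    model_name = args.get("--model") or "LanguageModel"
    return f"{model_name}-{time.strftime('%Y-%m-%d-%H-%M-%S')}"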