Ejemplo n.º 1
0
def test(config):
    """Evaluate a restored model on the test split and optionally dump results.

    Requires ``config.config_path`` and ``config.restore_from`` to be set;
    raises ``AttributeError`` otherwise.  When ``config.result_name`` is set,
    metrics go to ``<result_dir>/<result_name>.json`` and the predicted
    labels (one per line) to ``<result_dir>/<result_name>.txt``.
    """
    if not config.config_path or not config.restore_from:
        raise AttributeError('You need to specify config_path and restore_from')
    else:
        config = load_config(config, config.config_path)

    set_logger(config)

    # Rebuild the vocabularies saved at training time.
    char_vocab = Vocab()
    char_vocab.load_from(os.path.join(config.vocab_dir, 'char_vocab.data'))
    label_vocab = Vocab(use_special_token=False)
    label_vocab.load_from(os.path.join(config.vocab_dir, 'label_vocab.data'))

    test_set = build_dataset(config, 'test', char_vocab, label_vocab)
    inputs = build_inputs(test_set.output_types, test_set.output_shapes)

    model = build_model(config, inputs)
    eval_metrics, results = model.evaluate(test_set)

    print('Eval metrics: {}'.format(eval_metrics))
    if config.result_name:
        # FIX: the original wrapped result_name in a second, single-argument
        # os.path.join (a no-op); build the result path prefix once instead.
        result_base = os.path.join(config.result_dir, config.result_name)
        with open(result_base + '.json', 'w') as f:
            json.dump(eval_metrics, f, indent=4)

        with open(result_base + '.txt', 'w') as f:
            for result in results:
                f.write(label_vocab.id2token[result] + '\n')
Ejemplo n.º 2
0
def main():
    """Build a MAML meta-learner for few-shot classification and either
    train it or evaluate a saved checkpoint, depending on ``args.train``.

    Relies on module-level ``args`` (parsed CLI options).
    """
    # Fix all RNG seeds for reproducible runs.
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    np.random.seed(1)
    print(args)

    # Layer-by-layer spec of the learner network consumed by Meta: four
    # (conv2d -> relu -> bn) stages with stride 2, then flatten and a linear
    # head.  The head has n_way + 2 outputs -- presumably the two extra
    # classes are "unknown" and "silence" (see the k_unk_*/k_silence_*
    # dataset options below) -- TODO confirm.
    config = [('conv2d', [args.num_filters, 1, 3, 3, 2, 1]), ('relu', [True]),
              ('bn', [args.num_filters]),
              ('conv2d', [args.num_filters, args.num_filters, 3, 3, 2, 1]),
              ('relu', [True]), ('bn', [args.num_filters]),
              ('conv2d', [args.num_filters, args.num_filters, 3, 3, 2, 1]),
              ('relu', [True]), ('bn', [args.num_filters]),
              ('conv2d', [args.num_filters, args.num_filters, 3, 3, 2, 1]),
              ('relu', [True]), ('bn', [args.num_filters]), ('flatten', []),
              ('linear', [args.n_way + 2, args.num_filters * 9])]

    device = torch.device('cuda')
    maml = Meta(args, config).to(device)

    # Count trainable parameters for the log output below.
    tmp = filter(lambda x: x.requires_grad, maml.parameters())
    num = sum(map(lambda x: np.prod(x.shape), tmp))
    print(maml)
    print('Total trainable tensors:', num)

    # batchsz here means total sampled meta-task number
    if args.train == 'True':
        mini_train = LingualData('./data',
                                 mode='train',
                                 task_type=args.task_type,
                                 n_way=args.n_way,
                                 k_shot=args.k_spt_train,
                                 k_query=args.k_qry_train,
                                 k_unk_shot=args.k_spt_unk_train,
                                 k_unk_query=args.k_qry_unk_train,
                                 k_silence_shot=args.k_spt_silence_train,
                                 k_silence_query=args.k_qry_silence_train,
                                 batchsz=16000,
                                 resize=args.imgsz,
                                 unk_sil_spt=args.unk_sil_spt)

    # Experiment name encodes the main hyper-parameters; used as the
    # log/checkpoint directory name under args.logdir.
    exp_string = 'cls_' + str(args.n_way) + '.tskn_' + str(
        args.task_num) + '.spttrain_' + str(
            args.k_spt_train) + '.qrytrain_' + str(
                args.k_qry_train) + '.numstep' + str(
                    args.update_step) + '.updatelr' + str(args.update_lr)
    model_path = args.logdir + '/' + exp_string
    model_file = None

    if args.train == 'True':
        if not os.path.exists(model_path):
            os.makedirs(model_path)
            print("logs directory ", args.logdir, " created!")
        writer = SummaryWriter(model_path)
        set_logger(os.path.join(args.logdir, 'train.log'))
        train(maml, mini_train, model_path, args.resume_itr, device, writer)
    else:
        # Evaluation mode: restore the checkpoint of iteration `test_iter`.
        if args.test_iter >= 0:
            model_file = model_path + '/' + 'model-' + str(
                args.test_iter) + '.pth'
            test(maml, model_file, device)
Ejemplo n.º 3
0
def get_dataloaders(args):
    """Set up logging/output directories, vocabularies and tokenizers, then
    build the train/val/test dataloaders.

    Returns
    -------
    tuple
        (train_loader, val_loader, test_loader, tok_src, tok_tgt,
         src_vocab_size, tgt_vocab_size)
    """
    model_prefix = '{}_{}'.format(args.model_type, args.train_id)

    # Per-experiment output directories, keyed by the model prefix.
    log_path = args.LOG_DIR + model_prefix + '/'
    checkpoint_path = args.CHK_DIR + model_prefix + '/'
    result_path = args.RESULT_DIR + model_prefix + '/'
    cp_file = checkpoint_path + "best_model.pth.tar"
    init_epoch = 0

    if not os.path.exists(log_path):
        os.makedirs(log_path)
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)

    ## set up the logger
    set_logger(os.path.join(log_path, 'train.log'))

    ## save argparse parameters, one "key: value" per line
    with open(log_path + 'args.yaml', 'w') as f:
        for k, v in args.__dict__.items():
            f.write('{}: {}\n'.format(k, v))

    logging.info('Training model: {}'.format(model_prefix))

    ## set up vocab txt
    # create txt here
    print('running setup')
    setup(args, clear=True)
    print(args.__dict__)

    # indicate src and tgt language; any source other than 'en' is treated
    # as zh -> en
    if args.source_language == 'en':
        src, tgt = 'en', 'zh'
    else:
        src, tgt = 'zh', 'en'

    # One vocabulary file and tokenizer per language side.
    maps = {'en': args.TRAIN_VOCAB_EN, 'zh': args.TRAIN_VOCAB_ZH}
    vocab_src = read_vocab(maps[src])
    tok_src = Tokenizer(language=src,
                        vocab=vocab_src,
                        encoding_length=args.MAX_INPUT_LENGTH)
    vocab_tgt = read_vocab(maps[tgt])
    tok_tgt = Tokenizer(language=tgt,
                        vocab=vocab_tgt,
                        encoding_length=args.MAX_INPUT_LENGTH)
    logging.info('Vocab size src/tgt:{}/{}'.format(len(vocab_src),
                                                   len(vocab_tgt)))

    ## Setup the training, validation, and testing dataloaders
    train_loader, val_loader, test_loader = create_split_loaders(
        args.DATA_DIR, (tok_src, tok_tgt),
        args.batch_size,
        args.MAX_VID_LENGTH, (src, tgt),
        num_workers=4,
        pin_memory=True)
    logging.info('train/val/test size: {}/{}/{}'.format(
        len(train_loader), len(val_loader), len(test_loader)))

    return train_loader, val_loader, test_loader, tok_src, tok_tgt, len(
        vocab_src), len(vocab_tgt)
Ejemplo n.º 4
0
    def export(directory: str,
               recursive=True,
               pattern='*.imdb',
               output='./kodi.csv',
               interactive=False,
               debug=False):
        """
        Export the IDs found in ID/.nfo files to a CSV file.

        Parameters
        ----------
        directory : str
            directory that is scanned for ID/.nfo files
        recursive : bool, optional
            locate files in sub-directories as well, by default True
        pattern : str, optional
            glob pattern matching the ID files, by default "*.imdb"
        output : str, optional
            path of the CSV file to write, by default "./kodi.csv"
        interactive : bool, optional
            run in interactive mode, by default False
        debug : bool, optional
            enable debug logging, by default False
        """
        # Configure logging before doing any work.
        utils.set_logger(debug=debug)

        export_ids(dir_=directory, idtype='imdb', recursive=recursive,
                   pattern=pattern, output=output, interactive=interactive)
Ejemplo n.º 5
0
def main(data_dir, model_dir, restore_file):
    """Evaluate the model on the test set.

    Parameters
    ----------
    data_dir : str
        root directory of the dataset
    model_dir : str
        directory containing hyper_params.json and the checkpoint
    restore_file : str
        checkpoint name (without the .pth.tar suffix) to restore
    """
    # Load the parameters
    json_path = Path(model_dir) / 'hyper_params.json'
    assert json_path.is_file(
    ), "No json configuration file found at {}".format(json_path)
    hyper_params = utils.HyperParams(json_path)

    # use GPU if available; -1 is the "CPU only" sentinel
    hyper_params.cuda = torch.device(
        'cuda:0') if torch.cuda.is_available() else -1

    # Set the random seed for reproducible experiments
    torch.manual_seed(230)
    # BUG FIX: the original used `is not -1`, which relies on small-int
    # identity (a CPython accident, SyntaxWarning on 3.8+); use equality.
    if hyper_params.cuda != -1:
        with torch.cuda.device(str(hyper_params.cuda)[-1]):
            torch.cuda.manual_seed(230)

    # Get the logger
    utils.set_logger(Path(model_dir) / 'evaluate.log')

    # Create the input data pipeline
    logging.info("Creating the dataset...")

    # fetch dataloaders
    dataloaders = data_loader.fetch_dataloader(
        ['test'], data_dir + hyper_params.augmentation, hyper_params)
    test_dl = dataloaders['test']

    logging.info("- done.")

    # Define the model (class is looked up by name on the `net` module)
    model = getattr(net, hyper_params.model, None)
    assert model is not None, "Model {} couldn't be found!".format(
        hyper_params.model)
    model = model(hyper_params).to(
        device=hyper_params.cuda) if hyper_params.cuda != -1 else model(
            hyper_params)

    loss_fn = getattr(loss, hyper_params.loss, None)
    assert loss_fn is not None, "Loss Fn {} couldn't be found!".format(
        hyper_params.loss)

    metrics_dict = metric.metrics_dict

    logging.info("Starting evaluation")

    # Reload weights from the saved file
    utils.load_checkpoint(str(Path(model_dir) / (restore_file + '.pth.tar')),
                          model)

    # Evaluate and persist the metrics next to the checkpoint
    test_metrics = evaluate(model, loss_fn, test_dl, metrics_dict, model_dir,
                            hyper_params)
    save_path = str(
        Path(model_dir) / "metrics_test_{}.json".format(restore_file))
    utils.save_dict_to_json(test_metrics, save_path)
Ejemplo n.º 6
0
def settings(args):
    """Prepare output folders, logging, CUDA flags and dataset paths, then
    run the optional preprocessing steps."""
    if args.save_folder and not os.path.isdir(args.save_folder):
        os.makedirs(args.save_folder)

    if args.log_path:
        set_logger(args.log_path)

    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        args.cuda = False

    cudnn.benchmark = True

    # Set default train and test path if not provided as input.
    utils.set_dataset_paths(args)

    # Unfreeze (and share) all 12 encoder layers plus the pooler.
    layers = ['layer.{}.'.format(i) for i in range(12)]
    layers.append('pooler')
    args.unfreeze_layers = layers
    args.shared_layers = args.unfreeze_layers

    # Optional preprocessing steps.
    if args.build_data_seperate:
        build_data_seperate()

    if args.mode == 'finetune' and args.build_data_file:
        build_data_file(args)
Ejemplo n.º 7
0
def main():
    """Train a seq2class model on the training split, then report test
    loss and accuracy."""
    # Set the log file for debugging use
    utils.set_logger(os.path.join(os.getcwd(), 'train.log'))

    logging.info('Loading datasets...')

    data_loader = DataLoader(DATA_PATH)

    X_train, Y_train, X_val, Y_val = data_loader.get_train_data()
    X_test, Y_test = data_loader.get_test_data()

    logging.info('Building the model...')
    my_model = seq2class()  # TODO: pass model hyper-parameters explicitly

    print("Here is our model: ")
    print(my_model.model.summary())

    logging.info('Training....')
    history = my_model.model.fit(X_train, Y_train, epochs=EPOCHS, verbose=1,
                                 batch_size=BATCH_SIZE,
                                 validation_data=(X_val, Y_val))

    logging.info(f"train loss: {history.history['loss']}")
    logging.info(f"val loss: {history.history['val_loss']}")

    logging.info(f"train accuracy: {history.history['acc']}")
    logging.info(f"val accuracy: {history.history['val_acc']}")
    # Plotting the loss history #
    plot = utils.Plotting(history)
    # plot.plot_loss()
    # plot.plot_accuracy()

    print('Testing...')
    loss, accuracy = my_model.model.evaluate(X_test, Y_test)
    # BUG FIX: logging.info was called with a bare extra positional argument
    # but no %-placeholder in the message, which raises a formatting error
    # when the record is emitted.  Use lazy %-style formatting.
    logging.info('Testing loss: %s', loss)
    logging.info("Test accuracy: %s", accuracy)
Ejemplo n.º 8
0
def evaluate_from_workspace(workspace_dir):
    """Evaluate the model on the test set found under `workspace_dir`.

    The workspace must contain the data plus a ``model/`` sub-directory with
    params.json and the checkpoint named by ``--restore_file``.  Metrics are
    written to ``model/metrics_test_<restore_file>.json``.
    """
    # BUG FIX: the docstring originally appeared *after* the `global`
    # statement, which made it an inert string literal rather than the
    # function's docstring; it now comes first.
    global args, data_loader

    data_dir = workspace_dir
    model_dir = os.path.join(data_dir, "model")

    # Load the parameters
    args = parser.parse_args()
    json_path = os.path.join(model_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)
    params.data_dir = data_dir if data_dir else args.data_dir
    params.model_dir = model_dir if model_dir else args.model_dir

    # use GPU if available
    params.cuda = torch.cuda.is_available()

    # Set the random seed for reproducible experiments
    torch.manual_seed(230)
    if params.cuda: torch.cuda.manual_seed(230)

    # Get the logger
    utils.set_logger(os.path.join(params.model_dir, 'evaluate.log'))

    # Create the input data pipeline
    logging.info("Creating the dataset...")

    # load data
    data_loader = DataLoader(params.data_dir, params)
    data = data_loader.load_data_from_dir(['test'], params.data_dir)
    test_data = data['test']

    # specify the test set size
    params.test_size = test_data['size']
    test_data_iterator = data_loader.data_iterator(test_data, params)

    logging.info("- done.")

    # Define the model
    model = net.Net(params).cuda() if params.cuda else net.Net(params)

    loss_fn = net.loss_fn
    metrics = net.metrics

    logging.info("Starting evaluation")

    # Reload weights from the saved file
    utils.load_checkpoint(
        os.path.join(params.model_dir, args.restore_file + '.pth.tar'), model)

    # Evaluate
    num_steps = (params.test_size + 1) // params.batch_size
    test_metrics = evaluate(model, loss_fn, test_data_iterator, metrics,
                            params, num_steps)
    save_path = os.path.join(params.model_dir,
                             "metrics_test_{}.json".format(args.restore_file))
    utils.save_dict_to_json(test_metrics, save_path)
Ejemplo n.º 9
0
def main(config):
    """Load the configured dataset, build the matching trainer and run it in
    train or test mode."""
    set_logger()
    prepare_dirs(config)

    # NOTE: there are known problems when validation mode is on.
    # Dataset selection; `valid` doubles as `train` for the nugu sample set.
    if config.dataset == 'nugu':
        train, ans2idx, W_e_init, word2idx = \
            load_skt_nugu_sample_dataset(config)
        valid = train
    elif config.dataset == 'simque':
        train, valid, W_e_init, word2idx = \
            load_simple_questions_dataset(config)
        ans2idx = None
    else:
        raise Exception('Unsupported dataset:', config.dataset)

    # Trainer selection; anything other than "G"/"D" is treated as "GAN".
    mode = config.trainer_mode
    if mode == "G":
        trainer = GTrainer(config, train, valid, W_e_init, word2idx, ans2idx)
    elif mode == "D":
        trainer = DTrainer(config, train, valid, W_e_init, word2idx)
    else:
        trainer = GANTrainer(config, train, valid, W_e_init, word2idx, ans2idx)

    if not config.is_train:
        # Testing requires a checkpoint to restore from.
        if not config.load_path:
            raise Exception("[!] You should specify `load_path` to load a " +
                            "pretrained model")
        if config.interactive:
            trainer.test_interactive()
        else:
            trainer.test()
    else:
        save_config(config)  # save config file(params.json)
        trainer.train()
Ejemplo n.º 10
0
def runTraining(model_dir, data_dir, restore_file):
    """Train the SUNet model, evaluating after each epoch.

    Parameters
    ----------
    model_dir : str
        directory containing params.json; also used for logs and checkpoints
    data_dir : str
        root directory of the train/val data
    restore_file : str or None
        checkpoint to resume from, or None for a fresh run
    """
    json_path = os.path.join(model_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    # use GPU if available
    params.cuda = torch.cuda.is_available()
    print(params.cuda)

    # Set the random seed for reproducible experiments
    torch.manual_seed(231)
    if params.cuda: torch.cuda.manual_seed(231)

    # TensorBoard writers: fresh runs log under model_dir, resumed runs under
    # restore_file.  BUG FIX: `restore_file == None` -> `restore_file is None`
    # (PEP 8: comparisons to None use identity).  The two near-identical
    # construction branches are also collapsed into one.
    log_root = model_dir if restore_file is None else restore_file
    writer = {
        "train": SummaryWriter("Tensorboard/" +
                               os.path.join(log_root, "train") + ".SUNet"),
        "eval": SummaryWriter("Tensorboard/" +
                              os.path.join(log_root, "eval") + ".SUNet"),
    }

    # Set the logger
    utils.set_logger(os.path.join(model_dir, 'train.log'))

    # Create the input data pipeline
    logging.info("Loading the datasets...")

    # fetch dataloaders
    dataloaders = data_loader.fetch_dataloader(['train', 'val'], data_dir,
                                               params)
    train_dl = dataloaders['train']
    val_dl = dataloaders['val']

    logging.info("- done.")

    # Define the model and optimizer
    model = net.Net(params).cuda() if params.cuda else net.Net(params)
    optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)

    # fetch loss function and metrics
    loss_fn = net.loss_fn
    metrics = net.metrics

    # Train the model
    logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
    train_and_evaluate(model, train_dl, val_dl, optimizer, loss_fn, metrics,
                       params, model_dir, restore_file, writer)
Ejemplo n.º 11
0
def main():
    """Parse CLI options into `config`, set up logging, optionally daemonize,
    write the PID file and start the server."""
    parse_args(config)
    set_logger(config, logger)

    # Detach from the terminal when requested.
    if config['daemon']:
        daemon()

    mk_pid_file(config['pidfile'])
    run_server(config)
Ejemplo n.º 12
0
def main():
    """Evaluate a trained Net checkpoint on the test split, log to
    TensorBoard and write the metrics to a JSON file next to the model.
    """
    # Load the parameters
    args = args_parser()
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    # Create summary writer for use with tensorboard
    writer = SummaryWriter(os.path.join(args.model_dir, 'runs', 'eval'))

    # use GPU if available
    params.cuda = torch.cuda.is_available()  # use GPU is available

    # Set the random seed for reproducible experiments
    torch.manual_seed(230)
    if params.cuda:
        torch.cuda.manual_seed(230)
        params.device = "cuda:0"
    else:
        params.device = "cpu"

    # Set the logger
    utils.set_logger(os.path.join(args.model_dir, 'evaluate.log'))

    logging.info("Loading the dataset...")

    # fetch dataloaders
    dataloaders = d_l.get_dataloader(['test'], args.data_dir, params)
    test_dl = dataloaders['test']

    logging.info("- done.")

    # Define the model
    model = Net(params)
    if params.cuda:
        model = model.to(params.device)
    # Record the model graph using one test batch as the example input.
    writer.add_graph(model, next(iter(test_dl))[0])

    criterion = loss_fn
    metrics = get_metrics()

    logging.info("Starting evaluation")

    # Reload weights from the saved file
    utils.load_checkpoint(
        os.path.join(args.model_dir, args.restore_file + '.pth.tar'), model)

    # Evaluate; the trailing 0 is presumably the epoch/global-step passed to
    # the writer -- TODO confirm against evaluate()'s signature.
    test_metrics = evaluate(model, criterion, test_dl, metrics, params, writer,
                            0)
    save_path = os.path.join(args.model_dir,
                             "metrics_test_{}.json".format(args.restore_file))
    utils.save_dict_to_json(test_metrics, save_path)

    writer.close()
Ejemplo n.º 13
0
def run():
    """Train the BERT-based NER model.

    Pipeline: preprocess the raw data, split off a dev set, build datasets
    and dataloaders, configure a grouped-parameter AdamW optimizer with a
    cosine warmup schedule, then run the training loop.
    """
    # set the logger
    utils.set_logger(config.log_dir)
    logging.info("device: {}".format(config.device))
    # Preprocess the data: separate text and labels.
    processor = Processor(config)
    processor.process()
    logging.info("--------Process Done!--------")
    # Split off the dev (validation) set.
    word_train, word_dev, label_train, label_dev = load_dev('train')
    # build dataset
    train_dataset = NERDataset(word_train, label_train, config)
    dev_dataset = NERDataset(word_dev, label_dev, config)
    logging.info("--------Dataset Build!--------")
    # get dataset size
    train_size = len(train_dataset)
    # build data_loader
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size,
                              shuffle=True, collate_fn=train_dataset.collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size,
                            shuffle=True, collate_fn=dev_dataset.collate_fn)
    logging.info("--------Get Dataloader!--------")
    # Prepare model
    device = config.device
    model = BertNER.from_pretrained(config.roberta_model, num_labels=len(config.label2id))
    model.to(device)
    # Prepare optimizer
    if config.full_fine_tuning:
        # model.named_parameters(): [bert, classifier, crf]
        # Weight decay is disabled for biases and LayerNorm parameters; the
        # classifier and CRF heads use a 5x learning rate.
        bert_optimizer = list(model.bert.named_parameters())
        classifier_optimizer = list(model.classifier.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in bert_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': config.weight_decay},
            {'params': [p for n, p in bert_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
            {'params': [p for n, p in classifier_optimizer if not any(nd in n for nd in no_decay)],
             'lr': config.learning_rate * 5, 'weight_decay': config.weight_decay},
            {'params': [p for n, p in classifier_optimizer if any(nd in n for nd in no_decay)],
             'lr': config.learning_rate * 5, 'weight_decay': 0.0},
            {'params': model.crf.parameters(), 'lr': config.learning_rate * 5}
        ]
    # only fine-tune the head classifier
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, correct_bias=False)
    train_steps_per_epoch = train_size // config.batch_size
    # Warm up for roughly 10% of the total epochs, then decay.
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=(config.epoch_num // 10) * train_steps_per_epoch,
                                                num_training_steps=config.epoch_num * train_steps_per_epoch)

    # Train the model
    logging.info("--------Start Training!--------")
    train(train_loader, dev_loader, model, optimizer, scheduler, config.model_dir)
Ejemplo n.º 14
0
    def train(self):
        """Run the full training loop over ``self.dl``, logging to
        TensorBoard and checkpointing whenever the epoch-average loss
        improves on the best seen so far.
        """
        set_logger(os.path.join(self.log_dir, 'train.log'), terminal=False)

        epochs = self.hps.num_epochs
        print_every = self.hps.print_every
        log_every = self.hps.log_summary_every
        lr = self.hps.learning_rate

        loss_avg = RunningAverage()
        summary_writer = SummaryWriter(log_dir=self.summ_dir)
        current_best_loss = 1e3

        encoder_optimizer = optim.Adam(self.encoder.parameters(), lr=lr)
        decoder_optimizer = optim.Adam(self.decoder.parameters(), lr=lr)

        training_pairs = self.dl

        # BUG FIX: `reduce=False` is deprecated (and removed in newer
        # PyTorch versions); `reduction='none'` is the documented
        # equivalent, returning the per-element loss.
        criterion = nn.NLLLoss(reduction='none')

        if self.hps.resume:
            log('- load ckpts...')
            self.load_state_dict()

        for epoch in trange(epochs, desc='epochs'):
            loss_avg.reset()
            with tqdm(total=len(training_pairs)) as progress_bar:
                for language_pair, mask_pair in training_pairs:
                    language_pair, mask_pair = language_pair.to(
                        self.device), mask_pair.to(self.device)
                    loss = self.train_single(language_pair, mask_pair,
                                             encoder_optimizer,
                                             decoder_optimizer, criterion)
                    loss_avg.update(loss.item())
                    self.global_step += 1
                    if self.global_step % log_every == 0:
                        summary_writer.add_scalar('loss_value',
                                                  loss,
                                                  global_step=self.global_step)
                    if self.global_step % print_every == 0:
                        log('global step: {}, loss average: {:.3f}'.format(
                            self.global_step, loss_avg()))

                    progress_bar.set_postfix(loss_avg=loss_avg())
                    progress_bar.update()
            # Checkpoint encoder/decoder when the running average improves.
            if loss_avg() < current_best_loss:
                log('new best loss average found, saving modules...')
                current_best_loss = loss_avg()
                state = {
                    'encoder': self.encoder.state_dict(),
                    'decoder': self.decoder.state_dict(),
                    'global_step': self.global_step,
                    'epoch': epoch,
                    'loss_avg': loss_avg()
                }
                torch.save(state, os.path.join(self.ckpt_dir, 'best.pth.tar'))
Ejemplo n.º 15
0
def main():
    """Train the Net model, evaluating on the validation split after each
    epoch and logging to TensorBoard.
    """
    # Load the parameters from json file
    args = args_parser()
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    # Create summary writer for use with tensorboard
    writer = SummaryWriter(os.path.join(args.model_dir, 'runs', 'train'))

    # use GPU if available
    params.cuda = torch.cuda.is_available()

    # Set the random seed for reproducible experiments
    torch.manual_seed(230)
    if params.cuda:
        torch.cuda.manual_seed(230)
        params.device = "cuda:0"
    else:
        params.device = "cpu"

    # Set the logger
    utils.set_logger(os.path.join(args.model_dir, 'train.log'))

    # Create the input data pipeline
    logging.info("Loading the datasets...")

    # fetch dataloaders
    dataloaders = d_l.get_dataloader(['train', 'val'], args.data_dir, params)
    train_dl = dataloaders['train']
    val_dl = dataloaders['val']

    logging.info("- done.")

    # Define the model and optimizer
    model = Net(params)
    if params.cuda:
        model = model.to(params.device)
    # Record the model graph using one training batch as the example input.
    writer.add_graph(model, next(iter(train_dl))[0])

    optimizer = torch.optim.Adam(model.parameters(), lr=params.learning_rate)

    # fetch loss function and metrics
    criterion = loss_fn
    metrics = get_metrics()

    # Train the model
    logging.info("Starting training for %d epoch(s)", params.num_epochs)
    train_and_evaluate(model, train_dl, val_dl, optimizer, criterion, metrics,
                       params, args.model_dir, writer, args.restore_file)
    writer.close()
Ejemplo n.º 16
0
def train(data_dir, model_dir, params):
    """Split the CSV data 80/20 into train/eval sets and launch the
    training pipeline."""
    set_logger(os.path.join(model_dir, 'train.log'))

    # Locate the CSV file(s) and their labels.
    filenames, labels = loadCSV(data_dir)

    # Hold out 20% of the samples for evaluation.
    split = train_test_split(filenames, labels, test_size=0.2)
    train_filenames, eval_filenames, train_labels, eval_labels = split

    training_pipeline(train_filenames, train_labels, eval_filenames,
                      eval_labels, model_dir, params)
Ejemplo n.º 17
0
def run_sdce(args):
    """Set up the SDCE workspace/logging and run the SDCE runner.

    ``args.time`` names the run-specific sub-directory under the log prefix.
    """
    # FIX: the config path is constant, so the f-string prefix was
    # extraneous (flake8 F541).
    config = get_config('./configs/y2h_config_cesd.yml')
    config.log_prefix = f'workspace/ResnetY2HEstimator/mode_{config.mode}_Pn_{config.Pn}/SDCE'
    config.log_dir = os.path.join(config.log_prefix, args.time)
    config.ckpt_dir = os.path.join(config.log_dir, 'checkpoints')
    # exist_ok avoids the race between an isdir() check and makedirs().
    os.makedirs(config.ckpt_dir, exist_ok=True)
    set_logger(config)
    logging.info(config)
    runner = SDCERunner(config)
    runner.run()
Ejemplo n.º 18
0
def run_ema(args):
    """Set up the EMA workspace/logging and run the EMA Y2H runner.

    ``args.time`` names the run-specific sub-directory under the log prefix.
    """
    # FIX: the config path is constant, so the f-string prefix was
    # extraneous (flake8 F541).
    config = get_config('./configs/y2h_config_ema.yml')
    assert config.model == 'ema'
    config.log_prefix = f'workspace/ResnetY2HEstimator/mode_{config.mode}_Pn_{config.Pn}/EMA'
    config.log_dir = os.path.join(config.log_prefix, args.time)
    config.ckpt_dir = os.path.join(config.log_dir, 'checkpoints')
    # exist_ok avoids the race between an isdir() check and makedirs().
    os.makedirs(config.ckpt_dir, exist_ok=True)
    set_logger(config)
    logging.info(config)
    runner = EMAY2HRunner(config)
    runner.run()
Ejemplo n.º 19
0
    def generate(directory: str,
                 recursive=True,
                 pattern='*.imdb',
                 delay=1,
                 dry_run=False,
                 overwrite=False,
                 language="en",
                 fanart=None,
                 fanart_file="folder.jpg",
                 interactive=False,
                 debug=False):
        """
        Traverses the directory Generates the .nfo files.

        Parameters
        ----------
        directory : str
            the directory to traverse
        recursive : bool, optional
            whether to search recursively, by default True
        pattern : str, optional
            the file pattern (glob) to use for identifying the files with the IDs, by default '*.imdb'
        delay : int, optional
            the delay in seconds between web queries, by default 1
        dry_run : bool, optional
            whether to perform a 'dry-run', ie generating .nfo content
            but not saving them (only outputting them to stdout), by default False
        overwrite : bool, optional
            whether to overwrite existing .nfo files (ie recreating them), by default False
        language : str, optional
            the preferred language for the titles, by default "en"
        fanart : str, optional
            how to deal with fanart, by default None
        fanart_file : str, optional
            the fanart filename to use (when downloading or re-using existing), by default "folder.jpg"
        interactive : bool, optional
            whether to use interactive mode, by default False
        debug : bool, optional
            debug message, by default False
        """
        # Configure logging before doing any work.
        utils.set_logger(debug=debug)

        # NOTE(review): the bare name `generate` below resolves to a
        # module-level function, not this method (recursing into the method
        # would require `self.`/class access) -- confirm such a function is
        # imported at module scope.
        generate(dir_=directory,
                 idtype='imdb',
                 recursive=recursive,
                 pattern=pattern,
                 delay=delay,
                 dry_run=dry_run,
                 overwrite=overwrite,
                 language=language,
                 fanart=fanart,
                 fanart_file=fanart_file,
                 interactive=interactive)
Ejemplo n.º 20
0
def make_vecs():
    """Convert every training/validation/test data set into vector files."""
    utils.set_logger('log')
    log(utils.separator())

    # (root directory, listing file) pairs to vectorize, in order.
    jobs = [
        ('./training_data/expressive_all_tr', 'tr_data.txt'),
        ('./training_data/normal_all_tr', 'tr_data.txt'),
        ('./faceScrub_big_train', 'tr_data.txt'),
        ('./validation_data/expressive_all_val', 'val_data.txt'),
        ('./validation_data/normal_all_val', 'val_data.txt'),
        ('./test_data/expressive_all_test', 'test_data.txt'),
        ('./test_data/normal_all_test', 'test_data.txt'),
    ]
    for root, text_file in jobs:
        data_loader.convert_data_to_vectors(root_path_str=root,
                                            text_file_str=text_file)

    log(utils.separator())
Ejemplo n.º 21
0
def run():
    """Load a knowledge-graph dataset, build the KGE model selected by
    ``config.model`` and train it.
    """
    set_logger()

    # load data
    ent_path = os.path.join(config.data_path, "entities.dict")
    rel_path = os.path.join(config.data_path, "relations.dict")
    ent2id = read_elements(ent_path)
    rel2id = read_elements(rel_path)
    ent_num = len(ent2id)
    rel_num = len(rel2id)
    train_triples = read_triples(os.path.join(config.data_path, "train.txt"),
                                 ent2id, rel2id)
    valid_triples = read_triples(os.path.join(config.data_path, "valid.txt"),
                                 ent2id, rel2id)
    test_triples = read_triples(os.path.join(config.data_path, "test.txt"),
                                ent2id, rel2id)
    logging.info("#ent_num: %d" % ent_num)
    logging.info("#rel_num: %d" % rel_num)
    logging.info("#train triple num: %d" % len(train_triples))
    logging.info("#valid triple num: %d" % len(valid_triples))
    logging.info("#test triple num: %d" % len(test_triples))
    logging.info("#Model: %s" % config.model)

    # Create the model.  FIX: the original unconditionally instantiated
    # TransE and then overwrote it for every other model name; a dispatch
    # table builds the selected model exactly once.  Unknown names still
    # fall back to TransE, as before.
    model_classes = {
        "TransE": TransE,
        "TransH": TransH,
        "TransR": TransR,
        "TransD": TransD,
        "STransE": STransE,
        "LineaRE": LineaRE,
        "DistMult": DistMult,
        "ComplEx": ComplEx,
        "RotatE": RotatE,
    }
    kge_model = model_classes.get(config.model, TransE)(ent_num, rel_num)

    if config.cuda:
        kge_model = kge_model.cuda()
    logging.info("Model Parameter Configuration:")
    for name, param in kge_model.named_parameters():
        logging.info("Parameter %s: %s, require_grad = %s" %
                     (name, str(param.size()), str(param.requires_grad)))

    # Train on the full (train, valid, test) triple split.
    train(model=kge_model,
          triples=(train_triples, valid_triples, test_triples),
          ent_num=ent_num)
Ejemplo n.º 22
0
def run(params, dirs, seed=None, restore_file=None):
    """Train a DeepAR model end-to-end, then score the best checkpoint on the test set.

    Args:
        params: hyper-parameter container (batch_size, lr, num_epochs, ...).
        dirs: directory/config container; its ``device`` attribute is set here.
        seed: optional int; when given, seeds the RNGs for reproducible runs.
        restore_file: optional checkpoint name to resume training from.
    """
    # Seed every RNG before any model or data code runs.
    if seed is not None:
        utils.seed(seed)
    utils.set_logger(os.path.join(dirs.model_dir, 'train.log'))
    train_log = logging.getLogger('DeepAR.Train')

    # Pick the device and build the network on it.
    if torch.cuda.is_available():
        dirs.device = torch.device('cuda:0')
        train_log.info('Using Cuda...')
        model = net.Net(params, dirs.device).cuda(dirs.device)
    else:
        dirs.device = torch.device('cpu')
        train_log.info('Not using cuda...')
        model = net.Net(params, dirs.device)

    data_log = logging.getLogger('DeepAR.Data')
    data_log.info('Loading the datasets...')
    train_data = TrainDataset(dirs.data_dir, dirs.dataset)
    vali_data = ValiDataset(dirs.data_dir, dirs.dataset)
    test_data = TestDataset(dirs.data_dir, dirs.dataset)
    train_loader = DataLoader(train_data, batch_size=params.batch_size,
                              pin_memory=False, num_workers=4)
    vali_loader = DataLoader(vali_data, batch_size=params.batch_size,
                             pin_memory=False, sampler=RandomSampler(vali_data),
                             num_workers=4)
    test_loader = DataLoader(test_data, batch_size=params.batch_size,
                             pin_memory=False, sampler=RandomSampler(test_data),
                             num_workers=4)
    data_log.info('Data loading complete.')
    data_log.info('###############################################\n')

    train_log.info(f'Model: \n{str(model)}')
    train_log.info('###############################################\n')

    optimizer = optim.Adam(model.parameters(), lr=params.lr)
    loss_fn = net.loss_fn

    train_log.info('Starting training for {} epoch(s)'.format(params.num_epochs))
    # NOTE(review): `scheduler` is not defined anywhere in this function —
    # presumably a module-level global; confirm before relying on it.
    train_and_evaluate(model, train_loader, vali_loader, optimizer, loss_fn,
                       scheduler, params, dirs, restore_file)
    train_log.handlers.clear()
    logging.shutdown()

    # Score the best checkpoint on the test split, if one was saved.
    best_ckpt = os.path.join(dirs.model_save_dir, 'best.pth.tar')
    if not os.path.exists(best_ckpt):
        return
    utils.load_checkpoint(best_ckpt, model)
    test_metrics = evaluate(model, loss_fn, test_loader, params, dirs, istest=True)
    utils.save_dict_to_json(test_metrics,
                            os.path.join(dirs.model_dir, 'test_results.json'))
Ejemplo n.º 23
0
 def apply_file(self, args):
     """Load the test configuration from ``args.file`` and dispatch the matching test suite."""
     utils._init()
     runtime_log = log.Log()
     utils.set_logger(runtime_log)
     conf = utils.ConfFile(args.file)
     mode = conf.get_test_mode()
     # Dispatch on the configured test mode.
     if mode == 'quorum':
         quorum_runner = control.QuorumAutoTest(conf)
         quorum_runner.test_drbd_quorum()
     if mode == 'drbd_in_used':
         iscsi_runner = control.IscsiTest(conf)
         iscsi_runner.test_drbd_in_used()
Ejemplo n.º 24
0
def run_full(args):
    """Configure and launch a FullRunner from the full-pipeline YAML config.

    Args:
        args: namespace carrying ``runner``, ``Pn`` and ``time`` (run tag).
    """
    config = get_config('./configs/full_config.yml')
    config.run_mode = args.runner
    config.Pn = args.Pn
    run_tag = f'mode_{config.mode}_Pn_{config.Pn}'
    config.log_dir = os.path.join('workspace', config.run_mode, run_tag, args.time)
    # Ensure the checkpoint directory exists before anything writes to it.
    # NOTE(review): `config.ckpt_dir` is assumed to come from the YAML file — confirm.
    if not os.path.isdir(config.ckpt_dir):
        os.makedirs(config.ckpt_dir)
    set_logger(config)
    logging.info(config)
    FullRunner(config).run()
Ejemplo n.º 25
0
Archivo: main.py Proyecto: baofff/BiSM
def init():
    """Parse CLI arguments, then set up global config, summary writer, RNG seeds and logging.

    Returns:
        The parsed argparse namespace.
    """
    arg_parser = argparse.ArgumentParser(description=globals()['__doc__'])
    arg_parser.add_argument('--config', type=str, required=True,
                            help='Path to the config file')
    arg_parser.add_argument('--workspace', type=str, required=True,
                            help='Path to the workspace')
    arg_parser.add_argument('--mode', type=str, default='train',
                            help='Train, valid or test the model (or others)')
    args = arg_parser.parse_args()

    # Global config: YAML contents plus runtime device and workspace root.
    cfg = load_yaml(args.config)
    cfg["device"] = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    cfg["workspace_root"] = args.workspace
    config.set_config(cfg)

    # Summary writer lives under <workspace>/summary.
    summary_root = os.path.join(config.workspace_root, "summary")
    if not os.path.exists(summary_root):
        os.makedirs(summary_root)
    writer.set_path(summary_root)

    # Seed every RNG source for reproducibility.
    seed = config.get("others", "seed", default=1234)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # One log file per run mode under <workspace>/logs.
    log_root = os.path.join(config.workspace_root, "logs")
    if not os.path.exists(log_root):
        os.makedirs(log_root)
    set_logger(os.path.join(log_root, "{}.log".format(args.mode)))

    logging.info("running @ {}".format(socket.gethostname()))
    logging.info(config)

    return args
Ejemplo n.º 26
0
def hyperparamSearch(X_train, Y_train, X_dev, Y_dev, X_test, Y_test, lr_rng, num_hid_layers_rng, beta_rng, k_p_rng, reg_type,
                     size_hid_layers_rng, num_sims, num_epochs, minibatch_size, log_dir, parallel=False, cores=1):
    """Random search over learning rate, depth, width, beta and keep-prob.

    Samples ``num_sims`` hyper-parameter points uniformly from the given
    ranges, trains one model per point (sequentially or via ``multi_sim``),
    and logs the train/dev/test errors for each.

    Args:
        X_train..Y_test: data splits; X_train is 4-D
            (num_batches, batch_length, sequence_length, num_features).
        lr_rng, num_hid_layers_rng, beta_rng, k_p_rng, size_hid_layers_rng:
            (low, high) ranges for each searched parameter.
        reg_type: regularization type forwarded to the model.
        num_sims: number of random search points to evaluate.
        num_epochs, minibatch_size, log_dir: training configuration.
        parallel: when True, delegate all runs to ``multi_sim`` over ``cores``.

    Returns:
        Sequential mode: ndarray with one row per run
        [lr, layers, units, beta, keep_prob, epochs, train_err, dev_err,
        test_err, min_epoch, min_dev], sorted ascending by the last column;
        parallel mode: whatever ``multi_sim`` returns.
    """
    num_batches, batch_length, sequence_length, num_features = X_train.shape
    num_params = 5
    np.random.seed(13)  # fixed seed so the sampled search points are reproducible
    lower_bounds = [lr_rng[0], num_hid_layers_rng[0], size_hid_layers_rng[0], beta_rng[0], k_p_rng[0]]
    upper_bounds = [lr_rng[1], num_hid_layers_rng[1], size_hid_layers_rng[1], beta_rng[1], k_p_rng[1]]
    samples_params = np.random.uniform(lower_bounds, upper_bounds, [num_sims, num_params])

    # Post-process the raw uniform samples into usable hyper-parameters.
    lr_samples = 10**samples_params[:, 0]  # learning rate sampled on a log scale
    hl_samples = samples_params[:, 1].astype(int)  # layer count, floored to int
    hu_samples = (samples_params[:, 2] * num_features).astype(int)  # width scaled by feature count
    beta = samples_params[:, 3]
    k_p = samples_params[:, 4]

    # BUG FIX: the original built log_name from `learning_rate`,
    # `num_hid_layers` and `size_hid_layers`, none of which exist in this
    # scope (guaranteed NameError). Describe the whole search instead.
    log_name = "Search_" + str(num_sims) + "sims_" + str(num_epochs) + "epochs"
    # NOTE(review): `cwd` is not defined in this function — assumed to be a
    # module-level global; confirm.
    utils.set_logger(os.path.join(cwd + "/" + log_dir, log_name + '.log'))
    utils.logging.info("lr_rng = " + str(lr_rng) + " hidden layers rng = " + str(num_hid_layers_rng) + " hidden units rng = " + str(size_hid_layers_rng) + " num sims = %d", num_sims)

    if parallel:  # needs `cores` workers
        print("parallelizing the training")
        results = multi_sim(X_train, Y_train, X_dev, Y_dev, X_test, Y_test, lr_samples, beta, k_p, reg_type,
                            num_epochs, hl_samples, hu_samples, minibatch_size, log_dir, False, False, 10,
                            100, True, cores)
        print(results)
    else:
        rows = []  # one result row per completed run
        for i in range(len(lr_samples)):
            train_err, Y_train, batch_pred, Y_dev, dev_pred, dev_err, min_dev, min_epoch, test_err = model(X_train, Y_train, X_dev, Y_dev, X_test, Y_test, lr_samples[i], beta[i], k_p[i], reg_type,
                                                                 num_epochs, hl_samples[i], hu_samples[i], minibatch_size, log_dir, False, False)

            rows.append([lr_samples[i], hl_samples[i], hu_samples[i], beta[i], k_p[i], num_epochs, train_err, dev_err, test_err, min_epoch, min_dev])
            utils.logging.info("START OF NEW MODEL")
            utils.logging.info("learning rate = %f, hidden layers = %d, hidden units = %d, beta = %f, keep_prob = %f, epochs = %d, reg_type = %s", lr_samples[i], hl_samples[i], hu_samples[i], beta[i], k_p[i], num_epochs, reg_type)
            utils.logging.info("Train Err = %f, Dev Err = %f, Test Err = %f, Min Dev Err = %f, Min Epoch = %d", train_err, dev_err, test_err, min_dev, min_epoch)

        # Collect rows once instead of vstacking onto a dummy placeholder row;
        # guard against an empty search so argsort below cannot fail.
        results = np.array(rows) if rows else np.empty((0, 11))
        if len(results):
            results = results[results[:, -1].argsort()]  # sort by min dev error
        utils.logging.info("RESULTS")
        utils.logging.info(str(results))
    return results
Ejemplo n.º 27
0
    def hyperparas_search(self):
        """Hyper-parameter-search training entry point.

        Optionally restores a pretrained checkpoint, builds the Deep-SVDD
        loss around a precomputed center, then runs train_and_evaluate.
        """
        # Dedicated log file for this search run.
        utils.set_logger(
            os.path.join(self.params.model_dir, 'params_opt_train.log'))
        logging.info("Loading the datasets...")
        # Dataloaders are already held on self (dataset_train / dataset_eval).
        logging.info("- done.")

        # Optionally restore pretrained weights before training.
        checkpoint = self.params.restore_file
        if checkpoint is not None:  # TODO: known bug here — needs fixing
            restore_path = checkpoint
            print("restore_path: ", restore_path)
            logging.info("Restoring parameters from {}".format(restore_path))
            utils.load_checkpoint(restore_path, self.model)
        self.model = self.model.to(self.device)
        optimizer = optim.Adam(self.model.parameters(), lr=self.params.LR)

        # Deep-SVDD loss built around the precomputed hypersphere center c.
        self.c = load_init_center_c(self.params.dataset_name,
                                    self.params.code_length).to(self.device)
        self.loss = LSALoss_deepSVDD(lam_rec=self.params.lam_rec,
                                     lam_svdd=self.params.lam_svdd,
                                     c=self.c,
                                     R=self.params.R,
                                     nu=self.params.nu,
                                     objective=self.params.objective)

        logging.info("Starting training for {} epoch(s)".format(
            self.params.epoch))
        # No restore file is passed to training itself (restored above instead).
        self.train_and_evaluate(self.model, self.dataset_train,
                                self.dataset_eval, optimizer, self.loss,
                                utils.metrics, self.params,
                                self.params.model_dir, None)
Ejemplo n.º 28
0
    def set_baseline_dataset(self):
        """Fetch the train/test dataloaders and cache them on the instance."""
        utils.set_logger(os.path.join(self.model_dir, 'train.log'))
        logging.info("Loading the datasets...")

        # subset_percent < 1.0 means "train on a random fraction only".
        if self.params.subset_percent < 1.0:
            self.trainloader = datautils.fetch_subset_dataloader(
                'train', self.params)
        else:
            self.trainloader = datautils.fetch_dataloader('train', self.params)
        self.testloader = datautils.fetch_dataloader('test', self.params)

        logging.info("- done.")
Ejemplo n.º 29
0
def main():
    """Entry point: parse CLI args, set up logging, load data, train the Transformer-CRF."""
    logging.info("Transformer implementation")
    arg_parser = argparse.ArgumentParser(description="Transformer CRF implementation")
    opt = parse_arguments_t(arg_parser)
    conf = Config(opt)
    set_seed(opt, conf.seed)
    utils.set_logger(os.path.join("log", opt.log_name))

    # Echo every CLI option into the log for traceability.
    for key, value in opt.__dict__.items():
        logging.info(key + ": " + str(value))

    trains, devs = prepare_data(logging, conf)
    train_model(config=conf, train_insts=trains, dev_insts=devs)
Ejemplo n.º 30
0
def run_y2h(args):
    """Configure and launch a Y2HRunner for the given run mode.

    Args:
        args: namespace with ``run_mode`` ('fc' or 'cnn') and ``time``
            (timestamp tag for this run's log directory).

    Raises:
        ValueError: if ``args.run_mode`` is not a supported model name.
    """
    config = get_config(f'./configs/y2h_config_{args.run_mode}.yml')
    config.model = args.run_mode
    if config.model == 'fc':
        config.log_prefix = f'workspace/ResnetY2HEstimator/mode_{config.mode}_Pn_{config.Pn}/FC'
    elif config.model == 'cnn':
        config.log_prefix = f'workspace/ResnetY2HEstimator/mode_{config.mode}_Pn_{config.Pn}/CNN'
    else:
        # BUG FIX: an unknown run_mode previously fell through both branches
        # and crashed later with AttributeError on config.log_prefix; fail fast.
        raise ValueError(f'Unsupported run_mode: {config.model!r} (expected "fc" or "cnn")')
    config.log_dir = os.path.join(config.log_prefix, args.time)
    config.ckpt_dir = os.path.join(config.log_dir, 'checkpoints')
    if not os.path.isdir(config.ckpt_dir):
        os.makedirs(config.ckpt_dir)
    set_logger(config)
    logging.info(config)
    runner = Y2HRunner(config)
    runner.run()
Ejemplo n.º 31
0
def runEvaluate(model_dir, data_dir, restore_file):
    """Evaluate a trained model on the test set and dump metrics to JSON.

    Args:
        model_dir: directory containing params.json and saved checkpoints;
            evaluate.log and metrics_test_<restore_file>.json are written here.
        data_dir: root directory of the dataset.
        restore_file: checkpoint basename (without the .pth.tar suffix).

    NOTE(review): the trailing block that shells out to ``train.py`` looks out
    of place in an evaluation routine, and ``PYTHON`` is not defined in this
    function — presumably a module-level global; confirm this tail is
    intentional and not accidentally pasted from a hyper-parameter-search
    script.
    """

    # Load the parameters

    json_path = os.path.join(model_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    # use GPU if available
    params.cuda = torch.cuda.is_available()  # use GPU is available

    # Set the random seed for reproducible experiments
    torch.manual_seed(231)
    if params.cuda: torch.cuda.manual_seed(231)

    # Get the logger
    utils.set_logger(os.path.join(model_dir, 'evaluate.log'))

    # Create the input data pipeline
    logging.info("Creating the dataset...")

    # fetch dataloaders (test split only)
    dataloaders = data_loader.fetch_dataloader(['test'], data_dir, params)
    test_dl = dataloaders['test']

    logging.info("- done.")

    # Define the model
    model = net.Net(params).cuda() if params.cuda else net.Net(params)

    loss_fn = net.loss_fn
    metrics = net.metrics

    logging.info("Starting evaluation")

    # Reload weights from the saved file
    utils.load_checkpoint(os.path.join(model_dir, restore_file + '.pth.tar'),
                          model)

    # Evaluate and persist the metrics next to the checkpoint
    test_metrics = evaluate(model, loss_fn, test_dl, metrics, params)
    save_path = os.path.join(model_dir,
                             "metrics_test_{}.json".format(restore_file))
    utils.save_dict_to_json(test_metrics, save_path)
    # Launch training with this config
    # NOTE(review): see docstring — `PYTHON` is not defined in this scope.
    cmd = "{python} train.py --model_dir={model_dir}".format(python=PYTHON,
                                                             model_dir=model_dir)
    print(cmd)
    check_call(cmd, shell=True)


if __name__ == "__main__":
    # Load the "reference" parameters from parent_dir json file
    args = parser.parse_args()
    json_path = os.path.join(args.parent_dir, 'params.json')
    assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    # Set the logger
    utils.set_logger(os.path.join(args.parent_dir, 'search_hyperparameters.log'))

    '''
    Temperature and alpha search for KD on CNN (teacher model picked in params.json)
    Perform hypersearch (empirical grid): distilling 'temperature', loss weight 'alpha'
    '''

    # hyperparameters for KD
    alphas = [0.99, 0.95, 0.5, 0.1, 0.05]
    temperatures = [20., 10., 8., 6., 4.5, 3., 2., 1.5]

    logging.info("Searching hyperparameters...")
    logging.info("alphas: {}".format(alphas))
    logging.info("temperatures: {}".format(temperatures))

    for alpha in alphas:
    """
    # Load the parameters
    args = parser.parse_args()
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    # use GPU if available
    params.cuda = torch.cuda.is_available()     # use GPU is available

    # Set the random seed for reproducible experiments
    torch.manual_seed(230)
    if params.cuda: torch.cuda.manual_seed(230)
        
    # Get the logger
    utils.set_logger(os.path.join(args.model_dir, 'analysis.log'))

    # Create the input data pipeline
    logging.info("Loading the dataset...")

    # fetch dataloaders
    # train_dl = data_loader.fetch_dataloader('train', params)
    # dev_dl = data_loader.fetch_dataloader('dev', params)
    dataloader = data_loader.fetch_dataloader(args.dataset, params)

    logging.info("- done.")

    # Define the model graph
    model = resnet.ResNet18().cuda() if params.cuda else resnet.ResNet18()

    # fetch loss function and metrics
Ejemplo n.º 34
0
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import linear_model, datasets
from sklearn import metrics
from sklearn import preprocessing
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from pandas import *

from settings import DATA_DIR, LOG_DIR
import utils

# Ticker symbols of the market indices used as prediction targets.
TOTAL_INDEX = 'djia gspc ixic'.split() # dow jones. snp500, nasdaq, vol
# Each run gets a unique experiment id that names its log file under LOG_DIR.
EXPID = utils.get_expid()
utils.set_logger('%s/%s.log' % (LOG_DIR, EXPID), 'DEBUG')
# TODO: log configurations (ex: parsing method etc) and/or commit id


def openfiles(filename, arg):
    """Load a tab-separated data file and return it as a DataFrame.

    Args:
        filename: path to a TSV file with a header row.
        arg: 100 selects the feature table (renames the 11 columns and
            indexes by id); any other value is treated as an index into
            TOTAL_INDEX and the matching target column is extracted.

    Returns:
        pandas.DataFrame with NaNs replaced by empty strings.
    """
    data = pd.read_csv(filename, sep='\t', header=0)
    data = data.where((pd.notnull(data)), '')   # Replace np.nan with ''
    if arg == 100:  # feature table (X)
        columns = ['id', 'text', 'closePrice', 'week', 'month', 'quater', 'year', 'djia', 'gspc', 'ixic', 'vix']
        data.columns = columns
        value = pd.DataFrame(data)
        value.index = data['id']
    else:  # single target series (y)
        value = pd.DataFrame(data[TOTAL_INDEX[arg]])
    # BUG FIX: the original never returned `value`, so every caller got None.
    return value
Ejemplo n.º 35
0
Archivo: zomphp.py Proyecto: wk8/ZomPhP
def main():
    """Command-line entry point for ZomPHP dead-code detection.

    Parses CLI options, validates that exactly one of --dir / --files is
    given (as absolute paths), then hands the work to the backend.
    """
    # argument processing
    parser = argparse.ArgumentParser(description='Detect your PHP dead code')
    parser.add_argument('--dir', dest='dir', metavar='dir_path',
                        type=str, nargs=1, default=None,
                        help='Make ZomPHP process that directory')
    parser.add_argument('--ignore-sub-dirs', dest='ignore_sub_dirs',
                        metavar='dir_path', type=str, nargs='+', default=[],
                        help='A directory path (or list of those) that won\'t '
                        'be processed (only makes sense when used with the '
                        '--dir option)')
    parser.add_argument('--files', dest='files', metavar='file_path',
                        type=str, nargs='+', default=[],
                        help='A file or list of files (given as absolute paths'
                        ') to have processed by ZomPHP')
    parser.add_argument('--strict', dest='strict', action='store_const',
                        const=True, default=False, help='If set to true, will'
                        ' guarantee that any function NOT marked is indeed '
                        'used, but might also yield more false negatives (this'
                        ' option should only be used if you have files '
                        'containing functions with the same name)')
    parser.add_argument('--path-translation', dest='path_translation',
                        metavar='local_path path_in_db', type=str, nargs='+',
                        default=[], help='A list of couples of paths to '
                        'translate (useful if running the code in a different '
                        'location than the one running the PHP code)')
    parser.add_argument('--logging-level', dest='logging_level', metavar='level',
                        type=str, nargs=1, default=None, help='A logging '
                        'level to override the one set in the settings file')
    args = parser.parse_args()

    # start the logger
    utils.set_logger()

    # some sanity checks
    def check_abs_path(path, option_name):
        # helper function, checks the paths are absolute, and translates them to real paths
        # (recurses element-wise over lists/tuples; exits the process on a relative path)
        if not path:
            return path
        if isinstance(path, (tuple, list)):
            return [check_abs_path(p, option_name) for p in path]
        if os.path.isabs(path):
            return os.path.realpath(path)
        logging.error('The --%s option requires using absolute paths (you entered %s) exiting' % (option_name, path))
        sys.exit(1)
    # exactly one of --dir / --files must be supplied (not both, not neither)
    if bool(args.dir) == bool(args.files):
        logging.error('You must specify exactly one of the --dir or --files options, exiting')
        sys.exit(1)
    args.dir = check_abs_path(args.dir, 'dir')
    args.files = check_abs_path(args.files, 'files')
    if args.ignore_sub_dirs:
        if args.dir:
            args.ignore_sub_dirs = check_abs_path(args.ignore_sub_dirs, 'ignore-sub-dirs')
        else:
            logging.warning('Ignoring the --ignore-sub-dirs option, that option can only be used together with the --dir option')
    translator = utils.PathTranslator.build_translator(args.path_translation)

    # down to work!
    bckend = backend.get_new_backend()

    if args.dir:
        bckend.process_directory(args.dir[0], strict=args.strict, ignore_sub_dirs=args.ignore_sub_dirs, translator=translator)
    else:
        # then it must be --files
        for fle in args.files:
            bckend.process_file(fle, args.strict, translator=translator)

    logging.info(bckend.stats)
    # Load the parameters from json file
    args = parser.parse_args()
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    # use GPU if available
    params.cuda = torch.cuda.is_available()

    # Set the random seed for reproducible experiments
    random.seed(230)
    torch.manual_seed(230)
    if params.cuda: torch.cuda.manual_seed(230)

    # Set the logger
    utils.set_logger(os.path.join(args.model_dir, 'train.log'))

    # Create the input data pipeline
    logging.info("Loading the datasets...")

    # fetch dataloaders, considering full-set vs. sub-set scenarios
    if params.subset_percent < 1.0:
        train_dl = data_loader.fetch_subset_dataloader('train', params)
    else:
        train_dl = data_loader.fetch_dataloader('train', params)
    
    dev_dl = data_loader.fetch_dataloader('dev', params)

    logging.info("- done.")

    """Based on the model_version, determine model/optimizer and KD training mode