def run(dataset):
    fs = FS.for_parse_projects(dataset)

    logger.info(f"Getting files from {fs.path_to_raw_dataset}")
    logger.info(f"Writing preprocessed files to {fs.path_to_parsed_dataset}")
    preprocessing_types_dict = {k: None for k in PrepParam}

    fs.save_pp_params(pp_params)
    fs.save_preprocessing_types(preprocessing_types_dict)

    params = []

    for train_test_valid, project in fs.get_raw_projects():
        params.append((fs.path_to_raw_dataset, fs.path_to_parsed_dataset,
                       train_test_valid, project, preprocessing_types_dict))

    files_total = len(params)
    current_file = 0
    start_time = time.time()
    with Pool() as pool:
        it = pool.imap_unordered(preprocess_and_write, params)
        for _ in it:
            current_file += 1
            logger.info(
                f"Processed {current_file} out of {files_total} chunks")
            time_elapsed = time.time() - start_time
            logger.info(
                f"Time elapsed: {time_elapsed:.2f} s, estimated time until completion: "
                f"{time_elapsed / current_file * files_total - time_elapsed:.2f} s"
            )
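
# Note: preprocess_and_write (the worker handed to imap_unordered above) is
# not shown on this page. A minimal sketch, assuming it simply unpacks the
# parameter tuple assembled in run() -- the body below is hypothetical, not
# the project's actual implementation:
def preprocess_and_write(params):
    path_to_raw, path_to_parsed, train_test_valid, project, prep_types = params
    # read the raw project, apply the preprocessing described by prep_types,
    # and write the result under path_to_parsed
    ...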
Example #2
def train_and_save_model(rnn_learner: RNN_Learner, fs: FS,
                         training: LMTraining, metric_list: List[str],
                         cache: Cache, use_subword_aware_metrics: bool):
    only_validation = False
    n = training.cycle.n
    if n == 0:
        logger.info("Number of epochs specified is 0. Not training...")
        fs.save_best(rnn_learner)
        only_validation = True
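        # run a single cycle through fit() in validation-only mode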
        n = 1

    training_start_time = time()
    training_log_file = os.path.join(fs.path_to_model, 'training.log')
    logger.info(
        f"Starting training, check {training_log_file} for training progress")
    callbacks = []

    if training.early_stop:
        callbacks.append(
            EarlyStopping(rnn_learner,
                          save_path=BEST_MODEL_NAME,
                          best_loss_path=BEST_LOSS_FILENAME,
                          best_acc_path=BEST_ACC_FILENAME,
                          best_epoch_path=BEST_EPOCH_FILENAME,
                          enc_path=ENCODER_NAME))

    validation_function = get_validation_function(cache,
                                                  use_subword_aware_metrics,
                                                  rnn_learner.text_field)
    vals, ep_vals = rnn_learner.fit(lrs=training.lr,
                                    n_cycle=n,
                                    wds=training.wds,
                                    cycle_len=training.cycle.len,
                                    cycle_mult=training.cycle.mult,
                                    metrics=[getattr(metrics, m) for m in metric_list],
                                    get_ep_vals=True,
                                    file=open(training_log_file, 'w'),
                                    callbacks=callbacks,
                                    valid_func=validation_function,
                                    only_validation=only_validation)
    training_time_mins = int(time() - training_start_time) // 60
    with open(os.path.join(fs.path_to_model, 'results.out'), 'w') as f:
        f.write(str(training_time_mins) + "\n")
        # one line per epoch: space-separated metric values
        for _, epoch_vals in ep_vals.items():
            f.write(" ".join(map(str, epoch_vals)) + "\n")
Example #3
def get_best_available_model(fs: FS, data: Data, arch: Arch):
    preloaded_text_field = fs.load_text_field()
    rnn_learner = create_nn_architecture(
        fs, data, arch, path=None, preloaded_text_field=preloaded_text_field)
    logger.info(rnn_learner)

    logger.info("Checking if there exists a model with the same architecture")
    model_loaded = fs.load_best(rnn_learner)
    if not model_loaded and fs.base_model_specified:
        logger.info(f'Trying to load base model: {fs.base_model_id}')
        try:
            fs.load_base_model(rnn_learner)
        except FileNotFoundError:
            raise FileNotFoundError(
                "Base model was not found. Training model from scratch")

    return rnn_learner, model_loaded
Example #4
    def __init__(self, dataset: str, repr: str, model: str, backwards: bool):
        fs = FS.for_lang_model(dataset, repr, model)
        text_field = fs.load_text_field()

        super().__init__(repr=repr,
                         fs=fs,
                         text_field=text_field,
                         config_class=LMTrainingConfig,
                         output_field=text_field,
                         n_predictions=10,
                         backwards=backwards)
Example #5
    def __init__(self, dataset: str, repr: str, model: str, backwards: bool,
                 classifier_type: str):
        fs = FS.for_classifier(dataset, repr, model, PretrainingType.FULL,
                               classifier_type)
        text_field = fs.load_text_field()

        super().__init__(repr=repr,
                         fs=fs,
                         text_field=text_field,
                         config_class=ClassifierTrainingConfig,
                         output_field=LEVEL_LABEL,
                         n_predictions=6 if classifier_type == 'level' else 2,
                         backwards=backwards)
Example #6
def run(dataset):
    fs = FS.for_parse_projects(dataset)

    logger.info(f"Getting files from {fs.path_to_raw_dataset}")
    logger.info(f"Writing preprocessed files to {fs.path_to_parsed_dataset}")
    preprocessing_types_dict = {k: None for k in PrepParam}

    fs.save_pp_params(pp_params)
    fs.save_preprocessing_types(preprocessing_types_dict)

    params = []

    for train_test_valid, project in fs.get_raw_projects():
        params.append((fs.path_to_raw_dataset, fs.path_to_parsed_dataset,
                       train_test_valid, project, preprocessing_types_dict))

    files_total = len(params)
    with Pool() as pool:
        it = pool.imap_unordered(preprocess_and_write, params)
        for _ in tqdm(it, total=files_total):
            pass
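
# Design note: this variant of run() replaces the manual progress counter and
# ETA arithmetic of the first example above with tqdm, which reports progress
# and estimated remaining time over the imap_unordered iterator automatically.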
Example #7
def run_on_device(config: Union[LMLRConfig, LMConfig], find_lr: bool,
                  force_rerun: bool) -> None:
    fs = FS.for_lang_model(config.data.dataset, config.data.repr,
                           config.base_model)

    fs.create_path_to_model(config.data, config.training_config)
    attach_dataset_aware_handlers_to_loggers(fs.path_to_model, 'main.log')

    print_gpu_info()

    learner, model_trained = get_best_available_model(fs, config.data,
                                                      config.arch)

    fs.save_vocab_data(learner.text_field, config.data.percent,
                       config.data.start_from)

    if model_trained and not force_rerun:
        logger.info(
            f'Model {fs.path_to_model} already trained. Not rerunning training.'
        )
        return
    elif model_trained:
        logger.info("Forcing rerun")
    else:
        logger.info('Model with the same training config was not found.')

    config_manager.save_config(config.training_config, fs.path_to_model)

    if find_lr:
        find_and_plot_lr(learner, fs)
    else:
        train_and_save_model(learner, fs, config.training, config.metrics,
                             config.cache, config.use_subword_aware_metrics)
        model_loaded = fs.load_best(learner)
        if not model_loaded:
            raise AssertionError(
                "The best model should have been trained and saved!")
        gen_text_path = os.path.join(fs.path_to_model, 'gen_text.out')
        run_and_display_tests(learner, config.arch, config.testing,
                              config.data.backwards, gen_text_path)
Example #8
def run_on_device(config: ClassifierConfig, force_rerun: bool) -> None:
    base_model = config.base_model
    pretraining = config.pretraining_type

    PrepConfig.assert_classification_config(config.data.repr)

    if bool(base_model) != bool(pretraining):
        raise ValueError(
            'Base model and pretraining_type params must be both set or both unset!'
        )

    fs = FS.for_classifier(config.data.dataset,
                           config.data.repr,
                           base_model=base_model,
                           pretraining=pretraining,
                           classification_type=config.classification_type)

    fs.create_path_to_model(config.data, config.training_config)
    attach_dataset_aware_handlers_to_loggers(fs.path_to_model, 'main.log')

    print_gpu_info()

    text_field = fs.load_text_field()

    rnn_learner = create_nn_architecture(fs, text_field, LEVEL_LABEL,
                                         config.data, config.arch,
                                         config.min_log_coverage_percent)
    logger.info(rnn_learner)

    same_model_exists = fs.best_model_exists(rnn_learner)
    if same_model_exists and not force_rerun:
        logger.info(
            f'Model {fs.path_to_classification_model} already trained. Not rerunning training. '
            'To retrain the model with these parameters, specify the --force-rerun flag.'
        )
        return
    elif same_model_exists:
        logger.info(
            f"Model {fs.path_to_classification_model} already trained. Forcing rerun."
        )

    if pretraining == PretrainingType.FULL:
        try:
            logger.info(f'Trying to load base classifier: {base_model}')
            fs.load_base_model(rnn_learner)
            logger.info('Base classifier model is loaded.')
        except Exception as e:
            logger.warning(e)
            logger.warning(
                'Base classifier model not loaded. Training from scratch')

    elif pretraining == PretrainingType.ONLY_ENCODER:
        try:
            logger.info(f'Trying to load pretrained LM: {base_model}')
            # TODO: this is a dirty hack; fix it
            fs.lm_cl_pretraining = True
            fs.load_pretrained_langmodel(rnn_learner)
            logger.info("Using pretrained LM")
        except Exception as e:
            logger.warning(e)
            logger.warning('Pretrained LM not loaded. Training from scratch')
    else:
        logger.info("No pretraining. Training classifier from scratch.")

    config_manager.save_config(config.training_config, fs.path_to_model)

    train(fs, rnn_learner, config.training, config.metrics)

    model = rnn_learner.model

    to_test_mode(model)
    sample_test_runs_file = os.path.join(fs.path_to_model, 'test_runs.out')
    n_predictions = 6 if config.classification_type == 'level' else 2
    show_tests(fs.test_path, model, text_field, sample_test_runs_file,
               config.data.backwards, n_predictions, config.testing.n_samples)
    logger.info("Classifier training finished successfully.")