Example #1
import os
import re

# Assumes the remaining logrec module-level names are in scope: logger,
# PrepConfig, file_mapper, get_cases_creator, and the directory constants
# (DEFAULT_PARSED_DATASETS_DIR, REPR_DIR, CLASSIFICATION_DIR, TRAIN_DIR,
# TEST_DIR, VALID_DIR).
def run(dataset: str, repr: str, classifier: str):
    from logrec.classifier.context_datasets import ContextsDataset

    PrepConfig.assert_classification_config(repr)

    path_to_dataset = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset)
    full_src_dir = os.path.join(path_to_dataset, REPR_DIR, repr)
    dest_dir = os.path.join(path_to_dataset, CLASSIFICATION_DIR, classifier,
                            repr)
    logger.info(f"Writing to {dest_dir}")

    os.makedirs(os.path.join(dest_dir, TRAIN_DIR), exist_ok=True)
    os.makedirs(os.path.join(dest_dir, TEST_DIR), exist_ok=True)
    os.makedirs(os.path.join(dest_dir, VALID_DIR), exist_ok=True)

    total_files = sum(
        file_mapper(full_src_dir, lambda f: 1,
                    lambda fi: fi.endswith("parsed.repr")))
    count = 0

    cases_creator = get_cases_creator(classifier)
    for lines, rel_path in file_mapper(full_src_dir, cases_creator,
                                       lambda fi: fi.endswith("parsed.repr")):
        count += 1
        logger.info(f"Processing {count} out of {total_files}")
        forward_path = os.path.join(
            dest_dir,
            re.sub(r"parsed\.repr", ContextsDataset.FW_CONTEXTS_FILE_EXT,
                   rel_path))
        backward_path = os.path.join(
            dest_dir,
            re.sub(r"parsed\.repr", ContextsDataset.BW_CONTEXTS_FILE_EXT,
                   rel_path))
        label_path = os.path.join(
            dest_dir,
            re.sub(r"parsed\.repr", ContextsDataset.LABEL_FILE_EXT, rel_path))
        with open(forward_path, 'w') as f, \
                open(backward_path, 'w') as b, \
                open(label_path, 'w') as l:
            for line in lines:
                if line:
                    l.write(f'{line[2]}\n')
                    f.write(f'{" ".join(line[0])}\n')
                    b.write(f'{" ".join(line[1])}\n')
                else:
                    l.write('\n')
                    f.write('\n')
                    b.write('\n')
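
This snippet leans on logrec's file_mapper helper, which is not shown above. Judging from both call sites (summing `lambda f: 1` over matching files, and unpacking `lines, rel_path` from the result of `cases_creator`), it is a generator that applies a function to every matching file and yields whatever that function returns. The following is a minimal sketch of that assumed contract, not the actual logrec implementation:

import os
from typing import Callable, Iterator, TypeVar

T = TypeVar('T')

def file_mapper(base_dir: str, fn: Callable[[str], T],
                file_filter: Callable[[str], bool] = lambda fi: True) -> Iterator[T]:
    # Walk base_dir recursively and yield fn(path) for every file whose
    # name passes file_filter. Callers like Example #1 then unpack
    # whatever structure fn returns (e.g. a (lines, rel_path) tuple
    # produced by cases_creator).
    for root, _, files in os.walk(base_dir):
        for name in files:
            if file_filter(name):
                yield fn(os.path.join(root, name))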
Example #2
# Assumes the same logrec module-level names as Example #1 (os, logger,
# PrepConfig), plus calc_stats, dump_list, CLASSIFICATION_TYPE and
# IGNORED_PROJECTS_FILE_NAME.
def run(dataset, repr, threshold):
    PrepConfig.assert_classification_config(repr)

    path_to_classification = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset,
                                          CLASSIFICATION_DIR)
    dest_dir = os.path.join(path_to_classification, CLASSIFICATION_TYPE, repr)

    logger.info(f"Getting stats for {dest_dir}")
    logger.info(
        f"Ignoring projects where the percentage of files that contain logging is less than {threshold} %"
    )
    projects_to_ignore, logged_stats = calc_stats(dest_dir, threshold)
    for i, p in enumerate(projects_to_ignore):
        logger.info(f"{i}: {p}")
    logger.info("")
    logger.info(logged_stats)
    output_file_path = os.path.join(
        path_to_classification, f"{IGNORED_PROJECTS_FILE_NAME}.{threshold}")
    dump_list(projects_to_ignore, output_file_path)
    logger.info(
        f"Ignored projects with threshold {threshold} % were written to {output_file_path}"
    )
    logger.info(f"Total ignored projects: {len(projects_to_ignore)}")
Example #3
# Assumes logrec module-level imports: os, logger, FS, PrepConfig,
# PretrainingType, LEVEL_LABEL, config_manager, and the training helpers
# used below (print_gpu_info, create_nn_architecture, train, to_test_mode,
# show_tests, attach_dataset_aware_handlers_to_loggers).
def run_on_device(config: ClassifierConfig, force_rerun: bool) -> None:
    base_model = config.base_model
    pretraining = config.pretraining_type

    PrepConfig.assert_classification_config(config.data.repr)

    if bool(base_model) != bool(pretraining):
        raise ValueError(
            'Base model and pretraining_type params must be both set or both unset!'
        )

    fs = FS.for_classifier(config.data.dataset,
                           config.data.repr,
                           base_model=base_model,
                           pretraining=pretraining,
                           classification_type=config.classification_type)

    fs.create_path_to_model(config.data, config.training_config)
    attach_dataset_aware_handlers_to_loggers(fs.path_to_model, 'main.log')

    print_gpu_info()

    text_field = fs.load_text_field()

    rnn_learner = create_nn_architecture(fs, text_field, LEVEL_LABEL,
                                         config.data, config.arch,
                                         config.min_log_coverage_percent)
    logger.info(rnn_learner)

    same_model_exists = fs.best_model_exists(rnn_learner)
    if same_model_exists and not force_rerun:
        logger.info(
            f'Model {fs.path_to_classification_model} already trained. Not rerunning training. '
            f'To retrain the model with these parameters, specify the --force-rerun flag.'
        )
        return
    elif same_model_exists:
        logger.info(
            f"Model {fs.path_to_classification_model} already trained. Forcing rerun."
        )

    if pretraining == PretrainingType.FULL:
        try:
            logger.info(f'Trying to load base classifier: {base_model}')
            fs.load_base_model(rnn_learner)
            logger.info('Base classifier model is loaded.')
        except Exception as e:
            logger.warning(e)
            logger.warning(
                'Base classifier model not loaded. Training from scratch')

    elif pretraining == PretrainingType.ONLY_ENCODER:
        try:
            logger.info(f'Trying to load pretrained LM: {base_model}')
            # TODO: this is a dirty hack; fix it
            fs.lm_cl_pretraining = True
            fs.load_pretrained_langmodel(rnn_learner)
            logger.info("Using pretrained LM")
        except Exception as e:
            logger.warning(e)
            logger.warning('Pretrained LM not loaded. Training from scratch')
    else:
        logger.info("No pretraining. Training classifier from scratch.")

    config_manager.save_config(config.training_config, fs.path_to_model)

    train(fs, rnn_learner, config.training, config.metrics)

    model = rnn_learner.model

    to_test_mode(model)
    sample_test_runs_file = os.path.join(fs.path_to_model, 'test_runs.out')
    n_predictions = 6 if config.classification_type == 'level' else 2
    show_tests(fs.test_path, model, text_field, sample_test_runs_file,
               config.data.backwards, n_predictions, config.testing.n_samples)
    logger.info("Classifier training finished successfully.")