def run(dataset: str, repr: str, classifier: str):
    """Create classification case files from the parsed representation of a dataset.

    For every ``*parsed.repr`` file found under
    ``<DEFAULT_PARSED_DATASETS_DIR>/<dataset>/<REPR_DIR>/<repr>`` three sibling
    files are written under
    ``<DEFAULT_PARSED_DATASETS_DIR>/<dataset>/<CLASSIFICATION_DIR>/<classifier>/<repr>``:
    forward contexts, backward contexts and labels.

    :param dataset: name of the dataset directory inside ``DEFAULT_PARSED_DATASETS_DIR``.
    :param repr: representation id; checked with
        ``PrepConfig.assert_classification_config`` before anything is written.
    :param classifier: classification type; selects the cases creator via
        ``get_cases_creator`` and names the output sub-directory.
    """
    # Imported lazily, as in the original implementation.
    from logrec.classifier.context_datasets import ContextsDataset

    PrepConfig.assert_classification_config(repr)

    path_to_dataset = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset)
    full_src_dir = os.path.join(path_to_dataset, REPR_DIR, repr)
    # Use the ``repr`` argument here. The previous code read ``args.repr``,
    # which only worked when a module-level ``args`` happened to exist and
    # raised NameError when ``run`` was called from other code.
    dest_dir = os.path.join(path_to_dataset, CLASSIFICATION_DIR, classifier, repr)
    logger.info(f"Writing to {dest_dir}")

    for split_dir in (TRAIN_DIR, TEST_DIR, VALID_DIR):
        os.makedirs(os.path.join(dest_dir, split_dir), exist_ok=True)

    def is_parsed_repr(file_name):
        return file_name.endswith("parsed.repr")

    def dest_path(rel_path, extension):
        # Same relative location as the source file, with a different extension.
        return os.path.join(dest_dir, re.sub(r"parsed\.repr", extension, rel_path))

    # First pass over the source tree only counts files, for progress reporting.
    total_files = sum(file_mapper(full_src_dir, lambda f: 1, is_parsed_repr))

    cases_creator = get_cases_creator(classifier)
    processed = file_mapper(full_src_dir, cases_creator, is_parsed_repr)
    for count, (lines, rel_path) in enumerate(processed, start=1):
        logger.info(f"Processing {count} out of {total_files}")

        forward_path = dest_path(rel_path, ContextsDataset.FW_CONTEXTS_FILE_EXT)
        backward_path = dest_path(rel_path, ContextsDataset.BW_CONTEXTS_FILE_EXT)
        label_path = dest_path(rel_path, ContextsDataset.LABEL_FILE_EXT)

        with open(forward_path, 'w') as forward_file, \
                open(backward_path, 'w') as backward_file, \
                open(label_path, 'w') as label_file:
            for line in lines:
                if line:
                    # line[0] goes to the forward file, line[1] to the backward
                    # file (both space-joined), line[2] to the label file.
                    label_file.write(f'{line[2]}\n')
                    forward_file.write(f'{" ".join(line[0])}\n')
                    backward_file.write(f'{" ".join(line[1])}\n')
                else:
                    # A falsy entry becomes an empty line in all three files,
                    # which keeps them aligned line by line.
                    label_file.write('\n')
                    forward_file.write('\n')
                    backward_file.write('\n')
def run(dataset, repr, threshold):
    """Compute logging statistics and record the projects that should be ignored.

    Statistics are obtained from ``calc_stats`` for the directory
    ``<DEFAULT_PARSED_DATASETS_DIR>/<dataset>/<CLASSIFICATION_DIR>/<CLASSIFICATION_TYPE>/<repr>``.
    The projects it reports as ignored are logged and passed to ``dump_list``
    together with the path
    ``<DEFAULT_PARSED_DATASETS_DIR>/<dataset>/<CLASSIFICATION_DIR>/<IGNORED_PROJECTS_FILE_NAME>.<threshold>``.

    :param dataset: name of the dataset directory inside ``DEFAULT_PARSED_DATASETS_DIR``.
    :param repr: representation id; checked with
        ``PrepConfig.assert_classification_config`` first.
    :param threshold: percentage forwarded to ``calc_stats``; also used as the
        suffix of the output file name.
    """
    PrepConfig.assert_classification_config(repr)

    classification_root = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset,
                                       CLASSIFICATION_DIR)
    stats_dir = os.path.join(classification_root, CLASSIFICATION_TYPE, repr)

    logger.info(f"Getting stats for {stats_dir}")
    logger.info(
        "Ignoring projects where the percentage of file that contain logging "
        f"is less than {threshold} %"
    )

    ignored_projects, stats_summary = calc_stats(stats_dir, threshold)

    # One log record per ignored project, then a blank record and the summary.
    for index, project in enumerate(ignored_projects):
        logger.info(f"{index}: {project}")
    logger.info("")
    logger.info(stats_summary)

    ignored_list_path = os.path.join(
        classification_root, f"{IGNORED_PROJECTS_FILE_NAME}.{threshold}")
    dump_list(ignored_projects, ignored_list_path)

    logger.info(
        f"Ignored files with threshold {threshold} % "
        f"were written to {ignored_list_path}"
    )
    logger.info(f"Total ignored projects: {len(ignored_projects)}")
def _load_pretrained_weights(fs, rnn_learner, base_model, pretraining) -> None:
    """Best-effort initialisation of ``rnn_learner`` from ``base_model``.

    Loading failures are logged as warnings and otherwise ignored, so training
    proceeds from scratch instead of aborting.
    """
    if pretraining == PretrainingType.FULL:
        try:
            logger.info(f'Trying to load base classifier: {base_model}')
            fs.load_base_model(rnn_learner)
            logger.info('Base classifier model is loaded.')
        except Exception as e:
            # Deliberately broad: any failure falls back to training from scratch.
            logger.warning(e)
            logger.warning(
                'Base classifier model not loaded. Training from scratch')
    elif pretraining == PretrainingType.ONLY_ENCODER:
        try:
            logger.info(f'Trying to load pretrained LM: {base_model}')
            # TODO its a dirty hack. fix it
            fs.lm_cl_pretraining = True
            fs.load_pretrained_langmodel(rnn_learner)
            logger.info("Using pretrained LM")
        except Exception as e:
            # Deliberately broad: any failure falls back to training from scratch.
            logger.warning(e)
            logger.warning('Pretrained LM not loaded. Training from scratch')
    else:
        logger.info("No pretraining. Training classifier from scratch.")


def run_on_device(config: ClassifierConfig, force_rerun: bool) -> None:
    """Train a classifier described by ``config`` and write sample test runs.

    Steps, in order: validate the configuration, set up the model directory
    and logging, build the learner, skip training if the same model already
    exists (unless ``force_rerun``), optionally initialise from a base model,
    save the training config, train, and finally write sample predictions to
    ``test_runs.out`` inside the model directory.

    :param config: classifier configuration; ``base_model`` and
        ``pretraining_type`` must be either both set or both unset.
    :param force_rerun: retrain even if ``fs.best_model_exists`` reports that
        the same model has already been trained.
    :raises ValueError: if exactly one of ``config.base_model`` and
        ``config.pretraining_type`` is set.
    """
    base_model = config.base_model
    pretraining = config.pretraining_type

    PrepConfig.assert_classification_config(config.data.repr)

    if bool(base_model) != bool(pretraining):
        raise ValueError(
            'Base model and pretraining_type params must be both set or both unset!'
        )

    fs = FS.for_classifier(config.data.dataset, config.data.repr,
                           base_model=base_model,
                           pretraining=pretraining,
                           classification_type=config.classification_type)
    fs.create_path_to_model(config.data, config.training_config)
    attach_dataset_aware_handlers_to_loggers(fs.path_to_model, 'main.log')

    print_gpu_info()

    text_field = fs.load_text_field()
    rnn_learner = create_nn_architecture(fs, text_field, LEVEL_LABEL,
                                         config.data, config.arch,
                                         config.min_log_coverage_percent)
    logger.info(rnn_learner)

    same_model_exists = fs.best_model_exists(rnn_learner)
    if same_model_exists and not force_rerun:
        # The trailing space keeps the two sentences from running together
        # when the adjacent literals are concatenated.
        logger.info(
            f'Model {fs.path_to_classification_model} already trained. Not rerunning training. '
            'To retrain the model with these parameters, specify --force-rerun flag'
        )
        return
    if same_model_exists:
        logger.info(
            f"Model {fs.path_to_classification_model} already trained. Forcing rerun."
        )

    _load_pretrained_weights(fs, rnn_learner, base_model, pretraining)

    config_manager.save_config(config.training_config, fs.path_to_model)

    train(fs, rnn_learner, config.training, config.metrics)

    model = rnn_learner.model
    to_test_mode(model)

    sample_test_runs_file = os.path.join(fs.path_to_model, 'test_runs.out')
    # 6 predictions are requested for the 'level' classification type, 2 otherwise.
    n_predictions = 6 if config.classification_type == 'level' else 2
    show_tests(fs.test_path, model, text_field, sample_test_runs_file,
               config.data.backwards, n_predictions, config.testing.n_samples)
    logger.info("Classifier training finished successfully.")