Example #1
def to_repr(prep_config: PrepConfig, token_list: List, n_gramm_splitting_config: Optional[NgramSplitConfig] = None):
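    """Convert a list of parsed tokens to its representation.

    The representation is driven by prep_config; when no n-gram splitting
    config is passed explicitly, the global one is used as a fallback.
    """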
    types_to_be_repr = get_types_to_be_repr(prep_config)
    splitting_config = n_gramm_splitting_config or get_global_n_gramm_splitting_config()
    dict_based_non_eng = (prep_config.get_param_value(PrepParam.EN_ONLY) != 3)
    lowercase = (prep_config.get_param_value(PrepParam.CAPS) == 1)
    repr_list = to_repr_list(token_list, ReprConfig(types_to_be_repr, splitting_config, dict_based_non_eng, lowercase))
    return repr_list
Example #2
def run(dataset: str, repr: str, classifier: str):
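    """Turn each '.parsed.repr' file of a dataset into classifier inputs.

    For every source file, three sibling files are written under dest_dir:
    forward contexts, backward contexts, and labels (one line per case).
    """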
    from logrec.classifier.context_datasets import ContextsDataset

    PrepConfig.assert_classification_config(repr)

    path_to_dataset = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset)
    full_src_dir = os.path.join(path_to_dataset, REPR_DIR, repr)
    dest_dir = os.path.join(path_to_dataset, CLASSIFICATION_DIR, classifier,
                            repr)
    logger.info(f"Writing to {dest_dir}")

    os.makedirs(os.path.join(dest_dir, TRAIN_DIR), exist_ok=True)
    os.makedirs(os.path.join(dest_dir, TEST_DIR), exist_ok=True)
    os.makedirs(os.path.join(dest_dir, VALID_DIR), exist_ok=True)

    total_files = sum(
        file_mapper(full_src_dir, lambda f: 1,
                    lambda fi: fi.endswith("parsed.repr")))
    count = 0

    cases_creator = get_cases_creator(classifier)
    for lines, rel_path in file_mapper(full_src_dir, cases_creator,
                                       lambda fi: fi.endswith("parsed.repr")):
        count += 1
        logger.info(f"Processing {count} out of {total_files}")
        forward_path = os.path.join(
            dest_dir,
            re.sub(r"parsed\.repr", ContextsDataset.FW_CONTEXTS_FILE_EXT,
                   rel_path))
        backward_path = os.path.join(
            dest_dir,
            re.sub(r"parsed\.repr", ContextsDataset.BW_CONTEXTS_FILE_EXT,
                   rel_path))
        label_path = os.path.join(
            dest_dir,
            re.sub(r"parsed\.repr", ContextsDataset.LABEL_FILE_EXT, rel_path))
        with open(forward_path, 'w') as f, \
                open(backward_path, 'w') as b, \
                open(label_path, 'w') as l:
            for line in lines:
                if line:
                    l.write(f'{line[2]}\n')
                    f.write(f'{" ".join(line[0])}\n')
                    b.write(f'{" ".join(line[1])}\n')
                else:
                    l.write('\n')
                    f.write('\n')
                    b.write('\n')
Example #3
    def test_to_repr_2_nosep(self):
        prep_config = PrepConfig({
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 2,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1
        })

        ngramSplittingConfig = NgramSplitConfig(
            splitting_type=NgramSplittingType.ONLY_NUMBERS)

        actual = to_repr(prep_config, tokens, ngramSplittingConfig)

        expected = [
            pl['word_start'], '1', '.', '1', pl['word_end'], '*',
            pl['non_eng'], '"', pl['word_start'], pl['capitals'], 'a',
            pl['capital'], pl['non_eng'], pl['word_end'], '"', '/*',
            pl['non_eng'], pl['word_start'], pl['non_eng'], '_', 'english',
            pl['word_end'], '*/', '//', pl['word_start'], pl['capitals'],
            pl['non_eng'], '8', pl['word_end'], pl['olc_end']
        ]

        self.assertEqual(expected, actual)
Example #4
def preprocess(s, r):
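    """Parse the source string s and return its representation under the
    encoded prep config r; the global splitting config is initialized as a
    side effect.
    """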
    parsed = apply_preprocessors(from_string(s), pp_params["preprocessors"],
                                 {'interesting_context_words': []})
    params = PrepConfig.from_encoded_string(r)
    init_splitting_config(DEFAULT_DATASET, params, DEFAULT_BPE_BASE_REPR,
                          DEFAULT_BPE_N_MERGES, None)
    return to_repr(params, parsed)
Example #5
    def test_log_no_mark_logs(self):
        prep_config = PrepConfig({
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 1,
            PrepParam.TABS_NEWLINES: 0,
            PrepParam.MARK_LOGS: 0,
            PrepParam.CAPS: 1
        })

        ngramSplittingConfig = NgramSplitConfig()

        tokens = [
            LogStatement(
                SplitContainer.from_single_token('LOGGER'),
                SplitContainer.from_single_token('Info'), INFO,
                [StringLiteral([SplitContainer.from_single_token("Hi")])])
        ]

        actual = to_repr(prep_config, tokens, ngramSplittingConfig)

        expected = [
            pl['capitals'], 'logger', '.', pl['capital'], 'info', '(', '"',
            pl['capital'], 'hi', '"', ')', ';'
        ]

        self.assertEqual(expected, actual)
Example #6
    def test_to_repr_no_no_sep_with_bpe_no_merges(self):
        prep_config = PrepConfig({
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 4,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1
        })

        ngramSplittingConfig = NgramSplitConfig(
            splitting_type=NgramSplittingType.BPE, merges=[], merges_cache={})

        actual = to_repr(prep_config, tokens, ngramSplittingConfig)

        expected = [
            pl['word_start'], '1', '.', '1', pl['word_end'], '*',
            pl['non_eng'], '"', pl['word_start'], pl['capitals'], 'a',
            pl['capital'], pl['non_eng'], pl['word_end'], '"', '/*',
            pl['non_eng'], pl['word_start'], pl['non_eng'], '_', 'e', 'n', 'g',
            'l', 'i', 's', 'h', pl['word_end'], '*/', '//', pl['word_start'],
            pl['capitals'], pl['non_eng'], '8', pl['word_end'], pl['olc_end']
        ]

        self.assertEqual(expected, actual)
Example #7
    def test_to_repr_with_non_eng(self):
        prep_config = PrepConfig({
            PrepParam.EN_ONLY: 0,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 3,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1
        })

        ngramSplittingConfig = NgramSplitConfig(
            splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
            sc_splittings={
                'english': ['engl', 'ish'],
                'dieselbe': ['die', 'selbe']
            })

        actual = to_repr(prep_config, tokens, ngramSplittingConfig)

        expected = [
            pl['word_start'], '1', '.', '1', pl['word_end'], '*', 'dinero',
            '"', pl['word_start'], pl['capitals'], 'a', pl['capital'],
            'wirklich', pl['word_end'], '"', '/*', 'ц', pl['word_start'],
            'blanco', '_', 'engl', 'ish', pl['word_end'], '*/', '//',
            pl['word_start'], pl['capitals'], 'die', 'selbe', '8',
            pl['word_end'], pl['olc_end']
        ]

        self.assertEqual(expected, actual)
Example #8
    def test_to_repr_with_enonlycontents(self):
        prep_config = PrepConfig({
            PrepParam.EN_ONLY: 2,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 3,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1
        })

        ngramSplittingConfig = NgramSplitConfig(
            splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
            sc_splittings={})

        tokens = [
            Number([1, DecimalPoint(), 1]), "*",
            SplitContainer([NonEng(Word.from_("dinero"))]),
            StringLiteral([
                NonEng(Word.from_("ich")),
                NonEng(Word.from_("weiss")),
                NonEng(Word.from_("nicht")),
                NonEng(Word.from_("was")),
                NonEng(Word.from_("soll")),
                NonEng(Word.from_("es")),
                NonEng(Word.from_("bedeuten")),
                NonEng(Word.from_("dass")),
                NonEng(Word.from_("ich")),
                NonEng(Word.from_("so")),
                NonEng(Word.from_("traurig")),
                NonEng(Word.from_("bin")),
            ]),
            NewLine(),
            MultilineComment([
                SplitContainer([NonEng(Word.from_('ц'))]),
                SplitContainer([
                    NonEng(Word.from_("blanco")),
                    Underscore(),
                    Word.from_("english")
                ])
            ]),
            NewLine(),
            Tab(),
            OneLineComment([
                SplitContainer(
                    [NonEng(Word.from_("DIESELBE")),
                     Word.from_("8")])
            ])
        ]

        actual = to_repr(prep_config, tokens, ngramSplittingConfig)

        expected = [
            pl['word_start'], '1', '.', '1', pl['word_end'], '*',
            pl['non_eng'], '"', pl['non_eng_content'], '"', '/*',
            pl['non_eng'], pl['word_start'], pl['non_eng'], '_', 'english',
            pl['word_end'], '*/', '//', pl['word_start'], pl['capitals'],
            pl['non_eng'], '8', pl['word_end'], pl['olc_end']
        ]

        self.assertEqual(expected, actual)
Example #9
    def test_merges_no_cache(self):
        prep_config = PrepConfig({
            PrepParam.EN_ONLY: 0,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 4,
            PrepParam.TABS_NEWLINES: 0,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1
        })

        ngramSplittingConfig = NgramSplitConfig(
            splitting_type=NgramSplittingType.BPE,
            merges={('w', 'h'): 0},
            merges_cache={})

        tokens = [SplitContainer.from_single_token("While")]

        actual = to_repr(prep_config, tokens, ngramSplittingConfig)

        expected = [
            pl['word_start'], pl['capital'], 'wh', 'i', 'l', 'e',
            pl['word_end']
        ]

        self.assertEqual(expected, actual)
Example #10
def calc_stats_for_prepconfig(prepconfig,
                              lang_checker,
                              token_list,
                              include_sample=False):
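    """Convert token_list to the representation given by the encoded prep
    config and compute language statistics over the resulting tokens.
    """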
    repr = to_token_list(
        to_repr(PrepConfig.from_encoded_string(prepconfig), token_list,
                NgramSplitConfig())).split(' ')
    return lang_checker.calc_lang_stats(repr, include_sample=include_sample)
Example #11
    def test(self):
        for input_str, output_tuple in test_cases.items():
            parsed = apply_preprocessors(from_string(input_str), pp_params["preprocessors"], {})

            self.assertEqual(output_tuple[0], parsed)

            repred = to_repr(PrepConfig.from_encoded_string('104111'), parsed, ngram_split_config)

            self.assertEqual(output_tuple[1], repred)
Example #12
def run(dataset: str, preprocessing_params: str, bpe_base_repr: Optional[str],
        bpe_n_merges: Optional[int], splitting_file: Optional[str]):
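    """Preprocess every parsed file of a dataset into its representation.

    Results (plus a 'preprocessing_types.json') are written to the dataset's
    repr directory; files are handled by a worker pool with progress and ETA
    logging.
    """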
    path_to_dataset = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset)
    full_src_dir = os.path.join(path_to_dataset, PARSED_DIR)

    if not os.path.exists(full_src_dir):
        logger.error(f"Dir does not exist: {full_src_dir}")
        exit(3)
    logger.info(f"Reading parsed files from: {os.path.abspath(full_src_dir)}")

    preprocessing_params = PrepConfig.from_encoded_string(preprocessing_params)
    init_splitting_config(dataset, preprocessing_params, bpe_base_repr,
                          bpe_n_merges, splitting_file)

    repr = str(preprocessing_params)

    full_dest_dir = os.path.join(path_to_dataset, REPR_EXTENSION, repr)
    full_metadata_dir = os.path.join(path_to_dataset, METADATA_DIR, repr)
    logger.info(
        f"Writing preprocessed files to {os.path.abspath(full_dest_dir)}")
    if not os.path.exists(full_dest_dir):
        os.makedirs(full_dest_dir)
    if not os.path.exists(full_metadata_dir):
        os.makedirs(full_metadata_dir)

    with open(os.path.join(full_dest_dir, 'preprocessing_types.json'),
              "w") as f:
        json_str = jsons.dumps(preprocessing_params)
        f.write(json_str)

    params = []
    for root, dirs, files in os.walk(full_src_dir):
        for file in files:
            if file.endswith(f".{PARSED_FILE_EXTENSION}"):

                full_dest_dir_with_sub_dir = os.path.join(
                    full_dest_dir, os.path.relpath(root, full_src_dir))
                if not os.path.exists(full_dest_dir_with_sub_dir):
                    os.makedirs(full_dest_dir_with_sub_dir)
                params.append((os.path.join(root, file),
                               os.path.join(full_dest_dir_with_sub_dir,
                                            file), preprocessing_params))
    files_total = len(params)
    current_file = 0
    start_time = time.time()
    with Pool() as pool:
        it = pool.imap_unordered(preprocess_and_write, params)
        for _ in it:
            current_file += 1
            logger.info(f"Processed {current_file} out of {files_total}")
            time_elapsed = time.time() - start_time
            logger.info(
                f"Time elapsed: {time_elapsed:.2f} s, estimated time until completion: "
                f"{time_elapsed / current_file * files_total - time_elapsed:.2f} s"
            )
Example #13
    def test_both_enonly_and_nosplit(self):
        with self.assertRaises(ValueError):
            prep_config = PrepConfig({
                PrepParam.EN_ONLY: 1,
                PrepParam.COM_STR: 0,
                PrepParam.SPLIT: 0,
                PrepParam.TABS_NEWLINES: 1,
                PrepParam.MARK_LOGS: 1,
                PrepParam.CAPS: 1
            })
            to_repr(prep_config, [], NgramSplitConfig())
Example #14
def run(dataset, repr, threshold):
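    """Compute logging stats for a classification dataset and write the list
    of projects to ignore (those whose share of files containing logging is
    below the threshold) to a file.
    """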
    PrepConfig.assert_classification_config(repr)

    path_to_classification = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset,
                                          CLASSIFICATION_DIR)
    dest_dir = os.path.join(path_to_classification, CLASSIFICATION_TYPE, repr)

    logger.info(f"Getting stats for {dest_dir}")
    logger.info(
        f"Ignoring projects where the percentage of files that contain logging is less than {threshold} %"
    )
    projects_to_ignore, logged_stats = calc_stats(dest_dir, threshold)
    for i, p in enumerate(projects_to_ignore):
        logger.info(f"{i}: {p}")
    logger.info("")
    logger.info(logged_stats)
    output_file_path = os.path.join(
        path_to_classification, f"{IGNORED_PROJECTS_FILE_NAME}.{threshold}")
    dump_list(projects_to_ignore, output_file_path)
    logger.info(
        f"The list of projects ignored with threshold {threshold} % was written to {output_file_path}"
    )
    logger.info(f"Total ignored projects: {len(projects_to_ignore)}")
Example #15
    def test_to_repr_0(self):
        prep_config = PrepConfig({
            PrepParam.EN_ONLY: 0,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 0,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 0
        })

        actual = to_repr(prep_config, tokens, NgramSplitConfig())

        expected = [
            '1.1', '*', 'dinero', '"', 'AWirklich', '"', '/*', 'ц',
            'blanco_english', '*/', '//', 'DIESELBE8', pl['olc_end']
        ]

        self.assertEqual(expected, actual)
Example #16
def init_splitting_config(dataset: str, prep_config: PrepConfig,
                          bpe_base_repr: Optional[str],
                          bpe_n_merges: Optional[int],
                          splitting_file: Optional[str]):
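    """Initialize the module-level n-gram splitting config from the prep config.

    SPLIT values 4-9 select BPE (merges and the merges cache are read from
    the base repr's metadata; value 9 additionally requires --bpe-n-merges),
    value 3 selects custom splittings loaded from splitting_file, and value 2
    selects number-only splitting.
    """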
    global global_n_gramm_splitting_config
    global_n_gramm_splitting_config = NgramSplitConfig()
    if prep_config.get_param_value(PrepParam.SPLIT) in [4, 5, 6, 7, 8, 9]:
        if not bpe_base_repr:
            bpe_base_repr = prep_config.get_base_bpe_prep_config()

        if prep_config.get_param_value(PrepParam.SPLIT) == 9:
            if not bpe_n_merges:
                raise ValueError(
                    "--bpe-n-merges must be specified for repr **9**")
        else:
            bpe_n_merges_dict = {4: 5000, 5: 1000, 6: 10000, 7: 20000, 8: 0}
            bpe_n_merges = bpe_n_merges_dict[prep_config.get_param_value(
                PrepParam.SPLIT)]

        if "/" not in bpe_base_repr:
            bpe_base_dataset = dataset
        else:
            bpe_base_dataset, bpe_base_repr = bpe_base_repr.split("/")
        logger.info(f'Using bpe base dataset: {bpe_base_dataset}')
        logger.info(f'Using bpe base repr: {bpe_base_repr}')
        logger.info(f'Using bpe_n_merges: {bpe_n_merges}')
        path_to_merges_dir = os.path.join(DEFAULT_PARSED_DATASETS_DIR,
                                          bpe_base_dataset, METADATA_DIR,
                                          bpe_base_repr, BPE_DIR,
                                          str(bpe_n_merges))
        bpe_merges_file = os.path.join(path_to_merges_dir, 'merges.txt')
        bpe_merges_cache = os.path.join(path_to_merges_dir, 'merges_cache.txt')

        global_n_gramm_splitting_config.merges_cache = read_dict_from_2_columns(
            bpe_merges_cache, val_type=list)
        global_n_gramm_splitting_config.merges = read_merges(bpe_merges_file)
        global_n_gramm_splitting_config.set_splitting_type(
            NgramSplittingType.BPE)
    elif prep_config.get_param_value(PrepParam.SPLIT) == 3:
        if not splitting_file:
            raise ValueError("--splitting-file must be specified")

        splittings = read_dict_from_2_columns(splitting_file,
                                              val_type=list,
                                              delim='|')
        global_n_gramm_splitting_config.sc_splittings = splittings
        global_n_gramm_splitting_config.set_splitting_type(
            NgramSplittingType.NUMBERS_AND_CUSTOM)
    elif prep_config.get_param_value(PrepParam.SPLIT) == 2:
        global_n_gramm_splitting_config.set_splitting_type(
            NgramSplittingType.ONLY_NUMBERS)
Example #17
def run(dataset: str, preprocessing_params: str, bpe_base_repr: Optional[str],
        bpe_n_merges: Optional[int], splitting_file: Optional[str], merges_file):
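    """Variant of the repr-generation run that also takes a merges file; the
    destination directory name encodes both the number of BPE merges and the
    merges file name.
    """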
    path_to_dataset = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset)
    full_src_dir = os.path.join(path_to_dataset, PARSED_DIR)

    if not os.path.exists(full_src_dir):
        logger.error(f"Dir does not exist: {full_src_dir}")
        exit(3)
    logger.info(f"Reading parsed files from: {os.path.abspath(full_src_dir)}")

    preprocessing_params = PrepConfig.from_encoded_string(preprocessing_params)
    init_splitting_config(dataset, preprocessing_params, bpe_base_repr, bpe_n_merges, splitting_file, merges_file)

    repr = str(preprocessing_params)

    full_dest_dir = os.path.join(path_to_dataset, REPR_EXTENSION, f'{repr}_{bpe_n_merges if bpe_n_merges else ""}_{os.path.basename(merges_file)}')
    full_metadata_dir = os.path.join(path_to_dataset, METADATA_DIR, repr)
    logger.info(f"Writing preprocessed files to {os.path.abspath(full_dest_dir)}")
    if not os.path.exists(full_dest_dir):
        os.makedirs(full_dest_dir)
    if not os.path.exists(full_metadata_dir):
        os.makedirs(full_metadata_dir)

    with open(os.path.join(full_dest_dir, 'preprocessing_types.json'), "w") as f:
        json_str = jsons.dumps(preprocessing_params)
        f.write(json_str)

    params = []
    for root, dirs, files in os.walk(full_src_dir):
        for file in files:
            if file.endswith(f".{PARSED_FILE_EXTENSION}"):

                full_dest_dir_with_sub_dir = os.path.join(full_dest_dir, os.path.relpath(root, full_src_dir))
                if not os.path.exists(full_dest_dir_with_sub_dir):
                    os.makedirs(full_dest_dir_with_sub_dir)
                params.append((os.path.join(root, file),
                               os.path.join(full_dest_dir_with_sub_dir, file),
                               preprocessing_params))
    files_total = len(params)
    with Pool() as pool:
        it = pool.imap_unordered(preprocess_and_write, params)
        for _ in tqdm(it, total=files_total):
            pass
Example #18
def gen():
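    """Read identifiers from path_to_file and write a CSV to path_to_file_out:
    one row per prep config, containing the config, its human-readable
    parameter values, and the preprocessed form of each identifier.
    """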
    with open(path_to_file, 'r') as f:
        identifiers = [line.rstrip('\n') for line in f]

    csv_lines = [
        DELIMITER.join(["config"] + [p for p in PrepParam] + identifiers)
    ]
    for prep in prep_configs:
        csv_line = [prep]
        for p in PrepParam:
            csv_line.append(PrepConfig.human_readable_values[p][
                PrepConfig.from_encoded_string(prep).get_param_value(p)])
        for identifier in identifiers:
            tokens = preprocess(identifier, prep)
            csv_line.append(' '.join(tokens))
        csv_lines.append(DELIMITER.join(csv_line))

    with open(path_to_file_out, 'w') as f:
        for line in csv_lines:
            f.write(f'{line}\n')
Example #19
    def test_to_repr_1_nosep(self):
        prep_config = PrepConfig({
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 1,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1
        })

        actual = to_repr(prep_config, tokens, NgramSplitConfig())

        expected = [
            '1.1', '*', pl['non_eng'], '"', pl['word_start'], pl['capitals'],
            'a', pl['capital'], pl['non_eng'], pl['word_end'], '"', '/*',
            pl['non_eng'], pl['word_start'], pl['non_eng'], '_', 'english',
            pl['word_end'], '*/', '//', pl['word_start'], pl['capitals'],
            pl['non_eng'], '8', pl['word_end'], pl['olc_end']
        ]

        self.assertEqual(expected, actual)
Example #20
    def test_to_repr_no_str_no_com(self):
        prep_config = PrepConfig({
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 2,
            PrepParam.SPLIT: 3,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1
        })

        ngramSplittingConfig = NgramSplitConfig(
            splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
            sc_splittings={'english': ['engl', 'ish']})

        actual = to_repr(prep_config, tokens, ngramSplittingConfig)

        expected = [
            pl['word_start'], '1', '.', '1', pl['word_end'], '*',
            pl['non_eng'], pl['string_literal'], pl['comment'], pl['comment']
        ]

        self.assertEqual(expected, actual)
Example #21
    def test_1(self):
        prep_config = PrepConfig({
            PrepParam.EN_ONLY: 0,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 4,
            PrepParam.TABS_NEWLINES: 0,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1
        })

        ngramSplittingConfig = NgramSplitConfig(
            splitting_type=NgramSplittingType.BPE,
            merges_cache={'while': ['while']})

        tokens = [SplitContainer.from_single_token("While")]

        actual = to_repr(prep_config, tokens, ngramSplittingConfig)

        expected = [pl['capital'], 'while']

        self.assertEqual(expected, actual)
Example #22
def run_on_device(config: ClassifierConfig, force_rerun: bool) -> None:
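    """Train a classifier, optionally on top of a pretrained model.

    Depending on pretraining_type, loads a full base classifier or only a
    pretrained LM encoder, trains, and writes sample test runs to a file.
    """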
    base_model = config.base_model
    pretraining = config.pretraining_type

    PrepConfig.assert_classification_config(config.data.repr)

    if bool(base_model) != bool(pretraining):
        raise ValueError(
            'Base model and pretraining_type params must be both set or both unset!'
        )

    fs = FS.for_classifier(config.data.dataset,
                           config.data.repr,
                           base_model=base_model,
                           pretraining=pretraining,
                           classification_type=config.classification_type)

    fs.create_path_to_model(config.data, config.training_config)
    attach_dataset_aware_handlers_to_loggers(fs.path_to_model, 'main.log')

    print_gpu_info()

    text_field = fs.load_text_field()

    rnn_learner = create_nn_architecture(fs, text_field, LEVEL_LABEL,
                                         config.data, config.arch,
                                         config.min_log_coverage_percent)
    logger.info(rnn_learner)

    same_model_exists = fs.best_model_exists(rnn_learner)
    if same_model_exists and not force_rerun:
        logger.info(
            f'Model {fs.path_to_classification_model} already trained. Not rerunning training. '
            f'To retrain the model with these parameters, specify the --force-rerun flag'
        )
        return
    elif same_model_exists:
        logger.info(
            f"Model {fs.path_to_classification_model} already trained. Forcing rerun."
        )

    if pretraining == PretrainingType.FULL:
        try:
            logger.info(f'Trying to load base classifier: {base_model}')
            fs.load_base_model(rnn_learner)
            logger.info('Base classifier model is loaded.')
        except Exception as e:
            logger.warning(e)
            logger.warning(
                'Base classifier model not loaded. Training from scratch')

    elif pretraining == PretrainingType.ONLY_ENCODER:
        try:
            logger.info(f'Trying to load pretrained LM: {base_model}')
            # TODO: this is a dirty hack; fix it
            fs.lm_cl_pretraining = True
            fs.load_pretrained_langmodel(rnn_learner)
            logger.info("Using pretrained LM")
        except Exception as e:
            logger.warning(e)
            logger.warning('Pretrained LM not loaded. Training from scratch')
    else:
        logger.info("No pretraining. Training classifier from scratch.")

    config_manager.save_config(config.training_config, fs.path_to_model)

    train(fs, rnn_learner, config.training, config.metrics)

    model = rnn_learner.model

    to_test_mode(model)
    sample_test_runs_file = os.path.join(fs.path_to_model, 'test_runs.out')
    n_predictions = 6 if config.classification_type == 'level' else 2
    show_tests(fs.test_path, model, text_field, sample_test_runs_file,
               config.data.backwards, n_predictions, config.testing.n_samples)
    logger.info("Classifier training finished successfully.")
Example #23
def to_repr_l(lst):
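    """Shorthand: the representation of lst under the encoded prep config
    '000010' with a default n-gram splitting config."""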
    return to_repr(PrepConfig.from_encoded_string('000010'), lst,
                   NgramSplitConfig())