Example #1
def preprocess(s, r):
    # Parse the source string `s`, run the configured preprocessors, then
    # re-represent the result according to the encoded config string `r`.
    parsed = apply_preprocessors(from_string(s), pp_params["preprocessors"],
                                 {'interesting_context_words': []})
    params = PrepConfig.from_encoded_string(r)
    init_splitting_config(DEFAULT_DATASET, params, DEFAULT_BPE_BASE_REPR,
                          DEFAULT_BPE_N_MERGES, None)
    return to_repr(params, parsed)
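A minimal call sketch, assuming the helper's module-level names (apply_preprocessors, pp_params, PrepConfig, to_repr, and the DEFAULT_* constants) are importable; the source string is illustrative, and '104111' is the encoded config also used in Example #3:

# Hypothetical usage of the helper above.
tokens = preprocess('public void foo() {}', '104111')
print(' '.join(tokens))   # same join as in Example #6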
Example #2
def calc_stats_for_prepconfig(prepconfig,
                              lang_checker,
                              token_list,
                              include_sample=False):
    tokens = to_token_list(
        to_repr(PrepConfig.from_encoded_string(prepconfig), token_list,
                NgramSplitConfig())).split(' ')
    return lang_checker.calc_lang_stats(tokens, include_sample=include_sample)
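A hedged call sketch: the checker below is a stand-in that only implements the calc_lang_stats signature the function uses, and token_list stands for a previously parsed token list:

class StubLangChecker:
    # Minimal stand-in exposing the one method the function calls.
    def calc_lang_stats(self, tokens, include_sample=False):
        stats = {'n_tokens': len(tokens)}
        if include_sample:
            stats['sample'] = tokens[:5]
        return stats

stats = calc_stats_for_prepconfig('104111', StubLangChecker(),
                                  token_list, include_sample=True)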
Example #3
    # Method of a unittest-style test class; `test_cases` and
    # `ngram_split_config` are fixtures defined elsewhere in that class/module.
    def test(self):
        for input, output_tuple in test_cases.items():
            parsed = apply_preprocessors(from_string(input), pp_params["preprocessors"], {})

            self.assertEqual(output_tuple[0], parsed)

            repred = to_repr(PrepConfig.from_encoded_string('104111'), parsed, ngram_split_config)

            self.assertEqual(output_tuple[1], repred)
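An encoded string such as '104111' packs one value per PrepParam position. A short sketch, reusing the same PrepConfig lookups that Example #6 below relies on, to print a config in human-readable form:

# Decode an encoded config string into readable per-parameter values.
config = PrepConfig.from_encoded_string('104111')
for p in PrepParam:
    print(p, PrepConfig.human_readable_values[p][config.get_param_value(p)])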
Example #4
def run(dataset: str, preprocessing_params: str, bpe_base_repr: Optional[str],
        bpe_n_merges: Optional[int], splitting_file: Optional[str]):
    path_to_dataset = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset)
    full_src_dir = os.path.join(path_to_dataset, PARSED_DIR)

    if not os.path.exists(full_src_dir):
        logger.error(f"Dir does not exist: {full_src_dir}")
        exit(3)
    logger.info(f"Reading parsed files from: {os.path.abspath(full_src_dir)}")

    preprocessing_params = PrepConfig.from_encoded_string(preprocessing_params)
    init_splitting_config(dataset, preprocessing_params, bpe_base_repr,
                          bpe_n_merges, splitting_file)

    repr_name = str(preprocessing_params)

    full_dest_dir = os.path.join(path_to_dataset, REPR_EXTENSION, repr_name)
    full_metadata_dir = os.path.join(path_to_dataset, METADATA_DIR, repr_name)
    logger.info(
        f"Writing preprocessed files to {os.path.abspath(full_dest_dir)}")
    os.makedirs(full_dest_dir, exist_ok=True)
    os.makedirs(full_metadata_dir, exist_ok=True)

    with open(os.path.join(full_dest_dir, 'preprocessing_types.json'),
              "w") as f:
        json_str = jsons.dumps(preprocessing_params)
        f.write(json_str)

    params = []
    for root, dirs, files in os.walk(full_src_dir):
        for file in files:
            if file.endswith(f".{PARSED_FILE_EXTENSION}"):

                full_dest_dir_with_sub_dir = os.path.join(
                    full_dest_dir, os.path.relpath(root, full_src_dir))
                os.makedirs(full_dest_dir_with_sub_dir, exist_ok=True)
                params.append((os.path.join(root, file),
                               os.path.join(full_dest_dir_with_sub_dir,
                                            file), preprocessing_params))
    files_total = len(params)
    current_file = 0
    start_time = time.time()
    with Pool() as pool:
        it = pool.imap_unordered(preprocess_and_write, params)
        for _ in it:
            current_file += 1
            logger.info(f"Processed {current_file} out of {files_total}")
            time_elapsed = time.time() - start_time
            logger.info(
                f"Time elapsed: {time_elapsed:.2f} s, estimated time until completion: "
                f"{time_elapsed / current_file * files_total - time_elapsed:.2f} s"
            )
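A hedged invocation sketch; the dataset name is illustrative, and passing None for the three Optional parameters mirrors their types in the signature above:

# Hypothetical call: preprocess the parsed 'java-small' dataset with
# encoded config '104111' and no BPE or splitting-file settings.
run('java-small', '104111', bpe_base_repr=None, bpe_n_merges=None,
    splitting_file=None)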
Example #5
def run(dataset: str, preprocessing_params: str, bpe_base_repr: Optional[str],
        bpe_n_merges: Optional[int], splitting_file: Optional[str], merges_file):
    path_to_dataset = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset)
    full_src_dir = os.path.join(path_to_dataset, PARSED_DIR)

    if not os.path.exists(full_src_dir):
        logger.error(f"Dir does not exist: {full_src_dir}")
        exit(3)
    logger.info(f"Reading parsed files from: {os.path.abspath(full_src_dir)}")

    preprocessing_params = PrepConfig.from_encoded_string(preprocessing_params)
    init_splitting_config(dataset, preprocessing_params, bpe_base_repr, bpe_n_merges, splitting_file, merges_file)

    repr_name = str(preprocessing_params)

    dest_dir_name = f'{repr_name}_{bpe_n_merges if bpe_n_merges else ""}_{os.path.basename(merges_file)}'
    full_dest_dir = os.path.join(path_to_dataset, REPR_EXTENSION, dest_dir_name)
    full_metadata_dir = os.path.join(path_to_dataset, METADATA_DIR, repr_name)
    logger.info(f"Writing preprocessed files to {os.path.abspath(full_dest_dir)}")
    os.makedirs(full_dest_dir, exist_ok=True)
    os.makedirs(full_metadata_dir, exist_ok=True)

    with open(os.path.join(full_dest_dir, 'preprocessing_types.json'), "w") as f:
        json_str = jsons.dumps(preprocessing_params)
        f.write(json_str)

    params = []
    for root, dirs, files in os.walk(full_src_dir):
        for file in files:
            if file.endswith(f".{PARSED_FILE_EXTENSION}"):

                full_dest_dir_with_sub_dir = os.path.join(full_dest_dir, os.path.relpath(root, full_src_dir))
                os.makedirs(full_dest_dir_with_sub_dir, exist_ok=True)
                params.append((os.path.join(root, file),
                               os.path.join(full_dest_dir_with_sub_dir, file),
                               preprocessing_params))
    files_total = len(params)
    with Pool() as pool:
        it = pool.imap_unordered(preprocess_and_write, params)
        for _ in tqdm(it, total=files_total):
            pass
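Compared with Example #4, this variant adds a merges_file argument (folded into the destination directory name) and delegates progress reporting to tqdm instead of logging a running count and a hand-computed ETA.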
Example #6
def gen():
    # Build a CSV comparing, for each encoded config in `prep_configs`,
    # its human-readable parameter values and the preprocessed form of
    # every identifier read from `path_to_file`.
    with open(path_to_file, 'r') as f:
        identifiers = [line.rstrip('\n') for line in f]

    csv_lines = [
        DELIMITER.join(["config"] + [p for p in PrepParam] + identifiers)
    ]
    for prep in prep_configs:
        csv_line = [prep]
        for p in PrepParam:
            csv_line.append(PrepConfig.human_readable_values[p][
                PrepConfig.from_encoded_string(prep).get_param_value(p)])
        for identifier in identifiers:
            tokens = preprocess(identifier, prep)
            csv_line.append(' '.join(tokens))
        csv_lines.append(DELIMITER.join(csv_line))

    with open(path_to_file_out, 'w') as f:
        for line in csv_lines:
            f.write(f'{line}\n')
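gen() closes over module-level names rather than taking parameters; a hedged sketch of the setup it expects, with illustrative paths and configs:

# Illustrative values for the enclosing-scope names gen() reads.
DELIMITER = ','
path_to_file = 'identifiers.txt'          # one identifier per line
path_to_file_out = 'prep_comparison.csv'
prep_configs = ['104111', '000010']       # encoded PrepConfig strings
gen()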
Example #7
def to_repr_l(lst):
    return to_repr(PrepConfig.from_encoded_string('000010'), lst,
                   NgramSplitConfig())
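A brief usage sketch; `parsed` stands for an already-parsed token list, e.g. the output of apply_preprocessors in Example #1:

# Re-represent parsed tokens with the hard-wired '000010' config.
repred = to_repr_l(parsed)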