예제 #1
0
def to_repr(
    prep_config: PrepConfig,
    token_list: List[ParsedToken],
    bpe_data: Optional[BpeData] = None
) -> Tuple[List[str], PreprocessingMetadata]:
    """Turn parsed tokens into their preprocessed string representation.

    :param prep_config: configuration that supplies the repr config and
        tells whether BPE is in use.
    :param token_list: the parsed tokens to convert.
    :param bpe_data: BPE data to use; when falsy, whatever
        ``get_global_bpe_data_if_available()`` returns is used instead.
    :return: the list of string tokens and the accompanying metadata.
    """
    if not bpe_data:
        bpe_data = get_global_bpe_data_if_available()

    repr_config = prep_config.get_repr_config(bpe_data)
    string_tokens, metadata = to_repr_list(token_list, repr_config)

    if not prep_config.is_bpe():
        return string_tokens, metadata

    # For BPE configs the token list gets an extra pass before being returned.
    return insert_and_word_tokens(string_tokens, metadata), metadata
예제 #2
0
def preprocess_corpus(
        path: str,
        prep_config: PrepConfig,
        bpe_codes_id: Optional[str] = None,
        extensions: Optional[str] = None,
        output_path: Optional[str] = None,
        calc_vocab: Optional[bool] = False) -> PreprocessedCorpus:
    """Preprocess the corpus found at ``path`` and report where the result is.

    :param path: location of the corpus; converted with ``str()`` before use.
    :param prep_config: preprocessing configuration.
    :param bpe_codes_id: id of the BPE codes; must be truthy when
        ``prep_config.is_bpe()`` is true (checked with ``assert``).
    :param extensions: passed through to ``Dataset.create``.
    :param output_path: where the preprocessed dataset goes; the current
        working directory is used when this is falsy.
    :param calc_vocab: when truthy, the stages are run up to the vocabulary
        and the vocab file path is included in the result.
    :return: a ``PreprocessedCorpus`` built from the preprocessed sub-dataset
        and the vocab path (``None`` when ``calc_vocab`` is falsy).
    """
    if not output_path:
        output_path = os.getcwd()

    custom_bpe_config = None
    if prep_config.is_bpe():
        assert bpe_codes_id
        # Predefined ids need no custom config; any other id is resolved to one.
        custom_bpe_config = (None if is_predefined_id(bpe_codes_id)
                             else CustomBpeConfig.from_id(bpe_codes_id))

    dataset = Dataset.create(
        str(path),
        prep_config,
        extensions,
        custom_bpe_config,
        overriden_path_to_prep_dataset=output_path,
    )

    path_to_vocab = None
    if calc_vocab:
        stages.run_until_vocab(dataset, custom_bpe_config)
        path_to_vocab = dataset.path_to_vocab_file
    else:
        stages.run_until_preprocessing(dataset, custom_bpe_config)

    logger.info(
        f"Preprocessed dataset is ready at {dataset.preprocessed.path}")
    return PreprocessedCorpus(dataset.preprocessed, path_to_vocab)
예제 #3
0
def test_to_repr_no_str_no_com():
    """Check ``to_repr`` output when STR and COM are both set to '0'.

    Uses the module-level ``tokens`` fixture (defined outside this excerpt).
    The string literal and every comment token are expected to come out as
    single placeholders.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: '0',
        PrepParam.STR: '0',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    })

    actual, actual_metadata = to_repr(config, tokens)

    number_part = [pl['word_start'], '1', '.', '1', pl['word_end']]
    comment_part = [pl["comment"]] * 4
    expected = (number_part
                + ["*", pl['non_eng'], pl["string_literal"]]
                + comment_part)

    # Boundaries 0, 5, 6, ..., 12: the number takes five tokens, the rest one each.
    expected_metadata = PreprocessingMetadata(
        {'*'},
        word_boundaries=[0] + list(range(5, 13)),
        token_types=[Number, Operator, NonEng, StringLiteral]
                    + [MultilineComment] * 3
                    + [OneLineComment])

    assert expected == actual
    assert expected_metadata == actual_metadata
예제 #4
0
def test_to_repr_with_non_eng():
    """Check ``to_repr`` output for EN_ONLY='u' with SPLIT='2' and CASE='l'.

    Uses the module-level ``tokens`` fixture (defined outside this excerpt).
    The non-English words are expected to appear in the output rather than
    being replaced by the ``pl['non_eng']`` placeholder.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    })

    actual, actual_metadata = to_repr(config, tokens)

    number_part = [pl['word_start'], '1', '.', '1', pl['word_end']]
    string_part = [
        '"', pl['word_start'], pl['capitals'], 'a', pl['capital'], 'wirklicä',
        pl['word_end'], '"',
    ]
    multiline_comment_part = [
        '/', '*', 'ц', pl['word_start'], 'blanco', '_', 'english',
        pl['word_end'], '*', '/',
    ]
    one_line_comment_part = [
        '/', '/', pl['word_start'], pl['capitals'], 'dieselbe', "8",
        pl['word_end'], pl['olc_end'],
    ]
    expected = (number_part
                + ["*", 'übersetzen']
                + string_part
                + multiline_comment_part
                + one_line_comment_part)

    expected_boundaries = [0, 5, 6, 7, 8, 14, 15, 16, 17, 18,
                           23, 24, 25, 26, 27, 32, 33]
    expected_types = ([Number, Operator, SplitContainer]
                      + [StringLiteral] * 3
                      + [MultilineComment] * 6
                      + [OneLineComment] * 4)
    expected_metadata = PreprocessingMetadata({'*', '"', "/"},
                                              word_boundaries=expected_boundaries,
                                              token_types=expected_types)

    assert expected == actual
    assert expected_metadata == actual_metadata
예제 #5
0
def test_to_repr_1_nosep():
    """Check ``to_repr`` output for EN_ONLY='U' with SPLIT='1'.

    Uses the module-level ``tokens`` fixture (defined outside this excerpt).
    Every output token is expected to be a word of its own, and non-English
    words are expected to be replaced by ``pl['non_eng']``.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '1',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    })

    actual, actual_metadata = to_repr(config, tokens)

    non_eng = pl['non_eng']
    expected = (
        ['1.1', "*", non_eng]
        + ['"', non_eng, '"']
        + ['/', '*', non_eng, non_eng, '*', '/']
        + ['/', '/', non_eng, pl['olc_end']]
    )

    expected_types = ([Number, Operator, NonEng]
                      + [StringLiteral] * 3
                      + [MultilineComment] * 6
                      + [OneLineComment] * 4)
    # One boundary per token plus the closing one: 0..16.
    expected_metadata = PreprocessingMetadata({'*', '"', "/"},
                                              word_boundaries=list(range(16 + 1)),
                                              token_types=expected_types)

    assert expected == actual
    assert expected_metadata == actual_metadata
예제 #6
0
def test_to_repr_0_max_str_length_7():
    """Check ``to_repr`` output for SPLIT='0' with STR='7'.

    Uses the module-level ``tokens`` fixture (defined outside this excerpt).
    The expected output keeps only the two quote characters of the string
    literal, and they form a single word (boundary 4 is absent).
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '7',
        PrepParam.SPLIT: '0',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'u'
    })

    actual, actual_metadata = to_repr(config, tokens)

    expected = (
        ['1.1', "*", 'übersetzen']
        + ['"', '"']
        + ['/', '*', 'ц', 'blanco_english', '*', '/']
        + ['/', '/', "DIESELBE8", pl['olc_end']]
    )

    expected_types = ([Number, Operator, SplitContainer, StringLiteral]
                      + [MultilineComment] * 6
                      + [OneLineComment] * 4)
    # 0, 1, 2, 3 and then 5..15 -- the two quotes share one word.
    expected_metadata = PreprocessingMetadata(
        {'"', "*", "/"},
        word_boundaries=[0, 1, 2, 3] + list(range(5, 16)),
        token_types=expected_types)

    assert expected == actual
    assert expected_metadata == actual_metadata
예제 #7
0
def test_non_bpe_split_with_one_extension(get_timestamp_mock, os_exists_mock):
    """Check ``Dataset.create`` for a non-BPE config and a single extension.

    Verifies the stored path, config, normalized extension list, absent BPE
    configs, last-modified stamp, and the three sub-datasets.

    :param get_timestamp_mock: mock injected by a patch decorator outside this
        excerpt.
    :param os_exists_mock: mock injected by a patch decorator outside this
        excerpt.
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '0',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })

    actual = Dataset.create(PATH_TO_DATASET_STUB, prep_config, "java", None)

    assert PATH_TO_DATASET_STUB == actual._path
    assert prep_config == actual._prep_config
    assert ['java'] == actual._normalized_extension_list
    assert actual._custom_bpe_config is None
    assert actual._bpe_config is None
    assert '01_01_01' == actual._dataset_last_modified

    # These used to be written as `assert expected, actual`, which treats the
    # second operand as the failure message and therefore never compared
    # anything. They now perform a real equality check.
    assert SubDataset(actual, PATH_TO_DATASET_STUB, '') == actual._original
    assert SubDataset(
        actual, os.path.join(PARSED_DATASETS_DIR, 'dataset_01_01_01_java'),
        '.parsed') == actual._parsed
    assert SubDataset(
        actual,
        os.path.join(PREP_DATASETS_DIR, 'dataset_01_01_01_java_-_uc10su'),
        '.prep') == actual._preprocessed
예제 #8
0
def test_to_repr_0():
    """Check ``to_repr`` output for SPLIT='0' with STR='1' and CASE='u'.

    Uses the module-level ``tokens`` fixture (defined outside this excerpt).
    Every output token is expected to be a word of its own.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '0',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'u'
    })

    actual, actual_metadata = to_repr(config, tokens)

    expected = (
        ['1.1', "*", 'übersetzen']
        + ['"', 'AWirklicä', '"']
        + ['/', '*', 'ц', 'blanco_english', '*', '/']
        + ['/', '/', "DIESELBE8", pl['olc_end']]
    )

    expected_types = ([Number, Operator, SplitContainer]
                      + [StringLiteral] * 3
                      + [MultilineComment] * 6
                      + [OneLineComment] * 4)
    # One boundary per token plus the closing one: 0..16.
    expected_metadata = PreprocessingMetadata({'"', "*", "/"},
                                              word_boundaries=list(range(16 + 1)),
                                              token_types=expected_types)

    assert expected == actual
    assert expected_metadata == actual_metadata
예제 #9
0
def test_true_true_code_bytes(abspath_mock, bpe_learner_mock, dataset_mock):
    """Check the ``learn-bpe`` command line with ``--bytes`` and ``--word-end``.

    The mocks are injected by patch decorators outside this excerpt. The test
    asserts the arguments that ``Dataset.create`` and the BPE learner's
    ``run`` were last called with.
    """
    # given
    abspath_mock.return_value = PATH_TO_DATASET_STUB
    dataset_mock.create = Mock(spec=dataset_mock, return_value=dataset_mock)

    # when
    parse_and_run([
        'learn-bpe', '1000', '-p', PATH_TO_DATASET_STUB, '--bytes',
        '--word-end',
    ])

    # then
    expected_prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: '0',
        PrepParam.STR: 'E',
        PrepParam.SPLIT: 'F',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })
    expected_bpe_config = BpeConfig({
        BpeParam.CASE: 'yes',
        BpeParam.WORD_END: True,
        BpeParam.BASE: 'code',
        BpeParam.UNICODE: 'bytes',
    })
    dataset_mock.create.assert_called_with(
        PATH_TO_DATASET_STUB, expected_prep_config, None, None,
        expected_bpe_config)
    bpe_learner_mock.run.assert_called_with(
        dataset_mock, 1000, expected_bpe_config)
예제 #10
0
def test_to_repr_no_no_sep_with_bpe_no_merges():
    """Check ``to_repr`` output for SPLIT='4' with an empty merge list.

    Uses the module-level ``tokens`` fixture and ``cwe`` marker (both defined
    outside this excerpt). With no merges available, the words are expected
    to come out one character at a time.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'u'
    })
    empty_bpe_data = BpeData(merges_cache={}, merges=MergeList())

    actual, _ = to_repr(config, tokens, empty_bpe_data)

    expected = (
        ['1', '.', '1', cwe]
        + ["*" + cwe]
        + ['÷'] + list('bersetzen') + ['</t>']
        + ['"'] + list('AWirklic') + ['\xf7', '\xa0', '"', cwe]
        + ['/' + cwe, '*' + cwe, '\xf7', cwe]
        + list('blanco_english') + [cwe, '*' + cwe, '/' + cwe]
        + ['/' + cwe, '/' + cwe] + list('DIESELBE8') + [cwe]
        + [pl['olc_end'] + cwe]
    )

    assert expected == actual
예제 #11
0
def test_to_repr_no_nosep():
    """Check ``to_repr`` output for EN_ONLY='U' with SPLIT='2'.

    Uses the module-level ``tokens`` fixture (defined outside this excerpt).
    The number is expected to be wrapped in word start/end markers and the
    non-English words to be replaced by ``pl['non_eng']``.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    })

    actual, actual_metadata = to_repr(config, tokens)

    non_eng = pl['non_eng']
    expected = (
        [pl['word_start'], '1', '.', '1', pl['word_end']]
        + ["*", non_eng]
        + ['"', non_eng, '"']
        + ['/', '*', non_eng, non_eng, '*', '/']
        + ['/', '/', non_eng, pl['olc_end']]
    )

    expected_types = ([Number, Operator, NonEng]
                      + [StringLiteral] * 3
                      + [MultilineComment] * 6
                      + [OneLineComment] * 4)
    # Boundaries 0, 5, 6, ..., 20: the number takes five tokens, the rest one each.
    expected_metadata = PreprocessingMetadata(
        {'*', '"', "/"},
        word_boundaries=[0] + list(range(5, 21)),
        token_types=expected_types)

    assert expected == actual
    assert expected_metadata == actual_metadata
예제 #12
0
def init_bpe_data(prep_config: PrepConfig,
                  custom_bpe_config: Optional[CustomBpeConfig],
                  force_reinit: bool = True):
    """Populate the module-level ``global_bpe_data`` with merges and a cache.

    :param prep_config: supplies the SPLIT and CASE values used to locate the
        predefined merges files; only read when no custom config is given.
    :param custom_bpe_config: when truthy, merges and cache are read from the
        files it points to instead of the predefined ones.
    :param force_reinit: when false and global BPE data is already available,
        the function returns without touching it.
    :raises KeyError: when no custom config is given and the SPLIT value is
        not one of '4'..'8'.
    """
    global global_bpe_data

    if get_global_bpe_data_if_available() and not force_reinit:
        return  # already initialized

    global_bpe_data = BpeData()

    if not custom_bpe_config:
        split_to_n_merges = {
            '4': '5k',
            '5': '1k',
            '6': '10k',
            '7': '20k',
            '8': '0'
        }
        n_merges = split_to_n_merges[prep_config.get_param_value(
            PrepParam.SPLIT)]
        if prep_config.get_param_value(PrepParam.CASE) == 'u':
            case_dir = CASE_DIR
        else:
            case_dir = NO_CASE_DIR

        merges_file = os.path.join(DEFAULT_BPE_DIR, case_dir, n_merges,
                                   'merges.txt')
        cache_file = os.path.join(DEFAULT_BPE_CACHE_DIR, case_dir, n_merges,
                                  'merges_cache.txt')

        # The cache file is optional; start from an empty cache without it.
        global_bpe_data.merges_cache = (read_bpe_cache(cache_file)
                                        if os.path.exists(cache_file) else {})
        global_bpe_data.merges = read_merges(merges_file)
        return

    logger.info(f'Using bpe merges file: {custom_bpe_config.codes_file}')
    global_bpe_data.merges_cache = (
        read_bpe_cache(custom_bpe_config.cache_file)
        if custom_bpe_config.can_use_cache_file() else {})
    global_bpe_data.merges = read_merges(custom_bpe_config.codes_file,
                                         custom_bpe_config.n_merges)

    if custom_bpe_config.n_merges:
        logger.info(f'Using first {custom_bpe_config.n_merges} merges.')

    # Each non-BPE vocabulary entry is cached as mapping to itself.
    nonbpe_vocab = vocabloader.nonbpe(custom_bpe_config.merge_list_id)
    global_bpe_data.merges_cache.update({s: [s] for s in nonbpe_vocab})
예제 #13
0
def test_both_enonly_and_nosplit():
    """Expect ``ValueError`` for this parameter combination.

    Both the ``PrepConfig`` construction and the ``to_repr`` call sit inside
    the ``pytest.raises`` block, so either may be the one that raises.
    """
    params = {
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '0',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    }
    with pytest.raises(ValueError):
        to_repr(PrepConfig(params), [], BpeData())
예제 #14
0
def test_all_custom(get_timestamp_mock, os_exists_mock):
    """Check ``Dataset.create`` with every optional argument supplied.

    Covers a custom BPE config, a BPE config, two extensions and an
    overridden output path, then verifies the stored fields, the three
    sub-datasets and the derived paths.

    :param get_timestamp_mock: mock injected by a patch decorator outside this
        excerpt.
    :param os_exists_mock: mock injected by a patch decorator outside this
        excerpt.
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '0',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })
    bpe_config = BpeConfig({
        BpeParam.CASE: 'yes',
        BpeParam.WORD_END: False,
        BpeParam.BASE: "code",
        BpeParam.UNICODE: "no",
    })
    custom_bpe_config = CustomBpeConfig("id", 1000, "/codes/file",
                                        "/cache/file")

    dataset = Dataset.create(PATH_TO_DATASET_STUB,
                             prep_config,
                             "c|java",
                             custom_bpe_config,
                             bpe_config,
                             overriden_path_to_prep_dataset=OVERRIDDEN_PATH)

    # Plain fields stored on the dataset.
    assert PATH_TO_DATASET_STUB == dataset._path
    assert prep_config == dataset._prep_config
    assert ['c', 'java'] == dataset._normalized_extension_list
    assert custom_bpe_config == dataset._custom_bpe_config
    assert bpe_config == dataset._bpe_config
    assert '01_01_01' == dataset._dataset_last_modified

    # Sub-datasets.
    expected_original = SubDataset(dataset, PATH_TO_DATASET_STUB, '')
    assert expected_original == dataset.original

    expected_parsed = SubDataset(
        dataset,
        os.path.join(PARSED_DATASETS_DIR, 'dataset_01_01_01_c_java'),
        '.parsed')
    assert expected_parsed == dataset.parsed

    expected_preprocessed = SubDataset(
        dataset,
        os.path.join(OVERRIDDEN_PATH,
                     'dataset_01_01_01_c_java_-_uc10su_id-1000_-_prep'),
        '.prep')
    assert expected_preprocessed == dataset.preprocessed

    # Derived paths.
    expected_base_bpe_vocab_path = os.path.join(
        USER_CONFIG_DIR, VOCAB_DIR, 'dataset_01_01_01_c_java_-_U0EFsu')
    assert expected_base_bpe_vocab_path == dataset.base_bpe_vocab_path

    expected_bpe_path = os.path.join(
        USER_CONFIG_DIR, BPE_DIR, 'dataset_01_01_01_c_java_-_nounicode')
    assert expected_bpe_path == dataset.bpe_path

    expected_file_list_folder = os.path.join(
        USER_CACHE_DIR, 'file_lists', 'dataset_01_01_01_c_java')
    assert expected_file_list_folder == dataset.path_to_file_list_folder

    expected_vocab_path = os.path.join(
        USER_CONFIG_DIR, VOCAB_DIR,
        'dataset_01_01_01_c_java_-_uc10su_id-1000')
    assert expected_vocab_path == dataset.vocab_path
예제 #15
0
def test_xxxFsx(api_mock):
    """Check that ``nosplit ... --full-strings`` yields SPLIT='F'.

    :param api_mock: mock injected by a patch decorator outside this excerpt.
    """
    parse_and_run(['nosplit', 'str', '-e', 'java', '--full-strings'])

    expected_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: 'F',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })
    api_mock.text.preprocess.assert_called_with(
        "str", expected_config, None, extension="java")
예제 #16
0
def test_xxx1xu(api_mock):
    """Check that ``basic ... --no-spaces`` yields SPLIT='1', TABS_NEWLINES='0'.

    :param api_mock: mock injected by a patch decorator outside this excerpt.
    """
    parse_and_run(['basic', 'str', '-e', 'java', '--no-spaces'])

    expected_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '1',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'u'
    })
    api_mock.text.preprocess.assert_called_with(
        "str", expected_config, None, extension="java")
예제 #17
0
 def to_prep_config(self):
     """Build the ``PrepConfig`` that corresponds to this config.

     Only EN_ONLY depends on this object: it is 'U' when the UNICODE
     parameter equals 'no' and 'u' otherwise. All other values are fixed.
     """
     unicode_is_off = self.get_param_value(BpeParam.UNICODE) == 'no'
     params = {
         PrepParam.EN_ONLY: 'U' if unicode_is_off else 'u',
         PrepParam.COM: '0',
         PrepParam.STR: 'E',
         PrepParam.SPLIT: 'F',
         PrepParam.TABS_NEWLINES: 's',
         PrepParam.CASE: 'u',
     }
     return PrepConfig(params)
예제 #18
0
def test_all_short_config_options(api_mock):
    """Check that the combined short flags ``-0lSCU`` set every option.

    :param api_mock: mock injected by a patch decorator outside this excerpt.
    """
    parse_and_run(['basic', 'str', '-e', 'java', '-0lSCU'])

    expected_config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: '0',
        PrepParam.STR: '0',
        PrepParam.SPLIT: '1',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    })
    api_mock.text.preprocess.assert_called_with(
        "str", expected_config, None, extension="java")
예제 #19
0
def test_path_short(api_mock):
    """Check that ``-p`` routes the command to ``preprocess_corpus``.

    :param api_mock: mock injected by a patch decorator outside this excerpt.
    """
    parse_and_run(['nosplit', '-p', PATH_TO_DATASET_STUB, '--no-spaces'])

    expected_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '0',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'u'
    })
    api_mock.corpus.preprocess_corpus.assert_called_with(
        PATH_TO_DATASET_STUB,
        expected_config,
        None,
        calc_vocab=False,
        extensions=None,
        output_path=None)
예제 #20
0
파일: text.py 프로젝트: mir-am/codeprep
def preprocess(text: str, config: PrepConfig, bpe_codes_id: Optional[str] = None, extension: Optional[str] = None,
               return_metadata: bool = False, force_reinit_bpe_data: bool = True, append_eof: bool = False) \
        -> Union[List[str], Tuple[List[str], PreprocessingMetadata]]:
    """Parse ``text`` and convert it to its preprocessed representation.

    :param text: the text to preprocess.
    :param config: preprocessing configuration.
    :param bpe_codes_id: id of the BPE codes; must be truthy when
        ``config.is_bpe()`` is true (checked with ``assert``).
    :param extension: passed through to ``convert_text``.
    :param return_metadata: when true, the metadata is returned as well.
    :param force_reinit_bpe_data: passed through to ``init_bpe_data``.
    :param append_eof: when true, a ``SpecialToken`` for
        ``placeholders['ect']`` is appended to the parsed tokens.
    :return: the preprocessed tokens, or a ``(tokens, metadata)`` tuple when
        ``return_metadata`` is true.
    """
    parsed = remove_trailing_newline(list(convert_text(text, extension)))
    if append_eof:
        parsed.append(SpecialToken(placeholders['ect']))

    if config.is_bpe():
        assert bpe_codes_id
        # Predefined ids need no custom config; any other id is resolved to one.
        if is_predefined_id(bpe_codes_id):
            custom_bpe_config = None
        else:
            custom_bpe_config = CustomBpeConfig.from_id(bpe_codes_id)
        init_bpe_data(config, custom_bpe_config, force_reinit_bpe_data)

    prep_tokens, metadata = to_repr(config, parsed)
    return (prep_tokens, metadata) if return_metadata else prep_tokens
예제 #21
0
파일: impl.py 프로젝트: mir-am/codeprep
def create_prep_config_from_args(arguments: Dict) -> PrepConfig:
    """Translate parsed command-line arguments into a ``PrepConfig``.

    :param arguments: the parsed arguments, queried through ``get_option``
        and ``is_option_true``.
    :return: the config built from the ``--no-unicode``, ``--no-com``,
        ``--no-str``, ``--max-str-length``, ``--no-spaces`` and ``--no-case``
        options plus the split value derived from ``arguments``.
    """
    raw_max_str_length = get_option(arguments, '--max-str-length')
    if raw_max_str_length is None:
        # No limit given: fall back to the largest supported integer.
        max_str_length = sys.maxsize
    else:
        max_str_length = int(raw_max_str_length)

    en_only = 'U' if is_option_true(arguments, '--no-unicode') else 'u'
    com = '0' if is_option_true(arguments, '--no-com') else 'c'
    str_value = create_str_value(is_option_true(arguments, '--no-str'),
                                 max_str_length)
    split = create_split_value_from_args(arguments)
    tabs_newlines = '0' if is_option_true(arguments, '--no-spaces') else 's'
    case = 'l' if is_option_true(arguments, '--no-case') else 'u'

    return PrepConfig({
        PrepParam.EN_ONLY: en_only,
        PrepParam.COM: com,
        PrepParam.STR: str_value,
        PrepParam.SPLIT: split,
        PrepParam.TABS_NEWLINES: tabs_newlines,
        PrepParam.CASE: case,
    })
예제 #22
0
def test_bpe_string_literal_performance():
    """Check that ``to_repr`` handles a 10000-character string literal quickly.

    The call, including construction of its ``BpeData`` argument, must finish
    in under one second.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })

    literal_length = 10000
    tokens = [StringLiteral(['a' * literal_length], literal_length)]

    merges = MergeList()
    merges.append(Merge(('a', 'a'), 10))

    started_at = time.perf_counter()
    to_repr(config, tokens, BpeData(merges=merges, merges_cache={'Whi@@le@': ['Whi@@le@']}))
    elapsed = time.perf_counter() - started_at

    assert elapsed < 1
예제 #23
0
def test_1():
    """Check ``to_repr`` for a single token that is present in the merges cache.

    The token "Whi@le" is expected to come back as one output token with the
    compound-word-end placeholder appended.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })
    tokens = [SplitContainer.from_single_token("Whi@le")]
    cached_bpe_data = BpeData(merges_cache={'Whi@@le@': ['Whi@@le@']})

    actual, actual_metadata = to_repr(config, tokens, cached_bpe_data)

    expected = ["Whi@le" + placeholders['compound_word_end']]
    expected_metadata = PreprocessingMetadata(word_boundaries=[0, 1],
                                              token_types=[SplitContainer])

    assert expected == actual
    assert expected_metadata == actual_metadata
예제 #24
0
def test_merges_no_cache():
    """Check ``to_repr`` with one merge ('W', 'h') and an empty merges cache.

    Only "W" and "h" are expected to be joined; every other character of the
    token comes out on its own, followed by the compound-word-end placeholder.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })
    tokens = [SplitContainer.from_single_token("Whi@l@@e@")]
    single_merge = MergeList().append(Merge(('W', 'h'), 10))

    actual, actual_metadata = to_repr(
        config, tokens, BpeData(merges=single_merge, merges_cache={}))

    expected = ["Wh", "i", '@', "l", '@', '@', "e", '@',
                pl["compound_word_end"]]
    # All nine output tokens belong to one word.
    expected_metadata = PreprocessingMetadata(word_boundaries=[0, 9],
                                              token_types=[SplitContainer])

    assert expected == actual
    assert expected_metadata == actual_metadata
예제 #25
0
import os
from unittest import mock
from unittest.mock import Mock

from codeprep.api.corpus import preprocess_corpus
from codeprep.prepconfig import PrepConfig, PrepParam

# Stub filesystem paths, joined with the platform's path separator.
PATH_TO_CUR_DIR_STUB = os.path.join('path', 'to', 'curdir')
PATH_TO_DATASET_STUB = os.path.join('path', 'to', 'dataset')
PATH_TO_OUTPUT_STUB = os.path.join('path', 'to', 'output')

# Baseline preprocessing config; presumably shared by the tests in this
# module (their bodies are not visible in this excerpt -- confirm).
DEFAULT_PREP_CONFIG = PrepConfig({
    PrepParam.EN_ONLY: 'u',
    PrepParam.COM: 'c',
    PrepParam.STR: '1',
    PrepParam.SPLIT: '0',
    PrepParam.TABS_NEWLINES: '0',
    PrepParam.CASE: 'u',
})


@mock.patch('codeprep.api.corpus.Dataset', autospec=True)
@mock.patch('codeprep.api.corpus.stages', autospec=True)
@mock.patch('codeprep.cli.impl.os.getcwd',
            autospec=True,
            return_value=PATH_TO_CUR_DIR_STUB)
def test_simple(os_mock, stages_mock, dataset_mock):
    # given
    dataset_mock.create = Mock(spec=dataset_mock, return_value=dataset_mock)

    # when
예제 #26
0
def test_to_repr_with_enonlycontents1():
    """Check ``to_repr`` for EN_ONLY='U' on tokens made up of ``NonEng`` words.

    The fixture holds a number, an operator, a non-English identifier, a
    twelve-word string literal, a multi-line comment and a one-line comment.
    Every ``NonEng`` word is expected to become ``pl['non_eng']``.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    })

    def non_eng_word(text):
        return NonEng(SplitContainer([Word.from_(text)]))

    # String literal contents: quote, words separated by spaces, quote.
    words_in_string = ["ich", "weiss", "nicht", "was", "soll", "es",
                       "bedeuten", "dass", "ich", "so", "traurig", "bin"]
    string_contents = [NonCodeChar('"')]
    for position, word in enumerate(words_in_string):
        if position:
            string_contents.append(SpaceInString())
        string_contents.append(non_eng_word(word))
    string_contents.append(NonCodeChar('"'))

    tokens = [
        Number("1.1"),
        Operator("*"),
        non_eng_word("dinero"),
        StringLiteral(string_contents, 62),
        NewLine(),
        MultilineComment([NonCodeChar('/'), NonCodeChar('*')]),
        MultilineComment([
            non_eng_word('ц'),
            NonEng(SplitContainer([
                Word.from_("blanco"),
                Underscore(),
                Word.from_("english"),
            ])),
        ]),
        MultilineComment([NonCodeChar('*'), NonCodeChar('/')]),
        NewLine(), Tab(),
        OneLineComment([
            NonCodeChar('/'), NonCodeChar('/'),
            NonEng(SplitContainer([
                Word.from_("DIESELBE"),
                Word.from_("8"),
            ])),
        ]),
    ]

    actual, actual_metadata = to_repr(config, tokens)

    expected = (
        [pl['word_start'], '1', '.', '1', pl['word_end']]
        + ["*", pl['non_eng']]
        + ['"'] + [pl["non_eng"]] * 12 + ['"']
        + ['/', '*', pl['non_eng'], pl['non_eng'], '*', '/']
        + ['/', '/', pl['non_eng'], pl['olc_end']]
    )

    expected_types = ([Number, Operator, NonEng]
                      + [StringLiteral] * 14
                      + [MultilineComment] * 6
                      + [OneLineComment] * 4)
    # Boundaries 0, 5, 6, ..., 31: the number takes five tokens, the rest one each.
    expected_metadata = PreprocessingMetadata(
        {'*', '"', "/"},
        word_boundaries=[0] + list(range(5, 32)),
        token_types=expected_types)

    assert expected == actual
    assert expected_metadata == actual_metadata