Esempio n. 1
0
 def test_both_enonly_and_nosplit(self):
     """Expect ``ValueError`` for a config with EN_ONLY=1 and SPLIT=0.

     Building the config and calling ``to_repr`` both happen inside the
     ``assertRaises`` context, so the error may come from either step.
     """
     with self.assertRaises(ValueError):
         settings = {
             PrepParam.EN_ONLY: 1,
             PrepParam.COM_STR: 0,
             PrepParam.SPLIT: 0,
             PrepParam.TABS_NEWLINES: 1,
             PrepParam.MARK_LOGS: 1,
             PrepParam.CAPS: 1,
         }
         config = PrepConfig(settings)
         to_repr(config, [], NgramSplitConfig())
Esempio n. 2
0
def preprocess(s, r):
    """Parse ``s``, run the configured preprocessors and render the result.

    Args:
        s: Source text handed to ``from_string``.
        r: Encoded preprocessing config, decoded with
            ``PrepConfig.from_encoded_string``.

    Returns:
        Whatever ``to_repr`` produces for the decoded config and the
        preprocessed tokens.
    """
    raw_tokens = from_string(s)
    preprocessed = apply_preprocessors(
        raw_tokens,
        pp_params["preprocessors"],
        {'interesting_context_words': []},
    )

    prep_config = PrepConfig.from_encoded_string(r)

    # The return value (if any) is discarded; presumably this call sets up
    # splitting state that ``to_repr`` relies on -- confirm against its
    # definition.
    init_splitting_config(
        DEFAULT_DATASET,
        prep_config,
        DEFAULT_BPE_BASE_REPR,
        DEFAULT_BPE_N_MERGES,
        None,
    )

    return to_repr(prep_config, preprocessed)
Esempio n. 3
0
    def test_to_repr_2_nosep(self):
        """Check ``to_repr`` for SPLIT=2 with ``ONLY_NUMBERS`` n-gram splitting.

        The input is the ``tokens`` fixture defined outside this method.
        """
        config = PrepConfig({
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 2,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        })
        split_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.ONLY_NUMBERS,
        )

        got = to_repr(config, tokens, split_config)

        want = [
            pl['word_start'], '1', '.', '1', pl['word_end'],
            '*',
            pl['non_eng'],
            '"', pl['word_start'], pl['capitals'], 'a', pl['capital'],
            pl['non_eng'], pl['word_end'], '"',
            '/*', pl['non_eng'],
            pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
            '*/',
            '//', pl['word_start'], pl['capitals'], pl['non_eng'], '8',
            pl['word_end'], pl['olc_end'],
        ]
        self.assertEqual(want, got)
Esempio n. 4
0
    def test_merges_no_cache(self):
        """Check BPE output for "While" with one merge and an empty cache.

        With the single merge ``('w', 'h')`` the expected output keeps
        "wh" together and leaves the remaining letters as separate items,
        wrapped in the word-start/word-end placeholders.
        """
        config = PrepConfig({
            PrepParam.EN_ONLY: 0,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 4,
            PrepParam.TABS_NEWLINES: 0,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        })
        bpe_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.BPE,
            merges={('w', 'h'): 0},
            merges_cache={},
        )
        single_word = [SplitContainer.from_single_token("While")]

        got = to_repr(config, single_word, bpe_config)

        want = [
            pl['word_start'],
            pl['capital'],
            "wh",
            "i",
            "l",
            "e",
            pl['word_end'],
        ]
        self.assertEqual(want, got)
Esempio n. 5
0
    def test_log_no_mark_logs(self):
        """Check how a ``LogStatement`` is rendered when MARK_LOGS is 0.

        The expected output is the statement spelled out token by token
        (``logger . info ( " hi " ) ;``) with capitalisation placeholders.
        """
        config = PrepConfig({
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 1,
            PrepParam.TABS_NEWLINES: 0,
            PrepParam.MARK_LOGS: 0,
            PrepParam.CAPS: 1,
        })
        split_config = NgramSplitConfig()

        # Pieces of the log statement, built in the same order as before.
        logger_name = SplitContainer.from_single_token('LOGGER')
        level_name = SplitContainer.from_single_token('Info')
        message = [StringLiteral([SplitContainer.from_single_token("Hi")])]
        log_tokens = [LogStatement(logger_name, level_name, INFO, message)]

        got = to_repr(config, log_tokens, split_config)

        want = [
            pl['capitals'], 'logger',
            '.',
            pl['capital'], 'info',
            '(',
            '"', pl['capital'], 'hi', '"',
            ')',
            ';',
        ]
        self.assertEqual(want, got)
Esempio n. 6
0
    def test_to_repr_no_no_sep_with_bpe_no_merges(self):
        """Check ``to_repr`` for SPLIT=4 using BPE with no merges at all.

        The input is the ``tokens`` fixture defined outside this method; in
        the expected output "english" appears letter by letter.
        """
        config = PrepConfig({
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 4,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        })
        bpe_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.BPE,
            merges=[],
            merges_cache={},
        )

        got = to_repr(config, tokens, bpe_config)

        want = [
            pl['word_start'], '1', '.', '1', pl['word_end'],
            '*',
            pl['non_eng'],
            '"', pl['word_start'], pl['capitals'], 'a', pl['capital'],
            pl['non_eng'], pl['word_end'], '"',
            '/*', pl['non_eng'],
            pl['word_start'], pl['non_eng'], '_',
            'e', 'n', 'g', 'l', 'i', 's', 'h',
            pl['word_end'], '*/',
            '//', pl['word_start'], pl['capitals'], pl['non_eng'], '8',
            pl['word_end'], pl['olc_end'],
        ]
        self.assertEqual(want, got)
Esempio n. 7
0
    def test_to_repr_with_non_eng(self):
        """Check ``to_repr`` for EN_ONLY=0, SPLIT=3 with custom splittings.

        The input is the ``tokens`` fixture defined outside this method. The
        expected output contains the original non-English words and the
        custom splits supplied via ``sc_splittings``.
        """
        config = PrepConfig({
            PrepParam.EN_ONLY: 0,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 3,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        })
        custom_splits = {
            'english': ['engl', 'ish'],
            'dieselbe': ['die', 'selbe'],
        }
        split_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
            sc_splittings=custom_splits,
        )

        got = to_repr(config, tokens, split_config)

        want = [
            pl['word_start'], '1', '.', '1', pl['word_end'],
            '*',
            'dinero',
            '"', pl['word_start'], pl['capitals'], 'a', pl['capital'],
            'wirklich', pl['word_end'], '"',
            '/*', 'ц',
            pl['word_start'], 'blanco', '_', 'engl', 'ish', pl['word_end'],
            '*/',
            '//', pl['word_start'], pl['capitals'], 'die', 'selbe', '8',
            pl['word_end'], pl['olc_end'],
        ]
        self.assertEqual(want, got)
Esempio n. 8
0
    def test_to_repr_with_enonlycontents(self):
        """Check ``to_repr`` for EN_ONLY=2 on a mostly non-English token list.

        The string literal consists solely of ``NonEng`` words; the expected
        output shows it as a single ``non_eng_content`` placeholder between
        the quote characters.
        """
        config = PrepConfig({
            PrepParam.EN_ONLY: 2,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 3,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        })
        split_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
            sc_splittings={},
        )

        # Words of the string literal, each wrapped as NonEng(Word) below.
        literal_words = (
            "ich", "weiss", "nicht", "was", "soll", "es",
            "bedeuten", "dass", "ich", "so", "traurig", "bin",
        )

        token_stream = [
            Number([1, DecimalPoint(), 1]),
            "*",
            SplitContainer([NonEng(Word.from_("dinero"))]),
            StringLiteral([NonEng(Word.from_(w)) for w in literal_words]),
            NewLine(),
            MultilineComment([
                SplitContainer([NonEng(Word.from_('ц'))]),
                SplitContainer([
                    NonEng(Word.from_("blanco")),
                    Underscore(),
                    Word.from_("english"),
                ]),
            ]),
            NewLine(),
            Tab(),
            OneLineComment([
                SplitContainer([
                    NonEng(Word.from_("DIESELBE")),
                    Word.from_("8"),
                ]),
            ]),
        ]

        got = to_repr(config, token_stream, split_config)

        want = [
            pl['word_start'], '1', '.', '1', pl['word_end'],
            '*',
            pl['non_eng'],
            '"', pl['non_eng_content'], '"',
            '/*', pl['non_eng'],
            pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
            '*/',
            '//', pl['word_start'], pl['capitals'], pl['non_eng'], '8',
            pl['word_end'], pl['olc_end'],
        ]
        self.assertEqual(want, got)
Esempio n. 9
0
def calc_stats_for_prepconfig(prepconfig,
                              lang_checker,
                              token_list,
                              include_sample=False):
    """Compute language statistics for ``token_list`` under one prep config.

    Args:
        prepconfig: Encoded config string, decoded with
            ``PrepConfig.from_encoded_string``.
        lang_checker: Object providing ``calc_lang_stats``.
        token_list: Parsed tokens passed to ``to_repr``.
        include_sample: Forwarded unchanged to ``calc_lang_stats``.

    Returns:
        The result of ``lang_checker.calc_lang_stats`` on the
        space-separated pieces of the rendered representation.
    """
    config = PrepConfig.from_encoded_string(prepconfig)
    rendered = to_token_list(to_repr(config, token_list, NgramSplitConfig()))
    # Named ``token_reprs`` rather than ``repr`` so the builtin ``repr`` is
    # not shadowed inside this function.
    token_reprs = rendered.split(' ')
    return lang_checker.calc_lang_stats(token_reprs,
                                        include_sample=include_sample)
    def test(self):
        """Run every entry of ``test_cases`` through parsing and ``to_repr``.

        Each key of ``test_cases`` is a source string. The mapped value is
        indexed: ``[0]`` is the expected preprocessor output and ``[1]`` the
        expected ``to_repr`` output for the encoded config ``'104111'``.
        """
        # The loop variable is ``source`` rather than ``input`` so the
        # builtin ``input`` is not shadowed.
        for source, expected in test_cases.items():
            parsed = apply_preprocessors(from_string(source), pp_params["preprocessors"], {})

            self.assertEqual(expected[0], parsed)

            repred = to_repr(PrepConfig.from_encoded_string('104111'), parsed, ngram_split_config)

            self.assertEqual(expected[1], repred)
Esempio n. 11
0
    def test_to_repr_0(self):
        """Check ``to_repr`` for SPLIT=0 and CAPS=0 with a default split config.

        The input is the ``tokens`` fixture defined outside this method; the
        expected output keeps identifiers such as ``'blanco_english'`` whole.
        """
        config = PrepConfig({
            PrepParam.EN_ONLY: 0,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 0,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 0,
        })

        got = to_repr(config, tokens, NgramSplitConfig())

        want = [
            '1.1',
            '*',
            'dinero',
            '"', 'AWirklich', '"',
            '/*', 'ц', 'blanco_english', '*/',
            '//', 'DIESELBE8', pl['olc_end'],
        ]
        self.assertEqual(want, got)
Esempio n. 12
0
    def test_to_repr_1_nosep(self):
        """Check ``to_repr`` for EN_ONLY=1, SPLIT=1 with a default split config.

        The input is the ``tokens`` fixture defined outside this method; the
        expected output keeps the number as the single item ``'1.1'``.
        """
        config = PrepConfig({
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 1,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        })

        got = to_repr(config, tokens, NgramSplitConfig())

        want = [
            '1.1',
            '*',
            pl['non_eng'],
            '"', pl['word_start'], pl['capitals'], 'a', pl['capital'],
            pl['non_eng'], pl['word_end'], '"',
            '/*', pl['non_eng'],
            pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
            '*/',
            '//', pl['word_start'], pl['capitals'], pl['non_eng'], '8',
            pl['word_end'], pl['olc_end'],
        ]
        self.assertEqual(want, got)
Esempio n. 13
0
    def test_to_repr_no_str_no_com(self):
        """Check ``to_repr`` for COM_STR=2 on the shared ``tokens`` fixture.

        In the expected output the string literal and both comments appear
        only as their ``string_literal`` / ``comment`` placeholders.
        """
        config = PrepConfig({
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 2,
            PrepParam.SPLIT: 3,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        })
        split_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
            sc_splittings={'english': ['engl', 'ish']},
        )

        got = to_repr(config, tokens, split_config)

        want = [
            pl['word_start'], '1', '.', '1', pl['word_end'],
            '*',
            pl['non_eng'],
            pl['string_literal'],
            pl['comment'],
            pl['comment'],
        ]
        self.assertEqual(want, got)
Esempio n. 14
0
    def test_1(self):
        """Check BPE output for "While" when the cache already holds 'while'.

        The cache maps ``'while'`` to the single piece ``['while']``; the
        expected output is the capital placeholder followed by ``"while"``.
        """
        config = PrepConfig({
            PrepParam.EN_ONLY: 0,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 4,
            PrepParam.TABS_NEWLINES: 0,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        })
        bpe_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.BPE,
            merges_cache={'while': ['while']},
        )
        single_word = [SplitContainer.from_single_token("While")]

        got = to_repr(config, single_word, bpe_config)

        want = [pl['capital'], "while"]
        self.assertEqual(want, got)
Esempio n. 15
0
def to_repr_l(lst):
    """Render ``lst`` with ``to_repr`` using the fixed config ``'000010'``.

    A fresh default ``NgramSplitConfig`` is created for every call.
    """
    fixed_config = PrepConfig.from_encoded_string('000010')
    default_split = NgramSplitConfig()
    return to_repr(fixed_config, lst, default_split)