Example #1
def test_profile_with_bad_metadata(tmpdir):
    mdpath = tmpdir / 'md.json'
    md = deepcopy(Profile.MD)
    md['tables'].append({'tableSchema': {'columns': []}})
    jsonlib.dump(md, str(mdpath))

    with pytest.raises(ValueError):
        Profile.from_file(str(mdpath))
Example #2
def profile(args):
    """
    Create an orthography profile for a string (passed as argument or read from stdin)

    segments profile [STRING]
    """
    _write(args, Profile.from_text(_read(args)))
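The `_read`/`_write` calls are just CLI plumbing; the actual work is done by Profile.from_text. A minimal sketch of the same operation done directly in Python (the input string is only an illustration):

from segments import Profile

# Build a profile from a raw string; each grapheme gets a frequency count,
# and str(profile) renders the result as a tab-separated orthography profile.
profile = Profile.from_text('aäaa')
print(profile)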
Example #3
def test_normalization():
    specs = [
        {'Grapheme': 'ä'},
        {'Grapheme': 'aa'},
        {'Grapheme': 'a'},
    ]
    prf = Profile(*specs, **{'form': 'NFD'})
    t = Tokenizer(profile=prf)
    # "aa" matches, because the "ä" is decomposed:
    assert t(unicodedata.normalize('NFD', 'aä')) == 'aa ' + REPLACEMENT_MARKER
    # A composed "ä" doesn't match anymore:
    assert t(unicodedata.normalize('NFC', 'aä')) == 'a ' + REPLACEMENT_MARKER
    prf = Profile(*specs, **{'form': 'NFC'})
    t = Tokenizer(profile=prf)
    # "aa" doesn't match here, this is typically the behaviour one wants:
    assert t(unicodedata.normalize('NFC', 'aä')) == 'a ä'
    assert t(unicodedata.normalize('NFD', 'aä')) == 'aa ' + REPLACEMENT_MARKER
Example #4
def _orthography_profile(s):
    graphemes = {c: c for c in s}
    graphemes[NEWLINE] = '\n'
    graphemes.update(basic_confusables)
    graphemes.update(epa_confusables)
    graphemes.update(ipa_confusables)
    return Profile(*({
        'Grapheme': k,
        'IPA': SPACE if v == ' ' else v
    } for k, v in graphemes.items()))
Example #5
def test_profile():
    prf = Profile(
        {'Grapheme': 'bischen', 'Out': 'b i s ch e n'},
        {'Grapheme': 'sch', 'Out': 'sch'},
        {'Grapheme': 'n', 'Out': 'n'},
        {'Grapheme': 'a', 'Out': 'a'},
        {'Grapheme': 'e', 'Out': 'e'},
        {'Grapheme': 'n', 'Out': 'n'},
    )
    t = Tokenizer(profile=prf)
    assert t('bischen', column='Out') == 'b i s ch e n'
    assert t('naschen', column='Out') == 'n a sch e n'
    assert t('x', column='Out') == REPLACEMENT_MARKER

    prf = Profile(
        {'Grapheme': 'uu'},
        {'Grapheme': 'b'},
        {'Grapheme': 'o'},
    )
    t = Tokenizer(profile=prf)
    assert t('uubo uubo') == 'uu b o # uu b o'
Example #6
    def tokenizer(self):
        """
        Datasets can provide support for segmentation (aka tokenization) in two ways:
        - by providing an orthography profile at etc/orthography.tsv or
        - by overwriting this method to return a custom tokenizer callable.

        :return: A callable to do segmentation.

        The expected signature of the callable is

            def t(item, string, **kw)

        where
        - `item` is a `dict` representing the complete CLDF FormTable row
        - `string` is the string to be segmented
        - `kw` may be used to pass any context info to the tokenizer, when called
          explicitly.
        """
        profile = self.dir / 'etc' / 'orthography.tsv'
        if profile.exists():
            profile = Profile.from_file(str(profile), form='NFC')
            default_spec = list(next(iter(profile.graphemes.values())).keys())
            for grapheme in ['^', '$']:
                if grapheme not in profile.graphemes:
                    profile.graphemes[grapheme] = {
                        k: None
                        for k in default_spec
                    }
            profile.tree = Tree(list(profile.graphemes.keys()))
            tokenizer = Tokenizer(profile=profile,
                                  errors_replace=lambda c: '<{0}>'.format(c))

            def _tokenizer(item, string, **kw):
                kw.setdefault("column", "IPA")
                kw.setdefault("separator", " + ")
                return tokenizer(
                    unicodedata.normalize('NFC', '^' + string + '$'),
                    **kw).split()

            return _tokenizer
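The docstring above fixes the contract for a custom tokenizer: a callable accepting (item, string, **kw). A minimal sketch of the second option it describes, i.e. overriding the method to return such a callable (the class name and the bare Tokenizer() fallback are illustrative assumptions, not part of the original dataset code):

from segments import Tokenizer


class MyDataset:

    def tokenizer(self):
        t = Tokenizer()  # no profile: plain Unicode grapheme segmentation

        def _tokenize(item, string, **kw):
            # `item` (the full FormTable row) could be used to pick a
            # per-language profile; this sketch ignores it.
            return t(string, **kw).split()

        return _tokenize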
Example #7
def main(args):
    # Instantiate the profile and tokenizer
    profile = Profile.from_file(args.profile)
    tokenizer = Tokenizer(profile=profile)

    # Open file and check items
    errors = []
    with open(args.wordlist) as handler:
        reader = csv.DictReader(handler, delimiter="\t")
        for count, row in enumerate(reader):
            segments = my_tokenizer(row[args.form], tokenizer)
            reference = row[args.segments]
            if segments != reference:
                errors.append([row["ID"], row[args.form], segments, reference])

            if args.l:
                if count > args.l:
                    break

    # Output
    print(tabulate(errors, headers=["ID", "Form", "Result", "Reference"]))
    print("Errors: %i/%i (%.2f%%)" % (len(errors), count + 1,
                                      (len(errors) / (count + 1)) * 100))
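my_tokenizer is not defined in this excerpt; a plausible stand-in (an assumption, not the original helper) just wraps the Tokenizer call so that its output can be compared with the reference segmentation column:

def my_tokenizer(form, tokenizer):
    # Segment the raw form with the orthography profile; by default the
    # Grapheme column of the profile is used for the output.
    return tokenizer(form)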
Example #8
def test_from_text():
    res = Profile.from_text('abcdabcab')
    assert 'a' in res.graphemes
    assert res.graphemes['a']['frequency'] == 3
Example #9
def test_Profile_from_metadata(testdata):
    res = Profile.from_file(testdata / 'profile.json', form='NFD')
    assert 'ch' in res.graphemes
    assert res.graphemes['ch']['XSAMPA'] == 'tS'
    assert res.graphemes['-']['XSAMPA'] is None
    assert res.metadata['dc:language'] == 'abcd1234'
Example #10
def test_Profile_from_file(profile_path):
    res = Profile.from_file(profile_path)
    assert 'aa' in res.graphemes
    assert res.graphemes['aa']['XSAMPA'] == 'a:'
    assert res.graphemes['-']['XSAMPA'] is None
Example #11
def test_missing_grapheme():
    with pytest.raises(ValueError):
        Profile({})

    with pytest.raises(ValueError):
        Profile({'Grapheme': ''})
Example #12
def test_duplicate_grapheme(mocker):
    logging = mocker.patch('segments.profile.logging')
    Profile({'Grapheme': 'a'}, {'Grapheme': 'a'})
    assert logging.getLogger.return_value.warning.call_args[0][0].startswith(
        'line 3')
Example #13
    "aĭ",
    "e͡i",
    "a͡i",
    "o͡i",
    "u͡i",
    "a͡e",
    "o͡e",
    "e͡o",
    "a͡o",
    "i͡u",
    "e͡u",
    "a͡u",
    "o͡u",
])
tokenizer = Tokenizer(Profile(*({
    "Grapheme": x,
    "mapping": x
} for x in sounds)),
                      errors_ignore=lambda c: c)

from pylexirumah import get_dataset, repository


def needleman_wunsch(x,
                     y,
                     lodict={},
                     gop=-2.5,
                     gep=-1.75,
                     local=False,
                     indel=''):
    """Needleman-Wunsch algorithm with affine gaps penalties.
Example #14
    def cmd_install(self, **kw):
        # Read individual orthographic profiles, extract the corresponding
        # doculect ids (here, glottocodes), and build the appropriate
        # tokenizers
        profile_files = sorted(glob.glob(str(self.dir / "etc" / "*.prof")))
        doculect_codes = [
            os.path.splitext(os.path.basename(pf))[0] for pf in profile_files
        ]

        self.doc_tokenizers = {
            doculect: Tokenizer(
                profile=Profile.from_file(pf, form="NFC"),
                errors_replace=lambda c: "<{0}>".format(c),
            )
            for pf, doculect in zip(profile_files, doculect_codes)
        }

        # Cache the Concepticon IDs
        concepticon = {
            x.attributes["wold_id"]: x.concepticon_id
            for x in self.conceptlist.concepts.values()
        }

        # cache the field names for CLDF output
        fields = self.lexeme_class.fieldnames()

        # Write data to CLDF
        with self.cldf as ds:
            vocab_ids = [
                v["ID"] for v in self.original_cldf["contributions.csv"]
            ]

            # add sources
            self.add_sources(ds)

            # add languages and build map for choosing the right profile
            lang_map = {}
            for row in self.original_cldf["LanguageTable"]:
                gc, iso = row["Glottocode"], row["ISO639P3code"]
                if gc == "tzot1264":
                    gc, iso = "tzot1259", "tzo"
                if row["ID"] in vocab_ids:
                    ds.add_language(ID=row["ID"],
                                    Name=row["Name"],
                                    Glottocode=gc,
                                    ISO639P3code=iso)

                # Add to map only those which are receivers
                if int(row["ID"]) <= 41:
                    lang_map[row["ID"]] = gc

            # add parameters
            for row in self.original_cldf["ParameterTable"]:
                ds.add_concept(
                    ID=row["ID"],
                    Name=row.pop("Name"),
                    Concepticon_ID=concepticon.get(row["ID"]),
                )

            # Being explicit on what we are adding
            for row in self.original_cldf["FormTable"]:
                if row["Language_ID"] in vocab_ids:
                    # Copy the raw Form to Value, clean form, and tokenize
                    row["Value"] = row["Form"]
                    row["Form"] = self.clean_form(row["Form"])
                    row["Segments"] = self.tokenizer(
                        row["Form"], lang_map[row["Language_ID"]])

                    # Note: We count words marked as "probably borrowed" as loans.
                    row["Loan"] = float(row["BorrowedScore"]) > 0.6

                    ds.add_form_with_segments(
                        **{k: v
                           for k, v in row.items() if k in fields})
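The self.tokenizer(...) call above is not part of this excerpt; a plausible helper (an assumption) dispatches on the doculect code to the per-language tokenizers built at the top of cmd_install:

    def tokenizer(self, form, glottocode):
        # Pick the doculect-specific tokenizer and return a list of
        # segments, as expected by add_form_with_segments.
        return self.doc_tokenizers[glottocode](form).split()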
Example #15
def test_from_textfile(testdata):
    res = Profile.from_textfile(testdata / 'Kabiye_input.txt')
    assert 'à̙' in res.graphemes
    assert res.graphemes['à̙']['frequency'] == 20
Example #16
"""Create an orthography profile for grapheme tokenization."""

from collections import OrderedDict

from segments import Profile

from filenames import GRAPHEME_PROFILE
from preprocessing import clean
from utils import read

# Read in all EvaLatin training data into a single pyconll CoNLL structure
conll = read()

# Collect all the word forms
text = ""
for sentence in conll:
    for token in sentence:
        text += clean(token.form) + " "

# Create orthography profile
profile = Profile.from_text(text)
profile.column_labels.remove("frequency")
profile.graphemes.pop(" ")
for key in ["ch", "qu", "th", "rh", "ph", "gn"]:
    profile.graphemes[key] = OrderedDict([("mapping", key[0].upper())])
    profile.graphemes.move_to_end(key, last=False)
with open(GRAPHEME_PROFILE, "w") as file:
    file.write(str(profile))
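A short sketch of how the written profile could then be consumed (assuming the single letters occurring in the training data are all covered by the profile):

from filenames import GRAPHEME_PROFILE
from segments import Profile, Tokenizer

tokenizer = Tokenizer(profile=Profile.from_file(GRAPHEME_PROFILE))
print(tokenizer('christus'))  # digraphs like "ch" come out as single segments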
Example #17
def test_tokenize_with_profile_from_object():
    prf = Profile(dict(Grapheme='aa', mapping=['x', 'y']),
                  dict(Grapheme='b', mapping='z'))
    assert Tokenizer(profile=prf)('aab', column='mapping') == 'x y z'
Example #18
def get_orthography(name):
    return Tokenizer(Profile.from_file(profile_path(name + '.tsv'),
                                       form='NFD'))