Ejemplo n.º 1
0
    def test_clean_string(self):
        """Check clean_string against fixed inputs, with and without options."""
        parenthesised = 'this (is an error)'
        slashed = 'feature/vector'
        with_colon = 'ta:tata'
        multi_bracket = 'what (the) hack [this is]'
        spaced = 't a t'

        # Call the bracket helper once; its return value is not inspected.
        _get_brackets('A')

        # Calls relying on the default keyword arguments.
        assert clean_string(parenthesised)[0] == 'th i s'
        assert clean_string(slashed)[1] == 'v e c t o r'
        assert clean_string(with_colon)[0] == 't a: t a t a'
        assert clean_string(multi_bracket)[0] == 'wh a t _ h a c k'

        # Calls that override a single keyword argument each.
        assert clean_string(spaced, segmentized=True)[0] == 't a t'
        assert clean_string('a(a', ignore_brackets=False)[0] == 'a ( a'
        assert clean_string('a/a', split_entries=False)[0] == 'a / a'
        assert clean_string('aa', preparse=[('a', 'b')])[0] == 'bb'
        assert clean_string('bb', merge_geminates=False)[0] == 'b b'

        # Replacement rules combined with geminate merging switched off.
        with_rules = clean_string(
            'bb', rules={"b": "cd"}, merge_geminates=False)
        assert with_rules[0] == "cd cd"
Ejemplo n.º 2
0
    def test_clean_string(self):
        """Check clean_string against fixed inputs, with and without options."""
        parenthesised = 'this (is an error)'
        slashed = 'feature/vector'
        with_colon = 'ta:tata'
        multi_bracket = 'what (the) hack [this is]'
        spaced = 't a t'

        # Call the bracket helper once; its return value is not inspected.
        _get_brackets('A')

        # Calls relying on the default keyword arguments.
        assert clean_string(parenthesised)[0] == 'th i s'
        assert clean_string(slashed)[1] == 'v e c t o r'
        assert clean_string(with_colon)[0] == 't a: t a t a'
        assert clean_string(multi_bracket)[0] == 'wh a t _ h a c k'

        # Calls that override a single keyword argument each.
        assert clean_string(spaced, segmentized=True)[0] == 't a t'
        assert clean_string('a(a', ignore_brackets=False)[0] == 'a ( a'
        assert clean_string('a/a', split_entries=False)[0] == 'a / a'
        assert clean_string('aa', preparse=[('a', 'b')])[0] == 'bb'
        assert clean_string('bb', merge_geminates=False)[0] == 'b b'

        # Replacement rules combined with geminate merging switched off.
        with_rules = clean_string(
            'bb', rules={"b": "cd"}, merge_geminates=False)
        assert with_rules[0] == "cd cd"
Ejemplo n.º 3
0
def test_clean_string():
    """Check clean_string against fixed inputs, with and without options."""
    parenthesised = 'this (is an error)'
    slashed = 'feature/vector'
    with_colon = 'ta:tata'
    multi_bracket = 'what (the) hack [this is]'
    spaced = 't a t'

    # Call the bracket helper once; its return value is not inspected.
    _get_brackets('A')

    # Calls relying on the default keyword arguments.
    assert clean_string(parenthesised)[0] == 'th i s'
    assert clean_string(slashed)[1] == 'v e c t o r'
    assert clean_string(with_colon)[0] == 't a: t a t a'
    assert clean_string(multi_bracket)[0] == 'wh a t _ h a c k'

    # Calls that override a single keyword argument each.
    assert clean_string(spaced, segmentized=True)[0] == 't a t'
    assert clean_string('a(a', ignore_brackets=False)[0] == 'a ( a'
    assert clean_string('a/a', split_entries=False)[0] == 'a / a'
Ejemplo n.º 4
0
def test_sequence(sequence, **keywords):
    """
    Check a sequence against LingPy's sound classes and against CLPA.

    Parameters
    ----------
    sequence : str
        The sequence to check. It is passed to ``clean_string`` and only the
        first entry of the result is analysed.
    **keywords
        Forwarded unchanged to ``clean_string``.

    Returns
    -------
    tuple
        Eight items, in this order: the list of segments, the CLPA analysis
        passed item by item through ``clpa.segment2clpa``, a Counter holding
        the sequence if it could not be processed, a Counter of segments, the
        set of segments whose Dolgopolsky class is ``'0'``, the set of
        segments CLPA reported as ``'?'``, a mapping from segment to the set
        of differing CLPA results, and the number of positions at which
        either analysis yields ``'?'``.
    """
    invalid = Counter()
    segment_count = Counter()
    lingpy_errors, clpa_errors = set(), set()
    clpa_repl = defaultdict(set)
    general_errors = 0

    try:
        # Only the first entry returned by clean_string is used.
        cleaned = clean_string(sequence, **keywords)
        segments = cleaned[0].split(' ')
        lingpy_analysis = [
            seg if cls != '0' else '?'
            for seg, cls in zip(segments, tokens2class(segments, 'dolgo'))
        ]
        clpa_analysis, _sounds, _errors = clpa.check_sequence(segments)
        # Count the positions flagged by at least one of the two analyses.
        general_errors = sum(
            1 for pair in zip(lingpy_analysis, clpa_analysis) if '?' in pair)
    except (ValueError, IndexError, AttributeError):
        # The sequence is recorded as invalid and yields empty analyses.
        invalid.update([sequence])
        segments, clpa_analysis = [], []

    if segments:
        for segment, lingpy_item, clpa_item in zip(
                segments, lingpy_analysis, clpa_analysis):
            # Drop a leading accent mark before counting and comparing.
            if segment[0] in clpa.accents:
                segment = segment[1:]
            if clpa_item[0] in clpa.accents:
                clpa_item = clpa_item[1:]

            segment_count.update([segment])
            if lingpy_item == '?':
                lingpy_errors.add(segment)
            if clpa_item != segment:
                if clpa_item == '?':
                    clpa_errors.add(segment)
                else:
                    clpa_repl[segment].add(clpa_item)

    converted = [clpa.segment2clpa(item) for item in clpa_analysis]
    return (segments, converted, invalid, segment_count, lingpy_errors,
            clpa_errors, clpa_repl, general_errors)
Ejemplo n.º 5
0
def simple_profile(wordlist,
                   ref='ipa',
                   semi_diacritics='hsʃ̢ɕʂʐʑʒw',
                   merge_vowels=False,
                   brackets=None,
                   splitters='/,;~',
                   merge_geminates=True,
                   bad_word="<???>",
                   bad_sound="<?>",
                   clts=None,
                   unknown_sound="!{0}"):
    """
    Create an initial Orthography Profile using Lingpy's clean_string procedure.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : dict or str (default=None)
        Either a dictionary with opening brackets as keys and closing
        brackets as values, or a string containing all bracket characters.
        Defaults to a pre-defined set of frequently occurring brackets. The
        value is only used to detect words that are kept unsegmented; it is
        not passed on to clean_string.
    splitters : str
        Characters marking an entry that holds more than one form. A word
        whose cleaned form contains one of them is kept unsegmented.
    merge_geminates : bool (default=True)
        Passed on to clean_string.
    bad_word : str (default="<???>")
        Value used for words that were kept unsegmented. It is a
        format-string which receives the word as its only argument.
    bad_sound : str (default="<?>")
        Value used for segments whose Dolgopolsky sound class is "0". It is a
        format-string which receives the segment as its only argument.
    clts : dict (default=None)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    unknown_sound : str (default="!{0}")
        Format-string used for segments for which `clts` has no entry. It
        receives the segment as its only argument and is only used if `clts`
        is given.

    Returns
    -------
    profile : generator
        A generator of tuples (four items): the segment, its proposed
        conversion, its frequency (as a string), and the unicode-codepoints.
        Entries are yielded in order of decreasing frequency.
    """
    clts = clts or {}
    # The documented type is a dict, but the membership test below needs a
    # flat string of bracket characters.
    if isinstance(brackets, dict):
        brackets = ''.join(brackets) + ''.join(brackets.values())
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    # Built once: any of these characters marks a word as not segmentable.
    blocked = brackets + splitters
    nulls = set()
    bad_words = set()
    profile = defaultdict(int)
    words = [wordlist[idx, ref] for idx in wordlist]
    for word in pb(words, desc='iterating over words'):
        if isinstance(word, list):
            word = ' '.join(word)
        cleaned_string = clean_string(word,
                                      semi_diacritics=semi_diacritics,
                                      merge_vowels=merge_vowels,
                                      brackets=None,
                                      ignore_brackets=False,
                                      split_entries=False,
                                      preparse=None,
                                      rules=None,
                                      merge_geminates=merge_geminates)[0]

        # retain whole word if there are brackets or splitters in the word
        if any(char in blocked for char in cleaned_string):
            profile[word] += 1
            bad_words.add(word)
        else:
            for segment in cleaned_string.split(' '):
                profile[segment] += 1
            # characters of the word that are absent from the cleaned string
            # are counted separately and later mapped to NULL
            for char in word:
                if char not in cleaned_string:
                    profile[char] += 1
                    nulls.add(char)

    for s, f in pb(sorted(profile.items(), key=lambda x: x[1], reverse=True),
                   desc='preparing profile'):
        sclass = token2class(s, 'dolgo')
        if s in bad_words:
            ipa = bad_word.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif sclass == '0':
            ipa = bad_sound.format(s)
        elif clts:
            sound = clts.get(s, False)
            # honour the caller's marker instead of a hard-coded "!" prefix
            ipa = text_type(sound) if sound else unknown_sound.format(s)
        else:
            ipa = s
        yield s, ipa, text_type(f), codepoint(s)
Ejemplo n.º 6
0
def context_profile(wordlist,
                    ref='ipa',
                    col="doculect",
                    semi_diacritics='hsʃ̢ɕʂʐʑʒw',
                    merge_vowels=False,
                    brackets=None,
                    splitters='/,;~',
                    merge_geminates=True,
                    clts=False,
                    bad_word="<???>",
                    bad_sound="<?>",
                    unknown_sound="!{0}",
                    examples=2,
                    max_entries=100):
    """
    Create an advanced Orthography Profile with context and doculect information.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    col : str (default="doculect")
        Indicate in which column the information on the language variety is
        stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : dict or str (default=None)
        Either a dictionary with opening brackets as keys and closing
        brackets as values, or a string containing all bracket characters.
        Defaults to a pre-defined set of frequently occurring brackets. The
        value is only used to detect words that are kept unsegmented; it is
        not passed on to clean_string.
    splitters : str
        Characters marking an entry that holds more than one form. A word
        whose cleaned form contains one of them is kept unsegmented.
    merge_geminates : bool (default=True)
        Passed on to clean_string.
    clts : dict (default=False)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package). Any false
        value disables the conversion.
    bad_word : str (default="<???>")
        Value used for words that were kept unsegmented. It is a
        format-string which receives the word as its only argument.
    bad_sound : str (default="<?>")
        Value used for segments whose Dolgopolsky sound class is "0". It is a
        format-string which receives the profile entry as its only argument.
    unknown_sound : str (default="!{0}")
        Format-string used for segments for which `clts` has no entry. It
        receives the segment (without context markers) as its only argument
        and is only used if `clts` is given.
    examples : int (default=2)
        Indicate the number of example words listed for each entry.
    max_entries : int (default=100)
        Maximum number of occurrences per entry that are taken into account
        when compiling examples, languages and frequency. The reported
        frequency therefore never exceeds this value.

    Returns
    -------
    profile : generator
        A generator of tuples (six items): the segment with its context
        markers ("^" word-initial, "$" word-final), its proposed conversion,
        example words, the languages in which it occurs, its frequency (as a
        string), and the unicode-codepoints. The first two tuples map the
        bare markers "^" and "$" to NULL.

    Notes
    -----
    Words that cannot be processed are logged and skipped.
    """
    clts_ = clts or {}
    # The documented type is a dict, but the membership test below needs a
    # flat string of bracket characters.
    if isinstance(brackets, dict):
        brackets = ''.join(brackets) + ''.join(brackets.values())
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    # Kept as a string: segments are tested as substrings of it below.
    blocked = brackets + splitters
    nulls = set()
    bad_words = set()
    profile = defaultdict(list)
    for idx, word, language in pb(wordlist.iter_rows(ref, col),
                                  desc='iter words',
                                  total=len(wordlist)):
        log.info('processing {0}-{1}'.format(idx, word))
        if isinstance(word, list):
            word = ' '.join(word)
        if not word.strip():
            continue
        try:
            segments = clean_string(
                word,
                semi_diacritics=semi_diacritics,
                merge_vowels=merge_vowels,
                brackets=None,
                ignore_brackets=False,
                split_entries=False,
                preparse=None,
                rules=None,
                merge_geminates=merge_geminates)[0].split(' ')

            # retain whole word if there are brackets or splitters in it
            if any(segment in blocked for segment in segments):
                profile[word].append((language, word))
                bad_words.add(word)
            else:
                # mark the first segment with "^" and the last with "$"
                last = len(segments) - 1
                for pos, segment in enumerate(segments):
                    key = (('^' if pos == 0 else '') + segment +
                           ('$' if pos == last else ''))
                    profile[key].append((language, word))
                # characters of the word that are absent from the cleaned
                # string are recorded separately and later mapped to NULL
                joined = ' '.join(segments)
                for char in word:
                    if char not in joined:
                        profile[char].append((language, word))
                        nulls.add(char)
        except Exception:
            # Best effort: a word that cannot be processed is logged and
            # skipped. KeyboardInterrupt and SystemExit still propagate.
            log.warn('problem parsing {0}'.format(word))

    for s in '^$':
        yield s, 'NULL', '', '', '', ''

    for s, entries in pb(
            sorted(profile.items(), key=lambda x: len(x[1]), reverse=True),
            desc='yielding entries',
            total=len(profile)):
        bare = s.strip('^$')
        sclass = token2class(bare, 'dolgo')
        considered = entries[:max_entries]
        words = [entry[1] for entry in considered]
        langs = [entry[0] for entry in considered]
        languages = ', '.join(
            sorted(set(langs), key=lambda x: langs.count(x), reverse=True))
        frequency = str(len(langs))
        codepoints = codepoint(s)
        examples_ = ', '.join(
            sorted(set(words), key=lambda x: words.count(x),
                   reverse=True)[:examples])
        if s in bad_words:
            ipa = bad_word.format(s)
        elif s in nulls:
            # Checked before the sound-class test, as in simple_profile, so
            # that this branch is not shadowed by the "0" class check.
            ipa = 'NULL'
        elif sclass == '0':
            ipa = bad_sound.format(s)
        elif clts_:
            sound = clts_.get(bare, False)
            # honour the caller's marker instead of a hard-coded "!" prefix
            ipa = text_type(sound) if sound else unknown_sound.format(bare)
        else:
            ipa = bare

        yield s, ipa, examples_, languages, frequency, codepoints
Ejemplo n.º 7
0
 def tokenizer(self):
     """Return a callable that tokenizes a string with LingPy's clean_string.

     The returned callable accepts two positional arguments and arbitrary
     keyword arguments. Only the second positional argument is passed to
     clean_string; the first one and all keyword arguments are ignored.
     """
     from lingpy.sequence.sound_classes import clean_string

     def _tokenize(_, s, **kw):
         return clean_string(s)

     return _tokenize