Example #1
import lingpy
from clldutils.misc import slug

def iter_cognates(dataset, column='Segments', method='turchin', threshold=0.5, **kw):
    """
    Compute cognates automatically for a given dataset.
    """
    if method == 'turchin':
        for row in dataset.objects['FormTable']:
            sounds = ''.join(lingpy.tokens2class(row[column], 'dolgo'))
            if sounds.startswith('V'):
                sounds = 'H' + sounds
            sounds = '-'.join([s for s in sounds if s != 'V'][:2])
            cogid = slug(row['Parameter_ID']) + '-' + sounds
            if '0' not in sounds:
                yield dict(
                    Form_ID=row['ID'],
                    Form=row['Value'],
                    Cognateset_ID=cogid,
                    Cognate_Detection_Method='CMM')

    if method in ['sca', 'lexstat']:
        # _cldf2lexstat is a module-internal helper that converts the CLDF
        # data to a lingpy LexStat wordlist
        lex = _cldf2lexstat(dataset)
        if method == 'lexstat':
            lex.get_scorer(**kw)
        lex.cluster(method=method, threshold=threshold, ref='cogid')
        for k in lex:
            yield Cognate(  # Cognate is assumed defined in the enclosing module
                Form_ID=lex[k, 'lid'],
                Form=lex[k, 'value'],
                Cognateset_ID=lex[k, 'cogid'],
                Cognate_Detection_Method=method + '-t{0:.2f}'.format(threshold))
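The Turchin (CMM) branch above reduces each form to the Dolgopolsky classes of its first two consonants. A minimal stand-alone sketch of that step, with a made-up token list and concept id:

import lingpy
from clldutils.misc import slug

tokens = ['h', 'a', 'n', 't']
sounds = ''.join(lingpy.tokens2class(tokens, 'dolgo'))
if sounds.startswith('V'):  # vowel-initial forms get a dummy laryngeal class
    sounds = 'H' + sounds
sounds = '-'.join([s for s in sounds if s != 'V'][:2])  # first two consonant classes
print(slug('hand') + '-' + sounds)  # cognate-set id, e.g. 'hand-H-N'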
Example #2
import lingpy as lp

def toASJP(w):
    if w == u'ũ': return 'u'
    if w == u'XXX': return ''
    w = w.replace('\"', '').replace('-', '')  #.replace(' ','')
    wA = ''.join(lp.tokens2class(w.split(), 'asjp'))
    # cleanASJP and sounds are module-level helpers/constants
    wAA = cleanASJP(
        wA.replace(u'0', '').replace(u'I', u'3').replace(u'H', u'N'))
    return ''.join([x for x in wAA if x in sounds])
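The core of toASJP is lingpy's ASJP class conversion; a quick stand-alone look at just that call, without the module-level cleanASJP/sounds post-processing (the segmented form is made up):

import lingpy as lp

print(''.join(lp.tokens2class('h a u s'.split(), 'asjp')))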
Example #3
import lingpy

def find_bad_tokens(wordlist):
    """Collect which bad symbols appear in which forms."""
    bad_tokens = {}
    for k, segments, form_id in wordlist.iter_rows('tokens', 'reference'):
        classes = lingpy.tokens2class(segments, 'dolgo')
        for token, cls in zip(segments, classes):
            if cls == "0":
                bad_tokens.setdefault(token, []).append(form_id)
    return bad_tokens
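A hedged usage sketch: a two-row lingpy Wordlist with the columns the function iterates over, plus one deliberately unknown symbol, assuming tokens2class maps unknown tokens to the error class "0" (as the function above expects) rather than raising:

import lingpy

D = {
    0: ['doculect', 'concept', 'tokens', 'reference'],
    1: ['German', 'hand', 'h a n t'.split(), 'form-1'],
    2: ['German', 'house', 'h a $ s'.split(), 'form-2'],
}
print(find_bad_tokens(lingpy.Wordlist(D)))  # expected: {'$': ['form-2']}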
Example #4
import itertools

from lingpy import tokens2class

def ipa2sca(ipa):
    """Convert an IPA string into an SCA token string.

    The output is padded so that it has the same length as the input
    IPA string.
    """
    sca_list = [
        t for x in tokenize_word_reversibly(ipa)  # project-specific reversible tokenizer
        # zip_longest with '0' guarantees at least one element per chunk
        for t, char in itertools.zip_longest(tokens2class(x, 'sca'), '0')
    ]
    assert len(''.join(sca_list)) == len(ipa)
    return ''.join(sca_list)
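The inner expression carries the trick: tokens2class is applied to the raw string of each chunk (yielding one class per character), and zip_longest with '0' guarantees at least one element per chunk. A minimal look at that expression in isolation, assuming a plain two-character chunk:

import itertools
from lingpy import tokens2class

chunk = 'au'
sca = [t for t, _ in itertools.zip_longest(tokens2class(chunk, 'sca'), '0')]
print(sca, len(sca) == len(chunk))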
Example #5
import lingpy
from clldutils.misc import slug

def iter_cognates(dataset, column='Segments', method='turchin', threshold=0.5, **kw):
    """
    Compute cognates automatically for a given dataset.

    :param dataset: Either a `LexibankWriter` instance or a `pycldf.Dataset`.
    """
    forms = (dataset.objects['FormTable']
             if hasattr(dataset, 'objects') else list(dataset['FormTable']))

    if method == 'turchin':
        for row in forms:
            sounds = ''.join(lingpy.tokens2class(row[column], 'dolgo'))
            if sounds.startswith('V'):
                sounds = 'H' + sounds
            sounds = '-'.join([s for s in sounds if s != 'V'][:2])
            cogid = slug(row['Parameter_ID']) + '-' + sounds
            if '0' not in sounds:
                yield dict(
                    Form_ID=row['ID'],
                    Form=row['Value'],
                    Cognateset_ID=cogid,
                    Cognate_Detection_Method='CMM')

    if method in ['sca', 'lexstat']:
        try:
            # _cldf2lexstat is a module-internal helper that converts the
            # CLDF data to a lingpy LexStat wordlist
            lex = _cldf2lexstat(dataset)
        except ValueError:  # conversion failed, nothing to cluster
            return
        if method == 'lexstat':
            lex.get_scorer(**kw)
        lex.cluster(method=method, threshold=threshold, ref='cogid')
        for k in lex:
            yield dict(
                Form_ID=lex[k, 'lid'],
                Form=lex[k, 'value'],
                Cognateset_ID=lex[k, 'cogid'],
                Cognate_Detection_Method=method + '-t{0:.2f}'.format(threshold))
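A hedged usage sketch for the pycldf branch (the metadata path is hypothetical):

from pycldf import Dataset

ds = Dataset.from_metadata('cldf/cldf-metadata.json')
for cog in iter_cognates(ds, method='sca', threshold=0.45):
    print(cog['Form_ID'], cog['Cognateset_ID'])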
Example #6
    def get_wordlist(
            self,
            doculect='base',
            profile=False,
            ref='crossid',
            lexstat=True,
            threshold=0.4):
        """
        Return a classical wordlist from the data.
        """
        if profile:
            # profile is a path to an orthography profile for the segments package
            profile = segments.Tokenizer(profile)
            tokenize = lambda x: profile('^' + x + '$', column='IPA').split()  # noqa: E731
        else:
            tokenize = lingpy.ipa2tokens

        D = {
            0: [
                'doculect',
                'concept',
                'concept_in_source',
                'concept_type',
                'form',
                'tokens',
                'occurrences',
                'word_forms',
                'gloss_forms',
                'phrase_example',
                'gloss_example',
                'references',
            ]
        }
        idx = 1
        for ctype in ['lexicon', 'grammar']:
            concepts = self.get_concepts(ctype=ctype)
            concordance = self._concordances[ctype]
            for concept, entries in concepts.items():
                for form, lid, cis, freq in entries:
                    # retrieve the concordance
                    pidx, sA, sB = concordance[form, concept, cis, lid][0]
                    txt = self[pidx].phrase
                    gls = self[pidx].gloss
                    word, fgls = self[pidx, sA]
                    tokens = tokenize(form)
                    references = ' '.join(
                        ['{0}:{1}:{2}'.format(a, b, c)
                         for a, b, c in concordance[form, concept, cis, lid]])
                    # check that the tokens convert cleanly to SCA sound classes
                    try:
                        lingpy.tokens2class(tokens, 'sca')
                        check = True
                    except:  # noqa: E722, # pragma: no cover
                        check = False
                    if concept.strip() and check:
                        D[idx] = [
                            doculect if self.monolingual else lid,
                            concept,
                            cis,
                            ctype,
                            form,
                            tokens,
                            freq,
                            word,
                            fgls,
                            txt,
                            gls,
                            references]
                        idx += 1
                    else:
                        print('[!] Problem with "{0}" / [{1}] [{2}] / {3} {4} {5}'.format(
                            concept,
                            form,
                            tokens,
                            pidx,
                            sA,
                            sB,
                        ))
        if lexstat:
            wl = lingpy.LexStat(D)
            wl.cluster(method='sca', threshold=threshold, ref=ref)
        else:
            wl = lingpy.Wordlist(D)
            wl.add_entries('cog', 'concept,form', lambda x, y: x[y[0]] + '-' + x[y[1]])
            wl.renumber('cog', ref)
        return wl
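The profile branch relies on the segments package; a stand-alone look at that tokenizer, assuming an orthography profile with an IPA column (the file name is hypothetical):

from segments import Tokenizer

profile = Tokenizer('orthography-profile.tsv')
print(profile('^haus$', column='IPA').split())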
Example #7
import lingpy

def parse_chinese_morphemes(seq, context=False):
    """
    Parse a Chinese syllable and return its basic structure.
    """

    # get the tokens
    if isinstance(seq, list):
        tokens = list(seq)
    else:
        tokens = lingpy.ipa2tokens(seq, merge_vowels=False)

    # get the sound classes according to the art model (_art is a module-level
    # lingpy articulation model, e.g. lingpy.data.Model('art'))
    arts = [int(x) for x in lingpy.tokens2class(tokens, _art, cldf=True)]

    # get the pro-string
    prostring = lingpy.prosodic_string(arts)

    # parse the zip of tokens and arts into initial (I), medial (M),
    # nucleus (N), coda (C), and tone (T)
    I, M, N, C, T = '', '', '', '', ''

    ini = False
    med = False
    nuc = False
    cod = False
    ton = False

    triples = [('?', '?', '?')] + list(zip(tokens, arts, prostring)) + [('?', '?', '?')]

    for i in range(1, len(triples) - 1):

        t, c, p = triples[i]
        _t, _c, _p = triples[i - 1]
        t_, c_, p_ = triples[i + 1]

        # check for initial entry first
        if p == 'A' and _t == '?':

            # now, if we have a j-sound and a vowel follows, we go directly to
            # medial environment
            if t[0] in 'jɥw':
                med = True
                ini, nuc, cod, ton = False, False, False, False
            else:
                ini = True
                med, nuc, cod, ton = False, False, False, False

        # check for initial vowel
        elif p == 'X' and _t == '?':
            if t[0] in 'iuy' and c_ == '7':
                med = True
                ini, nuc, cod, ton = False, False, False, False
            else:
                nuc = True
                ini, med, cod, ton = False, False, False, False

        # check for medial after initial
        elif p == 'C':
            med = True
            ini, nuc, cod, ton = False, False, False, False

        # check for vowel medial
        elif p == 'X' and p_ == 'Y':

            # if we have a medial vowel, we classify it as medial
            if t in 'iyu':
                med = True
                ini, nuc, cod, ton = False, False, False, False
            else:
                nuc = True
                ini, med, cod, ton = False, False, False, False

        # check for vowel without medial
        elif p == 'X' or p == 'Y':
            if p_ in 'LTY' or p_ == '?':
                nuc = True
                ini, med, cod, ton = False, False, False, False
            elif p == 'Y':
                nuc = True
                ini, med, cod, ton = 4 * [False]
            else:
                cod = True
                ini, med, nuc, ton = 4 * [False]

        # check for consonant
        elif p == 'L':
            cod = True
            ini, med, nuc, ton = 4 * [False]

        # check for tone
        elif p == 'T':
            ton = True
            ini, med, nuc, cod = 4 * [False]

        if ini:
            I += t
        elif med:
            M += t
        elif nuc:
            N += t
        elif cod:
            C += t
        else:
            T += t

    # crude conversion for the output: empty slots are rendered as '-'
    out = [I, M, N, C, T]
    tf = lambda x: x if x else '-'
    out = [tf(x) for x in out]

    # transform tone characters to plain digits
    tones = dict(zip('¹²³⁴⁵⁶⁷⁸⁹⁰₁₂₃₄₅₆₇₈₉₀', '12345678901234567890'))

    # if context is wanted, return the context strings as well
    ic = '1' if [x for x in I if x in 'bdgmnŋȵɳɴ'] else '0'
    mc = '1' if [m for m in M + N if m in 'ijyɥ'] else '0'
    cc = '1' if C in 'ptkʔ' else '0'
    tc = ''.join([tones.get(x, x) for x in T])

    IC = '/'.join(['I', ic, mc, cc, tc]) if I else ''
    MC = '/'.join(['M', ic, mc, cc, tc]) if M else ''
    NC = '/'.join(['N', ic, mc, cc, tc]) if N else ''
    CC = '/'.join(['C', ic, mc, cc, tc]) if C else ''
    TC = '/'.join(['T', ic, mc, cc, tc]) if T else ''

    if context:
        return out, [x for x in [IC, MC, NC, CC, TC] if x]
    return out
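A hedged usage sketch (the pre-segmented syllable is made up for illustration, and _art must be bound to lingpy's articulation model, e.g. lingpy.data.Model('art'), before the function can run):

parts, contexts = parse_chinese_morphemes(['ʨ', 'i', 'a', 'ŋ', '²¹'], context=True)
print(parts)     # [initial, medial, nucleus, coda, tone], '-' for empty slots
print(contexts)  # context strings for the non-empty slots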
Example #8
import lingpy

def to_asjp(segments):
    # 'model' is a module-level sound-class model, presumably 'asjp'
    return lingpy.tokens2class(segments, model, cldf=False)