Example No. 1
import lingpy as lp


def test_ipa2tokens(alm):
    # iterate over the keys of the wordlist-like fixture
    for key in alm:  # alternative: alm.get_list(language="Turkish", flat=True)
        ipa = alm[key, 'ipa']
        tokens_a = alm[key, 'tokensa'].split(' ')
        tokens_b = alm[key, 'tokensb'].split(' ')

        new_tokens_a = lp.ipa2tokens(ipa, merge_vowels=True, merge_geminates=False)
        new_tokens_b = lp.ipa2tokens(ipa, merge_vowels=False, merge_geminates=False)
        assert tokens_a == new_tokens_a
        assert tokens_b == new_tokens_b
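The two calls differ only in the merge_vowels flag. A minimal sketch of its effect (the input string is an arbitrary assumption; outputs indicated in the comments):

import lingpy as lp

lp.ipa2tokens('tʰiau', merge_vowels=True)   # e.g. ['tʰ', 'iau']
lp.ipa2tokens('tʰiau', merge_vowels=False)  # e.g. ['tʰ', 'i', 'a', 'u']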
Example No. 2
    def test_ipa2tokens(self):
        # iterate over the keys
        for key in self.alm:  # alternative: self.alm.get_list(language="Turkish", flat=True)
            ipa = self.alm[key, 'ipa']
            tokensA = self.alm[key, 'tokensa'].split(' ')
            tokensB = self.alm[key, 'tokensb'].split(' ')

            new_tokensA = lp.ipa2tokens(ipa, merge_vowels=True)
            new_tokensB = lp.ipa2tokens(ipa, merge_vowels=False)
            assert tokensA == new_tokensA
            assert tokensB == new_tokensB
Example No. 3
from lingpy import ipa2tokens


def tokenize_word_reversibly(ipa):
    """Reversibly convert an IPA string into a list of tokens.

    In contrast to LingPy's tokenize_word, do this without removing
    symbols. This means that the original IPA string can be recovered
    from the tokens.

    """
    tokenized_word = ipa2tokens(ipa, merge_vowels=False, merge_geminates=False)
    token = 0
    index = 0
    # walk through the raw string and splice back every character that
    # ipa2tokens dropped, so the tokens concatenate to the original input
    for i in ipa:
        try:
            tokenized_word[token][index]
        except IndexError:
            # the current token is exhausted; move on to the next one
            token += 1
            index = 0
        try:
            if i != tokenized_word[token][index]:
                if index == 0:
                    tokenized_word.insert(token, i)
                else:
                    tokenized_word[token] = (tokenized_word[token][:index] +
                                             i + tokenized_word[token][index:])
        except IndexError:
            # a character was dropped at the very end of the string
            tokenized_word.append(i)
        index += 1
    assert ''.join(tokenized_word) == ipa
    return tokenized_word
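A quick round-trip check (the IPA string is an arbitrary assumption):

word = tokenize_word_reversibly('tʰiau')
assert ''.join(word) == 'tʰiau'  # the input is always recoverable from the tokens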
Example No. 4
import collections
import csv

import lingpy
# clean_word and ipa2asjp are assumed to be helpers from the surrounding project


def read_data_cldf(datafile,
                   sep="\t",
                   char_list=None,
                   cogids_are_cross_semantically_unique=True,
                   data='ASJP'):
    """Read a CLDF file in TSV or CSV format."""
    if char_list is None:  # avoid the mutable-default-argument pitfall
        char_list = set()
    reader = csv.DictReader(datafile,
                            dialect='excel' if sep == ',' else 'excel-tab')
    langs = set()
    data_dict = collections.defaultdict(dict)
    cogid_dict = {}
    words_dict = collections.defaultdict(lambda: collections.defaultdict(list))
    for line, row in enumerate(reader):
        lang = row["Language ID"]
        langs.add(lang)

        if data == 'ASJP':
            try:
                asjp_word = clean_word(row["ASJP"])
            except KeyError:
                asjp_word = ipa2asjp.ipa2asjp(row["IPA"])
        elif data == 'IPA':
            asjp_word = tuple(lingpy.ipa2tokens(row["IPA"],
                                                merge_vowels=False))
        else:
            asjp_word = row[data]

        if not asjp_word:
            continue

        # record every segment type occurring in the data
        for ch in asjp_word:
            char_list.add(ch)

        concept = row["Feature ID"]
        cogid = row["Cognate Class"]

        data_dict[concept][line, lang] = asjp_word
        cogid_dict.setdefault(
            cogid if cogids_are_cross_semantically_unique else
            (cogid, concept), set()).add((lang, concept, asjp_word))
        words_dict[concept].setdefault(lang, []).append(asjp_word)

    return (data_dict, list(cogid_dict.values()), words_dict, list(langs),
            char_list)
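A minimal usage sketch, assuming a tab-separated CLDF-style file at the hypothetical path wordlist.tsv with the columns the function reads ("Language ID", "Feature ID", "Cognate Class", and "IPA"):

with open('wordlist.tsv', encoding='utf-8') as f:
    data, cogsets, words, langs, chars = read_data_cldf(f, data='IPA')
# with cogids_are_cross_semantically_unique=False, identical cognate-class
# labels under different concepts are kept in separate cognate sets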
Example No. 5
import lingpy
from lingpy import tokens2class, tokens2morphemes
# parse_chinese_morphemes is the function shown in Example No. 7 below


def get_structure(word,
                  sep='+',
                  zipped=False,
                  semi_diacritics='hsʃʂʒʐzθɕʑfvθðnmȵ'):
    if not isinstance(word, (list, tuple)):
        word = lingpy.ipa2tokens(word,
                                 expand_nasals=True,
                                 merge_vowels=False,
                                 semi_diacritics=semi_diacritics)

    # check for unknown characters
    try:
        tokens2class(word, 'cv', cldf=True)
    except ValueError:
        print('problem with {0}'.format(''.join(word)))
        return  # bare return: this is a generator, so simply stop yielding

    # get the morphemes
    if sep in word:
        words = tokens2morphemes(word, cldf=True)
        morphemes = []
        for w in words:
            morphemes += tokens2morphemes(w, sep=sep)
    else:
        morphemes = tokens2morphemes(word, cldf=True)
    # get the basic structure for each morpheme
    for morpheme in morphemes:
        try:
            segments = parse_chinese_morphemes(morpheme)
        except Exception:
            if not zipped:
                yield ['NULL']
            else:
                yield ([('NULL', 'NULL')], morpheme)
            continue  # segments is undefined after a failure, so skip ahead
        if not zipped:
            yield [x for x, y in zip('imnct', segments) if y != '-']
        else:
            yield ([x for x in zip('imnct', segments)
                    if x[1] != '-'], morpheme)
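A minimal usage sketch (the input form is an arbitrary assumption; get_structure is a generator that yields one template per morpheme):

for template in get_structure('tʰiau⁵¹'):
    print(template)  # e.g. a subset of ['i', 'm', 'n', 'c', 't'] naming the filled slots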
Example No. 6
    output[['Taxon','Gloss']] = output[['Taxon','Gloss']].astype('string')
    output['dbID'] = [db+'_'+str(x-1) for x in output.ID.values]
    output.to_csv('reformattedData/asjp/'+db+'.tsv',encoding='utf-8',
                  sep='\t',index=False)

# NB: legacy script: it relies on Python 2 built-ins (unicode) and on
# long-deprecated pandas/pylab names (pd.match, arange, array, unique)
for f in [x for x in os.listdir(path+'data/list_length_project/sets/mattis_new/output') if x!='.svn']:
    db = f.split('.')[0]
    data = pd.read_table(path+'/data/list_length_project/sets/mattis_new/output/'+f,encoding='utf-8')
    data = data[['-' not in unicode(x) for x in data.cognate_class.values]]
    output = pd.DataFrame()
    output['ID'] = arange(len(data))+1
    output['Taxon'] = data.language.values
    output['Gloss'] = data.gloss.values
    output['GlossID'] = pd.match(data.gloss.values,data.gloss.unique())+1
    output['IPA'] = [re.sub(r"[ -]","",unicode(x)) for x in data.transcription]
    output['Tokens'] = [' '.join(lp.ipa2tokens(unicode(w))) for w in output.IPA]
    cClasses = array([x+':'+unicode(y).strip('?')
                      for (x,y) in data[['gloss','cognate_class']].values])
    output['CogID'] = pd.match(cClasses,unique(cClasses))
    output[['Taxon','Gloss']] = output[['Taxon','Gloss']].astype('string')
    output['dbID'] = [db+'_'+str(x-1) for x in output.ID.values]
    output.to_csv('reformattedData/ipa/'+db+'.tsv',encoding='utf-8',
                  sep='\t',index=False)


for f in [x for x in os.listdir(path+'data/list_length_project/sets/abvd2/output') if x!='.svn']:
    db = f.split('.')[0]
    data = pd.read_table(path+'/data/list_length_project/sets/abvd2/output/'+f,encoding='utf-8')
    data = data[['-' not in unicode(x) for x in data.cognate_class.values]]
    data = data[[',' not in unicode(x) for x in data.cognate_class.values]]
    data = data[['?' not in unicode(x) for x in data.cognate_class.values]]
Example No. 7
import lingpy


def parse_chinese_morphemes(seq, context=False):
    """
    Parse a Chinese syllable and return its basic structure.
    """

    # get the tokens
    if isinstance(seq, list):
        tokens = [s for s in seq]
    else:
        tokens = lingpy.ipa2tokens(seq, merge_vowels=False)

    # get the sound classes according to the art model (_art is assumed to be
    # an articulation model defined at module level)
    arts = [int(x) for x in lingpy.tokens2class(tokens, _art, cldf=True)]

    # get the pro-string
    prostring = lingpy.prosodic_string(arts)

    # parse the zip of tokens and arts
    I, M, N, C, T = '', '', '', '', ''

    ini = False
    med = False
    nuc = False
    cod = False
    ton = False

    triples = [('?', '?', '?')] + list(zip(tokens, arts,
                                           prostring)) + [('?', '?', '?')]

    # walk through the triples, skipping the two boundary markers
    for i in range(1, len(triples) - 1):

        t, c, p = triples[i]
        _t, _c, _p = triples[i - 1]
        t_, c_, p_ = triples[i + 1]

        # check for initial entry first
        if p == 'A' and _t == '?':

            # now, if we have a j-sound and a vowel follows, we go directly to
            # medial environment
            if t[0] in 'jɥw':
                med = True
                ini, nuc, cod, ton = False, False, False, False
            else:
                ini = True
                med, nuc, cod, ton = False, False, False, False

        # check for initial vowel
        elif p == 'X' and _t == '?':
            if t[0] in 'iuy' and c_ == '7':
                med = True
                ini, nuc, cod, ton = False, False, False, False
            else:
                nuc = True
                ini, med, cod, ton = False, False, False, False

        # check for medial after initial
        elif p == 'C':
            med = True
            ini, nuc, cod, ton = False, False, False, False

        # check for vowel medial
        elif p == 'X' and p_ == 'Y':

            # if we have a medial vowel, we classify it as medial
            if t in 'iyu':
                med = True
                ini, nuc, cod, ton = False, False, False, False
            else:
                nuc = True
                ini, med, cod, ton = False, False, False, False

        # check for vowel without medial
        elif p == 'X' or p == 'Y':
            if p_ in 'LTY' or p_ == '?':
                nuc = True
                ini, med, cod, ton = False, False, False, False
            elif p == 'Y':
                nuc = True
                ini, med, cod, ton = 4 * [False]
            else:
                cod = True
                ini, med, nuc, ton = 4 * [False]

        # check for consonant
        elif p == 'L':
            cod = True
            ini, med, nuc, ton = 4 * [False]

        # check for tone
        elif p == 'T':
            ton = True
            ini, med, nuc, cod = 4 * [False]

        if ini:
            I += t
        elif med:
            M += t
        elif nuc:
            N += t
        elif cod:
            C += t
        else:
            T += t

    # collect the five constituents for output, substituting '-' for empty slots
    out = [I, M, N, C, T]
    tf = lambda x: x if x else '-'
    out = [tf(x) for x in out]

    # transform tones to normal letters
    tones = dict(zip('¹²³⁴⁵⁶⁷⁸⁹⁰₁₂₃₄₅₆₇₈₉₀', '12345678901234567890'))

    # now, if context is wanted, we'll yield that
    ic = '1' if [x for x in I if x in 'bdgmnŋȵɳɴ'] else '0'
    mc = '1' if [m for m in M + N if m in 'ijyɥ'] else '0'
    cc = '1' if C in 'ptkʔ' else '0'
    tc = ''.join([tones.get(x, x) for x in T])

    IC = '/'.join(['I', ic, mc, cc, tc]) if I else ''
    MC = '/'.join(['M', ic, mc, cc, tc]) if M else ''
    NC = '/'.join(['N', ic, mc, cc, tc]) if N else ''
    CC = '/'.join(['C', ic, mc, cc, tc]) if C else ''
    TC = '/'.join(['T', ic, mc, cc, tc]) if T else ''

    if context:
        return out, [x for x in [IC, MC, NC, CC, TC] if x]

    return out
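A minimal usage sketch (the syllable is an arbitrary assumption; the result lists initial, medial, nucleus, coda, and tone, with '-' for empty slots):

imnct = parse_chinese_morphemes('tʰiau⁵¹')   # e.g. something like ['tʰ', 'i', 'a', 'u', '⁵¹']
out, ctx = parse_chinese_morphemes('tʰiau⁵¹', context=True)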
Example No. 8
def get_tokenizer():
    return lambda x, y: ipa2tokens(y, merge_vowels=False)
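A usage sketch with assumed input, given the function above; note that the returned callable ignores its first argument (presumably a key or row ID):

tokenize = get_tokenizer()
tokens = tokenize(None, 'tʰiau')  # e.g. ['tʰ', 'i', 'a', 'u']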
Example No. 9
def cldf(dataset, concepticon, **kw):
    """
    Implements the conversion of the raw data to CLDF dataset(s).

    :param dataset: provides access to the information in supplementary files:
        - the JSON object from `metadata.json` is available as `dataset.md`
        - items from languages.csv are available as `dataset.languages`
        - items from concepts.csv are available as `dataset.concepts`
        - if a Concepticon conceptlist was specified in metadata.json, its ID
          is available as `dataset.conceptlist`
    :param concepticon: a `pyconcepticon.api.Concepticon` instance.
    :param kw: all arguments passed on the command line.
    """

    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())

    # get language identifiers
    lids, cids, coords = {}, {}, {}
    for row in dataset.languages:
        language = row['NAME']
        lids[language] = row['GLOTTOCODE']
    coords = dict([wl.coords[taxon] for taxon in lids])
    modify = {
        'thunder (verb)': 'thunder',
        'flash (verb)': 'lightning',
        'room': 'flat',
        'have diarrea': 'have diarrhoea',
        'watery': 'light'
    }
    for row in dataset.concepts:
        concept = modify.get(row['CONCEPT'], row['CONCEPT'])
        cids[concept] = row['CONCEPT_SET']

    # sources
    src = getEvoBibAsSource(SOURCE)
    src2 = getEvoBibAsSource('List2014b')

    # get partial identifiers
    partial_ids = defaultdict(list)
    partial_converter = {}
    idx = 1
    for k in wl:
        for char in wl[k, 'counterpart']:
            if char in partial_converter:
                pidx = partial_converter[char]
            else:
                pidx = idx
                partial_converter[char] = idx
                idx += 1
            partial_ids[k] += [pidx]

    # track which cognate sets already have a proto-language (Middle Chinese) row
    visited = []
    idx = max([k for k in wl]) + 1

    with CldfDataset(
        ('ID', 'Language_ID', 'Language_name', 'Language_iso', 'Parameter_ID',
         'Parameter_name', 'Parameter_Chinese_name', 'Value',
         'Value_Chinese_characters', 'Source', 'Segments', 'Cognacy', 'Rank',
         'Comment'), dataset) as ds:

        ds.sources.add(src)
        ds.sources.add(src2)

        D = {0: ['doculect', 'concept', 'ipa', 'tokens', 'cogid']}
        for k in wl:
            tokens = lp.ipa2tokens(wl[k, 'ipa'],
                                   merge_vowels=False,
                                   expand_nasals=True)
            # remove the sandhi annotation from the tokens, as it confuses clpa
            for i, t in enumerate(tokens):
                if '⁻' in t:
                    tokens[i] = t[:t.index('⁻')]
            ds.add_row([
                '{0}-{1}'.format(SOURCE, k),
                lids[wl[k, 'doculect']],
                wl[k, 'doculect'],
                '',
                cids[wl[k, 'concept']],
                wl[k, 'concept'],
                wl[k, 'mandarin'],
                wl[k, 'ipa'],
                wl[k, 'counterpart'],
                SOURCE,
                ' '.join(tokens),
                wl[k, 'cogid'],
                wl[k, 'order'],
                wl[k, 'note'] if wl[k, 'note'] != '-' else '',
            ])
            D[k] = [
                wl[k, 'doculect'], wl[k, 'concept'], wl[k, 'ipa'], tokens,
                wl[k, 'cogid']
            ]
            if wl[k, 'cogid'] not in visited:
                # ipa2tokens needs explicit tone marks, so split the Middle
                # Chinese form into syllables and append '¹' (or '⁴' for
                # checked syllables ending in p/t/k) where no tone is marked
                syllables = wl[k, 'mch'].split('.')
                for i, s in enumerate(syllables):
                    if s[-1] not in '²³':
                        if s[-1] not in 'ptk':
                            syllables[i] += '¹'
                        else:
                            syllables[i] += '⁴'
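                # e.g. (assumed illustration): 'kap' -> 'kap⁴' and 'ka' -> 'ka¹',
                # while 'ka²' already ends in a tone mark and is left alone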
                tokens = lp.ipa2tokens(''.join(syllables))
                ds.add_row([
                    '{0}-{1}'.format(wl[k, 'concept'], idx), 'sini1245',
                    'Middle Chinese', '', cids[wl[k, 'concept']],
                    wl[k, 'concept'], '', wl[k, 'proto'], wl[k, 'counterpart'],
                    SOURCE, ' '.join(tokens), wl[k, 'cogid'], '', ''
                ])
                D[idx] = [
                    'Middle Chinese', wl[k, 'concept'], wl[k, 'mch'], tokens,
                    wl[k, 'cogid']
                ]
                idx += 1
                visited += [wl[k, 'cogid']]
        alms = lp.Alignments(D)
        cognates = [[
            '{0}-{1}'.format(SOURCE, k), ds.name, alms[k, 'ipa'],
            '-'.join([slug(alms[k, 'concept']),
                      str(alms[k, 'cogid'])]), '', 'expert', SOURCE, '', '', ''
        ] for k in alms]

        dataset.cognates.extend(
            iter_alignments(alms, cognates, method='library'))