Example #1
    def check(self, column='TOKENS', rules=False, clpa=None):
        clpa = clpa or get_clpa()

        # If an alias table is given, rewrite the tokens in place
        # before checking them against CLPA.
        if rules:
            rules = load_alias(rules)
            for val in self:
                tokens = [rules.get(t, t) for t in split(val[column])]
                val[column] = join(tokens)

        sounds, errors = {}, Counter({'convertable': 0, 'non-convertable': 0})
        for item in self:
            # Accumulate the sound table and error counts across all rows.
            new_tokens, sounds, errors = clpa.check_sequence(
                split(item[column]), sounds=sounds, errors=errors)
            idxs = [clpa.segment2clpa(t) for t in new_tokens]
            item['CLPA_TOKENS'] = join(new_tokens)
            item['CLPA_IDS'] = join(idxs)

        return sounds, errors
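
For orientation, here is a minimal sketch of the underlying pyclpa calls that check() wraps, run outside the class. The check_sequence signature and the segment2clpa lookup are taken from the method above; the token sequence itself is made up.

from collections import Counter
from pyclpa.base import get_clpa

clpa = get_clpa()
sounds, errors = {}, Counter({'convertable': 0, 'non-convertable': 0})

# Check one made-up segmented form; the three-value return mirrors
# the usage in the method above.
tokens, sounds, errors = clpa.check_sequence(
    ['t', 'o', 'x', 't', 'e', 'r'], sounds=sounds, errors=errors)
print(tokens, [clpa.segment2clpa(t) for t in tokens])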
Example #2
def inventories(dataset):
    clpa = get_clpa()
    files = glob(dataset.get_path('raw', 'inventories.tsv'))

    t = Tokenizer(dataset.get_path('raw', 'profile.prf'))
    sounds = defaultdict(lambda: defaultdict(set))  # unused in this excerpt

    def transform(value, column):
        # Tokenize with the orthography profile, then normalize to NFC.
        return unicodedata.normalize('NFC', t.transform(value, column))

    invs = {language: [] for language in dataset.languages}
    for f in files:
        data = csv2list(f)
        for i, line in enumerate(data):
            number, dialect, page, sound, value, *rest = line
            if not rest:
                rest = ['']
            cddb = transform(value, 'CDDB')
            src = transform(value, 'SOURCE')
            struct = ' '.join(t.transform(value, 'STRUCTURE'))
            invs[dialect].append(
                [src.replace(' ', ''), cddb, struct, ', '.join(rest)])
            # Warn when the STRUCTURE template and the CDDB segments
            # do not align one-to-one.
            if len(struct.split()) != len(cddb.split()):
                print(i + 1, 'warn', struct, '   |   ', cddb)
    dataset.write_inventories(invs)
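
The length check at the end is the only validation the function performs. Here it is as a standalone sketch with made-up strings, using just the standard library:

import unicodedata

# Made-up STRUCTURE template and CDDB form: the guard fires whenever the
# template does not provide exactly one slot per segment.
struct = 'C V C'
cddb = unicodedata.normalize('NFC', 'p a')
if len(struct.split()) != len(cddb.split()):
    print('warn', struct, '   |   ', cddb)  # fires: 3 slots vs. 2 segments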
Example #3
    def setUp(self):
        self.clpa = get_clpa()
        self.clpa2 = CLPA(rules=dict(th='t'))
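
The second fixture passes an inline alias table to the CLPA constructor. A minimal sketch of the difference this makes, assuming CLPA is importable from pyclpa.base alongside get_clpa (the token sequence is made up):

from collections import Counter
from pyclpa.base import CLPA, get_clpa  # assumption: CLPA lives next to get_clpa

plain = get_clpa()                 # no aliases
aliased = CLPA(rules={'th': 't'})  # 'th' is rewritten to 't' before checking

# Made-up sequence; under the alias, 'th' should be handled like 't'.
tokens, sounds, errors = aliased.check_sequence(
    ['th', 'a'], sounds={},
    errors=Counter({'convertable': 0, 'non-convertable': 0}))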
Example #4

# coding=utf-8
from __future__ import unicode_literals, print_function
from collections import defaultdict, Counter

from clldutils.misc import slug
from six.moves.urllib.request import urlopen
from lingpy.sequence.sound_classes import clean_string, tokens2class
import lingpy as lp
from pyclpa.base import get_clpa
from pybtex import database

clpa = get_clpa()


def getEvoBibAsSource(key):
    """Download a BibTeX record from EvoBib and parse it."""
    raw = urlopen(
        "http://bibliography.lingpy.org/raw.php?key=" + key).read()
    return database.parse_string(raw.decode('utf-8'), bib_format='bibtex')
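
A short usage sketch for the helper above; the key is a placeholder, and the call performs a live HTTP request against bibliography.lingpy.org:

# 'Author2020' is a placeholder for a real EvoBib key; requires network access.
src = getEvoBibAsSource('Author2020')
print(list(src.entries))  # BibTeX entry keys in the fetched record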


def wordlist2cognates(wordlist, dataset, source, expert='expert', ref='cogid'):
    """Turn a wordlist into a cognate-set list, using the CLDF parameters."""
    return [
        [
            wordlist[k, 'lid'],
            dataset.name,
            wordlist[k, 'ipa'],
            '{0}-{1}'.format(slug(wordlist[k, 'concept']), wordlist[k, ref]),
            '', expert, source, '', '', '',
        ]
        for k in wordlist
    ]
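
Finally, a usage sketch for wordlist2cognates. Here wl and dataset are placeholders: a lingpy Wordlist carrying 'lid', 'ipa', 'concept', and 'cogid' columns, and any dataset object exposing a .name attribute; the source tag is made up.

# Placeholders: `wl` and `dataset` are defined elsewhere as described above.
rows = wordlist2cognates(wl, dataset, source='SomeSource2020')
for row in rows[:3]:
    print(row)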