    def cmd_makecldf(self, args):
        wl = lingpy.Wordlist(str(self.raw_dir / 'D_subset-300-22.tsv'))
        args.writer.add_sources()
        args.writer.add_languages(id_factory='Name')
        source_lookup = {
            language['ID']: language['Source']
            for language in self.languages
        }
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.number + '_' + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss)
            concepts[concept.english] = idx
        for k in pb(wl, desc='wl-to-cldf', total=len(wl)):
            if wl[k, 'tokens']:
                args.writer.add_form(
                    Language_ID=wl[k, 'doculect'],
                    Parameter_ID=concepts[wl[k, 'concept']],
                    Value=wl[k, 'ipa'].strip() or ''.join(wl[k, 'tokens']),
                    Form=wl[k, 'ipa'].strip().replace(' ', '_')
                    or ''.join(wl[k, 'tokens']),
                    Source=[source_lookup[wl[k, 'doculect']]],
                    Comment=wl[k, 'note'])
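
These snippets are methods and helper functions lifted from pylexibank dataset modules, so their imports are implicit. A minimal header covering the names they reference might look like this (a sketch: the pb alias is inferred from the calls below, and the CldfDataset/getEvoBibAsSource names in the older cmd_install-style snippets come from earlier pylexibank releases):

import lingpy
import lingpy as lp                        # some snippets use the short alias
import pylexibank
from pylexibank import progressbar
from pylexibank import progressbar as pb   # for the snippets that call pb(...)
from clldutils.misc import slug            # turns glosses into ID-safe slugs
from pyconcepticon import Concepticon      # used by the old-clics snippet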
    def cmd_makecldf(self, args):

        concepts = {}
        wl = lp.Wordlist(
            self.raw_dir.joinpath('D_test_Bahnaric-200-24.tsv').as_posix())

        for concept in self.conceptlists[0].concepts.values():
            idx = '{0}_{1}'.format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=idx,
                Number=concept.number,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.english] = idx
        #concepts['burn'] = concepts['burn tr.']
        #concepts['claw'] = concepts['claw (nail)']
        #concepts['come (V)'] = concepts['come']
        #concepts['die (V)'] = concepts['die']
        #concepts['drink (V)'] = concepts['drink']
        #concepts['eat (V)'] = concepts['eat']
        #concepts['fat'] = concepts['fat n.']
        #concepts['fly'] = concepts['fly v.']
        #concepts['give (V)'] = concepts['give']
        #concepts['hear (V)'] = concepts['hear']
        #concepts['kill (V)'] = concepts['kill']
        #concepts['know (V)'] = concepts['know']
        #concepts['lie (V)'] = concepts['lie']
        #concepts['rain (V)'] = concepts['rain']
        #concepts['say (V)'] = concepts['say']
        #concepts['see (V)'] = concepts['see']
        #concepts['sit (V)'] = concepts['sit']
        #concepts['sleep (V)'] = concepts['sleep']
        #concepts['stand (V)'] = concepts['stand']
        #concepts['swim (V)'] = concepts['swim']
        #concepts['walk (V)'] = concepts['walk(go)']

        languages = args.writer.add_languages(
            lookup_factory="Name", id_factory=lambda x: slug(x['Name']))

        args.writer.add_sources()
        visited = set()
        for idx, concept in wl.iter_rows('concept'):
            if wl[idx, 'concept'] in concepts:
                lexeme = args.writer.add_form(
                    Language_ID=languages[wl[idx, 'language']],
                    Parameter_ID=concepts[wl[idx, 'concept']],
                    Value=wl[idx, 'ipa'],
                    Form='.'.join(wl[idx, 'tokens']),
                    Source='Sidwell2015',
                    Loan=wl[idx, 'cogid'] < 0)
                args.writer.add_cognate(lexeme=lexeme,
                                        Cognateset_ID=wl[idx, 'cogid'],
                                        Cognate_Detection_Method='expert',
                                        Source=['Sidwell2015'])
            elif concept not in visited:
                visited.add(concept)
                print(concept)
    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.
        """
        wl = lingpy.Wordlist(self.raw_dir.joinpath("GEM-CNL.csv").as_posix())
        concepts = args.writer.add_concepts(
            id_factory=lambda x: x.id.split("-")[-1] + "_" + slug(x.english), lookup_factory="Name"
        )
        for concept in self.conceptlists[0].concepts.values():
            for cis in concept.attributes["lexibank_gloss"]:
                if cis not in concepts:
                    concepts[cis] = concepts[concept.english]

        languages = args.writer.add_languages(lookup_factory="STEDT_Name")
        args.writer.add_sources()

        for idx, language, concept, value, pos in wl.iter_rows(
            "doculect", "concept", "reflex", "gfn"
        ):
            # Fix for 251479
            if concept == "top (i.e. highest point":
                concept = "top (i.e. highest point)"

            if concept not in concepts:
                args.log.warning(concept)
            else:
                args.writer.add_forms_from_value(
                    Language_ID=languages[language],
                    Parameter_ID=concepts[concept],
                    Value=value,
                    Source=["Marrison1967"],
                )
    def cmd_makecldf(self, args):

        concepts = {}
        wl = lp.Wordlist(self.raw_dir.joinpath('IDS.csv').as_posix())

        for concept in self.conceptlists[0].concepts.values():
            idx = '{0}_{1}'.format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=idx,
                Number=concept.number,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.attributes['ids_id'].replace(
                '-', '.').strip('0')] = idx

        languages = args.writer.add_languages(
            lookup_factory="Name", id_factory=lambda x: slug(x['Name']))

        args.writer.add_sources()
        for idx in wl:
            lexeme = args.writer.add_form(
                Language_ID=languages[wl[idx, 'language']],
                Parameter_ID=concepts[wl[idx, 'ids_id']],
                Value=wl[idx, 'ortho'],
                Form=wl[idx, 'ipa'].replace('#', '-'),
                Source='List2014c',
                Loan=wl[idx, 'cogid'] < 0)
            args.writer.add_cognate(lexeme=lexeme,
                                    Cognateset_ID=wl[idx, 'cogid'],
                                    Cognate_Detection_Method='expert',
                                    Source=['List2014c'])
Example #5
def cldf(dataset, concepticon, **kw):
    for dset, srckey in zip(DSETS, SOURCES):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        src = getEvoBibAsSource(srckey)

        with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Language_iso',
                          'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                          'Segments', 'CLPA', 'Cognacy', 'Partial_cognacy'),
                         dataset,
                         subset=dset.split('-')[0]) as ds:

            ds.sources.add(src)
            for k in wl:
                ds.add_row([
                    '{0}-{1}'.format(srckey, k),
                    wl[k, 'glottolog'],
                    wl[k, 'doculect'],
                    '',
                    wl[k, 'concepticon_id'],
                    wl[k, 'concept'],
                    wl[k, 'ipa'],
                    srckey,
                    ' '.join(wl[k, 'tokens']),
                    ' '.join(wl[k, 'clpa']),
                    wl[k, 'cogid'],
                    ' '.join([str(x) for x in wl[k, 'partialids']]),
                ])
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                idf = '-'.join([slug(concept), '%s' % wl[k, 'cogid']])
                cognates += [[
                    '{0}-{1}'.format(srckey, k), ds.name, wl[k, 'ipa'], idf,
                    '', 'expert', srckey, '', '', ''
                ]]

            dataset.cognates.extend(
                iter_alignments(wl,
                                cognates,
                                method='progressive',
                                prefix=srckey + '-'))
Example #6
    def cmd_makecldf(self, args):

        wl = lingpy.Wordlist(self.raw_dir.joinpath("yi-wl.tsv").as_posix())
        args.writer.add_sources()

        languages = args.writer.add_languages(lookup_factory="Name")

        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
                Chinese_Gloss=concept.attributes["chinese"],
            )
            concepts[concept.english] = idx
        concepts["Daughter-in-law"] = concepts["daughter-in-law"]

        for idx in pylexibank.progressbar(wl, desc="cldfify", total=len(wl)):
            args.writer.add_form_with_segments(
                Language_ID=languages[wl[idx, "doculect"]],
                Parameter_ID=concepts[wl[idx, "concept"]],
                Value=wl[idx, "value"],
                Form=wl[idx, "form"],
                Segments=wl[idx, "tokens"],
                Source=["Castro2010"],
            )
Example #7
    def add_doculect(self, doculect, values):
        """
        Add a new column (like a new doculect or the like) to the data.

        NOTES
        -----
        For the moment, we assume that we are dealing with doculects and
        concepts, which may be changed later on...
        """
        # get an index for all the values in values
        converter = {
            value: {self[k, 'concept']: self[k, value]
                    for k in self}
            for value in values
        }

        # now, create the wordlist
        d = {0: ['doculect', 'concept'] + values}
        for idx, k in enumerate(self.concepts, start=1):
            d[idx] = [doculect, k] + [converter[value][k] for value in values]

        wl = lingpy.Wordlist(d)
        self.add_data(wl)
        print('Successfully added new doculect template for {0}'.format(
            doculect))
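
add_doculect assembles its rows into the plain dict format that lingpy.Wordlist accepts. A minimal standalone sketch of that format (illustrative data, not taken from any dataset above):

import lingpy

# Key 0 carries the header row; every positive key holds one word entry.
d = {
    0: ['doculect', 'concept', 'ipa'],
    1: ['German', 'hand', 'hant'],
    2: ['German', 'foot', 'fuːs'],
}
wl = lingpy.Wordlist(d)
print(len(wl), wl.cols, wl.rows)  # expect: 2 ['German'] ['foot', 'hand']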
    def cmd_makecldf(self, args):

        args.writer.add_sources()
        wl = lingpy.Wordlist(self.dir.joinpath("raw", "wordlist.tsv").as_posix())
        concepts = {}
        def strip_concept(x):
            return x.replace(" ", "").replace("*", "")

        for concept in self.conceptlists[0].concepts.values():
            args.writer.add_concept(
                ID=concept.id,
                Name=concept.english,
                Chinese_Gloss=strip_concept(concept.attributes["chinese"]),
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[strip_concept(concept.attributes["chinese"])] = concept.id
        langs = {k["ChineseName"]: k["ID"] for k in self.languages}
        args.writer.add_languages()

        for idx in pylexibank.progressbar(wl, desc="cldfify"):

            args.writer.add_form_with_segments(
                Language_ID=langs[wl[idx, "doculect"]],
                Parameter_ID=concepts[strip_concept(wl[idx, "concept"])],
                Value=wl[idx, "value"],
                Form=wl[idx, "form"],
                Segments=wl[idx, "tokens"],
                Source=["Castro2010a"],
            )
Example #9
    def cmd_install(self, **kw):

        wl = lingpy.Wordlist(self.raw.posix('Bruzzi_Granadillo.txt'))

        with self.cldf as ds:
            ds.add_sources(*self.raw.read_bib())
            for k in pb(wl, desc='wl-to-cldf'):
                ds.add_language(ID=slug(wl[k, 'doculect']),
                                Name=wl[k, 'doculect'],
                                Glottocode='bani1255')

                ds.add_concept(ID=slug(wl[k, 'concept']),
                               Name=wl[k, 'concept'],
                               Concepticon_ID=wl[k, 'concepticon_id'] or '',
                               Portuguese_Gloss=wl[k, 'concept_portuguese'])

                for row in ds.add_lexemes(Language_ID=slug(wl[k, 'doculect']),
                                          Parameter_ID=slug(wl[k, 'concept']),
                                          Value=wl[k, 'entrj_in_source'],
                                          Form=wl[k, 'ipa'],
                                          Segments=wl[k, 'tokens'],
                                          Source=[
                                              'granadillo_ethnographic_2006',
                                              'silva_discoteca_1961'
                                          ]):
                    cid = slug(wl[k, 'concept'] + '-' +
                               '{0}'.format(wl[k, 'cogid']))
                    ds.add_cognate(lexeme=row,
                                   Cognateset_ID=cid,
                                   Source=['Chacon2018'],
                                   Alignment=wl[k, 'alignment'],
                                   Alignment_Source='Chacon2018')
Example #10
def fetch(
    dataset,
    remote_dbase=None,
    concepts=None,
    languages=None,
    columns=None,
    to_lingpy=None,
    transform=None,
    base_url="http://lingulist.de/edictor",
):
    url = base_url + "/triples/get_data.py?file=" + dataset
    # default to the dataset's own sqlite file, but pass the name either way
    # (as written originally, an explicitly supplied remote_dbase was ignored)
    if not remote_dbase:
        remote_dbase = dataset + ".sqlite3"
    url += "&remote_dbase=" + remote_dbase
    if concepts:
        url += "&concepts=" + "|".join(
            [urllib.parse.quote(c) for c in concepts])
    if languages:
        url += "&doculects=" + "|".join(
            [urllib.parse.quote(c) for c in languages])
    if columns:
        url += "&columns=" + "|".join(columns)

    data = urlopen(url).read()
    if to_lingpy:
        with tempfile.NamedTemporaryFile() as tf:
            tf.write(data)
            tf.flush()
            return transform(tf.name) if transform else lingpy.Wordlist(
                tf.name)
    return data.decode("utf-8")
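
A hypothetical call against an EDICTOR installation (the dataset name and filter values below are placeholders for illustration, not a real deployment):

# Fetch two concepts for two doculects and parse the TSV reply with lingpy.
wl = fetch(
    'germanic',                       # made-up remote dataset name
    concepts=['hand', 'foot'],
    languages=['German', 'English'],
    to_lingpy=True,
)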
    def cmd_makecldf(self, args):

        concepts = {}
        wl = lp.Wordlist(self.raw_dir.joinpath("OUG.csv").as_posix())

        for concept in self.conceptlists[0].concepts.values():
            idx = "{0}_{1}".format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=idx,
                Number=concept.number,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.english] = idx
        concepts["bite (V)"] = concepts["bite"]
        concepts["burn (V)"] = concepts["burn tr."]
        concepts["claw"] = concepts["claw (nail)"]
        concepts["come (V)"] = concepts["come"]
        concepts["die (V)"] = concepts["die"]
        concepts["drink (V)"] = concepts["drink"]
        concepts["eat (V)"] = concepts["eat"]
        concepts["fat"] = concepts["fat n."]
        concepts["fly (V)"] = concepts["fly v."]
        concepts["give (V)"] = concepts["give"]
        concepts["hear (V)"] = concepts["hear"]
        concepts["kill (V)"] = concepts["kill"]
        concepts["know (V)"] = concepts["know"]
        concepts["lie (V)"] = concepts["lie"]
        concepts["rain (V)"] = concepts["rain"]
        concepts["say (V)"] = concepts["say"]
        concepts["see (V)"] = concepts["see"]
        concepts["sit (V)"] = concepts["sit"]
        concepts["sleep (V)"] = concepts["sleep"]
        concepts["stand (V)"] = concepts["stand"]
        concepts["swim (V)"] = concepts["swim"]
        concepts["warm (hot)"] = concepts["warm"]
        concepts["walk (go)"] = concepts["walk(go)"]
        languages = args.writer.add_languages(
            lookup_factory="Name", id_factory=lambda x: slug(x["Name"]))

        args.writer.add_sources()
        for idx in wl:
            lexeme = args.writer.add_form(
                Language_ID=languages[wl[idx, "language"]],
                Parameter_ID=concepts[wl[idx, "concept"]],
                Value=wl[idx, "ipa"],
                Form=".".join(wl[idx, "tokens"]).replace("#", "-"),
                # Segments=wl[idx, 'tokens'],
                Source="Zhivlov2011",
                Loan=wl[idx, "cogid"] < 0,
            )
            args.writer.add_cognate(
                lexeme=lexeme,
                Cognateset_ID=wl[idx, "cogid"],
                Cognate_Detection_Method="expert",
                Source=["Zhivlov2011"],
            )
    def cmd_makecldf(self, args):
        # add sources
        args.writer.add_sources()

        # add languages
        languages = args.writer.add_languages(lookup_factory="Name")

        # add concepts
        concepts = args.writer.add_concepts(
            id_factory=lambda cpt: "%s_%s" %
            (cpt.id.split("_")[0], slug(cpt.english)),
            lookup_factory="Name",
        )

        # Hard-coded fixes to segment errors in raw source
        segments = {
            "áː": "áː/aː",
            "âː": "âː/aː",
            "aʰ": "a h",
            "ɐ̃ʰ": "ɐ̃ h",
            "í": "í/i",
            "íː": "íː/iː",
            "iʰ": "i h",
            "i̥": "i̥/i",
            "ka": "k a",
            "kw": "kʷ",  # the single instance is a labialized velar
            "nⁱ": "n i",
            "óː": "óː/oː",
            "teː": "t eː",
            "ú": "u/u",
            '#': '+'
        }

        # read wordlist with lingpy
        wl_file = self.raw_dir / "Bruzzi_Granadillo.txt"
        wl = lingpy.Wordlist(wl_file.as_posix())

        # iterate over wordlist
        for idx in progressbar(wl, desc="makecldf"):
            # write lexemes
            lex = args.writer.add_form_with_segments(
                Language_ID=languages[wl[idx, "doculect"]],
                Parameter_ID=concepts[wl[idx, "concept"]],
                Value=wl[idx, "entrj_in_source"],
                Form=wl[idx, "ipa"],
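                # join + split so multi-segment replacements like "a h" re-tokenize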
                Segments=" ".join(
                    [segments.get(x, x) for x in wl[idx, "tokens"]]).split(),
                Source=[
                    "granadillo_ethnographic_2006", "silva_discoteca_1961"
                ],
            )

            args.writer.add_cognate(
                lexeme=lex,
                Cognateset_ID=wl[idx, "cogid"],
                Source=["Chacon2019"],
            )
Example #13
    def cmd_makecldf(self, args):
        wl = lingpy.Wordlist(self.raw_dir.joinpath("suansu.tsv").as_posix())

        converter = {
            "ll": "lː",
            "ddʑ": "dʑː",
            "mm": "mː",
            "nn": "nː",
            "ss": "sː",
            "tts": "tsː",
            "tʂ": "ʈʂː",
            "bb": "bː",
            "dd": "dː",
            "pp": "pː",
            "tt": "tː",
            "ttʰ": "tʰː",
            "ɹɹ": "ɹː",
            "ff": "fː",
            "je": "j e",
            "oj": "oi",
            "ph": "pʰ",
            "th": "tʰ",
            "ttɕ": "tɕː",
            "ttʃ": "tʃː",
            "ma": "m a",
            "ē": "e",
            "ê": "e",
            "ʈʈʂ": "ʈʂː",
            "I": "ɪ",
            "ʷ": "w",
        }

        args.writer.add_sources()
        concepts = {}
        args.writer.add_languages()

        for k in progressbar(wl, desc="wl-to-cldf", total=len(wl)):
            if wl[k, "concepticon_id"] not in concepts:
                cid = "{0}_{1}".format(wl[k, "concepticon_id"],
                                       slug(wl[k, "concept"]))
                concepts[wl[k, "concept"]] = cid
                args.writer.add_concept(
                    ID=cid,
                    Name=wl[k, "concept"],
                    Concepticon_ID=wl[k, "concepticon_id"],
                    Concepticon_Gloss=wl[k, "concepticon_gloss"],
                )
            args.writer.add_form_with_segments(
                Language_ID="Suansu",
                Parameter_ID=concepts[wl[k, "concept"]],
                Value="".join(wl[k, "tokens"]),
                Form="".join(wl[k, "tokens"]),
                Segments=" ".join(
                    [converter.get(x, x) for x in wl[k, "tokens"]]).split(),
                Source=["Ivani2019"],
            )
Example #14
    def cmd_install(self, **kw):

        # sources are poorly annotated, so we need to correct manually
        src = {
            "H&R92": "huber_vocabulario_1991",
            "Klumpp95": "",
            "H&R 1992": "huber_vocabulario_1991",
            "None": "",
            "Melendez 2011": "melendez_lozano_diccionario_2011",
            "Epps": "",
            "Schauer2005": "",
            "Allin 1979": "allin_vocabulario_1979",
            "Aikhenvald": "",
            "dp91": "",
            "Aikhenvald 2012": "aikhenvald_dicionario_2012",
            "Aikenvald2001": "aihenvald_dicionario_2001",
            "Oliveira 93": "cunha_de_oliveira_uma_1993",
            "Ramirez2001": "ramirez_dicionario_2001",
            "Ramirez 2001": "ramirez_dicionario_2001",
            "Schauer 2005": "schauer_diccionario_2005",
            "Aikhenvald 2001": "aikhenvald_dicionario_2001"
        }

        wl = lingpy.Wordlist(
            self.raw.posix('arawakan_swadesh_100_edictor.tsv'))
        with self.cldf as ds:
            ds.add_sources(*self.raw.read_bib())
            for l in self.languages:
                ds.add_language(ID=slug(l['Name']),
                                Name=l['Name'],
                                Glottocode=l['Glottocode'])
            for c in self.concepts:
                ds.add_concept(ID=slug(c['ENGLISH']),
                               Name=c['ENGLISH'],
                               Concepticon_ID=c['CONCEPTICON_ID'],
                               Portuguese_Gloss=c['PORTUGUESE'])

            for k in pb(wl, desc='wl-to-cldf'):
                if wl[k, 'value']:
                    for row in ds.add_lexemes(
                            Language_ID=slug(wl[k, 'doculect']),
                            Parameter_ID=slug(wl[k, 'concept']),
                            Value=wl[k, 'value'],
                            Form=wl[k, 'form'],
                            Segments=wl[k, 'segments'],
                            Source=src.get(wl[k, 'source'], '')):

                        cid = slug(wl[k, 'concept'] + '-' +
                                   '{0}'.format(wl[k, 'cogid']))
                        ds.add_cognate(lexeme=row,
                                       Cognateset_ID=cid,
                                       Source=['Chacon2017'],
                                       Alignment=wl[k, 'alignment'],
                                       Alignment_Source='Chacon2017')
def cldf(dataset, concepticon, **kw):
    gloss2con = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}
    lang2glot = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}

    for dset, srckey in zip(DSETS, sources):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        if 'tokens' not in wl.header:
            wl.add_entries('tokens',
                           'ipa',
                           lp.ipa2tokens,
                           merge_vowels=False,
                           expand_nasals=True)
        src = getEvoBibAsSource(srckey)

        with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Language_iso',
                          'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                          'Segments', 'Cognacy', 'Loan'),
                         dataset,
                         subset=dset.split('.')[0]) as ds:
            ds.sources.add(src)
            errors = []
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                if '(V)' in concept:
                    concept = concept[:-4]
                concept = correct_concepts.get(concept, concept)
                if concept not in gloss2con:
                    errors += [concept]
                doculect = correct_languages.get(wl[k, 'doculect'],
                                                 wl[k, 'doculect'])
                loan = wl[k, 'cogid'] < 0
                cogid = abs(wl[k, 'cogid'])

                wid = '{0}-{1}'.format(dset.split('.')[0], k)
                ds.add_row([
                    wid,
                    lang2glot[doculect],
                    wl[k, 'doculect'],
                    '',
                    gloss2con.get(concept, ''),  # use the corrected concept
                    wl[k, 'concept'],
                    wl[k, 'ipa'],
                    srckey,
                    ' '.join(wl[k, 'tokens'] or ['']),
                    cogid,
                    wl[k, 'loan'],
                ])

                cognates.append([
                    wid, ds.name, wl[k, 'ipa'], cogid,
                    'borrowed' if loan else '', 'expert', srckey, '', '', ''
                ])

            dataset.cognates.extend(
                iter_alignments(lp.Alignments(wl), cognates, method='library'))
            for er in sorted(set(errors)):
                print(er, dset)
    def cmd_makecldf(self, args):
        # sources are poorly annotated, so we need to correct manually
        src = {
            "H&R92": "huber_vocabulario_1992",
            "H&R 1992": "huber_vocabulario_1992",
            "Melendez 2011": "melendez_lozano_diccionario_2011",
            "Allin 1979": "allin_vocabulario_1979",
            "Aikhenvald 2012": "aikhenvald_dicionario_2012",
            "Aikenvald2001": "aihenvald_dicionario_2001",
            "Oliveira 93": "cunha_de_oliveira_uma_1993",
            "Ramirez2001": "ramirez_dicionario_2001",
            "Ramirez 2001": "ramirez_dicionario_2001",
            "Schauer 2005": "schauer_diccionario_2005",
            "Aikhenvald 2001": "aikhenvald_dicionario_2001",
        }

        # add source
        args.writer.add_sources()

        # add languages
        languages = args.writer.add_languages(lookup_factory="Name")

        # add concepts
        concepts = args.writer.add_concepts(
            id_factory=lambda c: "%s_%s" %
            (c.id.split("-")[-1], slug(c.english)),
            lookup_factory="Name",
        )

        # read raw wordlist add lexemes
        wl_file = self.raw_dir / "arawakan_swadesh_100_edictor.tsv"
        wl = lingpy.Wordlist(wl_file.as_posix())

        for idx in progressbar(wl, desc="makecldf"):
            if wl[idx, "value"]:
                if wl[idx, 'segments'][0] == '_':
                    wl[idx, 'segments'] = wl[idx, 'segments'][1:]
                lex = args.writer.add_form_with_segments(
                    Language_ID=languages[wl[idx, "doculect"]],
                    Parameter_ID=concepts[wl[idx, "concept"]],
                    Value=wl[idx, "value"],
                    Form=wl[idx, "form"],
                    Segments=wl[idx, "segments"],
                    Source=src.get(wl[idx, "source"], "Chacon2017"),
                )

                # add cognate
                args.writer.add_cognate(lexeme=lex,
                                        Cognateset_ID=wl[idx, "cogid"],
                                        Source=["Chacon2017"])
Example #17
    def cmd_makecldf(self, args):

        args.writer.add_sources()
        language_lookup = args.writer.add_languages(lookup_factory="Name")
        concept_lookup = args.writer.add_concepts(
            id_factory=lambda x: x.id.split("-")[-1] + "_" + slug(x.english),
            lookup_factory="Name")
        wl = lingpy.Wordlist(self.raw_dir.joinpath("HSH-SCL.csv").as_posix())
        for idx in pylexibank.progressbar(wl):
            args.writer.add_forms_from_value(
                Language_ID=language_lookup[wl[idx, "language"]],
                Value=wl[idx, "reflex"],
                Source=["SoHartmann1988"],
                Parameter_ID=concept_lookup[wl[idx, "concept"]],
            )
Example #18
def prepare(dataset):

    data = lingpy.csv2list(dataset.get_path('raw', 'data-starostin.tsv'),
                           strip_lines=False)
    header = [h.lower() for h in data[0]]
    out = {}
    idx = 1
    for line in data[1:]:
        char = line[0]
        coc = line[2]
        bijiang = line[1]
        note = line[3]
        dali = line[4]
        doc_url = line[5]
        lhc = line[7]
        gloss = line[8]
        jianchuan = line[12]
        kg = line[14]
        mch = line[16]
        pinyin = line[18]
        rad = line[20]
        shijing = line[21]

        if coc.strip():
            out[idx] = [
                char, pinyin, 'Old_Chinese', 'Classical Old Chinese', coc, rad,
                kg[:4], kg, gloss
            ]
            idx += 1
        if lhc.strip():
            out[idx] = [
                char, pinyin, 'Late_Han_Chinese', 'Eastern Han Chinese', lhc,
                rad, kg[:4], kg, gloss
            ]
            idx += 1
        if mch.strip():
            out[idx] = [
                char, pinyin, 'Middle_Chinese', 'Middle Chinese', mch, rad,
                kg[:4], kg, gloss
            ]
            idx += 1
    out[0] = [
        'character', 'pinyin', 'doculect', 'doculect_in_source', 'reading',
        'semantic_class', 'phonetic_class', 'karlgren_id', 'gloss'
    ]
    dataset.write_wordlist(lingpy.Wordlist(out, row='character'), 'characters')
Example #19
    def cmd_makecldf(self, args):
        wl = lingpy.Wordlist(self.raw_dir.joinpath("chinese.tsv").as_posix())
        maxcogid = 0

        args.writer.add_sources()
        args.writer.add_languages(id_factory=lambda l: l["Name"])
        args.writer.add_concepts(
            id_factory=lambda c: slug(c.label, lowercase=False))

        # map (concept, proto-form) pairs to cognate set IDs
        p2c = {}

        for k in wl:
            for row in args.writer.add_lexemes(
                    Language_ID=wl[k, "doculect"],
                    Parameter_ID=slug(wl[k, "concept"], lowercase=False),
                    Value=wl[k, "ipa"],
                    Source="Hamed2006",
                    Cognacy=wl[k, "COGID"],
            ):
                args.writer.add_cognate(lexeme=row,
                                        Cognateset_ID=wl[k, "cogid"],
                                        Source=["Hamed2006", "List2015"])
            maxcogid = max([maxcogid, int(wl[k, "cogid"])])
            p2c[wl[k, "concept"], wl[k, "proto"]] = wl[k, "cogid"]
        idx = max([k for k in wl]) + 1
        for line in lingpy.csv2list(
                self.raw_dir.joinpath("old_chinese.csv").as_posix()):
            for val in line[1].split(", "):
                cogid = p2c.get((line[0], val))
                if not cogid:
                    maxcogid += 1
                    cogid = p2c[line[0], val] = maxcogid
                for row in args.writer.add_lexemes(
                        Language_ID="OldChinese",
                        Parameter_ID=slug(line[0], lowercase=False),
                        Value=val,
                        Source="Hamed2006",
                        Cognacy=cogid,
                ):
                    args.writer.add_cognate(lexeme=row,
                                            Cognateset_ID=cogid,
                                            Source=["Hamed2006", "List2015"])
                idx += 1
    def cmd_makecldf(self, args):
        # Read raw data
        wl = lingpy.Wordlist(self.raw_dir.joinpath("YN-RGLD.csv").as_posix())

        args.writer.add_sources()
        concept_lookup = args.writer.add_concepts(
            id_factory=lambda x: x.id.split("-")[-1] + "_" + slug(x.english),
            lookup_factory="SrcId")
        language_lookup = args.writer.add_languages(lookup_factory="Name")
        # add lexemes
        for idx, language, concept, value in pylexibank.progressbar(
                wl.iter_rows("doculect", "srcid", "reflex"), desc="make-cldf"):
            if language in language_lookup and concept in concept_lookup:
                args.writer.add_forms_from_value(
                    Language_ID=language_lookup[language],
                    Parameter_ID=concept_lookup[concept],
                    Value=value,
                    Source=["Nagano2013"],
                )
    def cmd_makecldf(self, args):
        wl = lingpy.Wordlist(str(self.raw_dir / "sino-tibetan-raw.tsv"))
        args.writer.add_sources()
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                TBL_ID=concept.attributes["huang_1992_1820"],
                Name=concept.english,
                Coverage=concept.attributes["coverage"],
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.english] = idx
        languages, sources = {}, {}
        for language in self.languages:
            args.writer.add_language(**language)
            languages[language["Name_in_Source"]] = language["ID"]
            sources[language["Name_in_Source"]] = language["Source"]
        for idx in pylexibank.progressbar(wl, desc="cldfify"):
            if wl[idx, "tokens"] and " ".join(wl[idx, "tokens"]).strip("+"):
                row = args.writer.add_form(
                    Language_ID=languages[wl[idx, "doculect"]],
                    Local_ID=idx,
                    Parameter_ID=concepts[wl[idx, "concept"]],
                    Value=wl[idx, "entry_in_source"].strip()
                    or "".join(wl[idx, "tokens"]) or wl[idx, "ipa"],
                    Form=".".join(wl[idx, "tokens"]),
                    Source=sources[wl[idx, "doculect"]].split(","),
                    Comment=wl[idx, "note"],
                    Cognacy=wl[idx, "cogid"],
                    Loan=bool(wl[idx, "borrowing"].strip()),
                )

                args.writer.add_cognate(
                    lexeme=row,
                    Cognateset_ID="{0}-{1}".format(wl[idx, "cogid"],
                                                   slug(wl[idx, "concept"])),
                    Source="Sagart2018",
                    Alignment="",
                    Alignment_Source="",
                )
def cldf(dataset, concepticon, **kw):
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())
    gcode = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    src = getEvoBibAsSource(SOURCE)

    with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Language_iso',
                      'Parameter_ID', 'Parameter_name',
                      'Parameter_Chinese_name', 'Value', 'Segments', 'Source'),
                     dataset) as ds:
        ds.sources.add(src)

        for k in wl:
            # skip empty values and dash placeholders ('-', '--', '---')
            if wl[k, 'value'].strip() and wl[k, 'value'] not in ('-', '--', '---'):
                ds.add_row([
                    wl[k, 'lid'],
                    gcode[wl[k, 'doculect']],
                    wl[k, 'doculect'],
                    '',
                    wl[k, 'concepticon_id'],
                    wl[k, 'concept'],
                    wl[k, 'chinese'],
                    wl[k, 'value'],
                    clean_string(wl[k, 'value'])[0],
                    SOURCE,
                ])
Example #23
    def cmd_makecldf(self, args):
        wl = lingpy.Wordlist(
            self.raw_dir.joinpath('D_old-clics.tsv').as_posix())
        args.log.info('loaded wordlist')

        src = {
            'wold': 'Wold2009',
            'ids': 'Key2007',
            'logos': 'Logos2008',
            'Språkbanken': 'Saxena2013'
        }
        args.writer.add_sources()

        concepts = set()
        languages = set()
        concepticon = {
            c.id: c.gloss
            for c in Concepticon().conceptsets.values()
        }
        args.log.info('added concepticon')
        for k in progressbar(wl, desc='wl-to-cldf'):
            if wl[k, 'value']:
                if wl[k, 'doculect'] not in languages:
                    args.writer.add_language(ID=slug(wl[k, 'doculect'],
                                                     lowercase=False),
                                             Name=wl[k, 'doculect'],
                                             Glottocode=wl[k, 'glottolog'])
                    languages.add(wl[k, 'doculect'])
                if wl[k, 'concept'] not in concepts:
                    args.writer.add_concept(
                        ID=slug(wl[k, 'concept'], lowercase=False),
                        Name=wl[k, 'concept'],
                        Concepticon_ID=wl[k, 'concepticon_id'],
                        Concepticon_Gloss=concepticon.get(
                            wl[k, 'concepticon_id'], ''))
                    concepts.add(wl[k, 'concept'])
                args.writer.add_lexemes(Language_ID=slug(wl[k, 'doculect'],
                                                         lowercase=False),
                                        Parameter_ID=slug(wl[k, 'concept'],
                                                          lowercase=False),
                                        Value=wl[k, 'value'],
                                        Source=src.get(wl[k, 'source'], ''))
Example #24
    def cmd_makecldf(self, args):

        concepts = {}
        wl = lp.Wordlist(self.raw_dir.joinpath('SIN.csv').as_posix(),
                         col='languages')

        for concept in self.conceptlists[0].concepts.values():
            idx = '{0}_{1}'.format(concept.number, slug(concept.gloss))
            args.writer.add_concept(
                ID=idx,
                Number=concept.number,
                Name=concept.gloss,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.gloss] = idx
        concepts['thunder'] = concepts['thunder (verb)']
        concepts['lightning'] = concepts['flash (verb)']
        concepts['soja sauce'] = concepts['soya sauce']
        concepts['light'] = concepts['watery']
        concepts['two pairs'] = concepts['two ounces']

        languages = args.writer.add_languages(
            lookup_factory="ID", id_factory=lambda x: x['ID'])

        args.writer.add_sources()
        for idx in wl:
            lexeme = args.writer.add_form(
                Language_ID=languages[wl[idx, 'languages']],
                Parameter_ID=concepts[wl[idx, 'concept']],
                Value=wl[idx, 'ortho'],
                Form=wl[idx, 'ipa'].replace('#', '-'),
                Source='Hou2004',
                Loan=wl[idx, 'cogid'] < 0,
            )
            args.writer.add_cognate(
                lexeme=lexeme,
                Cognateset_ID=wl[idx, 'cogid'],
                Cognate_Detection_Method='expert',
                Source=['List2014d'],
            )
Example #25
    def cmd_makecldf(self, args):
        wl = lingpy.Wordlist(str(self.raw_dir / 'signalphabets.tsv'))

        concepts, sources = {}, {}
        for i, c in enumerate(wl.rows):
            args.writer.add_concept(
                ID=str(i + 1),
                Name=c,
            )
            concepts[c] = str(i + 1)
        for language in self.languages:
            args.writer.add_language(
                ID=language['Name_in_Database'],
                Name=language['Name'],
                Latitude=language['Latitude'],
                Longitude=language['Longitude'],
                Glottocode=language['Glottolog'],
                SubGroup=language['SubGroup'],
            )
            sources[language['Name_in_Database']] = language['Source']
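        # the raw wordlist spells this doculect 'Ukranian_SL'; keep that key
        # for lookups and map it to the corrected language ID below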
        sources['Ukranian_SL'] = 'Lydell2018'
        languages = {language: language for language in sources}
        languages['Ukranian_SL'] = 'Ukrainian_SL'

        args.writer.add_sources(
            *[x for x in self.raw_dir.read_bib() if x.id in sources])

        for i, c, l, h1, h2, t, cid in progressbar(
                wl.iter_rows('concept', 'doculect', 'handshape_1',
                             'handshape_2', 'tokens', 'cogid'),
                desc='makecldf'):
            row = args.writer.add_form(Value=h1 + ' ' + h2,
                                       Language_ID=languages[l],
                                       Parameter_ID=concepts[c],
                                       Form=' '.join(t),
                                       Source=sources[l])
            args.writer.add_cognate(
                lexeme=row,
                Cognateset_ID=cid,
            )
Example #26
    def cmd_makecldf(self, args):
        wl = lingpy.Wordlist((self.raw_dir / "D_old-clics.tsv").as_posix())
        src = {"logos": "Logos2008"}
        args.writer.add_sources(*self.raw_dir.read_bib())
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")

        for k in progressbar(wl):
            if wl[k, "value"]:
                args.writer.add_language(
                    ID=slug(wl[k, "doculect"], lowercase=False),
                    Name=wl[k, "doculect"],
                    Glottocode=wl[k, "glottolog"],
                )
                args.writer.add_form(
                    Language_ID=slug(wl[k, "doculect"], lowercase=False),
                    Parameter_ID=concepts[wl[k, "concept"]],
                    Value=wl[k, "value"],
                    Form=wl[k, "value"],
                    Source=src.get(wl[k, "source"], ""),
                )
    def cmd_makecldf(self, args):
        # column "counterpart_doculect" gives us the proper names of the doculects
        wl = lingpy.Wordlist((self.raw_dir / self.DSETS[0]).as_posix(),
                             col="counterpart_doculect")
        args.writer.add_sources()

        language_lookup = args.writer.add_languages(
            lookup_factory="Name_in_Source")

        concept_lookup = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.number + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            for lg in concept.attributes["lexibank_gloss"]:
                concept_lookup[lg] = idx

        rows = [(doculect, concept, value, qlcid)
                for (idx, doculect, concept, value, qlcid) in wl.iter_rows(
                    "counterpart_doculect", "concept", "counterpart", "qlcid")
                if doculect not in ["English", "Español"]]

        for doculect, concept, value, qlcid in rows:
            if value in exclude:
                continue
            args.writer.add_form(
                Language_ID=language_lookup[doculect],
                Parameter_ID=concept_lookup[concept],
                Value=value,
                Form=value,
                Source=["Huber1992"],
                Local_ID=qlcid,
            )
    def cmd_install(self, **kw):
        # column "counterpart_doculect" gives us the proper names of the doculects
        wl = lingpy.Wordlist(
            self.raw.posix('Huber_filtered_130_basic_cult_voc'))

        with self.cldf as ds:
            ds.add_sources(*self.raw.read_bib())

            for l in self.languages:
                ds.add_language(ID=slug(l['Name']),
                                Name=l['Name'],
                                Glottocode=l['Glottocode'])
            for c in self.concepts:
                ds.add_concept(ID=slug(c['GLOSS_IN_SOURCE']),
                               Name=c['ENGLISH'],
                               Concepticon_ID=c['CONCEPTICON_ID'] or '',
                               Spanish_Gloss=c['SPANISH'])
            # specify valid entries in the data
            valid_entries = [c['GLOSS_IN_SOURCE'] for c in self.concepts]

            for k in pb(wl, desc='wl-to-cldf'):
                if wl[k, 'concept'] in valid_entries:
                    for row in ds.add_lexemes(
                            Language_ID=slug(wl[k, 'doculect']),
                            Parameter_ID=slug(wl[k, 'concept']),
                            Value=wl[k, 'counterpart'],
                            Form=wl[k, 'counterpart'],
                            Segments=wl[k, 'tokens'],
                            Source='Huber1992'):
                        cid = slug(wl[k, 'concept'] + '-' +
                                   str(wl[k, 'cogid']))
                        ds.add_cognate(lexeme=row,
                                       Cognateset_ID=cid,
                                       Source=['Chacon2017'],
                                       Alignment=wl[k, 'alignment'],
                                       Alignment_Source='Chacon2017')
    def cmd_install(self, **kw):
        wl = lingpy.Wordlist(self.raw.posix('D_subset-300-22.tsv'))
        source_dict = {}
        concept_dict = {}
        sources = {source.id: source for source in self.raw.read_bib()}

        with self.cldf as ds:
            ds.add_sources(sources[SOURCE])

            for l in self.languages:
                ds.add_language(ID=l['ID'],
                                Name=l['Name'],
                                Glottocode=l['Glottocode'])
                source_dict[l['Name']] = [l['Source'], l['ID']]
                ds.add_sources(sources[l['Source']])

            for c in self.concepts:
                ds.add_concept(
                    ID=c['ID'],
                    Concepticon_ID=c['Concepticon_ID'],
                    Concepticon_Gloss=c['Concepticon_Gloss'],
                    Name=c['Gloss'],
                )
                concept_dict[c['Gloss']] = c['ID']

            for k in pb(wl, desc='wl-to-cldf', total=len(wl)):
                if wl[k, 'tokens']:
                    ds.add_lexemes(
                        Language_ID=source_dict[wl[k, 'doculect']][1],
                        Parameter_ID=concept_dict[wl[k, 'concept']],
                        Value=wl[k, 'ipa'].strip() or ''.join(wl[k, 'tokens']),
                        Form=wl[k, 'ipa'],
                        Segments=wl[k, 'tokens'],
                        Source=[source_dict[wl[k, 'doculect']][0]],
                        Comment=wl[k, 'note'])
Example #30
    def cmd_makecldf(self, args):

        concepts = {}
        wl = lp.Wordlist(self.raw_dir.joinpath('IEL.csv').as_posix())

        for concept in self.conceptlists[0].concepts.values():
            idx = '{0}_{1}'.format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=idx,
                Number=concept.number,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.english] = idx

        languages = args.writer.add_languages(
            lookup_factory="Name", id_factory=lambda x: slug(x['Name']))

        args.writer.add_sources()
        for idx in wl:
            lexeme = args.writer.add_form(
                Language_ID=languages[wl[idx, 'language']],
                Parameter_ID=concepts[wl[idx, 'concept']],
                Value=wl[idx, 'originalform'],
                Form='.'.join(wl[idx, 'tokens']).replace('#', '-'),
                # Segments=wl[idx, 'tokens'],
                Source='Dunn2012',
                Loan=wl[idx, 'cogid'] < 0,
            )
            args.writer.add_cognate(
                lexeme=lexeme,
                Cognateset_ID=wl[idx, 'cogid'],
                Cognate_Detection_Method='expert',
                Source=['Dunn2012'],
            )