def cmd_makecldf(self, args):
        """Build the CLDF dataset: sources, languages, concepts, forms."""
        # bibliographic sources and the language list need no mapping
        args.writer.add_sources()
        args.writer.add_languages()

        # map NorthEuralex gloss ids onto the generated concept ids
        nelex_to_cid = {}
        for concept in self.conceptlists[0].concepts.values():
            concept_id = "{0}_{1}".format(
                concept.id.split("-")[-1], slug(concept.english))
            args.writer.add_concept(
                ID=concept_id,
                Name=concept.english,
                NorthEuralex_Gloss=concept.attributes["nelex_id"],
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            nelex_to_cid[concept.attributes["nelex_id"]] = concept_id

        # read the raw lexeme table and write one form per row
        rows = self.raw_dir.read_csv("nelex.tsv", delimiter="\t", dicts=True)
        for row in pylexibank.progressbar(rows):
            args.writer.add_form(
                Language_ID=row["Language_ID"],
                Parameter_ID=nelex_to_cid[row["Concept_ID"]],
                Value=row["Word_Form"],
                Form=row["rawIPA"].strip().replace(" ", "_"),
                Source=["Dellert2020"],
            )
# Esempio n. 2
    def cmd_makecldf(self, args):
        """Convert the raw wide-format CSV into CLDF forms."""
        rows = self.raw_dir.read_csv('raw.csv', dicts=True)
        args.writer.add_sources()
        language_map = args.writer.add_languages(lookup_factory="Name")

        concept_map = {}
        for concept in self.conceptlists[0].concepts.values():
            cid = "{0}_{1}".format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=cid,
                Category=concept.attributes['category'],
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss)
            concept_map[concept.english] = cid

        for row in pylexibank.progressbar(rows):
            # each language column of a row holds one lexeme for the
            # concept named by the row's "English" column
            for column, lexeme in row.items():
                if column not in language_map:
                    continue
                args.writer.add_forms_from_value(
                    Language_ID=language_map[column],
                    Parameter_ID=concept_map[row["English"]],
                    Value=lexeme,
                    Category=row["Category"],
                    Source="Sawka2019",
                )
    def cmd_makecldf(self, args):
        """Build CLDF from a lingpy wordlist keyed by Chinese gloss."""
        args.writer.add_sources()
        wl = lingpy.Wordlist(self.dir.joinpath("raw", "wordlist.tsv").as_posix())

        def strip_concept(gloss):
            # raw Chinese glosses may contain spaces and asterisks
            return gloss.replace(" ", "").replace("*", "")

        chinese_to_cid = {}
        for concept in self.conceptlists[0].concepts.values():
            chinese = strip_concept(concept.attributes["chinese"])
            args.writer.add_concept(
                ID=concept.id,
                Name=concept.english,
                Chinese_Gloss=chinese,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            chinese_to_cid[chinese] = concept.id

        langs = {row["ChineseName"]: row["ID"] for row in self.languages}
        args.writer.add_languages()

        for idx in pylexibank.progressbar(wl, desc="cldfify"):
            args.writer.add_form_with_segments(
                Language_ID=langs[wl[idx, "doculect"]],
                Parameter_ID=chinese_to_cid[strip_concept(wl[idx, "concept"])],
                Value=wl[idx, "value"],
                Form=wl[idx, "form"],
                Segments=wl[idx, "tokens"],
                Source=["Castro2010a"],
            )
# Esempio n. 4
    def cmd_makecldf(self, args):
        """Build CLDF from the Yi wordlist."""
        wl = lingpy.Wordlist(self.raw_dir.joinpath("yi-wl.tsv").as_posix())
        args.writer.add_sources()
        languages = args.writer.add_languages(lookup_factory="Name")

        gloss_to_cid = {}
        for concept in self.conceptlists[0].concepts.values():
            cid = "{0}_{1}".format(
                concept.id.split("-")[-1], slug(concept.english))
            args.writer.add_concept(
                ID=cid,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
                Chinese_Gloss=concept.attributes["chinese"],
            )
            gloss_to_cid[concept.english] = cid
        # the raw data capitalizes this one gloss
        gloss_to_cid["Daughter-in-law"] = gloss_to_cid["daughter-in-law"]

        for idx in pylexibank.progressbar(wl, desc="cldfify", total=len(wl)):
            args.writer.add_form_with_segments(
                Language_ID=languages[wl[idx, "doculect"]],
                Parameter_ID=gloss_to_cid[wl[idx, "concept"]],
                Value=wl[idx, "value"],
                Form=wl[idx, "form"],
                Segments=wl[idx, "tokens"],
                Source=["Castro2010"],
            )
# Esempio n. 5
    def cmd_makecldf(self, args):
        """Build CLDF from the ZMYYC tabular data."""
        args.writer.add_sources()
        language_lookup = args.writer.add_languages(lookup_factory="Name")

        concept_lookup = {}
        for concept in self.conceptlists[0].concepts.values():
            cid = "{0}_{1}".format(
                concept.id.split("-")[-1], slug(concept.english))
            # the raw rows reference concepts by gloss or by number
            concept_lookup[concept.english] = cid
            concept_lookup[concept.number] = cid
            args.writer.add_concept(
                ID=cid,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
                Name=concept.english,
                Chinese_Gloss=concept.attributes["chinese"],
            )

        rows = self.raw_dir.read_csv("ZMYYC.csv", delimiter="\t", dicts=True)
        for entry in pylexibank.progressbar(rows):
            args.writer.add_forms_from_value(
                Language_ID=language_lookup[entry["language"]],
                Parameter_ID=concept_lookup.get(entry["srcid"].split(".")[0]),
                Local_ID=entry["rn"],
                Value=entry["reflex"],
                Source=["Sun1991"],
            )
    def cmd_makecldf(self, args):
        """Convert the raw TSV wordlist into CLDF forms and cognates.

        Each language row carries a comma-separated ``Source`` field that is
        split into per-language source lists; raw tokens are normalized via
        a small replacement table before being written as segments.
        """
        args.writer.add_sources()
        # per-language bibliography keys, lower-cased, split on commas
        sources = {}
        for language in self.languages:
            sources[language["ID"]] = [
                x.lower() for x in language["Source"].split(",")
            ]
            args.writer.add_language(**language)
        # normalization table mapping raw transcription symbols to the
        # segment inventory used in the output; "_" (morpheme boundary in
        # the raw data) becomes "+"
        segments = {
            "ž": "ʒ",
            "nˈ": "nʲ",
            "lˈ": "lʲ",
            "gˈ": "gʲ",
            "rˈ": "rʲ",
            "pˈ": "pʲ",
            "s-": "s",
            "š": "ʃ",
            "βˈ": "βʲ",
            "sˈ": "sʲ",
            "tʃ": "tɕ",
            "ʦ": "ts",
            "_": "+",
            "ch": "x",
        }
        concepts = args.writer.add_concepts(
            id_factory=lambda x: x.id.split("-")[-1] + "_" + slug(x.english),
            lookup_factory="Name")

        for row in progressbar(
                self.raw_dir.read_csv(DATAFILE, delimiter="\t", dicts=True)):
            if row["ID"].startswith("#"):
                # skip lingpy stuff
                continue

            # patch two weird/broken entries:
            if row["ID"] == "7560":
                row["TOKENS"] = "ɕ i v ɘ ʨ"

            if row["ID"] == "8367":
                row["ENTRY"] = "ʒɯl"

            # normalize the whitespace-separated token string
            segs = [segments.get(x, x) for x in row["TOKENS"].split()]

            lex = args.writer.add_form_with_segments(
                Local_ID=row["ID"],
                Language_ID=row["DOCULECT"],
                Parameter_ID=concepts.get(row["CONCEPT"]),
                Value=row["ENTRY"],
                # sometimes the FORM value is empty for some reason.
                # if so we use the parsed 'segments' field by removing spaces.
                Form=row["FORM"] if row["FORM"] else "".join(segs),
                Segments=segs,
                Source=sources.get(row["DOCULECT"]) or [""],
            )
            # NOTE(review): ALIGNMENT is iterated character by character
            # here, not split on whitespace — confirm that is intended.
            args.writer.add_cognate(
                lexeme=lex,
                Cognateset_ID=row["COGID"],
                Alignment=[segments.get(x, x) for x in row["ALIGNMENT"]],
                Root=row["ROOT"],
            )
# Esempio n. 7
    def cmd_makecldf(self, args):
        """Build CLDF from the raw CSV; one column per language."""
        rows = self.raw_dir.read_csv("raw.csv", dicts=True)
        languages = args.writer.add_languages(lookup_factory="Name")
        args.writer.add_sources()
        concepts = args.writer.add_concepts(
            id_factory=lambda c: "{0}_{1}".format(
                c.id.split("-")[-1], slug(c.english)),
            lookup_factory="Name",
        )

        for row in pylexibank.progressbar(rows):
            # columns whose header is a known language name hold lexemes
            for column, lexeme in row.items():
                if column not in languages:
                    continue
                args.writer.add_forms_from_value(
                    Language_ID=languages[column],
                    Parameter_ID=concepts[row["English gloss"]],
                    Value=lexeme,
                    AmharicGloss=row["Amharic gloss"],
                    Source="Bremer2016",
                )

        # We explicitly remove the ISO column since none of the languages in
        # this dataset have an ISO code.
        table = args.writer.cldf["LanguageTable"]
        table.tableSchema.columns = [
            col for col in table.tableSchema.columns
            if col.name != "ISO639P3code"
        ]
    def cmd_makecldf(self, args):
        """Build CLDF forms and cognates from the analyzed wordlist."""
        args.writer.add_sources()

        # concepts come from the dataset's own concept list
        gloss_to_cid = {}
        for concept in self.concepts:
            cid = '{0}_{1}'.format(concept['NUMBER'], slug(concept['ENGLISH']))
            args.writer.add_concept(
                ID=cid,
                Number=concept['NUMBER'],
                Name=concept['ENGLISH'],
                Concepticon_ID=concept["CONCEPTICON_ID"],
                Concepticon_Gloss=concept["CONCEPTICON_GLOSS"])
            gloss_to_cid[concept["ENGLISH"]] = cid

        args.writer.add_languages()

        path = self.raw_dir.joinpath('lundgren_ma_analyzed_data.tsv')
        wl = Wordlist(path.as_posix())
        for idx in progressbar(wl):
            # fall back to the joined tokens when no plain IPA value exists
            value = wl[idx, 'ipa'] or ''.join(wl[idx, 'tokens'])
            # "_" in the raw tokens marks a boundary and becomes "+"
            segs = [{'_': '+'}.get(tok, tok) for tok in wl[idx, 'tokens']]
            lexeme = args.writer.add_form_with_segments(
                Local_ID=idx,
                Language_ID=wl[idx, 'doculect'],
                Parameter_ID=gloss_to_cid[wl[idx, 'concept']],
                Value=value,
                Form=value,
                Segments=segs,
                Source=['Lundgren2020'])
            args.writer.add_cognate(
                lexeme=lexeme,
                Cognateset_ID=wl[idx, 'cogid'],
                Alignment=wl[idx, 'alignment'],
                Source=['Lundgren2020'])
    def cmd_makecldf(self, args):
        """Convert the raw CSV to CLDF, matching rows by Chinese gloss.

        Rows whose Chinese gloss is not in the concept list are counted in
        ``missing`` instead of being written.
        """
        data = self.raw_dir.read_csv('raw.csv', dicts=True)
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split('-')[-1] + '_' + slug(concept.gloss)
            args.writer.add_concept(
                ID=idx,
                Name=concept.gloss,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
                Chinese_Gloss=concept.attributes['chinese'])
            concepts[concept.attributes['chinese']] = idx
        # NOTE(review): add_languages is invoked twice (plain + with a
        # lookup); kept as-is since the writer's dedup behavior is not
        # visible here — confirm one call suffices.
        args.writer.add_languages()
        languages = args.writer.add_languages(lookup_factory='Name')
        args.writer.add_sources()

        # count raw Chinese glosses that have no matching concept
        missing = {}
        for entry in progressbar(data, desc='cldfify the data',
                                 total=len(data)):
            if entry['Chinese gloss'] in concepts:
                for language in languages:
                    if entry[language].strip():
                        args.writer.add_lexemes(
                            Language_ID=languages[language],
                            Parameter_ID=concepts[entry['Chinese gloss']],
                            Value=entry[language],
                            Source=['Chen2012'])
            else:
                # BUG FIX: the original did `missing[gloss] += 1` on a plain
                # dict, raising KeyError on the first unseen gloss.
                gloss = entry['Chinese gloss']
                missing[gloss] = missing.get(gloss, 0) + 1
# Esempio n. 10
    def cmd_makecldf(self, args):
        """Build the Võro IDS data as a single-language CLDF dataset."""
        glottocode = "voro1241"
        reprs = ["StandardOrth"]

        args.writer.add_concepts(id_factory=lambda c: c.attributes['ids_id'])
        args.writer.add_sources(*self.raw_dir.read_bib())

        personnel = self.get_personnel(args)
        args.writer.add_language(
            ID=glottocode,
            Name="Võro",
            Glottocode=glottocode,
            Authors=personnel['author'],
            DataEntry=personnel['data entry'],
            Consultants=personnel['consultant'],
            Representations=reprs,
            Latitude=58.0,
            Longitude=26.6,
            date='2020-09-17',
        )

        forms = self.read_csv("ids_voro1241.idsclldorg.csv")
        for form in pylexibank.progressbar(forms):
            if not form.form:
                continue  # skip empty entries
            args.writer.add_lexemes(
                Language_ID=glottocode,
                Parameter_ID=form.ids_id,
                Value=form.form,
                Comment=form.comment,
                Source="cosgrove2020",
                Transcriptions=reprs,
            )

        self.apply_cldf_defaults(args)
    def cmd_makecldf(self, args):
        """Build CLDF from a wide table: one row per language, one column
        (after the first) per concept.

        Concept headers appear either as "<chinese> <english>" or as the
        plain English gloss, so both spellings are indexed.
        """
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = '{0}_{1}'.format(concept.number, slug(concept.english))
            args.writer.add_concept(
                    ID=idx,
                    Name=concept.english,
                    Chinese_Gloss=concept.attributes['chinese'],
                    Concepticon_ID=concept.concepticon_id,
                    Concepticon_Gloss=concept.concepticon_gloss
                    )
            concepts[concept.attributes['chinese'] + ' ' + concept.english] = idx
            concepts[concept.english] = idx
        args.writer.add_languages()
        args.writer.add_sources()

        # the data rows and the language list are parallel (first data row
        # is a header and is skipped)
        rows = self.raw_dir.read_csv('data.tsv', delimiter='\t', dicts=True)[1:]
        for row, language in progressbar(zip(rows, self.languages)):
            # FIX: dropped the unused enumerate index the original carried
            for concept, entry in list(row.items())[1:]:
                if not entry.strip():
                    continue
                # fall back to the header without its leading Chinese gloss
                pidx = concepts.get(
                    concept,
                    concepts.get(' '.join(concept.split(' ')[1:]), '?'))
                args.writer.add_form(
                        Language_ID=language['ID'],
                        Parameter_ID=pidx,
                        Value=entry,
                        Form=entry.replace(" ", "_"),
                        Source=[language['Source']]
                        )
# Esempio n. 12
    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.

        Rows with an unknown language, unknown concept, or missing cognate
        ID are collected and reported at the end.
        """
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            cid = '{0}_{1}'.format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=cid,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
                Number=concept.number
            )
            concepts[concept.number] = cid
        args.log.info('[i] added concepts')
        languages = args.writer.add_languages(lookup_factory="Number")
        args.log.info('[i] added languages')
        args.writer.add_sources()

        missingL, missingC, missingCog = set(), set(), set()
        for row in progressbar(
                self.raw_dir.read_csv('data.tsv', delimiter='\t', dicts=True)):
            lid = languages.get(row['LANGUAGE'])
            cid = concepts.get(row['SID'])
            # take only the first cognate ID if there are several
            cog = row['COGNATE'].split('|')[0]
            if lid and cid and row["FORM"] and row["FORM"].strip():
                lexemes = args.writer.add_forms_from_value(
                    Language_ID=lid,
                    Parameter_ID=cid,
                    Value=row["FORM"],
                    Source='Sun1991'
                )
                if cog.strip():
                    args.writer.add_cognate(
                            lexeme=lexemes[0],
                            Cognateset_ID=cid + '-' + cog,
                            Cognate_Detection_Method='expert',
                            Source='Gao2020'
                            )
                else:
                    # BUG FIX: the original added `cogid`, which was unbound
                    # on the first miss (NameError) and stale afterwards;
                    # record the concept lacking a cognate ID instead.
                    missingCog.add(cid)

            # BUG FIX: the original added the (None) lookup results, so the
            # report printed 'missing L None'; record the raw keys instead.
            if not lid:
                missingL.add(row['LANGUAGE'])
            if not cid:
                missingC.add(row['SID'])
        for entry in missingL:
            print('missing L {0}'.format(entry))
        for entry in missingC:
            print('missing C {0}'.format(entry))
        for entry in missingCog:
            print('missing Cognate {0}'.format(entry))
# Esempio n. 13
    def cmd_makecldf(self, args):
        """Build CLDF from the Suansu wordlist, normalizing segments.

        Concepts are created on first encounter while iterating the
        wordlist; raw token spellings are normalized through ``converter``.
        """
        wl = lingpy.Wordlist(self.raw_dir.joinpath("suansu.tsv").as_posix())

        # normalization table: ad-hoc transcriptions -> IPA segments
        converter = {
            "ll": "lː",
            "ddʑ": "dʑː",
            "mm": "mː",
            "nn": "nː",
            "ss": "sː",
            "tts": "tsː",
            "tʂ": "ʈʂː",
            "bb": "bː",
            "dd": "dː",
            "pp": "pː",
            "tt": "tː",
            "ttʰ": "tʰː",
            "ɹɹ": "ɹː",
            "ff": "fː",
            "je": "j e",
            "oj": "oi",
            "ph": "pʰ",
            "th": "tʰ",
            "ttɕ": "tɕː",
            "ttʃ": "tʃː",
            "ma": "m a",
            "ē": "e",
            "ê": "e",
            "ʈʈʂ": "ʈʂː",
            "I": "ɪ",
            "ʷ": "w",
        }

        args.writer.add_sources()
        concepts = {}
        args.writer.add_languages()

        for k in progressbar(wl, desc="wl-to-cldf", total=len(wl)):
            # BUG FIX: the original tested `wl[k, "concepticon_id"] not in
            # concepts`, but the dict is keyed by the concept gloss, so the
            # test was always true and every row re-added its concept.
            if wl[k, "concept"] not in concepts:
                cid = "{0}_{1}".format(wl[k, "concepticon_id"],
                                       slug(wl[k, "concept"]))
                concepts[wl[k, "concept"]] = cid
                args.writer.add_concept(
                    ID=cid,
                    Name=wl[k, "concept"],
                    Concepticon_ID=wl[k, "concepticon_id"],
                    Concepticon_Gloss=wl[k, "concepticon_gloss"],
                )
            args.writer.add_form_with_segments(
                Language_ID="Suansu",
                Parameter_ID=concepts[wl[k, "concept"]],
                Value="".join(wl[k, "tokens"]),
                Form="".join(wl[k, "tokens"]),
                # re-split after conversion since some replacements expand
                # one token into two (e.g. "je" -> "j e")
                Segments=" ".join(
                    [converter.get(x, x) for x in wl[k, "tokens"]]).split(),
                Source=["Ivani2019"],
            )
# Esempio n. 14
def run(args):
    """
    main function.

    Iterate over the dataset's FormTable, derive a prosodic structure for
    each morpheme of every form, and print markdown tables for three error
    classes: length mismatches between segments and structure, segments the
    model could not classify ('?'), and syllable-structure violations.
    """
    ds = get_dataset(args)
    if args.medials:
        # comma-separated CLI option -> set of medial symbols
        args.medials = set(args.medials.split(','))
    # each error class maps (segments, structure) -> list of offending rows
    errors = {
        'length': defaultdict(list),
        'syllable': defaultdict(list),
        'missing': defaultdict(list)
    }
    if ds.cldf_dir.joinpath("forms.csv").exists():
        for row in progressbar(ds.cldf_reader()["FormTable"],
                               desc='iterate over wordlist'):
            # restrict to one doculect when requested, else take all rows
            if row['Language_ID'] == args.doculect or not args.doculect:
                strucs = get_structure(row['Segments'],
                                       medials=args.medials or MEDIALS)
                # walk structure templates and morphemes in lockstep
                for i, (struc, segments) in enumerate(
                        zip(strucs, morphemes(row['Segments']))):
                    if len(struc) != len(segments):
                        # template and segments differ in length
                        errors['length'][' '.join(segments),
                                         ' '.join(struc)] += [
                                             (row['ID'], i, row['Language_ID'],
                                              row['Form'], row['Segments'])
                                         ]
                    elif '?' in struc:
                        # '?' marks a segment that could not be classified
                        errors['missing'][' '.join(segments),
                                          ' '.join(struc)] += [
                                              (row['ID'], i,
                                               row['Language_ID'], row['Form'],
                                               row['Segments'])
                                          ]
                    elif not 'n' in struc or not 't' in struc:
                        # assumes 'n' = nucleus and 't' = tone slots in the
                        # structure alphabet — TODO confirm
                        errors['syllable'][' '.join(segments),
                                           ' '.join(struc)] += [
                                               (row['ID'], i,
                                                row['Language_ID'],
                                                row['Form'], row['Segments'])
                                           ]

    # print one markdown table per non-empty error class
    for error, errorname in [('length', 'Length Errors'),
                             ('missing', 'Missing Values'),
                             ('syllable', 'Syllable Errors')]:
        if errors[error]:
            print('# ' + errorname + '\n')
            table = []
            for i, ((segments, structure),
                    examples) in enumerate(errors[error].items()):
                table += [[i + 1, segments, structure, len(examples)]]
            print(
                tabulate(
                    table,
                    tablefmt='pipe',
                    headers=['Number', 'Segments', 'Structure', 'Examples']))
            print('')
# Esempio n. 15
    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.
        """
        concepts, wl_concepts = {}, {}
        seen = set()
        for concept in self.concepts:
            cid = '{0}_{1}'.format(concept['NUMBER'], slug(concept['ENGLISH']))
            if cid in seen:
                continue  # the list may repeat a concept; add it only once
            seen.add(cid)
            args.writer.add_concept(
                ID=cid,
                Name=concept['ENGLISH'],
                Glosses_in_Source=concept['GLOSSES_IN_SOURCE'],
                Concepticon_ID=concept['CONCEPTICON_ID'],
                Concepticon_Gloss=concept['CONCEPTICON_GLOSS'])
            # every source gloss maps back to this concept
            for gloss in concept['GLOSSES_IN_SOURCE'].split(' // '):
                concepts[gloss] = cid
                wl_concepts[gloss] = concept['ENGLISH']

        languages = args.writer.add_languages(lookup_factory="Name_in_Source")
        args.writer.add_sources()

        # make a wordlist for edictor to inspect the data
        D = {0: ['doculect', 'concept', 'ipa', 'cogid']}
        idx = 1

        for i, row in progressbar(
                enumerate(
                    self.raw_dir.read_csv('data.tsv',
                                          delimiter='\t',
                                          dicts=True))):
            for language, lid in languages.items():
                form = row[language].strip()
                if not form:
                    continue
                lexemes = args.writer.add_forms_from_value(
                    Language_ID=lid,
                    Parameter_ID=concepts[row['Meaning']],
                    Value=form,
                    Source='Holm2017')
                if not lexemes:
                    continue
                # one cognate set per data row
                args.writer.add_cognate(
                    lexeme=lexemes[0],
                    Cognateset_ID=str(i + 1),
                    Cognate_Detection_Method='expert',
                    Source='Holm2017')
                D[idx] = [
                    language, wl_concepts[row['Meaning']], form, i + 1
                ]
                idx += 1
        Wordlist(D).output(
            'tsv', filename=self.raw_dir.joinpath('wordlist').as_posix())
# Esempio n. 16
    def cmd_makecldf(self, args):
        """Build CLDF forms and cognates from the Peiros 2004 etymology file."""
        args.writer.add_sources()
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name",
        )
        # map the source's divergent concept spellings onto the list's names
        concepts["fat (n.)"] = concepts["fat n."]
        concepts["burn (tr.)"] = concepts["burn tr."]
        concepts["to fly"] = concepts["fly v."]
        concepts["lie (down)"] = concepts["lie"]
        concepts["walk (go)"] = concepts["walk(go)"]
        args.log.info("added concepts")

        languages = {}
        for language in self.languages:
            args.writer.add_language(**language)
            languages[language["Name"]] = language["ID"]
        args.log.info("added languages")

        columns = ["CONCEPT", "SUBGROUP", "LANGUAGE", "FORM", "COGNACY"]
        for raw_row in progressbar(
                self.raw_dir.read_csv("Peiros2004-data by etymology.txt",
                                      delimiter="\t")):
            if not "".join(raw_row).strip():
                continue  # skip blank lines
            row = dict(zip(columns, raw_row))

            # a "<... X" cognacy value marks a borrowing with its source
            bsource = ""
            if row["COGNACY"].isdigit():
                cogid = int(row["COGNACY"])
            elif row["COGNACY"].startswith("<"):
                bsource = row["COGNACY"].split(" ")[1]
                cogid = 0
            else:
                cogid = 0

            # concepts are looked up with apostrophes stripped
            gloss = row["CONCEPT"].replace("'", "")
            for lexeme in args.writer.add_forms_from_value(
                    Parameter_ID=concepts[gloss],
                    Language_ID=languages[row["LANGUAGE"].strip()],
                    Value=row["FORM"].strip(),
                    Source=["Peiros2004a"],
                    LoanSource=bsource,
                    Loan=True if bsource else False,
            ):
                args.writer.add_cognate(
                    lexeme,
                    Cognateset_ID=cogid,
                    Source=["Peiros2004a"],
                )
# Esempio n. 17
def run(args):
    """Report structural inconsistencies between tokens and CV-structures.

    Loads the dataset's CLDF wordlist, flags token strings with dangling or
    doubled morpheme separators ("+"), derives a structure template for each
    form, and prints a table of morphemes whose tokens and structure disagree.
    """
    ds = Dataset(args)
    wl = Wordlist.from_cldf(str(ds.cldf_specs().metadata_path))
    print('loaded wordlist')

    # report forms whose token string starts/ends with, or doubles, the
    # morpheme separator "+"
    for idx, form, tokens in wl.iter_rows('form', 'tokens'):
        if str(tokens).endswith('+') or str(tokens).startswith('+'):
            print(idx, tokens)
        elif '+ +' in str(tokens):
            print(idx, form, tokens)

    # derive a structure template per morpheme from the segments
    wl.add_entries(
        'structure', 'tokens', lambda x: basictypes.lists(' + '.join(
            [' '.join(y) for y in segments.get_structure(x)])))

    errors = []
    for idx, doculect, concept, value, form, tokens, structure in progressbar(
            wl.iter_rows('doculect', 'concept', 'value', 'form', 'tokens',
                         'structure')):
        if len(tokens.n) != len(structure.n):
            print('Wrong Length: {0} // {1}'.format(tokens, structure))
        # compare tokens and structure morpheme by morpheme
        for tok, struc in zip(tokens.n, structure.n):
            error = ''
            if len(tok) != len(struc):
                error = 'wrong length'
            elif 'n' not in struc:
                # "n" presumably marks the nucleus slot — TODO confirm
                error = 'missing vowel'
            #elif struc[0] == 'm':
            #    error = 'medial as initial'

            if error.strip():
                errors.append([
                    idx, doculect, concept, value, form, tok, struc, error
                ])
    # sort by error type, then structure, then doculect; prepend a counter
    table = sorted(errors, key=lambda x: (x[-1], x[-2], x[1]))
    for i, line in enumerate(table):
        table[i] = [i + 1] + line
    print(
        tabulate(table,
                 headers=[
                     'Count', 'ID', 'Doculect', 'Concept', 'Value', 'Form',
                     'Token', 'Structure', 'Error'
                 ],
                 tablefmt='pipe'))

    # distinct (form, token, structure) triples among the errors
    morphemes = {(line[-4], str(line[-3]), str(line[-2])) for line in table}
    for a, b, c in sorted(morphemes, key=lambda x: x[-2]):
        print(a + '\t' + b + '\t' + c)
Esempio n. 18
0
    def cmd_makecldf(self, args):
        """Convert the raw HSH-SCL wordlist into CLDF forms."""
        args.writer.add_sources()
        languages = args.writer.add_languages(lookup_factory="Name")
        concepts = args.writer.add_concepts(
            id_factory=lambda c: "{0}_{1}".format(c.id.split("-")[-1],
                                                  slug(c.english)),
            lookup_factory="Name")
        wordlist = lingpy.Wordlist(
            self.raw_dir.joinpath("HSH-SCL.csv").as_posix())
        for row_id in pylexibank.progressbar(wordlist):
            args.writer.add_forms_from_value(
                Language_ID=languages[wordlist[row_id, "language"]],
                Parameter_ID=concepts[wordlist[row_id, "concept"]],
                Value=wordlist[row_id, "reflex"],
                Source=["SoHartmann1988"],
            )
Esempio n. 19
0
    def cmd_makecldf(self, args):
        """Build the CLDF dataset from clean_data.tsv."""
        args.writer.add_sources()
        lang_map = args.writer.add_languages(lookup_factory="Name")
        param_map = args.writer.add_concepts(
            id_factory=lambda c: "{0}_{1}".format(c.number, slug(c.english)),
            lookup_factory="Name")

        rows = self.raw_dir.read_csv("clean_data.tsv",
                                     delimiter="\t",
                                     dicts=True)
        for record in pylexibank.progressbar(rows):
            args.writer.add_forms_from_value(
                Language_ID=lang_map[record["LANGUAGE"]],
                Parameter_ID=param_map[record["CONCEPT"]],
                Value=record["VALUE"],
                Source=["Kraft1981"],
            )
    def cmd_makecldf(self, args):
        """Create the CLDF dataset from the raw wordlists table."""
        rows = self.raw_dir.read_csv("wordlists.csv", dicts=True)

        args.writer.add_sources()
        language_ids = args.writer.add_languages()
        concept_map = args.writer.add_concepts(
            id_factory=lambda c: "{0}_{1}".format(c.id.split("-")[-1],
                                                  slug(c.english)),
            lookup_factory="Name")

        # each row carries one value per language column
        for record in pylexibank.progressbar(rows):
            for lid in language_ids:
                args.writer.add_forms_from_value(
                    Language_ID=lid,
                    Parameter_ID=concept_map[record["English"]],
                    Value=record[lid],
                    Source=["Tolmie1884"],
                )
Esempio n. 21
0
    def cmd_makecldf(self, args):
        """Convert st-data.tsv into CLDF forms and cognates.

        Concepts are keyed by their running NUMBER; languages by the
        abbreviation used as a column header in the raw file.
        """
        data = self.raw_dir.read_csv('st-data.tsv', delimiter='\t', dicts=True)

        args.writer.add_sources()

        # note: no way to easily replace this with the direct call to `add_concepts`
        # as we add the Chinese gloss via concept.attributes
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Number=concept.number,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            # lookup by list number, matching the NUMBER column of the data
            concepts[concept.number] = idx

        languages = args.writer.add_languages(lookup_factory="Abbreviation")

        visited = set()
        # data[0] is skipped — presumably a secondary header row; TODO confirm
        for row in pylexibank.progressbar(data[1:],
                                          desc='cldfify',
                                          total=len(data)):
            # the raw column headers are the upper-cased abbreviations
            for language in map(lambda x: x.upper(), languages):
                if language in row:
                    if row[language].strip():
                        entry = clean_entry(row[language])
                        if entry.strip():
                            lexeme = args.writer.add_form(
                                Language_ID=languages[language.lower()],
                                Parameter_ID=concepts[row['NUMBER']],
                                Value=row[language],
                                Form=entry,
                                Source='Peiros2004')
                            # cognate sets are local to a concept, hence the
                            # "<number>-<set>" compound identifier
                            args.writer.add_cognate(
                                lexeme=lexeme,
                                Cognateset_ID="{0}-{1}".format(
                                    row['NUMBER'], row[language + 'NUM']),
                                Source='Peiros2004')
                else:
                    # report each absent language column exactly once
                    if language not in visited:
                        visited.add(language)
                        print(language)
Esempio n. 22
0
    def cmd_makecldf(self, args):
        """Derive the CLDF dataset from raw.csv."""
        rows = self.raw_dir.read_csv("raw.csv", dicts=True)
        args.writer.add_sources()
        lang_map = args.writer.add_languages(lookup_factory="Name")

        concept_map = args.writer.add_concepts(
            id_factory=lambda c: "{0}_{1}".format(c.id.split("-")[-1],
                                                  slug(c.english)),
            lookup_factory="Name")

        # columns named after a language hold that language's lexeme
        for record in pylexibank.progressbar(rows):
            for column, lexeme in record.items():
                if column in lang_map:
                    args.writer.add_forms_from_value(
                        Language_ID=lang_map[column],
                        Parameter_ID=concept_map[record["Gloss"]],
                        Value=lexeme,
                        Source="Othaniel2017",
                    )
    def cmd_makecldf(self, args):
        """Convert the raw Chen2012 spreadsheet into CLDF.

        Concepts are looked up by their Chinese gloss; tone letters in the
        segments are rewritten to "<category>/<tone>" using hm-tones.tsv.
        """
        data = self.raw_dir.read_csv('raw.csv', dicts=True)
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split('-')[-1] + '_' + slug(concept.gloss)
            args.writer.add_concept(
                ID=idx,
                Name=concept.gloss,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
                Chinese_Gloss=concept.attributes['chinese'])
            concepts[concept.attributes['chinese']] = idx
        # NOTE(review): the original called add_languages() twice (once bare,
        # once with a lookup); a single call with a lookup factory both
        # registers the languages and returns the name map.
        languages = args.writer.add_languages(lookup_factory='Name')
        args.writer.add_sources()

        # map (language, tone letter) -> "category/tone" for rewriting the
        # tone segments of each lexeme below
        tones = {
            (row['Language_ID'], row['Tone']):
            row['Tone_category'] + '/' + row['Tone']
            for row in self.raw_dir.read_csv(
                'hm-tones.tsv', delimiter='\t', dicts=True)
        }
        missing = {}
        for entry in progressbar(data,
                                 desc='cldfify the data',
                                 total=len(data)):
            if entry['Chinese gloss'] in concepts:
                for language in languages:
                    if entry[language].strip():
                        lexemes = args.writer.add_lexemes(
                            Language_ID=languages[language],
                            Parameter_ID=concepts[entry['Chinese gloss']],
                            Value=entry[language],
                            Source=['Chen2012'])
                        for lexeme in lexemes:
                            lexeme['Segments'] = [
                                tones.get((lexeme['Language_ID'], s), s)
                                for s in lexeme['Segments']
                            ]
            else:
                # count glosses missing from the concept list; the original
                # did `missing[...] += 1` on a plain dict, which raised
                # KeyError for the first unseen gloss
                gloss = entry['Chinese gloss']
                missing[gloss] = missing.get(gloss, 0) + 1
Esempio n. 24
0
    def cmd_makecldf(self, args):
        """Convert the raw Sino-Tibetan wordlist into CLDF.

        Forms whose token string is empty or consists only of morpheme
        separators ("+") are skipped.
        """
        wl = lingpy.Wordlist(str(self.raw_dir / "sino-tibetan-raw.tsv"))
        args.writer.add_sources()
        # concepts keyed by their English gloss
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                TBL_ID=concept.attributes["huang_1992_1820"],
                Name=concept.english,
                Coverage=concept.attributes["coverage"],
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.english] = idx
        # language IDs and bibliography keys, both keyed by the name the raw
        # wordlist uses for the doculect
        languages, sources = {}, {}
        for language in self.languages:
            args.writer.add_language(**language)
            languages[language["Name_in_Source"]] = language["ID"]
            sources[language["Name_in_Source"]] = language["Source"]
        for idx in pylexibank.progressbar(wl, desc="cldfify"):
            if wl[idx, "tokens"] and " ".join(wl[idx, "tokens"]).strip("+"):
                row = args.writer.add_form(
                    Language_ID=languages[wl[idx, "doculect"]],
                    Local_ID=idx,
                    Parameter_ID=concepts[wl[idx, "concept"]],
                    # fall back to the joined tokens, then the ipa field,
                    # when no source entry is given
                    Value=wl[idx, "entry_in_source"].strip()
                    or "".join(wl[idx, "tokens"]) or wl[idx, "ipa"],
                    Form=".".join(wl[idx, "tokens"]),
                    Source=sources[wl[idx, "doculect"]].split(","),
                    Comment=wl[idx, "note"],
                    Cognacy=wl[idx, "cogid"],
                    # idiom: bool(x) instead of `True if x else False`
                    Loan=bool(wl[idx, "borrowing"].strip()),
                )

                # cognate sets are per concept, hence the compound identifier
                args.writer.add_cognate(
                    lexeme=row,
                    Cognateset_ID="{0}-{1}".format(wl[idx, "cogid"],
                                                   slug(wl[idx, "concept"])),
                    Source="Sagart2018",
                    Alignment="",
                    Alignment_Source="",
                )
    def cmd_makecldf(self, args):
        """Build CLDF forms from the YN-RGLD wordlist."""
        wordlist = lingpy.Wordlist(
            self.raw_dir.joinpath("YN-RGLD.csv").as_posix())

        args.writer.add_sources()
        concepts = args.writer.add_concepts(
            id_factory=lambda c: "{0}_{1}".format(c.id.split("-")[-1],
                                                  slug(c.english)),
            lookup_factory="SrcId")
        languages = args.writer.add_languages(lookup_factory="Name")

        # skip rows whose language or concept id is missing from the lookups
        for idx, doculect, srcid, reflex in pylexibank.progressbar(
                wordlist.iter_rows("doculect", "srcid", "reflex"),
                desc="make-cldf"):
            if doculect not in languages or srcid not in concepts:
                continue
            args.writer.add_forms_from_value(
                Language_ID=languages[doculect],
                Parameter_ID=concepts[srcid],
                Value=reflex,
                Source=["Nagano2013"],
            )
Esempio n. 26
0
    def cmd_makecldf(self, args):
        """Assemble the CLDF dataset from tls.txt."""
        # concepts keyed by their concept-list number
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            cid = "{0}_{1}".format(concept.id.split("-")[-1],
                                   slug(concept.english))
            concepts[concept.number] = cid
            args.writer.add_concept(
                ID=cid,
                Name=concept.english,
                NUMBER=concept.number,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
                Swahili_Gloss=concept.attributes["swahili"],
            )

        args.writer.add_sources()
        language_lookup = args.writer.add_languages(lookup_factory="Name")

        # TODO: add STEM and PREFIX? (pay attention to multiple forms)
        for entry in pylexibank.progressbar(
                self.raw_dir.read_csv("tls.txt", dicts=True)):
            # "Note" rows are internal comments; "Gweno1" duplicates "Gweno"
            if entry["LGABBR"] in ("Note", "Gweno1"):
                continue

            # normalise the raw source id so it matches the concept numbers
            src_idx = entry["SRCID"].replace(".0", "").replace(".5", "a")
            if src_idx not in concepts:
                continue

            args.writer.add_forms_from_value(
                Language_ID=language_lookup[entry["LGABBR"]],
                Parameter_ID=concepts[src_idx],
                Value=entry["REFLEX"],
                Source=["Nurse1975", "Nurse1979", "Nurse1980", "TLS1999"],
            )
Esempio n. 27
0
    def cmd_makecldf(self, args):
        """Convert the old CLICS wordlist dump into CLDF."""
        wl = lingpy.Wordlist(
            self.raw_dir.joinpath('D_old-clics.tsv').as_posix())
        args.log.info('loaded wordlist')

        # map the source tags used in the wordlist to bibliography keys
        src = {
            'wold': 'Wold2009',
            'ids': 'Key2007',
            'logos': 'Logos2008',
            'Språkbanken': 'Saxena2013'
        }
        args.writer.add_sources()

        seen_concepts, seen_languages = set(), set()
        concepticon = {
            c.id: c.gloss
            for c in Concepticon().conceptsets.values()
        }
        args.log.info('added concepticon')
        for k in progressbar(wl, desc='wl-to-cldf'):
            if not wl[k, 'value']:
                continue
            doculect = wl[k, 'doculect']
            concept = wl[k, 'concept']
            # register each language and concept on first sight
            if doculect not in seen_languages:
                args.writer.add_language(
                    ID=slug(doculect, lowercase=False),
                    Name=doculect,
                    Glottocode=wl[k, 'glottolog'])
                seen_languages.add(doculect)
            if concept not in seen_concepts:
                args.writer.add_concept(
                    ID=slug(concept, lowercase=False),
                    Name=concept,
                    Concepticon_ID=wl[k, 'concepticon_id'],
                    Concepticon_Gloss=concepticon.get(
                        wl[k, 'concepticon_id'], ''))
                seen_concepts.add(concept)
            args.writer.add_lexemes(
                Language_ID=slug(doculect, lowercase=False),
                Parameter_ID=slug(concept, lowercase=False),
                Value=wl[k, 'value'],
                Source=src.get(wl[k, 'source'], ''))
Esempio n. 28
0
    def cmd_makecldf(self, args):
        """Extract forms from the raw sqlite database (tryon.db)."""
        connection = sqlite3.connect((self.raw_dir / "tryon.db").as_posix())
        cursor = connection.cursor()
        cursor.execute(QUERY)

        args.writer.add_sources()
        lang_map = args.writer.add_languages(lookup_factory="Name")
        concept_map = args.writer.add_concepts(
            id_factory=lambda c: "{0}_{1}".format(c.id.split('-')[-1],
                                                  slug(c.english)),
            lookup_factory="Name")

        # each result row is a (language, concept, value) triple
        for lang, param, value in progressbar(cursor.fetchall()):
            if not value:
                continue
            args.writer.add_forms_from_value(
                Language_ID=lang_map[lang],
                Parameter_ID=concept_map[param],
                Value=self.lexemes.get(value, value).strip(),
                Source=['Tryon1983'],
            )
        connection.close()
Esempio n. 29
0
    def cmd_makecldf(self, args):
        """Build the CLDF dataset from data.tsv (Sims 2020).

        Concepts are looked up both by their English gloss and by every
        per-language gloss variant recorded in the concept list.
        """
        data = self.raw_dir.read_csv("data.tsv", delimiter="\t", dicts=True)
        args.writer.add_sources()
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = '{0}_{1}'.format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                # was `Number=concept.english` — a copy/paste slip; the
                # Number column should carry the concept-list number
                Number=concept.number,
                Variants=concept.attributes["lexibank_gloss"],
            )
            for variant in concept.attributes["lexibank_gloss"]:
                concepts[variant] = idx
            concepts[concept.english] = idx

        languages = args.writer.add_languages(lookup_factory="Name")
        # Only instance where the variant is switched, so we fix that manually.
        concepts["duck²⁹"] = "51_duck"

        for row in progressbar(data):
            for language in self.languages:
                # `language` is a row dict; the original compared the whole
                # dict to the string "Tangut", which is never equal, so the
                # intended Tangut skip never fired
                if language["Name"] == "Tangut":
                    continue
                entry = row.get(language["Name"]) or row.get(
                    language["Name"] + "_form")
                concept = concepts.get(row.get(language["Name"] + "_gloss"))
                if entry and concept and entry not in ["NA"] and concept not in ["NA"]:
                    # NOTE(review): Language_ID is the language *name*, not
                    # the ID from `languages` — presumably IDs equal names
                    # here; verify against etc/languages.csv
                    lexeme = args.writer.add_forms_from_value(
                        Language_ID=language["Name"],
                        Parameter_ID=concept,
                        Value=entry,
                        Source="Sims2020",
                    )[0]
                    args.writer.add_cognate(
                        lexeme,
                        Cognateset_ID=row["Set #1"],
                        STEDT=str(row["STEDT # "] if "STEDT # " in row else ""),
                        Source="Sims2020",
                    )
Esempio n. 30
0
    def cmd_makecldf(self, args):
        """Build the CLDF dataset from the sign-alphabet wordlist."""
        wl = lingpy.Wordlist(str(self.raw_dir / 'signalphabets.tsv'))

        # concepts are the wordlist glosses, numbered 1..n in order
        concepts, sources = {}, {}
        for i, c in enumerate(wl.rows):
            args.writer.add_concept(
                ID=str(i + 1),
                Name=c,
            )
            concepts[c] = str(i + 1)
        for language in self.languages:
            args.writer.add_language(
                ID=language['Name_in_Database'],
                Name=language['Name'],
                Latitude=language['Latitude'],
                Longitude=language['Longitude'],
                Glottocode=language['Glottolog'],
                SubGroup=language['SubGroup'],
            )
            sources[language['Name_in_Database']] = language['Source']
        # "Ukranian_SL" (sic) appears to be the spelling used in the raw
        # wordlist; give it a source and map it to the correctly spelled
        # language ID — TODO confirm against signalphabets.tsv
        sources['Ukranian_SL'] = 'Lydell2018'
        languages = {language: language for language in sources}
        languages['Ukranian_SL'] = 'Ukrainian_SL'

        # only ship the bibliography entries that are actually referenced
        args.writer.add_sources(
            *[x for x in self.raw_dir.read_bib() if x.id in sources])

        # iter_rows yields the row id followed by the requested columns
        for i, c, l, h1, h2, t, cid in progressbar(wl.iter_rows(
                'concept', 'doculect', 'handshape_1', 'handshape_2', 'tokens',
                'cogid'),
                                                   desc='makecldf'):
            # value is the handshape pair; form is the tokenised string
            row = args.writer.add_form(Value=h1 + ' ' + h2,
                                       Language_ID=languages[l],
                                       Parameter_ID=concepts[c],
                                       Form=' '.join(t),
                                       Source=sources[l])
            args.writer.add_cognate(
                lexeme=row,
                Cognateset_ID=cid,
            )