Example #1
0
    def cmd_download(self, args):
        # https://diacl.ht.lu.se/GeoJson/GeographicalPresence/24
        print("Download wordlists ...")
        wordlists = self._download_json("WordLists")
        for wlid in progressbar(list(wordlists.keys())):
            # We download the XML representations, because only these seem to contain source info
            # per lexeme.
            self.raw_dir.download(
                self._url("/Xml/WordListWithLanguageLexemes/{0}".format(wlid)),
                "wl{0}.xml".format(wlid),
                skip_if_exists=True,
            )
        print("... done")

        print("Download etymologies ...")
        etymologies_by_wordlistitem = OrderedDict()
        for wl in wordlists.values():
            print(wl["Name"])
            for wlc in wl["WordListCategories"].values():
                print("-- ", wlc["Name"])
                for wli in progressbar(wlc["WordListItems"]):
                    data = self._download_json("WordListLexemesWithAncestors/{0}".format(wli))
                    del data["lexemes"]
                    del data["languages"]
                    etymologies_by_wordlistitem[wli] = data
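        # Bundle all etymology data into one gzipped JSON file and remove the
        # per-item downloads afterwards.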
        with gzip.GzipFile(str(self.raw_dir.joinpath("etymology.json.gz")), "w") as fp:
            fp.write(dumps(etymologies_by_wordlistitem).encode("utf8"))
        for p in self.raw_dir.glob("WordListLexemesWithAncestors*"):
            p.unlink()
        print("... done")

        self._download_json("LanguageTree")
Example #2
0
    def cmd_makecldf(self, args):
        # add the bibliographic sources
        args.writer.add_sources()

        # add the languages from the language list (no need for mapping here)
        args.writer.add_languages()

        # add the concepts from the concept list
        concept_lookup = {}
        for concept in self.conceptlists[0].concepts.values():
            cid = "%s_%s" % (concept.id.split("-")[-1], slug(concept.english))
            args.writer.add_concept(
                ID=cid,
                Name=concept.english,
                NorthEuralex_Gloss=concept.attributes["nelex_id"],
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concept_lookup[concept.attributes["nelex_id"]] = cid

        # add items
        lexeme_rows = self.raw_dir.read_csv("nelex.tsv",
                                            delimiter="\t",
                                            dicts=True)
        for row in progressbar(lexeme_rows):
            args.writer.add_form(
                Language_ID=row["Language_ID"],
                Parameter_ID=concept_lookup[row["Concept_ID"]],
                Value=row["Word_Form"],
                Form=row["rawIPA"],
                Source=["Dellert2020"],
            )
Example #3
0
    def cmd_makecldf(self, args):
        args.writer.add_sources()
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english), lookup_factory="Name"
        )
        languages = args.writer.add_languages(id_factory=lambda l: l["Name"])

        reader = self.raw_dir.read_csv(self.raw_dir / "Wordlist.tsv", dicts=True, delimiter="\t")

        for row in progressbar(reader):
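            # columns whose header is a known language name/ID hold that language's
            # lexeme for the row's concept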
            lexemes = {k: v for k, v in row.items() if k in languages}
            for language, lexeme in lexemes.items():
                args.writer.add_forms_from_value(
                    Language_ID=language,
                    Parameter_ID=concepts[row["CONCEPT"]],
                    Value=lexeme,
                    Source="Mitterhofer2013",
                    Loan=False,
                )

        # We explicitly remove the ISO code column since the languages in
        # this dataset do not have ISO codes.
        args.writer.cldf["LanguageTable"].tableSchema.columns = [
            col
            for col in args.writer.cldf["LanguageTable"].tableSchema.columns
            if col.name != "ISO639P3code"
        ]
    def cmd_makecldf(self, args):
        # add sources
        args.writer.add_sources()

        # add languages
        languages = args.writer.add_languages(lookup_factory="Name")

        # add concepts
        concepts = args.writer.add_concepts(
            id_factory=lambda cpt: "%s_%s" %
            (cpt.id.split("_")[0], slug(cpt.english)),
            lookup_factory="Name",
        )

        # Hard-coded fixes to segment errors in raw source
        segments = {
            "áː": "áː/aː",
            "âː": "âː/aː",
            "aʰ": "a h",
            "ɐ̃ʰ": "ɐ̃ h",
            "í": "í/i",
            "íː": "íː/iː",
            "iʰ": "i h",
            "i̥": "i̥/i",
            "ka": "k a",
            "kw": "kʷ",  # the single instance is a labialized velar
            "nⁱ": "n i",
            "óː": "óː/oː",
            "teː": "t eː",
            "ú": "u/u",
            '#': '+'
        }

        # read wordlist with lingpy
        wl_file = self.raw_dir / "Bruzzi_Granadillo.txt"
        wl = lingpy.Wordlist(wl_file.as_posix())

        # iterate over wordlist
        for idx in progressbar(wl, desc="makecldf"):
            # write lexemes
            lex = args.writer.add_form_with_segments(
                Language_ID=languages[wl[idx, "doculect"]],
                Parameter_ID=concepts[wl[idx, "concept"]],
                Value=wl[idx, "entrj_in_source"],
                Form=wl[idx, "ipa"],
                Segments=" ".join(
                    [segments.get(x, x) for x in wl[idx, "tokens"]]).split(),
                Source=[
                    "granadillo_ethnographic_2006", "silva_discoteca_1961"
                ],
            )

            args.writer.add_cognate(
                lexeme=lex,
                Cognateset_ID=wl[idx, "cogid"],
                Source=["Chacon2019"],
            )
Example #5
0
    def cmd_makecldf(self, args):
        args.writer.add_sources(*self.etc_dir.read_bib())
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split('-')[-1] + '_' + slug(c.english),
            lookup_factory=lambda c: c['ID'].split('_')[0])
        for wl in progressbar(self.iter_wordlists(args.log), desc="cldfify"):
            wl.to_cldf(args.writer, concepts)
            # Now normalize the typedby and checkedby values:
            args.writer.objects['LanguageTable'][-1] = normalize_contributors(
                args.writer.objects['LanguageTable'][-1])
Example #6
0
    def cmd_makecldf(self, args):
        # due to bad concept ids in STEDT, we need to load them from file
        converter = defaultdict(set)
        for row in self.raw_dir.read_csv("srcids.tsv", delimiter="\t", dicts=True):
            converter[row["CORRECTED"]].add(row["IDINSTEDT"])

        concept_lookup = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
                Number=concept.number,
            )
            concept_lookup[concept.number] = idx
            for id_in_source in converter[concept.number]:
                concept_lookup[id_in_source] = idx

        language_lookup = args.writer.add_languages(lookup_factory="Name")
        args.writer.add_sources()

        for row in progressbar(self.raw_dir.read_csv("AH-CSDPN.tsv", delimiter="\t")[1:]):
            args.writer.add_forms_from_value(
                Local_ID=row[0],
                Language_ID=language_lookup[row[6]],
                Parameter_ID=concept_lookup[row[7]],
                Value=row[1],
                Source=["Hale1973"],
            )

    def cmd_makecldf(self, args):
        # sources are poorly annotated, so we need to correct them manually
        src = {
            "H&R92": "huber_vocabulario_1992",
            "H&R 1992": "huber_vocabulario_1992",
            "Melendez 2011": "melendez_lozano_diccionario_2011",
            "Allin 1979": "allin_vocabulario_1979",
            "Aikhenvald 2012": "aikhenvald_dicionario_2012",
            "Aikenvald2001": "aihenvald_dicionario_2001",
            "Oliveira 93": "cunha_de_oliveira_uma_1993",
            "Ramirez2001": "ramirez_dicionario_2001",
            "Ramirez 2001": "ramirez_dicionario_2001",
            "Schauer 2005": "schauer_diccionario_2005",
            "Aikhenvald 2001": "aikhenvald_dicionario_2001",
        }

        # add source
        args.writer.add_sources()

        # add languages
        languages = args.writer.add_languages(lookup_factory="Name")

        # add concepts
        concepts = args.writer.add_concepts(
            id_factory=lambda c: "%s_%s" %
            (c.id.split("-")[-1], slug(c.english)),
            lookup_factory="Name",
        )

        # read raw wordlist add lexemes
        wl_file = self.raw_dir / "arawakan_swadesh_100_edictor.tsv"
        wl = lingpy.Wordlist(wl_file.as_posix())

        for idx in progressbar(wl, desc="makecldf"):
            if wl[idx, "value"]:
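                # drop a leading word-boundary marker ('_') from the segments, if present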
                if wl[idx, 'segments'][0] == '_':
                    wl[idx, 'segments'] = wl[idx, 'segments'][1:]
                lex = args.writer.add_form_with_segments(
                    Language_ID=languages[wl[idx, "doculect"]],
                    Parameter_ID=concepts[wl[idx, "concept"]],
                    Value=wl[idx, "value"],
                    Form=wl[idx, "form"],
                    Segments=wl[idx, "segments"],
                    Source=src.get(wl[idx, "source"], "Chacon2017"),
                )

                # add cognate
                args.writer.add_cognate(lexeme=lex,
                                        Cognateset_ID=wl[idx, "cogid"],
                                        Source=["Chacon2017"])
    def cmd_makecldf(self, args):
        args.writer.add_sources()
        languages = args.writer.add_languages(
            id_factory=lambda l: l["Name"],
            lookup_factory=lambda l: (l["Name"], l["Source"]))
        sources = {k[0]: k[1] for k in languages}  # language: source map
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")
        for row in progressbar(
                self.raw_dir.read_csv("madang.csv", dicts=True, delimiter="\t")):
            concept = CONCEPT_REMAPPING.get(row["CONCEPT"], row["CONCEPT"])
            args.writer.add_forms_from_value(
                Local_ID=row["ID"],
                Language_ID=row["DOCULECT"],
                Parameter_ID=concepts[concept],
                Value=row["COUNTERPART"],
                Source=sources[row["DOCULECT"]],
            )
Example #9
0
    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.
        """
        data = self.raw_dir.read_csv('wordlist.tsv',
                                     dicts=True,
                                     delimiter='\t')
        args.writer.add_sources()
        languages = args.writer.add_languages(lookup_factory="ID")
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")

        for row in progressbar(data, desc="cldfify"):
            if row["DOCULECT"] in languages:
                args.writer.add_forms_from_value(
                    Language_ID=row["DOCULECT"],
                    Parameter_ID=concepts[row["CONCEPT"]],
                    Value=row["TRANSCRIPTION"],
                    Source=["chinds"],
                )
Example #10
0
    def cmd_makecldf(self, args):
        args.writer.add_sources()
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")
        languages = args.writer.add_languages(id_factory=lambda l: l["Name"])

        reader = self.raw_dir.read_csv(self.raw_dir / "Wordlist.tsv",
                                       dicts=True,
                                       delimiter="\t")

        for row in progressbar(reader):
            lexemes = {k: v for k, v in row.items() if k in languages}
            for language, lexeme in lexemes.items():
                args.writer.add_forms_from_value(
                    Language_ID=language,
                    Parameter_ID=concepts[row["CONCEPT"]],
                    Value=lexeme,
                    Source="Mitterhofer2013",
                    Loan=False,
                )
    def cmd_makecldf(self, args):
        data = self.raw_dir.read_csv('raw.tsv', delimiter="\t", dicts=True)

        # Quite a hack to allow things like "1995.pdfb" as Source IDs:
        bib = pybtex.database.parse_string(self.raw_dir.read('sources.bib'), bib_format='bibtex')
        sources = []
        for k, e in bib.entries.items():
            # Unfortunately, Source.from_entry does not allow any keyword arguments to be passed
            # to the constructor, see https://github.com/cldf/pycldf/issues/99
            e.fields['_check_id'] = False
            sources.append(Source.from_entry(k, e))
        args.writer.add_sources(*sources)

        language_lookup = args.writer.add_languages(lookup_factory='NameInSource')
        concept_lookup = args.writer.add_concepts(
            id_factory=lambda x: x.id.split('-')[-1]+'_'+slug(x.english),
            lookup_factory='Name'
        )
        lang_sources = {l['NameInSource']: l['Source'].split(",") for l in self.languages}

        # remap concepts for personal pronouns
        remap_concepts = {
            '1SG pronoun': '1sg pronoun',
            '2SG pronoun': '2sg pronoun',
            '3SG pronoun': '3sg pronoun',
        }

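        # one row per meaning; each language column holds that language's counterpart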
        for line_dict in progressbar(data, desc='cldfify'):
            concept = line_dict['Meaning']
            concept_id = concept_lookup.get(remap_concepts.get(concept, concept))
            for language, language_id in language_lookup.items():
                value = line_dict[language].strip()
                if value:
                    args.writer.add_form(
                        Value=value,
                        Form=value,
                        Parameter_ID=concept_id,
                        Language_ID=language_id,
                        Source=lang_sources[language]
                    )
Example #12
0
    def cmd_makecldf(self, args):
        wl = lingpy.Wordlist((self.raw_dir / "D_old-clics.tsv").as_posix())
        src = {"logos": "Logos2008"}
        args.writer.add_sources(*self.raw_dir.read_bib())
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")

        for k in progressbar(wl):
            if wl[k, "value"]:
                args.writer.add_language(
                    ID=slug(wl[k, "doculect"], lowercase=False),
                    Name=wl[k, "doculect"],
                    Glottocode=wl[k, "glottolog"],
                )
                args.writer.add_form(
                    Language_ID=slug(wl[k, "doculect"], lowercase=False),
                    Parameter_ID=concepts[wl[k, "concept"]],
                    Value=wl[k, "value"],
                    Form=wl[k, "value"],
                    Source=src.get(wl[k, "source"], ""),
                )
Example #13
0
    def cmd_makecldf(self, args):
        args.writer.add_sources()
        data = self.raw_dir.read_csv("zhang2019-oc-rgyal.tsv", dicts=True,
                delimiter="\t")
        # add languages
        languages = args.writer.add_languages(lookup_factory='Name')
        for lan in self.languages:
            languages[lan['Name']] = {'Source': lan['Source'], 'ID': lan['ID']}
        
        # add concepts
        concepts = {}
        for concept in self.concepts:
            idx = '{0}_{1}'.format(
                    concept['NUMBER'],
                    slug(concept['ENGLISH']))

            args.writer.add_concept(
                    ID=idx,
                    Name=concept['ENGLISH'],
                    Chinese_Gloss=concept['CHINESE'],
                    Gloss_in_Source=concept['GLOSS_IN_SOURCE']
                    )
            concepts[concept['CHINESE'].strip()] = idx

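        # each row of the raw table is one cognate set; the (1-based) row index serves
        # as the cognate set ID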
        for cogid, entry in progressbar(
                enumerate(data), desc="cldfify", total=len(data)
                ):
            for language, value in languages.items():
                if entry[language].strip():
                    for row in args.writer.add_forms_from_value(
                        Language_ID=value['ID'],
                        Parameter_ID=concepts[entry["Chinese_character"]],
                        Value=entry[language],
                        Source=[value['Source']]
                        ):
                        args.writer.add_cognate(
                                lexeme=row,
                                Cognateset_ID=cogid+1)
Example #14
0
    def cmd_makecldf(self, args):
        data = self.raw_dir.read_csv("forms.csv", dicts=True)
        args.writer.add_sources()

        # TODO: add concepts with `add_concepts`
        concept_lookup = {}
        for concept in self.concepts:
            idx = concept['sort'] + "_" + slug(concept['eng'])
            args.writer.add_concept(
                ID=idx,
                Name=concept['eng'],
                Number=concept['sort'],
                Russian_Gloss=concept['rus'],
                Concepticon_ID=concept['СС_no']
                if concept['СС_no'] != '0' else '',
                #Concepticon_Gloss=concept['eng'] if concept['eng'] else '',
            )
            concept_lookup[concept['sort']] = idx
        language_lookup = {}
        for language in self.languages:
            args.writer.add_language(ID=language['lang.id'],
                                     Name=language['lang.name'],
                                     Glottocode=language['glottocode'],
                                     Latitude=language['latitude'],
                                     Longitude=language['longitude'])
            language_lookup[language['lang.name']] = language['lang.id']

        for k in progressbar(data, desc="wl-to-cldf"):
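            # keep entries without a subentry marker (or marked 'sg') whose concept is in
            # the selected concept list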
            if (not k['subentry']
                    or k['subentry'] == 'sg') and k['lc.id'] in concept_lookup:
                args.writer.add_forms_from_value(
                    Language_ID=k['lang'],
                    Parameter_ID=concept_lookup[k["lc.id"]],
                    Value=k["orthographic"],
                    Source="lexcauc")
            elif k['lc.id'] not in concept_lookup:
                print(k['lc.id'])
    def cmd_makecldf(self, args):
        args.writer.add_sources()

        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Number=concept.number,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.number] = idx
        languages = args.writer.add_languages(lookup_factory="Name")

        # here we need to add the lexemes
        data = self.raw_dir.read_csv('100item-phylo.Sheet1.csv', dicts=False)
        for i, row in progressbar(enumerate(data[4:])):
            number = row[0].strip().strip('.')
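            # data columns come in pairs: the form in row[j+2], its cognate code in
            # row[j+3]; language names are taken from the header row (data[2])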
            for j in range(0, len(row)-2, 2):
                language = data[2][j+2]
                value = row[j+2]
                if value.strip() and value.strip() not in ['-----']:
                    if 'or' not in row[j+3]:
                        cogid = str(int(float(row[j+3])))
                    else:
                        cogid = row[j+3].split()[0]
                    for lexeme in args.writer.add_forms_from_value(
                            Parameter_ID=concepts[number],
                            Language_ID=languages[language],
                            Value=value.strip(),
                            Source='Deepadung2015'):
                        args.writer.add_cognate(
                                lexeme=lexeme,
                                Cognateset_ID=cogid+'-'+number,
                                Source='Deepadung2015')
    def cmd_makecldf(self, args):
        languages = []
        number_files = sorted(list((self.raw_dir / "uninumrepo" / "numbers/").glob("**/*.tsv")))
        codes = self.raw_dir / "uninumrepo" / "codes.tsv"
        args.writer.add_sources()
        concepts = args.writer.add_concepts(id_factory=lambda c: c.english, lookup_factory="Name")

        for code in self.raw_dir.read_csv(codes, delimiter="\t", dicts=True):
            # We add additional Glottocodes based on languages.tsv wherever applicable:
            substitute = list(filter(lambda y: y["Code"] == code["Code"], self.languages))

            args.writer.add_language(
                ID=code["Code"],
                Name=code["Language name(s)"],
                Code=code["Code"],
                Glottocode=substitute[0]["Glottocode"] if substitute else code["Glottocode"],
                ISO639P3code=code["ISO 639-3"],
                Script=code["Script"],
                Locale=code["Locale"],
                Ethnologue=code["Ethnologue"],
                Variety=code["Variety"],
            )
            languages.append(code["Code"])

        for number_file in progressbar(number_files):
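            # the file name (minus the .tsv extension) is the language code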
            lcode = number_file.name.split(".tsv")[0]

            for entry in self.raw_dir.read_csv(number_file, delimiter="\t"):
                # entry[0] is the concept.
                # entry[1] is the lexeme.
                args.writer.add_lexemes(
                    Language_ID=lcode,
                    Parameter_ID=concepts[entry[0]],
                    Value=entry[1],
                    Source="Ritchie2019",
                )
Example #17
0
    def cmd_makecldf(self, args):
        data = self.raw_dir.read_csv('DagLoans_Words.tsv',
                                     delimiter="\t",
                                     dicts=True)
        args.writer.add_sources()
        concepts = {}
        for concept in self.concepts:
            idx = '{0}_{1}'.format(concept['NUMBER'], slug(concept['ENGLISH']))
            args.writer.add_concept(
                ID=idx,
                Name=concept['ENGLISH'],
                Number=concept['NUMBER'],
                Concepticon_ID=concept['CONCEPTICON_ID'],
                Concepticon_Gloss=concept['CONCEPTICON_GLOSS'])
            concepts[concept['ENGLISH']] = idx
        sources, languages = {}, {}
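        # 'Dictionary' and 'Expert' are placeholders in the District column; dictionary
        # entries additionally provide the per-list source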
        for language in self.languages:
            if language['District'] == 'Dictionary':
                sources[language['List_ID']] = language['Source'].strip()
                language['District'] = ''
            elif language['District'] == 'Expert':
                language['District'] = ''
            args.writer.add_language(**language)
            languages[language['List_ID']] = language['ID']

        for row in progressbar(data):
            lexeme = args.writer.add_form(
                Language_ID=languages[row['List_ID']],
                Parameter_ID=concepts[row['Concept']],
                Local_ID=row['Entry_ID'],
                Value=row['Standard_Transcription'],
                Form=row['Word'],
                Source=sources.get(row['List_ID'], ''),
                Borrowing_ID=row['Concept_ID'] + '-' + row['Stem'])
            args.writer.add_cognate(lexeme=lexeme,
                                    Cognateset_ID=lexeme['Borrowing_ID'])
Example #18
0
    def cmd_makecldf(self, args):

        sound_cat = self.raw_dir.read_json(self.catalog_file_name)

        # add sources
        args.writer.add_sources()

        # add languages from explicit file
        concepts = {}
        for concept in self.concepts:
            args.writer.add_concept(**concept)
            concepts[concept['IndexInSource']] = concept['ID']
        languages = {}
        for language in self.languages:
            args.writer.add_language(**language)
            languages[language['IndexInSource']] = language['ID']

        # Load JSON data
        json_data = self.raw_dir.read_json(self.data_file_name)

        # collect missing languages
        missing = set()

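        # custom table for the sound files associated with the forms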
        media = []
        args.writer.cldf.add_table('media.csv',
                                   'ID',
                                   'Description',
                                   'URL',
                                   'mimetype', {
                                       'name': 'size',
                                       'datatype': 'integer'
                                   },
                                   'Form_ID',
                                   primaryKey=['ID'])

        args.writer.cldf.add_foreign_key(
            'media.csv',
            'Form_ID',
            'FormTable',
            'ID',
        )

        # Add lexemes
        for idx in progressbar(sorted(
                json_data['transcriptions'],
                key=lambda k:
            (int(json_data['transcriptions'][k]['LanguageIx']),
             int(json_data['transcriptions'][k]['IxElicitation']),
             int(json_data['transcriptions'][k]['IxMorphologicalInstance']))),
                               desc='makecldf'):
            lexeme = json_data['transcriptions'][idx]

            # Skip over entries with no phonetic transcription, an empty
            # phonetic transcription, or from a different study
            # (missing language).
            if 'Phonetic' not in lexeme:  # pragma: no cover
                continue
            if not lexeme['Phonetic']:
                continue
            if lexeme['LanguageIx'] not in languages:  # pragma: no cover
                missing.add(lexeme['LanguageIx'])
                continue

            # If there is only one elicitation for a meaning,
            # it comes as a plain string (otherwise as a list).
            # Turn this string into a list as well.
            if isinstance(lexeme['Phonetic'], str):
                lexeme['Phonetic'] = [lexeme['Phonetic']]
                lexeme['path'] = [lexeme['path']]
                lexeme['soundPaths'] = [lexeme['soundPaths']]

            ref_id = None
            last_altlex = None
            for i, value in enumerate(lexeme['Phonetic']):
                v = value.strip()
                # Skip if value is empty
                if not v or v in self.form_spec.missing_data:
                    continue
                # Commas are not allowed!
                if ',' in v:  # pragma: no cover
                    args.log.warn(
                        'Comma not allowed in /{0}/ for {1} - {2}'.format(
                            value, languages[lexeme['LanguageIx']],
                            lexeme['IxElicitation']))
                param_id = concepts['{0}-{1}'.format(
                    lexeme['IxElicitation'],
                    lexeme['IxMorphologicalInstance'])]

                new = args.writer.add_form(
                    Language_ID=languages[lexeme['LanguageIx']],
                    Local_ID='{0}-{1}-{2}'.format(
                        lexeme['LanguageIx'], lexeme['IxElicitation'],
                        lexeme['IxMorphologicalInstance']),
                    Parameter_ID=param_id,
                    Value=v,
                    Form=v,
                    Loan=(lexeme['RootIsLoanWordFromKnownDonor'] == '1'),
                    Source=self.source_id_array,
                    Variant_Of=ref_id
                    if int(lexeme['AlternativePhoneticRealisationIx'][i]) > 0
                    else None,
                )

                # add media
                if isinstance(lexeme['soundPaths'], list)\
                        and len(lexeme['soundPaths'][0]) > 0\
                        and len(lexeme['soundPaths'][i][0]) > 0:
                    if lexeme['path'][i] in sound_cat:
                        for bs in sorted(
                                sound_cat[lexeme['path'][i]]['bitstreams'],
                                key=lambda x: x['content-type']):
                            media.append({
                                'ID': bs['checksum'],
                                'Description': lexeme['path'][i],
                                'URL': 'https://cdstar.shh.mpg.de/bitstreams/{0}/{1}'.format(
                                    sound_cat[lexeme['path'][i]]['id'], bs['bitstreamid']),
                                'mimetype': bs['content-type'],
                                'size': bs['filesize'],
                                'Form_ID': new['ID'],
                            })
                    else:  # pragma: no cover
                        args.log.warn(
                            'Missing sound file name in catalog {0}.'.format(
                                lexeme['path'][i]))

                # Remember the last inserted ID so that alternative pronunciations can
                # refer to it via 'Variant_Of'. This works because the downloaded JSON
                # data are sorted by altlex and altpron.
                if last_altlex != int(lexeme['AlternativeLexemIx'][i]):
                    ref_id = new['ID']
                last_altlex = int(lexeme['AlternativeLexemIx'][i])

                # add cognate if desired
                if self.create_cognates:
                    wcogid = '{0}-{1}'.format(
                        param_id, lexeme['WCogID'][i] if lexeme['WCogID'][i]
                        and int(lexeme['WCogID'][i]) > 1 else '1')
                    args.writer.add_cognate(
                        lexeme=new,
                        Cognateset_ID=wcogid,
                        Source=self.source_id_array,
                    )

        args.writer.write(**{'media.csv': media})

        for m in sorted(missing):  # pragma: no cover
            args.log.warn('Missing language with ID {0}.'.format(m))
Example #19
0
    def cmd_makecldf(self, args):

        wl = self.raw_dir.read_csv("wordlist.tsv", delimiter="\t")
        concept_lookup = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Chinese_Gloss=concept.attributes["chinese"],
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concept_lookup[concept.number.rjust(3, "0")] = [idx, concept]
        language_lookup = {k["ID_in_Source"]: k for k in self.languages}
        args.writer.add_languages()
        args.writer.add_sources()

        idx = 1
        mapping = {
            0: [
                "doculect",
                "doculectid",
                "glottocode",
                "concept",
                "glossid",
                "value",
                "form",
                "phonetic",
                "concepticon_id",
                "concepticon_gloss",
            ]
        }

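        # The raw sheet alternates blocks: a "'Ref#" row carries the concept numbers, a
        # "Gloss" row the glosses, and the language rows that follow hold either the
        # orthographic values or (after a row with an empty first cell) the phonetic forms.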
        for line in progressbar(wl, desc="load the data"):
            if not line[0].strip():
                phonetic = True
            if line[0] == "'Ref#":
                numbers = line
                phonetic = False
                idxs = defaultdict(list)
            elif line[0] == "Gloss":
                glosses = line
            elif line[0] in language_lookup and not phonetic:
                taxon = line[0]
                for num, gloss, val in zip(numbers[1:], glosses[1:], line[1:]):
                    if num.strip() and gloss.strip():
                        cname = concept_lookup[num[1:]][1]
                        forms = val.split(",")
                        if forms:
                            for form in forms:
                                mapping[idx] = [
                                    language_lookup[taxon]["Name"],
                                    taxon,
                                    language_lookup[taxon]["Glottocode"],
                                    cname.english,
                                    num[1:],
                                    val,
                                    form.strip(),
                                    "",  # check later for phonetic value
                                    cname.concepticon_id,
                                    cname.concepticon_gloss,
                                ]
                                idxs[taxon, gloss] += [idx]
                                idx += 1
                        else:
                            print("missing value", gloss, num, taxon)

            elif line[0] in language_lookup and phonetic:
                taxon = line[0]
                for gloss, val in zip(glosses[1:], line[1:]):
                    if gloss.strip():
                        these_idx = idxs.get((taxon, gloss))
                        if not these_idx:
                            pass
                        else:
                            forms = val.split(",")
                            for this_idx, form in zip(these_idx, forms):
                                mapping[this_idx][7] = form
        # export to lingpy wordlist in raw folder
        # Wordlist(mapping).output(
        #    "tsv", filename=self.dir.joinpath("raw", "lingpy-wordlist").as_posix()
        # )

        # add data to cldf
        for idx in progressbar(range(1, len(mapping)),
                               desc="cldfify",
                               total=len(mapping)):
            vals = dict(zip(mapping[0], mapping[idx]))
            args.writer.add_lexemes(
                Language_ID=language_lookup[vals["doculectid"]]["ID"],
                Parameter_ID=concept_lookup[vals["glossid"]][0],
                Value=vals["value"],
                Source=["Castro2015"],
            )
Example #20
0
    def cmd_makecldf(self, args):
        # Add bibliographic sources
        args.writer.add_sources()

        # Read raw concept data and add to dataset; at the same time,
        # build a map between the concept index as used in data and the
        # concept id in the dataset
        concept_lookup = {}
        for cidx, concept in enumerate(self.conceptlists[0].concepts.values()):
            concept_cldf_id = (concept.id.split("-")[-1] + "_" +
                               slug(concept.english))
            concept_lookup[1 + (cidx * 2)] = concept_cldf_id

            # Add the concept
            args.writer.add_concept(
                ID=concept_cldf_id,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )

        # Add languages and make a map for individual sources
        language_lookup = args.writer.add_languages(
            lookup_factory="Source_Name")
        source_lookup = {
            entry["Source_Name"]: entry["Source"]
            for entry in self.languages
        }

        # Read raw data and remove headers and rows with reconstructions
        # (row[0] not in languages)
        data = self.raw_dir.read_csv("dataset_khobwa.csv")
        data = data[2:]
        data = [row for row in data if row[0] in language_lookup]

        # iterate over the source adding lexemes and collecting cognates
        for row in progressbar(data, desc="makecldf"):
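            # data columns alternate between a form and its cognate-set code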
            for cid in range(1, len(row), 2):
                # Skip over rows with empty fields for cogid
                if not row[cid + 1]:
                    continue

                # Compute a cognate_id number; lingpy now requires
                # this to be an integer
                cognate_id = cid * 100 + int(row[cid + 1])

                # Extract the value from the raw data, skipping over
                # missing or non-existing forms. We need to strip here
                # because some entries contain newlines, and FormSpec, as
                # the name implies, does not apply to values.
                value = row[cid].strip()
                for lex in args.writer.add_lexemes(
                        Language_ID=language_lookup[row[0]],
                        Parameter_ID=concept_lookup[cid],
                        Value=value,
                        Cognacy=cognate_id,
                        Source=source_lookup[row[0]],
                ):
                    args.writer.add_cognate(
                        lexeme=lex,
                        Cognateset_ID=cognate_id,
                        Source="Lieberherr2017",
                    )
    def cmd_makecldf(self, args):
        # parse the data from the word document
        table = [[""]]  # we except 9 columns
        with open(self.raw_dir.joinpath("data.txt").as_posix()) as f:
            previous = []
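            # rows whose first cell is not a numeric index continue the previous entry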
            for i, line in enumerate(f):
                rows = [c.strip() for c in line.split("\t")]
                if rows[0].replace(".", "").isdigit():
                    table += [rows]
                else:
                    table[-1][-1] += "/" + rows[0]
                    table[-1] += rows[1:]
        # load cognates
        cognates = self.raw_dir.read_csv("cognates.tsv", delimiter="\t")[1:]
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = "{0}-{1}".format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Spanish_Gloss=concept.attributes["spanish"],
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            for gloss in concept.attributes["lexibank_gloss"]:
                concepts[gloss] = idx
        concepts["Frio/(hace frio)"] = concepts["Frio (hace frio)"]
        args.log.info("added concepts")

        args.writer.add_sources()
        cognacy, counter = {}, 1
        cogsets = {
            "A(B)": ["A"],
            "A/(B)": ["A"],
            "A/B": ["A", "B"],
            "A/B/C": ["A", "B", "C"],
            "A/B/D": ["A", "B", "D"],
            "A/B?": ["A"],
            "A/C": ["A", "C"],
            "B/(A)": ["A"],
            "B/(a)": ["B"],
            "B/C": ["B", "C"],
            "C D": ["C", "D"],
            "C/(B)": ["C"],
            "C/B": ["C", "B"],
            "C/E": ["C", "E"],
            "D/B": ["D", "B"],
            "a/(B)": ["A"],
            "a/A": ["A", "A"],
            "a/B": ["A", "B"],
            "ab": ["A", "B"],
        }
        languages = args.writer.add_languages(lookup_factory="Name")
        for i, line in progressbar(enumerate(table[1:])):
            for j, (language, cell) in enumerate(zip(table[0][2:], line[2:])):
                if cell.strip():

                    cognatesets = cogsets.get(
                        cognates[i][j + 1].strip(), [cognates[i][j + 1].strip().upper()]
                    )

                    for lexeme, cognate in zip(
                        args.writer.add_forms_from_value(
                            Value=cell,
                            Language_ID=languages[language],
                            Parameter_ID=concepts[line[1]],
                            Source=["Pharao2020"],
                        ),
                        cognatesets,
                    ):
                        if cognate in ["?", "-"]:
                            cid = counter
                            counter += 1
                        else:
                            cid = "{0}-{1}".format(i, cognate)
                            if cid in cognacy:
                                cid = cognacy[cid]
                            else:
                                cognacy[cid] = counter
                                cid = cognacy[cid]
                                counter += 1
                        if languages[language] == "ProtoUtoAztecan" and "SUA" in cell.strip():
                            lexeme["Language_ID"] = languages["SUA"]

                        args.writer.add_cognate(lexeme, Cognateset_ID=cid, Source=["Pharao2020"])
Example #22
0
    def cmd_makecldf(self, args):
        args.writer.add_sources()
        languages, sources = {}, {}
        for language in self.languages:
            languages[language['Name']] = language['ID']
            sources[language['Name']] = language['Source']
            args.writer.add_language(**language)
        concepts = {}
        for concept in self.concepts:
            idx = '{0}_{1}'.format(concept['NUMBER'], slug(concept['GLOSS']))
            concepts[concept['MSA_NAME'].replace('"', '')] = idx
            args.writer.add_concept(
                ID=idx,
                Name=concept['GLOSS'],
                Concepticon_ID=concept['CONCEPTICON_ID'],
                Concepticon_Gloss=concept['CONCEPTICON_GLOSS'],
                Number=concept['NUMBER'],
                MSA=concept['MSA_NAME'])
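        # normalize non-standard or ambiguous segment symbols before writing segments
        # and alignments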
        converter = {
            '˗': '-',
            'ı': 'ɨ',
            '_': '+',
            'ɴ̣': 'ɴ̩',
            'ŋ̣̩': 'ŋ̍',
            'ɸ͡x': 'ɸ͡x/ɸ',
            "ouɚ": "ouɚ/oɚ",
            "ouə": "ouə",
            "ʌiə": "ʌiə/ʌə",
            "aːəiə": "aːəiə/aːə",
            "œːiə": "œːiə/œːə",
            "æiə": "æiə/æə",
            "ɛeə": "ɛeə/ɛə",
            "ɛiɪ": "ɛiɪ/ɛɪ",
            "ɛɪə": "ɛɪə/ɛə",
            "ʊuʌ": "ʊuʌ/ʊʌ",
            "euə": "euə/eə",
            "aʊə": "aʊə/aə",
            "æɪə": "æɪə/æə",
            "ɛiə": "ɛiə/ɛə",
            "ɒʊe": "ɒʊe/ɒe",
            "ɪiə": "ɪiə/ɪə",
            "iɪə": "iɪə/iə",
            "æɛo": "æɛo/æo",
            "æɪɛ": "æɪɛ/æɛ",
            "əɪɜ": "əɪɜ/əɜ",
            "ɐuɐ": "ɐuɐ/ɐɐ",
            "ɔuɐ": "ɔuɐ/ɔɐ",
            "aɪɐ": "aɪɐ/aɐ",
            "ɔʊə": "ɔʊə/ɔə",
            "iuə": "iuə/yə",
            "œʊɑ": "œʊɑ/œɑ",
            "ɑʊɔ": "ɑʊɔ/ɑɔ",
            "ɔɪɛ": "ɔɪɛ/ɔɛ",
            "oʊɤ": "oʊɤ/oɤ",
            "ouə": "ouə/oə",
            "oʊə": "oʊə/oə",
            "ʊɛʊ": "ʊɛʊ/ɛʊ",
            "uˡ": "uˡ/u",
            "ɜıi": "ɜıi  ",
            "ɾ̆": "ɾ̆/r",
            "ıiı": "ıiı/ɨi",
            "ɛɪʊ": "ɛɪʊ/ɛʊ",
            "ʌɪɤ": "ʌɪɤ/ʌɤ",
            "ɛɪɤ": "ɛɪɤ/ɛɤ",
            "eiə": "eiə/eə",
            "eɪə": "eɪə/eə",
            "øʊə": "øʊə/øə",
            "æeo": "æeo/æo",
            "ɛɪɐ": "ɛɪɐ/ɛɐ",
            "aɪə": "aɪə/aə",
            "uɛi": "uɛi/ɛi",
            "m̆": "m̆/m",
            "ɜıi": "ɜıi/ɜi",
            "ɒʊə": "ɒʊə/ɒə",
            "ʧ": "tʃ",
            "ʦ": "ts",
            "ʨ": "tɕ",
            "ʣ": "dz",
            "ʤ": "dʒ",
            "ʥ": "dʑ",
            "ʧʰ": "tʃʰ",
            "ʦ": "tsʰ",
            "ʨ": "tɕʰ",
            "k͡χ": "kx",
            "aei": "aei/ai"
        }

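        # each MSA file holds the alignment of one cognate set; the cognate ID is
        # encoded in the file name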
        for f in progressbar(self.raw_dir.joinpath('msa').glob('*.msa')):
            msa = lingpy.align.sca.MSA(f.as_posix())
            cogid = msa.infile.split('_')[-1][:-4]
            for language, alignment in zip(msa.taxa, msa.alignment):
                alm = [converter.get(x, x) for x in alignment]
                seq = [x for x in alm if x != '-']
                lexeme = args.writer.add_form_with_segments(
                    Language_ID=languages[language],
                    Parameter_ID=concepts[msa.seq_id.replace('"', '')],
                    Value=''.join(seq),
                    Form=''.join(seq),
                    Segments=seq,
                    Cognacy=cogid,
                    Source=sources[language])
                args.writer.add_cognate(lexeme=lexeme,
                                        Cognateset_ID=cogid,
                                        Alignment=alm,
                                        Source=['List2014e'])
Example #23
0
    def cmd_makecldf(self, args):
        wl = self.raw_dir.read_csv("wordlist.tsv", delimiter="\t")
        concept_lookup = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split('-')[-1] + '_' + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Chinese_Gloss=concept.attributes["chinese"],
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concept_lookup[concept.number.rjust(3, "0")] = [idx, concept]
        language_lookup = {k["ID_in_Source"]: k for k in self.languages}
        args.writer.add_languages()
        args.writer.add_sources()

        idx = 1
        mapping = {
            0: [
                "doculect",
                "doculectid",
                "glottocode",
                "concept",
                "glossid",
                "value",
                "phonetic",
                "concepticon_id",
                "concepticon_gloss",
            ]
        }

        for line in progressbar(wl, desc="load the data"):
            if not line[0].strip():
                phonetic = True
            if line[0] == "'Ref#":
                numbers = line
                phonetic = False
                idxs = defaultdict(list)
            elif line[0] == "Gloss":
                glosses = line
            elif line[0] in language_lookup and not phonetic:
                taxon = line[0]
                for num, gloss, val in zip(numbers[1:], glosses[1:], line[1:]):
                    if num.strip() and gloss.strip():
                        cname = concept_lookup[num[1:]][1]
                        if val:
                            mapping[idx] = [
                                language_lookup[taxon]["Name"],
                                taxon,
                                language_lookup[taxon]["Glottocode"],
                                cname.english,
                                num[1:],
                                val,
                                "",  # check later for phonetic value
                                cname.concepticon_id,
                                cname.concepticon_gloss,
                            ]

                            idxs[taxon, gloss] += [idx]
                            idx += 1

            elif line[0] in language_lookup and phonetic:
                taxon = line[0]
                for gloss, val in zip(glosses[1:], line[1:]):
                    if gloss.strip():
                        these_idx = idxs.get((taxon, gloss))
                        if not these_idx:
                            pass

        # export to lingpy wordlist in raw folder
        # Wordlist(mapping).output(
        #    "tsv", filename=self.dir.joinpath("raw", "lingpy-wordlist").as_posix()
        # )

        # add data to cldf
        for idx in progressbar(range(1, len(mapping)),
                               desc="cldfify",
                               total=len(mapping)):
            vals = dict(zip(mapping[0], mapping[idx]))

            args.writer.add_forms_from_value(
                Language_ID=language_lookup[vals["doculectid"]]["ID"],
                Parameter_ID=concept_lookup[vals["glossid"]][0],
                Value=vals["value"],
                Source=["Castro2015"],
            )

        # We explicitly remove the ISO code column since the languages in
        # this dataset do not have ISO codes.
        args.writer.cldf["LanguageTable"].tableSchema.columns = [
            col
            for col in args.writer.cldf["LanguageTable"].tableSchema.columns
            if col.name != "ISO639P3code"
        ]
Example #24
0
    def cmd_makecldf(self, args):
        self._schema(args)
        args.writer.add_sources()

        # add the languages from the language file
        # NOTE: the source lists all languages, including proto-languages,
        # but the `forms` only include the first 41 in the list
        language_lookup = args.writer.add_languages(lookup_factory="WOLD_ID")

        desc_dir = self.cldf_dir / 'descriptions'
        if not desc_dir.exists():
            desc_dir.mkdir()
        numentries = {
            r["pk"]: int(r["count_words"])
            for r in self.raw_dir.joinpath("db").read_csv("vocabulary.csv", dicts=True)
        }
        db_contribs = {
            r['id']: r
            for r in self.raw_dir.joinpath('db').read_csv('contribution.csv', dicts=True)}
        for contrib in self.raw_dir.read_csv("contributions.csv", dicts=True):
            db_contrib = db_contribs[contrib['ID']]
            args.writer.objects["ContributionTable"].append(
                dict(
                    ID=contrib["ID"],
                    Name="{} vocabulary".format(contrib["Name"]),
                    Citation=format_citation(contrib, numentries[contrib["ID"]]),
                    Contributor=contrib["Contributors"],
                    Number_of_words=numentries[contrib["ID"]],
                    Language_ID=language_lookup[contrib["ID"]],
                )
            )
            desc = vocabulary_description(
                contrib['Name'], contrib["Contributors"], json.loads(db_contrib['jsondata']))
            p = desc_dir.joinpath('vocabulary_{}.md'.format(contrib['ID']))
            p.write_text(desc, encoding='utf8')

        concepticon = {concept.attributes['wold_id']: concept for concept in self.conceptlists[0].concepts.values()}
        for parameter in self.raw_dir.read_csv("parameters.csv", dicts=True):
            concept = concepticon.get(parameter['ID'])
            args.writer.add_concept(
                ID=parameter['ID'],
                Name=concept.english if concept else parameter['Name'],
                Concepticon_ID=concept.concepticon_id if concept else None,
                Concepticon_Gloss=concept.concepticon_gloss if concept else None,
                Core_list=parameter['CoreList'] == 'true',
                Semantic_field=parameter['SemanticField'],
                Semantic_category=parameter['SemanticCategory'],
                Borrowed_score=float(parameter['BorrowedScore']),
                Age_score=float(parameter['AgeScore']) if parameter['AgeScore'] else None,
                Simplicity_score=float(parameter['SimplicityScore']),
            )

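        # map raw form IDs and WOLD word IDs to CLDF form IDs so that borrowing events
        # can be attached below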
        form2lexeme = {}
        wid2fid = collections.defaultdict(set)
        lexemes_rows = self.raw_dir.read_csv("forms.csv", dicts=True)
        for row in progressbar(lexemes_rows):
            # Add information not in row, so we can pass to `add_form()`
            # with a single comprehension
            row["Language_ID"] = language_lookup[row["Language_ID"]]
            row["Parameter_ID"] = row["Parameter_ID"]
            row["Value"] = row.pop("Form")
            row["Loan"] = float(row["BorrowedScore"]) > 0.6
            row["Borrowed_score"] = row["BorrowedScore"]
            row["Simplicity_score"] = row["SimplicityScore"]
            row["original_script"] = normalize_text(row["original_script"])
            row["comment_on_borrowed"] = normalize_text(row["comment_on_borrowed"])
            row.pop("Segments")
            row['Age_score'] = decimal.Decimal(row.pop('AgeScore')) if row['AgeScore'] else None
            row['Age'] = row.pop('age_label')
            row['Local_ID'] = row['ID']
            row['contact_situation'] = row['ContactSituation']
            row['Comment'] = row.pop('other_comments')

            lexemes = args.writer.add_forms_from_value(
                **{k: v for k, v in row.items() if k in self.lexeme_class.fieldnames()}
            )
            assert len(lexemes) == 1
            form2lexeme[row['ID']] = lexemes[0]['ID']
            wid2fid[row['Word_ID']].add(lexemes[0]['ID'])

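        # auxiliary lookups from the raw database dump: words, donor language names and
        # their Glottocodes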
        words = {r['pk']: r for r in self.raw_dir.joinpath('db').read_csv('unit.csv', dicts=True)}
        languages = {r['pk']: r['name'] for r in self.raw_dir.joinpath('db').read_csv('language.csv', dicts=True)}
        codes = {r['pk']: r['name'] for r in self.raw_dir.joinpath('db').read_csv('identifier.csv', dicts=True) if r['type'] == 'glottolog'}
        glottocodes = {
            r['language_pk']: codes[r['identifier_pk']]
            for r in self.raw_dir.joinpath('db').read_csv('languageidentifier.csv', dicts=True)
            if r['identifier_pk'] in codes}

        wids = [w['id'] for w in words.values()]
        for wid in wid2fid:
            assert wid in wids

        count = 0
        for row in self.raw_dir.joinpath('db').read_csv('loan.csv', dicts=True):
            assert row['target_word_pk'] in words
            source_word = None
            if row['source_word_pk']:
                assert row['source_word_pk'] in words
                source_word = words[row['source_word_pk']]
            twid = words[row['target_word_pk']]['id']
            for fid in wid2fid[twid]:
                # The meaning-differentiated borrowing events.
                count += 1
                args.writer.objects['BorrowingTable'].append(dict(
                    ID=str(count),
                    Target_Form_ID=fid,
                    Comment='Source word unidentifiable' if source_word['name'].lower() == 'unidentifiable' else None,
                    Source_word=None if source_word['name'].lower() == 'unidentifiable' else source_word['name'],
                    Source_meaning=source_word['description'] or None,
                    Source_languoid=languages[source_word['language_pk']],
                    Source_languoid_glottocode=glottocodes.get(source_word['language_pk']),
                    Source_relation=row['relation'],
                    Source_certain=row['certain'] == 't',
                ))
    def cmd_makecldf(self, args):
        # Write sources to CLDF
        args.writer.add_sources()

        # Collect languages and add to CLDF, also building look-up
        languages = {}
        for language in self.languages:
            args.writer.add_language(
                ID=language["ID"],
                Name=language["Name"],
                Glottocode=language["Glottocode"],
            )
            languages[language["Name"]] = {
                "ID": language["ID"],
                "Source": language["Source"].split(","),
            }

        # Collect concepts and add to CLDF, also building look-up
        concepts = collections.OrderedDict()
        for concept in self.conceptlists[0].concepts.values():
            idx = "{0}_{1}".format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Number=concept.number,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss)
            concepts[concept.english] = idx

        # Define a list of string replacements -- as the raw data is already
        # segmented and these few cases are actually inconsistencies, this is better
        # than just using a profile.
        replacements = {
            "wu": ["w", "u"],
            "wã": ["w", "ã"],
            "ja": ["j", "a"],
            "oj": ["oi"],
            "kãʔã": ["k", "ã", "ʔ", "ã"],
            "ej": ["ei̯"],
            "ij": ["ii̯"],
            "ɨp": ["ɨ", "p"],
            "ɪw": ["ɪu̯"],
            "e͂": ["ẽ"],
        }

        missing, missing2 = set(), set()
        for row, cog in progressbar(
                zip(
                    self.raw_dir.read_csv("Aligned_matrix_lexical.csv",
                                          delimiter=",",
                                          dicts=True),
                    self.raw_dir.read_csv("Cognate matrix.csv",
                                          delimiter=",",
                                          dicts=True),
                )):
            language = row[""].strip()
            for concept, concept_id in concepts.items():
                if concept in row:
                    word = row[concept]
                    if word.strip() and language.strip():
                        segments = []
                        for segment in word.split():
                            segments += replacements.get(segment, [segment])
                        lexeme = args.writer.add_form_with_segments(
                            Language_ID=languages[language]["ID"],
                            Parameter_ID=concept_id,
                            Value=row[concept],
                            Form=row[concept],
                            Segments=segments,
                            Source=languages[language]["Source"],
                        )
                        if concept in cog:
                            args.writer.add_cognate(
                                lexeme=lexeme,
                                Cognateset_ID="{0}-{1}".format(
                                    slug(concept), cog[concept]),
                                Source="gerarditupi",
                            )
                        else:
                            missing.add(concept)
                else:
                    missing2.add(concept)

        # Log missing concepts
        for concept in missing:
            args.log.warn("Concept {0} could not be found".format(concept))
        for concept in missing2:
            args.log.warn("Concept {0} missing".format(concept))
    def cmd_makecldf(self, args):

        unknown_gc_cnt = 0

        html_files = get_file_paths(self.raw_dir)
        tables = find_tables(html_files)
        glottolog_codes = self.glottolog.languoids_by_code()
        glottolog_iso = self.glottolog.iso.languages
        concept_map = {
            cs.english: (cs.concepticon_id, cs.concepticon_gloss)
            for cs in self.conceptlists[0].concepts.values()
        }

        entries = []

        for table_set in tables:
            entry = NumeralsEntry(
                base_name=table_set[0],
                tables=table_set[1],
                file_name=table_set[2],
                codes=glottolog_codes,
                iso=glottolog_iso,
                title_name=table_set[3],
                source=table_set[4],
                base=table_set[5],
                comment=table_set[6],
            )
            entries.append(entry)

        seen_lg_names = {}
        lg_variant_counter = {}

        # with args.writer.cldf as ds:
        meaning_map = {}

        args.writer.add_sources(*self.raw_dir.read_bib())
        args.writer.cldf['FormTable', 'Problematic'].datatype.base = 'boolean'

        # remove newly added columns in order to get a good diff
        args.writer.cldf['FormTable'].tableSchema.columns = [
            c for c in args.writer.cldf['FormTable'].tableSchema.columns
            if c.name != 'Graphemes' and c.name != 'Profile'
        ]

        # map old lang_ids (without 'MsoNormalTable' table class)
        # against new ones to minimize diffs
        lang_id_map = {
            "hupd1244-4": ["hupd1244-2", 2 - 1],
            "hupd1244-2": ["hupd1244-3", 3 - 1],
            "hupd1244-3": ["hupd1244-4", 4 - 1],
            "nucl1440-2": ["nucl1440-1", 1 - 1],
            "nucl1440-3": ["nucl1440-2", 2 - 1],
            "nucl1440-1": ["nucl1440-3", 3 - 1],
            "poum1235-2": ["poum1235-1", 1 - 1],
            "poum1235-1": ["poum1235-2", 2 - 1],
            "wayu1241-1": ["wayu1241-2", 2 - 1],
            "wayu1241-2": ["wayu1241-1", 1 - 1],
            "port1283-1": ["port1283-2", 2 - 1],
            "port1283-2": ["port1283-1", 1 - 1],
        }
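        # Illustration only: with the mapping above, a freshly built language ID
        # such as 'hupd1244-4' is rewritten to 'hupd1244-2' and its variety index
        # is reset to 1 (i.e. 2 - 1), reproducing the numbering of the old parser.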

        for entry in progressbar(entries, desc="makecldf"):
            number_lexemes = entry.get_numeral_lexemes()

            for variety in number_lexemes:

                for var_id, var in variety.items():

                    # build language name
                    if var_id < len(entry.title_name):
                        lg_name = entry.title_name[var_id]
                    elif len(entry.title_name):
                        lg_name = entry.title_name[0]
                    else:
                        lg_name = entry.base_name

                    if not entry.ethnologue_codes:
                        entry.ethnologue_codes = ['']

                    # map 'old' glottocodes to new ones
                    # to minimize the diff
                    if lg_name == 'Enlhet (Lengua), Paraguay':
                        entry.glottocodes = ['leng1262']
                    if lg_name == 'Gerai, Indonesia':
                        entry.glottocodes = ['sema1269']
                    if lg_name == 'Southern Ndebele, South Africa':
                        entry.glottocodes = ['sout2808']

                    if not entry.glottocodes:
                        unknown_gc_cnt += 1
                        gc = ''
                        lang_id_prefix = 'xxxx%04d' % (unknown_gc_cnt)
                    else:
                        lang_id_prefix = entry.glottocodes[0]
                        gc = lang_id_prefix

                    if lg_name not in seen_lg_names:
                        seen_lg_names[lg_name] = []
                    seen_lg_names[lg_name].append(entry.file_name)

                    # build Contributor name
                    if var_id < len(entry.source):
                        contrib = entry.source[var_id]
                    else:
                        contrib = None

                    # build Base
                    if var_id < len(entry.base):
                        base = entry.base[var_id]
                    else:
                        base = None

                    # build Comment
                    if var_id < len(entry.comment):
                        com = entry.comment[var_id]
                    else:
                        com = ''

                    if len(set(seen_lg_names[lg_name])) > 1:
                        com = "CHECK with %s: %s" % (entry.file_name, com)

                    if lang_id_prefix not in lg_variant_counter:
                        lg_variant_counter[lang_id_prefix] = 0
                    lg_variant_counter[lang_id_prefix] += 1
                    c_lang_id = "%s-%i" % (lang_id_prefix,
                                           lg_variant_counter[lang_id_prefix])

                    # map according to old table parser without 'MsoNormalTable'
                    if c_lang_id in lang_id_map:
                        c_lang_id, var_id = lang_id_map[c_lang_id]

                    args.writer.add_language(
                        ID=c_lang_id,
                        Name=lg_name,
                        Glottocode=gc,
                        ISO639P3code=entry.ethnologue_codes[0],
                        SourceFile=entry.file_name,
                        Contributor=contrib,
                        Base=base,
                        Comment=com,
                    )

                    for k, vs in var.items():
                        meaning_n = str(k)
                        for v in vs:

                            if meaning_n not in meaning_map:
                                meaning_map[meaning_n] = str(k)
                                args.writer.add_concept(
                                    ID=meaning_map[meaning_n],
                                    Name=meaning_n,
                                    Concepticon_ID=concept_map.get(
                                        meaning_n, ('', ''))[0],
                                    Concepticon_Gloss=concept_map.get(
                                        meaning_n, ('', ''))[1],
                                )

                            if v:
                                value = v.replace("\n", "").replace("\t", "")
                                # a comment follows after two or more non-breaking spaces
                                if '(' not in value:
                                    value = re.sub(r'^(.*?) {2,}(.*)$',
                                                   '\\1 (\\2)', value)
                                # a comment follows after a dash (–)
                                if '(' not in value:
                                    value = re.sub(r'^(.*?)\s*–\s*(.*)$',
                                                   '\\1 (\\2)', value)
                                # replace non-breaking spaces with regular spaces
                                value = value.replace(" ", " ")
                                # put a single 'foo = IPA' string into brackets
                                if '=' in value and '(' not in value:
                                    value = re.sub(
                                        r'^(.*?)\s(\S+\s*=\s*IPA.*)$',
                                        '\\1 (\\2)', value)
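                                # Illustration only (hypothetical cell value): a value
                                # like 'mba – two tens' contains no '(' yet, so the dash
                                # rule above rewrites it to 'mba (two tens)'; the
                                # bracketed part can then presumably be split off as a
                                # comment by value_parser() below.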

                                value, comment, other_form, loan = value_parser(
                                    value)

                                if value:
                                    args.writer.add_forms_from_value(
                                        Value=value,
                                        Parameter_ID=meaning_n,
                                        Variant_ID=(var_id + 1),
                                        Language_ID=c_lang_id,
                                        Comment=comment,
                                        Source="chan2019",
                                        Other_Form=other_form,
                                        Loan=loan,
                                    )

        def _x(s):
            try:
                return int(s)
            except ValueError:
                return s
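        # Illustration only (hypothetical IDs): _x makes mixed IDs sort numerically
        # per component, so 'abcd1234-2' sorts before 'abcd1234-10':
        #     [_x(i) for i in 'abcd1234-10'.split('-')]  ->  ['abcd1234', 10]
        #     [_x(i) for i in 'abcd1234-2'.split('-')]   ->  ['abcd1234', 2]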

        args.writer.objects['FormTable'] = sorted(
            args.writer.objects['FormTable'],
            key=lambda item: ([_x(i) for i in item['ID'].split('-')]))
        args.writer.objects['LanguageTable'] = sorted(
            args.writer.objects['LanguageTable'],
            key=lambda item: ([_x(i) for i in item['ID'].split('-')]))
        args.writer.objects['ParameterTable'] = sorted(
            args.writer.objects['ParameterTable'],
            key=lambda item: _x(item['ID']))
Example #27
0
    def cmd_makecldf(self, args):
        languages = {
            o["slug"]: o
            for o in self.raw_dir.read_json(self.raw_dir / "languages.json")
        }
        words = {
            o["slug"]: o
            for o in self.raw_dir.read_json(self.raw_dir / "words.json")
        }
        sources = {
            o["slug"]: o
            for o in self.raw_dir.read_json(self.raw_dir / "sources.json")
        }
        # handle sources
        # want to make sure that the bibtex key matches our source id.
        for source in sorted(sources):
            # this is ugly, I wish pybtex made this easier!
            bib = parse_string(sources[source]["bibtex"], "bibtex")
            old_key = list(bib.entries.keys())[0]
            bib.entries[old_key].key = source
            bib.entries = OrderedCaseInsensitiveDict([(source,
                                                       bib.entries[old_key])])
            args.writer.add_sources(bib)
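            # Illustration only (hypothetical key): an upstream entry keyed
            # 'Smith2001' for the source slug 'smith-2001' is re-keyed here, so
            # the BibTeX key written to the CLDF sources matches the source IDs
            # referenced by the form rows further below.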

        # handle languages
        for lang in sorted(languages):
            args.writer.add_language(
                ID=lang,
                Name=languages[lang]["fullname"],
                ISO639P3code=languages[lang]["isocode"],
                Glottocode=languages[lang]["glottocode"],
            )

        # handle concepts
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = '{0}_{1}'.format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss)
            concepts[concept.english] = idx
            concepts[concept.english.replace(" ", "-")] = idx
            concepts[concept.english.replace(" ", "-").lower()] = idx
            concepts[slug(concept.english)] = idx
            concepts["-".join([slug(x)
                               for x in concept.english.split()])] = idx

            if '(' in concept.english:
                new_string = concept.english[:concept.english.index('(') - 1]
                concepts["-".join([slug(x) for x in new_string.split()])] = idx
                concepts[concept.english[:concept.english.index('(') -
                                         1]] = idx
                concepts[concept.english[:concept.english.index('(') -
                                         1].replace(' ', '-').lower()] = idx
            if concept.english.startswith("to "):
                new_string = concept.english[3:]
                concepts['-'.join([slug(x) for x in new_string.split()])] = idx
                concepts[concept.english.replace("to ", "")] = idx
        concepts["mans-mother-law"] = concepts["man's mother in law"]
        concepts["brother-law"] = concepts["brother in law"]
        concepts["to-make-hole"] = concepts["make hole (in ground)"]
        concepts["front"] = concepts["in front"]
        concepts["husk-nut"] = concepts["husk (of nut)"]
        concepts["his"] = concepts["his, hers, its (pronoun p:3s)"]
        concepts["we-two-incl"] = concepts[
            "we incl. dual (pronoun d:1p, incl, dual)"]
        concepts["intrnasitivizer"] = concepts["intransitivizer"]
        concepts["short-piece-wood"] = concepts["short-piece-of-wood"]
        concepts["top-foot"] = concepts["top (of foot)"]
        concepts["sit-feet-and-legs-together"] = concepts[
            "sit (with feet and legs together)"]
        concepts["earth"] = concepts["earth/soil"]
        concepts["warm"] = concepts["warm/hot"]
        concepts["your-sg"] = concepts["your (pronoun: p:2s)"]
        concepts["-law"] = concepts["in-law"]
        concepts["to-roast"] = concepts["roast"]
        concepts["arrow-barred"] = concepts[
            "arrow (barred) (Arrow with cross bar)"]
        concepts["them-dual"] = concepts["them (pronoun o:3p, dual)"]
        concepts["you-dual"] = concepts["you (pronoun d:2s)"]
        concepts["right-correct"] = concepts["right (correct, true)"]
        concepts["betelpepper"] = concepts["betelpepper vine"]
        concepts["to-chop"] = concepts["to chop, cut down"]
        concepts["road"] = concepts["road/path"]
        concepts["for-benefactive-clitic"] = concepts[
            "for (benefactive) ((cliticised or suffixed to noun))"]
        concepts["mans-father-law"] = concepts["mans' father in law"]
        concepts["sister-law"] = concepts["sister in law"]
        concepts["you-o2s"] = concepts["you (pronoun o:2s)"]
        concepts["you-pl-o2p"] = concepts["you pl. (pronoun o:2p)"]
        concepts["we-pl-incl"] = concepts["we incl. (pronoun d:1p, incl)"]
        concepts["in"] = concepts["in, inside"]
        concepts["not_know"] = concepts["not know"]
        concepts["their-dual"] = concepts["their (pronoun p:3p, dual)"]
        concepts["blow-fire"] = concepts["blow (on fire)"]
        concepts["blunt-eg-knife"] = concepts["blunt (of e.g. knife)"]
        concepts["our-dual"] = concepts["our (two) (pronoun p:1p, dual)"]
        concepts["your-pl-dual"] = concepts[
            "your (two) pl (pronoun p:2p, dual)"]
        concepts["suck-breast"] = concepts["to suck at breast"]
        concepts["draw-water-carry"] = concepts["draw water / carry"]
        concepts["tree-sp-Gnetum-gnemon"] = concepts[
            "tree sp. (Gnetum gnemon)"]
        concepts["he-she"] = concepts["he, she, it, that, those"]
        concepts["fed"] = concepts["fed up (with)"]
        concepts["you-pl-dual-o2p"] = concepts[
            "you plural two (pronoun d:2p, dual)"]
        concepts["you-pl-dual"] = concepts["you two (pronoun d:2s, dual)"]
        concepts["to-put"] = concepts["to put, give"]
        concepts["he-she-it-those"] = concepts["he, she, it, that, those"]
        concepts["we-two-excl"] = concepts[
            "we excl. dual (pronoun d:1p, excl, dual)"]
        concepts["we-pl-excl"] = concepts[
            "we excl. plural (pronoun d:1p, excl, plural)"]
        #concepts["affix-body-part"] = concepts[""]

        itemfiles = [
            f for f in self.raw_dir.iterdir() if f.name.startswith("language-")
        ]
        errors = set()
        for filename in progressbar(sorted(itemfiles), desc="adding lexemes"):
            for o in sorted(self.raw_dir.read_json(filename),
                            key=lambda d: d["id"]):
                wordid = self.get_slug_from_uri(o['word'])
                if wordid in concepts:
                    args.writer.add_forms_from_value(
                        Local_ID=o["id"],
                        Language_ID=self.get_slug_from_uri(o["language"]),
                        Parameter_ID=concepts[wordid],
                        Value=o["entry"],
                        Source=self.get_slug_from_uri(o["source"]),
                        Comment=o["annotation"],
                    )
                else:
                    errors.add(("concept", wordid))
        for error in errors:
            args.log.info("error with {0[0]}: {0[1]}".format(error))

        args.log.info("found {0} errors in concepts".format(len(errors)))
Example #28
0
    def cmd_makecldf(self, args):

        if not hasattr(self, 'form_placeholder'):
            self.form_placeholder = None
        if not hasattr(self, 'only_proto_forms'):  # special case for MixeZoque
            self.only_proto_forms = False

        sound_cat = self.raw_dir.read_json(self.catalog_file_name)

        # add sources
        args.writer.add_sources()

        # add languages from explicit file
        concepts = {}
        for concept in self.concepts:
            args.writer.add_concept(**concept)
            concepts[concept['IndexInSource']] = concept['ID']
        languages = {}
        proto_lgs = []
        for language in self.languages:
            args.writer.add_language(**language)
            languages[language['IndexInSource']] = language['ID']
            if language.get('IsProto', '') == 'True':
                proto_lgs.append(language['IndexInSource'])

        # Load JSON data
        json_data = self.raw_dir.read_json(self.data_file_name)

        # collect missing languages
        missing = set()
        # collect lexemes with no transcription but with audio
        only_snd = []

        media = []
        args.writer.cldf.add_table(
            'media.csv',
            {
                'name': 'ID',
                'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#id',
                'valueUrl': 'https://cdstar.shh.mpg.de/bitstreams/{objid}/{fname}',
            },
            'objid',
            'fname',
            'mimetype',
            {'name': 'size', 'datatype': 'integer'},
            {
                "name": "Form_ID",
                "required": True,
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#formReference",
                "datatype": "string"
            },
            primaryKey=['ID']
        )

        args.writer.cldf.add_foreign_key(
            'media.csv', 'Form_ID', 'FormTable', 'ID', )
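        # Illustration only (hypothetical values): with the valueUrl template above,
        # a media row such as
        #     {'ID': 'deadbeef', 'objid': 'EAEA0-0000', 'fname': 'word.mp3', ...}
        # resolves to
        #     https://cdstar.shh.mpg.de/bitstreams/EAEA0-0000/word.mp3
        # by substituting the row's 'objid' and 'fname' values into the template.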

        # Add lexemes
        for idx in progressbar(sorted(json_data['transcriptions'], key=lambda k: (
            int(json_data['transcriptions'][k]['LanguageIx']),
            int(json_data['transcriptions'][k]['IxElicitation']),
            int(json_data['transcriptions'][k]['IxMorphologicalInstance'])
        )), desc='makecldf'):
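            # Illustration only: sorting the transcription keys numerically keeps
            # e.g. LanguageIx '9' ahead of '10', which plain string sorting would
            # reverse, and so yields a deterministic output order.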
            lexeme = json_data['transcriptions'][idx]

            if lexeme['LanguageIx'] not in languages:  # pragma: no cover
                missing.add(lexeme['LanguageIx'])
                continue

            # If entry is marked as 'isDummy' => it only has audio
            if 'isDummy' in lexeme:
                if self.form_placeholder:
                    if isinstance(lexeme['soundPaths'], list) and isinstance(
                            lexeme['soundPaths'][0], list):
                        lexeme['Phonetic'] = [self.form_placeholder] * len(lexeme['soundPaths'])
                        lexeme['AlternativePhoneticRealisationIx'] = \
                            ['0'] * len(lexeme['soundPaths'])
                        lexeme['WCogID'] = [''] * len(lexeme['soundPaths'])
                    else:
                        lexeme['Phonetic'] = self.form_placeholder
                        lexeme['AlternativePhoneticRealisationIx'] = '0'
                        lexeme['WCogID'] = ''
                    try:
                        lexeme['path'] = lexeme['soundPaths'][0].split('/')[-1].split('.')[0]
                    except AttributeError:
                        lexeme['path'] = lexeme['soundPaths'][0][0].split('/')[-1].split('.')[0]
                    lexeme['AlternativeLexemIx'] = '0'
                    lexeme['RootIsLoanWordFromKnownDonor'] = '0'

            # Replace all forms with 'form_placeholder' if the language is not a proto
            # language - a special case for MixeZoque only
            if self.only_proto_forms \
                    and self.form_placeholder \
                    and lexeme['LanguageIx'] not in proto_lgs:
                if isinstance(lexeme['Phonetic'], str):
                    lexeme['Phonetic'] = [lexeme['Phonetic']]
                    lexeme['path'] = [lexeme['path']]
                    lexeme['soundPaths'] = [lexeme['soundPaths']]
                    lexeme['WCogID'] = [lexeme['WCogID']]
                for i, v in enumerate(lexeme['Phonetic']):
                    if len(lexeme['soundPaths'][0]) > 0 and len(lexeme['soundPaths'][i][0]) > 0:
                        if lexeme['path'][i] in sound_cat:
                            lexeme['Phonetic'][i] = self.form_placeholder
                        else:
                            lexeme['Phonetic'][i] = ''
                    else:
                        lexeme['Phonetic'][i] = ''

            if 'Phonetic' not in lexeme:
                if 'isDummy' in lexeme:
                    only_snd.append(lexeme)
                continue

            # If there is only one elicitation for a meaning,
            # it comes as a plain string (otherwise as a list).
            # Turn the relevant items into lists as well.
            if isinstance(lexeme['Phonetic'], str):
                lexeme['Phonetic'] = [lexeme['Phonetic']]
                lexeme['path'] = [lexeme['path']]
                lexeme['soundPaths'] = [lexeme['soundPaths']]
                lexeme['WCogID'] = [lexeme['WCogID']]

            ref_id = None
            last_altlex = None
            for i, value in enumerate(lexeme['Phonetic']):
                v = value.strip()
                # Skip if value is empty
                if not v or v in self.form_spec.missing_data:
                    continue
                # Commas are not allowed!
                if ',' in v:  # pragma: no cover
                    args.log.warning('Comma not allowed in /{0}/ for {1} - {2}'.format(
                        value, languages[lexeme['LanguageIx']], lexeme['IxElicitation']))
                param_id = concepts['{0}-{1}'.format(
                    lexeme['IxElicitation'], lexeme['IxMorphologicalInstance'])]

                new = args.writer.add_form(
                    Language_ID=languages[lexeme['LanguageIx']],
                    Local_ID='{0}-{1}-{2}'.format(
                        lexeme['LanguageIx'],
                        lexeme['IxElicitation'],
                        lexeme['IxMorphologicalInstance']),
                    Parameter_ID=param_id,
                    Value=v,
                    Form=v,
                    Loan=(lexeme['RootIsLoanWordFromKnownDonor'] == '1'),
                    Source=self.get_source_id_array(lexeme),
                    Variant_Of=ref_id if int(
                        lexeme['AlternativePhoneticRealisationIx'][i]) > 0 else None,
                )

                # add media
                if len(lexeme['soundPaths'][0]) > 0 and len(lexeme['soundPaths'][i][0]) > 0:
                    if lexeme['path'][i] in sound_cat:
                        for bs in sorted(sound_cat[lexeme['path'][i]]['bitstreams'],
                                         key=lambda x: x['content-type']):
                            media.append({
                                'ID': bs['checksum'],
                                'fname': bs['bitstreamid'],
                                'objid': sound_cat[lexeme['path'][i]]['id'],
                                'mimetype': bs['content-type'],
                                'size': bs['filesize'],
                                'Form_ID': new['ID']
                            })
                    else:  # pragma: no cover
                        args.log.warning('Missing sound file name in catalog {0}.'.format(
                            lexeme['path'][i]))

                # Remember the last inserted ID so that alternative pronunciations can
                # reference it via 'Variant_Of'. This works because the downloaded JSON
                # data are sorted by altlex and altpron.
                if last_altlex != int(lexeme['AlternativeLexemIx'][i]):
                    ref_id = new['ID']
                last_altlex = int(lexeme['AlternativeLexemIx'][i])

                # add cognate if desired
                if self.create_cognates:
                    wcogid = None
                    if lexeme['WCogID'][i].strip():
                        try:
                            wid = int(lexeme['WCogID'][i])
                            if wid > 0:
                                wcogid = '{0}-{1}'.format(param_id, wid)
                        except ValueError:
                            wcogid = '{0}-{1}'.format(param_id, lexeme['WCogID'][i])
                    if wcogid:
                        args.writer.add_cognate(
                            lexeme=new,
                            Cognateset_ID=wcogid,
                            Source=self.source_id_array,
                        )

        args.writer.write(
            **{'media.csv': media}
        )

        if self.form_placeholder:
            args.writer.cldf['FormTable', 'Value'].common_props['dc:description'] = \
                '► := no value, but audio'
            args.writer.cldf['FormTable', 'Form'].common_props['dc:description'] = \
                '► := no form, but audio'

        for m in sorted(missing):  # pragma: no cover
            args.log.warning('Missing language with ID {0}.'.format(m))

        if only_snd:
            args.log.info('Consider using "form_placeholder" to import lexemes without '
                          'a transcription but with audio:')
        for m in only_snd:  # pragma: no cover
            args.log.warning('Missing transcription for {0}-{1}-{2}.'.format(
                m['LanguageIx'], m['IxElicitation'], m['IxMorphologicalInstance']))
    def cmd_makecldf(self, args):
        from pybtex import database
        from pybtex import errors as pybtex_errors  # 'errors' is reused as a set below
        pybtex_errors.strict = False
        bibdata = database.parse_file(
            str(self.raw_dir.joinpath('bibliography', 'sources.bib')))
        args.writer.add_sources(bibdata)
        args.writer["FormTable", "Segments"].datatype = Datatype.fromvalue({
            "base":
            "string",
            "format":
            "([\\S]+)( [\\S]+)*"
        })
        args.writer["FormTable", "Morphemes"].separator = " "
        args.writer["FormTable", "PartialCognates"].separator = " "

        concepts = {}
        errors, blacklist = set(), set()
        for concept in self.conceptlists[0].concepts.values():
            idx = '{0}_{1}'.format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Portuguese_Gloss=concept.attributes["portuguese"],
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
                EOL_ID=concept.attributes["eol"],
                Semantic_Field=concept.attributes["semantic_field"])
            concepts[concept.english] = idx

        languages = {}
        sources = {}
        for row in self.languages:
            if not -90 < float(row['Latitude']) < 90:
                errors.add('LATITUDE {0}'.format(row['Name']))
            elif not -180 < float(row['Longitude']) < 180:
                errors.add('LONGITUDE {0}'.format(row['Name']))
            else:
                try:
                    args.writer.add_language(
                        ID=row['ID'],
                        Name=row['Name'],
                        SubGroup=row['SubGroup'],
                        Latitude=row['Latitude'],
                        Longitude=row['Longitude'],
                        Glottocode=row['Glottocode']
                        if row['Glottocode'] != '???' else None,
                    )
                    languages[row['Name']] = row['ID']
                    sources[row['Name']] = []
                    for source in row['Sources'].split(','):
                        if source in bibdata.entries:
                            sources[row['Name']] += [source]
                        else:
                            errors.add('BIBTEX MISSING {0}'.format(source))
                except ValueError:
                    errors.add('LANGUAGE ID {0}'.format(row['ID']))
                    args.log.warn('Invalid Language ID {0}'.format(row['ID']))

        wl = lingpy.Wordlist(self.raw_dir.joinpath('tuled.tsv').as_posix())
        etd = wl.get_etymdict(ref='cogids')
        alignments, problems = {}, set()
        for cogid, vals in progressbar(etd.items(), desc='aligning data'):
            idxs = []
            for idx in vals:
                if idx:
                    idxs += idx
            positions = [wl[idx, 'cogids'].index(cogid) for idx in idxs]
            alms, new_idxs = [], []
            for idx, pos in zip(idxs, positions):
                try:
                    tks = lingpy.basictypes.lists(wl[idx, 'tokens']).n[pos]
                    if not ' '.join(tks).strip():
                        raise IndexError
                    alms += [tks]
                    new_idxs += [(idx, pos)]
                except IndexError:
                    problems.add((idx, pos))
            if alms:
                msa = lingpy.Multiple(alms)
                msa.prog_align()
                for i, alm in enumerate(msa.alm_matrix):
                    alignments[new_idxs[i][0], new_idxs[i][1],
                               cogid] = ' '.join(alm)
            else:
                errors.add('ALIGNMENT empty {0}'.format(cogid))
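        # Illustration only (assuming lingpy's default behaviour): Multiple() takes
        # the token sequences of one cognate set and prog_align() pads them with '-'
        # gaps so all rows have equal length, e.g. roughly
        #     ['t a k a', 't a k']  ->  ['t a k a', 't a k -']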

        bipa = CLTS(args.clts.dir).bipa
        for idx, tokens, glosses, cogids, alignment in wl.iter_rows(
                'tokens', 'morphemes', 'cogids', 'alignment'):
            tl, gl, cl, al = (len(lingpy.basictypes.lists(tokens).n),
                              len(glosses), len(cogids),
                              len(lingpy.basictypes.lists(alignment).n))
            if tl != gl or tl != cl or gl != cl or al != gl or al != cl:
                errors.add('LENGTH: {0} {1} {2}'.format(
                    idx, wl[idx, 'language'], wl[idx, 'concept']))
                blacklist.add(idx)
            for token in tokens:
                if bipa[token].type == 'unknownsound':
                    errors.add('SOUND: {0}'.format(token))
                    blacklist.add(idx)

        visited = set()
        for idx in wl:
            if wl[idx, 'concept'] not in concepts:
                if wl[idx, 'concept'] not in visited:
                    args.log.warn('Missing concept {0}'.format(wl[idx,
                                                                  'concept']))
                    visited.add(wl[idx, 'concept'])
                    errors.add('CONCEPT {0}'.format(wl[idx, 'concept']))
            elif wl[idx, 'doculect'] not in languages:
                if wl[idx, 'doculect'] not in visited:
                    args.log.warn("Missing language {0}".format(
                        wl[idx, 'doculect']))
                    visited.add(wl[idx, 'doculect'])
                    errors.add('LANGUAGE {0}'.format(wl[idx, 'doculect']))
            else:
                if ''.join(wl[idx, 'tokens']).strip() and idx not in blacklist:
                    lex = args.writer.add_form_with_segments(
                        Language_ID=languages[wl[idx, 'doculect']],
                        Parameter_ID=concepts[wl[idx, 'concept']],
                        Value=wl[idx, 'value'] or ''.join(wl[idx, 'tokens']),
                        Form=wl[idx, 'form'] or ''.join(wl[idx, 'tokens']),
                        Segments=wl[idx, 'tokens'],
                        Morphemes=wl[idx, 'morphemes'],
                        SimpleCognate=wl[idx, 'cogid'],
                        PartialCognates=wl[idx, 'cogids'],
                        Source=sources[wl[idx, 'doculect']],
                    )
                    for gloss_index, cogid in enumerate(wl[idx, 'cogids']):
                        args.writer.add_cognate(lexeme=lex,
                                                Cognateset_ID=cogid,
                                                Segment_Slice=gloss_index + 1,
                                                Alignment=alignments.get(
                                                    (idx, gloss_index, cogid),
                                                    ''),
                                                Alignment_Method='SCA')
                else:
                    args.log.warn(
                        'Entry ID={0}, concept={1}, language={2} is empty'.
                        format(idx, wl[idx, 'concept'], wl[idx, 'doculect']))

        with open(self.dir.joinpath('errors.md'), 'w', encoding="utf-8") as f:
            f.write('# Error Analysis for TULED\n')
            for error in sorted(errors):
                f.write('* ' + error + '\n')
Example #30
0
    def cmd_makecldf(self, args):
        args.writer.add_sources()

        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Number=concept.number,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.number] = idx
        languages = args.writer.add_languages(lookup_factory="Name")

        # we combine the raw spreadsheet with the manually edited wordlist to
        # retrieve the segments for each lexeme
        wl = Wordlist(self.raw_dir.joinpath('deepadungpalaung.tsv').as_posix())
        mapper = {
            (concept, language, normalize("NFD", form)): segments
            for (idx, concept, language, form, segments
                 ) in wl.iter_rows('concept', 'doculect', 'form', 'tokens')
        }
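        # Illustration only (hypothetical entry): keys pair a concept gloss, a
        # doculect ID and the NFD-normalized form string, e.g.
        #     ('hand', 'SomeDoculect', normalize('NFD', 'tāi'))  ->  ['t', 'āi']
        # so segments from the hand-curated wordlist can be attached to the matching
        # cells of the raw spreadsheet below.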
        data = self.raw_dir.read_csv('100item-phylo.Sheet1.csv', dicts=False)
        for i, row in progressbar(enumerate(data[4:])):
            number = row[0].strip().strip('.')
            concept = row[1].strip()
            for j in range(0, len(row) - 2, 2):
                language = data[2][j + 2]
                value = row[j + 2]
                if value.strip() and value.strip() not in ['-----']:
                    if ',' in row[j + 2]:
                        forms = [v.strip() for v in value.split(',')]
                        cogids = [
                            str(int(float(x)))
                            for x in row[j + 3].split(' or ')
                        ]
                    else:
                        forms = [value.strip()]
                        cogids = [str(int(float(row[j + 3].split(' or ')[0])))]
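                    # Illustration only (hypothetical cell values): a form cell
                    # 'ka, ko' with a cognate cell '5.0 or 12.0' yields
                    # forms=['ka', 'ko'] and cogids=['5', '12'], which are then
                    # paired up by the zip() below.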

                    for form, cogid in zip(forms, cogids):
                        try:
                            segments = mapper[concept, languages[language],
                                              form]
                            lexeme = args.writer.add_form_with_segments(
                                Parameter_ID=concepts[number],
                                Language_ID=languages[language],
                                Value=value.strip(),
                                Form=form,
                                Segments=segments,
                                Source="Deepadung2015")
                        except KeyError:
                            args.log.warn(
                                'lexeme missing {0} / {1} / {2}'.format(
                                    concept, language, form))
                            lexeme = args.writer.add_form(
                                Parameter_ID=concepts[number],
                                Language_ID=languages[language],
                                Value=value.strip(),
                                Form=form,
                                Source="Deepadung2015")
                        args.writer.add_cognate(lexeme=lexeme,
                                                Cognateset_ID=cogid + '-' +
                                                number,
                                                Source="Deepadung2015")