import re

from clldutils import text


def test_split_text():
    assert text.split_text('arm han( )d')[1] == 'hand'
    assert text.split_text('arm han( )d', brackets={})[1] == 'han('
    assert text.split_text('arm h[\t]and foot')[2] == 'foot'
    assert text.split_text('arm \t\n hand')[1] == 'hand'
    assert text.split_text('arm ')[0] == 'arm'
    assert text.split_text('a(b)c d[e]f', brackets={'(': ')'}) == ['ac', 'd[e]f']
    assert text.split_text('a b c') == ['a', 'b', 'c']
    assert text.split_text('a/b/c', separators=re.compile('/b/')) == ['a', 'c']
    assert text.split_text('a/b/c', separators='/') == ['a', 'b', 'c']
    assert text.split_text('a , b\t; c;', separators=',;', strip=True) == ['a', 'b', 'c']
def my_tokenizer(form, prf):
    value = form.strip()
    # Only the first variant (split at '/', ',' or '~') is tokenized.
    for form in split_text(value, separators='/,~', strip=True):
        value = form.strip()
        form = "^%s$" % form.replace(" ", "{} ")
        form = strip_brackets(form, brackets={'[': ']'})
        i = 0
        tokens = []
        while True:
            added = False
            # Greedy longest-match lookup against the orthography profile.
            for length in range(len(form[i:]), 0, -1):
                needle = form[i:i + length]
                if needle in prf:
                    tokens.append(prf[needle])
                    i += length
                    added = True
                    break
            if not added:
                if form[i] == ' ':
                    tokens.append("#")
                else:
                    # Flag characters missing from the profile.
                    tokens.append('<%s>' % form[i])
                i += 1
            if i == len(form):
                break
        # Remove NULLs
        tokens = [token for token in tokens if token != "NULL"]
        return ' '.join(tokens)
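# Minimal usage sketch for my_tokenizer. The profile dict `prf` below is
# hypothetical; real profiles map orthography substrings to sound segments,
# and '^'/'$' are the word boundaries the tokenizer adds itself:
prf = {'^': 'NULL', '$': 'NULL', 'kh': 'kʰ', 'a': 'a'}
assert my_tokenizer('kha', prf) == 'kʰ a'  # greedy match: 'kh' wins over 'k'
assert my_tokenizer('xa', prf) == '<x> a'  # unmatched characters are flagged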
def cogids2cogid(wordlist, ref="cogids", cognates="cogid", morphemes="morphemes"):
    C, M = {}, {}
    current = 1
    for concept in wordlist.rows:
        base = split_text(strip_brackets(concept))[0].upper().replace(" ", "_")
        idxs = wordlist.get_list(row=concept, flat=True)
        # Map each partial (morpheme-level) cognate id to the word indices it occurs in.
        cogids = defaultdict(list)
        for idx in idxs:
            M[idx] = [c for c in wordlist[idx, ref]]
            for cogid in basictypes.ints(wordlist[idx, ref]):
                cogids[cogid] += [idx]
        # Assign word-level cognate ids, most frequent partial cognate sets first.
        for i, (cogid, idxs) in enumerate(
                sorted(cogids.items(), key=lambda x: len(x[1]), reverse=True)):
            for idx in idxs:
                if idx not in C:
                    C[idx] = current
                    M[idx][M[idx].index(cogid)] = base
                else:
                    M[idx][M[idx].index(cogid)] = "_" + base.lower()
            current += 1
    wordlist.add_entries(cognates, C, lambda x: x)
    if morphemes:
        wordlist.add_entries(morphemes, M, lambda x: x)
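# Usage sketch (the input file name is hypothetical): cogids2cogid collapses
# the partial, morpheme-level cognate sets in the "cogids" column into
# word-level cognate ids, labelling the morpheme that determined the word's
# cognate id with the concept's name.
from lingpy import Wordlist

wl = Wordlist('partial-cognates.tsv')  # needs a "cogids" annotation column
cogids2cogid(wl, ref='cogids', cognates='cogid', morphemes='morphemes')
wl.output('tsv', filename='word-cognates')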
def bibkeys(s):
    s = re.sub(r', (?P<year>[0-9]{4})', lambda m: ' ' + m.group('year'), s)
    s = s.replace('Sammallahti1998Lehtiranta1989', 'Sammallahti1998, Lehtiranta1989')
    res = [slug(rid, lowercase=False) for rid in split_text(s, ",", strip=True)]
    return [BIBKEYS.get(k, k) for k in res]
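# Worked example (assuming an empty BIBKEYS mapping): the regex first turns
# "Author, 1998" into "Author 1998" so the comma no longer acts as a key
# separator, and slug() then yields bibtex-style keys:
#
#   bibkeys('Sammallahti, 1998, Lehtiranta, 1989')
#   -> ['Sammallahti1998', 'Lehtiranta1989']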
def doctypes(self, hhtypes):
    res = []
    if 'hhtype' in self.fields:
        for ss in split_text(self.fields['hhtype'], separators=',;'):
            ss = ss.split('(')[0].strip()
            if ss in hhtypes:
                res.append(hhtypes[ss])
    return res, self.parse_ca(self.fields.get('hhtype'))
def add_identifiers(data, dblang, items, name_type=False):
    for prov, names in items.items():
        if not isinstance(names, (list, tuple)):
            names = split_text(names, separators=',;')
        for name in sorted(set(names)):
            lang = 'en'
            if name_type:
                if '[' in name and name.endswith(']'):
                    name, lang = [s.strip() for s in name[:-1].split('[', 1)]
            add_identifier(
                dblang, data, name,
                'name' if name_type else prov,
                prov if name_type else None,
                lang)
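# Minimal, self-contained sketch of the "Name [lang]" convention handled above:
# a trailing bracketed token is read as the language of the name itself.
name, lang = 'Deutsch [de]', 'en'
if '[' in name and name.endswith(']'):
    name, lang = [s.strip() for s in name[:-1].split('[', 1)]
assert (name, lang) == ('Deutsch', 'de')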
def doctypes(self, hhtypes): """Ordered doctypes assigned to this entry. :param hhtypes: `OrderedDict` mapping doctype names to doctypes :return: `list` of values of `hhtypes` which apply to the entry, ordered by occurrence in\ `hhtypes`. """ res = set() if 'hhtype' in self.fields: for ss in split_text(self.fields['hhtype'], separators=',;'): ss = ss.split('(')[0].strip() if ss in hhtypes: res.add(ss) return [v for k, v in hhtypes.items() if k in res], self.parse_ca(self.fields.get('hhtype'))
def load_references(repos):
    keys = set()
    for ds in repos.datasets:
        for r in ds.references:
            if ':' in r.key:  # skip keys with page numbers.
                continue
            # key is in the format Author, Year
            try:
                author, year = split_text(r.key, separators=',', strip=True)
                if (author, year) not in keys:
                    keys.add((author, year))
                    reference = Source.objects.create(
                        author=author, year=year, reference=r.citation)
                    logging.info("Saved new reference %s (%s)" % (
                        reference.author, reference.year))
            except Exception as e:  # pragma: no cover
                logging.warning("Could not save reference for row %s: %s" % (str(r), e))
    return len(keys)
def load_references(repos):
    keys = set()
    for ds in repos.datasets:
        for r in ds.references:
            if ':' in r.key:  # skip keys with page numbers.
                continue
            # key is in the format Author, Year
            try:
                author, year = split_text(r.key, separators=',', strip=True, brackets={})
                if (author, year) not in keys:
                    keys.add((author, year))
                    reference = Source.objects.create(
                        author=author, year=year, reference=r.citation)
                    logging.info("Saved new reference %s (%s)" % (
                        reference.author, reference.year))
            except Exception as e:  # pragma: no cover
                logging.warning("Could not save reference for row %s: %s" % (str(r), e))
    return len(keys)
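# Why brackets={} in this variant: by default split_text drops bracketed
# material, so a key like "Smith (ed.), 1998" would lose "(ed.)", and the
# two-way unpacking could even fail on keys whose comma hides inside brackets.
# Passing brackets={} disables bracket handling altogether:
author, year = split_text('Smith (ed.), 1998', separators=',', strip=True, brackets={})
assert (author, year) == ('Smith (ed.)', '1998')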
def load_languoid(data, lang, nodemap):
    dblang = data.add(
        models.Languoid,
        lang.id,
        id=lang.id,
        hid=lang.hid,
        name=lang.name,
        bookkeeping=lang.category == models.BOOKKEEPING,
        newick=lang.newick_node(nodemap).newick,
        latitude=lang.latitude,
        longitude=lang.longitude,
        status=models.LanguoidStatus.get(
            lang.endangerment.name if lang.endangerment else 'safe'),
        level=models.LanguoidLevel.from_string(lang.level.name),
        father=data['Languoid'][lang.lineage[-1][1]] if lang.lineage else None)
    if lang.iso:
        add_language_codes(data, dblang, lang.iso)
    for prov, names in lang.names.items():
        for name in names:
            l = 'en'
            if '[' in name and name.endswith(']'):
                name, l = [s.strip() for s in name[:-1].split('[', 1)]
            add_identifier(dblang, data, name, 'name', prov, lang=l)
    for prov, ids in lang.identifier.items():
        for id_ in split_text(ids, separators=',;'):
            add_identifier(dblang, data, id_, prov, None)
    if not dblang.bookkeeping:
        # Languages in Bookkeeping do not have a meaningful classification!
        clf = lang.classification_comment
        if clf:
            for attr, pid in [('sub', 'sc'), ('family', 'fc')]:
                val = getattr(clf, attr)
                if attr == 'sub' and not val:
                    # Handle cases with subrefs but no sub comment.
                    val = getattr(clf, 'subrefs')
                    if val:
                        val = ', '.join('{0}'.format(r) for r in val)
                if not val:
                    continue
                vs = common.ValueSet(
                    id='%s-%s' % (pid, lang.id),
                    description=val,
                    language=dblang,
                    parameter=data['Parameter'][pid],
                    contribution=data['Contribution']['clf'])
                DBSession.add(common.Value(id='%s-%s' % (pid, lang.id), valueset=vs))
    iso_ret = lang.iso_retirement
    if iso_ret:
        DBSession.add(models.ISORetirement(
            id=iso_ret.code,
            name=iso_ret.name,
            description=iso_ret.comment,
            effective=iso_ret.effective,
            reason=iso_ret.reason,
            remedy=iso_ret.remedy,
            change_request=iso_ret.change_request,
            languoid=dblang))
    eth_cmt = lang.ethnologue_comment
    if eth_cmt:
        DBSession.add(models.EthnologueComment(
            comment=eth_cmt.comment,
            code=eth_cmt.isohid,
            type=eth_cmt.comment_type,
            affected=eth_cmt.ethnologue_versions,
            languoid=dblang))
def clean_form(self, item, form):
    if form not in ["*", "---", ""]:
        return split_text(strip_brackets(form), ",;/")[0]
def load(args):
    glottolog = args.repos
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    version = assert_release(glottolog.repos)
    dataset = common.Dataset(
        id='glottolog',
        name="{0} {1}".format(glottolog.publication.web.name, version),
        publisher_name=glottolog.publication.publisher.name,
        publisher_place=glottolog.publication.publisher.place,
        publisher_url=glottolog.publication.publisher.url,
        license=glottolog.publication.license.url,
        domain=purl.URL(glottolog.publication.web.url).domain(),
        contact=glottolog.publication.web.contact,
        jsondata={'license_icon': 'cc-by.png',
                  'license_name': glottolog.publication.license.name},
    )
    data = Data()

    for e in glottolog.editors.values():
        if e.current:
            ed = data.add(common.Contributor, e.id, id=e.id, name=e.name)
            common.Editor(dataset=dataset, contributor=ed, ord=int(e.ord))
    DBSession.add(dataset)

    contrib = data.add(common.Contribution, 'glottolog', id='glottolog', name='Glottolog')
    DBSession.add(common.ContributionContributor(
        contribution=contrib, contributor=data['Contributor']['hammarstroem']))

    #
    # Add Parameters:
    #
    add = functools.partial(add_parameter, data)
    add('fc', name='Family classification')
    add('sc', name='Subclassification')
    add('aes',
        args.repos.aes_status.values(),
        name=args.repos.aes_status.__defaults__['name'],
        pkw=dict(
            jsondata=dict(
                reference_id=args.repos.aes_status.__defaults__['reference_id'],
                sources=[attr.asdict(v) for v in args.repos.aes_sources.values()],
                scale=[attr.asdict(v) for v in args.repos.aes_status.values()])),
        dekw=lambda de: dict(name=de.name, number=de.ordinal, jsondata=dict(icon=de.icon)),
    )
    add('med',
        args.repos.med_types.values(),
        name='Most Extensive Description',
        dekw=lambda de: dict(
            name=de.name,
            description=de.description,
            number=de.rank,
            jsondata=dict(icon=de.icon)),
    )
    add('macroarea',
        args.repos.macroareas.values(),
        pkw=dict(
            description=args.repos.macroareas.__defaults__['description'],
            jsondata=dict(reference_id=args.repos.macroareas.__defaults__['reference_id'])),
        dekw=lambda de: dict(
            name=de.name,
            description=de.description,
            jsondata=dict(geojson=read_macroarea_geojson(args.repos, de.name, de.description)),
        ),
    )
    add('ltype',
        args.repos.language_types.values(),
        name='Language Type',
        dekw=lambda de: dict(name=de.category, description=de.description),
        delookup='category',
    )
    add('country',
        args.repos.countries,
        dekw=lambda de: dict(name=de.id, description=de.name),
    )

    legacy = jsonlib.load(gc2version(args))
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    #
    # Now load languoid data, keeping track of relations that can only be inserted later.
    #
    lgsources = defaultdict(list)
    # Note: We rely on languoids() yielding languoids in the "right" order, i.e. such that
    # top-level nodes will precede nested nodes. This order must be preserved using an
    # `OrderedDict`:
    nodemap = OrderedDict([(l.id, l) for l in glottolog.languoids()])
    lgcodes = {k: v.id for k, v in args.repos.languoids_by_code(nodemap).items()}
    for lang in nodemap.values():
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(glottolog, data, lang, nodemap)

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id, id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider, bib.id, id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(
            BibFile(glottolog.build_path('monster-utf8.bib'), api=glottolog).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            mas = []
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = glottolog.macroareas.get('Papunesia' if ma == 'Papua' else ma)
                mas.append(ma.name)
            ref.macroareas = ', '.join(mas)
def clean_string(
        sequence,
        semi_diacritics='hsʃ̢ɕʂʐʑʒw',
        merge_vowels=False,
        segmentized=False,
        rules=None,
        ignore_brackets=True,
        brackets=None,
        split_entries=True,
        splitters='/,;~',
        preparse=None,
        merge_geminates=True,
        normalization_form="NFC"):
    """
    Function exhaustively checks how well a sequence is understood by LingPy.

    Parameters
    ----------
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    segmentized : bool (default=False)
        Indicate whether the input string is already segmentized or not. If set
        to True, items in brackets can no longer be ignored.
    rules : dict
        Replacement rules to be applied to a segmentized string.
    ignore_brackets : bool
        If set to True, ignore all content within a given bracket.
    brackets : dict
        A dictionary with opening brackets as key and closing brackets as
        values. Defaults to a pre-defined set of frequently occurring brackets.
    split_entries : bool (default=True)
        Indicate whether multiple entries (with a comma etc.) should be split
        into separate entries.
    splitters : str
        The characters which force the automatic splitting of an entry.
    preparse : list
        List of tuples, giving simple replacement patterns (source and target),
        which are applied before every processing starts.
    merge_geminates : bool (default=True)
        Indicate whether identical adjacent sounds should be merged.
    normalization_form : str (default="NFC")
        The unicode normalization form applied to the input sequence.

    Returns
    -------
    cleaned_strings : list
        A list of cleaned strings which are segmented by space characters. If
        splitters are encountered, indicating that the entry contains two
        variants, the list will contain one for each element in a separate
        entry. If there are no splitters, the list has only size one.
    """
    sequence = unicodedata.normalize(normalization_form, sequence)
    rules = rules or {}
    preparse = preparse or []

    # replace white space if not indicated otherwise
    if segmentized:
        segment_list = [sequence.split(' ')
                        if not isinstance(sequence, (list, tuple)) else sequence]
    else:
        for s, t in preparse:
            sequence = sequence.replace(s, t)
        segment_list = []
        if ignore_brackets:
            new_sequence = strip_brackets(sequence, brackets=brackets)
        else:
            new_sequence = sequence

        # splitting needs to be done afterwards
        if split_entries:
            new_sequences = split_text(
                new_sequence, splitters,
                brackets='' if not ignore_brackets else brackets)
        else:
            new_sequences = [new_sequence]

        for new_sequence in new_sequences:
            segments = ipa2tokens(
                re.sub(r'\s+', '_', new_sequence.strip()),
                semi_diacritics=semi_diacritics,
                merge_vowels=merge_vowels,
                merge_geminates=merge_geminates)
            segment_list += [segments]

    out = []
    for segments in segment_list:
        segments = [rules.get(s, s) for s in segments]
        out += [' '.join(segments)]
    return out
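# Usage sketch for clean_string: variants are split at the splitters, bracketed
# comments dropped, and segments joined by spaces. The first output depends on
# LingPy's ipa2tokens and is illustrative only; the second follows the pure
# segmentized/rules path and is exact:
#
#   clean_string('tʰa / tata (old)')  ->  ['tʰ a', 't a t a']
#   clean_string('t a t a', segmentized=True, rules={'t': 'd'})  ->  ['d a d a']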
def cmd_makecldf(self, args):
    args.writer.add_sources(self.raw_dir.read("Citations.bib"))
    for src in self._read("Citation_codes"):
        if src["type"] == "E":
            args.writer.add_sources(
                Source("misc", src["ref_abbr"], author=src["original_reference"]))

    glottocodes = {language["ID"]: language["Glottocode"] for language in self.languages}
    for language in self._read("Languages"):
        glottocode = glottocodes.get(language["lgid3"])
        if not glottocode:
            glottocode = self.glottolog.glottocode_by_iso.get(language["ISO-639-3"])
        args.writer.add_language(
            ID=language["lgid3"],
            Name=language["language"],
            Glottocode=glottocode,
            Description=language["Description"],
            Subgroup=language["Subgroup"],
            ISO639P3code=language["ISO-639-3"],
        )

    for concept in self.concepts:
        args.writer.add_concept(**concept)

    for (cid, cogid), ll in itertools.groupby(
            sorted(self._read("Data"), key=lambda i: (i["mng_item"], i["cogn_set"])),
            lambda i: (i["mng_item"], i["cogn_set"]),
    ):
        for language in ll:
            kw = dict(
                Value=language["item"],
                Language_ID=language["lgid3"],
                Parameter_ID=cid,
                Comment=language["general_notes"],
                Source=[
                    slug(rid, lowercase=False)
                    for rid in split_text(language["ref_abbr"], ",", strip=True)
                ],
            )
            kw.update({
                k: language[k]
                for k in [
                    "item_UPA",
                    "item_IPA",
                    "form_set",
                    "age_term_pq",
                    "age_term_aq",
                    "borr_source",
                    "borr_qual",
                    "etym_notes",
                    "glossing_notes",
                ]
            })
            for lex in args.writer.add_lexemes(**kw):
                if cogid != "?":
                    args.writer.add_cognate(
                        lexeme=lex, Cognateset_ID="{0}-{1}".format(cid, cogid))
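# groupby only merges *adjacent* items, which is why the rows above are sorted
# by the very same (mng_item, cogn_set) key before grouping:
import itertools

pairs = [(1, 'a'), (2, 'x'), (1, 'b')]
grouped = {k: [v for _, v in g]
           for k, g in itertools.groupby(sorted(pairs), key=lambda p: p[0])}
assert grouped == {1: ['a', 'b'], 2: ['x']}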
def split(s, sep=';'):
    return split_text(s, separators=sep, brackets={}, strip=True)
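# With brackets={}, separators inside brackets are not protected and bracketed
# material is kept verbatim; the library default would drop it instead:
assert split('a; (b; c)') == ['a', '(b', 'c)']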
def cmd_makecldf(self, args):
    args.writer.add_sources(self.raw_dir.read("Citations.bib"))
    bib = parse_string(self.raw_dir.read('Borrowing_references.bib'), 'bibtex')
    for k, v in bib.entries.items():
        args.writer.add_sources(Source.from_entry(slug(k, lowercase=False), v))

    args.writer.cldf.add_component(
        'BorrowingTable',
        {
            'name': 'Likelihood',
            'dc:description': 'Likelihood of borrowing (*possible*, *probable* or *clear*).',
            'datatype': {'base': 'string', 'format': 'possible|clear|probable'},
        },
        {
            'name': 'SourceLanguoid',
            'dc:description': 'Borrowing source of lexeme.',
        })
    args.writer.cldf['FormTable', 'form'].required = False
    args.writer.cldf['FormTable', 'value'].null = NULL_ITEMS
    args.writer.cldf['FormTable', 'value'].required = False
    args.writer.cldf['FormTable', 'value'].common_props['dc:description'] = \
        "Lexeme data. Contains a lexeme, '[No equivalent]' (no suitable equivalent for a " \
        "meaning exists), '[Form not found]' (no suitable equivalent was found), or " \
        "'[Not reconstructable]' (non-reconstructable meanings in Proto-Uralic)."

    for src in self._read("Citation_codes"):
        if src["type"] == "E":
            args.writer.add_sources(
                Source("misc", src["ref_abbr"], author=src["original_reference"]))

    glottocodes = {language["ID"]: language["Glottocode"] for language in self.languages}
    for language in self._read("Languages"):
        glottocode = glottocodes.get(language["lgid3"])
        if not glottocode:
            glottocode = self.glottolog.glottocode_by_iso.get(language["ISO-639-3"])
        args.writer.add_language(
            ID=language["lgid3"],
            Name=language["language"],
            Glottocode=glottocode,
            Description=language["Description"],
            Subgroup=language["Subgroup"],
            ISO639P3code=language["ISO-639-3"],
        )

    inlists = {r['mng_item']: r for r in self._read('Meaning_lists')}
    attrs = [k for k in attr.fields_dict(UralexConcept).keys() if k != 'LJ_rank']
    for concept in self.concepts:
        if concept['ID'] in inlists:
            memberships = {
                k.replace('-', '_'): v == '1'
                for k, v in inlists[concept['ID']].items()
                if k.replace('-', '_') in attrs}
            concept.update(memberships)
        args.writer.add_concept(**concept)

    for (cid, cogid), ll in itertools.groupby(
            sorted(self._read("Data"), key=lambda i: (i["mng_item"], i["cogn_set"])),
            lambda i: (i["mng_item"], i["cogn_set"]),
    ):
        for language in ll:
            if language['item'] in NULL_ITEMS:
                language['etym_notes'] = language['etym_notes'] + language['item']
            kw = dict(
                Value=language["item"],
                Language_ID=language["lgid3"],
                Parameter_ID=cid,
                Comment=language["general_notes"],
                Source=[
                    slug(rid, lowercase=False)
                    for rid in split_text(language["ref_abbr"], ",", strip=True)
                ],
            )
            kw.update({
                k: language[k]
                for k in ["item_UPA", "item_IPA", "form_set", "etym_notes", "glossing_notes"]
            })
            for i, lex in enumerate(args.writer.add_lexemes(**kw)):
                lex['Form'] = None if lex['Form'] in NULL_ITEMS else lex['Form']
                if cogid not in ["?", "0"]:
                    args.writer.add_cognate(
                        lexeme=lex, Cognateset_ID="{0}-{1}".format(cid, cogid))
                if language['borr_qual']:
                    c = ': borrowed to Pre-Permic'
                    ref = language['ref_borr']
                    if c in ref:
                        comment = c[1:].strip()
                        ref = ref.replace(c, '')
                    else:
                        comment = None
                    args.writer.objects['BorrowingTable'].append(
                        dict(
                            ID=lex['ID'],
                            Target_Form_ID=lex['ID'],
                            SourceLanguoid=language['borr_source'],
                            Likelihood=language['borr_qual'],
                            Source=bibkeys(ref),
                            Comment=comment,
                        ))
def load(args):
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    dataset = common.Dataset(
        id='glottolog',
        name="Glottolog {0}".format(args.args[0]),
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='glottolog.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data = Data()

    for i, (id_, name) in enumerate([
        ('hammarstroem', 'Harald Hammarström'),
        ('bank', 'Sebastian Bank'),
        ('forkel', 'Robert Forkel'),
        ('haspelmath', 'Martin Haspelmath'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    clf = data.add(common.Contribution, 'clf', id='clf', name='Classification')
    DBSession.add(common.ContributionContributor(
        contribution=clf, contributor=data['Contributor']['hammarstroem']))

    for pid, pname in [
        ('fc', 'Family classification'),
        ('sc', 'Subclassification'),
        ('vitality', 'Degree of endangerment'),
    ]:
        data.add(common.Parameter, pid, id=pid, name=pname)

    legacy = jsonlib.load(gc2version(args))
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    glottolog = args.repos
    for ma in Macroarea:
        data.add(
            models.Macroarea, ma.name, id=ma.name, name=ma.value, description=ma.description)

    for country in glottolog.countries:
        data.add(models.Country, country.id, id=country.id, name=country.name)

    lgcodes, mas, countries, lgsources = {}, {}, {}, defaultdict(list)
    languoids = list(glottolog.languoids())
    nodemap = {l.id: l for l in languoids}
    for lang in languoids:
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(data, lang, nodemap)
        mas[lang.id] = [ma.name for ma in lang.macroareas]
        countries[lang.id] = [c.id for c in lang.countries]
        lgcodes[lang.id] = lang.id
        if lang.hid:
            lgcodes[lang.hid] = lang.id
        if lang.iso:
            lgcodes[lang.iso] = lang.id

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()

    for lid, maids in mas.items():
        for ma in maids:
            DBSession.add(models.Languoidmacroarea(
                languoid_pk=data['Languoid'][lid].pk,
                macroarea_pk=data['Macroarea'][ma].pk))

    for lid, cids in countries.items():
        for cid in cids:
            DBSession.add(models.Languoidcountry(
                languoid_pk=data['Languoid'][lid].pk,
                country_pk=data['Country'][cid].pk))

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id, id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider, bib.id, id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(BibFile(glottolog.build_path('monster-utf8.bib')).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = Macroarea.get('Papunesia' if ma == 'Papua' else ma)
                DBSession.add(models.Refmacroarea(
                    ref_pk=ref.pk, macroarea_pk=data['Macroarea'][ma.name].pk))
def clean_form(self, item, form):
    if form not in ["*", "---", "-"]:
        form = strip_brackets(split_text(form, separators=";,/")[0])
        return form.replace(" ", "_")
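# Worked example: only the first variant survives, bracketed comments are
# dropped, and internal spaces become underscores:
#
#   clean_form(None, 'kere; kere-ku (dial.)')  ->  'kere'
#   clean_form(None, 'si au')                  ->  'si_au'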
def cmd_makecldf(self, args):
    args.writer.cldf.add_component('ParameterTable')
    args.writer.cldf.add_component(
        'LanguageTable',
        'Continent', 'Genus', 'WALSCode',  # we add more language metadata
    )
    args.writer.cldf.add_component('CodeTable')
    args.writer.objects['ParameterTable'] = [
        {
            'ID': 'sortalclassifier',
            'Name': 'sortal classifier',
            'Description':
                'Does the language have sortal classifiers, '
                'regardless of whether they are optional or obligatory?'
        },
        {
            'ID': 'morphosyntacticplural',
            'Name': 'morphosyntactic plural',
            'Description': 'Does the language have morphosyntactic plural markers?'
        },
    ]
    args.writer.objects['CodeTable'] = [
        {'ID': 'sortalclassifier-1', 'Parameter_ID': 'sortalclassifier', 'Name': 'yes'},
        {'ID': 'sortalclassifier-0', 'Parameter_ID': 'sortalclassifier', 'Name': 'no'},
        {'ID': 'morphosyntacticplural-1', 'Parameter_ID': 'morphosyntacticplural', 'Name': 'yes'},
        {'ID': 'morphosyntacticplural-0', 'Parameter_ID': 'morphosyntacticplural', 'Name': 'no'},
    ]

    l2s = collections.defaultdict(list)
    sources = []
    for src in sorted(Sources.from_file(self.raw_dir / 'sources.bib').items(),
                      key=lambda i: i.id):
        if src.get('Wals_code'):
            for code in split_text(src['Wals_code'], ';', strip=True):
                l2s[code].append(src.id)
        sources += [src]
    args.writer.cldf.add_sources(*sources)

    for row in self.raw_dir.read_csv('GSSG_ListOfLanguages.csv', delimiter=';', dicts=True):
        lidx = slug(row['language_name'], lowercase=False)
        args.writer.objects['LanguageTable'].append({
            'ID': lidx,
            'Name': row['language_name'],
            'Latitude': row['latitude'],
            'Longitude': row['longitude'],
            'Glottocode': row['glottocode'],
            'ISO639P3code': row['iso_code'],
            'Continent': row['continent'],
            'Genus': row['genus'],
            'WALSCode': row['wals_code'],
        })
        for param in ['sortal_classifier', 'morphosyntactic_plural']:
            pid = param.replace('_', '')
            args.writer.objects['ValueTable'].append({
                "ID": '{}-{}'.format(lidx, pid),
                "Value": row[param],
                "Language_ID": lidx,
                "Parameter_ID": pid,
                "Code_ID": '{}-{}'.format(pid, '1' if row[param] == 'yes' else '0'),
                "Source": l2s.get(row['wals_code'], []),
            })
def load_languoid(data, lang, nodemap):
    dblang = data.add(
        models.Languoid,
        lang.id,
        id=lang.id,
        hid=lang.hid,
        name=lang.name,
        bookkeeping=lang.category == models.BOOKKEEPING,
        newick=lang.newick_node(nodemap).newick,
        latitude=lang.latitude,
        longitude=lang.longitude,
        #
        # TODO: switch to using the AES labels, i.e. lang.endangerment.description!
        #
        status=models.LanguoidStatus.get(
            lang.endangerment.name if lang.endangerment else 'safe'),
        level=models.LanguoidLevel.from_string(lang.level.name),
        father=data['Languoid'][lang.lineage[-1][1]] if lang.lineage else None)
    if lang.iso:
        add_language_codes(data, dblang, lang.iso)
    for prov, names in lang.names.items():
        for name in names:
            l = 'en'
            if '[' in name and name.endswith(']'):
                name, l = [s.strip() for s in name[:-1].split('[', 1)]
            add_identifier(dblang, data, name, 'name', prov, lang=l)
    for prov, ids in lang.identifier.items():
        for id_ in split_text(ids, separators=',;'):
            add_identifier(dblang, data, id_, prov, None)
    if not dblang.bookkeeping:
        # Languages in Bookkeeping do not have a meaningful classification!
        clf = lang.classification_comment
        if clf:
            for attr, pid in [('sub', 'sc'), ('family', 'fc')]:
                val = getattr(clf, attr)
                if attr == 'sub' and not val:
                    # Handle cases with subrefs but no sub comment.
                    val = getattr(clf, 'subrefs')
                    if val:
                        val = ', '.join('{0}'.format(r) for r in val)
                if attr == 'family' and not val:
                    # Handle cases with familyrefs but no family comment.
                    val = getattr(clf, 'familyrefs')
                    if val:
                        val = ', '.join('{0}'.format(r) for r in val)
                if not val:
                    continue
                vs = common.ValueSet(
                    id='%s-%s' % (pid, lang.id),
                    description=val,
                    language=dblang,
                    parameter=data['Parameter'][pid],
                    contribution=data['Contribution']['clf'])
                DBSession.add(common.Value(id='%s-%s' % (pid, lang.id), valueset=vs))
    iso_ret = lang.iso_retirement
    if iso_ret:
        DBSession.add(models.ISORetirement(
            id=iso_ret.code,
            name=iso_ret.name,
            description=iso_ret.comment,
            effective=iso_ret.effective,
            reason=iso_ret.reason,
            remedy=iso_ret.remedy,
            change_request=iso_ret.change_request,
            languoid=dblang))
    eth_cmt = lang.ethnologue_comment
    if eth_cmt:
        DBSession.add(models.EthnologueComment(
            comment=eth_cmt.comment,
            code=eth_cmt.isohid,
            type=eth_cmt.comment_type,
            affected=eth_cmt.ethnologue_versions,
            languoid=dblang))
def load(args):
    glottolog = args.repos
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    version = assert_release(glottolog.repos)
    dataset = common.Dataset(
        id='glottolog',
        name="Glottolog {0}".format(version),
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='glottolog.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data = Data()

    for i, (id_, name) in enumerate([
        ('hammarstroem', 'Harald Hammarström'),
        ('forkel', 'Robert Forkel'),
        ('haspelmath', 'Martin Haspelmath'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    clf = data.add(common.Contribution, 'clf', id='clf', name='Classification')
    DBSession.add(common.ContributionContributor(
        contribution=clf, contributor=data['Contributor']['hammarstroem']))

    for pid, pname in [
        ('fc', 'Family classification'),
        ('sc', 'Subclassification'),
        ('vitality', 'Degree of endangerment'),
    ]:
        data.add(common.Parameter, pid, id=pid, name=pname)

    legacy = jsonlib.load(gc2version(args))
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    for ma in Macroarea:
        data.add(
            models.Macroarea, ma.name, id=ma.name, name=ma.value, description=ma.description)

    for country in glottolog.countries:
        data.add(models.Country, country.id, id=country.id, name=country.name)

    lgcodes, mas, countries, lgsources = {}, {}, {}, defaultdict(list)
    languoids = list(glottolog.languoids())
    nodemap = {l.id: l for l in languoids}
    for lang in languoids:
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(data, lang, nodemap)
        mas[lang.id] = [ma.name for ma in lang.macroareas]
        countries[lang.id] = [c.id for c in lang.countries]
        lgcodes[lang.id] = lang.id
        if lang.hid:
            lgcodes[lang.hid] = lang.id
        if lang.iso:
            lgcodes[lang.iso] = lang.id

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()

    for lid, maids in mas.items():
        for ma in maids:
            DBSession.add(models.Languoidmacroarea(
                languoid_pk=data['Languoid'][lid].pk,
                macroarea_pk=data['Macroarea'][ma].pk))

    for lid, cids in countries.items():
        for cid in cids:
            DBSession.add(models.Languoidcountry(
                languoid_pk=data['Languoid'][lid].pk,
                country_pk=data['Country'][cid].pk))

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id, id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider, bib.id, id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(BibFile(glottolog.build_path('monster-utf8.bib')).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = Macroarea.get('Papunesia' if ma == 'Papua' else ma)
                DBSession.add(models.Refmacroarea(
                    ref_pk=ref.pk, macroarea_pk=data['Macroarea'][ma.name].pk))