def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because it will have
    to be run periodically whenever data has been updated.
    """
    for vs in DBSession.query(common.ValueSet).options(
            joinedload(common.ValueSet.values)):
        d = []
        for generic_term, words in groupby(
            sorted(vs.values, key=lambda v: v.description),
            key=lambda v: v.description,
        ):
            if generic_term:
                generic_term += ': '
            else:
                generic_term = ''
            d.append(generic_term + ', '.join(nfilter([w.name for w in words])))
        vs.description = '; '.join(d)

    for model in [models.Country, models.Ecoregion]:
        for instance in DBSession.query(model).options(
            joinedload(getattr(model, 'taxa'))
        ):
            if not instance.taxa:
                instance.active = False

def add_sources(args, data):
    bib = Database.from_file(args.data_file('phoible-references.bib'), lowercase=True)
    ext = [
        Record.from_string('@' + s, lowercase=True) for s in nfilter(BIB.split('@'))]

    for rec in chain(ext, bib):
        if rec.id not in data['Source']:
            data.add(Source, rec.id, _obj=bibtex2source(rec))

    #
    # add aliases to lookup records with bibtex keys with numeric prefixes without
    # specifying the prefix
    #
    for key in list(data['Source'].keys()):
        if '_' in key:
            no, rem = key.split('_', 1)
            try:
                int(no)
                if rem not in data['Source']:
                    data['Source'][rem] = data['Source'][key]
            except (ValueError, TypeError):
                pass

def _get(d, marker):
    _l = set(nfilter(d.get(marker, [])))
    if _l:
        _l = list(_l)
        if marker not in ['oo', 'or']:
            assert len(_l) == 1
            _l = _l[0]
    return _l

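# A minimal usage sketch of _get; the sample dict below is hypothetical, not from the
# source data. List-valued markers ('oo', 'or') stay lists, any other marker must hold
# a single value and is unwrapped, and a missing marker yields an empty, falsy set.
#
#   d = {'or': ['*be', '', '*be'], 'me': ['water']}
#   _get(d, 'or')  -> ['*be']   (deduplicated, empty strings dropped)
#   _get(d, 'me')  -> 'water'   (unique value, returned unwrapped)
#   _get(d, 'xx')  -> set()     (missing marker)
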
def load_ecoregions(data_file, data):
    ecoregions = jsonload(data_file('ecoregions.json'))['features']
    biome_map = {
        1: ('Tropical & Subtropical Moist Broadleaf Forests', '008001'),
        2: ('Tropical & Subtropical Dry Broadleaf Forests', '557715'),
        3: ('Tropical & Subtropical Coniferous Forests', ''),
        4: ('Temperate Broadleaf & Mixed Forests', ''),
        5: ('Temperate Conifer Forests', ''),
        6: ('Boreal Forests/Taiga', ''),
        7: ('Tropical & Subtropical Grasslands, Savannas & Shrublands', '98ff66'),
        8: ('Temperate Grasslands, Savannas & Shrublands', ''),
        9: ('Flooded Grasslands & Savannas', '0265fe'),
        10: ('Montane Grasslands & Shrublands', 'cdffcc'),
        11: ('Tundra', ''),
        12: ('Mediterranean Forests, Woodlands & Scrub', 'cc9900'),
        13: ('Deserts & Xeric Shrublands', 'feff99'),
        14: ('Mangroves', '870083'),
    }

    for eco_code, features in groupby(
            sorted(ecoregions, key=lambda e: e['properties']['eco_code']),
            key=lambda e: e['properties']['eco_code']):
        features = list(features)
        props = features[0]['properties']
        if int(props['BIOME']) not in biome_map:
            continue
        biome = data['Biome'].get(props['BIOME'])
        if not biome:
            name, color = biome_map[int(props['BIOME'])]
            biome = data.add(
                Biome, props['BIOME'],
                id=str(int(props['BIOME'])),
                name=name,
                description=color or 'ffffff')
        centroid = (None, None)
        f = sorted(features, key=lambda _f: _f['properties']['AREA'])[-1]
        if f['geometry']:
            coords = f['geometry']['coordinates'][0]
            if f['geometry']['type'] == 'MultiPolygon':
                coords = coords[0]
            centroid = get_center(coords)
        polygons = nfilter([_f['geometry'] for _f in features])
        data.add(
            Ecoregion, eco_code,
            id=eco_code,
            name=props['ECO_NAME'],
            description=props['G200_REGIO'],
            latitude=centroid[1],
            longitude=centroid[0],
            biome=biome,
            area=props['area_km2'],
            gbl_stat=Ecoregion.gbl_stat_map[int(props['GBL_STAT'])],
            realm=Ecoregion.realm_map[props['REALM']],
            jsondata=dict(polygons=polygons))

def __init__(self, req, *args, **kw):
    Parameters.__init__(self, req, *args, **kw)
    if kw.get('languages'):
        self.languages = kw['languages']
    elif 'languages' in req.params:
        self.languages = nfilter([
            Language.get(id_, default=None)
            for id_ in req.params['languages'].split(',')])
    else:
        self.languages = []
    self._langs = [
        aliased(ValueSet, name='l%s' % i) for i in range(len(self.languages))]

def add_counterpart(d, vs, id,
                    phonetic,  # forms
                    cognate,  # oo
                    me, cm, so, org):
    assert phonetic or cognate
    if not cognate:
        if vs.language.proto:
            cognate = phonetic
            phonetic = None
        else:
            cognate = '[%s]' % phonetic
    m = models.Counterpart(
        id=id,
        name=cognate,
        phonetic=phonetic,
        description=me or '[%s]' % vs.parameter.name,
        comment=cm,
        original_entry=org,
        other_reconstructions='; '.join(_get(d, 'or') or [])
        if vs.language.id == 'psi' else None,
        valueset=vs)
    if so:
        for sid in nfilter([s.strip() for s in SEP_PATTERN.split(so or '')]):
            match = SID_PATTERN.match(sid)
            if not match:
                continue
            name = sid
            sid = normalize_sid(match.group('key'))
            source = data['Source'].get(sid)
            if not source:
                if sid in sources:
                    s = sources[sid]
                    source = data.add(
                        common.Source, sid,
                        id=sid,
                        name=s['Name'].upper() if len(s['Name']) <= 3 else s['Name'],
                        description=s.get('Title', s['citation']),
                        author=s.get('Author'),
                        title=s.get('Title'),
                        year=s.get('Year'),
                    )
                else:
                    source = data.add(
                        common.Source, sid,
                        id=sid,
                        name=name.upper() if len(name) <= 3 else name)
            m.references.append(models.ValueReference(
                source=source, description=match.group('pages')))

def parsed_words(words, fp, lang):
    for s, t in [
        ("nǁá(q)'ám", "nǁáq'ám, nǁá'ám"),
        ("ǀga̋é.b/(s)", "ǀga̋é.b, ǀga̋é.s"),
        ("ǀkhóò.b/(s)", "ǀkhóò.b, ǀkhóò.s"),
        ("nǂúq(y)è", "nǂúqyè, nǂúqè"),
        ("ǀ'hùī (n̏ǀ'hùīn)", "ǀ'hùī, n̏ǀ'hùīn"),
        ("sùr(ù)tsi̋ǃgùűbȅ.s", "sùr(ù)tsi̋ǃgùűbȅ.s, sùrùtsi̋ǃgùűbȅ.s"),
        ("dàqhńn(tê)", "dàqhńn, dàqhńntê"),
        ("ǀàālè (ǀàlé)", "ǀàālè, ǀàlé"),
        ("ǁúq(l)è", "ǁúqlè, ǁúqè"),
        ("nǃhȁè (nǃȁhè)", "nǃhȁè, nǃȁhè"),
        ("(ǀxòo) tsàhnà", "ǀxòo tsàhnà, tsàhnà"),
        ("(kú-)ǃáná", "kúǃáná, ǃáná"),
        ("ǀgài̋o.b(/s)", "ǀgài̋o.b, ǀgài̋o.s"),
        ("dz(h)òhè", "dzhòhè, dzòhè, dzhòè (?)"),
        ("ǀqx'á(y)è", "ǀqx'áyè, ǀqx'áè"),
    ]:
        words = words.replace(s, t)
    words = list(split_words(words))
    return nfilter(chain(*[parsed_word(words, i, fp, lang) for i in range(len(words))]))

def normalized_branch(line):
    """Parse a line specifying a language family as a comma-separated list of ancestors."""
    name_map = {
        'Unattested',  # keep top-level family as subfamily
        'Unclassifiable',  # keep top-level family as subfamily
        'Pidgin',  # keep top-level family as subfamily
        'Mixed Language',  # keep top-level family as subfamily
        'Artificial Language',  # keep top-level family as subfamily
        'Speech Register',  # keep top-level family as subfamily
        # FIXME: also 'Sign Language'?
        'Spurious',  # bookkeeping
        'Preliminary',
    }
    branch = [unescape(n.strip().replace('_', ' ')) for n in line.split(',')]
    if branch[0] not in name_map:
        return branch, 'established'

    family = branch.pop(0)
    subfamily = None
    retired = False
    if branch:  # there's a second level!
        if family == 'Spurious':
            if branch[0] == 'Retired':
                retired = True
                branch.pop(0)
        else:
            subfamily = '%s (%s)' % (branch.pop(0), family)

    status = 'established'
    if family in ['Spurious', 'Unattested']:
        status = family.lower()
        if retired:
            status += ' retired'
    if family == 'Spurious':
        family = BOOKKEEPING

    return nfilter([family, subfamily]), status

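# A hedged usage sketch of normalized_branch; the classification strings below are
# illustrative assumptions, not actual Glottolog data.
#
#   normalized_branch('Austronesian, Malayo-Polynesian')
#     -> (['Austronesian', 'Malayo-Polynesian'], 'established')
#   normalized_branch('Unattested, Kaskean')
#     -> (['Unattested', 'Kaskean (Unattested)'], 'unattested')
#   normalized_branch('Spurious, Retired')
#     -> ([BOOKKEEPING], 'spurious retired')
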
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()

    def read(table):
        return list(dsv.reader(
            args.data_file(table + '.csv'), delimiter=',', namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        #published=date(2009, 8, 15),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-nc-nd/2.0/de/88x31.png',
            'license_name':
                'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany License',
        },
        domain='ids.clld.org')
    DBSession.add(dataset)

    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # language lang
    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang = data.add(models.IdsLanguage, l.lg_id, id=l.lg_id, name=l.lg_name)
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id,
            name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    languages = {
        l.lg_id: iso_codes[l.sil_id]
        for l in read('x_lg_sil') if l.lg_id not in exclude}
    load_families(Data(), [(v, data['IdsLanguage'][k]) for k, v in languages.items()])

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        #name lg_id what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            if int(l.what_did_id) not in [4, 395]:
                print(l.what_did_id)
                raise ValueError
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="Max Planck Institute for Evolutionary Anthropology, Leipzig")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    for i, name in enumerate(sorted(sources.keys())):
        c = data.add(common.Source, name, id=str(i + 1), name=name, description=name)

    DBSession.flush()
    for name, lgs in sources.items():
        for lg in lgs:
            if lg in exclude:
                continue
            try:
                DBSession.add(common.LanguageSource(
                    language_pk=data['IdsLanguage'][lg].pk,
                    source_pk=data['Source'][name].pk))
            except KeyError:
                print(name, lgs)
                continue

    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i, type='name', name=l.name, description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier, language=data['IdsLanguage'][l.lg_id]))

    # parameter chapter/entry
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {'id': id_, 'name': name, 'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in list(data[entity].keys()):
            data[entity][k] = data[entity][k].pk

    synsets = set()
    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        try:
            language = common.Language.get(data['IdsLanguage'][lg_id])
        except KeyError:
            print(list(entries))
            raise
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                data.add(
                    models.Entry, entry_id,
                    id=entry_id,
                    name=entry_id,
                    #active=False,
                    sub_code=l.entry_id,
                    chapter_pk=data['Chapter'][l.chap_id])
                DBSession.flush()
                data['Entry'][entry_id] = data['Entry'][entry_id].pk

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))
            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                        #print('===', language.id, language.name)
                        #print(l.data_1)
                        #print(l.data_2)
                    #assert language.id == '238'
                    # Rapa Nui has problems!
                    trans2 = None

            for i, word in enumerate(trans1):
                v = models.Counterpart(
                    id=id_ + '-' + str(i + 1 + len(vs.values)),
                    name=word,
                    description=desc.get('1'),
                    valueset=vs)
                words[word].append((v, trans2[i] if trans2 else None))

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(
                    norm(w[1] or '', desc.get('2'), language.id) for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                print('---', language.id, language.name)
                print(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2'))
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)

        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)

def normalize_bib(name):
    return nfilter([slug(n.strip()) for n in name.split(' and ')])

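# Illustrative only, assuming clldutils-style slug/nfilter semantics (lowercase,
# strip non-alphanumeric characters, drop empty results):
#   normalize_bib('Güldemann, Tom and Vossen, Rainer') -> ['guldemanntom', 'vossenrainer']
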
def normalize_comma_separated(s, d, lower=False):
    if not s:
        return
    chunks = nfilter([_s.strip() for _s in s.split(',')])
    return ', '.join(
        d.get(_s.lower(), _s.lower() if lower else _s) for _s in chunks)

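# A minimal sketch with hypothetical inputs: empty chunks are dropped, each chunk is
# looked up (lowercased) in the mapping d, and unmapped chunks fall back to themselves,
# lowercased only if lower=True.
#   normalize_comma_separated('Foo, , Bar', {'foo': 'FOO'}, lower=True) -> 'FOO, bar'
#   normalize_comma_separated('', {}) -> None
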
def col_defs(self):
    kw = {}
    if self.language:
        kw['bSearchable'] = False
        kw['bSortable'] = False
    name_col = ValueNameCol(self, 'value', **kw)
    if self.parameter and self.parameter.domain:
        name_col.choices = [de.name for de in self.parameter.domain]

    class ValueLanguageCol(LinkCol):
        def search(self, qs):
            if self.dt.language:
                return ValueSet.language_pk == int(qs)
            if self.dt.parameter:
                return icontains(self.dt.vs_lang.name, qs)

        def order(self):
            if self.dt.parameter:
                return cast(self.dt.vs_lang.id, Integer)
            if self.dt.language:
                return ValueSet.language_pk

    lang_col = ValueLanguageCol(
        self, 'language',
        model_col=Language.name,
        get_obj=lambda item: item.valueset.language,
        bSearchable=bool(self.parameter or self.language),
        bSortable=bool(self.parameter or self.language))
    if self.language:
        if self.language.lects:
            lang_col.choices = [
                (l.pk, l.name) for l in [self.language] + self.language.lects]
            lang_col.js_args['sTitle'] = 'lect'
        else:
            lang_col = None

    get_param = lambda i: i.valueset.parameter
    if self.parameter:
        return nfilter([
            lang_col,
            name_col,
            FrequencyCol(self, '%') if self.parameter.multivalued else None,
            Col(self,
                'lexifier',
                format=lambda i: i.valueset.language.lexifier,
                model_col=self.vs_lect.lexifier,
                choices=get_distinct_values(
                    Lect.lexifier,
                    key=lambda v: 'z' + v if v == 'Other' else v)),
            LinkToMapCol(
                self, 'm',
                get_object=lambda i:
                None if i.valueset.language.language_pk else i.valueset.language),
            DetailsRowLinkCol(self, 'more')
            if self.parameter.feature_type != 'sociolinguistic' else None,
            RefsCol(self, 'source')
            if self.parameter.feature_type != 'segment' else None,
        ])
    if self.language:
        return nfilter([
            IntegerIdCol(self, 'id', get_obj=get_param, model_col=Parameter.id),
            LinkCol(self, 'parameter', get_obj=get_param, model_col=Parameter.name),
            name_col,
            FrequencyCol(self, '%'),
            lang_col,
            DetailsRowLinkCol(self, 'more'),
            RefsCol(self, 'source'),
        ])
    return [
        LinkCol(self, 'parameter', get_obj=get_param, model_col=Parameter.name),
        name_col,
        FrequencyCol(self, '%'),
        lang_col,
        DetailsRowLinkCol(self, 'more'),
        RefsCol(self, 'source'),
    ]