def upgrade():
    conn = Connection(op.get_bind())

    example_map = {}
    sid = 204
    for example in jsonload(data_file('lingala_examples.json')):
        sid += 1
        kw = {
            'id': '60-%s' % sid,
            'language_pk': conn.pk(Language, '60'),
            'name': example['Text'],
            'description': example['Translation'],
            'gloss': '\t'.join(example['Gloss'].split()),
            'analyzed': '\t'.join(example['Text'].split()),
            'type': example['Type'].strip().lower(),
            'jsondata': {'sort': int(example['Order_number']), 'alt_translation': None}
        }
        example_map[example['Example_number']] = conn.insert(Sentence, **kw)

    for ve in jsonload(data_file('lingala_value_examples.json')):
        vspk = conn.pk(ValueSet, '60-%s' % ve['Features::Feature_number'])
        vpk = conn.pk(Value, vspk, attr='valueset_pk')
        conn.insert(
            ValueSentence, value_pk=vpk, sentence_pk=example_map[ve['Example_number']])

    for i, comment in enumerate(
            reader(data_file('lingala_valueset_comments.tab'), dicts=True)):
        vspk = conn.pk(ValueSet, '60-%s' % comment['Features::Feature_number'])
        comment['Comments_on_value_assignment'] = comment[
            'Comments_on_value_assignment'].replace('\x0b', '\n')
        conn.update(
            ValueSet,
            {
                'description': comment['Comments_on_value_assignment'],
                'markup_description': None,
            },
            pk=vspk)
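# A minimal sketch of the record shape the migration above assumes for
# lingala_examples.json, inferred only from the keys it accesses; the values
# below are invented placeholders, not data from the actual file.
LINGALA_EXAMPLE_RECORD = {
    'Example_number': '1',        # used later to link examples to values
    'Order_number': '1',          # stored as jsondata['sort']
    'Text': 'word-1 word-2',      # whitespace-split, tab-joined into 'analyzed'
    'Gloss': 'gloss-1 gloss-2',   # must align unit-by-unit with 'Text'
    'Translation': 'free translation',
    'Type': 'Naturalistic ',      # stripped and lower-cased before insertion
}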
def chapter(request):
    _html = get_html(ppath('Atlas', '%s.html' % request.matchdict['id']))
    return {
        'md': jsonload(ppath('Atlas', '%s.json' % request.matchdict['id'])),
        'html': lambda vt: _html.replace('<p>value-table</p>', vt),
        'ctx': Feature.get(request.matchdict['id']),
    }
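# Usage sketch (hypothetical caller): the 'html' entry is a callable so that the
# value table can be rendered later by the template and spliced into the chapter
# text in place of the '<p>value-table</p>' placeholder, e.g.
#
#   ctx = chapter(request)
#   page = ctx['html']('<table>...</table>')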
def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK
    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gl_name = glottolog_name()
        gl_names = glottolog_names()

        languoids = {l.pk: l for l in DBSession.query(Languoid)}
        for attrs in jsonload(args.data_dir.joinpath('languoids', 'changes.json')):
            replacement = attrs.pop('replacement', None)
            hname = attrs.pop('hname', None)

            for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                if name in attrs:
                    attrs[name] = enum.from_string(attrs[name])

            l = languoids.get(attrs['pk'])
            if l:
                for k, v in attrs.items():
                    setattr(l, k, v)
                #
                # We do not assign ISO codes for existing languages, because it could be
                # that the ISO code is now assigned to a family node, due to a change
                # request, e.g. see https://github.com/clld/glottolog-data/issues/40
                #
                if len(l.hid or '') == 3 and not l.iso_code:
                    args.log.warn('Language with hid %s but no iso code!' % l.hid)
            else:
                l = Languoid(**attrs)
                DBSession.add(l)
                languoids[l.pk] = l

                if len(attrs.get('hid', '')) == 3:
                    create_identifier(
                        None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                create_identifier(
                    gl_names.get(l.name),
                    l,
                    name=l.name,
                    description=gl_name.description,
                    type=gl_name.type)

            if hname:
                l.update_jsondata(hname=hname)

            if replacement:
                DBSession.add(Superseded(
                    languoid_pk=l.pk,
                    replacement_pk=replacement,
                    relation='classification update'))

            DBSession.flush()

        recreate_treeclosure()
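# A sketch of one changes.json record as consumed above (keys inferred from the
# attributes accessed; values are placeholders). 'replacement' and 'hname' are
# popped before the remaining attributes are applied to the Languoid.
CHANGES_RECORD = {
    'pk': 12345,                 # pk of an existing or new Languoid
    'name': 'Some Languoid',
    'hid': 'abc',                # a 3-letter hid triggers creation of an ISO identifier
    'level': 'language',         # converted via LanguoidLevel.from_string
    'status': 'established',     # converted via LanguoidStatus.from_string
    'replacement': 67890,        # optional: pk of the superseding languoid
    'hname': 'Historical name',  # optional: stored in jsondata
}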
def load_ecoregions(data_file, data):
    ecoregions = jsonload(data_file('ecoregions.json'))['features']

    biome_map = {
        1: ('Tropical & Subtropical Moist Broadleaf Forests', '008001'),
        2: ('Tropical & Subtropical Dry Broadleaf Forests', '557715'),
        3: ('Tropical & Subtropical Coniferous Forests', ''),
        4: ('Temperate Broadleaf & Mixed Forests', ''),
        5: ('Temperate Conifer Forests', ''),
        6: ('Boreal Forests/Taiga', ''),
        7: ('Tropical & Subtropical Grasslands, Savannas & Shrublands', '98ff66'),
        8: ('Temperate Grasslands, Savannas & Shrublands', ''),
        9: ('Flooded Grasslands & Savannas', '0265fe'),
        10: ('Montane Grasslands & Shrublands', 'cdffcc'),
        11: ('Tundra', ''),
        12: ('Mediterranean Forests, Woodlands & Scrub', 'cc9900'),
        13: ('Deserts & Xeric Shrublands', 'feff99'),
        14: ('Mangroves', '870083'),
    }

    for eco_code, features in groupby(
            sorted(ecoregions, key=lambda e: e['properties']['eco_code']),
            key=lambda e: e['properties']['eco_code']):
        features = list(features)
        props = features[0]['properties']
        if int(props['BIOME']) not in biome_map:
            continue
        biome = data['Biome'].get(props['BIOME'])
        if not biome:
            name, color = biome_map[int(props['BIOME'])]
            biome = data.add(
                Biome, props['BIOME'],
                id=str(int(props['BIOME'])),
                name=name,
                description=color or 'ffffff')
        centroid = (None, None)
        f = sorted(features, key=lambda _f: _f['properties']['AREA'])[-1]
        if f['geometry']:
            coords = f['geometry']['coordinates'][0]
            if f['geometry']['type'] == 'MultiPolygon':
                coords = coords[0]
            centroid = get_center(coords)
        polygons = nfilter([_f['geometry'] for _f in features])
        data.add(
            Ecoregion, eco_code,
            id=eco_code,
            name=props['ECO_NAME'],
            description=props['G200_REGIO'],
            latitude=centroid[1],
            longitude=centroid[0],
            biome=biome,
            area=props['area_km2'],
            gbl_stat=Ecoregion.gbl_stat_map[int(props['GBL_STAT'])],
            realm=Ecoregion.realm_map[props['REALM']],
            jsondata=dict(polygons=polygons))
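# A sketch of one ecoregion feature as read above (GeoJSON; properties limited
# to the keys the loader actually touches, values are placeholders).
ECOREGION_FEATURE = {
    'properties': {
        'eco_code': 'AT0101',
        'ECO_NAME': 'Example ecoregion',
        'G200_REGIO': 'Example Global 200 region',
        'BIOME': 1,          # must be a key of biome_map, otherwise the feature is skipped
        'AREA': 1000.0,      # the largest feature per eco_code supplies the centroid
        'area_km2': 1000,
        'GBL_STAT': 1,       # mapped via Ecoregion.gbl_stat_map
        'REALM': 'AT',       # mapped via Ecoregion.realm_map
    },
    'geometry': {
        'type': 'Polygon',
        'coordinates': [[[0.0, 0.0], [1.0, 0.0], [1.0, 1.0]]],
    },
}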
def main(args):
    repls = set((i['id'], i['replacement']) for i in
                jsonload(args.data_dir.joinpath('scripts', 'monster-replacements.json')))

    with transaction.manager:
        for ref_id, repl_id in repls:
            ref = Source.get('%s' % ref_id, default=None)
            if ref:
                Config.add_replacement(
                    ref, '%s' % repl_id, session=DBSession, model=Source)
                # FIXME: "redirect" relations, e.g. from valuesetreference as well!
                DBSession.delete(ref)
    args.log.info('%s replacements' % len(repls))
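# Each record in monster-replacements.json is assumed to carry just the two
# keys read above: the id of the obsolete source and the id of its replacement
# (ids below are placeholders).
REPLACEMENT_RECORD = {'id': 1234, 'replacement': 5678}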
def main(args):
    if args.cmd == 'convert':
        outdir = args.data_file('texts', args.what).joinpath('lo')
        if args.what == 'Atlas':
            for p in args.data_file('texts', args.what).joinpath('in').files():
                if p.ext in ['.doc', '.docx']:
                    convert_chapter(p, outdir)
        elif args.what == 'Surveys':
            pass

    if args.cmd == 'parse':
        outdir = args.data_file('texts', args.what).joinpath('processed')
        for p in args.data_file('texts', args.what).joinpath('lo').files():
            if args.in_name in p.namebase:
                globals()[args.what](p)(outdir)

    if args.cmd == 'refs':
        refs = []
        for p in args.data_file(
                'texts', args.what).joinpath('processed').files('*.json'):
            if args.in_name in p.namebase:
                md = jsonload(p)
                refs.extend(md['refs'])
        db = get_bibtex(refs)
        unmatched = 0
        distinct = defaultdict(list)
        for i, rec in enumerate(db):
            if 'all' in rec:
                unmatched += 1
            distinct[(
                slug(rec.get('key', unicode(uuid4().hex))),
                slug(unicode(rec.get('title', uuid4().hex)), remove_whitespace=False)
            )] = 1
        print unmatched, 'of', i, 'distinct', len(distinct)
        c = 0
        for key, refs in groupby(sorted(distinct.keys()), key=lambda t: t[0]):
            refs = list(refs)
            if len(refs) > 1:
                for t1, t2 in combinations([t[1] for t in refs], 2):
                    if fuzz.partial_ratio(t1, t2) > 80:
                        print t1
                        print t2
                        print
                        c += 1
        print c
        return
def read(args, table, sortkey=None):
    """Read APiCS data from a json file created from filemaker's xml export.
    """
    load = lambda t: jsonload(args.data_file('fm', '%s.json' % t))
    res = load(table)
    if table == 'Features':
        # merge the data from two other sources:
        secondary = [
            dict((r['Feature_number'], r) for r in load(table + l)) for l in ['p', 'v']]
        for r in res:
            for d in secondary:
                r.update(d[r['Feature_number']])
    if sortkey:
        res = sorted(res, key=lambda d: d[sortkey])
    for d in res:
        for k, v in d.items():
            if isinstance(v, unicode):
                d[k] = v.strip()
        yield d
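# Usage sketch: read() yields one dict per exported FileMaker row, optionally
# sorted by a column; the loaders below iterate it directly, e.g.
#
#   for row in read(args, 'People'):
#       ...
#   for row in read(args, 'Examples', 'Order_number'):
#       ...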
def main(args):
    sources = jsonload(args.data_file('sources.json'))
    fields = ['href', 'name', 'author', 'iso', 'source', 'notes', 'wordlist']

    with UnicodeWriter(args.data_file('..', 'sources.csv')) as fp:
        fp.writerow(fields)
        for source in sorted(sources, key=lambda i: i['name']):
            fp.writerow([source.get(f, '') for f in fields])

    return

    ethnologue_names = {
        r.ISO_639: r.Language_Name for r in reader(
            args.data_file('..', '..', 'ethnologue-17-data', 'Table_of_Languages.tab'),
            namedtuples=True)}

    # ASJP name for language, Ethnologue's name, ISO code
    rows = [['ASJP Name', 'Ethnologue name', 'ISO code']]
    subquery = DBSession.query(LanguageSource.language_pk).distinct().subquery()
    for i, l in enumerate(
            DBSession.query(Doculect)
            .order_by(Doculect.pk)
            .filter(not_(Doculect.pk.in_(subquery)))):
        rows.append([l.id, ethnologue_names.get(l.code_iso, ''), l.code_iso or ''])
        #print i

    with UnicodeWriter(args.data_file('..', 'doculects_without_source.csv')) as fp:
        fp.writerows(rows)
def coverage(req):
    gl = jsonload(abspath_from_asset_spec('grambank:static/stats_glottolog.json'))
    stats = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    for ma in gl:
        for dt in gl[ma]:
            ids = gl[ma][dt]
            isolates = select(
                [Language.__table__.c.id]).where(Language.__table__.c.id.in_(ids))
            families = select(
                [Family.__table__.c.id]).where(Family.__table__.c.id.in_(ids))
            stats[ma][dt] = dict(
                glottolog=len(ids),
                grambank=DBSession.query(isolates.union(families).alias('u')).count())
        stats[ma]['total'] = {}
        for src in ['glottolog', 'grambank']:
            stats[ma]['total'][src] = \
                stats[ma]['grammar'][src] + stats[ma]['grammarsketch'][src]

    return dict(stats=stats)
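# The view returns nested counts keyed by macroarea and document type (shape
# inferred from the loops above; the numbers here are placeholders), e.g.
#
#   stats['Africa']['grammar'] = {'glottolog': 250, 'grambank': 120}
#   stats['Africa']['total'] = {'glottolog': 400, 'grambank': 180}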
def survey(request):
    id_ = request.matchdict['id']
    md = jsonload(ppath('Surveys', '%s.json' % id_))
    html = get_html(ppath('Surveys', '%s.html' % id_))
    maps = []
    for fname in sorted(
            ppath('Surveys', processed='maps').files(
                '%s*.png' % id_.split('.')[1].replace('-', '_')),
            key=lambda fn: fn.namebase):
        img = b64encode(open(fname, 'rb').read())
        if 'figure' in fname.namebase:
            html = html.replace(
                '{%s}' % fname.namebase, 'data:image/png;base64,%s' % img)
        else:
            maps.append(img)

    return {
        'maps': maps,
        'md': md,
        'authors': [Contributor.get(a['id']) for a in md['authors']],
        'html': html,
        'ctx': ApicsContribution.get(id_.split('.')[0]),
    }
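# The figure handling above inlines PNGs as data URIs. A standalone sketch of
# that pattern (the file name is a placeholder):
from base64 import b64encode

def png_data_uri(filename):
    """Return a data: URI embedding the PNG stored at ``filename``."""
    with open(filename, 'rb') as fp:
        return 'data:image/png;base64,%s' % b64encode(fp.read()).decode('ascii')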
def main(args):
    data = Data()

    editors = OrderedDict()
    editors['Susanne Maria Michaelis'] = None
    editors['Philippe Maurer'] = None
    editors['Martin Haspelmath'] = None
    editors['Magnus Huber'] = None

    for row in read(args, 'People'):
        name = row['First name'] + ' ' if row['First name'] else ''
        name += row['Last name']
        kw = dict(
            name=name,
            id=slug('%(Last name)s%(First name)s' % row),
            url=row['Contact Website'].split()[0] if row['Contact Website'] else None,
            address=row['Comments on database'],
        )
        contrib = data.add(common.Contributor, row['Author ID'], **kw)
        if kw['name'] in editors:
            editors[kw['name']] = contrib
    DBSession.flush()

    dataset = common.Dataset(
        id='apics',
        name='APiCS Online',
        description='Atlas of Pidgin and Creole Language Structures Online',
        domain='apics-online.info',
        published=date(2013, 11, 4),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})
    DBSession.add(dataset)
    for i, editor in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=editor, ord=i + 1)

    colors = dict((row['ID'], row['RGB_code']) for row in read(args, 'Colours'))

    abbrs = {}
    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for id_, name in {
        'C**T': 'clitic',
        'IMPF': 'imperfect',
        'INTERM': 'intermediate',
        'NCOMPL': 'noncompletive',
        'NONFUT': 'nonfuture',
        'NPROX': 'nonproximal',
        'NSG': 'nonsingular',
        'PP': 'past participle',
        'PROP': 'proprietive',
        'TMA': 'tense-mood-aspect',
    }.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for row in reader(
            args.data_file('non-lgr-gloss-abbrs.csv'), delimiter=',', namedtuples=True):
        for match in GLOSS_ABBR_PATTERN.finditer(row.standard):
            if match.group('abbr') not in abbrs:
                abbrs[match.group('abbr')] = 1
                DBSession.add(
                    common.GlossAbbreviation(id=match.group('abbr'), name=row.meaning))

    non_bibs = {}
    for row in read(args, 'References', 'Reference_ID'):
        if row['Reference_type'] == 'Non-bib':
            non_bibs[row['Reference_ID']] = row['Reference_name']
            continue

        if isinstance(row['Year'], int):
            year_int = row['Year']
            year = str(row['Year'])
        elif row['Year']:
            year_int = None
            for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']):
                year_int = int(m.group('year'))
                break
            year = row['Year']
        else:
            year, year_int = None, None

        title = row['Article_title'] or row['Book_title']
        attrs = {}
        jsondata = {}
        for attr, field in {
            'Additional_information': 'note',
            'Article_title': 'title',
            'Book_title': 'booktitle',
            'City': 'address',
            'Editors': 'editor',
            'Full_reference': None,
            'Issue': None,
            'Journal': 'journal',
            'Language_codes': None,
            'LaTeX_cite_key': None,
            'Pages': 'pages',
            'Publisher': 'publisher',
            'Reference_type': 'type',
            'School': 'school',
            'Series_title': 'series',
            'URL': 'url',
            'Volume': 'volume',
        }.items():
            value = row.get(attr)
            if not isinstance(value, int):
                value = (value or '').strip()
            if attr == 'Issue' and value:
                try:
                    value = str(int(value))
                except ValueError:
                    pass
            if value:
                if field:
                    attrs[field] = value
                else:
                    jsondata[attr] = value
        p = data.add(
            common.Source, row['Reference_ID'],
            id=str(row['Reference_ID']),
            name=row['Reference_name'],
            description=title,
            author=row['Authors'],
            year=year,
            year_int=year_int,
            bibtex_type=getattr(EntryType, row['BibTeX_type'] or 'misc'),
            jsondata=jsondata,
            **attrs)
        if p.bibtex_type.value == 'misc' and not p.description:
            p.description = p.note
        DBSession.flush()
    DBSession.flush()

    infobox = jsonload(args.data_file('infobox.json'))
    glottocodes = jsonload(args.data_file('glottocodes.json'))
    for row in read(args, 'Languages', 'Order_number'):
        lon, lat = [float(c.strip()) for c in row['map_coordinates'].split(',')]
        kw = dict(
            name=row['Language_name'],
            id=str(row['Order_number']),
            latitude=lat,
            longitude=lon,
            region=row['Category_region'],
        )
        lect = data.add(models.Lect, row['Language_ID'], **kw)
        DBSession.flush()

        for i, item in enumerate(infobox[lect.id]):
            DBSession.add(common.Language_data(
                object_pk=lect.pk, ord=i, key=item[0], value=item[1]))

        if row["Languages_contribution_documentation::Lect_description_checked_status"] \
                != "Checked":
            print 'unchecked! ---', row['Language_name']

        desc = row.get('Languages_contribution_documentation::Lect description', '')
        markup_desc = normalize_markup(
            row['Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description'])

        c = data.add(
            models.ApicsContribution, row['Language_ID'],
            id=str(row['Order_number']),
            name=row['Language_name'],
            description=desc,
            markup_description=markup_desc,
            survey_reference=data['Source'][row['Survey_reference_ID']],
            language=lect)

        for ext, label, mtype in [
            ('pdf', 'Glossed text', 'application/pdf'),
            ('mp3', 'Glossed text audio', 'audio/mpeg'),
        ]:
            fid = '%s-gt.%s' % (c.id, ext)
            if args.data_file('files', 'contribution', c.id, fid).exists():
                common.Contribution_files(object=c, id=fid, name=label, mime_type=mtype)
            else:
                print label, 'missing for:', row['Language_name']

        #
        # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE
        #
        iso = None
        if row['ISO_code'] and len(row['ISO_code']) == 3:
            iso = row['ISO_code'].lower()
            if 'iso:%s' % row['ISO_code'] not in data['Identifier']:
                data.add(
                    common.Identifier, 'iso:%s' % row['ISO_code'],
                    id=row['ISO_code'].lower(),
                    name=row['ISO_code'].lower(),
                    type=common.IdentifierType.iso.value)
            DBSession.add(common.LanguageIdentifier(
                language=data['Lect'][row['Language_ID']],
                identifier=data['Identifier']['iso:%s' % row['ISO_code']]))

        if lect.id in glottocodes:
            identifier = data.add(
                common.Identifier, 'gc:%s' % glottocodes[lect.id],
                id=glottocodes[lect.id],
                name=glottocodes[lect.id],
                type=common.IdentifierType.glottolog.value)
            DBSession.add(common.LanguageIdentifier(
                language=data['Lect'][row['Language_ID']],
                identifier=identifier))

        if row['Language_name_ethnologue']:
            if row['Language_name_ethnologue'] not in data['Identifier']:
                data.add(
                    common.Identifier, row['Language_name_ethnologue'],
                    id=iso or 'ethnologue:%s' % row['Language_name_ethnologue'],
                    name=row['Language_name_ethnologue'],
                    type='ethnologue')
            DBSession.add(common.LanguageIdentifier(
                language=data['Lect'][row['Language_ID']],
                identifier=data['Identifier'][row['Language_name_ethnologue']]))

    example_count = {}
    for row in read(args, 'Examples', 'Order_number'):
        assert row['Language_ID']
        lang = data['Lect'][row['Language_ID']]
        id_ = '%(Language_ID)s-%(Example_number)s' % row
        atext, gloss = igt(row)
        example_count[row['Language_ID']] = max(
            [example_count.get(row['Language_ID'], 1), row['Example_number']])
        p = add_sentence(
            args, data, id_,
            id='%s-%s' % (lang.id, row['Example_number']),
            name=row['Text'] or row['Analyzed_text'],
            description=row['Translation'],
            type=row['Type'].strip().lower() if row['Type'] else None,
            comment=row['Comments'],
            gloss=gloss,
            analyzed=atext,
            markup_text=normalize_markup(row['z_calc_Text_CSS']),
            markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']),
            markup_comment=normalize_markup(row['z_calc_Comments_CSS']),
            markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']),
            original_script=row['Original_script'],
            jsondata={
                'sort': row['Order_number'],
                'alt_translation': (row['Translation_other'] or '').strip() or None},
            language=lang)

        if row['Reference_ID']:
            if row['Reference_ID'] in data['Source']:
                source = data['Source'][row['Reference_ID']]
                DBSession.add(common.SentenceReference(
                    sentence=p,
                    source=source,
                    key=source.id,
                    description=row['Reference_pages']))
            else:
                p.source = non_bibs[row['Reference_ID']]
    DBSession.flush()

    for row in read(args, 'Language_references'):
        if row['Reference_ID'] not in data['Source']:
            assert row['Reference_ID'] in non_bibs
            continue
        assert row['Language_ID'] in data['ApicsContribution']
        source = data['Source'][row['Reference_ID']]
        DBSession.add(common.ContributionReference(
            contribution=data['ApicsContribution'][row['Language_ID']],
            source=source,
            description=row['Pages'],
            key=source.id))

    #
    # global counter for features - across feature types
    #
    feature_count = 0
    for row in read(args, 'Features', 'Feature_number'):
        id_ = str(row['Feature_number'])
        if int(id_) > feature_count:
            feature_count = int(id_)
        wals_id = None
        desc = row['Feature_annotation_publication']
        if row['WALS_match'] == 'Total':
            if isinstance(row['WALS_No.'], int):
                wals_id = row['WALS_No.']
            else:
                wals_id = int(row['WALS_No.'].split('.')[0].strip())

        p = data.add(
            models.Feature, row['Feature_code'],
            name=row['Feature_name'],
            id=id_,
            description=desc,
            markup_description=normalize_markup(
                row['z_calc_Feature_annotation_publication_CSS']),
            feature_type='primary',
            multivalued=row['Value_relation_type'] != 'Single',
            area=row['Feature_area'],
            wals_id=wals_id)

        names = {}
        for i in range(1, 10):
            if not row['Value%s_publication' % i] \
                    or not row['Value%s_publication' % i].strip():
                continue
            name = row['Value%s_publication' % i].strip()
            if name in names:
                name += ' (%s)' % i
            names[name] = 1
            de = data.add(
                common.DomainElement, '%s-%s' % (row['Feature_code'], i),
                id='%s-%s' % (id_, i),
                name=name,
                parameter=p,
                abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name,
                number=int(row['Value%s_value_number_for_publication' % i]),
                jsondata={'color': colors[row['Value_%s_colour_ID' % i]]},
            )
            assert de

        if row['Authors_FeatureArticles']:
            authors, _ = row['Authors_FeatureArticles'].split('and the APiCS')
            authors = authors.strip()
            if authors.endswith(','):
                authors = authors[:-1].strip()
            for i, name in enumerate(authors.split(',')):
                assert name.strip() in editors
                p._authors.append(models.FeatureAuthor(
                    ord=i + 1, contributor=editors[name.strip()]))

        DBSession.flush()

    primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41}
    segment_to_primary = dict(zip(
        primary_to_segment.values(), primary_to_segment.keys()))
    number_map = {}
    names = {}
    for row in read(args, 'Segment_features', 'Order_number'):
        symbol = row['Segment_symbol']
        if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate':
            symbol = 't\u0361s'
        truth = lambda s: s and s.strip().lower() == 'yes'
        name = '%s - %s' % (symbol, row['Segment_name'])

        if name in names:
            number_map[row['Segment_feature_number']] = names[name]
            continue

        number_map[row['Segment_feature_number']] = row['Segment_feature_number']
        names[name] = row['Segment_feature_number']
        feature_count += 1
        if row['Segment_feature_number'] in segment_to_primary:
            primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\
                = str(feature_count)
        p = data.add(
            models.Feature, row['Segment_feature_number'],
            name=name,
            id=str(feature_count),
            feature_type='segment',
            area='Vowels' if truth(row['Vowel']) else (
                'Obstruent consonants'
                if truth(row['Obstruent']) else 'Sonorant consonants'),
            jsondata=dict(
                number=int(row['Segment_feature_number']),
                vowel=truth(row['Vowel']),
                consonant=truth(row['Consonant']),
                obstruent=truth(row['Obstruent']),
                core_list=truth(row['Core_list_segment']),
                symbol=symbol,
            ))

        for i, spec in SEGMENT_VALUES.items():
            data.add(
                common.DomainElement,
                '%s-%s' % (row['Segment_feature_number'], spec[0]),
                id='%s-%s' % (p.id, i),
                name=spec[0],
                parameter=p,
                jsondata={'color': spec[1]},
                number=i)

    print '--> remapped:', primary_to_segment
    DBSession.flush()

    for row in read(args, 'Sociolinguistic_features', 'Sociolinguistic_feature_number'):
        feature_count += 1
        p = data.add(
            models.Feature, row['Sociolinguistic_feature_code'],
            name=row['Sociolinguistic_feature_name'],
            id='%s' % feature_count,
            description=row['Sociolinguistic_feature_annotation'],
            area='Sociolinguistic',
            feature_type='sociolinguistic')

        names = {}
        for i in range(1, 10):
            id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i)
            if row.get('Value%s' % i) and row['Value%s' % i].strip():
                name = row['Value%s' % i].strip()
                if name in names:
                    name += ' (%s)' % i
                names[name] = 1
            else:
                continue
            kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i)
            data.add(
                common.DomainElement, id_,
                id='%s-%s' % (p.id, i),
                name=name,
                parameter=p,
                number=i,
                jsondata={'color': colors.get(
                    row['Value%s_colour_ID' % i], colors.values()[i])})

    sd = {}
    for row in read(args, 'Segment_data'):
        if row['Segment_feature_number'] not in number_map:
            continue
        number = number_map[row['Segment_feature_number']]

        if not row['Presence_in_the_language']:
            continue

        lang = data['Lect'][row['Language_ID']]
        param = data['Feature'][number]
        id_ = '%s-%s' % (lang.id, param.id)
        if id_ in sd:
            assert row['c_Record_is_a_duplicate'] == 'Yes'
            continue
        sd[id_] = 1
        valueset = data.add(
            common.ValueSet,
            id_,
            id=id_,
            parameter=param,
            language=lang,
            contribution=data['ApicsContribution'][row['Language_ID']],
            description=row['Comments'],
            markup_description=normalize_markup(row['z_calc_Comments_CSS']),
        )
        v = data.add(
            common.Value,
            id_,
            id=id_,
            frequency=float(100),
            valueset=valueset,
            domainelement=data['DomainElement']['%s-%s' % (
                number, row['Presence_in_the_language'])],
        )
        if row['Example_word'] and row['Example_word_gloss']:
            example_count[row['Language_ID']] += 1
            p = add_sentence(
                args, data, '%s-p%s' % (lang.id, data['Feature'][number].id),
                id='%s-%s' % (lang.id, example_count[row['Language_ID']]),
                name=row['Example_word'],
                description=row['Example_word_gloss'],
                language=lang)
            DBSession.add(common.ValueSentence(value=v, sentence=p))

        source = data['Source'].get(row['Refers_to_references_Reference_ID'])
        if source:
            DBSession.add(common.ValueSetReference(
                valueset=valueset, source=source, key=source.id))
        elif row['Refers_to_references_Reference_ID'] in non_bibs:
            valueset.source = non_bibs[row['Refers_to_references_Reference_ID']]

    lects = defaultdict(lambda: 1)
    lect_map = {}
    records = {}
    false_values = {}
    no_values = {}
    wals_value_number = {}
    for row in read(args, 'wals'):
        if row['z_calc_WALS_value_number']:
            wals_value_number[row['Data_record_id']] = row['z_calc_WALS_value_number']

    def prefix(attr, _prefix):
        if _prefix:
            return '%s_%s' % (_prefix, attr)
        return attr.capitalize()

    for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]:
        num_values = 10
        for row in read(args, prefix('data', _prefix)):
            if not row[prefix('feature_code', _prefix)]:
                print('no associated feature for', prefix('data', _prefix),
                      row[prefix('data_record_id', _prefix)])
                continue

            lid = row['Language_ID']
            lect_attr = row.get('Lect_attribute', 'my default lect').lower()
            if lect_attr != 'my default lect':
                if (row['Language_ID'], row['Lect_attribute']) in lect_map:
                    lid = lect_map[(row['Language_ID'], row['Lect_attribute'])]
                else:
                    lang = data['Lect'][row['Language_ID']]
                    c = lects[row['Language_ID']]
                    lid = '%s-%s' % (row['Language_ID'], c)
                    kw = dict(
                        name='%s (%s)' % (lang.name, row['Lect_attribute']),
                        id='%s' % (1000 + 10 * int(lang.id) + c),
                        latitude=lang.latitude,
                        longitude=lang.longitude,
                        description=row['Lect_attribute'],
                        language=lang,
                    )
                    data.add(models.Lect, lid, **kw)
                    lects[row['Language_ID']] += 1
                    lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid

            id_ = abbr + str(row[prefix('data_record_id', _prefix)])
            assert id_ not in records
            records[id_] = 1

            assert row[prefix('feature_code', _prefix)] in data['Feature']
            language = data['Lect'][lid]
            parameter = data['Feature'][row[prefix('feature_code', _prefix)]]
            valueset = common.ValueSet(
                id='%s-%s' % (language.id, parameter.id),
                description=row['Comments_on_value_assignment'],
                markup_description=normalize_markup(
                    row.get('z_calc_Comments_on_value_assignment_CSS')),
            )
            values_found = {}
            for i in range(1, num_values):
                if not row['Value%s_true_false' % i]:
                    continue

                if row['Value%s_true_false' % i].strip().lower() != 'true':
                    assert row['Value%s_true_false' % i].strip().lower() == 'false'
                    false_values[row[prefix('data_record_id', _prefix)]] = 1
                    continue

                iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i)
                if iid not in data['DomainElement']:
                    print(iid, row[prefix('data_record_id', _prefix)],
                          '--> no domainelement!')
                    continue
                values_found['%s-%s' % (id_, i)] = dict(
                    id='%s-%s' % (valueset.id, i),
                    domainelement=data['DomainElement']['%s-%s' % (
                        row[prefix('feature_code', _prefix)], i)],
                    confidence=row['Value%s_confidence' % i],
                    frequency=float(row['c_V%s_frequency_normalised' % i])
                    if _prefix == '' else 100)

            if values_found:
                if row[prefix('data_record_id', _prefix)] in wals_value_number:
                    valueset.jsondata = {
                        'wals_value_number': wals_value_number.pop(
                            row[prefix('data_record_id', _prefix)])}
                valueset.parameter = parameter
                valueset.language = language
                valueset.contribution = data['ApicsContribution'][row['Language_ID']]
                valueset = data.add(common.ValueSet, id_, _obj=valueset)
                for i, item in enumerate(values_found.items()):
                    if i > 0 and not parameter.multivalued:
                        print 'multiple values for single-valued parameter: %s' % id_
                        break
                    id_, kw = item
                    kw['valueset'] = valueset
                    value = data.add(common.Value, id_, **kw)

                #
                # store references to additional data for segments which should be reused
                # for corresponding primary features!
                #
                if int(parameter.id) in primary_to_segment:
                    assert len(values_found) == 1
                    seg_id = '%s-%s' % (
                        language.id, primary_to_segment[int(parameter.id)])
                    seg_valueset = data['ValueSet'][seg_id]
                    seg_value = data['Value'][seg_id]
                    if not valueset.description and seg_valueset.description:
                        valueset.description = seg_valueset.description

                    for s in seg_value.sentence_assocs:
                        DBSession.add(
                            common.ValueSentence(value=value, sentence=s.sentence))

                    for r in seg_valueset.references:
                        DBSession.add(common.ValueSetReference(
                            valueset=valueset, source=r.source, key=r.key))

                    if not valueset.source and seg_valueset.source:
                        valueset.source = seg_valueset.source

                DBSession.flush()
            else:
                no_values[id_] = 1

    DBSession.flush()

    for prefix, abbr, num_values in [
        ('D', '', 10),
        ('Sociolinguistic_d', 'sl', 7),
    ]:
        for row in read(args, prefix + 'ata_references'):
            assert row['Reference_ID'] in data['Source'] \
                or row['Reference_ID'] in non_bibs
            try:
                vs = data['ValueSet'][abbr + str(row[prefix + 'ata_record_id'])]
                if row['Reference_ID'] in data['Source']:
                    source = data['Source'][row['Reference_ID']]
                    DBSession.add(common.ValueSetReference(
                        valueset=vs,
                        source=source,
                        key=source.id,
                        description=row['Pages'],
                    ))
                else:
                    if vs.source:
                        vs.source += '; ' + non_bibs[row['Reference_ID']]
                    else:
                        vs.source = non_bibs[row['Reference_ID']]
            except KeyError:
                continue

    DBSession.flush()

    missing = 0
    for row in read(args, 'Value_examples'):
        try:
            DBSession.add(common.ValueSentence(
                value=data['Value']['%(Data_record_id)s-%(Value_number)s' % row],
                sentence=data['Sentence']['%(Language_ID)s-%(Example_number)s' % row],
                description=row['Notes'],
            ))
        except KeyError:
            missing += 1
    print('%s Value_examples are missing data' % missing)

    print('%s data sets with false values' % len(false_values))
    print('%s data sets without values' % len(no_values))
    for k, v in wals_value_number.items():
        print 'unclaimed wals value number:', k, v

    for i, row in enumerate(read(args, 'Contributors')):
        kw = dict(
            contribution=data['ApicsContribution'][row['Language ID']],
            contributor=data['Contributor'][row['Author ID']]
        )
        if row['Order_of_appearance']:
            kw['ord'] = int(float(row['Order_of_appearance']))
        data.add(common.ContributionContributor, i, **kw)

    DBSession.flush()
def import_dataset(path, data, icons):
    # look for metadata
    # look for sources
    # then loop over values
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    contrib = Contribution(id=basename, name=basename)

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor, contributor_id, id=contributor_id, name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    for i, row in enumerate(reader(
            path, dicts=True, quoting=csv.QUOTE_NONE,
            delimiter=',' if 'c' in ext else '\t')):
        if not row['Value'] or not row['Feature_ID']:
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            print('skip value for invalid feature %s' % row['Feature_ID'])
            continue
        #parameter = data.add(
        #    Feature, row['Feature_ID'], id=row['Feature_ID'],
        #    name=row.get('Feature', row['Feature_ID']))

        language = data['GrambankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            languoid = glottolog.languoid(row['Language_ID'])
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']

            language = data.add(
                GrambankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        domain = {de.abbr: de for de in parameter.domain}
        name = row['Value']
        if name in domain:
            name = domain[name].name

        Value(
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        for key, src in data['Source'].items():
            if key in vs.source:
                ValueSetReference(valueset=vs, source=src, key=key)
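# A sketch of one data row the importer above expects from the CSV/TSV file
# (column names inferred from the keys accessed; values are placeholders).
IMPORT_ROW = {
    'ID': 'dataset-1',          # optional; otherwise '<basename>-<rownumber>' is used
    'Language_ID': 'stan1293',  # glottocode, also used to query Glottolog
    'Feature_ID': 'GB020',
    'Value': '1',               # matched against the feature's domain abbreviations
    'Source': 'Smith2001',      # substring-matched against bibliography keys
    'Comment': '',
}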
def update_reflang(args):
    stats = Counter()
    brugmann_noderefs = jsonload(args.data_dir.joinpath('languoid_refs.json'))

    languoid_map = {}
    for l in DBSession.query(Languoid).options(joinedload_all(
            Language.languageidentifier, LanguageIdentifier.identifier)):
        if l.hid:
            languoid_map[l.hid] = l.pk
        elif l.iso_code:
            languoid_map[l.iso_code] = l.pk
        languoid_map[l.id] = l.pk

    lgcodes = {}
    for rec in get_bib(args):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    for ref in page_query(
            DBSession.query(Ref).order_by(desc(Source.pk)),
            n=10000,
            commit=True,
            verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith('Change Request Number '):
            stats.update(['ignored'])
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            stats.update(['obsolete'])
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        # FIXME: adapt this for bib-entries now referring to glottocodes of
        # families/dialects (e.g. add a sticky-bit to languagesource)
        langs = [
            l for l in ref.languages
            if (l.level != LanguoidLevel.language or not l.active) and l.pk not in remove]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    args.log.warn('brugmann relation for non-existing languoid %s' % lpk)

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                stats.update([code])
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                print(ref.name, ref.id, '--', l.name, l.id)
                print('relation removed according to brugmann data')
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            stats.update(['changed'])

    args.log.info('%s' % stats)
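# languoid_refs.json is assumed to map reference pks to languoid pks in two
# directions, matching the lookups above (pks below are placeholders):
#
#   {
#     "delete": {"<ref_pk>": [<languoid_pk>, ...]},   # relations to drop
#     "create": {"<ref_pk>": [<languoid_pk>, ...]}    # relations to add
#   }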
def prime_cache(args):
    #
    # TODO: relate survey chapter reference with language!
    #
    icons = {}
    frequencies = {}

    args.log.info('computing wals representation')
    for feature in DBSession.query(common.Parameter).options(
            joinedload(common.Parameter.valuesets)):
        feature.representation = len(feature.valuesets)
        if feature.wals_id:
            data = jsonload(path(apics.__file__).dirname().joinpath(
                'static', 'wals', '%sA.json' % feature.wals_id))
            feature.wals_representation = sum(
                [len(l['features']) for l in data['layers']])

    args.log.info('computing language sources')
    compute_language_sources((common.ContributionReference, 'contribution'))
    compute_number_of_values()

    for valueset in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == '0')\
            .options(joinedload(common.ValueSet.language)):
        if valueset.language.language_pk:
            continue
        if len(valueset.values) > 1:
            valueset.language.lexifier = 'Other'
        else:
            if valueset.values[0].domainelement.name == 'Other':
                valueset.language.lexifier = 'Other'
            else:
                valueset.language.lexifier \
                    = valueset.values[0].domainelement.name.replace('-based', '')
        for lect in valueset.language.lects:
            lect.lexifier = valueset.language.lexifier

    args.log.info('creating icons')
    for valueset in DBSession.query(common.ValueSet).options(
            joinedload(common.ValueSet.parameter),
            joinedload_all(common.ValueSet.values, common.Value.domainelement)):
        values = sorted(list(valueset.values), key=lambda v: v.domainelement.number)
        assert abs(sum(v.frequency for v in values) - 100) < 1
        fracs = []
        colors = []

        for v in values:
            color = v.domainelement.jsondata['color']
            frequency = round(v.frequency)
            assert frequency

            if frequency not in frequencies:
                figure(figsize=(0.4, 0.4))
                axes([0.1, 0.1, 0.8, 0.8])
                coll = pie((int(100 - frequency), frequency), colors=('w', 'k'))
                coll[0][0].set_linewidth(0.5)
                assert icons_dir.joinpath('freq-%s.png' % frequency).exists()
                frequencies[frequency] = True

            v.jsondata = {'frequency_icon': 'freq-%s.png' % frequency}
            fracs.append(frequency)
            colors.append(color)
            v.domainelement.jsondata = {
                'color': color, 'icon': 'pie-100-%s.png' % color}

        assert len(colors) == len(set(colors))
        fracs, colors = tuple(fracs), tuple(colors)

        basename = 'pie-'
        basename += '-'.join('%s-%s' % (f, c) for f, c in zip(fracs, colors))
        valueset.update_jsondata(icon=basename + '.png')
        if (fracs, colors) not in icons:
            figure(figsize=(0.4, 0.4))
            axes([0.1, 0.1, 0.8, 0.8])
            coll = pie(
                tuple(reversed(fracs)),
                colors=['#' + _color for _color in reversed(colors)])
            for wedge in coll[0]:
                wedge.set_linewidth(0.5)
            assert icons_dir.joinpath('%s.png' % basename).exists()
            icons[(fracs, colors)] = True
            assert icons_dir.joinpath(basename + '.svg').exists()

    for de in DBSession.query(common.DomainElement):
        if not de.jsondata.get('icon'):
            de.update_jsondata(icon='pie-100-%s.png' % de.jsondata['color'])

    gbs_func('update', args)
def main(args): data = Data() editors = OrderedDict() editors['Susanne Maria Michaelis'] = None editors['Philippe Maurer'] = None editors['Martin Haspelmath'] = None editors['Magnus Huber'] = None for row in read(args, 'People'): name = row['First name'] + ' ' if row['First name'] else '' name += row['Last name'] kw = dict( name=name, id=slug('%(Last name)s%(First name)s' % row), url=row['Contact Website'].split()[0] if row['Contact Website'] else None, address=row['Comments on database'], ) contrib = data.add(common.Contributor, row['Author ID'], **kw) if kw['name'] in editors: editors[kw['name']] = contrib DBSession.flush() dataset = common.Dataset( id='apics', name='APiCS Online', description='Atlas of Pidgin and Creole Language Structures Online', domain='apics-online.info', published=date(2013, 11, 4), license='http://creativecommons.org/licenses/by/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 3.0 Unported License'}) DBSession.add(dataset) for i, editor in enumerate(editors.values()): common.Editor(dataset=dataset, contributor=editor, ord=i + 1) colors = dict((row['ID'], row['RGB_code']) for row in read(args, 'Colours')) abbrs = {} for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for id_, name in { 'C**T': 'clitic', 'IMPF': 'imperfect', 'INTERM': 'intermediate', 'NCOMPL': 'noncompletive', 'NONFUT': 'nonfuture', 'NPROX': 'nonproximal', 'NSG': 'nonsingular', 'PP': 'past participle', 'PROP': 'proprietive', 'TMA': 'tense-mood-aspect', }.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for row in reader( args.data_file('non-lgr-gloss-abbrs.csv'), delimiter=',', namedtuples=True): for match in GLOSS_ABBR_PATTERN.finditer(row.standard): if match.group('abbr') not in abbrs: abbrs[match.group('abbr')] = 1 DBSession.add( common.GlossAbbreviation(id=match.group('abbr'), name=row.meaning)) non_bibs = {} for row in read(args, 'References', 'Reference_ID'): if row['Reference_type'] == 'Non-bib': non_bibs[row['Reference_ID']] = row['Reference_name'] continue if isinstance(row['Year'], int): year_int = row['Year'] year = str(row['Year']) elif row['Year']: year_int = None for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']): year_int = int(m.group('year')) break year = row['Year'] else: year, year_int = None, None title = row['Article_title'] or row['Book_title'] attrs = {} jsondata = {} for attr, field in { 'Additional_information': 'note', 'Article_title': 'title', 'Book_title': 'booktitle', 'City': 'address', 'Editors': 'editor', 'Full_reference': None, 'Issue': None, 'Journal': 'journal', 'Language_codes': None, 'LaTeX_cite_key': None, 'Pages': 'pages', 'Publisher': 'publisher', 'Reference_type': 'type', 'School': 'school', 'Series_title': 'series', 'URL': 'url', 'Volume': 'volume', }.items(): value = row.get(attr) if not isinstance(value, int): value = (value or '').strip() if attr == 'Issue' and value: try: value = str(int(value)) except ValueError: pass if value: if field: attrs[field] = value else: jsondata[attr] = value p = data.add( common.Source, row['Reference_ID'], id=str(row['Reference_ID']), name=row['Reference_name'], description=title, author=row['Authors'], year=year, year_int=year_int, bibtex_type=getattr(EntryType, row['BibTeX_type'] or 'misc'), jsondata=jsondata, **attrs) if p.bibtex_type.value == 'misc' and not p.description: p.description = p.note DBSession.flush() DBSession.flush() infobox = 
jsonload(args.data_file('infobox.json')) glottocodes = jsonload(args.data_file('glottocodes.json')) for row in read(args, 'Languages', 'Order_number'): lon, lat = [float(c.strip()) for c in row['map_coordinates'].split(',')] kw = dict( name=row['Language_name'], id=str(row['Order_number']), latitude=lat, longitude=lon, region=row['Category_region'], ) lect = data.add(models.Lect, row['Language_ID'], **kw) DBSession.flush() for i, item in enumerate(infobox[lect.id]): DBSession.add(common.Language_data( object_pk=lect.pk, ord=i, key=item[0], value=item[1])) if row["Languages_contribution_documentation::Lect_description_checked_status"] \ != "Checked": print 'unchecked! ---', row['Language_name'] desc = row.get('Languages_contribution_documentation::Lect description', '') markup_desc = normalize_markup( row['Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description']) c = data.add( models.ApicsContribution, row['Language_ID'], id=str(row['Order_number']), name=row['Language_name'], description=desc, markup_description=markup_desc, survey_reference=data['Source'][row['Survey_reference_ID']], language=lect) for ext, label, mtype in [ ('pdf', 'Glossed text', 'application/pdf'), ('mp3', 'Glossed text audio', 'audio/mpeg'), ]: fid = '%s-gt.%s' % (c.id, ext) if args.data_file('files', 'contribution', c.id, fid).exists(): common.Contribution_files(object=c, id=fid, name=label, mime_type=mtype) else: print label, 'missing for:', row['Language_name'] # # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE # iso = None if row['ISO_code'] and len(row['ISO_code']) == 3: iso = row['ISO_code'].lower() if 'iso:%s' % row['ISO_code'] not in data['Identifier']: data.add( common.Identifier, 'iso:%s' % row['ISO_code'], id=row['ISO_code'].lower(), name=row['ISO_code'].lower(), type=common.IdentifierType.iso.value) DBSession.add(common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier']['iso:%s' % row['ISO_code']])) if lect.id in glottocodes: identifier = data.add( common.Identifier, 'gc:%s' % glottocodes[lect.id], id=glottocodes[lect.id], name=glottocodes[lect.id], type=common.IdentifierType.glottolog.value) DBSession.add(common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=identifier)) if row['Language_name_ethnologue']: if row['Language_name_ethnologue'] not in data['Identifier']: data.add( common.Identifier, row['Language_name_ethnologue'], id=iso or 'ethnologue:%s' % row['Language_name_ethnologue'], name=row['Language_name_ethnologue'], type='ethnologue') DBSession.add(common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier'][row['Language_name_ethnologue']])) example_count = {} for row in read(args, 'Examples', 'Order_number'): assert row['Language_ID'] lang = data['Lect'][row['Language_ID']] id_ = '%(Language_ID)s-%(Example_number)s' % row atext, gloss = igt(row) example_count[row['Language_ID']] = max( [example_count.get(row['Language_ID'], 1), row['Example_number']]) p = add_sentence( args, data, id_, id='%s-%s' % (lang.id, row['Example_number']), name=row['Text'] or row['Analyzed_text'], description=row['Translation'], type=row['Type'].strip().lower() if row['Type'] else None, comment=row['Comments'], gloss=gloss, analyzed=atext, markup_text=normalize_markup(row['z_calc_Text_CSS']), markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']), markup_comment=normalize_markup(row['z_calc_Comments_CSS']), 
            markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']),
            original_script=row['Original_script'],
            jsondata={
                'sort': row['Order_number'],
                'alt_translation': (row['Translation_other'] or '').strip() or None},
            language=lang)

        if row['Reference_ID']:
            if row['Reference_ID'] in data['Source']:
                source = data['Source'][row['Reference_ID']]
                DBSession.add(common.SentenceReference(
                    sentence=p,
                    source=source,
                    key=source.id,
                    description=row['Reference_pages']))
            else:
                p.source = non_bibs[row['Reference_ID']]

    DBSession.flush()

    for row in read(args, 'Language_references'):
        if row['Reference_ID'] not in data['Source']:
            assert row['Reference_ID'] in non_bibs
            continue
        assert row['Language_ID'] in data['ApicsContribution']
        source = data['Source'][row['Reference_ID']]
        DBSession.add(common.ContributionReference(
            contribution=data['ApicsContribution'][row['Language_ID']],
            source=source,
            description=row['Pages'],
            key=source.id))

    #
    # global counter for features - across feature types
    #
    feature_count = 0
    for row in read(args, 'Features', 'Feature_number'):
        id_ = str(row['Feature_number'])
        if int(id_) > feature_count:
            feature_count = int(id_)
        wals_id = None
        desc = row['Feature_annotation_publication']
        if row['WALS_match'] == 'Total':
            if isinstance(row['WALS_No.'], int):
                wals_id = row['WALS_No.']
            else:
                wals_id = int(row['WALS_No.'].split('.')[0].strip())

        p = data.add(
            models.Feature, row['Feature_code'],
            name=row['Feature_name'],
            id=id_,
            description=desc,
            markup_description=normalize_markup(
                row['z_calc_Feature_annotation_publication_CSS']),
            feature_type='primary',
            multivalued=row['Value_relation_type'] != 'Single',
            area=row['Feature_area'],
            wals_id=wals_id)

        names = {}
        for i in range(1, 10):
            if not row['Value%s_publication' % i] \
                    or not row['Value%s_publication' % i].strip():
                continue
            name = row['Value%s_publication' % i].strip()
            if name in names:
                name += ' (%s)' % i
            names[name] = 1
            de = data.add(
                common.DomainElement, '%s-%s' % (row['Feature_code'], i),
                id='%s-%s' % (id_, i),
                name=name,
                parameter=p,
                abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name,
                number=int(row['Value%s_value_number_for_publication' % i]),
                jsondata={'color': colors[row['Value_%s_colour_ID' % i]]},
            )
            assert de

        if row['Authors_FeatureArticles']:
            authors, _ = row['Authors_FeatureArticles'].split('and the APiCS')
            authors = authors.strip()
            if authors.endswith(','):
                authors = authors[:-1].strip()
            for i, name in enumerate(authors.split(',')):
                assert name.strip() in editors
                p._authors.append(models.FeatureAuthor(
                    ord=i + 1, contributor=editors[name.strip()]))

    DBSession.flush()

    primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41}
    segment_to_primary = dict(zip(
        primary_to_segment.values(), primary_to_segment.keys()))
    number_map = {}
    names = {}
    for row in read(args, 'Segment_features', 'Order_number'):
        symbol = row['Segment_symbol']
        if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate':
            symbol = 't\u0361s'
        truth = lambda s: s and s.strip().lower() == 'yes'
        name = '%s - %s' % (symbol, row['Segment_name'])

        if name in names:
            number_map[row['Segment_feature_number']] = names[name]
            continue

        number_map[row['Segment_feature_number']] = row['Segment_feature_number']
        names[name] = row['Segment_feature_number']
        feature_count += 1
        if row['Segment_feature_number'] in segment_to_primary:
            primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\
                = str(feature_count)
        p = data.add(
            models.Feature, row['Segment_feature_number'],
            name=name,
            id=str(feature_count),
            feature_type='segment',
            area='Vowels' if truth(row['Vowel']) else (
                'Obstruent consonants' if truth(row['Obstruent'])
                else 'Sonorant consonants'),
            jsondata=dict(
                number=int(row['Segment_feature_number']),
                vowel=truth(row['Vowel']),
                consonant=truth(row['Consonant']),
                obstruent=truth(row['Obstruent']),
                core_list=truth(row['Core_list_segment']),
                symbol=symbol,
            ))

        for i, spec in SEGMENT_VALUES.items():
            data.add(
                common.DomainElement,
                '%s-%s' % (row['Segment_feature_number'], spec[0]),
                id='%s-%s' % (p.id, i),
                name=spec[0],
                parameter=p,
                jsondata={'color': spec[1]},
                number=i)

    print('--> remapped:', primary_to_segment)
    DBSession.flush()

    for row in read(
            args, 'Sociolinguistic_features', 'Sociolinguistic_feature_number'):
        feature_count += 1
        p = data.add(
            models.Feature, row['Sociolinguistic_feature_code'],
            name=row['Sociolinguistic_feature_name'],
            id='%s' % feature_count,
            description=row['Sociolinguistic_feature_annotation'],
            area='Sociolinguistic',
            feature_type='sociolinguistic')

        names = {}
        for i in range(1, 10):
            id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i)
            if row.get('Value%s' % i) and row['Value%s' % i].strip():
                name = row['Value%s' % i].strip()
                if name in names:
                    name += ' (%s)' % i
                names[name] = 1
            else:
                continue
            kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i)
            data.add(
                common.DomainElement, id_,
                jsondata={'color': colors.get(
                    row['Value%s_colour_ID' % i], list(colors.values())[i])},
                **kw)

    sd = {}
    for row in read(args, 'Segment_data'):
        if row['Segment_feature_number'] not in number_map:
            continue
        number = number_map[row['Segment_feature_number']]

        if not row['Presence_in_the_language']:
            continue

        lang = data['Lect'][row['Language_ID']]
        param = data['Feature'][number]
        id_ = '%s-%s' % (lang.id, param.id)
        if id_ in sd:
            assert row['c_Record_is_a_duplicate'] == 'Yes'
            continue
        sd[id_] = 1
        valueset = data.add(
            common.ValueSet,
            id_,
            id=id_,
            parameter=param,
            language=lang,
            contribution=data['ApicsContribution'][row['Language_ID']],
            description=row['Comments'],
            markup_description=normalize_markup(row['z_calc_Comments_CSS']),
        )
        v = data.add(
            common.Value,
            id_,
            id=id_,
            frequency=float(100),
            valueset=valueset,
            domainelement=data['DomainElement']['%s-%s' % (
                number, row['Presence_in_the_language'])],
        )
        if row['Example_word'] and row['Example_word_gloss']:
            example_count[row['Language_ID']] += 1
            p = add_sentence(
                args, data, '%s-p%s' % (lang.id, data['Feature'][number].id),
                id='%s-%s' % (lang.id, example_count[row['Language_ID']]),
                name=row['Example_word'],
                description=row['Example_word_gloss'],
                language=lang)
            DBSession.add(common.ValueSentence(value=v, sentence=p))

        source = data['Source'].get(row['Refers_to_references_Reference_ID'])
        if source:
            DBSession.add(common.ValueSetReference(
                valueset=valueset, source=source, key=source.id))
        elif row['Refers_to_references_Reference_ID'] in non_bibs:
            valueset.source = non_bibs[row['Refers_to_references_Reference_ID']]

    lects = defaultdict(lambda: 1)
    lect_map = {}
    records = {}
    false_values = {}
    no_values = {}
    wals_value_number = {}
    for row in read(args, 'wals'):
        if row['z_calc_WALS_value_number']:
            wals_value_number[row['Data_record_id']] = row['z_calc_WALS_value_number']

    def prefix(attr, _prefix):
        if _prefix:
            return '%s_%s' % (_prefix, attr)
        return attr.capitalize()

    for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]:
        num_values = 10
        for row in read(args, prefix('data', _prefix)):
            if not row[prefix('feature_code', _prefix)]:
                print('no associated feature for', prefix('data', _prefix),
                      row[prefix('data_record_id', _prefix)])
                continue

            lid = row['Language_ID']
            lect_attr = row.get('Lect_attribute', 'my default lect').lower()
            if lect_attr != 'my default lect':
                if (row['Language_ID'], row['Lect_attribute']) in lect_map:
                    lid = lect_map[(row['Language_ID'], row['Lect_attribute'])]
                else:
                    lang = data['Lect'][row['Language_ID']]
                    c = lects[row['Language_ID']]
                    lid = '%s-%s' % (row['Language_ID'], c)
                    kw = dict(
                        name='%s (%s)' % (lang.name, row['Lect_attribute']),
                        id='%s' % (1000 + 10 * int(lang.id) + c),
                        latitude=lang.latitude,
                        longitude=lang.longitude,
                        description=row['Lect_attribute'],
                        language=lang,
                    )
                    data.add(models.Lect, lid, **kw)
                    lects[row['Language_ID']] += 1
                    lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid

            id_ = abbr + str(row[prefix('data_record_id', _prefix)])
            assert id_ not in records
            records[id_] = 1

            assert row[prefix('feature_code', _prefix)] in data['Feature']
            language = data['Lect'][lid]
            parameter = data['Feature'][row[prefix('feature_code', _prefix)]]
            valueset = common.ValueSet(
                id='%s-%s' % (language.id, parameter.id),
                description=row['Comments_on_value_assignment'],
                markup_description=normalize_markup(
                    row.get('z_calc_Comments_on_value_assignment_CSS')),
            )
            values_found = {}
            for i in range(1, num_values):
                if not row['Value%s_true_false' % i]:
                    continue

                if row['Value%s_true_false' % i].strip().lower() != 'true':
                    assert row['Value%s_true_false' % i].strip().lower() == 'false'
                    false_values[row[prefix('data_record_id', _prefix)]] = 1
                    continue

                iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i)
                if iid not in data['DomainElement']:
                    print(iid, row[prefix('data_record_id', _prefix)],
                          '--> no domainelement!')
                    continue
                values_found['%s-%s' % (id_, i)] = dict(
                    id='%s-%s' % (valueset.id, i),
                    domainelement=data['DomainElement']['%s-%s' % (
                        row[prefix('feature_code', _prefix)], i)],
                    confidence=row['Value%s_confidence' % i],
                    frequency=float(row['c_V%s_frequency_normalised' % i])
                    if _prefix == '' else 100)

            if values_found:
                if row[prefix('data_record_id', _prefix)] in wals_value_number:
                    valueset.jsondata = {
                        'wals_value_number': wals_value_number.pop(
                            row[prefix('data_record_id', _prefix)])}
                valueset.parameter = parameter
                valueset.language = language
                valueset.contribution = data['ApicsContribution'][row['Language_ID']]
                valueset = data.add(common.ValueSet, id_, _obj=valueset)

                for i, item in enumerate(values_found.items()):
                    if i > 0 and not parameter.multivalued:
                        print('multiple values for single-valued parameter: %s' % id_)
                        break
                    id_, kw = item
                    kw['valueset'] = valueset
                    value = data.add(common.Value, id_, **kw)

                #
                # store references to additional data for segments which should be
                # reused for corresponding primary features!
                #
                if int(parameter.id) in primary_to_segment:
                    assert len(values_found) == 1
                    seg_id = '%s-%s' % (
                        language.id, primary_to_segment[int(parameter.id)])
                    seg_valueset = data['ValueSet'][seg_id]
                    seg_value = data['Value'][seg_id]
                    if not valueset.description and seg_valueset.description:
                        valueset.description = seg_valueset.description

                    for s in seg_value.sentence_assocs:
                        DBSession.add(
                            common.ValueSentence(value=value, sentence=s.sentence))

                    for r in seg_valueset.references:
                        DBSession.add(common.ValueSetReference(
                            valueset=valueset, source=r.source, key=r.key))

                    if not valueset.source and seg_valueset.source:
                        valueset.source = seg_valueset.source

                DBSession.flush()
            else:
                no_values[id_] = 1

    DBSession.flush()

    for prefix, abbr, num_values in [
        ('D', '', 10),
        ('Sociolinguistic_d', 'sl', 7),
    ]:
        for row in read(args, prefix + 'ata_references'):
            assert row['Reference_ID'] in data['Source'] \
                or row['Reference_ID'] in non_bibs
            try:
                vs = data['ValueSet'][abbr + str(row[prefix + 'ata_record_id'])]
                if row['Reference_ID'] in data['Source']:
                    source = data['Source'][row['Reference_ID']]
                    DBSession.add(common.ValueSetReference(
                        valueset=vs,
                        source=source,
                        key=source.id,
                        description=row['Pages'],
                    ))
                else:
                    if vs.source:
                        vs.source += '; ' + non_bibs[row['Reference_ID']]
                    else:
                        vs.source = non_bibs[row['Reference_ID']]
            except KeyError:
                continue

    DBSession.flush()

    missing = 0
    for row in read(args, 'Value_examples'):
        try:
            DBSession.add(common.ValueSentence(
                value=data['Value']['%(Data_record_id)s-%(Value_number)s' % row],
                sentence=data['Sentence']['%(Language_ID)s-%(Example_number)s' % row],
                description=row['Notes'],
            ))
        except KeyError:
            missing += 1
    print('%s Value_examples are missing data' % missing)

    print('%s data sets with false values' % len(false_values))
    print('%s data sets without values' % len(no_values))
    for k, v in wals_value_number.items():
        print('unclaimed wals value number:', k, v)

    for i, row in enumerate(read(args, 'Contributors')):
        kw = dict(
            contribution=data['ApicsContribution'][row['Language ID']],
            contributor=data['Contributor'][row['Author ID']]
        )
        if row['Order_of_appearance']:
            kw['ord'] = int(float(row['Order_of_appearance']))
        data.add(common.ContributionContributor, i, **kw)

    DBSession.flush()
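The Data/Sociolinguistic_data loop above drives both feature types through one code path by rewriting generic column names with the small prefix() helper. A minimal standalone sketch of that mapping (the helper is restated here only so the snippet runs on its own):

def prefix(attr, _prefix):
    # restated from the loader above, for illustration only
    if _prefix:
        return '%s_%s' % (_prefix, attr)
    return attr.capitalize()

# plain features are read from the 'Data' table ...
assert prefix('data', '') == 'Data'
assert prefix('feature_code', '') == 'Feature_code'
# ... sociolinguistic features from 'Sociolinguistic_data'
assert prefix('data', 'Sociolinguistic') == 'Sociolinguistic_data'
assert prefix('feature_code', 'Sociolinguistic') == 'Sociolinguistic_feature_code'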
def import_dataset(path, provider):
    # look for metadata
    # look for sources
    # then loop over values
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    mdpath = path + "-metadata.json"
    assert os.path.exists(mdpath)
    md = jsonload(mdpath)
    md, parameters = md["properties"], md["parameters"]

    cname = md["name"]
    if "id" in md:
        cname = "%s [%s]" % (cname, md["id"])
    contrib = Wordlist(id=basename, name=cname)
    contributors = md.get("typedby", md.get("contributors"))
    if contributors:
        contributor_name = HumanName(contributors)
        contributor_id = slug(contributor_name.last + contributor_name.first)
        contributor = Contributor.get(contributor_id, default=None)
        if not contributor:
            contributor = Contributor(id=contributor_id, name="%s" % contributor_name)
        DBSession.add(ContributionContributor(
            contribution=contrib, contributor=contributor))

    # bibpath = os.path.join(dirpath, basename + '.bib')
    # if os.path.exists(bibpath):
    #     for rec in Database.from_file(bibpath):
    #         if rec['key'] not in data['Source']:
    #             data.add(Source, rec['key'], _obj=bibtex2source(rec))

    data = Data()
    concepts = {p.id: p for p in DBSession.query(Concept)}
    language = None

    for i, row in enumerate(reader(path, dicts=True, delimiter=",")):
        if not row["Value"] or not row["Feature_ID"]:
            continue

        fid = row["Feature_ID"].split("/")[-1]
        vsid = "%s-%s-%s" % (basename, row["Language_ID"], fid)
        vid = "%s-%s-%s" % (provider, basename, i + 1)

        if language:
            assert language.id == row["Language_ID"]
        else:
            language = Language.get(row["Language_ID"], default=None)
            if language is None:
                # query glottolog!
                languoid = glottolog.languoid(row["Language_ID"])
                language = LexibankLanguage(
                    id=row["Language_ID"],
                    name=languoid.name,
                    latitude=languoid.latitude,
                    longitude=languoid.longitude)

        parameter = concepts.get(fid)
        if parameter is None:
            concepts[fid] = parameter = Concept(
                id=fid,
                name=parameters[row["Feature_ID"]],
                concepticon_url=row["Feature_ID"])

        vs = data["ValueSet"].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row.get("Source"),
            )

        counterpart = Counterpart(
            id=vid,
            valueset=vs,
            name=row["Value"],
            description=row.get("Comment"),
            loan=row.get("Loan") == "yes")

        if row.get("Cognate_Set"):
            csid = row["Cognate_Set"].split(",")[0].strip()
            cs = Cognateset.get(csid, key="name", default=None)
            if cs is None:
                cs = Cognateset(name=csid)
            counterpart.cognateset = cs

        # for key, src in data['Source'].items():
        #     if key in vs.source:
        #         ValueSetReference(valueset=vs, source=src, key=key)

    contrib.language = language
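A possible driver for import_dataset(), sketched under assumptions: the datasets of one provider sit as CSV files in a single directory, each with the '<file>.csv-metadata.json' sidecar the function asserts on, and each file is loaded in its own transaction. The import_all name and directory layout are hypothetical, not part of the source.

import os
import transaction

def import_all(datadir, provider):  # hypothetical helper, not in the source
    # assumes import_dataset() defined above is importable in this module
    for fname in sorted(os.listdir(datadir)):
        if not fname.endswith('.csv'):
            continue
        csvpath = os.path.join(datadir, fname)
        # import_dataset() asserts that '<csvpath>-metadata.json' exists
        if not os.path.exists(csvpath + '-metadata.json'):
            continue
        with transaction.manager:
            import_dataset(csvpath, provider)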
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        return path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")
    glottolog = glottocodes_by_isocode('postgresql://robert@/glottolog3')

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        try:
            add_language_codes(
                data, lang, lang.id.split('-')[0], glottolog,
                glottocode=row[2] or None)
        except:
            print(row)
            raise
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):
        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))
            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
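The image_url() helper in main() only swaps the size segment ('/original/') and the file extension of an Edmond media URL. A quick standalone check; the helper is restated here, and the example URL path is made up for illustration:

import re

def image_url(source_url, type_):
    # restated from main() above so the example runs on its own
    return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace(
        '/original/', '/%s/' % type_)

# hypothetical source URL, for illustration only:
src = 'https://edmond.mpdl.mpg.de/imeji/file/original/abc123.png'
assert image_url(src, 'web') == \
    'https://edmond.mpdl.mpg.de/imeji/file/web/abc123.jpg'
assert image_url(src, 'thumbnail') == \
    'https://edmond.mpdl.mpg.de/imeji/file/thumbnail/abc123.jpg'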