def issue24(session, timestamp):  # pragma: no cover
    """Data migration for issue 24: repurpose language 'cea' as Cree (Swampy).

    - renames 'cea' to Cree (Swampy), moves it to 56N/90W,
      replaces all of its identifiers with glottolog/iso/ethnologue codes for csw,
    - moves the 'Hive-1948' references of valuesets 81A/82A/83A-cea over to 'cre',
    - deletes valueset 85A-cea entirely,
    - reassigns valueset 131A-cea to 'cre' via vs_switch_lang.

    :param session: SQLAlchemy session to operate on.
    :param timestamp: value written to the ``updated`` column of touched rows.
    """
    #- Update language cea (name, coords, alternative names, iso code (and name))
    #Change name of Cree (Eastern) to Cree (Swampy)
    #Change coordinates to 56dN, 90dW
    #Change the Ethnologue name to Cree (Swampy)
    #Remove the Routledge and Other names
    #Change the ISO code to csw. glottocode to swam1239
    cea = common.Language.get('cea', session=session)
    cre = common.Language.get('cre', session=session)

    # BUG FIX: the previous loop
    #     for i in range(len(cea.languageidentifier)):
    #         try: del cea.languageidentifier[i]
    #         except IndexError: pass
    # deleted by an advancing index while the collection shrank, so it skipped
    # every other element (the IndexError guard hid the breakage). Clearing
    # the whole association collection is the intent:
    del cea.languageidentifier[:]

    for values in [
        ('gc-csw', 'swam1239', 'Swampy Cree', 'glottolog'),
        ('csw', 'csw', 'Cree (Swampy)', 'iso639-3'),
        ('ethnologue-csw', 'Cree (Swampy)', 'ethnologue', 'name'),
    ]:
        # NOTE: the ethnologue tuple deliberately maps to
        # (id, name, description, type) = (..., 'Cree (Swampy)', 'ethnologue', 'name').
        ident = common.Identifier(
            **dict(zip('id name description type'.split(), values)))
        cea.languageidentifier.append(
            common.LanguageIdentifier(language=cea, identifier=ident))

    cea.updated = timestamp
    cea.name = 'Cree (Swampy)'
    cea.ascii_name = slug('Cree (Swampy)')
    cea.latitude = 56.0
    cea.longitude = -90.0

    for pid in ['81A', '82A', '83A']:
        vsq = session.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == pid)
        vs1 = vsq.filter(common.ValueSet.language_pk == cea.pk).one()
        vs2 = vsq.filter(common.ValueSet.language_pk == cre.pk).one()
        vs2.updated = timestamp
        vs1.updated = timestamp
        # Move only the Hive-1948 references from the cea valueset to cre.
        for ref in vs1.references:
            if ref.source.id == 'Hive-1948':
                ref.valueset = vs2
    session.flush()

    #- Delete valueset 85A-cea
    vs = session.query(common.ValueSet)\
        .join(common.Parameter)\
        .filter(common.Parameter.id == '85A')\
        .filter(common.ValueSet.language_pk == cea.pk).one()
    # assumes exactly one value and one reference exist — as in the original code.
    session.delete(vs.values[0])
    session.delete(vs.references[0])
    session.delete(vs)

    #- delete valueset 131A-cea add 131A-cre
    vs_switch_lang(session, timestamp, '131A-cea', 'cre')
def add_language_codes(data, lang, isocode, glottocodes=None, glottocode=None):
    """Attach ISO 639-3 and Glottolog identifiers to *lang*.

    :param data: Data index used to de-duplicate Identifier instances.
    :param lang: the Language object to link.
    :param isocode: three-letter ISO 639-3 code, or falsy to skip.
    :param glottocodes: optional mapping of isocode -> glottocode.
    :param glottocode: explicit glottocode; overrides the mapping lookup.
    """
    def _identifier(kind, code):
        # Identifiers are keyed (and id-ed) as "<type>:<code>" so repeated
        # calls reuse the same row via the data index.
        key = '%s:%s' % (kind, code)
        return data.add(
            common.Identifier,
            key,
            id=key,
            name=code,
            type=getattr(common.IdentifierType, kind).value)

    if isocode and len(isocode) == 3:
        DBSession.add(
            common.LanguageIdentifier(
                language=lang, identifier=_identifier('iso', isocode)))

    # Fall back to the glottocodes mapping when no explicit glottocode given.
    if not glottocode and glottocodes and isocode and isocode in glottocodes:
        glottocode = glottocodes[isocode]
    if glottocode:
        DBSession.add(
            common.LanguageIdentifier(
                language=lang, identifier=_identifier('glottolog', glottocode)))
def update_iso(session, timestamp, lang, *obsolete, **new):  # pragma: no cover
    """Replace obsolete ISO codes of a language with new ones.

    :param session: SQLAlchemy session.
    :param timestamp: written to ``lang.updated``.
    :param lang: Language instance or its string id.
    :param obsolete: ISO codes whose identifier links should be removed
        (both the bare code and its 'ethnologue-' counterpart).
    :param new: mapping of new ISO code -> ethnologue name; for each entry an
        iso identifier and an 'ethnologue-<code>' name identifier are linked
        to the language unless already present.
    :return: the (possibly freshly looked-up) Language instance.
    """
    if isinstance(lang, basestring):
        lang = common.Language.get(lang, session=session)
    lang.updated = timestamp
    # NOTE(review): deletes links while iterating the relationship collection;
    # relies on session.delete not mutating the list until flush — confirm.
    for code in obsolete:
        for li in lang.languageidentifier:
            if li.identifier.id == code or li.identifier.id == 'ethnologue-' + code:
                session.delete(li)
    for code, name in new.items():
        # Reuse existing Identifier rows where possible.
        iso = common.Identifier.get(code, session=session, default=None)
        ethnologue = common.Identifier.get('ethnologue-' + code,
                                           session=session,
                                           default=None)
        if not iso:
            iso = common.Identifier(id=code,
                                    name=code,
                                    description=name,
                                    type=common.IdentifierType.iso.value)
        if not ethnologue:
            ethnologue = common.Identifier(id='ethnologue-' + code,
                                           name=name,
                                           description='ethnologue',
                                           type='name')
        # Only link identifiers that are not already attached to the language.
        if iso.id not in [li.identifier.id for li in lang.languageidentifier]:
            session.add(
                common.LanguageIdentifier(language=lang, identifier=iso))
        if ethnologue.id not in [
            li.identifier.id for li in lang.languageidentifier
        ]:
            session.add(
                common.LanguageIdentifier(language=lang, identifier=ethnologue))
    return lang
def add_identifier(languoid, data, name, type, description, lang='en'):
    """Link *languoid* to an Identifier, creating the Identifier on first use.

    Identifiers are de-duplicated in the data index by the full
    (name, type, description, lang) tuple.
    """
    key = (name, type, description, lang)
    identifier = data['Identifier'].get(key)
    if not identifier:
        # The row id is a slugified composite of the key fields.
        row_id = '{0}-{1}-{2}-{3}'.format(
            slug(name), slug(type), slug(description or ''), lang)
        identifier = data.add(
            common.Identifier,
            key,
            id=row_id,
            name=name,
            type=type,
            description=description,
            lang=lang)
    DBSession.add(
        common.LanguageIdentifier(language=languoid, identifier=identifier))
def add_codes(lang):
    """Create LanguageIdentifier links for the wals/iso/glottolog codes of *lang*.

    Reads ``lang.code_wals``, ``lang.code_iso`` and ``lang.code_glottolog``;
    only well-formed three-letter ISO codes are accepted.
    """
    code_prefixes = dict(wals='wals_code_', iso='', glottolog='')
    for attr, prefix in code_prefixes.items():
        code = getattr(lang, 'code_' + attr)
        if not code:
            continue
        # Skip malformed ISO codes (must be exactly three lowercase letters).
        if attr == 'iso' and not re.match('[a-z]{3}$', code):
            continue
        id_ = prefix + code
        identifier = common.Identifier.get(id_, default=None)
        if not identifier:
            identifier = common.Identifier(
                id=id_,
                name=code,
                type=getattr(common.IdentifierType, attr).value)
        # Instantiating the association object links it via the ORM relationship.
        common.LanguageIdentifier(identifier=identifier, language=lang)
def add_identifier(languoid, data, name, type, description, lang='en'):
    """Link *languoid* to an Identifier, creating the Identifier on first use.

    Language tags longer than three characters are folded back into the name.
    """
    if len(lang) > 3:
        # Weird stuff introduced via hhbib_lgcode names. Roll back language parsing.
        name = '{0} [{1}]'.format(name, lang)
        lang = 'en'
    key = (name, type, description, lang)
    identifier = data['Identifier'].get(key)
    if not identifier:
        row_id = '{0}-{1}-{2}-{3}'.format(
            slug(name), slug(type), slug(description or ''), lang)
        identifier = data.add(
            common.Identifier,
            key,
            id=row_id,
            name=name,
            type=type,
            description=description,
            lang=lang)
    DBSession.add(
        common.LanguageIdentifier(language=languoid, identifier=identifier))
def update_language(session, timestamp, lang, keep_old_name=False, **kw):  # pragma: no cover
    """Update attributes of a language via update_obj.

    :param lang: Language instance or its string id.
    :param keep_old_name: if True and a new 'name' is supplied, the current
        name is preserved as an additional identifier of type 'name'.
    :param kw: attribute updates forwarded to update_obj.
    :return: the result of update_obj.
    """
    if isinstance(lang, basestring):
        lang = common.Language.get(lang, session=session)
    if keep_old_name and 'name' in kw:
        # Keep the soon-to-be-replaced name around as an 'other' name identifier;
        # a uuid keeps the identifier id unique.
        old_name = common.Identifier(
            id=str(uuid1()),
            name=lang.name,
            description='other',
            type='name')
        session.add(common.LanguageIdentifier(language=lang, identifier=old_name))
    return update_obj(session, timestamp, lang, **kw)
def add_new_name(args):
    """Attach a new name Identifier to a Glottolog languoid.

    Expects five positional CLI arguments in ``args.args``:
    glottocode, language tag, name, identifier type, description.

    :raises ValueError: if fewer than five arguments are supplied.
    """
    # Implements the old TODO: validate the argument count up front instead of
    # failing with a bare IndexError.
    if len(args.args) < 5:
        raise ValueError(
            'expected 5 arguments: gcode, lang, name, type, desc')
    gcode, lang, name, type, desc = args.args[:5]
    with_session(args)
    with transaction.manager:
        languoid = DBSession.query(common.Language) \
            .filter_by(id='{0}'.format(gcode)) \
            .first()
        # BUG FIX: the old code passed the tuple (name, type, desc, lang) as a
        # positional argument to common.Identifier — SQLAlchemy declarative
        # constructors accept keyword arguments only, so that raised TypeError.
        identifier = common.Identifier(
            id='{0}-{1}-{2}-{3}'.format(slug(name), slug(type),
                                        slug(desc or ''), lang),
            name=name,
            type=type,
            description=desc,
            lang=lang)
        DBSession.add(identifier)
        DBSession.add(
            common.LanguageIdentifier(language=languoid, identifier=identifier))
def update(args):
    """Import wikipedia links and multitree codes from ``args.json``.

    ``args.json`` must have two top-level keys:
    - 'wikipedia': mapping glottocode -> wikipedia URL (only the last path
      segment is stored as jsondata),
    - 'multitree': mapping glottocode -> list of multitree codes; missing
      Identifier rows are created with manually advanced pk/id counters.
    """
    count = 0
    assert args.json
    # Next free id/pk are computed from the current table maximum, since rows
    # are created with explicit pk values below.
    iid = int(
        DBSession.execute("select max(cast(id as integer)) from identifier").
        fetchone()[0]) + 1
    pk = DBSession.execute("select max(pk) from identifier").fetchone()[0] + 1
    langs = {}
    for gid, name in args.json['wikipedia'].items():
        if gid not in langs:
            langs[gid] = Languoid.get(gid)
        # Store only the page title (last URL path segment).
        langs[gid].update_jsondata(wikipedia=name.split('/')[-1])
    for gid, codes in args.json['multitree'].items():
        l = langs[gid]
        lcodes = [i.name for i in l.identifiers if i.type == 'multitree']
        for code in set(codes):
            if code not in lcodes:
                identifier = DBSession.query(common.Identifier)\
                    .filter(common.Identifier.type == 'multitree')\
                    .filter(common.Identifier.name == code)\
                    .first()
                if not identifier:
                    identifier = common.Identifier(pk=pk,
                                                   id=str(iid),
                                                   name=code,
                                                   type='multitree')
                    # Counters advance only when a new row is created.
                    iid += 1
                    pk += 1
                    count += 1
                DBSession.add(
                    common.LanguageIdentifier(language=l,
                                              identifier=identifier))
    print count, 'new multitree identifiers'
def update_glottocode(session, timestamp, lang, gc):  # pragma: no cover
    """Ensure *lang* is linked to the glottolog identifier *gc*.

    Any existing link to this glottocode is deleted first, then a link to a
    (possibly newly created) Identifier row is added.

    :param lang: Language instance or its string id.
    :param gc: glottocode to link.
    :return: the Language instance.
    """
    if isinstance(lang, basestring):
        lang = common.Language.get(lang, session=session)
    lang.updated = timestamp
    # NOTE(review): deletes the existing link and re-adds one below; relies on
    # the delete being flushed before the new insert — confirm flush order.
    for li in lang.languageidentifier:
        if li.identifier.name == gc and li.identifier.type == common.IdentifierType.glottolog.value:
            session.delete(li)
    glottocode = session.query(common.Identifier)\
        .filter(common.Identifier.type == common.IdentifierType.glottolog.value)\
        .filter(common.Identifier.name == gc)\
        .first()
    if not glottocode:
        glottocode = common.Identifier(
            id=gc,
            name=gc,
            description=gc,
            type=common.IdentifierType.glottolog.value)
    session.add(common.LanguageIdentifier(language=lang, identifier=glottocode))
    return lang
def main(args):
    """Populate the APiCS Online database from the raw data files.

    Loads, in order: contributors/editors, the dataset record, gloss
    abbreviations, references, languages and contributions, examples,
    features (primary, segment, sociolinguistic), value data, references to
    value data, value-example links, and contribution-contributor links.
    """
    data = Data()

    # Fixed editor list; filled with Contributor objects while reading 'People'.
    editors = OrderedDict()
    editors['Susanne Maria Michaelis'] = None
    editors['Philippe Maurer'] = None
    editors['Martin Haspelmath'] = None
    editors['Magnus Huber'] = None

    for row in read(args, 'People'):
        name = row['First name'] + ' ' if row['First name'] else ''
        name += row['Last name']
        kw = dict(
            name=name,
            id=slug('%(Last name)s%(First name)s' % row),
            url=row['Contact Website'].split()[0]
            if row['Contact Website'] else None,
            address=row['Comments on database'],
        )
        contrib = data.add(common.Contributor, row['Author ID'], **kw)
        if kw['name'] in editors:
            editors[kw['name']] = contrib
    DBSession.flush()

    dataset = common.Dataset(
        id='apics',
        name='APiCS Online',
        description='Atlas of Pidgin and Creole Language Structures Online',
        domain='apics-online.info',
        published=date(2013, 11, 4),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'
        })
    DBSession.add(dataset)

    for i, editor in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=editor, ord=i + 1)

    # Map colour id -> RGB string; used for domain element colours below.
    colors = dict(
        (row['ID'], row['RGB_code']) for row in read(args, 'Colours'))

    abbrs = {}
    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    # Project-specific gloss abbreviations not covered by the LGR set.
    for id_, name in {
        'C**T': 'clitic',
        'IMPF': 'imperfect',
        'INTERM': 'intermediate',
        'NCOMPL': 'noncompletive',
        'NONFUT': 'nonfuture',
        'NPROX': 'nonproximal',
        'NSG': 'nonsingular',
        'PP': 'past participle',
        'PROP': 'proprietive',
        'TMA': 'tense-mood-aspect',
    }.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for row in reader(args.data_file('non-lgr-gloss-abbrs.csv'),
                      delimiter=',',
                      namedtuples=True):
        for match in GLOSS_ABBR_PATTERN.finditer(row.standard):
            if match.group('abbr') not in abbrs:
                abbrs[match.group('abbr')] = 1
                DBSession.add(
                    common.GlossAbbreviation(id=match.group('abbr'),
                                             name=row.meaning))

    # Non-bibliographical references are kept as plain strings, keyed by id.
    non_bibs = {}
    for row in read(args, 'References', 'Reference_ID'):
        if row['Reference_type'] == 'Non-bib':
            non_bibs[row['Reference_ID']] = row['Reference_name']
            continue
        # Derive a numeric year where possible (first 4-digit year in the field).
        if isinstance(row['Year'], int):
            year_int = row['Year']
            year = str(row['Year'])
        elif row['Year']:
            year_int = None
            for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']):
                year_int = int(m.group('year'))
                break
            year = row['Year']
        else:
            year, year_int = None, None
        title = row['Article_title'] or row['Book_title']
        attrs = {}
        jsondata = {}
        # Fields mapping to None are stored in jsondata instead of bibtex attrs.
        for attr, field in {
            'Additional_information': 'note',
            'Article_title': 'title',
            'Book_title': 'booktitle',
            'City': 'address',
            'Editors': 'editor',
            'Full_reference': None,
            'Issue': None,
            'Journal': 'journal',
            'Language_codes': None,
            'LaTeX_cite_key': None,
            'Pages': 'pages',
            'Publisher': 'publisher',
            'Reference_type': 'type',
            'School': 'school',
            'Series_title': 'series',
            'URL': 'url',
            'Volume': 'volume',
        }.items():
            value = row.get(attr)
            if not isinstance(value, int):
                value = (value or '').strip()
                if attr == 'Issue' and value:
                    # Normalize purely numeric issue strings.
                    try:
                        value = str(int(value))
                    except ValueError:
                        pass
            if value:
                if field:
                    attrs[field] = value
                else:
                    jsondata[attr] = value
        p = data.add(common.Source,
                     row['Reference_ID'],
                     id=str(row['Reference_ID']),
                     name=row['Reference_name'],
                     description=title,
                     author=row['Authors'],
                     year=year,
                     year_int=year_int,
                     bibtex_type=getattr(EntryType, row['BibTeX_type']
                                         or 'misc'),
                     jsondata=jsondata,
                     **attrs)
        if p.bibtex_type.value == 'misc' and not p.description:
            p.description = p.note
        DBSession.flush()
    DBSession.flush()

    infobox = jsonload(args.data_file('infobox.json'))
    glottocodes = jsonload(args.data_file('glottocodes.json'))
    for row in read(args, 'Languages', 'Order_number'):
        # map_coordinates is "lon, lat" — note the order.
        lon, lat = [
            float(c.strip()) for c in row['map_coordinates'].split(',')
        ]
        kw = dict(
            name=row['Language_name'],
            id=str(row['Order_number']),
            latitude=lat,
            longitude=lon,
            region=row['Category_region'],
        )
        lect = data.add(models.Lect, row['Language_ID'], **kw)
        DBSession.flush()

        for i, item in enumerate(infobox[lect.id]):
            DBSession.add(
                common.Language_data(object_pk=lect.pk,
                                     ord=i,
                                     key=item[0],
                                     value=item[1]))

        if row["Languages_contribution_documentation::Lect_description_checked_status"] \
                != "Checked":
            print 'unchecked! ---', row['Language_name']

        desc = row.get(
            'Languages_contribution_documentation::Lect description', '')
        markup_desc = normalize_markup(row[
            'Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description']
        )

        c = data.add(
            models.ApicsContribution,
            row['Language_ID'],
            id=str(row['Order_number']),
            name=row['Language_name'],
            description=desc,
            markup_description=markup_desc,
            survey_reference=data['Source'][row['Survey_reference_ID']],
            language=lect)

        # Attach glossed-text files (pdf + audio) when present on disk.
        for ext, label, mtype in [
            ('pdf', 'Glossed text', 'application/pdf'),
            ('mp3', 'Glossed text audio', 'audio/mpeg'),
        ]:
            fid = '%s-gt.%s' % (c.id, ext)
            if args.data_file('files', 'contribution', c.id, fid).exists():
                common.Contribution_files(object=c,
                                          id=fid,
                                          name=label,
                                          mime_type=mtype)
            else:
                print label, 'missing for:', row['Language_name']

        #
        # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE
        #
        iso = None
        if row['ISO_code'] and len(row['ISO_code']) == 3:
            iso = row['ISO_code'].lower()
            if 'iso:%s' % row['ISO_code'] not in data['Identifier']:
                data.add(common.Identifier,
                         'iso:%s' % row['ISO_code'],
                         id=row['ISO_code'].lower(),
                         name=row['ISO_code'].lower(),
                         type=common.IdentifierType.iso.value)
            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier']['iso:%s' %
                                                  row['ISO_code']]))

        if lect.id in glottocodes:
            identifier = data.add(common.Identifier,
                                  'gc:%s' % glottocodes[lect.id],
                                  id=glottocodes[lect.id],
                                  name=glottocodes[lect.id],
                                  type=common.IdentifierType.glottolog.value)
            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=identifier))

        if row['Language_name_ethnologue']:
            if row['Language_name_ethnologue'] not in data['Identifier']:
                data.add(common.Identifier,
                         row['Language_name_ethnologue'],
                         id=iso or
                         'ethnologue:%s' % row['Language_name_ethnologue'],
                         name=row['Language_name_ethnologue'],
                         type='ethnologue')
            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier'][
                        row['Language_name_ethnologue']]))

    # Track the highest example number per language so example sentences
    # added later (segment data) get fresh ids.
    example_count = {}
    for row in read(args, 'Examples', 'Order_number'):
        assert row['Language_ID']
        lang = data['Lect'][row['Language_ID']]
        id_ = '%(Language_ID)s-%(Example_number)s' % row
        atext, gloss = igt(row)
        example_count[row['Language_ID']] = max(
            [example_count.get(row['Language_ID'], 1),
             row['Example_number']])
        p = add_sentence(
            args,
            data,
            id_,
            id='%s-%s' % (lang.id, row['Example_number']),
            name=row['Text'] or row['Analyzed_text'],
            description=row['Translation'],
            type=row['Type'].strip().lower() if row['Type'] else None,
            comment=row['Comments'],
            gloss=gloss,
            analyzed=atext,
            markup_text=normalize_markup(row['z_calc_Text_CSS']),
            markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']),
            markup_comment=normalize_markup(row['z_calc_Comments_CSS']),
            markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']),
            original_script=row['Original_script'],
            jsondata={
                'sort': row['Order_number'],
                'alt_translation':
                (row['Translation_other'] or '').strip() or None
            },
            language=lang)
        if row['Reference_ID']:
            if row['Reference_ID'] in data['Source']:
                source = data['Source'][row['Reference_ID']]
                DBSession.add(
                    common.SentenceReference(
                        sentence=p,
                        source=source,
                        key=source.id,
                        description=row['Reference_pages']))
            else:
                p.source = non_bibs[row['Reference_ID']]
    DBSession.flush()

    for row in read(args, 'Language_references'):
        if row['Reference_ID'] not in data['Source']:
            assert row['Reference_ID'] in non_bibs
            continue
        assert row['Language_ID'] in data['ApicsContribution']
        source = data['Source'][row['Reference_ID']]
        DBSession.add(
            common.ContributionReference(
                contribution=data['ApicsContribution'][row['Language_ID']],
                source=source,
                description=row['Pages'],
                key=source.id))

    #
    # global counter for features - across feature types
    #
    feature_count = 0
    for row in read(args, 'Features', 'Feature_number'):
        id_ = str(row['Feature_number'])
        if int(id_) > feature_count:
            feature_count = int(id_)
        wals_id = None
        desc = row['Feature_annotation_publication']
        if row['WALS_match'] == 'Total':
            if isinstance(row['WALS_No.'], int):
                wals_id = row['WALS_No.']
            else:
                # e.g. "81.2" -> 81
                wals_id = int(row['WALS_No.'].split('.')[0].strip())
        p = data.add(models.Feature,
                     row['Feature_code'],
                     name=row['Feature_name'],
                     id=id_,
                     description=desc,
                     markup_description=normalize_markup(
                         row['z_calc_Feature_annotation_publication_CSS']),
                     feature_type='primary',
                     multivalued=row['Value_relation_type'] != 'Single',
                     area=row['Feature_area'],
                     wals_id=wals_id)
        names = {}
        for i in range(1, 10):
            if not row['Value%s_publication' % i] \
                    or not row['Value%s_publication' % i].strip():
                continue
            name = row['Value%s_publication' % i].strip()
            # Disambiguate duplicate value names with the value index.
            if name in names:
                name += ' (%s)' % i
            names[name] = 1
            de = data.add(
                common.DomainElement,
                '%s-%s' % (row['Feature_code'], i),
                id='%s-%s' % (id_, i),
                name=name,
                parameter=p,
                abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name,
                number=int(row['Value%s_value_number_for_publication' % i]),
                jsondata={'color': colors[row['Value_%s_colour_ID' % i]]},
            )
            assert de
        if row['Authors_FeatureArticles']:
            authors, _ = row['Authors_FeatureArticles'].split('and the APiCS')
            authors = authors.strip()
            if authors.endswith(','):
                authors = authors[:-1].strip()
            for i, name in enumerate(authors.split(',')):
                assert name.strip() in editors
                p._authors.append(
                    models.FeatureAuthor(ord=i + 1,
                                         contributor=editors[name.strip()]))
        DBSession.flush()

    # Primary features whose data should be mirrored from segment features.
    primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41}
    segment_to_primary = dict(
        zip(primary_to_segment.values(), primary_to_segment.keys()))
    number_map = {}
    names = {}
    for row in read(args, 'Segment_features', 'Order_number'):
        symbol = row['Segment_symbol']
        if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate':
            symbol = 't\u0361s'
        truth = lambda s: s and s.strip().lower() == 'yes'
        name = '%s - %s' % (symbol, row['Segment_name'])
        # Duplicate segment names map to the first occurrence's number.
        if name in names:
            number_map[row['Segment_feature_number']] = names[name]
            continue
        number_map[
            row['Segment_feature_number']] = row['Segment_feature_number']
        names[name] = row['Segment_feature_number']
        feature_count += 1
        if row['Segment_feature_number'] in segment_to_primary:
            primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\
                = str(feature_count)
        p = data.add(models.Feature,
                     row['Segment_feature_number'],
                     name=name,
                     id=str(feature_count),
                     feature_type='segment',
                     area='Vowels' if truth(row['Vowel']) else
                     ('Obstruent consonants'
                      if truth(row['Obstruent']) else 'Sonorant consonants'),
                     jsondata=dict(
                         number=int(row['Segment_feature_number']),
                         vowel=truth(row['Vowel']),
                         consonant=truth(row['Consonant']),
                         obstruent=truth(row['Obstruent']),
                         core_list=truth(row['Core_list_segment']),
                         symbol=symbol,
                     ))
        for i, spec in SEGMENT_VALUES.items():
            data.add(common.DomainElement,
                     '%s-%s' % (row['Segment_feature_number'], spec[0]),
                     id='%s-%s' % (p.id, i),
                     name=spec[0],
                     parameter=p,
                     jsondata={'color': spec[1]},
                     number=i)
    print '--> remapped:', primary_to_segment
    DBSession.flush()

    for row in read(args, 'Sociolinguistic_features',
                    'Sociolinguistic_feature_number'):
        feature_count += 1
        p = data.add(models.Feature,
                     row['Sociolinguistic_feature_code'],
                     name=row['Sociolinguistic_feature_name'],
                     id='%s' % feature_count,
                     description=row['Sociolinguistic_feature_annotation'],
                     area='Sociolinguistic',
                     feature_type='sociolinguistic')
        names = {}
        for i in range(1, 10):
            id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i)
            if row.get('Value%s' % i) and row['Value%s' % i].strip():
                name = row['Value%s' % i].strip()
                if name in names:
                    name += ' (%s)' % i
                names[name] = 1
            else:
                continue
            kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i)
            data.add(common.DomainElement,
                     id_,
                     id='%s-%s' % (p.id, i),
                     name=name,
                     parameter=p,
                     number=i,
                     jsondata={
                         'color':
                         colors.get(row['Value%s_colour_ID' % i],
                                    colors.values()[i])
                     })

    # Segment data: one ValueSet+Value per (language, segment feature).
    sd = {}
    for row in read(args, 'Segment_data'):
        if row['Segment_feature_number'] not in number_map:
            continue
        number = number_map[row['Segment_feature_number']]
        if not row['Presence_in_the_language']:
            continue
        lang = data['Lect'][row['Language_ID']]
        param = data['Feature'][number]
        id_ = '%s-%s' % (lang.id, param.id)
        if id_ in sd:
            assert row['c_Record_is_a_duplicate'] == 'Yes'
            continue
        sd[id_] = 1
        valueset = data.add(
            common.ValueSet,
            id_,
            id=id_,
            parameter=param,
            language=lang,
            contribution=data['ApicsContribution'][row['Language_ID']],
            description=row['Comments'],
            markup_description=normalize_markup(row['z_calc_Comments_CSS']),
        )
        v = data.add(
            common.Value,
            id_,
            id=id_,
            frequency=float(100),
            valueset=valueset,
            domainelement=data['DomainElement'][
                '%s-%s' % (number, row['Presence_in_the_language'])],
        )
        if row['Example_word'] and row['Example_word_gloss']:
            example_count[row['Language_ID']] += 1
            p = add_sentence(args,
                             data,
                             '%s-p%s' % (lang.id, data['Feature'][number].id),
                             id='%s-%s' %
                             (lang.id, example_count[row['Language_ID']]),
                             name=row['Example_word'],
                             description=row['Example_word_gloss'],
                             language=lang)
            DBSession.add(common.ValueSentence(value=v, sentence=p))
        source = data['Source'].get(row['Refers_to_references_Reference_ID'])
        if source:
            DBSession.add(
                common.ValueSetReference(valueset=valueset,
                                         source=source,
                                         key=source.id))
        elif row['Refers_to_references_Reference_ID'] in non_bibs:
            valueset.source = non_bibs[
                row['Refers_to_references_Reference_ID']]

    lects = defaultdict(lambda: 1)
    lect_map = {}
    records = {}
    false_values = {}
    no_values = {}
    wals_value_number = {}
    for row in read(args, 'wals'):
        if row['z_calc_WALS_value_number']:
            wals_value_number[
                row['Data_record_id']] = row['z_calc_WALS_value_number']

    def prefix(attr, _prefix):
        # Column names in the sociolinguistic sheets carry a prefix;
        # the primary sheets capitalize instead.
        if _prefix:
            return '%s_%s' % (_prefix, attr)
        return attr.capitalize()

    # Two passes: primary data ('') and sociolinguistic data ('sl').
    for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]:
        num_values = 10
        for row in read(args, prefix('data', _prefix)):
            if not row[prefix('feature_code', _prefix)]:
                print('no associated feature for', prefix('data', _prefix),
                      row[prefix('data_record_id', _prefix)])
                continue
            lid = row['Language_ID']
            lect_attr = row.get('Lect_attribute', 'my default lect').lower()
            if lect_attr != 'my default lect':
                # Data for a non-default lect: create (or reuse) a derived
                # Lect object with a synthesized id.
                if (row['Language_ID'], row['Lect_attribute']) in lect_map:
                    lid = lect_map[(row['Language_ID'],
                                    row['Lect_attribute'])]
                else:
                    lang = data['Lect'][row['Language_ID']]
                    c = lects[row['Language_ID']]
                    lid = '%s-%s' % (row['Language_ID'], c)
                    kw = dict(
                        name='%s (%s)' % (lang.name, row['Lect_attribute']),
                        id='%s' % (1000 + 10 * int(lang.id) + c),
                        latitude=lang.latitude,
                        longitude=lang.longitude,
                        description=row['Lect_attribute'],
                        language=lang,
                    )
                    data.add(models.Lect, lid, **kw)
                    lects[row['Language_ID']] += 1
                    lect_map[(row['Language_ID'],
                              row['Lect_attribute'])] = lid
            id_ = abbr + str(row[prefix('data_record_id', _prefix)])
            assert id_ not in records
            records[id_] = 1
            assert row[prefix('feature_code', _prefix)] in data['Feature']
            language = data['Lect'][lid]
            parameter = data['Feature'][row[prefix('feature_code', _prefix)]]
            # The ValueSet is only registered with the session if at least one
            # 'true' value is found below.
            valueset = common.ValueSet(
                id='%s-%s' % (language.id, parameter.id),
                description=row['Comments_on_value_assignment'],
                markup_description=normalize_markup(
                    row.get('z_calc_Comments_on_value_assignment_CSS')),
            )
            values_found = {}
            for i in range(1, num_values):
                if not row['Value%s_true_false' % i]:
                    continue
                if row['Value%s_true_false' % i].strip().lower() != 'true':
                    assert row['Value%s_true_false' %
                               i].strip().lower() == 'false'
                    false_values[row[prefix('data_record_id', _prefix)]] = 1
                    continue
                iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i)
                if iid not in data['DomainElement']:
                    print(iid, row[prefix('data_record_id', _prefix)],
                          '--> no domainelement!')
                    continue
                values_found['%s-%s' % (id_, i)] = dict(
                    id='%s-%s' % (valueset.id, i),
                    domainelement=data['DomainElement']['%s-%s' % (row[prefix(
                        'feature_code', _prefix)], i)],
                    confidence=row['Value%s_confidence' % i],
                    # Normalized frequencies exist only for primary data.
                    frequency=float(row['c_V%s_frequency_normalised' % i])
                    if _prefix == '' else 100)

            if values_found:
                if row[prefix('data_record_id', _prefix)] in wals_value_number:
                    valueset.jsondata = {
                        'wals_value_number':
                        wals_value_number.pop(row[prefix(
                            'data_record_id', _prefix)])
                    }
                valueset.parameter = parameter
                valueset.language = language
                valueset.contribution = data['ApicsContribution'][
                    row['Language_ID']]
                valueset = data.add(common.ValueSet, id_, _obj=valueset)
                # NOTE: id_ is rebound inside this loop to the value key.
                for i, item in enumerate(values_found.items()):
                    if i > 0 and not parameter.multivalued:
                        print 'multiple values for single-valued parameter: %s' % id_
                        break
                    id_, kw = item
                    kw['valueset'] = valueset
                    value = data.add(common.Value, id_, **kw)
                    #
                    # store references to additional data for segments which should be reused
                    # for corresponding primary features!
                    #
                    if int(parameter.id) in primary_to_segment:
                        assert len(values_found) == 1
                        seg_id = '%s-%s' % (language.id, primary_to_segment[int(
                            parameter.id)])
                        seg_valueset = data['ValueSet'][seg_id]
                        seg_value = data['Value'][seg_id]
                        if not valueset.description and seg_valueset.description:
                            valueset.description = seg_valueset.description
                        for s in seg_value.sentence_assocs:
                            DBSession.add(
                                common.ValueSentence(value=value,
                                                     sentence=s.sentence))
                        for r in seg_valueset.references:
                            DBSession.add(
                                common.ValueSetReference(valueset=valueset,
                                                         source=r.source,
                                                         key=r.key))
                        if not valueset.source and seg_valueset.source:
                            valueset.source = seg_valueset.source
                DBSession.flush()
            else:
                no_values[id_] = 1
    DBSession.flush()

    # Attach references to value sets; sociolinguistic sheets have 7 values.
    for prefix, abbr, num_values in [
        ('D', '', 10),
        ('Sociolinguistic_d', 'sl', 7),
    ]:
        for row in read(args, prefix + 'ata_references'):
            assert row['Reference_ID'] in data['Source'] \
                or row['Reference_ID'] in non_bibs
            try:
                vs = data['ValueSet'][abbr +
                                      str(row[prefix + 'ata_record_id'])]
                if row['Reference_ID'] in data['Source']:
                    source = data['Source'][row['Reference_ID']]
                    DBSession.add(
                        common.ValueSetReference(
                            valueset=vs,
                            source=source,
                            key=source.id,
                            description=row['Pages'],
                        ))
                else:
                    if vs.source:
                        vs.source += '; ' + non_bibs[row['Reference_ID']]
                    else:
                        vs.source = non_bibs[row['Reference_ID']]
            except KeyError:
                # ValueSet was never created (no values); skip silently.
                continue
    DBSession.flush()

    missing = 0
    for row in read(args, 'Value_examples'):
        try:
            DBSession.add(
                common.ValueSentence(
                    value=data['Value']['%(Data_record_id)s-%(Value_number)s'
                                        % row],
                    sentence=data['Sentence'][
                        '%(Language_ID)s-%(Example_number)s' % row],
                    description=row['Notes'],
                ))
        except KeyError:
            missing += 1
    print('%s Value_examples are missing data' % missing)
    print('%s data sets with false values' % len(false_values))
    print('%s data sets without values' % len(no_values))
    for k, v in wals_value_number.items():
        print 'unclaimed wals value number:', k, v

    for i, row in enumerate(read(args, 'Contributors')):
        kw = dict(contribution=data['ApicsContribution'][row['Language ID']],
                  contributor=data['Contributor'][row['Author ID']])
        if row['Order_of_appearance']:
            kw['ord'] = int(float(row['Order_of_appearance']))
        data.add(common.ContributionContributor, i, **kw)
    DBSession.flush()
def setUp(self):
    """Populate the test database with one of everything.

    Creates a dataset, a source, four contributors, a contribution with
    references, 101 languages (one fully featured + 100 numbered ones with
    iso639-3 identifiers), parameters with and without domains, values,
    units/unit values, a sentence and a config entry.
    """
    TestWithDb.setUp(self)
    DBSession.add(
        common.Dataset(id='dataset',
                       name='dataset',
                       description='desc',
                       domain='clld'))
    source = common.Source(id='source')
    contributors = {
        'contributor': 'A Name',
        'b': 'b Name',
        'c': 'c Name',
        'd': 'd Name'
    }
    for id_, name in contributors.items():
        contributors[id_] = common.Contributor(id=id_, name=name)
    contribution = common.Contribution(id='contribution', name='Contribution')
    cr = common.ContributionReference(contribution=contribution, source=source)
    # asserts only guard against constructor failure; the association objects
    # get persisted via the contribution relationship.
    assert common.ContributionContributor(
        contribution=contribution,
        primary=True,
        contributor=contributors['contributor'])
    assert common.ContributionContributor(contribution=contribution,
                                          primary=False,
                                          contributor=contributors['b'])
    assert common.ContributionContributor(contribution=contribution,
                                          primary=True,
                                          contributor=contributors['c'])
    assert common.ContributionContributor(contribution=contribution,
                                          primary=False,
                                          contributor=contributors['d'])
    DBSession.add(contribution)
    language = common.Language(id='language',
                               name='Language 1',
                               latitude=10.5,
                               longitude=0.3)
    language.sources.append(source)
    identifier = common.Identifier(type='iso639-3', id='iso')
    li = common.LanguageIdentifier(language=language, identifier=identifier)
    # 100 extra languages for pagination/listing tests.
    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='%.3i' % i)
        _li = common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)
    param = common.Parameter(id='parameter', name='Parameter')
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2',
                               name='DomainElement2',
                               parameter=param)
    valueset = common.ValueSet(id='valueset',
                               language=language,
                               parameter=param,
                               contribution=contribution)
    value = common.Value(id='value',
                         domainelement=de,
                         valueset=valueset,
                         frequency=50,
                         confidence='high')
    DBSession.add(value)
    paramnd = common.Parameter(id='no-domain',
                               name='Parameter without domain')
    valueset = common.ValueSet(id='vs2',
                               language=language,
                               parameter=paramnd,
                               contribution=contribution)
    vr = common.ValueSetReference(valueset=valueset, source=source)
    value = common.Value(id='v2',
                         valueset=valueset,
                         frequency=50,
                         confidence='high')
    DBSession.add(value)
    unit = common.Unit(id='unit', name='Unit', language=language)
    up = common.UnitParameter(id='unitparameter', name='UnitParameter')
    DBSession.add(unit)
    DBSession.add(
        common.UnitValue(id='unitvalue',
                         name='UnitValue',
                         unit=unit,
                         unitparameter=up))
    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(
        common.UnitValue(id='uv2',
                         name='UnitValue2',
                         unit=unit,
                         unitparameter=up2,
                         unitdomainelement=de))
    DBSession.add(common.Source(id='s'))
    sentence = common.Sentence(id='sentence',
                               name='sentence name',
                               description='sentence description',
                               analyzed='a\tmorpheme\tdoes\tdo',
                               gloss='a\tmorpheme\t1SG\tdo.SG2',
                               source='own',
                               comment='comment',
                               original_script='a morpheme',
                               language=language)
    sr = common.SentenceReference(sentence=sentence, source=source)
    DBSession.add(common.Config(key='key', value='value'))
    DBSession.flush()
def add_languoids(data, languoids: dict):
    """Create Languoid objects (plus glottolog/ethnologue identifiers) from a
    mapping of slug -> field dict.

    Parents must precede their children in *languoids*, since parent lookup
    goes through the ids created so far.

    :param data: Data index used to register created objects.
    :param languoids: mapping of languoid id (slug) to a dict of fields
        ('name' required; 'parent', 'lat', 'lng', 'rich_description',
        'glottolog', 'ethnologue', 'nex_url', 'description_file_url',
        'level' optional).
    :return: list of created Languoid instances, in input order.
    """
    instances = []
    id_to_instance = dict()
    # BUG FIX: the loop variable was named `slug`, shadowing the slug() helper
    # used elsewhere in this module; renamed to `languoid_id`.
    for languoid_id, fields in languoids.items():
        key_ = languoid_id
        name = fields['name']
        parent = fields.get('parent', None)
        latitude = fields.get('lat', None)
        longitude = fields.get('lng', None)
        rich_description = fields.get('rich_description', None)
        glottocode = fields.get('glottolog', None)
        ethnologue_code = fields.get('ethnologue', None)
        nex_url = fields.get('nex_url', None)
        description_file_url = fields.get('description_file_url', None)

        add_kwargs = {
            'name': name,
            'id': key_,
        }
        if parent is not None:
            # Requires the parent to have been created in a previous iteration.
            add_kwargs['parent'] = id_to_instance[parent]
        # Coordinates are only set as a pair.
        if latitude is not None and longitude is not None:
            add_kwargs['latitude'] = latitude
            add_kwargs['longitude'] = longitude
        if rich_description is not None:
            add_kwargs['description'] = rich_description
            add_kwargs['markup_description'] = rich_description
        if glottocode is not None:
            add_kwargs['glottocode_'] = glottocode
        if ethnologue_code is not None:
            add_kwargs['ethnologue_code'] = ethnologue_code
        if nex_url is not None:
            add_kwargs['nex_url'] = nex_url
        if description_file_url is not None:
            add_kwargs['description_file_url'] = description_file_url
        add_kwargs['level'] = fields.get('level',
                                         models.LanguoidLevel.language)

        instance = data.add(models.Languoid, key_, **add_kwargs)
        instances.append(instance)
        id_to_instance[instance.id] = instance

        if glottocode is not None:
            identifier = data.add(models.Identifier,
                                  f'{key_}-glottocode',
                                  type=models.IdentifierType.glottolog.value,
                                  name=glottocode)
            DBSession.add(
                common.LanguageIdentifier(language=instance,
                                          identifier=identifier))
        if ethnologue_code is not None:
            identifier = data.add(models.Identifier,
                                  f'{key_}-ethnologue',
                                  type=models.IdentifierType.ethnologue.value,
                                  name=ethnologue_code)
            DBSession.add(
                common.LanguageIdentifier(language=instance,
                                          identifier=identifier))
    return instances
def setUp(self):
    """Populate the test database with one instance of each core model.

    Builds a dataset, sources, contributors, a contribution, languages with
    identifiers, parameters/domain elements with valuesets and values, units,
    a sentence, and config entries, then flushes the session.
    """
    TestWithDb.setUp(self)

    DBSession.add(
        common.Dataset(id='dataset', name='dataset', description='desc',
                       domain='clld', jsondata={'license_icon': 'cc-by'}))
    # Inactive source redirecting to 'source' via the replacement machinery.
    DBSession.add(
        common.Source(id='replaced', active=False,
                      jsondata={'__replacement_id__': 'source'}))
    source = common.Source(id='source')

    contributors = {
        'contributor': 'A Name',
        'b': 'b Name',
        'c': 'c Name',
        'd': 'd Name'
    }
    for id_, name in contributors.items():
        contributors[id_] = common.Contributor(id=id_, name=name,
                                               url='http://example.org')

    contribution = common.Contribution(id='contribution', name='Contribution')
    common.ContributionReference(contribution=contribution, source=source)
    # Instantiating the association object links it to the contribution via
    # the ORM relationship.  BUG FIX: these used to be wrapped in bare
    # `assert` statements, which are stripped under `python -O`, silently
    # dropping the contributor associations.
    for primary, key in [(True, 'contributor'), (False, 'b'),
                         (True, 'c'), (False, 'd')]:
        common.ContributionContributor(contribution=contribution,
                                       primary=primary,
                                       contributor=contributors[key])
    DBSession.add(contribution)

    language = common.Language(id='language', name='Language 1',
                               latitude=10.5, longitude=0.3)
    language.sources.append(source)
    # One identifier per identifier type.
    for i, type_ in enumerate(common.IdentifierType):
        id_ = common.Identifier(type=type_.value, id=type_.value + str(i),
                                name='abc')
        common.LanguageIdentifier(language=language, identifier=id_)

    # 100 extra languages (l2..l101) with iso639-3 identifiers, e.g. for
    # pagination tests.
    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='%.3i' % i)
        common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)

    param = common.Parameter(id='parameter', name='Parameter')
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param)
    valueset = common.ValueSet(id='valueset', language=language,
                               parameter=param, contribution=contribution)
    value = common.Value(id='value', domainelement=de, valueset=valueset,
                         frequency=50, confidence='high')
    DBSession.add(value)
    value2 = common.Value(id='value2', domainelement=de2, valueset=valueset,
                          frequency=50, confidence='high')
    DBSession.add(value2)

    # A second parameter without a domain, with a referenced valueset.
    paramnd = common.Parameter(id='no-domain',
                               name='Parameter without domain')
    valueset = common.ValueSet(id='vs2', language=language, parameter=paramnd,
                               contribution=contribution)
    common.ValueSetReference(valueset=valueset, source=source)
    value = common.Value(id='v2', valueset=valueset, frequency=50,
                         confidence='high')
    DBSession.add(value)

    unit = common.Unit(id='unit', name='Unit', language=language)
    up = common.UnitParameter(id='unitparameter', name='UnitParameter')
    DBSession.add(unit)
    DBSession.add(
        common.UnitValue(id='unitvalue', name='UnitValue', unit=unit,
                         unitparameter=up))

    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(
        common.UnitValue(id='uv2', name='UnitValue2', unit=unit,
                         unitparameter=up2, unitdomainelement=de))

    DBSession.add(common.Source(id='s'))

    sentence = common.Sentence(
        id='sentence', name='sentence name',
        description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo',
        gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own', comment='comment', original_script='a morpheme',
        language=language, jsondata={'alt_translation': 'Spanish: ...'})
    common.SentenceReference(sentence=sentence, source=source)
    DBSession.add(common.Config(key='key', value='value'))

    common.Config.add_replacement('replaced', 'language',
                                  model=common.Language)
    common.Config.add_replacement('gone', None, model=common.Language)
    DBSession.flush()
def add_languoids(data, languoids: list):
    """Create ``models.Languoid`` instances from serialized languoid records.

    :param data: container exposing ``add(model, key, **kw)`` (TestData-style)
        that creates and registers ORM instances.
    :param languoids: list of dicts with keys ``pk`` (old primary key) and
        ``fields`` (the languoid attributes, fixture-dump style).

    Records are processed parents-first (sorted by ``level``) so that a
    child's parent pk has already been remapped when the child is created.
    """
    converted_pk = dict()  # old pk -> pk of the newly created instance
    # BUG FIX: sorting used x['fields']['level'] and raised KeyError for
    # records without an explicit level, although the level lookup below
    # defaults to 3 — use the same default here.
    languoids = sorted(languoids, key=lambda rec: rec['fields'].get('level', 3))
    for languoid in languoids:
        fields = languoid['fields']
        old_pk = languoid['pk']
        key_ = get_languoid_key_(old_pk)
        name = fields['name']
        parent = fields.get('parent', None)
        latitude = fields.get('latitude', None)
        longitude = fields.get('longitude', None)
        rich_description = fields.get('rich_description', None)
        glottocode = fields.get('glottocode', None)
        ethnologue_code = fields.get('ethnologue_code', None)
        nex_url = fields.get('nex_url', None)
        description_file_url = fields.get('description_file_url', None)

        add_kwargs = {
            'name': name,
            'id': slugify(name),
        }
        if parent is not None:
            # Parents sort before children (lower level), so the remapped pk
            # is available.  (Removed a leftover debug `print(parent)`.)
            add_kwargs['parent_pk'] = converted_pk[parent]
        # Coordinates are only applied as a pair.
        if latitude is not None and longitude is not None:
            add_kwargs['latitude'] = latitude
            add_kwargs['longitude'] = longitude
        if rich_description is not None:
            # Same text serves as plain and markup description.
            add_kwargs['description'] = rich_description
            add_kwargs['markup_description'] = rich_description
        if glottocode is not None:
            add_kwargs['glottocode_'] = glottocode
        if ethnologue_code is not None:
            add_kwargs['ethnologue_code'] = ethnologue_code
        if nex_url is not None:
            add_kwargs['nex_url'] = nex_url
        if description_file_url is not None:
            add_kwargs['description_file_url'] = description_file_url
        # Numeric fixture level -> enum; 3 (language) is the default.
        add_kwargs['level'] = [
            models.LanguoidLevel.superfamily,
            models.LanguoidLevel.family,
            models.LanguoidLevel.group,
            models.LanguoidLevel.language,
        ][fields.get('level', 3)]

        instance = data.add(models.Languoid, key_, **add_kwargs)

        if glottocode is not None:
            identifier = data.add(models.Identifier, f'{key_}-glottocode',
                                  type=models.IdentifierType.glottolog.value,
                                  name=glottocode)
            DBSession.add(
                common.LanguageIdentifier(language=instance,
                                          identifier=identifier))
        if ethnologue_code is not None:
            # BUG FIX: the ethnologue identifier was created with the
            # glottolog identifier type.
            identifier = data.add(models.Identifier, f'{key_}-ethnologue',
                                  type=models.IdentifierType.ethnologue.value,
                                  name=ethnologue_code)
            DBSession.add(
                common.LanguageIdentifier(language=instance,
                                          identifier=identifier))
        converted_pk[old_pk] = instance.pk
def populate_test_db(engine):
    """Fill the test database with one instance of each core model.

    Sets the alembic version stamp, then builds a dataset, contributors,
    sources, a contribution, languages with identifiers, parameters with
    domain elements/valuesets/values, units, a sentence and config entries,
    and finally flushes the session.
    """
    set_alembic_version(engine, '58559d4eea0d')

    data = TestData()
    data.add_default(common.Dataset,
                     domain='clld',
                     jsondata={
                         'license_icon': 'cc-by',
                         'license_url': 'http://example.org'
                     })

    data.add_default(common.Contributor, name='A Name', email='*****@*****.**')
    for id_, name in {
        'b': 'b Name',
        'c': 'c Name',
        'd': 'd Name',
    }.items():
        data.add(common.Contributor, id_, id=id_, name=name,
                 url='http://example.org')

    DBSession.add(
        common.Editor(dataset=data[common.Dataset],
                      contributor=data[common.Contributor]))

    data.add_default(common.Source)
    # Inactive source redirecting to 'source' via the replacement machinery.
    data.add(common.Source, 'replaced', id='replaced', active=False,
             jsondata={'__replacement_id__': 'source'})

    data.add_default(common.Contribution)
    common.ContributionReference(contribution=data[common.Contribution],
                                 source=data[common.Source])

    # Association objects link themselves to the contribution through the
    # ORM relationship; no explicit session add needed.
    for primary, c in [(True, 'contributor'), (False, 'b'), (True, 'c'),
                       (False, 'd')]:
        common.ContributionContributor(contribution=data[common.Contribution],
                                       primary=primary,
                                       contributor=data['Contributor'][c])

    data.add_default(common.Language, latitude=10.5, longitude=0.3)
    data[common.Language].sources.append(data[common.Source])

    # One identifier per identifier type; glottolog identifiers get a
    # plausible glottocode-shaped name.
    for i, type_ in enumerate(common.IdentifierType):
        common.LanguageIdentifier(
            language=data[common.Language],
            identifier=common.Identifier(
                type=type_.value,
                id=type_.value + str(i),
                name='abc' if type_.name == 'iso' else 'glot1234'))
    common.LanguageIdentifier(language=data[common.Language],
                              identifier=common.Identifier(type='name',
                                                           id='name',
                                                           name='a'))

    # 100 extra languages (l2..l101) with iso639-3 identifiers, e.g. for
    # pagination tests.
    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='abc')
        common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)

    param = data.add_default(common.Parameter)
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2', name='DomainElement2',
                               parameter=param)

    valueset = data.add_default(common.ValueSet,
                                language=data[common.Language],
                                parameter=param,
                                contribution=data[common.Contribution])
    common.ValueSetReference(valueset=valueset, source=data[common.Source],
                             description='10-20')
    data.add_default(common.Value, domainelement=de, valueset=valueset,
                     frequency=50, confidence='high')
    data.add(common.Value, 'value2', id='value2', domainelement=de2,
             valueset=valueset, frequency=50, confidence='high')

    # A second parameter without a domain, with a referenced valueset.
    paramnd = data.add(common.Parameter, 'no-domain', id='no-domain',
                       name='Parameter without domain')
    valueset = common.ValueSet(id='vs2', language=data[common.Language],
                               parameter=paramnd,
                               contribution=data[common.Contribution])
    common.ValueSetReference(valueset=valueset, source=data[common.Source],
                             description='10-20')
    # NOTE(review): constructed without an explicit session add — presumably
    # persisted via relationship cascade from the valueset; confirm.
    common.Value(id='v2', valueset=valueset, frequency=50, confidence='high')

    unit = data.add_default(common.Unit, language=data[common.Language])
    up = data.add_default(common.UnitParameter)
    # NOTE(review): also relies on cascade from `unit` for persistence.
    common.UnitValue(id='unitvalue', name='UnitValue', unit=unit,
                     unitparameter=up)

    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(
        common.UnitValue(id='uv2', name='UnitValue2', unit=unit,
                         unitparameter=up2, unitdomainelement=de))

    DBSession.add(common.Source(id='s'))

    sentence = data.add_default(common.Sentence,
                                description='sentence description',
                                analyzed='a\tmorpheme\tdoes\tdo',
                                gloss='a\tmorpheme\t1SG\tdo.SG2',
                                source='own',
                                comment='comment',
                                original_script='a morpheme',
                                language=data[common.Language],
                                jsondata={'alt_translation': 'Spanish: ...'})
    common.SentenceReference(sentence=sentence, source=data[common.Source])
    DBSession.add(common.Config(key='key', value='value'))
    common.Config.add_replacement('replaced', 'language',
                                  model=common.Language)
    common.Config.add_replacement('gone', None, model=common.Language)
    DBSession.flush()
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because it
    will have to be run periodically whenever data has been updated.
    """
    print('Parsing markdown intros...')
    for contrib in DBSession.query(models.Contribution):
        contrib.markup_description = (
            markdown(contrib.description) if contrib.description else None)
    print('... done')

    print('Retrieving language data from glottolog...')
    # Resolve the local glottolog clone via the cldf catalog config.
    catalog_config = cldfcatalog.Config.from_file()
    glottolog = Glottolog(catalog_config.get_clone('glottolog'))
    lang_ids = [language.id for language in DBSession.query(common.Language)]
    languoids = {lg.id: lg for lg in glottolog.languoids(lang_ids)}
    # One Identifier per glottocode / iso code, keyed and ordered by code.
    glottocodes = OrderedDict(
        (code, common.Identifier(id=code, name=code, type='glottolog'))
        for code in sorted(languoids))
    isocodes = OrderedDict(
        (iso, common.Identifier(id=iso, name=iso, type='iso639-3'))
        for iso in sorted({lg.iso for lg in languoids.values() if lg.iso}))
    DBSession.add_all(glottocodes.values())
    DBSession.add_all(isocodes.values())
    # Flush so the identifiers get primary keys we can reference below.
    DBSession.flush()
    for lang in DBSession.query(common.Language):
        languoid = languoids.get(lang.id)
        if languoid is None:
            continue
        # Denormalize name/coordinates/macroarea from the glottolog record.
        lang.name = languoid.name
        lang.latitude = languoid.latitude
        lang.longitude = languoid.longitude
        if languoid.macroareas:
            lang.macroarea = languoid.macroareas[0].name
        else:
            lang.macroarea = ''
        DBSession.add(
            common.LanguageIdentifier(
                language=lang,
                identifier_pk=glottocodes[languoid.id].pk))
        if languoid.iso in isocodes:
            DBSession.add(
                common.LanguageIdentifier(
                    language=lang,
                    identifier_pk=isocodes[languoid.iso].pk))
    DBSession.flush()
    print('... done')

    print('Making pretty colourful dots for parameter values...')
    icon_names = [icon.name for icon in ORDERED_ICONS]
    # Domain elements ordered so groupby sees each parameter's codes together.
    code_query = DBSession.query(common.DomainElement).order_by(
        common.DomainElement.parameter_pk, common.DomainElement.id)
    for _, param_codes in groupby(code_query, lambda de: de.parameter_pk):
        # Restart the icon cycle for every parameter.
        palette = cycle(icon_names)
        for code in param_codes:
            code.update_jsondata(icon=next(palette))
    DBSession.flush()
    print('... done')