def main(args): # pragma: no cover ds = StructureDataset.from_metadata(DS) data = Data() for source in ds.sources: data.add(common.Source, source.id, _obj=bibtex2source(source)) ext = [ Record.from_string('@' + s, lowercase=True) for s in nfilter(BIB.split('@')) ] for rec in ext: if rec.id not in data['Source']: data.add(common.Source, rec.id, _obj=bibtex2source(rec)) for contrib in ds['contributors.csv']: o = data.add( common.Contributor, contrib['ID'], id=contrib['ID'].upper(), name=contrib['Name'], description=contrib['Description'], url=contrib['URL'], jsondata={ 'readme': contrib['Readme'], 'contents': contrib['Contents'] }, ) for src in contrib['Source']: DBSession.add( models.ContributorReference(source=data['Source'][src], contributor=o)) dataset = data.add( common.Dataset, 'phoible', id='phoible', name='PHOIBLE 2.0', description='PHOIBLE 2.0', publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://www.shh.mpg.de", domain='phoible.org', license='https://creativecommons.org/licenses/by-sa/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'https://i.creativecommons.org/l/by-sa/3.0/88x31.png', 'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License' }) for i, (cid, name) in enumerate([ ('UZ', "Steven Moran"), ('mccloy', "Daniel McCloy"), ], start=1): contrib = data['Contributor'].get(cid) if not contrib: contrib = common.Contributor(id=cid, name=name) DBSession.add( common.Editor(dataset=dataset, ord=i, contributor=contrib)) glottolog = Glottolog( Path(phoible.__file__).parent.parent.parent.parent.joinpath( 'glottolog', 'glottolog')) for lang in ds['LanguageTable']: l = data.add( models.Variety, lang['ID'], id=lang['ID'], name=lang['Name'], ) load_families(data, [(l.id, l) for l in data['Variety'].values() if len(l.id) == 8], glottolog.repos) DBSession.flush() # assign color codes: families = defaultdict(list) for l in data['Variety'].values(): families[l.family_pk].append(l) colors = color.qualitative_colors(len(families)) for i, langs in enumerate(sorted(families.values(), key=lambda v: -len(v))): for l in langs: l.jsondata = {'color': colors[i]} for segment in ds['ParameterTable']: equivalence_class = ''.join([ t[0] for t in [(c, unicodedata.name(c)) for c in segment['Name']] if t[1].split()[0] not in ['COMBINING', 'MODIFIER'] ]), data.add(models.Segment, segment['ID'], id=segment['ID'], name=segment['Name'], description=segment['Description'], segment_class=segment['SegmentClass'], equivalence_class=equivalence_class) DBSession.flush() # Add redirects for old language pages! get relevant ISO codes and map to Glottocode! 
for model, repls in load( Path(phoible.__file__).parent.parent / 'replacements.json').items(): if model == 'Language': languoids = {l.id: l for l in glottolog.languoids()} iso_languoids = {l.iso: l for l in languoids.values() if l.iso} gl_in_phoible = set(data['Variety'].keys()) for oid, nid in repls.items(): gls = descendants_from_nodemap( iso_languoids.get(oid), languoids).intersection(gl_in_phoible) if gls: nid = gls.pop() if len(gls) > 1: print('+++', oid, gls) else: print('---', oid) common.Config.add_replacement(oid, nid, common.Language) elif model == 'Parameter': segments_in_phoible = set(data['Segment'].keys()) for oid, nid in repls.items(): id_ = nid if nid in segments_in_phoible else None common.Config.add_replacement(oid, id_, common.Parameter) for segment in ds['ParameterTable']: for i, (k, v) in enumerate(sorted(segment.items())): if k not in ['ID', 'Name', 'Description', 'SegmentClass']: DBSession.add( common.Parameter_data( key=feature_name(k), value=v, ord=i, object_pk=data['Segment'][segment['ID']].pk)) for inventory in ds['contributions.csv']: inv = data.add( models.Inventory, inventory['ID'], id=inventory['ID'], name='{0} ({1} {2})'.format( inventory['Name'], inventory['Contributor_ID'].upper(), inventory['ID'], ), source_url=inventory['URL'], count_tone=inventory['count_tones'], count_vowel=inventory['count_vowels'], count_consonant=inventory['count_consonants'], ) DBSession.add( common.ContributionContributor( contribution=inv, contributor=data['Contributor'][ inventory['Contributor_ID'].upper()])) for src in inventory['Source']: DBSession.add( common.ContributionReference(contribution=inv, source=data['Source'][src])) for phoneme in ds['ValueTable']: lang = data['Variety'][phoneme['Language_ID']] inv = data['Inventory'][phoneme['Contribution_ID']] if not inv.language: inv.language = lang vs = common.ValueSet( id=phoneme['ID'], contribution=inv, language=lang, parameter=data['Segment'][phoneme['Parameter_ID']]) for ref in phoneme['Source']: DBSession.add( common.ValueSetReference(source=data['Source'][ref], valueset=vs)) DBSession.add( models.Phoneme( id=phoneme['ID'], name='%s %s' % (phoneme['Value'], data['Inventory'][phoneme['Contribution_ID']].name), allophones=' '.join(phoneme['Allophones']), marginal=phoneme['Marginal'], valueset=vs)) return
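# Illustrative sketch (not part of the loader above): the equivalence_class computed for
# each PHOIBLE segment strips COMBINING and MODIFIER characters, so diacritic variants
# collapse onto the same base glyphs. A minimal standalone version using only the
# standard library might look like this; the helper name `base_segment` is made up
# purely for illustration.
import unicodedata

def base_segment(segment):
    """Drop combining/modifier characters from a segment string."""
    def keep(c):
        name = unicodedata.name(c, 'UNKNOWN')
        return name.split()[0] not in ('COMBINING', 'MODIFIER')
    return ''.join(c for c in segment if keep(c))

# e.g. base_segment('t̪ʰ') returns 't': the dental diacritic is a COMBINING character
# and the aspiration mark is a MODIFIER letter, so both are dropped.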
def main(args): data = Data() ds = Pofatu( pathlib.Path(pofatu.__file__).parent.parent.parent / 'pofatu-data') dataset = common.Dataset( id=pofatu.__name__, name="POFATU", publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="https://www.eva.mpg.de", license="https://creativecommons.org/licenses/by/4.0/", domain='pofatu.clld.org', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License' }) for i, (id_, name) in enumerate([ ('hermannaymeric', 'Aymeric Hermann'), ('forkelrobert', 'Robert Forkel'), ]): ed = data.add(common.Contributor, id_, id=id_, name=name) common.Editor(dataset=dataset, contributor=ed, ord=i + 1) DBSession.add(dataset) for rec in ds.iterbib(): rec.genre = bibtex.EntryType.from_string( ENTRY_TYPES.get(rec.genre, rec.genre)) if 'date' in rec: rec['year'] = rec.pop('date') data.add(common.Source, rec.id, _obj=bibtex2source(rec, lowercase_id=False)) analyses = list(ds.iterdata()) def midpoint(coords): p = MultiPoint([(lat, lon + 360 if lon < 0 else lon) for lat, lon in coords]).convex_hull #geojson = { # 'type': 'Feature', # 'properties': {}, # 'geometry': mapping(p)} c = p.centroid return c.x, (c.y - 360) if c.y > 180 else c.y artefacts = collections.defaultdict(dict) midpoints = {} for a in analyses: l = a.sample.location lid = l.id if lid not in midpoints: midpoints[lid] = set() if l.latitude is not None and l.longitude is not None: midpoints[lid].add((l.latitude, l.longitude)) art = a.sample.artefact for attr_ in ['name', 'category', 'collection_type']: if not artefacts[slug(art.id)].get(attr_): artefacts[slug(art.id)][attr_] = getattr(art, attr_) midpoints = { k: midpoint(v) if v else (None, None) for k, v in midpoints.items() } for analysis in analyses: loc = analysis.sample.location if loc.id not in data['Location']: data.add( models.Location, loc.id, id=valid_id(loc.id), name=loc.label, latitude=midpoints[loc.id][0], longitude=midpoints[loc.id][1], region=loc.region.replace('_', ' '), subregion=loc.subregion, location=loc.locality, ) # Add contributions for contrib in ds.itercontributions(): contribution = data.add( common.Contribution, contrib.id, id=valid_id(contrib.id), name=contrib.label, description=contrib.description, ) DBSession.flush() for i, name in enumerate(contrib.contributors): cid = slug(name) co = data['Contributor'].get(cid) if not co: co = data.add(common.Contributor, cid, id=cid, name=name) common.ContributionContributor(ord=i, contribution=contribution, contributor=co) for ref in contrib.source_ids: DBSession.add( common.ContributionReference( contribution=contribution, source=data['Source'][ref], )) data['Contribution'][ref] = contribution methods = collections.defaultdict(list) for method in ds.itermethods(): m = data.add( models.Method, method.id, id=valid_id(method.id), name=method.label, code=method.code, parameter=method.parameter.strip(), instrument=method.instrument, number_of_replicates=method.number_of_replicates, date=method.date, comment=method.comment, detection_limit=method.detection_limit, detection_limit_unit=method.detection_limit_unit, total_procedural_blank_value=method.total_procedural_blank_value, total_procedural_unit=method.total_procedural_unit, ) methods[(m.code.lower(), m.parameter.lower())].append(m) for ref in method.references: DBSession.add( models.MethodReference( method=m, sample_name=ref.sample_name, sample_measured_value=ref.sample_measured_value, uncertainty=ref.uncertainty, 
uncertainty_unit=ref.uncertainty_unit, number_of_measurements=ref.number_of_measurements, )) for ref in method.normalizations: DBSession.add( models.Normalization( method=m, reference_sample_name=ref.reference_sample_name, reference_sample_accepted_value=ref. reference_sample_accepted_value, citation=ref.citation, )) parameter = data.add(common.Parameter, 'c', id='category', name='Sample category') for i, opt in enumerate(attr.fields_dict( pypofatu.models.Sample)['sample_category'].validator.options, start=1): data.add(common.DomainElement, opt, parameter=parameter, id=str(i), name=opt) DBSession.flush() assert parameter.pk # Add Samples and UnitParameters and Measurements for analysis in analyses: sample = analysis.sample vsid = '{0}-{1}'.format(sample.location.id, data['Contribution'][sample.source_id].id) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id=valid_id(vsid), language_pk=data['Location'][sample.location.id].pk, parameter_pk=parameter.pk, contribution_pk=data['Contribution'][sample.source_id].pk, ) v = data['Sample'].get(sample.id) if not v: v = data.add( models.Sample, sample.id, id=valid_id(sample.id), name=sample.id, sample_name=sample.sample_name, sample_comment=sample.sample_comment, petrography=sample.petrography, latitude=sample.location.latitude, longitude=sample.location.longitude, elevation=sample.location.elevation, location_comment=sample.location.comment, site_name=sample.site.name, site_code=sample.site.code, site_context=sample.site.context, site_comment=sample.site.comment, site_stratigraphic_position=sample.site.stratigraphic_position, site_stratigraphy_comment=sample.site.stratigraphy_comment, domainelement=data['DomainElement'][sample.sample_category], valueset=vs, artefact_id=sample.artefact.id, artefact_name=sample.artefact.name, artefact_category=sample.artefact.category, artefact_comment=sample.artefact.comment, artefact_attributes=sample.artefact.attributes, artefact_collector=sample.artefact.collector, artefact_collection_type=sample.artefact.collection_type, artefact_collection_location=sample.artefact. 
collection_location,
                artefact_collection_comment=sample.artefact.collection_comment,
                artefact_fieldwork_date=sample.artefact.fieldwork_date,
            )
            DBSession.add(
                models.SampleReference(
                    description='sample',
                    sample=v,
                    source=data['Source'][sample.source_id]))
            for ref in sample.artefact.source_ids:
                DBSession.add(
                    models.SampleReference(
                        description='artefact', sample=v, source=data['Source'][ref]))
            for ref in sample.site.source_ids:
                DBSession.add(
                    models.SampleReference(
                        description='site', sample=v, source=data['Source'][ref]))
        a = data.add(
            models.Analysis,
            analysis.id,
            id=better_slug(analysis.id),
            name=analysis.id,
            sample=v,
        )
        for i, measurement in enumerate(analysis.measurements):
            if i == 0:
                method = measurement.method
                if method:
                    a.analyzed_material_1 = method.analyzed_material_1
                    a.analyzed_material_2 = method.analyzed_material_2
                    a.sample_preparation = method.sample_preparation
                    a.chemical_treatment = method.chemical_treatment
                    a.technique = method.technique
                    a.laboratory = method.laboratory
                    a.analyst = method.analyst
            pid = slug(measurement.parameter, lowercase=False)
            p = data['Param'].get(pid)
            if not p:
                p = data.add(models.Param, pid, id=pid, name=measurement.parameter)
            data.add(
                models.Measurement,
                None,
                id='{0}-{1}'.format(a.id, p.id),
                analysis=a,
                method=data['Method'].get(measurement.method.id) if measurement.method else None,
                value=measurement.value,
                less=measurement.less,
                precision=measurement.value_sd,
                sigma=measurement.sd_sigma,
                unitparameter=p,
            )
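# Side note on the midpoint() helper defined above: POFATU locations can straddle the
# antimeridian, so negative longitudes are shifted into a 0..360 range before taking the
# centroid of the convex hull, and shifted back afterwards. A self-contained sketch of
# the same idea (shapely assumed available, as in the loader; the name
# `location_midpoint` is chosen here for illustration):
from shapely.geometry import MultiPoint

def location_midpoint(coords):
    """coords: iterable of (lat, lon) pairs; returns a single (lat, lon)."""
    points = MultiPoint(
        [(lat, lon + 360 if lon < 0 else lon) for lat, lon in coords])
    c = points.convex_hull.centroid
    return c.x, (c.y - 360) if c.y > 180 else c.y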
def main(args): # pragma: no cover data = Data() clts_repos = Path(__file__).parent.parent.parent.parent.resolve() / 'clts-data' clts_repos = CLTS(clts_repos) print(clts_repos.repos) version = 'v2.1.0' # assert_release(clts_repos.repos) for rec in Database.from_file(args.cldf.bibpath, lowercase=True): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) dataset = common.Dataset( id='clts', name="CLTS {0}".format(version), publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="http://www.eva.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", contact='*****@*****.**', domain='clts.clld.org', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) DBSession.add(dataset) for i, name in enumerate([ 'Johann-Mattis List', 'Cormac Anderson', 'Tiago Tresoldi', 'Robert Forkel', ]): c = common.Contributor(id=slug(name), name=name) dataset.editors.append(common.Editor(contributor=c, ord=i)) for line in args.cldf['data/features.tsv']: data.add( models.Feature, line['ID'], id=line['ID'], name='{} {}: {}'.format(line['TYPE'], line['FEATURE'], line['VALUE']), sound_type=line['TYPE'], feature=line['FEATURE'], value=line['VALUE'], ) DBSession.add(models.SoundSegment( id='NA', name='<NA>', description='<NA>', type='marker', generated=True, unicode='', color='#bbbbbb', )) for line in args.cldf['data/sounds.tsv']: s = data.add( models.SoundSegment, line['ID'], id=line['ID'], name=line['GRAPHEME'], description=line['NAME'], type=line['TYPE'], generated=line['GENERATED'], unicode=' / '.join(line['UNICODE']), color=clts_repos.soundclass('color').resolve_sound(line['GRAPHEME']), ) if s.color == '0': s.color = '#bbbbbb' assert s.color in LEGEND DBSession.flush() seen = set() for line in args.cldf['data/sounds.tsv']: for fid in line['FEATURES']: spk, fpk = data['SoundSegment'][line['ID']].pk, data['Feature'][fid].pk if (spk, fpk) not in seen: DBSession.add(models.SoundSegmentFeature(soundsegment_pk=spk, feature_pk=fpk)) seen.add((spk, fpk)) english = data.add( common.Language, 'eng', id='eng', name='English') for line in args.cldf['sources/index.tsv']: c = data.add( models.Transcription, line['NAME'], id=line['NAME'], name=line['NAME'], description=line['DESCRIPTION'].replace(':bib:', '/sources/'), datatype=getattr(models.Datatype, line['TYPE']) ) for ref in line.get('REFS', []): common.ContributionReference(source=data['Source'][ref], contribution=c) sound_url_template = args.cldf['data/graphemes.tsv', 'SOUND'].valueUrl image_url_template = args.cldf['data/graphemes.tsv', 'IMAGE'].valueUrl for line in args.cldf['data/graphemes.tsv']: key = line['DATASET'] + ':' + line['NAME'] + ':' + line['GRAPHEME'] if key not in data['Grapheme']: sound_id = line['NAME'].replace(' ', '_') vs = data['ValueSet'].get((line['DATASET'], line['NAME'])) if not vs: try: vs = data.add( common.ValueSet, (line['DATASET'], line['NAME']), id=key, description=line['NAME'], language=english, contribution=data['Transcription'][line['DATASET']], parameter=data['SoundSegment'][sound_id] ) except: print(line) raise data.add( models.Grapheme, key, id=key, name=line['GRAPHEME'], description=line['NAME'], url=line['URL'].unsplit() if line['URL'] else None, audio=sound_url_template.expand(line) if line['SOUND'] else None, image=image_url_template.expand(line) if line['IMAGE'] else None, valueset=vs )
def setUp(self):
    TestWithDb.setUp(self)

    DBSession.add(
        common.Dataset(id='dataset', name='dataset', description='desc', domain='clld'))

    source = common.Source(id='source')
    contributors = {'contributor': 'A Name', 'b': 'b Name', 'c': 'c Name', 'd': 'd Name'}
    for id_, name in contributors.items():
        contributors[id_] = common.Contributor(id=id_, name=name)

    contribution = common.Contribution(id='contribution', name='Contribution')
    cr = common.ContributionReference(contribution=contribution, source=source)
    assert common.ContributionContributor(
        contribution=contribution, primary=True, contributor=contributors['contributor'])
    assert common.ContributionContributor(
        contribution=contribution, primary=False, contributor=contributors['b'])
    assert common.ContributionContributor(
        contribution=contribution, primary=True, contributor=contributors['c'])
    assert common.ContributionContributor(
        contribution=contribution, primary=False, contributor=contributors['d'])
    DBSession.add(contribution)

    language = common.Language(id='language', name='Language 1', latitude=10.5, longitude=0.3)
    language.sources.append(source)
    identifier = common.Identifier(type='iso639-3', id='iso')
    li = common.LanguageIdentifier(language=language, identifier=identifier)

    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='%.3i' % i)
        _li = common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)

    param = common.Parameter(id='parameter', name='Parameter')
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param)
    valueset = common.ValueSet(
        id='valueset', language=language, parameter=param, contribution=contribution)
    value = common.Value(
        id='value', domainelement=de, valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value)

    paramnd = common.Parameter(id='no-domain', name='Parameter without domain')
    valueset = common.ValueSet(
        id='vs2', language=language, parameter=paramnd, contribution=contribution)
    vr = common.ValueSetReference(valueset=valueset, source=source)
    value = common.Value(id='v2', valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value)

    unit = common.Unit(id='unit', name='Unit', language=language)
    up = common.UnitParameter(id='unitparameter', name='UnitParameter')
    DBSession.add(unit)
    DBSession.add(
        common.UnitValue(id='unitvalue', name='UnitValue', unit=unit, unitparameter=up))

    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(
        common.UnitValue(
            id='uv2', name='UnitValue2', unit=unit, unitparameter=up2, unitdomainelement=de))

    DBSession.add(common.Source(id='s'))

    sentence = common.Sentence(
        id='sentence',
        name='sentence name',
        description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo',
        gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own',
        comment='comment',
        original_script='a morpheme',
        language=language)
    sr = common.SentenceReference(sentence=sentence, source=source)
    DBSession.add(common.Config(key='key', value='value'))
    DBSession.flush()
def main(args): data = Data() data_path = lambda *cs: args.data_file('concepticon-data', 'concepticondata', *cs) dataset = common.Dataset( id=concepticon.__name__, name="Concepticon 1.0", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", contact='*****@*****.**', domain='concepticon.clld.org', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License' }) DBSession.add(dataset) for i, name in enumerate( ['Johann-Mattis List', 'Michael Cysouw', 'Robert Forkel']): c = common.Contributor(id=slug(name), name=name) dataset.editors.append(common.Editor(contributor=c, ord=i)) english = data.add(common.Language, 'eng', id='eng', name='English') files = {} for fname in data_path('sources').iterdir(): files[fname.stem] = \ "https://github.com/clld/concepticon-data/blob/master/concepticondata/sources/%s" % fname.name for rec in Database.from_file(data_path('references', 'references.bib'), lowercase=True): source = data.add(common.Source, rec.id, _obj=bibtex2source(rec)) if rec.id in files: DBSession.flush() DBSession.add( common.Source_files(mime_type='application/pdf', object_pk=source.pk, jsondata=dict(url=files[rec.id]))) for concept in reader(data_path('concepticon.tsv'), namedtuples=True): data.add(models.ConceptSet, concept.ID, id=concept.ID, name=concept.GLOSS, description=concept.DEFINITION, semanticfield=concept.SEMANTICFIELD, ontological_category=concept.ONTOLOGICAL_CATEGORY) for rel in reader(data_path('conceptrelations.tsv'), namedtuples=True): DBSession.add( models.Relation(source=data['ConceptSet'][rel.SOURCE], target=data['ConceptSet'][rel.TARGET], description=rel.RELATION)) unmapped = Counter() number_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)') for cl in reader(data_path('conceptlists.tsv'), dicts=True): concepts = data_path('conceptlists', '%(ID)s.tsv' % cl) if not concepts.exists(): continue langs = [l.lower() for l in split(cl['SOURCE_LANGUAGE'])] conceptlist = data.add( models.Conceptlist, cl['ID'], id=cl['ID'], name=' '.join(cl['ID'].split('-')), description=cl['NOTE'], target_languages=cl['TARGET_LANGUAGE'], source_languages=' '.join(langs), year=int(cl['YEAR']) if cl['YEAR'] else None, ) for id_ in split(cl['REFS']): common.ContributionReference(source=data['Source'][id_], contribution=conceptlist) for i, name in enumerate(split(cl['AUTHOR'], sep=' and ')): name = strip_braces(name) contrib = data['Contributor'].get(name) if not contrib: contrib = data.add(common.Contributor, name, id=slug(name), name=name) DBSession.add( common.ContributionContributor(ord=i, contribution=conceptlist, contributor=contrib)) for k in 'ID NOTE TARGET_LANGUAGE SOURCE_LANGUAGE YEAR REFS AUTHOR'.split( ): del cl[k] DBSession.flush() for k, v in cl.items(): DBSession.add( common.Contribution_data(object_pk=conceptlist.pk, key=k, value=v)) for concept in reader(concepts, namedtuples=True): if not concept.ID or not concept.CONCEPTICON_ID or concept.CONCEPTICON_ID == 'NAN': #print conceptlist.id, getattr(concept, 'ENGLISH', getattr(concept, 'GLOSS', None)) unmapped.update([conceptlist.id]) continue lgs = {} for lang in langs: v = getattr(concept, lang.upper()) if v: lgs[lang] = v match = number_pattern.match(concept.NUMBER) if not match: print(concept.ID) raise ValueError vs = common.ValueSet( id=concept.ID, description=getattr(concept, 'GLOSS', getattr(concept, 'ENGLISH', None)), language=english, 
contribution=conceptlist, parameter=data['ConceptSet'][concept.CONCEPTICON_ID]) d = {} for key, value in concept.__dict__.items(): if not key.startswith('CONCEPTICON_') and \ key not in ['NUMBER', 'ID', 'GLOSS'] + [l.upper() for l in langs]: d[key.lower()] = value v = models.Concept( id=concept.ID, valueset=vs, description=getattr(concept, 'GLOSS', None), # our own gloss, if available name='; '.join('%s [%s]' % (lgs[l], l) for l in sorted(lgs.keys())), number=int(match.group('number')), number_suffix=match.group('suffix'), jsondata=d) DBSession.flush() for key, value in lgs.items(): DBSession.add( common.Value_data(key='lang_' + key, value=value, object_pk=v.pk)) print('Unmapped concepts:') for clid, no in unmapped.most_common(): print(clid, no) for fname in data_path('concept_set_meta').iterdir(): if fname.suffix == '.tsv': md = load(fname.parent.joinpath(fname.name + '-metadata.json')) provider = models.MetaProvider(id=fname.stem, name=md['dc:title'], description=md['dc:description'], url=md['dc:source'], jsondata=md) for meta in reader(fname, dicts=True): try: for k, v in meta.items(): if v and k != 'CONCEPTICON_ID': models.ConceptSetMeta(metaprovider=provider, conceptset=data['ConceptSet'] [meta['CONCEPTICON_ID']], key=k, value=v) except: print(fname) print(meta) raise
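# Aside on the NUMBER parsing above: concept list numbers such as '12' or '12a' are split
# into an integer part and an optional suffix via number_pattern. A minimal reproduction
# of that behaviour using only the standard library (the names NUMBER_PATTERN and
# split_number are chosen here for illustration):
import re

NUMBER_PATTERN = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)')

def split_number(raw):
    """'12a' -> (12, 'a'); '7' -> (7, '')."""
    match = NUMBER_PATTERN.match(raw)
    if not match:
        raise ValueError('unparseable concept number: %r' % raw)
    return int(match.group('number')), match.group('suffix')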
def main(args): data = Data() editors = OrderedDict() editors['Susanne Maria Michaelis'] = None editors['Philippe Maurer'] = None editors['Martin Haspelmath'] = None editors['Magnus Huber'] = None for row in read(args, 'People'): name = row['First name'] + ' ' if row['First name'] else '' name += row['Last name'] kw = dict( name=name, id=slug('%(Last name)s%(First name)s' % row), url=row['Contact Website'].split()[0] if row['Contact Website'] else None, address=row['Comments on database'], ) contrib = data.add(common.Contributor, row['Author ID'], **kw) if kw['name'] in editors: editors[kw['name']] = contrib DBSession.flush() dataset = common.Dataset( id='apics', name='APiCS Online', description='Atlas of Pidgin and Creole Language Structures Online', domain='apics-online.info', published=date(2013, 11, 4), license='http://creativecommons.org/licenses/by/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 3.0 Unported License' }) DBSession.add(dataset) for i, editor in enumerate(editors.values()): common.Editor(dataset=dataset, contributor=editor, ord=i + 1) colors = dict( (row['ID'], row['RGB_code']) for row in read(args, 'Colours')) abbrs = {} for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for id_, name in { 'C**T': 'clitic', 'IMPF': 'imperfect', 'INTERM': 'intermediate', 'NCOMPL': 'noncompletive', 'NONFUT': 'nonfuture', 'NPROX': 'nonproximal', 'NSG': 'nonsingular', 'PP': 'past participle', 'PROP': 'proprietive', 'TMA': 'tense-mood-aspect', }.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for row in reader(args.data_file('non-lgr-gloss-abbrs.csv'), delimiter=',', namedtuples=True): for match in GLOSS_ABBR_PATTERN.finditer(row.standard): if match.group('abbr') not in abbrs: abbrs[match.group('abbr')] = 1 DBSession.add( common.GlossAbbreviation(id=match.group('abbr'), name=row.meaning)) non_bibs = {} for row in read(args, 'References', 'Reference_ID'): if row['Reference_type'] == 'Non-bib': non_bibs[row['Reference_ID']] = row['Reference_name'] continue if isinstance(row['Year'], int): year_int = row['Year'] year = str(row['Year']) elif row['Year']: year_int = None for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']): year_int = int(m.group('year')) break year = row['Year'] else: year, year_int = None, None title = row['Article_title'] or row['Book_title'] attrs = {} jsondata = {} for attr, field in { 'Additional_information': 'note', 'Article_title': 'title', 'Book_title': 'booktitle', 'City': 'address', 'Editors': 'editor', 'Full_reference': None, 'Issue': None, 'Journal': 'journal', 'Language_codes': None, 'LaTeX_cite_key': None, 'Pages': 'pages', 'Publisher': 'publisher', 'Reference_type': 'type', 'School': 'school', 'Series_title': 'series', 'URL': 'url', 'Volume': 'volume', }.items(): value = row.get(attr) if not isinstance(value, int): value = (value or '').strip() if attr == 'Issue' and value: try: value = str(int(value)) except ValueError: pass if value: if field: attrs[field] = value else: jsondata[attr] = value p = data.add(common.Source, row['Reference_ID'], id=str(row['Reference_ID']), name=row['Reference_name'], description=title, author=row['Authors'], year=year, year_int=year_int, bibtex_type=getattr(EntryType, row['BibTeX_type'] or 'misc'), jsondata=jsondata, **attrs) if p.bibtex_type.value == 'misc' and not p.description: p.description = p.note DBSession.flush() DBSession.flush() infobox = 
jsonload(args.data_file('infobox.json')) glottocodes = jsonload(args.data_file('glottocodes.json')) for row in read(args, 'Languages', 'Order_number'): lon, lat = [ float(c.strip()) for c in row['map_coordinates'].split(',') ] kw = dict( name=row['Language_name'], id=str(row['Order_number']), latitude=lat, longitude=lon, region=row['Category_region'], ) lect = data.add(models.Lect, row['Language_ID'], **kw) DBSession.flush() for i, item in enumerate(infobox[lect.id]): DBSession.add( common.Language_data(object_pk=lect.pk, ord=i, key=item[0], value=item[1])) if row["Languages_contribution_documentation::Lect_description_checked_status"] \ != "Checked": print 'unchecked! ---', row['Language_name'] desc = row.get( 'Languages_contribution_documentation::Lect description', '') markup_desc = normalize_markup(row[ 'Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description'] ) c = data.add( models.ApicsContribution, row['Language_ID'], id=str(row['Order_number']), name=row['Language_name'], description=desc, markup_description=markup_desc, survey_reference=data['Source'][row['Survey_reference_ID']], language=lect) for ext, label, mtype in [ ('pdf', 'Glossed text', 'application/pdf'), ('mp3', 'Glossed text audio', 'audio/mpeg'), ]: fid = '%s-gt.%s' % (c.id, ext) if args.data_file('files', 'contribution', c.id, fid).exists(): common.Contribution_files(object=c, id=fid, name=label, mime_type=mtype) else: print label, 'missing for:', row['Language_name'] # # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE # iso = None if row['ISO_code'] and len(row['ISO_code']) == 3: iso = row['ISO_code'].lower() if 'iso:%s' % row['ISO_code'] not in data['Identifier']: data.add(common.Identifier, 'iso:%s' % row['ISO_code'], id=row['ISO_code'].lower(), name=row['ISO_code'].lower(), type=common.IdentifierType.iso.value) DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier']['iso:%s' % row['ISO_code']])) if lect.id in glottocodes: identifier = data.add(common.Identifier, 'gc:%s' % glottocodes[lect.id], id=glottocodes[lect.id], name=glottocodes[lect.id], type=common.IdentifierType.glottolog.value) DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=identifier)) if row['Language_name_ethnologue']: if row['Language_name_ethnologue'] not in data['Identifier']: data.add(common.Identifier, row['Language_name_ethnologue'], id=iso or 'ethnologue:%s' % row['Language_name_ethnologue'], name=row['Language_name_ethnologue'], type='ethnologue') DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier'][ row['Language_name_ethnologue']])) example_count = {} for row in read(args, 'Examples', 'Order_number'): assert row['Language_ID'] lang = data['Lect'][row['Language_ID']] id_ = '%(Language_ID)s-%(Example_number)s' % row atext, gloss = igt(row) example_count[row['Language_ID']] = max( [example_count.get(row['Language_ID'], 1), row['Example_number']]) p = add_sentence( args, data, id_, id='%s-%s' % (lang.id, row['Example_number']), name=row['Text'] or row['Analyzed_text'], description=row['Translation'], type=row['Type'].strip().lower() if row['Type'] else None, comment=row['Comments'], gloss=gloss, analyzed=atext, markup_text=normalize_markup(row['z_calc_Text_CSS']), markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']), markup_comment=normalize_markup(row['z_calc_Comments_CSS']), 
markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']), original_script=row['Original_script'], jsondata={ 'sort': row['Order_number'], 'alt_translation': (row['Translation_other'] or '').strip() or None }, language=lang) if row['Reference_ID']: if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add( common.SentenceReference( sentence=p, source=source, key=source.id, description=row['Reference_pages'])) else: p.source = non_bibs[row['Reference_ID']] DBSession.flush() for row in read(args, 'Language_references'): if row['Reference_ID'] not in data['Source']: assert row['Reference_ID'] in non_bibs continue assert row['Language_ID'] in data['ApicsContribution'] source = data['Source'][row['Reference_ID']] DBSession.add( common.ContributionReference( contribution=data['ApicsContribution'][row['Language_ID']], source=source, description=row['Pages'], key=source.id)) # # global counter for features - across feature types # feature_count = 0 for row in read(args, 'Features', 'Feature_number'): id_ = str(row['Feature_number']) if int(id_) > feature_count: feature_count = int(id_) wals_id = None desc = row['Feature_annotation_publication'] if row['WALS_match'] == 'Total': if isinstance(row['WALS_No.'], int): wals_id = row['WALS_No.'] else: wals_id = int(row['WALS_No.'].split('.')[0].strip()) p = data.add(models.Feature, row['Feature_code'], name=row['Feature_name'], id=id_, description=desc, markup_description=normalize_markup( row['z_calc_Feature_annotation_publication_CSS']), feature_type='primary', multivalued=row['Value_relation_type'] != 'Single', area=row['Feature_area'], wals_id=wals_id) names = {} for i in range(1, 10): if not row['Value%s_publication' % i] \ or not row['Value%s_publication' % i].strip(): continue name = row['Value%s_publication' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 de = data.add( common.DomainElement, '%s-%s' % (row['Feature_code'], i), id='%s-%s' % (id_, i), name=name, parameter=p, abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name, number=int(row['Value%s_value_number_for_publication' % i]), jsondata={'color': colors[row['Value_%s_colour_ID' % i]]}, ) assert de if row['Authors_FeatureArticles']: authors, _ = row['Authors_FeatureArticles'].split('and the APiCS') authors = authors.strip() if authors.endswith(','): authors = authors[:-1].strip() for i, name in enumerate(authors.split(',')): assert name.strip() in editors p._authors.append( models.FeatureAuthor(ord=i + 1, contributor=editors[name.strip()])) DBSession.flush() primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41} segment_to_primary = dict( zip(primary_to_segment.values(), primary_to_segment.keys())) number_map = {} names = {} for row in read(args, 'Segment_features', 'Order_number'): symbol = row['Segment_symbol'] if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate': symbol = 't\u0361s' truth = lambda s: s and s.strip().lower() == 'yes' name = '%s - %s' % (symbol, row['Segment_name']) if name in names: number_map[row['Segment_feature_number']] = names[name] continue number_map[ row['Segment_feature_number']] = row['Segment_feature_number'] names[name] = row['Segment_feature_number'] feature_count += 1 if row['Segment_feature_number'] in segment_to_primary: primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\ = str(feature_count) p = data.add(models.Feature, row['Segment_feature_number'], name=name, id=str(feature_count), feature_type='segment', area='Vowels' if 
truth(row['Vowel']) else ('Obstruent consonants' if truth(row['Obstruent']) else 'Sonorant consonants'), jsondata=dict( number=int(row['Segment_feature_number']), vowel=truth(row['Vowel']), consonant=truth(row['Consonant']), obstruent=truth(row['Obstruent']), core_list=truth(row['Core_list_segment']), symbol=symbol, )) for i, spec in SEGMENT_VALUES.items(): data.add(common.DomainElement, '%s-%s' % (row['Segment_feature_number'], spec[0]), id='%s-%s' % (p.id, i), name=spec[0], parameter=p, jsondata={'color': spec[1]}, number=i) print '--> remapped:', primary_to_segment DBSession.flush() for row in read(args, 'Sociolinguistic_features', 'Sociolinguistic_feature_number'): feature_count += 1 p = data.add(models.Feature, row['Sociolinguistic_feature_code'], name=row['Sociolinguistic_feature_name'], id='%s' % feature_count, description=row['Sociolinguistic_feature_annotation'], area='Sociolinguistic', feature_type='sociolinguistic') names = {} for i in range(1, 10): id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i) if row.get('Value%s' % i) and row['Value%s' % i].strip(): name = row['Value%s' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 else: continue kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i) data.add(common.DomainElement, id_, id='%s-%s' % (p.id, i), name=name, parameter=p, number=i, jsondata={ 'color': colors.get(row['Value%s_colour_ID' % i], colors.values()[i]) }) sd = {} for row in read(args, 'Segment_data'): if row['Segment_feature_number'] not in number_map: continue number = number_map[row['Segment_feature_number']] if not row['Presence_in_the_language']: continue lang = data['Lect'][row['Language_ID']] param = data['Feature'][number] id_ = '%s-%s' % (lang.id, param.id) if id_ in sd: assert row['c_Record_is_a_duplicate'] == 'Yes' continue sd[id_] = 1 valueset = data.add( common.ValueSet, id_, id=id_, parameter=param, language=lang, contribution=data['ApicsContribution'][row['Language_ID']], description=row['Comments'], markup_description=normalize_markup(row['z_calc_Comments_CSS']), ) v = data.add( common.Value, id_, id=id_, frequency=float(100), valueset=valueset, domainelement=data['DomainElement'][ '%s-%s' % (number, row['Presence_in_the_language'])], ) if row['Example_word'] and row['Example_word_gloss']: example_count[row['Language_ID']] += 1 p = add_sentence(args, data, '%s-p%s' % (lang.id, data['Feature'][number].id), id='%s-%s' % (lang.id, example_count[row['Language_ID']]), name=row['Example_word'], description=row['Example_word_gloss'], language=lang) DBSession.add(common.ValueSentence(value=v, sentence=p)) source = data['Source'].get(row['Refers_to_references_Reference_ID']) if source: DBSession.add( common.ValueSetReference(valueset=valueset, source=source, key=source.id)) elif row['Refers_to_references_Reference_ID'] in non_bibs: valueset.source = non_bibs[ row['Refers_to_references_Reference_ID']] lects = defaultdict(lambda: 1) lect_map = {} records = {} false_values = {} no_values = {} wals_value_number = {} for row in read(args, 'wals'): if row['z_calc_WALS_value_number']: wals_value_number[ row['Data_record_id']] = row['z_calc_WALS_value_number'] def prefix(attr, _prefix): if _prefix: return '%s_%s' % (_prefix, attr) return attr.capitalize() for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]: num_values = 10 for row in read(args, prefix('data', _prefix)): if not row[prefix('feature_code', _prefix)]: print('no associated feature for', prefix('data', _prefix), row[prefix('data_record_id', _prefix)]) continue lid 
= row['Language_ID'] lect_attr = row.get('Lect_attribute', 'my default lect').lower() if lect_attr != 'my default lect': if (row['Language_ID'], row['Lect_attribute']) in lect_map: lid = lect_map[(row['Language_ID'], row['Lect_attribute'])] else: lang = data['Lect'][row['Language_ID']] c = lects[row['Language_ID']] lid = '%s-%s' % (row['Language_ID'], c) kw = dict( name='%s (%s)' % (lang.name, row['Lect_attribute']), id='%s' % (1000 + 10 * int(lang.id) + c), latitude=lang.latitude, longitude=lang.longitude, description=row['Lect_attribute'], language=lang, ) data.add(models.Lect, lid, **kw) lects[row['Language_ID']] += 1 lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid id_ = abbr + str(row[prefix('data_record_id', _prefix)]) assert id_ not in records records[id_] = 1 assert row[prefix('feature_code', _prefix)] in data['Feature'] language = data['Lect'][lid] parameter = data['Feature'][row[prefix('feature_code', _prefix)]] valueset = common.ValueSet( id='%s-%s' % (language.id, parameter.id), description=row['Comments_on_value_assignment'], markup_description=normalize_markup( row.get('z_calc_Comments_on_value_assignment_CSS')), ) values_found = {} for i in range(1, num_values): if not row['Value%s_true_false' % i]: continue if row['Value%s_true_false' % i].strip().lower() != 'true': assert row['Value%s_true_false' % i].strip().lower() == 'false' false_values[row[prefix('data_record_id', _prefix)]] = 1 continue iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i) if iid not in data['DomainElement']: print(iid, row[prefix('data_record_id', _prefix)], '--> no domainelement!') continue values_found['%s-%s' % (id_, i)] = dict( id='%s-%s' % (valueset.id, i), domainelement=data['DomainElement']['%s-%s' % (row[prefix( 'feature_code', _prefix)], i)], confidence=row['Value%s_confidence' % i], frequency=float(row['c_V%s_frequency_normalised' % i]) if _prefix == '' else 100) if values_found: if row[prefix('data_record_id', _prefix)] in wals_value_number: valueset.jsondata = { 'wals_value_number': wals_value_number.pop(row[prefix( 'data_record_id', _prefix)]) } valueset.parameter = parameter valueset.language = language valueset.contribution = data['ApicsContribution'][ row['Language_ID']] valueset = data.add(common.ValueSet, id_, _obj=valueset) for i, item in enumerate(values_found.items()): if i > 0 and not parameter.multivalued: print 'multiple values for single-valued parameter: %s' % id_ break id_, kw = item kw['valueset'] = valueset value = data.add(common.Value, id_, **kw) # # store references to additional data for segments which should be reused # for corresponding primary features! 
                    #
                    if int(parameter.id) in primary_to_segment:
                        assert len(values_found) == 1
                        seg_id = '%s-%s' % (
                            language.id, primary_to_segment[int(parameter.id)])
                        seg_valueset = data['ValueSet'][seg_id]
                        seg_value = data['Value'][seg_id]
                        if not valueset.description and seg_valueset.description:
                            valueset.description = seg_valueset.description
                        for s in seg_value.sentence_assocs:
                            DBSession.add(
                                common.ValueSentence(value=value, sentence=s.sentence))
                        for r in seg_valueset.references:
                            DBSession.add(
                                common.ValueSetReference(
                                    valueset=valueset, source=r.source, key=r.key))
                        if not valueset.source and seg_valueset.source:
                            valueset.source = seg_valueset.source
                    DBSession.flush()
                else:
                    no_values[id_] = 1

    DBSession.flush()

    for prefix, abbr, num_values in [
        ('D', '', 10),
        ('Sociolinguistic_d', 'sl', 7),
    ]:
        for row in read(args, prefix + 'ata_references'):
            assert row['Reference_ID'] in data['Source'] \
                or row['Reference_ID'] in non_bibs
            try:
                vs = data['ValueSet'][abbr + str(row[prefix + 'ata_record_id'])]
                if row['Reference_ID'] in data['Source']:
                    source = data['Source'][row['Reference_ID']]
                    DBSession.add(
                        common.ValueSetReference(
                            valueset=vs,
                            source=source,
                            key=source.id,
                            description=row['Pages'],
                        ))
                else:
                    if vs.source:
                        vs.source += '; ' + non_bibs[row['Reference_ID']]
                    else:
                        vs.source = non_bibs[row['Reference_ID']]
            except KeyError:
                continue

    DBSession.flush()

    missing = 0
    for row in read(args, 'Value_examples'):
        try:
            DBSession.add(
                common.ValueSentence(
                    value=data['Value']['%(Data_record_id)s-%(Value_number)s' % row],
                    sentence=data['Sentence']['%(Language_ID)s-%(Example_number)s' % row],
                    description=row['Notes'],
                ))
        except KeyError:
            missing += 1
    print('%s Value_examples are missing data' % missing)
    print('%s data sets with false values' % len(false_values))
    print('%s data sets without values' % len(no_values))

    for k, v in wals_value_number.items():
        print('unclaimed wals value number:', k, v)

    for i, row in enumerate(read(args, 'Contributors')):
        kw = dict(
            contribution=data['ApicsContribution'][row['Language ID']],
            contributor=data['Contributor'][row['Author ID']])
        if row['Order_of_appearance']:
            kw['ord'] = int(float(row['Order_of_appearance']))
        data.add(common.ContributionContributor, i, **kw)
    DBSession.flush()
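# Note on the prefix() helper used when reading the 'Data' and 'Sociolinguistic_data'
# sheets above: one loop handles both tables by deriving column names from an optional
# table prefix. A tiny standalone illustration of that naming convention (the name
# `column_name` is chosen here for illustration):
def column_name(attr, table_prefix=''):
    """'feature_code' -> 'Feature_code' or '<prefix>_feature_code'."""
    if table_prefix:
        return '%s_%s' % (table_prefix, attr)
    return attr.capitalize()

# column_name('data_record_id') -> 'Data_record_id'
# column_name('data_record_id', 'Sociolinguistic') -> 'Sociolinguistic_data_record_id'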
def populate_test_db(engine): set_alembic_version(engine, '58559d4eea0d') data = TestData() data.add_default(common.Dataset, domain='clld', jsondata={ 'license_icon': 'cc-by', 'license_url': 'http://example.org' }) data.add_default(common.Contributor, name='A Name', email='*****@*****.**') for id_, name in { 'b': 'b Name', 'c': 'c Name', 'd': 'd Name', }.items(): data.add(common.Contributor, id_, id=id_, name=name, url='http://example.org') DBSession.add( common.Editor(dataset=data[common.Dataset], contributor=data[common.Contributor])) data.add_default(common.Source) data.add(common.Source, 'replaced', id='replaced', active=False, jsondata={'__replacement_id__': 'source'}) data.add_default(common.Contribution) common.ContributionReference(contribution=data[common.Contribution], source=data[common.Source]) for primary, c in [(True, 'contributor'), (False, 'b'), (True, 'c'), (False, 'd')]: common.ContributionContributor(contribution=data[common.Contribution], primary=primary, contributor=data['Contributor'][c]) data.add_default(common.Language, latitude=10.5, longitude=0.3) data[common.Language].sources.append(data[common.Source]) for i, type_ in enumerate(common.IdentifierType): common.LanguageIdentifier( language=data[common.Language], identifier=common.Identifier( type=type_.value, id=type_.value + str(i), name='abc' if type_.name == 'iso' else 'glot1234')) common.LanguageIdentifier(language=data[common.Language], identifier=common.Identifier(type='name', id='name', name='a')) for i in range(2, 102): _l = common.Language(id='l%s' % i, name='Language %s' % i) _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='abc') common.LanguageIdentifier(language=_l, identifier=_i) DBSession.add(_l) param = data.add_default(common.Parameter) de = common.DomainElement(id='de', name='DomainElement', parameter=param) de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param) valueset = data.add_default(common.ValueSet, language=data[common.Language], parameter=param, contribution=data[common.Contribution]) common.ValueSetReference(valueset=valueset, source=data[common.Source], description='10-20') data.add_default(common.Value, domainelement=de, valueset=valueset, frequency=50, confidence='high') data.add(common.Value, 'value2', id='value2', domainelement=de2, valueset=valueset, frequency=50, confidence='high') paramnd = data.add(common.Parameter, 'no-domain', id='no-domain', name='Parameter without domain') valueset = common.ValueSet(id='vs2', language=data[common.Language], parameter=paramnd, contribution=data[common.Contribution]) common.ValueSetReference(valueset=valueset, source=data[common.Source], description='10-20') common.Value(id='v2', valueset=valueset, frequency=50, confidence='high') unit = data.add_default(common.Unit, language=data[common.Language]) up = data.add_default(common.UnitParameter) common.UnitValue(id='unitvalue', name='UnitValue', unit=unit, unitparameter=up) up2 = common.UnitParameter(id='up2', name='UnitParameter with domain') de = common.UnitDomainElement(id='de', name='de', parameter=up2) DBSession.add( common.UnitValue(id='uv2', name='UnitValue2', unit=unit, unitparameter=up2, unitdomainelement=de)) DBSession.add(common.Source(id='s')) sentence = data.add_default(common.Sentence, description='sentence description', analyzed='a\tmorpheme\tdoes\tdo', gloss='a\tmorpheme\t1SG\tdo.SG2', source='own', comment='comment', original_script='a morpheme', language=data[common.Language], jsondata={'alt_translation': 'Spanish: ...'}) 
    common.SentenceReference(sentence=sentence, source=data[common.Source])
    DBSession.add(common.Config(key='key', value='value'))
    common.Config.add_replacement('replaced', 'language', model=common.Language)
    common.Config.add_replacement('gone', None, model=common.Language)
    DBSession.flush()
def main(args):
    data = Data()

    for rec in Database.from_file(data_path('references.bib'), lowercase=False):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    dataset = common.Dataset(
        id=clts.__name__,
        name="CLTS",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='clts.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    for i, name in enumerate([
            'Johann-Mattis List',
            'Cormac Anderson',
            'Tiago Tresoldi',
            'Thiago Chacon',
            'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    for i, line in enumerate(reader(data_path('sounds.tsv'), delimiter='\t', namedtuples=True)):
        if not i % 100:
            print('-', end="")
        key = line.NAME.replace(' ', '_')
        data.add(
            models.SoundSegment,
            key,
            id=key,
            name=line.NAME,
            grapheme=line.GRAPHEME,
            aliases=line.ALIASES,
            representation=len(line.REFLEXES.split(',')),
            reflexes=line.REFLEXES,
            generated=True if line.GENERATED else False,
            unicode=line.UNICODE,
        )
    print('')

    english = data.add(common.Language, 'eng', id='eng', name='English')

    contributions = {}
    for line in reader(data_path('datasets.tsv'), delimiter='\t', namedtuples=True):
        contributions[line.NAME] = data.add(
            models.CLTSDataSet,
            line.NAME,
            id=line.NAME,
            name=line.NAME,
            description=line.DESCRIPTION,
            datatype=line.TYPE
        )
        for id_ in line.REFS.split(', '):
            common.ContributionReference(
                source=data['Source'][id_], contribution=contributions[line.NAME])

    visited = set()
    for i, line in enumerate(reader(data_path('graphemes.tsv'), delimiter="\t", namedtuples=True)):
        if not i % 100:
            print('-', end='')
        key = line.DATASET + ':' + line.NAME + ':' + line.GRAPHEME
        if key not in visited:
            sound_id = line.NAME.replace(' ', '_')
            vs = common.ValueSet(
                id=key,
                description=line.NAME,
                language=english,
                contribution=contributions[line.DATASET],
                parameter=data['SoundSegment'][sound_id]
            )
            data.add(
                models.Grapheme,
                key,
                id=key,
                grapheme=line.GRAPHEME,
                bipa_grapheme=line.BIPA,
                name=line.NAME,
                dataset=line.DATASET,
                datatype=line.DATATYPE,
                frequency=line.FREQUENCY or 0,
                image=line.IMAGE,
                url=line.URL,
                valueset=vs
            )
            visited.add(key)
    print('-')
def setUp(self): TestWithDb.setUp(self) DBSession.add( common.Dataset(id='dataset', name='dataset', description='desc', domain='clld', jsondata={'license_icon': 'cc-by'})) DBSession.add( common.Source(id='replaced', active=False, jsondata={'__replacement_id__': 'source'})) source = common.Source(id='source') contributors = { 'contributor': 'A Name', 'b': 'b Name', 'c': 'c Name', 'd': 'd Name' } for id_, name in contributors.items(): contributors[id_] = common.Contributor(id=id_, name=name, url='http://example.org') contribution = common.Contribution(id='contribution', name='Contribution') common.ContributionReference(contribution=contribution, source=source) assert common.ContributionContributor( contribution=contribution, primary=True, contributor=contributors['contributor']) assert common.ContributionContributor(contribution=contribution, primary=False, contributor=contributors['b']) assert common.ContributionContributor(contribution=contribution, primary=True, contributor=contributors['c']) assert common.ContributionContributor(contribution=contribution, primary=False, contributor=contributors['d']) DBSession.add(contribution) language = common.Language(id='language', name='Language 1', latitude=10.5, longitude=0.3) language.sources.append(source) for i, type_ in enumerate(common.IdentifierType): id_ = common.Identifier(type=type_.value, id=type_.value + str(i), name='abc') common.LanguageIdentifier(language=language, identifier=id_) for i in range(2, 102): _l = common.Language(id='l%s' % i, name='Language %s' % i) _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='%.3i' % i) common.LanguageIdentifier(language=_l, identifier=_i) DBSession.add(_l) param = common.Parameter(id='parameter', name='Parameter') de = common.DomainElement(id='de', name='DomainElement', parameter=param) de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param) valueset = common.ValueSet(id='valueset', language=language, parameter=param, contribution=contribution) value = common.Value(id='value', domainelement=de, valueset=valueset, frequency=50, confidence='high') DBSession.add(value) value2 = common.Value(id='value2', domainelement=de2, valueset=valueset, frequency=50, confidence='high') DBSession.add(value2) paramnd = common.Parameter(id='no-domain', name='Parameter without domain') valueset = common.ValueSet(id='vs2', language=language, parameter=paramnd, contribution=contribution) common.ValueSetReference(valueset=valueset, source=source) value = common.Value(id='v2', valueset=valueset, frequency=50, confidence='high') DBSession.add(value) unit = common.Unit(id='unit', name='Unit', language=language) up = common.UnitParameter(id='unitparameter', name='UnitParameter') DBSession.add(unit) DBSession.add( common.UnitValue(id='unitvalue', name='UnitValue', unit=unit, unitparameter=up)) up2 = common.UnitParameter(id='up2', name='UnitParameter with domain') de = common.UnitDomainElement(id='de', name='de', parameter=up2) DBSession.add( common.UnitValue(id='uv2', name='UnitValue2', unit=unit, unitparameter=up2, unitdomainelement=de)) DBSession.add(common.Source(id='s')) sentence = common.Sentence( id='sentence', name='sentence name', description='sentence description', analyzed='a\tmorpheme\tdoes\tdo', gloss='a\tmorpheme\t1SG\tdo.SG2', source='own', comment='comment', original_script='a morpheme', language=language, jsondata={'alt_translation': 'Spanish: ...'}) common.SentenceReference(sentence=sentence, source=source) DBSession.add(common.Config(key='key', value='value')) 
    common.Config.add_replacement('replaced', 'language', model=common.Language)
    common.Config.add_replacement('gone', None, model=common.Language)
    DBSession.flush()