def merge(self, ex1, ex2):
    for prop in 'rf tx mb gl ft'.split():
        p1 = ex1.get(prop)
        p2 = ex2.get(prop)
        if p1:
            if p2:
                try:
                    assert slug(p1) == slug(p2)
                except AssertionError:
                    self.log.write(
                        '# cannot merge \\%s:\n%s\n# and\n%s\n\n' % (prop, ex1, ex2))
                    raise
        else:
            if p2:
                ex1.set(prop, p2)

def bibtex2source(rec, cls=common.Source):
    year = bibtex.unescape(rec.get("year", "nd"))
    fields = {}
    jsondata = {}
    for field in bibtex.FIELDS:
        if field in rec:
            value = bibtex.unescape(rec[field])
            container = fields if hasattr(cls, field) else jsondata
            container[field] = value
    etal = ""
    eds = ""
    authors = rec.get("author")
    if not authors:
        authors = rec.get("editor", "")
        if authors:
            eds = " (eds.)"
    if authors:
        authors = bibtex.unescape(authors).split(" and ")
        if len(authors) > 2:
            authors = authors[:1]
            etal = " et al."
        authors = [HumanName(a) for a in authors]
        authors = [n.last or n.first for n in authors]
        authors = "%s%s%s" % (" and ".join(authors), etal, eds)
    return cls(
        id=slug(rec.id),
        name=("%s %s" % (authors, year)).strip(),
        description=bibtex.unescape(rec.get("title", rec.get("booktitle", ""))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields
    )

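# A minimal usage sketch for bibtex2source() above. The file name is
# hypothetical; Database.from_file() is the clld bibtex reader used in the
# import scripts further down:
#
#     for rec in Database.from_file('sources.bib', lowercase=True):
#         source = bibtex2source(rec)  # a common.Source with a slug()-derived id
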
def add_identifier(languoid, data, name, type, description, lang='en'):
    if len(lang) > 3:
        # Weird stuff introduced via hhbib_lgcode names. Roll back language parsing.
        name, lang = '{0} [{1}]'.format(name, lang), 'en'
    identifier = data['Identifier'].get((name, type, description, lang))
    if not identifier:
        identifier = data.add(
            common.Identifier,
            (name, type, description, lang),
            id='{0}-{1}-{2}-{3}'.format(
                slug(name), slug(type), slug(description or ''), lang),
            name=name,
            type=type,
            description=description,
            lang=lang)
    DBSession.add(common.LanguageIdentifier(language=languoid, identifier=identifier))

def make_index(level, repos=None):
    fname = dict(
        language='languages', family='families', dialect='dialects')[level.name]
    links = defaultdict(dict)
    for lang in walk_tree(tree=languoids_path('tree', repos=repos)):
        if lang.level == level:
            label = '{0.name} [{0.id}]'.format(lang)
            if lang.iso:
                label += '[%s]' % lang.iso
            links[slug(lang.name)[0]][label] = \
                lang.dir.joinpath(lang.fname('.ini'))\
                .relative_to(languoids_path(repos=repos))

    res = [languoids_path(fname + '.md', repos=repos)]
    with res[0].open('w', encoding='utf8') as fp:
        fp.write('## %s\n\n' % fname.capitalize())
        fp.write(' '.join(
            '[-%s-](%s_%s.md)' % (i.upper(), fname, i) for i in sorted(links.keys())))
        fp.write('\n')

    for i, langs in links.items():
        res.append(languoids_path('%s_%s.md' % (fname, i), repos=repos))
        with res[-1].open('w', encoding='utf8') as fp:
            for label in sorted(langs.keys()):
                fp.write('- [%s](%s)\n' % (label, langs[label]))
    return res

def from_csv(cls, row, data=None, description=None):
    obj = cls(**{n: row[i] for i, n in enumerate(cls.__csv_head__)
                 if '__' not in n and n != 'audio'})
    if not slug(row[1]):
        obj.active = False
    row = dict(list(zip(cls.__csv_head__, row)))
    sid = row['taxa__id']
    lid = row['languages__id']
    vsid = '%s-%s' % (sid, lid)
    if vsid in data['ValueSet']:
        obj.valueset = data['ValueSet'][vsid]
    else:
        # Note: source and references are dumped redundantly with each word, so we
        # only have to recreate these if a new ValueSet had to be created.
        obj.valueset = data.add(
            ValueSet, vsid,
            id=vsid,
            parameter=data['Taxon'][sid],
            language=data['Languoid'][lid],
            contribution=data['Contribution']['tsammalex'])
        if row['refs__ids']:
            for i, rid, pages in parse_ref_ids(row['refs__ids']):
                data.add(
                    NameReference, '%s-%s' % (obj.id, i),
                    name=obj,
                    source=data['Bibrec'][rid],
                    description=pages or None)
    for rel, cls in [
        ('categories', 'Category'), ('habitats', 'Category'), ('uses', 'Use')
    ]:
        for id_ in split_ids(row[rel + '__ids']):
            getattr(obj, rel).append(data[cls][id_.strip()])
    return obj

def bibtex2source(rec, cls=common.Source, lowercase_id=False):
    year = bibtex.unescape(rec.get('year', 'nd'))
    fields = {}
    jsondata = {}
    for field in bibtex.FIELDS:
        if field in rec:
            value = bibtex.unescape(rec[field])
            container = fields if hasattr(cls, field) else jsondata
            container[field] = value
    etal = ''
    eds = ''
    authors = rec.get('author')
    if not authors:
        authors = rec.get('editor', '')
        if authors:
            eds = ' (eds.)'
    if authors:
        authors = bibtex.unescape(authors).split(' and ')
        if len(authors) > 2:
            authors = authors[:1]
            etal = ' et al.'
        authors = [HumanName(a) for a in authors]
        authors = [n.last or n.first for n in authors]
        authors = '%s%s%s' % (' and '.join(authors), etal, eds)
    return cls(
        id=slug(rec.id, lowercase=lowercase_id),
        name=('%s %s' % (authors, year)).strip(),
        description=bibtex.unescape(rec.get('title', rec.get('booktitle', ''))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields)

def update_name(self, lid, newname, other=None):
    lpk = self.pk(Language, lid)
    self.update(Language, dict(name=newname), pk=lpk)
    self.update(
        WalsLanguage, dict(ascii_name=slug(newname, remove_whitespace=False)), pk=lpk)
    if other:
        ipk = self.insert(Identifier, name=other, description='other', type='name')
        self.insert(LanguageIdentifier, identifier_pk=ipk, language_pk=lpk)

def adapter_factory(*args, **kw):
    # for backwards compatibility we interpret the first positional argument as template:
    if args:
        kw['template'] = args[0]
    assert 'template' in kw
    kw.setdefault('mimetype', 'text/html')
    kw.setdefault('extension', 'html')
    base = kw.pop('base', Representation)
    return type(str('AdapterFromFactory%s' % slug(text_type(uuid4()))), (base,), kw)

def florafauna(req):
    note_ids = [
        'fish_notes_Mali_JH',
        'flora_notes_Mali_JH',
        'insect_arthropod_mollusc_notes_Mali_JH',
        'mammal_notes_Mali_JH',
        'reptile_notes_Mali_JH',
        'bird_notes_Mali_JH',
    ]
    return {
        'notes': [Source.get(slug(sid)) for sid in note_ids]
    }

def other(req):
    jenaama = 'Heath2016-Jenaama-lexicon Heath2016-JenaamaBozo'.split()
    rows = [
        ["Tieyaxo", "Tigemaxo", "boz", "tiey1235"],
        ["Tiema Cewe", "Tiema Ce", "boo", "tiem1235"],
        ["Kelenga", "Hainyaxo", "bsx", "hain1253"],
        ["Jenaama", "Sorogaana", "bze", "jena1242"],
    ]
    return {
        'rows': rows,
        'jenaama': [Source.get(slug(sid)) for sid in jenaama]
    }

def nexus_slug(s):
    """
    Converts a string to a nexus "safe" representation (i.e. removes
    many unicode characters and removes some punctuation characters).

    Parameters
    ----------
    s : str
        A string to convert to a nexus safe format.

    Returns
    -------
    s : str
        A string containing a nexus safe label.
    """
    return slug(s, lowercase=False, remove_whitespace=False).replace(" ", "_")

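# A minimal usage sketch (assuming clldutils.misc.slug keeps case with
# lowercase=False and keeps spaces with remove_whitespace=False, so only the
# space-to-underscore replacement remains to be done here):
#
#     nexus_slug('Tiema Cewe (Bozo)')  ->  'Tiema_Cewe_Bozo'
#
# The exact output depends on the installed slug() version; the value shown is
# illustrative.
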
def to_cldf(wordlist, path='cldf', source_path=None, ref="cogid",
            segments="tokens", form="ipa", note='note', form_in_source="value",
            source=None, alignment=None):
    """Convert a wordlist in LingPy to CLDF.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A regular Wordlist object (or similar).
    path : str (default='cldf')
        The name of the directory to which the files will be written.
    source_path : str (default=None)
        If available, specify the path of your BibTex file with the sources.
    ref : str (default="cogid")
        The column in which the cognate sets are stored.
    segments : str (default="tokens")
        The column in which the segmented phonetic strings are stored.
    form : str (default="ipa")
        The column in which the unsegmented phonetic strings are stored.
    note : str (default=None)
        The column in which you store your comments.
    form_in_source : str (default=None)
        The column in which you store the original form in the source.
    source : str (default=None)
        The column in which you store your source information.
    alignment : str (default="alignment")
        The column in which you store the alignments.
    """
    if not cldf:
        raise ValueError('The package pycldf needs to be installed')

    # create cldf-dataset
    ds = CLDF_Wordlist.in_dir(path)
    # add sources if they are available
    ds.add_sources(read_text(source_path) if source_path else '')
    # add components
    ds.add_component('LanguageTable')
    ds.add_component('ParameterTable', 'Concepticon_ID')
    ds.add_component('CognateTable')
    ds.add_columns('FormTable', 'form_in_source')

    languages, parameters, forms, cognates = {}, {}, [], []
    for idx in wordlist:
        lid = slug(wordlist[idx, 'doculect'])
        if lid not in languages:
            languages[lid] = dict(
                ID=lid,
                Name=wordlist[idx, 'doculect'],
                Glottocode=wordlist[idx, 'glottocode'])
        pid = wordlist[idx, 'concepticon_id'] or slug(wordlist[idx, 'concept'])
        if pid not in parameters:
            parameters[pid] = dict(
                ID=pid,
                Name=wordlist[idx, 'concept'],
                Concepticon_ID=wordlist[idx, 'concepticon_id'])
        forms.append(dict(
            ID=str(idx),
            Language_ID=lid,
            Parameter_ID=pid,
            form_in_source=wordlist[idx, form_in_source] or '' if form_in_source else '',
            Form=wordlist[idx, form] or '' if form else '',
            Segments=wordlist[idx, segments] or '' if segments else '',
            Source=[wordlist[idx, source]] or [] if source else [],
            Comment=wordlist[idx, note] or '' if note else ''))
        if ref:
            cognates.append(dict(
                ID=str(idx),
                Form_ID=str(idx),
                Cognateset_ID=wordlist[idx, ref],
                Alignment=wordlist[idx, alignment] or [''] if alignment else ['']))

    ds.write(
        FormTable=forms,
        LanguageTable=languages.values(),
        ParameterTable=parameters.values(),
        CognateTable=cognates)

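# A minimal usage sketch for to_cldf() above, assuming a LingPy-readable TSV
# with at least the columns 'doculect', 'glottocode', 'concept',
# 'concepticon_id', 'ipa', 'tokens' and 'cogid' (file names are hypothetical):
#
#     from lingpy import Wordlist
#     wl = Wordlist('my-wordlist.tsv')
#     to_cldf(wl, path='cldf', source_path='sources.bib')
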
def lff2tree(lff, tree=None, outdir='fromlff'):
    out = Path(outdir)
    out.mkdir()
    old_tree = {l.id: l for l in languoids_from_tree(tree)} if tree else {}

    nodes = set()
    languages = {}
    for lang in read_lff(lff, 'language'):
        groupdir = out
        languages[lang.id] = lang

        for name, id_, level in lang.lineage:
            groupdir = groupdir.joinpath('%s.%s' % (slug(name), id_))
            if not groupdir.exists():
                groupdir.mkdir()
                if id_ in old_tree:
                    group = old_tree[id_]
                    assert group.level == level
                    if name != group.name:
                        # rename a subgroup!
                        group.name = name
                    group.write_info(groupdir)
                else:
                    # TODO: create Languoid, write info file!
                    pass

            assert id_ in old_tree
            nodes.add(id_)

        assert lang.id in old_tree
        nodes.add(lang.id)
        old_lang = old_tree[lang.id]
        assert old_lang.level == lang.level
        if old_lang.name != lang.name:
            old_lang.name = lang.name
        langdir = groupdir.joinpath(lang.fname())
        langdir.mkdir()
        old_lang.write_info(langdir)

    for lang in read_lff(lff.replace('lff', 'dff'), 'dialect'):
        groupdir = out
        if not lang.lineage:
            # TODO: handle error of un-attached dialects!
            continue

        for name, id_, level in languages[lang.lineage[0][1]].lineage + lang.lineage:
            groupdir = groupdir.joinpath('%s.%s' % (slug(name), id_))
            if not groupdir.exists():
                groupdir.mkdir()
                if id_ in old_tree:
                    group = old_tree[id_]
                    assert group.level == level
                    if name != group.name:
                        # rename a subgroup!
                        group.name = name
                    group.write_info(groupdir)
                else:
                    # TODO: create Languoid, write info file!
                    pass

            assert id_ in old_tree
            nodes.add(id_)

        assert lang.id in old_tree
        nodes.add(lang.id)
        old_lang = old_tree[lang.id]
        assert old_lang.level == lang.level
        if old_lang.name != lang.name:
            old_lang.name = lang.name
        langdir = groupdir.joinpath(lang.fname())
        langdir.mkdir()
        old_lang.write_info(langdir)

    print(len(nodes))

def contribution_id(self):
    return '{0}-{1}'.format(self.contributor_id, slug(self.review_title))

def normalize_sid(sid):
    return slug(sid.replace('+', 'and').replace('&', 'and'))

def words(s):
    return set(slug(s.strip(), remove_whitespace=False).split())

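# Example of the token-set comparison this helper enables (values assume the
# usual clldutils slug() behaviour of lowercasing and stripping punctuation):
#
#     words(' The Title, 2nd ed. ')  ->  {'the', 'title', '2nd', 'ed'}
#
# gbs_func() and ia_func() further down use such sets to fuzzily match local
# source titles against Google Books / Internet Archive metadata.
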
def parameter_id(self):
    return slug(self.parameter)

def main(args):
    #
    # order of init:
    # - villages
    # - files
    # - movies
    #
    videos = defaultdict(list)
    for f in util.iter_files(args):
        obj = models.File(**attr.asdict(f))
        if obj.mime_type.startswith('video'):
            videos[slug(obj.name.split('.')[0])].append(obj)
        DBSession.add(obj)

    lexicon = list(util.iter_lexicon(args))
    villages = util.get_villages(args)
    ff_images = list(util.ff_images(args))
    bib = list(util.get_bib(args))
    data = Data()

    dataset = common.Dataset(
        id=dogonlanguages.__name__,
        name="Dogon and Bangime Linguistics",
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='dogonlanguages.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'}
    )
    DBSession.add(dataset)

    if Glottolog:
        if socket.gethostname() == 'dlt5502178l':
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog3', 'glottolog'))
        else:
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog'))
        languoids = {l.id: l for l in glottolog.languoids()}
    else:
        languoids = {}

    print('got glottolog')

    for c in util.CONTRIBUTORS:
        id_ = slug(c.name.split()[-1])
        data.add(models.Member, id_, id=id_, **attr.asdict(c))
    data.add(
        models.Member, 'forkel',
        id='forkel',
        name='Robert Forkel',
        email='*****@*****.**',
        in_project=False)

    for i, id_ in enumerate(['moran', 'forkel', 'heath']):
        DBSession.add(common.Editor(
            dataset=dataset, ord=i + 1, contributor=data['Member'][id_]))

    contrib = data.add(common.Contribution, 'd', id='d', name='Dogon Languages')
    for doc in bib:
        obj = data.add(
            models.Document,
            doc.rec.id,
            _obj=bibtex2source(doc.rec, cls=models.Document))
        keywords = nfilter([s.strip() for s in doc.rec.get('keywords', '').split(',')])
        for dt in 'grammar lexicon typology texts'.split():
            if dt in keywords:
                obj.doctype = dt
                break
        obj.project_doc = ('DLP' in keywords) or bool(doc.files)
        if obj.project_doc:
            for i, cid in enumerate(util.get_contributors(doc.rec, data)):
                models.DocumentContributor(
                    document=obj, contributor=data['Member'][cid], ord=i)
        for i, (path, cdstar) in enumerate(doc.files):
            common.Source_files(
                id='%s-%s' % (obj.id, i + 1),
                name=path,
                object=obj,
                mime_type=guess_type(path)[0],
                jsondata=cdstar,
            )

    print('got bib')

    for name, (gc, desc) in LANGUAGES.items():
        gl_lang = languoids[gc]
        lat, lon = gl_lang.latitude, gl_lang.longitude
        lang = data.add(
            models.Languoid, gc,
            id=gc,
            name=name,
            description=desc,
            latitude=lat,
            longitude=lon,
            family=gl_lang.family.name if gl_lang and gl_lang.family else name,
        )
        if name == 'Penange' and lang.longitude > 0:
            lang.longitude = -lang.longitude
        if name == 'Bankan Tey':
            lang.latitude, lang.longitude = 15.07, -2.91
        if name == 'Ben Tey':
            lang.latitude, lang.longitude = 14.85, -2.95
        if name == 'Togo Kan':
            lang.latitude, lang.longitude = 14.00, -3.25
        add_language_codes(data, lang, gl_lang.iso, glottocode=gc)

    villages_by_name = defaultdict(list)
    contrib_by_initial = {c.abbr: c for c in data['Member'].values()}
    for i, village in enumerate(villages):
        lang = None
        if village.glottocode:
            lang = data['Languoid'].get(village.glottocode)
            if not lang:
                gl_lang = languoids[village.glottocode]
                lang = data.add(
                    models.Languoid, gl_lang.id,
                    id=gl_lang.id,
                    name=gl_lang.name,
                    in_project=False,
                    family=gl_lang.family.name if gl_lang.family else gl_lang.name)
        v = data.add(
            models.Village, str(i + 1),
            id=str(i + 1),
            name=village.name,
            description=village.data.pop('social info'),
            surnames=village.data.pop('surnames'),
            major_city=village.data['MajorCity'] == 'Y',
            transcribed_name=village.data.pop('Transcribed Village Name'),
            source_of_coordinates=village.data.pop('sourceOfCoordinates'),
            latitude=village.lat,
            longitude=village.lon,
            languoid=lang,
            jsondata=village.data,
        )
        villages_by_name[village.name].append(v)
        for img in village.images:
            mimetype = guess_type(img.name)[0]
            if mimetype:
                f = models.Village_files(
                    id=img.id,
                    name=img.name,
                    description=img.description,
                    date_created=img.date,
                    latitude=img.coords[0] if img.coords else None,
                    longitude=-img.coords[1] if img.coords else None,
                    object=v,
                    mime_type=mimetype,
                    jsondata=img.cdstar,
                )
                for initial in img.creators:
                    if initial in contrib_by_initial:
                        models.Fotographer(
                            foto=f, contributor=contrib_by_initial[initial])

    for cat, desc, place, name in MOVIES:
        s = slug(name)
        m = models.Movie(
            id=s,
            name=desc,
            description=cat,
            place=place,
        )
        if place in villages_by_name and len(villages_by_name[place]) == 1:
            m.village = villages_by_name[place][0]
            #print('found village: %s' % name)
        for v in videos[s]:
            #print('found video: %s' % name)
            v.movie = m
            m.duration = v.duration

    names = defaultdict(int)
    for concept in lexicon:
        add(concept, data, names, contrib)

    count = set()
    for img in ff_images:
        if img.id in count:
            continue
        count.add(img.id)
        if img.ref:
            if img.ref in data['Concept']:
                concept = data['Concept'][img.ref]
                if img.tsammalex_taxon and not concept.tsammalex_taxon:
                    concept.tsammalex_taxon = img.tsammalex_taxon
                    #print(concept.tsammalex_taxon)
                common.Parameter_files(
                    object=concept,
                    id=img.id,
                    name=img.name.decode('utf8'),
                    mime_type=guess_type(img.name)[0],
                    jsondata=img.cdstar)
            else:
                print('missing ref: %s' % img.ref)

def id(self):
    res = self.get('ref')
    if not res:
        res = md5(slug(self.text + self.translation).encode('utf')).hexdigest()
        self.insert(0, ('ref', res))
    return res

def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()

    concept_list = Concepticon(CONCEPTICON_REPOS).conceptlist('Key-2016-1310')

    def concepticon_id(ids_code):
        for item in concept_list:
            if item['IDS_ID'] == ids_code:
                return int(item['CONCEPTICON_ID']) if item['CONCEPTICON_ID'] else None

    def read(table):
        fname = args.data_file(table + '.all.csv')
        if not fname.exists():
            fname = args.data_file(table + '.csv')
        return list(dsv.reader(fname, namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        published=date(2015, 5, 25),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License',
        },
        domain='ids.clld.org')

    DBSession.add(dataset)

    for rec in Database.from_file(args.data_file('sources.bib'), lowercase=True):
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()

    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # language lang
    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    iso_codes = {l.lg_id: iso_codes[l.sil_id] for l in read('x_lg_sil')}
    languages = []

    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang_changed = LANGS.get(int(l.lg_id), {})
        code = lang_changed.get('glotto') or lang_changed.get('iso') or iso_codes.get(l.lg_id)
        lang = data.add(
            models.IdsLanguage, l.lg_id,
            id=l.lg_id,
            name=lang_changed.get('name', l.lg_name))
        if code:
            languages.append((code, lang))
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id,
            name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

    iso2glotto = {}
    for l in walk_tree(tree=languoids_path('tree', GLOTTOLOG_REPOS)):
        if l.iso:
            iso2glotto[l.iso] = l.id

    load_families(
        Data(),
        [(iso2glotto.get(c, c), l) for c, l in languages],
        glottolog=Glottolog(GLOTTOLOG_REPOS),
        isolates_icon='tcccccc')

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        #name lg_id what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            assert int(l.what_did_id) in [4, 395]
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="University of California, Santa Barbara")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    #for i, name in enumerate(sorted(sources.keys())):
    #    c = data.add(common.Source, name, id=str(i + 1), name=name, description=name)

    DBSession.flush()
    for name, lgs in sources.items():
        for _src in name.split(';'):
            src = data['Source'].get(_src.strip())
            if not src:
                print('-- missing source --', _src)
                raise ValueError
            for lg in lgs:
                if lg in exclude:
                    continue
                assert lg in data['Dictionary']
                DBSession.add(common.ContributionReference(
                    contribution_pk=data['Dictionary'][lg].pk, source_pk=src.pk))

    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i,
                type='name',
                name=l.name,
                description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier,
                language=data['IdsLanguage'][l.lg_id]))

    # parameter chapter/entry
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {
            'id': id_,
            'name': name,
            'concepticon_id': concepticon_id(id_),
            'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in data[entity].keys()[:]:
            data[entity][k] = data[entity][k].pk

    synsets = set()
    counterparts = set()
    problems = defaultdict(list)

    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        language = common.Language.get(data['IdsLanguage'][lg_id])
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                continue
                #data.add(
                #    models.Entry, entry_id,
                #    id=entry_id,
                #    name=entry_id,
                #    concepticon_id=concepticon_id(entry_id),
                #    sub_code=l.entry_id,
                #    chapter_pk=data['Chapter'][l.chap_id])
                #DBSession.flush()
                #data['Entry'][entry_id] = data['Entry'][entry_id].pk

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))

            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                        #print('===', language.id, language.name)
                        #print(l.data_1)
                        #print(l.data_2)
                    # 83 cases of misaligned transcriptions
                    trans2 = None

            for i, word in enumerate(trans1):
                cid = id_ + '-' + str(i + 1 + len(vs.values))
                if cid not in counterparts:
                    v = models.Counterpart(
                        id=cid,
                        name=word,
                        description=desc.get('1'),
                        valueset=vs)
                    words[word].append((v, trans2[i] if trans2 else None))
                    counterparts.add(cid)
                else:
                    print(cid)
                    #12 - 420 - 811 - 3
                    #5 - 390 - 818 - 3
                    #2 - 930 - 819 - 3
                    #2 - 930 - 819 - 3
                    #3 - 120 - 819 - 3
                    #10 - 140 - 822 - 3
                    #9 - 160 - 825 - 3
                    #2 - 430 - 829 - 4

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(norm(w[1] or '', desc.get('2'), language.id)
                                for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                problems[(language.id, language.name)].append(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2')
            )
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)

        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)

    # about 250 cases where alternative transcriptions do not covary across meanings.
    for k, v in problems.items():
        print(k, len(v))

def main(args):
    data = Data()

    dataset = common.Dataset(
        id=dictionaria.__name__,
        name="Dictionaria",
        description="The Dictionary Journal",
        published=date(2015, 10, 1),
        contact='*****@*****.**',
        domain='dictionaria.clld.org',
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    ed = data.add(
        common.Contributor, 'hartmanniren', id='hartmanniren', name='Iren Hartmann')
    common.Editor(dataset=dataset, contributor=ed)
    DBSession.add(dataset)

    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))

    comparison_meanings = {}
    comparison_meanings_alt_labels = {}

    print('loading concepts ...')
    concepticon = Concepticon()
    for i, concept_set in enumerate(concepticon.resources('parameter').members):
        concept_set = concepticon.resource(concept_set)
        cm = ComparisonMeaning(
            id=concept_set.id,
            name=concept_set.name.lower(),
            description=concept_set.description,
            concepticon_url='%s' % concept_set.uriref)
        DBSession.add(cm)
        comparison_meanings[cm.name] = cm
        for label in concept_set.alt_labels:
            comparison_meanings_alt_labels.setdefault(label.lower(), cm)
    DBSession.flush()
    print('... done')

    comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()}
    comparison_meanings_alt_labels = {
        k: v.pk for k, v in comparison_meanings_alt_labels.items()}

    submissions = []

    for submission in REPOS.joinpath('submissions').glob('*'):
        if not submission.is_dir():
            continue

        try:
            submission = Submission(submission)
        except ValueError:
            continue

        md = submission.md
        id_ = submission.id
        lmd = md['language']

        language = data['Variety'].get(lmd['glottocode'])
        if not language:
            language = data.add(
                Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name'])

        dictionary = data.add(
            Dictionary, id_,
            id=id_,
            name=lmd['name'] + ' Dictionary',
            language=language,
            published=date(*map(int, md['published'].split('-'))))

        for i, cname in enumerate(md['authors']):
            name = HumanName(cname)
            cid = slug('%s%s' % (name.last, name.first))
            contrib = data['Contributor'].get(cid)
            if not contrib:
                contrib = data.add(common.Contributor, cid, id=cid, name=cname)
            DBSession.add(common.ContributionContributor(
                ord=i + 1,
                primary=True,
                contributor=contrib,
                contribution=dictionary))

        submissions.append((dictionary.id, language.id, submission))
    transaction.commit()

    for did, lid, submission in submissions:
        try:
            mod = __import__(
                'dictionaria.loader.' + submission.id, fromlist=['MARKER_MAP'])
            marker_map = mod.MARKER_MAP
        except ImportError:
            marker_map = {}

        transaction.begin()
        print('loading %s ...' % submission.id)
        submission.load(
            did, lid, comparison_meanings, comparison_meanings_alt_labels, marker_map)
        transaction.commit()
        print('... done')

    #('hoocak', 'Hooca\u0328k', 43.5, -88.5, [('hartmanniren', 'Iren Hartmann')]),
    #('yakkha', 'Yakkha', 27.37, 87.93, [('schackowdiana', 'Diana Schackow')]),
    #('palula', 'Palula', 35.51, 71.84, [('liljegrenhenrik', 'Henrik Liljegren')], {}),
    #('daakaka', 'Daakaka', -16.27, 168.01, [('vonprincekilu', 'Kilu von Prince')],
    # {'published': date(2015, 9, 30), 'iso': 'bpa', 'glottocode': 'daka1243'}),
    #('teop', 'Teop', -5.67, 154.97, [('moselulrike', 'Ulrike Mosel')],
    # {'published': date(2015, 9, 30), 'iso': 'tio', 'glottocode': 'teop1238', 'encoding': 'latin1'}),

    transaction.begin()
    load_families(Data(), DBSession.query(Variety))

def add_cultural_data(questionnaire_file_name, parameters, language):
    """ Parse the typological questionnaire into the database """
    contribution_text, parameter_descriptions, answers = parse_culture_questionnaire(
        os.path.join(DBPATH, questionnaire_file_name))

    # All ValueSets must be related to a contribution, so generate one from the metadata.
    contrib = common.Contribution(id='contrib'+newid(), name=contribution_text+newid())

    for p, parameter in parameter_descriptions.iterrows():
        # First, make sure that this parameter exists – either look it up or create it.
        pid = p.replace(".", "-")
        try:
            param, domain = parameters[pid]
        except KeyError:
            param = common.Parameter(
                id='culture'+pid,
                name=p,
                description=parameter['Question_text_English'],
                markup_description=parameter['Question_text_English'])
            domain = {}
            parameters[pid] = (param, domain)

        # Secondly, check whether we are aware that this answer is
        # valid already – otherwise we add its value to the domain,
        # and use that.
        # Note: Once we have a database, we can do better filtering
        # and constraining, and don't need to rely on reasonable data.
        answer = str(answers["Answer"][p])
        try:
            domain_element = domain[slug(answer)]
        except KeyError:
            try:
                numerical_value = int(answer)
            except ValueError:
                numerical_value = (1 if answer == "Y" or answer == 'True' else
                                   0 if answer == "N" or answer == 'False' else
                                   None)
            domain_element = common.DomainElement(
                id=param.id+slug(answer),
                description=answer,
                number=numerical_value,
                name=answer,
                parameter=param,
                abbr=answer,
                jsondata={'color': color(numerical_value)})
            DBSession.add(domain_element)
            try:
                DBSession.flush()
            except:
                print(domain, domain_element, language.name, pid, param.name)
            domain[slug(answer)] = domain_element

        # Now create the ValueSet, representing all values the
        # language has for this parameter
        vs = common.ValueSet(
            id='vs'+newid(),
            language=language,
            parameter=param,
            jsondata=domain_element.jsondata,
            contribution=contrib)

        # and fill in the actual values, which in this case is only
        # one. This object, and all objects it depends on, are then
        # scheduled for writing into the database.
        DBSession.add(common.Value(
            id='v'+newid(),
            valueset=vs,
            frequency=float(100),
            jsondata=domain_element.jsondata,
            domainelement=domain_element))

    # Execute all scheduled database updates.
    DBSession.flush()

def parse_culture_questionnaire(filename):
    questionnaire = pandas.ExcelFile(filename)

    metadata_sheet_name = 'Metadata'
    if metadata_sheet_name in questionnaire.sheet_names:
        metadata_sheet = questionnaire.parse(metadata_sheet_name, header=None)
        try:
            metadata_blob = "; ".join(map(str, metadata_sheet.values))
        except KeyError:
            metadata_blob = ""
    else:
        metadata_blob = ""
    metadata = metadata_blob

    weird_rows = set()
    answers = pandas.DataFrame(columns=["Answer", "Original Answer", "Notes"])
    features = pandas.DataFrame(columns=[
        "Domain", "Question_text_English", "Question_text_Indonesian", "Question_notes"])

    try:
        data_sheet = questionnaire.parse(
            0, skiprows=[0, 1, 2, 3, 4]).dropna('columns', 'all')
    except XLRDError:
        raise UnexpectedCultureFormatError(
            "Culture sheet did not have a 'Questionnaire' sheet")

    if "answer" in data_sheet.columns[5].lower():
        cols = list(data_sheet.columns)
        cols[5] = "Original Answer"
        data_sheet.columns = cols
    else:
        raise UnexpectedCultureFormatError(
            "Expected Excel column F to have 'answer' in its header.")

    for i, row in data_sheet.iterrows():
        if pandas.isnull(row["Q_ID"]):
            # print(row)
            continue
        id = str(row["Q_ID"]).lower()
        if pandas.isnull(row[5]):
            # print(row)
            continue
        else:
            question = row[features.columns]
            question.name = id
            features = features.append(question)

            answer = row[answers.columns]
            if pandas.isnull(answer['Original Answer']):
                answer['Answer'] = None
            elif type(answer['Original Answer']) == int:
                answer['Answer'] = answer['Original Answer']
            elif slug(answer['Original Answer']) == 'yes':
                answer['Answer'] = True
            elif slug(answer['Original Answer']) == 'no':
                answer['Answer'] = False
            elif slug(answer['Original Answer']) == 'na':
                answer['Answer'] = None
            else:
                answer['Answer'] = answer['Original Answer']
            answer.name = id
            answers = answers.append(answer)

    assert(len(features) == len(answers))
    return metadata, features, answers

def main(args):
    data = Data()
    data_path = lambda *cs: args.data_file('concepticon-data', 'concepticondata', *cs)

    dataset = common.Dataset(
        id=concepticon.__name__,
        name="Concepticon 1.0",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='concepticon.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, name in enumerate(['Johann-Mattis List', 'Michael Cysouw', 'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    english = data.add(
        common.Language, 'eng', id='eng', name='English')

    files = {}
    for fname in data_path('sources').iterdir():
        files[fname.stem] = \
            "https://github.com/clld/concepticon-data/blob/master/concepticondata/sources/%s" % fname.name

    for rec in Database.from_file(
            data_path('references', 'references.bib'), lowercase=True):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))
        if rec.id in files:
            DBSession.flush()
            DBSession.add(common.Source_files(
                mime_type='application/pdf',
                object_pk=source.pk,
                jsondata=dict(url=files[rec.id])))

    for concept in reader(data_path('concepticon.tsv'), namedtuples=True):
        data.add(
            models.ConceptSet,
            concept.ID,
            id=concept.ID,
            name=concept.GLOSS,
            description=concept.DEFINITION,
            semanticfield=concept.SEMANTICFIELD,
            ontological_category=concept.ONTOLOGICAL_CATEGORY)

    for rel in reader(data_path('conceptrelations.tsv'), namedtuples=True):
        DBSession.add(models.Relation(
            source=data['ConceptSet'][rel.SOURCE],
            target=data['ConceptSet'][rel.TARGET],
            description=rel.RELATION))

    unmapped = Counter()
    number_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)')

    for cl in reader(data_path('conceptlists.tsv'), dicts=True):
        concepts = data_path('conceptlists', '%(ID)s.tsv' % cl)
        if not concepts.exists():
            continue
        langs = [l.lower() for l in split(cl['SOURCE_LANGUAGE'])]
        conceptlist = data.add(
            models.Conceptlist,
            cl['ID'],
            id=cl['ID'],
            name=' '.join(cl['ID'].split('-')),
            description=cl['NOTE'],
            target_languages=cl['TARGET_LANGUAGE'],
            source_languages=' '.join(langs),
            year=int(cl['YEAR']) if cl['YEAR'] else None,
        )
        for id_ in split(cl['REFS']):
            common.ContributionReference(
                source=data['Source'][id_], contribution=conceptlist)
        for i, name in enumerate(split(cl['AUTHOR'], sep=' and ')):
            name = strip_braces(name)
            contrib = data['Contributor'].get(name)
            if not contrib:
                contrib = data.add(
                    common.Contributor, name, id=slug(name), name=name)
            DBSession.add(common.ContributionContributor(
                ord=i, contribution=conceptlist, contributor=contrib))
        for k in 'ID NOTE TARGET_LANGUAGE SOURCE_LANGUAGE YEAR REFS AUTHOR'.split():
            del cl[k]
        DBSession.flush()
        for k, v in cl.items():
            DBSession.add(common.Contribution_data(
                object_pk=conceptlist.pk, key=k, value=v))

        for concept in reader(concepts, namedtuples=True):
            if not concept.ID or not concept.CONCEPTICON_ID or concept.CONCEPTICON_ID == 'NAN':
                #print conceptlist.id, getattr(concept, 'ENGLISH', getattr(concept, 'GLOSS', None))
                unmapped.update([conceptlist.id])
                continue

            lgs = {}
            for lang in langs:
                v = getattr(concept, lang.upper())
                if v:
                    lgs[lang] = v

            match = number_pattern.match(concept.NUMBER)
            if not match:
                print(concept.ID)
                raise ValueError
            vs = common.ValueSet(
                id=concept.ID,
                description=getattr(concept, 'GLOSS', getattr(concept, 'ENGLISH', None)),
                language=english,
                contribution=conceptlist,
                parameter=data['ConceptSet'][concept.CONCEPTICON_ID])
            d = {}
            for key, value in concept.__dict__.items():
                if not key.startswith('CONCEPTICON_') and \
                        key not in ['NUMBER', 'ID', 'GLOSS'] + [l.upper() for l in langs]:
                    d[key.lower()] = value
            v = models.Concept(
                id=concept.ID,
                valueset=vs,
                description=getattr(concept, 'GLOSS', None),  # our own gloss, if available
                name='; '.join('%s [%s]' % (lgs[l], l) for l in sorted(lgs.keys())),
                number=int(match.group('number')),
                number_suffix=match.group('suffix'),
                jsondata=d)
            DBSession.flush()
            for key, value in lgs.items():
                DBSession.add(
                    common.Value_data(key='lang_' + key, value=value, object_pk=v.pk))

    print('Unmapped concepts:')
    for clid, no in unmapped.most_common():
        print(clid, no)

    for fname in data_path('concept_set_meta').iterdir():
        if fname.suffix == '.tsv':
            md = load(fname.parent.joinpath(fname.name + '-metadata.json'))
            provider = models.MetaProvider(
                id=fname.stem,
                name=md['dc:title'],
                description=md['dc:description'],
                url=md['dc:source'],
                jsondata=md)
            for meta in reader(fname, dicts=True):
                try:
                    for k, v in meta.items():
                        if v and k != 'CONCEPTICON_ID':
                            models.ConceptSetMeta(
                                metaprovider=provider,
                                conceptset=data['ConceptSet'][meta['CONCEPTICON_ID']],
                                key=k,
                                value=v)
                except:
                    print(fname)
                    print(meta)
                    raise

def cmd_makecldf(self, args):
    wl = lingpy.Wordlist(
        self.raw_dir.joinpath("bodt-khobwa-cleaned.tsv").as_posix(),
        conf=self.raw_dir.joinpath("wordlist.rc").as_posix(),
    )
    args.writer.add_sources()

    concept_lookup = {}
    for concept in self.conceptlists[0].concepts.values():
        idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
        args.writer.add_concept(
            ID=idx,
            Name=concept.english,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        concept_lookup[concept.english] = idx

    args.writer.add_languages(lookup_factory="Name")

    mapper = {
        "pʰl": "pʰ l", "pɕ": "p ɕ", "pɕ~ɕ/pɕ": "p ɕ", "aw": "au", "ɛj": "ɛi",
        "ɔw": "ɔu", "bl": "b l", "aj": "ai", "ɔj": "ɔi", "(ŋ)": "ŋ",
        "kʰl": "kʰ l", "ej": "ei", "uj": "ui", "bɹ": "b ɹ", "ɐʰ": "ɐʰ/ɐ",
        "hw": "h w", "ɔːʰ": "ɔːʰ/ɔː", "dʑr": "dʑ r", "ow": "ou", "pl": "p l",
        "lj": "l j", "tʰj": "tʰ j", "aːʰ": "aːʰ/aː", "bj": "b j", "mp": "m p",
        "pɹ": "p ɹ", "ɐ̃ʰ": "ɐ̃ʰ/ɐ̃", "ɔ̃ʰ": "ɔ̃ʰ/ɔ̃", "aj~e/aj": "aj~e/ai",
        "aj~ej/ej": "aj~ej/ei", "kl": "k l", "kʰɹ": "kʰ ɹ", "ɛːʰ": "ɛːʰ/ɛː",
        "ɔʰ": "ɔʰ/ɔ", "tɹ": "t ɹ", "ɐːʰ": "ɐːʰ/ɐ", "br": "b r", "kɹ": "k ɹ",
        "kʰj": "kʰ j", "kʰr": "kʰ r", "gɹ": "g ɹ", "hj": "h j",
        "bl~gl/bl": "bl~gl/b l", "dj": "d j", "ej~i/ej": "ej~i/ei",
        "e~a/ej": "e~a/ei", "fl": "f l", "kʰw": "kʰ w", "mj": "m j", "pr": "p r",
        "pʰl~bl/pʰl": "pʰl~bl/pʰ l", "pʰr": "pʰ r", "pʰr~pʰl/pʰr": "pʰr~pʰl/pʰ r",
        "pʰw": "pʰ w", "pʰɹ": "pʰ ɹ", "tr": "t r", "tɕʰɹ": "tɕʰ ɹ",
        "tʰr": "tʰ r", "tʰw": "tʰ w", "dɾ": "d ɾ", "tɾ": "t ɾ", "zj": "z j",
        "ɔj~uj/uj": "ɔj~uj/ui",
    }

    for idx in pylexibank.progressbar(wl, desc="cldfify"):
        segments = " ".join([mapper.get(x, x) for x in wl[idx, "tokens"]])
        concept = concept_lookup.get(wl[idx, "concept"], "")
        lex = args.writer.add_form_with_segments(
            Language_ID=wl[idx, "doculect"],
            Parameter_ID=concept,
            Value=wl[idx, "form"],
            Form=wl[idx, "form"],
            Segments=segments.split(),
            Partial_Cognacy=" ".join([str(x) for x in wl[idx, "crossids"]]),
            Source=["Bodt2019"],
        )
        for morpheme_index, cogid in enumerate(wl[idx, "crossids"]):
            alignment = wl[idx, "alignment"].split(" + ")[morpheme_index].split()
            alignment = " ".join([mapper.get(x, x) for x in alignment]).split()
            if int(cogid):
                args.writer.add_cognate(
                    lexeme=lex,
                    Cognateset_ID=cogid,
                    Morpheme_Index=morpheme_index + 1,
                    Alignment=alignment,
                )

def contributor_id(self):
    return slug(self.reviewer.last + self.reviewer.first)

def id(self):
    return '{0}-{1}-{2}-{3}'.format(
        self.contributor_id, slug(self.doi), self.experiment_number, self.species_id)

def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == "cleanup":
        for fname in args.data_file("gbs").glob("*.json"):
            try:
                data = jsonlib.load(fname)
                if data.get("totalItems") == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source).order_by(common.Source.id).options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file("gbs", "source%s.json" % source.id)

        if command == "update":
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ["verify", "update"]:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn("no JSON object found in: %s" % filepath)
                    continue
                if not data["totalItems"]:
                    continue
                item = data["items"][0]
            else:
                continue

        if command == "verify":
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item["volumeInfo"].get("publishedDate", "").split("-")[0]
            if not year or year != slug(source.year or ""):
                needs_check = True
            twords = words(stitle)
            iwords = words(item["volumeInfo"]["title"] + " " + item["volumeInfo"].get("subtitle", ""))
            if (
                twords == iwords
                or (len(iwords) > 2 and iwords.issubset(twords))
                or (len(twords) > 2 and twords.issubset(iwords))
            ):
                needs_check = False
            if int(source.id) == 241:
                log.info("%s" % sorted(words(stitle)))
                log.info("%s" % sorted(iwords))
            if needs_check:
                log.info("------- %s -> %s" % (source.id, item["volumeInfo"].get("industryIdentifiers")))
                log.info("%s %s" % (item["volumeInfo"]["title"], item["volumeInfo"].get("subtitle", "")))
                log.info(stitle)
                log.info(item["volumeInfo"].get("publishedDate"))
                log.info(source.year)
                log.info(item["volumeInfo"].get("authors"))
                log.info(source.author)
                log.info(item["volumeInfo"].get("publisher"))
                log.info(source.publisher)
                if not confirm("Are the records the same?"):
                    log.warn("---- removing ----")
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == "update":
            source.google_book_search_id = item["id"]
            source.update_jsondata(gbs=item)
            count += 1
        elif command == "download":
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    "inauthor:" + quote_plus(source.author.encode("utf8")),
                    "intitle:" + quote_plus(title.encode("utf8")),
                ]
                if source.publisher:
                    q.append("inpublisher:" + quote_plus(source.publisher.encode("utf8")))
                url = api_url + "q=%s&key=%s" % ("+".join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={"accept": "application/json"})
                log.info("%s - %s" % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), "w") as fp:
                        fp.write(r.text.encode("utf8"))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break

    if command == "update":
        log.info("assigned gbs ids for %s out of %s sources" % (count, i))
    elif command == "download":
        log.info("queried gbs for %s sources" % count)

def ia_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    else:
        if callable(sources):
            sources = sources()

    i = 0
    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('ia', 'source%s.json' % source.id)

        if command in ['verify', 'update']:
            if filepath.exists():
                with open(filepath) as fp:
                    try:
                        data = json.load(fp)
                    except ValueError:
                        continue
                if not data['response']['numFound']:
                    continue
                item = data['response']['docs'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = text_type(item.get('year', ''))
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(item['title'])
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if needs_check:
                log.info('------- %s -> %s' % (source.id, item['identifier']))
                log.info(item['title'])
                log.info(stitle)
                log.info(item.get('year'))
                log.info(source.year)
                log.info(item['creator'])
                log.info(source.author)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    with open(filepath, 'w') as fp:
                        json.dump({"response": {'numFound': 0}}, fp)
        elif command == 'update':
            source.update_jsondata(internetarchive_id=item['identifier'])
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = quote_plus(b'creator:"%s" AND title:"%s"' % (
                    source.author.split(',')[0].encode('utf8'), title.encode('utf8')))
                count += 1
                r = requests.get(API_URL + q, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, r.url))
                if r.status_code == 200:
                    with open(filepath, 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break

    if command == 'update':
        log.info('assigned internet archive identifiers for %s out of %s sources' % (count, i))
    elif command == 'download':
        log.info('queried internet archive for %s sources' % count)

def test_slug():
    from clldutils.misc import slug
    assert slug('A B. \xe4C') == 'abac'

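# A hedged extension of the test above, illustrating the keyword arguments used
# throughout these examples. The expected values assume the usual clldutils
# behaviour (strip diacritics and punctuation, normalize whitespace); they are
# illustrative rather than taken from the clldutils test suite.
def test_slug_keywords():
    from clldutils.misc import slug
    assert slug('A B. \xe4C', lowercase=False) == 'ABaC'            # case preserved
    assert slug('A B. \xe4C', remove_whitespace=False) == 'a b ac'  # spaces kept
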
def main(args):
    fts.index('fts_index', Word.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Unit.name)).create(DBSession.bind)
    data = Data()

    dataset = common.Dataset(
        id=dictionaria.__name__,
        name="Dictionaria",
        description="The Dictionary Journal",
        published=date(2017, 3, 30),
        contact='*****@*****.**',
        domain='dictionaria.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    for i, (id_, name) in enumerate([
        ('haspelmathmartin', 'Martin Haspelmath'),
        ('stiebelsbarbara', 'Barbara Stiebels')
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))

    comparison_meanings = {}

    print('loading concepts ...')
    glosses = set()
    concepticon = Concepticon(
        REPOS.joinpath('..', '..', 'concepticon', 'concepticon-data'))
    if not args.no_concepts:
        for conceptset in concepticon.conceptsets.values():
            if conceptset.gloss in glosses:
                continue
            glosses.add(conceptset.gloss)
            cm = data.add(
                ComparisonMeaning,
                conceptset.id,
                id=conceptset.id,
                name=conceptset.gloss.lower(),
                description=conceptset.definition,
                concepticon_url='http://concepticon.clld.org/parameters/%s' % conceptset.id)
            comparison_meanings[cm.id] = cm

    DBSession.flush()
    print('... done')

    comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()}
    submissions = []

    for submission in REPOS.joinpath(
            'submissions-internal' if args.internal else 'submissions').glob('*'):
        if not submission.is_dir():
            continue

        try:
            submission = Submission(submission)
        except ValueError:
            continue

        md = submission.md
        if md is None:
            print('no md', submission.id)
            continue

        if not md['date_published']:
            print('no date', submission.id)
            continue

        id_ = submission.id
        if args.dict and args.dict != id_ and args.dict != 'all':
            print('not selected', submission.id)
            continue

        lmd = md['language']
        props = md.get('properties', {})
        props.setdefault('custom_fields', [])
        props['metalanguage_styles'] = {}
        for v, s in zip(props.get('metalanguages', {}).values(),
                        ['success', 'info', 'warning', 'important']):
            props['metalanguage_styles'][v] = s
        props['custom_fields'] = ['lang-' + f if f in props['metalanguage_styles'] else f
                                  for f in props['custom_fields']]
        props.setdefault('choices', {})

        language = data['Variety'].get(lmd['glottocode'])
        if not language:
            language = data.add(
                Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name'])

        md['date_published'] = md['date_published'] or date.today().isoformat()
        if '-' not in md['date_published']:
            md['date_published'] = md['date_published'] + '-01-01'
        dictionary = data.add(
            Dictionary,
            id_,
            id=id_,
            number=md.get('number'),
            name=props.get('title', lmd['name'] + ' dictionary'),
            description=submission.description,
            language=language,
            published=date(*map(int, md['date_published'].split('-'))),
            doi=md.get('doi'),
            jsondata=props)

        for i, spec in enumerate(md['authors']):
            if not isinstance(spec, dict):
                cname, address = spec, None
                spec = {}
            else:
                cname, address = spec['name'], spec.get('affiliation')
            name = HumanName(cname)
            cid = slug('%s%s' % (name.last, name.first))
            contrib = data['Contributor'].get(cid)
            if not contrib:
                contrib = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=cname,
                    address=address,
                    url=spec.get('url'),
                    email=spec.get('email'))
            DBSession.add(common.ContributionContributor(
                ord=i + 1,
                primary=spec.get('primary', True),
                contributor=contrib,
                contribution=dictionary))

        submissions.append((dictionary.id, language.id, submission))
    transaction.commit()

    for did, lid, submission in submissions:
        transaction.begin()
        print('loading %s ...' % submission.id)
        dictdata = Data()
        lang = Variety.get(lid)
        submission.load_sources(Dictionary.get(did), dictdata)
        submission.load_examples(Dictionary.get(did), dictdata, lang)
        submission.dictionary.load(
            submission,
            dictdata,
            Dictionary.get(did),
            lang,
            comparison_meanings,
            OrderedDict(submission.md.get('properties', {}).get('labels', [])))
        transaction.commit()
        print('... done')

    transaction.begin()
    load_families(
        Data(),
        [v for v in DBSession.query(Variety) if re.match('[a-z]{4}[0-9]{4}', v.id)],
        glottolog_repos='../../glottolog/glottolog')

def slug(s, escape=False, **kw):
    # That's some weird stuff coming in with ElCat alternative names ...
    return misc.slug(
        ''.join(hex(ord(c)) if escape else c for c in s if ord(c) != 2), **kw)

def species_id(self):
    return slug(self.species_latin)

def from_name(cls, name, dry_run=False, repos=None):
    alpha = slug(text_type(name))[:4]
    assert alpha
    while len(alpha) < 4:
        alpha += alpha[-1]
    return Glottocodes(repos=repos).new(alpha, dry_run=dry_run)

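# Illustration of the four-letter prefix built above, before Glottocodes.new()
# appends a running number (slug() behaviour assumed as in clldutils; values
# are illustrative):
#
#     slug('Ewe')        -> 'ewe'       -> padded to 'ewee'
#     slug('Tiema Cewe') -> 'tiemacewe' -> truncated to 'tiem'
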
def import_dataset(path, data, icons, add_missing_features=False):
    # look for metadata
    # look for sources
    # then loop over values

    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    try:
        contrib = CulturebankContribution(
            id=basename, name=basename, desc=glottolog.languoid(basename).name)
    except:
        print("Basename {:s} did not match a glottolog languoid, skipped.".format(basename))
        return

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor, contributor_id, id=contributor_id, name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    for i, row in pandas.io.parsers.read_csv(
            path,
            sep=',' if 'c' in ext else '\t',
            encoding='utf-16').iterrows():
        if pandas.isnull(row['Value']) or pandas.isnull(row['Feature_ID']):
            print("Expected columns not found: ", row)
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            if add_missing_features:
                parameter = data.add(
                    Feature, row['Feature_ID'],
                    id=row['Feature_ID'],
                    name=row.get('Feature', row['Feature_ID']))
            else:
                print(('skip value for invalid feature %s' % row['Feature_ID']))
                continue

        language = data['CulturebankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            try:
                languoid = glottolog.languoid(row['Language_ID'])
            except AttributeError:
                print(('Skipping, no Glottocode found for %s' % row['Language_ID']))
                continue
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']

            language = data.add(
                CulturebankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        domain = {de.abbr: de for de in parameter.domain}
        name = row['Value']
        if name in domain:
            name = domain[name].name
        else:
            name = str(name)
            if name in domain:
                name = domain[name].name
            else:
                raise ValueError(
                    "For feature {:s} in language {:s}: Name {:s} not found among domain values {:}".format(
                        row['Language_ID'], row['Feature_ID'], name,
                        {d: de for d, de in domain.items()}))

        data.add(
            Value, vid,
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))
        print(".", end="")

        if vs.source is not None:
            for key, src in list(data['Source'].items()):
                if key in vs.source:
                    ValueSetReference(valueset=vs, source=src, key=key)

def fname(self, suffix=''):
    return '%s.%s%s' % (slug(self.name), self.id, suffix)

def cmd_makecldf(self, args):
    args.writer.add_sources()
    languages = args.writer.add_languages(lookup_factory="Token")
    concepts = args.writer.add_concepts(
        id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
        lookup_factory="Name")

    seen = []
    for f in ("AP_lexicon_coded.txt", "AP_lexicon.txt"):
        for row in self.raw_dir.read_csv(f, dicts=True, delimiter="\t"):
            concept = row["English"].lower().strip().replace(", ", "/")

            # skip rows in AP_lexicon.txt that we've already seen
            # in AP_lexicon_coded.txt.
            # Note that there are duplicate rows across both files, and *within*
            # the same files, so this handles that too.
            if concept in seen:
                continue

            # manually catch "chase away". There are two glosses for this:
            #   367 "chase away"
            #   368 "chase away, expel"
            # ...-> 367 looks only partially complete and 368 contains all
            # forms in 367, so ignore 367.
            if row["English"] == "chase away":
                continue

            if row["English"]:
                # store lexicon IDs for the cognate row.
                lexicon_ids = {}
                for lang in languages:
                    assert concept in concepts, "bad concept %s" % concept
                    value = row[lang]

                    # preprocess value
                    # remove the reconstruction mark (for proto-AP)
                    value = value.lstrip("*")
                    # remove leading & trailing spaces
                    value = value.strip().lstrip()

                    # if the stripped form starts and ends with a slash,
                    # it is a leftover from a transcription, let's clean
                    # it (it could be done with the orthographic profile,
                    # but this could hide errors in parsing multiple
                    # forms, and in any case this is more adequate as we
                    # get the correct value)
                    if value.startswith("/") and value.endswith("/"):
                        value = value[1:-1]

                    lex = args.writer.add_forms_from_value(
                        Language_ID=languages[lang],
                        Parameter_ID=concepts[concept],
                        Value=value,
                        Source=["Robinson2012"],
                    )
                    if len(lex) >= 1:
                        # it looks like only the first lexemes of combined
                        # forms have cognates, so only add the first one.
                        lexicon_ids[lang] = lex[0]
                seen.append(concept)
            else:
                # cognates...
                lastword = seen[-1]  # find the last word..
                for lang in languages:
                    # find lexical ids belonging to this language & gloss.
                    lex = lexicon_ids.get(lang)
                    if lex and row[lang]:
                        if int(row[lang]) not in range(0, 12):
                            raise ValueError("Invalid cognate id: %s" % row[lang])
                        args.writer.add_cognate(
                            lexeme=lex,
                            Cognateset_ID="%s-%s" % (concepts[lastword], row[lang]),
                            Source=["Robinson2012"],
                        )

def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == 'cleanup':
        for fname in args.data_file('gbs').glob('*.json'):
            try:
                data = jsonlib.load(fname)
                if data.get('totalItems') == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('gbs', 'source%s.json' % source.id)

        if command == 'update':
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ['verify', 'update']:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn('no JSON object found in: %s' % filepath)
                    continue
                if not data['totalItems']:
                    continue
                item = data['items'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item['volumeInfo'].get('publishedDate', '').split('-')[0]
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(
                item['volumeInfo']['title'] + ' '
                + item['volumeInfo'].get('subtitle', ''))
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if int(source.id) == 241:
                log.info('%s' % sorted(words(stitle)))
                log.info('%s' % sorted(iwords))
            if needs_check:
                log.info('------- %s -> %s' % (
                    source.id, item['volumeInfo'].get('industryIdentifiers')))
                log.info('%s %s' % (
                    item['volumeInfo']['title'], item['volumeInfo'].get('subtitle', '')))
                log.info(stitle)
                log.info(item['volumeInfo'].get('publishedDate'))
                log.info(source.year)
                log.info(item['volumeInfo'].get('authors'))
                log.info(source.author)
                log.info(item['volumeInfo'].get('publisher'))
                log.info(source.publisher)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == 'update':
            source.google_book_search_id = item['id']
            source.update_jsondata(gbs=item)
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    'inauthor:' + quote_plus(source.author.encode('utf8')),
                    'intitle:' + quote_plus(title.encode('utf8')),
                ]
                if source.publisher:
                    q.append('inpublisher:' + quote_plus(
                        source.publisher.encode('utf8')))
                url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break

    if command == 'update':
        log.info('assigned gbs ids for %s out of %s sources' % (count, i))
    elif command == 'download':
        log.info('queried gbs for %s sources' % count)