def update_providers(args):
    if not args.data_file(args.version, 'provider.txt').exists():
        return

    with open(args.data_file(args.version, 'provider.txt')) as fp:
        content = fp.read().decode('latin1')

    if '\r\n' in content:
        content = content.replace('\r\n', '\n')

    provider_map = get_map(Provider)
    for block in content.split('\n\n\n\n'):
        lines = block.split('\n')
        id_, abbr = lines[0].strip().split(':')
        id_ = id_.split('.')[0]
        description = unescape('\n'.join(lines[1:]))
        name = description.split('.')[0]
        if id_ == 'hedvig-tirailleur':
            id_ = u'skirgard'
        if slug(id_) not in provider_map:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(
                Provider(id=slug(id_), name=name, description=description, abbr=abbr))
def update_providers(args, verbose=False):
    filepath = args.data_dir.joinpath('references', 'bibtex', 'BIBFILES.ini')
    p = RawConfigParser()
    with io.open(filepath, encoding='utf-8-sig') as fp:
        p.readfp(fp)

    provider_map = get_map(Provider)
    for section in p.sections():
        sectname = section[:-4] if section.endswith('.bib') else section
        id_ = slug(sectname)
        attrs = {
            'name': p.get(section, 'title'),
            'description': p.get(section, 'description'),
            'abbr': p.get(section, 'abbr'),
        }
        if id_ in provider_map:
            provider = provider_map[id_]
            for a in list(attrs):
                before, after = getattr(provider, a), attrs[a]
                if before == after:
                    del attrs[a]
                else:
                    setattr(provider, a, after)
                    attrs[a] = (before, after)
            if attrs:
                args.log.info('updating provider %s %s' % (slug(id_), sorted(attrs)))
                if verbose:
                    for a, (before, after) in attrs.items():
                        before, after = (' '.join(_.split()) for _ in (before, after))
                        if before != after:
                            args.log.info('%s\n%r\n%r' % (a, before, after))
        else:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(Provider(id=id_, **attrs))
def main(args, reload=False):
    species = {}
    db = args.data_file('theplantlist', 'db.json')

    if reload:
        for a in bs(get('/1.1/browse/-/')).find('ul', id='nametree').find_all('a'):
            with iopen(
                    args.data_file('theplantlist', a.text + '.csv'),
                    'w', encoding='utf8') as fp:
                fp.write(get(a['href'] + a.text + '.csv'))

    if db.exists():
        with open(db) as fp:
            species = json.load(fp)
    else:
        for p in args.data_file('theplantlist').files('*.csv'):
            for row in reader(p, namedtuples=True, delimiter=','):
                if row.Taxonomic_status_in_TPL == 'Accepted':
                    id_ = slug(row.Genus + row.Species)
                    species[id_] = row.ID
        with open(db, 'w') as fp:
            json.dump(species, fp)

    with transaction.manager:
        found = 0
        for p in DBSession.query(Parameter):
            id_ = slug(p.name)
            if id_ in species:
                found += 1
                p.tpl_id = species[id_]
        print(found)
def reflink(name, mm, bib):
    name = name.replace('&', 'and')
    name = slug(name)
    name = authors_map.get(name, name)
    if name == 'puscariu' and mm.group('year') == '1943':
        name = 'puscariuandkuen'
    if name == 'mohling' and mm.group('year') == '1986':
        name = 'mohlig'
    if name == 'baht' and mm.group('year') == '1987':
        name = 'bhat'

    if (name, mm.group('year') + (mm.group('letter') or '')) not in bib:
        if (name, mm.group('year')) not in bib:
            print('###', (name, mm.group('year') + (mm.group('letter') or '')))
            return '%s%s%s' % (
                mm.group('year'),
                mm.group('letter') or '',
                ': ' + mm.group('pages') if mm.group('pages') else '')
        else:
            recid = bib[(name, mm.group('year'))]
    else:
        recid = bib[(name, mm.group('year') + (mm.group('letter') or ''))]

    global LINKED
    LINKED[recid] = 1
    return '<a href="__%s__">%s%s</a>%s' % (
        slug(recid),
        mm.group('year'),
        mm.group('letter') or '',
        ': ' + mm.group('pages') if mm.group('pages') else '')
def get_ref(self, e, category=None):
    for f in e.find_all('font'):
        f.unwrap()
    t = text(e)
    ref = self.refs.get(slug(t))
    if ref:
        return dict(
            key=ref.name,
            id=slug(t),
            text='%s. %s.' % (ref.name, ref.description),
            html=u'<a href="/sources/{0.id}">{0.name}</a>. {0.description}.'.format(ref),
            category=category)

    match = YEAR.search(t)
    if match:
        authors = t[:match.start()].split('(')[0].strip()
        authors = [HumanName(n.strip()).last for n in authors.split('&')]
        key = '%s %s' % (' & '.join(authors), match.group('year').strip())
    else:
        key = None
    return dict(
        key=key,
        id=slug(key) if key else unicode(md5(t.encode('utf8')).hexdigest()),
        text=t,
        html=unicode(e),
        category=category)
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cognition.__name__,
        name="COSTATOL",
        description="Cognitive Structures across the Tree of Life",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cognition.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    #
    # TODO: add editors!
    #

    for rec in Database.from_file(args.data_file('sources.bib')):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    contrib = common.Contribution(id='costatol', name='COSTATOL')
    for datapoint in reader(args.data_file('values.csv'), delimiter=',', dicts=True):
        param = data['Parameter'].get(datapoint['cognitive capacity'])
        if not param:
            name = datapoint['cognitive capacity']
            param = data.add(common.Parameter, name, id=slug(name), name=name)

        species = data['Language'].get(datapoint['species'])
        if not species:
            name = datapoint['species']
            species = data.add(common.Language, name, id=slug(name), name=name)

        vid = '%s-%s' % (species.id, param.id)
        vs = data.add(
            common.ValueSet, vid,
            id=vid,
            language=species,
            parameter=param,
            contribution=contrib)
        data.add(common.Value, vid, id=vid, name=datapoint['value'], valueset=vs)

        match = source_pattern.match(datapoint['source'])
        if match:
            DBSession.add(common.ValueSetReference(
                valueset=vs,
                source=data['Source'][match.group('key')],
                description=match.group('pages')))

    for species in reader(args.data_file('species.csv'), delimiter=',', namedtuples=True):
        data['Language'][species.name].longitude = species.longitude
        data['Language'][species.name].latitude = species.latitude
def main(args):
    glottocodes = {}
    if getuser() == "robert":
        glottocodes = glottocodes_by_isocode("postgresql://robert@/glottolog3")

    data = Data()
    dataset = common.Dataset(
        id=autotyp.__name__, name="AUTOTYP", description="AUTOTYP",
        domain="autotyp.clld.org")
    DBSession.add(dataset)

    bib = Database.from_file(args.data_file("LenaBib.bib"), lowercase=True)

    for i, spec in enumerate([
        ("bickel", "Balthasar Bickel", "University of Zurich"),
        ("nichols", "Johanna Nichols", "University of California, Berkeley"),
    ]):
        contributor = data.add(common.Contributor, spec[0], id=spec[0], name=spec[1])
        DBSession.add(common.Editor(dataset=dataset, ord=i + 1, contributor=contributor))

    for l in rows(
            args.data_file("backbone_09Jan2014_directexport.tab"),
            newline="\r",
            encoding="macroman",
            namedtuples=True):
        # LID  language  ISO639.3.2013  stock  continent  area  latitude  longitude
        if l.stock not in data["Stock"]:
            stock = data.add(models.Stock, l.stock, id=slug(l.stock), name=l.stock)
        else:
            stock = data["Stock"][l.stock]

        if l.continent not in data["Continent"]:
            continent = data.add(
                models.Continent, l.continent, id=slug(l.continent), name=l.continent)
        else:
            continent = data["Continent"][l.continent]

        if l.area not in data["Area"]:
            area = data.add(
                models.Area, l.area, id=slug(l.area), name=l.area, continent=continent)
        else:
            area = data["Area"][l.area]

        lang = data.add(
            models.Languoid,
            l.LID,
            id=l.LID,
            name=l.language,
            latitude=coord(l.latitude),
            longitude=coord(l.longitude),
            stock=stock,
            area=area,
        )
        add_language_codes(data, lang, l.ISO639_3_2013, glottocodes=glottocodes)

    loader.case_alignment(args, data, bib)
    loader.inclusive_excusive(args, data, bib)
def __init__(self, fname):
    self.fname = fname
    self.authors = [c.id for c in DBSession.query(Contributor)]
    self.languages = {l.id: l.name for l in DBSession.query(Language)}
    self.id = self.get_id(fname)
    self.refs = {slug(s.name): s for s in DBSession.query(Source) if s.name}
    self.examples = defaultdict(list)
    for row in DBSession.query(Sentence):
        if row.description:
            self.examples[slug(row.description.split('OR:')[0])].append(
                (row.name, row.id))
    for k in self.examples.keys():
        self.examples[k] = {slug(k): v for k, v in self.examples[k]}
def get_normalized_name(authors):
    authors = authors.lower()
    if ', and ' in authors:
        afirst, alast = authors.split(', and ')
        parts = afirst.split(',', 2)
        if len(parts) > 2:
            # Janhunen, Juha, Marja Peltomaa, Erika Sandman, and Xiawu Dongzhou
            return slug(parts[1] + parts[0] + parts[2] + alast)
        else:
            # Goswami, G. C., and Jyotiprakash Tamuli
            return slug(parts[1] + parts[0] + alast)
    else:
        # Fuchs, David R
        last, first = authors.split(',')
        return slug(first + last)
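# A minimal usage sketch (not from the original source): the inputs below are
# hypothetical author strings, and the expected outputs assume the semantics of
# clldutils' slug (lowercase, alphanumerics only).
print(get_normalized_name('Fuchs, David R'))
# expected: 'davidrfuchs'
print(get_normalized_name('Goswami, G. C., and Jyotiprakash Tamuli'))
# expected: 'gcgoswamijyotiprakashtamuli'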
def main(args):
    data = Data()

    # fetch language data from glottolog:
    glottolog = glottocodes_by_isocode(
        'postgresql://robert@/glottolog3', ['id', 'name', 'latitude', 'longitude'])

    dataset = common.Dataset(
        id=jcld.__name__,
        name="Journal of Cross-Linguistic Databases",
        domain='jcld.clld.org')
    DBSession.add(dataset)

    contribution = data.add(common.Contribution, '1', id='1', name='fb')

    for i, row in enumerate(reader(
            file(args.data_file('fb_jcld.tab')), namedtuples=True, encoding='latin1')):
        if row.Feature not in data['Parameter']:
            parameter = data.add(common.Parameter, row.Feature, id='1', name=row.Feature)
        else:
            parameter = data['Parameter'][row.Feature]

        if row.Value not in data['DomainElement']:
            de = data.add(
                common.DomainElement, row.Value,
                id='%s-%s' % (parameter.id, slug(row.Value)),
                parameter=parameter,
                name=row.Value)
        else:
            de = data['DomainElement'][row.Value]

        if row.Language not in data['Language']:
            if row.Language not in glottolog:
                print '--->', row.Language
                continue
            glottocode, name, lat, lon = glottolog[row.Language]
            language = data.add(
                common.Language, row.Language,
                id=slug(row.Language),
                name=name,
                latitude=lat,
                longitude=lon)
        else:
            language = data['Language'][row.Language]

        id_ = str(i + 1)  # '%s-%s' % (parameter.id, language.id)
        vs = common.ValueSet(
            id=id_,
            parameter=parameter,
            language=language,
            contribution=contribution,
            description=row.Comment,
            source=row.Source)
        common.Value(valueset=vs, name=row.Value, domainelement=de)
def get_genera(data):
    """
    Zo'e: tupiguarani
    """
    sql = """select g.id, g.name, f.name from genus as g, family as f
        where g.family_pk = f.pk"""
    walsdb = create_engine('postgresql://robert@/wals3')
    genera = {}
    for row in walsdb.execute(sql):
        genus = data.add(models.Genus, row[0], id=row[0], name=row[1], description=row[2])
        genera[row[0]] = genus
        genera[slug(row[1])] = genus

    sql = """select l.iso_codes, g.id from walslanguage as l, genus as g
        where l.genus_pk = g.pk and l.iso_codes is not null"""
    for row in walsdb.execute(sql):
        for code in row[0].split(', '):
            if code not in genera:
                genera[code] = genera[row[1]]

    for row in walsdb.execute("select key, value from config"):
        if row[0].startswith('__Genus_'):
            gid = row[0].replace('_', '').split('Genus', 1)[1]
            genera[gid] = None if row[1] == '__gone__' else genera[row[1]]

    return genera
def from_csv(cls, row, data=None, description=None):
    obj = cls(**{
        n: row[i] for i, n in enumerate(cls.__csv_head__)
        if '__' not in n and n != 'audio'})
    if not slug(row[1]):
        obj.active = False
    row = dict(list(zip(cls.__csv_head__, row)))
    sid = row['taxa__id']
    lid = row['languages__id']
    vsid = '%s-%s' % (sid, lid)
    if vsid in data['ValueSet']:
        obj.valueset = data['ValueSet'][vsid]
    else:
        # Note: source and references are dumped redundantly with each word, so we
        # only have to recreate these if a new ValueSet had to be created.
        obj.valueset = data.add(
            ValueSet, vsid,
            id=vsid,
            parameter=data['Taxon'][sid],
            language=data['Languoid'][lid],
            contribution=data['Contribution']['tsammalex'])

    if row['refs__ids']:
        for i, rid, pages in parse_ref_ids(row['refs__ids']):
            data.add(
                NameReference, '%s-%s' % (obj.id, i),
                name=obj,
                source=data['Bibrec'][rid],
                description=pages or None)

    for rel, cls in [
        ('categories', 'Category'),
        ('habitats', 'Category'),
        ('uses', 'Use'),
    ]:
        for id_ in split_ids(row[rel + '__ids']):
            getattr(obj, rel).append(data[cls][id_.strip()])
    return obj
def language_lookup(self):
    if not self._language_lookup:
        self._language_lookup = {
            slug(v): k for (k, v) in self.languages.items()
        }
    return self._language_lookup
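# In effect this caches a slug-keyed reverse index over self.languages.
# A minimal illustration with hypothetical data (assuming clldutils' slug):
languages = {'abc': 'Abadi (Gulf Province)'}
lookup = {slug(v): k for k, v in languages.items()}
print(lookup)  # expected: {'abadigulfprovince': 'abc'}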
def upgrade():
    conn = Connection(op.get_bind())

    # The genus for Yanesha’ needs to be renamed Yanesha’.
    conn.update(Genus, dict(name="Yanesha'"), id='westernarawakan')

    # Bahuana
    conn.update_name('bah', 'Xiriana')
    conn.update_glottocode('bah', 'xiri1243')
    conn.update_iso('bah', xir='Xiriâna')
    coords = Coordinates('2d40N', '62d30W')
    conn.update(
        Language, dict(latitude=coords.latitude, longitude=coords.longitude), id='bah')

    spk = conn.execute('select max(pk) from source').fetchone()[0] + 1
    lpk = conn.pk(Language, 'bah')
    spk = conn.insert(
        Source,
        pk=spk,
        id='Ramirez-1992',
        name='Ramirez 1992',
        description='Bahuana: une nouvelle langue de la famille Arawak',
        bibtex_type=EntryType.book,
        author='Ramirez, Henri',
        year='1992',
        title='Bahuana: une nouvelle langue de la famille Arawak',
        address='Paris',
        publisher='Amerindia')
    conn.insert(LanguageSource, language_pk=lpk, source_pk=spk)
    vspk = conn.pk(ValueSet, lpk, attr='language_pk')
    conn.insert(ValueSetReference, valueset_pk=vspk, source_pk=spk, description='35')

    # split northern arawakan
    GENERA = {
        'Alto-Orinoco': 'bnw mpr'.split(),
        'Caribbean Arawakan': 'ara grf goa'.split(),
        'Inland Northern Arawakan': 'acg bae cur ppc res tar wrk ycn'.split(),
        'Bahuanic': ['bah'],
        'Wapishanan': ['wps'],
    }
    ICONS = ['cdd0000', 'cffcc00', 'cffff00', 'cff6600', 'cffffcc']
    fpk = conn.pk(Family, 'arawakan')
    for icon, (name, lids) in zip(ICONS, GENERA.items()):
        gpk = conn.insert(Genus, id=slug(name), name=name, icon=icon, family_pk=fpk)
        for lid in lids:
            conn.update_genus(lid, gpk)

    conn.insert(
        Config, key=Config.replacement_key(Genus, 'northernarawakan'), value=Config.gone)
    conn.delete(Genus, id='northernarawakan')
def issue24(session, timestamp):  # pragma: no cover
    #- Update language cea (name, coords, alternative names, iso code (and name))
    # Change name of Cree (Eastern) to Cree (Swampy)
    # Change coordinates to 56dN, 90dW
    # Change the Ethnologue name to Cree (Swampy)
    # Remove the Routledge and Other names
    # Change the ISO code to csw. glottocode to swam1239
    cea = common.Language.get('cea', session=session)
    cre = common.Language.get('cre', session=session)

    for i in range(len(cea.languageidentifier)):
        try:
            del cea.languageidentifier[i]
        except IndexError:
            pass

    for values in [
        ('gc-csw', 'swam1239', 'Swampy Cree', 'glottolog'),
        ('csw', 'csw', 'Cree (Swampy)', 'iso639-3'),
        ('ethnologue-csw', 'Cree (Swampy)', 'ethnologue', 'name'),
    ]:
        id = common.Identifier(
            **dict(zip('id name description type'.split(), values)))
        cea.languageidentifier.append(
            common.LanguageIdentifier(language=cea, identifier=id))

    cea.updated = timestamp
    cea.name = 'Cree (Swampy)'
    cea.ascii_name = slug('Cree (Swampy)')
    cea.latitude = 56.0
    cea.longitude = -90.0

    for pid in ['81A', '82A', '83A']:
        vsq = session.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == pid)
        vs1 = vsq.filter(common.ValueSet.language_pk == cea.pk).one()
        vs2 = vsq.filter(common.ValueSet.language_pk == cre.pk).one()
        vs2.updated = timestamp
        vs1.updated = timestamp
        for ref in vs1.references:
            if ref.source.id == 'Hive-1948':
                ref.valueset = vs2
    session.flush()

    #- Delete valueset 85A-cea
    vs = session.query(common.ValueSet)\
        .join(common.Parameter)\
        .filter(common.Parameter.id == '85A')\
        .filter(common.ValueSet.language_pk == cea.pk).one()
    session.delete(vs.values[0])
    session.delete(vs.references[0])
    session.delete(vs)

    #- delete valueset 131A-cea add 131A-cre
    vs_switch_lang(session, timestamp, '131A-cea', 'cre')
def match_obsolete_refs(args):
    with open(args.data_file(args.version, 'obsolete_refs.json')) as fp:
        refs = json.load(fp)

    matched = args.data_file(args.version, 'obsolete_refs_matched.json')
    if matched.exists():
        with open(matched) as fp:
            matched = json.load(fp)
    else:
        matched = {}

    #
    # TODO: optionally re-evaluate known-unmatched refs!
    #

    count = 0
    f, m = 0, 0
    for id_ in refs:
        if id_ in matched:
            continue
        count += 1
        if count > 1000:
            print '1000 obsolete refs processed!'
            break
        ref = Ref.get(id_)
        found = False
        if ref.description and len(ref.description) > 5:
            for match in DBSession.query(Ref)\
                    .filter(not_(Source.id.in_(refs)))\
                    .filter(Source.description.contains(ref.description))\
                    .filter(or_(Source.author == ref.author, Source.year == ref.year))\
                    .limit(10):
                print '++', ref.id, '->', match.id, '++', ref.author, '->', match.author, '++', ref.year, '->', match.year
                matched[ref.id] = match.id
                found = True
                break
        if not found and ref.name and len(ref.name) > 5:
            for match in DBSession.query(Ref)\
                    .filter(not_(Source.id.in_(refs)))\
                    .filter(Source.name == ref.name)\
                    .limit(10):
                try:
                    if match.description and ref.description and slug(match.description) == slug(ref.description):
                        print '++', ref.id, '->', match.id, '++', ref.description, '->', match.description
                        matched[ref.id] = match.id
                        found = True
                        break
                except AssertionError:
                    continue
        if not found:
            m += 1
            print '--', ref.id, ref.name, ref.description
            matched[ref.id] = None
        else:
            f += 1

    print f, 'found'
    print m, 'missed'
    with open(args.data_file(args.version, 'obsolete_refs_matched.json'), 'w') as fp:
        json.dump(matched, fp)
def get_id(self, fname):
    match = self.fname_pattern.search(fname.name)
    assert match
    lid = self.language_lookup.get(slug(match.group('name')))
    if lid:
        return '%s.%s' % (lid, '%(vol)s-%(no)s' % match.groupdict())
    assert not match.group('no')
    return '%(vol)s-%(name)s' % match.groupdict()
def update_name(self, lid, newname, other=None):
    lpk = self.pk(Language, lid)
    self.update(Language, dict(name=newname), pk=lpk)
    self.update(
        WalsLanguage, dict(ascii_name=slug(newname, remove_whitespace=False)), pk=lpk)
    if other:
        ipk = self.insert(Identifier, name=other, description='other', type='name')
        self.insert(LanguageIdentifier, identifier_pk=ipk, language_pk=lpk)
def repl2(match):
    s = match.string[match.start():match.end()]
    id_ = slug(match.group('key').replace('&amp;', '&'))
    ref = self.refs.get(id_)
    if not ref or id_ in ids:
        return s
    return '%s<a href="/sources/%s">%s</a>%s' \
        % (match.group('before'), ref.id, match.group('key'), match.group('after'))
def main(args):
    if args.cmd == 'convert':
        outdir = args.data_file('texts', args.what).joinpath('lo')
        if args.what == 'Atlas':
            for p in args.data_file('texts', args.what).joinpath('in').files():
                if p.ext in ['.doc', '.docx']:
                    convert_chapter(p, outdir)
        elif args.what == 'Surveys':
            pass

    if args.cmd == 'parse':
        outdir = args.data_file('texts', args.what).joinpath('processed')
        for p in args.data_file('texts', args.what).joinpath('lo').files():
            if args.in_name in p.namebase:
                globals()[args.what](p)(outdir)

    if args.cmd == 'refs':
        refs = []
        for p in args.data_file('texts', args.what).joinpath('processed').files('*.json'):
            if args.in_name in p.namebase:
                md = jsonload(p)
                refs.extend(md['refs'])
        db = get_bibtex(refs)
        unmatched = 0
        distinct = defaultdict(list)
        for i, rec in enumerate(db):
            if 'all' in rec:
                unmatched += 1
            distinct[(
                slug(rec.get('key', unicode(uuid4().hex))),
                slug(unicode(rec.get('title', uuid4().hex)), remove_whitespace=False),
            )] = 1
        print unmatched, 'of', i, 'distinct', len(distinct)

        c = 0
        for key, refs in groupby(sorted(distinct.keys()), key=lambda t: t[0]):
            refs = list(refs)
            if len(refs) > 1:
                for t1, t2 in combinations([t[1] for t in refs], 2):
                    if fuzz.partial_ratio(t1, t2) > 80:
                        print t1
                        print t2
                        print
                        c += 1
        print c
        return
def refactor(self, soup, md):
    d = BeautifulSoup('<body></body>')
    body = d.find('body')

    linked = 0
    notlinked = 0
    multiple = 0
    for p in self._chunks(soup):
        if not isinstance(p, list):
            p = [p]
        for pp in p:
            if pp.is_header:
                continue
            elif pp.is_refs:
                md['refs'] = [self.get_ref(line[0]) for line in pp.lines]
            else:
                ex = None
                if pp.is_example:
                    container = d.new_tag(
                        'blockquote',
                        **{
                            'class': 'example',
                            'style': 'font-size:100%;padding-left:1.8em;margin-left:0.3em'})
                    #body.append(Tag(name='hr'))
                else:
                    container = body
                for e, line, t in pp.lines:
                    body.append(e)
                    if pp.is_example:
                        if re.match('\([0-9]+\)', line):
                            e.attrs['style'] = 'text-indent:-2em'
                        equo = "’".decode('utf8')
                        if line.startswith("‘".decode('utf8')) and equo in line:
                            line = equo.join(line[1:].split(equo)[:-1]).strip()
                            examples = self.examples.get(slug(line))
                            if examples:
                                if len(examples) > 1:
                                    #print '~~~', line
                                    multiple += 1
                                else:
                                    ex = examples.values()[0]
                                    #print '+++'
                                    linked += 1
                            else:
                                #print '---', line
                                notlinked += 1
                    container.append(e)
                if pp.is_example:
                    if ex:
                        container.attrs['id'] = 'ex-' + ex
                        container.append(new_tag(d, 'small', new_tag(
                            d, 'a', 'See example ' + ex, href='/sentences/' + ex)))
                    body.append(container)

    #print 'examples:', linked, 'linked,', notlinked, 'not linked,', multiple, 'multiple choices'
    for e in body.find_all('font'):
        e.unwrap()
    return d
def _get_bibtex(refs):
    for ref in refs:
        genre = 'misc'
        id = ref['id']
        attrs = dict(all=ref['text'])
        t = ref['text']

        match = YEAR.search(t)
        if match:
            authors = 'editor' if match.group('ed') else 'author'
            attrs['key'], attrs[authors] = normalized_author(t[:match.start()].strip())
            attrs['title'], rem = [s.strip() for s in re.split('\.|\?', t[match.end():], 1)]
            attrs['year'] = match.group('year')
            attrs['key'] = '%(key)s %(year)s' % attrs

            m = EDS.match(rem)
            if m:
                assert 'editor' not in attrs
                attrs['editor'] = normalized_author(m.group('eds').strip())[1]
                genre = 'incollection'
                rem = rem[m.end():].strip()
                mm = BTITLE_PAGES.match(rem)
                if mm:
                    attrs['booktitle'] = mm.group('btitle').strip()
                    attrs['pages'] = mm.group('pages').strip()
                    rem = rem[mm.end():].strip()
            else:
                mm = JOURNAL.match(rem)
                if mm:
                    genre = 'article'
                    attrs['journal'] = mm.group('journal').strip()
                    attrs['volume'] = mm.group('volume').strip()
                    if mm.group('number'):
                        attrs['number'] = mm.group('number').strip()
                    attrs['pages'] = mm.group('pages').strip()
                    rem = rem[mm.end():].strip()

            m = PUBLISHER.match(rem)
            if m:
                if genre == 'misc':
                    genre = 'book'
                attrs['place'] = m.group('place').strip()
                attrs['publisher'] = m.group('publisher').strip()
                rem = rem[m.end():].strip()

            _rem = []
            for piece in [p.strip() for p in re.split('\.(?:\s+|$)', rem) if p.strip()]:
                if piece.startswith('http') and not re.search('\s+', piece):
                    attrs['url'] = piece
                elif piece.startswith('(') and piece.endswith(')'):
                    attrs['note'] = piece[1:-1].strip()
                else:
                    _rem.append(piece)
            rem = '. '.join(_rem)
            if not slug(unicode(rem)):
                del attrs['all']

        yield Record(genre, id, **attrs)
def upgrade():
    conn = Connection(op.get_bind())

    # https://github.com/clld/wals-data/issues/50
    fpk = conn.pk(Family, 'utoaztecan')
    gname = 'California Uto-Aztecan'
    gid = slug(gname)
    gpk = conn.insert(Genus, id=gid, name=gname, icon='fffff00', family_pk=fpk)

    for oid in ['takic', 'tubatulabal']:
        opk = conn.pk(Genus, oid)
        conn.update(WalsLanguage, dict(genus_pk=gpk), genus_pk=opk)
        conn.insert(Config, key=Config.replacement_key(Genus, oid), value=gid)
        conn.delete(Genus, id=oid)

    # https://github.com/clld/wals-data/issues/49
    conn.update_name('aym', 'Aymara (Central)')
    conn.update_glottocode('aym', 'cent2142')
    conn.update_iso('aym', 'ayr', ayc='Southern Aymara')

    # https://github.com/clld/wals-data/issues/48
    # The genus Guaymi should be renamed Guaymiic.
    conn.update(Genus, dict(name='Guaymiic'), id='guaymi')

    # The genus Aruak should be renamed Arhuacic.
    conn.update(Genus, dict(name='Arhuacic'), id='aruak')

    # The language Motilón should be renamed Barí (while keeping Motilón as the name of
    # the genus).
    conn.update_name('mti', 'Barí')

    # The genus Chibchan Proper should be split into two genera, Chibcha-Duit, containing
    # the language Muisca, and Tunebo, containing the language Tunebo.
    conn.update_genus('msc', ('chibchaduit', 'Chibcha-Duit', 'fffff00'), family='chibchan')
    conn.update_genus('tnb', ('tunebo', 'Tunebo', 'fffcc00'), family='chibchan')
    conn.insert(
        Config, key=Config.replacement_key(Genus, 'chibchanproper'), value=Config.gone)
    conn.delete(Genus, id='chibchanproper')

    # https://github.com/clld/wals-data/issues/44
    conn.update_name('jlu', 'Luwo', other='Jur Luwo')

    # https://github.com/clld/wals-data/issues/43
    conn.update_genus('ctw', ('catawban', 'Catawban', 'fffcc00'), family='siouan')
    conn.update(Genus, dict(name='Core Siouan'), id='siouan')

    # https://github.com/clld/wals-data/issues/40
    conn.update_source('Sumbuk-2002', year='1999', name='Sumbuk 1999')
def yield_valid_authors(self, authors):
    for name in authors:
        n = HumanName(name)
        res = dict(name=name, id=slug(n.last + n.first + n.middle))
        if name == 'Margot C. van den Berg':
            res['id'] = 'vandenbergmargotc'
        if name == 'Khin Khin Aye':
            res['id'] = 'khinkhinayenone'
        if name == 'Melanie Halpap':
            res['id'] = 'revismelanie'
        if res['id'] not in self.authors:
            raise ValueError(name)
        yield res
def __call__(self, outdir):
    """
    runs a parser workflow consisting of
    - preprocess
    - refactor
    - postprocess
    writes the results, an html, a css and a json file to disk.
    """
    cssutils_logger = logging.getLogger('CSSUTILS')
    cssutils_logger.setLevel(logging.ERROR)
    print(self.fname.namebase.encode('utf8'))

    with open(self.fname, encoding='utf8') as fp:
        c = fp.read()

    soup = BeautifulSoup(self.preprocess(self._preprocess(c)))

    # extract css from the head section of the HTML doc:
    css = cssutils.parseString('\n')
    for style in soup.find('head').find_all('style'):
        for rule in self.cssrules(style):
            css.add(rule)

    md = dict(outline=[], refs=[], authors=[])
    soup = self.refactor(soup, md)

    # enhance section headings:
    for section, t in tag_and_text(soup.find_all('h3')):
        t = t.split('[Note')[0]
        id_ = 'section-%s' % slug(t)
        md['outline'].append((t, id_))
        section.attrs['id'] = id_
        for s, attrs in [
            (u'\u21eb', {'href': '#top',
                         'title': 'go to top of the page',
                         'style': 'vertical-align: bottom'}),
            ('¶', {'class': 'headerlink',
                   'href': '#' + id_,
                   'title': 'Permalink to this section'}),
        ]:
            append(section, soup.new_string('\n'), new_tag(soup, 'a', s, **attrs))

    body = self.insert_links(unicode(soup.find('body')), md)

    # write output files:
    with open(outdir.joinpath('%s.html' % self.id), 'w', encoding='utf8') as fp:
        fp.write(self.wrap(self.postprocess(body)))
    with open(outdir.joinpath('%s.css' % self.id), 'wb') as fp:
        fp.write(self.csstext(css))
    md['authors'] = list(self.yield_valid_authors(md['authors']))
    jsondump(md, outdir.joinpath('%s.json' % self.id), indent=4)
def bibtex2source(rec):
    year = bibtex.unescape(rec.get('year', 'nd'))
    fields = {}
    jsondata = {}
    for field in bibtex.FIELDS:
        if field in rec:
            value = bibtex.unescape(rec[field])
            container = fields if hasattr(common.Source, field) else jsondata
            container[field] = value

    return common.Source(
        id=slug(rec.id),
        name=('%s %s' % (
            bibtex.unescape(rec.get('author', rec.get('editor', ''))), year)).strip(),
        description=bibtex.unescape(rec.get('title', rec.get('booktitle', ''))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields)
def glottocode(name, conn, codes=None):
    #
    # TODO: must take legacy glottocodes into account!
    #
    codes = {} if codes is None else codes
    letters = slug(name)[:4].ljust(4, 'a')
    r = conn.execute(
        "select id from language where id like '" + letters
        + "%%' order by id desc limit 1").fetchone()
    if r:
        number = int(r[0][4:]) + 1
    else:
        number = 1234
    number = str(number)
    assert len(number) == 4
    res = letters + number
    i = 0
    while res in codes:
        i += 1
        res = letters + str(int(number) + i)
    codes[res] = True
    return res
def normalize_name(s):
    """This function is called to convert ASCII strings to something that can pass as
    python attribute name, to be used with namedtuples.

    >>> assert normalize_name('class') == 'class_'
    >>> assert normalize_name('a-name') == 'a_name'
    >>> assert normalize_name('a näme') == 'a_name'
    >>> assert normalize_name('Name') == 'Name'
    >>> assert normalize_name('') == '_'
    >>> assert normalize_name('1') == '_1'
    """
    s = s.replace('-', '_').replace('.', '_').replace(' ', '_')
    if s in keyword.kwlist:
        return s + '_'
    s = '_'.join(slug(ss, lowercase=False) for ss in s.split('_'))
    if not s:
        s = '_'
    if s[0] not in ascii_letters + '_':
        s = '_' + s
    return s
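# A small usage sketch (not from the original source, hypothetical header values):
# column headers that are not valid Python identifiers become safe namedtuple field
# names, which is what makes attribute access like row.Taxonomic_status_in_TPL in the
# snippets above possible.
from collections import namedtuple

header = ['class', 'a-name', 'Taxonomic status in TPL']
Row = namedtuple('Row', [normalize_name(col) for col in header])
print(Row._fields)  # expected: ('class_', 'a_name', 'Taxonomic_status_in_TPL')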
def get_refs(line):
    line = line.strip()\
        .replace('|Wikipedia', '')\
        .replace(',|', ',')\
        .replace(' (ed.)', ' ed.')\
        .replace(':,77', ':77,')
    if '(' in line and ')' not in line:
        line = line + ')'

    for piece in SEP.findall(line):
        piece = piece.strip()
        if piece.startswith('http://'):
            # TODO: handle URL
            yield piece
            continue
        if not ('(' in piece and ')' in piece):
            if 'dobes' in piece.lower():
                yield 'DOBES'
            elif piece == 'Cunningham ed.':
                yield ('cunningham', None)
            else:
                print(piece)
                raise ValueError
            continue
        assert len(piece.split('(')) == 2
        pages = None
        year_pages = piece.split('(')[1].split(')')[0]
        m = YEAR_PAGES.match(year_pages)
        if not m:
            if year_pages == '?:15':
                pages = '15'
            assert year_pages in ['?:15', '1994:']
        else:
            pages = m.group('pages')
        if ':' in piece:
            r = piece.split(':')[0]
        else:
            r = piece.split(')')[0]
        r = slug(r)
        r = KEY_MAP.get(r, r)
        yield (r, pages)
def glottocode(name, conn, codes=None):
    letters = slug(name)[:4].ljust(4, 'a')
    active = select([cast(func.substring(Languoid.id, 5), Integer).label('number')])\
        .where(Languoid.id.startswith(letters))
    legacy = select([cast(func.substring(LegacyCode.id, 5), Integer).label('number')])\
        .where(LegacyCode.id.startswith(letters))
    if not codes:
        known = union_all(active, legacy)
    else:
        dirty = select([cast(func.substring(literal_column('dirty'), 5), Integer).label('number')])\
            .select_from(func.unnest(list(codes)).alias('dirty'))\
            .where(literal_column('dirty').startswith(letters))
        known = union_all(active, legacy, dirty)
    number = conn.execute(
        select([func.coalesce(func.max(literal_column('number') + 1), 1234)])
        .select_from(known.alias())).scalar()
    number = str(number)
    assert len(number) == 4
    res = letters + number
    assert GLOTTOCODE_PATTERN.match(res)
    if codes is not None:
        codes[res] = True
    return res
def upgrade():
    conn = Connection(op.get_bind())

    for fname, genera in DATA.items():
        fpk = conn.insert(Family, id=slug(fname), name=fname)
        for gspec, lnames in genera.items():
            if isinstance(gspec, tuple):
                if len(gspec) == 3:
                    # new genus
                    gpk = conn.insert(
                        Genus, id=gspec[0], name=gspec[1], icon=gspec[2], family_pk=fpk)
                elif len(gspec) == 2:
                    # rename genus
                    gpk = conn.pk(Genus, gspec[0])
                    conn.update(Genus, dict(family_pk=fpk, name=gspec[1]), pk=gpk)
                else:
                    raise ValueError()
            else:
                # just update the family
                gpk = conn.pk(Genus, gspec)
                conn.update(Genus, dict(family_pk=fpk), pk=gpk)
            for lname in lnames:
                lpk = conn.pk(Language, lname, attr='name')
                conn.update(WalsLanguage, dict(genus_pk=gpk), pk=lpk)

    for gid in GONE:
        conn.insert(Config, key=Config.replacement_key(Genus, gid), value=Config.gone)
        conn.delete(Genus, id=gid)

    conn.insert(
        Config, key=Config.replacement_key(Family, 'australian'), value=Config.gone)
    conn.delete(Family, id='australian')

    conn.update_name('mdl', 'Matngele')
def create(args):
    args.log.info('starting migration ...')
    data = Data()
    db = create_engine('postgresql://robert@/glottolog2')

    with transaction.manager:
        sn = data.add(common.Contributor, 'sn', id='sn', name='Sebastian Nordhoff')
        hh = data.add(common.Contributor, 'hh', id='hh', name='Harald Hammarström')
        rf = data.add(common.Contributor, 'rf', id='rf', name='Robert Forkel',
                      url="https://github.com/xrotwang")
        mh = data.add(common.Contributor, 'mh', id='mh', name='Martin Haspelmath')
        contrib = data.add(common.Contribution, 'c', id='classification',
                           name='Classification')
        data.add(common.ContributionContributor, 'hh', contribution=contrib, contributor=hh)
        params = dict(
            fc=data.add(common.Parameter, 'fc', id='fc', name='Family classification'),
            sc=data.add(common.Parameter, 'sc', id='sc', name='Subclassification'),
        )

        dataset = data.add(
            common.Dataset, 'd',
            id='glottolog',
            name='Glottolog 2.0',
            description='',
            published=datetime.date(2013, 8, 15),
            domain='glottolog.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
                'license_name':
                    'Creative Commons Attribution-ShareAlike 3.0 Unported License'})
        for i, ed in enumerate([sn, hh, rf, mh]):
            DBSession.add(common.Editor(dataset=dataset, contributor=ed, ord=i + 1))

        valuesets = {}

        def create_languoid(row, father_pk=None):
            glottocode = {'akun1242': 'akun1241'}.get(row['alnumcode'], row['alnumcode'])
            attrs = dict(
                pk=row['id'],
                id=glottocode,
                name=row['primaryname'],
                description=row['globalclassificationcomment'],
                level=getattr(models2.LanguoidLevel, row['level']),
                status=getattr(
                    models2.LanguoidStatus, (row['status'] or '').replace(' ', '_'), None),
                father_pk=father_pk,
                created=row['updated'],
                jsondata={} if not row['hname'] else {'hname': row['hname']},
            )
            for attr in ['active', 'updated', 'hid', 'latitude', 'longitude']:
                attrs[attr] = row[attr]
            l = data.add(models2.Languoid, row['id'], **attrs)

            for type_ in params:
                id_ = '%s%s' % (type_, row['id'])
                vs = data.add(
                    common.ValueSet, id_,
                    id=id_,
                    description=row['classificationcomment']
                    if type_ == 'fc' else row['subclassificationcomment'],
                    language=l,
                    parameter=params[type_],
                    contribution=contrib)
                data.add(
                    common.Value, id_,
                    id=id_,
                    name='%s - %s' % (row['level'], row['status']),
                    valueset=vs)
                DBSession.flush()
                valuesets[id_] = vs.pk
            return str(row['id'])

        level = 0
        parents = [
            create_languoid(row) for row in db.execute(
                'select * from languoidbase where father_id is null')]
        while parents:
            args.log.info('level: %s' % level)
            level += 1
            parents = [
                create_languoid(row, father_pk=data['Languoid'][row['father_id']].pk)
                for row in db.execute(
                    'select * from languoidbase where father_id in (%s)'
                    % ','.join(parents))]

        def handler(offset, batch):
            svalues = []
            rvalues = []
            for row in batch:
                jsondata = json.loads(row['jsondata'] or "{}")
                jsondata['bibtexkey'] = row['bibtexkey']
                dicts = {
                    's': dict(
                        pk=row['id'],
                        polymorphic_type='base',
                        id=str(row['id']),
                        name='%(author)s %(year)s' % row,
                        description=row['title'],
                        bibtex_type=getattr(EntryType, row['type']),
                        jsondata=jsondata),
                    'r': dict(pk=row['id']),
                }
                for model, map_ in {
                    's': {
                        'author': None,
                        'yearstring': 'year',
                        'year': 'year_int',
                        'startpage': 'startpage_int',
                        'numberofpages': 'pages_int',
                        'pages': None,
                        'edition': None,
                        'school': None,
                        'address': None,
                        'url': None,
                        'note': None,
                        'number': None,
                        'series': None,
                        'editor': None,
                        'booktitle': None,
                        'journal': None,
                        'volume': None,
                        'publisher': None,
                    },
                    'r': {
                        'endpage': 'endpage_int',
                        'inlg': None,
                        'inlg_code': None,
                        'subject': None,
                        'subject_headings': None,
                        'keywords': None,
                        'normalizedauthorstring': None,
                        'normalizededitorstring': None,
                        'ozbib_id': None,
                    },
                }.items():
                    for okey, nkey in map_.items():
                        dicts[model][nkey or okey] = row[okey]
                svalues.append(dicts['s'])
                rvalues.append(dicts['r'])
            DBSession.execute(common.Source.__table__.insert(), svalues)
            DBSession.execute(models2.Ref.__table__.insert(), rvalues)

        select(db, 'select * from refbase order by id', handler)
        DBSession.execute('COMMIT')

        for table, model, value, order in [
            ('macroarea', models2.Macroarea,
             lambda i, row: dict(
                 pk=row['id'], id=slug(row['name']), name=row['name'],
                 description=row['description']),
             None),
            ('country', models2.Country,
             lambda i, row: dict(pk=row['id'], id=row['alpha2'], name=row['name']),
             None),
            ('provider', models2.Provider,
             lambda i, row: dict(
                 pk=row['id'], id=slug(row['name']), name=row['description'],
                 description=row['comment'], abbr=row['abbr'], url=row['url'],
                 refurl=row['refurl'], bibfield=row['bibfield']),
             None),
            ('doctype', models2.Doctype,
             lambda i, row: dict(
                 pk=row['id'], id=slug(row['name']), abbr=row['abbr'],
                 name=row['name'], description=row['description']),
             None),
            ('refprovider', models2.Refprovider,
             lambda i, row: dict(
                 pk=i, provider_pk=row['provider_id'], ref_pk=row['refbase_id']),
             ('provider_id', 'refbase_id')),
            ('refdoctype', models2.Refdoctype,
             lambda i, row: dict(
                 pk=i, doctype_pk=row['doctype_id'], ref_pk=row['refbase_id']),
             ('doctype_id', 'refbase_id')),
        ]:
            insert(db, table, model, value, order=order)

        names = dict(
            (int(d['id']), d['pk']) for d in insert(
                db, 'namebase', common.Identifier,
                lambda i, row: dict(
                    pk=i, id=str(row['id']), name=row['namestring'], type='name',
                    description=row['nameprovider'],
                    lang=row['inlg'] if row['inlg'] and len(row['inlg']) <= 3 else 'en'),
                order='id'))
        codes = dict(
            (int(d['id']), d['pk']) for d in insert(
                db, 'codebase', common.Identifier,
                lambda i, row: dict(
                    pk=i, id=str(row['id']), name=row['codestring'],
                    type=common.IdentifierType.iso.value
                    if row['codeprovider'] == 'ISO' else row['codeprovider']),
                start=len(names), order='id'))
        res = insert(
            db, 'nodecodes', common.LanguageIdentifier,
            lambda i, row: dict(
                pk=i, language_pk=row['languoidbase_id'],
                identifier_pk=codes[row['codebase_id']]))
        insert(
            db, 'nodenames', common.LanguageIdentifier,
            lambda i, row: dict(
                pk=i, language_pk=row['languoidbase_id'],
                identifier_pk=names[row['namebase_id']]),
            start=len(res))

        for table, model, value in [
            ('languoidmacroarea', models2.Languoidmacroarea,
             lambda i, row: dict(
                 pk=i, languoid_pk=row['languoidbase_id'],
                 macroarea_pk=row['macroarea_id'])),
            ('languoidcountry', models2.Languoidcountry,
             lambda i, row: dict(
                 pk=i, languoid_pk=row['languoidbase_id'],
                 country_pk=row['country_id'])),
            ('noderefs', common.LanguageSource,
             lambda i, row: dict(
                 pk=i, language_pk=row['languoidbase_id'],
                 source_pk=row['refbase_id'])),
            ('refmacroarea', models2.Refmacroarea,
             lambda i, row: dict(
                 pk=i, macroarea_pk=row['macroarea_id'], ref_pk=row['refbase_id'])),
            ('refcountry', models2.Refcountry,
             lambda i, row: dict(
                 pk=i, country_pk=row['country_id'], ref_pk=row['refbase_id'])),
            ('spuriousreplacements', models2.Superseded,
             lambda i, row: dict(
                 pk=i, languoid_pk=row['languoidbase_id'],
                 replacement_pk=row['replacement_id'],
                 description=row['relation'])),
            ('justification', common.ValueSetReference,
             lambda i, row: dict(
                 pk=i,
                 valueset_pk=valuesets['%s%s' % (
                     'fc' if row['type'] == 'family' else 'sc', row['languoidbase_id'])],
                 source_pk=row['refbase_id'],
                 description=row['pages'])),
        ]:
            insert(db, table, model, value)
def issue20(session, timestamp):  # pragma: no cover
    # Datapoint http://wals.info/datapoint/121A/wals_code_bej should be changed to be
    # about Kemant (wals_code_kem). The same applies to the Rossini source for that
    # datapoint. (This is the only datapoint for this source.)
    vs_switch_lang(session, timestamp, '121A-bej', 'kem')

    # Eastern Ojibwa (wals_code_oji) should link to two ISO codes, ojg (as it is now)
    # but also otw.
    update_iso(session, timestamp, 'oji', otw='Ottawa')

    # There should be two ISO codes for Campa (Axininca) (wals_code_cax): cni and cpc
    update_iso(session, timestamp, 'cax', cpc='Ajyíninka Apurucayali')

    # All of the datapoints for Fula (Nigerian) (wals_code_fni) based on Arnott (1970)
    # need to be moved to Fula (Cameroonian) (wals_code_fua). In some cases, this
    # involves merging these datapoints with existing datapoints for wals_code_fua.
    source = common.Source.get('Arnott-1970', session=session)
    for vsr in source.valuesetreferences:
        vs = vsr.valueset
        if vs.language.id == 'fni':
            vs_switch_lang(session, timestamp, vs, 'fua')

    # The one datapoint for Fulani (Gombe) fgo needs to be moved to Fula (Cameroonian)
    # (wals_code_fua), thus removing Fulani (Gombe) as a language.
    vs_switch_lang(session, timestamp, '27A-fgo', 'fua')

    # Tlapanec (wals_code_tlp) should link to ISO code tcf rather than tpx.
    update_iso(session, timestamp, 'tlp', 'tpx', tcf="Malinaltepec Me'phaa")

    # Kongo (wals_code_kon) should link to two ISO codes, kwy and kng.
    update_iso(session, timestamp, 'kon', kwy=None)

    # One of the sources for Vili (wals_code_vif), namely Carrie (1890), turns out not
    # to be a source for Vili but another source for Kongo (wals_code_kon). This means:
    # - the page numbers given for Vili for 81A and 82A should be added to the
    #   corresponding datapoints for Kongo
    # - the value and source given for Vili for 91A should be transferred to Kongo
    #   (which currently does not have a value for that feature)
    # - all the datapoints for Vili for which Carrie was the source should be removed
    # - the values given for Vili for which Carrie was the source for the features
    #   associated with chapters 112, 143, and 144 are NOT being transferred to Kongo
    #   since they are inconsistent with the existing values for these features for Kongo
    source = common.Source.get('Carrie-1890', session=session)
    for vsr in source.valuesetreferences:
        vs = vsr.valueset
        if vs.language.id == 'vif':
            if vs.parameter.id in ['81A', '82A', '91A']:
                vs_switch_lang(session, timestamp, vs, 'kon')
            else:
                vs_delete(session, timestamp, vs)

    # One of the sources for Chichewa (wals_code_cic), namely Mateene 1980, turns out
    # to be a source for Nyanga (wals_code_nng). What this entails is:
    # - the values listed for Chichewa for features 81A, 82A, 83A, 86A, 87A, and 88A
    #   need to be added to Nyanga
    # - Mateene 1980 should be added as a source for Nyanga
    # - the references to Mateene as a source for datapoints for Chichewa need to be
    #   removed
    # - there is one datapoint for Chichewa where Mateene is listed as the only source,
    #   namely for 83A, but this is an error: the source for this datapoint should be
    #   Price 1966: passim; Mchombo 2004: 19 (the sources listed for 81A)
    source = common.Source.get('Mateene-1980', session=session)
    for vsr in source.valuesetreferences:
        vs = vsr.valueset
        if vs.language.id == 'cic':
            if vs.parameter.id in ['81A', '82A', '83A', '86A', '87A', '88A']:
                vs_copy_lang(session, timestamp, vs, 'nng')
            else:
                vs_delete(session, timestamp, vs)
            session.delete(vsr)
            if vs.parameter.id == '83A':
                session.add(
                    common.ValueSetReference(
                        valueset=vs,
                        source=common.Source.get('Price-1966', session=session),
                        description='passim'))
                session.add(
                    common.ValueSetReference(
                        valueset=vs,
                        source=common.Source.get('Mchombo-2004', session=session),
                        description='19'))

    # [gby] should be removed as an ISO code for Gwari (wals_code_gwa); the only one
    # should be [gbr]
    update_iso(session, timestamp, 'gwa', 'gby', gbr=None)

    # The ISO code for Grebo (wals_code_grb) should be corrected to [grj].
    update_iso(session, timestamp, 'grb', 'gry', grj="Southern Grebo")

    # The only ISO code for Lega is [lea]; please remove the second one.
    update_iso(session, timestamp, 'leg', 'lgm')

    # The sources for Ngbaka (wals_code_ngb) are actually for two different, only
    # distantly related languages. GrandEury is the source for Ngbaka (Minagende), which
    # has the same ISO code [nga] and location we are currently using for Ngbaka, so we
    # should keep the WALS code for that Ngbaka (but should change the name to
    # Ngbaka (Minagende)). Thomas (1963) is a source for what will be a new WALS
    # language, Ngbaka (Ma’bo). Its ISO code is [nbm]. We could use the same code nbm as
    # the WALS code. It belongs to the Ubangi genus, as Ngbaka (Minagende) does in the
    # current WALS classification, but see below where Ngbaka (Minagende) is being moved
    # out of Ubangi into a new genus. I would use the Glottolog location for it, but I
    # can’t find that in the new Glottolog. It is also in the Democratic Republic of the
    # Congo.
    #
    # This means that all the datapoints in the current WALS that use Thomas 1963 as a
    # source for Ngbaka need to be moved or copied to the new Ngbaka (Ma’bo). Those
    # datapoints in the current Ngbaka that only use Thomas as a source will need to be
    # removed (since that language is the new Ngbaka (Minagende)). Those datapoints that
    # use both sources in the current WALS will now become two datapoints, one for each
    # of these two languages.
    nbm = models.WalsLanguage(
        id='nbm',
        name="Ngbaka (Ma'bo)",
        ascii_name=slug("Ngbaka (Ma'bo)"),
        latitude=3.56,
        longitude=18.36,
        genus=models.Genus.get('ubangi', session=session))
    nbm.countries.append(models.Country.get('CD', session=session))
    session.add(nbm)
    update_iso(session, timestamp, nbm, nbm="Ngbaka Ma'bo")
    update_glottocode(session, timestamp, nbm, 'ngba1284')

    ngb = common.Language.get('ngb', session=session)
    ngb.name = 'Ngbaka (Minagende)'
    ngb.ascii_name = slug(ngb.name)
    for vs in ngb.valuesets:
        if 'Thomas-1963' in [ref.source.id for ref in vs.references]:
            if len(vs.references) > 1:
                vs_copy_lang(session, timestamp, vs, nbm)
            else:
                vs_switch_lang(session, timestamp, vs, nbm)

    # The ISO code for Sisaala (wals_code_ssa) needs to be changed from [ssl] to [sld].
    update_iso(session, timestamp, 'ssa', 'ssl', sld='Sissala')

    # The ISO code for Makua (wals_code_mua) should be changed to [mgh] and [xsq].
    update_iso(session, timestamp, 'mua', 'vmw', mgh='Makhuwa-Meetto', xsq='Makhuwa-Saka')

    # A change to the genealogical classification: four languages need to be taken out
    # of the Ubangi genus and put into a new genus within Niger-Congo called
    # Gbaya-Manza-Ngbaka (first below is the WALS code, last is the ISO code):
    #
    # gbb  Gbeya Bossangoa  gbp
    # gbk  Gbaya Kara       gya
    # mdo  Mbodomo          gmm
    # ngb  Ngbaka           nga
    #
    update_classification(
        session, timestamp, ['gbb', 'gbk', 'mdo', 'ngb'], 'gbayamanzangbaka',
        genus_name='Gbaya-Manza-Ngbaka', family_id='nigercongo')
def words(s):
    return set(slug(s.strip(), remove_whitespace=False).split())
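# A quick sketch of what this buys in the title comparison in gbs_func below
# (hypothetical titles, assuming clldutils' slug): case, punctuation and extra
# whitespace are normalized away before the word sets are compared.
twords = words('A Grammar of Lavukaleve.')
iwords = words('a grammar  of Lavukaleve')
# both become {'a', 'grammar', 'of', 'lavukaleve'}, so the subset/equality checks
# treat the two titles as the same
assert twords == iwords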
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(cast(common.Source.id, Integer))\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(sources):
        filepath = args.data_file('gbs', 'source%s.json' % source.id)

        if command == 'update':
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ['verify', 'update']:
            if filepath.exists():
                with open(filepath) as fp:
                    try:
                        data = json.load(fp)
                    except ValueError:
                        log.warn('no JSON object found in: %s' % filepath)
                        continue
                if not data['totalItems']:
                    continue
                item = data['items'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item['volumeInfo'].get('publishedDate', '').split('-')[0]
            if not year or year != slug(source.year or unicode('')):
                needs_check = True
            twords = words(stitle)
            iwords = words(
                item['volumeInfo']['title'] + ' '
                + item['volumeInfo'].get('subtitle', ''))
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if int(source.id) == 241:
                log.info('%s' % sorted(list(words(stitle))))
                log.info('%s' % sorted(list(iwords)))
            if needs_check:
                log.info(
                    '------- %s -> %s'
                    % (source.id, item['volumeInfo'].get('industryIdentifiers')))
                log.info('%s %s' % (
                    item['volumeInfo']['title'], item['volumeInfo'].get('subtitle', '')))
                log.info(stitle)
                log.info(item['volumeInfo'].get('publishedDate'))
                log.info(source.year)
                log.info(item['volumeInfo'].get('authors'))
                log.info(source.author)
                log.info(item['volumeInfo'].get('publisher'))
                log.info(source.publisher)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    with open(filepath, 'w') as fp:
                        json.dump({"totalItems": 0}, fp)
        elif command == 'update':
            source.google_book_search_id = item['id']
            source.update_jsondata(gbs=item)
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    'inauthor:' + quote_plus(source.author.encode('utf8')),
                    'intitle:' + quote_plus(title.encode('utf8')),
                ]
                if source.publisher:
                    q.append('inpublisher:' + quote_plus(source.publisher.encode('utf8')))
                url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, url))
                if r.status_code == 200:
                    with open(filepath, 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break

    if command == 'update':
        log.info('assigned gbs ids for %s out of %s sources' % (count, i))
    elif command == 'download':
        log.info('queried gbs for %s sources' % count)
def main(args): # pragma: no cover stats = Counter(new=0, updated=0, skipped=0) changes = {} with transaction.manager: update_providers(args) DBSession.flush() provider_map = get_map(Provider) macroarea_map = get_map(Macroarea) doctype_map = get_map(Doctype) languoid_map = {} for l in DBSession.query(Languoid): if l.hid: languoid_map[l.hid] = l languoid_map[l.id] = l for i, rec in enumerate(get_bib(args)): if i and i % 1000 == 0: print i, 'records done', stats['updated'] + stats['new'], 'changed' if len(rec.keys()) < 6: # not enough information! stats.update(['skipped']) continue changed = False assert rec.get('glottolog_ref_id') id_ = int(rec.get('glottolog_ref_id')) ref = DBSession.query(Source).get(id_) update = True if ref else False kw = { 'pk': id_, 'bibtex_type': rec.genre, 'id': str(id_), 'jsondata': {'bibtexkey': rec.id}, } for source, target in FIELD_MAP.items(): if target is None: continue value = rec.get(source) if value: value = unescape(value) if target: kw[target] = CONVERTER.get(source, lambda x: x)(value) else: kw['jsondata'][source] = value if kw['jsondata'].get('hhtype'): trigger = ca_trigger(kw['jsondata']['hhtype']) if trigger: kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger # try to extract numeric year, startpage, endpage, numberofpages, ... if kw.get('year'): # prefer years in brackets over the first 4-digit number. match = PREF_YEAR_PATTERN.search(kw.get('year')) if match: kw['year_int'] = int(match.group('year')) else: match = YEAR_PATTERN.search(kw.get('year')) if match: kw['year_int'] = int(match.group('year')) if kw.get('publisher'): p = kw.get('publisher') if ':' in p: address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)] if 'address' not in kw or kw['address'] == address: kw['address'], kw['publisher'] = address, publisher if rec.get('numberofpages'): try: kw['pages_int'] = int(rec.get('numberofpages').strip()) except ValueError: pass if kw.get('pages'): start, end, number = compute_pages(kw['pages']) if start is not None: kw['startpage_int'] = start if end is not None: kw['endpage_int'] = end if number is not None and 'pages_int' not in kw: kw['pages_int'] = number for k in kw.keys(): v = kw[k] if isinstance(v, basestring): v = v.strip() or None kw[k] = v if update: for k in kw.keys(): if k == 'pk': continue v = getattr(ref, k) if kw[k] != v: if k == 'jsondata': d = {k: v for k, v in ref.jsondata.items() if k in NONREF_JSONDATA} d.update(**kw[k]) ref.jsondata = d else: #print k, '--', v #print k, '++', kw[k] setattr(ref, k, kw[k]) changed = True if ref.id in changes: changes[ref.id][k] = ('%s' % v, '%s' % kw[k]) else: changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])} else: changed = True ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw) ref.description = ref.title or ref.booktitle originator = ref.author or ref.editor or 'Anonymous' ref.name = '%s %s' % (originator, ref.year or 'n.d.') a, r = update_relationship( ref.macroareas, [macroarea_map[name] for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))]) changed = changed or a or r src = [s.strip() for s in kw['jsondata'].get('src', '').split(',')] prv = {provider_map[slug(s)] for s in src if s} if set(ref.providers) != prv: ref.providers = list(prv) changed = True a, r = update_relationship( ref.doctypes, [doctype_map[m.group('name')] for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))]) changed = changed or a or r if not update: stats.update(['new']) DBSession.add(ref) elif changed: 
stats.update(['updated']) args.log.info('%s' % stats) DBSession.execute("update source set description = title where description is null and title is not null;") DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;") for row in list(DBSession.execute( "select pk, pages, pages_int, startpage_int from source where pages_int < 0")): pk, pages, number, start = row _start, _end, _number = compute_pages(pages) if _number > 0 and _number != number: DBSession.execute( "update source set pages_int = %s, startpage_int = %s where pk = %s" % (_number, _start, pk)) DBSession.execute( "update ref set endpage_int = %s where pk = %s" % (_end, pk)) jsondump(changes, args.data_dir.joinpath('references', 'changes.json'))
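The import above derives year_int by preferring a bracketed year over the first four-digit match. The actual PREF_YEAR_PATTERN and YEAR_PATTERN are defined elsewhere, so the regexes below are assumptions that merely reproduce that behaviour:

import re

# Hypothetical stand-ins for the patterns used above: prefer a year given in
# square brackets (e.g. "1979 [1950]"), otherwise take the first 4-digit year.
PREF_YEAR_PATTERN = re.compile(r'\[(?P<year>(1|2)[0-9]{3})\]')
YEAR_PATTERN = re.compile(r'(?P<year>(1|2)[0-9]{3})')

def year_int(year_field):
    for pattern in (PREF_YEAR_PATTERN, YEAR_PATTERN):
        match = pattern.search(year_field or '')
        if match:
            return int(match.group('year'))
    return None

assert year_int('1979 [1950]') == 1950
assert year_int('ca. 1983') == 1983
assert year_int('n.d.') is None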
def main(args): data = Data() editors = OrderedDict() editors['Susanne Maria Michaelis'] = None editors['Philippe Maurer'] = None editors['Martin Haspelmath'] = None editors['Magnus Huber'] = None for row in read(args, 'People'): name = row['First name'] + ' ' if row['First name'] else '' name += row['Last name'] kw = dict( name=name, id=slug('%(Last name)s%(First name)s' % row), url=row['Contact Website'].split()[0] if row['Contact Website'] else None, address=row['Comments on database'], ) contrib = data.add(common.Contributor, row['Author ID'], **kw) if kw['name'] in editors: editors[kw['name']] = contrib DBSession.flush() dataset = common.Dataset( id='apics', name='APiCS Online', description='Atlas of Pidgin and Creole Language Structures Online', domain='apics-online.info', published=date(2013, 11, 4), license='http://creativecommons.org/licenses/by/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 3.0 Unported License' }) DBSession.add(dataset) for i, editor in enumerate(editors.values()): common.Editor(dataset=dataset, contributor=editor, ord=i + 1) colors = dict( (row['ID'], row['RGB_code']) for row in read(args, 'Colours')) abbrs = {} for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for id_, name in { 'C**T': 'clitic', 'IMPF': 'imperfect', 'INTERM': 'intermediate', 'NCOMPL': 'noncompletive', 'NONFUT': 'nonfuture', 'NPROX': 'nonproximal', 'NSG': 'nonsingular', 'PP': 'past participle', 'PROP': 'proprietive', 'TMA': 'tense-mood-aspect', }.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for row in reader(args.data_file('non-lgr-gloss-abbrs.csv'), delimiter=',', namedtuples=True): for match in GLOSS_ABBR_PATTERN.finditer(row.standard): if match.group('abbr') not in abbrs: abbrs[match.group('abbr')] = 1 DBSession.add( common.GlossAbbreviation(id=match.group('abbr'), name=row.meaning)) non_bibs = {} for row in read(args, 'References', 'Reference_ID'): if row['Reference_type'] == 'Non-bib': non_bibs[row['Reference_ID']] = row['Reference_name'] continue if isinstance(row['Year'], int): year_int = row['Year'] year = str(row['Year']) elif row['Year']: year_int = None for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']): year_int = int(m.group('year')) break year = row['Year'] else: year, year_int = None, None title = row['Article_title'] or row['Book_title'] attrs = {} jsondata = {} for attr, field in { 'Additional_information': 'note', 'Article_title': 'title', 'Book_title': 'booktitle', 'City': 'address', 'Editors': 'editor', 'Full_reference': None, 'Issue': None, 'Journal': 'journal', 'Language_codes': None, 'LaTeX_cite_key': None, 'Pages': 'pages', 'Publisher': 'publisher', 'Reference_type': 'type', 'School': 'school', 'Series_title': 'series', 'URL': 'url', 'Volume': 'volume', }.items(): value = row.get(attr) if not isinstance(value, int): value = (value or '').strip() if attr == 'Issue' and value: try: value = str(int(value)) except ValueError: pass if value: if field: attrs[field] = value else: jsondata[attr] = value p = data.add(common.Source, row['Reference_ID'], id=str(row['Reference_ID']), name=row['Reference_name'], description=title, author=row['Authors'], year=year, year_int=year_int, bibtex_type=getattr(EntryType, row['BibTeX_type'] or 'misc'), jsondata=jsondata, **attrs) if p.bibtex_type.value == 'misc' and not p.description: p.description = p.note DBSession.flush() DBSession.flush() infobox = 
jsonload(args.data_file('infobox.json')) glottocodes = jsonload(args.data_file('glottocodes.json')) for row in read(args, 'Languages', 'Order_number'): lon, lat = [ float(c.strip()) for c in row['map_coordinates'].split(',') ] kw = dict( name=row['Language_name'], id=str(row['Order_number']), latitude=lat, longitude=lon, region=row['Category_region'], ) lect = data.add(models.Lect, row['Language_ID'], **kw) DBSession.flush() for i, item in enumerate(infobox[lect.id]): DBSession.add( common.Language_data(object_pk=lect.pk, ord=i, key=item[0], value=item[1])) if row["Languages_contribution_documentation::Lect_description_checked_status"] \ != "Checked": print 'unchecked! ---', row['Language_name'] desc = row.get( 'Languages_contribution_documentation::Lect description', '') markup_desc = normalize_markup(row[ 'Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description'] ) c = data.add( models.ApicsContribution, row['Language_ID'], id=str(row['Order_number']), name=row['Language_name'], description=desc, markup_description=markup_desc, survey_reference=data['Source'][row['Survey_reference_ID']], language=lect) for ext, label, mtype in [ ('pdf', 'Glossed text', 'application/pdf'), ('mp3', 'Glossed text audio', 'audio/mpeg'), ]: fid = '%s-gt.%s' % (c.id, ext) if args.data_file('files', 'contribution', c.id, fid).exists(): common.Contribution_files(object=c, id=fid, name=label, mime_type=mtype) else: print label, 'missing for:', row['Language_name'] # # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE # iso = None if row['ISO_code'] and len(row['ISO_code']) == 3: iso = row['ISO_code'].lower() if 'iso:%s' % row['ISO_code'] not in data['Identifier']: data.add(common.Identifier, 'iso:%s' % row['ISO_code'], id=row['ISO_code'].lower(), name=row['ISO_code'].lower(), type=common.IdentifierType.iso.value) DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier']['iso:%s' % row['ISO_code']])) if lect.id in glottocodes: identifier = data.add(common.Identifier, 'gc:%s' % glottocodes[lect.id], id=glottocodes[lect.id], name=glottocodes[lect.id], type=common.IdentifierType.glottolog.value) DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=identifier)) if row['Language_name_ethnologue']: if row['Language_name_ethnologue'] not in data['Identifier']: data.add(common.Identifier, row['Language_name_ethnologue'], id=iso or 'ethnologue:%s' % row['Language_name_ethnologue'], name=row['Language_name_ethnologue'], type='ethnologue') DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier'][ row['Language_name_ethnologue']])) example_count = {} for row in read(args, 'Examples', 'Order_number'): assert row['Language_ID'] lang = data['Lect'][row['Language_ID']] id_ = '%(Language_ID)s-%(Example_number)s' % row atext, gloss = igt(row) example_count[row['Language_ID']] = max( [example_count.get(row['Language_ID'], 1), row['Example_number']]) p = add_sentence( args, data, id_, id='%s-%s' % (lang.id, row['Example_number']), name=row['Text'] or row['Analyzed_text'], description=row['Translation'], type=row['Type'].strip().lower() if row['Type'] else None, comment=row['Comments'], gloss=gloss, analyzed=atext, markup_text=normalize_markup(row['z_calc_Text_CSS']), markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']), markup_comment=normalize_markup(row['z_calc_Comments_CSS']), 
markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']), original_script=row['Original_script'], jsondata={ 'sort': row['Order_number'], 'alt_translation': (row['Translation_other'] or '').strip() or None }, language=lang) if row['Reference_ID']: if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add( common.SentenceReference( sentence=p, source=source, key=source.id, description=row['Reference_pages'])) else: p.source = non_bibs[row['Reference_ID']] DBSession.flush() for row in read(args, 'Language_references'): if row['Reference_ID'] not in data['Source']: assert row['Reference_ID'] in non_bibs continue assert row['Language_ID'] in data['ApicsContribution'] source = data['Source'][row['Reference_ID']] DBSession.add( common.ContributionReference( contribution=data['ApicsContribution'][row['Language_ID']], source=source, description=row['Pages'], key=source.id)) # # global counter for features - across feature types # feature_count = 0 for row in read(args, 'Features', 'Feature_number'): id_ = str(row['Feature_number']) if int(id_) > feature_count: feature_count = int(id_) wals_id = None desc = row['Feature_annotation_publication'] if row['WALS_match'] == 'Total': if isinstance(row['WALS_No.'], int): wals_id = row['WALS_No.'] else: wals_id = int(row['WALS_No.'].split('.')[0].strip()) p = data.add(models.Feature, row['Feature_code'], name=row['Feature_name'], id=id_, description=desc, markup_description=normalize_markup( row['z_calc_Feature_annotation_publication_CSS']), feature_type='primary', multivalued=row['Value_relation_type'] != 'Single', area=row['Feature_area'], wals_id=wals_id) names = {} for i in range(1, 10): if not row['Value%s_publication' % i] \ or not row['Value%s_publication' % i].strip(): continue name = row['Value%s_publication' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 de = data.add( common.DomainElement, '%s-%s' % (row['Feature_code'], i), id='%s-%s' % (id_, i), name=name, parameter=p, abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name, number=int(row['Value%s_value_number_for_publication' % i]), jsondata={'color': colors[row['Value_%s_colour_ID' % i]]}, ) assert de if row['Authors_FeatureArticles']: authors, _ = row['Authors_FeatureArticles'].split('and the APiCS') authors = authors.strip() if authors.endswith(','): authors = authors[:-1].strip() for i, name in enumerate(authors.split(',')): assert name.strip() in editors p._authors.append( models.FeatureAuthor(ord=i + 1, contributor=editors[name.strip()])) DBSession.flush() primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41} segment_to_primary = dict( zip(primary_to_segment.values(), primary_to_segment.keys())) number_map = {} names = {} for row in read(args, 'Segment_features', 'Order_number'): symbol = row['Segment_symbol'] if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate': symbol = 't\u0361s' truth = lambda s: s and s.strip().lower() == 'yes' name = '%s - %s' % (symbol, row['Segment_name']) if name in names: number_map[row['Segment_feature_number']] = names[name] continue number_map[ row['Segment_feature_number']] = row['Segment_feature_number'] names[name] = row['Segment_feature_number'] feature_count += 1 if row['Segment_feature_number'] in segment_to_primary: primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\ = str(feature_count) p = data.add(models.Feature, row['Segment_feature_number'], name=name, id=str(feature_count), feature_type='segment', area='Vowels' if 
truth(row['Vowel']) else ('Obstruent consonants' if truth(row['Obstruent']) else 'Sonorant consonants'), jsondata=dict( number=int(row['Segment_feature_number']), vowel=truth(row['Vowel']), consonant=truth(row['Consonant']), obstruent=truth(row['Obstruent']), core_list=truth(row['Core_list_segment']), symbol=symbol, )) for i, spec in SEGMENT_VALUES.items(): data.add(common.DomainElement, '%s-%s' % (row['Segment_feature_number'], spec[0]), id='%s-%s' % (p.id, i), name=spec[0], parameter=p, jsondata={'color': spec[1]}, number=i) print '--> remapped:', primary_to_segment DBSession.flush() for row in read(args, 'Sociolinguistic_features', 'Sociolinguistic_feature_number'): feature_count += 1 p = data.add(models.Feature, row['Sociolinguistic_feature_code'], name=row['Sociolinguistic_feature_name'], id='%s' % feature_count, description=row['Sociolinguistic_feature_annotation'], area='Sociolinguistic', feature_type='sociolinguistic') names = {} for i in range(1, 10): id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i) if row.get('Value%s' % i) and row['Value%s' % i].strip(): name = row['Value%s' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 else: continue kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i) data.add(common.DomainElement, id_, id='%s-%s' % (p.id, i), name=name, parameter=p, number=i, jsondata={ 'color': colors.get(row['Value%s_colour_ID' % i], colors.values()[i]) }) sd = {} for row in read(args, 'Segment_data'): if row['Segment_feature_number'] not in number_map: continue number = number_map[row['Segment_feature_number']] if not row['Presence_in_the_language']: continue lang = data['Lect'][row['Language_ID']] param = data['Feature'][number] id_ = '%s-%s' % (lang.id, param.id) if id_ in sd: assert row['c_Record_is_a_duplicate'] == 'Yes' continue sd[id_] = 1 valueset = data.add( common.ValueSet, id_, id=id_, parameter=param, language=lang, contribution=data['ApicsContribution'][row['Language_ID']], description=row['Comments'], markup_description=normalize_markup(row['z_calc_Comments_CSS']), ) v = data.add( common.Value, id_, id=id_, frequency=float(100), valueset=valueset, domainelement=data['DomainElement'][ '%s-%s' % (number, row['Presence_in_the_language'])], ) if row['Example_word'] and row['Example_word_gloss']: example_count[row['Language_ID']] += 1 p = add_sentence(args, data, '%s-p%s' % (lang.id, data['Feature'][number].id), id='%s-%s' % (lang.id, example_count[row['Language_ID']]), name=row['Example_word'], description=row['Example_word_gloss'], language=lang) DBSession.add(common.ValueSentence(value=v, sentence=p)) source = data['Source'].get(row['Refers_to_references_Reference_ID']) if source: DBSession.add( common.ValueSetReference(valueset=valueset, source=source, key=source.id)) elif row['Refers_to_references_Reference_ID'] in non_bibs: valueset.source = non_bibs[ row['Refers_to_references_Reference_ID']] lects = defaultdict(lambda: 1) lect_map = {} records = {} false_values = {} no_values = {} wals_value_number = {} for row in read(args, 'wals'): if row['z_calc_WALS_value_number']: wals_value_number[ row['Data_record_id']] = row['z_calc_WALS_value_number'] def prefix(attr, _prefix): if _prefix: return '%s_%s' % (_prefix, attr) return attr.capitalize() for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]: num_values = 10 for row in read(args, prefix('data', _prefix)): if not row[prefix('feature_code', _prefix)]: print('no associated feature for', prefix('data', _prefix), row[prefix('data_record_id', _prefix)]) continue lid 
= row['Language_ID'] lect_attr = row.get('Lect_attribute', 'my default lect').lower() if lect_attr != 'my default lect': if (row['Language_ID'], row['Lect_attribute']) in lect_map: lid = lect_map[(row['Language_ID'], row['Lect_attribute'])] else: lang = data['Lect'][row['Language_ID']] c = lects[row['Language_ID']] lid = '%s-%s' % (row['Language_ID'], c) kw = dict( name='%s (%s)' % (lang.name, row['Lect_attribute']), id='%s' % (1000 + 10 * int(lang.id) + c), latitude=lang.latitude, longitude=lang.longitude, description=row['Lect_attribute'], language=lang, ) data.add(models.Lect, lid, **kw) lects[row['Language_ID']] += 1 lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid id_ = abbr + str(row[prefix('data_record_id', _prefix)]) assert id_ not in records records[id_] = 1 assert row[prefix('feature_code', _prefix)] in data['Feature'] language = data['Lect'][lid] parameter = data['Feature'][row[prefix('feature_code', _prefix)]] valueset = common.ValueSet( id='%s-%s' % (language.id, parameter.id), description=row['Comments_on_value_assignment'], markup_description=normalize_markup( row.get('z_calc_Comments_on_value_assignment_CSS')), ) values_found = {} for i in range(1, num_values): if not row['Value%s_true_false' % i]: continue if row['Value%s_true_false' % i].strip().lower() != 'true': assert row['Value%s_true_false' % i].strip().lower() == 'false' false_values[row[prefix('data_record_id', _prefix)]] = 1 continue iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i) if iid not in data['DomainElement']: print(iid, row[prefix('data_record_id', _prefix)], '--> no domainelement!') continue values_found['%s-%s' % (id_, i)] = dict( id='%s-%s' % (valueset.id, i), domainelement=data['DomainElement']['%s-%s' % (row[prefix( 'feature_code', _prefix)], i)], confidence=row['Value%s_confidence' % i], frequency=float(row['c_V%s_frequency_normalised' % i]) if _prefix == '' else 100) if values_found: if row[prefix('data_record_id', _prefix)] in wals_value_number: valueset.jsondata = { 'wals_value_number': wals_value_number.pop(row[prefix( 'data_record_id', _prefix)]) } valueset.parameter = parameter valueset.language = language valueset.contribution = data['ApicsContribution'][ row['Language_ID']] valueset = data.add(common.ValueSet, id_, _obj=valueset) for i, item in enumerate(values_found.items()): if i > 0 and not parameter.multivalued: print 'multiple values for single-valued parameter: %s' % id_ break id_, kw = item kw['valueset'] = valueset value = data.add(common.Value, id_, **kw) # # store references to additional data for segments which should be reused # for corresponding primary features! 
# if int(parameter.id) in primary_to_segment: assert len(values_found) == 1 seg_id = '%s-%s' % (language.id, primary_to_segment[int( parameter.id)]) seg_valueset = data['ValueSet'][seg_id] seg_value = data['Value'][seg_id] if not valueset.description and seg_valueset.description: valueset.description = seg_valueset.description for s in seg_value.sentence_assocs: DBSession.add( common.ValueSentence(value=value, sentence=s.sentence)) for r in seg_valueset.references: DBSession.add( common.ValueSetReference(valueset=valueset, source=r.source, key=r.key)) if not valueset.source and seg_valueset.source: valueset.source = seg_valueset.source DBSession.flush() else: no_values[id_] = 1 DBSession.flush() for prefix, abbr, num_values in [ ('D', '', 10), ('Sociolinguistic_d', 'sl', 7), ]: for row in read(args, prefix + 'ata_references'): assert row['Reference_ID'] in data['Source'] \ or row['Reference_ID'] in non_bibs try: vs = data['ValueSet'][abbr + str(row[prefix + 'ata_record_id'])] if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add( common.ValueSetReference( valueset=vs, source=source, key=source.id, description=row['Pages'], )) else: if vs.source: vs.source += '; ' + non_bibs[row['Reference_ID']] else: vs.source = non_bibs[row['Reference_ID']] except KeyError: continue DBSession.flush() missing = 0 for row in read(args, 'Value_examples'): try: DBSession.add( common.ValueSentence( value=data['Value']['%(Data_record_id)s-%(Value_number)s' % row], sentence=data['Sentence'][ '%(Language_ID)s-%(Example_number)s' % row], description=row['Notes'], )) except KeyError: missing += 1 print('%s Value_examples are missing data' % missing) print('%s data sets with false values' % len(false_values)) print('%s data sets without values' % len(no_values)) for k, v in wals_value_number.items(): print 'unclaimed wals value number:', k, v for i, row in enumerate(read(args, 'Contributors')): kw = dict(contribution=data['ApicsContribution'][row['Language ID']], contributor=data['Contributor'][row['Author ID']]) if row['Order_of_appearance']: kw['ord'] = int(float(row['Order_of_appearance'])) data.add(common.ContributionContributor, i, **kw) DBSession.flush()
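The value-import loop above handles the primary and the sociolinguistic data tables in one pass by rewriting column names with prefix(); the helper is small enough to show in isolation, with a few usage checks:

def prefix(attr, _prefix):
    # 'feature_code' -> 'Feature_code' for the primary data table,
    # 'Sociolinguistic_feature_code' for the sociolinguistic one
    if _prefix:
        return '%s_%s' % (_prefix, attr)
    return attr.capitalize()

assert prefix('feature_code', '') == 'Feature_code'
assert prefix('feature_code', 'Sociolinguistic') == 'Sociolinguistic_feature_code'
assert prefix('data_record_id', 'Sociolinguistic') == 'Sociolinguistic_data_record_id'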
def refactor(self, soup, md):
    d = BeautifulSoup('<body></body>')
    body = d.find('body')
    linked = 0
    notlinked = 0
    multiple = 0
    for p in self._chunks(soup):
        if not isinstance(p, list):
            p = [p]
        for pp in p:
            if pp.is_header:
                continue
            elif pp.is_refs:
                md['refs'] = [self.get_ref(line[0]) for line in pp.lines]
            else:
                ex = None
                if pp.is_example:
                    container = d.new_tag(
                        'blockquote',
                        **{
                            'class': 'example',
                            'style': 'font-size:100%;padding-left:1.8em;margin-left:0.3em'
                        })
                    #body.append(Tag(name='hr'))
                else:
                    container = body
                for e, line, t in pp.lines:
                    body.append(e)
                    if pp.is_example:
                        if re.match('\([0-9]+\)', line):
                            e.attrs['style'] = 'text-indent:-2em'
                        equo = "’".decode('utf8')
                        if line.startswith("‘".decode('utf8')) and equo in line:
                            line = equo.join(line[1:].split(equo)[:-1]).strip()
                        examples = self.examples.get(slug(line))
                        if examples:
                            if len(examples) > 1:
                                #print '~~~', line
                                multiple += 1
                            else:
                                ex = examples.values()[0]
                                #print '+++'
                                linked += 1
                        else:
                            #print '---', line
                            notlinked += 1
                    container.append(e)
                if pp.is_example:
                    if ex:
                        container.attrs['id'] = 'ex-' + ex
                        container.append(
                            new_tag(
                                d, 'small',
                                new_tag(d, 'a', 'See example ' + ex,
                                        href='/sentences/' + ex)))
                    body.append(container)
    #print 'examples:', linked, 'linked,', notlinked, 'not linked,', multiple, 'multiple choices'
    for e in body.find_all('font'):
        e.unwrap()
    return d
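refactor() links blockquoted examples to database sentences by slugging the translation between the typographic quotes. A reduced sketch of that lookup, assuming self.examples is a dict mapping slug(translation) to a dict of candidate sentence ids:

from clld.util import slug

def link_example(line, examples):
    # `examples` stands in for self.examples above
    lquo, equo = u'\u2018', u'\u2019'   # typographic single quotes
    if line.startswith(lquo) and equo in line:
        # keep only the text between the opening quote and the last closing quote
        line = equo.join(line[1:].split(equo)[:-1]).strip()
    candidates = examples.get(slug(line))
    if candidates and len(candidates) == 1:
        return list(candidates.values())[0]   # the sentence id to link to
    return None                               # ambiguous or not found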
def repl(match):
    if end_tag.match(match.string[match.end():]):
        # if the next tag is the end tag of a link, then don't link again!
        return match.string[match.start():match.end()]
    return '<a class="ref-link" style="cursor: pointer;" data-content="%s">%s</a>' \
        % (slug(match.group('key').replace('&amp;', '&')), match.group('key'))
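repl() is written as a callback for re.sub. A hypothetical driver shows how it could be wired up; the key and end-tag patterns below are assumptions, and it presumes repl() above and slug() live in the same module:

import re

# Hypothetical stand-ins for the patterns defined elsewhere in the module.
end_tag = re.compile(r'\s*</a>')
KEY_PATTERN = re.compile(r'(?P<key>[A-Z][a-z]+ [0-9]{4})')

html = 'as argued in Meier 2001 and <a href="#x">Schmidt 1999</a>'
print(KEY_PATTERN.sub(repl, html))
# only "Meier 2001" gets wrapped; "Schmidt 1999" is already followed by </a>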
    'condition': lambda l: l.iso_code,
    'rdf': "rdfs:seeAlso",
    'logo': "odin.png"
}, {
    'name': 'WALS',
    'href': lambda l: "http://wals.info/languoid/lect/wals_code_" + l.get_identifier('WALS'),
    'condition': lambda l: l.get_identifier('WALS'),
    'rdf': "owl:sameAs",
    'logo': "wals.png"
}, {
    'name': 'WALSgenus',
    'href': lambda l: "http://wals.info/languoid/genus/" + slug(l.get_identifier('WALSgenus')),
    'condition': lambda l: l.get_identifier('WALSgenus'),
    'rdf': "owl:sameAs",
    'logo': "wals.png"
}, {
    'name': 'WALSfamily',
    'href': lambda l: "http://wals.info/languoid/family/" + slug(l.get_identifier('WALSfamily')),
    'condition': lambda l: l.get_identifier('WALSfamily'),
    'rdf': "owl:sameAs",
    'logo': "wals.png"
}, {
    'name': 'Endangered Languages',
    'href': lambda l: "http://www.endangeredlanguages.com/lang/"
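These dicts are entries in a list of cross-link specifications (name, href builder, condition, RDF relation, logo). A hypothetical consumer that yields the applicable links for one languoid object might look like this:

def external_links(l, specs):
    # `specs` is a list of dicts of the shape shown above
    for spec in specs:
        if spec['condition'](l):
            yield spec['name'], spec['href'](l), spec.get('logo')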
def main(args): # determine if we run on a machine where other databases are available for lookup # locally: data = Data() genera = get_genera(data) if astroman else {} glottocodes, lnames, geocoords = {}, {}, {} if astroman: for k, v in glottocodes_by_isocode( 'postgresql://robert@/glottolog3', cols=['id', 'name', 'latitude', 'longitude']).items(): glottocodes[k] = v[0] lnames[k] = v[1] geocoords[k] = (v[2], v[3]) refs = defaultdict(list) for row in get_rows(args, 'BibtexKey'): if row[1] == 'NO SOURCE GIVEN': refs[row[0]] = [] else: refs[row[0]].append(row[1]) add_sources(args, data) dataset = data.add( common.Dataset, 'phoible', id='phoible', name='PHOIBLE Online', description='PHOIBLE Online', publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="http://www.eva.mpg.de", domain='phoible.org', license='http://creativecommons.org/licenses/by-sa/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png', 'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License' }) for i, spec in enumerate([ ('moran', "Steven Moran"), ('mccloy', "Daniel McCloy"), ('wright', "Richard Wright"), ]): DBSession.add( common.Editor(dataset=dataset, ord=i + 1, contributor=common.Contributor(id=spec[0], name=spec[1]))) squibs = defaultdict(list) for row in get_rows(args, 'Squib'): squibs[row[0]].append(row[1]) source_urls = dict(get_rows(args, 'URL')) ia_urls = dict(get_rows(args, 'InternetArchive')) aggregated = list( reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True)) inventory_names = {} for key, items in groupby(sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)), key=lambda t: (t.LanguageCode, t.Source)): items = list(items) lname = lnames.get(key[0]) if not lname: lname = items[0].LanguageName lnames[key[0]] = lname if len(items) == 1: inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1]) else: for i, item in enumerate(items): inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1]) family_map = { ("Arawakan", "arwk"): "Arawakan", ("Trans-New Guinea", "trng"): "Trans-New Guinea", ("Moklen", "anes"): "Austronesian", ("Oko", "ncon"): "Niger-Congo", ("Muniche", "saso"): "Muniche", ("Tinigua", "saso"): "Tinigua", ("Vilela", "luvi"): "Vilela", ("Ofayé", "macg"): "Kamakanan", ("Purian", "macg"): "PurianPrint", ("Mixed language", "saml"): "Mixed language", ("Tupian", "tupi"): "Tupian", ("Yuwana", "saun"): "YuwanaPrint", } family_code_map = {k[1]: v for k, v in family_map.items()} for row in aggregated: lang = data['Variety'].get(row.LanguageCode) if not lang: if row.LanguageFamilyGenus == 'UNCLASSIFIED': genus = None else: genus_id = slug(strip_quotes(row.LanguageFamilyGenus)) genus = genera.get(genus_id) if not genus: genus = genera.get(row.LanguageCode) if not genus: #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot) family = family_map.get( (row.LanguageFamilyGenus, row.LanguageFamilyRoot)) genus = genera[genus_id] = data.add( models.Genus, genus_id, id=genus_id, name=row.LanguageFamilyGenus, description=family or row.LanguageFamilyRoot, active=False, root=row.LanguageFamilyRoot) if not genus.root: genus.root = row.LanguageFamilyRoot if genus.description in family_code_map: genus.description = family_code_map[genus.description] if row.LanguageCode in geocoords: coords = geocoords[row.LanguageCode] elif row.Latitude != 'NULL' and row.Longitude != 'NULL': coords = (float(row.Latitude), float(row.Longitude)) lang = 
data.add(models.Variety, row.LanguageCode, id=row.LanguageCode, name=lnames[row.LanguageCode], genus=genus, country=strip_quotes(row.Country), area=strip_quotes(row.Area), latitude=coords[0], longitude=coords[1], jsondata=dict(inventory_id=row.InventoryID)) add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes) contributor = data['Contributor'].get(row.Source) if not contributor: contributor = data.add(common.Contributor, row.Source, id=row.Source, name=SOURCES[row.Source][0], description=SOURCES[row.Source][2]) for ref in SOURCES[row.Source][1]: DBSession.add( models.ContributorReference(source=data['Source'][ref], contributor=contributor)) contrib = data.add(models.Inventory, row.InventoryID, id=row.InventoryID, language=lang, source=row.Source, source_url=source_urls.get(row.InventoryID), internetarchive_url=ia_urls.get(row.InventoryID), name=inventory_names[row.InventoryID], description=row.LanguageName) DBSession.add( common.ContributionContributor(contribution=contrib, contributor=contributor)) for j, squib in enumerate(squibs.get(row.InventoryID, [])): f = common.Contribution_files(object=contrib, id='squib-%s-%s.pdf' % (contrib.id, j + 1), name='Phonological squib', description=squib, mime_type='application/pdf') assert f # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read()) DBSession.flush() unknown_refs = {} for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True): inventory = data['Inventory'][row.InventoryID] segment = data['Segment'].get(row.Phoneme) if not segment: unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme] description = ' - '.join([t[1] for t in unicode_desc]) segment = data.add( models.Segment, row.Phoneme, id=b16encode(md5(description).digest()), name=row.Phoneme, description=description, equivalence_class=''.join([ t[0] for t in unicode_desc if t[1].split()[0] not in ['COMBINING', 'MODIFIER'] ]), segment_class=row.Class, combined_class=row.CombinedClass) DBSession.flush() vs = common.ValueSet(id=row.PhonemeID, contribution=inventory, language=inventory.language, parameter=segment) for ref in refs.get(row.InventoryID, []): if ref not in data['Source']: if ref not in unknown_refs: print('-------', ref) unknown_refs[ref] = 1 continue DBSession.add( common.ValueSetReference(source=data['Source'][ref], valueset=vs)) DBSession.add( common.Value( id=row.PhonemeID, name='%s %s' % (row.Phoneme, data['Inventory'][row.InventoryID].name), valueset=vs)) DBSession.flush() for inventory_id in refs: for ref in refs[inventory_id]: if ref not in data['Source']: continue data.add(common.ContributionReference, '%s-%s' % (inventory_id, ref), source=data['Source'][ref], contribution=data['Inventory'][inventory_id]) for i, row in enumerate( reader(args.data_file('phoible-segments-features.tsv'))): if i == 0: features = list(map(feature_name, row)) continue if row[0] not in data['Segment']: # print('skipping feature vector:', row) continue for j, value in enumerate(row): if j and value != '0': DBSession.add( common.Parameter_data( key=features[j], value=value, ord=j, object_pk=data['Segment'][row[0]].pk)) DBSession.flush()
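The segment loading above names each phoneme by the Unicode names of its codepoints and derives an id and an equivalence class from them. A reduced stand-alone sketch of that step, with byte handling adjusted so it also runs on Python 3:

# -*- coding: utf-8 -*-
import unicodedata
from base64 import b16encode
from hashlib import md5

def describe_segment(phoneme):
    pairs = [(c, unicodedata.name(c)) for c in phoneme]
    description = ' - '.join(name for _, name in pairs)
    return {
        'id': b16encode(md5(description.encode('utf8')).digest()).decode('ascii'),
        'description': description,
        # base characters only: drop combining marks and modifier letters
        'equivalence_class': ''.join(
            c for c, name in pairs
            if name.split()[0] not in ('COMBINING', 'MODIFIER')),
    }

print(describe_segment(u't\u0361s'))  # the affricate symbol used above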
def test_slug():
    from clld.util import slug
    assert slug('A B. \xe4C') == 'abac'
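Given how slug() is used elsewhere in this file (the words() helper in gbs_func), one more assertion can document the remove_whitespace flag; the expected value is an assumption and should be checked against clld.util:

def test_slug_remove_whitespace():
    from clld.util import slug
    # assumed behaviour, mirroring the words() helper in gbs_func above:
    # keeping whitespace lets callers split the slug into word tokens
    assert slug('A B. \xe4C', remove_whitespace=False).split() == ['a', 'b', 'ac']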
def main(bib, mode): # pragma: no cover count = 0 skipped = 0 with transaction.manager: provider_map = get_map(Provider) macroarea_map = get_map(Macroarea) doctype_map = get_map(Doctype) known_ids = set(r[0] for r in DBSession.query(Ref.pk)) languoid_map = {} for l in DBSession.query(Languoid): if l.hid: languoid_map[l.hid] = l languoid_map[l.id] = l for i, rec in enumerate(bib): if len(rec.keys()) < 6: skipped += 1 #print '---> skip', rec.id #print rec continue changed = False assert rec.get('glottolog_ref_id') id_ = int(rec.get('glottolog_ref_id')) if mode != 'update' and id_ in known_ids: continue ref = DBSession.query(Source).get(id_) update = True if ref else False kw = { 'pk': id_, 'bibtex_type': getattr(EntryType, rec.genre), 'id': str(id_), 'jsondata': { 'bibtexkey': rec.id }, } for source, target in FIELD_MAP.items(): value = rec.get(source) if value: value = unescape(value) if target: kw[target] = CONVERTER.get(source, lambda x: x)(value) else: kw['jsondata'][source] = value # try to extract numeric year, startpage, endpage, numberofpages, ... if rec.get('numberofpages'): try: kw['pages_int'] = int(rec.get('numberofpages').strip()) except ValueError: pass if kw.get('year'): match = YEAR_PATTERN.search(kw.get('year')) if match: kw['year_int'] = int(match.group('year')) if kw.get('publisher'): p = kw.get('publisher') if ':' in p: address, publisher = [ s.strip() for s in kw['publisher'].split(':', 1) ] if not 'address' in kw or kw['address'] == address: kw['address'], kw['publisher'] = address, publisher if kw.get('pages'): pages = kw.get('pages') match = ROMANPAGESPATTERNra.search(pages) if not match: match = ROMANPAGESPATTERNar.search(pages) if match: if 'pages_int' not in kw: kw['pages_int'] = roman_to_int(match.group('roman')) \ + int(match.group('arabic')) else: start = None number = None match = None for match in PAGES_PATTERN.finditer(pages): if start is None: start = int(match.group('start')) number = (number or 0) \ + (int(match.group('end')) - int(match.group('start')) + 1) if match: kw['endpage_int'] = int(match.group('end')) kw['startpage_int'] = start kw.setdefault('pages_int', number) else: try: kw['startpage_int'] = int(pages) except ValueError: pass if update: for k in kw.keys(): if k == 'pk': continue #if k == 'title': # v = ref.title or ref.description #else: if 1: v = getattr(ref, k) if kw[k] != v: # # TODO! # setattr(ref, k, kw[k]) #if k not in ['jsondata', 'publisher']: # print k, ref.pk # print kw[k] # print v # print '--------------' changed = True if ref.title: ref.description = ref.title else: changed = True ref = Ref(**kw) def append(attr, obj): if obj and obj not in attr: changed = True # # TODO! 
# attr.append(obj) for name in set( filter(None, [ s.strip() for s in kw['jsondata'].get( 'macro_area', '').split(',') ])): append(ref.macroareas, macroarea_map[name]) for name in set( filter(None, [ s.strip() for s in kw['jsondata'].get('src', '').split(',') ])): append(ref.providers, provider_map[slug(name)]) for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', '')): append(ref.doctypes, doctype_map[m.group('name')]) if len(kw['jsondata'].get('lgcode', '')) == 3: kw['jsondata']['lgcode'] = '[%s]' % kw['jsondata']['lgcode'] for m in CODE_PATTERN.finditer(kw['jsondata'].get('lgcode', '')): for code in set(m.group('code').split(',')): if code not in languoid_map: if code not in ['NOCODE_Payagua', 'emx']: print '--> unknown code:', code.encode('utf8') else: append(ref.languages, languoid_map[code]) for glottocode in filter( None, kw['jsondata'].get('alnumcodes', '').split(';')): if glottocode not in languoid_map: print '--> unknown glottocode:', glottocode.encode('utf8') else: append(ref.languages, languoid_map[glottocode]) if not update: #pass # # TODO! # DBSession.add(ref) if i % 100 == 0: print i, 'records done' if changed: count += 1 print count, 'records updated or imported' print skipped, 'records skipped because of lack of information'
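The pages handling above sums page ranges and copes with 'xii+350'-style front matter via roman_to_int. A stand-alone sketch of that logic; the names mirror the ones used above, but the regexes and the roman-numeral table here are assumptions:

import re

PAGES_PATTERN = re.compile(r'(?P<start>[0-9]+)\s*-+\s*(?P<end>[0-9]+)')
ROMAN = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}

def roman_to_int(s):
    s = s.lower()
    total = 0
    for i, c in enumerate(s):
        v = ROMAN[c]
        # subtract when a smaller numeral precedes a bigger one (e.g. 'iv')
        total += -v if i + 1 < len(s) and ROMAN[s[i + 1]] > v else v
    return total

def page_numbers(pages):
    # returns (startpage, endpage, total number of pages) from a ranges string
    start = end = number = None
    for match in PAGES_PATTERN.finditer(pages):
        if start is None:
            start = int(match.group('start'))
        end = int(match.group('end'))
        number = (number or 0) + (end - int(match.group('start')) + 1)
    return start, end, number

assert page_numbers('1-10, 21-30') == (1, 30, 20)
assert roman_to_int('xii') == 12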