def main():
    add_args = [
        (("command",), dict(help="stats|process")),
        (("dict",), dict(help="dictionary ID")),
    ]
    args = parsed_args(*add_args)

    if args.command == "stats":
        submission = Submission(args.dict)
        submission.dict.stats()
        if 0:  # args.dict == 'yakkha':
            same, d1, d2 = 0, 0, 0
            for e in submission.dict:
                for w in e.get_words():
                    ne = len(w.meanings)
                    nn = len(FIELD_SPLITTER_PATTERN.split(w.data.get("gn", [""])[0]))
                    if ne < nn:
                        d1 += 1
                    elif nn < ne:
                        d2 += 1
                    else:
                        same += 1
            print(same, d1, d2)
        if args.dict == "palula":
            for e in submission.dict:
                if len(e.getall("ps")) > len(e.getall("se")) + len(e.getall("lx")):
                    print(e.get("lx"))
    elif args.command == "process":
        submission = Submission(args.dict)
        submission.process()
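# The snippets in this collection all share one calling convention: each
# positional argument to parsed_args is a (flags, kwargs) pair which is
# assumed to be forwarded to argparse's ArgumentParser.add_argument, on top
# of a built-in positional config-file argument. The self-contained sketch
# below (sketch_parsed_args is a hypothetical stand-in, not clld's actual
# implementation) only illustrates that convention; the real parsed_args
# additionally sets up attributes such as args.env, args.log and
# args.data_file from the .ini file.
import argparse


def sketch_parsed_args(*arg_specs, args=None):
    parser = argparse.ArgumentParser()
    # clld scripts take the app's .ini file as their first positional argument.
    parser.add_argument("config_uri", help="path to the app's .ini file")
    for flags, kwargs in arg_specs:
        parser.add_argument(*flags, **kwargs)
    return parser.parse_args(args)


if __name__ == '__main__':
    # Mirrors calls like parsed_args((("--version",), dict(default=""))):
    ns = sketch_parsed_args(
        (("--version",), dict(default="")),
        args=["development.ini", "--version", "2.7"])
    print(ns.config_uri, ns.version)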
def llod():  # pragma: no cover
    """
    Create an RDF dataset for an app and register it with datahub.io.
    """
    args = parsed_args(bootstrap=True, description=llod.__doc__)
    llod_func(args)
    register(args)
def create_downloads(**kw):  # pragma: no cover
    """
    Create all registered downloads (locally).
    """
    args = parsed_args(bootstrap=True, description=create_downloads.__doc__)
    for name, download in args.env['registry'].getUtilitiesFor(IDownload):
        args.log.info('creating download %s' % name)
        download.create(args.env['request'])
def internetarchive(**kw):  # pragma: no cover
    """
    Add information about availability on Internet Archive to Source objects.
    """
    add_args = [(("command",), dict(help="download|verify|update"))]
    kw.setdefault('description', internetarchive.__doc__)
    args = parsed_args(*add_args, **kw)
    with transaction.manager:
        ia_func(args.command, args, kw.get('sources'))
def google_books(**kw):  # pragma: no cover
    add_args = [
        (("command",), dict(help="download|verify|update|cleanup")),
        (("--api-key",), dict(default=kw.get("key", os.environ.get("GBS_API_KEY")))),
    ]
    args = parsed_args(*add_args, **kw)
    if args.command == "download" and not args.api_key:
        raise argparse.ArgumentError(None, "no API key found for download")
    with transaction.manager:
        gbs_func(args.command, args, kw.get("sources"))
def main(): """Construct a new database from scratch.""" print(os.path.join( os.path.dirname(__file__), "lexirumah_for_create_database.ini")) args = parsed_args( args=[os.path.join( os.path.dirname(__file__), "lexirumah_for_create_database.ini")]) with transaction.manager: db_main() with transaction.manager: prime_cache(args)
def update_zenodo_metadata(**kw):  # pragma: no cover
    update_zenodo_metadata_func(parsed_args((('doi',), {}), bootstrap=True))
def test_parsed_args(testsdir):
    from clld.scripts.util import parsed_args

    parsed_args(args=[(testsdir / 'test.ini').as_posix()])
# -*- coding: utf-8 -*-
import transaction

from clld.scripts.util import parsed_args

from glottolog3.scripts.util import update_reflang


if __name__ == '__main__':  # pragma: no cover
    with transaction.manager:
        update_reflang(parsed_args((("--version",), dict(default=""))))
    print count, 'records updated or imported'
    print skipped, 'records skipped because of lack of information'

    DBSession.execute(
        "update source set description = title "
        "where description is null and title is not null;")
    DBSession.execute(
        "update source set description = booktitle "
        "where description is null and booktitle is not null;")

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s",
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s",
                (_end, pk))
    return changes


if __name__ == '__main__':
    args = parsed_args(
        (('--mode',), dict(default='insert')),
        (("--version",), dict(default="2.0")),
    )
    res = main(args)
    with open(args.data_file(args.version, 'refs.json'), 'w') as fp:
        json.dump(res, fp)
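# For orientation only: a hypothetical sketch of what a helper like the
# compute_pages call above might return for a BibTeX pages field - a
# (start, end, count) triple - assuming it parses ranges like "11-25" and
# falls back to a negative count sentinel (which would explain both the
# "pages_int < 0" query and the "_number > 0" guard). This is not
# glottolog3's implementation.
import re


def sketch_compute_pages(pages):
    m = re.match(r'\s*(\d+)\s*-+\s*(\d+)\s*$', pages or '')
    if m:
        start, end = int(m.group(1)), int(m.group(2))
        return start, end, end - start + 1
    m = re.match(r'\s*(\d+)\s*$', pages or '')
    if m:
        page = int(m.group(1))
        return page, page, 1
    return None, None, -1


assert sketch_compute_pages('11-25') == (11, 25, 15)
assert sketch_compute_pages('7') == (7, 7, 1)
assert sketch_compute_pages('vii') == (None, None, -1)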
from clld.scripts.util import parsed_args

from phoible.scripts.util import add_wikipedia_urls


if __name__ == '__main__':
    add_wikipedia_urls(parsed_args())
def unfreeze():  # pragma: no cover
    """
    Import an app's data from a frozen dump into an sqlite db.
    """
    unfreeze_func(parsed_args(description=unfreeze.__doc__))
from clld.scripts.util import parsed_args
from clld.scripts.internetarchive import ia_func


if __name__ == '__main__':
    ia_func('update', parsed_args(bootstrap=True))
from clld.scripts.util import parsed_args
from clld.scripts.llod import llod_func, register


if __name__ == '__main__':
    args = parsed_args(bootstrap=True)
    llod_func(args)
    # register(args)
            ('lect_descriptions', 'Lect_descriptions'),
            ('colours', 'Colours'),
            ('contributors', 'Contributors'),
            ("data (editors' layout)", 'Data'),
            ("data (apics-wals)", 'wals'),
            ("data references", "Data_references"),
            ("editors", "Editors"),
            ("examples", "ExamplesB"),
            ("examples (editors' layout)", "Examples"),
            ("feature references", "Feature_references"),
            ("features (publication)", "Featuresp"),
            ("features (value names)", "Featuresv"),
            ("features", "Features"),
            ("language references", "Language_references"),
            ("languages (editors' layout)", "Languages"),
            ("people", "People"),
            ("references", "References"),
            ("segment data (editors' layout)", "Segment_data"),
            ("segment data", "Segment_dataB"),
            ('segment features', 'Segment_features'),
            ("sociolinguistic data", "Sociolinguistic_data"),
            ("sociolinguistic data references", "Sociolinguistic_data_references"),
            ("sociolinguistic features", "Sociolinguistic_features"),
            ("value examples", "Value_examples"),
    ]:
        jsondump(client.get(layout), args.data_file('fm', '%s.json' % table))


if __name__ == '__main__':
    main(parsed_args((("host",), {}), (("user",), {}), (("password",), {})))
# -*- coding: utf-8 -*-
import transaction

from clld.scripts.util import parsed_args

from glottolog3.scripts.util import update_refnames


if __name__ == '__main__':  # pragma: no cover
    with transaction.manager:
        update_refnames(parsed_args())
from clld.scripts.util import parsed_args
from clld.scripts.llod import llod_func, register


if __name__ == '__main__':
    args = parsed_args(bootstrap=True)
    llod_func(args)
    register(args)
# -*- coding: utf-8 -*-
import transaction

from clld.scripts.util import parsed_args

from glottolog3.scripts.util import update_providers


if __name__ == "__main__":  # pragma: no cover
    with transaction.manager:
        update_providers(parsed_args((("--version",), dict(default=""))))
import os
from functools import partial
import json

import transaction

from clld.scripts.util import parsed_args, data_file


if __name__ == '__main__':  # pragma: no cover
    add_args = [
        (("version",), dict(help="X.Y")),
        (("what",), dict()),
        (("command",), dict(help="download|verify|update")),
        (("--api-key",), dict(default=os.environ.get('GBS_API_KEY'))),
    ]
    args = parsed_args(*add_args)
    assert args.data_file(args.version).exists()
    args.data_file = partial(data_file, args.module, args.version, args.what)
    if not args.data_file().exists():
        args.data_file().mkdir()
    mod = __import__(
        'glottolog3.scripts.loader.' + args.what,
        fromlist=[args.command, 'JSON'])
    args.json = None
    if getattr(mod, 'JSON', None) and args.data_file(mod.JSON).exists():
        with open(args.data_file(mod.JSON)) as fp:
            args.json = json.load(fp)
    with transaction.manager:
        res = getattr(mod, args.command)(args)
    if res is not None and args.command == 'download' and getattr(mod, 'JSON', None):
def freeze():  # pragma: no cover
    """
    Create a dump of an app's database as a set of csv files in an archive data.zip.
    """
    freeze_func(parsed_args(bootstrap=True, description=freeze.__doc__))
from clld.scripts.util import gbs_func, parsed_args


if __name__ == '__main__':
    gbs_func('update', parsed_args(bootstrap=True))
        id_ = codes.get(l, ncodes.get(l))
        attrs = languoid(id_, 'language', status=status)
        if hnode:
            attrs['father_pk'] = branch_to_pk[hnode]
        attrs['globalclassificationcomment'] = comment or None
        # look for hnames!
        if l in risolate_names:
            attrs['hname'] = risolate_names[l]
        if l in rcollapsed_names:
            attrs['hname'] = rcollapsed_names[l]
        languoids.append(attrs)

    for row in DBSession.execute(
            "select l.pk, ll.hid, l.name from languoid as ll, language as l where ll.pk = l.pk and ll.hid like '%NOCODE_%'"
    ).fetchall():
        if row[1] not in languages:
            # languoids with Harald's private code that are no longer in use
            attrs = languoid(
                row[0], 'language', status='retired', active=False, father_pk=None)
            languoids.append(attrs)

    with open(data_file(args, 'languoids.json'), 'w') as fp:
        json.dump(languoids, fp)


if __name__ == '__main__':
    main(parsed_args(
        (("--all",), dict(action="store_true")),
        (("--version",), dict(default="")),
    ))
def test_parsed_args(self):
    from clld.scripts.util import parsed_args

    parsed_args(args=[TESTS_DIR.joinpath('test.ini').as_posix()])
from functools import partial

from six.moves.urllib.request import urlretrieve
from purl import URL

from clld.scripts.util import parsed_args
from clld.db.meta import DBSession
from clld.db.models.common import Language

from tsammalex.adapters import Pdf


def cached_path(args, url, rel):
    if url.startswith('/'):
        return url
    url_ = URL(url)
    cached = args.data_file('edmond', url_.path_segments()[-1])
    if not cached.exists():
        fname, headers = urlretrieve(url, '%s' % cached)
    return str(cached)


def main(args):
    for lang in DBSession.query(Language):
        args.log.info(lang.name)
        Pdf(None, 'tsammalex').create(
            args.env['request'],
            filename='test.pdf',
            link_callback=partial(cached_path, args),
            lang=lang)


if __name__ == '__main__':
    main(parsed_args(bootstrap=True))
    kw['auth'] = (args.http_user, args.http_password)
    changes = requests.get(args.log_url, **kw).json()

    config = Config()
    config.set_main_option("script_location", args.migrations_dir)
    scriptdir = ScriptDirectory.from_config(config)
    script = scriptdir.generate_revision(
        rev_id(), "Glottolog Curator", refresh=True,
        upgrades="""\
# from glottologcurator
    conn = op.get_bind()
    for sql, params in [
%s
    ]:
        conn.execute(sql, params)
""" % '\n'.join(u'        ("""{0}""", {1}),'.format(
            event[0], parse_json_with_datetime(event[1]))
            for event in changes['events']))

    args.log.info('new alembic migration script created:')
    args.log.info(script.path)
    args.log.info('run "alembic upgrade head" to merge changes')


if __name__ == '__main__':
    main(parsed_args(
        (("log_url",), {}),
        (("--http-user",), dict(default=None)),
        (("--http-password",), dict(default=None)),
    ))
    sys.exit(0)
def get_args():
    return parsed_args(
        (("--data-dir",), dict(
            action=ExistingDir,
            default=path('/home/robert/venvs/clld/data/glottolog-data/'))))
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from datetime import datetime

import pytz
import transaction

from clld.scripts.util import parsed_args
from clld.db.models.common import Dataset
from clld.db.meta import DBSession


def main(args):
    with transaction.manager:
        dataset = DBSession.query(Dataset).one()
        dataset.name = 'Glottolog %s' % args.version
        dataset.updated = datetime.utcnow().replace(tzinfo=pytz.utc)


if __name__ == '__main__':  # pragma: no cover
    main(parsed_args((("--version",), dict(default=""))))
    distinct = defaultdict(list)
    for i, rec in enumerate(db):
        if 'all' in rec:
            unmatched += 1
        distinct[(
            slug(rec.get('key', unicode(uuid4().hex))),
            slug(unicode(rec.get('title', uuid4().hex)), remove_whitespace=False)
        )] = 1
    print unmatched, 'of', i, 'distinct', len(distinct)

    c = 0
    for key, refs in groupby(sorted(distinct.keys()), key=lambda t: t[0]):
        refs = list(refs)
        if len(refs) > 1:
            for t1, t2 in combinations([t[1] for t in refs], 2):
                if fuzz.partial_ratio(t1, t2) > 80:
                    print t1
                    print t2
                    print
                    c += 1
    print c
    return


if __name__ == '__main__':
    main(parsed_args(
        (("what",), dict()),
        (("cmd",), dict()),
        (("--in-name",), dict(default='')),
    ))
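# A minimal Python 3 restatement (with made-up data, not from the original
# script) of the duplicate-detection idea above: group candidate references
# by their slugged citation key, then flag title pairs within a group whose
# fuzzy partial-ratio score exceeds 80. Assumes the thefuzz package
# (successor of fuzzywuzzy) for fuzz.partial_ratio.
from itertools import combinations, groupby

from thefuzz import fuzz

candidates = [
    ('smith2001', 'A grammar of Foo'),
    ('smith2001', 'Grammar of Foo'),
    ('jones1999', 'Notes on Bar'),
]
for key, group in groupby(sorted(candidates), key=lambda t: t[0]):
    titles = [t[1] for t in group]
    for t1, t2 in combinations(titles, 2):
        if fuzz.partial_ratio(t1, t2) > 80:
            print('possible duplicates under %r: %r / %r' % (key, t1, t2))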
def test_parsed_args(self):
    from clld.scripts.util import parsed_args

    parsed_args(args=[path(clld.__file__).dirname().joinpath('tests', 'test.ini')])
                print '--> unknown code:', code.encode('utf8')
            else:
                append(ref.languages, languoid_map[code])

        for glottocode in filter(
                None, kw['jsondata'].get('alnumcodes', '').split(';')):
            if glottocode not in languoid_map:
                print '--> unknown glottocode:', glottocode.encode('utf8')
            else:
                append(ref.languages, languoid_map[glottocode])

        if not update:
            #pass
            #
            # TODO!
            #
            DBSession.add(ref)

        if i % 100 == 0:
            print i, 'records done'
        if changed:
            count += 1

    print count, 'records updated or imported'
    print skipped, 'records skipped because of lack of information'


if __name__ == '__main__':
    args = parsed_args((('--mode',), dict(default='insert')))
    main(Database.from_file(args.data_file('refs.bib'), encoding='utf8'), args.mode)
from clld.db.models.common import LanguageSource
from clld.lib.dsv import reader, UnicodeWriter

from asjp.models import Doculect


def main(args):
    sources = jsonload(args.data_file('sources.json'))
    fields = ['href', 'name', 'author', 'iso', 'source', 'notes', 'wordlist']
    with UnicodeWriter(args.data_file('..', 'sources.csv')) as fp:
        fp.writerow(fields)
        for source in sorted(sources, key=lambda i: i['name']):
            fp.writerow([source.get(f, '') for f in fields])
    return

    ethnologue_names = {
        r.ISO_639: r.Language_Name for r in reader(
            args.data_file('..', '..', 'ethnologue-17-data', 'Table_of_Languages.tab'),
            namedtuples=True)}

    # ASJP name for language, Ethnologue's name, ISO code
    rows = [['ASJP Name', 'Ethnologue name', 'ISO code']]
    subquery = DBSession.query(LanguageSource.language_pk).distinct().subquery()
    for i, l in enumerate(
            DBSession.query(Doculect)
            .order_by(Doculect.pk)
            .filter(not_(Doculect.pk.in_(subquery)))):
        rows.append([l.id, ethnologue_names.get(l.code_iso, ''), l.code_iso or ''])
        #print i
    with UnicodeWriter(args.data_file('..', 'doculects_without_source.csv')) as fp:
        fp.writerows(rows)


if __name__ == '__main__':
    main(parsed_args())
        vs.values[0].domainelement.name,
        source_name,
        req.resource_url(vs),
        req.resource_url(vs.language),
        req.resource_url(feature),
        source,
    ])


def main(args):
    features = list(DBSession.query(Parameter).options(joinedload(Parameter.domain)))
    for feature in features:
        valuesets = DBSession.query(ValueSet)\
            .join(Language)\
            .filter(ValueSet.parameter_pk == feature.pk)\
            .order_by(Language.name)\
            .options(
                contains_eager(ValueSet.language),
                joinedload(ValueSet.values),
                joinedload_all(ValueSet.references, ValueSetReference.source))
        write_cldf(
            args.env['request'],
            feature,
            valuesets,
            args.outdir.joinpath('feature-%s.cldf.csv' % feature.id))


if __name__ == '__main__':
    main(parsed_args((('outdir',), dict(action=ExistingDir)), bootstrap=True))
        version=version,
        lang=lang,
        clf=reduce(wrap, clf) if not lang.replacements else '',
        versions=versions,
        identifiers=identifiers.get(lang.pk, []),
        replacements=[
            all_langs[version][lid].link for lid in lang.replacements
            if lid in all_langs[version]],
        wrap=wrap,
        link_list=link_list,
    ))


if __name__ == '__main__':
    args = parsed_args()
    versions = ['2.0', '2.1', '2.2', '2.3', '2.4', '2.5', '2.6', '2.7']
    langs, identifiers = {}, {}
    for version in versions:
        db = create_engine('postgresql://robert@/glottolog-{0}'.format(version))
        langs[version] = {
            r['id']: L(r['pk'], r['id'], r['name'], version, r['level'], r['father_pk'])
            for r in db.execute("""\
select l.pk, l.id, l.name, ll.level, ll.father_pk
from language as l, languoid as ll
where l.pk = ll.pk and l.active = true""")
        }
        for r in db.execute("""\
select l.pk, l.id, l.name, string_agg(ll.id, ' ') as replacements
from clld.scripts.util import parsed_args
from clld.scripts.llod import register, llod_func


if __name__ == '__main__':
    #llod_func(parsed_args(bootstrap=True))
    register(parsed_args())
    # and updates of father_pks for languages:
    for l in languages:
        hnode, status, name, comment = languages[l]
        id_ = codes.get(l, ncodes.get(l))
        attrs = languoid(id_, 'language', status=status)
        if hnode:
            attrs['father_pk'] = branch_to_pk[hnode]
        attrs['globalclassificationcomment'] = comment or None
        # look for hnames!
        if l in risolate_names:
            attrs['hname'] = risolate_names[l]
        if l in rcollapsed_names:
            attrs['hname'] = rcollapsed_names[l]
        languoids.append(attrs)

    for row in DBSession.execute(
            "select l.pk, ll.hid, l.name from languoid as ll, language as l where ll.pk = l.pk and ll.hid like '%NOCODE_%'"
    ).fetchall():
        if row[1] not in languages:
            # languoids with Harald's private code that are no longer in use
            attrs = languoid(
                row[0], 'language', status='retired', active=False, father_pk=None)
            languoids.append(attrs)

    with open(args.data_file('languoids.json'), 'w') as fp:
        json.dump(languoids, fp)


if __name__ == '__main__':
    main(parsed_args((("--all",), dict(action="store_true"))))
import json

from sqlalchemy import not_
from sqlalchemy.orm import joinedload

from clld.scripts.util import parsed_args
from clld.db.meta import DBSession

from glottolog3.models import Ref, Doctype, Languoid, Provider


def main(args):
    with open(args.data_file('phoible-isocodes.json')) as fp:
        covered = json.load(fp).keys()

    q = DBSession.query(Ref).join(Ref.languages).join(Ref.doctypes).join(Ref.providers)\
        .filter(Doctype.pk.in_([3, 8, 11]))\
        .filter(Provider.pk == 21)\
        .filter(not_(Languoid.hid.in_(covered)))\
        .options(joinedload(Ref.languages))
    print q.count()

    with open(args.data_file('phoible-phonologies.bib'), 'w') as fp:
        for ref in q:
            rec = ref.bibtex()
            rec['glottolog_url'] = 'http://glottolog.org/resource/reference/id/%s' % ref.id
            rec['languages'] = ', '.join(
                '%s [%s][%s]' % (l.name, l.id, l.hid)
                for l in ref.languages if l.hid not in covered)
            fp.write('\n%s' % unicode(rec).encode('utf8'))


if __name__ == '__main__':
    main(parsed_args())