def prepare_release(repos, version=None):
    edition = get_edition(repos, version=version)
    cit = citation(repos, edition=edition, version=version)
    dump(
        {
            "title": repos.publication.zenodo.title_format.format(edition['version']),
            "description": to_html(cit, repos.publication.web.url),
            "license": {"id": repos.publication.zenodo.license_id},
            "keywords": repos.publication.zenodo.keywords.split(),
            "communities": [
                {"identifier": cid}
                for cid in repos.publication.zenodo.communities.split()],
            "creators": [editor_to_dict(n, repos.editors) for n in edition['editors']],
            "access_right": "open",
            "upload_type": "dataset",
        },
        repos.path('.zenodo.json'),
        indent=4)
    return cit
def freeze_func(args, dataset=None, with_history=True): dataset = dataset or args.env["request"].dataset dump_dir = args.data_file("dumps") if not dump_dir.exists(): dump_dir.mkdir() dump_dir = dump_dir.resolve() with dump_dir.joinpath("README.txt").open("w", encoding="utf8") as fp: fp.write(freeze_readme(dataset, args.env["request"])) db_version = get_alembic_version(DBSession) for table in Base.metadata.sorted_tables: csv = dump_dir.joinpath("%s.csv" % table.name) if with_history or not table.name.endswith("_history"): _freeze(table, csv) if csv.exists(): csvm = "%s.%s" % (table.name, CsvmJsonAdapter.extension) doc = CsvmJsonAdapter.csvm_doc(csvm, args.env["request"], [(col.name, col) for col in table.columns]) if db_version: # We (ab)use a dc:identifier property to pass the alembic revision of the # database to the unfreeze script. doc["dc:identifier"] = db_version jsonlib.dump(doc, dump_dir.joinpath(csvm)) with ZipFile(as_posix(args.data_file("..", "data.zip")), "w", ZIP_DEFLATED) as zipfile: for f in dump_dir.iterdir(): if f.is_file(): with f.open("rb") as fp: zipfile.writestr(f.name, fp.read())
def prepare_release(repos, version):
    for v, year, editors in read_editors(repos):
        if v == version:
            break
    else:  # pragma: no cover
        raise ValueError('Add version to CONTRIBUTORS.md first!')

    citation = "{0}. {1}. {2} {3}. {4}: {5}. (Available online at {6})".format(
        ' & '.join('{0.last}, {0.first}'.format(HumanName(e)) for e in editors),
        year,
        repos.publication.web.name,
        version,
        repos.publication.publisher.place,
        repos.publication.publisher.name,
        repos.publication.web.url,
    )
    dump(
        {
            "title": repos.publication.zenodo.title_format.format(version),
            "description": to_html(citation, repos.publication.web.url),
            "license": {"id": repos.publication.zenodo.license_id},
            "keywords": repos.publication.zenodo.keywords.split(),
            "communities": [
                {"identifier": cid}
                for cid in repos.publication.zenodo.communities.split()],
            "creators": [editor_to_dict(n, repos.editors) for n in editors],
            "access_right": "open",
        },
        repos.path('.zenodo.json'),
        indent=4)
def freeze_func(args, dataset=None, with_history=True):
    dataset = dataset or args.env['request'].dataset
    dump_dir = args.data_file('dumps')
    if not dump_dir.exists():
        dump_dir.mkdir()
    dump_dir = dump_dir.resolve()

    with dump_dir.joinpath('README.txt').open('w', encoding='utf8') as fp:
        fp.write(freeze_readme(dataset, args.env['request']))

    db_version = get_alembic_version(DBSession)

    for table in Base.metadata.sorted_tables:
        csv = dump_dir.joinpath('%s.csv' % table.name)
        if with_history or not table.name.endswith('_history'):
            _freeze(table, csv)
        if csv.exists():
            csvm = '%s.%s' % (table.name, CsvmJsonAdapter.extension)
            doc = CsvmJsonAdapter.csvm_doc(
                csvm, args.env['request'], [(col.name, col) for col in table.columns])
            if db_version:
                # We (ab)use a dc:identifier property to pass the alembic revision of the
                # database to the unfreeze script.
                doc["dc:identifier"] = db_version  # pragma: no cover
            jsonlib.dump(doc, dump_dir.joinpath(csvm))

    with ZipFile(
            as_posix(args.data_file('..', 'data.zip')), 'w', ZIP_DEFLATED) as zipfile:
        for f in dump_dir.iterdir():
            if f.is_file():
                with f.open('rb') as fp:
                    zipfile.writestr(f.name, fp.read())
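# Runnable stdlib sketch of the final step above: pack every file in a dump
# directory into one deflate-compressed archive (paths are hypothetical).
from pathlib import Path
from zipfile import ZipFile, ZIP_DEFLATED

dump_dir = Path('dumps')
with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zipf:
    for f in dump_dir.iterdir():
        if f.is_file():
            zipf.writestr(f.name, f.read_bytes())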
def llod_func(args):  # pragma: no cover
    """Create an RDF dump and compute some statistics about it."""
    tmp = Path(mkdtemp())
    count_rsc = 0
    count_triples = 0

    tmp_dump = tmp.joinpath('rdf.n3')
    with open(as_posix(tmp_dump), 'w') as fp:
        for rsc in RESOURCES:
            args.log.info('Resource type %s ...' % rsc.name)
            try:
                q = DBSession.query(rsc.model)
            except InvalidRequestError:
                args.log.info('... skipping')
                continue
            for obj in page_query(q.order_by(rsc.model.pk), n=10000, verbose=True):
                graph = get_graph(obj, args.env['request'], rsc.name)
                count_triples += len(graph)
                count_rsc += 1
                fp.write(n3(graph, with_head=count_rsc == 1))
            args.log.info('... finished')

    # put in args.data_file('..', 'static', 'download')?
    md = {'path': as_posix(tmp), 'resources': count_rsc, 'triples': count_triples}
    md.update(count_links(as_posix(tmp_dump)))
    jsonlib.dump(md, args.data_file('rdf-metadata.json'))
    print(md)

    dataset = Dataset.first()
    rdf_dump = args.module_dir.joinpath(
        'static', 'download', '%s-dataset.n3' % dataset.id)
    tmp_dump.copy(rdf_dump)
    check_call('gzip -f %s' % rdf_dump, shell=True)
    print(str(rdf_dump))
def __exit__(self, exc_type, exc_val, exc_tb):
    jsonlib.dump(
        collections.OrderedDict([
            (k, collections.OrderedDict(sorted(v.items())))
            for k, v in sorted(self.items.items())
        ]),
        self.path,
        indent=4)
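# Runnable stdlib sketch of the pattern above: a two-level mapping serialized
# with deterministically sorted keys (data and file name are hypothetical).
import collections
import json

items = {'b': {'y': 2, 'x': 1}, 'a': {'q': 0}}
ordered = collections.OrderedDict(
    (k, collections.OrderedDict(sorted(v.items()))) for k, v in sorted(items.items()))
with open('items.json', 'w', encoding='utf8') as fp:
    json.dump(ordered, fp, indent=4)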
def test_json(self):
    from clldutils.jsonlib import dump, load

    d = {'a': 234, 'ä': 'öäüß'}
    p = self.tmp_path('test')
    dump(d, p)
    for k, v in load(p).items():
        assert d[k] == v
def to_replacements(self, filename):
    """Write a JSON file with 301s from merged glottolog_ref_ids."""
    with self.connect() as conn:
        conn.row_factory = sqlite3.Row
        cursor = conn.execute(
            'SELECT refid AS id, id AS replacement '
            'FROM entry WHERE id != refid ORDER BY id')
        # Materialize the rows: a lazy map object is not JSON serializable.
        pairs = [dict(row) for row in cursor]
    jsonlib.dump(pairs, filename, indent=4)
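# Minimal runnable sketch of the pattern above, assuming an in-memory table
# with the same (id, refid) layout; sqlite3.Row makes dict(row) work:
import json
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE entry (id INTEGER, refid INTEGER)')
conn.executemany('INSERT INTO entry VALUES (?, ?)', [(2, 1), (3, 3)])
conn.row_factory = sqlite3.Row
cursor = conn.execute(
    'SELECT refid AS id, id AS replacement FROM entry WHERE id != refid ORDER BY id')
print(json.dumps([dict(row) for row in cursor], indent=4))
# -> [{"id": 1, "replacement": 2}]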
def ldstatus(args):
    from glottolog3.langdocstatus import extract_data

    endangerment = {
        l.id: l.cfg['endangerment']
        for l in args.repos.languoids() if 'endangerment' in l.cfg}
    with_session(args)
    dump(extract_data(endangerment), 'glottolog3/static/ldstatus.json', indent=4)
def test_profile_with_bad_metadata(tmpdir):
    mdpath = tmpdir / 'md.json'
    md = deepcopy(Profile.MD)
    md['tables'].append({'tableSchema': {'columns': []}})
    jsonlib.dump(md, str(mdpath))
    with pytest.raises(ValueError):
        Profile.from_file(str(mdpath))
def run(args):  # pragma: no cover
    if Repo:
        assert str(Repo(str(args.repos.repos)).active_branch) == 'master', \
            'Command should be run on master branch'
    res = {'language': [], 'family': [], 'dialect': []}
    for lang in args.repos.languoids():
        res[lang.level.name].append(lang.id)
    jsonlib.dump(res, args.repos.build_path('languoids.json'))
def main(args):  # pragma: no cover
    ldstatus = {}
    limit = 200
    q = language_query().order_by(Language.pk)
    offset = 0
    # we merge information about extinct languages from unesco and Harald.
    if 1:  # loop over active, established languages with geo-coords
        while True:
            transaction.begin()
            langs = [l for l in q.offset(offset).limit(limit)]
            if not langs:
                break
            offset += limit
            # let's collect the relevant sources in a way that allows computation of med.
            # Note: we limit refs to the ones without computerized assignments.
            lsources = list(
                DBSession.query(Ref).join(LanguageSource)
                .filter(LanguageSource.language_pk.in_([l.pk for l in langs]))
                .filter(Ref.ca_doctype_trigger == None)  # noqa: E711
                .filter(Ref.ca_language_trigger == None)  # noqa: E711
                .options(joinedload(Ref.doctypes), joinedload(Source_.languages)))
            for l in langs:
                sources = [s for s in lsources if l in s.languages]
                sources = sorted(map(Source, sources))

                # keep the overall med
                # note: this source may not be included in the potential meds computed
                # below, e.g. because it may not have a year.
                med = sources[0].__json__() if sources else None

                # now we have to compute meds respecting a cut-off year.
                # to do so, we collect eligible sources per year and then
                # take the med of this collection.
                potential_meds = []

                # we only have to loop over publication years within all sources, because
                # only in these years something better might have come along.
                for year in set(s.year for s in sources if s.year):
                    # let's see if something better was published!
                    eligible = [s for s in sources if s.year and s.year <= year]
                    if eligible:
                        potential_meds.append(sorted(eligible)[0])

                # we store the precomputed sources information as jsondata:
                ldstatus[l.id] = [
                    med,
                    [s.__json__()
                     for s in sorted(set(potential_meds), key=lambda s: -s.year)]]
            print(offset)
            transaction.abort()
    dump(ldstatus, 'glottolog3/static/ldstatus.json', indent=4)
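# Hypothetical, simplified sketch of the cut-off-year logic above: sources
# sort "best first" (here by a made-up rank), and for every attested
# publication year the best source available by that year is a potential MED.
sources = [(1, 'grammar 1990'), (3, 'wordlist 1980'), (2, 'sketch 2005')]
by_year = {(1, 'grammar 1990'): 1990, (3, 'wordlist 1980'): 1980, (2, 'sketch 2005'): 2005}
potential_meds = []
for year in set(by_year.values()):
    eligible = [s for s in sorted(sources) if by_year[s] <= year]
    if eligible:
        potential_meds.append(eligible[0])
assert sorted(set(potential_meds)) == [(1, 'grammar 1990'), (3, 'wordlist 1980')]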
def run(args):
    ordered = [d['species'].lower() for d in reader(args.ordered, dicts=True)]
    ranks = ['phylum', 'klass', 'order', 'family', 'genus']
    ordered_ranks = {r: {} for r in ranks}
    seen = {}
    augmented_species = []
    for ex in args.api.experiments:
        species = ex.gbif.cname
        if species not in seen:
            seen[species] = (ex.gbif.classification, ex.species_latin)
            skey = species.lower()
            if skey not in ordered:
                skey = ' '.join(skey.split()[:2])
            if skey not in ordered:
                skey = [n for n in ordered if n.split()[0] == skey.split()[0]]
                if skey:
                    skey = skey[0]
            if skey in ordered:
                augmented_species.append((species, ordered.index(skey)))
            else:
                augmented_species.append((species, len(ordered) + 1))

    for s, i in sorted(augmented_species, key=lambda t: t[1], reverse=True):
        for r in ranks:
            ordered_ranks[r][getattr(seen[s][0], r)] = i

    fully_augmented_species = {
        s: (
            ordered_ranks['phylum'][seen[s][0].phylum],
            ordered_ranks['klass'][seen[s][0].klass],
            ordered_ranks['order'][seen[s][0].order],
            ordered_ranks['family'][seen[s][0].family],
            ordered_ranks['genus'][seen[s][0].genus],
            i,
        )
        for s, i in sorted(augmented_species, key=lambda t: t[1])
    }

    clf = collections.defaultdict(lambda: [-1, None])
    prefix = {}
    for k, _ in sorted(fully_augmented_species.items(), key=lambda i: i[1], reverse=True):
        for j, a in enumerate(ranks):
            if clf[a][1] != getattr(seen[k][0], a):
                for aa in ranks[j + 1:]:
                    clf[aa][0] = -1
                if a == 'genus':
                    # reset prefix index for all deeper taxonomy ranks:
                    clf['species'][0] = -1
                clf[a][0] += 1
                clf[a][1] = getattr(seen[k][0], a)
                node_name = '_'.join(getattr(seen[k][0], aa) for aa in ranks[:j + 1])
                prefix[node_name] = string.ascii_lowercase[clf[a][0]]
        if clf['species'][1] != k:
            clf['species'][0] += 1
            clf['species'][1] = k
            prefix[k.lower()] = string.ascii_lowercase[clf['species'][0]]
    dump(prefix, args.api.path('taxa_sortkeys.json'), indent=4)
def jsondump(obj, fname, log=None):
    fname = Path(fname)
    if fname.exists():
        d = jsonlib.load(fname)
        d.update(obj)
        obj = d
    jsonlib.dump(sorted_obj(obj), fname, indent=4)
    log_dump(fname, log=log)
    return obj
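# Stdlib-only sketch of the merge-then-dump pattern above (function name is
# hypothetical; sort_keys stands in for sorted_obj): keys in the new object
# override keys already on disk.
import json
from pathlib import Path

def merge_dump(obj, fname):
    fname = Path(fname)
    if fname.exists():
        d = json.loads(fname.read_text(encoding='utf8'))
        d.update(obj)
        obj = d
    fname.write_text(json.dumps(obj, indent=4, sort_keys=True), encoding='utf8')
    return obj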
def parse(soup, id_, path, with_items=True):
    props = {}
    for dl in soup.find_all('dl'):
        props.update(dict(list(parse_dl(dl))))
    if with_items:
        parse_table(soup, props)
    props['name'] = soup.find('h2').get_text()
    props['id'] = id_
    jsonlib.dump(props, path, indent=4)
def rename(args):  # pragma: no cover
    api = Concepticon(args.repos)

    from_, to_ = args.args
    assert CONCEPTLIST_ID_PATTERN.match(to_)

    cl = api.conceptlists[from_]

    # write the adapted concept list to the new path:
    with UnicodeWriter(
            cl.path.parent / cl.path.name.replace(from_, to_), delimiter='\t') as writer:
        header = []
        for i, row in enumerate(reader(cl.path, delimiter='\t')):
            if i == 0:
                header = row
                writer.writerow(row)
                header = {v: k for k, v in enumerate(header)}  # map col name to row index
            else:
                oid = row[header['ID']]
                assert oid.startswith(from_)
                nid = oid.replace(from_, to_)
                api.add_retirement(
                    'Concept', dict(id=oid, comment='renaming', replacement=nid))
                row[header['ID']] = nid
                writer.writerow(row)

    # write adapted metadata to the new path:
    fname = cl.path.name.replace(from_, to_) + MD_SUFFIX
    md = jsonlib.load(
        cl.path.parent / (cl.path.name + MD_SUFFIX), object_pairs_hook=OrderedDict)
    md['tables'][0]['url'] = fname
    jsonlib.dump(md, cl.path.parent / fname, indent=4)

    # remove obsolete concept list and metadata:
    cl.path.unlink()
    cl.path.parent.joinpath(cl.path.name + MD_SUFFIX).unlink()

    # adapt conceptlists.tsv
    rows = []
    for row in reader(api.data_path('conceptlists.tsv'), delimiter='\t'):
        rows.append([col.replace(from_, to_) if col else col for col in row])

    with UnicodeWriter(api.data_path('conceptlists.tsv'), delimiter='\t') as writer:
        writer.writerows(rows)

    api.add_retirement('Conceptlist', dict(id=from_, comment='renaming', replacement=to_))

    print("""Please run
grep -r "{0}" concepticondata/ | grep -v retired.json
to confirm the renaming was complete!""".format(from_))
def run(args):
    acc = Taxa()
    seen = set()
    for ex in args.api.experiments:
        species = ex.gbif.name
        if species not in seen:
            seen.add(species)
            acc.add(ex.gbif.classification)
    #print(acc)

    gbif, head = Taxa(), None
    for i, line in enumerate(args.taxa.open(encoding='utf8').readlines()):
        if i == 0:
            head = line.strip().split('\t')
            continue
        cols = line.strip().split('\t')
        d = dict(zip(head, cols))
        if d['kingdom'] != 'Animalia':
            continue
        if d['taxonomicStatus'] != 'accepted':
            continue
        if d['taxonRank'] != 'species':
            continue
        if 'genus' in d:
            gbif.add(d)

    coverage = collections.OrderedDict()
    for phylum, classes in acc.items():
        print('Phylum {}: {}/{} classes'.format(phylum, len(classes), len(gbif[phylum])))
        coverage[(phylum,)] = (len(classes), len(gbif[phylum]))
        for klass, orders in classes.items():
            print('  Class {}: {}/{} orders'.format(
                klass, len(orders), len(gbif[phylum][klass])))
            coverage[(phylum, klass)] = (len(orders), len(gbif[phylum][klass]))
            for order, families in orders.items():
                print('    Order {}: {}/{} families'.format(
                    order, len(families), len(gbif[phylum][klass][order])))
                coverage[(phylum, klass, order)] = (
                    len(families), len(gbif[phylum][klass][order]))
                for family, genera in families.items():
                    print('      Family {}: {}/{} genera'.format(
                        family, len(genera), len(gbif[phylum][klass][order][family])))
                    coverage[(phylum, klass, order, family)] = (
                        len(genera), len(gbif[phylum][klass][order][family]))
                    for genus, nspec in genera.items():
                        print('        Genus {}: {}/{} species'.format(
                            genus, nspec, gbif[phylum][klass][order][family].get(genus)))
                        coverage[(phylum, klass, order, family, genus)] = (
                            nspec, gbif[phylum][klass][order][family][genus])

    coverage = collections.OrderedDict([('_'.join(k), v) for k, v in coverage.items()])
    jsonlib.dump(coverage, args.api.path('gbif_coverage.json'), indent=4)
def new(self, alpha, dry_run=False):
    num = self._store.get(alpha, 1233) + 1
    if not dry_run:
        self._store[alpha] = num
        # Store the updated dictionary of glottocodes back.
        ordered = OrderedDict()
        for k in sorted(self._store.keys()):
            ordered[k] = self._store[k]
        jsonlib.dump(ordered, self._fname, indent=4)
    return Glottocode('%s%s' % (alpha, num))
def run(args):
    ds = Dataset()
    comments = {}
    for p in ds.raw_dir.glob('blog_comments/comments*.html'):
        for c in iter_comments(p):
            comments[c['id']] = c
    comments = sorted(
        comments.values(), key=lambda c: int(c['id'].split('comment-')[-1]))
    dump(comments, ds.etc_dir / 'comments.json', indent=4)
    args.log.info('{} comments'.format(len(comments)))
def new_dataset(args):
    """
    lexibank new-dataset OUTDIR [ID]
    """
    if not args.args:
        raise ParserError('you must specify an existing directory')
    outdir = Path(args.args.pop(0))
    if not outdir.exists():
        raise ParserError('you must specify an existing directory')

    id_pattern = re.compile('[a-z_0-9]+$')
    md = {}
    if args.args:
        md['id'] = args.args.pop(0)
    else:
        md['id'] = input('Dataset ID: ')

    while not id_pattern.match(md['id']):
        print('dataset id must only consist of lowercase ascii letters, '
              'digits and _ (underscore)!')
        md['id'] = input('Dataset ID: ')

    outdir = outdir / md['id']
    if not outdir.exists():
        outdir.mkdir()

    for key in ['title', 'url', 'license', 'conceptlist', 'citation']:
        md[key] = input('Dataset {0}: '.format(key))

    # check license!
    # check conceptlist!

    for path in Path(pylexibank.__file__).parent.joinpath('dataset_template').iterdir():
        if path.is_file():
            if path.suffix in ['.pyc']:
                continue  # pragma: no cover
            target = path.name
            content = read_text(path)
            if '+' in path.name:
                target = re.sub(
                    r'\+([a-z]+)\+', lambda m: '{' + m.groups()[0] + '}', path.name
                ).format(**md)
            if target.endswith('_tmpl'):
                target = target[:-5]
                content = content.format(**md)
            write_text(outdir / target, content)
        else:
            target = outdir / path.name
            if target.exists():
                shutil.rmtree(str(target))
            shutil.copytree(str(path), str(target))

    del md['id']
    jsonlib.dump(md, outdir / 'metadata.json', indent=4)
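# Sketch of the filename templating used above, with hypothetical values:
# '+id+.py' becomes '{id}.py', which str.format then fills from the metadata.
import re

md = {'id': 'mydataset'}
target = re.sub(
    r'\+([a-z]+)\+', lambda m: '{' + m.groups()[0] + '}', '+id+.py').format(**md)
assert target == 'mydataset.py'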
def write_CLPA(clpadata, path):
    """
    Basic function to write clpa-data.
    """
    if isinstance(path, Path):
        outdir, fname = path.parent, path.name
    else:
        outdir, fname = local_path(), path  # pragma: no cover
    old_clpa = load_CLPA()
    jsonlib.dump(old_clpa, outdir.joinpath(fname + '.bak'), indent=4)
    jsonlib.dump(clpadata, outdir.joinpath(fname), indent=4)
def run(args):  # pragma: no cover
    auth = HTTPBasicAuth(args.user, args.token)
    issues = list(iteritems(auth, '/repos/{0}/issues'.format(REPOS), state='all'))
    jsonlib.dump(issues, args.repos.issues_path, indent=4)

    res = {}
    for issue in issues:
        if issue['comments']:
            res[issue['number']] = list(iteritems(auth, issue['comments_url']))
    jsonlib.dump(res, args.repos.comments_path, indent=4)
def add_retirement(self, type_, repl):
    obj = collections.OrderedDict()
    for k in ['id', 'comment', 'replacement']:
        obj[k] = repl[k]
        assert obj[k]
    if type_ not in self.retirements:
        self.retirements[type_] = []
    self.retirements[type_].append(obj)
    jsonlib.dump(self.retirements, self.data_path('retired.json'), indent=2)
def test_Glottocodes(self):
    gcjson = self.tmp_path('glottocodes.json')
    jsonlib.dump({}, gcjson)

    glottocodes = Glottocodes(gcjson)
    gc = glottocodes.new('a', dry_run=True)
    self.assertTrue(gc.startswith('aaaa'))
    self.assertNotIn(gc, glottocodes)
    gc = glottocodes.new('a')
    self.assertIn(gc, glottocodes)
    # make sure it's also written to file:
    self.assertIn(gc, Glottocodes(gcjson))
    self.assertEqual(len(list(Glottocodes(gcjson))), 1)
def get(dataset, resource, offset=0, limit=LIMIT, download_=False):
    fname = dataset.raw.joinpath('%(resource)s-%(limit)s-%(offset)s.json' % locals())
    if fname.exists() and not download_:
        return jsonlib.load(fname)
    if not download_:
        raise ValueError
    res = requests.get(
        '{0}/api/v1/{1}/'.format(BASE_URL, resource),
        params=dict(
            format='json',
            limit='{0}'.format(limit),
            offset='{0}'.format(offset))).json()
    jsonlib.dump(res, fname)
    return res
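# Hypothetical sketch of the cache-or-fetch pattern above (assumes the
# third-party `requests` package; URL and cache path are made up):
import json
from pathlib import Path

import requests

def cached_get(url, cache, refresh=False):
    cache = Path(cache)
    if cache.exists() and not refresh:
        return json.loads(cache.read_text(encoding='utf8'))
    res = requests.get(url, params={'format': 'json'}).json()
    cache.write_text(json.dumps(res), encoding='utf8')
    return res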
def store(details_):
    db = read_store()
    if not details_:
        return db
    db[details_['id']] = details_
    ordered = OrderedDict()
    for k in sorted(list(db.keys()), key=lambda lid: int(lid)):
        v = OrderedDict()
        for key in sorted(list(db[k].keys())):
            if key != 'id':
                v[key] = db[k][key]
        ordered[k] = v
    jsonlib.dump(ordered, STORE, indent=4)
    return db
def test_Glottocodes(self):
    from pyglottolog.languoids import Glottocodes

    languoids = self.tmp_path('languoids')
    languoids.mkdir()
    jsonlib.dump({}, languoids.joinpath('glottocodes.json'))

    glottocodes = Glottocodes(repos=self.tmp_path())
    gc = glottocodes.new('abcd', dry_run=True)
    self.assertNotIn(gc, glottocodes)
    gc = glottocodes.new('abcd')
    self.assertIn(gc, glottocodes)
    # make sure it's also written to file:
    self.assertIn(gc, Glottocodes(repos=self.tmp_path()))
def store(details_, fname):  # pragma: no cover
    db = read_store(fname)
    if not details_:
        return db
    db[details_['id']] = details_
    ordered = OrderedDict()
    for k in sorted(list(db.keys()), key=lambda lid: int(lid)):
        v = OrderedDict()
        for key in sorted(list(db[k].keys())):
            if key != 'id':
                v[key] = db[k][key]
        ordered[k] = v
    jsonlib.dump(ordered, fname, indent=4)
    return db
def new(self, name, dry_run=False):
    alpha = slug(text_type(name))[:4]
    assert alpha
    while len(alpha) < 4:
        alpha += alpha[-1]
    num = self._store.get(alpha, 1233) + 1
    if not dry_run:
        self._store[alpha] = num
        # Store the updated dictionary of glottocodes back.
        ordered = OrderedDict()
        for k in sorted(self._store.keys()):
            ordered[k] = self._store[k]
        jsonlib.dump(ordered, self._fname, indent=4)
    return Glottocode('%s%s' % (alpha, num))
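# Hypothetical sketch of how the two pieces combine: a four-letter prefix
# derived from the name (short names pad by repeating the last letter) plus a
# counter starting above 1233 yields codes like 'abcc1234'.
def make_code(name, counts):
    alpha = ''.join(c for c in name.lower() if c.isalpha())[:4]
    while len(alpha) < 4:
        alpha += alpha[-1]
    counts[alpha] = counts.get(alpha, 1233) + 1
    return '%s%s' % (alpha, counts[alpha])

counts = {}
assert make_code('Abc', counts) == 'abcc1234'
assert make_code('Abc', counts) == 'abcc1235'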
def __exit__(self, *args):
    ordered = collections.OrderedDict([
        (k, v.asdict()) for k, v in sorted(self.objects.items())])
    if self.path.suffix.lower() == '.zip':
        with zipfile.ZipFile(str(self.path), 'w', zipfile.ZIP_DEFLATED) as z:
            z.writestr(
                self.path.stem,
                json.dumps(ordered, ensure_ascii=False, indent=0, separators=(',', ':')))
    else:
        dump(ordered, self.path, indent=0, separators=(',', ':'))
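# Minimal runnable sketch of the zip branch above: compact JSON written as a
# single archive member named after the archive's stem (file names are made up).
import json
import zipfile
from pathlib import Path

path = Path('objects.zip')
data = {'b': 2, 'a': 1}
with zipfile.ZipFile(str(path), 'w', zipfile.ZIP_DEFLATED) as z:
    z.writestr(path.stem, json.dumps(data, ensure_ascii=False, separators=(',', ':')))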
def parse(soup, id_, outdir):
    props = {'id': id_, 'name': soup.find('h2').get_text(), 'tables': {}}
    for dl in soup.find_all('dl'):
        props.update(dict(list(parse_dl(dl))))
    for frame in [
        'basic_frame', 'flora_frame', 'cult_frame', 'grammar_frame', 'ethno_frame',
    ]:
        div = soup.find('div', id=frame)
        if div:
            props['tables'][frame.split('_')[0]] = rows(div.find('table'))
    jsonlib.dump(props, outdir.joinpath('{0}.json'.format(id_)), indent=4)
def concepticon_api(tmpdir):
    concepticon_repos = pathlib.Path(str(tmpdir.join('concepticon-data')))
    shutil.copytree(str(TEST_REPOS), str(concepticon_repos))

    md = jsonlib.load(
        TEST_REPOS / 'concepticondata' / 'conceptlists' / 'default-metadata.json')
    md['tables'][0]['url'] = 'Perrin-2010-110.tsv'
    md['tables'][0]['tableSchema']['columns'].extend(
        [dict(name='FRENCH'), dict(name='GERMAN')])
    jsonlib.dump(
        md,
        concepticon_repos / 'concepticondata' / 'conceptlists'
        / 'Perrin-2010-110.tsv-metadata.json')

    mappings = concepticon_repos / 'mappings'
    mappings.joinpath('map-fr.tsv').write_text("""\
ID\tGLOSS\tPRIORITY
1\tG///the gloss\t2
2\tH///the gloss\t3
3\tH///the gloss\t3
""", encoding='utf8')
    return Concepticon(concepticon_repos)
def cached_metadata(self, sid, id=None, name=None, refresh=False):
    if data_file('external', self.name, repos=self.repos).is_dir():
        fname = data_file('external', self.name, sid + '.json', repos=self.repos)
        if not fname.exists() or refresh:
            try:
                data = self.metadata(id or self.identify(name))
            except:  # pragma: no cover
                data = None
            if not data:
                return  # pragma: no cover
            jsonlib.dump(data, fname)
            return data
        return jsonlib.load(fname)

    if sid not in self.items or refresh:
        try:
            self.items[sid] = self.metadata(id or self.identify(name))
        except:
            return
    return self.items[sid]
def create(versions, out=None):
    out = out or Path('archive')
    if not out.exists():
        out.mkdir()

    langs, identifiers = {}, {}
    for version in versions:
        aggregate(version, langs, identifiers)

    for version in versions:
        dump(
            out.joinpath('glottolog-{0}'.format(version)),
            version,
            langs,
            {pk: list(c) for pk, c in groupby(identifiers[version], lambda i: i.lpk)})

    gc2v = {}
    for v in versions:
        for gc in sorted(langs[v].keys()):
            gc2v[gc] = v
    jsonlib.dump(gc2v, out.joinpath('glottocode2version.json'), indent=4)
def make():
    tables = {}
    columns = {}
    for e in read_terms().iter():
        if ns('rdf:about') in e.attrib:
            lname = e.attrib[ns('rdf:about')].split('#')[-1]
            if e.tag == ns('rdfs:Class') and lname.endswith('Table'):
                tables[lname] = e
            elif e.tag == ns('rdf:Property'):
                columns[lname] = e

    comps = {}
    for subdir, spec in COMPONENTS.items():
        table = make_table(tables.pop(spec['table']))
        for c, req in spec['columns']:
            table['tableSchema']['columns'].append(make_column(columns[c], req))
        comps[subdir] = table
        dump(
            table,
            REPO_DIR.joinpath(
                'components', subdir, '{0}-metadata.json'.format(spec['table'])),
            indent=4)

    for subdir, comprefs in MODULES.items():
        dump(
            OrderedDict([
                ("@context", ["http://www.w3.org/ns/csvw", {"@language": "en"}]),
                ("dc:conformsTo",
                 "http://cldf.clld.org/v1.0/terms.rdf#{0}".format(subdir)),
                ("dialect", {"commentPrefix": None}),
                ("tables", [comps[ref] for ref in comprefs]),
            ]),
            REPO_DIR.joinpath('modules', subdir, '{0}-metadata.json'.format(subdir)),
            indent=4)
def write(self):
    jsonlib.dump(self.items, self.path, **self._json_opts)
def communities(args, neighbor_weight=None):
    graphname = args.graphname or 'network'
    edge_weights = args.weight
    vertex_weights = str('FamilyFrequency')
    normalize = args.normalize
    edgefilter = args.edgefilter
    threshold = args.threshold or 1
    neighbor_weight = neighbor_weight or 5

    _graph = args.api.load_graph(graphname, threshold, edgefilter)
    args.log.info('loaded graph')
    for n, d in tqdm(_graph.nodes(data=True), desc='vertex-weights', leave=False):
        d[vertex_weights] = int(d[vertex_weights])

    if normalize:
        for edgeA, edgeB, data in tqdm(
                _graph.edges(data=True), desc='normalizing', leave=False):
            data[str('weight')] = data[edge_weights] ** 2 / (
                _graph.node[edgeA][vertex_weights]
                + _graph.node[edgeB][vertex_weights]
                - data[edge_weights])
        vertex_weights = None
        edge_weights = 'weight'
        args.log.info('computed weights')

    graph = networkx2igraph(_graph)
    args.log.info('converted graph...')
    args.log.info('starting infomap')
    comps = graph.community_infomap(
        edge_weights=str(edge_weights), vertex_weights=vertex_weights)
    args.log.info('finished infomap')

    D, Com = {}, defaultdict(list)
    for i, comp in enumerate(
            sorted(comps.subgraphs(), key=lambda x: len(x.vs), reverse=True)):
        for vertex in [v['name'] for v in comp.vs]:
            D[graph.vs[vertex]['ConcepticonId']] = str(i + 1)
            Com[i + 1].append(graph.vs[vertex]['ConcepticonId'])

    for node, data in _graph.nodes(data=True):
        data['infomap'] = D[node]
        data['ClusterName'] = ''
        data['CentralConcept'] = ''

    # get the articulation points etc. immediately
    for idx, nodes in sorted(Com.items()):
        sg = _graph.subgraph(nodes)
        if len(sg) > 1:
            d_ = sorted(sg.degree(), key=lambda x: x[1], reverse=True)
            d = [_graph.node[a]['Gloss'] for a, b in d_][0]
            cluster_name = 'infomap_{0}_{1}'.format(idx, d)
        else:
            d = _graph.node[nodes[0]]['Gloss']
            cluster_name = 'infomap_{0}_{1}'.format(idx, _graph.node[nodes[0]]['Gloss'])
        args.log.debug(cluster_name, d)
        for node in nodes:
            _graph.node[node]['ClusterName'] = cluster_name
            _graph.node[node]['CentralConcept'] = d
    args.log.info('computed cluster names')

    cluster_dir = args.api.existing_dir('app', 'cluster', clean=True)
    cluster_names = {}
    removed = []
    for idx, nodes in tqdm(sorted(Com.items()), desc='export to app', leave=False):
        sg = _graph.subgraph(nodes)
        for node, data in sg.nodes(data=True):
            data['OutEdge'] = []
            neighbors = [
                n for n in _graph
                if n in _graph[node]
                and _graph[node][n]['FamilyWeight'] >= neighbor_weight
                and n not in sg]
            if neighbors:
                sg.node[node]['OutEdge'] = []
                for n in neighbors:
                    sg.node[node]['OutEdge'].append([
                        _graph.node[n]['ClusterName'],
                        _graph.node[n]['CentralConcept'],
                        _graph.node[n]['Gloss'],
                        _graph[node][n]['WordWeight'],
                        n])
        if len(sg) > 1:
            jsonlib.dump(
                json_graph.adjacency_data(sg),
                cluster_dir / (_graph.node[nodes[0]]['ClusterName'] + '.json'),
                sort_keys=True)
            for node in nodes:
                cluster_names[_graph.node[node]['Gloss']] = _graph.node[node]['ClusterName']
        else:
            removed += [list(nodes)[0]]
    _graph.remove_nodes_from(removed)

    for node, data in _graph.nodes(data=True):
        if 'OutEdge' in data:
            data['OutEdge'] = '//'.join(
                ['/'.join([str(y) for y in x]) for x in data['OutEdge']])
    removed = []
    for nA, nB, data in tqdm(_graph.edges(data=True), desc='remove edges', leave=False):
        if _graph.node[nA]['infomap'] != _graph.node[nB]['infomap'] \
                and data['FamilyWeight'] < 5:
            removed += [(nA, nB)]
    _graph.remove_edges_from(removed)

    args.api.save_graph(_graph, 'infomap', threshold, edgefilter)
    args.api.write_js_var('INFO', cluster_names, 'app', 'source', 'infomap-names.js')
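# Minimal runnable sketch of the per-cluster export above (assumes the
# third-party networkx package; the toy graph and glosses are made up):
import json
import networkx as nx
from networkx.readwrite import json_graph

g = nx.Graph()
g.add_edge('tree', 'wood', FamilyWeight=7)
sg = g.subgraph(['tree', 'wood'])
print(json.dumps(json_graph.adjacency_data(sg), sort_keys=True))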
def gbs_func(command, args, sources=None): # pragma: no cover def words(s): return set(slug(s.strip(), remove_whitespace=False).split()) log = args.log count = 0 api_url = "https://www.googleapis.com/books/v1/volumes?" if command == "cleanup": for fname in args.data_file("gbs").glob("*.json"): try: data = jsonlib.load(fname) if data.get("totalItems") == 0: remove(fname) except ValueError: remove(fname) return if not sources: sources = DBSession.query(common.Source).order_by(common.Source.id).options(joinedload(common.Source.data)) if callable(sources): sources = sources() for i, source in enumerate(page_query(sources, verbose=True, commit=True)): filepath = args.data_file("gbs", "source%s.json" % source.id) if command == "update": source.google_book_search_id = None source.update_jsondata(gbs={}) if command in ["verify", "update"]: if filepath.exists(): try: data = jsonlib.load(filepath) except ValueError: log.warn("no JSON object found in: %s" % filepath) continue if not data["totalItems"]: continue item = data["items"][0] else: continue if command == "verify": stitle = source.description or source.title or source.booktitle needs_check = False year = item["volumeInfo"].get("publishedDate", "").split("-")[0] if not year or year != slug(source.year or ""): needs_check = True twords = words(stitle) iwords = words(item["volumeInfo"]["title"] + " " + item["volumeInfo"].get("subtitle", "")) if ( twords == iwords or (len(iwords) > 2 and iwords.issubset(twords)) or (len(twords) > 2 and twords.issubset(iwords)) ): needs_check = False if int(source.id) == 241: log.info("%s" % sorted(words(stitle))) log.info("%s" % sorted(iwords)) if needs_check: log.info("------- %s -> %s" % (source.id, item["volumeInfo"].get("industryIdentifiers"))) log.info("%s %s" % (item["volumeInfo"]["title"], item["volumeInfo"].get("subtitle", ""))) log.info(stitle) log.info(item["volumeInfo"].get("publishedDate")) log.info(source.year) log.info(item["volumeInfo"].get("authors")) log.info(source.author) log.info(item["volumeInfo"].get("publisher")) log.info(source.publisher) if not confirm("Are the records the same?"): log.warn("---- removing ----") jsonlib.dump({"totalItems": 0}, filepath) elif command == "update": source.google_book_search_id = item["id"] source.update_jsondata(gbs=item) count += 1 elif command == "download": if source.author and (source.title or source.booktitle): title = source.title or source.booktitle if filepath.exists(): continue q = [ "inauthor:" + quote_plus(source.author.encode("utf8")), "intitle:" + quote_plus(title.encode("utf8")), ] if source.publisher: q.append("inpublisher:" + quote_plus(source.publisher.encode("utf8"))) url = api_url + "q=%s&key=%s" % ("+".join(q), args.api_key) count += 1 r = requests.get(url, headers={"accept": "application/json"}) log.info("%s - %s" % (r.status_code, url)) if r.status_code == 200: with open(as_posix(filepath), "w") as fp: fp.write(r.text.encode("utf8")) elif r.status_code == 403: log.warn("limit reached") break if command == "update": log.info("assigned gbs ids for %s out of %s sources" % (count, i)) elif command == "download": log.info("queried gbs for %s sources" % count)
def subgraph(args, neighbor_weight=None):
    args.api._log = args.log
    graphname = args.graphname or 'network'
    threshold = args.threshold or 1
    edgefilter = args.edgefilter
    neighbor_weight = neighbor_weight or 5

    _graph = args.api.load_graph(graphname, threshold, edgefilter)
    for node, data in _graph.nodes(data=True):
        generations = [{node}]
        while generations[-1] \
                and len(set.union(*generations)) < 30 and len(generations) < 3:
            nextgen = set.union(*[set(_graph[n].keys()) for n in generations[-1]])
            if len(nextgen) > 50:
                break  # pragma: no cover
            else:
                generations.append(
                    set.union(*[set(_graph[n].keys()) for n in generations[-1]]))
        data['subgraph'] = list(set.union(*generations))
    args.api.save_graph(_graph, 'subgraph', threshold, edgefilter)

    outdir = args.api.existing_dir('app', 'subgraph', clean=True)
    cluster_names = {}
    nodes2cluster = {}
    nidx = 1
    for node, data in tqdm(
            sorted(
                _graph.nodes(data=True),
                key=lambda x: len(x[1]['subgraph']),
                reverse=True),
            leave=False):
        nodes = tuple(sorted(data['subgraph']))
        sg = _graph.subgraph(data['subgraph'])
        if nodes not in nodes2cluster:
            d_ = sorted(sg.degree(), key=lambda x: x[1], reverse=True)
            d = [_graph.node[a]['Gloss'] for a, b in d_][0]
            nodes2cluster[nodes] = 'subgraph_{0}_{1}'.format(nidx, d)
            nidx += 1
        cluster_name = nodes2cluster[nodes]
        data['ClusterName'] = cluster_name
        for n, d in sg.nodes(data=True):
            d['OutEdge'] = []
            neighbors = [
                n_ for n_ in _graph
                if n_ in _graph[node]
                and _graph[node][n_]['FamilyWeight'] >= neighbor_weight
                and n_ not in sg]
            if neighbors:
                sg.node[node]['OutEdge'] = []
                for n_ in neighbors:
                    sg.node[node]['OutEdge'].append([
                        'subgraph_' + n_ + '_' + _graph.node[n]['Gloss'],
                        _graph.node[n_]['Gloss'],
                        _graph.node[n_]['Gloss'],
                        _graph[node][n_]['FamilyWeight'],
                        n_])
                    sg.node[node]['OutEdge'].append([
                        _graph.node[n]['ClusterName'],
                        _graph.node[n]['CentralConcept'],
                        _graph.node[n]['Gloss'],
                        _graph[node][n]['WordWeight'],
                        n])
        if len(sg) > 1:
            jsonlib.dump(
                json_graph.adjacency_data(sg),
                outdir / (cluster_name + '.json'),
                sort_keys=True)
            cluster_names[data['Gloss']] = cluster_name

    for node, data in _graph.nodes(data=True):
        if 'OutEdge' in data:
            data['OutEdge'] = '//'.join([str(x) for x in data['OutEdge']])
    args.api.write_js_var('SUBG', cluster_names, 'app', 'source', 'subgraph-names.js')
def json_dump(self, obj, *path):
    p = self.existing_dir(*path[:-1]) / path[-1]
    jsonlib.dump(obj, p, indent=2)
    self.file_written(p)
from collections import OrderedDict

from csvw.dsv import reader
from clldutils.jsonlib import dump
from sqlalchemy import create_engine

eth17 = OrderedDict()
for l in reader('LanguageCodes.tab', dicts=True, delimiter='\t'):
    eth17[l['LangID']] = l['Name']

db = create_engine('postgresql://robert@/asjp')
in_asjp = set(
    r[0] for r in db.execute(
        'select code_iso from doculect where code_iso is not null'))

missing = [(k, v) for k, v in eth17.items() if k not in in_asjp]
dump(missing, 'missing.json', indent=4)
def gbs_func(command, args, sources=None): # pragma: no cover def words(s): return set(slug(s.strip(), remove_whitespace=False).split()) log = args.log count = 0 api_url = "https://www.googleapis.com/books/v1/volumes?" if command == 'cleanup': for fname in args.data_file('gbs').glob('*.json'): try: data = jsonlib.load(fname) if data.get('totalItems') == 0: remove(fname) except ValueError: remove(fname) return if not sources: sources = DBSession.query(common.Source)\ .order_by(common.Source.id)\ .options(joinedload(common.Source.data)) if callable(sources): sources = sources() for i, source in enumerate(page_query(sources, verbose=True, commit=True)): filepath = args.data_file('gbs', 'source%s.json' % source.id) if command == 'update': source.google_book_search_id = None source.update_jsondata(gbs={}) if command in ['verify', 'update']: if filepath.exists(): try: data = jsonlib.load(filepath) except ValueError: log.warn('no JSON object found in: %s' % filepath) continue if not data['totalItems']: continue item = data['items'][0] else: continue if command == 'verify': stitle = source.description or source.title or source.booktitle needs_check = False year = item['volumeInfo'].get('publishedDate', '').split('-')[0] if not year or year != slug(source.year or ''): needs_check = True twords = words(stitle) iwords = words( item['volumeInfo']['title'] + ' ' + item['volumeInfo'].get('subtitle', '')) if twords == iwords \ or (len(iwords) > 2 and iwords.issubset(twords))\ or (len(twords) > 2 and twords.issubset(iwords)): needs_check = False if int(source.id) == 241: log.info('%s' % sorted(words(stitle))) log.info('%s' % sorted(iwords)) if needs_check: log.info('------- %s -> %s' % ( source.id, item['volumeInfo'].get('industryIdentifiers'))) log.info('%s %s' % ( item['volumeInfo']['title'], item['volumeInfo'].get('subtitle', ''))) log.info(stitle) log.info(item['volumeInfo'].get('publishedDate')) log.info(source.year) log.info(item['volumeInfo'].get('authors')) log.info(source.author) log.info(item['volumeInfo'].get('publisher')) log.info(source.publisher) if not confirm('Are the records the same?'): log.warn('---- removing ----') jsonlib.dump({"totalItems": 0}, filepath) elif command == 'update': source.google_book_search_id = item['id'] source.update_jsondata(gbs=item) count += 1 elif command == 'download': if source.author and (source.title or source.booktitle): title = source.title or source.booktitle if filepath.exists(): continue q = [ 'inauthor:' + quote_plus(source.author.encode('utf8')), 'intitle:' + quote_plus(title.encode('utf8')), ] if source.publisher: q.append('inpublisher:' + quote_plus( source.publisher.encode('utf8'))) url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key) count += 1 r = requests.get(url, headers={'accept': 'application/json'}) log.info('%s - %s' % (r.status_code, url)) if r.status_code == 200: with open(as_posix(filepath), 'w') as fp: fp.write(r.text.encode('utf8')) elif r.status_code == 403: log.warn("limit reached") break if command == 'update': log.info('assigned gbs ids for %s out of %s sources' % (count, i)) elif command == 'download': log.info('queried gbs for %s sources' % count)