def run(args):
    ds = get_dataset(args)
    print(ds)
    with Table('key', 'value') as md:
        md.extend(ds.properties.items())
    print()
    with Table('Path', 'Type', 'Rows') as t:
        for p, type_, r in ds.stats():
            t.append([p, type_, r])
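# The commands in this section all render tabular output through a Table
# context manager from the respective cli utilities (usually taking an `args`
# namespace plus column names). A minimal, hypothetical stand-in illustrating
# the pattern (collect rows, render on exit), not the real implementation:
class Table(list):
    def __init__(self, *cols):
        super().__init__()
        self.columns = list(cols)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is None:
            rows = [self.columns] + [[str(c) for c in row] for row in self]
            widths = [max(len(r[i]) for r in rows) for i in range(len(self.columns))]
            for row in rows:
                print('  '.join(c.ljust(w) for c, w in zip(row, widths)))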
def run(args):
    if args.citations:  # pragma: no cover
        for dataset in args.repos.index:
            print('> {}\n'.format(dataset.citation))
        return

    if args.datasets_only:
        with Table(args, 'dataset', 'files', 'size') as t:
            totalfiles, totalsize = 0, 0
            for dataset in args.repos.index:
                totalfiles += len(dataset.files)
                totalsize += sum(f.size for f in dataset.files)
                t.append([
                    dataset.name,
                    len(dataset.files),
                    format_size(sum(f.size for f in dataset.files))])
            t.append([
                'total: {} datasets'.format(len(args.repos.index)),
                totalfiles,
                format_size(totalsize)])
        return

    if args.index:  # pragma: no cover
        args.format = 'pipe'
        print("""# Content

[georoc.sqlite.gz](georoc.sqlite.gz) contains data from
[GEOROC's precompiled datasets](https://data.goettingen-research-online.de/dataverse/digis)
as listed below.
""")
    with Table(args, 'file', 'dataset', 'size', 'last modified') as t:
        if args.samples:
            t.columns.append('# samples')
        if args.references:
            t.columns.append('# references')
        t.columns.append('path')
        for ds in args.repos.index:
            if not args.dataset or (args.dataset in ds.name):
                for f in ds.files:
                    row = [
                        '[{}]({})'.format(f.id, f.md['pidURL']),
                        ds.name,
                        format_size(f.size),
                        f.date,
                    ]
                    if args.samples:
                        row.append(len(list(f.iter_samples(args.repos, stdout=None))))
                    if args.references:
                        row.append(len(list(f.iter_references(args.repos))))
                    row.append(f.name)
                    t.append(row)
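# format_size above is assumed to turn byte counts into human-readable strings
# (in the real code it presumably comes from a shared utility such as clldutils);
# a rough, hypothetical stand-in for illustration:
def format_size(num):
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if abs(num) < 1024.0:
            return '{:.1f}{}'.format(num, unit)
        num /= 1024.0
    return '{:.1f}PB'.format(num)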
def run(args):
    res = GBIF()(args.service, q=args.query)
    if args.service == 'suggest':
        cols = ['key', 'scientificName', 'rank', 'status']
        with Table(args, *cols) as table:
            for row in res:
                if not row['synonym']:
                    table.append([row.get(col) for col in cols])
    else:
        cols = ['key', 'scientificName', 'rank', 'taxonomicStatus']
        with Table(args, *cols) as table:
            for row in res['results']:
                if not row['synonym'] and 'nubKey' not in row and 'kingdom' in row:
                    table.append([row.get(col) for col in cols])
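# The GBIF() wrapper is assumed to query the GBIF species API; a minimal
# standard-library sketch of the two services the command distinguishes
# ('suggest' returns a bare list, 'search' wraps matches in a 'results' key).
# The endpoint layout is an assumption based on the public api.gbif.org v1 API:
import json
import urllib.parse
import urllib.request

def gbif(service, q):
    url = 'https://api.gbif.org/v1/species/{}?{}'.format(
        service, urllib.parse.urlencode({'q': q}))
    with urllib.request.urlopen(url) as resp:
        return json.loads(resp.read().decode('utf8'))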
def run(args):
    ds = get_dataset(args)
    forms = []
    for row in ds.cldf_reader()['FormTable']:
        if row['Language_ID'] == args.language_id or not args.language_id:
            forms.append(row)
    P = syllable_inventories(forms, format=args.prosody_format)
    bipa = args.clts.from_config().api.transcriptionsystem_dict['bipa']
    table = []
    if args.display == 'long':
        header = ['Language', 'Sound', 'Template', 'Frequency']
        for language, data in P.items():
            for sound, templates in data.items():
                for template, frequency in templates.items():
                    table += [[language, sound, template, len(frequency)]]
    else:
        header = ['Language', 'Sound', 'Class', 'Frequency', 'Templates']
        for language, data in P.items():
            for sound, templates in data.items():
                table += [[
                    language,
                    sound,
                    bipa[sound].type,
                    sum([len(x) for x in templates.values()]),
                    ', '.join([
                        '{0}:{1}'.format(x, len(y)) for x, y in templates.items()]),
                ]]
    with Table(args, *header, rows=table):
        pass
def run(args):
    if args.unloaded:
        i = 0
        # count from 1 so that a single installed dataset is not reported as none
        for i, ds in enumerate(iter_datasets('lexibank.dataset'), start=1):
            print(ds.cldf_dir)
        if not i:
            print('No datasets installed')  # pragma: no cover
        return

    with Table(
            args,
            '#',
            'Dataset',
            'Parameters',
            'Concepticon',
            'Varieties',
            'Glottocodes',
            'Families',
    ) as table:
        try:
            concept_counts = {
                r[0]: r[1:] for r in args.repos.db.fetchall('concepts_by_dataset')}
        except sqlite3.OperationalError:  # pragma: no cover
            print('No datasets loaded yet')
            return

        varieties = args.repos.db.varieties
        var_counts = {}
        for dsid, vs in itertools.groupby(varieties, lambda v: v.source):
            vs = list(vs)
            var_counts[dsid] = (
                len(vs),
                len(set(v.glottocode for v in vs)),
                len(set(v.family for v in vs)))

        for count, d in enumerate(args.repos.db.datasets):
            table.append([
                count + 1,
                d.replace('lexibank-', ''),
                concept_counts[d][1],
                concept_counts[d][0],
                var_counts.get(d, [0])[0],
                var_counts.get(d, [0, 0])[1],
                var_counts.get(d, [0, 0, 0])[2],
            ])
        table.append([
            '',
            'TOTAL',
            0,
            args.repos.db.fetchone("""\
select count(distinct p.concepticon_id)
from parametertable as p, formtable as f, languagetable as l
where
    f.parameter_id = p.id and f.dataset_id = p.dataset_id
    and f.language_id = l.id and f.dataset_id = l.dataset_id
    and l.glottocode is not null
    and l.family != 'Bookkeeping'
""")[0],
            len(varieties),
            len(set(v.glottocode for v in varieties)),
            len(set(v.family for v in varieties)),
        ])
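# The per-dataset variety counts above rely on itertools.groupby, which only
# groups *consecutive* items, so `varieties` must already be ordered by source.
# A self-contained sketch of the same counting pattern (Variety and the sample
# values are stand-ins):
import collections
import itertools

Variety = collections.namedtuple('Variety', ['source', 'glottocode', 'family'])
varieties = sorted([
    Variety('lexibank-aaa', 'abcd1234', 'FamA'),
    Variety('lexibank-bbb', 'efgh1234', 'FamB'),
    Variety('lexibank-aaa', 'abcd1234', 'FamA'),
], key=lambda v: v.source)

var_counts = {}
for dsid, vs in itertools.groupby(varieties, lambda v: v.source):
    vs = list(vs)
    var_counts[dsid] = (
        len(vs),
        len(set(v.glottocode for v in vs)),
        len(set(v.family for v in vs)))
# var_counts == {'lexibank-aaa': (2, 1, 1), 'lexibank-bbb': (1, 1, 1)}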
def run(args):
    with Table(args, 'id', 'name', 'type', 'variables', 'societies') as t:
        for ds in args.repos.datasets:
            t.append([ds.id, ds.name, ds.type, len(ds.variables), len(ds.societies)])
def run(args):
    conn = sqlite3.connect(args.dbpath or args.repos.dbpath)
    cu = conn.cursor()
    print("""# Database statistics
""")
    with Table(args, 'table', '# rows') as t:
        for table in ['file', 'reference', 'sample', 'citation']:
            cu.execute('SELECT count(*) FROM {}'.format(table))
            t.append([table, cu.fetchone()[0]])
def run(args):
    contribs = {c.id: c.name for c in args.repos.contributors}
    with Table(args, '#', 'ID', 'Title', 'Patrons') as t:
        for i, f in enumerate(args.repos.features.values(), start=1):
            t.append([
                i,
                f.id,
                textwrap.shorten(f.wiki['title'], 50),
                ' '.join([contribs[abbr] for abbr in f.patrons]),
            ])
def run(args):
    concepts = set()
    columns = collections.defaultdict(list)
    headers = (
        ['No', 'Dataset', 'Field', 'Ln', 'Norare', 'Structure', 'Type']
        if args.columns else
        ['ID', 'Author', 'Year', 'Languages', 'Tags', 'Ratings', 'Concepts'])
    with Table(args, *headers) as table:
        for i, ds in progressbar(enumerate(args.api.datasets.values())):
            if not args.columns:
                table.append([
                    ds.id,
                    ds.author.replace(' AND ', ' and ').split(' and ')[0],
                    ds.year,
                    ', '.join(ds.source_language[:3]),
                    ', '.join(ds.tags),
                    len(ds.columns) - 3,
                    len(ds.concepts),
                ])
                concepts.update(ds.concepts)
            else:
                for column in ds.columns:
                    if column not in [
                        'concepticon_id', 'concepticon_gloss', 'line_in_source',
                        'english', 'german', 'polish', 'spanish', 'chinese',
                        'french', 'dutch',
                    ]:
                        columns[(ds.id, column)] += [(
                            ds.columns[column].language,
                            ds.columns[column].norare,
                            ds.columns[column].structure,
                            ds.columns[column].type,
                        )]
        if not args.columns:
            table.append([
                '-', 'TOTAL', '-', '-', '-',
                sum([x[-2] for x in table]),
                len(concepts),
            ])
        else:
            for i, (k, v) in enumerate(
                    sorted(columns.items(), key=lambda x: (x[0][1], x[0][0]))):
                table.append((
                    i + 1,
                    k[0],
                    k[1],
                    ', '.join(list(set([x[0] for x in v])))[:30],
                    ', '.join(list(set([x[1] for x in v])))[:15],
                    ', '.join(list(set([x[2] for x in v])))[:15],
                    ', '.join(list(set([x[3] for x in v])))[:15],
                ))
def run(args):
    try:
        fts.get_index(args.repos, must_exist=True)
    except ValueError:
        raise ParserError('Index does not exist. Run "glottolog searchindex" first!')
    count, results = fts.search(args.repos, args.query)
    with Table('ID', 'Author', 'Year', 'Title') as table:
        for res in results:
            table.append([res.id, res.author, res.year, res.title])
    print('({} matches)'.format(count))
def run(args):
    attrs = collections.Counter()
    for cl in args.repos.conceptlists.values():
        attrs.update(cl.attributes)
    with Table(
            args,
            'Attribute',
            'Occurrences',
            rows=[(k, v) for k, v in attrs.most_common() if v >= args.min_occurs]):
        pass
def run(args):
    found = args.repos.lookup(
        args.gloss,
        language=args.language,
        full_search=args.full_search,
        similarity_level=args.similarity,
    )
    with Table(args, "GLOSS", "CONCEPTICON_ID", "CONCEPTICON_GLOSS", "SIMILARITY") as t:
        for matches in found:
            t.extend(matches)
def run(args):
    ops = collections.defaultdict(collections.Counter)
    for lang in args.repos.languoids():
        for secname, sec in lang.cfg.items():
            ops[secname].update(opt for opt, val in sec.items() if val)
    ops.pop('DEFAULT', None)
    with Table('section', 'option', 'count') as table:
        for section, options in ops.items():
            table.append([section, '', float(sum(options.values()))])
            for k, n in options.most_common():
                table.append(['', k, float(n)])
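# The aggregation above uses a defaultdict of Counters: one counter per config
# section, incremented only for options that have a non-empty value. A minimal
# sketch with plain dicts standing in for languoid config sections:
import collections

ops = collections.defaultdict(collections.Counter)
sections = {
    'altnames': {'wals': 'Foo', 'ruhlen': ''},
    'sources': {'glottolog': 'bar2005', 'extra': 'baz2010'},
}
for secname, sec in sections.items():
    ops[secname].update(opt for opt, val in sec.items() if val)
# ops['altnames'] == Counter({'wals': 1}); the empty 'ruhlen' value is skipped.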
def run(args):
    nexus_obj = get_reader(args, required_blocks=['data'])
    print(nexus_obj.filename)
    with Table(args, 'Taxon', 'Characters') as t:
        for taxon in sorted(nexus_obj.data.matrix):
            tally = collections.Counter()
            for site in nexus_obj.data.matrix[taxon]:
                tally.update([site])
            t.append([
                taxon,
                ", ".join(['%s x %s' % (k, tally[k]) for k in sorted(tally)]),
            ])
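# Each matrix row is summarised as a frequency table of its character states;
# the same tally applied to a made-up sequence of states:
import collections

tally = collections.Counter()
for site in '0011?0':
    tally.update([site])
print(", ".join(['%s x %s' % (k, tally[k]) for k in sorted(tally)]))
# prints: 0 x 3, 1 x 2, ? x 1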
def run(args):
    corpus = get_corpus(args)
    with Table('type', 'count') as t:
        e, w, m = corpus.get_stats()
        t.append(['example', e])
        t.append(['word', w])
        t.append(['morpheme', m])
    if e:
        print('\nExample properties:')
        for igt in corpus:
            for k in igt.properties.keys():
                print(' ' + k)
            break
def run(args):
    print('Summary:')
    print(' {0:,} objects with {1:,} bitstreams of total size {2}'.format(
        len(args.catalog),
        sum(len(obj.bitstreams) for obj in args.catalog),
        args.catalog.size_h))
    print(' {0} duplicate bitstreams'.format(
        sum(1 for objs in args.catalog.md5_to_object.values() if len(objs) > 1)))
    print(' {0} objects with no bitstreams'.format(
        sum(1 for obj in args.catalog if not obj.bitstreams)))
    print()
    types = collections.Counter(
        itertools.chain(*[[bs.mimetype for bs in obj.bitstreams] for obj in args.catalog]))
    with Table('maintype', 'subtype', 'bitstreams') as table:
        for maintype, items in itertools.groupby(
                sorted(types.items(), key=lambda p: (p[0].split('/')[0], -p[1])),
                lambda p: p[0].split('/')[0]):
            for k, v in items:
                table.append([maintype, k.split('/')[1], v])
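# The MIME-type breakdown sorts the (type, count) pairs by main type and then
# by descending count before grouping, since itertools.groupby only merges
# consecutive keys. The same logic on made-up counts:
import collections
import itertools

types = collections.Counter({'image/png': 10, 'image/jpeg': 4, 'text/plain': 7})
for maintype, items in itertools.groupby(
        sorted(types.items(), key=lambda p: (p[0].split('/')[0], -p[1])),
        lambda p: p[0].split('/')[0]):
    for k, v in items:
        print(maintype, k.split('/')[1], v)
# image png 10
# image jpeg 4
# text plain 7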
def run(args):
    issues = collections.OrderedDict([(i.id, i) for i in args.repos.issues])
    if args.id:
        issue = issues[args.id]
        print(colored('{0.created} {0.user}'.format(issue), 'green'))
        print(colored(issue.title, attrs={'bold', 'underline'}))
        print('')
        print(issue.body)
        for comment in issue.comments:
            print(colored('\n {0.created} {0.user}'.format(comment), 'green'))
            for line in comment.stripped_body.split('\n'):
                print(' ' + line)
        return

    with Table(args, 'ID', 'Title', 'User', 'Comments') as t:
        for issue in issues.values():
            if issue.coding_problem:
                t.append([
                    issue.id,
                    textwrap.shorten(issue.title, 50, placeholder=' […]'),
                    issue.user,
                    len(issue.comments)])
def run(args):
    args.repos._log = args.log

    def clean(word):
        return "".join([w for w in word if w not in '/,;"'])

    varieties = args.repos.db.varieties
    args.log.info("Adding nodes to the graph")
    G = nx.Graph()
    for concept in args.repos.db.iter_concepts():
        G.add_node(concept.id, **concept.as_node_attrs())

    args.log.info("Adding edges to the graph")
    for v_, formA, formB in args.repos.iter_colexifications(varieties):
        if not G[formA.concepticon_id].get(formB.concepticon_id, False):
            G.add_edge(
                formA.concepticon_id,
                formB.concepticon_id,
                words=set(),
                languages=set(),
                families=set(),
                wofam=[],
            )
        G[formA.concepticon_id][formB.concepticon_id]["words"].add(
            (formA.gid, formB.gid))
        G[formA.concepticon_id][formB.concepticon_id]["languages"].add(v_.gid)
        G[formA.concepticon_id][formB.concepticon_id]["families"].add(v_.family)
        G[formA.concepticon_id][formB.concepticon_id]["wofam"].append("/".join([
            formA.gid,
            formB.gid,
            formA.clics_form,
            v_.gid,
            v_.family,
            clean(formA.form),
            clean(formB.form),
        ]))

    # If either the colex2lang or colexstats files are requested, build a map of
    # variety name to Glottocode, a map of concepts, and collect the statistics.
    if any([args.colex2lang, args.colexstats]):
        lang_map = {
            "%s-%s" % (dataset_id, lang_id): glottocode
            for dataset_id, lang_id, glottocode in args.repos.db.fetchall(
                "SELECT dataset_ID, ID, Glottocode FROM languagetable")
        }

        # Collect lists of concepts for each language variety, so that we can
        # check if a colexification would be possible in it (i.e., if there is
        # enough data).
        concepts = defaultdict(set)
        for dataset_id, lang_id, concepticon_id in args.repos.db.fetchall("""
                SELECT f.dataset_ID, f.Language_ID, p.Concepticon_ID
                FROM formtable AS f, parametertable AS p
                WHERE f.Parameter_ID = p.ID AND f.dataset_ID = p.dataset_ID"""):
            concepts["%s-%s" % (dataset_id, lang_id)].add(concepticon_id)

        # Iterate over all edges and collect data
        all_counts, threshold_counts = defaultdict(int), defaultdict(int)
        all_possible, threshold_possible = defaultdict(int), defaultdict(int)
        colex2lang = defaultdict(set)
        for concept_a, concept_b, data in G.edges(data=True):
            # Collect concept2languages info
            for lang, glottocode in lang_map.items():
                if lang in data["languages"]:
                    colex2lang[concept_a, concept_b].add(glottocode)

            # Collect language colexification affinity (David Gil's request).
            # Don't consider the edge if we don't have at least one language in it.
            if not data["languages"]:
                continue

            # Check if the current concept pair passes the threshold filter
            filter_family, filter_lang, filter_words = True, True, True
            if args.edgefilter == "families":
                filter_family = len(data["families"]) >= args.threshold
            if args.edgefilter == "languages":
                filter_lang = len(data["languages"]) >= args.threshold
            if args.edgefilter == "words":
                filter_words = len(data["words"]) >= args.threshold
            pass_filter = all([filter_family, filter_lang, filter_words])

            # Inspect all languages
            for lang in lang_map:
                if lang in data["languages"]:
                    all_counts[lang] += 1
                    all_possible[lang] += 1
                    if pass_filter:
                        threshold_counts[lang] += 1
                        threshold_possible[lang] += 1
                else:
                    if concept_a in concepts[lang] and concept_b in concepts[lang]:
                        all_possible[lang] += 1
                        if pass_filter:
                            threshold_possible[lang] += 1

    edges = {}
    for edgeA, edgeB, data in G.edges(data=True):
        edges[edgeA, edgeB] = (
            len(data["families"]),
            len(data["languages"]),
            len(data["words"]),
        )

    ignore_edges = []
    for edgeA, edgeB, data in G.edges(data=True):
        data["WordWeight"] = len(data["words"])
        data["words"] = ";".join(
            sorted(["{0}/{1}".format(x, y) for x, y in data["words"]]))
        data["FamilyWeight"] = len(data["families"])
        data["families"] = ";".join(sorted(data["families"]))
        data["LanguageWeight"] = len(data["languages"])
        data["languages"] = ";".join(data["languages"])
        data["wofam"] = ";".join(data["wofam"])
        if args.edgefilter == "families" and data["FamilyWeight"] < args.threshold:
            ignore_edges.append((edgeA, edgeB))
        elif args.edgefilter == "languages" and data["LanguageWeight"] < args.threshold:
            ignore_edges.append((edgeA, edgeB))
        elif args.edgefilter == "words" and data["WordWeight"] < args.threshold:
            ignore_edges.append((edgeA, edgeB))

    G.remove_edges_from(ignore_edges)

    nodenames = {
        r[0]: r[1]
        for r in args.repos.db.fetchall(
            "select distinct concepticon_id, concepticon_gloss from parametertable")
    }

    with Table(
            args, "ID A", "Concept A", "ID B", "Concept B",
            "Families", "Languages", "Words") as table:
        count = 0
        for (nodeA, nodeB), (fc, lc, wc) in sorted(
                edges.items(), key=lambda i: i[1], reverse=True):
            if (nodeA, nodeB) not in ignore_edges:
                table.append([
                    nodeA, nodenames[nodeA], nodeB, nodenames[nodeB], fc, lc, wc])
                count += 1
                if count >= args.show:
                    break

    print(args.repos.save_graph(G, args.graphname, args.threshold, args.edgefilter))

    # Output colex2lang info
    if args.colex2lang:
        with open(args.colex2lang, "w") as tsvfile:
            tsvfile.write("CONCEPT_A\tCONCEPT_B\tGLOTTOCODES\n")
            for entry, langs in colex2lang.items():
                tsvfile.write("%s\t%s\t%s\n" % (entry[0], entry[1], ",".join(langs)))

    # Output per-language info
    if args.colexstats:
        with open(args.colexstats, "w") as tsvfile:
            writer = csv.DictWriter(
                tsvfile,
                delimiter="\t",
                fieldnames=[
                    "LANG_KEY",
                    "GLOTTOCODE",
                    "COLEXIFICATIONS_ALL",
                    "POTENTIAL_ALL",
                    "COLEXIFICATIONS_THRESHOLD",
                    "POTENTIAL_THRESHOLD",
                ],
            )
            writer.writeheader()
            for lang in sorted(lang_map):
                writer.writerow({
                    "LANG_KEY": lang,
                    "GLOTTOCODE": lang_map[lang],
                    "COLEXIFICATIONS_ALL": all_counts[lang],
                    "POTENTIAL_ALL": all_possible[lang],
                    "COLEXIFICATIONS_THRESHOLD": threshold_counts[lang],
                    "POTENTIAL_THRESHOLD": threshold_possible[lang],
                })
def run(args):
    from pyclics.util import parse_kwargs

    algo = args.algorithm
    if algo not in args.repos.cluster_algorithms:
        with Table(args, 'algorithm', 'description') as table:
            for name, desc in args.repos.cluster_algorithms.items():
                table.append((name, desc))
        if args.algorithm != '_':
            raise argparse.ArgumentError(
                None, 'Unknown cluster algorithm: {0}'.format(algo))
        return

    if not args.repos.repos.joinpath('app', 'source', 'words.json').exists():
        raise argparse.ArgumentError(None, '"clics makeapp" must be run first')

    graph = args.repos.load_graph(args.graphname, args.threshold, args.edgefilter)
    args.log.info('graph loaded')
    kw = vars(args)
    kw.update(parse_kwargs(*args.args))
    neighbor_weight = int(kw.pop('neighbor_weight', 5))
    clusters = sorted(
        args.repos.get_clusterer(algo)(graph, vars(args)), key=lambda c: (-len(c), c))
    args.log.info('computed clusters')

    D, Com = {}, collections.defaultdict(list)
    for i, cluster in enumerate(clusters, start=1):
        for vertex in cluster:
            D[vertex] = str(i)
            Com[i].append(vertex)

    # Annotate the graph with the cluster info:
    for node, data in graph.nodes(data=True):
        data.update({algo: D.get(node, '0'), 'ClusterName': '', 'CentralConcept': ''})

    # get the articulation points etc. immediately
    for idx, nodes in sorted(Com.items()):
        sg = graph.subgraph(nodes)
        if len(sg) > 1:
            d_ = sorted(sg.degree(), key=lambda x: x[1], reverse=True)
            d = [graph.node[a]['Gloss'] for a, b in d_][0]
            cluster_name = '{0}_{1}_{2}'.format(algo, idx, d)
        else:
            d = graph.node[nodes[0]]['Gloss']
            cluster_name = '{0}_{1}_{2}'.format(algo, idx, graph.node[nodes[0]]['Gloss'])
        for node in nodes:
            graph.node[node]['ClusterName'] = cluster_name
            graph.node[node]['CentralConcept'] = d
    args.log.info('computed cluster names')

    cluster_dir = args.repos.existing_dir('app', 'cluster', algo, clean=True)
    cluster_names = {}
    removed = []
    for idx, nodes in tqdm(sorted(Com.items()), desc='export to app', leave=False):
        sg = graph.subgraph(nodes)
        for node, data in sg.nodes(data=True):
            data['OutEdge'] = []
            neighbors = [
                n for n in graph
                if n in graph[node]
                and graph[node][n]['FamilyWeight'] >= neighbor_weight
                and n not in sg]
            if neighbors:
                sg.node[node]['OutEdge'] = []
                for n in neighbors:
                    sg.node[node]['OutEdge'].append([
                        graph.node[n]['ClusterName'],
                        graph.node[n]['CentralConcept'],
                        graph.node[n]['Gloss'],
                        graph[node][n]['WordWeight'],
                        n,
                    ])
        if len(sg) > 1:
            fn = cluster_dir / (
                (str(idx) if algo == 'subgraph' else graph.node[nodes[0]]['ClusterName'])
                + '.json')
            jsonlib.dump(json_graph.adjacency_data(sg), fn, sort_keys=True)
            for node in nodes:
                cluster_names[graph.node[node]['Gloss']] = fn.stem
        else:
            removed += [list(nodes)[0]]
    graph.remove_nodes_from(removed)

    for node, data in graph.nodes(data=True):
        if 'OutEdge' in data:
            data['OutEdge'] = '//'.join(
                ['/'.join([str(y) for y in x]) for x in data['OutEdge']])

    removed = []
    for nA, nB, data in tqdm(graph.edges(data=True), desc='remove edges', leave=False):
        if graph.node[nA][algo] != graph.node[nB][algo] and data['FamilyWeight'] < 5:
            removed += [(nA, nB)]
    graph.remove_edges_from(removed)

    args.repos.save_graph(graph, algo, args.threshold, args.edgefilter)
    args.repos.write_js_var(algo, cluster_names, 'app', 'source', 'cluster-names.js')
def run(args):
    with Table(args, 'DOI', 'topic') as t:
        for ex in args.api.experiments:
            t.append([ex.doi, ex.parameter])