Example #1
def run(args):
    ds = get_dataset(args)
    print(ds)
    with Table('key', 'value') as md:
        md.extend(ds.properties.items())
    print()
    with Table('Path', 'Type', 'Rows') as t:
        for p, type_, r in ds.stats():
            t.append([p, type_, r])
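Every snippet in this listing drives its output through a `Table` context manager from the respective project's CLI utilities: rows are collected inside the `with` block and rendered when it exits (honouring `args.format` where an `args` namespace is passed). To try the snippets standalone, a rough stand-in could look like the sketch below; the class body is an assumption for illustration, not the actual library code.

# Illustrative stand-in only (an assumption, not the real helper from any of
# the projects excerpted here): a minimal Table that collects rows and prints
# a plain-text table when the `with` block exits.
class Table(list):

    def __init__(self, *args, rows=None):
        # Call sites differ: some pass an argparse namespace first, then the
        # column names; others pass column names only. Keep the strings.
        self.columns = [a for a in args if isinstance(a, str)]
        super().__init__(rows or [])

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is not None:
            return  # let exceptions propagate unchanged
        rows = [self.columns] + [[str(cell) for cell in row] for row in self]
        widths = [
            max(len(row[i]) for row in rows if i < len(row))
            for i in range(len(self.columns))]
        for row in rows:
            print('  '.join(cell.ljust(w) for cell, w in zip(row, widths)))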
Example #2
def run(args):
    if args.citations:  # pragma: no cover
        for dataset in args.repos.index:
            print('> {}\n'.format(dataset.citation))
        return

    if args.datasets_only:
        with Table(args, 'dataset', 'files', 'size') as t:
            totalfiles, totalsize = 0, 0
            for dataset in args.repos.index:
                totalfiles += len(dataset.files)
                totalsize += sum(f.size for f in dataset.files)
                t.append([
                    dataset.name,
                    len(dataset.files),
                    format_size(sum(f.size for f in dataset.files))])
            t.append([
                'total: {} datasets'.format(len(args.repos.index)),
                totalfiles,
                format_size(totalsize)])
        return

    if args.index:  # pragma: no cover
        args.format = 'pipe'
        print("""# Content

[georoc.sqlite.gz](georoc.sqlite.gz) contains data from
[GEOROC's precompiled datasets](https://data.goettingen-research-online.de/dataverse/digis)
as listed below.
""")

    with Table(args, 'file', 'dataset', 'size', 'last modified') as t:
        if args.samples:
            t.columns.append('# samples')
        if args.references:
            t.columns.append('# references')
        t.columns.append('path')
        for ds in args.repos.index:
            if not args.dataset or (args.dataset in ds.name):
                for f in ds.files:
                    row = [
                        '[{}]({})'.format(f.id, f.md['pidURL']),
                        ds.name,
                        format_size(f.size),
                        f.date
                    ]
                    if args.samples:
                        row.append(len(list(f.iter_samples(args.repos, stdout=None))))
                    if args.references:
                        row.append(len(list(f.iter_references(args.repos))))
                    row.append(f.name)
                    t.append(row)
Example #3
def run(args):
    res = GBIF()(args.service, q=args.query)
    if args.service == 'suggest':
        cols = ['key', 'scientificName', 'rank', 'status']
        with Table(args, *cols) as table:
            for row in res:
                if not row['synonym']:
                    table.append([row.get(col) for col in cols])
    else:
        cols = ['key', 'scientificName', 'rank', 'taxonomicStatus']
        with Table(args, *cols) as table:
            for row in res['results']:
                if not row['synonym'] and 'nubKey' not in row and 'kingdom' in row:
                    table.append([row.get(col) for col in cols])
Example #4
def run(args):

    ds = get_dataset(args)
    forms = []
    for row in ds.cldf_reader()['FormTable']:
        if row['Language_ID'] == args.language_id or not args.language_id:
            forms.append(row)

    P = syllable_inventories(forms, format=args.prosody_format)
    bipa = args.clts.from_config().api.transcriptionsystem_dict['bipa']

    table = []
    if args.display == 'long':
        header = ['Language', 'Sound', 'Template', 'Frequency']
        for language, data in P.items():
            for sound, templates in data.items():
                for template, frequency in templates.items():
                    table += [[language, sound, template, len(frequency)]]
    else:
        header = ['Language', 'Sound', 'Class', 'Frequency', 'Templates']
        for language, data in P.items():
            for sound, templates in data.items():
                table += [[
                    language, sound, bipa[sound].type,
                    sum([len(x) for x in templates.values()]), ', '.join([
                        '{0}:{1}'.format(x, len(y))
                        for x, y in templates.items()
                    ])
                ]]

    with Table(args, *header, rows=table):
        pass
Example #5
def run(args):
    if args.unloaded:
        # start=1 so the emptiness check below is also correct when exactly
        # one dataset is installed
        count = 0
        for count, ds in enumerate(iter_datasets('lexibank.dataset'), start=1):
            print(ds.cldf_dir)
        if not count:
            print('No datasets installed')  # pragma: no cover
        return

    with Table(
            args,
            '#',
            'Dataset',
            'Parameters',
            'Concepticon',
            'Varieties',
            'Glottocodes',
            'Families',
    ) as table:
        try:
            concept_counts = {
                r[0]: r[1:]
                for r in args.repos.db.fetchall('concepts_by_dataset')
            }
        except sqlite3.OperationalError:  # pragma: no cover
            print('No datasets loaded yet')
            return

        varieties = args.repos.db.varieties
        var_counts = {}
        for dsid, vs in itertools.groupby(varieties, lambda v: v.source):
            vs = list(vs)
            var_counts[dsid] = (len(vs), len(set(v.glottocode for v in vs)),
                                len(set(v.family for v in vs)))

        for count, d in enumerate(args.repos.db.datasets):
            table.append([
                count + 1,
                d.replace('lexibank-', ''),
                concept_counts[d][1],
                concept_counts[d][0],
                var_counts.get(d, [0])[0],
                var_counts.get(d, [0, 0])[1],
                var_counts.get(d, [0, 0, 0])[2],
            ])
        table.append([
            '', 'TOTAL', 0,
            args.repos.db.fetchone("""\
select
    count(distinct p.concepticon_id) from parametertable as p, formtable as f, languagetable as l
where
    f.parameter_id = p.id and f.dataset_id = p.dataset_id
    and f.language_id = l.id and f.dataset_id = l.dataset_id
    and l.glottocode is not null
    and l.family != 'Bookkeeping'
""")[0],
            len(varieties),
            len(set(v.glottocode for v in varieties)),
            len(set(v.family for v in varieties))
        ])
Example #6
def run(args):
    with Table(args, 'id', 'name', 'type', 'variables', 'societies') as t:
        for ds in args.repos.datasets:
            t.append([
                ds.id, ds.name, ds.type,
                len(ds.variables),
                len(ds.societies)
            ])
Example #7
def run(args):
    conn = sqlite3.connect(args.dbpath or args.repos.dbpath)
    cu = conn.cursor()
    print("""# Database statistics
""")
    with Table(args, 'table', '# rows') as t:
        for table in ['file', 'reference', 'sample', 'citation']:
            cu.execute('SELECT count(*) FROM {}'.format(table))
            t.append([table, cu.fetchone()[0]])
Example #8
def run(args):
    contribs = {c.id: c.name for c in args.repos.contributors}
    with Table(args, '#', 'ID', 'Title', 'Patrons') as t:
        for i, f in enumerate(args.repos.features.values(), start=1):
            t.append([
                i, f.id,
                textwrap.shorten(f.wiki['title'], 50),
                ' '.join([contribs[abbr] for abbr in f.patrons])
            ])
Example #9
def run(args):
    concepts = set()
    columns = collections.defaultdict(list)

    headers = ['No', 'Dataset', 'Field', 'Ln', 'Norare', 'Structure', 'Type'] \
        if args.columns else ['ID', 'Author', 'Year', 'Languages', 'Tags', 'Ratings', 'Concepts']
    with Table(args, *headers) as table:
        for i, ds in progressbar(enumerate(args.api.datasets.values())):
            if not args.columns:
                table.append([
                    ds.id,
                    ds.author.replace(' AND ', ' and ').split(' and ')[0],
                    ds.year,
                    ', '.join(ds.source_language[:3]),
                    ', '.join(ds.tags),
                    len(ds.columns) - 3,
                    len(ds.concepts)
                ])
                concepts.update(ds.concepts)
            else:
                for column in ds.columns:
                    if column not in [
                            'concepticon_id',
                            'concepticon_gloss',
                            'line_in_source',
                            'english',
                            'german',
                            'polish',
                            'spanish',
                            'chinese',
                            'french',
                            'dutch',
                    ]:
                        columns[(ds.id, column)] += [(
                            ds.columns[column].language,
                            ds.columns[column].norare,
                            ds.columns[column].structure,
                            ds.columns[column].type,
                        )]
        if not args.columns:
            table.append([
                '-', 'TOTAL', '-', '-', '-',
                sum([x[-2] for x in table]),
                len(concepts)
            ])
        else:
            for i, (k, v) in enumerate(
                    sorted(columns.items(), key=lambda x: (x[0][1], x[0][0]))):
                table.append((
                    i + 1,
                    k[0],
                    k[1],
                    ', '.join(list(set([x[0] for x in v])))[:30],
                    ', '.join(list(set([x[1] for x in v])))[:15],
                    ', '.join(list(set([x[2] for x in v])))[:15],
                    ', '.join(list(set([x[3] for x in v])))[:15],
                ))
Example #10
def run(args):
    try:
        fts.get_index(args.repos, must_exist=True)
    except ValueError:
        raise ParserError('Index does not exist. Run "glottolog searchindex" first!')
    count, results = fts.search(args.repos, args.query)
    with Table('ID', 'Author', 'Year', 'Title') as table:
        for res in results:
            table.append([res.id, res.author, res.year, res.title])
    print('({} matches)'.format(count))
Example #11
def run(args):
    attrs = collections.Counter()
    for cl in args.repos.conceptlists.values():
        attrs.update(cl.attributes)

    with Table(
            args,
            'Attribute', 'Occurrences',
            rows=[(k, v) for k, v in attrs.most_common() if v >= args.min_occurs]):
        pass
Example #12
def run(args):
    found = args.repos.lookup(
        args.gloss,
        language=args.language,
        full_search=args.full_search,
        similarity_level=args.similarity,
    )
    with Table(args, "GLOSS", "CONCEPTICON_ID", "CONCEPTICON_GLOSS",
               "SIMILARITY") as t:
        for matches in found:
            t.extend(matches)
Example #13
def run(args):
    ops = collections.defaultdict(collections.Counter)

    for lang in args.repos.languoids():
        for secname, sec in lang.cfg.items():
            ops[secname].update(opt for opt, val in sec.items() if val)

    ops.pop('DEFAULT', None)

    with Table('section', 'option', 'count') as table:
        for section, options in ops.items():
            table.append([section, '', float(sum(options.values()))])
            for k, n in options.most_common():
                table.append(['', k, float(n)])
Example #14
def run(args):
    nexus_obj = get_reader(args, required_blocks=['data'])
    print(nexus_obj.filename)

    with Table(args, 'Taxon', 'Characters') as t:
        for taxon in sorted(nexus_obj.data.matrix):
            tally = collections.Counter()
            for site in nexus_obj.data.matrix[taxon]:
                tally.update([site])

            t.append([
                taxon,
                ", ".join(['%s x %s' % (k, tally[k]) for k in sorted(tally)])
            ])
Example #15
def run(args):
    corpus = get_corpus(args)

    with Table('type', 'count') as t:
        e, w, m = corpus.get_stats()
        t.append(['example', e])
        t.append(['word', w])
        t.append(['morpheme', m])

    if e:
        print('\nExample properties:')
        for igt in corpus:
            for k in igt.properties.keys():
                print('  ' + k)
            break
Example #16
def run(args):
    print('Summary:')
    print('  {0:,} objects with {1:,} bitstreams of total size {2}'.format(
        len(args.catalog), sum(len(obj.bitstreams) for obj in args.catalog),
        args.catalog.size_h))
    print('  {0} duplicate bitstreams'.format(
        sum(1 for objs in args.catalog.md5_to_object.values()
            if len(objs) > 1)))
    print('  {0} objects with no bitstreams'.format(
        sum(1 for obj in args.catalog if not obj.bitstreams)))

    print()
    types = collections.Counter(
        itertools.chain(*[[bs.mimetype for bs in obj.bitstreams]
                          for obj in args.catalog]))
    with Table('maintype', 'subtype', 'bitstreams') as table:
        for maintype, items in itertools.groupby(
                sorted(types.items(),
                       key=lambda p: (p[0].split('/')[0], -p[1])),
                lambda p: p[0].split('/')[0]):
            for k, v in items:
                table.append([maintype, k.split('/')[1], v])
Example #17
def run(args):
    issues = collections.OrderedDict([(i.id, i) for i in args.repos.issues])

    if args.id:
        issue = issues[args.id]
        print(colored('{0.created} {0.user}'.format(issue), 'green'))
        print(colored(issue.title, attrs={'bold', 'underline'}))
        print('')
        print(issue.body)
        for comment in issue.comments:
            print(colored('\n    {0.created} {0.user}'.format(comment), 'green'))
            for line in comment.stripped_body.split('\n'):
                print('    ' + line)
        return

    with Table(args, 'ID', 'Title', 'User', 'Comments') as t:
        for issue in issues.values():
            if issue.coding_problem:
                t.append([
                    issue.id,
                    textwrap.shorten(issue.title, 50, placeholder=' […]'),
                    issue.user,
                    len(issue.comments)])
Example #18
def run(args):
    args.repos._log = args.log

    def clean(word):
        return "".join([w for w in word if w not in '/,;"'])

    varieties = args.repos.db.varieties

    args.log.info("Adding nodes to the graph")
    G = nx.Graph()
    for concept in args.repos.db.iter_concepts():
        G.add_node(concept.id, **concept.as_node_attrs())

    args.log.info("Adding edges to the graph")
    for v_, formA, formB in args.repos.iter_colexifications(varieties):
        if not G[formA.concepticon_id].get(formB.concepticon_id, False):
            G.add_edge(
                formA.concepticon_id,
                formB.concepticon_id,
                words=set(),
                languages=set(),
                families=set(),
                wofam=[],
            )

        G[formA.concepticon_id][formB.concepticon_id]["words"].add(
            (formA.gid, formB.gid))
        G[formA.concepticon_id][formB.concepticon_id]["languages"].add(v_.gid)
        G[formA.concepticon_id][formB.concepticon_id]["families"].add(
            v_.family)
        G[formA.concepticon_id][formB.concepticon_id]["wofam"].append("/".join(
            [
                formA.gid,
                formB.gid,
                formA.clics_form,
                v_.gid,
                v_.family,
                clean(formA.form),
                clean(formB.form),
            ]))

    # If either the colex2lang or colexstats files are requested, build a map
    # of variety name to Glottocode, a map of concepts, and collect the
    # statistics
    if any([args.colex2lang, args.colexstats]):
        lang_map = {
            "%s-%s" % (dataset_id, lang_id): glottocode
            for dataset_id, lang_id, glottocode in args.repos.db.fetchall(
                "SELECT dataset_ID, ID, Glottocode FROM languagetable")
        }

        # Collect lists of concepts for each language variety, so that we can
        # check if a colexification would be possible in it (i.e., if there is
        # enough data)
        concepts = defaultdict(set)
        for dataset_id, lang_id, concepticon_id in args.repos.db.fetchall("""
            SELECT f.dataset_ID, f.Language_ID, p.Concepticon_ID
            FROM formtable AS f, parametertable AS P
            WHERE f.Parameter_ID = p.ID AND f.dataset_ID = p.dataset_ID"""):
            concepts["%s-%s" % (dataset_id, lang_id)].add(concepticon_id)

        # Iterate over all edges and collect data
        all_counts, threshold_counts = defaultdict(int), defaultdict(int)
        all_possible, threshold_possible = defaultdict(int), defaultdict(int)
        colex2lang = defaultdict(set)
        for concept_a, concept_b, data in G.edges(data=True):
            # Collect concept2languages info
            for lang, glottocode in lang_map.items():
                if lang in data["languages"]:
                    colex2lang[concept_a, concept_b].add(glottocode)

            # Collect language colexification affinity (David Gil's request)
            # Don't consider the edge if we don't have at least one language in it
            if not data["languages"]:
                continue

            # Check if the current concept pair passes the threshold filter
            filter_family, filter_lang, filter_words = True, True, True
            if args.edgefilter == "families":
                filter_family = len(data["families"]) >= args.threshold
            if args.edgefilter == "languages":
                filter_lang = len(data["languages"]) >= args.threshold
            if args.edgefilter == "words":
                filter_words = len(data["words"]) >= args.threshold
            pass_filter = all([filter_family, filter_lang, filter_words])

            # Inspect all languages
            for lang in lang_map:
                if lang in data["languages"]:
                    all_counts[lang] += 1
                    all_possible[lang] += 1
                    if pass_filter:
                        threshold_counts[lang] += 1
                        threshold_possible[lang] += 1
                else:
                    if concept_a in concepts[lang] and concept_b in concepts[lang]:
                        all_possible[lang] += 1
                        if pass_filter:
                            threshold_possible[lang] += 1

    edges = {}
    for edgeA, edgeB, data in G.edges(data=True):
        edges[edgeA, edgeB] = (
            len(data["families"]),
            len(data["languages"]),
            len(data["words"]),
        )

    ignore_edges = []
    for edgeA, edgeB, data in G.edges(data=True):
        data["WordWeight"] = len(data["words"])
        data["words"] = ";".join(
            sorted(["{0}/{1}".format(x, y) for x, y in data["words"]]))
        data["FamilyWeight"] = len(data["families"])
        data["families"] = ";".join(sorted(data["families"]))
        data["LanguageWeight"] = len(data["languages"])
        data["languages"] = ";".join(data["languages"])
        data["wofam"] = ";".join(data["wofam"])
        if args.edgefilter == "families" and data[
                "FamilyWeight"] < args.threshold:
            ignore_edges.append((edgeA, edgeB))
        elif args.edgefilter == "languages" and data[
                "LanguageWeight"] < args.threshold:
            ignore_edges.append((edgeA, edgeB))
        elif args.edgefilter == "words" and data["WordWeight"] < args.threshold:
            ignore_edges.append((edgeA, edgeB))

    G.remove_edges_from(ignore_edges)

    nodenames = {
        r[0]: r[1]
        for r in args.repos.db.fetchall(
            "select distinct concepticon_id, concepticon_gloss from parametertable"
        )
    }

    with Table(args, "ID A", "Concept A", "ID B", "Concept B", "Families",
               "Languages", "Words") as table:
        count = 0
        for (nodeA, nodeB), (fc, lc, wc) in sorted(edges.items(),
                                                   key=lambda i: i[1],
                                                   reverse=True):
            if (nodeA, nodeB) not in ignore_edges:
                table.append([
                    nodeA, nodenames[nodeA], nodeB, nodenames[nodeB], fc, lc,
                    wc
                ])
                count += 1
            if count >= args.show:
                break

    print(
        args.repos.save_graph(G, args.graphname, args.threshold,
                              args.edgefilter))

    # Output colex2lang info
    if args.colex2lang:
        with open(args.colex2lang, "w") as tsvfile:
            tsvfile.write("CONCEPT_A\tCONCEPT_B\tGLOTTOCODES\n")
            for entry, langs in colex2lang.items():
                tsvfile.write("%s\t%s\t%s\n" %
                              (entry[0], entry[1], ",".join(langs)))

    # Output per-language info
    if args.colexstats:
        with open(args.colexstats, "w") as tsvfile:
            writer = csv.DictWriter(
                tsvfile,
                delimiter="\t",
                fieldnames=[
                    "LANG_KEY",
                    "GLOTTOCODE",
                    "COLEXIFICATIONS_ALL",
                    "POTENTIAL_ALL",
                    "COLEXIFICATIONS_THRESHOLD",
                    "POTENTIAL_THRESHOLD",
                ],
            )
            writer.writeheader()
            for lang in sorted(lang_map):
                writer.writerow({
                    "LANG_KEY": lang,
                    "GLOTTOCODE": lang_map[lang],
                    "COLEXIFICATIONS_ALL": all_counts[lang],
                    "POTENTIAL_ALL": all_possible[lang],
                    "COLEXIFICATIONS_THRESHOLD": threshold_counts[lang],
                    "POTENTIAL_THRESHOLD": threshold_possible[lang],
                })
Example #19
def run(args):
    from pyclics.util import parse_kwargs

    algo = args.algorithm

    if algo not in args.repos.cluster_algorithms:
        with Table(args, 'algorithm', 'description') as table:
            for name, desc in args.repos.cluster_algorithms.items():
                table.append((name, desc))
        if args.algorithm != '_':
            raise argparse.ArgumentError(None, 'Unknown cluster algorithm: {0}'.format(algo))
        return

    if not args.repos.repos.joinpath('app', 'source', 'words.json').exists():
        raise argparse.ArgumentError(None, '"clics makeapp" must be run first')

    graph = args.repos.load_graph(args.graphname, args.threshold, args.edgefilter)
    args.log.info('graph loaded')
    kw = vars(args)
    kw.update(parse_kwargs(*args.args))
    neighbor_weight = int(kw.pop('neighbor_weight', 5))

    clusters = sorted(args.repos.get_clusterer(algo)(graph, vars(args)), key=lambda c: (-len(c), c))
    args.log.info('computed clusters')

    D, Com = {}, collections.defaultdict(list)
    for i, cluster in enumerate(clusters, start=1):
        for vertex in cluster:
            D[vertex] = str(i)
            Com[i].append(vertex)

    # Annotate the graph with the cluster info:
    for node, data in graph.nodes(data=True):
        data.update({algo: D.get(node, '0'), 'ClusterName': '', 'CentralConcept': ''})

    # get the articulation points etc. immediately
    for idx, nodes in sorted(Com.items()):
        sg = graph.subgraph(nodes)
        if len(sg) > 1:
            d_ = sorted(sg.degree(), key=lambda x: x[1], reverse=True)
            d = [graph.node[a]['Gloss'] for a, b in d_][0]
            cluster_name = '{0}_{1}_{2}'.format(algo, idx, d)
        else:
            d = graph.node[nodes[0]]['Gloss']
            cluster_name = '{0}_{1}_{2}'.format(algo, idx, d)
        for node in nodes:
            graph.node[node]['ClusterName'] = cluster_name
            graph.node[node]['CentralConcept'] = d

    args.log.info('computed cluster names')

    cluster_dir = args.repos.existing_dir('app', 'cluster', algo, clean=True)
    cluster_names = {}
    removed = []
    for idx, nodes in tqdm(sorted(Com.items()), desc='export to app', leave=False):
        sg = graph.subgraph(nodes)
        for node, data in sg.nodes(data=True):
            data['OutEdge'] = []
            neighbors = [
                n for n in graph if
                n in graph[node] and
                graph[node][n]['FamilyWeight'] >= neighbor_weight and
                n not in sg]
            if neighbors:
                sg.node[node]['OutEdge'] = []
                for n in neighbors:
                    sg.node[node]['OutEdge'].append([
                        graph.node[n]['ClusterName'],
                        graph.node[n]['CentralConcept'],
                        graph.node[n]['Gloss'],
                        graph[node][n]['WordWeight'],
                        n
                    ])
        if len(sg) > 1:
            fn = cluster_dir / (
                (str(idx) if algo == 'subgraph' else graph.node[nodes[0]]['ClusterName']) +
                '.json')
            jsonlib.dump(json_graph.adjacency_data(sg), fn, sort_keys=True)
            for node in nodes:
                cluster_names[graph.node[node]['Gloss']] = fn.stem
        else:
            removed += [list(nodes)[0]]
    graph.remove_nodes_from(removed)
    for node, data in graph.nodes(data=True):
        if 'OutEdge' in data:
            data['OutEdge'] = '//'.join(['/'.join([str(y) for y in x]) for x in data['OutEdge']])
    removed = []
    for nA, nB, data in tqdm(graph.edges(data=True), desc='remove edges', leave=False):
        if graph.node[nA][algo] != graph.node[nB][algo] and data['FamilyWeight'] < 5:
            removed += [(nA, nB)]
    graph.remove_edges_from(removed)

    args.repos.save_graph(graph, algo, args.threshold, args.edgefilter)
    args.repos.write_js_var(algo, cluster_names, 'app', 'source', 'cluster-names.js')
Example #20
def run(args):
    with Table(args, 'DOI', 'topic') as t:
        for ex in args.api.experiments:
            t.append([ex.doi, ex.parameter])
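All of the examples follow the same `run(args)` convention for CLI sub-commands: the surrounding tool parses the command line and hands the resulting namespace to `run`. As a hypothetical illustration of that wiring (names such as `build_parser` and the `--format` option are assumptions, not taken from the projects above):

import argparse

def run(args):  # placeholder; any `run(args)` from the examples above fits here
    print('would render a table using format:', args.format)

def build_parser():
    # Hypothetical wiring: each sub-command gets its own parser and a `run`
    # callable that receives the parsed namespace.
    parser = argparse.ArgumentParser(prog='mycli')
    sub = parser.add_subparsers(dest='command', required=True)
    stats = sub.add_parser('stats', help='show statistics as a table')
    stats.add_argument('--format', default='simple', help='table output format')
    stats.set_defaults(main=run)
    return parser

if __name__ == '__main__':
    args = build_parser().parse_args()
    args.main(args)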