Example #1
def run(args):
    dataset = get_dataset(args)
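    # Create or update the .zenodo.json metadata stored alongside the dataset.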
    with update(dataset.dir / '.zenodo.json', indent=4, default=collections.OrderedDict()) as md:
        modules = ['cldf:' + spec.module for spec in dataset.cldf_specs_dict.values()]
        contribs = dataset.dir / 'CONTRIBUTORS.md'
        creators, contributors = get_creators_and_contributors(
            contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False)
        if creators:
            md['creators'] = [contrib(p) for p in creators]
        if contributors:
            md["contributors"] = [contrib(p) for p in contributors]
        communities = [r["identifier"] for r in md.get("communities", [])] + \
                      [c.strip() for c in nfilter(args.communities.split(','))]
        if communities:
            md['communities'] = [
                {"identifier": community_id} for community_id in sorted(set(communities))]
        md.update(
            {
                "title": dataset.metadata.title,
                "access_right": "open",
                "keywords": sorted(set(md.get("keywords", []) + ["linguistics"] + modules)),
                "upload_type": "dataset",
            }
        )
        if dataset.metadata.citation:
            md['description'] = "<p>Cite the source of the dataset as:</p>\n\n" \
                                "<blockquote>\n<p>{}</p>\n</blockquote>".format(
                html.escape(dataset.metadata.citation))
        if dataset.metadata.zenodo_license:
            md['license'] = {'id': dataset.metadata.zenodo_license}
Example #2
def run(args):

    ds = get_dataset(args)
    forms = []
    for row in ds.cldf_reader()['FormTable']:
        if row['Language_ID'] == args.language_id or not args.language_id:
            forms.append(row)

    P = syllable_inventories(forms, format=args.prosody_format)
    bipa = args.clts.from_config().api.transcriptionsystem_dict['bipa']

    table = []
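    # 'long' display: one row per (language, sound, template); otherwise one
    # aggregated row per sound.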
    if args.display == 'long':
        header = ['Language', 'Sound', 'Template', 'Frequency']
        for language, data in P.items():
            for sound, templates in data.items():
                for template, frequency in templates.items():
                    table += [[language, sound, template, len(frequency)]]
    else:
        header = ['Language', 'Sound', 'Class', 'Frequency', 'Templates']
        for language, data in P.items():
            for sound, templates in data.items():
                table += [[
                    language, sound, bipa[sound].type,
                    sum([len(x) for x in templates.values()]), ', '.join([
                        '{0}:{1}'.format(x, len(y))
                        for x, y in templates.items()
                    ])
                ]]

    with Table(args, *header, rows=table):
        pass
Example #3
def run(args):
    bipa = args.clts.api.bipa
    func = profile.simple_profile
    cols = ['Grapheme', 'IPA', 'Frequence', 'Codepoints']
    kw = {'ref': 'form', 'clts': bipa}
    if args.context:
        func = profile.context_profile
        cols = [
            'Grapheme', 'IPA', 'Examples', 'Languages', 'Frequence',
            'Codepoints'
        ]
        kw['col'] = 'language_id'

    ds = get_dataset(args)
    profile_path = ds.etc_dir / 'orthography.tsv'
    if profile_path.exists() and not args.force:
        raise ParserError(
            'Orthography profile exists already. To overwrite, pass "-f" flag')

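    # Assemble the wordlist input: row 0 holds the lower-cased header, every
    # other row one form keyed by a numeric ID.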
    header, D = [], {}
    for i, row in enumerate(ds.cldf_reader()['FormTable'], start=1):
        if i == 1:
            header = [f for f in row.keys() if f != 'ID']
            D = {0: ['lid'] + [h.lower() for h in header]}

        row['Segments'] = ' '.join(row['Segments'])
        D[i] = [row['ID']] + [row[h] for h in header]

    with UnicodeWriter(profile_path, delimiter='\t') as writer:
        writer.writerow(cols)
        for row in func(Wordlist(D, row='parameter_id', col='language_id'),
                        **kw):
            writer.writerow(row)
    args.log.info('Orthography profile written to {0}'.format(profile_path))
Example #4
def run(args):
    dataset = get_dataset(args)
    if setup(dataset, force=args.test):
        if not args.test:  # pragma: no cover
            print(git.cmd.Git(str(dataset.dir)).status())
    print('You may include the following status badge in any markdown file in the repos:\n')
    print(build_status_badge(dataset))
Example #5
def run(args):
    """
    main function.
    """
    ds = get_dataset(args)
    if args.medials:
        args.medials = set(args.medials.split(','))
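    # Problems are collected per error type, keyed by (segments, structure) pairs.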
    errors = {
        'length': defaultdict(list),
        'syllable': defaultdict(list),
        'missing': defaultdict(list)
    }
    if ds.cldf_dir.joinpath("forms.csv").exists():
        for row in progressbar(ds.cldf_reader()["FormTable"],
                               desc='iterate over wordlist'):
            if row['Language_ID'] == args.doculect or not args.doculect:
                strucs = get_structure(row['Segments'],
                                       medials=args.medials or MEDIALS)
                for i, (struc, segments) in enumerate(
                        zip(strucs, morphemes(row['Segments']))):
                    example = (
                        row['ID'], i, row['Language_ID'], row['Form'],
                        row['Segments'])
                    key = (' '.join(segments), ' '.join(struc))
                    if len(struc) != len(segments):
                        errors['length'][key].append(example)
                    elif '?' in struc:
                        errors['missing'][key].append(example)
                    elif 'n' not in struc or 't' not in struc:
                        errors['syllable'][key].append(example)

    for error, errorname in [('length', 'Length Errors'),
                             ('missing', 'Missing Values'),
                             ('syllable', 'Syllable Errors')]:
        if errors[error]:
            print('# ' + errorname + '\n')
            table = []
            for i, ((segments, structure),
                    examples) in enumerate(errors[error].items()):
                table += [[i + 1, segments, structure, len(examples)]]
            print(
                tabulate(
                    table,
                    tablefmt='pipe',
                    headers=['Number', 'Segments', 'Structure', 'Examples']))
            print('')
Example #6
def run(args):
    """
    Entry point for command-line call.
    """

    # Extract dataset
    ds = get_dataset(args)

    # Read raw data and extend it with phonological information
    args.log.info("Loading data from %s...", ds)
    data = read_extended_data(ds, args)
    args.log.info("Read %i entries from CLDF.", len(data))

    # Collect inventories
    args.log.info("Collecting inventories...")
    phoneme_count, syllable_count = collect_inventories(data)
    args.log.info("Read %i inventories.", len(phoneme_count))

    # Collect inventories by size, testing the sample size needed
    args.log.info("Estimating sample sizes...")
    sampled = collect_sampled_inventories(data)
    args.log.info("Read %i inventories.", len(sampled))

    # Estimate sample sizes, and compute the means for output
    ks_stats = defaultdict(list)
    size_stats = defaultdict(list)
    for lang, full in phoneme_count.items():
        dist1 = [full.get(sound, None) for sound in sorted(full)]
        for sample_size in sampled:
            for i, sample in enumerate(sampled[sample_size][lang]):
                dist2 = [sample.get(sound, 0) for sound in sorted(full)]

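                # Two-sample Kolmogorov-Smirnov test: full inventory vs. sampled inventory.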
                ks, p = scipy.stats.ks_2samp(dist1, dist2)

                ks_stats[lang, sample_size].append(ks)
                size_stats[lang, sample_size].append(len(sample))

    ks_stats = {key: np.mean(ks_values) for key, ks_values in ks_stats.items()}
    size_stats = {
        key: np.mean([size / len(phoneme_count[key[0]]) for size in sizes])
        for key, sizes in size_stats.items()
    }

    output_sample_stats(ks_stats, size_stats, phoneme_count, args)

    # iterate over all phoneme inventories
    stats = {}
    for language, inventory in phoneme_count.items():
        args.log.info("Processing inventory for %s...", language)
        lang_stats = analyze_inventory(inventory, language, args)
        stats[language] = lang_stats

    # Output statistics
    args.log.info("Writing results...")
    output_powerlaw_stats(stats, args)
Example #7
def run(args):
    # Access the dataset:
    ds = get_dataset(args)
    print(ds.id)
    # and its CLDF Dataset:
    print(len(list(ds.cldf_reader()['LanguageTable'])))

    # Thanks to `PathType` `args.input_file` is a `pathlib.Path`:
    for c in args.input_file.read_text(encoding='utf8'):
        if args.strict:  # evaluates our flag
            # The CLTS catalog API is available as `args.clts.api`:
            print(args.clts.api.bipa[c].name)  # pragma: no cover
        else:
            args.log.warning('not very strict')
Example #8
def run(args):
    ds = get_dataset(args)
    md = []
    cldfs = list(ds.cldf_specs_dict.values())
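    # With more than one CLDF dataset, start the README with a small table of contents.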
    if len(cldfs) > 1:
        md.append("# CLDF datasets\n")
        md.extend([
            '- [{}](#ds-{})'.format(cldf.module, slug(cldf.metadata_fname)) for cldf in cldfs])
        md.append('')
    for cldf in cldfs:
        if cldf.metadata_path.exists():
            md.append('<a name="ds-{}"> </a>\n'.format(slug(cldf.metadata_fname)))
            res = metadata2markdown(cldf.get_dataset(), cldf.metadata_path)
            md.append(res.replace('# ', '# {} '.format(cldf.module), 1))
            md.append('\n')

    ds.cldf_dir.joinpath('README.md').write_text('\n'.join(md), encoding='utf8')
Example #9
def run(args):
    dataset = get_dataset(args)
    dataset.concepticon = args.concepticon.api
    dataset.glottolog = args.glottolog.api
    with_dataset(args, 'makecldf', dataset=dataset)
    if not dataset.cldf_dir.joinpath('sources.bib').exists():
        raise ValueError('The dataset has no sources at {0}'.format(
            dataset.cldf_dir.joinpath('sources.bib')))
    creators, contributors = dataset.get_creators_and_contributors(
        strict=False)

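    # Keep only the contributor fields used in the Zenodo record.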
    def contrib(d):
        return {
            k: v
            for k, v in d.items()
            if k in {'name', 'affiliation', 'orcid', 'type'}
        }

    with jsonlib.update_ordered(dataset.dir / '.zenodo.json', indent=4) as md:
        md.update({
            'title': dataset.metadata.title,
            "access_right": "open",
            "keywords": sorted(
                set(md.get('keywords', []) + ["linguistics", "cldf:Wordlist"])),
            "creators": [contrib(p) for p in creators],
            "contributors": [contrib(p) for p in contributors],
            "communities": sorted(
                md.get('communities', []) + [{"identifier": "lexibank"}],
                key=lambda i: i['identifier']),
            "upload_type": "dataset",
        })
        if dataset.metadata.citation:
            md['description'] = "<p>Cite the source of the dataset as:</p>\n\n" \
                                "<blockquote>\n<p>{}</p>\n</blockquote>".format(
                html.escape(dataset.metadata.citation))
        if dataset.metadata.zenodo_license:
            md['license'] = {'id': dataset.metadata.zenodo_license}
Example #10
def run(args):
    ds = get_dataset(args)
    clts = args.clts.api

    # Load the profile(s) specified for the dataset
    profiles = {k or 'default': v for k, v in ds.orthography_profile_dict.items()}
    forms = collections.defaultdict(list)
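    # Group the segmentable forms by the profile they reference (if any).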
    if ds.cldf_dir.joinpath('forms.csv').exists():
        for form in ds.cldf_reader()['FormTable']:
            forms[form.get('Profile')].append(ds.form_for_segmentation(form['Form']))
    if list(forms.keys()) == [None]:  # pragma: no cover
        forms['default'] = forms[None]

    for key, profile in profiles.items():
        args.log.info('Processing {0}'.format(profile.fname))
        profile.clean(clts, ipa_col=args.ipa)

        if args.trim:
            # Run the trimmer as many times as necessary until nothing more is left to remove
            total_removed = 0
            while True:
                removed = profile.trim(ipa_col=args.ipa)
                total_removed += removed
                if removed == 0:
                    break
            if total_removed:  # pragma: no cover
                args.log.info("{} superfluous rules were removed.".format(total_removed))

        if args.augment and forms[key]:
            profile.augment(forms[key], clts=args.clts.api)

        if args.sort:
            profile.sort(clts=args.clts.api, ipa_col=args.ipa)

        profile.check(clts, args.log, ipa_col=args.ipa)
        profile.write()
Example #11
def run(args):
    ds = None
    if Zenodo.DOI_PATTERN.match(args.dataset):
        z = Zenodo()
        out = z.download_record(z.record_from_doi(args.dataset),
                                pathlib.Path('.'))
        args.log.info('Downloaded files for {0} to {1}'.format(
            args.dataset, out))
        cldf_ds = list(iter_datasets(out))
    else:
        p = pathlib.Path(args.dataset)
        if p.exists() and sniff(p):
            cldf_ds = [Dataset.from_metadata(p)]
        else:  # pragma: no cover
            ds = get_dataset(args)
            cldf_ds = [ds.cldf_reader()]

    if not cldf_ds:
        raise ValueError('No CLDF dataset found for spec {0}'.format(
            args.dataset))

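    # Use the size of the largest ParameterTable to pick a sensible default page size.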
    try:
        count_p = max([len(list(cldf['ParameterTable'])) for cldf in cldf_ds])
    except KeyError:
        count_p = 100

    default_page_size = 100
    while default_page_size < count_p and default_page_size < 600:
        default_page_size += 100  # pragma: no cover

    #  max_returned_rows            Maximum rows that can be returned from a table
    #                               or custom query (default=1000)

    db_paths = []
    if args.db_path:  # pragma: no cover
        if len(cldf_ds) > 1:
            raise ValueError(
                'You cannot pass a db path, when multiple datasets are found')
    else:
        args.db_path = pathlib.Path(
            '{0}.sqlite'.format(ds.id if ds else 'cldf_db'))

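    # Load each CLDF dataset into its own SQLite database unless it already exists.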
    for i, cldf in enumerate(cldf_ds):
        if i == 0:
            db_path = args.db_path
        else:
            db_path = args.db_path.parent / (
                args.db_path.stem + '_{0}'.format(i) + args.db_path.suffix)

        if not db_path.exists():
            db = Database(cldf, fname=db_path, infer_primary_keys=True)
            db.write_from_tg()
            args.log.info('{0} loaded in {1}'.format(db.dataset, db.fname))
        db_paths.append(db_path)

    jsonlib.dump(
        datasette_cldf.metadata(
            {db.stem: cldf for db, cldf in zip(db_paths, cldf_ds)}),
        args.cfg_path,
        indent=4)

    os.system(
        'datasette {0} -m {1} --template-dir {2} --config default_page_size:{3}'
        .format(' '.join(str(p) for p in db_paths), args.cfg_path,
                pathlib.Path(datasette_cldf.__file__).parent / 'templates',
                default_page_size))
Example #12
def run(args):
    ds = get_dataset(args)
    p = ds.cldf_dir / ds.cldf_reader().properties['dc:hasPart']['summary']['dc:relation']
    print(nexus.NexusReader(p).trees.trees[0].newick_tree.ascii_art())
Example #13
def run(args):

    ds = get_dataset(args)
    ds_cldf = ds.cldf_reader()
    release_dir = args.out / '{0}_{1}'.format(ds.id, MEDIA)

    if ds_cldf.get('media.csv', None) is None:  # pragma: no cover
        args.log.error('Dataset has no media.csv')
        raise ParserError
    if args.parent_doi and not Zenodo.DOI_PATTERN.match(args.parent_doi):
        args.log.error('Invalid DOI passed as --parent-doi')
        raise ParserError
    if args.update_zenodo:
        if not release_dir.exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir))
            raise ParserError
        if not (release_dir / ZENODO_FILE_NAME).exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir / ZENODO_FILE_NAME))
            raise ParserError
        if args.create_release:
            args.log.error(
                'You cannot create the release and update zenodo at the same time.'
            )
            raise ParserError
    if args.create_release:
        if not args.parent_doi:
            args.log.error(
                'The corresponding DOI is required (via --parent-doi).')
            raise ParserError

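    # Optionally restrict processing to the mimetypes/file extensions given in args.mimetype.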
    mime_types = None
    if args.mimetype:
        mime_types = [m.strip() for m in nfilter(args.mimetype.split(','))]

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        media_dir = args.out / MEDIA
        media_dir.mkdir(exist_ok=True)
        media = []

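    # Iterate over media.csv: either just tally sizes per mimetype (args.list)
    # or download the files into media_dir.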
    if not args.update_zenodo:
        used_file_extensions = set()
        with UnicodeWriter(
                (media_dir / INDEX_CSV) if not args.list else None) as w:
            for i, row in enumerate(
                    tqdm.tqdm(list(ds_cldf['media.csv']),
                              desc='Getting {0} items'.format(MEDIA))):
                url = ds_cldf.get_row_url('media.csv', row)
                if isinstance(url, rfc3986.URIReference):
                    url = url.normalize().unsplit()
                    row['URL'] = url
                f_ext = url.split('.')[-1].lower()
                if args.debug and i > 500:
                    break
                if (mime_types is None) or f_ext in mime_types\
                        or any(row['mimetype'].startswith(x) for x in mime_types):
                    if args.list:
                        m = '{0} ({1})'.format(row['mimetype'], f_ext)
                        size[m] += int(row['size'])
                        number.update([m])
                    else:
                        used_file_extensions.add(f_ext.lower())
                        d = media_dir / row['ID'][:2]
                        d.mkdir(exist_ok=True)
                        fn = '.'.join([row['ID'], f_ext])
                        target = d / fn
                        row['local_path'] = pathlib.Path(row['ID'][:2]) / fn
                        if i == 0:
                            w.writerow(row)
                        w.writerow(row.values())
                        media.append(target)
                        if (not target.exists()) or md5(target) != row['ID']:
                            _create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k.ljust(20), str(number[k]), format_size(v)]))
        return

    # Waiting for the download threads to finish
    if 'download_threads' in globals():
        for t in download_threads:
            t.join()

    if args.create_release:
        assert media_dir.exists(), 'No folder "{0}" found in {1}'.format(
            MEDIA, media_dir.resolve())

        release_dir.mkdir(exist_ok=True)

        media.append(media_dir / INDEX_CSV)

        try:
            zipf = zipfile.ZipFile(str(release_dir / '{0}.zip'.format(MEDIA)),
                                   'w', zipfile.ZIP_DEFLATED)
            fp = args.out
            for f in tqdm.tqdm(media, desc='Creating {0}.zip'.format(MEDIA)):
                zipf.write(str(f), str(os.path.relpath(str(f), str(fp))))
            zipf.close()
        except Exception as e:
            args.log.error(e)
            raise

        def _contrib(d):
            return {
                k: v
                for k, v in d.items()
                if k in {'name', 'affiliation', 'orcid', 'type'}
            }

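        # Derive the version and the repository URL from the local git checkout.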
        version_v = git_describe('.').split('-')[0]
        version = version_v.replace('v', '')
        git_url = [r for r in ds.repo.repo.remotes
                   if r.name == 'origin'][0].url.replace('.git', '')
        with jsonlib.update(release_dir / ZENODO_FILE_NAME,
                            indent=4,
                            default=collections.OrderedDict()) as md:
            contribs = ds.dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(
                    encoding='utf8') if contribs.exists() else '',
                strict=False)
            if creators:
                md['creators'] = [_contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [_contrib(p) for p in contributors]
            communities = [r["identifier"] for r in md.get("communities", [])] + \
                [c.strip() for c in nfilter(args.communities.split(','))] + \
                COMMUNITIES
            if communities and not args.debug:
                md['communities'] = [
                    {"identifier": community_id}
                    for community_id in sorted(set(communities))]
            md.update({
                'title': '{0} {1} Files'.format(ds.metadata.title, MEDIA.title()),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type': 'dataset',
                'publication_date': datetime.today().strftime('%Y-%m-%d'),
                'version': version,
                'related_identifiers': [
                    {
                        'scheme': 'url',
                        'identifier': '{0}/tree/{1}'.format(git_url, version_v),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            if args.parent_doi:
                md['related_identifiers'].append({
                    'scheme': 'doi',
                    'identifier': args.parent_doi,
                    'relation': 'isPartOf'
                })
                supplement_to = " - Supplement to dataset " \
                                "<a href='https://doi.org/{0}'>{1}</a> ".format(
                    args.parent_doi, ds.metadata.title)  # noqa: E122
            if ds.metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': ds.metadata.url,
                    'relation': 'isAlternateIdentifier'
                })

            formats = ', '.join(sorted(used_file_extensions))
            descr = '<br /><br />' + ds.metadata.description if ds.metadata.description else ''
            online_url, online = '', ''
            if ds.metadata.url:
                online_url = ds.metadata.url
                online = "<br /><br />Available online at: <a href='{0}'>{0}</a>".format(
                    online_url)
            md['description'] = html.escape(
                DESCRIPTION.format(
                    url=online_url,
                    formats=' ({0})'.format(formats) if formats else '',
                    title=md['title'],
                    supplement_to=supplement_to,
                    descr=descr,
                    online=online))

            license_md = ''
            if ds.metadata.zenodo_license:
                md['license'] = {'id': ds.metadata.zenodo_license}
                license_md = LICENCE.format(ds.metadata.zenodo_license)

            DataDir(release_dir).write(
                'README.md',
                README.format(
                    title=md['title'],
                    doi='https://doi.org/{0}'.format(args.parent_doi),
                    ds_title=ds.metadata.title,
                    license=license_md,
                    formats=' ({0})'.format(formats) if formats else '',
                    media=MEDIA,
                    index=INDEX_CSV))

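    # Update an existing Zenodo deposit with the metadata written during --create-release.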
    if args.update_zenodo:

        md = {}
        md.update(jsonlib.load(release_dir / ZENODO_FILE_NAME))

        if args.debug:
            api_url = API_URL_SANDBOX
            access_token = os.environ.get('ZENODO_SANDBOX_ACCESS_TOKEN')
        else:
            api_url = API_URL
            access_token = ACCESS_TOKEN
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=access_token)
        try:
            rec = api.record_from_id('{0}record/{1}'.format(
                zenodo_url, args.update_zenodo))
        except Exception as e:
            args.log.error(
                'Check connection and credentials for accessing Zenodo.\n{0}'.
                format(e))
            return
        latest_version = rec.links['latest'].split('/')[-1]
        if latest_version != args.update_zenodo:
            args.log.warning(
                'Passed deposit ID does not refer to the latest version {0}!'.format(
                    latest_version))
        args.log.info('  DOI:     ' + rec.metadata.doi)
        args.log.info('  Title:   ' + rec.metadata.title)
        args.log.info('  Version: ' + rec.metadata.version)
        args.log.info('  Date:    ' + rec.metadata.publication_date)
        args.log.info('  Files:   ' + ', '.join([f.key for f in rec.files]))
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')