Example #1
    @classmethod
    def from_path(cls, path, spec=None):
        """
        Instantiate a corpus from a file path.

        :param path: Either a path to a CLDF dataset's metadata file or to a CLDF Examples \
        component as a CSV file. Note that in the latter case, the file must use the default \
        column names, as defined in the CLDF ontology.
        :return: `Corpus` instance.
        """
        if isinstance(path, str):
            path = pathlib.Path(path)
        if path.suffix == '.json':
            return cls.from_cldf(Dataset.from_metadata(path), spec=spec)
        # We are given only an ExampleTable. Let's create the appropriate dataset:
        header = None
        for d in reader(path, dicts=True):
            header = list(d.keys())
            break
        ds = Dataset.from_metadata(
            pathlib.Path(pycldf.__file__).parent / 'modules' / 'Generic-metadata.json')
        ds.tablegroup._fname = path.parent / 'cldf-metadata.json'
        t = ds.add_component('ExampleTable')
        t.url = Link(path.name)
        default_cols = [col.name for col in t.tableSchema.columns]
        ds.remove_columns(t, *list(set(default_cols) - set(header)))
        ds.add_columns(t, *list(set(header) - set(default_cols)))
        return cls.from_cldf(ds, spec=spec)
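
A hypothetical usage sketch for the method above, assuming the enclosing class is
called `Corpus` (as the docstring's return value suggests); the two calls exercise
the two branches: a `.json` metadata path and a bare ExampleTable CSV.

# Hypothetical usage; the file paths and the `Corpus` class name are assumptions.
corpus = Corpus.from_path('wordlist/cldf/cldf-metadata.json')  # full CLDF dataset
corpus = Corpus.from_path('examples.csv')  # ExampleTable CSV with default column names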
Example #2
def get_dataset(fname=None):
    """Load a CLDF dataset.

    Load the file either as a JSON CLDF metadata description file or as a
    metadata-free dataset contained in a single CSV file.

    The distinction is made based on the file extension: `.json` files are
    loaded as metadata descriptions; all other files are matched against the
    CLDF module specifications. Directories are checked for the presence of
    any CLDF dataset, in undefined order of the dataset types.

    Parameters
    ----------
    fname : str or Path
        Path to a CLDF dataset

    Returns
    -------
    pycldf.Dataset
    """
    if fname is None:
        fname = repository
    else:
        fname = Path(fname)
    if not fname.exists():
        raise FileNotFoundError('{:} does not exist'.format(fname))
    if fname.suffix == '.json':
        return Dataset.from_metadata(fname)
    return Dataset.from_data(fname)
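
A short usage sketch for the loader above; the paths are hypothetical and must
exist, since the function checks `fname.exists()` before dispatching.

# Hypothetical calls: a metadata description vs. a metadata-free single-file dataset.
ds = get_dataset('wals/cldf/StructureDataset-metadata.json')  # .json -> Dataset.from_metadata
ds = get_dataset('values.csv')                                # other suffix -> Dataset.from_data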
Example #3
def test_cldf(tmp_path):
    with pytest.raises(AttributeError):
        _ = CLDFWriter().cldf

    with CLDFWriter(CLDFSpec(dir=tmp_path)):
        pass
    # The metadata was copied:
    assert list(tmp_path.glob('*-metadata.json'))

    with CLDFWriter(
            CLDFSpec(module='StructureDataset',
                     dir=tmp_path,
                     data_fnames=dict(ValueTable='data.csv',
                                      ExampleTable='igt.csv'))) as writer:
        assert writer.cldf['ValueTable'] and writer.cldf['ExampleTable']
        writer['ValueTable', 'value'].separator = '|'
        writer.objects['ValueTable'].append(
            dict(ID=1, Language_ID='l', Parameter_ID='p', Value=[1, 2]))
    assert tmp_path.joinpath('data.csv').exists()
    ds = Dataset.from_metadata(tmp_path / 'StructureDataset-metadata.json')
    values = list(ds['ValueTable'])
    assert len(values) == 1
    assert values[0]['Value'] == ['1', '2']

    with pytest.raises(AttributeError):
        CLDFWriter(tmp_path).validate()
Example #4
def test_cldf_with_dataset(ds):
    with CLDFWriter(CLDFSpec(dir=ds.cldf_dir), dataset=ds):
        pass
    cldf = Dataset.from_metadata(ds.cldf_dir.joinpath('Generic-metadata.json'))
    assert 'http://example.org/raw' in [
        p['rdf:about'] for p in cldf.properties['prov:wasDerivedFrom']
    ]
Example #5
def get_dataset(p):
    try:
        return Dataset.from_metadata(p) if p.suffix == '.json' else Dataset.from_data(p)
    except ValueError:
        raise argparse.ArgumentTypeError('Invalid CLDF dataset spec: {0}!'.format(p))
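
The converter above is meant to be used as an argparse type; a minimal sketch of
that wiring, assuming `pathlib` is imported and that the caller passes a Path
(the function inspects `p.suffix`):

# Hypothetical wiring: argparse turns the string into a Path and reports bad
# specs as argument errors rather than tracebacks.
parser = argparse.ArgumentParser()
parser.add_argument('dataset', type=lambda s: get_dataset(pathlib.Path(s)))
args = parser.parse_args(['cldf/StructureDataset-metadata.json'])
print(args.dataset)  # a pycldf.Dataset instance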
Example #6
def run(args):
    if (args.glottolog
            or args.concepticon) and Catalog is None:  # pragma: no cover
        print(
            'To use reference catalogs you must install the cldfcatalog package!'
        )
        return 10

    if args.cldf:  # pragma: no cover
        args.cldf = Dataset.from_metadata(args.cldf)

    with contextlib.ExitStack() as stack:
        if not args.prime_cache_only:
            stack.enter_context(
                db.FreshDB.from_settings(args.settings, log=args.log))
        stack.enter_context(SessionContext(args.settings))

        for name in ['concepticon', 'glottolog']:
            if getattr(args, name):  # pragma: no cover
                if getattr(args, name + '_version'):
                    stack.enter_context(
                        Catalog(getattr(args, name),
                                tag=getattr(args, name + '_version')))
                else:
                    setattr(args, name, pathlib.Path(getattr(args, name)))

        if not args.prime_cache_only:
            with transaction.manager:
                if args.initializedb:  # pragma: no cover
                    args.initializedb.main(args)
        if hasattr(args.initializedb, 'prime_cache'):
            with transaction.manager:  # pragma: no cover
                args.initializedb.prime_cache(args)
Example #7
def _make_client(cldf_md, tmpdir):
    cldf = Dataset.from_metadata(cldf_md)
    dbpath = tmpdir / 'db.sqlite'
    db = Database(cldf, fname=dbpath, infer_primary_keys=True)
    db.write_from_tg()
    return make_app_client(dbpath,
                           metadata=datasette_cldf.metadata({'db': cldf}))
Example #8
def run(args):
    args.env, settings = get_env_and_settings(args.config_uri)

    with contextlib.ExitStack() as stack:
        stack.enter_context(db.FreshDB.from_settings(settings, log=args.log))
        stack.enter_context(SessionContext(settings))
        args.cldf = Dataset.from_metadata(args.abvd_cldf / 'cldf' /
                                          'cldf-metadata.json')

        if not args.prime_cache_only:
            with transaction.manager:
                main(args)
        with transaction.manager:
            prime_cache(args)
Example #9
def test_cldf(tmpdir):
    with pytest.raises(AttributeError):
        _ = CLDFWriter().cldf

    outdir = pathlib.Path(str(tmpdir))
    with CLDFWriter(CLDFSpec(dir=outdir)):
        pass
    # The metadata was copied:
    assert list(outdir.glob('*-metadata.json'))

    with CLDFWriter(CLDFSpec(dir=outdir, data_fnames=dict(ValueTable='data.csv'))) as writer:
        assert writer.cldf['ValueTable']
        writer['ValueTable', 'value'].separator = '|'
        writer.objects['ValueTable'].append(
            dict(ID=1, Language_ID='l', Parameter_ID='p', Value=[1, 2]))
    ds = Dataset.from_metadata(outdir / 'Generic-metadata.json')
    values = list(ds['ValueTable'])
    assert len(values) == 1
    assert values[0]['Value'] == ['1', '2']

    with pytest.raises(AttributeError):
        CLDFWriter(outdir).validate()
Example #10
def test_Record_download_dataset(tmp_path, mocker, tests_dir, caplog, record):
    class Response:
        def __init__(self, *args):
            self.yielded = False
            self.code = 200

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            pass

        def read(self):
            return tests_dir.joinpath(
                'petersonsouthasia-v1.1.zip').read_bytes()

    mocker.patch('cldfzenodo.record.urllib.request.urlopen', Response)

    with caplog.at_level(logging.INFO):
        assert Dataset.from_metadata(
            record.download_dataset(
                tmp_path, log=logging.getLogger(__name__))).validate()
        assert caplog.records
Example #11
from collections import Counter
from tabulate import tabulate
from pycldf import Dataset

# Retrieve the WALS v2020 data from GitHub.
# (The proper release on Zenodo is zipped and would need to be downloaded first):
wals = Dataset.from_metadata(
    'https://raw.githubusercontent.com/cldf-datasets/wals/v2020/cldf/StructureDataset-metadata.json'
)
# Get feature 1A ...
feature1 = wals.get_object('ParameterTable', '1A')
# ... and look at its values:
values = Counter(v.code.name for v in feature1.values)
print('\n{}\n\n{}'.format(feature1.name,
                          tabulate(values.most_common(), tablefmt='github')))
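
For comparison, the same count can be computed without the ORM helper by iterating
rows directly; `iter_rows` makes the listed CLDF properties accessible under their
property-term keys. This sketch assumes the CodeTable uses the default ID/Name columns.

# Count the value codes of feature 1A from raw rows instead of ORM objects.
codes = {code['ID']: code['Name'] for code in wals['CodeTable']}
values = Counter(
    codes[row['codeReference']]
    for row in wals.iter_rows('ValueTable', 'parameterReference', 'codeReference')
    if row['parameterReference'] == '1A')
print(tabulate(values.most_common(), tablefmt='github'))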
Example #12
def run(args):
    ds = None
    if Zenodo.DOI_PATTERN.match(args.dataset):
        z = Zenodo()
        out = z.download_record(z.record_from_doi(args.dataset),
                                pathlib.Path('.'))
        args.log.info('Downloaded files for {0} to {1}'.format(
            args.dataset, out))
        cldf_ds = list(iter_datasets(out))
    else:
        p = pathlib.Path(args.dataset)
        if p.exists() and sniff(p):
            cldf_ds = [Dataset.from_metadata(p)]
        else:  # pragma: no cover
            ds = get_dataset(args)
            cldf_ds = [ds.cldf_reader()]

    if not cldf_ds:
        raise ValueError('No CLDF dataset found for spec {0}'.format(
            args.dataset))

    try:
        count_p = max([len(list(cldf['ParameterTable'])) for cldf in cldf_ds])
    except KeyError:
        count_p = 100

    default_page_size = 100
    while default_page_size < count_p and default_page_size < 600:
        default_page_size += 100  # pragma: no cover

    #  max_returned_rows            Maximum rows that can be returned from a table
    #                               or custom query (default=1000)

    db_paths = []
    if args.db_path:  # pragma: no cover
        if len(cldf_ds) > 1:
            raise ValueError(
                'You cannot pass a db path, when multiple datasets are found')
    else:
        args.db_path = pathlib.Path(
            '{0}.sqlite'.format(ds.id if ds else 'cldf_db'))

    for i, cldf in enumerate(cldf_ds):
        if i == 0:
            db_path = args.db_path
        else:
            db_path = args.db_path.parent / (
                args.db_path.stem + '_{0}'.format(i) + args.db_path.suffix)

        if not db_path.exists():
            db = Database(cldf, fname=db_path, infer_primary_keys=True)
            db.write_from_tg()
            args.log.info('{0} loaded in {1}'.format(db.dataset, db.fname))
        db_paths.append(db_path)

    jsonlib.dump(
        datasette_cldf.metadata({db.stem: cldf for db, cldf in zip(db_paths, cldf_ds)}),
        args.cfg_path,
        indent=4)

    os.system(
        'datasette {0} -m {1} --template-dir {2} --config default_page_size:{3}'
        .format(' '.join(str(p) for p in db_paths), args.cfg_path,
                pathlib.Path(datasette_cldf.__file__).parent / 'templates',
                default_page_size))
Example #13
def cldf_dataset():
    return Dataset.from_metadata(
        Path(pylexibank.__file__).parent / 'cldf-metadata.json')
Example #14
def Wordlist():
    return Dataset.from_metadata(
        pathlib.Path(__file__).parent / 'Wordlist' / 'Wordlist-metadata.json')
Example #15
def StructureDataset():
    return Dataset.from_metadata(
        pathlib.Path(__file__).parent / 'StructureDataset' /
        'StructureDataset-metadata.json')
Example #16
def Generic():
    return Dataset.from_metadata(
        pathlib.Path(__file__).parent / 'Generic' / 'Generic-metadata.json')
Example #17
def Dictionary():
    return Dataset.from_metadata(
        pathlib.Path(__file__).parent / 'Dictionary' /
        'Dictionary-metadata.json')
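
Each of the five fixtures above loads a module-specific example dataset stored next
to the test code. A hypothetical test built on one of them might look like this
(`module` and `validate()` are pycldf Dataset attributes):

def test_structuredataset_module(StructureDataset):
    # The module name is derived from the dataset's metadata.
    assert StructureDataset.module == 'StructureDataset'
    assert StructureDataset.validate()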
Example #18
def cldf_dataset(pytestconfig):
    from pycldf import Dataset

    return Dataset.from_metadata(pytestconfig.getoption('cldf_metadata'))
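
The fixture above reads the metadata path from a pytest command-line option; a
minimal, hypothetical conftest.py registration for that option (the option name is
inferred from the `getoption` call) could look like this:

def pytest_addoption(parser):
    # Hypothetical option; the default path is an assumption.
    parser.addoption(
        '--cldf-metadata',
        action='store',
        default='cldf/cldf-metadata.json',
        help='Path to the CLDF metadata file of the dataset under test')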
Example #19
def dataset(metadata_path):
    return Dataset.from_metadata(metadata_path)
Example #20
def multilingual_dataset(fixtures):
    return Dataset.from_metadata(fixtures / 'multilingual' /
                                 'cldf-metadata.json')
Example #21
def get_dataset(args):
    if args.dataset.suffix == '.json':
        return Dataset.from_metadata(args.dataset)
    return Dataset.from_data(args.dataset)
def main(args):
    for (org, repos), recs in itertools.groupby(
            sorted(oai.Records('tular'),
                   key=lambda r: (r.repos.org, r.repos.repos, r.version),
                   reverse=True),
            lambda r: (r.repos.org, r.repos.repos),
    ):
        if org == 'tupian-language-resources' and repos in DATASETS:
            DATASETS[repos] = next(recs)

    data = Data()
    dataset = data.add(
        common.Dataset,
        'tular',
        id=tular.__name__,
        domain="tular.clld.org",
        name="TuLaR",
        description="TupĂ­an Language Resources",
        publisher_name="Max-Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        license='https://creativecommons.org/licenses/by-sa/4.0/',
        contact="*****@*****.**",
        jsondata={
            'license_icon':
            'cc-by-sa.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 4.0 International License'
        },
    )

    rd = pathlib.Path(tular.__file__).parent.parent.parent.resolve()
    root = input('Project dir [{}]: '.format(str(rd)))
    root = pathlib.Path(root) if root else rd
    clts = clts_from_input(rd / '..' / 'cldf-clts' / 'clts-data')

    for db, rec in DATASETS.items():
        print(db, rec.doi, rec.tag)
        dbdir = root.joinpath(db)
        assert dbdir.exists()
        md = jsonlib.load(dbdir / 'metadata.json')
        name = md['title']
        if md['description']:
            name += ': {}'.format(md['description'])
        contribution = data.add(
            Database,
            db,
            id=db,
            name=name,
            description=rec.citation if rec else None,
            doi=rec.doi if rec else None,
        )
        header, contribs = next(
            iter_markdown_tables(
                dbdir.joinpath('CONTRIBUTORS.md').read_text(encoding='utf8')))
        for i, contrib in enumerate(contribs):
            contrib = dict(zip(header, contrib))
            cid = slug(HumanName(contrib['Name']).last)
            contributor = data['Contributor'].get(cid)
            if not contributor:
                contributor = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=contrib['Name'],
                    description=contrib.get('Affiliation'),
                )
            DBSession.add(
                common.ContributionContributor(
                    contribution=contribution,
                    contributor=contributor,
                    primary='author' in contrib['Role'].lower(),
                    ord=i,
                ))

    for i, cid in enumerate(
        ['gerardi', 'reichert', 'aragon', 'list', 'forkel']):
        DBSession.add(
            common.Editor(contributor=data['Contributor'][cid],
                          dataset=dataset,
                          ord=i))

    source_ids = list(add_sources(args.cldf.bibpath, DBSession))
    sources = {s.id: s.pk for s in DBSession.query(common.Source)}
    subgroups = []

    for row in args.cldf['LanguageTable']:
        if row['SubGroup'] not in subgroups:
            subgroups.append(row['SubGroup'])
        family = data['Family'].get(row['Family'])
        if (not family) and row['Family']:
            family = data.add(Family,
                              row['Family'],
                              id=slug(row['Family']),
                              name=row['Family'])
        data.add(
            Doculect,
            row['ID'],
            id=row['ID'],
            name=row['Name'].replace('_', ' '),
            family=family,
            subfamily=row['SubGroup'],
            iso_code=row['ISO639P3code'],
            glotto_code=row['Glottocode'],
            longitude=row['Longitude'],
            latitude=row['Latitude'],
            jsondata=dict(icon=SUBGROUPS[row['SubGroup']]),
        )

    tudet = Dataset.from_metadata(root / 'tudet' / 'cldf' /
                                  'Generic-metadata.json')
    seen = set()
    for row in tudet['ExampleTable']:
        if row['ID'] in seen:
            print('skipping duplicate sentence ID {}'.format(row['ID']))
            continue
        seen.add(row['ID'])
        DBSession.add(
            Example(id=row['ID'],
                    name=row['Primary_Text'],
                    description=row['Translated_Text'],
                    language=data['Doculect'][row['Language_ID']],
                    conllu=row['conllu']))

    contrib = data['Database']['tuled']
    for row in args.cldf['ParameterTable']:
        data.add(
            Concept,
            row['ID'],
            id=row['ID'].split('_')[0],
            name=row['Name'],
            portuguese=row['Portuguese_Gloss'],
            semantic_field=row['Semantic_Field'],
            concepticon_class=row['Concepticon_ID'],
            eol=row['EOL_ID'],
        )
    for (lid, pid), rows in itertools.groupby(
            sorted(args.cldf.iter_rows('FormTable', 'languageReference',
                                       'parameterReference'),
                   key=lambda r: (r['Language_ID'], r['Parameter_ID'])),
            lambda r: (r['Language_ID'], r['Parameter_ID']),
    ):
        vsid = '{}-{}'.format(lid, pid)
        vs = data.add(
            common.ValueSet,
            vsid,
            id=vsid,
            language=data['Doculect'][lid],
            parameter=data['Concept'][pid],
            contribution=contrib,
        )
        refs = set()
        for row in rows:
            data.add(
                Word,
                row['ID'],
                id=row['ID'],
                valueset=vs,
                name=row['Form'],
                tokens=' '.join(row['Segments']),
                simple_cognate=int(row['SimpleCognate']),
                notes=row['Comment'],
                morphemes=' '.join(row['Morphemes']),
                partial_cognate=' '.join(row['PartialCognates'])
                if row['PartialCognates'] else None,
            )
            refs = refs.union(row['Source'])

        for ref in refs:
            if ref in source_ids:
                DBSession.add(
                    common.ValueSetReference(valueset=vs,
                                             source_pk=sources[slug(
                                                 ref, lowercase=False)]))

    load_inventories(args.cldf, clts, data['Doculect'])

    for row in args.cldf['CognateTable']:
        cc = data['Cognateset'].get(row['Cognateset_ID'])
        if not cc:
            cc = data.add(
                Cognateset,
                row['Cognateset_ID'],
                id=row['Cognateset_ID'],
                name=row['Cognateset_ID'],
                contribution=contrib,
            )
        data.add(
            Cognate,
            row['ID'],
            cognateset=cc,
            counterpart=data['Word'][row['Form_ID']],
            alignment=' '.join(row['Alignment'] or []),
        )