def get_dataset(fname=None):
    """Load a CLDF dataset.

    Load the file as `json` CLDF metadata description file, or as
    metadata-free dataset contained in a single csv file.
    The distinction is made depending on the file extension: `.json` files
    are loaded as metadata descriptions, all other files are matched against
    the CLDF module specifications. Directories are checked for the presence
    of any CLDF datasets in undefined order of the dataset types.

    Parameters
    ----------
    fname : str or Path
        Path to a CLDF dataset

    Returns
    -------
    pycldf.Dataset
    """
    # Fall back to the bundled repository when no explicit path is given.
    fname = repository if fname is None else Path(fname)
    if not fname.exists():
        raise FileNotFoundError('{:} does not exist'.format(fname))
    # A .json suffix marks a metadata description; anything else is treated
    # as a metadata-free single-file dataset.
    if fname.suffix == '.json':
        return Dataset.from_metadata(fname)
    return Dataset.from_data(fname)
def from_path(cls, path, spec=None):
    """
    Instantiate a corpus from a file path.

    :param path: Either a path to a CLDF dataset's metadata file or to a CLDF Examples \
    component as CSV file. Note that in the latter case, the file must use the default \
    column names, as defined in the CLDF ontology.
    :return: `Corpus` instance.
    """
    if isinstance(path, str):
        path = pathlib.Path(path)
    # A metadata file fully describes the dataset; delegate directly.
    if path.suffix == '.json':
        return cls.from_cldf(Dataset.from_metadata(path), spec=spec)
    # We are given only an ExampleTable. Let's create the appropriate dataset:
    # Read only the first row to discover the CSV header.
    header = None
    for d in reader(path, dicts=True):
        header = list(d.keys())
        break
    # Start from pycldf's bundled Generic module metadata ...
    ds = Dataset.from_metadata(
        pathlib.Path(pycldf.__file__).parent / 'modules' / 'Generic-metadata.json')
    # ... but anchor it next to the data file so relative links resolve.
    ds.tablegroup._fname = path.parent / 'cldf-metadata.json'
    t = ds.add_component('ExampleTable')
    t.url = Link(path.name)
    # Trim the default schema down to the columns actually present in the
    # CSV, and add any extra columns the CSV provides.
    # NOTE(review): if the CSV is empty, `header` stays None and the set()
    # calls below raise TypeError — confirm empty input is out of scope.
    default_cols = [col.name for col in t.tableSchema.columns]
    ds.remove_columns(t, *list(set(default_cols) - set(header)))
    ds.add_columns(t, *list(set(header) - set(default_cols)))
    return cls.from_cldf(ds, spec=spec)
def get_dataset(p):
    """Load a CLDF dataset from ``p``, mapping failures to an argparse error.

    ``.json`` paths are read as metadata descriptions, anything else as a
    metadata-free single-file dataset.
    """
    try:
        if p.suffix == '.json':
            return Dataset.from_metadata(p)
        return Dataset.from_data(p)
    except ValueError:
        # Surface invalid specs as an argparse type error, so argparse
        # reports them as a usage problem rather than a traceback.
        raise argparse.ArgumentTypeError(
            'Invalid CLDF dataset spec: {0}!'.format(p))
def add_table_with_columns(
        table: str, column_names: t.Set[str], data: pycldf.Dataset) -> None:
    """Add a table with the given columns to the dataset.

    If such a table already exists, only add the columns that do not exist
    yet.

    :param table: Name of the CLDF component to ensure.
    :param column_names: Set of ``cldf_*`` column names; mutated in place
        (names already covered by the schema are removed).
    :param data: The dataset whose schema is modified.
    """
    # `delete` is True only when we created the component ourselves; in that
    # case unwanted default columns may be dropped.
    delete = True
    try:
        data[table]
        delete = False
    except KeyError:
        data.add_component(table)
    columns = data[table].tableSchema.columns
    # Iterate backwards so deleting entries does not shift unvisited indices.
    for c in range(len(columns) - 1, -1, -1):
        column = columns[c]
        # Derive the `cldf_<term>` name from the column's ontology URL.
        # assumes every schema column has a propertyUrl — TODO confirm
        expected_name = "cldf_{}".format(
            column.propertyUrl.uri.split("#")[-1].lower())
        if expected_name not in column_names and delete:
            del columns[c]
        else:
            # BUG FIX: was `column_names.remove(expected_name)`, which raised
            # KeyError for a pre-existing table containing a column that was
            # not requested. `discard` is a no-op in that case.
            column_names.discard(expected_name)
    # Whatever remains in column_names is genuinely missing: add it, mapping
    # the cldf_ prefix back to the full ontology term URL.
    for column_name in column_names:
        data.add_columns(
            table,
            column_name.replace(
                "cldf_", "http://cldf.clld.org/v1.0/terms.rdf#"))
def create_concepticon_for_concepts(
    dataset: pycldf.Dataset,
    language: t.Iterable,
    concepticon_glosses: bool,
    overwrite: bool,
    status_update: t.Optional[str],
):
    """Link the dataset's concepts to Concepticon.

    Ensures a #concepticonReference column exists on the ParameterTable,
    then guesses Concepticon references from the given gloss languages and,
    optionally, adds Concepticon gloss names.
    """
    # Track edited rows via a Status_Column when a status update was given.
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="ParameterTable")
    # Ensure a Concepticon_ID column carrying the #concepticonReference term.
    if dataset.column_names.parameters.concepticonReference is None:
        dataset.add_columns("ParameterTable", "Concepticon_ID")
        new_column = dataset["ParameterTable"].tableSchema.columns[-1]
        new_column.valueUrl = "http://concepticon.clld.org/parameters/{Concepticon_ID}"
        new_column.propertyUrl = URITemplate(
            "http://cldf.clld.org/v1.0/terms.rdf#concepticonReference"
        )
        dataset.write_metadata()
    # Default: treat the concept ID column as an English gloss source.
    if not language:
        language = [(dataset.column_names.parameters.id, "en")]
    gloss_languages: t.Dict[str, str] = dict(language)
    add_concepticon_references(
        dataset,
        gloss_languages=gloss_languages,
        status_update=status_update,
        overwrite=overwrite,
    )
    if concepticon_glosses:
        add_concepticon_names(dataset)
def add_status_column_to_table(dataset: pycldf.Dataset, table_name: str) -> None:
    """Add a ``Status_Column`` to the named table, logging if it already exists."""
    existing_columns = dataset[table_name].tableSchema.columndict
    if "Status_Column" in existing_columns:
        cli.logger.info(
            f"Table {table_name} already contains a Status_Column.")
    else:
        dataset.add_columns(table_name, "Status_Column")
def add_central_concepts_to_cognateset_table(
    dataset: pycldf.Dataset,
    add_column: bool = True,
    overwrite_existing: bool = True,
    logger: cli.logging.Logger = cli.logger,
    status_update: t.Optional = None,
) -> pycldf.Dataset:
    """Annotate each cognateset with its most central concept.

    Centrality is computed on the CLICS graph when available and when the
    dataset links concepts to Concepticon; otherwise a fallback without
    network information is used.  Returns the (reshaped) dataset.
    """
    # create mapping cognateset to central concept
    try:
        clics: t.Optional[networkx.Graph] = load_clics()
    except FileNotFoundError:
        logger.warning("Clics could not be loaded.")
        clics = None
    concepts_of_cognateset: t.Mapping[
        CognatesetID, t.Counter[ConceptID]] = connected_concepts(dataset)
    central: t.MutableMapping[str, str] = {}
    if clics and dataset.column_names.parameters.concepticonReference:
        concept_to_concepticon = concepts_to_concepticon(dataset)
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts,
                                                  concept_to_concepticon, clics)
    else:
        # NOTE(review): "ParamterTable" in this message is a typo for
        # "ParameterTable" (runtime string, left unchanged here).
        logger.warning(
            f"Dataset {dataset:} had no concepticonReference in a ParamterTable."
        )
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts, {}, None)
    # Reshaping may add the parameterReference column to the CognatesetTable.
    dataset = reshape_dataset(dataset, add_column=add_column)
    c_core_concept = dataset.column_names.cognatesets.parameterReference
    if c_core_concept is None:
        raise ValueError(
            f"Dataset {dataset:} had no parameterReference column in a CognatesetTable"
            " and is thus not compatible with this script.")
    # if status update given, add status column
    if status_update:
        add_status_column_to_table(dataset=dataset,
                                   table_name="CognatesetTable")
    # write cognatesets with central concepts
    write_back = []
    for row in cli.tq(
        dataset["CognatesetTable"],
        task="Write cognatesets with central concepts to dataset",
        total=dataset["CognatesetTable"].common_props.get("dc:extent"),
    ):
        # Keep pre-existing values unless overwriting was requested.
        if not overwrite_existing and row[c_core_concept]:
            continue
        row[c_core_concept] = central.get(
            row[dataset.column_names.cognatesets.id])
        # NOTE(review): this writes Status_Column even when status_update is
        # None (no column was added in that case) — confirm intended.
        row["Status_Column"] = status_update
        write_back.append(row)
    dataset.write(CognatesetTable=write_back)
    return dataset
def test_cldf(tmp_path):
    """Exercise the CLDFWriter lifecycle: no access before entering the
    context, metadata copying, custom data file names, and write-on-exit."""
    # Accessing .cldf before the context is entered must fail.
    with pytest.raises(AttributeError):
        _ = CLDFWriter().cldf
    with CLDFWriter(CLDFSpec(dir=tmp_path)):
        pass
    # The metadata was copied:
    # NOTE(review): Path.glob returns a generator, which is always truthy —
    # this assertion cannot fail; consider list(...) if it should.
    assert tmp_path.glob('*-metadata.json')
    with CLDFWriter(CLDFSpec(
            module='StructureDataset',
            dir=tmp_path,
            data_fnames=dict(ValueTable='data.csv',
                             ExampleTable='igt.csv'))) as writer:
        assert writer.cldf['ValueTable'] and writer.cldf['ExampleTable']
        # Make Value a list-valued column so the roundtrip below splits it.
        writer['ValueTable', 'value'].separator = '|'
        writer.objects['ValueTable'].append(
            dict(ID=1, Language_ID='l', Parameter_ID='p', Value=[1, 2]))
    # Data was written under the custom file name on context exit.
    assert tmp_path.joinpath('data.csv').exists()
    ds = Dataset.from_metadata(tmp_path / 'StructureDataset-metadata.json')
    values = list(ds['ValueTable'])
    assert len(values) == 1
    assert values[0]['Value'] == ['1', '2']
    # A bare path is not a valid CLDFSpec; .validate must fail.
    with pytest.raises(AttributeError):
        CLDFWriter(tmp_path).validate()
def run(args):
    """Initialize (and/or prime the cache of) a clld app database,
    optionally entering reference-catalog (Glottolog/Concepticon) contexts."""
    if (args.glottolog or args.concepticon) and Catalog is None:  # pragma: no cover
        print(
            'To use reference catalogs you must install the cldfcatalog package!'
        )
        # Non-zero return signals failure to the CLI wrapper.
        return 10
    if args.cldf:  # pragma: no cover
        args.cldf = Dataset.from_metadata(args.cldf)
    with contextlib.ExitStack() as stack:
        # A fresh DB is only needed for a full initialization run.
        if not args.prime_cache_only:
            stack.enter_context(
                db.FreshDB.from_settings(args.settings, log=args.log))
        stack.enter_context(SessionContext(args.settings))
        for name in ['concepticon', 'glottolog']:
            if getattr(args, name):  # pragma: no cover
                if getattr(args, name + '_version'):
                    # Check out the requested catalog version for the
                    # duration of the run.
                    stack.enter_context(
                        Catalog(getattr(args, name),
                                tag=getattr(args, name + '_version')))
                else:
                    setattr(args, name, pathlib.Path(getattr(args, name)))
        if not args.prime_cache_only:
            with transaction.manager:
                if args.initializedb:  # pragma: no cover
                    args.initializedb.main(args)
        # prime_cache runs in its own transaction, after (or instead of) main.
        if hasattr(args.initializedb, 'prime_cache'):
            with transaction.manager:  # pragma: no cover
                args.initializedb.prime_cache(args)
def test_cldf_with_dataset(ds):
    """A writer bound to a dataset records provenance in the CLDF metadata."""
    with CLDFWriter(CLDFSpec(dir=ds.cldf_dir), dataset=ds):
        pass
    written = Dataset.from_metadata(ds.cldf_dir.joinpath('Generic-metadata.json'))
    derived_from = [
        prop['rdf:about'] for prop in written.properties['prov:wasDerivedFrom']]
    assert 'http://example.org/raw' in derived_from
def _make_client(cldf_md, tmpdir):
    """Load a CLDF dataset into a fresh SQLite db and wrap it in a test client."""
    dataset = Dataset.from_metadata(cldf_md)
    db_file = tmpdir / 'db.sqlite'
    database = Database(dataset, fname=db_file, infer_primary_keys=True)
    database.write_from_tg()
    return make_app_client(
        db_file, metadata=datasette_cldf.metadata({'db': dataset}))
def metadatafree_dataset(tmp_path):
    """Fixture: a minimal metadata-free CLDF dataset from a single values CSV."""
    csv_path = tmp_path / 'values.csv'
    csv_path.write_text(
        "ID,Language_ID,Parameter_ID,Value\n"
        "1,abcd1235,param1,val1\n"
        "2,abcd1234,param1,val2",
        encoding='utf8')
    return Dataset.from_data(csv_path)
def from_stream(cls, stream, spec=None):
    """Instantiate a corpus from a stream of ExampleTable CSV rows.

    A throwaway Generic dataset (with only an ExampleTable) supplies the
    default column names for the rows read from ``stream``.
    """
    from csvw.metadata import TableGroup
    cldf = Dataset(TableGroup(fname=pathlib.Path('tmp.json')))
    cldf.add_component('ExampleTable')
    spec = spec or CorpusSpec()
    cols = cls.get_column_names(cldf)
    # Gloss and phrase cells are split on the literal two-character
    # sequence backslash-t ('\\t'), the escaped-tab word separator.
    igts = [
        IGT(
            id=igt[cols.id],
            gloss=igt[cols.gloss].split('\\t'),
            phrase=igt[cols.phrase].split('\\t'),
            language=igt.get(cols.language),
            properties=igt,
            spec=spec,
        )
        for igt in reader(stream.read().splitlines(), dicts=True)]
    return cls(igts, spec=spec)
def replace_column(
    dataset: pycldf.Dataset,
    original: str,
    replacement: str,
    column_replace: bool,
    smush: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> None:
    """Replace concept IDs in the ParameterTable.

    With ``column_replace`` the *values* of the #id column are replaced
    wholesale by the values of another column; otherwise a single ID value
    ``original`` is renamed to ``replacement``.  References elsewhere in the
    dataset are updated via ``rename``.
    """
    # add Status_column if not existing and status update given
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="ParameterTable")
    if column_replace:
        # Only the #id column may be swapped for another column's values.
        assert (
            original == "id" or original == dataset["ParameterTable", "id"].name
        ), f"Replacing an entire column is only meaningful when you change the #id column ({dataset['ParameterTable', 'id'].name}) of the ConceptTable."
        c_id = dataset["ParameterTable", original].name
        c_new = dataset["ParameterTable", replacement].name
        # old ID -> new ID, one entry per concept row.
        mapping = {
            concept[c_id]: concept[c_new] for concept in dataset["ParameterTable"]
        }
        # Unless --smush was given, refuse to merge previously distinct IDs.
        assert smush or len(mapping) == len(
            set(mapping.values())
        ), "Would collapse some concepts that were distinct before! Add '--smush' if that is intended."
        rename(dataset, mapping, logger, status_update=status_update)
    else:
        concepts = dataset["ParameterTable"]
        c_id = dataset["ParameterTable", "id"].name
        logger.info(f"Changing {c_id:} of ParameterTable…")
        # Rewrite the concept rows themselves, then update references.
        dataset.write(ParameterTable=[
            substitute_many(r, [c_id], {original: replacement},
                            status_update=None)
            for r in concepts
        ])
        rename(dataset, {original: replacement}, logger,
               status_update=status_update)
def add_central_concepts_to_cognateset_table(
    dataset: pycldf.Dataset,
    add_column: bool = True,
    overwrite_existing: bool = True,
) -> pycldf.Dataset:
    """Annotate each cognateset with its most central concept.

    Uses the CLICS graph for centrality when it is available and the dataset
    links concepts to Concepticon; otherwise falls back to a computation
    without network information.  Returns the (reshaped) dataset.
    """
    # create mapping cognateset to central concept
    try:
        clics: t.Optional[networkx.Graph] = load_clics()
    except FileNotFoundError:
        # Proceed without CLICS; the fallback branch below is used.
        clics = None
    concepts_of_cognateset: t.Mapping[
        CognatesetID, t.Counter[ConceptID]] = connected_concepts(dataset)
    central: t.MutableMapping[str, str] = {}
    if clics and dataset.column_names.parameters.concepticonReference:
        concept_to_concepticon = concepts_to_concepticon(dataset)
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts,
                                                  concept_to_concepticon, clics)
    else:
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts, {}, None)
    # Reshaping may add the parameterReference column to the CognatesetTable.
    dataset = reshape_dataset(dataset, add_column=add_column)
    c_core_concept = dataset.column_names.cognatesets.parameterReference
    if c_core_concept is None:
        raise ValueError(
            f"Dataset {dataset:} had no parameterReference column in a CognatesetTable"
            " and is thus not compatible with this script.")
    # write cognatesets with central concepts
    write_back = []
    for row in tqdm(
        dataset["CognatesetTable"],
        total=dataset["CognatesetTable"].common_props.get("dc:extent"),
    ):
        # Keep pre-existing values unless overwriting was requested.
        if not overwrite_existing and row[c_core_concept]:
            continue
        row[c_core_concept] = central.get(
            row[dataset.column_names.cognatesets.id])
        write_back.append(row)
    dataset.write(CognatesetTable=write_back)
    return dataset
def aligne_cognate_table(dataset: pycldf.Dataset,
                         status_update: t.Optional[str] = None):
    """Align the segments of all cognate judgements, cognateset by cognateset,
    and write the result into the CognateTable's alignment column."""
    # add Status_Column if not existing – TODO: make configurable
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="CognateTable")
    forms = util.cache_table(dataset, "FormTable")
    c_id = dataset["CognateTable", "id"].name
    c_form_id = dataset["CognateTable", "formReference"].name
    c_cognateset_id = dataset["CognateTable", "cognatesetReference"].name
    c_slice = dataset["CognateTable", "segmentSlice"].name
    c_alignment = dataset["CognateTable", "alignment"].name
    # cognateset ID -> [((language, morpheme segments), judgement ID), ...]
    # (annotation corrected to match the tuples appended below)
    cognatesets: t.Dict[
        str, t.List[t.Tuple[t.Tuple[str, t.List[str]], str]]] = {}
    # judgement ID -> full judgement row, for writing back in place.
    judgements: t.Dict[str, t.Dict[str, t.Any]] = {}
    for judgement in cli.tq(
        dataset["CognateTable"],
        task="Aligning the cognate segments",
        total=dataset["CognateTable"].common_props.get("dc:extent"),
    ):
        judgements[judgement[c_id]] = judgement
        form = forms[judgement[c_form_id]]
        # The judged morpheme is the whole form, or the sub-slice of its
        # segments named by the segmentSlice column.
        morpheme = []
        if not judgement[c_slice]:
            morpheme = form["segments"]
        else:
            morpheme = [
                form["segments"][i]
                for i in util.parse_segment_slices(judgement[c_slice])
            ]
        cognatesets.setdefault(judgement[c_cognateset_id], []).append(
            ((form["languageReference"], morpheme), judgement[c_id]))
    # Align per cognateset and write the alignments back into the rows.
    for cognateset, morphemes in cognatesets.items():
        for alignment, id in align(morphemes):
            judgements[id][c_alignment] = alignment
            if status_update:
                judgements[id]["Status_Column"] = status_update
    dataset.write(CognateTable=judgements.values())
def run(args):
    """Recreate the app database from the ABVD CLDF dataset and prime its cache."""
    args.env, settings = get_env_and_settings(args.config_uri)
    with contextlib.ExitStack() as stack:
        stack.enter_context(db.FreshDB.from_settings(settings, log=args.log))
        stack.enter_context(SessionContext(settings))
        metadata = args.abvd_cldf / 'cldf' / 'cldf-metadata.json'
        args.cldf = Dataset.from_metadata(metadata)
        # Full initialization unless only cache-priming was requested.
        if not args.prime_cache_only:
            with transaction.manager:
                main(args)
        with transaction.manager:
            prime_cache(args)
def test_MultiParameter(metadatafree_dataset, StructureDataset, glottolog, tmp_path):
    """Exercise MultiParameter construction over metadata-free and structured
    datasets, language properties, and continuous-type detection."""
    _ = MultiParameter(metadatafree_dataset, ['param1'],
                       glottolog={lg.id: lg for lg in glottolog.languoids()})
    mp = MultiParameter(StructureDataset, ['B', 'C'])
    for lang, values in mp.iter_languages():
        assert lang.name == 'Bengali'
        assert values['C'][0].v == 'C-1'
        assert values['C'][0].code == '1'
        # Only the first language is checked.
        break
    # Language metadata columns can be pulled in as pseudo-parameters.
    mp = MultiParameter(StructureDataset, ['B'],
                        language_properties=['Family_name'])
    assert 'Family_name' in mp.parameters
    # With no parameters requested, a synthetic language parameter appears.
    mp = MultiParameter(StructureDataset, [])
    assert '__language__' in mp.parameters
    # Ten distinct numeric values should flag the parameter as continuous.
    values = tmp_path / 'values.csv'
    values.write_text("""\
ID,Language_ID,Parameter_ID,Value
1,abcd1235,param1,1
2,abcd1235,param1,2
3,abcd1235,param1,3
4,abcd1235,param1,4
5,abcd1235,param1,5
6,abcd1235,param1,6
7,abcd1235,param1,7
8,abcd1235,param1,8
9,abcd1235,param1,9
10,abcd1234,param1,10""", encoding='utf8')
    ds = Dataset.from_data(values)
    mp = MultiParameter(ds, ['param1'],
                        glottolog={lg.id: lg for lg in glottolog.languoids()})
    assert list(mp.parameters.values())[0].type == CONTINUOUS
    mp = MultiParameter(ds, [],
                        glottolog={lg.id: lg for lg in glottolog.languoids()})
    assert len(mp.languages) == 2
def test_cldf(tmpdir):
    """Exercise the CLDFWriter lifecycle with a custom ValueTable file name."""
    # Accessing .cldf before the context is entered must fail.
    with pytest.raises(AttributeError):
        _ = CLDFWriter().cldf
    outdir = pathlib.Path(str(tmpdir))
    with CLDFWriter(CLDFSpec(dir=outdir)):
        pass
    # The metadata was copied:
    # NOTE(review): Path.glob returns a generator, which is always truthy —
    # this assertion cannot fail; consider list(...) if it should.
    assert outdir.glob('*-metadata.json')
    with CLDFWriter(CLDFSpec(dir=outdir,
                             data_fnames=dict(ValueTable='data.csv'))) as writer:
        assert writer.cldf['ValueTable']
        # Make Value a list-valued column so the roundtrip below splits it.
        writer['ValueTable', 'value'].separator = '|'
        writer.objects['ValueTable'].append(
            dict(ID=1, Language_ID='l', Parameter_ID='p', Value=[1, 2]))
    ds = Dataset.from_metadata(outdir / 'Generic-metadata.json')
    values = list(ds['ValueTable'])
    assert len(values) == 1
    assert values[0]['Value'] == ['1', '2']
    # A bare path is not a valid CLDFSpec; .validate must fail.
    with pytest.raises(AttributeError):
        CLDFWriter(outdir).validate()
def test_Record_download_dataset(tmp_path, mocker, tests_dir, caplog, record):
    """Downloading a record must yield a valid CLDF dataset and log progress."""

    class Response:
        # Minimal stand-in for urllib's response context manager, serving a
        # fixture zip archive instead of hitting the network.
        def __init__(self, *args):
            self.yielded = False
            self.code = 200

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            pass

        def read(self):
            return tests_dir.joinpath(
                'petersonsouthasia-v1.1.zip').read_bytes()

    mocker.patch('cldfzenodo.record.urllib.request.urlopen', Response)
    with caplog.at_level(logging.INFO):
        # download_dataset returns the path to the extracted metadata file.
        assert Dataset.from_metadata(
            record.download_dataset(
                tmp_path, log=logging.getLogger(__name__))).validate()
    assert caplog.records
def add_segments_to_dataset(dataset: pycldf.Dataset, transcription: str,
                            overwrite_existing: bool):
    """Segment all forms in the FormTable, storing the result in a
    #segments column (created if necessary)."""
    # Ensure the FormTable carries a space-separated Segments column.
    if dataset.column_names.forms.segments is None:
        dataset.add_columns("FormTable", "Segments")
        segments_column = dataset["FormTable"].tableSchema.columns[-1]
        segments_column.separator = " "
        segments_column.propertyUrl = URITemplate(
            "http://cldf.clld.org/v1.0/terms.rdf#segments")
        dataset.write_metadata()
    c_f_segments = dataset["FormTable", "Segments"].name
    write_back = []
    for row in dataset["FormTable"]:
        # Rows that already have segments are left untouched unless
        # overwriting was requested.
        if row[c_f_segments] and not overwrite_existing:
            continue
        if row[transcription]:
            form = row[transcription].strip()
            row[dataset.column_names.forms.segments] = segment_form(form)
        write_back.append(row)
    dataset.write(FormTable=write_back)
def add_concepticon_definitions(
    dataset: pycldf.Dataset,
    column_name: str = "Concepticon_Definition",
    logger: cli.logging.Logger = cli.logger,
) -> None:
    """Copy Concepticon concept-set definitions into the ParameterTable.

    Requires a #concepticonReference column; rows whose reference is not a
    known concept set keep their previous value.
    """
    concepticon_ids = dataset.column_names.parameters.concepticonReference
    if concepticon_ids is None:
        logger.error(
            "Your concepts table has no #concepticonReference column, so I cannot add any definitions from Concepticon to it. Try running lexedata.edit.add_concepticon to have me guess those references."
        )
        return
    # Ensure the target column exists, creating it when absent.
    try:
        dataset["ParameterTable", column_name]
        logger.info("Overwriting existing {:} column in concepts table".format(
            column_name))
    except KeyError:
        dataset.add_columns("ParameterTable", column_name)
        dataset.write_metadata()
        # Now if this throws an exception, it's an unexpected exception.
    # write concepticon definitions
    rows = []
    for row in cli.tq(
        dataset["ParameterTable"],
        task="Write concepts with concepticon definitions to dataset",
    ):
        try:
            concept_set = concepticon.api.conceptsets[row[concepticon_ids]]
            row[column_name] = concept_set.definition
        except KeyError:
            # Unknown or empty reference: leave the row unchanged.
            pass
        rows.append(row)
    dataset.write(ParameterTable=rows)
def multilingual_dataset(fixtures):
    """Fixture: the multilingual example CLDF dataset from the fixtures dir."""
    metadata = fixtures / 'multilingual' / 'cldf-metadata.json'
    return Dataset.from_metadata(metadata)
def main(args):
    """Populate the TuLaR clld app database: dataset metadata, contributors,
    languages, concepts, forms, examples, inventories and cognate sets."""
    # Pick the newest Zenodo record per (org, repo) for each known dataset.
    for (org, repos), recs in itertools.groupby(
            sorted(oai.Records('tular'),
                   key=lambda r: (r.repos.org, r.repos.repos, r.version),
                   reverse=True),
            lambda r: (r.repos.org, r.repos.repos),
    ):
        if org == 'tupian-language-resources' and repos in DATASETS:
            DATASETS[repos] = next(recs)
    data = Data()
    dataset = data.add(
        common.Dataset,
        'tular',
        id=tular.__name__,
        domain="tular.clld.org",
        name="TuLaR",
        # NOTE(review): "TupĂan" looks mis-encoded (probably "Tupían") —
        # confirm the source file's encoding.
        description="TupĂan Language Resources",
        publisher_name="Max-Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        license='https://creativecommons.org/licenses/by-sa/4.0/',
        contact="*****@*****.**",
        jsondata={
            'license_icon': 'cc-by-sa.png',
            'license_name':
                'Creative Commons Attribution-ShareAlike 4.0 International License'
        },
    )
    # Locate the project checkout (interactively overridable) and CLTS data.
    rd = pathlib.Path(tular.__file__).parent.parent.parent.resolve()
    root = input('Project dir [{}]: '.format(str(rd)))
    root = pathlib.Path(root) if root else rd
    clts = clts_from_input(rd / '..' / 'cldf-clts' / 'clts-data')
    # One Database contribution per sub-dataset, with its contributors.
    for db, rec in DATASETS.items():
        print(db, rec.doi, rec.tag)
        dbdir = root.joinpath(db)
        assert dbdir.exists()
        md = jsonlib.load(dbdir / 'metadata.json')
        name = md['title']
        if md['description']:
            name += ': {}'.format(md['description'])
        contribution = data.add(
            Database,
            db,
            id=db,
            name=name,
            description=rec.citation if rec else None,
            doi=rec.doi if rec else None,
        )
        header, contribs = next(
            iter_markdown_tables(
                dbdir.joinpath('CONTRIBUTORS.md').read_text(encoding='utf8')))
        for i, contrib in enumerate(contribs):
            contrib = dict(zip(header, contrib))
            # Contributor IDs are slugs of last names, shared across datasets.
            cid = slug(HumanName(contrib['Name']).last)
            contributor = data['Contributor'].get(cid)
            if not contributor:
                contributor = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=contrib['Name'],
                    description=contrib.get('Affiliation'),
                )
            DBSession.add(
                common.ContributionContributor(
                    contribution=contribution,
                    contributor=contributor,
                    primary='author' in contrib['Role'].lower(),
                    ord=i,
                ))
    # Fixed editor list for the umbrella dataset.
    for i, cid in enumerate(
            ['gerardi', 'reichert', 'aragon', 'list', 'forkel']):
        DBSession.add(
            common.Editor(contributor=data['Contributor'][cid],
                          dataset=dataset,
                          ord=i))
    source_ids = list(add_sources(args.cldf.bibpath, DBSession))
    sources = {s.id: s.pk for s in DBSession.query(common.Source)}
    # Languages: build families on the fly and keep subgroup ordering.
    subgroups = []
    for row in args.cldf['LanguageTable']:
        if row['SubGroup'] not in subgroups:
            subgroups.append(row['SubGroup'])
        family = data['Family'].get(row['Family'])
        if (not family) and row['Family']:
            family = data.add(Family,
                              row['Family'],
                              id=slug(row['Family']),
                              name=row['Family'])
        data.add(
            Doculect,
            row['ID'],
            id=row['ID'],
            name=row['Name'].replace('_', ' '),
            family=family,
            subfamily=row['SubGroup'],
            iso_code=row['ISO639P3code'],
            glotto_code=row['Glottocode'],
            longitude=row['Longitude'],
            latitude=row['Latitude'],
            jsondata=dict(icon=SUBGROUPS[row['SubGroup']]),
        )
    # Examples come from the separate tudet dataset; skip duplicate IDs.
    tudet = Dataset.from_metadata(root / 'tudet' / 'cldf' /
                                  'Generic-metadata.json')
    seen = set()
    for row in tudet['ExampleTable']:
        if row['ID'] in seen:
            print('skipping duplicate sentence ID {}'.format(row['ID']))
            continue
        seen.add(row['ID'])
        DBSession.add(
            Example(id=row['ID'],
                    name=row['Primary_Text'],
                    description=row['Translated_Text'],
                    language=data['Doculect'][row['Language_ID']],
                    conllu=row['conllu']))
    # Lexical data (concepts, forms, cognates) belongs to the tuled database.
    contrib = data['Database']['tuled']
    for row in args.cldf['ParameterTable']:
        data.add(
            Concept,
            row['ID'],
            id=row['ID'].split('_')[0],
            name=row['Name'],
            portuguese=row['Portuguese_Gloss'],
            semantic_field=row['Semantic_Field'],
            concepticon_class=row['Concepticon_ID'],
            eol=row['EOL_ID'],
        )
    # One ValueSet per (language, concept), with its forms and references.
    for (lid, pid), rows in itertools.groupby(
            sorted(args.cldf.iter_rows('FormTable', 'languageReference',
                                       'parameterReference'),
                   key=lambda r: (r['Language_ID'], r['Parameter_ID'])),
            lambda r: (r['Language_ID'], r['Parameter_ID']),
    ):
        vsid = '{}-{}'.format(lid, pid)
        vs = data.add(
            common.ValueSet,
            vsid,
            id=vsid,
            language=data['Doculect'][lid],
            parameter=data['Concept'][pid],
            contribution=contrib,
        )
        refs = set()
        for row in rows:
            data.add(
                Word,
                row['ID'],
                id=row['ID'],
                valueset=vs,
                name=row['Form'],
                tokens=' '.join(row['Segments']),
                simple_cognate=int(row['SimpleCognate']),
                notes=row['Comment'],
                morphemes=' '.join(row['Morphemes']),
                partial_cognate=' '.join([k for k in row['PartialCognates']])
                if row['PartialCognates'] else None,
            )
            refs = refs.union(row['Source'])
        for ref in refs:
            if ref in source_ids:
                DBSession.add(
                    common.ValueSetReference(valueset=vs,
                                             source_pk=sources[slug(
                                                 ref, lowercase=False)]))
    load_inventories(args.cldf, clts, data['Doculect'])
    # Cognate sets and judgements, created lazily per Cognateset_ID.
    for row in args.cldf['CognateTable']:
        cc = data['Cognateset'].get(row['Cognateset_ID'])
        if not cc:
            cc = data.add(
                Cognateset,
                row['Cognateset_ID'],
                id=row['Cognateset_ID'],
                name=row['Cognateset_ID'],
                contribution=contrib,
            )
        data.add(
            Cognate,
            row['ID'],
            cognateset=cc,
            counterpart=data['Word'][row['Form_ID']],
            alignment=' '.join(row['Alignment'] or []),
        )
def get_dataset(args):
    """Load args.dataset as a metadata-described or metadata-free CLDF dataset."""
    path = args.dataset
    loader = Dataset.from_metadata if path.suffix == '.json' else Dataset.from_data
    return loader(path)
def cldf_dataset():
    """Fixture: the CLDF dataset bundled with the pylexibank package."""
    metadata = Path(pylexibank.__file__).parent / 'cldf-metadata.json'
    return Dataset.from_metadata(metadata)
def run(args):
    """Manage the audio companion release of a lexibank dataset: download
    media, report sizes, build a Zenodo release dir, or update a deposit."""
    ds = Dataset().cldf_reader()
    release_dir = args.out / '{0}_audio'.format(Dataset().id)
    zenodo_file_name = 'zenodo.json'
    if args.list:
        # Tally bytes and file counts per mimetype instead of downloading.
        size = collections.Counter()
        number = collections.Counter()
    else:
        # Map form IDs to concept IDs so files land in per-concept folders.
        f2c = {r['ID']: r['Parameter_ID'] for r in ds['FormTable']}
        audio = args.out / 'audio'
        audio.mkdir(exist_ok=True)
    if not args.update_zenodo:
        for row in tqdm.tqdm([r for r in ds['media.csv']]):
            if args.list:
                size[row['mimetype']] += int(row['size'])
                number.update([row['mimetype']])
            else:
                d = audio / f2c[row['Form_ID']]
                d.mkdir(exist_ok=True)
                url = ds.get_row_url('media.csv', row)
                target = d / '{}.{}'.format(row['ID'], url.split('.')[-1])
                # Media IDs double as content checksums: re-download when the
                # local file is missing or its md5 doesn't match the ID.
                if (not target.exists()) or md5(target) != row['ID']:
                    if (args.mimetype is None) or target.suffix.endswith(
                            args.mimetype):
                        create_download_thread(url, target)
        if args.list:
            for k, v in size.most_common():
                print('\t'.join([k, str(number[k]), format_size(v)]))
        if args.create_release:
            assert audio.exists(), 'No folder "audio" found in {0}'.format(
                audio.resolve())
            release_dir.mkdir(exist_ok=True)
            args.log.info('creating audio ZIP archive per parameter folder ...')
            try:
                zipf = zipfile.ZipFile(
                    str(release_dir / 'audio.zip'), 'w', zipfile.ZIP_DEFLATED)
                fp = args.out
                for root, dirs, files in tqdm.tqdm(os.walk(audio)):
                    for f in files:
                        # Skip hidden/system files and honor the mimetype filter.
                        if not f.startswith('.') and not f.startswith('__')\
                                and ((args.mimetype is None)
                                     or f.endswith(args.mimetype)):
                            zipf.write(
                                os.path.join(root, f),
                                os.path.relpath(os.path.join(root, f), fp))
                zipf.close()
            except Exception as e:
                args.log.error(e)
                raise

            def contrib(d):
                # Keep only the Zenodo-relevant person fields.
                return {
                    k: v
                    for k, v in d.items()
                    if k in {'name', 'affiliation', 'orcid', 'type'}
                }

            # Assemble/refresh zenodo.json in place.
            with jsonlib.update(release_dir / zenodo_file_name,
                                indent=4,
                                default=collections.OrderedDict()) as md:
                contribs = Dataset().dir / 'CONTRIBUTORS.md'
                creators, contributors = get_creators_and_contributors(
                    contribs.read_text(
                        encoding='utf8') if contribs.exists() else '',
                    strict=False)
                if creators:
                    md['creators'] = [contrib(p) for p in creators]
                if contributors:
                    md['contributors'] = [contrib(p) for p in contributors]
                if COMMUNITIES:
                    md['communities'] = [{
                        'id': community_id
                    } for community_id in COMMUNITIES]
                md.update({
                    'title':
                        '{0} Audio Files'.format(Dataset().metadata.title),
                    'access_right': 'open',
                    'keywords':
                        sorted(set(md.get('keywords', []) + ['linguistics'])),
                    # NOTE(review): upload_type 'video' for audio files looks
                    # odd — confirm against the Zenodo deposit schema.
                    'upload_type': 'video',
                    'version': VERSION,
                    'related_identifiers': [
                        {
                            'scheme': 'doi',
                            'identifier': '10.5281/zenodo.4309141',
                            'relation': 'isPartOf'
                        },
                        {
                            'scheme': 'url',
                            'identifier': '{0}{1}/tree/v{2}'.format(
                                GITHUB_PREFIX, Dataset().id, VERSION),
                            'relation': 'isSupplementTo'
                        },
                    ],
                })
                if Dataset().metadata.url:
                    md['related_identifiers'].append({
                        'scheme': 'url',
                        'identifier': Dataset().metadata.url,
                        'relation': 'isAlternateIdentifier'
                    })
                md['description'] = html.escape(
                    DESCRIPTION.format(
                        GITHUB_PREFIX, Dataset().id,
                        Dataset().metadata.url
                        if Dataset().metadata.url else '', VERSION))
                license_md = ''
                if Dataset().metadata.zenodo_license:
                    md['license'] = {'id': Dataset().metadata.zenodo_license}
                    license_md = LISENCE.format(
                        Dataset().metadata.zenodo_license)
                DataDir(release_dir).write(
                    'README.md',
                    RELEASE_NOTE.format(md['title'], GITHUB_PREFIX,
                                        Dataset().id,
                                        Dataset().metadata.title, license_md))
    if args.update_zenodo:
        # Push the prepared zenodo.json to an existing deposit, after showing
        # the target record and asking for confirmation.
        assert release_dir.exists()
        assert (release_dir / zenodo_file_name).exists()
        md = {}
        md.update(jsonlib.load(release_dir / zenodo_file_name))
        api_url = API_URL
        zenodo_url = api_url.replace('api/', '')
        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=ACCESS_TOKEN)
        rec = api.record_from_id('{0}record/{1}'.format(
            zenodo_url, args.update_zenodo))
        args.log.info('  DOI:   ' + rec.metadata.doi)
        args.log.info('  Title: ' + rec.metadata.title)
        args.log.info('  Date:  ' + rec.metadata.publication_date)
        args.log.info('  Files: ' + ', '.join([f.key for f in rec.files]))
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != zenodoclient.models.PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
def run(args):
    """Serve one or more CLDF datasets through datasette: resolve the dataset
    spec (Zenodo DOI, metadata path, or lexibank id), load each into SQLite,
    write the datasette metadata config and launch the server."""
    ds = None
    if Zenodo.DOI_PATTERN.match(args.dataset):
        # A DOI: download the record and collect all CLDF datasets inside.
        z = Zenodo()
        out = z.download_record(z.record_from_doi(args.dataset),
                                pathlib.Path('.'))
        args.log.info('Downloaded files for {0} to {1}'.format(
            args.dataset, out))
        cldf_ds = list(iter_datasets(out))
    else:
        p = pathlib.Path(args.dataset)
        if p.exists() and sniff(p):
            # A local metadata file.
            cldf_ds = [Dataset.from_metadata(p)]
        else:  # pragma: no cover
            # Fall back to resolving a dataset via the lexibank machinery.
            ds = get_dataset(args)
            cldf_ds = [ds.cldf_reader()]
    if not cldf_ds:
        raise ValueError('No CLDF dataset found for spec {0}'.format(
            args.dataset))
    # Size datasette pages to the largest parameter table, in steps of 100,
    # capped below datasette's max_returned_rows.
    try:
        count_p = max([len(list(cldf['ParameterTable'])) for cldf in cldf_ds])
    except KeyError:
        count_p = 100
    default_page_size = 100
    while default_page_size < count_p and default_page_size < 600:
        default_page_size += 100  # pragma: no cover
    # max_returned_rows Maximum rows that can be returned from a table
    # or custom query (default=1000)
    db_paths = []
    if args.db_path:  # pragma: no cover
        if len(cldf_ds) > 1:
            raise ValueError(
                'You cannot pass a db path, when multiple datasets are found')
    else:
        args.db_path = pathlib.Path(
            '{0}.sqlite'.format(ds.id if ds else 'cldf_db'))
    # One SQLite file per dataset; suffix _1, _2, ... beyond the first.
    for i, cldf in enumerate(cldf_ds):
        if i == 0:
            db_path = args.db_path
        else:
            db_path = args.db_path.parent / (
                args.db_path.stem + '_{0}'.format(i) + args.db_path.suffix)
        if not db_path.exists():
            db = Database(cldf, fname=db_path, infer_primary_keys=True)
            db.write_from_tg()
            args.log.info('{0} loaded in {1}'.format(db.dataset, db.fname))
        db_paths.append(db_path)
    jsonlib.dump(datasette_cldf.metadata(
        {db.stem: cldf for db, cldf in zip(db_paths, cldf_ds)}),
        args.cfg_path,
        indent=4)
    # Hand off to the datasette CLI with our generated config and templates.
    os.system(
        'datasette {0} -m {1} --template-dir {2} --config default_page_size:{3}'
        .format(' '.join(str(p) for p in db_paths), args.cfg_path,
                pathlib.Path(datasette_cldf.__file__).parent / 'templates',
                default_page_size))
def add_status_column_to_table(dataset: pycldf.Dataset, table_name: str) -> None:
    """Add a ``Status_Column`` to the named table if it is not already there."""
    schema_columns = dataset[table_name].tableSchema.columndict
    if "Status_Column" not in schema_columns:
        dataset.add_columns(table_name, "Status_Column")
def cldf_dataset(pytestconfig):
    """Fixture: the CLDF dataset named by the --cldf-metadata option."""
    from pycldf import Dataset
    metadata_path = pytestconfig.getoption('cldf_metadata')
    return Dataset.from_metadata(metadata_path)