def create_concepticon_for_concepts(
    dataset: pycldf.Dataset,
    language: t.Iterable,
    concepticon_glosses: bool,
    overwrite: bool,
    status_update: t.Optional[str],
):
    # add Status_Column if status update
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="ParameterTable")
    # add Concepticon_ID column to ParameterTable
    if dataset.column_names.parameters.concepticonReference is None:
        # Create a concepticonReference column
        dataset.add_columns("ParameterTable", "Concepticon_ID")
        c = dataset["ParameterTable"].tableSchema.columns[-1]
        c.valueUrl = "http://concepticon.clld.org/parameters/{Concepticon_ID}"
        c.propertyUrl = URITemplate(
            "http://cldf.clld.org/v1.0/terms.rdf#concepticonReference"
        )
        dataset.write_metadata()
    if not language:
        language = [(dataset.column_names.parameters.id, "en")]
    gloss_languages: t.Dict[str, str] = dict(language)
    add_concepticon_references(
        dataset,
        gloss_languages=gloss_languages,
        status_update=status_update,
        overwrite=overwrite,
    )
    if concepticon_glosses:
        add_concepticon_names(dataset)
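# A minimal usage sketch for create_concepticon_for_concepts, not part of the
# original code: the metadata path and the GLOSS=en column mapping are
# illustrative assumptions about a concrete wordlist.
import pycldf

dataset = pycldf.Wordlist.from_metadata("cldf-metadata.json")  # assumed path
create_concepticon_for_concepts(
    dataset=dataset,
    language=[("GLOSS", "en")],  # hypothetical gloss column, mapped to English
    concepticon_glosses=True,
    overwrite=False,
    status_update=None,
)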
def test_Dataset_validate(tmpdir, mocker):
    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds.write(ValueTable=[])
    assert ds.validate()

    ds['ValueTable'].tableSchema.columns = []
    with pytest.raises(ValueError):
        ds.validate()
    assert not ds.validate(log=mocker.Mock())
    ds.tablegroup.tables = []
    with pytest.raises(ValueError):
        ds.validate()

    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds.add_component('LanguageTable')
    ds.write(ValueTable=[])
    ds['LanguageTable'].common_props['dc:conformsTo'] = 'http://cldf.clld.org/404'
    with pytest.raises(ValueError):
        ds.validate()

    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds['ValueTable'].get_column('Source').propertyUrl = URITemplate(
        'http://cldf.clld.org/404')
    ds.write(ValueTable=[])
    with pytest.raises(ValueError):
        ds.validate()
def apply_column_property(self, column, property, value):
    if self.is_uri_property(property):
        value = URITemplate(value)
    if property == 'propertyUrl':
        column.propertyUrl = value
    elif property == 'valueUrl':
        column.valueUrl = value
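# A standalone sketch of what such a valueUrl template does once set: the
# uritemplate package (which csvw's URITemplate builds on) expands the
# {Concepticon_ID} variable from a row value. The ID "1234" is an example.
from uritemplate import URITemplate as RFC6570Template

tmpl = RFC6570Template("http://concepticon.clld.org/parameters/{Concepticon_ID}")
assert tmpl.expand(Concepticon_ID="1234") == \
    "http://concepticon.clld.org/parameters/1234"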
def add_image_schema(writer):
    writer.cldf.add_table(
        'images.csv',
        {
            'name': 'ID',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#id',
        },
        'Taxon_ID',
        'objid',
        'bitstreamid',
        {"name": "tags", "separator": ";"},
        'mime_type',
        'creator',
        'date',
        'place',
        'permission',
        'source',
        'Comment',
    )
    writer.cldf['images.csv', 'ID'].valueUrl = URITemplate(
        'https://cdstar.shh.mpg.de/bitstreams/{objid}/{bitstreamid}')
    writer.cldf.add_foreign_key('images.csv', 'Taxon_ID', 'ParameterTable', 'ID')
def test_Dataset_validate(tmpdir, mocker):
    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds.write(ValueTable=[])
    values = tmpdir / 'new' / 'values.csv'
    assert values.check()
    Path(str(values)).unlink()
    log = mocker.Mock()
    assert not ds.validate(log=log)
    assert log.warn.called

    ds.write(ValueTable=[])
    assert ds.validate()

    ds['ValueTable'].tableSchema.columns = []
    with pytest.raises(ValueError):
        ds.validate()
    assert not ds.validate(log=mocker.Mock())
    ds.tablegroup.tables = []
    with pytest.raises(ValueError):
        ds.validate()

    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds.add_component('LanguageTable')
    ds.write(ValueTable=[], LanguageTable=[])
    assert ds.validate()

    # test violation of referential integrity:
    ds.write(
        ValueTable=[{
            'ID': '1',
            'Value': '1',
            'Language_ID': 'lid',
            'Parameter_ID': 'pid',
        }],
        LanguageTable=[])
    assert not ds.validate(log=mocker.Mock())

    # test an invalid CLDF URL:
    ds['LanguageTable'].common_props['dc:conformsTo'] = 'http://cldf.clld.org/404'
    with pytest.raises(ValueError):
        ds.validate()

    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds['ValueTable'].get_column('Source').propertyUrl = URITemplate(
        'http://cldf.clld.org/404')
    ds.write(ValueTable=[])
    with pytest.raises(ValueError):
        ds.validate()
def test_no_concepticon_definition_column_added(caplog):
    original = Path(__file__).parent / "data/cldf/smallmawetiguarani/cldf-metadata.json"
    dirname = Path(tempfile.mkdtemp(prefix="lexedata-test"))
    target = dirname / original.name
    shutil.copyfile(original, target)

    dataset = pycldf.Dataset.from_metadata(target)
    dataset.add_columns("ParameterTable", "Concepticon_ID")
    c = dataset["ParameterTable"].tableSchema.columns[-1]
    c.valueUrl = "http://concepticon.clld.org/parameters/{Concepticon_ID}"
    c.propertyUrl = URITemplate(
        "http://cldf.clld.org/v1.0/terms.rdf#concepticonReference"
    )
    dataset.add_columns("ParameterTable", "Concepticon_Definition")
    dataset.write_metadata()
    dataset.write(ParameterTable=[])

    with caplog.at_level(logging.INFO):
        add_concepticon_definitions(dataset=dataset)
    assert re.search("[oO]verwrit.*existing Concepticon_Definition", caplog.text)
def reshape_dataset(dataset: pycldf.Wordlist, add_column: bool = True) -> pycldf.Dataset:
    # check for existing cognateset table
    if dataset.column_names.cognatesets is None:
        # Create a Cognateset Table
        dataset.add_component("CognatesetTable")

    # add a concept column to the cognateset table
    if add_column:
        if dataset.column_names.cognatesets.parameterReference is None:
            dataset.add_columns("CognatesetTable", "Core_Concept_ID")
            c = dataset["CognatesetTable"].tableSchema.columns[-1]
            c.datatype = dataset["ParameterTable", "ID"].datatype
            c.propertyUrl = URITemplate(
                "http://cldf.clld.org/v1.0/terms.rdf#parameterReference")
            fname = dataset.write_metadata()
            # Reload dataset with new column definitions
            dataset = pycldf.Wordlist.from_metadata(fname)
    return dataset
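# A hedged usage sketch for reshape_dataset; the metadata path is an
# illustrative assumption. After the call, the (possibly newly created)
# CognatesetTable carries a Core_Concept_ID column typed like the
# ParameterTable's ID column.
import pycldf

dataset = pycldf.Wordlist.from_metadata("cldf-metadata.json")  # assumed path
dataset = reshape_dataset(dataset, add_column=True)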
def add_segments_to_dataset(dataset: pycldf.Dataset, transcription: str,
                            overwrite_existing: bool):
    if dataset.column_names.forms.segments is None:
        # Create a Segments column in FormTable
        dataset.add_columns("FormTable", "Segments")
        c = dataset["FormTable"].tableSchema.columns[-1]
        c.separator = " "
        c.propertyUrl = URITemplate(
            "http://cldf.clld.org/v1.0/terms.rdf#segments")
        dataset.write_metadata()

    write_back = []
    c_f_segments = dataset["FormTable", "Segments"].name
    for row in dataset["FormTable"]:
        if row[c_f_segments] and not overwrite_existing:
            # Keep the existing segmentation, but still collect the row:
            # dataset.write() replaces the whole FormTable, so skipping the
            # row entirely would silently drop it from the written file.
            write_back.append(row)
            continue
        if row[transcription]:
            form = row[transcription].strip()
            row[dataset.column_names.forms.segments] = segment_form(form)
        write_back.append(row)
    dataset.write(FormTable=write_back)
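# A minimal invocation sketch, assuming segment_form() from above is in scope
# and the CLDF #form column holds the transcription to segment; the metadata
# path is illustrative.
import pycldf

dataset = pycldf.Dataset.from_metadata("cldf-metadata.json")  # assumed path
add_segments_to_dataset(
    dataset,
    transcription=dataset.column_names.forms.form,
    overwrite_existing=False,
)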
parser = argparse.ArgumentParser()
parser.add_argument("wordlist", default="cldf-metadata.json", type=Path,
                    help="The wordlist to add Concepticon links to")
args = parser.parse_args()
dataset = pycldf.Wordlist.from_metadata(args.wordlist)

# Is there a cognateset table?
if dataset.column_names.cognatesets is None:
    # Create a Cognateset Table
    dataset.add_component("CognatesetTable")
    dataset.add_columns("CognatesetTable", "Core_Concept_ID")
    c = dataset["CognatesetTable"].tableSchema.columns[-1]
    # Note: the CLDF terms namespace uses http, not https.
    c.propertyUrl = URITemplate(
        "http://cldf.clld.org/v1.0/terms.rdf#parameterReference")
    dataset.column_names.parameters.concepticonReference = "Core_Concept_ID"
    dataset.write_metadata()
    # Reload dataset
    dataset = pycldf.Wordlist.from_metadata(args.wordlist)

c_cognateset = dataset.column_names.forms.cognatesetReference
c_form = dataset.column_names.forms.id
table = dataset["FormTable"]
if c_cognateset is None:
    c_cognateset = dataset.column_names.cognates.cognatesetReference
    c_form = dataset.column_names.cognates.formReference
    table = dataset["CognateTable"]
if c_cognateset is None:
    raise ValueError(
parser.add_argument("wordlist", default="cldf-metadata.json", type=Path, help="The wordlist to add Concepticon links to") parser.add_argument("transcription", nargs="?", default=None, help="Column containing the IPA transcriptions." "(Default: The CLDF #form column)") args = parser.parse_args() dataset = pycldf.Wordlist.from_metadata(args.wordlist) if args.transcription is None: args.transcription = dataset.column_names.forms.form if dataset.column_names.forms.segments is None: # Create a concepticonReference column dataset.add_columns("FormTable", "Segments") c = dataset["FormTable"].tableSchema.columns[-1] c.separator = " " c.propertyUrl = URITemplate("http://cldf.clld.org/v1.0/terms.rdf#segments") dataset.write_metadata() print(dataset.column_names.forms.segments) write_back = [] for row in dataset["FormTable"]: if row[args.transcription]: form = row[args.transcription].strip() row[dataset.column_names.forms.segments] = segment_form(form) write_back.append(row) dataset.write(FormTable=write_back)
def create_schema(self, cldf):
    cldf.add_table(
        'glossabbreviations.csv',
        {
            'name': 'ID',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#id',
        },
        {
            'name': 'Name',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#name',
        })
    cldf.add_table(
        'media.csv',
        {
            'name': 'ID',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#id',
            'valueUrl': 'https://cdstar.shh.mpg.de/bitstreams/{Name}',
        },
        {
            'name': 'Name',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#name',
        },
        {
            'name': 'Description',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#description',
        },
        'mimetype',
        {'name': 'size', 'datatype': 'integer'},
    )
    cldf.add_component(
        'ParameterTable',
        {
            'name': 'Contributor_ID',
            'separator': ' ',
            'dc:description': 'Authors of the Atlas chapter describing the feature',
        },
        'Chapter',  # valueUrl: https://apics-online.info/parameters/1.chapter.html
        {
            'name': 'Type',
            'dc:description':
                "Primary or structural feature, segment or sociolinguistic feature",
        },
        {
            'name': 'PHOIBLE_Segment_ID',
            'valueUrl': 'https://phoible.org/parameters/{PHOIBLE_Segment_ID}',
        },
        'PHOIBLE_Segment_Name',
        {'name': 'Multivalued', 'datatype': 'boolean'},
        {
            'name': 'WALS_ID',
            'dc:description': 'ID of the corresponding WALS feature',
        },
        {'name': 'WALS_Representation', 'datatype': 'integer'},
        'Area',
        'Map_Gall_Peters',
        {'name': 'metadata', 'dc:format': 'application/json'},
    )
    cldf['ParameterTable', 'id'].valueUrl = URITemplate(
        'https://apics-online.info/parameters/{id}')
    cldf.add_component(
        'CodeTable',
        {'name': 'Number', 'datatype': 'integer'},
        'icon',
        'color',
        'abbr',
    )
    cldf.add_component(
        'LanguageTable',
        {
            'name': 'Description',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#description',
            'dc:format': 'text/html',
        },
        {
            'name': 'Data_Contributor_ID',
            'separator': ' ',
            'dc:description': 'Authors contributing the language structure dataset',
        },
        {
            'name': 'Survey_Contributor_ID',
            'separator': ' ',
            'dc:description': 'Authors of the language survey',
        },
        'Survey_Title',
        {
            'name': 'Source',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
            'separator': ';',
        },
        'Ethnologue_Name',
        'Glossed_Text_PDF',
        'Glossed_Text_Audio',
        {
            'name': 'Metadata',
            'dc:format': 'text/json',
        },
        'Region',
        {
            'name': 'Default_Lect_ID',
            'dc:description': NON_DEFAULT_LECT,
        },
        {
            'name': 'Lexifier',
            'dc:description': LEXIFIER_DESC,
        },
    )
    cldf['LanguageTable', 'id'].valueUrl = URITemplate(
        'https://apics-online.info/contributions/{id}')
    cldf.add_component(
        'ExampleTable',
        {
            'name': 'Source',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
            'separator': ';',
        },
        'Audio',
        {'name': 'Type', 'propertyUrl': 'dc:type'},
        {'name': 'markup_text', 'dc:format': 'text/html'},
        {'name': 'markup_analyzed', 'dc:format': 'text/html'},
        {'name': 'markup_gloss', 'dc:format': 'text/html'},
        {'name': 'markup_comment', 'dc:format': 'text/html'},
        'source_comment',
        'original_script',
        'sort',
        'alt_translation',
    )
    t = cldf.add_table(
        'contributors.csv',
        {
            'name': 'ID',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#id',
        },
        {
            'name': 'Name',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#name',
        },
        'Address',
        'URL',
        {'name': 'editor_ord', 'datatype': 'integer'})
    t.common_props['dc:conformsTo'] = None
    cldf.add_columns(
        'ValueTable',
        {
            'name': 'Example_ID',
            'separator': ' ',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#exampleReference',
        },
        {
            'name': 'Frequency',
            'datatype': 'number',
        },
        'Confidence',
        {
            'name': 'Metadata',
            'dc:format': 'text/json',
        },
        'source_comment',
    )
    cldf.add_foreign_key('ParameterTable', 'Contributor_ID', 'contributors.csv', 'ID')
    cldf.add_foreign_key('ParameterTable', 'Map_Gall_Peters', 'media.csv', 'ID')
    cldf.add_foreign_key('LanguageTable', 'Glossed_Text_PDF', 'media.csv', 'ID')
    cldf.add_foreign_key('LanguageTable', 'Glossed_Text_Audio', 'media.csv', 'ID')
    cldf.add_foreign_key('ExampleTable', 'Audio', 'media.csv', 'ID')
def url_template(req, route, id_name):
    return URITemplate(
        get_url_template(req, route, relative=False, variable_map={'id': id_name}))
def cmd_makecldf(self, args):
    args.writer.add_sources()
    # We can link forms to scans of the page in the source where they appear:
    args.writer.cldf["FormTable", "Scan"].valueUrl = URITemplate(
        'https://cdstar.shh.mpg.de/bitstreams/{Objid}/gauchat_et_al_1925_tppsr_{Scan}.png'
    )
    for c in ['Population', 'SpeakerAge']:
        args.writer.cldf['LanguageTable', c].datatype.base = 'integer'
        args.writer.cldf['LanguageTable', c].datatype.minimum = 0

    values = self.raw_dir.read_csv('tppsr-db-v20.txt', delimiter='\t')
    forms = self.raw_dir.read_csv('tppsr-db-v20-ipa-narrow.txt', delimiter='\t')

    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        idx = '{0}_{1}'.format(concept.id, slug(concept.attributes['french']))
        args.writer.add_concept(
            ID=idx,
            Number=concept.number,
            Name=concept.english,
            French_Gloss=concept.attributes['french'],
            Latin_Gloss=concept.attributes['latin'],
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss)
        concepts[concept.number] = (
            idx, concept.attributes['page'], concept.attributes['french'])

    languages = args.writer.add_languages(lookup_factory='Number')

    def scan_number(bitstreams):
        p = re.compile(r'tppsr_(?P<number>[0-9]{4})\.png')
        for bs in bitstreams:
            m = p.search(bs['bitstreamid'])
            if m:
                return m.group('number')

    scans = {
        scan_number(o['bitstreams']): objid
        for objid, o in self.raw_dir.read_json('tppsr_scans.json').items()
    }

    phrase_data = collections.defaultdict(dict)
    for row1, row2 in progressbar(zip(values, forms), desc='cldfify'):
        entry = row1[2]
        for s, t in [('\u0320', '')]:
            entry = entry.replace(s, t)
        tokens = self.tokenizer({}, entry.strip().replace(' ', '_'), column='IPA')
        # Compute scan number from concept number and language number.
        page = int(concepts[row1[0]][1]) + int(int(row1[1]) > 31)
        scan = str(page + 18).rjust(4, '0')
        if row1[2].replace('_', '').replace('-', '').strip():
            phrase_data[row1[1]][row1[0]] = (row2[2], row1[2])
            args.writer.add_form_with_segments(
                Value=row1[2],
                Form=''.join(tokens),
                Segments=tokens,
                Profile=' '.join(
                    self.tokenizer({}, entry.strip(), column='Grapheme')),
                Source=['Gauchat1925[{0}]'.format(page)],
                Language_ID=languages[row1[1]],
                Parameter_ID=concepts[row1[0]][0],
                Scan=scan,
                Objid=scans[scan],
                ProsodicStructure=prosodic_string(tokens, _output='CcV'),
                SegmentedValue=' '.join(
                    self.tokenizer({}, entry, column='Graphemes')))

    args.writer.cldf.add_component(
        'ExampleTable',
        'Alt_Transcription',
        {
            "name": "Concept_ID",
            "separator": " ",
            "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
        },
        {
            "name": "Form_ID",
            "separator": " ",
            "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#formReference",
        },
    )
    args.writer.cldf.add_foreign_key(
        'ExampleTable', 'Concept_ID', 'ParameterTable', 'ID')
    args.writer.cldf.add_foreign_key(
        'ExampleTable', 'Form_ID', 'FormTable', 'ID')

    for phrase in self.etc_dir.read_csv('phrases.csv', dicts=True):
        for lid, data in sorted(phrase_data.items(), key=lambda i: i[0]):
            lid = languages[lid]
            cids = phrase['Concepts'].split()
            try:
                args.writer.objects['ExampleTable'].append(dict(
                    ID='{}-{}'.format(phrase['ID'], lid),
                    Language_ID=lid,
                    Primary_Text=' '.join([data[cid][0] for cid in cids]),
                    Translated_Text=' '.join([concepts[cid][2] for cid in cids]),
                    Alt_Transcription=' '.join([data[cid][1] for cid in cids]),
                    Concept_ID=[concepts[cid][0] for cid in cids],
                    Form_ID=[
                        '{}-{}-1'.format(lid, concepts[cid][0]) for cid in cids],
                ))
            except KeyError:
                pass
def apply_table_schema_property(self, table_schema, property, value):
    if self.is_uri_property(property):
        value = URITemplate(value)
    if property == 'aboutUrl':
        table_schema.aboutUrl = value
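# A hedged sketch of what applying aboutUrl amounts to, done directly on a
# pycldf dataset instead of via the handler above; the metadata path and the
# example.org URL pattern are illustrative assumptions.
from csvw.metadata import URITemplate
import pycldf

dataset = pycldf.Dataset.from_metadata("cldf-metadata.json")  # assumed path
dataset["FormTable"].tableSchema.aboutUrl = URITemplate(
    "https://example.org/forms/{ID}")
dataset.write_metadata()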
def _schema(self, args):
    args.writer.cldf['FormTable'].common_props['dc:description'] = \
        "Word forms are listed as 'counterparts', i.e. as words with a specific meaning. " \
        "Thus, words with multiple meanings may appear more than once in this table."
    args.writer.cldf['FormTable', 'Comment'].common_props['dc:description'] = \
        "For more specific comments see 'comment_on_borrowed' and 'comment_on_word_form'"
    args.writer.cldf['FormTable', 'Word_ID'].valueUrl = URITemplate(
        'https://wold.clld.org/word/{Word_ID}')
    args.writer.cldf.remove_columns('FormTable', 'Cognacy')

    t = args.writer.cldf.add_component(
        "ContributionTable",
        {
            "name": "Number_of_words",
            "datatype": "integer",
            "dc:description":
                "There would be 1814 words in each vocabulary, "
                "corresponding to the 1814 Loanword Typology meanings, if each meaning "
                "had exactly one counterpart, and if all the counterparts were "
                'different words. But many ("polysemous") words are counterparts of '
                "several meanings, many meanings have several word counterparts "
                '("synonyms", or "subcounterparts"), and many meanings have no '
                "counterparts at all, so the number of words in each database varies "
                "considerably.",
        },
        {
            "name": "Language_ID",
            "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#languageReference",
            "dc:description":
                "References the language for which this contribution provides "
                "a vocabulary.",
        },
    )
    t.common_props['dc:description'] = \
        "WOLD contributions are vocabularies (mini-dictionaries of about 1000-2000 entries) " \
        "with comprehensive information about the loanword status of each word. " \
        "Descriptions of how these vocabularies coded the data can be found in the " \
        "[descriptions](descriptions/) directory."
    args.writer.cldf['ContributionTable', 'description'].valueUrl = URITemplate(
        './descriptions/vocabulary_{ID}.md')
    args.writer.cldf['ContributionTable', 'description'].common_props['dc:format'] = \
        'text/markdown'
    args.writer.cldf['ContributionTable', 'id'].common_props["dc:description"] = \
        "The vocabulary ID number corresponds to the ordering of the chapters in the book " \
        "Loanwords in the World's Languages. Languages are listed in rough geographical order " \
        "from west to east, from Africa via Europe to Asia and the Americas, so that " \
        "geographically adjacent languages are next to each other."
    args.writer.cldf['ContributionTable', 'citation'].common_props["dc:description"] = \
        "Each vocabulary of WOLD is a separate electronic publication with a separate author " \
        "or team of authors and should be cited as specified here."
    args.writer.cldf['ContributionTable', 'contributor'].common_props["dc:description"] = \
        "The authors are experts of the language and its history. They also contributed a " \
        "prose chapter on the borrowing situation in their language that was published in the " \
        "book Loanwords in the World's Languages."
    t.add_foreign_key("Language_ID", "languages.csv", "ID")

    t = args.writer.cldf.add_component(
        'BorrowingTable',
        {
            'name': 'Source_relation',
            'datatype': {'base': 'string', 'format': "immediate|earlier"},
            'dc:description':
                "Whether a word was contributed directly (immediate) or indirectly "
                "(earlier), i.e. via another, intermediate donor languoid, to the "
                "recipient language.",
        },
        'Source_word',
        'Source_meaning',
        {
            'name': 'Source_certain',
            'datatype': {'base': 'boolean', 'format': "yes|no"},
            'dc:description': "Certainty of the source identification",
        },
        {
            'name': 'Source_languoid',
            'dc:description':
                'Donor languoid, specified as name of a language or language '
                'subgroup or family',
        },
        {
            'name': 'Source_languoid_glottocode',
            'dc:description': 'Glottocode of the source languoid',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#glottocode',
        }
    )
    t.common_props['dc:description'] = \
        'While a lot of information about the borrowing status is attached to the borrowed ' \
        'forms, the BorrowingTable lists information about (potential) source words. Note ' \
        'that we list loan events per meaning; i.e. one loanword may result in multiple ' \
        'borrowings if the word has multiple meanings.'
parser.add_argument("wordlist", default="cldf-metadata.json", type=Path, help="The wordlist to add Concepticon links to") parser.add_argument( "--language", "-l", action="append", default=[], type=equal_separated, help="Maps from column names to language codes, eg. GLOSS=en") args = parser.parse_args() dataset = pycldf.Wordlist.from_metadata(args.wordlist) if dataset.column_names.parameters.concepticonReference is None: # Create a concepticonReference column dataset.add_columns("ParameterTable", "Concepticon_ID") c = dataset["ParameterTable"].tableSchema.columns[-1] c.valueUrl = "http://concepticon.clld.org/parameters/{Concepticon_ID}" c.propertyUrl = URITemplate( "http://cldf.clld.org/v1.0/terms.rdf#concepticonReference") dataset.write_metadata() if not args.language: args.language = [(dataset.column_names.parameters.id, "en")] gloss_languages: t.Dict[str, str] = dict(args.language) add_concepticon_references(dataset, gloss_languages)