def add_central_concepts_to_cognateset_table(
    dataset: pycldf.Dataset,
    add_column: bool = True,
    overwrite_existing: bool = True,
    logger: cli.logging.Logger = cli.logger,
    status_update: t.Optional[str] = None,
) -> pycldf.Dataset:
    # create mapping cognateset to central concept
    try:
        clics: t.Optional[networkx.Graph] = load_clics()
    except FileNotFoundError:
        logger.warning("CLICS could not be loaded.")
        clics = None
    concepts_of_cognateset: t.Mapping[
        CognatesetID, t.Counter[ConceptID]
    ] = connected_concepts(dataset)
    central: t.MutableMapping[str, str] = {}
    if clics and dataset.column_names.parameters.concepticonReference:
        concept_to_concepticon = concepts_to_concepticon(dataset)
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(
                concepts, concept_to_concepticon, clics
            )
    else:
        logger.warning(
            f"Dataset {dataset:} had no concepticonReference in a ParameterTable."
        )
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts, {}, None)

    dataset = reshape_dataset(dataset, add_column=add_column)
    c_core_concept = dataset.column_names.cognatesets.parameterReference
    if c_core_concept is None:
        raise ValueError(
            f"Dataset {dataset:} had no parameterReference column in a CognatesetTable"
            " and is thus not compatible with this script."
        )

    # if status update given, add status column
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="CognatesetTable")

    # write cognatesets with central concepts
    write_back = []
    for row in cli.tq(
        dataset["CognatesetTable"],
        task="Write cognatesets with central concepts to dataset",
        total=dataset["CognatesetTable"].common_props.get("dc:extent"),
    ):
        if not overwrite_existing and row[c_core_concept]:
            # Keep rows that already have a central concept unchanged, so they
            # are not dropped when the table is written back.
            write_back.append(row)
            continue
        row[c_core_concept] = central.get(row[dataset.column_names.cognatesets.id])
        if status_update:
            row["Status_Column"] = status_update
        write_back.append(row)
    dataset.write(CognatesetTable=write_back)
    return dataset
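# A minimal usage sketch (hypothetical, not part of the library): load a
# metadata-described CLDF wordlist and fill in central concepts. The file name
# "Wordlist-metadata.json" and the status string are assumptions.
def _example_add_central_concepts() -> None:
    dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
    add_central_concepts_to_cognateset_table(
        dataset,
        overwrite_existing=False,
        status_update="automatic central concepts",
    )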
def replace_column(
    dataset: pycldf.Dataset,
    original: str,
    replacement: str,
    column_replace: bool,
    smush: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> None:
    # add Status_Column if not existing and status update given
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="ParameterTable")

    if column_replace:
        assert (
            original == "id" or original == dataset["ParameterTable", "id"].name
        ), f"Replacing an entire column is only meaningful when you change the #id column ({dataset['ParameterTable', 'id'].name}) of the ConceptTable."

        c_id = dataset["ParameterTable", original].name
        c_new = dataset["ParameterTable", replacement].name
        mapping = {
            concept[c_id]: concept[c_new] for concept in dataset["ParameterTable"]
        }
        assert smush or len(mapping) == len(
            set(mapping.values())
        ), "Would collapse some concepts that were distinct before! Add '--smush' if that is intended."
        # dataset["ParameterTable"].tableSchema.columns["c_id"]
        rename(dataset, mapping, logger, status_update=status_update)
    else:
        concepts = dataset["ParameterTable"]
        c_id = dataset["ParameterTable", "id"].name
        logger.info(f"Changing {c_id:} of ParameterTable…")
        dataset.write(
            ParameterTable=[
                substitute_many(r, [c_id], {original: replacement}, status_update=None)
                for r in concepts
            ]
        )
        rename(dataset, {original: replacement}, logger, status_update=status_update)
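# A minimal usage sketch (hypothetical): replace the #id column of the
# ParameterTable wholesale by another existing column. The column name
# "Concepticon_Gloss" and the status string are assumptions about the dataset.
def _example_replace_concept_ids() -> None:
    dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
    replace_column(
        dataset,
        original=dataset["ParameterTable", "id"].name,
        replacement="Concepticon_Gloss",
        column_replace=True,
        smush=False,
        status_update="concept ids replaced",
    )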
def aligne_cognate_table(dataset: pycldf.Dataset, status_update: t.Optional[str] = None):
    # add Status_Column if not existing – TODO: make configurable
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="CognateTable")

    forms = util.cache_table(dataset, "FormTable")

    c_id = dataset["CognateTable", "id"].name
    c_form_id = dataset["CognateTable", "formReference"].name
    c_cognateset_id = dataset["CognateTable", "cognatesetReference"].name
    c_slice = dataset["CognateTable", "segmentSlice"].name
    c_alignment = dataset["CognateTable", "alignment"].name

    # Each cognateset maps to a list of ((language, morpheme segments), judgement id).
    cognatesets: t.Dict[
        str, t.List[t.Tuple[t.Tuple[str, t.List[str]], str]]
    ] = {}
    judgements: t.Dict[str, t.Dict[str, t.Any]] = {}
    for judgement in cli.tq(
        dataset["CognateTable"],
        task="Aligning the cognate segments",
        total=dataset["CognateTable"].common_props.get("dc:extent"),
    ):
        judgements[judgement[c_id]] = judgement
        form = forms[judgement[c_form_id]]
        # Extract the morpheme: the whole segmented form, or only the segments
        # selected by the judgement's segment slice.
        if not judgement[c_slice]:
            morpheme = form["segments"]
        else:
            morpheme = [
                form["segments"][i]
                for i in util.parse_segment_slices(judgement[c_slice])
            ]
        cognatesets.setdefault(judgement[c_cognateset_id], []).append(
            ((form["languageReference"], morpheme), judgement[c_id])
        )

    for cognateset, morphemes in cognatesets.items():
        for alignment, id in align(morphemes):
            judgements[id][c_alignment] = alignment
            if status_update:
                judgements[id]["Status_Column"] = status_update

    dataset.write(CognateTable=judgements.values())
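# A minimal usage sketch (hypothetical): re-align all cognate judgements of a
# dataset and mark them with a status note. The status string is an assumption.
def _example_align_cognates() -> None:
    dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
    aligne_cognate_table(dataset, status_update="automatic alignment")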
def add_segments_to_dataset(
    dataset: pycldf.Dataset, transcription: str, overwrite_existing: bool
):
    if dataset.column_names.forms.segments is None:
        # Create a Segments column in FormTable
        dataset.add_columns("FormTable", "Segments")
        c = dataset["FormTable"].tableSchema.columns[-1]
        c.separator = " "
        c.propertyUrl = URITemplate("http://cldf.clld.org/v1.0/terms.rdf#segments")
        dataset.write_metadata()

    write_back = []
    c_f_segments = dataset["FormTable", "Segments"].name
    for row in dataset["FormTable"]:
        if row[c_f_segments] and not overwrite_existing:
            # Keep already-segmented rows unchanged, so they are not dropped
            # when the table is written back.
            write_back.append(row)
            continue
        if row[transcription]:
            form = row[transcription].strip()
            row[dataset.column_names.forms.segments] = segment_form(form)
        write_back.append(row)
    dataset.write(FormTable=write_back)
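# A minimal usage sketch (hypothetical): segment every form based on the
# dataset's #form column. Passing the resolved column name mirrors how the
# function indexes rows.
def _example_add_segments() -> None:
    dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
    c_form = dataset["FormTable", "form"].name
    add_segments_to_dataset(dataset, transcription=c_form, overwrite_existing=False)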
def add_concepticon_definitions(
    dataset: pycldf.Dataset,
    column_name: str = "Concepticon_Definition",
    logger: cli.logging.Logger = cli.logger,
) -> None:
    concepticon_ids = dataset.column_names.parameters.concepticonReference
    if concepticon_ids is None:
        logger.error(
            "Your concepts table has no #concepticonReference column, so I cannot "
            "add any definitions from Concepticon to it. Try running "
            "lexedata.edit.add_concepticon to have me guess those references."
        )
        return

    # Create a concepticon_definition column
    try:
        dataset["ParameterTable", column_name]
        logger.info(
            "Overwriting existing {:} column in concepts table".format(column_name)
        )
    except KeyError:
        dataset.add_columns("ParameterTable", column_name)
        dataset.write_metadata()
        # Now if this throws an exception, it's an unexpected exception.

    # write concepticon definitions
    write_back = []
    for row in cli.tq(
        dataset["ParameterTable"],
        task="Write concepts with concepticon definitions to dataset",
    ):
        try:
            row[column_name] = concepticon.api.conceptsets[
                row[concepticon_ids]
            ].definition
        except KeyError:
            pass
        write_back.append(row)
    dataset.write(ParameterTable=write_back)
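# A minimal usage sketch (hypothetical): add Concepticon definitions to the
# concepts table, storing them in the default "Concepticon_Definition" column.
def _example_add_definitions() -> None:
    dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
    add_concepticon_definitions(dataset)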