def test_singletons():
    dataset, _ = copy_to_temp_no_bib(
        Path(__file__).parent / "data/cldf/smallmawetiguarani/cldf-metadata.json"
    )
    add_status_column_to_table(dataset=dataset, table_name="CognatesetTable")
    all_cogsets, judgements = create_singletons(
        dataset=dataset, status="automatic singleton"
    )
    c_c_id = dataset["CognateTable", "id"].name
    c_cs_id = dataset["CognatesetTable", "id"].name
    cognates = [c for c in judgements if c[c_c_id].startswith("X")]
    cogsets = [c for c in all_cogsets if c[c_cs_id].startswith("X")]
    assert cognates == [
        {
            "ID": "X_old_paraguayan_guarani_two_1",
            "Form_ID": "old_paraguayan_guarani_two",
            "Comment": None,
            "Segment_Slice": ["1:5"],
            "Alignment": ["p", "a", "t", "h", "á"],
            "FIXME_IF_you_set_this_column_name_to_Value_it_messes_up_translations_due_to_conflict": "X_old_paraguayan_guarani_two_1",
        },
        {
            "ID": "X_paraguayan_guarani_five_1",
            "Form_ID": "paraguayan_guarani_five",
            "Comment": None,
            "Segment_Slice": ["1:2"],
            "Alignment": ["p", "o"],
            "FIXME_IF_you_set_this_column_name_to_Value_it_messes_up_translations_due_to_conflict": "X_paraguayan_guarani_five_1",
        },
    ]
    assert cogsets == [
        {
            "ID": "X_old_paraguayan_guarani_two_1",
            "Set": None,
            "Comment": None,
            "Name": "two",
            "Status_Column": "automatic singleton",
        },
        {
            "ID": "X_paraguayan_guarani_five_1",
            "Set": None,
            "Comment": None,
            "Name": "five",
            "Status_Column": "automatic singleton",
        },
    ]
def add_central_concepts_to_cognateset_table(
    dataset: pycldf.Dataset,
    add_column: bool = True,
    overwrite_existing: bool = True,
    logger: cli.logging.Logger = cli.logger,
    status_update: t.Optional[str] = None,
) -> pycldf.Dataset:
    # Create a mapping from cognateset to central concept
    try:
        clics: t.Optional[networkx.Graph] = load_clics()
    except FileNotFoundError:
        logger.warning("CLICS could not be loaded.")
        clics = None
    concepts_of_cognateset: t.Mapping[
        CognatesetID, t.Counter[ConceptID]
    ] = connected_concepts(dataset)
    central: t.MutableMapping[str, str] = {}
    if clics and dataset.column_names.parameters.concepticonReference:
        concept_to_concepticon = concepts_to_concepticon(dataset)
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(
                concepts, concept_to_concepticon, clics
            )
    else:
        logger.warning(
            f"Dataset {dataset:} had no concepticonReference in its ParameterTable."
        )
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts, {}, None)

    dataset = reshape_dataset(dataset, add_column=add_column)
    c_core_concept = dataset.column_names.cognatesets.parameterReference
    if c_core_concept is None:
        raise ValueError(
            f"Dataset {dataset:} had no parameterReference column in a CognatesetTable"
            " and is thus not compatible with this script."
        )

    # If a status update was given, add the status column
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="CognatesetTable")

    # Write cognatesets back with their central concepts
    write_back = []
    for row in cli.tq(
        dataset["CognatesetTable"],
        task="Write cognatesets with central concepts to dataset",
        total=dataset["CognatesetTable"].common_props.get("dc:extent"),
    ):
        if not overwrite_existing and row[c_core_concept]:
            continue
        row[c_core_concept] = central.get(row[dataset.column_names.cognatesets.id])
        # Only touch the status column when a status update was requested,
        # otherwise the row may gain a key that is not in the table schema
        if status_update:
            row["Status_Column"] = status_update
        write_back.append(row)
    dataset.write(CognatesetTable=write_back)
    return dataset
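# A minimal usage sketch (the metadata file name is hypothetical): assuming a
# CLDF wordlist whose CognateTable and CognatesetTable are in place, central
# concepts can be added and status-tagged in one call.
#
#     import pycldf
#     dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
#     add_central_concepts_to_cognateset_table(
#         dataset, status_update="automatic central concept"
#     )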
def replace_column(
    dataset: pycldf.Dataset,
    original: str,
    replacement: str,
    column_replace: bool,
    smush: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> None:
    # Add a Status_Column if it does not exist and a status update was given
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="ParameterTable")

    if column_replace:
        assert (
            original == "id" or original == dataset["ParameterTable", "id"].name
        ), (
            "Replacing an entire column is only meaningful when you change the "
            f"#id column ({dataset['ParameterTable', 'id'].name}) of the ConceptTable."
        )

        c_id = dataset["ParameterTable", original].name
        c_new = dataset["ParameterTable", replacement].name
        mapping = {
            concept[c_id]: concept[c_new] for concept in dataset["ParameterTable"]
        }
        assert smush or len(mapping) == len(
            set(mapping.values())
        ), "Would collapse some concepts that were distinct before! Add '--smush' if that is intended."
        rename(dataset, mapping, logger, status_update=status_update)
    else:
        concepts = dataset["ParameterTable"]
        c_id = dataset["ParameterTable", "id"].name
        logger.info(f"Changing {c_id:} of ParameterTable…")
        dataset.write(
            ParameterTable=[
                substitute_many(r, [c_id], {original: replacement}, status_update=None)
                for r in concepts
            ]
        )
        rename(dataset, {original: replacement}, logger, status_update=status_update)
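# A minimal usage sketch (hypothetical IDs and file name): replace the
# concept ID "one_1" with "one" throughout the ParameterTable and everywhere
# it is referenced, leaving a status note on the affected rows.
#
#     import pycldf
#     dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
#     replace_column(
#         dataset,
#         original="one_1",
#         replacement="one",
#         column_replace=False,
#         smush=False,
#         status_update="id renamed",
#     )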
def aligne_cognate_table(
    dataset: pycldf.Dataset, status_update: t.Optional[str] = None
):
    # Add a Status_Column if it does not exist – TODO: make configurable
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="CognateTable")

    forms = util.cache_table(dataset, "FormTable")
    c_id = dataset["CognateTable", "id"].name
    c_form_id = dataset["CognateTable", "formReference"].name
    c_cognateset_id = dataset["CognateTable", "cognatesetReference"].name
    c_slice = dataset["CognateTable", "segmentSlice"].name
    c_alignment = dataset["CognateTable", "alignment"].name

    # Collect, for each cognateset, the (language, morpheme) pairs together
    # with the judgement IDs they came from
    cognatesets: t.Dict[str, t.List[t.Tuple[t.Tuple[str, t.List[str]], str]]] = {}
    judgements: t.Dict[str, t.Dict[str, t.Any]] = {}
    for judgement in cli.tq(
        dataset["CognateTable"],
        task="Aligning the cognate segments",
        total=dataset["CognateTable"].common_props.get("dc:extent"),
    ):
        judgements[judgement[c_id]] = judgement
        form = forms[judgement[c_form_id]]
        if not judgement[c_slice]:
            morpheme = form["segments"]
        else:
            morpheme = [
                form["segments"][i]
                for i in util.parse_segment_slices(judgement[c_slice])
            ]
        cognatesets.setdefault(judgement[c_cognateset_id], []).append(
            ((form["languageReference"], morpheme), judgement[c_id])
        )

    for cognateset, morphemes in cognatesets.items():
        for alignment, id in align(morphemes):
            judgements[id][c_alignment] = alignment
            if status_update:
                judgements[id]["Status_Column"] = status_update

    dataset.write(CognateTable=judgements.values())
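# A minimal usage sketch (hypothetical file name): re-align every judgement in
# the CognateTable and mark the rows as automatically aligned.
#
#     import pycldf
#     dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
#     aligne_cognate_table(dataset, status_update="automatic alignment")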
def create_concepticon_for_concepts(
    dataset: pycldf.Dataset,
    language: t.Sequence[t.Tuple[str, str]],
    concepticon_glosses: bool,
    concepticon_definition: bool,
    overwrite: bool,
    status_update: t.Optional[str],
):
    # Add a Status_Column if a status update was given
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="ParameterTable")

    # Add a Concepticon_ID column to the ParameterTable
    if dataset.column_names.parameters.concepticonReference is None:
        # Create a concepticonReference column
        dataset.add_columns("ParameterTable", "Concepticon_ID")
        c = dataset["ParameterTable"].tableSchema.columns[-1]
        c.valueUrl = "http://concepticon.clld.org/parameters/{Concepticon_ID}"
        c.propertyUrl = URITemplate(
            "http://cldf.clld.org/v1.0/terms.rdf#concepticonReference"
        )
        dataset.write_metadata()

    if not language:
        language = [(dataset.column_names.parameters.id, "en")]
    gloss_languages: t.Dict[str, str] = dict(language)

    add_concepticon_references(
        dataset,
        gloss_languages=gloss_languages,
        status_update=status_update,
        overwrite=overwrite,
    )

    if concepticon_glosses:
        add_concepticon_names(dataset)
    if concepticon_definition:
        add_concepticon_definitions(dataset=dataset)
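# A minimal usage sketch (hypothetical column/language pairing and file name):
# look up Concepticon IDs via the English glosses in the ParameterTable's
# Name column, without overwriting existing links.
#
#     import pycldf
#     dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
#     create_concepticon_for_concepts(
#         dataset,
#         language=[("Name", "en")],
#         concepticon_glosses=False,
#         concepticon_definition=False,
#         overwrite=False,
#         status_update="automatic Concepticon link",
#     )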
def add_single_languages(
    metadata: Path,
    sheets: t.Iterable[openpyxl.worksheet.worksheet.Worksheet],
    match_form: t.Optional[t.List[str]],
    concept_name: t.Optional[str],
    ignore_missing: bool,
    ignore_superfluous: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger,
) -> t.Mapping[str, ImportLanguageReport]:
    if status_update == "None":
        status_update = None
    # Initiate the dataset from metadata or CSV, depending on command line arguments
    if metadata:
        if metadata.name == "forms.csv":
            dataset = pycldf.Dataset.from_data(metadata)
        else:
            dataset = pycldf.Dataset.from_metadata(metadata)

    concepts: t.Mapping[str, str]
    try:
        cid = dataset["ParameterTable", "id"].name
        if concept_name is None:
            concepts = {c[cid]: c[cid] for c in dataset["ParameterTable"]}
            concept_column = dataset["FormTable", "parameterReference"].name
        else:
            name = dataset["ParameterTable", "name"].name
            concepts = {c[name]: c[cid] for c in dataset["ParameterTable"]}
            concept_column = concept_name
    except (KeyError, FileNotFoundError) as err:
        if isinstance(err, KeyError):
            logger.warning(
                "Did not find a well-formed ParameterTable. Importing all forms independent of concept"
            )
        elif isinstance(err, FileNotFoundError):
            logger.warning(
                f"Did not find {dataset['ParameterTable'].url.string}. "
                "Importing all forms independent of concept"
            )
        concepts = KeyKeyDict()
        if concept_name:
            concept_column = concept_name
        else:
            concept_column = dataset["FormTable", "parameterReference"].name

    # Add a Status_Column if it does not exist and a status update was given
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="FormTable")

    report: t.Dict[str, ImportLanguageReport] = defaultdict(ImportLanguageReport)
    # Import all selected sheets
    for sheet in sheets:
        for lang, subreport in read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            logger=logger,
            match_form=match_form,
            entries_to_concepts=concepts,
            concept_column=concept_column,
            ignore_missing=ignore_missing,
            ignore_superfluous=ignore_superfluous,
            status_update=status_update,
        ).items():
            report[lang] += subreport
    return report
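# A minimal usage sketch (hypothetical file and sheet names): import one
# language sheet from an Excel workbook into an existing dataset and collect
# the per-language import report.
#
#     import openpyxl
#     from pathlib import Path
#     wb = openpyxl.load_workbook("new_languages.xlsx")
#     report = add_single_languages(
#         metadata=Path("Wordlist-metadata.json"),
#         sheets=[wb["Paraguayan Guaraní"]],
#         match_form=None,
#         concept_name=None,
#         ignore_missing=False,
#         ignore_superfluous=False,
#         status_update="new import",
#         logger=cli.logger,
#     )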
def load_dataset(
    metadata: Path,
    lexicon: t.Optional[str],
    cognate_lexicon: t.Optional[str] = None,
    status_update: t.Optional[str] = None,
    logger: logging.Logger = cli.logger,
):
    dataset = pycldf.Dataset.from_metadata(metadata)
    # Load the Excel parsing dialect from the metadata, if present
    try:
        dialect = argparse.Namespace(
            **dataset.tablegroup.common_props["special:fromexcel"]
        )
    except KeyError:
        dialect = None

    if not lexicon and not cognate_lexicon:
        raise argparse.ArgumentError(
            None,
            "At least one of WORDLIST and COGNATESETS excel files must be specified",
        )

    if lexicon:
        if dialect:
            try:
                EP = excel_parser_from_dialect(dataset, dialect, cognate=False)
            except (AttributeError, KeyError) as err:
                field = re.match(r".*?'(.+?)'.+?'(.+?)'$", str(err)).group(2)
                logger.warning(
                    f"User-defined format specification in the json-file was missing the key {field}, "
                    "falling back to default parser"
                )
                EP = ExcelParser
        else:
            logger.warning(
                "User-defined format specification in the json-file was missing, falling back to default parser"
            )
            EP = ExcelParser

        # Add a Status_Column if it does not exist and a status update was given
        if status_update:
            add_status_column_to_table(dataset=dataset, table_name="FormTable")

        # The intermediate storage is an in-memory DB (unless specified otherwise)
        EP = EP(dataset, row_type=Concept)
        EP.db.empty_cache()

        lexicon_wb = openpyxl.load_workbook(lexicon).active
        EP.parse_cells(lexicon_wb, status_update=status_update)
        EP.db.write_dataset_from_cache()

    # Load the cognate dataset, if one was provided
    if cognate_lexicon:
        if dialect:
            try:
                ECP = excel_parser_from_dialect(
                    dataset, argparse.Namespace(**dialect.cognates), cognate=True
                )
            except (AttributeError, KeyError) as err:
                field = re.match(r".*?'(.+?)'.+?'(.+?)'$", str(err)).group(2)
                logger.warning(
                    f"User-defined format specification in the json-file was missing the key {field}, "
                    "falling back to default parser"
                )
                ECP = ExcelCognateParser
        else:
            logger.warning(
                "User-defined format specification in the json-file was missing, falling back to default parser"
            )
            ECP = ExcelCognateParser

        # Add a Status_Column if it does not exist and a status update was given
        if status_update:
            add_status_column_to_table(dataset=dataset, table_name="CognateTable")

        ECP = ECP(dataset, row_type=CogSet)
        ECP.db.cache_dataset()
        for sheet in openpyxl.load_workbook(cognate_lexicon).worksheets:
            ECP.parse_cells(sheet, status_update=status_update)
        ECP.db.write_dataset_from_cache()
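# A minimal usage sketch (hypothetical file names): import a wordlist workbook
# and a cognateset workbook into a fresh CLDF dataset in one go.
#
#     from pathlib import Path
#     load_dataset(
#         Path("Wordlist-metadata.json"),
#         lexicon="lexicon.xlsx",
#         cognate_lexicon="cognatesets.xlsx",
#         status_update="initial import",
#     )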