def on_form_not_found(
    self,
    form: t.Dict[str, t.Any],
    cell_identifier: t.Optional[str] = None,
    language_id: t.Optional[str] = None,
    logger: cli.logging.Logger = cli.logger,
) -> bool:
    """Should I add a missing object? No, but inform the user.

    Send a warning (ObjectNotFoundWarning) reporting the missing object and
    cell.

    Returns
    =======
    False: The object should not be added.

    """
    rep = form.get("cldf_id", repr(form))
    logger.warning(
        f"Unable to find form {rep} in cell {cell_identifier} in the dataset. "
        "This cognate judgement was skipped. "
        "Please make sure that the form is present in forms.csv or in the "
        "file used for the wordlist import."
    )
    # Do a fuzzy search
    for row in self.db.find_db_candidates(
        form, self.check_for_match, edit_dist_threshold=4
    ):
        logger.info(f"Did you mean {row} ?")
    return False
def header_from_cognate_excel(
    ws: openpyxl.worksheet.worksheet.Worksheet,
    dataset: pycldf.Dataset,
    logger: cli.logging.Logger = cli.logger,
):
    row_header = []
    separators = []
    for (header,) in ws.iter_cols(
        min_row=1,
        max_row=1,
        max_col=len(dataset["CognatesetTable"].tableSchema.columns),
    ):
        column_name = header.value
        if column_name is None:
            column_name = dataset["CognatesetTable", "id"].name
        elif column_name == "CogSet":
            column_name = dataset["CognatesetTable", "id"].name
        try:
            column_name = dataset["CognatesetTable", column_name].name
        except KeyError:
            break
        row_header.append(column_name)
        separators.append(dataset["CognatesetTable", column_name].separator)
        if column_name == dataset["CognatesetTable", "comment"].name:
            logger.warning(
                f"Your cognates table has a separate ‘{header.value}’ column "
                "for comments, but `lexedata.importer.cognates` expects to "
                "extract comments from the cell comments of the cognateset "
                f"metadata columns, not from a separate column. Your "
                f"‘{header.value}’ column will be ignored."
            )
    return row_header, separators
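
# Usage sketch (the workbook and metadata paths are assumptions for
# illustration): read the first row of a cognateset spreadsheet and map its
# headers to CognatesetTable column names.
#
# import openpyxl
# import pycldf
#
# dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
# workbook = openpyxl.load_workbook("cognates.xlsx")
# row_header, separators = header_from_cognate_excel(workbook.active, dataset)
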
def source_from_source_string(
    self,
    source_string: str,
    language_id: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> str:
    """Parse a string referencing a language-specific source"""
    context: t.Optional[str]
    if ":" in source_string:
        source_part, context = source_string.split(":", maxsplit=1)
        if not context.endswith("}"):
            logger.warning(
                f"In source {source_string}: Closing bracket '}}' is missing, "
                "split into source and page/context may be wrong"
            )
        source_string = source_part + "}"
        context = context[:-1].strip()
        context = context.replace(":", "").replace(",", "")
    else:
        context = None

    if source_string.startswith("{") and source_string.endswith("}"):
        source_string = source_string[1:-1]
    if language_id is None:
        source_id = string_to_id(source_string)
    else:
        source_id = string_to_id(f"{language_id:}_s{source_string:}")

    source_id = source_id.replace(":", "").replace(",", "")

    if context:
        return f"{source_id}[{context}]"
    else:
        return source_id
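
# Illustration (as a comment, since this is a method on a cell parser instance
# whose class is not shown in this excerpt): for language_id "ache", a cell
# entry like "{Meier2004: 12-15}" is split into the source proper and its
# page/context, giving something like "ache_smeier2004[12-15]" — the exact ID
# depends on how string_to_id normalizes the source string.
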
def list_homophones(
    dataset: pycldf.Dataset,
    out: io.TextIOBase,
    logger: cli.logging.Logger = cli.logger,
) -> None:
    clics = load_clics()
    # Warn if CLICS cannot be loaded, and fall back to an empty graph.
    if not clics:
        logger.warning("CLICS could not be loaded. Using an empty graph instead.")
        clics = nx.Graph()

    c_id = dataset["ParameterTable", "id"].name
    try:
        c_concepticon = dataset["ParameterTable", "concepticonReference"].name
    except KeyError:
        cli.Exit.INVALID_DATASET(
            "This script requires a column concepticonReference in ParameterTable. "
            "Please run add_concepticon.py"
        )
    concepticon = {}
    for concept in dataset["ParameterTable"]:
        concepticon[concept[c_id]] = concept[c_concepticon]

    f_id = dataset["FormTable", "id"].name
    f_lang = dataset["FormTable", "languageReference"].name
    f_concept = dataset["FormTable", "parameterReference"].name
    f_form = dataset["FormTable", "form"].name

    homophones: t.DefaultDict[
        str, t.DefaultDict[str, t.Set[t.Tuple[str, str]]]
    ] = t.DefaultDict(lambda: t.DefaultDict(set))

    for form in dataset["FormTable"]:
        if form[f_form] == "-" or form[f_form] is None:
            continue
        if isinstance(form[f_concept], list):
            homophones[form[f_lang]][form[f_form]].add(
                tuple(form[f_concept]) + (form[f_id],)
            )
        else:
            homophones[form[f_lang]][form[f_form]].add((form[f_concept], form[f_id]))

    for lang, forms in homophones.items():
        for form, meanings in forms.items():
            if len(meanings) == 1:
                continue
            clics_nodes = {concepticon.get(concept) for concept, _ in meanings}
            if None in clics_nodes:
                x = " (but at least one concept not found):"
            else:
                x = ":"
            clics_nodes -= {None}
            if len(clics_nodes) <= 1:
                x = "Unknown" + x
            elif nx.is_connected(clics.subgraph(clics_nodes)):
                x = "Connected" + x
            else:
                x = "Unconnected" + x
            line = f"{lang}, '{form}': {x}\n"
            for ele in sorted(meanings):
                line += f"\t {ele[-1]} ({', '.join(ele[0:-1])})\n"
            out.write(line)
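
# Usage sketch (the metadata path is an assumption for illustration): write
# all groups of homophones, annotated by their CLICS connectivity, to stdout.
#
# import sys
# import pycldf
#
# dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
# list_homophones(dataset, out=sys.stdout)
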
def add_central_concepts_to_cognateset_table(
    dataset: pycldf.Dataset,
    add_column: bool = True,
    overwrite_existing: bool = True,
    logger: cli.logging.Logger = cli.logger,
    status_update: t.Optional[str] = None,
) -> pycldf.Dataset:
    # Create the mapping from cognateset to central concept.
    try:
        clics: t.Optional[networkx.Graph] = load_clics()
    except FileNotFoundError:
        logger.warning("CLICS could not be loaded.")
        clics = None
    concepts_of_cognateset: t.Mapping[
        CognatesetID, t.Counter[ConceptID]
    ] = connected_concepts(dataset)
    central: t.MutableMapping[str, str] = {}
    if clics and dataset.column_names.parameters.concepticonReference:
        concept_to_concepticon = concepts_to_concepticon(dataset)
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(
                concepts, concept_to_concepticon, clics
            )
    else:
        logger.warning(
            f"Dataset {dataset:} had no concepticonReference in its ParameterTable."
        )
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts, {}, None)

    dataset = reshape_dataset(dataset, add_column=add_column)
    c_core_concept = dataset.column_names.cognatesets.parameterReference
    if c_core_concept is None:
        raise ValueError(
            f"Dataset {dataset:} had no parameterReference column in a "
            "CognatesetTable and is thus not compatible with this script."
        )

    # If a status update was given, add a status column.
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="CognatesetTable")
    # Write cognatesets with central concepts.
    write_back = []
    for row in cli.tq(
        dataset["CognatesetTable"],
        task="Write cognatesets with central concepts to dataset",
        total=dataset["CognatesetTable"].common_props.get("dc:extent"),
    ):
        if not overwrite_existing and row[c_core_concept]:
            continue
        row[c_core_concept] = central.get(row[dataset.column_names.cognatesets.id])
        row["Status_Column"] = status_update
        write_back.append(row)
    dataset.write(CognatesetTable=write_back)
    return dataset
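
# Usage sketch (the metadata path and status tag are assumptions for
# illustration): rewrite the CognatesetTable with a central concept per
# cognateset, marking touched rows.
#
# import pycldf
#
# dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
# add_central_concepts_to_cognateset_table(
#     dataset, add_column=True, status_update="automatic central concepts"
# )
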
def separate(
    self,
    values: str,
    context: str = "",
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterable[str]:
    """Separate different form descriptions in one string.

    Separate forms separated by comma or semicolon, unless the comma or
    semicolon occurs within a set of matching component delimiters (e.g.
    brackets).

    If the brackets don't match, the whole remainder string is passed on, so
    that the form parser can try to recover as much as possible or throw an
    exception.
    """
    raw_split = re.split(self.separation_pattern, values)
    if len(raw_split) <= 1:
        for form in raw_split:
            yield form
        return

    while len(raw_split) > 1:
        if check_brackets(raw_split[0], self.bracket_pairs):
            form = raw_split.pop(0).strip()
            if form:
                yield form
            raw_split.pop(0)
        else:
            raw_split[:2] = ["".join(raw_split[:2])]

    if not check_brackets(raw_split[0], self.bracket_pairs):
        logger.warning(
            f"{context:}In values {values:}: "
            "Encountered mismatched closing delimiters. Please check that the "
            "separation of the cell into multiple entries, for different "
            "forms, was correct."
        )

    form = raw_split.pop(0).strip()
    if form:
        yield form
    assert not raw_split
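
# Illustration (as a comment, since this is a method on a parser instance): a
# separator inside matching brackets does not split the cell, so, assuming a
# parser with the default bracket pairs,
#
#     list(parser.separate("hand; arm (lower; upper)"))
#
# would yield the two entries "hand" and "arm (lower; upper)".
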
def add_cognate_table(
    dataset: pycldf.Wordlist,
    split: bool = True,
    logger: cli.logging.Logger = cli.logger,
) -> None:
    if "CognateTable" in dataset:
        return
    dataset.add_component("CognateTable")

    # TODO: Check if that cognatesetReference is already a foreign key to
    # elsewhere (could be a CognatesetTable, could be whatever), because then
    # we need to transfer that knowledge.

    # Load anything that's useful for a cognate set table: Form IDs, segments,
    # segment slices, cognateset references, alignments.
    columns = {
        "id": dataset["FormTable", "id"].name,
        "concept": dataset["FormTable", "parameterReference"].name,
        "form": dataset["FormTable", "form"].name,
    }
    for property in ["segments", "segmentSlice", "cognatesetReference", "alignment"]:
        try:
            columns[property] = dataset["FormTable", property].name
        except KeyError:
            pass
    cognate_judgements = []
    forms = cache_table(dataset, columns=columns)
    forms_without_segments = 0
    for f, form in cli.tq(
        forms.items(), task="Extracting cognate judgements from forms…"
    ):
        if form.get("cognatesetReference"):
            if split:
                cogset = util.string_to_id(
                    "{:}-{:}".format(form["concept"], form["cognatesetReference"])
                )
            else:
                cogset = form["cognatesetReference"]
            judgement = {
                "ID": f,
                "Form_ID": f,
                "Cognateset_ID": cogset,
            }
            try:
                judgement["Segment_Slice"] = form["segmentSlice"]
            except KeyError:
                try:
                    if not form["segments"]:
                        raise ValueError("No segments")
                    if (
                        "+" in form["segments"]
                        and dataset["FormTable", "cognatesetReference"].separator
                    ):
                        logger.warning(
                            "You seem to have morpheme annotations in your "
                            "cognates. I will probably mess them up a bit, "
                            "because I have not been taught properly how to "
                            "deal with them. Sorry!"
                        )
                    judgement["Segment_Slice"] = [
                        "1:{:d}".format(len(form["segments"]))
                    ]
                except (KeyError, TypeError, ValueError):
                    forms_without_segments += 1
                    if forms_without_segments < 5:
                        logger.warning(
                            f"No segments found for form {f} ({form['form']})."
                        )
            # What does an alignment mean without segments or their slices?
            # Doesn't matter, if we were given one, we take it.
            judgement["Alignment"] = form.get("alignment")
            cognate_judgements.append(judgement)

    if forms_without_segments >= 5:
        logger.warning(
            "No segments found for %d forms. You can generate segments using "
            "`lexedata.edit.segment_using_clts`.",
            forms_without_segments,
        )

    # Delete the cognateset column
    cols = dataset["FormTable"].tableSchema.columns
    remove = {
        dataset["FormTable", c].name
        for c in ["cognatesetReference", "segmentSlice", "alignment"]
        if ("FormTable", c) in dataset
    }

    def clean_form(form):
        for c in remove:
            form.pop(c, None)
        return form

    forms = [clean_form(form) for form in dataset["FormTable"]]
    for c in remove:
        ix = cols.index(dataset["FormTable", c])
        del cols[ix]

    dataset.write(FormTable=forms)
    dataset.write(CognateTable=cognate_judgements)
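
# Usage sketch (the metadata path is an assumption for illustration): turn a
# dataset that stores cognateset references directly in its FormTable into one
# with an explicit CognateTable.
#
# import pycldf
#
# dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")
# add_cognate_table(dataset, split=True)
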
def load_forms_from_tsv(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    input_file: Path,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[
    t.Mapping[int, t.Sequence[t.Tuple[types.Form_ID, range, t.Sequence[str]]]],
    t.Set[types.Form_ID],
]:
    """Read forms and partial cognate judgements from an Edictor export.

    Side effects
    ============
    This function overwrites dataset's FormTable
    """
    input = csv.DictReader(
        input_file.open(encoding="utf-8"),
        delimiter="\t",
    )

    # These days, all dicts are ordered by default. Still, better make this explicit.
    forms = util.cache_table(dataset)

    edictor_cognatesets: t.Dict[
        int, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = collections.defaultdict(list)

    form_table_upper = {
        (util.cldf_property(column.propertyUrl) or column.name).upper(): (
            util.cldf_property(column.propertyUrl) or column.name
        )
        for column in dataset["FormTable"].tableSchema.columns
    }
    form_table_upper.update(
        {
            "DOCULECT": "languageReference",
            "CONCEPT": "parameterReference",
            "IPA": "form",
            "COGID": "cognatesetReference",
            "ALIGNMENT": "alignment",
            "TOKENS": "segments",
            "CLDF_ID": "id",
            "ID": "",
        }
    )
    if "_PARAMETERREFERENCE" in [f.upper() for f in input.fieldnames]:
        form_table_upper["_PARAMETERREFERENCE"] = "parameterReference"
        form_table_upper["CONCEPT"] = ""

    separators: t.MutableMapping[str, t.Optional[str]] = {}
    for i in range(len(input.fieldnames)):
        if i == 0 and input.fieldnames[0] != "ID":
            raise ValueError(
                "When importing from Edictor, expected the first column to be "
                f"named 'ID', but found {input.fieldnames[0]}"
            )

        lingpy = input.fieldnames[i]
        try:
            input.fieldnames[i] = form_table_upper[lingpy.upper()]
        except KeyError:
            logger.warning(
                "Your edictor file contained a column %s, which I could not interpret.",
                lingpy,
            )

        if input.fieldnames[i] == "cognatesetReference":
            separators[input.fieldnames[i]] = " "
        elif input.fieldnames[i] == "alignment":
            separators[input.fieldnames[i]] = " "

        try:
            separators[input.fieldnames[i]] = dataset[
                "FormTable", input.fieldnames[i]
            ].separator
        except KeyError:
            pass

    logger.info(
        "The header of your edictor file will be interpreted as %s.",
        input.fieldnames,
    )

    affected_forms: t.Set[types.Form_ID] = set()
    for line in cli.tq(
        input, task="Importing form rows from edictor…", total=len(forms)
    ):
        # Column "" is the re-named Lingpy-ID column, so the first one.
        if not any(line.values()) or line[""].startswith("#"):
            # One of Edictor's comment rows, storing settings
            continue

        for (key, value) in line.items():
            value = value.replace("\\!t", "\t").replace("\\!n", "\n")
            sep = separators.get(key)
            if sep is not None:
                if not value:
                    line[key] = []
                else:
                    line[key] = value.split(sep)
            else:
                line[key] = value

        affected_forms.add(line["id"])

        try:
            for segments, cognateset, alignment in extract_partial_judgements(
                line["segments"],
                line["cognatesetReference"],
                line["alignment"],
                logger,
            ):
                edictor_cognatesets[cognateset].append(
                    (line["id"], segments, alignment)
                )
            forms[line["id"]] = line
        except IndexError:
            logger.warning(
                f"In form with Lingpy-ID {line['']}: Cognateset judgements "
                f"{line['cognatesetReference']} and alignment "
                f"{line['alignment']} did not match. At least one morpheme "
                "skipped."
            )
    edictor_cognatesets.pop(0, None)

    columns = {
        (util.cldf_property(column.propertyUrl) or column.name): column.name
        for column in dataset["FormTable"].tableSchema.columns
    }
    # Deliberately make use of the property of `write` to discard any entries
    # that don't correspond to existing columns. Otherwise, we'd still have to
    # get rid of the alignment, cognatesetReference and Lingpy-ID columns.
    dataset["FormTable"].write(
        (
            {
                columns[property]: value
                for property, value in form.items()
                if columns.get(property)
            }
            for form in forms.values()
        )
    )
    return edictor_cognatesets, affected_forms
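
# Usage sketch (paths are assumptions for illustration): re-import an Edictor
# export back into the CLDF dataset's FormTable and collect the cognate
# judgements for further processing.
#
# import pycldf
# from pathlib import Path
#
# dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
# edictor_cognatesets, affected_forms = load_forms_from_tsv(
#     dataset, Path("edictor-export.tsv")
# )
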
def root_presence_code(
    dataset: t.Mapping[
        types.Language_ID, t.Mapping[types.Parameter_ID, t.Set[types.Cognateset_ID]]
    ],
    relevant_concepts: t.Mapping[types.Cognateset_ID, t.Iterable[types.Parameter_ID]],
    ascertainment: t.Sequence[Literal["0", "1", "?"]] = ["0"],
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[
    t.Mapping[types.Language_ID, t.List[Literal["0", "1", "?"]]],
    t.Mapping[types.Cognateset_ID, int],
]:
    """Create a root-presence/absence coding from cognate codes in a dataset

    Take the cognate code information from a wordlist, i.e. a mapping of the
    form {Language ID: {Concept ID: {Cognateset ID}}}, and generate a binary
    alignment from it that lists for every root whether it is present in that
    language or not. Return that, and the association between cognatesets and
    characters.

    >>> alignment, roots = root_presence_code(
    ...     {"Language": {"Meaning": {"Cognateset 1"}}},
    ...     relevant_concepts={"Cognateset 1": ["Meaning"]})
    >>> alignment
    {'Language': ['0', '1']}
    >>> roots
    {'Cognateset 1': 1}

    The first entry in each sequence is always '0': The configuration where a
    form is absent from all languages is never observed, but always possible,
    so we add this entry for the purposes of ascertainment correction.

    If a root is attested at all, in any concept, it is considered present.
    Because the word list is never a complete description of the language's
    lexicon, the function employs a heuristic to generate ‘absent’ states. If
    a root is unattested, and at least half of the relevant concepts
    associated with this root are attested, but each expressed by another
    root, the root is assumed to be absent in the target language. (If there
    is exactly one central concept, then that central concept being attested
    or unknown is a special case of this general rule.) Otherwise the
    presence/absence of the root is considered unknown.

    >>> alignment, roots = root_presence_code(
    ...     {"l1": {"m1": {"c1"}},
    ...      "l2": {"m1": {"c2"}, "m2": {"c1", "c3"}}},
    ...     relevant_concepts={"c1": ["m1"], "c2": ["m1"], "c3": ["m2"]})
    >>> sorted(roots)
    ['c1', 'c2', 'c3']
    >>> sorted_roots = sorted(roots.items())
    >>> {language: [sequence[k[1]] for k in sorted_roots]
    ...  for language, sequence in alignment.items()}
    {'l1': ['1', '0', '?'], 'l2': ['1', '1', '1']}
    >>> list(zip(*sorted(zip(*alignment.values()))))
    [('0', '0', '1', '?'), ('0', '1', '1', '1')]

    """
    all_roots: t.Set[types.Cognateset_ID] = set(relevant_concepts)
    language_roots: t.MutableMapping[
        types.Language_ID, t.Set[types.Cognateset_ID]
    ] = t.DefaultDict(set)
    for language, lexicon in dataset.items():
        for concept, cognatesets in lexicon.items():
            if not cognatesets:
                logger.warning(
                    f"The root presence coder script got a language "
                    f"({language}) with an improper lexicon: There is a form "
                    f"associated with Concept {concept}, but no cognate sets "
                    "are associated with it."
                )
            for cognateset in cognatesets:
                language_roots[language].add(cognateset)

    all_roots_sorted: t.Sequence[types.Cognateset_ID] = sorted(all_roots)

    alignment = {}
    roots = {}
    for language, lexicon in dataset.items():
        alignment[language] = list(ascertainment)
        for root in all_roots_sorted:
            roots[root] = len(alignment[language])
            if root in language_roots[language]:
                alignment[language].append("1")
            else:
                n_concepts = 0
                n_filled_concepts = 0
                for concept in relevant_concepts[root]:
                    n_concepts += 1
                    if lexicon.get(concept):
                        n_filled_concepts += 1
                if 2 * n_filled_concepts >= n_concepts:
                    alignment[language].append("0")
                else:
                    alignment[language].append("?")

    return alignment, roots
def read_wordlist(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    code_column: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> t.MutableMapping[types.Language_ID, t.MutableMapping[types.Parameter_ID, t.Set]]:
    col_map = dataset.column_names

    if code_column:
        # Just in case that column was specified by property URL. We
        # definitely want the name. In any case, this will also throw a
        # helpful KeyError when the column does not exist.
        form_table_form = col_map.forms.form
        form_table_column = col_map.forms.id
        cognatesets = util.cache_table(
            dataset,
            columns={
                "form": form_table_column,
                "transcription": form_table_form,
                "code": dataset["FormTable", code_column].name,
            },
            filter=lambda row: bool(row[col_map.forms.form]),
        )
    else:
        # We search for cognatesetReferences in the FormTable or a separate
        # CognateTable.

        # Try the FormTable first.
        code_column = col_map.forms.cognatesetReference

        if code_column:
            # This is not the CLDF way, warn the user.
            form_table_column = col_map.forms.id
            form_table_form = col_map.forms.form
            logger.warning(
                "Your dataset has a cognatesetReference in the FormTable. "
                "Consider running lexedata.edit.add_cognate_table to create "
                "an explicit cognate table."
            )
            cognatesets = util.cache_table(
                dataset,
                columns={
                    "form": form_table_column,
                    "transcription": form_table_form,
                    "code": code_column,
                },
            )
        else:
            # There was no cognatesetReference in the form table. If we
            # find them in CognateTable (I mean, they should be there!), we
            # store them keyed with formReference.
            if (
                col_map.cognates
                and col_map.cognates.cognatesetReference
                and col_map.cognates.formReference
            ):
                code_column = col_map.cognates.cognatesetReference
                form_reference = col_map.cognates.formReference
                (foreign_key,) = [
                    key
                    for key in dataset["CognateTable"].tableSchema.foreignKeys
                    if key.columnReference == [form_reference]
                ]
                (form_table_column,) = foreign_key.reference.columnReference
                cognatesets = util.cache_table(
                    dataset,
                    "CognateTable",
                    {"form": form_reference, "code": code_column},
                )
            else:
                raise ValueError(
                    "Dataset has no cognatesetReference column in its "
                    "primary table or in a separate cognate table. "
                    "Is this a metadata-free wordlist and you forgot to "
                    "specify code_column explicitly?"
                )

    # Cognate sets have been loaded. Consolidate.
    cognates_by_form: t.MutableMapping[
        types.Form_ID, t.Set[types.Cognateset_ID]
    ] = t.DefaultDict(set)
    for judgement in cognatesets.values():
        cognates_by_form[judgement["form"]].add(judgement["code"])

    parameter_column = col_map.forms.parameterReference

    # If one form can have multiple concepts,
    if dataset["FormTable", parameter_column].separator:

        def all_parameters(parameter):
            return list(parameter)

    else:

        def all_parameters(parameter):
            return [parameter]

    data: t.MutableMapping[
        types.Language_ID, t.MutableMapping[types.Parameter_ID, t.Set]
    ]
    if "LanguageTable" in dataset:
        (langref_target,) = [
            key
            for key in dataset["FormTable"].tableSchema.foreignKeys
            if key.columnReference
            == [dataset["FormTable", "languageReference"].name]
        ]
        ref_col = langref_target.reference.columnReference[0]
        data = {
            lang[ref_col]: t.DefaultDict(set) for lang in dataset["LanguageTable"]
        }
    else:
        data = t.DefaultDict(lambda: t.DefaultDict(set))
    for row in dataset["FormTable"].iterdicts():
        if not row[col_map.forms.form]:
            # Transcription is empty, should not be a form. Skip, but maybe
            # warn if it was in a cognateset.
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as empty (i.e. the source noted that "
                    "the form is unknown), but it was judged to be in "
                    "cognateset %s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    cognates_by_form[row[form_table_column]],
                )
            continue

        language = row[col_map.forms.languageReference]
        if row[col_map.forms.form] == "-":
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as '-' (i.e. “concept is not available "
                    "in language %s”), but it was judged to be in cognateset "
                    "%s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    language,
                    cognates_by_form[row[form_table_column]],
                )
                cognates_by_form[row[form_table_column]] = set()
            for parameter in all_parameters(row[parameter_column]):
                if data[language][parameter]:
                    logger.warning(
                        "Form %s claims concept %s is not available in "
                        "language %s, but cognatesets %s are allocated to "
                        "that concept in that language already.",
                        row[col_map.forms.id],
                        parameter,
                        row[col_map.forms.languageReference],
                        data[language][parameter],
                    )
        for parameter in all_parameters(row[parameter_column]):
            data[language][parameter] |= cognates_by_form[row[form_table_column]]
    return data
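
# Usage sketch (the metadata path is an assumption for illustration): collect,
# for every language and concept, the set of cognate classes expressed there.
#
# import pycldf
#
# dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
# data = read_wordlist(dataset, code_column=None)
# for language, concepts in data.items():
#     ...
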
def read_single_excel_sheet(
    dataset: pycldf.Dataset,
    sheet: openpyxl.worksheet.worksheet.Worksheet,
    logger: cli.logging.Logger = cli.logger,
    match_form: t.Optional[t.List[str]] = None,
    entries_to_concepts: t.Mapping[str, str] = KeyKeyDict(),
    concept_column: t.Optional[str] = None,
    ignore_missing: bool = False,
    ignore_superfluous: bool = False,
    status_update: t.Optional[str] = None,
) -> t.Mapping[str, ImportLanguageReport]:
    report: t.Dict[str, ImportLanguageReport] = defaultdict(ImportLanguageReport)

    concept_columns: t.Tuple[str, str]
    if concept_column is None:
        concept_columns = (
            dataset["FormTable", "parameterReference"].name,
            dataset["FormTable", "parameterReference"].name,
        )
    else:
        concept_columns = (
            dataset["FormTable", "parameterReference"].name,
            concept_column,
        )
    db = DB(dataset)
    db.cache_dataset()
    # Required CLDF fields of a form
    c_f_id = db.dataset["FormTable", "id"].name
    c_f_language = db.dataset["FormTable", "languageReference"].name
    c_f_form = db.dataset["FormTable", "form"].name
    c_f_value = db.dataset["FormTable", "value"].name
    c_f_concept = db.dataset["FormTable", "parameterReference"].name
    if not match_form:
        match_form = [c_f_form, c_f_language]
    if not db.dataset["FormTable", c_f_concept].separator:
        logger.warning(
            "Your metadata does not allow polysemous forms. According to your "
            "specifications, identical forms with different concepts will "
            "always be considered homophones, not a single polysemous form. "
            "To include polysemous forms, add a separator to your FormTable "
            "#parameterReference in the Metadata.json. To find potential "
            "polysemies, run lexedata.report.list_homophones."
        )
        match_form.append(c_f_concept)
    else:
        if c_f_concept in match_form:
            logger.info(
                "Matching by concept enabled: To find potential polysemies, "
                "run lexedata.report.list_homophones."
            )

    sheet_header = get_headers_from_excel(sheet)
    form_header = list(db.dataset["FormTable"].tableSchema.columndict.keys())

    # These columns don't need to be given, we can infer them from the sheet
    # title and from the other data:
    implicit: t.Dict[Literal["languageReference", "id", "value"], str] = {}
    if c_f_language not in sheet_header:
        implicit["languageReference"] = c_f_language
    if c_f_id not in sheet_header:
        implicit["id"] = c_f_id
    if c_f_value not in sheet_header:
        implicit["value"] = c_f_value

    found_columns = set(sheet_header) - {concept_column} - set(implicit.values())
    expected_columns = set(form_header) - {c_f_concept} - set(implicit.values())
    if not found_columns >= expected_columns:
        if ignore_missing:
            logger.info(
                f"Your Excel sheet {sheet.title} is missing columns "
                f"{expected_columns - found_columns}. For the newly imported "
                "forms, these columns will be left empty in the dataset."
            )
        else:
            raise ValueError(
                f"Your Excel sheet {sheet.title} is missing columns "
                f"{expected_columns - found_columns}. Clean up your data, or "
                "use --ignore-missing-excel-columns to import anyway and "
                "leave these columns empty in the dataset for the newly "
                "imported forms."
            )
    if not found_columns <= expected_columns:
        if ignore_superfluous:
            logger.info(
                f"Your Excel sheet {sheet.title} contained unexpected columns "
                f"{found_columns - expected_columns}. These columns will be "
                "ignored."
            )
        else:
            raise ValueError(
                f"Your Excel sheet {sheet.title} contained unexpected columns "
                f"{found_columns - expected_columns}. Clean up your data, or "
                "use --ignore-superfluous-excel-columns to import the data "
                "anyway and ignore these columns."
            )

    # Check whether the language already exists.
    c_l_name = db.dataset["LanguageTable", "name"].name
    c_l_id = db.dataset["LanguageTable", "id"].name
    language_name_to_language_id = {
        row[c_l_name]: row[c_l_id] for row in db.cache["LanguageTable"].values()
    }
    language_name = normalize_string(sheet.title)
    if language_name in language_name_to_language_id:
        language_id = language_name_to_language_id[language_name]
        report[language_id].is_new_language = False
    else:
        language_id = language_name
        report[language_id].is_new_language = True

    # Read new data from the sheet.
    for form in cli.tq(
        import_data_from_sheet(
            sheet,
            sheet_header=sheet_header,
            implicit=implicit,
            language_id=language_id,
            concept_column=concept_columns,
        ),
        task=f"Parsing cells of sheet {sheet.title}",
        total=sheet.max_row,
    ):
        # If the concept is not in the dataset, don't add the form.
        try:
            concept_entry = form[c_f_concept]
            entries_to_concepts[concept_entry]
        except KeyError:
            logger.warning(
                f"Concept {form.get(c_f_concept)} was not found. Please add "
                "it to the concepts.csv file manually. The corresponding "
                "form was ignored and not added to the dataset."
            )
            report[language_id].skipped += 1
            continue
        # Otherwise, look for candidates, link to an existing form or add a
        # new form.
        for item, value in form.items():
            try:
                sep = db.dataset["FormTable", item].separator
            except KeyError:
                continue
            if sep is None:
                continue
            form[item] = value.split(sep)
        form_candidates = db.find_db_candidates(form, match_form)
        if form_candidates:
            new_concept_added = False
            for form_id in form_candidates:
                logger.info(f"Form {form[c_f_value]} was already in dataset.")

                if db.dataset["FormTable", c_f_concept].separator:
                    for new_concept in form[c_f_concept]:
                        if (
                            new_concept
                            not in db.cache["FormTable"][form_id][c_f_concept]
                        ):
                            db.cache["FormTable"][form_id][c_f_concept].append(
                                new_concept
                            )
                            logger.info(
                                f"New form-concept association: Concept "
                                f"{form[c_f_concept]} was added to existing "
                                f"form {form_id}. If this was not intended "
                                "(because it is a homophonous form, not a "
                                "polysemy), you need to manually remove that "
                                "concept from the old form in forms.csv and "
                                "create a separate new form. If you want to "
                                "treat identical forms as homophones in "
                                "general, add "
                                f"--match-forms={' '.join(match_form)}, "
                                f"{db.dataset['FormTable', 'parameterReference']} "
                                "when you run this script."
                            )
                            new_concept_added = True
                break

            if new_concept_added:
                report[language_id].concepts += 1
            else:
                report[language_id].existing += 1
        else:
            # No candidate matched: add this form as a new form, with an ID
            # made unique by an integer suffix where necessary.
            form[c_f_language] = language_id
            if "id" in implicit:
                # TODO: check for type of form id column
                form_concept = form[c_f_concept]
                concept_reference = (
                    form_concept[0] if isinstance(form_concept, list) else form_concept
                )
                form[c_f_id] = string_to_id(
                    f"{form[c_f_language]}_{concept_reference}"
                )
            db.make_id_unique(form)
            if status_update:
                form["Status_Column"] = status_update
            db.insert_into_db(form)
            report[language_id].new += 1
    # Write the data back to the CLDF dataset.
    db.write_dataset_from_cache()
    return report
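
# Usage sketch (workbook path and sheet name are assumptions for
# illustration): import one sheet, whose title is matched against the
# language names in the dataset.
#
# import openpyxl
# import pycldf
#
# dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
# workbook = openpyxl.load_workbook("new_forms.xlsx")
# report = read_single_excel_sheet(
#     dataset, workbook["Ache"], status_update="new import"
# )
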
def apply_heuristics(
    dataset: types.Wordlist,
    heuristic: t.Optional[AbsenceHeuristic] = None,
    primary_concepts: t.Union[
        types.WorldSet[types.Parameter_ID], t.AbstractSet[types.Parameter_ID]
    ] = types.WorldSet(),
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Cognateset_ID, t.Set[types.Parameter_ID]]:
    """Compute the relevant concepts for cognatesets, depending on the heuristic.

    These concepts will be considered when deciding whether a root is deemed
    absent in a language.

    For the CentralConcept heuristic, the relevant concepts are the central
    concept of a cognateset, as given by the #parameterReference column of the
    CognatesetTable. A central concept not included in the primary_concepts is
    ignored with a warning.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concept",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference"))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concept": "concept1"}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {'cognateset1': {'concept1'}}
    True

    This extends to the case where a cognateset may have more than one central
    concept.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concepts",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
    ...         separator=","))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concepts": ["concept1", "concept2"]}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {
    ...     'cognateset1': {'concept1', 'concept2'}}
    True

    For the HalfPrimaryConcepts heuristic, the relevant concepts are all
    primary concepts connected to a cognateset.

    >>> ds = util.fs.new_wordlist(
    ...     FormTable=[
    ...         {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1", "Form": "x"},
    ...         {"ID": "f2", "Parameter_ID": "c2", "Language_ID": "l1", "Form": "x"}],
    ...     CognateTable=[
    ...         {"ID": "1", "Form_ID": "f1", "Cognateset_ID": "s1"},
    ...         {"ID": "2", "Form_ID": "f2", "Cognateset_ID": "s1"}])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.HALFPRIMARYCONCEPTS) == {
    ...     's1': {'c1', 'c2'}}
    True

    NOTE: This function cannot guarantee that every cognateset has at least
    one relevant concept; there may be cognatesets without any. A cognateset
    with 0 relevant concepts will always be included, because 0 is at least
    half of 0.

    """
    heuristic = (
        heuristic
        if heuristic is not None
        else (
            AbsenceHeuristic.CENTRALCONCEPT
            if ("CognatesetTable", "parameterReference") in dataset
            else AbsenceHeuristic.HALFPRIMARYCONCEPTS
        )
    )

    relevant_concepts: t.MutableMapping[
        types.Cognateset_ID, t.Set[types.Parameter_ID]
    ] = t.DefaultDict(set)

    if heuristic is AbsenceHeuristic.HALFPRIMARYCONCEPTS:
        c_f = dataset["CognateTable", "formReference"].name
        c_s = dataset["CognateTable", "cognatesetReference"].name
        concepts = util.cache_table(
            dataset,
            "FormTable",
            {"concepts": dataset["FormTable", "parameterReference"].name},
        )
        for j in dataset["CognateTable"]:
            form = concepts[j[c_f]]
            for concept in util.ensure_list(form["concepts"]):
                relevant_concepts[j[c_s]].add(concept)

    elif heuristic is AbsenceHeuristic.CENTRALCONCEPT:
        c_cognateset_concept = dataset["CognatesetTable", "parameterReference"].name
        c_id = dataset["CognatesetTable", "id"].name
        for c in dataset["CognatesetTable"]:
            for concept in util.ensure_list(c[c_cognateset_concept]):
                if concept not in primary_concepts:
                    logger.warning(
                        f"The central concept {concept} of cognateset "
                        f"{c[c_id]} was not part of your list of primary "
                        "concepts to be included in the coding, so the "
                        "cognateset will be ignored."
                    )
                else:
                    relevant_concepts[c[c_id]].add(concept)

    else:
        raise TypeError(
            f"Value of heuristic, {heuristic}, did not correspond to a known "
            "AbsenceHeuristic."
        )

    return relevant_concepts
def segment_to_cognateset(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    cognatesets: t.Optional[t.Container[types.Cognateset_ID]],
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Form_ID, t.List[t.Set[types.Cognateset_ID]]]:
    # Required fields
    c_cognate_cognateset = dataset.column_names.cognates.cognatesetReference
    c_cognate_id = dataset.column_names.cognates.id
    c_cognate_form = dataset.column_names.cognates.formReference
    c_cognate_slice = dataset.column_names.cognates.segmentSlice

    forms = util.cache_table(dataset)
    cognateset_cache: t.Container[types.Cognateset_ID]
    if "CognatesetTable" in dataset:
        c_s_id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[c_s_id]
            for cognateset in dataset["CognatesetTable"]
            if cognatesets is None or cognateset[c_s_id] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = types.WorldSet()
        else:
            cognateset_cache = cognatesets

    which_segment_belongs_to_which_cognateset: t.Mapping[
        types.Form_ID, t.List[t.Set[types.Cognateset_ID]]
    ] = {
        f: [set() for _ in form["segments"]]
        for f, form in forms.items()
        if form["form"] and form["form"].strip() and form["form"].strip() != "-"
    }
    for j in dataset["CognateTable"]:
        if j[c_cognate_form] in forms and j[c_cognate_cognateset] in cognateset_cache:
            form = forms[j[c_cognate_form]]
            if j[c_cognate_form] not in which_segment_belongs_to_which_cognateset:
                continue
            if j.get(c_cognate_slice):
                try:
                    segments_judged = list(parse_segment_slices(j[c_cognate_slice]))
                except ValueError:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment slice "
                        f"{','.join(j[c_cognate_slice])} has start after end."
                    )
                    continue
            else:
                segments_judged = list(range(len(form["segments"])))
            old_s = None
            for s in segments_judged:
                if old_s is not None and old_s + 1 != s:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment {s + 1} "
                        f"follows segment {old_s + 1}, so the morpheme is "
                        "non-contiguous"
                    )
                old_s = s
                try:
                    judged_cognatesets = which_segment_belongs_to_which_cognateset[
                        j[c_cognate_form]
                    ][s]
                except IndexError:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment slice "
                        f"{','.join(j[c_cognate_slice])} points outside valid "
                        f"range 1:{len(form['segments'])}."
                    )
                    continue
                judged_cognatesets.add(j[c_cognate_cognateset])

    return which_segment_belongs_to_which_cognateset
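
# Usage sketch (the metadata path is an assumption for illustration): map each
# segment of each form to the cognatesets it has been judged to belong to,
# considering all cognatesets.
#
# import pycldf
#
# dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
# segment_cognatesets = segment_to_cognateset(dataset, cognatesets=None)
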
def add_single_languages(
    metadata: Path,
    sheets: t.Iterable[openpyxl.worksheet.worksheet.Worksheet],
    match_form: t.Optional[t.List[str]],
    concept_name: t.Optional[str],
    ignore_missing: bool,
    ignore_superfluous: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger,
) -> t.Mapping[str, ImportLanguageReport]:
    if status_update == "None":
        status_update = None
    # Initiate the dataset from metadata or data, depending on the command
    # line arguments.
    if metadata:
        if metadata.name == "forms.csv":
            dataset = pycldf.Dataset.from_data(metadata)
        else:
            dataset = pycldf.Dataset.from_metadata(metadata)

    concepts: t.Mapping[str, str]
    try:
        cid = dataset["ParameterTable", "id"].name
        if concept_name is None:
            concepts = {c[cid]: c[cid] for c in dataset["ParameterTable"]}
            concept_column = dataset["FormTable", "parameterReference"].name
        else:
            name = dataset["ParameterTable", "name"].name
            concepts = {c[name]: c[cid] for c in dataset["ParameterTable"]}
            concept_column = concept_name
    except (KeyError, FileNotFoundError) as err:
        if isinstance(err, KeyError):
            logger.warning(
                "Did not find a well-formed ParameterTable. Importing all "
                "forms independent of concept."
            )
        elif isinstance(err, FileNotFoundError):
            logger.warning(
                f"Did not find {dataset['ParameterTable'].url.string}. "
                "Importing all forms independent of concept."
            )
        concepts = KeyKeyDict()
        if concept_name:
            concept_column = concept_name
        else:
            concept_column = dataset["FormTable", "parameterReference"].name
    # Add a Status_Column if it does not exist and a status update was given.
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="FormTable")

    report: t.Dict[str, ImportLanguageReport] = defaultdict(ImportLanguageReport)
    # Import all selected sheets.
    for sheet in sheets:
        for lang, subreport in read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            logger=logger,
            match_form=match_form,
            entries_to_concepts=concepts,
            concept_column=concept_column,
            ignore_missing=ignore_missing,
            ignore_superfluous=ignore_superfluous,
            status_update=status_update,
        ).items():
            report[lang] += subreport
    return report
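
# Usage sketch (paths are assumptions for illustration): import every sheet of
# a workbook, one language per sheet.
#
# import openpyxl
# from pathlib import Path
#
# workbook = openpyxl.load_workbook("new_languages.xlsx")
# report = add_single_languages(
#     metadata=Path("Wordlist-metadata.json"),
#     sheets=[workbook[sheet] for sheet in workbook.sheetnames],
#     match_form=None,
#     concept_name=None,
#     ignore_missing=True,
#     ignore_superfluous=True,
#     status_update=None,
#     logger=cli.logger,
# )
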
def parse_form(
    self,
    form_string: str,
    language_id: str,
    cell_identifier: str = "",
    logger: cli.logging.Logger = cli.logger,
) -> t.Optional[Form]:
    """Create a dictionary of columns from a form description.

    Extract each value (transcriptions, comments, sources etc.) from a
    string describing a single form.
    """
    # Fields that are not required
    c_comment = self.c.get("comment")
    c_variants = self.c.get("variants", c_comment)

    # If the string contains only whitespace, there is no form.
    if not form_string.strip():
        return None

    properties: t.Dict[str, t.Any] = {
        self.c["lang"]: language_id,
        self.c["value"]: form_string,
    }

    # Semantics: 'None' for no variant expected, any string for the
    # decorator that introduces variant forms. Currently we expect '~' and
    # '%', see below.
    expect_variant: t.Optional[str] = None
    # Iterate over the delimiter-separated elements of the form.
    for element in components_in_brackets(form_string, self.bracket_pairs):
        element = element.strip()

        if not element:
            continue

        # If the element has mismatched brackets (tends to happen only for
        # the last element, because a mismatched opening bracket means we
        # are still waiting for the closing one), warn.
        if not check_brackets(element, self.bracket_pairs):
            try:
                delimiter = self.bracket_pairs[element[0]]
            except KeyError:
                delimiter = element[0]
            raise ValueError(
                f"{cell_identifier}In form {form_string}: Element {element} "
                f"had mismatching delimiters {delimiter}. This could be a "
                "bigger problem in the cell, so the form was not imported."
            )

        # Check what kind of element we have.
        for start, (term, transcription) in self.element_semantics.items():
            field = self.c[term]
            if element.startswith(start):
                break
        else:
            # TODO: Here, another `if` catching '-' might be necessary.
            # The only thing we expect outside delimiters is the variant
            # separators, '~' and '%'.
            if self.variant_separator and element in self.variant_separator:
                expect_variant = element
            else:
                logger.warning(
                    f"{cell_identifier}In form {form_string}: Element "
                    f"{element} could not be parsed, ignored"
                )
            continue

        # If we encounter a field for the first time, we add it to the
        # dictionary. If repeatedly, to the variants, with a decorator that
        # shows how expected the variant was.
        # This drops sources and comments in variants, if more than one
        # source or comment is provided; clean this up in
        # self.postprocess_form.
        if field in properties:
            if (
                not expect_variant
                and field != c_comment
                and field != self.c["source"]
            ):
                logger.warning(
                    f"{cell_identifier}In form {form_string}: Element "
                    f"{element} was an unexpected variant for {field}"
                )
            properties.setdefault(c_variants, []).append(
                (expect_variant or "") + element
            )
        else:
            if expect_variant:
                logger.warning(
                    f"{cell_identifier}In form {form_string}: Element "
                    f"{element} was supposed to be a variant, but there is "
                    f"no earlier {field}"
                )
            properties[field] = element

        expect_variant = None

    self.postprocess_form(properties, language_id)
    return Form(properties)
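
# Illustration (as a comment, since this is a method on a cell parser
# instance): with the default element semantics from __init__ below, a cell
# entry like
#
#     <katu> ~ <kato> (unsure) {2}
#
# is parsed into a form "katu" with the variant "~kato", the comment
# "unsure", and the source reference "{2}", keyed by the dataset's column
# names. Source braces are resolved later, in postprocessing.
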
def log_or_raise(message, log: cli.logging.Logger = cli.logger):
    log.warning(message)
def create_singletons(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    status: t.Optional[str] = None,
    by_segment: bool = False,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[t.Sequence[types.CogSet], t.Sequence[types.Judgement]]:
    """Create singleton cognate judgements for forms that don't have cognate judgements.

    Depending on by_segment, singletons are created for every range of
    segments that is not in any cognate set yet (True) or just for every form
    where no segment is in any cognate set (False).
    """
    forms = util.cache_table(dataset)
    c_j_id = dataset["CognateTable", "id"].name
    c_j_cogset = dataset["CognateTable", "cognatesetReference"].name
    c_j_form = dataset["CognateTable", "formReference"].name
    try:
        c_j_segmentslice = dataset["CognateTable", "segmentSlice"].name
    except KeyError:
        c_j_segmentslice = None
    try:
        c_j_alignment = dataset["CognateTable", "alignment"].name
    except KeyError:
        c_j_alignment = None

    if not dataset.get(("CognatesetTable", "Status_Column")):
        logger.warning(
            "No Status_Column in CognatesetTable. I will proceed without. Run "
            "`lexedata.edit.add_status_column` in default mode or with "
            "table-names CognatesetTable to add a Status_Column."
        )

    try:
        c_s_id = dataset["CognatesetTable", "id"].name
        all_cognatesets = {s[c_s_id]: s for s in dataset["CognatesetTable"]}
    except KeyError:
        c_s_id = "id"
        c_s_name = "name"
        all_cognatesets = {
            id: types.CogSet({"id": id, "name": id})
            for id in {j[c_j_cogset] for j in dataset["CognateTable"]}
        }
    try:
        c_s_name = dataset["CognatesetTable", "name"].name
    except KeyError:
        c_s_name = c_s_id

    all_judgements = list(dataset["CognateTable"])
    if by_segment:
        judgements = segment_to_cognateset(dataset, types.WorldSet(), logger)
        forms_and_segments = uncoded_segments(judgements, logger)
    else:
        forms_and_segments = uncoded_forms(
            forms.values(), {j[c_j_form] for j in all_judgements}
        )
    for form, slice in forms_and_segments:
        i = 1
        singleton_id = f"X_{form}_{i:d}"
        while singleton_id in all_cognatesets:
            i += 1
            singleton_id = f"X_{form}_{i:d}"
        all_cognatesets[singleton_id] = types.CogSet({})
        properties = {
            c_s_name: util.ensure_list(forms[form]["parameterReference"])[0],
            c_s_id: singleton_id,
            "Status_Column": status,
        }
        try:
            for column in dataset["CognatesetTable"].tableSchema.columns:
                all_cognatesets[singleton_id][column.name] = properties.get(
                    column.name
                )
        except KeyError:
            pass
        judgement = types.Judgement({})
        properties = {
            c_j_id: singleton_id,
            c_j_cogset: singleton_id,
            c_j_form: form,
            c_j_segmentslice: indices_to_segment_slice(slice),
            c_j_alignment: [forms[form]["segments"][i] for i in slice],
            "Status_Column": status,
        }
        for column in dataset["CognateTable"].tableSchema.columns:
            judgement[column.name] = properties.get(column.name)
        all_judgements.append(judgement)
    return all_cognatesets.values(), all_judgements
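
# Usage sketch (the metadata path and status tag are assumptions for
# illustration): create singleton cognatesets for all uncoded forms and write
# both tables back to the dataset.
#
# import pycldf
#
# dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
# cognatesets, judgements = create_singletons(dataset, status="automatic singleton")
# dataset.write(CognatesetTable=list(cognatesets), CognateTable=judgements)
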
def __init__(
    self,
    dataset: pycldf.Dataset,
    element_semantics: t.Iterable[t.Tuple[str, str, str, bool]] = [
        # ("[", "]", "phonetic", True),
        ("<", ">", "form", True),
        # ("/", "/", "phonemic", True),
        ("(", ")", "comment", False),
        ("{", "}", "source", False),
    ],
    separation_pattern: str = r"([;,])",
    variant_separator: t.Optional[t.List[str]] = ["~", "%"],
    add_default_source: t.Optional[str] = "{1}",
    logger: cli.logging.Logger = cli.logger,
):
    super().__init__(dataset)

    # Columns implied by the element semantics
    self.bracket_pairs = {start: end for start, end, _, _ in element_semantics}
    self.element_semantics = {
        start: (term, transcription)
        for start, _, term, transcription in element_semantics
    }
    for start, end, term, transcription in element_semantics:
        # Ensure that all terms required by the element semantics are fields
        # we can write to.
        self.cc(short=term, long=("FormTable", term), dataset=dataset)
    assert self.transcriptions, (
        "Your metadata json file and your cell parser don’t match: Your cell "
        f"parser {self.__class__.__name__} expects to work with "
        "transcriptions (at least one of 'orthographic', 'phonemic', and "
        "'phonetic') to derive a #form in #FormTable, but your metadata "
        "defines no such column."
    )

    # Columns necessary for the word list
    self.cc(short="source", long=("FormTable", "source"), dataset=dataset)
    self.cc(short="comment", long=("FormTable", "comment"), dataset=dataset)

    try:
        self.comment_separator = dataset["FormTable", "comment"].separator or "\t"
    except KeyError:
        logger.info("No #comment column found.")
        self.comment_separator = ""

    try:
        # As long as there is no CLDF term #variants, this will either be
        # 'variants' or raise a KeyError. However, it is a transparent
        # re-use of an otherwise established idiom in this module, so we
        # use this minor overhead.
        self.c["variants"] = dataset["FormTable", "variants"].name
    except KeyError:
        logger.warning(
            "No 'variants' column found for FormTable in "
            "Wordlist-metadata.json. Form variants will be added to #comment."
        )

    # Other class attributes
    self.separation_pattern = separation_pattern
    self.variant_separator = variant_separator
    self.add_default_source = add_default_source
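
# Illustration (as a comment, since the surrounding class is not part of this
# excerpt): each element_semantics tuple is (opening delimiter, closing
# delimiter, CLDF term, is-it-a-transcription?). A dataset that puts phonetic
# transcriptions in square brackets instead of angle brackets could pass
#
#     element_semantics=[
#         ("[", "]", "phonetic", True),
#         ("(", ")", "comment", False),
#         ("{", "}", "source", False),
#     ]
#
# to the constructor, provided the metadata defines a matching #phonetic
# column.
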
def forms_to_tsv(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    languages: t.Iterable[str],
    concepts: t.Set[str],
    cognatesets: t.Iterable[str],
    logger: cli.logging.Logger = cli.logger,
):
    try:
        dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            "Edictor export requires your dataset to have segments in the "
            "FormTable. Run `lexedata.edit.add_segments` to automatically add "
            "segments based on your forms."
        )

    delimiters = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset["FormTable"].tableSchema.columns
        if c.separator
    }

    # Prepare the header for the TSV output. The first column must be named
    # ID and contain 1-based integer IDs.
    tsv_header = list(dataset["FormTable"].tableSchema.columndict.keys())
    tsv_header.insert(0, "LINGPY_ID")
    tsv_header.append("cognatesetReference")
    if "alignment" not in tsv_header:
        tsv_header.append("alignment")
    if "parameterReference" in delimiters:
        tsv_header.append("_parameterReference")

    # Select forms and cognates, given the restrictions to languages,
    # concepts, and cognatesets.
    forms = {}
    for f, form in util.cache_table(dataset).items():
        if form["form"] is None or form["form"] == "-":
            continue
        if form["languageReference"] in languages and concepts.intersection(
            ensure_list(form["parameterReference"])
        ):
            # Normalize the form:
            # 1. No list-valued entries
            for c, d in delimiters.items():
                if c == "segments":
                    continue
                if c == "parameterReference":
                    form["_parameterReference"] = d.join(str(e) for e in form[c])
                    form["parameterReference"] = form["parameterReference"][0]
                    continue
                form[c] = d.join(str(e) for e in form[c])
            if not form.get("segments"):
                logger.warning(
                    "No segments found for form %s. You can generate segments "
                    "using `lexedata.edit.add_segments`.",
                    form["id"],
                )
            # 2. No tabs, newlines in entries
            for c, v in form.items():
                if isinstance(v, str):
                    if "\\!t" in form[c] or "\\!n" in form[c]:
                        logger.warning(
                            "Your data contains the special characters '\\!t' "
                            "or '\\!n', which I will introduce for escaping "
                            "tabs and newlines for edictor. These characters "
                            "will not survive the back-import."
                        )
                    form[c] = form[c].replace("\t", "\\!t").replace("\n", "\\!n")
            forms[f] = form

    cognateset_cache: t.Mapping[t.Optional[str], int]
    if "CognatesetTable" in dataset:
        id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[id]: c
            for c, cognateset in enumerate(dataset["CognatesetTable"], 1)
            if cognateset[id] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = t.DefaultDict(itertools.count().__next__)
        else:
            cognateset_cache = {c: i for i, c in enumerate(cognatesets, 1)}

    # Warn about unexpected non-concatenative ‘morphemes’
    lexedata.report.nonconcatenative_morphemes.segment_to_cognateset(
        dataset, cognatesets, logger
    )

    judgements_about_form: t.Mapping[
        types.Form_ID, t.Tuple[t.List[str], t.List[int]]
    ] = {
        id: ([f"({s})" for s in form["segments"]], [])
        for id, form in forms.items()
    }
    # Compose all judgements, last-one-rules mode.
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in forms and cognateset_cache.get(
            j["cognatesetReference"]
        ):
            if j.get("alignment"):
                j["alignment"] = [s or "" for s in j["alignment"]]
            else:
                j["alignment"] = forms[j["formReference"]]["segments"]

            try:
                segments_judged = list(
                    parse_segment_slices(
                        segment_slices=j["segmentSlice"], enforce_ordered=False
                    )
                )
            except TypeError:
                logger.warning(
                    "In judgement %s: No segment slice given. Assuming whole form.",
                    j["id"],
                )
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"]))
                )
            except KeyError:
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"]))
                )
            except ValueError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue
            global_alignment, cogsets = judgements_about_form[j["formReference"]]
            segment_start, segment_end = (
                min(segments_judged),
                max(segments_judged) + 1,
            )
            try:
                glue_in_alignment(
                    global_alignment,
                    cogsets,
                    j["alignment"],
                    j["cognatesetReference"],
                    slice(segment_start, segment_end),
                )
            except IndexError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue

    return forms, judgements_about_form, cognateset_cache
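
# Usage sketch (the metadata path and language selection are assumptions for
# illustration): restrict the export to two languages while keeping every
# concept and cognateset the dataset knows about.
#
# import pycldf
#
# dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
# c_id = dataset["ParameterTable", "id"].name
# cs_id = dataset["CognatesetTable", "id"].name
# forms, judgements_about_form, cognateset_numbers = forms_to_tsv(
#     dataset,
#     languages=["ache", "paraguayan_guarani"],
#     concepts={row[c_id] for row in dataset["ParameterTable"]},
#     cognatesets=[row[cs_id] for row in dataset["CognatesetTable"]],
# )
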