def test_cell_comments_export():
    """Matrix export should attach each form's comment as an Excel cell comment."""
    dataset, _ = copy_to_temp(
        Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json")
    _, out_filename = tempfile.mkstemp(".xlsx", "cognates")
    E = MatrixExcelWriter(dataset, database_url="https://example.org/lexicon/{:}")
    forms = util.cache_table(dataset)
    languages = sorted(
        util.cache_table(dataset, "LanguageTable").values(),
        key=lambda language: language["name"],
    )
    judgements = [
        {"formReference": form["id"], "cognatesetReference": concept}
        for form in forms.values()
        for concept in util.ensure_list(form["parameterReference"])
    ]
    parameters = util.cache_table(dataset, "ParameterTable").values()
    E.create_excel(
        rows=parameters, judgements=judgements, forms=forms, languages=languages)
    # Inspect the bottom-most cell of the right-most column.
    last_cell = list(E.ws.iter_cols())[-1][-1]
    assert (
        last_cell.comment and last_cell.comment.content
    ), "Last row of last column should contain a form, with a comment attached to it."
    assert (last_cell.comment.content == "A Comment!"
            ), "Comment should match the comment from the form table"
def test_roundtrip(cldf_wordlist, working_and_nonworking_bibfile):
    """Cognate judgements must survive an export-then-import roundtrip."""
    dataset, target = working_and_nonworking_bibfile(cldf_wordlist)
    c_form = dataset["CognateTable", "formReference"].name
    c_cogset = dataset["CognateTable", "cognatesetReference"].name
    old_judgements = {
        (row[c_form], row[c_cogset])
        for row in dataset["CognateTable"].iterdicts()
    }
    writer = ExcelWriter(dataset, database_url="https://example.org/lexicon/{:}")
    forms = util.cache_table(dataset)
    languages = util.cache_table(dataset, "LanguageTable").values()
    judgements = util.cache_table(dataset, "CognateTable").values()
    cogsets = util.cache_table(dataset, "CognatesetTable").values()
    writer.create_excel(
        rows=cogsets, judgements=judgements, forms=forms, languages=languages)
    # Reset the existing cognatesets and cognate judgements, to avoid
    # interference with the data in the Excel file.
    dataset["CognateTable"].write([])
    dataset["CognatesetTable"].write([])
    import_cognates_from_excel(writer.ws, dataset)
    new_judgements = {
        (row[c_form], row[c_cogset])
        for row in dataset["CognateTable"].iterdicts()
    }
    assert new_judgements == old_judgements
def test_toexcel_filtered(cldf_wordlist, working_and_nonworking_bibfile, caplog):
    """Exporting with a single-concept filter yields an (almost) empty sheet.

    Bug fix: the original created two writers, called ``create_excel`` on the
    second one (``E``) but asserted on the first writer's worksheet, which was
    never written to — the assertion passed vacuously. We now use a single
    writer throughout, so the assertion actually checks the filtered export.
    """
    dataset, url = working_and_nonworking_bibfile(cldf_wordlist)
    writer = MatrixExcelWriter(
        dataset=dataset,
        database_url=str(url),
    )
    forms = util.cache_table(dataset)
    languages = sorted(
        util.cache_table(dataset, "LanguageTable").values(),
        key=lambda x: x["name"])
    judgements = [{
        "formReference": f["id"],
        "cognatesetReference": parameter
    } for f in forms.values()
        for parameter in util.ensure_list(f["parameterReference"])]
    # Keep only the concept named "Woman".
    parameters = [
        c for n, c in util.cache_table(dataset, "ParameterTable").items()
        if n == "Woman"
    ]
    with caplog.at_level(logging.WARNING):
        writer.create_excel(
            rows=parameters, judgements=judgements, forms=forms,
            languages=languages)
    # Either the concept is absent (0 rows) or there is a header row plus one
    # concept row (2 rows).
    assert len(list(writer.ws.iter_rows())) in {0, 2}
def test_adding_singleton_cognatesets_with_status(caplog):
    """Singleton cognatesets must carry the requested Status_Column value."""
    dataset = get_dataset(
        Path(__file__).parent / "data/cldf/smallmawetiguarani/cldf-metadata.json")
    dataset.add_columns("CognatesetTable", "Status_Column")
    with caplog.at_level(logging.WARNING):
        excel_writer = ExcelWriter(dataset=dataset)
        cogsets, judgements = create_singletons(
            dataset,
            status="NEW",
            by_segment=True,
        )
        properties_as_key(cogsets, dataset["CognatesetTable"].tableSchema.columns)
        properties_as_key(judgements, dataset["CognateTable"].tableSchema.columns)
        forms = util.cache_table(dataset)
        languages = util.cache_table(dataset, "LanguageTable").values()
        excel_writer.create_excel(
            rows=cogsets, judgements=judgements, forms=forms, languages=languages)
    assert re.search("no Status_Column to write", caplog.text) is None
    # Locate the Status_Column header. When accessing a row as a tuple the
    # index is 0-based, unlike Excel's 1-based columns.
    status_index = 0
    for header_row in excel_writer.ws.iter_rows(min_row=1, max_row=1):
        for cell in header_row:
            if cell.value == "Status_Column":
                status_index = cell.column - 1
    status = [
        row[status_index].value
        for row in excel_writer.ws.iter_rows(min_row=2)
    ]
    # Eleven pre-existing cognatesets without status, four new singletons.
    assert status == [None] * 11 + ["NEW"] * 4
def test_adding_singleton_cognatesets(caplog):
    """Singleton creation warns about the missing status column and exports ids."""
    dataset = get_dataset(
        Path(__file__).parent / "data/cldf/smallmawetiguarani/cldf-metadata.json")
    with caplog.at_level(logging.WARNING):
        excel_writer = ExcelWriter(dataset=dataset)
        cogsets, judgements = create_singletons(
            dataset,
            status="NEW",
            by_segment=False,
        )
        properties_as_key(cogsets, dataset["CognatesetTable"].tableSchema.columns)
        properties_as_key(judgements, dataset["CognateTable"].tableSchema.columns)
        forms = util.cache_table(dataset)
        languages = util.cache_table(dataset, "LanguageTable").values()
        excel_writer.create_excel(
            rows=cogsets, judgements=judgements, forms=forms, languages=languages)
    assert re.search("No Status_Column", caplog.text)
    # Load central concepts from output. When accessing a row as a tuple the
    # index is 0-based, unlike Excel's 1-based columns.
    cogset_index = 0
    for header_row in excel_writer.ws.iter_rows(min_row=1, max_row=1):
        for cell in header_row:
            if cell.value == "CogSet":
                cogset_index = cell.column - 1
    cogset_ids = [
        row[cogset_index].value
        for row in excel_writer.ws.iter_rows(min_row=2)
    ]
    assert cogset_ids == [
        "one1",
        "one1",
        "one2",
        "one6",
        "two1",
        "three1",
        "two8",
        "three9",
        "four1",
        "four8",
        "five5",
        "X_old_paraguayan_guarani_two_1",
        "X_paraguayan_guarani_five_1",
    ]
def test_missing_required_column():
    """Removing the required FormTable ID column must raise a KeyError.

    Consistency fix: ``judgements`` and ``cogsets`` are now passed as
    ``.values()`` like in every other export test in this file; previously
    the id→row mappings themselves were passed. (The lines after the writer
    construction are expected to be unreachable, since instantiating
    ExcelWriter on the broken dataset already raises.)
    """
    dataset, _ = copy_to_temp(
        Path(__file__).parent / "data/cldf/smallmawetiguarani/cldf-metadata.json")
    dataset.remove_columns("FormTable", "ID")
    # TODO: switch to pycldf.dataset.SchemaError
    with pytest.raises(KeyError):
        excel_writer = ExcelWriter(dataset=dataset)
        forms = util.cache_table(dataset)
        languages = util.cache_table(dataset, "LanguageTable").values()
        judgements = util.cache_table(dataset, "CognateTable").values()
        cogsets = util.cache_table(dataset, "CognatesetTable").values()
        excel_writer.create_excel(
            rows=cogsets, judgements=judgements, forms=forms, languages=languages)
def test_toexcel_runs(cldf_wordlist, working_and_nonworking_bibfile):
    """Smoke test: the cognateset export saves an xlsx file without errors."""
    dataset, url = working_and_nonworking_bibfile(cldf_wordlist)
    writer = ExcelWriter(
        dataset=dataset,
        database_url=str(url),
    )
    forms = util.cache_table(dataset)
    languages = util.cache_table(dataset, "LanguageTable").values()
    judgements = util.cache_table(dataset, "CognateTable").values()
    cogsets = util.cache_table(dataset, "CognatesetTable").values()
    writer.create_excel(
        rows=cogsets, judgements=judgements, forms=forms, languages=languages)
    _, out_filename = tempfile.mkstemp(".xlsx", "cognates")
    writer.wb.save(filename=out_filename)
def test_no_comment_column():
    """Cell values must render correctly when the FormTable has no comment column."""
    dataset, _ = copy_to_temp(
        Path(__file__).parent / "data/cldf/smallmawetiguarani/cldf-metadata.json")
    dataset.remove_columns("FormTable", "comment")
    writer = ExcelWriter(dataset=dataset)
    # Only the first cached form needs checking.
    first_form = next(iter(util.cache_table(dataset).values()))
    assert writer.form_to_cell_value(
        first_form).strip() == "{ e t a k ɾ ã } ‘one, one’"
def tiny_dataset():
    """Build a one-form wordlist with five cognatesets for sorting tests.

    Cognateset ``s<i>`` gets ``i`` judgements, all pointing at the single
    form ``f1``. Returns (cognateset rows, judgement rows).
    """
    cognateset_rows = [
        {"ID": f"s{index}", "Source": source, "Description": description}
        for index, (source, description) in enumerate(
            [("3", "A"), ("3", "A"), ("3", "A"), ("1", "C"), ("1", "C")], 1)
    ]
    judgement_rows = [
        {"ID": f"{i}{n}", "Cognateset_ID": f"s{i}", "Form_ID": "f1"}
        for i in range(1, 6)
        for n in range(i)
    ]
    ds = util.fs.new_wordlist(
        FormTable=[{"ID": "f1"}],
        CognatesetTable=cognateset_rows,
        CognateTable=judgement_rows,
    )
    cognatesets = list(util.cache_table(ds, "CognatesetTable").values())
    judgements = util.cache_table(ds, "CognateTable").values()
    return cognatesets, judgements
def test_toexcel_runs(cldf_wordlist, working_and_nonworking_bibfile):
    """Smoke test: the matrix export saves an xlsx file without errors."""
    dataset, filename = working_and_nonworking_bibfile(cldf_wordlist)
    writer = MatrixExcelWriter(
        dataset=dataset,
        database_url=str(filename),
    )
    forms = util.cache_table(dataset)
    languages = sorted(
        util.cache_table(dataset, "LanguageTable").values(),
        key=lambda language: language["name"],
    )
    judgements = [
        {"formReference": form["id"], "cognatesetReference": concept}
        for form in forms.values()
        for concept in util.ensure_list(form["parameterReference"])
    ]
    parameters = util.cache_table(dataset, "ParameterTable").values()
    writer.create_excel(
        rows=parameters, judgements=judgements, forms=forms, languages=languages)
    _, out_filename = tempfile.mkstemp(".xlsx", "cognates")
    writer.wb.save(filename=out_filename)
def cogsets_and_judgements(
    dataset,
    status: t.Optional[str],
    by_segment=True,
    logger: cli.logging.Logger = cli.logger,
):
    """Return (cognatesets, judgements) for export.

    If ``status`` is None, the existing tables are returned unchanged;
    otherwise singleton cognatesets are created first and tagged with the
    given status.
    """
    if status is None:
        # No singleton creation requested: hand back the cached tables.
        return (
            util.cache_table(dataset, "CognatesetTable").values(),
            util.cache_table(dataset, "CognateTable").values(),
        )
    cogsets, judgements = create_singletons(
        dataset,
        status=status,
        by_segment=by_segment,
        logger=logger,
    )
    properties_as_key(cogsets, dataset["CognatesetTable"].tableSchema.columns)
    properties_as_key(judgements, dataset["CognateTable"].tableSchema.columns)
    return cogsets, judgements
def test_write_edictor_singleton_dataset():
    """A single-form dataset should export to a well-formed Edictor TSV."""
    forms = {
        "form1": {
            "ID": "form1",
            "Language_ID": "axav1032",
            "Parameter_ID": "one",
            "Form": "the form",
            "Segments": list("ðəfom"),
            "Source": [],
        }
    }
    dataset = lexedata.util.fs.new_wordlist(
        FormTable=forms.values(),
        CognateTable=[{
            "ID": "1-1",
            "Form_ID": "form1",
            "Cognateset_ID": "c1",
            "Segment_Slice": ["1:1"],
            "Alignment": ["ð"],
        }],
    )
    out = io.StringIO()
    out.name = "<memory>"
    judgements_about_form = {
        "form1": (["ð", "(ə)", "(f)", "(o)", "(m)"], ["c1"])
    }
    exporter.write_edictor_file(
        dataset,
        out,
        util.cache_table(dataset),
        judgements_about_form,
        {"c1": 2},
    )
    # First three lines: header, the single form row, and a blank trailer.
    header, body, trailer = (
        line.strip().split("\t") for line in out.getvalue().split("\n")[:3]
    )
    assert trailer == [""]
    assert dict(zip(header, body)) == {
        "ID": "1",
        "CONCEPT": "one",
        "DOCULECT": "axav1032",
        "IPA": "the form",
        "CLDF_id": "form1",
        "TOKENS": "ð ə f o m",
        "source": "",
        "comment": "",
        "COGID": "2",
        "ALIGNMENT": "ð ( ə f o m )",
    }
    assert "<memory>" in out.getvalue()
def test_roundtrip_separator_column(cldf_wordlist, working_and_nonworking_bibfile):
    """Test whether a CognatesetTable column with separator survives a roundtrip."""
    dataset, target = working_and_nonworking_bibfile(cldf_wordlist)
    dataset.add_columns("CognatesetTable", "CommaSeparatedTags")
    dataset["CognatesetTable", "CommaSeparatedTags"].separator = ","
    c_id = dataset["CognatesetTable", "id"].name
    rows = list(dataset["CognatesetTable"])
    tag_cycle = itertools.cycle([
        ["two", "tags"],
        ["single-tag"],
        [],
        ["tag;containing;other;separator"],
    ])
    expected_tags = []
    for row, tag in zip(rows, tag_cycle):
        expected_tags.append((row[c_id], tag))
        row["CommaSeparatedTags"] = tag
    dataset.write(CognatesetTable=rows)
    writer = ExcelWriter(dataset, database_url="https://example.org/lexicon/{:}")
    _, out_filename = tempfile.mkstemp(".xlsx", "cognates")
    forms = util.cache_table(dataset)
    languages = util.cache_table(dataset, "LanguageTable").values()
    judgements = util.cache_table(dataset, "CognateTable").values()
    cogsets = util.cache_table(dataset, "CognatesetTable").values()
    writer.create_excel(
        rows=cogsets, judgements=judgements, forms=forms, languages=languages)
    import_cognates_from_excel(writer.ws, dataset)
    reread_tags = [(c[c_id], c["CommaSeparatedTags"])
                   for c in dataset["CognatesetTable"]]
    # Cognateset ids are unique, so sorting by id gives a canonical order.
    reread_tags.sort(key=lambda pair: pair[0])
    expected_tags.sort(key=lambda pair: pair[0])
    assert reread_tags == expected_tags
def test_cell_comments_export():
    """Cognateset export should attach judgement comments as cell comments."""
    dataset, _ = copy_to_temp(
        Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json")
    _, out_filename = tempfile.mkstemp(".xlsx", "cognates")
    writer = ExcelWriter(dataset, database_url="https://example.org/lexicon/{:}")
    forms = util.cache_table(dataset)
    languages = sorted(
        util.cache_table(dataset, "LanguageTable").values(),
        key=lambda language: language["name"],
    )
    judgements = util.cache_table(dataset, "CognateTable").values()
    cogsets = util.cache_table(dataset, "CognatesetTable").values()
    writer.create_excel(
        rows=cogsets, judgements=judgements, forms=forms, languages=languages)
    # Inspect the bottom-most cell of the right-most column.
    last_cell = list(writer.ws.iter_cols())[-1][-1]
    assert (
        last_cell.comment and last_cell.comment.content
    ), "Last row of last column should contain a judgement, with a comment attached to it."
    assert (last_cell.comment.content == "A judgement comment"
            ), "Comment should match the comment from the cognate table"
def aligne_cognate_table(dataset: pycldf.Dataset,
                         status_update: t.Optional[str] = None):
    """Align all cognate judgements of the dataset, cognateset by cognateset.

    For every cognate set, collect the judged segment ranges (morphemes) of
    its member forms, align them with ``align()``, and write the resulting
    alignments back to the CognateTable.

    Parameters
    ==========
    dataset: the CLDF wordlist whose CognateTable is rewritten in place.
    status_update: if given, this value is written to each judgement's
        Status_Column (the column is added to the CognateTable first).
    """
    # add Status_Column if not existing – TODO: make configurable
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="CognateTable")
    forms = util.cache_table(dataset, "FormTable")
    # Resolve CLDF properties to the dataset's actual column names.
    c_id = dataset["CognateTable", "id"].name
    c_form_id = dataset["CognateTable", "formReference"].name
    c_cognateset_id = dataset["CognateTable", "cognatesetReference"].name
    c_slice = dataset["CognateTable", "segmentSlice"].name
    c_alignment = dataset["CognateTable", "alignment"].name
    # cognateset id → list of ((language reference, morpheme segments), judgement id),
    # i.e. exactly the shape appended below and consumed by align().
    cognatesets: t.Dict[str, t.List[t.Tuple[t.Tuple[str, t.List[str]], str]]] = {}
    # judgement id → full judgement row, for writing the table back out.
    judgements: t.Dict[str, t.Dict[str, t.Any]] = {}
    for judgement in cli.tq(
            dataset["CognateTable"],
            task="Aligning the cognate segments",
            total=dataset["CognateTable"].common_props.get("dc:extent"),
    ):
        judgements[judgement[c_id]] = judgement
        form = forms[judgement[c_form_id]]
        morpheme = []
        if not judgement[c_slice]:
            # No segment slice given: the whole form is the morpheme.
            morpheme = form["segments"]
        else:
            morpheme = [
                form["segments"][i]
                for i in util.parse_segment_slices(judgement[c_slice])
            ]
        cognatesets.setdefault(judgement[c_cognateset_id], []).append(
            ((form["languageReference"], morpheme), judgement[c_id]))
    # Align each cognate set and store the alignment on the corresponding
    # judgement row.
    for cognateset, morphemes in cognatesets.items():
        for alignment, id in align(morphemes):
            judgements[id][c_alignment] = alignment
            if status_update:
                judgements[id]["Status_Column"] = status_update
    # Overwrite the CognateTable with the updated judgements.
    dataset.write(CognateTable=judgements.values())
ttype = ds.get_tabletype(table) c_id = table.get_column("http://cldf.clld.org/v1.0/terms.rdf#id") if c_id.datatype.base == "string": # Temporarily open up the datatype format, otherwise we may be unable to read c_id.datatype.format = None elif c_id.datatype.base == "integer": # Temporarily open up the datatype format, otherwise we may be unable to read c_id.datatype = "string" update_integer_ids(ds, table) c_id.datatype = "integer" continue else: logger.warning( f"Table {table.uri} had an id column ({c_id.name}) that is neither integer nor string. I did not touch it." ) continue if args.transparent and ttype in ID_COMPONENTS: cols = { prop: ds[ttype, prop].name for prop in ID_COMPONENTS[ttype] } mapping = clean_mapping(cache_table(ds, ttype, cols)) else: ids = {row[c_id.name] for row in ds[table]} mapping = clean_mapping(cache_table(ds, table.url.string, {})) update_ids(ds, table, mapping) ds.write_metadata()
help="Path to output file (default: output to stdout)", type=Path, ) args = parser.parse_args() logger = cli.setup_logging(args) dataset = pycldf.Dataset.from_metadata(args.metadata) which_segment_belongs_to_which_cognateset = segment_to_cognateset( dataset=dataset, cognatesets=args.cognatesets, logger=logger, ) overlapping_cognatesets = network_of_overlaps( which_segment_belongs_to_which_cognateset, forms_cache=util.cache_table(dataset)) graph = networkx.Graph() graph.add_edges_from(overlapping_cognatesets) if graph.nodes(): out = args.output_file.open("w") if args.output_file else sys.stdout # Sort to keep order persistent for community in sorted( networkx.algorithms.community.greedy_modularity_communities( graph), key=lambda x: sorted(x), ): print("Cluster of overlapping cognate sets:", file=out) for cognateset in sorted(community): print(f"\t {cognateset}", file=out) # TODO: Generate form segments, if considered informative
def segment_to_cognateset(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    cognatesets: t.Container[types.Cognateset_ID],
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Form_ID, t.List[t.Set[types.Cognateset_ID]]]:
    """Map each form to, per segment, the set of cognatesets it belongs to.

    Forms whose transcription is empty or "-" are excluded. Only judgements
    whose cognateset is in ``cognatesets`` (or all, if ``cognatesets`` is
    None) are considered. Warnings are logged for bad segment slices and for
    non-contiguous morphemes.

    Bug fix: ``old_s`` was initialized but never advanced, so the
    non-contiguity warning could never fire; it is now updated after each
    judged segment.
    """
    # required fields
    c_cognate_cognateset = dataset.column_names.cognates.cognatesetReference
    c_cognate_id = dataset.column_names.cognates.id
    c_cognate_form = dataset.column_names.cognates.formReference
    c_cognate_slice = dataset.column_names.cognates.segmentSlice
    forms = util.cache_table(dataset)
    cognateset_cache: t.Container[types.Cognateset_ID]
    if "CognatesetTable" in dataset:
        c_s_id = dataset["CognatesetTable", "id"].name
        # NOTE(review): the filter uses the literal "ID" column while the set
        # is keyed on c_s_id — these usually coincide in CLDF, but confirm.
        cognateset_cache = {
            cognateset[c_s_id]
            for cognateset in dataset["CognatesetTable"]
            if cognatesets is None or cognateset["ID"] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = types.WorldSet()
        else:
            cognateset_cache = cognatesets
    # One (initially empty) set per segment of each non-empty form.
    which_segment_belongs_to_which_cognateset: t.Mapping[
        types.Form_ID, t.List[t.Set[types.Cognateset_ID]]] = {
            f: [set() for _ in form["segments"]]
            for f, form in forms.items()
            if form["form"] and form["form"].strip()
            and form["form"].strip() != "-"
        }
    for j in dataset["CognateTable"]:
        if j[c_cognate_form] in forms and j[
                c_cognate_cognateset] in cognateset_cache:
            form = forms[j[c_cognate_form]]
            if j[c_cognate_form] not in which_segment_belongs_to_which_cognateset:
                continue
            if j.get(c_cognate_slice):
                try:
                    segments_judged = list(
                        parse_segment_slices(j[c_cognate_slice]))
                except ValueError:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment slice {','.join(j[c_cognate_slice])} has start after end."
                    )
                    continue
            else:
                # No slice given: the judgement covers the whole form.
                segments_judged = list(range(len(form["segments"])))
            old_s = None
            for s in segments_judged:
                if old_s is not None and old_s + 1 != s:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment {s+1} follows segment {old_s}, so the morpheme is non-contiguous"
                    )
                try:
                    segment_sets = which_segment_belongs_to_which_cognateset[
                        j[c_cognate_form]][s]
                except IndexError:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment slice {','.join(j[c_cognate_slice])} points outside valid range 1:{len(form['segments'])}."
                    )
                    continue
                segment_sets.add(j[c_cognate_cognateset])
                # Remember the previous segment so contiguity can be checked.
                old_s = s
    return which_segment_belongs_to_which_cognateset
def read_wordlist(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    code_column: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> t.MutableMapping[types.Language_ID, t.MutableMapping[types.Parameter_ID,
                                                          t.Set]]:
    """Read a wordlist into language → concept → set of cognateset ids.

    Cognate codes are taken, in order of preference, from the explicit
    ``code_column`` of the FormTable, from a cognatesetReference column in
    the FormTable, or from a separate CognateTable.

    Raises
    ======
    ValueError: if no source of cognateset references can be found and
        ``code_column`` was not given.
    """
    col_map = dataset.column_names
    if code_column:
        # Just in case that column was specified by property URL. We
        # definitely want the name. In any case, this will also throw a
        # helpful KeyError when the column does not exist.
        form_table_form = col_map.forms.form
        form_table_column = col_map.forms.id
        cognatesets = util.cache_table(
            dataset,
            columns={
                "form": form_table_column,
                "transcription": form_table_form,
                "code": dataset["FormTable", code_column].name,
            },
            # Skip rows with an empty transcription.
            filter=lambda row: bool(row[col_map.forms.form]),
        )
    else:
        # We search for cognatesetReferences in the FormTable or a separate
        # CognateTable.
        # Try the FormTable first.
        code_column = col_map.forms.cognatesetReference
        if code_column:
            # This is not the CLDF way, warn the user.
            form_table_column = col_map.forms.id
            form_table_form = col_map.forms.form
            logger.warning(
                "Your dataset has a cognatesetReference in the FormTable. Consider running lexedata.edit.add_cognate_table to create an explicit cognate table."
            )
            cognatesets = util.cache_table(
                dataset,
                columns={
                    "form": form_table_column,
                    "transcription": form_table_form,
                    "code": code_column,
                },
            )
        else:
            # There was no cognatesetReference in the form table. If we
            # find them in CognateTable (I mean, they should be there!), we
            # store them keyed with formReference.
            if (col_map.cognates and col_map.cognates.cognatesetReference
                    and col_map.cognates.formReference):
                code_column = col_map.cognates.cognatesetReference
                form_reference = col_map.cognates.formReference
                # Follow the foreign key to learn which FormTable column the
                # formReference points at.
                (foreign_key, ) = [
                    key
                    for key in dataset["CognateTable"].tableSchema.foreignKeys
                    if key.columnReference == [form_reference]
                ]
                (form_table_column, ) = foreign_key.reference.columnReference
                cognatesets = util.cache_table(
                    dataset,
                    "CognateTable",
                    {
                        "form": form_reference,
                        "code": code_column
                    },
                )
            else:
                raise ValueError(
                    "Dataset has no cognatesetReference column in its "
                    "primary table or in a separate cognate table. "
                    "Is this a metadata-free wordlist and you forgot to "
                    "specify code_column explicitly?")
    # Cognate sets have been loaded. Consolidate.
    cognates_by_form: t.MutableMapping[
        types.Form_ID, t.Set[types.Cognateset_ID]] = t.DefaultDict(set)
    for judgement in cognatesets.values():
        cognates_by_form[judgement["form"]].add(judgement["code"])
    parameter_column = col_map.forms.parameterReference
    # If one form can have multiple concepts, iterate them all; otherwise
    # wrap the single concept in a list.
    if dataset["FormTable", parameter_column].separator:

        def all_parameters(parameter):
            return list(parameter)

    else:

        def all_parameters(parameter):
            return [parameter]

    data: t.MutableMapping[types.Language_ID,
                           t.MutableMapping[types.Parameter_ID, t.Set]]
    if "LanguageTable" in dataset:
        # Pre-seed with all known languages, so languages without forms
        # still appear in the result.
        (langref_target, ) = [
            key for key in dataset["FormTable"].tableSchema.foreignKeys
            if key.columnReference ==
            [dataset["FormTable", "languageReference"].name]
        ]
        ref_col = langref_target.reference.columnReference[0]
        data = {
            lang[ref_col]: t.DefaultDict(set)
            for lang in dataset["LanguageTable"]
        }
    else:
        data = t.DefaultDict(lambda: t.DefaultDict(set))
    for row in dataset["FormTable"].iterdicts():
        if not row[col_map.forms.form]:
            # Transcription is empty, should not be a form. Skip, but maybe
            # warn if it was in a cognateset.
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as empty (i.e. the source noted that the form is unknown), but it was judged to be in cognateset %s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    cognates_by_form[row[form_table_column]],
                )
            continue
        language = row[col_map.forms.languageReference]
        if row[col_map.forms.form] == "-":
            # "-" marks a concept as unavailable in the language; such a
            # form must not contribute cognate codes.
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as '-' (i.e. “concept is not available in language %s”), but it was judged to be in cognateset %s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    language,
                    cognates_by_form[row[form_table_column]],
                )
                cognates_by_form[row[form_table_column]] = set()
            for parameter in all_parameters(row[parameter_column]):
                if data[language][parameter]:
                    logger.warning(
                        "Form %s claims concept %s is not available in language %s, but cognatesets %s are allocated to that concept in that language already.",
                        row[col_map.forms.id],
                        parameter,
                        row[col_map.forms.languageReference],
                        data[language][parameter],
                    )
        for parameter in all_parameters(row[parameter_column]):
            data[language][parameter] |= cognates_by_form[
                row[form_table_column]]
    return data
def forms_to_tsv(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    languages: t.Iterable[str],
    concepts: t.Set[str],
    cognatesets: t.Iterable[str],
    logger: cli.logging.Logger = cli.logger,
):
    """Prepare forms and judgements for Edictor TSV export.

    Filters forms to the given languages and concepts, normalizes list-valued
    columns to delimiter-joined strings, escapes tabs/newlines, numbers the
    requested cognatesets, and collects per-form alignments.

    Returns (forms, judgements_about_form, cognateset_cache).

    Fixes: use ``isinstance(v, str)`` instead of ``type(v) == str``, and
    avoid shadowing the builtin ``id`` with a local variable.
    """
    try:
        dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            """Edictor export requires your dataset to have segments in the FormTable. Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
        )
    # Column (property name) → separator, for all list-valued columns.
    delimiters = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset["FormTable"].tableSchema.columns
        if c.separator
    }
    # prepare the header for the tsv output
    # the first column must be named ID and contain 1-based integer IDs
    # set header for tsv
    tsv_header = list(dataset["FormTable"].tableSchema.columndict.keys())
    tsv_header.insert(0, "LINGPY_ID")
    tsv_header.append("cognatesetReference")
    if "alignment" not in tsv_header:
        tsv_header.append("alignment")
    if "parameterReference" in delimiters:
        tsv_header.append("_parameterReference")
    # select forms and cognates given restriction of languages and concepts,
    # cognatesets respectively
    forms = {}
    for f, form in util.cache_table(dataset).items():
        if form["form"] is None or form["form"] == "-":
            continue
        if form["languageReference"] in languages and concepts.intersection(
                ensure_list(form["parameterReference"])):
            # Normalize the form:
            # 1. No list-valued entries
            for c, d in delimiters.items():
                if c == "segments":
                    continue
                if c == "parameterReference":
                    # Keep the full list in _parameterReference; CONCEPT
                    # shows only the first concept.
                    form["_parameterReference"] = d.join(
                        str(e) for e in form[c])
                    form["parameterReference"] = form["parameterReference"][0]
                    continue
                form[c] = d.join(str(e) for e in form[c])
            if not form.get("segments"):
                logger.warning(
                    "No segments found for form %s. You can generate segments using `lexedata.edit.add_segments`.",
                    form["id"],
                )
            # 2. No tabs, newlines in entries
            for c, v in form.items():
                if isinstance(v, str):
                    if "\\!t" in form[c] or "\\!n" in form[c]:
                        logger.warning(
                            "Your data contains the special characters '\\!t' or '\\!n', which I will introduce for escaping tabs and newlines for edictor. These characters will not survive the back-import."
                        )
                    form[c] = form[c].replace("\t", "\\!t").replace(
                        "\n", "\\!n")
            forms[f] = form
    cognateset_cache: t.Mapping[t.Optional[str], int]
    if "CognatesetTable" in dataset:
        c_id = dataset["CognatesetTable", "id"].name
        # Number the cognatesets 1-based in table order.
        cognateset_cache = {
            cognateset[c_id]: c
            for c, cognateset in enumerate(dataset["CognatesetTable"], 1)
            if cognateset[c_id] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = t.DefaultDict(itertools.count().__next__)
        else:
            cognateset_cache = {c: i for i, c in enumerate(cognatesets, 1)}
    # Warn about unexpected non-concatenative ‘morphemes’
    lexedata.report.nonconcatenative_morphemes.segment_to_cognateset(
        dataset, cognatesets, logger)
    # form id → (alignment with unjudged segments parenthesized, cognateset list)
    judgements_about_form: t.Mapping[types.Form_ID,
                                     t.Tuple[t.List[str], t.List[int]]] = {
                                         form_id:
                                         ([f"({s})"
                                           for s in form["segments"]], [])
                                         for form_id, form in forms.items()
                                     }
    # Compose all judgements, last-one-rules mode.
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in forms and cognateset_cache.get(
                j["cognatesetReference"]):
            if j.get("alignment"):
                j["alignment"] = [s or "" for s in j["alignment"]]
            else:
                j["alignment"] = forms[j["formReference"]]["segments"]
            try:
                segments_judged = list(
                    parse_segment_slices(segment_slices=j["segmentSlice"],
                                         enforce_ordered=False))
            except TypeError:
                logger.warning(
                    "In judgement %s: No segment slice given. Assuming whole form.",
                    j["id"],
                )
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except KeyError:
                # No segmentSlice column at all: assume the whole form.
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except ValueError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue
            global_alignment, cogsets = judgements_about_form[
                j["formReference"]]
            segment_start, segment_end = min(
                segments_judged), max(segments_judged) + 1
            try:
                glue_in_alignment(
                    global_alignment,
                    cogsets,
                    j["alignment"],
                    j["cognatesetReference"],
                    slice(segment_start, segment_end),
                )
            except IndexError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue
    return forms, judgements_about_form, cognateset_cache
def load_forms_from_tsv(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    input_file: Path,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[
    t.Mapping[int, t.Sequence[t.Tuple[types.Form_ID, range, t.Sequence[str]]]],
    t.Set[types.Form_ID],
]:
    """Read an Edictor TSV export back into the dataset's FormTable.

    Returns a pair: (edictor cognateset number → list of
    (form id, segment range, alignment) partial judgements,
    set of form ids that appeared in the TSV).

    Side effects
    ============
    This function overwrites dataset's FormTable
    """
    input = csv.DictReader(
        input_file.open(encoding="utf-8"),
        delimiter="\t",
    )
    # These days, all dicts are ordered by default. Still, better make this explicit.
    forms = util.cache_table(dataset)
    edictor_cognatesets: t.Dict[
        int, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = collections.defaultdict(list)
    # Upper-cased CLDF property/column name → canonical column name …
    form_table_upper = {
        (util.cldf_property(column.propertyUrl) or column.name).upper(): (
            util.cldf_property(column.propertyUrl) or column.name
        )
        for column in dataset["FormTable"].tableSchema.columns
    }
    # … plus the conventional Edictor/LingPy header names. "ID" maps to ""
    # because the LingPy row number is not a FormTable column.
    form_table_upper.update(
        {
            "DOCULECT": "languageReference",
            "CONCEPT": "parameterReference",
            "IPA": "form",
            "COGID": "cognatesetReference",
            "ALIGNMENT": "alignment",
            "TOKENS": "segments",
            "CLDF_ID": "id",
            "ID": "",
        }
    )
    if "_PARAMETERREFERENCE" in [f.upper() for f in input.fieldnames]:
        # Multi-concept export: the real parameterReference round-trips via
        # _parameterReference, so the CONCEPT column is dropped.
        form_table_upper["_PARAMETERREFERENCE"] = "parameterReference"
        form_table_upper["CONCEPT"] = ""
    separators: t.MutableMapping[str, t.Optional[str]] = {}
    # TODO: What's the logic behind going backwards through this? We are not modifying fieldnames.
    for i in range(len(input.fieldnames)):
        if i == 0 and input.fieldnames[0] != "ID":
            # NOTE(review): input.fieldnames is a list, so indexing it with
            # "ID" raises TypeError — presumably input.fieldnames[0] was
            # meant. Confirm before relying on this error message.
            raise ValueError(
                "When importing from Edictor, expected the first column to be named 'ID', but found %s",
                input.fieldnames["ID"],
            )
        lingpy = input.fieldnames[i]
        try:
            # Rename each header in place to the canonical column name.
            input.fieldnames[i] = form_table_upper[lingpy.upper()]
        except KeyError:
            logger.warning(
                "Your edictor file contained a column %s, which I could not interpret.",
                lingpy,
            )
        if input.fieldnames[i] == "cognatesetReference":
            separators[input.fieldnames[i]] = " "
        elif input.fieldnames[i] == "alignment":
            separators[input.fieldnames[i]] = " "
        # FormTable columns keep their declared separator (None for scalar
        # columns); non-FormTable names are left untouched.
        # NOTE(review): names that never get an entry here (e.g. the "" ID
        # column) would raise KeyError in the separators lookup below —
        # verify against the full file.
        try:
            separators[input.fieldnames[i]] = dataset[
                "FormTable", input.fieldnames[i]
            ].separator
        except KeyError:
            pass
    logger.info(
        "The header of your edictor file will be interpreted as %s.", input.fieldnames
    )
    affected_forms: t.Set[types.Form_ID] = set()
    for line in cli.tq(
        input, task="Importing form rows from edictor…", total=len(forms)
    ):
        # Column "" is the re-named Lingpy-ID column, so the first one.
        if not any(line.values()) or line[""].startswith("#"):
            # One of Edictor's comment rows, storing settings
            continue
        for (key, value) in line.items():
            # Undo the tab/newline escaping applied on export.
            value = value.replace("\\!t", "\t").replace("\\!n", "\n")
            sep = separators[key]
            if sep is not None:
                if not value:
                    line[key] = []
                else:
                    line[key] = value.split(sep)
            else:
                line[key] = value
        affected_forms.add(line["id"])
        try:
            # Split the row's cognateset/alignment annotations into partial
            # (per-morpheme) judgements.
            for segments, cognateset, alignment in extract_partial_judgements(
                line["segments"],
                line["cognatesetReference"],
                line["alignment"],
                logger,
            ):
                edictor_cognatesets[cognateset].append(
                    (line["id"], segments, alignment)
                )
            forms[line["id"]] = line
        except IndexError:
            logger.warning(
                f"In form with Lingpy-ID {line['']}: Cognateset judgements {line['cognatesetReference']} and alignment {line['alignment']} did not match. At least one morpheme skipped."
            )
    # Cognateset number 0 marks "not cognate-coded"; drop it.
    edictor_cognatesets.pop(0, None)
    columns = {
        (util.cldf_property(column.propertyUrl) or column.name): column.name
        for column in dataset["FormTable"].tableSchema.columns
    }
    # Deliberately make use of the property of `write` to discard any entries
    # that don't correspond to existing columns. Otherwise, we'd still have to
    # get rid of the alignment, cognatesetReference and Lingpy-ID columns.
    dataset["FormTable"].write(
        (
            {
                columns[property]: value
                for property, value in form.items()
                if columns.get(property)
            }
            for form in forms.values()
        )
    )
    return edictor_cognatesets, affected_forms
dataset, args.add_singletons_with_status, args.by_segment, logger) try: cogset_order = (util.cldf_property( dataset["CognatesetTable", args.sort_cognatesets_by].propertyUrl) or dataset["CognatesetTable", args.sort_cognatesets_by].name) except (KeyError): cli.Exit.INVALID_COLUMN_NAME( f"No column '{args.sort_cognatesets_by}' in your CognatesetTable.") sort_cognatesets(cogsets, judgements, cogset_order, size=args.size_sort) # TODO: wrap the following two blocks into a # get_sorted_languages() -> t.OrderedDict[languageReference, Column Header/Titel/Name] # function languages = list(util.cache_table(dataset, "LanguageTable").values()) if args.sort_languages_by: c_sort = (util.cldf_property( dataset["LanguageTable", args.sort_languages_by].propertyUrl) or dataset["LanguageTable", args.sort_languages_by].name) languages.sort(key=lambda x: x[c_sort], reverse=False) forms = util.cache_table(dataset) E.create_excel( size_sort=args.size_sort, languages=languages, rows=cogsets, judgements=judgements, forms=forms, )
type=str, default="https://example.org/lexicon/{:}", help= "A template string for URLs pointing to individual forms. For example, to" " point to lexibank, you would use https://lexibank.clld.org/values/{:}." " (default: https://example.org/lexicon/{:})", ) args = parser.parse_args() logger = cli.setup_logging(args) dataset = (pycldf.Wordlist.from_metadata(args.metadata), ) E = MatrixExcelWriter( dataset, database_url=args.url_template, logger=logger, ) forms = util.cache_table(dataset) languages = sorted(util.cache_table(dataset, "LanguageTable").values(), key=lambda x: x["name"]) judgements = [{ "formReference": f["id"], "cognatesetReference": parameter } for f in forms.values() for parameter in util.ensure_list(f["parameterReference"])] parameters = util.cache_table(dataset, "ParameterTable").values() E.create_excel(rows=parameters, judgements=judgements, forms=forms, languages=languages) E.wb.save(filename=args.excel, )
def create_singletons(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    status: t.Optional[str] = None,
    by_segment: bool = False,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[t.Sequence[types.CogSet], t.Sequence[types.Judgement]]:
    """Create singleton cognate judgements for forms that don't have cognate judgements.

    Depending on by_segment, singletons are created for every range of segments
    that is not in any cognate set yet (True) or just for every form where no
    segment is in any cognate sets (False).

    Parameters
    ----------
    dataset:
        The wordlist whose CognateTable (and, if present, CognatesetTable)
        is inspected. The dataset itself is not written to.
    status:
        Value to put into the "Status_Column" of every newly created
        cognateset and judgement (or None to leave it unset).
    by_segment:
        If True, create singletons per uncoded segment range; if False,
        only per fully uncoded form.
    logger:
        Logger for warnings about missing columns.

    Returns
    -------
    A pair (all cognatesets including the new singletons, all judgements
    including the new singleton judgements).
    """
    forms = util.cache_table(dataset)
    c_j_id = dataset["CognateTable", "id"].name
    c_j_cogset = dataset["CognateTable", "cognatesetReference"].name
    c_j_form = dataset["CognateTable", "formReference"].name
    # segmentSlice and alignment are optional CognateTable columns; remember
    # None when they are absent.
    try:
        c_j_segmentslice = dataset["CognateTable", "segmentSlice"].name
    except KeyError:
        c_j_segmentslice = None
    try:
        c_j_alignment = dataset["CognateTable", "alignment"].name
    except KeyError:
        c_j_alignment = None

    if not dataset.get(("CognatesetTable", "Status_Column")):
        logger.warning(
            "No Status_Column in CognatesetTable. I will proceed without. Run `lexedata.edit.add_status_column`` in default mode or with table-names CognatesetTable to add a Status_Column."
        )

    # Collect the existing cognatesets. If there is no CognatesetTable,
    # synthesize minimal cognateset records from the IDs referenced in the
    # CognateTable instead.
    try:
        c_s_id = dataset["CognatesetTable", "id"].name
        all_cognatesets = {s[c_s_id]: s for s in dataset["CognatesetTable"]}
    except KeyError:
        c_s_id = "id"
        c_s_name = "name"
        all_cognatesets = {
            id: types.Judgement({
                "id": id,
                "name": id
            })
            for id in {j[c_j_cogset] for j in dataset["CognateTable"]}
        }
    try:
        c_s_name = dataset["CognatesetTable", "name"].name
    except KeyError:
        # No #name column: fall back to using the ID column for names.
        c_s_name = c_s_id

    all_judgements = list(dataset["CognateTable"])
    if by_segment:
        judgements = segment_to_cognateset(dataset, types.WorldSet(), logger)
        forms_and_segments = uncoded_segments(judgements, logger)
    else:
        forms_and_segments = uncoded_forms(
            forms.values(), {j[c_j_form] for j in all_judgements})
    # NOTE(review): `slice` shadows the builtin; presumably it is an iterable
    # of 0-based segment indices — confirm against uncoded_segments/forms.
    for form, slice in forms_and_segments:
        # Find a fresh singleton ID of the shape X_<form>_<i>.
        i = 1
        singleton_id = f"X_{form}_{i:d}"
        while singleton_id in all_cognatesets:
            i += 1
            singleton_id = f"X_{form}_{i:d}"
        all_cognatesets[singleton_id] = types.CogSet({})
        properties = {
            c_s_name: util.ensure_list(forms[form]["parameterReference"])[0],
            c_s_id: singleton_id,
            "Status_Column": status,
        }
        # Fill the new cognateset with values for exactly the columns the
        # CognatesetTable schema declares; unknown columns get None.
        # NOTE(review): the broad `except KeyError: pass` silently aborts the
        # column loop midway if the CognatesetTable lookup fails — verify
        # this is the intended fallback for datasets without that table.
        try:
            for column in dataset["CognatesetTable"].tableSchema.columns:
                all_cognatesets[singleton_id][column.name] = properties.get(
                    column.name)
        except KeyError:
            pass
        judgement = types.Judgement({})
        # If segmentSlice/alignment columns are absent, their keys here are
        # None and are simply never looked up by the schema loop below.
        properties = {
            c_j_id: singleton_id,
            c_j_cogset: singleton_id,
            c_j_form: form,
            c_j_segmentslice: indices_to_segment_slice(slice),
            c_j_alignment: [forms[form]["segments"][i] for i in slice],
            "Status_Column": status,
        }
        for column in dataset["CognateTable"].tableSchema.columns:
            judgement[column.name] = properties.get(column.name)
        all_judgements.append(judgement)
    return all_cognatesets.values(), all_judgements
def add_cognate_table(
    dataset: pycldf.Wordlist,
    split: bool = True,
    logger: cli.logging.Logger = cli.logger,
) -> None:
    """Move cognate annotations from the FormTable into a new CognateTable.

    Creates a CognateTable component, extracts one cognate judgement per form
    that carries a #cognatesetReference, then removes the cognateset-related
    columns (#cognatesetReference, #segmentSlice, #alignment) from the
    FormTable and rewrites both tables. A no-op if the dataset already has a
    CognateTable.

    Parameters
    ----------
    dataset:
        The wordlist to restructure; modified and written in place.
    split:
        If True, prefix each cognateset ID with the form's concept, so that
        identical cognateset labels under different concepts become distinct
        sets.
    logger:
        Logger for warnings about missing segments etc.
    """
    if "CognateTable" in dataset:
        return
    dataset.add_component("CognateTable")

    # TODO: Check if that cognatesetReference is already a foreign key to
    # elsewhere (could be a CognatesetTable, could be whatever), because then
    # we need to transfer that knowledge.

    # Load anything that's useful for a cognate set table: Form IDs, segments,
    # segment slices, cognateset references, alignments
    columns = {
        "id": dataset["FormTable", "id"].name,
        "concept": dataset["FormTable", "parameterReference"].name,
        "form": dataset["FormTable", "form"].name,
    }
    for property in [
            "segments", "segmentSlice", "cognatesetReference", "alignment"
    ]:
        # These columns are optional in a FormTable; take them when present.
        try:
            columns[property] = dataset["FormTable", property].name
        except KeyError:
            pass
    cognate_judgements = []
    forms = cache_table(dataset, columns=columns)
    forms_without_segments = 0
    for f, form in cli.tq(forms.items(),
                          task="Extracting cognate judgements from forms…"):
        if form.get("cognatesetReference"):
            if split:
                # Namespace the cognateset by concept, e.g. "hand-1".
                cogset = util.string_to_id("{:}-{:}".format(
                    form["concept"], form["cognatesetReference"]))
            else:
                cogset = form["cognatesetReference"]
            judgement = {
                "ID": f,
                "Form_ID": f,
                "Cognateset_ID": cogset,
            }
            # Prefer an explicit segment slice; otherwise derive a 1-based
            # slice covering all segments of the form.
            try:
                judgement["Segment_Slice"] = form["segmentSlice"]
            except KeyError:
                try:
                    if not form["segments"]:
                        raise ValueError("No segments")
                    if ("+" in form["segments"]
                            and dataset["FormTable",
                                        "cognatesetReference"].separator):
                        logger.warning(
                            "You seem to have morpheme annotations in your cognates. I will probably mess them up a bit, because I have not been taught properly how to deal with them. Sorry!"
                        )
                    judgement["Segment_Slice"] = [
                        "1:{:d}".format(len(form["segments"]))
                    ]
                except (KeyError, TypeError, ValueError):
                    # Count the failures, but only warn individually for the
                    # first few; a summary warning is emitted after the loop.
                    forms_without_segments += 1
                    if forms_without_segments >= 5:
                        pass
                    else:
                        logger.warning(
                            f"No segments found for form {f} ({form['form']})."
                        )
            # What does an alignment mean without segments or their slices?
            # Doesn't matter, if we were given one, we take it.
            judgement["Alignment"] = form.get("alignment")
            cognate_judgements.append(judgement)

    if forms_without_segments >= 5:
        logger.warning(
            "No segments found for %d forms. You can generate segments using `lexedata.edit.segment_using_clts`.",
            forms_without_segments,
        )

    # Delete the cognateset column
    cols = dataset["FormTable"].tableSchema.columns
    remove = {
        dataset["FormTable", c].name
        for c in ["cognatesetReference", "segmentSlice", "alignment"]
        if ("FormTable", c) in dataset
    }

    def clean_form(form):
        # Drop the moved-out columns from a form row, in place.
        for c in remove:
            form.pop(c, None)
        return form

    # Materialize the cleaned rows BEFORE removing the columns from the
    # schema, since iteration depends on the current schema.
    forms = [clean_form(form) for form in dataset["FormTable"]]

    for c in remove:
        ix = cols.index(dataset["FormTable", c])
        del cols[ix]

    dataset.write(FormTable=forms)
    dataset.write(CognateTable=cognate_judgements)
def apply_heuristics(
    dataset: types.Wordlist,
    heuristic: t.Optional[AbsenceHeuristic] = None,
    primary_concepts: t.Union[
        types.WorldSet[types.Parameter_ID],
        t.AbstractSet[types.Parameter_ID]] = types.WorldSet(),
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Cognateset_ID, t.Set[types.Parameter_ID]]:
    """Compute the relevant concepts for cognatesets, depending on the heuristic.

    These concepts will be considered when deciding whether a root is deemed
    absent in a language.

    For the CentralConcept heuristic, the relevant concepts are the central
    concept of a cognateset, as given by the #parameterReference column of the
    CognatesetTable. A central concept not included in the primary_concepts is
    ignored with a warning.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concept",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference"))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concept": "concept1"}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {'cognateset1': {'concept1'}}
    True

    This extends to the case where a cognateset may have more than one central
    concept.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concepts",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
    ...         separator=","))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concepts": ["concept1", "concept2"]}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {
    ...     'cognateset1': {'concept1', 'concept2'}}
    True

    For the HalfPrimaryConcepts heuristic, the relevant concepts are all
    primary concepts connected to a cognateset.

    >>> ds = util.fs.new_wordlist(
    ...     FormTable=[
    ...         {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1", "Form": "x"},
    ...         {"ID": "f2", "Parameter_ID": "c2", "Language_ID": "l1", "Form": "x"}],
    ...     CognateTable=[
    ...         {"ID": "1", "Form_ID": "f1", "Cognateset_ID": "s1"},
    ...         {"ID": "2", "Form_ID": "f2", "Cognateset_ID": "s1"}])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.HALFPRIMARYCONCEPTS) == {
    ...     's1': {'c1', 'c2'}}
    True

    NOTE: This function cannot guarantee that every concept has at least one
    relevant concept, there may be cognatesets without! A cognateset with 0
    relevant concepts will always be included, because 0 is at least half of
    0.
    """
    # Default heuristic: use the central concepts if the CognatesetTable has
    # a #parameterReference column, otherwise fall back to the primary
    # concepts connected through the CognateTable.
    heuristic = (heuristic if heuristic is not None else
                 (AbsenceHeuristic.CENTRALCONCEPT if
                  ("CognatesetTable",
                   "parameterReference") in dataset else
                  AbsenceHeuristic.HALFPRIMARYCONCEPTS))

    relevant_concepts: t.MutableMapping[
        types.Cognateset_ID, t.Set[types.Parameter_ID]] = t.DefaultDict(set)

    if heuristic is AbsenceHeuristic.HALFPRIMARYCONCEPTS:
        c_f = dataset["CognateTable", "formReference"].name
        c_s = dataset["CognateTable", "cognatesetReference"].name
        # Map each form to its concept(s) so each judgement can contribute
        # its form's concepts to the cognateset.
        concepts = util.cache_table(
            dataset,
            "FormTable",
            {"concepts": dataset["FormTable", "parameterReference"].name},
        )
        for j in dataset["CognateTable"]:
            form = concepts[j[c_f]]
            for concept in util.ensure_list(form["concepts"]):
                relevant_concepts[j[c_s]].add(concept)

    elif heuristic is AbsenceHeuristic.CENTRALCONCEPT:
        c_cognateset_concept = dataset["CognatesetTable",
                                       "parameterReference"].name
        c_id = dataset["CognatesetTable", "id"].name
        for c in dataset["CognatesetTable"]:
            for concept in util.ensure_list(c[c_cognateset_concept]):
                if concept not in primary_concepts:
                    logger.warning(
                        f"The central concept {concept} of cognateset {c[c_id]} was not part of your list of primary concepts to be included in the coding, so the cognateset will be ignored."
                    )
                else:
                    relevant_concepts[c[c_id]].add(concept)

    else:
        raise TypeError(
            f"Value of heuristic, {heuristic}, did not correspond to a known AbsenceHeuristic."
        )

    return relevant_concepts
def check_cognate_table(dataset: pycldf.Wordlist,
                        logger=cli.logger,
                        strict_concatenative=False) -> bool:
    """Check that the CognateTable makes sense.

    The cognate table MUST have an indication of forms, in a #formReference
    column, and cognate sets, in a #cognatesetReference column. It SHOULD have
    segment slices (#segmentSlice) and alignments (#alignment).

    - The segment slice must be a valid (1-based, inclusive) slice into the
      segments of the form
    - The alignment must match the segment slice applied to the segments of
      the form
    - The length of the alignment must match the lengths of other alignments
      of that cognate set
    - NA forms (Including "" for “source reports form as unknown” must not be
      in cognatesets)

    If checking for strictly concatenative morphology, also check that the
    segment slice is a contiguous, non-overlapping section of the form.

    Having no cognates is a valid choice for a dataset, so this function
    returns True if no CognateTable was found.
    """
    # First, load all forms that are referenced in the CognateTable
    try:
        cognatetable = dataset["CognateTable"]
    except KeyError:
        # Having no cognates is a valid choice for a dataset.
        return True
    try:
        c_form = dataset["CognateTable", "formReference"].name
    except KeyError:
        log_or_raise("CognateTable does not have a #formReference column.")
        # All further checks don't make sense, return early.
        return False
    try:
        c_cognateset = dataset["CognateTable", "cognatesetReference"].name
    except KeyError:
        log_or_raise(
            "CognateTable does not have a #cognatesetReference column.")
        # All further checks don't make sense, return early.
        return False

    # The CLDF specifications state that foreign key references take precedence
    # over the implicit semantics of a `#xxxReference` column pointing to an
    # `#id` column, so we need to find forms by the stated foreign key
    # relationship.
    for foreign_key in cognatetable.tableSchema.foreignKeys:
        if foreign_key.columnReference == [c_form]:
            referenced_table = str(foreign_key.reference.resource)
            # A multi-column column reference for a single-column foreign key
            # makes no sense, so use tuple unpacking to extract the only
            # element from that list.
            (referenced_column, ) = foreign_key.reference.columnReference
            if (not dataset[referenced_table].common_props["dc:conformsTo"]
                    == "http://cldf.clld.org/v1.0/terms.rdf#FormTable"):
                log_or_raise(
                    "CognateTable #formReference does not reference a FormTable.",
                )
            break
    else:
        # The for-else triggers only when no foreign key matched #formReference.
        log_or_raise("CognateTable #formReference must be a foreign key.")
        # All further checks don't make sense, return early.
        return False

    # #segmentSlice and #alignment are optional; remember None when absent.
    try:
        c_sslice = dataset["CognateTable", "segmentSlice"].name
    except KeyError:
        logger.info("CognateTable does not have a #segmentSlice column.")
        c_sslice = None
    try:
        c_alignment = dataset["CognateTable", "alignment"].name
    except KeyError:
        logger.info("CognateTable does not have an #alignment column.")
        c_alignment = None
    if c_sslice is None and c_alignment is None:
        # No additional data concerning the associations between forms and
        # cognate sets. That's sad, but valid.
        # All further checks don't make sense, return early.
        return True

    # form_given() decides whether a row represents an actually attested form
    # (non-empty and not the NA marker "-"); without a #form column every row
    # counts as given.
    try:
        c_f_form = dataset[referenced_table, "form"].name

        def form_given(row):
            return row[c_f_form] and row[c_f_form].strip() != "-"
    except KeyError:
        if dataset[referenced_table] == dataset["FormTable"]:
            log_or_raise("FormTable does not have a #form column.")

        def form_given(row):
            return True

    # Check whether each row is valid.
    all_judgements_okay = True
    forms = cache_table(
        dataset,
        columns={"segments": dataset[referenced_table, "segments"].name},
        table=referenced_table,
        index_column=referenced_column,
        filter=form_given,
    )
    # Forms filtered out above (NA forms) are cached separately so that a
    # judgement pointing at one can be reported specifically.
    missing_forms = cache_table(
        dataset,
        columns={},
        table=referenced_table,
        index_column=referenced_column,
        filter=lambda row: not form_given(row),
    )
    cognateset_alignment_lengths: t.DefaultDict[
        t.Any, t.Set[int]] = t.DefaultDict(set)

    for f, j, judgement in dataset["CognateTable"].iterdicts(
            with_metadata=True):
        try:
            form_segments = forms[judgement[c_form]]["segments"]
        except KeyError:
            if judgement[c_form] in missing_forms:
                log_or_raise(
                    "In {}, row {}: NA form {} was judged to be in cognate set."
                    .format(f, j, judgement[c_form]),
                )
            # The case of a missing foreign key in general is already handled
            # by the basic CLDF validator.
            continue
        if c_sslice is not None:
            if not judgement[c_sslice]:
                log_or_raise("In {}, row {}: Empty segment slice".format(f, j))
                continue
            try:
                # NOTE(review): these bounds treat the parsed indices as
                # 0-based (>= len is out of range) although the docstring
                # describes slices as 1-based — presumably
                # parse_segment_slices converts to 0-based; confirm.
                included_segments = list(
                    parse_segment_slices(judgement[c_sslice]))
                if (max(included_segments) >= len(form_segments)
                        or min(included_segments) < 0):
                    log_or_raise(
                        "In {}, row {}: Segment slice {} is invalid for segments {}"
                        .format(
                            f,
                            j,
                            judgement[c_sslice],
                            form_segments,
                        ),
                    )
                    all_judgements_okay = False
                    continue
                if strict_concatenative:
                    # Require strictly consecutive indices.
                    # NOTE(review): this reports the problem but does not set
                    # all_judgements_okay = False — verify that is intended.
                    s1 = included_segments[0]
                    for s2 in included_segments[1:]:
                        if s2 != s1 + 1:
                            log_or_raise(
                                "In {}, row {}: Segment slice {} has non-consecutive elements {}, {}"
                                .format(
                                    f,
                                    j,
                                    judgement[c_sslice],
                                    s1,
                                    s2,
                                ))
                        s1 = s2
            except ValueError:
                log_or_raise(
                    "In {}, row {}: Segment slice {} is invalid".format(
                        f,
                        j,
                        judgement[c_sslice],
                    ))
                all_judgements_okay = False
                continue
        else:
            # No slice column: the judgement implicitly covers the whole form.
            included_segments = list(range(len(form_segments)))

        if c_alignment:
            # Length of alignment should match length of every other alignment in this cognate set.
            lengths = cognateset_alignment_lengths[judgement[c_cognateset]]
            alignment_length = len(judgement[c_alignment])
            if lengths and alignment_length not in lengths:
                log_or_raise(
                    "In {}, row {}: Alignment has length {}, other alignments of cognateset {} have length(s) {}"
                    .format(f, j, alignment_length, judgement[c_cognateset],
                            lengths),
                )
                all_judgements_okay = False
            elif not lengths:
                lengths.add(alignment_length)

            # Alignment when gaps are removed should match segments. TODO:
            # Should we permit other gap characters? Where do we know them
            # from? TODO: To be more robust when segments are separated into
            # morphemes, not individual segments, compare alignment and
            # segments space-separated.
            without_gaps = " ".join(
                [c or "" for c in judgement[c_alignment] if c != "-"])
            actual_segments = " ".join(form_segments[i]
                                       for i in included_segments)
            if without_gaps.strip() != actual_segments.strip():
                # Distinguish genuine mismatches from pure Unicode
                # normalization differences, which get a helpful hint.
                if unicodedata.normalize(
                        "NFKC", without_gaps.strip()) == unicodedata.normalize(
                            "NFKC", actual_segments.strip()):
                    comment = " This is down to encoding differences: Their normalized unicode representations are the same. I suggest you run `lexedata.edit.normalize_unicode`."
                else:
                    comment = ""
                log_or_raise(
                    "In {}, row {}: Referenced segments in form resolve to {}, while alignment contains segments {}.{}"
                    .format(f, j, actual_segments, without_gaps, comment),
                )
                all_judgements_okay = False

    return all_judgements_okay
def edictor_to_cldf(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    new_cogsets: t.Mapping[
        types.Cognateset_ID, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ],
    affected_forms: t.Set[types.Form_ID],
    source: t.Optional[t.List[str]] = None,
):
    """Write cognate sets edited in Edictor back into the dataset's CognateTable.

    Judgements for forms not in ``affected_forms`` are kept unchanged. For
    affected forms, the old judgements are matched against ``new_cogsets``
    (via ``match_cognatesets``): a judgement that survives keeps its row (with
    updated segment slice and alignment), while a judgement in a cognateset
    with no match gets a fresh row whose cognateset ID is derived from the
    member form IDs. The resulting judgements, sorted by ID, replace the
    CognateTable contents.

    Parameters
    ----------
    dataset:
        The wordlist whose CognateTable is rewritten in place.
    new_cogsets:
        Mapping from cognateset ID to the (form, segment range, alignment)
        triples Edictor reported for that set.
    affected_forms:
        The forms that were exported to Edictor; only their judgements are
        reconciled.
    source:
        Source references to attach to newly created judgements. Defaults to
        no sources. (Was a mutable default ``[]``; now None-guarded so the
        same list object is not shared across calls.)
    """
    if source is None:
        source = []
    ref_cogsets: t.MutableMapping[
        types.Cognateset_ID, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = t.DefaultDict(list)
    cognate: t.List[types.Judgement] = []
    judgements_lookup: t.MutableMapping[
        types.Form_ID, t.MutableMapping[types.Cognateset_ID, types.Judgement]
    ] = t.DefaultDict(dict)
    # Partition the existing judgements: untouched ones go straight to the
    # output, affected ones are indexed for matching against the new sets.
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in affected_forms:
            ref_cogsets[j["cognatesetReference"]].append(
                (j["formReference"], j["segmentSlice"], j["alignment"])
            )
            judgements_lookup[j["formReference"]][j["cognatesetReference"]] = j
        else:
            cognate.append(j)
    matches = match_cognatesets(new_cogsets, ref_cogsets)

    for cognateset, judgements in new_cogsets.items():
        cognateset = matches[cognateset]
        if cognateset is None:
            # No existing cognateset matched: invent an ID from the members.
            cognateset = "_".join(f for f, _, _ in judgements)
        for form, slice, alignment in judgements:
            # Reuse the existing judgement row where one exists, so extra
            # columns (comments, status, ...) survive the roundtrip.
            was: t.Optional[types.Judgement] = judgements_lookup.get(
                form, {}).get(cognateset)
            if was:
                was["segmentSlice"] = util.indices_to_segment_slice(slice)
                was["alignment"] = alignment
                cognate.append(was)
                continue
            cognate.append(
                types.Judgement(
                    {
                        "id": f"{form}-{cognateset}",
                        "formReference": form,
                        "cognatesetReference": cognateset,
                        "alignment": alignment,
                        "segmentSlice": util.indices_to_segment_slice(slice),
                        "source": source,
                        # TODO: Any more parameters? Status update?
                    }
                )
            )
    cognate.sort(key=lambda j: j["id"])
    # Translate CLDF property names back to the dataset's actual column
    # names, dropping keys that have no corresponding column.
    m = {
        util.cldf_property(c.propertyUrl) or c.name: c.name
        for c in dataset["CognateTable"].tableSchema.columns
    }
    dataset["CognateTable"].write(
        [{m[k]: v for k, v in j.items() if k in m} for j in cognate]
    )