def merge_group(
    forms: t.Sequence[types.Form],
    target: types.Form,
    mergers: t.Mapping[str, Merger],
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    logger: cli.logging.Logger = cli.logger,
) -> types.Form:
    """Merge one group of homophones

    >>> merge_group(
    ...   [{"Parameter_ID": [1, 1]}, {"Parameter_ID": [2]}],
    ...   {"Parameter_ID": [1, 1]}, {"Parameter_ID": union}, util.fs.new_wordlist())
    {'Parameter_ID': [1, 2]}

    The target is assumed to be already included in the forms.

    >>> merge_group(
    ...   [{"Parameter_ID": [1, 1]}, {"Parameter_ID": [2]}],
    ...   {"Parameter_ID": [1, 1]}, {"Parameter_ID": concatenate}, util.fs.new_wordlist())
    {'Parameter_ID': [1, 1, 2]}

    For every column of `target` except the FormTable ID column, a merge
    function is looked up in `mergers`, first by column name, then by the
    column's CLDF property name, falling back to `must_be_equal`. It is called
    with the values of that column from all `forms` and with `target`; its
    result is written back into `target`, which is modified in place.

    Parameters
    ----------
    forms: The forms to be merged.
    target: The form that receives the merged values.
    mergers: Merge functions, keyed by column name or CLDF property name.
    dataset: The wordlist whose FormTable describes the columns.
    logger: Receives an error message when a group has to be skipped.

    Returns
    -------
    The `target` object, after merging.

    Raises
    ------
    Skip: if a merge function raises AssertionError for this group.

    """
    c_f_id = dataset["FormTable", "id"].name
    for column in target:
        if column == c_f_id:
            # The ID of the target is kept as it is, it is never merged.
            continue
        try:
            reference_name = (
                util.cldf_property(dataset["FormTable", column].propertyUrl) or column
            )
            merger = mergers.get(column, mergers.get(reference_name, must_be_equal))
            values = [form[column] for form in forms]
            try:
                merge_result = merger(values, target)
            except AssertionError:
                # We cannot deal with this block, but others may be fine.
                merger_name = getattr(merger, "__name__", repr(merger))
                logger.error(
                    f"Merging forms: {[f[c_f_id] for f in forms]} with target: {target[c_f_id]} on column: {column}\n"
                    f"The merge function {merger_name} requires the input data to be equal. \n"
                    f"Given input: {values}"
                )
                raise Skip
            except (TypeError, NotImplementedError):
                # Both exception types are treated as "this merger cannot
                # handle this kind of value". Other groups will have the same
                # issue, so this is reported as a fatal input error.
                # NOTE(review): cli.Exit.INVALID_INPUT(...) presumably
                # terminates the program; otherwise merge_result is unbound
                # below.
                merger_name = getattr(merger, "__name__", repr(merger))
                cli.Exit.INVALID_INPUT(
                    f"Merging forms: {[f[c_f_id] for f in forms]} with target: {target[c_f_id]} \n"
                    f"The merge function {merger_name} is not implemented for type {type(values[0])}. \n"
                    f"Given input: {values}"
                )
            target[column] = merge_result
        except KeyError:
            # Raised by the FormTable column lookup for unknown columns (and
            # by any other failing key lookup inside the block above).
            cli.Exit.INVALID_COLUMN_NAME(f"Column {column} is not in FormTable.")
    return target
def merge_group(
    cogsets: t.Sequence[types.CogSet],
    target: types.CogSet,
    mergers: t.Mapping[str, Merger],
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    logger: cli.logging.Logger = cli.logger,
) -> types.CogSet:
    """Merge one group of cognate sets

    The target is assumed to be already included in the cognate sets.

    For every column of `target` except the CognatesetTable ID column, a merge
    function is looked up in `mergers`, first by column name, then by the
    column's CLDF property name, falling back to `must_be_equal`. It is called
    with the values of that column from all `cogsets` and with `target`; its
    result is written back into `target`, which is modified in place.

    Parameters
    ----------
    cogsets: The cognate sets to be merged.
    target: The cognate set that receives the merged values.
    mergers: Merge functions, keyed by column name or CLDF property name.
    dataset: The wordlist whose CognatesetTable describes the columns.
    logger: Receives an error message when a group has to be skipped.

    Returns
    -------
    The `target` object, after merging.

    Raises
    ------
    Skip: if a merge function raises AssertionError for this group.

    """
    c_s_id = dataset["CognatesetTable", "id"].name
    for column in target:
        if column == c_s_id:
            # The ID of the target is kept as it is, it is never merged.
            continue
        try:
            reference_name = (
                util.cldf_property(dataset["CognatesetTable", column].propertyUrl)
                or column
            )
            merger = mergers.get(column, mergers.get(reference_name, must_be_equal))
            values = [cogset[column] for cogset in cogsets]
            try:
                merge_result = merger(values, target)
            except AssertionError:
                # We cannot deal with this block, but others may be fine.
                merger_name = getattr(merger, "__name__", repr(merger))
                logger.error(
                    f"Merging cognate sets: {[f[c_s_id] for f in cogsets]} with target: {target[c_s_id]} on column: {column}\n"
                    f"The merge function {merger_name} requires the input data to be equal. \n"
                    f"Given input: {values}"
                )
                raise Skip
            except (TypeError, NotImplementedError):
                # Both exception types are treated as "this merger cannot
                # handle this kind of value". Other groups will have the same
                # issue, so this is reported as a fatal input error.
                # NOTE(review): cli.Exit.INVALID_INPUT(...) presumably
                # terminates the program; otherwise merge_result is unbound
                # below.
                merger_name = getattr(merger, "__name__", repr(merger))
                cli.Exit.INVALID_INPUT(
                    f"Merging cognate sets: {[f[c_s_id] for f in cogsets]} with target: {target[c_s_id]} \n"
                    f"The merge function {merger_name} is not implemented for type {type(values[0])}. \n"
                    f"Given input: {values}"
                )
            target[column] = merge_result
        except KeyError:
            # Raised by the CognatesetTable column lookup for unknown columns
            # (and by any other failing key lookup inside the block above).
            cli.Exit.INVALID_COLUMN_NAME(f"Column {column} is not in CognatesetTable.")
    return target
def parse(
    self,
    cell: op.cell.Cell,
    language_id: str,
    cell_identifier: str = "",
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterable[Judgement]:
    """Parse one cell into at most one judgement.

    The cell's hyperlink target is searched with `self.extractor`, and the
    "ID" group of the match becomes the value of the `c_id` column. The cell
    text is passed to `alignment_from_braces` (wrapped in braces first if it
    contains no opening brace) to obtain the segment slices and the alignment.
    The cell comment is stored as well. Segments, alignment and comment are
    only included if `self.c` has a column name for them.

    A cell without a hyperlink target yields nothing.

    Parameters
    ----------
    cell: The cell to parse.
    language_id: Not used by this implementation.
    cell_identifier: Not used by this implementation.
    logger: Receives an error message if no ID can be extracted from the URL.

    Yields
    ------
    A single `Judgement` built from the properties described above.

    """
    try:
        url = cell.hyperlink.target
    except AttributeError:
        # No hyperlink on this cell (or no hyperlink support at all), so
        # there is nothing to parse. Only this lookup is guarded, so that
        # attribute errors elsewhere are not silently swallowed.
        return

    text = clean_cell_value(cell)
    comment = get_cell_comment(cell)
    if "{" not in text:
        # Text without braces is treated as one braced group.
        text = "{" + text + "}"
    segment_slices, alignment = alignment_from_braces(text)

    try:
        form_id = self.extractor.search(url)["ID"]
    except (TypeError, IndexError):
        # TypeError: the search did not return a match object (or the URL was
        # not searchable); IndexError: the pattern has no group named "ID".
        logger.error(
            f"Could not extract group ID from URL {url} using regular expression {self.extractor.pattern}"
        )
        # NOTE(review): presumably terminates the program; otherwise form_id
        # is unbound below.
        cli.Exit.INVALID_ID()

    properties = {
        self.c["c_id"]: form_id,
        self.c.get("c_segments"): [f"{i}:{j}" for i, j in segment_slices],
        self.c.get("c_alignment"): alignment,
        self.c.get("c_comment"): comment,
    }
    # Columns missing from self.c were all stored under the key None.
    properties.pop(None, None)
    yield Judgement(properties)
def add_concepticon_definitions(
    dataset: pycldf.Dataset,
    column_name: str = "Concepticon_Definition",
    logger: cli.logging.Logger = cli.logger,
) -> None:
    """Write Concepticon definitions into a column of the concepts table.

    If the ParameterTable has no #concepticonReference column, an error is
    logged and nothing is changed. Otherwise the column `column_name` is
    created if it does not exist yet (and the metadata is written), each
    concept whose reference is found in Concepticon's concept sets gets that
    concept set's definition, and the whole table is written back.

    Concepts whose reference cannot be looked up are written back unchanged.

    Parameters
    ----------
    dataset: The dataset whose ParameterTable is updated.
    column_name: The name of the column that receives the definitions.
    logger: Receives the error and info messages.

    """
    reference_column = dataset.column_names.parameters.concepticonReference
    if reference_column is None:
        logger.error(
            "Your concepts table has no #concepticonReference column, so I cannot add any definitions from Concepticon to it. Try running lexedata.edit.add_concepticon to have me guess those references."
        )
        return

    # Make sure the definition column exists before filling it.
    try:
        dataset["ParameterTable", column_name]
    except KeyError:
        dataset.add_columns("ParameterTable", column_name)
        dataset.write_metadata()
        # From here on, any exception is an unexpected one.
    else:
        logger.info(
            "Overwriting existing {:} column in concepts table".format(column_name)
        )

    # Collect all concepts, with definitions filled in where available.
    concepts = []
    progress = cli.tq(
        dataset["ParameterTable"],
        task="Write concepts with concepticon definitions to dataset",
    )
    for concept in progress:
        try:
            conceptset = concepticon.api.conceptsets[concept[reference_column]]
            concept[column_name] = conceptset.definition
        except KeyError:
            # No usable Concepticon reference: keep the concept as it is.
            pass
        concepts.append(concept)

    dataset.write(ParameterTable=concepts)
def treat_brackets(
    table: t.Iterable[R],
    form_column_name: str = "form",
    variants_column_name: str = "variants",
    comment_column_name: str = "comment",
    bracket_pairs: t.Sequence[t.Tuple[str, str]] = (("(", ")"),),
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterator[R]:
    """Make sure forms contain no brackets.

    >>> for row in treat_brackets([
    ...   {'F': 'a(m)ba', 'V': [], 'C': ''},
    ...   {'F': 'da (dialectal)', 'V': [], 'C': ''},
    ...   {'F': 'tu(m) (informal)', 'V': [], 'C': '2p'}],
    ...   "F", "V", "C"):
    ...   print(row)
    {'F': 'amba', 'V': ['aba'], 'C': ''}
    {'F': 'da', 'V': [], 'C': '(dialectal)'}
    {'F': 'tum', 'V': ['tu'], 'C': '2p; (informal)'}


    Skipping works even when it is noticed only late in the process.

    >>> for row in treat_brackets([
    ...   {'F': 'a[m]ba (unbalanced', 'V': [], 'C': ''},
    ...   {'F': 'tu(m) (informal', 'V': [], 'C': ''}],
    ...   "F", "V", "C", [("[", "]"), ("(", ")")]):
    ...   print(row)
    {'F': 'a[m]ba (unbalanced', 'V': [], 'C': ''}
    {'F': 'tu(m) (informal', 'V': [], 'C': ''}

    Parameters
    ----------
    table: The rows to process.
    form_column_name: The column holding the form.
    variants_column_name: The column holding the list of variants.
    comment_column_name: The column holding the comment.
    bracket_pairs: Pairs of (opening, closing) bracket, processed in order.
    logger: Receives an error message for every row that is left unmodified.

    Yields
    ------
    One row per input row: a new row with the unbracketed form, the extended
    variants and the extended comment, or the untouched input row if
    unbracketing raised Skip.

    """
    for r, row in enumerate(table):
        form = row[form_column_name]
        # Work on a copy, so the variants of the input row stay untouched.
        variants = row[variants_column_name][:]
        comment = [row[comment_column_name]] if row[comment_column_name] else []
        try:
            for opening_b, closing_b in bracket_pairs:
                if opening_b not in form and closing_b not in form:
                    continue
                form, new_variants, new_comments = unbracket_single_form(
                    form, opening_b, closing_b
                )
                variants.extend(new_variants)
                comment.extend(new_comments)
        except Skip as e:
            # The input row was never modified, so it can be passed on as it
            # is, even if an earlier bracket pair had already succeeded.
            logger.error(
                "Line %d: Form '%s' has %s. I did not modify the row.",
                r,
                row[form_column_name],
                e.message,
            )
            yield row
        else:
            # All bracket pairs were handled: emit a new row with the results.
            yield {
                **row,
                form_column_name: form,
                variants_column_name: variants,
                comment_column_name: "; ".join(comment),
            }