def import_data_from_sheet(
    sheet,
    sheet_header,
    language_id: str,
    implicit: t.Mapping[Literal["languageReference", "id", "value"], str] = {},
    concept_column: t.Tuple[str, str] = ("Concept_ID", "Concept_ID"),
) -> t.Iterable[Form]:
    """Yield one form for every usable data row of ``sheet``.

    The first row of the sheet is treated as a header and skipped. The cells
    of each later row are paired with ``sheet_header`` (``zip`` silently drops
    whatever is left over on the longer side) and passed through
    ``clean_cell_value``. Rows in which any cleaned cell equals ``"?"`` are
    skipped.

    Because this is a generator, nothing (including the concept-column check)
    runs before the first item is requested.

    Args:
        sheet: Worksheet-like object providing ``iter_rows()`` and ``title``.
        sheet_header: Column names, in the order of the cells of a row.
        language_id: Value stored in the implicit language reference column.
        implicit: Maps ``"value"``, ``"id"`` and ``"languageReference"`` to
            the names of columns that are filled in here rather than read
            from the sheet. The default mapping is never mutated.
        concept_column: ``(target, source)``: the concept is read from the
            ``source`` column of the sheet and stored under ``target``.

    Yields:
        One ``Form`` per accepted row.

    Raises:
        AssertionError: If ``concept_column[1]`` is not in ``sheet_header``.
    """
    row_iter = sheet.iter_rows()

    # TODO?: compare header of this sheet to format of given dataset process
    # row. Maybe unnecessary.
    # Skip the header row. The default keeps an empty sheet from raising
    # StopIteration, which inside a generator surfaces as RuntimeError.
    next(row_iter, None)

    assert (
        concept_column[1] in sheet_header
    ), f"Could not find concept column {concept_column[1]} in your excel sheet {sheet.title}."

    for row in row_iter:
        data = Form({k: clean_cell_value(cell) for k, cell in zip(sheet_header, row)})
        if "?" in data.values():
            continue
        if "value" in implicit:
            # The implicit value is the tab-joined content of the whole row,
            # taken before the concept column is moved.
            data[implicit["value"]] = "\t".join(map(str, data.values()))
        concept_entry = data.pop(concept_column[1])
        data[concept_column[0]] = concept_entry
        if "id" in implicit:
            # Placeholder only; no ID is generated here.
            data[implicit["id"]] = None
        if "languageReference" in implicit:
            data[implicit["languageReference"]] = language_id
        yield data
def parse(self, cell: openpyxl.cell.Cell, language_id: str, cell_identifier: str = '') -> t.Iterable[Form]:
    """Yield the form referenced by the hyperlink of ``cell``, if any.

    The form ID is the part of the hyperlink target after the last ``/``.
    A cell whose hyperlink or hyperlink target cannot be read that way
    (``AttributeError``) yields nothing. ``language_id`` and
    ``cell_identifier`` are accepted but not used here.
    """
    try:
        target = cell.hyperlink.target
        path_parts = target.split("/")
        yield Form({"cldf_id": path_parts[-1]})
    except AttributeError:
        # No usable hyperlink on this cell: it references no form.
        return
def parse_form(self, form_string: str, language_id: str, cell_identifier: str = '') -> t.Optional[Form]:
    """Wrap a raw form string in a minimal ``Form``.

    The untouched string becomes the value, its whitespace-stripped version
    the form, and ``language_id`` the language reference.
    ``cell_identifier`` is accepted but not used here.
    """
    stripped = form_string.strip()
    return Form(
        dict(
            cldf_value=form_string,
            cldf_form=stripped,
            cldf_languageReference=language_id,
        )
    )
def parse_form(self, form_string: str, language_id: str, cell_identifier: str = "") -> t.Optional[Form]:
    """Wrap a raw form string in a minimal ``Form``.

    Column names are taken from ``self.c``: the untouched string is stored
    under the ``"value"`` column, its whitespace-stripped version under the
    ``"form"`` column and ``language_id`` under the ``"lang"`` column.
    ``cell_identifier`` is accepted but not used here.
    """
    # Pairs are listed in the order the columns should be inserted; a later
    # pair wins if two column names happen to coincide.
    column_values = [
        (self.c["value"], form_string),
        (self.c["form"], form_string.strip()),
        (self.c["lang"], language_id),
    ]
    return Form(dict(column_values))
def handle_form(
    self,
    params,
    row_object: CogSet,
    cell_with_forms,
    this_lan,
    status_update: t.Optional[str],
):
    """Store a cognate judgement, or link an existing form to ``row_object``.

    If ``params`` carries ``__table__ == "CognateTable"`` it is treated as a
    judgement: it is pointed at the cognate set ``row_object``, given an ID
    of the shape ``<form id>-<cognate set id>`` if it has none, optionally
    tagged with ``status_update``, and inserted into the database.

    Otherwise ``params`` is treated as a form description. A form with an
    explicit ID is associated directly; without one, the first database
    candidate accepted by ``self.check_for_match`` is associated.

    Args:
        params: Judgement-like object or mapping describing a form.
        row_object: The cognate set of the current row.
        cell_with_forms: The cell being processed; its ``coordinate`` is
            reported when no form is found.
        this_lan: Language ID passed on to ``on_form_not_found``.
        status_update: If truthy, written to ``"Status_Column"`` of a new
            judgement.

    Raises:
        NotImplementedError: If no matching form exists and
            ``on_form_not_found`` asks for the form to be created.
    """
    # Only the probe for ``__table__`` may legitimately fail (plain mappings
    # have no such attribute). Guarding just that lookup keeps an
    # AttributeError raised by the database calls below from being silently
    # swallowed and the judgement from being re-processed as a form.
    if getattr(params, "__table__", None) == "CognateTable":
        row_id = row_object[self.db.dataset["CognatesetTable", "id"].name]
        params[self.db.dataset["CognateTable", "cognatesetReference"].name] = row_id
        c_j_id = self.db.dataset["CognateTable", "id"].name
        if c_j_id not in params:
            form_id = params[self.db.dataset["CognateTable", "formReference"].name]
            params[c_j_id] = f"{form_id}-{row_id}"
            self.db.make_id_unique(params)
        # add status update if given
        if status_update:
            params["Status_Column"] = status_update
        self.db.insert_into_db(params)
        return

    # Deal with the more complex case where we are given a form and need
    # to discern what to do with it.
    form = Form(params)
    c_f_id = self.db.dataset["FormTable", "id"].name
    if c_f_id in form:
        self.db.associate(form[c_f_id], row_object)
    else:
        try:
            form_id = next(
                iter(self.db.find_db_candidates(form, self.check_for_match)))
            self.db.associate(form_id, row_object)
        except StopIteration:
            # No candidate at all: let the configured callback decide.
            if self.on_form_not_found(
                form,
                cell_identifier=cell_with_forms.coordinate,
                language_id=this_lan,
            ):
                raise NotImplementedError(
                    "Creating a form is not supported in CognateExcelParser"
                )
def import_data_from_sheet(
    sheet,
    sheet_header,
    implicit: t.Mapping[Literal["languageReference", "id", "value"], str] = {},
    entries_to_concepts: t.Mapping[str, str] = KeyKeyDict(),
    concept_column: t.Tuple[str, str] = ("Concept_ID", "Concept_ID"),
) -> t.Iterable[Form]:
    """Yield one form for every data row of ``sheet`` with a known concept.

    The first row of the sheet is treated as a header and skipped. The cells
    of each later row are paired with ``sheet_header`` (``zip`` silently drops
    whatever is left over on the longer side) and passed through
    ``clean_cell_value``. The sheet title is used as the language reference.

    Because this is a generator, nothing (including the concept-column check)
    runs before the first item is requested.

    Args:
        sheet: Worksheet-like object providing ``iter_rows()`` and ``title``.
        sheet_header: Column names, in the order of the cells of a row.
        implicit: Maps ``"value"``, ``"id"`` and ``"languageReference"`` to
            the names of columns that are filled in here rather than read
            from the sheet. The default mapping is never mutated.
        entries_to_concepts: Translates the concept cell of a row into the
            concept stored on the form. Rows whose concept is not a key of
            this mapping are logged and skipped.
        concept_column: ``(target, source)``: the concept is read from the
            ``source`` column of the sheet and stored under ``target``.

    Yields:
        One ``Form`` per accepted row.

    Raises:
        AssertionError: If ``concept_column[1]`` is not in ``sheet_header``.
    """
    row_iter = sheet.iter_rows()

    # TODO?: compare header of this sheet to format of given data set process
    # row. Maybe unnecessary.
    # Skip the header row. The default keeps an empty sheet from raising
    # StopIteration, which inside a generator surfaces as RuntimeError.
    next(row_iter, None)

    # The message names the column that is actually looked up in the sheet.
    assert (
        concept_column[1] in sheet_header
    ), f"Could not find concept column {concept_column[1]} in your excel sheet {sheet.title}."

    for row in row_iter:
        data = Form({k: clean_cell_value(cell) for k, cell in zip(sheet_header, row)})
        if "value" in implicit:
            # The implicit value is the tab-joined content of the whole row,
            # taken before the concept column is moved.
            data[implicit["value"]] = "\t".join(map(str, data.values()))

        # A row shorter than the header has no concept cell at all. Reading
        # it with a default keeps ``concept_entry`` bound, so the warning
        # below can never report a stale concept from an earlier row.
        concept_entry = data.pop(concept_column[1], None)
        try:
            data[concept_column[0]] = entries_to_concepts[concept_entry]
        except KeyError:
            logger.warning(
                f"Concept {concept_entry} was not found. Please add it to the concepts table manually. The corresponding form was ignored and not added to the dataset."
            )
            continue

        if "id" in implicit:
            # Placeholder only; no ID is generated here.
            data[implicit["id"]] = None
        if "languageReference" in implicit:
            data[implicit["languageReference"]] = sheet.title
        yield data
def handle_form(
    self,
    params,
    row_object: RowObject,
    cell_with_forms,
    this_lan: str,
    status_update: t.Optional[str],
):
    """Associate ``row_object`` with the form described by ``params``.

    The first database candidate accepted by ``self.check_for_match`` is
    reused. If there is none, ``self.on_form_not_found`` decides: either the
    form is created (with the cell's value as its value, an optional status
    note and a unique ID) and then associated, or the miss is logged
    together with near matches found with an edit distance threshold of 4.
    """
    form = Form(params)
    id_column = self.db.dataset["FormTable", "id"].name
    language_column = self.db.dataset["FormTable", "languageReference"].name
    value_column = self.db.dataset["FormTable", "value"].name
    row_id_column = self.db.dataset[row_object.__table__, "id"].name

    def default_form_id():
        # Candidate ID of the shape <language>_<row id>.
        return "{:}_{:}".format(form[language_column], row_object[row_id_column])

    if id_column not in form:
        form[id_column] = default_form_id()

    matches = iter(self.db.find_db_candidates(form, self.check_for_match))
    try:
        # An existing candidate is reused instead of adding a new form.
        match_id = next(matches)
        self.db.associate(match_id, row_object)
    except StopIteration:
        # No candidates: the form is either created or reported as missing.
        if not self.on_form_not_found(form, cell_with_forms):
            logger.error(
                "The missing form was {:} in {:}, given as {:}.".format(
                    row_object[row_id_column], this_lan, form[value_column]))
            # TODO: Fill data with a fuzzy search
            for row in self.db.find_db_candidates(
                    form, self.check_for_match, edit_dist_threshold=4):
                logger.info(f"Did you mean {row} ?")
            return

        form[id_column] = default_form_id()
        form[value_column] = cell_with_forms.value
        if status_update:
            form["Status_Column"] = status_update
        self.db.make_id_unique(form)
        self.db.insert_into_db(form)
        # Read the ID back: making it unique may have changed it.
        self.db.associate(form[id_column], row_object)
def handle_form(
    self,
    params,
    row_object: RowObject,
    cell_with_forms,
    this_lan,
    status_update: t.Optional[str],
):
    """Store a cognate judgement, or link an existing form to ``row_object``.

    If ``params`` carries ``__table__ == "CognateTable"`` it is treated as a
    judgement: it is pointed at the cognate set ``row_object``, given an ID
    of the shape ``<form id>-<cognate set id>`` if it has none, optionally
    tagged with ``status_update``, and inserted into the database.

    Otherwise ``params`` is treated as a form description. A form with an
    explicit ID is associated directly; without one, the first database
    candidate accepted by ``self.check_for_match`` is associated.

    Args:
        params: Judgement-like object or mapping describing a form.
        row_object: The cognate set of the current row.
        cell_with_forms: The cell being processed; passed to
            ``on_form_not_found`` and named in the error message.
        this_lan: Not used in this implementation.
        status_update: If truthy, written to ``"Status_Column"`` of a new
            judgement.

    Raises:
        RuntimeError: If no matching form exists and ``on_form_not_found``
            asks for the form to be created.
    """
    # Only the probe for ``__table__`` may legitimately fail (plain mappings
    # have no such attribute). Guarding just that lookup keeps an
    # AttributeError raised by the database calls below from being silently
    # swallowed and the judgement from being re-processed as a form.
    if getattr(params, "__table__", None) == "CognateTable":
        row_id = row_object[self.db.dataset["CognatesetTable", "id"].name]
        params[self.db.dataset["CognateTable", "cognatesetReference"].name] = row_id
        c_j_id = self.db.dataset["CognateTable", "id"].name
        if c_j_id not in params:
            form_id = params[self.db.dataset["CognateTable", "formReference"].name]
            params[c_j_id] = f"{form_id}-{row_id}"
            self.db.make_id_unique(params)
        # add status update if given
        if status_update:
            params["Status_Column"] = status_update
        self.db.insert_into_db(params)
        return

    # Deal with the more complex case where we are given a form and need
    # to discern what to do with it.
    form = Form(params)
    c_f_id = self.db.dataset["FormTable", "id"].name
    if c_f_id in form:
        self.db.associate(form[c_f_id], row_object)
    else:
        try:
            form_id = next(
                iter(self.db.find_db_candidates(form, self.check_for_match)))
            self.db.associate(form_id, row_object)
        except StopIteration:
            # No candidate at all: let the configured callback decide.
            if self.on_form_not_found(form, cell_with_forms):
                # This has to be an f-string, otherwise the placeholders are
                # shown literally instead of the form and the cell.
                raise RuntimeError(
                    f"I don't know how to add a non-existent form, referenced in a cognateset, to the dataset. This refers to form {form} in cell {cell_with_forms.coordinate}."
                )
def create_formcell(self, form: types.Form, column: int, row: int) -> None:
    """Write one form into the worksheet cell at ``row``/``column``.

    ``form`` is unpacked as a pair of form data and metadata; only the form
    data is used. The cell text comes from ``self.form_to_cell_value``. A
    truthy ``"comment"`` entry is removed from the form data and attached to
    the cell as a comment. If ``self.URL_BASE`` is set, the cell links to
    that template filled with the URL-quoted form ID.
    """
    form_data, _metadata = form

    text = self.form_to_cell_value(form_data)
    target_cell = self.ws.cell(row=row, column=column, value=text)

    # Note that this removes the comment from the form data.
    note = form_data.pop("comment", None)
    if note:
        target_cell.comment = op.comments.Comment(note, __package__)

    if self.URL_BASE:
        target_cell.hyperlink = self.URL_BASE.format(
            urllib.parse.quote(form_data["id"])
        )
def handle_form(
    self,
    params,
    row_object: R,
    cell_with_forms,
    this_lan: str,
    status_update: t.Optional[str],
):
    """Associate ``row_object`` with the form described by ``params``.

    The first database candidate accepted by ``self.check_for_match`` is
    reused. If there is none and ``self.on_form_not_found`` agrees, the form
    is created (with the cell's value as its value, an optional status note
    and a unique ID) and then associated. Otherwise nothing happens.
    """
    form = Form(params)
    form_id_column = self.db.dataset["FormTable", "id"].name
    form_language_column = self.db.dataset["FormTable", "languageReference"].name
    form_value_column = self.db.dataset["FormTable", "value"].name
    row_id_column = self.db.dataset[row_object.__table__, "id"].name

    if form_id_column not in form:
        # Candidate ID of the shape <language>_<row id>.
        form[form_id_column] = "{:}_{:}".format(
            form[form_language_column], row_object[row_id_column])

    existing = iter(self.db.find_db_candidates(form, self.check_for_match))
    try:
        # An existing candidate is reused instead of adding a new form.
        existing_id = next(existing)
        self.db.associate(existing_id, row_object)
    except StopIteration:
        # No candidates: the callback decides whether the form is created.
        wanted = self.on_form_not_found(
            form, cell_identifier=cell_with_forms, language_id=this_lan)
        if not wanted:
            return

        form[form_id_column] = "{:}_{:}".format(
            form[form_language_column], row_object[row_id_column])
        form[form_value_column] = cell_with_forms.value
        if status_update:
            form["Status_Column"] = status_update
        self.db.make_id_unique(form)
        self.db.insert_into_db(form)
        # Read the ID back: making it unique may have changed it.
        self.db.associate(form[form_id_column], row_object)
def form_to_cell_value(self, form: types.Form) -> str:
    """Build a string describing the form itself.

    The result is the best transcription of the form (as chosen by
    ``self.get_best_transcription``), followed by its concept or concepts in
    single typographic quotes and, if the form has a truthy ``"Comment"``,
    by the ``WARNING`` marker.

    The concept reference may be a single concept or a list of concepts;
    several concepts are joined with ``", "``.
    """
    transcription = self.get_best_transcription(form)

    # TODO: Use CLDF terms instead of column names, like the c_ elsewhere
    suffix = f" {WARNING:}" if form.get("Comment") else ""

    # Corresponding concepts: a list-valued reference holds several
    # concepts, anything else is a single concept.
    c_concept = self.dataset["FormTable", "parameterReference"].name
    concepts = form[c_concept]
    translations = list(concepts) if isinstance(concepts, list) else [concepts]

    return "{:} ‘{:}’{:}".format(transcription, ", ".join(translations), suffix)
def form_to_cell_value(self, form: types.Form) -> str:
    """Build a string describing the form itself

    Provide the best transcription and all translations of the form strung
    together.

    If the form has segments, the transcription is built from them: maximal
    runs of segments selected by the form's segment slice are wrapped in
    ``{ ... }`` and separated by spaces, segments outside the slice are
    appended without separator. Without segments, the plain form is used. A
    missing or invalid segment slice selects all segments. A truthy
    ``formComment`` adds the ``WARNING`` marker at the end.

    >>> ds = util.fs.new_wordlist(FormTable=[], CognatesetTable=[], CognateTable=[])
    >>> E = ExcelWriter(dataset=ds)
    >>> E.form_to_cell_value({"form": "f", "parameterReference": "c"})
    'f ‘c’'
    >>> E.form_to_cell_value(
    ...   {"form": "f", "parameterReference": "c", "formComment": "Not empty"})
    'f ‘c’ ⚠'
    >>> E.form_to_cell_value(
    ...   {"form": "fo", "parameterReference": "c", "segments": ["f", "o"]})
    '{ f o } ‘c’'
    >>> E.form_to_cell_value(
    ...   {"form": "fo",
    ...    "parameterReference": "c",
    ...    "segments": ["f", "o"],
    ...    "segmentSlice": ["1:1"]})
    '{ f }o ‘c’'

    TODO: This function should at some point support alignments, so that
    the following call will return '{ - f - }o ‘c’' instead.

    >>> E.form_to_cell_value(
    ...   {"form": "fo",
    ...    "parameterReference": "c",
    ...    "segments": ["f", "o"],
    ...    "segmentSlice": ["1:1"],
    ...    "alignment": ["", "f", ""]})
    '{ f }o ‘c’'

    """
    segments = form.get("segments")
    if not segments:
        # No (or empty) segments: fall back to the plain form string.
        transcription = form["form"]
    else:
        transcription = ""
        # TODO: use CLDF property instead of column name

        # Indices of the segments that belong inside the braces.
        included_segments: t.Iterable[int]
        try:
            included_segments = set(
                parse_segment_slices(form["segmentSlice"], enforce_ordered=True))
        except TypeError:
            # Reported as "there was no segment slice"; presumably raised when
            # the stored slice is None -- verify against parse_segment_slices.
            self.logger.warning(
                "In judgement %s, for form %s, there was no segment slice. I will use the whole form.",
                form["cognateReference"],
                form["id"],
            )
            included_segments = range(len(form["segments"]))
        except KeyError:
            # The form has no segment slice entry at all: silently use all
            # segments.
            included_segments = range(len(form["segments"]))
        except ValueError:
            # What if segments overlap or cross? Overlap shouldn't happen,
            # but we don't check here. Crossing might happen, but this
            # serialization cannot reflect it, so we enforce order,
            # expecting that an error message here will be more useful than
            # silently messing with data. If the check fails, we take the
            # whole segment and warn.
            self.logger.warning(
                "In judgement %s, for form %s, segment slice %s is invalid. I will use the whole form.",
                form["cognateReference"],
                form["id"],
                ",".join(form["segmentSlice"]),
            )
            included_segments = range(len(form["segments"]))

        # ``included`` tracks whether a brace group is currently open.
        included = False
        for i, s in enumerate(segments):
            if included and i not in included_segments:
                # Leaving a selected run: close the group, then the segment.
                transcription += " }" + s
                included = False
            elif not included and i in included_segments:
                # Entering a selected run: open a group.
                transcription += "{ " + s
                included = True
            elif i in included_segments:
                # Continuing a selected run: space-separated segments.
                transcription += " " + s
            else:
                # Outside any selected run: segments are simply concatenated.
                transcription += s
        if included:
            # The last segment was selected, so the group is still open.
            transcription += " }"

        transcription = transcription.strip()

    translations = []
    suffix = ""
    try:
        # A non-empty form comment is only flagged, not shown.
        if form.get("formComment"):
            suffix = f" {WARNING:}"
    except (KeyError):
        pass

    # Corresponding concepts: a list holds multiple concepts, any other
    # value is a single concept.
    if isinstance(form["parameterReference"], list):
        for f in form["parameterReference"]:
            translations.append(f)
    else:
        translations.append(form["parameterReference"])
    return "{:} ‘{:}’{:}".format(transcription, ", ".join(translations), suffix)
def parse_form(
    self,
    form_string: str,
    language_id: str,
    cell_identifier: str = '',
) -> t.Optional[Form]:
    """Turn the description of a single form into a dictionary of columns.

    The string is split into delimiter-separated elements. The opening
    delimiter of an element selects, through ``self.element_semantics``, the
    field it is stored in. A repeated field (other than comment and source,
    which are overwritten) is collected under ``"variants"``, prefixed with
    the variant separator that announced it, if any. Elements that cannot be
    interpreted are logged and ignored. The result is passed through
    ``self.postprocess_form`` before it is returned.

    A string consisting only of whitespace describes no form:

    >>> c = CellParser()
    >>> c.parse_form(" \t", "abui") == None
    True
    """
    if not form_string.strip():
        return None

    # Prefix for log messages. cell_identifier format: sheet.cell_coordinate
    cell_identifier = '{}: '.format(
        cell_identifier) if cell_identifier else ''

    properties: t.Dict[str, t.Any] = {
        "cldf_languageReference": language_id,
        "cldf_value": form_string
    }
    # Fields where a later element replaces an earlier one instead of
    # becoming a variant.
    # TODO: This drops duplicate sources and comments, which is not
    # intended. If we drop the first variant of each of those two fields, we
    # cannot clean that up in post-processing.
    overwritten_fields = ("cldf_comment", "cldf_source")
    # Marker for elements whose start matches none of the known delimiters.
    no_field = object()

    # 'None' while no variant is expected, otherwise the separator (currently
    # '~' or '%') that announced the next element as a variant.
    expect_variant: t.Optional[str] = None

    for element in components_in_brackets(form_string, self.bracket_pairs):
        element = element.strip()
        if not element:
            continue

        # Mismatched brackets tend to show up only in the last element: an
        # unmatched opening bracket keeps waiting for its closing one.
        if not check_brackets(element, self.bracket_pairs):
            logger.warning(
                f"{cell_identifier}In form {form_string}: Element {element} had mismatching delimiters"
            )

        # The first delimiter the element starts with decides its field.
        field = next(
            (
                semantics
                for start, semantics in self.element_semantics.items()
                if element.startswith(start)
            ),
            no_field,
        )
        if field is no_field:
            # Outside delimiters, only variant separators are expected.
            # TODO: Should this be configurable? Where do we document the
            # semantics?
            if self.variant_separator and element in self.variant_separator:
                expect_variant = element
            else:
                logger.warning(
                    f"{cell_identifier}In form {form_string}: Element {element} could not be parsed, ignored"
                )
            continue

        is_variant = field in properties and field not in overwritten_fields
        if is_variant:
            if not expect_variant:
                logger.warning(
                    f"{cell_identifier}In form {form_string}: Element {element} was an unexpected variant for {field}"
                )
            properties.setdefault(
                "variants", []).append((expect_variant or '') + element)
        else:
            if expect_variant:
                logger.warning(
                    f"{cell_identifier}In form {form_string}: Element {element} was supposed to be a variant, but there is no earlier {field}"
                )
            properties[field] = element

        # A separator only announces the element directly following it.
        expect_variant = None

    self.postprocess_form(properties, language_id)
    return Form(properties)
def parse_form(
    self,
    form_string: str,
    language_id: str,
    cell_identifier: str = "",
    logger: cli.logging.Logger = cli.logger,
) -> t.Optional[Form]:
    """Create a dictionary of columns from a form description.

    Extract each value (transcriptions, comments, sources etc.) from a
    string describing a single form.

    The string is split into delimiter-separated elements. The opening
    delimiter of an element selects, through ``self.element_semantics`` and
    ``self.c``, the column it is stored in. An element for a column that is
    already filled is appended to the variants column (or, if there is no
    variants column, the comment column), prefixed with the variant
    separator that announced it, if any. The result is passed through
    ``self.postprocess_form`` before it is returned.

    Args:
        form_string: The description of one form.
        language_id: Stored in the language column of the form.
        cell_identifier: Prefix used in log and error messages.
        logger: Receives warnings about elements that are ignored or
            unexpected.

    Returns:
        The parsed form, or ``None`` if ``form_string`` is only whitespace.

    Raises:
        ValueError: If an element has mismatching delimiters.
    """
    # not required fields
    c_comment = self.c.get("comment")
    c_variants = self.c.get("variants", c_comment)

    # if string is only whitespaces, there is no form.
    if not form_string.strip():
        return None

    properties: t.Dict[str, t.Any] = {
        self.c["lang"]: language_id,
        self.c["value"]: form_string,
    }

    # Semantics: 'None' for no variant expected, any string for the
    # decorator that introduces variant forms. Currently we expect '~' and
    # '%', see below.
    expect_variant: t.Optional[str] = None

    # Iterate over the delimiter-separated elements of the form.
    for element in components_in_brackets(form_string, self.bracket_pairs):
        element = element.strip()
        if not element:
            continue

        # If the element has mismatched brackets (tends to happen only for
        # the last element, because a mismatched opening bracket means we
        # are still waiting for the closing one), give up on the form.
        if not check_brackets(element, self.bracket_pairs):
            try:
                # Report the closing delimiter that was expected ...
                delimiter = self.bracket_pairs[element[0]]
            except KeyError:
                # ... or, for an unknown opener, the first character itself.
                delimiter = element[0]
            raise ValueError(
                f"{cell_identifier}In form {form_string}: Element {element} had mismatching delimiters "
                f"{delimiter}. This could be a bigger problem in the cell, "
                f"so the form was not imported.")

        # Check what kind of element we have. The column is only looked up
        # for the delimiter that actually matches, so a column missing from
        # the dataset cannot break elements that never use it.
        for start, (term, _) in self.element_semantics.items():
            if element.startswith(start):
                field = self.c[term]
                break
        else:
            # TODO: here an other if catchin '-' might be necessary
            # The only thing we expect outside delimiters is the variant
            # separators, '~' and '%'.
            if self.variant_separator and element in self.variant_separator:
                expect_variant = element
            else:
                logger.warning(
                    f"{cell_identifier}In form {form_string}: Element {element} could not be parsed, ignored"
                )
            continue

        # If we encounter a field for the first time, we add it to the
        # dictionary. If repeatedly, to the variants, with a decorator that
        # shows how expected the variant was.
        # This drops sources and comments in variants, if more than one
        # source or comment is provided; clean this up in
        # self.postprocess_form
        if field in properties:
            # Repeated comments and sources are not worth a warning.
            if (not expect_variant
                    and field != c_comment
                    and field != self.c["source"]):
                logger.warning(
                    f"{cell_identifier}In form {form_string}: Element {element} was an unexpected variant for {field}"
                )
            properties.setdefault(
                c_variants, []).append((expect_variant or "") + element)
        else:
            if expect_variant:
                logger.warning(
                    f"{cell_identifier}In form {form_string}: Element {element} was supposed to be a variant, but there is no earlier {field}"
                )
            properties[field] = element

        # A separator only announces the element directly following it.
        expect_variant = None

    self.postprocess_form(properties, language_id)
    return Form(properties)