Beispiel #1
0
def import_data_from_sheet(
    sheet,
    sheet_header,
    language_id: str,
    implicit: t.Mapping[Literal["languageReference", "id", "value"], str] = {},
    concept_column: t.Tuple[str, str] = ("Concept_ID", "Concept_ID"),
) -> t.Iterable[Form]:
    """Yield one form per data row of a worksheet.

    The first row of `sheet` is skipped as a header row; column names are
    taken from `sheet_header` instead and paired positionally with the cells
    of every following row.

    Parameters:
        sheet: Worksheet-like object providing `iter_rows()` and `title`.
        sheet_header: Column names, in the order of the cells of a row.
        language_id: Value stored in the implicit language reference column.
        implicit: Maps "languageReference", "id" and "value" to the names of
            columns that are filled in here instead of being read from the
            sheet. The default mapping is only read, never modified.
        concept_column: Pair (column name in the yielded form, column name in
            the sheet) for the concept column.

    Yields:
        One Form per row, except rows where a cleaned cell value equals "?".
        An empty sheet yields nothing.

    Raises:
        AssertionError: When iteration starts and the sheet's concept column
            name is not contained in `sheet_header`.
    """
    row_iter = sheet.iter_rows()

    # TODO?: compare header of this sheet to format of given dataset process
    # row. Maybe unnecessary.
    # Skip the header row. The default keeps an empty sheet from raising
    # StopIteration, which inside a generator would surface as RuntimeError.
    next(row_iter, None)

    assert (
        concept_column[1] in sheet_header
    ), f"Could not find concept column {concept_column[1]} in your excel sheet {sheet.title}."

    for row in row_iter:
        data = Form({k: clean_cell_value(cell) for k, cell in zip(sheet_header, row)})
        # Rows containing a "?" cell are skipped entirely.
        if "?" in data.values():
            continue
        if "value" in implicit:
            # The implicit value is the tab-joined content of the whole row,
            # computed before any column is renamed or added.
            data[implicit["value"]] = "\t".join(map(str, data.values()))
        # Move the concept from its sheet column name to its form column name.
        concept_entry = data.pop(concept_column[1])
        data[concept_column[0]] = concept_entry
        if "id" in implicit:
            data[implicit["id"]] = None
        if "languageReference" in implicit:
            data[implicit["languageReference"]] = language_id
        yield data
Beispiel #2
0
 def parse(self,
           cell: openpyxl.cell.Cell,
           language_id: str,
           cell_identifier: str = '') -> t.Iterable[Form]:
     """Yield the form referenced by the hyperlink of `cell`.

     The form consists of a single "cldf_id" entry holding the part of the
     hyperlink target after its last "/". If reading the hyperlink target
     raises AttributeError (for example because the cell has no hyperlink),
     nothing is yielded. `language_id` and `cell_identifier` are not used.
     """
     try:
         target = cell.hyperlink.target
         # Everything after the last slash; the whole target if there is none.
         form_id = target.rpartition("/")[2]
         yield Form({"cldf_id": form_id})
     except AttributeError:
         pass
Beispiel #3
0
 def parse_form(self,
                form_string: str,
                language_id: str,
                cell_identifier: str = '') -> t.Optional[Form]:
     """Wrap a raw form string into a Form without further parsing.

     The unmodified string is stored as "cldf_value", its whitespace-stripped
     version as "cldf_form", and `language_id` as "cldf_languageReference".
     `cell_identifier` is not used.
     """
     columns = [
         ("cldf_value", form_string),
         ("cldf_form", form_string.strip()),
         ("cldf_languageReference", language_id),
     ]
     return Form(dict(columns))
Beispiel #4
0
 def parse_form(self,
                form_string: str,
                language_id: str,
                cell_identifier: str = "") -> t.Optional[Form]:
     """Wrap a raw form string into a Form without further parsing.

     The column names are looked up in `self.c`: the unmodified string goes
     into the "value" column, its whitespace-stripped version into the "form"
     column, and `language_id` into the "lang" column. `cell_identifier` is
     not used.
     """
     names = self.c
     columns = [
         (names["value"], form_string),
         (names["form"], form_string.strip()),
         (names["lang"], language_id),
     ]
     return Form(dict(columns))
Beispiel #5
0
    def handle_form(
        self,
        params,
        row_object: CogSet,
        cell_with_forms,
        this_lan,
        status_update: t.Optional[str],
    ):
        """Store a cognate judgement, or link an existing form to a cognate set.

        If `params` has a `__table__` attribute equal to "CognateTable", it is
        completed (cognate set reference, an ID of the shape
        "<form id>-<cognate set id>" when none is given, optional status) and
        inserted into the database.

        Otherwise `params` is treated as a form description: the form named by
        its ID column, or else the first matching database candidate, is
        associated with `row_object`. If there is no candidate and
        `self.on_form_not_found` returns a true value, NotImplementedError is
        raised, because creating forms is not supported here.

        Any AttributeError raised while handling the judgement case is
        swallowed, and processing continues with the form case.
        """
        try:
            is_judgement = params.__table__ == "CognateTable"
            if is_judgement:
                cogset_id = row_object[
                    self.db.dataset["CognatesetTable", "id"].name]
                reference_column = self.db.dataset[
                    "CognateTable", "cognatesetReference"].name
                params[reference_column] = cogset_id
                judgement_id_column = self.db.dataset["CognateTable",
                                                      "id"].name
                if judgement_id_column not in params:
                    # Derive the judgement ID from the form and cognate set.
                    form_column = self.db.dataset["CognateTable",
                                                  "formReference"].name
                    referenced_form = params[form_column]
                    params[judgement_id_column] = (
                        f"{referenced_form}-{cogset_id}")
                    self.db.make_id_unique(params)
                # Record the status update, if one was given.
                if status_update:
                    params["Status_Column"] = status_update
                self.db.insert_into_db(params)
                return
        except AttributeError:
            pass

        # From here on we have a form description and must work out which
        # existing form it refers to.
        form = Form(params)
        form_id_column = self.db.dataset["FormTable", "id"].name

        if form_id_column in form:
            self.db.associate(form[form_id_column], row_object)
            return

        try:
            candidates = iter(
                self.db.find_db_candidates(form, self.check_for_match))
            match = next(candidates)
            self.db.associate(match, row_object)
        except StopIteration:
            wants_creation = self.on_form_not_found(
                form,
                cell_identifier=cell_with_forms.coordinate,
                language_id=this_lan,
            )
            if wants_creation:
                raise NotImplementedError(
                    "Creating a form is not supported in CognateExcelParser"
                )
Beispiel #6
0
def import_data_from_sheet(
    sheet,
    sheet_header,
    implicit: t.Mapping[Literal["languageReference", "id", "value"], str] = {},
    entries_to_concepts: t.Mapping[str, str] = KeyKeyDict(),
    concept_column: t.Tuple[str, str] = ("Concept_ID", "Concept_ID"),
) -> t.Iterable[Form]:
    """Yield one form per data row of a worksheet.

    The first row of `sheet` is skipped as a header row; column names are
    taken from `sheet_header` instead and paired positionally with the cells
    of every following row.

    Parameters:
        sheet: Worksheet-like object providing `iter_rows()` and `title`.
            The title is used as the value of the implicit language
            reference column.
        sheet_header: Column names, in the order of the cells of a row.
        implicit: Maps "languageReference", "id" and "value" to the names of
            columns that are filled in here instead of being read from the
            sheet. The default mapping is only read, never modified.
        entries_to_concepts: Maps the content of the sheet's concept column
            to the concept stored in the yielded form.
        concept_column: Pair (column name in the yielded form, column name in
            the sheet) for the concept column.

    Yields:
        One Form per row whose concept entry is found in
        `entries_to_concepts`; other rows are skipped with a warning. An
        empty sheet yields nothing.

    Raises:
        AssertionError: When iteration starts and the sheet's concept column
            name is not contained in `sheet_header`.
    """
    row_iter = sheet.iter_rows()

    # TODO?: compare header of this sheet to format of given data set process
    # row. Maybe unnecessary.
    # Skip the header row. The default keeps an empty sheet from raising
    # StopIteration, which inside a generator would surface as RuntimeError.
    next(row_iter, None)

    # The message names the same column the check looks for: the column as it
    # is called in the sheet.
    assert (
        concept_column[1] in sheet_header
    ), f"Could not find concept column {concept_column[1]} in your excel sheet {sheet.title}."

    for row in row_iter:
        data = Form({k: clean_cell_value(cell) for k, cell in zip(sheet_header, row)})
        if "value" in implicit:
            # The implicit value is the tab-joined content of the whole row,
            # computed before any column is renamed or added.
            data[implicit["value"]] = "\t".join(map(str, data.values()))
        try:
            concept_entry = data.pop(concept_column[1])
            data[concept_column[0]] = entries_to_concepts[concept_entry]
        except KeyError:
            logger.warning(
                f"Concept {concept_entry} was not found. Please add it to the concepts table manually. The corresponding form was ignored and not added to the dataset."
            )
            # The row is dropped, so there is nothing left to fill in.
            continue
        if "id" in implicit:
            data[implicit["id"]] = None
        if "languageReference" in implicit:
            data[implicit["languageReference"]] = sheet.title
        yield data
Beispiel #7
0
    def handle_form(
        self,
        params,
        row_object: RowObject,
        cell_with_forms,
        this_lan: str,
        status_update: t.Optional[str],
    ):
        """Associate a form with `row_object`, creating the form if allowed.

        A Form is built from `params`; if it has no ID, a candidate ID of the
        shape "<language>_<row id>" is filled in. The first database candidate
        matching the form is associated with `row_object`. If there is none,
        `self.on_form_not_found` decides: on a true result the form gets its
        ID, the cell's value and the optional status, is inserted into the
        database and associated; otherwise an error is logged together with
        "Did you mean" suggestions from a search with an edit distance
        threshold of 4.
        """
        form = Form(params)
        id_column = self.db.dataset["FormTable", "id"].name
        language_column = self.db.dataset["FormTable",
                                          "languageReference"].name
        value_column = self.db.dataset["FormTable", "value"].name
        row_id_column = self.db.dataset[row_object.__table__, "id"].name

        def default_form_id():
            # Evaluated at call time, so it always reflects the current form.
            return "{:}_{:}".format(form[language_column],
                                    row_object[row_id_column])

        if id_column not in form:
            # Provide a candidate ID for the form.
            form[id_column] = default_form_id()
        matches = iter(self.db.find_db_candidates(form, self.check_for_match))
        try:
            # An existing matching form is reused instead of adding a new one.
            existing_id = next(matches)
            self.db.associate(existing_id, row_object)
        except StopIteration:
            # Nothing matched: either report the missing form or create it.
            if not self.on_form_not_found(form, cell_with_forms):
                logger.error(
                    "The missing form was {:} in {:}, given as {:}.".format(
                        row_object[row_id_column], this_lan,
                        form[value_column]))
                # TODO: Fill data with a fuzzy search
                suggestions = self.db.find_db_candidates(
                    form, self.check_for_match, edit_dist_threshold=4)
                for row in suggestions:
                    logger.info(f"Did you mean {row} ?")
                return

            form[id_column] = default_form_id()
            form[value_column] = cell_with_forms.value
            # Record the status update, if one was given.
            if status_update:
                form["Status_Column"] = status_update
            self.db.make_id_unique(form)
            self.db.insert_into_db(form)
            # The ID is read after insertion, as make_id_unique may change it.
            self.db.associate(form[id_column], row_object)
Beispiel #8
0
    def handle_form(
        self,
        params,
        row_object: RowObject,
        cell_with_forms,
        this_lan,
        status_update: t.Optional[str],
    ):
        """Store a cognate judgement, or link an existing form to a cognate set.

        If `params` has a `__table__` attribute equal to "CognateTable", it is
        completed (cognate set reference, an ID of the shape
        "<form id>-<cognate set id>" when none is given, optional status) and
        inserted into the database.

        Otherwise `params` is treated as a form description: the form named by
        its ID column, or else the first matching database candidate, is
        associated with `row_object`.

        Any AttributeError raised while handling the judgement case is
        swallowed, and processing continues with the form case.

        Raises:
            RuntimeError: If no existing form matches and
                `self.on_form_not_found` returns a true value; adding new
                forms is not supported here. The message names the form and
                the coordinate of `cell_with_forms`.
        """
        try:
            if params.__table__ == "CognateTable":
                row_id = row_object[self.db.dataset["CognatesetTable",
                                                    "id"].name]
                params[self.db.dataset["CognateTable",
                                       "cognatesetReference"].name] = row_id
                c_j_id = self.db.dataset["CognateTable", "id"].name
                if c_j_id not in params:
                    # Derive the judgement ID from the form and cognate set.
                    form_id = params[self.db.dataset["CognateTable",
                                                     "formReference"].name]
                    params[c_j_id] = f"{form_id}-{row_id}"
                    self.db.make_id_unique(params)
                # add status update if given
                if status_update:
                    params["Status_Column"] = status_update
                self.db.insert_into_db(params)
                return
        except AttributeError:
            pass

        # Deal with the more complex case where we are given a form and need
        # to discern what to do with it.
        form = Form(params)
        c_f_id = self.db.dataset["FormTable", "id"].name

        if c_f_id in form:
            self.db.associate(form[c_f_id], row_object)
        else:
            try:
                form_id = next(
                    iter(self.db.find_db_candidates(form,
                                                    self.check_for_match)))
                self.db.associate(form_id, row_object)
            except StopIteration:
                if self.on_form_not_found(form, cell_with_forms):
                    # This must be an f-string, otherwise the placeholders
                    # are emitted literally instead of naming form and cell.
                    raise RuntimeError(
                        f"I don't know how to add a non-existent form, referenced in a cognateset, to the dataset. This refers to form {form} in cell {cell_with_forms.coordinate}."
                    )
Beispiel #9
0
    def create_formcell(self, form: types.Form, column: int, row: int) -> None:
        """Write one form into the worksheet cell at the given position.

        `form` is unpacked as a pair; only its first element, the form data,
        is used here. The cell of `self.ws` at (`row`, `column`) receives the
        text built by `self.form_to_cell_value`. A non-empty "comment" entry
        is removed from the form data and attached to the cell as a comment.
        If `self.URL_BASE` is set, the cell is hyperlinked to that template
        filled with the URL-quoted form "id".

        """
        form_data, _metadata = form
        text = self.form_to_cell_value(form_data)
        form_cell = self.ws.cell(row=row, column=column, value=text)

        # Note that this removes the comment from the form data itself.
        note = form_data.pop("comment", None)
        if note:
            form_cell.comment = op.comments.Comment(note, __package__)

        if not self.URL_BASE:
            return
        quoted_id = urllib.parse.quote(form_data["id"])
        form_cell.hyperlink = self.URL_BASE.format(quoted_id)
Beispiel #10
0
    def handle_form(
        self,
        params,
        row_object: R,
        cell_with_forms,
        this_lan: str,
        status_update: t.Optional[str],
    ):
        """Associate a form with `row_object`, creating the form if allowed.

        A Form is built from `params`; if it has no ID, a candidate ID of the
        shape "<language>_<row id>" is filled in. The first database candidate
        matching the form is associated with `row_object`. If there is none
        and `self.on_form_not_found` returns a true value, the form gets its
        ID, the cell's value and the optional status, is inserted into the
        database and associated. Otherwise nothing further happens.
        """
        form = Form(params)
        id_column = self.db.dataset["FormTable", "id"].name
        language_column = self.db.dataset["FormTable",
                                          "languageReference"].name
        value_column = self.db.dataset["FormTable", "value"].name
        row_id_column = self.db.dataset[row_object.__table__, "id"].name

        def default_form_id():
            # Evaluated at call time, so it always reflects the current form.
            return "{:}_{:}".format(form[language_column],
                                    row_object[row_id_column])

        if id_column not in form:
            # Provide a candidate ID for the form.
            form[id_column] = default_form_id()
        matches = iter(self.db.find_db_candidates(form, self.check_for_match))
        try:
            # An existing matching form is reused instead of adding a new one.
            existing_id = next(matches)
            self.db.associate(existing_id, row_object)
        except StopIteration:
            # Nothing matched: the callback decides whether to create it.
            should_create = self.on_form_not_found(
                form, cell_identifier=cell_with_forms, language_id=this_lan)
            if should_create:
                form[id_column] = default_form_id()
                form[value_column] = cell_with_forms.value
                # Record the status update, if one was given.
                if status_update:
                    form["Status_Column"] = status_update
                self.db.make_id_unique(form)
                self.db.insert_into_db(form)
                # Read the ID after insertion; make_id_unique may change it.
                self.db.associate(form[id_column], row_object)
Beispiel #11
0
    def form_to_cell_value(self, form: types.Form) -> str:
        """Describe a form as the text of a spreadsheet cell.

        The result is the best transcription of the form followed by its
        concept in single typographic quotes. If the form has a non-empty
        "Comment" entry, a space and the WARNING marker are appended.

        """
        transcription = self.get_best_transcription(form)

        # TODO: Use CLDF terms instead of column names, like the c_ elsewhere
        marker = f" {WARNING:}" if form.get("Comment") else ""

        # corresponding concepts – TODO: distinguish between list data type
        # (multiple concepts) and others (single concept)
        concept_column = self.dataset["FormTable", "parameterReference"].name
        concepts = ", ".join([form[concept_column]])

        return "{:} ‘{:}’{:}".format(transcription, concepts, marker)
Beispiel #12
0
    def form_to_cell_value(self, form: types.Form) -> str:
        """Build a string describing the form itself

        Provide the best transcription and all translations of the form strung
        together.

        >>> ds = util.fs.new_wordlist(FormTable=[], CognatesetTable=[], CognateTable=[])
        >>> E = ExcelWriter(dataset=ds)
        >>> E.form_to_cell_value({"form": "f", "parameterReference": "c"})
        'f ‘c’'
        >>> E.form_to_cell_value(
        ...   {"form": "f", "parameterReference": "c", "formComment": "Not empty"})
        'f ‘c’ ⚠'
        >>> E.form_to_cell_value(
        ...   {"form": "fo", "parameterReference": "c", "segments": ["f", "o"]})
        '{ f o } ‘c’'
        >>> E.form_to_cell_value(
        ...   {"form": "fo",
        ...    "parameterReference": "c",
        ...    "segments": ["f", "o"],
        ...    "segmentSlice": ["1:1"]})
        '{ f }o ‘c’'

        TODO: This function should at some point support alignments, so that
        the following call will return '{ - f - }o ‘c’' instead.

        >>> E.form_to_cell_value(
        ...   {"form": "fo",
        ...    "parameterReference": "c",
        ...    "segments": ["f", "o"],
        ...    "segmentSlice": ["1:1"],
        ...    "alignment": ["", "f", ""]})
        '{ f }o ‘c’'

        Without segments, the "form" entry is used as the transcription.
        With segments, the segments selected by "segmentSlice" are wrapped in
        braces. If the slice is missing, unusable or invalid, the whole form
        is selected; in the latter two cases a warning is logged. A non-empty
        "formComment" appends the WARNING marker. "parameterReference" may be
        a single concept or a list of concepts.

        """
        segments = form.get("segments")
        if not segments:
            transcription = form["form"]
        else:
            # TODO: use CLDF property instead of column name
            included_segments: t.Iterable[int]
            try:
                included_segments = set(
                    parse_segment_slices(form["segmentSlice"],
                                         enforce_ordered=True))
            except TypeError:
                # The identifying entries are read with .get, so that a form
                # lacking them cannot raise KeyError from inside the handler.
                self.logger.warning(
                    "In judgement %s, for form %s, there was no segment slice. I will use the whole form.",
                    form.get("cognateReference"),
                    form.get("id"),
                )
                included_segments = range(len(segments))
            except KeyError:
                included_segments = range(len(segments))
            except ValueError:
                # What if segments overlap or cross? Overlap shouldn't happen,
                # but we don't check here. Crossing might happen, but this
                # serialization cannot reflect it, so we enforce order,
                # expecting that an error message here will be more useful than
                # silently messing with data. If the check fails, we take the
                # whole segment and warn.
                self.logger.warning(
                    "In judgement %s, for form %s, segment slice %s is invalid. I will use the whole form.",
                    form.get("cognateReference"),
                    form.get("id"),
                    ",".join(form["segmentSlice"]),
                )
                included_segments = range(len(segments))

            # Open a brace when entering a selected run of segments and close
            # it when leaving; selected segments are separated by spaces.
            pieces = []
            included = False
            for i, s in enumerate(segments):
                inside = i in included_segments
                if included and not inside:
                    pieces.append(" }" + s)
                elif not included and inside:
                    pieces.append("{ " + s)
                elif inside:
                    pieces.append(" " + s)
                else:
                    pieces.append(s)
                included = inside
            if included:
                pieces.append(" }")

            transcription = "".join(pieces).strip()

        suffix = f" {WARNING:}" if form.get("formComment") else ""

        # corresponding concepts: either a list (multiple concepts) or a
        # single concept
        concepts = form["parameterReference"]
        translations = list(concepts) if isinstance(concepts, list) else [concepts]
        return "{:} ‘{:}’{:}".format(transcription, ", ".join(translations),
                                     suffix)
Beispiel #13
0
    def parse_form(
        self,
        form_string: str,
        language_id: str,
        cell_identifier: str = '',
    ) -> t.Optional[Form]:
        """Create a dictionary of columns from a form description.

        Split the description of a single form into its bracketed elements
        and sort each of them (transcriptions, comments, sources etc.) into
        the field announced by its opening delimiter. Return None if the
        description contains nothing but whitespace.

        >>> c = CellParser()
        >>> c.parse_form(" \t", "abui") == None
        True

        """
        # A description made of whitespace only contains no form.
        if not form_string.strip():
            return None
        # Prefix for log messages; cell_identifier format:
        # sheet.cell_coordinate
        cell_identifier = f"{cell_identifier}: " if cell_identifier else ''

        properties: t.Dict[str, t.Any] = {}
        properties["cldf_languageReference"] = language_id
        properties["cldf_value"] = form_string

        # Marks an element found by no delimiter lookup.
        unrecognized = object()

        # Semantics: None while no variant is expected, otherwise the
        # separator ('~' or '%', see below) that announced the next element
        # as a variant.
        pending_separator: t.Optional[str] = None
        for raw_element in components_in_brackets(form_string,
                                                  self.bracket_pairs):
            element = raw_element.strip()
            if not element:
                continue

            # Mismatched brackets tend to occur only in the last element,
            # because an unmatched opening bracket keeps waiting for its
            # closing partner. Warn about them.
            if not check_brackets(element, self.bracket_pairs):
                logger.warning(
                    f"{cell_identifier}In form {form_string}: Element {element} had mismatching delimiters"
                )

            # Find the field announced by the element's opening delimiter.
            field = next(
                (name for start, name in self.element_semantics.items()
                 if element.startswith(start)),
                unrecognized,
            )
            if field is unrecognized:
                # Outside delimiters we only expect the variant separators,
                # '~' and '%'.
                if self.variant_separator and element in self.variant_separator:
                    # TODO: Should this be configurable? Where do we document
                    # the semantics?
                    pending_separator = element
                else:
                    logger.warning(
                        f"{cell_identifier}In form {form_string}: Element {element} could not be parsed, ignored"
                    )
                continue

            # The first element of a field is stored under that field. Later
            # ones go to the variants, prefixed with the separator that
            # announced them (if any).

            # TODO: This drops duplicate sources and comments, which is not
            # intended. If we drop the first variant of each of those two
            # fields, we cannot clean that up in post-processing. Maybe the
            # intention was to assume that for comments and sources, we always
            # expect variants, so it should be an `or` for the inner if?
            is_repetition = (field in properties
                             and field not in ("cldf_comment", "cldf_source"))
            if is_repetition:
                if not pending_separator:
                    logger.warning(
                        f"{cell_identifier}In form {form_string}: Element {element} was an unexpected variant for {field}"
                    )
                variants = properties.setdefault("variants", [])
                variants.append((pending_separator or '') + element)
            else:
                if pending_separator:
                    logger.warning(
                        f"{cell_identifier}In form {form_string}: Element {element} was supposed to be a variant, but there is no earlier {field}"
                    )
                properties[field] = element

            pending_separator = None

        self.postprocess_form(properties, language_id)
        return Form(properties)
Beispiel #14
0
    def parse_form(
        self,
        form_string: str,
        language_id: str,
        cell_identifier: str = "",
        logger: cli.logging.Logger = cli.logger,
    ) -> t.Optional[Form]:
        """Create a dictionary of columns from a form description.

        Split the description of a single form into its bracketed elements
        and sort each of them (transcriptions, comments, sources etc.) into
        the column announced by its opening delimiter. Column names are
        looked up in `self.c`. Return None if the description contains
        nothing but whitespace.

        Raises:
            ValueError: If an element has mismatching delimiters.
        """
        # These columns are optional; variants fall back to the comment
        # column.
        c_comment = self.c.get("comment")
        c_variants = self.c.get("variants", c_comment)

        # A description made of whitespace only contains no form.
        if not form_string.strip():
            return None

        properties: t.Dict[str, t.Any] = {}
        properties[self.c["lang"]] = language_id
        properties[self.c["value"]] = form_string

        # Semantics: None while no variant is expected, otherwise the
        # separator ('~' or '%', see below) that announced the next element
        # as a variant.
        pending_separator: t.Optional[str] = None
        for raw_element in components_in_brackets(form_string,
                                                  self.bracket_pairs):
            element = raw_element.strip()
            if not element:
                continue

            # Mismatched brackets tend to occur only in the last element,
            # because an unmatched opening bracket keeps waiting for its
            # closing partner. They abort the import of this form.
            if not check_brackets(element, self.bracket_pairs):
                opener = element[0]
                try:
                    delimiter = self.bracket_pairs[opener]
                except KeyError:
                    delimiter = opener
                raise ValueError(
                    f"{cell_identifier}In form {form_string}: Element {element} had mismatching delimiters "
                    f"{delimiter}. This could be a bigger problem in the cell, "
                    f"so the form was not imported.")

            # Find the column announced by the element's opening delimiter.
            for start, (term, _transcription) in self.element_semantics.items():
                field = self.c[term]
                if element.startswith(start):
                    break
            else:
                # TODO: here an other if catchin '-' might be necessary
                # Outside delimiters we only expect the variant separators,
                # '~' and '%'.
                if self.variant_separator and element in self.variant_separator:
                    pending_separator = element
                else:
                    logger.warning(
                        f"{cell_identifier}In form {form_string}: Element {element} could not be parsed, ignored"
                    )
                continue

            # The first element of a column is stored under that column.
            # Later ones go to the variants, prefixed with the separator that
            # announced them (if any). This drops sources and comments in
            # variants if more than one source or comment is provided; clean
            # this up in self.postprocess_form.
            if field not in properties:
                if pending_separator:
                    logger.warning(
                        f"{cell_identifier}In form {form_string}: Element {element} was supposed to be a variant, but there is no earlier {field}"
                    )
                properties[field] = element
            else:
                # Repeated comments and sources are expected, so only other
                # unannounced repetitions are reported.
                if not (pending_separator or field == c_comment
                        or field == self.c["source"]):
                    logger.warning(
                        f"{cell_identifier}In form {form_string}: Element {element} was an unexpected variant for {field}"
                    )
                variants = properties.setdefault(c_variants, [])
                variants.append((pending_separator or "") + element)

            pending_separator = None

        self.postprocess_form(properties, language_id)
        return Form(properties)