Esempio n. 1
0
def merge_group(
    forms: t.Sequence[types.Form],
    target: types.Form,
    mergers: t.Mapping[str, Merger],
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    logger: cli.logging.Logger = cli.logger,
) -> types.Form:
    """Fold a group of homophonous forms into `target`, column by column.

    Every column of `target` other than the FormTable ID column is replaced
    by the result of a merge function applied to that column's values in
    all `forms`. The merge function is looked up in `mergers` by column
    name first, then by the column's CLDF property name, and defaults to
    `must_be_equal`. `target` is modified in place and also returned.

    >>> merge_group(
    ...   [{"Parameter_ID": [1, 1]}, {"Parameter_ID": [2]}],
    ...   {"Parameter_ID": [1, 1]}, {"Parameter_ID": union}, util.fs.new_wordlist())
    {'Parameter_ID': [1, 2]}

    The target is assumed to be already included in the forms.

    >>> merge_group(
    ...   [{"Parameter_ID": [1, 1]}, {"Parameter_ID": [2]}],
    ...   {"Parameter_ID": [1, 1]}, {"Parameter_ID": concatenate}, util.fs.new_wordlist())
    {'Parameter_ID': [1, 1, 2]}

    Raises `Skip` (after logging an error) when the merge function raises
    an AssertionError. A TypeError from the merge function is reported
    through `cli.Exit.INVALID_INPUT`, and a KeyError anywhere in the
    per-column processing through `cli.Exit.INVALID_COLUMN_NAME`.

    """
    id_column = dataset["FormTable", "id"].name
    for column in target:
        # The ID of the target is never merged.
        if column == id_column:
            continue
        try:
            property_name = util.cldf_property(
                dataset["FormTable", column].propertyUrl)
            if not property_name:
                property_name = column
            # A merger registered under the column name takes precedence
            # over one registered under the CLDF property name.
            fallback = mergers.get(property_name, must_be_equal)
            merge = mergers.get(column, fallback)
            try:
                merged = merge([form[column] for form in forms], target)
            except AssertionError:
                # Only this group is affected; the caller may continue
                # with other groups after catching Skip.
                merge_name = merge.__name__
                form_ids = [form[id_column] for form in forms]
                values = [form[column] for form in forms]
                logger.error(
                    f"Merging forms: {form_ids} with target: {target[id_column]} on column: {column}\n"
                    f"The merge function {merge_name} requires the input data to be equal. \n"
                    f"Given input: {values}")
                raise Skip
            except TypeError:
                # Every other group would run into the same problem.
                merge_name = merge.__name__
                form_ids = [form[id_column] for form in forms]
                values = [form[column] for form in forms]
                cli.Exit.INVALID_INPUT(
                    f"Merging forms: {form_ids} with target: {target[id_column]} \n"
                    f"The merge function {merge_name} is not implemented for type {type(forms[0])}. \n"
                    f"Given input: {values}")
            target[column] = merged
        except KeyError:
            cli.Exit.INVALID_COLUMN_NAME(
                f"Column {column} is not in FormTable.")
    return target
Esempio n. 2
0
def merge_group(
    cogsets: t.Sequence[types.CogSet],
    target: types.CogSet,
    mergers: t.Mapping[str, Merger],
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    logger: cli.logging.Logger = cli.logger,
) -> types.CogSet:
    """Merge one group of cognate sets

    Every column of `target` other than the CognatesetTable ID column is
    replaced by the result of a merge function applied to that column's
    values in all `cogsets`. The merge function is looked up in `mergers`
    by column name first, then by the column's CLDF property name, and
    defaults to `must_be_equal`. `target` is modified in place and also
    returned.

    The target is assumed to be already included in the cognate sets.

    Raises `Skip` (after logging an error) when the merge function raises
    an AssertionError. A NotImplementedError from the merge function is
    reported through `cli.Exit.INVALID_INPUT`, and a KeyError anywhere in
    the per-column processing through `cli.Exit.INVALID_COLUMN_NAME`.

    """
    c_s_id = dataset["CognatesetTable", "id"].name
    for column in target:
        # The ID of the target is never merged.
        if column == c_s_id:
            continue
        try:
            reference_name = (util.cldf_property(
                dataset["CognatesetTable", column].propertyUrl) or column)
            # A merger registered under the column name takes precedence
            # over one registered under the CLDF property name.
            merger = mergers.get(column,
                                 mergers.get(reference_name, must_be_equal))
            try:
                merge_result = merger([cogset[column] for cogset in cogsets],
                                      target)
            except AssertionError:
                merger_name = merger.__name__
                # We cannot deal with this block, but others may be fine.
                logger.error(
                    f"Merging cognate sets: {[f[c_s_id] for f in cogsets]} with target: {target[c_s_id]} on column: {column}\n"
                    f"The merge function {merger_name} requires the input data to be equal. \n"
                    f"Given input: {[cogset[column] for cogset in cogsets]}")
                raise Skip
            except NotImplementedError:
                merger_name = merger.__name__
                # Other groups will have the same issue.
                cli.Exit.INVALID_INPUT(
                    f"Merging cognate sets: {[f[c_s_id] for f in cogsets]} with target: {target[c_s_id]} \n"
                    f"The merge function {merger_name} is not implemented for type {type(cogsets[0])}. \n"
                    f"Given input: {[cogset[column] for cogset in cogsets]}")
            target[column] = merge_result
        except KeyError:
            cli.Exit.INVALID_COLUMN_NAME(
                f"Column {column} is not in CognatesetTable.")
    return target
Esempio n. 3
0
    def parse(
        self,
        cell: op.cell.Cell,
        language_id: str,
        cell_identifier: str = "",
        logger: cli.logging.Logger = cli.logger,
    ) -> t.Iterable[Judgement]:
        """Yield the judgement described by a hyperlinked spreadsheet cell.

        The ID is extracted from the cell's hyperlink target with
        `self.extractor`; the cell text is interpreted as an alignment,
        wrapped in braces first if it contains no opening brace. The
        resulting properties are keyed by the column names configured in
        `self.c`; optional columns that are not configured are left out.

        If the ID cannot be extracted, an error is logged and
        `cli.Exit.INVALID_ID` is invoked. Any AttributeError raised while
        processing the cell (for instance when `cell.hyperlink` is None)
        is swallowed, and nothing is yielded.

        `language_id` and `cell_identifier` are not used by this method.
        """
        try:
            link_target = cell.hyperlink.target
            content = clean_cell_value(cell)
            note = get_cell_comment(cell)
            # Text without any brace is treated as one single braced group.
            braced = content if "{" in content else "{" + content + "}"
            segment_ranges, aligned = alignment_from_braces(braced)
            try:
                form_id = self.extractor.search(link_target)["ID"]
            except (TypeError, IndexError):
                logger.error(
                    f"Could not extract group ID from URL {link_target} using regular expression {self.extractor.pattern}"
                )
                cli.Exit.INVALID_ID()
            segment_labels = [
                "{:}:{:}".format(start, end) for start, end in segment_ranges
            ]
            fields = {
                self.c["c_id"]: form_id,
                self.c.get("c_segments"): segment_labels,
                self.c.get("c_alignment"): aligned,
                self.c.get("c_comment"): note,
            }
            # Unconfigured optional columns all collapsed onto the key None.
            fields.pop(None, None)
            yield Judgement(fields)

        except AttributeError:
            pass
Esempio n. 4
0
def add_concepticon_definitions(
    dataset: pycldf.Dataset,
    column_name: str = "Concepticon_Definition",
    logger: cli.logging.Logger = cli.logger,
) -> None:
    """Store Concepticon definitions in a column of the concepts table.

    For each row of the ParameterTable, the value of its
    #concepticonReference column is looked up in
    `concepticon.api.conceptsets`, and the definition found there is
    written to `column_name`. Rows whose lookup raises a KeyError are
    written back unchanged. The column is added to the table (and the
    metadata written) if it does not exist yet.

    If the ParameterTable has no #concepticonReference column, an error is
    logged and the dataset is left untouched.
    """
    reference_column = dataset.column_names.parameters.concepticonReference
    if reference_column is None:
        logger.error(
            "Your concepts table has no #concepticonReference column, so I cannot add any definitions from Concepticon to it. Try running lexedata.edit.add_concepticon to have me guess those references."
        )
        return

    # Make sure the target column exists, creating it when necessary.
    try:
        dataset["ParameterTable", column_name]
    except KeyError:
        dataset.add_columns("ParameterTable", column_name)
        dataset.write_metadata()
    else:
        notice = "Overwriting existing {:} column in concepts table".format(
            column_name)
        logger.info(notice)

    # Collect every concept, with its definition filled in where available.
    updated_concepts = []
    concepts = cli.tq(
        dataset["ParameterTable"],
        task="Write concepts with concepticon definitions to dataset",
    )
    for concept in concepts:
        try:
            conceptset = concepticon.api.conceptsets[concept[reference_column]]
            concept[column_name] = conceptset.definition
        except KeyError:
            # No usable Concepticon reference: keep the row as it is.
            pass
        updated_concepts.append(concept)

    dataset.write(ParameterTable=updated_concepts)
Esempio n. 5
0
def treat_brackets(
    table: t.Iterable[R],
    form_column_name: str = "form",
    variants_column_name: str = "variants",
    comment_column_name: str = "comment",
    bracket_pairs: t.Iterable[t.Tuple[str, str]] = (("(", ")"),),
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterator[R]:
    """Make sure forms contain no brackets.

    For every row, each pair in `bracket_pairs` that occurs in the form is
    resolved with `unbracket_single_form`; the variants and comments it
    returns are appended to the row's variants and (joined with "; ") to
    its comment. A new row is yielded; the input row is not modified.

    >>> for row in treat_brackets([
    ...   {'F': 'a(m)ba', 'V': [], 'C': ''},
    ...   {'F': 'da (dialectal)', 'V': [], 'C': ''},
    ...   {'F': 'tu(m) (informal)', 'V': [], 'C': '2p'}],
    ...   "F", "V", "C"):
    ...   print(row)
    {'F': 'amba', 'V': ['aba'], 'C': ''}
    {'F': 'da', 'V': [], 'C': '(dialectal)'}
    {'F': 'tum', 'V': ['tu'], 'C': '2p; (informal)'}


    Skipping works even when it is noticed only late in the process.

    >>> for row in treat_brackets([
    ...   {'F': 'a[m]ba (unbalanced', 'V': [], 'C': ''},
    ...   {'F': 'tu(m) (informal', 'V': [], 'C': ''}],
    ...   "F", "V", "C", [("[", "]"), ("(", ")")]):
    ...   print(row)
    {'F': 'a[m]ba (unbalanced', 'V': [], 'C': ''}
    {'F': 'tu(m) (informal', 'V': [], 'C': ''}

    If processing a row raises `Skip`, an error is logged and the original
    row is yielded unchanged.

    """
    for r, row in enumerate(table):
        # Work on local copies only, so that a Skip raised for a later
        # bracket pair leaves the original row completely untouched.
        form = row[form_column_name]
        variants = row[variants_column_name][:]
        comment = [row[comment_column_name]
                   ] if row[comment_column_name] else []
        try:
            for opening_b, closing_b in bracket_pairs:
                if opening_b not in form and closing_b not in form:
                    continue

                form, new_variants, new_comments = unbracket_single_form(
                    form, opening_b, closing_b)
                variants.extend(new_variants)
                comment.extend(new_comments)
            # Every bracket pair succeeded: assemble a new row from the
            # local copies instead of updating the input row in place.
            yield {
                **row,
                form_column_name: form,
                variants_column_name: variants,
                comment_column_name: "; ".join(comment),
            }
        except Skip as e:
            # Relies on Skip (defined elsewhere) carrying a `message`
            # attribute that describes the problem.
            logger.error(
                "Line %d: Form '%s' has %s. I did not modify the row.",
                r,
                row[form_column_name],
                e.message,
            )
            yield row