Example #1
    def source_from_source_string(
        self,
        source_string: str,
        language_id: t.Optional[str],
        logger: cli.logging.Logger = cli.logger,
    ) -> str:
        """Parse a string referencing a language-specific source"""
        context: t.Optional[str]
        if ":" in source_string:
            source_part, context = source_string.split(":", maxsplit=1)
            if not context.endswith("}"):
                logger.warning(
                    f"In source {source_string}: Closing bracket '}}' is missing, split into source and page/context may be wrong"
                )
            source_string = source_part + "}"
            context = context[:-1].strip()

            context = context.replace(":", "").replace(",", "")
        else:
            context = None

        if source_string.startswith("{") and source_string.endswith("}"):
            source_string = source_string[1:-1]
        if language_id is None:
            source_id = string_to_id(source_string)
        else:
            source_id = string_to_id(f"{language_id:}_s{source_string:}")

        source_id = source_id.replace(":", "").replace(",", "")

        if context:
            return f"{source_id}[{context}]"
        else:
            return source_id
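
For orientation, a minimal stand-alone illustration of the splitting convention this parser handles (plain Python; the source key and page range are made up):

source_string = "{smith2020: 45-46}"
source_part, context = source_string.split(":", maxsplit=1)
source_part = source_part + "}"     # "{smith2020}"
context = context[:-1].strip()      # "45-46"
print(source_part.strip("{}"), context)
# smith2020 45-46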
Example #2
def rename(
    ds,
    old_values_to_new_values,
    logger: cli.logging.Logger,
    status_update: t.Optional[str],
):
    concepts = ds["ParameterTable"]

    for table in ds.tables:
        if table == concepts:
            continue
        _, component = table.common_props["dc:conformsTo"].split("#")
        try:
            c_concept = ds[component, "parameterReference"]
            columns = {c_concept.name}
        except KeyError:
            columns = set()
        for reference in table.tableSchema.foreignKeys:
            if reference.reference.resource.string == concepts.url.string:
                (column, ) = reference.columnReference
                columns.add(column)
        if columns:
            logger.info(f"Changing columns {columns:} in {component:}…")
            ds.write(
                **{
                    component: [
                        substitute_many(
                            r,
                            columns,
                            old_values_to_new_values,
                            status_update=status_update,
                        ) for r in table
                    ]
                })
Example #3
    def on_form_not_found(
        self,
        form: t.Dict[str, t.Any],
        cell_identifier: t.Optional[str] = None,
        language_id: t.Optional[str] = None,
        logger: cli.logging.Logger = cli.logger,
    ) -> bool:
        """Should I add a missing object? No, but inform the user.

        Send a warning (ObjectNotFoundWarning) reporting the missing object and cell.

        Returns
        =======
        False: The object should not be added.

        """
        rep = form.get("cldf_id", repr(form))
        logger.warning(
            f"Unable to find form {rep} in cell {cell_identifier} in the dataset. "
            f"This cognate judgement was skipped. "
            f"Please make sure that the form is present in forms.csv or in the file "
            f"used for the Wordlist importation.")
        # Do a fuzzy search
        for row in self.db.find_db_candidates(form,
                                              self.check_for_match,
                                              edit_dist_threshold=4):
            logger.info(f"Did you mean {row} ?")
        return False
Example #4
def header_from_cognate_excel(
    ws: openpyxl.worksheet.worksheet.Worksheet,
    dataset: pycldf.Dataset,
    logger: cli.logging.Logger = cli.logger,
):
    row_header = []
    separators = []
    for (header, ) in ws.iter_cols(
            min_row=1,
            max_row=1,
            max_col=len(dataset["CognatesetTable"].tableSchema.columns),
    ):
        column_name = header.value
        if column_name is None:
            column_name = dataset["CognatesetTable", "id"].name
        elif column_name == "CogSet":
            column_name = dataset["CognatesetTable", "id"].name
        try:
            column_name = dataset["CognatesetTable", column_name].name
        except KeyError:
            break
        row_header.append(column_name)
        separators.append(dataset["CognatesetTable", column_name].separator)
        if column_name == dataset["CognatesetTable", "comment"].name:
            logger.warning(
                "Your cognates table has a separate ‘{header.value}’ column for comments, but `lexedata.importer.cognates` expects to extract comments from the cell comments of the cognateset metadata columns, not from a separate column. Your ‘{header.value}’ column will be ignored."
            )
    return row_header, separators
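
A minimal, self-contained illustration of the header iteration used above (plain openpyxl, without a CLDF dataset):

import openpyxl

wb = openpyxl.Workbook()
ws = wb.active
ws.append(["CogSet", "Comment"])  # a single header row
for (header, ) in ws.iter_cols(min_row=1, max_row=1, max_col=2):
    print(header.value)
# CogSet
# Comment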
Example #5
def list_homophones(dataset: pycldf.Dataset,
                    out: io.TextIOBase,
                    logger: cli.logging.Logger = cli.logger) -> None:
    clics = load_clics()
    # warn if clics cannot be loaded
    if not clics:
        logger.warning(
            "Clics could not be loaded. Using an empty graph instead")
        clics = nx.Graph()

    c_id = dataset["ParameterTable", "id"].name
    try:
        c_concepticon = dataset["ParameterTable", "concepticonReference"].name
    except KeyError:
        cli.Exit.INVALID_DATASET(
            "This script requires a column concepticonReference in ParamterTable. "
            "Please run add_concepticon.py")
    concepticon = {}
    for concept in dataset["ParameterTable"]:
        concepticon[concept[c_id]] = concept[c_concepticon]

    f_id = dataset["FormTable", "id"].name
    f_lang = dataset["FormTable", "languageReference"].name
    f_concept = dataset["FormTable", "parameterReference"].name
    f_form = dataset["FormTable", "form"].name

    homophones: t.DefaultDict[str, t.DefaultDict[str, t.Set[t.Tuple[
        str, str]]]] = t.DefaultDict(lambda: t.DefaultDict(set))

    for form in dataset["FormTable"]:
        if form[f_form] == "-" or form[f_form] is None:
            continue
        if isinstance(form[f_concept], list):
            homophones[form[f_lang]][form[f_form]].add(
                tuple(form[f_concept]) + (form[f_id], ))
        else:
            homophones[form[f_lang]][form[f_form]].add(
                (form[f_concept], form[f_id]))
    for lang, forms in homophones.items():
        for form, meanings in forms.items():
            if len(meanings) == 1:
                continue
            clics_nodes = {concepticon.get(concept) for concept, _ in meanings}
            if None in clics_nodes:
                x = " (but at least one concept not found):"
            else:
                x = ":"
            clics_nodes -= {None}
            if len(clics_nodes) <= 1:
                x = "Unknown" + x
            elif nx.is_connected(clics.subgraph(clics_nodes)):
                x = "Connected" + x
            else:
                x = "Unconnected" + x
            line = f"{lang}, '{form}': {x}\n"
            for ele in sorted(meanings):
                line += f"\t {ele[-1]} ({', '.join(ele[0:-1])})\n"
            out.write(line)
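
The bookkeeping above boils down to a nested default dictionary keyed by language and form; a stand-alone sketch with made-up values:

from collections import defaultdict

homophones = defaultdict(lambda: defaultdict(set))
homophones["lang1"]["ta"].add(("concept1", "f1"))
homophones["lang1"]["ta"].add(("concept2", "f2"))
# Only forms with more than one (concept, form id) entry are reported:
for form, meanings in homophones["lang1"].items():
    if len(meanings) > 1:
        print(form, sorted(meanings))
# ta [('concept1', 'f1'), ('concept2', 'f2')]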
Example #6
def merge_group(
    forms: t.Sequence[types.Form],
    target: types.Form,
    mergers: t.Mapping[str, Merger],
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    logger: cli.logging.Logger = cli.logger,
) -> types.Form:
    """Merge one group of homophones

    >>> merge_group(
    ...   [{"Parameter_ID": [1, 1]}, {"Parameter_ID": [2]}],
    ...   {"Parameter_ID": [1, 1]}, {"Parameter_ID": union}, util.fs.new_wordlist())
    {'Parameter_ID': [1, 2]}

    The target is assumed to be already included in the forms.

    >>> merge_group(
    ...   [{"Parameter_ID": [1, 1]}, {"Parameter_ID": [2]}],
    ...   {"Parameter_ID": [1, 1]}, {"Parameter_ID": concatenate}, util.fs.new_wordlist())
    {'Parameter_ID': [1, 1, 2]}

    """
    c_f_id = dataset["FormTable", "id"].name
    for column in target:
        if column == c_f_id:
            continue
        try:
            reference_name = (util.cldf_property(
                dataset["FormTable", column].propertyUrl) or column)
            merger = mergers.get(column,
                                 mergers.get(reference_name, must_be_equal))
            try:
                merge_result = merger([form[column] for form in forms], target)
            except AssertionError:
                # We cannot deal with this block, but others may be fine.
                merger_name = merger.__name__
                logger.error(
                    f"Merging forms: {[f[c_f_id] for f in forms]} with target: {target[c_f_id]} on column: {column}\n"
                    f"The merge function {merger_name} requires the input data to be equal. \n"
                    f"Given input: {[form[column] for form in forms]}")
                raise Skip
            except TypeError:
                merger_name = merger.__name__
                # Other groups will have the same issue.
                cli.Exit.INVALID_INPUT(
                    f"Merging forms: {[f[c_f_id] for f in forms]} with target: {target[c_f_id]} \n"
                    f"The merge function {merger_name} is not implemented for type {type(forms[0])}. \n"
                    f"Given input: {[form[column] for form in forms]}")
            target[column] = merge_result
        except KeyError:
            cli.Exit.INVALID_COLUMN_NAME(
                f"Column {column} is not in FormTable.")
    return target
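
The union and concatenate mergers referenced in the doctests are defined elsewhere; hypothetical minimal implementations that reproduce the doctest outputs (not lexedata's actual code) could look like this:

import typing as t

def union(values: t.Sequence[t.Sequence], target=None) -> t.List:
    # Hypothetical merger: keep each element once, in first-seen order.
    result: t.List = []
    for value in values:
        for element in value:
            if element not in result:
                result.append(element)
    return result

def concatenate(values: t.Sequence[t.Sequence], target=None) -> t.List:
    # Hypothetical merger: chain all list-valued cells.
    return [element for value in values for element in value]

print(union([[1, 1], [2]]))        # [1, 2]
print(concatenate([[1, 1], [2]]))  # [1, 1, 2]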
Example #7
def add_central_concepts_to_cognateset_table(
    dataset: pycldf.Dataset,
    add_column: bool = True,
    overwrite_existing: bool = True,
    logger: cli.logging.Logger = cli.logger,
    status_update: t.Optional[str] = None,
) -> pycldf.Dataset:
    # create mapping cognateset to central concept
    try:
        clics: t.Optional[networkx.Graph] = load_clics()
    except FileNotFoundError:
        logger.warning("Clics could not be loaded.")
        clics = None
    concepts_of_cognateset: t.Mapping[
        CognatesetID, t.Counter[ConceptID]] = connected_concepts(dataset)
    central: t.MutableMapping[str, str] = {}
    if clics and dataset.column_names.parameters.concepticonReference:
        concept_to_concepticon = concepts_to_concepticon(dataset)
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts,
                                                  concept_to_concepticon,
                                                  clics)
    else:
        logger.warning(
            f"Dataset {dataset:} had no concepticonReference in a ParamterTable."
        )
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts, {}, None)
    dataset = reshape_dataset(dataset, add_column=add_column)
    c_core_concept = dataset.column_names.cognatesets.parameterReference
    if c_core_concept is None:
        raise ValueError(
            f"Dataset {dataset:} had no parameterReference column in a CognatesetTable"
            " and is thus not compatible with this script.")
    # if status update given, add status column
    if status_update:
        add_status_column_to_table(dataset=dataset,
                                   table_name="CognatesetTable")
    # write cognatesets with central concepts
    write_back = []
    for row in cli.tq(
            dataset["CognatesetTable"],
            task="Write cognatesets with central concepts to dataset",
            total=dataset["CognatesetTable"].common_props.get("dc:extent"),
    ):
        if not overwrite_existing and row[c_core_concept]:
            continue
        row[c_core_concept] = central.get(
            row[dataset.column_names.cognatesets.id])
        row["Status_Column"] = status_update
        write_back.append(row)
    dataset.write(CognatesetTable=write_back)
    return dataset
Example #8
def check_id_format(dataset: pycldf.Dataset,
                    logger: cli.logging.Logger = cli.logger):
    correct = True
    for table in dataset.tables:
        # Every table SHOULD have an ID column
        try:
            id_column = dataset[table, "id"]
        except KeyError:
            log_or_raise("Table %s has no identifier column.", logger)
            correct = False
            continue

        # All IDs SHOULD be [a-zA-Z0-9_-]+
        datatype = id_column.datatype
        if datatype.base == "string":
            if not datatype.format:
                correct = False
                log_or_raise(
                    f"Table {table.url} has an unconstrained ID column {id_column.name}. Consider setting "
                    f"its format to [a-zA-Z0-9_-]+ and/or running `lexedata.edit.simplify_ids`.",
                    logger,
                )
            else:
                if datatype.format not in {
                        "[a-zA-Z0-9_\\-]+",
                        "[a-zA-Z0-9_-]+",
                        "[a-zA-Z0-9\\-_]+",
                        "[a-z0-9_]+",
                }:
                    log_or_raise(
                        f"Table {table.url} has a string ID column {id_column.name} with format {datatype.format}. "
                        f"I am too dumb to check whether that's a subset of [a-zA-Z0-9_-]+ (which is fine) "
                        f"or not (in which case maybe change it).",
                        logger,
                    )

        elif datatype.base == "integer":
            logger.info(
                "Table %s has integer ID column %s. This is okay, I hope I will not mess it up.",
                table.url,
                id_column.name,
            )

        # IDs should be primary keys and primary keys IDs (not official part of the CLDF specs)
        if table.tableSchema.primaryKey != [id_column.name]:
            log_or_raise(
                f"Table {table.url} has ID column {id_column.name}, but primary key {table.tableSchema.primaryKey}",
                logger,
            )
            correct = False

    return correct
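
For reference, what the recommended ID format amounts to in plain regex terms (illustration only, not the csvw datatype machinery used above):

import re

id_format = re.compile(r"[a-zA-Z0-9_-]+")
for candidate in ["ache_form1", "aché form 1"]:
    print(candidate, "->", bool(id_format.fullmatch(candidate)))
# ache_form1 -> True
# aché form 1 -> False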
Example #9
def merge_group(
    cogsets: t.Sequence[types.CogSet],
    target: types.CogSet,
    mergers: t.Mapping[str, Merger],
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    logger: cli.logging.Logger = cli.logger,
) -> types.CogSet:
    """Merge one group of cognate sets

    The target is assumed to be already included in the forms.

    """
    c_s_id = dataset["CognatesetTable", "id"].name
    for column in target:
        if column == c_s_id:
            continue
        try:
            reference_name = (util.cldf_property(
                dataset["CognatesetTable", column].propertyUrl) or column)
            merger = mergers.get(column,
                                 mergers.get(reference_name, must_be_equal))
            try:
                merge_result = merger([cogset[column] for cogset in cogsets],
                                      target)
            except AssertionError:
                merger_name = merger.__name__
                # We cannot deal with this block, but others may be fine.
                logger.error(
                    f"Merging cognate sets: {[f[c_s_id] for f in cogsets]} with target: {target[c_s_id]} on column: {column}\n"
                    f"The merge function {merger_name} requires the input data to be equal. \n"
                    f"Given input: {[cogset[column] for cogset in cogsets]}")
                raise Skip
            except NotImplementedError:
                merger_name = merger.__name__
                # Other groups will have the same issue.
                cli.Exit.INVALID_INPUT(
                    f"Merging forms: {[f[c_s_id] for f in cogsets]} with target: {target[c_s_id]} \n"
                    f"The merge function {merger_name} is not implemented for type {type(cogsets[0])}. \n"
                    f"Given input: {[cogset[column] for cogset in cogsets]}")
            target[column] = merge_result
        except KeyError:
            cli.Exit.INVALID_COLUMN_NAME(
                f"Column {column} is not in CognatesetTable.")
    return target
Example #10
def clean_forms(
    table: t.Iterable[R],
    form_column_name="form",
    variants_column_name="variants",
    split_at=[",", ";"],
    split_at_and_keep=["~"],
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterator[R]:
    """Split all forms that contain separators into form+variants.

    >>> for row in clean_forms([
    ...   {'F': 'a ~ æ', 'V': []},
    ...   {'F': 'bə-, be-', 'V': ['b-']}],
    ...   "F", "V"):
    ...   print(row)
    {'F': 'a', 'V': ['~æ']}
    {'F': 'bə-', 'V': ['b-', 'be-']}

    """
    for r, row in enumerate(table):
        forms = [("", row[form_column_name])]
        for separator in split_at:
            forms = [("", form.strip()) for _, chunk in forms
                     for form in chunk.split(separator)]
        for separator in split_at_and_keep:
            forms = [(first_separator if f == 0 else separator, form.strip())
                     for first_separator, chunk in forms
                     for f, form in enumerate(chunk.split(separator))]

        if len(forms) > 1:
            logger.info(
                "Line %d: Split form '%s' into %d elements.",
                r,
                row[form_column_name],
                len(forms),
            )
            if forms[0][0]:
                logger.warning(
                    "First element was marked as variant using %s, ignoring the marker",
                    forms[0][0],
                )
            row[form_column_name] = forms[0][1]
            row[variants_column_name].extend(
                [f"{separator}{form}" for separator, form in forms[1:]])
        yield row
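
The split-but-keep-the-marker step can be read in isolation like this (stand-alone sketch with a made-up row):

forms = [("", "a ~ æ")]
separator = "~"
forms = [("" if f == 0 else separator, form.strip())
         for _first, chunk in forms
         for f, form in enumerate(chunk.split(separator))]
print(forms)
# [('', 'a'), ('~', 'æ')]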
Example #11
def replace_column(
    dataset: pycldf.Dataset,
    original: str,
    replacement: str,
    column_replace: bool,
    smush: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> None:
    # add Status_column if not existing and status update given
    if status_update:
        add_status_column_to_table(dataset=dataset,
                                   table_name="ParameterTable")

    if column_replace:
        assert (
            original == "id"
            or original == dataset["ParameterTable", "id"].name
        ), f"Replacing an entire column is only meaningful when you change the #id column ({dataset['ParameterTable', 'id'].name}) of the ConceptTable."

        c_id = dataset["ParameterTable", original].name
        c_new = dataset["ParameterTable", replacement].name
        mapping = {
            concept[c_id]: concept[c_new]
            for concept in dataset["ParameterTable"]
        }
        assert smush or len(mapping) == len(
            set(mapping.values())
        ), "Would collapse some concepts that were distinct before! Add '--smush' if that is intended."
        # dataset["ParameterTable"].tableSchema.columns["c_id"]
        rename(dataset, mapping, logger, status_update=status_update)
    else:
        concepts = dataset["ParameterTable"]

        c_id = dataset["ParameterTable", "id"].name

        logger.info(f"Changing {c_id:} of ParameterTable…")
        dataset.write(ParameterTable=[
            substitute_many(r, [c_id], {original: replacement},
                            status_update=None) for r in concepts
        ])
        rename(dataset, {original: replacement},
               logger,
               status_update=status_update)
Example #12
    def separate(
        self,
        values: str,
        context: str = "",
        logger: cli.logging.Logger = cli.logger,
    ) -> t.Iterable[str]:
        """Separate different form descriptions in one string.

        Separate forms separated by comma or semicolon, unless the comma or
        semicolon occurs within a set of matching component delimiters (e.g.
        brackets)

        If the brackets don't match, the whole remainder string is passed on,
        so that the form parser can try to recover as much as possible or throw
        an exception.
        """
        raw_split = re.split(self.separation_pattern, values)
        if len(raw_split) <= 1:
            for form in raw_split:
                yield form
            return

        while len(raw_split) > 1:
            if check_brackets(raw_split[0], self.bracket_pairs):
                form = raw_split.pop(0).strip()
                if form:
                    yield form
                raw_split.pop(0)
            else:
                raw_split[:2] = ["".join(raw_split[:2])]
        if not check_brackets(raw_split[0], self.bracket_pairs):
            logger.warning(
                f"{context:}In values {values:}: "
                "Encountered mismatched closing delimiters. Please check that the "
                "separation of the cell into multiple entries, for different forms, was correct."
            )

        form = raw_split.pop(0).strip()
        if form:
            yield form
        assert not raw_split
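
Why the loop pops a form and then a separator: a capturing separation pattern keeps the delimiters in the split result. A sketch assuming a simple comma/semicolon pattern (the real separation_pattern is defined elsewhere):

import re

separation_pattern = re.compile(r"([,;])\s*")  # assumed pattern, for illustration
raw_split = re.split(separation_pattern, "a (1, 2); b")
print(raw_split)
# ['a (1', ',', '2)', ';', 'b']
# The bracket check above re-joins 'a (1', ',' and '2)' into a single form.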
Example #13
def import_cognates_from_excel(
    ws: openpyxl.worksheet.worksheet.Worksheet,
    dataset: pycldf.Dataset,
    extractor: re.Pattern = re.compile("/(?P<ID>[^/]*)/?$"),
    logger: cli.logging.Logger = cli.logger,
) -> None:
    logger.info("Loading sheet…")
    logger.info(
        f"Importing cognate sets from sheet {ws.title}, into {dataset.tablegroup._fname}…"
    )

    row_header, _ = header_from_cognate_excel(ws, dataset, logger=logger)
    excel_parser_cognate = CognateEditParser(
        dataset,
        top=2,
        # When the dataset has cognateset comments, that column is not a header
        # column, so this value is one higher than the actual number of header
        # columns, and thus already correct for the 1-based indices. When there is
        # no comment column, we need to compensate for the 1-based Excel
        # indices.
        cellparser=cell_parsers.CellParserHyperlink(dataset,
                                                    extractor=extractor),
        row_header=row_header,
        check_for_language_match=[dataset["LanguageTable", "name"].name],
        check_for_match=[dataset["FormTable", "id"].name],
        check_for_row_match=[dataset["CognatesetTable", "id"].name],
    )
    excel_parser_cognate.db.cache_dataset()
    excel_parser_cognate.db.drop_from_cache("CognatesetTable")
    excel_parser_cognate.db.drop_from_cache("CognateTable")
    logger.info("Parsing cognate Excel…")
    excel_parser_cognate.parse_cells(ws, status_update=None)
    excel_parser_cognate.db.write_dataset_from_cache(
        ["CognateTable", "CognatesetTable"])
Example #14
    def cache_dataset(self, logger: cli.logging.Logger = cli.logger):
        logger.info("Caching dataset into memory…")
        for table in self.dataset.tables:
            table_type = (table.common_props.get("dc:conformsTo", "").rsplit(
                "#", 1)[-1] or table.url)
            (id, ) = table.tableSchema.primaryKey
            # Extent may be wrong, but it's usually at least roughly correct
            # and a better indication of the table size than none at all.
            try:
                self.cache[table_type] = {
                    row[id]: row
                    for row in cli.tq(
                        table,
                        task="Cache the dataset",
                        total=table.common_props.get("dc:extent"),
                    )
                }
            except FileNotFoundError:
                self.cache[table_type] = {}

        for source in self.dataset.sources:
            self.source_ids.add(source.id)
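
The table type is simply the URL fragment of dc:conformsTo; for example:

conforms_to = "http://cldf.clld.org/v1.0/terms.rdf#FormTable"
print(conforms_to.rsplit("#", 1)[-1])
# FormTable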
Example #15
def filter(
    table: t.Iterable[R],
    column: str,
    filter: re.Pattern,
    invert: bool = False,
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterator[R]:
    """Return all rows matching a filter

    Match the filter regular expression and return all rows in the table where
    the filter matches the column. (Or all where it does not, if invert==True.)

    >>> list(filter([
    ...   {"C": "A"},
    ...   {"C": "An"},
    ...   {"C": "T"},
    ...   {"C": "E"},
    ... ], "C", re.compile("A"), invert=True))
    [{'C': 'T'}, {'C': 'E'}]

    """
    n_row = 0
    n_included = 0
    for row in table:
        n_row += 1
        # TODO: Treat list-valued columns better.
        string = str(row[column])
        row_matches = bool(filter.search(string))
        if row_matches ^ invert:
            n_included += 1
            yield row

    logger.info(
        "Filtered %d rows down to %d (%1.0f%%)",
        n_row,
        n_included,
        n_included / n_row * 100,
    )
Example #16
    def parse(
        self,
        cell: op.cell.Cell,
        language_id: str,
        cell_identifier: str = "",
        logger: cli.logging.Logger = cli.logger,
    ) -> t.Iterable[Judgement]:
        try:
            url = cell.hyperlink.target
            text = clean_cell_value(cell)
            comment = get_cell_comment(cell)
            if "{" not in text:
                slice, alignment = alignment_from_braces("{" + text + "}")
            else:
                slice, alignment = alignment_from_braces(text)
            try:
                form_id = self.extractor.search(url)["ID"]
            except (TypeError, IndexError):
                logger.error(
                    f"Could not extract group ID from URL {url} using regular expression {self.extractor.pattern}"
                )
                cli.Exit.INVALID_ID()
            properties = {
                self.c["c_id"]:
                form_id,
                self.c.get("c_segments"):
                ["{:}:{:}".format(i, j) for i, j in slice],
                self.c.get("c_alignment"):
                alignment,
                self.c.get("c_comment"):
                comment,
            }
            properties.pop(None, None)
            yield Judgement(properties)

        except AttributeError:
            pass
Example #17
def add_concepticon_definitions(
    dataset: pycldf.Dataset,
    column_name: str = "Concepticon_Definition",
    logger: cli.logging.Logger = cli.logger,
) -> None:
    concepticon_ids = dataset.column_names.parameters.concepticonReference
    if concepticon_ids is None:
        logger.error(
            "Your concepts table has no #concepticonReference column, so I cannot add any definitions from Concepticon to it. Try running lexedata.edit.add_concepticon to have me guess those references."
        )
        return

    # Create a concepticon_definition column
    try:
        dataset["ParameterTable", column_name]
        logger.info("Overwriting existing {:} column in concepts table".format(
            column_name))
    except KeyError:
        dataset.add_columns("ParameterTable", column_name)
        dataset.write_metadata()
        # Now if this throws an exception, it's an unexpected exception.

    # write concepticon definitions
    write_back = []
    for row in cli.tq(
            dataset["ParameterTable"],
            task="Write concepts with concepticon definitions to dataset",
    ):
        try:
            row[column_name] = concepticon.api.conceptsets[
                row[concepticon_ids]].definition
        except KeyError:
            pass
        write_back.append(row)

    dataset.write(ParameterTable=write_back)
Example #18
def update_ids(
    ds: pycldf.Dataset,
    table: csvw.metadata.Table,
    mapping: t.Mapping[str, str],
    logger: cli.logging.Logger = cli.logger,
):
    """Update all IDs of the table in the database, also in foreign keys, according to mapping."""
    c_id = table.get_column("http://cldf.clld.org/v1.0/terms.rdf#id")
    rows = []
    for row in cli.tq(
            ds[table],
            task=f"Updating ids of {table.url.string}",
            total=ds[table].common_props.get("dc:extent"),
    ):
        row[c_id.name] = mapping.get(row[c_id.name], row[c_id.name])
        rows.append(row)
    logger.info(f"Writing {table.url.string} back to file…")
    table.write(rows)

    c_id.datatype.format = ID_FORMAT.pattern

    foreign_keys_to_here = {
        other_table.url.string: {
            foreign_key.columnReference[
                foreign_key.reference.columnReference.index(c_id.name)]
            for foreign_key in other_table.tableSchema.foreignKeys
            if foreign_key.reference.resource == table.url
            if c_id.name in foreign_key.reference.columnReference
        }
        for other_table in ds.tables
    }

    for other_table, columns in foreign_keys_to_here.items():
        if not columns:
            continue
        logger.info(
            f"Applying changed foreign key to columns {columns:} in {other_table:}…"
        )
        rows = []
        for row in cli.tq(
                ds[other_table],
                total=ds[other_table].common_props.get("dc:extent"),
                task="Replacing changed IDs",
        ):
            for column in columns:
                # TODO: is this enough to handle columns with a separator? like parameterReference in forms table
                if isinstance(row[column], list):
                    row[column] = [mapping.get(v, v) for v in row[column]]
                else:
                    row[column] = mapping.get(row[column], row[column])
            rows.append(row)
        logger.info(f"Writing {other_table} back to file…")
        ds[other_table].write(rows)

        for column in columns:
            ds[other_table, column].datatype = c_id.datatype
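
How the ID mapping is applied to both list-valued and scalar foreign-key cells (stand-alone sketch):

mapping = {"old1": "new1"}
for cell in [["old1", "other"], "old1", "other"]:
    if isinstance(cell, list):
        print([mapping.get(v, v) for v in cell])
    else:
        print(mapping.get(cell, cell))
# ['new1', 'other']
# new1
# other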
Example #19
def add_metadata(fname: Path, logger: cli.logging.Logger = cli.logger):
    if fname.name != "forms.csv":
        cli.Exit.CLI_ARGUMENT_ERROR(
            "A metadata-free Wordlist must be in a file called 'forms.csv'.")
    default_wordlist = TableGroup.from_file(
        pycldf.util.pkg_path("modules", "Wordlist-metadata.json"))
    default_wordlist._fname = fname.with_name("Wordlist-metadata.json")
    ds = pycldf.Wordlist(default_wordlist)

    # `from_data` checks that the required columns of the FormTable are present,
    # but it does not consolidate the columns further.

    colnames = next(iterrows(fname))

    understood_colnames = {
        c.name
        for c in ds[ds.primary_table].tableSchema.columns if c.name in colnames
    }
    more_columns = {
        c.propertyUrl.uri: c
        for c in ds[ds.primary_table].tableSchema.columns
        if c.name not in understood_colnames
    }
    logger.info(
        "CLDF freely understood the columns %s in your forms.csv.",
        sorted(understood_colnames),
    )

    # Consider the columns that were not understood.
    columns_without_metadata = set(colnames) - understood_colnames
    for column_name in columns_without_metadata:
        column: Column
        # Maybe they are known CLDF properties?
        if column_name in pycldf.terms.TERMS:
            column = pycldf.terms.TERMS[column_name].to_column()
        # Maybe they are CLDF default column names?
        elif column_name in DEFAULT_NAME_COLUMNS:
            column = DEFAULT_NAME_COLUMNS[column_name]
        # Maybe they are columns that Lexedata knows to handle?
        elif column_name in LEXEDATA_COLUMNS:
            column = LEXEDATA_COLUMNS[column_name]
        # Maybe they are columns inherited from LingPy?
        elif column_name.upper() in LINGPY_COLUMNS:
            column = LINGPY_COLUMNS[column_name.upper()]
        # Maybe they are some name we have seen before?
        elif column_name in OTHER_KNOWN_COLUMNS:
            column = OTHER_KNOWN_COLUMNS[column_name]
        else:
            # TODO: Maybe they look like they have a specific type?
            ...
            # Otherwise, they are probably just text to be kept.
            column = Column(
                datatype=Datatype(base="string"),
                default="",
                null=[""],
                name=column_name,
            )
        column.name = column_name

        ds[ds.primary_table].tableSchema.columns.append(column)
        summary = column.propertyUrl or column.datatype
        logger.info(f"Column {column_name} seems to be a {summary} column.")
        if column.propertyUrl:
            to_be_replaced = more_columns.pop(column.propertyUrl.uri, None)
            if to_be_replaced is not None:
                ds[ds.primary_table].tableSchema.columns.remove(to_be_replaced)

    for column in more_columns.values():
        logger.info(
            f"Also added column {column.name}, as expected for a FormTable.")

    ds[ds.primary_table].tableSchema.columns.sort(
        key=lambda k: colnames.index(k.name) if k.name in colnames else 1e10)

    # TODO: Once lexedata is properly published, we can give a better URL.
    ds.properties["dc:contributor"] = [
        "https://github.com/Anaphory/lexedata/blob/master/src/lexedata/edit/add_metadata.py"
    ]
    return ds
Example #20
def add_cognate_table(
    dataset: pycldf.Wordlist,
    split: bool = True,
    logger: cli.logging.Logger = cli.logger,
) -> None:
    if "CognateTable" in dataset:
        return
    dataset.add_component("CognateTable")

    # TODO: Check if that cognatesetReference is already a foreign key to
    # elsewhere (could be a CognatesetTable, could be whatever), because then
    # we need to transfer that knowledge.

    # Load anything that's useful for a cognate set table: Form IDs, segments,
    # segment slices, cognateset references, alignments
    columns = {
        "id": dataset["FormTable", "id"].name,
        "concept": dataset["FormTable", "parameterReference"].name,
        "form": dataset["FormTable", "form"].name,
    }
    for property in [
            "segments", "segmentSlice", "cognatesetReference", "alignment"
    ]:
        try:
            columns[property] = dataset["FormTable", property].name
        except KeyError:
            pass
    cognate_judgements = []
    forms = cache_table(dataset, columns=columns)
    forms_without_segments = 0
    for f, form in cli.tq(forms.items(),
                          task="Extracting cognate judgements from forms…"):
        if form.get("cognatesetReference"):
            if split:
                cogset = util.string_to_id("{:}-{:}".format(
                    form["concept"], form["cognatesetReference"]))
            else:
                cogset = form["cognatesetReference"]
            judgement = {
                "ID": f,
                "Form_ID": f,
                "Cognateset_ID": cogset,
            }
            try:
                judgement["Segment_Slice"] = form["segmentSlice"]
            except KeyError:
                try:
                    if not form["segments"]:
                        raise ValueError("No segments")
                    if ("+" in form["segments"]
                            and dataset["FormTable",
                                        "cognatesetReference"].separator):
                        logger.warning(
                            "You seem to have morpheme annotations in your cognates. I will probably mess them up a bit, because I have not been taught properly how to deal with them. Sorry!"
                        )
                    judgement["Segment_Slice"] = [
                        "1:{:d}".format(len(form["segments"]))
                    ]
                except (KeyError, TypeError, ValueError):
                    forms_without_segments += 1
                    if forms_without_segments < 5:
                        logger.warning(
                            f"No segments found for form {f} ({form['form']})."
                        )
            # What does an alignment mean without segments or their slices?
            # Doesn't matter, if we were given one, we take it.
            judgement["Alignment"] = form.get("alignment")
            cognate_judgements.append(judgement)

    if forms_without_segments >= 5:
        logger.warning(
            "No segments found for %d forms. You can generate segments using `lexedata.edit.segment_using_clts`.",
            forms_without_segments,
        )

    # Delete the cognateset column
    cols = dataset["FormTable"].tableSchema.columns
    remove = {
        dataset["FormTable", c].name
        for c in ["cognatesetReference", "segmentSlice", "alignment"]
        if ("FormTable", c) in dataset
    }

    def clean_form(form):
        for c in remove:
            form.pop(c, None)
        return form

    forms = [clean_form(form) for form in dataset["FormTable"]]
    for c in remove:
        ix = cols.index(dataset["FormTable", c])
        del cols[ix]

    dataset.write(FormTable=forms)

    dataset.write(CognateTable=cognate_judgements)
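
The default segment slice constructed above simply covers the whole form, e.g. for a four-segment form:

segments = ["t", "a", "t", "a"]
print(["1:{:d}".format(len(segments))])
# ['1:4']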
Example #21
def update_integer_ids(
    ds: pycldf.Dataset,
    table: csvw.metadata.Table,
    logger: cli.logging.Logger = cli.logger,
):
    """Update all IDs of the table in the database, also in foreign keys."""
    c_id = table.get_column("http://cldf.clld.org/v1.0/terms.rdf#id")
    max_id = 0
    no_integer_rows: t.Set[str] = set()
    # logger.info("Checking IDs that are already integers…")
    for row in cli.tq(
            ds[table],
            task="Checking IDs that are already integers…",
            total=ds[table].common_props.get("dc:extent"),
    ):
        try:
            max_id = max(int(row[c_id.name]), max_id)
        except ValueError:
            no_integer_rows.add(row[c_id.name])
    logger.info("Adding integer IDs to other rows…")

    mapping: t.Dict[str, int] = dict()
    rows: t.List[t.Dict[str, t.Any]] = []
    for row in cli.tq(
            ds[table],
            task="Updating integer ids",
            total=ds[table].common_props.get("dc:extent"),
    ):
        original = row[c_id.name]
        if row[c_id.name] in no_integer_rows:
            max_id += 1
            row[c_id.name] = max_id
        else:
            row[c_id.name] = int(row[c_id.name])
        mapping[original] = row[c_id.name]
        rows.append(row)
    logger.info(f"Writing {table.url.string} back to file…")
    table.write(rows)

    foreign_keys_to_here = {
        other_table.url.string: {
            foreign_key.columnReference[
                foreign_key.reference.columnReference.index(c_id.name)]
            for foreign_key in other_table.tableSchema.foreignKeys
            if foreign_key.reference.resource == table.url
            if c_id.name in foreign_key.reference.columnReference
        }
        for other_table in ds.tables
    }
    for other_table, columns in foreign_keys_to_here.items():
        if not columns:
            continue
        rows = []
        for row in cli.tq(
                ds[other_table],
                task=f"Applying changed foreign key to {other_table}…",
                total=ds[other_table].common_props.get("dc:extent"),
        ):
            for column in columns:
                # TODO: is this enough to handle columns with a separator? like parameterReference in forms table
                if isinstance(row[column], list):
                    row[column] = [mapping[v] for v in row[column]]
                else:
                    row[column] = mapping[row[column]]
            rows.append(row)

        for column in columns:
            ds[other_table, column].datatype = c_id.datatype

        logger.info(f"Writing {other_table} back to file…")

        ds[other_table].write(rows)
Example #22
def read_wordlist(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    code_column: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> t.MutableMapping[types.Language_ID, t.MutableMapping[types.Parameter_ID,
                                                          t.Set]]:
    col_map = dataset.column_names

    if code_column:
        # Just in case that column was specified by property URL. We
        # definitely want the name. In any case, this will also throw a
        # helpful KeyError when the column does not exist.
        form_table_form = col_map.forms.form
        form_table_column = col_map.forms.id
        cognatesets = util.cache_table(
            dataset,
            columns={
                "form": form_table_column,
                "transcription": form_table_form,
                "code": dataset["FormTable", code_column].name,
            },
            filter=lambda row: bool(row[col_map.forms.form]),
        )
    else:
        # We search for cognatesetReferences in the FormTable or a separate
        # CognateTable.

        # Try the FormTable first.
        code_column = col_map.forms.cognatesetReference

        if code_column:
            # This is not the CLDF way, warn the user.
            form_table_column = col_map.forms.id
            form_table_form = col_map.forms.form
            logger.warning(
                "Your dataset has a cognatesetReference in the FormTable. Consider running lexedata.edit.add_cognate_table to create an explicit cognate table."
            )
            cognatesets = util.cache_table(
                dataset,
                columns={
                    "form": form_table_column,
                    "transcription": form_table_form,
                    "code": code_column,
                },
            )
        else:
            # There was no cognatesetReference in the form table. If we
            # find them in CognateTable (I mean, they should be there!), we
            # store them keyed with formReference.
            if (col_map.cognates and col_map.cognates.cognatesetReference
                    and col_map.cognates.formReference):
                code_column = col_map.cognates.cognatesetReference
                form_reference = col_map.cognates.formReference
                (foreign_key, ) = [
                    key
                    for key in dataset["CognateTable"].tableSchema.foreignKeys
                    if key.columnReference == [form_reference]
                ]
                (form_table_column, ) = foreign_key.reference.columnReference
                cognatesets = util.cache_table(
                    dataset,
                    "CognateTable",
                    {
                        "form": form_reference,
                        "code": code_column
                    },
                )
            else:
                raise ValueError(
                    "Dataset has no cognatesetReference column in its "
                    "primary table or in a separate cognate table. "
                    "Is this a metadata-free wordlist and you forgot to "
                    "specify code_column explicitly?")

    # Cognate sets have been loaded. Consolidate.
    cognates_by_form: t.MutableMapping[
        types.Form_ID, t.Set[types.Cognateset_ID]] = t.DefaultDict(set)
    for judgement in cognatesets.values():
        cognates_by_form[judgement["form"]].add(judgement["code"])
    parameter_column = col_map.forms.parameterReference

    # If one form can have multiple concepts,
    if dataset["FormTable", parameter_column].separator:

        def all_parameters(parameter):
            return list(parameter)

    else:

        def all_parameters(parameter):
            return [parameter]

    data: t.MutableMapping[types.Language_ID,
                           t.MutableMapping[types.Parameter_ID, t.Set]]
    if "LanguageTable" in dataset:
        (langref_target, ) = [
            key for key in dataset["FormTable"].tableSchema.foreignKeys
            if key.columnReference ==
            [dataset["FormTable", "languageReference"].name]
        ]
        ref_col = langref_target.reference.columnReference[0]
        data = {
            lang[ref_col]: t.DefaultDict(set)
            for lang in dataset["LanguageTable"]
        }
    else:
        data = t.DefaultDict(lambda: t.DefaultDict(set))
    for row in dataset["FormTable"].iterdicts():
        if not row[col_map.forms.form]:
            # Transcription is empty, should not be a form. Skip, but maybe
            # warn if it was in a cognateset.
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as empty (i.e. the source noted that the form is unknown), but it was judged to be in cognateset %s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    cognates_by_form[row[form_table_column]],
                )
            continue

        language = row[col_map.forms.languageReference]
        if row[col_map.forms.form] == "-":
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as '-' (i.e. “concept is not available in language %s”), but it was judged to be in cognateset %s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    language,
                    cognates_by_form[row[form_table_column]],
                )
                cognates_by_form[row[form_table_column]] = set()
            for parameter in all_parameters(row[parameter_column]):
                if data[language][parameter]:
                    logger.warning(
                        "Form %s claims concept %s is not available in language %s, but cognatesets %s are allocated to that concept in that language already.",
                        row[col_map.forms.id],
                        parameter,
                        row[col_map.forms.languageReference],
                        data[language][parameter],
                    )
        for parameter in all_parameters(row[parameter_column]):
            data[language][parameter] |= cognates_by_form[
                row[form_table_column]]
    return data
Example #23
def root_presence_code(
    dataset: t.Mapping[types.Language_ID,
                       t.Mapping[types.Parameter_ID,
                                 t.Set[types.Cognateset_ID]]],
    relevant_concepts: t.Mapping[types.Cognateset_ID,
                                 t.Iterable[types.Parameter_ID]],
    ascertainment: t.Sequence[Literal["0", "1", "?"]] = ["0"],
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[t.Mapping[types.Language_ID, t.List[Literal["0", "1", "?"]]],
             t.Mapping[types.Cognateset_ID, int], ]:
    """Create a root-presence/absence coding from cognate codes in a dataset

    Take the cognate code information from a wordlist, i.e. a mapping of the
    form {Language ID: {Concept ID: {Cognateset ID}}}, and generate a binary
    alignment from it that lists for every root whether it is present in that
    language or not. Return that, and the association between cognatesets and
    characters.

    >>> alignment, roots = root_presence_code(
    ...     {"Language": {"Meaning": {"Cognateset 1"}}},
    ...     relevant_concepts={"Cognateset 1": ["Meaning"]})
    >>> alignment
    {'Language': ['0', '1']}
    >>> roots
    {'Cognateset 1': 1}

    The first entry in each sequence is always '0': The configuration where a
    form is absent from all languages is never observed, but always possible,
    so we add this entry for the purposes of ascertainment correction.

    If a root is attested at all, in any concept, it is considered present.
    Because the word list is never a complete description of the language's
    lexicon, the function employs a heuristic to generate ‘absent’ states.

    If a root is unattested, and at least half of the relevant concepts
    associated with this root are attested, but each expressed by another root,
    the root is assumed to be absent in the target language. (If there is
    exactly one central concept, then that central concept being attested or
    unknown is a special case of this general rule.) Otherwise the
    presence/absence of the root is considered unknown.

    >>> alignment, roots = root_presence_code(
    ...     {"l1": {"m1": {"c1"}},
    ...      "l2": {"m1": {"c2"}, "m2": {"c1", "c3"}}},
    ...     relevant_concepts={"c1": ["m1"], "c2": ["m1"], "c3": ["m2"]})
    >>> sorted(roots)
    ['c1', 'c2', 'c3']
    >>> sorted_roots = sorted(roots.items())
    >>> {language: [sequence[k[1]] for k in sorted_roots] for language, sequence in alignment.items()}
    {'l1': ['1', '0', '?'], 'l2': ['1', '1', '1']}
    >>> list(zip(*sorted(zip(*alignment.values()))))
    [('0', '0', '1', '?'), ('0', '1', '1', '1')]

    """
    all_roots: t.Set[types.Cognateset_ID] = set(relevant_concepts)
    language_roots: t.MutableMapping[
        types.Language_ID, t.Set[types.Cognateset_ID]] = t.DefaultDict(set)
    for language, lexicon in dataset.items():
        for concept, cognatesets in lexicon.items():
            if not cognatesets:
                logger.warning(
                    f"The root presence coder script got a language ({language}) with an improper lexicon: There is a form associated with Concept {concept}, but no cognate sets are associated with it."
                )
            for cognateset in cognatesets:
                language_roots[language].add(cognateset)

    all_roots_sorted: t.Sequence[types.Cognateset_ID] = sorted(all_roots)

    alignment = {}
    roots = {}
    for language, lexicon in dataset.items():
        alignment[language] = list(ascertainment)
        for root in all_roots_sorted:
            roots[root] = len(alignment[language])
            if root in language_roots[language]:
                alignment[language].append("1")
            else:
                n_concepts = 0
                n_filled_concepts = 0
                for concept in relevant_concepts[root]:
                    n_concepts += 1
                    if lexicon.get(concept):
                        n_filled_concepts += 1
                if 2 * n_filled_concepts >= n_concepts:
                    alignment[language].append("0")
                else:
                    alignment[language].append("?")

    return alignment, roots
Example #24
def apply_heuristics(
    dataset: types.Wordlist,
    heuristic: t.Optional[AbsenceHeuristic] = None,
    primary_concepts: t.Union[
        types.WorldSet[types.Parameter_ID],
        t.AbstractSet[types.Parameter_ID]] = types.WorldSet(),
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Cognateset_ID, t.Set[types.Parameter_ID]]:
    """Compute the relevant concepts for cognatesets, depending on the heuristic.

    These concepts will be considered when deciding whether a root is deemed
    absent in a language.

    For the CentralConcept heuristic, the relevant concepts are the
    central concept of a cognateset, as given by the #parameterReference column
    of the CognatesetTable. A central concept not included in the
    primary_concepts is ignored with a warning.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concept",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference"))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concept": "concept1"}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {'cognateset1': {'concept1'}}
    True

    This extends to the case where a cognateset may have more than one central concept.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concepts",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
    ...         separator=","))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concepts": ["concept1", "concept2"]}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {
    ...     'cognateset1': {'concept1', 'concept2'}}
    True

    For the HalfPrimaryConcepts heuristic, the relevant concepts are all
    primary concepts connected to a cognateset.

    >>> ds = util.fs.new_wordlist(
    ...     FormTable=[
    ...         {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1", "Form": "x"},
    ...         {"ID": "f2", "Parameter_ID": "c2", "Language_ID": "l1", "Form": "x"}],
    ...     CognateTable=[
    ...         {"ID": "1", "Form_ID": "f1", "Cognateset_ID": "s1"},
    ...         {"ID": "2", "Form_ID": "f2", "Cognateset_ID": "s1"}])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.HALFPRIMARYCONCEPTS) == {
    ...     's1': {'c1', 'c2'}}
    True


    NOTE: This function cannot guarantee that every cognateset has at least one
    relevant concept; there may be cognatesets without any! A cognateset with 0
    relevant concepts will always be included, because 0 is at least half of 0.

    """
    heuristic = (heuristic if heuristic is not None else
                 (AbsenceHeuristic.CENTRALCONCEPT if
                  ("CognatesetTable", "parameterReference") in dataset else
                  AbsenceHeuristic.HALFPRIMARYCONCEPTS))

    relevant_concepts: t.MutableMapping[
        types.Cognateset_ID, t.Set[types.Parameter_ID]] = t.DefaultDict(set)

    if heuristic is AbsenceHeuristic.HALFPRIMARYCONCEPTS:
        c_f = dataset["CognateTable", "formReference"].name
        c_s = dataset["CognateTable", "cognatesetReference"].name
        concepts = util.cache_table(
            dataset,
            "FormTable",
            {"concepts": dataset["FormTable", "parameterReference"].name},
        )
        for j in dataset["CognateTable"]:
            form = concepts[j[c_f]]
            for concept in util.ensure_list(form["concepts"]):
                relevant_concepts[j[c_s]].add(concept)

    elif heuristic is AbsenceHeuristic.CENTRALCONCEPT:
        c_cognateset_concept = dataset["CognatesetTable",
                                       "parameterReference"].name
        c_id = dataset["CognatesetTable", "id"].name
        for c in dataset["CognatesetTable"]:
            for concept in util.ensure_list(c[c_cognateset_concept]):
                if concept not in primary_concepts:
                    logger.warning(
                        f"The central concept {concept} of cognateset {c[c_id]} was not part of your list of primary concepts to be included in the coding, so the cognateset will be ignored."
                    )
                else:
                    relevant_concepts[c[c_id]].add(concept)

    else:
        raise TypeError(
            f"Value of heuristic, {heuristic}, did not correspond to a known AbsenceHeuristic."
        )

    return relevant_concepts
Example #25
def read_single_excel_sheet(
    dataset: pycldf.Dataset,
    sheet: openpyxl.worksheet.worksheet.Worksheet,
    logger: cli.logging.Logger = cli.logger,
    match_form: t.Optional[t.List[str]] = None,
    entries_to_concepts: t.Mapping[str, str] = KeyKeyDict(),
    concept_column: t.Optional[str] = None,
    ignore_missing: bool = False,
    ignore_superfluous: bool = False,
    status_update: t.Optional[str] = None,
) -> t.Mapping[str, ImportLanguageReport]:
    report: t.Dict[str, ImportLanguageReport] = defaultdict(ImportLanguageReport)

    concept_columns: t.Tuple[str, str]
    if concept_column is None:
        concept_columns = (
            dataset["FormTable", "parameterReference"].name,
            dataset["FormTable", "parameterReference"].name,
        )
    else:
        concept_columns = (
            dataset["FormTable", "parameterReference"].name,
            concept_column,
        )
    db = DB(dataset)
    db.cache_dataset()
    # required cldf fields of a form
    c_f_id = db.dataset["FormTable", "id"].name
    c_f_language = db.dataset["FormTable", "languageReference"].name
    c_f_form = db.dataset["FormTable", "form"].name
    c_f_value = db.dataset["FormTable", "value"].name
    c_f_concept = db.dataset["FormTable", "parameterReference"].name
    if not match_form:
        match_form = [c_f_form, c_f_language]
    if not db.dataset["FormTable", c_f_concept].separator:
        logger.warning(
            "Your metadata does not allow polysemous forms. According to your specifications, "
            "identical forms with different concepts will always be considered homophones, not a single "
            "polysemous form. To include polysemous forms, add a separator to your FormTable #parameterReference "
            "in the Metadata.json To find potential polysemies, run lexedata.report.list_homophones."
        )
        match_form.append(c_f_concept)
    elif c_f_concept in match_form:
        logger.info(
            "Matching by concept enabled: To find potential polysemies, run lexedata.report.list_homophones."
        )

    sheet_header = get_headers_from_excel(sheet)
    form_header = list(db.dataset["FormTable"].tableSchema.columndict.keys())

    # These columns need not be given explicitly; we can infer them from the sheet title and from the other data:
    implicit: t.Dict[Literal["languageReference", "id", "value"], str] = {}
    if c_f_language not in sheet_header:
        implicit["languageReference"] = c_f_language
    if c_f_id not in sheet_header:
        implicit["id"] = c_f_id
    if c_f_value not in sheet_header:
        implicit["value"] = c_f_value

    found_columns = set(sheet_header) - {concept_column} - set(implicit.values())
    expected_columns = set(form_header) - {c_f_concept} - set(implicit.values())
    if not found_columns >= expected_columns:
        if ignore_missing:
            logger.info(
                f"Your Excel sheet {sheet.title} is missing columns {expected_columns - found_columns}. "
                f"For the newly imported forms, these columns will be left empty in the dataset."
            )
        else:
            raise ValueError(
                f"Your Excel sheet {sheet.title} is missing columns {expected_columns - found_columns}. "
                f"Clean up your data, or use --ignore-missing-excel-columns to import anyway and leave these "
                f"columns empty in the dataset for the newly imported forms."
            )
    if not found_columns <= expected_columns:
        if ignore_superfluous:
            logger.info(
                f"Your Excel sheet {sheet.title} contained unexpected columns "
                f"{found_columns - expected_columns}. These columns will be ignored."
            )
        else:
            raise ValueError(
                f"Your Excel sheet {sheet.title} contained unexpected columns "
                f"{found_columns - expected_columns}. Clean up your data, or use "
                f"--ignore-superfluous-excel-columns to import the data anyway and ignore these columns."
            )
    # check whether the language already exists
    c_l_name = db.dataset["LanguageTable", "name"].name
    c_l_id = db.dataset["LanguageTable", "id"].name
    language_name_to_language_id = {
        row[c_l_name]: row[c_l_id] for row in db.cache["LanguageTable"].values()
    }
    language_name = normalize_string(sheet.title)
    if language_name in language_name_to_language_id:
        language_id = language_name_to_language_id[language_name]
        report[language_id].is_new_language = False
    else:
        language_id = language_name
        report[language_id].is_new_language = True

    # read new data from sheet
    for form in cli.tq(
        import_data_from_sheet(
            sheet,
            sheet_header=sheet_header,
            implicit=implicit,
            language_id=language_id,
            concept_column=concept_columns,
        ),
        task=f"Parsing cells of sheet {sheet.title}",
        total=sheet.max_row,
    ):
        # If the concept is not in the dataset, don't add the form.
        concept_entry = form[c_f_concept]
        try:
            entries_to_concepts[concept_entry]
        except KeyError:
            logger.warning(
                f"Concept {concept_entry} was not found. Please add it to the concepts.csv file manually. "
                f"The corresponding form was ignored and not added to the dataset."
            )
            report[language_id].skipped += 1
            continue
        # else, look for candidates, link to existing form or add new form
        for item, value in form.items():
            try:
                sep = db.dataset["FormTable", item].separator
            except KeyError:
                continue
            if sep is None:
                continue
            form[item] = value.split(sep)
        form_candidates = db.find_db_candidates(form, match_form)
        if form_candidates:
            new_concept_added = False
            for form_id in form_candidates:
                logger.info(f"Form {form[c_f_value]} was already in dataset.")

                if db.dataset["FormTable", c_f_concept].separator:
                    for new_concept in form[c_f_concept]:
                        if (
                            new_concept
                            not in db.cache["FormTable"][form_id][c_f_concept]
                        ):
                            db.cache["FormTable"][form_id][c_f_concept].append(
                                new_concept
                            )
                            logger.info(
                                f"New form-concept association: Concept {form[c_f_concept]} was added to existing form "
                                f"{form_id}. If this was not intended "
                                f"(because it is a homophonous form, not a polysemy), "
                                f"you need to manually remove that concept from the old form in forms.csv "
                                f"and create a separate new form. If you want to treat identical forms "
                                f"as homophones in general, add  "
                                f"--match-forms={' '.join(match_form)}, "
                                f"{db.dataset['FormTable', 'parameterReference']} "
                                f"when you run this script."
                            )
                            new_concept_added = True
                break

            if new_concept_added:
                report[language_id].concepts += 1
            else:
                report[language_id].existing += 1
        else:
            # No existing candidate was found, so add the form as a new entry;
            # make_id_unique below disambiguates duplicate IDs by appending an integer.
            form[c_f_language] = language_id
            if "id" in implicit:
                # TODO: check for type of form id column
                form_concept = form[c_f_concept]
                concept_reference = (
                    form_concept[0] if isinstance(form_concept, list) else form_concept
                )
                form[c_f_id] = string_to_id(f"{form[c_f_language]}_{concept_reference}")
            db.make_id_unique(form)
            if status_update:
                form["Status_Column"] = status_update
            db.insert_into_db(form)
            report[language_id].new += 1
    # write to cldf
    db.write_dataset_from_cache()
    return report
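
# A hedged usage sketch for read_single_excel_sheet; the file names and the
# sheet title are invented. The sheet title doubles as the language name, and
# the returned report counts new, existing, and skipped forms per language.
import openpyxl
import pycldf

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")  # assumed path
workbook = openpyxl.load_workbook("new_language.xlsx")            # assumed path
report = read_single_excel_sheet(
    dataset=dataset,
    sheet=workbook["Maxakali"],  # assumed sheet/language name
    ignore_missing=True,         # leave absent optional columns empty
    status_update="new import",
)
for language, language_report in report.items():
    print(language, language_report.new, language_report.existing, language_report.skipped)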
Example #26
def add_single_languages(
    metadata: Path,
    sheets: t.Iterable[openpyxl.worksheet.worksheet.Worksheet],
    match_form: t.Optional[t.List[str]],
    concept_name: t.Optional[str],
    ignore_missing: bool,
    ignore_superfluous: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger,
) -> t.Mapping[str, ImportLanguageReport]:
    if status_update == "None":
        status_update = None
    # Initialize the dataset from the metadata file, or directly from a forms.csv, depending on command line arguments
    if metadata:
        if metadata.name == "forms.csv":
            dataset = pycldf.Dataset.from_data(metadata)
        else:
            dataset = pycldf.Dataset.from_metadata(metadata)

    concepts: t.Mapping[str, str]
    try:
        cid = dataset["ParameterTable", "id"].name
        if concept_name is None:
            concepts = {c[cid]: c[cid] for c in dataset["ParameterTable"]}
            concept_column = dataset["FormTable", "parameterReference"].name
        else:
            name = dataset["ParameterTable", "name"].name
            concepts = {c[name]: c[cid] for c in dataset["ParameterTable"]}
            concept_column = concept_name
    except (KeyError, FileNotFoundError) as err:
        if isinstance(err, KeyError):
            logger.warning(
                "Did not find a well-formed ParameterTable. Importing all forms independent of concept"
            )
        elif isinstance(err, FileNotFoundError):
            logger.warning(
                f"Did not find {dataset['ParameterTable'].url.string}. "
                f"Importing all forms independent of concept"
            )
        concepts = KeyKeyDict()
        if concept_name:
            concept_column = concept_name
        else:
            concept_column = dataset["FormTable", "parameterReference"].name
    # add Status_Column if it does not exist and a status update was requested
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="FormTable")
    report: t.Dict[str, ImportLanguageReport] = defaultdict(ImportLanguageReport)
    # import all selected sheets
    for sheet in sheets:
        for lang, subreport in read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            logger=logger,
            match_form=match_form,
            entries_to_concepts=concepts,
            concept_column=concept_column,
            ignore_missing=ignore_missing,
            ignore_superfluous=ignore_superfluous,
            status_update=status_update,
        ).items():
            report[lang] += subreport
    return report
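
# A hedged usage sketch for add_single_languages; paths and workbook contents
# are invented. All sheets of the workbook are imported into the dataset
# described by the metadata file.
from pathlib import Path

import openpyxl

workbook = openpyxl.load_workbook("new_languages.xlsx")  # assumed path
report = add_single_languages(
    metadata=Path("Wordlist-metadata.json"),             # assumed path
    sheets=[workbook[title] for title in workbook.sheetnames],
    match_form=None,
    concept_name=None,          # match sheet entries against concept IDs
    ignore_missing=False,
    ignore_superfluous=False,
    status_update="automatic import",
    logger=cli.logger,
)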
Example #27
def forms_to_tsv(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    languages: t.Iterable[str],
    concepts: t.Set[str],
    cognatesets: t.Iterable[str],
    logger: cli.logging.Logger = cli.logger,
):
    try:
        dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            """Edictor export requires your dataset to have segments in the FormTable.
        Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
        )

    delimiters = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset["FormTable"].tableSchema.columns if c.separator
    }

    # Prepare the header for the tsv output.
    # The first column must be named ID and contain 1-based integer IDs.
    tsv_header = list(dataset["FormTable"].tableSchema.columndict.keys())

    tsv_header.insert(0, "LINGPY_ID")
    tsv_header.append("cognatesetReference")
    if "alignment" not in tsv_header:
        tsv_header.append("alignment")
    if "parameterReference" in delimiters:
        tsv_header.append("_parameterReference")

    # Select forms and cognate judgements, restricted to the given languages, concepts, and cognatesets.
    forms = {}
    for f, form in util.cache_table(dataset).items():
        if form["form"] is None or form["form"] == "-":
            continue
        if form["languageReference"] in languages and concepts.intersection(
                ensure_list(form["parameterReference"])):
            # Normalize the form:
            # 1. No list-valued entries
            for c, d in delimiters.items():
                if c == "segments":
                    continue
                if c == "parameterReference":
                    form["_parameterReference"] = d.join(
                        str(e) for e in form[c])
                    form["parameterReference"] = form["parameterReference"][0]
                    continue

                form[c] = d.join(str(e) for e in form[c])

            if not form.get("segments"):
                logger.warning(
                    "No segments found for form %s. You can generate segments using `lexedata.edit.add_segments`.",
                    form["id"],
                )

            # 2. No tabs, newlines in entries
            for c, v in form.items():
                if isinstance(v, str):
                    if "\\!t" in form[c] or "\\!n" in form[c]:
                        logger.warning(
                            "Your data contains the special characters '\\!t' or '\\!n', which I will introduce for escaping tabs and newlines for edictor. These characters will not survive the back-import."
                        )
                    form[c] = form[c].replace("\t",
                                              "\\!t").replace("\n", "\\!n")

            forms[f] = form

    cognateset_cache: t.Mapping[t.Optional[str], int]
    if "CognatesetTable" in dataset:
        id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[id]: c
            for c, cognateset in enumerate(dataset["CognatesetTable"], 1)
            if cognateset[id] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = t.DefaultDict(itertools.count().__next__)
        else:
            cognateset_cache = {c: i for i, c in enumerate(cognatesets, 1)}

    # Warn about unexpected non-concatenative ‘morphemes’
    lexedata.report.nonconcatenative_morphemes.segment_to_cognateset(
        dataset, cognatesets, logger)

    judgements_about_form: t.Mapping[types.Form_ID,
                                     t.Tuple[t.List[str], t.List[int]]] = {
                                         id:
                                         ([f"({s})"
                                           for s in form["segments"]], [])
                                         for id, form in forms.items()
                                     }
    # Compose all judgements, last-one-rules mode.
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in forms and cognateset_cache.get(
                j["cognatesetReference"]):
            if j.get("alignment"):
                j["alignment"] = [s or "" for s in j["alignment"]]
            else:
                j["alignment"] = forms[j["formReference"]]["segments"]

            try:
                segments_judged = list(
                    parse_segment_slices(segment_slices=j["segmentSlice"],
                                         enforce_ordered=False))
            except TypeError:
                logger.warning(
                    "In judgement %s: No segment slice given. Assuming whole form.",
                    j["id"],
                )
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except KeyError:
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except ValueError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue
            global_alignment, cogsets = judgements_about_form[
                j["formReference"]]
            segment_start, segment_end = min(
                segments_judged), max(segments_judged) + 1
            try:
                glue_in_alignment(
                    global_alignment,
                    cogsets,
                    j["alignment"],
                    j["cognatesetReference"],
                    slice(segment_start, segment_end),
                )
            except IndexError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue

    return forms, judgements_about_form, cognateset_cache
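
# A hedged usage sketch for forms_to_tsv; language, concept and cognateset IDs
# are invented. The call restricts the export to the given languages, concepts
# and cognatesets and returns the selected forms, the per-form alignments with
# their cognateset numbers, and the cognateset numbering itself.
import pycldf

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")  # assumed path
forms, judgements_about_form, cognateset_numbers = forms_to_tsv(
    dataset,
    languages=["language1", "language2"],
    concepts={"hand", "arm"},
    cognatesets=["cognateset_hand_1"],
)
for form_id, (alignment, cogsets) in judgements_about_form.items():
    print(form_id, " ".join(alignment), cogsets)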
Example #28
    def __init__(
        self,
        dataset: pycldf.Dataset,
        element_semantics: t.Iterable[t.Tuple[str, str, str, bool]] = [
            # ("[", "]", "phonetic", True),
            ("<", ">", "form", True),
            # ("/", "/", "phonemic", True),
            ("(", ")", "comment", False),
            ("{", "}", "source", False),
        ],
        separation_pattern: str = r"([;,])",
        variant_separator: t.Optional[t.List[str]] = ["~", "%"],
        add_default_source: t.Optional[str] = "{1}",
        logger: cli.logging.Logger = cli.logger,
    ):
        super().__init__(dataset)

        # Columns implied by element semantics
        self.bracket_pairs = {
            start: end
            for start, end, _, _ in element_semantics
        }
        self.element_semantics = {
            start: (term, transcription)
            for start, _, term, transcription in element_semantics
        }
        for start, end, term, transcription in element_semantics:
            # Ensure that all terms required by the element semantics are fields we can write to.
            self.cc(short=term, long=("FormTable", term), dataset=dataset)
        assert self.transcriptions, (
            "Your metadata json file and your cell parser don’t match: Your cell parser "
            f"{self.__class__.__name__} expects to work with transcriptions "
            "(at least one of 'orthographic', 'phonemic', and 'phonetic') to derive a #form "
            "in #FormTable, but your metadata defines no such column.")

        # Columns necessary for the word list
        self.cc(short="source", long=("FormTable", "source"), dataset=dataset)
        self.cc(short="comment",
                long=("FormTable", "comment"),
                dataset=dataset)

        try:
            self.comment_separator = dataset["FormTable",
                                             "comment"].separator or "\t"
        except KeyError:
            logger.info("No #comment column found.")
            self.comment_separator = ""

        try:
            # As long as there is no CLDF term #variants, this will either be
            # 'variants' or raise a KeyError. However, it is a transparent
            # re-use of an otherwise established idiom in this module, so we
            # use this minor overhead.
            self.c["variants"] = dataset["FormTable", "variants"].name
        except KeyError:
            logger.warning(
                "No 'variants' column found for FormTable in Wordlist-metadata.json. "
                "Form variants will be added to #comment.")

        # Other class attributes
        self.separation_pattern = separation_pattern
        self.variant_separator = variant_separator
        self.add_default_source = add_default_source
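
# For the default element_semantics above, the two derived lookup tables are
# equivalent to the following (a small illustration, not part of the class):
bracket_pairs = {"<": ">", "(": ")", "{": "}"}
element_semantics = {
    "<": ("form", True),      # a transcription: contributes to #form
    "(": ("comment", False),  # not a transcription
    "{": ("source", False),   # not a transcription
}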
Example #29
def merge_forms(
    data: types.Wordlist[types.Language_ID, types.Form_ID, types.Parameter_ID,
                         types.Cognate_ID, types.Cognateset_ID, ],
    mergers: t.Mapping[str, Merger],
    homophone_groups: t.MutableMapping[types.Form_ID,
                                       t.Sequence[types.Form_ID]],
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterable[types.Form]:
    """Merge forms from a dataset.

    TODO: Construct an example that shows that the order given in
    `homophone_groups` is maintained.

    Side Effects
    ============
    Changes homophone_groups:
        Groups that are skipped are removed

    """
    merge_targets = {
        variant: target
        for target, variants in homophone_groups.items()
        for variant in variants
    }
    for target in homophone_groups:
        assert merge_targets[target] == target

    c_f_id = data["FormTable", "id"].name

    buffer: t.Dict[types.Form_ID, types.Form] = {}

    unknown = set()
    form: types.Form
    for form in cli.tq(
            data["FormTable"],
            task="Going through forms and merging",
            logger=logger,
            total=data["FormTable"].common_props.get("dc:extent"),
    ):
        id: types.Form_ID = form[c_f_id]
        buffer[id] = form
        if id in merge_targets:
            unknown.add(id)
            target_id = merge_targets[id]
            group = homophone_groups[target_id]
            if all(i in buffer for i in group):
                try:
                    buffer[target_id] = merge_group(
                        [buffer[i] for i in group],
                        buffer[target_id].copy(),  # type: ignore
                        mergers,
                        data,
                        logger,
                    )

                    for i in group:
                        if i != target_id:
                            del buffer[i]
                except Skip:
                    logger.info(
                        f"Merging form {id} with forms {[f[c_f_id] for f in group]} was skipped."
                    )
                    del homophone_groups[target_id]
                for i in group:
                    unknown.remove(i)

        for f in list(buffer):
            if f in unknown:
                break
            yield buffer.pop(f)
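
# A hedged usage sketch for merge_forms; form IDs are invented, and `mergers`
# is assumed to be a Mapping[str, Merger] defined elsewhere in this module
# (here called default_mergers). Form form_hand_2 is merged into form_hand_1,
# and the merged FormTable is written back to the dataset.
import pycldf

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")  # assumed path
homophone_groups = {"form_hand_1": ["form_hand_1", "form_hand_2"]}
merged_forms = list(
    merge_forms(
        data=dataset,
        mergers=default_mergers,  # assumed mapping of column names to merge functions
        homophone_groups=homophone_groups,
    )
)
dataset.write(FormTable=merged_forms)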
Example #30
    def parse_form(
        self,
        form_string: str,
        language_id: str,
        cell_identifier: str = "",
        logger: cli.logging.Logger = cli.logger,
    ) -> t.Optional[Form]:
        """Create a dictionary of columns from a form description.

        Extract each value (transcriptions, comments, sources etc.) from a
        string describing a single form.
        """
        # not required fields
        c_comment = self.c.get("comment")
        c_variants = self.c.get("variants", c_comment)

        # if string is only whitespaces, there is no form.
        if not form_string.strip():
            return None

        properties: t.Dict[str, t.Any] = {
            self.c["lang"]: language_id,
            self.c["value"]: form_string,
        }

        # Semantics: 'None' for no variant expected, any string for the
        # decorator that introduces variant forms. Currently we expect '~' and
        # '%', see below.
        expect_variant: t.Optional[str] = None
        # Iterate over the delimiter-separated elements of the form.
        for element in components_in_brackets(form_string, self.bracket_pairs):
            element = element.strip()

            if not element:
                continue

            # If the element has mismatched brackets (this tends to happen only
            # for the last element, because a mismatched opening bracket means
            # we are still waiting for the closing one), raise an error.
            if not check_brackets(element, self.bracket_pairs):
                try:
                    delimiter = self.bracket_pairs[element[0]]
                except KeyError:
                    delimiter = element[0]
                raise ValueError(
                    f"{cell_identifier}In form {form_string}: Element {element} had mismatching delimiters "
                    f"{delimiter}. This could be a bigger problem in the cell, "
                    f"so the form was not imported.")
            # Check what kind of element we have.
            for start, (term, transcription) in self.element_semantics.items():
                field = self.c[term]
                if element.startswith(start):
                    break
            else:
                # TODO: another `if` catching '-' might be necessary here
                # The only thing we expect outside delimiters is the variant
                # separators, '~' and '%'.
                if self.variant_separator and element in self.variant_separator:
                    expect_variant = element
                else:
                    logger.warning(
                        f"{cell_identifier}In form {form_string}: Element {element} could not be parsed, ignored"
                    )
                continue

            # If we encounter a field for the first time, we add it to the
            # dictionary. If repeatedly, to the variants, with a decorator that
            # shows how expected the variant was.
            # If more than one source or comment is provided, the extra ones end
            # up among the variants; this is cleaned up in self.postprocess_form.

            if field in properties:
                if (not expect_variant and field != c_comment
                        and field != self.c["source"]):
                    logger.warning(
                        f"{cell_identifier}In form {form_string}: Element {element} was an unexpected variant for {field}"
                    )
                properties.setdefault(
                    c_variants, []).append((expect_variant or "") + element)
            else:
                if expect_variant:
                    logger.warning(
                        f"{cell_identifier}In form {form_string}: Element {element} was supposed to be a variant, but there is no earlier {field}"
                    )
                properties[field] = element

            expect_variant = None

        self.postprocess_form(properties, language_id)
        return Form(properties)
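
# A hedged usage sketch for parse_form; `parser` is assumed to be an instance
# of the cell parser class whose __init__ is shown in Example #28, and the cell
# content is invented. Each delimited element of the cell value is mapped to
# the corresponding FormTable column.
form = parser.parse_form(
    "<example form> (a free comment) {1}",
    language_id="language1",
    cell_identifier="B2: ",
)
# `form` now maps the parser's #languageReference, #value, #form, #comment and
# #source columns to the corresponding pieces of the cell content.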