Example #1
def rename(
    ds,
    old_values_to_new_values,
    logger: cli.logging.Logger,
    status_update: t.Optional[str],
):
    concepts = ds["ParameterTable"]

    for table in ds.tables:
        if table == concepts:
            continue
        _, component = table.common_props["dc:conformsTo"].split("#")
        try:
            c_concept = ds[component, "parameterReference"]
            columns = {c_concept.name}
        except KeyError:
            columns = set()
        for reference in table.tableSchema.foreignKeys:
            if reference.reference.resource.string == concepts.url.string:
                (column, ) = reference.columnReference
                columns.add(column)
        if columns:
            logger.info(f"Changing columns {columns:} in {component:}…")
            ds.write(
                **{
                    component: [
                        substitute_many(
                            r,
                            columns,
                            old_values_to_new_values,
                            status_update=status_update,
                        ) for r in table
                    ]
                })
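
# A minimal usage sketch, assuming `rename` (defined above) is in scope, that a
# CLDF Wordlist lives next to "Wordlist-metadata.json", and that the concept IDs
# "tree" and "TREE" are made up for illustration.
import pycldf
from lexedata import cli

ds = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
# Rewrite every reference to the concept "tree" (in forms, cognate judgements,
# etc.) so that it points to "TREE" instead; the concepts table itself is skipped.
rename(ds, {"tree": "TREE"}, cli.logger, status_update=None)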
Example #2
    def on_form_not_found(
        self,
        form: t.Dict[str, t.Any],
        cell_identifier: t.Optional[str] = None,
        language_id: t.Optional[str] = None,
        logger: cli.logging.Logger = cli.logger,
    ) -> bool:
        """Should I add a missing object? No, but inform the user.

        Send a warning (ObjectNotFoundWarning) reporting the missing object and cell.

        Returns
        =======
        False: The object should not be added.

        """
        rep = form.get("cldf_id", repr(form))
        logger.warning(
            f"Unable to find form {rep} in cell {cell_identifier} in the dataset. "
            f"This cognate judgement was skipped. "
            f"Please make sure that the form is present in forms.csv or in the file "
            f"used for the Wordlist importation.")
        # Do a fuzzy search
        for row in self.db.find_db_candidates(form,
                                              self.check_for_match,
                                              edit_dist_threshold=4):
            logger.info(f"Did you mean {row} ?")
        return False
Example #3
def check_id_format(dataset: pycldf.Dataset,
                    logger: cli.logging.Logger = cli.logger):
    correct = True
    for table in dataset.tables:
        # Every table SHOULD have an ID column
        try:
            id_column = dataset[table, "id"]
        except KeyError:
            log_or_raise("Table %s has no identifier column.", logger)
            correct = False
            continue

        # All IDs SHOULD be [a-zA-Z0-9_-]+
        datatype = id_column.datatype
        if datatype.base == "string":
            if not datatype.format:
                correct = False
                log_or_raise(
                    f"Table {table.url} has an unconstrained ID column {id_column.name}. Consider setting "
                    f"its format to [a-zA-Z0-9_-]+ and/or running `lexedata.edit.simplify_ids`.",
                    logger,
                )
            else:
                if datatype.format not in {
                        "[a-zA-Z0-9_\\-]+",
                        "[a-zA-Z0-9_-]+",
                        "[a-zA-Z0-9\\-_]+",
                        "[a-z0-9_]+",
                }:
                    log_or_raise(
                        f"Table {table.url} has a string ID column {id_column.name} with format {datatype.format}. "
                        f"I am too dumb to check whether that's a subset of [a-zA-Z0-9_-]+ (which is fine) "
                        f"or not (in which case maybe change it).",
                        logger,
                    )

        elif datatype.base == "integer":
            logger.info(
                "Table %s has integer ID column %s. This is okay, I hope I will not mess it up.",
                table.url,
                id_column.name,
            )

        # IDs should be primary keys, and primary keys should be IDs (not an official part of the CLDF specs)
        if table.tableSchema.primaryKey != [id_column.name]:
            log_or_raise(
                f"Table {table.url} has ID column {id_column.name}, but primary key {table.tableSchema.primaryKey}",
                logger,
            )
            correct = False

    return correct
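
# A minimal usage sketch, assuming `check_id_format` (defined above) is in scope;
# the metadata file name is made up for illustration.
import pycldf

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
if not check_id_format(dataset):
    # At least one table has a missing, unconstrained, or non-primary-key ID column.
    print("ID columns need attention; see the log messages above.")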
Example #4
def clean_forms(
    table: t.Iterable[R],
    form_column_name="form",
    variants_column_name="variants",
    split_at=[",", ";"],
    split_at_and_keep=["~"],
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterator[R]:
    """Split all forms that contain separators into form+variants.

    >>> for row in clean_forms([
    ...   {'F': 'a ~ æ', 'V': []},
    ...   {'F': 'bə-, be-', 'V': ['b-']}],
    ...   "F", "V"):
    ...   print(row)
    {'F': 'a', 'V': ['~æ']}
    {'F': 'bə-', 'V': ['b-', 'be-']}

    """
    for r, row in enumerate(table):
        forms = [("", row[form_column_name])]
        for separator in split_at:
            forms = [("", form.strip()) for _, chunk in forms
                     for form in chunk.split(separator)]
        for separator in split_at_and_keep:
            forms = [(first_separator if f == 0 else separator, form.strip())
                     for first_separator, chunk in forms
                     for f, form in enumerate(chunk.split(separator))]

        if len(forms) > 1:
            logger.info(
                "Line %d: Split form '%s' into %d elements.",
                r,
                row[form_column_name],
                len(forms),
            )
            if forms[0][0]:
                logger.warning(
                    "First element was marked as variant using %s, ignoring the marker",
                    forms[0][0],
                )
            row[form_column_name] = forms[0][1]
            row[variants_column_name].extend(
                [f"{separator}{form}" for separator, form in forms[1:]])
        yield row
Example #5
def replace_column(
    dataset: pycldf.Dataset,
    original: str,
    replacement: str,
    column_replace: bool,
    smush: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> None:
    # add Status_column if not existing and status update given
    if status_update:
        add_status_column_to_table(dataset=dataset,
                                   table_name="ParameterTable")

    if column_replace:
        assert (
            original == "id"
            or original == dataset["ParameterTable", "id"].name
        ), f"Replacing an entire column is only meaningful when you change the #id column ({dataset['ParameterTable', 'id'].name}) of the ConceptTable."

        c_id = dataset["ParameterTable", original].name
        c_new = dataset["ParameterTable", replacement].name
        mapping = {
            concept[c_id]: concept[c_new]
            for concept in dataset["ParameterTable"]
        }
        assert smush or len(mapping) == len(
            set(mapping.values())
        ), "Would collapse some concepts that were distinct before! Add '--smush' if that is intended."
        # dataset["ParameterTable"].tableSchema.columns["c_id"]
        rename(dataset, mapping, logger, status_update=status_update)
    else:
        concepts = dataset["ParameterTable"]

        c_id = dataset["ParameterTable", "id"].name

        logger.info(f"Changing {c_id:} of ParameterTable…")
        dataset.write(ParameterTable=[
            substitute_many(r, [c_id], {original: replacement},
                            status_update=None) for r in concepts
        ])
        rename(dataset, {original: replacement},
               logger,
               status_update=status_update)
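
# A minimal usage sketch, assuming `replace_column` (defined above) is in scope;
# the concept IDs "hand_arm" and "hand" are made up for illustration. With
# column_replace=False this renames a single concept ID and propagates the change
# to all tables that reference it.
import pycldf

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
replace_column(
    dataset,
    original="hand_arm",
    replacement="hand",
    column_replace=False,
    smush=False,
    status_update="concept renamed",
)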
Example #6
def import_cognates_from_excel(
    ws: openpyxl.worksheet.worksheet.Worksheet,
    dataset: pycldf.Dataset,
    extractor: re.Pattern = re.compile("/(?P<ID>[^/]*)/?$"),
    logger: cli.logging.Logger = cli.logger,
) -> None:
    logger.info("Loading sheet…")
    logger.info(
        f"Importing cognate sets from sheet {ws.title}, into {dataset.tablegroup._fname}…"
    )

    row_header, _ = header_from_cognate_excel(ws, dataset, logger=logger)
    excel_parser_cognate = CognateEditParser(
        dataset,
        top=2,
        # When the dataset has cognateset comments, that column is not a header
        # column, so this value is one higher than the actual number of header
        # columns, so actually correct for the 1-based indices. When there is
        # no comment column, we need to compensate for the 1-based Excel
        # indices.
        cellparser=cell_parsers.CellParserHyperlink(dataset,
                                                    extractor=extractor),
        row_header=row_header,
        check_for_language_match=[dataset["LanguageTable", "name"].name],
        check_for_match=[dataset["FormTable", "id"].name],
        check_for_row_match=[dataset["CognatesetTable", "id"].name],
    )
    excel_parser_cognate.db.cache_dataset()
    excel_parser_cognate.db.drop_from_cache("CognatesetTable")
    excel_parser_cognate.db.drop_from_cache("CognateTable")
    logger.info("Parsing cognate Excel…")
    excel_parser_cognate.parse_cells(ws, status_update=None)
    excel_parser_cognate.db.write_dataset_from_cache(
        ["CognateTable", "CognatesetTable"])
Example #7
    def cache_dataset(self, logger: cli.logging.Logger = cli.logger):
        logger.info("Caching dataset into memory…")
        for table in self.dataset.tables:
            table_type = (table.common_props.get("dc:conformsTo", "").rsplit(
                "#", 1)[-1] or table.url)
            (id, ) = table.tableSchema.primaryKey
            # Extent may be wrong, but it's usually at least roughly correct
            # and a better indication of the table size than none at all.
            try:
                self.cache[table_type] = {
                    row[id]: row
                    for row in cli.tq(
                        table,
                        task="Cache the dataset",
                        total=table.common_props.get("dc:extent"),
                    )
                }
            except FileNotFoundError:
                self.cache[table_type] = {}

        for source in self.dataset.sources:
            self.source_ids.add(source.id)
Example #8
def filter(
    table: t.Iterable[R],
    column: str,
    filter: re.Pattern,
    invert: bool = False,
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterator[R]:
    """Return all rows matching a filter

    Match the filter regular expression and return all rows in the table where
    the filter matches the column. (Or all where it does not, if invert==True.)

    >>> list(filter([
    ...   {"C": "A"},
    ...   {"C": "An"},
    ...   {"C": "T"},
    ...   {"C": "E"},
    ... ], "C", re.compile("A"), invert=True))
    [{'C': 'T'}, {'C': 'E'}]

    """
    n_row = 0
    n_included = 0
    for row in table:
        n_row += 1
        # TODO: Treat list-valued columns better.
        string = str(row[column])
        row_matches = bool(filter.search(string))
        if row_matches ^ invert:
            n_included += 1
            yield row

    logger.info(
        "Filtered %d rows down to %d (%1.0f%%)",
        n_row,
        n_included,
        n_included / n_row * 100 if n_row else 0,
    )
Example #9
def add_concepticon_definitions(
    dataset: pycldf.Dataset,
    column_name: str = "Concepticon_Definition",
    logger: cli.logging.Logger = cli.logger,
) -> None:
    concepticon_ids = dataset.column_names.parameters.concepticonReference
    if concepticon_ids is None:
        logger.error(
            "Your concepts table has no #concepticonReference column, so I cannot add any definitions from Concepticon to it. Try running lexedata.edit.add_concepticon to have me guess those references."
        )
        return

    # Create a concepticon_definition column
    try:
        dataset["ParameterTable", column_name]
        logger.info("Overwriting existing {:} column in concepts table".format(
            column_name))
    except KeyError:
        dataset.add_columns("ParameterTable", column_name)
        dataset.write_metadata()
        # Now if this throws an exception, it's an unexpected exception.

    # write concepticon definitions
    write_back = []
    for row in cli.tq(
            dataset["ParameterTable"],
            task="Write concepts with concepticon definitions to dataset",
    ):
        try:
            row[column_name] = concepticon.api.conceptsets[
                row[concepticon_ids]].definition
        except KeyError:
            pass
        write_back.append(row)

    dataset.write(ParameterTable=write_back)
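
# A minimal usage sketch, assuming `add_concepticon_definitions` (defined above) is
# in scope and that the ParameterTable already carries a #concepticonReference
# column; the metadata path is made up for illustration.
import pycldf

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
# Writes Concepticon definitions into a "Concepticon_Definition" column.
add_concepticon_definitions(dataset)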
Example #10
def update_ids(
    ds: pycldf.Dataset,
    table: csvw.metadata.Table,
    mapping: t.Mapping[str, str],
    logger: cli.logging.Logger = cli.logger,
):
    """Update all IDs of the table in the database, also in foreign keys, according to mapping."""
    c_id = table.get_column("http://cldf.clld.org/v1.0/terms.rdf#id")
    rows = []
    for row in cli.tq(
            ds[table],
            task=f"Updating ids of {table.url.string}",
            total=ds[table].common_props.get("dc:extent"),
    ):
        row[c_id.name] = mapping.get(row[c_id.name], row[c_id.name])
        rows.append(row)
    logger.info(f"Writing {table.url.string} back to file…")
    table.write(rows)

    c_id.datatype.format = ID_FORMAT.pattern

    foreign_keys_to_here = {
        other_table.url.string: {
            foreign_key.columnReference[
                foreign_key.reference.columnReference.index(c_id.name)]
            for foreign_key in other_table.tableSchema.foreignKeys
            if foreign_key.reference.resource == table.url
            if c_id.name in foreign_key.reference.columnReference
        }
        for other_table in ds.tables
    }

    for other_table, columns in foreign_keys_to_here.items():
        if not columns:
            continue
        logger.info(
            f"Applying changed foreign key to columns {columns:} in {other_table:}…"
        )
        rows = []
        for row in cli.tq(
                ds[other_table],
                total=ds[other_table].common_props.get("dc:extent"),
                task="Replacing changed IDs",
        ):
            for column in columns:
                # TODO: is this enough to handle columns with a separator? like parameterReference in forms table
                if isinstance(row[column], list):
                    row[column] = [mapping.get(v, v) for v in row[column]]
                else:
                    row[column] = mapping.get(row[column], row[column])
            rows.append(row)
        logger.info(f"Writing {other_table} back to file…")
        ds[other_table].write(rows)

        for column in columns:
            ds[other_table, column].datatype = c_id.datatype
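
# A minimal usage sketch, assuming `update_ids` (defined above) is in scope; the
# language IDs are made up for illustration.
import pycldf

ds = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
# Rename one language ID and update every table whose foreign keys point to it.
update_ids(ds, ds["LanguageTable"], {"english1": "stan1293"})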
Example #11
def update_integer_ids(
    ds: pycldf.Dataset,
    table: csvw.metadata.Table,
    logger: cli.logging.Logger = cli.logger,
):
    """Update all IDs of the table in the database, also in foreign keys."""
    c_id = table.get_column("http://cldf.clld.org/v1.0/terms.rdf#id")
    max_id = 0
    no_integer_rows: t.Set[str] = set()
    # logger.info("Checking IDs that are already integers…")
    for row in cli.tq(
            ds[table],
            task="Checking IDs that are already integers…",
            total=ds[table].common_props.get("dc:extent"),
    ):
        try:
            max_id = max(int(row[c_id.name]), max_id)
        except ValueError:
            no_integer_rows.add(row[c_id.name])
    logger.info("Adding integer IDs to other rows…")

    mapping: t.Dict[str, int] = dict()
    rows: t.List[t.Dict[str, t.Any]] = []
    for row in cli.tq(
            ds[table],
            task="Updating integer ids",
            total=ds[table].common_props.get("dc:extent"),
    ):
        original = row[c_id.name]
        if row[c_id.name] in no_integer_rows:
            max_id += 1
            row[c_id.name] = max_id
        else:
            row[c_id.name] = int(row[c_id.name])
        mapping[original] = row[c_id.name]
        rows.append(row)
    logger.info(f"Writing {table.url.string} back to file…")
    table.write(rows)

    foreign_keys_to_here = {
        other_table.url.string: {
            foreign_key.columnReference[
                foreign_key.reference.columnReference.index(c_id.name)]
            for foreign_key in other_table.tableSchema.foreignKeys
            if foreign_key.reference.resource == table.url
            if c_id.name in foreign_key.reference.columnReference
        }
        for other_table in ds.tables
    }
    for other_table, columns in foreign_keys_to_here.items():
        if not columns:
            continue
        rows = []
        for row in cli.tq(
                ds[other_table],
                task=f"Applying changed foreign key to {other_table}…",
                total=ds[other_table].common_props.get("dc:extent"),
        ):
            for column in columns:
                # TODO: is this enough to handle columns with a separator? like parameterReference in forms table
                if isinstance(row[column], list):
                    row[column] = [mapping[v] for v in row[column]]
                else:
                    row[column] = mapping[row[column]]
            rows.append(row)

        for column in columns:
            ds[other_table, column].datatype = c_id.datatype

        logger.info(f"Writing {other_table} back to file…")

        ds[other_table].write(rows)
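
# A minimal usage sketch, assuming `update_integer_ids` (defined above) is in
# scope: renumber all cognateset IDs as integers and fix up the foreign keys.
import pycldf

ds = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
update_integer_ids(ds, ds["CognatesetTable"])
# Persist the adjusted column datatypes.
ds.write_metadata()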
Example #12
def read_single_excel_sheet(
    dataset: pycldf.Dataset,
    sheet: openpyxl.worksheet.worksheet.Worksheet,
    logger: cli.logging.Logger = cli.logger,
    match_form: t.Optional[t.List[str]] = None,
    entries_to_concepts: t.Mapping[str, str] = KeyKeyDict(),
    concept_column: t.Optional[str] = None,
    ignore_missing: bool = False,
    ignore_superfluous: bool = False,
    status_update: t.Optional[str] = None,
) -> t.Mapping[str, ImportLanguageReport]:
    report: t.Dict[str, ImportLanguageReport] = defaultdict(ImportLanguageReport)

    concept_columns: t.Tuple[str, str]
    if concept_column is None:
        concept_columns = (
            dataset["FormTable", "parameterReference"].name,
            dataset["FormTable", "parameterReference"].name,
        )
    else:
        concept_columns = (
            dataset["FormTable", "parameterReference"].name,
            concept_column,
        )
    db = DB(dataset)
    db.cache_dataset()
    # required cldf fields of a form
    c_f_id = db.dataset["FormTable", "id"].name
    c_f_language = db.dataset["FormTable", "languageReference"].name
    c_f_form = db.dataset["FormTable", "form"].name
    c_f_value = db.dataset["FormTable", "value"].name
    c_f_concept = db.dataset["FormTable", "parameterReference"].name
    if not match_form:
        match_form = [c_f_form, c_f_language]
    if not db.dataset["FormTable", c_f_concept].separator:
        logger.warning(
            "Your metadata does not allow polysemous forms. According to your specifications, "
            "identical forms with different concepts will always be considered homophones, not a single "
            "polysemous form. To include polysemous forms, add a separator to your FormTable #parameterReference "
            "in the Metadata.json To find potential polysemies, run lexedata.report.list_homophones."
        )
        match_form.append(c_f_concept)
    else:
        if c_f_concept in match_form:
            logger.info(
                "Matching by concept enabled: To find potential polysemies, run lexedata.report.list_homophones."
            )

    sheet_header = get_headers_from_excel(sheet)
    form_header = list(db.dataset["FormTable"].tableSchema.columndict.keys())

    # These columns don't need to be given, we can infer them from the sheet title and from the other data:
    implicit: t.Dict[Literal["languageReference", "id", "value"], str] = {}
    if c_f_language not in sheet_header:
        implicit["languageReference"] = c_f_language
    if c_f_id not in sheet_header:
        implicit["id"] = c_f_id
    if c_f_value not in sheet_header:
        implicit["value"] = c_f_value

    found_columns = set(sheet_header) - {concept_column} - set(implicit.values())
    expected_columns = set(form_header) - {c_f_concept} - set(implicit.values())
    if not found_columns >= expected_columns:
        if ignore_missing:
            logger.info(
                f"Your Excel sheet {sheet.title} is missing columns {expected_columns - found_columns}. "
                f"For the newly imported forms, these columns will be left empty in the dataset."
            )
        else:
            raise ValueError(
                f"Your Excel sheet {sheet.title} is missing columns {expected_columns - found_columns}. "
                f"Clean up your data, or use --ignore-missing-excel-columns to import anyway and leave these "
                f"columns empty in the dataset for the newly imported forms."
            )
    if not found_columns <= expected_columns:
        if ignore_superfluous:
            logger.info(
                f"Your Excel sheet {sheet.title} contained unexpected columns "
                f"{found_columns - expected_columns}. These columns will be ignored."
            )
        else:
            raise ValueError(
                f"Your Excel sheet {sheet.title} contained unexpected columns "
                f"{found_columns - expected_columns}. Clean up your data, or use "
                f"--ignore-superfluous-excel-columns to import the data anyway and ignore these columns."
            )
    # check if language exist
    c_l_name = db.dataset["LanguageTable", "name"].name
    c_l_id = db.dataset["LanguageTable", "id"].name
    language_name_to_language_id = {
        row[c_l_name]: row[c_l_id] for row in db.cache["LanguageTable"].values()
    }
    language_name = normalize_string(sheet.title)
    if language_name in language_name_to_language_id:
        language_id = language_name_to_language_id[language_name]
        report[language_id].is_new_language = False
    else:
        language_id = language_name
        report[language_id].is_new_language = True

    # read new data from sheet
    for form in cli.tq(
        import_data_from_sheet(
            sheet,
            sheet_header=sheet_header,
            implicit=implicit,
            language_id=language_id,
            concept_column=concept_columns,
        ),
        task=f"Parsing cells of sheet {sheet.title}",
        total=sheet.max_row,
    ):
        # if concept not in dataset, don't add form
        try:
            concept_entry = form[c_f_concept]
            entries_to_concepts[concept_entry]
        except KeyError:
            logger.warning(
                f"Concept {concept_entry} was not found. Please add it to the concepts.csv file manually. "
                f"The corresponding form was ignored and not added to the dataset."
            )
            report[language_id].skipped += 1
            continue
        # else, look for candidates, link to existing form or add new form
        for item, value in form.items():
            try:
                sep = db.dataset["FormTable", item].separator
            except KeyError:
                continue
            if sep is None:
                continue
            form[item] = value.split(sep)
        form_candidates = db.find_db_candidates(form, match_form)
        if form_candidates:
            new_concept_added = False
            for form_id in form_candidates:
                logger.info(f"Form {form[c_f_value]} was already in dataset.")

                if db.dataset["FormTable", c_f_concept].separator:
                    for new_concept in form[c_f_concept]:
                        if (
                            new_concept
                            not in db.cache["FormTable"][form_id][c_f_concept]
                        ):
                            db.cache["FormTable"][form_id][c_f_concept].append(
                                new_concept
                            )
                            logger.info(
                                f"New form-concept association: Concept {form[c_f_concept]} was added to existing form "
                                f"{form_id}. If this was not intended "
                                f"(because it is a homophonous form, not a polysemy), "
                                f"you need to manually remove that concept from the old form in forms.csv "
                                f"and create a separate new form. If you want to treat identical forms "
                                f"as homophones in general, add  "
                                f"--match-forms={' '.join(match_form)}, "
                                f"{db.dataset['FormTable', 'parameterReference']} "
                                f"when you run this script."
                            )
                            new_concept_added = True
                break

            if new_concept_added:
                report[language_id].concepts += 1
            else:
                report[language_id].existing += 1
        else:
            # No existing form matched, so add this as a new form; its ID is made
            # unique by appending an integer suffix if necessary.
            form[c_f_language] = language_id
            if "id" in implicit:
                # TODO: check for type of form id column
                form_concept = form[c_f_concept]
                concept_reference = (
                    form_concept[0] if isinstance(form_concept, list) else form_concept
                )
                form[c_f_id] = string_to_id(f"{form[c_f_language]}_{concept_reference}")
            db.make_id_unique(form)
            if status_update:
                form["Status_Column"] = status_update
            db.insert_into_db(form)
            report[language_id].new += 1
    # write to cldf
    db.write_dataset_from_cache()
    return report
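
# A minimal usage sketch, assuming `read_single_excel_sheet` (defined above) is in
# scope, that each worksheet holds the forms of one language, and that the file
# names and the "Concept" column are made up for illustration.
import openpyxl
import pycldf

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
workbook = openpyxl.load_workbook("new_forms.xlsx")
report = {}
for sheet in workbook.worksheets:
    report.update(read_single_excel_sheet(dataset, sheet, concept_column="Concept"))
# `report` now maps language IDs to ImportLanguageReport counts (new, existing, …).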
Example #13
def add_metadata(fname: Path, logger: cli.logging.Logger = cli.logger):
    if fname.name != "forms.csv":
        cli.Exit.CLI_ARGUMENT_ERROR(
            "A metadata-free Wordlist must be in a file called 'forms.csv'.")
    default_wordlist = TableGroup.from_file(
        pycldf.util.pkg_path("modules", "Wordlist-metadata.json"))
    default_wordlist._fname = fname.with_name("Wordlist-metadata.json")
    ds = pycldf.Wordlist(default_wordlist)

    # `from_data` checks that the required columns of the FormTable are present,
    # but it does not consolidate the columns further.

    colnames = next(iterrows(fname))

    understood_colnames = {
        c.name
        for c in ds[ds.primary_table].tableSchema.columns if c.name in colnames
    }
    more_columns = {
        c.propertyUrl.uri: c
        for c in ds[ds.primary_table].tableSchema.columns
        if c.name not in understood_colnames
    }
    logger.info(
        "CLDF freely understood the columns %s in your forms.csv.",
        sorted(understood_colnames),
    )

    # Consider the columns that were not understood.
    columns_without_metadata = set(colnames) - understood_colnames
    for column_name in columns_without_metadata:
        column: Column
        # Maybe they are known CLDF properties?
        if column_name in pycldf.terms.TERMS:
            column = pycldf.TERMS[column_name].to_column()
        # Maybe they are CLDF default column names?
        elif column_name in DEFAULT_NAME_COLUMNS:
            column = DEFAULT_NAME_COLUMNS[column_name]
        # Maybe they are columns that Lexedata knows to handle?
        elif column_name in LEXEDATA_COLUMNS:
            column = LEXEDATA_COLUMNS[column_name]
        # Maybe they are columns inherited from LingPy?
        elif column_name.upper() in LINGPY_COLUMNS:
            column = LINGPY_COLUMNS[column_name.upper()]
        # Maybe they are some name we have seen before?
        elif column_name in OTHER_KNOWN_COLUMNS:
            column = OTHER_KNOWN_COLUMNS[column_name]
        else:
            # TODO: Maybe they look like they have a specific type?
            ...
            # Otherwise, they are probably just text to be kept.
            column = Column(
                datatype=Datatype(base="string"),
                default="",
                null=[""],
                name=column_name,
            )
        column.name = column_name

        ds[ds.primary_table].tableSchema.columns.append(column)
        summary = column.propertyUrl or column.datatype
        logger.info(f"Column {column_name} seems to be a {summary} column.")
        if column.propertyUrl:
            to_be_replaced = more_columns.pop(column.propertyUrl.uri, None)
            if to_be_replaced is not None:
                ds[ds.primary_table].tableSchema.columns.remove(to_be_replaced)

    for column in more_columns.values():
        logger.info(
            f"Also added column {column.name}, as expected for a FormTable.")

    ds[ds.primary_table].tableSchema.columns.sort(
        key=lambda k: colnames.index(k.name) if k.name in colnames else 1e10)

    # TODO: Once lexedata is properly published, we can give a better URL.
    ds.properties["dc:contributor"] = [
        "https://github.com/Anaphory/lexedata/blob/master/src/lexedata/edit/add_metadata.py"
    ]
    return ds
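
# A minimal usage sketch, assuming `add_metadata` (defined above) is in scope:
# derive a Wordlist-metadata.json for a metadata-free forms.csv and write it out.
from pathlib import Path

ds = add_metadata(Path("forms.csv"))
ds.write_metadata()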
Example #14
def merge_forms(
    data: types.Wordlist[types.Language_ID, types.Form_ID, types.Parameter_ID,
                         types.Cognate_ID, types.Cognateset_ID, ],
    mergers: t.Mapping[str, Merger],
    homophone_groups: t.MutableMapping[types.Form_ID,
                                       t.Sequence[types.Form_ID]],
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterable[types.Form]:
    """Merge forms from a dataset.

    TODO: Construct an example that shows that the order given in
    `homophone_groups` is maintained.

    Side Effects
    ============
    Changes homophone_groups:
        Groups that are skipped are removed

    """
    merge_targets = {
        variant: target
        for target, variants in homophone_groups.items()
        for variant in variants
    }
    for target in homophone_groups:
        assert merge_targets[target] == target

    c_f_id = data["FormTable", "id"].name

    buffer: t.Dict[types.Form_ID, types.Form] = {}

    unknown = set()
    form: types.Form
    for form in cli.tq(
            data["FormTable"],
            task="Going through forms and merging",
            logger=logger,
            total=data["FormTable"].common_props.get("dc:extent"),
    ):
        id: types.Form_ID = form[c_f_id]
        buffer[id] = form
        if id in merge_targets:
            unknown.add(id)
            target_id = merge_targets[id]
            group = homophone_groups[target_id]
            if all(i in buffer for i in group):
                try:
                    buffer[target_id] = merge_group(
                        [buffer[i] for i in group],
                        buffer[target_id].copy(),  # type: ignore
                        mergers,
                        data,
                        logger,
                    )

                    for i in group:
                        if i != target_id:
                            del buffer[i]
                except Skip:
                    logger.info(
                        f"Merging form {id} with forms {[f[c_f_id] for f in group]} was skipped."
                    )
                    del homophone_groups[target_id]
                for i in group:
                    unknown.remove(i)

        for f in list(buffer):
            if f in unknown:
                break
            yield buffer.pop(f)
Example #15
    def __init__(
        self,
        dataset: pycldf.Dataset,
        element_semantics: t.Iterable[t.Tuple[str, str, str, bool]] = [
            # ("[", "]", "phonetic", True),
            ("<", ">", "form", True),
            # ("/", "/", "phonemic", True),
            ("(", ")", "comment", False),
            ("{", "}", "source", False),
        ],
        separation_pattern: str = r"([;,])",
        variant_separator: t.Optional[t.List[str]] = ["~", "%"],
        add_default_source: t.Optional[str] = "{1}",
        logger: cli.logging.Logger = cli.logger,
    ):
        super().__init__(dataset)

        # Columns implied by element semantics
        self.bracket_pairs = {
            start: end
            for start, end, _, _ in element_semantics
        }
        self.element_semantics = {
            start: (term, transcription)
            for start, _, term, transcription in element_semantics
        }
        for start, end, term, transcription in element_semantics:
            # Ensure that all terms required by the element semantics are fields we can write to.
            self.cc(short=term, long=("FormTable", term), dataset=dataset)
        assert self.transcriptions, (
            "Your metadata json file and your cell parser don’t match: Your cell parser "
            f"{self.__class__.__name__} expects to work with transcriptions "
            "(at least one of 'orthographic', 'phonemic', and 'phonetic') to derive a #form "
            "in #FormTable, but your metadata defines no such column.")

        # Columns necessary for word list
        self.cc(short="source", long=("FormTable", "source"), dataset=dataset)
        self.cc(short="comment",
                long=("FormTable", "comment"),
                dataset=dataset)

        try:
            self.comment_separator = dataset["FormTable",
                                             "comment"].separator or "\t"
        except KeyError:
            logger.info("No #comment column found.")
            self.comment_separator = ""

        try:
            # As long as there is no CLDF term #variants, this will either be
            # 'variants' or raise a KeyError. However, it is a transparent
            # re-use of an otherwise established idiom in this module, so we
            # use this minor overhead.
            self.c["variants"] = dataset["FormTable", "variants"].name
        except KeyError:
            logger.warning(
                "No 'variants' column found for FormTable in Wordlist-metadata.json. "
                "Form variants will be added to #comment.")

        # Other class attributes
        self.separation_pattern = separation_pattern
        self.variant_separator = variant_separator
        self.add_default_source = add_default_source
Example #16
def load_forms_from_tsv(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    input_file: Path,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[
    t.Mapping[int, t.Sequence[t.Tuple[types.Form_ID, range, t.Sequence[str]]]],
    t.Set[types.Form_ID],
]:
    """

    Side effects
    ============
    This function overwrites dataset's FormTable
    """
    input = csv.DictReader(
        input_file.open(encoding="utf-8"),
        delimiter="\t",
    )

    # These days, all dicts are ordered by default. Still, better make this explicit.
    forms = util.cache_table(dataset)

    edictor_cognatesets: t.Dict[
        int, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = collections.defaultdict(list)

    form_table_upper = {
        (util.cldf_property(column.propertyUrl) or column.name).upper(): (
            util.cldf_property(column.propertyUrl) or column.name
        )
        for column in dataset["FormTable"].tableSchema.columns
    }
    form_table_upper.update(
        {
            "DOCULECT": "languageReference",
            "CONCEPT": "parameterReference",
            "IPA": "form",
            "COGID": "cognatesetReference",
            "ALIGNMENT": "alignment",
            "TOKENS": "segments",
            "CLDF_ID": "id",
            "ID": "",
        }
    )
    if "_PARAMETERREFERENCE" in [f.upper() for f in input.fieldnames]:
        form_table_upper["_PARAMETERREFERENCE"] = "parameterReference"
        form_table_upper["CONCEPT"] = ""

    separators: t.MutableMapping[str, t.Optional[str]] = {}
    # TODO: What's the logic behind going backwards through this? We are not modifying fieldnames.
    for i in range(len(input.fieldnames)):
        if i == 0 and input.fieldnames[0] != "ID":
            raise ValueError(
                "When importing from Edictor, expected the first column to be named 'ID', "
                f"but found {input.fieldnames[0]!r}."
            )

        lingpy = input.fieldnames[i]
        try:
            input.fieldnames[i] = form_table_upper[lingpy.upper()]
        except KeyError:
            logger.warning(
                "Your edictor file contained a column %s, which I could not interpret.",
                lingpy,
            )

        if input.fieldnames[i] == "cognatesetReference":
            separators[input.fieldnames[i]] = " "
        elif input.fieldnames[i] == "alignment":
            separators[input.fieldnames[i]] = " "

        try:
            separators[input.fieldnames[i]] = dataset[
                "FormTable", input.fieldnames[i]
            ].separator
        except KeyError:
            pass

    logger.info(
        "The header of your edictor file will be interpreted as %s.", input.fieldnames
    )

    affected_forms: t.Set[types.Form_ID] = set()
    for line in cli.tq(
        input, task="Importing form rows from edictor…", total=len(forms)
    ):
        # Column "" is the re-named Lingpy-ID column, so the first one.
        if not any(line.values()) or line[""].startswith("#"):
            # One of Edictor's comment rows, storing settings
            continue

        for (key, value) in line.items():
            value = value.replace("\\!t", "\t").replace("\\!n", "\n")
            sep = separators.get(key)
            if sep is not None:
                if not value:
                    line[key] = []
                else:
                    line[key] = value.split(sep)
            else:
                line[key] = value

        affected_forms.add(line["id"])

        try:
            for segments, cognateset, alignment in extract_partial_judgements(
                line["segments"],
                line["cognatesetReference"],
                line["alignment"],
                logger,
            ):
                edictor_cognatesets[cognateset].append(
                    (line["id"], segments, alignment)
                )
            forms[line["id"]] = line
        except IndexError:
            logger.warning(
                f"In form with Lingpy-ID {line['']}: Cognateset judgements {line['cognatesetReference']} and alignment {line['alignment']} did not match. At least one morpheme skipped."
            )
    edictor_cognatesets.pop(0, None)

    columns = {
        (util.cldf_property(column.propertyUrl) or column.name): column.name
        for column in dataset["FormTable"].tableSchema.columns
    }
    # Deliberately make use of the property of `write` to discard any entries
    # that don't correspond to existing columns. Otherwise, we'd still have to
    # get rid of the alignment, cognatesetReference and Lingpy-ID columns.
    dataset["FormTable"].write(
        (
            {
                columns[property]: value
                for property, value in form.items()
                if columns.get(property)
            }
            for form in forms.values()
        )
    )
    return edictor_cognatesets, affected_forms
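
# A minimal usage sketch, assuming `load_forms_from_tsv` (defined above) is in
# scope and that an Edictor export lives at the made-up path "edictor-export.tsv".
# Note the side effect: the dataset's FormTable is overwritten from the TSV.
from pathlib import Path
import pycldf

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
cognatesets, affected_forms = load_forms_from_tsv(dataset, Path("edictor-export.tsv"))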