Example #1
    def cache_dataset(self, logger: cli.logging.Logger = cli.logger):
        logger.info("Caching dataset into memory…")
        for table in self.dataset.tables:
        # Tables without a dc:conformsTo declaration fall back to their URL:
        # rsplit on a string without "#" yields a one-element list, so [-1]
        # is the whole (empty) string and the `or` kicks in.
        table_type = (table.common_props.get("dc:conformsTo", "").rsplit(
            "#", 1)[-1] or table.url)
        (id,) = table.tableSchema.primaryKey
            # Extent may be wrong, but it's usually at least roughly correct
            # and a better indication of the table size than none at all.
            try:
                self.cache[table_type] = {
                    row[id]: row
                    for row in cli.tq(
                        table,
                        task="Cache the dataset",
                        total=table.common_props.get("dc:extent"),
                    )
                }
            except FileNotFoundError:
                self.cache[table_type] = {}

        for source in self.dataset.sources:
            self.source_ids.add(source.id)
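A minimal usage sketch, assuming a hypothetical `db` instance of the surrounding class whose `dataset`, `cache`, and `source_ids` attributes are already initialized; once cached, rows are looked up by primary key without re-reading the CSV files:

# Hypothetical usage of the method above.
db.cache_dataset()
row = db.cache["FormTable"]["some_form_id"]  # one row dict per primary key
known = "some_source_id" in db.source_ids    # source IDs are cached, too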
Example #2
    def parse_all_languages(
            self,
            sheet: openpyxl.worksheet.worksheet.Worksheet) -> t.Dict[str, str]:
        """Parse all language descriptions in the focal sheet.

        Returns
        =======
        languages: A dictionary mapping columns ("B", "C", "D", …) to language IDs
        """
        languages_by_column: t.Dict[str, str] = {}
        # Resolve the column name of the LanguageTable ID once, outside the loop.
        c_l_id = self.db.dataset["LanguageTable", "id"].name
        # iterate over language columns
        for lan_col in cli.tq(
                sheet.iter_cols(min_row=1,
                                max_row=self.top - 1,
                                min_col=self.left),
                task="Parse all languages",
                total=sheet.max_column - self.left + 1,
        ):
            if cells_are_empty(lan_col):
                # Skip empty languages
                continue
            language = self.language_from_column(lan_col)
            candidates = self.db.find_db_candidates(
                language,
                self.check_for_language_match,
            )
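            # for…else idiom: take the first candidate, if there is one; the
            # else branch below runs only when no candidate was found.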
            for language_id in candidates:
                break
            else:
                if self.on_language_not_found(language, lan_col[0].coordinate):
                    self.db.insert_into_db(language)
                else:
                    continue
                language_id = language[c_l_id]
            languages_by_column[lan_col[0].column] = language_id

        return languages_by_column
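The return value maps spreadsheet column letters to language IDs. A hedged sketch, with a hypothetical importer instance and openpyxl worksheet:

# Hypothetical usage of the method above.
languages = importer.parse_all_languages(sheet)
# e.g. {"B": "ache", "C": "paraguayan_guarani", ...}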
Example #3
def count_segments(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID],
    languages: t.Container[types.Language_ID],
):
    c_f_language = dataset["FormTable", "languageReference"].name
    try:
        c_f_segments = dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            """Segment invertories report requires your dataset to have segments in the FormTable.
        Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
        )
    counter: t.MutableMapping[types.Language_ID,
                              t.Counter[str]] = t.DefaultDict(t.Counter)
    for form in cli.tq(
            dataset["FormTable"],
            total=dataset["FormTable"].common_props.get("dc:extent"),
            task="Reading all forms",
    ):
        if form[c_f_language] in languages:
            counter[form[c_f_language]].update(form[c_f_segments])
    return counter
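A hedged usage sketch; `dataset` and the language ID are placeholders. The returned mapping holds one Counter per language, so an inventory can be listed by frequency:

# Hypothetical usage: print one language's segment inventory.
counts = count_segments(dataset, {"ache"})
for segment, frequency in counts["ache"].most_common():
    print(segment, frequency)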
Example #4
def cache_table(
    dataset,
    table: t.Optional[str] = None,
    columns: t.Optional[t.Mapping[str, str]] = None,
    index_column: str = "id",
    filter: t.Callable[[t.Mapping[str, t.Any]], bool] = lambda e: True,
) -> t.Mapping[str, t.Mapping[str, t.Any]]:
    """Load a dataset table into memory as a dictionary of dictionaries.

    If the table is unspecified, use the primary table of the dataset.

    If the columns are unspecified, read each row completely, into a dictionary
    indexed by the local CLDF properties of the table.

    Examples
    ========

    >>> ds = fs.new_wordlist(FormTable=[{
    ...  "ID": "ache_one",
    ...  "Language_ID": "ache",
    ...  "Parameter_ID": "one",
    ...  "Form": "e.ta.'kɾã",
    ...  "variants": ["~[test phonetic variant]"]
    ... }])
    >>> forms = cache_table(ds)
    >>> forms["ache_one"]["languageReference"]
    'ache'
    >>> forms["ache_one"]["form"]
    "e.ta.'kɾã"
    >>> forms["ache_one"]["variants"]
    ['~[test phonetic variant]']

    We can also use it to look up a specific set of columns, and change the index column.
    This allows us, for example, to get language IDs by name:
    >>> _ = ds.add_component("LanguageTable")
    >>> ds.write(LanguageTable=[
    ...     ['ache', 'Aché', 'South America', -25.59, -56.47, "ache1246", "guq"],
    ...     ['paraguayan_guarani', 'Paraguayan Guaraní', None, None, None, None, None]])
    >>> languages = cache_table(ds, "LanguageTable", {"id": "ID"}, index_column="Name")
    >>> languages == {'Aché': {'id': 'ache'},
    ...               'Paraguayan Guaraní': {'id': 'paraguayan_guarani'}}
    True

    If several rows share the same index value, rows later in the file
    overwrite earlier ones.

    """
    if table is None:
        table = dataset.primary_table
    assert (
        table
    ), "If your dataset has no primary table, you must specify which table to cache."
    if columns is None:
        columns = {(cldf_property(c.propertyUrl) if c.propertyUrl else c.name):
                   c.name
                   for c in dataset[table].tableSchema.columns}
    c_id = dataset[table, index_column].name
    return {
        row[c_id]: {prop: row[name]
                    for prop, name in columns.items()}
        for row in tq(
            dataset[table],
            task=f"Caching table {table}",
            total=dataset[table].common_props.get("dc:extent"),
        ) if filter(row)
    }
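The `filter` argument, which the doctests above leave at its default, restricts the cache to matching rows. Note that `filter` sees the raw row, keyed by column names rather than CLDF properties; a sketch reusing the wordlist `ds` from the doctest:

# Hypothetical usage: cache only the forms of one language.
ache_forms = cache_table(ds, filter=lambda row: row["Language_ID"] == "ache")
assert list(ache_forms) == ["ache_one"]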
Example #5
    def parse_cells(
        self,
        sheet: openpyxl.worksheet.worksheet.Worksheet,
        status_update: t.Optional[str] = None,
    ) -> None:
        languages = self.parse_all_languages(sheet)
        row_object: t.Optional[R] = None
        for row in cli.tq(
                sheet.iter_rows(min_row=self.top),
                task="Parsing cells",
                total=sheet.max_row - self.top + 1,
        ):
            row_header, row_forms = row[:self.left - 1], row[self.left - 1:]
            # Parse the row header, creating or retrieving the associated row
            # object (i.e. a concept or a cognateset)
            properties = self.properties_from_row(row_header)
            if properties:
                c_r_id = self.db.dataset[properties.__table__, "id"].name
                try:
                    c_r_name = self.db.dataset[properties.__table__,
                                               "name"].name
                except KeyError:
                    c_r_name = None
                similar = self.db.find_db_candidates(properties,
                                                     self.check_for_row_match)
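                # for…else idiom: re-use the first matching row ID, if any;
                # the else branch below handles the case of no match.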
                for row_id in similar:
                    properties[c_r_id] = row_id
                    break
                else:
                    if self.on_row_not_found(
                            properties, cell_identifier=row[0].coordinate):
                        if c_r_id not in properties:
                            properties[c_r_id] = string_to_id(
                                str(properties.get(c_r_name, "")))
                        self.db.make_id_unique(properties)
                        self.db.insert_into_db(properties)
                    else:
                        continue
                # If any field of properties is non-empty, it becomes the new
                # row object. This means that if there is no properties object,
                # or if it is empty, the previous row object is re-used. This
                # is intentional.
                if any(properties.values()):
                    row_object = properties

            if row_object is None:
                if any(c.value for c in row_forms):
                    raise AssertionError(
                        "Your first data row didn't have a name. "
                        "Please check your format specification or ensure the first row has a name."
                    )
                else:
                    continue
            # Parse the row, cell by cell
            for cell_with_forms in row_forms:
                try:
                    this_lan = languages[cell_with_forms.column]
                except KeyError:
                    continue

                # Parse the cell, which results (potentially) in multiple forms
                if row_object.__table__ == "FormTable":
                    raise NotImplementedError(
                        "TODO: I am confused why what I'm doing right now ever landed on my agenda, but you seem to have gotten me to attempt it. Please contact the developers and tell them what you did, so they can implement the thing you tried to do properly!"
                    )
                    # Unreachable until the NotImplementedError above is
                    # removed; kept as a sketch of the intended FormTable
                    # handling.
                    c_f_form = self.db.dataset[row_object.__table__,
                                               "form"].name
                for params in self.cell_parser.parse(
                        cell_with_forms,
                        this_lan,
                        f"{sheet.title}.{cell_with_forms.coordinate}",
                ):
                    if row_object.__table__ == "FormTable":
                        if params[c_f_form] == "?":
                            continue
                        else:
                            self.handle_form(
                                params,
                                row_object,
                                cell_with_forms,
                                this_lan,
                                status_update,
                            )
                    else:
                        self.handle_form(params, row_object, cell_with_forms,
                                         this_lan, status_update)
        self.db.commit()
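A hedged end-to-end sketch; the importer instance and the workbook path are placeholders:

# Hypothetical usage: parse one sheet, tagging imported rows with a status note.
workbook = openpyxl.load_workbook("wordlist.xlsx")
importer.parse_cells(workbook["Sheet1"], status_update="new import")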
Example #6
def load_forms_from_tsv(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    input_file: Path,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[
    t.Mapping[int, t.Sequence[t.Tuple[types.Form_ID, range, t.Sequence[str]]]],
    t.Set[types.Form_ID],
]:
    """

    Side effects
    ============
    This function overwrites dataset's FormTable
    """
    input = csv.DictReader(
        input_file.open(encoding="utf-8"),
        delimiter="\t",
    )

    # Dicts keep insertion order (guaranteed since Python 3.7), so the original
    # row order of the FormTable survives the round trip through this cache.
    forms = util.cache_table(dataset)

    edictor_cognatesets: t.Dict[
        int, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = collections.defaultdict(list)

    form_table_upper = {
        (util.cldf_property(column.propertyUrl) or column.name).upper(): (
            util.cldf_property(column.propertyUrl) or column.name
        )
        for column in dataset["FormTable"].tableSchema.columns
    }
    form_table_upper.update(
        {
            "DOCULECT": "languageReference",
            "CONCEPT": "parameterReference",
            "IPA": "form",
            "COGID": "cognatesetReference",
            "ALIGNMENT": "alignment",
            "TOKENS": "segments",
            "CLDF_ID": "id",
            "ID": "",
        }
    )
    if "_PARAMETERREFERENCE" in [f.upper() for f in input.fieldnames]:
        form_table_upper["_PARAMETERREFERENCE"] = "parameterReference"
        form_table_upper["CONCEPT"] = ""

    separators: t.MutableMapping[str, t.Optional[str]] = {}
    # Rename the Edictor/LingPy column headers to their CLDF property names,
    # modifying input.fieldnames in place.
    for i in range(len(input.fieldnames)):
        if i == 0 and input.fieldnames[0] != "ID":
            raise ValueError(
                "When importing from Edictor, expected the first column to be "
                f"named 'ID', but found {input.fieldnames[0]}"
            )

        lingpy = input.fieldnames[i]
        try:
            input.fieldnames[i] = form_table_upper[lingpy.upper()]
        except KeyError:
            logger.warning(
                "Your edictor file contained a column %s, which I could not interpret.",
                lingpy,
            )

        if input.fieldnames[i] in ("cognatesetReference", "alignment"):
            separators[input.fieldnames[i]] = " "

        try:
            separators[input.fieldnames[i]] = dataset[
                "FormTable", input.fieldnames[i]
            ].separator
        except KeyError:
            pass

    logger.info(
        "The header of your edictor file will be interpreted as %s.", input.fieldnames
    )

    affected_forms: t.Set[types.Form_ID] = set()
    for line in cli.tq(
        input, task="Importing form rows from edictor…", total=len(forms)
    ):
        # Column "" is the re-named Lingpy-ID column, so the first one.
        if not any(line.values()) or line[""].startswith("#"):
            # One of Edictor's comment rows, storing settings
            continue

        for (key, value) in line.items():
            value = value.replace("\\!t", "\t").replace("\\!n", "\n")
            # Columns that could not be interpreted never got a separator entry.
            sep = separators.get(key)
            if sep is not None:
                if not value:
                    line[key] = []
                else:
                    line[key] = value.split(sep)
            else:
                line[key] = value

        affected_forms.add(line["id"])

        try:
            for segments, cognateset, alignment in extract_partial_judgements(
                line["segments"],
                line["cognatesetReference"],
                line["alignment"],
                logger,
            ):
                edictor_cognatesets[cognateset].append(
                    (line["id"], segments, alignment)
                )
            forms[line["id"]] = line
        except IndexError:
            logger.warning(
                f"In form with Lingpy-ID {line['']}: Cognateset judgements {line['cognatesetReference']} and alignment {line['alignment']} did not match. At least one morpheme skipped."
            )
    edictor_cognatesets.pop(0, None)

    columns = {
        (util.cldf_property(column.propertyUrl) or column.name): column.name
        for column in dataset["FormTable"].tableSchema.columns
    }
    # Deliberately make use of the property of `write` to discard any entries
    # that don't correspond to existing columns. Otherwise, we'd still have to
    # get rid of the alignment, cognatesetReference and Lingpy-ID columns.
    dataset["FormTable"].write(
        (
            {
                columns[property]: value
                for property, value in form.items()
                if columns.get(property)
            }
            for form in forms.values()
        )
    )
    return edictor_cognatesets, affected_forms
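A hedged calling sketch; the file name is a placeholder and `dataset` is assumed to be an already loaded CLDF wordlist:

# Hypothetical usage: re-import an Edictor export, then inspect the judgements.
cognatesets, affected_forms = load_forms_from_tsv(dataset, Path("from_edictor.tsv"))
for cognateset_id, judgements in cognatesets.items():
    for form_id, segment_range, alignment in judgements:
        ...  # e.g. write the judgements back into the CognateTable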