Example #1
    def on_form_not_found(
        self,
        form: t.Dict[str, t.Any],
        cell_identifier: t.Optional[str] = None,
        language_id: t.Optional[str] = None,
        logger: cli.logging.Logger = cli.logger,
    ) -> bool:
        """Should I add a missing object? No, but inform the user.

        Send a warning (ObjectNotFoundWarning) reporting the missing object and cell.

        Returns
        =======
        False: The object should not be added.

        """
        rep = form.get("cldf_id", repr(form))
        logger.warning(
            f"Unable to find form {rep} in cell {cell_identifier} in the dataset. "
            f"This cognate judgement was skipped. "
            f"Please make sure that the form is present in forms.csv or in the file "
            f"used for the Wordlist importation.")
        # Do a fuzzy search
        for row in self.db.find_db_candidates(form,
                                              self.check_for_match,
                                              edit_dist_threshold=4):
            logger.info(f"Did you mean {row} ?")
        return False
Example #2
def header_from_cognate_excel(
    ws: openpyxl.worksheet.worksheet.Worksheet,
    dataset: pycldf.Dataset,
    logger: cli.logging.Logger = cli.logger,
):
    row_header = []
    separators = []
    for (header, ) in ws.iter_cols(
            min_row=1,
            max_row=1,
            max_col=len(dataset["CognatesetTable"].tableSchema.columns),
    ):
        column_name = header.value
        if column_name is None or column_name == "CogSet":
            column_name = dataset["CognatesetTable", "id"].name
        try:
            column_name = dataset["CognatesetTable", column_name].name
        except KeyError:
            break
        row_header.append(column_name)
        separators.append(dataset["CognatesetTable", column_name].separator)
        if column_name == dataset["CognatesetTable", "comment"].name:
            logger.warning(
                f"Your cognates table has a separate ‘{header.value}’ column for comments, "
                "but `lexedata.importer.cognates` expects to extract comments from the cell "
                "comments of the cognateset metadata columns, not from a separate column. "
                f"Your ‘{header.value}’ column will be ignored."
            )
    return row_header, separators
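A minimal usage sketch (file names hypothetical; assumes the active worksheet of the workbook holds the cognateset header in its first row):

import openpyxl
import pycldf

wb = openpyxl.load_workbook("cognates.xlsx")
dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
row_header, separators = header_from_cognate_excel(wb.active, dataset)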
Example #3
    def source_from_source_string(
        self,
        source_string: str,
        language_id: t.Optional[str],
        logger: cli.logging.Logger = cli.logger,
    ) -> str:
        """Parse a string referencing a language-specific source"""
        context: t.Optional[str]
        if ":" in source_string:
            source_part, context = source_string.split(":", maxsplit=1)
            if not context.endswith("}"):
                logger.warning(
                    f"In source {source_string}: Closing bracket '}}' is missing, split into source and page/context may be wrong"
                )
            source_string = source_part + "}"
            context = context[:-1].strip()

            context = context.replace(":", "").replace(",", "")
        else:
            context = None

        if source_string.startswith("{") and source_string.endswith("}"):
            source_string = source_string[1:-1]
        if language_id is None:
            source_id = string_to_id(source_string)
        else:
            source_id = string_to_id(f"{language_id:}_s{source_string:}")

        source_id = source_id.replace(":", "").replace(",", "")

        if context:
            return f"{source_id}[{context}]"
        else:
            return source_id
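Expected behaviour, as a hedged sketch (`parser` stands for an instance of the surrounding class; the exact IDs depend on string_to_id):

parser.source_from_source_string("{meier2004: 312}", language_id="duo")
# -> "duo_smeier2004[312]"
parser.source_from_source_string("meier2004", language_id=None)
# -> "meier2004"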
Example #4
def list_homophones(dataset: pycldf.Dataset,
                    out: io.TextIOBase,
                    logger: cli.logging.Logger = cli.logger) -> None:
    clics = load_clics()
    # warn if clics cannot be loaded
    if not clics:
        logger.warning(
            "Clics could not be loaded. Using an empty graph instead")
        clics = nx.Graph()

    c_id = dataset["ParameterTable", "id"].name
    try:
        c_concepticon = dataset["ParameterTable", "concepticonReference"].name
    except KeyError:
        cli.Exit.INVALID_DATASET(
            "This script requires a column concepticonReference in ParameterTable. "
            "Please run add_concepticon.py")
    concepticon = {}
    for concept in dataset["ParameterTable"]:
        concepticon[concept[c_id]] = concept[c_concepticon]

    f_id = dataset["FormTable", "id"].name
    f_lang = dataset["FormTable", "languageReference"].name
    f_concept = dataset["FormTable", "parameterReference"].name
    f_form = dataset["FormTable", "form"].name

    homophones: t.DefaultDict[str, t.DefaultDict[str, t.Set[t.Tuple[
        str, str]]]] = t.DefaultDict(lambda: t.DefaultDict(set))

    for form in dataset["FormTable"]:
        if form[f_form] == "-" or form[f_form] is None:
            continue
        if isinstance(form[f_concept], list):
            homophones[form[f_lang]][form[f_form]].add(
                tuple(form[f_concept]) + (form[f_id], ))
        else:
            homophones[form[f_lang]][form[f_form]].add(
                (form[f_concept], form[f_id]))
    for lang, forms in homophones.items():
        for form, meanings in forms.items():
            if len(meanings) == 1:
                continue
            clics_nodes = {concepticon.get(concept) for concept, _ in meanings}
            if None in clics_nodes:
                x = " (but at least one concept not found):"
            else:
                x = ":"
            clics_nodes -= {None}
            if len(clics_nodes) <= 1:
                x = "Unknown" + x
            elif nx.is_connected(clics.subgraph(clics_nodes)):
                x = "Connected" + x
            else:
                x = "Unconnected" + x
            line = f"{lang}, '{form}': {x}\n"
            for ele in sorted(meanings):
                line += f"\t {ele[-1]} ({', '.join(ele[0:-1])})\n"
            out.write(line)
Example #5
def add_central_concepts_to_cognateset_table(
    dataset: pycldf.Dataset,
    add_column: bool = True,
    overwrite_existing: bool = True,
    logger: cli.logging.Logger = cli.logger,
    status_update: t.Optional[str] = None,
) -> pycldf.Dataset:
    # create mapping cognateset to central concept
    try:
        clics: t.Optional[networkx.Graph] = load_clics()
    except FileNotFoundError:
        logger.warning("Clics could not be loaded.")
        clics = None
    concepts_of_cognateset: t.Mapping[
        CognatesetID, t.Counter[ConceptID]] = connected_concepts(dataset)
    central: t.MutableMapping[str, str] = {}
    if clics and dataset.column_names.parameters.concepticonReference:
        concept_to_concepticon = concepts_to_concepticon(dataset)
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts,
                                                  concept_to_concepticon,
                                                  clics)
    else:
        logger.warning(
            f"Dataset {dataset:} had no concepticonReference in a ParameterTable."
        )
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts, {}, None)
    dataset = reshape_dataset(dataset, add_column=add_column)
    c_core_concept = dataset.column_names.cognatesets.parameterReference
    if c_core_concept is None:
        raise ValueError(
            f"Dataset {dataset:} had no parameterReference column in a CognatesetTable"
            " and is thus not compatible with this script.")
    # if status update given, add status column
    if status_update:
        add_status_column_to_table(dataset=dataset,
                                   table_name="CognatesetTable")
    # write cognatesets with central concepts
    write_back = []
    for row in cli.tq(
            dataset["CognatesetTable"],
            task="Write cognatesets with central concepts to dataset",
            total=dataset["CognatesetTable"].common_props.get("dc:extent"),
    ):
        if not overwrite_existing and row[c_core_concept]:
            continue
        row[c_core_concept] = central.get(
            row[dataset.column_names.cognatesets.id])
        row["Status_Column"] = status_update
        write_back.append(row)
    dataset.write(CognatesetTable=write_back)
    return dataset
Example #6
    def separate(
        self,
        values: str,
        context: str = "",
        logger: cli.logging.Logger = cli.logger,
    ) -> t.Iterable[str]:
        """Separate different form descriptions in one string.

        Separate forms separated by comma or semicolon, unless the comma or
        semicolon occurs within a set of matching component delimiters (e.g.
        brackets).

        If the brackets don't match, the whole remainder string is passed on,
        so that the form parser can try to recover as much as possible or throw
        an exception.
        """
        raw_split = re.split(self.separation_pattern, values)
        if len(raw_split) <= 1:
            for form in raw_split:
                yield form
            return

        while len(raw_split) > 1:
            if check_brackets(raw_split[0], self.bracket_pairs):
                form = raw_split.pop(0).strip()
                if form:
                    yield form
                raw_split.pop(0)
            else:
                raw_split[:2] = ["".join(raw_split[:2])]
        if not check_brackets(raw_split[0], self.bracket_pairs):
            logger.warning(
                f"{context:}In values {values:}: "
                "Encountered mismatched closing delimiters. Please check that the "
                "separation of the cell into multiple entries, for different forms, was correct."
            )

        form = raw_split.pop(0).strip()
        if form:
            yield form
        assert not raw_split
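Intended splitting behaviour, sketched with a hypothetical `parser` whose separation pattern is the default `([;,])` and whose bracket pairs include "(" and ")":

list(parser.separate("mama, papa (we, inclusive); papa"))
# -> ["mama", "papa (we, inclusive)", "papa"]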
Example #7
def add_cognate_table(
    dataset: pycldf.Wordlist,
    split: bool = True,
    logger: cli.logging.Logger = cli.logger,
) -> None:
    if "CognateTable" in dataset:
        return
    dataset.add_component("CognateTable")

    # TODO: Check if that cognatesetReference is already a foreign key to
    # elsewhere (could be a CognatesetTable, could be whatever), because then
    # we need to transfer that knowledge.

    # Load anything that's useful for a cognate set table: Form IDs, segments,
    # segment slices, cognateset references, alignments
    columns = {
        "id": dataset["FormTable", "id"].name,
        "concept": dataset["FormTable", "parameterReference"].name,
        "form": dataset["FormTable", "form"].name,
    }
    for property in [
            "segments", "segmentSlice", "cognatesetReference", "alignment"
    ]:
        try:
            columns[property] = dataset["FormTable", property].name
        except KeyError:
            pass
    cognate_judgements = []
    forms = cache_table(dataset, columns=columns)
    forms_without_segments = 0
    for f, form in cli.tq(forms.items(),
                          task="Extracting cognate judgements from forms…"):
        if form.get("cognatesetReference"):
            if split:
                cogset = util.string_to_id("{:}-{:}".format(
                    form["concept"], form["cognatesetReference"]))
            else:
                cogset = form["cognatesetReference"]
            judgement = {
                "ID": f,
                "Form_ID": f,
                "Cognateset_ID": cogset,
            }
            try:
                judgement["Segment_Slice"] = form["segmentSlice"]
            except KeyError:
                try:
                    if not form["segments"]:
                        raise ValueError("No segments")
                    if ("+" in form["segments"]
                            and dataset["FormTable",
                                        "cognatesetReference"].separator):
                        logger.warning(
                            "You seem to have morpheme annotations in your cognates. I will probably mess them up a bit, because I have not been taught properly how to deal with them. Sorry!"
                        )
                    judgement["Segment_Slice"] = [
                        "1:{:d}".format(len(form["segments"]))
                    ]
                except (KeyError, TypeError, ValueError):
                    forms_without_segments += 1
                    if forms_without_segments < 5:
                        logger.warning(
                            f"No segments found for form {f} ({form['form']})."
                        )
            # What does an alignment mean without segments or their slices?
            # Doesn't matter, if we were given one, we take it.
            judgement["Alignment"] = form.get("alignment")
            cognate_judgements.append(judgement)

    if forms_without_segments >= 5:
        logger.warning(
            "No segments found for %d forms. You can generate segments using `lexedata.edit.segment_using_clts`.",
            forms_without_segments,
        )

    # Delete the cognateset column
    cols = dataset["FormTable"].tableSchema.columns
    remove = {
        dataset["FormTable", c].name
        for c in ["cognatesetReference", "segmentSlice", "alignment"]
        if ("FormTable", c) in dataset
    }

    def clean_form(form):
        for c in remove:
            form.pop(c, None)
        return form

    forms = [clean_form(form) for form in dataset["FormTable"]]
    for c in remove:
        ix = cols.index(dataset["FormTable", c])
        del cols[ix]

    dataset.write(FormTable=forms)

    dataset.write(CognateTable=cognate_judgements)
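A minimal usage sketch (path hypothetical). With split=True, cognateset IDs are qualified by concept, so identical cognate codes under different concepts become distinct cognate sets:

import pycldf

dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")
add_cognate_table(dataset, split=True)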
Example #8
def load_forms_from_tsv(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    input_file: Path,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[
    t.Mapping[int, t.Sequence[t.Tuple[types.Form_ID, range, t.Sequence[str]]]],
    t.Set[types.Form_ID],
]:
    """Load form rows from an Edictor TSV export.

    Side effects
    ============
    This function overwrites the dataset's FormTable.
    """
    input = csv.DictReader(
        input_file.open(encoding="utf-8"),
        delimiter="\t",
    )

    # These days, all dicts are ordered by default. Still, better make this explicit.
    forms = util.cache_table(dataset)

    edictor_cognatesets: t.Dict[
        int, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = collections.defaultdict(list)

    form_table_upper = {
        (util.cldf_property(column.propertyUrl) or column.name).upper(): (
            util.cldf_property(column.propertyUrl) or column.name
        )
        for column in dataset["FormTable"].tableSchema.columns
    }
    form_table_upper.update(
        {
            "DOCULECT": "languageReference",
            "CONCEPT": "parameterReference",
            "IPA": "form",
            "COGID": "cognatesetReference",
            "ALIGNMENT": "alignment",
            "TOKENS": "segments",
            "CLDF_ID": "id",
            "ID": "",
        }
    )
    if "_PARAMETERREFERENCE" in [f.upper() for f in input.fieldnames]:
        form_table_upper["_PARAMETERREFERENCE"] = "parameterReference"
        form_table_upper["CONCEPT"] = ""

    separators: t.MutableMapping[str, t.Optional[str]] = {}
    # Rename the Edictor/LingPy column headers to CLDF properties, in place.
    for i in range(len(input.fieldnames)):
        if i == 0 and input.fieldnames[0] != "ID":
            raise ValueError(
                "When importing from Edictor, expected the first column to be named 'ID', but found %s",
                input.fieldnames["ID"],
            )

        lingpy = input.fieldnames[i]
        try:
            input.fieldnames[i] = form_table_upper[lingpy.upper()]
        except KeyError:
            logger.warning(
                "Your edictor file contained a column %s, which I could not interpret.",
                lingpy,
            )

        if input.fieldnames[i] == "cognatesetReference":
            separators[input.fieldnames[i]] = " "
        elif input.fieldnames[i] == "alignment":
            separators[input.fieldnames[i]] = " "

        try:
            separators[input.fieldnames[i]] = dataset[
                "FormTable", input.fieldnames[i]
            ].separator
        except KeyError:
            pass

    logger.info(
        "The header of your edictor file will be interpreted as %s.", input.fieldnames
    )

    affected_forms: t.Set[types.Form_ID] = set()
    for line in cli.tq(
        input, task="Importing form rows from edictor…", total=len(forms)
    ):
        # Column "" is the re-named Lingpy-ID column, so the first one.
        if not any(line.values()) or line[""].startswith("#"):
            # One of Edictor's comment rows, storing settings
            continue

        for (key, value) in line.items():
            value = value.replace("\\!t", "\t").replace("\\!n", "\n")
            sep = separators.get(key)
            if sep is not None:
                if not value:
                    line[key] = []
                else:
                    line[key] = value.split(sep)
            else:
                line[key] = value

        affected_forms.add(line["id"])

        try:
            for segments, cognateset, alignment in extract_partial_judgements(
                line["segments"],
                line["cognatesetReference"],
                line["alignment"],
                logger,
            ):
                edictor_cognatesets[cognateset].append(
                    (line["id"], segments, alignment)
                )
            forms[line["id"]] = line
        except IndexError:
            logger.warning(
                f"In form with Lingpy-ID {line['']}: Cognateset judgements {line['cognatesetReference']} and alignment {line['alignment']} did not match. At least one morpheme skipped."
            )
    edictor_cognatesets.pop(0, None)

    columns = {
        (util.cldf_property(column.propertyUrl) or column.name): column.name
        for column in dataset["FormTable"].tableSchema.columns
    }
    # Deliberately make use of the property of `write` to discard any entries
    # that don't correspond to existing columns. Otherwise, we'd still have to
    # get rid of the alignment, cognatesetReference and Lingpy-ID columns.
    dataset["FormTable"].write(
        (
            {
                columns[property]: value
                for property, value in form.items()
                if columns.get(property)
            }
            for form in forms.values()
        )
    )
    return edictor_cognatesets, affected_forms
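A minimal usage sketch (paths hypothetical):

from pathlib import Path

import pycldf

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
cognatesets, affected_forms = load_forms_from_tsv(dataset, Path("edictor-export.tsv"))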
Example #9
def root_presence_code(
    dataset: t.Mapping[types.Language_ID,
                       t.Mapping[types.Parameter_ID,
                                 t.Set[types.Cognateset_ID]]],
    relevant_concepts: t.Mapping[types.Cognateset_ID,
                                 t.Iterable[types.Parameter_ID]],
    ascertainment: t.Sequence[Literal["0", "1", "?"]] = ["0"],
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[t.Mapping[types.Language_ID, t.List[Literal["0", "1", "?"]]],
             t.Mapping[types.Cognateset_ID, int]]:
    """Create a root-presence/absence coding from cognate codes in a dataset

    Take the cognate code information from a wordlist, i.e. a mapping of the
    form {Language ID: {Concept ID: {Cognateset ID}}}, and generate a binary
    alignment from it that lists for every root whether it is present in that
    language or not. Return that, and the association between cognatesets and
    characters.

    >>> alignment, roots = root_presence_code(
    ...     {"Language": {"Meaning": {"Cognateset 1"}}},
    ...     relevant_concepts={"Cognateset 1": ["Meaning"]})
    >>> alignment
    {'Language': ['0', '1']}
    >>> roots
    {'Cognateset 1': 1}

    The first entry in each sequence is always '0': The configuration where a
    form is absent from all languages is never observed, but always possible,
    so we add this entry for the purposes of ascertainment correction.

    If a root is attested at all, in any concept, it is considered present.
    Because the word list is never a complete description of the language's
    lexicon, the function employs a heuristic to generate ‘absent’ states.

    If a root is unattested, and at least half of the relevant concepts
    associated with this root are attested, but each expressed by another root,
    the root is assumed to be absent in the target language. (If there is
    exactly one central concept, then that central concept being attested or
    unknown is a special case of this general rule.) Otherwise the
    presence/absence of the root is considered unknown.

    >>> alignment, roots = root_presence_code(
    ...     {"l1": {"m1": {"c1"}},
    ...      "l2": {"m1": {"c2"}, "m2": {"c1", "c3"}}},
    ...     relevant_concepts={"c1": ["m1"], "c2": ["m1"], "c3": ["m2"]})
    >>> sorted(roots)
    ['c1', 'c2', 'c3']
    >>> sorted_roots = sorted(roots.items())
    >>> {language: [sequence[k[1]] for k in sorted_roots] for language, sequence in alignment.items()}
    {'l1': ['1', '0', '?'], 'l2': ['1', '1', '1']}
    >>> list(zip(*sorted(zip(*alignment.values()))))
    [('0', '0', '1', '?'), ('0', '1', '1', '1')]

    """
    all_roots: t.Set[types.Cognateset_ID] = set(relevant_concepts)
    language_roots: t.MutableMapping[
        types.Language_ID, t.Set[types.Cognateset_ID]] = t.DefaultDict(set)
    for language, lexicon in dataset.items():
        for concept, cognatesets in lexicon.items():
            if not cognatesets:
                logger.warning(
                    f"The root presence coder script got a language ({language}) with an improper lexicon: There is a form associated with Concept {concept}, but no cognate sets are associated with it."
                )
            for cognateset in cognatesets:
                language_roots[language].add(cognateset)

    all_roots_sorted: t.Sequence[types.Cognateset_ID] = sorted(all_roots)

    alignment = {}
    roots = {}
    for language, lexicon in dataset.items():
        alignment[language] = list(ascertainment)
        for root in all_roots_sorted:
            roots[root] = len(alignment[language])
            if root in language_roots[language]:
                alignment[language].append("1")
            else:
                n_concepts = 0
                n_filled_concepts = 0
                for concept in relevant_concepts[root]:
                    n_concepts += 1
                    if lexicon.get(concept):
                        n_filled_concepts += 1
                if 2 * n_filled_concepts >= n_concepts:
                    alignment[language].append("0")
                else:
                    alignment[language].append("?")

    return alignment, roots
Example #10
def read_wordlist(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    code_column: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> t.MutableMapping[types.Language_ID, t.MutableMapping[types.Parameter_ID,
                                                          t.Set]]:
    col_map = dataset.column_names

    if code_column:
        # Just in case that column was specified by property URL. We
        # definitely want the name. In any case, this will also throw a
        # helpful KeyError when the column does not exist.
        form_table_form = col_map.forms.form
        form_table_column = col_map.forms.id
        cognatesets = util.cache_table(
            dataset,
            columns={
                "form": form_table_column,
                "transcription": form_table_form,
                "code": dataset["FormTable", code_column].name,
            },
            filter=lambda row: bool(row[col_map.forms.form]),
        )
    else:
        # We search for cognatesetReferences in the FormTable or a separate
        # CognateTable.

        # Try the FormTable first.
        code_column = col_map.forms.cognatesetReference

        if code_column:
            # This is not the CLDF way, warn the user.
            form_table_column = col_map.forms.id
            form_table_form = col_map.forms.form
            logger.warning(
                "Your dataset has a cognatesetReference in the FormTable. Consider running lexedata.edit.add_cognate_table to create an explicit cognate table."
            )
            cognatesets = util.cache_table(
                dataset,
                columns={
                    "form": form_table_column,
                    "transcription": form_table_form,
                    "code": code_column,
                },
            )
        else:
            # There was no cognatesetReference in the form table. If we
            # find them in CognateTable (I mean, they should be there!), we
            # store them keyed with formReference.
            if (col_map.cognates and col_map.cognates.cognatesetReference
                    and col_map.cognates.formReference):
                code_column = col_map.cognates.cognatesetReference
                form_reference = col_map.cognates.formReference
                (foreign_key, ) = [
                    key
                    for key in dataset["CognateTable"].tableSchema.foreignKeys
                    if key.columnReference == [form_reference]
                ]
                (form_table_column, ) = foreign_key.reference.columnReference
                cognatesets = util.cache_table(
                    dataset,
                    "CognateTable",
                    {
                        "form": form_reference,
                        "code": code_column
                    },
                )
            else:
                raise ValueError(
                    "Dataset has no cognatesetReference column in its "
                    "primary table or in a separate cognate table. "
                    "Is this a metadata-free wordlist and you forgot to "
                    "specify code_column explicitly?")

    # Cognate sets have been loaded. Consolidate.
    cognates_by_form: t.MutableMapping[
        types.Form_ID, t.Set[types.Cognateset_ID]] = t.DefaultDict(set)
    for judgement in cognatesets.values():
        cognates_by_form[judgement["form"]].add(judgement["code"])
    parameter_column = col_map.forms.parameterReference

    # If one form can have multiple concepts,
    if dataset["FormTable", parameter_column].separator:

        def all_parameters(parameter):
            return list(parameter)

    else:

        def all_parameters(parameter):
            return [parameter]

    data: t.MutableMapping[types.Language_ID,
                           t.MutableMapping[types.Parameter_ID, t.Set]]
    if "LanguageTable" in dataset:
        (langref_target, ) = [
            key for key in dataset["FormTable"].tableSchema.foreignKeys
            if key.columnReference ==
            [dataset["FormTable", "languageReference"].name]
        ]
        ref_col = langref_target.reference.columnReference[0]
        data = {
            lang[ref_col]: t.DefaultDict(set)
            for lang in dataset["LanguageTable"]
        }
    else:
        data = t.DefaultDict(lambda: t.DefaultDict(set))
    for row in dataset["FormTable"].iterdicts():
        if not row[col_map.forms.form]:
            # Transcription is empty, should not be a form. Skip, but maybe
            # warn if it was in a cognateset.
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as empty (i.e. the source noted that the form is unknown), but it was judged to be in cognateset %s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    cognates_by_form[row[form_table_column]],
                )
            continue

        language = row[col_map.forms.languageReference]
        if row[col_map.forms.form] == "-":
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as '-' (i.e. “concept is not available in language %s”), but it was judged to be in cognateset %s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    language,
                    cognates_by_form[row[form_table_column]],
                )
                cognates_by_form[row[form_table_column]] = set()
            for parameter in all_parameters(row[parameter_column]):
                if data[language][parameter]:
                    logger.warning(
                        "Form %s claims concept %s is not available in language %s, but cognatesets %s are allocated to that concept in that language already.",
                        row[col_map.forms.id],
                        parameter,
                        row[col_map.forms.languageReference],
                        data[language][parameter],
                    )
        for parameter in all_parameters(row[parameter_column]):
            data[language][parameter] |= cognates_by_form[
                row[form_table_column]]
    return data
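A minimal usage sketch (path hypothetical; with code_column=None the cognate codes are looked up in the FormTable or in a separate CognateTable, as the comments above describe):

import pycldf

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
data = read_wordlist(dataset, code_column=None)
# data[language_id][parameter_id] is the set of cognateset IDs attested
# for that concept in that language.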
Example #11
def read_single_excel_sheet(
    dataset: pycldf.Dataset,
    sheet: openpyxl.worksheet.worksheet.Worksheet,
    logger: cli.logging.Logger = cli.logger,
    match_form: t.Optional[t.List[str]] = None,
    entries_to_concepts: t.Mapping[str, str] = KeyKeyDict(),
    concept_column: t.Optional[str] = None,
    ignore_missing: bool = False,
    ignore_superfluous: bool = False,
    status_update: t.Optional[str] = None,
) -> t.Mapping[str, ImportLanguageReport]:
    report: t.Dict[str, ImportLanguageReport] = defaultdict(ImportLanguageReport)

    concept_columns: t.Tuple[str, str]
    if concept_column is None:
        concept_columns = (
            dataset["FormTable", "parameterReference"].name,
            dataset["FormTable", "parameterReference"].name,
        )
    else:
        concept_columns = (
            dataset["FormTable", "parameterReference"].name,
            concept_column,
        )
    db = DB(dataset)
    db.cache_dataset()
    # required cldf fields of a form
    c_f_id = db.dataset["FormTable", "id"].name
    c_f_language = db.dataset["FormTable", "languageReference"].name
    c_f_form = db.dataset["FormTable", "form"].name
    c_f_value = db.dataset["FormTable", "value"].name
    c_f_concept = db.dataset["FormTable", "parameterReference"].name
    if not match_form:
        match_form = [c_f_form, c_f_language]
    if not db.dataset["FormTable", c_f_concept].separator:
        logger.warning(
            "Your metadata does not allow polysemous forms. According to your specifications, "
            "identical forms with different concepts will always be considered homophones, not a single "
            "polysemous form. To include polysemous forms, add a separator to your FormTable #parameterReference "
            "in the Metadata.json To find potential polysemies, run lexedata.report.list_homophones."
        )
        match_form.append(c_f_concept)
    else:
        if c_f_concept in match_form:
            logger.info(
                "Matching by concept enabled: To find potential polysemies, run lexedata.report.list_homophones."
            )

    sheet_header = get_headers_from_excel(sheet)
    form_header = list(db.dataset["FormTable"].tableSchema.columndict.keys())

    # These columns don't need to be given, we can infer them from the sheet title and from the other data:
    implicit: t.Dict[Literal["languageReference", "id", "value"], str] = {}
    if c_f_language not in sheet_header:
        implicit["languageReference"] = c_f_language
    if c_f_id not in sheet_header:
        implicit["id"] = c_f_id
    if c_f_value not in sheet_header:
        implicit["value"] = c_f_value

    found_columns = set(sheet_header) - {concept_column} - set(implicit.values())
    expected_columns = set(form_header) - {c_f_concept} - set(implicit.values())
    if not found_columns >= expected_columns:
        if ignore_missing:
            logger.info(
                f"Your Excel sheet {sheet.title} is missing columns {expected_columns - found_columns}. "
                f"For the newly imported forms, these columns will be left empty in the dataset."
            )
        else:
            raise ValueError(
                f"Your Excel sheet {sheet.title} is missing columns {expected_columns - found_columns}. "
                f"Clean up your data, or use --ignore-missing-excel-columns to import anyway and leave these "
                f"columns empty in the dataset for the newly imported forms."
            )
    if not found_columns <= expected_columns:
        if ignore_superfluous:
            logger.info(
                f"Your Excel sheet {sheet.title} contained unexpected columns "
                f"{found_columns - expected_columns}. These columns will be ignored."
            )
        else:
            raise ValueError(
                f"Your Excel sheet {sheet.title} contained unexpected columns "
                f"{found_columns - expected_columns}. Clean up your data, or use "
                f"--ignore-superfluous-excel-columns to import the data anyway and ignore these columns."
            )
    # check if the language exists
    c_l_name = db.dataset["LanguageTable", "name"].name
    c_l_id = db.dataset["LanguageTable", "id"].name
    language_name_to_language_id = {
        row[c_l_name]: row[c_l_id] for row in db.cache["LanguageTable"].values()
    }
    language_name = normalize_string(sheet.title)
    if language_name in language_name_to_language_id:
        language_id = language_name_to_language_id[language_name]
        report[language_id].is_new_language = False
    else:
        language_id = language_name
        report[language_id].is_new_language = True

    # read new data from sheet
    for form in cli.tq(
        import_data_from_sheet(
            sheet,
            sheet_header=sheet_header,
            implicit=implicit,
            language_id=language_id,
            concept_column=concept_columns,
        ),
        task=f"Parsing cells of sheet {sheet.title}",
        total=sheet.max_row,
    ):
        # if concept not in dataset, don't add form
        try:
            concept_entry = form[c_f_concept]
            entries_to_concepts[concept_entry]
        except KeyError:
            logger.warning(
                f"Concept {concept_entry} was not found. Please add it to the concepts.csv file manually. "
                f"The corresponding form was ignored and not added to the dataset."
            )
            report[language_id].skipped += 1
            continue
        # else, look for candidates, link to existing form or add new form
        for item, value in form.items():
            try:
                sep = db.dataset["FormTable", item].separator
            except KeyError:
                continue
            if sep is None:
                continue
            form[item] = value.split(sep)
        form_candidates = db.find_db_candidates(form, match_form)
        if form_candidates:
            new_concept_added = False
            for form_id in form_candidates:
                logger.info(f"Form {form[c_f_value]} was already in dataset.")

                if db.dataset["FormTable", c_f_concept].separator:
                    for new_concept in form[c_f_concept]:
                        if (
                            new_concept
                            not in db.cache["FormTable"][form_id][c_f_concept]
                        ):
                            db.cache["FormTable"][form_id][c_f_concept].append(
                                new_concept
                            )
                            logger.info(
                                f"New form-concept association: Concept {form[c_f_concept]} was added to existing form "
                                f"{form_id}. If this was not intended "
                                f"(because it is a homophonous form, not a polysemy), "
                                f"you need to manually remove that concept from the old form in forms.csv "
                                f"and create a separate new form. If you want to treat identical forms "
                                f"as homophones in general, add  "
                                f"--match-forms={' '.join(match_form)}, "
                                f"{db.dataset['FormTable', 'parameterReference']} "
                                f"when you run this script."
                            )
                            new_concept_added = True
                break

            if new_concept_added:
                report[language_id].concepts += 1
            else:
                report[language_id].existing += 1
        else:
            # No existing candidate matched: add this as a new form, making its
            # ID unique by appending an integer if necessary.
            form[c_f_language] = language_id
            if "id" in implicit:
                # TODO: check for type of form id column
                form_concept = form[c_f_concept]
                concept_reference = (
                    form_concept[0] if isinstance(form_concept, list) else form_concept
                )
                form[c_f_id] = string_to_id(f"{form[c_f_language]}_{concept_reference}")
            db.make_id_unique(form)
            if status_update:
                form["Status_Column"] = status_update
            db.insert_into_db(form)
            report[language_id].new += 1
    # write to cldf
    db.write_dataset_from_cache()
    return report
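A minimal usage sketch (file name hypothetical; the sheet title is normalized and used as the language name):

import openpyxl
import pycldf

wb = openpyxl.load_workbook("new_forms.xlsx")
report = read_single_excel_sheet(
    dataset=pycldf.Dataset.from_metadata("Wordlist-metadata.json"),
    sheet=wb.active,
)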
Example #12
def apply_heuristics(
    dataset: types.Wordlist,
    heuristic: t.Optional[AbsenceHeuristic] = None,
    primary_concepts: t.Union[
        types.WorldSet[types.Parameter_ID],
        t.AbstractSet[types.Parameter_ID]] = types.WorldSet(),
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Cognateset_ID, t.Set[types.Parameter_ID]]:
    """Compute the relevant concepts for cognatesets, depending on the heuristic.

    These concepts will be considered when deciding whether a root is deemed
    absent in a language.

    For the CentralConcept heuristic, the relevant concepts are the
    central concept of a cognateset, as given by the #parameterReference column
    of the CognatesetTable. A central concept not included in the
    primary_concepts is ignored with a warning.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concept",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference"))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concept": "concept1"}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {'cognateset1': {'concept1'}}
    True

    This extends to the case where a cognateset may have more than one central concept.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concepts",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
    ...         separator=","))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concepts": ["concept1", "concept2"]}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {
    ...     'cognateset1': {'concept1', 'concept2'}}
    True

    For the HalfPrimaryConcepts heuristic, the relevant concepts are all
    primary concepts connected to a cognateset.

    >>> ds = util.fs.new_wordlist(
    ...     FormTable=[
    ...         {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1", "Form": "x"},
    ...         {"ID": "f2", "Parameter_ID": "c2", "Language_ID": "l1", "Form": "x"}],
    ...     CognateTable=[
    ...         {"ID": "1", "Form_ID": "f1", "Cognateset_ID": "s1"},
    ...         {"ID": "2", "Form_ID": "f2", "Cognateset_ID": "s1"}])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.HALFPRIMARYCONCEPTS) == {
    ...     's1': {'c1', 'c2'}}
    True


    NOTE: This function cannot guarantee that every cognateset has at least
    one relevant concept; there may be cognatesets without any! A cognateset
    with 0 relevant concepts will always be included, because 0 is at least
    half of 0.

    """
    heuristic = (heuristic if heuristic is not None else
                 (AbsenceHeuristic.CENTRALCONCEPT if
                  ("CognatesetTable", "parameterReference") in dataset else
                  AbsenceHeuristic.HALFPRIMARYCONCEPTS))

    relevant_concepts: t.MutableMapping[
        types.Cognateset_ID, t.Set[types.Parameter_ID]] = t.DefaultDict(set)

    if heuristic is AbsenceHeuristic.HALFPRIMARYCONCEPTS:
        c_f = dataset["CognateTable", "formReference"].name
        c_s = dataset["CognateTable", "cognatesetReference"].name
        concepts = util.cache_table(
            dataset,
            "FormTable",
            {"concepts": dataset["FormTable", "parameterReference"].name},
        )
        for j in dataset["CognateTable"]:
            form = concepts[j[c_f]]
            for concept in util.ensure_list(form["concepts"]):
                relevant_concepts[j[c_s]].add(concept)

    elif heuristic is AbsenceHeuristic.CENTRALCONCEPT:
        c_cognateset_concept = dataset["CognatesetTable",
                                       "parameterReference"].name
        c_id = dataset["CognatesetTable", "id"].name
        for c in dataset["CognatesetTable"]:
            for concept in util.ensure_list(c[c_cognateset_concept]):
                if concept not in primary_concepts:
                    logger.warning(
                        f"The central concept {concept} of cognateset {c[c_id]} was not part of your list of primary concepts to be included in the coding, so the cognateset will be ignored."
                    )
                else:
                    relevant_concepts[c[c_id]].add(concept)

    else:
        raise TypeError(
            f"Value of heuristic, {heuristic}, did not correspond to a known AbsenceHeuristic."
        )

    return relevant_concepts
Example #13
def segment_to_cognateset(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    cognatesets: t.Container[types.Cognateset_ID],
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Form_ID, t.List[t.Set[types.Cognateset_ID]]]:
    # required fields
    c_cognate_cognateset = dataset.column_names.cognates.cognatesetReference
    c_cognate_id = dataset.column_names.cognates.id
    c_cognate_form = dataset.column_names.cognates.formReference
    c_cognate_slice = dataset.column_names.cognates.segmentSlice

    forms = util.cache_table(dataset)
    cognateset_cache: t.Container[types.Cognateset_ID]
    if "CognatesetTable" in dataset:
        c_s_id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[c_s_id]
            for cognateset in dataset["CognatesetTable"]
            if cognatesets is None or cognateset["ID"] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = types.WorldSet()
        else:
            cognateset_cache = cognatesets

    which_segment_belongs_to_which_cognateset: t.Mapping[
        types.Form_ID, t.List[t.Set[types.Cognateset_ID]]] = {
            f: [set() for _ in form["segments"]]
            for f, form in forms.items() if form["form"]
            and form["form"].strip() and form["form"].strip() != "-"
        }
    for j in dataset["CognateTable"]:
        if j[c_cognate_form] in forms and j[
                c_cognate_cognateset] in cognateset_cache:
            form = forms[j[c_cognate_form]]
            if j[c_cognate_form] not in which_segment_belongs_to_which_cognateset:
                continue
            if j.get(c_cognate_slice):
                try:
                    segments_judged = list(
                        parse_segment_slices(j[c_cognate_slice]))
                except ValueError:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment slice {','.join(j[c_cognate_slice])} has start after end."
                    )
                    continue
            else:
                segments_judged = list(range(len(form["segments"])))
            # Track the previous segment index, so non-contiguous morphemes can be detected.
            old_s = None

            for s in segments_judged:
                if old_s is not None and old_s + 1 != s:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment {s+1} follows segment {old_s+1}, so the morpheme is non-contiguous"
                    )
                old_s = s
                try:
                    # Don't shadow the `cognatesets` parameter: this is the set
                    # of cognate sets judged for this particular segment.
                    segment_cognatesets = which_segment_belongs_to_which_cognateset[
                        j[c_cognate_form]][s]
                except IndexError:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment slice {','.join(j[c_cognate_slice])} points outside valid range 1:{len(form['segments'])}."
                    )
                    continue
                segment_cognatesets.add(j[c_cognate_cognateset])

    return which_segment_belongs_to_which_cognateset
Example #14
def add_single_languages(
    metadata: Path,
    sheets: t.Iterable[openpyxl.worksheet.worksheet.Worksheet],
    match_form: t.Optional[t.List[str]],
    concept_name: t.Optional[str],
    ignore_missing: bool,
    ignore_superfluous: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger,
) -> t.Mapping[str, ImportLanguageReport]:
    if status_update == "None":
        status_update = None
    # initialize the dataset from metadata or from forms.csv, depending on the command-line arguments
    if metadata:
        if metadata.name == "forms.csv":
            dataset = pycldf.Dataset.from_data(metadata)
        else:
            dataset = pycldf.Dataset.from_metadata(metadata)

    concepts: t.Mapping[str, str]
    try:
        cid = dataset["ParameterTable", "id"].name
        if concept_name is None:
            concepts = {c[cid]: c[cid] for c in dataset["ParameterTable"]}
            concept_column = dataset["FormTable", "parameterReference"].name
        else:
            name = dataset["ParameterTable", "name"].name
            concepts = {c[name]: c[cid] for c in dataset["ParameterTable"]}
            concept_column = concept_name
    except (KeyError, FileNotFoundError) as err:
        if isinstance(err, KeyError):
            logger.warning(
                "Did not find a well-formed ParameterTable. Importing all forms independent of concept"
            )
        elif isinstance(err, FileNotFoundError):
            logger.warning(
                f"Did not find {dataset['ParameterTable'].url.string}. "
                f"Importing all forms independent of concept"
            )
        concepts = KeyKeyDict()
        if concept_name:
            concept_column = concept_name
        else:
            concept_column = dataset["FormTable", "parameterReference"].name
    # add Status_Column if not existing and status_update given
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="FormTable")
    report: t.Dict[str, ImportLanguageReport] = defaultdict(ImportLanguageReport)
    # import all selected sheets
    for sheet in sheets:
        for lang, subreport in read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            logger=logger,
            match_form=match_form,
            entries_to_concepts=concepts,
            concept_column=concept_column,
            ignore_missing=ignore_missing,
            ignore_superfluous=ignore_superfluous,
            status_update=status_update,
        ).items():
            report[lang] += subreport
    return report
Example #15
    def parse_form(
        self,
        form_string: str,
        language_id: str,
        cell_identifier: str = "",
        logger: cli.logging.Logger = cli.logger,
    ) -> t.Optional[Form]:
        """Create a dictionary of columns from a form description.

        Extract each value (transcriptions, comments, sources etc.) from a
        string describing a single form.
        """
        # optional fields
        c_comment = self.c.get("comment")
        c_variants = self.c.get("variants", c_comment)

        # if string is only whitespaces, there is no form.
        if not form_string.strip():
            return None

        properties: t.Dict[str, t.Any] = {
            self.c["lang"]: language_id,
            self.c["value"]: form_string,
        }

        # Semantics: 'None' for no variant expected, any string for the
        # decorator that introduces variant forms. Currently we expect '~' and
        # '%', see below.
        expect_variant: t.Optional[str] = None
        # Iterate over the delimiter-separated elements of the form.
        for element in components_in_brackets(form_string, self.bracket_pairs):
            element = element.strip()

            if not element:
                continue

            # If the element has mismatched brackets (tends to happen only for
            # the last element, because a mismatched opening bracket means we
            # are still waiting for the closing one), warn.
            if not check_brackets(element, self.bracket_pairs):
                try:
                    delimiter = self.bracket_pairs[element[0]]
                except KeyError:
                    delimiter = element[0]
                raise ValueError(
                    f"{cell_identifier}In form {form_string}: Element {element} had mismatching delimiters "
                    f"{delimiter}. This could be a bigger problem in the cell, "
                    f"so the form was not imported.")
            # Check what kind of element we have.
            for start, (term, transcription) in self.element_semantics.items():
                field = self.c[term]
                if element.startswith(start):
                    break
            else:
                # TODO: another `if` catching '-' might be necessary here
                # The only thing we expect outside delimiters is the variant
                # separators, '~' and '%'.
                if self.variant_separator and element in self.variant_separator:
                    expect_variant = element
                else:
                    logger.warning(
                        f"{cell_identifier}In form {form_string}: Element {element} could not be parsed, ignored"
                    )
                continue

            # If we encounter a field for the first time, we add it to the
            # dictionary. If repeatedly, to the variants, with a decorator that
            # shows how expected the variant was.
            # This drops sources and comments in variants, if more than one
            # source or comment is provided; clean this up in
            # self.postprocess_form.

            if field in properties:
                if (not expect_variant and field != c_comment
                        and field != self.c["source"]):
                    logger.warning(
                        f"{cell_identifier}In form {form_string}: Element {element} was an unexpected variant for {field}"
                    )
                properties.setdefault(
                    c_variants, []).append((expect_variant or "") + element)
            else:
                if expect_variant:
                    logger.warning(
                        f"{cell_identifier}In form {form_string}: Element {element} was supposed to be a variant, but there is no earlier {field}"
                    )
                properties[field] = element

            expect_variant = None

        self.postprocess_form(properties, language_id)
        return Form(properties)
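A hedged sketch of the intended behaviour (`parser` stands for an instance of this class, with default element semantics where <…> marks the form, (…) a comment, and {…} a source):

parser.parse_form("<dolo> (heard twice) {2}", language_id="lang1")
# -> a Form mapping containing the language ID, the raw string as value,
#    and the form, comment, and source fields filled in.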
Example #16
def log_or_raise(message, log: cli.logging.Logger = cli.logger):
    log.warning(message)
Example #17
def create_singletons(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    status: t.Optional[str] = None,
    by_segment: bool = False,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[t.Sequence[types.CogSet], t.Sequence[types.Judgement]]:
    """Create singleton cognate judgements for forms that don't have cognate judgements.

    Depending on by_segment, singletons are created for every range of segments
    that is not in any cognate set yet (True) or just for every form where no
    segment is in any cognate sets (False).

    """
    forms = util.cache_table(dataset)
    c_j_id = dataset["CognateTable", "id"].name
    c_j_cogset = dataset["CognateTable", "cognatesetReference"].name
    c_j_form = dataset["CognateTable", "formReference"].name
    try:
        c_j_segmentslice = dataset["CognateTable", "segmentSlice"].name
    except KeyError:
        c_j_segmentslice = None
    try:
        c_j_alignment = dataset["CognateTable", "alignment"].name
    except KeyError:
        c_j_alignment = None

    if not dataset.get(("CognatesetTable", "Status_Column")):
        logger.warning(
            "No Status_Column in CognatesetTable. I will proceed without. Run `lexedata.edit.add_status_column`` in default mode or with table-names CognatesetTable to add a Status_Column."
        )

    try:
        c_s_id = dataset["CognatesetTable", "id"].name
        all_cognatesets = {s[c_s_id]: s for s in dataset["CognatesetTable"]}
    except KeyError:
        c_s_id = "id"
        c_s_name = "name"
        all_cognatesets = {
            id: types.Judgement({
                "id": id,
                "name": id
            })
            for id in {j[c_j_cogset]
                       for j in dataset["CognateTable"]}
        }
    try:
        c_s_name = dataset["CognatesetTable", "name"].name
    except KeyError:
        c_s_name = c_s_id

    all_judgements = list(dataset["CognateTable"])
    if by_segment:
        judgements = segment_to_cognateset(dataset, types.WorldSet(), logger)
        forms_and_segments = uncoded_segments(judgements, logger)
    else:
        forms_and_segments = uncoded_forms(
            forms.values(), {j[c_j_form]
                             for j in all_judgements})
    for form, slice in forms_and_segments:
        i = 1
        singleton_id = f"X_{form}_{i:d}"
        while singleton_id in all_cognatesets:
            i += 1
            singleton_id = f"X_{form}_{i:d}"
        all_cognatesets[singleton_id] = types.CogSet({})
        properties = {
            c_s_name: util.ensure_list(forms[form]["parameterReference"])[0],
            c_s_id: singleton_id,
            "Status_Column": status,
        }
        try:
            for column in dataset["CognatesetTable"].tableSchema.columns:
                all_cognatesets[singleton_id][column.name] = properties.get(
                    column.name)
        except KeyError:
            pass
        judgement = types.Judgement({})
        properties = {
            c_j_id: singleton_id,
            c_j_cogset: singleton_id,
            c_j_form: form,
            c_j_segmentslice: indices_to_segment_slice(slice),
            c_j_alignment: [forms[form]["segments"][i] for i in slice],
            "Status_Column": status,
        }
        for column in dataset["CognateTable"].tableSchema.columns:
            judgement[column.name] = properties.get(column.name)
        all_judgements.append(judgement)
    return all_cognatesets.values(), all_judgements
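A minimal usage sketch (path hypothetical), writing the singleton cognate sets and judgements back to the dataset:

import pycldf

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
cogsets, judgements = create_singletons(dataset, status="automatic singleton")
dataset.write(CognatesetTable=cogsets, CognateTable=judgements)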
Example #18
    def __init__(
        self,
        dataset: pycldf.Dataset,
        element_semantics: t.Iterable[t.Tuple[str, str, str, bool]] = [
            # ("[", "]", "phonetic", True),
            ("<", ">", "form", True),
            # ("/", "/", "phonemic", True),
            ("(", ")", "comment", False),
            ("{", "}", "source", False),
        ],
        separation_pattern: str = r"([;,])",
        variant_separator: t.Optional[t.List[str]] = ["~", "%"],
        add_default_source: t.Optional[str] = "{1}",
        logger: cli.logging.Logger = cli.logger,
    ):
        super().__init__(dataset)

        # Columns implied by element semantics
        self.bracket_pairs = {
            start: end
            for start, end, _, _ in element_semantics
        }
        self.element_semantics = {
            start: (term, transcription)
            for start, _, term, transcription in element_semantics
        }
        for start, end, term, transcription in element_semantics:
            # Ensure that all terms required by the element semantics are fields we can write to.
            self.cc(short=term, long=("FormTable", term), dataset=dataset)
        assert self.transcriptions, (
            "Your metadata json file and your cell parser don’t match: Your cell parser "
            f"{self.__class__.__name__} expects to work with transcriptions "
            "(at least one of 'orthographic', 'phonemic', and 'phonetic') to derive a #form "
            "in #FormTable, but your metadata defines no such column.")

        # Columns necessary for word list
        self.cc(short="source", long=("FormTable", "source"), dataset=dataset)
        self.cc(short="comment",
                long=("FormTable", "comment"),
                dataset=dataset)

        try:
            self.comment_separator = dataset["FormTable",
                                             "comment"].separator or "\t"
        except KeyError:
            logger.info("No #comment column found.")
            self.comment_separator = ""

        try:
            # As long as there is no CLDF term #variants, this will either be
            # 'variants' or raise a KeyError. However, it is a transparent
            # re-use of an otherwise established idiom in this module, so we
            # use this minor overhead.
            self.c["variants"] = dataset["FormTable", "variants"].name
        except KeyError:
            logger.warning(
                "No 'variants' column found for FormTable in Wordlist-metadata.json. "
                "Form variants will be added to #comment.")

        # Other class attributes
        self.separation_pattern = separation_pattern
        self.variant_separator = variant_separator
        self.add_default_source = add_default_source
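Each element_semantics tuple above maps an opening bracket, a closing bracket, a CLDF term, and a flag marking whether the bracketed content counts as a transcription. A minimal sketch of constructing such a parser with the commented-out phonetic and phonemic brackets enabled; the class name CellParser is an assumption, since the excerpt only shows the __init__:

import pycldf

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
parser = CellParser(  # hypothetical name for the class defined above
    dataset,
    element_semantics=[
        # (opening bracket, closing bracket, CLDF term, is transcription?)
        ("[", "]", "phonetic", True),
        ("<", ">", "form", True),
        ("/", "/", "phonemic", True),
        ("(", ")", "comment", False),
        ("{", "}", "source", False),
    ],
)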
Example #19
0
def forms_to_tsv(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    languages: t.Iterable[str],
    concepts: t.Set[str],
    cognatesets: t.Iterable[str],
    logger: cli.logging.Logger = cli.logger,
):
    try:
        dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            """Edictor export requires your dataset to have segments in the FormTable.
        Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
        )

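    # Columns with a separator are list-valued; remember each separator so the
    # lists can be serialized into single TSV cells below.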
    delimiters = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset["FormTable"].tableSchema.columns if c.separator
    }

    # Prepare the header for the TSV output.
    # The first column must be named ID and contain 1-based integer IDs.
    tsv_header = list(dataset["FormTable"].tableSchema.columndict.keys())

    tsv_header.insert(0, "LINGPY_ID")
    tsv_header.append("cognatesetReference")
    if "alignment" not in tsv_header:
        tsv_header.append("alignment")
    if "parameterReference" in delimiters:
        tsv_header.append("_parameterReference")

    # Select only forms and cognate judgements that match the given languages, concepts, and cognatesets.
    forms = {}
    for f, form in util.cache_table(dataset).items():
        if form["form"] is None or form["form"] == "-":
            continue
        if form["languageReference"] in languages and concepts.intersection(
                ensure_list(form["parameterReference"])):
            # Normalize the form:
            # 1. No list-valued entries
            for c, d in delimiters.items():
                if c == "segments":
                    continue
                if c == "parameterReference":
                    form["_parameterReference"] = d.join(
                        str(e) for e in form[c])
                    form["parameterReference"] = form["parameterReference"][0]
                    continue

                form[c] = d.join(str(e) for e in form[c])

            if not form.get("segments"):
                logger.warning(
                    "No segments found for form %s. You can generate segments using `lexedata.edit.add_segments`.",
                    form["id"],
                )

            # 2. No tabs, newlines in entries
            for c, v in form.items():
                if isinstance(v, str):
                    if "\\!t" in v or "\\!n" in v:
                        logger.warning(
                            "Your data contains the special characters '\\!t' or '\\!n', which I will introduce for escaping tabs and newlines for edictor. These characters will not survive the back-import."
                        )
                    form[c] = v.replace("\t", "\\!t").replace("\n", "\\!n")

            forms[f] = form

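    # Map each requested cognateset id to a 1-based integer, since LingPy and
    # Edictor expect numeric cognate-set ids.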
    cognateset_cache: t.Mapping[t.Optional[str], int]
    if "CognatesetTable" in dataset:
        id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[id]: c
            for c, cognateset in enumerate(dataset["CognatesetTable"], 1)
            if cognateset[id] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = t.DefaultDict(itertools.count().__next__)
        else:
            cognateset_cache = {c: i for i, c in enumerate(cognatesets, 1)}

    # Warn about unexpected non-concatenative ‘morphemes’
    lexedata.report.nonconcatenative_morphemes.segment_to_cognateset(
        dataset, cognatesets, logger)

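    # Start each form with all segments parenthesized (i.e. not covered by any
    # judgement) and an empty cognate-set list; the loop below fills in the
    # judged slices.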
    judgements_about_form: t.Mapping[
        types.Form_ID, t.Tuple[t.List[str], t.List[int]]
    ] = {
        form_id: ([f"({s})" for s in form["segments"]], [])
        for form_id, form in forms.items()
    }
    # Compose all judgements; where they overlap, the last one wins.
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in forms and cognateset_cache.get(
                j["cognatesetReference"]):
            if j.get("alignment"):
                j["alignment"] = [s or "" for s in j["alignment"]]
            else:
                j["alignment"] = forms[j["formReference"]]["segments"]

            try:
                segments_judged = list(
                    parse_segment_slices(segment_slices=j["segmentSlice"],
                                         enforce_ordered=False))
            except TypeError:
                logger.warning(
                    "In judgement %s: No segment slice given. Assuming whole form.",
                    j["id"],
                )
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except KeyError:
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except ValueError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue
            global_alignment, cogsets = judgements_about_form[
                j["formReference"]]
            segment_start, segment_end = min(
                segments_judged), max(segments_judged) + 1
            try:
                glue_in_alignment(
                    global_alignment,
                    cogsets,
                    j["alignment"],
                    j["cognatesetReference"],
                    slice(segment_start, segment_end),
                )
            except IndexError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue

    return forms, judgements_about_form, cognateset_cache
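A sketch of calling forms_to_tsv, with placeholder language, concept, and cognateset ids; the excerpt only builds the data structures, so the printing step is purely illustrative:

import pycldf

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
forms, judgements_about_form, cognateset_cache = forms_to_tsv(
    dataset,
    languages=["language1"],    # placeholder ids, not real data
    concepts={"concept1"},
    cognatesets=["cognateset1"],
)
for form_id, (alignment, cogsets) in judgements_about_form.items():
    # Segments not covered by any judgement are still parenthesized here.
    print(form_id, " ".join(alignment), cogsets)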