Example 1
def write_edictor_file(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    file: t.TextIO,
    forms: t.Mapping[types.Form_ID, t.Mapping[str, t.Any]],
    judgements_about_form,
    cognateset_numbers,
):
    """Write the judgements of a dataset to file, in edictor format."""
    delimiters = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset["FormTable"].tableSchema.columns if c.separator
    }

    tsv_header = [
        util.cldf_property(c.propertyUrl) or c.name
        for c in dataset["FormTable"].tableSchema.columns
    ]

    tsv_header.insert(0, "LINGPY_ID")
    tsv_header.append("cognatesetReference")
    tsv_header.append("alignment")
    if "parameterReference" in delimiters:
        tsv_header.append("_parameterReference")

    # write output to tsv
    out = csv.DictWriter(
        file,
        fieldnames=tsv_header,
        delimiter="\t",
    )
    out.writerow({column: rename(column) for column in tsv_header})
    out_cognatesets: t.List[t.Optional[str]]
    for f, (id, form) in enumerate(forms.items(), 1):
        # Keep the original form ID and assign a 1-based integer LINGPY_ID.
        this_form = dict(form)
        this_form["LINGPY_ID"] = f

        # Normalize the form:
        # 1. No list-valued entries
        for col, d in delimiters.items():
            this_form[col] = d.join(form[col])
        # 2. No tabs or newlines in entries; they make Edictor mad. Escape
        # them the same way forms_to_tsv does, so that the re-import in
        # load_forms_from_tsv can undo the escaping.
        for c, v in form.items():
            if isinstance(v, str):
                this_form[c] = form[c].replace("\t", "\\!t").replace(
                    "\n", "\\!n")

        # if there is a cogset, add its integer id. otherwise set id to 0
        judgement = judgements_about_form[this_form["id"]]
        this_form["cognatesetReference"] = " ".join(
            str(cognateset_numbers.get(e, 0))
            for e in (judgement[1] or [None]))
        this_form["alignment"] = (" ".join(judgement[0]).replace(
            "(", "( ").replace(")", " )").replace(" ) ( ", " "))

        # Write the normalized row.
        out.writerow(this_form)
    add_edictor_settings(file, dataset)
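
Example 1 calls a rename helper that is not shown in this excerpt. A minimal
sketch, assuming it is simply the inverse of the form_table_upper mapping that
load_forms_from_tsv (Example 11) applies on re-import; the actual helper in
lexedata may differ:

def rename(column_name: str) -> str:
    # Map CLDF property names (and the LINGPY_ID helper column) to the
    # column headers Edictor expects; anything else keeps its name.
    return {
        "LINGPY_ID": "ID",
        "id": "CLDF_ID",
        "languageReference": "DOCULECT",
        "parameterReference": "CONCEPT",
        "form": "IPA",
        "segments": "TOKENS",
        "cognatesetReference": "COGID",
        "alignment": "ALIGNMENT",
    }.get(column_name, column_name)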
Example 2
def properties_as_key(data, columns):
    mapping = {
        column.name: util.cldf_property(column.propertyUrl)
        for column in columns if util.cldf_property(column.propertyUrl)
    }
    for s in data:
        for name, property in mapping.items():
            s[property] = s.pop(name, None)
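
A short usage sketch for properties_as_key, assuming dataset is an open
pycldf wordlist whose FormTable declares Language_ID with the CLDF
languageReference property (the data below is hypothetical):

rows = [{"ID": "f1", "Language_ID": "ache", "Form": "toro"}]
properties_as_key(rows, dataset["FormTable"].tableSchema.columns)
# Each row is re-keyed in place: column names are replaced by their CLDF
# property names wherever a propertyUrl is declared.
assert rows[0]["languageReference"] == "ache"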
Example 3
def merge_group(
    forms: t.Sequence[types.Form],
    target: types.Form,
    mergers: t.Mapping[str, Merger],
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    logger: cli.logging.Logger = cli.logger,
) -> types.Form:
    """Merge one group of homophones

    >>> merge_group(
    ...   [{"Parameter_ID": [1, 1]}, {"Parameter_ID": [2]}],
    ...   {"Parameter_ID": [1, 1]}, {"Parameter_ID": union}, util.fs.new_wordlist())
    {'Parameter_ID': [1, 2]}

    The target is assumed to be already included in the forms.

    >>> merge_group(
    ...   [{"Parameter_ID": [1, 1]}, {"Parameter_ID": [2]}],
    ...   {"Parameter_ID": [1, 1]}, {"Parameter_ID": concatenate}, util.fs.new_wordlist())
    {'Parameter_ID': [1, 1, 2]}

    """
    c_f_id = dataset["FormTable", "id"].name
    for column in target:
        if column == c_f_id:
            continue
        try:
            reference_name = (util.cldf_property(
                dataset["FormTable", column].propertyUrl) or column)
            merger = mergers.get(column,
                                 mergers.get(reference_name, must_be_equal))
            try:
                merge_result = merger([form[column] for form in forms], target)
            except AssertionError:
                # We cannot deal with this block, but others may be fine.
                merger_name = merger.__name__
                logger.error(
                    f"Merging forms: {[f[c_f_id] for f in forms]} with target: {target[c_f_id]} on column: {column}\n"
                    f"The merge function {merger_name} requires the input data to be equal. \n"
                    f"Given input: {[form[column] for form in forms]}")
                raise Skip
            except TypeError:
                merger_name = merger.__name__
                # Other groups will have the same issue.
                cli.Exit.INVALID_INPUT(
                    f"Merging forms: {[f[c_f_id] for f in forms]} with target: {target[c_f_id]} \n"
                    f"The merge function {merger_name} is not implemented for type {type(forms[0])}. \n"
                    f"Given input: {[form[column] for form in forms]}")
            target[column] = merge_result
        except KeyError:
            cli.Exit.INVALID_COLUMN_NAME(
                f"Column {column} is not in FormTable.")
    return target
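
The Merger functions themselves are not part of this excerpt. Minimal
sketches that are consistent with merge_group's error handling and with the
doctest above (lexedata ships its own implementations, so these are only
illustrative):

def must_be_equal(values, target=None):
    # merge_group reports this AssertionError as "requires the input data
    # to be equal" and skips the group.
    assert all(v == values[0] for v in values)
    return values[0]


def union(values, target=None):
    # Order-preserving union of list-valued entries, as in the doctest:
    # [[1, 1], [2]] -> [1, 2].
    if not isinstance(values[0], list):
        # merge_group turns this into a cli.Exit.INVALID_INPUT.
        raise TypeError(f"union is not implemented for {type(values[0])}")
    result = []
    for sequence in values:
        for item in sequence:
            if item not in result:
                result.append(item)
    return result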
Example 4
def merge_group(
    cogsets: t.Sequence[types.CogSet],
    target: types.CogSet,
    mergers: t.Mapping[str, Merger],
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    logger: cli.logging.Logger = cli.logger,
) -> types.CogSet:
    """Merge one group of cognate sets

    The target is assumed to be already included in the cognate sets.

    """
    c_s_id = dataset["CognatesetTable", "id"].name
    for column in target:
        if column == c_s_id:
            continue
        try:
            reference_name = (util.cldf_property(
                dataset["CognatesetTable", column].propertyUrl) or column)
            merger = mergers.get(column,
                                 mergers.get(reference_name, must_be_equal))
            try:
                merge_result = merger([cogset[column] for cogset in cogsets],
                                      target)
            except AssertionError:
                merger_name = merger.__name__
                # We cannot deal with this block, but others may be fine.
                logger.error(
                    f"Merging cognate sets: {[f[c_s_id] for f in cogsets]} with target: {target[c_s_id]} on column: {column}\n"
                    f"The merge function {merger_name} requires the input data to be equal. \n"
                    f"Given input: {[cogset[column] for cogset in cogsets]}")
                raise Skip
            except NotImplementedError:
                merger_name = merger.__name__
                # Other groups will have the same issue.
                cli.Exit.INVALID_INPUT(
                    f"Merging forms: {[f[c_s_id] for f in cogsets]} with target: {target[c_s_id]} \n"
                    f"The merge function {merger_name} is not implemented for type {type(cogsets[0])}. \n"
                    f"Given input: {[cogset[column] for cogset in cogsets]}")
            target[column] = merge_result
        except KeyError:
            cli.Exit.INVALID_COLUMN_NAME(
                f"Column {column} is not in CognatesetTable.")
    return target
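
Both merge_group variants raise Skip to abandon a single unmergeable group
while letting the others proceed. The driver code is not part of this
excerpt; a plausible calling pattern, with a hypothetical groups_to_merge
mapping, is:

class Skip(Exception):
    """Raised by merge_group to skip one group and continue with the rest."""


for target, group in groups_to_merge.items():  # hypothetical mapping
    try:
        merge_group(group, target, mergers, dataset, logger)
    except Skip:
        # The error has already been logged; move on to the next group.
        continue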
Example 5
def check_foreign_keys(dataset: pycldf.Dataset,
                       logger: cli.logging.Logger = cli.logger):
    # Get all foreign keys for each table
    valid = True
    for table in dataset.tables:
        for key in table.tableSchema.foreignKeys:
            reference = key.reference
            try:
                (target_column, ) = reference.columnReference
            except ValueError:
                # Multi-column foreign key. We *could* check that there's not a
                # reference column hidden in there, but we don't.
                continue
            (column, ) = key.columnReference
            # check that property url of foreign key column points to correct table
            column_type = util.cldf_property(
                dataset[table].get_column(column).propertyUrl)

            if column_type and pycldf.TERMS[column_type].references:
                target_table = pycldf.TERMS[column_type].references
            else:
                # Not a CLDF reference property. Nothing to check.
                continue

            if dataset[target_table] != dataset[reference.resource]:
                log_or_raise(
                    message=f"Foreign key {key} is declared as {column_type}, "
                    f"which should point to {target_table} "
                    f"but instead points to {reference}",
                    log=logger,
                )
                valid = False
                continue

            # Check that foreign key is ID of corresponding table
            if reference.columnReference != [
                    dataset[key.reference.resource, "id"].name
            ]:
                log_or_raise(
                    message=f"Foreign key {key} in table {table.url.string} "
                    f"does not point to the ID column of another table",
                    log=logger,
                )
                valid = False

    return valid
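
A minimal usage sketch for check_foreign_keys, with a hypothetical metadata
path:

import pycldf

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
if not check_foreign_keys(dataset):
    print("Some foreign keys contradict their declared CLDF properties.")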
Example 6
    def __init__(
        self,
        dataset: pycldf.Dataset,
        database_url: t.Optional[str] = None,
        logger: cli.logging.Logger = cli.logger,
    ):
        self.set_header(dataset)
        self.separators = {
            util.cldf_property(c.propertyUrl) or c.name: c.separator
            for c in dataset[self.row_table].tableSchema.columns if c.separator
        }

        self.URL_BASE = database_url

        self.wb = op.Workbook()
        self.ws: op.worksheet.worksheet.Worksheet = self.wb.active

        self.logger = logger
Example 7
 def set_header(
     self,
     dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                             types.Parameter_ID, types.Cognate_ID,
                             types.Cognateset_ID, ],
 ):
     c_id = dataset["CognatesetTable", "id"].name
     try:
         c_comment = dataset["CognatesetTable", "comment"].name
     except KeyError:
         c_comment = None
     self.header = []
     for column in dataset["CognatesetTable"].tableSchema.columns:
         if column.name == c_id:
             self.header.insert(0, ("id", "CogSet"))
         elif column.name == c_comment:
             continue
         else:
             property = util.cldf_property(
                 column.propertyUrl) or column.name
             self.header.append((property, column.name))
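
For illustration, one plausible consumer of self.header is writing the first
worksheet row; a sketch assuming openpyxl's cell API (the real ExcelWriter
logic may differ):

def write_header_row(self) -> None:
    # self.header pairs a CLDF property (or column name) with the title to
    # display; set_header renamed the ID column to "CogSet" and moved it to
    # the front.
    for col, (cldf_property, title) in enumerate(self.header, 1):
        self.ws.cell(row=1, column=col, value=title)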
Example 8
    except KeyError:
        cli.Exit.INVALID_DATASET(
            "Dataset has no explicit CognatesetTable. Add one using `lexedata.edit.add_table CognatesetTable`."
        )

    E = ExcelWriter(
        dataset,
        database_url=args.url_template,
        logger=logger,
    )

    cogsets, judgements = cogsets_and_judgements(
        dataset, args.add_singletons_with_status, args.by_segment, logger)

    try:
        cogset_order = (util.cldf_property(
            dataset["CognatesetTable", args.sort_cognatesets_by].propertyUrl)
                        or dataset["CognatesetTable",
                                   args.sort_cognatesets_by].name)
    except KeyError:
        cli.Exit.INVALID_COLUMN_NAME(
            f"No column '{args.sort_cognatesets_by}' in your CognatesetTable.")
    sort_cognatesets(cogsets, judgements, cogset_order, size=args.size_sort)

    # TODO: wrap the following two blocks into a
    # get_sorted_languages() -> t.OrderedDict[languageReference, Column Header/Title/Name]
    # function
    languages = list(util.cache_table(dataset, "LanguageTable").values())
    if args.sort_languages_by:
        c_sort = (util.cldf_property(
            dataset["LanguageTable", args.sort_languages_by].propertyUrl)
                  or dataset["LanguageTable", args.sort_languages_by].name)
Example 9
            invalid_ids.append(item)
        if "Name" in new_table.tableSchema.columndict:
            return {"ID": item, "Name": item}
        else:
            return {"ID": item}

    reference_properties = {
        property_name
        for property_name, term in pycldf.terms.Terms().properties.items()
        if term.references == args.table
    }

    referenced_items: t.Set[str] = set()
    for table in ds.tables:
        for column in table.tableSchema.columns:
            if util.cldf_property(column.propertyUrl) in reference_properties:
                referenced_items |= {
                    column.datatype.formatted(row[column.name]) for row in table
                }

    logger.info(
        "Found %d different entries for your new %s.", len(referenced_items), args.table
    )

    ds.write(**{args.table: [new_row(item) for item in sorted(referenced_items)]})

    if invalid_ids:
        logger.warning(
            "Some of your reference values are not valid as IDs: %s. You can transform them into valid ids by running lexedata.edit.simplify_ids",
            invalid_ids,
        )
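
A worked illustration of the reference_properties lookup: for
args.table == "LanguageTable", the set should contain "languageReference",
since that is the CLDF term that references the LanguageTable.

import pycldf

language_reference_properties = {
    property_name
    for property_name, term in pycldf.terms.Terms().properties.items()
    if term.references == "LanguageTable"
}
print(language_reference_properties)  # expected to include 'languageReference'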
Example 10
def forms_to_tsv(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    languages: t.Iterable[str],
    concepts: t.Set[str],
    cognatesets: t.Optional[t.Iterable[str]],
    logger: cli.logging.Logger = cli.logger,
):
    try:
        dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            """Edictor export requires your dataset to have segments in the FormTable.
        Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
        )

    delimiters = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset["FormTable"].tableSchema.columns if c.separator
    }

    # Prepare the header for the TSV output. The first column must be named
    # ID and contain 1-based integer IDs.
    tsv_header = list(dataset["FormTable"].tableSchema.columndict.keys())

    tsv_header.insert(0, "LINGPY_ID")
    tsv_header.append("cognatesetReference")
    if "alignment" not in tsv_header:
        tsv_header.append("alignment")
    if "parameterReference" in delimiters:
        tsv_header.append("_parameterReference")

    # Select forms and cognate judgements, restricted to the given languages,
    # concepts and cognate sets.
    forms = {}
    for f, form in util.cache_table(dataset).items():
        if form["form"] is None or form["form"] == "-":
            continue
        if form["languageReference"] in languages and concepts.intersection(
                ensure_list(form["parameterReference"])):
            # Normalize the form:
            # 1. No list-valued entries
            for c, d in delimiters.items():
                if c == "segments":
                    continue
                if c == "parameterReference":
                    form["_parameterReference"] = d.join(
                        str(e) for e in form[c])
                    form["parameterReference"] = form["parameterReference"][0]
                    continue

                form[c] = d.join(str(e) for e in form[c])

            if not form.get("segments"):
                logger.warning(
                    "No segments found for form %s. You can generate segments using `lexedata.edit.add_segments`.",
                    form["id"],
                )

            # 2. No tabs, newlines in entries
            for c, v in form.items():
                if isinstance(v, str):
                    if "\\!t" in form[c] or "\\!n" in form[c]:
                        logger.warning(
                            "Your data contains the special characters '\\!t' or '\\!n', which I will introduce for escaping tabs and newlines for edictor. These characters will not survive the back-import."
                        )
                    form[c] = form[c].replace("\t",
                                              "\\!t").replace("\n", "\\!n")

            forms[f] = form

    cognateset_cache: t.Mapping[t.Optional[str], int]
    if "CognatesetTable" in dataset:
        id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[id]: c
            for c, cognateset in enumerate(dataset["CognatesetTable"], 1)
            if cognateset[id] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = t.DefaultDict(itertools.count().__next__)
        else:
            cognateset_cache = {c: i for i, c in enumerate(cognatesets, 1)}

    # Warn about unexpected non-concatenative ‘morphemes’
    lexedata.report.nonconcatenative_morphemes.segment_to_cognateset(
        dataset, cognatesets, logger)

    judgements_about_form: t.Mapping[
        types.Form_ID, t.Tuple[t.List[str], t.List[int]]
    ] = {
        id: ([f"({s})" for s in form["segments"]], [])
        for id, form in forms.items()
    }
    # Compose all judgements, last-one-rules mode.
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in forms and cognateset_cache.get(
                j["cognatesetReference"]):
            if j.get("alignment"):
                j["alignment"] = [s or "" for s in j["alignment"]]
            else:
                j["alignment"] = forms[j["formReference"]]["segments"]

            try:
                segments_judged = list(
                    parse_segment_slices(segment_slices=j["segmentSlice"],
                                         enforce_ordered=False))
            except TypeError:
                logger.warning(
                    "In judgement %s: No segment slice given. Assuming whole form.",
                    j["id"],
                )
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except KeyError:
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except ValueError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue
            global_alignment, cogsets = judgements_about_form[
                j["formReference"]]
            segment_start, segment_end = min(
                segments_judged), max(segments_judged) + 1
            try:
                glue_in_alignment(
                    global_alignment,
                    cogsets,
                    j["alignment"],
                    j["cognatesetReference"],
                    slice(segment_start, segment_end),
                )
            except IndexError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue

    return forms, judgements_about_form, cognateset_cache
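
forms_to_tsv relies on an ensure_list helper that is not shown in this
excerpt. A minimal sketch of the behaviour it needs; the real helper in
lexedata.util may differ:

def ensure_list(value):
    # Normalize a scalar or absent parameterReference to a list, so datasets
    # with and without list-valued columns can be treated uniformly.
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]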
Example 11
def load_forms_from_tsv(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    input_file: Path,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[
    t.Mapping[int, t.Sequence[t.Tuple[types.Form_ID, range, t.Sequence[str]]]],
    t.Set[types.Form_ID],
]:
    """

    Side effects
    ============
    This function overwrites dataset's FormTable
    """
    input = csv.DictReader(
        input_file.open(encoding="utf-8"),
        delimiter="\t",
    )

    # These days, all dicts are ordered by default. Still, better make this explicit.
    forms = util.cache_table(dataset)

    edictor_cognatesets: t.Dict[
        int, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = collections.defaultdict(list)

    form_table_upper = {
        (util.cldf_property(column.propertyUrl) or column.name).upper(): (
            util.cldf_property(column.propertyUrl) or column.name
        )
        for column in dataset["FormTable"].tableSchema.columns
    }
    form_table_upper.update(
        {
            "DOCULECT": "languageReference",
            "CONCEPT": "parameterReference",
            "IPA": "form",
            "COGID": "cognatesetReference",
            "ALIGNMENT": "alignment",
            "TOKENS": "segments",
            "CLDF_ID": "id",
            "ID": "",
        }
    )
    if "_PARAMETERREFERENCE" in [f.upper() for f in input.fieldnames]:
        form_table_upper["_PARAMETERREFERENCE"] = "parameterReference"
        form_table_upper["CONCEPT"] = ""

    separators: t.MutableMapping[str, t.Optional[str]] = {}
    # TODO: What's the logic behind going backwards through this? We are not modifying fieldnames.
    for i in range(len(input.fieldnames)):
        if i == 0 and input.fieldnames[0] != "ID":
            raise ValueError(
                "When importing from Edictor, expected the first column to be "
                f"named 'ID', but found {input.fieldnames[0]!r}."
            )

        lingpy = input.fieldnames[i]
        try:
            input.fieldnames[i] = form_table_upper[lingpy.upper()]
        except KeyError:
            logger.warning(
                "Your edictor file contained a column %s, which I could not interpret.",
                lingpy,
            )

        if input.fieldnames[i] == "cognatesetReference":
            separators[input.fieldnames[i]] = " "
        elif input.fieldnames[i] == "alignment":
            separators[input.fieldnames[i]] = " "

        try:
            separators[input.fieldnames[i]] = dataset[
                "FormTable", input.fieldnames[i]
            ].separator
        except KeyError:
            pass

    logger.info(
        "The header of your edictor file will be interpreted as %s.", input.fieldnames
    )

    affected_forms: t.Set[types.Form_ID] = set()
    for line in cli.tq(
        input, task="Importing form rows from edictor…", total=len(forms)
    ):
        # Column "" is the re-named Lingpy-ID column, so the first one.
        if not any(line.values()) or line[""].startswith("#"):
            # One of Edictor's comment rows, storing settings
            continue

        for (key, value) in line.items():
            value = value.replace("\\!t", "\t").replace("\\!n", "\n")
            # Columns not mapped to the FormTable (e.g. the renamed Lingpy ID
            # column "") have no separator entry, so default to None.
            sep = separators.get(key)
            if sep is not None:
                if not value:
                    line[key] = []
                else:
                    line[key] = value.split(sep)
            else:
                line[key] = value

        affected_forms.add(line["id"])

        try:
            for segments, cognateset, alignment in extract_partial_judgements(
                line["segments"],
                line["cognatesetReference"],
                line["alignment"],
                logger,
            ):
                edictor_cognatesets[cognateset].append(
                    (line["id"], segments, alignment)
                )
            forms[line["id"]] = line
        except IndexError:
            logger.warning(
                f"In form with Lingpy-ID {line['']}: Cognateset judgements {line['cognatesetReference']} and alignment {line['alignment']} did not match. At least one morpheme skipped."
            )
    edictor_cognatesets.pop(0, None)

    columns = {
        (util.cldf_property(column.propertyUrl) or column.name): column.name
        for column in dataset["FormTable"].tableSchema.columns
    }
    # Deliberately make use of the property of `write` to discard any entries
    # that don't correspond to existing columns. Otherwise, we'd still have to
    # get rid of the alignment, cognatesetReference and Lingpy-ID columns.
    dataset["FormTable"].write(
        (
            {
                columns[property]: value
                for property, value in form.items()
                if columns.get(property)
            }
            for form in forms.values()
        )
    )
    return edictor_cognatesets, affected_forms
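
A minimal usage sketch for load_forms_from_tsv, with a hypothetical file name:

from pathlib import Path

edictor_cognatesets, affected_forms = load_forms_from_tsv(
    dataset, Path("edictor_export.tsv")
)
print(f"{len(affected_forms)} forms touched, "
      f"{len(edictor_cognatesets)} Edictor cognate sets read.")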
Example 12
def edictor_to_cldf(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    new_cogsets: t.Mapping[
        types.Cognateset_ID, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ],
    affected_forms: t.Set[types.Form_ID],
    source: t.List[str] = [],
):
    ref_cogsets: t.MutableMapping[
        types.Cognateset_ID, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = t.DefaultDict(list)
    cognate: t.List[types.Judgement] = []
    judgements_lookup: t.MutableMapping[
        types.Form_ID, t.MutableMapping[types.Cognateset_ID, types.Judgement]
    ] = t.DefaultDict(dict)
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in affected_forms:
            ref_cogsets[j["cognatesetReference"]].append(
                (j["formReference"], j["segmentSlice"], j["alignment"])
            )
            judgements_lookup[j["formReference"]][j["cognatesetReference"]] = j
        else:
            cognate.append(j)
    matches = match_cognatesets(new_cogsets, ref_cogsets)

    for cognateset, judgements in new_cogsets.items():
        cognateset = matches[cognateset]
        if cognateset is None:
            cognateset = "_".join(f for f, _, _ in judgements)
        for form, slice, alignment in judgements:
            was: types.Judgement = judgements_lookup.get(form, {}).get(cognateset)
            if was:
                was["segmentSlice"] = util.indices_to_segment_slice(slice)
                was["alignment"] = alignment
                cognate.append(was)
                continue
            cognate.append(
                types.Judgement(
                    {
                        "id": f"{form}-{cognateset}",
                        "formReference": form,
                        "cognatesetReference": cognateset,
                        "alignment": alignment,
                        "segmentSlice": util.indices_to_segment_slice(slice),
                        "source": source,
                        # TODO: Any more parameters? Status update?
                    }
                )
            )

    cognate.sort(key=lambda j: j["id"])
    m = {
        util.cldf_property(c.propertyUrl) or c.name: c.name
        for c in dataset["CognateTable"].tableSchema.columns
    }
    dataset["CognateTable"].write(
        [{m[k]: v for k, v in j.items() if k in m} for j in cognate]
    )
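
Taken together, Examples 11 and 12 form the Edictor round trip. A sketch of a
driver, with hypothetical file name and source tag:

from pathlib import Path

new_cogsets, affected_forms = load_forms_from_tsv(
    dataset, Path("edictor_export.tsv")
)
edictor_to_cldf(dataset, new_cogsets, affected_forms, source=["edictor_import"])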