Example #1
def test_cell_comments_export():
    dataset, _ = copy_to_temp(
        Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json")
    _, out_filename = tempfile.mkstemp(".xlsx", "cognates")

    E = MatrixExcelWriter(dataset,
                          database_url="https://example.org/lexicon/{:}")
    forms = util.cache_table(dataset)
    languages = sorted(util.cache_table(dataset, "LanguageTable").values(),
                       key=lambda x: x["name"])
    judgements = [{
        "formReference": f["id"],
        "cognatesetReference": parameter
    } for f in forms.values()
                  for parameter in util.ensure_list(f["parameterReference"])]
    parameters = util.cache_table(dataset, "ParameterTable").values()
    E.create_excel(rows=parameters,
                   judgements=judgements,
                   forms=forms,
                   languages=languages)

    # Exhaust the iterator so that `col` is left holding the last column.
    for col in E.ws.iter_cols():
        pass
    assert (
        col[-1].comment and col[-1].comment.content
    ), "Last row of last column should contain a form, with a comment attached to it."
    assert (col[-1].comment.content == "A Comment!"
            ), "Comment should match the comment from the form table"
Example #2
def test_roundtrip(cldf_wordlist, working_and_nonworking_bibfile):
    filled_cldf_wordlist = working_and_nonworking_bibfile(cldf_wordlist)
    dataset, target = filled_cldf_wordlist
    c_formReference = dataset["CognateTable", "formReference"].name
    c_cogsetReference = dataset["CognateTable", "cognatesetReference"].name
    old_judgements = {(row[c_formReference], row[c_cogsetReference])
                      for row in dataset["CognateTable"].iterdicts()}
    writer = ExcelWriter(dataset,
                         database_url="https://example.org/lexicon/{:}")
    forms = util.cache_table(dataset)
    languages = util.cache_table(dataset, "LanguageTable").values()
    judgements = util.cache_table(dataset, "CognateTable").values()
    cogsets = util.cache_table(dataset, "CognatesetTable").values()
    writer.create_excel(rows=cogsets,
                        judgements=judgements,
                        forms=forms,
                        languages=languages)

    # Reset the existing cognatesets and cognate judgements, to avoid
    # interference with the data in the Excel file
    dataset["CognateTable"].write([])
    dataset["CognatesetTable"].write([])

    import_cognates_from_excel(writer.ws, dataset)

    new_judgements = {(row[c_formReference], row[c_cogsetReference])
                      for row in dataset["CognateTable"].iterdicts()}

    assert new_judgements == old_judgements
Example #3
def test_toexcel_filtered(cldf_wordlist, working_and_nonworking_bibfile,
                          caplog):
    dataset, url = working_and_nonworking_bibfile(cldf_wordlist)
    writer = MatrixExcelWriter(
        dataset=dataset,
        database_url=str(url),
    )
    forms = util.cache_table(dataset)
    languages = sorted(util.cache_table(dataset, "LanguageTable").values(),
                       key=lambda x: x["name"])
    judgements = [{
        "formReference": f["id"],
        "cognatesetReference": parameter
    } for f in forms.values()
                  for parameter in util.ensure_list(f["parameterReference"])]
    parameters = [
        c for n, c in util.cache_table(dataset, "ParameterTable").items()
        if n == "Woman"
    ]
    with caplog.at_level(logging.WARNING):
        writer.create_excel(rows=parameters,
                            judgements=judgements,
                            forms=forms,
                            languages=languages)
    assert len(list(writer.ws.iter_rows())) in {0, 2}
Example #4
def test_adding_singleton_cognatesets_with_status(caplog):
    dataset = get_dataset(
        Path(__file__).parent /
        "data/cldf/smallmawetiguarani/cldf-metadata.json")
    dataset.add_columns("CognatesetTable", "Status_Column")
    with caplog.at_level(logging.WARNING):
        excel_writer = ExcelWriter(dataset=dataset)
        cogsets, judgements = create_singletons(
            dataset,
            status="NEW",
            by_segment=True,
        )
        properties_as_key(cogsets,
                          dataset["CognatesetTable"].tableSchema.columns)
        properties_as_key(judgements,
                          dataset["CognateTable"].tableSchema.columns)
        forms = util.cache_table(dataset)
        languages = util.cache_table(dataset, "LanguageTable").values()
        excel_writer.create_excel(rows=cogsets,
                                  judgements=judgements,
                                  forms=forms,
                                  languages=languages)
    assert re.search("no Status_Column to write", caplog.text) is None

    cogset_index = 0
    for row in excel_writer.ws.iter_rows(min_row=1, max_row=1):
        for cell in row:
            if cell.value == "Status_Column":
                cogset_index = cell.column - 1
    # Rows from iter_rows are plain tuples, indexed 0-based (unlike Excel's 1-based columns)
    status = [
        row[cogset_index].value for row in excel_writer.ws.iter_rows(min_row=2)
    ]
    assert status == [None] * 11 + ["NEW"] * 4
Example #5
def test_adding_singleton_cognatesets(caplog):
    dataset = get_dataset(
        Path(__file__).parent /
        "data/cldf/smallmawetiguarani/cldf-metadata.json")
    with caplog.at_level(logging.WARNING):
        excel_writer = ExcelWriter(dataset=dataset)
        cogsets, judgements = create_singletons(
            dataset,
            status="NEW",
            by_segment=False,
        )
        properties_as_key(cogsets,
                          dataset["CognatesetTable"].tableSchema.columns)
        properties_as_key(judgements,
                          dataset["CognateTable"].tableSchema.columns)
        forms = util.cache_table(dataset)
        languages = util.cache_table(dataset, "LanguageTable").values()
        excel_writer.create_excel(rows=cogsets,
                                  judgements=judgements,
                                  forms=forms,
                                  languages=languages)
    assert re.search("No Status_Column", caplog.text)

    # load central concepts from output
    cogset_index = 0
    for row in excel_writer.ws.iter_rows(min_row=1, max_row=1):
        for cell in row:
            if cell.value == "CogSet":
                cogset_index = cell.column - 1
    # Rows from iter_rows are plain tuples, indexed 0-based (unlike Excel's 1-based columns)
    cogset_ids = [
        row[cogset_index].value for row in excel_writer.ws.iter_rows(min_row=2)
    ]
    assert cogset_ids == [
        "one1",
        "one1",
        "one2",
        "one6",
        "two1",
        "three1",
        "two8",
        "three9",
        "four1",
        "four8",
        "five5",
        "X_old_paraguayan_guarani_two_1",
        "X_paraguayan_guarani_five_1",
    ]
Example #6
def test_missing_required_column():
    dataset, _ = copy_to_temp(
        Path(__file__).parent /
        "data/cldf/smallmawetiguarani/cldf-metadata.json")
    dataset.remove_columns("FormTable", "ID")
    # TODO: switch to pycldf.dataset.SchemaError
    with pytest.raises(KeyError):
        excel_writer = ExcelWriter(dataset=dataset)
        forms = util.cache_table(dataset)
        languages = util.cache_table(dataset, "LanguageTable").values()
        judgements = util.cache_table(dataset, "CognateTable")
        cogsets = util.cache_table(dataset, "CognatesetTable")
        excel_writer.create_excel(rows=cogsets,
                                  judgements=judgements,
                                  forms=forms,
                                  languages=languages)
Example #7
def test_toexcel_runs(cldf_wordlist, working_and_nonworking_bibfile):
    filled_cldf_wordlist = working_and_nonworking_bibfile(cldf_wordlist)
    writer = ExcelWriter(
        dataset=filled_cldf_wordlist[0],
        database_url=str(filled_cldf_wordlist[1]),
    )
    forms = util.cache_table(filled_cldf_wordlist[0])
    languages = util.cache_table(filled_cldf_wordlist[0],
                                 "LanguageTable").values()
    judgements = util.cache_table(filled_cldf_wordlist[0],
                                  "CognateTable").values()
    cogsets = util.cache_table(filled_cldf_wordlist[0],
                               "CognatesetTable").values()
    writer.create_excel(rows=cogsets,
                        judgements=judgements,
                        forms=forms,
                        languages=languages)
    _, out_filename = tempfile.mkstemp(".xlsx", "cognates")
    writer.wb.save(filename=out_filename)
Example #8
def test_no_comment_column():
    dataset, _ = copy_to_temp(
        Path(__file__).parent /
        "data/cldf/smallmawetiguarani/cldf-metadata.json")
    dataset.remove_columns("FormTable", "comment")
    writer = ExcelWriter(dataset=dataset)
    forms = util.cache_table(dataset).values()
    first_form = next(iter(forms))
    assert (writer.form_to_cell_value(first_form).strip() ==
            "{ e t a k ɾ ã } ‘one, one’")
Example #9
def tiny_dataset():
    ds = util.fs.new_wordlist(
        FormTable=[{
            "ID": "f1"
        }],
        CognatesetTable=[
            {
                "ID": "s1",
                "Source": "3",
                "Description": "A"
            },
            {
                "ID": "s2",
                "Source": "3",
                "Description": "A"
            },
            {
                "ID": "s3",
                "Source": "3",
                "Description": "A"
            },
            {
                "ID": "s4",
                "Source": "1",
                "Description": "C"
            },
            {
                "ID": "s5",
                "Source": "1",
                "Description": "C"
            },
        ],
        CognateTable=[{
            "ID": f"{i}{n}",
            "Cognateset_ID": f"s{i}",
            "Form_ID": "f1"
        } for i in range(1, 6) for n in range(i)],
    )
    cognatesets = list(util.cache_table(ds, "CognatesetTable").values())
    judgements = util.cache_table(ds, "CognateTable").values()
    return cognatesets, judgements
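
A quick sanity check on this fixture (a sketch, not part of the source; it assumes lexedata is installed and `tiny_dataset` is defined as above): the CognateTable comprehension creates i judgements for cognateset s{i}, so there are 1 + 2 + 3 + 4 + 5 = 15 judgements in total.

import collections

# `judgements` are rows keyed by CLDF property names, as cache_table returns them.
cognatesets, judgements = tiny_dataset()
sizes = collections.Counter(j["cognatesetReference"] for j in judgements)
assert sum(sizes.values()) == 15
assert sizes["s5"] == 5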
Example #10
def test_toexcel_runs(cldf_wordlist, working_and_nonworking_bibfile):
    dataset, filename = working_and_nonworking_bibfile(cldf_wordlist)
    E = MatrixExcelWriter(
        dataset=dataset,
        database_url=str(filename),
    )
    forms = util.cache_table(dataset)
    languages = sorted(util.cache_table(dataset, "LanguageTable").values(),
                       key=lambda x: x["name"])
    judgements = [{
        "formReference": f["id"],
        "cognatesetReference": parameter
    } for f in forms.values()
                  for parameter in util.ensure_list(f["parameterReference"])]
    parameters = util.cache_table(dataset, "ParameterTable").values()
    E.create_excel(rows=parameters,
                   judgements=judgements,
                   forms=forms,
                   languages=languages)
    _, out_filename = tempfile.mkstemp(".xlsx", "cognates")
    E.wb.save(filename=out_filename)
Example #11
def cogsets_and_judgements(
    dataset,
    status: t.Optional[str],
    by_segment=True,
    logger: cli.logging.Logger = cli.logger,
):
    if status is not None:
        cogsets, judgements = create_singletons(
            dataset,
            status=status,
            by_segment=by_segment,
            logger=logger,
        )
        properties_as_key(cogsets,
                          dataset["CognatesetTable"].tableSchema.columns)
        properties_as_key(judgements,
                          dataset["CognateTable"].tableSchema.columns)
    else:
        cogsets = util.cache_table(dataset, "CognatesetTable").values()
        judgements = util.cache_table(dataset, "CognateTable").values()

    return cogsets, judgements
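
A hedged usage sketch: with a status string, the helper synthesizes singleton cognate sets (as in the singleton tests above) and re-keys them by column name; with status=None it simply caches the existing tables. Here `dataset` is assumed to be any wordlist with a CognateTable and a CognatesetTable.

# Sketch: both code paths of cogsets_and_judgements.
with_singletons = cogsets_and_judgements(dataset, status="NEW", by_segment=True)
as_is = cogsets_and_judgements(dataset, status=None)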
Example #12
def test_write_edictor_singleton_dataset():
    forms = {
        "form1": {
            "ID": "form1",
            "Language_ID": "axav1032",
            "Parameter_ID": "one",
            "Form": "the form",
            "Segments": list("ðəfom"),
            "Source": [],
        }
    }
    dataset = lexedata.util.fs.new_wordlist(
        FormTable=forms.values(),
        CognateTable=[{
            "ID": "1-1",
            "Form_ID": "form1",
            "Cognateset_ID": "c1",
            "Segment_Slice": ["1:1"],
            "Alignment": ["ð"],
        }],
    )
    file = io.StringIO()
    file.name = "<memory>"
    judgements_about_form = {
        "form1": (["ð", "(ə)", "(f)", "(o)", "(m)"], ["c1"])
    }
    cognateset_numbers = {"c1": 2}
    exporter.write_edictor_file(
        dataset,
        file,
        util.cache_table(dataset),
        judgements_about_form,
        cognateset_numbers,
    )
    rows = [
        line.strip().split("\t") for line in file.getvalue().split("\n")[:3]
    ]
    assert rows[2] == [""]
    assert dict(zip(rows[0], rows[1])) == {
        "ID": "1",
        "CONCEPT": "one",
        "DOCULECT": "axav1032",
        "IPA": "the form",
        "CLDF_id": "form1",
        "TOKENS": "ð ə f o m",
        "source": "",
        "comment": "",
        "COGID": "2",
        "ALIGNMENT": "ð ( ə f o m )",
    }
    assert "<memory>" in file.getvalue()
Example #13
def test_roundtrip_separator_column(cldf_wordlist,
                                    working_and_nonworking_bibfile):
    """Test whether a CognatesetTable column with separator survives a roundtrip."""
    dataset, target = working_and_nonworking_bibfile(cldf_wordlist)
    dataset.add_columns("CognatesetTable", "CommaSeparatedTags")
    dataset["CognatesetTable", "CommaSeparatedTags"].separator = ","
    c_id = dataset["CognatesetTable", "id"].name

    write_back = list(dataset["CognatesetTable"])
    tags = []
    for tag, row in zip(
            itertools.cycle([["two", "tags"], ["single-tag"], [],
                             ["tag;containing;other;separator"]]),
            write_back,
    ):
        tags.append((row[c_id], tag))
        row["CommaSeparatedTags"] = tag
    dataset.write(CognatesetTable=write_back)

    writer = ExcelWriter(dataset,
                         database_url="https://example.org/lexicon/{:}")
    _, out_filename = tempfile.mkstemp(".xlsx", "cognates")
    forms = util.cache_table(dataset)
    languages = util.cache_table(dataset, "LanguageTable").values()
    judgements = util.cache_table(dataset, "CognateTable").values()
    cogsets = util.cache_table(dataset, "CognatesetTable").values()
    writer.create_excel(rows=cogsets,
                        judgements=judgements,
                        forms=forms,
                        languages=languages)

    import_cognates_from_excel(writer.ws, dataset)

    reread_tags = [(c[c_id], c["CommaSeparatedTags"])
                   for c in dataset["CognatesetTable"]]
    reread_tags.sort(key=lambda x: x[0])
    tags.sort(key=lambda x: x[0])
    assert reread_tags == tags
Example #14
def test_cell_comments_export():
    dataset, _ = copy_to_temp(
        Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json")
    _, out_filename = tempfile.mkstemp(".xlsx", "cognates")

    writer = ExcelWriter(dataset,
                         database_url="https://example.org/lexicon/{:}")
    forms = util.cache_table(dataset)
    languages = sorted(util.cache_table(dataset, "LanguageTable").values(),
                       key=lambda x: x["name"])
    judgements = util.cache_table(dataset, "CognateTable").values()
    cogsets = util.cache_table(dataset, "CognatesetTable").values()
    writer.create_excel(rows=cogsets,
                        judgements=judgements,
                        forms=forms,
                        languages=languages)

    # Exhaust the iterator so that `col` is left holding the last column.
    for col in writer.ws.iter_cols():
        pass
    assert (
        col[-1].comment and col[-1].comment.content
    ), "Last row of last column should contain a judgement, with a comment attached to it."
    assert (col[-1].comment.content == "A judgement comment"
            ), "Comment should match the comment from the cognate table"
Example #15
def aligne_cognate_table(dataset: pycldf.Dataset,
                         status_update: t.Optional[str] = None):
    # add Status_Column if not existing – TODO: make configurable
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="CognateTable")

    forms = util.cache_table(dataset, "FormTable")

    c_id = dataset["CognateTable", "id"].name
    c_form_id = dataset["CognateTable", "formReference"].name
    c_cognateset_id = dataset["CognateTable", "cognatesetReference"].name
    c_slice = dataset["CognateTable", "segmentSlice"].name
    c_alignment = dataset["CognateTable", "alignment"].name

    cognatesets: t.Dict[str, t.List[t.Tuple[t.Tuple[str, t.List[str]],
                                            str]]] = {}
    judgements: t.Dict[str, t.Dict[str, t.Any]] = {}
    for judgement in cli.tq(
            dataset["CognateTable"],
            task="Aligning the cognate segments",
            total=dataset["CognateTable"].common_props.get("dc:extent"),
    ):
        judgements[judgement[c_id]] = judgement
        form = forms[judgement[c_form_id]]
        morpheme = []
        if not judgement[c_slice]:
            morpheme = form["segments"]
        else:
            morpheme = [
                form["segments"][i]
                for i in util.parse_segment_slices(judgement[c_slice])
            ]
        cognatesets.setdefault(judgement[c_cognateset_id], []).append(
            ((form["languageReference"], morpheme), judgement[c_id]))

    for cognateset, morphemes in cognatesets.items():
        for alignment, id in align(morphemes):
            judgements[id][c_alignment] = alignment
            if status_update:
                judgements[id]["Status_Column"] = status_update
    dataset.write(CognateTable=judgements.values())
Example #16
        ttype = ds.get_tabletype(table)
        c_id = table.get_column("http://cldf.clld.org/v1.0/terms.rdf#id")
        if c_id.datatype.base == "string":
            # Temporarily open up the datatype format, otherwise we may be unable to read
            c_id.datatype.format = None
        elif c_id.datatype.base == "integer":
            # Temporarily open up the datatype format, otherwise we may be unable to read
            c_id.datatype = "string"
            update_integer_ids(ds, table)
            c_id.datatype = "integer"
            continue
        else:
            logger.warning(
                f"Table {table.uri} had an id column ({c_id.name}) that is neither integer nor string. I did not touch it."
            )
            continue

        if args.transparent and ttype in ID_COMPONENTS:
            cols = {
                prop: ds[ttype, prop].name
                for prop in ID_COMPONENTS[ttype]
            }
            mapping = clean_mapping(cache_table(ds, ttype, cols))
        else:
            ids = {row[c_id.name] for row in ds[table]}
            mapping = clean_mapping(cache_table(ds, table.url.string, {}))

        update_ids(ds, table, mapping)

    ds.write_metadata()
Example #17
        help="Path to output file (default: output to stdout)",
        type=Path,
    )

    args = parser.parse_args()
    logger = cli.setup_logging(args)
    dataset = pycldf.Dataset.from_metadata(args.metadata)
    which_segment_belongs_to_which_cognateset = segment_to_cognateset(
        dataset=dataset,
        cognatesets=args.cognatesets,
        logger=logger,
    )

    overlapping_cognatesets = network_of_overlaps(
        which_segment_belongs_to_which_cognateset,
        forms_cache=util.cache_table(dataset))
    graph = networkx.Graph()
    graph.add_edges_from(overlapping_cognatesets)
    if graph.nodes():
        out = args.output_file.open("w") if args.output_file else sys.stdout

        # Sort to keep order persistent
        for community in sorted(
                networkx.algorithms.community.greedy_modularity_communities(
                    graph),
                key=lambda x: sorted(x),
        ):
            print("Cluster of overlapping cognate sets:", file=out)
            for cognateset in sorted(community):
                print(f"\t {cognateset}", file=out)
                # TODO: Generate form segments, if considered informative
Example #18
def segment_to_cognateset(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    cognatesets: t.Container[types.Cognateset_ID],
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Form_ID, t.List[t.Set[types.Cognateset_ID]]]:
    # required fields
    c_cognate_cognateset = dataset.column_names.cognates.cognatesetReference
    c_cognate_id = dataset.column_names.cognates.id
    c_cognate_form = dataset.column_names.cognates.formReference
    c_cognate_slice = dataset.column_names.cognates.segmentSlice

    forms = util.cache_table(dataset)
    cognateset_cache: t.Container[types.Cognateset_ID]
    if "CognatesetTable" in dataset:
        c_s_id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[c_s_id]
            for cognateset in dataset["CognatesetTable"]
            if cognatesets is None or cognateset[c_s_id] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = types.WorldSet()
        else:
            cognateset_cache = cognatesets

    which_segment_belongs_to_which_cognateset: t.Mapping[
        types.Form_ID, t.List[t.Set[types.Cognateset_ID]]] = {
            f: [set() for _ in form["segments"]]
            for f, form in forms.items() if form["form"]
            and form["form"].strip() and form["form"].strip() != "-"
        }
    for j in dataset["CognateTable"]:
        if j[c_cognate_form] in forms and j[
                c_cognate_cognateset] in cognateset_cache:
            form = forms[j[c_cognate_form]]
            if j[c_cognate_form] not in which_segment_belongs_to_which_cognateset:
                continue
            if j.get(c_cognate_slice):
                try:
                    segments_judged = list(
                        parse_segment_slices(j[c_cognate_slice]))
                except ValueError:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment slice {','.join(j[c_cognate_slice])} has start after end."
                    )
                    continue
            else:
                segments_judged = list(range(len(form["segments"])))
            old_s = None

            for s in segments_judged:
                if old_s is not None and old_s + 1 != s:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment {s+1} follows segment {old_s}, so the morpheme is non-contiguous"
                    )
                try:
                    cognatesets = which_segment_belongs_to_which_cognateset[
                        j[c_cognate_form]][s]
                except IndexError:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment slice {','.join(j[c_cognate_slice])} points outside valid range 1:{len(form['segments'])}."
                    )
                    continue
                cognatesets.add(j[c_cognate_cognateset])
                old_s = s

    return which_segment_belongs_to_which_cognateset
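
The return value holds one set of cognateset IDs per segment of each non-NA form. A sketch of the expected shape, assuming a hypothetical dataset with a five-segment form "f1" whose segments 2:4 (1-based, inclusive) are judged into cognateset "s1":

coverage = segment_to_cognateset(dataset, cognatesets=None)
# coverage["f1"] == [set(), {"s1"}, {"s1"}, {"s1"}, set()]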
Example #19
def read_wordlist(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    code_column: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> t.MutableMapping[types.Language_ID, t.MutableMapping[types.Parameter_ID,
                                                          t.Set]]:
    col_map = dataset.column_names

    if code_column:
        # Just in case that column was specified by property URL. We
        # definitely want the name. In any case, this will also throw a
        # helpful KeyError when the column does not exist.
        form_table_form = col_map.forms.form
        form_table_column = col_map.forms.id
        cognatesets = util.cache_table(
            dataset,
            columns={
                "form": form_table_column,
                "transcription": form_table_form,
                "code": dataset["FormTable", code_column].name,
            },
            filter=lambda row: bool(row[col_map.forms.form]),
        )
    else:
        # We search for cognatesetReferences in the FormTable or a separate
        # CognateTable.

        # Try the FormTable first.
        code_column = col_map.forms.cognatesetReference

        if code_column:
            # This is not the CLDF way, warn the user.
            form_table_column = col_map.forms.id
            form_table_form = col_map.forms.form
            logger.warning(
                "Your dataset has a cognatesetReference in the FormTable. Consider running lexedata.edit.add_cognate_table to create an explicit cognate table."
            )
            cognatesets = util.cache_table(
                dataset,
                columns={
                    "form": form_table_column,
                    "transcription": form_table_form,
                    "code": code_column,
                },
            )
        else:
            # There was no cognatesetReference in the form table. If we
            # find them in CognateTable (I mean, they should be there!), we
            # store them keyed with formReference.
            if (col_map.cognates and col_map.cognates.cognatesetReference
                    and col_map.cognates.formReference):
                code_column = col_map.cognates.cognatesetReference
                form_reference = col_map.cognates.formReference
                (foreign_key, ) = [
                    key
                    for key in dataset["CognateTable"].tableSchema.foreignKeys
                    if key.columnReference == [form_reference]
                ]
                (form_table_column, ) = foreign_key.reference.columnReference
                cognatesets = util.cache_table(
                    dataset,
                    "CognateTable",
                    {
                        "form": form_reference,
                        "code": code_column
                    },
                )
            else:
                raise ValueError(
                    "Dataset has no cognatesetReference column in its "
                    "primary table or in a separate cognate table. "
                    "Is this a metadata-free wordlist and you forgot to "
                    "specify code_column explicitly?")

    # Cognate sets have been loaded. Consolidate.
    cognates_by_form: t.MutableMapping[
        types.Form_ID, t.Set[types.Cognateset_ID]] = t.DefaultDict(set)
    for judgement in cognatesets.values():
        cognates_by_form[judgement["form"]].add(judgement["code"])
    parameter_column = col_map.forms.parameterReference

    # If one form can have multiple concepts,
    if dataset["FormTable", parameter_column].separator:

        def all_parameters(parameter):
            return list(parameter)

    else:

        def all_parameters(parameter):
            return [parameter]

    data: t.MutableMapping[types.Language_ID,
                           t.MutableMapping[types.Parameter_ID, t.Set]]
    if "LanguageTable" in dataset:
        (langref_target, ) = [
            key for key in dataset["FormTable"].tableSchema.foreignKeys
            if key.columnReference ==
            [dataset["FormTable", "languageReference"].name]
        ]
        ref_col = langref_target.reference.columnReference[0]
        data = {
            lang[ref_col]: t.DefaultDict(set)
            for lang in dataset["LanguageTable"]
        }
    else:
        data = t.DefaultDict(lambda: t.DefaultDict(set))
    for row in dataset["FormTable"].iterdicts():
        if not row[col_map.forms.form]:
            # Transcription is empty, should not be a form. Skip, but maybe
            # warn if it was in a cognateset.
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as empty (i.e. the source noted that the form is unknown), but it was judged to be in cognateset %s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    cognates_by_form[row[form_table_column]],
                )
            continue

        language = row[col_map.forms.languageReference]
        if row[col_map.forms.form] == "-":
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as '-' (i.e. “concept is not available in language %s”), but it was judged to be in cognateset %s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    language,
                    cognates_by_form[row[form_table_column]],
                )
                cognates_by_form[row[form_table_column]] = set()
            for parameter in all_parameters(row[parameter_column]):
                if data[language][parameter]:
                    logger.warning(
                        "Form %s claims concept %s is not available in language %s, but cognatesets %s are allocated to that concept in that language already.",
                        row[col_map.forms.id],
                        parameter,
                        row[col_map.forms.languageReference],
                        data[language][parameter],
                    )
        for parameter in all_parameters(row[parameter_column]):
            data[language][parameter] |= cognates_by_form[
                row[form_table_column]]
    return data
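
The nested mapping makes the language-by-concept coding easy to consume. A minimal sketch (assuming `dataset` is a CLDF wordlist with an explicit CognateTable):

# Count, per language, the concepts with at least one cognate coding.
data = read_wordlist(dataset, code_column=None)
for language, concepts in data.items():
    coded = [c for c, cognatesets in concepts.items() if cognatesets]
    print(language, len(coded))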
Example #20
def forms_to_tsv(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    languages: t.Iterable[str],
    concepts: t.Set[str],
    cognatesets: t.Iterable[str],
    logger: cli.logging.Logger = cli.logger,
):
    try:
        dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            """Edictor export requires your dataset to have segments in the FormTable.
        Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
        )

    delimiters = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset["FormTable"].tableSchema.columns if c.separator
    }

    # Prepare the header for the TSV output; Edictor expects the first column
    # to be named ID and to contain 1-based integer IDs.
    tsv_header = list(dataset["FormTable"].tableSchema.columndict.keys())

    tsv_header.insert(0, "LINGPY_ID")
    tsv_header.append("cognatesetReference")
    if "alignment" not in tsv_header:
        tsv_header.append("alignment")
    if "parameterReference" in delimiters:
        tsv_header.append("_parameterReference")

    # select forms and cognates given restriction of languages and concepts, cognatesets respectively
    forms = {}
    for f, form in util.cache_table(dataset).items():
        if form["form"] is None or form["form"] == "-":
            continue
        if form["languageReference"] in languages and concepts.intersection(
                ensure_list(form["parameterReference"])):
            # Normalize the form:
            # 1. No list-valued entries
            for c, d in delimiters.items():
                if c == "segments":
                    continue
                if c == "parameterReference":
                    form["_parameterReference"] = d.join(
                        str(e) for e in form[c])
                    form["parameterReference"] = form["parameterReference"][0]
                    continue

                form[c] = d.join(str(e) for e in form[c])

            if not form.get("segments"):
                logger.warning(
                    "No segments found for form %s. You can generate segments using `lexedata.edit.add_segments`.",
                    form["id"],
                )

            # 2. No tabs, newlines in entries
            for c, v in form.items():
                if type(v) == str:
                    if "\\!t" in form[c] or "\\!n" in form[c]:
                        logger.warning(
                            "Your data contains the special characters '\\!t' or '\\!n', which I will introduce for escaping tabs and newlines for edictor. These characters will not survive the back-import."
                        )
                    form[c] = form[c].replace("\t",
                                              "\\!t").replace("\n", "\\!n")

            forms[f] = form

    cognateset_cache: t.Mapping[t.Optional[str], int]
    if "CognatesetTable" in dataset:
        id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[id]: c
            for c, cognateset in enumerate(dataset["CognatesetTable"], 1)
            if cognateset[id] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = t.DefaultDict(itertools.count().__next__)
        else:
            cognateset_cache = {c: i for i, c in enumerate(cognatesets, 1)}

    # Warn about unexpected non-concatenative ‘morphemes’
    lexedata.report.nonconcatenative_morphemes.segment_to_cognateset(
        dataset, cognatesets, logger)

    judgements_about_form: t.Mapping[types.Form_ID,
                                     t.Tuple[t.List[str], t.List[int]]] = {
        id: ([f"({s})" for s in form["segments"]], [])
        for id, form in forms.items()
    }
    # Compose all judgements, last-one-rules mode.
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in forms and cognateset_cache.get(
                j["cognatesetReference"]):
            if j.get("alignment"):
                j["alignment"] = [s or "" for s in j["alignment"]]
            else:
                j["alignment"] = forms[j["formReference"]]["segments"]

            try:
                segments_judged = list(
                    parse_segment_slices(segment_slices=j["segmentSlice"],
                                         enforce_ordered=False))
            except TypeError:
                logger.warning(
                    "In judgement %s: No segment slice given. Assuming whole form.",
                    j["id"],
                )
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except KeyError:
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except ValueError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue
            global_alignment, cogsets = judgements_about_form[
                j["formReference"]]
            segment_start, segment_end = min(
                segments_judged), max(segments_judged) + 1
            try:
                glue_in_alignment(
                    global_alignment,
                    cogsets,
                    j["alignment"],
                    j["cognatesetReference"],
                    slice(segment_start, segment_end),
                )
            except IndexError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue

    return forms, judgements_about_form, cognateset_cache
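
The three return values line up with the Edictor writer shown in Example #12. A hedged sketch of the export step (the file name and the language/concept/cognateset selections are placeholders):

forms, judgements_about_form, cognateset_numbers = forms_to_tsv(
    dataset, languages=languages, concepts=concepts, cognatesets=cognatesets)
with open("export.tsv", "w", encoding="utf-8") as file:
    write_edictor_file(dataset, file, forms, judgements_about_form,
                       cognateset_numbers)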
Example #21
def load_forms_from_tsv(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    input_file: Path,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[t.Mapping[int, t.Sequence[t.Tuple[types.Form_ID, range,
                                               t.Sequence[str]]]],
             t.Set[types.Form_ID]]:
    """

    Side effects
    ============
    This function overwrites dataset's FormTable
    """
    input = csv.DictReader(
        input_file.open(encoding="utf-8"),
        delimiter="\t",
    )

    # These days, all dicts are ordered by default. Still, better make this explicit.
    forms = util.cache_table(dataset)

    edictor_cognatesets: t.Dict[
        int, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = collections.defaultdict(list)

    form_table_upper = {
        (util.cldf_property(column.propertyUrl) or column.name).upper(): (
            util.cldf_property(column.propertyUrl) or column.name
        )
        for column in dataset["FormTable"].tableSchema.columns
    }
    form_table_upper.update(
        {
            "DOCULECT": "languageReference",
            "CONCEPT": "parameterReference",
            "IPA": "form",
            "COGID": "cognatesetReference",
            "ALIGNMENT": "alignment",
            "TOKENS": "segments",
            "CLDF_ID": "id",
            "ID": "",
        }
    )
    if "_PARAMETERREFERENCE" in [f.upper() for f in input.fieldnames]:
        form_table_upper["_PARAMETERREFERENCE"] = "parameterReference"
        form_table_upper["CONCEPT"] = ""

    separators: t.MutableMapping[str, t.Optional[str]] = {}
    # TODO: What's the logic behind going backwards through this? We are not modifying fieldnames.
    for i in range(len(input.fieldnames)):
        if i == 0 and input.fieldnames[0] != "ID":
            raise ValueError(
                "When importing from Edictor, expected the first column to be named 'ID', but found %s",
                input.fieldnames["ID"],
            )

        lingpy = input.fieldnames[i]
        try:
            input.fieldnames[i] = form_table_upper[lingpy.upper()]
        except KeyError:
            logger.warning(
                "Your edictor file contained a column %s, which I could not interpret.",
                lingpy,
            )

        if input.fieldnames[i] == "cognatesetReference":
            separators[input.fieldnames[i]] = " "
        elif input.fieldnames[i] == "alignment":
            separators[input.fieldnames[i]] = " "

        try:
            separators[input.fieldnames[i]] = dataset[
                "FormTable", input.fieldnames[i]
            ].separator
        except KeyError:
            pass

    logger.info(
        "The header of your edictor file will be interpreted as %s.", input.fieldnames
    )

    affected_forms: t.Set[types.Form_ID] = set()
    for line in cli.tq(
        input, task="Importing form rows from edictor…", total=len(forms)
    ):
        # Column "" is the re-named Lingpy-ID column, so the first one.
        if not any(line.values()) or line[""].startswith("#"):
            # One of Edictor's comment rows, storing settings
            continue

        for (key, value) in line.items():
            value = value.replace("\\!t", "\t").replace("\\!n", "\n")
            sep = separators[key]
            if sep is not None:
                if not value:
                    line[key] = []
                else:
                    line[key] = value.split(sep)
            else:
                line[key] = value

        affected_forms.add(line["id"])

        try:
            for segments, cognateset, alignment in extract_partial_judgements(
                line["segments"],
                line["cognatesetReference"],
                line["alignment"],
                logger,
            ):
                edictor_cognatesets[cognateset].append(
                    (line["id"], segments, alignment)
                )
            forms[line["id"]] = line
        except IndexError:
            logger.warning(
                f"In form with Lingpy-ID {line['']}: Cognateset judgements {line['cognatesetReference']} and alignment {line['alignment']} did not match. At least one morpheme skipped."
            )
    # Edictor uses cognateset 0 for unassigned morphemes; discard that bucket.
    edictor_cognatesets.pop(0, None)

    columns = {
        (util.cldf_property(column.propertyUrl) or column.name): column.name
        for column in dataset["FormTable"].tableSchema.columns
    }
    # Deliberately make use of the property of `write` to discard any entries
    # that don't correspond to existing columns. Otherwise, we'd still have to
    # get rid of the alignment, cognatesetReference and Lingpy-ID columns.
    dataset["FormTable"].write(
        (
            {
                columns[property]: value
                for property, value in form.items()
                if columns.get(property)
            }
            for form in forms.values()
        )
    )
    return edictor_cognatesets, affected_forms
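
A hedged round-trip sketch: the function rewrites the FormTable as a side effect and returns the Edictor cognateset assignments plus the set of affected forms (the TSV path is a placeholder):

edictor_cognatesets, affected_forms = load_forms_from_tsv(
    dataset, Path("export.tsv"))
# edictor_cognatesets[cogid] is a list of (form ID, segment range, alignment)
# tuples, ready to be turned back into CognateTable rows.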
Example #22
        dataset, args.add_singletons_with_status, args.by_segment, logger)

    try:
        cogset_order = (util.cldf_property(
            dataset["CognatesetTable", args.sort_cognatesets_by].propertyUrl)
                        or dataset["CognatesetTable",
                                   args.sort_cognatesets_by].name)
    except KeyError:
        cli.Exit.INVALID_COLUMN_NAME(
            f"No column '{args.sort_cognatesets_by}' in your CognatesetTable.")
    sort_cognatesets(cogsets, judgements, cogset_order, size=args.size_sort)

    # TODO: wrap the following two blocks into a
    # get_sorted_languages() -> t.OrderedDict[languageReference, Column Header/Title/Name]
    # function
    languages = list(util.cache_table(dataset, "LanguageTable").values())
    if args.sort_languages_by:
        c_sort = (util.cldf_property(
            dataset["LanguageTable", args.sort_languages_by].propertyUrl)
                  or dataset["LanguageTable", args.sort_languages_by].name)
        languages.sort(key=lambda x: x[c_sort], reverse=False)

    forms = util.cache_table(dataset)

    E.create_excel(
        size_sort=args.size_sort,
        languages=languages,
        rows=cogsets,
        judgements=judgements,
        forms=forms,
    )
Example #23
        type=str,
        default="https://example.org/lexicon/{:}",
        help=
        "A template string for URLs pointing to individual forms. For example, to"
        " point to lexibank, you would use https://lexibank.clld.org/values/{:}."
        " (default: https://example.org/lexicon/{:})",
    )
    args = parser.parse_args()
    logger = cli.setup_logging(args)

    dataset = pycldf.Wordlist.from_metadata(args.metadata)
    E = MatrixExcelWriter(
        dataset,
        database_url=args.url_template,
        logger=logger,
    )
    forms = util.cache_table(dataset)
    languages = sorted(util.cache_table(dataset, "LanguageTable").values(),
                       key=lambda x: x["name"])
    judgements = [{
        "formReference": f["id"],
        "cognatesetReference": parameter
    } for f in forms.values()
                  for parameter in util.ensure_list(f["parameterReference"])]
    parameters = util.cache_table(dataset, "ParameterTable").values()
    E.create_excel(rows=parameters,
                   judgements=judgements,
                   forms=forms,
                   languages=languages)
    E.wb.save(filename=args.excel)
Example #24
def create_singletons(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    status: t.Optional[str] = None,
    by_segment: bool = False,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[t.Sequence[types.CogSet], t.Sequence[types.Judgement]]:
    """Create singleton cognate judgements for forms that don't have cognate judgements.

    Depending on by_segment, singletons are created for every range of segments
    that is not in any cognate set yet (True) or just for every form where no
    segment is in any cognate sets (False).

    """
    forms = util.cache_table(dataset)
    c_j_id = dataset["CognateTable", "id"].name
    c_j_cogset = dataset["CognateTable", "cognatesetReference"].name
    c_j_form = dataset["CognateTable", "formReference"].name
    try:
        c_j_segmentslice = dataset["CognateTable", "segmentSlice"].name
    except KeyError:
        c_j_segmentslice = None
    try:
        c_j_alignment = dataset["CognateTable", "alignment"].name
    except KeyError:
        c_j_alignment = None

    if not dataset.get(("CognatesetTable", "Status_Column")):
        logger.warning(
            "No Status_Column in CognatesetTable. I will proceed without. Run `lexedata.edit.add_status_column` in default mode or with table name CognatesetTable to add a Status_Column."
        )

    try:
        c_s_id = dataset["CognatesetTable", "id"].name
        all_cognatesets = {s[c_s_id]: s for s in dataset["CognatesetTable"]}
    except KeyError:
        c_s_id = "id"
        c_s_name = "name"
        all_cognatesets = {
            id: types.CogSet({
                "id": id,
                "name": id
            })
            for id in {j[c_j_cogset]
                       for j in dataset["CognateTable"]}
        }
    try:
        c_s_name = dataset["CognatesetTable", "name"].name
    except KeyError:
        c_s_name = c_s_id

    all_judgements = list(dataset["CognateTable"])
    if by_segment:
        judgements = segment_to_cognateset(dataset, types.WorldSet(), logger)
        forms_and_segments = uncoded_segments(judgements, logger)
    else:
        forms_and_segments = uncoded_forms(
            forms.values(), {j[c_j_form]
                             for j in all_judgements})
    for form, slice in forms_and_segments:
        i = 1
        singleton_id = f"X_{form}_{i:d}"
        while singleton_id in all_cognatesets:
            i += 1
            singleton_id = f"X_{form}_{i:d}"
        all_cognatesets[singleton_id] = types.CogSet({})
        properties = {
            c_s_name: util.ensure_list(forms[form]["parameterReference"])[0],
            c_s_id: singleton_id,
            "Status_Column": status,
        }
        try:
            for column in dataset["CognatesetTable"].tableSchema.columns:
                all_cognatesets[singleton_id][column.name] = properties.get(
                    column.name)
        except KeyError:
            pass
        judgement = types.Judgement({})
        properties = {
            c_j_id: singleton_id,
            c_j_cogset: singleton_id,
            c_j_form: form,
            c_j_segmentslice: indices_to_segment_slice(slice),
            c_j_alignment: [forms[form]["segments"][i] for i in slice],
            "Status_Column": status,
        }
        for column in dataset["CognateTable"].tableSchema.columns:
            judgement[column.name] = properties.get(column.name)
        all_judgements.append(judgement)
    return all_cognatesets.values(), all_judgements
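
Singleton IDs follow the pattern X_<form ID>_<n>, as the expected IDs in Example #5 above show. A minimal usage sketch (assuming pycldf's keyword-argument write, as used elsewhere in this code):

cogsets, judgements = create_singletons(dataset, status="NEW", by_segment=False)
dataset.write(CognatesetTable=list(cogsets), CognateTable=judgements)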
Example #25
def add_cognate_table(
    dataset: pycldf.Wordlist,
    split: bool = True,
    logger: cli.logging.Logger = cli.logger,
) -> None:
    if "CognateTable" in dataset:
        return
    dataset.add_component("CognateTable")

    # TODO: Check if that cognatesetReference is already a foreign key to
    # elsewhere (could be a CognatesetTable, could be whatever), because then
    # we need to transfer that knowledge.

    # Load anything that's useful for a cognate set table: Form IDs, segments,
    # segment slices, cognateset references, alignments
    columns = {
        "id": dataset["FormTable", "id"].name,
        "concept": dataset["FormTable", "parameterReference"].name,
        "form": dataset["FormTable", "form"].name,
    }
    for property in [
            "segments", "segmentSlice", "cognatesetReference", "alignment"
    ]:
        try:
            columns[property] = dataset["FormTable", property].name
        except KeyError:
            pass
    cognate_judgements = []
    forms = cache_table(dataset, columns=columns)
    forms_without_segments = 0
    for f, form in cli.tq(forms.items(),
                          task="Extracting cognate judgements from forms…"):
        if form.get("cognatesetReference"):
            if split:
                cogset = util.string_to_id("{:}-{:}".format(
                    form["concept"], form["cognatesetReference"]))
            else:
                cogset = form["cognatesetReference"]
            judgement = {
                "ID": f,
                "Form_ID": f,
                "Cognateset_ID": cogset,
            }
            try:
                judgement["Segment_Slice"] = form["segmentSlice"]
            except KeyError:
                try:
                    if not form["segments"]:
                        raise ValueError("No segments")
                    if ("+" in form["segments"]
                            and dataset["FormTable",
                                        "cognatesetReference"].separator):
                        logger.warning(
                            "You seem to have morpheme annotations in your cognates. I will probably mess them up a bit, because I have not been taught properly how to deal with them. Sorry!"
                        )
                    judgement["Segment_Slice"] = [
                        "1:{:d}".format(len(form["segments"]))
                    ]
                except (KeyError, TypeError, ValueError):
                    forms_without_segments += 1
                    if forms_without_segments < 5:
                        logger.warning(
                            f"No segments found for form {f} ({form['form']})."
                        )
            # What does an alignment mean without segments or their slices?
            # Doesn't matter, if we were given one, we take it.
            judgement["Alignment"] = form.get("alignment")
            cognate_judgements.append(judgement)

    if forms_without_segments >= 5:
        logger.warning(
            "No segments found for %d forms. You can generate segments using `lexedata.edit.segment_using_clts`.",
            forms_without_segments,
        )

    # Delete the cognateset column
    cols = dataset["FormTable"].tableSchema.columns
    remove = {
        dataset["FormTable", c].name
        for c in ["cognatesetReference", "segmentSlice", "alignment"]
        if ("FormTable", c) in dataset
    }

    def clean_form(form):
        for c in remove:
            form.pop(c, None)
        return form

    forms = [clean_form(form) for form in dataset["FormTable"]]
    for c in remove:
        ix = cols.index(dataset["FormTable", c])
        del cols[ix]

    dataset.write(FormTable=forms)

    dataset.write(CognateTable=cognate_judgements)
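
A hedged usage sketch: promote an implicit #cognatesetReference column on the FormTable into an explicit CognateTable (the metadata path is a placeholder):

dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")
# With split=True, identical codes under different concepts are split into
# distinct cognate sets named <concept>-<code>.
add_cognate_table(dataset, split=True)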
Example #26
def apply_heuristics(
    dataset: types.Wordlist,
    heuristic: t.Optional[AbsenceHeuristic] = None,
    primary_concepts: t.Union[
        types.WorldSet[types.Parameter_ID],
        t.AbstractSet[types.Parameter_ID]] = types.WorldSet(),
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Cognateset_ID, t.Set[types.Parameter_ID]]:
    """Compute the relevant concepts for cognatesets, depending on the heuristic.

    These concepts will be considered when deciding whether a root is deemed
    absent in a language.

    For the CentralConcept heuristic, the relevant concepts are the
    central concept of a cognateset, as given by the #parameterReference column
    of the CognatesetTable. A central concept not included in the
    primary_concepts is ignored with a warning.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concept",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference"))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concept": "concept1"}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {'cognateset1': {'concept1'}}
    True

    This extends to the case where a cognateset may have more than one central concept.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concepts",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
    ...         separator=","))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concepts": ["concept1", "concept2"]}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {
    ...     'cognateset1': {'concept1', 'concept2'}}
    True

    For the HalfPrimaryConcepts heuristic, the relevant concepts are all
    primary concepts connected to a cognateset.

    >>> ds = util.fs.new_wordlist(
    ...     FormTable=[
    ...         {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1", "Form": "x"},
    ...         {"ID": "f2", "Parameter_ID": "c2", "Language_ID": "l1", "Form": "x"}],
    ...     CognateTable=[
    ...         {"ID": "1", "Form_ID": "f1", "Cognateset_ID": "s1"},
    ...         {"ID": "2", "Form_ID": "f2", "Cognateset_ID": "s1"}])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.HALFPRIMARYCONCEPTS) == {
    ...     's1': {'c1', 'c2'}}
    True


    NOTE: This function cannot guarantee that every cognateset has at least
    one relevant concept; there may be cognatesets without! A cognateset with
    0 relevant concepts will always be included, because 0 is at least half of 0.

    """
    heuristic = (heuristic if heuristic is not None else
                 (AbsenceHeuristic.CENTRALCONCEPT if
                  ("CognatesetTable", "parameterReference") in dataset else
                  AbsenceHeuristic.HALFPRIMARYCONCEPTS))

    relevant_concepts: t.MutableMapping[
        types.Cognateset_ID, t.Set[types.Parameter_ID]] = t.DefaultDict(set)

    if heuristic is AbsenceHeuristic.HALFPRIMARYCONCEPTS:
        c_f = dataset["CognateTable", "formReference"].name
        c_s = dataset["CognateTable", "cognatesetReference"].name
        concepts = util.cache_table(
            dataset,
            "FormTable",
            {"concepts": dataset["FormTable", "parameterReference"].name},
        )
        for j in dataset["CognateTable"]:
            form = concepts[j[c_f]]
            for concept in util.ensure_list(form["concepts"]):
                relevant_concepts[j[c_s]].add(concept)

    elif heuristic is AbsenceHeuristic.CENTRALCONCEPT:
        c_cognateset_concept = dataset["CognatesetTable",
                                       "parameterReference"].name
        c_id = dataset["CognatesetTable", "id"].name
        for c in dataset["CognatesetTable"]:
            for concept in util.ensure_list(c[c_cognateset_concept]):
                if concept not in primary_concepts:
                    logger.warning(
                        f"The central concept {concept} of cognateset {c[c_id]} was not part of your list of primary concepts to be included in the coding, so the cognateset will be ignored."
                    )
                else:
                    relevant_concepts[c[c_id]].add(concept)

    else:
        raise TypeError(
            f"Value of heuristic, {heuristic}, did not correspond to a known AbsenceHeuristic."
        )

    return relevant_concepts
Example #27
def check_cognate_table(dataset: pycldf.Wordlist,
                        logger=cli.logger,
                        strict_concatenative=False) -> bool:
    """Check that the CognateTable makes sense.

    The cognate table MUST have an indication of forms, in a #formReference
    column, and cognate sets, in a #cognatesetReference column. It SHOULD have
    segment slices (#segmentSlice) and alignments (#alignment).

     - The segment slice must be a valid (1-based, inclusive) slice into the segments of the form
     - The alignment must match the segment slice applied to the segments of the form
     - The length of the alignment must match the lengths of other alignments of that cognate set
     - NA forms (including "" for “source reports form as unknown”) must not be in cognate sets

    If checking for strictly concatenative morphology, also check that the
    segment slice is a contiguous, non-overlapping section of the form.

    Having no cognates is a valid choice for a dataset, so this function returns True if no CognateTable was found.

    """

    # First, load all forms that are referenced in the CognateTable

    try:
        cognatetable = dataset["CognateTable"]
    except KeyError:
        # Having no cognates is a valid choice for a dataset.
        return True

    try:
        c_form = dataset["CognateTable", "formReference"].name
    except KeyError:
        log_or_raise("CognateTable does not have a #formReference column.")
        # All further checks don't make sense, return early.
        return False

    try:
        c_cognateset = dataset["CognateTable", "cognatesetReference"].name
    except KeyError:
        log_or_raise(
            "CognateTable does not have a #cognatesetReference column.")
        # All further checks don't make sense, return early.
        return False

    # The CLDF specifications state that foreign key references take precedence
    # over the implicit semantics of a `#xxxReference` column pointing to an
    # `#id` column, so we need to find forms by the stated foreign key
    # relationship.
    for foreign_key in cognatetable.tableSchema.foreignKeys:
        if foreign_key.columnReference == [c_form]:
            referenced_table = str(foreign_key.reference.resource)
            # A multi-column column reference for a single-column foreign key
            # makes no sense, so use tuple unpacking to extract the only
            # element from that list.
            (referenced_column, ) = foreign_key.reference.columnReference
            if (dataset[referenced_table].common_props["dc:conformsTo"]
                    != "http://cldf.clld.org/v1.0/terms.rdf#FormTable"):
                log_or_raise(
                    "CognateTable #formReference does not reference a FormTable.",
                )
            break
    else:
        log_or_raise("CognateTable #formReference must be a foreign key.")
        # All further checks don't make sense, return early.
        return False

    try:
        c_sslice = dataset["CognateTable", "segmentSlice"].name
    except KeyError:
        logger.info("CognateTable does not have a #segmentSlice column.")
        c_sslice = None

    try:
        c_alignment = dataset["CognateTable", "alignment"].name
    except KeyError:
        logger.info("CognateTable does not have an #alignment column.")
        c_alignment = None

    if c_sslice is None and c_alignment is None:
        # No additional data concerning the associations between forms and
        # cognate sets. That's sad, but valid.
        # All further checks don't make sense, return early.
        return True

    try:
        c_f_form = dataset[referenced_table, "form"].name

        def form_given(row):
            return row[c_f_form] and row[c_f_form].strip() != "-"

    except KeyError:
        if dataset[referenced_table] == dataset["FormTable"]:
            log_or_raise("FormTable does not have a #form column.")

        def form_given(row):
            return True

    # Check whether each row is valid.
    all_judgements_okay = True
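    # Cache the segments of all given forms; keep NA forms (empty or "-")
    # separate, so judgements referencing them can be flagged below.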
    forms = cache_table(
        dataset,
        columns={"segments": dataset[referenced_table, "segments"].name},
        table=referenced_table,
        index_column=referenced_column,
        filter=form_given,
    )
    missing_forms = cache_table(
        dataset,
        columns={},
        table=referenced_table,
        index_column=referenced_column,
        filter=lambda row: not form_given(row),
    )
    cognateset_alignment_lengths: t.DefaultDict[
        t.Any, t.Set[int]] = t.DefaultDict(set)

    for f, j, judgement in dataset["CognateTable"].iterdicts(
            with_metadata=True):
        try:
            form_segments = forms[judgement[c_form]]["segments"]
        except KeyError:
            if judgement[c_form] in missing_forms:
                log_or_raise(
                    "In {}, row {}: NA form {} was judged to be in cognate set."
                    .format(f, j, judgement[c_form]), )
            # The case of a missing foreign key in general is already handled
            # by the basic CLDF validator.
            continue

        if c_sslice is not None:
            if not judgement[c_sslice]:
                log_or_raise("In {}, row {}: Empty segment slice".format(f, j))
                continue
            try:
                included_segments = list(
                    parse_segment_slices(judgement[c_sslice]))
                if (max(included_segments) >= len(form_segments)
                        or min(included_segments) < 0):
                    log_or_raise(
                        "In {}, row {}: Segment slice {} is invalid for segments {}"
                        .format(
                            f,
                            j,
                            judgement[c_sslice],
                            form_segments,
                        ), )
                    all_judgements_okay = False
                    continue
                if strict_concatenative:
                    s1 = included_segments[0]
                    for s2 in included_segments[1:]:
                        if s2 != s1 + 1:
                            log_or_raise(
                                "In {}, row {}: Segment slice {} has non-consecutive elements {}, {}"
                                .format(
                                    f,
                                    j,
                                    judgement[c_sslice],
                                    s1,
                                    s2,
                                ))
                        s1 = s2
            except ValueError:
                log_or_raise(
                    "In {}, row {}: Segment slice {} is invalid".format(
                        f,
                        j,
                        judgement[c_sslice],
                    ))
                all_judgements_okay = False
                continue
        else:
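            # Without a segment slice, the judgement covers the whole form.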
            included_segments = list(range(len(form_segments)))

        if c_alignment:
            # Length of alignment should match length of every other alignment in this cognate set.
            lengths = cognateset_alignment_lengths[judgement[c_cognateset]]
            alignment_length = len(judgement[c_alignment])
            if lengths and alignment_length not in lengths:
                log_or_raise(
                    "In {}, row {}: Alignment has length {}, other alignments of cognateset {} have length(s) {}"
                    .format(f, j, alignment_length, judgement[c_cognateset],
                            lengths), )
                all_judgements_okay = False
            elif not lengths:
                lengths.add(alignment_length)

            # Alignment when gaps are removed should match segments. TODO:
            # Should we permit other gap characters? Where do we know them
            # from? TODO: To be more robust when segments are separated into
            # morphemes, not individual segments, compare alignment and
            # segments space-separated.
            without_gaps = " ".join(
                [c or "" for c in judgement[c_alignment] if c != "-"])
            actual_segments = " ".join(form_segments[i]
                                       for i in included_segments)
            if without_gaps.strip() != actual_segments.strip():
                if unicodedata.normalize(
                        "NFKC", without_gaps.strip()) == unicodedata.normalize(
                            "NFKC", actual_segments.strip()):
                    comment = " This is down to encoding differences: Their normalized unicode representations are the same. I suggest you run `lexedata.edit.normalize_unicode`."
                else:
                    comment = ""
                log_or_raise(
                    "In {}, row {}: Referenced segments in form resolve to {}, while alignment contains segments {}.{}"
                    .format(f, j, actual_segments, without_gaps, comment), )
                all_judgements_okay = False

    return all_judgements_okay
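
The slice and alignment checks above can be exercised in isolation. A hedged, standalone sketch; it assumes parse_segment_slices is importable from lexedata.util and turns the 1-based, inclusive notation described in the docstring into 0-based indices, which is what the bounds checks in check_cognate_table imply:

from lexedata.util import parse_segment_slices  # assumed import location

segments = ["t", "e", "s", "t"]                 # the form's #segments
included = list(parse_segment_slices(["2:4"]))  # expected: [1, 2, 3]

# Bounds check, mirroring check_cognate_table:
assert 0 <= min(included) and max(included) < len(segments)

# Strict-concatenative check: every index follows its predecessor.
assert all(s2 == s1 + 1 for s1, s2 in zip(included, included[1:]))

# Gap-stripped alignment must equal the sliced segments.
alignment = ["e", "-", "s", "t"]
without_gaps = " ".join(c for c in alignment if c != "-")
actual_segments = " ".join(segments[i] for i in included)
assert without_gaps == actual_segments          # both are "e s t"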
Ejemplo n.º 28
0
def edictor_to_cldf(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    new_cogsets: t.Mapping[
        types.Cognateset_ID, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ],
    affected_forms: t.Set[types.Form_ID],
    source: t.List[str] = [],
):
    ref_cogsets: t.MutableMapping[
        types.Cognateset_ID, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = t.DefaultDict(list)
    cognate: t.List[types.Judgement] = []
    judgements_lookup: t.MutableMapping[
        types.Form_ID, t.MutableMapping[types.Cognateset_ID, types.Judgement]
    ] = t.DefaultDict(dict)
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in affected_forms:
            ref_cogsets[j["cognatesetReference"]].append(
                (j["formReference"], j["segmentSlice"], j["alignment"])
            )
            judgements_lookup[j["formReference"]][j["cognatesetReference"]] = j
        else:
            cognate.append(j)
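    # Align the cognate sets coming back from the editor with the existing
    # cognateset IDs; unmatched new sets map to None and get a fresh ID below.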
    matches = match_cognatesets(new_cogsets, ref_cogsets)

    for cognateset, judgements in new_cogsets.items():
        cognateset = matches[cognateset]
        if cognateset is None:
            cognateset = "_".join(f for f, _, _ in judgements)
        for form, segment_slice, alignment in judgements:
            was: t.Optional[types.Judgement] = judgements_lookup.get(
                form, {}).get(cognateset)
            if was:
                was["segmentSlice"] = util.indices_to_segment_slice(segment_slice)
                was["alignment"] = alignment
                cognate.append(was)
                continue
            cognate.append(
                types.Judgement(
                    {
                        "id": f"{form}-{cognateset}",
                        "formReference": form,
                        "cognatesetReference": cognateset,
                        "alignment": alignment,
                        "segmentSlice": util.indices_to_segment_slice(slice),
                        "source": source,
                        # TODO: Any more parameters? Status update?
                    }
                )
            )

    cognate.sort(key=lambda j: j["id"])
    m = {
        util.cldf_property(c.propertyUrl) or c.name: c.name
        for c in dataset["CognateTable"].tableSchema.columns
    }
    dataset["CognateTable"].write(
        [{m[k]: v for k, v in j.items() if k in m} for j in cognate]
    )