Example #1
    def __init__(
        self,
        option_strings,
        dest,
        nargs="+",
        default=types.WorldSet(),
        help=None,
        autohelp=True,
        metavar=None,
        **kwargs,
    ):
        if nargs != "+":
            if (
                len(option_strings) == 1
                and nargs == "*"
                and not option_strings[0].startswith("-")
            ):
                # Positional argument with nargs="*": it may legitimately
                # be absent, in which case the default applies.
                pass
            else:
                raise ValueError(
                    "Optional ListOrFromFile makes sense only with variable argument count ('+')"
                )

        if metavar is None:
            metavar = option_strings[0].upper()
            if option_strings[0].endswith("s"):
                metavar = metavar[:-1]
            if option_strings[0].startswith("--"):
                metavar = metavar[2:]

        if autohelp:
            help = (
                (help or "")
                + f" Instead of a list of individual {metavar}s on the command line, this argument accepts also the path to a single {metavar}S.CSV file (with header row), containing the relevant IDs in the first column."
            )
            if isinstance(default, types.WorldSet):
                help += f" (default: All {metavar.lower()}s in the dataset)"
            help = help.strip()

        super().__init__(
            option_strings,
            dest,
            nargs=nargs,
            default=default,
            help=help,
            metavar=metavar,
            **kwargs,
        )
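
A minimal usage sketch for this action, assuming this __init__ belongs to an argparse.Action subclass (called ListOrFromFile here, per the error message above) whose __call__ (not shown) stores the list or reads the CSV file; "--languages" is an invented option name:

import argparse

parser = argparse.ArgumentParser()
# The action derives metavar "LANGUAGE" from "--languages" (upper-cased,
# trailing "s" and leading "--" stripped) and appends the CSV hint to the
# generated help text.
parser.add_argument("--languages", action=ListOrFromFile)
args = parser.parse_args(["--languages", "l1", "l2"])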
Example #2
def apply_heuristics(
    dataset: types.Wordlist,
    heuristic: t.Optional[AbsenceHeuristic] = None,
    primary_concepts: t.Union[
        types.WorldSet[types.Parameter_ID],
        t.AbstractSet[types.Parameter_ID]] = types.WorldSet(),
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Cognateset_ID, t.Set[types.Parameter_ID]]:
    """Compute the relevant concepts for cognatesets, depending on the heuristic.

    These concepts will be considered when deciding whether a root is deemed
    absent in a language.

    For the CentralConcept heuristic, the relevant concepts of a cognateset
    are its central concepts, as given by the #parameterReference column of
    the CognatesetTable. A central concept not included in the
    primary_concepts is ignored with a warning.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concept",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference"))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concept": "concept1"}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {'cognateset1': {'concept1'}}
    True

    This extends to the case where a cognateset may have more than one central concept.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concepts",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
    ...         separator=","))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concepts": ["concept1", "concept2"]}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {
    ...     'cognateset1': {'concept1', 'concept2'}}
    True

    For the HalfPrimaryConcepts heuristic, the relevant concepts are all
    primary concepts connected to a cognateset.

    >>> ds = util.fs.new_wordlist(
    ...     FormTable=[
    ...         {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1", "Form": "x"},
    ...         {"ID": "f2", "Parameter_ID": "c2", "Language_ID": "l1", "Form": "x"}],
    ...     CognateTable=[
    ...         {"ID": "1", "Form_ID": "f1", "Cognateset_ID": "s1"},
    ...         {"ID": "2", "Form_ID": "f2", "Cognateset_ID": "s1"}])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.HALFPRIMARYCONCEPTS) == {
    ...     's1': {'c1', 'c2'}}
    True


    NOTE: This function cannot guarantee that every cognateset has at least
    one relevant concept; there may be cognatesets without any! A cognateset
    with 0 relevant concepts will always be included, because 0 is at least
    half of 0.

    """
    if heuristic is None:
        heuristic = (
            AbsenceHeuristic.CENTRALCONCEPT
            if ("CognatesetTable", "parameterReference") in dataset
            else AbsenceHeuristic.HALFPRIMARYCONCEPTS)

    relevant_concepts: t.MutableMapping[
        types.Cognateset_ID, t.Set[types.Parameter_ID]] = t.DefaultDict(set)

    if heuristic is AbsenceHeuristic.HALFPRIMARYCONCEPTS:
        c_f = dataset["CognateTable", "formReference"].name
        c_s = dataset["CognateTable", "cognatesetReference"].name
        concepts = util.cache_table(
            dataset,
            "FormTable",
            {"concepts": dataset["FormTable", "parameterReference"].name},
        )
        for j in dataset["CognateTable"]:
            form = concepts[j[c_f]]
            for concept in util.ensure_list(form["concepts"]):
                relevant_concepts[j[c_s]].add(concept)

    elif heuristic is AbsenceHeuristic.CENTRALCONCEPT:
        c_cognateset_concept = dataset["CognatesetTable",
                                       "parameterReference"].name
        c_id = dataset["CognatesetTable", "id"].name
        for c in dataset["CognatesetTable"]:
            for concept in util.ensure_list(c[c_cognateset_concept]):
                if concept not in primary_concepts:
                    logger.warning(
                        f"The central concept {concept} of cognateset {c[c_id]} was not part of your list of primary concepts to be included in the coding, so the cognateset will be ignored."
                    )
                else:
                    relevant_concepts[c[c_id]].add(concept)

    else:
        raise TypeError(
            f"Value of heuristic, {heuristic}, did not correspond to a known AbsenceHeuristic."
        )

    return relevant_concepts
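
When heuristic is None, the function infers it from the dataset's shape, as in the fall-back at the top of the body. A minimal sketch of that dispatch, assuming ds is a wordlist like the ones built in the doctests above:

# With no explicit heuristic, apply_heuristics() picks CENTRALCONCEPT
# only when the CognatesetTable declares a #parameterReference column,
# and HALFPRIMARYCONCEPTS otherwise.
if ("CognatesetTable", "parameterReference") in ds:
    expected = AbsenceHeuristic.CENTRALCONCEPT
else:
    expected = AbsenceHeuristic.HALFPRIMARYCONCEPTS
assert apply_heuristics(ds) == apply_heuristics(ds, heuristic=expected)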
Example #3
def root_meaning_code(
    dataset: t.Mapping[types.Language_ID,
                       t.Mapping[types.Parameter_ID,
                                 t.Set[types.Cognateset_ID]]],
    core_concepts: t.Set[types.Parameter_ID] = types.WorldSet(),
    ascertainment: t.Sequence[Literal["0", "1", "?"]] = ["0"],
) -> t.Tuple[t.Mapping[types.Language_ID, t.List[Literal["0", "1", "?"]]],
             t.Mapping[types.Parameter_ID, t.Mapping[types.Cognateset_ID,
                                                     int]], ]:
    """Create a root-meaning coding from cognate codes in a dataset

    Take the cognate code information from a wordlist, i.e. a mapping of the
    form {Language ID: {Concept ID: {Cognateset ID}}}, and generate a binary
    alignment from it that lists for every meaning which roots are used to
    represent that meaning in each language.

    Return the alignment, and for each meaning the mapping from its roots to
    their column indices in the alignment.

    The default ascertainment is a single absence ('0'): The configuration
    where a form is absent from all languages is never observed, but always
    possible, so we add this entry for the purposes of ascertainment
    correction.

    Examples
    ========

    >>> alignment, concepts = root_meaning_code({"Language": {"Meaning": {"Cognateset 1"}}})
    >>> alignment
    {'Language': ['0', '1']}


    >>> alignment, concepts = root_meaning_code(
    ...   {"l1": {"m1": {"c1"}},
    ...    "l2": {"m1": {"c2"}, "m2": {"c1", "c3"}}})
    >>> sorted(concepts)
    ['m1', 'm2']
    >>> sorted(concepts["m1"])
    ['c1', 'c2']
    >>> {language: sequence[concepts["m1"]["c1"]] for language, sequence in alignment.items()}
    {'l1': '1', 'l2': '0'}
    >>> {language: sequence[concepts["m2"]["c3"]] for language, sequence in alignment.items()}
    {'l1': '?', 'l2': '1'}
    >>> list(zip(*sorted(zip(*alignment.values()))))
    [('0', '0', '1', '?', '?'), ('0', '1', '0', '1', '1')]

    """
    roots: t.Dict[types.Parameter_ID, t.Set[types.Cognateset_ID]] = {}
    for language, lexicon in dataset.items():
        for concept, cognatesets in lexicon.items():
            if core_concepts is None or concept in core_concepts:
                roots.setdefault(concept, set()).update(cognatesets)

    blocks = {}
    sorted_roots: t.Dict[types.Parameter_ID, t.List[types.Cognateset_ID]] = {}
    c = len(ascertainment)
    for concept in sorted(roots):
        possible_roots = sorted(roots[concept])
        sorted_roots[concept] = possible_roots
        blocks[concept] = {root: r for r, root in enumerate(possible_roots, c)}
        c += len(possible_roots)

    alignment: t.Dict[types.Language_ID, t.List[Literal["0", "1", "?"]]] = {}
    for language, lexicon in dataset.items():
        alignment[language] = list(ascertainment)
        for concept, possible_roots in sorted_roots.items():
            entries = lexicon.get(concept)
            if entries is None:
                alignment[language].extend(["?" for _ in possible_roots])
            else:
                concept_sequence: t.List[Literal["0", "1", "?"]] = [
                    "1" if k in entries else "0" for k in possible_roots
                ]
                alignment[language].extend(concept_sequence)
    return alignment, blocks
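
The column layout is the ascertainment prefix followed by the sorted roots of each sorted meaning, and blocks records each root's column index. A sketch reusing the two-language doctest data above:

alignment, blocks = root_meaning_code(
    {"l1": {"m1": {"c1"}},
     "l2": {"m1": {"c2"}, "m2": {"c1", "c3"}}})
# Column 0 is the ascertainment '0'; columns 1-2 hold m1's roots c1, c2,
# and columns 3-4 hold m2's roots c1, c3.
assert blocks == {"m1": {"c1": 1, "c2": 2}, "m2": {"c1": 3, "c3": 4}}
assert alignment["l1"] == ["0", "1", "0", "?", "?"]
assert alignment["l2"] == ["0", "0", "1", "1", "1"]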
Example #4
def coverage_report(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    min_percentage: float = 0.0,
    with_concept: t.Iterable[types.Parameter_ID] = set(),
    missing: Missing = Missing.KNOWN,
    only_coded: bool = True,
) -> t.List[t.List[str]]:
    """Compute coverage statistics for each language in the dataset.

    For every language, count the forms attested for each concept (counting
    only cognate-coded forms if only_coded is True, and skipping missing
    forms according to the missing policy). Return one row per language:
    [language ID, language name, number of primary concepts attested,
    fraction of concepts covered, mean number of synonyms per attested
    concept]. Languages covering less than min_percentage percent of the
    concepts, or missing any concept listed in with_concept, are skipped.

    """
    coded: t.Container[types.Form_ID]
    if only_coded:
        try:
            c_j_form = dataset["CognateTable", "formReference"].name
        except KeyError:
            cli.Exit.NO_COGNATETABLE(
                message="You requested that I only count cognate coded forms, "
                "but you have no CognateTable containing judgements.")
        coded = {judgement[c_j_form] for judgement in dataset["CognateTable"]}
    else:
        coded = types.WorldSet()

    languages: t.Dict[types.Language_ID, str] = {}
    try:
        c_l_id = dataset["LanguageTable", "id"].name
        c_l_name = dataset["LanguageTable", "name"].name
        for language in dataset["LanguageTable"]:
            languages[language[c_l_id]] = language[c_l_name]
    except KeyError:
        pass

    concepts: t.DefaultDict[types.Language_ID,
                            t.Counter[types.Parameter_ID]] = t.DefaultDict(
                                t.Counter)
    c_f_id = dataset["FormTable", "id"].name
    c_concept = dataset["FormTable", "parameterReference"].name
    c_language = dataset["FormTable", "languageReference"].name
    c_form = dataset["FormTable", "form"].name
    for form in dataset["FormTable"]:
        languages.setdefault(form[c_language], form[c_language])
        if form[c_f_id] not in coded:
            continue
        if missing == Missing.IGNORE and (not form[c_form]
                                          or form[c_form] == "-"):
            continue
        if missing == Missing.KNOWN and not form[c_form]:
            continue
        c: types.Parameter_ID
        for c in util.ensure_list(form[c_concept]):
            concepts[form[c_language]][c] += 1

    # load primary concepts and number of concepts
    primary_concepts: t.Container[types.Parameter_ID]
    try:
        c_c_id = dataset["ParameterTable", "id"].name
        primary_concepts = [
            c[c_c_id] for c in dataset["ParameterTable"] if c["Primary"]
        ]
        total_number_concepts = len(primary_concepts)
    except KeyError:
        cli.logger.warning(
            "ParameterTable doesn't contain a column 'Primary'. Primary concepts couldn't be loaded. "
            "Loading all concepts.")
        primary_concepts = types.WorldSet()

        try:
            total_number_concepts = len(list(dataset["ParameterTable"]))
        except KeyError:
            total_number_concepts = len(
                set.union(*(set(cs) for cs in concepts.values())))

    data_languages = []
    for language, name in languages.items():
        conceptlist = concepts[language]
        try:
            synonyms = sum(conceptlist.values()) / len(conceptlist)
        except ZeroDivisionError:
            synonyms = float("nan")

        # percentage of all concepts covered by this language
        conceptlist_percentage = len(conceptlist) / total_number_concepts
        if conceptlist_percentage * 100 < min_percentage:
            continue

        if not all(c in conceptlist for c in with_concept):
            continue

        # count primary concepts
        primary_count = 0
        for c in conceptlist:
            if c in primary_concepts:
                primary_count += 1
        data_languages.append([
            language,
            name,
            primary_count,
            conceptlist_percentage,
            synonyms,
        ])
    return data_languages
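
A hedged usage sketch; ds stands in for a wordlist like the ones built by util.fs.new_wordlist in the doctests above:

# Count every form, coded or not, and keep all languages.
rows = coverage_report(ds, min_percentage=0.0, only_coded=False)
for language_id, name, primary, coverage, synonyms in rows:
    # coverage is a fraction; synonyms is the mean number of forms per
    # attested concept (NaN for languages without any forms).
    print(f"{name}: {coverage:.0%} covered, {synonyms:.2f} synonyms")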
Example #5
def test_segment_to_cognateset_no_slices(caplog):
    ds = new_wordlist(
        FormTable=[
            {
                "ID": "f1",
                "Parameter_ID": "c1",
                "Language_ID": "l1",
                "Form": "f",
                "Segments": ["f"],
            },
            {
                "ID": "f2",
                "Parameter_ID": "c1",
                "Language_ID": "l1",
                "Form": "f",
                "Segments": ["f"],
            },
            {
                "ID": "f3",
                "Parameter_ID": "c1",
                "Language_ID": "l1",
                "Form": "f",
                "Segments": ["f", "i"],
            },
            {
                "ID": "f4",
                "Parameter_ID": "c1",
                "Language_ID": "l1",
                "Form": "f",
                "Segments": ["f", "i"],
            },
        ],
        CognateTable=[],
    )
    ds.remove_columns("CognateTable", "Segment_Slice", "Alignment")
    ds.write(CognateTable=[
        {
            "ID": "j1",
            "Form_ID": "f1",
            "Cognateset_ID": "s1"
        },
        {
            "ID": "j2",
            "Form_ID": "f3",
            "Cognateset_ID": "s1"
        },
        {
            "ID": "j3",
            "Form_ID": "f4",
            "Cognateset_ID": "s1",
        },
        {
            "ID": "j4",
            "Form_ID": "f4",
            "Cognateset_ID": "s2"
        },
    ], )
    with caplog.at_level(logging.WARNING):
        segments = segment_to_cognateset(ds, types.WorldSet())
    assert segments == {
        "f1": [{"s1"}],
        "f2": [set()],
        "f3": [{"s1"}, {"s1"}],
        "f4": [{"s1", "s2"}, {"s1", "s2"}],
    }
Example #6
def test_segment_to_cognateset(caplog):
    ds = new_wordlist(
        FormTable=[
            {
                "ID": "f1",
                "Parameter_ID": "c1",
                "Language_ID": "l1",
                "Form": "f",
                "Segments": ["f"],
            },
            {
                "ID": "f2",
                "Parameter_ID": "c1",
                "Language_ID": "l1",
                "Form": "f",
                "Segments": ["f"],
            },
            {
                "ID": "f3",
                "Parameter_ID": "c1",
                "Language_ID": "l1",
                "Form": "f",
                "Segments": ["f", "i"],
            },
            {
                "ID": "f4",
                "Parameter_ID": "c1",
                "Language_ID": "l1",
                "Form": "f",
                "Segments": ["t", "e", "s", "t"],
            },
        ],
        CognateTable=[
            {
                "ID": "j1",
                "Form_ID": "f1",
                "Cognateset_ID": "s1",
                "Segment_Slice": "1"
            },
            {
                "ID": "j2",
                "Form_ID": "f3",
                "Cognateset_ID": "s1",
                "Segment_Slice": "2"
            },
            {
                "ID": "j3",
                "Form_ID": "f4",
                "Cognateset_ID": "s1",
                "Segment_Slice": ["2:3"],
            },
            {
                "ID": "j4",
                "Form_ID": "f4",
                "Cognateset_ID": "s2",
                "Segment_Slice": "2"
            },
        ],
    )
    with caplog.at_level(logging.WARNING):
        segments = segment_to_cognateset(ds, types.WorldSet())
    assert segments == {
        "f1": [{"s1"}],
        "f2": [set()],
        "f3": [set(), {"s1"}],
        "f4": [set(), {"s1", "s2"}, {"s1"}, set()],
    }
Example #7
def create_singletons(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    status: t.Optional[str] = None,
    by_segment: bool = False,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[t.Sequence[types.CogSet], t.Sequence[types.Judgement]]:
    """Create singleton cognate judgements for forms that don't have cognate judgements.

    Depending on by_segment, singletons are created for every range of segments
    that is not in any cognate set yet (True) or just for every form where no
    segment is in any cognate sets (False).

    """
    forms = util.cache_table(dataset)
    c_j_id = dataset["CognateTable", "id"].name
    c_j_cogset = dataset["CognateTable", "cognatesetReference"].name
    c_j_form = dataset["CognateTable", "formReference"].name
    try:
        c_j_segmentslice = dataset["CognateTable", "segmentSlice"].name
    except KeyError:
        c_j_segmentslice = None
    try:
        c_j_alignment = dataset["CognateTable", "alignment"].name
    except KeyError:
        c_j_alignment = None

    if not dataset.get(("CognatesetTable", "Status_Column")):
        logger.warning(
            "No Status_Column in CognatesetTable. I will proceed without. Run `lexedata.edit.add_status_column`` in default mode or with table-names CognatesetTable to add a Status_Column."
        )

    try:
        c_s_id = dataset["CognatesetTable", "id"].name
        all_cognatesets = {s[c_s_id]: s for s in dataset["CognatesetTable"]}
    except KeyError:
        c_s_id = "id"
        c_s_name = "name"
        all_cognatesets = {
            id: types.CogSet({"id": id, "name": id})
            for id in {j[c_j_cogset] for j in dataset["CognateTable"]}
        }
    try:
        c_s_name = dataset["CognatesetTable", "name"].name
    except KeyError:
        c_s_name = c_s_id

    all_judgements = list(dataset["CognateTable"])
    if by_segment:
        judgements = segment_to_cognateset(dataset, types.WorldSet(), logger)
        forms_and_segments = uncoded_segments(judgements, logger)
    else:
        forms_and_segments = uncoded_forms(
            forms.values(), {j[c_j_form]
                             for j in all_judgements})
    for form, segment_slice in forms_and_segments:
        i = 1
        singleton_id = f"X_{form}_{i:d}"
        while singleton_id in all_cognatesets:
            i += 1
            singleton_id = f"X_{form}_{i:d}"
        all_cognatesets[singleton_id] = types.CogSet({})
        properties = {
            c_s_name: util.ensure_list(forms[form]["parameterReference"])[0],
            c_s_id: singleton_id,
            "Status_Column": status,
        }
        try:
            for column in dataset["CognatesetTable"].tableSchema.columns:
                all_cognatesets[singleton_id][column.name] = properties.get(
                    column.name)
        except KeyError:
            pass
        judgement = types.Judgement({})
        properties = {
            c_j_id: singleton_id,
            c_j_cogset: singleton_id,
            c_j_form: form,
            c_j_segmentslice: indices_to_segment_slice(segment_slice),
            c_j_alignment: [forms[form]["segments"][i] for i in segment_slice],
            "Status_Column": status,
        }
        for column in dataset["CognateTable"].tableSchema.columns:
            judgement[column.name] = properties.get(column.name)
        all_judgements.append(judgement)
    return list(all_cognatesets.values()), all_judgements
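
A hedged sketch of writing the result back; ds is assumed to be a wordlist with a CognateTable (and optionally a CognatesetTable), and the returned rows already use the dataset's own column names:

cognatesets, judgements = create_singletons(
    ds, status="automatic singleton", by_segment=False)
ds.write(CognatesetTable=list(cognatesets), CognateTable=list(judgements))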
Example #8
def segment_to_cognateset(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    cognatesets: t.Container[types.Cognateset_ID],
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Form_ID, t.List[t.Set[types.Cognateset_ID]]]:
    # required fields
    c_cognate_cognateset = dataset.column_names.cognates.cognatesetReference
    c_cognate_id = dataset.column_names.cognates.id
    c_cognate_form = dataset.column_names.cognates.formReference
    c_cognate_slice = dataset.column_names.cognates.segmentSlice

    forms = util.cache_table(dataset)
    cognateset_cache: t.Container[types.Cognateset_ID]
    if "CognatesetTable" in dataset:
        c_s_id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[c_s_id]
            for cognateset in dataset["CognatesetTable"]
            if cognatesets is None or cognateset["ID"] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = types.WorldSet()
        else:
            cognateset_cache = cognatesets

    which_segment_belongs_to_which_cognateset: t.Mapping[
        types.Form_ID, t.List[t.Set[types.Cognateset_ID]]] = {
            f: [set() for _ in form["segments"]]
            for f, form in forms.items() if form["form"]
            and form["form"].strip() and form["form"].strip() != "-"
        }
    for j in dataset["CognateTable"]:
        if j[c_cognate_form] in forms and j[
                c_cognate_cognateset] in cognateset_cache:
            form = forms[j[c_cognate_form]]
            if j[c_cognate_form] not in which_segment_belongs_to_which_cognateset:
                continue
            if j.get(c_cognate_slice):
                try:
                    segments_judged = list(
                        parse_segment_slices(j[c_cognate_slice]))
                except ValueError:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment slice {','.join(j[c_cognate_slice])} has start after end."
                    )
                    continue
            else:
                segments_judged = list(range(len(form["segments"])))
            old_s = None
            for s in segments_judged:
                if old_s is not None and old_s + 1 != s:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment {s + 1} follows segment {old_s + 1}, so the morpheme is non-contiguous"
                    )
                old_s = s
                try:
                    segment_cognatesets = which_segment_belongs_to_which_cognateset[
                        j[c_cognate_form]][s]
                except IndexError:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment slice {','.join(j[c_cognate_slice])} points outside valid range 1:{len(form['segments'])}."
                    )
                    continue
                segment_cognatesets.add(j[c_cognate_cognateset])

    return which_segment_belongs_to_which_cognateset
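
Because the result keeps one set per segment, uncoded material falls out directly. A small hedged sketch on top of the tests above (ds as constructed there):

segments = segment_to_cognateset(ds, types.WorldSet())
# For each form, collect the indices of segments no judgement covers.
uncoded = {
    form: [i for i, sets in enumerate(per_segment) if not sets]
    for form, per_segment in segments.items()
}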