Example #1
def check_na_form_has_no_alternative(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    logger: cli.logging.Logger = cli.logger,
):
    valid = True
    c_f_id = dataset["FormTable", "id"].name
    c_f_form = dataset["FormTable", "form"].name
    c_f_concept = dataset["FormTable", "parameterReference"].name
    c_f_language = dataset["FormTable", "languageReference"].name
    forms_by_concepts: t.Dict[types.Parameter_ID,
                              t.Set[types.Form_ID]] = t.DefaultDict(set)

    for f in dataset["FormTable"]:
        for c in util.ensure_list(f[c_f_concept]):
            forms_by_concepts[c].add(f[c_f_id])
    forms_to_languages = t.DefaultDict(set)
    for f in dataset["FormTable"]:
        forms_to_languages[f[c_f_language]].add(f[c_f_id])
    na_forms = [f for f in dataset["FormTable"] if f[c_f_form] == "-"]
    for form in na_forms:
        for c in util.ensure_list(form[c_f_concept]):
            if forms_by_concepts[c].intersection(
                    forms_to_languages[form[c_f_language]]) != {form[c_f_id]}:
                log_or_raise(
                    message=f"Non-empty forms exist for the NA form {form[c_f_id]} with identical parameter and language reference",
                    log=logger,
                )
                valid = False
    return valid
Example #2
def multistate_code(
    dataset: t.Mapping[types.Language_ID,
                       t.Mapping[types.Parameter_ID,
                                 t.Set[types.Cognateset_ID]]],
) -> t.Tuple[t.Mapping[types.Language_ID, t.Sequence[t.Set[int]]],
             t.Sequence[int]]:
    """Create a multistate root-meaning coding from cognate codes in a dataset

    Take the cognate code information from a wordlist, i.e. a mapping of the
    form {Language ID: {Concept ID: {Cognateset ID}}}, and generate a multistate
    alignment from it that lists for every meaning which roots are used to
    represent that meaning in each language.

    Also return the number of roots for each concept.

    Examples
    ========

    >>> alignment, lengths = multistate_code({"Language": {"Meaning": {"Cognateset 1"}}})
    >>> alignment == {'Language': [{0}]}
    True
    >>> lengths == [1]
    True


    >>> alignment, statecounts = multistate_code(
    ...     {"l1": {"m1": {"c1"}},
    ...      "l2": {"m1": {"c2"}, "m2": {"c1", "c3"}}})
    >>> alignment["l1"][1]
    set()
    >>> alignment["l2"][1] == {0, 1}
    True
    >>> statecounts
    [2, 2]

    """
    roots: t.Dict[types.Parameter_ID,
                  t.Set[types.Cognateset_ID]] = t.DefaultDict(set)
    for language, lexicon in dataset.items():
        for concept, cognatesets in lexicon.items():
            roots[concept].update(cognatesets)
    sorted_roots: t.Mapping[
        types.Parameter_ID, t.Sequence[types.Cognateset_ID]] = {
            concept: sorted(cognatesets)
            for concept, cognatesets in sorted(roots.items())
        }

    states: t.List[int] = [
        len(cognatesets) for cognatesets in sorted_roots.values()
    ]

    alignment: t.MutableMapping[types.Language_ID,
                                t.List[t.Set[int]]] = t.DefaultDict(list)
    for language, lexicon in dataset.items():
        for concept, possible_roots in sorted_roots.items():
            entries = lexicon.get(concept)
            alignment[language].append(set())
            if entries:
                for entry in entries:
                    state = possible_roots.index(entry)
                    alignment[language][-1].add(state)
    return alignment, states
Example #3
def list_homophones(dataset: pycldf.Dataset,
                    out: io.TextIOBase,
                    logger: cli.logging.Logger = cli.logger) -> None:
    clics = load_clics()
    # warn if clics cannot be loaded
    if not clics:
        logger.warning(
            "Clics could not be loaded. Using an empty graph instead")
        clics = nx.Graph()

    c_id = dataset["ParameterTable", "id"].name
    try:
        c_concepticon = dataset["ParameterTable", "concepticonReference"].name
    except KeyError:
        cli.Exit.INVALID_DATASET(
            "This script requires a column concepticonReference in ParamterTable. "
            "Please run add_concepticon.py")
    concepticon = {}
    for concept in dataset["ParameterTable"]:
        concepticon[concept[c_id]] = concept[c_concepticon]

    f_id = dataset["FormTable", "id"].name
    f_lang = dataset["FormTable", "languageReference"].name
    f_concept = dataset["FormTable", "parameterReference"].name
    f_form = dataset["FormTable", "form"].name

    homophones: t.DefaultDict[str, t.DefaultDict[str, t.Set[t.Tuple[
        str, str]]]] = t.DefaultDict(lambda: t.DefaultDict(set))

    for form in dataset["FormTable"]:
        if form[f_form] == "-" or form[f_form] is None:
            continue
        if isinstance(form[f_concept], list):
            homophones[form[f_lang]][form[f_form]].add(
                tuple(form[f_concept]) + (form[f_id], ))
        else:
            homophones[form[f_lang]][form[f_form]].add(
                (form[f_concept], form[f_id]))
    for lang, forms in homophones.items():
        for form, meanings in forms.items():
            if len(meanings) == 1:
                continue
            clics_nodes = {concepticon.get(concept) for concept, _ in meanings}
            if None in clics_nodes:
                x = " (but at least one concept not found):"
            else:
                x = ":"
            clics_nodes -= {None}
            if len(clics_nodes) <= 1:
                x = "Unknown" + x
            elif nx.is_connected(clics.subgraph(clics_nodes)):
                x = "Connected" + x
            else:
                x = "Unconnected" + x
            line = f"{lang}, '{form}': {x}\n"
            for ele in sorted(meanings):
                line += f"\t {ele[-1]} ({', '.join(ele[0:-1])})\n"
            out.write(line)
Example #4
def collect_forms_by_row(
    self,
    judgements: t.Iterable[types.Judgement],
    rows: t.Iterable[types.Row_ID],
) -> t.Mapping[types.Cognateset_ID, t.Mapping[
        types.Form_ID, t.Sequence[types.Judgement]]]:
    "Collect forms by row object (i.e. concept or cognate set)"
    all_forms: t.MutableMapping[types.Cognateset_ID, t.Mapping[
        types.Form_ID, t.List[types.Judgement]]] = t.DefaultDict(
            lambda: t.DefaultDict(list))
    for judgement in judgements:
        all_forms[judgement["cognatesetReference"]][
            judgement["formReference"]].append(judgement)
    return all_forms
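
A minimal, self-contained sketch of the same grouping pattern, with hypothetical judgement dicts standing in for types.Judgement; at runtime, collections.defaultdict does the same job as the t.DefaultDict calls above:

import collections
import typing as t

# Hypothetical judgements: each one links a form to a cognate set.
judgements = [
    {"cognatesetReference": "s1", "formReference": "f1"},
    {"cognatesetReference": "s1", "formReference": "f1"},
    {"cognatesetReference": "s1", "formReference": "f2"},
]
# Group by cognate set first, then by form, collecting judgements in lists.
by_row: t.DefaultDict[str, t.DefaultDict[str, t.List[dict]]] = (
    collections.defaultdict(lambda: collections.defaultdict(list)))
for j in judgements:
    by_row[j["cognatesetReference"]][j["formReference"]].append(j)
assert len(by_row["s1"]["f1"]) == 2 and len(by_row["s1"]["f2"]) == 1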
Example #5
def read_structure_dataset(
    dataset: pycldf.StructureDataset,
    logger: cli.logging.Logger = cli.logger
) -> t.MutableMapping[types.Language_ID, t.MutableMapping[types.Parameter_ID,
                                                          t.Set]]:
    col_map = dataset.column_names
    data: t.MutableMapping[types.Language_ID, t.MutableMapping[
        types.Parameter_ID, t.Set]] = t.DefaultDict(lambda: t.DefaultDict(set))
    code_column = col_map.values.codeReference or col_map.values.value
    for row in dataset["ValueTable"]:
        lang_id = row[col_map.values.languageReference]
        feature_id = row[col_map.values.parameterReference]
        if row[code_column]:
            data[lang_id][feature_id].add(row[code_column])
    return data
Example #6
def test_no_defaultdict_instantiation(self):
    with self.assertRaises(TypeError):
        typing.DefaultDict()
    with self.assertRaises(TypeError):
        typing.DefaultDict[KT, VT]()
    with self.assertRaises(TypeError):
        typing.DefaultDict[str, int]()
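
This test asserts that, in the Python version under test, the typing.DefaultDict alias cannot be instantiated directly. The portable pattern is to annotate with the typing alias and construct with collections.defaultdict; a minimal sketch:

import collections
import typing as t

# Annotate with the typing alias, construct with the collections class.
counts: t.DefaultDict[str, int] = collections.defaultdict(int)
counts["a"] += 1  # missing keys start at int() == 0
assert counts["a"] == 1 and counts["b"] == 0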
Example #7
def list_homophones(dataset: pycldf.Dataset) -> None:
    clics = load_clics()
    # warn if clics cannot be loaded
    if not clics:
        logger.warning(
            "Clics could not be loaded. Using an empty graph instead")
        clics = nx.Graph()

    c_id = dataset["ParameterTable", "id"].name
    c_concepticon = dataset["ParameterTable", "concepticonReference"].name
    concepticon = {}
    for concept in dataset["ParameterTable"]:
        concepticon[concept[c_id]] = concept[c_concepticon]

    f_id = dataset["FormTable", "id"].name
    f_lang = dataset["FormTable", "languageReference"].name
    f_concept = dataset["FormTable", "parameterReference"].name
    f_form = dataset["FormTable", "form"].name

    homophones: t.DefaultDict[str, t.DefaultDict[str, t.Set[t.Tuple[
        str, str]]]] = t.DefaultDict(lambda: t.DefaultDict(set))

    for form in dataset["FormTable"]:
        homophones[form[f_lang]][form[f_form]].add(
            (form[f_concept], form[f_id]))

    for lang, forms in homophones.items():
        for form, meanings in forms.items():
            if len(meanings) == 1:
                continue
            clics_nodes = [
                concepticon.get(concept) for concept, form_id in meanings
            ]
            if None in clics_nodes:
                clics_nodes = [c for c in clics_nodes if c]
                x = "(but at least one concept not found)"
            else:
                x = ""
            if len(clics_nodes) <= 1:
                print("Unknown:", lang, form, meanings)
            elif nx.is_connected(clics.subgraph(clics_nodes)):
                print("Connected:", x, lang, form, meanings)
            else:
                print("Unconnected:", x, lang, form, meanings)
Example #8
def coverage_report_concepts(dataset: pycldf.Dataset):
    # TODO: This assumes the existence of a ParameterTable. The script should
    # still work if none exists. TODO: In addition, we decided to not formalize
    # primary concepts, so this should instead depend on a command line
    # argument, either supplementing or replacing --with-concepts.
    c_c_id = dataset["ParameterTable", "id"].name
    try:
        # Load primary concepts if possible.
        primary_concepts = [
            c[c_c_id] for c in dataset["ParameterTable"] if c["Primary"]
        ]
    except KeyError:
        cli.logger.warning(
            "ParamterTable doesn't contain a column 'Primary'. Primary concepts couldn't be loaded. "
            "Loading all concepts.")
        primary_concepts = [c[c_c_id] for c in dataset["ParameterTable"]]
    # get the foreign keys pointing to the required tables
    foreign_key_parameter = ""
    for foreign_key in dataset["FormTable"].tableSchema.foreignKeys:
        if foreign_key.reference.resource == dataset["ParameterTable"].url:
            foreign_key_parameter = foreign_key.columnReference[0]

    foreign_key_language = ""
    for foreign_key in dataset["FormTable"].tableSchema.foreignKeys:
        if foreign_key.reference.resource == dataset["LanguageTable"].url:
            foreign_key_language = foreign_key.columnReference[0]

    multiple_concepts = bool(dataset["FormTable",
                                     "parameterReference"].separator)
    c_concept = foreign_key_parameter
    c_language = foreign_key_language
    # for each concept count the languages
    concepts_to_languages: t.DefaultDict[str,
                                         t.List[str]] = t.DefaultDict(list)
    for form in dataset["FormTable"]:
        if multiple_concepts:
            language = form[c_language]
            for concept in form[c_concept]:
                if (language not in concepts_to_languages[concept]
                        and concept in primary_concepts):
                    concepts_to_languages[concept].append(language)
        else:
            concept = form[c_concept]
            language = form[c_language]
            if (language not in concepts_to_languages[concept]
                    and concept in primary_concepts):
                concepts_to_languages[concept].append(language)

    data_concepts = []
    for k, v in concepts_to_languages.items():
        data_concepts.append([k, len(set(v))])

    return data_concepts
Example #9
def check_no_separator_in_ids(dataset: pycldf.Dataset,
                              logger: cli.logging.Logger = cli.logger) -> bool:
    valid = True
    # Check that reference columns that have a separator don't contain the separator inside a string value
    forbidden_separators: t.MutableMapping[str, t.MutableMapping[
        str,
        t.MutableMapping[str, t.List[t.Tuple[str, str]]]]] = t.DefaultDict(
            lambda: t.DefaultDict(lambda: t.DefaultDict(list)))
    for table in dataset.tables:
        for foreign_key in table.tableSchema.foreignKeys:
            try:
                (referencing_column, ) = foreign_key.columnReference
                (referenced_column, ) = foreign_key.reference.columnReference
            except ValueError:
                # Multi-column foreign key. We *could* check that there's not a
                # reference column hidden in there, but we don't.
                continue

            if table.get_column(referencing_column).separator is None:
                continue

            forbidden_separators[str(foreign_key.reference.resource)][
                referenced_column][
                    table.get_column(referencing_column).separator].append(
                        (table.url.string, referencing_column))

    for table, targets in forbidden_separators.items():
        for r, row in enumerate(dataset[table], 1):
            for target_column, separators_forbidden_here in targets.items():
                for separator, forbidden_by in separators_forbidden_here.items():
                    if separator in row[target_column]:
                        log_or_raise(
                            f"In table {table}, row {r} column {target_column} contains {separator}, which is also the separator of {forbidden_by}.",
                            log=logger,
                        )
                        valid = False
    return valid
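
The three-level structure above is built from nested lambda factories, so each level materializes on first access; a minimal sketch of the same construction with made-up table and column names:

import collections

nested = collections.defaultdict(
    lambda: collections.defaultdict(lambda: collections.defaultdict(list)))
# First access creates all three levels on the fly.
nested["TableA"]["ID"][";"].append(("TableB", "A_ID"))
assert nested["TableA"]["ID"][";"] == [("TableB", "A_ID")]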
Example #10
def hex_ecoregions(
        ecoregions: numpy.ndarray,
        transform: rasterio.Affine) -> t.Dict[h3.H3Index, t.Counter[int]]:
    c: t.Dict[h3.H3Index, t.Counter[int]] = t.DefaultDict(t.Counter)
    for y, row in enumerate(ecoregions):
        (_, lat) = transform * (0, y)
        area = numpy.cos(lat * numpy.pi / 180) * SQUARE_OF_15_ARCSEC
        for x, eco in enumerate(row):
            (lon, lat) = transform * (x, y)
            index: h3.H3Index = h3.geo_to_h3(lat, lon, RESOLUTION)
            # eco is a numpy type that sqlalchemy does not understand as int
            c[index][int(eco)] += area
    return c
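
Note that c layers a DefaultDict of Counters, and the Counter values here are accumulated float areas rather than integer counts; Counter allows this because it is a plain dict subclass. A minimal sketch with made-up hexagon keys:

import collections
import typing as t

areas: t.DefaultDict[str, t.Counter[int]] = collections.defaultdict(
    collections.Counter)
# Counter values need not be integers; += accumulates float areas.
areas["hex-a"][7] += 0.25
areas["hex-a"][7] += 0.50
assert abs(areas["hex-a"][7] - 0.75) < 1e-9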
Example #11
def count_segments(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    languages: t.Container[types.Language_ID],
):
    c_f_language = dataset["FormTable", "languageReference"].name
    try:
        c_f_segments = dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            """Segment invertories report requires your dataset to have segments in the FormTable.
        Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
        )
    counter: t.MutableMapping[types.Language_ID,
                              t.Counter[str]] = t.DefaultDict(t.Counter)
    for form in cli.tq(
            dataset["FormTable"],
            total=dataset["FormTable"].common_props.get("dc:extent"),
            task="Reading all forms",
    ):
        if form[c_f_language] in languages:
            counter[form[c_f_language]].update(form[c_f_segments])
    return counter
Example #12
def connected_concepts(
    dataset: pycldf.Wordlist,
) -> t.Mapping[CognatesetID, t.Counter[ConceptID]]:
    """For each cognate set it the dataset, check which concepts it is connected to.

    >>>
    """
    concepts_by_form = load_concepts_by_form(dataset)
    cognatesets_to_concepts: t.DefaultDict[
        CognatesetID, t.List[ConceptID]] = t.DefaultDict(list)

    # Check whether cognate judgements live in the FormTable …
    c_cognateset = dataset.column_names.forms.cognatesetReference
    c_form = dataset.column_names.forms.id
    table = dataset["FormTable"]
    # … or in a separate CognateTable
    if c_cognateset is None:
        c_cognateset = dataset.column_names.cognates.cognatesetReference
        c_form = dataset.column_names.cognates.formReference
        table = dataset["CognateTable"]

    if c_cognateset is None:
        raise ValueError(
            f"Dataset {dataset:} had no cognatesetReference column in a CognateTable"
            " or a FormTable and is thus not compatible with this script.")

    for judgement in cli.tq(
            table,
            task="Link cognatesets to concepts",
            total=table.common_props.get("dc:extent"),
    ):
        cognatesets_to_concepts[judgement[c_cognateset]].extend(
            concepts_by_form[judgement[c_form]])
    return {
        cogset: collections.Counter(concepts)
        for cogset, concepts in cognatesets_to_concepts.items()
    }
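
The concept lists are only converted to Counters in the final dict comprehension; a minimal sketch of that consolidation step with made-up IDs:

import collections

cognatesets_to_concepts = {"s1": ["m1", "m1", "m2"]}
counters = {
    cogset: collections.Counter(concepts)
    for cogset, concepts in cognatesets_to_concepts.items()
}
assert counters["s1"]["m1"] == 2 and counters["s1"]["m2"] == 1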
Example #13
def edictor_to_cldf(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    new_cogsets: t.Mapping[
        types.Cognateset_ID, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ],
    affected_forms: t.Set[types.Form_ID],
    source: t.List[str] = [],
):
    ref_cogsets: t.MutableMapping[
        types.Cognateset_ID, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = t.DefaultDict(list)
    cognate: t.List[types.Judgement] = []
    judgements_lookup: t.MutableMapping[
        types.Form_ID, t.MutableMapping[types.Cognateset_ID, types.Judgement]
    ] = t.DefaultDict(dict)
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in affected_forms:
            ref_cogsets[j["cognatesetReference"]].append(
                (j["formReference"], j["segmentSlice"], j["alignment"])
            )
            judgements_lookup[j["formReference"]][j["cognatesetReference"]] = j
        else:
            cognate.append(j)
    matches = match_cognatesets(new_cogsets, ref_cogsets)

    for cognateset, judgements in new_cogsets.items():
        cognateset = matches[cognateset]
        if cognateset is None:
            cognateset = "_".join(f for f, _, _ in judgements)
        for form, segment_slice, alignment in judgements:
            was: types.Judgement = judgements_lookup.get(form, {}).get(cognateset)
            if was:
                was["segmentSlice"] = util.indices_to_segment_slice(segment_slice)
                was["alignment"] = alignment
                cognate.append(was)
                continue
            cognate.append(
                types.Judgement(
                    {
                        "id": f"{form}-{cognateset}",
                        "formReference": form,
                        "cognatesetReference": cognateset,
                        "alignment": alignment,
                        "segmentSlice": util.indices_to_segment_slice(segment_slice),
                        "source": source,
                        # TODO: Any more parameters? Status update?
                    }
                )
            )

    cognate.sort(key=lambda j: j["id"])
    m = {
        util.cldf_property(c.propertyUrl) or c.name: c.name
        for c in dataset["CognateTable"].tableSchema.columns
    }
    dataset["CognateTable"].write(
        [{m[k]: v for k, v in j.items() if k in m} for j in cognate]
    )
Example #14
def forms_to_tsv(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    languages: t.Iterable[str],
    concepts: t.Set[str],
    cognatesets: t.Iterable[str],
    logger: cli.logging.Logger = cli.logger,
):
    try:
        dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            """Edictor export requires your dataset to have segments in the FormTable.
        Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
        )

    delimiters = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset["FormTable"].tableSchema.columns if c.separator
    }

    # prepare the header for the tsv output
    # the first column must be named ID and contain 1-based integer IDs
    # set header for tsv
    tsv_header = list(dataset["FormTable"].tableSchema.columndict.keys())

    tsv_header.insert(0, "LINGPY_ID")
    tsv_header.append("cognatesetReference")
    if "alignment" not in tsv_header:
        tsv_header.append("alignment")
    if "parameterReference" in delimiters:
        tsv_header.append("_parameterReference")

    # select forms and cognates given restriction of languages and concepts, cognatesets respectively
    forms = {}
    for f, form in util.cache_table(dataset).items():
        if form["form"] is None or form["form"] == "-":
            continue
        if form["languageReference"] in languages and concepts.intersection(
                ensure_list(form["parameterReference"])):
            # Normalize the form:
            # 1. No list-valued entries
            for c, d in delimiters.items():
                if c == "segments":
                    continue
                if c == "parameterReference":
                    form["_parameterReference"] = d.join(
                        str(e) for e in form[c])
                    form["parameterReference"] = form["parameterReference"][0]
                    continue

                form[c] = d.join(str(e) for e in form[c])

            if not form.get("segments"):
                logger.warning(
                    "No segments found for form %s. You can generate segments using `lexedata.edit.add_segments`.",
                    form["id"],
                )

            # 2. No tabs, newlines in entries
            for c, v in form.items():
                if isinstance(v, str):
                    if "\\!t" in form[c] or "\\!n" in form[c]:
                        logger.warning(
                            "Your data contains the special characters '\\!t' or '\\!n', which I will introduce for escaping tabs and newlines for edictor. These characters will not survive the back-import."
                        )
                    form[c] = form[c].replace("\t",
                                              "\\!t").replace("\n", "\\!n")

            forms[f] = form

    cognateset_cache: t.Mapping[t.Optional[str], int]
    if "CognatesetTable" in dataset:
        id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[id]: c
            for c, cognateset in enumerate(dataset["CognatesetTable"], 1)
            if cognateset[id] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = t.DefaultDict(itertools.count().__next__)
        else:
            cognateset_cache = {c: i for i, c in enumerate(cognatesets, 1)}

    # Warn about unexpected non-concatenative ‘morphemes’
    lexedata.report.nonconcatenative_morphemes.segment_to_cognateset(
        dataset, cognatesets, logger)

    judgements_about_form: t.Mapping[types.Form_ID,
                                     t.Tuple[t.List[str], t.List[int]]] = {
                                         id:
                                         ([f"({s})"
                                           for s in form["segments"]], [])
                                         for id, form in forms.items()
                                     }
    # Compose all judgements, last-one-rules mode.
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in forms and cognateset_cache.get(
                j["cognatesetReference"]):
            if j.get("alignment"):
                j["alignment"] = [s or "" for s in j["alignment"]]
            else:
                j["alignment"] = forms[j["formReference"]]["segments"]

            try:
                segments_judged = list(
                    parse_segment_slices(segment_slices=j["segmentSlice"],
                                         enforce_ordered=False))
            except TypeError:
                logger.warning(
                    "In judgement %s: No segment slice given. Assuming whole form.",
                    j["id"],
                )
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except KeyError:
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except ValueError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue
            global_alignment, cogsets = judgements_about_form[
                j["formReference"]]
            segment_start, segment_end = min(
                segments_judged), max(segments_judged) + 1
            try:
                glue_in_alignment(
                    global_alignment,
                    cogsets,
                    j["alignment"],
                    j["cognatesetReference"],
                    slice(segment_start, segment_end),
                )
            except IndexError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue

    return forms, judgements_about_form, cognateset_cache
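
One detail worth noting: in the metadata-free branch above, cognateset_cache is a DefaultDict whose factory is itertools.count().__next__, so every previously unseen cognateset ID is assigned the next integer on first lookup. A minimal sketch of that pattern:

import collections
import itertools

ids = collections.defaultdict(itertools.count().__next__)
assert ids["x"] == 0
assert ids["y"] == 1
assert ids["x"] == 0  # repeated lookups keep the first assigned number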
Example #15
    args = parser.parse_args()

    if args.metadata_or_forms.name == "forms.csv":
        dataset = pycldf.Wordlist.from_data(args.metadata_or_forms)
    else:
        dataset = pycldf.Wordlist.from_metadata(args.metadata_or_forms)

    languages = {}
    try:
        c_l_id = dataset["LanguageTable", "id"].name
        for language in dataset["LanguageTable"]:
            languages[language[c_l_id]] = language
    except KeyError:
        pass

    concepts: t.DefaultDict[str, t.Counter[str]] = t.DefaultDict(t.Counter)
    multiple_concepts = bool(dataset["FormTable",
                                     "parameterReference"].separator)
    c_concept = dataset["FormTable", "parameterReference"].name
    c_language = dataset["FormTable", "languageReference"].name
    c_form = dataset["FormTable", "form"].name
    for form in dataset["FormTable"]:
        languages.setdefault(form[c_language], {})
        if form[c_form] == "?" and args.missing:
            continue
        if multiple_concepts:
            for c in form[c_concept]:
                concepts[form[c_language]][c] += 1
        else:
            concepts[form[c_language]][form[c_concept]] += 1
Example #16
    concepts = dataset["FormTable"].get_column(
        dataset.column_names.forms.parameterReference)
    multi = bool(concepts.separator)
    concepts_by_form: t.Dict[t.Hashable, t.List[t.Optional[t.Hashable]]] = {}
    for form in dataset['FormTable']:
        if multi:
            concepts_by_form[form[dataset.column_names.forms.id]] = [
                concept_to_concepticon.get(c) for c in form[concepts.name]
            ]
        else:
            concepts_by_form[form[dataset.column_names.forms.id]] = [
                concept_to_concepticon.get(form[concepts.name])
            ]

    concepts_by_cogset: t.DefaultDict[
        t.Hashable,
        t.Counter[t.Optional[t.Hashable]]] = t.DefaultDict(t.Counter)
    for row in table:
        cognateset = row[c_cognateset]
        form = row[c_form]
        concepts_by_cogset[cognateset].update(concepts_by_form[form])

    import networkx
    clics = networkx.parse_gml(
        (Path(__file__).parent / '../../../network-3-families.gml').open())
    r = {}
    for cognateset, concepts in concepts_by_cogset.items():
        centrality = networkx.algorithms.centrality.betweenness_centrality(
            clics.subgraph([c for c in concepts if c]))
        r[cognateset] = max(centrality, key=centrality.get)

    write_back = []
Example #17
def check_cognate_table(dataset: pycldf.Wordlist,
                        logger=cli.logger,
                        strict_concatenative=False) -> bool:
    """Check that the CognateTable makes sense.

    The cognate table MUST have an indication of forms, in a #formReference
    column, and cognate sets, in a #cognatesetReference column. It SHOULD have
    segment slices (#segmentSlice) and alignments (#alignment).

     - The segment slice must be a valid (1-based, inclusive) slice into the segments of the form
     - The alignment must match the segment slice applied to the segments of the form
     - The length of the alignment must match the lengths of other alignments of that cognate set
     - NA forms (including "" for “source reports form as unknown”) must not be in cognatesets


    If checking for strictly concatenative morphology, also check that the
    segment slice is a contiguous, non-overlapping section of the form.

    Having no cognates is a valid choice for a dataset, so this function returns True if no CognateTable was found.

    """

    # First, load all forms that are referenced in the CognateTable

    try:
        cognatetable = dataset["CognateTable"]
    except KeyError:
        # Having no cognates is a valid choice for a dataset.
        return True

    try:
        c_form = dataset["CognateTable", "formReference"].name
    except KeyError:
        log_or_raise("CognateTable does not have a #formReference column.")
        # All further checks don't make sense, return early.
        return False

    try:
        c_cognateset = dataset["CognateTable", "cognatesetReference"].name
    except KeyError:
        log_or_raise(
            "CognateTable does not have a #cognatesetReference column.")
        # All further checks don't make sense, return early.
        return False

    # The CLDF specifications state that foreign key references take precedence
    # over the implicit semantics of a `#xxxReference` column pointing to an
    # `#id` column, so we need to find forms by the stated foreign key
    # relationship.
    for foreign_key in cognatetable.tableSchema.foreignKeys:
        if foreign_key.columnReference == [c_form]:
            referenced_table = str(foreign_key.reference.resource)
            # A multi-column column reference for a single-column foreign key
            # makes no sense, so use tuple unpacking to extract the only
            # element from that list.
            (referenced_column, ) = foreign_key.reference.columnReference
            if (dataset[referenced_table].common_props["dc:conformsTo"]
                    != "http://cldf.clld.org/v1.0/terms.rdf#FormTable"):
                log_or_raise(
                    "CognateTable #formReference does not reference a FormTable.",
                )
            break
    else:
        log_or_raise("CognateTable #formReference must be a foreign key.")
        # All further checks don't make sense, return early.
        return False

    try:
        c_sslice = dataset["CognateTable", "segmentSlice"].name
    except KeyError:
        logger.info("CognateTable does not have a #segmentSlice column.")
        c_sslice = None

    try:
        c_alignment = dataset["CognateTable", "alignment"].name
    except KeyError:
        logger.info("CognateTable does not have an #alignment column.")
        c_alignment = None

    if c_sslice is None and c_alignment is None:
        # No additional data concerning the associations between forms and
        # cognate sets. That's sad, but valid.
        # All further checks don't make sense, return early.
        return True

    try:
        c_f_form = dataset[referenced_table, "form"].name

        def form_given(row):
            return row[c_f_form] and row[c_f_form].strip() != "-"

    except KeyError:
        if dataset[referenced_table] == dataset["FormTable"]:
            log_or_raise("FormTable does not have a #form column.")

        def form_given(row):
            return True

    # Check whether each row is valid.
    all_judgements_okay = True
    forms = cache_table(
        dataset,
        columns={"segments": dataset[referenced_table, "segments"].name},
        table=referenced_table,
        index_column=referenced_column,
        filter=form_given,
    )
    missing_forms = cache_table(
        dataset,
        columns={},
        table=referenced_table,
        index_column=referenced_column,
        filter=lambda row: not form_given(row),
    )
    cognateset_alignment_lengths: t.DefaultDict[
        t.Any, t.Set[int]] = t.DefaultDict(set)

    for f, j, judgement in dataset["CognateTable"].iterdicts(
            with_metadata=True):
        try:
            form_segments = forms[judgement[c_form]]["segments"]
        except KeyError:
            if judgement[c_form] in missing_forms:
                log_or_raise(
                    "In {}, row {}: NA form {} was judged to be in cognate set."
                    .format(f, j, judgement[c_form]), )
            # The case of a missing foreign key in general is already handled
            # by the basic CLDF validator.
            continue

        if c_sslice is not None:
            if not judgement[c_sslice]:
                log_or_raise("In {}, row {}: Empty segment slice".format(f, j))
                continue
            try:
                included_segments = list(
                    parse_segment_slices(judgement[c_sslice]))
                if (max(included_segments) >= len(form_segments)
                        or min(included_segments) < 0):
                    log_or_raise(
                        "In {}, row {}: Segment slice {} is invalid for segments {}"
                        .format(
                            f,
                            j,
                            judgement[c_sslice],
                            form_segments,
                        ), )
                    all_judgements_okay = False
                    continue
                if strict_concatenative:
                    s1 = included_segments[0]
                    for s2 in included_segments[1:]:
                        if s2 != s1 + 1:
                            log_or_raise(
                                "In {}, row {}: Segment slice {} has non-consecutive elements {}, {}"
                                .format(
                                    f,
                                    j,
                                    judgement[c_sslice],
                                    s1,
                                    s2,
                                ))
                        s1 = s2
            except ValueError:
                log_or_raise(
                    "In {}, row {}: Segment slice {} is invalid".format(
                        f,
                        j,
                        judgement[c_sslice],
                    ))
                all_judgements_okay = False
                continue
        else:
            included_segments = list(range(len(form_segments)))

        if c_alignment:
            # Length of alignment should match length of every other alignment in this cognate set.
            lengths = cognateset_alignment_lengths[judgement[c_cognateset]]
            alignment_length = len(judgement[c_alignment])
            if lengths and alignment_length not in lengths:
                log_or_raise(
                    "In {}, row {}: Alignment has length {}, other alignments of cognateset {} have length(s) {}"
                    .format(f, j, alignment_length, judgement[c_cognateset],
                            lengths), )
                all_judgements_okay = False
            elif not lengths:
                lengths.add(alignment_length)

            # Alignment when gaps are removed should match segments. TODO:
            # Should we permit other gap characters? Where do we know them
            # from? TODO: To be more robust when segments are separated into
            # morphemes, not individual segments, compare alignment and
            # segments space-separated.
            without_gaps = " ".join(
                [c or "" for c in judgement[c_alignment] if c != "-"])
            actual_segments = " ".join(form_segments[i]
                                       for i in included_segments)
            if without_gaps.strip() != actual_segments.strip():
                if unicodedata.normalize(
                        "NFKC", without_gaps.strip()) == unicodedata.normalize(
                            "NFKC", actual_segments.strip()):
                    comment = " This is down to encoding differences: Their normalized unicode representations are the same. I suggest you run `lexedata.edit.normalize_unicode`."
                else:
                    comment = ""
                log_or_raise(
                    "In {}, row {}: Referenced segments in form resolve to {}, while alignment contains segments {}.{}"
                    .format(f, j, actual_segments, without_gaps, comment), )
                all_judgements_okay = False

    return all_judgements_okay
Example #18
# TODO: Options given on the command line should have preference over defaults,
# no matter whether they are given in terms of names ("Parameter_ID") or
# property URLs ("parameterReference")
default_mergers: t.Mapping[str, Merger] = t.DefaultDict(
    lambda: default,
    {
        "form": must_be_equal,
        "Form": must_be_equal,
        "languageReference": must_be_equal,
        "Language_ID": must_be_equal,
        "source": union,
        "Source": union,
        "parameterReference": union,
        "Parameter_ID": union,
        "variants": union,
        "comment": concatenate,
        "Comment": concatenate,
        "value": concatenate,
        "Value": concatenate,
        "status": constant_factory("MERGED: Review necessary"),
        "orthographic": transcription("<{}>"),
        "phonemic": transcription("/{}/"),
        "phonetic": transcription("[{}]"),
        "segments": must_be_equal,
        "Segments": must_be_equal,
    },
)
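
Because the factory is lambda: default, looking up any column name that is not explicitly listed falls back to the default merger; a short sketch of that lookup behaviour, assuming the Merger callables defined alongside this table:

# Listed columns get their specific merger ...
assert default_mergers["form"] is must_be_equal
assert default_mergers["source"] is union
# ... and any unlisted column falls back to `default`.
assert default_mergers["Unlisted_Column"] is default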


def merge_group(
    forms: t.Sequence[types.Form],
Example #19
def read_wordlist(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    code_column: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> t.MutableMapping[types.Language_ID, t.MutableMapping[types.Parameter_ID,
                                                          t.Set]]:
    col_map = dataset.column_names

    if code_column:
        # Just in case that column was specified by property URL. We
        # definitely want the name. In any case, this will also throw a
        # helpful KeyError when the column does not exist.
        form_table_form = col_map.forms.form
        form_table_column = col_map.forms.id
        cognatesets = util.cache_table(
            dataset,
            columns={
                "form": form_table_column,
                "transcription": form_table_form,
                "code": dataset["FormTable", code_column].name,
            },
            filter=lambda row: bool(row[col_map.forms.form]),
        )
    else:
        # We search for cognatesetReferences in the FormTable or a separate
        # CognateTable.

        # Try the FormTable first.
        code_column = col_map.forms.cognatesetReference

        if code_column:
            # This is not the CLDF way, warn the user.
            form_table_column = col_map.forms.id
            form_table_form = col_map.forms.form
            logger.warning(
                "Your dataset has a cognatesetReference in the FormTable. Consider running lexedata.edit.add_cognate_table to create an explicit cognate table."
            )
            cognatesets = util.cache_table(
                dataset,
                columns={
                    "form": form_table_column,
                    "transcription": form_table_form,
                    "code": code_column,
                },
            )
        else:
            # There was no cognatesetReference in the form table. If we
            # find them in CognateTable (I mean, they should be there!), we
            # store them keyed with formReference.
            if (col_map.cognates and col_map.cognates.cognatesetReference
                    and col_map.cognates.formReference):
                code_column = col_map.cognates.cognatesetReference
                form_reference = col_map.cognates.formReference
                (foreign_key, ) = [
                    key
                    for key in dataset["CognateTable"].tableSchema.foreignKeys
                    if key.columnReference == [form_reference]
                ]
                (form_table_column, ) = foreign_key.reference.columnReference
                cognatesets = util.cache_table(
                    dataset,
                    "CognateTable",
                    {
                        "form": form_reference,
                        "code": code_column
                    },
                )
            else:
                raise ValueError(
                    "Dataset has no cognatesetReference column in its "
                    "primary table or in a separate cognate table. "
                    "Is this a metadata-free wordlist and you forgot to "
                    "specify code_column explicitly?")

    # Cognate sets have been loaded. Consolidate.
    cognates_by_form: t.MutableMapping[
        types.Form_ID, t.Set[types.Cognateset_ID]] = t.DefaultDict(set)
    for judgement in cognatesets.values():
        cognates_by_form[judgement["form"]].add(judgement["code"])
    parameter_column = col_map.forms.parameterReference

    # If one form can have multiple concepts,
    if dataset["FormTable", parameter_column].separator:

        def all_parameters(parameter):
            return list(parameter)

    else:

        def all_parameters(parameter):
            return [parameter]

    data: t.MutableMapping[types.Language_ID,
                           t.MutableMapping[types.Parameter_ID, t.Set]]
    if "LanguageTable" in dataset:
        (langref_target, ) = [
            key for key in dataset["FormTable"].tableSchema.foreignKeys
            if key.columnReference ==
            [dataset["FormTable", "languageReference"].name]
        ]
        ref_col = langref_target.reference.columnReference[0]
        data = {
            lang[ref_col]: t.DefaultDict(set)
            for lang in dataset["LanguageTable"]
        }
    else:
        data = t.DefaultDict(lambda: t.DefaultDict(set))
    for row in dataset["FormTable"].iterdicts():
        if not row[col_map.forms.form]:
            # Transcription is empty, should not be a form. Skip, but maybe
            # warn if it was in a cognateset.
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as empty (i.e. the source noted that the form is unknown), but it was judged to be in cognateset %s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    cognates_by_form[row[form_table_column]],
                )
            continue

        language = row[col_map.forms.languageReference]
        if row[col_map.forms.form] == "-":
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as '-' (i.e. “concept is not available in language %s”), but it was judged to be in cognateset %s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    language,
                    cognates_by_form[row[form_table_column]],
                )
                cognates_by_form[row[form_table_column]] = set()
            for parameter in all_parameters(row[parameter_column]):
                if data[language][parameter]:
                    logger.warning(
                        "Form %s claims concept %s is not available in language %s, but cognatesets %s are allocated to that concept in that language already.",
                        row[col_map.forms.id],
                        parameter,
                        row[col_map.forms.languageReference],
                        data[language][parameter],
                    )
        for parameter in all_parameters(row[parameter_column]):
            data[language][parameter] |= cognates_by_form[
                row[form_table_column]]
    return data
Example #20
def root_presence_code(
    dataset: t.Mapping[types.Language_ID,
                       t.Mapping[types.Parameter_ID,
                                 t.Set[types.Cognateset_ID]]],
    relevant_concepts: t.Mapping[types.Cognateset_ID,
                                 t.Iterable[types.Parameter_ID]],
    ascertainment: t.Sequence[Literal["0", "1", "?"]] = ["0"],
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[t.Mapping[types.Language_ID, t.List[Literal["0", "1", "?"]]],
             t.Mapping[types.Cognateset_ID, int], ]:
    """Create a root-presence/absence coding from cognate codes in a dataset

    Take the cognate code information from a wordlist, i.e. a mapping of the
    form {Language ID: {Concept ID: {Cognateset ID}}}, and generate a binary
    alignment from it that lists for every root whether it is present in that
    language or not. Return that, and the association between cognatesets and
    characters.

    >>> alignment, roots = root_presence_code(
    ...     {"Language": {"Meaning": {"Cognateset 1"}}},
    ...     relevant_concepts={"Cognateset 1": ["Meaning"]})
    >>> alignment
    {'Language': ['0', '1']}
    >>> roots
    {'Cognateset 1': 1}

    The first entry in each sequence is always '0': The configuration where a
    form is absent from all languages is never observed, but always possible,
    so we add this entry for the purposes of ascertainment correction.

    If a root is attested at all, in any concept, it is considered present.
    Because the word list is never a complete description of the language's
    lexicon, the function employs a heuristic to generate ‘absent’ states.

    If a root is unattested, and at least half of the relevant concepts
    associated with this root are attested, but each expressed by another root,
    the root is assumed to be absent in the target language. (If there is
    exactly one central concept, then that central concept being attested or
    unknown is a special case of this general rule.) Otherwise the
    presence/absence of the root is considered unknown.

    >>> alignment, roots = root_presence_code(
    ...     {"l1": {"m1": {"c1"}},
    ...      "l2": {"m1": {"c2"}, "m2": {"c1", "c3"}}},
    ...     relevant_concepts={"c1": ["m1"], "c2": ["m1"], "c3": ["m2"]})
    >>> sorted(roots)
    ['c1', 'c2', 'c3']
    >>> sorted_roots = sorted(roots.items())
    >>> {language: [sequence[k[1]] for k in sorted_roots] for language, sequence in alignment.items()}
    {'l1': ['1', '0', '?'], 'l2': ['1', '1', '1']}
    >>> list(zip(*sorted(zip(*alignment.values()))))
    [('0', '0', '1', '?'), ('0', '1', '1', '1')]

    """
    all_roots: t.Set[types.Cognateset_ID] = set(relevant_concepts)
    language_roots: t.MutableMapping[
        types.Language_ID, t.Set[types.Cognateset_ID]] = t.DefaultDict(set)
    for language, lexicon in dataset.items():
        for concept, cognatesets in lexicon.items():
            if not cognatesets:
                logger.warning(
                    f"The root presence coder script got a language ({language}) with an improper lexicon: There is a form associated with Concept {concept}, but no cognate sets are associated with it."
                )
            for cognateset in cognatesets:
                language_roots[language].add(cognateset)

    all_roots_sorted: t.Sequence[types.Cognateset_ID] = sorted(all_roots)

    alignment = {}
    roots = {}
    for language, lexicon in dataset.items():
        alignment[language] = list(ascertainment)
        for root in all_roots_sorted:
            roots[root] = len(alignment[language])
            if root in language_roots[language]:
                alignment[language].append("1")
            else:
                n_concepts = 0
                n_filled_concepts = 0
                for concept in relevant_concepts[root]:
                    n_concepts += 1
                    if lexicon.get(concept):
                        n_filled_concepts += 1
                if 2 * n_filled_concepts >= n_concepts:
                    alignment[language].append("0")
                else:
                    alignment[language].append("?")

    return alignment, roots
Example #21
def apply_heuristics(
    dataset: types.Wordlist,
    heuristic: t.Optional[AbsenceHeuristic] = None,
    primary_concepts: t.Union[
        types.WorldSet[types.Parameter_ID],
        t.AbstractSet[types.Parameter_ID]] = types.WorldSet(),
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Cognateset_ID, t.Set[types.Parameter_ID]]:
    """Compute the relevant concepts for cognatesets, depending on the heuristic.

    These concepts will be considered when deciding whether a root is deemed
    absent in a language.

    For the CentralConcept heuristic, the relevant concepts are the
    central concept of a cognateset, as given by the #parameterReference column
    of the CognatesetTable. A central concept not included in the
    primary_concepts is ignored with a warning.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concept",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference"))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concept": "concept1"}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {'cognateset1': {'concept1'}}
    True

    This extends to the case where a cognateset may have more than one central concept.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concepts",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
    ...         separator=","))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concepts": ["concept1", "concept2"]}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {
    ...     'cognateset1': {'concept1', 'concept2'}}
    True

    For the HalfPrimaryConcepts heuristic, the relevant concepts are all
    primary concepts connected to a cognateset.

    >>> ds = util.fs.new_wordlist(
    ...     FormTable=[
    ...         {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1", "Form": "x"},
    ...         {"ID": "f2", "Parameter_ID": "c2", "Language_ID": "l1", "Form": "x"}],
    ...     CognateTable=[
    ...         {"ID": "1", "Form_ID": "f1", "Cognateset_ID": "s1"},
    ...         {"ID": "2", "Form_ID": "f2", "Cognateset_ID": "s1"}])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.HALFPRIMARYCONCEPTS) == {
    ...     's1': {'c1', 'c2'}}
    True


    NOTE: This function cannot guarantee that every cognateset has at least
    one relevant concept; there may be cognatesets without any! A cognateset
    with 0 relevant concepts will always be included, because 0 is at least
    half of 0.

    """
    heuristic = (heuristic if heuristic is not None else
                 (AbsenceHeuristic.CENTRALCONCEPT if
                  ("CognatesetTable", "parameterReference") in dataset else
                  AbsenceHeuristic.HALFPRIMARYCONCEPTS))

    relevant_concepts: t.MutableMapping[
        types.Cognateset_ID, t.Set[types.Parameter_ID]] = t.DefaultDict(set)

    if heuristic is AbsenceHeuristic.HALFPRIMARYCONCEPTS:
        c_f = dataset["CognateTable", "formReference"].name
        c_s = dataset["CognateTable", "cognatesetReference"].name
        concepts = util.cache_table(
            dataset,
            "FormTable",
            {"concepts": dataset["FormTable", "parameterReference"].name},
        )
        for j in dataset["CognateTable"]:
            form = concepts[j[c_f]]
            for concept in util.ensure_list(form["concepts"]):
                relevant_concepts[j[c_s]].add(concept)

    elif heuristic is AbsenceHeuristic.CENTRALCONCEPT:
        c_cognateset_concept = dataset["CognatesetTable",
                                       "parameterReference"].name
        c_id = dataset["CognatesetTable", "id"].name
        for c in dataset["CognatesetTable"]:
            for concept in util.ensure_list(c[c_cognateset_concept]):
                if concept not in primary_concepts:
                    logger.warning(
                        f"The central concept {concept} of cognateset {c[c_id]} was not part of your list of primary concepts to be included in the coding, so the cognateset will be ignored."
                    )
                else:
                    relevant_concepts[c[c_id]].add(concept)

    else:
        raise TypeError(
            f"Value of heuristic, {heuristic}, did not correspond to a known AbsenceHeuristic."
        )

    return relevant_concepts
Example #22
    all_mergers,
    default,
    first,
    format_mergers,
    must_be_equal,
    parse_homophones_report,
    parse_merge_override,
)

# TODO: Options given on the command line should have preference over defaults,
# no matter whether they are given in terms of names ("Parameter_ID") or
# property URLs ("parameterReference")
default_mergers: t.Mapping[str, Merger] = t.DefaultDict(
    lambda: default,
    {
        "Name": first,
        "parameterReference": first,
    },
)


def merge_group(
    cogsets: t.Sequence[types.CogSet],
    target: types.CogSet,
    mergers: t.Mapping[str, Merger],
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    logger: cli.logging.Logger = cli.logger,
) -> types.CogSet:
    """Merge one group of cognate sets
Example #23
def coverage_report(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    min_percentage: float = 0.0,
    with_concept: t.Iterable[types.Parameter_ID] = set(),
    missing: Missing = Missing.KNOWN,
    only_coded: bool = True,
) -> t.List[t.List[str]]:
    coded: t.Container[types.Form_ID]
    if only_coded:
        try:
            c_j_form = dataset["CognateTable", "formReference"].name
        except KeyError:
            cli.Exit.NO_COGNATETABLE(
                message="You requested that I only count cognate coded forms, but you have no CognateTable containing judgements."
            )
        coded = {judgement[c_j_form] for judgement in dataset["CognateTable"]}
    else:
        coded = types.WorldSet()

    languages: t.Dict[types.Language_ID, str] = {}
    try:
        c_l_id = dataset["LanguageTable", "id"].name
        c_l_name = dataset["LanguageTable", "name"].name
        for language in dataset["LanguageTable"]:
            languages[language[c_l_id]] = language[c_l_name]
    except KeyError:
        pass

    concepts: t.DefaultDict[types.Language_ID,
                            t.Counter[types.Parameter_ID]] = t.DefaultDict(
                                t.Counter)
    c_f_id = dataset["FormTable", "id"].name
    c_concept = dataset["FormTable", "parameterReference"].name
    c_language = dataset["FormTable", "languageReference"].name
    c_form = dataset["FormTable", "form"].name
    for form in dataset["FormTable"]:
        languages.setdefault(form[c_language], form[c_language])
        if form[c_f_id] not in coded:
            continue
        if missing == Missing.IGNORE and (not form[c_form]
                                          or form[c_form] == "-"):
            continue
        if missing == Missing.KNOWN and not form[c_form]:
            continue
        c: types.Parameter_ID
        for c in util.ensure_list(form[c_concept]):
            concepts[form[c_language]][c] += 1

    # load primary concepts and number of concepts
    primary_concepts: t.Container[types.Parameter_ID]
    try:
        c_c_id = dataset["ParameterTable", "id"].name
        primary_concepts = [
            c[c_c_id] for c in dataset["ParameterTable"] if c["Primary"]
        ]
        total_number_concepts = len(primary_concepts)
    except KeyError:
        cli.logger.warning(
            "ParameterTable doesn't contain a column 'Primary'. Primary concepts couldn't be loaded. "
            "Loading all concepts.")
        primary_concepts = types.WorldSet()

        try:
            total_number_concepts = len(list(dataset["ParameterTable"]))
        except KeyError:
            total_number_concepts = len(
                set.union(*(set(cs) for cs in concepts.values())))

    data_languages = []
    for language, name in languages.items():
        conceptlist = concepts[language]
        try:
            synonyms = sum(conceptlist.values()) / len(conceptlist)
        except ZeroDivisionError:
            synonyms = float("nan")

        # percentage of all concepts covered by this language
        conceptlist_percentage = len(conceptlist) / total_number_concepts
        if conceptlist_percentage * 100 < min_percentage:
            continue

        if not all(c in conceptlist for c in with_concept):
            continue

        # count primary concepts
        primary_count = 0
        for c in conceptlist:
            if c in primary_concepts:
                primary_count += 1
        # if args.languages_only:
        #     print(language)
        data_languages.append([
            language,
            name,
            primary_count,
            conceptlist_percentage,
            synonyms,
        ])
    return data_languages