Example #1
def parser():
    parser = cli.parser(
        description="Create an Excel cognate view from a CLDF dataset")
    parser.add_argument(
        "excel",
        type=Path,
        help="File path for the generated cognate excel file.",
    )
    parser.add_argument(
        "--size-sort",
        action="store_true",
        default=False,
        help=
        "List the biggest cognatesets first (within a group, if another sort order is specified by --sort-cognatesets-by)",
    )
    parser.add_argument(
        "--sort-languages-by",
        help=
        "The name of a column in the LanguageTable to sort languages by in the output",
    )
    parser.add_argument(
        "--sort-cognatesets-by",
        help=
        "The name of a column in the CognatesetTable to sort cognates by in the output",
    )
    parser.add_argument(
        "--url-template",
        type=str,
        default="https://example.org/lexicon/{:}",
        help=
        "A template string for URLs pointing to individual forms. For example, to"
        " point to lexibank, you would use https://lexibank.clld.org/values/{:}."
        " (default: https://example.org/lexicon/{:})",
    )
    parser.add_argument(
        "--add-singletons-with-status",
        default=None,
        metavar="MESSAGE",
        help=
        "Include in the output all forms that don't belong to a cognateset. For each form, a singleton cognateset is created, and its status column (if there is one) is set to MESSAGE.",
    )
    parser.add_argument(
        "--add-singletons",
        action="store_const",
        const="automatic singleton",
        help="Short for `--add-singletons-with-status='automatic singleton'`",
        dest="add_singletons_with_status",
    )
    parser.add_argument(
        "--by-segment",
        default=False,
        action="store_true",
        help=
        "If adding singletons: Instead of creating singleton cognate sets only for forms that are not cognate coded at all, make sure every contiguous set of segments in every form is in a cognate set.",
    )
    return parser
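
A minimal usage sketch for a parser factory like the one above, following the __main__ pattern of the later examples (the attribute names come from the arguments defined above):

args = parser().parse_args()
logger = cli.setup_logging(args)
logger.info("Writing cognate view to %s", args.excel)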
Example #2
def parser():
    parser = cli.parser(description=__doc__.split("\n\n\n")[0],
                        epilog=__doc__.split("\n\n\n")[1])
    parser.add_argument("column",
                        help="The column to filter.",
                        metavar="COLUMN")
    parser.add_argument("filter",
                        help="An expression to filter by.",
                        metavar="FILTER")
    parser.add_argument(
        "table",
        nargs="?",
        help=
        "The table to filter. If you want to filter a CSV table from standard input, leave this argument out.",
        metavar="TABLE",
    )
    parser.add_argument(
        "--invert",
        "-V",
        action="store_true",
        default=False,
        help="Output exactly the NON-matching lines",
    )
    parser.add_argument(
        "--output-columns",
        "-c",
        nargs="+",
        default=[],
        help=
        "Output only the given columns, in the same order as given.",
    )
    parser.add_argument(
        "--output-file",
        "-o",
        type=argparse.FileType("w"),
        default=sys.stdout,
        help=
        "Write output to file OUTPUT_FILE, instead of to the console as stdout.",
    )

    return parser
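
For testing, an argparse parser like the one above can be driven with an explicit argument list instead of sys.argv; the column name and filter value here are made up for illustration:

args = parser().parse_args(
    ["Language_ID", "ache", "forms.csv", "--invert", "-c", "ID", "Form"])
assert args.invert
assert args.output_columns == ["ID", "Form"]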
Example #3
        for c in remove:
            form.pop(c, None)
        return form

    forms = [clean_form(form) for form in dataset["FormTable"]]
    for c in remove:
        ix = cols.index(dataset["FormTable", c])
        del cols[ix]

    dataset.write(FormTable=forms)

    dataset.write(CognateTable=cognate_judgements)


if __name__ == "__main__":
    parser = cli.parser(__doc__)
    parser.add_argument(
        "--unique-id",
        choices=["dataset", "concept"],
        default=False,
        help=
        "Are cognateset IDs unique over the whole *dataset* (including, but not limited to, cross-meaning cognatesets), or are they unique only *within a concept* (eg. cognateset 1 for concept ‘the hand’ has no relation cognateset 1 for concept ‘to eat’",
    )
    args = parser.parse_args()
    logger = cli.setup_logging(args)

    split: bool
    if args.unique_id == "dataset":
        split = False
    elif args.unique_id == "concept":
        split = True
    else:
        parser.error("--unique-id must be either 'dataset' or 'concept'")
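
What the split flag implies downstream, as a sketch (the combined-ID scheme is an illustration, not necessarily what this script does internally): IDs that are only unique within a concept must be prefixed with the concept to become unique across the dataset.

def dataset_wide_id(cognateset_id: str, concept_id: str, split: bool) -> str:
    # Hypothetical helper: prefix per-concept cognateset IDs with their
    # concept, so that cognateset "1" of 'the hand' and cognateset "1" of
    # 'to eat' no longer collide.
    return f"{concept_id}_{cognateset_id}" if split else cognateset_id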
Example #4
import pycldf

import lexedata.cli as cli
from lexedata.util.simplify_ids import update_ids

if __name__ == "__main__":
    parser = cli.parser(
        description=
        "Replace the ID of an object (e.g. a language ID) in the wordlist")
    parser.add_argument("table",
                        type=str,
                        help="The table to apply the replacement to",
                        metavar="TABLE")
    parser.add_argument("original",
                        type=str,
                        help="Original ID to be replaced",
                        metavar="ORIGINAL")
    parser.add_argument("replacement",
                        type=str,
                        help="New ID of ORIGINAL",
                        metavar="REPLACEMENT")
    parser.add_argument(
        "--merge",
        action="store_true",
        default=False,
        help=
        "When the replacement would lead to two IDs being merged, warn, but proceed.",
    )
    args = parser.parse_args()
    logger = cli.setup_logging(args)
Example #5
def parser():
    """Construct the CLI argument parser for this script."""
    parser = cli.parser(
        description=
        "Export a CLDF dataset to a coded character matrix to be used as input for phylogenetic analyses."
    )

    parser.add_argument(
        "--format",
        choices=("csv", "raw", "beast", "nexus"),
        default="raw",
        help=
        """Output format: `raw` for one language name per row, followed by spaces and
            the character state vector; `nexus` for a complete Nexus file; `beast`
            for the <data> tag to copy to a BEAST file; `csv` for a CSV
            with languages in rows and characters in columns. (default: raw)""",
    )
    parser.add_argument(
        "-b",
        action="store_const",
        const="beast",
        dest="format",
        help="""Short form of --format=beast""",
    )
    parser.add_argument(
        "--output-file",
        "-o",
        type=Path,
        help=
        """File to write output to. (If format=beast and output file exists, replace the
            first `data` tag in there.) (default: Write to stdout)""",
    )
    parser.add_argument(
        "--languages",
        action=cli.ListOrFromFile,
        help="Languages to include in the alignment.",
    )
    parser.add_argument(
        "--concepts",
        action=cli.ListOrFromFile,
        help="Concepts to be included or treated as primary concepts.",
    )
    parser.add_argument(
        "--cognatesets",
        action=cli.ListOrFromFile,
        help="Cognate sets to consider for the alignment.",
    )
    parser.add_argument(
        "--coding",
        action=cli.enum_from_lower(CodingProcedure),
        default="RootMeaning",
        help=
        """Coding method: In the `RootMeaning` coding method, every character
        describes the presence or absence of a particular root morpheme or
        cognate class in the word(s) for a given meaning; In the
        `RootPresence`, every character describes (up to the limitations of the
        data, which might not contain marginal forms) the presence or absence
        of a root (morpheme) in the language, independet of which meaning that
        root is attested in; And in the `Multistate` coding, each character
        describes, possibly including uniform ambiguities, the cognate class of
        a meaning. (default: RootMeaning)""",
    )
    parser.add_argument(
        "--absence-heuristic",
        action=cli.enum_from_lower(AbsenceHeuristic),
        help=
        """In case of --coding=rootpresence, which heuristic should be used for the
        coding of absences? The default depends on whether the dataset contains
        a #parameterReference column in its CognatesetTable: If there is one,
        or for --heuristic=CentralConcept, a root is considered absent
        when that concept (or at least half of them, if it is multi-valued) are
        attested with other roots. In the other case, or for
        --heuristic=HalfPrimaryConcepts, a root is considered absent when
        at least half the the concepts it is connected to are attested with
        other roots in the language.""",
    )
    parser.add_argument(
        "--stats-file",
        type=Path,
        help=
        "Path to a TeX file that will be filled with LaTeX command definitions for some summary statistics. (default: Don't write a stats file)",
    )
    return parser
Example #6
            sheet=sheet,
            logger=logger,
            match_form=match_form,
            entries_to_concepts=concepts,
            concept_column=concept_column,
            ignore_missing=ignore_missing,
            ignore_superfluous=ignore_superfluous,
            status_update=status_update,
        ).items():
            report[lang] += subreport
    return report


if __name__ == "__main__":
    parser = cli.parser(
        description="Import forms and associated metadata from an excel file to a cldf dataset."
    )
    parser.add_argument(
        "excel",
        type=openpyxl.load_workbook,
        help="The Excel file to parse",
        metavar="EXCEL",
    )
    parser.add_argument(
        "--concept-name",
        type=str,
        help="Column to interpret as concept names "
        "By default, it is assumed that the #parameterReference column, usually named 'Concept_ID' "
        "or similar, matches the IDs of the concept. Use this "
        "switch if instead of concept IDs you have concept Names in the excel file instead.",
        metavar="COLUMN",
Example #7
        if there is one.

        """
        form, metadata = form
        cell_value = self.form_to_cell_value(form)
        form_cell = self.ws.cell(row=row, column=column, value=cell_value)
        comment = form.pop("comment", None)
        if comment:
            form_cell.comment = op.comments.Comment(comment, __package__)
        if self.URL_BASE:
            link = self.URL_BASE.format(urllib.parse.quote(form["id"]))
            form_cell.hyperlink = link


if __name__ == "__main__":
    parser = cli.parser(
        description="Create an Excel matrix view from a CLDF dataset")
    parser.add_argument(
        "excel",
        type=Path,
        help="File path for the generated cognate excel file.",
    )
    parser.add_argument(
        "--concepts",
        action=cli.ListOrFromFile,
        help="Concepts to output.",
    )
    parser.add_argument(
        "--sort-languages-by",
        help=
        "The name of a column in the LanguageTable to sort languages by in the output",
    )
Example #8
        dataset.add_columns(table_name, "Status_Column")
    else:
        cli.logger.info(
            f"Table {table_name} already contains a Status_Column.")


def status_column_to_table_list(dataset: pycldf.Dataset,
                                tables: t.List[str]) -> pycldf.Dataset:
    for table in tables:
        add_status_column_to_table(dataset, table)
    return dataset
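
A usage sketch for the helper above (the metadata path is illustrative; pycldf.Dataset.from_metadata is the loading pattern used in the other examples):

dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
status_column_to_table_list(dataset, ["FormTable", "CognateTable"])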


if __name__ == "__main__":

    parser = cli.parser(
        description="Add Status_Column to specified tables of the dataset")
    parser.add_argument(
        "tables",
        type=str,
        nargs="*",
        default=[],
        help="Table names and files to which to add Status_Column "
        "(default: FormTable, CognatesetTable, CognateTable, ParameterTable)",
    )
    parser.add_argument(
        "--exclude",
        type=str,
        nargs="*",
        default=[],
        help="Table names to exclude (takes precedence over table-names)",
        metavar="TABLE",
Example #9
        if set(item.__annotations__) == {"sequence", "target", "return"}:
            # It would be better to check the actual types, instead of the
            # parameter names, but that would need a deeper delve into the
            # typing system.
            all_mergers.add(item)
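
A sketch of a merger function that the annotation check above would pick up; the parameter types are illustrative, and its first docstring line is what the epilog below prints:

def concatenate(sequence: list, target: dict) -> str:
    """Concatenate all entries, separated by spaces."""
    return " ".join(str(s) for s in sequence)

assert set(concatenate.__annotations__) == {"sequence", "target", "return"}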

if __name__ == "__main__":
    parser = cli.parser(
        description="Script for merging homophones.",
        epilog="""The default merging functions are:
{:}

Every other column is merged with `default`.

The following merge functions are predefined, each takes the given entries for one column of the forms to be merged and aggregates them into a single new entry for the merged result form.
{:}
        """.format(
            format_mergers(default_mergers),
            "\n".join(
                sorted("{}: {}".format(m.__name__,
                                       m.__doc__.split("\n")[0])
                       for m in all_mergers)),
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "merge_file",
        type=Path,
        help=
        "Path pointing to the file containing the mergers, in the same format as output by report.homophones",
        metavar="MERGE_FILE",
    )
Example #10
import unicodedata
from pathlib import Path

from lexedata import cli


def n(s: str) -> str:
    return unicodedata.normalize("NFC", s)


def normalize(file, original_encoding="utf-8"):
    # TODO: If this ever takes more than a second, add a cli.tq progress bar
    with file.open(encoding=original_encoding) as infile:
        content = infile.read()
    with file.open("w", encoding=original_encoding) as outfile:
        outfile.write(n(content))


if __name__ == "__main__":
    parser = cli.parser(__doc__.split("\n\n\n")[1])
    parser.add_argument(
        "file",
        nargs="*",
        type=Path,
        help=
        "The file(s) to re-encode. Default: All table files included by the metadata file, though not the sources.",
    )
    parser.add_argument("--from-encoding",
                        default="utf-8",
                        help="original encoding")
    args = parser.parse_args()
    logger = cli.setup_logging(args)
    if not args.file:
        args.file = [
            # TODO: Check whether other places using table.url.string might
Example #11
#@root_formatter=COGID
#@note_formatter=undefined
#@pattern_formatter=undefined
#@publish=undefined
#@_almcol=ALIGNMENT
#@filename={:s}
#@navbar=true
#@_morphology_mode=partial""".format(
        languages,
        concepts,
        file.name,
    ))


if __name__ == "__main__":
    parser = cli.parser(
        description="Export #FormTable to tsv format for import to edictor")
    parser.add_argument(
        "--languages",
        action=cli.ListOrFromFile,
        help="Export only forms from these languages.",
    )
    parser.add_argument(
        "--concepts",
        action=cli.ListOrFromFile,
        help="Export only forms connected to these concepts.",
    )
    parser.add_argument(
        "--cognatesets",
        action=cli.ListOrFromFile,
        help="Export only these cognate sets.",
    )
Example #12
        properties = {
            c_j_id: singleton_id,
            c_j_cogset: singleton_id,
            c_j_form: form,
            c_j_segmentslice: indices_to_segment_slice(slice),
            c_j_alignment: [forms[form]["segments"][i] for i in slice],
            "Status_Column": status,
        }
        for column in dataset["CognateTable"].tableSchema.columns:
            judgement[column.name] = properties.get(column.name)
        all_judgements.append(judgement)
    return all_cognatesets.values(), all_judgements


if __name__ == "__main__":
    parser = cli.parser(
        description="Add singleton cognatesets to a CLDF dataset")
    parser.add_argument(
        "--status",
        default=None,
        metavar="MESSAGE",
        help=
        "For each part of a form that is not cognate coded, a singleton cognateset is created, and its status column (if there is one) is set to MESSAGE.",
    )
    parser.add_argument(
        "--by-segment",
        default=False,
        action="store_true",
        help=
        "Instead of creating singleton cognate sets only for forms that are not cognate coded at all, make sure every contiguous set of segments in every form is in a cognate set.",
    )
    args = parser.parse_args()
Example #13
        if status_update:
            add_status_column_to_table(dataset=dataset,
                                       table_name="CognateTable")
        excel_cognate_parser = ECP(dataset, row_type=CogSet)
        excel_cognate_parser.db.cache_dataset()
        for sheet in openpyxl.load_workbook(cognate_lexicon).worksheets:
            excel_cognate_parser.parse_cells(sheet, status_update=status_update)
        excel_cognate_parser.db.write_dataset_from_cache()


if __name__ == "__main__":
    import pycldf

    parser = cli.parser(
        description="Imports a dataset from an excel file into CLDF. "
        "The import is configured by a special key in the metadata file, check "
        "./test/data/cldf/smallmawetiguarani/Wordlist-metadata.json for examples."
    )
    parser.add_argument(
        "wordlist",
        nargs="?",
        default=None,
        help="Path to an Excel file containing the dataset",
        metavar="EXCEL",
    )
    parser.add_argument(
        "--cogsets",
        type=Path,
        default=None,
        help=
        "Path to an optional second Excel file containing cogsets and cognate judgements",
Example #14
from collections import defaultdict

import pycldf

import lexedata.cli as cli
from lexedata.util.simplify_ids import update_ids, string_to_id

if __name__ == "__main__":
    parser = cli.parser(
        description="Replace the ID column of a table by some other column"
    )
    parser.add_argument(
        "table", type=str, help="The table to apply the replacement to", metavar="TABLE"
    )
    parser.add_argument(
        "replacement",
        type=str,
        help="Name of the replacement column",
        metavar="REPLACEMENT",
    )
    parser.add_argument(
        "--merge",
        action="store_true",
        default=False,
        help="When the replacement would lead to two IDs being merged, warn, but proceed.",
    )
    parser.add_argument(
        "--literally",
        action="store_true",
        default=False,
        help="Use the REPLACEMENT literally, instead of simplifying it. (Run lexedata.edit.simplify_ids if you change your mind later.)",
Example #15
                            " ".join(
                                forms_cache.get(form)["segments"][i]
                                for i in sorted(s1 & s2)),
                        )
                    logger.info(
                        f"In form {form}, segments {as_text} are in both cognate sets {c1} and {c2}."
                    )
                    if len(s1 & s2) >= min(len(s1), len(s2)) / 2:
                        mergers.add((c1, c2))
    return mergers


if __name__ == "__main__":
    parser = cli.parser(
        description="List segments that indicate non-concatenative morphology.",
        epilog=
        "If you want a more general report on the cognate judgements, run `lexedata.report.judgements`.",
    )
    parser.add_argument(
        "--cognatesets",
        action=cli.ListOrFromFile,
        help=
        "Only use these cognate sets as indication of overlapping morphemes.",
    )
    parser.add_argument(
        "--output-file",
        "-o",
        help="Path to output file (default: output to stdout)",
        type=Path,
    )
Example #16
    gloss_languages: t.Dict[str, str] = dict(language)
    add_concepticon_references(
        dataset,
        gloss_languages=gloss_languages,
        status_update=status_update,
        overwrite=overwrite,
    )

    if concepticon_glosses:
        add_concepticon_names(dataset)
    if concepticon_definition:
        add_concepticon_definitions(dataset=dataset)


if __name__ == "__main__":
    parser = cli.parser(description=__doc__.split("\n\n\n")[0],
                        epilog=__doc__.split("\n\n\n")[1])
    parser.add_argument(
        "--overwrite",
        action="store_true",
        default=False,
        help="Set Concepticon reference even if one exists already",
    )
    parser.add_argument(
        "--add-concept-set-names",
        action="store_true",
        default=False,
        help=
        "Add/update a column containing Concepticon's concept set names (glosses)",
    )
    parser.add_argument(
        "--add-definitions",
Example #17
    ):
        if not overwrite_existing and row[c_core_concept]:
            continue
        row[c_core_concept] = central.get(
            row[dataset.column_names.cognatesets.id])
        row["Status_Column"] = status_update
        write_back.append(row)
    dataset.write(CognatesetTable=write_back)
    return dataset


if __name__ == "__main__":
    parser = cli.parser(description="""Add central concepts to cognatesets.

        Write a #ParameterReference column to #CognatesetTable based on the
        concepts linked to the cognateset through the cognate judgements. If
        links to Concepticon are available, the central concept is calculated
        according to CLICS. Otherwise, the most common concept is retained
        as the central concept.""")
    parser.add_argument(
        "--overwrite",
        action="store_true",
        default=False,
        help=
        "Overwrite #parameterReference values of cognate sets already given in the dataset",
    )
    parser.add_argument(
        "--status-update",
        type=str,
        default="automatic central concepts",
        help=
Example #18
            concept = form[c_concept]
            language = form[c_language]
            if (language not in concepts_to_languages[concept]
                    and concept in primary_concepts):
                concepts_to_languages[concept].append(language)

    data_concepts = []
    for k, v in concepts_to_languages.items():
        data_concepts.append([k, len(set(v))])

    return data_concepts
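
The return value is a list of [concept, language_count] pairs, e.g. (with made-up data) [["hand", 12], ["eat", 9]] to say that 'hand' is attested in twelve languages and 'eat' in nine.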


if __name__ == "__main__":
    parser = cli.parser(
        description=
        "Summarise coverage, i.e. how many concepts are known for each language."
    )
    # parser.add_argument(
    #     "--min-concepts",
    #     default=0,
    #     type=int,
    #     help="Only include languages with at least M concepts",
    #     metavar="M",
    # )
    parser.add_argument(
        "--min-percentage",
        default=0,
        type=float,
        help="Only include languages with at least M%% concepts",
        metavar="M",
    )
Example #19
            else:
                x = ":"
            clics_nodes -= {None}
            if len(clics_nodes) <= 1:
                x = "Unknown" + x
            elif nx.is_connected(clics.subgraph(clics_nodes)):
                x = "Connected" + x
            else:
                x = "Unconnected" + x
            line = f"{lang}, '{form}': {x}\n"
            for ele in sorted(meanings):
                line += f"\t {ele[-1]} ({', '.join(ele[0:-1])})\n"
            out.write(line)


if __name__ == "__main__":
    parser = cli.parser(description=__doc__)
    parser.add_argument(
        "--output-file",
        "-o",
        help="Path to output file (default: output to stdout)",
        type=Path,
    )
    args = parser.parse_args()
    logger = cli.setup_logging(args)
    list_homophones(
        dataset=pycldf.Dataset.from_metadata(args.metadata),
        out=args.output_file.open("w") if args.output_file else sys.stdout,
        logger=logger,
    )
Example #20
            )

    cognate.sort(key=lambda j: j["id"])
    m = {
        util.cldf_property(c.propertyUrl) or c.name: c.name
        for c in dataset["CognateTable"].tableSchema.columns
    }
    dataset["CognateTable"].write(
        [{m[k]: v for k, v in j.items() if k in m} for j in cognate]
    )
    # TODO: write new sets to cognateset table


if __name__ == "__main__":
    parser = cli.parser(
        description="Import the tsv format used by Edictor and Lingpy. Try to only change the subset of forms and cognatesets contained in the TSV, from a partial export."
    )
    parser.add_argument(
        "--source",
        default=None,
        metavar="MESSAGE",
        # TODO: Source is not really the right place, even though it's used for
        # this in one of our datasets.
        help="Set the source of all new cognate sets and all new cognate judgements to MESSAGE.",
    )

    parser.add_argument(
        "--input-file",
        "-i",
        type=Path,
        default="cognate.tsv",