Example #1
0
def test_properties_regex_error():
    """A row-property regex that cannot match must raise ValueError naming the cell."""
    metadata = copy_metadata(
        original=Path(__file__).parent
        / "data/cldf/smallmawetiguarani/cldf-metadata.json"
    )
    dataset = pycldf.Dataset.from_metadata(metadata)
    dialect = argparse.Namespace(
        **dataset.tablegroup.common_props["special:fromexcel"]
    )
    workbook_path = Path(__file__).parent / "data/excel/small_defective_no_regexes.xlsx"
    sheet = openpyxl.load_workbook(workbook_path).active
    # The Name regex is deliberately broken: it demands a literal '[' prefix.
    dialect.row_cell_regexes = [
        "(?P<set>.*)",
        r"(?P<Name>\[.*)",
        "(?P<English>.*)",
        "(?P<Spanish>.*)",
        "(?P<Portuguese>.*)",
        "(?P<French>.*)",
    ]
    parser_class = f.excel_parser_from_dialect(dataset, dialect, cognate=False)
    parser = parser_class(dataset)

    with pytest.raises(
        ValueError,
        match=r"In cell B3: Expected to encounter match for .*, but found no_concept_name",
    ):
        parser.parse_cells(sheet)
Example #2
0
def test_no_first_row_in_excel(empty_excel):
    """Loading a dataset from an empty Excel sheet fails with a helpful message."""
    metadata = copy_metadata(
        original=Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json"
    )
    expected_message = (
        "Your first data row didn't have a name. Please check your format specification or ensure the "
        "first row has a name."
    )
    with pytest.raises(AssertionError, match=expected_message):
        f.load_dataset(metadata=metadata, lexicon=empty_excel)
Example #3
0
def test_db_chache():
    """DB.cache_dataset() creates one (empty) cache entry per dataset table.

    NOTE(review): the test name contains a typo ("chache"); kept unchanged so
    existing test selections and CI history remain valid.
    """
    copy = copy_metadata(Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json")
    dataset = pycldf.Dataset.from_metadata(copy)
    db = f.DB(output_dataset=dataset)
    db.cache_dataset()
    expected = {}
    for table in dataset.tables:
        # Table type is the URL fragment after '#', falling back to the table
        # URL. Use [-1] rather than [1]: with a missing "dc:conformsTo" (or
        # one without '#'), rsplit returns a single-element list and [1]
        # would raise IndexError before the `or table.url` fallback could
        # ever apply.
        table_type = (
            table.common_props.get("dc:conformsTo", "").rsplit("#", 1)[-1]
            or table.url
        )
        expected[table_type] = {}
    assert db.cache == expected
Example #4
0
def test_single_excel_import_skips_na():
    """Single-Excel import drops rows whose form is NA ('?')."""
    rows = [  # noqa
        ["phonetic", "Form", "English"],
        ["aa", "e.ta.'kɾã", "one"],
        ["bb", "mĩ.'ɾõ1", "two"],
        ["?", "?", "one"],
        ["cc", "?", "three"],
    ]
    # Build an in-memory workbook containing the rows above.
    workbook = op.Workbook()
    worksheet = workbook.active
    for row in rows:
        worksheet.append(row)
    metadata = copy_metadata(
        Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json")
    _ = add_single_languages(
        metadata=metadata,
        sheets=list(workbook),
        match_form=[],
        concept_name="English",
        ignore_superfluous=True,
        ignore_missing=True,
        status_update=None,
        logger=cli.logger,
    )
    dataset = pycldf.Dataset.from_metadata(metadata)
    imported_forms = list(dataset["FormTable"])
    # Only the two rows with a real form survive; both '?' rows are skipped.
    assert imported_forms == [
        OrderedDict([
            ("ID", "sheet_one"),
            ("Language_ID", "Sheet"),
            ("Concept_ID", "one"),
            ("Form", "e.ta.'kɾã"),
            ("Segments", []),
            ("Value", "aa\te.ta.'kɾã\tone"),
            ("Comment", None),
            ("Source", []),
        ]),
        OrderedDict([
            ("ID", "sheet_two"),
            ("Language_ID", "Sheet"),
            ("Concept_ID", "two"),
            ("Form", "mĩ.'ɾõ1"),
            ("Segments", []),
            ("Value", "bb\tmĩ.'ɾõ1\ttwo"),
            ("Comment", None),
            ("Source", []),
        ]),
    ]
Example #5
0
def test_language_comment_regex_error():
    """A language-comment regex that cannot match must raise ValueError naming the cell."""
    metadata = copy_metadata(
        original=Path(__file__).parent
        / "data/cldf/smallmawetiguarani/cldf-metadata.json"
    )
    dataset = pycldf.Dataset.from_metadata(metadata)
    dialect = argparse.Namespace(
        **dataset.tablegroup.common_props["special:fromexcel"]
    )
    workbook_path = Path(__file__).parent / "data/excel/small_defective_no_regexes.xlsx"
    sheet = openpyxl.load_workbook(workbook_path).active
    # Deliberately broken: requires a literal '[' at the start of the comment.
    dialect.lang_comment_regexes = [r"(\[.*)"]
    parser_class = f.excel_parser_from_dialect(dataset, dialect, cognate=False)
    parser = parser_class(dataset)
    with pytest.raises(
        ValueError,
        match="In cell G1: Expected to encounter match for .*, but found no_lan_comment.*",
    ):
        parser.parse_cells(sheet)
Example #6
0
def test_concept_file_not_found(caplog):
    """A missing concepts.csv is logged, and forms are imported without concepts."""
    from lexedata.cli import logger

    metadata = copy_metadata(
        Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json")
    add_single_languages(
        metadata=metadata,
        sheets=[],
        match_form=None,
        concept_name=None,
        ignore_missing=True,
        ignore_superfluous=True,
        status_update=None,
        logger=logger,
    )
    expected_log = (
        r"Did not find concepts\.csv\. Importing all forms independent of concept"
    )
    assert re.search(expected_log, caplog.text)
Example #7
0
def no_dialect(request):
    """Fixture: a dataset whose metadata has an empty "special:fromexcel" dialect.

    Copies the metadata file named by ``request.param`` into a temporary
    directory, empties the Excel dialect, replaces the first table with a
    minimal FormTable schema (a single string ID column), and returns the
    dataset loaded from the rewritten metadata.
    """
    target = copy_metadata(Path(__file__).parent / request.param)
    with open(target, "r", encoding="utf-8") as file:
        j = json.load(file)
    j["special:fromexcel"] = {}
    j["tables"][0] = {
        "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#FormTable",
        "dc:extent": 2,
        "tableSchema": {
            "columns": [{
                "datatype": "string",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#id",
                "name": "ID",
            }],
            "primaryKey": ["ID"],
        },
        "url": "forms.csv",
    }

    # Explicit encoding on the write: the read above used UTF-8, and relying
    # on the platform default here could corrupt non-ASCII metadata on
    # systems where the default is not UTF-8.
    with open(target, "w", encoding="utf-8") as file:
        json.dump(j, file, indent=4)
    dataset = pycldf.Dataset.from_metadata(target)
    return dataset
Example #8
0
def test_add_singlestons():
    """create_singletons adds a singleton cognateset (and judgement) for every
    form that is neither NA ('' / '-') nor already coded.

    Only L1C2 qualifies here: L2C1 already belongs to cognateset "1", while
    L1C1 and L2C2 are NA forms.

    NOTE(review): the test name contains a typo ("singlestons"); kept so
    existing test selections remain valid.
    """
    forms = [  # noqa
        {"ID": "L2C1", "Language_ID": "L2", "Concept_ID": "C1",
         "Form": "L2C1", "Value": "L2C1", "Segments": ["f"]},
        {"ID": "L1C1", "Language_ID": "L1", "Concept_ID": "C1",
         "Form": "", "Value": "?", "Segments": []},
        {"ID": "L1C2", "Language_ID": "L1", "Concept_ID": "C2",
         "Form": "L1C2", "Value": "L1C2", "Segments": ["f"]},
        {"ID": "L2C2", "Language_ID": "L2", "Concept_ID": "C2",
         "Form": "-", "Value": "-", "Segments": []},
    ]
    cognates = [{"ID": "1", "Form_ID": "L2C1", "Cognateset": "1"}]
    # NOTE(review): concept C2 carries Name "C1" in this fixture, yet the
    # expected singleton below is named "C2" — confirm whether that fixture
    # value is intentional.
    concepts = [{"ID": "C1", "Name": "C1"}, {"ID": "C2", "Name": "C1"}]
    cogsets = [{"ID": "1", "Name": "1"}]
    dataset = pycldf.Dataset.from_metadata(
        copy_metadata(
            Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json"))
    dataset.write(
        FormTable=forms,
        ParameterTable=concepts,
        CognateTable=cognates,
        CognatesetTable=cogsets,
    )
    all_cogsets, judgements = create_singletons(dataset=dataset)
    assert [dict(cogset) for cogset in all_cogsets] == [
        {"ID": "1", "Name": "1", "Comment": None},
        {"ID": "X_L1C2_1", "Name": "C2", "Comment": None},
    ]
    assert [dict(judgement) for judgement in judgements] == [
        {"ID": "1", "Form_ID": "L2C1", "Cognateset": "1",
         "Segment_Slice": [], "Alignment": [], "Comment": None},
        {"ID": "X_L1C2_1", "Form_ID": "L1C2", "Cognateset": "X_L1C2_1",
         "Segment_Slice": ["1:1"], "Alignment": ["f"], "Comment": None},
    ]
Example #9
0
def test_add_segments_skips_na_forms():
    """add_segments_to_dataset leaves NA forms ('-' and '') unsegmented."""
    forms = [
        {"ID": "L2C2", "Language_ID": "L2", "Concept_ID": "C2",
         "Form": "-", "Value": "-"},
        {"ID": "L1C1", "Language_ID": "L1", "Concept_ID": "C1",
         "Form": "", "Value": "?"},
        {"ID": "L2C1", "Language_ID": "L2", "Concept_ID": "C1",
         "Form": "L2C1", "Value": "L2C1"},
        {"ID": "L1C2", "Language_ID": "L1", "Concept_ID": "C2",
         "Form": "L1C2", "Value": "L1C2"},
    ]
    dataset = pycldf.Dataset.from_metadata(
        copy_metadata(
            Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json"))
    dataset.write(FormTable=forms)
    _ = add_segments_to_dataset(
        dataset=dataset,
        transcription="Form",
        overwrite_existing=False,
        replace_form=False,
    )

    def expected_row(form_id, language, concept, form, segments, value):
        """Build one expected FormTable row in the dataset's column order."""
        return OrderedDict([
            ("ID", form_id),
            ("Language_ID", language),
            ("Concept_ID", concept),
            ("Form", form),
            ("Segments", segments),
            ("Value", value),
            ("Comment", None),
            ("Source", []),
        ])

    assert list(dataset["FormTable"]) == [
        expected_row("L2C2", "L2", "C2", "-", [], "-"),
        expected_row("L1C1", "L1", "C1", None, [], "?"),
        expected_row("L2C1", "L2", "C1", "L2C1", ["L", "²", "C", "¹"], "L2C1"),
        expected_row("L1C2", "L1", "C2", "L1C2", ["L", "¹", "C", "²"], "L1C2"),
    ]
Example #10
0
def test_detect_cognates_ignores_na_forms():
    """WIP: automatic cognate detection should skip NA forms ('' and '-').

    Sets up a minimal dataset with two real forms (L2C1, L1C2) and two NA
    forms (L1C1, L2C2), segments it, and runs cognate detection. The final
    assertions are not written yet — see the TODOs below; the closing
    ``assert False`` keeps this test failing until it is completed.
    """
    forms = [  # noqa
        {
            "ID": "L2C1",
            "Language_ID": "L2",
            "Concept_ID": "C1",
            "Form": "L2C1",
            "Value": "L2C1",
        },
        {
            # NA form: empty Form with '?' source value.
            "ID": "L1C1",
            "Language_ID": "L1",
            "Concept_ID": "C1",
            "Form": "",
            "Value": "?",
        },
        {
            "ID": "L1C2",
            "Language_ID": "L1",
            "Concept_ID": "C2",
            "Form": "L1C2",
            "Value": "L1C2",
        },
        {
            # NA form: '-' placeholder.
            "ID": "L2C2",
            "Language_ID": "L2",
            "Concept_ID": "C2",
            "Form": "-",
            "Value": "-",
        },
    ]
    cognates = [  # noqa
        {
            "ID": "1",
            "Form_ID": "L2C1",
            "Cognateset": "1"
        },
        {
            "ID": "2",
            "Form_ID": "L1C2",
            "Cognateset": "2"
        },
    ]
    languages = [{"ID": "L1", "Name": "L1"}, {"ID": "L2", "Name": "L2"}]
    concepts = [{"ID": "C1", "Name": "C1"}, {"ID": "C2", "Name": "C2"}]
    # load dataset and write content and segment
    dataset = pycldf.Dataset.from_metadata(
        copy_metadata(
            Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json"))
    dataset.write(
        FormTable=forms,
        CognateTable=cognates,
        LanguageTable=languages,
        ParameterTable=concepts,
    )
    _ = add_segments_to_dataset(
        dataset=dataset,
        transcription="Form",
        overwrite_existing=False,
        replace_form=False,
    )
    # call detect cognates
    # NOTE(review): parameter values (ratio, thresholds, gop, ...) look like
    # tuning constants for the detection run — confirm against
    # cognate_code_to_file's documentation before changing.
    cognate_code_to_file(
        metadata=dataset.tablegroup._fname,
        ratio=1.5,
        soundclass="sca",
        cluster_method="infomap",
        gop=-2,
        mode="overlap",
        threshold=0.55,
        initial_threshold=0.7,
        output_file=dataset.tablegroup._fname.parent / "output",
    )
    # TODO: run ACD
    # TODO: Check that the cognatesets contain only L2C1 and L1C2
    cogsets = [c for c in dataset["CognatesetTable"]]
    print(cogsets)
    assert False