def test_properties_regex_error():
    excel = Path(__file__).parent / "data/excel/small_defective_no_regexes.xlsx"
    original = Path(__file__).parent / "data/cldf/smallmawetiguarani/cldf-metadata.json"
    copy = copy_metadata(original=original)
    dataset = pycldf.Dataset.from_metadata(copy)
    dialect = argparse.Namespace(**dataset.tablegroup.common_props["special:fromexcel"])
    lexicon_wb = openpyxl.load_workbook(excel).active
    dialect.row_cell_regexes = [
        "(?P<set>.*)",  # wrong regex
        r"(?P<Name>\[.*)",
        "(?P<English>.*)",
        "(?P<Spanish>.*)",
        "(?P<Portuguese>.*)",
        "(?P<French>.*)",
    ]
    EP = f.excel_parser_from_dialect(dataset, dialect, cognate=False)
    EP = EP(dataset)
    with pytest.raises(
        ValueError,
        match=r"In cell B3: Expected to encounter match for .*, but found no_concept_name",
    ):
        EP.parse_cells(lexicon_wb)


def test_no_first_row_in_excel(empty_excel):
    original = Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json"
    copy = copy_metadata(original=original)
    with pytest.raises(
        AssertionError,
        match="Your first data row didn't have a name. Please check your format "
        "specification or ensure the first row has a name.",
    ):
        f.load_dataset(metadata=copy, lexicon=empty_excel)


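# The empty_excel fixture used above is defined elsewhere in the test suite.
# A minimal sketch of what such a fixture could look like (an assumption, not
# the repository's actual definition): a workbook saved with no data rows.
#
# @pytest.fixture
# def empty_excel(tmp_path):
#     path = tmp_path / "empty.xlsx"
#     openpyxl.Workbook().save(path)
#     return path

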
def test_db_cache():
    copy = copy_metadata(Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json")
    res = dict()
    dataset = pycldf.Dataset.from_metadata(copy)
    db = f.DB(output_dataset=dataset)
    db.cache_dataset()
    for table in dataset.tables:
        # Key the cache by the table's CLDF type (the fragment of its
        # dc:conformsTo URL), falling back to the table URL if there is none.
        table_type = (
            table.common_props.get("dc:conformsTo", "").rsplit("#", 1)[-1] or table.url
        )
        # The freshly copied dataset has no row data, so every cache entry is empty.
        res[table_type] = {}
    assert db.cache == res


def test_single_excel_import_skips_na():
    data = [  # noqa
        ["phonetic", "Form", "English"],
        ["aa", "e.ta.'kɾã", "one"],
        ["bb", "mĩ.'ɾõ1", "two"],
        ["?", "?", "one"],
        ["cc", "?", "three"],
    ]
    # Create an Excel workbook containing the data.
    wb = op.Workbook()
    ws = wb.active
    for row in data:
        ws.append(row)
    sheets = [sheet for sheet in wb]
    metadata = copy_metadata(Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json")
    _ = add_single_languages(
        metadata=metadata,
        sheets=sheets,
        match_form=[],
        concept_name="English",
        ignore_superfluous=True,
        ignore_missing=True,
        status_update=None,
        logger=cli.logger,
    )
    dataset = pycldf.Dataset.from_metadata(metadata)
    forms = list(dataset["FormTable"])
    # Rows whose Form is the NA marker "?" are skipped, so only two forms are imported.
    assert forms == [
        OrderedDict([
            ("ID", "sheet_one"),
            ("Language_ID", "Sheet"),
            ("Concept_ID", "one"),
            ("Form", "e.ta.'kɾã"),
            ("Segments", []),
            ("Value", "aa\te.ta.'kɾã\tone"),
            ("Comment", None),
            ("Source", []),
        ]),
        OrderedDict([
            ("ID", "sheet_two"),
            ("Language_ID", "Sheet"),
            ("Concept_ID", "two"),
            ("Form", "mĩ.'ɾõ1"),
            ("Segments", []),
            ("Value", "bb\tmĩ.'ɾõ1\ttwo"),
            ("Comment", None),
            ("Source", []),
        ]),
    ]


def test_language_comment_regex_error():
    excel = Path(__file__).parent / "data/excel/small_defective_no_regexes.xlsx"
    original = Path(__file__).parent / "data/cldf/smallmawetiguarani/cldf-metadata.json"
    copy = copy_metadata(original=original)
    dataset = pycldf.Dataset.from_metadata(copy)
    dialect = argparse.Namespace(**dataset.tablegroup.common_props["special:fromexcel"])
    lexicon_wb = openpyxl.load_workbook(excel).active
    dialect.lang_comment_regexes = [r"(\[.*)"]
    EP = f.excel_parser_from_dialect(dataset, dialect, cognate=False)
    EP = EP(dataset)
    with pytest.raises(
        ValueError,
        match="In cell G1: Expected to encounter match for .*, but found no_lan_comment.*",
    ):
        EP.parse_cells(lexicon_wb)


def test_concept_file_not_found(caplog):
    from lexedata.cli import logger

    copy = copy_metadata(Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json")
    add_single_languages(
        metadata=copy,
        sheets=[],
        match_form=None,
        concept_name=None,
        ignore_missing=True,
        ignore_superfluous=True,
        status_update=None,
        logger=logger,
    )
    assert re.search(
        r"Did not find concepts\.csv\. Importing all forms independent of concept",
        caplog.text,
    )


@pytest.fixture
def no_dialect(request):
    # Copy the dataset metadata file to a temporary directory, drop the Excel
    # import dialect, and reduce the FormTable schema to a bare ID column.
    target = copy_metadata(Path(__file__).parent / request.param)
    with open(target, "r", encoding="utf-8") as file:
        j = json.load(file)
    j["special:fromexcel"] = {}
    j["tables"][0] = {
        "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#FormTable",
        "dc:extent": 2,
        "tableSchema": {
            "columns": [
                {
                    "datatype": "string",
                    "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#id",
                    "name": "ID",
                }
            ],
            "primaryKey": ["ID"],
        },
        "url": "forms.csv",
    }
    with open(target, "w", encoding="utf-8") as file:
        json.dump(j, file, indent=4)
    dataset = pycldf.Dataset.from_metadata(target)
    return dataset


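# The fixture above reads request.param, so tests that use it are expected to
# supply a metadata path via indirect parametrization. A minimal usage sketch,
# assuming a test name and path of this shape (both illustrative, not taken
# from the repository):
#
# @pytest.mark.parametrize(
#     "no_dialect",
#     ["data/cldf/minimal/cldf-metadata.json"],
#     indirect=True,
# )
# def test_dataset_without_dialect(no_dialect):
#     assert no_dialect.tablegroup.common_props["special:fromexcel"] == {}

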
def test_add_singletons():
    forms = [  # noqa
        {
            "ID": "L2C1",
            "Language_ID": "L2",
            "Concept_ID": "C1",
            "Form": "L2C1",
            "Value": "L2C1",
            "Segments": ["f"],
        },
        {
            "ID": "L1C1",
            "Language_ID": "L1",
            "Concept_ID": "C1",
            "Form": "",
            "Value": "?",
            "Segments": [],
        },
        {
            "ID": "L1C2",
            "Language_ID": "L1",
            "Concept_ID": "C2",
            "Form": "L1C2",
            "Value": "L1C2",
            "Segments": ["f"],
        },
        {
            "ID": "L2C2",
            "Language_ID": "L2",
            "Concept_ID": "C2",
            "Form": "-",
            "Value": "-",
            "Segments": [],
        },
    ]
    cognates = [{"ID": "1", "Form_ID": "L2C1", "Cognateset": "1"}]
    concepts = [{"ID": "C1", "Name": "C1"}, {"ID": "C2", "Name": "C2"}]
    cogsets = [{"ID": "1", "Name": "1"}]
    dataset = pycldf.Dataset.from_metadata(
        copy_metadata(Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json")
    )
    dataset.write(
        FormTable=forms,
        ParameterTable=concepts,
        CognateTable=cognates,
        CognatesetTable=cogsets,
    )
    all_cogsets, judgements = create_singletons(dataset=dataset)
    # The uncoded non-NA form L1C2 gets a singleton cognate set; the NA forms
    # ("" and "-") do not.
    assert [dict(x) for x in all_cogsets] == [
        {"ID": "1", "Name": "1", "Comment": None},
        {"ID": "X_L1C2_1", "Name": "C2", "Comment": None},
    ]
    assert [dict(j) for j in judgements] == [
        {
            "ID": "1",
            "Form_ID": "L2C1",
            "Cognateset": "1",
            "Segment_Slice": [],
            "Alignment": [],
            "Comment": None,
        },
        {
            "ID": "X_L1C2_1",
            "Form_ID": "L1C2",
            "Cognateset": "X_L1C2_1",
            "Segment_Slice": ["1:1"],
            "Alignment": ["f"],
            "Comment": None,
        },
    ]


def test_add_segments_skips_na_forms():
    forms = [
        {
            "ID": "L2C2",
            "Language_ID": "L2",
            "Concept_ID": "C2",
            "Form": "-",
            "Value": "-",
        },
        {
            "ID": "L1C1",
            "Language_ID": "L1",
            "Concept_ID": "C1",
            "Form": "",
            "Value": "?",
        },
        {
            "ID": "L2C1",
            "Language_ID": "L2",
            "Concept_ID": "C1",
            "Form": "L2C1",
            "Value": "L2C1",
        },
        {
            "ID": "L1C2",
            "Language_ID": "L1",
            "Concept_ID": "C2",
            "Form": "L1C2",
            "Value": "L1C2",
        },
    ]
    dataset = pycldf.Dataset.from_metadata(
        copy_metadata(Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json")
    )
    dataset.write(FormTable=forms)
    _ = add_segments_to_dataset(
        dataset=dataset,
        transcription="Form",
        overwrite_existing=False,
        replace_form=False,
    )
    segmented_forms = list(dataset["FormTable"])
    # The NA forms ("-" and "") are skipped, so their Segments stay empty.
    assert segmented_forms == [
        OrderedDict([
            ("ID", "L2C2"),
            ("Language_ID", "L2"),
            ("Concept_ID", "C2"),
            ("Form", "-"),
            ("Segments", []),
            ("Value", "-"),
            ("Comment", None),
            ("Source", []),
        ]),
        OrderedDict([
            ("ID", "L1C1"),
            ("Language_ID", "L1"),
            ("Concept_ID", "C1"),
            ("Form", None),
            ("Segments", []),
            ("Value", "?"),
            ("Comment", None),
            ("Source", []),
        ]),
        OrderedDict([
            ("ID", "L2C1"),
            ("Language_ID", "L2"),
            ("Concept_ID", "C1"),
            ("Form", "L2C1"),
            ("Segments", ["L", "²", "C", "¹"]),
            ("Value", "L2C1"),
            ("Comment", None),
            ("Source", []),
        ]),
        OrderedDict([
            ("ID", "L1C2"),
            ("Language_ID", "L1"),
            ("Concept_ID", "C2"),
            ("Form", "L1C2"),
            ("Segments", ["L", "¹", "C", "²"]),
            ("Value", "L1C2"),
            ("Comment", None),
            ("Source", []),
        ]),
    ]


def test_detect_cognates_ignores_na_forms():
    forms = [  # noqa
        {
            "ID": "L2C1",
            "Language_ID": "L2",
            "Concept_ID": "C1",
            "Form": "L2C1",
            "Value": "L2C1",
        },
        {
            "ID": "L1C1",
            "Language_ID": "L1",
            "Concept_ID": "C1",
            "Form": "",
            "Value": "?",
        },
        {
            "ID": "L1C2",
            "Language_ID": "L1",
            "Concept_ID": "C2",
            "Form": "L1C2",
            "Value": "L1C2",
        },
        {
            "ID": "L2C2",
            "Language_ID": "L2",
            "Concept_ID": "C2",
            "Form": "-",
            "Value": "-",
        },
    ]
    cognates = [  # noqa
        {"ID": "1", "Form_ID": "L2C1", "Cognateset": "1"},
        {"ID": "2", "Form_ID": "L1C2", "Cognateset": "2"},
    ]
    languages = [{"ID": "L1", "Name": "L1"}, {"ID": "L2", "Name": "L2"}]
    concepts = [{"ID": "C1", "Name": "C1"}, {"ID": "C2", "Name": "C2"}]
    # Load the dataset, write the content, and segment the forms.
    dataset = pycldf.Dataset.from_metadata(
        copy_metadata(Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json")
    )
    dataset.write(
        FormTable=forms,
        CognateTable=cognates,
        LanguageTable=languages,
        ParameterTable=concepts,
    )
    _ = add_segments_to_dataset(
        dataset=dataset,
        transcription="Form",
        overwrite_existing=False,
        replace_form=False,
    )
    # Run automatic cognate detection.
    cognate_code_to_file(
        metadata=dataset.tablegroup._fname,
        ratio=1.5,
        soundclass="sca",
        cluster_method="infomap",
        gop=-2,
        mode="overlap",
        threshold=0.55,
        initial_threshold=0.7,
        output_file=dataset.tablegroup._fname.parent / "output",
    )
    # TODO: run ACD
    # TODO: Check that the cognatesets contain only L2C1 and L1C2
    cogsets = list(dataset["CognatesetTable"])
    print(cogsets)
    assert False  # unfinished test: replace with the checks from the TODOs above