def write_edictor_file(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    file: t.TextIO,
    forms: t.Mapping[types.Form_ID, t.Mapping[str, t.Any]],
    judgements_about_form,
    cognateset_numbers,
):
    """Write the judgements of a dataset to file, in edictor format."""
    delimiters = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset["FormTable"].tableSchema.columns
        if c.separator
    }

    tsv_header = [
        util.cldf_property(c.propertyUrl) or c.name
        for c in dataset["FormTable"].tableSchema.columns
    ]

    tsv_header.insert(0, "LINGPY_ID")
    tsv_header.append("cognatesetReference")
    tsv_header.append("alignment")
    if "parameterReference" in delimiters:
        tsv_header.append("_parameterReference")

    # Write the output as TSV.
    out = csv.DictWriter(
        file,
        fieldnames=tsv_header,
        delimiter="\t",
    )
    out.writerow({column: rename(column) for column in tsv_header})

    for f, (id, form) in enumerate(forms.items(), 1):
        # Keep the original form ID in its own field and add a 1-based
        # integer ID, as expected by Edictor/LingPy.
        this_form = dict(form)
        this_form["LINGPY_ID"] = f

        # Normalize the form:
        # 1. No list-valued entries
        for col, d in delimiters.items():
            this_form[col] = d.join(form[col])

        # 2. No tabs or newlines in entries; they make Edictor mad. Use the
        # same escape markers that load_forms_from_tsv reverses on re-import.
        for c, v in form.items():
            if isinstance(v, str):
                this_form[c] = form[c].replace("\t", "\\!t").replace("\n", "\\!n")

        # If there is a cognate set, add its integer ID; otherwise use 0.
        judgement = judgements_about_form[this_form["id"]]
        this_form["cognatesetReference"] = " ".join(
            str(cognateset_numbers.get(e, 0)) for e in (judgement[1] or [None])
        )

        this_form["alignment"] = (
            " ".join(judgement[0])
            .replace("(", "( ")
            .replace(")", " )")
            .replace(" ) ( ", " ")
        )

        out.writerow(this_form)

    add_edictor_settings(file, dataset)

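# A minimal sketch (not part of the original module) of the escaping
# convention used above: tabs and newlines are replaced by the markers
# "\!t" and "\!n" before writing, and load_forms_from_tsv below reverses
# the substitution on re-import.
def _example_edictor_escaping_roundtrip() -> None:
    def escape(value: str) -> str:
        return value.replace("\t", "\\!t").replace("\n", "\\!n")

    def unescape(value: str) -> str:
        return value.replace("\\!t", "\t").replace("\\!n", "\n")

    original = "a comment\twith a tab\nand a newline"
    # The escaped value is safe for a TSV cell, and the round trip is lossless.
    assert "\t" not in escape(original) and "\n" not in escape(original)
    assert unescape(escape(original)) == original
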
def properties_as_key(data, columns):
    mapping = {
        column.name: util.cldf_property(column.propertyUrl)
        for column in columns
        if util.cldf_property(column.propertyUrl)
    }
    for s in data:
        for name, property in mapping.items():
            s[property] = s.pop(name, None)

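# Illustration only: properties_as_key renames dict keys in place, from CLDF
# column names to the property terms their propertyUrl resolves to. The
# hard-coded mapping below stands in for what the function derives from the
# table schema via util.cldf_property.
def _example_properties_as_key() -> None:
    mapping = {"Language_ID": "languageReference"}  # assumed resolution
    rows = [{"Language_ID": "ache", "Form": "kypa"}]
    for row in rows:
        for name, prop in mapping.items():
            row[prop] = row.pop(name, None)
    assert rows == [{"Form": "kypa", "languageReference": "ache"}]
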
def merge_group(
    forms: t.Sequence[types.Form],
    target: types.Form,
    mergers: t.Mapping[str, Merger],
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    logger: cli.logging.Logger = cli.logger,
) -> types.Form:
    """Merge one group of homophones.

    >>> merge_group(
    ...     [{"Parameter_ID": [1, 1]}, {"Parameter_ID": [2]}],
    ...     {"Parameter_ID": [1, 1]}, {"Parameter_ID": union}, util.fs.new_wordlist())
    {'Parameter_ID': [1, 2]}

    The target is assumed to be already included in the forms.

    >>> merge_group(
    ...     [{"Parameter_ID": [1, 1]}, {"Parameter_ID": [2]}],
    ...     {"Parameter_ID": [1, 1]}, {"Parameter_ID": concatenate}, util.fs.new_wordlist())
    {'Parameter_ID': [1, 1, 2]}

    """
    c_f_id = dataset["FormTable", "id"].name
    for column in target:
        if column == c_f_id:
            continue
        try:
            reference_name = (
                util.cldf_property(dataset["FormTable", column].propertyUrl) or column
            )
            merger = mergers.get(column, mergers.get(reference_name, must_be_equal))
            try:
                merge_result = merger([form[column] for form in forms], target)
            except AssertionError:
                # We cannot deal with this group, but others may be fine.
                merger_name = merger.__name__
                logger.error(
                    f"Merging forms: {[f[c_f_id] for f in forms]} with target: {target[c_f_id]} on column: {column}\n"
                    f"The merge function {merger_name} requires the input data to be equal.\n"
                    f"Given input: {[form[column] for form in forms]}"
                )
                raise Skip
            except NotImplementedError:
                merger_name = merger.__name__
                # Other groups will have the same issue.
                cli.Exit.INVALID_INPUT(
                    f"Merging forms: {[f[c_f_id] for f in forms]} with target: {target[c_f_id]}\n"
                    f"The merge function {merger_name} is not implemented for type {type(forms[0])}.\n"
                    f"Given input: {[form[column] for form in forms]}"
                )
            target[column] = merge_result
        except KeyError:
            cli.Exit.INVALID_COLUMN_NAME(f"Column {column} is not in FormTable.")
    return target

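# The mergers referenced in the doctests above (union, concatenate) are
# defined elsewhere in lexedata; the sketches below only illustrate the
# contract assumed here: a Merger receives the column values of all forms in
# the group plus the target form and returns the merged value.
def _example_merger_contract() -> None:
    def union_sketch(values, target):
        # Keep each list element once, preserving first-seen order.
        merged = []
        for value in values:
            for item in value:
                if item not in merged:
                    merged.append(item)
        return merged

    def concatenate_sketch(values, target):
        # Keep all list elements, including duplicates.
        return [item for value in values for item in value]

    assert union_sketch([[1, 1], [2]], None) == [1, 2]
    assert concatenate_sketch([[1, 1], [2]], None) == [1, 1, 2]
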
def merge_group(
    cogsets: t.Sequence[types.CogSet],
    target: types.CogSet,
    mergers: t.Mapping[str, Merger],
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    logger: cli.logging.Logger = cli.logger,
) -> types.CogSet:
    """Merge one group of cognate sets.

    The target is assumed to be already included in the cognate sets.

    """
    c_s_id = dataset["CognatesetTable", "id"].name
    for column in target:
        if column == c_s_id:
            continue
        try:
            reference_name = (
                util.cldf_property(dataset["CognatesetTable", column].propertyUrl)
                or column
            )
            merger = mergers.get(column, mergers.get(reference_name, must_be_equal))
            try:
                merge_result = merger([cogset[column] for cogset in cogsets], target)
            except AssertionError:
                merger_name = merger.__name__
                # We cannot deal with this group, but others may be fine.
                logger.error(
                    f"Merging cognate sets: {[f[c_s_id] for f in cogsets]} with target: {target[c_s_id]} on column: {column}\n"
                    f"The merge function {merger_name} requires the input data to be equal.\n"
                    f"Given input: {[cogset[column] for cogset in cogsets]}"
                )
                raise Skip
            except NotImplementedError:
                merger_name = merger.__name__
                # Other groups will have the same issue.
                cli.Exit.INVALID_INPUT(
                    f"Merging cognate sets: {[f[c_s_id] for f in cogsets]} with target: {target[c_s_id]}\n"
                    f"The merge function {merger_name} is not implemented for type {type(cogsets[0])}.\n"
                    f"Given input: {[cogset[column] for cogset in cogsets]}"
                )
            target[column] = merge_result
        except KeyError:
            cli.Exit.INVALID_COLUMN_NAME(f"Column {column} is not in CognatesetTable.")
    return target

def check_foreign_keys(dataset: pycldf.Dataset, logger: cli.logging.Logger = cli.logger):
    # Get all foreign keys for each table
    valid = True
    for table in dataset.tables:
        for key in table.tableSchema.foreignKeys:
            reference = key.reference
            try:
                (target_column,) = reference.columnReference
            except ValueError:
                # Multi-column foreign key. We *could* check that there's no
                # reference column hidden in there, but we don't.
                continue
            (column,) = key.columnReference

            # Check that the property URL of the foreign key column points to
            # the correct table.
            column_type = util.cldf_property(
                dataset[table].get_column(column).propertyUrl
            )
            if column_type and pycldf.TERMS[column_type].references:
                target_table = pycldf.TERMS[column_type].references
            else:
                # Not a CLDF reference property. Nothing to check.
                continue

            if dataset[target_table] != dataset[reference.resource]:
                log_or_raise(
                    message=f"Foreign key {key} is declared as {column_type}, "
                    f"which should point to {target_table} but instead points to {reference}",
                    log=logger,
                )
                valid = False
                continue

            # Check that the foreign key is the ID column of the target table.
            if reference.columnReference != [
                dataset[key.reference.resource, "id"].name
            ]:
                log_or_raise(
                    message=f"Foreign key {key} in table {table.url.string} "
                    f"does not point to the ID column of another table",
                    log=logger,
                )
                valid = False

    return valid

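# Usage sketch (the metadata path is hypothetical): check_foreign_keys only
# inspects columns whose propertyUrl is a CLDF reference property, e.g. it
# verifies that a languageReference column's foreign key targets the ID
# column of the LanguageTable.
def _example_check_foreign_keys() -> None:
    dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")
    if not check_foreign_keys(dataset):
        print("At least one reference column points at the wrong table or column.")
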
def __init__(
    self,
    dataset: pycldf.Dataset,
    database_url: t.Optional[str] = None,
    logger: cli.logging.Logger = cli.logger,
):
    self.set_header(dataset)
    self.separators = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset[self.row_table].tableSchema.columns
        if c.separator
    }
    self.URL_BASE = database_url
    self.wb = op.Workbook()
    self.ws: op.worksheet.worksheet.Worksheet = self.wb.active
    self.logger = logger

def set_header(
    self,
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
):
    c_id = dataset["CognatesetTable", "id"].name
    try:
        c_comment = dataset["CognatesetTable", "comment"].name
    except KeyError:
        c_comment = None
    self.header = []
    for column in dataset["CognatesetTable"].tableSchema.columns:
        if column.name == c_id:
            self.header.insert(0, ("id", "CogSet"))
        elif column.name == c_comment:
            continue
        else:
            property = util.cldf_property(column.propertyUrl) or column.name
            self.header.append((property, column.name))

except KeyError:
    cli.Exit.INVALID_DATASET(
        "Dataset has no explicit CognatesetTable. Add one using "
        "`lexedata.edit.add_table CognatesetTable`."
    )

E = ExcelWriter(
    dataset,
    database_url=args.url_template,
    logger=logger,
)

cogsets, judgements = cogsets_and_judgements(
    dataset, args.add_singletons_with_status, args.by_segment, logger
)

try:
    cogset_order = (
        util.cldf_property(
            dataset["CognatesetTable", args.sort_cognatesets_by].propertyUrl
        )
        or dataset["CognatesetTable", args.sort_cognatesets_by].name
    )
except KeyError:
    cli.Exit.INVALID_COLUMN_NAME(
        f"No column '{args.sort_cognatesets_by}' in your CognatesetTable."
    )

sort_cognatesets(cogsets, judgements, cogset_order, size=args.size_sort)

# TODO: wrap the following two blocks into a
# get_sorted_languages() -> t.OrderedDict[languageReference, column header/title/name]
# function
languages = list(util.cache_table(dataset, "LanguageTable").values())
if args.sort_languages_by:
    c_sort = (
        util.cldf_property(
            dataset["LanguageTable", args.sort_languages_by].propertyUrl
        )
        or dataset["LanguageTable", args.sort_languages_by].name
    )

    invalid_ids.append(item)
    if "Name" in new_table.tableSchema.columndict:
        return {"ID": item, "Name": item}
    else:
        return {"ID": item}

reference_properties = {
    property_name
    for property_name, term in pycldf.terms.Terms().properties.items()
    if term.references == args.table
}

referenced_items: t.Set[str] = set()
for table in ds.tables:
    for column in table.tableSchema.columns:
        if util.cldf_property(column.propertyUrl) in reference_properties:
            referenced_items |= {
                column.datatype.formatted(row[column.name]) for row in table
            }

logger.info(
    "Found %d different entries for your new %s.", len(referenced_items), args.table
)

ds.write(**{args.table: [new_row(item) for item in sorted(referenced_items)]})

if invalid_ids:
    logger.warning(
        "Some of your reference values are not valid as IDs: %s. "
        "You can transform them into valid IDs by running lexedata.edit.simplify_ids",
        invalid_ids,
    )

def forms_to_tsv(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    languages: t.Iterable[str],
    concepts: t.Set[str],
    cognatesets: t.Iterable[str],
    logger: cli.logging.Logger = cli.logger,
):
    try:
        dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            """Edictor export requires your dataset to have segments in the FormTable.
        Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
        )

    delimiters = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset["FormTable"].tableSchema.columns
        if c.separator
    }

    # Prepare the header for the TSV output: the first column must be named
    # ID and contain 1-based integer IDs.
    tsv_header = list(dataset["FormTable"].tableSchema.columndict.keys())

    tsv_header.insert(0, "LINGPY_ID")
    tsv_header.append("cognatesetReference")
    if "alignment" not in tsv_header:
        tsv_header.append("alignment")
    if "parameterReference" in delimiters:
        tsv_header.append("_parameterReference")

    # Select forms and cognates, given the restriction to languages and
    # concepts (and cognate sets, respectively).
    forms = {}
    for f, form in util.cache_table(dataset).items():
        if form["form"] is None or form["form"] == "-":
            continue
        if form["languageReference"] in languages and concepts.intersection(
            ensure_list(form["parameterReference"])
        ):
            # Normalize the form:
            # 1. No list-valued entries
            for c, d in delimiters.items():
                if c == "segments":
                    continue
                if c == "parameterReference":
                    form["_parameterReference"] = d.join(str(e) for e in form[c])
                    form["parameterReference"] = form["parameterReference"][0]
                    continue
                form[c] = d.join(str(e) for e in form[c])

            if not form.get("segments"):
                logger.warning(
                    "No segments found for form %s. You can generate segments using `lexedata.edit.add_segments`.",
                    form["id"],
                )

            # 2. No tabs or newlines in entries
            for c, v in form.items():
                if isinstance(v, str):
                    if "\\!t" in form[c] or "\\!n" in form[c]:
                        logger.warning(
                            "Your data contains the special characters '\\!t' or '\\!n', which I will introduce for escaping tabs and newlines for Edictor. These characters will not survive the back-import."
                        )
                    form[c] = form[c].replace("\t", "\\!t").replace("\n", "\\!n")

            forms[f] = form

    cognateset_cache: t.Mapping[t.Optional[str], int]
    if "CognatesetTable" in dataset:
        id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[id]: c
            for c, cognateset in enumerate(dataset["CognatesetTable"], 1)
            if cognateset[id] in cognatesets
        }
    else:
        if cognatesets is None:
            # Number cognate sets from 1; 0 is reserved for "no cognate set".
            cognateset_cache = collections.defaultdict(itertools.count(1).__next__)
        else:
            cognateset_cache = {c: i for i, c in enumerate(cognatesets, 1)}

    # Warn about unexpected non-concatenative ‘morphemes’
    lexedata.report.nonconcatenative_morphemes.segment_to_cognateset(
        dataset, cognatesets, logger
    )

    judgements_about_form: t.Mapping[
        types.Form_ID, t.Tuple[t.List[str], t.List[int]]
    ] = {
        id: ([f"({s})" for s in form["segments"]], []) for id, form in forms.items()
    }

    # Compose all judgements, last one wins.
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in forms and cognateset_cache.get(
            j["cognatesetReference"]
        ):
            if j.get("alignment"):
                j["alignment"] = [s or "" for s in j["alignment"]]
            else:
                j["alignment"] = forms[j["formReference"]]["segments"]

            try:
                segments_judged = list(
                    parse_segment_slices(
                        segment_slices=j["segmentSlice"], enforce_ordered=False
                    )
                )
            except TypeError:
                logger.warning(
                    "In judgement %s: No segment slice given. Assuming whole form.",
                    j["id"],
                )
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"]))
                )
            except KeyError:
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"]))
                )
            except ValueError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue

            global_alignment, cogsets = judgements_about_form[j["formReference"]]
            segment_start, segment_end = min(segments_judged), max(segments_judged) + 1
            try:
                glue_in_alignment(
                    global_alignment,
                    cogsets,
                    j["alignment"],
                    j["cognatesetReference"],
                    slice(segment_start, segment_end),
                )
            except IndexError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue

    return forms, judgements_about_form, cognateset_cache

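# Sketch of the segment-slice convention assumed above: CLDF segmentSlice
# strings such as "2:4" are 1-based and inclusive, while the alignment code
# works with 0-based Python indices. parse_segment_slices performs this
# conversion (among other checks); the helper below mimics it for a single
# contiguous slice only.
def _example_segment_slice_indices() -> None:
    def to_indices(segment_slice: str) -> t.List[int]:
        start, _, end = segment_slice.partition(":")
        return list(range(int(start) - 1, int(end)))

    # "2:4" covers the second through fourth segment, i.e. indices 1..3.
    assert to_indices("2:4") == [1, 2, 3]
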
def load_forms_from_tsv(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    input_file: Path,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[
    t.Mapping[int, t.Sequence[t.Tuple[types.Form_ID, range, t.Sequence[str]]]],
    t.Set[types.Form_ID],
]:
    """
    Side effects
    ============
    This function overwrites dataset's FormTable
    """
    input = csv.DictReader(
        input_file.open(encoding="utf-8"),
        delimiter="\t",
    )

    # These days, all dicts are ordered by default. Still, better make this explicit.
    forms = util.cache_table(dataset)

    edictor_cognatesets: t.Dict[
        int, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = collections.defaultdict(list)

    form_table_upper = {
        (util.cldf_property(column.propertyUrl) or column.name).upper(): (
            util.cldf_property(column.propertyUrl) or column.name
        )
        for column in dataset["FormTable"].tableSchema.columns
    }
    form_table_upper.update(
        {
            "DOCULECT": "languageReference",
            "CONCEPT": "parameterReference",
            "IPA": "form",
            "COGID": "cognatesetReference",
            "ALIGNMENT": "alignment",
            "TOKENS": "segments",
            "CLDF_ID": "id",
            "ID": "",
        }
    )
    if "_PARAMETERREFERENCE" in [f.upper() for f in input.fieldnames]:
        form_table_upper["_PARAMETERREFERENCE"] = "parameterReference"
        form_table_upper["CONCEPT"] = ""

    separators: t.MutableMapping[str, t.Optional[str]] = {}
    # Rename the Edictor/LingPy columns in input.fieldnames in place to their
    # CLDF properties, and record the separator for each list-valued column.
    for i in range(len(input.fieldnames)):
        if i == 0 and input.fieldnames[0] != "ID":
            raise ValueError(
                "When importing from Edictor, expected the first column to be "
                f"named 'ID', but found '{input.fieldnames[0]}'."
            )

        lingpy = input.fieldnames[i]
        try:
            input.fieldnames[i] = form_table_upper[lingpy.upper()]
        except KeyError:
            logger.warning(
                "Your edictor file contained a column %s, which I could not interpret.",
                lingpy,
            )

        if input.fieldnames[i] == "cognatesetReference":
            separators[input.fieldnames[i]] = " "
        elif input.fieldnames[i] == "alignment":
            separators[input.fieldnames[i]] = " "

        try:
            separators[input.fieldnames[i]] = dataset[
                "FormTable", input.fieldnames[i]
            ].separator
        except KeyError:
            pass

    logger.info(
        "The header of your edictor file will be interpreted as %s.", input.fieldnames
    )

    affected_forms: t.Set[types.Form_ID] = set()
    for line in cli.tq(
        input, task="Importing form rows from edictor…", total=len(forms)
    ):
        # Column "" is the re-named Lingpy-ID column, so the first one.
        if not any(line.values()) or line[""].startswith("#"):
            # One of Edictor's comment rows, storing settings
            continue

        for key, value in line.items():
            value = value.replace("\\!t", "\t").replace("\\!n", "\n")
            sep = separators.get(key)
            if sep is not None:
                if not value:
                    line[key] = []
                else:
                    line[key] = value.split(sep)
            else:
                line[key] = value

        affected_forms.add(line["id"])

        try:
            for segments, cognateset, alignment in extract_partial_judgements(
                line["segments"],
                line["cognatesetReference"],
                line["alignment"],
                logger,
            ):
                edictor_cognatesets[cognateset].append(
                    (line["id"], segments, alignment)
                )
            forms[line["id"]] = line
        except IndexError:
            logger.warning(
                f"In form with Lingpy-ID {line['']}: Cognateset judgements "
                f"{line['cognatesetReference']} and alignment {line['alignment']} "
                f"did not match. At least one morpheme skipped."
            )
    # Cognate class 0 marks morphemes without a judgement, so drop it.
    edictor_cognatesets.pop(0, None)

    columns = {
        (util.cldf_property(column.propertyUrl) or column.name): column.name
        for column in dataset["FormTable"].tableSchema.columns
    }
    # Deliberately make use of the property of `write` to discard any entries
    # that don't correspond to existing columns. Otherwise, we'd still have to
    # get rid of the alignment, cognatesetReference and Lingpy-ID columns.
    dataset["FormTable"].write(
        (
            {
                columns[property]: value
                for property, value in form.items()
                if columns.get(property)
            }
            for form in forms.values()
        )
    )
    return edictor_cognatesets, affected_forms

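# Illustration of the header renaming performed above, using a few of the
# hard-coded Edictor/LingPy defaults: matching is case-insensitive, and the
# Edictor "ID" column is mapped to the empty string so that it is discarded
# when the forms are written back.
def _example_edictor_header_renaming() -> None:
    form_table_upper = {
        "DOCULECT": "languageReference",
        "CONCEPT": "parameterReference",
        "IPA": "form",
        "COGID": "cognatesetReference",
        "ID": "",
    }
    header = ["ID", "Doculect", "CONCEPT", "ipa", "COGID"]
    renamed = [form_table_upper.get(column.upper(), column) for column in header]
    assert renamed == [
        "",
        "languageReference",
        "parameterReference",
        "form",
        "cognatesetReference",
    ]
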
def edictor_to_cldf(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    new_cogsets: t.Mapping[
        types.Cognateset_ID, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ],
    affected_forms: t.Set[types.Form_ID],
    source: t.List[str] = [],
):
    ref_cogsets: t.MutableMapping[
        types.Cognateset_ID, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = collections.defaultdict(list)
    cognate: t.List[types.Judgement] = []
    judgements_lookup: t.MutableMapping[
        types.Form_ID, t.MutableMapping[types.Cognateset_ID, types.Judgement]
    ] = collections.defaultdict(dict)
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in affected_forms:
            ref_cogsets[j["cognatesetReference"]].append(
                (j["formReference"], j["segmentSlice"], j["alignment"])
            )
            judgements_lookup[j["formReference"]][j["cognatesetReference"]] = j
        else:
            cognate.append(j)
    matches = match_cognatesets(new_cogsets, ref_cogsets)

    for cognateset, judgements in new_cogsets.items():
        cognateset = matches[cognateset]
        if cognateset is None:
            cognateset = "_".join(f for f, _, _ in judgements)
        for form, segment_slice, alignment in judgements:
            was: types.Judgement = judgements_lookup.get(form, {}).get(cognateset)
            # If this judgement existed before, only update its segment slice
            # and alignment.
            if was:
                was["segmentSlice"] = util.indices_to_segment_slice(segment_slice)
                was["alignment"] = alignment
                cognate.append(was)
                continue
            cognate.append(
                types.Judgement(
                    {
                        "id": f"{form}-{cognateset}",
                        "formReference": form,
                        "cognatesetReference": cognateset,
                        "alignment": alignment,
                        "segmentSlice": util.indices_to_segment_slice(segment_slice),
                        "source": source,
                        # TODO: Any more parameters? Status update?
                    }
                )
            )
    cognate.sort(key=lambda j: j["id"])
    m = {
        util.cldf_property(c.propertyUrl) or c.name: c.name
        for c in dataset["CognateTable"].tableSchema.columns
    }
    dataset["CognateTable"].write(
        [{m[k]: v for k, v in j.items() if k in m} for j in cognate]
    )

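# Sketch of the cognate-set ID policy above: clusters that match_cognatesets
# pairs with an existing cognate set keep that set's ID, while unmatched
# clusters receive a new ID joined from their member form IDs.
def _example_new_cognateset_id() -> None:
    judgements = [
        ("form1", range(0, 2), ["a", "b"]),
        ("form2", range(0, 1), ["a"]),
    ]
    assert "_".join(form for form, _, _ in judgements) == "form1_form2"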