# Imports assumed by the functions in this section. `concepticon` is expected
# to be set up elsewhere in this module as a wrapper exposing a
# pyconcepticon.Concepticon instance under `concepticon.api`.
import collections
import typing as t

import pycldf
from pyconcepticon.glosses import concept_map2

from lexedata import cli, util
from lexedata.util import cache_table


def add_concepticon_names(
    dataset: pycldf.Wordlist,
    column_name: str = "Concepticon_Gloss",
):
    """Add Concepticon glosses to the dataset's ParameterTable.

    For every concept that has a Concepticon reference, look up the gloss of
    that concept set and write it to `column_name`, creating the column if it
    does not exist yet.
    """
    # Create the gloss column; if it already exists, leave the schema as is.
    try:
        dataset.add_columns("ParameterTable", column_name)
        dataset.write_metadata()
    except ValueError:
        pass

    write_back = []
    for row in cli.tq(
        dataset["ParameterTable"],
        task="Write concepts with Concepticon names to dataset",
    ):
        try:
            row[column_name] = concepticon.api.conceptsets[
                row[dataset.column_names.parameters.concepticonReference]
            ].gloss
        except KeyError:
            # Missing or unknown Concepticon reference: leave the row as is.
            pass
        write_back.append(row)
    dataset.write(ParameterTable=write_back)
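# A minimal usage sketch for add_concepticon_names. The metadata file name is
# hypothetical; any CLDF Wordlist whose ParameterTable has a
# concepticonReference column will do:
#
#     dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")
#     add_concepticon_names(dataset)
#
# Afterwards each concept row carries the gloss of its linked concept set in
# the "Concepticon_Gloss" column.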
def add_concepticon_references(
    dataset: pycldf.Wordlist,
    gloss_languages: t.Mapping[str, str],
    status_update: t.Optional[str],
    overwrite: bool = False,
) -> None:
    """Guess Concepticon links for a multilingual concept table.

    Fill the concepticonReference column of the dataset's ParameterTable with
    best guesses for Concepticon IDs, based on gloss columns in different
    languages.

    Parameters
    ==========
    dataset: A pycldf.Wordlist with a concepticonReference column in its
        ParameterTable
    gloss_languages: A mapping from ParameterTable column names to ISO 639-1
        codes of languages that Concepticon has concept lists for (e.g. en,
        fr, de, es, zh, pt)
    status_update: String written to the Status_Column of the ParameterTable,
        if provided
    overwrite: Overwrite existing Concepticon references

    """
    # TODO: If this function took only dataset["ParameterTable"] and the name
    # of the target column in there as arguments, one could construct examples
    # that just use the Iterable API and therefore look nice as doctests.
    gloss_lists: t.Dict[str, t.List[str]] = {column: [] for column in gloss_languages}

    for row in dataset["ParameterTable"]:
        for column, glosses in gloss_lists.items():
            # Concepticon abhors empty glosses.
            glosses.append(row[column] or "?")

    targets = {
        language: concepticon.api._get_map_for_language(language, None)
        for language in gloss_languages.values()
    }

    # For each gloss column: a mapping from row index to (candidate indices,
    # similarity score), paired with the target list those indices point into.
    # concept_map2 returns indices into its second argument, so the targets
    # have to be carried along to resolve matches to Concepticon IDs later.
    cmaps: t.List[
        t.Tuple[t.Dict[int, t.Tuple[t.List[int], int]], t.Sequence[t.Tuple[str, str]]]
    ] = [
        (
            concept_map2(
                glosses,
                [i[1] for i in targets[gloss_languages[column]]],
                similarity_level=2,
                language=gloss_languages[column],
            ),
            targets[gloss_languages[column]],
        )
        for column, glosses in gloss_lists.items()
    ]

    write_back = []
    for i, row in enumerate(dataset["ParameterTable"]):
        if overwrite or not row.get(
            dataset.column_names.parameters.concepticonReference
        ):
            matches = [(cmap.get(i, ([], 10)), target) for cmap, target in cmaps]
            best_sim = min(score for (_, score), _ in matches)
            best_matches = [
                target[m]
                for (ms, score), target in matches
                for m in ms
                if score <= best_sim
            ]
            c: t.Counter[str] = collections.Counter(id for id, string in best_matches)
            if len(c) < 1:
                # No candidate in any gloss language: report and skip.
                print(row)
            else:
                if len(c) > 1:
                    # Ambiguous: report the alternatives, then take the most
                    # common candidate.
                    print(row, best_sim, c.most_common())
                row[
                    dataset.column_names.parameters.concepticonReference
                ] = c.most_common(1)[0][0]
        # Add the status update, if given.
        if status_update:
            row["Status_Column"] = status_update
        write_back.append(row)
    dataset.write(ParameterTable=write_back)
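# A usage sketch for add_concepticon_references, assuming the ParameterTable
# has an English gloss column "Name" and a Spanish gloss column
# "Spanish_Gloss" (both column names are hypothetical):
#
#     add_concepticon_references(
#         dataset,
#         gloss_languages={"Name": "en", "Spanish_Gloss": "es"},
#         status_update="automatic Concepticon link",
#         overwrite=False,
#     )
#
# With more than one gloss language, only candidates at the best similarity
# level across all languages are considered: ambiguous concepts are reported
# before the most common candidate is taken, and unmatched concepts are
# reported and left unchanged.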
def add_cognate_table(
    dataset: pycldf.Wordlist,
    split: bool = True,
    logger: cli.logging.Logger = cli.logger,
) -> None:
    """Extract cognate judgements from the FormTable into a new CognateTable.

    If the dataset already has a CognateTable, do nothing. Otherwise, create
    one, move cognateset references, segment slices, and alignments from the
    FormTable into it, and remove those columns from the FormTable.

    If `split` is True, cognate set IDs are prefixed with the concept, so
    that identical raw codes under different concepts end up in different
    cognate sets (raw codes are often only unique within one concept).
    """
    if "CognateTable" in dataset:
        return
    dataset.add_component("CognateTable")

    # TODO: Check if that cognatesetReference is already a foreign key to
    # elsewhere (could be a CognatesetTable, could be whatever), because then
    # we need to transfer that knowledge.

    # Load anything that's useful for a cognate set table: Form IDs, segments,
    # segment slices, cognateset references, alignments.
    columns = {
        "id": dataset["FormTable", "id"].name,
        "concept": dataset["FormTable", "parameterReference"].name,
        "form": dataset["FormTable", "form"].name,
    }
    for property in [
        "segments",
        "segmentSlice",
        "cognatesetReference",
        "alignment",
    ]:
        try:
            columns[property] = dataset["FormTable", property].name
        except KeyError:
            pass

    cognate_judgements = []
    forms = cache_table(dataset, columns=columns)
    forms_without_segments = 0
    for f, form in cli.tq(
        forms.items(), task="Extracting cognate judgements from forms…"
    ):
        if form.get("cognatesetReference"):
            if split:
                cogset = util.string_to_id(
                    "{:}-{:}".format(form["concept"], form["cognatesetReference"])
                )
            else:
                cogset = form["cognatesetReference"]
            judgement = {
                "ID": f,
                "Form_ID": f,
                "Cognateset_ID": cogset,
            }
            try:
                judgement["Segment_Slice"] = form["segmentSlice"]
            except KeyError:
                try:
                    if not form["segments"]:
                        raise ValueError("No segments")
                    if (
                        "+" in form["segments"]
                        and dataset["FormTable", "cognatesetReference"].separator
                    ):
                        logger.warning(
                            "You seem to have morpheme annotations in your "
                            "cognates. I will probably mess them up a bit, "
                            "because I have not been taught properly how to "
                            "deal with them. Sorry!"
                        )
                    # Default to a slice spanning the whole form.
                    judgement["Segment_Slice"] = [
                        "1:{:d}".format(len(form["segments"]))
                    ]
                except (KeyError, TypeError, ValueError):
                    forms_without_segments += 1
                    # Only warn for the first few forms; summarize at the end.
                    if forms_without_segments < 5:
                        logger.warning(
                            f"No segments found for form {f} ({form['form']})."
                        )
            # What does an alignment mean without segments or their slices?
            # Doesn't matter: if we were given one, we take it.
            judgement["Alignment"] = form.get("alignment")
            cognate_judgements.append(judgement)

    if forms_without_segments >= 5:
        logger.warning(
            "No segments found for %d forms. You can generate segments using"
            " `lexedata.edit.segment_using_clts`.",
            forms_without_segments,
        )

    # Remove the cognate-judgement columns (cognateset reference, segment
    # slice, alignment) from the FormTable; they now live in the CognateTable.
    cols = dataset["FormTable"].tableSchema.columns
    remove = {
        dataset["FormTable", c].name
        for c in ["cognatesetReference", "segmentSlice", "alignment"]
        if ("FormTable", c) in dataset
    }

    def clean_form(form):
        for c in remove:
            form.pop(c, None)
        return form

    forms = [clean_form(form) for form in dataset["FormTable"]]
    for c in remove:
        ix = cols.index(dataset["FormTable", c])
        del cols[ix]

    dataset.write(FormTable=forms)
    dataset.write(CognateTable=cognate_judgements)
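# A usage sketch for add_cognate_table, assuming the FormTable still carries
# a cognateset reference column (e.g. a "Cognateset_ID" column mapped to
# cognatesetReference):
#
#     add_cognate_table(dataset, split=True)
#
# With split=True, a raw code "1" under the concepts "hand" and "arm" ends up
# in two distinct cognate sets, something like "hand-1" and "arm-1" after ID
# normalization.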