def pick(self, values: Sequence[str]) -> Optional[str]:
    """Choose the most plausible user-facing name out of ``values``.

    Returns ``None`` when ``values`` is empty or when ``setmedian`` returns
    ``None`` for the normalised forms; otherwise returns one of the original
    spellings.
    """
    # Sorting up front keeps the outcome stable when it's a coin toss.
    ordered = sorted(values)
    if not ordered:
        return None

    # The selection runs in two stages so that name forms which differ only
    # in casing (e.g. Smith vs. SMITH) are not counted as wildly different,
    # which would lead to implausible median outcomes.
    slugs = []
    forms_by_slug: Dict[str, List[str]] = {}
    for name in ordered:
        slug = slugify(name, sep=" ")
        if slug is not None:
            # Duplicates are kept in ``slugs`` on purpose: every input value
            # contributes one entry to the first-stage selection.
            slugs.append(slug)
            forms_by_slug.setdefault(slug, []).append(name)

    # Stage one: settle on a normalised form.
    best_slug = setmedian(slugs)
    if best_slug is None:
        return None

    # Stage two: choose among the original spellings behind that form.
    candidates = forms_by_slug.get(best_slug, [])
    if len(candidates) > 1:
        return cast(str, setmedian(candidates))
    return first(candidates)
def pick(self, values):
    """Pick a single representative out of ``values``.

    Every item of ``ensure_list(values)`` is passed through
    ``sanitize_text`` and items that come back as ``None`` are dropped.
    Returns ``None`` if nothing is left, the sole survivor if exactly one
    is left, and otherwise whatever ``setmedian`` selects.
    """
    sanitized = (sanitize_text(item) for item in ensure_list(values))
    candidates = [text for text in sanitized if text is not None]
    if not candidates:
        return None
    if len(candidates) == 1:
        # A single candidate needs no comparison.
        return candidates[0]
    return setmedian(candidates)
def entities(self):
    """Yield ``(label, type)`` pairs for the tag groups worth keeping.

    ``self.tags`` maps ``(key, type)`` tuples to a list of tags. A group is
    skipped when it holds fewer tags than ``self.type_cutoff(type)``.
    The label is the first tag, except for groups of type
    ``registry.name`` where ``setmedian`` chooses it.
    """
    for (_key, kind), tags in self.tags.items():
        # Only groups that meet the relevance threshold for their type
        # produce an entity.
        threshold = self.type_cutoff(kind)
        if len(tags) >= threshold:
            label = tags[0]
            # NOTE(review): once ``tags[0]`` has succeeded the list is
            # non-empty, so ``len(set(tags)) > 0`` is always true here;
            # possibly ``> 1`` was intended -- confirm before changing.
            if kind == registry.name and len(set(tags)) > 0:
                label = setmedian(tags)
            yield label, kind
def pick(self, values):
    """Pick a single representative out of ``values``.

    Every item of ``ensure_list(values)`` is passed through
    ``sanitize_text`` and ``None`` results are discarded. With at most one
    candidate left the result is ``first(...)`` of the candidates; with
    more, the candidates are sorted and handed to ``setmedian``.
    """
    cleaned = []
    for raw in ensure_list(values):
        text = sanitize_text(raw)
        if text is not None:
            cleaned.append(text)

    if len(cleaned) > 1:
        # Sorting gives ``setmedian`` its input in a reproducible order.
        return setmedian(sorted(cleaned))
    return first(cleaned)
def select_label(labels):
    """Return the label that ``setmedian`` selects from ``labels``."""
    chosen = setmedian(labels)
    return chosen
def label(self):
    """Return the label to use for this object.

    In strict mode, or when there is at most one label, the first entry of
    ``self.labels`` is returned as-is. Otherwise ``setmedian`` chooses
    among ``self.labels``.
    """
    if self.strict or len(self.labels) <= 1:
        return self.labels[0]
    return setmedian(self.labels)