def sex_count(ent):
    """Collect the sex and the count found among the entity's tokens.

    For every token, the lower-cased text is recorded under 'sex' when the
    token's entity type is 'sex'. Otherwise the text is passed to
    to_positive_int() and, when the result is not None, recorded under
    'count'. Later tokens overwrite earlier ones for the same key.

    NOTE(review): the visible code builds this dict but neither returns nor
    stores it -- confirm whether a final statement is missing.
    """
    found = {}
    for tok in ent:
        text = tok.lower_
        if tok.ent_type_ == 'sex':
            found['sex'] = text
            continue
        number = to_positive_int(text)
        if number is not None:
            found['count'] = number
def fraction(token):
    """Handle fractional values like 10 3/8 inches.

    Builds a Trait spanning the token, reads the optional "units" and "whole"
    groups and the required "numerator" and "denominator" groups from
    ``token.group``, and stores ``whole + numerator/denominator`` as the
    trait value. When units are present the value is passed through
    ``convert_units``.

    Args:
        token: A match object exposing ``start``, ``end`` and a ``group``
            mapping of the named pieces of the match.

    Returns:
        The populated Trait, or None when the pieces cannot be combined into
        a number (an operand is not numeric, or the denominator is zero).

    Raises:
        KeyError: If "numerator" or "denominator" is absent from
            ``token.group``.
    """
    trait = Trait(start=token.start, end=token.end)
    trait.units = token.group.get("units")
    trait.units_inferred = not bool(trait.units)

    # A missing whole part counts as zero.
    whole = to_positive_float(token.group.get("whole", "0"))
    numerator = to_positive_int(token.group["numerator"])
    denominator = to_positive_int(token.group["denominator"])

    try:
        trait.value = whole + Fraction(numerator, denominator)
    except (TypeError, ZeroDivisionError):
        # TypeError: an operand was not a number (presumably None from a
        # failed conversion -- verify against the to_positive_* helpers).
        # ZeroDivisionError: text such as "3/0" yields a zero denominator,
        # which Fraction() rejects; treat it like any other bad fraction
        # instead of letting it crash the parse.
        print(f"Fraction error: {numerator} / {denominator}")
        return None

    if trait.units:
        trait.value = convert_units(trait.value, trait.units)

    add_flags(token, trait)
    return trait
def range_(span):
    """Build a count range from the digit tokens of a span.

    Every token flagged ``is_digit`` is converted with to_positive_int().
    The smallest value is returned under 'low'; when more than one value
    was found, the largest is also returned under 'high'.

    Raises:
        ValueError: If the span contains no digit tokens (``min`` of an
            empty list).
    """
    numbers = []
    for tok in span:
        if tok.is_digit:
            numbers.append(to_positive_int(tok.text))

    result = {'low': min(numbers)}
    if len(numbers) <= 1:
        return result

    result['high'] = max(numbers)
    return result
def multiple_seta_count(ent):
    """Gather the seta name, group and all counts from a multi-seta match.

    Tokens are classified by their cached label, checked in this order:
      * 'seta'        -> normalized through REPLACE and stored under 'seta'
      * 'number_word' -> REPLACE lookup converted with to_positive_int()
      * text matching IS_INT -> matched digits converted likewise
      * 'group'       -> lower-cased text stored under 'group'

    NOTE(review): the visible code accumulates the record and the list of
    counts but neither returns nor stores them -- confirm whether trailing
    statements are missing.
    """
    record = {'body_part': 'seta'}
    counts = []

    for tok in ent:
        kind = tok._.cached_label

        if kind == 'seta':
            record['seta'] = REPLACE.get(tok.lower_, tok.lower_)
            continue

        if kind == 'number_word':
            counts.append(to_positive_int(REPLACE.get(tok.lower_)))
            continue

        # The integer check deliberately comes before the 'group' check.
        digits = IS_INT.match(tok.text)
        if digits:
            counts.append(to_positive_int(digits.group(0)))
            continue

        if kind == 'group':
            record['group'] = tok.lower_
def range_values(ent):
    """Turn the range sub-entity of a match into a 'count' entity.

    The first sub-entity whose cached label starts with 'range' is selected.
    All FLOAT_RE matches in its text are collected; each must also satisfy
    INT_TOKEN_RE. The dotted suffixes of the sub-entity's ``label_`` (every
    part after the first) are paired positionally with those numbers, each
    converted with to_positive_int(), and stored as the sub-entity's data.

    Returns:
        The selected sub-entity, with ``_.data`` filled in and
        ``_.new_label`` set to 'count'.

    Raises:
        IndexError: If no sub-entity has a 'range' cached label.
        RejectMatch: If any number found does not match INT_TOKEN_RE.
    """
    candidates = [
        sub for sub in ent.ents
        if sub._.cached_label.split('.')[0] == 'range'
    ]
    target = candidates[0]

    numbers = re.findall(FLOAT_RE, target.text)
    if any(not re.search(INT_TOKEN_RE, num) for num in numbers):
        raise RejectMatch

    fields = target.label_.split('.')[1:]
    target._.data = {
        field: to_positive_int(num) for field, num in zip(fields, numbers)
    }
    target._.new_label = 'count'
    return target
def seta_count(ent):
    """Gather the seta name, group and a single count from a match.

    Tokens are classified in this order:
      * cached label 'seta'        -> normalized through REPLACE, kept as 'seta'
      * cached label 'number_word' -> int() of the REPLACE lookup (-1 when
                                      the word is not in REPLACE)
      * lower-cased text in MISSING -> count of 0
      * cached label 'group'       -> lower-cased text kept as 'group'
      * text matching IS_INT       -> matched digits via to_positive_int()

    NOTE(review): the visible code builds the record but neither returns nor
    stores it -- confirm whether trailing statements are missing.
    """
    record = {'body_part': 'seta'}

    for tok in ent:
        kind = tok._.cached_label

        if kind == 'seta':
            record['seta'] = REPLACE.get(tok.lower_, tok.lower_)
            continue

        if kind == 'number_word':
            record['count'] = int(REPLACE.get(tok.lower_, -1))
            continue

        if tok.lower_ in MISSING:
            record['count'] = 0
            continue

        if kind == 'group':
            record['group'] = tok.lower_
            continue

        digits = IS_INT.match(tok.text)
        if digits:
            record['count'] = to_positive_int(digits.group(0))
def count_word(ent):
    """Relabel a count-word match and attach its numeric value.

    The enclosing entity is relabeled 'count'. The first sub-entity labeled
    'count_word' has its text looked up in REPLACE, converted with
    to_positive_int(), stored as ``{'low': value}`` in its data, and is
    relabeled 'count' as well.

    Raises:
        IndexError: If there is no 'count_word' sub-entity.
        KeyError: If the sub-entity's text is not a key of REPLACE.

    NOTE(review): the lookup uses the raw ``text`` (case-sensitive) -- confirm
    that REPLACE keys match the casing that can reach this function.
    """
    ent._.new_label = 'count'

    matches = [sub for sub in ent.ents if sub.label_ == 'count_word']
    word_ent = matches[0]

    low = to_positive_int(REPLACE[word_ent.text])
    word_ent._.data = {'low': low}
    word_ent._.new_label = 'count'
def sample(token):
    """Extract the sample size from the token text.

    The first INT_RE match in ``token.text`` is converted with
    to_positive_int() and returned as ``{'n': value}``.

    Raises:
        AttributeError: If the text contains no INT_RE match (the search
            result is None).
    """
    found = re.search(INT_RE, token.text)
    return {'n': to_positive_int(found.group(0))}