# Matcher for margin-shape phrases. Matches are tagged 'margin_shape' and the
# on_match string names the handler 'efloras.margin.v1'.
MARGIN_SHAPE = MatcherPatterns(
    'margin_shape',
    on_match='efloras.margin.v1',
    # Shared pattern keys extended with the entity types used below.
    decoder=COMMON_PATTERNS | {
        'margin_shape': {'ENT_TYPE': 'margin_shape'},
        'shape': {'ENT_TYPE': {'IN': SHAPES}},
        'leader': {'ENT_TYPE': {'IN': LEADERS}},
        'follower': {'ENT_TYPE': {'IN': FOLLOWERS}},
    },
    # Each string is a sequence of decoder keys; the trailing *, + and ?
    # are presumably repetition operators -- confirm against MatcherPatterns.
    patterns=[
        'leader* -* margin_shape+',
        'leader* -* margin_shape -* follower*',
        'leader* -* margin_shape -* shape? follower+ shape?',
        'shape+ -* follower+',
    ],
)
'ENT_TYPE': 'part_loc' }, 'part/loc': { 'ENT_TYPE': { 'IN': ['part_loc', 'body_part'] } }, 'cconj': { 'POS': 'CCONJ' }, } SETAE = MatcherPatterns( 'seta', decoder=DECODER, patterns=[ 'part* cheta', 'any_part+ cheta', ], ) SETAE_ABBREV = MatcherPatterns( 'seta_abbrev', decoder=DECODER, patterns=['(? cheta_abbrev )?'], ) SETA_COUNT = MatcherPatterns( 'setae_count', on_match='anoplura.seta_count.v1', decoder=DECODER, patterns=[
'not_count_word': { 'LOWER': { 'IN': NOT_COUNT_WORDS } }, 'per_count': { 'ENT_TYPE': 'per_count' }, } COUNT = MatcherPatterns( 'count', on_match='efloras.count.v1', decoder=DECODER, patterns=[ '99-99 -* per_count?', '99-99 per_count count_suffix?', 'per_count adp? 99-99 count_suffix?', '( 99-99 count_suffix? ) per_count', ], ) COUNT_WORD = MatcherPatterns( 'count_word', on_match='efloras.count_word.v1', decoder=DECODER, patterns=[ 'count_word', ], )
"""Get scientific names.""" import spacy from spacy.tokens import Token from traiter.patterns.matcher_patterns import MatcherPatterns from anoplura.pylib.const import REPLACE NAMES = ['anoplura', 'mammalia'] SCI_NAME = MatcherPatterns( 'sci_name', on_match='anoplura.sci_name.v1', patterns=[[{'ENT_TYPE': {'IN': NAMES}}]], ) GENUS = MatcherPatterns( 'genus', on_match='anoplura.genus.v1', patterns=[[{'ENT_TYPE': 'anoplura_genus'}]], ) @spacy.registry.misc(SCI_NAME.on_match) def sci_name(ent): """Enrich the match.""" if isinstance(ent, Token): return ent._.data = { 'sci_name': REPLACE.get(ent.text.lower(), ent.text.capitalize()),
""".split() DECODER = COMMON_PATTERNS | { 'part': {'ENT_TYPE': 'part'}, 'subpart': {'ENT_TYPE': 'subpart'}, 'leader': {'LOWER': {'IN': LOCATION_LEADERS}}, 'not_loc': {'ENT_TYPE': {'IN': ['sex', 'location']}}, 'sex': {'ENT_TYPE': 'sex'}, 'of': {'LOWER': 'of'}, 'adj': {'POS': 'ADJ'}, } PART_AS_LOCATION = MatcherPatterns( 'part_as_loc', on_match=TEXT_ACTION, decoder=DECODER, patterns=[ 'leader part', ], ) SUBPART_AS_LOCATION = MatcherPatterns( 'subpart_location', on_match='efloras.subpart_location.v1', decoder=DECODER, patterns=[ 'leader subpart', 'leader subpart of adj? subpart' ], )
"""Parse count notations.""" import spacy from traiter.patterns.matcher_patterns import MatcherPatterns from traiter.util import to_positive_int SEX_COUNT = MatcherPatterns('sex_count', on_match='anoplura.sex_count.v1', patterns=[[ { 'IS_DIGIT': True }, { 'ENT_TYPE': 'sex' }, ]]) @spacy.registry.misc(SEX_COUNT.on_match) def sex_count(ent): """Enrich the match with data.""" data = {} for token in ent: label = token.ent_type_ value = token.lower_ if label == 'sex': data['sex'] = value elif (as_int := to_positive_int(value)) is not None: data['count'] = as_int
DESC = 'description'

# Entity types allowed for filler words. The leading '' presumably admits
# tokens with no entity type -- confirm against the matcher semantics.
WORD_ENTS = [''] + """ sclerotin part_loc sex """.split()

# Runs of non-word characters at either end of a string.
TRIM = re.compile(r'^\W+|\W+$')

# A body part with filler words before and/or after it, closed by a 'stop'
# entity token.
DESCRIPTION = MatcherPatterns(
    'description',
    on_match='anoplura.description.v1',
    decoder={
        'body_part': {'ENT_TYPE': 'body_part'},
        'words': {'ENT_TYPE': {'IN': WORD_ENTS}},
        './;': {'ENT_TYPE': 'stop'},
    },
    patterns=[
        'body_part words+ ./;',
        'words+ body_part ./;',
        'words+ body_part words+ ./;',
    ],
)


@registry.misc(DESCRIPTION.on_match)
def description(ent):
    """Look for trait descriptions in sentences."""
'ENT_TYPE': 'sex' }, 'x': { 'LOWER': { 'IN': CROSS } }, } SIZE = MatcherPatterns( 'size', on_match='efloras.size.v1', decoder=DECODER, patterns=[ 'about? 99.9-99.9 cm follow*', (' about? 99.9-99.9 cm? follow* ' 'x to? about? 99.9-99.9 cm follow*'), (' about? 99.9-99.9 cm? follow* ' 'x to? about? 99.9-99.9 cm? follow* ' 'x to? about? 99.9-99.9 cm follow*'), ], ) SIZE_HIGH_ONLY = MatcherPatterns( 'size.high_only', on_match='efloras.size_high_only.v1', decoder=DECODER, patterns=[ 'to about? 99.9 [?]? cm follow*', ], )
# Rebind MISSING_RE from its pattern source to the compiled,
# case-insensitive regular expression.
MISSING_RE = re.compile(MISSING_RE, flags=re.IGNORECASE)

# Body-part phrases: parts with optional "missing" markers, conjunctions,
# ordinals/numbers and 'segmented' entities.
BODY_PART = MatcherPatterns(
    'body_part',
    on_match='anoplura.body_part.v1',
    decoder=COMMON_PATTERNS | {
        'seg': {'ENT_TYPE': 'segmented'},
        'ord': {'ENT_TYPE': {'IN': ['ordinal', 'number_word']}},
    },
    patterns=[
        # Missing / generic part phrases.
        'missing part+',
        'missing? any_part* part',
        # Parts joined by conjunction-like tokens.
        'part+ &/,/or* part* &/,/or* part+',
        # Parts followed by ordinals or numbers (single or ranged).
        'part+ ord -? ord',
        'part+ 99? -? 99',
        'part+ ord?',
        'part+ 99?',
        # Parts followed by "<n>-segmented".
        'part+ ord -? seg',
        'part+ 99 -? seg',
        # Ordinal / segmented qualifiers that precede the part.
        'ord? -? seg? part+',
        '99 - seg part+',
    ],
)
'max': { 'LOWER': { 'IN': MAXIMUM } }, 'width': { 'LOWER': { 'IN': WIDTH } }, } MEASUREMENT = MatcherPatterns( 'measurement', decoder=DECODER, patterns=[ '99.9 cm', '99.9 - 99.9 cm', ], ) MEAN = MatcherPatterns( 'mean', decoder=DECODER, patterns=['mean_word punct? 99.9 cm?'], ) SAMPLE = MatcherPatterns( 'sample', decoder=DECODER, patterns=['n = 99'], )
'IN': ['shape', 'shape_leader'] } }, 'angular': { 'LOWER': { 'IN': ['angular', 'angulate'] } }, } SHAPE = MatcherPatterns( 'shape', on_match='efloras.shape.v1', decoder=DECODER, patterns=[ 'shape_loc* -* shape+', 'shape_loc* -* shape -* shape+', 'shape_leader -/to shape_word+ -* shape+', 'shape_word+ -* shape+', ], ) N_SHAPE = MatcherPatterns( 'n_shape', on_match='efloras.n_shape.v1', decoder=DECODER, patterns=[ 'shape_loc* 9 - angular', ], )
from traiter.patterns.matcher_patterns import MatcherPatterns from efloras.pylib.const import COMMON_PATTERNS DECODER = COMMON_PATTERNS | { 'ambiguous': { 'LOWER': { 'IN': ['few', 'many'] } }, } RANGE_LOW = MatcherPatterns( 'range.low', decoder=DECODER, patterns=[ '99.9', '( 99.9 -/or ) ambiguous ( -/to ambiguous )', ], ) RANGE_MIN_LOW = MatcherPatterns( 'range.min.low', decoder=DECODER, patterns=[ '( 99.9 -/or ) 99.9', '( 99.9 -/to ) 99.9', ], ) RANGE_LOW_HIGH = MatcherPatterns( 'range.low.high',
from efloras.pylib.const import COMMON_PATTERNS, MISSING, REMOVE, REPLACE MULTIPLE_DASHES = ['\\' + c for c in DASH_CHAR] MULTIPLE_DASHES = fr'\s*[{"".join(MULTIPLE_DASHES)}]{{2,}}\s*' SKIP = DASH + MISSING COLOR = MatcherPatterns( 'color', on_match='efloras.color.v1', decoder=COMMON_PATTERNS | { 'color_words': { 'ENT_TYPE': { 'IN': ['color', 'color_mod'] } }, 'color': { 'ENT_TYPE': 'color' }, }, patterns=[ 'missing? color_words* -* color+ -* color_words*', ], ) @registry.misc(COLOR.on_match) def color(ent): """Enrich a phrase match.""" parts = { r: 1