Example #1
0
# Matcher labelled 'margin_shape'. The decoder maps each key used in the
# pattern strings to a spaCy-style token spec; keys not defined below (such
# as `-`) presumably come from COMMON_PATTERNS -- TODO confirm.
MARGIN_SHAPE = MatcherPatterns(
    'margin_shape',
    on_match='efloras.margin.v1',
    decoder=COMMON_PATTERNS | {
        'margin_shape': {'ENT_TYPE': 'margin_shape'},
        'shape': {'ENT_TYPE': {'IN': SHAPES}},
        'leader': {'ENT_TYPE': {'IN': LEADERS}},
        'follower': {'ENT_TYPE': {'IN': FOLLOWERS}},
    },
    patterns=[
        'leader* -* margin_shape+',
        'leader* -* margin_shape -* follower*',
        'leader* -* margin_shape -* shape? follower+ shape?',
        'shape+ -* follower+',
    ],
)
Example #2
0
        'ENT_TYPE': 'part_loc'
    },
    'part/loc': {
        'ENT_TYPE': {
            'IN': ['part_loc', 'body_part']
        }
    },
    'cconj': {
        'POS': 'CCONJ'
    },
}

# Matcher labelled 'seta': a `cheta` key preceded by `part` or `any_part`
# keys. All keys are resolved through DECODER.
SETAE = MatcherPatterns(
    'seta',
    decoder=DECODER,
    patterns=['part* cheta', 'any_part+ cheta'],
)

# Matcher labelled 'seta_abbrev': a `cheta_abbrev` key, with `(` and `)`
# each carrying a `?` suffix.
SETAE_ABBREV = MatcherPatterns(
    'seta_abbrev',
    decoder=DECODER,
    patterns=['(? cheta_abbrev )?'],
)

SETA_COUNT = MatcherPatterns(
    'setae_count',
    on_match='anoplura.seta_count.v1',
    decoder=DECODER,
    patterns=[
Example #3
0
    'not_count_word': {
        'LOWER': {
            'IN': NOT_COUNT_WORDS
        }
    },
    'per_count': {
        'ENT_TYPE': 'per_count'
    },
}

# Matcher labelled 'count'; its callback is looked up by the `on_match` name.
# The pattern strings are built from decoder keys. `per_count` is defined in
# DECODER just above; `99-99`, `-`, `adp`, `count_suffix`, `(` and `)` are
# not visible here and presumably come from earlier DECODER entries or
# COMMON_PATTERNS -- TODO confirm.
COUNT = MatcherPatterns(
    'count',
    on_match='efloras.count.v1',
    decoder=DECODER,
    patterns=[
        '99-99 -* per_count?',
        '99-99 per_count count_suffix?',
        'per_count adp? 99-99 count_suffix?',
        '( 99-99 count_suffix? ) per_count',
    ],
)

# Matcher labelled 'count_word': a single `count_word` key from DECODER.
COUNT_WORD = MatcherPatterns(
    'count_word',
    on_match='efloras.count_word.v1',
    decoder=DECODER,
    patterns=['count_word'],
)
Example #4
0
"""Get scientific names."""

import spacy
from spacy.tokens import Token
from traiter.patterns.matcher_patterns import MatcherPatterns

from anoplura.pylib.const import REPLACE

# Entity types that SCI_NAME accepts.
NAMES = ['anoplura', 'mammalia']

# Single-token matcher: one token whose entity type is listed in NAMES.
# The pattern is given directly as a token-spec list, so no decoder is used.
SCI_NAME = MatcherPatterns(
    'sci_name',
    on_match='anoplura.sci_name.v1',
    patterns=[
        [
            {'ENT_TYPE': {'IN': NAMES}},
        ],
    ],
)

# Single-token matcher: one token whose entity type is 'anoplura_genus'.
GENUS = MatcherPatterns(
    'genus',
    on_match='anoplura.genus.v1',
    patterns=[
        [
            {'ENT_TYPE': 'anoplura_genus'},
        ],
    ],
)


@spacy.registry.misc(SCI_NAME.on_match)
def sci_name(ent):
    """Enrich the match."""
    if isinstance(ent, Token):
        return

    ent._.data = {
        'sci_name': REPLACE.get(ent.text.lower(), ent.text.capitalize()),
Example #5
0
    """.split()

# Key -> token-spec table used by the location matchers below. The entries
# here are merged over COMMON_PATTERNS with `|`, so on a key clash the
# right-hand (local) entry wins if COMMON_PATTERNS is a plain dict
# -- TODO confirm its type.
DECODER = COMMON_PATTERNS | {
    'part': {'ENT_TYPE': 'part'},
    'subpart': {'ENT_TYPE': 'subpart'},
    # Matched on lower-cased token text, not on entity type.
    'leader': {'LOWER': {'IN': LOCATION_LEADERS}},
    'not_loc': {'ENT_TYPE': {'IN': ['sex', 'location']}},
    'sex': {'ENT_TYPE': 'sex'},
    'of': {'LOWER': 'of'},
    'adj': {'POS': 'ADJ'},
}

# Matcher labelled 'part_as_loc': a `leader` key followed by a `part` key.
# The callback name comes from TEXT_ACTION, which is defined elsewhere.
PART_AS_LOCATION = MatcherPatterns(
    'part_as_loc',
    on_match=TEXT_ACTION,
    decoder=DECODER,
    patterns=['leader part'],
)

# Matcher labelled 'subpart_location': a `leader` followed by a `subpart`,
# optionally continued by `of adj? subpart`.
SUBPART_AS_LOCATION = MatcherPatterns(
    'subpart_location',
    on_match='efloras.subpart_location.v1',
    decoder=DECODER,
    patterns=['leader subpart', 'leader subpart of adj? subpart'],
)

"""Parse count notations."""

import spacy
from traiter.patterns.matcher_patterns import MatcherPatterns
from traiter.util import to_positive_int

# Two-token matcher: a digit token followed by a token whose entity type is
# 'sex'. The pattern is given directly as token specs, so no decoder is used.
SEX_COUNT = MatcherPatterns(
    'sex_count',
    on_match='anoplura.sex_count.v1',
    patterns=[
        [
            {'IS_DIGIT': True},
            {'ENT_TYPE': 'sex'},
        ],
    ],
)


@spacy.registry.misc(SEX_COUNT.on_match)
def sex_count(ent):
    """Enrich the match with its sex and count.

    Registered in spaCy's ``misc`` registry under ``SEX_COUNT.on_match``.

    Args:
        ent: The matched tokens; each must expose ``ent_type_`` and
            ``lower_`` (presumably a spaCy ``Span`` -- confirm at the
            caller). The result is stored on ``ent._.data``.
    """
    data = {}

    for token in ent:
        label = token.ent_type_
        value = token.lower_

        if label == 'sex':
            data['sex'] = value
        elif (as_int := to_positive_int(value)) is not None:
            # Only tokens for which to_positive_int yields a value count.
            data['count'] = as_int

    # Attach the collected fields to the match. Without this assignment the
    # dict built above is discarded and the callback has no effect.
    ent._.data = data
DESC = 'description'

# Entity types accepted by the `words` decoder key. The leading empty string
# presumably lets tokens without any entity type count as words -- confirm
# against spaCy's ENT_TYPE matching.
WORD_ENTS = ['', 'sclerotin', 'part_loc', 'sex']

# Matches a run of non-word characters at the start or at the end of a string.
TRIM = re.compile(r'^\W+|\W+$')

# Matcher labelled 'description': a `body_part` with `words` before and/or
# after it, always closed by the `./;` key (a token of entity type 'stop').
DESCRIPTION = MatcherPatterns(
    'description',
    on_match='anoplura.description.v1',
    decoder={
        'body_part': {'ENT_TYPE': 'body_part'},
        'words': {'ENT_TYPE': {'IN': WORD_ENTS}},
        './;': {'ENT_TYPE': 'stop'},
    },
    patterns=[
        'body_part words+ ./;',
        'words+ body_part ./;',
        'words+ body_part words+ ./;',
    ],
)


@registry.misc(DESCRIPTION.on_match)
def description(ent):
    """Look for trait descriptions in sentences."""
Example #8
0
        'ENT_TYPE': 'sex'
    },
    'x': {
        'LOWER': {
            'IN': CROSS
        }
    },
}

# Matcher labelled 'size'. The three patterns cover one, two and three
# measurements; successive measurements are joined by the `x` key, which
# DECODER maps to lower-cased text in CROSS.
SIZE = MatcherPatterns(
    'size',
    on_match='efloras.size.v1',
    decoder=DECODER,
    patterns=[
        'about? 99.9-99.9 cm follow*',
        # Adjacent string literals are concatenated into one pattern. The
        # extra spaces line the columns up and are presumably ignored when
        # the pattern is split -- TODO confirm in MatcherPatterns.
        ('      about? 99.9-99.9 cm? follow* '
         'x to? about? 99.9-99.9 cm  follow*'),
        ('      about? 99.9-99.9 cm? follow* '
         'x to? about? 99.9-99.9 cm? follow* '
         'x to? about? 99.9-99.9 cm  follow*'),
    ],
)

# Matcher labelled 'size.high_only': a single value introduced by `to`.
SIZE_HIGH_ONLY = MatcherPatterns(
    'size.high_only',
    on_match='efloras.size_high_only.v1',
    decoder=DECODER,
    patterns=['to about? 99.9 [?]? cm follow*'],
)
# Rebind MISSING_RE to a compiled, case-insensitive regex built from the
# value it held before this line.
MISSING_RE = re.compile(MISSING_RE, flags=re.IGNORECASE)

# Matcher labelled 'body_part'. Only `seg` and `ord` are defined locally;
# `part`, `any_part`, `missing`, `99`, `-` and `&/,/or` presumably come from
# COMMON_PATTERNS -- TODO confirm.
BODY_PART = MatcherPatterns(
    'body_part',
    on_match='anoplura.body_part.v1',
    decoder=COMMON_PATTERNS | {
        'seg': {'ENT_TYPE': 'segmented'},
        'ord': {'ENT_TYPE': {'IN': ['ordinal', 'number_word']}},
    },
    patterns=[
        'missing part+',
        'missing? any_part* part',
        'part+ &/,/or* part* &/,/or* part+',
        'part+ ord -? ord',
        'part+ 99? -? 99',
        'part+ ord?',
        'part+ 99?',
        'part+ ord -? seg',
        'part+ 99 -? seg',
        'ord? -? seg? part+',
        '99 - seg part+',
    ],
)

    'max': {
        'LOWER': {
            'IN': MAXIMUM
        }
    },
    'width': {
        'LOWER': {
            'IN': WIDTH
        }
    },
}

# The three matchers below share DECODER and set no `on_match` callback.

# Labelled 'measurement': a `99.9 cm` value or a `99.9 - 99.9 cm` range.
MEASUREMENT = MatcherPatterns(
    'measurement',
    decoder=DECODER,
    patterns=['99.9 cm', '99.9 - 99.9 cm'],
)

# Labelled 'mean': a `mean_word`, optional `punct`, a value, optional `cm`.
MEAN = MatcherPatterns(
    'mean',
    decoder=DECODER,
    patterns=['mean_word punct? 99.9 cm?'],
)

# Labelled 'sample': the key sequence `n = 99`.
SAMPLE = MatcherPatterns(
    'sample',
    decoder=DECODER,
    patterns=['n = 99'],
)
Example #11
0
            'IN': ['shape', 'shape_leader']
        }
    },
    'angular': {
        'LOWER': {
            'IN': ['angular', 'angulate']
        }
    },
}

# Matcher labelled 'shape'; its callback is looked up by the `on_match` name.
# The keys used below (`shape`, `shape_loc`, `shape_leader`, `shape_word`,
# `-`, `-/to`) are resolved through DECODER; most of their definitions lie
# outside the visible part of this file.
SHAPE = MatcherPatterns(
    'shape',
    on_match='efloras.shape.v1',
    decoder=DECODER,
    patterns=[
        'shape_loc* -* shape+',
        'shape_loc* -* shape -* shape+',
        'shape_leader -/to shape_word+ -* shape+',
        'shape_word+ -* shape+',
    ],
)

# Matcher labelled 'n_shape': optional `shape_loc` keys, then `9 - angular`,
# where `angular` matches the lower-cased text 'angular' or 'angulate'.
N_SHAPE = MatcherPatterns(
    'n_shape',
    on_match='efloras.n_shape.v1',
    decoder=DECODER,
    patterns=['shape_loc* 9 - angular'],
)
Example #12
0
from traiter.patterns.matcher_patterns import MatcherPatterns

from efloras.pylib.const import COMMON_PATTERNS

# COMMON_PATTERNS plus one local key: `ambiguous` matches the lower-cased
# words 'few' and 'many'.
DECODER = COMMON_PATTERNS | {
    'ambiguous': {'LOWER': {'IN': ['few', 'many']}},
}

# Matcher labelled 'range.low': a bare `99.9`, or a parenthesised value
# followed by `ambiguous` terms.
RANGE_LOW = MatcherPatterns(
    'range.low',
    decoder=DECODER,
    patterns=['99.9', '( 99.9 -/or ) ambiguous ( -/to ambiguous )'],
)

# Matcher labelled 'range.min.low': a parenthesised value (closed with
# `-/or` or `-/to`) followed by a `99.9` value.
RANGE_MIN_LOW = MatcherPatterns(
    'range.min.low',
    decoder=DECODER,
    patterns=['( 99.9 -/or ) 99.9', '( 99.9 -/to ) 99.9'],
)

RANGE_LOW_HIGH = MatcherPatterns(
    'range.low.high',
Example #13
0
from efloras.pylib.const import COMMON_PATTERNS, MISSING, REMOVE, REPLACE

# Regex source for a run of two or more dash characters with optional
# surrounding whitespace. Every character of DASH_CHAR is backslash-escaped
# before it goes into the character class.
_ESCAPED_DASHES = ''.join('\\' + dash for dash in DASH_CHAR)
MULTIPLE_DASHES = fr'\s*[{_ESCAPED_DASHES}]{{2,}}\s*'

SKIP = DASH + MISSING

# Matcher labelled 'color': at least one `color` entity, optionally
# surrounded by `color_words` (color or color_mod entities) and dashes, and
# optionally preceded by `missing`. `missing` and `-` are not defined here
# and presumably come from COMMON_PATTERNS -- TODO confirm.
COLOR = MatcherPatterns(
    'color',
    on_match='efloras.color.v1',
    decoder=COMMON_PATTERNS | {
        'color_words': {'ENT_TYPE': {'IN': ['color', 'color_mod']}},
        'color': {'ENT_TYPE': 'color'},
    },
    patterns=['missing? color_words* -* color+ -* color_words*'],
)


@registry.misc(COLOR.on_match)
def color(ent):
    """Enrich a phrase match."""
    parts = {
        r: 1