Exemple #1
0
"""Parse nipple state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

NIPPLE_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.term("false", """ false """),
        VOCAB.term("much", """ much """),
        VOCAB.term(
            "lactation",
            r"""
                (indicate \s+)?
                (( previous | post | prior ) [\s-] )
                (lactation | lactating | lac )
            """,
        ),
        VOCAB.term(
            "other",
            """
                protuberant prominent showing worn distended
            """.split(),
        ),
        # Separates measurements
        VOCAB.part("separator", r' [;"?/,] '),
        # Skip arbitrary words
Exemple #2
0
"""Parse sex notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

SEX = Base(
    name=__name__.split(".")[-1],
    rules=[
        # JSON keys for sex
        VOCAB.term("sex_key", "sex"),
        # The sexes
        VOCAB.term("sex_vocab", "females? males?".split()),
        # These are words that indicate that "sex" is not a key
        VOCAB.term("not_sex", "and is was".split()),
        # Allow arbitrary words in some cases
        VOCAB.part("word", r' \b [a-z] [^;,"=:\s]* '),
        # Some patterns need a terminator
        VOCAB.part("separator", ' [;,"] | $ '),
        # E.g.: sex might be female;
        VOCAB.producer(
            convert,
            """ sex_key (?P<value> ( sex_vocab | word ){1,2} quest? ) separator """,
        ),
        # E.g.: sex=female?, Or: sex=unknown
        VOCAB.producer(convert,
                       " sex_key (?P<value> ( sex_vocab | word ) quest? ) "),
        # E.g.: male, Or: male?
def convert(token):
    """Build a Trait saying whether the matched notation means enlarged.

    The value is "enlarged" when the token's ``pos`` group is truthy and
    "not enlarged" otherwise. The trait covers the token's start..end span.
    """
    if token.group.get("pos"):
        state = "enlarged"
    else:
        state = "not enlarged"
    return Trait(value=state, start=token.start, end=token.end)


# Parser definition for "nipples enlarged" / "nipples not enlarged" notations.
# NOTE(review): how Base consumes `rules` (ordering, priorities) is defined in
# vertnet.parsers.base and is not visible here -- confirm before reordering.
NIPPLES_ENLARGED = Base(
    # The parser is named after the last component of this module's dotted name
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["conj"],
        VOCAB.part("separator", r' [;"?/,] '),
        # Three-letter abbreviations: [oc]e[ln] vs. [oc]s[ln]
        VOCAB.term("enlarged_abbrev", r"[oc]e[ln]"),
        VOCAB.term("not_enlarged_abbrev", r"[oc]s[ln]"),
        VOCAB.term("false", """ false """),

        # Producers that capture a `pos` group; convert() tests for that group
        # and reports "enlarged" when it is present.
        VOCAB.producer(convert, """ (?P<pos> nipple enlarged ) """),
        VOCAB.producer(convert, """ (?P<pos> enlarged nipple ) """),
        VOCAB.producer(convert, """ (?P<pos> enlarged_abbrev ) """),

        # Producers that capture a `neg` group instead; with no `pos` group
        # convert() falls through to "not enlarged".
        VOCAB.producer(convert, """ (?P<neg> none nipple ) """),
        VOCAB.producer(convert, """ (?P<neg> nipple none ) """),
        VOCAB.producer(convert, """ (?P<neg> nipple not_enlarged ) """),
        VOCAB.producer(convert, """ (?P<neg> not_enlarged false? nipple ) """),
        VOCAB.producer(convert, """ (?P<neg> not_enlarged_abbrev ) """),
    ],
)
    return trait


# Parser definition for lactation state notations.
# NOTE(review): `convert` is defined above this definition and is not fully
# visible here; presumably it keys off the `pos` / `neg` group names used by
# the producers below -- confirm.
LACTATION_STATE = Base(
    # The parser is named after the last component of this module's dotted name
    name=__name__.split(".")[-1],
    rules=[
        # Spellings (including common misspellings) of "lactating", plus
        # "nursing" and "suckling"; the trailing \b requires a word boundary.
        # NOTE(review): "lactacting" is listed twice in the alternation; the
        # second occurrence is redundant.
        VOCAB.part(
            "lactating",
            r""" (
                lactating | lactation | lactated | lactate | lact
                | lactaing | lactacting | lactataing | lactational
                | oelact | celact | lactati | lactacting | lactatin
                | lactatting | lactatng
                | nursing | suckling
                ) \b
            """,
        ),
        # Three-letter abbreviations: [oc][esm]l vs. [oc][esm]n
        VOCAB.term("lactating_abbrev", r"[oc][esm]l"),
        VOCAB.term("not_lactating_abbrev", r"[oc][esm]n"),
        VOCAB.term("post", r""" post | finished """),

        # Separates measurements
        VOCAB.part("separator", r' [;"/] '),
        # Matches captured under `pos`
        VOCAB.producer(convert, """ (?P<pos> lactating ) """),
        VOCAB.producer(convert, """ (?P<pos> lactating_abbrev ) """),
        # Matches captured under `neg`: a "none" or "post" token next to the
        # lactating token, or the not-lactating abbreviation
        VOCAB.producer(convert, """ (?P<neg> (none | post) lactating ) """),
        VOCAB.producer(convert, """ (?P<neg> lactating (none | post) ) """),
        VOCAB.producer(convert, """ (?P<neg> not_lactating_abbrev ) """),
    ],
)
"""Parse v****a state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

# Parser definition for vagina state notations.
# The token name and its pattern had been mangled to "v****a" by a word
# filter. That text is not a valid regular expression (stacked "*"
# quantifiers) and not a valid token name, so it is restored to "vagina"
# here, matching the VAGINA_STATE constant.
VAGINA_STATE = Base(
    # The parser is named after the last component of this module's dotted name
    name=__name__.split(".")[-1],
    rules=[
        # The negative lookbehind rejects matches directly preceded by "sal"
        VOCAB.part("vagina", r""" (?<! sal ) ( vagina | vag | vulva ) """),
        # Short abbreviations, one term per whitespace-separated pattern
        VOCAB.term("abbrev", r""" ov cv [oc][sme][ln] vc vo """.split()),
        VOCAB.part(
            "closed",
            r"""
                closed | imperforated | imperf | cerrada | non [-\s] perforated
                | unperforate | non  [-\s] perf | clsd | imp
            """,
        ),
        VOCAB.part("open", r""" open | perforated? | perf | abrir """),
        VOCAB.part("other", r""" swollen | plugged | plug | sealed """),
        # Any of the three state parts above
        VOCAB.grouper("state", """ closed | open | other """),
        # Producers: the whole match is captured as `value`
        VOCAB.producer(convert, """ (?P<value> vagina partially? state ) """),
        VOCAB.producer(convert, """ (?P<value> state vagina state? ) """),
        VOCAB.producer(convert,
                       """ (?P<value> ( state | abbrev )  vagina? ) """),
    ],
)
def convert_state(token):
    """Return a Trait with the fixed value "present" over the token's span."""
    return Trait(value="present", start=token.start, end=token.end)


PLACENTAL_SCAR_COUNT = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        VOCAB["shorthand"],
        # Adjectives to placental scars
        VOCAB.term(
            "adj",
            r"""
            faint prominent recent old possible """.split(),
        ),
        # Skip arbitrary words
        VOCAB["word"],
        VOCAB.part("sep", r" [;/] "),
        VOCAB.grouper(
            "count",
            """
                none embryo conj | none visible | integer | none
            """,
        ),
        VOCAB.producer(
            convert_count,
            """(?P<count1> count ) op (?P<count2> count )
                ( eq (?P<value> count ) )? plac_scar
# Single-character punctuation parts.
# NOTE(review): the exact effect of `capture=False` and `priority=LOWEST` is
# defined by traiter's Vocabulary and is not visible here -- confirm there.
VOCAB.part('x', r' [x×] ', capture=False)
VOCAB.part('quest', r' [?] ')
VOCAB.part('comma', r' [,] ', capture=False, priority=LOWEST)
VOCAB.part('semicolon', r' [;] ', capture=False, priority=LOWEST)
VOCAB.part('ampersand', r' [&] ', capture=False)
VOCAB.part('eq', r' [=] ', capture=False)
VOCAB.part('under', r' [_] ', capture=False)
VOCAB.part('eol', r' [\n\r\f] ', capture=False)
VOCAB.part('dot', r' [.] ', capture=False)

# Small words
VOCAB.part('by', r' by ', capture=False)
VOCAB.part('to', r' to ', capture=False)
VOCAB.part('with', r' with ', capture=False)
VOCAB.part('up_to', r' ( up \s+ )? to ', capture=False)
VOCAB.term('and', r' and ', capture=False)
VOCAB.term('conj', ' or and '.split(), capture=False)
VOCAB.term('prep', ' to with on of '.split(), capture=False)

# Catch-all word: a lowercase letter followed by any word characters
VOCAB.term('word', r' [a-z] \w* ', capture=False, priority=LOWEST)

# NOTE: Double quotes as inches is handled elsewhere
# Length units. The lookarounds stop the short forms ("in", "ft", "m", "cm",
# "mm") from matching inside longer words.
VOCAB.part('inches', r"""
    (?<! [a-z] ) ( inch e? s? | in s? (?! [a-ru-wyz] ) ) """)
# The last alternative treats an apostrophe right after a digit as feet
VOCAB.part(
    'feet', r"""
    (?<! [a-z] ) ( foot s? | feet s? | ft s? (?! [,\w]) ) | (?<= \d ) ' """)
VOCAB.part(
    'metric_len', r"""
    ( milli | centi )? meters? | ( [cm] [\s.]? m ) (?! [a-ru-wyz] ) """)
# Any of the three length unit parts above
VOCAB.grouper('len_units', ' metric_len feet inches'.split())
    return squash(traits)


COLLECTOR = Base(
    name='collector',
    rules=[
        VOCAB['eol'],
        VOCAB['month_name'],
        STATE_NAMES,
        VOCAB.part('col_label',
                   r"""
            \b ( collect(or|ed) | coll | col ) ( \s* by )? 
            """,
                   capture=False),
        VOCAB.term('no_label', r""" number no num """.split(), capture=False),
        VOCAB.term('part',
                   r""" [[:alpha:]]+ """,
                   priority=LOWEST,
                   capture=False),
        VOCAB.term('other_label',
                   r"""
            art artist ass assist assistant auth authors?
            cartographer conservator contributor corator curator curatorial
            det determiner dir director
            ecologist editor entomologist expedition explorer extractor
            gardener geographer geologist georeferencer grower
            herbarium horticulturalist
            illustrator
            manager
            naturalist
import pandas as pd
import regex
from traiter.old.vocabulary import Vocabulary

from digi_leap.pylib import const, patterns

# Path of the US states CSV inside the project's data directory
STATE_CSV = const.DATA_DIR / 'US_states.csv'
# Module-level containers, created empty here.
# NOTE(review): presumably filled in by code further down the module
# (build_state?) -- not visible in this chunk, confirm.
STATES = {}
STATE_NAMES = []
# Read by normalize_state(): normalized key -> state name
NORMALIZE_US_STATE = {}

VOCAB = Vocabulary(patterns.VOCAB)

# Spellings of the country name: USA / US with optional dots, the
# spelled-out forms, and "U of A"
VOCAB.term(
    'USA', r"""
    U\.?S\.?A\.? | U\.?S\.?
    | United \s? States \s? of \s? America | United \s? States
    | U\.? \s? of \s? A\.?""")


def normalize_key(state: str) -> str:
    """Reduce a state name or abbreviation to a lookup key.

    The text is lower-cased and every character outside ASCII ``a``-``z``
    (spaces, dots, digits, accented letters, ...) is dropped.
    """
    lowered = state.lower()
    return ''.join(char for char in lowered if 'a' <= char <= 'z')


def normalize_state(state: str) -> str:
    """Map a state name or abbreviation to its entry in NORMALIZE_US_STATE.

    The lookup key comes from ``normalize_key``; when the key is missing the
    title-cased input is returned instead.
    """
    key = normalize_key(state)
    fallback = state.title()
    return NORMALIZE_US_STATE.get(key, fallback)


def build_state(state, postal, abbrev_blob):
Exemple #10
0

def fix_up(trait, text):
    """Post-process a parsed trait by delegating to ``fix_up_inches``.

    Returns whatever ``fix_up_inches(trait, text)`` returns.
    """
    # Try to disambiguate double quotes from inches
    repaired = fix_up_inches(trait, text)
    return repaired


TRAGUS_LENGTH = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Units are in the key, like: tragusLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r"""( tragus \s* ) \s* ( length | len ) \s* in \s*
                    (?P<units> millimeters | mm ) """,
        ),
        # Standard keywords that indicate a tragus length follows
        VOCAB.term(
            "key",
            r""" ( tragus | trag | tragi ) \s* (length | len | l )? | tr """,
        ),
        # Some patterns require a separator
        VOCAB.part("sep", r" [;,] | $ ", capture=False),
        VOCAB.grouper("noise", " word dash ".split()),
        # Handle fractional values like: tragus 9/16"
        VOCAB.producer(
            fraction,
            [
                "key len_fraction units",  # E.g.: tragus = 9/16 inches
        if IS_LEFT.search(text, trait.end, end):
            return True

    return False


TOTAL_LENGTH = Base(
    name=__name__.split(".")[-1],
    fix_up=fix_up,
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Units are in the key, like: TotalLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r"""
                ( total | snout \s* vent | head \s* body | fork ) \s*
                ( length | len )? \s* in \s* (?P<units> millimeters | mm )
            """,
        ),
        # Various total length keys
        VOCAB.part(
            "len_key",
            r"""
                t \s* [o.]? \s* l [._]? (?! [a-z] )
                | total  [\s-]* length [\s-]* in
                | ( total | max | standard ) [\s-]* lengths? \b
                | meas [\s*:]? \s* length [\s(]* [l] [)\s:]*
                | meas ( [a-z]* )? \.? : \s* l (?! [a-z.] )
                | s \.? \s? l \.? (?! [a-z.] )
                | label [\s.]* lengths? \b
                | ( fork | mean | body ) [\s-]* lengths? \b
import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.reproductive import convert, double

VOCAB = Vocabulary(patterns.VOCAB)

OVARY_SIZE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # A key with units, like: gonadLengthInMM
        VOCAB.term(
            "key_with_units",
            r"""
                (?P<ambiguous_key> gonad ) \s*
                    (?P<dim> length | len | width ) \s* in \s*
                    (?P<len_units> millimeters | mm )
            """,
        ),
        VOCAB.grouper("value", " cross | number len_units? "),
        # E.g.: active, Or: immature
        VOCAB.grouper("state",
                      "active mature destroyed visible developed".split()),
        # Male or female ambiguous, like: gonadLength1
        VOCAB.grouper(
            "ambiguous",
            """
                ambiguous_key dim_side
                | side ambiguous_key dimension
                | ambiguous_key dimension
            """,
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Build a Trait saying whether the matched notation means pregnant.

    The value is "pregnant" when the token's ``pos`` group is truthy and
    "not pregnant" otherwise. The trait covers the token's start..end span.
    """
    if token.group.get("pos"):
        state = "pregnant"
    else:
        state = "not pregnant"
    return Trait(value=state, start=token.start, end=token.end)


# Parser definition for pregnancy state notations.
# NOTE(review): how Base orders and applies `rules` is defined in
# vertnet.parsers.base and is not visible here -- confirm before reordering.
PREGNANCY_STATE = Base(
    # The parser is named after the last component of this module's dotted name
    name=__name__.split(".")[-1],
    rules=[
        # One term per whitespace-separated pattern
        VOCAB.term(
            "pregnant",
            r""" prega?n?ant pregnan preg pregnancy pregnancies gravid """.
            split(),
        ),
        VOCAB.part("separator", r' [;,"] '),
        # A "none" token on either side is captured as `neg`; convert() then
        # reports "not pregnant" because no `pos` group is present.
        VOCAB.producer(convert, """ (?P<neg> pregnant none) """),
        VOCAB.producer(convert, """ (?P<neg> none pregnant ) """),
        # A bare pregnant token is captured as `pos` -> "pregnant"
        VOCAB.producer(convert, """ (?P<pos> pregnant ) """),
    ],
)
Exemple #14
0
"""Parse pregnancy state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

PREGNANCY_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.term(
            "pregnant",
            r"""
                prega?n?ant pregnan preg pregnancy pregnancies gravid
                post[\s\-]?parous multiparous nulliparous parous primiparous
            """.split(),
        ),
        VOCAB.term("joiner", r""" of were """.split()),
        VOCAB.term(
            "recent",
            r""" recently recent was previously prev """.split(),
        ),
        VOCAB.term(
            "probably",
            r"""
                probably prob possibly possible
                appears? very
                visible visibly
                evidence evident
Exemple #15
0

def typed(token):
    """Build a Trait whose value is the sum of the token's two counts.

    ``notation`` is copied from the token's ``notation`` group. ``value``
    starts as ``to_positive_int`` of the ``value1`` group and is then
    increased by ``to_positive_int`` of the ``value2`` group, which may be
    absent (``None`` is passed in that case).
    """
    groups = token.group
    result = Trait(start=token.start, end=token.end)
    result.notation = groups["notation"]
    result.value = to_positive_int(groups["value1"])
    result.value += to_positive_int(groups.get("value2"))
    return result


NIPPLE_COUNT = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        VOCAB.term("id", r" \d+-\d+ "),
        VOCAB.term("adj", r""" inguinal ing pectoral pec pr """.split()),
        VOCAB.part("number", r" number | no | [#] "),
        VOCAB.part("eq", r" is | eq | equals? | [=] "),
        # Skip arbitrary words
        VOCAB["word"],
        VOCAB["sep"],
        VOCAB.grouper("count", " (?: integer | none )(?! side ) "),
        VOCAB.grouper("modifier", "adj visible".split()),
        VOCAB.grouper("skip", " number eq? integer "),
        VOCAB.producer(
            typed,
            """ (?P<notation>
                    (?P<value1> count) modifier
                    (?P<value2> count) modifier
                ) nipple
Exemple #16
0

def fix_up(trait, text):
    """Post-process a parsed trait by delegating to ``fix_up_inches``.

    Returns whatever ``fix_up_inches(trait, text)`` returns.
    """
    # Try to disambiguate double quotes from inches
    repaired = fix_up_inches(trait, text)
    return repaired


HIND_FOOT_LENGTH = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Units are in the key, like: HindFootLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r"""( hind \s* )? foot \s* ( length | len ) \s* in \s*
                    (?P<units> millimeters | mm )
            """,
        ),
        # Standard keywords that indicate a hind foot length follows
        VOCAB.term(
            "key",
            [
                r"hind \s* foot \s* with \s* (?P<includes> claw )",
                r"hind \s* foot ( \s* ( length | len ) )?",
                "hfl | hf",
            ],
        ),
        # Some patterns require a separator
        VOCAB.part("sep", r" [;,] | $ ", capture=False),
        VOCAB.grouper("noise", " word dash ".split()),
        # Handle fractional values like: hindFoot 9/16"
    ]
    value = [round(lbs + oz, 2) for oz in ozs]
    trait.value = squash(value)
    add_flags(token, trait)
    return trait


BODY_MASS = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Looking for keys like: MassInGrams
        VOCAB.term(
            "key_with_units",
            r"""
                ( weight | mass) [\s-]* in [\s-]*
                (?P<mass_units> grams | g | lbs )
            """,
        ),
        # These words indicate a body mass follows
        VOCAB.part("key_leader", "full observed total".split()),
        # Words for weight
        VOCAB.part("weight", "weights? weigh(ed|ing|s)?".split()),
        # Keys like: w.t.
        VOCAB.part("key_with_dots", r" \b w \.? \s? t s? \.? "),
        # Common prefixes that indicate a body mass
        VOCAB.part("mass", "mass"),
        VOCAB.part("body", "body"),
        # These indicate that the mass is NOT a body mass
        VOCAB.term(
            "other_wt",
Exemple #18
0
            return None

    # Try to disambiguate double quotes from inches
    return numeric_fix_ups(trait, text)


EAR_LENGTH = Base(
    name=__name__.split(".")[-1],
    fix_up=fix_up,
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Units are in the key, like: EarLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r"""
                ear \s* ( length | len ) \s* in \s*
                (?P<len_units> millimeters | mm )
            """,
        ),
        # Abbreviation containing the measured from notation, like: e/n or e/c
        VOCAB.part(
            "char_measured_from",
            r"""
                (?<! [a-z] ) (?<! [a-z] \s )
                (?P<ambiguous_key> e ) /? (?P<measured_from1> n | c ) [-]?
                (?! \.? [a-z] )
            """,
        ),
        # The abbreviation key, just: e. This can be a problem.
        VOCAB.part(
            "char_key",
    )

    trait2 = Trait(
        value=token.group["value"][1].lower(),
        side=token.group["side"][1].lower(),
        start=token.start,
        end=token.end,
    )

    return [trait1, trait2]


OVARIES_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.term("other", """ sev somewhat few """.split()),
        # Skip words
        VOCAB.term("skip", " womb nullip ".split()),
        # VOCAB['comma'],
        VOCAB.part("sep", r" [;\(] "),
        # E.g.: ovaries and uterine horns
        # Or:   ovaries and fallopian tubes
        VOCAB.grouper(
            "ovaries",
            r"""
                ovary ( ( and? uterus horns? ) | and? fallopian )?
            """,
        ),
        # E.g.: covered in copious fat
        VOCAB.grouper("coverage", " covered word{0,2} fat "),
        # E.g.: +corpus luteum
    has_year = any(x for x in digits if len(x) >= YEAR_LEN)
    if not (has_month and has_year):
        return None

    trait = convert(token)
    if trait:
        trait.value = str(trait.value[:-2]) + '??'
    return trait


LABEL_DATE = Base(
    name=__name__.split('.')[-1],
    rules=[
        VOCAB['eol'],
        VOCAB['uuid'],  # Get rid of these before they're a problem
        VOCAB.term('label', ' date '.split()),
        VOCAB.part('digits', r'(?<! \d ) ( [12]\d{3} | \d{1,2} ) (?! \d )'),
        VOCAB.part('sep', r' [/_-]+ ', capture=False),
        VOCAB.part('noise', r""" \w+ """, priority=LOWEST, capture=False),
        VOCAB.producer(
            convert, """
            label? (?P<value> digits sep? month_name sep? digits ) """),
        VOCAB.producer(
            convert, """
            label? (?P<value> month_name sep? digits sep? digits ) """),
        VOCAB.producer(
            convert, """
            label? (?P<value> digits sep digits sep digits ) """),
        VOCAB.producer(
            short_date_digits, f"""
            label? (?P<value> digits sep digits ) """),
"""Parse testes size notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.reproductive import convert, double

VOCAB = Vocabulary(patterns.VOCAB)

TESTES_SIZE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Note: abbrev differs from the one in the testes_state_trait
        VOCAB.term("abbrev", "tes ts tnd td tns ta".split()),
        # The abbreviation key, just: t. This can be a problem.
        VOCAB.part("char_key", r" \b t (?! [a-z] )"),
        # A key with units, like: gonadLengthInMM
        VOCAB.term(
            "key_with_units",
            r"""
                (?P<ambiguous_key> gonad ) \s*
                    (?P<dim> length | len | width ) \s* in \s*
                    (?P<len_units> millimeters | mm )
            """,
        ),
        VOCAB.grouper(
            "value",
            """ cross | number len_units? (?! mass_units ) """,
        ),
Exemple #22
0
import pandas as pd
from traiter.old.vocabulary import LOWEST, Vocabulary

from digi_leap.parsers.base import Base
from digi_leap.pylib import const, patterns
from digi_leap.pylib.trait import Trait

# CSV files of plant family and genus names inside the project's data directory
PLANT_FAMILIES = const.DATA_DIR / 'itis_plant_families.csv'
PLANT_GENERA = const.DATA_DIR / 'itis_plant_genera.csv'

VOCAB = Vocabulary(patterns.VOCAB)
# Catch-all token: any run of non-whitespace characters
VOCAB.part('word', r' \S+ ', capture=False, priority=LOWEST)

# Both CSVs are read when the module is imported. dtype=str and
# na_filter=False are passed so values load as strings without NaN detection.
# Each `complete_name` column becomes the pattern list of one term.
DATA = pd.read_csv(PLANT_FAMILIES, na_filter=False, dtype=str)
VOCAB.term('plant_family', DATA['complete_name'].tolist())

# DATA is rebound here; the families frame is no longer referenced afterwards
DATA = pd.read_csv(PLANT_GENERA, na_filter=False, dtype=str)
VOCAB.term('plant_genus', DATA['complete_name'].tolist())


def convert(token):
    """Wrap the token's ``value`` group in a Trait covering the token's span."""
    begin, finish = token.start, token.end
    taxon = token.group['value']
    return Trait(start=begin, end=finish, value=taxon)


PLANT_TAXON = Base(name='plant_taxon',
                   rules=[
                       VOCAB['eol'],
                       VOCAB.producer(convert,
                                      f' (?P<value> plant_genus word+ ) ')
Exemple #23
0
VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Turn a parsed token into a Trait holding its lower-cased ``value`` group.

    After construction the trait's ``is_flag_in_token`` method is called with
    the token and the flag name "ambiguous_key".
    """
    lowered = token.group["value"].lower()
    result = Trait(value=lowered, start=token.start, end=token.end)
    result.is_flag_in_token(token, "ambiguous_key")
    return result


# Parser definition for scrotal / non-scrotal notations.
# NOTE(review): how Base orders and applies `rules` is defined in
# vertnet.parsers.base and is not visible here -- confirm before reordering.
SCROTAL_STATE = Base(
    # The parser is named after the last component of this module's dotted name
    name=__name__.split(".")[-1],
    rules=[
        # One term per whitespace-separated abbreviation
        VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()),
        VOCAB.term("scrotal_abbrev", "ns sc".split()),
        # If possible exclude length. Ex: reproductive data=testes: 11x7 mm
        # NOTE(review): no producer below references `length`; presumably
        # defining the grouper is enough to consume such tokens -- confirm.
        VOCAB.grouper("length", "cross len_units?"),
        # Testes word or abbreviation, optional "non", then a scrotal word or
        # abbreviation; the whole match is captured as `value`
        VOCAB.producer(
            convert,
            """ (?P<value>
                ( testes | testes_abbrev ) non? ( scrotal | scrotal_abbrev ) )
            """,
        ),
        # A scrotal word on its own, optionally preceded by "non"
        VOCAB.producer(convert, """ (?P<value> non? scrotal ) """),
        # A bare scrotal abbreviation is only accepted right after a label
        VOCAB.producer(convert, """ label (?P<value> scrotal_abbrev )  """),
    ],
)
        trait.right = count

    return trait


EMBRYO_COUNT = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        VOCAB["shorthand"],
        VOCAB["metric_mass"],
        VOCAB.part(
            "sex",
            r""" males? | females? | (?<! [a-z] ) [mf] (?! [a-z] ) """,
        ),
        VOCAB.term("repo_key", r""" reproductive \s data """),
        VOCAB.term("near_term", r" near[\s-]?term"),
        VOCAB.term("each_side", r" each \s side "),
        VOCAB.term("skip", r" w  wt ".split()),
        VOCAB.part("sep", r" [;] "),
        VOCAB.part("bang", r" [!] "),
        VOCAB.grouper(
            "count",
            """ none (word | plac_scar) conj | integer | none | num_words | bang """,
        ),
        VOCAB.grouper("present", " found | near_term "),
        VOCAB.grouper("numeric", " integer | real "),
        VOCAB.grouper("skip_len",
                      " ( x? numeric metric_len ) | (x numeric metric_len?) "),
        VOCAB.grouper("skip_words", " word | numeric | metric_len | eq "),
        VOCAB.grouper("side_link", " x | conj | word "),
Exemple #25
0
"""Shared reproductive trait tokens (testes & ovaries)."""

from traiter.old.vocabulary import LOWEST, Vocabulary

import vertnet.pylib.patterns as patterns

# Start from the general pattern vocabulary and add the reproductive terms
VOCAB = Vocabulary(patterns.VOCAB)

# NOTE(review): the single-letter alternative is only "f" (no "m") --
# confirm this is intentional.
VOCAB.term("sex", "females? | males? | [f]")

VOCAB.term("active", "active inactive".split())
# The word "and" (ending at a word boundary) or an ampersand
VOCAB.part("and", r" ( and \b | [&] ) ")
# The digit 1 or 2, optionally preceded by only/all/both
VOCAB.term("count", r"""( only | all | both )? \s* [12]""")

# A color word with an optional dark/light/pale modifier
VOCAB.term(
    "color",
    r""" (( dark | light | pale ) \s* )?
         ( red | pink | brown | black | white | pigmented )
    """,
)

VOCAB.term("texture", " smooth ")

VOCAB.term("covered", " covered ")

# Matches "destroy" or "destroyed"
VOCAB.term("destroyed", "destroy(ed)?")

VOCAB.part(
    "size",
    r"""
        ( very \s+ )?
"""Patterns for names."""

import pandas as pd
from traiter.old.vocabulary import Vocabulary

from digi_leap.pylib import patterns
from digi_leap.pylib.const import DATA_DIR

# CSV of name parts inside the project's data directory; read by
# build_name_parts()
NAME_CSV = DATA_DIR / 'name_parts.csv'

# Name suffixes, registered below as the `suffix` term
SUFFIXES = 'filho ii iii jr sr'.split()

VOCAB = Vocabulary(patterns.VOCAB)


def build_name_parts():
    """Register the ``name_part`` term from the names listed in NAME_CSV.

    The CSV is read with every column as ``str`` and with NaN detection turned
    off; its ``name`` column supplies the term's patterns.
    """
    frame = pd.read_csv(NAME_CSV, na_filter=False, dtype=str)
    names = frame['name'].tolist()
    VOCAB.term('name_part', names, capture=False)


# Runs at import time: reads NAME_CSV and registers the `name_part` term
build_name_parts()

VOCAB.term('suffix', SUFFIXES)
# A single alphabetic character that is not followed by (optional whitespace
# and) digits
VOCAB.term('initial', r'[[:alpha:]] (?! \s* \d+ )')
VOCAB.part("comma", r" [,] ", capture=False, priority=LOWEST)
VOCAB.part("semicolon", r" [;] ", capture=False, priority=LOWEST)
VOCAB.part("colon", r" [:] ", capture=False, priority=LOWEST)
VOCAB.part("ampersand", r" [&] ", capture=False)
VOCAB.part("eq", r" [=] ", capture=False)
VOCAB.part("plus", r" [+] ", capture=False)
VOCAB.part("under", r" [_] ", capture=False)
VOCAB.part("eol", r" [\n\r\f] ", capture=False)
VOCAB.part("dot", r" [.] ", capture=False)

# Small words
VOCAB.part("by", r" by ", capture=False)
VOCAB.part("to", r" to ", capture=False)
VOCAB.part("with", r" with ", capture=False)
VOCAB.part("up_to", r" ( up \s+ )? to ", capture=False)
VOCAB.term("and", r" and ", capture=False)
VOCAB.term("conj", " or and but ".split(), capture=False)
VOCAB.term("prep", " to with on of in ".split(), capture=False)
VOCAB.term("found", "found", capture=False)

# NOTE: Double quotes as inches is handled elsewhere
VOCAB.part(
    "inches",
    r""" (?<! [a-z] ) ( inch e? s? | in s? (?! [a-ru-wyz] ) ) (?! [:] ) """,
)
VOCAB.part(
    "feet",
    r""" (?<! [a-z] )
         ( foot s? (?! [:] ) | feet s? (?! [:] )
         | ft s? (?! [,\w]) )  | (?<= \d ) '
    """,

def fix_up(trait, text):
    """Post-process a parsed trait by delegating to ``fix_up_inches``.

    Returns whatever ``fix_up_inches(trait, text)`` returns.
    """
    # Try to disambiguate double quotes from inches
    repaired = fix_up_inches(trait, text)
    return repaired


FOREARM_LENGTH = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Units are in the key, like: ForearmLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r"""( forearm \s* )? \s* ( length | len ) \s* in \s*
                    (?P<units> millimeters | mm )
            """,
        ),
        # Standard keywords that indicate a forearm length follows
        VOCAB.term(
            "key",
            r"""
                forearm ( \s* ( length | len | l ) )?
                | fore? \s? [.]? \s? a
                | fa
            """,
        ),
        # Some patterns require a separator
        VOCAB.part("sep", r" [;,] | $ ", capture=False),
        VOCAB.grouper("noise", " word dash ".split()),
        # Handle fractional values like: forearm 9/16"
Exemple #29
0
            return None

    # Try to disambiguate double quotes from inches
    return fix_up_inches(trait, text)


TAIL_LENGTH = Base(
    name=__name__.split(".")[-1],
    fix_up=fix_up,
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Looking for keys like: tailLengthInMM
        VOCAB.term(
            "key_with_units",
            r"""
                tail \s* ( length | len ) \s* in \s*
                (?P<units> millimeters | mm )
            """,
        ),
        # The abbreviation key, just: t. This can be a problem.
        VOCAB.part(
            "char_key",
            r"""
                \b (?P<ambiguous_key> t ) (?! [a-z] ) (?! _ \D )
            """,
        ),
        # Standard keywords that indicate a tail length follows
        VOCAB.term("keyword",
                   [r" tail \s* length ", r" tail \s* len ", "tail", "tal"]),
        # Some patterns require a separator
        VOCAB.part("sep", r" [;,] | $ ", capture=False),
import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

TIME_OPTIONS = VOCAB["time_units"].pattern

LIFE_STAGE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # JSON keys for life stage
        VOCAB.term(
            "json_key",
            [
                r" life \s* stage \s* (remarks?)? ",
                r" age \s* class ",
                r" age \s* in \s* (?P<time_units> {}) ".format(TIME_OPTIONS),
                r" age ",
            ],
        ),
        # These words are life stages without a keyword indicator
        VOCAB.term(
            "intrinsic",
            [
                r" yolk \s? sac ",
                r" young [\s-]? of [\s-]? the [\s-]? year ",
                r" adult \s* young ",
                r" young \s* adult ",
            ] + """
                ads? adulte?s?
                chicks?