BODY_MASS = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Looking for keys like: MassInGrams
        VOCAB.term(
            "key_with_units",
            r"""
                ( weight | mass) [\s-]* in [\s-]*
                (?P<mass_units> grams | g | lbs )
            """,
        ),
        # These words indicate a body mass follows
        VOCAB.part("key_leader", "full observed total".split()),
        # Words for weight
        VOCAB.part("weight", "weights? weigh(ed|ing|s)?".split()),
        # Keys like: w.t.
        VOCAB.part("key_with_dots", r" \b w \.? \s? t s? \.? "),
        # Common prefixes that indicate a body mass
        VOCAB.part("mass", "mass"),
        VOCAB.part("body", "body"),
        # These indicate that the mass is NOT a body mass
        VOCAB.term(
            "other_wt",
            """
                femur baculum bacu bac spleen thymus kidney
                testes testis ovaries epididymis epid
            """.split(),
        ),

PLACENTAL_SCAR_COUNT = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        VOCAB["shorthand"],
        # Adjectives to placental scars
        VOCAB.term(
            "adj",
            r"""
            faint prominent recent old possible """.split(),
        ),
        # Skip arbitrary words
        VOCAB["word"],
        VOCAB.part("sep", r" [;/] "),
        VOCAB.grouper(
            "count",
            """
                none embryo conj | none visible | integer | none
            """,
        ),
        VOCAB.producer(
            convert_count,
            """(?P<count1> count ) op (?P<count2> count )
                ( eq (?P<value> count ) )? plac_scar
            """,
        ),
        VOCAB.producer(
            convert_count,
            """plac_scar op?
def convert(token):
    """Build a nipple-enlargement trait from a parsed token.

    A "pos" capture group in the token means an "enlarged" pattern
    matched; otherwise the notation is treated as "not enlarged".
    """
    if token.group.get("pos"):
        state = "enlarged"
    else:
        state = "not enlarged"
    return Trait(value=state, start=token.start, end=token.end)


# Parser for nipple enlarged / not-enlarged notations.
# NOTE(review): producer order appears significant — negated forms are
# matched by their own producers so they are not read as positives.
NIPPLES_ENLARGED = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["conj"],
        # Punctuation that separates notations
        VOCAB.part("separator", r' [;"?/,] '),
        # Abbreviations such as "oel"/"cen" (middle letter "e" = enlarged)
        VOCAB.term("enlarged_abbrev", r"[oc]e[ln]"),
        # Abbreviations such as "osl"/"csn" (middle letter "s" = small/not enlarged)
        VOCAB.term("not_enlarged_abbrev", r"[oc]s[ln]"),
        # The literal word "false"
        VOCAB.term("false", """ false """),

        # Positive notations, e.g.: "nipples enlarged" or "enlarged nipples"
        VOCAB.producer(convert, """ (?P<pos> nipple enlarged ) """),
        VOCAB.producer(convert, """ (?P<pos> enlarged nipple ) """),
        VOCAB.producer(convert, """ (?P<pos> enlarged_abbrev ) """),

        # Negative notations, e.g.: "no nipples" or "nipples not enlarged"
        VOCAB.producer(convert, """ (?P<neg> none nipple ) """),
        VOCAB.producer(convert, """ (?P<neg> nipple none ) """),
        VOCAB.producer(convert, """ (?P<neg> nipple not_enlarged ) """),
        VOCAB.producer(convert, """ (?P<neg> not_enlarged false? nipple ) """),
        VOCAB.producer(convert, """ (?P<neg> not_enlarged_abbrev ) """),
    ],
)
         imagos? imms? immatures?
         jeunes? juvs? juveniles? juvéniles?
         larvae? larvals? larves? leptocephales? leptocephalus
         matures? metamorphs?
         neonates? nestlings? nulliparous
         premetamorphs?
         sub-adults? subads? subadulte?s?
         tadpoles? têtard
         yearlings? yg ygs young
     """.split(),
 ),
 # This indicates that the following words are NOT a life stage
 VOCAB.term("skip", r" determin \w* "),
 # Compound words separated by dashes or slashes
 # E.g. adult/juvenile or over-winter
 VOCAB.part("joiner", r" \s* [/-] \s* "),
 # Use this to find the end of a life stage pattern
 VOCAB.part("separator", r' [;,"?] | $ '),
 # For life stages with numbers as words in them
 VOCAB["ordinals"],
 VOCAB["time_units"],
 VOCAB.part("after", "after"),
 VOCAB.part("hatching", "hatching"),
 # Match any word
 VOCAB.part("word", r" \b \w [\w?.-]* (?! [./-] ) "),
 VOCAB.grouper("as_time", " after? (ordinals | hatching) time_units"),
 # E.g.: life stage juvenile/yearling
 VOCAB.producer(
     convert,
     "json_key (?P<value> ( intrinsic | word ) joiner intrinsic )"),
 # E.g.: life stage young adult
"""Parse v****a state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

# Parser for v****a open/closed state notations.
VAGINA_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # The keyword itself; the lookbehind keeps the "vag" inside words
        # like "salvage" from matching
        VOCAB.part("v****a", r""" (?<! sal ) ( v****a | vag | vulva ) """),
        # Two/three-letter state abbreviations, e.g. "vo" (open), "vc" (closed)
        VOCAB.term("abbrev", r""" ov cv [oc][sme][ln] vc vo """.split()),
        # Words (including Spanish "cerrada" and misspellings) meaning closed
        VOCAB.part(
            "closed",
            r"""
                closed | imperforated | imperf | cerrada | non [-\s] perforated
                | unperforate | non  [-\s] perf | clsd | imp
            """,
        ),
        # Words (including Spanish "abrir") meaning open
        VOCAB.part("open", r""" open | perforated? | perf | abrir """),
        # Other observable states
        VOCAB.part("other", r""" swollen | plugged | plug | sealed """),
        # Any recognized state word
        VOCAB.grouper("state", """ closed | open | other """),
        # E.g.: "v****a partially open"
        VOCAB.producer(convert, """ (?P<value> v****a partially? state ) """),
        # E.g.: "open v****a" or "swollen v****a plugged"
        VOCAB.producer(convert, """ (?P<value> state v****a state? ) """),
        # A state or abbreviation with an optional keyword
        VOCAB.producer(convert,
                       """ (?P<value> ( state | abbrev )  v****a? ) """),
    ],
)
Example #6
0
import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

# Parser for sex notations like "sex=female?" or a bare "male".
SEX = Base(
    name=__name__.split(".")[-1],
    rules=[
        # JSON keys for sex
        VOCAB.term("sex_key", "sex"),
        # The sexes
        VOCAB.term("sex_vocab", "females? males?".split()),
        # These are words that indicate that "sex" is not a key
        VOCAB.term("not_sex", "and is was".split()),
        # Allow arbitrary words in some cases
        VOCAB.part("word", r' \b [a-z] [^;,"=:\s]* '),
        # Some patterns need a terminator
        VOCAB.part("separator", ' [;,"] | $ '),
        # E.g.: sex might be female;
        # (up to two words between the key and the separator)
        VOCAB.producer(
            convert,
            """ sex_key (?P<value> ( sex_vocab | word ){1,2} quest? ) separator """,
        ),
        # E.g.: sex=female?, Or: sex=unknown
        VOCAB.producer(convert,
                       " sex_key (?P<value> ( sex_vocab | word ) quest? ) "),
        # E.g.: male, Or: male?
        VOCAB.producer(convert, " (?P<value> sex_vocab quest? ) "),
    ],
)
from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

LACTATION_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.part(
            "lactating",
            r""" (
                lactating | lactation | lactated | lactate | lact
                | lactaing | lactacting | lactataing | lactational
                | oelact | celact | lactati | lactacting | lactatin
                | lactatting | lactatng
                | nursing | suckling
                ) \b
            """,
        ),
        VOCAB.part("not", r" \b ( not | non | no ) "),
        VOCAB.part(
            "post",
            r""" \b (
                (( just | recently ) \s+ )? finished
                | post | recently | recent | had | pre
            ) """,
        ),
        VOCAB.part("pre", r" \b pre [\s\-]? "),
        # Separates measurements
from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.reproductive import convert, double

VOCAB = Vocabulary(patterns.VOCAB)

TESTES_SIZE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Note: abbrev differs from the one in the testes_state_trait
        VOCAB.term("abbrev", "tes ts tnd td tns ta".split()),
        # The abbreviation key, just: t. This can be a problem.
        VOCAB.part("char_key", r" \b t (?! [a-z] )"),
        # A key with units, like: gonadLengthInMM
        VOCAB.term(
            "key_with_units",
            r"""
                (?P<ambiguous_key> gonad ) \s*
                    (?P<dim> length | len | width ) \s* in \s*
                    (?P<len_units> millimeters | mm )
            """,
        ),
        VOCAB.grouper(
            "value",
            """ cross | number len_units? (?! mass_units ) """,
        ),
        VOCAB.grouper(
            "state",
Example #9
0
 fix_up=fix_up,
 rules=[
     VOCAB["uuid"],  # UUIDs cause problems with numbers
     # Units are in the key, like: EarLengthInMillimeters
     VOCAB.term(
         "key_with_units",
         r"""
             ear \s* ( length | len ) \s* in \s*
             (?P<len_units> millimeters | mm )
         """,
     ),
     # Abbreviation containing the measured from notation, like: e/n or e/c
     VOCAB.part(
         "char_measured_from",
         r"""
             (?<! [a-z] ) (?<! [a-z] \s )
             (?P<ambiguous_key> e ) /? (?P<measured_from1> n | c ) [-]?
             (?! \.? [a-z] )
         """,
     ),
     # The abbreviation key, just: e. This can be a problem.
     VOCAB.part(
         "char_key",
         r"""
             (?<! \w ) (?<! \w \s )
             (?P<ambiguous_key> e )
             (?! \.? \s? [a-z\(] )
         """,
     ),
     # Standard keywords that indicate an ear length follows
     VOCAB.term(
         "keyword",
    if not (has_month and has_year):
        return None

    trait = convert(token)
    if trait:
        trait.value = str(trait.value[:-2]) + '??'
    return trait


LABEL_DATE = Base(
    name=__name__.split('.')[-1],
    rules=[
        VOCAB['eol'],
        VOCAB['uuid'],  # Get rid of these before they're a problem
        VOCAB.term('label', ' date '.split()),
        VOCAB.part('digits', r'(?<! \d ) ( [12]\d{3} | \d{1,2} ) (?! \d )'),
        VOCAB.part('sep', r' [/_-]+ ', capture=False),
        VOCAB.part('noise', r""" \w+ """, priority=LOWEST, capture=False),
        VOCAB.producer(
            convert, """
            label? (?P<value> digits sep? month_name sep? digits ) """),
        VOCAB.producer(
            convert, """
            label? (?P<value> month_name sep? digits sep? digits ) """),
        VOCAB.producer(
            convert, """
            label? (?P<value> digits sep digits sep digits ) """),
        VOCAB.producer(
            short_date_digits, f"""
            label? (?P<value> digits sep digits ) """),
        VOCAB.producer(
        if col_no[-1] in ('m', 'M'):
            return None
        traits[0].col_no = col_no

    return squash(traits)


COLLECTOR = Base(
    name='collector',
    rules=[
        VOCAB['eol'],
        VOCAB['month_name'],
        STATE_NAMES,
        VOCAB.part('col_label',
                   r"""
            \b ( collect(or|ed) | coll | col ) ( \s* by )? 
            """,
                   capture=False),
        VOCAB.term('no_label', r""" number no num """.split(), capture=False),
        VOCAB.term('part',
                   r""" [[:alpha:]]+ """,
                   priority=LOWEST,
                   capture=False),
        VOCAB.term('other_label',
                   r"""
            art artist ass assist assistant auth authors?
            cartographer conservator contributor corator curator curatorial
            det determiner dir director
            ecologist editor entomologist expedition explorer extractor
            gardener geographer geologist georeferencer grower
            herbarium horticulturalist
 VOCAB.term(
     "key_with_units",
     r"""
         ( total | snout \s* vent | head \s* body | fork ) \s*
         ( length | len )? \s* in \s* (?P<units> millimeters | mm )
     """,
 ),
 # Various total length keys
 VOCAB.part(
     "len_key",
     r"""
         t \s* [o.]? \s* l [._]? (?! [a-z] )
         | total  [\s-]* length [\s-]* in
         | ( total | max | standard ) [\s-]* lengths? \b
         | meas [\s*:]? \s* length [\s(]* [l] [)\s:]*
         | meas ( [a-z]* )? \.? : \s* l (?! [a-z.] )
         | s \.? \s? l \.? (?! [a-z.] )
         | label [\s.]* lengths? \b
         | ( fork | mean | body ) [\s-]* lengths? \b
         | s \.? \s? v \.? \s? l \.? (?! [a-z.] )
         | snout [\s-]* vent [\s-]* lengths? \b
     """,
 ),
 # Words that indicate we don't have a total length
 VOCAB.term("skip", " horns? tag ".split()),
 # The word length on its own. Make sure it isn't proceeded by a letter
 VOCAB.part(
     "ambiguous",
     r""" (?<! [a-z] \s* ) (?P<ambiguous_key> lengths? ) """,
 ),
 # # We don't know if this is a length until we see the units
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Build a pregnancy-state trait from a parsed token.

    A "pos" capture group means a positive pregnancy notation matched;
    anything else is reported as "not pregnant".
    """
    state = "pregnant" if token.group.get("pos") else "not pregnant"
    trait = Trait(value=state, start=token.start, end=token.end)
    return trait


# Parser for pregnancy-state notations.
PREGNANCY_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # Words meaning pregnant, including common misspellings
        # matched by "prega?n?ant" (e.g. "pregant")
        VOCAB.term(
            "pregnant",
            r""" prega?n?ant pregnan preg pregnancy pregnancies gravid """.
            split(),
        ),
        # Separates notations
        VOCAB.part("separator", r' [;,"] '),
        # Negated notations, e.g.: "pregnant none" or "no pregnancies".
        # NOTE(review): these are listed before the bare positive,
        # presumably so negations win — confirm producer precedence.
        VOCAB.producer(convert, """ (?P<neg> pregnant none) """),
        VOCAB.producer(convert, """ (?P<neg> none pregnant ) """),
        # A bare positive notation, e.g.: "gravid"
        VOCAB.producer(convert, """ (?P<pos> pregnant ) """),
    ],
)
        side=token.group["side"][1].lower(),
        start=token.start,
        end=token.end,
    )

    return [trait1, trait2]


OVARIES_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.term("other", """ sev somewhat few """.split()),
        # Skip words
        VOCAB.term("skip", " womb nullip ".split()),
        # VOCAB['comma'],
        VOCAB.part("sep", r" [;\(] "),
        # E.g.: ovaries and uterine horns
        # Or:   ovaries and fallopian tubes
        VOCAB.grouper(
            "ovaries",
            r"""
                ovary ( ( and? uterus horns? ) | and? fallopian )?
            """,
        ),
        # E.g.: covered in copious fat
        VOCAB.grouper("coverage", " covered word{0,2} fat "),
        # E.g.: +corpus luteum
        VOCAB.grouper("luteum", " sign? corpus? (alb | lut) "),
        VOCAB.grouper(
            "value_words",
            """
Example #15
0
"""Find taxon notations on herbarium specimen labels."""

import pandas as pd
from traiter.old.vocabulary import LOWEST, Vocabulary

from digi_leap.parsers.base import Base
from digi_leap.pylib import const, patterns
from digi_leap.pylib.trait import Trait

# ITIS taxonomy vocabularies used to recognize plant names on labels
PLANT_FAMILIES = const.DATA_DIR / 'itis_plant_families.csv'
PLANT_GENERA = const.DATA_DIR / 'itis_plant_genera.csv'

VOCAB = Vocabulary(patterns.VOCAB)
# Fallback token: any non-space run, matched at the lowest priority
VOCAB.part('word', r' \S+ ', capture=False, priority=LOWEST)

# na_filter=False keeps empty cells as '' (not NaN); dtype=str avoids
# any numeric coercion of the name columns
DATA = pd.read_csv(PLANT_FAMILIES, na_filter=False, dtype=str)
VOCAB.term('plant_family', DATA['complete_name'].tolist())

DATA = pd.read_csv(PLANT_GENERA, na_filter=False, dtype=str)
VOCAB.term('plant_genus', DATA['complete_name'].tolist())


def convert(token):
    """Normalize a parsed taxon notation into a trait."""
    value = token.group['value']
    return Trait(value=value, start=token.start, end=token.end)


PLANT_TAXON = Base(name='plant_taxon',
                   rules=[
                       VOCAB['eol'],
                       VOCAB.producer(convert,
"""Shared token patterns."""

from traiter.old.vocabulary import FIRST, LOWEST, Vocabulary

VOCAB = Vocabulary()

# Chars that may be a token
VOCAB.part('slash', r' [/] ', capture=False)
# Covers both the en dash and the ASCII hyphen
VOCAB.part('dash', r' (?: – | - ) ', capture=False)
VOCAB.part('open', r' [(\[] ', capture=False)
VOCAB.part('close', r' [)\]] ', capture=False)
# "x" or the Unicode multiplication sign
VOCAB.part('x', r' [x×] ', capture=False)
VOCAB.part('quest', r' [?] ')
VOCAB.part('comma', r' [,] ', capture=False, priority=LOWEST)
VOCAB.part('semicolon', r' [;] ', capture=False, priority=LOWEST)
VOCAB.part('ampersand', r' [&] ', capture=False)
VOCAB.part('eq', r' [=] ', capture=False)
VOCAB.part('under', r' [_] ', capture=False)
# Line endings: newline, carriage return, or form feed
VOCAB.part('eol', r' [\n\r\f] ', capture=False)
VOCAB.part('dot', r' [.] ', capture=False)

# Small words
VOCAB.part('by', r' by ', capture=False)
VOCAB.part('to', r' to ', capture=False)
VOCAB.part('with', r' with ', capture=False)
VOCAB.part('up_to', r' ( up \s+ )? to ', capture=False)
VOCAB.term('and', r' and ', capture=False)
VOCAB.term('conj', ' or and '.split(), capture=False)
VOCAB.term('prep', ' to with on of '.split(), capture=False)

# Fallback: any lowercase word, matched at the lowest priority
VOCAB.term('word', r' [a-z] \w* ', capture=False, priority=LOWEST)
        count = to_positive_int(token.group["subcount"])
        trait.value = count + count
        trait.left = count
        trait.right = count

    return trait


EMBRYO_COUNT = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        VOCAB["shorthand"],
        VOCAB["metric_mass"],
        VOCAB.part(
            "sex",
            r""" males? | females? | (?<! [a-z] ) [mf] (?! [a-z] ) """,
        ),
        VOCAB.term("repo_key", r""" reproductive \s data """),
        VOCAB.term("near_term", r" near[\s-]?term"),
        VOCAB.term("each_side", r" each \s side "),
        VOCAB.term("skip", r" w  wt ".split()),
        VOCAB.part("sep", r" [;] "),
        VOCAB.part("bang", r" [!] "),
        VOCAB.grouper(
            "count",
            """ none (word | plac_scar) conj | integer | none | num_words | bang """,
        ),
        VOCAB.grouper("present", " found | near_term "),
        VOCAB.grouper("numeric", " integer | real "),
        VOCAB.grouper("skip_len",
                      " ( x? numeric metric_len ) | (x numeric metric_len?) "),
        value="lactating" if token.group.get("pos") else "not lactating",
        start=token.start,
        end=token.end,
    )
    return trait


LACTATION_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.part(
            "lactating",
            r""" (
                lactating | lactation | lactated | lactate | lact
                | lactaing | lactacting | lactataing | lactational
                | oelact | celact | lactati | lactacting | lactatin
                | lactatting | lactatng
                | nursing | suckling
                ) \b
            """,
        ),
        VOCAB.term("lactating_abbrev", r"[oc][esm]l"),
        VOCAB.term("not_lactating_abbrev", r"[oc][esm]n"),
        VOCAB.term("post", r""" post | finished """),

        # Separates measurements
        VOCAB.part("separator", r' [;"/] '),
        VOCAB.producer(convert, """ (?P<pos> lactating ) """),
        VOCAB.producer(convert, """ (?P<pos> lactating_abbrev ) """),
        VOCAB.producer(convert, """ (?P<neg> (none | post) lactating ) """),
        VOCAB.producer(convert, """ (?P<neg> lactating (none | post) ) """),
Example #19
0
    """Fix problematic parses."""
    # Try to disambiguate doubles quotes from inches
    return fix_up_inches(trait, text)


EMBRYO_LENGTH = Base(
    name=__name__.split(".")[-1],
    fix_up=fix_up,
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        VOCAB["shorthand"],
        VOCAB.part(
            "embryo_len_key",
            r"""
            (?<! collector [\s=:.] ) (?<! reg [\s=:.] ) (
                ( crown | cr ) ( [_\s\-] | \s+ to \s+ )? rump
                | (?<! [a-z] ) crl (?! [a-z] )
                | (?<! [a-z] ) c \.? r \.? (?! [a-z] )
            )""",
        ),
        VOCAB.part("len", r" (length | len) (?! [a-z] ) "),
        VOCAB.part("other", r" \( \s* \d+ \s* \w+ \s* \) "),
        VOCAB.part("separator", r' [;"/.] '),
        VOCAB.grouper("value", """ cross | number len_units? (?! sex ) """),
        VOCAB.grouper("key", """ embryo_len_key len? ( eq | colon )? """),
        VOCAB.grouper(
            "count",
            """
            number side number side eq?
            | number plus number ( eq number )?
            """,
Example #20
0
"""Shared reproductive trait tokens (testes & ovaries)."""

from traiter.old.vocabulary import LOWEST, Vocabulary

import vertnet.pylib.patterns as patterns

VOCAB = Vocabulary(patterns.VOCAB)

# Sex words or a single-letter "f" abbreviation
VOCAB.term("sex", "females? | males? | [f]")

VOCAB.term("active", "active inactive".split())
# The word "and" or an ampersand
VOCAB.part("and", r" ( and \b | [&] ) ")
# A count of 1 or 2, optionally qualified (e.g. "both 2")
VOCAB.term("count", r"""( only | all | both )? \s* [12]""")

# Color descriptions, optionally shaded (e.g. "dark red")
VOCAB.term(
    "color",
    r""" (( dark | light | pale ) \s* )?
         ( red | pink | brown | black | white | pigmented )
    """,
)

VOCAB.term("texture", " smooth ")

VOCAB.term("covered", " covered ")

# "destroy" or "destroyed"
VOCAB.term("destroyed", "destroy(ed)?")

VOCAB.part(
    "size",
    r"""
        ( very \s+ )?
     "key_with_units",
     r"""( forearm \s* )? \s* ( length | len ) \s* in \s*
             (?P<units> millimeters | mm )
     """,
 ),
 # Standard keywords that indicate a forearm length follows
 VOCAB.term(
     "key",
     r"""
         forearm ( \s* ( length | len | l ) )?
         | fore? \s? [.]? \s? a
         | fa
     """,
 ),
 # Some patterns require a separator
 VOCAB.part("sep", r" [;,] | $ ", capture=False),
 VOCAB.grouper("noise", " word dash ".split()),
 # Handle fractional values like: forearm 9/16"
 VOCAB.producer(
     fraction,
     [
         "key len_fraction units",  # E.g.: forearm = 9/16 inches
         "key len_fraction",  # E.g.: forearm = 9/16
     ],
 ),
 # A typical hind-foot notation
 VOCAB.producer(
     simple,
     [
         "key_with_units len_range",  # E.g.: forearmLengthInMM=9-10
         "key noise? len_range units ",  # E.g.: forearmLength=9-10 mm
Example #22
0
"""Shared token patterns."""

from traiter.old.vocabulary import FIRST, LOWEST, Vocabulary

from vertnet.pylib.util import NUM_WORDS, ORDINALS

VOCAB = Vocabulary()

# Chars that may be a token
VOCAB.part("slash", r" [/] ", capture=False)
# \p{Pd}/\p{Ps}/\p{Pe} are Unicode dash / open / close punctuation
# categories — NOTE(review): requires the third-party `regex` engine,
# not stdlib `re`; confirm which module compiles these patterns
VOCAB.part("dash", r" \p{Pd} ", capture=False)
VOCAB.part("open", r" \p{Ps} ", capture=False)
VOCAB.part("close", r" \p{Pe} ", capture=False)
# "x" or the Unicode multiplication sign
VOCAB.part("x", r" [x×] ", capture=False)
VOCAB.part("quest", r" [?] ")
VOCAB.part("comma", r" [,] ", capture=False, priority=LOWEST)
VOCAB.part("semicolon", r" [;] ", capture=False, priority=LOWEST)
VOCAB.part("colon", r" [:] ", capture=False, priority=LOWEST)
VOCAB.part("ampersand", r" [&] ", capture=False)
VOCAB.part("eq", r" [=] ", capture=False)
VOCAB.part("plus", r" [+] ", capture=False)
VOCAB.part("under", r" [_] ", capture=False)
# Line endings: newline, carriage return, or form feed
VOCAB.part("eol", r" [\n\r\f] ", capture=False)
VOCAB.part("dot", r" [.] ", capture=False)

# Small words
VOCAB.part("by", r" by ", capture=False)
VOCAB.part("to", r" to ", capture=False)
VOCAB.part("with", r" with ", capture=False)
VOCAB.part("up_to", r" ( up \s+ )? to ", capture=False)
VOCAB.term("and", r" and ", capture=False)
Example #23
0
def typed(token):
    """Convert single-value tokens into a nipple-count trait."""
    group = token.group
    trait = Trait(start=token.start, end=token.end)
    trait.notation = group["notation"]
    # Sum both count captures; "value2" may be absent from the match.
    total = to_positive_int(group["value1"])
    total += to_positive_int(group.get("value2"))
    trait.value = total
    return trait


NIPPLE_COUNT = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        VOCAB.term("id", r" \d+-\d+ "),
        VOCAB.term("adj", r""" inguinal ing pectoral pec pr """.split()),
        VOCAB.part("number", r" number | no | [#] "),
        VOCAB.part("eq", r" is | eq | equals? | [=] "),
        # Skip arbitrary words
        VOCAB["word"],
        VOCAB["sep"],
        VOCAB.grouper("count", " (?: integer | none )(?! side ) "),
        VOCAB.grouper("modifier", "adj visible".split()),
        VOCAB.grouper("skip", " number eq? integer "),
        VOCAB.producer(
            typed,
            """ (?P<notation>
                    (?P<value1> count) modifier
                    (?P<value2> count) modifier
                ) nipple
            """,
        ),
Example #24
0
 name=__name__.split(".")[-1],
 fix_up=fix_up,
 rules=[
     VOCAB["uuid"],  # UUIDs cause problems with numbers
     # Looking for keys like: tailLengthInMM
     VOCAB.term(
         "key_with_units",
         r"""
             tail \s* ( length | len ) \s* in \s*
             (?P<units> millimeters | mm )
         """,
     ),
     # The abbreviation key, just: t. This can be a problem.
     VOCAB.part(
         "char_key",
         r"""
             \b (?P<ambiguous_key> t ) (?! [a-z] ) (?! _ \D )
         """,
     ),
     # Standard keywords that indicate a tail length follows
     VOCAB.term("keyword",
                [r" tail \s* length ", r" tail \s* len ", "tail", "tal"]),
     # Some patterns require a separator
     VOCAB.part("sep", r" [;,] | $ ", capture=False),
     # Consider all of these tokens a key
     VOCAB.grouper("key", "keyword char_key".split()),
     # Handle fractional values like: tailLength 9/16"
     VOCAB.producer(
         fraction,
         [
             # E.g.: tail = 9/16 in
             "key len_fraction (?P<units> len_units )",
Example #25
0
    if token.group.get('us_county'):
        trait.us_county = token.group['us_county'].title()

    if token.group.get('us_state'):
        trait.us_state = us_states.normalize_state(token.group['us_state'])

    return trait


# Parser for US state/county administrative units on specimen labels.
ADMIN_UNIT = Base(
    name='us_county',
    rules=[
        VOCAB['eol'],
        # Filler words to ignore between parts
        VOCAB.term('skip', r""" of the """.split()),
        # County labels, including the "county"/"councy" spellings
        VOCAB.term('co_label', r""" co | coun[tc]y """, capture=False),
        # Phrases like "plants of" / "flora of" that precede a state name
        VOCAB.term('st_label', r"""
            ( plants | flora ) \s* of """, capture=False),
        # Geographic words that mean a bare state match is NOT an admin unit
        VOCAB.term('other', r"""alluvial flood river plain """.split()),
        # An opening parenthesis blocks a bare state match
        VOCAB.part('nope', r""" [(] """),
        VOCAB['word'],

        # E.g.: "Texas, Travis County" or "Co. Travis"
        VOCAB.producer(convert, ' us_state? eol? co_label comma? us_county '),
        # E.g.: "Travis County, Texas"
        VOCAB.producer(convert, ' us_county co_label comma? us_state? '),
        # E.g.: "Travis, Texas"
        VOCAB.producer(convert, ' us_county comma? us_state '),
        # E.g.: "Flora of Texas, Travis County"
        VOCAB.producer(convert, """
            st_label us_state eol? co_label us_county """),
        # E.g.: "Plants of Texas"
        VOCAB.producer(convert, ' st_label eol? us_state '),
        # A bare state name, unless context says otherwise
        VOCAB.producer(convert, ' (?<! skip ) us_state (?! other | nope ) '),
    ])