BODY_MASS = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Looking for keys like: MassInGrams VOCAB.term( "key_with_units", r""" ( weight | mass) [\s-]* in [\s-]* (?P<mass_units> grams | g | lbs ) """, ), # These words indicate a body mass follows VOCAB.part("key_leader", "full observed total".split()), # Words for weight VOCAB.part("weight", "weights? weigh(ed|ing|s)?".split()), # Keys like: w.t. VOCAB.part("key_with_dots", r" \b w \.? \s? t s? \.? "), # Common prefixes that indicate a body mass VOCAB.part("mass", "mass"), VOCAB.part("body", "body"), # These indicate that the mass is NOT a body mass VOCAB.term( "other_wt", """ femur baculum bacu bac spleen thymus kidney testes testis ovaries epididymis epid """.split(), ),
PLACENTAL_SCAR_COUNT = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers VOCAB["shorthand"], # Adjectives to placental scars VOCAB.term( "adj", r""" faint prominent recent old possible """.split(), ), # Skip arbitrary words VOCAB["word"], VOCAB.part("sep", r" [;/] "), VOCAB.grouper( "count", """ none embryo conj | none visible | integer | none """, ), VOCAB.producer( convert_count, """(?P<count1> count ) op (?P<count2> count ) ( eq (?P<value> count ) )? plac_scar """, ), VOCAB.producer( convert_count, """plac_scar op?
def convert(token):
    """Convert a parsed token into an enlarged / not-enlarged nipple trait.

    The producer patterns below tag a positive match as group ``pos``;
    anything without that group is reported as "not enlarged".
    """
    if token.group.get("pos"):
        state = "enlarged"
    else:
        state = "not enlarged"
    return Trait(value=state, start=token.start, end=token.end)


NIPPLES_ENLARGED = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["conj"],
        # Punctuation that can terminate a notation
        VOCAB.part("separator", r' [;"?/,] '),
        # Coded abbreviations: o/c + e + l/n means enlarged,
        # o/c + s + l/n means not enlarged
        VOCAB.term("enlarged_abbrev", r"[oc]e[ln]"),
        VOCAB.term("not_enlarged_abbrev", r"[oc]s[ln]"),
        VOCAB.term("false", """ false """),
        # Positive notations, e.g.: "nipples enlarged"
        VOCAB.producer(convert, """ (?P<pos> nipple enlarged ) """),
        VOCAB.producer(convert, """ (?P<pos> enlarged nipple ) """),
        VOCAB.producer(convert, """ (?P<pos> enlarged_abbrev ) """),
        # Negative notations, e.g.: "no nipples", "nipples not enlarged"
        VOCAB.producer(convert, """ (?P<neg> none nipple ) """),
        VOCAB.producer(convert, """ (?P<neg> nipple none ) """),
        VOCAB.producer(convert, """ (?P<neg> nipple not_enlarged ) """),
        VOCAB.producer(convert, """ (?P<neg> not_enlarged false? nipple ) """),
        VOCAB.producer(convert, """ (?P<neg> not_enlarged_abbrev ) """),
    ],
)
imagos? imms? immatures? jeunes? juvs? juveniles? juvéniles? larvae? larvals? larves? leptocephales? leptocephalus matures? metamorphs? neonates? nestlings? nulliparous premetamorphs? sub-adults? subads? subadulte?s? tadpoles? têtard yearlings? yg ygs young """.split(), ), # This indicates that the following words are NOT a life stage VOCAB.term("skip", r" determin \w* "), # Compound words separated by dashes or slashes # E.g. adult/juvenile or over-winter VOCAB.part("joiner", r" \s* [/-] \s* "), # Use this to find the end of a life stage pattern VOCAB.part("separator", r' [;,"?] | $ '), # For life stages with numbers as words in them VOCAB["ordinals"], VOCAB["time_units"], VOCAB.part("after", "after"), VOCAB.part("hatching", "hatching"), # Match any word VOCAB.part("word", r" \b \w [\w?.-]* (?! [./-] ) "), VOCAB.grouper("as_time", " after? (ordinals | hatching) time_units"), # E.g.: life stage juvenile/yearling VOCAB.producer( convert, "json_key (?P<value> ( intrinsic | word ) joiner intrinsic )"), # E.g.: life stage young adult
"""Parse v****a state notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base, convert VOCAB = Vocabulary(patterns.VOCAB) VAGINA_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.part("v****a", r""" (?<! sal ) ( v****a | vag | vulva ) """), VOCAB.term("abbrev", r""" ov cv [oc][sme][ln] vc vo """.split()), VOCAB.part( "closed", r""" closed | imperforated | imperf | cerrada | non [-\s] perforated | unperforate | non [-\s] perf | clsd | imp """, ), VOCAB.part("open", r""" open | perforated? | perf | abrir """), VOCAB.part("other", r""" swollen | plugged | plug | sealed """), VOCAB.grouper("state", """ closed | open | other """), VOCAB.producer(convert, """ (?P<value> v****a partially? state ) """), VOCAB.producer(convert, """ (?P<value> state v****a state? ) """), VOCAB.producer(convert, """ (?P<value> ( state | abbrev ) v****a? ) """), ], )
import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base, convert

# NOTE(review): `Vocabulary` is used here but its import is not visible in
# this view — presumably imported above; confirm at the top of the file.
VOCAB = Vocabulary(patterns.VOCAB)

SEX = Base(
    name=__name__.split(".")[-1],
    rules=[
        # JSON keys for sex
        VOCAB.term("sex_key", "sex"),
        # The sexes
        VOCAB.term("sex_vocab", "females? males?".split()),
        # These are words that indicate that "sex" is not a key
        VOCAB.term("not_sex", "and is was".split()),
        # Allow arbitrary words in some cases
        VOCAB.part("word", r' \b [a-z] [^;,"=:\s]* '),
        # Some patterns need a terminator
        VOCAB.part("separator", ' [;,"] | $ '),
        # Keyed form with a terminator, e.g.: sex might be female;
        VOCAB.producer(
            convert,
            """ sex_key (?P<value> ( sex_vocab | word ){1,2} quest? ) separator """,
        ),
        # E.g.: sex=female?, Or: sex=unknown
        VOCAB.producer(convert, " sex_key (?P<value> ( sex_vocab | word ) quest? ) "),
        # Bare value, e.g.: male, Or: male?
        VOCAB.producer(convert, " (?P<value> sex_vocab quest? ) "),
    ],
)
from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base, convert VOCAB = Vocabulary(patterns.VOCAB) LACTATION_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.part( "lactating", r""" ( lactating | lactation | lactated | lactate | lact | lactaing | lactacting | lactataing | lactational | oelact | celact | lactati | lactacting | lactatin | lactatting | lactatng | nursing | suckling ) \b """, ), VOCAB.part("not", r" \b ( not | non | no ) "), VOCAB.part( "post", r""" \b ( (( just | recently ) \s+ )? finished | post | recently | recent | had | pre ) """, ), VOCAB.part("pre", r" \b pre [\s\-]? "), # Separates measurements
from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.reproductive import convert, double VOCAB = Vocabulary(patterns.VOCAB) TESTES_SIZE = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Note: abbrev differs from the one in the testes_state_trait VOCAB.term("abbrev", "tes ts tnd td tns ta".split()), # The abbreviation key, just: t. This can be a problem. VOCAB.part("char_key", r" \b t (?! [a-z] )"), # A key with units, like: gonadLengthInMM VOCAB.term( "key_with_units", r""" (?P<ambiguous_key> gonad ) \s* (?P<dim> length | len | width ) \s* in \s* (?P<len_units> millimeters | mm ) """, ), VOCAB.grouper( "value", """ cross | number len_units? (?! mass_units ) """, ), VOCAB.grouper( "state",
fix_up=fix_up, rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Units are in the key, like: EarLengthInMillimeters VOCAB.term( "key_with_units", r""" ear \s* ( length | len ) \s* in \s* (?P<len_units> millimeters | mm ) """, ), # Abbreviation containing the measured from notation, like: e/n or e/c VOCAB.part( "char_measured_from", r""" (?<! [a-z] ) (?<! [a-z] \s ) (?P<ambiguous_key> e ) /? (?P<measured_from1> n | c ) [-]? (?! \.? [a-z] ) """, ), # The abbreviation key, just: e. This can be a problem. VOCAB.part( "char_key", r""" (?<! \w ) (?<! \w \s ) (?P<ambiguous_key> e ) (?! \.? \s? [a-z\(] ) """, ), # Standard keywords that indicate an ear length follows VOCAB.term( "keyword",
if not (has_month and has_year): return None trait = convert(token) if trait: trait.value = str(trait.value[:-2]) + '??' return trait LABEL_DATE = Base( name=__name__.split('.')[-1], rules=[ VOCAB['eol'], VOCAB['uuid'], # Get rid of these before they're a problem VOCAB.term('label', ' date '.split()), VOCAB.part('digits', r'(?<! \d ) ( [12]\d{3} | \d{1,2} ) (?! \d )'), VOCAB.part('sep', r' [/_-]+ ', capture=False), VOCAB.part('noise', r""" \w+ """, priority=LOWEST, capture=False), VOCAB.producer( convert, """ label? (?P<value> digits sep? month_name sep? digits ) """), VOCAB.producer( convert, """ label? (?P<value> month_name sep? digits sep? digits ) """), VOCAB.producer( convert, """ label? (?P<value> digits sep digits sep digits ) """), VOCAB.producer( short_date_digits, f""" label? (?P<value> digits sep digits ) """), VOCAB.producer(
if col_no[-1] in ('m', 'M'): return None traits[0].col_no = col_no return squash(traits) COLLECTOR = Base( name='collector', rules=[ VOCAB['eol'], VOCAB['month_name'], STATE_NAMES, VOCAB.part('col_label', r""" \b ( collect(or|ed) | coll | col ) ( \s* by )? """, capture=False), VOCAB.term('no_label', r""" number no num """.split(), capture=False), VOCAB.term('part', r""" [[:alpha:]]+ """, priority=LOWEST, capture=False), VOCAB.term('other_label', r""" art artist ass assist assistant auth authors? cartographer conservator contributor corator curator curatorial det determiner dir director ecologist editor entomologist expedition explorer extractor gardener geographer geologist georeferencer grower herbarium horticulturalist
VOCAB.term( "key_with_units", r""" ( total | snout \s* vent | head \s* body | fork ) \s* ( length | len )? \s* in \s* (?P<units> millimeters | mm ) """, ), # Various total length keys VOCAB.part( "len_key", r""" t \s* [o.]? \s* l [._]? (?! [a-z] ) | total [\s-]* length [\s-]* in | ( total | max | standard ) [\s-]* lengths? \b | meas [\s*:]? \s* length [\s(]* [l] [)\s:]* | meas ( [a-z]* )? \.? : \s* l (?! [a-z.] ) | s \.? \s? l \.? (?! [a-z.] ) | label [\s.]* lengths? \b | ( fork | mean | body ) [\s-]* lengths? \b | s \.? \s? v \.? \s? l \.? (?! [a-z.] ) | snout [\s-]* vent [\s-]* lengths? \b """, ), # Words that indicate we don't have a total length VOCAB.term("skip", " horns? tag ".split()), # The word length on its own. Make sure it isn't proceeded by a letter VOCAB.part( "ambiguous", r""" (?<! [a-z] \s* ) (?P<ambiguous_key> lengths? ) """, ), # # We don't know if this is a length until we see the units
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

# NOTE(review): `Vocabulary` and `patterns` are referenced below but their
# imports are not visible in this view — presumably imported above.
VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Convert parsed token into a trait.

    The producers tag a match as `pos` (pregnant) or `neg` (not pregnant);
    only the presence of the `pos` group is inspected here.
    """
    trait = Trait(
        value="pregnant" if token.group.get("pos") else "not pregnant",
        start=token.start,
        end=token.end,
    )
    return trait


PREGNANCY_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # Spellings (and common misspellings) indicating pregnancy
        VOCAB.term(
            "pregnant",
            r""" prega?n?ant pregnan preg pregnancy pregnancies gravid """.
            split(),
        ),
        # Punctuation that separates notations
        VOCAB.part("separator", r' [;,"] '),
        # Negated forms, e.g.: "pregnant: no" / "not pregnant"
        VOCAB.producer(convert, """ (?P<neg> pregnant none) """),
        VOCAB.producer(convert, """ (?P<neg> none pregnant ) """),
        # A bare mention is positive
        VOCAB.producer(convert, """ (?P<pos> pregnant ) """),
    ],
)
side=token.group["side"][1].lower(), start=token.start, end=token.end, ) return [trait1, trait2] OVARIES_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.term("other", """ sev somewhat few """.split()), # Skip words VOCAB.term("skip", " womb nullip ".split()), # VOCAB['comma'], VOCAB.part("sep", r" [;\(] "), # E.g.: ovaries and uterine horns # Or: ovaries and fallopian tubes VOCAB.grouper( "ovaries", r""" ovary ( ( and? uterus horns? ) | and? fallopian )? """, ), # E.g.: covered in copious fat VOCAB.grouper("coverage", " covered word{0,2} fat "), # E.g.: +corpus luteum VOCAB.grouper("luteum", " sign? corpus? (alb | lut) "), VOCAB.grouper( "value_words", """
"""Find taxon notations on herbarium specimen labels.""" import pandas as pd from traiter.old.vocabulary import LOWEST, Vocabulary from digi_leap.parsers.base import Base from digi_leap.pylib import const, patterns from digi_leap.pylib.trait import Trait PLANT_FAMILIES = const.DATA_DIR / 'itis_plant_families.csv' PLANT_GENERA = const.DATA_DIR / 'itis_plant_genera.csv' VOCAB = Vocabulary(patterns.VOCAB) VOCAB.part('word', r' \S+ ', capture=False, priority=LOWEST) DATA = pd.read_csv(PLANT_FAMILIES, na_filter=False, dtype=str) VOCAB.term('plant_family', DATA['complete_name'].tolist()) DATA = pd.read_csv(PLANT_GENERA, na_filter=False, dtype=str) VOCAB.term('plant_genus', DATA['complete_name'].tolist()) def convert(token): """Normalize a parsed taxon notation""" return Trait(start=token.start, end=token.end, value=token.group['value']) PLANT_TAXON = Base(name='plant_taxon', rules=[ VOCAB['eol'], VOCAB.producer(convert,
"""Shared token patterns.""" from traiter.old.vocabulary import FIRST, LOWEST, Vocabulary VOCAB = Vocabulary() # Chars that may be a token VOCAB.part('slash', r' [/] ', capture=False) VOCAB.part('dash', r' (?: – | - ) ', capture=False) VOCAB.part('open', r' [(\[] ', capture=False) VOCAB.part('close', r' [)\]] ', capture=False) VOCAB.part('x', r' [x×] ', capture=False) VOCAB.part('quest', r' [?] ') VOCAB.part('comma', r' [,] ', capture=False, priority=LOWEST) VOCAB.part('semicolon', r' [;] ', capture=False, priority=LOWEST) VOCAB.part('ampersand', r' [&] ', capture=False) VOCAB.part('eq', r' [=] ', capture=False) VOCAB.part('under', r' [_] ', capture=False) VOCAB.part('eol', r' [\n\r\f] ', capture=False) VOCAB.part('dot', r' [.] ', capture=False) # Small words VOCAB.part('by', r' by ', capture=False) VOCAB.part('to', r' to ', capture=False) VOCAB.part('with', r' with ', capture=False) VOCAB.part('up_to', r' ( up \s+ )? to ', capture=False) VOCAB.term('and', r' and ', capture=False) VOCAB.term('conj', ' or and '.split(), capture=False) VOCAB.term('prep', ' to with on of '.split(), capture=False) VOCAB.term('word', r' [a-z] \w* ', capture=False, priority=LOWEST)
count = to_positive_int(token.group["subcount"]) trait.value = count + count trait.left = count trait.right = count return trait EMBRYO_COUNT = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers VOCAB["shorthand"], VOCAB["metric_mass"], VOCAB.part( "sex", r""" males? | females? | (?<! [a-z] ) [mf] (?! [a-z] ) """, ), VOCAB.term("repo_key", r""" reproductive \s data """), VOCAB.term("near_term", r" near[\s-]?term"), VOCAB.term("each_side", r" each \s side "), VOCAB.term("skip", r" w wt ".split()), VOCAB.part("sep", r" [;] "), VOCAB.part("bang", r" [!] "), VOCAB.grouper( "count", """ none (word | plac_scar) conj | integer | none | num_words | bang """, ), VOCAB.grouper("present", " found | near_term "), VOCAB.grouper("numeric", " integer | real "), VOCAB.grouper("skip_len", " ( x? numeric metric_len ) | (x numeric metric_len?) "),
value="lactating" if token.group.get("pos") else "not lactating", start=token.start, end=token.end, ) return trait LACTATION_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.part( "lactating", r""" ( lactating | lactation | lactated | lactate | lact | lactaing | lactacting | lactataing | lactational | oelact | celact | lactati | lactacting | lactatin | lactatting | lactatng | nursing | suckling ) \b """, ), VOCAB.term("lactating_abbrev", r"[oc][esm]l"), VOCAB.term("not_lactating_abbrev", r"[oc][esm]n"), VOCAB.term("post", r""" post | finished """), # Separates measurements VOCAB.part("separator", r' [;"/] '), VOCAB.producer(convert, """ (?P<pos> lactating ) """), VOCAB.producer(convert, """ (?P<pos> lactating_abbrev ) """), VOCAB.producer(convert, """ (?P<neg> (none | post) lactating ) """), VOCAB.producer(convert, """ (?P<neg> lactating (none | post) ) """),
"""Fix problematic parses.""" # Try to disambiguate doubles quotes from inches return fix_up_inches(trait, text) EMBRYO_LENGTH = Base( name=__name__.split(".")[-1], fix_up=fix_up, rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers VOCAB["shorthand"], VOCAB.part( "embryo_len_key", r""" (?<! collector [\s=:.] ) (?<! reg [\s=:.] ) ( ( crown | cr ) ( [_\s\-] | \s+ to \s+ )? rump | (?<! [a-z] ) crl (?! [a-z] ) | (?<! [a-z] ) c \.? r \.? (?! [a-z] ) )""", ), VOCAB.part("len", r" (length | len) (?! [a-z] ) "), VOCAB.part("other", r" \( \s* \d+ \s* \w+ \s* \) "), VOCAB.part("separator", r' [;"/.] '), VOCAB.grouper("value", """ cross | number len_units? (?! sex ) """), VOCAB.grouper("key", """ embryo_len_key len? ( eq | colon )? """), VOCAB.grouper( "count", """ number side number side eq? | number plus number ( eq number )? """,
"""Shared reproductive trait tokens (testes & ovaries).""" from traiter.old.vocabulary import LOWEST, Vocabulary import vertnet.pylib.patterns as patterns VOCAB = Vocabulary(patterns.VOCAB) VOCAB.term("sex", "females? | males? | [f]") VOCAB.term("active", "active inactive".split()) VOCAB.part("and", r" ( and \b | [&] ) ") VOCAB.term("count", r"""( only | all | both )? \s* [12]""") VOCAB.term( "color", r""" (( dark | light | pale ) \s* )? ( red | pink | brown | black | white | pigmented ) """, ) VOCAB.term("texture", " smooth ") VOCAB.term("covered", " covered ") VOCAB.term("destroyed", "destroy(ed)?") VOCAB.part( "size", r""" ( very \s+ )?
"key_with_units", r"""( forearm \s* )? \s* ( length | len ) \s* in \s* (?P<units> millimeters | mm ) """, ), # Standard keywords that indicate a forearm length follows VOCAB.term( "key", r""" forearm ( \s* ( length | len | l ) )? | fore? \s? [.]? \s? a | fa """, ), # Some patterns require a separator VOCAB.part("sep", r" [;,] | $ ", capture=False), VOCAB.grouper("noise", " word dash ".split()), # Handle fractional values like: forearm 9/16" VOCAB.producer( fraction, [ "key len_fraction units", # E.g.: forearm = 9/16 inches "key len_fraction", # E.g.: forearm = 9/16 ], ), # A typical hind-foot notation VOCAB.producer( simple, [ "key_with_units len_range", # E.g.: forearmLengthInMM=9-10 "key noise? len_range units ", # E.g.: forearmLength=9-10 mm
"""Shared token patterns.""" from traiter.old.vocabulary import FIRST, LOWEST, Vocabulary from vertnet.pylib.util import NUM_WORDS, ORDINALS VOCAB = Vocabulary() # Chars that may be a token VOCAB.part("slash", r" [/] ", capture=False) VOCAB.part("dash", r" \p{Pd} ", capture=False) VOCAB.part("open", r" \p{Ps} ", capture=False) VOCAB.part("close", r" \p{Pe} ", capture=False) VOCAB.part("x", r" [x×] ", capture=False) VOCAB.part("quest", r" [?] ") VOCAB.part("comma", r" [,] ", capture=False, priority=LOWEST) VOCAB.part("semicolon", r" [;] ", capture=False, priority=LOWEST) VOCAB.part("colon", r" [:] ", capture=False, priority=LOWEST) VOCAB.part("ampersand", r" [&] ", capture=False) VOCAB.part("eq", r" [=] ", capture=False) VOCAB.part("plus", r" [+] ", capture=False) VOCAB.part("under", r" [_] ", capture=False) VOCAB.part("eol", r" [\n\r\f] ", capture=False) VOCAB.part("dot", r" [.] ", capture=False) # Small words VOCAB.part("by", r" by ", capture=False) VOCAB.part("to", r" to ", capture=False) VOCAB.part("with", r" with ", capture=False) VOCAB.part("up_to", r" ( up \s+ )? to ", capture=False) VOCAB.term("and", r" and ", capture=False)
def typed(token): """Convert single value tokens into a result.""" trait = Trait(start=token.start, end=token.end) trait.notation = token.group["notation"] trait.value = to_positive_int(token.group["value1"]) trait.value += to_positive_int(token.group.get("value2")) return trait NIPPLE_COUNT = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers VOCAB.term("id", r" \d+-\d+ "), VOCAB.term("adj", r""" inguinal ing pectoral pec pr """.split()), VOCAB.part("number", r" number | no | [#] "), VOCAB.part("eq", r" is | eq | equals? | [=] "), # Skip arbitrary words VOCAB["word"], VOCAB["sep"], VOCAB.grouper("count", " (?: integer | none )(?! side ) "), VOCAB.grouper("modifier", "adj visible".split()), VOCAB.grouper("skip", " number eq? integer "), VOCAB.producer( typed, """ (?P<notation> (?P<value1> count) modifier (?P<value2> count) modifier ) nipple """, ),
name=__name__.split(".")[-1], fix_up=fix_up, rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Looking for keys like: tailLengthInMM VOCAB.term( "key_with_units", r""" tail \s* ( length | len ) \s* in \s* (?P<units> millimeters | mm ) """, ), # The abbreviation key, just: t. This can be a problem. VOCAB.part( "char_key", r""" \b (?P<ambiguous_key> t ) (?! [a-z] ) (?! _ \D ) """, ), # Standard keywords that indicate a tail length follows VOCAB.term("keyword", [r" tail \s* length ", r" tail \s* len ", "tail", "tal"]), # Some patterns require a separator VOCAB.part("sep", r" [;,] | $ ", capture=False), # Consider all of these tokens a key VOCAB.grouper("key", "keyword char_key".split()), # Handle fractional values like: tailLength 9/16" VOCAB.producer( fraction, [ # E.g.: tail = 9/16 in "key len_fraction (?P<units> len_units )",
if token.group.get('us_county'): trait.us_county = token.group['us_county'].title() if token.group.get('us_state'): trait.us_state = us_states.normalize_state(token.group['us_state']) return trait ADMIN_UNIT = Base( name='us_county', rules=[ VOCAB['eol'], VOCAB.term('skip', r""" of the """.split()), VOCAB.term('co_label', r""" co | coun[tc]y """, capture=False), VOCAB.term('st_label', r""" ( plants | flora ) \s* of """, capture=False), VOCAB.term('other', r"""alluvial flood river plain """.split()), VOCAB.part('nope', r""" [(] """), VOCAB['word'], VOCAB.producer(convert, ' us_state? eol? co_label comma? us_county '), VOCAB.producer(convert, ' us_county co_label comma? us_state? '), VOCAB.producer(convert, ' us_county comma? us_state '), VOCAB.producer(convert, """ st_label us_state eol? co_label us_county """), VOCAB.producer(convert, ' st_label eol? us_state '), VOCAB.producer(convert, ' (?<! skip ) us_state (?! other | nope ) '), ])