"abdominal non descended",
                "abdominal descended",
                "non descended",
                "fully descended",
                "partially descended",
                "size non descended",
                "size descended",
                "descended",
            ],
        ),
        # Simplify the testes length so it can be skipped easily
        VOCAB.grouper("length", "cross len_units?"),
        VOCAB.producer(
            convert,
            r""" (?P<value>
                ( testes | abbrev | ambiguous_key ) length?
                    ( state | abdominal | size )
                    ( conj? ( state | size ) )?
            ) """,
        ),
        VOCAB.producer(
            convert,
            r""" (?P<value> non ( testes | abbrev | ambiguous_key ) ( state )? ) """,
        ),
        VOCAB.producer(
            convert,
            """ label
                (?P<value> ( testes | abbrev )? length? size ( conj? state )? )
            """,
        ),
    ],
)
"""Parse v****a state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

# Parser for vagina state notations (e.g. open, closed, swollen, plugged).
#
# The organ token was previously spelled with a censored placeholder
# containing stacked "*" quantifiers, which is neither a valid regular
# expression nor a usable token name; it is restored to "vagina" here.
# NOTE(review): patterns are written with free spacing, so they are
# presumably compiled in verbose mode by the rule engine -- confirm.
VAGINA_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # The organ itself; the look-behind rejects a match that directly
        # follows "sal".
        VOCAB.part("vagina", r""" (?<! sal ) ( vagina | vag | vulva ) """),
        # Abbreviated notations accepted in place of a spelled-out state
        VOCAB.term("abbrev", r""" ov cv [oc][sme][ln] vc vo """.split()),
        # Words that are grouped into a "closed" state
        VOCAB.part(
            "closed",
            r"""
                closed | imperforated | imperf | cerrada | non [-\s] perforated
                | unperforate | non  [-\s] perf | clsd | imp
            """,
        ),
        # Words that are grouped into an "open" state
        VOCAB.part("open", r""" open | perforated? | perf | abrir """),
        # Remaining recognized states
        VOCAB.part("other", r""" swollen | plugged | plug | sealed """),
        # Any of the three state families above
        VOCAB.grouper("state", """ closed | open | other """),
        # E.g.: vagina partially open
        VOCAB.producer(convert, """ (?P<value> vagina partially? state ) """),
        # E.g.: closed vagina, optionally followed by a second state
        VOCAB.producer(convert, """ (?P<value> state vagina state? ) """),
        # A bare state or abbreviation, optionally followed by the organ
        VOCAB.producer(convert,
                       """ (?P<value> ( state | abbrev )  vagina? ) """),
    ],
)
def convert(token):
    """Build a nipples-enlarged trait from a matched token.

    The trait value is "enlarged" when the match captured a truthy "pos"
    group and "not enlarged" otherwise.  The trait spans the same
    start/end offsets as the token.
    """
    if token.group.get("pos"):
        state = "enlarged"
    else:
        state = "not enlarged"
    return Trait(value=state, start=token.start, end=token.end)


# Parser for enlarged / not-enlarged nipple notations.
#
# Producers that capture a "pos" group make convert() report "enlarged";
# the producers that capture "neg" instead make it report "not enlarged".
# NOTE(review): `nipple`, `enlarged`, `not_enlarged` and `none` are not
# defined in this rule list; presumably they come from the shared
# vocabulary -- confirm.
NIPPLES_ENLARGED = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["conj"],
        # Punctuation recognized as a separator token
        VOCAB.part("separator", r' [;"?/,] '),
        # Three-letter abbreviations: a middle "e" is used by the positive
        # producers below, a middle "s" by the negative ones
        VOCAB.term("enlarged_abbrev", r"[oc]e[ln]"),
        VOCAB.term("not_enlarged_abbrev", r"[oc]s[ln]"),
        VOCAB.term("false", """ false """),

        # Positive notations (value becomes "enlarged")
        VOCAB.producer(convert, """ (?P<pos> nipple enlarged ) """),
        VOCAB.producer(convert, """ (?P<pos> enlarged nipple ) """),
        VOCAB.producer(convert, """ (?P<pos> enlarged_abbrev ) """),

        # Negative notations (value becomes "not enlarged")
        VOCAB.producer(convert, """ (?P<neg> none nipple ) """),
        VOCAB.producer(convert, """ (?P<neg> nipple none ) """),
        VOCAB.producer(convert, """ (?P<neg> nipple not_enlarged ) """),
        VOCAB.producer(convert, """ (?P<neg> not_enlarged false? nipple ) """),
        VOCAB.producer(convert, """ (?P<neg> not_enlarged_abbrev ) """),
    ],
)
     "value_words",
     """
         size mature coverage luteum color corpus other active destroyed alb
         visible developed cyst texture fallopian luteum
     """.split(),
 ),
 VOCAB.grouper(
     "values",
     """
     ( value_words ( and | comma ) | non )?
     value_words """,
 ),
 VOCAB.producer(
     convert,
     """
         side? ovaries side? ( word | number | comma ){0,5}
         (?P<value> values+ )
     """,
 ),
 VOCAB.producer(
     convert,
     """
     (?P<value> values+ ) ( word | number | comma ){0,5}
        ( (?<! comma ) side )? (?<! comma ) ovaries """,
 ),
 # Get left and right side measurements
 # E.g.: ovaries: R 2 c. alb, L sev c. alb
 VOCAB.producer(
     double,
     r"""
         ovaries
    return trait


# Parser for lactation state notations.
#
# NOTE(review): the "pos" / "neg" group names are presumably turned into
# lactating / not lactating by this module's convert() -- confirm there.
# `none` is not defined in this rule list; presumably it comes from the
# shared vocabulary.
LACTATION_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # Spellings (including misspellings) of "lactating", plus "nursing"
        # and "suckling"; the match must end at a word boundary.
        # NOTE(review): "lactacting" appears twice in the alternation; the
        # second occurrence is redundant.
        VOCAB.part(
            "lactating",
            r""" (
                lactating | lactation | lactated | lactate | lact
                | lactaing | lactacting | lactataing | lactational
                | oelact | celact | lactati | lactacting | lactatin
                | lactatting | lactatng
                | nursing | suckling
                ) \b
            """,
        ),
        # Three-letter abbreviations: a trailing "l" is used by a positive
        # producer below, a trailing "n" by a negative one
        VOCAB.term("lactating_abbrev", r"[oc][esm]l"),
        VOCAB.term("not_lactating_abbrev", r"[oc][esm]n"),
        # Words that, next to a lactating term, mark a negative notation
        VOCAB.term("post", r""" post | finished """),

        # Separates measurements
        VOCAB.part("separator", r' [;"/] '),
        # Positive notations
        VOCAB.producer(convert, """ (?P<pos> lactating ) """),
        VOCAB.producer(convert, """ (?P<pos> lactating_abbrev ) """),
        # Negative notations: a negation or "post" word on either side
        VOCAB.producer(convert, """ (?P<neg> (none | post) lactating ) """),
        VOCAB.producer(convert, """ (?P<neg> lactating (none | post) ) """),
        VOCAB.producer(convert, """ (?P<neg> not_lactating_abbrev ) """),
    ],
)
 VOCAB.term(
     "key",
     r"""
         forearm ( \s* ( length | len | l ) )?
         | fore? \s? [.]? \s? a
         | fa
     """,
 ),
 # Some patterns require a separator
 VOCAB.part("sep", r" [;,] | $ ", capture=False),
 VOCAB.grouper("noise", " word dash ".split()),
 # Handle fractional values like: forearm 9/16"
 VOCAB.producer(
     fraction,
     [
         "key len_fraction units",  # E.g.: forearm = 9/16 inches
         "key len_fraction",  # E.g.: forearm = 9/16
     ],
 ),
 # A typical forearm notation
 VOCAB.producer(
     simple,
     [
         "key_with_units len_range",  # E.g.: forearmLengthInMM=9-10
         "key noise? len_range units ",  # E.g.: forearmLength=9-10 mm
         "key noise? len_range",  # Missing units like: forearm 9-10
         "key dash number units?",
         "number key units?",
     ],
 ),
 VOCAB.producer(
 # Separators
 VOCAB["word"],
 VOCAB["semicolon"],
 VOCAB["comma"],
 # Any key not preceded by "other_wt" is considered a weight key
 VOCAB.grouper(
     "wt_key",
     """
         (?<! other_wt )
         ( key_leader weight | key_leader mass
             | body weight | body mass | body
             | weight | mass | key_with_dots )
     """,
 ),
 VOCAB.grouper("key", " wt_key ".split()),
 VOCAB.producer(compound, " key? compound_wt "),
 # Shorthand notation like: on tag: 11-22-33-44=99g
 VOCAB.producer(
     shorthand,
     [
         "key shorthand",
         "shorthand",
         "key shorthand_bats",
         "shorthand_bats",
     ],
 ),
 VOCAB.producer(simple_mass,
                " wt_key mass_units number (?! len_units ) "),
 VOCAB.producer(simple_mass, " wt_key mass_range "),
 VOCAB.producer(simple_mass,
                " ( key | triple_key ) mass_range mass_units "),
 VOCAB.grouper("value", " cross | number len_units? "),
 # E.g.: active, Or: immature
 VOCAB.grouper("state",
               "active mature destroyed visible developed".split()),
 # Male or female ambiguous, like: gonadLength1
 VOCAB.grouper(
     "ambiguous",
     """
         ambiguous_key dim_side
         | side ambiguous_key dimension
         | ambiguous_key dimension
     """,
 ),
 # These patterns contain measurements to both left & right ovaries
 # E.g.: reproductive data: ovaries left 10x5 mm, right 10x6 mm
 VOCAB.producer(double, """ label ovary side_cross """),
 # As above but without the ovaries marker:
 # E.g.: reproductive data: left 10x5 mm, right 10x6 mm
 VOCAB.producer(double, """label side_cross"""),
 # Has the ovaries marker but is lacking the label
 # E.g.: ovaries left 10x5 mm, right 10x6 mm
 VOCAB.producer(double, """ ovary side_cross """),
 # A typical ovary size notation
 # E.g.: reproductive data: ovaries 10x5 mm
 VOCAB.producer(convert, " label ovary value "),
 # E.g.: reproductive data: left ovaries 10x5 mm
 VOCAB.producer(convert, " label side ovary value "),
 # E.g.: left ovaries 10x5 mm
 VOCAB.producer(convert, " side ovary value "),
 # May have a few words between the label and the measurement
 VOCAB.producer(
Example #9
0
     r"""( tragus \s* ) \s* ( length | len ) \s* in \s*
             (?P<units> millimeters | mm ) """,
 ),
 # Standard keywords that indicate a tragus length follows
 VOCAB.term(
     "key",
     r""" ( tragus | trag | tragi ) \s* (length | len | l )? | tr """,
 ),
 # Some patterns require a separator
 VOCAB.part("sep", r" [;,] | $ ", capture=False),
 VOCAB.grouper("noise", " word dash ".split()),
 # Handle fractional values like: tragus 9/16"
 VOCAB.producer(
     fraction,
     [
         "key len_fraction units",  # E.g.: tragus = 9/16 inches
         "key len_fraction",  # E.g.: tragus = 9/16
     ],
 ),
 # A typical tragus notation
 VOCAB.producer(
     simple,
     [
         "key_with_units len_range",  # E.g.: tragusLengthInMM=9-10
         "key noise? len_range units ",  # E.g.: tragusLengthInMM=9-10 mm
         "key noise? len_range",  # Missing units: tragusLengthInMM 9-10
         "key dash? number units?",
     ],
 ),
 VOCAB.producer(
     partial(shorthand_length, measurement="shorthand_tr"),
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Build a pregnancy-state trait from a matched token.

    The trait value is "pregnant" when the match captured a truthy "pos"
    group and "not pregnant" otherwise.  The trait spans the same
    start/end offsets as the token.
    """
    if token.group.get("pos"):
        state = "pregnant"
    else:
        state = "not pregnant"
    return Trait(value=state, start=token.start, end=token.end)


# Parser for pregnancy state notations.
#
# Producers that capture a "pos" group make convert() report "pregnant";
# the ones that capture "neg" make it report "not pregnant".
# NOTE(review): `none` is not defined in this rule list; presumably it
# comes from the shared vocabulary -- confirm.
PREGNANCY_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # Spellings and abbreviations of "pregnant", plus "gravid"
        VOCAB.term(
            "pregnant",
            r""" prega?n?ant pregnan preg pregnancy pregnancies gravid """.
            split(),
        ),
        # Punctuation recognized as a separator token
        VOCAB.part("separator", r' [;,"] '),
        # Negated forms are listed before the bare positive form.
        # NOTE(review): presumably so that they take precedence -- confirm
        # how the rule engine orders producers.
        VOCAB.producer(convert, """ (?P<neg> pregnant none) """),
        VOCAB.producer(convert, """ (?P<neg> none pregnant ) """),
        VOCAB.producer(convert, """ (?P<pos> pregnant ) """),
    ],
)
        start=token.start,
        end=token.end,
    )
    return trait


SCROTAL_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()),
        VOCAB.term("scrotal_abbrev_pos", "sc".split()),
        VOCAB.term("scrotal_abbrev_neg", "ns ".split()),

        # If possible exclude length. Ex: reproductive data=testes: 11x7 mm
        VOCAB.grouper("length", "cross len_units?"),
        VOCAB.producer(convert, """ (?P<pos> scrotal_pos ) """),
        VOCAB.producer(
            convert,
            """ (?P<pos> (testes | testes_abbrev | label) scrotal_abbrev_pos ) """
        ),
        VOCAB.producer(
            convert,
            """ (?P<pos> scrotal_abbrev_pos (testes | testes_abbrev) ) """),
        VOCAB.producer(convert, """ (?P<neg> scrotal_neg ) """),
        VOCAB.producer(convert, """ (?P<neg> scrotal_pos none ) """),
        VOCAB.producer(convert, """ (?P<neg> none scrotal_pos ) """),
        VOCAB.producer(
            convert,
            """ (?P<neg> (testes | testes_abbrev | label) scrotal_abbrev_neg ) """
        ),
        VOCAB.producer(
Example #12
0
        ),
        VOCAB.term("joiner", r""" of were """.split()),
        VOCAB.term(
            "recent",
            r""" recently recent was previously prev """.split(),
        ),
        VOCAB.term(
            "probably",
            r"""
                probably prob possibly possible
                appears? very
                visible visibly
                evidence evident
            """.split(),
        ),
        VOCAB.term("stage", r" early late mid ".split()),
        VOCAB.part("separator", r' [;,"] '),
        # E.g.: pregnancy visible
        VOCAB.producer(
            convert, """ (?P<value> pregnant joiner? none? probably quest? ) """
        ),
        # E.g.: Probably early pregnancy
        VOCAB.producer(
            convert,
            """ (?P<value> none? (recent | probably)?
                stage? (none | joiner)? pregnant quest? )
            """,
        ),
    ],
)
Example #13
0
     ),
     # Separates measurements
     VOCAB.part("separator", r' [;"?/,] '),
     # Skip arbitrary words
     VOCAB["word"],
     VOCAB.grouper(
         "state_end",
         """
             ( size | fully | partially | other | lactation | color | false
                 | visible | tissue | present | active | developed )
         """,
     ),
     VOCAB.grouper("state_mid", """ ( uterus | and ) """),
     VOCAB.producer(
         convert,
         """(?P<value> non?
             (state_end | much) (state_mid | state_end){0,2} nipple)
         """,
     ),
     VOCAB.producer(
         convert,
         """(?P<value> non? nipple
             (state_end | much) (state_mid | state_end){0,2} )
         """,
     ),
     VOCAB.producer(
         convert,
         """(?P<value> nipple non?
             (state_end | much) (state_mid | state_end){0,2} )
         """,
     ),
 ],
 # Compound words separated by dashes or slashes
 # E.g. adult/juvenile or over-winter
 VOCAB.part("joiner", r" \s* [/-] \s* "),
 # Use this to find the end of a life stage pattern
 VOCAB.part("separator", r' [;,"?] | $ '),
 # For life stages with numbers as words in them
 VOCAB["ordinals"],
 VOCAB["time_units"],
 VOCAB.part("after", "after"),
 VOCAB.part("hatching", "hatching"),
 # Match any word
 VOCAB.part("word", r" \b \w [\w?.-]* (?! [./-] ) "),
 VOCAB.grouper("as_time", " after? (ordinals | hatching) time_units"),
 # E.g.: life stage juvenile/yearling
 VOCAB.producer(
     convert,
     "json_key (?P<value> ( intrinsic | word ) joiner intrinsic )"),
 # E.g.: life stage young adult
 VOCAB.producer(convert,
                "json_key (?P<value> ( intrinsic | word ) intrinsic )"),
 # E.g.: life stage yearling
 VOCAB.producer(convert, "json_key (?P<value> intrinsic )"),
 # A sequence of words bracketed by a keyword and a separator
 # E.g.: LifeStage Remarks: 5-6 wks;
 VOCAB.producer(
     convert,
     """ json_key (?P<value> ( intrinsic | word | joiner ){1,5} )
     separator """,
 ),
 # E.g.: LifeStage = 1st month
 VOCAB.producer(convert, "json_key (?P<value> as_time )"),
 VOCAB.part("bang", r" [!] "),
 VOCAB.grouper(
     "count",
     """ none (word | plac_scar) conj | integer | none | num_words | bang """,
 ),
 VOCAB.grouper("present", " found | near_term "),
 VOCAB.grouper("numeric", " integer | real "),
 VOCAB.grouper("skip_len",
               " ( x? numeric metric_len ) | (x numeric metric_len?) "),
 VOCAB.grouper("skip_words", " word | numeric | metric_len | eq "),
 VOCAB.grouper("side_link", " x | conj | word "),
 VOCAB.grouper("between", "side_link? | skip_words{,4}"),
 VOCAB.producer(
     convert,
     """ embryo eq? (?P<total> count ) skip_len?
         (?P<sub> side ) (?P<subcount> count ) between
         (?P<sub> side ) (?P<subcount> count )
     """,
 ),
 VOCAB.producer(
     convert,
     """ embryo eq? (?P<sub> side ) (?P<subcount> count ) between
         embryo?    (?P<sub> side ) (?P<subcount> count ) embryo?
     """,
 ),
 VOCAB.producer(
     convert,
     """ embryo eq? (?P<total> count ) skip_words{,4}
             (?P<subcount> count ) (?P<sub> side ) between
             (?P<subcount> count ) (?P<sub> side )
     """,
 VOCAB["semicolon"],
 VOCAB["comma"],
 VOCAB.grouper(
     "key",
     """ ( key_with_units | len_key | ambiguous | char_key ) ( eq | dash )? """,
 ),
 VOCAB.grouper(
     "value",
     """ len_range | number (?P<units> len_units )? (?! mass_units ) """,
 ),
 VOCAB.grouper(
     "value_units",
     """ len_range | number (?P<units> len_units ) """,
 ),
 # E.g.: 10 to 11 inches TL
 VOCAB.producer(simple, "value (?P<units> len_units ) key"),
 VOCAB.producer(simple, """ key value key? """),
 VOCAB.producer(simple, """ key (?P<units> len_units ) value """),
 VOCAB.producer(
     simple,
     """ key_units_req ( value_units | triple_key ) """,
 ),
 # E.g.: total length 4 feet 7 inches
 VOCAB.producer(compound, " key? compound_len "),
 # Handle fractional values like: total length 9/16"
 # E.g.: total = 9/16 inches
 VOCAB.producer(fraction,
                "key_units_req len_fraction (?P<units> len_units )"),
 # E.g.: svl 9/16 inches
 VOCAB.producer(fraction, "key len_fraction (?P<units> len_units )"),
 # E.g.: len 9/16 in
Example #17
0
            (?<! collector [\s=:.] ) (?<! reg [\s=:.] ) (
                ( crown | cr ) ( [_\s\-] | \s+ to \s+ )? rump
                | (?<! [a-z] ) crl (?! [a-z] )
                | (?<! [a-z] ) c \.? r \.? (?! [a-z] )
            )""",
        ),
        VOCAB.part("len", r" (length | len) (?! [a-z] ) "),
        VOCAB.part("other", r" \( \s* \d+ \s* \w+ \s* \) "),
        VOCAB.part("separator", r' [;"/.] '),
        VOCAB.grouper("value", """ cross | number len_units? (?! sex ) """),
        VOCAB.grouper("key", """ embryo_len_key len? ( eq | colon )? """),
        VOCAB.grouper(
            "count",
            """
            number side number side eq?
            | number plus number ( eq number )?
            """,
        ),
        VOCAB.grouper("skip", " prep word cross | other | side "),
        VOCAB.producer(convert, """ embryo? key value quest? """),
        VOCAB.producer(convert, """ embryo? x? value key quest? """),
        VOCAB.producer(convert_many,
                       """ embryo count? value{2,} (?! skip ) quest? """),
        VOCAB.producer(convert, """ embryo? key x? value quest? """),
        VOCAB.producer(convert, """ embryo? x? value key quest? """),
        VOCAB.producer(convert, """ embryo x? value (?! skip ) quest? """),
        VOCAB.producer(isolate,
                       """ embryo colon? count? value len_units quest? """),
    ],
)
            [[:alpha:][:digit:]\-]+ (?! [.] )""",
                   priority=LOWEST),
        VOCAB.grouper('collector',
                      """
            ( (name_part | initial) )+ 
            ( name_part | part | initial )* """,
                      capture=False),
        VOCAB.grouper('joiner', ' ( conj | comma | with ){1,2} '),

        # With a label
        VOCAB.producer(
            convert, """
            (?<= ^ | eol )
            (?<! other_label comma? name_part? ) (?<! part | col_no )
                noise? col_label comma? noise?
                (?P<col_name> collector
                    ( joiner collector )* ( comma name_part )? )
                noise?
            ( eol* ( (no_label? comma? (?P<collector_no> col_no )
                | no_label comma?
                    (?P<collector_no> ( part | col_no ){1,2} ) ) ) )?
                """),

        # Without a label
        VOCAB.producer(
            convert, """
            (?<= ^ | eol )
            (?<! other_label noise? name_part? )  (?<! part | col_no )
            noise? col_label? comma? noise?
            (?P<col_name> initial? name_part+ ( joiner collector )* )
            ( eol* ( (no_label? comma? (?P<collector_no> col_no )
                | no_label comma?
Example #19
0
 rules=[
     VOCAB["uuid"],  # UUIDs cause problems with numbers
     VOCAB.term("id", r" \d+-\d+ "),
     VOCAB.term("adj", r""" inguinal ing pectoral pec pr """.split()),
     VOCAB.part("number", r" number | no | [#] "),
     VOCAB.part("eq", r" is | eq | equals? | [=] "),
     # Skip arbitrary words
     VOCAB["word"],
     VOCAB["sep"],
     VOCAB.grouper("count", " (?: integer | none )(?! side ) "),
     VOCAB.grouper("modifier", "adj visible".split()),
     VOCAB.grouper("skip", " number eq? integer "),
     VOCAB.producer(
         typed,
         """ (?P<notation>
                 (?P<value1> count) modifier
                 (?P<value2> count) modifier
             ) nipple
         """,
     ),
     # Eg: 1:2 = 6 mammae
     VOCAB.producer(
         convert,
         """ nipple op?
             (?P<notation> count modifier?
                 op? count modifier?
                 (eq (?P<value> count))? )
         """,
     ),
     # Eg: 1:2 = 6 mammae
     VOCAB.producer(
         convert,
Example #20
0
 VOCAB.term(
     "key",
     [
         r"hind \s* foot \s* with \s* (?P<includes> claw )",
         r"hind \s* foot ( \s* ( length | len ) )?",
         "hfl | hf",
     ],
 ),
 # Some patterns require a separator
 VOCAB.part("sep", r" [;,] | $ ", capture=False),
 VOCAB.grouper("noise", " word dash ".split()),
 # Handle fractional values like: hindFoot 9/16"
 VOCAB.producer(
     fraction,
     [
         "key len_fraction units",  # E.g.: hindFoot = 9/16 inches
         "key len_fraction",  # E.g.: hindFoot = 9/16
     ],
 ),
 # A typical hind-foot notation
 VOCAB.producer(
     simple,
     [
         "key_with_units len_range",  # E.g.: hindFootLengthInMM=9-10
         "key noise? len_range units ",  # E.g.: hindFootLength=9-10 mm
         "key noise? len_range",  # Missing units like: hindFootLength 9-10
         "key dash number units",
     ],
 ),
 VOCAB.producer(
     partial(shorthand_length, measurement="shorthand_hfl"),
     "adj",
     r"""
     faint prominent recent old possible """.split(),
 ),
 # Skip arbitrary words
 VOCAB["word"],
 VOCAB.part("sep", r" [;/] "),
 VOCAB.grouper(
     "count",
     """
         none embryo conj | none visible | integer | none
     """,
 ),
 VOCAB.producer(
     convert_count,
     """(?P<count1> count ) op (?P<count2> count )
         ( eq (?P<value> count ) )? plac_scar
     """,
 ),
 VOCAB.producer(
     convert_count,
     """plac_scar op?
           (?P<count1> count ) prep? (?P<side1> side )
         ( (?P<count2> count ) prep? (?P<side2> side ) )?
     """,
 ),
 VOCAB.producer(
     convert_count,
     """ (?P<count1> count ) prep? (?P<side1> side ) plac_scar
         ( (?P<count2> count ) prep? (?P<side2> side )
             (plac_scar)? )?
     """,
Example #22
0
        VOCAB.term(
            "keyword",
            [
                r" ear \s* from \s* (?P<measured_from1> notch | crown )",
                r" ear \s* ( length | len )",
                r" ear (?! \s* tag )",
                r" ef (?P<measured_from2> n | c ) [-]?",
            ],
        ),
        # Some patterns require a separator
        VOCAB["word"],
        VOCAB.part("sep", " [;,] "),
        # Consider any of the following as just a key
        VOCAB.grouper("key", "keyword char_key char_measured_from".split()),
        # Handle fractional values like: ear 9/16"
        VOCAB.producer(fraction, "key len_fraction (?P<units> len_units )?"),
        # E.g.: earLengthInMM 9-10
        VOCAB.producer(simple_len, "(?P<key> key_with_units ) len_range"),
        # E.g.: ear 9-10 mm
        VOCAB.producer(simple_len, "key len_range (?P<units> len_units )?"),
        # Shorthand notation like: on tag: 11-22-33-44=99g
        VOCAB.producer(
            partial(shorthand_length, measurement="shorthand_el"),
            [
                "shorthand",
                "shorthand_bats",
            ],
        ),
    ],
)
     "state",
     ["""(non | partially | fully )? descended """]
     + """ scrotal abdominal size other """.split(),
 ),
 # Male or female ambiguous, like: gonadLength1
 VOCAB.grouper(
     "ambiguous",
     """
         ambiguous_key dim_side
         | side ambiguous_key dimension
         | ambiguous_key dimension
     """,
 ),
 # These patterns contain measurements to both left & right testes
 # E.g.: reproductive data: testes left 10x5 mm, right 10x6 mm
 VOCAB.producer(double, """label ( testes | abbrev | char_key ) side_cross """),
 # As above but without the testes marker:
 # E.g.: reproductive data: left 10x5 mm, right 10x6 mm
 VOCAB.producer(double, """ label side_cross """),
 # Has the testes marker but is lacking the label
 # E.g.: testes left 10x5 mm, right 10x6 mm
 VOCAB.producer(
     double,
     """
         ( testes | abbrev | char_key ) side_cross
     """,
 ),
 # E.g.: reproductive data: left 10x5 mm
 VOCAB.producer(
     double,
     """
    if trait:
        trait.value = str(trait.value[:-2]) + '??'
    return trait


LABEL_DATE = Base(
    name=__name__.split('.')[-1],
    rules=[
        VOCAB['eol'],
        VOCAB['uuid'],  # Get rid of these before they're a problem
        VOCAB.term('label', ' date '.split()),
        VOCAB.part('digits', r'(?<! \d ) ( [12]\d{3} | \d{1,2} ) (?! \d )'),
        VOCAB.part('sep', r' [/_-]+ ', capture=False),
        VOCAB.part('noise', r""" \w+ """, priority=LOWEST, capture=False),
        VOCAB.producer(
            convert, """
            label? (?P<value> digits sep? month_name sep? digits ) """),
        VOCAB.producer(
            convert, """
            label? (?P<value> month_name sep? digits sep? digits ) """),
        VOCAB.producer(
            convert, """
            label? (?P<value> digits sep digits sep digits ) """),
        VOCAB.producer(
            short_date_digits, f"""
            label? (?P<value> digits sep digits ) """),
        VOCAB.producer(
            short_date_name, f"""
            label? (?P<value> month_name sep? digits ) """),
        VOCAB.producer(
            short_date_name, f"""
Example #25
0
VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Build a trait from a matched token.

    The trait value is the lower-cased text of the "value" group, and the
    trait spans the same start/end offsets as the token.  The trait is
    then asked to check the token for the "ambiguous_key" flag before it
    is returned.
    """
    normalized = token.group["value"].lower()
    trait = Trait(value=normalized, start=token.start, end=token.end)
    trait.is_flag_in_token(token, "ambiguous_key")
    return trait


# Parser for scrotal state notations; the matched text in the "value"
# group becomes the trait value (lower-cased by convert()).
# NOTE(review): `testes`, `scrotal`, `non`, `label`, `cross` and
# `len_units` are not defined in this rule list; presumably they come
# from the shared vocabulary -- confirm.
SCROTAL_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # Abbreviations accepted in place of the word "testes"
        VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()),
        # Abbreviations accepted in place of a scrotal state
        VOCAB.term("scrotal_abbrev", "ns sc".split()),
        # If possible exclude length. Ex: reproductive data=testes: 11x7 mm
        VOCAB.grouper("length", "cross len_units?"),
        # Testes (or an abbreviation) followed by an optionally negated
        # scrotal word or abbreviation
        VOCAB.producer(
            convert,
            """ (?P<value>
                ( testes | testes_abbrev ) non? ( scrotal | scrotal_abbrev ) )
            """,
        ),
        # A scrotal word on its own, optionally negated
        VOCAB.producer(convert, """ (?P<value> non? scrotal ) """),
        # A bare abbreviation is only accepted directly after a label;
        # the label itself is outside the captured value
        VOCAB.producer(convert, """ label (?P<value> scrotal_abbrev )  """),
    ],
)
# Parser for lactation state notations; the whole matched phrase (optional
# prefix, lactating term, optional question mark token) is captured as
# "value".
# NOTE(review): `quest` is not defined in this rule list; presumably it
# comes from the shared vocabulary -- confirm.
LACTATION_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # Spellings (including misspellings) of "lactating", plus "nursing"
        # and "suckling"; the match must end at a word boundary.
        # NOTE(review): "lactacting" appears twice in the alternation; the
        # second occurrence is redundant.
        VOCAB.part(
            "lactating",
            r""" (
                lactating | lactation | lactated | lactate | lact
                | lactaing | lactacting | lactataing | lactational
                | oelact | celact | lactati | lactacting | lactatin
                | lactatting | lactatng
                | nursing | suckling
                ) \b
            """,
        ),
        # Negation words
        VOCAB.part("not", r" \b ( not | non | no ) "),
        # Words placing lactation in the past.
        # NOTE(review): "pre" is also listed here even though a separate
        # "pre" part follows -- confirm that this overlap is intended.
        VOCAB.part(
            "post",
            r""" \b (
                (( just | recently ) \s+ )? finished
                | post | recently | recent | had | pre
            ) """,
        ),
        # "pre", optionally followed by one whitespace character or a dash
        VOCAB.part("pre", r" \b pre [\s\-]? "),
        # Separates measurements
        VOCAB.part("separator", r' [;"/] '),
        VOCAB["word"],
        # Any one of the three qualifier families above
        VOCAB.grouper("prefix", "not post pre".split()),
        # E.g.: lactating, Or: not lactating, Or: post lactating?
        VOCAB.producer(convert, """ (?P<value> prefix? lactating quest? ) """),
    ],
)
Example #27
0
import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

# Parser for sex notations; the matched text in the "value" group is
# handed to the shared convert() imported from vertnet.parsers.base.
# NOTE(review): `quest` is not defined in this rule list; presumably it
# comes from the shared vocabulary -- confirm.
SEX = Base(
    name=__name__.split(".")[-1],
    rules=[
        # JSON keys for sex
        VOCAB.term("sex_key", "sex"),
        # The sexes
        VOCAB.term("sex_vocab", "females? males?".split()),
        # These are words that indicate that "sex" is not a key
        VOCAB.term("not_sex", "and is was".split()),
        # Allow arbitrary words in some cases
        # (a letter followed by anything except ; , " = : or whitespace)
        VOCAB.part("word", r' \b [a-z] [^;,"=:\s]* '),
        # Some patterns need a terminator
        # (a separator character or the end of the text)
        VOCAB.part("separator", ' [;,"] | $ '),
        # E.g.: sex might be female;
        # Up to two words are accepted here because a separator must follow
        VOCAB.producer(
            convert,
            """ sex_key (?P<value> ( sex_vocab | word ){1,2} quest? ) separator """,
        ),
        # E.g.: sex=female?, Or: sex=unknown
        # Without a separator only a single word after the key is taken
        VOCAB.producer(convert,
                       " sex_key (?P<value> ( sex_vocab | word ) quest? ) "),
        # E.g.: male, Or: male?
        # Without a key only a known sex word is accepted
        VOCAB.producer(convert, " (?P<value> sex_vocab quest? ) "),
    ],
)
Example #28
0
from digi_leap.pylib.trait import Trait

# CSV files (under the project's data directory) that list plant family
# and plant genus names
PLANT_FAMILIES = const.DATA_DIR / 'itis_plant_families.csv'
PLANT_GENERA = const.DATA_DIR / 'itis_plant_genera.csv'

# Module vocabulary built from the shared patterns
VOCAB = Vocabulary(patterns.VOCAB)
# Catch-all "word" token registered at LOWEST priority and not captured.
# NOTE(review): presumably any run of non-whitespace once the pattern is
# compiled in verbose mode -- confirm.
VOCAB.part('word', r' \S+ ', capture=False, priority=LOWEST)

# Register every "complete_name" value from the families file as a
# plant_family term.  All columns are read as strings and na_filter=False
# disables missing-value detection when reading.
DATA = pd.read_csv(PLANT_FAMILIES, na_filter=False, dtype=str)
VOCAB.term('plant_family', DATA['complete_name'].tolist())

# Same for genera.  DATA is rebound here, so after this point it holds
# the genera table rather than the families table.
DATA = pd.read_csv(PLANT_GENERA, na_filter=False, dtype=str)
VOCAB.term('plant_genus', DATA['complete_name'].tolist())


def convert(token):
    """Build a trait from a matched taxon token.

    The trait spans the same start/end offsets as the token, and its
    value is the text of the "value" group, unchanged.
    """
    begin = token.start
    finish = token.end
    taxon = token.group['value']
    return Trait(start=begin, end=finish, value=taxon)


# Parser for plant taxon notations: a known genus name followed by one or
# more "word" tokens, all captured together as the trait value.
# The pattern is a plain string; it previously carried an "f" prefix even
# though it contains no placeholders.
PLANT_TAXON = Base(
    name='plant_taxon',
    rules=[
        VOCAB['eol'],
        VOCAB.producer(convert, ' (?P<value> plant_genus word+ ) '),
    ])

# Parser for plant family notations: a single known family name is
# captured as the trait value.
# The pattern is a plain string; it previously carried an "f" prefix even
# though it contains no placeholders.
PLANT_FAMILY = Base(
    name='plant_family',
    rules=[VOCAB.producer(convert, ' (?P<value> plant_family ) ')])
Example #29
0
     r"""
         \b (?P<ambiguous_key> t ) (?! [a-z] ) (?! _ \D )
     """,
 ),
 # Standard keywords that indicate a tail length follows
 VOCAB.term("keyword",
            [r" tail \s* length ", r" tail \s* len ", "tail", "tal"]),
 # Some patterns require a separator
 VOCAB.part("sep", r" [;,] | $ ", capture=False),
 # Consider all of these tokens a key
 VOCAB.grouper("key", "keyword char_key".split()),
 # Handle fractional values like: tailLength 9/16"
 VOCAB.producer(
     fraction,
     [
         # E.g.: tail = 9/16 in
         "key len_fraction (?P<units> len_units )",
         "key len_fraction",  # Without units, like: tail = 9/16
     ],
 ),
 VOCAB.producer(
     simple,
     [
         "key_with_units len_range",  # E.g.: tailLengthInMM=9-10
         "key len_range (?P<units> len_units )",  # E.g.: tailLength=9-10 mm
         "key len_range",  # Missing units like: tailLength 9-10
     ],
 ),
 VOCAB.producer(
     partial(shorthand_length, measurement="shorthand_tal"),
     [
         "shorthand",
Example #30
0
    if token.group.get('us_county'):
        trait.us_county = token.group['us_county'].title()

    if token.group.get('us_state'):
        trait.us_state = us_states.normalize_state(token.group['us_state'])

    return trait


# Parser for US county / state notations.
# NOTE(review): `us_state`, `us_county` and `comma` are not defined in this
# rule list; presumably they come from the shared vocabulary -- confirm.
ADMIN_UNIT = Base(
    name='us_county',
    rules=[
        VOCAB['eol'],
        # Words that, directly before a state name, block the bare-state
        # producer at the end of this list
        VOCAB.term('skip', r""" of the """.split()),
        # County label: "co", "county" or "councy"
        VOCAB.term('co_label', r""" co | coun[tc]y """, capture=False),
        # State label: "plants of" or "flora of"
        VOCAB.term('st_label', r"""
            ( plants | flora ) \s* of """, capture=False),
        # Words that, directly after a state name, block the bare-state
        # producer at the end of this list
        VOCAB.term('other', r"""alluvial flood river plain """.split()),
        # An opening parenthesis blocks the bare-state producer as well
        VOCAB.part('nope', r""" [(] """),
        VOCAB['word'],

        # Optional state, then a county label followed by the county name
        VOCAB.producer(convert, ' us_state? eol? co_label comma? us_county '),
        # County name followed by a county label and an optional state
        VOCAB.producer(convert, ' us_county co_label comma? us_state? '),
        # County name followed directly by a state
        VOCAB.producer(convert, ' us_county comma? us_state '),
        # State label and state, then a county label and county name
        VOCAB.producer(convert, """
            st_label us_state eol? co_label us_county """),
        # State label followed by a state only
        VOCAB.producer(convert, ' st_label eol? us_state '),
        # A bare state, unless preceded by a skip word or followed by an
        # "other" word or an opening parenthesis
        VOCAB.producer(convert, ' (?<! skip ) us_state (?! other | nope ) '),
    ])