name=__name__.split(".")[-1],
 rules=[
     VOCAB["uuid"],  # UUIDs cause problems with numbers
     VOCAB["shorthand"],
     # Adjectives to placental scars
     VOCAB.term(
         "adj",
         r"""
         faint prominent recent old possible """.split(),
     ),
     # Skip arbitrary words
     VOCAB["word"],
     VOCAB.part("sep", r" [;/] "),
     VOCAB.grouper(
         "count",
         """
             none embryo conj | none visible | integer | none
         """,
     ),
     VOCAB.producer(
         convert_count,
         """(?P<count1> count ) op (?P<count2> count )
             ( eq (?P<value> count ) )? plac_scar
         """,
     ),
     VOCAB.producer(
         convert_count,
         """plac_scar op?
               (?P<count1> count ) prep? (?P<side1> side )
             ( (?P<count2> count ) prep? (?P<side2> side ) )?
         """,
     ),
    return [trait1, trait2]


OVARIES_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.term("other", """ sev somewhat few """.split()),
        # Skip words
        VOCAB.term("skip", " womb nullip ".split()),
        # VOCAB['comma'],
        VOCAB.part("sep", r" [;\(] "),
        # E.g.: ovaries and uterine horns
        # Or:   ovaries and fallopian tubes
        VOCAB.grouper(
            "ovaries",
            r"""
                ovary ( ( and? uterus horns? ) | and? fallopian )?
            """,
        ),
        # E.g.: covered in copious fat
        VOCAB.grouper("coverage", " covered word{0,2} fat "),
        # E.g.: +corpus luteum
        VOCAB.grouper("luteum", " sign? corpus? (alb | lut) "),
        VOCAB.grouper(
            "value_words",
            """
                size mature coverage luteum color corpus other active destroyed alb
                visible developed cyst texture fallopian luteum
            """.split(),
        ),
        VOCAB.grouper(
            "values",
Esempio n. 3
0
    trait.value += to_positive_int(token.group.get("value2"))
    return trait


NIPPLE_COUNT = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        VOCAB.term("id", r" \d+-\d+ "),
        VOCAB.term("adj", r""" inguinal ing pectoral pec pr """.split()),
        VOCAB.part("number", r" number | no | [#] "),
        VOCAB.part("eq", r" is | eq | equals? | [=] "),
        # Skip arbitrary words
        VOCAB["word"],
        VOCAB["sep"],
        VOCAB.grouper("count", " (?: integer | none )(?! side ) "),
        VOCAB.grouper("modifier", "adj visible".split()),
        VOCAB.grouper("skip", " number eq? integer "),
        VOCAB.producer(
            typed,
            """ (?P<notation>
                    (?P<value1> count) modifier
                    (?P<value2> count) modifier
                ) nipple
            """,
        ),
        # Eg: 1:2 = 6 mammae
        VOCAB.producer(
            convert,
            """ nipple op?
                (?P<notation> count modifier?
Esempio n. 4
0
     "other_wt",
     """
         femur baculum bacu bac spleen thymus kidney
         testes testis ovaries epididymis epid
     """.split(),
 ),
 # Separators
 VOCAB["word"],
 VOCAB["semicolon"],
 VOCAB["comma"],
 # Any key not preceded by "other_wt" is considered a weight key
 VOCAB.grouper(
     "wt_key",
     """
         (?<! other_wt )
         ( key_leader weight | key_leader mass
             | body weight | body mass | body
             | weight | mass | key_with_dots )
     """,
 ),
 VOCAB.grouper("key", " wt_key ".split()),
 VOCAB.producer(compound, " key? compound_wt "),
 # Shorthand notation like: on tag: 11-22-33-44=99g
 VOCAB.producer(
     shorthand,
     [
         "key shorthand",
         "shorthand",
         "key shorthand_bats",
         "shorthand_bats",
     ],
Esempio n. 5
0
# Unit-of-measure patterns registered on the shared vocabulary.
# NOTE(review): the patterns are written with free spacing, so they are
# presumably compiled in a verbose-regex mode by the vocabulary -- confirm.

# "inch", "inche(s)", "in", "ins" when not preceded by a letter in [a-z].
# The short "in" form is rejected when followed by a letter in [a-ru-wyz],
# and the whole match must not be followed by a colon.
VOCAB.part(
    "inches",
    r""" (?<! [a-z] ) ( inch e? s? | in s? (?! [a-ru-wyz] ) ) (?! [:] ) """,
)
# "foot(s)" / "feet(s)" not followed by a colon, or "ft(s)" not followed by a
# comma or word character -- all only when not preceded by a letter in [a-z].
# The final top-level alternative is an apostrophe directly after a digit.
VOCAB.part(
    "feet",
    r""" (?<! [a-z] )
         ( foot s? (?! [:] ) | feet s? (?! [:] )
         | ft s? (?! [,\w]) )  | (?<= \d ) '
    """,
)
# Spelled-out "meter(s)" with optional "milli"/"centi" prefix, or the
# abbreviations c/m + optional space-or-dot + m (e.g. "cm", "mm", "c.m").
# The alternation is top-level, so the trailing negative lookahead guards
# only the abbreviated branch.
VOCAB.part(
    "metric_len",
    r""" ( milli | centi )? meters? | ( [cm] [\s.]? m ) (?! [a-ru-wyz] ) """,
)
# Groups the three length-unit tokens under one name.
# NOTE(review): a list argument presumably means "any one of" -- confirm.
VOCAB.grouper("len_units", " metric_len feet inches".split())

# US mass units: "pound(s)" / "lb(s)" and "ounce(s)" / "oz(s)".
VOCAB.part("pounds", r" pounds? | lbs? ")
VOCAB.part("ounces", r" ounces? | ozs? ")
# Metric mass: spelled-out milligram(s)/kilogram(s)/gram(s), or the
# abbreviations "mg"/"m.g"(+s), "kg"/"k.g"/"k g"(+a), "g"/"gm"/"gr"(+s).
# Only the abbreviated branch is fenced by the [a-z] lookbehind/lookahead.
METRIC_MASS = r"""
    milligrams? | kilograms? | grams?
    | (?<! [a-z] )( m \.? g s? | k \.? \s? g a? | g[mr]? s? )(?! [a-z] )
"""
VOCAB.part("metric_mass", METRIC_MASS)
# Groups the three mass-unit tokens under one name.
VOCAB.grouper("mass_units", "metric_mass pounds ounces".split())

# Convenience groupings: the US-customary units, and every unit token.
VOCAB.grouper("us_units", "feet inches pounds ounces".split())
VOCAB.grouper("units", "len_units mass_units".split())

# # UUIDs cause problems when extracting certain shorthand notations.
VOCAB.part(
Esempio n. 6
0
 ),
 # This indicates that the following words are NOT a life stage
 VOCAB.term("skip", r" determin \w* "),
 # Compound words separated by dashes or slashes
 # E.g. adult/juvenile or over-winter
 VOCAB.part("joiner", r" \s* [/-] \s* "),
 # Use this to find the end of a life stage pattern
 VOCAB.part("separator", r' [;,"?] | $ '),
 # For life stages with numbers as words in them
 VOCAB["ordinals"],
 VOCAB["time_units"],
 VOCAB.part("after", "after"),
 VOCAB.part("hatching", "hatching"),
 # Match any word
 VOCAB.part("word", r" \b \w [\w?.-]* (?! [./-] ) "),
 VOCAB.grouper("as_time", " after? (ordinals | hatching) time_units"),
 # E.g.: life stage juvenile/yearling
 VOCAB.producer(
     convert,
     "json_key (?P<value> ( intrinsic | word ) joiner intrinsic )"),
 # E.g.: life stage young adult
 VOCAB.producer(convert,
                "json_key (?P<value> ( intrinsic | word ) intrinsic )"),
 # E.g.: life stage yearling
 VOCAB.producer(convert, "json_key (?P<value> intrinsic )"),
 # A sequence of words bracketed by a keyword and a separator
 # E.g.: LifeStage Remarks: 5-6 wks;
 VOCAB.producer(
     convert,
     """ json_key (?P<value> ( intrinsic | word | joiner ){1,5} )
     separator """,
# Rule set for lactation-state notations. The parser is named after the last
# component of this module's dotted name.
# NOTE(review): the patterns are written with free spacing, so they are
# presumably compiled in a verbose-regex mode by the vocabulary -- confirm.
LACTATION_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # Spellings and misspellings of "lactating", plus "nursing" and
        # "suckling"; the match must end on a word boundary.
        # NOTE(review): "lactacting" appears twice in the alternation; the
        # second occurrence is redundant.
        VOCAB.part(
            "lactating",
            r""" (
                lactating | lactation | lactated | lactate | lact
                | lactaing | lactacting | lactataing | lactational
                | oelact | celact | lactati | lactacting | lactatin
                | lactatting | lactatng
                | nursing | suckling
                ) \b
            """,
        ),
        # Negations, anchored at the start of a word.
        VOCAB.part("not", r" \b ( not | non | no ) "),
        # Words placed before the lactation term: "finished" (optionally led
        # by "just"/"recently"), "post", "recently", "recent", "had", "pre".
        # NOTE(review): "pre" is listed here as well as in the dedicated
        # "pre" part below -- confirm which of the two is meant to win.
        VOCAB.part(
            "post",
            r""" \b (
                (( just | recently ) \s+ )? finished
                | post | recently | recent | had | pre
            ) """,
        ),
        # "pre" at the start of a word, optionally followed by one
        # whitespace character or a hyphen.
        VOCAB.part("pre", r" \b pre [\s\-]? "),
        # Separates measurements
        VOCAB.part("separator", r' [;"/] '),
        # Reuse the "word" entry already present in the vocabulary.
        VOCAB["word"],
        # Any of the three prefix tokens defined above.
        VOCAB.grouper("prefix", "not post pre".split()),
        # The captured value is an optional prefix, the lactation term, and
        # an optional "quest" token ("quest" is not defined in this block;
        # presumably it comes from the shared vocabulary).
        VOCAB.producer(convert, """ (?P<value> prefix? lactating quest? ) """),
    ],
)
Esempio n. 8
0
# Module-level vocabulary constructed from patterns.VOCAB; the rule
# definitions below register and look up their patterns through it.
VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Build a Trait from a matched token.

    The trait's value is the token's "value" group in lower case, and its
    start/end are copied from the token. The token is then passed to
    ``Trait.is_flag_in_token`` together with the "ambiguous_key" name
    (presumably so the trait can record that flag -- confirm against Trait).
    """
    value = token.group["value"].lower()
    trait = Trait(value=value, start=token.start, end=token.end)
    trait.is_flag_in_token(token, "ambiguous_key")
    return trait


# Rule set for scrotal-state notations. The parser is named after the last
# component of this module's dotted name.
# Tokens used below but not defined in this block (testes, scrotal, non,
# label, cross, len_units) presumably come from the shared vocabulary.
SCROTAL_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # Abbreviations accepted in place of the "testes" token.
        VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()),
        # Abbreviations accepted in place of the "scrotal" token.
        VOCAB.term("scrotal_abbrev", "ns sc".split()),
        # If possible exclude length. Ex: reproductive data=testes: 11x7 mm
        # NOTE(review): none of the producers in this list reference
        # "length" -- confirm it is consumed elsewhere.
        VOCAB.grouper("length", "cross len_units?"),
        # Testes (or an abbreviation), an optional "non", then scrotal (or an
        # abbreviation); the whole phrase is captured as the value.
        VOCAB.producer(
            convert,
            """ (?P<value>
                ( testes | testes_abbrev ) non? ( scrotal | scrotal_abbrev ) )
            """,
        ),
        # "scrotal" on its own, optionally negated.
        VOCAB.producer(convert, """ (?P<value> non? scrotal ) """),
        # A bare scrotal abbreviation is accepted only after a label token.
        VOCAB.producer(convert, """ label (?P<value> scrotal_abbrev )  """),
    ],
)
            volunteers?
            writer
            """.split(),
                   capture=False),
        VOCAB.part('noise', r" [_`‘|\[\]/-]+ "),
        VOCAB.term('header_key', r' herbarium '.split()),
        VOCAB.term('junk', r' date '.split()),
        VOCAB.term('skip', """ of on dry """.split()),
        VOCAB.part('semi', r' [;] '),
        VOCAB.term('col_no',
                   r"""
            [[:alpha:][:digit:]\-]+ (?! [.] )""",
                   priority=LOWEST),
        VOCAB.grouper('collector',
                      """
            ( (name_part | initial) )+ 
            ( name_part | part | initial )* """,
                      capture=False),
        VOCAB.grouper('joiner', ' ( conj | comma | with ){1,2} '),

        # With a label
        VOCAB.producer(
            convert, """
            (?<= ^ | eol )
            (?<! other_label comma? name_part? ) (?<! part | col_no )
                noise? col_label comma? noise?
                (?P<col_name> collector
                    ( joiner collector )* ( comma name_part )? )
                noise?
            ( eol* ( (no_label? comma? (?P<collector_no> col_no )
                | no_label comma?
Esempio n. 10
0
     ),
     # Standard keywords that indicate an ear length follows
     VOCAB.term(
         "keyword",
         [
             r" ear \s* from \s* (?P<measured_from1> notch | crown )",
             r" ear \s* ( length | len )",
             r" ear (?! \s* tag )",
             r" ef (?P<measured_from2> n | c ) [-]?",
         ],
     ),
     # Some patterns require a separator
     VOCAB["word"],
     VOCAB.part("sep", " [;,] "),
     # Consider any of the following as just a key
     VOCAB.grouper("key", "keyword char_key char_measured_from".split()),
     # Handle fractional values like: ear 9/16"
     VOCAB.producer(fraction, "key len_fraction (?P<units> len_units )?"),
     # E.g.: earLengthInMM 9-10
     VOCAB.producer(simple_len, "(?P<key> key_with_units ) len_range"),
     # E.g.: ear 9-10 mm
     VOCAB.producer(simple_len, "key len_range (?P<units> len_units )?"),
     # Shorthand notation like: on tag: 11-22-33-44=99g
     VOCAB.producer(
         partial(shorthand_length, measurement="shorthand_el"),
         [
             "shorthand",
             "shorthand_bats",
         ],
     ),
 ],
Esempio n. 11
0
 # Words that indicate we don't have a total length
 VOCAB.term("skip", " horns? tag ".split()),
 # The word length on its own. Make sure it isn't preceded by a letter
 VOCAB.part(
     "ambiguous",
     r""" (?<! [a-z] \s* ) (?P<ambiguous_key> lengths? ) """,
 ),
 # # We don't know if this is a length until we see the units
 VOCAB.part("key_units_req", "measurements? body total".split()),
 # The abbreviation key, just: l. This can be a problem.
 VOCAB.part("char_key", r" \b (?P<ambiguous_key> l ) (?= [:=-] ) "),
 # Some patterns require a separator
 VOCAB["semicolon"],
 VOCAB["comma"],
 VOCAB.grouper(
     "key",
     """ ( key_with_units | len_key | ambiguous | char_key ) ( eq | dash )? """,
 ),
 VOCAB.grouper(
     "value",
     """ len_range | number (?P<units> len_units )? (?! mass_units ) """,
 ),
 VOCAB.grouper(
     "value_units",
     """ len_range | number (?P<units> len_units ) """,
 ),
 # E.g.: 10 to 11 inches TL
 VOCAB.producer(simple, "value (?P<units> len_units ) key"),
 VOCAB.producer(simple, """ key value key? """),
 VOCAB.producer(simple, """ key (?P<units> len_units ) value """),
 VOCAB.producer(
     simple,
Esempio n. 12
0
# Module-level vocabulary constructed from patterns.VOCAB; the rule
# definitions below register and look up their patterns through it.
VOCAB = Vocabulary(patterns.VOCAB)

OVARY_SIZE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # A key with units, like: gonadLengthInMM
        VOCAB.term(
            "key_with_units",
            r"""
                (?P<ambiguous_key> gonad ) \s*
                    (?P<dim> length | len | width ) \s* in \s*
                    (?P<len_units> millimeters | mm )
            """,
        ),
        VOCAB.grouper("value", " cross | number len_units? "),
        # E.g.: active, Or: immature
        VOCAB.grouper("state",
                      "active mature destroyed visible developed".split()),
        # Male or female ambiguous, like: gonadLength1
        VOCAB.grouper(
            "ambiguous",
            """
                ambiguous_key dim_side
                | side ambiguous_key dimension
                | ambiguous_key dimension
            """,
        ),
        # These patterns contain measurements to both left & right ovaries
        # E.g.: reproductive data: ovaries left 10x5 mm, right 10x6 mm
        VOCAB.producer(double, """ label ovary side_cross """),
Esempio n. 13
0
     """,
 ),
 VOCAB.term(
     "other",
     """
         protuberant prominent showing worn distended
     """.split(),
 ),
 # Separates measurements
 VOCAB.part("separator", r' [;"?/,] '),
 # Skip arbitrary words
 VOCAB["word"],
 VOCAB.grouper(
     "state_end",
     """
         ( size | fully | partially | other | lactation | color | false
             | visible | tissue | present | active | developed )
     """,
 ),
 VOCAB.grouper("state_mid", """ ( uterus | and ) """),
 VOCAB.producer(
     convert,
     """(?P<value> non?
         (state_end | much) (state_mid | state_end){0,2} nipple)
     """,
 ),
 VOCAB.producer(
     convert,
     """(?P<value> non? nipple
         (state_end | much) (state_mid | state_end){0,2} )
     """,
Esempio n. 14
0
# Shared word and unit-of-measure patterns registered on the vocabulary.
# NOTE(review): the patterns are written with free spacing, so they are
# presumably compiled in a verbose-regex mode by the vocabulary -- confirm.

# Connectives and prepositions. Note that "and" is registered both on its own
# and as one of the "conj" words.
VOCAB.term('and', r' and ', capture=False)
VOCAB.term('conj', ' or and '.split(), capture=False)
VOCAB.term('prep', ' to with on of '.split(), capture=False)

# Catch-all word: a letter in [a-z] followed by any word characters,
# registered with LOWEST priority.
VOCAB.term('word', r' [a-z] \w* ', capture=False, priority=LOWEST)

# NOTE: Double quotes as inches are handled elsewhere
# "inch", "inche(s)", "in", "ins" when not preceded by a letter in [a-z];
# the short "in" form is rejected when followed by a letter in [a-ru-wyz].
VOCAB.part('inches', r"""
    (?<! [a-z] ) ( inch e? s? | in s? (?! [a-ru-wyz] ) ) """)
# "foot(s)", "feet(s)", or "ft(s)" (the latter not followed by a comma or
# word character) when not preceded by a letter in [a-z]; the final top-level
# alternative is an apostrophe directly after a digit.
VOCAB.part(
    'feet', r"""
    (?<! [a-z] ) ( foot s? | feet s? | ft s? (?! [,\w]) ) | (?<= \d ) ' """)
# Spelled-out "meter(s)" with optional "milli"/"centi" prefix, or the
# abbreviations c/m + optional space-or-dot + m (e.g. "cm", "mm", "c.m").
# The alternation is top-level, so the trailing negative lookahead guards
# only the abbreviated branch.
VOCAB.part(
    'metric_len', r"""
    ( milli | centi )? meters? | ( [cm] [\s.]? m ) (?! [a-ru-wyz] ) """)
# Groups the three length-unit tokens under one name.
# NOTE(review): a list argument presumably means "any one of" -- confirm.
VOCAB.grouper('len_units', ' metric_len feet inches'.split())

# US mass units: "pound(s)" / "lb(s)" and "ounce(s)" / "oz(s)".
VOCAB.part('pounds', r' pounds? | lbs? ')
VOCAB.part('ounces', r' ounces? | ozs? ')
# Metric mass: spelled-out milligram(s)/kilogram(s)/gram(s), or the
# abbreviations "mg"/"m.g"(+s), "kg"/"k.g"/"k g"(+a), "g"/"gm"/"gr"(+s).
# Only the abbreviated branch is fenced by the [a-z] lookbehind/lookahead.
METRIC_MASS = r"""
    milligrams? | kilograms? | grams?
    | (?<! [a-z] )( m \.? g s? | k \.? \s? g a? | g[mr]? s? )(?! [a-z] )
    """
VOCAB.part('metric_mass', METRIC_MASS)
# Groups the three mass-unit tokens under one name.
VOCAB.grouper('mass_units', 'metric_mass pounds ounces'.split())

# Convenience groupings: the US-customary units, and every unit token.
VOCAB.grouper('us_units', 'feet inches pounds ounces'.split())
VOCAB.grouper('units', 'len_units mass_units'.split())

# # UUIDs cause problems when extracting certain shorthand notations.
VOCAB.part('uuid',
Esempio n. 15
0
 rules=[
     VOCAB["uuid"],  # UUIDs cause problems with numbers
     VOCAB["shorthand"],
     VOCAB["metric_mass"],
     VOCAB.part(
         "sex",
         r""" males? | females? | (?<! [a-z] ) [mf] (?! [a-z] ) """,
     ),
     VOCAB.term("repo_key", r""" reproductive \s data """),
     VOCAB.term("near_term", r" near[\s-]?term"),
     VOCAB.term("each_side", r" each \s side "),
     VOCAB.term("skip", r" w  wt ".split()),
     VOCAB.part("sep", r" [;] "),
     VOCAB.part("bang", r" [!] "),
     VOCAB.grouper(
         "count",
         """ none (word | plac_scar) conj | integer | none | num_words | bang """,
     ),
     VOCAB.grouper("present", " found | near_term "),
     VOCAB.grouper("numeric", " integer | real "),
     VOCAB.grouper("skip_len",
                   " ( x? numeric metric_len ) | (x numeric metric_len?) "),
     VOCAB.grouper("skip_words", " word | numeric | metric_len | eq "),
     VOCAB.grouper("side_link", " x | conj | word "),
     VOCAB.grouper("between", "side_link? | skip_words{,4}"),
     VOCAB.producer(
         convert,
         """ embryo eq? (?P<total> count ) skip_len?
             (?P<sub> side ) (?P<subcount> count ) between
             (?P<sub> side ) (?P<subcount> count )
         """,
     ),
Esempio n. 16
0
 VOCAB["uuid"],  # UUIDs cause problems with numbers
 # Note: abbrev differs from the one in the testes_state_trait
 VOCAB.term("abbrev", "tes ts tnd td tns ta".split()),
 # The abbreviation key, just: t. This can be a problem.
 VOCAB.part("char_key", r" \b t (?! [a-z] )"),
 # A key with units, like: gonadLengthInMM
 VOCAB.term(
     "key_with_units",
     r"""
         (?P<ambiguous_key> gonad ) \s*
             (?P<dim> length | len | width ) \s* in \s*
             (?P<len_units> millimeters | mm )
     """,
 ),
 VOCAB.grouper(
     "value",
     """ cross | number len_units? (?! mass_units ) """,
 ),
 VOCAB.grouper(
     "state",
     ["""(non | partially | fully )? descended """]
     + """ scrotal abdominal size other """.split(),
 ),
 # Male or female ambiguous, like: gonadLength1
 VOCAB.grouper(
     "ambiguous",
     """
         ambiguous_key dim_side
         | side ambiguous_key dimension
         | ambiguous_key dimension
     """,
 ),
Esempio n. 17
0
 rules=[
     VOCAB["uuid"],  # UUIDs cause problems with numbers
     VOCAB["shorthand"],
     VOCAB.part(
         "embryo_len_key",
         r"""
         (?<! collector [\s=:.] ) (?<! reg [\s=:.] ) (
             ( crown | cr ) ( [_\s\-] | \s+ to \s+ )? rump
             | (?<! [a-z] ) crl (?! [a-z] )
             | (?<! [a-z] ) c \.? r \.? (?! [a-z] )
         )""",
     ),
     VOCAB.part("len", r" (length | len) (?! [a-z] ) "),
     VOCAB.part("other", r" \( \s* \d+ \s* \w+ \s* \) "),
     VOCAB.part("separator", r' [;"/.] '),
     VOCAB.grouper("value", """ cross | number len_units? (?! sex ) """),
     VOCAB.grouper("key", """ embryo_len_key len? ( eq | colon )? """),
     VOCAB.grouper(
         "count",
         """
         number side number side eq?
         | number plus number ( eq number )?
         """,
     ),
     VOCAB.grouper("skip", " prep word cross | other | side "),
     VOCAB.producer(convert, """ embryo? key value quest? """),
     VOCAB.producer(convert, """ embryo? x? value key quest? """),
     VOCAB.producer(convert_many,
                    """ embryo count? value{2,} (?! skip ) quest? """),
     VOCAB.producer(convert, """ embryo? key x? value quest? """),
     VOCAB.producer(convert, """ embryo? x? value key quest? """),
Esempio n. 18
0
"""Parse vagina state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base, convert

# Module-level vocabulary constructed from patterns.VOCAB; the rules below
# register and look up their patterns through it.
VOCAB = Vocabulary(patterns.VOCAB)

# Rule set for vagina-state notations. The parser is named after the last
# component of this module's dotted name.
# NOTE(review): the patterns are written with free spacing, so they are
# presumably compiled in a verbose-regex mode by the vocabulary -- confirm.
VAGINA_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # The organ keyword. The token name and the first alternative were
        # the masked text "v****a", which is a stacked-repeat regex (rejected
        # by `re` as "multiple repeat") and not a usable token name, so the
        # rule could never match. Restored to the word the parser is named
        # for. The lookbehind rejects a match directly preceded by "sal".
        VOCAB.part("vagina", r""" (?<! sal ) ( vagina | vag | vulva ) """),
        # Two- and three-letter shorthand codes.
        VOCAB.term("abbrev", r""" ov cv [oc][sme][ln] vc vo """.split()),
        # Words meaning the vagina is closed / imperforate.
        VOCAB.part(
            "closed",
            r"""
                closed | imperforated | imperf | cerrada | non [-\s] perforated
                | unperforate | non  [-\s] perf | clsd | imp
            """,
        ),
        # Words meaning the vagina is open / perforate.
        VOCAB.part("open", r""" open | perforated? | perf | abrir """),
        # Other recorded conditions.
        VOCAB.part("other", r""" swollen | plugged | plug | sealed """),
        # Any one of the three state tokens above.
        VOCAB.grouper("state", """ closed | open | other """),
        # Keyword first, e.g. keyword + optional "partially" + state.
        # ("partially" is not defined in this block; presumably it comes
        # from the shared vocabulary.)
        VOCAB.producer(convert, """ (?P<value> vagina partially? state ) """),
        # State first, then keyword, optionally followed by a second state.
        VOCAB.producer(convert, """ (?P<value> state vagina state? ) """),
        # A state or abbreviation, with the keyword optional.
        VOCAB.producer(convert,
                       """ (?P<value> ( state | abbrev )  vagina? ) """),
    ],
)
     r"""( forearm \s* )? \s* ( length | len ) \s* in \s*
             (?P<units> millimeters | mm )
     """,
 ),
 # Standard keywords that indicate a forearm length follows
 VOCAB.term(
     "key",
     r"""
         forearm ( \s* ( length | len | l ) )?
         | fore? \s? [.]? \s? a
         | fa
     """,
 ),
 # Some patterns require a separator
 VOCAB.part("sep", r" [;,] | $ ", capture=False),
 VOCAB.grouper("noise", " word dash ".split()),
 # Handle fractional values like: forearm 9/16"
 VOCAB.producer(
     fraction,
     [
         "key len_fraction units",  # E.g.: forearm = 9/16 inches
         "key len_fraction",  # E.g.: forearm = 9/16
     ],
 ),
 # A typical forearm notation
 VOCAB.producer(
     simple,
     [
         "key_with_units len_range",  # E.g.: forearmLengthInMM=9-10
         "key noise? len_range units ",  # E.g.: forearmLength=9-10 mm
         "key noise? len_range",  # Missing units like: forearm 9-10
Esempio n. 20
0
     """,
 ),
 # The abbreviation key, just: t. This can be a problem.
 VOCAB.part(
     "char_key",
     r"""
         \b (?P<ambiguous_key> t ) (?! [a-z] ) (?! _ \D )
     """,
 ),
 # Standard keywords that indicate a tail length follows
 VOCAB.term("keyword",
            [r" tail \s* length ", r" tail \s* len ", "tail", "tal"]),
 # Some patterns require a separator
 VOCAB.part("sep", r" [;,] | $ ", capture=False),
 # Consider all of these tokens a key
 VOCAB.grouper("key", "keyword char_key".split()),
 # Handle fractional values like: tailLength 9/16"
 VOCAB.producer(
     fraction,
     [
         # E.g.: tail = 9/16 in
         "key len_fraction (?P<units> len_units )",
         "key len_fraction",  # Without units, like: tail = 9/16
     ],
 ),
 VOCAB.producer(
     simple,
     [
         "key_with_units len_range",  # E.g.: tailLengthInMM=9-10
         "key len_range (?P<units> len_units )",  # E.g.: tailLength=9-10 mm
         "key len_range",  # Missing units like: tailLength 9-10
Esempio n. 21
0
    return trait


TESTES_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # Abbreviations for "testes"
        VOCAB.term("abbrev", "tes ts tnd td tns ta t".split()),
        VOCAB["uterus"],
        VOCAB.grouper(
            "state",
            [
                "non fully descended",
                "abdominal non descended",
                "abdominal descended",
                "non descended",
                "fully descended",
                "partially descended",
                "size non descended",
                "size descended",
                "descended",
            ],
        ),
        # Simplify the testes length so it can be skipped easily
        VOCAB.grouper("length", "cross len_units?"),
        VOCAB.producer(
            convert,
            r""" (?P<value>
                ( testes | abbrev | ambiguous_key ) length?
                    ( state | abdominal | size )
                    ( conj? ( state | size ) )?
            ) """,