name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers VOCAB["shorthand"], # Adjectives to placental scars VOCAB.term( "adj", r""" faint prominent recent old possible """.split(), ), # Skip arbitrary words VOCAB["word"], VOCAB.part("sep", r" [;/] "), VOCAB.grouper( "count", """ none embryo conj | none visible | integer | none """, ), VOCAB.producer( convert_count, """(?P<count1> count ) op (?P<count2> count ) ( eq (?P<value> count ) )? plac_scar """, ), VOCAB.producer( convert_count, """plac_scar op? (?P<count1> count ) prep? (?P<side1> side ) ( (?P<count2> count ) prep? (?P<side2> side ) )? """, ),
return [trait1, trait2] OVARIES_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.term("other", """ sev somewhat few """.split()), # Skip words VOCAB.term("skip", " womb nullip ".split()), # VOCAB['comma'], VOCAB.part("sep", r" [;\(] "), # E.g.: ovaries and uterine horns # Or: ovaries and fallopian tubes VOCAB.grouper( "ovaries", r""" ovary ( ( and? uterus horns? ) | and? fallopian )? """, ), # E.g.: covered in copious fat VOCAB.grouper("coverage", " covered word{0,2} fat "), # E.g.: +corpus luteum VOCAB.grouper("luteum", " sign? corpus? (alb | lut) "), VOCAB.grouper( "value_words", """ size mature coverage luteum color corpus other active destroyed alb visible developed cyst texture fallopian luteum """.split(), ), VOCAB.grouper( "values",
trait.value += to_positive_int(token.group.get("value2")) return trait NIPPLE_COUNT = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers VOCAB.term("id", r" \d+-\d+ "), VOCAB.term("adj", r""" inguinal ing pectoral pec pr """.split()), VOCAB.part("number", r" number | no | [#] "), VOCAB.part("eq", r" is | eq | equals? | [=] "), # Skip arbitrary words VOCAB["word"], VOCAB["sep"], VOCAB.grouper("count", " (?: integer | none )(?! side ) "), VOCAB.grouper("modifier", "adj visible".split()), VOCAB.grouper("skip", " number eq? integer "), VOCAB.producer( typed, """ (?P<notation> (?P<value1> count) modifier (?P<value2> count) modifier ) nipple """, ), # Eg: 1:2 = 6 mammae VOCAB.producer( convert, """ nipple op? (?P<notation> count modifier?
"other_wt", """ femur baculum bacu bac spleen thymus kidney testes testis ovaries epididymis epid """.split(), ), # Separators VOCAB["word"], VOCAB["semicolon"], VOCAB["comma"], # Any key not preceding by "other_wt" is considered a weight key VOCAB.grouper( "wt_key", """ (?<! other_wt ) ( key_leader weight | key_leader mass | body weight | body mass | body | weight | mass | key_with_dots ) """, ), VOCAB.grouper("key", " wt_key ".split()), VOCAB.producer(compound, " key? compound_wt "), # Shorthand notation like: on tag: 11-22-33-44=99g VOCAB.producer( shorthand, [ "key shorthand", "shorthand", "key shorthand_bats", "shorthand_bats", ],
VOCAB.part( "inches", r""" (?<! [a-z] ) ( inch e? s? | in s? (?! [a-ru-wyz] ) ) (?! [:] ) """, ) VOCAB.part( "feet", r""" (?<! [a-z] ) ( foot s? (?! [:] ) | feet s? (?! [:] ) | ft s? (?! [,\w]) ) | (?<= \d ) ' """, ) VOCAB.part( "metric_len", r""" ( milli | centi )? meters? | ( [cm] [\s.]? m ) (?! [a-ru-wyz] ) """, ) VOCAB.grouper("len_units", " metric_len feet inches".split()) VOCAB.part("pounds", r" pounds? | lbs? ") VOCAB.part("ounces", r" ounces? | ozs? ") METRIC_MASS = r""" milligrams? | kilograms? | grams? | (?<! [a-z] )( m \.? g s? | k \.? \s? g a? | g[mr]? s? )(?! [a-z] ) """ VOCAB.part("metric_mass", METRIC_MASS) VOCAB.grouper("mass_units", "metric_mass pounds ounces".split()) VOCAB.grouper("us_units", "feet inches pounds ounces".split()) VOCAB.grouper("units", "len_units mass_units".split()) # # UUIDs cause problems when extracting certain shorthand notations. VOCAB.part(
), # This indicates that the following words are NOT a life stage VOCAB.term("skip", r" determin \w* "), # Compound words separated by dashes or slashes # E.g. adult/juvenile or over-winter VOCAB.part("joiner", r" \s* [/-] \s* "), # Use this to find the end of a life stage pattern VOCAB.part("separator", r' [;,"?] | $ '), # For life stages with numbers as words in them VOCAB["ordinals"], VOCAB["time_units"], VOCAB.part("after", "after"), VOCAB.part("hatching", "hatching"), # Match any word VOCAB.part("word", r" \b \w [\w?.-]* (?! [./-] ) "), VOCAB.grouper("as_time", " after? (ordinals | hatching) time_units"), # E.g.: life stage juvenile/yearling VOCAB.producer( convert, "json_key (?P<value> ( intrinsic | word ) joiner intrinsic )"), # E.g.: life stage young adult VOCAB.producer(convert, "json_key (?P<value> ( intrinsic | word ) intrinsic )"), # E.g.: life stage yearling VOCAB.producer(convert, "json_key (?P<value> intrinsic )"), # A sequence of words bracketed by a keyword and a separator # E.g.: LifeStage Remarks: 5-6 wks; VOCAB.producer( convert, """ json_key (?P<value> ( intrinsic | word | joiner ){1,5} ) separator """,
# Parser definition for lactation state notations.
LACTATION_STATE = Base(
    # The parser is named after the last component of this module's dotted name.
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        # Lactation keywords: the variant spellings listed in the pattern,
        # plus "nursing" and "suckling". Each must end at a word boundary.
        VOCAB.part(
            "lactating",
            r""" ( lactating | lactation | lactated | lactate | lact | lactaing | lactacting | lactataing | lactational | oelact | celact | lactati | lactacting | lactatin | lactatting | lactatng | nursing | suckling ) \b """,
        ),
        # Negation words.
        VOCAB.part("not", r" \b ( not | non | no ) "),
        # Qualifier words; "finished" may be led by "just" or "recently".
        VOCAB.part(
            "post",
            r""" \b ( (( just | recently ) \s+ )? finished | post | recently | recent | had | pre ) """,
        ),
        # "pre", optionally followed by one whitespace character or a dash.
        VOCAB.part("pre", r" \b pre [\s\-]? "),
        # Characters that separate measurements.
        VOCAB.part("separator", r' [;"/] '),
        # Reuse the vocabulary's existing "word" rule.
        VOCAB["word"],
        # Any one of the three qualifier kinds counts as a prefix.
        VOCAB.grouper("prefix", "not post pre".split()),
        # The captured value is an optional prefix, the lactation keyword and
        # an optional "quest" token.
        # NOTE(review): "quest" is not defined in this block; presumably it
        # comes from the shared vocabulary -- confirm.
        VOCAB.producer(convert, """ (?P<value> prefix? lactating quest? ) """),
    ],
)
# Module-local vocabulary seeded from the shared pattern vocabulary.
VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Build a trait from a parsed token.

    The trait's value is the lower-cased text of the token's "value" group,
    and its span is copied from the token's start and end. The trait is then
    asked to check the token for the "ambiguous_key" flag.
    """
    value = token.group["value"].lower()
    trait = Trait(value=value, start=token.start, end=token.end)
    trait.is_flag_in_token(token, "ambiguous_key")
    return trait


# Parser definition for scrotal state notations.
SCROTAL_STATE = Base(
    # The parser is named after the last component of this module's dotted name.
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        # Abbreviated keys for testes.
        VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()),
        # Abbreviated scrotal states.
        VOCAB.term("scrotal_abbrev", "ns sc".split()),
        # Group a cross measurement and its optional units into one token so
        # a length can be excluded where possible.
        # Ex: reproductive data=testes: 11x7 mm
        VOCAB.grouper("length", "cross len_units?"),
        # A testes key, an optional "non", then a scrotal state.
        VOCAB.producer(
            convert,
            """ (?P<value> ( testes | testes_abbrev ) non? ( scrotal | scrotal_abbrev ) ) """,
        ),
        # A scrotal state on its own, optionally negated.
        VOCAB.producer(convert, """ (?P<value> non? scrotal ) """),
        # An abbreviated scrotal state is only accepted right after a label.
        VOCAB.producer(convert, """ label (?P<value> scrotal_abbrev ) """),
    ],
)
volunteers? writer """.split(), capture=False), VOCAB.part('noise', r" [_`‘|\[\]/-]+ "), VOCAB.term('header_key', r' herbarium '.split()), VOCAB.term('junk', r' date '.split()), VOCAB.term('skip', """ of on dry """.split()), VOCAB.part('semi', r' [;] '), VOCAB.term('col_no', r""" [[:alpha:][:digit:]\-]+ (?! [.] )""", priority=LOWEST), VOCAB.grouper('collector', """ ( (name_part | initial) )+ ( name_part | part | initial )* """, capture=False), VOCAB.grouper('joiner', ' ( conj | comma | with ){1,2} '), # With a label VOCAB.producer( convert, """ (?<= ^ | eol ) (?<! other_label comma? name_part? ) (?<! part | col_no ) noise? col_label comma? noise? (?P<col_name> collector ( joiner collector )* ( comma name_part )? ) noise? ( eol* ( (no_label? comma? (?P<collector_no> col_no ) | no_label comma?
), # Standard keywords that indicate an ear length follows VOCAB.term( "keyword", [ r" ear \s* from \s* (?P<measured_from1> notch | crown )", r" ear \s* ( length | len )", r" ear (?! \s* tag )", r" ef (?P<measured_from2> n | c ) [-]?", ], ), # Some patterns require a separator VOCAB["word"], VOCAB.part("sep", " [;,] "), # Consider any of the following as just a key VOCAB.grouper("key", "keyword char_key char_measured_from".split()), # Handle fractional values like: ear 9/16" VOCAB.producer(fraction, "key len_fraction (?P<units> len_units )?"), # E.g.: earLengthInMM 9-10 VOCAB.producer(simple_len, "(?P<key> key_with_units ) len_range"), # E.g.: ear 9-10 mm VOCAB.producer(simple_len, "key len_range (?P<units> len_units )?"), # Shorthand notation like: on tag: 11-22-33-44=99g VOCAB.producer( partial(shorthand_length, measurement="shorthand_el"), [ "shorthand", "shorthand_bats", ], ), ],
# Words that indicate we don't have a total length VOCAB.term("skip", " horns? tag ".split()), # The word length on its own. Make sure it isn't proceeded by a letter VOCAB.part( "ambiguous", r""" (?<! [a-z] \s* ) (?P<ambiguous_key> lengths? ) """, ), # # We don't know if this is a length until we see the units VOCAB.part("key_units_req", "measurements? body total".split()), # The abbreviation key, just: t. This can be a problem. VOCAB.part("char_key", r" \b (?P<ambiguous_key> l ) (?= [:=-] ) "), # Some patterns require a separator VOCAB["semicolon"], VOCAB["comma"], VOCAB.grouper( "key", """ ( key_with_units | len_key | ambiguous | char_key ) ( eq | dash )? """, ), VOCAB.grouper( "value", """ len_range | number (?P<units> len_units )? (?! mass_units ) """, ), VOCAB.grouper( "value_units", """ len_range | number (?P<units> len_units ) """, ), # E.g.: 10 to 11 inches TL VOCAB.producer(simple, "value (?P<units> len_units ) key"), VOCAB.producer(simple, """ key value key? """), VOCAB.producer(simple, """ key (?P<units> len_units ) value """), VOCAB.producer( simple,
VOCAB = Vocabulary(patterns.VOCAB) OVARY_SIZE = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # A key with units, like: gonadLengthInMM VOCAB.term( "key_with_units", r""" (?P<ambiguous_key> gonad ) \s* (?P<dim> length | len | width ) \s* in \s* (?P<len_units> millimeters | mm ) """, ), VOCAB.grouper("value", " cross | number len_units? "), # E.g.: active, Or: immature VOCAB.grouper("state", "active mature destroyed visible developed".split()), # Male or female ambiguous, like: gonadLength1 VOCAB.grouper( "ambiguous", """ ambiguous_key dim_side | side ambiguous_key dimension | ambiguous_key dimension """, ), # These patterns contain measurements to both left & right ovaries # E.g.: reproductive data: ovaries left 10x5 mm, right 10x6 mm VOCAB.producer(double, """ label ovary side_cross """),
""", ), VOCAB.term( "other", """ protuberant prominent showing worn distended """.split(), ), # Separates measurements VOCAB.part("separator", r' [;"?/,] '), # Skip arbitrary words VOCAB["word"], VOCAB.grouper( "state_end", """ ( size | fully | partially | other | lactation | color | false | visible | tissue | present | active | developed ) """, ), VOCAB.grouper("state_mid", """ ( uterus | and ) """), VOCAB.producer( convert, """(?P<value> non? (state_end | much) (state_mid | state_end){0,2} nipple) """, ), VOCAB.producer( convert, """(?P<value> non? nipple (state_end | much) (state_mid | state_end){0,2} ) """,
VOCAB.term('and', r' and ', capture=False) VOCAB.term('conj', ' or and '.split(), capture=False) VOCAB.term('prep', ' to with on of '.split(), capture=False) VOCAB.term('word', r' [a-z] \w* ', capture=False, priority=LOWEST) # NOTE: Double quotes as inches is handled elsewhere VOCAB.part('inches', r""" (?<! [a-z] ) ( inch e? s? | in s? (?! [a-ru-wyz] ) ) """) VOCAB.part( 'feet', r""" (?<! [a-z] ) ( foot s? | feet s? | ft s? (?! [,\w]) ) | (?<= \d ) ' """) VOCAB.part( 'metric_len', r""" ( milli | centi )? meters? | ( [cm] [\s.]? m ) (?! [a-ru-wyz] ) """) VOCAB.grouper('len_units', ' metric_len feet inches'.split()) VOCAB.part('pounds', r' pounds? | lbs? ') VOCAB.part('ounces', r' ounces? | ozs? ') METRIC_MASS = r""" milligrams? | kilograms? | grams? | (?<! [a-z] )( m \.? g s? | k \.? \s? g a? | g[mr]? s? )(?! [a-z] ) """ VOCAB.part('metric_mass', METRIC_MASS) VOCAB.grouper('mass_units', 'metric_mass pounds ounces'.split()) VOCAB.grouper('us_units', 'feet inches pounds ounces'.split()) VOCAB.grouper('units', 'len_units mass_units'.split()) # # UUIDs cause problems when extracting certain shorthand notations. VOCAB.part('uuid',
rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers VOCAB["shorthand"], VOCAB["metric_mass"], VOCAB.part( "sex", r""" males? | females? | (?<! [a-z] ) [mf] (?! [a-z] ) """, ), VOCAB.term("repo_key", r""" reproductive \s data """), VOCAB.term("near_term", r" near[\s-]?term"), VOCAB.term("each_side", r" each \s side "), VOCAB.term("skip", r" w wt ".split()), VOCAB.part("sep", r" [;] "), VOCAB.part("bang", r" [!] "), VOCAB.grouper( "count", """ none (word | plac_scar) conj | integer | none | num_words | bang """, ), VOCAB.grouper("present", " found | near_term "), VOCAB.grouper("numeric", " integer | real "), VOCAB.grouper("skip_len", " ( x? numeric metric_len ) | (x numeric metric_len?) "), VOCAB.grouper("skip_words", " word | numeric | metric_len | eq "), VOCAB.grouper("side_link", " x | conj | word "), VOCAB.grouper("between", "side_link? | skip_words{,4}"), VOCAB.producer( convert, """ embryo eq? (?P<total> count ) skip_len? (?P<sub> side ) (?P<subcount> count ) between (?P<sub> side ) (?P<subcount> count ) """, ),
VOCAB["uuid"], # UUIDs cause problems with numbers # Note: abbrev differs from the one in the testes_state_trait VOCAB.term("abbrev", "tes ts tnd td tns ta".split()), # The abbreviation key, just: t. This can be a problem. VOCAB.part("char_key", r" \b t (?! [a-z] )"), # A key with units, like: gonadLengthInMM VOCAB.term( "key_with_units", r""" (?P<ambiguous_key> gonad ) \s* (?P<dim> length | len | width ) \s* in \s* (?P<len_units> millimeters | mm ) """, ), VOCAB.grouper( "value", """ cross | number len_units? (?! mass_units ) """, ), VOCAB.grouper( "state", ["""(non | partially | fully )? descended """] + """ scrotal abdominal size other """.split(), ), # Male or female ambiguous, like: gonadLength1 VOCAB.grouper( "ambiguous", """ ambiguous_key dim_side | side ambiguous_key dimension | ambiguous_key dimension """, ),
rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers VOCAB["shorthand"], VOCAB.part( "embryo_len_key", r""" (?<! collector [\s=:.] ) (?<! reg [\s=:.] ) ( ( crown | cr ) ( [_\s\-] | \s+ to \s+ )? rump | (?<! [a-z] ) crl (?! [a-z] ) | (?<! [a-z] ) c \.? r \.? (?! [a-z] ) )""", ), VOCAB.part("len", r" (length | len) (?! [a-z] ) "), VOCAB.part("other", r" \( \s* \d+ \s* \w+ \s* \) "), VOCAB.part("separator", r' [;"/.] '), VOCAB.grouper("value", """ cross | number len_units? (?! sex ) """), VOCAB.grouper("key", """ embryo_len_key len? ( eq | colon )? """), VOCAB.grouper( "count", """ number side number side eq? | number plus number ( eq number )? """, ), VOCAB.grouper("skip", " prep word cross | other | side "), VOCAB.producer(convert, """ embryo? key value quest? """), VOCAB.producer(convert, """ embryo? x? value key quest? """), VOCAB.producer(convert_many, """ embryo count? value{2,} (?! skip ) quest? """), VOCAB.producer(convert, """ embryo? key x? value quest? """), VOCAB.producer(convert, """ embryo? x? value key quest? """),
"""Parse v****a state notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base, convert VOCAB = Vocabulary(patterns.VOCAB) VAGINA_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.part("v****a", r""" (?<! sal ) ( v****a | vag | vulva ) """), VOCAB.term("abbrev", r""" ov cv [oc][sme][ln] vc vo """.split()), VOCAB.part( "closed", r""" closed | imperforated | imperf | cerrada | non [-\s] perforated | unperforate | non [-\s] perf | clsd | imp """, ), VOCAB.part("open", r""" open | perforated? | perf | abrir """), VOCAB.part("other", r""" swollen | plugged | plug | sealed """), VOCAB.grouper("state", """ closed | open | other """), VOCAB.producer(convert, """ (?P<value> v****a partially? state ) """), VOCAB.producer(convert, """ (?P<value> state v****a state? ) """), VOCAB.producer(convert, """ (?P<value> ( state | abbrev ) v****a? ) """), ], )
r"""( forearm \s* )? \s* ( length | len ) \s* in \s* (?P<units> millimeters | mm ) """, ), # Standard keywords that indicate a forearm length follows VOCAB.term( "key", r""" forearm ( \s* ( length | len | l ) )? | fore? \s? [.]? \s? a | fa """, ), # Some patterns require a separator VOCAB.part("sep", r" [;,] | $ ", capture=False), VOCAB.grouper("noise", " word dash ".split()), # Handle fractional values like: forearm 9/16" VOCAB.producer( fraction, [ "key len_fraction units", # E.g.: forearm = 9/16 inches "key len_fraction", # E.g.: forearm = 9/16 ], ), # A typical hind-foot notation VOCAB.producer( simple, [ "key_with_units len_range", # E.g.: forearmLengthInMM=9-10 "key noise? len_range units ", # E.g.: forearmLength=9-10 mm "key noise? len_range", # Missing units like: forearm 9-10
""", ), # The abbreviation key, just: t. This can be a problem. VOCAB.part( "char_key", r""" \b (?P<ambiguous_key> t ) (?! [a-z] ) (?! _ \D ) """, ), # Standard keywords that indicate a tail length follows VOCAB.term("keyword", [r" tail \s* length ", r" tail \s* len ", "tail", "tal"]), # Some patterns require a separator VOCAB.part("sep", r" [;,] | $ ", capture=False), # Consider all of these tokens a key VOCAB.grouper("key", "keyword char_key".split()), # Handle fractional values like: tailLength 9/16" VOCAB.producer( fraction, [ # E.g.: tail = 9/16 in "key len_fraction (?P<units> len_units )", "key len_fraction", # Without units, like: tail = 9/16 ], ), VOCAB.producer( simple, [ "key_with_units len_range", # E.g.: tailLengthInMM=9-10 "key len_range (?P<units> len_units )", # E.g.: tailLength=9-10 mm "key len_range", # Missing units like: tailLength 9-10
return trait TESTES_STATE = Base( name=__name__.split(".")[-1], rules=[ # Abbreviations for "testes" VOCAB.term("abbrev", "tes ts tnd td tns ta t".split()), VOCAB["uterus"], VOCAB.grouper( "state", [ "non fully descended", "abdominal non descended", "abdominal descended", "non descended", "fully descended", "partially descended", "size non descended", "size descended", "descended", ], ), # Simplify the testes length so it can be skipped easily VOCAB.grouper("length", "cross len_units?"), VOCAB.producer( convert, r""" (?P<value> ( testes | abbrev | ambiguous_key ) length? ( state | abdominal | size ) ( conj? ( state | size ) )? ) """,