# Rule table for forearm length notations.
FOREARM_LENGTH = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        # UUIDs cause problems with numbers
        VOCAB["uuid"],
        # The key itself carries the units, e.g.: ForearmLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r"""( forearm \s* )? \s* ( length | len ) \s* in \s* (?P<units> millimeters | mm ) """,
        ),
        # Keywords announcing that a forearm length follows
        VOCAB.term(
            "key",
            r""" forearm ( \s* ( length | len | l ) )? | fore? \s? [.]? \s? a | fa """,
        ),
        # Separator used by some patterns (declared with capture=False)
        VOCAB.part("sep", r" [;,] | $ ", capture=False),
        VOCAB.grouper("noise", " word dash ".split()),
        # Fractional values such as: forearm 9/16"
        VOCAB.producer(
            fraction,
            [
                "key len_fraction units",  # E.g.: forearm = 9/16 inches
                "key len_fraction",  # E.g.: forearm = 9/16
            ],
        ),
        # A typical forearm notation
        VOCAB.producer(
            simple,
            [
                "key_with_units len_range",  # E.g.: forearmLengthInMM=9-10
                "key noise? len_range units ",  # E.g.: forearmLength=9-10 mm
                "key noise? len_range",  # Missing units like: forearm 9-10
                "key dash number units?",
                "number key units?",
            ],
        ),
        # Shorthand notations, reported under the "shorthand_fa" measurement
        VOCAB.producer(
            partial(shorthand_length, measurement="shorthand_fa"),
            [
                "shorthand",
                "shorthand_bats",
            ],
        ),
    ],
)
# Rule table for nipple (mammae) count notations.
NIPPLE_COUNT = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        # UUIDs cause problems with numbers
        VOCAB["uuid"],
        # Digit runs joined by a dash are tagged as an "id" term
        VOCAB.term("id", r" \d+-\d+ "),
        VOCAB.term("adj", r""" inguinal ing pectoral pec pr """.split()),
        VOCAB.part("number", r" number | no | [#] "),
        VOCAB.part("eq", r" is | eq | equals? | [=] "),
        # Skip arbitrary words
        VOCAB["word"],
        VOCAB["sep"],
        # A count is an integer or "none" that is not followed by a side
        VOCAB.grouper("count", " (?: integer | none )(?! side ) "),
        VOCAB.grouper("modifier", "adj visible".split()),
        VOCAB.grouper("skip", " number eq? integer "),
        # Two modified counts followed by the nipple keyword
        VOCAB.producer(
            typed,
            """ (?P<notation> (?P<value1> count) modifier (?P<value2> count) modifier ) nipple """,
        ),
        # Keyword first, then the notation. Eg: 1:2 = 6 mammae
        VOCAB.producer(
            convert,
            """ nipple op? (?P<notation> count modifier? op? count modifier? (eq (?P<value> count))? ) """,
        ),
        # Notation first, then the keyword. Eg: 1:2 = 6 mammae
        VOCAB.producer(
            convert,
            """ (?P<notation> count modifier? op? count modifier? (eq (?P<value> count))? ) nipple """,
        ),
        # Eg: 6 mammae
        VOCAB.producer(convert, """ (?P<value> count ) modifier? nipple """),
        # Eg: nipples 5
        VOCAB.producer(convert, """ nipple (?P<value> count ) """),
    ],
)
def convert(token):
    """Build a Trait saying whether the match reports enlarged nipples.

    The value is "enlarged" when the token has a truthy ``pos`` group and
    "not enlarged" otherwise. The span is copied from the token.
    """
    state = "enlarged" if token.group.get("pos") else "not enlarged"
    return Trait(value=state, start=token.start, end=token.end)


# Rule table for enlarged / not-enlarged nipple notations.
NIPPLES_ENLARGED = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        VOCAB["conj"],
        VOCAB.part("separator", r' [;"?/,] '),
        VOCAB.term("enlarged_abbrev", r"[oc]e[ln]"),
        VOCAB.term("not_enlarged_abbrev", r"[oc]s[ln]"),
        VOCAB.term("false", """ false """),
        # Patterns that capture a "pos" group (convert yields "enlarged")
        VOCAB.producer(convert, """ (?P<pos> nipple enlarged ) """),
        VOCAB.producer(convert, """ (?P<pos> enlarged nipple ) """),
        VOCAB.producer(convert, """ (?P<pos> enlarged_abbrev ) """),
        # Patterns that capture only "neg" (convert yields "not enlarged")
        VOCAB.producer(convert, """ (?P<neg> none nipple ) """),
        VOCAB.producer(convert, """ (?P<neg> nipple none ) """),
        VOCAB.producer(convert, """ (?P<neg> nipple not_enlarged ) """),
        VOCAB.producer(convert, """ (?P<neg> not_enlarged false? nipple ) """),
        VOCAB.producer(convert, """ (?P<neg> not_enlarged_abbrev ) """),
    ],
)
# Rule table for life stage notations.
LIFE_STAGE = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        # JSON keys for life stage
        VOCAB.term(
            "json_key",
            [
                r" life \s* stage \s* (remarks?)? ",
                r" age \s* class ",
                r" age \s* in \s* (?P<time_units> {}) ".format(TIME_OPTIONS),
                r" age ",
            ],
        ),
        # Words that are life stages even without a keyword indicator
        VOCAB.term(
            "intrinsic",
            [
                r" yolk \s? sac ",
                r" young [\s-]? of [\s-]? the [\s-]? year ",
                r" adult \s* young ",
                r" young \s* adult ",
            ]
            + """ ads? adulte?s? chicks? fledgelings? fleglings? fry hatched hatchlings? imagos? imms? immatures? jeunes? juvs? juveniles? juvéniles? larvae? larvals? larves? leptocephales? leptocephalus matures? metamorphs? neonates? nestlings? nulliparous premetamorphs? sub-adults? subads? subadulte?s? tadpoles? têtard yearlings? yg ygs young """.split(),
        ),
        # This indicates that the following words are NOT a life stage
        VOCAB.term("skip", r" determin \w* "),
        # Compound words separated by dashes or slashes
        # E.g. adult/juvenile or over-winter
        VOCAB.part("joiner", r" \s* [/-] \s* "),
        # Marks the end of a life stage pattern
        VOCAB.part("separator", r' [;,"?] | $ '),
        # For life stages with numbers as words in them
        VOCAB["ordinals"],
        VOCAB["time_units"],
        VOCAB.part("after", "after"),
        VOCAB.part("hatching", "hatching"),
        # Match any word
        VOCAB.part("word", r" \b \w [\w?.-]* (?! [./-] ) "),
        VOCAB.grouper("as_time", " after? (ordinals | hatching) time_units"),
        # E.g.: life stage juvenile/yearling
        VOCAB.producer(
            convert, "json_key (?P<value> ( intrinsic | word ) joiner intrinsic )"
        ),
        # E.g.: life stage young adult
        VOCAB.producer(convert, "json_key (?P<value> ( intrinsic | word ) intrinsic )"),
        # E.g.: life stage yearling
        VOCAB.producer(convert, "json_key (?P<value> intrinsic )"),
        # A sequence of words bracketed by a keyword and a separator
        # E.g.: LifeStage Remarks: 5-6 wks;
        VOCAB.producer(
            convert,
            """ json_key (?P<value> ( intrinsic | word | joiner ){1,5} ) separator """,
        ),
        # E.g.: LifeStage = 1st month
        VOCAB.producer(convert, "json_key (?P<value> as_time )"),
        # E.g.: Juvenile
        VOCAB.producer(convert, "(?P<value> intrinsic )"),
        # E.g.: 1st year
        VOCAB.producer(convert, "(?P<value> as_time )"),
    ],
)
return trait LACTATION_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.part( "lactating", r""" ( lactating | lactation | lactated | lactate | lact | lactaing | lactacting | lactataing | lactational | oelact | celact | lactati | lactacting | lactatin | lactatting | lactatng | nursing | suckling ) \b """, ), VOCAB.term("lactating_abbrev", r"[oc][esm]l"), VOCAB.term("not_lactating_abbrev", r"[oc][esm]n"), VOCAB.term("post", r""" post | finished """), # Separates measurements VOCAB.part("separator", r' [;"/] '), VOCAB.producer(convert, """ (?P<pos> lactating ) """), VOCAB.producer(convert, """ (?P<pos> lactating_abbrev ) """), VOCAB.producer(convert, """ (?P<neg> (none | post) lactating ) """), VOCAB.producer(convert, """ (?P<neg> lactating (none | post) ) """), VOCAB.producer(convert, """ (?P<neg> not_lactating_abbrev ) """), ], )
"""Parse vagina state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

# Rule table for vagina state notations.
#
# The anatomical keyword had been masked to "v****a" in the docstring, the
# part name, its regex and every producer pattern. "v****a" is not a valid
# regular expression (stacked "*" quantifiers) and not a usable rule name,
# so the word is restored here.
VAGINA_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # Anatomical keyword; the lookbehind rejects matches preceded by "sal"
        VOCAB.part("vagina", r""" (?<! sal ) ( vagina | vag | vulva ) """),
        VOCAB.term("abbrev", r""" ov cv [oc][sme][ln] vc vo """.split()),
        # Words for a closed state
        VOCAB.part(
            "closed",
            r""" closed | imperforated | imperf | cerrada | non [-\s] perforated | unperforate | non [-\s] perf | clsd | imp """,
        ),
        # Words for an open state
        VOCAB.part("open", r""" open | perforated? | perf | abrir """),
        VOCAB.part("other", r""" swollen | plugged | plug | sealed """),
        VOCAB.grouper("state", """ closed | open | other """),
        # Keyword followed by a state
        VOCAB.producer(convert, """ (?P<value> vagina partially? state ) """),
        # State before the keyword, optionally another state after it
        VOCAB.producer(convert, """ (?P<value> state vagina state? ) """),
        # A state or abbreviation, with the keyword optional
        VOCAB.producer(convert, """ (?P<value> ( state | abbrev ) vagina? ) """),
    ],
)
# Rule table for total length notations.
TOTAL_LENGTH = Base(
    name=__name__.rsplit(".", 1)[-1],
    fix_up=fix_up,
    rules=[
        # UUIDs cause problems with numbers
        VOCAB["uuid"],
        # Units are in the key, like: TotalLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r""" ( total | snout \s* vent | head \s* body | fork ) \s* ( length | len )? \s* in \s* (?P<units> millimeters | mm ) """,
        ),
        # Various total length keys
        VOCAB.part(
            "len_key",
            r""" t \s* [o.]? \s* l [._]? (?! [a-z] ) | total [\s-]* length [\s-]* in | ( total | max | standard ) [\s-]* lengths? \b | meas [\s*:]? \s* length [\s(]* [l] [)\s:]* | meas ( [a-z]* )? \.? : \s* l (?! [a-z.] ) | s \.? \s? l \.? (?! [a-z.] ) | label [\s.]* lengths? \b | ( fork | mean | body ) [\s-]* lengths? \b | s \.? \s? v \.? \s? l \.? (?! [a-z.] ) | snout [\s-]* vent [\s-]* lengths? \b """,
        ),
        # Words that indicate we don't have a total length
        VOCAB.term("skip", " horns? tag ".split()),
        # The word length on its own. Make sure it isn't preceded by a letter
        VOCAB.part(
            "ambiguous",
            r""" (?<! [a-z] \s* ) (?P<ambiguous_key> lengths? ) """,
        ),
        # We don't know if this is a length until we see the units
        VOCAB.part("key_units_req", "measurements? body total".split()),
        # The abbreviation key, just: l. This can be a problem.
        VOCAB.part("char_key", r" \b (?P<ambiguous_key> l ) (?= [:=-] ) "),
        # Some patterns require a separator
        VOCAB["semicolon"],
        VOCAB["comma"],
        # Any of the key forms, optionally followed by "=" or a dash
        VOCAB.grouper(
            "key",
            """ ( key_with_units | len_key | ambiguous | char_key ) ( eq | dash )? """,
        ),
        # A range, or a number with optional length units and no mass units
        VOCAB.grouper(
            "value",
            """ len_range | number (?P<units> len_units )? (?! mass_units ) """,
        ),
        # A range, or a number that must carry length units
        VOCAB.grouper(
            "value_units",
            """ len_range | number (?P<units> len_units ) """,
        ),
        # E.g.: 10 to 11 inches TL
        VOCAB.producer(simple, "value (?P<units> len_units ) key"),
        VOCAB.producer(simple, """ key value key? """),
        VOCAB.producer(simple, """ key (?P<units> len_units ) value """),
        VOCAB.producer(
            simple,
            """ key_units_req ( value_units | triple_key ) """,
        ),
        # E.g.: total length 4 feet 7 inches
        VOCAB.producer(compound, " key? compound_len "),
        # Handle fractional values like: total length 9/16"
        # E.g.: total = 9/16 inches
        VOCAB.producer(fraction, "key_units_req len_fraction (?P<units> len_units )"),
        # E.g.: svl 9/16 inches
        VOCAB.producer(fraction, "key len_fraction (?P<units> len_units )"),
        # E.g.: len 9/16 in
        VOCAB.producer(
            fraction,
            """ (?P<ambiguous_key> ambiguous) len_fraction (?P<units> len_units ) """,
        ),
        # E.g.: total length: 10-29-39 10-11
        VOCAB.producer(
            simple,
            """ ( key | key_units_req ) shorthand_triple? len_range """,
        ),
        # E.g.: L 12.4 cm
        VOCAB.producer(
            simple,
            """ char_key value (?P<units> len_units )? (?! mass_units ) """,
        ),
        # Shorthand notations, reported under the "shorthand_tl" measurement
        VOCAB.producer(
            partial(numeric.shorthand_length, measurement="shorthand_tl"),
            [
                "( key | key_units_req ) shorthand",  # With a key
                "shorthand",  # Without a key
            ],
        ),
        # Handle a truncated shorthand notation
        # NOTE(review): "shorthand" is also listed in the producer just above.
        VOCAB.producer(
            partial(numeric.shorthand_length, measurement="shorthand_tl"),
            [
                "key shorthand",
                "shorthand",
                "key shorthand_bats",
                "shorthand_bats",
                """ ( key | key_units_req ) shorthand_triple (?! shorthand | len_range ) """,
            ],
        ),
    ],
)
# Rule table for ovaries state notations.
OVARIES_STATE = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        VOCAB.term("other", """ sev somewhat few """.split()),
        # Skip words
        VOCAB.term("skip", " womb nullip ".split()),
        # VOCAB['comma'],
        VOCAB.part("sep", r" [;\(] "),
        # E.g.: ovaries and uterine horns
        # Or:   ovaries and fallopian tubes
        VOCAB.grouper(
            "ovaries",
            r""" ovary ( ( and? uterus horns? ) | and? fallopian )? """,
        ),
        # E.g.: covered in copious fat
        VOCAB.grouper("coverage", " covered word{0,2} fat "),
        # E.g.: +corpus luteum
        VOCAB.grouper("luteum", " sign? corpus? (alb | lut) "),
        # Rule names that may each appear as one word of a value
        VOCAB.grouper(
            "value_words",
            """ size mature coverage luteum color corpus other active destroyed alb visible developed cyst texture fallopian luteum """.split(),
        ),
        # One value word, optionally joined to a preceding one or negated
        VOCAB.grouper(
            "values",
            """ ( value_words ( and | comma ) | non )? value_words """,
        ),
        # Ovaries keyword first, up to five filler tokens, then the values
        VOCAB.producer(
            convert,
            """ side? ovaries side? ( word | number | comma ){0,5} (?P<value> values+ ) """,
        ),
        # Values first, up to five filler tokens, then the ovaries keyword
        VOCAB.producer(
            convert,
            """ (?P<value> values+ ) ( word | number | comma ){0,5} ( (?<! comma ) side )? (?<! comma ) ovaries """,
        ),
        # Get left and right side measurements
        # E.g.: ovaries: R 2 c. alb, L sev c. alb
        VOCAB.producer(
            double,
            r""" ovaries (?P<side> side) number? (?P<value> word? values+ ) ( and | comma )? (?P<side> side) number? (?P<value> word? values+ ) """,
        ),
    ],
)
# Rule table for scrotal state, using pos/neg capture groups.
SCROTAL_STATE = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()),
        VOCAB.term("scrotal_abbrev_pos", "sc".split()),
        VOCAB.term("scrotal_abbrev_neg", "ns ".split()),
        # If possible exclude length. Ex: reproductive data=testes: 11x7 mm
        VOCAB.grouper("length", "cross len_units?"),
        # Patterns that capture a "pos" group
        VOCAB.producer(convert, """ (?P<pos> scrotal_pos ) """),
        VOCAB.producer(
            convert,
            """ (?P<pos> (testes | testes_abbrev | label) scrotal_abbrev_pos ) """,
        ),
        VOCAB.producer(
            convert,
            """ (?P<pos> scrotal_abbrev_pos (testes | testes_abbrev) ) """,
        ),
        # Patterns that capture a "neg" group
        VOCAB.producer(convert, """ (?P<neg> scrotal_neg ) """),
        VOCAB.producer(convert, """ (?P<neg> scrotal_pos none ) """),
        VOCAB.producer(convert, """ (?P<neg> none scrotal_pos ) """),
        VOCAB.producer(
            convert,
            """ (?P<neg> (testes | testes_abbrev | label) scrotal_abbrev_neg ) """,
        ),
        # NOTE(review): here the testes token sits outside the "neg" group,
        # unlike the matching "pos" pattern above.
        VOCAB.producer(
            convert,
            """ (?P<neg> scrotal_abbrev_neg ) (testes | testes_abbrev) """,
        ),
    ],
)
# Rule table for ovary size notations.
OVARY_SIZE = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        # UUIDs cause problems with numbers
        VOCAB["uuid"],
        # A key with units, like: gonadLengthInMM
        VOCAB.term(
            "key_with_units",
            r""" (?P<ambiguous_key> gonad ) \s* (?P<dim> length | len | width ) \s* in \s* (?P<len_units> millimeters | mm ) """,
        ),
        # A cross measurement, or a number with optional length units
        VOCAB.grouper("value", " cross | number len_units? "),
        # E.g.: active, Or: immature
        VOCAB.grouper("state", "active mature destroyed visible developed".split()),
        # Male or female ambiguous, like: gonadLength1
        VOCAB.grouper(
            "ambiguous",
            """ ambiguous_key dim_side | side ambiguous_key dimension | ambiguous_key dimension """,
        ),
        # These patterns contain measurements to both left & right ovaries
        # E.g.: reproductive data: ovaries left 10x5 mm, right 10x6 mm
        VOCAB.producer(double, """ label ovary side_cross """),
        # As above but without the ovaries marker:
        # E.g.: reproductive data: left 10x5 mm, right 10x6 mm
        VOCAB.producer(double, """label side_cross"""),
        # Has the ovaries marker but is lacking the label
        # E.g.: ovaries left 10x5 mm, right 10x6 mm
        VOCAB.producer(double, """ ovary side_cross """),
        # A typical ovary size notation
        # E.g.: reproductive data: ovaries 10x5 mm
        VOCAB.producer(convert, " label ovary value "),
        # E.g.: reproductive data: left ovaries 10x5 mm
        VOCAB.producer(convert, " label side ovary value "),
        # E.g.: left ovaries 10x5 mm
        VOCAB.producer(convert, " side ovary value "),
        # May have a few words between the label and the measurement
        VOCAB.producer(
            convert,
            """ label ( ovary | state | word | sep ){0,3} ( ovary | state ) value """,
        ),
        # Handles: gonadLengthInMM 4x3
        # And:     gonadLength 4x3
        VOCAB.producer(convert, "( ambiguous | key_with_units ) value"),
        # E.g.: gonadLengthInMM 6 x 8
        VOCAB.producer(
            convert,
            """ ( key_with_units | ambiguous ) ( ovary | state | word | sep ){0,3} ( ovary | state ) value """,
        ),
        # Anchored by ovaries but with words between
        VOCAB.producer(convert, "ovary ( state | word | sep ){0,3} state value"),
        # Anchored by ovaries but with only one word in between
        # E.g.: ovaries 9mm
        VOCAB.producer(convert, "side? ovary ( state | word ) value"),
        # E.g.: Ovaries 5 x 3
        VOCAB.producer(convert, "side? ovary value"),
    ],
)
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Build a Trait saying whether the match reports a pregnancy.

    The value is "pregnant" when the token has a truthy ``pos`` group and
    "not pregnant" otherwise. The span is copied from the token.
    """
    state = "pregnant" if token.group.get("pos") else "not pregnant"
    return Trait(value=state, start=token.start, end=token.end)


# Rule table for pregnancy state, using pos/neg capture groups.
PREGNANCY_STATE = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        # Spellings of "pregnant" and related words
        VOCAB.term(
            "pregnant",
            r""" prega?n?ant pregnan preg pregnancy pregnancies gravid """.split(),
        ),
        VOCAB.part("separator", r' [;,"] '),
        # The negated forms are listed before the plain positive form
        VOCAB.producer(convert, """ (?P<neg> pregnant none) """),
        VOCAB.producer(convert, """ (?P<neg> none pregnant ) """),
        VOCAB.producer(convert, """ (?P<pos> pregnant ) """),
    ],
)
# Rule table for pregnancy state, capturing the matched phrase as "value".
PREGNANCY_STATE = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        # Spellings of "pregnant" plus the "-parous" family of words
        VOCAB.term(
            "pregnant",
            r""" prega?n?ant pregnan preg pregnancy pregnancies gravid post[\s\-]?parous multiparous nulliparous parous primiparous """.split(),
        ),
        VOCAB.term("joiner", r""" of were """.split()),
        VOCAB.term(
            "recent",
            r""" recently recent was previously prev """.split(),
        ),
        VOCAB.term(
            "probably",
            r""" probably prob possibly possible appears? very visible visibly evidence evident """.split(),
        ),
        VOCAB.term("stage", r" early late mid ".split()),
        VOCAB.part("separator", r' [;,"] '),
        # E.g.: pregnancy visible
        VOCAB.producer(
            convert,
            """ (?P<value> pregnant joiner? none? probably quest? ) """,
        ),
        # E.g.: Probably early pregnancy
        VOCAB.producer(
            convert,
            """ (?P<value> none? (recent | probably)? stage? (none | joiner)? pregnant quest? ) """,
        ),
    ],
)
# Rule table for nipple state notations.
NIPPLE_STATE = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        VOCAB.term("false", """ false """),
        VOCAB.term("much", """ much """),
        # E.g. forms like: "post lactation", "indicate previous lactating"
        VOCAB.term(
            "lactation",
            r""" (indicate \s+)? (( previous | post | prior ) [\s-] ) (lactation | lactating | lac ) """,
        ),
        VOCAB.term(
            "other",
            """ protuberant prominent showing worn distended """.split(),
        ),
        # Separates measurements
        VOCAB.part("separator", r' [;"?/,] '),
        # Skip arbitrary words
        VOCAB["word"],
        VOCAB.grouper(
            "state_end",
            """ ( size | fully | partially | other | lactation | color | false | visible | tissue | present | active | developed ) """,
        ),
        VOCAB.grouper("state_mid", """ ( uterus | and ) """),
        # State words first, nipple keyword last
        VOCAB.producer(
            convert,
            """(?P<value> non? (state_end | much) (state_mid | state_end){0,2} nipple) """,
        ),
        # Optional negation, nipple keyword, then state words
        VOCAB.producer(
            convert,
            """(?P<value> non? nipple (state_end | much) (state_mid | state_end){0,2} ) """,
        ),
        # Nipple keyword, optional negation, then state words
        VOCAB.producer(
            convert,
            """(?P<value> nipple non? (state_end | much) (state_mid | state_end){0,2} ) """,
        ),
    ],
)
# Rule table for body mass notations.
BODY_MASS = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        # UUIDs cause problems with numbers
        VOCAB["uuid"],
        # Looking for keys like: MassInGrams
        VOCAB.term(
            "key_with_units",
            r""" ( weight | mass) [\s-]* in [\s-]* (?P<mass_units> grams | g | lbs ) """,
        ),
        # These words indicate a body mass follows
        VOCAB.part("key_leader", "full observed total".split()),
        # Words for weight
        VOCAB.part("weight", "weights? weigh(ed|ing|s)?".split()),
        # Keys like: w.t.
        VOCAB.part("key_with_dots", r" \b w \.? \s? t s? \.? "),
        # Common prefixes that indicate a body mass
        VOCAB.part("mass", "mass"),
        VOCAB.part("body", "body"),
        # These indicate that the mass is NOT a body mass
        VOCAB.term(
            "other_wt",
            """ femur baculum bacu bac spleen thymus kidney testes testis ovaries epididymis epid """.split(),
        ),
        # Separators
        VOCAB["word"],
        VOCAB["semicolon"],
        VOCAB["comma"],
        # Any key not preceded by "other_wt" is considered a weight key
        VOCAB.grouper(
            "wt_key",
            """ (?<! other_wt ) ( key_leader weight | key_leader mass | body weight | body mass | body | weight | mass | key_with_dots ) """,
        ),
        VOCAB.grouper("key", " wt_key ".split()),
        VOCAB.producer(compound, " key? compound_wt "),
        # Shorthand notation like: on tag: 11-22-33-44=99g
        VOCAB.producer(
            shorthand,
            [
                "key shorthand",
                "shorthand",
                "key shorthand_bats",
                "shorthand_bats",
            ],
        ),
        # Units before the number, and no length units after it
        VOCAB.producer(simple_mass, " wt_key mass_units number (?! len_units ) "),
        VOCAB.producer(simple_mass, " wt_key mass_range "),
        VOCAB.producer(simple_mass, " ( key | triple_key ) mass_range mass_units "),
        VOCAB.producer(simple_mass, " (?P<key> key_with_units ) mass_range "),
    ],
)
# Rule table for hind foot length notations.
HIND_FOOT_LENGTH = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        # UUIDs cause problems with numbers
        VOCAB["uuid"],
        # Units are in the key, like: HindFootLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r"""( hind \s* )? foot \s* ( length | len ) \s* in \s* (?P<units> millimeters | mm ) """,
        ),
        # Standard keywords that indicate a hind foot length follows
        VOCAB.term(
            "key",
            [
                r"hind \s* foot \s* with \s* (?P<includes> claw )",
                r"hind \s* foot ( \s* ( length | len ) )?",
                "hfl | hf",
            ],
        ),
        # Separator used by some patterns (declared with capture=False)
        VOCAB.part("sep", r" [;,] | $ ", capture=False),
        VOCAB.grouper("noise", " word dash ".split()),
        # Handle fractional values like: hindFoot 9/16"
        VOCAB.producer(
            fraction,
            [
                "key len_fraction units",  # E.g.: hindFoot = 9/16 inches
                "key len_fraction",  # E.g.: hindFoot = 9/16
            ],
        ),
        # A typical hind-foot notation
        VOCAB.producer(
            simple,
            [
                "key_with_units len_range",  # E.g.: hindFootLengthInMM=9-10
                "key noise? len_range units ",  # E.g.: hindFootLength=9-10 mm
                "key noise? len_range",  # Missing units like: hindFootLength 9-10
                "key dash number units",
            ],
        ),
        # Shorthand notations, reported under the "shorthand_hfl" measurement
        VOCAB.producer(
            partial(shorthand_length, measurement="shorthand_hfl"),
            [
                "shorthand",
                "key shorthand_bats",
                "shorthand_bats",
                # Handle a truncated shorthand notation
                "triple_key shorthand_triple (?! shorthand | len_range )",
            ],
        ),
    ],
)
# Rule table for placental scar count notations.
PLACENTAL_SCAR_COUNT = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        # UUIDs cause problems with numbers
        VOCAB["uuid"],
        VOCAB["shorthand"],
        # Adjectives to placental scars
        VOCAB.term(
            "adj",
            r""" faint prominent recent old possible """.split(),
        ),
        # Skip arbitrary words
        VOCAB["word"],
        VOCAB.part("sep", r" [;/] "),
        VOCAB.grouper(
            "count",
            """ none embryo conj | none visible | integer | none """,
        ),
        # count op count, optional "= total", then the scar keyword
        VOCAB.producer(
            convert_count,
            """(?P<count1> count ) op (?P<count2> count ) ( eq (?P<value> count ) )? plac_scar """,
        ),
        # Scar keyword, then one or two "count side" pairs
        VOCAB.producer(
            convert_count,
            """plac_scar op? (?P<count1> count ) prep? (?P<side1> side ) ( (?P<count2> count ) prep? (?P<side2> side ) )? """,
        ),
        # "count side" before the scar keyword, optional second pair after it
        VOCAB.producer(
            convert_count,
            """ (?P<count1> count ) prep? (?P<side1> side ) plac_scar ( (?P<count2> count ) prep? (?P<side2> side ) (plac_scar)? )? """,
        ),
        # "side count" before the scar keyword, optional second pair after it
        VOCAB.producer(
            convert_count,
            """ (?P<side1> side ) (?P<count1> count ) (visible | op)? plac_scar ( (?P<side2> side ) (?P<count2> count ) (visible)? (visible | op)? plac_scar? )? """,
        ),
        # One or two "count side" pairs before the scar keyword, not after "lut"
        VOCAB.producer(
            convert_count,
            """ (?<! lut ) (?P<count1> count ) prep? (?P<side1> side ) ( (?P<count2> count ) prep? (?P<side2> side ) )? plac_scar """,
        ),
        # Scar keyword between each count and its side
        VOCAB.producer(
            convert_count,
            """ (?P<count1> count ) plac_scar (?P<side1> side ) ( (?P<count2> count ) plac_scar? (?P<side2> side ) )? """,
        ),
        # Scar keyword, then "side count", optionally repeated
        VOCAB.producer(
            convert_count,
            """ plac_scar (?P<side1> side ) (?P<count1> count ) ( plac_scar (?P<side2> side ) (?P<count2> count ) )? """,
        ),
        # Scar keyword, then count op count with optional "= total"
        VOCAB.producer(
            convert_count,
            """ plac_scar (?P<count1> count ) op (?P<count2> count ) ( eq (?P<value> count ) )? """,
        ),
        # Total before the scar keyword, optional per-side counts after it
        VOCAB.producer(
            convert_count,
            """ (?P<value> count ) adj? plac_scar op? ( (?P<count1> count ) (?P<side1> side ) op? (?P<count2> count ) (?P<side2> side ) )? """,
        ),
        # Total before the scar keyword, with no count following it
        VOCAB.producer(
            convert_count,
            """ (?P<value> count ) embryo? plac_scar (?! count ) """,
        ),
        # Scar keyword, then a single "count side"
        VOCAB.producer(
            convert_count,
            """ plac_scar eq? (?P<count1> count ) (?P<side1> side ) """,
        ),
        # Scar keyword, then a total
        VOCAB.producer(convert_count, """ plac_scar eq? (?P<value> count ) """),
        # The scar keyword alone goes to convert_state
        VOCAB.producer(convert_state, """ plac_scar """),
    ],
)
# Rule table for ear length notations.
EAR_LENGTH = Base(
    name=__name__.rsplit(".", 1)[-1],
    fix_up=fix_up,
    rules=[
        # UUIDs cause problems with numbers
        VOCAB["uuid"],
        # Units are in the key, like: EarLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r""" ear \s* ( length | len ) \s* in \s* (?P<len_units> millimeters | mm ) """,
        ),
        # Abbreviation containing the measured from notation, like: e/n or e/c
        VOCAB.part(
            "char_measured_from",
            r""" (?<! [a-z] ) (?<! [a-z] \s ) (?P<ambiguous_key> e ) /? (?P<measured_from1> n | c ) [-]? (?! \.? [a-z] ) """,
        ),
        # The abbreviation key, just: e. This can be a problem.
        VOCAB.part(
            "char_key",
            r""" (?<! \w ) (?<! \w \s ) (?P<ambiguous_key> e ) (?! \.? \s? [a-z\(] ) """,
        ),
        # Standard keywords that indicate an ear length follows
        VOCAB.term(
            "keyword",
            [
                r" ear \s* from \s* (?P<measured_from1> notch | crown )",
                r" ear \s* ( length | len )",
                r" ear (?! \s* tag )",
                r" ef (?P<measured_from2> n | c ) [-]?",
            ],
        ),
        # Some patterns require a separator
        VOCAB["word"],
        VOCAB.part("sep", " [;,] "),
        # Consider any of the following as just a key
        VOCAB.grouper("key", "keyword char_key char_measured_from".split()),
        # Handle fractional values like: ear 9/16"
        VOCAB.producer(fraction, "key len_fraction (?P<units> len_units )?"),
        # E.g.: earLengthInMM 9-10
        VOCAB.producer(simple_len, "(?P<key> key_with_units ) len_range"),
        # E.g.: ear 9-10 mm
        VOCAB.producer(simple_len, "key len_range (?P<units> len_units )?"),
        # Shorthand notation like: on tag: 11-22-33-44=99g
        VOCAB.producer(
            partial(shorthand_length, measurement="shorthand_el"),
            [
                "shorthand",
                "shorthand_bats",
            ],
        ),
    ],
)
# Rule table for testes size notations.
TESTES_SIZE = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        # UUIDs cause problems with numbers
        VOCAB["uuid"],
        # Note: abbrev differs from the one in the testes_state_trait
        VOCAB.term("abbrev", "tes ts tnd td tns ta".split()),
        # The abbreviation key, just: t. This can be a problem.
        VOCAB.part("char_key", r" \b t (?! [a-z] )"),
        # A key with units, like: gonadLengthInMM
        VOCAB.term(
            "key_with_units",
            r""" (?P<ambiguous_key> gonad ) \s* (?P<dim> length | len | width ) \s* in \s* (?P<len_units> millimeters | mm ) """,
        ),
        # A cross measurement, or a number not followed by mass units
        VOCAB.grouper(
            "value",
            """ cross | number len_units? (?! mass_units ) """,
        ),
        VOCAB.grouper(
            "state",
            ["""(non | partially | fully )? descended """]
            + """ scrotal abdominal size other """.split(),
        ),
        # Male or female ambiguous, like: gonadLength1
        VOCAB.grouper(
            "ambiguous",
            """ ambiguous_key dim_side | side ambiguous_key dimension | ambiguous_key dimension """,
        ),
        # These patterns contain measurements to both left & right testes
        # E.g.: reproductive data: tests left 10x5 mm, right 10x6 mm
        VOCAB.producer(double, """label ( testes | abbrev | char_key ) side_cross """),
        # As above but without the testes marker:
        # E.g.: reproductive data: left 10x5 mm, right 10x6 mm
        VOCAB.producer(double, """ label side_cross """),
        # Has the testes marker but is lacking the label
        # E.g.: testes left 10x5 mm, right 10x6 mm
        VOCAB.producer(
            double,
            """ ( testes | abbrev | char_key ) side_cross """,
        ),
        # Label, then two "side number units?" groups
        VOCAB.producer(
            double,
            """ label (?P<side_1> side ) (?P<value_1> number ) (?P<units_1> len_units )? (?P<side_2> side ) (?P<value_2> number ) (?P<units_2> len_units )? """,
        ),
        # Testes marker, then two numbers separated by a dash
        VOCAB.producer(
            convert,
            """ ( testes | abbrev | char_key ) (?P<value_1> number ) (?P<units_1> len_units )? dash (?P<value_2> number ) (?P<units_2> len_units )? """,
        ),
        # A typical testes size notation
        # E.g.: reproductive data: tests 10x5 mm
        VOCAB.producer(
            convert,
            """ label ( testes | abbrev | char_key ) side_cross """,
        ),
        # E.g.: reproductive data: left tests 10x5 mm
        VOCAB.producer(
            convert,
            """ label side ( testes | abbrev | char_key ) (dash | comma)? value""",
        ),
        # E.g.: reproductive data=T: L-2x4mm
        VOCAB.producer(
            convert,
            """ label ( testes | abbrev | char_key ) side dash? value """,
        ),
        # E.g.: reproductive data: left 10x5 mm
        VOCAB.producer(convert, "label side dash? value len_units?"),
        # E.g.: reproductive data: 10x5 mm
        VOCAB.producer(convert, "label value len_units?"),
        # Has the testes marker but is lacking the label
        # E.g.: testes left 10x5 mm, right 10x6 mm
        VOCAB.producer(convert, """( testes | abbrev ) value """),
        # May have a few words between the label and the measurement
        # E.g.: reproductive data=testes not descended - 6 mm
        VOCAB.producer(
            convert,
            """ label ( testes | abbrev | state | word | sep | char_key){0,3} ( testes | abbrev | state | char_key ) ( dash | comma )? value """,
        ),
        # Handles: gonadLengthInMM 4x3
        # And:     gonadLength 4x3
        VOCAB.producer(convert, "( ambiguous | key_with_units ) value"),
        # E.g.: gonadLengthInMM 6 x 8
        VOCAB.producer(
            convert,
            """ ( key_with_units | ambiguous ) ( testes | abbrev | state | word | sep | char_key ){0,3} ( testes | abbrev | state | char_key ) value """,
        ),
        # Anchored by testes but with words between
        # E.g.: testes scrotal; T = 9mm
        VOCAB.producer(
            convert,
            """ testes ( abbrev | state | word | sep | char_key ){0,3} ( abbrev | state | char_key ) value """,
        ),
        # Anchored by testes but with only one word in between
        # E.g.: testes scrotal 9mm
        VOCAB.producer(
            convert,
            """ testes ( abbrev | state | word | char_key ) ( comma | dash )? value """,
        ),
        # E.g.: Testes 5 x 3
        VOCAB.producer(
            convert,
            """ ( testes | state | abbrev ) (comma | dash | x )? value """,
        ),
        # E.g.: T 5 x 4
        VOCAB.producer(convert, " (?P<ambiguous_char> char_key ) value "),
    ],
)
# Rule table for lactation state, capturing the matched phrase as "value".
LACTATION_STATE = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        # Spellings (including misspellings) of "lactating" and related words
        VOCAB.part(
            "lactating",
            r""" ( lactating | lactation | lactated | lactate | lact | lactaing | lactacting | lactataing | lactational | oelact | celact | lactati | lactacting | lactatin | lactatting | lactatng | nursing | suckling ) \b """,
        ),
        VOCAB.part("not", r" \b ( not | non | no ) "),
        # NOTE(review): "pre" is listed here and also has its own part below.
        VOCAB.part(
            "post",
            r""" \b ( (( just | recently ) \s+ )? finished | post | recently | recent | had | pre ) """,
        ),
        VOCAB.part("pre", r" \b pre [\s\-]? "),
        # Separates measurements
        VOCAB.part("separator", r' [;"/] '),
        VOCAB["word"],
        VOCAB.grouper("prefix", "not post pre".split()),
        # Optional prefix, the lactating word, optional question mark
        VOCAB.producer(convert, """ (?P<value> prefix? lactating quest? ) """),
    ],
)
VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Build a Trait from the token's ``value`` group.

    The captured text is lower-cased and used as the trait value; the span is
    copied from the token. ``is_flag_in_token`` is then called on the trait
    for the "ambiguous_key" flag.
    """
    text = token.group["value"].lower()
    result = Trait(value=text, start=token.start, end=token.end)
    result.is_flag_in_token(token, "ambiguous_key")
    return result


# Rule table for scrotal state, capturing the matched phrase as "value".
SCROTAL_STATE = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()),
        VOCAB.term("scrotal_abbrev", "ns sc".split()),
        # If possible exclude length. Ex: reproductive data=testes: 11x7 mm
        VOCAB.grouper("length", "cross len_units?"),
        # Testes marker, optional negation, then a scrotal word or abbreviation
        VOCAB.producer(
            convert,
            """ (?P<value> ( testes | testes_abbrev ) non? ( scrotal | scrotal_abbrev ) ) """,
        ),
        # A scrotal word on its own, optionally negated
        VOCAB.producer(convert, """ (?P<value> non? scrotal ) """),
        # A bare abbreviation only counts directly after a label
        VOCAB.producer(convert, """ label (?P<value> scrotal_abbrev ) """),
    ],
)
# Rule table for embryo count notations.
#
# In the producer patterns, "total" captures an overall count, while "sub" and
# "subcount" capture a side (or sex) and the count attached to it.
EMBRYO_COUNT = Base(
    name=__name__.rsplit(".", 1)[-1],
    rules=[
        # UUIDs cause problems with numbers
        VOCAB["uuid"],
        VOCAB["shorthand"],
        VOCAB["metric_mass"],
        VOCAB.part(
            "sex",
            r""" males? | females? | (?<! [a-z] ) [mf] (?! [a-z] ) """,
        ),
        VOCAB.term("repo_key", r""" reproductive \s data """),
        VOCAB.term("near_term", r" near[\s-]?term"),
        VOCAB.term("each_side", r" each \s side "),
        VOCAB.term("skip", r" w wt ".split()),
        VOCAB.part("sep", r" [;] "),
        VOCAB.part("bang", r" [!] "),
        VOCAB.grouper(
            "count",
            """ none (word | plac_scar) conj | integer | none | num_words | bang """,
        ),
        VOCAB.grouper("present", " found | near_term "),
        VOCAB.grouper("numeric", " integer | real "),
        VOCAB.grouper("skip_len", " ( x? numeric metric_len ) | (x numeric metric_len?) "),
        VOCAB.grouper("skip_words", " word | numeric | metric_len | eq "),
        VOCAB.grouper("side_link", " x | conj | word "),
        VOCAB.grouper("between", "side_link? | skip_words{,4}"),
        # --- Patterns that start with the embryo keyword ---
        VOCAB.producer(
            convert,
            """ embryo eq? (?P<total> count ) skip_len? (?P<sub> side ) (?P<subcount> count ) between (?P<sub> side ) (?P<subcount> count ) """,
        ),
        VOCAB.producer(
            convert,
            """ embryo eq? (?P<sub> side ) (?P<subcount> count ) between embryo? (?P<sub> side ) (?P<subcount> count ) embryo? """,
        ),
        VOCAB.producer(
            convert,
            """ embryo eq? (?P<total> count ) skip_words{,4} (?P<subcount> count ) (?P<sub> side ) between (?P<subcount> count ) (?P<sub> side ) """,
        ),
        VOCAB.producer(
            convert,
            """ embryo eq? (?P<subcount> count ) (?P<sub> side ) between (?P<subcount> count ) (?P<sub> side ) eq (?P<total> count ) """,
        ),
        VOCAB.producer(
            convert,
            """ embryo eq? (?P<subcount> count ) (?P<sub> side ) between (?P<subcount> count ) (?P<sub> side ) """,
        ),
        VOCAB.producer(
            convert,
            """ embryo eq? (?P<subcount> count ) skip_len (?P<sub> side ) """,
        ),
        # --- Patterns handled by found ---
        VOCAB.producer(found, """ embryo word? (?P<sub> side ) (?! plac_scar ) """),
        VOCAB.producer(found, """ embryo present | present embryo """),
        # --- Patterns that start with a total count ---
        VOCAB.producer(
            convert,
            """ (?P<total> count ) near_term? embryo (?! plac_scar ) """,
        ),
        VOCAB.producer(
            convert,
            """ (?P<total> count ) near_term? embryo (?! plac_scar ) skip_len? (?P<subcount> count ) (?P<sub> side | sex ) side_link? (?P<subcount> count ) (?P<sub> side | sex ) """,
        ),
        VOCAB.producer(
            convert,
            """ (?P<total> count ) ( size | word )? embryo (?! plac_scar ) """,
        ),
        VOCAB.producer(
            convert,
            """ (?P<total> count ) ( size | word )? embryo (?! plac_scar ) (?P<subcount> count ) (?P<sub> side ) side_link? (?P<subcount> count ) (?P<sub> side ) """,
        ),
        VOCAB.producer(
            convert,
            """ (?P<total> count ) skip_len? embryo (?! plac_scar ) (?P<subcount> count ) (?P<sub> side ) side_link? (?P<subcount> count ) (?P<sub> side ) """,
        ),
        VOCAB.producer(
            convert,
            """ (?P<total> count ) skip_len embryo (?! plac_scar ) """,
        ),
        # --- Patterns built from per-side counts ---
        VOCAB.producer(
            convert,
            """ (?P<sub> side ) eq? (?P<subcount> count ) eq? side_link? (?P<sub> side ) eq? (?P<subcount> count ) eq? numeric? embryo """,
        ),
        VOCAB.producer(
            convert,
            """ (?P<subcount> count ) (?P<sub> side ) side_link? (?P<subcount> count ) (?P<sub> side ) embryo """,
        ),
        VOCAB.producer(
            convert,
            """ (?P<subcount> count ) embryo word? (?P<sub> side ) side_link? (?P<subcount> count ) word? (?P<sub> side ) """,
        ),
        VOCAB.producer(
            convert,
            """ repo_key ( eq | word ){,2} (?P<subcount> count ) (?P<sub> side ) side_link? (?P<subcount> count ) (?P<sub> side ) """,
        ),
        VOCAB.producer(
            convert,
            """ (?P<total> count ) embryo (?P<sub> side ) eq? (?P<subcount> count ) side_link? (?P<sub> side ) eq? (?P<subcount> count ) """,
        ),
        VOCAB.producer(
            convert,
            """ (?P<sub> side ) (?P<subcount> count ) embryo skip_len? (?P<sub> side ) (?P<subcount> count ) embryo? """,
        ),
        VOCAB.producer(
            convert,
            """ (?P<subcount> count ) (?P<sub> side ) x (?P<subcount> count ) (?P<sub> side ) x eq? skip_len? embryo """,
        ),
        VOCAB.producer(
            convert,
            """ (?P<sub> side ) skip_words{,4} (?P<subcount> count ) embryo? skip_len (?P<sub> side ) skip_words{,4} (?P<subcount> count ) """,
        ),
        # --- Shorter patterns ---
        VOCAB.producer(convert, """ (?P<subcount> count ) embryo (?P<sub> side )"""),
        VOCAB.producer(convert, """ embryo eq? (?P<total> count )"""),
        VOCAB.producer(
            convert,
            """ (?P<subcount> count ) (?P<sub> side ) skip_words{,3} embryo """,
        ),
        # A count followed by the "each side" phrase goes to each_side
        VOCAB.producer(each_side, """ (?P<subcount> count ) embryo each_side """),
    ],
)
import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base, convert

# NOTE(review): `Vocabulary` is used below but no import for it is visible in
# this part of the file -- confirm it is brought into scope elsewhere.
VOCAB = Vocabulary(patterns.VOCAB)

# Rules for recognizing sex notations. The parser's name is the last dotted
# component of this module's name.
SEX = Base(
    name=__name__.rpartition(".")[-1],
    rules=[
        # The JSON key that introduces a sex value
        VOCAB.term("sex_key", "sex"),
        # The recognized sexes, singular or plural
        VOCAB.term("sex_vocab", ["females?", "males?"]),
        # Words signalling that "sex" is not being used as a key
        VOCAB.term("not_sex", ["and", "is", "was"]),
        # A loose word token: starts with a letter and runs until a
        # separator-like character or whitespace
        VOCAB.part("word", r' \b [a-z] [^;,"=:\s]* '),
        # Terminator (semicolon, comma, double quote, or end of text)
        # required by some patterns
        VOCAB.part("separator", ' [;,"] | $ '),
        # Key, then one or two words, then a terminator.
        # E.g.: sex might be female;
        VOCAB.producer(
            convert,
            """ sex_key (?P<value> ( sex_vocab | word ){1,2} quest? ) separator """,
        ),
        # Key followed by a single word, with an optional question mark.
        # E.g.: sex=female?, Or: sex=unknown
        VOCAB.producer(
            convert,
            " sex_key (?P<value> ( sex_vocab | word ) quest? ) ",
        ),
        # A bare sex word without any key.
        # E.g.: male, Or: male?
        VOCAB.producer(
            convert,
            " (?P<value> sex_vocab quest? ) ",
        ),
    ],
)
# Rules for recognizing embryo length (crown-rump) notations. The parser's
# name is the last dotted component of this module's name, and `fix_up` is
# handed to Base as its fix-up hook.
EMBRYO_LENGTH = Base(
    name=__name__.rpartition(".")[-1],
    fix_up=fix_up,
    rules=[
        # UUIDs cause problems with numbers
        VOCAB["uuid"],
        VOCAB["shorthand"],
        # Crown-rump keys. The two leading lookbehinds reject a key that comes
        # right after "collector" or "reg" plus one separator character.
        # Accepted spellings: "crown"/"cr" optionally joined to "rump" by an
        # underscore, whitespace, a dash or " to "; a stand-alone "crl"; or
        # "c r" with optional dots, not touching other letters.
        VOCAB.part(
            "embryo_len_key",
            (
                r" (?<! collector [\s=:.] ) (?<! reg [\s=:.] ) ( "
                r"( crown | cr ) ( [_\s\-] | \s+ to \s+ )? rump "
                r"| (?<! [a-z] ) crl (?! [a-z] ) "
                r"| (?<! [a-z] ) c \.? r \.? (?! [a-z] ) )"
            ),
        ),
        # The word "length"/"len" when it is not the prefix of a longer word
        VOCAB.part("len", r" (length | len) (?! [a-z] ) "),
        # A parenthesized number followed by a word, such as "(12 mm)"
        VOCAB.part("other", r" \( \s* \d+ \s* \w+ \s* \) "),
        VOCAB.part("separator", r' [;"/.] '),
        # A measurement: a cross, or a number with optional length units that
        # is not followed by a sex token
        VOCAB.grouper("value", """ cross | number len_units? (?! sex ) """),
        # A key with an optional "length" word and optional "=" or ":"
        VOCAB.grouper("key", """ embryo_len_key len? ( eq | colon )? """),
        # Embryo counts written per side, or as a sum with an optional total
        VOCAB.grouper(
            "count",
            """ number side number side eq? | number plus number ( eq number )? """,
        ),
        # Token runs that must not follow a key-less value
        VOCAB.grouper("skip", " prep word cross | other | side "),
        # Key before the value
        VOCAB.producer(convert, """ embryo? key value quest? """),
        # Value before the key
        VOCAB.producer(convert, """ embryo? x? value key quest? """),
        # Several values after the word embryo, handled by convert_many
        VOCAB.producer(
            convert_many,
            """ embryo count? value{2,} (?! skip ) quest? """,
        ),
        # Key before the value, allowing an "x" between them
        VOCAB.producer(convert, """ embryo? key x? value quest? """),
        # NOTE(review): this pattern is identical to the second producer
        # above -- confirm whether the repetition is intentional.
        VOCAB.producer(convert, """ embryo? x? value key quest? """),
        # No key at all: the word embryo followed directly by a value
        VOCAB.producer(convert, """ embryo x? value (?! skip ) quest? """),
        # Embryo, optional count, then a value with trailing units, handled
        # by isolate
        VOCAB.producer(
            isolate,
            """ embryo colon? count? value len_units quest? """,
        ),
    ],
)
# Rules for recognizing tail length notations. The parser's name is the last
# dotted component of this module's name, and `fix_up` is handed to Base as
# its fix-up hook.
TAIL_LENGTH = Base(
    name=__name__.rpartition(".")[-1],
    fix_up=fix_up,
    rules=[
        # UUIDs cause problems with numbers
        VOCAB["uuid"],
        # Keys that carry their own units, like: tailLengthInMM
        VOCAB.term(
            "key_with_units",
            r""" tail \s* ( length | len ) \s* in \s* (?P<units> millimeters | mm ) """,
        ),
        # The one-letter key "t". It is captured as ambiguous_key because a
        # lone "t" can be a problem; it must not be followed by a letter or
        # by an underscore plus a non-digit.
        VOCAB.part(
            "char_key",
            r""" \b (?P<ambiguous_key> t ) (?! [a-z] ) (?! _ \D ) """,
        ),
        # Ordinary keywords announcing that a tail length follows
        VOCAB.term(
            "keyword",
            [
                r" tail \s* length ",
                r" tail \s* len ",
                "tail",
                "tal",
            ],
        ),
        # Separator needed by some patterns (not captured)
        VOCAB.part("sep", r" [;,] | $ ", capture=False),
        # Either kind of token above counts as a key
        VOCAB.grouper("key", ["keyword", "char_key"]),
        # Fractional values like: tailLength 9/16"
        VOCAB.producer(
            fraction,
            [
                # With units, e.g.: tail = 9/16 in
                "key len_fraction (?P<units> len_units )",
                # Without units, e.g.: tail = 9/16
                "key len_fraction",
            ],
        ),
        # Plain values and ranges
        VOCAB.producer(
            simple,
            [
                # Units inside the key, e.g.: tailLengthInMM=9-10
                "key_with_units len_range",
                # Units after the value, e.g.: tailLength=9-10 mm
                "key len_range (?P<units> len_units )",
                # No units at all, e.g.: tailLength 9-10
                "key len_range",
            ],
        ),
        # Shorthand notations, read with the "shorthand_tal" measurement
        VOCAB.producer(
            partial(shorthand_length, measurement="shorthand_tal"),
            [
                "shorthand",
                "key shorthand_bats",
                "shorthand_bats",
                # A truncated shorthand notation
                "triple_key shorthand_triple (?! shorthand | len_range )",
            ],
        ),
    ],
)
# Rules for recognizing testes state notations. The parser's name is the last
# dotted component of this module's name.
TESTES_STATE = Base(
    name=__name__.rpartition(".")[-1],
    rules=[
        # Short forms standing in for "testes"
        VOCAB.term("abbrev", ["tes", "ts", "tnd", "td", "tns", "ta", "t"]),
        VOCAB["uterus"],
        # Token sequences that together describe a state
        VOCAB.grouper(
            "state",
            [
                "non fully descended",
                "abdominal non descended",
                "abdominal descended",
                "non descended",
                "fully descended",
                "partially descended",
                "size non descended",
                "size descended",
                "descended",
            ],
        ),
        # Collapse a testes length into one token so it is easy to skip
        VOCAB.grouper("length", "cross len_units?"),
        # A testes word (or abbreviation / ambiguous key), an optional length,
        # then a state, optionally joined to a second state or size
        VOCAB.producer(
            convert,
            """ (?P<value> ( testes | abbrev | ambiguous_key ) length? ( state | abdominal | size ) ( conj? ( state | size ) )? ) """,
        ),
        # A negation ahead of the testes word, with an optional state
        VOCAB.producer(
            convert,
            """ (?P<value> non ( testes | abbrev | ambiguous_key ) ( state )? ) """,
        ),
        # A label followed by a size, with optional testes word, length and
        # trailing state
        VOCAB.producer(
            convert,
            """ label (?P<value> ( testes | abbrev )? length? size ( conj? state )? ) """,
        ),
    ],
)