"abdominal non descended", "abdominal descended", "non descended", "fully descended", "partially descended", "size non descended", "size descended", "descended", ], ), # Simplify the testes length so it can be skipped easily VOCAB.grouper("length", "cross len_units?"), VOCAB.producer( convert, r""" (?P<value> ( testes | abbrev | ambiguous_key ) length? ( state | abdominal | size ) ( conj? ( state | size ) )? ) """, ), VOCAB.producer( convert, r""" (?P<value> non ( testes | abbrev | ambiguous_key ) ( state )? ) """, ), VOCAB.producer( convert, """ label (?P<value> ( testes | abbrev )? length? size ( conj? state )? ) """, ), ], )
"""Parse vagina state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base, convert

# Module vocabulary, seeded from the shared reproductive patterns.
VOCAB = Vocabulary(patterns.VOCAB)

# NOTE: the anatomical term was masked (as "v" + asterisks + "a") in the
# docstring, the rule name, the regex and the producer patterns.  As a regex
# that text is a "multiple repeat" error and it is not usable as a token name,
# so the plain word is restored consistently everywhere it is referenced.
VAGINA_STATE = Base(
    # The parser is named after the last component of the dotted module name.
    name=__name__.split(".")[-1],
    rules=[
        # The organ itself.  The negative lookbehind rejects a match that is
        # immediately preceded by "sal".
        VOCAB.part("vagina", r""" (?<! sal ) ( vagina | vag | vulva ) """),
        # Two- and three-letter shorthand codes.
        VOCAB.term("abbrev", r""" ov cv [oc][sme][ln] vc vo """.split()),
        # Words meaning the closed state.
        VOCAB.part(
            "closed",
            r""" closed | imperforated | imperf | cerrada | non [-\s] perforated"""
            r""" | unperforate | non [-\s] perf | clsd | imp """,
        ),
        # Words meaning the open state.
        VOCAB.part("open", r""" open | perforated? | perf | abrir """),
        # Other recognized conditions.
        VOCAB.part("other", r""" swollen | plugged | plug | sealed """),
        # Any one of the three state categories above.
        VOCAB.grouper("state", """ closed | open | other """),
        # Organ first, optional "partially", then a state.
        VOCAB.producer(convert, """ (?P<value> vagina partially? state ) """),
        # State first, then the organ, optionally followed by another state.
        VOCAB.producer(convert, """ (?P<value> state vagina state? ) """),
        # A bare state or abbreviation, optionally followed by the organ.
        VOCAB.producer(convert, """ (?P<value> ( state | abbrev ) vagina? ) """),
    ],
)
def convert(token):
    """Turn a matched token into a nipples-enlarged trait.

    The trait value is "enlarged" when the token's ``pos`` group is present
    and truthy, and "not enlarged" otherwise.  The trait's start and end are
    copied from the token.
    """
    if token.group.get("pos"):
        state = "enlarged"
    else:
        state = "not enlarged"
    return Trait(value=state, start=token.start, end=token.end)


NIPPLES_ENLARGED = Base(
    # The parser is named after the last component of the dotted module name.
    name=__name__.rpartition(".")[2],
    rules=[
        VOCAB["conj"],
        VOCAB.part("separator", r' [;"?/,] '),
        # The two abbreviation families differ only in the middle letter:
        # "e" for the enlarged codes, "s" for the not-enlarged codes.
        VOCAB.term("enlarged_abbrev", r"[oc]e[ln]"),
        VOCAB.term("not_enlarged_abbrev", r"[oc]s[ln]"),
        VOCAB.term("false", """ false """),
        # One producer per pattern, registered in this exact order.  Patterns
        # that fill the "pos" group yield "enlarged"; the "neg" patterns leave
        # "pos" empty, so convert() reports "not enlarged" for them.
        *(
            VOCAB.producer(convert, pattern)
            for pattern in (
                """ (?P<pos> nipple enlarged ) """,
                """ (?P<pos> enlarged nipple ) """,
                """ (?P<pos> enlarged_abbrev ) """,
                """ (?P<neg> none nipple ) """,
                """ (?P<neg> nipple none ) """,
                """ (?P<neg> nipple not_enlarged ) """,
                """ (?P<neg> not_enlarged false? nipple ) """,
                """ (?P<neg> not_enlarged_abbrev ) """,
            )
        ),
    ],
)
"value_words", """ size mature coverage luteum color corpus other active destroyed alb visible developed cyst texture fallopian luteum """.split(), ), VOCAB.grouper( "values", """ ( value_words ( and | comma ) | non )? value_words """, ), VOCAB.producer( convert, """ side? ovaries side? ( word | number | comma ){0,5} (?P<value> values+ ) """, ), VOCAB.producer( convert, """ (?P<value> values+ ) ( word | number | comma ){0,5} ( (?<! comma ) side )? (?<! comma ) ovaries """, ), # Get left and right side measurements # E.g.: ovaries: R 2 c. alb, L sev c. alb VOCAB.producer( double, r""" ovaries
return trait LACTATION_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.part( "lactating", r""" ( lactating | lactation | lactated | lactate | lact | lactaing | lactacting | lactataing | lactational | oelact | celact | lactati | lactacting | lactatin | lactatting | lactatng | nursing | suckling ) \b """, ), VOCAB.term("lactating_abbrev", r"[oc][esm]l"), VOCAB.term("not_lactating_abbrev", r"[oc][esm]n"), VOCAB.term("post", r""" post | finished """), # Separates measurements VOCAB.part("separator", r' [;"/] '), VOCAB.producer(convert, """ (?P<pos> lactating ) """), VOCAB.producer(convert, """ (?P<pos> lactating_abbrev ) """), VOCAB.producer(convert, """ (?P<neg> (none | post) lactating ) """), VOCAB.producer(convert, """ (?P<neg> lactating (none | post) ) """), VOCAB.producer(convert, """ (?P<neg> not_lactating_abbrev ) """), ], )
VOCAB.term( "key", r""" forearm ( \s* ( length | len | l ) )? | fore? \s? [.]? \s? a | fa """, ), # Some patterns require a separator VOCAB.part("sep", r" [;,] | $ ", capture=False), VOCAB.grouper("noise", " word dash ".split()), # Handle fractional values like: forearm 9/16" VOCAB.producer( fraction, [ "key len_fraction units", # E.g.: forearm = 9/16 inches "key len_fraction", # E.g.: forearm = 9/16 ], ), # A typical hind-foot notation VOCAB.producer( simple, [ "key_with_units len_range", # E.g.: forearmLengthInMM=9-10 "key noise? len_range units ", # E.g.: forearmLength=9-10 mm "key noise? len_range", # Missing units like: forearm 9-10 "key dash number units?", "number key units?", ], ), VOCAB.producer(
# Separators VOCAB["word"], VOCAB["semicolon"], VOCAB["comma"], # Any key not preceding by "other_wt" is considered a weight key VOCAB.grouper( "wt_key", """ (?<! other_wt ) ( key_leader weight | key_leader mass | body weight | body mass | body | weight | mass | key_with_dots ) """, ), VOCAB.grouper("key", " wt_key ".split()), VOCAB.producer(compound, " key? compound_wt "), # Shorthand notation like: on tag: 11-22-33-44=99g VOCAB.producer( shorthand, [ "key shorthand", "shorthand", "key shorthand_bats", "shorthand_bats", ], ), VOCAB.producer(simple_mass, " wt_key mass_units number (?! len_units ) "), VOCAB.producer(simple_mass, " wt_key mass_range "), VOCAB.producer(simple_mass, " ( key | triple_key ) mass_range mass_units "),
VOCAB.grouper("value", " cross | number len_units? "), # E.g.: active, Or: immature VOCAB.grouper("state", "active mature destroyed visible developed".split()), # Male or female ambiguous, like: gonadLength1 VOCAB.grouper( "ambiguous", """ ambiguous_key dim_side | side ambiguous_key dimension | ambiguous_key dimension """, ), # These patterns contain measurements to both left & right ovaries # E.g.: reproductive data: ovaries left 10x5 mm, right 10x6 mm VOCAB.producer(double, """ label ovary side_cross """), # As above but without the ovaries marker: # E.g.: reproductive data: left 10x5 mm, right 10x6 mm VOCAB.producer(double, """label side_cross"""), # Has the ovaries marker but is lacking the label # E.g.: ovaries left 10x5 mm, right 10x6 mm VOCAB.producer(double, """ ovary side_cross """), # A typical testes size notation # E.g.: reproductive data: ovaries 10x5 mm VOCAB.producer(convert, " label ovary value "), # E.g.: reproductive data: left ovaries 10x5 mm VOCAB.producer(convert, " label side ovary value "), # E.g.: left ovaries 10x5 mm VOCAB.producer(convert, " side ovary value "), # May have a few words between the label and the measurement VOCAB.producer(
r"""( tragus \s* ) \s* ( length | len ) \s* in \s* (?P<units> millimeters | mm ) """, ), # Standard keywords that indicate a tragus length follows VOCAB.term( "key", r""" ( tragus | trag | tragi ) \s* (length | len | l )? | tr """, ), # Some patterns require a separator VOCAB.part("sep", r" [;,] | $ ", capture=False), VOCAB.grouper("noise", " word dash ".split()), # Handle fractional values like: tragus 9/16" VOCAB.producer( fraction, [ "key len_fraction units", # E.g.: tragus = 9/16 inches "key len_fraction", # E.g.: tragus = 9/16 ], ), # A typical hind-foot notation VOCAB.producer( simple, [ "key_with_units len_range", # E.g.: tragusLengthInMM=9-10 "key noise? len_range units ", # E.g.: tragusLengthInMM=9-10 mm "key noise? len_range", # Missing units: tragusLengthInMM 9-10 "key dash? number units?", ], ), VOCAB.producer( partial(shorthand_length, measurement="shorthand_tr"),
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

# Module vocabulary, seeded from the shared patterns vocabulary.
VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Turn a matched token into a pregnancy-state trait.

    The value is "pregnant" only when the token's ``pos`` group is present and
    truthy; every other match is reported as "not pregnant".  The trait's
    start and end are copied from the token.
    """
    state = "not pregnant"
    if token.group.get("pos"):
        state = "pregnant"
    return Trait(value=state, start=token.start, end=token.end)


PREGNANCY_STATE = Base(
    # The parser is named after the last component of the dotted module name.
    name=__name__.rpartition(".")[2],
    rules=[
        # Spellings (including truncated and misspelled ones) of "pregnant".
        VOCAB.term(
            "pregnant",
            r""" prega?n?ant pregnan preg pregnancy pregnancies gravid """.split(),
        ),
        VOCAB.part("separator", r' [;,"] '),
        # The negated forms are registered ahead of the bare positive form;
        # that registration order is kept exactly as it was.
        *(
            VOCAB.producer(convert, pattern)
            for pattern in (
                """ (?P<neg> pregnant none) """,
                """ (?P<neg> none pregnant ) """,
                """ (?P<pos> pregnant ) """,
            )
        ),
    ],
)
start=token.start, end=token.end, ) return trait SCROTAL_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()), VOCAB.term("scrotal_abbrev_pos", "sc".split()), VOCAB.term("scrotal_abbrev_neg", "ns ".split()), # If possible exclude length. Ex: reproductive data=testes: 11x7 mm VOCAB.grouper("length", "cross len_units?"), VOCAB.producer(convert, """ (?P<pos> scrotal_pos ) """), VOCAB.producer( convert, """ (?P<pos> (testes | testes_abbrev | label) scrotal_abbrev_pos ) """ ), VOCAB.producer( convert, """ (?P<pos> scrotal_abbrev_pos (testes | testes_abbrev) ) """), VOCAB.producer(convert, """ (?P<neg> scrotal_neg ) """), VOCAB.producer(convert, """ (?P<neg> scrotal_pos none ) """), VOCAB.producer(convert, """ (?P<neg> none scrotal_pos ) """), VOCAB.producer( convert, """ (?P<neg> (testes | testes_abbrev | label) scrotal_abbrev_neg ) """ ), VOCAB.producer(
), VOCAB.term("joiner", r""" of were """.split()), VOCAB.term( "recent", r""" recently recent was previously prev """.split(), ), VOCAB.term( "probably", r""" probably prob possibly possible appears? very visible visibly evidence evident """.split(), ), VOCAB.term("stage", r" early late mid ".split()), VOCAB.part("separator", r' [;,"] '), # E.g.: pregnancy visible VOCAB.producer( convert, """ (?P<value> pregnant joiner? none? probably quest? ) """ ), # E.g.: Probably early pregnancy VOCAB.producer( convert, """ (?P<value> none? (recent | probably)? stage? (none | joiner)? pregnant quest? ) """, ), ], )
), # Separates measurements VOCAB.part("separator", r' [;"?/,] '), # Skip arbitrary words VOCAB["word"], VOCAB.grouper( "state_end", """ ( size | fully | partially | other | lactation | color | false | visible | tissue | present | active | developed ) """, ), VOCAB.grouper("state_mid", """ ( uterus | and ) """), VOCAB.producer( convert, """(?P<value> non? (state_end | much) (state_mid | state_end){0,2} nipple) """, ), VOCAB.producer( convert, """(?P<value> non? nipple (state_end | much) (state_mid | state_end){0,2} ) """, ), VOCAB.producer( convert, """(?P<value> nipple non? (state_end | much) (state_mid | state_end){0,2} ) """, ), ],
# Compound words separated by dashes or slashes # E.g. adult/juvenile or over-winter VOCAB.part("joiner", r" \s* [/-] \s* "), # Use this to find the end of a life stage pattern VOCAB.part("separator", r' [;,"?] | $ '), # For life stages with numbers as words in them VOCAB["ordinals"], VOCAB["time_units"], VOCAB.part("after", "after"), VOCAB.part("hatching", "hatching"), # Match any word VOCAB.part("word", r" \b \w [\w?.-]* (?! [./-] ) "), VOCAB.grouper("as_time", " after? (ordinals | hatching) time_units"), # E.g.: life stage juvenile/yearling VOCAB.producer( convert, "json_key (?P<value> ( intrinsic | word ) joiner intrinsic )"), # E.g.: life stage young adult VOCAB.producer(convert, "json_key (?P<value> ( intrinsic | word ) intrinsic )"), # E.g.: life stage yearling VOCAB.producer(convert, "json_key (?P<value> intrinsic )"), # A sequence of words bracketed by a keyword and a separator # E.g.: LifeStage Remarks: 5-6 wks; VOCAB.producer( convert, """ json_key (?P<value> ( intrinsic | word | joiner ){1,5} ) separator """, ), # E.g.: LifeStage = 1st month VOCAB.producer(convert, "json_key (?P<value> as_time )"),
VOCAB.part("bang", r" [!] "), VOCAB.grouper( "count", """ none (word | plac_scar) conj | integer | none | num_words | bang """, ), VOCAB.grouper("present", " found | near_term "), VOCAB.grouper("numeric", " integer | real "), VOCAB.grouper("skip_len", " ( x? numeric metric_len ) | (x numeric metric_len?) "), VOCAB.grouper("skip_words", " word | numeric | metric_len | eq "), VOCAB.grouper("side_link", " x | conj | word "), VOCAB.grouper("between", "side_link? | skip_words{,4}"), VOCAB.producer( convert, """ embryo eq? (?P<total> count ) skip_len? (?P<sub> side ) (?P<subcount> count ) between (?P<sub> side ) (?P<subcount> count ) """, ), VOCAB.producer( convert, """ embryo eq? (?P<sub> side ) (?P<subcount> count ) between embryo? (?P<sub> side ) (?P<subcount> count ) embryo? """, ), VOCAB.producer( convert, """ embryo eq? (?P<total> count ) skip_words{,4} (?P<subcount> count ) (?P<sub> side ) between (?P<subcount> count ) (?P<sub> side ) """,
VOCAB["semicolon"], VOCAB["comma"], VOCAB.grouper( "key", """ ( key_with_units | len_key | ambiguous | char_key ) ( eq | dash )? """, ), VOCAB.grouper( "value", """ len_range | number (?P<units> len_units )? (?! mass_units ) """, ), VOCAB.grouper( "value_units", """ len_range | number (?P<units> len_units ) """, ), # E.g.: 10 to 11 inches TL VOCAB.producer(simple, "value (?P<units> len_units ) key"), VOCAB.producer(simple, """ key value key? """), VOCAB.producer(simple, """ key (?P<units> len_units ) value """), VOCAB.producer( simple, """ key_units_req ( value_units | triple_key ) """, ), # E.g.: total length 4 feet 7 inches VOCAB.producer(compound, " key? compound_len "), # Handle fractional values like: total length 9/16" # E.g.: total = 9/16 inches VOCAB.producer(fraction, "key_units_req len_fraction (?P<units> len_units )"), # E.g.: svl 9/16 inches VOCAB.producer(fraction, "key len_fraction (?P<units> len_units )"), # E.g.: len 9/16 in
(?<! collector [\s=:.] ) (?<! reg [\s=:.] ) ( ( crown | cr ) ( [_\s\-] | \s+ to \s+ )? rump | (?<! [a-z] ) crl (?! [a-z] ) | (?<! [a-z] ) c \.? r \.? (?! [a-z] ) )""", ), VOCAB.part("len", r" (length | len) (?! [a-z] ) "), VOCAB.part("other", r" \( \s* \d+ \s* \w+ \s* \) "), VOCAB.part("separator", r' [;"/.] '), VOCAB.grouper("value", """ cross | number len_units? (?! sex ) """), VOCAB.grouper("key", """ embryo_len_key len? ( eq | colon )? """), VOCAB.grouper( "count", """ number side number side eq? | number plus number ( eq number )? """, ), VOCAB.grouper("skip", " prep word cross | other | side "), VOCAB.producer(convert, """ embryo? key value quest? """), VOCAB.producer(convert, """ embryo? x? value key quest? """), VOCAB.producer(convert_many, """ embryo count? value{2,} (?! skip ) quest? """), VOCAB.producer(convert, """ embryo? key x? value quest? """), VOCAB.producer(convert, """ embryo? x? value key quest? """), VOCAB.producer(convert, """ embryo x? value (?! skip ) quest? """), VOCAB.producer(isolate, """ embryo colon? count? value len_units quest? """), ], )
[[:alpha:][:digit:]\-]+ (?! [.] )""", priority=LOWEST), VOCAB.grouper('collector', """ ( (name_part | initial) )+ ( name_part | part | initial )* """, capture=False), VOCAB.grouper('joiner', ' ( conj | comma | with ){1,2} '), # With a label VOCAB.producer( convert, """ (?<= ^ | eol ) (?<! other_label comma? name_part? ) (?<! part | col_no ) noise? col_label comma? noise? (?P<col_name> collector ( joiner collector )* ( comma name_part )? ) noise? ( eol* ( (no_label? comma? (?P<collector_no> col_no ) | no_label comma? (?P<collector_no> ( part | col_no ){1,2} ) ) ) )? """), # Without a label VOCAB.producer( convert, """ (?<= ^ | eol ) (?<! other_label noise? name_part? ) (?<! part | col_no ) noise? col_label? comma? noise? (?P<col_name> initial? name_part+ ( joiner collector )* ) ( eol* ( (no_label? comma? (?P<collector_no> col_no ) | no_label comma?
rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers VOCAB.term("id", r" \d+-\d+ "), VOCAB.term("adj", r""" inguinal ing pectoral pec pr """.split()), VOCAB.part("number", r" number | no | [#] "), VOCAB.part("eq", r" is | eq | equals? | [=] "), # Skip arbitrary words VOCAB["word"], VOCAB["sep"], VOCAB.grouper("count", " (?: integer | none )(?! side ) "), VOCAB.grouper("modifier", "adj visible".split()), VOCAB.grouper("skip", " number eq? integer "), VOCAB.producer( typed, """ (?P<notation> (?P<value1> count) modifier (?P<value2> count) modifier ) nipple """, ), # Eg: 1:2 = 6 mammae VOCAB.producer( convert, """ nipple op? (?P<notation> count modifier? op? count modifier? (eq (?P<value> count))? ) """, ), # Eg: 1:2 = 6 mammae VOCAB.producer( convert,
VOCAB.term( "key", [ r"hind \s* foot \s* with \s* (?P<includes> claw )", r"hind \s* foot ( \s* ( length | len ) )?", "hfl | hf", ], ), # Some patterns require a separator VOCAB.part("sep", r" [;,] | $ ", capture=False), VOCAB.grouper("noise", " word dash ".split()), # Handle fractional values like: hindFoot 9/16" VOCAB.producer( fraction, [ "key len_fraction units", # E.g.: hindFoot = 9/16 inches "key len_fraction", # E.g.: hindFoot = 9/16 ], ), # A typical hind-foot notation VOCAB.producer( simple, [ "key_with_units len_range", # E.g.: hindFootLengthInMM=9-10 "key noise? len_range units ", # E.g.: hindFootLength=9-10 mm "key noise? len_range", # Missing units like: hindFootLength 9-10 "key dash number units", ], ), VOCAB.producer( partial(shorthand_length, measurement="shorthand_hfl"),
"adj", r""" faint prominent recent old possible """.split(), ), # Skip arbitrary words VOCAB["word"], VOCAB.part("sep", r" [;/] "), VOCAB.grouper( "count", """ none embryo conj | none visible | integer | none """, ), VOCAB.producer( convert_count, """(?P<count1> count ) op (?P<count2> count ) ( eq (?P<value> count ) )? plac_scar """, ), VOCAB.producer( convert_count, """plac_scar op? (?P<count1> count ) prep? (?P<side1> side ) ( (?P<count2> count ) prep? (?P<side2> side ) )? """, ), VOCAB.producer( convert_count, """ (?P<count1> count ) prep? (?P<side1> side ) plac_scar ( (?P<count2> count ) prep? (?P<side2> side ) (plac_scar)? )? """,
VOCAB.term( "keyword", [ r" ear \s* from \s* (?P<measured_from1> notch | crown )", r" ear \s* ( length | len )", r" ear (?! \s* tag )", r" ef (?P<measured_from2> n | c ) [-]?", ], ), # Some patterns require a separator VOCAB["word"], VOCAB.part("sep", " [;,] "), # Consider any of the following as just a key VOCAB.grouper("key", "keyword char_key char_measured_from".split()), # Handle fractional values like: ear 9/16" VOCAB.producer(fraction, "key len_fraction (?P<units> len_units )?"), # E.g.: earLengthInMM 9-10 VOCAB.producer(simple_len, "(?P<key> key_with_units ) len_range"), # E.g.: ear 9-10 mm VOCAB.producer(simple_len, "key len_range (?P<units> len_units )?"), # Shorthand notation like: on tag: 11-22-33-44=99g VOCAB.producer( partial(shorthand_length, measurement="shorthand_el"), [ "shorthand", "shorthand_bats", ], ), ], )
"state", ["""(non | partially | fully )? descended """] + """ scrotal abdominal size other """.split(), ), # Male or female ambiguous, like: gonadLength1 VOCAB.grouper( "ambiguous", """ ambiguous_key dim_side | side ambiguous_key dimension | ambiguous_key dimension """, ), # These patterns contain measurements to both left & right testes # E.g.: reproductive data: tests left 10x5 mm, right 10x6 mm VOCAB.producer(double, """label ( testes | abbrev | char_key ) side_cross """), # As above but without the testes marker: # E.g.: reproductive data: left 10x5 mm, right 10x6 mm VOCAB.producer(double, """ label side_cross """), # Has the testes marker but is lacking the label # E.g.: testes left 10x5 mm, right 10x6 mm VOCAB.producer( double, """ ( testes | abbrev | char_key ) side_cross """, ), # E.g.: reproductive data: left 10x5 mm VOCAB.producer( double, """
if trait: trait.value = str(trait.value[:-2]) + '??' return trait LABEL_DATE = Base( name=__name__.split('.')[-1], rules=[ VOCAB['eol'], VOCAB['uuid'], # Get rid of these before they're a problem VOCAB.term('label', ' date '.split()), VOCAB.part('digits', r'(?<! \d ) ( [12]\d{3} | \d{1,2} ) (?! \d )'), VOCAB.part('sep', r' [/_-]+ ', capture=False), VOCAB.part('noise', r""" \w+ """, priority=LOWEST, capture=False), VOCAB.producer( convert, """ label? (?P<value> digits sep? month_name sep? digits ) """), VOCAB.producer( convert, """ label? (?P<value> month_name sep? digits sep? digits ) """), VOCAB.producer( convert, """ label? (?P<value> digits sep digits sep digits ) """), VOCAB.producer( short_date_digits, f""" label? (?P<value> digits sep digits ) """), VOCAB.producer( short_date_name, f""" label? (?P<value> month_name sep? digits ) """), VOCAB.producer( short_date_name, f"""
# Module vocabulary, seeded from the shared patterns vocabulary.
VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Build a scrotal-state trait from a matched token.

    The trait value is the lower-cased text of the token's ``value`` group and
    its start/end are copied from the token.  The trait is then handed the
    token together with the "ambiguous_key" flag name (presumably so the trait
    records whether that flag was part of the match -- confirm in Trait).
    """
    matched = token.group["value"]
    trait = Trait(value=matched.lower(), start=token.start, end=token.end)
    trait.is_flag_in_token(token, "ambiguous_key")
    return trait


SCROTAL_STATE = Base(
    # The parser is named after the last component of the dotted module name.
    name=__name__.rpartition(".")[2],
    rules=[
        VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()),
        VOCAB.term("scrotal_abbrev", "ns sc".split()),
        # If possible exclude length. Ex: reproductive data=testes: 11x7 mm
        VOCAB.grouper("length", "cross len_units?"),
        # Testes (word or abbreviation), an optional negation, then the
        # scrotal word or abbreviation.
        VOCAB.producer(
            convert,
            """ (?P<value> ( testes | testes_abbrev ) non? ( scrotal | scrotal_abbrev ) ) """,
        ),
        # The scrotal word alone, optionally negated.
        VOCAB.producer(convert, """ (?P<value> non? scrotal ) """),
        # A bare abbreviation is only accepted directly after a label.
        VOCAB.producer(convert, """ label (?P<value> scrotal_abbrev ) """),
    ],
)
LACTATION_STATE = Base(
    # The parser is named after the last component of the dotted module name.
    name=__name__.rpartition(".")[2],
    rules=[
        # Spellings (and common misspellings) of lactating/nursing words; the
        # trailing \b requires the alternative to end at a word boundary.
        # NOTE(review): "lactacting" is listed twice in this alternation.
        VOCAB.part(
            "lactating",
            r""" ( lactating | lactation | lactated | lactate | lact | lactaing"""
            r""" | lactacting | lactataing | lactational | oelact | celact | lactati"""
            r""" | lactacting | lactatin | lactatting | lactatng | nursing | suckling ) \b """,
        ),
        # Negation words.
        VOCAB.part("not", r" \b ( not | non | no ) "),
        # Words placed before the lactating term, e.g. "just finished".
        # NOTE(review): "pre" is an alternative here and is also its own rule
        # just below -- confirm the overlap is intended.
        VOCAB.part(
            "post",
            r""" \b ( (( just | recently ) \s+ )? finished"""
            r""" | post | recently | recent | had | pre ) """,
        ),
        VOCAB.part("pre", r" \b pre [\s\-]? "),
        # Separates measurements
        VOCAB.part("separator", r' [;"/] '),
        VOCAB["word"],
        # Any one of the three prefix kinds defined above.
        VOCAB.grouper("prefix", "not post pre".split()),
        # Optional prefix, the lactating word, optional question mark token.
        VOCAB.producer(convert, """ (?P<value> prefix? lactating quest? ) """),
    ],
)
import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base, convert

# Module vocabulary, seeded from the shared patterns vocabulary.
VOCAB = Vocabulary(patterns.VOCAB)

SEX = Base(
    # The parser is named after the last component of the dotted module name.
    name=__name__.rpartition(".")[2],
    rules=[
        # The key that introduces a sex value (e.g. a JSON key)
        VOCAB.term("sex_key", "sex"),
        # The recognized sex words, singular or plural
        VOCAB.term("sex_vocab", "females? males?".split()),
        # Words whose presence signals that "sex" is not acting as a key
        VOCAB.term("not_sex", "and is was".split()),
        # A free-form word: starts with a letter and runs until a separator,
        # quote, "=", ":" or whitespace character
        VOCAB.part("word", r' \b [a-z] [^;,"=:\s]* '),
        # What may terminate a value: a separator character or end of input
        VOCAB.part("separator", ' [;,"] | $ '),
        # Producers, registered in this exact order:
        #   1. key + one or two words + terminator   E.g.: sex might be female;
        #   2. key + a single word                   E.g.: sex=female?, Or: sex=unknown
        #   3. a bare sex word                       E.g.: male, Or: male?
        *(
            VOCAB.producer(convert, pattern)
            for pattern in (
                """ sex_key (?P<value> ( sex_vocab | word ){1,2} quest? ) separator """,
                " sex_key (?P<value> ( sex_vocab | word ) quest? ) ",
                " (?P<value> sex_vocab quest? ) ",
            )
        ),
    ],
)
from digi_leap.pylib.trait import Trait

# CSV files that supply the family and genus names; both are read below via
# their 'complete_name' column.
PLANT_FAMILIES = const.DATA_DIR / 'itis_plant_families.csv'
PLANT_GENERA = const.DATA_DIR / 'itis_plant_genera.csv'

VOCAB = Vocabulary(patterns.VOCAB)
# Catch-all rule: any run of non-whitespace characters, registered at the
# lowest priority and not captured.
VOCAB.part('word', r' \S+ ', capture=False, priority=LOWEST)

# Every family name in the CSV becomes a 'plant_family' term.  The file is
# read as strings with NA filtering off, so no cell is turned into NaN.
DATA = pd.read_csv(PLANT_FAMILIES, na_filter=False, dtype=str)
VOCAB.term('plant_family', DATA['complete_name'].tolist())

# Same for genus names.  DATA is deliberately rebound (as in the original), so
# after import it holds the genera table.
DATA = pd.read_csv(PLANT_GENERA, na_filter=False, dtype=str)
VOCAB.term('plant_genus', DATA['complete_name'].tolist())


def convert(token):
    """Normalize a parsed taxon notation.

    Returns a Trait spanning the token's start/end whose value is the text of
    the token's ``value`` group.
    """
    return Trait(start=token.start, end=token.end, value=token.group['value'])


# The patterns below were f-strings without any placeholder; the needless
# ``f`` prefix is dropped.  The string values are unchanged.

# A genus name followed by one or more arbitrary words.
PLANT_TAXON = Base(
    name='plant_taxon',
    rules=[
        VOCAB['eol'],
        VOCAB.producer(convert, ' (?P<value> plant_genus word+ ) '),
    ])

# A family name on its own.
PLANT_FAMILY = Base(
    name='plant_family',
    rules=[VOCAB.producer(convert, ' (?P<value> plant_family ) ')])
r""" \b (?P<ambiguous_key> t ) (?! [a-z] ) (?! _ \D ) """, ), # Standard keywords that indicate a tail length follows VOCAB.term("keyword", [r" tail \s* length ", r" tail \s* len ", "tail", "tal"]), # Some patterns require a separator VOCAB.part("sep", r" [;,] | $ ", capture=False), # Consider all of these tokens a key VOCAB.grouper("key", "keyword char_key".split()), # Handle fractional values like: tailLength 9/16" VOCAB.producer( fraction, [ # E.g.: tail = 9/16 in "key len_fraction (?P<units> len_units )", "key len_fraction", # Without units, like: tail = 9/16 ], ), VOCAB.producer( simple, [ "key_with_units len_range", # E.g.: tailLengthInMM=9-10 "key len_range (?P<units> len_units )", # E.g.: tailLength=9-10 mm "key len_range", # Missing units like: tailLength 9-10 ], ), VOCAB.producer( partial(shorthand_length, measurement="shorthand_tal"), [ "shorthand",
if token.group.get('us_county'): trait.us_county = token.group['us_county'].title() if token.group.get('us_state'): trait.us_state = us_states.normalize_state(token.group['us_state']) return trait ADMIN_UNIT = Base( name='us_county', rules=[ VOCAB['eol'], VOCAB.term('skip', r""" of the """.split()), VOCAB.term('co_label', r""" co | coun[tc]y """, capture=False), VOCAB.term('st_label', r""" ( plants | flora ) \s* of """, capture=False), VOCAB.term('other', r"""alluvial flood river plain """.split()), VOCAB.part('nope', r""" [(] """), VOCAB['word'], VOCAB.producer(convert, ' us_state? eol? co_label comma? us_county '), VOCAB.producer(convert, ' us_county co_label comma? us_state? '), VOCAB.producer(convert, ' us_county comma? us_state '), VOCAB.producer(convert, """ st_label us_state eol? co_label us_county """), VOCAB.producer(convert, ' st_label eol? us_state '), VOCAB.producer(convert, ' (?<! skip ) us_state (?! other | nope ) '), ])