def _battery(self):
    """Assemble and return the ParserBattery holding the sex patterns.

    The battery is created with an exclude pattern for the bare words
    "and", "was" and "is", then the named patterns below are appended in
    the order listed.
    """
    parsers = ParserBattery(exclude_pattern=r' ^ (?: and | was | is ) $ ')

    # Each entry is (pattern name, pattern text, keyword options for append).
    # Adjacent raw literals are joined by the compiler, so a pattern's text is
    # exactly its pieces concatenated.
    specs = (
        # A key and value that is terminated with a delimiter
        (
            'sex_key_value_delimited',
            r' \b (?P<key> sex)'
            r' \W+'
            r' (?P<value> [\w?.]+ (?: \s+ [\w?.]+ ){0,2} )'
            r' \s* (?: [:;,"] | $ ) ',
            {},
        ),
        # A key and value without a clear delimiter
        (
            'sex_key_value_undelimited',
            r' \b (?P<key> sex)'
            r' \W+'
            r' (?P<value> \w+ ) ',
            {},
        ),
        # The words male & female with no key at all
        # NOTE(review): if these patterns are compiled in verbose mode (the
        # spacing suggests so), the final \b lets the optional "?" be kept
        # only when a word character follows it -- confirm that is intended.
        (
            'sex_unkeyed',
            r' \b (?P<value> (?: males? | females? ) (?: \s* \? )? ) \b ',
            {'want_array': 2},
        ),
    )

    for name, pattern, options in specs:
        parsers.append(name, pattern, **options)

    return parsers
def _battery(self, common_patterns):
    """Assemble and return the ParserBattery holding the life stage patterns.

    ``common_patterns`` is prepended (with ``+``) to every pattern body
    before it is appended; presumably it defines the ``(?&name)`` groups
    the bodies refer to -- confirm against the caller.  The battery is
    created with an exclude pattern for values that start with "determin",
    and patterns are appended in the order listed.
    """
    parsers = ParserBattery(exclude_pattern=r' ^ determin ')

    # Each entry is (pattern name, pattern body, keyword options for append).
    # Adjacent raw literals are joined by the compiler, so a body's text is
    # exactly its pieces concatenated.
    specs = (
        # A key and value that is terminated with a delimiter
        (
            'life_stage_key_value_delimited',
            r' \b (?P<key> (?: life \s* stage (?: \s* remarks )?'
            r' | age (?: \s* class )? ) )'
            r' \W+'
            r' (?P<value> (?&word_chars) (?: \s+(?&word_chars) ){0,4} )'
            r' \s* (?: [:;,"] | $ ) ',
            {},
        ),
        # A key and value without a clear delimiter
        (
            'life_stage_key_value_undelimited',
            r' \b (?P<key> life \s* stage (?: \s* remarks )?'
            r' | age \s* class'
            r' | age \s* in \s* (?: hour | day ) s?'
            r' | age )'
            r' \W+'
            r' (?P<value> [\w?.\/\-]+ (?: \s+ (?: year | recorded ) )? ) ',
            {},
        ),
        # Common life stage phrases with no key
        (
            'life_stage_no_keyword',
            r' (?P<value> (?: after \s+ )?'
            r' (?: first | second | third | fourth | hatching ) \s+ year ) ',
            {},
        ),
    )

    for name, body, options in specs:
        parsers.append(name, common_patterns + body, **options)

    return parsers
def _battery(self, common_patterns):
    """Assemble and return the ParserBattery holding the length patterns.

    ``common_patterns`` is prepended (with ``+``) to every pattern body
    before it is appended; presumably it defines the ``(?&name)`` groups
    the bodies refer to -- confirm against the caller.  The battery is
    created with ``parse_units=True`` and a ``units_from_key`` pattern
    for keys ending in "mm" or "millimeters".  Patterns are appended in
    the order listed.
    """
    parsers = ParserBattery(
        parse_units=True,
        units_from_key=r' (?P<units> mm | millimeters ) $ ',
    )

    # Each entry is (pattern name, pattern body, keyword options for append).
    # Adjacent literals are joined by the compiler, so a body's text is
    # exactly its pieces concatenated.
    specs = (
        # A pattern like: total length: 4 ft 8 in
        # NOTE(review): the sibling body-mass battery passes compound_value=2
        # for its analogous pattern while this one passes True -- confirm
        # which form ParserBattery expects.
        (
            'en_len',
            r' \b (?P<key> (?&all_len_keys))? (?&key_end)?'
            r' (?P<value1> (?&range)) \s* (?P<units1> (?&len_foot))'
            r' \s* (?P<value2> (?&range)) \s* (?P<units2> (?&len_inch)) ',
            {'default_key': '_english_', 'compound_value': True},
        ),
        # Total key, number (not a range) and optional units
        # Like: total length = 10.5 mm
        (
            'total_len_key_num',
            r' \b (?P<key> (?&total_len_key)) (?&key_end)'
            r' (?P<value> (?&number)) (?! [\d\-\.] )'
            r' \s* (?P<units> (?&len_units))? ',
            {'default_units': '_mm_'},
        ),
        # Secondary length keys come next, but a range is allowed
        (
            'other_len_key',
            r' \b (?P<key> (?&other_len_key)) (?&key_end)'
            r' (?P<value> (?&range)) \s* (?P<units> (?&len_units))? ',
            {},
        ),
        # Keys where the units are required
        (
            'key_units_req',
            r' \b (?P<key> (?&key_units_req)) (?&key_end)'
            r' (?P<value> (?&range)) \s* (?P<units> (?&len_units)) ',
            {},
        ),
        # A length in a phrase
        (
            'len_in_phrase',
            r' \b (?P<key> (?&len_in_phrase)) \D{1,32}'
            r' (?P<value> (?&range)) \s* (?P<units> (?&len_units))? ',
            {},
        ),
        # Ambiguous keys that are disambiguated by a suffix
        # NOTE(review): this name is registered again by the next entry --
        # confirm ParserBattery tolerates duplicate names.
        (
            'len_key_ambiguous_units',
            r' (?&no_word) (?&len_key_ambiguous) (?&key_end)'
            r' (?P<value> (?&range)) \s* (?P<units> (?&len_units))? '
            '\n'  # line break present in the original pattern text
            r'\s* (?P<key> (?&len_key_suffix)) ',
            {},
        ),
        # Keys that require units to disambiguate what is being measured
        (
            'len_key_ambiguous_units',
            r' (?&no_word) (?P<key> (?&len_key_ambiguous)) (?&key_end)'
            r' (?P<value> (?&range)) \s* (?P<units> (?&len_units)) ',
            {},
        ),
        # An out of order parse: tol (mm) 20-25
        (
            'len_key_abbrev',
            r' \b (?P<key> (?&len_key_abbrev))'
            r' \s* (?&open) \s* (?P<units> (?&len_units)) \s* (?&close)'
            r' \s* (?P<value> (?&range)) ',
            {},
        ),
        # The key comes at the end: 20-25 mm TL
        (
            'len_key_suffix',
            r' \b (?P<value> (?&range)) \s* (?P<units> (?&len_units))?'
            r' \s* (?P<key> (?&len_key_suffix)) ',
            {},
        ),
        # Length is in shorthand notation
        (
            'len_shorthand',
            r' \b (?: (?P<key> (?&all_len_keys)) (?&key_end) )?'
            r' (?P<value> (?&number)) (?&len_shorthand) ',
            {'default_units': '_mm_', 'default_key': '_shorthand_'},
        ),
        # A shorthand notation with some abbreviations in it
        (
            'len_shorthand_euro',
            r' \b (?: (?P<key> (?&all_len_keys)) (?&key_end) )?'
            r' [a-z]* (?P<value> (?&number)) (?&len_shorthand_euro) ',
            {'default_units': '_mm_', 'default_key': '_shorthand_'},
        ),
        # Total length with a RANGE and optional units; the number-only
        # form is 'total_len_key_num' earlier in this table
        (
            'total_len_key',
            r' \b (?P<key> (?&total_len_key)) (?&key_end)'
            r' (?P<value> (?&range)) \s* (?P<units> (?&len_units))? ',
            {'default_units': '_mm_'},
        ),
        # An ambiguous key is allowed if it is not preceded by another word
        (
            'len_key_ambiguous',
            r' (?&no_word) (?P<key> (?&len_key_ambiguous)) (?&key_end)'
            r' (?P<value> (?&range)) ',
            {},
        ),
        # Snout-vent length keys
        (
            'svl_len_key',
            r' \b (?P<key> (?&svl_len_key)) (?&key_end)'
            r' (?P<value> (?&range)) \s* (?P<units> (?&len_units))? ',
            {},
        ),
    )

    for name, body, options in specs:
        parsers.append(name, common_patterns + body, **options)

    return parsers
def _battery(self, common_patterns):
    """Assemble and return the ParserBattery holding the body mass patterns.

    ``common_patterns`` is prepended (with ``+``) to every pattern body
    before it is appended; presumably it defines the ``(?&name)`` groups
    the bodies refer to -- confirm against the caller.  The battery is
    created with ``parse_units=True`` and a ``units_from_key`` pattern
    for keys ending in "grams".  Patterns are appended in the order
    listed.
    """
    parsers = ParserBattery(
        parse_units=True,
        units_from_key=r' (?P<units> grams ) $ ',
    )

    # Each entry is (pattern name, pattern body, keyword options for append).
    # Adjacent literals are joined by the compiler, so a body's text is
    # exactly its pieces concatenated.
    specs = (
        # A pattern like: body mass: 4 lbs 8 oz
        # NOTE(review): the sibling length battery passes compound_value=True
        # for its analogous pattern while this one passes 2 -- confirm which
        # form ParserBattery expects.
        (
            'en_wt',
            r' \b (?P<key> (?&all_wt_keys))? (?&key_end)?'
            r' (?P<value1> (?&range)) \s* (?P<units1> (?&wt_pound))'
            r' \s* (?P<value2> (?&range)) \s* (?P<units2> (?&wt_ounce)) ',
            {'default_key': '_english_', 'compound_value': 2},
        ),
        # Body mass with a total weight key and optional units
        (
            'total_wt_key',
            r' \b (?P<key> (?&total_wt_key)) (?&key_end)'
            r' (?P<value> (?&range)) \s* (?P<units> (?&wt_units))? ',
            {},
        ),
        # Secondary body mass keys come next
        (
            'other_wt_key',
            r' \b (?P<key> (?&other_wt_key)) (?&key_end)'
            r' (?P<value> (?&range)) \s* (?P<units> (?&wt_units))? ',
            {},
        ),
        # Keys where the units are required
        (
            'key_units_req',
            r' \b (?P<key> (?&key_units_req)) (?&key_end)'
            r' (?P<value> (?&range)) \s* (?P<units> (?&wt_units)) ',
            {},
        ),
        # The body mass in a phrase
        (
            'wt_in_phrase',
            r' \b (?P<key> (?&wt_in_phrase)) \D{1,32}'
            r' (?P<value> (?&range)) \s* (?P<units> (?&wt_units))? ',
            {},
        ),
        # An out of order parse: body mass (g) 20-25
        (
            'wt_key_word',
            r' \b (?P<key> (?&wt_key_word))'
            r' \s* (?&open) \s* (?P<units> (?&wt_units)) \s* (?&close)'
            r' \s* (?P<value> (?&range)) ',
            {},
        ),
        # Keys that require units to disambiguate what is being measured
        (
            'wt_key_word_req',
            r' (?P<key> (?&wt_key_word)) (?&key_end)'
            r' (?P<value> (?&range)) \s* (?P<units> (?&wt_units)) ',
            {},
        ),
        # Body mass is in shorthand notation
        (
            'wt_shorthand',
            r' \b (?: (?P<key> (?&all_wt_keys)) (?&key_end) )?'
            r' (?&wt_shorthand) \s* (?P<value> (?&number))'
            r' \s* (?P<units> (?&wt_units))? '
            '\n',  # line break present in the original pattern text
            {'default_key': '_shorthand_'},
        ),
        # Body mass is in shorthand notation (units required)
        (
            'wt_shorthand_req',
            r' \b (?: (?P<key> (?&all_wt_keys)) (?&key_end) )?'
            r' (?&wt_shorthand_req) \s* (?P<value> (?&number))'
            r' \s* (?P<units> (?&wt_units)) ',
            {'default_key': '_shorthand_'},
        ),
        # A shorthand notation with some abbreviations in it
        (
            'wt_shorthand_euro',
            r' \b (?: (?P<key> (?&all_wt_keys)) (?&key_end) )?'
            r' (?&wt_shorthand_euro) \s* (?P<value> (?&number))'
            r' \s* (?P<units> (?&wt_units))? ',
            {'default_key': '_shorthand_'},
        ),
        # A notation using 'fa'; it can be shorter than the other shorthand
        # notations
        (
            'wt_fa',
            r' fa \d* - (?P<value> (?&number))'
            r' \s* (?P<units> (?&wt_units))? ',
            {'default_key': '_shorthand_'},
        ),
        # Finally the body mass key with a RANGE and optional units
        (
            'wt_key_ambiguous',
            r' (?P<key> (?&wt_key_word)) (?&key_end)'
            r' (?P<value> (?&range)) \s* (?P<units> (?&wt_units))? ',
            {},
        ),
    )

    for name, body, options in specs:
        parsers.append(name, common_patterns + body, **options)

    return parsers