Example #1
0
    def _battery(self):
        """Build the battery of regular expressions that look for a sex value.

        Three patterns are registered, in this order: a "sex" key whose value
        ends at a delimiter, a "sex" key followed by a single word, and the
        bare words male(s)/female(s) with no key at all.

        Returns:
            The populated ParserBattery.
        """
        # NOTE(review): exclude_pattern presumably rejects values that are only
        # "and", "was" or "is" -- confirm against ParserBattery.
        parsers = ParserBattery(exclude_pattern=r''' ^ (?: and | was | is ) $ ''')

        # Each entry is (name, pattern, extra keyword arguments for append).
        specs = [
            # Keyed value of one to three words that ends at a delimiter or
            # at the end of the text
            ('sex_key_value_delimited', r'''
                \b (?P<key> sex)
                \W+
                (?P<value> [\w?.]+ (?: \s+ [\w?.]+ ){0,2} )
                \s* (?: [:;,"] | $ )
            ''', {}),

            # Keyed value with no clear delimiter: take a single word
            ('sex_key_value_undelimited', r'''
                \b (?P<key> sex) \W+ (?P<value> \w+ )
            ''', {}),

            # No key: the words male(s) / female(s), optionally followed by "?"
            ('sex_unkeyed', r'''
                \b (?P<value> (?: males? | females? ) (?: \s* \? )? ) \b
            ''', {'want_array': 2}),
        ]

        for name, pattern, options in specs:
            parsers.append(name, pattern, **options)

        return parsers
    def _battery(self, common_patterns):
        """Build the battery of regular expressions that look for a life stage.

        Args:
            common_patterns: Pattern text prepended to every expression
                registered here. It presumably supplies the definitions
                behind subroutine calls such as (?&word_chars) -- confirm
                with the caller.

        Returns:
            The populated ParserBattery.
        """
        # NOTE(review): exclude_pattern presumably rejects values starting
        # with "determin" -- confirm against ParserBattery.
        parsers = ParserBattery(exclude_pattern=r''' ^ determin ''')

        # Each entry is (name, pattern, extra keyword arguments for append).
        specs = [
            # Keyed value of one to five words that ends at a delimiter or
            # at the end of the text
            ('life_stage_key_value_delimited', common_patterns + r'''
                \b (?P<key> (?: life \s* stage (?: \s* remarks )? | age (?: \s* class )? ) )
                \W+
                (?P<value> (?&word_chars) (?: \s+(?&word_chars) ){0,4} ) \s*
                (?: [:;,"] | $ )
            ''', {}),

            # Keyed value with no clear delimiter: one token, optionally
            # followed by "year" or "recorded"
            ('life_stage_key_value_undelimited', common_patterns + r'''
                \b (?P<key> life \s* stage (?: \s* remarks )?
                        | age \s* class
                        | age \s* in \s* (?: hour | day ) s?
                        | age
                    )
                    \W+
                    (?P<value> [\w?.\/\-]+ (?: \s+ (?: year | recorded ) )? )
            ''', {}),

            # No key: common phrases such as "first year" or "after hatching year"
            ('life_stage_no_keyword', common_patterns + r'''
                (?P<value> (?: after \s+ )?
                        (?: first | second | third | fourth | hatching ) \s+
                        year )
            ''', {}),
        ]

        for name, pattern, options in specs:
            parsers.append(name, pattern, **options)

        return parsers
    def _battery(self, common_patterns):
        """Build the battery of regular expressions that look for a length.

        The patterns are registered in the order listed below, starting with
        the most explicit forms (compound English units, a total length key
        with a single number) and ending with the more ambiguous keys.

        Args:
            common_patterns: Pattern text prepended to every expression
                registered here. It presumably supplies the definitions
                behind subroutine calls such as (?&range) and (?&len_units)
                -- confirm with the caller.

        Returns:
            The populated ParserBattery.
        """
        parsers = ParserBattery(
            parse_units=True,
            units_from_key=r''' (?P<units> mm | millimeters ) $ ''',
        )

        # Each entry is (name, pattern, extra keyword arguments for append).
        specs = [
            # Compound English measurement, like: total length: 4 ft 8 in
            # NOTE(review): compound_value is passed as a bool here; confirm
            # whether ParserBattery.append expects a flag or a count.
            ('en_len', common_patterns + r'''
                \b (?P<key> (?&all_len_keys))? (?&key_end)?
                   (?P<value1> (?&range))    \s*
                   (?P<units1> (?&len_foot)) \s*
                   (?P<value2> (?&range))    \s*
                   (?P<units2> (?&len_inch))
            ''', {'default_key': '_english_', 'compound_value': True}),

            # Total length key, a single number (not a range), optional units
            # Like: total length = 10.5 mm
            ('total_len_key_num', common_patterns + r'''
                \b (?P<key> (?&total_len_key)) (?&key_end)
                   (?P<value> (?&number)) (?! [\d\-\.] ) \s*
                   (?P<units> (?&len_units))?
            ''', {'default_units': '_mm_'}),

            # Secondary length keys come next, and these may carry a range
            ('other_len_key', common_patterns + r'''
                \b (?P<key>   (?&other_len_key)) (?&key_end)
                   (?P<value> (?&range)) \s*
                   (?P<units> (?&len_units))?
            ''', {}),

            # Keys that only count when units are present
            ('key_units_req', common_patterns + r'''
                \b (?P<key>   (?&key_units_req)) (?&key_end)
                   (?P<value> (?&range)) \s*
                   (?P<units> (?&len_units))
            ''', {}),

            # A length mentioned inside a phrase: up to 32 non-digit
            # characters may separate the key from the value
            ('len_in_phrase', common_patterns + r'''
                \b (?P<key>   (?&len_in_phrase)) \D{1,32}
                   (?P<value> (?&range)) \s*
                   (?P<units> (?&len_units))?
            ''', {}),

            # Ambiguous keys that are disambiguated by a trailing suffix
            # NOTE(review): this name is reused by the next entry; confirm
            # that ParserBattery does not require unique names.
            ('len_key_ambiguous_units', common_patterns + r'''
                (?&no_word) (?&len_key_ambiguous) (?&key_end)
                (?P<value>  (?&range)) \s*
                (?P<units>  (?&len_units))? \s*
                (?P<key>    (?&len_key_suffix))
            ''', {}),

            # Ambiguous keys that need units to tell what is being measured
            ('len_key_ambiguous_units', common_patterns + r'''
                (?&no_word)
                (?P<key>   (?&len_key_ambiguous)) (?&key_end)
                (?P<value> (?&range)) \s*
                (?P<units> (?&len_units))
            ''', {}),

            # Out of order, units before the value: tol (mm) 20-25
            ('len_key_abbrev', common_patterns + r'''
                \b (?P<key>      (?&len_key_abbrev)) \s*
                   (?&open)  \s* (?P<units> (?&len_units)) \s* (?&close) \s*
                   (?P<value>    (?&range))
            ''', {}),

            # Key placed at the end: 20-25 mm TL
            ('len_key_suffix', common_patterns + r'''
                \b (?P<value> (?&range)) \s*
                   (?P<units> (?&len_units))? \s*
                   (?P<key>   (?&len_key_suffix))
            ''', {}),

            # Shorthand notation
            ('len_shorthand', common_patterns + r'''
                \b (?: (?P<key> (?&all_len_keys)) (?&key_end) )?
                   (?P<value> (?&number))
                   (?&len_shorthand)
            ''', {'default_units': '_mm_', 'default_key': '_shorthand_'}),

            # Shorthand notation that has some abbreviations mixed in
            ('len_shorthand_euro', common_patterns + r'''
                \b (?: (?P<key> (?&all_len_keys)) (?&key_end) )?
                   [a-z]*
                   (?P<value>   (?&number))
                   (?&len_shorthand_euro)
            ''', {'default_units': '_mm_', 'default_key': '_shorthand_'}),

            # Total length key again, this time allowing a RANGE with
            # optional units (compare 'total_len_key_num' above)
            ('total_len_key', common_patterns + r'''
                \b (?P<key>   (?&total_len_key)) (?&key_end)
                (?P<value> (?&range)) \s*
                (?P<units> (?&len_units))?
            ''', {'default_units': '_mm_'}),

            # Ambiguous key accepted when it is not preceded by another word
            ('len_key_ambiguous', common_patterns + r'''
                (?&no_word)
                (?P<key>   (?&len_key_ambiguous)) (?&key_end)
                (?P<value> (?&range))
            ''', {}),

            # Snout-vent length keys
            ('svl_len_key', common_patterns + r'''
                \b (?P<key>   (?&svl_len_key)) (?&key_end)
                   (?P<value> (?&range)) \s*
                   (?P<units> (?&len_units))?
            ''', {}),
        ]

        for name, pattern, options in specs:
            parsers.append(name, pattern, **options)

        return parsers
Example #4
0
    def _battery(self, common_patterns):
        """Build the battery of regular expressions that look for a body mass.

        The patterns are registered in the order listed below, starting with
        compound English units and explicit total weight keys and ending with
        the ambiguous weight key words.

        Args:
            common_patterns: Pattern text prepended to every expression
                registered here. It presumably supplies the definitions
                behind subroutine calls such as (?&range) and (?&wt_units)
                -- confirm with the caller.

        Returns:
            The populated ParserBattery.
        """
        parsers = ParserBattery(
            parse_units=True,
            units_from_key=r''' (?P<units> grams ) $ ''',
        )

        # Each entry is (name, pattern, extra keyword arguments for append).
        specs = [
            # Compound English measurement, like: body mass: 4 lbs 8 oz
            # NOTE(review): compound_value is passed as the integer 2 here;
            # confirm whether ParserBattery.append expects a flag or a count.
            ('en_wt', common_patterns + r'''
                \b (?P<key>    (?&all_wt_keys))? (?&key_end)?
                   (?P<value1> (?&range))    \s*
                   (?P<units1> (?&wt_pound)) \s*
                   (?P<value2> (?&range))    \s*
                   (?P<units2> (?&wt_ounce))
            ''', {'default_key': '_english_', 'compound_value': 2}),

            # Total weight key with optional units
            ('total_wt_key', common_patterns + r'''
                \b (?P<key>   (?&total_wt_key)) (?&key_end)
                   (?P<value> (?&range)) \s*
                   (?P<units> (?&wt_units))?
            ''', {}),

            # Secondary body mass keys come next
            ('other_wt_key', common_patterns + r'''
                \b (?P<key>   (?&other_wt_key)) (?&key_end)
                   (?P<value> (?&range)) \s*
                   (?P<units> (?&wt_units))?
            ''', {}),

            # Keys that only count when units are present
            ('key_units_req', common_patterns + r'''
                \b (?P<key>   (?&key_units_req)) (?&key_end)
                   (?P<value> (?&range)) \s*
                   (?P<units> (?&wt_units))
            ''', {}),

            # A body mass mentioned inside a phrase: up to 32 non-digit
            # characters may separate the key from the value
            ('wt_in_phrase', common_patterns + r'''
                \b (?P<key>   (?&wt_in_phrase)) \D{1,32}
                   (?P<value> (?&range)) \s*
                   (?P<units> (?&wt_units))?
            ''', {}),

            # Out of order, units before the value: body mass (g) 20-25
            ('wt_key_word', common_patterns + r'''
                \b (?P<key>   (?&wt_key_word)) \s*
                   (?&open) \s* (?P<units> (?&wt_units)) \s* (?&close) \s*
                   (?P<value> (?&range))
            ''', {}),

            # Key words that need units to tell what is being measured
            ('wt_key_word_req', common_patterns + r'''
                (?P<key>   (?&wt_key_word)) (?&key_end)
                (?P<value> (?&range)) \s*
                (?P<units> (?&wt_units))
            ''', {}),

            # Shorthand notation
            ('wt_shorthand', common_patterns + r'''
                \b (?: (?P<key> (?&all_wt_keys)) (?&key_end) )?
                   (?&wt_shorthand) \s*
                   (?P<value> (?&number)) \s*
                   (?P<units> (?&wt_units))?
            ''', {'default_key': '_shorthand_'}),

            # Shorthand notation where the units are required
            ('wt_shorthand_req', common_patterns + r'''
                \b (?: (?P<key> (?&all_wt_keys)) (?&key_end) )?
                   (?&wt_shorthand_req) \s*
                   (?P<value> (?&number)) \s*
                   (?P<units> (?&wt_units))
            ''', {'default_key': '_shorthand_'}),

            # Shorthand notation that has some abbreviations mixed in
            ('wt_shorthand_euro', common_patterns + r'''
                \b (?: (?P<key> (?&all_wt_keys)) (?&key_end) )?
                   (?&wt_shorthand_euro) \s*
                   (?P<value> (?&number)) \s*
                   (?P<units> (?&wt_units))?
            ''', {'default_key': '_shorthand_'}),

            # The 'fa' notation, which can be shorter than the other
            # shorthand notations
            ('wt_fa', common_patterns + r'''
                fa \d* -
                (?P<value> (?&number)) \s*
                (?P<units> (?&wt_units))?
            ''', {'default_key': '_shorthand_'}),

            # Finally the weight key words with a RANGE and optional units
            ('wt_key_ambiguous', common_patterns + r'''
                (?P<key>   (?&wt_key_word)) (?&key_end)
                (?P<value> (?&range)) \s*
                (?P<units> (?&wt_units))?
            ''', {}),
        ]

        for name, pattern, options in specs:
            parsers.append(name, pattern, **options)

        return parsers