def bootstrap(config):
    """
    Bootstraps the location parser

    Compiles a regex made of alternative postal code shapes and stores it
    in the registry under ``ZCP_postal_code_regex``.

    The alternatives are wrapped in a non-capturing group so that the ``^``
    and ``$`` anchors apply to every alternative. Without the wrapper,
    ``|`` binds looser than the anchors, so ``^`` only constrained the
    first alternative and ``$`` only the last one.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    alternatives = (
        r'(\d{2,7}(-\d{2,4})?)',
        r'([a-zA-Z]\d{3})',
        r'([a-zA-Z]{2}\s\d{2})',
        r'([a-zA-Z]{2}-\d{2})',
        r'(AD\d{3})',
        r'(\d{3}\s\d{2})',
        r'([a-zA-Z]{2}\d{4})',
        r'(\d{4}\sW3)',
        r'(\d{4}\s[a-zA-Z]{2})',
        r'([a-zA-Z]\d[a-zA-Z]\s\d[a-zA-Z]\d)',
        r'(AZ\s\d{4})',
        r'(BB\d{1,5})',
        r'([a-zA-Z]{2}\d{1,2}\s\d[a-zA-Z]{2})',
        r'(JMA[a-zA-Z]{2}\d{2})',
        r'(AZ-\d{4})',
        r'([a-zA-Z]\d{4}[a-zA-Z]{3})',
        r'([a-zA-Z]{2}\d{2}\s\d[a-zA-Z]{2})',
        r'([a-zA-Z]{3}\s\d{4})',
        r'([a-zA-Z]{4}\s1ZZ)',
        r'([a-zA-Z]{2}\d{1,2}(-\d{4})?)',
        r'(\d{5}\sCEDEX(\s\d{1,2})?)',
    )

    # (?:...) does not capture, so existing group numbers are unchanged
    postal_regex = re.compile(r'^(?:' + r'|'.join(alternatives) + r')$')

    registry.set('ZCP_postal_code_regex', postal_regex)
def bootstrap(config):
    """
    Trains the bayes classifier with examples from various programming
    languages

    Every member of the ``trainers.zip`` archive located next to this
    module is read; the part of the member name before the first ``.`` is
    used as the language name and the member content as the training text.
    The trained classifier is stored in the registry under ``PP_bayes``.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    classifier = simplebayes.SimpleBayes(
        ProgrammingBayesianClassifier.bayes_tokenizer
    )

    directory = os.path.dirname(os.path.abspath(__file__))
    archive_path = os.path.join(directory, 'trainers.zip')

    # The context manager guarantees the archive is closed, even when
    # reading one of its members fails.
    trainers = {}
    with zipfile.ZipFile(archive_path, 'r') as trainer_zip:
        for filename in trainer_zip.namelist():
            language = filename.split('.')[0]
            trainers[language] = trainer_zip.read(filename)

    for language, sample in trainers.items():
        classifier.train(language, sample)

    registry.set('PP_bayes', classifier)
def bootstrap(config):
    """
    This method is statically called to bootstrap a parser

    Compiles a regex made of alternative postal code shapes and stores it
    in the registry under ``ZCP_postal_code_regex``.

    The alternatives are wrapped in a non-capturing group so that the ``^``
    and ``$`` anchors apply to every alternative. Without the wrapper,
    ``|`` binds looser than the anchors, so ``^`` only constrained the
    first alternative and ``$`` only the last one.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    alternatives = (
        r'(\d{2,7}(-\d{2,4})?)',
        r'([a-zA-Z]\d{3})',
        r'([a-zA-Z]{2}\s\d{2})',
        r'([a-zA-Z]{2}-\d{2})',
        r'(AD\d{3})',
        r'(\d{3}\s\d{2})',
        r'([a-zA-Z]{2}\d{4})',
        r'(\d{4}\sW3)',
        r'(\d{4}\s[a-zA-Z]{2})',
        r'([a-zA-Z]\d[a-zA-Z]\s\d[a-zA-Z]\d)',
        r'(AZ\s\d{4})',
        r'(BB\d{1,5})',
        r'([a-zA-Z]{2}\d{1,2}\s\d[a-zA-Z]{2})',
        r'(JMA[a-zA-Z]{2}\d{2})',
        r'(AZ-\d{4})',
        r'([a-zA-Z]\d{4}[a-zA-Z]{3})',
        r'([a-zA-Z]{2}\d{2}\s\d[a-zA-Z]{2})',
        r'([a-zA-Z]{3}\s\d{4})',
        r'([a-zA-Z]{4}\s1ZZ)',
        r'([a-zA-Z]{2}\d{1,2}(-\d{4})?)',
        r'(\d{5}\sCEDEX(\s\d{1,2})?)',
    )

    # (?:...) does not capture, so existing group numbers are unchanged
    postal_regex = re.compile(r'^(?:' + r'|'.join(alternatives) + r')$')

    registry.set('ZCP_postal_code_regex', postal_regex)
def bootstrap(config):
    """
    This method is statically called to bootstrap a parser

    Stores a compiled, case-insensitive regex matching a leading ``the ``
    in the registry under ``LP_the_regex``.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    registry.set('LP_the_regex', re.compile('^the ', re.IGNORECASE))
def bootstrap(config):
    """
    This method is statically called to bootstrap a parser

    Compiles ``VALID_ADDRESS_REGEXP`` and stores the result in the registry
    under ``EP_valid_regex``.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    registry.set('EP_valid_regex', re.compile(VALID_ADDRESS_REGEXP))
def bootstrap(config):
    """
    This method is statically called to bootstrap a parser

    Stores a compiled regex matching a single uppercase ASCII letter in the
    registry under ``NP_upper_alpha_regex``.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    registry.set('NP_upper_alpha_regex', re.compile('[A-Z]'))
def bootstrap(config):
    """
    This method is statically called to bootstrap a parser

    Stores the compiled splitting regex in the registry under
    ``AP_split_regex``. It matches one whitespace character or one of
    ``# ` ' ? . ; , - /``.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    # The hyphen is escaped so it is read as a literal. Unescaped, ",-/"
    # formed a character range that only happened to cover the same
    # characters (, - . /), so the matched set is unchanged.
    split_regex = re.compile(r"[\s#`'?.;,\-/]")
    registry.set('AP_split_regex', split_regex)
def test_flush(self):
    """A populated registry has no stored entries after flush()"""
    registry.set('test', 'foo')

    stored_value = registry.get('test')
    self.assertEqual('foo', stored_value)

    size_before_flush = len(registry.storage)
    self.assertNotEqual(0, size_before_flush)

    registry.flush()

    size_after_flush = len(registry.storage)
    self.assertEqual(0, size_after_flush)
def get_preposition_literals():
    """
    Generates the prepositions parser and returns it

    The parser is an ``Or`` of one ``CaselessLiteral`` per preposition. It
    is built once, stored in the registry under ``DP_prepositions`` and
    served from there on later calls.
    """
    cache_key = 'DP_prepositions'

    if not registry.test(cache_key):
        words = DataHandler().get_prepositions()
        parser = Or([CaselessLiteral(word) for word in words])
        registry.set(cache_key, parser)
        return parser

    return registry.get(cache_key)
def get_prepositions(self):
    """
    returns the list of prepositions

    The content of ``prepositions.yaml`` is parsed once, stored in the
    registry under ``DATA_prepositions`` and served from there on later
    calls.

    :return: the parsed content of prepositions.yaml
    """
    if registry.test('DATA_prepositions'):
        return registry.get('DATA_prepositions')

    handle = self.get_file_handle('prepositions.yaml')
    try:
        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # in PyYAML >= 5.1 and rejected in >= 6.0; consider yaml.safe_load
        # once the supported PyYAML versions are confirmed.
        prepositions = yaml.load(handle)
    finally:
        # Close the handle even when parsing fails
        handle.close()

    registry.set('DATA_prepositions', prepositions)

    return prepositions
def bootstrap(config):
    """
    This method is statically called to bootstrap a parser

    Compiles four coordinate regexes and stores them in the registry under
    ``CP_coord_regex``, ``CP_deg_regex``, ``CP_deg_min_regex`` and
    ``CP_deg_min_sec_regex``.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    # Regular coordinates
    # 34.56,23.65 or 34.56 23.65 or 34.56 , 23.65
    first_number = r'^(-?\d{1,3}(?:\.\d+)?)'
    separator = r'(?:(?:(?:\s+)?,(?:\s+)?)|(?:\s+))'
    # NOTE(review): the trailing "?" sits outside the second capture group,
    # so the whole second number is optional while its decimal part is
    # mandatory - confirm this is intended.
    second_number = r'(-?\d{1,3}(?:\.\d+))?$'
    registry.set(
        'CP_coord_regex',
        re.compile(first_number + separator + second_number)
    )

    # Whitespace between the latitude and longitude halves
    gap = u('\s+')

    # Degree coordinates
    # 40.244° N 79.123° W
    deg_lat = u('^(\d{1,3}\.\d+°?\s+[nNsS])')
    deg_lon = u('(\d{1,3}\.\d+°?\s+[wWeE])$')
    registry.set('CP_deg_regex', re.compile(deg_lat + gap + deg_lon))

    # Deg/min coordinates
    # 13° 34.425' N 45° 37.983' W
    deg_min_lat = u('^(\d{1,3}°?\s+\d{1,3}\.\d+\'?\s+[nNsS])')
    deg_min_lon = u('(\d{1,3}°?\s+\d{1,3}\.\d+\'?\s+[wWeE])$')
    registry.set(
        'CP_deg_min_regex',
        re.compile(deg_min_lat + gap + deg_min_lon)
    )

    # Deg/min/sec coordinates
    # 40° 26' 46.56" N 79° 58' 56.88" W
    deg_min_sec_lat = \
        u('^(\d{1,3}°?\s+\d{1,3}\'?\s+\d{1,3}(?:\.\d+)?"?\s+[nNsS])')
    deg_min_sec_lon = \
        u('(\d{1,3}°?\s+\d{1,3}\'?\s+\d{1,3}(?:\.\d+)?"?\s+[wWeE])$')
    registry.set(
        'CP_deg_min_sec_regex',
        re.compile(deg_min_sec_lat + gap + deg_min_sec_lon)
    )
def get_prepositions(self):
    """
    returns the list of prepositions

    The content of ``prepositions.yaml`` is parsed once, stored in the
    registry under ``DATA_prepositions`` and served from there on later
    calls.

    :return: list of prepositions
    :rtype: list
    """
    if registry.test('DATA_prepositions'):
        return registry.get('DATA_prepositions')

    handle = self.get_file_handle('prepositions.yaml')
    try:
        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # in PyYAML >= 5.1 and rejected in >= 6.0; consider yaml.safe_load
        # once the supported PyYAML versions are confirmed.
        prepositions = yaml.load(handle)
    finally:
        # Close the handle even when parsing fails
        handle.close()

    registry.set('DATA_prepositions', prepositions)

    return prepositions
def bootstrap(config):
    """
    Loads tokens from the yaml files on disk

    Every ``languages/*.yaml`` file next to this module is parsed. The
    ``keywords`` of all files are merged into one set stored under
    ``PP_all_keywords``; the parsed files, keyed by their ``id``, are
    stored under ``PP_language_keywords``. Finally the bayesian classifier
    is bootstrapped with the same config.

    :param config: cahoots config
    :type config: cahoots.config.BaseConfig
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    pattern = os.path.join(module_dir, "languages/*.yaml")

    keyword_pool = set()
    languages_by_id = {}

    for yaml_path in glob.glob(pattern):
        with open(yaml_path, 'r') as stream:
            definition = yaml.load(stream)
        keyword_pool.update(definition['keywords'])
        languages_by_id[definition['id']] = definition

    registry.set('PP_all_keywords', keyword_pool)
    registry.set('PP_language_keywords', languages_by_id)

    ProgrammingBayesianClassifier.bootstrap(config)
def bootstrap(config):
    """
    This method is statically called to bootstrap a parser

    Builds the two timedelta phrase parsers and stores them in the registry
    under ``DP_pre_timedelta_phrases`` and ``DP_post_timedelta_phrases``.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    singular_scales = [
        'microsecond', 'millisecond', 'second', 'minute',
        'hour', 'day', 'week', 'year',
    ]
    # Plural forms first, then singular forms
    time_scales = [scale + 's' for scale in singular_scales]
    time_scales += singular_scales

    # <number> <timescale> <preposition>
    # 3 seconds until / 50 seconds since
    pre_literals = []
    for scale in time_scales:
        pre_literals.append(DateParser.create_pre_timedelta_literal(scale))
    trailing_text = Word(alphas + nums + " .,;-/'")
    registry.set('DP_pre_timedelta_phrases', Or(pre_literals) + trailing_text)

    # <operator> <number> <timescale>
    # plus 5 hours / - 17 days
    post_literals = []
    for scale in time_scales:
        post_literals.append(DateParser.create_post_timedelta_literal(scale))
    registry.set('DP_post_timedelta_phrases', Or(post_literals))
def bootstrap(config):
    """
    This method is statically called to bootstrap a parser

    Compiles a regex made of alternative postal code shapes and stores it
    in the registry under ``ZCP_postal_code_regex``.

    The alternatives are wrapped in a non-capturing group so that the ``^``
    and ``$`` anchors apply to every alternative. Without the wrapper,
    ``|`` binds looser than the anchors, so ``^`` only constrained the
    first alternative and ``$`` only the last one.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    alternatives = (
        r'(\d{2,7}(-\d{2,4})?)',
        r'([a-zA-Z]\d{3})',
        r'([a-zA-Z]{2}\s\d{2})',
        r'([a-zA-Z]{2}-\d{2})',
        r'(AD\d{3})',
        r'(\d{3}\s\d{2})',
        r'([a-zA-Z]{2}\d{4})',
        r'(\d{4}\sW3)',
        r'(\d{4}\s[a-zA-Z]{2})',
        r'([a-zA-Z]\d[a-zA-Z]\s\d[a-zA-Z]\d)',
        r'(AZ\s\d{4})',
        r'(BB\d{1,5})',
        r'([a-zA-Z]{2}\d{1,2}\s\d[a-zA-Z]{2})',
        r'(JMA[a-zA-Z]{2}\d{2})',
        r'(AZ-\d{4})',
        r'([a-zA-Z]\d{4}[a-zA-Z]{3})',
        r'([a-zA-Z]{2}\d{2}\s\d[a-zA-Z]{2})',
        r'([a-zA-Z]{3}\s\d{4})',
        r'([a-zA-Z]{4}\s1ZZ)',
        r'([a-zA-Z]{2}\d{1,2}(-\d{4})?)',
        r'(\d{5}\sCEDEX(\s\d{1,2})?)',
    )

    # (?:...) does not capture, so existing group numbers are unchanged
    postal_regex = re.compile(r'^(?:' + r'|'.join(alternatives) + r')$')

    registry.set('ZCP_postal_code_regex', postal_regex)
def bootstrap(config):
    """
    This method is statically called to bootstrap a parser

    Builds the two timedelta phrase parsers and stores them in the registry
    under ``DP_pre_timedelta_phrases`` and ``DP_post_timedelta_phrases``.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    singular_scales = [
        'microsecond', 'millisecond', 'second', 'minute',
        'hour', 'day', 'week', 'year',
    ]
    # Plural forms first, then singular forms
    time_scales = [scale + 's' for scale in singular_scales]
    time_scales += singular_scales

    # <number> <timescale> <preposition>
    # 3 seconds until / 50 seconds since
    pre_literals = []
    for scale in time_scales:
        pre_literals.append(DateParser.create_pre_timedelta_literal(scale))
    trailing_text = Word(alphas + nums + " .,;-/'")
    registry.set('DP_pre_timedelta_phrases', Or(pre_literals) + trailing_text)

    # <operator> <number> <timescale>
    # plus 5 hours / - 17 days
    post_literals = []
    for scale in time_scales:
        post_literals.append(DateParser.create_post_timedelta_literal(scale))
    registry.set('DP_post_timedelta_phrases', Or(post_literals))
def bootstrap(config):
    """
    This method is statically called to bootstrap a parser

    Every ``languages/*.yaml`` file next to this module is parsed. The
    ``keywords`` of all files are merged into one set stored under
    ``PP_all_keywords``; the parsed files, keyed by their ``id``, are
    stored under ``PP_language_keywords``. Finally the bayesian classifier
    is bootstrapped with the same config.

    :param config: cahoots config
    :type config: cahoots.config.BaseConfig
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    pattern = os.path.join(module_dir, "languages/*.yaml")

    keyword_pool = set()
    languages_by_id = {}

    for yaml_path in glob.glob(pattern):
        with open(yaml_path, "r") as stream:
            definition = yaml.load(stream)
        keyword_pool.update(definition["keywords"])
        languages_by_id[definition["id"]] = definition

    registry.set("PP_all_keywords", keyword_pool)
    registry.set("PP_language_keywords", languages_by_id)

    ProgrammingBayesianClassifier.bootstrap(config)
def bootstrap(config):
    """
    This method is statically called to bootstrap a parser

    Every ``languages/*.yaml`` file next to this module is parsed. The
    ``keywords`` of all files are merged into one set stored under
    ``PP_all_keywords``; the parsed files, keyed by their ``id``, are
    stored under ``PP_language_keywords``. Finally the bayesian classifier
    is bootstrapped with the same config.

    :param config: cahoots config
    :type config: cahoots.config.BaseConfig
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    pattern = os.path.join(module_dir, "languages/*.yaml")

    keyword_pool = set()
    languages_by_id = {}

    for yaml_path in glob.glob(pattern):
        with open(yaml_path, 'r') as stream:
            definition = yaml.load(stream)
        keyword_pool.update(definition['keywords'])
        languages_by_id[definition['id']] = definition

    registry.set('PP_all_keywords', keyword_pool)
    registry.set('PP_language_keywords', languages_by_id)

    ProgrammingBayesianClassifier.bootstrap(config)
def bootstrap(config):
    """
    This method is statically called to bootstrap a parser

    Compiles four coordinate regexes and stores them in the registry under
    ``CP_coord_regex``, ``CP_deg_regex``, ``CP_deg_min_regex`` and
    ``CP_deg_min_sec_regex``.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    # Regular coordinates
    # 34.56,23.65 or 34.56 23.65 or 34.56 , 23.65
    first_number = r'^(-?\d{1,3}(?:\.\d+)?)'
    separator = r'(?:(?:(?:\s+)?,(?:\s+)?)|(?:\s+))'
    # NOTE(review): the trailing "?" sits outside the second capture group,
    # so the whole second number is optional while its decimal part is
    # mandatory - confirm this is intended.
    second_number = r'(-?\d{1,3}(?:\.\d+))?$'
    registry.set(
        'CP_coord_regex',
        re.compile(first_number + separator + second_number)
    )

    # Whitespace between the latitude and longitude halves
    gap = u('\s+')

    # Degree coordinates
    # 40.244° N 79.123° W
    deg_lat = u('^(\d{1,3}\.\d+°?\s+[nNsS])')
    deg_lon = u('(\d{1,3}\.\d+°?\s+[wWeE])$')
    registry.set('CP_deg_regex', re.compile(deg_lat + gap + deg_lon))

    # Deg/min coordinates
    # 13° 34.425' N 45° 37.983' W
    deg_min_lat = u('^(\d{1,3}°?\s+\d{1,3}\.\d+\'?\s+[nNsS])')
    deg_min_lon = u('(\d{1,3}°?\s+\d{1,3}\.\d+\'?\s+[wWeE])$')
    registry.set(
        'CP_deg_min_regex',
        re.compile(deg_min_lat + gap + deg_min_lon)
    )

    # Deg/min/sec coordinates
    # 40° 26' 46.56" N 79° 58' 56.88" W
    deg_min_sec_lat = \
        u('^(\d{1,3}°?\s+\d{1,3}\'?\s+\d{1,3}(?:\.\d+)?"?\s+[nNsS])')
    deg_min_sec_lon = \
        u('(\d{1,3}°?\s+\d{1,3}\'?\s+\d{1,3}(?:\.\d+)?"?\s+[wWeE])$')
    registry.set(
        'CP_deg_min_sec_regex',
        re.compile(deg_min_sec_lat + gap + deg_min_sec_lon)
    )
def bootstrap(config):
    """
    Loads unit lists for use in this instance of the measurement parser

    Every ``units/*.yaml`` file next to this module is parsed. Each keyword
    of a file is mapped to the file's ``id`` (``MP_units``) and each ``id``
    to its ``(system, type)`` pair (``MP_systems``). The preposition and
    measurement parsers built from that data are stored under
    ``MP_preposition_parser`` and ``MP_measurement_parser``.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    units = {}
    systems = {}
    prepositions = DataHandler().get_prepositions()

    directory = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(directory, "units/*.yaml")

    for file_path in glob.glob(path):
        # The context manager closes each unit file; previously the
        # handles were left open.
        with open(file_path, 'r') as unit_file:
            unit_type = yaml.load(unit_file)

        for unit in unit_type['keywords']:
            units[unit] = unit_type['id']

        systems[unit_type['id']] = \
            (unit_type['system'], unit_type['type'])

    preposition_parser = \
        Or([CaselessLiteral(s) for s in prepositions]) + Word(alphas)

    # <number> <unit> [<preposition> <word>]
    measurement_parser = \
        originalTextFor(
            Word(nums) +
            ZeroOrMore(',' + Word(nums+',')) +
            ZeroOrMore('.' + Word(nums)) +
            ZeroOrMore(Word(nums) + '/' + Word(nums))
        ) + \
        Or([CaselessLiteral(s) for s in units]) + \
        Optional(originalTextFor(preposition_parser))

    registry.set('MP_units', units)
    registry.set('MP_systems', systems)
    registry.set('MP_preposition_parser', preposition_parser)
    registry.set('MP_measurement_parser', measurement_parser)
def bootstrap(config):
    """
    Loads unit lists for use in this instance of the measurement parser

    Every ``units/*.yaml`` file next to this module is parsed. Each keyword
    of a file is mapped to the file's ``id`` (``MP_units``) and each ``id``
    to its ``(system, type)`` pair (``MP_systems``). The preposition and
    measurement parsers built from that data are stored under
    ``MP_preposition_parser`` and ``MP_measurement_parser``.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    units = {}
    systems = {}
    prepositions = DataHandler().get_prepositions()

    directory = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(directory, "units/*.yaml")

    for file_path in glob.glob(path):
        # The context manager closes each unit file; previously the
        # handles were left open.
        with open(file_path, 'r') as unit_file:
            unit_type = yaml.load(unit_file)

        for unit in unit_type['keywords']:
            units[unit] = unit_type['id']

        systems[unit_type['id']] = \
            (unit_type['system'], unit_type['type'])

    preposition_parser = \
        Or([CaselessLiteral(s) for s in prepositions]) + Word(alphas)

    # <number> <unit> [<preposition> <word>]
    # The number starts with at most three digits, followed by optional
    # ",ddd" groups, ".digits" parts and "digits/digits" fractions.
    measurement_parser = \
        originalTextFor(
            Word(nums, max=3) +
            ZeroOrMore(',' + Word(nums, exact=3)) +
            ZeroOrMore('.' + Word(nums)) +
            ZeroOrMore(Word(nums) + '/' + Word(nums))
        ) + \
        Or([CaselessLiteral(s) for s in units]) + \
        Optional(originalTextFor(preposition_parser))

    registry.set('MP_units', units)
    registry.set('MP_systems', systems)
    registry.set('MP_preposition_parser', preposition_parser)
    registry.set('MP_measurement_parser', measurement_parser)
def setUp(self):
    """Stores a SimpleBayesStub instance in the registry as PP_bayes"""
    stub_classifier = SimpleBayesStub()
    registry.set('PP_bayes', stub_classifier)
def test_set(self):
    """set() places the value in registry.storage under the given key"""
    registry.set('test', 'foo')

    stored_value = registry.storage['test']
    self.assertEqual('foo', stored_value)
def test_test(self):
    """test() is truthy for a key that was set and falsy for one that was not"""
    registry.set('test', 'foo')

    known_key_result = registry.test('test')
    self.assertTrue(known_key_result)

    unknown_key_result = registry.test('bar')
    self.assertFalse(unknown_key_result)
def test_get(self):
    """get() returns the stored value, or None for a key that was not set"""
    registry.set('test', 'foo')

    stored_value = registry.get('test')
    self.assertEqual('foo', stored_value)

    missing_value = registry.get('bar')
    self.assertIsNone(missing_value)
def bootstrap(config):
    """
    preps the address parser

    Stores a compiled, case-insensitive regex matching a leading ``the ``
    in the registry under ``LP_the_regex``.

    :param config: cahoots config (not used by this function)
    :type config: cahoots.config.BaseConfig
    """
    registry.set('LP_the_regex', re.compile('^the ', re.IGNORECASE))