def test_to_csv(self):
    """ Test stats instance dict to csv conversion """
    with mock.patch('PatternOmatic.ge.stats.time') as mock_time:
        mock_time.return_value = .123

        self.stats.aes = 10
        self.stats.mbf = 0.5
        self.stats.mean_time = 0.22
        self.stats.success_rate = 0.5

        # When a best individual has not been found
        csv_stats = \
            f'{.123}\t{self.stats.mbf}\t{self.stats.success_rate}\t{self.stats.aes}\t{self.stats.mean_time}\t' \
            f'{None}\t'

        super().assertEqual(csv_stats, self.stats._to_csv())

        # When a best individual has been found
        i = object.__new__(Individual)
        i.__setattr__(self.fitness_value_literal, 1.0)
        self.stats.most_fitted_accumulator = [i]
        csv_stats += f'{None}\t{i.fitness_value}\t'

        super().assertEqual(csv_stats, self.stats._to_csv())

        # Also check csv is correctly persisted
        config = Config()
        config.report_path = self.test_report_path_file
        config.report_format = ReportFormat.CSV

        self.stats.persist()

        with open(self.test_report_path_file, 'r') as persisted_report:
            read_report = persisted_report.readlines()

        super().assertEqual(csv_stats + '\n', read_report[0])
def test_persist(self):
    """ Tests the Stats dict representation is correctly persisted as a report """
    config = Config()
    config.report_format = ReportFormat.JSON
    config.report_path = self.test_report_path_file

    # When a best individual has been found
    i = object.__new__(Individual)
    i.__setattr__(self.fitness_value_literal, 1.0)

    self.stats.aes = 100
    self.stats.mbf = 0.9
    self.stats.mean_time = 0.42
    self.stats.success_rate = 1.0
    self.stats.most_fitted_accumulator = [i]

    self.stats.persist()

    with open(self.test_report_path_file, 'r') as persisted_report:
        read_report = persisted_report.readlines()

    super().assertEqual(str(dict(self.stats)) + '\n', read_report[0])

    # When a best individual has not been found
    self.stats.most_fitted_accumulator = []

    self.stats.persist()

    with open(self.test_report_path_file, 'r') as persisted_report:
        read_report = persisted_report.readlines()

    super().assertEqual(str(dict(self.stats)) + '\n', read_report[1])
def test_find_patterns_when_valid_configuration_file_provided(self):
    """ Checks that providing a valid configuration file path loads configuration from that file """
    config_file_path = \
        os.path.join(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir), 'config.ini')

    _ = find_patterns(self.my_samples, configuration=config_file_path)

    super().assertEqual(config_file_path, Config().file_path)
class BasePopulationTest(unittest.TestCase):
    """ Base class to supply shared attributes and helpers """

    #
    # Shared attributes
    #
    config = Config()

    nlp = spacy.load("en_core_web_sm")
    samples = [nlp(u'I am a raccoon!'),
               nlp(u'You are a cat!'),
               nlp(u'Is she a rabbit?'),
               nlp(u'This is a test')]

    grammar = dgg(samples)
    stats = Stats()

    #
    # Helpers
    #
    def setUp(self) -> None:
        """ Fresh Config instance """
        self.config = Config()

    def tearDown(self) -> None:
        """ Destroy Config instance """
        Config.clear_instance()
def __init__(self, samples: [Doc], grammar: dict, stats: Stats, dna: str = None):
    """
    Individual constructor; if dna is not supplied, randomly sets up its binary genotype

    Args:
        samples: list of Spacy Doc objects
        grammar: Backus Naur Form grammar notation encoded in a dictionary
        stats (Stats): statistics object related with this run
        dna: Optional, binary string representation
    """
    self.config = Config()
    self.samples = samples
    self.grammar = grammar
    self.stats = stats
    self.bin_genotype = self._initialize() if dna is None else \
        self.mutate(dna, self.config.mutation_probability)
    self.int_genotype = self._transcription()
    self.fenotype = self._translation()
    self.fitness_value = Fitness(self.config, self.samples, self.fenotype).__call__()

    # Stats concerns
    self._is_solution()
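# Usage sketch, mirroring the unit tests in this repo: an Individual is built
# from Doc samples, a grammar produced by the dynamic grammar generator (dgg)
# and a shared Stats instance; dna is optional and otherwise randomized.
nlp = spacy.load('en_core_web_sm')
samples = [nlp(u'I am a raccoon!'), nlp(u'You are a cat!')]
grammar = dgg(samples)
stats = Stats()

random_individual = Individual(samples, grammar, stats)
seeded_individual = Individual(samples, grammar, stats, dna='10101010101010101010101010101010')
print(seeded_individual.fenotype, seeded_individual.fitness_value)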
def dynamic_generator(samples: [Doc]) -> dict:
    """
    Dynamically generates a grammar in Backus Naur Form (BNF) notation representing the available Spacy NLP
    Linguistic Feature values of the given sample list of Doc instances

    Args:
        samples: List of Spacy Doc objects

    Returns: Backus Naur Form grammar notation encoded in a dictionary

    """
    config = Config()

    LOG.info(f'Generating BNF based on the following samples: {str(samples)}')

    # BNF root
    pattern_grammar = {S: [P]}

    # Gather features of the seen samples and the max/min number of tokens per sample
    max_length_token, min_length_token, features_dict, extended_features = _features_seen(samples)

    # Update times token per pattern [Min length of tokens, Max length of tokens] interval
    pattern_grammar[P] = _symbol_stacker(T, max_length_token, min_length_token)

    # Update times features per token (Max length of features)
    pattern_grammar[T] = _symbol_stacker(F, _get_features_per_token(features_dict))

    if config.use_token_wildcard is True:
        pattern_grammar[T].append(TOKEN_WILDCARD)

    # Update available features (just the features list)
    list_of_features = list(features_dict.keys())
    if config.use_grammar_operators is True and config.use_extended_pattern_syntax is False:
        pattern_grammar = _add_grammar_operators(pattern_grammar, list_of_features)
    elif config.use_extended_pattern_syntax is True and config.use_grammar_operators is False:
        pattern_grammar = _add_extended_pattern_syntax(pattern_grammar, list_of_features, features_dict)
    else:
        pattern_grammar[F] = list_of_features

    # Update each feature's possible values
    for k, v in features_dict.items():
        if config.use_extended_pattern_syntax is True:
            v.append(XPS)
        pattern_grammar.update({k: v})

    if config.use_custom_attributes is True:
        pattern_grammar = _add_custom_attributes(pattern_grammar, extended_features)

    LOG.info(f'Dynamically generated BNF: {str(pattern_grammar)}')

    return pattern_grammar
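# Usage sketch: the tests import this function as dgg. The exact shape of the
# stacked symbols depends on _symbol_stacker, so the dict sketched below is
# abridged and illustrative only.
nlp = spacy.load('en_core_web_sm')
grammar = dynamic_generator([nlp(u'Is she a rabbit?')])
# Roughly: {S: [P], P: [stacked <T> symbols], T: [stacked <F> symbols],
#           F: [ORTH, TEXT, LOWER, ...], ORTH: ['?', 'Is', 'a', 'rabbit', 'she'], ...}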
def __init__(self):
    """ Stats instance constructor """
    self.config = Config()

    self.success_rate_accumulator = list()
    self.mbf_accumulator = list()
    self.aes_accumulator = list()
    self.time_accumulator = list()
    self.most_fitted_accumulator = list()
    self.solution_found = False
    self.success_rate = None
    self.mbf = None
    self.aes = None
    self.mean_time = None
    self.aes_counter = 0
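# The persistence tests earlier in this section build the expected report with
# str(dict(self.stats)), so Stats must be dict-convertible. A minimal sketch of
# one way to support that, assuming an __iter__ yielding (key, value) pairs
# (an assumption, not necessarily PatternOmatic's actual implementation):
def __iter__(self):
    yield 'success_rate', self.success_rate
    yield 'mbf', self.mbf
    yield 'aes', self.aes
    yield 'mean_time', self.mean_time
    yield 'most_fitted', self.most_fitted_accumulator[-1] if self.most_fitted_accumulator else None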
def test_xps_gop_can_not_be_enabled_together(self):
    """ Tests Spacy's Grammar Operators and Extended Pattern Syntax cannot both be enabled """
    config = Config()

    config.use_grammar_operators = True
    config.use_extended_pattern_syntax = True
    super().assertNotEqual(config.use_grammar_operators, config.use_extended_pattern_syntax)

    config.use_grammar_operators = False
    config.use_extended_pattern_syntax = True
    super().assertEqual(True, config.use_extended_pattern_syntax)

    config.use_grammar_operators = True
    super().assertEqual(False, config.use_extended_pattern_syntax)
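# A minimal sketch (not PatternOmatic's actual implementation) of how such a
# mutual exclusion can be enforced with property setters: enabling one flag
# disables or rejects the other, which is the behaviour the test above verifies.
class MutuallyExclusiveFlags:
    def __init__(self):
        self._use_grammar_operators = False
        self._use_extended_pattern_syntax = False

    @property
    def use_grammar_operators(self) -> bool:
        return self._use_grammar_operators

    @use_grammar_operators.setter
    def use_grammar_operators(self, value: bool) -> None:
        if value is True and self._use_extended_pattern_syntax is True:
            # Grammar operators win: switch the conflicting flag off
            self._use_extended_pattern_syntax = False
        self._use_grammar_operators = value

    @property
    def use_extended_pattern_syntax(self) -> bool:
        return self._use_extended_pattern_syntax

    @use_extended_pattern_syntax.setter
    def use_extended_pattern_syntax(self, value: bool) -> None:
        if value is True and self._use_grammar_operators is True:
            # Ignore the conflicting request; keep grammar operators enabled
            return
        self._use_extended_pattern_syntax = value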
def test_config_read_from_path(self):
    """ Tests providing or not providing a configuration file works as expected """
    # No config file provided
    super().assertEqual(None, self.config.file_path)

    # Correct config file provided
    file_path = os.path.join(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir), 'config.ini')
    Config.clear_instance()
    self.config = Config(file_path)
    super().assertEqual(file_path, self.config.file_path)

    # Bad path provided
    Config.clear_instance()
    self.config = Config('')
    super().assertEqual(None, self.config.file_path)
def test_setting_config_attribute_with_wrong_type_has_no_effect(self):
    """ Tests assigning a value of the wrong type to a Config attribute leaves the attribute unchanged """
    config = Config()

    config.max_runs = 0.5
    config.use_extended_pattern_syntax = None
    config.fitness_function_type = RecombinationType.RANDOM_ONE_POINT_CROSSOVER
    config.report_path = 0

    super().assertNotEqual(config.max_runs, 0.5)
    super().assertNotEqual(config.use_extended_pattern_syntax, None)
    super().assertNotEqual(config.fitness_function_type, RecombinationType.RANDOM_ONE_POINT_CROSSOVER)
    super().assertNotEqual(config.report_path, 0)
def _get_features_per_token(features_dict: dict) -> int:
    """
    Given the configuration set up, determine the maximum number of features per token at grammar

    Args:
        features_dict: dictionary of feature keys with all possible feature value options

    Returns: integer

    """
    config = Config()

    if config.features_per_token <= 0:
        max_length_features = len(features_dict.keys())
    else:
        if len(features_dict.keys()) < config.features_per_token + 1:
            max_length_features = len(features_dict.keys())
        else:
            max_length_features = config.features_per_token

    return max_length_features
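# Worked example of the clamp above (feature values are hypothetical):
# features_per_token <= 0 means "use every feature seen"; a positive value
# caps the count, bounded by the number of features actually available.
features = {'ORTH': ['a'], 'POS': ['DET'], 'LEMMA': ['a']}  # 3 feature keys

config = Config()
config.features_per_token = 0
assert _get_features_per_token(features) == 3  # no cap: all 3 keys

config.features_per_token = 2
assert _get_features_per_token(features) == 2  # capped at 2

config.features_per_token = 5
assert _get_features_per_token(features) == 3  # clamped to the 3 seen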
def tearDown(self) -> None:
    """ Destroy Config instance, reset Underscore's token extensions """
    Config.clear_instance()
    Underscore.token_extensions = {}
def find_patterns(
        samples: List[str],
        configuration: Union[str, None] = None,
        spacy_language_model_name: Union[str, None] = None) -> List[Tuple[Any, ...]]:
    """
    Given some samples, this function finds optimized patterns to be used by Spacy's Rule Based Matcher.

    Args:
        samples: List of strings from where to find common linguistic patterns
        configuration: (str) Optional configuration file path to be loaded (falls back to default configuration)
        spacy_language_model_name: (str) Optional valid Spacy Language Model (falls back to Spacy's en_core_web_sm)

    Returns: List of patterns found and list of each pattern's matching score against the samples

    """
    LOG.info(f'Loading language model {spacy_language_model_name}...')
    if 'en-core-web-sm' not in [d.project_name for d in pkg_resources.working_set]:
        LOG.info('PatternOmatic\'s default spaCy Language Model not installed,'
                 ' proceeding to install en_core_web_sm, please wait...')
        spacy_download('en_core_web_sm')

    try:
        nlp = spacy_load(spacy_language_model_name)
    except OSError:
        LOG.warning(f'Model {spacy_language_model_name} not found, '
                    f'falling back to PatternOmatic\'s default language model: en_core_web_sm')
        nlp = spacy_load('en_core_web_sm')

    LOG.info('Building Doc instances...')
    samples = [nlp(sample) for sample in samples]

    if isinstance(configuration, str):
        LOG.info(f'Setting up configuration from the following path: {configuration}...')
        config = Config(config_file_path=configuration)
    else:
        config = Config()
        LOG.info(f'Existing Config instance found: {config}')

    stats = Stats()

    bnf_g = dgg(samples)

    LOG.info('Starting Execution...')
    for _ in range(0, config.max_runs):
        start = time.monotonic()
        p = Population(samples, bnf_g, stats)
        p.evolve()
        end = time.monotonic()
        stats.add_time(end - start)
        stats.calculate_metrics()

    LOG.info(f'Execution report {stats}')
    stats.persist()

    LOG.info('Best individuals for this execution:')
    stats.most_fitted_accumulator.sort(key=lambda i: i.fitness_value, reverse=True)
    for individual in stats.most_fitted_accumulator:
        LOG.info(f'{individual}')

    return list(zip(*[[i.fenotype, i.fitness_value] for i in stats.most_fitted_accumulator]))
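# Usage sketch: the sample strings are illustrative. find_patterns returns the
# best individuals' fenotypes (Spacy Rule Based Matcher patterns) and their
# fitness scores zipped into two parallel tuples, so they unpack as below.
from PatternOmatic.api import find_patterns  # assumed import path

patterns, scores = find_patterns(['I am a cat!', 'You are a dog!', 'She is a fish!'])

for pattern, score in zip(patterns, scores):
    print(pattern, score)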
def test_config_is_clearable(self):
    """ Tests it's possible to renew the singleton instance """
    Config.clear_instance()
    another_config = Config()

    super().assertNotEqual(self.config, another_config)
def test_config_is_singleton(self):
    """ Tests config instance is a singleton one """
    another_config = Config()

    super().assertEqual(self.config, another_config)
def test_find_patterns_when_config_instance_provided(self):
    """ Checks that setting up a Config instance before the find_patterns invocation works """
    config = Config()
    config.max_runs = 10

    patterns, _ = find_patterns(self.my_samples)

    super().assertEqual(10, len(patterns))
def setUp(self) -> None:
    """ Fresh Config instance """
    self.config = Config()
class TestIndividual(unittest.TestCase):
    """ Unit Test class for GE Individual object """
    config = Config()
    nlp = spacy.load("en_core_web_sm")
    samples = [nlp(u'I am a raccoon!'),
               nlp(u'You are a cat!'),
               nlp(u'Is she a rabbit?'),
               nlp(u'This is a test')]
    grammar = dgg(samples)
    stats = Stats()

    def test_init(self):
        """ Test that Individual instantiation works """
        i = Individual(self.samples, self.grammar, self.stats)

        super().assertIs(type(i), Individual)

    def test_init_with_dna(self):
        """ Test that Individual instantiation works when providing dna """
        i = Individual(self.samples, self.grammar, self.stats, '10101010101010101010101010101010')

        super().assertNotEqual(i, None)

    def test_transcription(self):
        """ Check for transcription idempotency """
        self.config.mutation_probability = 0.0
        i = Individual(self.samples, self.grammar, self.stats, '11111111')
        i._transcription()
        i._transcription()
        i._transcription()

        super().assertListEqual(i.int_genotype, [127, 1])

    def test_translation(self):
        """ Check for translation idempotency """
        self.config.mutation_probability = 0.0
        i = Individual(self.samples, self.grammar, self.stats, '11111111')
        i._translation()
        i._translation()
        i._translation()

        super().assertListEqual(i.fenotype,
                                [{'TEXT': 'am'}, {'TEXT': '?'}, {'TEXT': 'am'}, {'TEXT': '?'}, {'TEXT': 'am'}])

    def test_mutation(self):
        """ Checks that mutation works """
        self.config.mutation_probability = 1.0
        i = Individual(self.samples, self.grammar, self.stats, '11111111')

        super().assertNotEqual(i.bin_genotype, '11111111')

    def test_fitness_basic(self):
        """ Fitness "basic" sets fitness """
        self.config.mutation_probability = 0.0
        self.config.fitness_function_type = FitnessType.BASIC

        i = Individual(self.samples, self.grammar, self.stats, '01110101100101100110010110010101')

        super().assertEqual(i.fitness_value, 0.25)

    def test_fitness_full_match(self):
        """ Fitness "full match" sets fitness """
        self.config.mutation_probability = 0.0
        self.config.fitness_function_type = FitnessType.FULL_MATCH

        i = Individual(self.samples, self.grammar, self.stats, '01101010100001101000110111000100')

        super().assertEqual(i.fitness_value, 0.25)

    def test_token_wildcard_penalty(self):
        """ Checks that token wildcard penalty is properly set """
        # When using token wildcard, penalty is applied
        f = object.__new__(Fitness)
        f.fenotype = [{}, {}, {}, 'Whatever']
        self.config.use_token_wildcard = True
        f.config = self.config

        super().assertEqual(0.25, f._wildcard_penalty(1.0))

        # When not using token wildcard, penalty is not applied
        self.config.use_token_wildcard = False
        f.fenotype = 1.0

        super().assertEqual(1.0, f._wildcard_penalty(1.0))

    def test_translate(self):
        """ Verifies conversions over the BNF are done correctly """
        i = object.__new__(Individual)

        # Root
        i.grammar = {S: [P]}
        super().assertEqual('"S":"<P>"', i._translate(0, S, S))

        # Pattern root symbol to Token symbol
        i.grammar = {P: [T]}
        super().assertEqual(T, i._translate(0, P, P))

        # Token symbol to Feature symbol inside Token
        i.grammar = {T: [F]}
        super().assertEqual('{<F>}', i._translate(0, T, T))

        # Token symbol to wildcard
        i.grammar = {T: [TOKEN_WILDCARD]}
        super().assertEqual('{}', i._translate(0, T, T))

        # Feature symbol to specific symbol
        i.grammar = {F: [ORTH]}
        super().assertEqual('{<ORTH>}', i._translate(0, F, '{<F>}'))

        # Basic Terminal conversion
        i.grammar = {ORTH: ['Test']}
        super().assertEqual('{"ORTH":"Test"}', i._translate(0, ORTH, '{<ORTH>}'))

        # Underscore conversion
        i.grammar = {UNDERSCORE: [IS_CURRENCY]}
        super().assertEqual('{"_": {<CUSTOM_IS_CURRENCY>}}', i._translate(0, UNDERSCORE, '{<UNDERSCORE>}'))

        # Underscore terminal conversion
        i.grammar = {IS_CURRENCY: [True]}
        super().assertEqual('{"_": {"CUSTOM_IS_CURRENCY":"True"}}',
                            i._translate(0, IS_CURRENCY, '{"_": {<CUSTOM_IS_CURRENCY>}}'))

        # Grammar Operators conversion
        i.grammar = {OP: ZERO_OR_MORE}
        super().assertEqual('"OP":"*"', i._translate(0, OP, '<OP>'))

        # Extended Pattern Syntax conversion (base)
        i.grammar = {XPS: [IN]}
        super().assertEqual('{<IN>}', i._translate(0, XPS, '<XPS>'))

        i.grammar = {ORTH: [XPS]}
        super().assertEqual('"ORTH":<XPS>', i._translate(0, ORTH, '<ORTH>'))

        # Extended Pattern Syntax conversion (terminal logical)
        i.grammar = {NOT_IN: [['Test']]}
        super().assertEqual('{"ORTH": {"NOT_IN":["Test"]}}', i._translate(0, NOT_IN, '{"ORTH": {<NOT_IN>}}'))

        # Extended Pattern Syntax (terminal arithmetical)
        i.grammar = {GTH: [5]}
        super().assertEqual('{"LENGTH": {">":5}}', i._translate(0, GTH, '{"LENGTH": {<GTH>}}'))

    #
    # Helpers
    #
    def setUp(self) -> None:
        """ Fresh Config instance """
        self.config = Config()

    def tearDown(self) -> None:
        """ Destroy Config instance """
        Config.clear_instance()
def _features_seen(samples: [Doc]) -> (int, int, dict, dict):
    """
    Builds up a dictionary containing Spacy Linguistic Feature Keys and their respective seen values for the sample

    Args:
        samples: List of Spacy Doc objects

    Returns: Max doc length, min doc length, a dict of features and a dict of extended (custom attribute) features

    """
    config = Config()

    # Just tokenizer features
    orth_list = []
    text_list = []
    lower_list = []
    length_list = []
    shape_list = []

    # For boolean features
    bool_list = [True, False]

    # Require more than a tokenizer
    pos_list = []
    tag_list = []
    dep_list = []
    lemma_list = []
    ent_type_list = []

    # Capture the len of the largest and smallest docs
    max_doc_length = 0
    min_doc_length = 999999999

    # Set token extensions
    if config.use_custom_attributes is True:
        _set_token_extension_attributes(samples[0][0])
        extended_features = _extended_features_seen([token for sample in samples for token in sample])
    else:
        extended_features = {UNDERSCORE: {}}

    for sample in samples:
        sample_length = len(sample)

        for token in sample:
            orth_list.append(token.orth_)
            text_list.append(token.text)
            lower_list.append(token.lower_)
            length_list.append(len(token))
            pos_list.append(token.pos_)
            tag_list.append(token.tag_)
            dep_list.append(token.dep_)
            lemma_list.append(token.lemma_)
            shape_list.append(token.shape_)
            ent_type_list.append(token.ent_type_)

        # Checks for max/min length of tokens per sample
        if sample_length > max_doc_length:
            max_doc_length = sample_length
        if sample_length < min_doc_length:
            min_doc_length = sample_length

    if config.use_uniques is True:
        features = {ORTH: sorted(list(set(orth_list))),
                    TEXT: sorted(list(set(text_list))),
                    LOWER: sorted(list(set(lower_list))),
                    LENGTH: sorted(list(set(length_list))),
                    POS: sorted(list(set(pos_list))),
                    TAG: sorted(list(set(tag_list))),
                    DEP: sorted(list(set(dep_list))),
                    LEMMA: sorted(list(set(lemma_list))),
                    SHAPE: sorted(list(set(shape_list))),
                    ENT_TYPE: sorted(list(set(ent_type_list)))}
    else:
        features = {ORTH: orth_list,
                    TEXT: text_list,
                    LOWER: lower_list,
                    LENGTH: length_list,
                    POS: pos_list,
                    TAG: tag_list,
                    DEP: dep_list,
                    LEMMA: lemma_list,
                    SHAPE: shape_list,
                    ENT_TYPE: ent_type_list}

    # Add boolean features
    if config.use_boolean_features is True:
        features.update({
            IS_ALPHA: bool_list,
            IS_ASCII: bool_list,
            IS_DIGIT: bool_list,
            IS_LOWER: bool_list,
            IS_UPPER: bool_list,
            IS_TITLE: bool_list,
            IS_PUNCT: bool_list,
            IS_SPACE: bool_list,
            IS_STOP: bool_list,
            LIKE_NUM: bool_list,
            LIKE_URL: bool_list,
            LIKE_EMAIL: bool_list
        })

    # Drop all observations equal to empty string
    features = _feature_pruner(features)
    extended_features[UNDERSCORE] = _feature_pruner(extended_features[UNDERSCORE])

    return max_doc_length, min_doc_length, features, extended_features
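# Illustrative call (model-dependent, values abridged): with a single
# five-token sample, max and min doc length coincide and each feature key maps
# to the sorted unique values seen when use_uniques is enabled.
nlp = spacy.load('en_core_web_sm')
max_len, min_len, features, extended = _features_seen([nlp(u'I am a raccoon!')])
# max_len == min_len == 5           (tokens: I, am, a, raccoon, !)
# features[ORTH] == ['!', 'I', 'a', 'am', 'raccoon']
# features[POS], features[TAG], ... depend on the loaded language model
# extended == {UNDERSCORE: {...}}   (custom attributes, when enabled)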
def tearDown(self) -> None:
    """ Destroy Config instance """
    Config.clear_instance()
class TestConfig(unittest.TestCase):
    """ Test class for settings """
    config = None

    def test_config_is_singleton(self):
        """ Tests config instance is a singleton one """
        another_config = Config()

        super().assertEqual(self.config, another_config)

    def test_config_is_clearable(self):
        """ Tests it's possible to renew the singleton instance """
        Config.clear_instance()
        another_config = Config()

        super().assertNotEqual(self.config, another_config)

    def test_config_read_from_path(self):
        """ Tests providing or not providing a configuration file works as expected """
        # No config file provided
        super().assertEqual(None, self.config.file_path)

        # Correct config file provided
        file_path = os.path.join(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir), 'config.ini')
        Config.clear_instance()
        self.config = Config(file_path)
        super().assertEqual(file_path, self.config.file_path)

        # Bad path provided
        Config.clear_instance()
        self.config = Config('')
        super().assertEqual(None, self.config.file_path)

    def test_xps_gop_can_not_be_enabled_together(self):
        """ Tests Spacy's Grammar Operators and Extended Pattern Syntax cannot both be enabled """
        config = Config()

        config.use_grammar_operators = True
        config.use_extended_pattern_syntax = True
        super().assertNotEqual(config.use_grammar_operators, config.use_extended_pattern_syntax)

        config.use_grammar_operators = False
        config.use_extended_pattern_syntax = True
        super().assertEqual(True, config.use_extended_pattern_syntax)

        config.use_grammar_operators = True
        super().assertEqual(False, config.use_extended_pattern_syntax)

    def test_setting_config_attribute_with_wrong_type_has_no_effect(self):
        """ Tests assigning a value of the wrong type to a Config attribute leaves the attribute unchanged """
        config = Config()

        config.max_runs = 0.5
        config.use_extended_pattern_syntax = None
        config.fitness_function_type = RecombinationType.RANDOM_ONE_POINT_CROSSOVER
        config.report_path = 0

        super().assertNotEqual(config.max_runs, 0.5)
        super().assertNotEqual(config.use_extended_pattern_syntax, None)
        super().assertNotEqual(config.fitness_function_type, RecombinationType.RANDOM_ONE_POINT_CROSSOVER)
        super().assertNotEqual(config.report_path, 0)

    def test_validate_config_argument(self):
        """ Checks that config arguments are properly fetched according to their type """
        config_parser = configparser.ConfigParser()

        test_section = 'test_section'
        test_option_int = 'test_option_int'
        test_option_float = 'test_option_float'
        test_option_boolean = 'test_option_boolean'
        test_option_string = 'test_option_string'

        config_parser.add_section(test_section)
        config_parser[test_section][test_option_int] = '0'
        config_parser[test_section][test_option_float] = '0.0'
        config_parser[test_section][test_option_boolean] = 'False'
        config_parser[test_section][test_option_string] = ''

        # With valid types
        super().assertEqual(
            0, self.config._validate_config_argument(test_section, test_option_int, 1, config_parser))
        super().assertEqual(
            .0, self.config._validate_config_argument(test_section, test_option_float, .1, config_parser))
        super().assertEqual(
            False, self.config._validate_config_argument(test_section, test_option_boolean, True, config_parser))
        super().assertEqual(
            '', self.config._validate_config_argument(test_section, test_option_string, 'Whatever', config_parser))

        # With wrong type
        config_parser[test_section][test_option_int] = 'False'
        super().assertEqual(
            1, self.config._validate_config_argument(test_section, test_option_int, 1, config_parser))

        # With not even a possible type used by the config parser
        super().assertEqual(
            {}, self.config._validate_config_argument(test_section, test_option_int, {}, config_parser))

    #
    # Helpers
    #
    def setUp(self) -> None:
        """ Fresh Config instance """
        self.config = Config()

    def tearDown(self) -> None:
        """ Destroy Config instance """
        Config.clear_instance()