def test_duplicate_tokens_error(self):
    """Building a pattern from a token list containing duplicates must raise
    DuplicateTokensError."""
    doc = doc1
    # [We, introduce, introduce, methods] — token index 1 appears twice
    examples_with_duplicates = [
        util.idxs_to_tokens(doc, [0, 1, 1, 3]),
    ]
    for example in examples_with_duplicates:
        with pytest.raises(DuplicateTokensError):
            build_dependency_pattern(doc, example)
def test_tokens_not_connected_error(self):
    """Building a pattern from tokens that do not form a connected dependency
    subtree must raise TokensNotFullyConnectedError."""
    doc = doc1
    # [courses, generated, by, models] — not fully connected in the parse tree
    disconnected_examples = [
        util.idxs_to_tokens(doc, [19, 20, 21, 27]),
    ]
    feature_dict = {'DEP': 'dep_', 'TAG': 'tag_'}
    for example in disconnected_examples:
        with pytest.raises(TokensNotFullyConnectedError):
            build_dependency_pattern(doc, example, feature_dict)
def test_custom_extension(self):
    """Patterns built over a custom `_.` extension attribute should match the
    example, hit the should-hit cases, and miss the should-miss cases.

    Also serializes each built pattern to examples/pattern_<i>.json.
    """
    # force=True so re-registering the extension (e.g. on a repeated test run
    # in the same interpreter session) does not raise ValueError.
    Token.set_extension('custom_attr', default=False, force=True)
    feature_dict = {'DEP': 'dep_', '_': {'custom_attr': 'custom_attr'}}
    for i, case in enumerate(cases):
        doc = case['example']['doc']
        # Give every token the same extension value so the pattern can key on it.
        for token in doc:
            token._.custom_attr = 'my_attr'
        match_example = case['example']['match']
        pattern = build_dependency_pattern(doc, match_example, feature_dict)
        matches = match.find_matches(doc, pattern)
        assert match_example in matches, 'does not match example'
        pattern_file_name = 'examples/pattern_{}.json'.format(i)
        with open(pattern_file_name, 'w') as f:
            json.dump(pattern, f, indent=2)
        if 'should_hit' in case:
            for item in case['should_hit']:
                doc = item['doc']
                hit_match = item['match']
                matches = match.find_matches(doc, pattern)
                assert hit_match in matches, 'false negative'
        if 'should_miss' in case:
            for item in case['should_miss']:
                doc = item['doc']
                miss_match = item['match']
                matches = match.find_matches(doc, pattern)
                assert miss_match not in matches, 'false positive'
def yield_tree_level_pattern_variants(role_pattern, training_match, feature_dict):
    """Yield RolePattern variants whose match tree is extended by one token.

    Each variant is built from an extended tree produced by
    spacy_pattern_builder.yield_extended_trees, so its dependency pattern and
    label list are exactly one element longer than the input pattern's.
    """
    doc = util.doc_from_match(training_match)
    extended_trees = spacy_pattern_builder.yield_extended_trees(
        training_match.match_tokens)
    for extended_tokens in extended_trees:
        # Document order for label alignment with the token list
        extended_tokens = sorted(extended_tokens, key=lambda token: token.i)
        token_labels = role_pattern_builder.build_pattern_label_list(
            extended_tokens, training_match)
        dependency_pattern_variant = spacy_pattern_builder.build_dependency_pattern(
            doc, extended_tokens, feature_dict=feature_dict)
        # The extension adds exactly one node to the tree
        assert (len(dependency_pattern_variant)
                == len(role_pattern.spacy_dep_pattern) + 1)
        assert len(token_labels) == len(role_pattern.token_labels) + 1
        role_pattern_variant = RolePattern(dependency_pattern_variant, token_labels)
        # Depth order mirrors the ordering used inside the dependency pattern
        tokens_by_depth = spacy_pattern_builder.util.sort_by_depth(extended_tokens)
        role_pattern_variant.token_labels_depth_order = (
            role_pattern_builder.build_pattern_label_list(
                tokens_by_depth, training_match))
        role_pattern_variant.builder = role_pattern.builder
        new_training_match = RolePatternMatch(training_match)
        new_training_match.match_tokens = extended_tokens
        role_pattern_variant.training_match = new_training_match
        yield role_pattern_variant
def test_yield_extended_trees(self):
    """Every extended-tree variant should produce a distinct pattern that
    matches its own (sorted) token list."""
    doc = doc1
    # [We, introduce, methods]
    seed_tokens = util.idxs_to_tokens(doc, [0, 1, 3])
    feature_dict = {'DEP': 'dep_', 'TAG': 'tag_', 'LOWER': 'lower_'}
    # Build the seed pattern first to confirm the base tokens are valid input
    pattern = build_dependency_pattern(doc, seed_tokens, feature_dict)
    token_variants = list(yield_extended_trees(seed_tokens))
    pattern_variants = [
        build_dependency_pattern(doc, variant, feature_dict)
        for variant in token_variants
    ]
    assert not util.list_contains_duplicates(pattern_variants)
    n_variants = len(pattern_variants)
    for pattern_variant, tokens_variant in zip(pattern_variants, token_variants):
        matches = match.find_matches(doc, pattern_variant)
        # Matches are reported in document order
        tokens_variant = sorted(tokens_variant, key=lambda t: t.i)
        assert tokens_variant in matches
def build_role_pattern(
    match_example,
    feature_dict=DEFAULT_BUILD_PATTERN_TOKEN_FEATURE_DICT,
    validate_pattern=True,
):
    """Construct a RolePattern from a labelled match example.

    The example's tokens are expanded to their smallest connected dependency
    subgraph, a spaCy dependency pattern is built over that subgraph, and the
    role labels are attached in both document order and depth order.

    Raises RolePatternDoesNotMatchExample when validate_pattern is True and
    the constructed pattern fails to reproduce the example.
    """
    doc = util.doc_from_match(match_example)
    util.annotate_token_depth(doc)
    tokens = util.flatten_list(match_example.values())
    # Re-fetch tokens from the doc so they carry the newly added depth attribute
    tokens = [doc[idx] for idx in util.token_idxs(tokens)]
    nx_graph = util.doc_to_nx_graph(doc)
    match_tokens = util.smallest_connected_subgraph(tokens, nx_graph, doc)
    spacy_dep_pattern = spacy_pattern_builder.build_dependency_pattern(
        doc, match_tokens, feature_dict=feature_dict)
    token_labels = build_pattern_label_list(match_tokens, match_example)
    role_pattern = RolePattern(spacy_dep_pattern, token_labels)
    # Should be same order as the dependency pattern
    match_tokens_depth_order = spacy_pattern_builder.util.sort_by_depth(
        match_tokens)
    role_pattern.token_labels_depth_order = build_pattern_label_list(
        match_tokens_depth_order, match_example)
    if not validate_pattern:
        return role_pattern
    pattern_does_match_example, matches = validate.pattern_matches_example(
        role_pattern, match_example)
    if not pattern_does_match_example:
        spacy_dep_pattern = role_pattern.spacy_dep_pattern
        message = [
            'Unable to match example: \n{}'.format(pformat(match_example)),
            'From doc: {}'.format(doc),
            'Constructed role pattern: \n',
            'spacy_dep_pattern: \n{}'.format(pformat(spacy_dep_pattern)),
            'token_labels: \n{}'.format(
                pformat(role_pattern.token_labels_depth_order)),
        ]
        if matches:
            message.append('Matches found:')
            # "m" rather than "match" to avoid shadowing the match module
            for m in matches:
                message += [
                    'Match tokens: \n{}'.format(pformat(m.match_tokens)),
                    'Slots: \n{}'.format(pformat(m)),
                ]
        else:
            message.append('Matches found: None')
        message = '\n{}'.format('\n'.join(message))
        raise RolePatternDoesNotMatchExample(message)
    return role_pattern
def test_yield_node_level_pattern_variants(self):
    """Node-level variants should be unique, match the original tokens, and
    number feature_dicts ** pattern_length — both with and without the
    mutate_tokens restriction."""
    doc = doc1
    # [We, introduce, methods]
    base_tokens = util.idxs_to_tokens(doc, [0, 1, 3])
    base_feature_dict = {'DEP': 'dep_', 'TAG': 'tag_'}
    pattern = build_dependency_pattern(doc, base_tokens, base_feature_dict)
    feature_dicts = (
        {'DEP': 'dep_', 'TAG': 'tag_'},
        {'DEP': 'dep_', 'TAG': 'tag_', 'LOWER': 'lower_'},
    )

    def check_variants(variants):
        # Every variant must be distinct and still match the seed tokens
        assert not util.list_contains_duplicates(variants)
        assert len(variants) == len(feature_dicts) ** len(pattern)
        for variant in variants:
            assert base_tokens in match.find_matches(doc, variant)

    check_variants(list(
        yield_node_level_pattern_variants(pattern, base_tokens, feature_dicts)))
    # Test mutate_tokens parameter
    check_variants(list(
        yield_node_level_pattern_variants(
            pattern, base_tokens, feature_dicts,
            mutate_tokens=[base_tokens[1]])))
def test_build_pattern(self):
    """For every case: the built pattern matches its own example, hits all
    should-hit items, misses all should-miss items, and is serialized to
    examples/pattern_<i>.json."""
    feature_dict = {'DEP': 'dep_', 'TAG': 'tag_'}
    for i, case in enumerate(cases):
        doc = case['example']['doc']
        match_example = case['example']['match']
        pattern = build_dependency_pattern(doc, match_example, feature_dict)
        matches = match.find_matches(doc, pattern)
        assert match_example in matches, 'does not match example'
        pattern_file_name = 'examples/pattern_{}.json'.format(i)
        with open(pattern_file_name, 'w') as f:
            json.dump(pattern, f, indent=2)
        for item in case.get('should_hit', []):
            doc = item['doc']
            hit_match = item['match']
            matches = match.find_matches(doc, pattern)
            assert hit_match in matches, 'false negative'
        for item in case.get('should_miss', []):
            doc = item['doc']
            miss_match = item['match']
            matches = match.find_matches(doc, pattern)
            assert miss_match not in matches, 'false positive'
without needing to traverse tokens outside of the list. Otherwise, spacy-pattern-builder will raise a TokensNotFullyConnectedError. You can get a connected set that includes your tokens with the following: ''' from spacy_pattern_builder import util connected_tokens = util.smallest_connected_subgraph(match_tokens, doc) assert match_tokens == connected_tokens # Specify the token attributes / features to use feature_dict = { # This here is equal to the default feature_dict 'DEP': 'dep_', 'TAG': 'tag_' } # Build the pattern pattern = build_dependency_pattern(doc, match_tokens, feature_dict=feature_dict) from pprint import pprint pprint(pattern) # In the format consumed by SpaCy's DependencyTreeMatcher: ''' [{'PATTERN': {'DEP': 'ROOT', 'TAG': 'VBP'}, 'SPEC': {'NODE_NAME': 'node1'}}, {'PATTERN': {'DEP': 'nsubj', 'TAG': 'PRP'}, 'SPEC': {'NBOR_NAME': 'node1', 'NBOR_RELOP': '>', 'NODE_NAME': 'node0'}}, {'PATTERN': {'DEP': 'dobj', 'TAG': 'NNS'}, 'SPEC': {'NBOR_NAME': 'node1', 'NBOR_RELOP': '>', 'NODE_NAME': 'node3'}}] ''' # Create a matcher and add the newly generated pattern from spacy.matcher import DependencyTreeMatcher