class CausationInstance(_RelationInstance):
    Degrees = Enum(['Facilitate', 'Enable', 'Disentail', 'Inhibit'])
    CausationTypes = Enum(
        ['Consequence', 'Motivation', 'Purpose', 'Inference'])
    _types = CausationTypes
    _num_args = 3

    def __init__(self, source_sentence, degree=None, causation_type=None,
                 connective=None, cause=None, effect=None, means=None,
                 annotation_id=None):
        if degree is None:
            degree = len(self.Degrees)
        if causation_type is None:
            causation_type = len(self.CausationTypes)

        super(CausationInstance, self).__init__(
            source_sentence, connective, cause, effect, causation_type,
            annotation_id)
        self.degree = degree
        self.arg2 = means

    # Map argument attribute names to arg_i attributes.
    arg_names = bidict({'arg0': 'cause', 'arg1': 'effect', 'arg2': 'means'})

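# Construction sketch (hedged; `sentence` and the token lists are assumed to
# come from the surrounding corpus-reading code and are not defined here):
#
#   instance = CausationInstance(
#       sentence, degree=CausationInstance.Degrees.Facilitate,
#       causation_type=CausationInstance.CausationTypes.Consequence,
#       connective=connective_tokens, cause=cause_tokens,
#       effect=effect_tokens)
#   CausationInstance.arg_names['arg0']   # 'cause'
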
class OverlappingRelationInstance(_RelationInstance):
    RelationTypes = Enum([
        'Temporal', 'Correlation', 'Hypothetical', 'Obligation_permission',
        'Creation_termination', 'Extremity_sufficiency', 'Context'
    ])
    _types = RelationTypes

    def __init__(self, source_sentence, rel_type=None, connective=None,
                 arg0=None, arg1=None, annotation_id=None,
                 attached_causation=None):
        if rel_type is None:
            rel_type = set()  # overlapping rel can have multiple types

        all_args = locals().copy()
        del all_args['self']
        del all_args['attached_causation']
        super(OverlappingRelationInstance, self).__init__(**all_args)

        self.attached_causation = attached_causation

    def get_interpretable_type(self):
        if self.type:
            return set(self._types[t] for t in self.type)
        else:
            return set(['UNKNOWN'])

    def _get_type_str(self):
        return '+'.join(self.get_interpretable_type())

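# Illustration of the type rendering above (a hedged sketch, not code from the
# original project): an instance whose `type` set contains the indices of
# Temporal and Correlation yields set(['Temporal', 'Correlation']) from
# get_interpretable_type(), and _get_type_str() renders it as
# 'Temporal+Correlation' (member order depends on set iteration order).
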
class FeatureExtractor(object):
    FeatureTypes = Enum(['Categorical', 'Numerical', 'Binary'])

    '''
    Whether extract() can return features not registered by
    extract_subfeature_names when run on the same set of instances. Should be
    overridden in extractor classes where this is true.
    '''
    _EXTRACT_PRODUCES_VALUES_TO_IGNORE = False

    def __init__(self, name, extractor_fn, feature_type=None):
        if feature_type is None:
            feature_type = self.FeatureTypes.Categorical
        self.name = name
        self.feature_type = feature_type
        self._extractor_fn = extractor_fn

    def extract_subfeature_names(self, instances):
        if self.feature_type == self.FeatureTypes.Categorical:
            values_set = set(self._extractor_fn(part) for part in instances)
            return [self._get_categorical_feature_name(self.name, value)
                    for value in values_set]
        else:  # feature_type == Numerical or feature_type == Binary
            return [self.name]

    def extract(self, part):
        '''
        Returns a dictionary of subfeature name -> subfeature value. More
        complex feature extractor classes should override this function.
        '''
        feature_value = self._extractor_fn(part)
        if self.feature_type == self.FeatureTypes.Categorical:
            feature_name = self._get_categorical_feature_name(
                self.name, feature_value)
            return {feature_name: 1.0}
        else:  # feature_type == Numerical or feature_type == Binary
            return {self.name: feature_value}

    def extract_all(self, parts):
        return [self.extract(part) for part in parts]

    @staticmethod
    def _get_categorical_feature_name(base_name, value):
        return '%s=%s' % (base_name, value)

    def __repr__(self):
        return '<Feature extractor: %s>' % self.name

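# Usage sketch for FeatureExtractor (hypothetical extractors; the `token`
# objects and their `lemma`/`pos` attributes are assumptions, not part of
# this module):
#
#   lemma_extractor = FeatureExtractor('lemma', lambda token: token.lemma)
#   is_verb = FeatureExtractor(
#       'is_verb', lambda token: token.pos.startswith('VB'),
#       FeatureExtractor.FeatureTypes.Binary)
#   lemma_extractor.extract_subfeature_names(tokens)  # e.g., ['lemma=cause', ...]
#   lemma_extractor.extract(tokens[0])                # e.g., {'lemma=cause': 1.0}
#   is_verb.extract(tokens[0])                        # e.g., {'is_verb': False}
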
class StanfordNERStage(Stage):
    NER_TYPES = Enum(['Person', 'Organization', 'Location', 'O'])

    def __init__(self, name):
        self.name = name
        self.model = Model()

    def train(self, documents, instances_by_doc):
        pass

    def _test_documents(self, documents, sentences_by_doc, writer):
        model_path = path.join(FLAGS.stanford_ner_path, 'classifiers',
                               FLAGS.stanford_ner_model_name)
        jar_path = path.join(FLAGS.stanford_ner_path, FLAGS.stanford_ner_jar)
        tagger = SentenceSplitStanfordNERTagger(model_path, jar_path)

        tokens_by_sentence = [
            [StanfordParsedSentence.escape_token_text(token.original_text)
             # Omit fictitious tokens.
             for token in sentence.tokens if token.start_offset is not None]
            for sentence in chain.from_iterable(sentences_by_doc)]
        # Batch process sentences (faster than repeatedly running Stanford NLP)
        ner_results = tagger.tag_sents(tokens_by_sentence)

        all_sentences = chain.from_iterable(sentences_by_doc)
        for sentence, sentence_result in zip(all_sentences, ner_results):
            sentence_result_iter = iter(sentence_result)
            for token in sentence.tokens:
                if token.start_offset is None:  # Ignore fictitious tokens.
                    token.ner_tag = None
                else:
                    # Throws StopIteration if result is too short.
                    _token_text, tag = next(sentence_result_iter)
                    token.ner_tag = self.NER_TYPES.index(tag.title())
            # Make sure there are no extra tags for the sentence. NLTK is dumb.
            try:
                next(sentence_result_iter)
                assert(False)
            except StopIteration:
                pass

            if writer:
                writer.instance_complete(sentence)

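# Note on the tag normalization above (hedged): Stanford NER emits tags such
# as 'PERSON', 'ORGANIZATION', 'LOCATION', or 'O'. Calling .title() maps
# 'PERSON' -> 'Person' and leaves 'O' unchanged, so NER_TYPES.index() can look
# the tag up by its enum member name and store the resulting index on
# token.ner_tag.
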
class RegexConnectiveModel(Model):
    def __init__(self, *args, **kwargs):
        super(RegexConnectiveModel, self).__init__(*args, **kwargs)
        self.regexes = []

    def _train_model(self, sentences):
        self.regexes = [
            (re.compile(pattern), matching_groups)
            for pattern, matching_groups in self._extract_patterns(sentences)
        ]

    def test(self, sentences):
        logging.info('Tagging possible connectives...')
        start_time = time.time()

        for sentence in sentences:
            sentence.possible_causations = []

            tokens = sentence.tokens[1:]  # skip ROOT
            if FLAGS.regex_include_pos:
                lemmas_to_match = ['%s/%s' % (token.lemma, token.get_gen_pos())
                                   for token in tokens]
            else:
                lemmas_to_match = [token.lemma for token in tokens]
            # Remember bounds of tokens so that we can recover the correct
            # tokens from regex matches.
            token_bounds = []
            # Final space eases matching
            string_to_match = ' '.join(lemmas_to_match) + ' '
            next_start = 0
            for lemma in lemmas_to_match:
                token_bounds.append((next_start, next_start + len(lemma)))
                next_start += len(lemma) + 1

            # More than one pattern may match a given connective. We record
            # which patterns matched which sets of connective words.
            matches = defaultdict(list)
            for regex, matching_group_indices in self.regexes:
                match = regex.search(string_to_match)
                while match is not None:
                    # We need to add 1 to indices to account for root.
                    token_indices = tuple(
                        token_bounds.index(match.span(i)) + 1
                        for i in matching_group_indices)
                    matches[token_indices].append(regex.pattern)
                    # Skip past the first token that matched to start looking
                    # for the next match. This ensures that we won't match the
                    # same connective twice with this pattern.
                    # (We start from the end of the first group *after* the
                    # pattern start group.)
                    match = regex.search(string_to_match, pos=match.span(2)[1])

            for token_indices, matching_patterns in matches.items():
                connective_tokens = [sentence.tokens[i] for i in token_indices]
                true_causation_instance = None
                for causation_instance in sentence.causation_instances:
                    if causation_instance.connective == connective_tokens:
                        true_causation_instance = causation_instance

                possible_causation = PossibleCausation(
                    sentence, matching_patterns, connective_tokens,
                    true_causation_instance)
                sentence.possible_causations.append(possible_causation)

        elapsed_seconds = time.time() - start_time
        logging.info("Done tagging possible connectives in %0.2f seconds"
                     % elapsed_seconds)

    #####################################
    # Sentence preprocessing
    #####################################

    @staticmethod
    def _filter_sentences_for_pattern(sentences, pattern, connective_lemmas):
        possible_sentence_indices = []
        for i, sentence in enumerate(sentences):
            token_lemmas = [token.lemma for token in sentence.tokens]
            # TODO: Should we filter here by whether there are enough tokens
            # in the sentence to match the rest of the pattern, too?
            if all([connective_lemma in token_lemmas
                    for connective_lemma in connective_lemmas]):
                possible_sentence_indices.append(i)
        return possible_sentence_indices

    #####################################
    # Pattern generation
    #####################################

    CONNECTIVE_INTERJECTION_PATTERN = ARG_WORDS_PATTERN = '([\S]+ )+?'
    # Pattern can start after another word, or @ start of sentence
    PATTERN_START = '(^| )'
    TokenTypes = Enum(['Connective', 'Cause', 'Effect'])  # Also possible: None

    @staticmethod
    def _get_pattern(sentence, connective_tokens, cause_tokens, effect_tokens):
        connective_capturing_groups = []
        pattern = RegexConnectiveModel.PATTERN_START
        next_group_index = 2  # whole match is 0, and pattern start will add 1

        previous_token_type = None
        connective_tokens.sort(key=lambda token: token.index)  # just in case
        next_connective_index = 0
        for token in sentence.tokens[1:]:
            if (next_connective_index < len(connective_tokens)
                and token.index ==
                    connective_tokens[next_connective_index].index):
                # We ensure above that every token lemma in the tested string
                # has a space after it, even the last token, so space is safe.
                if FLAGS.regex_include_pos:
                    pattern += '(%s/%s) ' % (token.lemma, token.get_gen_pos())
                else:
                    pattern += '(%s) ' % token.lemma
                previous_token_type = (
                    RegexConnectiveModel.TokenTypes.Connective)
                connective_capturing_groups.append(next_group_index)
                next_group_index += 1
                next_connective_index += 1
            else:
                if token in cause_tokens:
                    token_type = RegexConnectiveModel.TokenTypes.Cause
                elif token in effect_tokens:
                    token_type = RegexConnectiveModel.TokenTypes.Effect
                else:
                    token_type = None

                if previous_token_type != token_type:
                    if token_type is None:
                        # It's possible for a connective to be interrupted by
                        # a word that's not consistent enough to make it count
                        # as a connective token (e.g., a determiner).
                        if (token.index > connective_tokens[0].index
                            and next_connective_index
                                < len(connective_tokens)):
                            # We're in the middle of the connective
                            pattern += (RegexConnectiveModel.
                                        CONNECTIVE_INTERJECTION_PATTERN)
                            next_group_index += 1
                    else:
                        # we've transitioned from non-argument to argument
                        pattern += RegexConnectiveModel.ARG_WORDS_PATTERN
                        next_group_index += 1
                previous_token_type = token_type

        return pattern, connective_capturing_groups

    @staticmethod
    def _extract_patterns(sentences):
        # TODO: Extend this to work with cases of missing arguments.
        regex_patterns = []
        patterns_seen = set()

        if FLAGS.print_patterns:
            print 'Patterns:'
        for sentence in sentences:
            for instance in sentence.causation_instances:
                connective = instance.connective
                cause_tokens, effect_tokens = [
                    arg if arg is not None else []
                    for arg in [instance.cause, instance.effect]]

                pattern, connective_capturing_groups = (
                    RegexConnectiveModel._get_pattern(
                        sentence, connective, cause_tokens, effect_tokens))

                if pattern not in patterns_seen:
                    if FLAGS.print_patterns:
                        print ' ', pattern.encode('utf-8')
                        print ' Sentence:', sentence.original_text.encode(
                            'utf-8')
                        print
                    patterns_seen.add(pattern)
                    regex_patterns.append(
                        (pattern, connective_capturing_groups))
        return regex_patterns

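# Worked illustration of the matching bookkeeping in test() above (the lemmas
# are made up for the example; nothing here comes from the original corpus):
#
#   lemmas_to_match = ['this', 'happen', 'because', 'of', 'that']
#   string_to_match = 'this happen because of that '
#   token_bounds    = [(0, 4), (5, 11), (12, 19), (20, 22), (23, 27)]
#
# A pattern such as '(^| )([\S]+ )+?(because) (of) ([\S]+ )+?' built by
# _get_pattern() captures each connective lemma in its own group, so a group
# matching 'because' reports span (12, 19); token_bounds.index((12, 19)) + 1
# then recovers that token's index in sentence.tokens (+1 accounts for ROOT).
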
class StanfordParsedSentence(object):
    PTB_ESCAPE_MAP = {'*': '\\*', '. . .': '...', '(': '-LRB-', ')': '-RRB-',
                      '{': '-LCB-', '}': '-RCB-', '[': '-LSB-', ']': '-RSB-'}
    PTB_UNESCAPE_MAP = {}  # filled in later from PTB_ESCAPE_MAP below
    # TODO: Should we be allowing the parser to PTB-escape more things?
    PERIOD_SUBSTITUTES = '.:'
    SUBJECT_EDGE_LABELS = ['nsubj', 'csubj', 'nsubjpass', 'csubjpass']
    INCOMING_CLAUSE_EDGES = ['ccomp', 'xcomp', 'csubj', 'csubjpass', 'advcl',
                             'acl', 'acl:relcl']  # TODO: allow conj/parataxis?
    EDGE_REGEX = re.compile(
        "([A-Za-z_\\-/\\.':]+)\\((.+)-(\\d+)('*), (.+)-(\\d+)('*)\\)")
    DEPTH_EXCLUDED_EDGE_LABELS = ['ref']

    def __init__(self, tokenized_text, tagged_lemmas, penn_tree, edges,
                 document_text):
        '''
        `tokenized_text` and `tagged_lemmas` are the token and lemma strings
        from the parser.
        `edges` is a list of edge strings from the parser.
        `document_text` is an instance of
        `util.streams.CharacterTrackingStreamWrapper`. (Built-in stream types
        will *not* work.)
        '''

        self.next_sentence = None
        self.previous_sentence = None

        # TODO: move much of the initialization functionality, particularly
        # aligning tokens to text, into the reader class.
        self.tokens = []
        self.edge_labels = {}  # maps (n1_index, n2_index) tuples to labels
        try:
            self.source_file_path = document_text.name
        except AttributeError:
            self.source_file_path = None

        # Declare a few variables that will be overwritten later, just so that
        # it's easy to tell what's in an instance of this class.
        self.edge_graph = csr_matrix((0, 0), dtype='float')
        self.document_char_offset = 0
        self.original_text = ''
        self.__depths = np.array([])
        self.path_predecessors = np.array([[]])
        self.path_costs = np.array([[]])

        token_strings, tag_strings = self.__get_token_strings(tokenized_text,
                                                              tagged_lemmas)
        copy_node_indices = self.__create_tokens(token_strings, tag_strings)
        self.__align_tokens_to_text(document_text)
        self.__create_edges(edges, copy_node_indices)

        if FLAGS.use_constituency_parse:
            self.constituency_tree = ImmutableParentedTree.fromstring(
                penn_tree)
            self.constituency_graph = nltk_tree_to_graph(
                self.constituency_tree)
            self.constituent_heads = collins_find_heads(self.constituency_tree)
        else:
            self.constituency_tree = None
            self.constituency_graph = None
            self.constituent_heads = None

    def __deepcopy__(self, memo):
        # Avoid massive memory and stack demands
        next_sent, prev_sent = self.next_sentence, self.previous_sentence
        self.next_sentence, self.previous_sentence = None, None

        copied = self.__class__.__new__(self.__class__)
        memo[id(self)] = copied
        for k, v in self.__dict__.items():
            setattr(copied, k, deepcopy(v, memo))

        self.next_sentence, self.previous_sentence = next_sent, prev_sent
        return copied

    @staticmethod
    def unescape_token_text(token_text):
        token_text = token_text.replace(u'\xa0', ' ')
        return StanfordParsedSentence.PTB_UNESCAPE_MAP.get(token_text,
                                                           token_text)

    @staticmethod
    def escape_token_text(token_text):
        token_text = token_text.replace(' ', u'\xa0')
        return StanfordParsedSentence.PTB_ESCAPE_MAP.get(token_text,
                                                         token_text)

    @staticmethod
    def get_text_for_tokens(annotation_tokens):
        try:
            return ' '.join([token.original_text
                             for token in annotation_tokens])
        except TypeError:  # Happens if None is passed
            return ''

    def get_depth(self, token):
        return self.__depths[token.index]

    def _token_is_preferred_for_head_to(self, new_token, old_token):
        # If the depths are equal, prefer verbs/copulas over nouns, and
        # nouns over others.
        # This helps to get the correct heads for
        # fragmented spans, such as spans that consist of an xcomp and its
        # subject, as well as a few other edge cases.
        if self.is_clause_head(old_token):
            return False
        elif self.is_clause_head(new_token):
            return True
        elif old_token.pos in Token.NOUN_TAGS:
            return False
        elif new_token.pos in Token.NOUN_TAGS:
            return True
        else:
            return False

    def get_head(self, tokens):
        # TODO: Update to match SEMAFOR's heuristic algorithm?
        min_depth = np.inf
        head = None
        for token in tokens:
            # Ignore annotation tokens from outside this sentence.
            # Really this check should be an is, but we use != to make it work
            # on Frozen sentences.
            if token.parent_sentence != self:
                continue

            depth = self.get_depth(token)
            parent_of_current_head = head in self.get_children(token, '*')
            child_of_current_head = (head is not None and
                                     token in self.get_children(head, '*'))
            if ((depth < min_depth or parent_of_current_head)
                and not child_of_current_head):
                head = token
                min_depth = depth
            elif (depth == min_depth and head is not None
                  and self._token_is_preferred_for_head_to(token, head)):
                logging.debug(
                    u"Preferring %s over %s as head of '%s' in '%s'"
                    % (token, head,
                       u' '.join([t.original_text for t in tokens]),
                       tokens[0].parent_sentence.original_text))
                head = token
                min_depth = depth

        if head is None:
            logging.warn('Returning null head for tokens %s' % tokens)
        return head

    def count_words_between(self, token1, token2):
        '''
        Counts words between tokens based purely on the token IDs, discounting
        punctuation tokens.
        '''
        assert (self.tokens[token1.index] == token1 and
                self.tokens[token2.index] == token2), "Tokens not in sentence"
        switch = token1.index > token2.index
        if switch:
            token1, token2 = token2, token1
        words_between = -1
        for token in self.tokens[token1.index : token2.index + 1]:
            if token.pos[0].isalnum():
                words_between += 1
        # return -words_between if switch else words_between
        return words_between

    def get_most_direct_parent(self, token):
        '''
        Returns a tuple (e, p), where p is the parent of the given token along
        the shortest path to root, and e is the label of the edge from p to
        token.
        '''
        if token.parent_sentence is not self:
            return (None, None)  # not in the parse tree
        # We can't use self.path_predecessors because it was computed in an
        # essentially undirected fashion. Instead, we find all parents, and
        # select the one whose directed depth is lowest (i.e., with the
        # shortest directed path to root).
        incoming = self.edge_graph[:, token.index]
        nonzero = incoming.nonzero()[0]
        if not nonzero.any():
            return (None, None)

        min_depth = np.inf
        for edge_start_index in nonzero:
            next_depth = self.__depths[edge_start_index]
            if next_depth < min_depth:
                min_depth = next_depth
                parent_index = edge_start_index
        edge_label = self.edge_labels[(parent_index, token.index)]
        return (edge_label, self.tokens[parent_index])

    def get_children(self, token, edge_type=None):
        '''
        If `edge_type` is given, returns a list of children of token related
        by an edge with label edge_type. Otherwise, returns a list of
        (edge_label, child_token) tuples.
        `edge_type` may be a single type or a list of types.
        The special value '*' indicates that all children should be returned,
        without edge labels.
        '''
        if token.parent_sentence is not self:
            return []  # not in the parse tree
        # Grab the sparse column of the edge matrix with the edges of this
        # token. Iterate over the edge end indices therein.
        if edge_type:
            if edge_type == '*':
                return [self.tokens[edge_end_index] for edge_end_index
                        in self.edge_graph[token.index].indices]
            else:
                edge_type = listify(edge_type)
                return [self.tokens[edge_end_index] for edge_end_index
                        in self.edge_graph[token.index].indices
                        if (self.edge_labels[(token.index, edge_end_index)]
                            in edge_type)]
        else:
            return [(self.edge_labels[(token.index, edge_end_index)],
                     self.tokens[edge_end_index])
                    for edge_end_index
                    in self.edge_graph[token.index].indices]

    def is_copula_head(self, token):
        if token.parent_sentence is not self:
            return False
        # Grab the sparse column of the edge matrix with the edges of this
        # token, and check the labels on each non-zero edge.
        for edge_end_index in self.edge_graph[token.index].indices:
            # A copula edge to a child also indicates a clause.
            if self.edge_labels[(token.index, edge_end_index)] == 'cop':
                return True
        return False

    def is_clause_head(self, token):
        if token.parent_sentence is not self:
            return False
        if token.pos == 'ROOT':
            return False
        try:
            Token.VERB_TAGS.index(token.pos)
            if token.pos != 'MD':  # Modals, though verbs, aren't clause heads
                return True
        except ValueError:  # this POS wasn't in the list
            if self.is_copula_head(token):
                return True

            incoming = self.edge_graph[:, token.index]
            for edge_start_index in incoming.nonzero()[0]:
                # An incoming clause edge also indicates a clause.
                if (self.edge_labels[(edge_start_index, token.index)]
                        in self.INCOMING_CLAUSE_EDGES):
                    return True

        return False

    def extract_dependency_path(self, source, target, include_conj=True):
        assert (source.parent_sentence is self
                and target.parent_sentence is self)
        edges = []
        while target is not source:
            predecessor_index = self.path_predecessors[source.index,
                                                       target.index]
            if predecessor_index == -9999:
                raise DependencyPathError(source, target)
            predecessor = self.tokens[predecessor_index]

            try:
                # Normal case: the predecessor is the source of the edge.
                label = self.edge_labels[(predecessor_index, target.index)]
                start, end = predecessor, target
            except KeyError:
                # Back edge case: the predecessor is the target of the edge.
                label = self.edge_labels[(target.index, predecessor_index)]
                start, end = target, predecessor
            if label != 'conj' or include_conj:
                edges.append((start, end, label))
            target = predecessor
        return DependencyPath(source, reversed(edges))

    def get_closest_of_tokens(self, source, possible_targets, use_tree=True):
        '''
        Finds the token among possible_targets closest to source. If use_tree
        is True, distance is determined by distance in the parse tree;
        otherwise, distance is simple lexical distance (which may be
        negative). Returns the token, along with its distance. If none of the
        possible targets is reachable, returns (None, np.inf).
        '''
        if source.parent_sentence is not self:
            return (None, np.inf)
        if not possible_targets:
            raise ValueError("Can't find closest of 0 tokens")

        min_distance = np.inf
        for target in possible_targets:
            if target.parent_sentence is not self:
                continue
            if use_tree:
                next_distance = self.path_costs[source.index, target.index]
            else:
                next_distance = source.index - target.index
            if next_distance < min_distance:
                closest = target
                min_distance = next_distance
        if min_distance == np.inf:  # source or all targets aren't in tree
            closest = None
        return closest, min_distance

    def get_constituency_node_for_tokens(self, tokens):
        # Token indices include ROOT, so we subtract 1 to get indices that
        # will match NLTK's leaf indices.
        indices = [token.index - 1 for token in tokens
                   if token.parent_sentence is self]
        try:
            treeposition = self.constituency_tree.treeposition_spanning_leaves(
                min(indices), max(indices) + 1)  # +1 b/c of Python-style ranges
        except AttributeError:  # self.constituency_tree is None
            if not FLAGS.use_constituency_parse:
                raise ValueError('Constituency parses not in use')
            else:
                raise

        node = self.constituency_tree[treeposition]
        if not isinstance(node, Tree):  # We got a treeposition of a leaf string
            node = self.constituency_tree[treeposition[:-1]]

        return node

    def get_token_for_constituency_node(self, node):
        if not is_parent_of_leaf(node):
            raise ValueError("Node is not a parent of a leaf: %s" % node)
        node_leaf = node[0]
        for i, leaf in enumerate(node.root().leaves()):
            if leaf is node_leaf:  # identity, not equality
                return self.tokens[i]
        if not FLAGS.use_constituency_parse:
            raise ValueError('Constituency parses not in use')
        else:
            raise ValueError("Somehow you passed a node whose leaf isn't under"
                             " its root. Wow.")

    DOMINATION_DIRECTION = Enum(['Dominates', 'DominatedBy', 'Independent'])

    def get_domination_relation(self, token1, token2):
        # TODO: do we need to worry about conj's here?
        path = self.extract_dependency_path(token1, token2, True)
        last_node = path.start
        all_forward = True
        all_backward = True
        for source, target, _dep_name in path:
            if source is last_node:  # forward edge
                all_backward = False
                if not all_forward:
                    break
                last_node = target
            else:  # back edge
                all_forward = False
                if not all_backward:
                    break
                last_node = source
        if all_forward:
            return self.DOMINATION_DIRECTION.Dominates
        elif all_backward:
            return self.DOMINATION_DIRECTION.DominatedBy
        else:
            return self.DOMINATION_DIRECTION.Independent

    @staticmethod
    def is_contiguous(tokens):
        last_index = tokens[0].index
        sentence = tokens[0].parent_sentence
        for token in tokens[1:]:
            if sentence is token.parent_sentence and (
                token.pos in Token.PUNCT_TAGS
                or token.index == last_index + 1):
                last_index = token.index
                sentence = token.parent_sentence
            else:
                return False
        return True

    ###########################################
    # Private initialization support functions
    ###########################################

    @staticmethod
    def __get_token_strings(tokenized_text, tagged_lemmas):
        '''
        This is basically a wrapper for the string split function, which also
        combines adjacent tokens if there are spaces within tokens. This is
        detected by looking for a lack of a '/' in the tagged lemma.
        '''
        token_strings = tokenized_text.split(' ')
        lemma_strings = tagged_lemmas.split(' ')
        assert len(token_strings) == len(lemma_strings), (
            "Tokens do not match tags")

        if all('/' in lemma for lemma in lemma_strings):
            return token_strings, lemma_strings

        final_token_strings = []
        final_lemma_strings = []
        tokens_to_accumulate = []
        lemmas_to_accumulate = []
        for token, lemma in zip(token_strings, lemma_strings):
            tokens_to_accumulate.append(token)
            lemmas_to_accumulate.append(lemma)
            if '/' in lemma:
                final_token_strings.append(' '.join(tokens_to_accumulate))
                final_lemma_strings.append(' '.join(lemmas_to_accumulate))
                tokens_to_accumulate = []
                lemmas_to_accumulate = []
        return final_token_strings, final_lemma_strings

    def __create_tokens(self, token_strings, tag_strings):
        # We need one more node than we have token strings (for root).
        copy_node_indices = [None for _ in range(len(token_strings) + 1)]
        root = self.__add_new_token('', 'ROOT', 'ROOT')
        copy_node_indices[0] = [root.index]

        for i, (token_str, tag_str) in (
                enumerate(zip(token_strings, tag_strings))):
            # Can't use str.partition because there may be a '/' in the token.
            slash_index = tag_str.rindex('/')
            lemma = tag_str[:slash_index]
            pos = tag_str[slash_index + 1:]
            new_token = self.__add_new_token(
                self.unescape_token_text(token_str), pos, lemma)
            # Detect duplicated tokens.
            if (lemma == '.' and pos == '.'
                # Previous token is in self.tokens[i], not i-1: root is 0.
                and self.tokens[i].original_text.endswith('.')):
                new_token.is_absent = True

            copy_node_indices[i + 1] = [new_token.index]

        return copy_node_indices

    def __add_new_token(self, *args, **kwargs):
        new_token = Token(len(self.tokens), self, *args, **kwargs)
        self.tokens.append(new_token)
        return new_token

    def __align_tokens_to_text(self, document_text):
        eat_whitespace(document_text)
        self.document_char_offset = document_text.character_position

        # Root has no alignment to source.
        self.tokens[0].start_offset = None
        self.tokens[0].end_offset = None

        non_root_tokens = self.tokens[1:]
        for i, token in enumerate(non_root_tokens):
            # i is one less than the index of the current token in
            # self.tokens, because root.
            original = token.original_text
            if token.is_absent:
                # Handle case of duplicated character, which is the only type
                # of absent token that will have been detected so far.
                prev_token = self.tokens[i]
                if prev_token.original_text.endswith(original):
                    # print "Found duplicated token:", (
                    #     token.original_text.encode('utf-8'))
                    token.start_offset = prev_token.end_offset - len(original)
                    token.end_offset = prev_token.end_offset
            elif original == '.' and i == len(non_root_tokens) - 1:
                # End-of-sentence period gets special treatment: the "real"
                # original text may have been a period substitute or missing.
                # (Other things can get converted to fake end-of-sentence
                # periods to make life easier for the parser.)
                start_pos = document_text.tell()
                eaten_ws = eat_whitespace(document_text, True)
                not_at_eof = not is_at_eof(document_text)
                next_char, next_is_period_sub = peek_and_revert_unless(
                    document_text,
                    lambda char: self.PERIOD_SUBSTITUTES.find(char) != -1)
                if (not_at_eof and next_is_period_sub):
                    # We've moved the stream over the period, so adjust offset.
                    token.start_offset = (document_text.character_position
                                          - self.document_char_offset - 1)
                    token.end_offset = token.start_offset + 1
                    token.original_text = next_char
                    self.original_text += eaten_ws + next_char
                else:
                    # The period is actually not there.
                    token.is_absent = True
                    token.original_text = ''
                    document_text.seek(start_pos)
            else:  # Normal case: just read the next token.
                search_start = document_text.character_position
                # Our preprocessing may hallucinate periods onto the ends of
                # abbreviations, particularly "U.S." Deal with them.
                if original[-1] == '.':
                    token_text_to_find = original[:-1]
                else:
                    token_text_to_find = original

                text_until_token, found_token = (
                    read_stream_until(document_text, token_text_to_find, True))
                self.original_text += text_until_token
                assert found_token, (
                    (u'Could not find token "%s" starting at position %d '
                     '(accumulated: %s)') % (
                        original, search_start,
                        self.original_text)).encode('utf-8')

                if original[-1] == '.':
                    # If it ends in a period, and the next character in the
                    # stream is a period, it's a duplicated period. Advance
                    # over the period and append it to the accumulated text.
                    _, is_period = peek_and_revert_unless(
                        document_text, lambda char: char == '.')
                    if is_period:
                        self.original_text += '.'
                token.end_offset = (document_text.character_position
                                    - self.document_char_offset)
                token.start_offset = token.end_offset - len(original)

            '''
            if not token.is_absent:
                print "Annotated token span: ", token.start_offset, ",", \
                    token.end_offset, 'for', \
                    token.original_text.encode('utf-8') + '. Annotated text:',\
                    (self.original_text[token.start_offset:token.end_offset]
                     ).encode('utf-8')
            '''

    def __make_token_copy(self, token_index, copy_num, copy_node_indices):
        copies = copy_node_indices[token_index]
        token = self.tokens[token_index]
        while copy_num >= len(copies):
            self.__add_new_token(token.original_text, token.pos, token.lemma,
                                 token.start_offset, token.end_offset,
                                 token.is_absent, token)
            copies.append(len(self.tokens) - 1)

    def __create_edges(self, edges, copy_node_indices):
        edge_lines = [line for line in edges if line]  # skip blanks
        matches = [StanfordParsedSentence.EDGE_REGEX.match(edge_line)
                   for edge_line in edge_lines]

        # First, we need to create tokens for all the copy nodes so that we
        # have the right size matrix for the graph.
        for match_result, edge_line in zip(matches, edge_lines):
            assert match_result, \
                'Improperly constructed edge line: %s' % edge_line
            arg1_index, arg1_copy, arg2_index, arg2_copy = \
                match_result.group(3, 4, 6, 7)
            self.__make_token_copy(int(arg1_index), len(arg1_copy),
                                   copy_node_indices)
            self.__make_token_copy(int(arg2_index), len(arg2_copy),
                                   copy_node_indices)

        # Now, we can actually create the matrix and insert all the edges.
        num_nodes = len(self.tokens)
        self.edge_graph = lil_matrix((num_nodes, num_nodes), dtype='float')
        graph_excluded_edges = []  # edges that shouldn't be used for graph algs
        for match_result in matches:
            (relation, _arg1_lemma, arg1_index, arg1_copy,
             _arg2_lemma, arg2_index, arg2_copy) = match_result.group(
                *range(1, 8))
            arg1_index = int(arg1_index)
            arg2_index = int(arg2_index)

            token_1_idx = copy_node_indices[arg1_index][len(arg1_copy)]
            token_2_idx = copy_node_indices[arg2_index][len(arg2_copy)]
            # TODO: What should we do about the cases where there are
            # multiple labels for the same edge? (e.g., conj and ccomp)
            self.edge_labels[(token_1_idx, token_2_idx)] = relation
            if relation in self.DEPTH_EXCLUDED_EDGE_LABELS:
                graph_excluded_edges.append((token_1_idx, token_2_idx))
            else:
                self.edge_graph[token_1_idx, token_2_idx] = 1.0
        self._initialize_graph(graph_excluded_edges)

    def _initialize_graph(self, graph_excluded_edges):
        # Convert to CSR for shortest path (which would do it anyway) and to
        # make self.get_children() below work.
        self.edge_graph = self.edge_graph.tocsr()

        self.__depths = bfs_shortest_path_costs(self.edge_graph, 0)

        '''
        For the undirected shortest paths we save, we'll want to:
        a) prefer xcomp-> __ ->nsubj paths over nsubj-> __ <-nsubj and
           nsubj<- __ ->xcomp paths.
        b) disprefer paths that rely on expletives and acls.
        c) treat the graph as undirected, EXCEPT for edges where we already
           have a reverse edge, in which case that edge's weight should be
           left alone.
        We adjust the graph accordingly.
        '''
        # Adjust edge weights to make better paths preferred.
        for edge, label in self.edge_labels.iteritems():
            if label == 'xcomp':
                self.edge_graph[edge] = 0.98
                edge_end_token = self.tokens[edge[1]]
                subj_children = self.get_children(edge_end_token,
                                                  self.SUBJECT_EDGE_LABELS)
                for child in subj_children:
                    self.edge_graph[edge[1], child.index] = 0.985
            elif label == 'expl' or label.startswith('acl'):
                self.edge_graph[edge] = 1.01

        # Create duplicate edges to simulate undirectedness, EXCEPT where we
        # already have an edge in the opposite direction. For this we use a
        # copy of the graph, since we don't actually want to pollute
        # edge_graph with the reverse arcs.
        pseudo_unweighted_graph = self.edge_graph.tolil()
        nonzero = set([(i, j) for (i, j) in zip(
            *pseudo_unweighted_graph.nonzero())])
        for (i, j) in nonzero:
            if (j, i) not in nonzero:
                pseudo_unweighted_graph[j, i] = pseudo_unweighted_graph[i, j]

        self.path_costs, self.path_predecessors = csgraph.shortest_path(
            pseudo_unweighted_graph, return_predecessors=True, directed=True)

        # Add in edges that we didn't want to use for distances/shortest path,
        # ignoring all the changes made to undirected_graph.
        # (Originally we were converting to LIL for efficiency, but it turned
        # out to hurt performance more than it helped.)
        # TODO: Should we convert if there were excluded edges?
        # self.edge_graph = self.edge_graph.tolil()
        for start, end in graph_excluded_edges:
            self.edge_graph[start, end] = 1.0
        self.edge_graph = self.edge_graph.tocsr()

    def __unicode__(self):
        parse_lines = [u'%s(%s-%d, %s-%d)'
                       % (label, self.tokens[edge[0]].lemma, edge[0],
                          self.tokens[edge[1]].lemma, edge[1])
                       for edge, label in sorted(self.edge_labels.iteritems())]
        return u'%s\n\n%s' % (self.original_text, u'\n'.join(parse_lines))

    def __str__(self):
        return self.__unicode__().encode('utf-8')

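# Hedged illustration of the edge-string format consumed by __create_edges()
# above (the example line is invented, but follows the Stanford dependency
# style that EDGE_REGEX expects):
#
#   m = StanfordParsedSentence.EDGE_REGEX.match("nsubj(barks-2, dog-1)")
#   m.group(1)              # 'nsubj'  (relation label)
#   m.group(2), m.group(3)  # 'barks', '2'  (governor lemma and index)
#   m.group(4)              # ''  (copy marks; apostrophes indicate copy nodes)
#   m.group(5), m.group(6)  # 'dog', '1'  (dependent lemma and index)
#   m.group(7)              # ''  (dependent copy marks)
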
            0]  # ndarray-like nonzero produces only 1 dim
    else:
        incoming = graph.getrow(node)
        return incoming.nonzero()[1]


class CycleError(Exception):
    def __init__(self):
        super(CycleError, self).__init__("Cycle detected; graph is not a DAG")


def topological_sort(tree, algorithm='tarjan'):
    return tarjan_topological_sort(tree)


_SORT_MARKS = Enum(['Unvisited', 'Visiting', 'Visited'])


def tarjan_topological_sort(tree):
    sorted_nodes = []
    # assumes a square matrix, which a graph should be
    marks = [_SORT_MARKS.Unvisited] * tree.shape[0]

    def visit(node):
        if isinstance(node, np.ndarray):
            raise Exception()
        if marks[node] == _SORT_MARKS.Visiting:
            raise CycleError()
        if marks[node] != _SORT_MARKS.Visited:
            marks[node] = _SORT_MARKS.Visiting
            for child in get_outgoing_indices(tree, node):
    causations_to_keep = []
    # Process connectives biggest to smallest, discarding any that reuse
    # tokens. If we have two connectives of the same length competing for a
    # token, this will arbitrarily choose the first one we find.
    for connective_length in sorted(causations_by_size.keys(), reverse=True):
        for causation in causations_by_size[connective_length]:
            for conn_token in causation.connective:
                if tokens_used[conn_token.index]:
                    break
            else:  # Executes only if loop over tokens didn't break
                causations_to_keep.append(causation)
                for conn_token in causation.connective:
                    tokens_used[conn_token.index] = True

    sentence.causation_instances = causations_to_keep


RELATIVE_POSITIONS = Enum(['Before', 'Overlapping', 'After'])


def get_causation_tuple(connective_tokens, cause_head, effect_head):
    return (tuple(t.index for t in connective_tokens),
            cause_head.index if cause_head else None,
            effect_head.index if effect_head else None)


# Add some Colorama functionality.
for style, code in [('UNDERLINE', 4), ('BLINK', 5)]:
    setattr(colorama.Style, style, '\033[%dm' % code)
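
# Usage sketch for the styles added above (hedged; RESET_ALL is part of
# colorama's standard API, while UNDERLINE and BLINK are the attributes set
# just above):
#
#   print colorama.Style.UNDERLINE + 'connective' + colorama.Style.RESET_ALL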