def get_candidates(self, text_order, candidate_order, mention, index_sent):
    """Return the candidates of ``mention`` in the order the sieve checks them.

    When the mention is a pronoun and the second element of its ``"entity"``
    entry holds exactly one item (presumably a still-unresolved singleton
    entity -- confirm against the entity structure), the candidates are:

    1. the mentions that precede it in its own sentence, as ordered by
       ``self.pronoun_order`` (reversed in place for relative pronouns);
    2. every mention of the previous sentences, nearest sentence first.

    In any other case the ordering is delegated to the parent sieve.

    :param text_order: The list of sentences that contain the list of
        mentions that form the text.
    :param candidate_order: The same sentences with their mentions in the
        candidate (bts) order.
    :param mention: The mention whose candidates we need.
    :param index_sent: The index of the current sentence.
    :rtype: list
    :return: A list of ordered candidates.
    """
    current_sentence = candidate_order[index_sent]
    sentence_ids = [item[ID] for item in current_sentence]
    position = sentence_ids.index(mention["id"])

    single_mention_entity = len(current_sentence[position]["entity"][1]) == 1
    if not (single_mention_entity and self.is_pronoun(mention)):
        # Not a pronoun of a one-mention entity: use the default ordering.
        return super(PronounSieve, self).get_candidates(
            text_order, candidate_order, mention, index_sent)

    self.logger.debug("ORDERING: pronoun order")
    same_sentence = self.pronoun_order(current_sentence[:position], mention)

    # Mentions of earlier sentences, walking backwards from the current one.
    previous_sentences = []
    for sentence in reversed(text_order[:index_sent]):
        previous_sentences.extend(sentence)

    if pronouns.relative(mention[FORM].lower()):
        self.logger.debug("ORDERING: Relative pronoun order")
        same_sentence.reverse()

    return same_sentence + previous_sentences
def is_relative_pronoun(graph_builder, first_constituent, second_constituent):
    """Check if two constituents are in a relative pronoun construction.

    Pattern being approximated::

        NP < (NP=m1 $.. (SBAR < (WHNP < WP|WDT=m2)))

    The function only tests the construction; it does not modify either
    constituent.

    :param graph_builder: The graph manager used to query the syntactic tree.
    :param first_constituent: The constituent expected to come first
        (``m1`` in the pattern).
    :param second_constituent: The constituent expected to be the relative
        pronoun (``m2`` in the pattern).
    :return: True if the construction is found, False otherwise.
    """
    # Cheap rejections, evaluated in order: different sentences, second
    # constituent is not a relative pronoun, or the spans are out of order.
    if (not graph_builder.same_sentence(first_constituent, second_constituent)
            or not pronouns.relative(second_constituent[FORM].lower())
            or first_constituent[SPAN] > second_constituent[SPAN]):
        return False

    antecedent_parent = graph_builder.get_syntactic_parent(first_constituent)
    ancestor = graph_builder.get_syntactic_parent(second_constituent)

    # Climb from the pronoun towards the root while the ancestors stay inside
    # the span of the node that encloses the first constituent.
    while ancestor and (
            ancestor[graph_builder.node_type] != graph_builder.root_type):
        if not graph_builder.is_inside(
                ancestor[SPAN], antecedent_parent[SPAN]):
            # First ancestor that is not inside the enclosing span: it is a
            # match only if it is that enclosing node itself.
            # TODO check path element
            return bool(ancestor[ID] == antecedent_parent[ID])
        ancestor = graph_builder.get_syntactic_parent(ancestor)
    return False
def are_coreferent(self, entity, mention, candidate_entity, candidate):
    """Decide whether a pronoun mention corefers with a candidate.

    A series of vetoes is applied first; if none fires, the decision is
    delegated to the parent sieve.

    :param entity: The entity that mention is part of.
    :param mention: The selected mention to represent the entity.
    :param candidate_entity: The entity that candidate is part of.
    :param candidate: The candidate that may corefer with the entity.
    :return: True or False.
    """
    # Relative pronouns and pleonastic candidates are never antecedents here.
    # NOTE(review): the form is passed without lower-casing -- confirm that
    # pronouns.relative is case-insensitive.
    if pronouns.relative(candidate[FORM]) or candidate.get(PLEONASTIC, False):
        return False

    # Optional veto: possessive mentions are not linked at all.
    if self.FORBID_POSSESSIVES and self.is_possessive(mention):
        return False

    # Optional veto: a possessive pronoun is not linked to a non-pronoun
    # candidate that itself starts with a possessive.
    possessive_clash = (
        self.RESTRICT_POSSESSIVES_WITH_POSSESSIONS
        and self.is_pronoun(mention)
        and self.is_possessive(mention)
        and self.starts_with_possessive(candidate)
        and not self.is_pronoun(candidate)
    )
    if possessive_clash:
        return False

    # Optional veto: the mention starts right after the candidate ends.
    if self.RESTRICT_ADJACENT:
        mention_start = mention[SPAN][0]
        candidate_end = candidate[SPAN][1]
        if mention_start == candidate_end + 1:
            return False

    return super(SpanishPronounMatch, self).are_coreferent(
        entity, mention, candidate_entity, candidate)
def extract_and_mark(self, mention):
    """Determine the type of the mention and flag some surface features.

    Sets three boolean features on ``mention`` (``RELATIVE_PRONOUN``,
    ``STARTED_BY_INDEFINITE_ARTICLE``, ``STARTED_BY_INDEFINITE_PRONOUN``) and
    then records the mention type through ``self._set_mention_type``.  The
    type is the first that applies of: enumeration, pronoun, proper, nominal.

    :param mention: The mention to be classified; it is modified in place.
    """
    words = self.graph_builder.get_words(mention)
    head = self.graph_builder.get_head_word(mention)
    head_pos = head[POS]
    head_form = head[FORM].lower()
    first_form = words[0][FORM].lower()
    single_word = len(words) == 1

    # Surface features, all based on the lower-cased first word.
    mention[RELATIVE_PRONOUN] = bool(
        pronouns.relative(first_form) and single_word)
    mention[STARTED_BY_INDEFINITE_ARTICLE] = bool(
        determiners.indefinite_articles(first_form))
    mention[STARTED_BY_INDEFINITE_PRONOUN] = bool(
        pronouns.indefinite(first_form))

    if rules.is_enumeration(self.graph_builder, mention):
        # Enumeration mention
        mention_type = ENUMERATION_MENTION
    elif single_word and (pos_tags.pronoun(head_pos)
                          or pronouns.all(head_form)
                          or pronouns.relative(head_form)):
        # Pronoun mention: a single word that is a pronoun either by POS tag
        # or by dictionary form.  The named entity tag of the head is
        # deliberately not consulted here.
        mention_type = PRONOUN_MENTION
    elif pos_tags.proper_noun(head_pos):
        # Proper mention, decided by the POS tag of the head only.
        mention_type = PROPER_MENTION
    else:
        # In any other case the mention is nominal.
        mention_type = NOMINAL_MENTION
    self._set_mention_type(mention, mention_type)
def validate(self, mention, entity):
    """Tell whether the mention may be processed by this sieve.

    On top of the parent sieve validation, relative pronouns and pleonastic
    mentions are rejected.

    :param mention: The mention to check.
    :param entity: The entity of the mention.
    :return: True if the mention is valid for this sieve, False otherwise.
    """
    if not super(SpanishPronounMatch, self).validate(mention, entity):
        return False
    # NOTE(review): the form is passed without lower-casing -- confirm that
    # pronouns.relative is case-insensitive.
    if pronouns.relative(mention[FORM]):
        return False
    # A mention without the pleonastic flag is treated as not pleonastic
    # (same lookup as are_coreferent) instead of raising KeyError.
    return not mention.get(PLEONASTIC, False)
def is_appositive_construction_child(graph_builder, constituent):
    """Check if the mention is in an appositive construction.

    Patterns being approximated::

        NP=m1 < (NP=m2 $.. (/,/ $.. NP=m3))
        NP=m1 < (NP=m2 $.. (/,/ $.. (SBAR < (WHNP < WP|WDT=m3))))
        /^NP(?:-TMP|-ADV)?$/=m1 < (NP=m2 $- /^,$/ $-- NP=m3 !$ CC|CONJP)
        /^NP(?:-TMP|-ADV)?$/=m1 < (PRN=m2 < (NP < /^NNS?|CD$/ $-- /^-LRB-$/ $+ /^-RRB-$/))

    The siblings of the constituent are scanned once, in order, looking for
    the constituent itself, then a comma, then either a noun phrase or an
    element whose first word is a relative pronoun.

    :param graph_builder: The graphBuilder
    :param constituent: The mention to check
    :return: The syntactic parent of the constituent if the construction is
        found, False otherwise.
    """
    constituent = constituent.get("constituent", constituent)
    # mention is inside a NP
    # TODO Improve the precision
    parent = graph_builder.get_syntactic_parent(constituent)
    if not constituent_tags.noun_phrase(parent[TAG]):
        return False

    # One shared iterator: every step resumes where the previous one stopped,
    # and the list returned by the graph builder is left untouched.
    remaining = iter(graph_builder.get_syntactic_sibling(constituent) or ())

    # Step 1: advance up to (and past) the constituent itself.
    if not any(sibling == constituent for sibling in remaining):
        return False
    # Step 2: advance up to (and past) the next comma.
    if not any(sibling[FORM] == "," for sibling in remaining):
        return False
    # Step 3: any later noun phrase or relative-pronoun-initial sibling.
    for sibling in remaining:
        if constituent_tags.noun_phrase(sibling.get(TAG)):
            return parent
        if pronouns.relative(
                graph_builder.get_words(sibling)[0].get("form")):
            return parent
    return False
def process_graph(self):
    """Prepare the graph for output.

    Steps, in order:

    1. store the document type and the per-sentence histograms in
       ``self.meta``;
    2. reset ``self.meta["features"]`` and characterize every mention with
       ``self.feature_extractor``;
    3. resolve the coreference with ``self.coreference_processor``;
    4. store the document-wide counters in ``self.meta["overall"]``.
    """
    # Imports are kept at function scope, as in the original code.
    from corefgraph.multisieve.features.constants import MENTION
    from corefgraph.resources.tagset import pos_tags
    from corefgraph.resources.dictionaries import pronouns

    graph_builder = self.graph_builder
    self.meta[graph_builder.doc_type] = graph_builder.get_doc_type()

    def is_pronoun_word(word):
        # A word counts as a pronoun by POS tag or by dictionary form.
        return (pos_tags.pronoun(word[POS])
                or pronouns.all(word[FORM])
                or pronouns.relative(word[FORM]))

    # Query the sentences once and reuse them for every histogram.
    sentences = list(graph_builder.get_all_sentences())
    self.meta["sentences"] = {
        'words_histogram': [
            len(graph_builder.get_words(sentence))
            for sentence in sentences],
        'pronouns_histogram': [
            sum(1 for word in graph_builder.get_words(sentence)
                if is_pronoun_word(word))
            for sentence in sentences],
        'named_entities_histogram': [
            len(graph_builder.get_sentence_named_entities(sentence))
            for sentence in sentences],
        'mentions_histogram': [
            len(graph_builder.get_sentence_gold_mentions(sentence))
            for sentence in sentences],
    }

    self.meta["features"] = {
        'counters': defaultdict(Counter),
        'mentions': defaultdict(dict)}
    mentions_by_sentence = self.coreference_processor.mentions_textual_order
    for index, sentence in enumerate(mentions_by_sentence):
        self.logger.debug("Featuring Sentence %d", index)
        for mention in sentence:
            self.feature_extractor.characterize_mention(mention)

    # Resolve the coreference
    self.logger.debug("Resolve Coreference...")
    self.coreference_processor.resolve_text()

    # Gold mentions are read after the resolution, as before, but only once.
    gold_mentions = list(graph_builder.get_all_gold_mentions())
    self.meta["overall"] = {
        'words': Counter(
            word[POS] for word in graph_builder.get_all_words()),
        'namedEntities': Counter(
            ne[NER] for ne in graph_builder.get_all_named_entities()),
        'constituents': Counter(
            constituent[TAG]
            for constituent in graph_builder.get_all_constituents()),
        'mentions': Counter(
            mention.get(MENTION) for mention in gold_mentions),
        'mentions_size': [
            len(graph_builder.get_words(mention))
            for mention in gold_mentions],
        # Mentions without a constituent are reported with depth -1.
        'mentions_deep': [
            mention.get(CONSTITUENT, {DEEP: -1})[DEEP]
            for mention in gold_mentions],
        # Materialised so the value is a plain list, not a dict view, on
        # Python 3.
        'mentions_per_entity': list(Counter(
            mention[GOLD_ENTITY] for mention in gold_mentions).values()),
    }