def __add_tmp(self):
    """Adds the temporary chunk to the output sentence.

    Flushes ``self.tmp`` into ``self._sent`` according to ``self._mode`` and
    then resets ``self.tmp`` to an empty list. Does nothing when the chunk
    is empty.

    * ``NER_LINK``: trailing tokens whose lemma is in ``self._link_punct``
      are split off. The remaining tokens are appended with B/I tags under
      the current ``self._sent.ner_type`` and registered in ``self._trie``;
      the split-off tokens are appended with ``zero_ner_type()``.
    * ``NO_LINK``: every token is appended as it is.
    * ``NNP_LINK``: for chunks of at most 8 tokens, the partitions produced
      by ``all_partitions`` are tried in order and the first one whose
      parts all have a known category in ``self._trie`` is emitted. If no
      partition qualifies, or the chunk is longer,
      ``self.__unknown_nnp_link()`` is called instead.
    """
    if not self.tmp:
        return

    if self._mode == NERTrainingCallback.NER_LINK:
        # The link may end with punctuation marks -- let's remove them!
        puncts = 0
        for token in reversed(self.tmp):
            if token[NERTrainingCallback.LEMMA] in self._link_punct:
                puncts += 1
            else:
                break
        split = len(self.tmp) - puncts
        entity = self.tmp[:split]

        # Add the link stripped of punctuation marks
        for i, attributes in enumerate(entity):
            self._sent.bi = 'B' if i == 0 else 'I'
            self._sent.append(attributes)
        self._trie.add_anchor(entity, self._sent.ner_type)

        # And the rest as regular text
        self._sent.ner_type = zero_ner_type()
        for attributes in self.tmp[split:]:
            self._sent.append(attributes)
    elif self._mode == NERTrainingCallback.NO_LINK:
        for attributes in self.tmp:
            self._sent.append(attributes)
    elif self._mode == NERTrainingCallback.NNP_LINK:
        if len(self.tmp) <= 8:
            for partition in all_partitions(self.tmp):
                categories = [self._trie.get_category(part)[1]
                              for part in partition]
                # Reset for every partition: a flag raised by a partition
                # that is rejected later on must not leak into the next one.
                sentence_start_non_nnp = False
                for i, category in enumerate(categories):
                    # Equality, not identity: 'is' on a str literal only
                    # works by accident of interning.
                    if category is None or category == 'UNK':
                        # Invalid NNP, UNLESS the first word is sentence
                        # starter
                        if (len(self._sent.sentence) == 0 and i == 0
                                and len(partition[i]) == 1
                                and not self.__has_noun(partition[i])):
                            sentence_start_non_nnp = True
                        else:
                            break
                else:
                    # Every part is acceptable: emit this partition.
                    for i, part in enumerate(partition):
                        if i == 0 and sentence_start_non_nnp:
                            self._sent.ner_type = zero_ner_type()
                            for word in part:
                                self._sent.append(word)
                        else:
                            self._sent.ner_type = categories[i]
                            for j, word in enumerate(part):
                                self._sent.bi = 'B' if j == 0 else 'I'
                                self._sent.append(word)
                    break
            else:
                # No partition was accepted.
                self.__unknown_nnp_link()
        else:
            # Chunk is longer than 8 tokens.
            self.__unknown_nnp_link()
    self.tmp = []
def __add_tmp(self):
    """Adds the temporary chunk to the output sentence.

    Flushes ``self.tmp`` into ``self._sent`` according to ``self._mode`` and
    then resets ``self.tmp`` to an empty list. Does nothing when the chunk
    is empty.

    ``self.__partition_candidate(tokens, type)`` returns a ``(begin, last)``
    pair: ``tokens[:begin]`` is emitted as regular text (the "title"),
    ``tokens[begin:last]`` as the B/I-tagged entity and ``tokens[last:]`` as
    regular text again.

    * ``NER_LINK``: the whole chunk is split that way using the current
      ``self._sent.ner_type``; the entity part is registered in
      ``self._trie``. An entity that ends in an adjective whose raw form
      does not occur in its link target is retagged ``'MISC'``. Emitting an
      entity typed ``'UNK'`` increments ``self._sent.links_lost``.
    * ``NO_LINK``: every token is appended as it is.
    * ``NNP_LINK``: for chunks of at most 8 tokens, the partitions produced
      by ``all_partitions`` are tried in order and the first acceptable one
      is emitted part by part. If no partition qualifies, or the chunk is
      longer, ``self.__unknown_nnp_link()`` is called instead.

    NOTE(review): the nesting of a few statements was reconstructed from a
    copy of this method that had lost its indentation; the spots concerned
    are marked below -- please confirm them.
    """
    if not self.tmp:
        return

    if self._mode == NERTrainingCallback.NER_LINK:
        begin, last = self.__partition_candidate(self.tmp,
                                                 self._sent.ner_type)
        if last != 0:
            # Add the title as regular text
            if begin != 0:
                link_type = self._sent.ner_type
                self._sent.ner_type = zero_ner_type()
                for attributes in self.tmp[:begin]:
                    self._sent.append(attributes)
                self._sent.ner_type = link_type

            # Add the real entity part
            if begin != last:
                entity = self.tmp[begin:last]
                final = self.tmp[last - 1]
                # If the anchor link is an adjective, and the last word does
                # not occur in the link target, then it is a derivative form
                # of the entity and must be a MISC according to CoNLL
                # guidelines
                if (final[NERTrainingCallback.POS].startswith(u'J')
                        and final[NERTrainingCallback.RAW].lower()
                        not in final[NERTrainingCallback.LINK].lower()
                        and self._sent.ner_type != 'UNK'
                        and self._sent.ner_type != zero_ner_type()):
                    sys.stderr.write(
                        "Adj entity: {0}\n".format(self.tmp[0:last]))
                    self._sent.ner_type = 'MISC'

                # Add the link stripped of punctuation marks
                for i, attributes in enumerate(entity):
                    self._sent.bi = 'B' if i == 0 else 'I'
                    self._sent.append(attributes)
                self._trie.add_anchor(entity, self._sent.ner_type)

                # Unknown link: we must throw the sentence away
                # NOTE(review): assumed to apply only when an entity part
                # was actually emitted -- confirm the nesting.
                if self._sent.ner_type == 'UNK':
                    self._sent.links_lost += 1

        # And the rest as regular text. This also covers last == 0, where
        # the whole chunk goes out untyped.
        self._sent.ner_type = zero_ner_type()
        for attributes in self.tmp[last:]:
            self._sent.append(attributes)
    elif self._mode == NERTrainingCallback.NO_LINK:
        for attributes in self.tmp:
            self._sent.append(attributes)
    elif self._mode == NERTrainingCallback.NNP_LINK:
        if len(self.tmp) <= 8:
            for partition in all_partitions(self.tmp):
                categories = [self._trie.get_category(part)[1]
                              for part in partition]
                # Reset for every partition: a flag raised by a partition
                # that is rejected later on must not leak into the next one.
                sentence_start_non_nnp = False
                for i, category in enumerate(categories):
                    if category == 'UNK':
                        # Invalid NNP, UNLESS the first word is sentence
                        # starter
                        if (len(self._sent.sentence) == 0 and i == 0
                                and len(partition[i]) == 1
                                and not self.__has_noun(partition[i])):
                            sentence_start_non_nnp = True
                        else:
                            # Retry with the entity part of the candidate
                            # only (title / trailing tokens cut off).
                            begin, last = self.__partition_candidate(
                                partition[i], category)
                            if begin != last:
                                category = self._trie.get_category(
                                    partition[i][begin:last])[1]
                            # NOTE(review): assumed to be a sibling of the
                            # 'if' above, i.e. a part with no entity span
                            # stays 'UNK' and rejects the partition.
                            if category == 'UNK' or (begin > 0
                                                     and category != 'PER'):
                                break
                else:
                    # Every part is acceptable: emit this partition.
                    for i, part in enumerate(partition):
                        # At the beginning of a sentence, and the first word
                        # is not an NN(P)
                        if i == 0 and sentence_start_non_nnp:
                            self._sent.ner_type = zero_ner_type()
                            for word in part:
                                self._sent.append(word)
                        # The rest of the partitions
                        else:
                            self._sent.ner_type = categories[i]
                            # Split on this part's own category, not on a
                            # value left over from an earlier iteration.
                            begin, last = self.__partition_candidate(
                                part, categories[i])

                            # begin / last index into 'part', so every
                            # slice below is taken from 'part' as well.

                            # Add the title as regular text
                            if begin != 0:
                                self._sent.ner_type = zero_ner_type()
                                for attributes in part[:begin]:
                                    self._sent.append(attributes)

                            # Add the real entity part
                            if begin != last:
                                entity = part[begin:last]
                                link, category = self._trie.get_category(
                                    entity)
                                link = u" ".join(link)
                                final = part[last - 1]
                                # If the anchor link is an adjective, and
                                # the last word does not occur in the link
                                # target, then it is a derivative form of
                                # the entity and must be a MISC according
                                # to CoNLL guidelines
                                if (final[NERTrainingCallback.POS]
                                        .startswith(u'J')
                                        and final[NERTrainingCallback.RAW]
                                        .lower() not in link.lower()
                                        and category != 'UNK'):
                                    sys.stderr.write(
                                        "Adj entity: {0}\n".format(
                                            part[:last]))
                                    category = 'MISC'
                                self._sent.ner_type = category

                                # Add the link stripped of punctuation marks
                                for j, attributes in enumerate(entity):
                                    self._sent.bi = 'B' if j == 0 else 'I'
                                    self._sent.append(attributes)

                                # Unknown link: the sentence is counted as
                                # having lost a link.
                                if self._sent.ner_type == 'UNK':
                                    self._sent.links_lost += 1

                            # And the rest as regular text
                            self._sent.ner_type = zero_ner_type()
                            for attributes in part[last:]:
                                self._sent.append(attributes)
                    # We are done, let's break
                    break
            else:
                # No partition was accepted.
                self.__unknown_nnp_link()
        else:
            # Chunk is longer than 8 tokens.
            self.__unknown_nnp_link()
    self.tmp = []