def _find_bad_tag_and_raise_error(self, tags):
    """Identify which of tags are invalid POS tags and raise a ValueError.

    Intended to be called only after a batch addTagConstraints() call has
    already failed, so at least one tag in tags is expected to be bad.
    Checks each distinct tag individually and always raises."""
    checker = parser.ExtPos()
    invalid = {tag for tag in set(tags)
               if not checker.addTagConstraints(parser.StringVector([tag]))}
    raise ValueError("Invalid POS tags (not present in the parser's "
                     "terms.txt file): %s" % ', '.join(sorted(invalid)))
def _possible_tags_to_ext_pos(self, tokens, possible_tags):
    """Build a parser.ExtPos encoding per-token POS constraints.

    possible_tags maps token indices to a single tag or a sequence of
    tags; tokens without an entry are left unconstrained. Raises
    ValueError (via _find_bad_tag_and_raise_error) if any tag is not
    known to the parser model."""
    ext_pos = parser.ExtPos()
    if not possible_tags:
        # no constraints at all -- an empty ExtPos leaves every token free
        return ext_pos
    for token_index in range(len(tokens)):
        token_tags = possible_tags.get(token_index, [])
        # allow a bare string as shorthand for a single-tag constraint
        if isinstance(token_tags, basestring):
            token_tags = [token_tags]
        token_tags = [str(tag) for tag in token_tags]
        if not ext_pos.addTagConstraints(parser.StringVector(token_tags)):
            # at least one of the tags is bad -- find out which ones
            # and throw a ValueError
            self._find_bad_tag_and_raise_error(token_tags)
    return ext_pos
def parse_tagged(self, tokens, possible_tags, rerank=True):
    """Parse some pre-tagged, pre-tokenized text.

    tokens is a sequence of strings. possible_tags is map from token
    indices to possible POS tags. Tokens without an entry in
    possible_tags will be unconstrained by POS. Raises ValueError if
    any of the POS tags are invalid. If rerank is True, we will
    rerank the n-best list."""
    self.check_loaded_models(rerank)
    # Delegate constraint construction to _possible_tags_to_ext_pos.
    # This fixes the original's parser.VectorString typo (the correct
    # SWIG type is parser.StringVector), adds the str() coercion of
    # tags, and reports invalid tags with a helpful ValueError instead
    # of silently ignoring addTagConstraints() failures.
    ext_pos = self._possible_tags_to_ext_pos(tokens, possible_tags)
    sentence = Sentence(tokens)
    parses = parser.parse(sentence.sentrep, ext_pos,
                          self._parser_thread_slot)
    nbest_list = NBestList(sentence, parses)
    if rerank:
        nbest_list.rerank(self)
    return nbest_list