def __init__(self, text_or_tokens, max_sentence_length=399):
    """Build a sentence from a Sentence, a string, or a token sequence.

    text_or_tokens may be:
      - another Sentence, in which case its sentrep is shared,
      - a string, which is wrapped in '<s> ... </s>' and handed to
        parser.tokenize() together with max_sentence_length,
      - any other iterable, which is treated as pre-tokenized text.

    max_sentence_length is only used when text_or_tokens is a string."""
    if isinstance(text_or_tokens, Sentence):
        self.sentrep = text_or_tokens.sentrep
    elif isinstance(text_or_tokens, basestring):
        self.sentrep = parser.tokenize('<s> ' + text_or_tokens + ' </s>',
                                       max_sentence_length)
    else:
        # text_or_tokens is a sequence -- make sure each element is a
        # string before it reaches the native SentRep constructor, so
        # that non-string tokens (e.g. ints) don't make it fail
        tokens = [str(token) for token in text_or_tokens]
        self.sentrep = parser.SentRep(tokens)
def _find_bad_tag_and_raise_error(self, tags):
    """Work out which of the given POS tags are rejected and report them.

    Every distinct tag is offered on its own to a scratch ExtPos object;
    the ones it refuses are listed (sorted) in a ValueError. This method
    never returns normally -- it always raises."""
    scratch = parser.ExtPos()
    rejected = set()
    for candidate in set(tags):
        accepted = scratch.addTagConstraints(parser.StringVector([candidate]))
        if accepted:
            continue
        rejected.add(candidate)
    message = ("Invalid POS tags (not present in the parser's "
               "terms.txt file): %s" % ', '.join(sorted(rejected)))
    raise ValueError(message)
def __init__(self, text_or_tokens):
    """Build a sentence from a Sentence, a string, or a token sequence.

    A Sentence shares its sentrep with the new object, a string is
    wrapped in '<s> ... </s>' and tokenized, and anything else is
    treated as a sequence of tokens."""
    if isinstance(text_or_tokens, Sentence):
        self.sentrep = text_or_tokens.sentrep
        return
    if isinstance(text_or_tokens, basestring):
        self.sentrep = parser.tokenize('<s> ' + text_or_tokens + ' </s>')
        return
    # a sequence of tokens: stringify each element (to avoid crashing
    # on non-strings) and PTB-escape it before building the SentRep
    escaped = []
    for token in text_or_tokens:
        escaped.append(parser.ptbEscape(str(token)))
    self.sentrep = parser.SentRep(escaped)
def load_parser_model(self, model_dir, **parser_options):
    """Load the parsing model found in model_dir, then apply parser_options.

    The keyword options are forwarded untouched to set_parser_options();
    the defaults are normally fine. Only one model can be loaded per
    process: a second call raises RuntimeError. A model_dir that does
    not exist raises ValueError."""
    already_loaded = self._parser_model_loaded
    if already_loaded:
        raise RuntimeError('Parser is already loaded and can only be loaded once.')
    directory_found = exists(model_dir)
    if not directory_found:
        raise ValueError('Parser model directory %r does not exist.' %
                         model_dir)
    # the flag and directory are recorded before the native load is attempted
    self._parser_model_loaded = True
    self.parser_model_dir = model_dir
    parser.loadModel(model_dir)
    self.set_parser_options(**parser_options)
def _possible_tags_to_ext_pos(self, tokens, possible_tags):
    """Turn a {token index: tag(s)} mapping into a parser.ExtPos object.

    One constraint entry is added per token position; positions missing
    from possible_tags get an empty list. A single tag may be given as
    a bare string. If possible_tags is empty or None, an ExtPos with no
    entries is returned. A rejected tag leads to a ValueError (raised by
    _find_bad_tag_and_raise_error)."""
    ext_pos = parser.ExtPos()
    if possible_tags:
        for position in range(len(tokens)):
            allowed = possible_tags.get(position, [])
            if isinstance(allowed, basestring):
                allowed = [allowed]
            allowed = [str(tag) for tag in allowed]
            if not ext_pos.addTagConstraints(parser.StringVector(allowed)):
                # at least one tag at this position was refused --
                # figure out which and raise
                self._find_bad_tag_and_raise_error(allowed)
    return ext_pos
def __init__(self, text_or_tokens):
    """Build a sentence from a SentRep, a Sentence, a string, or tokens.

    A native parser.SentRep is adopted directly, a Sentence shares its
    sentrep, a string is wrapped in '<s> ... </s>' and tokenized, and
    anything else is treated as a sequence of tokens."""
    if isinstance(text_or_tokens, parser.SentRep):
        # take ownership of the pointer on the Python side
        text_or_tokens.this.acquire()
        sentrep = text_or_tokens
    elif isinstance(text_or_tokens, Sentence):
        sentrep = text_or_tokens.sentrep
    elif isinstance(text_or_tokens, basestring):
        sentrep = parser.tokenize('<s> ' + text_or_tokens + ' </s>')
    else:
        # a sequence: every element is stringified (to avoid crashing
        # on non-strings) and PTB-escaped
        escaped = []
        for token in text_or_tokens:
            escaped.append(parser.ptbEscape(str(token)))
        sentrep = parser.SentRep(escaped)
    self.sentrep = sentrep
def fuse(self, threshold=0.5, exponent=1, num_parses=50,
         use_parser_scores=False):
    """Merge the top parses of this n-best list into one Tree (parse fusion).

    The first num_parses parses are weighted by their normalized scores
    (normalize_logprobs with the given exponent), loaded into a chart,
    pruned at threshold, and the chart is parsed. Reranker scores are
    used unless the list was never reranked or use_parser_scores=True.
    When the chart yields no tree, the first parse in the list is
    returned instead.

    This gives a significant accuracy improvement; the parameters may
    need tuning per parsing model. See Choe, McClosky, and Charniak
    (EMNLP 2015) for details."""
    candidates = self.parses[:num_parses]
    if use_parser_scores or not self._reranked:
        score_name = 'parser_score'
    else:
        score_name = 'reranker_score'
    raw_scores = [getattr(candidate, score_name) for candidate in candidates]
    weights = normalize_logprobs(raw_scores, exponent=exponent)

    chart = parser.SimpleChart(len(self._sentrep))
    for weight, candidate in zip(weights, candidates):
        chart.populate(candidate.ptb_parse._tree, weight)
    chart.prune(threshold)

    fused = chart.parse()
    if fused is None:
        # fusion failed -- fall back on the original 1-best parse
        return candidates[0].ptb_parse
    return Tree(fused)
def parse(self, sentence, rerank='auto', sentence_id=None):
    """Parse text or tokens and return the results as an NBestList.

    sentence may be a string (which will be tokenized) or a sequence of
    tokens. rerank may be True (rerank the n-best list), False (do not
    use the reranker), or 'auto' (rerank only if a reranker model is
    loaded). sentence_id is passed on to the NBestList.

    Raises ValueError when the sentence is too long. If the native
    parser raises RuntimeError, an empty NBestList is returned."""
    rerank = self.check_models_loaded_or_error(rerank)
    sentence = Sentence(sentence)

    # parser.max_sentence_length is actually 1 longer than the maximum
    # allowed sentence length
    length_limit = parser.max_sentence_length - 1
    num_tokens = len(sentence)
    if num_tokens >= length_limit:
        raise ValueError("Sentence is too long (%s tokens, must be "
                         "under %s)" % (num_tokens, length_limit))

    try:
        candidates = parser.parse(sentence.sentrep)
    except RuntimeError:
        candidates = []

    result = NBestList(sentence, candidates, sentence_id)
    if not rerank:
        return result
    result.rerank(self)
    return result
def parse_tagged(self, tokens, possible_tags, rerank='auto'):
    """Parse pre-tokenized text whose POS tags are (partly) known.

    tokens must be a sequence of strings (a bare string raises
    ValueError). possible_tags maps token indices to the POS tag or
    tags allowed at that position; tokens without an entry are not
    constrained. Tags must appear in the parsing model's terms.txt
    file, otherwise a ValueError is raised.

    rerank may be True (rerank the n-best list), False (do not use the
    reranker), or 'auto' (rerank only if a reranker model is loaded)."""
    rerank = self._check_loaded_models(rerank)
    if isinstance(tokens, basestring):
        raise ValueError("tokens must be a sequence, not a string.")

    ext_pos = parser.ExtPos()
    for position in range(len(tokens)):
        allowed = possible_tags.get(position, [])
        if isinstance(allowed, basestring):
            allowed = [allowed]
        allowed = [str(tag) for tag in allowed]
        if not ext_pos.addTagConstraints(parser.VectorString(allowed)):
            # some tag at this position was refused -- find out which
            # and raise a ValueError naming it
            self._find_bad_tag_and_raise_error(allowed)

    sentence = Sentence(tokens)
    candidates = parser.parse(sentence.sentrep, ext_pos,
                              self._parser_thread_slot)
    result = NBestList(sentence, candidates)
    if not rerank:
        return result
    result.rerank(self)
    return result
def load_parsing_model(self, model_dir, language='En',
                       case_insensitive=False, nbest=50, small_corpus=True,
                       overparsing=21, debug=0, smoothPos=0):
    """Load the parsing model in model_dir and configure the parser.

    All arguments after model_dir are forwarded, in order, to
    parser.setOptions(); the defaults should usually do. Only one model
    may be loaded per process: a second call raises ValueError, as does
    a model_dir that does not exist."""
    already_loaded = self._parser_model_loaded
    if already_loaded:
        raise ValueError('Parser is already loaded and can only be loaded once.')
    directory_found = os.path.exists(model_dir)
    if not directory_found:
        raise ValueError('Parser model directory %r does not exist.' %
                         model_dir)
    # the flag is recorded before the native load is attempted
    self._parser_model_loaded = True
    parser.loadModel(model_dir)
    self.parser_model_dir = model_dir
    option_values = (language, case_insensitive, nbest, small_corpus,
                     overparsing, debug, smoothPos)
    parser.setOptions(*option_values)
def sentences_from_file(this_class, filename):
    """Read a file of SGML(-ish) lines and return a list of sentences.

    filename is the path to a file in the format the command line
    parser usually takes as input. One this_class object is built for
    each sentence representation read from it."""
    # The native call below leaks; the workaround is that __init__
    # acquires each pointer.
    sent_reps = parser.sentRepsFromFile(filename)
    return [this_class(sent_rep) for sent_rep in sent_reps]
def trees_from_file(this_class, filename):
    """Read a file of Penn Treebank trees and return a list of trees.

    One this_class object is built for each tree found in the file at
    filename."""
    # Copy the native trees into a Python list first, then acquire each
    # one's pointer so Python holds the ownership.
    input_trees = list(parser.inputTreesFromFile(filename))
    for input_tree in input_trees:
        input_tree.this.acquire()
    return [this_class(input_tree) for input_tree in input_trees]
def set_parser_options(self, language='En', case_insensitive=False,
                       nbest=50, small_corpus=True, overparsing=21,
                       debug=0, smooth_pos=0):
    """Configure the parser and return the new options as a dictionary.

    load_parser_model() calls this for you, so it only needs to be
    called directly to change options afterwards. A parser model must
    already be loaded, otherwise RuntimeError is raised. The returned
    dictionary is also stored as self.parser_options.

    Options:

    language -- string naming the language: En (English), Ch (Chinese),
        or Ar (Arabic).
    case_insensitive -- make the parser ignore capitalization.
    nbest -- maximum size of the n-best list.
    small_corpus -- True enables additional smoothing (originally meant
        for training from small corpora, but helpful in many
        situations).
    overparsing -- how much more time the parser spends on a sentence
        relative to the time it took to find the first complete parse;
        this affects the speed/accuracy tradeoff.
    debug -- non-negative integer; above 0 the parser prints debug
        messages.
    smooth_pos -- a number above 0 is assigned as the probability of
        seeing a known word in a part-of-speech never seen with it in
        training."""
    if not RerankingParser._parser_model_loaded:
        raise RuntimeError('Parser must already be loaded (call '
                           'load_parser_model() first)')
    parser.setOptions(language, case_insensitive, nbest, small_corpus,
                      overparsing, debug, smooth_pos)
    options = {}
    options['language'] = language
    options['case_insensitive'] = case_insensitive
    options['nbest'] = nbest
    options['small_corpus'] = small_corpus
    options['overparsing'] = overparsing
    options['debug'] = debug
    options['smooth_pos'] = smooth_pos
    self.parser_options = options
    return options
def __init__(self, text_or_tokens, max_sentence_length=399):
    """Build a sentence from a Sentence, a string, or a token sequence.

    A Sentence shares its sentrep with the new object; a string is
    wrapped in '<s> ... </s>' and tokenized with max_sentence_length;
    anything else is treated as a sequence of tokens.
    max_sentence_length is only used for string input."""
    if isinstance(text_or_tokens, Sentence):
        self.sentrep = text_or_tokens.sentrep
        return
    if isinstance(text_or_tokens, basestring):
        wrapped = '<s> ' + text_or_tokens + ' </s>'
        self.sentrep = parser.tokenize(wrapped, max_sentence_length)
        return
    # a sequence of tokens: make sure each element is a string to
    # avoid crashing
    tokens = [str(token) for token in text_or_tokens]
    self.sentrep = parser.SentRep(tokens)
def __init__(self):
    """Set up a reranking parser with no models loaded.

    At a minimum load_parsing_model() must be called before parsing,
    plus load_reranker_model() if the reranker is wanted. The
    load_unified_model_dir() classmethod takes care of both calls."""
    # nothing is loaded yet, so there are no model locations to record
    self.parser_model_dir = None
    self.unified_model_dir = None
    self.reranker_model = None
    self._parser_model_loaded = False
    self._parser_thread_slot = parser.ThreadSlot()
def log_prob(self):
    """Return the parser model's log probability for this tree.

    The currently loaded first-stage parsing model scores the existing
    tree (the Python equivalent of the evalTree command line tool).
    A parser model must be loaded first, otherwise a ValueError is
    raised."""
    if RerankingParser._parser_model_loaded:
        return parser.treeLogProb(self._tree)
    raise ValueError("You need to have loaded a parser model in "
                     "order to get the log probability.")
def sentences_from_string(this_class, text):
    """Split SGML(-ish) text into a list of sentences.

    text is in the format the command line parser usually takes as
    input. One this_class object is built for each sentence
    representation found in it. Example usage:

    >>> Sentence.sentences_from_string('<s> Test </s>')
    [bllipparser.RerankingParser.Sentence(['Test'])]
    """
    # The native call below leaks; the workaround is that __init__
    # acquires each pointer.
    sent_reps = parser.sentRepsFromString(text)
    return [this_class(sent_rep) for sent_rep in sent_reps]
def parse_tagged(self, tokens, possible_tags, rerank=True):
    """Parse pre-tokenized text whose POS tags are (partly) known.

    tokens is a sequence of strings. possible_tags maps token indices
    to the POS tag or tags allowed at that position; tokens without an
    entry are not constrained. When rerank is True the resulting
    n-best list is reranked."""
    self.check_loaded_models(rerank)

    ext_pos = parser.ExtPos()
    for position in range(len(tokens)):
        allowed = possible_tags.get(position, [])
        if isinstance(allowed, basestring):
            # a single tag given as a bare string
            allowed = [allowed]
        ext_pos.addTagConstraints(parser.VectorString(allowed))

    sentence = Sentence(tokens)
    candidates = parser.parse(sentence.sentrep, ext_pos,
                              self._parser_thread_slot)
    result = NBestList(sentence, candidates)
    if not rerank:
        return result
    result.rerank(self)
    return result
def __str__(self):
    """Represent the n-best list in a similar output format to the
    command-line parser and reranker.

    For a reranked list the output is a header line with the number of
    parses followed by the literal word 'dummy', then for each parse a
    line with its reranker and parser scores and a line with the parse
    itself. For a list that has not been reranked the string comes from
    parser.asNBestList()."""
    if not self._reranked:
        return parser.asNBestList(self._parses)
    # Assemble the pieces and join them once; this produces the same
    # text as writing to a cStringIO buffer and reading it back, without
    # depending on the Python-2-only cStringIO module.
    pieces = ['%d dummy\n' % len(self.parses)]
    for parse in self.parses:
        pieces.append('%s %s\n%s\n' % (parse.reranker_score,
                                       parse.parser_score,
                                       parse.ptb_parse))
    return ''.join(pieces)
def trees_from_string(this_class, text):
    """Split text holding several Penn Treebank trees into a list of trees.

    One this_class object is built for each tree found in text."""
    # The native call hands memory ownership of the InputTree objects
    # in its vector to us. The vector itself won't stick around, so the
    # trees are copied into a Python list and their pointers acquired.
    # An InputTree usually contains other InputTrees, and the outer
    # tree frees the inner ones when it is deleted -- so only the
    # outermost tree's pointer should (and must) be acquired.
    input_trees = list(parser.inputTreesFromString(text))
    for input_tree in input_trees:
        input_tree.this.acquire()
    return [this_class(input_tree) for input_tree in input_trees]
def __str__(self):
    """Represent the n-best list in a similar output format to the
    command-line parser and reranker.

    A reranked list is rendered as a header line ('<number of parses>
    dummy') followed, for each parse, by a line holding its reranker
    and parser scores and a line holding the parse. A list that has not
    been reranked is rendered by parser.asNBestList()."""
    if self._reranked:
        # One join gives the same text as the former cStringIO
        # write/seek/read round trip, and does not need the
        # Python-2-only cStringIO module.
        header = '%d dummy\n' % len(self.parses)
        entries = ['%s %s\n%s\n' % (parse.reranker_score,
                                    parse.parser_score,
                                    parse.ptb_parse)
                   for parse in self.parses]
        return header + ''.join(entries)
    else:
        return parser.asNBestList(self._parses)
def __init__(self, input_tree_or_string):
    """These can be constructed from the Penn Treebank string
    representations of trees, e.g.:

    >>> Tree('(S1 (NP (NN tree)))')
    bllipparser.RerankingParser.Tree('(S1 (NP (NN tree)))')

    Or from an existing InputTree (internal SWIG object). Users will
    generally want the former.

    Raises TypeError if the argument is neither an InputTree nor a
    string."""
    if not isinstance(input_tree_or_string, parser.InputTree):
        if not isinstance(input_tree_or_string, basestring):
            # Wrap the value in a 1-tuple: with a bare tuple argument
            # the % operator would treat its elements as separate
            # format arguments and fail before the message is built.
            raise TypeError("input_tree_or_string (%r) must be an InputTree or string." %
                            (input_tree_or_string,))
        input_tree_or_string = \
            parser.inputTreeFromString(input_tree_or_string)
    self._tree = input_tree_or_string
def load_parsing_model(self, model_dir, language='En',
                       case_insensitive=False, nbest=50, small_corpus=True,
                       overparsing=21, debug=0, smoothPos=0):
    """Load the parsing model stored in model_dir and set parser options.

    Everything after model_dir is passed, in order, to
    parser.setOptions(); the defaults are generally sufficient. The
    parser cannot hold more than one model per process, so calling this
    when a model is already loaded raises ValueError. A missing
    model_dir raises ValueError too."""
    if self._parser_model_loaded:
        message = 'Parser is already loaded and can only be loaded once.'
        raise ValueError(message)
    if not os.path.exists(model_dir):
        message = 'Parser model directory %r does not exist.' % model_dir
        raise ValueError(message)

    # mark as loaded before handing over to the native loader
    self._parser_model_loaded = True
    parser.loadModel(model_dir)
    self.parser_model_dir = model_dir
    parser.setOptions(language, case_insensitive, nbest, small_corpus,
                      overparsing, debug, smoothPos)
def parse(self, sentence, rerank=True, max_sentence_length=399):
    """Parse text or tokens and return the results as an NBestList.

    sentence may be a string (which will be tokenized) or a sequence of
    tokens; max_sentence_length is passed to the Sentence constructor.
    When rerank is True the n-best list is reranked. If the native
    parser raises RuntimeError, the NBestList is built from an empty
    list of parses."""
    self.check_loaded_models(rerank)
    sentence = Sentence(sentence, max_sentence_length)
    try:
        candidates = parser.parse(sentence.sentrep, self._parser_thread_slot)
    except RuntimeError:
        candidates = []
    result = NBestList(sentence, candidates)
    if not rerank:
        return result
    result.rerank(self)
    return result
def parse(self, sentence, rerank='auto', max_sentence_length=399):
    """Parse text or tokens and return the results as an NBestList.

    sentence may be a string (which will be tokenized) or a sequence of
    tokens; max_sentence_length is passed to the Sentence constructor.
    rerank may be True (rerank the n-best list), False (do not use the
    reranker), or 'auto' (rerank only if a reranker model is loaded).
    If the native parser raises RuntimeError, the NBestList is built
    from an empty list of parses."""
    rerank = self._check_loaded_models(rerank)
    sentence = Sentence(sentence, max_sentence_length)
    try:
        candidates = parser.parse(sentence.sentrep, self._parser_thread_slot)
    except RuntimeError:
        candidates = []
    result = NBestList(sentence, candidates)
    if not rerank:
        return result
    result.rerank(self)
    return result
def __init__(self, input_tree_or_string):
    """These can be constructed from the Penn Treebank string
    representations of trees, e.g.:

    >>> Tree('(S1 (NP (NN tree)))')
    Tree('(S1 (NP (NN tree)))')

    Or from an existing InputTree (internal SWIG object). Users will
    generally want the former.

    Raises TypeError if the argument is neither an InputTree nor a
    string."""
    if not isinstance(input_tree_or_string, parser.InputTree):
        if not isinstance(input_tree_or_string, basestring):
            # Wrap the value in a 1-tuple: with a bare tuple argument
            # the % operator would treat its elements as separate
            # format arguments and fail before the message is built.
            raise TypeError("input_tree_or_string (%r) must be "
                            "an InputTree or string." %
                            (input_tree_or_string,))
        input_tree_or_string = \
            parser.inputTreeFromString(input_tree_or_string)
    self._tree = input_tree_or_string
    # starts out unset
    self._sd_tokens = None
def __str__(self):
    """Represent the n-best list in a similar output format to the
    command-line parser and reranker.

    The header line holds the number of parses and the sentence ID;
    'x' stands in when no ID was given (None or an empty string). A
    reranked list then has, for each parse, a line with its reranker
    and parser scores and a line with the parse. A list that has not
    been reranked is rendered by parser.asNBestList(), or as
    '0 <sentence_id>' when it has no parses."""
    sentence_id = self.sentence_id
    # Only substitute the placeholder when there really is no ID: a
    # plain truthiness test would also replace a legitimate ID of 0.
    if sentence_id is None or sentence_id == '':
        sentence_id = 'x'
    if self._reranked:
        # One join gives the same text as a cStringIO
        # write/seek/read round trip, without needing the
        # Python-2-only cStringIO module.
        pieces = ['%d %s\n' % (len(self.parses), sentence_id)]
        for parse in self.parses:
            pieces.append('%s %s\n%s\n' % (parse.reranker_score,
                                           parse.parser_score,
                                           parse.ptb_parse))
        return ''.join(pieces)
    if self._parses:
        return parser.asNBestList(self._parses, str(sentence_id))
    return '0 %s' % sentence_id
def evaluate(self, gold_tree):
    """Compare this tree with gold_tree and return PARSEVAL figures.

    The result is a dictionary with these keys:

        gold, test, matched - integers for numbers of brackets
        precision, recall, fscore - floats between 0 and 1

    A parser model must be loaded first (otherwise a ValueError is
    raised), because the parser models include information about which
    tags are punctuation."""
    if not RerankingParser._parser_terms_loaded:
        raise ValueError("You need to have loaded a parser model in "
                         "order to evaluate.")
    scorer = parser.ScoreTree()
    stats = scorer.score(self._tree, gold_tree._tree)
    results = {
        'gold': stats.numInGold,
        'test': stats.numInGuessed,
        'matched': stats.numCorrect,
    }
    results['fscore'] = stats.fMeasure()
    results['precision'] = stats.precision()
    results['recall'] = stats.recall()
    return results
def load_parser_model(self, model_dir, terms_only=False,
                      heads_only=False, **parser_options):
    """Load the parsing model found in model_dir, fully or in part.

    By default the full model is loaded and parser_options are passed
    to set_parser_options() (the defaults are normally fine). Only one
    full model can be loaded per process: calling this after a full
    model has been loaded raises RuntimeError.

    terms_only=True loads just the part of speech tag information
    (intended for tools which only call things like Tree.evaluate()).
    heads_only=True loads just the head finding information (for
    things like Tree.dependencies()). With both set, both pieces are
    loaded but the full parsing model is not. parser_options are not
    applied in these partial modes.

    Raises ValueError if model_dir is not an ASCII string or does not
    exist."""
    if RerankingParser._parser_model_loaded:
        raise RuntimeError('Parser is already loaded and can only '
                           'be loaded once.')
    try:
        model_dir = str(model_dir)
    except UnicodeEncodeError:
        raise ValueError('Parser model directory %r must be an ASCII '
                         'string.' % model_dir)
    if not exists(model_dir):
        raise ValueError('Parser model directory %r does not exist.' %
                         model_dir)

    partial_load = terms_only or heads_only
    if not partial_load:
        # full model: everything counts as loaded from here on
        RerankingParser._parser_model_loaded = True
        RerankingParser._parser_heads_loaded = True
        RerankingParser._parser_terms_loaded = True
        self.parser_model_dir = model_dir
        parser.loadModel(model_dir)
        self.set_parser_options(**parser_options)
        return

    if terms_only:
        RerankingParser._parser_terms_loaded = True
        parser.loadTermsOnly(model_dir)
    if heads_only:
        RerankingParser._parser_heads_loaded = True
        parser.loadHeadInfoOnly(model_dir)
def parse_constrained(self, tokens, constraints, possible_tags=None,
                      rerank='auto', sentence_id=None):
    """Parse pre-tokenized text under POS and/or phrasal constraints.

    constraints is a dictionary of {(start, end): terms} meaning that
    all spans between [start, end) must be one of those terms. start
    and end are integers (end must be greater than start, otherwise a
    ValueError is raised) and terms is a single string or a list of
    strings. tokens must be a sequence, not a string.

    possible_tags supplies external POS tags as in parse_tagged(). The
    two mechanisms differ slightly for a single word: possible_tags can
    force a word to take a POS tag that the parser's tagger would not
    ordinarily use, whereas a constraint only limits the set of
    allowable tags. Also, constraints do not change the probability of
    the final tree, while possible_tags changes the probabilities of
    words given tags and may change the overall probability.

    If parsing with constraints raises RuntimeError or yields no
    parses, one more attempt is made with the constraints relaxed
    (minSizeForParsing set to 2). If that fails too, the returned
    NBestList has no parses.

    The rerank flag is the same as in parse()."""
    rerank = self.check_models_loaded_or_error(rerank)
    if isinstance(tokens, basestring):
        raise ValueError("tokens must be a sequence, not a string.")

    span_constraints = None
    if constraints:
        span_constraints = parser.LabeledSpans()
        for span, labels in constraints.items():
            first, last = span
            if last <= first:
                raise ValueError("End must be at least start + 1:"
                                 "(%r, %r) -> %r" % (first, last, labels))
            # Tree.label currently returns a DeprecatedGetter, so a
            # single label may arrive as one of those; convert it back
            # to a plain string to avoid type errors
            if isinstance(labels, (basestring, DeprecatedGetter)):
                labels = [str(labels)]
            for label in labels:
                span_constraints.addConstraint(int(first), int(last),
                                               str(label))

    ext_pos = self._possible_tags_to_ext_pos(tokens, possible_tags or {})
    sentence = Sentence(tokens)

    try:
        candidates = parser.parse(sentence.sentrep, ext_pos,
                                  span_constraints)
        if constraints and not candidates:
            # an empty result is handled like a parse failure below
            raise RuntimeError("Reparsing with relaxed constraints")
    except RuntimeError:
        candidates = []
        if span_constraints:
            # relax the constraints and retry once
            span_constraints.minSizeForParsing = 2
            try:
                candidates = parser.parse(sentence.sentrep, ext_pos,
                                          span_constraints)
            except RuntimeError:
                candidates = []

    result = NBestList(sentence, candidates, sentence_id)
    if not rerank:
        return result
    result.rerank(self)
    return result