Example #1
 def __init__(self, text_or_tokens, max_sentence_length=399):
     if isinstance(text_or_tokens, Sentence):
         self.sentrep = text_or_tokens.sentrep
     elif isinstance(text_or_tokens, basestring):
         self.sentrep = parser.tokenize('<s> ' + text_or_tokens + ' </s>',
                                        max_sentence_length)
     else:
         self.sentrep = parser.SentRep(text_or_tokens)
    def _find_bad_tag_and_raise_error(self, tags):
        ext_pos = parser.ExtPos()
        bad_tags = set()
        for tag in set(tags):
            good_tag = ext_pos.addTagConstraints(parser.StringVector([tag]))
            if not good_tag:
                bad_tags.add(tag)

        raise ValueError("Invalid POS tags (not present in the parser's "
                         "terms.txt file): %s" % ', '.join(sorted(bad_tags)))
 def __init__(self, text_or_tokens):
     if isinstance(text_or_tokens, Sentence):
         self.sentrep = text_or_tokens.sentrep
     elif isinstance(text_or_tokens, basestring):
         self.sentrep = parser.tokenize('<s> ' + text_or_tokens + ' </s>')
     else:
         # text_or_tokens is a sequence -- need to make sure that each
         # element is a string to avoid crashing
         text_or_tokens = [parser.ptbEscape(str(token))
             for token in text_or_tokens]
         self.sentrep = parser.SentRep(text_or_tokens)
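A quick usage sketch for this constructor (assuming the Sentence class above, imported from bllipparser.RerankingParser; the sentence text is hypothetical). Both forms should yield equivalent objects:

    s1 = Sentence('The dog barked.')              # string input: tokenized by the parser
    s2 = Sentence(['The', 'dog', 'barked', '.'])  # sequence input: used as-is (after ptbEscape)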
 def load_parser_model(self, model_dir, **parser_options):
     """Load the parsing model from model_dir and set parsing
     options. In general, the default options should suffice but see
     the set_parser_options() method for details. Note that the parser
     does not allow loading multiple models within the same process
     (calling this function twice will raise a RuntimeError)."""
     if self._parser_model_loaded:
         raise RuntimeError('Parser is already loaded and can only be loaded once.')
     if not exists(model_dir):
         raise ValueError('Parser model directory %r does not exist.' % model_dir)
     self._parser_model_loaded = True
     self.parser_model_dir = model_dir
     parser.loadModel(model_dir)
     self.set_parser_options(**parser_options)
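A minimal setup sketch, assuming a RerankingParser instance; the model path is hypothetical. Per the docstring above, a second call would raise a RuntimeError:

    rrp = RerankingParser()
    rrp.load_parser_model('/path/to/parser-model')  # hypothetical path
    # rrp.load_parser_model(...)  # calling again raises RuntimeError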
 def _possible_tags_to_ext_pos(self, tokens, possible_tags):
     ext_pos = parser.ExtPos()
     if not possible_tags:
         return ext_pos
     for index in range(len(tokens)):
         tags = possible_tags.get(index, [])
         if isinstance(tags, basestring):
             tags = [tags]
         tags = map(str, tags)
         valid_tags = ext_pos.addTagConstraints(parser.StringVector(tags))
         if not valid_tags:
             # at least one of the tags is bad -- find out which ones
             # and throw a ValueError
             self._find_bad_tag_and_raise_error(tags)
     return ext_pos
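To illustrate the expected shape of possible_tags (the tags here are hypothetical): keys are token indices, values are either a single tag string or a list of candidate tags, and unlisted tokens are unconstrained:

    # token 0 must be NNP; token 2 may be VBZ or NNS; token 1 is unconstrained
    possible_tags = {0: 'NNP', 2: ['VBZ', 'NNS']}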
 def __init__(self, text_or_tokens):
     if isinstance(text_or_tokens, parser.SentRep):
         # ensure that Python owns the pointer
         text_or_tokens.this.acquire()
         self.sentrep = text_or_tokens
     elif isinstance(text_or_tokens, Sentence):
         self.sentrep = text_or_tokens.sentrep
     elif isinstance(text_or_tokens, basestring):
         self.sentrep = parser.tokenize('<s> ' + text_or_tokens + ' </s>')
     else:
         # text_or_tokens is a sequence -- need to make sure that each
         # element is a string to avoid crashing
         text_or_tokens = [
             parser.ptbEscape(str(token)) for token in text_or_tokens
         ]
         self.sentrep = parser.SentRep(text_or_tokens)
    def fuse(self,
             threshold=0.5,
             exponent=1,
             num_parses=50,
             use_parser_scores=False):
        """Combine the parses in this n-best list into a single Tree
        using parse fusion. This results in a significant accuracy
        improvement. You may want to tune the parameters for your specific
        parsing model. See Choe, McClosky, and Charniak (EMNLP 2015) for
        more details. This will use the scores from the reranker unless
        the n-best list wasn't reranked or use_parser_scores=True. If
        fusion fails, the top parse from the list will be returned."""
        parses = self.parses[:num_parses]

        if use_parser_scores or not self._reranked:
            scores = [scored_parse.parser_score for scored_parse in parses]
        else:
            scores = [scored_parse.reranker_score for scored_parse in parses]
        norm_scores = normalize_logprobs(scores, exponent=exponent)

        chart = parser.SimpleChart(len(self._sentrep))
        for norm_score, scored_parse in zip(norm_scores, parses):
            chart.populate(scored_parse.ptb_parse._tree, norm_score)
        chart.prune(threshold)

        tree = chart.parse()
        if tree is None:
            # parse failed -- use original 1-best parse
            tree = parses[0].ptb_parse
        else:
            tree = Tree(tree)
        return tree
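A hedged usage sketch (assumes rrp is a RerankingParser with its models loaded; the sentence is hypothetical):

    nbest_list = rrp.parse('The old man the boat.')
    fused = nbest_list.fuse(threshold=0.5, num_parses=50)  # returns a Tree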
    def parse(self, sentence, rerank='auto', sentence_id=None):
        """Parse some text or tokens and return an NBestList with the
        results. sentence can be a string or a sequence. If it is a
        string, it will be tokenized. If rerank is True, we will rerank
        the n-best list; if False, the reranker will not be used. rerank
        can also be set to 'auto' which will only rerank if a reranker
        model is loaded. If there are no parses or an error occurs,
        this will return an empty NBestList."""
        rerank = self.check_models_loaded_or_error(rerank)

        sentence = Sentence(sentence)
        # max_sentence_length is actually 1 longer than the maximum
        # allowed sentence length
        if len(sentence) >= parser.max_sentence_length - 1:
            raise ValueError("Sentence is too long (%s tokens, must be "
                             "under %s)" %
                             (len(sentence), parser.max_sentence_length - 1))

        try:
            parses = parser.parse(sentence.sentrep)
        except RuntimeError:
            parses = []
        nbest_list = NBestList(sentence, parses, sentence_id)
        if rerank:
            nbest_list.rerank(self)
        return nbest_list
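A brief usage sketch (rrp is assumed to be a loaded RerankingParser):

    nbest_list = rrp.parse('Parse this sentence.', rerank='auto')
    top = nbest_list.parses[0] if nbest_list.parses else None  # best-scoring parse, if any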
    def parse_tagged(self, tokens, possible_tags, rerank='auto'):
        """Parse some pre-tagged, pre-tokenized text. tokens must be a
        sequence of strings. possible_tags is a map from token indices
        to possible POS tags (strings). Tokens without an entry in
        possible_tags will be unconstrained by POS. POS tags must be
        in the terms.txt file in the parsing model or else you will get
        a ValueError. If rerank is True, we will rerank the n-best list;
        if False, the reranker will not be used. rerank can also be set to
        'auto' which will only rerank if a reranker model is loaded."""
        rerank = self._check_loaded_models(rerank)
        if isinstance(tokens, basestring):
            raise ValueError("tokens must be a sequence, not a string.")

        ext_pos = parser.ExtPos()
        for index in range(len(tokens)):
            tags = possible_tags.get(index, [])
            if isinstance(tags, basestring):
                tags = [tags]
            tags = map(str, tags)
            valid_tags = ext_pos.addTagConstraints(parser.VectorString(tags))
            if not valid_tags:
                # at least one of the tags is bad -- find out which ones
                # and throw a ValueError
                self._find_bad_tag_and_raise_error(tags)

        sentence = Sentence(tokens)
        parses = parser.parse(sentence.sentrep, ext_pos,
            self._parser_thread_slot)
        nbest_list = NBestList(sentence, parses)
        if rerank:
            nbest_list.rerank(self)
        return nbest_list
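A hedged example of calling this method (rrp assumed loaded; the tokens and tag are hypothetical):

    tokens = ['Time', 'flies']
    # constrain token 1 to be a verb; token 0 is left unconstrained
    nbest_list = rrp.parse_tagged(tokens, possible_tags={1: 'VBZ'})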
 def sentences_from_file(this_class, filename):
     """Given the path to a filename containing multiple SGML(-ish)
     lines (typical input to the command line parser), returns a list
     of Sentence objects (one for each tree in the text)."""
     # Note that the native method below leaks. We work around this
     # by acquiring its pointer in __init__
     sentReps = parser.sentRepsFromFile(filename)
     return map(this_class, sentReps)
 def trees_from_file(this_class, filename):
     """Given the path to a file containing multiple Penn Treebank
     trees, returns a list of Tree objects (one for each tree in the
     file)."""
     # see trees_from_string for an explanation
     trees = list(parser.inputTreesFromFile(filename))
     for tree in trees:
         tree.this.acquire()
     return map(this_class, trees)
 def __init__(self, text_or_tokens, max_sentence_length=399):
     if isinstance(text_or_tokens, Sentence):
         self.sentrep = text_or_tokens.sentrep
     elif isinstance(text_or_tokens, basestring):
         self.sentrep = parser.tokenize('<s> ' + text_or_tokens + ' </s>',
                                        max_sentence_length)
     else:
         # text_or_tokens is a sequence -- need to make sure that each
         # element is a string to avoid crashing
         text_or_tokens = map(str, text_or_tokens)
         self.sentrep = parser.SentRep(text_or_tokens)
Example #19
 def __init__(self):
     """Create an empty reranking parser. You'll need to call
     load_parsing_model() at minimum and load_reranker_model() if
     you're using the reranker. See also the load_unified_model_dir()
     classmethod which will take care of calling both of these
     for you."""
     self._parser_model_loaded = False
     self.parser_model_dir = None
     self.reranker_model = None
     self._parser_thread_slot = parser.ThreadSlot()
     self.unified_model_dir = None
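A sketch of typical setup following the docstring above (the model path is hypothetical):

    rrp = RerankingParser()
    rrp.load_parsing_model('/path/to/parser-model')
    # optionally call rrp.load_reranker_model(...) if using the reranker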
    def log_prob(self):
        """Asks the current first-stage parsing model to score an existing
        tree. Returns parser model's log probability. Python equivalent of the
        evalTree command line tool.

        Note that you must have a parser model loaded in order to use
        this method (otherwise you'll get a ValueError)."""
        if not RerankingParser._parser_model_loaded:
            raise ValueError("You need to have loaded a parser model in "
                             "order to get the log probability.")
        return parser.treeLogProb(self._tree)
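A usage sketch (assumes a parser model has already been loaded):

    tree = Tree('(S1 (NP (NN tree)))')
    print(tree.log_prob())  # the parser model's log probability for this tree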
    def sentences_from_string(this_class, text):
        """Given text containing SGML(-ish) lines (typical input to
        the command line parser), returns a list of Sentence objects
        (one for each tree in the text). Example usage:

        >>> Sentence.sentences_from_string('<s> Test </s>')
        [bllipparser.RerankingParser.Sentence(['Test'])]
        """
        # Note that the native method below leaks. We work around this
        # by acquiring its pointer in __init__
        sentReps = parser.sentRepsFromString(text)
        return map(this_class, sentReps)
Example #24
    def parse_tagged(self, tokens, possible_tags, rerank=True):
        """Parse some pre-tagged, pre-tokenized text.  tokens is a
        sequence of strings.  possible_tags is a map from token indices
        to possible POS tags.  Tokens without an entry in possible_tags
        will be unconstrained by POS.  If rerank is True, we will
        rerank the n-best list."""
        self.check_loaded_models(rerank)

        ext_pos = parser.ExtPos()
        for index in range(len(tokens)):
            tags = possible_tags.get(index, [])
            if isinstance(tags, basestring):
                tags = [tags]
            ext_pos.addTagConstraints(parser.VectorString(tags))

        sentence = Sentence(tokens)
        parses = parser.parse(sentence.sentrep, ext_pos,
                              self._parser_thread_slot)
        nbest_list = NBestList(sentence, parses)
        if rerank:
            nbest_list.rerank(self)
        return nbest_list
    def set_parser_options(self, language='En', case_insensitive=False,
                           nbest=50, small_corpus=True, overparsing=21,
                           debug=0, smooth_pos=0):
        """Set options for the parser. Note that this is called
        automatically by load_parser_model() so you should only need to
        call this to update the parsing options. The method returns a
        dictionary of the new options.

        The options are as follows: language is a string describing
        the language. Currently, it can be one of En (English), Ch
        (Chinese), or Ar (Arabic). case_insensitive will make the parser
        ignore capitalization. nbest is the maximum size of the n-best
        list. small_corpus=True enables additional smoothing (originally
        intended for training from small corpora, but helpful in many
        situations). overparsing determines how much more time the parser
        will spend on a sentence relative to the time it took to find the
        first possible complete parse. This affects the speed/accuracy
        tradeoff. debug takes a non-negative integer. Setting it higher
        than 0 will cause the parser to print debug messages (surprising,
        no?). Setting smooth_pos to a number higher than 0 will cause the
        parser to assign that value as the probability of seeing a known
        word in a new part-of-speech (one never seen in training)."""
        if not RerankingParser._parser_model_loaded:
            raise RuntimeError('Parser must already be loaded (call '
                               'load_parser_model() first)')

        parser.setOptions(language, case_insensitive, nbest, small_corpus,
                          overparsing, debug, smooth_pos)
        self.parser_options = {
            'language': language,
            'case_insensitive': case_insensitive,
            'nbest': nbest,
            'small_corpus': small_corpus,
            'overparsing': overparsing,
            'debug': debug,
            'smooth_pos': smooth_pos
        }
        return self.parser_options
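A sketch of updating options after the model is loaded (rrp assumed loaded; note that only nbest changes here, the remaining options revert to the defaults shown in the signature):

    new_options = rrp.set_parser_options(nbest=10)
    print(new_options['nbest'])  # 10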
 def trees_from_string(this_class, text):
     """Given text containing multiple Penn Treebank trees, returns
     a list of Tree objects (one for each tree in the text)."""
     # Note: the native method below gives us memory ownership of
     # the InputTree objects in the vector. We acquire their pointers
     # and store them in a Python list (the vector won't stick
     # around). InputTree objects typically contain other InputTree
     # objects and the outer tree will free the inner trees when it is
     # deleted. So, we only need (and want) to acquire the pointer of
     # the outermost InputTree tree.
     trees = list(parser.inputTreesFromString(text))
     for tree in trees:
         tree.this.acquire()
     return map(this_class, trees)
Example #29
 def __str__(self):
     """Represent the n-best list in a similar output format to the
     command-line parser and reranker."""
     if self._reranked:
         from cStringIO import StringIO
         combined = StringIO()
         combined.write('%d dummy\n' % len(self.parses))
         for parse in self.parses:
             combined.write('%s %s\n%s\n' % \
                 (parse.reranker_score, parse.parser_score, parse.ptb_parse))
         combined.seek(0)
         return combined.read()
     else:
         return parser.asNBestList(self._parses)
    def __init__(self, input_tree_or_string):
        """These can be constructed from the Penn Treebank string
        representations of trees, e.g.:

            >>> Tree('(S1 (NP (NN tree)))')
            bllipparser.RerankingParser.Tree('(S1 (NP (NN tree)))')

        Or from an existing InputTree (internal SWIG object). Users will
        generally want the former."""
        if not isinstance(input_tree_or_string, parser.InputTree):
            if not isinstance(input_tree_or_string, basestring):
                raise TypeError("input_tree_or_string (%r) must be an InputTree or string." % input_tree_or_string)
            input_tree_or_string = \
                parser.inputTreeFromString(input_tree_or_string)
        self._tree = input_tree_or_string
Example #31
 def load_parsing_model(self,
                        model_dir,
                        language='En',
                        case_insensitive=False,
                        nbest=50,
                        small_corpus=True,
                        overparsing=21,
                        debug=0,
                        smoothPos=0):
     """Load the parsing model from model_dir and set parsing
     options. In general, the default options should suffice. Note
     that the parser does not allow loading multiple models within
     the same process."""
     if self._parser_model_loaded:
         raise ValueError(
             'Parser is already loaded and can only be loaded once.')
     if not os.path.exists(model_dir):
         raise ValueError('Parser model directory %r does not exist.' %
                          model_dir)
     self._parser_model_loaded = True
     parser.loadModel(model_dir)
     self.parser_model_dir = model_dir
     parser.setOptions(language, case_insensitive, nbest, small_corpus,
                       overparsing, debug, smoothPos)
Example #32
    def parse(self, sentence, rerank=True, max_sentence_length=399):
        """Parse some text or tokens and return an NBestList with the
        results.  sentence can be a string or a sequence.  If it is a
        string, it will be tokenized.  If rerank is True, we will rerank
        the n-best list."""
        self.check_loaded_models(rerank)

        sentence = Sentence(sentence, max_sentence_length)
        try:
            parses = parser.parse(sentence.sentrep, self._parser_thread_slot)
        except RuntimeError:
            parses = []
        nbest_list = NBestList(sentence, parses)
        if rerank:
            nbest_list.rerank(self)
        return nbest_list
    def parse(self, sentence, rerank='auto', max_sentence_length=399):
        """Parse some text or tokens and return an NBestList with the
        results. sentence can be a string or a sequence. If it is a
        string, it will be tokenized. If rerank is True, we will rerank
        the n-best list; if False, the reranker will not be used. rerank
        can also be set to 'auto' which will only rerank if a reranker
        model is loaded."""
        rerank = self._check_loaded_models(rerank)

        sentence = Sentence(sentence, max_sentence_length)
        try:
            parses = parser.parse(sentence.sentrep, self._parser_thread_slot)
        except RuntimeError:
            parses = []
        nbest_list = NBestList(sentence, parses)
        if rerank:
            nbest_list.rerank(self)
        return nbest_list
    def __init__(self, input_tree_or_string):
        """These can be constructed from the Penn Treebank string
        representations of trees, e.g.:

            >>> Tree('(S1 (NP (NN tree)))')
            Tree('(S1 (NP (NN tree)))')

        Or from an existing InputTree (internal SWIG object). Users will
        generally want the former."""
        if not isinstance(input_tree_or_string, parser.InputTree):
            if not isinstance(input_tree_or_string, basestring):
                raise TypeError("input_tree_or_string (%r) must be "
                                "an InputTree or string." %
                                input_tree_or_string)
            input_tree_or_string = \
                parser.inputTreeFromString(input_tree_or_string)
        self._tree = input_tree_or_string
        self._sd_tokens = None
 def __str__(self):
     """Represent the n-best list in a similar output format to the
     command-line parser and reranker."""
     sentence_id = self.sentence_id or 'x'
     if self._reranked:
         from cStringIO import StringIO
         combined = StringIO()
         combined.write('%d %s\n' % (len(self.parses), sentence_id))
         for parse in self.parses:
             combined.write('%s %s\n%s\n' % (parse.reranker_score,
                                             parse.parser_score,
                                             parse.ptb_parse))
         combined.seek(0)
         return combined.read()
     else:
         if self._parses:
             return parser.asNBestList(self._parses, str(sentence_id))
         else:
             return '0 %s' % sentence_id
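For reference, printing a reranked n-best list produces output of roughly this shape (the placeholders below stand in for real scores and trees):

    print(str(nbest_list))
    # <num_parses> <sentence_id>
    # <reranker_score> <parser_score>
    # <ptb_parse>
    # ... one score line and one tree per parse; unreranked lists
    # defer to parser.asNBestList instead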
    def evaluate(self, gold_tree):
        """Score this tree against a gold tree and return a dictionary with
        PARSEVAL information. Keys:
            gold, test, matched - integers for numbers of brackets
            precision, recall, fscore - floats between 0 and 1

        Note that you must have a parser model loaded in order to
        evaluate parses (otherwise you'll get a ValueError). This is
        because the parser models include information about which tags
        are punctuation."""
        if not RerankingParser._parser_terms_loaded:
            raise ValueError("You need to have loaded a parser model in "
                             "order to evaluate.")
        scorer = parser.ScoreTree()
        stats = scorer.score(self._tree, gold_tree._tree)
        gold = stats.numInGold
        test = stats.numInGuessed
        matched = stats.numCorrect
        return dict(gold=gold,
                    test=test,
                    matched=matched,
                    fscore=stats.fMeasure(),
                    precision=stats.precision(),
                    recall=stats.recall())
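A hedged example of PARSEVAL scoring (assumes a parser model, or at least its terms, has been loaded):

    gold = Tree('(S1 (NP (NN tree)))')
    test = Tree('(S1 (NP (NN tree)))')
    print(test.evaluate(gold)['fscore'])  # 1.0, since the trees are identical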
    def load_parser_model(self,
                          model_dir,
                          terms_only=False,
                          heads_only=False,
                          **parser_options):
        """Load the parsing model from model_dir and set parsing
        options. In general, the default options should suffice but see
        the set_parser_options() method for details. Note that the parser
        does not allow loading multiple models within the same process
        (calling this function twice will raise a RuntimeError).

        If terms_only is True, we will not load the full parsing model,
        just part of speech tag information (intended for tools which
        only call things like Tree.evaluate()). If heads_only is True,
        we will only load head finding information (for things like
        Tree.dependencies()). If both are set to True, both of these will
        be loaded but the full parsing model will not."""
        if RerankingParser._parser_model_loaded:
            raise RuntimeError('Parser is already loaded and can only '
                               'be loaded once.')
        try:
            model_dir = str(model_dir)
        except UnicodeEncodeError:
            raise ValueError('Parser model directory %r must be an ASCII '
                             'string.' % model_dir)
        if not exists(model_dir):
            raise ValueError('Parser model directory %r does not exist.' %
                             model_dir)
        if not (terms_only or heads_only):
            RerankingParser._parser_model_loaded = True
            RerankingParser._parser_heads_loaded = True
            RerankingParser._parser_terms_loaded = True
            self.parser_model_dir = model_dir
            parser.loadModel(model_dir)
            self.set_parser_options(**parser_options)
        else:
            if terms_only:
                RerankingParser._parser_terms_loaded = True
                parser.loadTermsOnly(model_dir)
            if heads_only:
                RerankingParser._parser_heads_loaded = True
                parser.loadHeadInfoOnly(model_dir)
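A sketch of the lightweight loading mode described in the docstring (the model path is hypothetical):

    rrp = RerankingParser()
    # load only POS tag info, e.g. for Tree.evaluate(), skipping the full model
    rrp.load_parser_model('/path/to/parser-model', terms_only=True)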
    def parse_constrained(self,
                          tokens,
                          constraints,
                          possible_tags=None,
                          rerank='auto',
                          sentence_id=None):
        """Parse pre-tokenized text with part of speech and/or phrasal
        constraints. Constraints is a dictionary of

            {(start, end): [terms]}

        which represents the constraint that all spans between [start,end)
        must be one of the terms in that list. start and end are integers
        and terms can be a single string or a list of strings.

        This also allows you to incorporate external POS tags as in
        parse_tagged(). While you can specify a constraint or an external
        POS tag for a word, the semantics are slightly different. Setting
        a tag with possible_tags will allow you to force a word to be a
        POS tag that the parser's tagger would not ordinarily use for
        a tag. Setting a constraint with constraints would only limit
        the set of allowable tags.  Additionally, setting constraints
        doesn't change the probability of the final tree whereas setting
        possible_tags changes the probabilities of words given tags and
        may change the overall probability.

        The rerank flag is the same as in parse()."""
        rerank = self.check_models_loaded_or_error(rerank)
        if isinstance(tokens, basestring):
            raise ValueError("tokens must be a sequence, not a string.")

        if constraints:
            span_constraints = parser.LabeledSpans()
            for (start, end), terms in constraints.items():
                if end <= start:
                    raise ValueError("End must be at least start + 1:"
                                     "(%r, %r) -> %r" % (start, end, terms))
                # since Tree.label currently returns a DeprecatedGetter,
                # we take some extra steps to get these back to strings
                # to avoid type errors
                if isinstance(terms, (basestring, DeprecatedGetter)):
                    terms = [str(terms)]
                for term in terms:
                    span_constraints.addConstraint(int(start), int(end),
                                                   str(term))
        else:
            span_constraints = None

        possible_tags = possible_tags or {}
        ext_pos = self._possible_tags_to_ext_pos(tokens, possible_tags)
        sentence = Sentence(tokens)
        try:
            parses = parser.parse(sentence.sentrep, ext_pos, span_constraints)
            if constraints and not parses:
                raise RuntimeError("Reparsing with relaxed constraints")
        except RuntimeError:
            if span_constraints:
                # we should relax them and retry
                span_constraints.minSizeForParsing = 2
                try:
                    parses = parser.parse(sentence.sentrep, ext_pos,
                                          span_constraints)
                except RuntimeError:
                    parses = []
            else:
                parses = []
        nbest_list = NBestList(sentence, parses, sentence_id)
        if rerank:
            nbest_list.rerank(self)
        return nbest_list
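A hedged example of a phrasal constraint (rrp assumed loaded; the tokens and label are hypothetical):

    tokens = ['British', 'left', 'waffles', 'on', 'Falklands']
    # require the span [0, 2) -- 'British left' -- to be a noun phrase
    nbest_list = rrp.parse_constrained(tokens, {(0, 2): 'NP'})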