def __init__(self, text_or_tokens, max_sentence_length=399):
    if isinstance(text_or_tokens, Sentence):
        # copy constructor: share the existing SentRep
        self.sentrep = text_or_tokens.sentrep
    elif isinstance(text_or_tokens, basestring):
        # raw text: wrap in sentence markers and tokenize
        self.sentrep = parser.tokenize('<s> ' + text_or_tokens + ' </s>',
                                       max_sentence_length)
    else:
        # otherwise, assume a sequence of already tokenized words
        self.sentrep = parser.SentRep(text_or_tokens)
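# A construction sketch for the wrapper above (the text is illustrative;
# `parser` refers to the SWIG module already imported by this file):
#
#   by_text = Sentence('Tokenize this , please .')
#   by_tokens = Sentence(['Tokenize', 'this', ',', 'please', '.'])
#   by_copy = Sentence(by_text)  # reuses the existing SentRep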
def load_parsing_model(self, model_dir, language='En',
                       case_insensitive=False, nbest=50, small_corpus=True,
                       overparsing=21, debug=0, smoothPos=0):
    # the SWIG parser keeps global state, so a model can only be loaded once
    assert not self._parser_model_loaded
    self._parser_model_loaded = True
    parser.loadModel(model_dir)
    parser.setOptions(language, case_insensitive, nbest, small_corpus,
                      overparsing, debug, smoothPos)
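# A loading sketch (the model path is an assumption, not a real location;
# RerankingParser is assumed to be the class this method belongs to):
#
#   rrp = RerankingParser()
#   rrp.load_parsing_model('/path/to/parsing-model', nbest=10)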
def test_parse():
    sr1 = parser.SentRep(['These', 'are', 'tokens', '.'])
    sr2 = test_tokenizer()
    for sr in (sr1, sr2):
        parses = parser.parse(sr, thread_slot)
        display_parses(parses)
        print '---'
def test_tokenizer():
    sr = parser.tokenize(
        "junk <s> It's some text to tokenize, if you feel like it -- or not. </s>",
        399)
    print 'sr %r' % str(sr)
    print 'sr length', len(sr)
    for i in range(len(sr)):
        print 'sr word', i, sr.getWord(i).lexeme()
    return sr
def test_threadslot():
    print 'parser.ThreadSlot contents:', dir(parser.ThreadSlot)
    print
    z = parser.ThreadSlot()
    print z
    print z.acquiredThreadSlot()
    print z.recycle()
    print z.acquiredThreadSlot()
    print z.acquire()
    print z.acquiredThreadSlot()
def __str__(self):
    if self._reranked:
        from cStringIO import StringIO
        combined = StringIO()
        combined.write('%d dummy\n' % len(self.parses))
        for parse in self.parses:
            combined.write('%s %s\n%s\n' %
                           (parse.reranker_score, parse.parser_score,
                            parse.ptb_parse))
        combined.seek(0)
        return combined.read()
    else:
        return parser.asNBestList(self._parses)
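# For reference, the reranked string format written above looks roughly like
# this (the scores and trees are illustrative, not real output):
#
#   2 dummy
#   -81.42 -135.67
#   (S1 (S (NP (DT These)) ...))
#   -82.05 -136.11
#   (S1 (S (NP (DT Those)) ...))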
def test_extpos():
    sr1 = parser.SentRep(['record'])
    print 'Unconstrained'
    display_parses(parser.parse(sr1, thread_slot))
    print 'NN'
    ext_pos1 = parser.ExtPos()
    ext_pos1.addTagConstraints(parser.VectorString(['NN']))
    display_parses(parser.parse(sr1, ext_pos1, thread_slot))
    print 'VB'
    ext_pos2 = parser.ExtPos()
    ext_pos2.addTagConstraints(parser.VectorString(['VB']))
    display_parses(parser.parse(sr1, ext_pos2, thread_slot))
def test_multiword_extpos(): sr1 = parser.SentRep("British left waffles on Falklands .".split()) print "waffles = [anything]:" display_parses(parser.parse(sr1)) if 1: print "waffles = VBZ/VBD/VB:" ext_pos = parser.ExtPos() ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector(["VBZ", "VBD", "VB"])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) display_parses(parser.parse(sr1, ext_pos)) print "waffles = NNS:" ext_pos = parser.ExtPos() ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector(["NNS"])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) display_parses(parser.parse(sr1, ext_pos)) print "waffles = NN/NNS:" ext_pos = parser.ExtPos() ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector(["NN", "NNS"])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) display_parses(parser.parse(sr1, ext_pos))
def test_multiword_extpos(): sr1 = parser.SentRep('British left waffles on Falklands .'.split()) print 'waffles = [anything]:' display_parses(parser.parse(sr1)) if 1: print 'waffles = VBZ/VBD/VB:' ext_pos = parser.ExtPos() ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector(['VBZ', 'VBD', 'VB'])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) display_parses(parser.parse(sr1, ext_pos, None)) print 'waffles = NNS:' ext_pos = parser.ExtPos() ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector(['NNS'])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) display_parses(parser.parse(sr1, ext_pos, None)) print 'waffles = NN/NNS:' ext_pos = parser.ExtPos() ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector(['NN', 'NNS'])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) ext_pos.addTagConstraints(parser.StringVector([])) display_parses(parser.parse(sr1, ext_pos, None))
def parse(self, sentence, rerank=True, max_sentence_length=399):
    """Parse some text or tokens and return an NBestList with the
    results. sentence can be a string or a sequence. If it is a
    string, it will be tokenized. If rerank is True, we will rerank
    the n-best list."""
    assert self._parser_model_loaded
    sentence = Sentence(sentence, max_sentence_length)
    try:
        parses = parser.parse(sentence.sentrep, self._parser_thread_slot)
    except RuntimeError:
        # the parser can fail on some inputs; treat that as an empty n-best list
        parses = []
    nbest_list = NBestList(sentence, parses)
    if rerank:
        nbest_list.rerank(self)
    return nbest_list
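# A parsing sketch, assuming a model has been loaded as in load_parsing_model
# above (the input text is illustrative); printing the NBestList goes through
# the __str__ method shown earlier:
#
#   nbest_list = rrp.parse('British left waffles on Falklands .')
#   print nbest_list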
def parse_tagged(self, tokens, possible_tags, rerank=True):
    """Parse some pre-tagged, pre-tokenized text. tokens is a sequence
    of strings. possible_tags is a map from token indices to possible
    POS tags. Tokens without an entry in possible_tags will be
    unconstrained by POS. If rerank is True, we will rerank the n-best
    list."""
    assert self._parser_model_loaded
    ext_pos = parser.ExtPos()
    for index in range(len(tokens)):
        tags = possible_tags.get(index, [])
        if isinstance(tags, basestring):
            # allow a single tag to be given as a bare string
            tags = [tags]
        ext_pos.addTagConstraints(parser.VectorString(tags))
    sentence = Sentence(tokens)
    parses = parser.parse(sentence.sentrep, ext_pos,
                          self._parser_thread_slot)
    nbest_list = NBestList(sentence, parses)
    if rerank:
        nbest_list.rerank(self)
    return nbest_list
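# A sketch of constraining a single token's POS (the indices and tags are
# illustrative): force token 1 ('left') to be a past-tense verb while leaving
# every other token unconstrained:
#
#   tokens = 'British left waffles on Falklands .'.split()
#   nbest_list = rrp.parse_tagged(tokens, possible_tags={1: 'VBD'})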
def test_multiword_extpos():
    sr1 = parser.SentRep('British left waffles on Falklands .'.split())
    print 'waffles = [anything]:'
    display_parses(parser.parse(sr1, thread_slot))

    print 'waffles = VBZ/VBD/VB:'
    ext_pos = parser.ExtPos()
    ext_pos.addTagConstraints(parser.VectorString([]))
    ext_pos.addTagConstraints(parser.VectorString([]))
    ext_pos.addTagConstraints(parser.VectorString(['VBZ', 'VBD', 'VB']))
    ext_pos.addTagConstraints(parser.VectorString([]))
    ext_pos.addTagConstraints(parser.VectorString([]))
    ext_pos.addTagConstraints(parser.VectorString([]))
    display_parses(parser.parse(sr1, ext_pos, thread_slot))

    print 'waffles = NNS:'
    ext_pos = parser.ExtPos()
    ext_pos.addTagConstraints(parser.VectorString([]))
    ext_pos.addTagConstraints(parser.VectorString([]))
    ext_pos.addTagConstraints(parser.VectorString(['NNS']))
    ext_pos.addTagConstraints(parser.VectorString([]))
    ext_pos.addTagConstraints(parser.VectorString([]))
    ext_pos.addTagConstraints(parser.VectorString([]))
    display_parses(parser.parse(sr1, ext_pos, thread_slot))

    print 'waffles = NN/NNS:'
    ext_pos = parser.ExtPos()
    ext_pos.addTagConstraints(parser.VectorString([]))
    ext_pos.addTagConstraints(parser.VectorString([]))
    ext_pos.addTagConstraints(parser.VectorString(['NN', 'NNS']))
    ext_pos.addTagConstraints(parser.VectorString([]))
    ext_pos.addTagConstraints(parser.VectorString([]))
    ext_pos.addTagConstraints(parser.VectorString([]))
    display_parses(parser.parse(sr1, ext_pos, thread_slot))
def initialize(n=10):
    # this assumes we're in PARSE/
    parser.loadModel("../DATA/EN")
    # arguments: language, case_insensitive, nbest, small_corpus,
    # overparsing, debug, smoothPos (see load_parsing_model above)
    parser.setOptions('En', False, n, True, 21, 0, 0)
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import SWIGParser
import fileinput

if __name__ == "__main__":
    from test import initialize, display_parses
    thread_slot = SWIGParser.ThreadSlot()
    initialize(n=50)
    for line in fileinput.input():
        line = line.strip()
        print line
        tree = SWIGParser.inputTreeFromString('(S1 ' + line + ')')
        print tree
        sentence = tree.toSentRep()
        print sentence
        parses = SWIGParser.parse(sentence, thread_slot)
        print len(parses), 'parses'
        if not parses:
            # raising a bare string is invalid; raise a real exception instead
            raise RuntimeError('failed')
        display_parses(parses)
        print 'example failure tree', sentence.makeFailureTree('Xyz')
        print
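# Usage sketch for the script above (the filename is illustrative): it reads
# one Penn Treebank-style tree per line from files named on the command line,
# or from stdin if none are given:
#
#   python parse_trees.py trees.txt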
def test_as_nbest_list():
    sr1 = parser.SentRep(['These', 'are', 'tokens', '.'])
    parses = parser.parse(sr1, thread_slot)
    print parser.asNBestList(parses)
if __name__ == "__main__":
    thread_slot = parser.ThreadSlot()
    dir_contents()
    initialize(n=5)
    test_as_nbest_list()
    for x in range(1000):  # memory leak detection
        print 'iteration', x
        test_tokenizer()
        test_parse()
        test_multiword_extpos()
        test_extpos()
def __init__(self):
    self._parser_model_loaded = False
    self._reranker_model = None
    self._parser_thread_slot = parser.ThreadSlot()
def test_as_nbest_list(): sr1 = parser.SentRep(["These", "are", "tokens", "."]) parses = parser.parse(sr1) print parser.asNBestList(parses, "test_as_nbest_list_sentence")
# # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. import SWIGParser import fileinput if __name__ == "__main__": from test import initialize, display_parses initialize(n=50) for line in fileinput.input(): line = line.strip() print line tree = SWIGParser.inputTreeFromString('(S1 ' + line + ')') print tree sentence = tree.toSentRep() print sentence parses = SWIGParser.parse(sentence) print len(parses), 'parses' if not parses: raise 'failed' display_parses(parses) print 'example failure tree', sentence.makeFailureTree('Xyz') print