def parse_tree(tree_string, node_factory=CCGNodeFactory):
    """Parse a CCG derivation string into a derivation tree.

    Raises if any tokens remain unconsumed after the derivation is read.
    """
    token_stream = preserving_split(tree_string, "()<>", suppressors='<>')
    derivation = CCGParser(node_factory).read_paren(token_stream)
    ensure_stream_exhausted(token_stream, 'ccg.parse_tree')
    return derivation
def parse_tree(tree_string, node_factory=CCGNodeFactory):
    """Build a CCG derivation from *tree_string*, consuming every token."""
    ccg_parser = CCGParser(node_factory)
    tokens = preserving_split(tree_string, "()<>", suppressors='<>')
    derivation = ccg_parser.read_paren(tokens)
    # Leftover tokens indicate a malformed derivation string.
    ensure_stream_exhausted(tokens, 'ccg.parse_tree')
    return derivation
def parse_category(cat_string):
    """Parse a category string into a category object.

    Throws DocParseException if unconsumed tokens remain.
    """
    # Mode symbols would also come back as tokens if appended below, so
    # avoid using mode symbols in atomic category labels.
    tokens = preserving_split(cat_string, "(\\/|)[]")  # + ComplexCategory.mode_symbols
    category = parse_compound(tokens)
    ensure_stream_exhausted(tokens, 'cats.parse_category')
    return category
def parse_category(cat_string):
    """Turn a category string into a category object.

    Throws DocParseException if unconsumed tokens remain.
    """
    # NOTE: each mode symbol would be emitted as its own token if appended
    # to the splitter set below; keep mode symbols out of atomic labels.
    stream = preserving_split(cat_string, "(\\/|)[]")  # + ComplexCategory.mode_symbols
    parsed = parse_compound(stream)
    ensure_stream_exhausted(stream, 'cats.parse_category')
    return parsed
def parse_category(cat_string):
    """Parse a category string, attaching an alias when a '~' suffix is present."""
    # Each mode symbol would also come back as a token if appended below;
    # avoid using mode symbols in atomic category labels.
    tokens = preserving_split(cat_string, "(\\/)[]{}~")  # + ComplexCategory.mode_symbols
    category = parse_compound(tokens, {})
    if tokens.peek() == '~':
        category.alias = parse_alias(tokens)
    ensure_stream_exhausted(tokens, 'cats.parse_category')
    return category
def parse_category(cat_string):
    """Parse a category string into a category object (with optional ~alias)."""
    # Mode symbols would be returned as tokens too if appended to the
    # splitter set below; keep them out of atomic category labels.
    stream = preserving_split(cat_string, "(\\/)[]{}~")  # + ComplexCategory.mode_symbols
    result = parse_compound(stream, {})
    # A trailing '~' introduces an alias for the whole category.
    if stream.peek() == '~':
        result.alias = parse_alias(stream)
    ensure_stream_exhausted(stream, 'cats.parse_category')
    return result
def testPennSplit(self):
    """Splitting on parens tokenises a Penn Treebank tree, parens preserved."""
    tree = ''' ( (S (NP-SBJ (NNP Mr.) (NNP Vinken) ) (VP (VBZ is) (NP-PRD (NP (NN chairman) ) (PP (IN of) (NP (NP (NNP Elsevier) (NNP N.V.) ) (, ,) (NP (DT the) (NNP Dutch) (VBG publishing) (NN group) ))))) (. .) ))'''
    tokens = list(preserving_split(tree, r'()'))
    expected = '''( ( S ( NP-SBJ ( NNP Mr. ) ( NNP Vinken ) ) ( VP ( VBZ is ) ( NP-PRD ( NP ( NN chairman ) ) ( PP ( IN of ) ( NP ( NP ( NNP Elsevier ) ( NNP N.V. ) ) ( , , ) ( NP ( DT the ) ( NNP Dutch ) ( VBG publishing ) ( NN group ) ) ) ) ) ) ( . . ) ) )'''
    self.assertEqual(tokens, expected.split(" "))
def testPennSplit(self):
    """Parens act as splitters and are kept as their own tokens in a PTB tree."""
    source = ''' ( (S (NP-SBJ (NNP Mr.) (NNP Vinken) ) (VP (VBZ is) (NP-PRD (NP (NN chairman) ) (PP (IN of) (NP (NP (NNP Elsevier) (NNP N.V.) ) (, ,) (NP (DT the) (NNP Dutch) (VBG publishing) (NN group) ))))) (. .) ))'''
    got = [tok for tok in preserving_split(source, r'()')]
    want = '''( ( S ( NP-SBJ ( NNP Mr. ) ( NNP Vinken ) ) ( VP ( VBZ is ) ( NP-PRD ( NP ( NN chairman ) ) ( PP ( IN of ) ( NP ( NP ( NNP Elsevier ) ( NNP N.V. ) ) ( , , ) ( NP ( DT the ) ( NNP Dutch ) ( VBG publishing ) ( NN group ) ) ) ) ) ) ( . . ) ) )'''.split(" ")
    self.assertEqual(got, want)
def testOnlySplitOnWhitespace(self):
    """With no matching split chars, tokens are just whitespace-separated words."""
    markup = r'<a href="index.html">Text</a>'
    tokens = list(preserving_split(markup, r'@#$%'))
    self.assertEqual(tokens, r'<a href="index.html">Text</a>'.split(" "))
def testPreserves(self):
    """Splitter characters are kept in the output as standalone tokens."""
    markup = r'<a href="index.html">Text</a>'
    tokens = list(preserving_split(markup, r'<>="/'))
    self.assertEqual(
        tokens, r'< a href = " index.html " > Text < / a >'.split(" "))
def testEmptyPeek(self):
    """An exhausted (empty) stream raises on next() but peeks as falsy."""
    empty_stream = preserving_split('', '@#$')
    self.assertRaises(StopIteration, empty_stream.next)
    # peek must yield None rather than raising.
    self.failIf(empty_stream.peek())
def testPeek(self):
    """peek() returns the upcoming token without consuming it."""
    toks = preserving_split('abc/def.ghi', './')
    for want in ('abc', '/', 'def', '.', 'ghi'):
        self.assertEqual(toks.peek(), want)
        toks.next()
def testEmptyPeek(self):
    """Empty input: next() raises StopIteration and peek() is falsy (None)."""
    stream = preserving_split('', '@#$')
    self.assertRaises(StopIteration, stream.next)
    self.failIf(stream.peek())  # peek must yield None, never raise
def testPeek(self):
    """Each peek() matches the token that the following next() consumes."""
    stream = preserving_split('abc/def.ghi', './')
    expected_sequence = ('abc', '/', 'def', '.', 'ghi')
    for expected_tok in expected_sequence:
        self.assertEqual(stream.peek(), expected_tok)
        stream.next()
def testEmptyInput(self):
    """An empty input string yields no tokens at all."""
    tokens = list(preserving_split('', ''))
    self.failIf(tokens)
def testSplitOnNothing(self):
    """With no split chars and no skip chars, the input comes back whole."""
    markup = r'<a href="index.html">Text</a>'
    tokens = list(preserving_split(markup, '', skip_chars=''))
    self.assertEqual(len(tokens), 1)
    self.assertEqual(tokens[0], markup)
def testOnlySplitOnWhitespace(self):
    """When no splitter appears in the input, only whitespace separates tokens."""
    text = r'<a href="index.html">Text</a>'
    got = [tok for tok in preserving_split(text, r'@#$%')]
    want = r'<a href="index.html">Text</a>'.split(" ")
    self.assertEqual(got, want)
def testSplitOnNothing(self):
    """No splitters and no skip chars: the whole input is a single token."""
    text = r'<a href="index.html">Text</a>'
    got = [tok for tok in preserving_split(text, '', skip_chars='')]
    self.assertEqual(got, [text])
def tokenise(self, tree_string, split_chars, suppressors):
    """Tokenise *tree_string* purely on whitespace (space and newline).

    NOTE(review): the *split_chars* and *suppressors* arguments are ignored
    and hard-coded to '' below — presumably deliberate for this reader's
    whitespace-only tokenisation; confirm against callers.
    """
    return preserving_split(
        tree_string, split_chars='', skip_chars=' \n', suppressors='')
def testEmptyInput(self):
    """Tokenising the empty string produces an empty token list."""
    got = [tok for tok in preserving_split('', '')]
    self.failIf(got)
def testAdjacentSplitters(self):
    """Consecutive splitter characters each become their own token."""
    tokens = list(preserving_split(r'a.b.cd.ef..g', '.'))
    self.assertEqual(tokens, r'a . b . cd . ef . . g'.split(" "))
def testAdjacentSplitters(self):
    """Adjacent splitters ('..') yield two splitter tokens with nothing between."""
    got = [tok for tok in preserving_split(r'a.b.cd.ef..g', '.')]
    want = r'a . b . cd . ef . . g'.split(" ")
    self.assertEqual(got, want)
def testPreserves(self):
    """Every splitter character appears in the output as its own token."""
    text = r'<a href="index.html">Text</a>'
    got = [tok for tok in preserving_split(text, r'<>="/')]
    want = r'< a href = " index.html " > Text < / a >'.split(" ")
    self.assertEqual(got, want)