def testWriteDerivation(self):
    """Check that writing a parsed Penn tree as a graph creates the .dot file.

    Parses ``self.from_penn``, expects exactly one tree, writes it with
    ``write_graph`` and asserts that the output file exists afterwards.
    """
    output_path = 'penn_deriv.dot'

    # A file left behind by an earlier run would make the existence check
    # below pass even if write_graph wrote nothing, so clear it first.
    if os.path.exists(output_path):
        os.remove(output_path)

    trees = parse_tree(self.from_penn)
    self.assertEqual(len(trees), 1)

    write_graph(trees[0], output_path)
    # assertTrue replaces the deprecated assert_ alias.
    self.assertTrue(os.path.exists(output_path))
def derivation_with_index(self, filename, i=None):
    """Parse the <S> content of ``filename`` with AugmentedPennParser.

    Args:
        filename: path of the file to read.
        i: when truthy, only the lines selected by
            ``nth_occurrence(..., N=i)`` between a line starting with
            ``<S`` and a line starting with ``</S`` are used; otherwise
            (``None`` or ``0``) the whole file is read.

    Returns:
        The result of ``parse_tree`` over the collected ``'s'`` contents,
        or over the empty string when there are none.

    Side effects:
        Replaces ``self.contents`` with a fresh ``SGMLBag`` fed with the
        selected text.
    """
    self.contents = SGMLBag()

    with open(filename, 'r') as infile:
        if i:
            # Iterate the file object directly; file.xreadlines() is a
            # deprecated Python 2-only alias for exactly this.
            selected = nth_occurrence(
                infile, N=i,
                when=lambda line: re.match(r'^<S', line),
                until=lambda line: re.match(r'^</S', line))
            text = ''.join(selected)
        else:
            text = infile.read()

    self.contents.feed(text)

    # HACK: sometimes <S>...</S> encloses more than one root (3:7 has some),
    # in which case counting <S> will undercount the number of sentences.
    sentences = self.contents['s']
    if sentences is None:
        return parse_tree('', AugmentedPennParser)
    return parse_tree('\n'.join(sentences), AugmentedPennParser)
def from_header_and_derivation(header, deriv_string):
    """Build a Derivation from a CCGbank header line and its derivation text.

    Args:
        header: header string expected to begin with
            ``ID=wsj_SSDD.N`` (two-digit section, two-digit document,
            derivation number).
        deriv_string: derivation text handed to ``parse_tree`` with
            ``AugmentedPennParser``; the first parsed tree is used.

    Returns:
        ``Derivation(sec_no, doc_no, der_no, derivation)`` with the three
        numbers converted to ``int``.

    Raises:
        CCGbankParseException: if ``header`` does not match the expected
            ID pattern.
    """
    # The separator between document and derivation number is a literal dot;
    # it is escaped so that arbitrary characters are no longer accepted.
    matches = re.match(r"ID=wsj_(\d\d)(\d\d)\.(\d+)", header)
    if matches is None:
        # Call form of raise works on both Python 2 and Python 3.
        raise CCGbankParseException("Malformed CCGbank header: %s" % header)

    # A successful match always yields exactly three groups.
    sec_no, doc_no, der_no = [int(group) for group in matches.groups()]
    # NOTE(review): indexing [0] raises IndexError if parse_tree returns an
    # empty result; left unchanged.
    derivation = parse_tree(deriv_string, AugmentedPennParser)[0]
    return Derivation(sec_no, doc_no, der_no, derivation)
def derivation_with_index(self, filename, i=None):
    """Parse the <HEADLINE> section of ``filename`` with AugmentedPennParser.

    Args:
        filename: path of the file to read.
        i: when truthy, only ``headline_lines[i]`` (indexed after the
            opening ``<HEADLINE`` line has been dropped) is used;
            otherwise (``None`` or ``0``) all headline lines are used.

    Returns:
        ``None`` when ``nth_occurrence`` finds no headline lines; otherwise
        the result of ``parse_tree`` over the collected ``'s'`` contents,
        or over the empty string when there are none.

    Raises:
        CPTBParseException: if the first selected line does not start with
            ``<HEADLINE``.

    Side effects:
        Replaces ``self.contents`` with a fresh ``SGMLBag`` fed with the
        selected text.
    """
    self.contents = SGMLBag()

    with open(filename, 'r') as infile:
        headline_lines = nth_occurrence(
            infile, N=1,
            when=lambda line: re.match(r'^<HEADLINE', line),
            until=lambda line: re.match(r'^</HEADLINE', line))

        if not headline_lines:
            return None
        if not headline_lines[0].startswith('<HEADLINE'):
            raise CPTBParseException('Expected to find a <HEADLINE> line.')

        headline_lines = headline_lines[1:]  # strip off <HEADLINE>

        if i:
            # A single line is already a string; ''.join() over it was a no-op.
            text = headline_lines[i]
        else:
            text = '\n'.join(headline_lines)

    self.contents.feed(text)

    # The bag yields None when no 's' content was collected (the sibling
    # <S>-based reader guards against this too); joining None would raise
    # TypeError, so parse the empty string instead.
    sentences = self.contents['s']
    if sentences is None:
        return parse_tree('', AugmentedPennParser)
    return parse_tree('\n'.join(sentences), AugmentedPennParser)
def parse_file(text):
    """Parse ``text`` with ``AugmentedPennParser`` and return the result.

    Thin wrapper that forwards ``text`` to ``parse_tree``.
    """
    parsed = parse_tree(text, AugmentedPennParser)
    return parsed
def parse_file(text):
    """Parse ``text`` with ``PennParser`` and return the result.

    Thin wrapper that forwards ``text`` to ``parse_tree``.
    """
    parsed = parse_tree(text, PennParser)
    return parsed
def parse_file(text):
    """Parse ``text`` with ``CategoryPennParser`` and return the result.

    Thin wrapper around ``parse_tree``. The two trailing positional
    arguments (``"()"`` and ``""``) are forwarded unchanged; their meaning
    is defined by ``parse_tree``.
    """
    parsed = parse_tree(text, CategoryPennParser, "()", "")
    return parsed
# Cached location of the `dot` executable; filled in by the first call to
# write_dot_format() that manages to find it.
dot_path = None


def write_dot_format(deriv, fn, format, label=""):
    """Render ``deriv`` to the file ``fn`` by piping its graph through ``dot``.

    The text returned by ``make_graph(deriv, label=label)`` is written to
    dot's standard input; dot is invoked with ``-T<format>`` and ``-o <fn>``.
    dot's standard error is discarded.

    Args:
        deriv: derivation passed to ``make_graph``.
        fn: output file name handed to dot's ``-o`` option.
        format: output format handed to dot's ``-T`` option.
        label: label forwarded to ``make_graph``.

    Returns:
        None. If ``dot`` cannot be located via ``which``, an error is
        reported through ``err`` and nothing is written.

    Raises:
        RuntimeError: if dot exits with a non-zero return code.
    """
    global dot_path
    if not dot_path:
        which = os.popen('which dot')
        try:
            dot_path = which.read().strip()
        finally:
            which.close()
    if not dot_path:
        err('dot not found on this system. Ensure that dot is in the PATH.')
        return

    # Build the graph before spawning dot so that a failure here does not
    # leave a child process behind.
    graph = make_graph(deriv, label=label)

    # Pass an argument list instead of a shell command string, so file names
    # containing spaces or shell metacharacters reach dot verbatim.
    argv = [dot_path, '-T%s' % (format,), '-o', '%s' % (fn,)]

    devnull = open(os.devnull, 'w')  # stands in for the former `2>/dev/null`
    try:
        proc = Popen(argv, stdin=PIPE, stdout=PIPE, stderr=devnull,
                     close_fds=True)
        # communicate() writes the input, closes stdin, drains stdout and
        # waits for the process to exit.
        proc.communicate(graph)
    finally:
        devnull.close()

    # Truthy for any non-zero code, including negative (signal) values.
    if proc.returncode:
        raise RuntimeError('dot terminated with non-zero return code: %d' % proc.returncode)


if __name__ == '__main__':
    from munge.penn.parse import parse_tree
    import sys

    # Read trees from stdin and print the graph of the first one.
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(make_graph(parse_tree(sys.stdin.read())[0]))
def parse_file(text):
    """Parse ``text`` with ``YZPTBParser`` and return the result.

    Thin wrapper that forwards ``text`` to ``parse_tree``.
    """
    parsed = parse_tree(text, YZPTBParser)
    return parsed