def parse(self, sentences, translate):
    """Parse a list of sentences.

    :param sentences: a sequence of strings.
    :param translate: optional callable applied to the raw dependency
        lines; if false, the raw lines are returned unchanged.
    :returns: a list of dependency trees (a forest).
    """
    def tmpfile(prefix):
        # In debug mode, keep the temp file in the working directory;
        # otherwise let it vanish on close.
        if self.debug:
            return NamedTemporaryFile(dir='.', delete=False, prefix=prefix)
        return NamedTemporaryFile(prefix=prefix)

    with uwriter(tmpfile('in-')) as infile, uwriter(tmpfile('err-')) as errfile:
        # write(), not writelines(): we pass a single joined string.
        infile.write(u'\n'.join(sanitized(s) for s in sentences))
        infile.seek(0)
        dependencies = check_output(self.command, stdin=infile,
                                    stderr=errfile).split('\n')
        process, keep = self.config
        if translate:
            return translate(dependencies, sentences, base=0, parser=self.name,
                             process=process, keep=keep)
        return dependencies
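# Self-contained sketch (not from the source) of the pipe pattern parse()
# relies on: write input to a temporary file, rewind it, and hand it to an
# external command as stdin. POSIX `sort` stands in for the real parser
# command here; the function name is hypothetical.
def _pipe_example():
    from subprocess import check_output
    from tempfile import NamedTemporaryFile
    with NamedTemporaryFile(prefix='in-') as f:
        f.write('b\na\n')
        f.seek(0)  # rewind so the child process reads from the start
        return check_output(['sort'], stdin=f).split('\n')  # ['a', 'b', '']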
def main(args):
    import codecs
    from util import ureader, uwriter, uopen

    def handler(x):
        # Log the offending span to stderr, then drop it and resume.
        v = x.object[x.start:x.end]
        print >> stderr, repr(v), v
        return (u'', x.end)

    codecs.register_error('clear', handler)
    if '-t' not in args:
        usage(args)
    tag = map(string.lower, args[1 + args.index('-t')].split(','))
    enc = args[1 + args.index('-e')] if '-e' in args else 'utf8'
    stdin = (ureader(sys.stdin) if '-i' not in args
             else uopen(args[1 + args.index('-i')]))
    # stdout = codecs.getwriter(enc)(
    #     sys.stdout if '-o' not in args
    #     else open(args[1 + args.index('-o')], 'wb'), errors='clear')
    stdout = codecs.getwriter(enc)(
        sys.stdout if '-o' not in args
        else open(args[1 + args.index('-o')], 'wb'))
    stderr = uwriter(sys.stderr)
    for l in strip(stdin.read(), keep=tag):
        try:
            print >> stdout, l
        except UnicodeDecodeError:
            print 'problem with', l
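# Self-contained sketch of the codecs error-handler hook used above:
# register a handler by name, then request it when encoding. The handler
# receives the exception and returns a replacement string plus the position
# to resume at; returning (u'', exc.end) silently drops the offending span.
# The 'drop' name and example string are illustrative, not from the source.
def _clear_handler_example():
    import codecs
    codecs.register_error('drop', lambda exc: (u'', exc.end))
    return u'caf\xe9'.encode('ascii', errors='drop')  # -> 'caf'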
def main(args):
    from util import ureader, uwriter

    if '-l' not in args:
        usage()
    lang = args[1 + args.index('-l')]
    # valid_rels = ('Adj', 'Dobj', 'Subj', 'Iobj', 'AdvAdj', 'AdvVerb')
    stdin, stdout = ureader(sys.stdin), uwriter(sys.stdout)
    uprint = partial(print, file=stdout)
    for s, (sid, relations) in parse(stdin, lang):
        uprint(u'[{}] {}'.format(sid, s))
        for sid, (_, _, r, d, h) in relations:
            if r != '**UNK**':
                # Adj relations are printed head-first; all others
                # dependent-first.
                if r == 'Adj':
                    uprint(u'[{0}] {1} {2[0]} {3[0]}'.format(sid, r, h, d))
                else:
                    uprint(u'[{0}] {1} {2[0]} {3[0]}'.format(sid, r, d, h))
        uprint()
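# Note on the uprint idiom above: functools.partial freezes print's `file`
# keyword so every call writes to the unicode-wrapped stream. A minimal
# standalone version (assumes `from __future__ import print_function` at
# the top of the module, as elsewhere in this code base):
def _uprint_example():
    from functools import partial
    uprint = partial(print, file=uwriter(sys.stdout))
    uprint(u'привет')  # encoded by the uwriter wrapper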
def main(args):
    if args.debug_meta:
        entries = args.__dict__
        print('Metaphors for language {lang}, seed file {seed_fn}:'.format(
            **entries))
        for n, v in all_metaphors(**entries):
            print(u'{0[0]}.{0[1]} {1[0]}.{1[1]}'.format(n, v),
                  file=uwriter(sys.stdout))
    else:
        with open_file(args.json_fn) as jsonf:
            json_out = m4detect(json_in=json.load(fp=jsonf, encoding='utf-8'),
                                **args.__dict__)
        if args.out_fn == '-':
            json_dump(obj=json_out, fp=uwriter(sys.stdout))
        else:
            with uopen(args.out_fn, mode='w+b') as out_f:
                json_dump(obj=json_out, fp=out_f)
def findm(index, relations, outs):
    for record in index:
        try:
            sent_id, _, _, k, n, v = record
        except ValueError:
            print(u'problem with', tabjoin(record), file=uwriter(sys.stderr))
            continue  # skip malformed records instead of using unbound names
        if (k, n, v) in relations:
            print(sent_id, k, n, v, sep=u'\t', file=outs)
            print(sentence(int(sent_id)), file=outs)
def split(args):
    try:
        assert '-p' in args
        parser_kind = args[args.index('-p') + 1]
        split_re = splitter(parser_kind)
        sin = sys.stdin
        sout = uwriter(sys.stdout)
        tokenjoin = spacejoin if '-l' not in args else nljoin
        eos = [args[args.index('-eos') + 1]] if '-eos' in args else []
        lines = (l.rstrip().decode('utf8') for l in sin)
        for line in lines:
            print(tokenjoin(split_re.findall(line) + eos), file=sout)
            # Blank line between sentences.
            print(file=sout)
    except AssertionError:
        usage(args)
def streams_for(categories, mode):
    # One gzipped unicode stream per category: '<out_fname>.<category>.gz'.
    # out_fname must already be bound at module level.
    return map(lambda c: uwriter(gzip.open('{}.{}.gz'.format(out_fname,
                                                             c.lower()),
                                           mode)),
               categories)
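# Hedged usage sketch for streams_for() (not from the source): the category
# names and the written line are hypothetical, and out_fname is assumed to
# be set elsewhere in the module. Wrapped in a function so it does not run
# on import.
def _streams_for_example():
    noun_s, verb_s = streams_for(['Noun', 'Verb'], 'wb')
    noun_s.write(u'example noun line\n')
    for s in (noun_s, verb_s):
        s.close()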
def dump(self, stream=uwriter(sys.stderr)):
    # Emit each metaphor pair as 'word.pos word.pos'.
    for n, v in self.metaphors:
        print(u'{0[0]}.{0[1]} {1[0]}.{1[1]}'.format(n, v), file=stream)
from a parsed Russian web blog corpus.

Created on October 24, 2012

.. moduleauthor:: ChrisXie, Luca Gilardi <*****@*****.**>
"""
from __future__ import print_function

import sys
import argparse
from codecs import open
from pprint import pprint, pformat

from util import uwriter, Environment, derivations, update, uopen, dprint

ustderr = uwriter(sys.stderr)

env = Environment({
    'BASE': '/u/metanet/corpolexica/RU/RU-WAC/Russian Metaphor Extraction',
    'GW': '/u/metanet/Parsing/Results/GW',
    'BNC': '/u/metanet/corpolexica/EN',
    'DEMO': '/u/metanet/demo',
    'SEED_DIR': '/u/metanet/extraction/seeds',
})


def main():
    parser = create_parser()
    name_space = parser.parse_args()
    args = vars(name_space)
def main(args):
    """Main program."""
    lang = args.lang
    uout = uwriter(sys.stdout)

    def openf(fn):
        return uopen(fn) if fn != '-' else ureader(sys.stdin)

    def output(json_doc):
        """Write our JSON file out in a 'standard' way."""
        dump(json_doc, uout, encoding='utf-8', sort_keys=True, indent=2,
             ensure_ascii=False, separators=(',', ': '))

    def process_conllx(args):
        sin = openf(args.conllx_fn)
        json_out = dict(JDATA)
        json_out.update(sentences=to_json(
            combine_conllx, blocks(lambda r: len(r) > 1, lines(sin, u'\t'))))
        output(json_out)

    def process_json(args):
        sin = openf(args.json_fn)
        json_in = load(fp=sin, encoding='utf-8')
        sentences_in = json_in['sentences']
        try:
            json_out = parse(lang, [sanitized(s['ctext'])
                                    for s in sentences_in])
        except KeyError:
            # No 'ctext' field yet: build it from 'text'. Note the side
            # effect on each sentence record.
            def make_ctext(s):
                ctext = sanitized(s['text'])
                s['ctext'] = ctext
                return ctext

            json_out = parse(lang, [make_ctext(s) for s in sentences_in])
        json_out.update((k, v) for k, v in json_in.items() if k != 'sentences')
        # Disabled sanity check: verify we haven't modified ctext.
        if False:
            for idx, (sent_in, sent_out) in enumerate(
                    zip(json_in['sentences'], json_out['sentences']),
                    start=1):
                ctext_in, ctext_out = sent_in['ctext'], sent_out['ctext']
                try:
                    assert ctext_in == ctext_out
                except AssertionError:
                    dprint(u'error at line {}:\n {} \n!=\n {}'.format(
                        idx, ctext_in, ctext_out))
        output(json_out)

    if args.test:
        import doctest
        doctest.testmod()
    elif args.conllx_fn:
        process_conllx(args)
    else:
        process_json(args)
def clean(args):
    sin = ureader(sys.stdin)
    sout = uwriter(sys.stdout)
    for line in sin:
        # Trailing comma suppresses print's own newline; the line keeps its
        # original ending.
        print >> sout, cleaned(line),
def split(f, decode=False, sep=None):
    # Signature reconstructed from the call sites below.
    return ((l.decode('utf8').rstrip().split(sep) for l in f) if decode
            else (l.rstrip().split(sep) for l in f))


from os import path
from itertools import islice


def sentence(sent_id, files=512, chunk=4096):
    # Sentences are sharded 'chunk' per file and 'files' files per
    # directory; both path components are hex-numbered.
    fn, pos = divmod(sent_id - 1, chunk)
    dn = fn // files
    with uopen(path.join('%.2x' % dn, '%.4x.ss' % fn)) as f:
        return islice(f, pos, pos + 1).next()


if __name__ == '__main__':
    args = sys.argv[1:]
    multiple = '-m' in args
    if not multiple:
        main(*args)
    else:
        index, vs, vo = args[1:4]  # was args[1:5]: only three names are bound
        r = dict(vo=u'1-компл', vs=u'предик')
        outs = uwriter(sys.stdout)
        with uopen(vs) as vss, uopen(vo) as vos, open(index) as indexs:
            relations = read(split(vss), r['vs']) | read(split(vos), r['vo'])
            findm(split(indexs, decode=True, sep=u'\t'), relations, outs)
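# Worked example of the sharding arithmetic in sentence() above: with the
# defaults (4096 sentences per file, 512 files per directory), sent_id
# 1000000 gives
#   fn, pos = divmod(999999, 4096)  ->  fn = 244, pos = 575
#   dn = 244 // 512                 ->  dn = 0
# i.e. line 575 (0-based) of file '00/00f4.ss'. The helper below just
# exposes that computation; its name and defaults are illustrative.
def _sentence_path_example(sent_id=1000000, files=512, chunk=4096):
    fn, pos = divmod(sent_id - 1, chunk)
    dn = fn // files
    return path.join('%.2x' % dn, '%.4x.ss' % fn), pos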