def contextual_noprob(): """Contextual, without probabilities. """ fname = argv[1 + argv.index('-c')] if not os.access(fname, os.F_OK): print 'Cannot access', fname usage() print "Contextual, no probabilities." print 'Reading relations ...', lines = ureader(gzip.open(fname)).readlines() print 'done.' print 'Extracting records ...', recs = [l[:-1].lower().split() for l in lines] print 'done.' print 'Indexing relations ...', # Record schema: sentence_id, noun_id, verb_id, noun, verb ctx = dict(((int(dep[0]), int(dep[2])), dep) for dep in recs) print 'done.' UNK = u'__unk__' print 'Gathering categories ...', categories = list(set(r[3] for r in recs)) + [UNK] print '({})'.format(u', '.join(categories)), 'done.' index = categories.index src, tgt, src_id = (2, 5, 1) if not reverse else (5, 2, 4) def fields(rec): s_id, w_id = int(rec[0]), int(rec[src_id]) try: dep = ctx[(s_id, w_id)] except KeyError: dep = [UNK] * 6 return index(dep[3]), (rec[src], rec[tgt]), dep[4] print 'Reading', in_fname, '...', in_stream = ifilter(check_record('v'), map(lambda l: l.lower().split(), ureader(gzip.open(in_fname)))) print 'done.' # Output files, one for each category with closing(*streams_for(categories, 'wb')) as out_streams: print 'Processing', in_fname, 'for', categories, '...', in_recs = imap(fields, in_stream) collected = collect(in_recs, categories) print 'done.' assert len(collected) == len(out_streams), '{} != {}'.format(collected, out_streams) print 'Outputting files', ', '.join(categories), '...' process_counts(collected, out_streams, src_lang, tgt_lang) print 'done.'
def contextual(): """Contextual, with probabilities. """ from itertools import product from util import grouped fname = argv[1 + argv.index('-c')] if not os.access(fname, os.F_OK): print 'Cannot access', fname usage() print 'Reading relations ...', lines = ureader(gzip.open(fname)).readlines() print 'done.' print 'Extracting records ...', recs = [l[:-1].lower().split() for l in lines] print 'done.' print 'Indexing relations ...', ctx = dict(((int(dep[0]), int(dep[2])), dep) for dep in recs) print 'done.' UNK = u'__unk__' print 'Gathering categories ...', categories = list(set(r[3] for r in recs)) + [UNK] print '({})'.format(u', '.join(categories)), 'done.' idx = categories.index src, tgt, src_id = (2, 5, 1) if not reverse else (5, 2, 4) def fields(rec): s_id, w_id = int(rec[0]), int(rec[src_id]) try: dep = ctx[(s_id, w_id)] except KeyError: dep = [UNK] * 6 return idx(dep[3]), (rec[src], dep[4]), rec[tgt] print 'Reading', in_fname, '...', in_stream = ifilter(check_record('v'), map(lambda l: l.lower().split(), ureader(gzip.open(in_fname)))) print 'done.' # Output files, one for each category dotjoin = '.'.join fnames = [dotjoin(p) for p in product(categories, ('px', 'pmi'))] with closing(*streams_for(fnames, 'wb')) as out_streams: print 'Processing', in_fname, 'for', categories, '...', in_recs = imap(fields, in_stream) collected = collect3(in_recs, categories, reverse=reverse) print 'done.\nOutputting files', ', '.join(fnames), '...' process_p2(collected, grouped(2, out_streams), src_lang, tgt_lang) print 'done.'
def probabilities(): if '-c' in argv or '-n' in argv: # Just to be sure: -c and -n are mutually exclusive usage() src, tgt = (2, 5) if not reverse else (5, 2) def fields(rec): return rec[3][0], rec[src], rec[tgt] # categories = ['v.px', 'v.pmi'] categories = ['n.px', 'n.pmi'] print 'Reading', in_fname, '...', in_stream = ifilter(check_record('n'), map(lambda l: l.lower().split(), ureader(gzip.open(in_fname)))) print 'done.' # Output files, one for each category with closing(*streams_for(categories, 'wb')) as out_streams: in_recs = imap(fields, in_stream) print 'Processing probabilities in', in_fname, 'for', categories, '...', collected = collect2(in_recs, reverse=reverse) print 'done.' print 'Outputting files ...', process_p(collected, out_streams, src_lang, tgt_lang) print 'done.'
def main(args): import codecs from util import ureader, uwriter, uopen def handler(x): v = x.object[x.start:x.end] print >> stderr, repr(v), v return (u'', x.end) codecs.register_error('clear', handler) if '-t' not in args: usage(args) tag = map(string.lower, args[1 + args.index('-t')].split(',')) enc = args[1 + args.index('-e')] if '-e' in args else 'utf8' stdin = ureader(sys.stdin) if '-i' not in args else uopen( args[1 + args.index('-i')]) # stdout = codecs.getwriter(enc)(sys.stdout if '-o' not in args else open(args[1 + args.index('-o')], 'wb'), errors='clear') stdout = codecs.getwriter(enc)( sys.stdout if '-o' not in args else open(args[1 + args.index('-o')], 'wb')) stderr = uwriter(sys.stderr) for l in strip(stdin.read(), keep=tag): try: print >> stdout, l except UnicodeDecodeError: print 'problem with', l
def main(args):
    """Print every parsed sentence followed by its known relations.

    The value after ``-l`` in *args* is passed to ``parse`` as the language
    (``usage()`` is called when the flag is missing).  For each sentence a
    ``[sid] sentence`` line is written, then one ``[sid] relation a b`` line
    per relation whose label is not ``**UNK**``, then an empty line.  For the
    ``Adj`` relation the two items are printed in swapped order.
    """
    from util import ureader, uwriter  #, tag, untag

    if '-l' not in args:
        usage()
    lang = args[1 + args.index('-l')]
    # Tagging support is currently disabled:
    # do_tag = '-t' in args
    # maybe_tag = tag if do_tag else lambda x: x
    # valid_rels = ('Adj', 'Dobj', 'Subj', 'Iobj', 'AdvAdj', 'AdvVerb')
    stdin = ureader(sys.stdin)
    stdout = uwriter(sys.stdout)
    uprint = partial(print, file=stdout)

    for sentence, (sent_id, relations) in parse(stdin, lang):
        uprint(u'[{}] {}'.format(sent_id, sentence))
        for rel_id, (_, _, label, d_item, h_item) in relations:
            if label == '**UNK**':
                continue
            # Older output format, kept for reference:
            # uprint(u'[{0}] {1} {2[0]}.{2[1]} {3[0]}.{3[1]}'.format(sid, r, d, h))
            if label == 'Adj':
                first, second = h_item, d_item
            else:
                first, second = d_item, h_item
            uprint(u'[{0}] {1} {2[0]} {3[0]}'.format(rel_id, label, first, second))
        uprint()
def noncontextual(): if '-c' in argv or '-p' in argv: # Just to be sure: -c and -n are mutually exclusive usage() categories = 'nv' idx = categories.index def fields(rec): return idx(rec[3][0]), rec[2], rec[5] print 'Reading', in_fname, '...', in_stream = ifilter(check_record(categories), map(lambda l: l.lower().split(), ureader(gzip.open(in_fname)))) print 'done.' # Output files, one for each category with closing(*streams_for(categories, 'wb')) as out_streams: print 'Processing noncontextually', in_stream, 'for', categories, '...', in_recs = imap(fields, in_stream) collected = collect(in_recs, categories, reverse=reverse) process(collected, out_streams, src_lang, tgt_lang, categories, reverse) print 'done.'
def openf(fn):
    """Open *fn* with ``uopen``; the name '-' selects standard input."""
    if fn == '-':
        return ureader(sys.stdin)
    return uopen(fn)
def clean(args):
    """Copy standard input to standard output, passing each line through
    ``cleaned``.

    *args* is accepted but not used.
    """
    source = ureader(sys.stdin)
    sink = uwriter(sys.stdout)
    for raw_line in source:
        # Trailing comma: print must not append a newline of its own.
        print >> sink, cleaned(raw_line),
def open_file(fn):
    """Open *fn* for reading, going through gzip when it ends in '.gz'."""
    extension = splitext(fn)[1]
    if extension != '.gz':
        return uopen(fn)
    return ureader(gzip.open(fn))