Ejemplo n.º 1
0
    def contextual_noprob():
        """Contextual, without probabilities.
        """
        fname = argv[1 + argv.index('-c')]
        if not os.access(fname, os.F_OK):
            print 'Cannot access', fname
            usage()
            
        print "Contextual, no probabilities."
        
        print 'Reading relations ...',
        lines = ureader(gzip.open(fname)).readlines()
        print 'done.'

        print 'Extracting records ...',
        recs = [l[:-1].lower().split() for l in lines]
        print 'done.'

        print 'Indexing relations ...',
        # Record schema: sentence_id, noun_id, verb_id, noun, verb
        ctx = dict(((int(dep[0]), int(dep[2])), dep) for dep in recs)
        print 'done.'
        
        UNK = u'__unk__'
        
        print 'Gathering categories ...',
        categories = list(set(r[3] for r in recs)) + [UNK]
        print '({})'.format(u', '.join(categories)), 'done.'
        
        index = categories.index 
        src, tgt, src_id = (2, 5, 1) if not reverse else (5, 2, 4)
        def fields(rec):
            s_id, w_id = int(rec[0]), int(rec[src_id])
            try:
                dep = ctx[(s_id, w_id)]
            except KeyError:
                dep = [UNK] * 6
            return index(dep[3]), (rec[src], rec[tgt]), dep[4] 
        
        print 'Reading', in_fname, '...',
        in_stream = ifilter(check_record('v'), map(lambda l: l.lower().split(), ureader(gzip.open(in_fname))))
        print 'done.'

        # Output files, one for each category
        with closing(*streams_for(categories, 'wb')) as out_streams:
            print 'Processing', in_fname, 'for', categories, '...',
            in_recs = imap(fields, in_stream)
            collected = collect(in_recs, categories)
            print 'done.'
            
            assert len(collected) == len(out_streams), '{} != {}'.format(collected, out_streams)
            
            print 'Outputting files', ', '.join(categories), '...' 
            process_counts(collected, out_streams, src_lang, tgt_lang)
            print 'done.'
Ejemplo n.º 2
0
    def contextual():
        """Contextual, with probabilities.
        """
        from itertools import product
        from util import grouped
        fname = argv[1 + argv.index('-c')]
        if not os.access(fname, os.F_OK):
            print 'Cannot access', fname
            usage()
        
        print 'Reading relations ...',
        lines = ureader(gzip.open(fname)).readlines()
        print 'done.'

        print 'Extracting records ...',
        recs = [l[:-1].lower().split() for l in lines]
        print 'done.'

        print 'Indexing relations ...',
        ctx = dict(((int(dep[0]), int(dep[2])), dep) for dep in recs)
        print 'done.'
        
        UNK = u'__unk__'
        
        print 'Gathering categories ...',
        categories = list(set(r[3] for r in recs)) + [UNK]
        print '({})'.format(u', '.join(categories)), 'done.'
        
        idx = categories.index 
        src, tgt, src_id = (2, 5, 1) if not reverse else (5, 2, 4)
        def fields(rec):
            s_id, w_id = int(rec[0]), int(rec[src_id])
            try:
                dep = ctx[(s_id, w_id)]
            except KeyError:
                dep = [UNK] * 6
            return idx(dep[3]), (rec[src], dep[4]), rec[tgt] 
        
        print 'Reading', in_fname, '...',
        in_stream = ifilter(check_record('v'), map(lambda l: l.lower().split(), ureader(gzip.open(in_fname))))
        print 'done.'

        # Output files, one for each category
        dotjoin = '.'.join
        fnames = [dotjoin(p) for p in product(categories, ('px', 'pmi'))] 
        with closing(*streams_for(fnames, 'wb')) as out_streams:
            print 'Processing', in_fname, 'for', categories, '...',
            in_recs = imap(fields, in_stream)
            collected = collect3(in_recs, categories, reverse=reverse)
            print 'done.\nOutputting files', ', '.join(fnames), '...' 
            process_p2(collected, grouped(2, out_streams), src_lang, tgt_lang)
            print 'done.'
Ejemplo n.º 3
0
    def probabilities():
        if '-c' in argv or '-n' in argv:
            # Just to be sure: -c and -n are mutually exclusive
            usage()
            
        src, tgt = (2, 5) if not reverse else (5, 2)    
        
        def fields(rec): 
            return rec[3][0], rec[src], rec[tgt]
        
#        categories = ['v.px', 'v.pmi'] 
        categories = ['n.px', 'n.pmi'] 

        print 'Reading', in_fname, '...',
        in_stream = ifilter(check_record('n'), map(lambda l: l.lower().split(), ureader(gzip.open(in_fname))))
        print 'done.'

        # Output files, one for each category
        with closing(*streams_for(categories, 'wb')) as out_streams:
            in_recs = imap(fields, in_stream)
            print 'Processing probabilities in', in_fname, 'for', categories, '...',
            collected = collect2(in_recs, reverse=reverse)
            print 'done.'
            
            print 'Outputting files ...',
            process_p(collected, out_streams, src_lang, tgt_lang)
            
            print 'done.'
Ejemplo n.º 4
0
def main(args):
    """Run ``strip`` over the whole input and print every item it yields.

    Options read from ``args``:
      -t TAGS  comma-separated tags, lower-cased and passed to ``strip`` as
               ``keep``; ``usage(args)`` is called when the option is missing.
      -e ENC   output encoding ('utf8' when omitted).
      -i FILE  input file opened with ``uopen``; standard input otherwise.
      -o FILE  output file opened in binary mode; standard output otherwise.

    Items that cannot be written because of a Unicode error are reported on
    standard error and skipped.
    """
    import codecs
    from util import ureader, uwriter, uopen

    def handler(x):
        # Codec error handler: report the offending slice and drop it.
        v = x.object[x.start:x.end]
        print >> stderr, repr(v), v
        return (u'', x.end)

    # Registered under the name 'clear'; it only takes effect for codecs
    # created with errors='clear'.
    codecs.register_error('clear', handler)

    if '-t' not in args:
        usage(args)

    tag = map(string.lower, args[1 + args.index('-t')].split(','))
    enc = args[1 + args.index('-e')] if '-e' in args else 'utf8'
    stderr = uwriter(sys.stderr)

    in_file = out_file = None
    try:
        # Open the input first so a bad -i does not truncate the -o file.
        if '-i' in args:
            in_file = uopen(args[1 + args.index('-i')])
        if '-o' in args:
            out_file = open(args[1 + args.index('-o')], 'wb')

        stdin = in_file if in_file is not None else ureader(sys.stdin)
        stdout = codecs.getwriter(enc)(
            out_file if out_file is not None else sys.stdout)

        for l in strip(stdin.read(), keep=tag):
            try:
                print >> stdout, l
            except UnicodeError:
                # Covers encode and decode failures.  The report goes to
                # stderr so it never mixes with the data, and repr() keeps
                # the report itself from failing on the same characters.
                print >> stderr, 'problem with', repr(l)
    finally:
        # Only close what this function opened; the standard streams stay open.
        for opened in (in_file, out_file):
            if opened is not None:
                opened.close()
Ejemplo n.º 5
0
def main(args):
    """Print every parsed sentence followed by its known relations.

    The language is taken from the value after ``-l`` in ``args``;
    ``usage()`` is called when the option is missing.  For each sentence
    yielded by ``parse`` a ``[sid] sentence`` line is written, then one line
    per relation whose label is not ``**UNK**``, and finally an empty line.
    """
    from util import ureader, uwriter  #, tag, untag

    if '-l' not in args:
        usage()

    lang = args[1 + args.index('-l')]

    stdin, stdout = ureader(sys.stdin), uwriter(sys.stdout)

    def emit(*parts):
        # All output goes through the unicode-aware stdout wrapper.
        print(*parts, file=stdout)

    relation_line = u'[{0}] {1} {2[0]} {3[0]}'
    for sentence, (sentence_id, relations) in parse(stdin, lang):
        emit(u'[{}] {}'.format(sentence_id, sentence))
        for rel_sid, (_, _, label, dep, head) in relations:
            if label == '**UNK**':
                continue
            # 'Adj' relations list the head first; all others the dependent.
            first, second = (head, dep) if label == 'Adj' else (dep, head)
            emit(relation_line.format(rel_sid, label, first, second))
        emit()
Ejemplo n.º 6
0
    def noncontextual():
        if '-c' in argv or '-p' in argv:
            # Just to be sure: -c and -n are mutually exclusive
            usage()
            
        categories = 'nv'
        idx = categories.index
        def fields(rec): 
            return idx(rec[3][0]), rec[2], rec[5]
        
        print 'Reading', in_fname, '...',
        in_stream = ifilter(check_record(categories), map(lambda l: l.lower().split(), ureader(gzip.open(in_fname))))
        print 'done.'

        # Output files, one for each category
        with closing(*streams_for(categories, 'wb')) as out_streams:
            print 'Processing noncontextually', in_stream, 'for', categories, '...',
            in_recs = imap(fields, in_stream)
            collected = collect(in_recs, categories, reverse=reverse)
            process(collected, out_streams, src_lang, tgt_lang, categories, reverse)
            print 'done.'
Ejemplo n.º 7
0
 def openf(fn):
     """Return a unicode reader for ``fn``; '-' selects standard input."""
     if fn == '-':
         return ureader(sys.stdin)
     return uopen(fn)
Ejemplo n.º 8
0
def clean(args):
    """Copy standard input to standard output, passing each line through ``cleaned``.

    ``args`` is accepted but not used.
    """
    source = ureader(sys.stdin)
    sink = uwriter(sys.stdout)
    for raw_line in source:
        tidy = cleaned(raw_line)
        # The trailing comma stops print from appending its own newline.
        print >> sink, tidy,
Ejemplo n.º 9
0
def open_file(fn):
    """Open ``fn`` for unicode reading, decompressing it when it ends in '.gz'."""
    extension = splitext(fn)[1]
    if extension != '.gz':
        return uopen(fn)
    return ureader(gzip.open(fn))