import sys
import numpy as np
from token_idx import TokenIdx

def load_trigram_data(filename, mode):
    token_idx = TokenIdx()
    idxs = [] 
    target = []
    for n, line in enumerate(open(filename, 'r')):
        if n%100000==0: sys.stderr.write("loading data %s\r" % n)
        record = line.strip().split()
        if mode == 'sm' and len(record) == 3:
            # just trigram: predict w3 given (w1, w2)
            w1_idx, w2_idx, w3_idx = [token_idx.id_for(w) for w in record]
            idxs.append([w1_idx, w2_idx])
            target.append(w3_idx)
        elif mode == 'lr' and len(record) == 4:
            # trigram with a float label
            w1_idx, w2_idx, w3_idx = [token_idx.id_for(w) for w in record[:3]]
            idxs.append([w1_idx, w2_idx, w3_idx])
            target.append(float(record[3]))
        else:
            raise Exception("expected 3 tokens for mode=sm and 4 tokens for mode=lr, not %s tokens for mode %s" % (len(record), mode))
    # targets are class ids for mode=sm but float labels for mode=lr
    target_dtype = 'int32' if mode == 'sm' else 'float32'
    return (np.asarray(idxs, dtype='int32'), np.asarray(target, dtype=target_dtype), token_idx)
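
# Minimal usage sketch (not part of the original; "trigrams.tsv" is an
# assumed filename). mode='sm' yields ([w1, w2] id pairs, w3 id targets)
# for a softmax model; mode='lr' yields trigram id triples with float labels.
if __name__ == '__main__':
    idxs, target, token_idx = load_trigram_data("trigrams.tsv", 'sm')
    print >>sys.stderr, "loaded %d trigrams; idxs %s, target %s" % (len(idxs), idxs.shape, target.shape)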
#!/usr/bin/env python
import numpy as np
from token_idx import TokenIdx
from sklearn.neighbors import LSHForest, BallTree
import optparse, sys

optparser = optparse.OptionParser(prog='embedding_near_neighbours.py', version='0.0.1', description='')
optparser.add_option('--vocab', None, dest='vocab', type='string', default='vocab.tsv', help='vocab for token idx')
optparser.add_option('--matrix-file', None, dest='matrix_file', type='string', help='np matrix file to load; eg ckpt.X.E')
optparser.add_option('--tokens', None, dest='tokens', type='string', help='space separated list of tokens to emit NNs for')
optparser.add_option('--k', None, dest='k', type='int', default=5, help='number of near neighbours to emit')
opts, arguments = optparser.parse_args()
print >>sys.stderr, "options", opts

token_idx = TokenIdx()
token_idx.read_from_file(opts.vocab)

# checking that tokens are in vocab
for token in opts.tokens.split(" "):
    if not token_idx.id_exists_for(token):
        print >>sys.stderr, "token [%s] not in vocab?" % token
        exit(1)

E = np.load(opts.matrix_file)

#lshf = LSHForest()
#lshf.fit(E)
#distances, indices = lshf.kneighbors(E[[token_idx.id_for("monday_NNS")]], n_neighbors=10)
#for d, i in zip(distances[0], indices[0]):
#    print d, token_idx.token_for(i)
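
# Hedged sketch (not in the original, which leaves its LSHForest experiment
# commented out above): emit opts.k exact near neighbours for each requested
# token via the BallTree also imported above. Euclidean distance and the
# skip-self handling are assumptions on my part.
tree = BallTree(E)
for token in opts.tokens.split(" "):
    query_id = token_idx.id_for(token)
    # ask for k+1 since the nearest neighbour of a row is the row itself
    distances, indices = tree.query(E[[query_id]], k=opts.k + 1)
    for d, i in zip(distances[0], indices[0]):
        if i != query_id:
            print token, token_idx.token_for(i), d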
Example n. 4
#!/usr/bin/env python
import glob, optparse, sys
import numpy as np
from token_idx import TokenIdx

optparser = optparse.OptionParser()
optparser.add_option('--files', None, dest='files', type='string', help='file glob')
optparser.add_option('--tokens', None, dest='tokens', type='string', default=None, help='comma separated list of tokens, if blank process all')
optparser.add_option('--vocab', None, dest='vocab', type='string', default=None, help='vocab file. token TAB id')
opts, arguments = optparser.parse_args()
print >>sys.stderr, "options", opts

if not opts.files:
    raise Exception("no --files specified?")
if "," in opts.files:
    files = opts.files.split(",")
else:
    files = sorted(glob.glob(opts.files))

token_idx = TokenIdx()
token_idx.read_from_file(opts.vocab)

# map tokens to ids
ids = None
if opts.tokens:
    ids = [token_idx.id_for(t) for t in opts.tokens.split(",")]

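# Hedged sketch (the original presumably imports or defines this elsewhere):
# assuming checkpoint filenames look like "<prefix>.<time>.<var>", eg the
# "ckpt.X.E" pattern mentioned in the earlier script's help text.
def prefix_time_var_of_ckpt(filename):
    prefix, ckpt_time, var = filename.rsplit(".", 2)
    return prefix, ckpt_time, var
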
# sanity check: all files must share the same checkpoint prefix and var name
prefix, _time, var = prefix_time_var_of_ckpt(files[0])
for f in files:
    next_prefix, _time, next_var = prefix_time_var_of_ckpt(f)
    if prefix != next_prefix or next_var != var:
        raise Exception("glob includes files that don't match in prefix or var")

print "\t".join("ckpt_time idx token x_l_dist x_f_dist".split())