def load_trigram_data(filename, mode):
    """Load whitespace-separated trigram records from a text file.

    Each line of the file is split on whitespace. The first three fields are
    converted to ids through a fresh TokenIdx (via ``id_for``).

    Args:
        filename: path of the text file to read.
        mode: 'sm' expects exactly 3 tokens per line; the first two ids are
            the input row and the third id is the target.
            'lr' expects exactly 4 tokens per line; the three ids are the
            input row and the fourth field, parsed as a float, is the target.

    Returns:
        A tuple ``(idxs, target, token_idx)`` where ``idxs`` and ``target``
        are int32 numpy arrays and ``token_idx`` is the TokenIdx that was
        used to assign ids.

    Raises:
        Exception: if a line does not have the number of tokens required by
            ``mode`` (this includes any line when ``mode`` is neither 'sm'
            nor 'lr').
    """
    token_idx = TokenIdx()
    idxs = []
    target = []
    # Number of fields a record must have for each supported mode; an
    # unknown mode maps to None, which never matches a record length.
    expected_len = {'sm': 3, 'lr': 4}.get(mode)
    # Context manager so the file handle is closed even if a record is bad.
    with open(filename, 'r') as f:
        for n, line in enumerate(f):
            if n % 100000 == 0:
                sys.stderr.write("loading data %s\r" % n)
            record = line.strip().split()
            # Validate before unpacking so short / blank lines produce the
            # descriptive error rather than an unpacking ValueError.
            if len(record) != expected_len:
                raise Exception("expected 3 token for mode=sm and 4 tokens for mode=lr, not %s tokens for mode %s" % (len(record), mode))
            w1_idx, w2_idx, w3_idx = [token_idx.id_for(w) for w in record[:3]]
            if mode == 'sm':
                # just trigram
                idxs.append([w1_idx, w2_idx])
                target.append(w3_idx)
            else:
                # trigram with label
                idxs.append([w1_idx, w2_idx, w3_idx])
                target.append(float(record[3]))
    # NOTE(review): in 'lr' mode the float labels are cast to int32 here,
    # which truncates any fractional part -- confirm labels are always
    # integral (e.g. 0/1) before relying on this.
    return (np.asarray(idxs, dtype='int32'),
            np.asarray(target, dtype='int32'),
            token_idx)
#!/usr/bin/env python
# Script head: parse command line options, load the vocab into a TokenIdx,
# check that every requested token is in the vocab and load the matrix file.
import numpy as np
from token_idx import TokenIdx
from sklearn.neighbors import LSHForest, BallTree
import optparse
import sys

optparser = optparse.OptionParser(prog='embedding_near_neighbours.py', version='0.0.1', description='')
optparser.add_option('--vocab', None, dest='vocab', type='string', default='vocab.tsv',
                     help='vocab for token idx')
optparser.add_option('--matrix-file', None, dest='matrix_file', type='string',
                     help='np matrix file to load; eg ckpt.X.E')
optparser.add_option('--tokens', None, dest='tokens', type='string',
                     help='space separated list of tokens to emit NNs for')
optparser.add_option('--k', None, dest='k', type='int', default=5,
                     help='number of near neighbours to emit')
opts, arguments = optparser.parse_args()
print >>sys.stderr, "options", opts

# --tokens and --matrix-file have no defaults; fail with a usage message
# instead of crashing later on None.split(...) / np.load(None).
if not opts.tokens:
    optparser.error("no --tokens specified?")
if not opts.matrix_file:
    optparser.error("no --matrix-file specified?")

token_idx = TokenIdx()
token_idx.read_from_file(opts.vocab)

# checking that tokens are in vocab
for token in opts.tokens.split(" "):
    if not token_idx.id_exists_for(token):
        print >>sys.stderr, "token [%s] not in vocab?" % token
        # sys.exit rather than the site-provided exit() builtin, which is
        # not guaranteed to exist (e.g. when run with python -S).
        sys.exit(1)

E = np.load(opts.matrix_file)

#lshf = LSHForest()
#lshf.fit(E)
#distances, indices = lshf.kneighbors(E[[token_idx.id_for("monday_NNS")]], n_neighbors=10)
#for d, i in zip(distances[0], indices[0]):
#  print d, token_idx.token_for(i)
help='np matrix file to load; eg ckpt.X.E') optparser.add_option('--tokens', None, dest='tokens', type='string', help='space separated list of tokens to emit NNs for') optparser.add_option('--k', None, dest='k', type='int', default=5, help='number of near neighbours to emit') opts, arguments = optparser.parse_args() print >> sys.stderr, "options", opts token_idx = TokenIdx() token_idx.read_from_file(opts.vocab) # checking that tokens are in vocab for token in opts.tokens.split(" "): if not token_idx.id_exists_for(token): print >> sys.stderr, "token [%s] not in vocab?" % token exit(1) E = np.load(opts.matrix_file) #lshf = LSHForest() #lshf.fit(E) #distances, indices = lshf.kneighbors(E[[token_idx.id_for("monday_NNS")]], n_neighbors=10) #for d, i in zip(distances[0], indices[0]): # print d, token_idx.token_for(i)
optparser = optparse.OptionParser() optparser.add_option('--files', None, dest='files', type='string', help='file glob') optparser.add_option('--tokens', None, dest='tokens', type='string', default=None, help='comma seperated list of tokens, if blank process all') optparser.add_option('--vocab', None, dest='vocab', type='string', default=None, help='vocab file. token TAB id') opts, arguments = optparser.parse_args() print >>sys.stderr, "options", opts if not opts.files: raise Exception("no --files specified?") if "," in opts.files: files = opts.files.split(",") else: files = sorted(glob.glob(opts.files)) token_idx = TokenIdx() token_idx.read_from_file(opts.vocab) # map tokens to ids ids = None if opts.tokens: ids = [token_idx.id_for(t) for t in opts.tokens.split(",")] # sanity check files are consistent with prefix and vars prefix, _time, var = prefix_time_var_of_ckpt(files[0]) for f in files: next_prefix, _time, next_var = prefix_time_var_of_ckpt(f) if prefix != next_prefix or next_var != var: raise Exception("glob includes files that dont match in prefix or var") print "\t".join("ckpt_time idx token x_l_dist x_f_dist".split())