def load_trigram_data(filename, mode):
    """Load trigram training data from a whitespace-separated text file.

    Each input line is either:
      mode='sm': 3 tokens  -> (w1, w2) indices as input, w3 index as target
      mode='lr': 4 tokens  -> (w1, w2, w3) indices as input, float label as target

    Returns a tuple (idxs, target, token_idx):
      idxs      - int32 ndarray of token-id rows (2 cols for 'sm', 3 for 'lr')
      target    - int32 ndarray of w3 ids ('sm') or float32 ndarray of labels ('lr')
      token_idx - the TokenIdx vocabulary built while reading

    Raises Exception on any line whose token count does not match the mode.
    """
    token_idx = TokenIdx()
    idxs = []
    target = []
    # 'with' ensures the file handle is closed (the original leaked it)
    with open(filename, 'r') as f:
        for n, line in enumerate(f):
            if n % 100000 == 0:
                sys.stderr.write("loading data %s\r" % n)
            record = line.strip().split()
            # validate length BEFORE unpacking so a malformed line raises the
            # intended descriptive error rather than a ValueError from unpacking
            if not ((len(record) == 3 and mode == 'sm') or
                    (len(record) == 4 and mode == 'lr')):
                raise Exception("expected 3 token for mode=sm and 4 tokens for mode=lr, not %s tokens for mode %s" % (len(record), mode))
            w1_idx, w2_idx, w3_idx = [token_idx.id_for(w) for w in record[:3]]
            if mode == 'sm':  # just trigram
                idxs.append([w1_idx, w2_idx])
                target.append(w3_idx)
            else:  # mode == 'lr'; trigram with label
                idxs.append([w1_idx, w2_idx, w3_idx])
                target.append(float(record[3]))
    # BUG FIX: 'lr' targets are floats (built via float(record[3])) but were
    # previously cast to int32, silently truncating the labels.
    target_dtype = 'float32' if mode == 'lr' else 'int32'
    return (np.asarray(idxs, dtype='int32'),
            np.asarray(target, dtype=target_dtype),
            token_idx)
# NOTE(review): this chunk begins mid-call — the opening
# optparser.add_option('--matrix_file', ...) line sits above this chunk.
             help='np matrix file to load; eg ckpt.X.E')
optparser.add_option('--tokens', None, dest='tokens', type='string',
                     help='space separated list of tokens to emit NNs for')
optparser.add_option('--k', None, dest='k', type='int', default=5,
                     help='number of near neighbours to emit')
opts, arguments = optparser.parse_args()
print >> sys.stderr, "options", opts  # Python 2 print-to-stderr syntax

# build the token<->id vocabulary; ids are assumed to index rows of the
# matrix loaded below — TODO confirm against how ckpt.X.E was written
token_idx = TokenIdx()
token_idx.read_from_file(opts.vocab)

# checking that tokens are in vocab; bail out early on the first unknown token
for token in opts.tokens.split(" "):
    if not token_idx.id_exists_for(token):
        print >> sys.stderr, "token [%s] not in vocab?" % token
        exit(1)

# E: embedding matrix loaded from disk (shape/dtype not visible here;
# presumably one row per vocab id — verify against the writer of the ckpt)
E = np.load(opts.matrix_file)

# commented-out experiment: approximate nearest neighbours via
# sklearn's LSHForest; kept for reference
#lshf = LSHForest()
#lshf.fit(E)
#distances, indices = lshf.kneighbors(E[[token_idx.id_for("monday_NNS")]], n_neighbors=10)
#for d, i in zip(distances[0], indices[0]):
#  print d, token_idx.token_for(i)