Beispiel #1
0
                     type='int',
                     default=5,
                     help='number of near neighbours to emit')
opts, arguments = optparser.parse_args()
# Diagnostics go to stderr so stdout carries only the neighbour listing.
sys.stderr.write("options %s\n" % (opts,))

# An option that was not supplied is None; report that as a usage error rather
# than letting it surface later as an AttributeError or an np.load failure.
if not opts.tokens:
    optparser.error("no tokens given")
if not opts.matrix_file:
    optparser.error("no matrix file given")

# Split once and ignore the empty pieces produced by repeated / leading /
# trailing spaces, which could never be valid vocab entries.
tokens = [t for t in opts.tokens.split(" ") if t]
if not tokens:
    optparser.error("no tokens given")

token_idx = TokenIdx()
token_idx.read_from_file(opts.vocab)

# Check that every requested token is in the vocab before doing the expensive
# matrix load and tree construction; stop at the first unknown token.
for token in tokens:
    if not token_idx.id_exists_for(token):
        sys.stderr.write("token [%s] not in vocab?\n" % token)
        # sys.exit rather than the builtin exit(), which is only injected by
        # the site module and is not guaranteed to exist.
        sys.exit(1)

# Rows of E are indexed by token id (see the E[[id]] lookup below).
E = np.load(opts.matrix_file)

ball_tree = BallTree(E, leaf_size=30)
# Never ask for more neighbours than there are rows in E.
k = min(opts.k, E.shape[0])
for token in tokens:
    # Blank line separating the listing of each query token.
    print("")
    # E[[id]] keeps the query two dimensional (a single row).
    distances, indices = ball_tree.query(E[[token_idx.id_for(token)]], k=k)
    # One "distance token" line per neighbour. A single-argument print(...)
    # emits the same text on Python 2 and Python 3.
    for d, nn in zip(distances[0], indices[0]):
        print("%s %s" % (d, token_idx.token_for(nn)))
# Command line options. (The stray positional None that used to follow each
# flag was silently discarded by optparse and has been dropped.)
optparser.add_option('--vocab', dest='vocab', type='string',
                     default='vocab.tsv',
                     help='vocab for token idx')
optparser.add_option('--matrix-file', dest='matrix_file', type='string',
                     help='np matrix file to load; eg ckpt.X.E')
optparser.add_option('--tokens', dest='tokens', type='string',
                     help='space separated list of tokens to emit NNs for')
optparser.add_option('--k', dest='k', type='int', default=5,
                     help='number of near neighbours to emit')
opts, arguments = optparser.parse_args()
# Diagnostics go to stderr so stdout carries only the neighbour listing.
sys.stderr.write("options %s\n" % (opts,))

# --tokens and --matrix-file have no default; report their absence as a usage
# error rather than letting None surface later as an AttributeError or an
# np.load failure.
if not opts.tokens:
    optparser.error("--tokens is required")
if not opts.matrix_file:
    optparser.error("--matrix-file is required")

# Split once and ignore the empty pieces produced by repeated / leading /
# trailing spaces, which could never be valid vocab entries.
tokens = [t for t in opts.tokens.split(" ") if t]
if not tokens:
    optparser.error("--tokens is required")

token_idx = TokenIdx()
token_idx.read_from_file(opts.vocab)

# Check that every requested token is in the vocab before doing the expensive
# matrix load and tree construction; stop at the first unknown token.
for token in tokens:
    if not token_idx.id_exists_for(token):
        sys.stderr.write("token [%s] not in vocab?\n" % token)
        # sys.exit rather than the builtin exit(), which is only injected by
        # the site module and is not guaranteed to exist.
        sys.exit(1)

# Rows of E are indexed by token id (see the E[[id]] lookup below).
E = np.load(opts.matrix_file)

ball_tree = BallTree(E, leaf_size=30)
# Never ask for more neighbours than there are rows in E.
k = min(opts.k, E.shape[0])
for token in tokens:
    # Blank line separating the listing of each query token.
    print("")
    # E[[id]] keeps the query two dimensional (a single row).
    distances, indices = ball_tree.query(E[[token_idx.id_for(token)]], k=k)
    # One "distance token" line per neighbour. A single-argument print(...)
    # emits the same text on Python 2 and Python 3.
    for d, nn in zip(distances[0], indices[0]):
        print("%s %s" % (d, token_idx.token_for(nn)))
Beispiel #3
0
# Every file must share the prefix and var parsed from the first file;
# anything else means the glob picked up unrelated checkpoints.
prefix, _time, var = prefix_time_var_of_ckpt(files[0])
for f in files:
    next_prefix, _time, next_var = prefix_time_var_of_ckpt(f)
    if (next_prefix, next_var) != (prefix, var):
        raise Exception("glob includes files that dont match in prefix or var")

# Header row of the tab separated report.
print("\t".join("ckpt_time idx token x_l_dist x_f_dist".split()))

first = None  # rows of the first file loaded; every later file is compared to it
last = None   # rows of the previously loaded file
for f in files:
    X = np.load(f)

    # Restrict to the selected rows when ids were given.
    if ids:
        X = X[ids]

    if last is None:
        # First file: nothing to compare against yet, it becomes the baseline.
        first = X
    else:
        _prefix, ckpt_time, _var = prefix_time_var_of_ckpt(f)
        # Per-row Euclidean distance to the previous file and to the first one.
        dist_to_last = np.sqrt(np.sum((X - last) ** 2, axis=1))
        dist_to_first = np.sqrt(np.sum((X - first) ** 2, axis=1))
        for i, (x_l_dist, x_f_dist) in enumerate(zip(dist_to_last, dist_to_first)):
            # Map the row position back to its id when rows were subset by ids.
            if ids:
                t_id = ids[i]
            else:
                t_id = i
            row = (ckpt_time, t_id, token_idx.token_for(t_id), x_l_dist, x_f_dist)
            print("%s\t%d\t%s\t%f\t%f" % row)

    last = X