def load_trigram_data(filename, mode):
    """Load whitespace-separated trigram records from a text file.

    Every token in the first three columns is mapped to an integer id through
    a fresh ``TokenIdx`` (via ``id_for``).

    Args:
        filename: path of the text file to read, one record per line.
        mode: ``'sm'`` expects exactly 3 tokens per line; the first two ids
            become the input row and the third id becomes the target.
            ``'lr'`` expects exactly 4 tokens per line; the three ids become
            the input row and the fourth token, parsed as a number, becomes
            the target.

    Returns:
        A tuple ``(idxs, target, token_idx)``: an int32 array of input ids, an
        int32 array of targets, and the ``TokenIdx`` used for the mapping.

    Raises:
        Exception: if a line does not have the token count required by
            ``mode`` (this includes blank lines and any unknown ``mode``).
    """
    token_idx = TokenIdx()
    idxs = []
    target = []
    # Context manager so the file handle is closed even when a bad record
    # makes us raise part-way through.
    with open(filename, 'r') as lines:
        for n, line in enumerate(lines):
            if n % 100000 == 0:
                # '\r' keeps the progress counter on a single terminal line.
                sys.stderr.write("loading data %s\r" % n)
            record = line.strip().split()
            is_sm = mode == 'sm' and len(record) == 3
            is_lr = mode == 'lr' and len(record) == 4
            # Validate before unpacking: a short line would otherwise fail the
            # 3-way unpack below with an unhelpful ValueError, and its tokens
            # would already have been given ids.
            if not (is_sm or is_lr):
                raise Exception("expected 3 token for mode=sm and 4 tokens for mode=lr, not %s tokens for mode %s" % (len(record), mode))
            w1_idx, w2_idx, w3_idx = [token_idx.id_for(w) for w in record[:3]]
            if is_sm:
                # just trigram
                idxs.append([w1_idx, w2_idx])
                target.append(w3_idx)
            else:
                # trigram with label
                idxs.append([w1_idx, w2_idx, w3_idx])
                target.append(float(record[3]))
    # NOTE(review): 'lr' labels are parsed as float but stored as int32 here,
    # so any fractional label is truncated -- confirm labels are always
    # integral (e.g. 0/1) before relying on this.
    return (np.asarray(idxs, dtype='int32'), np.asarray(target, dtype='int32'), token_idx)
# Example #2
0
                     help='np matrix file to load; eg ckpt.X.E')
# --tokens: which tokens to emit near neighbours for (single space separated).
optparser.add_option('--tokens',
                     None,
                     dest='tokens',
                     type='string',
                     help='space separated list of tokens to emit NNs for')
# --k: how many near neighbours to emit per token (defaults to 5).
optparser.add_option('--k',
                     None,
                     dest='k',
                     type='int',
                     default=5,
                     help='number of near neighbours to emit')
opts, arguments = optparser.parse_args()
# sys.stderr.write (rather than the Python 2 only "print >>" form) matches the
# logging style used elsewhere in this file and emits the same bytes.
sys.stderr.write("options %s\n" % (opts,))

# --tokens is declared without a default, so it is None when the flag is
# omitted; fail with a clear message instead of an AttributeError on .split().
if opts.tokens is None:
    sys.stderr.write("--tokens is required\n")
    sys.exit(1)

# Load the vocabulary from the file named by opts.vocab.
token_idx = TokenIdx()
token_idx.read_from_file(opts.vocab)

# checking that tokens are in vocab
for token in opts.tokens.split(" "):
    if not token_idx.id_exists_for(token):
        sys.stderr.write("token [%s] not in vocab?\n" % token)
        # sys.exit instead of the bare exit() helper, which only exists when
        # the site module has been loaded.
        sys.exit(1)

# Matrix saved with numpy (the option help gives ckpt.X.E as an example).
E = np.load(opts.matrix_file)

#lshf = LSHForest()
#lshf.fit(E)
#distances, indices = lshf.kneighbors(E[[token_idx.id_for("monday_NNS")]], n_neighbors=10)
#for d, i in zip(distances[0], indices[0]):
#    print d, token_idx.token_for(i)