/
explore_feature.py
121 lines (99 loc) · 3.52 KB
/
explore_feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Decode the convolutional neural network to get the most significant features
"""
__author__ = "Wei Wang"
__email__ = "tskatom@vt.edu"
import sys
import os
import theano
import theano.tensor as T
import cPickle
from SIG_Cnn_encoder import make_data_cv, ReLU
import numpy as np
from theano import function, shared
import nn_layers as nn
import json
from theano.tensor.nnet import conv
def load_model(model_file, non_static=True, n_convs=3):
    """Load trained model parameters from a file of consecutive pickles.

    The file is read as a stream of pickled objects in this fixed order:
    classifier weights, classifier bias, then a (weights, bias) pair for
    each convolution layer and, only when ``non_static`` is true, one final
    object stored under ``"embedding"``.

    NOTE: unpickling can execute arbitrary code; only load model files that
    come from a trusted source.

    Args:
        model_file: path of the pickled model file.
        non_static: when true, also read the trailing embedding object.
        n_convs: number of convolution layers stored in the file. Defaults
            to 3, the value that was previously hard-coded.

    Returns:
        dict with ``"clf"`` -> ``[W, b]``, ``"convs"`` -> list of ``[W, b]``
        pairs (one per convolution layer) and, if ``non_static`` is true,
        ``"embedding"``.

    Raises:
        EOFError: if the file contains fewer pickled objects than requested.
    """
    params = {}
    # Pickle data is binary: open with 'rb' so the bytes are never
    # newline-translated or decoded as text.
    with open(model_file, 'rb') as f:
        # classifier parameters
        clf_W = cPickle.load(f)
        clf_b = cPickle.load(f)
        params["clf"] = [clf_W, clf_b]

        # convolution layer parameters, stored layer by layer
        conv_params = []
        for _ in range(n_convs):
            conv_W = cPickle.load(f)
            conv_b = cPickle.load(f)
            conv_params.append([conv_W, conv_b])
        params["convs"] = conv_params

        if non_static:
            params["embedding"] = cPickle.load(f)
    return params
def find_ngrams(docs, ns=(3, 4, 5)):
    """Collect the distinct n-grams found in ``docs`` for each size in ``ns``.

    The last two entries of every document are dropped before the sliding
    window is applied.
    NOTE(review): presumably those trailing slots hold non-word columns
    appended upstream (e.g. labels) -- confirm against the data builder.

    Args:
        docs: sequence of documents; each document is a sliceable sequence
            of hashable tokens (word ids).
        ns: iterable of n-gram lengths.

    Returns:
        list of sets, one per entry of ``ns`` and in the same order; each
        set contains the unique n-grams of that length as tuples. Documents
        shorter than the requested length contribute nothing.
    """
    # Trim every document once instead of once per n-gram size.
    trimmed_docs = [doc[:-2] for doc in docs]

    n_grams = []
    for n in ns:
        grams = set()
        for doc in trimmed_docs:
            grams.update(
                tuple(doc[i:i + n]) for i in range(len(doc) - n + 1)
            )
        n_grams.append(grams)
    return n_grams
def get_top_features(n_gram, params, word2id, n=0, k=50):
    """Return the top-``k`` n-grams, as text, for one convolution layer.

    Every n-gram is looked up in the embedding matrix, convolved with the
    filters of layer ``n`` and ranked by the raw convolution response,
    strongest first. The layer's bias is not applied (the previous version
    built a shared variable for it but never used it).

    NOTE(review): the per-filter reading of the result assumes the filter
    height equals the n-gram length and the filter width equals the
    embedding dimension, so that each n-gram yields exactly one response
    per filter -- confirm against the training configuration.

    Args:
        n_gram: iterable of equal-length tuples of word ids.
        params: parameter dict; ``params["embedding"]`` and
            ``params["convs"][n]`` (a ``[W, b]`` pair) are read.
        word2id: dict mapping word -> word id.
        n: index of the convolution layer in ``params["convs"]``.
        k: number of highest-scoring n-grams to keep per output column.

    Returns:
        dict mapping output-column index (filter index under the assumption
        above) to a list of n-gram strings ordered from strongest to weakest
        response. Ids missing from ``word2id`` are rendered as empty strings
        and surrounding whitespace is stripped.
    """
    words = shared(params["embedding"], borrow=True, name='ebmedding')

    # The ids enter the graph as a floatX matrix (one n-gram per row) and
    # are cast back to int32 to index the embedding table.
    x = T.matrix()
    x_input = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0], 1, x.shape[1], words.shape[1])
    )
    n_gram_matrix = np.asarray(list(n_gram), dtype=theano.config.floatX)

    # Filters of the selected layer; the bias is intentionally unused.
    conv_W, _ = params["convs"][n]
    W = shared(value=conv_W.astype(theano.config.floatX),
               borrow=True, name="conv_W")
    conv_out = conv.conv2d(input=x_input,
                           filters=W,
                           filter_shape=conv_W.shape)
    out = conv_out.flatten(2)

    # argsort is ascending: keep the last k rows and reverse them so the
    # strongest response comes first, then transpose to (column, rank).
    ind = T.argsort(out, axis=0)[-k:, :][::-1].T
    get_out = function(inputs=[x], outputs=ind)
    top_conv_ind = get_out(n_gram_matrix)
    # Single-argument form parses on Python 2 and 3; the text emitted is
    # unchanged (label, two spaces, shape).
    print('top_conv_ind.shape:  %s' % (top_conv_ind.shape,))
    top_word_ids = n_gram_matrix[top_conv_ind]

    # Convert ids back to words.
    id2words = {idx: word for word, idx in word2id.items()}
    n_gram_texts = {}
    for h_id, tops in enumerate(top_word_ids):
        n_gram_texts[h_id] = [
            ' '.join([id2words.get(wid, '') for wid in word_ids]).strip()
            for word_ids in tops
        ]
    return n_gram_texts
def main():
    """Extract the top n-grams of every convolution layer and pickle them.

    Loads the experiment data set and the trained model from ``./data``,
    builds the 3-, 4- and 5-gram sets from the first 10000 documents and
    writes one pickled ``{column index: [n-gram text, ...]}`` dict per
    n-gram size, in that order, to ``./data/top_ngrams.pkl``.

    NOTE: unpickling can execute arbitrary code; the data and model files
    must come from a trusted source.
    """
    print('...load the expriment data set')
    # Binary mode plus a context manager: the pickle is read as bytes and
    # the handle is closed deterministically.
    with open('./data/experiment_dataset2', 'rb') as f:
        data = cPickle.load(f)
    docs, type2id, pop2id, word2id, embedding, rand_embedding = data

    print('... construct the train/valid/test set')
    test_docs = docs[:10000]
    datasets = make_data_cv(test_docs, 0, word2id, max_l=1000, filter_h=5)

    print('... construct ngrams')
    # The i-th n-gram size is scored with the i-th convolution layer below.
    ns = [3, 4, 5]
    # NOTE(review): datasets[0] is presumably the training split returned
    # by make_data_cv -- confirm against that helper.
    n_grams = find_ngrams(datasets[0], ns)

    print('....Load model parameters')
    # load the trained parameters
    params = load_model('./data/pop_model.pkl')

    print('....start test the model')
    # dump the result: one pickle per n-gram size, appended to one file
    with open("./data/top_ngrams.pkl", 'wb') as tn:
        for n, n_gram in enumerate(n_grams):
            n_gram_texts = get_top_features(n_gram, params, word2id, n)
            cPickle.dump(n_gram_texts, tn)
# Run the extraction only when executed as a script, not on import.
if __name__ == "__main__":
    main()