Example no. 1
 def get_candidate_label_from_x_emb(self, trained_model_path, k):
     # Assumes module-level imports: numpy as np, sklearn.neighbors.NearestNeighbors,
     # and a dump_pickle helper.
     train_pid = list(self.train_x_emb.keys())
     train_emb = list(self.train_x_emb.values())
     test_pid = list(self.test_x_emb.keys())
     test_emb = list(self.test_x_emb.values())
     print('begin KNN')
     nbrs = NearestNeighbors(n_neighbors=k).fit(train_emb)   # index the training embeddings
     print('end KNN')
     _, indices = nbrs.kneighbors(test_emb)                  # k nearest training points per test point
     # collect candidate labels from the k nearest neighbours
     test_unique_candidate_label = {}
     test_all_candidate_label = {}
     for i in range(len(indices)):
         k_nbs = np.array(train_pid)[indices[i]]             # pids of the k neighbours
         can_l = []
         for pid in k_nbs:
             can_l.append(self.train_data.label_data[pid])
         all_can_l = np.concatenate(can_l)                   # neighbour labels, with repeats
         unique_can_l = np.unique(all_can_l)                 # deduplicated candidate set
         test_all_candidate_label[test_pid[i]] = all_can_l
         test_unique_candidate_label[test_pid[i]] = unique_can_l
     self.test_unique_candidate_label = test_unique_candidate_label
     self.test_all_candidate_label = test_all_candidate_label
     dump_pickle(self.test_unique_candidate_label, trained_model_path + 'test_candidate_label.pkl')
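The method above maps each test point to the union of labels carried by its k nearest training points in embedding space. The following is a minimal self-contained sketch of that candidate-label construction on toy data; the names train_emb, train_labels, and test_emb are illustrative only and not part of the original code.

import numpy as np
from sklearn.neighbors import NearestNeighbors

# toy embeddings and label sets, keyed by pid (illustrative values only)
train_emb = {'p1': [0.0, 0.0], 'p2': [1.0, 1.0], 'p3': [0.1, 0.2]}
train_labels = {'p1': [3, 7], 'p2': [5], 'p3': [7, 9]}
test_emb = {'q1': [0.05, 0.1]}

train_pid = list(train_emb.keys())
nbrs = NearestNeighbors(n_neighbors=2).fit(list(train_emb.values()))
_, idx = nbrs.kneighbors(list(test_emb.values()))

for i, qid in enumerate(test_emb):
    neighbours = np.array(train_pid)[idx[i]]      # pids of the k nearest training points
    candidates = np.unique(np.concatenate([train_labels[p] for p in neighbours]))
    print(qid, candidates)                        # q1 [3 7 9]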
Example no. 2
 def generate_x_embedding(self, trained_model_path):
     # Compute X-embeddings for the train and test sets with the trained model;
     # the KNN candidate-label step above consumes these embeddings.
     # Assumes module-level imports: tensorflow as tf, numpy as np, and a dump_pickle helper.
     # Build the model graph; only the x_emb tensor is used here.
     x_emb, y_, loss = self.model.build_model()
     gpu_options = tf.GPUOptions(allow_growth=True)
     with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
         tf.global_variables_initializer().run()
         saver = tf.train.Saver(tf.global_variables())
         print('load trained model...')
         model_name = trained_model_path + 'model_final'
         saver.restore(sess, model_name)  # restore the trained weights from the checkpoint
         # -------------- get train_x_emb ------------
         print('get train_x_emb')
         i = 0  # position in the training set
         k = 0  # batch counter, used only for progress printing
         # dummy y / label-embedding inputs: only x_emb is fetched, so zeros suffice
         zero_y = np.zeros((self.batch_size, 2))
         zero_label_emb = np.zeros((self.batch_size, self.model.label_embedding_dim))
         while i < len(self.train_data.pids):
             k += 1
             if k % self.show_batches == 0:
                 print('batch ' + str(k))
             batch_pid, batch_x, batch_len = self.train_data.get_pid_x(i, i + self.batch_size)
             if self.if_use_seq_len:
                 feed_dict = {self.model.x: np.array(batch_x), self.model.y: np.array(zero_y),
                              self.model.seqlen: np.array(batch_len),
                              self.model.label_embedding_id: zero_label_emb
                              }
             else:
                 feed_dict = {self.model.x: np.array(batch_x), self.model.y: np.array(zero_y),
                              self.model.label_embedding_id: zero_label_emb
                              }
             x_emb_ = sess.run(x_emb, feed_dict)  # forward pass: embeddings for this batch
             for x_i in range(len(batch_pid)):
                 self.train_x_emb[batch_pid[x_i]] = x_emb_[x_i]  # keyed by pid
             i += self.batch_size
         print('dump train_x_emb')
         dump_pickle(self.train_x_emb, trained_model_path + 'train_x_emb.pkl')
         # -------------- get test_x_emb -------------
         print('get test_x_emb')
         i = 0
         k = 0
         while i < len(self.test_data.pids):
             k += 1
             if k % self.show_batches == 0:
                 print('batch ' + str(k))
             batch_pid, batch_x, batch_len = self.test_data.get_pid_x(i, i + self.batch_size)
             if self.if_use_seq_len:
                 feed_dict = {self.model.x: np.array(batch_x), self.model.y: np.array(zero_y),
                              self.model.seqlen: np.array(batch_len),
                              self.model.label_embedding_id: zero_label_emb
                              }
             else:
                 feed_dict = {self.model.x: np.array(batch_x), self.model.y: np.array(zero_y),
                              self.model.label_embedding_id: zero_label_emb
                              }
             x_emb_ = sess.run(x_emb, feed_dict)
             for x_i in range(len(batch_pid)):
                 self.test_x_emb[batch_pid[x_i]] = x_emb_[x_i]
             i += self.batch_size
         print('dump test_x_emb')
         dump_pickle(self.test_x_emb, trained_model_path + 'test_x_emb.pkl')
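Both loops above follow the same batch-and-collect pattern: step through the ids in fixed-size batches, run one forward pass per batch, and store each embedding under its pid. The following is a minimal sketch of that pattern with a stand-in embed_fn in place of sess.run(x_emb, feed_dict); pids, features, batch_size, and embed_fn are illustrative only and not part of the original code.

import numpy as np

def embed_fn(batch):                       # stand-in for the TF forward pass
    return np.asarray(batch).mean(axis=1, keepdims=True)

pids = ['a', 'b', 'c', 'd', 'e']           # toy ids and features
features = np.random.rand(len(pids), 8)
batch_size = 2

x_emb = {}
for start in range(0, len(pids), batch_size):
    batch_pid = pids[start:start + batch_size]
    batch_x = features[start:start + batch_size]
    emb = embed_fn(batch_x)                # one forward pass per batch
    for j, pid in enumerate(batch_pid):
        x_emb[pid] = emb[j]                # keyed by id, like train_x_emb / test_x_emb
print(sorted(x_emb))                       # ['a', 'b', 'c', 'd', 'e']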