Example #1
 def internal_validate(self,
                       sess,
                       train_data,
                       test_idx_array,
                       tidx,
                       test_batch_size=10000):
     print_and_log("...Start testing...", self.log_file)
     batches_test = batch_gen(train_data[0], train_data[1],
                              test_idx_array, test_batch_size,
                              np.array([tidx]), False, True)
     tars = np.array([], dtype=np.int32)
     scores = np.array([], dtype=np.float32)
     while True:
         batch_fp, batch_tar, is_last = next(batches_test)
         batch_cls = sess.run(self.cls_tensor,
                              feed_dict={
                                  self.fp_tensor: batch_fp,
                                  self.tar_tensor: batch_tar,
                                   self.idropout: 0,
                                   self.dropout: 0
                              })
         clear_point = np.where(batch_tar != 0)
         tars = np.concatenate((tars, batch_tar[clear_point]))
         scores = np.concatenate((scores, batch_cls[clear_point]))
         if is_last:
             break
     return tars, scores
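
All five examples drive their loops through batch_gen, which is defined elsewhere. From the call sites it takes (features, targets, idx_array, batch_size, tidx_array, known_only, use_all) and yields (batch_features, batch_targets, is_last) triples indefinitely, with is_last marking the end of one pass. A minimal sketch under those assumptions (only the signature and yield shape are taken from the calls above; the body is hypothetical):

 import numpy as np

 def batch_gen(features, targets, idx_array, batch_size, tidx_array,
               known_only=False, use_all=False):
     """Yield (batch_features, batch_targets, is_last) triples forever.

     known_only : drop rows whose selected targets are all unknown (zero)
     use_all    : emit a final short batch instead of discarding it
     """
     tv = targets[idx_array][:, tidx_array]
     rows = idx_array[np.any(tv != 0, axis=1)] if known_only else idx_array
     while True:
         order = np.random.permutation(rows)
         n = len(order)
         # assumes len(rows) >= batch_size when use_all is False
         stop = n if use_all else (n // batch_size) * batch_size
         for start in range(0, stop, batch_size):
             batch = order[start:start + batch_size]
             yield (features[batch],
                    targets[batch][:, tidx_array],
                    start + batch_size >= stop)
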
Example #2
 def train_and_test(self,
                    train_data,
                    idx_array=None,
                    test_idx_array=None,
                    tidx_array=None,
                    save_path=None):
     if idx_array is None:
         idx_array = np.array(range(len(train_data[0])))
     if tidx_array is None:
         tidx_array = np.array(range(train_data[1].shape[1]))
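     # sign balancing weights each target's positives by its negative/positive ratio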
     if self.sign_bal:
         if self.unknown_val is None:
             num_pos = np.sum(np.maximum(train_data[1][idx_array], 0),
                              axis=0)
             num_neg = np.sum(np.abs(train_data[1][idx_array]),
                              axis=0) - num_pos
         elif self.unknown_val == 0:
             num_pos = np.sum(np.maximum(train_data[1][idx_array], 0),
                              axis=0)
             num_neg = len(idx_array) - num_pos
         else:
             print_and_log(
                 "Error: unknown values must be ignored (None) or treated as negative (0) for sign balancing",
                 self.log_file)
             return None
         # NumPy division by zero does not raise, so check the counts explicitly
         if np.any(num_pos == 0) or np.any(num_neg == 0):
             print_and_log(
                 "Error: each target needs at least one positive and one negative",
                 self.log_file)
             return None
         self.sign_ratio = num_neg / num_pos
     else:
         self.sign_ratio = np.ones(train_data[1].shape[1], dtype=np.float32)
     config = tf.ConfigProto()
     config.gpu_options.allow_growth = True
     saver = tf.train.Saver()
     with tf.Session(config=config) as sess:
         ## Initializing
         print_and_log("...Start initializing...", self.log_file)
         init = tf.global_variables_initializer()
         init_var = tf.variables_initializer([self.w, self.b])
         sess.run(init)
         test_idx, train_idx = set_aside_test(idx_array,
                                              self.validation_frac)
         tidx2batches = {}
         tidx2early = {}
         tidx2epoch = {}
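         # the loss is the monitored value, so smaller is better (reverse=False)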
         rev = False
         for tidx in tidx_array:
             if self.unknown_val is None:
                 batches = batch_gen(train_data[0],
                                     train_data[1],
                                     train_idx,
                                     self.batch_size,
                                     np.array([tidx]),
                                     known_only=True,
                                     use_all=self.use_all)
             else:
                 batches = batch_gen(train_data[0],
                                     train_data[1],
                                     train_idx,
                                     self.batch_size,
                                     np.array([tidx]),
                                     known_only=False,
                                     use_all=self.use_all)
             tidx2batches[tidx] = batches
             tidx2early[tidx] = EarlyStopping(patience=self.patience,
                                              reverse=rev)
             tidx2epoch[tidx] = 0
         test_fp = train_data[0][test_idx]
         test_tv = train_data[1][test_idx]
         if save_path is None:
             tar_array = np.array([], dtype=np.int32)
             score_array = np.array([], dtype=np.float32)
         else:
             roc_array = []
             pr_array = []
         ## Training
         print_and_log("...Start training...", self.log_file)
         save_w = np.zeros((len(tidx_array), self.last_size, 2),
                           dtype=np.float32)
         save_b = np.zeros((len(tidx_array), 2), dtype=np.float32)
         is_first_array = np.ones(len(tidx_array), dtype=bool)
         is_last_array = np.zeros(len(tidx_array), dtype=bool)
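         # round-robin training: one batch per target per pass, until every target stops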
         while True:
             for i, tidx in enumerate(tidx_array):
                 # without batch balancing, skip targets that have already stopped
                 if not self.batch_bal and is_last_array[i]:
                     continue
                 if is_first_array[i]:
                     sess.run(init_var)
                     is_first_array[i] = False
                 else:
                     sess.run(init_var,
                              feed_dict={
                                  self.init_w: save_w[i],
                                  self.init_b: save_b[i]
                              })
                 batches = tidx2batches[tidx]
                 batch_fp, batch_tv, is_last = next(batches)
                 sess.run(self.trainer,
                          feed_dict={
                              self.fp_tensor: batch_fp,
                              self.tar_tensor: batch_tv,
                              self.sign_weight: self.sign_ratio[tidx]
                          })
                 save_w[i], save_b[i] = sess.run([self.w, self.b])
                 # tidx reached the end of an epoch: validate it and decide
                 # whether to stop or keep training this target
                 if is_last and not is_last_array[i]:
                     tidx2epoch[tidx] += 1
                     epoch = tidx2epoch[tidx]
                     if epoch >= self.min_epoch:
                         loss = sess.run(self.cls_loss,
                                         feed_dict={
                                             self.fp_tensor: test_fp,
                                             self.tar_tensor: test_tv[:, np.array([tidx])],
                                             self.idropout: 0,
                                             self.dropout: 0,
                                             self.sign_weight: self.sign_ratio[tidx]
                                         })
                         early_stop_code = tidx2early[tidx].validate(loss)
                         if early_stop_code == 2:
                             print_and_log("+++ for tidx=%d +++" % tidx,
                                           self.log_file)
                             print_and_log(
                                 "...Terminating training by early stopper...",
                                 self.log_file)
                             print_and_log(
                                 "epoch=%d, loss=%.5f" % (epoch, loss),
                                 self.log_file)
                             print_and_log("", self.log_file)
                             is_last_array[i] = True
                     if epoch == self.max_epoch:
                         print_and_log("+++ for tidx=%d +++" % tidx,
                                       self.log_file)
                         print_and_log(
                             "...Terminating training because it reached the max epoch...",
                             self.log_file)
                         print_and_log(
                             "epoch=%d, loss=%.5f" % (epoch, loss),
                             self.log_file)
                         print_and_log("", self.log_file)
                         is_last_array[i] = True
                     # Validating
                     if is_last_array[i]:
                         if self.do_save:
                             saver.save(
                                 sess, self.save_dir + self.model_name +
                                 '/model_%d' % tidx)
                         tars, scores = self.internal_validate(
                             sess, train_data, test_idx_array, tidx, 10000)
                         if save_path is None:
                             tar_array = np.concatenate((tar_array, tars))
                             score_array = np.concatenate(
                                 (score_array, scores))
                         else:
                             roc, pr = get_auc_from_array(tars, scores)
                             roc_array.append(roc)
                             pr_array.append(pr)
             if np.all(is_last_array):
                 break
         if save_path is None:
             roc_auc, pr_auc = get_auc_from_array(tar_array, score_array)
         else:
             roc_array = np.array(roc_array)
             pr_array = np.array(pr_array)
             np.save(save_path + 'roc_auc.npy', roc_array)
             np.save(save_path + 'pr_auc.npy', pr_array)
             roc_auc = np.mean(roc_array)
             pr_auc = np.mean(pr_array)
         print_and_log("+++ test_set +++", self.log_file)
         print_and_log(
             "loss=%.5f, recall_top_%d=%.5f, ROC_AUC=%.5f, PR_AUC=%.5f" %
             (0, self.top_k, 0, roc_auc, pr_auc), self.log_file)
         print_and_log("", self.log_file)
         print_and_log("...Validation finished...", self.log_file)
         with os.popen("date") as pop_file:
             print_and_log(pop_file.readline(), self.log_file)
         return 0, 0, roc_auc, pr_auc
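
EarlyStopping is another external helper. The branches above imply that validate() returns 0 on a new best value (the caller checkpoints), 1 while patience is being consumed, and 2 once patience runs out, and that reverse=True flips the comparison for metrics where larger is better (recall/ROC/PR in Example #5). A plausible minimal implementation under those inferred semantics:

 class EarlyStopping:
     """Track a validation value and signal when training should stop.

     validate() return codes, inferred from the call sites:
       0 - new best value (checkpoint), 1 - no improvement yet, 2 - stop.
     """

     def __init__(self, patience=5, reverse=False):
         self.patience = patience
         self.reverse = reverse  # True when larger values are better
         self.best = None
         self.bad_count = 0

     def validate(self, value):
         improved = self.best is None or (
             value > self.best if self.reverse else value < self.best)
         if improved:
             self.best = value
             self.bad_count = 0
             return 0
         self.bad_count += 1
         return 2 if self.bad_count > self.patience else 1
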
Example #3
 def test_target(self,
                 test_data,
                 idx_array=None,
                 tidx_array=None,
                 save_path=None):
     if idx_array is None:
         idx_array = np.array(range(len(test_data[0])))
     if tidx_array is None:
         tidx_array = np.array(range(test_data[1].shape[1]))
     if not self.check_train:
         print_and_log("This model is not trained yet", self.log_file)
         return 0
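     # invert any per-target loss weights for the test-time loss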
     if self.weight is not None:
         compensate = 1 / self.weight
     else:
         compensate = np.ones(test_data[1].shape[1])
     config = tf.ConfigProto()
     config.gpu_options.allow_growth = True
     saver = tf.train.Saver()
     with tf.Session(config=config) as sess:
         print_and_log("...Start testing...", self.log_file)
         if self.param_num is None:
             saver.restore(sess, self.save_dir + self.model_name + '/model')
         else:
             saver.restore(
                 sess, self.save_dir + self.model_name +
                 '/model_for_param_%d' % self.param_num)
         batches = batch_gen(test_data[0],
                             test_data[1],
                             idx_array,
                             self.batch_size,
                             tidx_array,
                             use_all=True)
         tar_con = np.array([[]] * self.output_space).T
         cls_con = np.array([[]] * self.output_space).T
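         # empty (0, output_space) accumulators, extended batch by batch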
         loss = 0.0
         count = 0
         while True:
             batch_fp, batch_tv, is_last = next(batches)
             batch_cls, batch_loss = sess.run(
                 [self.cls_tensor, self.cls_loss],
                 feed_dict={
                     self.fp_tensor: batch_fp,
                     self.tar_tensor: batch_tv,
                     self.idropout: 0,
                     self.dropout: 0,
                     self.sign_weight: self.sign_ratio,
                     self.compensate: compensate[tidx_array]
                 })
             tar_con = np.concatenate((tar_con, batch_tv), axis=0)
             cls_con = np.concatenate((cls_con, batch_cls), axis=0)
             loss += batch_loss
             count += 1
             if is_last:
                 break
         print_and_log("...Calculation finished...", self.log_file)
         with os.popen("date") as pop_file:
             print_and_log(pop_file.readline(), self.log_file)
         loss /= count
         recall, _, _ = get_recall(tar_con, cls_con, self.top_k)
         roc_array, pr_array = get_auc_per_col(tar_con,
                                               cls_con,
                                               zero_is_ambiguous=True)
         if save_path is not None:
             np.save(save_path + 'roc_auc.npy', roc_array)
             np.save(save_path + 'pr_auc.npy', pr_array)
         roc_auc = np.mean(roc_array)
         pr_auc = np.mean(pr_array)
         print_and_log("+++ test_set +++", self.log_file)
         print_and_log(
             "loss=%.5f, recall_top_%d=%.5f, ROC_AUC=%.5f, PR_AUC=%.5f" %
             (loss, self.top_k, recall, roc_auc, pr_auc), self.log_file)
         print_and_log("", self.log_file)
         print_and_log("...Validation finished...", self.log_file)
     with os.popen("date") as pop_file:
         print_and_log(pop_file.readline(), self.log_file)
     return loss, recall, roc_auc, pr_auc
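
The metric helpers get_auc_from_array, get_auc_per_col, and get_auc_from_2d are also imported from elsewhere. Since the targets take values in {-1, 0, +1} with 0 meaning unknown, they are presumably thin wrappers around scikit-learn; a sketch of the 1-D variant under that assumption (only the name and signature come from the calls above, the body is hypothetical):

 import numpy as np
 from sklearn.metrics import average_precision_score, roc_auc_score

 def get_auc_from_array(tars, scores):
     """ROC AUC and PR AUC for 1-D targets in {-1, +1} (zeros already removed)."""
     labels = (np.asarray(tars) > 0).astype(int)  # map -1/+1 to 0/1
     scores = np.asarray(scores)
     return roc_auc_score(labels, scores), average_precision_score(labels, scores)
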
Example #4
 def train_and_test(self,
                    train_data,
                    idx_array=None,
                    test_idx_array=None,
                    tidx_array=None,
                    save_path=None):  # simultaneous learning and testing
     if idx_array is None:
         idx_array = np.array(range(len(train_data[0])))
     if test_idx_array is None:
         test_idx_array = np.array(range(len(train_data[0])))
     if tidx_array is None:
         tidx_array = np.array(range(train_data[1].shape[1]))
     if self.sign_bal:
         if self.unknown_val is None:
             num_pos = np.sum(np.maximum(train_data[1][idx_array], 0),
                              axis=0)
             num_neg = np.sum(np.abs(train_data[1][idx_array]),
                              axis=0) - num_pos
         elif self.unknown_val == 0:
             num_pos = np.sum(np.maximum(train_data[1][idx_array], 0),
                              axis=0)
             num_neg = len(idx_array) - num_pos
         else:
             print_and_log(
                 "Error: unknown values must be ignored (None) or treated as negative (0) for sign balancing",
                 self.log_file)
             return None
         # NumPy division by zero does not raise, so check the counts explicitly
         if np.any(num_pos == 0) or np.any(num_neg == 0):
             print_and_log(
                 "Error: each target needs at least one positive and one negative",
                 self.log_file)
             return None
         self.sign_ratio = num_neg / num_pos
     else:
         self.sign_ratio = np.ones(train_data[1].shape[1], dtype=np.float32)
     config = tf.ConfigProto()
     config.gpu_options.allow_growth = True
     saver = tf.train.Saver()  # used by saver.save when self.do_save is set
     with tf.Session(config=config) as sess:
         ## Initializing
         print_and_log("...Start initializing...", self.log_file)
         init = tf.global_variables_initializer()
         test_idx, train_idx = set_aside_test(idx_array,
                                              self.validation_frac)
         tidx2batches = {}
         for tidx in tidx_array:
             if self.unknown_val is None:
                 batches = batch_gen(train_data[0],
                                     train_data[1],
                                     train_idx,
                                     self.batch_size,
                                     np.array([tidx]),
                                     known_only=True,
                                     use_all=self.use_all)
             else:
                 batches = batch_gen(train_data[0],
                                     train_data[1],
                                     train_idx,
                                     self.batch_size,
                                     np.array([tidx]),
                                     known_only=False,
                                     use_all=self.use_all)
             tidx2batches[tidx] = batches
         test_fp = train_data[0][test_idx]
         test_tv = train_data[1][test_idx]
         ## Training and Testing
         print_and_log("...Start training...", self.log_file)
         rev = False
         if save_path is None:
             tar_array = np.array([], dtype=np.int32)
             score_array = np.array([], dtype=np.float32)
         else:
             roc_array = []
             pr_array = []
         for i, tidx in enumerate(tidx_array):
             batches = tidx2batches[tidx]
             early_stopper = EarlyStopping(patience=self.patience,
                                           reverse=rev)
             print_and_log("+++ for tidx=%d +++" % tidx, self.log_file)
             for e in range(1, self.max_epoch + 1):
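                 # variables are re-initialized once per target, at its first epoch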
                 if e == 1:
                     sess.run(init)
                 while True:
                     batch_fp, batch_tv, is_last = next(batches)
                     sess.run(self.trainer,
                              feed_dict={
                                  self.fp_tensor: batch_fp,
                                  self.tar_tensor: batch_tv,
                                  self.sign_weight: self.sign_ratio[tidx]
                              })
                     if is_last:
                         break
                 # Testing
                 if e >= self.min_epoch:
                     loss = sess.run(self.cls_loss,
                                     feed_dict={
                                         self.fp_tensor: test_fp,
                                         self.tar_tensor: test_tv[:, np.array([tidx])],
                                         self.idropout: 0,
                                         self.dropout: 0,
                                         self.sign_weight: self.sign_ratio[tidx]
                                     })
                     early_stop_code = early_stopper.validate(loss)
                     if early_stop_code == 2:
                         print_and_log(
                             "...Terminating training by early stopper...",
                             self.log_file)
                         print_and_log("epoch=%d, loss=%.5f" % (e, loss),
                                       self.log_file)
                         print_and_log("", self.log_file)
                         break
                 if e == self.max_epoch:
                     print_and_log(
                         "...Terminating training because it reached the max epoch...",
                         self.log_file)
                     print_and_log("epoch=%d, loss=%.5f" % (e, loss),
                                   self.log_file)
                     print_and_log("", self.log_file)
             if self.do_save:
                 saver.save(
                     sess,
                     self.save_dir + self.model_name + '/model_%d' % tidx)
             # Validating
             print_and_log("...Start testing...", self.log_file)
             test_batch_size = 10000
             batches_test = batch_gen(train_data[0], train_data[1],
                                      test_idx_array, test_batch_size,
                                      np.array([tidx]), False, True)
             tars = np.array([], dtype=np.int32)
             scores = np.array([], dtype=np.float32)
             while True:
                 batch_fp, batch_tar, is_last = next(batches_test)
                 batch_cls = sess.run(self.cls_tensor,
                                      feed_dict={
                                          self.fp_tensor: batch_fp,
                                          self.tar_tensor: batch_tar,
                                           self.idropout: 0,
                                           self.dropout: 0
                                      })
                 clear_point = np.where(batch_tar != 0)
                 tars = np.concatenate((tars, batch_tar[clear_point]))
                 scores = np.concatenate((scores, batch_cls[clear_point]))
                 if is_last:
                     break
             if save_path is None:
                 tar_array = np.concatenate((tar_array, tars))
                 score_array = np.concatenate((score_array, scores))
             else:
                 roc, pr = get_auc_from_array(tars, scores)
                 roc_array.append(roc)
                 pr_array.append(pr)
         print_and_log("...Calculation finished...", self.log_file)
         if save_path is None:
             roc_auc, pr_auc = get_auc_from_array(tar_array, score_array)
         else:
             roc_array = np.array(roc_array)
             pr_array = np.array(pr_array)
             np.save(save_path + 'roc_auc.npy', roc_array)
             np.save(save_path + 'pr_auc.npy', pr_array)
             roc_auc = np.mean(roc_array)
             pr_auc = np.mean(pr_array)
         # return
         print_and_log("+++ test_set +++", self.log_file)
         print_and_log(
             "loss=%.5f, recall_top_%d=%.5f, ROC_AUC=%.5f, PR_AUC=%.5f" %
             (0, self.top_k, 0, roc_auc, pr_auc), self.log_file)
         print_and_log("", self.log_file)
         print_and_log("...Validation finished...", self.log_file)
         with os.popen("date") as pop_file:
             print_and_log(pop_file.readline(), self.log_file)
         return 0, 0, roc_auc, pr_auc
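
set_aside_test is the remaining data helper: the examples unpack its result as test_idx, train_idx, so it splits the supplied indices into a held-out validation slice of size validation_frac and the remaining training indices. A minimal sketch, assuming a uniform random split:

 import numpy as np

 def set_aside_test(idx_array, validation_frac):
     """Randomly split indices into (test_idx, train_idx)."""
     shuffled = np.random.permutation(idx_array)
     n_test = max(1, int(len(shuffled) * validation_frac))
     return shuffled[:n_test], shuffled[n_test:]
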
Example #5
 def train(self, train_data, idx_array=None, tidx_array=None):
     if idx_array is None:
         idx_array = np.array(range(len(train_data[0])))
     if tidx_array is None:
         tidx_array = np.array(range(train_data[1].shape[1]))
     if self.weight is not None:
         compensate = 1 / self.weight
     else:
         compensate = np.ones(train_data[1].shape[1])
     if self.sign_bal:
         if self.unknown_val is None:
             labels = train_data[1][idx_array][:, tidx_array]
             num_pos = np.sum(np.maximum(labels, 0), axis=0)
             num_neg = np.sum(np.abs(labels), axis=0) - num_pos
         elif self.unknown_val == 0:
             labels = train_data[1][idx_array][:, tidx_array]
             num_pos = np.sum(np.maximum(labels, 0), axis=0)
             num_neg = len(idx_array) - num_pos
         else:
             print_and_log(
                 "Error: unknown values must be ignored (None) or treated as negative (0) for sign balancing",
                 self.log_file)
             return None
         # NumPy division by zero does not raise, so check the counts explicitly
         if np.any(num_pos == 0) or np.any(num_neg == 0):
             print_and_log(
                 "Error: each target needs at least one positive and one negative",
                 self.log_file)
             return None
         self.sign_ratio = num_neg / num_pos
     else:
         self.sign_ratio = np.ones(len(tidx_array), dtype=np.float32)
     config = tf.ConfigProto()
     config.gpu_options.allow_growth = True
     saver = tf.train.Saver()
     with tf.Session(config=config) as sess:
         ## Initializing
         print_and_log("...Start initializing...", self.log_file)
         init = tf.global_variables_initializer()
         sess.run(init)
         test_idx, train_idx = set_aside_test(idx_array,
                                              self.validation_frac)
         batches = batch_gen(train_data[0],
                             train_data[1],
                             train_idx,
                             self.batch_size,
                             tidx_array,
                             use_all=self.use_all)
         test_fp = train_data[0][test_idx]
         test_tv = train_data[1][test_idx][:, tidx_array]
         ## Training
         print_and_log("...Start training...", self.log_file)
         if self.standard in ['recall', 'roc', 'pr']:
             rev = True
         else:
             rev = False
         early_stopper = EarlyStopping(patience=self.patience, reverse=rev)
         for e in range(1, self.max_epoch + 1):
             while True:
                 batch_fp, batch_tv, is_last = next(batches)
                 sess.run(self.trainer,
                          feed_dict={
                              self.fp_tensor: batch_fp,
                              self.tar_tensor: batch_tv,
                              self.sign_weight: self.sign_ratio,
                              self.compensate: compensate[tidx_array]
                          })
                 if is_last:
                     break
             # Testing
             test_cls, loss = sess.run(
                 [self.cls_tensor, self.cls_loss],
                 feed_dict={
                     self.fp_tensor: test_fp,
                     self.tar_tensor: test_tv,
                     self.idropout: 0,
                     self.dropout: 0,
                     self.sign_weight: self.sign_ratio,
                     self.compensate: compensate[tidx_array]
                 })
             recall, _, _ = get_recall(test_tv, test_cls, self.top_k)
             roc_auc, pr_auc = get_auc_from_2d(test_tv,
                                               test_cls,
                                               zero_is_ambiguous=True)
             print_and_log("+++ epoch=%i +++" % e, self.log_file)
             print_and_log(
                 "loss=%.5f, recall_top_%d=%.5f, ROC_AUC=%.5f, PR_AUC=%.5f"
                 % (loss, self.top_k, recall, roc_auc, pr_auc),
                 self.log_file)
             # Validation & Saving
             if e >= self.min_epoch:
                 if self.standard == 'recall':
                     value = recall
                 elif self.standard == 'roc':
                     value = roc_auc
                 elif self.standard == 'pr':
                     value = pr_auc
                 else:
                     value = loss
                 early_stop_code = early_stopper.validate(value)
                 if early_stop_code == 0:
                     print_and_log("...Saving current state...",
                                   self.log_file)
                     if self.param_num is None:
                         saver.save(
                             sess,
                             self.save_dir + self.model_name + '/model')
                     else:
                         saver.save(
                             sess, self.save_dir + self.model_name +
                             '/model_for_param_%d' % self.param_num)
                     self.check_train = True
                 elif early_stop_code == 2:
                     print_and_log(
                         "...Terminating training by early stopper...",
                         self.log_file)
                     print_and_log("", self.log_file)
                     break
             if e == self.max_epoch:
                 print_and_log(
                     "...Terminating training because it reached the max epoch...",
                     self.log_file)
             print_and_log("", self.log_file)
     with os.popen("date") as pop_file:
         print_and_log(pop_file.readline(), self.log_file)
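
Finally, print_and_log appears in every example; it evidently echoes a message to stdout and mirrors it to an open log file handle. One straightforward version consistent with that usage (assumed, since the original is not shown):

 def print_and_log(message, log_file):
     """Print a message and append it to an open log file, if one is given."""
     print(message)
     if log_file is not None:
         log_file.write(message + "\n")
         log_file.flush()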