def internal_validate(self, sess, train_data, test_idx_array, tidx,
                      test_batch_size=10000):
    print_and_log("...Start testing...", self.log_file)
    batches_test = batch_gen(train_data[0], train_data[1], test_idx_array,
                             test_batch_size, np.array([tidx]), False, True)
    tars = np.array([], dtype=np.int32)
    scores = np.array([], dtype=np.float32)
    while True:
        batch_fp, batch_tar, is_last = next(batches_test)
        # Run the classifier with both dropout layers disabled for evaluation.
        batch_cls = sess.run(self.cls_tensor,
                             feed_dict={
                                 self.fp_tensor: batch_fp,
                                 self.tar_tensor: batch_tar,
                                 self.idropout: 0,
                                 self.dropout: 0
                             })
        # Keep only known labels; 0 marks unknown/ambiguous entries.
        clear_point = np.where(batch_tar != 0)
        tars = np.concatenate((tars, batch_tar[clear_point]))
        scores = np.concatenate((scores, batch_cls[clear_point]))
        if is_last:
            break
    return tars, scores
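# The `batch_tar != 0` mask above drops unknown labels before AUC computation:
# targets are +1/-1 and 0 marks missing annotations. A minimal numpy
# illustration (the values are made up):
#
#     >>> import numpy as np
#     >>> batch_tar = np.array([1, 0, -1, 1, 0], dtype=np.int32)
#     >>> batch_cls = np.array([0.9, 0.5, 0.1, 0.7, 0.4], dtype=np.float32)
#     >>> clear_point = np.where(batch_tar != 0)
#     >>> batch_tar[clear_point]
#     array([ 1, -1,  1], dtype=int32)
#     >>> batch_cls[clear_point]
#     array([0.9, 0.1, 0.7], dtype=float32)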
def train_and_test(self, train_data, idx_array=None, test_idx_array=None,
                   tidx_array=None, save_path=None):
    if idx_array is None:
        idx_array = np.array(range(len(train_data[0])))
    if test_idx_array is None:
        test_idx_array = np.array(range(len(train_data[0])))
    if tidx_array is None:
        tidx_array = np.array(range(train_data[1].shape[1]))
    if self.sign_bal:
        if self.unknown_val is None:
            num_pos = np.sum(np.maximum(train_data[1][idx_array], 0), axis=0)
            num_neg = np.sum(np.abs(train_data[1][idx_array]), axis=0) - num_pos
        elif self.unknown_val == 0:
            num_pos = np.sum(np.maximum(train_data[1][idx_array], 0), axis=0)
            num_neg = len(idx_array) - num_pos
        else:
            print_and_log(
                "Error: Unknown data must be ignored (None) or treated as negative (0) for sign balancing",
                self.log_file)
            return None
        # numpy division by zero does not raise, so check explicitly.
        if np.any(num_pos == 0) or np.any(num_neg == 0):
            print_and_log(
                "Error: Each target needs at least one positive and one negative",
                self.log_file)
            return None
        self.sign_ratio = num_neg / num_pos
    else:
        self.sign_ratio = np.ones(train_data[1].shape[1], dtype=np.float32)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        ## Initializing
        print_and_log("...Start initializing...", self.log_file)
        init = tf.global_variables_initializer()
        init_var = tf.variables_initializer([self.w, self.b])
        sess.run(init)
        test_idx, train_idx = set_aside_test(idx_array, self.validation_frac)
        tidx2batches = {}
        tidx2early = {}
        tidx2epoch = {}
        rev = False  # loss is monitored, so lower is better
        for tidx in tidx_array:
            if self.unknown_val is None:
                batches = batch_gen(train_data[0], train_data[1], train_idx,
                                    self.batch_size, np.array([tidx]),
                                    known_only=True, use_all=self.use_all)
            else:
                batches = batch_gen(train_data[0], train_data[1], train_idx,
                                    self.batch_size, np.array([tidx]),
                                    known_only=False, use_all=self.use_all)
            tidx2batches[tidx] = batches
            tidx2early[tidx] = EarlyStopping(patience=self.patience, reverse=rev)
            tidx2epoch[tidx] = 0
        test_fp = train_data[0][test_idx]
        test_tv = train_data[1][test_idx]
        if save_path is None:
            tar_array = np.array([], dtype=np.int32)
            score_array = np.array([], dtype=np.float32)
        else:
            roc_array = []
            pr_array = []
        ## Training
        print_and_log("...Start training...", self.log_file)
        save_w = np.zeros((len(tidx_array), self.last_size, 2), dtype=np.float32)
        save_b = np.zeros((len(tidx_array), 2), dtype=np.float32)
        is_first_array = np.ones(len(tidx_array), dtype=bool)
        is_last_array = np.zeros(len(tidx_array), dtype=bool)
        while True:
            # Train the targets in an interleaved, one-batch-per-target round.
            for i, tidx in enumerate(tidx_array):
                # Without batch balancing, skip targets that already finished.
                if not self.batch_bal and is_last_array[i]:
                    continue
                if is_first_array[i]:
                    sess.run(init_var)
                    is_first_array[i] = False
                else:
                    # Restore this target's output-layer weights before its step.
                    sess.run(init_var,
                             feed_dict={
                                 self.init_w: save_w[i],
                                 self.init_b: save_b[i]
                             })
                batches = tidx2batches[tidx]
                batch_fp, batch_tv, is_last = next(batches)
                sess.run(self.trainer,
                         feed_dict={
                             self.fp_tensor: batch_fp,
                             self.tar_tensor: batch_tv,
                             self.sign_weight: self.sign_ratio[tidx]
                         })
                save_w[i], save_b[i] = sess.run([self.w, self.b])
                # This target just completed an epoch: validate and decide
                # whether to stop or continue training it.
                if is_last and not is_last_array[i]:
                    tidx2epoch[tidx] += 1
                    epoch = tidx2epoch[tidx]
                    if epoch >= self.min_epoch:
                        loss = sess.run(self.cls_loss,
                                        feed_dict={
                                            self.fp_tensor: test_fp,
                                            self.tar_tensor: test_tv[:, np.array([tidx])],
                                            self.idropout: 0,
                                            self.dropout: 0,
                                            self.sign_weight: self.sign_ratio[tidx]
                                        })
                        early_stop_code = tidx2early[tidx].validate(loss)
                        if early_stop_code == 2:
                            print_and_log("+++ for tidx=%d +++" % tidx, self.log_file)
                            print_and_log("...Terminating training by early stopping...",
                                          self.log_file)
                            print_and_log("epoch=%d, loss=%.5f" % (epoch, loss),
                                          self.log_file)
                            print_and_log("", self.log_file)
                            is_last_array[i] = True
                        if epoch == self.max_epoch:
                            print_and_log("+++ for tidx=%d +++" % tidx, self.log_file)
                            print_and_log("...Terminating training because the max epoch was reached...",
                                          self.log_file)
                            print_and_log("epoch=%d, loss=%.5f" % (epoch, loss),
                                          self.log_file)
                            print_and_log("", self.log_file)
                            is_last_array[i] = True
                    # Validating
                    if is_last_array[i]:
                        if self.do_save:
                            saver.save(sess,
                                       self.save_dir + self.model_name + '/model_%d' % tidx)
                        tars, scores = self.internal_validate(
                            sess, train_data, test_idx_array, tidx, 10000)
                        if save_path is None:
                            tar_array = np.concatenate((tar_array, tars))
                            score_array = np.concatenate((score_array, scores))
                        else:
                            roc, pr = get_auc_from_array(tars, scores)
                            roc_array.append(roc)
                            pr_array.append(pr)
            if np.all(is_last_array):
                break
        if save_path is None:
            roc_auc, pr_auc = get_auc_from_array(tar_array, score_array)
        else:
            roc_array = np.array(roc_array)
            pr_array = np.array(pr_array)
            np.save(save_path + 'roc_auc.npy', roc_array)
            np.save(save_path + 'pr_auc.npy', pr_array)
            roc_auc = np.mean(roc_array)
            pr_auc = np.mean(pr_array)
        print_and_log("+++ test_set +++", self.log_file)
        # Loss and recall are not computed in this mode; zeros are placeholders.
        print_and_log(
            "loss=%.5f, recall_top_%d=%.5f, ROC_AUC=%.5f, PR_AUC=%.5f"
            % (0, self.top_k, 0, roc_auc, pr_auc), self.log_file)
        print_and_log("", self.log_file)
        print_and_log("...Validation finished...", self.log_file)
        with os.popen("date") as pop_file:
            print_and_log(pop_file.readline(), self.log_file)
        return 0, 0, roc_auc, pr_auc
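# For reference, the batch_gen interface assumed throughout this file: an
# endless generator that yields (batch_fp, batch_tv, is_last) triples, where
# is_last flags the final batch of an epoch. A minimal sketch under that
# assumption (the real implementation also supports known_only filtering and
# the use_all option, omitted here):
#
#     def batch_gen_sketch(fps, tvs, idx_array, batch_size, tidx_array):
#         while True:                                   # cycle over epochs
#             order = np.random.permutation(idx_array)  # reshuffle each epoch
#             for start in range(0, len(order), batch_size):
#                 batch_idx = order[start:start + batch_size]
#                 is_last = start + batch_size >= len(order)
#                 yield fps[batch_idx], tvs[batch_idx][:, tidx_array], is_last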
def test_target(self, test_data, idx_array=None, tidx_array=None, save_path=None):
    if idx_array is None:
        idx_array = np.array(range(len(test_data[0])))
    if tidx_array is None:
        tidx_array = np.array(range(test_data[1].shape[1]))
    if not self.check_train:
        print_and_log("This model is not trained yet", self.log_file)
        return 0
    if self.weight is not None:
        compensate = 1 / self.weight
    else:
        compensate = np.ones(test_data[1].shape[1])
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        print_and_log("...Start testing...", self.log_file)
        # Restore the checkpoint written by train().
        if self.param_num is None:
            saver.restore(sess, self.save_dir + self.model_name + '/model')
        else:
            saver.restore(sess,
                          self.save_dir + self.model_name +
                          '/model_for_param_%d' % self.param_num)
        batches = batch_gen(test_data[0], test_data[1], idx_array,
                            self.batch_size, tidx_array, use_all=True)
        # Empty (0, output_space) buffers for accumulating targets and scores.
        tar_con = np.empty((0, self.output_space))
        cls_con = np.empty((0, self.output_space))
        loss = 0.0
        count = 0
        while True:
            batch_fp, batch_tv, is_last = next(batches)
            batch_cls, batch_loss = sess.run(
                [self.cls_tensor, self.cls_loss],
                feed_dict={
                    self.fp_tensor: batch_fp,
                    self.tar_tensor: batch_tv,
                    self.idropout: 0,
                    self.dropout: 0,
                    self.sign_weight: self.sign_ratio,
                    self.compensate: compensate[tidx_array]
                })
            tar_con = np.concatenate((tar_con, batch_tv), axis=0)
            cls_con = np.concatenate((cls_con, batch_cls), axis=0)
            loss += batch_loss
            count += 1
            if is_last:
                break
        print_and_log("...Calculation finished...", self.log_file)
        with os.popen("date") as pop_file:
            print_and_log(pop_file.readline(), self.log_file)
        loss /= count  # average loss over batches
        recall, _, _ = get_recall(tar_con, cls_con, self.top_k)
        roc_array, pr_array = get_auc_per_col(tar_con, cls_con,
                                              zero_is_ambiguous=True)
        if save_path is not None:
            np.save(save_path + 'roc_auc.npy', roc_array)
            np.save(save_path + 'pr_auc.npy', pr_array)
        roc_auc = np.mean(roc_array)
        pr_auc = np.mean(pr_array)
        print_and_log("+++ test_set +++", self.log_file)
        print_and_log(
            "loss=%.5f, recall_top_%d=%.5f, ROC_AUC=%.5f, PR_AUC=%.5f"
            % (loss, self.top_k, recall, roc_auc, pr_auc), self.log_file)
        print_and_log("", self.log_file)
        print_and_log("...Validation finished...", self.log_file)
        with os.popen("date") as pop_file:
            print_and_log(pop_file.readline(), self.log_file)
        return loss, recall, roc_auc, pr_auc
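# set_aside_test(idx_array, validation_frac) is assumed to carve a held-out
# validation split off idx_array and return (test_idx, train_idx), matching
# the unpacking used in the training methods. A minimal sketch under that
# assumption:
#
#     def set_aside_test_sketch(idx_array, validation_frac):
#         shuffled = np.random.permutation(idx_array)
#         n_test = int(len(shuffled) * validation_frac)
#         return shuffled[:n_test], shuffled[n_test:]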
def train_and_test(self, train_data, idx_array=None, test_idx_array=None,
                   tidx_array=None, save_path=None):
    # Simultaneous learning and testing: each target is trained to completion,
    # then immediately evaluated on test_idx_array.
    if idx_array is None:
        idx_array = np.array(range(len(train_data[0])))
    if test_idx_array is None:
        test_idx_array = np.array(range(len(train_data[0])))
    if tidx_array is None:
        tidx_array = np.array(range(train_data[1].shape[1]))
    if self.sign_bal:
        if self.unknown_val is None:
            num_pos = np.sum(np.maximum(train_data[1][idx_array], 0), axis=0)
            num_neg = np.sum(np.abs(train_data[1][idx_array]), axis=0) - num_pos
        elif self.unknown_val == 0:
            num_pos = np.sum(np.maximum(train_data[1][idx_array], 0), axis=0)
            num_neg = len(idx_array) - num_pos
        else:
            print_and_log(
                "Error: Unknown data must be ignored (None) or treated as negative (0) for sign balancing",
                self.log_file)
            return None
        # numpy division by zero does not raise, so check explicitly.
        if np.any(num_pos == 0) or np.any(num_neg == 0):
            print_and_log(
                "Error: Each target needs at least one positive and one negative",
                self.log_file)
            return None
        self.sign_ratio = num_neg / num_pos
    else:
        self.sign_ratio = np.ones(train_data[1].shape[1], dtype=np.float32)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        ## Initializing
        print_and_log("...Start initializing...", self.log_file)
        init = tf.global_variables_initializer()
        test_idx, train_idx = set_aside_test(idx_array, self.validation_frac)
        tidx2batches = {}
        for tidx in tidx_array:
            if self.unknown_val is None:
                batches = batch_gen(train_data[0], train_data[1], train_idx,
                                    self.batch_size, np.array([tidx]),
                                    known_only=True, use_all=self.use_all)
            else:
                batches = batch_gen(train_data[0], train_data[1], train_idx,
                                    self.batch_size, np.array([tidx]),
                                    known_only=False, use_all=self.use_all)
            tidx2batches[tidx] = batches
        test_fp = train_data[0][test_idx]
        test_tv = train_data[1][test_idx]
        ## Training and Testing
        print_and_log("...Start training...", self.log_file)
        rev = False  # loss is monitored, so lower is better
        if save_path is None:
            tar_array = np.array([], dtype=np.int32)
            score_array = np.array([], dtype=np.float32)
        else:
            roc_array = []
            pr_array = []
        for i, tidx in enumerate(tidx_array):
            batches = tidx2batches[tidx]
            early_stopper = EarlyStopping(patience=self.patience, reverse=rev)
            print_and_log("+++ for tidx=%d +++" % tidx, self.log_file)
            for e in range(1, self.max_epoch + 1):
                if e == 1:
                    # Re-initialize all variables before training each target.
                    sess.run(init)
                while True:
                    batch_fp, batch_tv, is_last = next(batches)
                    sess.run(self.trainer,
                             feed_dict={
                                 self.fp_tensor: batch_fp,
                                 self.tar_tensor: batch_tv,
                                 self.sign_weight: self.sign_ratio[tidx]
                             })
                    if is_last:
                        break
                # Testing
                if e >= self.min_epoch:
                    loss = sess.run(self.cls_loss,
                                    feed_dict={
                                        self.fp_tensor: test_fp,
                                        self.tar_tensor: test_tv[:, np.array([tidx])],
                                        self.idropout: 0,
                                        self.dropout: 0,
                                        self.sign_weight: self.sign_ratio[tidx]
                                    })
                    early_stop_code = early_stopper.validate(loss)
                    # Codes 0 (new best) and 1 (within patience) need no action here.
                    if early_stop_code == 2:
                        print_and_log("...Terminating training by early stopping...",
                                      self.log_file)
                        print_and_log("epoch=%d, loss=%.5f" % (e, loss), self.log_file)
                        print_and_log("", self.log_file)
                        break
                if e == self.max_epoch:
                    print_and_log("...Terminating training because the max epoch was reached...",
                                  self.log_file)
                    print_and_log("epoch=%d, loss=%.5f" % (e, loss), self.log_file)
                    print_and_log("", self.log_file)
            if self.do_save:
                saver.save(sess, self.save_dir + self.model_name + '/model_%d' % tidx)
            # Validating
            print_and_log("...Start testing...", self.log_file)
            test_batch_size = 10000
            batches_test = batch_gen(train_data[0], train_data[1],
                                     test_idx_array, test_batch_size,
                                     np.array([tidx]), False, True)
            tars = np.array([], dtype=np.int32)
            scores = np.array([], dtype=np.float32)
            while True:
                batch_fp, batch_tar, is_last = next(batches_test)
                # Both dropout layers are disabled for evaluation.
                batch_cls = sess.run(self.cls_tensor,
                                     feed_dict={
                                         self.fp_tensor: batch_fp,
                                         self.tar_tensor: batch_tar,
                                         self.idropout: 0,
                                         self.dropout: 0
                                     })
                # Keep only known labels; 0 marks unknown/ambiguous entries.
                clear_point = np.where(batch_tar != 0)
                tars = np.concatenate((tars, batch_tar[clear_point]))
                scores = np.concatenate((scores, batch_cls[clear_point]))
                if is_last:
                    break
            if save_path is None:
                tar_array = np.concatenate((tar_array, tars))
                score_array = np.concatenate((score_array, scores))
            else:
                roc, pr = get_auc_from_array(tars, scores)
                roc_array.append(roc)
                pr_array.append(pr)
        print_and_log("...Calculation finished...", self.log_file)
        if save_path is None:
            roc_auc, pr_auc = get_auc_from_array(tar_array, score_array)
        else:
            roc_array = np.array(roc_array)
            pr_array = np.array(pr_array)
            np.save(save_path + 'roc_auc.npy', roc_array)
            np.save(save_path + 'pr_auc.npy', pr_array)
            roc_auc = np.mean(roc_array)
            pr_auc = np.mean(pr_array)
        print_and_log("+++ test_set +++", self.log_file)
        # Loss and recall are not computed in this mode; zeros are placeholders.
        print_and_log(
            "loss=%.5f, recall_top_%d=%.5f, ROC_AUC=%.5f, PR_AUC=%.5f"
            % (0, self.top_k, 0, roc_auc, pr_auc), self.log_file)
        print_and_log("", self.log_file)
        print_and_log("...Validation finished...", self.log_file)
        with os.popen("date") as pop_file:
            print_and_log(pop_file.readline(), self.log_file)
        return 0, 0, roc_auc, pr_auc
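# Sign balancing in brief: sign_ratio = num_neg / num_pos is fed as
# sign_weight so that each target's positives are up-weighted to match its
# negatives. Worked numbers for a two-target +1/-1 label matrix (illustrative):
#
#     tv = np.array([[ 1, -1],
#                    [ 1, -1],
#                    [ 1,  1],
#                    [-1,  1]])
#     num_pos = np.sum(np.maximum(tv, 0), axis=0)      # [3, 2]
#     num_neg = np.sum(np.abs(tv), axis=0) - num_pos   # [1, 2]
#     sign_ratio = num_neg / num_pos                   # [0.333..., 1.0]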
def train(self, train_data, idx_array=None, tidx_array=None):
    if idx_array is None:
        idx_array = np.array(range(len(train_data[0])))
    if tidx_array is None:
        tidx_array = np.array(range(train_data[1].shape[1]))
    if self.weight is not None:
        compensate = 1 / self.weight
    else:
        compensate = np.ones(train_data[1].shape[1])
    if self.sign_bal:
        if self.unknown_val is None:
            num_pos = np.sum(np.maximum(train_data[1][idx_array][:, tidx_array], 0),
                             axis=0)
            num_neg = np.sum(np.abs(train_data[1][idx_array][:, tidx_array]),
                             axis=0) - num_pos
        elif self.unknown_val == 0:
            num_pos = np.sum(np.maximum(train_data[1][idx_array][:, tidx_array], 0),
                             axis=0)
            num_neg = len(idx_array) - num_pos
        else:
            print_and_log(
                "Error: Unknown data must be ignored (None) or treated as negative (0) for sign balancing",
                self.log_file)
            return None
        # numpy division by zero does not raise, so check explicitly.
        if np.any(num_pos == 0) or np.any(num_neg == 0):
            print_and_log(
                "Error: Each target needs at least one positive and one negative",
                self.log_file)
            return None
        self.sign_ratio = num_neg / num_pos
    else:
        self.sign_ratio = np.ones(len(tidx_array), dtype=np.float32)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        ## Initializing
        print_and_log("...Start initializing...", self.log_file)
        init = tf.global_variables_initializer()
        sess.run(init)
        test_idx, train_idx = set_aside_test(idx_array, self.validation_frac)
        batches = batch_gen(train_data[0], train_data[1], train_idx,
                            self.batch_size, tidx_array, use_all=self.use_all)
        test_fp = train_data[0][test_idx]
        test_tv = train_data[1][test_idx][:, tidx_array]
        ## Training
        print_and_log("...Start training...", self.log_file)
        # recall/ROC/PR improve upward, so the early stopper is reversed for them.
        if self.standard in ['recall', 'roc', 'pr']:
            rev = True
        else:
            rev = False
        early_stopper = EarlyStopping(patience=self.patience, reverse=rev)
        for e in range(1, self.max_epoch + 1):
            while True:
                batch_fp, batch_tv, is_last = next(batches)
                sess.run(self.trainer,
                         feed_dict={
                             self.fp_tensor: batch_fp,
                             self.tar_tensor: batch_tv,
                             self.sign_weight: self.sign_ratio,
                             self.compensate: compensate[tidx_array]
                         })
                if is_last:
                    break
            # Testing
            test_cls, loss = sess.run(
                [self.cls_tensor, self.cls_loss],
                feed_dict={
                    self.fp_tensor: test_fp,
                    self.tar_tensor: test_tv,
                    self.idropout: 0,
                    self.dropout: 0,
                    self.sign_weight: self.sign_ratio,
                    self.compensate: compensate[tidx_array]
                })
            recall, _, _ = get_recall(test_tv, test_cls, self.top_k)
            roc_auc, pr_auc = get_auc_from_2d(test_tv, test_cls,
                                              zero_is_ambiguous=True)
            print_and_log("+++ epoch=%i +++" % e, self.log_file)
            print_and_log(
                "loss=%.5f, recall_top_%d=%.5f, ROC_AUC=%.5f, PR_AUC=%.5f"
                % (loss, self.top_k, recall, roc_auc, pr_auc), self.log_file)
            # Validation & Saving
            if e >= self.min_epoch:
                if self.standard == 'recall':
                    value = recall
                elif self.standard == 'roc':
                    value = roc_auc
                elif self.standard == 'pr':
                    value = pr_auc
                else:
                    value = loss
                early_stop_code = early_stopper.validate(value)
                if early_stop_code == 0:
                    # New best value: checkpoint the current state.
                    print_and_log("...Saving current state...", self.log_file)
                    if self.param_num is None:
                        saver.save(sess, self.save_dir + self.model_name + '/model')
                    else:
                        saver.save(sess,
                                   self.save_dir + self.model_name +
                                   '/model_for_param_%d' % self.param_num)
                    self.check_train = True
                # Code 1 (within patience) needs no action.
                if early_stop_code == 2:
                    print_and_log("...Terminating training by early stopping...",
                                  self.log_file)
                    print_and_log("", self.log_file)
                    break
            if e == self.max_epoch:
                print_and_log("...Terminating training because the max epoch was reached...",
                              self.log_file)
                print_and_log("", self.log_file)
        with os.popen("date") as pop_file:
            print_and_log(pop_file.readline(), self.log_file)
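# The EarlyStopping helper used above is assumed to track the best value seen
# and return 0 for a new best (the caller checkpoints), 1 for a non-improving
# step within patience, and 2 once patience is exhausted; reverse=True flips
# the comparison for higher-is-better metrics (recall/ROC/PR). A minimal
# sketch consistent with that usage:
#
#     class EarlyStoppingSketch:
#         def __init__(self, patience, reverse=False):
#             self.patience = patience
#             self.reverse = reverse
#             self.best = None
#             self.fails = 0
#
#         def validate(self, value):
#             improved = self.best is None or (
#                 value > self.best if self.reverse else value < self.best)
#             if improved:
#                 self.best, self.fails = value, 0
#                 return 0
#             self.fails += 1
#             return 2 if self.fails > self.patience else 1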