def lr_solver(train_data, train_label, validation, test, unlabel, feature_extract, feature_handler):
    """Train a logistic regression classifier and report its validation AUC."""
    logging.info('begin to train the lr classifier')
    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]
    train_data, validation, test, unlabel = feature_extract(train_data, train_label, validation, test, unlabel)
    # print(new_train_data.shape)
    train_data, validation, test, unlabel = feature_handler(train_data, validation, test, unlabel)
    """
    lr = LogisticRegression()
    params_test = {"penalty": ['l1', 'l2'],
                   "C": [0.1, 0.2, 0.3, 0.5, 0.7, 1, 3, 5],
                   "tol": [0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.5],
                   "random_state": [1000000007]}
    rand_search_result = GridSearchCV(lr, param_grid=params_test, n_jobs=3, cv=3, scoring='roc_auc')
    rand_search_result.fit(train_data, train_label)
    params = evaluate.report(rand_search_result.grid_scores_)
    print(params)
    """
    print(train_data.shape[1])
    params = {'penalty': 'l1', 'C': 0.1, 'random_state': 1000000007, 'tol': 0.001, 'warm_start': True}
    lr = LogisticRegression(**params)
    lr.fit(train_data, train_label)
    joblib.dump(lr, ROOT + '/result/lr.pkl')
    evaluate.get_auc(lr.predict_proba(validation)[:, 1])
    return lr.predict_proba(train_data)[:, 1]
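# `feature_extract` and `feature_handler` are expected to be callables that
# transform all of the splits consistently (the repo passes helpers such as
# decomposition.gbdt_dimreduce_threshold elsewhere). A minimal sketch of the
# assumed interface, using hypothetical stand-ins rather than the project's
# own helpers:
from sklearn.preprocessing import StandardScaler

def identity_extract(train_data, train_label, validation, test, unlabel):
    # Hypothetical feature_extract: keep every feature unchanged.
    return train_data, validation, test, unlabel

def scale_handler(train_data, validation, test, unlabel):
    # Hypothetical feature_handler: fit a scaler on the training split and
    # apply the same transform to the other splits.
    scaler = StandardScaler().fit(train_data)
    return (scaler.transform(train_data), scaler.transform(validation),
            scaler.transform(test), scaler.transform(unlabel))

# Example call (assuming the data splits are already loaded):
# scores = lr_solver(train, label, validation, test, unlabel,
#                    identity_extract, scale_handler)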
def gbdt_solver(train_data, train_label, validation, test, unlabel, dimreduce=decomposition.undo):
    """Train a gradient boosting classifier and return its predicted probabilities on the test set."""
    # train_data = train_data[:100,:]
    # train_label = train_label[:100]
    logging.info("begin to train the gbdt classifier")
    new_train_data, new_val, new_test, new_unlabel = dimreduce(train_data, train_label, validation, test, unlabel)
    logging.info("finished feature extracting")
    """
    gb = GradientBoostingClassifier()
    params_gbdt = {"n_estimators": [100, 200, 500, 1000],
                   "learning_rate": [0.02, 0.03, 0.05, 0.1],
                   "max_depth": [3, 5, 7, 9],
                   "random_state": [1000000007]}
    """
    # rand_search_result = GridSearchCV(gb, param_grid=params_gbdt, n_jobs=3, cv=3, scoring='roc_auc')
    # rand_search_result = RandomizedSearchCV(gb, param_distributions=params_gbdt, n_jobs=3, cv=3, n_iter=100, scoring='roc_auc')
    # rand_search_result.fit(new_train_data, train_label)
    # params = tools.report(rand_search_result.grid_scores_)
    params = {
        "n_estimators": 600,
        "learning_rate": 0.03,
        "random_state": 1000000007,
        "max_depth": 2,
        "warm_start": True,
    }
    gb = GradientBoostingClassifier(**params)
    gb.fit(new_train_data, train_label)
    joblib.dump(gb, ROOT + "/result/gbdt.pkl")
    evaluate.get_auc(gb.predict_proba(new_val)[:, 1])
    return gb.predict_proba(new_test)[:, 1]
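# The hyperparameter search kept in the comments above reads `grid_scores_`,
# which newer scikit-learn releases removed. A minimal sketch of the same
# search against the current API (a hypothetical `tune_gbdt` helper; only
# `best_params_` / `cv_results_` replace the deprecated attribute):
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

def tune_gbdt(new_train_data, train_label):
    # Same grid as the commented-out search in gbdt_solver.
    params_gbdt = {"n_estimators": [100, 200, 500, 1000],
                   "learning_rate": [0.02, 0.03, 0.05, 0.1],
                   "max_depth": [3, 5, 7, 9],
                   "random_state": [1000000007]}
    search = RandomizedSearchCV(GradientBoostingClassifier(),
                                param_distributions=params_gbdt,
                                n_iter=20, cv=3, n_jobs=3, scoring='roc_auc')
    search.fit(new_train_data, train_label)
    print(search.best_params_)  # replaces tools.report(search.grid_scores_)
    return search.best_params_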
def ssl_solver(train, label, validation, test, unlabel, dimreduce, classifier=LabelSpreading):
    """Train a graph-based semi-supervised classifier (LabelSpreading by default) on labeled plus unlabeled data."""
    train, validation, test, unlabel = dimreduce(train, label, validation, test, unlabel)
    data = np.vstack([train, unlabel])
    label = np.hstack([label, [-1] * unlabel.shape[0]])
    assert data.shape[0] == len(label)
    cf = classifier(kernel='knn', n_neighbors=100, max_iter=3)
    # cf = classifier(kernel='rbf', gamma=0.3, max_iter=3)
    cf.fit(data, label)
    evaluate.get_auc(cf.predict_proba(validation)[:, 1])
    return cf.predict_proba(test)[:, 1]
def nb_solver(train_data, train_label, validation, test, classifier, dimreduce, convertbinary):
    """Train a naive Bayes classifier and return its predicted probabilities on the test set."""
    logging.info('begin to train the naive bayes classifier')
    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]
    train_data, validation, test = dimreduce(train_data, train_label, validation, test)
    # print(new_train_data.shape)
    train_data, validation, test = convertbinary(train_data, validation, test)
    nb = classifier()
    nb.fit(train_data, train_label)
    evaluate.get_auc(nb.predict_proba(validation)[:, 1])
    return nb.predict_proba(test)[:, 1]
def rf_solver(train_data, train_label, validation, test, unlabel, feature_extract, feature_handler):
    """Train a random forest classifier and report its validation AUC."""
    logging.info("begin to train the random forest classifier")
    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]
    train_data, validation, test, unlabel = feature_extract(train_data, train_label, validation, test, unlabel)
    # print(new_train_data.shape)
    train_data, validation, test, unlabel = feature_handler(train_data, validation, test, unlabel)
    rf = RandomForestClassifier(warm_start=True, n_jobs=2, n_estimators=2000,
                                max_depth=3, min_samples_split=50)
    rf.fit(train_data, train_label)
    # joblib.dump(rf, ROOT + '/result/rf.pkl')
    evaluate.get_auc(rf.predict_proba(validation)[:, 1])
    return rf.predict_proba(train_data)[:, 1]
def sgd_solver(train_data, train_label, validation, test, unlabel, feature_extract, feature_handler):
    """Train an SGD classifier and report its validation AUC."""
    logging.info('begin to train the sgd classifier')
    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]
    train_data, validation, test, unlabel = feature_extract(train_data, train_label, validation, test, unlabel)
    # print(new_train_data.shape)
    train_data, validation, test, unlabel = feature_handler(train_data, validation, test, unlabel)
    sgd = SGDClassifier(loss='modified_huber', alpha=0.0001, average=False, class_weight=None,
                        epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                        learning_rate='optimal', n_iter=5, n_jobs=2, penalty='l2',
                        power_t=0.5, random_state=1000000007, shuffle=True,
                        verbose=0, warm_start=True)
    sgd.fit(train_data, train_label)
    joblib.dump(sgd, ROOT + '/result/sgd.pkl')
    evaluate.get_auc(sgd.predict_proba(validation)[:, 1])
    return sgd.predict_proba(train_data)[:, 1]
def s3vm_solver(train_data, train_label, validation, test, unlabel, feature_extract, feature_handler):
    """Train a semi-supervised SVM (TSVM) on labeled plus unlabeled data and report its validation AUC."""
    logging.info('begin to train the s3vm classifier')
    # unlabel = unlabel[:100,:]
    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]
    train_data, validation, test, unlabel = feature_extract(train_data, train_label, validation, test, unlabel)
    # print(new_train_data.shape)
    train_data, validation, test, unlabel = feature_handler(train_data, validation, test, unlabel)
    data = np.vstack([train_data, unlabel])
    label = np.hstack([train_label, [-1] * unlabel.shape[0]])
    assert data.shape[0] == len(label)
    s3vm = methods.scikitTSVM.SKTSVM(kernel='linear')
    s3vm.fit(data, label)
    evaluate.get_auc(s3vm.predict_proba(validation)[:, 1])
    return s3vm.predict_proba(train_data)[:, 1]
def test_with_patch_image(ugan):
    """Score test patches with the trained model, report ROC AUC, and dump per-image scores to CSV."""
    ugan.get_test_data()
    img_name, y_true, res_loss, dis_loss, y_score = ugan.test(FLAGS, True)
    print('[*] testing ...')
    roc_auc = get_auc(y_true, y_score, True)
    print("ROC curve area: %.4f" % roc_auc)
    for idx in range(np.shape(y_true)[0]):
        print("image name: [%s] anomaly score: %.2f, actual label: %d, generator loss: %.2f, discriminator loss: %.2f"
              % (str(re.split('/|[.]|\\\\', img_name[idx])[-2]),
                 y_score[idx], y_true[idx], res_loss[idx], dis_loss[idx]))
    test_res = list(zip(y_score, y_true))
    np.savetxt("score" + str(FLAGS.patch_size) + ".csv", test_res, header="score,label", delimiter=",")
def cotraining(model_one, model_two, n_iter=100):
    """Co-training: two models alternately pseudo-label the unlabeled pool for each other."""
    data, train_number, val_number, test_number, unlabel_number, label, uid = datahandler.clean_data()
    train = data[:train_number, :]
    validation = data[train_number:train_number + val_number, :]
    test = data[train_number + val_number:-unlabel_number, :]
    unlabel = data[-unlabel_number:, :]
    train, validation, test, unlabel = decomposition.gbdt_dimreduce_threshold(train, label, validation, test, unlabel)
    # train, validation, test, unlabel = split.split_continuum_value_tvt(train, validation, test, unlabel)
    # train_number = 100
    # unlabel_number = 1000
    # train = train[:100,:]
    # unlabel = unlabel[:1000,:]
    # label = label[:100]
    train_one = copy.deepcopy(train)
    label_one = copy.deepcopy(label)
    train_two = copy.deepcopy(train)
    label_two = copy.deepcopy(label)
    model_one.fit(train_one, label_one)
    model_two.fit(train_two, label_two)
    for iter in xrange(1, n_iter + 1, 1):
        logging.info('#%d iter for co-training:' % iter)
        unlabel_label = [-1] * unlabel_number
        unlabel_index = range(0, unlabel_number)
        step = 0
        while len(unlabel_index) > 0:
            step += 1
            logging.info('co-training step #%d , remaining unlabel: %d' % (step, len(unlabel_index)))
            model_one, model_two, unlabel_label, unlabel_index, train_two, label_two = training(
                model_one, model_two, unlabel, unlabel_label, unlabel_index, train_two, label_two)
            model_two, model_one, unlabel_label, unlabel_index, train_one, label_one = training(
                model_two, model_one, unlabel, unlabel_label, unlabel_index, train_one, label_one)
            evaluate.get_auc(model_one.predict_proba(validation)[:, 1])
            evaluate.get_auc(model_two.predict_proba(validation)[:, 1])
            evaluate.get_auc((model_one.predict_proba(validation)[:, 1] + model_two.predict_proba(validation)[:, 1]) / 2.0)
            joblib.dump(model_one, ROOT + '/result/model/model_one_%d_%d.pkl' % (iter, step))
            joblib.dump(model_two, ROOT + '/result/model/model_two_%d_%d.pkl' % (iter, step))
            evaluate.output(uid, (model_one.predict_proba(test)[:, 1] + model_two.predict_proba(test)[:, 1]) / 2.0,
                            ROOT + '/result/predict/cotraining_%d_%d.csv' % (iter, step))
            evaluate.output(uid, model_one.predict_proba(test)[:, 1],
                            ROOT + '/result/predict/model_one_%d_%d.csv' % (iter, step))
            evaluate.output(uid, model_two.predict_proba(test)[:, 1],
                            ROOT + '/result/predict/model_two_%d_%d.csv' % (iter, step))
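# The co-training loop above delegates each pseudo-labeling step to a
# `training` helper defined elsewhere in the repo and not shown in this
# section. A minimal sketch of what such a step could look like, assuming the
# teacher's most confident predictions are pseudo-labeled and appended to the
# student's training set (the names `teacher`, `student`, and `top_k` are
# illustrative, not the project's):
def training(teacher, student, unlabel, unlabel_label, unlabel_index,
             student_train, student_label, top_k=100):
    # Score the still-unlabeled rows with the teacher model.
    probs = teacher.predict_proba(unlabel[unlabel_index])[:, 1]
    # Take the rows whose predictions are furthest from 0.5 (most confident).
    order = np.argsort(-np.abs(probs - 0.5))[:top_k]

    taken = []
    for i in order:
        idx = unlabel_index[i]
        pseudo = int(probs[i] >= 0.5)
        unlabel_label[idx] = pseudo
        student_train = np.vstack([student_train, unlabel[idx:idx + 1]])
        student_label = np.hstack([student_label, [pseudo]])
        taken.append(idx)

    # Refit the student on its enlarged training set and shrink the pool.
    student.fit(student_train, student_label)
    taken_set = set(taken)
    unlabel_index = [idx for idx in unlabel_index if idx not in taken_set]
    return teacher, student, unlabel_label, unlabel_index, student_train, student_label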
def test_with_patch_image(gaid):
    """Score test patches with the trained model, report ROC AUC, and dump per-image scores to CSV."""
    img_name, y_true, y_score = gaid.test(FLAGS, True)
    print('[*] testing ...')
    roc_auc = get_auc(y_true, y_score, True)
    print("ROC curve area: %.4f" % roc_auc)
    for idx in range(np.shape(y_true)[0]):
        print("image name: [%s] anomaly score: %.2f, actual label: %d"
              % (str(re.split('/|[.]|\\\\', img_name[idx])[-2]), y_score[idx], y_true[idx]))
    test_res = list(zip(y_score, y_true))
    np.savetxt("score-" + str(FLAGS.test_dir) + "-" + str(FLAGS.patch_size) + ".csv",
               test_res, header="score,label", delimiter=",")
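# Both patch-level test helpers above (and the training loop below) rely on a
# get_auc(y_true, y_score, plot) utility that is not shown in this section.
# A minimal sketch of what such a helper might look like, built on
# scikit-learn's ROC utilities; the plotting branch is an assumption about
# what the third argument controls:
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve

def get_auc(y_true, y_score, plot=False):
    # Compute the ROC AUC of the anomaly scores; optionally save the curve.
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    if plot:
        plt.plot(fpr, tpr, label="ROC (area = %.4f)" % roc_auc)
        plt.plot([0, 1], [0, 1], linestyle="--")
        plt.xlabel("False positive rate")
        plt.ylabel("True positive rate")
        plt.legend(loc="lower right")
        plt.savefig("roc_curve.png")
    return roc_auc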
def train(self, config):
    print(" [*] train model ...")

    # output log
    if os.path.exists("output_log.txt"):
        os.remove("output_log.txt")
    logging.basicConfig(filename='output_log.txt', level=logging.INFO)

    # Optimizers
    d_optim = tf.train.AdamOptimizer(config.d_learning_rate, beta1=config.beta) \
        .minimize(self.d_loss, var_list=self.d_vars)
    g_optim = tf.train.AdamOptimizer(config.g_learning_rate, beta1=config.beta) \
        .minimize(self.g_loss, var_list=self.g_vars)

    # Initialize global variables
    try:
        tf.global_variables_initializer().run()
    except:
        tf.initialize_all_variables().run()

    # Merge summaries
    self.g_sum = merge_summary([self.d__sum, self.g_sum, self.g_loss_real_sum,
                                self.g_loss_rec_sum, self.g_loss_sum, self.d_loss_fake_sum])
    self.d_sum = merge_summary([self.d_sum, self.d_loss_sum, self.d_loss_real_sum])
    self.writer = SummaryWriter("./logs", self.sess.graph)

    # Load data sample
    sample_files = self.data[0:self.sample_num]
    sample = [get_image(sample_file,
                        input_height=self.input_height,
                        input_width=self.input_width,
                        resize_height=self.output_height,
                        resize_width=self.output_width,
                        crop=self.crop,
                        grayscale=self.grayscale) for sample_file in sample_files]
    if self.grayscale:
        sample_inputs = np.array(sample).astype(np.float32)[:, :, :, None]
    else:
        sample_inputs = np.array(sample).astype(np.float32)

    # Load checkpoint
    could_load, checkpoint_counter = self.load(self.checkpoint_dir)
    if could_load:
        counter = checkpoint_counter
        print(" [*] Load SUCCESS")
    else:
        print(" [!] Load failed...")
        counter = 1

    start_run_time = time.time()
    total_batch_time = 0
    best_auc = 0.0
    sample_save = True

    for epoch in xrange(config.epoch):
        self.data = glob(os.path.join(config.data_dir, config.dataset, config.train_dir, self.input_fname_pattern))
        np.random.shuffle(self.data)
        batch_idxs = min(len(self.data), config.train_size) // config.batch_size

        for idx in xrange(0, batch_idxs):
            start_batch_time = time.time()
            batch_files = self.data[idx * config.batch_size:(idx + 1) * config.batch_size]
            batch = [get_image(batch_file,
                               input_height=self.input_height,
                               input_width=self.input_width,
                               resize_height=self.output_height,
                               resize_width=self.output_width,
                               crop=self.crop,
                               grayscale=self.grayscale) for batch_file in batch_files]
            if self.grayscale:
                batch_images = np.array(batch).astype(np.float32)[:, :, :, None]
            else:
                batch_images = np.array(batch).astype(np.float32)

            # Update D network
            _, summary_str = self.sess.run([d_optim, self.d_sum], feed_dict={self.inputs: batch_images})
            self.writer.add_summary(summary_str, counter)

            # Update G network
            _, summary_str = self.sess.run([g_optim, self.g_sum], feed_dict={self.inputs: batch_images})
            self.writer.add_summary(summary_str, counter)

            errD = self.d_loss.eval({self.inputs: batch_images})
            errG = self.g_loss.eval({self.inputs: batch_images})

            counter += 1
            end_batch_time = time.time()
            time_batch = (end_batch_time - start_batch_time) * 1000
            total_batch_time += time_batch
            hours, rem = divmod(end_batch_time - start_run_time, 3600)
            minutes, seconds = divmod(rem, 60)
            print("Epoch: [%2d/%2d] [%4d/%4d] time: %02d:%02d:%02d , G (Reconstructor) loss: %.8f, "
                  "D (Representation matching) loss: %.8f , Avg Run Time (ms/batch): %.8f , (it/s): %.8f"
                  % (epoch + 1, config.epoch, idx + 1, batch_idxs, int(hours), int(minutes), seconds,
                     errD, errG, total_batch_time / counter, counter / (total_batch_time / 1000)))

            if np.mod(counter, batch_idxs * 2) == 1:
                try:
                    samples, d_loss, g_loss = self.sess.run(
                        [self.sampler, self.d_loss, self.g_loss],
                        feed_dict={self.inputs: sample_inputs})
                    save_images(samples, image_manifold_size(samples.shape[0]),
                                './{}/train_{:02d}_{:04d}.png'.format(config.sample_dir, epoch, idx))
                    if sample_save:
                        sample_save = False
                        save_images(sample_inputs, image_manifold_size(sample_inputs.shape[0]),
                                    './{}/train_sample_inputs_{:02d}_{:04d}.png'.format(config.sample_dir, epoch, idx))
                    print("[Sample] D (Representation matching) loss: %.8f, G (Reconstructor) loss: %.8f" % (d_loss, g_loss))
                except:
                    print("one pic error!...")

        # test at the end of each epoch; keep the checkpoint with the best AUC
        _, y_true, y_score = self.test(config)
        roc_auc = get_auc(y_true, y_score)
        if best_auc < roc_auc:
            best_auc = roc_auc
            self.save(config.checkpoint_dir, epoch)
        logging.info("Epoch: [%2d/%2d] , AUC: %.8f, Best AUC: %.8f, Avg run time: %.8f"
                     % (epoch + 1, config.epoch, roc_auc, best_auc, total_batch_time / counter))
        print("Epoch: [%2d/%2d], AUC: %.8f, Best AUC: %.8f, Avg run time: %.8f"
              % (epoch + 1, config.epoch, roc_auc, best_auc, total_batch_time / counter))