def predict(model):
    """Run `model` on the test set and write malware probabilities to ./result.csv.

    Reads the module-level settings `in_height`, `in_width`, `num_rows` and
    `batch_size`. The CSV has an index column named "sample_id" and a single
    data column named "malware".

    Args:
        model: object exposing `predict(input_fn=...)`, which is iterated to
            collect one result per sample.
    """
    # Only inputs are needed for prediction; no labels are loaded.
    test_inputs = load_test_data(in_height, in_width, num_rows)

    # One unshuffled pass so output rows stay in input order.
    input_fn = tf.estimator.inputs.numpy_input_fn(
        x={'file': test_inputs},
        batch_size=batch_size,
        num_epochs=1,
        shuffle=False)

    # Column 0 is the probability of class 0 (not malware),
    # column 1 the probability of class 1 (malware).
    probabilities = np.asarray(list(model.predict(input_fn=input_fn)))

    malware_frame = pd.DataFrame(probabilities[:, 1])
    malware_frame.to_csv(
        './result.csv',
        header=["malware"],
        index=True,
        index_label="sample_id")

    print('You can find the prediction results in ./result.csv.')
def eval():
    """Evaluate the VGG network on the test set and print its mean top-1 accuracy.

    Builds a fresh graph with fixed-size placeholders, restores the most
    recent checkpoint from ./checkpoints when one exists, then feeds the test
    set in batches. The running mean accuracy is printed after every batch
    and the final mean once at the end. Samples beyond the last full batch
    are not evaluated.
    """
    # Batch size baked into the placeholder shapes below.
    batch_size = 100

    test_data, test_labels = load_test_data()
    with tf.Graph().as_default():
        test_inputs_placeholder = tf.placeholder(
            tf.float32, shape=[batch_size, 32, 32, 3], name='test_inputs')
        test_labels_placeholder = tf.placeholder(
            tf.int32, shape=[batch_size], name='test_labels')

        logits = vgg.inference_vgg(test_inputs_placeholder, train=False)
        test_correct_op = tf.nn.in_top_k(logits, test_labels_placeholder, 1)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            # load most recent checkpoint
            ckpt = tf.train.get_checkpoint_state('./checkpoints')
            if ckpt:
                saver.restore(sess, ckpt.model_checkpoint_path)

            mean_acc = 0.0
            # Floor division: `/` yields a float on Python 3, which range()
            # rejects with a TypeError.
            for step in range(len(test_labels) // batch_size):
                # NOTE(review): create_batch presumably slices batches of the
                # same size as the placeholders -- confirm.
                test_batch_data, test_batch_labels = create_batch(
                    step, test_data, test_labels)
                feed_dict = {
                    test_inputs_placeholder: test_batch_data,
                    test_labels_placeholder: test_batch_labels
                }
                curr_correct = sess.run([test_correct_op], feed_dict=feed_dict)
                curr_acc = np.sum(curr_correct) / float(batch_size)
                # Incremental mean over the batches seen so far.
                mean_acc = (step * mean_acc + curr_acc) / (step + 1)
                print(mean_acc)
            print('Total mean accuracy is %f' % mean_acc)
def predict():
    """Train one DCModel per cross-validation fold and keep each fold's best result file.

    The training data is shuffled with a fixed seed and split into
    `config.KFOLD` folds. For every fold a new model is fitted on the
    training part, validated on the held-out part and run on the unlabeled
    test set. The result file of the best epoch is then copied to
    ./Data/result/best_<fold>.
    """
    # Local imports: only the file housekeeping in this function needs them.
    import glob
    import shutil

    word_weights, tag_weights = load_embedding()
    word_voc, tag_voc, label_voc = load_voc()

    # train data
    sentences, tags, labels = load_train_data(word_voc, tag_voc, label_voc)
    # Re-seeding before each shuffle replays the same random sequence, so the
    # three arrays stay aligned (assumes they have equal length -- confirm).
    seed = 137
    for array in (sentences, tags, labels):
        np.random.seed(seed)
        np.random.shuffle(array)

    # load data
    sentences_test, tags_test = load_test_data(word_voc, tag_voc, label_voc)
    labels_test = None

    # Clear previous results. Done synchronously: the former
    # os.popen('rm ...') returned before the shell command had finished and
    # could delete files the first fold was already writing.
    for stale_path in glob.glob('./Data/result/*'):
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    # Split the training data into train / dev parts for each fold.
    kf = KFold(n_splits=config.KFOLD)
    for num, (train_index, dev_index) in enumerate(kf.split(labels)):
        sentences_train, sentences_dev = sentences[train_index], sentences[dev_index]
        tags_train, tags_dev = tags[train_index], tags[dev_index]
        labels_train, labels_dev = labels[train_index], labels[dev_index]

        # init model
        model = DCModel(
            config.MAX_LEN, word_weights, tag_weights,
            result_path='./Data/result/result.txt',
            label_voc=label_voc)

        # fit model
        model.fit(
            sentences_train, tags_train, labels_train,
            sentences_dev, tags_dev, labels_dev,
            sentences_test, tags_test, labels_test,
            config.BATCH_SIZE, config.NB_EPOCH,
            keep_prob=config.KEEP_PROB,
            word_keep_prob=config.WORD_KEEP_PROB,
            tag_keep_prob=config.TAG_KEEP_PROB)

        best_score = model.get_best_score()
        print(best_score)
        [p_test, r_test, f_test], nb_epoch = best_score

        # Keep the best epoch's output for this fold. Copying in-process
        # guarantees the copy is complete before the next fold starts
        # overwriting the epoch_*.csv files.
        src = './Data/result/epoch_%d.csv' % (nb_epoch + 1)
        dst = './Data/result/best_%d' % num
        print('cp %s %s' % (src, dst))
        try:
            shutil.copy(src, dst)
        except (IOError, OSError) as err:
            # The shell `cp` never aborted the run; keep that, but report it.
            print('could not copy %s to %s: %s' % (src, dst, err))
        print(p_test, r_test, f_test, '\n')

        # clear model
        model.clear_model()
        del model
def main():
    """Fit a linear regression, print the result and plot the fit and the cost curve.

    Loads training data from data_train.txt and test data from data_test.txt,
    trains for a fixed number of epochs, prints the learned parameters, the
    last cost value and the test predictions, then shows two figures: a 3-D
    scatter with the fitted model, and the cost per epoch.
    """
    train_file = "data_train.txt"
    test_file = "data_test.txt"
    epoches = 100
    alpha = 1e-9

    data_array, label_array = load_train_data(train_file)
    test_array = load_test_data(test_file)

    data_matrix = np.mat(data_array)
    label_matrix = np.mat(label_array)
    test_matrix = np.mat(test_array)

    theta, cost_vector = train(data_matrix, label_matrix, epoches, alpha)
    test_result = test(theta, test_matrix)

    print(theta)
    last_cost_index = np.size(cost_vector) - 1
    print(cost_vector[last_cost_index])
    print(test_matrix, test_result)

    # ---- result figure -------------------------------------------------
    # Training points: column 1 on x, last column on y, label on z.
    m, n = np.shape(data_array)
    train_rows = range(m)
    plot_x = [data_matrix[row, 1] for row in train_rows]
    plot_y = [data_matrix[row, n - 1] for row in train_rows]
    plot_z = [label_matrix[row, 0] for row in train_rows]

    # Test points: same columns, predicted value on z.
    test_m, test_n = np.shape(test_matrix)
    test_rows = range(test_m)
    plot_testx = [test_matrix[row, 1] for row in test_rows]
    plot_testy = [test_matrix[row, test_n - 1] for row in test_rows]
    plot_testz = [test_result[row, 0] for row in test_rows]

    figure = plt.figure("Result")
    fig_plot = figure.add_subplot(111, projection='3d')
    fig_plot.scatter(plot_x, plot_y, plot_z, s=5, c='red', marker='s')
    fig_plot.scatter(plot_testx, plot_testy, plot_testz, s=30, c='green', marker='s')

    # Evaluate the fitted model on random (x, y) samples and draw it.
    x = np.random.randint(1000, 5000, size=[10000])
    y = np.random.randint(2, 5, size=[10000])
    z = theta[0, 0] + theta[1, 0] * x + theta[2, 0] * y
    fig_plot.plot(x, y, z)

    fig_plot.set_title("The Result Linear Regression")
    fig_plot.set_xlabel('Area')
    fig_plot.set_ylabel('Rooms')
    fig_plot.set_zlabel('Price')

    # ---- cost figure ---------------------------------------------------
    cost_fig = plt.figure("Cost")
    cost_plot = cost_fig.add_subplot(111)
    epoch = np.arange(epoches + 1)
    cost_plot.plot(epoch, cost_vector)
    plt.title("The Cost")
    plt.xlabel('Epoch')
    plt.ylabel('Cost')

    plt.show()
def generate_recons(self):
    """
    Generate reconstructed CT samples for the training, validation and test
    sets with the FDK neural network; they are used later for training the
    U-Net.

    The graph is built, the latest checkpoint under
    fdk_nn_model/saved_session/ is restored, and `do_model_eval` is run on
    each of the three sets with saving switched on.
    """
    # load all the data
    train_data_numpy, train_labels_numpy = load_data.load_training_data()
    validation_data_numpy, validation_labels_numpy = \
        load_data.load_validation_data()
    test_data_numpy, test_labels_numpy = load_data.load_test_data()

    # normalize the sinogram inputs first ...
    train_data_numpy = self.normalize_sino(train_data_numpy)
    validation_data_numpy = self.normalize_sino(validation_data_numpy)
    test_data_numpy = self.normalize_sino(test_data_numpy)

    # ... then the labels
    train_labels_numpy = self.normalize_labels(train_labels_numpy)
    validation_labels_numpy = self.normalize_labels(validation_labels_numpy)
    test_labels_numpy = self.normalize_labels(test_labels_numpy)

    # session
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.9
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        # Build Graph
        self.build_inital_graph()
        self.build_model_proj_graph()
        self.build_model_recon_graph()
        self.build_train_op_graph()

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        # generation on set
        print('\n############################### generating')
        checkpoint_path = tf.train.latest_checkpoint(
            'fdk_nn_model/saved_session/')
        self.saver.restore(sess, checkpoint_path)

        # (data, labels, sample count, sample indices) for each split,
        # processed in the order train -> validation -> test.
        splits = (
            (train_data_numpy, train_labels_numpy,
             NUM_TRAINING_SAMPLES, TRAIN_INDEX),
            (validation_data_numpy, validation_labels_numpy,
             NUM_VALIDATION_SAMPLES, VALID_INDEX),
            (test_data_numpy, test_labels_numpy,
             NUM_TEST_SAMPLES, TEST_INDEX),
        )
        for data, labels, num_samples, sample_index in splits:
            save_options = [
                True, self.model_name + '/eval_recon/generation_recons/'
            ]
            self.do_model_eval(sess, data, labels, num_samples,
                               sample_index, save_options)
def eval():
    """Decode the test set with the latest checkpoint and write results plus BLEU.

    Restores the most recent checkpoint from `hp.logdir`, decodes the test
    set autoregressively in batches of `hp.batch_size`, and writes every
    source / expected / got triple to results/<model name>. The file ends
    with the corpus BLEU score. Samples beyond the last full batch are not
    decoded.
    """
    transformer = Transformer(training=False)
    X, Sources, Targets = load_test_data()
    _, idx2en = load_vocab('./preprocessed/en.vocab.tsv')

    with transformer.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            # Parenthesised so the line is valid on both Python 2 and 3.
            print('restored')

            # Model name: the first double-quoted string in the checkpoint
            # index file. Read inside `with` so the handle is closed.
            with open(hp.logdir + '/checkpoint', 'r') as ckpt_index:
                mname = ckpt_index.read().split('"')[1]

            if not os.path.exists('results'):
                os.makedirs('results')

            with codecs.open('results/' + mname, 'w', 'utf-8') as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):
                    start = i * hp.batch_size
                    stop = (i + 1) * hp.batch_size
                    x = X[start:stop]
                    sources = Sources[start:stop]
                    targets = Targets[start:stop]

                    ### Autoregressive inference
                    # Position j is filled from a forward pass that sees
                    # the tokens predicted for positions < j.
                    preds = np.zeros((hp.batch_size, hp.max_len), np.int32)
                    for j in range(hp.max_len):
                        _preds = sess.run(transformer.preds, {
                            transformer.x: x,
                            transformer.y: preds
                        })
                        preds[:, j] = _preds[:, j]

                    for source, target, pred in zip(sources, targets, preds):
                        # Cut the decoded sentence at the end-of-sentence marker.
                        got = " ".join(
                            idx2en[idx] for idx in pred).split("</S>")[0].strip()
                        fout.write('- source: {}\n'.format(source))
                        fout.write('- expected: {}\n'.format(target))
                        fout.write('- got: {}\n\n'.format(got))

                        # Only pairs longer than 3 tokens on both sides
                        # count towards BLEU.
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            # NOTE(review): if corpus_bleu is nltk's, it
                            # expects a list of references per hypothesis,
                            # i.e. `[ref]` -- confirm before trusting the score.
                            list_of_refs.append(ref)
                            hypotheses.append(hypothesis)

                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Bleu Score = " + str(100 * score))
def test(net, criterion, device):
    """Print the classification accuracy of `net` on the test set.

    Args:
        net: model called as `net(inputs)`; the index of the largest output
            along dimension 1 is taken as the predicted class.
        criterion: unused; kept so existing callers keep working.
        device: device the input and label batches are moved to.
    """
    testloader = load_test_data()
    correct, total = 0, 0
    with torch.no_grad():
        for data in tqdm(testloader):
            inputs, labels = data[0].to(device), data[1].to(device)
            outputs = net(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            # .item() keeps the running count a plain Python int, so the
            # percentage below does not depend on tensor division semantics.
            correct += (predicted == labels).sum().item()

    # Guard against an empty loader instead of dividing by zero.
    accuracy = 100 * correct / total if total else 0
    print('Accuracy of the network on the 10000 test images: %d %%' % accuracy)
    torch.cuda.empty_cache()
def main():
    """Tune, train and apply the LightGBM classifier, logging every stage.

    Sets up console (INFO) and file (DEBUG) logging, searches the best
    parameters and iteration count on the training data, trains the final
    models, then predicts on the training and test rows combined.
    """
    log_fmt = Formatter(
        '%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s '
    )

    console_handler = StreamHandler()
    console_handler.setLevel(INFO)
    console_handler.setFormatter(log_fmt)
    logger.addHandler(console_handler)

    file_handler = FileHandler(DIR + 'train_lgb_clf_hyperopt.py.log', 'a')
    file_handler.setLevel(DEBUG)
    file_handler.setFormatter(log_fmt)
    logger.setLevel(DEBUG)
    logger.addHandler(file_handler)

    logger.info('start')

    # ---- parameter search ---------------------------------------------
    logger.info("start exploring best params")

    logger.info("start exploring best params without iteration")
    df_train = load_train_data()
    # Feature columns are the label-based slice from 'ABC' through '2047'.
    x_train = df_train.loc[:, 'ABC':'2047']
    y_train = df_train['Active_Nonactive'].values
    best_params = lgb_opt_params(x_train, y_train)
    logger.info("end exploring best params without iteration")

    logger.info("start optimizing iteration")
    best_iter = opt_iter(x_train, y_train, best_params)
    logger.info("end optimizing iteration")

    logger.info("end exploring best params")

    # ---- final training -----------------------------------------------
    logger.info("start best params train")
    best_model_No, cutoff = create_models(
        x_train, y_train, best_params, best_iter)
    logger.info("end best params train")

    # ---- prediction ---------------------------------------------------
    logger.info("start predict unknown data(test data)")
    df_test = load_test_data().sort_values('Name')
    use_cols = x_train.columns.values
    # Prediction runs on train and test rows together, ordered by 'Name'.
    stacked = pd.concat([df_train, df_test], axis=0, sort=False)
    df_all = stacked.sort_values('Name')
    x_all = df_all[use_cols]
    predict_test(x_all, best_model_No, cutoff)
    logger.info("end predict unknown data(test data)")

    logger.info("end")
def evaluate_on_metrics(model):
    """
    Evaluate a model's saved test reconstructions on MSE, SSIM, MS-SSIM and
    PSNR, print the values and append them to the model's metrics file.

    Parameters
    ----------
    model : str
        The model for evaluation. Used as the directory prefix for both the
        reconstructions (`<model>/eval_recon/recon_<index>.npy`) and the
        results file (`<model>/eval_result/metrics_result.txt`).
    """
    # get the labels
    _, labels = load_data.load_test_data()
    labels = normalize(labels)

    # load the recons on the model, one file per test index
    recon_phantoms = np.empty(labels.shape)
    for i in range(recon_phantoms.shape[0]):
        recon_file = model + '/eval_recon/recon_' + str(TEST_INDEX[i]) + '.npy'
        recon_phantoms[i, :, :, :] = np.load(recon_file)

    # MSE
    mse = np.mean(np.square(recon_phantoms - labels))

    # Maximum value passed to the SSIM / MS-SSIM / PSNR helpers.
    max_val = 1.0

    # SSIM
    ssim = calculate_ssim(recon_phantoms, labels, max_val)

    # MS-SSIM
    ms_ssim = calculate_ms_ssim(recon_phantoms, labels, max_val)

    # Peak Signal-to-Noise Ratio
    psnr = calculate_psnr(recon_phantoms, labels, max_val)

    # print the results
    print('mse value: ', str(mse))
    print('ssim value: ', str(ssim))
    print('ms-ssim value: ', str(ms_ssim))
    print('psnr value: ', str(psnr))

    # Save the metrics results. Appended, so earlier runs are kept; the
    # context manager closes the file even if formatting or writing fails.
    with open(model + '/eval_result/metrics_result.txt', 'a+') as f:
        f.write(
            "Model: {0}, Date: {1:%Y-%m-%d_%H:%M:%S} \nMSE: {2:3.8f} \nSSIM: {3:3.8f} \nMS-SSIM: {4:3.8f} \nPSNR: {5:3.8f}\n\n"
            .format(model, datetime.datetime.now(), mse, ssim, ms_ssim, psnr))
def eval_pure_fdk():
    """
    Evaluate the conventional FDK reconstructions on MSE, SSIM, MS-SSIM and
    PSNR, print the values and append them to
    pure_fdk_model/eval_result/metrics_result.txt.

    Reconstructions are read from
    `../data_preprocessing/recon_145/recon_<index>.npy` and normalized before
    being compared with the normalized test labels.
    """
    # get the labels
    _, labels = load_data.load_test_data()
    labels = normalize(labels)

    # load the recons, one file per test index
    recon_phantoms = np.empty(labels.shape)
    for i in range(recon_phantoms.shape[0]):
        recon_file = '../data_preprocessing/recon_145/recon_' + str(
            TEST_INDEX[i]) + '.npy'
        recon_phantoms[i, :, :, :] = np.load(recon_file)

    recon_phantoms = normalize(recon_phantoms)

    # MSE
    mse = np.mean(np.square(recon_phantoms - labels))

    # Maximum value passed to the SSIM / MS-SSIM / PSNR helpers.
    max_val = 1.0

    # SSIM
    ssim = calculate_ssim(recon_phantoms, labels, max_val)

    # MS-SSIM
    ms_ssim = calculate_ms_ssim(recon_phantoms, labels, max_val)

    # Peak Signal-to-Noise Ratio
    psnr = calculate_psnr(recon_phantoms, labels, max_val)

    # print the results
    print('mse value: ', str(mse))
    print('ssim value: ', str(ssim))
    print('ms-ssim value: ', str(ms_ssim))
    print('psnr value: ', str(psnr))

    # Save the metrics results. Appended, so earlier runs are kept; the
    # context manager closes the file even if formatting or writing fails.
    with open('pure_fdk_model/eval_result/metrics_result.txt', 'a+') as f:
        f.write(
            "Model: {0}, Date: {1:%Y-%m-%d_%H:%M:%S} \nMSE: {2:3.8f} \nSSIM: {3:3.8f} \nMS-SSIM: {4:3.8f} \nPSNR: {5:3.8f}\n\n"
            .format('pure_fdk_model', datetime.datetime.now(), mse, ssim,
                    ms_ssim, psnr))
def predict():
    """Score the test data with the pickled model and write the submission CSV.

    Side effects, all under `DIR`: writes feature_importances.csv,
    test_tmp_pred.pkl (raw predictions) and submit.csv (columns `click_id`
    and `is_attributed`).

    Raises:
        Exception: if the number of test columns differs from the number of
            features the model reports importances for.
    """
    with open(DIR + 'model.pkl', 'rb') as model_file:
        clf = pickle.load(model_file)
    with open(DIR + 'usecols.pkl', 'rb') as cols_file:
        usecols = pickle.load(cols_file)

    # Feature importances, labelled with their column names and saved sorted.
    importances = pd.DataFrame(clf.feature_importance(), columns=['imp'])
    importances['col'] = usecols
    n_features = importances.shape[0]
    importances = importances.sort_values('imp', ascending=False)
    importances.to_csv(DIR + 'feature_importances.csv')
    used_shape = importances[importances.imp > 0].shape
    logger.info('imp use {} {}'.format(used_shape, n_features))

    df = load_test_data()
    logger.info('data size {}'.format(df.shape))

    # Columns the model needs but the test data lacks are filled with zeros.
    for col in usecols:
        if col in df.columns.values:
            continue
        df[col] = np.zeros(df.shape[0])
        logger.info('no col %s' % col)

    x_test = df[usecols]
    n_test_cols = x_test.shape[1]
    if n_test_cols != n_features:
        raise Exception('Not match feature num: %s %s' % (n_test_cols, n_features))

    logger.info('test load end')

    p_test = clf.predict(x_test)
    # Keep the raw predictions; -1 selects the highest pickle protocol.
    with open(DIR + 'test_tmp_pred.pkl', 'wb') as pred_file:
        pickle.dump(p_test, pred_file, -1)

    logger.info('test save end')

    submission = pd.DataFrame()
    submission['click_id'] = df['click_id']
    submission['is_attributed'] = p_test
    submission.to_csv(DIR + 'submit.csv', index=False)
    logger.info('exit')
def model_selection_and_evaluation():
    """
    Test some candidate models with a validation set, select the highest
    scoring one, then evaluate it on the test set using the full training
    data (train + validation).

    :return: tuple: best model, list of feature sets it uses
    """
    # Load train and test sets
    train_df = load_data.load_train_data()
    test_df = load_data.load_test_data()

    # Hold out part of the training data for model selection
    fit_df, validation_df = utils.split_train_validation(train_df)

    # Score all candidates on the validation part and keep the best
    best_model, best_model_feats = model_selection(fit_df, validation_df)
    summary = 'Best scoring model is: {}, Using feature sets: {}'.format(
        best_model.name, best_model_feats)
    print(summary)

    # Evaluate test set accuracy of the chosen model; the un-split training
    # frame is passed here.
    chosen = {best_model: best_model_feats}
    test_set_evaluation(train_df, test_df, chosen)

    return best_model, best_model_feats
import numpy as np

###############################
# Untar data
def untar_data(name, outdir='./data'):
    """Extract the archive ./Indoor-scene-recognition/<name> into `outdir`.

    Args:
        name: file name of the tar archive inside ./Indoor-scene-recognition/.
        outdir: directory the archive contents are extracted to.
    """
    # Context manager closes the archive even if extraction fails.
    # NOTE: extractall() trusts the member paths in the archive; only use
    # this on archives from a trusted source.
    with tarfile.open('./Indoor-scene-recognition/' + name) as my_tar:
        my_tar.extractall(outdir)


# Uncomment to untar data
# untar_data("indoorCVPR_09annotations.tar")
# untar_data("indoorCVPR_09.tar")
###############################

###############################
# Load data
test_data = load_data.load_test_data()
train_data = load_data.load_train_data()

# Show the data
print(test_data.shape)
print(train_data.shape)

# Pick one random sample from each set and display it until a key is pressed.
train_i = np.random.choice(train_data.shape[0])
test_i = np.random.choice(test_data.shape[0])

cv2.imshow("example in train", train_data[train_i])
cv2.imshow("example in test", test_data[test_i])
cv2.waitKey(0)
###############################
### with open(DIR + 'model.pkl', 'rb') as f: clf = pickle.load(f) with open(DIR + 'usecols.pkl', 'rb') as f: usecols = pickle.load(f) imp = pd.DataFrame(clf.feature_importance(), columns=['imp']) imp['col'] = usecols n_features = imp.shape[0] imp = imp.sort_values('imp', ascending=False) imp.to_csv(DIR + 'feature_importances.csv') logger.info('imp use {} {}'.format(imp[imp.imp > 0].shape, n_features)) with open(DIR + 'fillna_mean.pkl', 'rb') as f: fillna_mean = pickle.load(f) x_test = load_test_data() id_cols = [ col for col in x_test.columns.values if re.search('_id$', col) is not None and col not in set( ['o_user_id', 'o_product_id', 'p_aisle_id', 'p_department_id']) ] logger.debug('id_cols {}'.format(id_cols)) x_test.drop(id_cols, axis=1, inplace=True) logger.info('usecols') x_test = x_test[usecols] gc.collect() logger.info('values {} {}'.format(len(usecols), x_test.shape)) x_test.fillna(fillna_mean, inplace=True)
xg_trn, num_boost_round=5000, evals=watchlist, early_stopping_rounds=100, verbose_eval=50) return model if __name__ == '__main__': logger.info('Start') train_df = load_train_data(nrows=100) logger.info('train load end {}'.format(train_df.shape)) test_df = load_test_data(nrows=100) logger.info('test load end {}'.format(test_df.shape)) # Labels train_y = train_df["deal_probability"].values test_id = test_df["item_id"].values # Feature Weekday train_df["activation_weekday"] = train_df["activation_date"].dt.weekday test_df["activation_weekday"] = test_df["activation_date"].dt.weekday # Label encode the categorical variables cat_vars = [ "region", "city", "parent_category_name", "category_name", "user_type", "param_1", "param_2", "param_3" ]
from os import path
import random

#os.environ["CUDA_VISIBLE_DEVICES"]="3,4,5,6"

# Progress records for each data split; they start empty here.
training_progress = []
development_progress = []
test_progress = []

# Model trained with Adagrad on mean squared error; mean absolute error is
# tracked as an additional metric.
model = load_model()
model.compile(optimizer='adagrad', loss='mse', metrics=['mae'])

X_train, Y_train = load_training_data()
X_dev, Y_dev = load_development_data()
X_test, Y_test = load_test_data()

# Best-so-far error trackers, initialised to a large value (10000).
min_mse_dev = 10000
min_mae_dev = 10000
min_mse_test = 10000
min_mae_test = 10000

current_epoch_number = 1
total_epoch_count = 100

# Size of the first axis of X_train.
m = X_train.shape[0]
# The integers 1 .. m-1 (presumably candidate batch sizes -- confirm against usage).
batch_size_list = list(range(1, m))

print("\n\n")
# Logging: console handler at INFO and an appending file handler at DEBUG,
# both using the same format.
log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')

df_train0 = load_train_data()
df_test0 = load_test_data()

logger.info('concat train and test datasets: {} {}'.format(df_train0.shape, df_test0.shape))

# Mark each row's origin (1 = train, 0 = test) before stacking the two frames.
df_train0['train'] = 1
df_test0['train'] = 0
df = pd.concat([df_train0, df_test0], axis=0, sort=False)

logger.info('Data preprocessing')

# Drop PoolQC, MiscFeature, Alley and Fence features
# because they have more than 80% of missing values.
df = df.drop(['Alley','PoolQC','Fence','MiscFeature'],axis=1)

# Split the columns by dtype: object columns versus everything else.
object_columns_df = df.select_dtypes(include=['object'])
numerical_columns_df =df.select_dtypes(exclude=['object'])
epsilon=None, decay=0.0, amsgrad=True) batch_size = 20 #Parameters show_num = 1 map_index = 2 dataset_index = 3 #Load pre-trained weights model.load_weights(pretrained_model_weights) model.compile(optimizer=opt, loss='mse') #Load data test_input, expected_output, obs = load_test_data(dataset_index=dataset_index, map_index=map_index) print(test_input[2].shape) # print(expected_ouput[1]) #sys.exit() print('Predicting...') predicted_output = model.predict(test_input, batch_size=batch_size, verbose=1) print('Predicting Done!') print('Calculating Predicting Error...') mean_FDE = calculate_FDE(expected_output, predicted_output, len(expected_output), show_num) mean_ADE = calculate_ADE(expected_output, predicted_output, len(expected_output), 12, show_num) all_FDE = calculate_FDE(expected_output, predicted_output,
import torch
from model import FNet
from load_data import load_result_data, load_test_data
import numpy as np
import pandas as pd

#device=torch.device('cuda:0' if torch.cuda.is_available else 'cpu')

# Fine-tune data and the inputs to predict for.
# NOTE(review): the shape comments in this section disagree with each other
# ([1000,10,2] here, [100, 5, 2] at the loop) -- confirm the real shapes.
#[1000,10] [1000,10] [1000,10,2]
np_test_x, np_test_y, np_test_xy = load_test_data(
    x_path="data/task9_evaluate_finetune_x.csv",
    y_path="data/task9_evaluate_finetune_y.csv")
np_result_x = load_result_data(x_path="data/task9_evaluate_x.csv")

fnet = FNet(1, 50, 1)
# Returns each storage unchanged; passed to torch.load as map_location
# (presumably so the weights load on CPU -- confirm).
map_location = lambda storage, loc: storage
fnet.load_state_dict(torch.load("fnetmodel.pkl", map_location=map_location))
fnet.eval()

# Results
result = []
for i in range(np_test_xy.shape[0]):  # [100, 5, 2]
    fnet.eval()
    test_xy = np_test_xy[i]  # [5, 2]
    test_x = test_xy[:, 0]  # [5, ]
    test1_x = np_result_x[i]  # [100, ]
    # Add a trailing axis so each value becomes its own row.
    tensor_test1_x = torch.from_numpy(test1_x[:, np.newaxis]).float()  # [100, 1]
    tensor_test_xy = torch.from_numpy(test_xy).float()  # [5, 2]
def run_training(self):
    """
    Train the model with early stopping, then evaluate the saved checkpoint
    on the test set.

    Each epoch runs NUM_TRAINING_SAMPLES training steps followed by a
    validation pass. A checkpoint is saved whenever the validation loss
    reaches a new minimum, and training stops once the current validation
    loss is no better than any of the last five recorded losses.
    """
    # load all the data
    train_data_numpy, train_labels_numpy = load_data.load_training_data()
    validation_data_numpy, validation_labels_numpy = load_data.load_validation_data(
    )
    test_data_numpy, test_labels_numpy = load_data.load_test_data()

    # normalize the input data
    train_data_numpy = self.normalize_sino(train_data_numpy)
    validation_data_numpy = self.normalize_sino(validation_data_numpy)
    test_data_numpy = self.normalize_sino(test_data_numpy)

    # normalize the labels
    train_labels_numpy = self.normalize_labels(train_labels_numpy)
    validation_labels_numpy = self.normalize_labels(
        validation_labels_numpy)
    test_labels_numpy = self.normalize_labels(test_labels_numpy)

    # Training session
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.9
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Build Graph
        self.build_inital_graph()
        self.build_model_proj_graph()
        self.build_model_recon_graph()
        self.build_train_op_graph()

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        valid_losses = []  # validation loss of every finished epoch
        best_valid_loss = np.Inf
        for epoch in range(MAX_EPOCHS):
            # Initialise dataset iterator
            sess.run(self.iter.initializer,
                     feed_dict={
                         self.data_placeholder: train_data_numpy,
                         self.labels_placeholder: train_labels_numpy,
                         self.index_placeholder: TRAIN_INDEX
                     })

            # Sum of the per-step mean losses; divided by the step count
            # when printed below.
            training_losses = 0
            for step in range(NUM_TRAINING_SAMPLES):
                # run the training
                training_loss, _ = sess.run([self.loss, self.train_op])
                training_losses += np.mean(training_loss)

            # Validation pass; [False, ''] is the save-option pair handed to
            # do_model_eval (presumably "do not save" -- confirm).
            valid_loss = self.do_model_eval(sess, validation_data_numpy,
                                            validation_labels_numpy,
                                            NUM_VALIDATION_SAMPLES,
                                            VALID_INDEX, [False, ''])
            valid_losses.append(valid_loss)

            print(
                'Epoch: {0:3d}/{1:3d}, training loss: {2:3.8f}, validation loss: {3:3.8f}'
                .format(epoch + 1, MAX_EPOCHS,
                        training_losses / NUM_TRAINING_SAMPLES,
                        valid_loss))

            # early stopping if validation loss is increasing or staying the same after five epochs
            # The current loss is already the last element of this slice, so
            # the `all(...)` test below effectively compares it with the
            # four preceding epochs.
            last_five_valid_losses = valid_losses[-5:]
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                # Save a checkpoint of the least validation loss model so far
                # print("saving this least validation loss model so far!")
                self.saver.save(sess,
                                self.model_name + '/saved_session/sess-' +
                                '{date:%m_%d_%H:%M}'.format(
                                    date=datetime.datetime.now()) + '.ckpt',
                                global_step=epoch)
            elif len(last_five_valid_losses) == 5 and all(
                [valid_loss >= x for x in last_five_valid_losses]):
                # print('early stopping !!!')
                break
            else:
                # print('no improvement on validation at this epoch, continue training...')
                continue

        # evaluate on test set
        # Checkpoints are only written on a new best validation loss, so the
        # latest checkpoint is the best model of this run.
        print(
            '\n############################### testing evaluation on best trained model so far'
        )
        best_model_sess_file = tf.train.latest_checkpoint(
            self.model_name + '/saved_session/')
        self.saver.restore(sess, best_model_sess_file)
        test_loss = self.do_model_eval(
            sess, test_data_numpy, test_labels_numpy, NUM_TEST_SAMPLES,
            TEST_INDEX, [True, self.model_name + '/eval_recon/'])
        print("average test loss: ", test_loss)
# Keyword arguments for the stemmed TF-IDF vectorizer, taken from `parameters`.
vectorizer_param = {'preprocessor': preprocessor, 'ngram_range': parameters['ngram_range'], 'analyzer': 'word',
                    'min_df': parameters['min_df'], 'max_df': parameters['max_df'],
                    'binary': parameters['TF_binary'], 'norm': parameters['norm'],
                    'sublinear_tf': parameters['sublinear_tf'], 'max_features': parameters['max_features']}

if __name__ == "__main__":
    # Candidate feature extractors; only `unigram` and `avg_strength` are
    # combined below. The others are used by the commented-out alternatives.
    unigram = StemmedTfidfVectorizer(**vectorizer_param)
    anew = anew_vectorizer()
    pct = punctuation_estimator()
    strength = strength_vectorizer()
    avg_strength = avg_affective_vectorizer()
    log_state('combine unigram and avg strength features')
    combined_features = FeatureUnion([('unigram', unigram), ('avg_strength', avg_strength)])
    # log_state('combine unigram and strength features')
    # combined_features =FeatureUnion([('unigram',unigram),('strength',strength)])
    # log_state('combine unigram and anew features')
    # combined_features =FeatureUnion([('unigram',unigram),('anew',anew)])
    # log_state('combine unigram and punctuation features')
    # combined_features =FeatureUnion([('unigram',unigram),('pct',pct)])
    texts, _ = load_train_data('Sentiment140')

    # Fit on the training texts only; the test texts are transformed with
    # the already-fitted extractors.
    transformed_train = combined_features.fit_transform(texts)

    testdata, _ = load_test_data()
    transformed_test = combined_features.transform(testdata)

    # Persist the feature names and both transformed matrices.
    dump_picle(combined_features.get_feature_names(), './data/features/feature_names.p')
    dump_picle(transformed_train, "./data/transformed_data/transformed_train.p")
    dump_picle(transformed_test, "./data/transformed_data/transformed_test.p")
def adjust_param():
    """Train the one-class SVM repeatedly on one iris class and report the best run.

    Samples of class `type_num` form the training set. The test set is built
    from the remaining index ranges. The SMO solver is run `n_runs` times;
    the model with the highest accuracy is printed and plotted.

    Raises:
        ValueError: if `type_num` is not 0, 1 or 2.
    """
    # Which class of the data is used as the training set
    type_num = 0
    dim = 4
    C = 0.6
    toler = 0.0001
    maxIter = 40
    # Number of independent training runs
    n_runs = 50

    best_acc = 0
    best_a = 0
    best_r = 0
    best_label = []

    # Data preprocessing
    if type_num == 0:
        train_data = load_data.load_train_data('../data/iris.data', 0, 30, dim=dim)
        test_data, correct_label = load_data.load_test_data('iris.data', type_num, 30, 150, dim=dim)
    elif type_num == 1:
        # NOTE(review): unlike the other branches, `dim` is not passed to
        # load_train_data here -- confirm whether that is intended.
        train_data = load_data.load_train_data('../data/iris.data', 50, 80)
        test_data1, correct_label1 = load_data.load_test_data('iris.data', type_num, 80, 150, dim=dim)
        test_data2, correct_label2 = load_data.load_test_data('iris.data', type_num, 0, 50, dim=dim)
        test_data = np.vstack((test_data1, test_data2))
        correct_label = np.hstack((correct_label1, correct_label2))
    elif type_num == 2:
        train_data = load_data.load_train_data('../data/iris.data', 100, 130, dim=dim)
        test_data1, correct_label1 = load_data.load_test_data('iris.data', type_num, 130, 150, dim=dim)
        test_data2, correct_label2 = load_data.load_test_data('iris.data', type_num, 0, 100, dim=dim)
        test_data = np.vstack((test_data1, test_data2))
        correct_label = np.hstack((correct_label1, correct_label2))
    else:
        # Previously this fell through and failed later with a NameError.
        raise ValueError("type_num must be 0, 1 or 2, got %r" % (type_num,))

    # Accuracy statistics over all runs.
    min_acc = 2
    avrg_acc = 0
    max_acc = -1
    for _ in range(n_runs):
        a, R = one_class_svm.smo(train_data, C, toler, maxIter)
        result_label = judge(test_data, a, R)
        acc = calculate_acc(result_label, correct_label)
        # Remember the model of the best run so far.
        if acc > best_acc:
            best_acc = acc
            best_a = a
            best_r = R
            best_label = result_label
        avrg_acc += acc
        min_acc = min(min_acc, acc)
        max_acc = max(max_acc, acc)
        #print("accuracy: " + str(acc))
    # Average over the runs actually performed (was divided by a fixed 100
    # although only 50 runs are accumulated).
    avrg_acc /= n_runs

    print("train type:" + str(type_num) + ", dim=" + str(dim) + " => best acc = " + str(max_acc))
    print("model: a=" + str(best_a) + ", R=" + str(best_r) + ",C=" + str(C))
    print("label(0-20:positive sample):")
    print(best_label)
    draw_picture(train_data, test_data, correct_label, best_a, best_r, C, toler, best_acc)
from TFNN.layers.EmbeddingLayer import Embedding from sklearn.model_selection import KFold from triggerType_to_trigger import get_trigger ''' For Chinese word segmentation. ''' #############################1.load data ###################################### class_type = 3 training_count = 16796 test_count = 2570 word_weights, tag_weights = load_embedding() #矩阵形式 word_voc, tag_voc, label_voc = load_voc() #字典形式 sentences, tags, labels = load_train_data(word_voc, tag_voc, label_voc, class_type, training_count) Xend_sentence, Xend_tag_test, yend_test = load_test_data( word_voc, tag_voc, label_voc, class_type, test_count) #划分训练集,测试集(这里的y为词性tag kf = KFold(n_splits=10) train_indices, dev_indices = [], [] for train_index, dev_index in kf.split(labels): train_indices.append(train_index) dev_indices.append(dev_index) for num in range(10): train_index, dev_index = train_indices[num], dev_indices[num] sentences_train, sentences_dev = sentences[train_index], sentences[ dev_index] tags_train, tags_dev = tags[train_index], tags[dev_index] labels_train, labels_dev = labels[train_index], labels[dev_index] """kf = KFold(n_splits=10)
if len(result_label) != len(correct_label): print("Number of label isn't equal!") return 0 n = len(result_label) acc = 0 for i in range(n): if result_label[i] == correct_label[i]: acc += 1 return acc / n if __name__ == "__main__": training_data = load_data.load_training_data('data/iris.data') # 获取训练集, # training_data = [ [type1_data], [type2_data], …… [typeN_data] ] w_and_b = test_iris(training_data) # 得到支持向量 test_data, label = load_data.load_test_data( 'data/iris.data') # 获取测试集和正确的标签 result_label = judge(test_data, w_and_b) # 测试 print(result_label) # 打印结果标签 acc = calculate_acc(result_label, label) # 计算正确率 print("Accuracy: " + str(acc))
def train(argv=None):
    """Build and train the multi-kernel text CNN, evaluating periodically.

    Steps:
      1. Load the train and test sets, merge them, shuffle with a fixed
         seed and hold out the first 1000 shuffled samples for evaluation.
      2. Build the graph: one convolution + 1-max-pooling branch per filter
         size, concatenation, dropout, and a fully connected output layer.
      3. Train with Adam. Every ``EVAL_FREQUENCY``-th batch is used to run
         an evaluation instead of a training step; the evaluation may decay
         the learning rate when the loss stops improving.

    The code targets the pre-1.0 TensorFlow API already used by this file
    (``tf.concat(dim, values)``, ``tf.scalar_summary``,
    ``tf.train.SummaryWriter``, ...).

    Args:
        argv: Unused (presumably present for ``tf.app.run`` -- TODO confirm).
    """
    # load data
    print("Loading data ... ")
    x_train, y_train = load_data.load_train_data()
    x_test, y_test = load_data.load_test_data()

    # concatenate and shuffle (fixed seed keeps the split reproducible)
    x_sum = numpy.concatenate((x_train, x_test))
    y_sum = numpy.concatenate((y_train, y_test))
    numpy.random.seed(10)
    shuffle_indices = numpy.random.permutation(numpy.arange(len(y_sum)))
    x_shuffled = x_sum[shuffle_indices]
    y_shuffled = y_sum[shuffle_indices]

    # split to train and test: the first 1000 shuffled samples are held out
    x_train = x_shuffled[1000:]
    y_train = y_shuffled[1000:]
    x_test = x_shuffled[:1000]
    y_test = y_shuffled[:1000]
    print(x_train.shape)
    print(x_test.shape)

    # expand (batch_size,MAX_SENTENCE_LENGTH,EMBEDDING_SIZE) to
    # (batch_size,MAX_SENTENCE_LENGTH,EMBEDDING_SIZE,1)
    x_train = numpy.expand_dims(x_train, -1)
    x_test = numpy.expand_dims(x_test, -1)

    filter_sizes = [2, 3, 4, 5]
    filter_numbers = [300, 200, 100, 50]

    # input is sentence
    train_data_node = tf.placeholder(
        tf.float32,
        shape=(None, max_document_length, EMBEDDING_SIZE, NUM_CHANNELS))
    train_labels_node = tf.placeholder(tf.float32, shape=(None, NUM_CLASSES))
    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

    # full connected - softmax layer
    fc1_weights = tf.Variable(
        tf.truncated_normal([sum(filter_numbers), NUM_CLASSES],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))
    fc1_biases = tf.Variable(
        tf.constant(0.1, shape=[NUM_CLASSES], dtype=tf.float32))

    def model(data):
        """Return the logits for the input tensor ``data``."""
        pooled_outputs = []
        for idx, filter_size in enumerate(filter_sizes):
            # Use the function argument (the original read the outer
            # placeholder and silently ignored ``data``).
            conv = conv2d(data,
                          filter_numbers[idx],
                          filter_size,
                          EMBEDDING_SIZE,
                          name="kernel%d" % idx)
            # 1-max pooling, leaves a tensor of shape
            # [batch_size, 1, 1, num_filters]
            pool = tf.nn.max_pool(
                conv,
                ksize=[1, max_document_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID')
            # Squeeze only the two pooled axes so that a batch of size 1
            # keeps its batch dimension.
            pooled_outputs.append(tf.squeeze(pool, [1, 2]))
        if len(filter_sizes) > 1:
            cnn_output = tf.concat(1, pooled_outputs)
        else:
            cnn_output = pooled_outputs[0]
        # add dropout
        reshape = tf.nn.dropout(cnn_output, dropout_keep_prob)
        # fc1 layer
        fc1_output = tf.matmul(reshape, fc1_weights) + fc1_biases
        return fc1_output

    # Training computation
    logits = model(train_data_node)
    # NOTE(review): the logits are clipped to [1e-10, 1.0] before the
    # cross-entropy. softmax_cross_entropy_with_logits expects unscaled
    # logits, so this clipping looks unintended; it is kept to preserve the
    # current training behaviour -- confirm before removing.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            tf.clip_by_value(logits, 1e-10, 1.0), train_labels_node))
    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases))
    loss += 0.05 * regularizers
    tf.scalar_summary('loss', loss)

    # optimizer
    global_step = tf.Variable(0, name="global_step", trainable=False)
    # The learning rate is a variable so dev_step can decay it at run time.
    learning_rate = tf.Variable(start_learning_rate, name="learning_rate")
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)

    # Evaluate model (predictions use the unclipped logits)
    train_predict = tf.argmax(logits, 1)
    train_label = tf.argmax(train_labels_node, 1)
    # train accuracy
    train_correct_pred = tf.equal(train_predict, train_label)
    train_accuracy = tf.reduce_mean(tf.cast(train_correct_pred, tf.float32))
    tf.scalar_summary('acc', train_accuracy)
    merged = tf.merge_all_summaries()

    def compute_index(y_label, y_predict):
        """Print accuracy, recall and F1 for macro/micro/weighted averaging."""
        for average in ("macro", "micro", "weighted"):
            print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
                average, accuracy_score(y_label, y_predict),
                recall_score(y_label, y_predict, average=average),
                f1_score(y_label, y_predict, average=average)))

    def dev_step(x_batch, y_batch, best_test_loss, sess):
        """Evaluate on the given data and return the updated best loss.

        No training op is run. Inside the check window the learning rate is
        decayed (never below ``min_learning_rate``) when the loss improved
        by less than ``decay_delta``; otherwise the current loss becomes
        the new best loss.
        """
        feed_dict = {
            train_data_node: x_batch,
            train_labels_node: y_batch,
            dropout_keep_prob: 1.0  # no dropout during evaluation
        }
        summary, step, losses, lr, acc, y_label, y_predict = sess.run(
            [
                merged, global_step, loss, learning_rate, train_accuracy,
                train_label, train_predict
            ],
            feed_dict=feed_dict)
        # test_writer is created by the enclosing function before the first
        # call to dev_step.
        test_writer.add_summary(summary, step)
        time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, lr {:g} ,acc {:g}".format(
            time_str, step, losses, lr, acc))
        # compute index
        compute_index(y_label, y_predict)

        new_best_test_loss = best_test_loss
        # decide if need to decay learning rate
        if (step % steps_each_check < 100) and (step > 100):
            loss_delta = (best_test_loss
                          if best_test_loss is not None else 0) - losses
            if best_test_loss is not None and loss_delta < decay_delta:
                print(
                    'validation loss did not improve enough, decay learning rate'
                )
                current_learning_rate = max(lr * learning_rate_decay,
                                            min_learning_rate)
                if current_learning_rate == min_learning_rate:
                    print('It is already the smallest learning rate.')
                sess.run(learning_rate.assign(current_learning_rate))
                print('new learning rate is: ', current_learning_rate)
            else:
                # update
                new_best_test_loss = losses
        return new_best_test_loss

    # run the training
    with tf.Session() as sess:
        train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train',
                                              sess.graph)
        test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
        try:
            tf.initialize_all_variables().run()
            print('Initialized!')
            # Generate batches
            batches = data_helpers.batch_iter(
                list(zip(x_train, y_train)), BATCH_SIZE, NUM_EPOCHS)
            train_fetches = [
                train_op, merged, global_step, loss, train_accuracy
            ]
            batch_count = 0
            best_test_loss = None
            # Training loop. For each batch...
            for batch in batches:
                batch_count += 1
                if batch_count % EVAL_FREQUENCY == 0:
                    # Evaluation replaces the training step for this batch.
                    print("\nEvaluation:")
                    best_test_loss = dev_step(x_test, y_test, best_test_loss,
                                              sess)
                    print("")
                    continue

                x_batch, y_batch = zip(*batch)
                feed_dict = {
                    train_data_node: x_batch,
                    train_labels_node: y_batch,
                    dropout_keep_prob: 0.5
                }
                if batch_count % META_FREQUENCY == 99:
                    # Periodically record a full execution trace.
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, step, losses, acc = sess.run(
                        train_fetches,
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    train_writer.add_run_metadata(run_metadata,
                                                  'step%03d' % step)
                    log_format = "{}: step {}, loss {:g},acc {:g}"
                else:
                    _, summary, step, losses, acc = sess.run(
                        train_fetches, feed_dict=feed_dict)
                    log_format = "{}: step {}, loss {:g}, acc {:g}"
                train_writer.add_summary(summary, step)
                time_str = datetime.datetime.now().isoformat()
                print(log_format.format(time_str, step, losses, acc))
        finally:
            # Close the event files even when training raises.
            train_writer.close()
            test_writer.close()
#!/usr/bin/env python3
"""Evaluate ``predict`` on the test set and write the metrics to ./report.json.

The target is binary: 1 when ``PINCP`` exceeds 84770 (high income), else 0.
Samples whose prediction is NaN are excluded from the metrics and reported
as the ``failures`` fraction.
"""
import json
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from load_data import load_test_data
from predict import predict

X_df, y_df = load_test_data()
y_pred = predict(X_df)
y_true = (y_df['PINCP'] > 84770).astype(int)

# drop nans
nans = np.isnan(y_pred).ravel()
failures = np.sum(nans)
y_true_clean, y_pred_clean = y_true[~nans], y_pred[~nans]

# target_names are matched to the labels in sorted order, so label 0
# (PINCP <= 84770) is "Low Income" and label 1 is "High Income"; the old
# order swapped the two rows of the report. ``labels`` is pinned so the
# mapping also holds when one class is absent after dropping NaNs.
report = classification_report(y_true_clean,
                               y_pred_clean,
                               labels=[0, 1],
                               target_names=['Low Income', 'High Income'],
                               output_dict=True)
# Fraction of all test samples for which the prediction was NaN.
report['failures'] = float(failures / len(y_true))

with open('./report.json', 'w') as f:
    json.dump(report, f)
print('done')
batch_size = 1024 # mini batch size during training strides = 9 # CNN strides kernel_size = 9 # size of 1D convolutional kernels filters = 4 # number of convolutional kernels noise = 0.04 # noise factor for additive white Gaussian noise # save models here: model_path_name = r"specify_path_and_file_name" ### load data: #X,Y = load_data.load_pure_component_spectra_training_data() # load pure component spectra training dataset (NNi) X,Y = load_data.load_spectral_model_training_data() # OR load spectral model training dataset (NNii): X_val_meas, Y_val_meas = load_data.load_validation_data() # load validation data X_test_meas, Y_test_meas, Y_test_meas_nfNMR_IHM = load_data.load_test_data() # load test data label_factor = np.max(Y) # compute label scaling factor X_scaling_factor = np.max(X) # compute input scaling factor ### scale and reshape spectra, labels and ground truth / add channel dimension: X,X_val_meas,X_test_meas,Y,Y_val_meas,Y_test_meas,Y_test_meas_nfNMR_IHM = scale_reshape.scale_add_channels(X,X_val_meas,X_test_meas,Y,Y_val_meas,Y_test_meas,Y_test_meas_nfNMR_IHM,X_scaling_factor,label_factor) ### add noise to training data: X = X + np.random.normal(0,noise,(np.shape(X))) ### build and compile model: model = model_def.CNN_model(filters,kernel_size,X.shape[1:],strides)
# Fragment: starts inside a call that begins above this chunk and continues
# into an ``else`` branch whose ``if`` is not visible; the nesting below is
# reconstructed from syntax and should be checked against the full file.
# The first part prints the training history; the ``else`` part loads saved
# weights and predicts on the pickled test data.
        ])
    print(history.history.keys())
    print(history)
else:
    if len(saved_weights) == 0:
        # Nothing to load: prediction requires previously saved weights.
        print("network hasn't been trained!")
        sys.exit()
    else:
        test_sample_num = 0
        # NOTE(review): these open() handles are never closed, and
        # pickle.load must only be used on trusted files.
        test_sentences = pickle.load(open('sentences_test', 'rb'))
        test_roots = pickle.load(open('rootwords_test', 'rb'))
        test_features = pickle.load(open('features_test', 'rb'))
        X_test, X_unique, y_unique = load_test_data(test_sentences,
                                                    test_roots,
                                                    X_word_to_ix)
        # Pad at the end ('post') up to X_max_len.
        X_test = pad_sequences(X_test,
                               maxlen=X_max_len,
                               dtype='int32',
                               padding='post')
        model.load_weights(saved_weights)
        plot_model(model, to_file="model2_arch.png", show_shapes=True)
        # argmax over axis 2 (presumably the per-position class scores --
        # TODO confirm the model's output shape).
        predictions = np.argmax(model.predict(X_test), axis=2)
        print(predictions)
        sequences = []
sc_logloss = np.mean(list_logloss_score) sc_gini = np.mean(list_gini_score) if min_score > sc_gini: min_score = sc_gini min_params = params logger.info('logloss: {}, gini: {}'.format(sc_logloss, sc_gini)) logger.info('current min score: {}, params: {}'.format( min_score, min_params)) logger.info('minimum params: {}'.format(min_params)) logger.info('minimum gini: {}'.format(min_score)) clf = LogisticRegression(**min_params) clf.fit(x_train, y_train) logger.info('train end') df = load_test_data() x_test = df[use_cols].sort_values('id') logger.info('test data load end {}'.format(x_test.shape)) pred_test = clf.predict_proba(df)[:, 1] df_submit = pd.read_csv(SAMPLE_SUBMIT_FILE).sort_values('id') df_submit['target'] = pred_test df_submit.to_csv(DIR + 'submit.csv', index=False) logger.info('end')
import numpy as np import pandas as pd import warnings warnings.filterwarnings('ignore') from sklearn.ensemble import AdaBoostClassifier from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV from load_data import load_test_data, load_train_data from data_cleaning import clean_data # laod data train_data = clean_data(load_train_data()) train_data.drop(['PassengerId'], axis=1, inplace=True) test_data = clean_data(load_test_data()) # split training data into training/testing sets train,test=train_test_split(train_data,test_size=0.3,random_state=0,stratify=train_data['Survived']) train_X=train[train.columns[1:]] train_Y=train[train.columns[:1]] test_X=test[test.columns[1:]] test_Y=test[test.columns[:1]] X=train_data[train_data.columns[1:]] Y=train_data['Survived'] # Hyper-Parameter Tuning for AdaBoost n_estimators=list(range(100,1100,100)) learn_rate=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1] hyper={'n_estimators':n_estimators,'learning_rate':learn_rate} gd=GridSearchCV(estimator=AdaBoostClassifier(),param_grid=hyper,verbose=True)
# Fragment: tail of a training helper whose ``def`` line is above this
# chunk, followed by the start of the script entry point. The indentation
# is reconstructed from syntax.
    # Log the first 15 rows of df_tmp (column 0 formatted as a string,
    # column 1 as a float).
    # NOTE(review): assumes df_tmp has at least 15 rows; ``DataFrame.ix``
    # was removed in pandas 1.0 -- confirm the pandas version in use.
    for i in range(15):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(
            df_tmp.ix[i, 0], df_tmp.ix[i, 1]))
    return model


if __name__ == '__main__':
    logger.info('Start')
    # Disabled experiment: merge city population data into the train set.
    # temp1_df = load_train_data(nrows=ROW)
    # temp2_df = pd.read_csv('../input/city_population_wiki_v3.csv')
    # train_df = pd.merge(temp1_df, temp2_df, on='city', how='left')
    # del temp1_df, temp2_df
    # ROW limits how many rows each loader reads.
    train_df = load_train_data(nrows=ROW)
    logger.info('Train Data load end {}'.format(train_df.shape))
    test_df = load_test_data(nrows=ROW)
    logger.info('test load end {}'.format(test_df.shape))
    # Disabled experiments: alternative data sources (period / active data).
    # test_df = load_period_train_data(nrows=ROW)
    # logger.info('period train load end {}'.format(test_df.shape))
    # pr_test_df = load_period_test_data(nrows=ROW)
    # logger.info('period test load end {}'.format(pr_test_df.shape))
    # test_df = load_train_act_data(nrows=ROW)
    # tmp_df = pd.read_csv(TRN_PRED_FILE, index_col=['item_id'])
    # trn_act_df = load_train_act_data(nrows=ROW)
    # trn_act_df = trn_act_df.join(tmp_df, how='left')
    # train_df = pd.concat([train_df, trn_act_df], axis=0)
    # del trn_act_df, tmp_df
# Fragment: tail of a metrics-reporting function whose ``def`` line is above
# this chunk (presumably analysis_result(predict, true, figure=...), judging
# by the call in the __main__ block below). It prints and logs binary
# classification metrics and optionally shows them as a bar chart.
    f1 = f1_score(true, predict, average="binary")
    precision_binary, recall_binary, fbeta_score_binary, _ = precision_recall_fscore_support(
        true, predict, average="binary"
    )
    accuracy = accuracy_score(true, predict)
    # NOTE(review): the message labels f1 as "Macro-F score" although it is
    # computed with average="binary".
    print("正确率(Accuracy):%.3f\nF值(Macro-F score):%.3f" % (accuracy, f1))
    print("精确度(Precision):%.3f\n召回率:%.3f\nF值: %.3f" % (precision_binary, recall_binary, fbeta_score_binary))
    log_performance(accuracy, f1, precision_binary, recall_binary, len(true))
    # Stop here when no figure was requested.
    if figure == False:
        return
    # Draw the bar chart: one bar per metric.
    n_groups = 5
    values = (accuracy, f1, precision_binary, recall_binary, fbeta_score_binary)
    fig, ax = plt.subplots()
    index = np.arange(n_groups)
    bar_width = 0.35
    rects1 = plt.bar(index + bar_width / 2, values, bar_width, alpha=0.6, color="b")
    plt.xlabel("Result")
    plt.ylabel("Scores")
    plt.title("Experiment analysis")
    # NOTE(review): the ticks sit at index + bar_width while the bars are
    # placed at index + bar_width / 2, so labels may look off-centre; two
    # ticks share the label "F".
    plt.xticks(index + bar_width, ("Accuracy", "F", "Precision", "Recall", "F"))
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    # Compare the pickled predictions with the labels of the test set.
    predict = load_pickle("./data/predict_labels/predict_labels.p")
    _, true_labels = load_test_data()
    analysis_result(predict, true_labels)