def train(args, params): print("Loading training set...") train = load.load_dataset(params['train']) print("Loading dev set...") dev = load.load_dataset(params['dev']) print("Building preprocessor...") preproc = load.Preproc(*train) print("train_set_classes:", preproc.classes) print("Training size: " + str(len(train[0])) + " examples.") print("Dev size: " + str(len(dev[0])) + " examples.") save_dir = make_save_dir(params['save_dir'], args.experiment) util.save(preproc, save_dir) params.update({ "input_shape": [None, 1], "num_categories": len(preproc.classes) }) model = network.build_network(**params) stopping = keras.callbacks.EarlyStopping(patience=30) reduce_lr = keras.callbacks.ReduceLROnPlateau( factor=0.1, patience=2, min_lr=params["learning_rate"] * 0.001) checkpointer = keras.callbacks.ModelCheckpoint( filepath=get_filename_for_saving(save_dir), save_best_only=False) batch_size = params.get("batch_size", 32) # summary = str(model.summary(print_fn=lambda x: fh.write(x + '\n'))) # out = open("/content/ecg/report.txt",'w') # out.write(summary) # out.close if params.get("generator", False): train_gen = load.data_generator(batch_size, preproc, *train) dev_gen = load.data_generator(batch_size, preproc, *dev) model.fit_generator(train_gen, steps_per_epoch=int(len(train[0]) / batch_size), epochs=MAX_EPOCHS, validation_data=dev_gen, validation_steps=int(len(dev[0]) / batch_size), callbacks=[checkpointer, reduce_lr, stopping]) # util.learning_curve(history) else: train_x, train_y = preproc.process(*train) dev_x, dev_y = preproc.process(*dev) model.fit(train_x, train_y, batch_size=batch_size, epochs=MAX_EPOCHS, validation_data=(dev_x, dev_y), callbacks=[checkpointer, reduce_lr, stopping])
def train(args, params): print("Loading training set...") train = load.load_dataset(params['train']) print("Loading dev set...") dev = load.load_dataset(params['dev']) print("Building preprocessor...") preproc = load.Preproc(*train) print("Training size: " + str(len(train[0])) + " examples.") print("Dev size: " + str(len(dev[0])) + " examples.") save_dir = make_save_dir(params['save_dir'], args.experiment) util.save(preproc, save_dir) params.update({ "input_shape": [None, 1], "num_categories": len(preproc.classes) }) print(params) model = network.build_network(**params) stopping = keras.callbacks.EarlyStopping(patience=8) reduce_lr = keras.callbacks.ReduceLROnPlateau( factor=0.1, patience=2, min_lr=params["learning_rate"] * 0.001) checkpointer = keras.callbacks.ModelCheckpoint( filepath=get_filename_for_saving(save_dir), save_best_only=False) ckpt_best = keras.callbacks.ModelCheckpoint(os.path.join( save_dir, 'best.hdf5'), save_best_only=True) batch_size = params.get("batch_size", 32) if params.get("generator", False): train_gen = load.data_generator(batch_size, preproc, *train) dev_gen = load.data_generator(batch_size, preproc, *dev) model.fit_generator( train_gen, steps_per_epoch=int(len(train[0]) / batch_size), epochs=MAX_EPOCHS, validation_data=dev_gen, validation_steps=int(len(dev[0]) / batch_size), callbacks=[checkpointer, ckpt_best, reduce_lr, stopping]) else: train_x, train_y = preproc.process(*train) dev_x, dev_y = preproc.process(*dev) model.fit(train_x, train_y, batch_size=batch_size, epochs=MAX_EPOCHS, validation_data=(dev_x, dev_y), callbacks=[checkpointer, ckpt_best, reduce_lr, stopping])
def train(args, params): print("Loading training set...") train = load.load_dataset(params['train']) print("Loading dev set...") dev = load.load_dataset(params['dev']) print("Building preprocessor...") preproc = load.Preproc(*train) print("Training size: " + str(len(train[0])) + " examples.") print("Dev size: " + str(len(dev[0])) + " examples.") save_dir = make_save_dir(params['save_dir'], args.experiment) util.save(preproc, save_dir) params.update({ "input_shape": [None, 1], "num_categories": len(preproc.classes) }) model = network.build_network(**params) stopping = keras.callbacks.EarlyStopping(patience=8) reduce_lr = keras.callbacks.ReduceLROnPlateau( factor=0.1, patience=2, min_lr=params["learning_rate"] * 0.001) checkpointer = keras.callbacks.ModelCheckpoint( filepath=get_filename_for_saving(save_dir), save_best_only=False) batch_size = params.get("batch_size", 32) if params.get("generator", False): train_gen = load.data_generator(batch_size, preproc, *train) dev_gen = load.data_generator(batch_size, preproc, *dev) model.fit_generator( train_gen, steps_per_epoch=int(len(train[0]) / batch_size), epochs=MAX_EPOCHS, validation_data=dev_gen, validation_steps=int(len(dev[0]) / batch_size), callbacks=[checkpointer, reduce_lr, stopping]) else: train_x, train_y = preproc.process(*train) dev_x, dev_y = preproc.process(*dev) model.fit( train_x, train_y, batch_size=batch_size, epochs=MAX_EPOCHS, validation_data=(dev_x, dev_y), callbacks=[checkpointer, reduce_lr, stopping])
def predict(data_json, model_path):
    preproc = util.load(os.path.dirname(model_path))
    dataset = load.load_dataset(data_json)
    x, y = preproc.process(*dataset)

    # y_result from this first pass is never used below; the '# update' block
    # recomputes the labels together with the predictions.
    y_test = []
    for e, i in enumerate(dataset[1]):
        for j in range(len(i)):
            y_test.append(y[e, j, :])
    y_result = np.array(y_test)

    model = keras.models.load_model(model_path)
    probs = model.predict(x, verbose=1)

    # update start
    y_test = []
    y_predict = []
    for e, i in enumerate(dataset[1]):
        for j in range(len(i)):
            y_test.append(y[e, j, :])
            y_predict.append(probs[e, j, :])
    y_test = np.array(y_test)
    y_predict = np.array(y_predict)
    # update stop

    return y_test, y_predict
def eval_model(model_path):
    """Evaluate the trained model's accuracy."""
    eval_data, eval_labels = load_dataset(PATH)

    # Create the estimator
    cifar10_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                                model_dir=model_path)

    # Evaluate the model and print the results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x=eval_data,
                                                       y=eval_labels,
                                                       num_epochs=1,
                                                       shuffle=True)
    eval_results = cifar10_classifier.evaluate(input_fn=eval_input_fn)
    print("----------------------------------------\n"
          "Total training steps: {g_step}\n"
          "Number of test images: {num}\n"
          "Loss: {loss:0.4f}\n"
          "Accuracy: {accuracy:0.2f}%"
          "\n----------------------------------------\n".format(
              g_step=eval_results["global_step"],
              loss=eval_results["loss"],
              num=eval_data.shape[0],
              accuracy=eval_results["accuracy"] * 100))
def do_confusion():
    '''Generate and print a cross-validated confusion matrix.'''
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.cross_validation import KFold
    from sklearn.metrics import confusion_matrix

    # load data
    features, labels = load_dataset('seeds')

    # create a sklearn knn classifier
    classifier = KNeighborsClassifier(n_neighbors=4)
    # create a pipeline with prescaler + classifier
    classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])

    kf = KFold(len(features), n_folds=3, shuffle=True)
    names = list(set(labels))
    labels = np.array([names.index(ell) for ell in labels])
    preds = labels.copy()
    preds[:] = -1
    for train, test in kf:
        classifier.fit(features[train], labels[train])
        preds[test] = classifier.predict(features[test])

    cmat = confusion_matrix(labels, preds)
    print('Confusion matrix [rows represent true outcome, columns = predicted outcome]')
    print(cmat)

    # the explicit float() conversion is necessary in Python 2
    # (otherwise, the result is rounded to 0)
    acc = cmat.trace() / float(cmat.sum())
    print('Accuracy: {0:.1%}'.format(acc))
def predict(data_json, model_path):
    preproc = util.load(os.path.dirname(model_path))
    dataset = load.load_dataset(data_json)
    x, y = preproc.process(*dataset)

    model = keras.models.load_model(model_path)
    probs = model.predict(x, verbose=1)

    return probs
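# A minimal command-line entry point for the predict() helper above is sketched
# here; the argument names and the choice to save the probabilities to disk are
# assumptions for illustration, not the original script.
if __name__ == '__main__':
    import argparse
    import numpy as np

    parser = argparse.ArgumentParser()
    parser.add_argument("data_json", help="path to a dataset description JSON")
    parser.add_argument("model_path", help="path to a saved .hdf5 model")
    args = parser.parse_args()

    probs = predict(args.data_json, args.model_path)
    np.save("predictions.npy", probs)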
def predict(data_json, model_path):
    preproc = util.load(os.path.dirname(model_path))
    dataset = load.load_dataset(data_json)
    x, y = preproc.process(*dataset)

    model = keras.models.load_model(model_path)
    probs = model.predict(x, verbose=1)

    # cj add, used for debugging: argmax over the class axis
    # (axis=0 would take the index of the maximum along rows, axis=1 along columns)
    predict = np.argmax(probs, axis=2)

    return probs
def load_data(self):
    data_dir = self.config.data_dir
    batch_size = self.config.batch_size

    x_train, y_train, x_val, y_val, x_test, y_test = l.load_dataset(data_dir)
    print("data loaded successfully...")

    # number of iterations to go through the entire training set
    self.train_data = {'x': x_train, 'y': y_train}
    self.train_iterations_per_epoch = (x_train.shape[0] + batch_size - 1) // batch_size
    print("x_training shape : ", x_train.shape[0])
    print("y_training shape : ", y_train.shape[0])
    print("num of iterations on training data in one epoch : ",
          self.train_iterations_per_epoch)

    self.val_data = {'x': x_val, 'y': y_val}
    self.val_iterations_per_epoch = (x_val.shape[0] + batch_size - 1) // batch_size
    print("x_validation shape : ", x_val.shape[0])
    print("y_validation shape : ", y_val.shape[0])
    print("num of iterations on validation data in one epoch : ",
          self.val_iterations_per_epoch)

    self.test_data = {'x': x_test, 'y': y_test}
    # iterations to go through test data, +1 if data size not divisible by batch size
    self.test_iterations_per_epoch = (x_test.shape[0] + batch_size - 1) // batch_size
    print("x_test shape : ", x_test.shape[0])
    print("y_test shape : ", y_test.shape[0])
    print("num of iterations on test data in one epoch : ",
          self.test_iterations_per_epoch)

    print("data loading complete ...\n")
def run_evaluation(dataset_path, predictors, additional_roots=None,
                   max_number_of_queries=None, folds_num=5,
                   evaluation_functions=(('precision', 1), ('precision', 3),
                                         ('precision', 5), ('ndcg', 1),
                                         ('ndcg', 3), ('ndcg', 5),
                                         ('dcg', 1), ('dcg', 3), ('dcg', 5))):
    evaluation_results = [np.zeros(len(evaluation_functions))
                          for i in range(len(predictors))]
    for fold in load.load_dataset(dataset_path, additional_roots,
                                  max_number_of_queries, folds_num):
        (x_train, y_train, id_train), (x_test, y_test, id_test) = fold
        for index_predictor, predictor in enumerate(predictors):
            # sys.stderr.write(predictor.get_name() + '\n')
            # sys.stderr.flush()
            y_pred = predictor.learn_predict(x_train, y_train, x_test)
            for index_function, (func_type, rank) in enumerate(evaluation_functions):
                evaluation_results[index_predictor][index_function] += Evaluate.mean(
                    func_type, rank, y_test, y_pred, id_test)
    evaluation_results = [result / folds_num for result in evaluation_results]
    return evaluation_results
def predict(data_json, model_path):
    preproc = util.load(os.path.dirname(model_path))
    dataset = load.load_dataset(data_json)
    x, y = preproc.process(*dataset)

    model = keras.models.load_model(model_path)
    probs = model.predict(x, verbose=1)

    # evaluate the model
    score = model.evaluate(x, y)

    length_frames = []
    length_predicts = len(dataset[1])
    for length_predict in range(length_predicts):
        length_frames.append(len(dataset[1][length_predict]))

    predict_class = evaluate(probs, length_predicts, length_frames)
    print("The model {} is : {:.2%}".format(model.metrics_names[1], score[1]))

    return probs, predict_class
from __future__ import print_function
import numpy as np
from load import load_dataset

# sklearn implementation of knn
from sklearn.neighbors import KNeighborsClassifier

# load data
features, labels = load_dataset('seeds')


def leave_one_out():
    # create a sklearn knn classifier
    classifier = KNeighborsClassifier(n_neighbors=4)

    n = len(features)
    correct = 0.0
    # leave-one-out training
    for ignorefeat in range(n):
        training = np.ones(n, bool)
        # leave out
        training[ignorefeat] = 0
        testing = ~training
        # fit
        classifier.fit(features[training], labels[training])
        # predict
        prediction = classifier.predict(features[ignorefeat])
        # assumed continuation: tally and report the leave-one-out accuracy
        correct += float(prediction == labels[ignorefeat])
    print('Leave-one-out accuracy: {0:.1%}'.format(correct / n))
args = parser.parse_args()

## CUDA
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Setup
eval_dir = os.path.join(
    args.model_dir,
    f'trainsamples{args.trainsamples}'
    f'_testsamples{args.testsamples}'
    f'_translatetrain{args.translatetrain}'
    f'_translatetest{args.translatetest}')
params = utils.load_params(args.model_dir)

## Data
trainset, testset, num_classes = L.load_dataset(params['data'],
                                                data_dir=params['data_dir'])
X_train, y_train = F.get_samples(trainset, args.trainsamples)
X_test, y_test = F.get_samples(testset, args.testsamples)
if args.translatetrain:
    X_train, y_train = F.translate(X_train, y_train, stride=7)
if args.translatetest:
    X_test, y_test = F.translate(X_test, y_test, stride=7)
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

## Architecture
net = L.load_architecture(params['data'], params['arch'])
net = utils.load_ckpt(args.model_dir, 'model', net)
net = net.to(device)

## Forward
eval.cleanfile("confusion") eval.cleanfile("measurement") LOGISTIC_REGRESSION = False if len(sys.argv) > 1: if ('L' in sys.argv[1:]): LOGISTIC_REGRESSION = True if LOGISTIC_REGRESSION: print 'LogisticRegression is Active' else: print 'NB is Active' print 'Begin Loading samples...' train_samples, train_target = load.load_dataset(fname=load.filename['TRAIN'], numdocs=None) #dev_samples,dev_target = load.load_dataset(fname=load.filename['DEV'],numdocs=None); print 'number of training sample %d' % len(train_target) print 'Tags for the last train example', train_target[-1] #Classifier Model classifyers = [] classes = [] for each in train_target: classes.extend(x for x in each) classes = set(classes) print 'Total number of classes for this model ', len(classes) class_example_count = [] for each in classes: Y = [1 if each in x else 0 for x in train_target]
# gpu-1 adam0.001 reg0.001 a1.0
model_folder_path = "./saved_res_bn/cinc17"
arr = os.listdir(model_folder_path)
arr = sorted(arr)
last_folder = arr[-1]
model_folder_path = "{}/{}/*.hdf5".format(model_folder_path, last_folder)
arr_file = sorted(glob.glob(model_folder_path))
print('arr_file', arr_file)
file_name = arr_file[0]
model_path = file_name
print('Model Path : ', model_path)
# exit()
# model_path = "../../../saved_res_nobn/cinc17/1609222106-676/14.899-0.302-001-16.664-0.284.hdf5"

data = load.load_dataset(data_path)
preproc = util.load(os.path.dirname(model_path))
print('preproc window size : ', preproc.window_size)


class ScaleLayer(Layer):
    def __init__(self, alpha=0):
        super(ScaleLayer, self).__init__()
        self.alpha = alpha
        self.scale = K.variable(self.alpha, dtype='float32', name='alpha')

    def get_config(self):
        return {"alpha": self.alpha}

    def call(self, inputs):
        return inputs * self.scale
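# Because ScaleLayer is a custom layer, a checkpoint that contains it has to be
# deserialized with custom_objects. A minimal sketch, assuming keras is imported
# as in the neighbouring snippets and the .hdf5 file found above was saved with
# ScaleLayer in its graph:
import keras  # assumed; this snippet's own import block is not shown

model = keras.models.load_model(model_path,
                                custom_objects={'ScaleLayer': ScaleLayer})
x, y = preproc.process(*data)
probs = model.predict(x, verbose=1)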
def run_trained_flow(save_folder, data_folder, data_fname): print('--------------------') print('Train GLOW model ...') # Training settings parser = argparse.ArgumentParser(description='PyTorch GLOW') parser.add_argument( '--batch-size', type=int, default=100, help='input batch size for training (default: 100)') parser.add_argument( '--test-batch-size', type=int, default=1, help='input batch size for testing (default: 1000)') parser.add_argument( '--epochs', type=int, default=2000, help='number of epochs to train (default: 1000)') parser.add_argument( '--lr', type=float, default=1e-5, help='learning rate (default: 0.0001)') parser.add_argument( '--no-cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument( '--num-blocks', type=int, default=9, help='number of invertible blocks (default: 5)') parser.add_argument( '--num-hidden', type=int, default=256, help='number of hidden layer neurons') parser.add_argument( '--num-inputs', type=int, default=24, help='look-ahead horizon of forecasting') parser.add_argument( '--num-cond-inputs', type=int, default=24, help='length of historical data') parser.add_argument( '--seed', type=int, default=1, help='random seed (default: 1)') args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() device = torch.device("cuda:0" if args.cuda else "cpu") torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {} try: os.makedirs(save_folder) except OSError: pass # Load training_subset, valid_set and test_set # Just one split: 1-fold training_subset, valid_set, test_set = load.load_dataset(data_folder, data_fname) print('Training subset size:', training_subset.N) print('Validation set size:', valid_set.N) print('Test set size:', test_set.N) # Load point estimate #pred_on_train, pred_on_valid, pred_on_test = load.load_point_estimates(data_folder) # Transform to torch.Tensor # train_tensor = torch.from_numpy(training_subset.X) new_training_subset = np.concatenate((training_subset.y, training_subset.X),-1) #new_training_subset = np.concatenate((training_subset.y, pred_on_train),-1) mu = new_training_subset.mean() std = new_training_subset.std() print('Mean of new train set:',mu) print('Std of new train set:',std) train_tensor = torch.from_numpy((training_subset.X-mu)/std) #train_tensor= torch.from_numpy((pred_on_train-mu)/std) train_labels = torch.from_numpy((training_subset.y-mu)/std) train_dataset = torch.utils.data.TensorDataset(train_tensor, train_labels) valid_tensor = torch.from_numpy((valid_set.X-mu)/std) #valid_tensor = torch.from_numpy((pred_on_valid-mu)/std) valid_labels = torch.from_numpy((valid_set.y-mu)/std) valid_dataset = torch.utils.data.TensorDataset(valid_tensor, valid_labels) test_tensor = torch.from_numpy((test_set.X-mu)/std) #test_tensor = torch.from_numpy((pred_on_test-mu)/std) test_labels = torch.from_numpy((test_set.y-mu)/std) test_dataset = torch.utils.data.TensorDataset(test_tensor, test_labels) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, shuffle=False, **kwargs) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=args.test_batch_size, shuffle=False, drop_last=False, **kwargs) test_loader = torch.utils.data.DataLoader( test_dataset, batch_size=args.test_batch_size, shuffle=False, drop_last=False, **kwargs) num_inputs = args.num_inputs num_cond_inputs = args.num_cond_inputs num_hidden = args.num_hidden def build_model(): modules = [] 
mask = torch.arange(0, num_inputs) % 2 #mask = torch.ones(num_inputs) #mask[round(num_inputs/2):] = 0 mask = mask.to(device).float() # build each modules for _ in range(args.num_blocks): modules += [ fnn.ActNorm(num_inputs), fnn.LUInvertibleMM(num_inputs), fnn.CouplingLayer( num_inputs, num_hidden, mask, num_cond_inputs, s_act='tanh', t_act='relu') ] mask = 1 - mask # build model model = fnn.FlowSequential(*modules) # initialize for module in model.modules(): if isinstance(module, nn.Linear): nn.init.orthogonal_(module.weight) if hasattr(module, 'bias') and module.bias is not None: module.bias.data.fill_(0) model.to(device) return model model = build_model() # Save trained model #torch.save(best_model, save_folder+'best_model.pt') #model = torch.load(save_folder+'best_model.pt', #map_location=lambda storage, loc: storage) model = torch.load(save_folder+'best_model.pt') def calculate_dist(true, generated): distance = {} for t in range(generated.shape[1]): y = true[t] y_hat = generated[:,t] dist = [] for p in range(50, 101, 1): if p == 50: median = stats.scoreatpercentile(y_hat, p) dist.append(np.abs(y - median)) else: pl = 100 - p pu = p l = stats.scoreatpercentile(y_hat, pl) u = stats.scoreatpercentile(y_hat, pu) if y <= u and y >= l: dist.append(0.0) elif y < l: dist.append(np.abs(y - l)) else: dist.append(np.abs(y - u)) # distance for each hour t dist = np.array(dist) distance[t] = dist series = pd.DataFrame.from_dict(distance) series = series.mean(axis = 1) return series.values def test(model, test_loader): model.eval() median_pred = [] ground_truth = [] point_pred = [] pi_1 = [] pi_99 = [] pi_5 = [] pi_95 = [] pi_15 = [] pi_85 = [] pi_25 = [] pi_75 = [] distance = {} for index, data in enumerate(test_loader): #if index == 2: break inputs = data[0] cond_inputs = data[1] with torch.no_grad(): cond_inputs_ = cond_inputs.view(-1,num_cond_inputs) * torch.ones([5000,num_cond_inputs]) yt_hat = model.sample(5000, cond_inputs = cond_inputs_).detach().cpu().numpy() #test_data = test_set.X[index,:].flatten() input_data = inputs.detach().numpy().flatten() cond_data = cond_inputs.detach().numpy().flatten() input_data = input_data*std + mu cond_data = cond_data*std + mu synth = yt_hat*std + mu median = stats.scoreatpercentile(synth, 50, axis = 0) percentile1 = stats.scoreatpercentile(synth, 1, axis = 0) percentile99 = stats.scoreatpercentile(synth, 99, axis = 0) percentile5 = stats.scoreatpercentile(synth, 5, axis = 0) percentile95 = stats.scoreatpercentile(synth, 95, axis = 0) percentile15 = stats.scoreatpercentile(synth, 15, axis = 0) percentile85 = stats.scoreatpercentile(synth, 85, axis = 0) percentile25 = stats.scoreatpercentile(synth, 25, axis = 0) percentile75 = stats.scoreatpercentile(synth, 75, axis = 0) if index == 0: median_pred = median ground_truth = input_data pi_1 = percentile1 pi_99 = percentile99 pi_5 = percentile5 pi_95 = percentile95 pi_15 = percentile15 pi_85 = percentile85 pi_25 = percentile25 pi_75 = percentile75 else: median_pred = np.concatenate((median_pred, median)) ground_truth = np.concatenate((ground_truth, input_data)) pi_1 = np.concatenate((pi_1, percentile1)) pi_99 = np.concatenate((pi_99, percentile99)) pi_5 = np.concatenate((pi_5, percentile5)) pi_95 = np.concatenate((pi_95, percentile95)) pi_15 = np.concatenate((pi_15, percentile15)) pi_85 = np.concatenate((pi_85, percentile85)) pi_25 = np.concatenate((pi_25, percentile25)) pi_75 = np.concatenate((pi_75, percentile75)) # distance of test data {index} averaged over 24 hours distance[index] = 
calculate_dist(input_data, synth) GLOW_pred_dict = {} GLOW_pred_dict['median_pred'] = median_pred GLOW_pred_dict['ground_truth'] = ground_truth GLOW_pred_dict['pi1'] = pi_1 GLOW_pred_dict['pi99'] = pi_99 GLOW_pred_dict['pi5'] = pi_5 GLOW_pred_dict['pi95'] = pi_95 GLOW_pred_dict['pi15'] = pi_15 GLOW_pred_dict['pi85'] = pi_85 GLOW_pred_dict['pi25'] = pi_25 GLOW_pred_dict['pi75'] = pi_75 # Save GLOW_pred_dict as .csv file GLOW_pred = pd.DataFrame.from_dict(GLOW_pred_dict) GLOW_pred.to_csv(save_folder+'GLOW_pred.csv') GLOW_distance = pd.DataFrame.from_dict(distance) GLOW_distance.to_csv(save_folder+'GLOW_distance.csv') #series = series.mean(axis = 1) #GLOW_distance = series.values # Save GLOW_distance as an array #np.save(save_folder+'GLOW_distance.npy', GLOW_distance) return None test(model, test_loader)
print "File " + filename + " does not exist!" sys.exit(0) else: with f: loaded_obj = cPickle.load(f) return loaded_obj parser2 = argparse.ArgumentParser() parser2.add_argument("teX", help="Provide filename for test dataset you want to use (reads). It should have been in 'media/'" "directory and filename should end with '-teX.fasta.gz'", type=str) parser2.add_argument("best_model", help="Provide filename for the best model. Filename must include directory. Must be of" "format 'best_model_with_params-[timestamp].pkl'.", type=str) parser2.add_argument("-teY", help="Provide filename for test dataset you want to use (classes). It should have been in 'media/'" "directory and filename should end with '-teY.fasta.gz'", type=str) results = parser2.parse_args() teX = np.asarray(load_dataset(results.teX)) teX = teX.reshape(-1, 1, 1, teX.shape[1]) teY = load_dataset(results.teY) best_model = results.best_model # teX_filename = "media/2114bef791b6111f12575439a7bbed73_4_0.200_100_1_0_20-teX.fasta.gz" # teY_filename = "media/2114bef791b6111f12575439a7bbed73_4_0.200_100_1_0_20-teY.fasta.gz" # model_filename = "models/best_model_with_params-1468304923-improving-eval.pkl" # teX = np.asarray(load_dataset(teX_filename)) # teX = teX.reshape(-1, 1, 1, teX.shape[1]) # teY = np.asarray(load_dataset(teY_filename)) # best_model = model_filename # initialize matrices X = T.ftensor4() Y = T.fmatrix()
import load as ld
from lasagne import layers
from lasagne.nonlinearities import softmax
from nolearn.lasagne import NeuralNet
import cPickle as pickle
from nolearn.lasagne import BatchIterator

path = '/home/tg/Documents/bogo/converted_database/'
X, Y = ld.load_dataset(folder_path=path, filter_input=True, max_number_states=10)
_, num_features, size, _ = X.shape
print X.shape
# exit(0)

NUM_FILTERS = 32

net = NeuralNet(
    layers=[
        ('input', layers.InputLayer),
        # ('pad1', layers.shape.PadLayer),
        ('conv1', layers.Conv2DLayer),
        ('conv2', layers.Conv2DLayer),
        ('hidden4', layers.DenseLayer),
        ('output', layers.DenseLayer),
    ],
    input_shape=(None, num_features, size, size),
    conv1_num_filters=NUM_FILTERS,
    conv1_filter_size=(5, 5),
    conv2_num_filters=NUM_FILTERS,
def train_W_flow(save_folder, data_folder, data_fname): print('--------------------') print('Train GLOW model ...') # Training settings parser = argparse.ArgumentParser(description='PyTorch GLOW') parser.add_argument('--batch-size', type=int, default=100, help='input batch size for training (default: 100)') parser.add_argument('--test-batch-size', type=int, default=1, help='input batch size for testing (default: 1000)') parser.add_argument('--epochs', type=int, default=2, help='number of epochs to train (default: 1000)') parser.add_argument('--lr', type=float, default=1e-5, help='learning rate (default: 0.0001)') parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument('--num-blocks', type=int, default=9, help='number of invertible blocks (default: 5)') parser.add_argument('--num-hidden', type=int, default=256, help='number of hidden layer neurons') parser.add_argument('--num-inputs', type=int, default=24, help='look-ahead horizon of forecasting') parser.add_argument('--num-cond-inputs', type=int, default=24, help='length of historical data') parser.add_argument( '--weight', type=int, default=1, help='trade off KL-divergence and Wasserstein distance') parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)') args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() device = torch.device("cuda:0" if args.cuda else "cpu") torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {} try: os.makedirs(save_folder) except OSError: pass # Load training_subset, valid_set and test_set # Just one split: 1-fold training_subset, valid_set, test_set = load.load_dataset( data_folder, data_fname) print('Training subset size:', training_subset.N) print('Validation set size:', valid_set.N) print('Test set size:', test_set.N) # Load point estimate #pred_on_train, pred_on_valid, pred_on_test = load.load_point_estimates(data_folder) # Transform to torch.Tensor # train_tensor = torch.from_numpy(training_subset.X) new_training_subset = np.concatenate( (training_subset.y, training_subset.X), -1) #new_training_subset = np.concatenate((training_subset.y, pred_on_train),-1) mu = new_training_subset.mean() std = new_training_subset.std() print('Mean of new train set:', mu) print('Std of new train set:', std) train_tensor = torch.from_numpy((training_subset.X - mu) / std) #train_tensor= torch.from_numpy((pred_on_train-mu)/std) train_labels = torch.from_numpy((training_subset.y - mu) / std) train_dataset = torch.utils.data.TensorDataset(train_tensor, train_labels) valid_tensor = torch.from_numpy((valid_set.X - mu) / std) #valid_tensor = torch.from_numpy((pred_on_valid-mu)/std) valid_labels = torch.from_numpy((valid_set.y - mu) / std) valid_dataset = torch.utils.data.TensorDataset(valid_tensor, valid_labels) test_tensor = torch.from_numpy((test_set.X - mu) / std) #test_tensor = torch.from_numpy((pred_on_test-mu)/std) test_labels = torch.from_numpy((test_set.y - mu) / std) test_dataset = torch.utils.data.TensorDataset(test_tensor, test_labels) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=False, **kwargs) valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=args.test_batch_size, shuffle=False, drop_last=False, **kwargs) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, shuffle=False, drop_last=False, **kwargs) num_inputs 
= args.num_inputs num_cond_inputs = args.num_cond_inputs num_hidden = args.num_hidden def build_model(): modules = [] mask = torch.arange(0, num_inputs) % 2 #mask = torch.ones(num_inputs) #mask[round(num_inputs/2):] = 0 mask = mask.to(device).float() # build each modules for _ in range(args.num_blocks): modules += [ fnn.ActNorm(num_inputs), fnn.LUInvertibleMM(num_inputs), fnn.CouplingLayer(num_inputs, num_hidden, mask, num_cond_inputs, s_act='tanh', t_act='relu') ] mask = 1 - mask # build model model = fnn.FlowSequential(*modules) # initialize for module in model.modules(): if isinstance(module, nn.Linear): nn.init.orthogonal_(module.weight) if hasattr(module, 'bias') and module.bias is not None: module.bias.data.fill_(0) model.to(device) return model model = build_model() optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-6) # Build Discriminator class Flatten(nn.Module): def forward(self, input): return input.view(input.size()[0], -1) class Reshape(nn.Module): def forward(self, input): return input.view(input.size()[0], 1, -1) class Discriminator(nn.Module): def __init__(self): super(Discriminator, self).__init__() self.layers = nn.Sequential( Reshape(), nn.Conv1d(in_channels=1, out_channels=8, kernel_size=5, stride=1, dilation=1), nn.BatchNorm1d(8), nn.LeakyReLU(), nn.Conv1d(in_channels=8, out_channels=8, kernel_size=5, stride=1, dilation=1), nn.BatchNorm1d(8), nn.LeakyReLU(), nn.MaxPool1d(3, stride=2), Flatten(), nn.Linear(19 * 8, 1)) def forward(self, input): return self.layers(input) def clamp_weight(model): for module in model.modules(): if isinstance(module, nn.Conv1d) or isinstance( module, nn.Linear) or isinstance(module, nn.BatchNorm1d): module.weight.data = torch.clamp(module.weight.data, -1e-2, 1e-2) if module.bias is not None: module.bias.data = torch.clamp(module.bias.data, -1e-2, 1e-2) discriminator = Discriminator() disc_optimizer = optim.Adam(discriminator.parameters(), lr=args.lr, weight_decay=1e-6) def train_discriminator(data, cond_data): for param in discriminator.parameters(): param.requires_grad = True u = torch.Tensor(data.shape[0], data.shape[1]).normal_() synth_data, _ = model.forward(u, cond_data, mode='inverse') for i in range(5): # using detach to freeze the flow-based generative model disc_optimizer.zero_grad() _data = torch.cat((data, cond_data), 1) _synth_data = torch.cat((synth_data, cond_data), 1) loss = -discriminator(_data).mean() + discriminator( _synth_data.detach()).mean() loss.backward() disc_optimizer.step() clamp_weight(discriminator) def calculate_W_distance(data, cond_data, discriminator): u = torch.Tensor(data.shape[0], data.shape[1]).normal_() synth_data, _ = model.forward(u, cond_data, mode='inverse') # Freeze the discriminator for evaluating Wasserstein distance best_discriminator = discriminator for param in best_discriminator.parameters(): param.requires_grad = False _data = torch.cat((data, cond_data), 1) _synth_data = torch.cat((synth_data, cond_data), 1) W_distance = best_discriminator(_data).mean() - best_discriminator( _synth_data).mean() return W_distance # Start training generative flow train_loss = [] def train(epoch): model.train() for batch_idx, data in enumerate(train_loader): if isinstance(data, list): if len(data) > 1: cond_data = data[1].float() cond_data = cond_data.to(device) else: cond_data = None data = data[0] data = data.to(device) optimizer.zero_grad() loss = -model.log_probs(data, cond_data).mean() train_loss.append(loss.item()) # Adding Wasserstein distance as regularizer train_discriminator(data, 
cond_data) W_distance = calculate_W_distance(data, cond_data, discriminator) # Total loss loss += args.weight * W_distance loss.backward() optimizer.step() def validate(epoch, model, loader, prefix='Validation'): model.eval() val_loss = 0 for batch_idx, data in enumerate(loader): if isinstance(data, list): if len(data) > 1: cond_data = data[1].float() cond_data = cond_data.to(device) else: cond_data = None data = data[0] data = data.to(device) with torch.no_grad(): val_loss += -model.log_probs(data, cond_data).sum().item() return val_loss / len(loader.dataset) best_validation_loss = float('inf') best_validation_epoch = 0 best_model = model valid_loss = [] for epoch in range(args.epochs): print('\nEpoch: {}'.format(epoch)) train(epoch) validation_loss = validate(epoch, model, valid_loader) valid_loss.append(validation_loss) if epoch - best_validation_epoch >= 30 and epoch > 100: #if epoch - best_validation_epoch >= 30: break if validation_loss < best_validation_loss: best_validation_epoch = epoch best_validation_loss = validation_loss best_model = copy.deepcopy(model) print( 'Best validation at epoch {}: Average Log Likelihood in nats: {:.4f}' .format(best_validation_epoch, -best_validation_loss)) plt.figure(figsize=(10, 10)) plt.plot(range(len(valid_loss)), valid_loss) plt.title('validation loss over epochs') plt.savefig(save_folder + 'valid_loss.png') # Save trained model torch.save(best_model, save_folder + 'best_model.pt') def calculate_dist(true, generated): distance_of_one_sample = [] for t in range(generated.shape[1]): y = true[t] y_hat = generated[:, t] dist = [] for p in range(50, 101, 1): if p == 50: median = stats.scoreatpercentile(y_hat, p) dist.append(np.abs(y - median)) else: pl = 100 - p pu = p l = stats.scoreatpercentile(y_hat, pl) u = stats.scoreatpercentile(y_hat, pu) if y <= u and y >= l: dist.append(0.0) elif y < l: dist.append(np.abs(y - l)) else: dist.append(np.abs(y - u)) dist = np.array(dist) if t == 0: distance_of_one_sample = dist else: distance_of_one_sample += dist return distance_of_one_sample / 24 def test(model, test_loader): model.eval() median_pred = [] ground_truth = [] point_pred = [] pi_1 = [] pi_99 = [] pi_5 = [] pi_95 = [] pi_15 = [] pi_85 = [] pi_25 = [] pi_75 = [] distance = {} for index, data in enumerate(test_loader): if index == 2: break inputs = data[0] cond_inputs = data[1] with torch.no_grad(): cond_inputs_ = cond_inputs.view( -1, num_cond_inputs) * torch.ones([5000, num_cond_inputs]) yt_hat = model.sample( 5000, cond_inputs=cond_inputs_).detach().cpu().numpy() #test_data = test_set.X[index,:].flatten() input_data = inputs.detach().numpy().flatten() cond_data = cond_inputs.detach().numpy().flatten() input_data = input_data * std + mu cond_data = cond_data * std + mu synth = yt_hat * std + mu median = stats.scoreatpercentile(synth, 50, axis=0) percentile1 = stats.scoreatpercentile(synth, 1, axis=0) percentile99 = stats.scoreatpercentile(synth, 99, axis=0) percentile5 = stats.scoreatpercentile(synth, 5, axis=0) percentile95 = stats.scoreatpercentile(synth, 95, axis=0) percentile15 = stats.scoreatpercentile(synth, 15, axis=0) percentile85 = stats.scoreatpercentile(synth, 85, axis=0) percentile25 = stats.scoreatpercentile(synth, 25, axis=0) percentile75 = stats.scoreatpercentile(synth, 75, axis=0) if index == 0: median_pred = median ground_truth = input_data pi_1 = percentile1 pi_99 = percentile99 pi_5 = percentile5 pi_95 = percentile95 pi_15 = percentile15 pi_85 = percentile85 pi_25 = percentile25 pi_75 = percentile75 else: median_pred = 
np.concatenate((median_pred, median)) ground_truth = np.concatenate((ground_truth, input_data)) pi_1 = np.concatenate((pi_1, percentile1)) pi_99 = np.concatenate((pi_99, percentile99)) pi_5 = np.concatenate((pi_5, percentile5)) pi_95 = np.concatenate((pi_95, percentile95)) pi_15 = np.concatenate((pi_15, percentile15)) pi_85 = np.concatenate((pi_85, percentile85)) pi_25 = np.concatenate((pi_25, percentile25)) pi_75 = np.concatenate((pi_75, percentile75)) # distance of test data {index} averaged over 24 hours distance[index] = calculate_dist(input_data, synth) GLOW_pred_dict = {} GLOW_pred_dict['median_pred'] = median_pred GLOW_pred_dict['ground_truth'] = ground_truth GLOW_pred_dict['pi1'] = pi_1 GLOW_pred_dict['pi99'] = pi_99 GLOW_pred_dict['pi5'] = pi_5 GLOW_pred_dict['pi95'] = pi_95 GLOW_pred_dict['pi15'] = pi_15 GLOW_pred_dict['pi85'] = pi_85 GLOW_pred_dict['pi25'] = pi_25 GLOW_pred_dict['pi75'] = pi_75 # Save GLOW_pred_dict as .csv file GLOW_pred = pd.DataFrame.from_dict(GLOW_pred_dict) GLOW_pred.to_csv(save_folder + 'GLOW_pred.csv') GLOW_distance = pd.DataFrame.from_dict(distance) GLOW_distance.to_csv(save_folder + 'GLOW_distance.csv') #series = series.mean(axis = 1) #GLOW_distance = series.values # Save GLOW_distance as an array #np.save(save_folder+'GLOW_distance.npy', GLOW_distance) return None test(model, test_loader)
def test_iris():
    features, labels = load_dataset('iris')
    assert len(features[0]) == 4
    assert len(features)
    assert len(features) == len(labels)
def test_seeds():
    features, labels = load_dataset('seeds')
    assert len(features[0]) == 7
    assert len(features)
    assert len(features) == len(labels)
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

from load import load_dataset
import numpy as np
from knn import learn_model, apply_model, accuracy

features, labels = load_dataset("seeds")


def cross_validate(features, labels):
    error = 0.0
    for fold in range(10):
        training = np.ones(len(features), bool)
        training[fold::10] = 0
        testing = ~training
        model = learn_model(1, features[training], labels[training])
        test_error = accuracy(features[testing], labels[testing], model)
        error += test_error
    return error / 10.0


error = cross_validate(features, labels)
print("Ten fold cross-validated error was {0:.1%}.".format(error))

features -= features.mean(0)
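# The trailing mean-subtraction above looks like the start of a second run on
# z-scored features; a short sketch of how that experiment would plausibly
# continue (the std scaling and the repeated call are assumptions inferred from
# the preceding code, not the original file):
features /= features.std(0)
error = cross_validate(features, labels)
print("Ten fold cross-validated error after z-scoring was {0:.1%}.".format(error))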
def test_experiment_projects_with_embeddings(): libraries, keywords, lib_key_graph, test_domains_libraries, test_domains_keywords, train_domains_libraries, \ train_domains_keywords = load_dataset(number_of_methods=100000, num_of_keywords_after_dot=0) libraries, keywords, idf_dict = load_model_data() have_embeddings = "True" random_predict = "False" similarity_methods = ['cosine'] proximities = ['both'] idf_uses = ['True'] embeddings_first = nx.read_gpickle( 'line_algo\data\embedding_first-order.gpickle') embeddings_second = nx.read_gpickle( 'line_algo\data\embedding_second-order.gpickle') results = open('results.csv', mode='w') results = csv.writer(results, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) results.writerow([ "1st Prox.", "2st Prox.", 'idf', "Similarity", "HitRate@10", "AUC", "NDCG", "Coverage" ]) for proximity in proximities: for idf_use in idf_uses: for similarity_method in similarity_methods: embeddings = {} for node in lib_key_graph.nodes(): if proximity == 'first': embeddings[node] = embeddings_first[node] elif proximity == 'second': embeddings[node] = embeddings_second[node] else: embeddings[node] = np.concatenate( (embeddings_first[node], 0.3 * embeddings_second[node]), axis=None) results.writerow([proximity, similarity_method, idf_use]) coverage = [] hit_rate = [] auc = [] ndcg = [] if similarity_method == 'function': # Create training set for the model of the similarity prediction training_features, training_values = relation_model.create_training_set( lib_key_graph, embeddings, libraries, keywords) # Train the relation model scaler, model = relation_model.train_relation_model( training_features, training_values) libraries_predicted_list = [] # Store the results in a file for domain in test_domains_libraries.keys(): # Check if libraries was identified in this domain if len(test_domains_libraries[domain]) >= 0 and len( test_domains_libraries[domain]) > 5: path_keywords = test_domains_keywords[domain] path_libraries = test_domains_libraries[domain] print('Predict path: ', domain) print("Number of libraries in this file: ", len(test_domains_libraries[domain])) # Calculate similarity and save it in a dictionary if similarity_method == "function": sim = caclulate_function_similarity(libraries, embeddings, lib_key_graph, path_keywords, scaler, model, idf_dict, idf=idf_use) else: sim = calculate_similarity( libraries, embeddings, lib_key_graph, path_keywords, idf_dict, method=similarity_method, idf=idf_use) # print(sim) # Get the largest 5 values predicted_libraries = nlargest(10, sim, key=sim.get) print("Libraries predicted: ", predicted_libraries) print("Path libraries:", path_libraries, "\n") libraries_predicted_list = libraries_predicted_list + predicted_libraries for library in predicted_libraries: if library in path_libraries: print(library) # Hit rate for Top-5 libraries hit_rate_temp = calculate_hit_rate( path_libraries, predicted_libraries) hit_rate.append(hit_rate_temp) print("Hit Rate @", len(predicted_libraries), ": ", hit_rate_temp) # Calculate AUC labels = [ 1 if library in path_libraries else 0 for library in sim.keys() ] conf = list(sim.values()) if 1 in labels and 0 in labels: auc_temp = roc_auc_score(np.array(labels), np.array(conf)) auc.append(auc_temp) print("ROC AUC: ", auc_temp, "\n") # Calculate Normalized Cumulative Score # Relevance score=1 if a library that was predicted is in path's libraries ndcg_temp = ndcg_score([np.array(labels)], [np.array(conf)]) ndcg.append(ndcg_temp) print("Discounted Cumulative Gain: ", 
ndcg_score([np.array(labels)], [np.array(conf)]), '\n') libraries_predicted_list = list(set(libraries_predicted_list)) results.writerow([ sum(hit_rate) / len(hit_rate), sum(auc) / len(auc), sum(ndcg) / len(ndcg), len(libraries_predicted_list) / len(libraries) * 100 ]) results.writerow([np.std(hit_rate), np.std(auc), np.std(ndcg)]) coverage.append(len(libraries_predicted_list) / len(libraries))
# encoding:utf-8
from load import load_dataset, get_samples
import json

raw_train_data = "../data/input/train-data-test"
raw_dev_data = "../data/input/dev-data-test"
raw_test_data = "../data/input/test-data-test"

data_size = 100000
n_prev_sents = 5   # context length (number of previous sentences)
max_n_words = 20   # maximum sentence length in words

if __name__ == "__main__":
    # load dataset
    train_data = load_dataset(raw_train_data, data_size)
    dev_data = load_dataset(raw_dev_data, data_size)
    test_data = load_dataset(raw_test_data, data_size)

    # create_samples
    train_samples = get_samples(threads=train_data,
                                n_prev_sents=n_prev_sents,
                                max_n_words=max_n_words,
                                pad=False)

    with open('train_cand_2.txt', 'a') as fw:
        for sample in train_samples:
            sample_dict = {}
            sample_dict['context'] = sample.context
            sample_dict['response'] = sample.response
            sample_dict['spk_agents'] = sample.spk_agents
            sample_dict['true_adr'] = sample.true_adr
            sample_dict['true_res'] = sample.true_res
            # sample_dict['agent_index_dict'] = sample.agent_index_dict
def train_AR(data_folder, data_fname, GLOW_data_folder, GLOW_data_fname, model_save_folder): # Training settings print('----------------------------------') print('Pre-train Autoregressive model ...') parser = argparse.ArgumentParser(description='PyTorch GLOW') parser.add_argument( '--epochs', type=int, default=2000, help='number of epochs to train (default: 500)') parser.add_argument( '--lr', type=float, default=1e-5, help='learning rate (default: 0.0001)') parser.add_argument( '--num-inputs', type=int, default=24, help='look-ahead horizon of forecasting') parser.add_argument( '--num-cond-inputs', type=int, default=24, help='length of historical data') parser.add_argument( '--order', type=int, default=24, help='order of Autoregressive model') parser.add_argument( '--delta', type=int, default=1e-4, help='stopping criterion') args = parser.parse_args() #try: #os.makedirs(model_save_folder) #except OSError: #pass # Define ARModel class class ARModel(nn.Module): def __init__(self, input_dim, output_dim): super(ARModel, self).__init__() self.linear = nn.Linear(input_dim, output_dim) def forward(self, x): out = self.linear(x) return out input_dim = args.num_cond_inputs output_dim = 1 model = ARModel(input_dim,output_dim) # Load dataset training_subset, valid_set, test_set = load.load_dataset(data_folder, data_fname) print('Training subset size:', training_subset.N) print('Validation set size:', valid_set.N) print('Test set size:', test_set.N) # Transform to torch.Tensor train_tensor = torch.from_numpy(training_subset.X) train_labels = torch.from_numpy(training_subset.y) train_dataset = torch.utils.data.TensorDataset(train_tensor, train_labels) valid_tensor = torch.from_numpy(valid_set.X) valid_labels = torch.from_numpy(valid_set.y) valid_dataset = torch.utils.data.TensorDataset(valid_tensor, valid_labels) test_tensor = torch.from_numpy(test_set.X) test_labels = torch.from_numpy(test_set.y) test_dataset = torch.utils.data.TensorDataset(test_tensor, test_labels) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size = 1, shuffle = False) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size = 1, shuffle = False, drop_last = False) test_loader = torch.utils.data.DataLoader( test_dataset, batch_size = 1, shuffle = False, drop_last = False) # Define loss function and optimizer criterion = nn.MSELoss()# Mean Squared Loss optimizer = torch.optim.SGD(model.parameters(), lr = args.lr) #Stochastic Gradient Descent # Train AR train_loss = [] def train(epoch): model.train() for batch_idx, data in enumerate(train_loader): if isinstance(data, list): if len(data) > 1: cond_data = data[1].squeeze() else: cond_data = None input_data = data[0].squeeze() optimizer.zero_grad() num_inputs = input_data.shape[0] history = cond_data pred = [] for t in range(num_inputs): yt_hat = model.forward(history) history = torch.cat((history, yt_hat)) history = history[-24:] if t == 0: pred = yt_hat else: pred = torch.cat((pred, yt_hat)) loss = criterion(pred, input_data) train_loss.append(loss.detach().item()) loss.backward() optimizer.step() def validate(epoch, model, valid_loader): model.eval() val_loss = 0 for batch_idx, data in enumerate(valid_loader): if isinstance(data, list): if len(data) > 1: cond_data = data[1].squeeze() else: cond_data = None input_data = data[0].squeeze() with torch.no_grad(): history = cond_data pred = [] for t in range(input_data.shape[0]): yt_hat = model.forward(history) history = torch.cat((history, yt_hat)) history = history[-24:] if t == 0: pred = yt_hat else: pred 
= torch.cat((pred, yt_hat)) val_loss += criterion(pred, input_data).detach().item() return val_loss / valid_set.N best_validation_loss = float('inf') best_validation_epoch = 0 best_model = model valid_loss = [] for epoch in range(args.epochs): print('\nEpoch: {}'.format(epoch)) train(epoch) validation_loss = validate(epoch, model, valid_loader) valid_loss.append(validation_loss) if epoch - best_validation_epoch >= 10: break if validation_loss < best_validation_loss - args.delta: best_validation_epoch = epoch best_validation_loss = validation_loss best_model = copy.deepcopy(model) print( 'Best validation at epoch {}: Average mse: {:.4f}'. format(best_validation_epoch, best_validation_loss)) plt.figure(figsize=(10,10)) plt.plot(range(len(valid_loss)), valid_loss) plt.title('validation loss over epochs') plt.savefig(model_save_folder+'pretrain_AR_valid_loss.png') # Test pre-trained AR def test(model, test_loader): model.eval() predictions = [] test_data = [] for index, data in enumerate(test_loader): #if index == 2: break input_data = data[0].squeeze() cond_data = data[1].squeeze() with torch.no_grad(): history = cond_data pred = [] for t in range(input_data.shape[0]): yt_hat = model.forward(history) history = torch.cat((history, yt_hat)) history = history[-24:] if t == 0: pred = yt_hat else: pred = torch.cat((pred, yt_hat)) if index == 0: predictions = pred test_data = input_data else: predictions = torch.cat((predictions, pred)) test_data = torch.cat((test_data, input_data)) return predictions, test_data predictions, test_data = test(best_model, test_loader) # Calculate MSE, plot predictions versus test_data print('Pretrain MSE on test data:', criterion(predictions, test_data).detach().item()) # Save trained model #torch.save(best_model, model_save_folder+'best_model.pt') torch.save(best_model.state_dict(), model_save_folder+'pretrained_ARmodel.pt') # generate point estimates run_ar.generate_point_est(best_model, GLOW_data_folder, GLOW_data_fname) return best_model
def evaluate(models, classes):
    models, fname = zip(*models)
    fname = fname[0]
    print 'Loading Test dataset...'
    dev_samples, gold = load.load_dataset(fname=load.filename['TEST'], numdocs=1000)

    [tp, fp, fn, tn] = [0.0, 0.0, 0.0, 0.0]
    keyword_stats = []
    confusion = []
    for each in models:
        confusion.append({e: [[], []] for e in classes})
        keyword_stats.append({e: [0.0, 0.0, len(dev_samples) * 1.0, 0.0] for e in classes})
        print 'Evaluation Cache for %s is not present' % fname
        pred = each.classify(dev_samples)  # a sorted vector of strings
        assert (len(pred) == len(dev_samples))
        for no, each in enumerate(pred):
            print '\rVerifying output for example %d' % no,
            assert (type(each) == list)
            p = set(each) & classes
            q = set(gold[no]) & classes
            r = p & q
            tp += len(p & q)
            tn += len(classes) - len(p | q)
            fp += len(p) - len(p & q)
            fn += len(q) - len(p & q)
            for every in r:
                keyword_stats[-1][every][0] += 1  # tp
            for every in p - r:
                keyword_stats[-1][every][1] += 1  # fp
            for every in p | q:
                keyword_stats[-1][every][2] -= 1  # tn
            for every in q - r:
                keyword_stats[-1][every][3] += 1  # fn
            # for every in r: confusion[-1][every][0].append(exampleno)  # tp
            for every in p - r:
                confusion[-1][every][0].append(no)  # fp
            # for every in p | q: keyword_stats[-1][every][2].append(exampleno)  # tn
            for every in q - r:
                confusion[-1][every][1].append(no)  # fn

    # write into file
    # print keyword_stats[-1]
    # print [tp, tn, fp, fn]
    with open(fname, 'wb') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(classes))
        writer.writeheader()
        for each in keyword_stats:
            writer.writerow(each)

    prec, rec = tp / (tp + fp + 0.01), tp / (tp + fn + 0.01)
    # print prec, rec
    print '\n'
    print '[tp,fp,tn,fn]', keyword_stats[-1]
    print '------tp------tn------fp------fn------pr-------re------f1------'
    print '----------------------------Model %s--------------------------' % fname
    print '------%d------%d------%d------%d------%.2f------%.2f------%.2f------' % (
        tp, tn, fp, fn, prec, rec, 2 * prec * rec / (prec + rec + 0.01))

    x = '%s \n' % confusion[-1]
    for each in confusion[-1]:
        x += 'confusion in %s \n' % each
        for no in confusion[-1][each]:
            for examp in no[:3]:
                x += '%s \n' % examp
                x += '%s %s\n' % (dev_samples[examp], gold[examp])
        x += '---------------------------------\n'
    writeintofile(x, "confusion")
import sys

import load
from normalize import normalize_features
from print_matrices import print_matrices
from numeric_verification import verify
import train

if len(sys.argv) >= 4:
    network_filename = sys.argv[1]
    weights_filename = sys.argv[2]
    dataset_name = sys.argv[3]
else:
    print("\nUsage:\t python backpropagation.py network weights dataset\n")
    sys.exit()

dataset = load.load_dataset(dataset_name)
network = load.load_network_structure(network_filename)
initial_weights = load.load_weights(weights_filename)

normalize_features(dataset)

# Compute gradients using every instance in the dataset.
gradients = train.calculate_gradients(dataset, 0, len(dataset), initial_weights,
                                      network['regularization'])

print_matrices(gradients)
import scipy.stats as sst
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from tensorflow import keras
import sklearn.metrics as skm

import numpy_encoder

default_lr = 0.001

address_request_round = "http://127.0.0.1:8000/round"
address_global_weight = "http://127.0.0.1:8000/weight"

global_round = 0
current_round = 0
max_round = 100
delay_time = 15

train = load.load_dataset("data/train_2.json")
val = load.load_dataset("data/validation_2.json")

preproc = load.preproc(*train)
train_x, train_y = preproc.process(*train)
val_x, val_y = preproc.process(*val)

print("train size : {}, {}".format(len(train_x), len(train_y)))
print("val size : {}, {}".format(len(val_x), len(val_y)))

with open("data/validation_2.json", "rb") as fid:
    val_labels = [json.loads(l)['labels'] for l in fid]

counts = collections.Counter(preproc.class_to_int[l[0]] for l in val_labels)
counts = sorted(counts.most_common(), key=lambda x: x[0])
counts = list(zip(*counts))[1]
import simfin as sf
from load import load_dataset, load_shareprices
import pathlib
import os
from dotenv import load_dotenv
from predict import train, predict, predict_similiar

load_dotenv()
SIMFIN_API_KEY = os.getenv('SIMFIN_API_KEY', 'free')

MODELS_DIR = pathlib.Path('./models')
DATA_DIR = pathlib.Path('./data')

# LOAD
shareprices_df = load_shareprices(simfin_api_key=SIMFIN_API_KEY)
general_df = load_dataset(dataset='general',
                          simfin_api_key=SIMFIN_API_KEY,
                          shareprices_df=shareprices_df)
banks_df = load_dataset(dataset='banks',
                        simfin_api_key=SIMFIN_API_KEY,
                        shareprices_df=shareprices_df)
insurance_df = load_dataset(dataset='insurance',
                            simfin_api_key=SIMFIN_API_KEY,
                            shareprices_df=shareprices_df)

# TRAIN
general_model = train(general_df,
                      winsor_quantile=0.01,
                      model_name='general_model',
                      feature_name='general',
                      param=dict(learning_rate=0.01,
                                 max_depth=3,
from matplotlib.colors import ListedColormap
from main.ch02.utils import CHART_DIR
import load

feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',
    'length of kernel groove',
]

data = load.load_dataset("seeds")
# np.set_printoptions(threshold=np.nan)


def drawFigure(features, labels, neighbors=1, parameters=[], figName="no_name"):
    names = sorted(set(labels))
    labels = np.array([names.index(ell) for ell in labels])
    idX, idY = parameters[0], parameters[1]
    print("Xaxis : {0} - Yaxis : {1}".format(idX, idY))

    # define lower and upper limit on both axes (x=area, y=compactness)
    x0, y0 = features[:, idX].min() * 0.9, features[:, idY].min() * 0.9
    x1, y1 = features[:, idX].max() * 1.1, features[:, idY].max() * 1.1

    # create a meshgrid resulting of 2 X/Y-Linespaces
def train_AR(data_folder, data_fname, save_folder): # Training settings parser = argparse.ArgumentParser(description='PyTorch GLOW') parser.add_argument('--epochs', type=int, default=2000, help='number of epochs to train (default: 1000)') parser.add_argument('--lr', type=float, default=1e-5, help='learning rate (default: 0.0001)') parser.add_argument('--num-inputs', type=int, default=24, help='look-ahead horizon of forecasting') parser.add_argument('--num-cond-inputs', type=int, default=24, help='length of historical data') parser.add_argument('--order', type=int, default=24, help='order of Autoregressive model') parser.add_argument('--delta', type=int, default=1e-4, help='stopping criterion') args = parser.parse_args() try: os.makedirs(save_folder) except OSError: pass # Define ARModel class class ARModel(nn.Module): def __init__(self, input_dim, output_dim): super(ARModel, self).__init__() self.linear = nn.Linear(input_dim, output_dim) def forward(self, x): out = self.linear(x) return out input_dim = args.num_cond_inputs output_dim = 1 model = ARModel(input_dim, output_dim) # Load dataset training_subset, valid_set, test_set = load.load_dataset( data_folder, data_fname) print('Training subset size:', training_subset.N) print('Validation set size:', valid_set.N) print('Test set size:', test_set.N) # Transform to torch.Tensor train_tensor = torch.from_numpy(training_subset.X) train_labels = torch.from_numpy(training_subset.y) train_dataset = torch.utils.data.TensorDataset(train_tensor, train_labels) valid_tensor = torch.from_numpy(valid_set.X) valid_labels = torch.from_numpy(valid_set.y) valid_dataset = torch.utils.data.TensorDataset(valid_tensor, valid_labels) test_tensor = torch.from_numpy(test_set.X) test_labels = torch.from_numpy(test_set.y) test_dataset = torch.utils.data.TensorDataset(test_tensor, test_labels) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=False) valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=1, shuffle=False, drop_last=False) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False, drop_last=False) # Define loss function and optimizer criterion = nn.MSELoss() # Mean Squared Loss optimizer = torch.optim.SGD(model.parameters(), lr=args.lr) #Stochastic Gradient Descent # Train AR train_loss = [] def train(epoch): model.train() for batch_idx, data in enumerate(train_loader): if isinstance(data, list): if len(data) > 1: cond_data = data[1].squeeze() else: cond_data = None input_data = data[0].squeeze() optimizer.zero_grad() num_inputs = input_data.shape[0] history = cond_data pred = [] for t in range(num_inputs): yt_hat = model.forward(history) history = torch.cat((history, yt_hat)) history = history[-24:] if t == 0: pred = yt_hat else: pred = torch.cat((pred, yt_hat)) loss = criterion(pred, input_data) train_loss.append(loss.detach().item()) loss.backward() optimizer.step() def validate(epoch, model, valid_loader): model.eval() val_loss = 0 for batch_idx, data in enumerate(valid_loader): if isinstance(data, list): if len(data) > 1: cond_data = data[1].squeeze() else: cond_data = None input_data = data[0].squeeze() with torch.no_grad(): history = cond_data pred = [] for t in range(input_data.shape[0]): yt_hat = model.forward(history) history = torch.cat((history, yt_hat)) history = history[-24:] if t == 0: pred = yt_hat else: pred = torch.cat((pred, yt_hat)) val_loss += criterion(pred, input_data).detach().item() return val_loss / valid_set.N best_validation_loss = float('inf') 
best_validation_epoch = 0 best_model = model valid_loss = [] for epoch in range(args.epochs): print('\nEpoch: {}'.format(epoch)) train(epoch) validation_loss = validate(epoch, model, valid_loader) valid_loss.append(validation_loss) if epoch - best_validation_epoch >= 10: break if validation_loss < best_validation_loss - args.delta: best_validation_epoch = epoch best_validation_loss = validation_loss best_model = copy.deepcopy(model) print('Best validation at epoch {}: Average mse: {:.4f}'.format( best_validation_epoch, best_validation_loss)) plt.figure(figsize=(10, 10)) plt.plot(range(len(valid_loss)), valid_loss) plt.title('validation loss over epochs') plt.savefig(save_folder + 'valid_loss.png') # Save trained model torch.save(best_model.state_dict(), save_folder + 'best_model.pt') #torch.save(best_model, save_folder+'best_model.pt') # Adding Gaussian noise to AR's point estimates to have scenarios def test(model, test_loader): model.eval() predictions = [] test_data = [] for index, data in enumerate(test_loader): #if index == 2: break input_data = data[0].squeeze() cond_data = data[1].squeeze() with torch.no_grad(): history = cond_data pred = [] for t in range(input_data.shape[0]): yt_hat = model.forward(history) history = torch.cat((history, yt_hat)) history = history[-24:] if t == 0: pred = yt_hat else: pred = torch.cat((pred, yt_hat)) if index == 0: predictions = pred test_data = input_data else: predictions = torch.cat((predictions, pred)) test_data = torch.cat((test_data, input_data)) return predictions, test_data predictions, test_data = test(best_model, test_loader) res = predictions - test_data res_std = res.std() noise = np.random.normal(0, res_std.item(), (5000, predictions.shape[0])) predictions = predictions.numpy() test_data = test_data.numpy() scenarios = predictions + noise def calculate_dist(true, generated): distance = {} for t in range(generated.shape[1]): y = true[t] y_hat = generated[:, t] dist = [] for p in range(50, 101, 1): if p == 50: median = stats.scoreatpercentile(y_hat, p) dist.append(np.abs(y - median)) else: pl = 100 - p pu = p l = stats.scoreatpercentile(y_hat, pl) u = stats.scoreatpercentile(y_hat, pu) if y <= u and y >= l: dist.append(0.0) elif y < l: dist.append(np.abs(y - l)) else: dist.append(np.abs(y - u)) # distance for each hour t dist = np.array(dist) distance[t] = dist series = pd.DataFrame.from_dict(distance) series = series.mean(axis=1) return series.values distance = {} start = 0 stride = 24 for sample_index in range(test_set.N): if start + stride <= scenarios.shape[1]: generated = scenarios[:, start:start + stride] true = test_data[start:start + stride] start += stride # Accumulate over samples distance[sample_index] = calculate_dist(true, generated) #distance = distance/test_set.N #np.save(save_folder+'AR_distance.npy',distance) AR_distance = pd.DataFrame.from_dict(distance) AR_distance.to_csv(save_folder + 'AR_distance.csv') AR_pred_dict = {} AR_pred_dict['point_pred'] = predictions AR_pred_dict['ground_truth'] = test_data AR_pred_dict['median'] = stats.scoreatpercentile(scenarios, 50, axis=0) AR_pred_dict['pi1'] = stats.scoreatpercentile(scenarios, 1, axis=0) AR_pred_dict['pi99'] = stats.scoreatpercentile(scenarios, 99, axis=0) AR_pred_dict['pi5'] = stats.scoreatpercentile(scenarios, 5, axis=0) AR_pred_dict['pi95'] = stats.scoreatpercentile(scenarios, 95, axis=0) AR_pred_dict['pi15'] = stats.scoreatpercentile(scenarios, 15, axis=0) AR_pred_dict['pi85'] = stats.scoreatpercentile(scenarios, 85, axis=0) AR_pred_dict['pi25'] = 
stats.scoreatpercentile(scenarios, 25, axis=0) AR_pred_dict['pi75'] = stats.scoreatpercentile(scenarios, 75, axis=0) AR_pred = pd.DataFrame.from_dict(AR_pred_dict) AR_pred.to_csv(save_folder + 'AR_pred.csv')
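The block above turns the AR point forecasts into probabilistic scenarios by adding zero-mean Gaussian noise with the residuals' standard deviation and then reading percentile bands off the scenario matrix. Below is a self-contained toy sketch of just that step; the arrays are illustrative only, and only the 5000-scenario count mirrors the code above.

# Sketch: Gaussian-noise scenarios around point forecasts (toy data).
import numpy as np
from scipy import stats

point_pred = np.array([10.0, 11.0, 12.5, 11.8])   # hypothetical AR point forecasts
ground_truth = np.array([9.6, 11.4, 12.1, 12.3])  # hypothetical observations

res_std = (point_pred - ground_truth).std()       # spread of the residuals
scenarios = point_pred + np.random.normal(0.0, res_std, (5000, point_pred.shape[0]))

# percentile bands per time step, as in AR_pred_dict above
median = stats.scoreatpercentile(scenarios, 50, axis=0)
pi5 = stats.scoreatpercentile(scenarios, 5, axis=0)
pi95 = stats.scoreatpercentile(scenarios, 95, axis=0)
print(median, pi5, pi95)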
# nearest neighbor classification # When classifying a new element, look at the training data, find the training object closest to it (its nearest neighbor), and return that object's label as the answer. import numpy as np from load import load_dataset from sklearn import model_selection from sklearn.neighbors import KNeighborsClassifier from matplotlib.colors import ListedColormap feature_names = [ 'area', 'perimeter', 'compactness', 'length of kernel', 'width of kernel', 'asymmetry coefficient', 'length of kernel groove', ] data = load_dataset('seeds') features = data['features'] target = data['target'] knn = KNeighborsClassifier(n_neighbors=1) kf = model_selection.KFold(n_splits=5, shuffle=False) means = [] for training, testing in kf.split(features): knn.fit(features[training], target[training]) prediction = knn.predict(features[testing]) curmean = np.mean(prediction == target[testing]) means.append(curmean)
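The comment at the top of the snippet above states the nearest-neighbor rule in words; here is a minimal from-scratch sketch of the same idea (the function and variable names are hypothetical, and the snippet itself relies on sklearn's KNeighborsClassifier instead).

# Sketch: the 1-NN rule described in the comment above, written out by hand.
import numpy as np

def nearest_neighbor_predict(train_features, train_labels, new_point):
    # Euclidean distance from the new element to every training example
    dists = np.sqrt(((train_features - new_point) ** 2).sum(axis=1))
    # the label of the closest training example is returned as the answer
    return train_labels[np.argmin(dists)]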
plt.xlabel('iterations (per fives)') plt.title("Learning rate =" + str(learning_rate)) plt.show() parameters = sess.run(parameters) print("Parameters have been trained!") correct_prediction = tf.equal(tf.argmax(Z3), tf.argmax(Y)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) print("Train Accuracy:", accuracy.eval({X: X_train, Y: Y_train})) print("Test Accuracy:", accuracy.eval({X: X_test, Y: Y_test})) return parameters X_train_orig, Y_train_orig, X_test_orig, Y_test_orig, classes = load.load_dataset( ) X_train_flatten = X_train_orig.reshape(X_train_orig.shape[0], -1).T X_test_flatten = X_test_orig.reshape(X_test_orig.shape[0], -1).T X_train = X_train_flatten / 255. X_test = X_test_flatten / 255. Y_train = tools.convert_to_one_hot(Y_train_orig, 6) Y_test = tools.convert_to_one_hot(Y_test_orig, 6) parameters = model(X_train, Y_train, X_test, Y_test)
np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape) if COLOUR_FIGURE: cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)]) else: cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)]) plt.xlim(x0, x1) plt.ylim(y0, y1) plt.xlabel(feature_names[0]) plt.ylabel(feature_names[2]) plt.pcolormesh(X, Y, C, cmap=cmap) if COLOUR_FIGURE: cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)]) plt.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap) else: for lab, ma in zip(range(3), "Do^"): plt.plot(features[labels == lab, 0], features[ labels == lab, 2], ma, c=(1., 1., 1.)) features, labels = load_dataset('seeds') names = sorted(set(labels)) labels = np.array([names.index(ell) for ell in labels]) train_plot(features, labels) plt.savefig('figure4.png') features -= features.mean(0) features /= features.std(0) train_plot(features, labels) plt.savefig('figure5.png')
import util import load import network import keras import keras.backend as K from keras.callbacks import LearningRateScheduler from keras.models import Model import scipy.io as scio from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score MAX_EPOCHS = 160 batch_size = 32 if __name__ == '__main__': params = util.config() save_dir = params['save_dir'] print("Loading training set...") train = load.load_dataset(params['train']) print("Loading dev set...") dev = load.load_dataset(params['dev']) print("Building preprocessor...") preproc = load.Preproc(*train) print("Training size: " + str(len(train[0])) + " examples.") print("Dev size: " + str(len(dev[0])) + " examples.") params.update({ "input_shape": [8000, 1], "num_categories": len(preproc.classes) }) # create the cl-pcg-net model = network.build_network(**params)
# This code is supporting material for the book # Building Machine Learning Systems with Python # by Willi Richert and Luis Pedro Coelho # published by PACKT Publishing # # It is made available under the MIT License # Basic imports from __future__ import print_function import numpy as np from load import load_dataset # Import sklearn implementation of KNN from sklearn.neighbors import KNeighborsClassifier features, labels = load_dataset('aaj_data') classifier = KNeighborsClassifier(n_neighbors=4) n = len(features) correct = 0.0 for ei in range(n): training = np.ones(n, bool) training[ei] = 0 testing = ~training classifier.fit(features[training], labels[training]) pred = classifier.predict(features[ei:ei + 1]) correct += (pred[0] == labels[ei]) print('Result of leave-one-out: {}'.format(correct / n)) # Import KFold object from sklearn.model_selection import KFold
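The snippet stops right after importing KFold. Below is a short sketch of how that import is typically put to use with the classifier defined above; the fold count, shuffling, and the summary print are assumptions rather than the book's exact continuation, and it reuses features, labels and classifier from the snippet above.

# Sketch: 5-fold cross-validation with the KFold object just imported.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
fold_accuracies = []
for training, testing in kf.split(features):
    classifier.fit(features[training], labels[training])
    prediction = classifier.predict(features[testing])
    fold_accuracies.append(np.mean(prediction == labels[testing]))
print('Mean accuracy over folds: {:.1%}'.format(np.mean(fold_accuracies)))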
import collections import csv import json import numpy as np import scipy.stats as sst import sklearn.metrics as skm from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support, roc_auc_score import architecture import load def predict(parser): val = load.load_dataset("data/validation_2.json") preproc = load.Preproc(*val) args = parser.parse_args() print("args model : ", args.model) model = architecture.build_model() model.load_weights(args.model) with open("data/validation_2.json", "rb") as fid: val_labels = [json.loads(l)['labels'] for l in fid] counts = collections.Counter(preproc.class_to_int[l[0]] for l in val_labels) counts = sorted(counts.most_common(), key=lambda x: x[0]) counts = list(zip(*counts))[1] print("counts : ", counts) smooth = 500 counts = np.array(counts)[None, None, :] total = np.sum(counts) + counts.shape[1] print("total : ", total) prior = (counts + smooth) / float(total) # ??? print("prior : ", prior) ecgs, committee_labels = preproc.process(*val) m_probs = model.predict(ecgs) committee_labels = np.argmax(committee_labels, axis=2) committee_labels = committee_labels[:, 0] print("===================") temp = [] preds = np.argmax(m_probs / prior, axis=2) for i, j in zip(preds, val_labels): t = sst.mode(i[:len(j) - 1])[0][0] temp.append(t) #print(i[:len(j)-1]) preds = temp #print("preds : \n", preds) report = skm.classification_report(committee_labels, preds, target_names=preproc.classes, digits=3) scores = skm.precision_recall_fscore_support(committee_labels, preds, average=None) print("report : \n", report) cm = confusion_matrix(committee_labels, preds) print("confusion matrix : \n", cm) f1 = f1_score(committee_labels, preds, average='micro') #print("f1_score : ", f1) # ***roc_auc_score - m_probs*** s_probs = np.sum(m_probs, axis=1) s_probs = s_probs / 71 # 71 = maximum element count of one data set -> normalization #ovo_auroc = roc_auc_score(committee_labels, s_probs, multi_class='ovo') ovr_auroc = roc_auc_score(committee_labels, s_probs, multi_class='ovr') print("ovr_auroc : ", ovr_auroc) #print("ovo_auroc : ", ovo_auroc) ''' bootstrapping ''' n_bootstraps = 100 np.random.seed(3033) total_precision = [] total_recall = [] total_f1 = [] total_auroc = [] precision = [] recall = [] f1 = [] total = [] for j in range(n_bootstraps): indices = np.random.randint(0, len(m_probs), 100) #print("indices : ", len(indices)) if len(np.unique(committee_labels[indices])) < 2: continue sub_labels = [] sub_result = [] sub_probs = [] #print(indices) for i in indices: sub_labels.append(committee_labels[i]) sub_result.append(preds[i]) sub_probs.append(m_probs[i]) s_scores = precision_recall_fscore_support(sub_labels, sub_result, labels=[0, 1, 2, 3], average=None) # ***roc_auc_score - m_probs*** s_p = np.sum(sub_probs, axis=1) s_p = s_p / 71 # 71 = maximum element count of one data set -> normalization # ovo_auroc = roc_auc_score(committee_labels, s_probs, multi_class='ovo') #print(sub_labels) #print(s_p) try: s_auroc = roc_auc_score(sub_labels, s_p, multi_class='ovr') except ValueError: s_auroc = -1 #print(s_scores) precision.append(np.array(s_scores[0])) recall.append(np.array(s_scores[1])) f1.append(np.array(s_scores[2])) #auroc.append(s_auroc) total_precision.append(np.average(s_scores[0])) total_recall.append(np.average(s_scores[1])) total_f1.append(np.average(s_scores[2])) total_auroc.append(s_auroc) total_precision.sort() total_recall.sort() total_f1.sort() total_auroc.sort() total_auroc = [a for a in total_auroc if a != -1] # drop bootstrap rounds where AUROC could not be computed #print(total_auroc) ''' when bootstrapping, some classes may be absent from a resample ''' precision = np.array(precision) precision[precision == .0] = np.nan recall = np.array(recall) recall[recall == .0] = np.nan f1 = np.array(f1) f1[f1 == .0] = np.nan #print(total_auroc) for i in range(4): pre = precision[:, i] pre.sort() rec = recall[:, i] rec.sort() f = f1[:, i] f.sort() pre = np.round(pre[int(len(pre) * 0.025):int(len(pre) * 0.975)], 3) rec = np.round(rec[int(len(rec) * 0.025):int(len(rec) * 0.975)], 3) f = np.round(f[int(len(f) * 0.025):int(len(f) * 0.975)], 3) ''' print(i, " : ", "{0} ({1}, {2})".format(np.round(np.nanmean(pre), 3), round(pre[0], 3), round(pre[-1], 3)), " : ", "{0} ({1}, {2})".format(np.round(np.nanmean(rec), 3), round(rec[0], 3), round(rec[-1], 3)), " : ", "{0} ({1}, {2})".format(np.round(np.nanmean(f), 3), round(f[0], 3), round(f[-1], 3))) ''' item = [ i, "{0} ({1}, {2})".format(np.round(np.nanmean(pre), 3), round(np.nanmin(pre), 3), round(np.nanmax(pre), 3)), "{0} ({1}, {2})".format(np.round(np.nanmean(rec), 3), round(np.nanmin(rec), 3), round(np.nanmax(rec), 3)), "{0} ({1}, {2})".format(np.round(np.nanmean(f), 3), round(np.nanmin(f), 3), round(np.nanmax(f), 3)) ] total.append(item) total_auroc = np.round( total_auroc[int(len(total_auroc) * 0.025):int(len(total_auroc) * 0.975)], 3) total_precision = np.round( total_precision[int(len(total_precision) * 0.025):int(len(total_precision) * 0.975)], 3) total_recall = np.round( total_recall[int(len(total_recall) * .025):int(len(total_recall) * .975)], 3) total_f1 = np.round( total_f1[int(len(total_f1) * .025):int(len(total_f1) * .975)], 3) with open(args.file_name, "w", newline='') as file: writer = csv.writer(file) writer.writerow(["", "precision", "recall", "f1-score", "auroc"]) writer.writerow([ "", "{0} ({1}, {2})".format(np.round(np.average(scores[0]), 3), total_precision[0], total_precision[-1]), "{0} ({1}, {2})".format(np.round(np.average(scores[1]), 3), total_recall[0], total_recall[-1]), "{0} ({1}, {2})".format(np.round(np.average(scores[2]), 3), total_f1[0], total_f1[-1]), "{0} ({1}, {2})".format(np.round(ovr_auroc, 3), total_auroc[0], total_auroc[-1]), ]) for i in total: writer.writerow(i)
pred[-1].append(everyclassifyer) # print('Tags for this one', pred) # assert(False) print(pred) return pred if __name__ == '__main__': # A refined form of what we are doing looks very similar to LDA/PGM. # Deep learning that learns features end-to-end would be better. print('Begin loading samples...') train_samples, train_target = load.load_dataset(fname=load.filename['TRAIN'], numdocs=None) print('number of training samples: %d' % len(train_target)) print('Tags for the last train example', train_target[-1]) c = defaultdict(float) for each in train_target: for everytag in each: c[everytag] += 1 y = list(filter(lambda x: c[x] >= 500.0, c.keys())) # y = ['java'] print(y) M1 = search_classify(True, y, 'bow_bigram') M1.train(train_target, train_samples) eval.evaluate([(M1, u'tfidf_LR.csv')], set(M1.classifyers.keys()))