def val(net, dataset, criterion, max_iter=100):
    """Validate the mask-regression model on up to max_iter batches.

    Computes the average criterion loss and a per-batch "accuracy"
    (fraction of batches whose loss is below a fixed threshold).
    Relies on module-level `opt`, `utils`, and the pre-allocated
    buffers `image` / `mask`.
    """
    print('Start val')
    # Inference only: freeze the network under evaluation (the original froze
    # the global `model`; the `net` parameter is what is actually evaluated).
    for p in net.parameters():
        p.requires_grad = False
    net.eval()
    data_loader = torch.utils.data.DataLoader(dataset,
                                              shuffle=True,
                                              batch_size=opt.batchSize,
                                              num_workers=int(opt.workers))
    val_iter = iter(data_loader)
    n_correct = 0
    loss_avg = utils.averager()
    max_iter = min(max_iter, len(data_loader))
    for i in range(max_iter):
        data = next(val_iter)
        cpu_images, cpu_masks = data
        batch_size = cpu_images.size(0)
        utils.loadData(image, cpu_images)
        utils.loadData(mask, cpu_masks)
        # Channel 2 of the mask tensor is the regression target.
        mask1d = mask[:, 2, :, :]
        preds = net(image)
        preds = preds.view(batch_size, 32, 100)
        cost = criterion(preds, mask1d)
        loss_avg.add(cost)
        # There is no per-sample ground truth here; a batch is counted as
        # "correct" when its loss is below an empirical threshold.
        if cost < 0.007:
            n_correct += 1
    # BUG FIX: n_correct counts batches, so the denominator is the number of
    # batches actually processed — the original divided by
    # max_iter * opt.batchSize, which the author had already flagged as wrong.
    accuracy = n_correct / float(max_iter) if max_iter else 0.0
    print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))
def trainBatch(net, criterion, optimizer):
    # One training step: MSE between the predicted 32x100 map and channel 2
    # of the ground-truth mask.
    # NOTE(review): the forward pass and zero_grad use the global `model`,
    # not the `net` parameter — presumably they are the same object; confirm
    # at the call site.
    data = train_iter.next()
    cpu_images, cpu_masks = data
    # print('cpu_images.shape ',cpu_images.shape)
    # print('cpu_masks.shape ',cpu_masks.shape)
    batch_size = cpu_images.size(0)
    # Copy the batch into the pre-allocated global buffers.
    utils.loadData(image, cpu_images)
    utils.loadData(mask, cpu_masks)
    # print('mask.shape:',mask.shape)
    # print('batch_size ',batch_size)
    mask1d = mask[:, 2, :, :]
    '''t, l = converter.encode(cpu_texts)
    utils.loadData(text, t)
    utils.loadData(length, l)'''
    # print('image.shape:',image.shape)
    # print('mask1d.shape):',mask1d.shape)
    preds = model(image)
    # print('preds.shape',preds.shape)
    preds = preds.view(batch_size, 32, 100)
    #preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
    #cost = criterion(preds, text, preds_size, length) / batch_size
    # MSELoss already averages over every element, so there is no need to
    # divide by the batch size again.
    cost = criterion(preds, mask1d)
    # print('single_cost:',cost)
    model.zero_grad()
    cost.backward()
    optimizer.step()
    return cost
def trainBatch(net, criterion, optimizer):
    # One training step for the bounding-box branch: predict a mask from the
    # RGB image, concatenate it with the image, then regress bounding boxes.
    # NOTE(review): uses the global models `model_mask` / `model_bb` rather
    # than the `net` parameter, and only `model_bb` is zeroed — gradients flow
    # through `model_mask` but its grads are never cleared here; confirm this
    # is intended (e.g. model_mask is frozen elsewhere).
    data = train_iter.next()
    cpu_images, cpu_bbs = data
    #print('img: ',cpu_images.shape)
    #print('mask: ',cpu_masks.shape)
    batch_size = cpu_images.size(0)
    # Copy the batch into the pre-allocated global buffers.
    utils.loadData(image_rgb, cpu_images)
    utils.loadData(bb, cpu_bbs)
    '''t, l = converter.encode(cpu_texts)
    utils.loadData(text, t)
    utils.loadData(length, l)'''
    # print('val_image:',image.shape)
    # print('val_mask:',mask.shape)
    mask = model_mask(image_rgb)
    # print('val_preds:',preds.shape)
    mask = mask.view(batch_size, 1, 32, 100)
    #preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
    #cost = criterion(preds, text, preds_size, length) / batch_size
    # Stack the predicted mask on top of the RGB channels -> (N, 4, H, W).
    image_2channel = torch.cat((mask, image_rgb), dim=1)
    preds_bb = model_bb(image_2channel)
    #print('preds_bb shape:',preds_bb.shape)
    #print('bb shape:',bb.shape)
    cost = criterion(preds_bb.permute(1, 0, 2), bb)
    model_bb.zero_grad()
    cost.backward()
    optimizer.step()
    return cost
def trainBatch(net, criterionAttention, criterionCTC, optimizer):
    """One joint training step for the dual-head (CTC + attention) CRNN.

    Encodes the batch's texts with both converters, runs the global `crnn`,
    computes both losses and backpropagates their sum.
    Returns (costCTC, costAttention, cost).
    """
    data = train_iter.next()
    cpu_images, cpu_texts = data
    batch_size = cpu_images.size(0)
    utils.loadData(image, cpu_images)
    # Each head has its own label encoding.
    tAttention, lAttention = converterAttention.encode(cpu_texts)
    tCTC, lCTC = converterCTC.encode(cpu_texts)
    utils.loadData(textAttention, tAttention)
    utils.loadData(lengthAttention, lAttention)
    utils.loadData(textCTC, tCTC)
    utils.loadData(lengthCTC, lCTC)
    if opt.lang:
        predsCTC, predsAttention = crnn(image, lengthAttention, textAttention)
    else:
        # BUG FIX: the original passed the undefined name `imageAttention`
        # here, which would raise NameError on the first non-lang batch.
        predsCTC, predsAttention = crnn(image, lengthAttention)
    costAttention = criterionAttention(predsAttention, textAttention)
    preds_size = Variable(torch.IntTensor([predsCTC.size(0)] * batch_size))
    # CTCLoss sums over the batch, hence the explicit division.
    costCTC = criterionCTC(predsCTC, textCTC, preds_size, lengthCTC) / batch_size
    crnn.zero_grad()
    # Combined objective; costCTC is moved to GPU to match costAttention.
    cost = costCTC.cuda() + costAttention
    cost.backward()
    optimizer.step()
    return costCTC, costAttention, cost
def test_by_xzy(net, test_dataset):
    # Run CTC inference over the whole test set and dump
    # (image name, predicted string) pairs to a CSV file.
    print('Start test')
    for p in net.parameters():
        p.requires_grad = False  # inference only; freeze all weights
    net.eval()
    data_loader = torch.utils.data.DataLoader(test_dataset,
                                              shuffle=True,
                                              batch_size=64,
                                              num_workers=int(2))
    val_iter = iter(data_loader)
    img_name_List = []
    img_pred_List = []
    for i in range(len(data_loader)):
        data = val_iter.next(
        )  #batch must contain tensors, numbers, dicts or lists; found <class 'PIL.Image.Image'>
        i += 1  # NOTE(review): no effect — `i` is reset by the for loop
        cpu_images, cpu_img_name = data
        batch_size = cpu_images.size(0)
        utils.loadData(image, cpu_images)
        # t, l = converter.encode(cpu_img_name)
        # utils.loadData(text, t)
        # utils.loadData(length, l)
        preds = net(image)
        preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
        _, preds = preds.max(2)
        # preds = preds.squeeze(2)  # newer PyTorch: max(2) no longer keeps dim 2
        preds = preds.transpose(1, 0).contiguous().view(-1)
        sim_preds = converter.decode(
            preds.data, preds_size.data,
            raw=False)  # sim_preds are the decoded strings, e.g. "XG78233838"
        for pred, name in zip(sim_preds, cpu_img_name):
            img_name_List.append(str(name))
            img_pred_List.append(str(pred))
    img_pred_List = np.array(img_pred_List)
    img_name_List = np.array(img_name_List)
    print(img_pred_List[0])
    print(img_name_List[0])
    df = pd.DataFrame({
        'name': img_name_List,
        'label': img_pred_List
    })  #ValueError: arrays must all be same length
    column_order = ['name', 'label']
    df = df[column_order]
    # predictionFile = '../../../../dataset_formal/classify_data/crnnData/result/result_crnn_with_ctpn.csv'
    # predictionFile = '../../../../dataset_formal/classify_data/crnnData/result/result_crnn_with_1800ctpn.csv'
    # predictionFile = '../../../../dataset_formal/classify_data/crnnData/result/result_crnn_with_1800ctpn_continue.csv'
    predictionFile = '../../../../dataset_formal/classify_data/crnnData/result/result_crnn_tight_ctpn.csv'
    df.to_csv(predictionFile, index=False)
    print("\nover")
def train_models(resnet, bayesian_resnet, num_epochs=10):
    '''Train both models on the oversampled data until num_epochs reached.'''
    # Load the oversampled training set and the held-out validation set.
    train, train_labels = loadData('data/ros_data.npz')
    val, val_labels = loadData('data/val.npz')
    opt = SGD(learning_rate=1e-3)  # shared optimizer configuration
    # Both models receive identical inputs and fit settings.
    inputs = np.expand_dims(train, axis=-1)
    targets = to_categorical(train_labels)
    fit_kwargs = dict(validation_data=[np.expand_dims(val, axis=-1),
                                       to_categorical(val_labels)],
                      epochs=num_epochs,
                      batch_size=5,
                      optimizer=opt,
                      save=True)
    for net in (resnet, bayesian_resnet):
        net.fit(inputs, targets, **fit_kwargs)
def test(test_loader, max_iter=10):
    # Evaluate up to max_iter batches and return
    # (avg CTC loss per sample, avg CER per sample).
    # NOTE(review): relies on globals `crnn`, `criterion`, `converter` and
    # the pre-allocated buffers `image`, `text`, `length`; `text`/`length`
    # feed the loss without being re-encoded here — confirm the caller (or a
    # sibling function) fills them for this batch.
    test_size = 0
    total_cer_loss = 0
    total_ctc_loss = 0
    test_iter = iter(test_loader)
    max_iter = min(max_iter, len(test_loader))
    crnn.eval()
    with torch.no_grad():
        for i in range(max_iter):
            data = test_iter.next()
            cpu_images, cpu_texts = data
            utils.loadData(image, cpu_images)
            batch_size = cpu_images.size(0)
            test_size += batch_size
            preds = crnn(image)
            preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
            total_ctc_loss += criterion(preds, text, preds_size, length)
            # Greedy decode: argmax over the class dimension, then flatten
            # to the (T*N,) layout the converter expects.
            _, preds = preds.max(2)
            preds = preds.transpose(1, 0).contiguous().view(-1)
            sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
            # sim_preds = converter.beam_decode(preds.data)
            total_cer_loss += utils.cer_loss(sim_preds, cpu_texts, ignore_case=False)
    return total_ctc_loss * 1.0 / test_size, total_cer_loss * 1.0 / test_size
def main():
    # Load a trained ResNet-CRNN checkpoint and write greedy-decoded
    # predictions for the LMDB test set to test_out/test_out.txt,
    # one "NNNNNN.jpg <prediction>" line per image.
    resnet_crnn = ResNetCRNN(rc_params.imgH,
                             1,
                             len(rc_params.alphabet) + 1,
                             rc_params.nh,
                             resnet_type=rc_params.resnet_type,
                             feat_size=rc_params.feat_size)
    resnet_crnn = torch.nn.DataParallel(resnet_crnn)
    state_dict = torch.load(
        './work_dirs/resnet18_rcnn_sgd_imgh128_rgb_512x1x16_lr_0.00100_batchSize_8_time_0319110013_/crnn_Rec_done_epoch_7.pth'
    )
    resnet_crnn.load_state_dict(state_dict)
    test_dataset = dataset.lmdbDataset(root='to_lmdb/test_index', rgb=True)
    converter = utils.strLabelConverter(rc_params.alphabet)
    resnet_crnn.eval()
    resnet_crnn.cuda()
    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=1,
        num_workers=int(rc_params.workers),
        collate_fn=alignCollate(imgH=rc_params.imgH,
                                imgW=rc_params.imgW,
                                keep_ratio=rc_params.keep_ratio,
                                rgb=True))
    val_iter = iter(data_loader)
    max_iter = len(data_loader)
    record_dir = 'test_out/test_out.txt'
    r = 1  # running image index used for the output file names
    f = open(record_dir, "a")
    # NOTE(review): the buffer's width dimension uses imgH — presumably should
    # be rc_params.imgW; verify whether utils.loadData resizes it anyway.
    image = torch.FloatTensor(rc_params.batchSize, 3, rc_params.imgH,
                              rc_params.imgH)
    prog_bar = mmcv.ProgressBar(max_iter)
    for i in range(max_iter):
        data = val_iter.next()
        i += 1  # NOTE(review): no effect — `i` is reset by the for loop
        cpu_images, cpu_texts = data
        batch_size = cpu_images.size(0)
        utils.loadData(image, cpu_images)
        # image = cpu_images.cuda()
        with torch.no_grad():
            preds = resnet_crnn(image)
        preds_size = torch.IntTensor([preds.size(0)] * batch_size)
        # Greedy decode: argmax per timestep, flattened for the converter.
        _, preds = preds.max(2)
        preds = preds.transpose(1, 0).contiguous().view(-1)
        sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
        if not isinstance(sim_preds, list):
            sim_preds = [sim_preds]
        for pred in sim_preds:
            f.write(str(r).zfill(6) + ".jpg " + pred + "\n")
            r += 1
        prog_bar.update()
    print("")
    f.close()
def main(args):
    """Train KidneyModel on the GFR dataset, report accuracies every epoch,
    and checkpoint the model.

    Args (from argparse): learning_rate, epochs, enlarge, w_div, w_l2.
    Relies on module-level: loadData, KidneyModel, loss, test, test23,
    batch_size, n_cat, tf/tfe (TensorFlow eager mode).
    """
    xr, log_igfr_r, labels_r = loadData('NEW_GFR_TRAIN')
    xe, log_igfr_e, labels_e = loadData('NEW_GFR_TEST')
    train_ds = tf.data.Dataset.from_tensor_slices((xr, log_igfr_r, labels_r))
    test_ds = tf.data.Dataset.from_tensor_slices((xe, log_igfr_e, labels_e))
    train_ds = train_ds.shuffle(xr.shape[0]).batch(batch_size)
    # test_ds = test_ds.batch(batch_size)
    test_ds = test_ds.batch(1)  # evaluate one example at a time
    model = KidneyModel(n_cat)
    init_lr, momentum = args.learning_rate, 0.9
    # NOTE(review): lr/optimizer are created twice (outside and inside the
    # device scope); only the ones created under /cpu:0 are used below.
    lr = tfe.Variable(init_lr, name="learning_rate")
    optimizer = tf.train.AdamOptimizer(lr)
    with tf.device('/cpu:0'):
        lr = tfe.Variable(init_lr, name="learning_rate")
        optimizer = tf.train.AdamOptimizer(lr)
        for epoch in range(args.epochs):
            print('epoch', epoch)
            train_acc = tfe.metrics.Accuracy('train_accuracy')
            total_loss, total_batch = 0.0, 0.0
            for (batch, (x, log_igfr, labels)) in enumerate(tfe.Iterator(train_ds)):
                with tf.GradientTape() as tape:
                    mean, var, logits, igfr = model(x)
                    loss_value = loss(mean, var, logits, igfr, labels,
                                      log_igfr, args.enlarge, args.w_div,
                                      args.w_l2)
                total_loss += loss_value.cpu().numpy()
                total_batch += 1
                train_acc(tf.argmax(logits, axis=1, output_type=tf.int32),
                          tf.argmax(labels, axis=1, output_type=tf.int32))
                grads = tape.gradient(loss_value, model.variables)
                optimizer.apply_gradients(
                    zip(grads, model.variables),
                    global_step=tf.train.get_or_create_global_step())
            print('Learning Rate', lr.numpy())
            if (epoch + 1) % 50 == 0:
                lr.assign(lr.numpy() / 2)  # halve the LR every 50 epochs
            print('Training acc {}'.format(100 * train_acc.result()))
            print('train_acc', 100 * train_acc.result().cpu().numpy())
            test_acc = test(model, test_ds)
            test2_acc, reses, test3_acc, reses3 = test23(model, test_ds)
            print('test_acc1', test_acc)
            print('avg_loss ', total_loss / total_batch)
            print('test_acc2', test2_acc)
            print('test_acc3', test3_acc)
            for i in range(reses.shape[0]):
                print('Cate %d ' % i, reses[i])
            checkpoint_dir = './saved_models/'
            checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
            root = tfe.Checkpoint(
                optimizer=optimizer,
                model=model,
                optimizer_step=tf.train.get_or_create_global_step())
            # BUG FIX: save under the "ckpt" prefix — the original passed the
            # bare directory, so `checkpoint_prefix` was computed but never
            # used and checkpoints were written as './saved_models/-N'.
            root.save(file_prefix=checkpoint_prefix)
def test(net, dataset, criterion, n_aug=1):
    # Predict over the test set n_aug times (test-time augmentation) and
    # majority-vote the decoded strings per file name.
    # Returns (file names without extension, voted predictions).
    print('Start test set predictions')
    for p in crnn.parameters():
        p.requires_grad = False  # inference only
    net.eval()
    all_file_names = []
    all_preds = []
    image_count = 0
    pred_dict = {}  # file name -> list of predictions across rounds
    for epoch in range(n_aug):
        test_iter = iter(dataset)
        for i in range(len(dataset)):
            data = test_iter.next()
            #i += 1
            cpu_images, __, file_names = data
            batch_size = cpu_images.size(0)
            image_count = image_count + batch_size
            utils.loadData(image, cpu_images)
            preds = crnn(image)
            #print(preds.size())
            preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
            # RA: While I am not sure yet, it looks like a greedy decoder and not beam search is being used here
            # Case is ignored in the accuracy, which is not ideal for an actual working system
            _, preds = preds.max(2)
            if torch.__version__ < '0.2':
                preds = preds.squeeze(2)  # https://github.com/meijieru/crnn.pytorch/issues/31
            preds = preds.transpose(1, 0).contiguous().view(-1)
            sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
            for pred, f in zip(sim_preds, file_names):
                if f not in pred_dict:
                    pred_dict[f] = [pred]
                else:
                    pred_dict[f].append(pred)
    # Majority vote across the augmentation rounds for each file.
    for f, final_preds in pred_dict.items():
        all_preds.append(Counter(final_preds).most_common(1)[0][0])
        all_file_names.append(f.partition(".jpg")[0])
    print("Total number of images in test set: %8d" % image_count)
    return (all_file_names, all_preds)
def eval(model, test_dirs):
    # Evaluate licence-plate recognition: decode the CTC output for each test
    # image and count exact (lower-cased) matches against the ground-truth
    # plate string reconstructed from the label indices.
    count, error, correct, eval_batchSize = 0, 0, 0, 1
    dst = labelTestDataLoader(test_dirs, imgSize)
    testloader = DataLoader(dst, batch_size=1, shuffle=True, num_workers=8)
    start = time()
    eval_text = Variable(torch.IntTensor(eval_batchSize * 5))
    eval_length = Variable(torch.IntTensor(eval_batchSize))
    correct = 0
    for i, (XI, labels, ims) in enumerate(testloader):
        count += 1
        #Changes for the encoder [1234df ] -> [0,1,2,3]
        YI = [
        ]  #List of all the licence plate (string of actual licence plates)
        for label in labels:
            # Labels are underscore-separated indices: province, letter,
            # then five alphanumeric characters.
            indexs = [int(x) for x in label.split('_')[:7]]
            l = [provinces[indexs[0]], alphabets[indexs[1]]]
            for index in range(2, 7):
                l.append(ads[indexs[index]])
            YI.append(''.join(l))
        t, l = converter.encode(YI)
        utils.loadData(eval_text, t)
        utils.loadData(eval_length, l)
        #YI = [[int(ee) for ee in el.split('_')[:7]] for el in labels]
        if use_gpu:
            x = Variable(XI.cuda())
        else:
            x = Variable(XI)
        # Forward pass: Compute predicted y by passing x to the model
        #print('X: {}'.format(x))
        fps_pred, preds = model(x)
        # Greedy decode of the CTC head.
        _, preds = preds.max(2, keepdim=True)
        preds = preds.squeeze(2)
        preds = preds.transpose(1, 0).contiguous().view(-1)
        preds_size = Variable(torch.IntTensor([preds.size(0)] * eval_batchSize))
        rsim_preds = converter.decode(preds.data, preds_size.data, raw=True)
        sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
        #print('rsim_preds: {}, sim_preds {} and YI is {}'.format(str(rsim_preds.encode('utf-8')), sim_preds.encode('utf-8'), YI))
        try:
            print('rsim_preds: {}, sim_preds {} and YI is {}'.format(
                rsim_preds, sim_preds, YI))
        except Exception as e:
            # NOTE(review): prints `error` (always 0 here), not the caught
            # exception — presumably meant to print `e`; confirm.
            print('Exception in printing the decoded value: {}'.format(error))
        try:
            for pred, target in zip(sim_preds, YI):
                if pred == target.lower():
                    correct += 1
        except Exception as e:
            print('Exception while calculating correct in Eval')
def main():
    """Train a small regression DNN on an 80/20 split and report test MSE."""
    # Load data and keep the targets as column vectors.
    x, y, columns = loadData(DATASET, SAVED)
    split = int(x.shape[0] * .8)
    xtrain, ytrain = x[:split], y[:split][:, None]
    xtest, ytest = x[split:], y[split:][:, None]
    # Build and train the two-layer network.
    alpha, batch, epochs = 1e-3, 4, 100
    layers = [
        Dense(inputdim=xtrain.shape[1], units=32, activation='relu'),
        Dense(inputdim=32, units=1, activation='linear'),
    ]
    dnn = Model(layers, loss='mean_squared_error', optimizer=RMSPropOptimizer())
    dnn.fit(x=xtrain, y=ytrain, batch=batch, alpha=alpha, epochs=epochs)
    # Evaluate on the held-out 20%.
    ypred = dnn.predict(xtest)
    print('Test MSE: %.4f' % MeanSquaredError(ytest, ypred))
def computeModelMetrics():
    """ Function to do a manual cross validation and check precision, recall and f1 """
    from sklearn.model_selection import cross_validate
    # Load data and map the string labels onto integer ids.
    features, labels, unScaledFeatures = ut.loadData(True)
    features = features.astype(np.float32)
    mapping = {label: idx for idx, label in enumerate(set(labels))}
    labels = np.array([mapping[x] for x in labels], dtype=np.int64)
    # 10-fold cross-validation with micro-averaged metrics.
    y_pred = cross_validate(net,
                            features,
                            labels,
                            scoring=('recall_micro', 'precision_micro',
                                     'f1_micro', 'accuracy'),
                            cv=10)
    print(y_pred)
    print(
        f"Precision = {np.mean(y_pred['test_precision_micro'])} (+/- {np.std(y_pred['test_precision_micro'])})"
    )
    print(
        f"Recall = {np.mean(y_pred['test_recall_micro'])} (+/- {np.std(y_pred['test_recall_micro'])})"
    )
    print(
        f"F1 = {np.mean(y_pred['test_f1_micro'])} (+/- {np.std(y_pred['test_f1_micro'])})"
    )
def main():
    """Load or train word2vec embeddings for the training corpus.

    If the embeddings file exists it is loaded; otherwise a Word2Vec model is
    trained on (a slice of) the corpus and its embeddings are saved as CSV
    with the word in column 0 and the vector in the remaining columns.
    """
    # Load data
    xtrain, xtest, _ = loadData(DATASET, SAVED)
    # NOTE(review): the existence check uses EMBEDDINGS but reads/writes the
    # literal 'w2v.csv' — confirm EMBEDDINGS == 'w2v.csv' at module level.
    if os.path.isfile(EMBEDDINGS):
        # Load embeddings: word in the first column, vector in the rest.
        embeddings = np.loadtxt('w2v.csv',
                                dtype=str,
                                delimiter=',',
                                comments=None)
        vocab = embeddings[:, 0]
        vdict = {vocab[idx]: idx for idx in range(vocab.shape[0])}
        # BUG FIX: np.float was removed in NumPy 1.24; np.float64 is the
        # dtype the deprecated alias resolved to.
        embeddings = embeddings[:, 1:].astype(np.float64)
    else:
        # Create embeddings from scratch on a small slice of the corpus.
        vocab, probs = getUnigramProbs(xtrain)
        w2v = Word2Vec(vocab, probs, 50)
        w2v.train(xtrain[:100], 1, 4, 4, 1e-2)
        # Save embeddings: sum of input and context vectors, prefixed by word.
        embeddings = (w2v.w + w2v.c).astype(str)
        vocab = np.asarray(vocab)[:, None]
        embeddings = np.concatenate((vocab, embeddings), axis=1)
        np.savetxt('w2v.csv', embeddings, fmt='%s', delimiter=',')
def test_for_one_region(filename='100307.tsv', path='../Data', region_idx=0, history_lengths=range(1, 6), delays=range(1, 2), calc_type='ksg', compute_p=False):
    """Compute and plot local AIS values for one brain region.

    Loads the subject's timeseries, sweeps the (history, delay) grid for the
    chosen region, plots the AIS landscape, then plots the region's local AIS
    over time with the selected parameters.
    """
    calc = startCalc(calc_type)
    raw = utils.loadData(filename, path)
    timeseries = utils.preprocess(raw,
                                  sampling_rate=1.3,
                                  mean_processing_type='removal',
                                  trim_start=50,
                                  trim_end=25)
    result, ais_values, parameters, p_values = getLocalsForRegion(
        timeseries, calc, region_idx, history_lengths, delays,
        compute_p=compute_p)
    if p_values is not None:
        print("p value:", p_values)
    # Landscape over the parameter grid, then the local values over time.
    plotAISAcrossParams(ais_values, history_lengths, delays, show_plot=False)
    plt.figure()
    plt.plot(result)
    plt.xlabel('Time')
    plt.ylabel('AIS')
    plt.title("AIS Kraskov: {}[{}]\nHistory = {}, Delay = {}".format(
        filename, region_idx, parameters[0], parameters[1]))
    plt.show()
def main():
    """Train a small softmax classifier on a shuffled 80/20 split."""
    x, y, columns = loadData(DATASET, SAVED)
    # Shuffle rows, append a bias column, one-hot encode the targets.
    perm = torch.randperm(x.shape[0])
    x, y = x[perm], y[perm]
    x = torch.cat([x, torch.ones((x.shape[0], 1))], dim=1)
    y = torch.Tensor(createOneHotColumn(y.numpy())[0])
    # 80/20 train/test split.
    split = int(x.shape[0] * .8)
    xtrain, ytrain = x[:split], y[:split]
    xtest, ytest = x[split:], y[split:]
    # Build and fit the two-layer network.
    alpha, batch, epochs = 1e-1, 32, 100
    dnn = Model([
        Dense(inputdim=xtrain.shape[1], units=8, activation='relu'),
        Dense(inputdim=8, units=ytrain.shape[1], activation='softmax'),
    ],
                loss='categorical_cross_entropy',
                optimizer=RMSPropOptimizer())
    dnn.fit(x=xtrain, y=ytrain, batch=batch, alpha=alpha, epochs=epochs)
    # Evaluate on the held-out 20%.
    ypred = dnn.predict(xtest)
    print('Test Acc: %.4f' % Accuracy(ytest, ypred))
def loadData(self):
    # Read the raw dataset, drop rows containing "NA" or a too-short
    # description, normalise the price column (strip '$', map "Studio" to 1
    # room), and finally fill missing prices with the average of kept rows.
    rawInputs, rawOutputs = loadData(self.__filename,
                                     list(self.numericCols.keys()) +
                                     list(self.stringCols.keys()),
                                     list(self.outputCol.keys())[0])
    priceAverage = 0
    for i in range(len(rawInputs)):
        # for i in range(1000):
        containsNA = False
        for j in range(len(rawInputs[0])):
            if rawInputs[i][j] == "NA":
                containsNA = True
                break
        # Treat very short descriptions as missing data as well.
        if len(rawInputs[i][self.stringCols['description']]) < 5:
            containsNA = True
        if containsNA:
            continue
        rawInputs[i][0] = rawInputs[i][0].replace('$', "")  # strip currency sign
        if rawInputs[i][0] != "":
            priceAverage += float(rawInputs[i][0])
        if rawInputs[i][1] == "Studio":
            rawInputs[i][1] = 1  # a studio counts as one room
        self.__inputs.append(rawInputs[i])
        self.__outputs.append(rawOutputs[i])
    # NOTE(review): the sum includes only rows with a price, but the divisor
    # counts all kept rows — slightly underestimates the mean when prices are
    # missing; confirm this is acceptable.
    priceAverage = priceAverage / len(self.__inputs)
    for i in range(len(self.__inputs)):
        if self.__inputs[i][0] == "":
            self.__inputs[i][0] = priceAverage
def main():
    """Train a random forest on a shuffled 80/20 split and print accuracy."""
    x, y, columns = loadData(DATASET, SAVED)
    # Shuffle the rows before splitting.
    perm = torch.randperm(x.shape[0])
    x, y = x[perm], y[perm]
    split = int(x.shape[0] * .8)
    xtrain, ytrain = x[:split], y[:split]
    xtest, ytest = x[split:], y[split:]
    classes = [c.item() for c in torch.unique(ytrain)]
    # Fit the forest and score it on the held-out 20%.
    forest = RandomForestClassifier(numTrees=10,
                                    maxDepth=None,
                                    leafSize=1,
                                    bootstrapRatio=0.3)
    forest.fit(xtrain, ytrain, classes)
    ypred = forest.predict(xtest)
    hit_rate = torch.sum((ytest == ypred).float()) / ytest.shape[0]
    print('Test Accuracy: %.4f' % hit_rate)
def loaded(ev, server, plugin):
    # Plugin "loaded" event hook: refresh this plugin's persisted config and
    # sync it with the server's login mode and usercache location.
    global cfg
    if ev["name"] == name:  # only react when *this* plugin finished loading
        cfg = utils.loadData("uuid", cfg)
        cfg["offline"] = server.offline_login
        if "usercache" not in cfg or cfg["usercache"] == "":
            # NOTE(review): the key "asd" looks like a placeholder for the
            # server-directory setting — confirm against server.cfg's schema.
            cfg["usercache"] = server.cfg["asd"] + "/usercache.json"
def main():
    """Train a small binary classifier and report held-out accuracy."""
    x, y, columns = loadData(DATASET, SAVED)
    # 80/20 train/test split (data order preserved, as loaded).
    split = int(x.shape[0] * .8)
    xtrain, ytrain = x[:split], y[:split]
    xtest, ytest = x[split:], y[split:]
    # Build and fit the sigmoid-output network.
    alpha, batch, epochs = 1e-3, 128, 1000
    dnn = Model([
        Dense(inputdim=xtrain.shape[1], units=16, activation='relu'),
        Dense(inputdim=16, units=1, activation='sigmoid'),
    ],
                loss='binary_cross_entropy',
                optimizer=RMSPropOptimizer())
    dnn.fit(x=xtrain, y=ytrain, batch=batch, alpha=alpha, epochs=epochs)
    # Evaluate on the held-out 20%.
    ypred = dnn.predict(xtest)
    print('Test Acc: %.4f' % Accuracy(ytest, ypred))
def doit(X, k):
    """Classify X via k nearest class centroids.

    Fits a nearest-centroid model on the first 10k training rows, then uses a
    k-NN over the learned centroids to return, for each row of X, the classes
    of its k closest centroids.
    """
    x, y = loadData("train", 225)
    x = x.toarray()
    train_x, train_y = x[0:10000], y[0:10000]
    test_x, test_y = x[9000:10000], y[9000:10000]
    # Learn one centroid per class.
    model = lwp()
    model.fit(train_x, train_y)
    prediction = model.predict(test_x)
    cent = model.centroids_
    clas = model.classes_
    # k-NN in centroid space: which k class centroids is each query nearest to?
    neigh = knn(n_neighbors=k)
    neigh.fit(cent, clas)
    kn = neigh.kneighbors(X.toarray())[:][1]
    return clas[kn]
def test_for_one_pair(filename='100307.tsv', path='../Data', source_region=1, target_region=0, param_file='Results/HCP/AIS/params/100307_params.csv', calc_type='ksg', compute_p=False):
    """Compute and plot local information-transfer values for one region pair.

    Loads the subject's timeseries together with its precomputed per-region
    parameters, runs the pairwise local calculation, and plots the resulting
    timeseries.
    """
    calc = startCalc(calc_type)
    raw, param_df = utils.loadData(filename,
                                   path,
                                   get_params=True,
                                   param_file=param_file)
    timeseries = utils.preprocess(raw,
                                  sampling_rate=1.3,
                                  mean_processing_type='removal',
                                  trim_start=50,
                                  trim_end=25)
    result, p_values, dce = getLocalsForRegionPair(timeseries,
                                                   source_region,
                                                   target_region,
                                                   param_df,
                                                   calc,
                                                   compute_p=compute_p)
    if p_values is not None:
        print("p value:", p_values)
    print('Dynamic correlation exclusion value:', dce)
    utils.plotTimeseries(result)
def __init__(self):
    """Initialise the 'lastseen' plugin: load persisted data and register
    the .ls command and its help text."""
    try:
        self.lastseenDict = utils.loadData('lastseen', dict)
        self.functions[".ls"] = ("lastseen", 2, [0, 1, 2])
        self.helpDict[".ls"] = "Shows when a user was last seen (by the bot). Usage: .ls <Username>"
    # BUG FIX: narrowed from a bare `except:` so KeyboardInterrupt and
    # SystemExit are no longer swallowed during startup.
    except Exception:
        self.writeLog("Error initializing plugin 'lastseen':", 2)
        self.noteError()
def __init__(self):
    # Build the full training graph: a frozen face-recognition encoder
    # (G_enc), a generator/decoder (G_dec) and a discriminator (D), plus the
    # losses, gradient-norm diagnostics, summaries and both Adam train ops.
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.batch_size = cfg.batch_size
        self.data_feed = loadData(batch_size=self.batch_size,
                                  train_shuffle=True)  # False
        # Construct Template Model (G_enc) to encoder input face
        with tf.variable_scope('face_model'):
            self.face_model = Resnet50()  # Vgg16()
            self.face_model.build()
            print('VGG model built successfully.')
        # Construct G_dec and D
        self.is_train = tf.placeholder(tf.bool, name='is_train')
        self.profile, self.front = self.data_feed.get_train()
        # Construct Model
        self.build_arch()
        print('Model built successfully.')
        # Split trainable variables between generator and discriminator
        # by their scope-name prefixes.
        all_vars = tf.trainable_variables()
        self.vars_gen = [
            var for var in all_vars if var.name.startswith('decoder')
        ]
        self.vars_dis = [
            var for var in all_vars if var.name.startswith('discriminator')
        ]
        self.loss()
        #################DEBUG#######################
        # Mean L2 norm of each loss term's gradient w.r.t. the generated
        # images — used to monitor the relative strength of the loss terms.
        with tf.name_scope('Debug'):
            grad1 = tf.gradients([self.feature_loss], [self.gen_p])[0]
            self.grad1 = tf.reduce_mean(
                tf.sqrt(tf.reduce_sum(tf.square(grad1), [1, 2, 3])))
            grad2 = tf.gradients([self.g_loss], [self.gen_p])[0]
            self.grad2 = tf.reduce_mean(
                tf.sqrt(tf.reduce_sum(tf.square(grad2), [1, 2, 3])))
            grad3 = tf.gradients([self.front_loss], [self.gen_f])[0]
            self.grad3 = tf.reduce_mean(
                tf.sqrt(tf.reduce_sum(tf.square(grad3), [1, 2, 3])))
        # Summary
        self._summary()
        # Trainer: separate Adam optimizers for generator and discriminator,
        # sharing one global step.
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.train_gen = tf.train.AdamOptimizer(
            cfg.lr, beta1=cfg.beta1,
            beta2=cfg.beta2).minimize(self.gen_loss,
                                      global_step=self.global_step,
                                      var_list=self.vars_gen)
        self.train_dis = tf.train.AdamOptimizer(
            cfg.lr, beta1=cfg.beta1,
            beta2=cfg.beta2).minimize(self.dis_loss,
                                      global_step=self.global_step,
                                      var_list=self.vars_dis)
def runAll(self):
    """
    This method will be used by the bayesian parameter search
    It returns input parameters enriched with validation scores
    """
    frame, truth = utils.loadData(self.dataset)
    y = truth.clusters  # ground-truth cluster assignments
    self.fitPredict(frame)
    return self.evaluate(y)
def add_ID(train_df, opt):
    """Attach each news item's positional index as an 'ID' column.

    Reads the raw training file, maps every newsId to its position in that
    file, and left-joins the mapping onto train_df.
    """
    news_items = loadData(os.path.join(opt['data_dir'], opt['train_file']))
    news2id = {item['newsId']: idx for idx, item in enumerate(news_items)}
    id_df = pd.DataFrame({'newsId': list(news2id.keys()),
                          'ID': list(news2id.values())})
    return train_df.merge(id_df, on='newsId', how='left')
def run_individual_parameters(i, data_path, extension, save_folder, GRP=False, compute_p=True, **preprocessing_params):
    """Compute local AM values for one subject and save them to CSV.

    Arguments:
    i -- index into the file list (or the subject number for GRP data)
    data_path, extension -- where to find the input files
    save_folder -- subfolder of Results/ to write into
    GRP -- True if processing the GRP data (one file, many subjects)
    compute_p -- whether to compute p-values
    preprocessing_params -- forwarded to utils.preprocess
    """
    files = utils.getAllFiles(data_path, extension)
    if GRP:
        file = files[0]
        filename = '{:02}'.format(i)  # Save the results by the subject's number
        subject_id = i
    else:
        file = files[i]
        filename = utils.basename(file)
        subject_id = None
    os.makedirs("Results/{}/AM/idx".format(save_folder), exist_ok=True)
    os.makedirs("Results/{}/AM/p_values".format(save_folder), exist_ok=True)
    print("Processing", i, ":", filename)
    if os.path.exists('Results/{}/AM/p_values/{}.csv'.format(
            save_folder, filename)):
        # BUG FIX: return instead of exit() so a caller looping over subjects
        # is not killed when one subject's results already exist.
        return
    param_file = 'Results/{}/AIS/idx/{}.csv'.format(save_folder, filename)
    df, param_df = utils.loadData(file,
                                  get_params=True,
                                  param_file=param_file,
                                  subject_id=subject_id)
    data = utils.preprocess(df, **preprocessing_params)
    results, p_values = getLocalsForAllRegions(data,
                                               param_df,
                                               compute_p=compute_p)
    # Add back the trimmed sections as zero padding so rows line up in time.
    padding = ((0, 0), (preprocessing_params.get('trim_start', 0),
                        preprocessing_params.get('trim_end', 0)))
    results = np.pad(results, padding, mode='constant', constant_values=0)
    pd.DataFrame(results).to_csv('Results/{}/AM/{}_AM.csv'.format(
        save_folder, filename), index=None, header=None)
    pd.DataFrame(p_values).to_csv('Results/{}/AM/p_values/{}.csv'.format(
        save_folder, filename), index=None, header=None)
    try:
        utils.plotHeatmap(results, divergent=True)
    # Plotting is best-effort (e.g. headless machines); narrowed from a bare
    # except so KeyboardInterrupt/SystemExit still propagate.
    except Exception:
        pass
def main():
    """Train Model2 on the letters dataset and print test accuracy."""
    train_raw = utils.loadData('./data/letters.train.data')
    test_raw = utils.loadData('./data/letters.test.data')
    trainset = filterData(train_raw)
    testset = filterData(test_raw)
    model = Model2(trainset)
    model.train(3)
    # Score exact-match predictions on the test set.
    hits = misses = 0
    for x, y in testset:
        if model.inference(x) == y:
            hits += 1
        else:
            misses += 1
    acc = 100.0 * hits / (hits + misses)
    print("test accuracy: {}".format(acc))
def val(net, criterion, max_iter=100):
    # Validate the global `crnn` on the global `val_loader`: average CTC loss
    # plus exact-match (lower-cased) string accuracy via greedy decoding.
    # NOTE(review): the loss uses the global buffers `text`/`length` although
    # the per-batch encode is commented out — confirm they hold the right
    # labels, otherwise the reported loss is stale.
    print('Start val')
    for p in crnn.parameters():
        p.requires_grad = False  # inference only
    net.eval()
    val_iter = iter(val_loader)
    i = 0
    n_correct = 0
    loss_avg = utils.averager()
    # NOTE(review): this overrides the max_iter argument — the whole loader
    # is always consumed; confirm that is intended.
    max_iter = len(val_loader)
    for i in range(max_iter):
        data = val_iter.next()
        i += 1  # no effect — `i` is reset by the for loop
        cpu_images, cpu_texts = data
        batch_size = cpu_images.size(0)
        utils.loadData(image, cpu_images)
        # t, l = converter.encode(cpu_texts)
        # utils.loadData(text, t)
        # utils.loadData(length, l)
        preds = crnn(image)
        preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
        cost = criterion(preds, text, preds_size, length) / batch_size
        loss_avg.add(cost)
        # Greedy decode: argmax per timestep, flattened for the converter.
        _, preds = preds.max(2)
        preds = preds.squeeze(2)  # NOTE(review): fails on newer PyTorch where max(2) drops the dim
        preds = preds.transpose(1, 0).contiguous().view(-1)
        sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
        for pred, target in zip(sim_preds, cpu_texts):
            if pred == target.lower():
                n_correct += 1
    raw_preds = converter.decode(preds.data, preds_size.data,
                                 raw=True)[:opt.n_test_disp]
    for raw_pred, pred, gt in zip(raw_preds, sim_preds, cpu_texts):
        print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt))
    # NOTE(review): assumes every batch is full — the last batch may be
    # smaller, slightly deflating the reported accuracy.
    accuracy = n_correct / float(max_iter * opt.batchSize)
    print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))
def set_data(tt):
    """Load the txt dataset and return it both raw and as column tensors.

    tt is the tensor module/factory providing FloatTensor (e.g. torch).
    Returns (x_input, y_input, x_data, y_data).
    """
    x_input, y_input = utils.loadData()
    # Reshape both series into column vectors before wrapping them.
    x_col = np.reshape(x_input, (-1, 1))
    y_col = np.reshape(y_input, (-1, 1))
    x_data = tt.FloatTensor(x_col)
    y_data = tt.FloatTensor(y_col)
    return x_input, y_input, x_data, y_data
def main():
    # Load the prepared dataset, split it 70/15/15, inspect class balance,
    # build two oversampled variants of the train split (random + SMOTE),
    # and save every split back to npz files.
    #load the JPEG images or the npz file
    #data, labels = load_images() #un-comment if want to load JPEG images into numpy
    #saveData('data/data.npz', data, labels) #un-comment if want to save loaded JPEG images as npz for faster loading later
    data, labels = loadData(
        'data/data.npz'
    )  #load images from npz, much faster then loading from JPEGS each time
    #split dataset into train, val, and test ... split into 70/30 and then 70/15/15
    print('Splitting data into 70/15/15 train, val, and test sets.')
    train, testval, train_labels, testval_labels = train_test_split(
        data,
        labels,
        test_size=0.30,
        random_state=42,
        shuffle=True,
        stratify=labels)
    test, val, test_labels, val_labels = train_test_split(
        testval,
        testval_labels,
        test_size=0.50,
        random_state=42,
        shuffle=True,
        stratify=testval_labels)
    del data, labels, testval, testval_labels  #free up memory
    #check imbalance
    print('Train Shape:', train.shape, 'Train Labels Shape:',
          train_labels.shape)
    print('Validation Shape:', val.shape, 'Validation Labels Shape:',
          val_labels.shape)
    print('Test Shape:', test.shape, 'Test Labels Shape:', test_labels.shape)
    plotClassDist(train_labels, 'Train Class Distribution')
    plotClassDist(val_labels, 'Validation Class Distribution')
    plotClassDist(test_labels, 'Test Class Distribution')
    #random oversample train set and recheck balance
    ovs_data, ovs_labels = randomOversample(train, train_labels)
    print('OVS Data Shape:', ovs_data.shape, 'OVS Labels Shape:',
          ovs_labels.shape)
    plotClassDist(ovs_labels, 'Train Class Distribution (ROS)')
    #oversample using smote on train set and recheck balance
    smt_data, smt_labels = smoteOversample(train, train_labels)
    print('SMT Data Shape:', smt_data.shape, 'SMT Labels Shape:',
          smt_labels.shape)
    plotClassDist(smt_labels, 'Train Class Distribution (SMOTE)')
    #save untouched and oversampled data as npz along with val and test
    saveData('data/train.npz', train, train_labels)
    saveData('data/ros_data.npz', ovs_data, ovs_labels)
    saveData('data/smt_data.npz', smt_data, smt_labels)
    saveData('data/val.npz', val, val_labels)
    saveData('data/test.npz', test, test_labels)
def __init__(self, code, lines, path, tglobals, config):
    """Import a personality module by name, smoke-test it, and load its lines.

    code -- module name to import from `path`
    lines -- name of the lines data file loaded via utils.loadData
    tglobals -- globals dict passed to __import__
    config -- configuration object kept as a reference
    """
    try:
        self.configref = config
        sys.path.append(path)
        self.code = __import__(code, tglobals)
        # BUG FIX: converted Python-2 print statements to the function form
        # used elsewhere in this codebase.
        print("\tTesting code execution for personality.")
        if callable(self.code.test):
            self.code.test()
        else:
            raise ImportError
        self.lines = utils.loadData(lines, dict, path)
    # BUG FIX: the ImportError handler must come first — the original listed
    # `except Exception` before `except ImportError`, making the latter
    # unreachable (ImportError is a subclass of Exception).
    except ImportError:
        print("Cannot call self.code.test. Either the .py does not have it or the import went wrong.")
    except Exception:
        traceback.print_exc()
def trainBatch(net, criterion, optimizer):
    """Run one CTC training step on the next batch and return its loss."""
    cpu_images, cpu_texts = train_iter.next()
    batch_size = cpu_images.size(0)
    # Copy the batch and its encoded labels into the global buffers.
    utils.loadData(image, cpu_images)
    encoded, lengths = converter.encode(cpu_texts)
    utils.loadData(text, encoded)
    utils.loadData(length, lengths)
    preds = crnn(image)
    preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
    # CTCLoss sums over the batch, hence the explicit division.
    cost = criterion(preds, text, preds_size, length) / batch_size
    crnn.zero_grad()
    cost.backward()
    optimizer.step()
    return cost
def val(net, dataset, criterion, max_iter=100): print('Start val') for p in crnn.parameters(): p.requires_grad = False net.eval() data_loader = torch.utils.data.DataLoader( dataset, shuffle=True, batch_size=opt.batchSize, num_workers=int(opt.workers)) val_iter = iter(data_loader) i = 0 n_correct = 0 loss_avg = utils.averager() for i in range(min(max_iter, len(data_loader))): data = val_iter.next() i += 1 cpu_images, cpu_texts = data batch_size = cpu_images.size(0) utils.loadData(image, cpu_images) t, l = converter.encode(cpu_texts) utils.loadData(text, t) utils.loadData(length, l) preds = crnn(image) preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size)) cost = criterion(preds, text, preds_size, length) / batch_size loss_avg.add(cost) _, preds = preds.max(2) preds = preds.squeeze(2) preds = preds.transpose(1, 0).contiguous().view(-1) sim_preds = converter.decode(preds.data, preds_size.data, raw=False) for pred, target in zip(sim_preds, cpu_texts): if pred == target.lower(): n_correct += 1 raw_preds = converter.decode(preds.data, preds_size.data, raw=True) for raw_pred, pred, gt in zip(raw_preds, sim_preds, cpu_texts): print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt)) accuracy = n_correct / float(max_iter * opt.batchSize) print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))
__author__ = 'HyNguyen' from utils import loadData, save_data_4_nn_k_words, time import argparse if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-indir', required=True, type=str) parser.add_argument('-outdir', required=True, type=str) parser.add_argument('-kwords', required=True, type=int) parser.add_argument('-nsample', type=str, default=-1) args = parser.parse_args() path2InDir = args.indir path2OutDir = args.outdir kwords = args.kwords nsample = args.nsample dataset = loadData(path2InDir,nsample) start = time.time() save_data_4_nn_k_words(dataset,path2OutDir,k_words=kwords ,data_name="cnn") end = time.time() print("Time for ", len(dataset), ": ", end-start)
def computeRatios(self,true_dist=False, vars_g=None, data_file='test',use_log=False):
    '''
    Use the computed score densities to compute the decomposed ratio test.
    Set true_dist to True if the workspace has the true distributions to
    make plots; in that case vars_g must also be provided.
    Final result is a histogram of the ratios and signal-bkg rejection curves.
    '''
    # Load the RooWorkspace holding the score densities built by fit().
    f = ROOT.TFile('{0}/{1}'.format(self.dir,self.workspace))
    w = f.Get('w')
    f.Close()
    #TODO: This are Harcoded for now
    c1 = self.c1
    c0 = self.c0
    #c1 = np.multiply(c1, self.cross_section)
    # Normalize the mixture coefficients so each sums to 1.
    c1 = c1/c1.sum()
    c0 = c0/c0.sum()
    print 'Calculating ratios'
    npoints = 50
    if true_dist == True:
        # Collect the observable variables into a RooArgSet for pdf evaluation.
        vars = ROOT.TList()
        for var in vars_g:
            vars.Add(w.var(var))
        x = ROOT.RooArgSet(vars)
    # Select log-space vs linear-space ratio machinery.
    if use_log == True:
        evaluateRatio = self.evaluateLogDecomposedRatio
        post = 'log'
    else:
        evaluateRatio = self.evaluateDecomposedRatio
        post = ''
    score = ROOT.RooArgSet(w.var('score'))
    scoref = ROOT.RooArgSet(w.var('scoref'))
    if use_log == True:
        getRatio = self.singleLogRatio
    else:
        getRatio = self.singleRatio
    if self.preprocessing == True:
        # Lazily load the pairwise feature scalers persisted at training time.
        if self.scaler == None:
            self.scaler = {}
            for k in range(self.nsamples):
                for j in range(self.nsamples):
                    if k < j:
                        self.scaler[(k,j)] = joblib.load('{0}/model/{1}/{2}/{3}_{4}_{5}.dat'.format(self.dir,'mlp',self.c1_g,'scaler',self.dataset_names[k],self.dataset_names[j]))
    # NN trained on complete model
    F0pdf = w.function('bkghistpdf_F0_F1')
    F1pdf = w.function('sighistpdf_F0_F1')
    # TODO Here assuming that signal is first dataset
    testdata, testtarget = loadData(data_file,self.F0_dist,0,dir=self.dir,c1_g=self.c1_g,preprocessing=False)
    # NOTE(review): this dereferences vars_g unconditionally — presumably
    # callers always pass vars_g; verify, since vars_g defaults to None.
    if len(vars_g) == 1:
        # 1-D case: evaluate all three ratios on a fixed grid for plotting.
        xarray = np.linspace(0,5,npoints)
        fullRatios,_ = evaluateRatio(w,xarray,x=x,plotting=True,roc=False,true_dist=True)
        F1dist = np.array([self.evalDist(x,w.pdf('F1'),[xs]) for xs in xarray])
        F0dist = np.array([self.evalDist(x,w.pdf('F0'),[xs]) for xs in xarray])
        y2 = getRatio(F1dist, F0dist)
        # NN trained on complete model
        outputs = predict('{0}/model/{1}/{2}/adaptive_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g),xarray.reshape(xarray.shape[0],1),model_g=self.model_g,clf=self.clf)
        F1fulldist = np.array([self.evalDist(scoref,F1pdf,[xs]) for xs in outputs])
        F0fulldist = np.array([self.evalDist(scoref,F0pdf,[xs]) for xs in outputs])
        pdfratios = getRatio(F1fulldist, F0fulldist)
        saveFig(xarray, [fullRatios, y2, pdfratios], makePlotName('all','train',type='ratio'+post),title='Likelihood Ratios',labels=['Composed trained', 'True', 'Full trained'],print_pdf=True,dir=self.dir)
    # Decomposed ratio on the test data.
    if true_dist == True:
        decomposedRatio,_ = evaluateRatio(w,testdata,x=x,plotting=False,roc=self.verbose_printing,true_dist=True)
    else:
        decomposedRatio,_ = evaluateRatio(w,testdata,c0arr=c0,c1arr=c1,plotting=True, roc=True,data_type=data_file)
    # Full-model (single classifier) ratio on the same test data.
    if len(testdata.shape) > 1:
        outputs = predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file),testdata,model_g=self.model_g,clf=self.clf)
        #outputs = predict('/afs/cern.ch/work/j/jpavezse/private/{0}_F0_F1.pkl'.format(self.model_file),testdata,model_g=self.model_g)
    else:
        outputs = predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file),testdata.reshape(testdata.shape[0],1),model_g=self.model_g,clf=self.clf)
    F1fulldist = np.array([self.evalDist(scoref,F1pdf,[xs]) for xs in outputs])
    F0fulldist = np.array([self.evalDist(scoref,F0pdf,[xs]) for xs in outputs])
    completeRatio = getRatio(F1fulldist,F0fulldist)
    if true_dist == True:
        # Exact ratio from the true pdfs, for comparison.
        if len(testdata.shape) > 1:
            F1dist = np.array([self.evalDist(x,w.pdf('F1'),xs) for xs in testdata])
            F0dist = np.array([self.evalDist(x,w.pdf('F0'),xs) for xs in testdata])
        else:
            F1dist = np.array([self.evalDist(x,w.pdf('F1'),[xs]) for xs in testdata])
            F0dist = np.array([self.evalDist(x,w.pdf('F0'),[xs]) for xs in testdata])
        realRatio = getRatio(F1dist,F0dist)
    decomposed_target = testtarget
    complete_target = testtarget
    real_target = testtarget
    #Histogram F0-f0 for composed, full and true
    # Removing outliers
    numtest = decomposedRatio.shape[0]
    #decomposedRatio[decomposedRatio < 0.] = completeRatio[decomposedRatio < 0.]
    #decomposed_outliers = np.zeros(numtest,dtype=bool)
    #complete_outliers = np.zeros(numtest,dtype=bool)
    #decomposed_outliers = self.findOutliers(decomposedRatio)
    #complete_outliers = self.findOutliers(completeRatio)
    #decomposed_target = testtarget[decomposed_outliers]
    #complete_target = testtarget[complete_outliers]
    #decomposedRatio = decomposedRatio[decomposed_outliers]
    #completeRatio = completeRatio[complete_outliers]
    if true_dist == True:
        # NOTE(review): real_outliers is computed but never applied below
        # (the filtering lines are commented out).
        real_outliers = np.zeros(numtest,dtype=bool)
        real_outliers = self.findOutliers(realRatio)
        #real_target = testtarget[real_outliers]
        #realRatio = realRatio[real_outliers]
    all_ratios_plots = []
    all_names_plots = []
    bins = 70
    # NOTE(review): the scalar low/high values (and the use_log override)
    # are immediately discarded by the list re-assignments below — dead code
    # kept as-is for a documentation-only change.
    low = 0.6
    high = 1.2
    if use_log == True:
        low = -1.0
        high = 1.0
    low = []
    high = []
    low = []
    high = []
    ratios_vars = []
    # Build per-class (sig/bkg) histogram ranges and workspace variables.
    for l,name in enumerate(['sig','bkg']):
        if true_dist == True:
            ratios_names = ['truth','full','composed']
            ratios_vec = [realRatio, completeRatio, decomposedRatio]
            target_vec = [real_target, complete_target, decomposed_target]
            minimum = min([realRatio[real_target == 1-l].min(), completeRatio[complete_target == 1-l].min(), decomposedRatio[decomposed_target == 1-l].min()])
            maximum = max([realRatio[real_target == 1-l].max(), completeRatio[complete_target == 1-l].max(), decomposedRatio[decomposed_target == 1-l].max()])
        else:
            ratios_names = ['full','composed']
            ratios_vec = [completeRatio, decomposedRatio]
            target_vec = [complete_target, decomposed_target]
            minimum = min([completeRatio[complete_target == 1-l].min(), decomposedRatio[decomposed_target == 1-l].min()])
            maximum = max([completeRatio[complete_target == 1-l].max(), decomposedRatio[decomposed_target == 1-l].max()])
        # Pad the range by 10 bins on each side.
        low.append(minimum - ((maximum - minimum) / bins)*10)
        high.append(maximum + ((maximum - minimum) / bins)*10)
        w.factory('ratio{0}[{1},{2}]'.format(name, low[l], high[l]))
        ratios_vars.append(w.var('ratio{0}'.format(name)))
    # Fill one histogram per (method, class) pair and import it into the workspace.
    for curr, curr_ratios, curr_targets in zip(ratios_names,ratios_vec,target_vec):
        numtest = curr_ratios.shape[0]
        for l,name in enumerate(['sig','bkg']):
            hist = ROOT.TH1F('{0}_{1}hist_F0_f0'.format(curr,name),'hist',bins,low[l],high[l])
            for val in curr_ratios[curr_targets == 1-l]:
                hist.Fill(val)
            datahist = ROOT.RooDataHist('{0}_{1}datahist_F0_f0'.format(curr,name),'hist', ROOT.RooArgList(ratios_vars[l]),hist)
            ratios_vars[l].setBins(bins)
            histpdf = ROOT.RooHistFunc('{0}_{1}histpdf_F0_f0'.format(curr,name),'hist', ROOT.RooArgSet(ratios_vars[l]), datahist, 0)
            histpdf.specialIntegratorConfig(ROOT.kTRUE).method1D().setLabel('RooBinIntegrator')
            getattr(w,'import')(hist)
            getattr(w,'import')(datahist) # work around for morph = w.import(morph)
            getattr(w,'import')(histpdf) # work around for morph = w.import(morph)
            #print '{0} {1} {2}'.format(curr,name,hist.Integral())
            if name == 'bkg':
                all_ratios_plots.append([w.function('{0}_sighistpdf_F0_f0'.format(curr)), w.function('{0}_bkghistpdf_F0_f0'.format(curr))])
                all_names_plots.append(['sig_{0}'.format(curr),'bkg_{0}'.format(curr)])
    # Transpose [method][class] -> [class][method] for the multi-frame plot.
    all_ratios_plots = [[all_ratios_plots[j][i] for j,_ in enumerate(all_ratios_plots)] for i,_ in enumerate(all_ratios_plots[0])]
    all_names_plots = [[all_names_plots[j][i] for j,_ in enumerate(all_names_plots)] for i,_ in enumerate(all_names_plots[0])]
    printMultiFrame(w,['ratiosig','ratiobkg'],all_ratios_plots, makePlotName('ratio','comparison',type='hist'+post,dir=self.dir,model_g=self.model_g,c1_g=self.c1_g),all_names_plots,setLog=True,dir=self.dir,model_g=self.model_g,y_text='Count',title='Histograms for ratios',x_text='ratio value',print_pdf=True)
    # scatter plot true ratio - composed - full ratio
    #if self.verbose_printing == True and true_dist == True:
    #  saveFig(completeRatio,[realRatio], makePlotName('full','train',type='scat'+post,dir=self.dir,model_g=self.model_g,c1_g=self.c1_g),scatter=True,axis=['full trained ratio','true ratio'],dir=self.dir,model_g=self.model_g)
    #  saveFig(decomposedRatio,[realRatio], makePlotName('comp','train',type='scat'+post,dir=self.dir, model_g=self.model_g, c1_g=self.c1_g),scatter=True, axis=['composed trained ratio','true ratio'],dir=self.dir, model_g=self.model_g)
    # signal - bkg rejection plots
    if use_log == True:
        # Map log-ratios back to linear space before normalizing.
        decomposedRatio = np.exp(decomposedRatio)
        completeRatio = np.exp(completeRatio)
        if true_dist == True:
            realRatio = np.exp(realRatio)
    if true_dist == True:
        ratios_list = [decomposedRatio/decomposedRatio.max(), completeRatio/completeRatio.max(), realRatio/realRatio.max()]
        targets_list = [decomposed_target, complete_target, real_target]
        legends_list = ['composed', 'full', 'true']
    else:
        # Keep only strictly positive ratios, move to log space and shift
        # to non-negative values before max-normalization.
        indices = (decomposedRatio > 0.)
        decomposedRatio = decomposedRatio[indices]
        decomposed_target = decomposed_target[indices]
        indices = (completeRatio > 0.)
        completeRatio = completeRatio[indices]
        complete_target = complete_target[indices]
        completeRatio = np.log(completeRatio)
        decomposedRatio = np.log(decomposedRatio)
        decomposedRatio = decomposedRatio + np.abs(decomposedRatio.min())
        completeRatio = completeRatio + np.abs(completeRatio.min())
        ratios_list = [decomposedRatio/decomposedRatio.max(), completeRatio/completeRatio.max()]
        targets_list = [decomposed_target, complete_target]
        legends_list = ['composed','full']
    makeSigBkg(ratios_list,targets_list,makePlotName('comp','all',type='sigbkg'+post,dir=self.dir, model_g=self.model_g,c1_g=self.c1_g),dir=self.dir,model_g=self.model_g,print_pdf=True,legends=legends_list,title='Signal-Background rejection curves')
    # Scatter plot to compare regression function and classifier score
    if self.verbose_printing == True and true_dist == True:
        testdata, testtarget = loadData('test',self.F0_dist,self.F1_dist,dir=self.dir,c1_g=self.c1_g)
        if len(testdata.shape) > 1:
            reg = np.array([self.__regFunc(x,w.pdf('F0'),w.pdf('F1'),xs) for xs in testdata])
        else:
            reg = np.array([self.__regFunc(x,w.pdf('F0'),w.pdf('F1'),[xs]) for xs in testdata])
        # NOTE(review): `reg` and `outputs` are computed here but not used
        # within the visible body — presumably consumed by a plotting call
        # that was removed; confirm against version history.
        if len(testdata.shape) > 1:
            outputs = predict('{0}/model/{1}/{2}/adaptive_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g),testdata.reshape(testdata.shape[0],testdata.shape[1]),model_g=self.model_g, clf=self.clf)
        else:
            outputs = predict('{0}/model/{1}/{2}/adaptive_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g),testdata.reshape(testdata.shape[0],1),model_g=self.model_g, clf=self.clf)
def evaluateDecomposedRatio(self,w,evalData,x=None,plotting=True, roc=False,gridsize=None,c0arr=None, c1arr=None,true_dist=False,pre_evaluation=None,pre_dist=None,data_type='test',debug=False,cross_section=None,indexes=None):
    '''
    Compute the composed (decomposed pairwise) ratio for dataset 'evalData'.
    Single pairwise ratios can be precomputed and passed in pre_evaluation.
    Returns (fullRatios, fullRatiosReal) where fullRatiosReal is only
    meaningful when true_dist is True.
    '''
    # pair-wise ratios
    # and decomposition computation
    #f = ROOT.TFile('{0}/{1}'.format(self.dir,self.workspace))
    #w = f.Get('w')
    #f.Close()
    if indexes == None:
        indexes = self.basis_indexes
    score = ROOT.RooArgSet(w.var('score'))
    npoints = evalData.shape[0]
    fullRatios = np.zeros(npoints)
    fullRatiosReal = np.zeros(npoints)
    # Fall back to the instance's mixture coefficients when not provided.
    c0arr = self.c0 if c0arr == None else c0arr
    c1arr = self.c1 if c1arr == None else c1arr
    true_score = []
    train_score = []
    all_targets = []
    all_positions = []
    all_ratios = []
    # Outer sum over the c0 components; inner sum over the c1 components.
    for k,c in enumerate(c0arr):
        innerRatios = np.zeros(npoints)
        innerTrueRatios = np.zeros(npoints)
        if c == 0:
            continue
        for j,c_ in enumerate(c1arr):
            index_k, index_j = (indexes[k],indexes[j])
            f0pdf = w.function('bkghistpdf_{0}_{1}'.format(index_k,index_j))
            f1pdf = w.function('sighistpdf_{0}_{1}'.format(index_k,index_j))
            if index_k<>index_j:
                if pre_evaluation == None:
                    # Evaluate the pairwise classifier and its score densities.
                    traindata = evalData
                    if self.preprocessing == True:
                        traindata = preProcessing(evalData,self.dataset_names[min(index_k,index_j)], self.dataset_names[max(index_k,index_j)],self.scaler)
                    outputs = predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file,k,j),traindata,model_g=self.model_g,clf=self.clf)
                    #outputs = predict('/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(self.model_file,index_k,
                    #index_j),traindata,model_g=self.model_g)
                    f0pdfdist = np.array([self.evalDist(score,f0pdf,[xs]) for xs in outputs])
                    f1pdfdist = np.array([self.evalDist(score,f1pdf,[xs]) for xs in outputs])
                else:
                    # Use precomputed score densities.
                    f0pdfdist = pre_evaluation[0][index_k][index_j]
                    f1pdfdist = pre_evaluation[1][index_k][index_j]
                    if f0pdfdist == None or f1pdfdist == None:
                        pdb.set_trace()
                pdfratios = self.singleRatio(f0pdfdist,f1pdfdist)
            else:
                # Identical indices: ratio is identically one.
                pdfratios = np.ones(npoints)
            all_ratios.append(pdfratios)
            innerRatios += (c_/c) * pdfratios
            if true_dist == True:
                # Same pairwise ratio from the true pdfs, if available.
                if pre_dist == None:
                    f0 = w.pdf('f{0}'.format(index_k))
                    f1 = w.pdf('f{0}'.format(index_j))
                    if len(evalData.shape) > 1:
                        f0dist = np.array([self.evalDist(x,f0,xs) for xs in evalData])
                        f1dist = np.array([self.evalDist(x,f1,xs) for xs in evalData])
                    else:
                        f0dist = np.array([self.evalDist(x,f0,[xs]) for xs in evalData])
                        f1dist = np.array([self.evalDist(x,f1,[xs]) for xs in evalData])
                else:
                    f0dist = pre_dist[0][index_k][index_j]
                    f1dist = pre_dist[1][index_k][index_j]
                ratios = self.singleRatio(f0dist, f1dist)
                innerTrueRatios += (c_/c) * ratios
            # ROC curves for pair-wise ratios
            if (roc == True or plotting==True) and k < j:
                all_positions.append((k,j))
                if roc == True:
                    if self.dataset_names <> None:
                        name_k, name_j = (self.dataset_names[index_k], self.dataset_names[index_j])
                    else:
                        name_k, name_j = (index_k,index_j)
                    testdata, testtarget = loadData(data_type,name_k,name_j,dir=self.dir,c1_g=self.c1_g, preprocessing=self.preprocessing, scaler=self.scaler)
                else:
                    testdata = evalData
                size2 = testdata.shape[1] if len(testdata.shape) > 1 else 1
                outputs = predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file,k,j),testdata,model_g=self.model_g,clf=self.clf)
                #outputs = predict('/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(self.model_file,index_k,
                # index_j),testdata.reshape(testdata.shape[0],size2),model_g=self.model_g)
                f0pdfdist = np.array([self.evalDist(score,f0pdf,[xs]) for xs in outputs])
                f1pdfdist = np.array([self.evalDist(score,f1pdf,[xs]) for xs in outputs])
                clfRatios = self.singleRatio(f0pdfdist,f1pdfdist)
                train_score.append(clfRatios)
                if roc == True:
                    all_targets.append(testtarget)
                #individual ROC
                #makeROC(clfRatios, testtarget,makePlotName('dec','train',k,j,type='roc',dir=self.dir,
                #model_g=self.model_g,c1_g=self.c1_g),dir=self.dir,model_g=self.model_g)
                if true_dist == True:
                    if len(evalData.shape) > 1:
                        f0dist = np.array([self.evalDist(x,f0,xs) for xs in testdata])
                        f1dist = np.array([self.evalDist(x,f1,xs) for xs in testdata])
                    else:
                        f0dist = np.array([self.evalDist(x,f0,[xs]) for xs in testdata])
                        f1dist = np.array([self.evalDist(x,f1,[xs]) for xs in testdata])
                    trRatios = self.singleRatio(f0dist,f1dist)
                    true_score.append(trRatios)
                    # makeROC(trRatios, testtarget, makePlotName('dec','truth',k,j,type='roc',
                    #  dir=self.dir,model_g=self.model_g,c1_g=self.c1_g),dir=self.dir,model_g=self.model_g)
        # Invert the inner sum; zero out infinities from empty/zero sums.
        innerRatios = 1./innerRatios
        innerRatios[np.abs(innerRatios) == np.inf] = 0.
        fullRatios += innerRatios
        if true_dist == True:
            innerTrueRatios = 1./innerTrueRatios
            innerTrueRatios[np.abs(innerTrueRatios) == np.inf] = 0.
            fullRatiosReal += innerTrueRatios
    if roc == True:
        # Plot the pairwise ROC curves in groups of three.
        for ind in range(1,(len(train_score)/3+1)):
            print_scores = train_score[(ind-1)*3:(ind-1)*3+3]
            print_targets = all_targets[(ind-1)*3:(ind-1)*3+3]
            print_positions = all_positions[(ind-1)*3:(ind-1)*3+3]
            if true_dist == True:
                makeMultiROC(print_scores, print_targets,makePlotName('all{0}'.format(ind-1),'comparison',type='roc', dir=self.dir,model_g=self.model_g,c1_g=self.c1_g),dir=self.dir,model_g=self.model_g, true_score = true_score,print_pdf=True,title='ROC for pairwise trained classifier',pos=print_positions)
            else:
                makeMultiROC(print_scores, print_targets,makePlotName('all{0}'.format(ind-1),'comparison',type='roc', dir=self.dir,model_g=self.model_g,c1_g=self.c1_g),dir=self.dir,model_g=self.model_g, print_pdf=True,title='ROC for pairwise trained classifier',pos=print_positions)
    if plotting == True:
        saveMultiFig(evalData,[x for x in zip(train_score,true_score)], makePlotName('all_dec','train',type='ratio'),labels=[['f0-f1(trained)','f0-f1(truth)'],['f0-f2(trained)','f0-f2(truth)'],['f1-f2(trained)','f1-f2(truth)']],title='Pairwise Ratios',print_pdf=True,dir=self.dir)
    return fullRatios,fullRatiosReal
#!/usr/bin/env python # coding:utf-8 import numpy as np import conf import NN from activation_function import Sigmoid import utils if __name__ == '__main__': print "Part 1: Loading Data\n" X, y = utils.loadData(conf.FILE_X, conf.FILE_Y) print "Part 2: Loading Parameters\n" W1, W2 = utils.loadParams(conf.FILE_W1, conf.FILE_W2) # Unroll parameters W = np.hstack((W1.flatten(0), W2.flatten(0))) W = W.reshape((len(W), 1)) print "Part 3: Compute Cost(Feedforward)\n" LEARN_RATE = 0 J, _ = NN.nnCostFunction(W, conf.INPUT_LAYER_SIZE, conf.HIDDEN_LAYER_SIZE, conf.NUM_LABELS, X, y, LEARN_RATE) print ("Cost at parameters (loaded from w1.txt and w2.txt): %f" "\n(this value should be about 0.287629)\n") % J print "Part 4: Implement Regularization\n"
def fit(self, data_file='test',importance_sampling=False, true_dist=True,vars_g=None):
    '''
    Create pdfs for the classifier score to be used later on the ratio test.
    The input workspace is only needed in case there exist true pdfs for
    the distributions.
    The models being used are ./model/{model_g}/{c1_g}/{model_file}_i_j.pkl
    and the data files are ./data/{model_g}/{c1_g}/{data_file}_i_j.dat
    '''
    # Score-histogram binning for the pairwise classifiers.
    bins = 40
    low = 0.
    high = 1.
    if self.input_workspace <> None:
        #f = ROOT.TFile('{0}/{1}'.format('/afs/cern.ch/work/j/jpavezse/private',self.workspace))
        f = ROOT.TFile('{0}/{1}'.format(self.dir,self.workspace))
        w = f.Get('w')
        # TODO test this when workspace is present
        w = ROOT.RooWorkspace('w') if w == None else w
        f.Close()
    else:
        w = ROOT.RooWorkspace('w')
    w.Print()

    print 'Generating Score Histograms'

    w.factory('score[{0},{1}]'.format(low,high))
    s = w.var('score')

    if importance_sampling == True:
        if true_dist == True:
            # Collect observables for true-pdf evaluation.
            vars = ROOT.TList()
            for var in vars_g:
                vars.Add(w.var(var))
            x = ROOT.RooArgSet(vars)
        else:
            x = None

    #This is because most of the data of the full model concentrate around 0
    bins_full = 40
    low_full = 0.
    high_full = 1.
    w.factory('scoref[{0},{1}]'.format(low_full, high_full))
    s_full = w.var('scoref')
    histos = []
    histos_names = []
    inv_histos = []
    inv_histos_names = []
    sums_histos = []

    def saveHistos(w,outputs,s,bins,low,high,pos=None,importance_sampling=False,importance_data=None, importance_outputs=None):
        # Build and import sig/bkg score histograms (and RooHistFunc
        # densities) for one classifier into the workspace. `outputs` is a
        # pair [signal_scores, background_scores]. Appends to the enclosing
        # `histos`/`histos_names`/`inv_histos*` lists for later plotting.
        if pos <> None:
            k,j = pos
        else:
            # pos == None means the full-model F0-vs-F1 classifier.
            k,j = ('F0','F1')
        print 'Estimating {0} {1}'.format(k,j)
        for l,name in enumerate(['sig','bkg']):
            data = ROOT.RooDataSet('{0}data_{1}_{2}'.format(name,k,j),"data", ROOT.RooArgSet(s))
            hist = ROOT.TH1F('{0}hist_{1}_{2}'.format(name,k,j),'hist',bins,low,high)
            values = outputs[l]
            #values = values[self.findOutliers(values)]
            for val in values:
                hist.Fill(val)
                s.setVal(val)
                data.add(ROOT.RooArgSet(s))
            # Normalize the histogram to unit integral.
            norm = 1./hist.Integral()
            hist.Scale(norm)
            s.setBins(bins)
            datahist = ROOT.RooDataHist('{0}datahist_{1}_{2}'.format(name,k,j),'hist', ROOT.RooArgList(s),hist)
            #histpdf = ROOT.RooHistPdf('{0}histpdf_{1}_{2}'.format(name,k,j),'hist',
            #       ROOT.RooArgSet(s), datahist, 1)
            histpdf = ROOT.RooHistFunc('{0}histpdf_{1}_{2}'.format(name,k,j),'hist', ROOT.RooArgSet(s), datahist, 1)
            #histpdf.setUnitNorm(True)
            #testvalues = np.array([self.evalDist(ROOT.RooArgSet(s), histpdf, [xs]) for xs in values])
            #histpdf.specialIntegratorConfig(ROOT.kTRUE).method1D().setLabel('RooBinIntegrator')
            #print 'INTEGRAL'
            #print histpdf.createIntegral(ROOT.RooArgSet(s)).getVal()
            #print histpdf.Integral()
            #histpdf.specialIntegratorConfig(ROOT.kTRUE).method1D().setLabel('RooAdaptiveGaussKronrodIntegrator1D')
            getattr(w,'import')(hist)
            getattr(w,'import')(data)
            getattr(w,'import')(datahist) # work around for morph = w.import(morph)
            getattr(w,'import')(histpdf) # work around for morph = w.import(morph)
            score_str = 'scoref' if pos == None else 'score'
            # Calculate the density of the classifier output using kernel density
            #w.factory('KeysPdf::{0}dist_{1}_{2}({3},{0}data_{1}_{2},RooKeysPdf::NoMirror,2)'.format(name,k,j,score_str))
            # Print histograms pdfs and estimated densities
            if self.verbose_printing == True and name == 'bkg' and k <> j:
                full = 'full' if pos == None else 'dec'
                if k < j and k <> 'F0':
                    histos.append([w.function('sighistpdf_{0}_{1}'.format(k,j)), w.function('bkghistpdf_{0}_{1}'.format(k,j))])
                    histos_names.append(['f{0}-f{1}_f{1}(signal)'.format(k,j), 'f{0}-f{1}_f{0}(background)'.format(k,j)])
                if j < k and k <> 'F0':
                    inv_histos.append([w.function('sighistpdf_{0}_{1}'.format(k,j)), w.function('bkghistpdf_{0}_{1}'.format(k,j))])
                    inv_histos_names.append(['f{0}-f{1}_f{1}(signal)'.format(k,j), 'f{0}-f{1}_f{0}(background)'.format(k,j)])

    if self.scaler == None:
        self.scaler = {}

    # change this
    # Loop over all ordered sample pairs and histogram each pairwise
    # classifier's scores.
    for k in range(self.nsamples):
        for j in range(self.nsamples):
            if k == j:
                continue
            #if k <> 2 and j <> 2:
            #  continue
            if self.dataset_names <> None:
                name_k, name_j = (self.dataset_names[k], self.dataset_names[j])
            else:
                name_k, name_j = (k,j)
            print 'Loading {0}:{1} {2}:{3}'.format(k,name_k, j,name_j)
            traindata, targetdata = loadData(data_file,name_k,name_j,dir=self.dir,c1_g=self.c1_g, preprocessing=self.preprocessing,scaler=self.scaler,persist=True)
            numtrain = traindata.shape[0]
            size2 = traindata.shape[1] if len(traindata.shape) > 1 else 1
            #output = [predict('/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(self.model_file,k,j),traindata[targetdata == 1],model_g=self.model_g),
            #    predict('/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(self.model_file,k,j),traindata[targetdata == 0],model_g=self.model_g)]
            # Scores split by true label: [signal scores, background scores].
            output = [predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file,k,j),traindata[targetdata==1],model_g=self.model_g,clf=self.clf), predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file,k,j),traindata[targetdata==0],model_g=self.model_g,clf=self.clf)]
            saveHistos(w,output,s,bins,low,high,(k,j))
    #w.writeToFile('{0}/{1}'.format('/afs/cern.ch/work/j/jpavezse/private',self.workspace))
    w.writeToFile('{0}/{1}'.format(self.dir,self.workspace))
    if self.verbose_printing==True:
        # Plot the pairwise score densities in groups of three.
        for ind in range(1,(len(histos)/3+1)):
            print_histos = histos[(ind-1)*3:(ind-1)*3+3]
            print_histos_names = histos_names[(ind-1)*3:(ind-1)*3+3]
            printMultiFrame(w,['score']*len(print_histos),print_histos, makePlotName('dec{0}'.format(ind-1),'all',type='hist',dir=self.dir,c1_g=self.c1_g,model_g=self.model_g),print_histos_names, dir=self.dir,model_g=self.model_g,y_text='score(x)',print_pdf=True,title='Pairwise score distributions')
    # Full model
    traindata, targetdata = loadData(data_file,self.F0_dist,self.F1_dist,dir=self.dir,c1_g=self.c1_g, preprocessing=self.preprocessing, scaler=self.scaler)
    numtrain = traindata.shape[0]
    size2 = traindata.shape[1] if len(traindata.shape) > 1 else 1
    outputs = [predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file),traindata[targetdata==1],model_g=self.model_g,clf=self.clf), predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file),traindata[targetdata==0],model_g=self.model_g,clf=self.clf)]
    #outputs = [predict('/afs/cern.ch/work/j/jpavezse/private/{0}_F0_F1.pkl'.format(self.model_file),traindata[targetdata==1],model_g=self.model_g),
    #      predict('/afs/cern.ch/work/j/jpavezse/private/{0}_F0_F1.pkl'.format(self.model_file),traindata[targetdata==0],model_g=self.model_g)]
    saveHistos(w,outputs,s_full, bins_full, low_full, high_full,importance_sampling=False)
    if self.verbose_printing == True:
        printFrame(w,['scoref'],[w.function('sighistpdf_F0_F1'),w.function('bkghistpdf_F0_F1')], makePlotName('full','all',type='hist',dir=self.dir,c1_g=self.c1_g,model_g=self.model_g),['signal','bkg'], dir=self.dir,model_g=self.model_g,y_text='score(x)',print_pdf=True,title='Pairwise score distributions')
    #w.writeToFile('{0}/{1}'.format('/afs/cern.ch/work/j/jpavezse/private',self.workspace))
    w.writeToFile('{0}/{1}'.format(self.dir,self.workspace))

    w.Print()
for i in xrange(100): plt.subplot(10, 10, i) plt.axis("off") plt.imshow(self.W1.T[i, :].reshape(28, 28), cmap=cmap) plt.show() if i == 0: stop_time = time.clock() print "one single epoch runs %i minutes!" % ((stop_time - start_time) / 60.0) plt.plot(learning_curve_list) plt.show() if __name__ == "__main__": dataset = "mnist.pkl.gz" train_set, valid_set, test_set = loadData(dataset) train_x, train_y = train_set valid_x, valid_y = valid_set test_x, test_y = test_set print "the size of training set is:(%d,%d)" % train_x.shape n_sample, feature_size = train_x.shape n_hidden = 500 epochs = 100 '''lr = 0.1 batch_size = 20 corruption_level = 0.3 regularization = 0 print "initializing AutoEncoder......" dA = DenoisingAutoEncoder(feature_size,n_hidden)
import personality # from cogito import config? # how to access config.character? # eval all lines in cogito itself? # return s, sArgs instance # sArgs is a dummy class containing the needed things to complete # s has its insertions converted to sArgs.<item>, sArgs instance delivered along - .format via eval in cogito core? import FListAPI import random import utils lines = utils.loadData("EDI", dict, "./personalities/EDI/") def __init__(datapipe): print("\tEDI initialized. ") # datapipe.personality.lines = utils.loadData('bartender', '\personalities\bartender\\') class Functions: def JCH(FListProtocol, msgobj): if msgobj.source.character.name == "Jalon Renk": FListProtocol.reply("Space Husband Unit 'Jalon Renk' recognized. Welcome.", msgobj) def test(): print("\tEDI.py successfully called test()") def handle(FListProtocol, msg):
def trainClassifiers(clf,nsamples, model_g='mlp',c1_g='', dir='/afs/cern.ch/user/j/jpavezse/systematics', model_file='adaptive', dataset_names = None, full_names = None, data_file='train', preprocessing=False, seed=1234, index = None, vars_names = None ):
    '''
    Train classifiers pair-wise on datasets.

    clf           -- classifier object; must expose fit() (and, for the
                     non-mlp path, be picklable by joblib)
    nsamples      -- number of component datasets; one classifier is
                     trained per unordered pair (k, j) with k < j
    model_g       -- model family; 'mlp' uses clf.fit(save_file=...),
                     anything else uses sklearn-style fit + joblib.dump
    dataset_names -- optional names used to locate the data files
    full_names    -- optional (F0, F1) names for the full-model classifier
    preprocessing -- when True, an empty scaler dict is created, passed to
                     loadData and returned to the caller
    Returns the scaler dict (or None when preprocessing is off).
    '''
    print 'Training classifier'
    scaler=None
    if preprocessing == True:
        scaler = {}
    # Pairwise training: only pairs with k < j are trained.
    for k in range(nsamples):
        for j in range(nsamples):
            if k==j or k > j:
                continue
            if dataset_names <> None:
                name_k, name_j = (dataset_names[k], dataset_names[j])
            else:
                name_k, name_j = (k,j)
            print " Training Classifier on {0}/{1}".format(name_k,name_j)
            traindata,targetdata = loadData(data_file,name_k,name_j,dir=dir,c1_g=c1_g, preprocessing=preprocessing, scaler=scaler)
            if model_g == 'mlp':
                # The mlp wrapper persists itself via save_file.
                clf.fit(traindata, targetdata, save_file='{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(dir,model_g,c1_g,model_file,k,j))
            else:
                # Shuffle deterministically before fitting sklearn-style.
                rng = np.random.RandomState(seed)
                indices = rng.permutation(traindata.shape[0])
                traindata = traindata[indices]
                targetdata = targetdata[indices]
                #scores = cross_validation.cross_val_score(clf, traindata.reshape(traindata.shape[0],
                #traindata.shape[1]), targetdata)
                #print "Accuracy: {0} (+/- {1})".format(scores.mean(), scores.std() * 2)
                clf.fit(traindata.reshape(traindata.shape[0],traindata.shape[1]) ,targetdata)
                #joblib.dump(clf, '/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(model_file,k,j))
                joblib.dump(clf, '{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(dir,model_g,c1_g,model_file,k,j))
    # Full-model classifier: F0 vs F1 (or the provided full_names pair).
    print " Training Classifier on F0/F1"
    traindata,targetdata = loadData(data_file,'F0' if full_names == None else full_names[0], 'F1' if full_names == None else full_names[1],dir=dir,c1_g=c1_g)
    if model_g == 'mlp':
        clf.fit(traindata, targetdata, save_file='{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(dir,model_g,c1_g,model_file))
    else:
        rng = np.random.RandomState(seed)
        indices = rng.permutation(traindata.shape[0])
        traindata = traindata[indices]
        targetdata = targetdata[indices]
        #clf = svm.NuSVC(probability=True) #Why use a SVR??
        scores = cross_validation.cross_val_score(clf, traindata, targetdata)
        print "Accuracy: {0} (+/- {1})".format(scores.mean(), scores.std() * 2)
        clf.fit(traindata,targetdata)
        #clf.plot_importance_matrix(vars_names)
        #joblib.dump(clf, '/afs/cern.ch/work/j/jpavezse/private/{0}_F0_F1.pkl'.format(model_file))
        joblib.dump(clf, '{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(dir,model_g,c1_g,model_file))
    return scaler
import personality #from cogito import config? #how to access config.character? #eval all lines in cogito itself? #return s, sArgs instance #sArgs is a dummy class containing the needed things to complete #s has its insertions converted to sArgs.<item>, sArgs instance delivered along - .format via eval in cogito core? import FListAPI import random import utils lines = utils.loadData('bartender', dict, './personalities/bartender/') class Functions(): def JCH(FListProtocol, msg): line = eval(random.choice(lines['join'])) if random.random<0.6: FList.say("Bartender Personality JCH! Welcome, new user.") def telling(FListProtocol, msg): pass # messages = FListProtocol._telling(msg.source.character.name) def __init__(datapipe): print("\tBartender personality successfully loaded. HERE WE GO!") #datapipe.personality.lines = utils.loadData('bartender', '\personalities\bartender\\') def test(): print("\tBartender.py successfully called test()\n") def handle(FListProtocol, msg):