def get_models(opt, h5=None):
    """Interactively select PSD sample datasets from the HDF5 store, fit the
    configured model pipelines on each, plot ROC curves, and optionally append
    an AUC summary table to a LaTeX file.

    Parameters
    ----------
    opt : options object with attributes database, srate, window, sample,
        computations, model, legit, malicious, tex (presumably argparse-like;
        TODO confirm against caller).
    h5 : open H5Node, or falsy to open one from ``opt``.

    Returns
    -------
    m, ((fit, binarize, classes), res) — result of the LAST processed dataset
    only; earlier datasets' models are discarded (NOTE(review): confirm this
    is intended when several datasets are selected).
    """
    from models import evaluate, plot_roc, fapply, Mahalanobis, Momentum, FreqThresh, FreqBands
    from sklearn.preprocessing import Scaler
    from sklearn.decomposition import PCA
    from sklearn.mixture import GMM, DPGMM
    from sklearn.manifold import LocallyLinearEmbedding, Isomap
    from labeling import Labeler
    import re
    if not h5:
        h5 = H5Node(opt)
    samples = h5['samples']
    print(colorize(boldblue, green) * '#datasets found in database# %s:' % opt.database)
    # Enumerate datasets that carry sampling metadata and pass the
    # srate/window/name filters from the command line.
    datasets = []
    i = 0
    for k, sampl in samples.iteritems():
        if '.srate' not in sampl or '.wndsize' not in sampl:
            continue
        srate = scalar(sampl['.srate'])
        wndsize = scalar(sampl['.wndsize'])
        # Precedence is (opt.srate and ...) or (opt.window and ...): a dataset
        # is skipped when it fails either enabled filter.
        if opt.srate and srate not in opt.srate or opt.window and wndsize not in opt.window:
            continue
        if opt.sample and not re.findall(opt.sample, k):
            continue
        print(colorize(boldyellow, green) * '[%d] %s : (srate=%f, wndsize=%d)' % (i, k, srate, wndsize))
        datasets.append((i, (k, sampl, srate, wndsize)))
        i += 1
    datasets = dict(datasets)
    if len(datasets) > 1:
        # More than one match: ask the user for a comma-separated index list.
        selected = []
        while not selected:
            s = raw_input('datasets to use:')
            selected = [datasets[int(i.strip())] for i in s.split(',')]
    else:
        selected = datasets.values()
    # Candidate pipeline steps; fapply presumably builds one-or-more step
    # instances per argument set — TODO confirm in models.fapply.
    steps = {
        #'Scaler': fapply( Scaler ),
        'Bands': fapply(FreqBands, 2, 5, 10),
        #'BandsLg': fapply( FreqBands, 2,5,10, log_scale=True ),
        'Threshold': fapply(FreqThresh, 0),
        'Momentum': fapply(Momentum, 'vks'),
        #'GMM' : fapply( GMM, 1, 5, covariance_type='diag', n_iter=40 ),
        'DPGMM': fapply(DPGMM, covariance_type='diag', n_iter=40),
        'Mahal': fapply(Mahalanobis, False),
        'PCA': fapply(PCA, 1, 3),
        'PCA2': fapply(PCA),
        #'PCAw': fapply( PCA, 3, 10 , whiten=True )
    }
    if not opt.computations:
        # Default step chains to evaluate when none were requested.
        opt.computations = [
            #('Bands', 'DPGMM'),
            ('Bands', 'Mahal'),
            #('BandsLg', 'DPGMM'),
            #('Threshold','DPGMM'),
            #('Threshold', 'Mahal'),
            ('Threshold', 'Momentum', 'Mahal'),
            #('Threshold','MomentumMVKS', 'DPGMM' ),
            ('Threshold', 'PCA', 'Mahal'),
            #('Threshold', 'PCA', 'DPGMM' ),
            #('Threshold', 'PCAw', 'DPGMM' )
        ]
    for k, sampl, srate, wndsize in selected:
        print('## processing %s' % k)
        # Make sure the sample is annotated before evaluation.
        if not 'annot' in sampl:
            labeler = Labeler(opt)
            labeler.prepare()
            labeler(sampl)
        fit, binarize = None, None
        #sampl, = [ h5[s] for s in ('/samples/data_psd_0.003300_200_simulated/', '/samples/data_psd_100.000000_200_simulated/') if s in h5 ]
        # Parse comma-separated integer id lists from the options.
        splitToInts = lambda x: [
            int(i) for i in (m.strip() for m in x.split(',') if isString(m))
            if i.isdigit()
        ]
        model = splitToInts(opt.model) if opt.model is not None else None
        legit = splitToInts(opt.legit) if opt.legit is not None else None
        malicious = splitToInts(opt.malicious) if opt.malicious is not None else None
        m, ((fit, binarize, classes), res) = evaluate(opt, None, sampl, steps=steps, model=model, legit=legit, malicious=malicious)
        plot_roc(res, 'ROC curves')
        if opt.tex:
            # Append a LaTeX table with mean/std AUC per method.
            f = open(opt.tex, 'a')
            try:
                f.write('\n')
                f.write(r'''
\begin{table}[h]
\begin{center}
\begin{tabular}{c|cc}
Method & $\overline{\mu_{auc}}$ & $\overline{\sigma_{auc}}$ \\ \hline
%s
\end{tabular}
\end{center}
\caption{Mean and standard deviation of the area under ROC curve.}
\end{table}
''' % '\\\\ \hline\n'.join(
                    ('%s & %.3f & %.3f' %
                     (name.replace('_', '\_'), np.mean(auc), np.std(auc)))
                    for name, auc, _ in res))
                f.write('\n')
            finally:
                f.close()
    return m, ((fit, binarize, classes), res)
def run_epoch(dataset, is_training, model, optimizer, batch_size, margin, save_path):
    """Run one training or evaluation epoch of a question-retrieval encoder.

    Each batch holds a query (title + body) and ``num_cands`` candidate
    questions. Titles and bodies are encoded separately by ``model`` and
    averaged into a single encoding; candidates are scored against the query
    with cosine similarity.

    Args:
        dataset: torch Dataset yielding dicts with keys ``q_body``,
            ``q_title``, ``candidate_bodies``, ``candidate_titles``, the
            matching ``*_mask`` entries, and — for evaluation — ``labels``.
        is_training: train (optimize + checkpoint) when True, else evaluate.
        model: encoder called as ``model(tokens, mask)`` returning
            ``batch x enc_length`` encodings.
        optimizer: optimizer stepped once per batch during training.
        batch_size: loader batch size (``drop_last=True`` keeps batches full).
        margin: margin for ``MultiMarginLoss``.
        save_path: file path for the post-epoch ``state_dict`` checkpoint.

    Returns:
        Mean batch loss (training) or ``evaluate(all_ranked_labels)`` (eval).
    """
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              drop_last=True)
    losses = []
    all_ranked_labels = []
    if is_training:
        model.train()
    else:
        model.eval()
    requires_grad = False  # inputs never need gradients; only model params do
    for batch in tqdm(data_loader):
        # Queries: batch_size x truncate_length.
        q_body = Variable(batch["q_body"], requires_grad=requires_grad)
        q_title = Variable(batch["q_title"], requires_grad=requires_grad)
        q_body_mask = Variable(batch["q_body_mask"], requires_grad=requires_grad)
        q_title_mask = Variable(batch["q_title_mask"], requires_grad=requires_grad)
        # Candidates: batch_size x num_cands x truncate_length.
        cand_bodies = Variable(batch["candidate_bodies"], requires_grad=requires_grad)
        cand_titles = Variable(batch["candidate_titles"], requires_grad=requires_grad)
        cand_body_masks = Variable(batch["candidate_body_masks"], requires_grad=requires_grad)
        cand_title_masks = Variable(batch["candidate_title_masks"], requires_grad=requires_grad)
        num_cands = cand_titles.size()[1]
        if is_training:
            optimizer.zero_grad()
        # Encode query; each output is batch_size x enc_length.
        q_body_enc, q_title_enc = model(q_body, q_body_mask), model(q_title, q_title_mask)
        # Encode candidates flattened to (batch_size * num_cands) rows.
        cand_body_encs = model(
            cand_bodies.view(batch_size * num_cands, TRUNCATE_LENGTH),
            cand_body_masks.view(batch_size * num_cands, TRUNCATE_LENGTH))
        cand_title_encs = model(
            cand_titles.view(batch_size * num_cands, TRUNCATE_LENGTH),
            cand_title_masks.view(batch_size * num_cands, TRUNCATE_LENGTH))
        assert (not hasnan(q_body_enc)), q_body_enc
        assert (not hasnan(q_title_enc)), q_title_enc
        assert (not hasnan(cand_body_encs))
        assert (not hasnan(cand_title_encs))
        # BUGFIX: parenthesized so these are the MEAN of title and body
        # encodings; the original `a + b / 2.0` computed a + (b / 2.0) due to
        # operator precedence.
        q_enc = (q_title_enc + q_body_enc) / 2.0
        candidate_encs = (cand_title_encs + cand_body_encs) / 2.0
        candidate_encs = candidate_encs.view(
            batch_size, num_cands, -1)  # batch_size x num_cands x enc_length
        query_encs = q_enc.view(batch_size, 1, -1).expand_as(
            candidate_encs)  # broadcast query to every candidate slot
        cos = torch.nn.CosineSimilarity(dim=2, eps=1e-08)(
            candidate_encs, query_encs)  # batch_size x num_cands
        assert (not hasnan(cos))
        if is_training:
            # The positive candidate sits at index 0 (target is all zeros).
            # BUGFIX: class-index targets must not require gradients; the
            # original requires_grad=True is wrong for an integer tensor and
            # rejected by modern PyTorch.
            target = Variable(torch.zeros(batch_size).long(), requires_grad=False)
            loss = torch.nn.MultiMarginLoss(margin=margin)(cos, target)
            loss.backward()
            optimizer.step()
            losses.append(loss.cpu().data[0])
        else:
            # Evaluation: reorder each row's labels by descending similarity.
            sorted_cos, ind = cos.sort(1, descending=True)
            labels = batch["labels"]
            for i in range(batch_size):
                all_ranked_labels.append(labels[i][ind.data[i]])
    if is_training:
        # Checkpoint once per epoch, then report the mean batch loss.
        torch.save(model.state_dict(), save_path)
        return np.mean(losses)
    else:
        return evaluate(all_ranked_labels)
def run_val(model, criterion, dataloader, accumulators, logger, writer, epoch, device, cfg):
    """Validate the plane/line detection model for one epoch.

    Runs the model over ``dataloader``, accumulates loss statistics, computes
    mAR/mAP for planes and lines, logs everything to TensorBoard, checkpoints
    every 10th epoch, and saves a ``best.pt`` whenever the combined plane+line
    mAP improves on the global ``best_acc``.

    Side effects: mutates the module-level globals ``best_acc``/``best_info``,
    writes checkpoints under ``./checkpoints/checkpoints_{cfg.model_name}/``,
    and emits scalars through ``writer``.
    """
    global best_acc
    global best_info
    dts_planes = []
    dts_lines = []
    gts_planes = []
    gts_lines = []
    for accumulator in accumulators:
        accumulator.reset()
    model.eval()
    # NOTE(review): no torch.no_grad() here — gradients are still tracked
    # during validation; confirm whether that is intended.
    for iters, inputs in enumerate(dataloader):
        # set device
        for key, value in inputs.items():
            inputs[key] = value.to(device)
        # forward
        x = model(inputs['img'])
        loss, loss_stats = criterion(x, **inputs)
        # post process
        # parse predict plane and line results
        dt_planes, dt_lines, dt_params3d, _ = post_process(x)
        # parse gt plane and line results to evaluate model roughly.
        gt_planes, gt_lines, gt_params3d = gt_check(inputs)
        # collect results
        dts_planes.extend(dt_planes)  # each img topk dt planes
        gts_planes.extend(gt_planes)
        # Column 3 presumably flags valid lines — TODO confirm in post_process.
        dts_lines.extend([dt[dt[:, 3] == 1] for dt in dt_lines
                          ])  # each img has variable number of dt lines
        gts_lines.extend([gt[gt[:, 3] == 1] for gt in gt_lines])
        # NOTE(review): accumulators are paired with loss_stats positionally;
        # this relies on loss_stats preserving a stable key order.
        for i, (key, value) in enumerate(loss_stats.items()):
            if not torch.is_tensor(value):
                value = torch.tensor(value)
            accumulators[i].update(key, value.data)
    for accumulator in accumulators:
        writer.add_scalar('epoch/' + accumulator.name, accumulator.avg, epoch)
    # evaluate
    mAR_p, mAP_p, mAR_l, mAP_l = evaluate(dts_planes, dts_lines, gts_planes,
                                          gts_lines)
    writer.add_scalar('epoch/mAR_p', mAR_p, epoch)
    writer.add_scalar('epoch/mAP_p', mAP_p, epoch)
    writer.add_scalar('epoch/mAR_l', mAR_l, epoch)
    writer.add_scalar('epoch/mAP_l', mAP_l, epoch)
    # save model (periodic checkpoint every 10 epochs)
    if epoch % 10 == 0:
        if not os.path.isdir(f'./checkpoints/checkpoints_{cfg.model_name}'):
            os.makedirs(f'./checkpoints/checkpoints_{cfg.model_name}')
        # Unwrap DataParallel/DDP wrapper when training on multiple GPUs.
        if cfg.num_gpus > 1:
            torch.save(
                model.module.state_dict(),
                f'./checkpoints/checkpoints_{cfg.model_name}/{epoch}.pt')
        else:
            torch.save(
                model.state_dict(),
                f'./checkpoints/checkpoints_{cfg.model_name}/{epoch}.pt')
    # save best model (criterion: combined plane + line mAP)
    if (mAP_p + mAP_l) > best_acc:
        best_acc = mAP_p + mAP_l
        best_info = f'mAR_p:{mAR_p},mAP_p:{mAP_p},mAR_l:{mAR_l},mAP_l:{mAP_l},epoch:{epoch},best_acc:{best_acc}'
        if not os.path.isdir(f'./checkpoints/checkpoints_{cfg.model_name}'):
            os.makedirs(f'./checkpoints/checkpoints_{cfg.model_name}')
        if cfg.num_gpus > 1:
            torch.save(model.module.state_dict(),
                       f'./checkpoints/checkpoints_{cfg.model_name}/best.pt')
        else:
            torch.save(model.state_dict(),
                       f'./checkpoints/checkpoints_{cfg.model_name}/best.pt')
    logger.info(f'best_acc:{best_acc}, info:{best_info}')
def get_models(opt, h5=None):
    """Interactively select PSD sample datasets, fit model pipelines, plot ROC
    curves, and optionally append an AUC table to a LaTeX file.

    NOTE(review): this is a byte-for-byte duplicate redefinition of the
    get_models appearing earlier in this file; at import time this later
    definition silently shadows the earlier one. One of the two copies should
    be removed.
    """
    from models import evaluate, plot_roc, fapply, Mahalanobis, Momentum, FreqThresh, FreqBands
    from sklearn.preprocessing import Scaler
    from sklearn.decomposition import PCA
    from sklearn.mixture import GMM, DPGMM
    from sklearn.manifold import LocallyLinearEmbedding, Isomap
    from labeling import Labeler
    import re
    if not h5:
        h5 = H5Node(opt)
    samples = h5['samples']
    print(colorize(boldblue, green) * '#datasets found in database# %s:' % opt.database)
    # List datasets carrying sampling metadata that pass the CLI filters.
    datasets = []
    i = 0
    for k, sampl in samples.iteritems():
        if '.srate' not in sampl or '.wndsize' not in sampl:
            continue
        srate = scalar(sampl['.srate'])
        wndsize = scalar(sampl['.wndsize'])
        if opt.srate and srate not in opt.srate or opt.window and wndsize not in opt.window:
            continue
        if opt.sample and not re.findall(opt.sample, k):
            continue
        print(colorize(boldyellow, green) * '[%d] %s : (srate=%f, wndsize=%d)' % (i, k, srate, wndsize))
        datasets.append((i, (k, sampl, srate, wndsize)))
        i += 1
    datasets = dict(datasets)
    if len(datasets) > 1:
        # Ask the user which dataset indices to process.
        selected = []
        while not selected:
            s = raw_input('datasets to use:')
            selected = [datasets[int(i.strip())] for i in s.split(',')]
    else:
        selected = datasets.values()
    # Candidate pipeline steps (commented-out entries are disabled variants).
    steps = {
        #'Scaler': fapply( Scaler ),
        'Bands': fapply(FreqBands, 2, 5, 10),
        #'BandsLg': fapply( FreqBands, 2,5,10, log_scale=True ),
        'Threshold': fapply(FreqThresh, 0),
        'Momentum': fapply(Momentum, 'vks'),
        #'GMM' : fapply( GMM, 1, 5, covariance_type='diag', n_iter=40 ),
        'DPGMM': fapply(DPGMM, covariance_type='diag', n_iter=40),
        'Mahal': fapply(Mahalanobis, False),
        'PCA': fapply(PCA, 1, 3),
        'PCA2': fapply(PCA),
        #'PCAw': fapply( PCA, 3, 10 , whiten=True )
    }
    if not opt.computations:
        opt.computations = [
            #('Bands', 'DPGMM'),
            ('Bands', 'Mahal'),
            #('BandsLg', 'DPGMM'),
            #('Threshold','DPGMM'),
            #('Threshold', 'Mahal'),
            ('Threshold', 'Momentum', 'Mahal'),
            #('Threshold','MomentumMVKS', 'DPGMM' ),
            ('Threshold', 'PCA', 'Mahal'),
            #('Threshold', 'PCA', 'DPGMM' ),
            #('Threshold', 'PCAw', 'DPGMM' )
        ]
    for k, sampl, srate, wndsize in selected:
        print('## processing %s' % k)
        # Annotate the sample first if it has no labels yet.
        if not 'annot' in sampl:
            labeler = Labeler(opt)
            labeler.prepare()
            labeler(sampl)
        fit, binarize = None, None
        #sampl, = [ h5[s] for s in ('/samples/data_psd_0.003300_200_simulated/', '/samples/data_psd_100.000000_200_simulated/') if s in h5 ]
        splitToInts = lambda x: [
            int(i) for i in (m.strip() for m in x.split(',') if isString(m))
            if i.isdigit()
        ]
        model = splitToInts(opt.model) if opt.model is not None else None
        legit = splitToInts(opt.legit) if opt.legit is not None else None
        malicious = splitToInts(opt.malicious) if opt.malicious is not None else None
        m, ((fit, binarize, classes), res) = evaluate(opt, None, sampl, steps=steps, model=model, legit=legit, malicious=malicious)
        plot_roc(res, 'ROC curves')
        if opt.tex:
            # Append mean/std AUC per method as a LaTeX table.
            f = open(opt.tex, 'a')
            try:
                f.write('\n')
                f.write(r'''
\begin{table}[h]
\begin{center}
\begin{tabular}{c|cc}
Method & $\overline{\mu_{auc}}$ & $\overline{\sigma_{auc}}$ \\ \hline
%s
\end{tabular}
\end{center}
\caption{Mean and standard deviation of the area under ROC curve.}
\end{table}
''' % '\\\\ \hline\n'.join(
                    ('%s & %.3f & %.3f' %
                     (name.replace('_', '\_'), np.mean(auc), np.std(auc)))
                    for name, auc, _ in res))
                f.write('\n')
            finally:
                f.close()
    return m, ((fit, binarize, classes), res)
def plot_contamination_figures(distribution_name, recompute=True):
    """Benchmark robust mean estimators under contamination and plot errors.

    Evaluates each estimator on contaminated samples of increasing dimension,
    stores/loads results as JSON, and renders three scatter plots (all
    methods, trimmed-mean methods, MoM methods).

    Args:
        distribution_name: 'LogNormal' or 'Burr' — selects the data generator.
        recompute: when True (default, original behavior — replaces a
            leftover hard-coded ``if True:`` toggle) run the experiments and
            write the JSON; when False load previously saved results instead.

    Side effects: writes ./experiments/<name>ContaminationOut.json and three
    PDFs under ./figures/.
    """
    #### set up constants, parameters, and models
    delta = 0.05
    N = 20000
    dimensions = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
    data_generators = {
        'LogNormal': LogNormal(3, 1, contamination_level=0.05, random_state=1001),
        'Burr': Burr(1.2, 10, contamination_level=0.05, random_state=1001)
    }
    model_names = [
        'EmpiricalMean', 'CatoniGiulini two-phase', 'CatoniGiulini one-phase',
        'Coordinate-wise truncated mean', 'HDMoM geometric median',
        'HDMoM coordinative-wise median'
    ]
    models = [
        EmpiricalMean(),
        CatoniGiulini(delta, True),
        CatoniGiulini(delta, False),
        CoordTruncMeans(delta),
        HDMoM(delta, True),
        HDMoM(delta, False)
    ]
    model2markers = {
        'EmpiricalMean': 'o',
        'CatoniGiulini two-phase': '+',
        'CatoniGiulini one-phase': '+',
        'Coordinate-wise truncated mean': '+',
        'HDMoM geometric median': 'x',
        'HDMoM coordinative-wise median': 'x'
    }
    experimental_results = {model: [] for model in model_names}
    results_path = './experiments/%sContaminationOut.json' % distribution_name

    #### conduct experiments (or reload cached results)
    if recompute:
        data_generator = data_generators[distribution_name]
        for model, model_name in tqdm(zip(models, model_names), total=len(models)):
            for D in dimensions:
                data_generator.reset()  # same random stream for every estimator
                error = evaluate(data_generator, model, N, D)
                experimental_results[model_name].append(error)
        with open(results_path, 'w') as f:
            json.dump(experimental_results, f)
    else:
        with open(results_path, 'r') as f:
            experimental_results = json.load(f)

    #### plots
    colors = cm.rainbow(np.linspace(0, 1, len(models)))
    marker_size = 20

    def _scatter_plot(names, color_offset, outfile):
        """Scatter error-vs-dimension for `names`, coloring names[i] with
        colors[i + color_offset], and save to `outfile`."""
        plt.rcParams["font.family"] = "Times New Roman"
        plt.xscale('log', base=2)
        scatters = []
        for i, model_name in enumerate(names):
            handle = plt.scatter(dimensions,
                                 experimental_results[model_name],
                                 color=colors[i + color_offset],
                                 s=marker_size,
                                 marker=model2markers[model_name])
            scatters.append(handle)
        plt.legend(scatters, names, loc='upper left')
        plt.xlabel('Dimension')
        plt.ylabel('Error')
        if distribution_name == 'LogNormal':
            plt.ylim(0, 50)
        plt.savefig(outfile, dpi=300)
        plt.cla()

    ## plot all methods
    _scatter_plot(model_names, 0,
                  './figures/%sContaminationAll.pdf' % distribution_name)
    ## plot only trimmed mean based methods
    _scatter_plot([
        'EmpiricalMean', 'CatoniGiulini two-phase', 'CatoniGiulini one-phase',
        'Coordinate-wise truncated mean'
    ], 0, './figures/%sContaminationTrimmed.pdf' % distribution_name)
    ## plot only MoM based methods
    # NOTE(review): offset 3 preserves the original `i += 3` color shift,
    # which also recolors EmpiricalMean differently than in the other two
    # plots — confirm whether that inconsistency is intended.
    _scatter_plot([
        'EmpiricalMean', 'HDMoM geometric median',
        'HDMoM coordinative-wise median'
    ], 3, './figures/%sContaminationMoM.pdf' % distribution_name)
holdout_to_save.to_csv("blended_holdout_data.csv") test_data.to_csv("blended_test_data.csv") ## final steps # reinstantiate model = md._ESTIMATORS_META_[_MAIN_ESTIMATOR_]() err = md.fit_model(model, train_data, train_labels) print "###############################################" print "MODEL:", model print "Trianing error rate:", err print "###############################################" if _BLENDING_ or _HOLDOUT_: holdout_preds = model.predict(holdout) holdout_acc = 1 - md.evaluate(holdout_preds, holdout_labels.ravel()) print "###############################################" print "Holdout error rate:", holdout_acc print "###############################################" ## now re-instantiate and train on concatenated holdout + train train_data = pd.concat([train_data, holdout], axis=0) train_labels = np.concatenate([train_labels, holdout_labels], axis=0) model = md._ESTIMATORS_META_[_MAIN_ESTIMATOR_]() md.fit_model(model, train_data, train_labels) preds = model.predict(test_data) preds_df = pd.DataFrame(preds) preds_df.index = preds_df.index + 1 #preds.columns = ['Id', 'Prediction']