Example 1
def get_models(opt, h5=None):
    from models import evaluate, plot_roc, fapply, Mahalanobis, Momentum, FreqThresh, FreqBands
    from sklearn.preprocessing import Scaler
    from sklearn.decomposition import PCA
    from sklearn.mixture import GMM, DPGMM
    from sklearn.manifold import LocallyLinearEmbedding, Isomap
    from labeling import Labeler
    import re

    if not h5:
        h5 = H5Node(opt)
    samples = h5['samples']

    print(
        colorize(boldblue, green) * '#datasets found in database# %s:' %
        opt.database)
    datasets = []
    i = 0
    for k, sampl in samples.iteritems():
        if '.srate' not in sampl or '.wndsize' not in sampl:
            continue

        srate = scalar(sampl['.srate'])
        wndsize = scalar(sampl['.wndsize'])

        if (opt.srate and srate not in opt.srate) or (opt.window and wndsize not in opt.window):
            continue

        if opt.sample and not re.findall(opt.sample, k):
            continue

        print(
            colorize(boldyellow, green) * '[%d] %s : (srate=%f, wndsize=%d)' %
            (i, k, srate, wndsize))

        datasets.append((i, (k, sampl, srate, wndsize)))
        i += 1
    datasets = dict(datasets)

    if len(datasets) > 1:
        selected = []
        while not selected:
            s = raw_input('datasets to use:')
            selected = [datasets[int(i.strip())] for i in s.split(',')]
    else:
        selected = datasets.values()

    steps = {
        #'Scaler': fapply( Scaler ),
        'Bands': fapply(FreqBands, 2, 5, 10),
        #'BandsLg': fapply( FreqBands, 2,5,10, log_scale=True ),
        'Threshold': fapply(FreqThresh, 0),
        'Momentum': fapply(Momentum, 'vks'),
        #'GMM' : fapply( GMM, 1, 5, covariance_type='diag', n_iter=40 ),
        'DPGMM': fapply(DPGMM, covariance_type='diag', n_iter=40),
        'Mahal': fapply(Mahalanobis, False),
        'PCA': fapply(PCA, 1, 3),
        'PCA2': fapply(PCA),
        #'PCAw': fapply( PCA, 3, 10 , whiten=True )
    }
    if not opt.computations:
        opt.computations = [
            #('Bands', 'DPGMM'),
            ('Bands', 'Mahal'),
            #('BandsLg', 'DPGMM'),
            #('Threshold','DPGMM'),
            #('Threshold', 'Mahal'),
            ('Threshold', 'Momentum', 'Mahal'),
            #('Threshold','MomentumMVKS',  'DPGMM' ),
            ('Threshold', 'PCA', 'Mahal'),
            #('Threshold', 'PCA', 'DPGMM' ),
            #('Threshold', 'PCAw', 'DPGMM' )
        ]

    for k, sampl, srate, wndsize in selected:

        print('## processing %s' % k)

        if 'annot' not in sampl:
            labeler = Labeler(opt)
            labeler.prepare()
            labeler(sampl)

        fit, binarize = None, None
        #sampl, = [ h5[s] for s in ('/samples/data_psd_0.003300_200_simulated/', '/samples/data_psd_100.000000_200_simulated/') if s in h5 ]

        splitToInts = lambda x: [
            int(i) for i in (m.strip() for m in x.split(',') if isString(m))
            if i.isdigit()
        ]

        model = splitToInts(opt.model) if opt.model is not None else None
        legit = splitToInts(opt.legit) if opt.legit is not None else None
        malicious = splitToInts(
            opt.malicious) if opt.malicious is not None else None

        m, ((fit, binarize, classes), res) = evaluate(opt,
                                                      None,
                                                      sampl,
                                                      steps=steps,
                                                      model=model,
                                                      legit=legit,
                                                      malicious=malicious)
        plot_roc(res, 'ROC curves')

        if opt.tex:
            f = open(opt.tex, 'a')
            try:
                f.write('\n')
                f.write(r'''
\begin{table}[h]
    \begin{center}
        \begin{tabular}{c|cc}
            Method & $\overline{\mu_{auc}}$ & $\overline{\sigma_{auc}}$ \\ \hline

%s

        \end{tabular}
    \end{center}
    \caption{Mean and standard deviation of the area under ROC curve.}
\end{table}
''' % '\\\\ \hline\n'.join(
                    ('%s & %.3f & %.3f' %
                     (name.replace('_', '\_'), np.mean(auc), np.std(auc)))
                    for name, auc, _ in res))
                f.write('\n')
            finally:
                f.close()

        return m, ((fit, binarize, classes), res)
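
The steps dictionary above is built with a fapply helper imported from the project's models module, whose definition is not shown here. A minimal sketch consistent with how it is called (one estimator instance per positional parameter value, or a single default instance when none are given) might look like the following; the project's actual implementation may differ.

# Hypothetical sketch of the fapply helper used above: instantiate the
# estimator once per positional parameter value, forwarding keyword arguments.
def fapply(estimator_cls, *params, **kwargs):
    if not params:
        return [estimator_cls(**kwargs)]
    return [estimator_cls(p, **kwargs) for p in params]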
Example 2
def run_epoch(dataset, is_training, model, optimizer, batch_size, margin,
              save_path):
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              drop_last=True)
    losses = []
    all_ranked_labels = []
    if is_training:
        model.train()
    else:
        model.eval()
    requires_grad = False
    for batch in tqdm(data_loader):
        q_body = Variable(
            batch["q_body"],
            requires_grad=requires_grad)  # batch_size x truncate_length
        cand_bodies = Variable(batch["candidate_bodies"],
                               requires_grad=requires_grad
                               )  # batch_size x num_cands x truncate_length
        q_title = Variable(batch["q_title"], requires_grad=requires_grad)
        cand_titles = Variable(batch["candidate_titles"],
                               requires_grad=requires_grad)
        q_body_mask = Variable(
            batch["q_body_mask"],
            requires_grad=requires_grad)  # batch_size x truncate_length
        q_title_mask = Variable(batch["q_title_mask"],
                                requires_grad=requires_grad)
        cand_body_masks = Variable(
            batch["candidate_body_masks"], requires_grad=requires_grad
        )  # batch_size x num_cands x truncate_length
        cand_title_masks = Variable(batch["candidate_title_masks"],
                                    requires_grad=requires_grad)
        num_cands = cand_titles.size()[1]
        if is_training:
            optimizer.zero_grad()
        q_body_enc, q_title_enc = model(q_body, q_body_mask), model(
            q_title, q_title_mask)  # output is batch_size  x enc_length
        cand_body_encs = model(
            cand_bodies.view(
                batch_size * num_cands, TRUNCATE_LENGTH
            ),  # output is (batch_size x num_cands) x enc_length
            cand_body_masks.view(batch_size * num_cands, TRUNCATE_LENGTH))
        cand_title_encs = model(
            cand_titles.view(batch_size * num_cands, TRUNCATE_LENGTH),
            cand_title_masks.view(batch_size * num_cands, TRUNCATE_LENGTH))
        assert (not hasnan(q_body_enc)), q_body_enc
        assert (not hasnan(q_title_enc)), q_title_enc
        assert (not hasnan(cand_body_encs))
        assert (not hasnan(cand_title_encs))
        # average the title and body encodings
        q_enc = (q_title_enc + q_body_enc) / 2.0
        candidate_encs = (cand_title_encs + cand_body_encs) / 2.0
        enc_length = q_enc.size()[-1]
        #domain_predictions = domain_classifier(q_enc, candidate_ends)
        #loss(domain_predictions, target_predictions)
        #domain_optimizer.step()
        candidate_encs = candidate_encs.view(
            batch_size, num_cands, -1)  # batch_size x num_cands x enc_length
        query_encs = q_enc.view(batch_size, 1, -1).expand_as(
            candidate_encs)  # batch_size x (num_cands) x enc_length
        cos = torch.nn.CosineSimilarity(dim=2, eps=1e-08)(
            candidate_encs, query_encs)  # batch_size x (num_cands)
        assert (not hasnan(cos))
        if is_training:
            target = Variable(torch.zeros(batch_size).long(),
                              requires_grad=True)
            loss = torch.nn.MultiMarginLoss(margin=margin)(cos, target)
            #total_loss = loss - domain_loss
            #total_loss.backward()
            loss.backward()  #loss.backward(retain_graph=False)
            optimizer.step()
            losses.append(loss.cpu().data[0])
        else:
            # do evaluation stuff
            sorted_cos, ind = cos.sort(1, descending=True)
            labels = batch["labels"]
            for i in range(batch_size):
                all_ranked_labels.append(labels[i][ind.data[i]])
    if is_training:
        # save the model
        torch.save(model.state_dict(), save_path)
        avg_loss = np.mean(losses)
        return avg_loss
    else:
        return evaluate(all_ranked_labels)
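
The assertions in run_epoch rely on a hasnan helper defined elsewhere in the project. A minimal sketch of such a check for the old-style Variable tensors used here (an assumption, not the project's actual implementation) could be:

import numpy as np

# Hypothetical NaN check assumed by the assertions above: accepts either a
# Variable (via .data) or a plain tensor.
def hasnan(x):
    data = x.data if hasattr(x, 'data') else x
    return bool(np.isnan(data.cpu().numpy()).any())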
Example 3
def run_val(model, criterion, dataloader, accumulators, logger, writer, epoch,
            device, cfg):
    global best_acc
    global best_info
    dts_planes = []
    dts_lines = []
    gts_planes = []
    gts_lines = []
    for accumulator in accumulators:
        accumulator.reset()
    model.eval()
    for iters, inputs in enumerate(dataloader):
        # set device
        for key, value in inputs.items():
            inputs[key] = value.to(device)

        # forward
        x = model(inputs['img'])
        loss, loss_stats = criterion(x, **inputs)

        # post process
        # parse predict plane and line results
        dt_planes, dt_lines, dt_params3d, _ = post_process(x)
        # parse gt plane and line results to evaluate model roughly.
        gt_planes, gt_lines, gt_params3d = gt_check(inputs)
        # collect results
        dts_planes.extend(dt_planes)  # each img topk dt planes
        gts_planes.extend(gt_planes)
        # each img has a variable number of dt lines
        dts_lines.extend([dt[dt[:, 3] == 1] for dt in dt_lines])
        gts_lines.extend([gt[gt[:, 3] == 1] for gt in gt_lines])

        for i, (key, value) in enumerate(loss_stats.items()):
            if not torch.is_tensor(value):
                value = torch.tensor(value)
            accumulators[i].update(key, value.data)

    for accumulator in accumulators:
        writer.add_scalar('epoch/' + accumulator.name, accumulator.avg, epoch)

    # evaluate
    mAR_p, mAP_p, mAR_l, mAP_l = evaluate(dts_planes, dts_lines, gts_planes,
                                          gts_lines)
    writer.add_scalar('epoch/mAR_p', mAR_p, epoch)
    writer.add_scalar('epoch/mAP_p', mAP_p, epoch)
    writer.add_scalar('epoch/mAR_l', mAR_l, epoch)
    writer.add_scalar('epoch/mAP_l', mAP_l, epoch)

    # save model
    if epoch % 10 == 0:
        if not os.path.isdir(f'./checkpoints/checkpoints_{cfg.model_name}'):
            os.makedirs(f'./checkpoints/checkpoints_{cfg.model_name}')
        if cfg.num_gpus > 1:
            torch.save(
                model.module.state_dict(),
                f'./checkpoints/checkpoints_{cfg.model_name}/{epoch}.pt')
        else:
            torch.save(
                model.state_dict(),
                f'./checkpoints/checkpoints_{cfg.model_name}/{epoch}.pt')

    # save best model
    if (mAP_p + mAP_l) > best_acc:
        best_acc = mAP_p + mAP_l
        best_info = f'mAR_p:{mAR_p},mAP_p:{mAP_p},mAR_l:{mAR_l},mAP_l:{mAP_l},epoch:{epoch},best_acc:{best_acc}'
        if not os.path.isdir(f'./checkpoints/checkpoints_{cfg.model_name}'):
            os.makedirs(f'./checkpoints/checkpoints_{cfg.model_name}')
        if cfg.num_gpus > 1:
            torch.save(model.module.state_dict(),
                       f'./checkpoints/checkpoints_{cfg.model_name}/best.pt')
        else:
            torch.save(model.state_dict(),
                       f'./checkpoints/checkpoints_{cfg.model_name}/best.pt')
        logger.info(f'best_acc:{best_acc}, info:{best_info}')
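
The accumulators passed into run_val are only assumed to expose reset(), update(key, value), a name, and a running avg. A minimal running-average meter matching that interface (inferred from the usage above, not taken from the project) could be:

class AverageMeter:
    # Hypothetical running-average accumulator with the interface run_val expects.
    def __init__(self, name):
        self.name = name
        self.reset()

    def reset(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, key, value, n=1):
        # key mirrors the loss_stats key and is kept only for symmetry;
        # the running average is taken over the numeric value.
        self.sum += float(value) * n
        self.count += n
        self.avg = self.sum / self.count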
Example 4
def plot_contamination_figures(distribution_name):
    #### set up constants, parameters, and models
    delta = 0.05
    N = 20000
    dimensions = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
    # dimensions = [1,] #[16, 32, 64,] # 128, 256, 512]
    data_generators = {
        'LogNormal': LogNormal(3,
                               1,
                               contamination_level=0.05,
                               random_state=1001),
        'Burr': Burr(1.2, 10, contamination_level=0.05, random_state=1001)
    }

    model_names = [
        'EmpiricalMean', 'CatoniGiulini two-phase', 'CatoniGiulini one-phase',
        'Coordinate-wise truncated mean', 'HDMoM geometric median',
        'HDMoM coordinate-wise median'
    ]
    models = [
        EmpiricalMean(),
        CatoniGiulini(delta, True),
        CatoniGiulini(delta, False),
        CoordTruncMeans(delta),
        HDMoM(delta, True),
        HDMoM(delta, False)
    ]
    model2markers = {
        'EmpiricalMean': 'o',
        'CatoniGiulini two-phase': '+',
        'CatoniGiulini one-phase': '+',
        'Coordinate-wise truncated mean': '+',
        'HDMoM geometric median': 'x',
        'HDMoM coordinate-wise median': 'x'
    }
    experimental_results = {model: [] for model in model_names}

    #### conduct experiments
    if True:
        data_generator = data_generators[distribution_name]
        for model, model_name in tqdm(zip(models, model_names),
                                      total=len(models)):
            for D in dimensions:
                data_generator.reset()
                error = evaluate(data_generator, model, N, D)
                experimental_results[model_name].append(error)
        with open('./experiments/%sContaminationOut.json' % distribution_name,
                  'w') as f:
            json.dump(experimental_results, f)
    else:
        with open('./experiments/%sContaminationOut.json' % distribution_name,
                  'r') as f:
            experimental_results = json.load(f)

    #### plots
    ## plot all methods
    plt.rcParams["font.family"] = "Times New Roman"
    plt.xscale('log', base=2)
    colors = cm.rainbow(np.linspace(0, 1, len(models)))

    marker_size = 20

    scatters = []
    for i, model_name in enumerate(model_names):
        l_i = plt.scatter(dimensions,
                          experimental_results[model_name],
                          color=colors[i],
                          s=marker_size,
                          marker=model2markers[model_name])
        scatters.append(l_i)
    plt.legend(scatters, model_names, loc='upper left')
    plt.xlabel('Dimension')
    plt.ylabel('Error')
    if distribution_name == 'LogNormal':
        plt.ylim(0, 50)
    plt.savefig('./figures/%sContaminationAll.pdf' % distribution_name,
                dpi=300)
    plt.cla()

    ## plot only trimmed mean based methods
    plt.rcParams["font.family"] = "Times New Roman"
    plt.xscale('log', base=2)
    scatters = []
    TrimmedMeanMethods = [
        'EmpiricalMean', 'CatoniGiulini two-phase', 'CatoniGiulini one-phase',
        'Coordinate-wise truncated mean'
    ]
    for i, model_name in enumerate(TrimmedMeanMethods):
        l_i = plt.scatter(dimensions,
                          experimental_results[model_name],
                          color=colors[i],
                          s=marker_size,
                          marker=model2markers[model_name])
        scatters.append(l_i)

    plt.legend(scatters, TrimmedMeanMethods, loc='upper left')
    plt.xlabel('Dimension')
    plt.ylabel('Error')
    if distribution_name == 'LogNormal':
        plt.ylim(0, 50)
    plt.savefig('./figures/%sContaminationTrimmed.pdf' % distribution_name,
                dpi=300)
    plt.cla()

    ## plot only MoM based methods
    plt.rcParams["font.family"] = "Times New Roman"
    plt.xscale('log', base=2)
    scatters = []
    HDMoMMethods = [
        'EmpiricalMean', 'HDMoM geometric median',
        'HDMoM coordinate-wise median'
    ]
    for i, model_name in enumerate(HDMoMMethods):
        i += 3
        l_i = plt.scatter(dimensions,
                          experimental_results[model_name],
                          color=colors[i],
                          s=marker_size,
                          marker=model2markers[model_name])
        scatters.append(l_i)

    plt.legend(scatters, HDMoMMethods, loc='upper left')
    plt.xlabel('Dimension')
    plt.ylabel('Error')
    if distribution_name == 'LogNormal':
        plt.ylim(0, 50)
    plt.savefig('./figures/%sContaminationMoM.pdf' % distribution_name,
                dpi=300)
    plt.cla()
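
The per-dimension error comes from an evaluate helper that is not shown. For robust mean-estimation experiments like these, a common setup is to draw N contaminated D-dimensional samples, estimate the mean, and report the Euclidean distance to the true mean; a sketch under that assumption (the generator and estimator method names here are hypothetical) is:

import numpy as np

# Hypothetical evaluate helper: fit the estimator on N samples of dimension D
# and report the L2 distance between the estimate and the true mean.
def evaluate(data_generator, model, N, D):
    X = data_generator.sample(N, D)            # assumed generator API
    estimate = model.estimate_mean(X)          # assumed estimator API
    return float(np.linalg.norm(estimate - data_generator.true_mean(D)))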
Example 5
            holdout_to_save.to_csv("blended_holdout_data.csv")

            test_data.to_csv("blended_test_data.csv")

    ## final steps
    # reinstantiate
    model = md._ESTIMATORS_META_[_MAIN_ESTIMATOR_]()
    err = md.fit_model(model, train_data, train_labels)
    print "###############################################"
    print "MODEL:", model
    print "Trianing error rate:", err
    print "###############################################"

    if _BLENDING_ or _HOLDOUT_:
        holdout_preds = model.predict(holdout)
        holdout_err = 1 - md.evaluate(holdout_preds, holdout_labels.ravel())
        print "###############################################"
        print "Holdout error rate:", holdout_err
        print "###############################################"

        ## now re-instantiate and train on concatenated holdout + train
        train_data = pd.concat([train_data, holdout], axis=0)
        train_labels = np.concatenate([train_labels, holdout_labels], axis=0)

        model = md._ESTIMATORS_META_[_MAIN_ESTIMATOR_]()
        md.fit_model(model, train_data, train_labels)

    preds = model.predict(test_data)
    preds_df = pd.DataFrame(preds)
    preds_df.index = preds_df.index + 1
    #preds.columns = ['Id', 'Prediction']
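
The fragment ends before the predictions are written out. Given the commented column hint and the 1-based index set above, a plausible closing step (an assumption, not part of the original script; the filename is illustrative) would be:

# Hypothetical final step: name the prediction column and write a submission
# file keyed by the 1-based Id index.
preds_df.columns = ['Prediction']
preds_df.to_csv('submission.csv', index_label='Id')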