Ejemplo n.º 1
0
def annotate(opt, h5=None):
    """Run the labeler over every matching sample set of the database.

    A ``Labeler`` is built from ``opt`` and prepared, then called once for
    each entry of ``h5['samples']`` that has both a ``'.srate'`` and a
    ``'.wndsize'`` member and passes the optional filters below.

    Args:
        opt: Options object. ``opt.srate`` and ``opt.window`` are read as
            containers of accepted ``'.srate'`` / ``'.wndsize'`` values; a
            falsy value disables the corresponding filter.
        h5: Database node supporting ``in`` and item access. When ``None``
            it is opened with ``H5Node(opt)`` instead of failing on the
            membership test.

    Returns:
        None. Returns early when the database has no ``'samples'`` entry.
    """
    from labeling import Labeler
    labeler = Labeler(opt)
    labeler.prepare()

    # The default h5=None used to raise TypeError on the `in` test below.
    if h5 is None:
        h5 = H5Node(opt)
    if 'samples' not in h5:
        return

    for k, sampl in h5['samples'].iteritems():
        # Sample sets without both metadata members cannot be filtered.
        if '.srate' not in sampl or '.wndsize' not in sampl:
            continue

        srate = scalar(sampl['.srate'])
        wndsize = scalar(sampl['.wndsize'])

        # Each filter only applies when the matching option is set.
        if opt.srate and srate not in opt.srate:
            continue
        if opt.window and wndsize not in opt.window:
            continue

        print(
            colorize(None, boldblue, green) *
            '\n\n## #labeling sampleset# %s' % k)
        labeler(sampl)
Ejemplo n.º 2
0
def annotate(opt, h5=None):
    """Create annotations by running the labeler on each selected sample set.

    Builds and prepares a ``Labeler`` from ``opt``, then iterates over
    ``h5['samples']`` and calls the labeler on every sample set that carries
    ``'.srate'`` and ``'.wndsize'`` members and is accepted by the optional
    ``opt.srate`` / ``opt.window`` filters.

    Args:
        opt: Options object; ``opt.srate`` and ``opt.window`` are containers
            of accepted values, ignored when falsy.
        h5: Database node supporting ``in`` and item access. ``None`` (the
            default) makes the function open one with ``H5Node(opt)``.

    Returns:
        None. Nothing is labeled when the node has no ``'samples'`` entry.
    """
    from labeling import Labeler
    labeler = Labeler(opt)
    labeler.prepare()

    if h5 is None:
        # Without this fallback the membership test below raises TypeError
        # whenever the caller relies on the default argument.
        h5 = H5Node(opt)

    if 'samples' not in h5:
        return
    samples = h5['samples']

    for k, sampl in samples.iteritems():
        has_metadata = '.srate' in sampl and '.wndsize' in sampl
        if not has_metadata:
            continue

        srate = scalar(sampl['.srate'])
        wndsize = scalar(sampl['.wndsize'])

        # A filter rejects the sample set only when it is configured.
        rejected_by_srate = opt.srate and srate not in opt.srate
        if rejected_by_srate or opt.window and wndsize not in opt.window:
            continue

        print(colorize(None, boldblue, green) * '\n\n## #labeling sampleset# %s' % k)
        labeler(sampl)
Ejemplo n.º 3
0
    def partition(data, device, train_size=0.8):
        """Static partition method.

        Performs partition by "plucking" (1 - train_size) * n messages out of
        a deep copy of the provided data set and placing them in a new test
        set. Both sets keep the relative ordering of the original data. Each
        set is then labeled, rescaled and converted to tensors.

        Args:
            data (list): The original formatted BGP data. It is deep-copied,
                so the list passed in is not modified by this method.
            device (str): The device on which the tensors should be
                created/stored.
            train_size (float): The proportion of the data to use as a
                training set.

        Returns:
            A 4-tuple ``(Xtrain, Ttrain, Xtest, Ttest)``: the input tensors
            (``torch.double``, built from each message's ``'time'`` followed
            by the values of its ``'composite'`` mapping) and the target
            tensors (``torch.long``, built from ``'distinct'``) of the
            training and test sets.

        Raises:
            ValueError: Propagated from ``random.sample`` when the computed
                test size is negative or larger than the data set.
        """
        total = len(data)

        # Round away binary floating-point noise before truncating: with the
        # default train_size, (1 - 0.8) * 10 evaluates to 1.9999999999999996
        # and a bare int() would yield 1 test message instead of 2.
        test_len = int(round((1 - train_size) * total, 9))
        test_indices = set(random.sample(range(total), test_len))

        # Split a deep copy by index membership; this keeps the original
        # ordering in both sets without repeated O(n) list.pop() calls.
        snapshot = copy.deepcopy(data)
        train = [msg for i, msg in enumerate(snapshot)
                 if i not in test_indices]
        test = [msg for i, msg in enumerate(snapshot) if i in test_indices]

        # Now label each set individually (performed in place)
        Labeler(train)
        Labeler(test)

        # Rescale data as well
        train = DataRescaler(train).scaled_data
        test = DataRescaler(test).scaled_data

        def to_inputs(messages):
            # One row per message: time followed by the composite values.
            rows = [[s.get('time')] + list(s.get('composite').values())
                    for s in messages]
            return torch.tensor(rows, dtype=torch.double).to(device)

        def to_targets(messages):
            # One single-element row per message holding its target value.
            rows = [[s.get('distinct')] for s in messages]
            return torch.tensor(rows, dtype=torch.long).to(device)

        # Convert to tensors: inputs first, then targets.
        Xtrain = to_inputs(train)
        Xtest = to_inputs(test)
        Ttrain = to_targets(train)
        Ttest = to_targets(test)
        return (Xtrain, Ttrain, Xtest, Ttest)
Ejemplo n.º 4
0
    def test_labeling(self):
        """Check ``get_label`` for labelers constructed with 1 and with 3."""
        # (constructor argument, ((get_label arguments, expected label), ...))
        expectations = (
            (1, (
                ((2.0, 5.0), 'a'),
                ((5.0, 2.0), 'a'),
                ((6.0, 5.0), 'b'),
                ((6.0, 7.0), 'c'),
                ((4.0, 7.0), 'd'),
            )),
            (3, (
                ((1.1, 2.0), 'aad'),
                ((2.0, 2.0), 'aac'),
                ((2.6, 2.6), 'aca'),
            )),
        )

        for ctor_arg, cases in expectations:
            labeler = Labeler(ctor_arg)
            for args, expected in cases:
                self.assertEqual(labeler.get_label(*args), expected)
Ejemplo n.º 5
0
def get_models(opt, h5=None):
    """Pick a sample set, evaluate the model pipelines on it and plot ROC curves.

    Every entry of ``h5['samples']`` that has ``'.srate'`` and ``'.wndsize'``
    members and passes the ``opt.srate`` / ``opt.window`` / ``opt.sample``
    filters is listed with an index. When more than one entry matches, the
    user is prompted for a comma-separated list of indices; otherwise the
    single match (if any) is used directly. The first selected sample set is
    labeled if it has no ``'annot'`` member, passed to ``evaluate`` and
    ``plot_roc``, and, when ``opt.tex`` is set, a LaTeX table of the mean and
    standard deviation of each result's AUC values is appended to that file.

    Args:
        opt: Options object. Attributes read: ``database``, ``srate``,
            ``window``, ``sample`` (regular expression matched against the
            sample-set name), ``computations`` (filled with a default list
            when falsy), ``model``, ``legit``, ``malicious`` (comma-separated
            integer strings or ``None``) and ``tex`` (output path).
        h5: Database node; opened with ``H5Node(opt)`` when falsy.

    Returns:
        ``(m, ((fit, binarize, classes), res))`` as unpacked from
        ``evaluate`` for the first selected sample set, or ``None`` when no
        sample set was selected.
    """
    from models import evaluate, plot_roc, fapply, Mahalanobis, Momentum, FreqThresh, FreqBands
    from sklearn.preprocessing import Scaler
    from sklearn.decomposition import PCA
    from sklearn.mixture import GMM, DPGMM
    from sklearn.manifold import LocallyLinearEmbedding, Isomap
    from labeling import Labeler
    import re

    if not h5:
        h5 = H5Node(opt)
    samples = h5['samples']

    print(
        colorize(boldblue, green) * '#datasets found in database# %s:' %
        opt.database)

    # Index -> (name, node, srate, wndsize) for every sample set that
    # survives the filters; indices are assigned in listing order.
    datasets = {}
    for k, sampl in samples.iteritems():
        if '.srate' not in sampl or '.wndsize' not in sampl:
            continue

        srate = scalar(sampl['.srate'])
        wndsize = scalar(sampl['.wndsize'])

        if opt.srate and srate not in opt.srate:
            continue
        if opt.window and wndsize not in opt.window:
            continue
        if opt.sample and not re.findall(opt.sample, k):
            continue

        idx = len(datasets)
        print(
            colorize(boldyellow, green) * '[%d] %s : (srate=%f, wndsize=%d)' %
            (idx, k, srate, wndsize))
        datasets[idx] = (k, sampl, srate, wndsize)

    if len(datasets) > 1:
        selected = []
        while not selected:
            s = raw_input('datasets to use:')
            try:
                selected = [datasets[int(tok.strip())] for tok in s.split(',')]
            except (ValueError, KeyError):
                # Non-numeric or unknown index: ask again instead of
                # crashing out of the prompt loop.
                selected = []
    else:
        selected = list(datasets.values())

    steps = {
        #'Scaler': fapply( Scaler ),
        'Bands': fapply(FreqBands, 2, 5, 10),
        #'BandsLg': fapply( FreqBands, 2,5,10, log_scale=True ),
        'Threshold': fapply(FreqThresh, 0),
        'Momentum': fapply(Momentum, 'vks'),
        #'GMM' : fapply( GMM, 1, 5, covariance_type='diag', n_iter=40 ),
        'DPGMM': fapply(DPGMM, covariance_type='diag', n_iter=40),
        'Mahal': fapply(Mahalanobis, False),
        'PCA': fapply(PCA, 1, 3),
        'PCA2': fapply(PCA),
        #'PCAw': fapply( PCA, 3, 10 , whiten=True )
    }
    if not opt.computations:
        opt.computations = [
            #('Bands', 'DPGMM'),
            ('Bands', 'Mahal'),
            #('BandsLg', 'DPGMM'),
            #('Threshold','DPGMM'),
            #('Threshold', 'Mahal'),
            ('Threshold', 'Momentum', 'Mahal'),
            #('Threshold','MomentumMVKS',  'DPGMM' ),
            ('Threshold', 'PCA', 'Mahal'),
            #('Threshold', 'PCA', 'DPGMM' ),
            #('Threshold', 'PCAw', 'DPGMM' )
        ]

    def split_to_ints(text):
        # Parse a comma-separated string, keeping only all-digit fields.
        fields = (m.strip() for m in text.split(',') if isString(m))
        return [int(field) for field in fields if field.isdigit()]

    for k, sampl, srate, wndsize in selected:

        print('## processing %s' % k)

        # Sample sets without annotations are labeled first.
        if 'annot' not in sampl:
            labeler = Labeler(opt)
            labeler.prepare()
            labeler(sampl)

        model = split_to_ints(opt.model) if opt.model is not None else None
        legit = split_to_ints(opt.legit) if opt.legit is not None else None
        malicious = split_to_ints(
            opt.malicious) if opt.malicious is not None else None

        m, ((fit, binarize, classes), res) = evaluate(opt,
                                                      None,
                                                      sampl,
                                                      steps=steps,
                                                      model=model,
                                                      legit=legit,
                                                      malicious=malicious)
        plot_roc(res, 'ROC curves')

        if opt.tex:
            # One table row per result; underscores are escaped for LaTeX.
            rows = '\\\\ \\hline\n'.join(
                '%s & %.3f & %.3f' %
                (name.replace('_', '\\_'), np.mean(auc), np.std(auc))
                for name, auc, _ in res)
            with open(opt.tex, 'a') as f:
                f.write('\n')
                f.write(r'''
\begin{table}[h]
    \begin{center}
        \begin{tabular}{c|cc}
            Method & $\overline{\mu_{auc}}$ & $\overline{\sigma_{auc}}$ \\ \hline

%s

        \end{tabular}
    \end{center}
    \caption{Mean and standard deviation of the area under ROC curve.}
\end{table}
''' % rows)
                f.write('\n')

        # NOTE(review): this returns inside the loop, so only the first
        # selected sample set is ever processed - confirm this is intended.
        return m, ((fit, binarize, classes), res)
Ejemplo n.º 6
0
def get_models(opt, h5=None):
    """Evaluate the configured model pipelines on a chosen sample set.

    Sample sets under ``h5['samples']`` are listed when they have both
    ``'.srate'`` and ``'.wndsize'`` members and pass the ``opt.srate``,
    ``opt.window`` and ``opt.sample`` filters. If several are listed the user
    is asked on standard input for a comma-separated list of indices; with
    zero or one match no prompt is shown. The first selected sample set is
    labeled when it lacks an ``'annot'`` member, handed to ``evaluate`` and
    ``plot_roc``, and - when ``opt.tex`` is set - summarized as a LaTeX table
    (mean and standard deviation of each result's AUC values) appended to
    that file.

    Args:
        opt: Options object. Attributes read: ``database``, ``srate``,
            ``window``, ``sample``, ``computations`` (replaced by a default
            list when falsy), ``model``, ``legit``, ``malicious`` and ``tex``.
        h5: Database node; a falsy value makes the function open one with
            ``H5Node(opt)``.

    Returns:
        ``(m, ((fit, binarize, classes), res))`` as unpacked from
        ``evaluate`` for the first selected sample set, or ``None`` when
        nothing was selected.
    """
    from models import evaluate,plot_roc,fapply,Mahalanobis,Momentum,FreqThresh,FreqBands
    from sklearn.preprocessing import Scaler
    from sklearn.decomposition import PCA
    from sklearn.mixture import GMM,DPGMM
    from sklearn.manifold import LocallyLinearEmbedding,Isomap
    from labeling import Labeler
    import re

    if not h5:
        h5 = H5Node(opt)
    samples = h5['samples']

    print(colorize(boldblue,green) * '#datasets found in database# %s:' %opt.database)

    # Maps the displayed index to (name, node, srate, wndsize).
    datasets = {}
    counter = 0
    for k, sampl in samples.iteritems():
        if '.srate' not in sampl or '.wndsize' not in sampl:
            continue

        srate = scalar(sampl['.srate'])
        wndsize = scalar(sampl['.wndsize'])

        if opt.srate and srate not in opt.srate or opt.window and wndsize not in opt.window:
            continue

        if opt.sample and not re.findall(opt.sample, k):
            continue

        print(colorize(boldyellow,green) * '[%d] %s : (srate=%f, wndsize=%d)'%(counter,k,srate,wndsize))

        datasets[counter] = (k, sampl, srate, wndsize)
        counter += 1

    if len(datasets) > 1:
        selected = []
        while not selected:
            s = raw_input('datasets to use:')
            try:
                selected = [datasets[int(part.strip())] for part in s.split(',')]
            except (ValueError, KeyError):
                # A field that is not an integer, or an index that was not
                # listed, used to abort the function; prompt again instead.
                selected = []
    else:
        selected = list(datasets.values())

    steps = {
        #'Scaler': fapply( Scaler ),
        'Bands': fapply( FreqBands, 2,5,10 ),
        #'BandsLg': fapply( FreqBands, 2,5,10, log_scale=True ),
        'Threshold': fapply( FreqThresh, 0 ),
        'Momentum': fapply( Momentum, 'vks'),
        #'GMM' : fapply( GMM, 1, 5, covariance_type='diag', n_iter=40 ),
        'DPGMM' : fapply( DPGMM, covariance_type='diag', n_iter=40 ),
        'Mahal': fapply( Mahalanobis, False ),
        'PCA': fapply( PCA, 1, 3 ),
        'PCA2': fapply( PCA  ),
        #'PCAw': fapply( PCA, 3, 10 , whiten=True )
    }
    if not opt.computations:
        opt.computations = [
            #('Bands', 'DPGMM'),
            ('Bands', 'Mahal'),
            #('BandsLg', 'DPGMM'),
            #('Threshold','DPGMM'),
            #('Threshold', 'Mahal'),
            ('Threshold','Momentum', 'Mahal' ),
            #('Threshold','MomentumMVKS',  'DPGMM' ),
            ('Threshold', 'PCA', 'Mahal' ),
            #('Threshold', 'PCA', 'DPGMM' ),
            #('Threshold', 'PCAw', 'DPGMM' )
        ]

    def parse_int_list(text):
        # Comma-separated string -> list of ints; non-digit fields are dropped.
        stripped = (m.strip() for m in text.split(',') if isString(m))
        return [int(item) for item in stripped if item.isdigit()]

    def optional_ints(value):
        # None stays None; anything else is parsed as an integer list.
        return parse_int_list(value) if value is not None else None

    for k, sampl, srate, wndsize in selected:

        print('## processing %s'%k)

        # Label the sample set first when it carries no annotations yet.
        if 'annot' not in sampl:
            labeler = Labeler(opt)
            labeler.prepare()
            labeler(sampl)

        model = optional_ints(opt.model)
        legit = optional_ints(opt.legit)
        malicious = optional_ints(opt.malicious)

        m, ((fit, binarize, classes), res) = evaluate(
            opt, None, sampl,
            steps=steps, model=model, legit=legit, malicious=malicious)
        plot_roc(res, 'ROC curves')

        if opt.tex:
            # Build the table body up front so a formatting error cannot
            # leave a half-written entry in the output file.
            table_rows = '\\\\ \\hline\n'.join(
                '%s & %.3f & %.3f' % (name.replace('_', '\\_'), np.mean(auc), np.std(auc))
                for name, auc, _ in res)
            with open(opt.tex, 'a') as f:
                f.write('\n')
                f.write(r'''
\begin{table}[h]
    \begin{center}
        \begin{tabular}{c|cc}
            Method & $\overline{\mu_{auc}}$ & $\overline{\sigma_{auc}}$ \\ \hline

%s

        \end{tabular}
    \end{center}
    \caption{Mean and standard deviation of the area under ROC curve.}
\end{table}
''' % table_rows)
                f.write('\n')

        # NOTE(review): returning here ends the loop after the first selected
        # sample set; later selections are never evaluated - confirm intent.
        return m, ((fit, binarize, classes), res)