Example #1
class Collector(DatagramServer):

    def __init__(self, args):

        # create logger
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)

        # create console handler and set level to debug
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)

        # create formatter
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

        # add formatter to ch
        ch.setFormatter(formatter)

        # add ch to logger
        self.logger.addHandler(ch)
        self.logger.debug("Starting Collector process in %s" % os.getcwd())
        self.logger.debug("Gevent Version %s" % gevent.__version__)

        #TODO: move output file name to config
        #fname = "./NetFlow.%s.bin"%str(time.time()*100000)

        #WARN: might want to remove this after testing
        #self.out = open(fname,"wb")

        #create tool instances
        self.interface = Interface()
        self.parse = Parse()
        self.describe = Describe()
        self.standardize = Standardize()
        self.transform = Transform()
        self.partition = Partition()

        self.q = Queue()
        self.inWindow = False

        self.score = Score()
        #TODO: move csv name to config
        self.csv = CSV("output.csv")

        return super(Collector, self).__init__(args)
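A side note on the logging setup above: every construction of the class attaches a fresh StreamHandler to the same named logger, so a second instance in the same process would make each message print twice. A minimal standalone sketch of the same configuration with a guard against duplicate handlers (build_console_logger is a hypothetical helper name, not part of the Collector code):

import logging

def build_console_logger(name=__name__, level=logging.DEBUG):
    """Return a console logger, attaching the handler only once per process."""
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # only attach a handler if this logger does not already have one;
    # repeated construction would otherwise duplicate every log line
    if not logger.handlers:
        ch = logging.StreamHandler()
        ch.setLevel(level)
        ch.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        logger.addHandler(ch)
    return logger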
Example #2

def main():
    sd = SynthData()
    ca = coordinateAscent()
    cal = CoordinateAscentLasso()
    st = Standardize()
    beta0Seperate = True
    lam = 0.1

    X, y, b = sd.generateData(noise=False,
                              w=np.array([1, 1, 1, 1, 1, 0, 0, 0, 0,
                                          0])[np.newaxis].T)
    #if beta0Seperate:
    #    beta = np.array([1, 1, 1, 1, 0, 0, 0, 0])[np.newaxis].T
    #else:
    #    beta = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0])[np.newaxis].T

    #if beta0Seperate:
    #    y = 1 + np.dot(X, beta)
    #else:
    #    X = np.append(np.ones((X.shape[0], 1)), X, 1)
    #    y = np.dot(X, beta)

    print('Fitting the model with Lasso:')
    print('Lambda = ' + str(lam))
    print('beta0, array of betas:')
    t0 = time()
    print(cal.coordinateAscentLasso(y, X, lam, [], False, beta0Seperate))
    dt = time() - t0
    print('done in %.4fs.' % dt)

    print()
    print('Fitting the model with plain ol\' Coordinate Ascent')
    print('beta0, array of betas:')
    t0 = time()
    print(ca.coordinateAscent(y, X, [], False))
    dt = time() - t0
    print('done in %.4fs.' % dt)
    print()

    #print('Dictionary Learning')
    #dl = decomposition.DictionaryLearning(fit_algorithm='cd')
    #print(dl.fit(X))
    #print(np.shape(dl.components_))
    #print(dl.components_)

    print('Fitting the model with LARS (from the scikit library)')
    clf = linear_model.LassoLars(alpha=0.01)
    t0 = time()
    print(clf.fit(X, y))
    dt = time() - t0
    print('done in %.4fs.' % dt)
    print('array of betas:')
    print(clf.coef_)
    return 1
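The call to cal.coordinateAscentLasso(y, X, lam, ...) does not show the solver itself. For reference, the usual coordinate-wise Lasso update cycles through the coefficients and soft-thresholds each one against its partial residual. A rough NumPy-only sketch of that update, not the project's CoordinateAscentLasso class, assuming y is a flat 1-D array and the intercept is handled separately:

import numpy as np

def soft_threshold(rho, lam):
    """Soft-thresholding operator used in the Lasso coordinate update."""
    if rho < -lam:
        return rho + lam
    if rho > lam:
        return rho - lam
    return 0.0

def lasso_coordinate_descent(X, y, lam, n_iters=100):
    """Cyclic coordinate descent for min_b 0.5*||y - Xb||^2 + lam*||b||_1."""
    n_samples, n_features = X.shape
    beta = np.zeros(n_features)
    for _ in range(n_iters):
        for j in range(n_features):
            # partial residual with feature j's current contribution removed
            r_j = y - X @ beta + X[:, j] * beta[j]
            rho = X[:, j] @ r_j
            beta[j] = soft_threshold(rho, lam) / (X[:, j] @ X[:, j])
    return beta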
Example #3
class Collector(DatagramServer):

    def __init__(self, args):

        # create logger
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)

        # create console handler and set level to debug
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)

        # create formatter
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

        # add formatter to ch
        ch.setFormatter(formatter)

        # add ch to logger
        self.logger.addHandler(ch)
        self.logger.debug("Starting Collector process in %s" % os.getcwd())
        #self.logger.debug("Gevent Version %s" % gevent.__version__)

        #TODO: move output file name to config
        fname = "./NetFlow.%d.bin" % int(time.time())

        #WARN: might want to remove this after testing
        self.out = open(fname, "wb")

        #create tool instances
        self.interface = Interface()
        self.parse = Parse()
        self.context = Context()
        self.describe = Describe()
        self.standardize = Standardize()
        self.transform = Transform()
        self.partition = Partition()

        self.q = Queue()

        self.inWindow = settings.SETTINGS.get("collector", "inWindow")

        self.score = Score()
        #TODO: move csv name to config
        #self.csv = CSV("output.csv")

        self.output = Output()

        return super(Collector, self).__init__(args)
Example #4

def main():
    np.set_printoptions(suppress=True)
    # loads the spambase data
    f = open("data/spambase.data")
    spam_data = np.array(np.loadtxt(f, delimiter=','))
    k_fold_nr = 10
    nr_samples = len(spam_data)

    # chooses which stochastic gradients to use on the run-through
    # takes defaults or you can provide command line arguments such as:
    # python3 cross_validation.py batch linear
    # python3 cross_validation.py batch linear stochastic linear batch logistic stochastic logistic
    gradient_descent_methods = get_command_line_args()

    k_fold_indices = createKFoldIndices(nr_samples, k_fold_nr)

    st = Standardize()

    # The last column, our complete y_true, which we will always take a subset from
    # based on our current k-fold group
    email_is_spam = spam_data[:, -1]

    # take all the features apart from the last one, since that is our y_true
    # containing the information on whether an email actually is spam
    spam_data = spam_data[:, 0:-1]

    # normalising the spam_data
    spam_data_normalized = st.standardize(spam_data)

    # the spam data split into k_fold groups
    k_fold_spam_data = []
    for indices in k_fold_indices:
        k_fold_spam_data.append(spam_data_normalized[indices, :])

    lambdas = [1, 0.1, 0.01]
    num_iters = 100

    for gradient_descent_method in gradient_descent_methods:
        train_err_history, predictions, y_test_sets = run_cross_validation(
            gradient_descent_method, k_fold_nr, lambdas, num_iters, k_fold_spam_data, email_is_spam, k_fold_indices)

        train_err_history = np.array(train_err_history)

        # print(train_err_history)
        # print(train_err_history.shape)

        collapsed_train_err_history = [[], [], []]

        for lam_i in range(len(lambdas)):
            lam_err_history = train_err_history[lam_i]
            # print(lam_err_history)
            # print(np.mean(lam_err_history.tolist(), axis=0))
            collapsed_train_err_history[lam_i].extend(np.mean(lam_err_history, axis=0))

        # print(collapsed_train_err_history)
        #
        # print(len(collapsed_train_err_history))
        # print(len(collapsed_train_err_history[0]))
        # print(len(collapsed_train_err_history[1]))
        # print(len(collapsed_train_err_history[2]))
        iterations0 = [i for i in range(len(collapsed_train_err_history[0]))]
        iterations1 = [i for i in range(len(collapsed_train_err_history[1]))]
        iterations2 = [i for i in range(len(collapsed_train_err_history[2]))]
        # print(collapsed_train_err_history)

        # best lambda based on test errors (there should be 3 test errors, each being the
        # mean over the 10 k-fold runs)
        # take the best lambda from the 3-d predictions array for the ROC curve,
        # and the matching y_test, which depends on the chosen k-group
        print('plotting now')
        plt.figure(1)
        plt.plot(iterations0, collapsed_train_err_history[0], 'g-', iterations1,
                 collapsed_train_err_history[1], 'b-', iterations2, collapsed_train_err_history[2], 'r-')
        plt.legend(['lambda=1.0', 'lambda=0.1', 'lambda=0.01'])

        # plt.plot(iterations, collapsed_train_err_history[1], 'b-', iterations, collapsed_train_err_history[2], 'r-')
        # plt.legend(['lambda=0.1', 'lambda=0.01'])

        plt.xlabel('iteration')
        plt.ylabel('cost')
        plt.show()

        step_size = 0.01
        nr_steps = int(1 / step_size) + 1  # np.linspace expects an integer sample count

        false_positive_rates = []
        true_positive_rates = []

        for spam_threshold in np.linspace(0, 1, nr_steps):
            spam_indices = [i for i, x in enumerate(y_test_sets) if x == 1]
            nr_spam = len(spam_indices)
            no_spam_indices = [i for i, x in enumerate(y_test_sets) if x == 0]
            nr_no_spam = len(no_spam_indices)

            nr_spam_predictions = len([element for element in predictions[spam_indices] if element > spam_threshold])

            nr_no_spam_predictions = len([element for element in predictions[no_spam_indices]
                                          if element > spam_threshold])

            # print('Nr of spam total', nr_spam)
            # print('Nr of spam classified', nr_spam_predictions)

            # x-coordinate = false_positive_rate
            false_positive_rates.append(nr_no_spam_predictions / nr_no_spam)

            # y-coordinate = true_positive_rate
            true_positive_rates.append(nr_spam_predictions / nr_spam)
            # true positive rate  = (actual spam scored above the threshold) / (number of 1s in y_test)
            # false positive rate = (actual non-spam scored above the threshold) / (number of 0s in y_test)
            # each threshold gives one point (x = false positive rate, y = true positive rate);
            # connecting the points over all thresholds gives the ROC curve,
            # one curve covering all 10 k-fold test sets

        # sort the x and y coordinates so the AUC can be calculated properly
        false_positive_rates.sort()
        true_positive_rates.sort()

        auc_sum = 0

        for i in range(1, len(false_positive_rates)):
            x_subtracted = false_positive_rates[i] - false_positive_rates[i - 1]
            y_added = true_positive_rates[i] + true_positive_rates[i - 1]
            auc_sum += x_subtracted * y_added

        auc = (1/2) * auc_sum

        print('The AUC of my classifier is ' + str(auc))

        plt.figure(2)
        plt.plot(false_positive_rates, true_positive_rates)
        plt.legend(['ROC-Curve with AUC ' + str(auc)])
        plt.show()
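The loop above assembles the ROC curve and the trapezoidal AUC by hand. Since scikit-learn is already used elsewhere on this page (linear_model in the Lasso example), a quick cross-check of those numbers is possible; a small sketch, assuming y_test_sets holds the 0/1 labels and predictions holds the corresponding scores as flat arrays:

from sklearn.metrics import roc_curve, auc

# fpr/tpr at every distinct score threshold, then the trapezoidal area under the curve
fpr, tpr, thresholds = roc_curve(y_test_sets, predictions)
print('sklearn AUC for comparison:', auc(fpr, tpr))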
Example #5
class Collector(DatagramServer):
    x = 0
    def __init__(self, args):

        # create logger
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)

        # create console handler and set level to debug
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)

        # create formatter
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

        # add formatter to ch
        ch.setFormatter(formatter)

        # add ch to logger
        self.logger.addHandler(ch)
        self.logger.debug("Starting Collector process in %s" % os.getcwd())
        #self.logger.debug("Gevent Version %s" % gevent.__version__)

        #TODO: move output file name to config
        fname = "./NetFlow.%d.bin" % int(time.time())

        #WARN: might want to remove this after testing
        self.out = open(fname, "wb")

        #create tool instances
        self.interface = Interface()
        self.parse = Parse()
        self.context = Context()
        self.describe = Describe()
        self.standardize = Standardize()
        self.transform = Transform()
        self.partition = Partition()

        self.q = Queue()

        self.inWindow = settings.SETTINGS.get("collector", "inWindow")

        self.score = Score()
        #TODO: move csv name to config
        #self.csv = CSV("output.csv")

        self.output = Output()

        return super(Collector, self).__init__(args)
    
    def done(self):
        #pass
        self.out.close()
        #really important to call del on the csv obj to ensure it closes correctly
        #del self.csv

    def handle(self, rawData, address):
        Collector.x += 1
        #print '%s %s: got %r' % (Collector.x, address[0], rawData)
        self.out.write(rawData)

        interfacedData = self.interface.run(rawData)
        #self.logger.debug("Interface: %s" % (repr(interfacedData)))
        #once the rawData is "interfaced" we are passing it around by reference
        # interfaced data must be iterable
        try:
            for record in interfacedData:
                self.parse.run(record)
                #self.logger.debug("Parse: %s" % (repr(record)))
                self.context.run(record)
                #self.logger.debug("Context: %s" % (repr(record)))
                self.describe.run(record)
                #self.logger.debug("Describe: %s" % (repr(record)))
                #push the record onto the queue until window
                if not self.inWindow:
                    self.q.put(record)
                    #self.logger.debug("adding record to queue %s" % (repr(record)))
                    if self.q.qsize() == int(settings.SETTINGS.get("collector", "describeWindow")):
                        #self.logger.debug("Describe Window of %s records met, Begin Processing queue" % settings.SETTINGS.get("collector", "describeWindow"))
                        self.inWindow = True

                        while not self.q.empty():
                            item = self.q.get()
                            #self.logger.debug("processing record from queue %s" % (repr(item)))
                            self.standardize.run(item)
                            self.transform.run(item)
                            self.partition.run(item)
                            #self.csv.writeRow(self.csv.format(item))
                            self.output.run(item)
                            self.q.task_done()
                else:
                    self.standardize.run(record)
                    #self.logger.debug("Standardize: %s" % (repr(record)))
                    self.transform.run(record)
                    #self.logger.debug("Transform: %s" % (repr(record)))
                    self.partition.run(record)
                    #self.logger.debug("Partition: %s" % (repr(record)))
                    #self.csv.writeRow(self.csv.format(record))
                    self.output.run(record)

                    #self.score.run(record)

        except Exception as e:
            self.logger.error("Interfaced data is not iterable %s" % str(e))
Example #6
class Collector(DatagramServer):
    x = 0

    def __init__(self, args):

        # create logger
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)

        # create console handler and set level to debug
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)

        # create formatter
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

        # add formatter to ch
        ch.setFormatter(formatter)

        # add ch to logger
        self.logger.addHandler(ch)
        self.logger.debug("Starting Collector process in %s" % os.getcwd())
        self.logger.debug("Gevent Version %s" % gevent.__version__)

        #TODO: move output file name to config
        #fname = "./NetFlow.%s.bin"%str(time.time()*100000)

        #WARN: might want to remove this after testing
        #self.out = open(fname,"wb")

        #create tool instances
        self.interface = Interface()
        self.parse = Parse()
        self.describe = Describe()
        self.standardize = Standardize()
        self.transform = Transform()
        self.partition = Partition()

        self.q = Queue()
        self.inWindow = False

        self.score = Score()
        #TODO: move csv name to config
        self.csv = CSV("output.csv")

        return super(Collector, self).__init__(args)

    def done(self):
        #self.out.close()
        #really important to call del on the csv obj to ensure it closes correctly
        del self.csv

    def handle(self, rawData, address):
        Collector.x += 1
        #print '%s %s: got %r' % (Collector.x, address[0], data)
        #self.out.write(rawData)

        interfacedData = self.interface.run(rawData)
        #once the rawData is "interfaced" we are passing it around by reference
        # interfaced data must be iterable
        try:
            for record in interfacedData:
                self.parse.run(record)
                self.describe.run(record)
                #push the record onto the queue until window
                if not (self.inWindow):
                    self.q.put(record)
                    #self.logger.debug("adding record to queue %s"%(repr(record)))
                    if (self.q.qsize() == int(
                            settings.SETTINGS.get("collector",
                                                  "describeWindow"))):
                        self.logger.debug(
                            "Describe Window of %s records met, Begin Processing queue"
                            % settings.SETTINGS.get("collector",
                                                    "describeWindow"))
                        self.inWindow = True

                        while not self.q.empty():
                            item = self.q.get()
                            #self.logger.debug("processing record from queue %s"%(repr(item)))
                            self.standardize.run(item)
                            self.transform.run(item)
                            self.partition.run(item)
                            self.csv.writeRow(self.csv.format(item))
                            self.q.task_done()
                else:
                    self.standardize.run(record)
                    self.transform.run(record)
                    self.partition.run(record)
                    self.csv.writeRow(self.csv.format(record))

                    self.score.run(record)

        except Exception as e:
            self.logger.error("Interfaced data is not iterable %s" % (str(e)))