def __init__(self, args):
    """Configure logging, build the pipeline stages and initialise the server.

    Args:
        args: Forwarded unchanged to the parent class initialiser.
    """
    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(logging.DEBUG)

    # logging.getLogger() hands back the same logger object for a given
    # name, so only attach the console handler once; adding one per
    # instance would print every message multiple times.
    if not self.logger.handlers:
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        self.logger.addHandler(ch)

    self.logger.debug("Starting Collector process in %s" % os.getcwd())
    self.logger.debug("Gevent Version %s" % gevent.__version__)

    # TODO: move output file name to config
    # (writing the raw datagrams to a capture file is disabled in this version)

    # create tool instances
    self.interface = Interface()
    self.parse = Parse()
    self.describe = Describe()
    self.standardize = Standardize()
    self.transform = Transform()
    self.partition = Partition()
    self.q = Queue()
    self.inWindow = False
    self.score = Score()

    # TODO: move csv name to config
    self.csv = CSV("output.csv")

    # __init__ must return None, so call the parent without `return`.
    super(Collector, self).__init__(args)
def main():
    """Fit synthetic data with three solvers and print coefficients and timings.

    Runs, in order, the lasso coordinate ascent, the plain coordinate ascent
    and scikit-learn's LassoLars on data produced by SynthData, printing each
    result together with the elapsed wall-clock time.

    Returns:
        Always 1.
    """
    def _print_elapsed(started_at):
        # Report the time elapsed since `started_at` in the shared format.
        elapsed = time() - started_at
        print('done in %.4fs.' % elapsed)

    generator = SynthData()
    plain_solver = coordinateAscent()
    lasso_solver = CoordinateAscentLasso()
    standardizer = Standardize()  # constructed but not used further below

    beta0Seperate = True
    lam = 0.1

    # column vector of true weights: five ones followed by five zeros
    true_weights = np.array([1] * 5 + [0] * 5)[np.newaxis].T
    X, y, b = generator.generateData(noise=False, w=true_weights)

    print('Fitting the model with Lasso:')
    print('Lambda = ' + str(lam))
    print('beta0, array of betas:')
    started_at = time()
    lasso_result = lasso_solver.coordinateAscentLasso(
        y, X, lam, [], False, beta0Seperate)
    print(lasso_result)
    _print_elapsed(started_at)
    print()

    print("Fitting the model with plain 'ol Coordinate Ascent")
    print('beta0, array of betas:')
    started_at = time()
    plain_result = plain_solver.coordinateAscent(y, X, [], False)
    print(plain_result)
    _print_elapsed(started_at)
    print()

    print('Fitting the model with LARS (from the scikit library)')
    clf = linear_model.LassoLars(alpha=0.01)
    started_at = time()
    fitted = clf.fit(X, y)
    print(fitted)
    _print_elapsed(started_at)
    print('array of betas:')
    print(clf.coef_)

    return 1
def __init__(self, args):
    """Configure logging, open the capture file and build the pipeline stages.

    Args:
        args: Forwarded unchanged to the parent class initialiser.
    """
    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(logging.DEBUG)

    # logging.getLogger() hands back the same logger object for a given
    # name, so only attach the console handler once; adding one per
    # instance would print every message multiple times.
    if not self.logger.handlers:
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        self.logger.addHandler(ch)

    self.logger.debug("Starting Collector process in %s" % os.getcwd())

    # TODO: move output file name to config
    # The file name carries the start time in whole seconds.
    fname = "./NetFlow.%d.bin" % int(time.time())
    # WARN: might want to remove this after testing
    self.out = open(fname, "wb")

    # create tool instances
    self.interface = Interface()
    self.parse = Parse()
    self.context = Context()
    self.describe = Describe()
    self.standardize = Standardize()
    self.transform = Transform()
    self.partition = Partition()
    self.q = Queue()
    # NOTE(review): if SETTINGS.get returns a string, any non-empty value
    # (including "False") is truthy -- confirm the stored type.
    self.inWindow = settings.SETTINGS.get("collector", "inWindow")
    self.score = Score()

    # TODO: move csv name to config
    self.output = Output()

    # __init__ must return None, so call the parent without `return`.
    super(Collector, self).__init__(args)
def main():
    """Cross-validate on the spambase data, plot training cost and a ROC curve.

    For every gradient descent method chosen on the command line this calls
    run_cross_validation with the list of lambdas, plots the training error
    history averaged per lambda, then sweeps a spam threshold over [0, 1] to
    draw a ROC curve and print its area under the curve (AUC).

    NOTE(review): the original indentation of this function was lost; the
    plotting and ROC steps are assumed to run once per gradient descent
    method -- confirm against the intended behaviour.
    """
    np.set_printoptions(suppress=True)

    # loads the spambase data; the context manager closes the file again
    with open("data/spambase.data") as f:
        spam_data = np.array(np.loadtxt(f, delimiter=','))

    k_fold_nr = 10
    nr_samples = len(spam_data)

    # chooses which stochastic gradients to use on the run-through
    # takes defaults or you can provide command line arguments such as:
    # python3 cross_validation.py batch linear
    # python3 cross_validation.py batch linear stochastic linear batch logistic stochastic logistic
    gradient_descent_methods = get_command_line_args()

    k_fold_indices = createKFoldIndices(nr_samples, k_fold_nr)
    st = Standardize()

    # The last column is our complete y_true, from which we always take a
    # subset based on the current k-fold group.
    email_is_spam = spam_data[:, -1]
    # Every column apart from the last one is a feature; the last one is the
    # y_true telling whether an email actually is spam.
    spam_data = spam_data[:, 0:-1]

    # normalising the spam_data
    spam_data_normalized = st.standardize(spam_data)

    # the spam data split into k_fold groups
    k_fold_spam_data = [spam_data_normalized[indices, :]
                        for indices in k_fold_indices]

    lambdas = [1, 0.1, 0.01]
    num_iters = 100
    # line formats are reused cyclically if more lambdas are added
    line_styles = ['g-', 'b-', 'r-']

    for gradient_descent_method in gradient_descent_methods:
        train_err_history, predictions, y_test_sets = run_cross_validation(
            gradient_descent_method, k_fold_nr, lambdas, num_iters,
            k_fold_spam_data, email_is_spam, k_fold_indices)
        train_err_history = np.array(train_err_history)

        # One averaged error curve per lambda (mean over the first axis of
        # that lambda's history); sized from `lambdas` instead of a fixed 3.
        collapsed_train_err_history = [
            list(np.mean(train_err_history[lam_i], axis=0))
            for lam_i in range(len(lambdas))
        ]

        # best lambda based on test errors (should have 3 test errors, where
        # each is the mean of the 10 k-fold runs)
        # take the best lambda from the 3-d predictions array for the ROC
        # curve; y_test as well, that is dependent on the k-group chosen
        print('plotting now')
        plt.figure(1)
        for curve_i, history in enumerate(collapsed_train_err_history):
            plt.plot(list(range(len(history))), history,
                     line_styles[curve_i % len(line_styles)])
        # float() keeps the historical label text, e.g. 'lambda=1.0'
        plt.legend(['lambda=' + str(float(lam)) for lam in lambdas])
        plt.xlabel('iteration')
        plt.ylabel('cost')
        plt.show()

        step_size = 0.01
        # np.linspace requires an integer number of samples, but
        # (1 / step_size) + 1 is a float; round before converting.
        nr_steps = int(round(1 / step_size)) + 1

        # The label positions do not depend on the threshold, so they are
        # computed once instead of once per threshold.
        spam_indices = [i for i, label in enumerate(y_test_sets) if label == 1]
        no_spam_indices = [i for i, label in enumerate(y_test_sets) if label == 0]
        nr_spam = len(spam_indices)
        nr_no_spam = len(no_spam_indices)
        spam_scores = predictions[spam_indices]
        no_spam_scores = predictions[no_spam_indices]

        # ROC construction: for each threshold, the true positive rate is the
        # share of actual spam (1s in y_test) predicted above the threshold,
        # and the false positive rate is the share of actual non-spam (0s in
        # y_test) predicted above it. Each threshold yields one point with
        # x = false positive rate and y = true positive rate; connected up
        # they form a single ROC curve for all the k-folds.
        false_positive_rates = []
        true_positive_rates = []
        for spam_threshold in np.linspace(0, 1, nr_steps):
            nr_spam_predictions = sum(
                1 for score in spam_scores if score > spam_threshold)
            nr_no_spam_predictions = sum(
                1 for score in no_spam_scores if score > spam_threshold)
            # x-coordinate = false_positive_rate
            false_positive_rates.append(nr_no_spam_predictions / nr_no_spam)
            # y-coordinate = true_positive_rate
            true_positive_rates.append(nr_spam_predictions / nr_spam)

        # Sort the x and y coordinates so the AUC can be calculated properly.
        # Both rates can only fall as the threshold rises, so sorting each
        # list ascending keeps the (x, y) points paired.
        false_positive_rates.sort()
        true_positive_rates.sort()

        # trapezoidal rule: sum of width * (left height + right height) / 2
        auc_sum = 0
        for i in range(1, len(false_positive_rates)):
            x_subtracted = false_positive_rates[i] - false_positive_rates[i - 1]
            y_added = true_positive_rates[i] + true_positive_rates[i - 1]
            auc_sum += x_subtracted * y_added
        auc = 0.5 * auc_sum
        print('The AUC of my classifier is ' + str(auc))

        plt.figure(2)
        plt.plot(false_positive_rates, true_positive_rates)
        plt.legend(['ROC-Curve with AUC ' + str(auc)])
        plt.show()
class Collector(DatagramServer):
    """Datagram server that captures raw packets and runs them through the pipeline.

    Every datagram is appended to a capture file, converted to records by the
    interface stage, and each record is parsed, given context and described.
    Records are queued until the configured describe window is reached; from
    then on they are standardized, transformed, partitioned and output.
    """

    # number of handle() calls so far; stored on the class, not the instance
    x = 0

    def __init__(self, args):
        """Configure logging, open the capture file and build the pipeline stages.

        Args:
            args: Forwarded unchanged to the DatagramServer initialiser.
        """
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)

        # logging.getLogger() hands back the same logger object for a given
        # name, so only attach the console handler once; adding one per
        # instance would print every message multiple times.
        if not self.logger.handlers:
            ch = logging.StreamHandler()
            ch.setLevel(logging.DEBUG)
            ch.setFormatter(logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
            self.logger.addHandler(ch)

        self.logger.debug("Starting Collector process in %s" % os.getcwd())

        # TODO: move output file name to config
        # The file name carries the start time in whole seconds.
        fname = "./NetFlow.%d.bin" % int(time.time())
        # WARN: might want to remove this after testing
        self.out = open(fname, "wb")

        # create tool instances
        self.interface = Interface()
        self.parse = Parse()
        self.context = Context()
        self.describe = Describe()
        self.standardize = Standardize()
        self.transform = Transform()
        self.partition = Partition()
        self.q = Queue()
        # NOTE(review): describeWindow from the same settings object is
        # wrapped in int() in handle(), which suggests get() returns strings;
        # if so, any non-empty value here (including "False") is truthy --
        # confirm the stored type.
        self.inWindow = settings.SETTINGS.get("collector", "inWindow")
        self.score = Score()

        # TODO: move csv name to config
        self.output = Output()

        # __init__ must return None, so call the parent without `return`.
        super(Collector, self).__init__(args)

    def done(self):
        """Close the raw capture file opened in __init__."""
        self.out.close()

    def _run_post_window(self, record):
        """Standardize, transform, partition and output a single record."""
        self.standardize.run(record)
        self.transform.run(record)
        self.partition.run(record)
        self.output.run(record)

    def handle(self, rawData, address):
        """Capture one datagram and push its records through the pipeline.

        Args:
            rawData: The received datagram payload; written to the capture
                file and handed to the interface stage.
            address: Sender address supplied by the server; not used here.

        Errors are logged rather than raised. A failure while processing
        abandons the remaining records of this datagram.
        """
        Collector.x += 1
        self.out.write(rawData)
        interfacedData = self.interface.run(rawData)

        # Once the rawData is "interfaced" it is passed around by reference.
        # Interfaced data must be iterable; check that separately so other
        # pipeline failures are not misreported as an iterability problem.
        try:
            records = iter(interfacedData)
        except TypeError as e:
            self.logger.error("Interfaced data is not iterable %s" % (str(e)))
            return

        try:
            for record in records:
                self.parse.run(record)
                self.context.run(record)
                self.describe.run(record)

                if self.inWindow:
                    self._run_post_window(record)
                    continue

                # push the record onto the queue until the window is reached
                self.q.put(record)
                if (self.q.qsize() == int(
                        settings.SETTINGS.get("collector", "describeWindow"))):
                    self.inWindow = True
                    # drain everything queued so far through the later stages
                    while not self.q.empty():
                        item = self.q.get()
                        self._run_post_window(item)
                        self.q.task_done()
        except Exception:
            # logger.exception records the traceback of the real failure
            self.logger.exception("Error while processing interfaced data")
class Collector(DatagramServer):
    """Datagram server that runs received packets through the pipeline into a CSV.

    Each datagram is converted to records by the interface stage and every
    record is parsed and described. Records are queued until the configured
    describe window is reached; from then on they are standardized,
    transformed, partitioned and written as CSV rows.
    """

    # number of handle() calls so far; stored on the class, not the instance
    x = 0

    def __init__(self, args):
        """Configure logging, build the pipeline stages and open the CSV writer.

        Args:
            args: Forwarded unchanged to the DatagramServer initialiser.
        """
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)

        # logging.getLogger() hands back the same logger object for a given
        # name, so only attach the console handler once; adding one per
        # instance would print every message multiple times.
        if not self.logger.handlers:
            ch = logging.StreamHandler()
            ch.setLevel(logging.DEBUG)
            ch.setFormatter(logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
            self.logger.addHandler(ch)

        self.logger.debug("Starting Collector process in %s" % os.getcwd())
        self.logger.debug("Gevent Version %s" % gevent.__version__)

        # TODO: move output file name to config
        # (writing the raw datagrams to a capture file is disabled in this version)

        # create tool instances
        self.interface = Interface()
        self.parse = Parse()
        self.describe = Describe()
        self.standardize = Standardize()
        self.transform = Transform()
        self.partition = Partition()
        self.q = Queue()
        self.inWindow = False
        self.score = Score()

        # TODO: move csv name to config
        self.csv = CSV("output.csv")

        # __init__ must return None, so call the parent without `return`.
        super(Collector, self).__init__(args)

    def done(self):
        """Drop the CSV writer so that it can close its output."""
        # really important to call del on the csv obj to ensure it closes correctly
        del self.csv

    def _write_row(self, record):
        """Standardize, transform and partition a record, then write it as a CSV row."""
        self.standardize.run(record)
        self.transform.run(record)
        self.partition.run(record)
        self.csv.writeRow(self.csv.format(record))

    def handle(self, rawData, address):
        """Push the records of one datagram through the pipeline.

        Args:
            rawData: The received datagram payload, handed to the interface
                stage.
            address: Sender address supplied by the server; not used here.

        Errors are logged rather than raised. A failure while processing
        abandons the remaining records of this datagram.
        """
        Collector.x += 1
        interfacedData = self.interface.run(rawData)

        # Once the rawData is "interfaced" it is passed around by reference.
        # Interfaced data must be iterable; check that separately so other
        # pipeline failures are not misreported as an iterability problem.
        try:
            records = iter(interfacedData)
        except TypeError as e:
            self.logger.error("Interfaced data is not iterable %s" % (str(e)))
            return

        try:
            for record in records:
                self.parse.run(record)
                self.describe.run(record)

                if self.inWindow:
                    self._write_row(record)
                    # NOTE(review): scoring is applied only to records that
                    # arrive after the window opened, not to the drained
                    # queue -- original nesting assumed, confirm.
                    self.score.run(record)
                    continue

                # push the record onto the queue until the window is reached
                self.q.put(record)
                describe_window = settings.SETTINGS.get(
                    "collector", "describeWindow")
                if self.q.qsize() == int(describe_window):
                    self.logger.debug(
                        "Describe Window of %s records met, Begin Processing queue"
                        % describe_window)
                    self.inWindow = True
                    # drain everything queued so far through the later stages
                    while not self.q.empty():
                        item = self.q.get()
                        self._write_row(item)
                        self.q.task_done()
        except Exception:
            # logger.exception records the traceback of the real failure
            self.logger.exception("Error while processing interfaced data")