def onevsall2(anomalies, observed):
    """
    Score one observation against every anomaly class and report each score.

    This is not a true one-vs-all implementation; it runs a one-class
    classifier against each class in turn.  It is kept because it is much
    faster than one-vs-all (the Java code used the same approach).

    anomalies maps a class key to the histogram object for that class.
    observed is indexed as observed[0][i], where i is the position of a
    feature in the first class's get_features() result.

    Returns a string that starts with a single space and then holds one
    "<key> : <scores>" line per class, in sorted key order.
    """
    # NOTE: somehow this is generating the same scores for training data.
    ordered_keys = sorted(anomalies)

    # The observed histogram is built once, using the feature list of the
    # first class in sorted order, and reused for every comparison.
    observed_h = None
    if ordered_keys:
        observed_h = Histograms(0, 0)
        feature_names = anomalies[ordered_keys[0]].get_features()
        for position, feature in enumerate(feature_names):
            observed_h.insert_one(
                feature,
                0,
                value=observed[0][position],
                use_internal_time=True,
            )

    report_lines = []
    for key in ordered_keys:
        scores = SVMCalc.test(anomalies[key], observed_h)
        report_lines.append(str(key) + " : " + np.array_str(scores) + "\n")

    return " " + "".join(report_lines)
def getFakeAnomalies():
    """
    Build a dictionary of four synthetic anomaly histograms.

    Keys are the tuples (0,0), (1,1), (2,2) and (3,3).  Each value is a
    Histograms(2, 1) holding 500 rows; every row carries one value for each
    of the hosts 10.0.0.1, 10.0.0.2 and 10.0.0.3, computed as a fixed base
    plus random.randint(0, spread).
    """
    hosts = ("10.0.0.1", "10.0.0.2", "10.0.0.3")

    # (dictionary key, per-host base values in host order, random spread)
    profiles = (
        ((0, 0), (210, 810, 210), 200),
        ((1, 1), (0, 900, 0), 200),
        ((2, 2), (0, 0, 0), 100),
        ((3, 3), (1000, 11200, 1000), 200),
    )

    ret = dict()
    for key, bases, spread in profiles:
        hist = Histograms(2, 1)
        ret[key] = hist
        for t in range(500):
            for host, base in zip(hosts, bases):
                hist.insert_one(
                    host,
                    t,
                    value=base + random.randint(0, spread),
                    use_internal_time=True,
                )
            hist.next_row()
    return ret
def get_histogram(self, sample_window_sec, slide_window_sec, filter_name, filter_value, features_keep):
    """
    Build a Histograms object from the rows of self._table.

    filter_name and filter_value define the one thing we are making
    histograms for.  For example, filter_name="source_addr" and
    filter_value="10.0.0.1" builds histograms from every row whose
    source_addr equals 10.0.0.1; all other rows are skipped.

    features_keep names the fields to keep.  It may be a comma-separated
    string ("dest_addr, dest_port") or an iterable of field names.  Each
    name is looked up first as a column of the row and then as a key of the
    row's "text_values" map, so the resulting keyspace is flat.  We assume
    the same name does not appear in both places.

    Each matching row is inserted under the comma-joined string of its
    kept feature values, at the row's time_stamp converted to seconds with
    time.mktime.

    Side effect: the query result is stored on self._result.

    Raises Exception if a requested field is found neither as a column nor
    inside text_values.
    """
    ret = Histograms(sample_window_sec, slide_window_sec)
    self._result = self._session.execute("SELECT * FROM " + self._table)

    # Normalise the feature list once instead of re-splitting it per row.
    # hasattr(..., "split") keeps this working for both str and unicode.
    if hasattr(features_keep, "split"):
        features_keep = features_keep.split(",")
    wanted = [name.strip() for name in features_keep]

    for res in self._result:
        res_dict = res._asdict()
        if res_dict[filter_name] != filter_value:
            continue

        # A null map column would make the "in" test below blow up with a
        # TypeError, so treat a missing/None map as empty.
        text_values = res_dict.get("text_values") or {}

        features = []
        for name in wanted:
            if name in res_dict:
                features.append(res_dict[name])
            elif name in text_values:
                features.append(text_values[name])
            else:
                raise Exception("Could not find field " + name)

        sec = time.mktime(res.time_stamp.timetuple())
        ret.insert_one(','.join(map(str, features)), sec)
    return ret
def main():
    """
    Run the full update pipeline and report the total runtime.

    Command-line flags:
        --ucr   also update the ER status in the UCR records first; the
                flag is additionally forwarded to summarize().
        --hist  plot new histograms from the computed totals.

    Steps, in order: optional ER update, adversity scores, database merge,
    imputation of missing data, summaries, totals written to XLSX, and
    optional histograms.
    """
    start = datetime.now()

    # The text must be passed as description=; the first positional
    # parameter of ArgumentParser is prog, which would put this sentence in
    # the usage line in place of the program name.
    parser = ArgumentParser(description="Updates all relevant files.")
    parser.add_argument("--ucr", action="store_true", default=False,
                        help="Add ER column to updated UCR file.")
    parser.add_argument("--hist", action="store_true", default=False,
                        help="Plot new histograms.")
    args = parser.parse_args()

    if args.ucr:
        print("\n\tUpdating ER status in UCR records...")
        er = ERupdate()
        er.getERstatus()

    print("\n\tCalculating adversity scores in UPDB records...")
    adversity = Adversity()
    adversity.getAdversityScores()

    print("\n\tMerging UPDB and UCR records...")
    merger = DatabaseMerger()
    merger.merge()

    # Get summaries
    print("\n\tImputing missing data...")
    imputer = Impute()
    imputer.imputeRecords()
    summarize(args.ucr)

    print("\n\tCalculating totals from merged records...")
    # Counter here is whatever this file has in scope under that name; it
    # must provide writeXLSX() and printComplete().
    totals = Counter()
    totals.writeXLSX()
    totals.printComplete()

    if args.hist:
        Histograms(totals)

    print(("\tTotal runtime: {}\n").format(datetime.now() - start))
def getfakedata():
    """
    Register the fake training and test histograms and describe them.

    Both histograms are stored in the module-level hist_dict under
    consecutive ids taken from the module-level next_id counter (training
    first, then test), and each one is printed via print_histograms().

    Returns a text/plain Response listing, for each dataset, a
    "Dataset ID: <id>" line followed by one line per feature name.
    """
    global next_id
    global hist_dict

    # Fetch both datasets before touching either, training first.
    datasets = (
        Histograms.get_fake_histogram_train(),
        Histograms.get_fake_histogram_test(),
    )

    lines = []
    for hist in datasets:
        hist_dict[next_id] = hist
        lines.append("Dataset ID: " + str(next_id) + "\n")
        for feature in hist.get_features():
            lines.append(feature + "\n")
        next_id += 1
        hist.print_histograms()

    return Response("".join(lines), mimetype='text/plain')
print "[setup_limits] WARNING : Didn't find tree {} in input file, but did find {}. Changing the tree name, but try to fix this.".format(tree_name, backup_tree_name) tree_name = backup_tree_name else: print "[setup_limits] ERROR : Didn't find tree {} in input file, nor {}. Quitting!".format(tree_name, backup_tree_name) sys.exit(1) # Check that the "NEvents" histogram is present h_NEvents = f.Get("NEvents") if not h_NEvents: if "data" in sample: print "[setup_limits] ERROR : NEvents histogram in not in this file! It is probably corrupt. This is data, so this problem is fatal." sys.exit(1) else: print "[setup_limits] WARNING : NEvents histogram in not in this file! It is probably corrupt. This is MC, so I am skipping the file. But, you probably want to remove from the input list." sample_files[sample].remove(filename) limit_histogrammer = Histograms(sample, tree_name=tree_name, jet_type=args.jet_type) if args.label: extra_tag = "_" + args.label else: extra_tag = "" output_file_basename ="histograms_{}_{}_{}{}.root".format(sample, args.jet_type, args.year, extra_tag) if args.output_folder: limit_histogrammer.set_output_path("{}/{}".format(args.output_folder, output_file_basename)) else: limit_histogrammer.set_output_path("/uscms/home/dryu/DAZSLE/data/histograms/tmp/{}".format(output_file_basename)) for filename in files_to_run: print "Input file {}".format(filename) limit_histogrammer.add_file(filename) #limit_histogrammer.set_jet_type(args.jet_type) if "JetHT" in sample or "SingleMu" in sample: limit_histogrammer.set_data_source("data")
def task_handler(doc_id, user_id, task_id, data, g, cmd):
    """
    Run the task named by task_id and present its result.

    When cmd is truthy, list results and error messages are printed; when
    it is falsy, results are shown through the GUI object g (histograms via
    TaskManager.plot_figure_gui, lists via TaskManager.load_list).

    If g is not None, its existing canvas, toolbar and listbox widgets are
    destroyed first, whatever the task.

    Recognised task ids: "2a", "2b" (histograms for one document), "3a",
    "3b" (browser histograms), "4" (top 10), "5a", "5b" (users by document /
    documents by user) and "5c", "5d", "5e" (the task5 query with no sort,
    sort_by_readership and sort_by_number respectively).  Any other id
    prints "Invalid Task" in cmd mode and does nothing otherwise.
    """

    def render_histogram(counts, title):
        # The Histograms object is always built; it is only embedded in the
        # GUI when not running from the command line.
        figure = Histograms(counts, title, cmd)
        if not cmd:
            TaskManager.plot_figure_gui(g, figure)

    def render_list(items):
        if cmd:
            print(items)
        else:
            TaskManager.load_list(g, items)

    def document_missing():
        # Membership is only checked in cmd mode; a None id is always
        # rejected.
        return (cmd and doc_id not in TaskManager.get_all_documents(data)) \
            or doc_id is None

    def rows_for_document():
        return TaskManager.filter_data(data, "subject_doc_id", doc_id)

    # Clear whatever the previous task left on screen.
    if g is not None:
        if g.canvas is not None:
            g.canvas.get_tk_widget().destroy()
        if g.toolbar is not None:
            g.toolbar.destroy()
            g.toolbar = None
        if g.listbox is not None:
            g.listbox.destroy()

    if task_id == "2a":
        if document_missing():
            print("Please Provide a Valid Document ID")
        else:
            render_histogram(
                TaskManager.get_countries(doc_id, rows_for_document()),
                "Task 2A")
    elif task_id == "2b":
        if document_missing():
            print("Please Provide a Valid Document ID")
        else:
            render_histogram(
                TaskManager.get_continents(doc_id, rows_for_document()),
                "Task 2B")
    elif task_id == "3a":
        render_histogram(TaskManager.simple_get_all_browser(data), "Task 3A")
    elif task_id == "3b":
        render_histogram(TaskManager.get_all_browser(data), "Task 3B")
    elif task_id == "4":
        render_list(TaskManager.get_top_10(data))
    elif task_id == "5a":
        render_list(TaskManager.get_all_users_by_doc(doc_id, data))
    elif task_id == "5b":
        render_list(TaskManager.get_all_documents_by_user(user_id, data))
    elif task_id == "5c":
        render_list(TaskManager.task5(data, doc_id, user_id, None))
    elif task_id == "5d":
        render_list(TaskManager.task5(data, doc_id, user_id,
                                      TaskManager.sort_by_readership))
    elif task_id == "5e":
        render_list(TaskManager.task5(data, doc_id, user_id,
                                      TaskManager.sort_by_number))
    elif cmd:
        print("Invalid Task")
def getAnomalies(self, testStart = None, testEnd = None, trainStart = None, trainEnd = None, filterValue = None, targetType = None, algorithm = None, userState = None, userCause = None):
    """
    Query the anomaly service and group the returned anomalies by user tag.

    Every argument is optional; each one that is not None is appended to
    the query string under the parameter name shown in the table below
    (filterValue is sent as "sourceValue").

    Returns a dictionary where each key is a tuple of (cause, state) and
    each value is a Histograms object of the observed anomalous data a user
    tagged with that cause/state.  An anomaly that carries no userCause id
    or no userState id uses -1 for the missing part; tags are evaluated per
    anomaly and never inherited from the previous one.

    Raises ApiError if the service does not answer with HTTP 200.
    """
    query_fields = (
        ("detectionTimeWindowStart", testStart),
        ("detectionTimeWindowEnd", testEnd),
        ("trainingTimeWindowStart", trainStart),
        ("trainingTimeWindowEnd", trainEnd),
        ("sourceValue", filterValue),
        ("targetType", targetType),
        ("algorithm", algorithm),
        ("userCause", userCause),
        ("userState", userState),
    )
    # %-formatting accepts str, unicode and numbers alike, so callers may
    # pass any of them for any field.
    arg = "".join("&%s=%s" % (name, value)
                  for name, value in query_fields
                  if value is not None)
    # Single-string form prints the same text on Python 2 and Python 3.
    print("arg is  " + arg)

    resp = requests.get('http://' + self.host + '/essence-services/anomaly/query/?' + arg)
    if resp.status_code != 200:
        # This means something went wrong.
        raise ApiError('GET /essence-services/anomaly/query/ {}'.format(resp.status_code))

    def tag_id(anomaly, field):
        # Return anomaly[field]['id'], or -1 when the tag or its id is
        # absent.
        tag = anomaly.get(field)
        if isinstance(tag, dict) and 'id' in tag:
            return tag['id']
        return -1

    ret = dict()
    for anomaly in resp.json():
        cause = tag_id(anomaly, 'userCause')
        state = tag_id(anomaly, 'userState')

        if (cause, state) not in ret:
            # for why we init Histograms this way, see __init__ where it
            # takes in a matrix as an argument and converts it into
            # internal format. We're basically doing the same thing here
            # but skipping the step of making the matrix to pass into init
            ret[(cause, state)] = Histograms(-1, -1)
        hist = ret[(cause, state)]

        if 'anomalyEntries' not in anomaly:
            continue
        for entries in anomaly['anomalyEntries']:
            hist.insert_one(entries['sequenceNumber'], -1, use_internal_time = True)
        # One histogram row per anomaly.
        hist.next_row()
    return ret