def graph_cluster_parent(gr, dom_file, ip_file, level):  # level is for 1 or 2 hop propagation
    """Propagate scores from random training splits, then dump clusters.

    Args:
        gr: graph object handed to ``nr.bl_propagate2`` and
            ``identify_clusters``.
        dom_file: path of a text file; every right-stripped line becomes a
            key of the domain dictionary.
        ip_file: path of a text file; every right-stripped line becomes a
            key of the IP dictionary.
        level: forwarded unchanged to ``identify_clusters``.

    For every ``step`` in ``range(1, MAX_ITERATIONS)`` the function splits
    both dictionaries with ``rand_split``, calls ``nr.bl_propagate2`` and
    then calls ``identify_clusters`` with freshly truncated
    ``benign_clusters_op`` / ``mal_clusters_op`` files.  All files are
    opened through ``with`` so they are flushed and closed even when a
    helper raises.
    """
    with open(dom_file, 'r') as dom_handle:
        dom_list = dict.fromkeys((line.rstrip() for line in dom_handle), 1)
    with open(ip_file, 'r') as ip_handle:
        ip_list = dict.fromkeys((line.rstrip() for line in ip_handle), 1)

    POS_SC = 0.6
    NEG_SC = -0.8
    for step in range(1, MAX_ITERATIONS):  # runs MAX_ITERATIONS - 1 times
        (train_dom, test_dom) = rand_split(dom_list)
        (train_ip, test_ip) = rand_split(ip_list)
        # NOTE(review): `ret` is not read below; the call is kept because it
        # presumably updates state on `gr` -- confirm against nr.
        ret = nr.bl_propagate2(gr, train_dom, train_ip, 0, POS_SC, NEG_SC)
        # ret = nr.bl_propagate2(gr,train_dom,train_ip,1,POS_SC,NEG_SC)

        # NOTE(review): good_list, mal_train_list and mal_test_list are
        # computed but never used in this function.
        with open("benign10k", 'r') as fp:
            good_list = [ln.rstrip() for ln in fp]
        mal_train_list = train_dom + train_ip
        mal_test_list = test_dom + test_ip

        # Both outputs are truncated on every iteration, so only the last
        # iteration's content remains on disk.
        with open("benign_clusters_op", 'w') as fp1:
            with open("mal_clusters_op", 'w') as fp2:
                identify_clusters(gr, level, fp1, fp2)
def graph_cluster_parent(gr, dom_file, ip_file, level):  # level is for 1 or 2 hop propagation
    """Propagate scores from random training splits, then dump clusters.

    Args:
        gr: graph object handed to ``nr.bl_propagate2`` and
            ``identify_clusters``.
        dom_file: path of a text file; every right-stripped line becomes a
            key of the domain dictionary.
        ip_file: path of a text file; every right-stripped line becomes a
            key of the IP dictionary.
        level: forwarded unchanged to ``identify_clusters``.

    For every ``step`` in ``range(1, MAX_ITERATIONS)`` the function splits
    both dictionaries with ``rand_split``, calls ``nr.bl_propagate2`` and
    then calls ``identify_clusters`` with freshly truncated
    ``benign_clusters_op`` / ``mal_clusters_op`` files.  All files are
    opened through ``with`` so they are flushed and closed even when a
    helper raises.
    """
    with open(dom_file, 'r') as dom_handle:
        dom_list = dict.fromkeys((line.rstrip() for line in dom_handle), 1)
    with open(ip_file, 'r') as ip_handle:
        ip_list = dict.fromkeys((line.rstrip() for line in ip_handle), 1)

    POS_SC = 0.6
    NEG_SC = -0.8
    for step in range(1, MAX_ITERATIONS):  # runs MAX_ITERATIONS - 1 times
        (train_dom, test_dom) = rand_split(dom_list)
        (train_ip, test_ip) = rand_split(ip_list)
        # NOTE(review): `ret` is not read below; the call is kept because it
        # presumably updates state on `gr` -- confirm against nr.
        ret = nr.bl_propagate2(gr, train_dom, train_ip, 0, POS_SC, NEG_SC)
        # ret = nr.bl_propagate2(gr,train_dom,train_ip,1,POS_SC,NEG_SC)

        # NOTE(review): good_list, mal_train_list and mal_test_list are
        # computed but never used in this function.
        with open("benign10k", 'r') as fp:
            good_list = [ln.rstrip() for ln in fp]
        mal_train_list = train_dom + train_ip
        mal_test_list = test_dom + test_ip

        # Both outputs are truncated on every iteration, so only the last
        # iteration's content remains on disk.
        with open("benign_clusters_op", 'w') as fp1:
            with open("mal_clusters_op", 'w') as fp2:
                identify_clusters(gr, level, fp1, fp2)
def _read_stripped_lines(path):
    """Return a dict whose keys are the right-stripped lines of *path*."""
    with open(path, 'r') as handle:
        return dict.fromkeys((line.rstrip() for line in handle), 1)


def _add_neighborhood(gr, nodes, level, neighborhood, keep_missing):
    """Add *nodes* plus their 1-hop (2-hop when level == 2) neighbors to the set *neighborhood*."""
    for node in nodes:
        node = node.rstrip()
        present = node in gr
        if present or keep_missing:
            neighborhood.add(node)
        if not present:
            continue
        for first_hop in gr.edge[node]:
            neighborhood.add(first_hop)
            if level == 2:
                for second_hop in gr.edge[first_hop]:
                    neighborhood.add(second_hop)


def _tally_test_nodes(test_nodes, gr, neighborhood, ret, pos_score,
                      neg_score, counts, classified, tp_file, fn_file):
    """Classify held-out nodes as tp / fn, updating *counts* and *classified* in place."""
    for node in test_nodes:
        node = node.rstrip()
        if node not in gr:
            counts['not_in_graph'] += 1
            continue
        if node not in neighborhood:
            counts['not_neighbor'] += 1
            continue
        counts['is_neighbor'] += 1
        if node not in ret:
            # In the neighborhood but unscored; reported separately so the
            # reader can decide whether to treat these as false negatives.
            counts['no_score'] += 1
            continue
        score = ret[node]
        if score < neg_score:
            counts['tp_num'] += 1
            out = tp_file
        elif score > pos_score:
            counts['fn_num'] += 1  # listed node that scored as benign
            out = fn_file
        else:
            continue
        classified.add(node)
        out.write("%s %f\n" % (node, score))


def _tally_unclassified(ret, classified, pos_score, neg_score, counts, fp_file):
    """Bucket every scored node not already in *classified* by sign and threshold."""
    for node in ret:
        if node in classified:
            continue
        score = ret[node]
        if score > 0 and score < pos_score:
            counts['pos_less_than_thres'] += 1
        elif score > pos_score:
            counts['pos_more_than_thres'] += 1
        elif score < 0 and score > neg_score:
            counts['neg_more_than_thres'] += 1
        elif score < neg_score:
            # Scored as malicious without being a classified test node.
            counts['neg_less_than_thres'] += 1
            fp_file.write("%s %f\n" % (node, score))


def graph_scores2(gr, dom_file, ip_file, POS_SCORE, NEG_SCORE, POS_SC, NEG_SC, g3, level):  # level is for 1 or 2 hop propagation
    """Run repeated train/test propagation and report tp/fp/fn statistics.

    Args:
        gr: graph object; membership is tested with ``in`` and neighbors are
            read from ``gr.edge[node]``.
        dom_file: path of a text file; each right-stripped line is a domain.
        ip_file: path of a text file; each right-stripped line is an IP.
        POS_SCORE: scores strictly above this count as marked positive.
        NEG_SCORE: scores strictly below this count as marked negative.
        POS_SC: forwarded to ``nr.bl_propagate2``.
        NEG_SC: forwarded to ``nr.bl_propagate2``.
        g3: writable file-like object; one summary section is written and
            flushed per iteration.
        level: when equal to 2 the neighborhood also includes second-hop
            neighbors, otherwise only first-hop neighbors.

    For every ``step`` in ``range(1, MAX_ITERATIONS)`` the per-iteration
    files ``tpfp-results/{truep,falsen,falsep,truen}_r<step>`` are created.
    ``truen_r<step>`` is created but nothing is written to it, and
    ``tn_num`` is always reported as 0 because true-negative counting is
    disabled.  ``fp_num`` is reported as the number of unclassified nodes
    scoring below ``NEG_SCORE`` on both stdout and ``g3``.
    """
    dom_list = _read_stripped_lines(dom_file)
    ip_list = _read_stripped_lines(ip_file)
    counter_names = ('tp_num', 'fn_num', 'no_score', 'is_neighbor',
                     'not_neighbor', 'not_in_graph', 'neg_more_than_thres',
                     'pos_less_than_thres', 'pos_more_than_thres',
                     'neg_less_than_thres')

    for step in range(1, MAX_ITERATIONS):  # runs MAX_ITERATIONS - 1 times
        (train_dom, test_dom) = rand_split(dom_list)
        (train_ip, test_ip) = rand_split(ip_list)
        # The result of the pass with argument 0 is not used; the call is
        # kept because it may update state on `gr` (NOTE(review): confirm).
        nr.bl_propagate2(gr, train_dom, train_ip, 0, POS_SC, NEG_SC)
        ret = nr.bl_propagate2(gr, train_dom, train_ip, 1, POS_SC, NEG_SC)

        counts = dict.fromkeys(counter_names, 0)
        classified = set()  # test nodes already written as tp or fn
        with open("tpfp-results/truep_r" + str(step), 'w') as f1, \
                open("tpfp-results/falsen_r" + str(step), 'w') as f2, \
                open("tpfp-results/falsep_r" + str(step), 'w') as f3, \
                open("tpfp-results/truen_r" + str(step), 'w'):
            print("Now preparing new graph(dictionary)...")
            # Listed domains are always part of the neighborhood; listed IPs
            # only when they occur in the graph.
            dom_ip_neigh = set()
            _add_neighborhood(gr, dom_list, level, dom_ip_neigh, True)
            _add_neighborhood(gr, ip_list, level, dom_ip_neigh, False)
            print("Done preparing new graph...")

            print("Now starting tp,fp analysis...")
            for held_out in (test_dom, test_ip):
                _tally_test_nodes(held_out, gr, dom_ip_neigh, ret, POS_SCORE,
                                  NEG_SCORE, counts, classified, f1, f2)
            _tally_unclassified(ret, classified, POS_SCORE, NEG_SCORE,
                                counts, f3)

        # NOTE(review): fp1 and fp2 are not defined inside this function;
        # unless they exist at module level this call raises NameError.
        identify_clusters(gr, level, fp1, fp2)

        tp_num = counts['tp_num']
        fn_num = counts['fn_num']
        tn_num = 0  # true-negative counting is disabled
        fp_num = counts['neg_less_than_thres']
        no_score = counts['no_score']
        is_neighbor = counts['is_neighbor']
        not_neighbor = counts['not_neighbor']
        not_in_graph = counts['not_in_graph']
        neg_more_than_thres = counts['neg_more_than_thres']
        pos_less_than_thres = counts['pos_less_than_thres']
        pos_more_than_thres = counts['pos_more_than_thres']
        neg_less_than_thres = counts['neg_less_than_thres']

        print("Finished with tp/fp analysis")
        print("Values: POS_SCORE: %f, NEG_SCORE: %f, POS_SC: %f, NEG_SC: %f\n" % (POS_SCORE, NEG_SCORE, POS_SC, NEG_SC))
        print("Step(iteration): " + str(step))
        print("tp_num: " + str(tp_num))
        print("fp_num: " + str(fp_num))
        print("tn_num: " + str(tn_num))
        print("fn_num: " + str(fn_num))
        print("Present in neighbohood, but no score(should be marked as false negatives?): " + str(no_score))
        print("is neighbor(of testing): " + str(is_neighbor))
        print("not_neighbor: " + str(not_neighbor))
        print("not_in_graph: " + str(not_in_graph))
        print("Negative more than threshold: " + str(neg_more_than_thres))
        print("Positive less than threshold: " + str(pos_less_than_thres))
        print("Positive more than threshold(tn): " + str(pos_more_than_thres))
        print("Negative less than threshold(fp): " + str(neg_less_than_thres))

        g3.write("###################### Iteration: %d ######################\n" % (step))
        g3.write("Values: POS_SCORE: %f, NEG_SCORE: %f, POS_SC: %f, NEG_SC: %f\n" % (POS_SCORE, NEG_SCORE, POS_SC, NEG_SC))
        g3.write("tp_num: %d\n" % (tp_num))
        g3.write("fp_num: %d\n" % (fp_num))
        g3.write("tn_num: %d\n" % (tn_num))
        g3.write("fn_num: %d\n" % (fn_num))
        g3.write("is neighbor(of testing): %d\n" % (is_neighbor))
        g3.write("Present in neighborhood, but no score(should be marked as false -ves: %d ?\n" % (no_score))
        g3.write("not_neighbor: %d\n" % (not_neighbor))
        g3.write("not_in_graph: %d\n" % (not_in_graph))
        g3.write("Negative more than threshold: %d\n" % (neg_more_than_thres))
        g3.write("Positive less than threshold: %d\n" % (pos_less_than_thres))
        g3.write("Negative less than threshold(fp): %d\n" % (neg_less_than_thres))
        g3.write("Positive more than threshold(tn): %d\n" % (pos_more_than_thres))
        g3.write("\n")
        g3.flush()
def _read_stripped_lines(path):
    """Return a dict whose keys are the right-stripped lines of *path*."""
    with open(path, 'r') as handle:
        return dict.fromkeys((line.rstrip() for line in handle), 1)


def _add_neighborhood(gr, nodes, level, neighborhood, keep_missing):
    """Add *nodes* plus their 1-hop (2-hop when level == 2) neighbors to the set *neighborhood*."""
    for node in nodes:
        node = node.rstrip()
        present = node in gr
        if present or keep_missing:
            neighborhood.add(node)
        if not present:
            continue
        for first_hop in gr.edge[node]:
            neighborhood.add(first_hop)
            if level == 2:
                for second_hop in gr.edge[first_hop]:
                    neighborhood.add(second_hop)


def _tally_test_nodes(test_nodes, gr, neighborhood, ret, pos_score,
                      neg_score, counts, classified, tp_file, fn_file):
    """Classify held-out nodes as tp / fn, updating *counts* and *classified* in place."""
    for node in test_nodes:
        node = node.rstrip()
        if node not in gr:
            counts['not_in_graph'] += 1
            continue
        if node not in neighborhood:
            counts['not_neighbor'] += 1
            continue
        counts['is_neighbor'] += 1
        if node not in ret:
            # In the neighborhood but unscored; reported separately so the
            # reader can decide whether to treat these as false negatives.
            counts['no_score'] += 1
            continue
        score = ret[node]
        if score < neg_score:
            counts['tp_num'] += 1
            out = tp_file
        elif score > pos_score:
            counts['fn_num'] += 1  # listed node that scored as benign
            out = fn_file
        else:
            continue
        classified.add(node)
        out.write("%s %f\n" % (node, score))


def _tally_unclassified(ret, classified, pos_score, neg_score, counts, fp_file):
    """Bucket every scored node not already in *classified* by sign and threshold."""
    for node in ret:
        if node in classified:
            continue
        score = ret[node]
        if score > 0 and score < pos_score:
            counts['pos_less_than_thres'] += 1
        elif score > pos_score:
            counts['pos_more_than_thres'] += 1
        elif score < 0 and score > neg_score:
            counts['neg_more_than_thres'] += 1
        elif score < neg_score:
            # Scored as malicious without being a classified test node.
            counts['neg_less_than_thres'] += 1
            fp_file.write("%s %f\n" % (node, score))


def graph_scores2(gr, dom_file, ip_file, POS_SCORE, NEG_SCORE, POS_SC, NEG_SC, g3, level):  # level is for 1 or 2 hop propagation
    """Run repeated train/test propagation and report tp/fp/fn statistics.

    Args:
        gr: graph object; membership is tested with ``in`` and neighbors are
            read from ``gr.edge[node]``.
        dom_file: path of a text file; each right-stripped line is a domain.
        ip_file: path of a text file; each right-stripped line is an IP.
        POS_SCORE: scores strictly above this count as marked positive.
        NEG_SCORE: scores strictly below this count as marked negative.
        POS_SC: forwarded to ``nr.bl_propagate2``.
        NEG_SC: forwarded to ``nr.bl_propagate2``.
        g3: writable file-like object; one summary section is written and
            flushed per iteration.
        level: when equal to 2 the neighborhood also includes second-hop
            neighbors, otherwise only first-hop neighbors.

    For every ``step`` in ``range(1, MAX_ITERATIONS)`` the per-iteration
    files ``tpfp-results/{truep,falsen,falsep,truen}_r<step>`` are created.
    ``truen_r<step>`` is created but nothing is written to it, and
    ``tn_num`` is always reported as 0 because true-negative counting is
    disabled.  ``fp_num`` is reported as the number of unclassified nodes
    scoring below ``NEG_SCORE`` on both stdout and ``g3``.
    """
    dom_list = _read_stripped_lines(dom_file)
    ip_list = _read_stripped_lines(ip_file)
    counter_names = ('tp_num', 'fn_num', 'no_score', 'is_neighbor',
                     'not_neighbor', 'not_in_graph', 'neg_more_than_thres',
                     'pos_less_than_thres', 'pos_more_than_thres',
                     'neg_less_than_thres')

    for step in range(1, MAX_ITERATIONS):  # runs MAX_ITERATIONS - 1 times
        (train_dom, test_dom) = rand_split(dom_list)
        (train_ip, test_ip) = rand_split(ip_list)
        # The result of the pass with argument 0 is not used; the call is
        # kept because it may update state on `gr` (NOTE(review): confirm).
        nr.bl_propagate2(gr, train_dom, train_ip, 0, POS_SC, NEG_SC)
        ret = nr.bl_propagate2(gr, train_dom, train_ip, 1, POS_SC, NEG_SC)

        counts = dict.fromkeys(counter_names, 0)
        classified = set()  # test nodes already written as tp or fn
        with open("tpfp-results/truep_r" + str(step), 'w') as f1, \
                open("tpfp-results/falsen_r" + str(step), 'w') as f2, \
                open("tpfp-results/falsep_r" + str(step), 'w') as f3, \
                open("tpfp-results/truen_r" + str(step), 'w'):
            print("Now preparing new graph(dictionary)...")
            # Listed domains are always part of the neighborhood; listed IPs
            # only when they occur in the graph.
            dom_ip_neigh = set()
            _add_neighborhood(gr, dom_list, level, dom_ip_neigh, True)
            _add_neighborhood(gr, ip_list, level, dom_ip_neigh, False)
            print("Done preparing new graph...")

            print("Now starting tp,fp analysis...")
            for held_out in (test_dom, test_ip):
                _tally_test_nodes(held_out, gr, dom_ip_neigh, ret, POS_SCORE,
                                  NEG_SCORE, counts, classified, f1, f2)
            _tally_unclassified(ret, classified, POS_SCORE, NEG_SCORE,
                                counts, f3)

        # NOTE(review): fp1 and fp2 are not defined inside this function;
        # unless they exist at module level this call raises NameError.
        identify_clusters(gr, level, fp1, fp2)

        tp_num = counts['tp_num']
        fn_num = counts['fn_num']
        tn_num = 0  # true-negative counting is disabled
        fp_num = counts['neg_less_than_thres']
        no_score = counts['no_score']
        is_neighbor = counts['is_neighbor']
        not_neighbor = counts['not_neighbor']
        not_in_graph = counts['not_in_graph']
        neg_more_than_thres = counts['neg_more_than_thres']
        pos_less_than_thres = counts['pos_less_than_thres']
        pos_more_than_thres = counts['pos_more_than_thres']
        neg_less_than_thres = counts['neg_less_than_thres']

        print("Finished with tp/fp analysis")
        print("Values: POS_SCORE: %f, NEG_SCORE: %f, POS_SC: %f, NEG_SC: %f\n" % (POS_SCORE, NEG_SCORE, POS_SC, NEG_SC))
        print("Step(iteration): " + str(step))
        print("tp_num: " + str(tp_num))
        print("fp_num: " + str(fp_num))
        print("tn_num: " + str(tn_num))
        print("fn_num: " + str(fn_num))
        print("Present in neighbohood, but no score(should be marked as false negatives?): " + str(no_score))
        print("is neighbor(of testing): " + str(is_neighbor))
        print("not_neighbor: " + str(not_neighbor))
        print("not_in_graph: " + str(not_in_graph))
        print("Negative more than threshold: " + str(neg_more_than_thres))
        print("Positive less than threshold: " + str(pos_less_than_thres))
        print("Positive more than threshold(tn): " + str(pos_more_than_thres))
        print("Negative less than threshold(fp): " + str(neg_less_than_thres))

        g3.write("###################### Iteration: %d ######################\n" % (step))
        g3.write("Values: POS_SCORE: %f, NEG_SCORE: %f, POS_SC: %f, NEG_SC: %f\n" % (POS_SCORE, NEG_SCORE, POS_SC, NEG_SC))
        g3.write("tp_num: %d\n" % (tp_num))
        g3.write("fp_num: %d\n" % (fp_num))
        g3.write("tn_num: %d\n" % (tn_num))
        g3.write("fn_num: %d\n" % (fn_num))
        g3.write("is neighbor(of testing): %d\n" % (is_neighbor))
        g3.write("Present in neighborhood, but no score(should be marked as false -ves: %d ?\n" % (no_score))
        g3.write("not_neighbor: %d\n" % (not_neighbor))
        g3.write("not_in_graph: %d\n" % (not_in_graph))
        g3.write("Negative more than threshold: %d\n" % (neg_more_than_thres))
        g3.write("Positive less than threshold: %d\n" % (pos_less_than_thres))
        g3.write("Negative less than threshold(fp): %d\n" % (neg_less_than_thres))
        g3.write("Positive more than threshold(tn): %d\n" % (pos_more_than_thres))
        g3.write("\n")
        g3.flush()