Exemple #1
0
def graph_cluster_parent(gr,dom_file, ip_file,level): #level is for 1 or 2 hop propagation
    """Propagate scores over random splits, then hand the graph to identify_clusters.

    Args:
        gr: graph object forwarded unchanged to ``nr.bl_propagate2`` and
            ``identify_clusters``.
        dom_file: path to a text file with one domain per line.
        ip_file: path to a text file with one IP per line.
        level: forwarded unchanged to ``identify_clusters`` (1 or 2 hops).

    Side effects:
        Creates/truncates ``benign_clusters_op`` and ``mal_clusters_op`` in
        the current working directory and passes the open handles to
        ``identify_clusters``; both are closed before returning, also when
        ``identify_clusters`` raises.

    Returns:
        None.
    """
    # Entries are stored as dict keys (value 1) because rand_split receives
    # the dict itself; only trailing whitespace is stripped from each line.
    with open(dom_file, 'r') as dom_fh:
        dom_list = dict.fromkeys((line.rstrip() for line in dom_fh), 1)
    with open(ip_file, 'r') as ip_fh:
        ip_list = dict.fromkeys((line.rstrip() for line in ip_fh), 1)

    POS_SC = 0.6
    NEG_SC = -0.8

    # range(1, MAX_ITERATIONS) performs MAX_ITERATIONS - 1 rounds.
    for _round in range(1, MAX_ITERATIONS):
        (train_dom, _test_dom) = rand_split(dom_list)
        (train_ip, _test_ip) = rand_split(ip_list)
        # The return value is not used here.
        # NOTE(review): presumably bl_propagate2 updates gr in place - confirm.
        nr.bl_propagate2(gr, train_dom, train_ip, 0, POS_SC, NEG_SC)

    # Nested 'with' blocks guarantee both output files are flushed and closed.
    with open("benign_clusters_op", 'w') as fp1:
        with open("mal_clusters_op", 'w') as fp2:
            identify_clusters(gr, level, fp1, fp2)
Exemple #2
0
def graph_cluster_parent(gr, dom_file, ip_file,
                         level):  #level is for 1 or 2 hop propagation
    """Propagate scores over random splits, then hand the graph to identify_clusters.

    Args:
        gr: graph object forwarded unchanged to ``nr.bl_propagate2`` and
            ``identify_clusters``.
        dom_file: path to a text file with one domain per line.
        ip_file: path to a text file with one IP per line.
        level: forwarded unchanged to ``identify_clusters`` (1 or 2 hops).

    Side effects:
        Creates/truncates ``benign_clusters_op`` and ``mal_clusters_op`` in
        the current working directory and passes the open handles to
        ``identify_clusters``; both are closed before returning, also when
        ``identify_clusters`` raises.

    Returns:
        None.
    """
    # Entries are stored as dict keys (value 1) because rand_split receives
    # the dict itself; only trailing whitespace is stripped from each line.
    with open(dom_file, 'r') as dom_fh:
        dom_list = dict.fromkeys((line.rstrip() for line in dom_fh), 1)
    with open(ip_file, 'r') as ip_fh:
        ip_list = dict.fromkeys((line.rstrip() for line in ip_fh), 1)

    POS_SC = 0.6
    NEG_SC = -0.8

    # range(1, MAX_ITERATIONS) performs MAX_ITERATIONS - 1 rounds.
    for _round in range(1, MAX_ITERATIONS):
        (train_dom, _test_dom) = rand_split(dom_list)
        (train_ip, _test_ip) = rand_split(ip_list)
        # The return value is not used here.
        # NOTE(review): presumably bl_propagate2 updates gr in place - confirm.
        nr.bl_propagate2(gr, train_dom, train_ip, 0, POS_SC, NEG_SC)

    # Nested 'with' blocks guarantee both output files are flushed and closed.
    with open("benign_clusters_op", 'w') as fp1:
        with open("mal_clusters_op", 'w') as fp2:
            identify_clusters(gr, level, fp1, fp2)
Exemple #3
0
def graph_scores2(gr,dom_file, ip_file,POS_SCORE,NEG_SCORE,POS_SC,NEG_SC,g3, level): #level is for 1 or 2 hop propagation
    """Run repeated propagation rounds and tally how the test entries score.

    For every step in ``range(1, MAX_ITERATIONS)`` the domain and IP lists
    are split with ``rand_split``, ``nr.bl_propagate2`` is called twice
    (fourth argument 0, then 1) and the scores returned by the second call
    are compared against ``NEG_SCORE`` / ``POS_SCORE``.

    Args:
        gr: graph object; must support ``node in gr`` and ``gr[node]``.
        dom_file: path to a text file with one domain per line.
        ip_file: path to a text file with one IP per line.
        POS_SCORE: a score strictly above this is counted as "positive".
        NEG_SCORE: a score strictly below this is counted as "negative".
        POS_SC: forwarded unchanged to ``nr.bl_propagate2``.
        NEG_SC: forwarded unchanged to ``nr.bl_propagate2``.
        g3: writable file-like object; one summary section is written and
            flushed per step. It is not closed here.
        level: when equal to 2, neighbors of neighbors are also added to
            the neighborhood used for the analysis.

    Side effects:
        Per step, creates/truncates ``tpfp-results/truep_r<step>``,
        ``falsen_r<step>``, ``falsep_r<step>`` and ``truen_r<step>`` (the
        last one is only created, nothing is written to it), prints a
        summary to stdout and calls ``identify_clusters``.

    Returns:
        None.
    """
    dom_list = _gs2_read_entries(dom_file)
    ip_list = _gs2_read_entries(ip_file)

    # range(1, MAX_ITERATIONS) performs MAX_ITERATIONS - 1 rounds.
    for step in range(1, MAX_ITERATIONS):
        (train_dom, test_dom) = rand_split(dom_list)
        (train_ip, test_ip) = rand_split(ip_list)
        # Only the result of the second call is analysed; the first call is
        # kept because it may have effects on gr that are not visible here.
        nr.bl_propagate2(gr, train_dom, train_ip, 0, POS_SC, NEG_SC)
        ret = nr.bl_propagate2(gr, train_dom, train_ip, 1, POS_SC, NEG_SC)

        counts = {
            "tp_num": 0,
            "fn_num": 0,
            "no_score": 0,             # in the neighborhood, but absent from ret
            "is_neighbor": 0,          # in gr and in the neighborhood
            "not_neighbor": 0,         # in gr, but outside the neighborhood
            "not_in_graph": 0,         # absent from gr
            "neg_more_than_thres": 0,  # NEG_SCORE < score < 0
            "pos_less_than_thres": 0,  # 0 < score < POS_SCORE
            "pos_more_than_thres": 0,  # score > POS_SCORE
            "neg_less_than_thres": 0,  # score < NEG_SCORE
        }
        scored = {}  # test entries already written to the truep/falsen files

        handles = []
        try:
            for stem in ("truep_r", "falsen_r", "falsep_r", "truen_r"):
                handles.append(open("tpfp-results/" + stem + str(step), 'w'))
            tp_file, fn_file, fp_file = handles[:3]

            print("Now preparing new graph(dictionary)...")
            # NOTE(review): the neighborhood is seeded from the complete
            # dom_list/ip_list, not from train_dom/train_ip - confirm intent.
            neighborhood = _gs2_neighborhood(gr, dom_list, ip_list, level)
            print("Done preparing new graph...")

            print("Now starting tp,fp analysis...")
            for test_nodes in (test_dom, test_ip):
                _gs2_tally(test_nodes, gr, neighborhood, ret, POS_SCORE,
                           NEG_SCORE, tp_file, fn_file, counts, scored)

            # Every remaining scored node is bucketed by sign and threshold.
            # Scores equal to 0 or to a threshold fall into no bucket.
            for node in ret:
                if node in scored:
                    continue
                score = ret[node]
                if 0 < score < POS_SCORE:
                    counts["pos_less_than_thres"] += 1
                elif score > POS_SCORE:
                    counts["pos_more_than_thres"] += 1
                elif NEG_SCORE < score < 0:
                    counts["neg_more_than_thres"] += 1
                elif score < NEG_SCORE:
                    counts["neg_less_than_thres"] += 1
                    fp_file.write("%s %f\n" % (node, score))
        finally:
            # Close whatever was opened, even when the analysis raised.
            for handle in handles:
                handle.close()

        # NOTE(review): fp1 and fp2 are not defined inside this function;
        # this only works if they exist as module-level names - confirm.
        identify_clusters(gr, level, fp1, fp2)
        _gs2_report(step, counts, POS_SCORE, NEG_SCORE, POS_SC, NEG_SC, g3)


def _gs2_read_entries(path):
    """Return a dict keyed by each right-stripped line of *path* (value 1)."""
    with open(path, 'r') as handle:
        return dict.fromkeys((line.rstrip() for line in handle), 1)


def _gs2_add_neighbors(gr, node, two_hop, neighborhood):
    """Add the neighbors of *node* (and theirs, if *two_hop*) to *neighborhood*."""
    # gr[node] is used instead of gr.edge[node]: assuming gr is a networkx
    # graph, both give the adjacency of node on 1.x, and gr.edge is gone in 2.x.
    for first in gr[node]:
        neighborhood.add(first)
        if two_hop:
            neighborhood.update(gr[first])


def _gs2_neighborhood(gr, dom_list, ip_list, level):
    """Collect the listed entries plus their 1- or 2-hop neighbors in *gr*."""
    two_hop = level == 2
    neighborhood = set()
    for raw in dom_list:
        node = raw.rstrip()
        neighborhood.add(node)  # domains are added even when absent from gr
        if node in gr:
            _gs2_add_neighbors(gr, node, two_hop, neighborhood)
    for raw in ip_list:
        node = raw.rstrip()
        if node in gr:  # IPs are only added when present in gr
            neighborhood.add(node)
            _gs2_add_neighbors(gr, node, two_hop, neighborhood)
    return neighborhood


def _gs2_tally(nodes, gr, neighborhood, ret, pos_score, neg_score, tp_file,
               fn_file, counts, scored):
    """Classify each test entry, updating *counts*, *scored* and the files."""
    for raw in nodes:
        node = raw.rstrip()
        if node not in gr:
            # Counted for domains and IPs alike so the reported total covers
            # every test entry that is missing from the graph.
            counts["not_in_graph"] += 1
            continue
        if node not in neighborhood:
            counts["not_neighbor"] += 1
            continue
        counts["is_neighbor"] += 1
        if node not in ret:
            counts["no_score"] += 1
            continue
        score = ret[node]
        if score < neg_score:
            counts["tp_num"] += 1
            scored[node] = score
            tp_file.write("%s %f\n" % (node, score))
        elif score > pos_score:
            # Counted as a false negative: a test entry scored above POS_SCORE.
            counts["fn_num"] += 1
            scored[node] = score
            fn_file.write("%s %f\n" % (node, score))


def _gs2_report(step, counts, pos_score, neg_score, pos_sc, neg_sc, g3):
    """Print the per-step summary and append the same figures to *g3*."""
    values_line = "Values: POS_SCORE: %f, NEG_SCORE: %f, POS_SC: %f, NEG_SC: %f\n" % (
        pos_score, neg_score, pos_sc, neg_sc)
    # The console and the g3 log now report the same fp figure.
    fp_count = counts["neg_less_than_thres"]
    tn_num = 0  # true negatives are never tallied; always reported as 0

    # Single-argument print(...) gives identical output as a Python 2 print
    # statement and as the Python 3 function.
    print("Finished with tp/fp analysis")
    print(values_line)
    print("Step(iteration): " + str(step))
    print("tp_num: " + str(counts["tp_num"]))
    print("fp_num: " + str(fp_count))
    print("tn_num: " + str(tn_num))
    print("fn_num: " + str(counts["fn_num"]))
    print("Present in neighbohood, but no score(should be marked as false negatives?): " + str(counts["no_score"]))
    print("is neighbor(of testing): " + str(counts["is_neighbor"]))
    print("not_neighbor: " + str(counts["not_neighbor"]))
    print("not_in_graph: " + str(counts["not_in_graph"]))
    print("Negative more than threshold: " + str(counts["neg_more_than_thres"]))
    print("Positive less than threshold: " + str(counts["pos_less_than_thres"]))
    print("Positive more than threshold(tn): " + str(counts["pos_more_than_thres"]))
    print("Negative less than threshold(fp): " + str(counts["neg_less_than_thres"]))

    g3.write("###################### Iteration: %d ######################\n" % (step))
    g3.write(values_line)
    g3.write("tp_num: %d\n" % (counts["tp_num"]))
    g3.write("fp_num: %d\n" % (fp_count))
    g3.write("tn_num: %d\n" % (tn_num))
    g3.write("fn_num: %d\n" % (counts["fn_num"]))
    g3.write("is neighbor(of testing): %d\n" % (counts["is_neighbor"]))
    g3.write("Present in neighborhood, but no score(should be marked as false -ves: %d ?\n" % (counts["no_score"]))
    g3.write("not_neighbor: %d\n" % (counts["not_neighbor"]))
    g3.write("not_in_graph: %d\n" % (counts["not_in_graph"]))
    g3.write("Negative more than threshold: %d\n" % (counts["neg_more_than_thres"]))
    g3.write("Positive less than threshold: %d\n" % (counts["pos_less_than_thres"]))
    g3.write("Negative less than threshold(fp): %d\n" % (counts["neg_less_than_thres"]))
    g3.write("Positive more than threshold(tn): %d\n" % (counts["pos_more_than_thres"]))
    g3.write("\n")
    g3.flush()
Exemple #4
0
def graph_scores2(gr, dom_file, ip_file, POS_SCORE, NEG_SCORE, POS_SC, NEG_SC,
                  g3, level):  #level is for 1 or 2 hop propagation
    """Run repeated propagation rounds and tally how the test entries score.

    For every step in ``range(1, MAX_ITERATIONS)`` the domain and IP lists
    are split with ``rand_split``, ``nr.bl_propagate2`` is called twice
    (fourth argument 0, then 1) and the scores returned by the second call
    are compared against ``NEG_SCORE`` / ``POS_SCORE``.

    Args:
        gr: graph object; must support ``node in gr`` and ``gr[node]``.
        dom_file: path to a text file with one domain per line.
        ip_file: path to a text file with one IP per line.
        POS_SCORE: a score strictly above this is counted as "positive".
        NEG_SCORE: a score strictly below this is counted as "negative".
        POS_SC: forwarded unchanged to ``nr.bl_propagate2``.
        NEG_SC: forwarded unchanged to ``nr.bl_propagate2``.
        g3: writable file-like object; one summary section is written and
            flushed per step. It is not closed here.
        level: when equal to 2, neighbors of neighbors are also added to
            the neighborhood used for the analysis.

    Side effects:
        Per step, creates/truncates ``tpfp-results/truep_r<step>``,
        ``falsen_r<step>``, ``falsep_r<step>`` and ``truen_r<step>`` (the
        last one is only created, nothing is written to it), prints a
        summary to stdout and calls ``identify_clusters``.

    Returns:
        None.
    """
    dom_list = _gs2_read_entries(dom_file)
    ip_list = _gs2_read_entries(ip_file)

    # range(1, MAX_ITERATIONS) performs MAX_ITERATIONS - 1 rounds.
    for step in range(1, MAX_ITERATIONS):
        (train_dom, test_dom) = rand_split(dom_list)
        (train_ip, test_ip) = rand_split(ip_list)
        # Only the result of the second call is analysed; the first call is
        # kept because it may have effects on gr that are not visible here.
        nr.bl_propagate2(gr, train_dom, train_ip, 0, POS_SC, NEG_SC)
        ret = nr.bl_propagate2(gr, train_dom, train_ip, 1, POS_SC, NEG_SC)

        counts = {
            "tp_num": 0,
            "fn_num": 0,
            "no_score": 0,  # in the neighborhood, but absent from ret
            "is_neighbor": 0,  # in gr and in the neighborhood
            "not_neighbor": 0,  # in gr, but outside the neighborhood
            "not_in_graph": 0,  # absent from gr
            "neg_more_than_thres": 0,  # NEG_SCORE < score < 0
            "pos_less_than_thres": 0,  # 0 < score < POS_SCORE
            "pos_more_than_thres": 0,  # score > POS_SCORE
            "neg_less_than_thres": 0,  # score < NEG_SCORE
        }
        scored = {}  # test entries already written to the truep/falsen files

        handles = []
        try:
            for stem in ("truep_r", "falsen_r", "falsep_r", "truen_r"):
                handles.append(open("tpfp-results/" + stem + str(step), 'w'))
            tp_file, fn_file, fp_file = handles[:3]

            print("Now preparing new graph(dictionary)...")
            # NOTE(review): the neighborhood is seeded from the complete
            # dom_list/ip_list, not from train_dom/train_ip - confirm intent.
            neighborhood = _gs2_neighborhood(gr, dom_list, ip_list, level)
            print("Done preparing new graph...")

            print("Now starting tp,fp analysis...")
            for test_nodes in (test_dom, test_ip):
                _gs2_tally(test_nodes, gr, neighborhood, ret, POS_SCORE,
                           NEG_SCORE, tp_file, fn_file, counts, scored)

            # Every remaining scored node is bucketed by sign and threshold.
            # Scores equal to 0 or to a threshold fall into no bucket.
            for node in ret:
                if node in scored:
                    continue
                score = ret[node]
                if 0 < score < POS_SCORE:
                    counts["pos_less_than_thres"] += 1
                elif score > POS_SCORE:
                    counts["pos_more_than_thres"] += 1
                elif NEG_SCORE < score < 0:
                    counts["neg_more_than_thres"] += 1
                elif score < NEG_SCORE:
                    counts["neg_less_than_thres"] += 1
                    fp_file.write("%s %f\n" % (node, score))
        finally:
            # Close whatever was opened, even when the analysis raised.
            for handle in handles:
                handle.close()

        # NOTE(review): fp1 and fp2 are not defined inside this function;
        # this only works if they exist as module-level names - confirm.
        identify_clusters(gr, level, fp1, fp2)
        _gs2_report(step, counts, POS_SCORE, NEG_SCORE, POS_SC, NEG_SC, g3)


def _gs2_read_entries(path):
    """Return a dict keyed by each right-stripped line of *path* (value 1)."""
    with open(path, 'r') as handle:
        return dict.fromkeys((line.rstrip() for line in handle), 1)


def _gs2_add_neighbors(gr, node, two_hop, neighborhood):
    """Add the neighbors of *node* (and theirs, if *two_hop*) to *neighborhood*."""
    # gr[node] is used instead of gr.edge[node]: assuming gr is a networkx
    # graph, both give the adjacency of node on 1.x, and gr.edge is gone in 2.x.
    for first in gr[node]:
        neighborhood.add(first)
        if two_hop:
            neighborhood.update(gr[first])


def _gs2_neighborhood(gr, dom_list, ip_list, level):
    """Collect the listed entries plus their 1- or 2-hop neighbors in *gr*."""
    two_hop = level == 2
    neighborhood = set()
    for raw in dom_list:
        node = raw.rstrip()
        neighborhood.add(node)  # domains are added even when absent from gr
        if node in gr:
            _gs2_add_neighbors(gr, node, two_hop, neighborhood)
    for raw in ip_list:
        node = raw.rstrip()
        if node in gr:  # IPs are only added when present in gr
            neighborhood.add(node)
            _gs2_add_neighbors(gr, node, two_hop, neighborhood)
    return neighborhood


def _gs2_tally(nodes, gr, neighborhood, ret, pos_score, neg_score, tp_file,
               fn_file, counts, scored):
    """Classify each test entry, updating *counts*, *scored* and the files."""
    for raw in nodes:
        node = raw.rstrip()
        if node not in gr:
            # Counted for domains and IPs alike so the reported total covers
            # every test entry that is missing from the graph.
            counts["not_in_graph"] += 1
            continue
        if node not in neighborhood:
            counts["not_neighbor"] += 1
            continue
        counts["is_neighbor"] += 1
        if node not in ret:
            counts["no_score"] += 1
            continue
        score = ret[node]
        if score < neg_score:
            counts["tp_num"] += 1
            scored[node] = score
            tp_file.write("%s %f\n" % (node, score))
        elif score > pos_score:
            # Counted as a false negative: a test entry scored above POS_SCORE.
            counts["fn_num"] += 1
            scored[node] = score
            fn_file.write("%s %f\n" % (node, score))


def _gs2_report(step, counts, pos_score, neg_score, pos_sc, neg_sc, g3):
    """Print the per-step summary and append the same figures to *g3*."""
    values_line = "Values: POS_SCORE: %f, NEG_SCORE: %f, POS_SC: %f, NEG_SC: %f\n" % (
        pos_score, neg_score, pos_sc, neg_sc)
    # The console and the g3 log now report the same fp figure.
    fp_count = counts["neg_less_than_thres"]
    tn_num = 0  # true negatives are never tallied; always reported as 0

    # Single-argument print(...) gives identical output as a Python 2 print
    # statement and as the Python 3 function.
    print("Finished with tp/fp analysis")
    print(values_line)
    print("Step(iteration): " + str(step))
    print("tp_num: " + str(counts["tp_num"]))
    print("fp_num: " + str(fp_count))
    print("tn_num: " + str(tn_num))
    print("fn_num: " + str(counts["fn_num"]))
    print("Present in neighbohood, but no score(should be marked as false negatives?): " + str(
        counts["no_score"]))
    print("is neighbor(of testing): " + str(counts["is_neighbor"]))
    print("not_neighbor: " + str(counts["not_neighbor"]))
    print("not_in_graph: " + str(counts["not_in_graph"]))
    print("Negative more than threshold: " + str(counts["neg_more_than_thres"]))
    print("Positive less than threshold: " + str(counts["pos_less_than_thres"]))
    print("Positive more than threshold(tn): " + str(counts["pos_more_than_thres"]))
    print("Negative less than threshold(fp): " + str(counts["neg_less_than_thres"]))

    g3.write(
        "###################### Iteration: %d ######################\n" %
        (step))
    g3.write(values_line)
    g3.write("tp_num: %d\n" % (counts["tp_num"]))
    g3.write("fp_num: %d\n" % (fp_count))
    g3.write("tn_num: %d\n" % (tn_num))
    g3.write("fn_num: %d\n" % (counts["fn_num"]))
    g3.write("is neighbor(of testing): %d\n" % (counts["is_neighbor"]))
    g3.write(
        "Present in neighborhood, but no score(should be marked as false -ves: %d ?\n"
        % (counts["no_score"]))
    g3.write("not_neighbor: %d\n" % (counts["not_neighbor"]))
    g3.write("not_in_graph: %d\n" % (counts["not_in_graph"]))
    g3.write("Negative more than threshold: %d\n" %
             (counts["neg_more_than_thres"]))
    g3.write("Positive less than threshold: %d\n" %
             (counts["pos_less_than_thres"]))
    g3.write("Negative less than threshold(fp): %d\n" %
             (counts["neg_less_than_thres"]))
    g3.write("Positive more than threshold(tn): %d\n" %
             (counts["pos_more_than_thres"]))
    g3.write("\n")
    g3.flush()