Example #1
def verify_candidates(candidates, user_movies_matrix, start_time):
    print("\nVerifying candidates...")
    count = 0

    print("Number of buckets in total: " + str(len(candidates)))
    for cnr, candidate_group in enumerate(candidates):
        # print("Number of candidates in bucket " + str(cnr) + ": " + str(len(candidate_group)))
        candidate_list = list(candidate_group)
        for cnr1, candidate1 in enumerate(candidate_list):
            for cnr2 in range(cnr1 + 1, len(candidate_list)):
                candidate2 = candidate_list[cnr2]
                jsim = sim.jaccard(user_movies_matrix[candidate1],
                                   user_movies_matrix[candidate2])
                if jsim >= 0.50:
                    print("Number of candidates in bucket " + str(cnr) + ": " +
                          str(len(candidate_group)))
                    count = count + 1
                    print((candidate1, candidate2))
                    print("Similarity: " + str(
                        sim.jaccard(user_movies_matrix[candidate1],
                                    user_movies_matrix[candidate2])))
                    print("Found until now: " + str(count))
                    util.print_time(start_time)
                    print()
        # print()

    print(count)
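Examples 1 and 5 call a `sim.jaccard` helper that is not shown. A minimal sketch of such a helper, assuming each entry of `user_movies_matrix` is an iterable of movie ids, could look like this:

def jaccard(movies_a, movies_b):
    """Jaccard similarity |A ∩ B| / |A ∪ B| of two collections of movie ids."""
    set_a, set_b = set(movies_a), set(movies_b)
    union = set_a | set_b
    if not union:
        return 0.0
    return len(set_a & set_b) / len(union)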
Example #2
def store_similarity(p):
    length = len(p)
    similarities = {}  # final dictionary to return
    for key in p.keys():
        neighbor_sims = {}  # similarity dictionary for this point
        for i in range(key + 1, length):
            neighbor_sims.setdefault(i, jaccard(p.get(key), p.get(i)))
        similarities.setdefault(key, neighbor_sims)
    return similarities
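A usage illustration (assuming `jaccard` is the usual set similarity from the sketch above and that the keys of `p` are 0..len(p)-1):

p = {0: {'a', 'b', 'c'}, 1: {'b', 'c'}, 2: {'c', 'd'}}
sims = store_similarity(p)
# sims == {0: {1: 0.6667, 2: 0.25}, 1: {2: 0.3333}, 2: {}}   (values rounded)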
Example #3
 def test_jaccard(self):
   G = self.G
   jac = jaccard(G)
   nt.assert_equal(len(jac), 7)
   for i in range(7):
     assert(i in jac)
   for i in self.G.jaccard.keys():
     nt.assert_equal(len(self.G.jaccard[i]), len(jac[i]))
     for j in self.G.jaccard[i].keys():
       nt.assert_almost_equal(jac[i][j], self.G.jaccard[i][j], places=4)
Example #4
 def test_jaccard(self):
     G = self.G
     jac = jaccard(G)
     nt.assert_equal(len(jac), 7)
     for i in range(7):
         assert (i in jac)
     for i in self.G.jaccard.keys():
         nt.assert_equal(len(self.G.jaccard[i]), len(jac[i]))
         for j in self.G.jaccard[i].keys():
             nt.assert_almost_equal(jac[i][j],
                                    self.G.jaccard[i][j],
                                    places=4)
Example #5
def verify_partial_candidates(candidate_group, user_movies_matrix, bucket_nr,
                              nr_found, start_time):
    candidate_list = list(candidate_group)
    for cnr1, candidate1 in enumerate(candidate_list):
        for cnr2 in range(cnr1 + 1, len(candidate_list)):
            candidate2 = candidate_list[cnr2]
            jsim = sim.jaccard(user_movies_matrix[candidate1],
                               user_movies_matrix[candidate2])
            if jsim >= 0.50:
                pair = sorted((candidate1, candidate2))
                data.save_pair(pair)
                print("\tFound similar pair: " + str(pair))
                print("\tSimilarity: " + str(
                    sim.jaccard(user_movies_matrix[candidate1],
                                user_movies_matrix[candidate2])))
                print("\tBucket number: " + str(bucket_nr))
                print("\tNumber of candidates in the bucket: " +
                      str(len(candidate_group)))
                nr_found[0] = nr_found[0] + 1
                print("\tFound until now: " + str(nr_found[0]))
                util.print_time(start_time, "\t")
                print()
Example #6
def apply2(user_movies_matrix, bands=5, rows=10):
    permutations = []
    for r in range(100):
        permutation = []
        for u in user_movies_matrix:
            permutation.append(u[r])
        permutations.append(permutation)

    # NOTE: this signature-matrix block is left unfinished in the original
    # snippet (the innermost statement has no effect) and the matrix is never
    # used by the banding step below, which re-hashes with mmh3 instead.
    signature_matrix = np.array(
        [np.zeros(100).astype(int) for i in range(len(user_movies_matrix))])
    for permutation in permutations:
        for i, user_movies in enumerate(user_movies_matrix):
            if permutation[i] in user_movies:
                signature_matrix[i][0]  # no-op placeholder in the original code

    for b in range(1, bands + 1):
        buckets = {}
        for user_id, user_movies in enumerate(user_movies_matrix):
            row_signature = ''
            for r in range(1, rows + 1):
                min_hash = min(
                    [mmh3.hash(movie, seed=b * r) for movie in user_movies])
                row_signature = row_signature + str(min_hash)

            if row_signature in buckets:
                buckets[row_signature].append(user_id)
            else:
                buckets[row_signature] = [user_id]

        for bucket in buckets:
            candidate_group = buckets[bucket]
            if len(candidate_group) > 1:
                for cnr1, candidate1 in enumerate(candidate_group):
                    for cnr2 in range(cnr1 + 1, len(candidate_group)):
                        candidate2 = candidate_group[cnr2]
                        print((candidate1, candidate2))
                        print(
                            sim.jaccard(user_movies_matrix[candidate1],
                                        user_movies_matrix[candidate2]))
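The banding step in Example 6 follows the standard LSH analysis: a pair with Jaccard similarity s lands in the same bucket in at least one band with probability 1 - (1 - s**rows)**bands. A quick sketch (not part of the original code) to see what the default bands=5, rows=10 selects for:

def candidate_probability(s, bands=5, rows=10):
    """Probability that a pair with Jaccard similarity s shares a bucket in at
    least one band, under the usual LSH banding estimate."""
    return 1 - (1 - s ** rows) ** bands

for s in (0.3, 0.5, 0.7, 0.85, 0.95):
    print(s, round(candidate_probability(s), 4))

With these defaults the curve only rises steeply above roughly s ≈ 0.85, so pairs near the 0.5 threshold used by the verification snippets are unlikely to become candidates.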
Example #7
def neighbors(data, n, simtype):
    for user_key in data:
        user = data[user_key]
        user["neighbors"] = {}

        for other_key in data:
            other = data[other_key]
            if user_key == other_key:
                continue
            farthest, index = farthest_close_neighbor(user)
            sim = None
            if simtype == "pearson":
                sim = similarity.pearson(user, other)
            else:
                sim = similarity.jaccard(user, other)
            if len(user["neighbors"]) < n or sim > farthest:
                user["neighbors"][other_key] = {
                    "id": other_key,
                    "similarity": sim
                }
            if len(user["neighbors"]) > n:
                user["neighbors"].pop(index)
Example #8
def main(args):
    print('Start test')

    creds = ReadDictJson(args.credentails)
    if not creds:
        print('Failed to load credentials file {}. Exiting'.format(args.credentails))
        return False

    s3def = creds['s3'][0]
    s3 = s3store(s3def['address'], 
                 s3def['access key'], 
                 s3def['secret key'], 
                 tls=s3def['tls'], 
                 cert_verify=s3def['cert_verify'], 
                 cert_path=s3def['cert_path']
                 )

    trainingset = '{}/{}/'.format(s3def['sets']['trainingset']['prefix'] , args.trainingset)
    print('Load training set {}/{} to {}'.format(s3def['sets']['trainingset']['bucket'],trainingset,args.trainingset_dir ))
    s3.Mirror(s3def['sets']['trainingset']['bucket'], trainingset, args.trainingset_dir)

    trainingsetDescriptionFile = '{}/description.json'.format(args.trainingset_dir)
    trainingsetDescription = json.load(open(trainingsetDescriptionFile))
    
    config = {
        'batch_size': args.batch_size,
        'trainingset': trainingsetDescription,
        'input_shape': [args.training_crop[0], args.training_crop[1], args.train_depth],
        'classScale': 0.001, # scale value for each product class
        'augment_rotation' : 5., # Rotation in degrees
        'augment_flip_x': False,
        'augment_flip_y': True,
        'augment_brightness':0.,
        'augment_contrast': 0.,
        'augment_shift_x': 0.0, # in fraction of image
        'augment_shift_y': 0.0, # in fraction of image
        'scale_min': 0.75, # in fraction of image
        'scale_max': 1.25, # in fraction of image
        'ignore_label': trainingsetDescription['classes']['ignore'],
        'classes': trainingsetDescription['classes']['classes'],
        'epochs': 1,
        'area_filter_min': 25,
        'weights': None,
        'channel_order': args.channel_order,
        's3_address':s3def['address'],
        's3_sets':s3def['sets'],
        'initialmodel':args.initialmodel,
        'training_dir': None, # used by LoadModel
        'learning_rate': 1e-3, # used by LoadModel
        'clean' : True,
        'test_archive': trainingset,
        'run_archive': '{}{}/'.format(trainingset, args.initialmodel),
        'min':args.min,
    }

    strategy = None
    if(args.strategy == 'mirrored'):
        strategy = tf.distribute.MirroredStrategy(devices=args.devices)

    else:
        device = "/gpu:0"
        if args.devices is not None and len(args.devices) > 0:
            device = args.devices[0]

        strategy = tf.distribute.OneDeviceStrategy(device=device)

    # Prepare datasets for similarity computation
    objTypes = {}
    for objType in trainingsetDescription['classes']['objects']:
        if objType['trainId'] not in objTypes:
            objTypes[objType['trainId']] = copy.deepcopy(objType)
            # set name to category for objTypes and id to trainId
            objTypes[objType['trainId']]['name'] = objType['category']
            objTypes[objType['trainId']]['id'] = objType['trainId']

    results = {'class similarity':{}, 'config':config, 'image':[]}

    for objType in objTypes:
        results['class similarity'][objType] = {'union':0, 'intersection':0} 

    with strategy.scope(): # Apply training strategy 
        model =  LoadModel(config, s3)
        accuracy = tf.keras.metrics.Accuracy()

        # Display model
        model.summary()

        #train_dataset = input_fn('train', args.trainingset_dir, config)
        val_dataset = input_fn('val', args.trainingset_dir, config)

        trainingsetdesc = {}
        validationsetdec = {}
        for dataset in config['trainingset']['sets']:
            if dataset['name'] == 'val':
                validationsetdec = dataset
            if dataset['name'] == 'train':
                trainingsetdesc = dataset

        print("Begin inferences")
        dtSum = 0.0
        accuracySum = 0.0
        total_confusion = None
        iterator = iter(val_dataset)
        numsteps = int(validationsetdec['length']/config['batch_size'])

        if(config['min']):
            numsteps=min(args.min_steps, numsteps)

        try:
            for i in tqdm(range(numsteps)):
                image, annotation  = iterator.get_next()
                initial = datetime.now()
                logits = model.predict(image, batch_size=config['batch_size'], steps=1)
                segmentation = tf.argmax(logits, axis=-1)
                dt = (datetime.now()-initial).total_seconds()
                dtSum += dt
                imageTime = dt/config['batch_size']
                for j in range(config['batch_size']):
                    img = tf.squeeze(image[j]).numpy().astype(np.uint8)
                    ann = tf.squeeze(annotation[j]).numpy().astype(np.uint8)
                    seg = tf.squeeze(segmentation[j]).numpy().astype(np.uint8)

                    accuracy.update_state(ann,seg)
                    seg_accuracy = accuracy.result().numpy()

                    accuracySum += seg_accuracy
                    imagesimilarity, results['class similarity'], unique = jaccard(ann, seg, objTypes, results['class similarity'])

                    confusion = tf.math.confusion_matrix(ann.flatten(),seg.flatten(), config['classes']).numpy().astype(np.int64)
                    if total_confusion is None:
                        total_confusion = confusion
                    else:
                        total_confusion += confusion
                        

                    results['image'].append({'dt':imageTime,'similarity':imagesimilarity, 'accuracy':seg_accuracy.astype(float), 'confusion':confusion.tolist()})
        except Exception as e:
            print("Error: test exception {} step {}".format(e, i))
            numsteps = i
        except:
            print("Error: test exception step {}".format(i))
            numsteps = i

    num_images = numsteps*config['batch_size']
    average_time = dtSum/num_images
    average_accuracy = accuracySum/num_images
    sumIntersection = 0
    sumUnion = 0
    sumAccuracy = 0.0
    dataset_similarity = {}
    for key in results['class similarity']:
        intersection = results['class similarity'][key]['intersection']
        sumIntersection += intersection
        union = results['class similarity'][key]['union']
        sumUnion += union
        class_similarity = similarity(intersection, union)

        # convert to int from int64 for json.dumps
        dataset_similarity[key] = {'intersection':int(intersection) ,'union':int(union) , 'similarity':class_similarity}

    results['class similarity'] = dataset_similarity
    total_similarity = similarity(sumIntersection, sumUnion)

    now = datetime.now()
    date_time = now.strftime("%m/%d/%Y, %H:%M:%S")
    test_summary = {'date':date_time, 'model':config['initialmodel']}
    test_summary['accuracy']=average_accuracy
    test_summary['class_similarity']=dataset_similarity
    test_summary['similarity']=total_similarity
    test_summary['confusion']=total_confusion.tolist()
    test_summary['images']=num_images
    test_summary['image time']=average_time
    test_summary['batch size']=config['batch_size']
    test_summary['test store'] =s3def['address']
    test_summary['test bucket'] = s3def['sets']['trainingset']['bucket']
    test_summary['results'] = results
    
    print ("Average time {}".format(average_time))
    print ('Similarity: {}'.format(dataset_similarity))

    # If there were a way to lock this object between read and write, it would prevent the possibility of losing data
    training_data = s3.GetDict(s3def['sets']['trainingset']['bucket'], config['test_archive']+args.tests_json)
    if training_data is None:
        training_data = []
    training_data.append(test_summary)
    s3.PutDict(s3def['sets']['trainingset']['bucket'], config['test_archive']+args.tests_json, training_data)

    test_url = s3.GetUrl(s3def['sets']['trainingset']['bucket'], config['test_archive']+args.tests_json)

    print("Test results {}".format(test_url))
Example #9
    # Calculate Jaccard-coefficient for all links.
    jaccard_vals = []
    cosine_vals = []

    checktable = {}
    for i, label in enumerate(lbl):

        u = i+1
        for v in label:
            if len(lbl[v-1]) == 0: continue

            check_key = tuple(sorted([u, v]))
            if check_key not in checktable:

                sim_uv = similarity.jaccard(lbl[u-1], lbl[v-1])
                jaccard_vals.append(sim_uv)

                sim_uv_cos = (1.0 / math.sqrt(len(lbl[u-1]))) * (1.0 / math.sqrt(len(lbl[v-1]))) * float(len(set(lbl[u-1]) & set(lbl[v-1])))
                cosine_vals.append(sim_uv_cos)
                checktable[check_key] = 1
Example #10
import pickle

import pandas

### VERTEX SIMILARITY

# ascos, cosine, jaccard, katz, lhn, rss2, dice and inverse_log_weighted are
# assumed to be provided by the project's vertex-similarity module.
graphs = pickle.load(open('data/graphs_networkx.pkl', 'rb'))
results = dict()

for graph_type, G in graphs.items():
    print("Calculating for %s" % graph_type)
    results[graph_type] = dict()
    print("ASCOS ---------------------------")
    results[graph_type]["ascos"] = ascos(G)
    print("COSINE ---------------------------")
    results[graph_type]["cosine"] = cosine(G)
    print("JACCARD --------------------------")
    results[graph_type]["jaccard"] = jaccard(G)
    print("KATZ -----------------------------")
    results[graph_type]["katz"] = katz(G)
    print("LHN ------------------------------")
    results[graph_type]["lhn"] = lhn(G)
    print("RSS2 -----------------------------")
    results[graph_type]["rss2"] = rss2(G)
    print("DICE -----------------------------")
    results[graph_type]["dice"] = dice(G)
    print("INVERSE LOG WEIGHTED --------------")
    results[graph_type]["inverse_log_weighted"] = inverse_log_weighted(G)

pickle.dump(results, open("data/sim_metrics.pkl", "wb"))

### IMAGE SIMILARITY
def show_similarity(p, q):
    a = jaccard(p, q)
    print(p, '\n', q)
    print('Similarity between the two:', a)
Example #12
def handlenonexe(documentsamples, thresholddict, graph):
    """
    Form edges between non-exe samples based on the `strings` attribute

    Args:
        documentsamples: absolute paths of all non-exe malware samples
        thresholddict  : dict containing Jaccard Index threshold values
        graph          : networkx graph object

    Raises:
        None

    Returns:
        graph: populated networkx graph object
    """

    # Get shelve database object
    db = getdatabase()

    malwareattributes = dict()

    # Get attributes and create node
    for path in documentsamples:
        # Get `strings` for `path` sample from shelve db and store them
        malwareattributes[path] = db[path]

        # Add each malware sample to the graph as a node. The label of the
        # node is equal to the first six chars of SHA256 hash of the sample
        graph.add_node(path.split('/')[-1], label=os.path.split(path)[-1][:6])

    # Create edge based on Jaccard index
    for malware1, malware2 in itertools.combinations(documentsamples, 2):
        # Compute the Jaccard index for the current pair
        jaccardindex = jaccard(malwareattributes[malware1],
                               malwareattributes[malware2])

        # Determine file types to use appropriate jaccard index value
        malware1type = magic.from_file(malware1, mime=True)
        malware2type = magic.from_file(malware2, mime=True)

        if comparefiletypes(malware1type, malware2type):
            try:
                jaccardthreshold = thresholddict[malware1type]
            except KeyError as e:
                print(
                    "[*] Jaccard Index for filetype not available. Skipping..."
                )
                continue
        else:
            #print("Different file signatures detected: " + str(malware1type) + \
            #      "," + str(malware2type))
            # Two malware of different file types cannot possibly be related
            continue

        # If the Jaccard index is above `jaccardthreshold`, add an edge
        if jaccardindex > jaccardthreshold:
            node1 = malware1.split('/')[-1]
            node2 = malware2.split('/')[-1]
            graph.add_edge(node1,
                           node2,
                           penwidth=1 + (jaccardindex - jaccardthreshold) * 10)

    return graph
Example #13
def handleexe(exesamples, thresholddict, ngram, graph):
    """
    Form edges between exe samples based on the dynamic api calls attribute

    Args:
        exesamples   : absolute paths of all exe malware samples
        thresholddict: dict containing Jaccard Index threshold values
        ngram        : ngram number of api sequences
        graph        : networkx graph object

    Raises:
        None

    Returns:
        graph: populated networkx graph object
    """

    malwareattributes = dict()

    # Read task and error ids of previous cuckoo dispatch
    taskids, errorids = readids()

    reportsurl = "http://localhost:8090/tasks/report/"
    headers = {"Authorization": "Bearer WTAIn0HHtRIUlR9_uJkJDg"}

    for taskid in taskids:
        taskreporturl = reportsurl + str(taskid)

        # Get report for `taskid`
        r = requests.get(taskreporturl, headers=headers)

        # Convert 'str' Cuckoo output to Python 'dict'
        taskreportjson = json.loads(r.content.decode('utf-8'))

        # Get info about all processes related to task from cuckoo task report
        taskprocessesinfo = taskreportjson['behavior']['processes']

        # Extract `process_path`, `process_name`, `first_seen` and `pid` from
        # task report
        taskprocessdict = extractstaticfeatures(taskprocessesinfo)

        # Add dynamic API calls info to `taskprocessdict`
        taskprocessdict = extractapicalls(taskprocessdict, taskprocessesinfo)

        # Find order of processes in task based on their `first_seen`
        orderedts = []
        for pid in taskprocessdict:
            orderedts.append(taskprocessdict[pid]['first_seen'])
        orderedts.sort()

        # Extract api calls in order considering all processes related to task
        apiattributes = extractorderedapi(orderedts, taskprocessdict)

        # Calculate n-gram APIs
        apiattributes = calculatengram(ngram, apiattributes)

        # Get parent process (original malware exe process)
        path = getparentprocess(taskid, exesamples)

        # Store dynamic API calls attribute
        malwareattributes[path] = apiattributes

    # Add exe samples' node to graph
    for path in exesamples:
        graph.add_node(path.split('/')[-1], label=os.path.split(path)[-1][:6])

    # Create edge based on Jaccard index
    for malware1, malware2 in itertools.combinations(exesamples, 2):
        # Compute the Jaccard index for the current pair
        jaccardindex = jaccard(malwareattributes[malware1],
                               malwareattributes[malware2], 'api')
        node1 = malware1.split('/')[-1]
        node2 = malware2.split('/')[-1]

        # Determine file types to use appropriate jaccard index value
        malware1type = magic.from_file(malware1, mime=True)
        malware2type = magic.from_file(malware2, mime=True)

        if comparefiletypes(malware1type, malware2type):
            try:
                jaccardthreshold = thresholddict[malware1type]
            except KeyError as e:
                print("[*] Jaccard Index for filetype: " + str(malware1type) +
                      " not available. Skipping...")
                continue
        else:
            #print("Different file signatures detected: " + str(malware1type) + \
            #      "," + str(malware2type))
            # Two malware of different file types cannot possibly be related
            continue

        # If the jaccard index is above `jaccardthreshold`, add an edge
        if jaccardindex > jaccardthreshold:
            graph.add_edge(node1,
                           node2,
                           penwidth=1 + (jaccardindex - jaccardthreshold) * 10)

    return graph
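Examples 12 and 13 pass different attribute types to `jaccard` (string sets for document samples, and n-gram API call sequences plus an extra 'api' argument for executables). A minimal sketch of a helper compatible with both call sites; this is an assumption about the project's own function, not its actual implementation:

def jaccard(attributes1, attributes2, attrtype='strings'):
    """Jaccard index of two attribute collections. `attrtype` only records
    which attribute family ('strings' or 'api') is being compared."""
    set1, set2 = set(attributes1), set(attributes2)
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)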