def verify_candidates(candidates, user_movies_matrix, start_time):
    """Verify LSH candidate buckets by computing the exact Jaccard similarity of each pair."""
    print("\nVerifying candidates...")
    count = 0
    print("Number of buckets in total: " + str(len(candidates)))
    for cnr, candidate_group in enumerate(candidates):
        # print("Number of candidates in bucket " + str(cnr) + ": " + str(len(candidate_group)))
        candidate_list = list(candidate_group)
        for cnr1, candidate1 in enumerate(candidate_list):
            for cnr2 in range(cnr1 + 1, len(candidate_list)):
                candidate2 = candidate_list[cnr2]
                jsim = sim.jaccard(user_movies_matrix[candidate1],
                                   user_movies_matrix[candidate2])
                if jsim >= 0.50:
                    print("Number of candidates in bucket " + str(cnr) + ": " + str(len(candidate_group)))
                    count = count + 1
                    print((candidate1, candidate2))
                    print("Similarity: " + str(jsim))
                    print("Found until now: " + str(count))
                    util.print_time(start_time)
                    print()
    # print()
    print(count)
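`sim.jaccard` is not defined in the snippet above; a minimal sketch of what it presumably computes, assuming each row of `user_movies_matrix` is an iterable of movie ids (the helper name `jaccard_similarity` is ours, not from the original code):

def jaccard_similarity(a, b):
    # Plain set-based Jaccard index: |a ∩ b| / |a ∪ b|, 0.0 when both inputs are empty.
    set_a, set_b = set(a), set(b)
    union = set_a | set_b
    if not union:
        return 0.0
    return len(set_a & set_b) / len(union)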
def store_similarity(p):
    length = len(p)
    similarities = {}  # the final dictionary to return
    for key in p.keys():
        neighbor_sims = {}  # similarity dictionary for this point
        for i in range(key + 1, length):
            neighbor_sims.setdefault(i, jaccard(p.get(key), p.get(i)))
        similarities.setdefault(key, neighbor_sims)
    return similarities
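A usage sketch, assuming `p` is keyed by the consecutive integers 0..len(p)-1 and that the undefined `jaccard()` returns the set-based index sketched earlier:

points = {0: {'a', 'b', 'c'}, 1: {'b', 'c'}, 2: {'x'}}
print(store_similarity(points))
# Expected shape (values assume a set-based jaccard): {0: {1: 0.666..., 2: 0.0}, 1: {2: 0.0}, 2: {}}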
def test_jaccard(self):
    G = self.G
    jac = jaccard(G)
    nt.assert_equal(len(jac), 7)
    for i in range(7):
        assert i in jac
    for i in self.G.jaccard.keys():
        nt.assert_equal(len(self.G.jaccard[i]), len(jac[i]))
        for j in self.G.jaccard[i].keys():
            nt.assert_almost_equal(jac[i][j], self.G.jaccard[i][j], places=4)
def verify_partial_candidates(candidate_group, user_movies_matrix, bucket_nr, nr_found, start_time):
    candidate_list = list(candidate_group)
    for cnr1, candidate1 in enumerate(candidate_list):
        for cnr2 in range(cnr1 + 1, len(candidate_list)):
            candidate2 = candidate_list[cnr2]
            jsim = sim.jaccard(user_movies_matrix[candidate1],
                               user_movies_matrix[candidate2])
            if jsim >= 0.50:
                pair = sorted((candidate1, candidate2))
                data.save_pair(pair)
                print("\tFound similar pair: " + str(pair))
                print("\tSimilarity: " + str(jsim))
                print("\tBucket number: " + str(bucket_nr))
                print("\tNumber of candidates in the bucket: " + str(len(candidate_group)))
                nr_found[0] = nr_found[0] + 1
                print("\tFound until now: " + str(nr_found[0]))
                util.print_time(start_time, "\t")
                print()
def apply2(user_movies_matrix, bands=5, rows=10):
    # The permutation/signature-matrix block below is left over from an earlier
    # min-hash approach and is never used by the banding code that follows.
    permutations = []
    for r in range(100):
        permutation = []
        for u in user_movies_matrix:
            permutation.append(u[r])
        permutations.append(permutation)
    signature_matrix = np.array(
        [np.zeros(100).astype(int) for i in range(len(user_movies_matrix))])
    for permutation in permutations:
        for i, user_movies in enumerate(user_movies_matrix):
            if permutation[i] in user_movies:
                signature_matrix[i][0]  # no-op: nothing is assigned here

    # LSH banding: hash each user's min-hash row signature into buckets, band by band.
    for b in range(1, bands + 1):
        buckets = {}
        for user_id, user_movies in enumerate(user_movies_matrix):
            row_signature = ''
            for r in range(1, rows + 1):
                # Note: seed=b * r can collide across (band, row) pairs, e.g. (2, 3) and (3, 2).
                min_hash = min(
                    [mmh3.hash(movie, seed=b * r) for movie in user_movies])
                row_signature = row_signature + str(min_hash)
            if row_signature in buckets:
                buckets[row_signature].append(user_id)
            else:
                buckets[row_signature] = [user_id]
        # Any bucket containing more than one user yields candidate pairs.
        for bucket in buckets:
            candidate_group = buckets[bucket]
            if len(candidate_group) > 1:
                for cnr1, candidate1 in enumerate(candidate_group):
                    for cnr2 in range(cnr1 + 1, len(candidate_group)):
                        candidate2 = candidate_group[cnr2]
                        print((candidate1, candidate2))
                        print(sim.jaccard(user_movies_matrix[candidate1],
                                          user_movies_matrix[candidate2]))
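The bands/rows split controls which similarity levels are likely to become candidates. A small sketch (not part of the original code) of the standard banded min-hash candidate probability, 1 - (1 - s**rows)**bands:

def candidate_probability(s, bands=5, rows=10):
    # Probability that a pair with Jaccard similarity s shares at least one bucket.
    return 1.0 - (1.0 - s ** rows) ** bands

# With the defaults bands=5, rows=10, a pair at s = 0.5 (the threshold used when
# verifying candidates above) becomes a candidate with probability of only about 0.005.
for s in (0.3, 0.5, 0.7, 0.9):
    print(s, candidate_probability(s))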
def neighbors(data, n, simtype):
    """Keep, for every user, the n most similar other users."""
    for user_key in data:
        user = data[user_key]
        user["neighbors"] = {}
        for other_key in data:
            other = data[other_key]
            if user_key == other_key:
                continue
            farthest, index = farthest_close_neighbor(user)
            sim = None
            if simtype == "pearson":
                sim = similarity.pearson(user, other)
            else:
                sim = similarity.jaccard(user, other)
            if len(user["neighbors"]) < n or sim > farthest:
                user["neighbors"][other_key] = {
                    "id": other_key,
                    "similarity": sim
                }
                if len(user["neighbors"]) > n:
                    # Evict the current weakest neighbor to keep at most n entries.
                    user["neighbors"].pop(index)
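`farthest_close_neighbor` is not shown. From how it is used, it presumably returns the lowest similarity among the user's current neighbors together with that neighbor's key; a hypothetical sketch consistent with that usage:

def farthest_close_neighbor(user):
    # Hypothetical helper: find the weakest current neighbor so it can be evicted.
    farthest, index = float("inf"), None
    for key, neighbor in user.get("neighbors", {}).items():
        if neighbor["similarity"] < farthest:
            farthest, index = neighbor["similarity"], key
    return farthest, index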
def main(args):
    print('Start test')

    creds = ReadDictJson(args.credentails)
    if not creds:
        print('Failed to load credentials file {}. Exiting'.format(args.credentails))
        return False

    s3def = creds['s3'][0]
    s3 = s3store(s3def['address'],
                 s3def['access key'],
                 s3def['secret key'],
                 tls=s3def['tls'],
                 cert_verify=s3def['cert_verify'],
                 cert_path=s3def['cert_path'])

    trainingset = '{}/{}/'.format(s3def['sets']['trainingset']['prefix'], args.trainingset)
    print('Load training set {}/{} to {}'.format(s3def['sets']['trainingset']['bucket'], trainingset, args.trainingset_dir))
    s3.Mirror(s3def['sets']['trainingset']['bucket'], trainingset, args.trainingset_dir)

    trainingsetDescriptionFile = '{}/description.json'.format(args.trainingset_dir)
    trainingsetDescription = json.load(open(trainingsetDescriptionFile))

    config = {
        'batch_size': args.batch_size,
        'trainingset': trainingsetDescription,
        'input_shape': [args.training_crop[0], args.training_crop[1], args.train_depth],
        'classScale': 0.001,  # scale value for each product class
        'augment_rotation': 5.,  # rotation in degrees
        'augment_flip_x': False,
        'augment_flip_y': True,
        'augment_brightness': 0.,
        'augment_contrast': 0.,
        'augment_shift_x': 0.0,  # in fraction of image
        'augment_shift_y': 0.0,  # in fraction of image
        'scale_min': 0.75,  # in fraction of image
        'scale_max': 1.25,  # in fraction of image
        'ignore_label': trainingsetDescription['classes']['ignore'],
        'classes': trainingsetDescription['classes']['classes'],
        'epochs': 1,
        'area_filter_min': 25,
        'weights': None,
        'channel_order': args.channel_order,
        's3_address': s3def['address'],
        's3_sets': s3def['sets'],
        'initialmodel': args.initialmodel,
        'training_dir': None,  # used by LoadModel
        'learning_rate': 1e-3,  # used by LoadModel
        'clean': True,
        'test_archive': trainingset,
        'run_archive': '{}{}/'.format(trainingset, args.initialmodel),
        'min': args.min,
    }

    strategy = None
    if args.strategy == 'mirrored':
        strategy = tf.distribute.MirroredStrategy(devices=args.devices)
    else:
        device = "/gpu:0"
        if args.devices is not None and len(args.devices) > 0:
            device = args.devices[0]
        strategy = tf.distribute.OneDeviceStrategy(device=device)

    # Prepare per-class accumulators for similarity computation
    objTypes = {}
    for objType in trainingsetDescription['classes']['objects']:
        if objType['trainId'] not in objTypes:
            objTypes[objType['trainId']] = copy.deepcopy(objType)
            # set name to category for objTypes and id to trainId
            objTypes[objType['trainId']]['name'] = objType['category']
            objTypes[objType['trainId']]['id'] = objType['trainId']

    results = {'class similarity': {}, 'config': config, 'image': []}
    for objType in objTypes:
        results['class similarity'][objType] = {'union': 0, 'intersection': 0}

    with strategy.scope():  # Apply training strategy
        model = LoadModel(config, s3)
        accuracy = tf.keras.metrics.Accuracy()

    # Display model
    model.summary()

    # train_dataset = input_fn('train', args.trainingset_dir, config)
    val_dataset = input_fn('val', args.trainingset_dir, config)

    trainingsetdesc = {}
    validationsetdec = {}
    for dataset in config['trainingset']['sets']:
        if dataset['name'] == 'val':
            validationsetdec = dataset
        if dataset['name'] == 'train':
            trainingsetdesc = dataset

    print("Begin inferences")
    dtSum = 0.0
    accuracySum = 0.0
    total_confusion = None
    iterator = iter(val_dataset)
    numsteps = int(validationsetdec['length'] / config['batch_size'])
    if config['min']:
        numsteps = min(args.min_steps, numsteps)

    try:
        for i in tqdm(range(numsteps)):
            image, annotation = iterator.get_next()
            initial = datetime.now()
            logits = model.predict(image, batch_size=config['batch_size'], steps=1)
            segmentation = tf.argmax(logits, axis=-1)
            dt = (datetime.now() - initial).total_seconds()
            dtSum += dt
            imageTime = dt / config['batch_size']
            for j in range(config['batch_size']):
                img = tf.squeeze(image[j]).numpy().astype(np.uint8)
                ann = tf.squeeze(annotation[j]).numpy().astype(np.uint8)
                seg = tf.squeeze(segmentation[j]).numpy().astype(np.uint8)

                accuracy.update_state(ann, seg)
                seg_accuracy = accuracy.result().numpy()
                accuracySum += seg_accuracy

                imagesimilarity, results['class similarity'], unique = jaccard(ann, seg, objTypes, results['class similarity'])

                confusion = tf.math.confusion_matrix(ann.flatten(), seg.flatten(), config['classes']).numpy().astype(np.int64)
                if total_confusion is None:
                    total_confusion = confusion
                else:
                    total_confusion += confusion

                results['image'].append({'dt': imageTime,
                                         'similarity': imagesimilarity,
                                         'accuracy': seg_accuracy.astype(float),
                                         'confusion': confusion.tolist()})
    except Exception as e:
        print("Error: test exception {} step {}".format(e, i))
        numsteps = i
    except:
        print("Error: test exception step {}".format(i))
        numsteps = i

    num_images = numsteps * config['batch_size']
    average_time = dtSum / num_images
    average_accuracy = accuracySum / num_images

    sumIntersection = 0
    sumUnion = 0
    sumAccuracy = 0.0
    dataset_similarity = {}
    for key in results['class similarity']:
        intersection = results['class similarity'][key]['intersection']
        sumIntersection += intersection
        union = results['class similarity'][key]['union']
        sumUnion += union
        class_similarity = similarity(intersection, union)

        # convert to int from int64 for json.dumps
        dataset_similarity[key] = {'intersection': int(intersection),
                                   'union': int(union),
                                   'similarity': class_similarity}

    results['class similarity'] = dataset_similarity
    total_similarity = similarity(sumIntersection, sumUnion)

    now = datetime.now()
    date_time = now.strftime("%m/%d/%Y, %H:%M:%S")
    test_summary = {'date': date_time, 'model': config['initialmodel']}
    test_summary['accuracy'] = average_accuracy
    test_summary['class_similarity'] = dataset_similarity
    test_summary['similarity'] = total_similarity
    test_summary['confusion'] = total_confusion.tolist()
    test_summary['images'] = num_images
    test_summary['image time'] = average_time
    test_summary['batch size'] = config['batch_size']
    test_summary['test store'] = s3def['address']
    test_summary['test bucket'] = s3def['sets']['trainingset']['bucket']
    test_summary['results'] = results
    print("Average time {}".format(average_time))
    print('Similarity: {}'.format(dataset_similarity))

    # If there were a way to lock this object between read and write, it would prevent the possibility of losing data
    training_data = s3.GetDict(s3def['sets']['trainingset']['bucket'], config['test_archive'] + args.tests_json)
    if training_data is None:
        training_data = []
    training_data.append(test_summary)
    s3.PutDict(s3def['sets']['trainingset']['bucket'], config['test_archive'] + args.tests_json, training_data)

    test_url = s3.GetUrl(s3def['sets']['trainingset']['bucket'], config['test_archive'] + args.tests_json)
    print("Test results {}".format(test_url))
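main() relies on a segmentation-specific jaccard(annotation, segmentation, objTypes, class_similarity) and a similarity(intersection, union) helper that are not shown here. A minimal sketch consistent with how they are called; the implementation details are assumptions:

import numpy as np

def similarity(intersection, union):
    # Intersection over union; defined as 0 when the union is empty.
    return float(intersection) / float(union) if union > 0 else 0.0

def jaccard(annotation, segmentation, objTypes, class_similarity):
    # Accumulate per-class intersection/union counts and return the image-level IoU.
    unique = np.union1d(annotation, segmentation)
    total_intersection = 0
    total_union = 0
    for class_id in objTypes:
        ann_mask = annotation == class_id
        seg_mask = segmentation == class_id
        intersection = int(np.logical_and(ann_mask, seg_mask).sum())
        union = int(np.logical_or(ann_mask, seg_mask).sum())
        class_similarity[class_id]['intersection'] += intersection
        class_similarity[class_id]['union'] += union
        total_intersection += intersection
        total_union += union
    return similarity(total_intersection, total_union), class_similarity, unique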
# Calculate the Jaccard coefficient for all links.
jaccard_vals = []
cosine_vals = []
checktable = {}
for i, label in enumerate(lbl):
    u = i + 1
    for v in label:
        if len(lbl[v - 1]) == 0:
            continue
        check_key = tuple(sorted([u, v]))
        if check_key not in checktable:
            sim_uv = similarity.jaccard(lbl[u - 1], lbl[v - 1])
            jaccard_vals.append(sim_uv)
            sim_uv_cos = (1.0 / math.sqrt(len(lbl[u - 1]))) * (1.0 / math.sqrt(len(lbl[v - 1]))) * float(len(set(lbl[u - 1]) & set(lbl[v - 1])))
            cosine_vals.append(sim_uv_cos)
            # sim_uv = float(len(set(lbl[u-1]) & set(lbl[v-1]))) / float(len(set(lbl[u-1]) | set(lbl[v-1])))
            """
            orlist = sorted(list(set(lbl[u-1]) | set(lbl[v-1])))
            feat_u = [0.0] * len(orlist)
            """
            checktable[check_key] = 1
import pandas
import pickle

### VERTEX SIMILARITY

# ascos, cosine, jaccard, katz, lhn, rss2, dice and inverse_log_weighted are
# assumed to be imported from a vertex-similarity module elsewhere in the project.
graphs = pickle.load(open('data/graphs_networkx.pkl', 'rb'))

results = dict()
for graph_type, G in graphs.iteritems():
    print "Calculating for %s" % (graph_type)
    results[graph_type] = dict()
    print "ASCOS ---------------------------"
    results[graph_type]["ascos"] = ascos(G)
    print "COSINE ---------------------------"
    results[graph_type]["cosine"] = cosine(G)
    print "JACCARD --------------------------"
    results[graph_type]["jaccard"] = jaccard(G)
    print "KATZ -----------------------------"
    results[graph_type]["katz"] = katz(G)
    print "LHN ------------------------------"
    results[graph_type]["lhn"] = lhn(G)
    print "RSS2 -----------------------------"
    results[graph_type]["rss2"] = rss2(G)
    print "DICE -----------------------------"
    results[graph_type]["dice"] = dice(G)
    print "INVERSE LOG WEIGHTED --------------"
    results[graph_type]["inverse_log_weighted"] = inverse_log_weighted(G)

pickle.dump(results, open("data/sim_metrics.pkl", "wb"))

### IMAGE SIMILARITY
def show_similarity(p, q):
    a = jaccard(p, q)
    print(p, '\n', q)
    print('Similarity between the two:', a)
def handlenonexe(documentsamples, thresholddict, graph):
    """
    Form edges between non-exe samples based on the `strings` attribute

    Args:
        documentsamples: absolute paths of all non-exe malware samples
        thresholddict  : dict containing Jaccard Index threshold values
        graph          : networkx graph object

    Raises:
        None

    Returns:
        graph: populated networkx graph object
    """
    # Get shelve database object
    db = getdatabase()

    malwareattributes = dict()

    # Get attributes and create nodes
    for path in documentsamples:
        # Get `strings` for the `path` sample from the shelve db and store them
        malwareattributes[path] = db[path]
        # Add each malware sample to the graph as a node. The label of the
        # node is equal to the first six chars of the SHA256 hash of the sample
        graph.add_node(path.split('/')[-1], label=os.path.split(path)[-1][:6])

    # Create edges based on the Jaccard index
    for malware1, malware2 in itertools.combinations(documentsamples, 2):
        # Compute the Jaccard index for the current pair
        jaccardindex = jaccard(malwareattributes[malware1],
                               malwareattributes[malware2])

        # Determine file types to use the appropriate Jaccard index threshold
        malware1type = magic.from_file(malware1, mime=True)
        malware2type = magic.from_file(malware2, mime=True)
        if comparefiletypes(malware1type, malware2type):
            try:
                jaccardthreshold = thresholddict[malware1type]
            except KeyError:
                print("[*] Jaccard Index for filetype not available. Skipping...")
                continue
        else:
            # print("Different file signatures detected: " + str(malware1type) +
            #       "," + str(malware2type))
            # Two malware samples of different file types cannot possibly be related
            continue

        # If the Jaccard index is above `jaccardthreshold`, then add an edge
        if jaccardindex > jaccardthreshold:
            node1 = malware1.split('/')[-1]
            node2 = malware2.split('/')[-1]
            graph.add_edge(node1,
                           node2,
                           penwidth=1 + (jaccardindex - jaccardthreshold) * 10)

    return graph
def handleexe(exesamples, thresholddict, ngram, graph):
    """
    Form edges between exe samples based on the dynamic API calls attribute

    Args:
        exesamples   : absolute paths of all exe malware samples
        thresholddict: dict containing Jaccard Index threshold values
        ngram        : ngram number of API sequences
        graph        : networkx graph object

    Raises:
        None

    Returns:
        graph: populated networkx graph object
    """
    malwareattributes = dict()

    # Read task and error ids of the previous cuckoo dispatch
    taskids, errorids = readids()

    reportsurl = "http://localhost:8090/tasks/report/"
    headers = {"Authorization": "Bearer WTAIn0HHtRIUlR9_uJkJDg"}
    for taskid in taskids:
        taskreporturl = reportsurl + str(taskid)

        # Get the report for `taskid`
        r = requests.get(taskreporturl, headers=headers)

        # Convert 'str' Cuckoo output to a Python 'dict'
        taskreportjson = json.loads(r.content.decode('utf-8'))

        # Get info about all processes related to the task from the cuckoo task report
        taskprocessesinfo = taskreportjson['behavior']['processes']

        # Extract `process_path`, `process_name`, `first_seen` and `pid` from
        # the task report
        taskprocessdict = extractstaticfeatures(taskprocessesinfo)

        # Add dynamic API call info to `taskprocessdict`
        taskprocessdict = extractapicalls(taskprocessdict, taskprocessesinfo)

        # Find the order of processes in the task based on their `first_seen`
        orderedts = []
        for pid in taskprocessdict:
            orderedts.append(taskprocessdict[pid]['first_seen'])
        orderedts.sort()

        # Extract API calls in order, considering all processes related to the task
        apiattributes = extractorderedapi(orderedts, taskprocessdict)

        # Calculate n-gram APIs
        apiattributes = calculatengram(ngram, apiattributes)

        # Get the parent process (original malware exe process)
        path = getparentprocess(taskid, exesamples)

        # Store the dynamic API calls attribute
        malwareattributes[path] = apiattributes

    # Add the exe samples' nodes to the graph
    for path in exesamples:
        graph.add_node(path.split('/')[-1], label=os.path.split(path)[-1][:6])

    # Create edges based on the Jaccard index
    for malware1, malware2 in itertools.combinations(exesamples, 2):
        # Compute the Jaccard index for the current pair
        jaccardindex = jaccard(malwareattributes[malware1],
                               malwareattributes[malware2], 'api')

        node1 = malware1.split('/')[-1]
        node2 = malware2.split('/')[-1]

        # Determine file types to use the appropriate Jaccard index threshold
        malware1type = magic.from_file(malware1, mime=True)
        malware2type = magic.from_file(malware2, mime=True)
        if comparefiletypes(malware1type, malware2type):
            try:
                jaccardthreshold = thresholddict[malware1type]
            except KeyError:
                print("[*] Jaccard Index for filetype: " + str(malware1type) +
                      " not available. Skipping...")
                continue
        else:
            # print("Different file signatures detected: " + str(malware1type) +
            #       "," + str(malware2type))
            # Two malware samples of different file types cannot possibly be related
            continue

        # If the Jaccard index is above `jaccardthreshold`, add an edge
        if jaccardindex > jaccardthreshold:
            graph.add_edge(node1,
                           node2,
                           penwidth=1 + (jaccardindex - jaccardthreshold) * 10)

    return graph
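Both handlenonexe and handleexe gate edge creation on comparefiletypes, which is not shown; a hypothetical sketch consistent with how it is called:

def comparefiletypes(filetype1, filetype2):
    # Hypothetical helper: report whether two MIME type strings refer to the same file type.
    return filetype1 == filetype2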