def main():
    """CLI demo: match WinApi1024 vectors with apiscout.

    Modes:
      * vector_a + -v/--vector_b   -> print the pairwise match score.
      * vector_a + -c/--collection -> match against a vector collection and
        render the top -n/--max_results family results.
      * anything else              -> print usage help.
    """
    parser = argparse.ArgumentParser(
        description='Demo: Use apiscout to match WinApi1024 vectors.')
    # nargs='?' makes the positional optional so the declared default ''
    # actually takes effect (argparse ignores defaults on required
    # positionals) and a bare invocation falls through to print_help().
    parser.add_argument('vector_a', type=str, nargs='?', default='',
                        help='compressed version of first vector.')
    parser.add_argument('-v', '--vector_b', type=str, default='',
                        help='compressed version of second vector.')
    parser.add_argument('-c', '--collection', type=str, default='',
                        help='Path to a collection of compressed vectors.')
    parser.add_argument('-n', '--max_results', type=int, default=5,
                        help='Maximum number of family results to show.')
    args = parser.parse_args()

    scout = ApiScout()
    # load WinApi1024 vector
    scout.loadWinApi1024(get_winapi1024_path())

    if args.vector_a and args.vector_b:
        score = scout.matchVectors(args.vector_a, args.vector_b)
        print("Result of matching vectors:")
        print("Vector A: {}".format(args.vector_a))
        print("Vector B: {}".format(args.vector_b))
        print("Score: {}".format(score))
    elif args.vector_a and args.collection:
        collection_result = scout.matchVectorCollection(
            args.vector_a, args.collection)
        print(scout.renderVectorCollectionResults(collection_result,
                                                  args.max_results))
    else:
        parser.print_help()
def test_base(x=None, y=None):
    """Match two compressed WinApi1024 vectors and return the score.

    :param x: first compressed vector; defaults to ``vector_list[0]``
              (module-level list, resolved at call time).
    :param y: second compressed vector; defaults to ``vector_list[1]``.
    :return: similarity score from ``ApiScout.matchVectors``.

    The original signature evaluated ``vector_list[0]`` / ``vector_list[1]``
    at definition time, which fails at import when ``vector_list`` is not
    yet populated and ignores later updates to it; ``None`` sentinels defer
    the lookup until the function is actually called.
    """
    from apiscout.ApiScout import ApiScout
    if x is None:
        x = vector_list[0]
    if y is None:
        y = vector_list[1]
    _apiscout = ApiScout()
    _apiscout.setBaseAddress(0)
    _apiscout.loadWinApi1024('data/winapi1024v1.txt')
    return _apiscout.matchVectors(x, y)
class Api():
    """Stores and analyze malwares info in neo4j.

    Computes per-sample digests (md5/sha1/sha256), impfuzzy hashes and
    ApiScout WinApi1024 import vectors, correlates samples by vector
    similarity, clusters the resulting graph with PyLouvain and persists
    nodes/relationships into Neo4j via py2neo.
    """

    def __init__(self, host, port, user, password, threshold=40,
                 secure=False, filepath=None, filename=None,
                 folder_path=None):
        """Connects to neo4j database, loads options and set connectors.

        host/port/user/password/secure: Neo4j connection settings.
        threshold: minimum similarity score (0-100 scale) for an edge.
        filepath/filename: single-sample (reporting) mode inputs.
        folder_path: batch mode; the folder is scanned for PE files.
        @raise CuckooReportError: if unable to connect.
        """
        self.threshold = int(threshold)
        self.graph = Graph(host=host, user=user, password=password,
                           secure=secure, port=port)
        self.filepath = filepath
        self.filename = filename
        self.folder_path = folder_path
        # ApiScout instance shared by get_digest()/scout_comp(); the
        # WinApi1024 reference vector ships next to this module.
        self.scout = ApiScout()
        self.scout.setBaseAddress(0)
        self.scout.loadWinApi1024(
            os.path.abspath(os.path.join(os.path.dirname(__file__))) +
            os.sep + "data" + os.sep + "winapi1024v1.txt")
        self.magictest = magic.Magic(uncompress=True)
        # PEiD-style packer signatures used by check_file() to skip
        # packed samples.
        CWD = os.path.abspath(os.path.dirname(__file__))
        USERDB = os.path.join(CWD, os.path.normpath("data/UserDB.TXT"))
        with open(USERDB, 'rt') as f:
            sig_data = f.read()
        self.signatures = peutils.SignatureDatabase(data=sig_data)
        if self.folder_path:
            self.files = self.get_files(folder_path)

    def check_file(self, f):
        # Accept only plain PE32 files: not self-extracting archives and
        # not matching any packer signature at the entry point.
        if magic.from_file(f).find('PE32') == -1:
            return False
        if magic.from_file(f).find('self-extracting') != -1:
            return False
        try:
            pe = pefile.PE(f)
            matches = self.signatures.match_all(pe, ep_only=True)
            if matches:
                return False
            return True
        except:
            # NOTE(review): bare except treats any parse failure as
            # "not a usable PE" — deliberate best-effort filter, but it
            # also hides unexpected errors; consider pefile.PEFormatError.
            return False

    def get_files(self, folder_path):
        # Walk folder_path, keep files passing check_file(), and pair each
        # with its family name read from a sibling/parent <name>.json
        # (key 'common_name', spaces replaced by underscores).
        files_end = []
        files = []
        for root, dirnames, filenames in os.walk(folder_path):
            for filename in fnmatch.filter(filenames, '*'):
                files.append(os.path.join(root, filename))
        for filepath in files:
            if not self.check_file(filepath):
                continue
            # First try <grandparent>/<grandparent-name>.json, then fall
            # back to <parent>/<parent-name>.json; skip if neither exists.
            # NOTE(review): path splitting assumes '/' separators (POSIX).
            json_path = "/".join(filepath.split(
                "/")[:-2]) + "/" + filepath.split("/")[-3] + ".json"
            if not os.path.exists(json_path):
                json_path = "/".join(filepath.split(
                    "/")[:-1]) + "/" + filepath.split("/")[-2] + ".json"
            if not os.path.exists(json_path):
                continue
            with open(json_path, 'r') as f:
                file_family = json.loads("".join([
                    str(x) for x in f.readlines()
                ])).get('common_name').replace(" ", "_")
            files_end.append((filepath, file_family))
        print(len(files_end), "Files da caricare")
        return files_end

    def get_digest(self, file):
        """Return (scout_vector, impfuzzy, md5, sha1, sha256, confidence)
        for the given sample path.

        On ApiScout failure the path is appended to fail_list.txt and the
        vector/confidence come back as None.
        """
        md5 = hashlib.md5()
        sha1 = hashlib.sha1()
        sha256 = hashlib.sha256()
        try:
            impfuzzy = pyimpfuzzy.get_impfuzzy(file)
        except:
            # Best-effort: samples without a usable import table get an
            # empty impfuzzy hash.
            impfuzzy = ""
        # NOTE(review): if the path is not a regular file, scout_result /
        # scout_confidence are never assigned and the return below raises
        # NameError (and the hashing open() would fail anyway) — TODO
        # confirm callers always pass existing files.
        if os.path.isfile(file):
            with open(file, "rb") as f_binary:
                binary = f_binary.read()
            try:
                scout_ev = self.scout.evaluateImportTable(binary,
                                                          is_unmapped=True)
                scout_result = self.scout.getWinApi1024Vectors(scout_ev).get(
                    'import_table', {}).get('vector', None)
                # NOTE(review): _apivector is a private ApiScout attribute.
                scout_confidence = self.scout._apivector.getVectorConfidence(
                    scout_result)
            except:
                # Record failing samples for later inspection.
                with open('fail_list.txt', 'a') as f:
                    f.write(file + "\n")
                scout_result = None
                scout_confidence = None
        # Stream the file once to feed all three hash objects.
        # NOTE(review): 2047 is an odd chunk size — presumably meant to be
        # a power of two; harmless but worth confirming.
        with open(file, "rb") as f:
            while True:
                buf = f.read(2047)
                if not buf:
                    break
                md5.update(buf)
                sha1.update(buf)
                sha256.update(buf)
        return scout_result, impfuzzy, md5.hexdigest(), sha1.hexdigest(
        ), sha256.hexdigest(), scout_confidence

    def impfuzzy_comp(self, list, list_new):
        # Pairwise impfuzzy comparison: new-vs-new (upper triangle only)
        # plus new-vs-existing. Items are rows shaped like hashlist:
        # index 0 = node id, index 2 = impfuzzy hash.
        # NOTE(review): parameter `list` shadows the builtin.
        # NOTE(review): pattern should be a raw string to silence escape
        # warnings; behavior is unchanged either way here.
        ssdeep = re.compile("^[0-9]{1,5}:[0-9a-zA-Z\/\+]+:[0-9a-zA-Z\/\+]+$",
                            re.DOTALL)
        complist = []
        list_len = len(list_new)
        i = 0
        for item_new in list_new:
            i += 1
            # Only compare plausible ssdeep-format hashes of sane length.
            if re.search(ssdeep, item_new[2]) and len(item_new[2]) < 150:
                for j in range(i, list_len):
                    if re.search(ssdeep, list_new[j][2]) and \
                            len(list_new[j][2]) < 150:
                        complist.append([
                            item_new[0], list_new[j][0],
                            pyimpfuzzy.hash_compare(item_new[2],
                                                    list_new[j][2])
                        ])
        if list:
            for item_new in list_new:
                if re.search(ssdeep, item_new[2]) and len(item_new[2]) < 150:
                    for item in list:
                        if re.search(ssdeep, item[2]) and len(item[2]) < 150:
                            complist.append([
                                item_new[0], item[0],
                                pyimpfuzzy.hash_compare(item_new[2], item[2])
                            ])
        return complist

    def scout_comp(self, list, list_new):
        # Pairwise ApiScout vector comparison, same shape as impfuzzy_comp:
        # new-vs-new (upper triangle) then new-vs-existing. Row index 3
        # holds the compressed WinApi1024 vector; scores are scaled to
        # an int 0-100 to be comparable with self.threshold.
        # NOTE(review): parameter `list` shadows the builtin.
        complist = []
        list_len = len(list_new)
        i = 0
        for item_new in list_new:
            i += 1
            for j in range(i, list_len):
                complist.append([
                    item_new[0], list_new[j][0],
                    int(self.scout.matchVectors(item_new[3],
                                                list_new[j][3]) * 100)
                ])
        for item_new in list_new:
            for item in list:
                complist.append([
                    item_new[0], item[0],
                    int(self.scout.matchVectors(item_new[3], item[3]) * 100)
                ])
        return complist

    def process(self):
        """Ingest new sample(s), correlate, cluster and store in Neo4j.

        Batch mode (folder_path set): digest every collected file and add
        unseen sha256s as new nodes. Single mode: digest self.filepath; if
        the sample is already in the graph, return its report directly.
        Returns a search_hash() report dict in single-file mode, or {} when
        the sample is rejected.
        """
        hashlist = []
        hashlist_new = []
        nodes = []
        edges = []
        relationships = []
        # recover all actual data
        database = self.graph.run(
            "MATCH (m:Malware) RETURN m.id, m.name, m.impfuzzy, m.scout_result, m.scout_confidence, m.md5, m.sha1, m.sha256, m.tag"
        ).data()
        if database:
            for d in database:
                # Row layout: [id, name, impfuzzy, scout_result,
                # scout_confidence, md5, sha1, sha256, tag]
                hashlist.append([
                    d["m.id"], d["m.name"], d["m.impfuzzy"],
                    d["m.scout_result"], d["m.scout_confidence"], d["m.md5"],
                    d["m.sha1"], d["m.sha256"], d["m.tag"]
                ])
        nodes_count = len(database)
        # i is the next free node id (existing ids are assumed 0..count-1).
        i = nodes_count
        relation_data = self.graph.run(
            "MATCH (m1:Malware)-[s:same]-(m2:Malware) RETURN m1.id, m2.id, s.value"
        ).data()
        if relation_data:
            for r in relation_data:
                relationships.append([r["m1.id"], r["m2.id"], r["s.value"]])
        for x in range(nodes_count):
            nodes.append(x)
        # if massive check for each file
        if self.folder_path:
            for item in self.files:
                scout_result, impfuzzy, md5, sha1, sha256, scout_confidence = self.get_digest(
                    item[0])
                # 'A171' is a degenerate/empty vector marker — skip such
                # samples (presumably "no meaningful imports"; TODO confirm).
                if scout_result in ("", 'A171', None):
                    continue
                query = "MATCH (m:Malware) WHERE m.sha256=\"%s\" RETURN m" % sha256
                objs = self.graph.run(query).data()
                # Only add samples not already in the DB nor queued this run.
                if not objs and sha256 not in [x[5] for x in hashlist_new]:
                    nodes.append(i)
                    hashlist_new.append([
                        i, item[0].split("/")[-1], impfuzzy, scout_result,
                        scout_confidence, md5, sha1, sha256, item[1]
                    ])
                    i += 1
                else:
                    continue
        else:
            # if single we are in the reporting module
            # if file is tested it need to have valid apiscout vector
            if self.check_file(self.filepath):
                scout_result, impfuzzy, md5, sha1, sha256, scout_confidence = self.get_digest(
                    self.filepath)
                if scout_result in ("", 'A171', None):
                    return {}
            else:
                return {}
            query = "MATCH (m:Malware) WHERE m.sha256=\"%s\" RETURN m" % sha256
            objs = self.graph.run(query).data()
            if not objs:
                nodes.append(nodes_count)
                hashlist_new.append([
                    nodes_count, self.filename, impfuzzy, scout_result,
                    scout_confidence, md5, sha1, sha256, None
                ])
            else:
                # Sample already known: just return its existing report.
                return self.search_hash(sha256)
        # Calculate apiscout correlation
        result_list = self.scout_comp(hashlist, hashlist_new)
        # Only re-cluster and write when new nodes were actually added.
        if len(database) != len(nodes):
            for edge in result_list + relationships:
                # Edges below threshold are kept with weight 0 so Louvain
                # still sees the full node set.
                if edge[2] > self.threshold:
                    edges.append([[edge[0], edge[1]], edge[2]])
                else:
                    edges.append([[edge[0], edge[1]], 0])
            pyl = PyLouvain(nodes, edges)
            partition, modularity = pyl.apply_method()
            # Create node
            # NOTE(review): statement_c / statement_r are module-level
            # Cypher templates defined elsewhere in this file.
            tx = self.graph.begin()
            for hash in hashlist_new + hashlist:
                i = 0
                for a in partition:
                    i += 1
                    if hash[0] in a:
                        tx.append(
                            statement_c, {
                                "id": hash[0],
                                "name": hash[1],
                                "impfuzzy": hash[2],
                                "scout_result": hash[3],
                                "scout_confidence": hash[4],
                                "md5": hash[5],
                                "sha1": hash[6],
                                "sha256": hash[7],
                                "tag": hash[8],
                                "cluster": i
                            })
            # Create relationship
            for result in result_list:
                if result[2] > self.threshold:
                    tx.append(statement_r, {
                        "id1": result[0],
                        "id2": result[1],
                        "value_scout": result[2]
                    })
            tx.process()
            tx.commit()
        # recover info
        if self.filename:
            return self.search_hash(sha256)

    def process_file(self, filepath, filename):
        # Convenience wrapper for single-file mode: set the sample and run
        # the full pipeline.
        self.filepath = filepath
        self.filename = filename
        return self.process()

    def search_hash(self, data):
        """Build a report dict for the sample matching the given hash.

        data may be an md5, sha1 or sha256 hex string. Returns {} when the
        input is not a recognizable hash. Keys: 'info', 'families',
        'files', and either 'cluster' (members, when the cluster is small)
        or 'cluster_count'.
        """
        return_dict = {}
        # identify hash type
        HASHES = (
            ("md5", "^[a-fA-F0-9]{32}$"),
            ("sha1", "^[a-fA-F0-9]{40}$"),
            ("sha256", "^[a-fA-F0-9]{64}$"),
        )
        res = None
        for items in HASHES:
            if re.match(items[1], data):
                res = items[0]
        # No hash type match return
        # NOTE(review): `res is None` would be the idiomatic comparison.
        if res == None:
            return {}
        # NOTE(review): queries are built by string interpolation; `data`
        # is regex-validated above, which limits injection here.
        family_query = "MATCH (m1:Malware) WHERE m1.%s=\"%s\" MATCH (m1:Malware)-[s:same]-(m2:Malware) MATCH (m2:Malware) WHERE m2.cluster = m1.cluster RETURN distinct m2.tag as tag, max(s.value) as max order by max(s.value) desc" % (
            res, data)
        file_query = "MATCH (m1:Malware) WHERE m1.%s=\"%s\" MATCH (m1:Malware)-[s:same]-(m2:Malware) MATCH (m2:Malware) WHERE m2.cluster = m1.cluster RETURN m2.tag as tag, m2.sha256 as sha256, max(s.value) as max order by max(s.value) desc LIMIT 10" % (
            res, data)
        cluster_count_query = "MATCH (m1:Malware) where m1.%s=\"%s\" MATCH (m2:Malware)-[p:same]->(m3:Malware) where m2.cluster = m1.cluster and m3.cluster = m1.cluster RETURN count(p.value) as total" % (
            res, data)
        cluster_query = "MATCH (m1:Malware) where m1.%s=\"%s\" MATCH (m2:Malware)-[p:same]->(m3:Malware) where m2.cluster = m1.cluster and m3.cluster = m1.cluster RETURN m2.sha256, m2.tag, p.value, m3.sha256, m3.cluster, m3.tag" % (
            res, data)
        item_query = "MATCH (m:Malware) WHERE m.%s=\"%s\" RETURN m" % (res,
                                                                       data)
        item_data = self.graph.run(item_query).data()
        cluster_count_list = self.graph.run(cluster_count_query).data()
        cluster_count = cluster_count_list[0]['total'] if len(
            cluster_count_list) > 0 else None
        return_dict['info'] = item_data[0]['m'] if len(item_data) > 0 else None
        family_objs = self.graph.run(family_query).data()
        if family_objs:
            return_dict['families'] = family_objs
        return_dict['files'] = self.graph.run(file_query).data()
        # Only materialize the full cluster edge list when it is small;
        # otherwise report just the count.
        if cluster_count and cluster_count < 100:
            return_dict['cluster'] = self.graph.run(cluster_query).data()
        else:
            return_dict['cluster_count'] = cluster_count
        return return_dict