def build_dictionaries(**params):
    """Build the api-call dictionaries and save them as json files.

    Produces, in `dict_directory`:
      dict_A.json  api calls indexed by the app they appear in
      dict_B.json  api calls indexed by the method block they appear in
      dict_P.json  api calls indexed by the package they appear in
      dict_I.json  api calls indexed by the invocation type they appear in
    (fast_dict itself also writes the api-call counts table and the
    key/value naming lookup table.)

    Parameters
    ----------
    dict_directory: str, required
        Directory path the dictionary json files are written to.
    out_path: str, required
        Directory of json files of api calls from parsed apps.
    verbose: bool, required
        Print progress while building dictionaries.
    truncate: bool, required
        Drop api calls occurring at most `lower_bound_api_count` times.
    """
    dict_dir = params["dict_directory"]
    print("--- Starting Dictionary Creation ---")
    start_time = time.time()
    dict_B, dict_P, dict_I, dict_A = dict_builder.fast_dict(**params)
    # os.path.join works whether or not dict_directory carries a trailing
    # separator; the previous `fp + fname` concatenation silently produced
    # a wrong path when the trailing slash was missing.
    for table, fname in zip([dict_A, dict_B, dict_P, dict_I],
                            ["dict_A", "dict_B", "dict_P", "dict_I"]):
        json_functions.save_json(table, os.path.join(dict_dir, fname))
    print("--- Dictionary Creation Done in "
          + str(int(time.time() - start_time)) + " Seconds ---")
    print()
def run(cmd_line_args, params):
    """Run the entire project pipeline.

    Parameters
    ----------
    cmd_line_args: list of str, required
        Command line arguments passed to the program.
    params: dict, required
        Parameter configuration pulled from `config/params.json`.

    Returns
    -------
    None (the process terminates via sys.exit()).
    """
    print()
    # Parse the command line and fold the resulting options into params.
    option_spec = params["options"]
    parsed_args = get_command_ling_args(cmd_line_args, option_spec)
    params["eda-params"]["args_literal"] = cmd_line_args
    params = apply_command_line_args(parsed_args, params)
    # Persist the effective configuration alongside the run output.
    config_dst = os.path.join(params["out_path"], params["params_name"])
    jf.save_json(params, config_dst)
    # Hand everything to the main pipeline driver.
    Main.run_all({"cmd_line_args": parsed_args, "params": params})
    print()
    sys.exit()
def make_graph(src, dst):
    """Build a StellarGraph from the dictionaries in `src` and save node counts.

    Parameters
    ----------
    src: str
        Directory containing the api-call dictionary files.
    dst: str
        Output path for the json file of per-type node counts.

    Returns
    -------
    The constructed StellarGraph instance.
    """
    print()
    print("--- Starting StellarGraph Creation ---")
    start_time = time.time()
    G = build_network.make_stellargraph(src)
    print(G.info())
    print("--- StellarGraph Creation Done in "
          + str(int(time.time() - start_time)) + " Seconds ---")
    print()
    # Record how many nodes of each type ended up in the graph.
    # `node_type` (not `type`) so the builtin is not shadowed.
    node_types = ["api_call_nodes", "package_nodes", "app_nodes", "block_nodes"]
    node_dict = {node_type: len(G.nodes_of_type(node_type))
                 for node_type in node_types}
    json_functions.save_json(node_dict, dst)
    return G
def save_w2v_embedding(model, corp_size, unique_apis, save_unique_apis=True, **params):
    """Write word2vec embeddings to disk in the format SHNE expects.

    The output file starts with "<vocab size> <corpus size>" on the first
    line, followed by one line per api: "<api key> <v0> <v1> ..." (each
    value followed by a single space).

    Parameters
    ----------
    model: mapping
        Maps an api name to its embedding vector (indexable like w2v model).
    corp_size: int
        Corpus size written into the file header.
    unique_apis: dict
        Maps api name -> integer key used as the row identifier.
    save_unique_apis: bool, optional
        Also dump `unique_apis` as json next to the embeddings file.
    params: dict
        Must supply "save_dir", "embeddings_filename",
        "unique_api_filename" and "verbose".
    """
    os.makedirs(params["save_dir"], exist_ok=True)
    embeddings_path = os.path.join(params["save_dir"], params["embeddings_filename"])
    # Remove any pre-existing embeddings file: we open in append mode below,
    # so a stale file would otherwise be extended instead of replaced.
    try:
        os.remove(embeddings_path)
    except OSError:
        pass
    if save_unique_apis:
        json_functions.save_json(
            unique_apis,
            os.path.join(params["save_dir"], params["unique_api_filename"]))
        if params["verbose"]:
            print("Saved %s to %s" % (params["unique_api_filename"], params["save_dir"]))
    # Write to match SHNE intake format. The context manager guarantees the
    # handle is closed even if an embedding lookup raises (the original
    # open()/close() pair leaked the handle on error).
    with open(embeddings_path, "a") as f:
        f.write(str(len(unique_apis.keys())) + " ")
        f.write(str(corp_size))
        f.write("\n")
        for api in unique_apis.keys():
            f.write(str(unique_apis[api]) + " ")
            for component in model[api]:
                f.write(str(component) + " ")
            f.write("\n")
    if params["verbose"]:
        print("Saved %s to %s" % (params["embeddings_filename"], params["save_dir"]))
def run_all(kwargs):
    '''
    Runs the main project pipeline logic, given the targets.

    Parameters
    ----------
    kwargs: dict, required
        Expects two entries:

        cmd_line_args: dict
            Parsed command line flags:
            node2vec_walk: perform node2vec walk instead of metapath2vec
            embeddings_only: stop after creating random-walk embeddings
            skip_embeddings: skip word2vec embedding creation
            skip_shne: skip SHNE model creation
            parse_only: only build the api dictionaries, no embeddings/models
            overwrite: overwrite any data already in the out folder
            redirect_std_out: save cmd line output to a text file

        params: dict
            Parameter configuration from `config/params.json`, including the
            "etl-params", "word2vec-params", "node2vec-params",
            "metapath2vec-params" and "shne-params" sub-dictionaries,
            the "check_files"/"check_files2" completion markers, and global
            settings (verbose, core_count, multithreading, out_path, ...).

    Returns
    -------
    None
    '''
    params = kwargs["params"]
    cmd_ln_args = kwargs["cmd_line_args"]
    etl_params = params["etl-params"]
    w2v_params = params["word2vec-params"]
    # Command line toggles.
    SKIP_SHNE = cmd_ln_args["skip_shne"]
    EMBEDDINGS_ONLY = cmd_ln_args["embeddings_only"]
    PARSE_ONLY = cmd_ln_args["parse_only"]
    OVERWRITE = cmd_ln_args["overwrite"]
    SAVE_OUTPUT = cmd_ln_args["redirect_std_out"]
    # Files whose presence marks a pipeline stage as already complete.
    CHECK_FILES = params["check_files"]
    CHECK_FILES2 = params["check_files2"]
    SHNE_PATH = params["shne-params"]["datapath"]
    VERBOSE = params["verbose"]
    NUM_CORES = params["core_count"]
    MULTIPROCESS = params["multithreading"]
    # ETL configuration.
    DICTIONARY_EXTRACT_DIR = etl_params["dict_directory"]
    ETL_LIMITER = etl_params["limiter"]
    ETL_MALICIOUS_PATH = etl_params["mal_fp"]
    ETL_BENIGN_PATH = etl_params["benign_fp"]
    ETL_LIM = etl_params["lim_apps"]
    ETL_OUT_PATH = etl_params["out_path"]
    ETL_TRUNCATE = etl_params["truncate"]
    API_BOUND = etl_params["lower_bound_api_count"]
    NAMING_KEY = etl_params["data_naming_key_filename"]
    API_CALLS_TABLE = etl_params["api_call_filename"]

    # run shne if preprocessing is done
    data_exists = os.path.isdir(SHNE_PATH)
    preprocessing_done = data_exists and all(
        [cf in os.listdir(SHNE_PATH) for cf in CHECK_FILES2])
    # skip preprocessing and just run SHNE, unless a command line flag
    # forces the earlier stages to run again
    cmd_line_shne = EMBEDDINGS_ONLY or PARSE_ONLY or OVERWRITE
    if preprocessing_done and not SKIP_SHNE and not cmd_line_shne:
        print("PREPROCESSING DONE, STARTING SHNE")
        # NOTE(review): called without arguments here but with **params
        # further below -- confirm run_shne accepts both call shapes.
        run_shne()
        return

    # if app files have already been parsed, then skip dictionary creation
    directory_exists = os.path.isdir(DICTIONARY_EXTRACT_DIR)
    if directory_exists:
        all_files_in_directory = all(
            [cf in os.listdir(DICTIONARY_EXTRACT_DIR) for cf in CHECK_FILES])
        app_dicts_already_created = directory_exists and all_files_in_directory
    else:
        app_dicts_already_created = False
    if app_dicts_already_created and not OVERWRITE:
        print("--- DICTIONARIES ALREADY CREATED, STARTING STELLARGRAPH CREATION ---")
    else:
        # start extracting smali code
        mal_app_names, benign_app_names = get_app_names(
            limiter=ETL_LIMITER,
            mal_fp=ETL_MALICIOUS_PATH,
            benign_fp=ETL_BENIGN_PATH,
            lim_benign=ETL_LIM,
            lim_mal=ETL_LIM,
            verbose=VERBOSE
        )
        # Suppress per-app progress output when stdout is being redirected.
        dictionary_verbose = VERBOSE and not SAVE_OUTPUT
        create_dictionary(
            malignant_apps=mal_app_names,
            benign_apps=benign_app_names,
            core_count=NUM_CORES,
            verbose=dictionary_verbose,
            mal_fp=ETL_MALICIOUS_PATH,
            benign_fp=ETL_BENIGN_PATH,
            multi_threading=MULTIPROCESS,
            out_path=ETL_OUT_PATH
        )
        build_dictionaries(
            dict_directory=DICTIONARY_EXTRACT_DIR,
            verbose=dictionary_verbose,
            out_path=ETL_OUT_PATH,
            truncate=ETL_TRUNCATE,
            lower_bound_api_count=API_BOUND,
            data_naming_key_filename=NAMING_KEY,
            api_call_filename=API_CALLS_TABLE
        )
    if PARSE_ONLY:
        if VERBOSE:
            print("Done.")
        return

    # get StellarGraph Network Graph
    sg_dst = os.path.join(params["shne-params"]["datapath"],
                          params["shne-params"]["node_counts_filename"])
    G = make_graph(DICTIONARY_EXTRACT_DIR, sg_dst)
    if cmd_ln_args["node2vec_walk"]:
        # generate node2vec random walks
        node2vec.node2vec_walk(G, params["node2vec-params"])
        params["shne-params"]["datapath"] = params["node2vec-params"]["save_dir"]
    else:
        # generate metapath2vec random walks
        metapath2vec.metapath2vec_walk(G, params["metapath2vec-params"])
        # NOTE(review): the word2vec save_dir is used here, presumably
        # because word2vec consumes the metapath2vec walks -- confirm.
        params["shne-params"]["datapath"] = params["word2vec-params"]["save_dir"]
    if EMBEDDINGS_ONLY:
        if VERBOSE:
            print("Done.")
        return

    # BRADEN
    unique_api_path = os.path.join(etl_params["data_naming_key_dir"],
                                   etl_params["data_naming_key_filename"])
    if not cmd_ln_args["skip_embeddings"]:
        print()
        word2vec.create_w2v_embedding(
            path=etl_params["data_extract_loc"],
            path_to_unique_apis=unique_api_path,
            **w2v_params
        )
    # Config
    if not SKIP_SHNE:
        print()
        run_shne(**params)

    # save final parameters
    out = {
        "params": params,
        "command_line_arguments": cmd_ln_args
    }
    out_fn = os.path.join(params["out_path"], "final_" + params["params_name"])
    json_functions.save_json(out, out_fn)
    return
def fast_dict(**kwargs):
    """Build the dictionaries that can be converted into matrices A, B, P, I.

    Reads every parsed-app json file in `out_path`, assigns a short key to
    each app / code block / package / invoke type / api call, and groups
    api-call keys by the entity they appear in. Also writes the key lookup
    table and the api-call occurrence counts to `dict_directory`.

    Required kwargs
    ---------------
    dict_directory: str
        Directory the naming key table and api-call table are saved to.
    verbose: bool
        Print progress information.
    out_path: str
        Directory of parsed-app json files.
    truncate: bool
        Drop api calls occurring at most `lower_bound_api_count` times.
    lower_bound_api_count: int
        Occurrence threshold used when `truncate` is True.
    data_naming_key_filename: str
        Filename of the saved key lookup table.
    api_call_filename: str
        Filename of the saved api-call counts table.

    Returns
    -------
    (B, P, I, A): four dicts mapping block / package / invoke-type / app
    keys to lists of api-call keys.
    """
    key_directory = kwargs["dict_directory"]
    verbose = kwargs["verbose"]
    direc = kwargs["out_path"]
    truncate = kwargs["truncate"]
    lower_bound_api_count = kwargs["lower_bound_api_count"]
    naming_key_filename = kwargs["data_naming_key_filename"]
    api_call_filename = kwargs["api_call_filename"]
    key_dst = os.path.join(key_directory, naming_key_filename)
    call_dst = os.path.join(key_directory, api_call_filename)

    def add_key(store, value, prefix, suffix, value_type):
        """Return (key, next_suffix) for `value`, registering it if unseen.

        Keys concatenate the node-type prefix (a/b/p/i/c) with a running
        integer suffix per type.
        """
        if value not in store["get_key"][value_type]:
            key = prefix + str(suffix)
            store["lookup_key"][key] = value
            store["get_key"][value_type][value] = key
            suffix += 1
        else:
            key = store["get_key"][value_type][value]
        return key, suffix

    def append_value(store, key, value):
        """Append `value` to the list at store[key], creating it if needed."""
        if key in store:
            store[key].append(value)
        else:
            store[key] = [value]

    # a- prefix denotes app, b- code block, p- package, i- invoke type,
    # c- api call
    B = {}
    A = {}
    P = {}
    I = {}
    C = {}  # api-call key -> occurrence count across all apps
    key_lookup = {
        "get_key": {  # value + value type -> key
            "apps": {},
            "blocks": {},
            "packages": {},
            "invokes": {},
            "calls": {}
        },
        "lookup_key": {}  # key -> value
    }

    list_of_files = []
    for root, dirs, files in os.walk(direc):
        list_of_files.append(files)
    list_of_files = list(
        set([item for sublist in list_of_files for item in sublist]))
    random.shuffle(list_of_files)
    print(str(len(list_of_files)) + " Total Files for Dictionary Creation")

    ax = 0  # index for apps
    bx = 0  # index for blocks
    px = 0  # index for packages
    ix = 0  # index for invoke types
    cx = 0  # index for calls
    for file in tqdm(list_of_files):
        if "checkpoint" in file:  # skip jupyter checkpoint artifacts
            continue
        filez = jf.load_json(os.path.join(direc, file))
        # Strip the ".json" extension. The original used
        # file.rstrip(".json"), which removes *characters* from the set
        # {., j, s, o, n} and mangles names ending in any of them.
        filename = file[:-len(".json")] if file.endswith(".json") else file
        akey, ax = add_key(key_lookup, filename, "a", ax, "apps")
        for block in filez:
            if len(block) > 0:  # skip empty blocks
                full_block = " ".join(block)
                # add block to lookup table and get a key
                bkey, bx = add_key(key_lookup, full_block, "b", bx, "blocks")
                for call in block:
                    try:
                        api_call = call.split("}, ")[1].split(" ")[0].strip()
                        ckey, cx = add_key(key_lookup, api_call, "c", cx, "calls")
                        append_value(A, akey, ckey)  # append key to dictionary
                        append_value(B, bkey, ckey)
                        package = call.split(";")[0].split(",")[-1].strip()
                        pkey, px = add_key(key_lookup, package, "p", px, "packages")
                        append_value(P, pkey, ckey)
                        invoke_type = call.split("}, ")[0].split(" ")[0].strip()
                        ikey, ix = add_key(key_lookup, invoke_type, "i", ix, "invokes")
                        append_value(I, ikey, ckey)
                        C[ckey] = C.get(ckey, 0) + 1
                    except (IndexError, AttributeError):
                        # Call string does not match the expected
                        # "invoke-... {...}, <call>" smali layout: skip it.
                        continue

    if truncate:
        if verbose:
            print()
            print(
                "Truncation is set to True, API calls only occuring less than lower_bound_api_count will be removed from the model."
            )
            print("Number of API calls Before Truncation: " + str(len(B.keys())))
        # Call keys occurring too rarely across the whole data set.
        rare_calls = set(
            k for k, v in C.items() if v <= lower_bound_api_count)
        # The original attempted `del table[ckey]`, but B/P/I/A are keyed by
        # block/package/invoke/app keys, so the delete never matched and the
        # bare except made truncation a silent no-op. Remove rare call keys
        # from the value lists instead, dropping entries left empty.
        for table in [B, P, I, A]:
            for key in list(table.keys()):
                table[key] = [c for c in table[key] if c not in rare_calls]
                if not table[key]:
                    del table[key]
        if verbose:
            print("Number of API calls After Truncation: " + str(len(B.keys())))
            print()

    # save the key_lookup table and call counts to the "key_directory"
    # config parameter in dict_build.json
    jf.save_json(key_lookup, key_dst)
    jf.save_json(C, call_dst)
    if verbose:
        print("Saving node key lookup table to: %s" % key_dst)
        print("Saving api call list to: %s" % call_dst)
    return B, P, I, A