Esempio n. 1
0
def build_dictionaries(**params):
    '''
    Build dictionaries of api calls via dict_builder.fast_dict and save them:
    dict_A.json contains api calls indexed by the app they appear in
    dict_B.json contains api calls indexed by the method block they appear in
    dict_P.json contains api calls indexed by the package they appear in
    dict_I.json contains api calls indexed by the invocation type they appear in
    api_calls.json contains api calls with the number of times they appear in all apps
    naming_key.json is a table to look up keys and their relative api calls, apps, code blocks, packages, or invocation types

    Parameters (passed via **params and forwarded to dict_builder.fast_dict)
    ----------
    dict_directory: str, required
        Directory path the dictionary json files are written to.
        NOTE(review): filenames are appended by plain string concatenation,
        so this is expected to end with a path separator — confirm in config.
    out_path: str, required
        File path to get json files of api calls from parsed apps
    verbose: bool, required
        Whether to print progress while building dictionaries
    truncate: bool, required
        Whether to drop api calls occurring no more than
        lower_bound_api_count times across the whole data set
    '''
    dict_dir = params["dict_directory"]
    print("--- Starting Dictionary Creation ---")
    start_time = time.time()
    dict_B, dict_P, dict_I, dict_A = dict_builder.fast_dict(**params)
    tables = [dict_A, dict_B, dict_P, dict_I]
    filenames = ["dict_A", "dict_B", "dict_P", "dict_I"]
    for table, fname in zip(tables, filenames):
        json_functions.save_json(table, dict_dir + fname)
    print("--- Dictionary Creation Done in " + str(int(time.time() - start_time)) + " Seconds ---")
    print()
Esempio n. 2
0
def run(cmd_line_args, params):
    '''
    Run the entire project pipeline.

    Parameters
    ----------
    cmd_line_args: listOfStrings, required
        list of command line arguments passed
    params: dictionary, required
        parameter configuration dictionary pulled from `config/params.json`

    Returns
    -------
    None (the process exits via sys.exit when the pipeline finishes)
    '''
    print()
    # Parse command-line flags against the declared options and fold the
    # overrides into the parameter configuration.
    option_defs = params["options"]
    parsed_args = get_command_ling_args(cmd_line_args, option_defs)
    params["eda-params"]["args_literal"] = cmd_line_args
    params = apply_command_line_args(parsed_args, params)

    # Persist the fully-resolved configuration next to the run output.
    resolved_params_path = os.path.join(params["out_path"], params["params_name"])
    jf.save_json(params, resolved_params_path)

    # Hand both the parsed flags and the resolved params to the pipeline.
    Main.run_all({"cmd_line_args": parsed_args, "params": params})
    print()
    sys.exit()
Esempio n. 3
0
def make_graph(src, dst):
    '''
    Build a StellarGraph network and save per-node-type counts.

    Parameters
    ----------
    src:
        Source passed to build_network.make_stellargraph (the dictionary
        directory, per the caller in run_all).
    dst: str
        File path the node-count json is written to.

    Returns
    -------
    The constructed StellarGraph instance.
    '''
    print()
    print("--- Starting StellarGraph Creation ---")
    start_time = time.time()
    G = build_network.make_stellargraph(src)
    print(G.info())
    print("--- StellarGraph Creation Done in " + str(int(time.time() - start_time)) + " Seconds ---")
    print()
    # Count nodes per type. (Loop variable renamed: the original shadowed
    # the builtin `type`.)
    node_types = ["api_call_nodes", "package_nodes", "app_nodes", "block_nodes"]
    node_dict = {node_type: len(G.nodes_of_type(node_type)) for node_type in node_types}
    json_functions.save_json(node_dict, dst)
    return G
Esempio n. 4
0
def save_w2v_embedding(model,
                       corp_size,
                       unique_apis,
                       save_unique_apis=True,
                       **params):
    '''
    Save word2vec embeddings to disk in the SHNE intake text format.

    The output file starts with a header line "<vocab_size> <corpus_size>",
    followed by one line per API: "<api id> <dim0> <dim1> ..." (every line
    keeps a trailing space, matching the original format).

    Parameters
    ----------
    model:
        Mapping from API name to its embedding vector (indexed as model[api]).
    corp_size:
        Corpus size written into the header line.
    unique_apis: dict
        Maps each API name to the identifier written at the start of its line.
    save_unique_apis: bool, default True
        Also save unique_apis as json to params["unique_api_filename"].
    **params:
        save_dir, embeddings_filename, unique_api_filename, verbose.
    '''
    os.makedirs(params["save_dir"], exist_ok=True)
    embeddings_path = os.path.join(params["save_dir"],
                                   params["embeddings_filename"])
    # Remove any pre-existing embeddings file so the append below starts fresh.
    try:
        os.remove(embeddings_path)
    except OSError:
        pass
    if save_unique_apis:
        json_functions.save_json(
            unique_apis,
            os.path.join(params["save_dir"], params["unique_api_filename"]))
        if params["verbose"]:
            print("Saved %s to %s" %
                  (params["unique_api_filename"], params["save_dir"]))

    # Use a context manager so the handle is closed even if a write raises
    # (the original leaked the handle on error).
    with open(embeddings_path, "a") as f:
        f.write(str(len(unique_apis)) + " ")
        f.write(str(corp_size))
        f.write("\n")
        for api in unique_apis:
            f.write(str(unique_apis[api]) + " ")
            for component in model[api]:
                f.write(str(component) + " ")
            f.write("\n")
    if params["verbose"]:
        print("Saved %s to %s" %
              (params["embeddings_filename"], params["save_dir"]))
Esempio n. 5
0
def run_all(kwargs):
    '''
    Runs the main project pipeline logic, given the targets.

    Pipeline stages (each skippable via command-line flags):
      1. parse apps and build api-call dictionaries (A/B/P/I)
      2. build the StellarGraph network from those dictionaries
      3. generate node2vec or metapath2vec random walks
      4. create word2vec embeddings
      5. run the SHNE model
      6. save the final parameter configuration

    Parameters
    ----------
    kwargs: Dictionary, required
        Must contain two entries:
        "cmd_line_args": Dictionary
            Arguments passed on in command lines:
                test: run on test set
                node2vec_walk: perform node2vec walk instead of word2vec
                embeddings_only: only get word2vec/node2vec embeddings
                skip_embeddings: skip word2vec/node2vec embeddings creation
                skip_shne: skip shne model creation
                parse_only: Only get api dictionaries. Do not create embeddings or models
                overwrite: Overwrite any data that may already exist in out folder
                redirect_std_out: Save cmd line output to text file. Hides console outputs
                time: time how long to run
        "params": Dictionary
            dictionary of parameters found in config file `config/params.json`
                mal_fp: string
                    file path of the malignant apps
                benign_fp: string
                    file path of the benign apps
                limiter: bool
                    Boolean value to dictate if number of apps parsed is to be limited
                lim_mal: int
                    Number of malignant apps to limit parsing of if limiter is True
                lim_benign: int
                    Number of benign apps to limit parsing of if limiter is True
                mal_fp_test_loc: string
                    file path of test malignant apps
                benign_fp_test_loc: string
                    file path of test benign apps
                directory: string
                    File path to get json files of api calls from parsed apps
                verbose: bool
                    Boolean value to print progress while building dictionaries
                truncate: bool
                    Boolean value to drop rare api calls (see lower_bound_api_count)
                dict_directory: string
                    File path of dictionary output
                multithreading: bool
                    Boolean value to turn on multithreaded processing of data
                out_path: string
                    File path of outputed parsed apps

    Returns
    -------
    None
    '''
    params=kwargs["params"]
    cmd_ln_args=kwargs["cmd_line_args"]

    etl_params=params["etl-params"]
    w2v_params=params["word2vec-params"]

    # Command-line flags controlling which stages run.
    SKIP_SHNE=cmd_ln_args["skip_shne"]
    EMBEDDINGS_ONLY=cmd_ln_args["embeddings_only"]
    PARSE_ONLY=cmd_ln_args["parse_only"]
    OVERWRITE=cmd_ln_args["overwrite"]
    SAVE_OUTPUT=cmd_ln_args["redirect_std_out"]

    # Global config values.
    CHECK_FILES=params["check_files"]
    CHECK_FILES2=params["check_files2"]
    SHNE_PATH=params["shne-params"]["datapath"]
    VERBOSE=params["verbose"]
    NUM_CORES=params["core_count"]
    MULTIPROCESS=params["multithreading"]

    # ETL (app parsing / dictionary building) config values.
    DICTIONARY_EXTRACT_DIR=etl_params["dict_directory"]
    ETL_LIMITER=etl_params["limiter"]
    ETL_MALICIOUS_PATH=etl_params["mal_fp"]
    ETL_BENIGN_PATH=etl_params["benign_fp"]
    ETL_LIM=etl_params["lim_apps"]
    ETL_OUT_PATH=etl_params["out_path"]
    ETL_TRUNCATE=etl_params["truncate"]
    API_BOUND=etl_params["lower_bound_api_count"]
    NAMING_KEY=etl_params["data_naming_key_filename"]
    API_CALLS_TABLE=etl_params["api_call_filename"]

    # run shne if preprocessing is done: all expected files (CHECK_FILES2)
    # must already exist in the SHNE data directory
    data_exists=os.path.isdir(SHNE_PATH)
    preprocessing_done=data_exists and all([cf in os.listdir(SHNE_PATH) for cf in CHECK_FILES2])

    #skip preprocessing and just run SHNE, unless a flag forces a rebuild
    cmd_line_shne=EMBEDDINGS_ONLY or PARSE_ONLY or OVERWRITE
    if preprocessing_done and not SKIP_SHNE and not cmd_line_shne:
        print("PREPROCESSING DONE, STARTING SHNE")
        run_shne()
        return

    # if app files have already been parsed, then skip dictionary creation
    directory_exists=os.path.isdir(DICTIONARY_EXTRACT_DIR)
    if directory_exists:
        all_files_in_directory=all([cf in os.listdir(DICTIONARY_EXTRACT_DIR) for cf in CHECK_FILES])
        app_dicts_already_created=directory_exists and all_files_in_directory
    else:
        app_dicts_already_created=False

    if app_dicts_already_created and not OVERWRITE:
        print("--- DICTIONARIES ALREADY CREATED, STARTING STELLARGRAPH CREATION ---")

    else:
        #start extracting smali code
        # NOTE(review): the same ETL_LIM limit is applied to both malicious
        # and benign apps, despite the config exposing lim_mal/lim_benign.
        mal_app_names, benign_app_names=get_app_names(
            limiter=ETL_LIMITER,
            mal_fp=ETL_MALICIOUS_PATH,
            benign_fp=ETL_BENIGN_PATH,
            lim_benign=ETL_LIM,
            lim_mal=ETL_LIM,
            verbose=VERBOSE
        )
        # suppress progress output when stdout is redirected to a file
        dictionary_verbose=VERBOSE and not SAVE_OUTPUT
        create_dictionary(
            malignant_apps=mal_app_names,
            benign_apps=benign_app_names,
            core_count=NUM_CORES,
            verbose=dictionary_verbose,
            mal_fp=ETL_MALICIOUS_PATH,
            benign_fp=ETL_BENIGN_PATH,
            multi_threading=MULTIPROCESS,
            out_path=ETL_OUT_PATH
        )
        build_dictionaries(
            dict_directory=DICTIONARY_EXTRACT_DIR,
            verbose=dictionary_verbose,
            out_path=ETL_OUT_PATH,
            truncate=ETL_TRUNCATE,
            lower_bound_api_count=API_BOUND,
            data_naming_key_filename=NAMING_KEY,
            api_call_filename=API_CALLS_TABLE
        )

    if PARSE_ONLY:
        if VERBOSE:
            print("Done.")
        return

    # get StellarGraph Network Graph
    sg_dst=os.path.join(params["shne-params"]["datapath"], params["shne-params"]["node_counts_filename"])
    G=make_graph(DICTIONARY_EXTRACT_DIR, sg_dst)

    if cmd_ln_args["node2vec_walk"]:
        #generate node2vec random walks; SHNE reads from the walk output dir
        node2vec.node2vec_walk(G, params["node2vec-params"])
        params["shne-params"]["datapath"]=params["node2vec-params"]["save_dir"]
    else:
        # generate metapath2vec random walks
        metapath2vec.metapath2vec_walk(G, params["metapath2vec-params"])
        params["shne-params"]["datapath"]=params["word2vec-params"]["save_dir"]

    if EMBEDDINGS_ONLY:
        if VERBOSE:
            print("Done.")
        return
    #BRADEN
    unique_api_path=os.path.join(etl_params["data_naming_key_dir"], etl_params["data_naming_key_filename"])
    if not cmd_ln_args["skip_embeddings"]:
        print()
        word2vec.create_w2v_embedding(
            path=etl_params["data_extract_loc"],
            path_to_unique_apis=unique_api_path,
            **w2v_params
        ) #Config

    if not SKIP_SHNE:
        print()
        run_shne(**params)

    #save final parameters (params may have been mutated above)
    out={
        "params":params,
        "command_line_arguments": cmd_ln_args
    }
    out_fn=os.path.join(params["out_path"],"final_"+params["params_name"])
    json_functions.save_json(out, out_fn)
    return
Esempio n. 6
0
def fast_dict(**kwargs):
    """Build dictionaries which can be converted into matrices A, B, P, I.

    Walks every parsed-app json file under out_path, assigns a short key to
    each app ("a<n>"), code block ("b<n>"), package ("p<n>"), invoke type
    ("i<n>") and api call ("c<n>"), and records which api-call keys appear
    under each app/block/package/invoke key.

    Required kwargs
    ---------------
    dict_directory: str
        Directory the naming-key and api-call-count tables are saved to.
    verbose: bool
        Print progress information.
    out_path: str
        Directory of parsed-app json files. NOTE(review): file paths are
        built by string concatenation, so this is expected to end with a
        path separator — confirm in config.
    truncate: bool
        Drop api calls occurring <= lower_bound_api_count times.
    lower_bound_api_count: int
    data_naming_key_filename: str
    api_call_filename: str

    Returns
    -------
    (B, P, I, A): four dicts mapping block/package/invoke/app keys to lists
    of api-call keys observed under them.
    """
    key_directory = kwargs["dict_directory"]
    verbose = kwargs["verbose"]
    direc = kwargs["out_path"]
    truncate = kwargs["truncate"]
    lower_bound_api_count = kwargs["lower_bound_api_count"]
    naming_key_filename = kwargs["data_naming_key_filename"]
    api_call_filename = kwargs["api_call_filename"]

    key_dst = os.path.join(key_directory, naming_key_filename)
    call_dst = os.path.join(key_directory, api_call_filename)

    def add_key(store, value, prefix, suffix, value_type):
        """
        Return (key, next_suffix) for value, registering the value in both
        lookup directions if unseen. Keys are prefix + running index ("a0").
        """
        if value not in store["get_key"][value_type]:
            key = prefix + str(suffix)
            store["lookup_key"][key] = value
            store["get_key"][value_type][value] = key
            suffix += 1
        else:
            key = store["get_key"][value_type][value]
        return key, suffix

    def append_value(store, key, value):
        """Append value to the list at store[key], creating it if needed."""
        if key in store:
            store[key].append(value)
        else:
            store[key] = [value]

    #########################
    #FOR TRAIN PORTION OF SPLIT
    #########################
    B = {}  # block key   -> list of api-call keys
    A = {}  # app key     -> list of api-call keys
    P = {}  # package key -> list of api-call keys
    I = {}  # invoke key  -> list of api-call keys
    C = {}  # api-call key -> occurrence count across all apps

    # c- prefix denotes api call
    # a- prefix denotes app
    # b- prefix denotes code block
    # p- prefix denotes package
    # i- prefix denotes invoke type
    key_lookup = {
        "get_key": {
            "apps":
            {},  #input a value and value type, i.e. "apps", etc, and get the associated key
            "blocks": {},
            "packages": {},
            "invokes": {},
            "calls": {}
        },
        "lookup_key": {}  #input a key and get the associated value
    }
    list_of_files = []
    for root, dirs, files in os.walk(direc):
        list_of_files.append(files)

    # flatten, dedupe, and shuffle so key assignment order is not
    # filesystem-dependent
    list_of_files = list(
        set([item for sublist in list_of_files for item in sublist]))
    random.shuffle(list_of_files)
    print(str(len(list_of_files)) + " Total Files for Dictionary Creation")

    ax = 0  #index for apps
    bx = 0  #index for blocks
    px = 0  #index for packages
    ix = 0  #index for invoke types
    cx = 0  #index for calls
    iix = 0  #keep track of iterations
    start_time = time.time()
    for file in tqdm(list_of_files):
        if "checkpoint" in file:  #for stupid git ignores
            continue
        fn = direc + file
        filez = jf.load_json(fn)

        # BUG FIX: the original used file.rstrip(".json"), which strips any
        # trailing '.', 'j', 's', 'o', 'n' characters and mangles app names
        # ending in those letters; splitext removes only the extension.
        filename = os.path.splitext(file)[0]
        akey, ax = add_key(key_lookup, filename, "a", ax, "apps")

        for block in filez:

            if len(block) > 0:  #skip empty blocks
                full_block = " ".join(block)
                #add block to lookup table and get a key
                bkey, bx = add_key(key_lookup, full_block, "b", bx, "blocks")

                for call in block:
                    try:
                        # calls look like "<invoke-type> {...}, <api_call> ..."
                        api_call = call.split("}, ")[1].split(" ")[0].strip()
                        ckey, cx = add_key(key_lookup, api_call, "c", cx,
                                           "calls")
                        append_value(A, akey, ckey)  #append key to dictionary
                        append_value(B, bkey, ckey)

                        package = call.split(";")[0].split(",")[-1].strip()
                        pkey, px = add_key(key_lookup, package, "p", px,
                                           "packages")
                        append_value(P, pkey, ckey)

                        invoke_type = call.split("}, ")[0].split(
                            " ")[0].strip()
                        ikey, ix = add_key(key_lookup, invoke_type, "i", ix,
                                           "invokes")
                        append_value(I, ikey, ckey)

                        C[ckey] = C.get(ckey, 0) + 1
                        iix += 1
                    except Exception:
                        # malformed call line; skip it (was a bare except)
                        continue
    if truncate:
        if verbose:
            print()
            print(
                "Truncation is set to True, API calls only occuring less than lower_bound_api_count will be removed from the model."
            )
            print("Number of API calls Before Truncation: " +
                  str(len(C.keys())))
        # api-call keys occurring no more than lower_bound_api_count times,
        # computed once (the original rebuilt this dict for each table)
        rare = set(k for k, v in C.items() if v <= lower_bound_api_count)
        # BUG FIX: the original tried `del table[k]` with k a call key ("c…"),
        # but B/P/I/A are keyed by "b…"/"p…"/"i…"/"a…" keys, so the delete
        # always failed and was swallowed by a bare except — truncation was a
        # silent no-op. Actually filter rare calls out of each table's lists.
        for table in [B, P, I, A]:
            for key in list(table.keys()):
                table[key] = [c for c in table[key] if c not in rare]
                if not table[key]:
                    del table[key]
        if verbose:
            print("Number of API calls After Truncation:  " +
                  str(len(C.keys()) - len(rare)))
            print()
    #save the key_lookup table to "key_directory" config parameter in dict_build.json
    jf.save_json(key_lookup, key_dst)
    jf.save_json(C, call_dst)
    if verbose:
        print("Saving node key lookup table to: %s" % key_dst)
        print("Saving api call list to: %s" % call_dst)
    return B, P, I, A