Code example #1
# shared imports assumed by these snippets; `utils`, `unstack_walks`, `dfs`,
# `add_apk_node`, `get_commongraph`, `run`, `build_embedding`, etc. are assumed
# to be project-local helpers
import os

import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
from gensim.models import Doc2Vec, Word2Vec
from gensim.models.doc2vec import TaggedDocument
from stellargraph import StellarGraph


def doc2vec_train(train, path_to_walks, app_id_fp):
    """
    performs doc2vec on the walks to obtain document vectors
    USE ON SEPARATE APKs

    train --> list of training app filepaths
    path_to_walks --> directory containing the metapath2vec walks
    app_id_fp --> filepath to the csv containing app and label information
    """
    documents = []
    labels = []
    df = pd.read_csv(app_id_fp)

    train = [utils.dir_and_app(app)[1] for app in train]
    paths = [
        os.path.join(path_to_walks, (appname + "m2v_walks.txt"))
        for appname in train
    ]

    for path in tqdm(paths):
        try:
            documents.append(np.loadtxt(path, dtype=object))
            direc, appname = utils.dir_and_app(path)
            label = df[df.app_fp.str.contains(appname)].app_label.iloc[0]
            labels.append(label)

        except Exception:
            print("app ", path, " is broken")

    # tag each walk document with its index so Doc2Vec can address it
    docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
    documents = []  # free the raw walks

    model = Doc2Vec(docs, vector_size=100, window=300, min_count=1, workers=4)

    return [model, labels]
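
A minimal usage sketch, assuming train_apps is a placeholder list of .gml.bz2 filepaths and the two paths below are placeholders as well:

# hypothetical usage; "walks/" and "app_ids.csv" are placeholder paths
model, y_train = doc2vec_train(train_apps, "walks/", "app_ids.csv")
X_train = [model.docvecs[i] for i in range(len(y_train))]  # model.dv[i] in gensim >= 4
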
Code example #2
def building_embeddings(app_ids_fp, metapathsFP, walksFP):
    """
    builds the embeddings of each app

    app_ids_fp --> filepath to the csv containing app and label information
    metapathsFP --> path to where metapaths are stored
    walksFP --> path to where the walks are stored
    """
    df = pd.read_csv(app_ids_fp)

    walks = []

    for row in tqdm(range(len(df))):
        app_fp = df.iloc[row]["app_fp"]
        app_label = df.iloc[row]["app_label"]
        directory, app_name = utils.dir_and_app(app_fp)

        metapath_fp = os.path.join(metapathsFP,
                                   (app_name + "m2v_metapaths.txt"))
        walk_fp = os.path.join(walksFP, (app_name + "m2v_walks.txt"))
        if os.path.exists(walk_fp) and os.path.exists(metapath_fp):
            walk = unstack_walks(walk_fp, app_name)
            walks.extend(walk)

    return walks
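
A minimal usage sketch for the walk-collection step; all three paths are placeholder assumptions:

# hypothetical usage; the directory paths are placeholders
all_walks = building_embeddings("app_ids.csv", "metapaths/", "walks/")
print(len(all_walks), "walks collected across all apps")
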
Code example #3
def doc2vec_test(test, path_to_walks, app_id_fp, model, target, metapathsFP,
                 walksFP):
    """
    infers test item for unseen apps
    
    test --> list of test app names
    path_to_walks --> the path to the walks of apps
    app_id_fp --> filepath of information
    model --> the trained model
    """
    #     model = Doc2Vec.load(model_fp)

    documents = []
    # assume that the data is coming from the apps that we have
    labels = []
    df = pd.read_csv(app_id_fp)
    if ".gml.bz2" in test[0]:
        app_names = [utils.dir_and_app(app)[1] for app in test]
    else:
        app_names = test
    paths = [
        os.path.join(path_to_walks, (appname + "m2v_walks.txt"))
        for appname in app_names
    ]
    for ind in range(len(paths)):
        walk = paths[ind]

        #         if not(os.path.exists(walk)): # if this app has not been traversed yet
        #             # find the app location, and create files for it
        #             wrapper(test[ind], target, metapathsFP, walksFP)
        if os.path.exists(walk):
            documents.append(np.loadtxt(walk, dtype=object))
            direc, appname = utils.dir_and_app(walk)
            label = df[df.app_fp.str.contains(appname)].app_label.iloc[0]
            labels.append(label)

    X = []
    for test_app in documents:
        X.append(model.infer_vector(test_app))

    return [X, labels]
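
A possible end-to-end sketch, reusing model, X_train and y_train from the sketch under code example #1 and adding a scikit-learn classifier; test_apps and every path are placeholder assumptions:

# hypothetical usage; test_apps and all paths below are placeholders
from sklearn.linear_model import LogisticRegression

X_test, y_test = doc2vec_test(test_apps, "walks/", "app_ids.csv", model,
                              "graphs/", "metapaths/", "walks/")
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("accuracy:", clf.score(X_test, y_test))
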
Code example #4
def API_abstraction_vectorized(inFP, outFP, kind, to_return, truename=False):
    """
    abstracts edges and nodes of ONE APP to some level

    returns a graph that is abstracted (WILL CHANGE)

    inFP --> input file path (should be .gml.bz2)
    outFP --> output directory
    kind --> (str) FAMILY or PACKAGE or CLASS
    to_return --> (str) "NX" to return the networkx graph, "SG" for a StellarGraph
    truename --> if True, label the added apk node with the real app name
    """

    # getting the app name
    direc, app_name = utils.dir_and_app(inFP)

    try:
        networkx = nx.read_gml(inFP)
    except Exception:
        return inFP + " might be broken!"

    nx_nodes = np.array(networkx.nodes(data=True))
    nx_edges = np.array(networkx.edges, dtype=object)

    newnodes = [API_abstraction(kind, node) for node in nx_nodes]
    newedges = [edge_processing(kind, edge) for edge in nx_edges]

    G = nx.MultiDiGraph()
    G.add_nodes_from(newnodes)
    G.add_edges_from(newedges)
    if not truename:
        G = add_apk_node(G, "")
    else:
        G = add_apk_node(G, app_name)
    metapaths = dfs(G, app_name)

    if to_return == "NX":
        return [G, metapaths]
    elif to_return == "SG":
        stellar = StellarGraph.from_networkx(G, node_type_attr="type")
        return [stellar, metapaths]
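
An illustrative call, assuming a placeholder .gml.bz2 path; "SG" yields a StellarGraph ready for the walk generator, while "NX" returns the raw networkx graph:

# hypothetical usage; the input path is a placeholder
stellar, metapaths = API_abstraction_vectorized(
    "apps/some_app.gml.bz2", "", "FAMILY", "SG", truename=True)
print(stellar.info())
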
Code example #5
def get_markov(inFP, outFP, kind):
    """
    obtains the markov chain for one app
    
    inFP --> input file path (should be .gml.bz2)
    outFP --> output directory
    kind --> (str) FAMILY or PACKAGE
    
    """

    direc, app_name = utils.dir_and_app(inFP)
    outputfp = os.path.join(outFP, (app_name + "_" + kind + ".txt"))
    if os.path.exists(outputfp):
        print("app ", inFP, " is already done!")

    else:
        try:
            networkx = nx.read_gml(inFP)
        except Exception:
            return inFP + " might be broken!"

        nx_nodes = np.array(networkx.nodes())
        nx_edges = np.array(networkx.edges, dtype=object)

        # convert to package/family mode
        vfunc = np.vectorize(get_package_family)
        newnodes = vfunc(kind, nx_nodes)
        new_edges = []
        for edge in nx_edges:
            new_edges.append(edge_processing(edge, kind))
        G = nx.MultiDiGraph()
        G.add_nodes_from(newnodes)
        G.add_edges_from(new_edges)
        stellar = StellarGraph.from_networkx(G)

        # step2: markov chain
        ## Set of possible states of the Markov chain is denoted as S
        ## If Sj and Sk are two connected states, Pjk denotes P(transition from Sj to Sk)
        ## Pjk is # occurrences(Ojk), or edges (from j to k), divided by # of all occurrences
        ## (a toy numeric illustration of this follows after the function)
        ## Pjk = # of Edge(j, k) / # total edges
        if kind == "PACKAGE":
            possible_packages = get_possible_packages()
            S = ["/".join(item).strip()
                 for item in possible_packages] + ["self_defined"]
            possible_edges = get_possible_edges()
        elif kind == "FAMILY":
            possible_packages = POSSIBLE_FAMILIES
            possible_edges = get_possible_family_edges()
            S = possible_packages + ["self_defined"]
        total_edges = stellar.number_of_edges()
        markov = []
        counts_nd_stuff = pd.Series(stellar.edges()).value_counts()

        for j in S:
            for k in S:  ## we might have self calling loops
                edge = (j, k)
                try:
                    Pjk = counts_nd_stuff[edge] / total_edges
                    markov.append(Pjk)
                except KeyError:  # edge (j, k) never occurs in this app
                    markov.append(0)

        # build output fp and save
        if (round(sum(markov)) == 1) and (not os.path.exists(outputfp)):
            try:
                np.savetxt(outputfp, markov, fmt="%s")
                print("the app: ", inFP, " is done!", "mode: ", kind)
                return (inFP + " IS FINISHED!")
            except Exception:
                print("the app: ", inFP, " encountered errors!")
Code example #6
def wrapper(apk, target, metapathsFP, walksFP):
    """
    wrapper to build features for doc2vec, metapath2vec. 
    
    apk --> filepath to the apk
    target --> filepath to store common graph txts (for metapath2vec)
    metapathsFP --> filepath to store metapath2vec txts (for metapath2vec)
    walksFP --> filepath to store metapths2vec walks txt (for doc2vec)
    
    """
    if ".gml.bz2" in apk:
        direc, appname = utils.dir_and_app(apk)
    else:
        appname = apk

    document_out = os.path.join(walksFP, (appname + "m2v_walks.txt"))
    metapath_out = os.path.join(metapathsFP, (appname + "m2v_metapaths.txt"))
    graph_out = os.path.join(target, (appname + "graph.txt"))

    if (os.path.exists(document_out) and os.path.exists(metapath_out)
            and os.path.exists(graph_out)):
        print("the app: ", apk, " is already done!")
    else:
        try:
            networkx, metapaths = API_abstraction_vectorized(
                apk, "", "CLASS", "NX", True)
            stellar = StellarGraph.from_networkx(networkx,
                                                 node_type_attr="type")
            ################## COMMON GRAPH INFORMATION ##################
            if not os.path.exists(graph_out):
                with open(graph_out, 'a') as file:
                    for edge in np.array(networkx.edges):
                        node1, node2, weight = edge
                        type1 = networkx.nodes[node1]["type"]
                        type2 = networkx.nodes[node2]["type"]

                        # columns are: ["node1", "node2", "weight", "type1", "type2"]
                        row = " ".join([node1, node2, weight, type1, type2
                                        ]) + "\n"

                        file.write(row)
            ##############################################################

            ################## DOC2VEC AND METAPATH2VEC INFORMATION ##################
            try:
                # OUTPUT WALKS OF ONE APP
                if not os.path.exists(document_out):
                    document = metapath2vec(stellar, 500, metapaths)
                    np.savetxt(document_out, np.hstack(document), fmt="%s")

                # OUTPUT METAPATHS OF ONE APP
                if not os.path.exists(metapath_out):
                    joined = ["->".join(lst) for lst in metapaths]
                    np.savetxt(metapath_out, joined, fmt="%s")
                print("the app: ", apk, " has finished!")
            except Exception:
                print("The app: ", apk, " seems to be broken!")

        except Exception:
            print("The app: ", apk, " seems to be broken!")
Code example #7
def metapath2vec(commonFP,
                 train_apps,
                 metapathFP,
                 app_ids_fp,
                 walksFP,
                 mdl_fp,
                 walk_length=100,
                 reduced=False,
                 subset=False,
                 testing=False):
    """
    wrapper function
    performs metapath2vec on the commongraph
    outputs: X and y
    
    commonFP --> filepath to the directory containing txts for common graph 
            txts columns are --> ["node1", "node2", "weight", "type1", "type2"]
            
    metapathFP --> filepath to the directory of metapaths of separate apps (from doc2vec)
    app_ids_fp --> filepath to the csv containing app and label information
    walksFP --> filepath too the directory of walks of separate apps (from doc2vec)
    
    """
    df = pd.read_csv(app_ids_fp)
    if testing:
        train_apps = [utils.dir_and_app(item)[1] for item in train_apps]

    # get the common graph
    commongraph = get_commongraph(commonFP, train_apps, subset)

    # get the metapaths
    if reduced:
        metapaths = reduced_metapaths(metapathFP)
    else:
        metapaths = full_metapaths(metapathFP)

    # get root nodes as indices
    nodes, indices = root_nodes(commongraph, metapaths)

    # run metapath2vec
    walks = run(commongraph, indices, metapaths)

    if os.path.exists(mdl_fp):
        print("model already exists, will load it in .....")
        model = Word2Vec.load(mdl_fp)
    else:
        # gensim word2vec (`size`/`iter` are the gensim 3.x keyword names;
        # gensim 4 renamed them to `vector_size`/`epochs`)
        model = Word2Vec(walks,
                         size=128,
                         window=5,
                         min_count=0,
                         sg=1,
                         workers=2,
                         iter=1)
        model.save(mdl_fp)

    X = []
    y = []

    for node in nodes:
        X.append(build_embedding(node, walksFP, model))
        try:
            y.append(df[df.app_fp.str.contains(node)].app_label.iloc[0])
        except Exception:  # no matching row in the csv for this node
            y.append(0)

    return [X, y]
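
A possible final step, assuming placeholder paths throughout; the embeddings and labels returned here can be handed straight to a scikit-learn classifier:

# hypothetical usage; every path below is a placeholder
from sklearn.svm import SVC

X, y = metapath2vec("common_graphs/", train_apps, "metapaths/", "app_ids.csv",
                    "walks/", "m2v_model.bin")
clf = SVC().fit(X, y)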