Example no. 1
def make_stellargraph(src):
    '''
    Create a StellarGraph network of apps, API calls, blocks, packages, and invoke types.

    Returns
    -------
    An instance of the StellarGraph network representation of the files found in the
    directory given by "key_directory" in the "config/dict_build.json" file.
    '''
    #get dictionaries of relationships
    A = get_jsons(src, "dict_A.json")
    B = get_jsons(src, "dict_B.json")
    P = get_jsons(src, "dict_P.json")
    C = get_jsons(src, "api_calls.json")

    #get all nodes
    a_nodes = IndexedArray(index=list(set(A.keys())))
    b_nodes = IndexedArray(index=list(set(B.keys())))
    c_nodes = IndexedArray(index=list(set(C.keys())))
    p_nodes = IndexedArray(index=list(set(P.keys())))
    print("Nodes Created")

    graph_nodes = {
        "app_nodes": a_nodes,
        "block_nodes": b_nodes,
        "api_call_nodes": c_nodes,
        #"invoke_type_nodes":i_nodes,
        "package_nodes": p_nodes
    }

    #get all edges
    a_edges = np.array(list(nx.Graph(A).edges))
    b_edges = np.array(list(nx.Graph(B).edges))
    p_edges = np.array(list(nx.Graph(P).edges))
    print("Edges computed")

    #np.concatenate contributes to majority of runtime for make_stellargraph(src)
    edges = pd.DataFrame(np.concatenate(
        (a_edges, b_edges, p_edges))).rename(columns={
            0: "source",
            1: "target"
        })
    print("Concatted")

    # drop self-loop edges (source == target)
    length0 = edges.shape[0]

    removed = list(edges.loc[edges.source == edges.target].target)

    edges = edges.loc[edges.source != edges.target].copy()

    length1 = edges.shape[0]
    if length0 - length1 != 0:
        print("Removed %i repeated keys" % (length0 - length1))
        for r in removed:
            print(r)
    return sg(graph_nodes, edges)
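A minimal, self-contained sketch of the pattern above, with illustrative ids rather than the real data: feature-less IndexedArray node sets keyed by type, plus a source/target edge DataFrame.

import pandas as pd
from stellargraph import StellarGraph, IndexedArray

# feature-less nodes: ids only, one IndexedArray per node type
app_nodes = IndexedArray(index=["app0", "app1"])
api_nodes = IndexedArray(index=["api0"])

edges = pd.DataFrame({"source": ["app0", "app1"],
                      "target": ["api0", "api0"]})

g = StellarGraph({"app_nodes": app_nodes, "api_call_nodes": api_nodes}, edges)
print(g.info())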
Example no. 2
def intra_and_inter(pep, hla, after_pca):
    source, target = Graph_Constructor.combinator(pep, hla)
    combine = list(itertools.product(source, target))
    weight = itertools.repeat(2, len(source) * len(target))
    edges_inter = pd.DataFrame({
        'source': [item[0] for item in combine],
        'target': [item[1] for item in combine],
        'weight': weight
    })
    intra_pep = list(itertools.combinations(source, 2))
    intra_hla = list(itertools.combinations(target, 2))
    intra = intra_pep + intra_hla
    weight = itertools.repeat(1, len(intra))
    edges_intra = pd.DataFrame({
        'source': [item[0] for item in intra],
        'target': [item[1] for item in intra],
        'weight': weight
    })
    edges = pd.concat([edges_inter, edges_intra])
    edges = edges.set_index(pd.Index(np.arange(edges.shape[0])))
    feature_array = Graph_Constructor.numerical(pep, hla, after_pca)
    nodes = IndexedArray(feature_array, index=source + target)
    graph = StellarGraph(nodes,
                         edges,
                         node_type_default='corner',
                         edge_type_default='line')
    return graph
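The manual set_index(pd.Index(np.arange(...))) at the end is equivalent to pandas' built-in reset_index(drop=True); a tiny sketch with made-up rows:

import pandas as pd

inter = pd.DataFrame({"source": ["p1"], "target": ["h1"], "weight": [2]})
intra = pd.DataFrame({"source": ["p1"], "target": ["p2"], "weight": [1]})
edges = pd.concat([inter, intra]).reset_index(drop=True)  # fresh 0..n-1 index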
Example no. 3
def arange_graph(request):
    shape = (3, 7, 11) if request.param == "multivariate" else (3, 7)
    total_elems = np.prod(shape)  # np.product was removed in NumPy 2.0
    nodes = IndexedArray(np.arange(total_elems).reshape(shape) / total_elems,
                         index=["a", "b", "c"])
    edges = pd.DataFrame({"source": ["a", "b"], "target": ["b", "c"]})
    return StellarGraph(nodes, edges)
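This fixture relies on IndexedArray accepting rows with more than one trailing dimension (e.g. timesteps x variables per node); a minimal sketch:

import numpy as np
from stellargraph import IndexedArray

values = np.arange(3 * 7 * 11).reshape(3, 7, 11) / (3 * 7 * 11)
nodes = IndexedArray(values, index=["a", "b", "c"])
assert nodes.values.shape == (3, 7, 11)  # first axis matches the node index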
Example no. 4
def example_hin_1(
    feature_sizes=None,
    is_directed=False,
    self_loop=False,
    reverse_order=False,
    edge_features=False,
) -> StellarGraph:
    def features(label, ids):
        if feature_sizes is None:
            return None
        else:
            feature_size = feature_sizes.get(label, 10)
            return repeated_features(ids, feature_size)

    a_ids = [0, 1, 2, 3]
    if reverse_order:
        a_ids = a_ids[::-1]
    a = IndexedArray(features("A", a_ids), index=a_ids)

    b_ids = [4, 5, 6]
    if reverse_order:
        b_ids = b_ids[::-1]
    b = IndexedArray(features("B", b_ids), index=b_ids)

    r_edges = [(4, 0), (1, 5), (1, 4), (2, 4), (5, 3)]
    f_edges, f_index = [(4, 5)], [100]
    if self_loop:
        # make it a multigraph, across types and within a single one
        r_edges.append((5, 5))
        f_edges.extend([(5, 5), (5, 5)])
        f_index.extend([101, 102])

    r = pd.DataFrame(r_edges, columns=["source", "target"])

    # add some weights for the f edges, but not others
    f_columns = ["source", "target", "weight"]
    for i, src_tgt in enumerate(f_edges):
        f_edges[i] = src_tgt + (10 + i,)

    f = pd.DataFrame(f_edges, columns=f_columns, index=f_index)

    if edge_features:
        r = r.join(pd.DataFrame(-features("R", r.index), index=r.index))
        f = f.join(pd.DataFrame(-features("F", f.index), index=f.index))

    cls = StellarDiGraph if is_directed else StellarGraph
    return cls(nodes={"A": a, "B": b}, edges={"R": r, "F": f})
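A condensed sketch of the heterogeneous construction used above; the dict keys become the node and edge type names:

import pandas as pd
from stellargraph import StellarGraph, IndexedArray

a = IndexedArray(index=[0, 1])
b = IndexedArray(index=[4, 5])
r = pd.DataFrame([(0, 4), (1, 5)], columns=["source", "target"])
# distinct index so edge ids don't clash with r's default 0..n-1 index
f = pd.DataFrame([(4, 5)], columns=["source", "target"], index=[100])

g = StellarGraph(nodes={"A": a, "B": b}, edges={"R": r, "F": f})
print(g.info())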
def test_indexed_array_invalid():
    values = np.random.rand(3, 4, 5)

    with pytest.raises(TypeError,
                       match="values: expected a NumPy array .* found int"):
        IndexedArray(123)

    with pytest.raises(
            ValueError,
            match=
            r"values: expected an array with shape .* found shape \(\) of length 0",
    ):
        IndexedArray(np.zeros(()))

    with pytest.raises(
            ValueError,
            match=
            r"values: expected an array with shape .* found shape \(123,\) of length 1",
    ):
        IndexedArray(np.zeros(123))

    # check that the index `len`-failure works with or without index inference
    with pytest.raises(TypeError,
                       match="index: expected a sequence .* found int"):
        IndexedArray(index=0)

    with pytest.raises(TypeError,
                       match="index: expected a sequence .* found int"):
        IndexedArray(values, index=123)

    with pytest.raises(
            ValueError,
            match="values: expected the index length 2 .* found 3 rows"):
        IndexedArray(values, index=range(0, 3, 2))
def test_indexed_array_non_empty():
    list_ids = ["a", "b", "c"]
    array_ids = np.array([10, -1, 2])
    range_ids = range(106, 100, -2)

    values = np.random.rand(3, 4, 5)

    # this test uses 'is' checks to validate that there's no copying of data
    frame = IndexedArray(values)
    assert frame.index == range(3)
    assert frame.values is values

    frame = IndexedArray(values, index=list_ids)
    assert frame.index is list_ids
    assert frame.values is values

    frame = IndexedArray(values, index=array_ids)
    assert frame.index is array_ids
    assert frame.values is values

    frame = IndexedArray(values, index=range_ids)
    assert frame.index is range_ids
    assert frame.values is values
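The `is` assertions above pin down that IndexedArray wraps its inputs without copying, so mutations of the source array remain visible through .values:

import numpy as np
from stellargraph import IndexedArray

values = np.zeros((2, 3))
frame = IndexedArray(values, index=["x", "y"])
values[0, 0] = 1.0
assert frame.values[0, 0] == 1.0  # same underlying buffer, no copy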
def build_graph(outfolder, app_data_list, nodes_path, edge_path):
#     with Client() as client, performance_report(os.path.join(outfolder, "performance_report.html")):
#     print(f"Dask Cluster: {client.cluster}")
#     print(f"Dashboard port: {client.scheduler_info()['services']['dashboard']}")

    data = dd.read_csv(list(app_data_list), dtype=str).compute()

    nodes = {}
    api_map = None

    # setup edges.csv
    pd.DataFrame(columns=['source', 'target']).to_csv(edge_path, index=False)

    for label in ['api', 'app', 'method', 'package']:
        print(f'Indexing {label}s')
#         uid_map = data[label].unique()
        uid_map = pd.DataFrame()
        uid_map[label] = data[label].unique()

#             if base_data is not None: # load base items
#                 base_items = pd.read_csv(
#                     os.path.join(base_data, label+'_map.csv'),
#                     usecols=[label]
#                 )
#                 uid_map = pd.concat([base_items, uid_map], ignore_index=True).drop_duplicates().reset_index(drop=True)

        uid_map['uid'] = label + pd.Series(uid_map.index).astype(str)
        uid_map = uid_map.set_index(label)
        uid_map.to_csv(os.path.join(outfolder, label+'_map.csv'))
        nodes[label] = IndexedArray(index=uid_map.uid.values)

        # get edges if not api
        if label == 'api':
            api_map = uid_map.uid  # create api map
        else:
            print(f'Finding {label}-api edges')
            edges = data[[label, 'api']].drop_duplicates()
            edges[label] = edges[label].map(uid_map.uid)
            edges['api'] = edges['api'].map(api_map)
            edges.to_csv(edge_path, mode='a', index=False, header=False)

    del data
    
    # save nodes to file
    with open(nodes_path, 'wb') as file:
        pickle.dump(nodes, file)

    return StellarGraph(nodes=nodes, edges=pd.read_csv(edge_path))
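The core uid-mapping step in miniature (made-up data): each unique raw value gets a compact typed id, and the edge list is rewritten through that mapping.

import pandas as pd

data = pd.DataFrame({"app": ["x.apk", "y.apk"], "api": ["f()", "f()"]})

uid_map = pd.DataFrame({"app": data["app"].unique()})
uid_map["uid"] = "app" + pd.Series(uid_map.index).astype(str)  # app0, app1, ...
uid_map = uid_map.set_index("app")

edges = data[["app", "api"]].drop_duplicates()
edges["app"] = edges["app"].map(uid_map.uid)  # x.apk -> app0, y.apk -> app1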
Example no. 8
def unweight_edge(pep, hla, after_pca):
    source, target = Graph_Constructor.combinator(pep, hla)
    combine = list(itertools.product(source, target))
    weight = itertools.repeat(1, len(source) * len(target))
    edges = pd.DataFrame({
        'source': [item[0] for item in combine],
        'target': [item[1] for item in combine],
        'weight': weight
    })
    feature_array = Graph_Constructor.numerical(pep, hla, after_pca)
    try:
        nodes = IndexedArray(feature_array, index=source + target)
    except ValueError:
        # log the offending inputs, then re-raise: swallowing the error would
        # leave `nodes` undefined and mask the real problem with a NameError
        print(pep, hla, feature_array.shape)
        raise
    graph = StellarGraph(nodes,
                         edges,
                         node_type_default='corner',
                         edge_type_default='line')
    return graph
Example no. 9
def weight_anchor_edge(pep, hla, after_pca):
    source, target = Graph_Constructor.combinator(pep, hla)
    combine = list(itertools.product(source, target))
    weight = itertools.repeat(1, len(source) * len(target))
    edges = pd.DataFrame({
        'source': [item[0] for item in combine],
        'target': [item[1] for item in combine],
        'weight': weight
    })
    # up-weight anchor positions; assign via .loc, since chained indexing
    # like edges.iloc[i]['weight'] = ... writes to a copy and is silently lost
    for i in range(edges.shape[0]):
        if edges.loc[i, 'source'] in ('a2', 'a9', 'a10'):
            edges.loc[i, 'weight'] = 1.5
    feature_array = Graph_Constructor.numerical(pep, hla, after_pca)
    nodes = IndexedArray(feature_array, index=source + target)
    graph = StellarGraph(nodes,
                         edges,
                         node_type_default='corner',
                         edge_type_default='line')
    return graph
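The row loop above can also be collapsed into one vectorized assignment; a sketch with made-up rows:

import pandas as pd

edges = pd.DataFrame({"source": ["a1", "a2", "a9"],
                      "target": ["h1", "h1", "h1"],
                      "weight": [1.0, 1.0, 1.0]})
edges.loc[edges["source"].isin(["a2", "a9", "a10"]), "weight"] = 1.5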
Example no. 10
def test_gcn_lstm_generator(multivariate):
    shape = (3, 7, 11) if multivariate else (3, 7)
    total_elems = np.prod(shape)  # np.product was removed in NumPy 2.0
    nodes = IndexedArray(np.arange(total_elems).reshape(shape) / total_elems,
                         index=["a", "b", "c"])
    edges = pd.DataFrame({"source": ["a", "b"], "target": ["b", "c"]})
    graph = StellarGraph(nodes, edges)

    gen = SlidingFeaturesNodeGenerator(graph, 2, batch_size=3)
    gcn_lstm = GCN_LSTM(None, None, [2], [4], generator=gen)

    model = Model(*gcn_lstm.in_out_tensors())

    model.compile("adam", loss="mse")

    history = model.fit(gen.flow(slice(0, 5), target_distance=1))

    predictions = model.predict(gen.flow(slice(5, 7)))

    model2 = Model(*gcn_lstm.in_out_tensors())
    predictions2 = model2.predict(gen.flow(slice(5, 7)))
    np.testing.assert_array_equal(predictions, predictions2)
Example no. 11
def example_graph(
    feature_size=None,
    node_label="default",
    edge_label="default",
    feature_name="feature",
    is_directed=False,
    edge_feature_size=None,
    edge_weights=False,
):
    elist = pd.DataFrame([(1, 2), (2, 3), (1, 4), (4, 2)], columns=["source", "target"])
    if edge_feature_size is not None:
        edge_features = repeated_features(-elist.index, edge_feature_size)
        elist = elist.join(pd.DataFrame(edge_features))
    if edge_weights:
        elist["weight"] = [0.1, 1.0, 20.0, 1.3]

    nodes = [1, 2, 3, 4]
    node_features = repeated_features(nodes, feature_size)

    nodes = IndexedArray(node_features, index=nodes)

    cls = StellarDiGraph if is_directed else StellarGraph
    return cls(nodes={node_label: nodes}, edges={edge_label: elist})
Example no. 12
def get_commongraph(common_graph_txts, train, subset=False):
    """
    gets the large dataframe of edges
    common_graph_txts --> path to the directory of common graph edges
    train_apps --> list of filepaths to the edges txt
    """
    now = datetime.now()
    apps = [
        os.path.join(common_graph_txts, (appname + "graph.txt"))
        for appname in train
    ]
    if subset:
        apps = apps[:10]
    lst_of_dfs = []

    for app in apps:
        if os.path.exists(app):
            df = pd.read_csv(app, delimiter=" ", header=None)
            lst_of_dfs.append(df)

    concat = pd.concat(lst_of_dfs, ignore_index=True)

    concat.columns = ["source", "target", "weight", "type1", "type2"]
    concat.type1 = concat.type1.apply(fix_node_type)
    concat.type2 = concat.type2.apply(fix_node_type)

    no_dup = concat.drop_duplicates(subset="source", keep="last")
    dct = no_dup.groupby([
        'type1'
    ])['source'].apply(lambda grp: list(grp.value_counts().index)).to_dict()

    for key in dct.keys():
        dct[key] = IndexedArray(index=dct[key])

    commongraph = StellarGraph(dct, concat[["source", "target", "weight"]])
    print("common graph loaded: ", (datetime.now() - now))
    return commongraph
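The groupby-to-dict step in miniature: one id list per node type, each wrapped in a feature-less IndexedArray.

import pandas as pd
from stellargraph import IndexedArray

no_dup = pd.DataFrame({"source": ["s1", "s2", "s3"],
                       "type1": ["api", "api", "app"]})
dct = no_dup.groupby("type1")["source"].apply(list).to_dict()
nodes = {t: IndexedArray(index=ids) for t, ids in dct.items()}
# {"api": IndexedArray over ["s1", "s2"], "app": IndexedArray over ["s3"]}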
Example no. 13
def load_graph_data(dataframe,
                    embeddings,
                    name="default",
                    testing=False,
                    num_test=100,
                    using_start=False):

    actor_indeces = []
    actor_features = []
    utterance_indeces = []
    utterance_features = []
    source_edges = []
    target_edges = []

    if testing:
        num_dialogues = num_test
    else:
        num_dialogues = len(dataframe['Dialogue ID'].unique())

    print("Building graph, 1 dialogue at a time...")
    for dialogueID in tqdm(dataframe['Dialogue ID'].unique()[0:num_dialogues]):
        dialogue = dataframe[dataframe["Dialogue ID"] == dialogueID]

        # Loop through all utterances of the dialogue
        for rowidx in range(len(dialogue)):
            row = dialogue.iloc[rowidx]

            # 0. Add actor index-feature if it does not already exist
            actor_idx = f"{row.Actor}_{dialogueID}"
            if actor_idx not in actor_indeces:
                actor_indeces.append(actor_idx)
                if len(actor_features) == 0:
                    # Create new numpy array of actor features
                    actor_features = np.random.normal(0.0, 1.0, [1, 1024])
                else:
                    # Concatenate features to already existing array
                    actor_features = np.concatenate(
                        (actor_features, np.random.normal(0.0, 1.0,
                                                          [1, 1024])),
                        axis=0)
            # 1. Add utterance index-feature (ELMo embeddings)
            utt_idx = f"u_dID{dialogueID}_#{rowidx}"
            utterance_indeces.append(utt_idx)
            # To iterate over the ELMo embeddings we use the index list of the
            # dataset, indexed by the row of the dialogue we are currently parsing
            if len(utterance_features) == 0:
                utterance_features = np.array(
                    [embeddings[dialogue.index[rowidx]]])
            else:
                utterance_features = np.concatenate(
                    (utterance_features,
                     np.array([embeddings[dialogue.index[rowidx]]])),
                    axis=0)

            # 2. Build edges. If this is the first row of a dialogue,
            # begin by drawing an edge from the "START-Node" (source)
            # to the current utterance index (target)
            if using_start and rowidx == 0:
                source_edges.append("START-Node")
                target_edges.append(utt_idx)

            # 3. Construct remaining edges.
            # 3.1 Actor to the utterance
            source_edges.append(actor_idx)
            target_edges.append(utt_idx)
            # 3.2 Utterance to the next utterance
            if (rowidx + 1) != len(dialogue):
                source_edges.append(utt_idx)
                target_edges.append(f"u_dID{dialogueID}_#{rowidx + 1}")
            # 3.3 Utterance to all actors
            for actor in dialogue['Actor'].unique():
                all_actor_idx = f"{actor}_{dialogueID}"
                source_edges.append(utt_idx)
                target_edges.append(all_actor_idx)

    # GraphSAGE does not support nodes of different types, so actor and
    # utterance features are stacked into one homogeneous node set

    if using_start:
        start_features = np.random.normal(0.0, 1.0, [1, 1024])
        start_index = "START-Node"
        node_features = np.concatenate(
            (actor_features, utterance_features, start_features), axis=0)
        node_indeces = actor_indeces + utterance_indeces + [start_index]
    else:
        node_features = np.concatenate((actor_features, utterance_features),
                                       axis=0)
        node_indeces = actor_indeces + utterance_indeces

    nodes = IndexedArray(node_features, node_indeces)

    edges = pd.DataFrame({"source": source_edges, "target": target_edges})

    # GraphSAGE:
    full_graph = StellarDiGraph(nodes, edges)

    targets = pd.Series(
        dataframe['Dialogue Act'].tolist()[0:len(utterance_indeces)],
        index=utterance_indeces)

    print("Check if graph has all properties required for ML/Inference...")
    full_graph.check_graph_for_ml(expensive_check=True)
    print("Check successful.")
    print(full_graph.info())
    print("---- Graph Creation Finished ----")

    netx_graph = full_graph.to_networkx(feature_attr='utterance_embedding')
    # Save graphs for later use.
    if testing:
        pickle.dump((netx_graph, targets),
                    open(f"visualizeGraph/test_{name}_netx.pickle", "wb"))
        pickle.dump((full_graph, targets),
                    open(f"createdGraphs/test_{name}_graph.pickle", "wb"))
    else:
        pickle.dump((netx_graph, targets),
                    open(f"visualizeGraph/{name}_netx.pickle", "wb"))
        pickle.dump((full_graph, targets),
                    open(f"createdGraphs/{name}_graph.pickle", "wb"))

    return full_graph, targets
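The per-row np.concatenate calls above copy the whole feature array on every iteration, making graph building quadratic in the number of utterances; collecting rows in a list and stacking once avoids that. A minimal sketch:

import numpy as np

rows = []
for _ in range(1000):
    rows.append(np.random.normal(0.0, 1.0, 1024))
features = np.stack(rows)  # one allocation: shape (1000, 1024)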
Example no. 14
    def fit_predict(self, path):
        outpath = os.path.join(path, f'm2v-{self.name}')
        os.makedirs(outpath, exist_ok=True)
        # get app data, compute unique apis
        # read_csv(squeeze=True) was removed in pandas 2.0; squeeze afterwards
        apps = pd.read_csv(os.path.join(path, 'app_list.csv'),
                           usecols=['app'],
                           dtype=str).squeeze('columns')
        #         apps = set(apps)
        app_data_list = os.path.join('data', 'out', 'all-apps',
                                     'app-data/') + apps + '.csv'

        print('Computing new edges')
        data = dd.read_csv(list(app_data_list),
                           dtype=str,
                           usecols=['app', 'api']).drop_duplicates().compute()
        data.api = data.api.map(self.api_map)
        data.columns = ['source', 'target']
        data = data.dropna()

        nodes = self.nodes.copy()
        nodes['app'] = IndexedArray(
            index=np.array(list(nodes['app'].index) + list(apps)))
        edges = pd.concat([pd.read_csv(self.edges_path, dtype=str), data],
                          ignore_index=True).reset_index(drop=True)
        g = StellarGraph(nodes=nodes, edges=edges)
        print(g)

        print('Running random walk')
        rw = UniformRandomMetaPathWalk(g)
        walk_args = self.params['walk_args']
        new_walks = rw.run(list(apps),
                           n=walk_args['n'],
                           length=walk_args['length'],
                           metapaths=walk_args['metapaths'])
        metapath_walks = (self.metapath_walks + new_walks)

        print('Running Word2Vec')
        # make features with word2vec
        w2v = Word2Vec(metapath_walks, **self.params['w2v_args'])

        print('Fitting model')
        features = pd.DataFrame(w2v.wv.vectors)
        features['app'] = w2v.wv.index2word
        features['app'] = features['app'].map(
            lambda uid: self.inverse_app_map.get(uid, uid))
        features = features.set_index('app')
        X_train = features.loc[self.app_map.keys()]
        #         X_train = X_train.uid.map(self.inverse_app_map)
        X_test = features.loc[apps]

        # train model and predict new apps
        labels = pd.read_csv('data/out/all-apps/app_list.csv',
                             usecols=['app', 'malware'],
                             index_col='app').squeeze('columns')
        y_test = labels[X_test.index]
        y_train = labels[X_train.index]

        mdl = self.classifier(**self.classifier_args)
        mdl.fit(X_train, y_train)
        pred = mdl.predict(X_test)

        print(classification_report(y_test, pred))

        results = X_test.assign(m2vDroid=pred, true=y_test)

        # save results and training data
        results.to_csv(os.path.join(outpath, 'predictions.csv'))
        X_train.assign(m2vDroid=mdl.predict(X_train), true=y_train).to_csv(
            os.path.join(outpath, 'training_data.csv'))

        return results
def test_indexed_array_empty():
    frame = IndexedArray()
    assert frame.index == range(0)
    np.testing.assert_array_equal(frame.values, np.empty((0, 0)))