def make_stellargraph(src):
    '''
    Create a StellarGraph network of apps, API calls, blocks, packages, and
    invoke types.

    Returns
    -------
    An instance of the StellarGraph network representation of the files found
    in the directory given by the "key_directory" entry of the
    "config/dict_build.json" file.
    '''
    # get dictionaries of relationships
    A = get_jsons(src, "dict_A.json")
    B = get_jsons(src, "dict_B.json")
    P = get_jsons(src, "dict_P.json")
    C = get_jsons(src, "api_calls.json")

    # get all nodes (featureless: index only)
    a_nodes = IndexedArray(index=list(set(A.keys())))
    b_nodes = IndexedArray(index=list(set(B.keys())))
    c_nodes = IndexedArray(index=list(set(C.keys())))
    p_nodes = IndexedArray(index=list(set(P.keys())))
    print("Nodes created")

    graph_nodes = {
        "app_nodes": a_nodes,
        "block_nodes": b_nodes,
        "api_call_nodes": c_nodes,
        # "invoke_type_nodes": i_nodes,
        "package_nodes": p_nodes,
    }

    # get all edges
    a_edges = np.array(list(nx.Graph(A).edges))
    b_edges = np.array(list(nx.Graph(B).edges))
    p_edges = np.array(list(nx.Graph(P).edges))
    print("Edges computed")

    # np.concatenate accounts for the majority of the runtime of make_stellargraph(src)
    edges = pd.DataFrame(
        np.concatenate((a_edges, b_edges, p_edges))
    ).rename(columns={0: "source", 1: "target"})
    print("Concatenated")

    # drop self-loops (rows whose source and target are the same node)
    length0 = edges.shape[0]
    removed = list(edges.loc[edges.source == edges.target].target)
    edges = edges.loc[edges.source != edges.target].copy()
    length1 = edges.shape[0]
    if length0 - length1 != 0:
        print("Removed %i self-loop edges" % (length0 - length1))
        for r in removed:
            print(r)

    # sg is assumed to be an alias for stellargraph.StellarGraph
    return sg(graph_nodes, edges)
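# A minimal, runnable sketch of the pattern make_stellargraph builds on:
# featureless node sets (IndexedArray with only an index) per node type, plus a
# single edge DataFrame. The node ids and type names here are illustrative.
import pandas as pd
from stellargraph import IndexedArray, StellarGraph

apps = IndexedArray(index=["app1", "app2"])
apis = IndexedArray(index=["api1"])
edges = pd.DataFrame({"source": ["app1", "app2"], "target": ["api1", "api1"]})
print(StellarGraph({"app_nodes": apps, "api_call_nodes": apis}, edges).info())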
def intra_and_inter(pep, hla, after_pca):
    # inter edges: every peptide node connects to every HLA node, weight 2
    source, target = Graph_Constructor.combinator(pep, hla)
    combine = list(itertools.product(source, target))
    weight = itertools.repeat(2, len(source) * len(target))
    edges_inter = pd.DataFrame({
        'source': [item[0] for item in combine],
        'target': [item[1] for item in combine],
        'weight': weight,
    })

    # intra edges: all pairs within the peptide set and within the HLA set, weight 1
    intra_pep = list(itertools.combinations(source, 2))
    intra_hla = list(itertools.combinations(target, 2))
    intra = intra_pep + intra_hla
    weight = itertools.repeat(1, len(intra))
    edges_intra = pd.DataFrame({
        'source': [item[0] for item in intra],
        'target': [item[1] for item in intra],
        'weight': weight,
    })

    edges = pd.concat([edges_inter, edges_intra])
    edges = edges.set_index(pd.Index(np.arange(edges.shape[0])))

    feature_array = Graph_Constructor.numerical(pep, hla, after_pca)
    nodes = IndexedArray(feature_array, index=source + target)
    graph = StellarGraph(nodes, edges,
                         node_type_default='corner',
                         edge_type_default='line')
    return graph
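# Sanity check (illustrative sizes, hypothetical ids) of the edge counts the
# function above builds: len(source) * len(target) inter edges, plus
# C(len(source), 2) + C(len(target), 2) intra edges.
import itertools

source, target = ["p1", "p2", "p3"], ["h1", "h2"]
inter = list(itertools.product(source, target))  # 3 * 2 = 6 inter edges
intra = (list(itertools.combinations(source, 2))
         + list(itertools.combinations(target, 2)))  # C(3,2) + C(2,2) = 3 + 1
assert len(inter) == 6 and len(intra) == 4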
# decorator assumed: request.param is only available on pytest fixtures, and
# the params are inferred from the check against "multivariate" below
@pytest.fixture(params=["univariate", "multivariate"])
def arange_graph(request):
    shape = (3, 7, 11) if request.param == "multivariate" else (3, 7)
    total_elems = np.prod(shape)  # np.product is a deprecated alias of np.prod
    nodes = IndexedArray(
        np.arange(total_elems).reshape(shape) / total_elems, index=["a", "b", "c"]
    )
    edges = pd.DataFrame({"source": ["a", "b"], "target": ["b", "c"]})
    return StellarGraph(nodes, edges)
def example_hin_1(
    feature_sizes=None,
    is_directed=False,
    self_loop=False,
    reverse_order=False,
    edge_features=False,
) -> StellarGraph:
    def features(label, ids):
        if feature_sizes is None:
            return None
        else:
            feature_size = feature_sizes.get(label, 10)
            return repeated_features(ids, feature_size)

    a_ids = [0, 1, 2, 3]
    if reverse_order:
        a_ids = a_ids[::-1]
    a = IndexedArray(features("A", a_ids), index=a_ids)

    b_ids = [4, 5, 6]
    if reverse_order:
        b_ids = b_ids[::-1]
    b = IndexedArray(features("B", b_ids), index=b_ids)

    r_edges = [(4, 0), (1, 5), (1, 4), (2, 4), (5, 3)]
    f_edges, f_index = [(4, 5)], [100]
    if self_loop:
        # make it a multigraph, across types and within a single one
        r_edges.append((5, 5))
        f_edges.extend([(5, 5), (5, 5)])
        f_index.extend([101, 102])

    r = pd.DataFrame(r_edges, columns=["source", "target"])

    # add some weights for the f edges, but not others
    f_columns = ["source", "target", "weight"]
    for i, src_tgt in enumerate(f_edges):
        f_edges[i] = src_tgt + (10 + i,)

    f = pd.DataFrame(f_edges, columns=f_columns, index=f_index)

    if edge_features:
        r = r.join(pd.DataFrame(-features("R", r.index), index=r.index))
        f = f.join(pd.DataFrame(-features("F", f.index), index=f.index))

    cls = StellarDiGraph if is_directed else StellarGraph
    return cls(nodes={"A": a, "B": b}, edges={"R": r, "F": f})
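# Sketch of the heterogeneous construction example_hin_1 exercises: per-type
# node features via IndexedArray, per-type edge DataFrames (one of them
# weighted and with explicit edge ids), and directed vs. undirected chosen by
# class. All values here are illustrative.
import numpy as np
import pandas as pd
from stellargraph import IndexedArray, StellarGraph

a = IndexedArray(np.ones((4, 2)), index=[0, 1, 2, 3])
b = IndexedArray(np.zeros((3, 2)), index=[4, 5, 6])
r = pd.DataFrame({"source": [4, 1], "target": [0, 5]})
f = pd.DataFrame({"source": [4], "target": [5], "weight": [10.0]}, index=[100])
g = StellarGraph(nodes={"A": a, "B": b}, edges={"R": r, "F": f})
print(g.info())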
def test_indexed_array_invalid():
    values = np.random.rand(3, 4, 5)

    with pytest.raises(TypeError, match="values: expected a NumPy array .* found int"):
        IndexedArray(123)

    with pytest.raises(
        ValueError,
        match=r"values: expected an array with shape .* found shape \(\) of length 0",
    ):
        IndexedArray(np.zeros(()))

    with pytest.raises(
        ValueError,
        match=r"values: expected an array with shape .* found shape \(123,\) of length 1",
    ):
        IndexedArray(np.zeros(123))

    # check that the index `len`-failure works with or without index inference
    with pytest.raises(TypeError, match="index: expected a sequence .* found int"):
        IndexedArray(index=0)

    with pytest.raises(TypeError, match="index: expected a sequence .* found int"):
        IndexedArray(values, index=123)

    with pytest.raises(
        ValueError, match="values: expected the index length 2 .* found 3 rows"
    ):
        IndexedArray(values, index=range(0, 3, 2))
def test_indexed_array_non_empty():
    list_ids = ["a", "b", "c"]
    array_ids = np.array([10, -1, 2])
    range_ids = range(106, 100, -2)

    values = np.random.rand(3, 4, 5)

    # this test uses 'is' checks to validate that there's no copying of data
    frame = IndexedArray(values)
    assert frame.index == range(3)
    assert frame.values is values

    frame = IndexedArray(values, index=list_ids)
    assert frame.index is list_ids
    assert frame.values is values

    frame = IndexedArray(values, index=array_ids)
    assert frame.index is array_ids
    assert frame.values is values

    frame = IndexedArray(values, index=range_ids)
    assert frame.index is range_ids
    assert frame.values is values
def build_graph(outfolder, app_data_list, nodes_path, edge_path):
    # with Client() as client, performance_report(os.path.join(outfolder, "performance_report.html")):
    #     print(f"Dask Cluster: {client.cluster}")
    #     print(f"Dashboard port: {client.scheduler_info()['services']['dashboard']}")
    data = dd.read_csv(list(app_data_list), dtype=str).compute()
    nodes = {}
    api_map = None

    # set up edges.csv
    pd.DataFrame(columns=['source', 'target']).to_csv(edge_path, index=False)

    for label in ['api', 'app', 'method', 'package']:
        print(f'Indexing {label}s')
        # uid_map = data[label].unique()
        uid_map = pd.DataFrame()
        uid_map[label] = data[label].unique()
        # if base_data is not None:  # load base items
        #     base_items = pd.read_csv(
        #         os.path.join(base_data, label + '_map.csv'),
        #         usecols=[label]
        #     )
        #     uid_map = pd.concat([base_items, uid_map], ignore_index=True).drop_duplicates().reset_index(drop=True)
        uid_map['uid'] = label + pd.Series(uid_map.index).astype(str)
        uid_map = uid_map.set_index(label)
        uid_map.to_csv(os.path.join(outfolder, label + '_map.csv'))
        nodes[label] = IndexedArray(index=uid_map.uid.values)

        # get edges if not api
        if label == 'api':
            api_map = uid_map.uid  # create api map
        else:
            print(f'Finding {label}-api edges')
            edges = data[[label, 'api']].drop_duplicates()
            edges[label] = edges[label].map(uid_map.uid)
            edges['api'] = edges['api'].map(api_map)
            edges.to_csv(edge_path, mode='a', index=False, header=False)

    del data

    # save nodes to file
    with open(nodes_path, 'wb') as file:
        pickle.dump(nodes, file)

    return StellarGraph(nodes=nodes, edges=pd.read_csv(edge_path))
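# Tiny illustration (made-up values) of the uid scheme above: each unique raw
# value gets "<label><row position>" as its node id, and the resulting map is
# used to translate raw values into node ids when writing edges.
import pandas as pd

uid_map = pd.DataFrame({"api": ["getDeviceId", "sendTextMessage"]})
uid_map["uid"] = "api" + pd.Series(uid_map.index).astype(str)
print(uid_map.set_index("api").uid.to_dict())
# {'getDeviceId': 'api0', 'sendTextMessage': 'api1'}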
def unweight_edge(pep, hla, after_pca):
    source, target = Graph_Constructor.combinator(pep, hla)
    combine = list(itertools.product(source, target))
    weight = itertools.repeat(1, len(source) * len(target))
    edges = pd.DataFrame({
        'source': [item[0] for item in combine],
        'target': [item[1] for item in combine],
        'weight': weight,
    })
    feature_array = Graph_Constructor.numerical(pep, hla, after_pca)
    try:
        nodes = IndexedArray(feature_array, index=source + target)
    except Exception:
        # log the offending inputs, then re-raise: swallowing the error would
        # leave `nodes` undefined on the next line
        print(pep, hla, feature_array.shape)
        raise
    graph = StellarGraph(nodes, edges,
                         node_type_default='corner',
                         edge_type_default='line')
    return graph
def weight_anchor_edge(pep, hla, after_pca):
    source, target = Graph_Constructor.combinator(pep, hla)
    combine = list(itertools.product(source, target))
    weight = itertools.repeat(1, len(source) * len(target))
    edges = pd.DataFrame({
        'source': [item[0] for item in combine],
        'target': [item[1] for item in combine],
        'weight': weight,
    })
    # up-weight edges whose source is an anchor position (a2, a9, a10);
    # assign through a single .loc call -- chained indexing like
    # edges.iloc[i]['weight'] = 1.5 writes to a temporary copy and is lost
    anchors = {'a2', 'a9', 'a10'}
    edges.loc[edges['source'].isin(anchors), 'weight'] = 1.5
    feature_array = Graph_Constructor.numerical(pep, hla, after_pca)
    nodes = IndexedArray(feature_array, index=source + target)
    graph = StellarGraph(nodes, edges,
                         node_type_default='corner',
                         edge_type_default='line')
    return graph
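# A minimal, self-contained demonstration of why the original loop above
# silently failed: pandas chained indexing writes to a temporary copy.
import pandas as pd

df = pd.DataFrame({'source': ['a2', 'b1'], 'weight': [1, 1]})
df.iloc[0]['weight'] = 1.5  # writes to a copy; df is unchanged
assert df.loc[0, 'weight'] == 1
df.loc[df['source'] == 'a2', 'weight'] = 1.5  # single .loc call mutates df
assert df.loc[0, 'weight'] == 1.5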
def test_gcn_lstm_generator(multivariate):
    shape = (3, 7, 11) if multivariate else (3, 7)
    total_elems = np.prod(shape)  # np.product is a deprecated alias of np.prod
    nodes = IndexedArray(
        np.arange(total_elems).reshape(shape) / total_elems, index=["a", "b", "c"]
    )
    edges = pd.DataFrame({"source": ["a", "b"], "target": ["b", "c"]})
    graph = StellarGraph(nodes, edges)

    gen = SlidingFeaturesNodeGenerator(graph, 2, batch_size=3)
    gcn_lstm = GCN_LSTM(None, None, [2], [4], generator=gen)

    model = Model(*gcn_lstm.in_out_tensors())
    model.compile("adam", loss="mse")
    history = model.fit(gen.flow(slice(0, 5), target_distance=1))
    predictions = model.predict(gen.flow(slice(5, 7)))

    # a second model built from the same layers shares weights, so it should
    # produce identical predictions
    model2 = Model(*gcn_lstm.in_out_tensors())
    predictions2 = model2.predict(gen.flow(slice(5, 7)))
    np.testing.assert_array_equal(predictions, predictions2)
def example_graph(
    feature_size=None,
    node_label="default",
    edge_label="default",
    feature_name="feature",
    is_directed=False,
    edge_feature_size=None,
    edge_weights=False,
):
    elist = pd.DataFrame(
        [(1, 2), (2, 3), (1, 4), (4, 2)], columns=["source", "target"]
    )
    if edge_feature_size is not None:
        edge_features = repeated_features(-elist.index, edge_feature_size)
        elist = elist.join(pd.DataFrame(edge_features))
    if edge_weights:
        elist["weight"] = [0.1, 1.0, 20.0, 1.3]

    nodes = [1, 2, 3, 4]
    node_features = repeated_features(nodes, feature_size)
    nodes = IndexedArray(node_features, index=nodes)

    cls = StellarDiGraph if is_directed else StellarGraph
    return cls(nodes={node_label: nodes}, edges={edge_label: elist})
def get_commongraph(common_graph_txts, train, subset=False):
    """
    Builds the large dataframe of edges and wraps it in a StellarGraph.

    common_graph_txts --> path to the directory of common graph edges
    train --> list of app names; each maps to "<name>graph.txt" in that directory
    """
    now = datetime.now()
    apps = [
        os.path.join(common_graph_txts, appname + "graph.txt")
        for appname in train
    ]
    if subset:
        apps = apps[:10]

    lst_of_dfs = []
    for app in apps:
        if os.path.exists(app):
            df = pd.read_csv(app, delimiter=" ", header=None)
            lst_of_dfs.append(df)

    concat = pd.concat(lst_of_dfs, ignore_index=True)
    concat.columns = ["source", "target", "weight", "type1", "type2"]
    concat.type1 = concat.type1.apply(fix_node_type)
    concat.type2 = concat.type2.apply(fix_node_type)

    # keep one row per source node, then group the sources by node type
    no_dup = concat.drop_duplicates(subset="source", keep="last")
    dct = no_dup.groupby(['type1'])['source'].apply(
        lambda grp: list(grp.value_counts().index)
    ).to_dict()
    for key in dct.keys():
        dct[key] = IndexedArray(index=dct[key])

    commongraph = StellarGraph(dct, concat[["source", "target", "weight"]])
    print("common graph loaded: ", datetime.now() - now)
    return commongraph
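# Small illustration (made-up rows) of the groupby step above: bucket source
# ids by node type, then wrap each bucket as a featureless IndexedArray.
import pandas as pd
from stellargraph import IndexedArray

no_dup = pd.DataFrame({"source": ["a", "b", "c"], "type1": ["app", "api", "app"]})
dct = no_dup.groupby(["type1"])["source"].apply(
    lambda grp: list(grp.value_counts().index)
).to_dict()
nodes = {k: IndexedArray(index=v) for k, v in dct.items()}
print({k: len(v.index) for k, v in nodes.items()})  # {'api': 1, 'app': 2}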
def load_graph_data(dataframe, embeddings, name="default", testing=False,
                    num_test=100, using_start=False):
    actor_indeces = []
    actor_features = []
    utterance_indeces = []
    utterance_features = []
    source_edges = []
    target_edges = []

    if testing:
        num_dialogues = num_test
    else:
        num_dialogues = len(dataframe['Dialogue ID'].unique())

    print("Building graph, 1 dialogue at a time...")
    for dialogueID in tqdm(dataframe['Dialogue ID'].unique()[0:num_dialogues]):
        dialogue = dataframe[dataframe["Dialogue ID"] == dialogueID]
        # Loop through all utterances of the dialogue
        for rowidx in range(len(dialogue)):
            row = dialogue.iloc[rowidx]

            # 0. Add actor index-feature if it does not already exist
            actor_idx = f"{row.Actor}_{dialogueID}"
            if actor_idx not in actor_indeces:
                actor_indeces.append(actor_idx)
                if len(actor_features) == 0:
                    # Create new numpy array of actor features
                    actor_features = np.random.normal(0.0, 1.0, [1, 1024])
                else:
                    # Concatenate features to already existing array
                    actor_features = np.concatenate(
                        (actor_features, np.random.normal(0.0, 1.0, [1, 1024])),
                        axis=0)

            # 1. Add utterance index-feature (ELMo embeddings)
            utt_idx = f"u_dID{dialogueID}_#{rowidx}"
            utterance_indeces.append(utt_idx)
            # To iterate over the ELMo embeddings we use the index list of the
            # dataset, indexed by the row of the dialogue we are currently parsing
            if len(utterance_features) == 0:
                utterance_features = np.array(
                    [embeddings[dialogue.index[rowidx]]])
            else:
                utterance_features = np.concatenate(
                    (utterance_features,
                     np.array([embeddings[dialogue.index[rowidx]]])),
                    axis=0)

            # 2. Build edges. If this is the first row of a dialogue,
            # begin by drawing an edge from the "START-Node" (source)
            # to the current utterance index (target)
            if using_start and rowidx == 0:
                source_edges.append("START-Node")
                target_edges.append(utt_idx)

            # 3. Construct remaining edges.
            # 3.1 Actor to the utterance
            source_edges.append(actor_idx)
            target_edges.append(utt_idx)
            # 3.2 Utterance to the next utterance
            if (rowidx + 1) != len(dialogue):
                source_edges.append(utt_idx)
                target_edges.append(f"u_dID{dialogueID}_#{rowidx + 1}")
            # 3.3 Utterance to all actors
            for actor in dialogue['Actor'].unique():
                all_actor_idx = f"{actor}_{dialogueID}"
                source_edges.append(utt_idx)
                target_edges.append(all_actor_idx)

    # GraphSAGE does not support modelling nodes of different kinds, so all
    # nodes share a single feature matrix and index list
    if using_start:
        start_features = np.random.normal(0.0, 1.0, [1, 1024])
        start_index = "START-Node"
        node_features = np.concatenate(
            (actor_features, utterance_features, start_features), axis=0)
        node_indeces = actor_indeces + utterance_indeces + [start_index]
    else:
        node_features = np.concatenate((actor_features, utterance_features),
                                       axis=0)
        node_indeces = actor_indeces + utterance_indeces

    nodes = IndexedArray(node_features, node_indeces)
    edges = pd.DataFrame({"source": source_edges, "target": target_edges})

    # GraphSAGE:
    full_graph = StellarDiGraph(nodes, edges)
    targets = pd.Series(
        dataframe['Dialogue Act'].tolist()[0:len(utterance_indeces)],
        index=utterance_indeces)

    print("Check if graph has all properties required for ML/Inference...")
    full_graph.check_graph_for_ml(expensive_check=True)
    print("Check successful.")
    print(full_graph.info())
    print("---- Graph Creation Finished ----")

    netx_graph = full_graph.to_networkx(feature_attr='utterance_embedding')

    # Save graphs for later use.
    if testing:
        pickle.dump((netx_graph, targets),
                    open(f"visualizeGraph/test_{name}_netx.pickle", "wb"))
        pickle.dump((full_graph, targets),
                    open(f"createdGraphs/test_{name}_graph.pickle", "wb"))
    else:
        pickle.dump((netx_graph, targets),
                    open(f"visualizeGraph/{name}_netx.pickle", "wb"))
        pickle.dump((full_graph, targets),
                    open(f"createdGraphs/{name}_graph.pickle", "wb"))

    return full_graph, targets
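# Loading a saved graph back (hypothetical path matching the dumps above, for
# the default name and testing=False):
import pickle

with open("createdGraphs/default_graph.pickle", "rb") as f:
    full_graph, targets = pickle.load(f)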
def fit_predict(self, path):
    outpath = os.path.join(path, f'm2v-{self.name}')
    os.makedirs(outpath, exist_ok=True)

    # get app data, compute unique apis
    apps = pd.read_csv(os.path.join(path, 'app_list.csv'),
                       usecols=['app'], squeeze=True, dtype=str)
    # apps = set(apps)
    app_data_list = (os.path.join('data', 'out', 'all-apps', 'app-data/')
                     + apps + '.csv')

    print('Computing new edges')
    data = dd.read_csv(list(app_data_list), dtype=str,
                       usecols=['app', 'api']).drop_duplicates().compute()
    data.api = data.api.map(self.api_map)
    data.columns = ['source', 'target']
    data = data.dropna()

    nodes = self.nodes.copy()
    nodes['app'] = IndexedArray(
        index=np.array(list(nodes['app'].index) + list(apps)))
    edges = pd.concat([pd.read_csv(self.edges_path, dtype=str), data],
                      ignore_index=True).reset_index(drop=True)
    g = StellarGraph(nodes=nodes, edges=edges)
    print(g)

    print('Running random walk')
    rw = UniformRandomMetaPathWalk(g)
    walk_args = self.params['walk_args']
    new_walks = rw.run(list(apps),
                       n=walk_args['n'],
                       length=walk_args['length'],
                       metapaths=walk_args['metapaths'])
    metapath_walks = self.metapath_walks + new_walks

    print('Running Word2Vec')
    # make features with word2vec
    w2v = Word2Vec(metapath_walks, **self.params['w2v_args'])

    print('Fitting model')
    features = pd.DataFrame(w2v.wv.vectors)
    features['app'] = w2v.wv.index2word
    map_func = lambda uid: self.inverse_app_map.get(uid, uid)
    features['app'] = features['app'].map(map_func)
    features = features.set_index('app')

    X_train = features.loc[self.app_map.keys()]
    # X_train = X_train.uid.map(self.inverse_app_map)
    X_test = features.loc[apps]

    # train model and predict new apps
    labels = pd.read_csv('data/out/all-apps/app_list.csv',
                         usecols=['app', 'malware'],
                         index_col='app', squeeze=True)
    y_test = labels[X_test.index]
    y_train = labels[X_train.index]

    mdl = self.classifier(**self.classifier_args)
    mdl.fit(X_train, y_train)
    pred = mdl.predict(X_test)
    print(classification_report(y_test, pred))
    results = X_test.assign(m2vDroid=pred, true=y_test)

    # save results and training data
    results.to_csv(os.path.join(outpath, 'predictions.csv'))
    X_train.assign(m2vDroid=mdl.predict(X_train), true=y_train).to_csv(
        os.path.join(outpath, 'training_data.csv'))
    return results
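# A tiny, self-contained sketch of the metapath2vec pipeline used above:
# uniform metapath walks on a toy heterogeneous graph feeding gensim's
# Word2Vec. The graph, metapath, and hyperparameters are illustrative.
import pandas as pd
from gensim.models import Word2Vec
from stellargraph import IndexedArray, StellarGraph
from stellargraph.data import UniformRandomMetaPathWalk

apps = IndexedArray(index=["app0", "app1"])
apis = IndexedArray(index=["api0"])
edges = pd.DataFrame({"source": ["app0", "app1"], "target": ["api0", "api0"]})
g = StellarGraph({"app": apps, "api": apis}, edges)

# each metapath starts and ends with the type of the root nodes
walks = UniformRandomMetaPathWalk(g).run(
    nodes=["app0", "app1"], n=2, length=5, metapaths=[["app", "api", "app"]]
)
w2v = Word2Vec(walks, vector_size=8, min_count=1)  # gensim >= 4 naming
print(w2v.wv["app0"].shape)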
def test_indexed_array_empty():
    frame = IndexedArray()
    assert frame.index == range(0)
    np.testing.assert_array_equal(frame.values, np.empty((0, 0)))
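# Recap of the IndexedArray contract the tests above exercise: values must be
# an array of at least 2 dimensions whose first axis matches the index length,
# and both values and index are stored without copying.
import numpy as np
from stellargraph import IndexedArray

values = np.random.rand(3, 4, 5)
frame = IndexedArray(values, index=["a", "b", "c"])
assert frame.values is values and len(frame.index) == 3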