def test_get_graph_embeddings():
    """Embedding a list of two wrapped graphs yields a ``Doc2Vec`` model."""
    samples = (
        ('ls', './out/ls.dot'),
        ('cat', './out/cat.dot'),
    )

    graphs = []
    for sample_name, dot_path in samples:
        wrapped = Graph(sample_name, convert_dot_to_networkx(dot_path))
        # Each wrapper must carry a converted graph before it is embedded.
        assert wrapped.graph is not None
        graphs.append(wrapped)

    model = get_graph_embeddings(graphs)
    assert type(model) is Doc2Vec
if __name__ == "__main__":
    # Build one Graph per dataset found under /bin/, skipping any sample
    # whose dot file is missing after generation.
    datasets = get_datasets(dir='/bin/')
    graphs = []
    for dataset in datasets:
        dot_path = generate_dot_by_radare2(dataset.bin_file_path,
                                           dataset.sample_name)
        dataset.set_dot_file_path(dot_path)

        if os.path.exists(dataset.dot_file_path):
            nx_graph = convert_dot_to_networkx(dataset.dot_file_path)
            graphs.append(Graph(dataset.sample_name, nx_graph))
        else:
            print(f'{dataset.dot_file_path} is not created.')
            print('Please check the reason manually...')

    model = get_graph_embeddings(graphs)

    # Empty frame with the ranking columns; it is not filled in below.
    results = pd.DataFrame(
        {},
        columns=[
            '1st Similar file', '1st Similarity Score',
            '2nd Similar file', '2nd Similarity Score',
            '3rd Similar file', '3rd Similarity Score',
        ],
    )

    # Print the most similar documents for every embedded graph.
    for graph in graphs:
        print(f'get files which are similar to {graph.name}')
        sim_files = model.dv.most_similar(f'g_{graph.name}')
        print(sim_files)
        print('-----------------------------------------------')
def test_get_graph_embeddings_with_single_graph():
    """Passing one bare graph to ``get_graph_embeddings`` must assert."""
    single_graph = convert_dot_to_networkx(test_file_path)
    assert single_graph is not None

    # The graph is handed over directly, not wrapped in a list.
    with pytest.raises(AssertionError):
        get_graph_embeddings(single_graph)
def test_convert_to_networkx():
    """Converting ``./out/ls.dot`` returns a graph object."""
    converted = convert_dot_to_networkx('./out/ls.dot')
    assert converted is not None
def test_get_edge_embeddings():
    """Edge embeddings of the test graph come back as a ``HadamardEmbedder``."""
    source_graph = convert_dot_to_networkx(test_file_path)
    assert source_graph is not None

    edge_embedder = get_edge_embeddings(source_graph)
    assert type(edge_embedder) is HadamardEmbedder
def test_convert_to_networkx_with_empty_saved_path():
    """Converting with an explicit saved-file path still returns a graph."""
    # NOTE(review): the test name says "empty saved path", yet a non-empty
    # path is passed here -- confirm which case is meant to be covered.
    converted = convert_dot_to_networkx('./out/ls.dot', './out/ls_cfg.png')
    assert converted is not None
def test_convert_to_networkx_with_empty_dotfile_path():
    """An empty dot-file path is expected to raise ``FileNotFoundError``."""
    empty_path = ''
    with pytest.raises(FileNotFoundError):
        convert_dot_to_networkx(empty_path)
def get_edge_embeddings(G: Union[MultiDiGraph, MultiGraph],
                        debug: bool = False) -> HadamardEmbedder:
    """Build Hadamard edge embeddings for ``G`` from its node embeddings.

    Args:
        G: Graph to embed.
        debug: When true, print ``wv.most_similar((i, j))`` for every index
            pair with ``j < i < len(G.nodes())``.

    Returns:
        The ``HadamardEmbedder`` constructed over ``get_node_embedings(G)``.

    Raises:
        TypeError: If ``G`` is ``None``.
    """
    if G is None:
        raise TypeError(' A type of G must be MultiDiGraph or MultiGraph')

    # No try/except here: catching an exception only to re-raise it adds
    # nothing, so failures from the embedding step propagate unchanged.
    wv = HadamardEmbedder(keyed_vectors=get_node_embedings(G))

    if debug:
        # NOTE(review): edges are looked up by integer index pairs rather
        # than by node names -- confirm this matches the embedder's keys and
        # that the embedder actually exposes ``most_similar``.
        for idx_i in range(len(G.nodes())):
            for idx_j in range(idx_i):
                print('similar vector: ', (idx_i, idx_j))
                print('similar_edge', wv.most_similar((idx_i, idx_j)))
                print('')

    return wv


if __name__ == '__main__':
    # Manual smoke run: print the traceback instead of crashing.
    try:
        G = convert_dot_to_networkx('./out/ls.dot')
        embeds = get_edge_embeddings(G)
    except Exception:
        print(traceback.format_exc())
min_count=min_count) return model def save_embeddings(output_path: str, model: Doc2Vec, graphs: List[TaggedDocument], dimensions: int): if not os.path.exists(output_path): with open(output_path, 'wb'): pass out = [] for graph in graphs: out.append([graph.name] + list(model.dv["g_" + graph.name])) column_names = ["types"] + ["x_" + str(dim) for dim in range(dimensions)] out = pd.DataFrame(out, columns=column_names) out.to_csv(output_path, index=None) if __name__ == "__main__": dimensions = 128 try: g1 = Graph('ls', convert_dot_to_networkx('./out/ls.dot')) g2 = Graph('cat', convert_dot_to_networkx('./out/cat.dot')) graphs = [g1, g2] model = get_graph_embeddings(graphs, dimensions=dimensions) save_embeddings('./out/ls.model', model, graphs, dimensions) except Exception: print(traceback.format_exc())
def test_get_node_embeddings():
    """Node embeddings of the test graph come back as ``KeyedVectors``."""
    source_graph = convert_dot_to_networkx(test_file_path)
    assert source_graph is not None

    node_vectors = get_node_embedings(source_graph)
    assert type(node_vectors) is KeyedVectors