# Imports needed by the stand-alone functions below; the calculator module paths
# are assumed from the graph-measures package layout and may need adjusting:
import os
import pickle

import numpy as np

from features_infra.graph_features import GraphFeatures
from features_infra.feature_calculators import FeatureMeta
from features_algorithms.vertices.general import GeneralCalculator
from features_algorithms.vertices.page_rank import PageRankCalculator
from features_algorithms.vertices.average_neighbor_degree import AverageNeighborDegreeCalculator
from features_algorithms.vertices.k_core import KCoreCalculator
from loggers import PrintLogger


def create_features(data_name, time_range):
    logger = PrintLogger("MyLogger")
    features_meta = {
        "page_rank": FeatureMeta(PageRankCalculator, {"pr"}),
        "general": FeatureMeta(GeneralCalculator, {"gen"}),
        "Average_Neighbor_Degree": FeatureMeta(AverageNeighborDegreeCalculator, {"avg_nd"}),
        "k_core": FeatureMeta(KCoreCalculator, {"kc"}),
    }
    for i in range(time_range):
        # Load the i-th graph snapshot.
        with open(os.path.join("dataset", data_name, "pkl", "gcn_input", "graph_%d.pkl" % i), "rb") as f:
            gnx = pickle.load(f)
        features = GraphFeatures(gnx, features_meta,
                                 os.path.join("dataset", data_name, "pkl", "feature"), logger=logger)
        features.build()
        mx = features.to_matrix(mtype=np.matrix)
        # Dump the feature matrix next to the input graph.
        with open(os.path.join("dataset", data_name, "pkl", "gcn_input", "mx_%d.pkl" % i), "wb") as f:
            pickle.dump(mx, f, protocol=pickle.HIGHEST_PROTOCOL)
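# A minimal usage sketch for create_features; "dblp" and the snapshot count are
# hypothetical placeholders, not names taken from this project:
#
#     create_features("dblp", time_range=10)
#
# This expects dataset/dblp/pkl/gcn_input/graph_0.pkl ... graph_9.pkl to exist.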
def _calc_tg_feature_matrix(self):
    log_ext = "log_" if self._params['features']['log'] else ""
    feature_matrix_dir = os.path.join(self._params['general']['pkl_path'], "gt_feature_matrix")
    mat_pkl = os.path.join(feature_matrix_dir, f"{self.data_name()}_{log_ext}tg_feature_matrices.pkl")
    if os.path.exists(mat_pkl):
        self._logger.info("loading pkl file - graph_matrix")
        with open(mat_pkl, "rb") as f:
            return pickle.load(f)

    gnx_to_vec = {}
    # Directory holding the per-graph features of this database.
    database_pkl_dir = os.path.join(self._params['general']['pkl_path'], "features", self.data_name())
    for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
        # Create a directory for this specific graph's features.
        gnx_path = os.path.join(database_pkl_dir, re.sub('[^a-zA-Z0-9]', '_', gnx_name))
        if self._params['general']["dump_pkl"]:
            os.makedirs(gnx_path, exist_ok=True)
        gnx_ftr = GraphFeatures(gnx, ANOMALY_DETECTION_FEATURES, dir_path=gnx_path, logger=self._logger,
                                is_max_connected=self._params['features']['max_connected'])
        # Build the features, then calculate the motif-ratio vector.
        gnx_ftr.build(should_dump=self._params['general']["dump_pkl"],
                      force_build=self._params['general']['FORCE_REBUILD_FEATURES'])
        gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix(
            norm_func=log_norm if self._params['features']['log'] else None)

    if self._params['general']['dump_pkl']:
        os.makedirs(feature_matrix_dir, exist_ok=True)
        with open(mat_pkl, "wb") as f:
            pickle.dump(gnx_to_vec, f)
    return gnx_to_vec
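# A minimal sketch of the configuration _calc_tg_feature_matrix reads, inferred from
# the keys it accesses above; all values here are illustrative placeholders:
EXAMPLE_PARAMS = {
    "general": {"pkl_path": "./pkl", "dump_pkl": True, "FORCE_REBUILD_FEATURES": False},
    "features": {"log": True, "max_connected": True},
}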
def test_main():
    import numpy as np
    from features_infra.graph_features import GraphFeatures
    from loggers import PrintLogger
    import os
    import pickle
    import networkx as nx

    dataset = "citeseer"
    logger = PrintLogger("MetaTest")
    base_dir = r"/home/benami/git/pygcn/data"
    with open(os.path.join(base_dir, dataset, "gnx.pkl"), "rb") as f:
        gnx = pickle.load(f)
    # Keep only the largest (undirected) connected component.
    max_subgnx = max(nx.connected_component_subgraphs(gnx.to_undirected()), key=len)
    gnx = gnx.subgraph(max_subgnx)

    features = GraphFeatures(gnx, TEST_FEATURES, dir_path="./%s_features_sub" % dataset, logger=logger)
    features.build(should_dump=True)
    measures_mx = features.to_matrix(add_ones=False, dtype=np.float32, mtype=np.matrix)
    logger.info("Finished")
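# TEST_FEATURES is referenced above but not defined in this snippet; a minimal
# stand-in (assumed, mirroring the FeatureMeta dicts used elsewhere in this file)
# would be:
#
#     TEST_FEATURES = {"general": FeatureMeta(GeneralCalculator, {"gen"})}
#
# Note that nx.connected_component_subgraphs was removed in networkx 2.4, so
# test_main assumes an older networkx release.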
def create_features():
    logger = PrintLogger("MyLogger")
    features_meta = {
        "page_rank": FeatureMeta(PageRankCalculator, {"pr"}),
        "general": FeatureMeta(GeneralCalculator, {"gen"}),
        "Average_Neighbor_Degree": FeatureMeta(AverageNeighborDegreeCalculator, {"avg_nd"}),
        "k_core": FeatureMeta(KCoreCalculator, {"kc"}),
    }
    for i in range(21):
        with open(os.path.join('graphs_by_years', 'graph_' + str(i) + '.pkl'), 'rb') as f:
            gnx = pickle.load(f)
        features = GraphFeatures(gnx, features_meta, "/home/dsi/racheli/graph_calculations", logger=logger)
        features.build()
        mx = features.to_matrix(mtype=np.matrix)
        with open(os.path.join('graphs_by_years', 'mx_' + str(i) + '.pkl'), 'wb') as f:
            pickle.dump(mx, f, protocol=pickle.HIGHEST_PROTOCOL)
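# Expected directory layout (read off the paths above):
#     graphs_by_years/graph_0.pkl ... graph_20.pkl   pickled networkx graphs (input)
#     graphs_by_years/mx_0.pkl    ... mx_20.pkl      feature matrices (output)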
def __get_features(self):
    with open(self.dirname + '/data/' + self.DATASET + '.pickle', 'rb') as f:
        data = pkl.load(f)
    self._original_adj_matrices = data['A']
    y = data['y']

    # Label each node by the argmax of its (one-hot) label row; -1 marks unlabeled nodes.
    node_labels = []
    for a in y.todense():
        if a.max() != 0:
            node_labels.append(a.argmax())
        else:
            node_labels.append(-1)

    sum_adj = AsymmetricRGCNWithNeighborHistograms.__sum_sparse(self.A)
    gnx = nx.from_scipy_sparse_matrix(sum_adj, parallel_edges=True)
    gnx = nx.DiGraph(gnx, labels=node_labels)
    for n, label in zip(gnx.nodes, node_labels):
        gnx.node[n]['label'] = label  # networkx < 2.4 node-attribute API
    real_labels = list(set(node_labels) - {-1})

    # Get the neighbor-histogram features for the graph.
    NEIGHBOR_FEATURES = {
        "first_neighbor_histogram": FeatureMeta(
            nth_neighbor_calculator(1, labels_to_consider=real_labels), {"fnh", "first_neighbor"}),
        "second_neighbor_histogram": FeatureMeta(
            nth_neighbor_calculator(2, labels_to_consider=real_labels), {"snh", "second_neighbor"}),
    }
    features_path = os.path.join(os.path.abspath('../features'), self.DATASET)
    features = GraphFeatures(gnx, NEIGHBOR_FEATURES, dir_path=features_path)
    features.build(include=set(self.idx_train), should_dump=True)

    add_ones = bool({"first_neighbor_histogram", "second_neighbor_histogram"}.intersection(NEIGHBOR_FEATURES))
    _topo_mx = features.to_matrix(add_ones=add_ones, dtype=np.float64, mtype=np.matrix, should_zscore=True)
    del data
    # Zero-pad the feature columns so the returned matrix is square.
    return sp.csr_matrix(np.hstack([
        _topo_mx,
        np.zeros((_topo_mx.shape[0], _topo_mx.shape[0] - _topo_mx.shape[1]))
    ]))
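# Shape sketch for the return value: with n nodes and k topological feature columns
# (k <= n), the result is an n x n sparse matrix, i.e. the k feature columns
# followed by n - k all-zero columns padding the features to adjacency size.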
def set_train(self, train_set, features_meta):
    features = GraphFeatures(self._gnx, features_meta, dir_path=self._paths["features"],
                             logger=self._logger,
                             is_max_connected=False)  # Already taking the max subgraph in init
    features.build(include=set(train_set), should_dump=True)

    add_ones = bool({"first_neighbor_histogram", "second_neighbor_histogram"}.intersection(features_meta))
    self._topo_mx = features.to_matrix(add_ones=add_ones, dtype=np.float64, mtype=np.matrix, should_zscore=True)

    ratio = 10 ** np.ceil(np.log10(abs(np.mean(self._topo_mx) / np.mean(self._bow_mx))))
    self._topo_mx /= ratio
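# Worked example of the rescaling above, which brings the topological features to
# the order of magnitude of the BoW features: if np.mean(self._topo_mx) == 50.0 and
# np.mean(self._bow_mx) == 0.2, the raw ratio is 250.0, log10 gives ~2.4, ceil
# gives 3, so the topological matrix is divided by 10 ** 3 = 1000.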
def split_train(self, train_p, features_meta):
    train_set, val_set, self._train_idx, self._val_idx = train_test_split(
        self._train_set, self._base_train_idx, test_size=1 - train_p, shuffle=True)

    features_path = self._features_path()
    features = GraphFeatures(self._gnx, features_meta, dir_path=features_path,
                             logger=self._logger, is_max_connected=self._is_max_connected)
    features.build(include=set(train_set), should_dump=False)

    add_ones = bool({"first_neighbor_histogram", "second_neighbor_histogram"}.intersection(features_meta))
    self._topo_mx = features.to_matrix(add_ones=add_ones, dtype=np.float64, mtype=np.matrix, should_zscore=True)

    ratio = 10 ** np.ceil(np.log10(abs(np.mean(self._topo_mx) / np.mean(self._bow_mx))))
    self._topo_mx /= ratio
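# Split sketch: with train_p = 0.8, test_size = 1 - train_p = 0.2, i.e. a shuffled
# 80/20 split of the base training nodes into train_set and val_set.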
def _prepare_data(self):
    self._split_data()
    self._inputs = self._targets = None
    for path in self._get_gnx_paths():
        feat_path = os.path.join(path, "features_0")
        with open(os.path.join(feat_path, "gnx.pkl"), "rb") as f:
            gnx = pickle.load(f)
        gnx = gnx.subgraph(self._nodes_order)

        features = GraphFeatures(gnx, self._features_meta, dir_path=feat_path, logger=self._logger)
        features.build(include=self._train_set)
        add_ones = bool(set(self._features_meta).intersection(
            ["first_neighbor_histogram", "second_neighbor_histogram"]))
        cur_data = features.to_matrix(add_ones=add_ones, dtype=np.float32, mtype=np.array, should_zscore=True)
        # Stack each snapshot's feature matrix along the third (time) axis.
        self._inputs = cur_data if self._inputs is None else np.dstack((self._inputs, cur_data))
        with open(os.path.join(feat_path, "data.pkl"), "wb") as f:
            pickle.dump(cur_data, f)

        cur_labels = self._encode_onehot_gnx(gnx, self._nodes_order)
        self._targets = cur_labels if self._targets is None else np.dstack((self._targets, cur_labels))
        with open(os.path.join(feat_path, "labels.pkl"), "wb") as f:
            pickle.dump(cur_labels, f)

    # Arranging data as <batch, seq, feature>
    if self._gnx_idx is None:
        self._inputs = self._inputs.transpose((0, 2, 1))
        self._targets = self._targets.transpose((0, 2, 1))
    self._logger.debug("Finished preparing the data")
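# Shape sketch (n nodes, f features, t snapshots, all assumed): each cur_data is
# (n, f); np.dstack across snapshots yields (n, f, t); the final transpose((0, 2, 1))
# gives (n, t, f), i.e. <batch, seq, feature>.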
def calculate_gpu_one(run, level, size, p, directed):
    from features_infra.graph_features import GraphFeatures
    from features_infra.feature_calculators import FeatureMeta
    from features_algorithms.accelerated_graph_features.motifs import nth_nodes_motif
    from loggers import FileLogger

    feature_meta = {
        "motif" + str(level): FeatureMeta(nth_nodes_motif(level, gpu=True, device=3), {"m" + str(level)})
    }
    head_path = os.path.join("size{}_p{}_directed{}_runs".format(size, p, directed), "run_" + str(run))
    dump_path = os.path.join(head_path, "motifs_gpu")
    with open(os.path.join(head_path, "gnx.pkl"), "rb") as f:
        graph = pickle.load(f)

    logger = FileLogger("CalculationLogger" + str(level), path=dump_path, level=logging.DEBUG)
    raw_feature = GraphFeatures(gnx=graph, features=feature_meta, dir_path=dump_path, logger=logger)
    raw_feature.build(should_dump=True)
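# Usage sketch (argument values are hypothetical): compute level-4 GPU motifs for
# a single stored run:
#
#     calculate_gpu_one(run=0, level=4, size=1000, p=0.1, directed=True)
#
# This expects size1000_p0.1_directedTrue_runs/run_0/gnx.pkl to exist.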