import os
import pickle
from collections import OrderedDict
from multiprocessing import Queue
from operator import itemgetter as at  # assumption: `at` is the projection helper used below

import networkx as nx
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.autograd import Variable

# Project-local helpers assumed importable from this repo:
# EmptyLogger, Worker, FeatureCalculator, get_max_subgraph, z_scoring, normalize,
# handle_matrix_symmetric, handle_matrix_concat, sparse_mx_to_torch_sparse_tensor,
# activate_cuda, DTYPE, VERBOSE


class GraphLoader(object):
    def __init__(self, data_path, feature_meta, test_p, gnx_idx=None, cuda_num=None, logger=None):
        super(GraphLoader, self).__init__()
        self._logger = EmptyLogger() if logger is None else logger
        self._gnx_idx = gnx_idx
        self._cuda_num = cuda_num
        self._test_p = test_p
        self._features_meta = feature_meta
        self._data_path = data_path

        self._train_set = self._test_set = self._train_idx = self._test_idx = None
        self._inputs = self._targets = None
        self._nodes_order = []
        self._labels = {i: label for i, label in enumerate(self._get_labels())}
        self._prepare_data()

    def _get_labels(self):
        gnx = pickle.load(open(os.path.join(next(self._get_gnx_paths()), "gnx.pkl"), "rb"))
        return gnx.graph["node_labels"]

    # Earlier variant kept for reference
    @staticmethod
    def _encode_onehot_gnx1(gnx, nodes_order):
        labels = gnx.graph["node_labels"]
        labels_dict = {c: np.identity(len(labels))[i, :] for i, c in enumerate(labels)}
        labels_dict.update({i: labels_dict[c] for i, c in enumerate(labels)})
        return np.array(list(map(lambda n: labels_dict[gnx.node[n]['label']], nodes_order)), dtype=np.int32)

    def _encode_onehot_gnx(self, gnx, nodes_order):
        ident = np.identity(len(self._labels))
        labels_dict = {label: ident[j, :] for j, label in self._labels.items()}
        return np.array(list(map(lambda n: labels_dict[gnx.node[n]['label']], nodes_order)), dtype=np.int32)

    # Earlier variant kept for reference: computes and caches the node sets
    def _join_graphs1(self):
        all_nodes = set()
        common_nodes = None
        for path in self._get_gnx_paths():
            gnx = pickle.load(open(os.path.join(path, "orig_gnx.pkl"), "rb"))
            all_nodes = all_nodes.union(gnx)
            common_nodes = set(gnx) if common_nodes is None else common_nodes.intersection(gnx)
        pickle.dump(all_nodes, open(os.path.join(path, "..", "..", "all_nodes.pkl"), "wb"))
        pickle.dump(common_nodes, open(os.path.join(path, "..", "..", "common_nodes.pkl"), "wb"))
        return all_nodes, common_nodes

    def _join_graphs(self):
        path = next(self._get_gnx_paths())
        all_nodes = pickle.load(open(os.path.join(path, "..", "..", "all_nodes.pkl"), "rb"))
        common_nodes = pickle.load(open(os.path.join(path, "..", "..", "common_nodes.pkl"), "rb"))
        return all_nodes, common_nodes

    def _split_data(self):
        feat_path = os.path.join(next(self._get_gnx_paths()), "features_0")
        gnx = pickle.load(open(os.path.join(feat_path, "gnx.pkl"), "rb"))
        self._nodes_order = sorted([node for node in gnx if gnx.node[node]['label'] is not None])
        indexes = [(i, node) for i, node in enumerate(self._nodes_order)]
        idx, nodes = zip(*indexes)
        c_train, c_test, c_train_idx, c_test_idx = train_test_split(
            nodes, idx, test_size=self._test_p, shuffle=True)
        self._train_set = set(c_train)
        self._test_set = set(c_test)
        self._test_idx = np.array(c_test_idx)
        self._train_idx = np.array(c_train_idx)

    # Earlier variant kept for reference: splits common and uncommon nodes separately
    def _split_data_orig(self):
        all_nodes, common_nodes = self._join_graphs()
        self._nodes_order = sorted(all_nodes)
        indexes = [(i, node) for i, node in enumerate(self._nodes_order)]
        common, uncommon = [], []
        for i, node in indexes:
            cur_list = common if node in common_nodes else uncommon
            cur_list.append((i, node))
        c_idx, c_nodes = zip(*common)
        c_train, c_test, c_train_idx, c_test_idx = train_test_split(
            c_nodes, c_idx, test_size=self._test_p, shuffle=True)
        uc_idx, uc_nodes = zip(*uncommon)
        uc_train, uc_test, uc_train_idx, uc_test_idx = train_test_split(
            uc_nodes, uc_idx, test_size=self._test_p, shuffle=True)
        self._train_set = set(c_train).union(uc_train)
        self._test_set = set(c_test).union(uc_test)
        self._test_idx = np.array(c_test_idx + uc_test_idx)
        self._train_idx = np.array(c_train_idx + uc_train_idx)

    def _activate_cuda(self, *args):
        if self._cuda_num is None:
            return args
        return [x.cuda(self._cuda_num) for x in args]

    # Directory layout: firms/years/features_0-1
    def _get_gnx_paths(self):
        paths = sorted(os.listdir(self._data_path), key=int)
        if self._gnx_idx is not None:
            yield os.path.join(self._data_path, paths[self._gnx_idx])
            return
        for path in paths:
            yield os.path.join(self._data_path, path)

    # Earlier variant kept for reference: loads the matrices cached by _prepare_data
    def _prepare_data1(self):
        self._split_data()
        self._inputs = self._targets = None
        for path in self._get_gnx_paths():
            feat_path = os.path.join(path, "features_0")
            cur_data = pickle.load(open(os.path.join(feat_path, "data.pkl"), "rb"))
            self._inputs = cur_data if self._inputs is None else np.dstack((self._inputs, cur_data))
            cur_labels = pickle.load(open(os.path.join(feat_path, "labels.pkl"), "rb"))
            self._targets = cur_labels if self._targets is None else np.dstack((self._targets, cur_labels))
        self._inputs = self._inputs.transpose((0, 2, 1))
        self._targets = self._targets.transpose((0, 2, 1))
        self._logger.debug("Finished preparing the data")

    def _prepare_data(self):
        self._split_data()
        self._inputs = self._targets = None
        for path in self._get_gnx_paths():
            feat_path = os.path.join(path, "features_0")
            gnx = pickle.load(open(os.path.join(feat_path, "gnx.pkl"), "rb"))
            gnx = gnx.subgraph(self._nodes_order)
            features = GraphFeatures(gnx, self._features_meta, dir_path=feat_path, logger=self._logger)
            features.build(include=self._train_set)
            add_ones = bool(set(self._features_meta).intersection(
                ["first_neighbor_histogram", "second_neighbor_histogram"]))
            cur_data = features.to_matrix(add_ones=add_ones, dtype=np.float32,
                                          mtype=np.array, should_zscore=True)
            self._inputs = cur_data if self._inputs is None else np.dstack((self._inputs, cur_data))
            pickle.dump(cur_data, open(os.path.join(feat_path, "data.pkl"), "wb"))
            cur_labels = self._encode_onehot_gnx(gnx, self._nodes_order)
            self._targets = cur_labels if self._targets is None else np.dstack((self._targets, cur_labels))
            pickle.dump(cur_labels, open(os.path.join(feat_path, "labels.pkl"), "wb"))

        # Arranging data as <batch, seq, feature>
        if self._gnx_idx is None:
            self._inputs = self._inputs.transpose((0, 2, 1))
            self._targets = self._targets.transpose((0, 2, 1))
        self._logger.debug("Finished preparing the data")

    def _load_data(self, indexes, nbatch):
        inputs, targets = self._inputs[indexes], self._targets[indexes]
        for i in range(0, len(inputs), nbatch):
            data, labels = inputs[i:i + nbatch], targets[i:i + nbatch]
            data = Variable(torch.FloatTensor(data))
            # Convert the one-hot targets back to class indices along the feature axis
            labels = Variable(torch.LongTensor(np.where(labels)[self.feat_dim]))
            yield self._activate_cuda(data, labels)

    def load_train_data(self, nbatch):
        return self._load_data(self._train_idx, nbatch)

    def load_test_data(self, nbatch):
        return self._load_data(self._test_idx, nbatch)

    @property
    def feat_dim(self):
        return 2 if self._gnx_idx is None else 1

    @property
    def num_nodes(self):
        return self._inputs.shape[0]

    @property
    def sequence_len(self):
        return self._inputs.shape[1]

    @property
    def num_features(self):
        return self._inputs.shape[self.feat_dim]

    @property
    def num_labels(self):
        return len(self._labels)

    @property
    def labels(self):
        return self._labels

    @property
    def num_layers(self):
        return [100, 20]
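# A minimal usage sketch for the sequence loader above (not part of the original
# module). The path "data/firms" and the empty `features_meta` dict are hypothetical
# placeholders; a real meta dict maps feature names to objects exposing `.calculator`
# and `.abbr_set`, as GraphFeatures expects.
def _demo_sequence_loader(features_meta, data_path="data/firms"):
    loader = GraphLoader(data_path, features_meta, test_p=0.3)
    print(loader.num_nodes, loader.sequence_len, loader.num_features)
    for data, labels in loader.load_train_data(nbatch=64):
        pass  # data: <batch, seq, feature> FloatTensor; labels: LongTensor of class indices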
class GraphLoader(object):
    def __init__(self, paths, is_max_connected, ignore_index=-1, norm_adj=True,
                 logger=None, cuda_num=None, dtype=torch.double):
        super(GraphLoader, self).__init__()
        self._logger = EmptyLogger() if logger is None else logger
        self._paths = paths
        self._ignore_index = ignore_index
        self._cuda_num = cuda_num
        self._dtype = dtype

        self._logger.debug("Loading %s dataset...", paths["features"])
        self._gnx = pickle.load(open(paths["gnx"], "rb"))
        self._is_max_connected = is_max_connected
        if is_max_connected:
            self._gnx = get_max_subgraph(self._gnx)

        self.ordered_nodes = sorted(self._gnx)
        self._labeled_nodes = set(i for i, n in enumerate(self.ordered_nodes)
                                  if "label" in self._gnx.node[n])
        self._labels = {i: label for i, label in enumerate(self._gnx.graph["node_labels"])}
        self._node_labels = self._get_node_labels()
        self._content = OrderedDict(sorted(pickle.load(open(paths["content"], "rb")).items(),
                                           key=lambda x: x[0]))

        # Nodes without content get the feature-wise median bag-of-words vector
        bow_mx = np.vstack(list(self._content.values())).astype(DTYPE)
        median_bow = np.median(bow_mx, axis=0)
        bow_mx = np.vstack([self._content.get(node, median_bow)
                            for node in self.ordered_nodes]).astype(DTYPE)
        self._bow_mx = z_scoring(bow_mx)
        self._topo_mx = None

        # Adjacency matrices
        adj = nx.adjacency_matrix(self._gnx, nodelist=self.ordered_nodes).astype(DTYPE)
        self._adj = handle_matrix_symmetric(adj, should_normalize=norm_adj)
        self._adj = sparse_mx_to_torch_sparse_tensor(self._adj).to_dense()
        self._adj_rt = handle_matrix_concat(adj, should_normalize=norm_adj)
        self._adj_rt = sparse_mx_to_torch_sparse_tensor(self._adj_rt).to_dense()

        self._train_set = self._test_set = None
        self._train_idx = self._test_idx = self._base_train_idx = None
        self._val_idx = None

    @property
    def name(self):
        return str(self._paths["name"])

    @property
    def is_graph_directed(self):
        return self._gnx.is_directed()

    def _get_node_labels(self):
        labels = self._labels.copy()
        labels[self._ignore_index] = None  # unlabeled nodes map to the ignore index
        labels_dict = {label: i for i, label in labels.items()}
        return np.array(list(map(lambda n: labels_dict[self._gnx.node[n].get('label')],
                                 self.ordered_nodes)), dtype=np.int32)

    def set_variables(self, **kwargs):
        for key, val in kwargs.items():
            setattr(self, key, val)

    @property
    def num_labels(self):
        return len(self._labels)

    @property
    def labels(self):
        labels = torch.LongTensor(self._node_labels)
        return activate_cuda(labels, cuda_num=self._cuda_num)

    @property
    def distinct_labels(self):
        return sorted(self._labels.keys())

    @property
    def bow_mx(self):
        bow_feat = torch.DoubleTensor(self._bow_mx)
        return activate_cuda(bow_feat, cuda_num=self._cuda_num)

    @property
    def topo_mx(self):
        assert self._topo_mx is not None, "Split train required"
        topo_feat = torch.DoubleTensor(self._topo_mx)
        return activate_cuda(topo_feat, cuda_num=self._cuda_num)

    @property
    def adj_rt_mx(self):
        return activate_cuda(self._adj_rt, cuda_num=self._cuda_num)

    @property
    def adj_mx(self):
        return activate_cuda(self._adj, cuda_num=self._cuda_num).type(self._dtype)

    def set_train(self, train_set, features_meta):
        # The max subgraph was already taken in __init__, so is_max_connected=False here
        features = GraphFeatures(self._gnx, features_meta, dir_path=self._paths["features"],
                                 logger=self._logger, is_max_connected=False)
        features.build(include=set(train_set), should_dump=True)

        add_ones = bool({"first_neighbor_histogram",
                         "second_neighbor_histogram"}.intersection(features_meta))
        self._topo_mx = features.to_matrix(add_ones=add_ones, dtype=np.float64,
                                           mtype=np.matrix, should_zscore=True)

        # Scale the topological features to the same order of magnitude as the BoW features
        ratio = 10 ** np.ceil(np.log10(abs(np.mean(self._topo_mx) / np.mean(self._bow_mx))))
        self._topo_mx /= ratio
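# A hedged usage sketch for the loader above (not from the original module). The
# `paths` keys mirror the ones read in __init__; the file locations are placeholders,
# and passing every node as the train set is only for illustration.
def _demo_paths_loader(features_meta):
    paths = {"name": "demo", "gnx": "data/gnx.pkl",
             "content": "data/content.pkl", "features": "data/features"}
    loader = GraphLoader(paths, is_max_connected=True)
    loader.set_train(set(loader.ordered_nodes), features_meta)  # builds + scales topo matrix
    return loader.bow_mx, loader.topo_mx, loader.adj_mx, loader.labels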
class GraphFeatures(dict):
    def __init__(self, gnx, features, dir_path, logger=None, is_max_connected=False):
        self._base_dir = dir_path
        self._logger = EmptyLogger() if logger is None else logger
        self._matrix = None
        self._gnx = get_max_subgraph(gnx) if is_max_connected else gnx

        self._abbreviations = {abbr: name
                               for name, meta in features.items()
                               for abbr in meta.abbr_set}

        # Building the feature-calculators data structure
        super(GraphFeatures, self).__init__({name: meta.calculator(self._gnx, logger=logger)
                                             for name, meta in features.items()})

    @property
    def graph(self):
        return self._gnx

    def _build_serially(self, include, force_build: bool = False, dump_path: str = None):
        if VERBOSE:
            self._logger.debug("Start building graph features")
        if dump_path is not None and self._gnx is not None:
            pickle.dump(self._gnx, open(self._feature_path("gnx", dump_path), "wb"))
        for name, feature in self.items():
            if force_build or not os.path.exists(self._feature_path(name)):
                is_dumped = dump_path is not None and feature.DUMPABLE
                msg = "Dumped to: %s" % dump_path if is_dumped else "Not dumped"
                feature.build(include=include, msg=msg)
                if is_dumped:
                    self._dump_feature(name, feature, dump_path)
            else:
                self._load_feature(name)
        if VERBOSE:
            self._logger.debug("Finished building graph features")

    # A single process means the features are calculated serially
    def build(self, num_processes: int = 1, include: set = None, should_dump: bool = False):
        if include is None:
            include = set()

        if 1 == num_processes:
            dump_path = None
            if should_dump:
                dump_path = self._base_dir
                if not os.path.exists(dump_path):
                    os.makedirs(dump_path)
            return self._build_serially(include, dump_path=dump_path)

        request_queue = Queue()
        workers = [Worker(request_queue, self, include, logger=self._logger)
                   for _ in range(num_processes)]
        # Starting all workers
        for worker in workers:
            worker.start()

        # Feeding the queue with all the features
        for feature_name in self:
            request_queue.put(feature_name)

        # Sentinel objects to allow a clean shutdown: one per worker
        for _ in range(num_processes):
            request_queue.put(None)

        # Joining all workers
        for worker in workers:
            worker.join()

    def _load_feature(self, name):
        if self._gnx is None:
            assert os.path.exists(self._feature_path("gnx")), \
                "Graph is not present in the given directory"
            self._gnx = pickle.load(open(self._feature_path("gnx"), "rb"))
        feature = pickle.load(open(self._feature_path(name), "rb"))
        feature.load_meta({meta_name: getattr(self, meta_name)
                           for meta_name in FeatureCalculator.META_VALUES})
        self[name] = feature
        return self[name]

    def __getattr__(self, name):
        if name not in self:
            if name in self._abbreviations:
                name = self._abbreviations[name]
            else:
                return super(GraphFeatures, self).__getattribute__(name)

        # If the feature is already calculated - return it
        obj = self[name]
        if obj.is_loaded:
            return obj

        # If it is not calculated, check whether it exists on the file system:
        # if it doesn't - calculate it; if it does - load it and return it
        if not os.path.exists(self._feature_path(name)):
            obj.build()
            return obj

        return self._load_feature(name)

    @property
    def features(self):
        return set(self)

    def _feature_path(self, name, dir_path=None):
        if dir_path is None:
            dir_path = self._base_dir
        return os.path.join(dir_path, name + ".pkl")

    def _dump_feature(self, name, feature, dir_path):
        if feature.is_loaded:
            prev_meta = feature.clean_meta()  # in order not to save unnecessary data
            pickle.dump(feature, open(self._feature_path(name, dir_path), "wb"))
            feature.load_meta(prev_meta)

    def dump(self, dir_path=None):
        if dir_path is None:
            dir_path = self._base_dir
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        for name, feature in self.items():
            self._dump_feature(name, feature, dir_path)

    @property
    def shape(self):
        sorted_features = map(at(1), sorted(self.items(), key=at(0)))
        sorted_features = [feature for feature in sorted_features
                           if feature.is_relevant() and feature.is_loaded]
        # Bug fix: the original appended two arguments instead of one (name, width) tuple
        return [(feature.print_name(), feature.shape[1]) for feature in sorted_features]

    def to_matrix(self, entries_order: list = None, add_ones=False, dtype=None,
                  mtype=np.matrix, should_zscore: bool = True):
        if entries_order is None:
            entries_order = sorted(self._gnx)

        sorted_features = map(at(1), sorted(self.items(), key=at(0)))
        # Consider caching the matrix creation (if it takes a long time)
        sorted_features = [feature for feature in sorted_features
                           if feature.is_relevant() and feature.is_loaded]

        if sorted_features:
            mx = np.hstack([feature.to_matrix(entries_order, mtype=mtype,
                                              should_zscore=should_zscore)
                            for feature in sorted_features])
            if add_ones:
                mx = np.hstack([mx, np.ones((mx.shape[0], 1))])
            if dtype is not None:
                mx = mx.astype(dtype)  # bug fix: the original discarded astype's return value
        else:
            mx = np.matrix([])

        return mtype(mx)

    def to_dict(self, dtype=None, should_zscore: bool = True):
        mx = self.to_matrix(dtype=dtype, mtype=np.matrix, should_zscore=should_zscore)
        return {node: mx[i, :] for i, node in enumerate(sorted(self._gnx))}
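# Sketch of the typical GraphFeatures flow (not from the original module). The
# `features_meta` argument is a placeholder for a dict mapping feature names to
# meta objects exposing `.calculator` and `.abbr_set`, as consumed in __init__ above.
def _demo_graph_features(gnx, features_meta, out_dir="features_out"):
    features = GraphFeatures(gnx, features_meta, dir_path=out_dir)
    features.build(num_processes=1, include=set(gnx), should_dump=True)  # serial build + pickle dump
    mx = features.to_matrix(add_ones=True, dtype=np.float32, mtype=np.array, should_zscore=True)
    return mx  # rows follow sorted(gnx); one column block per loaded feature, plus a bias column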
class GraphLoader(object):
    def __init__(self, data_dir, features_meta, is_max_connected=False, cuda_num=None, logger=None):
        super(GraphLoader, self).__init__()
        self._logger = EmptyLogger() if logger is None else logger
        self._data_path = data_dir
        self._cuda_num = cuda_num
        self._features_meta = features_meta
        self._is_max_connected = is_max_connected

        self._logger.debug("Loading %s dataset...", self._data_path)
        features_path = self._features_path()
        gpath = os.path.realpath(os.path.join(features_path, "..", "gnx.pkl"))
        self._gnx = pickle.load(open(gpath, "rb"))

        self._nodes_order = sorted(self._gnx)
        self._labels = {i: label for i, label in enumerate(self._gnx.graph["node_labels"])}
        self._ident_labels = self._encode_onehot_gnx()

        self._content = pickle.load(open(os.path.join(self._data_path, "content.pkl"), "rb"))
        bow_mx = np.vstack([self._content[node] for node in self._nodes_order]).astype(DTYPE)
        self._bow_mx = normalize(bow_mx)
        self._topo_mx = None

        # Adjacency matrices
        adj = nx.adjacency_matrix(self._gnx, nodelist=self._nodes_order).astype(DTYPE)
        self._adj = handle_matrix_symmetric(adj)
        self._adj = sparse_mx_to_torch_sparse_tensor(self._adj).to_dense()
        self._adj_rt = handle_matrix_concat(adj, should_normalize=True)
        self._adj_rt = sparse_mx_to_torch_sparse_tensor(self._adj_rt).to_dense()

        self._train_set = self._test_set = None
        self._train_idx = self._test_idx = self._base_train_idx = None
        self._val_idx = None

    def _activate_cuda(self, *items):
        if self._cuda_num is None:
            # Bug fix: unwrap a single item so callers get a tensor, not a 1-tuple
            return items[0] if 1 == len(items) else items
        if 1 == len(items):
            return items[0].cuda(self._cuda_num)
        return [x.cuda(self._cuda_num) for x in items]

    def _encode_onehot_gnx(self):
        ident = np.identity(len(self._labels))
        # When the graph stores index labels, the lookup key is the index itself;
        # otherwise it is the label value. (The two branches were swapped in the
        # original, which would raise a KeyError for non-index labels.)
        if self._gnx.graph.get('is_index_labels', False):
            labels_dict = {i: ident[i, :] for i, label in self._labels.items()}
        else:
            labels_dict = {label: ident[i, :] for i, label in self._labels.items()}
        return np.array(list(map(lambda n: labels_dict[self._gnx.node[n]['label']],
                                 self._nodes_order)), dtype=np.int32)

    @property
    def num_labels(self):
        return len(self._labels)

    @property
    def labels(self):
        labels = torch.LongTensor(np.where(self._ident_labels)[1])
        return self._activate_cuda(labels)

    @property
    def train_idx(self):
        return self._activate_cuda(torch.LongTensor(self._train_idx))

    @property
    def val_idx(self):
        return self._activate_cuda(torch.LongTensor(self._val_idx))

    @property
    def test_idx(self):
        return self._activate_cuda(torch.LongTensor(self._test_idx))

    @property
    def bow_mx(self):
        return self._activate_cuda(torch.FloatTensor(self._bow_mx))

    @property
    def topo_mx(self):
        assert self._topo_mx is not None, "Split train required"
        return self._activate_cuda(torch.FloatTensor(self._topo_mx))

    @property
    def adj_rt_mx(self):
        return self._activate_cuda(self._adj_rt.clone())

    @property
    def adj_mx(self):
        return self._activate_cuda(self._adj.clone())

    def _features_path(self):
        return os.path.join(self._data_path, "features%d" % (self._is_max_connected,))

    def split_test(self, test_p):
        indexes = list(range(len(self._nodes_order)))
        # The test node set itself is unused; only its indices are kept
        self._train_set, _, self._base_train_idx, self._test_idx = train_test_split(
            self._nodes_order, indexes, test_size=test_p, shuffle=True)

    def split_train(self, train_p, features_meta=None):
        if features_meta is None:
            features_meta = self._features_meta
        train_set, _, self._train_idx, self._val_idx = train_test_split(
            self._train_set, self._base_train_idx, test_size=1 - train_p, shuffle=True)

        features_path = self._features_path()
        features = GraphFeatures(self._gnx, features_meta, dir_path=features_path,
                                 logger=self._logger, is_max_connected=self._is_max_connected)
        features.build(include=set(train_set), should_dump=True)

        add_ones = bool({"first_neighbor_histogram",
                         "second_neighbor_histogram"}.intersection(features_meta))
        self._topo_mx = features.to_matrix(add_ones=add_ones, dtype=np.float64,
                                           mtype=np.matrix, should_zscore=True)

        # Scale the topological features to the same order of magnitude as the BoW features
        ratio = 10 ** np.ceil(np.log10(abs(np.mean(self._topo_mx) / np.mean(self._bow_mx))))
        self._topo_mx /= ratio
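# End-to-end split sketch for the loader above (not from the original module). The
# directory name "data/cora" is a placeholder for a layout containing content.pkl
# and features<0|1>/../gnx.pkl, as read in __init__.
def _demo_split_flow(features_meta, data_dir="data/cora"):
    loader = GraphLoader(data_dir, features_meta, is_max_connected=False)
    loader.split_test(test_p=0.2)    # carve out a fixed test set
    loader.split_train(train_p=0.8)  # split the rest into train/validation and build features
    return loader.topo_mx, loader.bow_mx, loader.labels, loader.train_idx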