def __init__(self, data_dir, features_meta, is_max_connected=False, cuda_num=None, logger=None):
    super(GraphLoader, self).__init__()
    self._logger = EmptyLogger() if logger is None else logger
    self._data_path = data_dir
    self._cuda_num = cuda_num
    self._features_meta = features_meta
    self._is_max_connected = is_max_connected

    self._logger.debug("Loading %s dataset...", self._data_path)
    features_path = self._features_path()
    gpath = os.path.realpath(os.path.join(features_path, "..", "gnx.pkl"))
    self._gnx = pickle.load(open(gpath, "rb"))

    self._nodes_order = sorted(self._gnx)
    self._labels = {i: label for i, label in enumerate(self._gnx.graph["node_labels"])}
    self._ident_labels = self._encode_onehot_gnx()

    self._content = pickle.load(open(os.path.join(self._data_path, "content.pkl"), "rb"))
    bow_mx = np.vstack([self._content[node] for node in self._nodes_order]).astype(DTYPE)
    self._bow_mx = normalize(bow_mx)
    self._topo_mx = None

    # Adjacency matrices
    adj = nx.adjacency_matrix(self._gnx, nodelist=self._nodes_order).astype(DTYPE)
    self._adj = handle_matrix_symmetric(adj)
    self._adj = sparse_mx_to_torch_sparse_tensor(self._adj).to_dense()
    self._adj_rt = handle_matrix_concat(adj, should_normalize=True)
    self._adj_rt = sparse_mx_to_torch_sparse_tensor(self._adj_rt).to_dense()

    self._train_set = self._test_set = None
    self._train_idx = self._test_idx = self._base_train_idx = None
    self._val_idx = None
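# `sparse_mx_to_torch_sparse_tensor` is used above but never defined in these
# snippets. A minimal sketch of the common pygcn-style conversion -- an
# assumption about the helper's behavior, not necessarily the project's version:
import numpy as np
import scipy.sparse as sp
import torch

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse COO tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    return torch.sparse_coo_tensor(indices, values, torch.Size(sparse_mx.shape))

# Usage sketch: identity adjacency -> dense torch matrix
# dense = sparse_mx_to_torch_sparse_tensor(sp.eye(3)).to_dense()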
def __init__(self, conf, logger, data_logger=None, is_nni=False):
    self._logger = logger
    self._data_logger = EmptyLogger() if data_logger is None else data_logger
    self._conf = conf
    self.bar = 0.5
    self._lr = conf["lr"]
    self._is_nni = is_nni
    self._device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    self._ce_loss = self._soft_ce_loss
    self._temporal_loss = torch.nn.MSELoss(reduction='sum').to(self._device)
def __init__(self, data_path, cuda, logger, data_logger=None):
    self._logger = logger
    self._cuda = cuda
    self._data_logger = EmptyLogger() if data_logger is None else data_logger
    self._data_path = data_path

    # feature_meta = NEIGHBOR_FEATURES
    feature_meta = NODE_FEATURES
    # feature_meta = NODE_FEATURES.copy()
    # feature_meta.update(NEIGHBOR_FEATURES)
    self.loader = GraphLoader(feature_meta, cuda_num=cuda, logger=self._logger)
def __init__(self, dataset_path, conf, logger, data_logger=None):
    self._logger = logger
    self._data_logger = EmptyLogger() if data_logger is None else data_logger
    self._criterion = torch.nn.NLLLoss()
    self._conf = conf
    features_meta = get_features()
    self.loader = GraphLoader(dataset_path, features_meta,
                              is_max_connected=False,  # self._conf['dataset'] == "citeseer",
                              cuda_num=conf["cuda"], logger=self._logger)
class SpecificFeatureTest(unittest.TestCase):
    logger = EmptyLogger()

    @classmethod
    def setUpClass(cls):
        cls._test_data = TestData(logger=cls.logger)

    def _test_feature(self, feature_cls, is_directed, is_max_connected=True, manual=None, **cmp_features):
        gnx = get_di_graph() if is_directed else get_graph()
        gnx = filter_gnx(gnx, is_max_connected)
        feature = feature_cls(gnx, logger=self.logger)
        res = feature.build()
        # mx_res = feature.to_matrix()
        if manual is None:
            prev_res = self._test_data.load_feature(feature_cls, is_directed)
        else:
            prev_res = manual
        if prev_res is not None or feature.is_relevant():
            if not are_results_equal(res, prev_res, **cmp_features):
                # Redundant re-run; kept as a convenient breakpoint target when the comparison fails.
                are_results_equal(res, prev_res, **cmp_features)
            self.assertTrue(are_results_equal(res, prev_res, **cmp_features))
def attachDetachLogger(self, cpu):
    if type(cpu.logger) is not EmptyLogger:
        print("Detaching logger")
        cpu.logger = EmptyLogger()
    else:
        print("Attaching logger")
        cpu.logger = Logger(cpu)
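# EmptyLogger itself is never defined in these snippets. Every constructor here
# only needs it to be a no-op stand-in for a real logger (the null-object
# pattern), so a minimal sketch could look like the following -- an assumption
# about its interface, not the project's actual class:
class EmptyLogger(object):
    """Logger stand-in whose logging methods silently discard everything."""

    def _noop(self, *args, **kwargs):
        pass

    # The snippets call .debug() and .info(); cover the usual levels anyway.
    debug = info = warning = error = exception = _noop

# Usage sketch: `logger = logger or EmptyLogger()`, then log unconditionally.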
def __init__(self, gnx, features, dir_path, logger=None, is_max_connected=False):
    self._base_dir = dir_path
    self._logger = EmptyLogger() if logger is None else logger
    self._matrix = None
    if is_max_connected:
        # Note: *_connected_component_subgraphs were removed in networkx 2.4;
        # this form requires an older networkx.
        if gnx.is_directed():
            subgraphs = nx.weakly_connected_component_subgraphs(gnx)
        else:
            subgraphs = nx.connected_component_subgraphs(gnx)
        self._gnx = max(subgraphs, key=len)
    else:
        self._gnx = gnx

    self._abbreviations = {abbr: name
                           for name, meta in features.items()
                           for abbr in meta.abbr_set}

    # building the feature calculators data structure
    super(GraphFeatures, self).__init__(
        {name: meta.calculator(self._gnx, logger=logger) for name, meta in features.items()})
def __init__(self, path_info, is_max_connected, *args, logger=None, cuda_num=None, is_debug=False, **kwargs):
    super(MultiGraphLoader, self).__init__()
    # def __init__(self, path_info, is_max_connected, norm_adj=True, cuda_num=None, logger=None):
    # path_info = {"years": os.path.realpath(os.path.join(PROJ_DIR, "..", "data", "firms", "years")),
    #              "label": "top"}
    self._path_info = path_info
    self._path_info["split"] = os.path.realpath(os.path.join(self._path_info["years"], "..", "split.pkl"))
    if logger is None:
        logger = EmptyLogger()
    self._logger = logger

    # TODO: implement dynamic loading of the data
    for year in sorted(os.listdir(path_info["years"]), key=int):
        data_path = os.path.realpath(os.path.join(path_info["years"], year))
        year_paths = {
            "features": os.path.join(data_path, "features%d" % (is_max_connected,)),
            "content": os.path.join(data_path, "content_clean.pkl"),
            # "content": os.path.join(data_path, path_info["label"], "content.pkl"),
            "gnx": os.path.join(data_path, path_info["label"], "gnx.pkl"),
            "name": str(year),
        }
        self[int(year)] = GraphLoader(year_paths, is_max_connected, *args,
                                      logger=self._logger, cuda_num=cuda_num, **kwargs)
        if is_debug and (1997 == int(year)):
            break

    self._nodes = np.array(self.ordered_nodes)  # getattr will take the first one
    self._test_idx = self._base_train_idx = None
    self._train_idx = self._val_idx = None
    self._should_split = not os.path.exists(self._path_info["split"])
    if not self._should_split:
        self._load_split()
def __init__(self, gnx, logger=None):
    # super(FeatureCalculator, self).__init__()
    self._is_loaded = False
    self._features = {}
    self._logger = EmptyLogger() if logger is None else logger
    self._gnx = gnx
    self._print_name = self.print_name()
def test_attachDetachLogger_attaches_logger_when_there_is_Empty(self):
    debugger = Debugger()
    cpu = FakeCpu()
    cpu.logger = EmptyLogger()
    debugger.attachDetachLogger(cpu)
    self.assertFalse(type(cpu.logger) is EmptyLogger)
def __init__(self, *args, logger=None, **kwargs):
    super(BaseReactor, self).__init__()
    self._should_run = True
    self._logger = logger or EmptyLogger()
    self._selector = SelectSelector()
    self.register = self._selector.register
    self._logger.info("Initiating %s" % (type(self).__name__,))
def __init__(self, queue, calculators, include, logger=None):
    super(Worker, self).__init__()
    if logger is None:
        logger = EmptyLogger()
    self._queue = queue
    self._calculators = calculators
    self._logger = logger
    self._include = include
def __init__(self, params, loader, cuda_device, data_logger=None, epochs_logger=None):
    self._params = params
    self.prev_training_inds = None
    self.prev_val_inds = None
    self.prev_test_inds = None
    self._epoch_logger = EmptyLogger() if epochs_logger is None else epochs_logger
    self._data_logger = EmptyLogger() if data_logger is None else data_logger
    self._device = torch.device(f'cuda:{cuda_device}') if torch.cuda.is_available() else torch.device('cpu')
    self._mse_loss = self.weighted_mse_loss
    self._temporal_loss = self.weighted_mse_loss
    self.net = None
    self.opt = None
    self.loader = loader
    self.num_features = loader.dataset[0].num_features  # loader[0].num_features

    # if SSP
    self.preconditioner = None
    self.eps = params['eps']
    self.update_freq = params['update_freq']
    self.gamma = params['gamma']
    self.alpha = params['alpha']
    self.lamda = params['lamda']

    self.best_loss = None
    # self.best_model = None
    self.best_epoch = None
    self.is_nni = params["is_nni"]
def __init__(self, conf, logger, weights, graph_params, data_logger=None, is_nni=False):
    self._logger = logger
    self._data_logger = EmptyLogger() if data_logger is None else data_logger
    self._conf = conf
    self._weights_dict = weights
    self._clique_size = graph_params['clique_size']
    self._graph_params = graph_params
    self.bar = 0.5
    self._lr = conf["lr"]
    self._is_nni = is_nni
    self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
def __init__(self, products_path, dataset_path, conf, logger, data_logger=None):
    self.conf = conf
    self._logger = logger
    self._data_logger = EmptyLogger() if data_logger is None else data_logger
    self.products_path = products_path
    self.loader = GraphLoader(dataset_path, is_max_connected=False,
                              norm_adj=conf["norm_adj"], cuda_num=conf["cuda"], logger=self._logger)
    self._criterion = torch.nn.NLLLoss()
def __init__(self, logger=None):
    if logger is None:
        logger = EmptyLogger()
    self._logger = logger
    self._data_dir = os.path.dirname(os.path.realpath(__file__))

    df1 = pd.read_csv(os.path.join(self._data_dir, "test_undirected"))
    self._ugnx = nx.from_pandas_edgelist(df1, "n1", "n2", ["weight"], create_using=nx.Graph())

    df2 = pd.read_csv(os.path.join(self._data_dir, "test_directed"))
    self._gnx = nx.from_pandas_edgelist(df2, "n1", "n2", ["weight"], create_using=nx.DiGraph())
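# The two test files are read with from_pandas_edgelist(df, "n1", "n2", ["weight"]),
# so presumably each is a CSV with columns n1, n2, weight. A self-contained
# sketch of loading data in that shape (the sample edges are invented):
import io

import networkx as nx
import pandas as pd

csv_text = "n1,n2,weight\n0,1,0.5\n1,2,2.0\n"
df = pd.read_csv(io.StringIO(csv_text))
gnx = nx.from_pandas_edgelist(df, "n1", "n2", ["weight"], create_using=nx.DiGraph())
assert gnx[0][1]["weight"] == 0.5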
def __init__(self, reader=None, crucial=True, state_notifier=None, reactor=None, logger=None):
    super(Channel, self).__init__()
    self._state_notifier = state_notifier
    # handler = handler or ChannelHandler(self)
    self._reactor = reactor or AsyncReactor()
    self._logger = logger or EmptyLogger()
    self._state = ChannelState.IDLE
    self._crucial = crucial
    if reader is not None:
        reader.set_channel(self)
    else:
        reader = DefaultReader(1, self)
    self._reader = reader
    self._buffered = ""
def __init__(self, params, logger, data_logger=None, epochs_logger=None):
    self._logger = logger
    self._epoch_logger = epochs_logger
    self._data_logger = EmptyLogger() if data_logger is None else data_logger
    self._parameters = params
    self._lr = params["lr"]
    self._is_nni = params['is_nni']
    self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    self._mse_loss = self.weighted_mse_loss
    self._temporal_loss = self.weighted_mse_loss
    self.model = GCN(num_of_features=self._parameters["feature_matrices"][0].shape[1],
                     hid_size=self._parameters["hid_size"],
                     num_of_classes=self._parameters["number_of_classes"],
                     activation=self._parameters["activation"],
                     dropout=self._parameters["dropout"])
    self.model = self.model.to(self._device)
    self.opt = self._parameters["optimizer"](self.model.parameters(),
                                             lr=self._parameters['lr'],
                                             weight_decay=self._parameters['weight_decay'])
def __init__(self, paths, fast_mode, norm_adj, cuda_dev, is_max, logger,
             data_logger=None, is_debug=False, dtype=torch.double):
    # plt.rcParams.update({'figure.max_open_warning': 0})
    self._logger = logger
    self._data_logger = EmptyLogger() if data_logger is None else data_logger
    self._fast_mode = fast_mode
    self.products_path = paths["products"]
    self._paths = paths
    self._cuda_dev = cuda_dev
    self._dtype = dtype
    self.loaders = MultiGraphLoader(paths, is_max, norm_adj=norm_adj, logger=self._logger,
                                    ignore_index=-1, cuda_num=cuda_dev, is_debug=is_debug, dtype=dtype)

    criterion_weight = torch.FloatTensor([1 / 34, 1 / 740])
    if cuda_dev is not None:
        criterion_weight = criterion_weight.cuda(cuda_dev)
    self._criterion_weight = criterion_weight
    self._criterion = torch.nn.NLLLoss(weight=self._criterion_weight, ignore_index=-1)
    # self._criterion = torch.nn.CrossEntropyLoss(weight=criterion_weight, ignore_index=-1)
    self._criterion = self._criterion.type(self._dtype).cuda(self._cuda_dev)
    self._run_label = ""
    self._reset_saved_models()
class GraphLoader(object):
    def __init__(self, paths, is_max_connected, ignore_index=-1,
                 norm_adj=True, logger=None, cuda_num=None, dtype=torch.double):
        super(GraphLoader, self).__init__()
        self._logger = EmptyLogger() if logger is None else logger
        self._paths = paths
        self._ignore_index = ignore_index
        self._cuda_num = cuda_num
        self._dtype = dtype
        self._logger.debug("Loading %s dataset...", paths["features"])

        self._gnx = pickle.load(open(paths["gnx"], "rb"))
        self._is_max_connected = is_max_connected
        if is_max_connected:
            self._gnx = get_max_subgraph(self._gnx)

        self.ordered_nodes = sorted(self._gnx)
        self._labeled_nodes = set(i for i, n in enumerate(self.ordered_nodes) if "label" in self._gnx.node[n])
        # self._labeled_nodes = [(i, n) for i, n in enumerate(self.ordered_nodes) if "label" in self._gnx.node[n]]
        self._labels = {i: label for i, label in enumerate(self._gnx.graph["node_labels"])}
        self._node_labels = self._get_node_labels()

        self._content = OrderedDict(sorted(pickle.load(open(paths["content"], "rb")).items(), key=lambda x: x[0]))
        bow_mx = np.vstack(self._content.values()).astype(DTYPE)
        median_bow = np.median(bow_mx, axis=0)
        bow_mx = np.vstack([self._content.get(node, median_bow) for node in self.ordered_nodes]).astype(DTYPE)
        self._bow_mx = z_scoring(bow_mx)
        self._topo_mx = None

        # Adjacency matrices
        adj = nx.adjacency_matrix(self._gnx, nodelist=self.ordered_nodes).astype(DTYPE)
        self._adj = handle_matrix_symmetric(adj, should_normalize=norm_adj)
        self._adj = sparse_mx_to_torch_sparse_tensor(self._adj).to_dense()
        self._adj_rt = handle_matrix_concat(adj, should_normalize=norm_adj)
        self._adj_rt = sparse_mx_to_torch_sparse_tensor(self._adj_rt).to_dense()

        self._train_set = self._test_set = None
        self._train_idx = self._test_idx = self._base_train_idx = None
        self._val_idx = None

    @property
    def name(self):
        return str(self._paths["name"])

    @property
    def is_graph_directed(self):
        return self._gnx.is_directed()

    # def _activate_cuda(self, items):
    #     return items

    # def _encode_onehot_gnx(self):  # gnx, nodes_order: list = None):
    #     labels = self._labels.copy()
    #     if labels[len(labels) - 1] is not None:
    #         labels[len(labels)] = None
    #     ident = np.identity(len(labels))
    #     if self._gnx.graph.get('is_index_labels', False):
    #         labels_dict = {i: ident[i, :] for i, label in labels.items()}
    #     else:
    #         labels_dict = {label: ident[i, :] for i, label in labels.items()}
    #     return np.array(list(map(lambda n: labels_dict[self._gnx.node[n].get('label')], self._nodes_order)),
    #                     dtype=np.int32)

    def _get_node_labels(self):
        labels = self._labels.copy()
        labels[self._ignore_index] = None
        labels_dict = {label: i for i, label in labels.items()}
        return np.array(list(map(lambda n: labels_dict[self._gnx.node[n].get('label')], self.ordered_nodes)),
                        dtype=np.int32)

    def set_variables(self, **kwargs):
        for key, val in kwargs.items():
            self.__setattr__(key, val)

    @property
    def num_labels(self):
        return len(self._labels)

    # @property
    # def labels(self):
    #     labels = torch.LongTensor(self._node_labels)
    #     return activate_cuda(labels, cuda_num=self._cuda_num)

    @property
    def labels(self):
        labels = torch.LongTensor(self._node_labels)
        return activate_cuda(labels, cuda_num=self._cuda_num)

    @property
    def distinct_labels(self):
        return sorted(self._labels.keys())

    # def _get_idx(self, idx_name):
    #     return torch.LongTensor([x for x in getattr(self, idx_name) if x in set(self._labeled_nodes)])
    #
    # @property
    # def train_idx(self):
    #     return activate_cuda(self._get_idx("_train_idx"), cuda_num=self._cuda_num)
    #
    # @property
    # def val_idx(self):
    #     return activate_cuda(self._get_idx("_val_idx"), cuda_num=self._cuda_num)
    #
    # @property
    # def test_idx(self):
    #     return activate_cuda(self._get_idx("_test_idx"), cuda_num=self._cuda_num)

    @property
    def bow_mx(self):
        # bow_feat = torch.FloatTensor(self._bow_mx)
        bow_feat = torch.DoubleTensor(self._bow_mx)
        return activate_cuda(bow_feat, cuda_num=self._cuda_num)

    @property
    def topo_mx(self):
        assert self._topo_mx is not None, "Split train required"
        # topo_feat = torch.FloatTensor(self._topo_mx)
        topo_feat = torch.DoubleTensor(self._topo_mx)
        return activate_cuda(topo_feat, cuda_num=self._cuda_num)

    @property
    def adj_rt_mx(self):
        return activate_cuda(self._adj_rt, cuda_num=self._cuda_num)  # .clone()

    @property
    def adj_mx(self):
        return activate_cuda(self._adj, cuda_num=self._cuda_num).type(self._dtype)  # .clone()

    # def split_test(self, test_p):
    #     indexes, nodes = zip(*self._labeled_nodes)
    #     self._train_set, _, self._base_train_idx, self._test_idx = train_test_split(nodes, indexes, test_size=test_p)

    # def split_train(self, train_p, features_meta):
    #     train_set, val_set, self._train_idx, self._val_idx = train_test_split(self._train_set, self._base_train_idx,
    #                                                                           test_size=1 - train_p)
    #     # feat_path = os.path.join(self._feat_path, "features%d" % (self._is_max_connected,))
    #     features = GraphFeatures(self._gnx, features_meta, dir_path=self._paths["features"], logger=self._logger,
    #                              is_max_connected=False)  # Already taking the max sub_graph in init
    #     features.build(include=set(train_set), should_dump=False)
    #
    #     add_ones = bool({"first_neighbor_histogram", "second_neighbor_histogram"}.intersection(features_meta))
    #     self._topo_mx = features.to_matrix(add_ones=add_ones, dtype=np.float64, mtype=np.matrix, should_zscore=True)
    #
    #     ratio = 10 ** np.ceil(np.log10(abs(np.mean(self._topo_mx) / np.mean(self._bow_mx))))
    #     self._topo_mx /= ratio

    def set_train(self, train_set, features_meta):
        features = GraphFeatures(self._gnx, features_meta, dir_path=self._paths["features"], logger=self._logger,
                                 is_max_connected=False)  # Already taking the max sub_graph in init
        features.build(include=set(train_set), should_dump=True)

        add_ones = bool({"first_neighbor_histogram", "second_neighbor_histogram"}.intersection(features_meta))
        self._topo_mx = features.to_matrix(add_ones=add_ones, dtype=np.float64, mtype=np.matrix, should_zscore=True)

        ratio = 10 ** np.ceil(np.log10(abs(np.mean(self._topo_mx) / np.mean(self._bow_mx))))
        self._topo_mx /= ratio
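# The `ratio` in set_train rounds the magnitude gap between the topological
# feature matrix and the BOW matrix up to the nearest power of ten, so dividing
# brings both onto a comparable scale. A small numeric sketch (values invented
# for illustration):
import numpy as np

topo_mean, bow_mean = 3.2e2, 4.0e-1  # means differ by roughly three orders of magnitude
ratio = 10 ** np.ceil(np.log10(abs(topo_mean / bow_mean)))
assert ratio == 1000.0               # 320 / 0.4 = 800 -> rounded up to 10**3
print(topo_mean / ratio)             # 0.32, now on the BOW scale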
class GraphLoader(object):
    def __init__(self, data_path, feature_meta, test_p, gnx_idx=None, cuda_num=None, logger=None):
        super(GraphLoader, self).__init__()
        self._logger = EmptyLogger() if logger is None else logger
        self._gnx_idx = gnx_idx
        self._cuda_num = cuda_num
        self._test_p = test_p
        self._features_meta = feature_meta
        self._data_path = data_path

        # self._logger.debug("Loading %s dataset...", self._dataset)
        # self._gnx = pickle.load(open(os.path.join(self.dataset_path, "gnx.pkl"), "rb"))
        # self._content = pickle.load(open(os.path.join(self.dataset_path, "content.pkl"), "rb"))
        # self._nodes_order = sorted(self._gnx)

        self._train_set = self._test_set = self._train_idx = self._test_idx = None
        self._inputs = self._targets = None
        self._nodes_order = []
        self._labels = {i: label for i, label in enumerate(self._get_labels())}
        self._prepare_data()

    def _get_labels(self):
        gnx = pickle.load(open(os.path.join(next(self._get_gnx_paths()), "gnx.pkl"), "rb"))
        return gnx.graph["node_labels"]

    @staticmethod
    def _encode_onehot_gnx1(gnx, nodes_order):  # gnx, nodes_order: list = None):
        labels = gnx.graph["node_labels"]
        labels_dict = {c: np.identity(len(labels))[i, :] for i, c in enumerate(labels)}
        labels_dict.update({i: labels_dict[c] for i, c in enumerate(labels)})
        return np.array(list(map(lambda n: labels_dict[gnx.node[n]['label']], nodes_order)), dtype=np.int32)

    def _encode_onehot_gnx(self, gnx, nodes_order):  # gnx, nodes_order: list = None):
        # labels = gnx.graph["node_labels"]
        ident = np.identity(len(self._labels))
        labels_dict = {label: ident[j, :] for j, label in self._labels.items()}
        # labels_dict = {c: np.identity(len(self._labels))[i, :] for i, c in enumerate(labels)}
        # labels_dict.update({i: labels_dict[c] for i, c in enumerate(labels)})
        return np.array(list(map(lambda n: labels_dict[gnx.node[n]['label']], nodes_order)), dtype=np.int32)

    def _join_graphs1(self):
        all_nodes = set()
        common_nodes = None
        for path in self._get_gnx_paths():
            gnx = pickle.load(open(os.path.join(path, "orig_gnx.pkl"), "rb"))
            all_nodes = all_nodes.union(gnx)
            if common_nodes is None:
                common_nodes = set(gnx)
            else:
                common_nodes = common_nodes.intersection(gnx)
        pickle.dump(all_nodes, open(os.path.join(path, "..", "..", "all_nodes.pkl"), "wb"))
        pickle.dump(common_nodes, open(os.path.join(path, "..", "..", "common_nodes.pkl"), "wb"))
        return all_nodes, common_nodes

    def _join_graphs(self):
        path = next(self._get_gnx_paths())
        all_nodes = pickle.load(open(os.path.join(path, "..", "..", "all_nodes.pkl"), "rb"))
        common_nodes = pickle.load(open(os.path.join(path, "..", "..", "common_nodes.pkl"), "rb"))
        return all_nodes, common_nodes

    def _split_data(self):
        feat_path = os.path.join(next(self._get_gnx_paths()), "features_0")
        gnx = pickle.load(open(os.path.join(feat_path, "gnx.pkl"), "rb"))
        self._nodes_order = sorted([node for node in gnx if gnx.node[node]['label'] is not None])
        indexes = [(i, node) for i, node in enumerate(self._nodes_order)]
        idx, nodes = zip(*indexes)
        c_train, c_test, c_train_idx, c_test_idx = train_test_split(nodes, idx, test_size=self._test_p, shuffle=True)
        self._train_set = set(c_train)
        self._test_set = set(c_test)
        self._test_idx = np.array(c_test_idx)
        self._train_idx = np.array(c_train_idx)

    def _split_data_orig(self):
        all_nodes, common_nodes = self._join_graphs()
        self._nodes_order = sorted(all_nodes)
        indexes = [(i, node) for i, node in enumerate(self._nodes_order)]
        common, uncommon = [], []
        for i, node in indexes:
            cur_list = common if node in common_nodes else uncommon
            cur_list.append((i, node))

        c_idx, c_nodes = zip(*common)
        c_train, c_test, c_train_idx, c_test_idx = train_test_split(
            c_nodes, c_idx, test_size=self._test_p, shuffle=True)
        uc_idx, uc_nodes = zip(*uncommon)
        uc_train, uc_test, uc_train_idx, uc_test_idx = train_test_split(
            uc_nodes, uc_idx, test_size=self._test_p, shuffle=True)

        self._train_set = set(c_train).union(uc_train)
        self._test_set = set(c_test).union(uc_test)
        self._test_idx = np.array(c_test_idx + uc_test_idx)
        self._train_idx = np.array(c_train_idx + uc_train_idx)

    def _activate_cuda(self, *args):
        if self._cuda_num is None:
            return args
        return [x.cuda(self._cuda_num) for x in args]

    # firms/years/features_0-1
    def _get_gnx_paths(self):
        paths = sorted(os.listdir(self._data_path), key=int)
        if self._gnx_idx is not None:
            # for x in [4, 6]:
            #     yield os.path.join(self._data_path, paths[x])
            yield os.path.join(self._data_path, paths[self._gnx_idx])
            return
        for path in paths:
            yield os.path.join(self._data_path, path)

    def _prepare_data1(self):
        self._split_data()
        self._inputs = self._targets = None
        for path in self._get_gnx_paths():
            feat_path = os.path.join(path, "features_0")
            cur_data = pickle.load(open(os.path.join(feat_path, "data.pkl"), "rb"))
            self._inputs = cur_data if self._inputs is None else np.dstack((self._inputs, cur_data))
            cur_labels = pickle.load(open(os.path.join(feat_path, "labels.pkl"), "rb"))
            self._targets = cur_labels if self._targets is None else np.dstack((self._targets, cur_labels))
        self._inputs = self._inputs.transpose((0, 2, 1))
        self._targets = self._targets.transpose((0, 2, 1))
        self._logger.debug("Finished preparing the data")

    def _prepare_data(self):
        self._split_data()
        self._inputs = self._targets = None
        for path in self._get_gnx_paths():
            feat_path = os.path.join(path, "features_0")
            gnx = pickle.load(open(os.path.join(feat_path, "gnx.pkl"), "rb"))
            gnx = gnx.subgraph(self._nodes_order)
            features = GraphFeatures(gnx, self._features_meta, dir_path=feat_path, logger=self._logger)
            features.build(include=self._train_set)
            add_ones = bool(set(self._features_meta).intersection(
                ["first_neighbor_histogram", "second_neighbor_histogram"]))
            cur_data = features.to_matrix(add_ones=add_ones, dtype=np.float32, mtype=np.array, should_zscore=True)
            self._inputs = cur_data if self._inputs is None else np.dstack((self._inputs, cur_data))
            pickle.dump(cur_data, open(os.path.join(feat_path, "data.pkl"), "wb"))
            cur_labels = self._encode_onehot_gnx(gnx, self._nodes_order)
            self._targets = cur_labels if self._targets is None else np.dstack((self._targets, cur_labels))
            pickle.dump(cur_labels, open(os.path.join(feat_path, "labels.pkl"), "wb"))

        # Arranging data as <batch, seq, feature>
        if self._gnx_idx is None:
            self._inputs = self._inputs.transpose((0, 2, 1))
            self._targets = self._targets.transpose((0, 2, 1))
        self._logger.debug("Finished preparing the data")

    # topo_x = torch.FloatTensor(topo_x)  # np.array(features.todense())
    # labels = torch.LongTensor(np.where(labels)[1])
    # train_idx = torch.LongTensor(self._train_idx)
    # test_idx = torch.LongTensor(self._test_idx)
    # topo_x, labels = convert_to_variable(topo_x, labels)
    # return self.activate_cuda([topo_x, labels])

    def _load_data(self, indexes, nbatch):
        # for inputs, targets in zip(self._inputs, self._targets):
        inputs, targets = self._inputs[indexes], self._targets[indexes]
        # for i in range(0, int(len(inputs) / nbatch) * nbatch, nbatch):
        for i in range(0, len(inputs), nbatch):
            data, labels = inputs[i:i + nbatch], targets[i:i + nbatch]
            # if self._gnx_idx is not None:
            #     data, labels = data[:, self._gnx_idx, :], labels[:, self._gnx_idx, :]
            data = Variable(torch.FloatTensor(data))
            labels = Variable(torch.LongTensor(np.where(labels)[self.feat_dim]))
            # labels = labels[labels != reverse_labels[None]]
            yield self._activate_cuda(data, labels)

    def load_train_data(self, nbatch):
        return self._load_data(self._train_idx, nbatch)

    def load_test_data(self, nbatch):
        return self._load_data(self._test_idx, nbatch)

    @property
    def feat_dim(self):
        return 2 if self._gnx_idx is None else 1

    @property
    def num_nodes(self):
        return self._inputs.shape[0]

    @property
    def sequence_len(self):
        return self._inputs.shape[1]

    @property
    def num_features(self):
        return self._inputs.shape[self.feat_dim]

    @property
    def num_labels(self):
        return len(self._labels)

    @property
    def labels(self):
        return self._labels

    @property
    def num_layers(self):
        return [100, 20]
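# _prepare_data stacks one (nodes x features) matrix per yearly graph with
# np.dstack and then transposes into <node (batch), year (seq), feature>.
# A tiny sketch of that arrangement (shapes invented for illustration):
import numpy as np

years = [np.random.rand(5, 3) for _ in range(4)]  # 4 snapshots, 5 nodes, 3 features
stacked = np.dstack(years)                        # -> (5, 3, 4): node x feature x year
inputs = stacked.transpose((0, 2, 1))             # -> (5, 4, 3): node x year x feature
assert inputs.shape == (5, 4, 3)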
class ModelRunner:
    def __init__(self, conf, logger, data_logger=None, is_nni=False):
        self._logger = logger
        self._data_logger = EmptyLogger() if data_logger is None else data_logger
        self._conf = conf
        self.bar = 0.5
        self._lr = conf["lr"]
        self._is_nni = is_nni
        self._device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
        self._loss = BCELoss()

    @property
    def logger(self):
        return self._logger

    @property
    def data_logger(self):
        return self._data_logger

    def my_loss(self, output, target, weights=None):
        output = torch.clamp(output, min=1e-8, max=1 - 1e-8)
        if weights is not None:
            assert len(weights) == 2
            loss = weights[1] * (target * torch.log(output)) + \
                   weights[0] * ((1 - target) * torch.log(1 - output))
        else:
            loss = target * torch.log(output) + (1 - target) * torch.log(1 - output)
        return torch.neg(torch.mean(loss))

    def accuracy(self, output, labels):
        # output = torch.sigmoid(output)  # todo: use it only with BCEWithLogits
        idxs_1_labeled = torch.where(labels == 1)
        answers = output[idxs_1_labeled]
        true_pos = torch.where(answers >= 0.5)  # tuple (,)
        return len(true_pos[0]) / len(idxs_1_labeled[0])

    def _get_model(self):
        model = Graphs_Rec(in_features=self._conf["train_data"][0].shape[0],
                           hid_features=self._conf["hid_features"],
                           out_features=1,
                           activation=self._conf["activation"],
                           dropout=self._conf["dropout"])
        opt = self._conf["optimizer"](model.parameters(), lr=self._conf["lr"],
                                      weight_decay=self._conf["weight_decay"])
        return {"model": model, "optimizer": opt,
                "train_data": self._conf["train_data"],
                "training_labels": self._conf["training_labels"],
                "test_data": self._conf["test_data"],
                "test_labels": self._conf["test_labels"]}

    # verbose = 0 - silent
    # verbose = 1 - print test results
    # verbose = 2 - print train for each epoch and test results
    def run(self, verbose=2):
        if self._is_nni:
            verbose = 0
        model = self._get_model()
        loss_train, acc_train, intermediate_acc_test, losses_train, accs_train, test_results = self.train(
            self._conf["epochs"], model=model, verbose=verbose)

        # Testing
        result = self.test(model=model, verbose=verbose if not self._is_nni else 0, print_to_file=False)
        if self._is_nni:
            self._logger.debug('Final loss train: %3.4f' % loss_train)
            self._logger.debug('Final accuracy train: %3.4f' % acc_train)
            final_results = result["acc"]
            self._logger.debug('Final accuracy test: %3.4f' % final_results)
            # _nni.report_final_result(test_auc)

        if verbose != 0:
            names = ""
            vals = ()
            for name, val in result.items():
                names = names + name + ": %3.4f "
                vals = vals + tuple([val])
                self._data_logger.info(name, val)

        parameters = {"lr": self._conf["lr"], "weight_decay": self._conf["weight_decay"],
                      "dropout": self._conf["dropout"], "optimizer": self._conf["optim_name"]}
        return loss_train, acc_train, intermediate_acc_test, result, losses_train, accs_train, test_results, parameters

    def train(self, epochs, model=None, verbose=2):
        loss_train = 0.
        acc_train = 0.
        losses_train = []
        accs_train = []
        test_results = []
        intermediate_test_acc = []
        for epoch in range(epochs):
            loss_train, acc_train = self._train(epoch, model, verbose)
            losses_train.append(loss_train)
            accs_train.append(acc_train)
            # /---------------------- FOR NNI -------------------------
            if epoch % 5 == 0:
                test_res = self.test(model, verbose=verbose if not self._is_nni else 0)
                test_results.append(test_res)
                if self._is_nni:
                    test_acc = test_res["acc"]
                    intermediate_test_acc.append(test_acc)
        return loss_train, acc_train, intermediate_test_acc, losses_train, accs_train, test_results

    def _train(self, epoch, model, verbose=2):
        # self._loss = BCEWithLogitsLoss(torch.ones([223653]).to(self._device))
        model_ = model["model"]
        model_ = model_.to(self._device)
        optimizer = model["optimizer"]
        # labels = torch.from_numpy(model["training_labels"]).to(dtype=torch.float, device=self._device)
        labels = torch.DoubleTensor(model["training_labels"]).to(dtype=torch.float, device=self._device)  # todo
        train = torch.from_numpy(model["train_data"]).to(dtype=torch.float, device=self._device)

        model_.train()
        optimizer.zero_grad()
        self._loss = self.my_loss
        output = model_(train)
        labeled_1_num = len([b for b, item in enumerate(labels) if item == 1])
        output = output.view(output.shape[0])
        # loss_train = self._loss(output, labels, weights=[1, (len(train) - 78) / 78])
        # loss_train = self._loss(output, labels, weights=[19 / len(train), (len(train) - 19) / len(train)])
        loss_train = self._loss(output, labels,
                                weights=[len(train) / (len(train) - labeled_1_num),
                                         len(train) / labeled_1_num])
        # loss_train /= len(train)
        loss_train.backward()
        optimizer.step()

        acc_train = self.accuracy(output, labels)
        if verbose == 2:
            # Evaluate validation set performance separately;
            # deactivates dropout during the validation run.
            self._logger.debug('Epoch: {:04d} '.format(epoch + 1) +
                               'loss_train: {:.4f} '.format(loss_train.data.item()) +
                               'acc_train: {:.4f} '.format(acc_train))
        return loss_train, acc_train

    def test(self, model=None, verbose=2, print_to_file=False):
        # self._loss = BCEWithLogitsLoss(torch.ones([894618]).to(self._device))
        model_ = model["model"]
        model_ = model_.to(self._device)
        # labels = torch.from_numpy(model["test_labels"]).to(dtype=torch.float, device=self._device)
        labels = torch.DoubleTensor(model["test_labels"]).to(dtype=torch.float, device=self._device)  # todo
        test = torch.from_numpy(model["test_data"]).to(dtype=torch.float, device=self._device)
        model_.eval()
        # self._loss = self.my_loss
        # pos_weight = torch.ones([len(test)]).to(self._device)  # All weights are equal to 1
        # pos_weight *= 79 / (len(test) - 79)
        # self._loss = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

        output = model_(test)
        output = output.view(output.shape[0])
        self._loss = self.my_loss
        loss_test = self._loss(output, labels)  # , weights=[1, (len(test) - 20) / 20])
        # loss_test /= len(test)
        acc_test = self.accuracy(output, labels)
        if verbose != 0:
            self._logger.info("Test: loss= {:.4f} ".format(loss_test.data.item()) +
                              "acc= {:.4f}".format(acc_test))
        result = {"loss": loss_test.data.item(), "acc": acc_test}
        return result
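# my_loss above is a hand-rolled (optionally class-weighted) binary
# cross-entropy. With weights=None it should agree with torch's built-in BCE;
# a quick standalone sanity sketch (tensors invented for illustration):
import torch

output = torch.tensor([0.9, 0.2, 0.7])
target = torch.tensor([1.0, 0.0, 1.0])
manual = -torch.mean(target * torch.log(output) + (1 - target) * torch.log(1 - output))
builtin = torch.nn.functional.binary_cross_entropy(output, target)
assert torch.allclose(manual, builtin)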
def __init__(self, reactor_cls=None, logger=None):
    reactor_cls = reactor_cls or AsyncReactor
    self._reactor = reactor_cls()
    self._logger = logger or EmptyLogger()
class ModelRunner:
    def __init__(self, conf, GS, logger, data_logger=None, is_nni=False):
        self._logger = logger
        self._data_logger = EmptyLogger() if data_logger is None else data_logger
        self._conf = conf
        self.bar = 0.5
        self._lr = conf["lr"]
        self._is_nni = is_nni
        # choosing GPU device
        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self._device != "cpu":
            with torch.cuda.device("cuda:{}".format(CUDA_Device)):
                torch.cuda.empty_cache()
            if not self._is_nni:
                self._device = torch.device("cuda:{}".format(CUDA_Device))
        self._loss = self.graphSaintLoss if GS else self.regular_loss
        self.accuracy = self.accuracy_GraphSaint if GS else self.accuracy_regular
        self._ce_loss = torch.nn.CrossEntropyLoss(reduction="mean").to(self._device)
        self._ce_loss2 = torch.nn.BCELoss(reduction='mean')

    @property
    def logger(self):
        return self._logger

    @property
    def data_logger(self):
        return self._data_logger

    def graphSaintLoss(self, calcs, beta=None, gamma=None):
        if beta is None:
            beta = 1 / len(calcs["f_ns_out"]) if len(calcs["f_ns_out"]) != 0 else 0
            gamma = 1 / len(calcs["s_ns_out"]) if len(calcs["s_ns_out"]) != 0 else 0
        cn_loss = self._ce_loss2(calcs["cn_out"], calcs["cn_label"].float())
        f_ns_loss = (self._ce_loss2(calcs["f_ns_out"], calcs["f_ns_labels"].float()) * beta
                     if len(calcs["f_ns_out"]) != 0 else 0)
        s_ns_loss = (self._ce_loss2(calcs["s_ns_out"], calcs["s_ns_labels"].float()) * gamma
                     if len(calcs["s_ns_out"]) != 0 else 0)
        return cn_loss + f_ns_loss + s_ns_loss

    def regular_loss(self, calcs, beta=None, gamma=None):
        if beta is None:
            beta = 1 / len(calcs["f_ns_out"]) if len(calcs["f_ns_out"]) != 0 else 0
            gamma = 1 / len(calcs["s_ns_out"]) if len(calcs["s_ns_out"]) != 0 else 0
        cn_loss = self._ce_loss(calcs["cn_out"], calcs["cn_label"].long())
        f_ns_loss = (self._ce_loss(calcs["f_ns_out"], calcs["f_ns_labels"].long()) * beta
                     if len(calcs["f_ns_out"]) != 0 else 0)
        s_ns_loss = (self._ce_loss(calcs["s_ns_out"], calcs["s_ns_labels"].long()) * gamma
                     if len(calcs["s_ns_out"]) != 0 else 0)
        return cn_loss + f_ns_loss + s_ns_loss

    def _get_model(self):
        model = GCN(in_features=self._conf["in_features"],
                    hid_features=self._conf["hid_features"],
                    out_features=self._conf["out_features"],
                    activation=self._conf["activation"],
                    dropout=self._conf["dropout"])
        opt = self._conf["optimizer"](model.parameters(), lr=self._conf["lr"],
                                      weight_decay=self._conf["weight_decay"])
        return {"model": model, "optimizer": opt,
                "beta": self._conf["beta"], "gamma": self._conf["gamma"],
                "labels": self._conf["labels"], "X": self._conf["X"],
                "ds_name": self._conf["ds_name"],
                "adj_tr": self._conf["adj_tr"], "adj_te": self._conf["adj_te"],
                "train_ind": self._conf["train_ind"], "test_ind": self._conf["test_ind"],
                "testt": self._conf["testt"], "traint": self._conf["traint"]}

    # verbose = 0 - silent
    # verbose = 1 - print test results
    # verbose = 2 - print train for each epoch and test results
    def run(self, verbose=2):
        if self._is_nni:
            verbose = 0
        model = self._get_model()
        (loss_train, acc_train, intermediate_acc_test, losses_train, accs_train,
         accs_cn_train, accs_f_train, accs_s_train, test_results) = self.train(
            self._conf["epochs"], model=model, verbose=verbose)

        # Testing. NOTE: `result` is only the last test run (do not use;
        # it is duplicated as the final entry appended to test_results).
        result = self.test(model=model, verbose=verbose if not self._is_nni else 0, print_to_file=True)
        test_results.append(result)
        if self._is_nni:
            self._logger.debug('Final loss train: %3.4f' % loss_train)
            self._logger.debug('Final accuracy train: %3.4f' % acc_train)
            final_results = result["acc"]
            self._logger.debug('Final accuracy test: %3.4f' % final_results)
            # _nni.report_final_result(test_auc)

        if verbose != 0:
            names = ""
            vals = ()
            for name, val in result.items():
                names = names + name + ": %3.4f "
                vals = vals + tuple([val])
                self._data_logger.info(name, val)

        parameters = {"lr": self._conf["lr"], "weight_decay": self._conf["weight_decay"],
                      "dropout": self._conf["dropout"], "optimizer": self._conf["optim_name"]}
        return (loss_train, acc_train, intermediate_acc_test, result, losses_train,
                accs_train, accs_cn_train, accs_f_train, accs_s_train, test_results, parameters)

    def train(self, epochs, model=None, verbose=2):
        loss_train = 0.
        acc_train = 0.
        losses_train = []
        accs_train = []
        accs_cn_train = []
        accs_f_train = []
        accs_s_train = []
        test_results = []
        intermediate_test_acc = []
        for epoch in range(epochs):
            loss_train, acc_train, acc_train_cn, acc_train_f, acc_train_s = self._train(epoch, model, verbose)
            losses_train.append(loss_train)
            accs_train.append(acc_train)
            accs_cn_train.append(acc_train_cn)
            accs_f_train.append(acc_train_f)
            accs_s_train.append(acc_train_s)
            # /---------------------- FOR NNI -------------------------
            if epoch % 5 == 0:
                test_res = self.test(model, verbose=verbose if not self._is_nni else 0)
                test_results.append(test_res)
                if self._is_nni:
                    test_acc = test_res["acc"]
                    intermediate_test_acc.append(test_acc)
        return (loss_train, acc_train, intermediate_test_acc, losses_train,
                accs_train, accs_cn_train, accs_f_train, accs_s_train, test_results)

    def calculate_labels_outputs(self, node, outputs, labels, indices, ego_graph):
        """Collect outputs and labels per node: given the model's outputs and the
        labels, return the output and label of the central node, of its first
        neighbors and of its second neighbors.
        NOTE: we take only those that are in the train indices.
        """
        f_neighbors = set(ego_graph.neighbors(node))
        # build the second-neighbor set
        s_neighbors = set()
        for f_neighbor in f_neighbors:
            for s_neighbor in ego_graph.neighbors(f_neighbor):
                if s_neighbor not in f_neighbors and s_neighbor != node and s_neighbor not in s_neighbors:
                    s_neighbors.add(s_neighbor)

        # Notice we go through list(ego_graph.nodes).index(...) so that neighbors line up
        # with the output's rows: graph nodes are numbered 0..N over the big graph, while
        # the output rows are numbered 0..len(ego_graph); indexing bridges the two.
        cn_out = outputs[[list(ego_graph.nodes).index(node)]]
        cn_label = labels[[node]]

        # vectors of the first neighbors' outputs and labels (train indices only)
        f_ns_out = outputs[[list(ego_graph.nodes).index(f_n) for f_n in f_neighbors if indices[f_n]]]
        f_ns_labels = labels[[f_n for f_n in f_neighbors if indices[f_n]]]

        # same for second neighbors
        s_ns_out = outputs[[list(ego_graph.nodes).index(s_n) for s_n in s_neighbors if indices[s_n]]]
        s_ns_labels = labels[[s_n for s_n in s_neighbors if indices[s_n]]]

        return {"cn_out": cn_out, "cn_label": cn_label,
                "f_ns_out": f_ns_out, "f_ns_labels": f_ns_labels,
                "s_ns_out": s_ns_out, "s_ns_labels": s_ns_labels}

    def _train(self, epoch, model, verbose=2):
        model_ = model["model"]
        model_ = model_.to(self._device)
        optimizer = model["optimizer"]
        # train_ind are the nodes to create subgraphs from; traint are the nodes in train (that we can learn from)
        train_indices = model["train_ind"]
        model["labels"] = model["labels"].to(self._device)
        labels = model["labels"]
        beta = model["beta"]
        gamma = model["gamma"]
        model_.train()
        optimizer.zero_grad()
        loss_train = 0.
        loss_train1 = 0.
        calcs_batch = []
        BATCH_SIZE = 30
        # create subgraphs only for a partial set, but use labels of all train indices
        for idx, node in enumerate(train_indices):
            # adj = nx.ego_graph(model["adj_matrices"], node, radius=2)
            adj = model["adj_tr"][node]
            X_t = model["X"][list(adj.nodes)].to(device=self._device)
            output = model_(X_t, nx.adjacency_matrix(adj).tocoo())
            calcs = self.calculate_labels_outputs(node, output, labels, model["traint"], adj)
            # no batches:
            loss_train += self._loss(calcs, beta, gamma)
            # if we want to use batches:
            # loss_train1 += self._loss(calcs, beta, gamma)
            # loss_train += self._loss(calcs, beta, gamma).data.item()
            # if idx % BATCH_SIZE == 0 and idx > 0:
            #     loss_train1 /= BATCH_SIZE
            #     loss_train1.backward()
            #     optimizer.step()
            #     loss_train1 = 0.
            calcs_batch.append(calcs)

        acc_train, acc_train_cn, acc_train_f, acc_train_s = self.accuracy(calcs_batch)
        # loss_train /= len(train_indices)
        loss_train.backward()  # was commented out; without it optimizer.step() has no gradients to apply
        optimizer.step()

        if verbose == 2:
            # Evaluate validation set performance separately;
            # deactivates dropout during the validation run.
            self._logger.debug('Epoch: {:04d} '.format(epoch + 1) +
                               'ce_loss_train: {:.4f} '.format(loss_train) +
                               'acc_train: {:.4f} '.format(acc_train))
        return loss_train, acc_train, acc_train_cn, acc_train_f, acc_train_s

    @staticmethod
    def accuracy_GraphSaint(calcs):
        """Accuracy for GraphSaint: sigmoid per index with BCE loss upstream, then flatten
        the per-node results into one vector of all the centrals, one of all the first
        neighbors and one of all the second neighbors, binarize at 0.5, and compute
        the micro F1 score on each vector.
        """
        # one vector of every central node's outputs (for each node in train/test); same for labels
        out, labs = ([calcs[i]["cn_out"].data[0].tolist() for i in range(len(calcs))],
                     [calcs[i]["cn_label"].data[0].tolist() for i in range(len(calcs))])
        out = np.array(out)
        labs = np.array(labs)
        out[out > 0.5] = 1
        out[out <= 0.5] = 0
        acc_cn = metrics.f1_score(labs, out, average="micro")

        # one long vector of all first-neighbor outputs (each node contributes its neighbor vector)
        out = []
        labs = []
        for i in range(len(calcs)):
            out += calcs[i]["f_ns_out"].data.tolist()
            labs += calcs[i]["f_ns_labels"].data.tolist()
        out = np.array(out)
        labs = np.array(labs)
        out[out > 0.5] = 1
        out[out <= 0.5] = 0
        if len(out) != 0:
            acc_f = metrics.f1_score(labs, out, average="micro")
        else:
            acc_f = np.nan

        # same for second neighbors
        out = []
        labs = []
        for i in range(len(calcs)):
            out += calcs[i]["s_ns_out"].data.tolist()
            labs += calcs[i]["s_ns_labels"].data.tolist()
        out = np.array(out)
        labs = np.array(labs)
        out[out > 0.5] = 1
        out[out <= 0.5] = 0
        if len(out) != 0:
            # fpr, tpr, thresholds = metrics.roc_curve(labs2, out2)
            # acc_s = metrics.auc(fpr, tpr)
            acc_s = metrics.f1_score(labs, out, average="micro")
        else:
            acc_s = np.nan

        return np.nanmean(np.array([acc_cn, acc_f, acc_s])), acc_cn, acc_f, acc_s

    def accuracy_regular(self, calcs):
        out, labs = ([calcs[i]["cn_out"].data[0].tolist() for i in range(len(calcs))],
                     [calcs[i]["cn_label"].data[0].tolist() for i in range(len(calcs))])
        acc_cn = sum(np.argmax(np.array(out), axis=1) == labs) / len(labs)

        out = []
        labs = []
        for i in range(len(calcs)):
            out += calcs[i]["f_ns_out"].data.tolist()
            labs += calcs[i]["f_ns_labels"].data.tolist()
        if len(out) != 0:
            acc_f = sum(np.argmax(np.array(out), axis=1) == labs) / len(labs)
        else:
            acc_f = np.nan

        out = []
        labs = []
        for i in range(len(calcs)):
            out += calcs[i]["s_ns_out"].data.tolist()
            labs += calcs[i]["s_ns_labels"].data.tolist()
        if len(out) != 0:
            acc_s = sum(np.argmax(np.array(out), axis=1) == labs) / len(labs)
        else:
            acc_s = np.nan

        return np.nanmean(np.array([acc_cn, acc_f, acc_s])), acc_cn, acc_f, acc_s

    def test(self, model=None, verbose=2, print_to_file=False):
        model_ = model["model"]
        test_indices = model["test_ind"]
        labels = model["labels"]
        beta = model["beta"]
        gamma = model["gamma"]
        model_.eval()
        test_loss = 0
        calcs_batch = []
        with torch.no_grad():
            for node in test_indices:
                # adj = nx.ego_graph(model["adj_matrices"], node, radius=2)
                adj = model["adj_te"][node]
                X_t = model["X"][list(adj.nodes)].to(device=self._device)
                output = model_(X_t, nx.adjacency_matrix(adj).tocoo())
                calcs = self.calculate_labels_outputs(node, output, labels, model["testt"], adj)
                test_loss += self._loss(calcs, beta, gamma).data.item()
                calcs_batch.append(calcs)
        test_loss /= len(test_indices)
        test_acc, acc_test_cn, acc_test_f, acc_test_s = self.accuracy(calcs_batch)
        if verbose != 0:
            self._logger.info("Test: ce_loss= {:.4f} ".format(test_loss) +
                              "acc= {:.4f}".format(test_acc))
        return {"loss": test_loss, "acc": test_acc,
                "acc_cn": acc_test_cn, "acc_f": acc_test_f, "acc_s": acc_test_s}
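# A tiny sketch of the first/second-neighbor split that calculate_labels_outputs
# performs, on a path graph (graph and node invented for illustration):
# the neighbors of node 2 are {1, 3}, and its second neighbors are {0, 4}.
import networkx as nx

g = nx.path_graph(5)
node = 2
first = set(g.neighbors(node))
second = {s for f in first for s in g.neighbors(f)} - first - {node}
assert first == {1, 3} and second == {0, 4}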
class GraphFeatures(dict):
    def __init__(self, gnx, features, dir_path, logger=None, is_max_connected=False):
        self._base_dir = dir_path
        self._logger = EmptyLogger() if logger is None else logger
        self._matrix = None
        self._gnx = get_max_subgraph(gnx) if is_max_connected else gnx

        self._abbreviations = {abbr: name
                               for name, meta in features.items()
                               for abbr in meta.abbr_set}

        # building the feature calculators data structure
        super(GraphFeatures, self).__init__(
            {name: meta.calculator(self._gnx, logger=logger) for name, meta in features.items()})

    @property
    def graph(self):
        return self._gnx

    def _build_serially(self, include, force_build: bool = False, dump_path: str = None):
        if VERBOSE:
            self._logger.debug("Start building graph features")
        if dump_path is not None and self._gnx is not None:
            pickle.dump(self._gnx, open(self._feature_path("gnx", dump_path), "wb"))
        for name, feature in self.items():
            if force_build or not os.path.exists(self._feature_path(name)):
                is_dumped = dump_path is not None and feature.DUMPABLE
                msg = "Dumped to: %s" % dump_path if is_dumped else "Not dumped"
                feature.build(include=include, msg=msg)
                if is_dumped:
                    self._dump_feature(name, feature, dump_path)
            else:
                self._load_feature(name)
        if VERBOSE:
            self._logger.debug("Finished building graph features")

    # a single process means it is calculated serially
    def build(self, num_processes: int = 1, include: set = None, should_dump: bool = False):  # , exclude: set = None):
        # if exclude is None:
        #     exclude = set()
        if include is None:
            include = set()

        if 1 == num_processes:
            dump_path = None
            if should_dump:
                dump_path = self._base_dir
                if not os.path.exists(dump_path):
                    os.makedirs(dump_path)
            return self._build_serially(include, dump_path=dump_path)

        request_queue = Queue()
        workers = [Worker(request_queue, self, include, logger=self._logger) for _ in range(num_processes)]
        # Starting all workers
        for worker in workers:
            worker.start()

        # Feeding the queue with all the features
        for feature_name in self:
            request_queue.put(feature_name)

        # Sentinel objects to allow clean shutdown: 1 per worker.
        for _ in range(num_processes):
            request_queue.put(None)

        # Joining all workers
        for worker in workers:
            worker.join()

    def _load_feature(self, name):
        if self._gnx is None:
            assert os.path.exists(self._feature_path("gnx")), "Graph is not present in the given directory"
            self._gnx = pickle.load(open(self._feature_path("gnx"), "rb"))
        feature = pickle.load(open(self._feature_path(name), "rb"))
        feature.load_meta({name: getattr(self, name) for name in FeatureCalculator.META_VALUES})
        self[name] = feature
        return self[name]

    def __getattr__(self, name):
        if name not in self:
            if name in self._abbreviations:
                name = self._abbreviations[name]
            else:
                return super(GraphFeatures, self).__getattribute__(name)

        # if obj is already calculated - return it
        obj = self[name]
        if obj.is_loaded:
            return obj

        # if obj is not calculated, check whether it exists on the file system:
        # if it doesn't - calculate it, if it does - load it and return it
        if not os.path.exists(self._feature_path(name)):
            obj.build()
            return obj

        return self._load_feature(name)

    @property
    def features(self):
        return set(self)

    def _feature_path(self, name, dir_path=None):
        if dir_path is None:
            dir_path = self._base_dir
        return os.path.join(dir_path, name + ".pkl")

    def _dump_feature(self, name, feature, dir_path):
        if feature.is_loaded:
            prev_meta = feature.clean_meta()  # in order not to save unnecessary data
            pickle.dump(feature, open(self._feature_path(name, dir_path), "wb"))
            feature.load_meta(prev_meta)

    def dump(self, dir_path=None):
        if dir_path is None:
            dir_path = self._base_dir
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        for name, feature in self.items():
            self._dump_feature(name, feature, dir_path)

    @property
    def shape(self):
        sorted_features = map(at(1), sorted(self.items(), key=at(0)))
        sorted_features = [feature for feature in sorted_features
                           if feature.is_relevant() and feature.is_loaded]
        res = []
        for feature in sorted_features:
            # was res.append((feature.print_name()), feature.shape[1]) -- a TypeError,
            # since list.append takes one argument; append a single (name, width) tuple
            res.append((feature.print_name(), feature.shape[1]))
        return res

    # sparse.csr_matrix(matrix, dtype=np.float32)
    def to_matrix(self, entries_order: list = None, add_ones=False,
                  dtype=None, mtype=np.matrix, should_zscore: bool = True):
        if entries_order is None:
            entries_order = sorted(self._gnx)

        sorted_features = map(at(1), sorted(self.items(), key=at(0)))
        # Consider caching the matrix creation (if it takes a long time)
        sorted_features = [feature for feature in sorted_features
                           if feature.is_relevant() and feature.is_loaded]

        if sorted_features:
            mx = np.hstack([feature.to_matrix(entries_order, mtype=mtype, should_zscore=should_zscore)
                            for feature in sorted_features])
            if add_ones:
                mx = np.hstack([mx, np.ones((mx.shape[0], 1))])
            if dtype is not None:
                mx = mx.astype(dtype)  # astype returns a copy; the original discarded it
        else:
            mx = np.matrix([])

        return mtype(mx)

    def to_dict(self, dtype=None, should_zscore: bool = True):
        mx = self.to_matrix(dtype=dtype, mtype=np.matrix, should_zscore=should_zscore)
        return {node: mx[i, :] for i, node in enumerate(sorted(self._gnx))}
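# The multi-process branch of build() uses the classic sentinel-shutdown
# pattern: feed the queue with work items, then one None per worker. A minimal
# self-contained sketch of that pattern (the worker function and items here
# are illustrative, not the project's Worker class):
from multiprocessing import Process, Queue

def worker(queue):
    while True:
        item = queue.get()
        if item is None:  # sentinel: no more work, shut down cleanly
            break
        print("processing", item)

if __name__ == "__main__":
    q = Queue()
    procs = [Process(target=worker, args=(q,)) for _ in range(2)]
    for p in procs:
        p.start()
    for item in ["degree", "betweenness"]:
        q.put(item)
    for _ in procs:  # one sentinel per worker
        q.put(None)
    for p in procs:
        p.join()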