def __init__(self, source_dataset, target_dataset, args):
    """
    Parameters
    ----------
    source_dataset: Dataset
        Dataset object of the source network.
    target_dataset: Dataset
        Dataset object of the target network.
    args: argparse.Namespace
        Parsed command-line arguments used as model hyperparameters.
    """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    super(DeepLink, self).__init__(source_dataset, target_dataset)

    self.source_dataset = source_dataset
    self.target_dataset = target_dataset
    self.alpha = args.alpha
    self.map_batchsize = args.batch_size_mapping
    self.cuda = args.cuda
    self.embedding_dim = args.embedding_dim
    self.embedding_epochs = args.embedding_epochs
    self.supervised_epochs = args.supervised_epochs
    self.unsupervised_epochs = args.unsupervised_epochs
    self.supervised_lr = args.supervised_lr
    self.unsupervised_lr = args.unsupervised_lr
    self.num_cores = args.num_cores

    # Merge train and test anchor links into one full ground-truth dict.
    gt = load_gt(args.train_dict, source_dataset.id2idx,
                 target_dataset.id2idx, 'dict')
    self.full_gt = {}
    self.full_gt.update(gt)
    test_gt = load_gt(args.groundtruth, source_dataset.id2idx,
                      target_dataset.id2idx, 'dict')
    self.full_gt.update(test_gt)
    # Remap anchor ids to matrix indices.
    self.full_gt = {self.source_dataset.id2idx[k]: self.target_dataset.id2idx[v]
                    for k, v in self.full_gt.items()}
    self.train_dict = {self.source_dataset.id2idx[k]: self.target_dataset.id2idx[v]
                       for k, v in gt.items()}

    self.number_walks = args.number_walks
    self.format = args.format
    self.walk_length = args.walk_length
    self.window_size = args.window_size
    self.top_k = args.top_k
    self.S = None
    self.source_embedding = None
    self.target_embedding = None
    self.source_after_mapping = None
    self.source_train_nodes = np.array(list(self.train_dict.keys()))
    self.source_anchor_nodes = np.array(list(self.train_dict.keys()))
    self.hidden_dim1 = args.hidden_dim1
    self.hidden_dim2 = args.hidden_dim2
    self.seed = args.seed
def create_feature(num_feat, path, path2, groundtruth):
    source_dataset = Dataset(path + "/graphsage/")

    if path2 != "":
        # Pairwise mode: build features for one explicit source/target pair.
        target_dataset = Dataset(path2 + "/graphsage/")
        groundtruth = graph_utils.load_gt(groundtruth, source_dataset.id2idx,
                                          target_dataset.id2idx, 'dict')
        source_nodes = np.array(list(groundtruth.keys()))
        target_nodes = np.array(list(groundtruth.values()))
        source_feats = create_onehot_feature(num_feat, len(source_dataset.G.nodes()))
        target_feats = np.zeros((len(target_dataset.G.nodes()), num_feat))
        # Copy anchor-node features from source to target so aligned nodes
        # share identical features.
        target_feats[target_nodes] = source_feats[source_nodes]
        source_feats2 = np.zeros(source_feats.shape)
        target_feats2 = np.zeros(target_feats.shape)
        # Non-anchor nodes all get the same default feature (first dim = 1).
        source_feats2[:, 0] = 1
        target_feats2[:, 0] = 1
        source_feats2[source_nodes] = source_feats[source_nodes]
        target_feats2[target_nodes] = target_feats[target_nodes]
        np.save(path + "/graphsage/feats.npy", source_feats2)
        np.save(path2 + "/graphsage/feats.npy", target_feats2)
        return

    print("Removing excess files")
    remove_exceed_files(path)
    print("Creating features")
    source_id2idx = source_dataset.id2idx
    if args.keep_old_feats != "":  # note: relies on the module-level `args`
        source_feats = np.load(args.keep_old_feats)
        if source_feats.shape[1] != num_feat:
            # warn only; the old features are kept as-is
            print("Number of features must equal that of the old features")
    else:
        source_feats = create_onehot_feature(num_feat, len(source_dataset.G.nodes()))
    print("Saving source feats")
    np.save(path + "/graphsage/feats.npy", source_feats)

    # Walk the directory tree and propagate features to every "seed*" variant.
    tree_dir = [x[0] for x in os.walk(path)]
    print("Start searching for target dir")
    for dir in tree_dir:
        if "seed" in dir.split("/")[-1]:
            print("Working with {}".format(dir))
            # is a child file
            try:
                target_dataset = Dataset(dir + "/graphsage/")
            except Exception as err:
                print("Error: {}".format(err))
                continue
            target_id2idx = target_dataset.id2idx
            dictionary = graph_utils.load_gt(dir + "/dictionaries/groundtruth",
                                             source_id2idx, target_id2idx, 'dict')
            target_feats = np.zeros((len(target_dataset.G.nodes()), num_feat))
            source_nodes = np.array(list(dictionary.keys()))
            target_nodes = np.array(list(dictionary.values()))
            target_feats[target_nodes] = source_feats[source_nodes]
            np.save(dir + "/graphsage/feats.npy", target_feats)
    print("DONE")
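# The function above calls create_onehot_feature, whose definition is not part
# of this excerpt. A minimal sketch of what such a helper could look like,
# assuming each node is assigned one of `num_feat` one-hot vectors uniformly
# at random (the name and signature come from the call sites above; the body
# is an assumption, not the repository's implementation):
import numpy as np

def create_onehot_feature_sketch(num_feat, num_nodes):
    feats = np.zeros((num_nodes, num_feat))
    # Pick a random feature index for every node and set that entry to 1.
    idx = np.random.randint(0, num_feat, size=num_nodes)
    feats[np.arange(num_nodes), idx] = 1
    return feats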
def __init__(self, source_dataset, target_dataset, args):
    """
    :param source_dataset: source graph
    :param target_dataset: target graph
    :param args: additional config params
    """
    super(EMGCN, self).__init__(source_dataset, target_dataset)
    self.args = args
    self.source_dataset = source_dataset
    self.target_dataset = target_dataset
    self.alpha_att_val = [args.rel, args.att, args.attval]
    self.n_node_s = len(self.source_dataset.G.nodes())
    self.n_node_t = len(self.target_dataset.G.nodes())
    self.full_dict = load_gt(args.groundtruth, source_dataset.id2idx,
                             target_dataset.id2idx, 'dict')
    self.alphas = [1, 1, 1, 1, 1, 1]

    # Keep only the attributes that appear in both graphs.
    self.att_dict1, self.att_dict2 = self.source_dataset.get_raw_att_dicts()
    self.source_att_set = set(self.att_dict1.keys())
    self.target_att_set = set(self.att_dict2.keys())
    self.kept_att = self.source_att_set.intersection(self.target_att_set)
    self.att_dict_inverse1 = {v: k for k, v in self.att_dict1.items()}
    self.att_dict_inverse2 = {v: k for k, v in self.att_dict2.items()}
    self.source_att_value = self.source_dataset.get_the_raw_datastructure(
        self.source_dataset.ent_att_val1, self.att_dict_inverse1, self.kept_att)
    self.target_att_value = self.source_dataset.get_the_raw_datastructure(
        self.source_dataset.ent_att_val2, self.att_dict_inverse2, self.kept_att)
    self.statistic()
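# A toy illustration of the attribute filtering above: only attribute keys
# present in both graphs survive the intersection. The dictionaries here are
# made up for illustration.
att_dict1 = {"name": 0, "birth_year": 1, "city": 2}
att_dict2 = {"name": 0, "city": 1, "occupation": 2}
kept_att = set(att_dict1).intersection(att_dict2)
print(kept_att)  # {'name', 'city'}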
def main(args):
    source_dataset = Dataset(args.source_dataset)
    target_dataset = Dataset(args.target_dataset)
    groundtruth = graph_utils.load_gt(args.groundtruth, source_dataset.id2idx,
                                      target_dataset.id2idx, "dict")
    DataPreprocess.evaluateDataset(source_dataset, target_dataset,
                                   groundtruth, args.output_dir)
def __init__(self, source_dataset, target_dataset, args):
    """
    Parameters
    ----------
    source_dataset: Dataset
        Dataset object of the source network.
    target_dataset: Dataset
        Dataset object of the target network.
    args: argparse.Namespace
        Parsed command-line arguments used as model hyperparameters.
    """
    super(NAWAL, self).__init__(source_dataset, target_dataset)
    # datasets
    self.source_dataset = source_dataset
    self.target_dataset = target_dataset
    # embedding params
    self.args = args
    self.pale_train_anchors = load_gt(args.train_dict, source_dataset.id2idx,
                                      target_dataset.id2idx, 'dict')
    self.train_dict = self.pale_train_anchors
    self.nawal_test_anchors = load_gt(args.test_dict, source_dataset.id2idx,
                                      target_dataset.id2idx, 'dict')
    self.test_dict = self.nawal_test_anchors
    self.source_train_nodes = np.array(list(self.pale_train_anchors.keys()))
    # NAWAL mapping params
    self.decrease_lr = False  # if using the auto-encoder mapping
    self.n_refinement = 5
    self.source_embedding = None
    self.target_embedding = None
    self.mean_cosine = -1
    self.best_valid_metric = -1
    self.best_W = None
    self.encoder = None
    self.decoder = None
def __init__(self, source_dataset, target_dataset, args):
    """
    :param source_dataset: source graph
    :param target_dataset: target graph
    :param args: additional config params
    """
    super(GAlign, self).__init__(source_dataset, target_dataset)
    self.source_dataset = source_dataset
    self.target_dataset = target_dataset
    self.alphas = [args.alpha0, args.alpha1, args.alpha2]
    self.args = args
    self.full_dict = load_gt(args.groundtruth, source_dataset.id2idx,
                             target_dataset.id2idx, 'dict')
def __init__(self, source_dataset, target_dataset, args):
    """
    Parameters
    ----------
    source_dataset: Dataset
        Dataset object of the source network.
    target_dataset: Dataset
        Dataset object of the target network.
    args: argparse.Namespace
        Parsed command-line arguments used as model hyperparameters.
    """
    super(PALE, self).__init__(source_dataset, target_dataset)
    self.source_dataset = source_dataset
    self.target_dataset = target_dataset
    self.source_path = args.source_dataset
    self.emb_batchsize = args.batch_size_embedding
    self.map_batchsize = args.batch_size_mapping
    self.emb_lr = args.learning_rate1
    self.cuda = args.cuda
    self.neg_sample_size = args.neg_sample_size
    self.embedding_dim = args.embedding_dim
    self.emb_epochs = args.embedding_epochs
    self.map_epochs = args.mapping_epochs
    self.mapping_model = args.mapping_model
    self.map_act = args.activate_function
    self.map_lr = args.learning_rate2
    self.embedding_name = args.embedding_name

    gt = load_gt(args.train_dict, source_dataset.id2idx,
                 target_dataset.id2idx, 'dict')
    # Remap anchor ids to matrix indices for training.
    self.gt_train = {self.source_dataset.id2idx[k]: self.target_dataset.id2idx[v]
                     for k, v in gt.items()}
    self.S = None
    self.source_embedding = None
    self.target_embedding = None
    self.source_after_mapping = None
    self.source_train_nodes = np.array(list(self.gt_train.keys()))
def __init__(self, source_dataset, target_dataset, args):
    """
    Parameters
    ----------
    source_dataset: Dataset
        Dataset object of the source network.
    target_dataset: Dataset
        Dataset object of the target network.
    args: argparse.Namespace
        Parsed command-line arguments used as model hyperparameters.
    """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    super(DeepLink, self).__init__(source_dataset, target_dataset)
    self.source_dataset = source_dataset
    self.target_dataset = target_dataset
    self.args = args
    self.known_anchor_links = load_gt(args.train_dict, source_dataset.id2idx,
                                      target_dataset.id2idx, 'dict')
    self.train_dict = self.known_anchor_links
    self.number_walks = args.number_walks
    self.format = args.format
    self.walk_length = args.walk_length
    self.window_size = args.window_size
    self.top_k = args.top_k
    self.S = None
    self.source_embedding = None
    self.target_embedding = None
    self.source_after_mapping = None
    self.source_train_nodes = np.array(list(self.train_dict.keys()))
    self.hidden_dim1 = args.hidden_dim1
    self.hidden_dim2 = args.hidden_dim2
    self.seed = args.seed
def __init__(self, source_dataset, target_dataset, args):
    """
    Parameters
    ----------
    source_dataset: Dataset
        Dataset object of the source network.
    target_dataset: Dataset
        Dataset object of the target network.
    args: argparse.Namespace
        Parsed command-line arguments used as model hyperparameters.
    """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    super(CENALP, self).__init__(source_dataset, target_dataset)
    self.source_dataset = source_dataset
    self.target_dataset = target_dataset
    self.args = args
    self.known_anchor_links = load_gt(args.train_dict, format='dict')
    self.pi = self.known_anchor_links.copy()
    self.cur_iter = 0

    source_deg = self.source_dataset.get_nodes_degrees()
    target_deg = self.target_dataset.get_nodes_degrees()
    self.idx2id_source = {v: k for k, v in self.source_dataset.id2idx.items()}
    self.idx2id_target = {v: k for k, v in self.target_dataset.id2idx.items()}
    self.deg = np.concatenate((source_deg, target_deg))

    # Attribute similarity: inner product of node features, negatives clipped to 0.
    if self.source_dataset.features is not None:
        self.sim_attr = self.source_dataset.features.dot(self.target_dataset.features.T)
        self.sim_attr[self.sim_attr < 0] = 0
    else:
        self.sim_attr = np.zeros((len(self.source_dataset.G.nodes()),
                                  len(self.target_dataset.G.nodes())))
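# A toy check of the attribute-similarity step above: with one-hot features,
# sim_attr[i, j] is 1 exactly when source node i and target node j share the
# same feature, and clipping negatives is a no-op. The arrays are made up for
# illustration.
import numpy as np

source_feats = np.array([[1, 0], [0, 1]])          # two source nodes
target_feats = np.array([[0, 1], [1, 0], [0, 1]])  # three target nodes
sim_attr = source_feats.dot(target_feats.T)
sim_attr[sim_attr < 0] = 0
print(sim_attr)
# [[0 1 0]
#  [1 0 1]]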
    # target_degree = normalize_data(target_degree)
    # distance = source_degree - target_degree
    # return np.random.choice(distance, 300)
    return source_degree[:500], target_degree[:500]


if __name__ == "__main__":
    args = parse_args()
    source_dataset = Dataset(args.source_dataset)
    target_dataset = Dataset(args.target_dataset)
    source_id2idx = source_dataset.id2idx
    target_id2idx = target_dataset.id2idx
    source_idx2id = {v: k for k, v in source_id2idx.items()}
    target_idx2id = {v: k for k, v in target_id2idx.items()}
    groundtruth = graph_utils.load_gt(args.groundtruth, source_id2idx,
                                      target_id2idx, "dict", True)
    source_degree, target_degree = get_distance(source_dataset,
                                                target_dataset, groundtruth)
    data_matrix = np.array([source_degree, target_degree])
    models = ["source graph", "target graph"]
    line_chart(models, data_matrix, "Anchor pairs", "Degree",
               name="degree_flickr.png")
    # exit()
    # source_dataset = Dataset(args.source_dataset2)
    # target_dataset = Dataset(args.target_dataset2)
    # source_id2idx = source_dataset.id2idx
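# line_chart is called above but not defined in this excerpt. A minimal sketch
# of a compatible plotting helper, assuming matplotlib and the call signature
# used above (series labels, a (n_series, n_points) matrix, axis titles, and
# an output file name); the body is an assumption, not the repository's code:
import matplotlib.pyplot as plt
import numpy as np

def line_chart_sketch(labels, data_matrix, xlabel, ylabel, name="chart.png"):
    # Plot one line per series and save the figure to disk.
    for label, series in zip(labels, data_matrix):
        plt.plot(np.arange(len(series)), series, label=label)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    plt.savefig(name)
    plt.close()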
# Note: this excerpt starts mid-function; the def line below is reconstructed
# and its name is an assumption based on what the body computes.
def create_degree_feature(data, dim):
    # Bucket nodes by degree: feature j is 1 when the node's degree falls
    # into the j-th of `dim` equally sized degree bins.
    deg = data.get_nodes_degrees()
    deg = np.array(deg)
    binn = int(max(deg) / dim)
    feature = np.zeros((len(data.G.nodes()), dim))
    for i in range(len(deg)):
        deg_i = deg[i]
        node_i = data.G.nodes()[i]
        node_i_idx = data.id2idx[node_i]
        feature[node_i_idx, int(deg_i / (binn + 1))] = 1
    return feature


def create_feature(data, dim):
    # Random one-hot features: draw uniform noise per row, keep only the
    # argmax position as a 1, and zero out the rest.
    shape = (len(data.G.nodes()), int(dim))
    features = np.random.uniform(size=shape)
    for i, feat in enumerate(features):
        mask = np.ones(feat.shape, dtype=bool)
        mask[feat.argmax()] = False
        feat[~mask] = 1
        feat[mask] = 0
    return features


if __name__ == "__main__":
    args = parse_args()
    data1 = Dataset(args.input_data1)
    data2 = Dataset(args.input_data2)
    ground_truth = load_gt(args.ground_truth, data1.id2idx, data2.id2idx, 'dict')
    feature1, feature2 = create_features(data1, data2, args.feature_dim,
                                         ground_truth)
    np.save(args.input_data1 + '/feats.npy', feature1)
    np.save(args.input_data2 + '/feats.npy', feature2)
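# A quick worked example of the degree binning above, with made-up degrees.
# With dim = 4 and max degree 9: binn = int(9 / 4) = 2, so each node lands in
# bin int(deg / 3): degrees 0-2 -> bin 0, 3-5 -> bin 1, 6-8 -> bin 2, 9 -> bin 3,
# which always stays within the `dim` columns.
import numpy as np

dim = 4
deg = np.array([0, 2, 3, 5, 9])
binn = int(max(deg) / dim)             # 2
bins = (deg / (binn + 1)).astype(int)  # [0, 0, 1, 1, 3]
print(bins)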
    parser_DeepLink.add_argument('--top_k', default=5, type=int)
    parser_DeepLink.add_argument('--alpha', default=0.8, type=float)
    parser_DeepLink.add_argument('--num_cores', default=8, type=int)
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    print(args)
    source_dataset = Dataset(args.source_dataset)
    target_dataset = Dataset(args.target_dataset)
    source_nodes = source_dataset.G.nodes()
    target_nodes = target_dataset.G.nodes()
    groundtruth_matrix = graph_utils.load_gt(args.groundtruth,
                                             source_dataset.id2idx,
                                             target_dataset.id2idx)
    algorithm = args.algorithm

    if algorithm == "IsoRank":
        model = IsoRank(source_dataset, target_dataset, args.H, args.alpha,
                        args.max_iter, args.tol)
    elif algorithm == "FINAL":
        model = FINAL(source_dataset, target_dataset, H=args.H,
                      alpha=args.alpha, maxiter=args.max_iter, tol=args.tol)
    elif algorithm == "REGAL":
        model = REGAL(source_dataset, target_dataset, max_layer=args.max_layer,
                      alpha=args.alpha, k=args.k, num_buckets=args.buckets,
                      gammastruc=args.gammastruc, gammaattr=args.gammaattr,
                      normalize=True, num_top=args.num_top)
    elif algorithm == "BigAlign":
        model = BigAlign(source_dataset, target_dataset, lamb=args.lamb)
    elif algorithm == "IONE":
        model = IONE(source_dataset, target_dataset, gt_train=args.train_dict,
                     epochs=args.epochs, dim=args.dim, seed=args.seed)
    elif algorithm == "PALE":
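# After dispatch, each model exposes align() and the resulting similarity
# matrix is scored against the ground truth (see get_statistics used in
# nawal_mapping below). A minimal sketch of a greedy top-1 accuracy check,
# assuming S has shape (n_source, n_target) and the ground truth is a dict of
# index pairs; the function name is illustrative:
import numpy as np

def top1_accuracy_sketch(S, groundtruth_dict):
    # Count how many source nodes rank their true counterpart first.
    hits = sum(1 for s, t in groundtruth_dict.items() if np.argmax(S[s]) == t)
    return hits / len(groundtruth_dict)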
def nawal_mapping(self):
    # print("Start nawal mapping")
    self.mapping = Mapping(self.args.embedding_dim)
    self.discriminator = Discriminator(self.args.embedding_dim,
                                       self.args.dis_layers,
                                       self.args.dis_hid_dim,
                                       self.args.dis_dropout,
                                       self.args.dis_input_dropout)
    optim_fn, optim_params = get_optimizer(self.args.map_optimizer)
    self.map_optimizer = optim_fn(self.mapping.parameters(), **optim_params)
    optim_fn, optim_params = get_optimizer(self.args.dis_optimizer)
    self.dis_optimizer = optim_fn(self.discriminator.parameters(), **optim_params)
    if self.args.cuda:
        self.mapping = self.mapping.cuda()
        self.discriminator = self.discriminator.cuda()

    nawal_map_epoch_times = []
    for n_epoch in range(self.args.nawal_mapping_epochs):
        print('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
        n_nodes_proc = 0
        stats = {'DIS_COSTS': []}
        for n_iter in range(0, self.args.nawal_mapping_epoch_size,
                            self.args.nawal_mapping_batch_size):
            # discriminator training
            for _ in range(self.args.dis_steps):
                self.dis_step(stats)
            # mapping training (discriminator fooling)
            n_nodes_proc += self.mapping_step()
            # log stats
            if n_iter % 500 == 0:
                stats_str = [('DIS_COSTS', 'Discriminator loss')]
                stats_log = ['%s: %.4f' % (v, np.mean(stats[k]))
                             for k, v in stats_str if len(stats[k]) > 0]
                stats_log.append('%i samples/s' % int(n_nodes_proc / (time.time() - tic)))
                print(('%06i - ' % n_iter) + ' - '.join(stats_log))
                # reset
                tic = time.time()
                n_nodes_proc = 0
                for k, _ in stats_str:
                    del stats[k][:]
        # embeddings / discriminator evaluation
        self.dist_mean_cosine()
        # JSON log / save best model / end of epoch
        self.save_best()
        print('End of epoch %i.\n\n' % n_epoch)
        nawal_map_epoch_times.append(time.time() - tic)
        # update the learning rate (stop if too small)
        self.update_lr()

    self.reload_best()
    self.S = self.calculate_simi_matrix(self.mapping.eval())
    # print("NAWAL before refining")
    groundtruth_matrix = load_gt(self.args.test_dict, self.source_dataset.id2idx,
                                 self.target_dataset.id2idx)
    groundtruth_dict = load_gt(self.args.test_dict, self.source_dataset.id2idx,
                               self.target_dataset.id2idx, 'dict')
    self.nawal_before_refine_acc = get_statistics(self.S, groundtruth_dict,
                                                  groundtruth_matrix)
    # print("Accuracy: {}".format(acc))

    self.mapping.train()
    nawal_refine_epoch_times = []
    # refinement loop
    for n_iter in range(self.n_refinement):
        tic = time.time()
        # build a dictionary from aligned embeddings
        src_emb = self.mapping(self.source_embedding).data
        tgt_emb = self.target_embedding
        dico = build_dictionary(src_emb, tgt_emb, p_keep=0.45)
        # apply the Procrustes solution
        self.procrustes(dico)
        self.dist_mean_cosine()
        self.save_best()
        nawal_refine_epoch_times.append(time.time() - tic)

    self.reload_best()
    S = self.calculate_simi_matrix(self.mapping.eval(), save=True)
    # print("NAWAL after refining")
    groundtruth_matrix = load_gt(self.args.test_dict, self.source_dataset.id2idx,
                                 self.target_dataset.id2idx)
    groundtruth_dict = load_gt(self.args.test_dict, self.source_dataset.id2idx,
                               self.target_dataset.id2idx, 'dict')
    self.nawal_after_refine_acc = get_statistics(S, groundtruth_dict,
                                                 groundtruth_matrix)
    self.log()
    print("NAWAL average map epoch time: {:.4f}".format(np.mean(nawal_map_epoch_times)))
    print("NAWAL average refine epoch time: {:.4f}".format(np.mean(nawal_refine_epoch_times)))
    print("NAWAL average emb epoch time: {:.4f}".format(np.mean(self.epoch_times)))
    return S
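# self.procrustes(dico) above applies the orthogonal Procrustes solution:
# given paired rows X (mapped source) and Y (target), the orthogonal W that
# minimizes ||XW - Y||_F is U V^T from the SVD of X^T Y. A minimal PyTorch
# sketch under that standard formulation (names are illustrative; this is not
# necessarily the repository's exact implementation):
import torch

def procrustes_sketch(src_emb, tgt_emb, dico):
    # dico: LongTensor of shape (n_pairs, 2) with (source_idx, target_idx) rows.
    X = src_emb[dico[:, 0]]
    Y = tgt_emb[dico[:, 1]]
    M = X.t().mm(Y)          # d x d cross-correlation matrix
    U, S, V = torch.svd(M)   # M = U diag(S) V^T
    W = U.mm(V.t())          # orthogonal mapping
    return W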
    # frequently changed
    parser_EMGCN.add_argument('--num_each_refine', type=int, default=100)
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    print(args)
    start_time = time()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    source_dataset = Dataset(args.source_dataset, args.dataset_name)
    target_dataset = Dataset(args.target_dataset)
    groundtruth = graph_utils.load_gt(args.groundtruth, source_dataset.id2idx,
                                      target_dataset.id2idx, 'dict')
    algorithm = args.algorithm

    if algorithm == "EMGCN":
        model = EMGCN(source_dataset, target_dataset, args)
    else:
        raise Exception("Unsupported algorithm")

    S = model.align()
    # Evaluate the alignment in both directions.
    for i in range(2):
        if i == 1:
            print("right to left...")
        else:
            print("left to right...")
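# The excerpt ends before the body of the two-direction loop above. A minimal
# sketch of what each pass could compute, assuming S has shape
# (n_source, n_target) and the ground truth is a dict of index pairs; the
# function name and body are illustrative, not the repository's code:
import numpy as np

def directional_top1_sketch(S, groundtruth_dict):
    # left to right: for each source index, take the best-scoring target.
    l2r = {s: int(np.argmax(S[s])) for s in groundtruth_dict}
    # right to left: transpose S and match from the target side.
    r2l = {t: int(np.argmax(S.T[t])) for t in groundtruth_dict.values()}
    return l2r, r2l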