Example #1
    def __init__(self, source_dataset, target_dataset, args):
        """
        Parameters
        ----------
        source_dataset: Dataset
            Dataset object of source dataset
        target_dataset: Dataset
            Dataset object of target dataset
        args: argparse.Namespace
            parsed command-line arguments that configure the model.
        """
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

        super(DeepLink, self).__init__(source_dataset, target_dataset)
        self.source_dataset = source_dataset
        self.target_dataset = target_dataset
        self.alpha = args.alpha
        self.map_batchsize = args.batch_size_mapping
        self.cuda = args.cuda
        self.embedding_dim = args.embedding_dim
        self.embedding_epochs = args.embedding_epochs
        self.supervised_epochs = args.supervised_epochs
        self.unsupervised_epochs = args.unsupervised_epochs
        self.supervised_lr = args.supervised_lr
        self.unsupervised_lr = args.unsupervised_lr
        self.num_cores = args.num_cores

        gt = load_gt(args.train_dict, source_dataset.id2idx,
                     target_dataset.id2idx, 'dict')
        self.full_gt = {}
        self.full_gt.update(gt)
        test_gt = load_gt(args.groundtruth, source_dataset.id2idx,
                          target_dataset.id2idx, 'dict')
        self.full_gt.update(test_gt)
        self.full_gt = {
            self.source_dataset.id2idx[k]: self.target_dataset.id2idx[v]
            for k, v in self.full_gt.items()
        }
        self.train_dict = {
            self.source_dataset.id2idx[k]: self.target_dataset.id2idx[v]
            for k, v in gt.items()
        }

        self.number_walks = args.number_walks
        self.format = args.format
        self.walk_length = args.walk_length
        self.window_size = args.window_size
        self.top_k = args.top_k

        self.S = None
        self.source_embedding = None
        self.target_embedding = None
        self.source_after_mapping = None
        self.source_train_nodes = np.array(list(self.train_dict.keys()))
        self.source_anchor_nodes = np.array(list(self.train_dict.keys()))

        self.hidden_dim1 = args.hidden_dim1
        self.hidden_dim2 = args.hidden_dim2
        self.seed = args.seed
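Every example on this page calls `load_gt`, whose implementation is not shown here. The following is a minimal sketch inferred from the call sites above (a file of anchor pairs, optional id2idx remapping, and a 'dict' vs. matrix output mode); the real `graph_utils.load_gt` may differ and takes extra options at some call sites that are not modeled here.

import numpy as np

def load_gt_sketch(path, src_id2idx=None, tgt_id2idx=None, format='matrix'):
    # Read whitespace-separated "source_id target_id" anchor pairs.
    gt = {}
    with open(path) as f:
        for line in f:
            src, tgt = line.split()
            gt[src] = tgt
    if format == 'dict':
        return gt  # keyed by raw ids; Example #1 remaps keys via id2idx
    # Otherwise build a dense 0/1 alignment matrix indexed through id2idx.
    S = np.zeros((len(src_id2idx), len(tgt_id2idx)))
    for src, tgt in gt.items():
        S[src_id2idx[src], tgt_id2idx[tgt]] = 1
    return S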
Example #2
def create_feature(num_feat, path, path2, groundtruth):
    source_dataset = Dataset(path + "/graphsage/")
    if path2 != "":
        target_dataset = Dataset(path2 + "/graphsage/")
        groundtruth = graph_utils.load_gt(groundtruth, source_dataset.id2idx,
                                          target_dataset.id2idx, 'dict')
        source_nodes = np.array(list(groundtruth.keys()))
        target_nodes = np.array(list(groundtruth.values()))
        source_feats = create_onehot_feature(num_feat,
                                             len(source_dataset.G.nodes()))
        target_feats = np.zeros((len(target_dataset.G.nodes()), num_feat))
        target_feats[target_nodes] = source_feats[source_nodes]
        source_feats2 = np.zeros(source_feats.shape)
        target_feats2 = np.zeros(target_feats.shape)
        source_feats2[:, 0] = 1
        target_feats2[:, 0] = 1
        source_feats2[source_nodes] = source_feats[source_nodes]
        target_feats2[target_nodes] = target_feats[target_nodes]
        np.save(path + "/graphsage/feats.npy", source_feats2)
        np.save(path2 + "/graphsage/feats.npy", target_feats2)
        return
    print("Remove exceed file")
    remove_exceed_files(path)
    print("Creating features")
    source_id2idx = source_dataset.id2idx
    # Note: args is not a parameter of this function; it is assumed to be a
    # module-level namespace from parse_args().
    if args.keep_old_feats != "":
        source_feats = np.load(args.keep_old_feats)
        if source_feats.shape[1] != num_feat:
            print("Number of features must match the old feature file")
    else:
        source_feats = create_onehot_feature(num_feat,
                                             len(source_dataset.G.nodes()))
    print("Saving source feats")
    np.save(path + "/graphsage/feats.npy", source_feats)
    tree_dir = [x[0] for x in os.walk(path)]
    print("Start searching for target dir")
    for dir_name in tree_dir:  # renamed from `dir` to avoid shadowing the builtin
        if "seed" in dir_name.split("/")[-1]:
            print("Working with {}".format(dir_name))
            # is a child file
            try:
                target_dataset = Dataset(dir_name + "/graphsage/")
            except Exception as err:
                print("Error: {}".format(err))
                continue
            target_id2idx = target_dataset.id2idx
            dictionary = graph_utils.load_gt(dir_name + "/dictionaries/groundtruth",
                                             source_id2idx, target_id2idx,
                                             'dict')
            target_feats = np.zeros((len(target_dataset.G.nodes()), num_feat))
            source_nodes = np.array(list(dictionary.keys()))
            target_nodes = np.array(list(dictionary.values()))
            target_feats[target_nodes] = source_feats[source_nodes]
            np.save(dir_name + "/graphsage/feats.npy", target_feats)
    print("DONE")
Example #3
    def __init__(self, source_dataset, target_dataset, args):
        """
        :param source_dataset: source graph
        :param target_dataset: target graph
        :param args: more config params
        """
        super(EMGCN, self).__init__(source_dataset, target_dataset)
        self.args = args
        self.source_dataset = source_dataset
        self.target_dataset = target_dataset
        self.alpha_att_val = [args.rel, args.att, args.attval]
        self.n_node_s = len(self.source_dataset.G.nodes())
        self.n_node_t = len(self.target_dataset.G.nodes())
        self.full_dict = load_gt(args.groundtruth, source_dataset.id2idx,
                                 target_dataset.id2idx, 'dict')
        self.alphas = [1, 1, 1, 1, 1, 1]
        self.att_dict1, self.att_dict2 = self.source_dataset.get_raw_att_dicts()
        self.source_att_set = set(self.att_dict1.keys())
        self.target_att_set = set(self.att_dict2.keys())
        self.kept_att = self.source_att_set.intersection(self.target_att_set)
        self.att_dict_inverse1 = {v: k for k, v in self.att_dict1.items()}
        self.att_dict_inverse2 = {v: k for k, v in self.att_dict2.items()}
        self.source_att_value = self.source_dataset.get_the_raw_datastructure(
            self.source_dataset.ent_att_val1, self.att_dict_inverse1,
            self.kept_att)
        self.target_att_value = self.source_dataset.get_the_raw_datastructure(
            self.source_dataset.ent_att_val2, self.att_dict_inverse2,
            self.kept_att)
        self.statistic()
Example #4
def main(args):
    source_dataset = Dataset(args.source_dataset)
    target_dataset = Dataset(args.target_dataset)
    groundtruth = graph_utils.load_gt(args.groundtruth, source_dataset.id2idx,
                                      target_dataset.id2idx, "dict")
    DataPreprocess.evaluateDataset(source_dataset, target_dataset, groundtruth,
                                   args.output_dir)
Example #5
    def __init__(self, source_dataset, target_dataset, args):
        """
        Parameters
        ----------
        source_dataset: Dataset
            Dataset object of source dataset
        target_dataset: Dataset
            Dataset object of target dataset
        args: argparse.Namespace
            parsed command-line arguments that configure the model.
        """

        super(NAWAL, self).__init__(source_dataset, target_dataset)
        # dataset
        self.source_dataset = source_dataset
        self.target_dataset = target_dataset

        # embedding_params
        self.args = args

        self.pale_train_anchors = load_gt(args.train_dict,
                                          source_dataset.id2idx,
                                          target_dataset.id2idx, 'dict')
        self.train_dict = self.pale_train_anchors
        self.nawal_test_anchors = load_gt(args.test_dict,
                                          source_dataset.id2idx,
                                          target_dataset.id2idx, 'dict')
        self.test_dict = self.nawal_test_anchors
        self.source_train_nodes = np.array(list(
            self.pale_train_anchors.keys()))

        # nawal_mapping_params
        self.decrease_lr = False
        # used when the auto-encoder mapping is enabled
        self.n_refinement = 5

        self.source_embedding = None
        self.target_embedding = None
        self.mean_cosine = -1
        self.best_valid_metric = -1
        self.best_W = None
        self.encoder = None
        self.decoder = None
Example #6
    def __init__(self, source_dataset, target_dataset, args):
        """
        :param source_dataset: source graph
        :param target_dataset: target graph
        :param args: more config params
        """
        super(GAlign, self).__init__(source_dataset, target_dataset)
        self.source_dataset = source_dataset
        self.target_dataset = target_dataset
        self.alphas = [args.alpha0, args.alpha1, args.alpha2]
        self.args = args
        self.full_dict = load_gt(args.groundtruth, source_dataset.id2idx,
                                 target_dataset.id2idx, 'dict')
Example #7
    def __init__(self, source_dataset, target_dataset, args):
        """
        Parameters
        ----------
        source_dataset: Dataset
            Dataset object of source dataset
        target_dataset: Dataset
            Dataset object of target dataset
        args: argparse.Namespace
            parsed command-line arguments that configure the model.
        """

        super(PALE, self).__init__(source_dataset, target_dataset)
        self.source_dataset = source_dataset
        self.target_dataset = target_dataset
        self.source_path = args.source_dataset

        self.emb_batchsize = args.batch_size_embedding
        self.map_batchsize = args.batch_size_mapping
        self.emb_lr = args.learning_rate1
        self.cuda = args.cuda
        self.neg_sample_size = args.neg_sample_size
        self.embedding_dim = args.embedding_dim
        self.emb_epochs = args.embedding_epochs
        self.map_epochs = args.mapping_epochs
        self.mapping_model = args.mapping_model
        self.map_act = args.activate_function
        self.map_lr = args.learning_rate2
        self.embedding_name = args.embedding_name

        gt = load_gt(args.train_dict, source_dataset.id2idx,
                     target_dataset.id2idx, 'dict')
        self.gt_train = {
            self.source_dataset.id2idx[k]: self.target_dataset.id2idx[v]
            for k, v in gt.items()
        }

        self.S = None
        self.source_embedding = None
        self.target_embedding = None
        self.source_after_mapping = None
        self.source_train_nodes = np.array(list(self.gt_train.keys()))
Example #8
    def __init__(self, source_dataset, target_dataset, args):
        """
        Parameters
        ----------
        source_dataset: Dataset
            Dataset object of source dataset
        target_dataset: Dataset
            Dataset object of target dataset
        args: argparse.Namespace
            parsed command-line arguments that configure the model.
        """
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

        super(DeepLink, self).__init__(source_dataset, target_dataset)
        self.source_dataset = source_dataset
        self.target_dataset = target_dataset
        self.args = args

        self.known_anchor_links = load_gt(args.train_dict,
                                          source_dataset.id2idx,
                                          target_dataset.id2idx, 'dict')
        self.train_dict = self.known_anchor_links
        self.number_walks = args.number_walks
        self.format = args.format
        self.walk_length = args.walk_length
        self.window_size = args.window_size
        self.top_k = args.top_k

        self.S = None
        self.source_embedding = None
        self.target_embedding = None
        self.source_after_mapping = None
        self.source_train_nodes = np.array(list(self.train_dict.keys()))

        self.hidden_dim1 = args.hidden_dim1
        self.hidden_dim2 = args.hidden_dim2
        self.seed = args.seed
Example #9
    def __init__(self, source_dataset, target_dataset, args):
        """
        Parameters
        ----------
        source_dataset: Dataset
            Dataset object of source dataset
        target_dataset: Dataset
            Dataset object of target dataset
        args: argparse.Namespace
            parsed command-line arguments that configure the model.
        """
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

        super(CENALP, self).__init__(source_dataset, target_dataset)
        self.source_dataset = source_dataset
        self.target_dataset = target_dataset
        self.args = args

        self.known_anchor_links = load_gt(args.train_dict, format='dict')
        self.pi = self.known_anchor_links.copy()
        self.cur_iter = 0

        source_deg = self.source_dataset.get_nodes_degrees()
        target_deg = self.target_dataset.get_nodes_degrees()

        self.idx2id_source = {v: k for k, v in self.source_dataset.id2idx.items()}
        self.idx2id_target = {v: k for k, v in self.target_dataset.id2idx.items()}

        self.deg = np.concatenate((source_deg, target_deg))

        if self.source_dataset.features is not None:
            self.sim_attr = self.source_dataset.features.dot(self.target_dataset.features.T)
            self.sim_attr[self.sim_attr < 0] = 0
        else:
            self.sim_attr = np.zeros((len(self.source_dataset.G.nodes()), len(self.target_dataset.G.nodes())))
Example #10
    # target_degree = normalize_data(target_degree)
    # distance = source_degree - target_degree
    # return np.random.choice(distance, 300)
    return source_degree[:500], target_degree[:500]


if __name__ == "__main__":
    args = parse_args()
    source_dataset = Dataset(args.source_dataset)
    target_dataset = Dataset(args.target_dataset)

    source_id2idx = source_dataset.id2idx
    target_id2idx = target_dataset.id2idx
    source_idx2id = {v: k for k, v in source_id2idx.items()}
    target_idx2id = {v: k for k, v in target_id2idx.items()}
    groundtruth = graph_utils.load_gt(args.groundtruth, source_id2idx,
                                      target_id2idx, "dict", True)

    source_degree, target_degree = get_distance(source_dataset, target_dataset,
                                                groundtruth)
    data_matrix = np.array([source_degree, target_degree])
    models = ["source graph", "target graph"]
    line_chart(models,
               data_matrix,
               "Anchor pairs",
               "Degree",
               name="degree_flickr.png")
    # exit()
    # source_dataset = Dataset(args.source_dataset2)
    # target_dataset = Dataset(args.target_dataset2)

    # source_id2idx = source_dataset.id2idx
Example #11
    # One-hot degree-bucket features: each node gets a 1 in the bucket its
    # degree falls into.
    deg = data.get_nodes_degrees()
    deg = np.array(deg)
    binn = int(max(deg) / dim)
    feature = np.zeros((len(data.G.nodes()), dim))
    nodes = list(data.G.nodes())  # G.nodes() is not indexable in networkx >= 2.0
    for i in range(len(deg)):
        deg_i = deg[i]
        node_i = nodes[i]
        node_i_idx = data.id2idx[node_i]
        feature[node_i_idx, int(deg_i / (binn + 1))] = 1
    return feature

def create_feature(data, dim):
    # Random features collapsed to one-hot: keep a 1 only at each row's
    # argmax position and zero out everything else.
    shape = (len(data.G.nodes()), int(dim))
    features = np.random.uniform(size=shape)
    for i, feat in enumerate(features):
        mask = np.ones(feat.shape, dtype=bool)
        mask[feat.argmax()] = False
        feat[~mask] = 1
        feat[mask] = 0
    return features


if __name__ == "__main__":
    args = parse_args()
    data1 = Dataset(args.input_data1)
    data2 = Dataset(args.input_data2)
    ground_truth = load_gt(args.ground_truth, data1.id2idx, data2.id2idx, 'dict')
    feature1, feature2 = create_features(data1, data2, args.feature_dim, ground_truth)
    np.save(args.input_data1 + '/feats.npy', feature1)
    np.save(args.input_data2 + '/feats.npy', feature2)
Example #12
    parser_DeepLink.add_argument('--top_k',               default=5, type=int)
    parser_DeepLink.add_argument('--alpha',               default=0.8, type=float)
    parser_DeepLink.add_argument('--num_cores',           default=8, type=int)


    return parser.parse_args()

if __name__ == '__main__':
    args = parse_args()
    print(args)

    source_dataset = Dataset(args.source_dataset)
    target_dataset = Dataset(args.target_dataset)
    source_nodes = source_dataset.G.nodes()
    target_nodes = target_dataset.G.nodes()
    groundtruth_matrix = graph_utils.load_gt(args.groundtruth, source_dataset.id2idx, target_dataset.id2idx)

    algorithm = args.algorithm

    if (algorithm == "IsoRank"):
        model = IsoRank(source_dataset, target_dataset, args.H, args.alpha, args.max_iter, args.tol)
    elif (algorithm == "FINAL"):
        model = FINAL(source_dataset, target_dataset, H=args.H, alpha=args.alpha, maxiter=args.max_iter, tol=args.tol)
    elif (algorithm == "REGAL"):
        model = REGAL(source_dataset, target_dataset, max_layer=args.max_layer, alpha=args.alpha, k=args.k, num_buckets=args.buckets,
                      gammastruc = args.gammastruc, gammaattr = args.gammaattr, normalize=True, num_top=args.num_top)
    elif algorithm == "BigAlign":
        model = BigAlign(source_dataset, target_dataset, lamb=args.lamb)
    elif algorithm == "IONE":
        model = IONE(source_dataset, target_dataset, gt_train=args.train_dict, epochs=args.epochs, dim=args.dim, seed=args.seed)
    elif algorithm == "PALE":
Example #13
    def nawal_mapping(self):
        #print("Start nawal mapping")
        self.mapping = Mapping(self.args.embedding_dim)
        self.discriminator = Discriminator(self.args.embedding_dim,
                                           self.args.dis_layers,
                                           self.args.dis_hid_dim,
                                           self.args.dis_dropout,
                                           self.args.dis_input_dropout)

        optim_fn, optim_params = get_optimizer(self.args.map_optimizer)
        self.map_optimizer = optim_fn(self.mapping.parameters(),
                                      **optim_params)
        optim_fn, optim_params = get_optimizer(self.args.dis_optimizer)
        self.dis_optimizer = optim_fn(self.discriminator.parameters(),
                                      **optim_params)

        if self.args.cuda:
            self.mapping = self.mapping.cuda()
            self.discriminator = self.discriminator.cuda()
        nawal_map_epoch_times = []
        for n_epoch in range(self.args.nawal_mapping_epochs):
            print('Starting adversarial training epoch %i...' % n_epoch)
            tic = time.time()
            n_nodes_proc = 0
            stats = {'DIS_COSTS': []}

            for n_iter in range(0, self.args.nawal_mapping_epoch_size,
                                self.args.nawal_mapping_batch_size):
                # discriminator training
                for _ in range(self.args.dis_steps):
                    self.dis_step(stats)

                # mapping training (discriminator fooling)
                n_nodes_proc += self.mapping_step()

                # log stats
                if n_iter % 500 == 0:
                    stats_str = [('DIS_COSTS', 'Discriminator loss')]
                    stats_log = [
                        '%s: %.4f' % (v, np.mean(stats[k]))
                        for k, v in stats_str if len(stats[k]) > 0
                    ]
                    stats_log.append('%i samples/s' % int(n_nodes_proc /
                                                          (time.time() - tic)))
                    print(('%06i - ' % n_iter) + ' - '.join(stats_log))

                    # reset
                    tic = time.time()
                    n_nodes_proc = 0
                    for k, _ in stats_str:
                        del stats[k][:]
            # embeddings / discriminator evaluation
            self.dist_mean_cosine()

            # JSON log / save best model / end of epoch
            self.save_best()

            print('End of epoch %i.\n\n' % n_epoch)
            nawal_map_epoch_times.append(time.time() - tic)
            # update the learning rate (stop if too small)
            self.update_lr()

        self.reload_best()
        self.S = self.calculate_simi_matrix(self.mapping.eval())
        # print("NAWAL before refining")
        groundtruth_matrix = load_gt(self.args.test_dict,
                                     self.source_dataset.id2idx,
                                     self.target_dataset.id2idx)
        groundtruth_dict = load_gt(self.args.test_dict,
                                   self.source_dataset.id2idx,
                                   self.target_dataset.id2idx, 'dict')
        self.nawal_before_refine_acc = get_statistics(self.S, groundtruth_dict,
                                                      groundtruth_matrix)
        # print("Accuracy: {}".format(acc))

        self.mapping.train()
        nawal_refine_epoch_times = []
        # training loop
        for n_iter in range(self.n_refinement):
            tic = time.time()
            # build a dictionary from aligned embeddings
            src_emb = self.mapping(self.source_embedding).data
            tgt_emb = self.target_embedding
            dico = build_dictionary(src_emb, tgt_emb, p_keep=0.45)
            # apply the Procrustes solution
            self.procrustes(dico)
            self.dist_mean_cosine()
            self.save_best()
            nawal_refine_epoch_times.append(time.time() - tic)
        self.reload_best()

        S = self.calculate_simi_matrix(self.mapping.eval(), save=True)
        # print("Nawal after refining")
        groundtruth_matrix = load_gt(self.args.test_dict,
                                     self.source_dataset.id2idx,
                                     self.target_dataset.id2idx)
        groundtruth_dict = load_gt(self.args.test_dict,
                                   self.source_dataset.id2idx,
                                   self.target_dataset.id2idx, 'dict')
        self.nawal_after_refine_acc = get_statistics(S, groundtruth_dict,
                                                     groundtruth_matrix)
        self.log()
        print("NAWAL average map epoch time: {:.4f}".format(
            np.mean(nawal_map_epoch_times)))
        print("NAWAL average refine epoch time: {:.4f}".format(
            np.mean(nawal_refine_epoch_times)))
        print("NAWAL average emb epoch time: {:.4f}".format(
            np.mean(self.epoch_times)))
        return S
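The refinement loop in Example #13 calls `self.procrustes(dico)` and `build_dictionary(...)`, neither of which appears on this page. The surrounding code closely mirrors the MUSE trainer, so the Procrustes step presumably applies the closed-form orthogonal solution; a sketch under that assumption, with illustrative names, where `mapping` is assumed to be an nn.Linear(d, d, bias=False):

import torch

def procrustes_sketch(mapping, src_emb, tgt_emb, dico):
    # dico[:, 0] -> dico[:, 1] are the anchor pairs built from aligned embeddings.
    A = src_emb[dico[:, 0]]          # source anchors
    B = tgt_emb[dico[:, 1]]          # matched target anchors
    M = B.t().mm(A)                  # d x d cross-correlation matrix
    U, S, Vh = torch.linalg.svd(M)   # M = U diag(S) Vh
    W = U.mm(Vh)                     # optimal orthogonal map from source to target space
    mapping.weight.data.copy_(W)     # overwrite the linear mapping in place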
Example #14
    # often change
    parser_EMGCN.add_argument('--num_each_refine', type=int, default=100)

    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    print(args)
    start_time = time()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    source_dataset = Dataset(args.source_dataset, args.dataset_name)
    target_dataset = Dataset(args.target_dataset)
    groundtruth = graph_utils.load_gt(args.groundtruth, source_dataset.id2idx,
                                      target_dataset.id2idx, 'dict')

    algorithm = args.algorithm

    if algorithm == "EMGCN":
        model = EMGCN(source_dataset, target_dataset, args)
    else:
        raise Exception("Unsupported algorithm")

    S = model.align()

    for i in range(2):
        if i == 1:
            print("right to left...")
        else:
            print("left to right...")