Example no. 1
0
def create_feature(num_feat, path, path2, groundtruth):
    source_dataset = Dataset(path + "/graphsage/")
    if path2 != "":
        target_dataset = Dataset(path2 + "/graphsage/")
        groundtruth = graph_utils.load_gt(groundtruth, source_dataset.id2idx,
                                          target_dataset.id2idx, 'dict')
        source_nodes = np.array(list(groundtruth.keys()))
        target_nodes = np.array(list(groundtruth.values()))
        source_feats = create_onehot_feature(num_feat,
                                             len(source_dataset.G.nodes()))
        target_feats = np.zeros((len(target_dataset.G.nodes()), num_feat))
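        # Copy each matched source node's one-hot feature onto its groundtruth
        # counterpart in the target graph.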
        target_feats[target_nodes] = source_feats[source_nodes]
        source_feats2 = np.zeros(source_feats.shape)
        target_feats2 = np.zeros(target_feats.shape)
        source_feats2[:, 0] = 1
        target_feats2[:, 0] = 1
        source_feats2[source_nodes] = source_feats[source_nodes]
        target_feats2[target_nodes] = target_feats[target_nodes]
        np.save(path + "/graphsage/feats.npy", source_feats2)
        np.save(path2 + "/graphsage/feats.npy", target_feats2)
        return
    print("Remove exceed file")
    remove_exceed_files(path)
    print("Creating features")
    source_id2idx = source_dataset.id2idx
    if args.keep_old_feats != "":
        source_feats = np.load(args.keep_old_feats)
        if source_feats.shape[1] != num_feat:
            print("Number of feat must equal to the old features")
    else:
        source_feats = create_onehot_feature(num_feat,
                                             len(source_dataset.G.nodes()))
    print("Saving source feats")
    np.save(path + "/graphsage/feats.npy", source_feats)
    tree_dir = [x[0] for x in os.walk(path)]
    print("Start searching for target dir")
    for directory in tree_dir:
        if "seed" in directory.split("/")[-1]:
            print("Working with {}".format(directory))
            # each "seed" sub-directory is expected to hold a target graph
            try:
                target_dataset = Dataset(directory + "/graphsage/")
            except Exception as err:
                print("Error: {}".format(err))
                continue
            target_id2idx = target_dataset.id2idx
            dictionary = graph_utils.load_gt(
                directory + "/dictionaries/groundtruth",
                source_id2idx, target_id2idx, 'dict')
            target_feats = np.zeros((len(target_dataset.G.nodes()), num_feat))
            source_nodes = np.array(list(dictionary.keys()))
            target_nodes = np.array(list(dictionary.values()))
            target_feats[target_nodes] = source_feats[source_nodes]
            np.save(directory + "/graphsage/feats.npy", target_feats)
    print("DONE")
Example no. 2
0
def main(args):

    source_dataset = Dataset(args.prefix1)
    target_dataset = Dataset(args.prefix2)

    model = REGAL(source_dataset,
                  target_dataset,
                  max_layer=args.max_layer,
                  alpha=args.alpha,
                  k=args.k,
                  num_buckets=args.buckets,
                  gammastruc=args.gammastruc,
                  gammaattr=args.gammaattr,
                  normalize=True,
                  num_top=args.num_top)
    S = model.align()
Example no. 3
0
def get_test_data(files, vocab, glove, batch_size=1):
    dataset = Dataset(files)
    dataset.set_pad_indices(vocab)
    dataset.create(vocab)
    dataset.add_glove_vecs(glove)
    dataloader = data.DataLoader(dataset,
                                 batch_size=batch_size,
                                 collate_fn=dataset.collate_fn)
    return dataloader
Example no. 4
0
    def generate_random_clone_synthetic(self,
                                        p_new_connection,
                                        p_remove_connection,
                                        p_change_feats=None):
        print("===============")
        dataset = Dataset(self.networkx_dir)
        G = random_clone_synthetic(dataset, p_new_connection,
                                   p_remove_connection, self.seed)
        self._save_graph(G, self.output_dir1, p_change_feats)
Example no. 5
0
    def test_create_from_files(self):
        """Test dataset creation from files."""
        feature_file = "{}/{}".format(self.TEST_FOLDER, "train.feat")
        text_file = "{}/{}".format(self.TEST_FOLDER, "train.text")

        dataset = Dataset.create_dataset_from_files(feature_file, text_file)
        self.assertEqual(len(dataset.documents), 4)
        self.assertEqual(len(dataset.documents[0].X),
                         len(dataset.documents[0].Y))
        self.assertEqual(len(dataset.documents[0].X),
                         len(dataset.documents[0].text))
Example no. 6
0
def get_train_data(files, glove_file, batch_size=1):
    vocab = Vocab(files)
    vocab.add_padunk_vocab()
    vocab.create()

    glove = Glove(glove_file)
    glove.create(vocab)

    dataset = Dataset(files)
    dataset.set_pad_indices(vocab)
    dataset.create(vocab)
    dataset.add_glove_vecs(glove)

    dataloader = data.DataLoader(dataset,
                                 batch_size=batch_size,
                                 collate_fn=dataset.collate_fn)
    return dataloader, vocab, glove
Example no. 7
0
def create_H(args):
    source_dataset = Dataset(args.source_dataset)
    target_dataset = Dataset(args.target_dataset)
    src_degrees = np.array(source_dataset.get_nodes_degrees())
    trg_degrees = np.array(target_dataset.get_nodes_degrees())
    # import pdb
    # pdb.set_trace()

    # src_degrees = src_degrees/src_degrees.max()
    # trg_degrees = trg_degrees/trg_degrees.max()
    #
    # distance_matrix = np.zeros((len(src_degrees), len(trg_degrees)))
    # for src_idx, src_deg in enumerate(src_degrees):
    #     for trg_idx, trg_deg in enumerate(trg_degrees):
    #         distance_matrix[src_idx,trg_idx] = np.abs(src_deg-trg_deg)
    # max_distance = distance_matrix.max()
    # H = 1-distance_matrix/max_distance
    # H = H.T

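    # Row i: absolute degree difference between target node i and every source
    # node, scaled by the larger of the source graph's maximum degree and
    # target node i's degree; H is then normalized to sum to 1.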
    H = np.zeros((len(trg_degrees), len(src_degrees)))
    for i in range(H.shape[0]):
        H[i, :] = np.abs(trg_degrees[i] - src_degrees) / max(
            [src_degrees.max(), trg_degrees[i]])
    H = H / H.sum()

    # H = np.zeros((len(trg_degrees),len(src_degrees)))
    # for i, trg_deg in enumerate(trg_degrees):
    #     for j, src_deg in enumerate(src_degrees):
    #         H[i,j]=1-min([src_deg,trg_deg])/max([src_deg,trg_deg])
    # idxs_trg = np.random.choice(H.shape[0],2000000,replace=True)
    # idxs_src = np.random.choice(H.shape[1],2000000,replace=True)
    # H[idxs_trg,idxs_src]=0

    print("H shape: ", H.shape)
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    np.save(args.out_dir + "/H2.npy", H)
    print("H has been saved to ", args.out_dir)
Example no. 8
0
    source_gt_index = groundtruth.keys()
    target_gt_index = groundtruth.values()
    source_degree = get_degree_array(source_dataset, source_idx2id,
                                     source_gt_index)
    # source_degree = normalize_data(source_degree)
    target_degree = get_degree_array(target_dataset, target_idx2id,
                                     target_gt_index)
    # target_degree = normalize_data(target_degree)
    # distance = source_degree - target_degree
    # return np.random.choice(distance, 300)
    return source_degree[:500], target_degree[:500]


if __name__ == "__main__":
    args = parse_args()
    source_dataset = Dataset(args.source_dataset)
    target_dataset = Dataset(args.target_dataset)

    source_id2idx = source_dataset.id2idx
    target_id2idx = target_dataset.id2idx
    source_idx2id = {v: k for k, v in source_id2idx.items()}
    target_idx2id = {v: k for k, v in target_id2idx.items()}
    groundtruth = graph_utils.load_gt(args.groundtruth, source_id2idx,
                                      target_id2idx, "dict", True)

    source_degree, target_degree = get_distance(source_dataset, target_dataset,
                                                groundtruth)
    data_matrix = np.array([source_degree, target_degree])
    models = ["source graph", "target graph"]
    line_chart(models,
               data_matrix,
Example no. 9
0
    deg = data.get_nodes_degrees()
    deg = np.array(deg)
    binn = int(max(deg) / dim)
    feature = np.zeros((len(data.G.nodes()), dim))
    for i in range(len(deg)):
        deg_i = deg[i]
        node_i = data.G.nodes()[i]
        node_i_idx = data.id2idx[node_i]
        feature[node_i_idx, int(deg_i / (binn + 1))] = 1
    return feature

def create_feature(data, dim):
    shape = (len(data.G.nodes()), int(dim))
    features = np.random.uniform(size=shape)
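    # Keep only the largest entry in each random row, producing one random
    # one-hot feature vector per node.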
    for i, feat in enumerate(features):
        mask = np.ones(feat.shape, dtype=bool)
        mask[feat.argmax()] = False
        feat[~mask] = 1
        feat[mask] = 0
    return features


if __name__ == "__main__":
    args = parse_args()
    data1 = Dataset(args.input_data1)
    data2 = Dataset(args.input_data2)
    ground_truth = load_gt(args.ground_truth, data1.id2idx, data2.id2idx, 'dict')
    feature1, feature2 = create_features(data1, data2, args.feature_dim, ground_truth)
    np.save(args.input_data1 + '/feats.npy', feature1)
    np.save(args.input_data2 + '/feats.npy', feature2)
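For reference, a behaviour-equivalent vectorized sketch of create_feature's random one-hot construction; the function name and the data argument are illustrative assumptions, not part of the repository:

import numpy as np

def create_feature_vectorized(data, dim):
    # Same result as the loop in create_feature: one random one-hot row per
    # node, chosen at the argmax of a uniform random draw.
    num_nodes = len(data.G.nodes())
    scores = np.random.uniform(size=(num_nodes, int(dim)))
    features = np.zeros_like(scores)
    features[np.arange(num_nodes), scores.argmax(axis=1)] = 1
    return features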
Example no. 10
0
        return self.alignment_matrix

    def get_alignment_matrix(self):
        if self.alignment_matrix is None:
            raise Exception("Must calculate alignment matrix by calling 'align()' method first")
        return self.alignment_matrix

def parse_args():
    parser = argparse.ArgumentParser(description="IsoRank")
    parser.add_argument('--prefix1', default="/home/trunght/dataspace/graph/douban/offline/graphsage/")
    parser.add_argument('--prefix2', default="/home/trunght/dataspace/graph/douban/online/graphsage/")
    parser.add_argument('--groundtruth', default=None)
    parser.add_argument('--H', default=None)
    parser.add_argument('--max_iter', default=30, type=int)
    parser.add_argument('--alpha', default=0.82, type=float)
    parser.add_argument('--tol', default=1e-4, type=float)
    parser.add_argument('--k', default=1, type=int)

    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()
    print(args)

    source_dataset = Dataset(args.prefix1)
    target_dataset = Dataset(args.prefix2)
    
    model = FINAL(source_dataset, target_dataset, None, args.alpha, args.max_iter, args.tol)
    S = model.align()

Example no. 11
0
    parser_EMGCN.add_argument('--attval', type=float,
                              default=0.25)  # what is this
    # often change
    parser_EMGCN.add_argument('--num_each_refine', type=int, default=100)

    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    print(args)
    start_time = time()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    source_dataset = Dataset(args.source_dataset, args.dataset_name)
    target_dataset = Dataset(args.target_dataset)
    groundtruth = graph_utils.load_gt(args.groundtruth, source_dataset.id2idx,
                                      target_dataset.id2idx, 'dict')

    algorithm = args.algorithm

    if algorithm == "EMGCN":
        model = EMGCN(source_dataset, target_dataset, args)
    else:
        raise Exception("Unsupported algorithm")

    S = model.align()

    for i in range(2):
        if i == 1:
Example no. 12
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Train et evaluate a dependency model."""

from input.dataset import Dataset
from model.parser_model import ParserModel

path = "/Users/alicia/Documents/01-Projets/04-Research/01-Dependency-parsing"

train_dataset = Dataset.create_dataset_from_files(path + "/data/train.feat",
                                                  path + "/data/train.text")
dev_dataset = Dataset.create_dataset_from_files(path + "/data/dev.feat",
                                                path + "/data/dev.text")
test_dataset = Dataset.create_dataset_from_files(path + "/data/test.feat",
                                                 path + "/data/test.text")

n_features = len(train_dataset.documents[0].X[0])

parser_model = ParserModel(n_features,
                           dropout_prob=0.6,
                           learning_rate=0.00001,
                           batch_size=5,
                           hidden_size=100,
                           model_folder=path + "/data/models")
dev_loss = parser_model.train(train_dataset, dev_dataset, 400)

Y = parser_model.predict(test_dataset)
for index, document in enumerate(test_dataset.documents):
    print("Accuracy: " + document.compute_accuracy(Y[index]))
Example no. 13
0
def get_val_data(files,
                 pklpath,
                 vocab,
                 glove,
                 batch_size=1,
                 num_workers=0,
                 pretrained=False,
                 pklexist=False,
                 data_parallel=True,
                 frame_trunc_length=45,
                 spatial=False):

    if pretrained:
        pixel = Pixel(files, pklpath)
        if not pklexist:
            pixel.create()
            pixel.save()
        else:
            pixel.load()

    dataset = Dataset(files)
    dataset.set_flags(mode='test',
                      data_parallel=data_parallel,
                      frame_trunc_length=frame_trunc_length,
                      pretrained=pretrained,
                      spatial=spatial)
    dataset.set_pad_indices(vocab)
    dataset.create(vocab)
    dataset.add_glove_vecs(glove)
    if pretrained: dataset.add_video_vecs(pixel)

    dataloader = data.DataLoader(dataset,
                                 batch_size=batch_size,
                                 collate_fn=dataset.collate_fn,
                                 num_workers=num_workers)
    return dataloader
Example no. 14
0
def get_train_data(files,
                   pklpath,
                   glove_file,
                   glove_embdim,
                   batch_size=1,
                   shuffle=True,
                   num_workers=0,
                   pretrained=False,
                   pklexist=False,
                   data_parallel=True,
                   frame_trunc_length=45,
                   spatial=False):

    start_time = time.time()
    vocab = Vocab(files)
    vocab.add_begend_vocab()
    vocab.create()
    vocab_time = time.time()

    glove = Glove(glove_file, glove_embdim)
    glove.create(vocab)
    glove_time = time.time()

    if pretrained:
        pixel = Pixel(files, pklpath)
        if not pklexist:
            pixel.create()
            pixel.save()
        else:
            pixel.load()
    pixel_time = time.time()

    dataset = Dataset(files)
    dataset.set_flags(mode='train',
                      data_parallel=data_parallel,
                      frame_trunc_length=frame_trunc_length,
                      pretrained=pretrained,
                      spatial=spatial)
    dataset.set_pad_indices(vocab)
    dataset.create(vocab)
    dataset.add_glove_vecs(glove)
    if pretrained: dataset.add_video_vecs(pixel)
    dataset_time = time.time()

    print('Vocab : {0}, Glove : {1}, Pixel : {2}, Dataset : {3}'.format(
        vocab_time - start_time, glove_time - vocab_time,
        pixel_time - glove_time, dataset_time - pixel_time))
    dataloader = data.DataLoader(dataset,
                                 batch_size=batch_size,
                                 collate_fn=dataset.collate_fn,
                                 shuffle=shuffle,
                                 num_workers=num_workers)
    return dataloader, vocab, glove, len(dataset)
        print(
            "Number of removed node in this iteration: {}".format(num_removed))
        print("Number of nodes left: {}".format(len(G.nodes())))
        iter += 1
    return G


def _save_graph(G, output_dir):
    with open(output_dir, "w+") as file:
        res = json_graph.node_link_data(G)
        file.write(json.dumps(res))


if __name__ == "__main__":
    args = parse_args()
    source_dataset = Dataset(args.source_dataset)
    target_dataset = Dataset(args.target_dataset)
    groundtruth = graph_utils.load_gt(args.groundtruth, source_dataset.id2idx,
                                      target_dataset.id2idx, 'dict')
    source_groundtruth_nodes = list(groundtruth.keys())
    target_groundtruth_nodes = list(groundtruth.values())
    source_idx2id = {v: k for k, v in source_dataset.id2idx.items()}
    target_idx2id = {v: k for k, v in target_dataset.id2idx.items()}
    source_gt_id = [source_idx2id[node] for node in source_groundtruth_nodes]
    target_gt_id = [target_idx2id[node] for node in target_groundtruth_nodes]
    source_care_deg = source_dataset.get_nodes_degrees()[
        source_groundtruth_nodes]
    target_care_deg = target_dataset.get_nodes_degrees()[
        target_groundtruth_nodes]

    print("Number of nodes in groundtruth: {}".format(len(groundtruth)))