def create_feature(num_feat, path, path2, groundtruth):
    source_dataset = Dataset(path + "/graphsage/")
    if path2 != "":
        # Two explicit datasets: build one-hot features for the source graph and
        # copy them to the matched target nodes given by the groundtruth.
        target_dataset = Dataset(path2 + "/graphsage/")
        groundtruth = graph_utils.load_gt(groundtruth, source_dataset.id2idx,
                                          target_dataset.id2idx, 'dict')
        source_nodes = np.array(list(groundtruth.keys()))
        target_nodes = np.array(list(groundtruth.values()))
        source_feats = create_onehot_feature(num_feat, len(source_dataset.G.nodes()))
        target_feats = np.zeros((len(target_dataset.G.nodes()), num_feat))
        target_feats[target_nodes] = source_feats[source_nodes]
        # Nodes outside the groundtruth fall back to a shared default feature.
        source_feats2 = np.zeros(source_feats.shape)
        target_feats2 = np.zeros(target_feats.shape)
        source_feats2[:, 0] = 1
        target_feats2[:, 0] = 1
        source_feats2[source_nodes] = source_feats[source_nodes]
        target_feats2[target_nodes] = target_feats[target_nodes]
        np.save(path + "/graphsage/feats.npy", source_feats2)
        np.save(path2 + "/graphsage/feats.npy", target_feats2)
        return

    # No explicit target: walk the source directory for "seed*" sub-datasets.
    print("Removing excess files")
    remove_exceed_files(path)
    print("Creating features")
    source_id2idx = source_dataset.id2idx
    if args.keep_old_feats != "":  # relies on the module-level args
        source_feats = np.load(args.keep_old_feats)
        if source_feats.shape[1] != num_feat:
            print("Number of features must match the old features")
    else:
        source_feats = create_onehot_feature(num_feat, len(source_dataset.G.nodes()))
    print("Saving source feats")
    np.save(path + "/graphsage/feats.npy", source_feats)

    tree_dir = [x[0] for x in os.walk(path)]
    print("Start searching for target dir")
    for child_dir in tree_dir:
        if "seed" in child_dir.split("/")[-1]:
            print("Working with {}".format(child_dir))
            try:
                target_dataset = Dataset(child_dir + "/graphsage/")
            except Exception as err:
                print("Error: {}".format(err))
                continue
            target_id2idx = target_dataset.id2idx
            dictionary = graph_utils.load_gt(child_dir + "/dictionaries/groundtruth",
                                             source_id2idx, target_id2idx, 'dict')
            target_feats = np.zeros((len(target_dataset.G.nodes()), num_feat))
            source_nodes = np.array(list(dictionary.keys()))
            target_nodes = np.array(list(dictionary.values()))
            target_feats[target_nodes] = source_feats[source_nodes]
            np.save(child_dir + "/graphsage/feats.npy", target_feats)
    print("DONE")
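# The helper create_onehot_feature used above is not shown in this snippet. A
# minimal sketch of what it is assumed to do, under the assumption that each
# node simply gets a random one-hot vector of length num_feat (an illustrative
# guess, not the original implementation):
def create_onehot_feature_sketch(num_feat, num_nodes):
    import numpy as np
    feats = np.zeros((num_nodes, num_feat))
    hot_dims = np.random.randint(0, num_feat, size=num_nodes)  # one hot index per node
    feats[np.arange(num_nodes), hot_dims] = 1
    return feats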
def main(args):
    source_dataset = Dataset(args.prefix1)
    target_dataset = Dataset(args.prefix2)
    model = REGAL(source_dataset, target_dataset,
                  max_layer=args.max_layer,
                  alpha=args.alpha,
                  k=args.k,
                  num_buckets=args.buckets,
                  gammastruc=args.gammastruc,
                  gammaattr=args.gammaattr,
                  normalize=True,
                  num_top=args.num_top)
    S = model.align()
def get_test_data(files, vocab, glove, batch_size=1):
    dataset = Dataset(files)
    dataset.set_pad_indices(vocab)
    dataset.create(vocab)
    dataset.add_glove_vecs(glove)
    dataloader = data.DataLoader(dataset, batch_size=batch_size,
                                 collate_fn=dataset.collate_fn)
    return dataloader
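# Hypothetical usage of get_test_data; the file list and the vocab/glove objects
# are placeholders from the surrounding project, so this is only a sketch:
#
#   test_loader = get_test_data(test_files, vocab, glove, batch_size=8)
#   for batch in test_loader:
#       ...  # the batch layout is whatever dataset.collate_fn produces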
def generate_random_clone_synthetic(self, p_new_connection, p_remove_connection,
                                    p_change_feats=None):
    print("===============")
    dataset = Dataset(self.networkx_dir)
    G = random_clone_synthetic(dataset, p_new_connection, p_remove_connection, self.seed)
    self._save_graph(G, self.output_dir1, p_change_feats)
def test_create_from_files(self):
    """Test dataset creation from files."""
    feature_file = "{}/{}".format(self.TEST_FOLDER, "train.feat")
    text_file = "{}/{}".format(self.TEST_FOLDER, "train.text")
    dataset = Dataset.create_dataset_from_files(feature_file, text_file)
    self.assertEqual(len(dataset.documents), 4)
    self.assertEqual(len(dataset.documents[0].X), len(dataset.documents[0].Y))
    self.assertEqual(len(dataset.documents[0].X), len(dataset.documents[0].text))
def get_train_data(files, glove_file, batch_size=1):
    vocab = Vocab(files)
    vocab.add_padunk_vocab()
    vocab.create()

    glove = Glove(glove_file)
    glove.create(vocab)

    dataset = Dataset(files)
    dataset.set_pad_indices(vocab)
    dataset.create(vocab)
    dataset.add_glove_vecs(glove)

    dataloader = data.DataLoader(dataset, batch_size=batch_size,
                                 collate_fn=dataset.collate_fn)
    return dataloader, vocab, glove
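# Hypothetical end-to-end wiring (names are placeholders): the vocab and glove
# built here are reused to build the evaluation loader with get_test_data above.
#
#   train_loader, vocab, glove = get_train_data(train_files, glove_path, batch_size=32)
#   test_loader = get_test_data(test_files, vocab, glove, batch_size=32)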
def create_H(args):
    source_dataset = Dataset(args.source_dataset)
    target_dataset = Dataset(args.target_dataset)

    src_degrees = np.array(source_dataset.get_nodes_degrees())
    trg_degrees = np.array(target_dataset.get_nodes_degrees())

    # Degree-based prior: entry (i, j) is the absolute degree difference between
    # target node i and source node j, scaled by the larger of the maximum source
    # degree and the degree of target node i; the matrix is then normalized to sum to one.
    H = np.zeros((len(trg_degrees), len(src_degrees)))
    for i in range(H.shape[0]):
        H[i, :] = np.abs(trg_degrees[i] - src_degrees) / max(
            [src_degrees.max(), trg_degrees[i]])
    H = H / H.sum()

    print("H shape: ", H.shape)
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    np.save(args.out_dir + "/H2.npy", H)
    print("H has been saved to ", args.out_dir)
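# Self-contained toy check of the H construction above, using made-up degree
# arrays instead of real datasets (illustrative only):
import numpy as np

src_degrees = np.array([1, 3, 5])
trg_degrees = np.array([2, 4])
H = np.zeros((len(trg_degrees), len(src_degrees)))
for i in range(H.shape[0]):
    H[i, :] = np.abs(trg_degrees[i] - src_degrees) / max([src_degrees.max(), trg_degrees[i]])
H = H / H.sum()
print(H, H.sum())  # rows index target nodes, columns index source nodes; entries sum to 1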
    source_gt_index = groundtruth.keys()
    target_gt_index = groundtruth.values()
    source_degree = get_degree_array(source_dataset, source_idx2id, source_gt_index)
    target_degree = get_degree_array(target_dataset, target_idx2id, target_gt_index)
    # Keep only the first 500 groundtruth entries for the comparison plot.
    return source_degree[:500], target_degree[:500]


if __name__ == "__main__":
    args = parse_args()
    source_dataset = Dataset(args.source_dataset)
    target_dataset = Dataset(args.target_dataset)
    source_id2idx = source_dataset.id2idx
    target_id2idx = target_dataset.id2idx
    source_idx2id = {v: k for k, v in source_id2idx.items()}
    target_idx2id = {v: k for k, v in target_id2idx.items()}
    groundtruth = graph_utils.load_gt(args.groundtruth, source_id2idx,
                                      target_id2idx, "dict", True)
    source_degree, target_degree = get_distance(source_dataset, target_dataset, groundtruth)
    data_matrix = np.array([source_degree, target_degree])
    models = ["source graph", "target graph"]
    line_chart(models, data_matrix,
    deg = data.get_nodes_degrees()
    deg = np.array(deg)
    binn = int(max(deg) / dim)
    feature = np.zeros((len(data.G.nodes()), dim))
    for i in range(len(deg)):
        deg_i = deg[i]
        node_i = list(data.G.nodes())[i]  # list() keeps this working on NetworkX 2.x
        node_i_idx = data.id2idx[node_i]
        # Bucket the degree into one of `dim` bins and one-hot encode the bin.
        feature[node_i_idx, int(deg_i / (binn + 1))] = 1
    return feature


def create_feature(data, dim):
    shape = (len(data.G.nodes()), int(dim))
    features = np.random.uniform(size=shape)
    for i, feat in enumerate(features):
        # Collapse each random row to a one-hot vector at its argmax position.
        mask = np.ones(feat.shape, dtype=bool)
        mask[feat.argmax()] = False
        feat[~mask] = 1
        feat[mask] = 0
    return features


if __name__ == "__main__":
    args = parse_args()
    data1 = Dataset(args.input_data1)
    data2 = Dataset(args.input_data2)
    ground_truth = load_gt(args.ground_truth, data1.id2idx, data2.id2idx, 'dict')
    feature1, feature2 = create_features(data1, data2, args.feature_dim, ground_truth)
    np.save(args.input_data1 + '/feats.npy', feature1)
    np.save(args.input_data2 + '/feats.npy', feature2)
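# Toy illustration of the argmax trick in create_feature above: each random row
# is collapsed in place to a one-hot vector whose hot index is the row's argmax.
import numpy as np

np.random.seed(0)
features = np.random.uniform(size=(3, 4))
for feat in features:
    mask = np.ones(feat.shape, dtype=bool)
    mask[feat.argmax()] = False
    feat[~mask] = 1
    feat[mask] = 0
print(features)  # every row now has exactly one 1 and zeros elsewhere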
        return self.alignment_matrix

    def get_alignment_matrix(self):
        if self.alignment_matrix is None:
            raise Exception("Must calculate alignment matrix by calling 'align()' first")
        return self.alignment_matrix


def parse_args():
    parser = argparse.ArgumentParser(description="IsoRank")
    parser.add_argument('--prefix1', default="/home/trunght/dataspace/graph/douban/offline/graphsage/")
    parser.add_argument('--prefix2', default="/home/trunght/dataspace/graph/douban/online/graphsage/")
    parser.add_argument('--groundtruth', default=None)
    parser.add_argument('--H', default=None)
    parser.add_argument('--max_iter', default=30, type=int)
    parser.add_argument('--alpha', default=0.82, type=float)
    parser.add_argument('--tol', default=1e-4, type=float)
    parser.add_argument('--k', default=1, type=int)
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    print(args)
    source_dataset = Dataset(args.prefix1)
    target_dataset = Dataset(args.prefix2)
    model = FINAL(source_dataset, target_dataset, None, args.alpha, args.max_iter, args.tol)
    S = model.align()
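# Hedged sketch (not part of the script above) of one common way an alignment
# matrix S is turned into matches: assuming rows index source nodes and columns
# index target nodes, a greedy argmax per row gives the predicted counterpart.
import numpy as np

S = np.array([[0.9, 0.1, 0.0],
              [0.2, 0.7, 0.1],
              [0.1, 0.2, 0.7]])            # toy source-by-target score matrix
groundtruth = {0: 0, 1: 1, 2: 2}           # toy source_idx -> target_idx mapping
pred = S.argmax(axis=1)
acc = np.mean([pred[s] == t for s, t in groundtruth.items()])
print("Accuracy@1:", acc)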
    parser_EMGCN.add_argument('--attval', type=float, default=0.25)  # TODO: document what this controls
    # Hyperparameter that is often changed between runs.
    parser_EMGCN.add_argument('--num_each_refine', type=int, default=100)
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    print(args)
    start_time = time()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    source_dataset = Dataset(args.source_dataset, args.dataset_name)
    target_dataset = Dataset(args.target_dataset)
    groundtruth = graph_utils.load_gt(args.groundtruth, source_dataset.id2idx,
                                      target_dataset.id2idx, 'dict')

    algorithm = args.algorithm
    if algorithm == "EMGCN":
        model = EMGCN(source_dataset, target_dataset, args)
    else:
        raise Exception("Unsupported algorithm")

    S = model.align()

    for i in range(2):
        if i == 1:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Train and evaluate a dependency parsing model."""

from input.dataset import Dataset
from model.parser_model import ParserModel

path = "/Users/alicia/Documents/01-Projets/04-Research/01-Dependency-parsing"

train_dataset = Dataset.create_dataset_from_files(path + "/data/train.feat",
                                                  path + "/data/train.text")
dev_dataset = Dataset.create_dataset_from_files(path + "/data/dev.feat",
                                                path + "/data/dev.text")
test_dataset = Dataset.create_dataset_from_files(path + "/data/test.feat",
                                                 path + "/data/test.text")

n_features = len(train_dataset.documents[0].X[0])
parser_model = ParserModel(n_features,
                           dropout_prob=0.6,
                           learning_rate=0.00001,
                           batch_size=5,
                           hidden_size=100,
                           model_folder=path + "/data/models")

dev_loss = parser_model.train(train_dataset, dev_dataset, 400)

Y = parser_model.predict(test_dataset)
for index, document in enumerate(test_dataset.documents):
    print("Accuracy: {}".format(document.compute_accuracy(Y[index])))
def get_val_data(files, pklpath, vocab, glove, batch_size=1, num_workers=0,
                 pretrained=False, pklexist=False, data_parallel=True,
                 frame_trunc_length=45, spatial=False):
    if pretrained:
        pixel = Pixel(files, pklpath)
        if not pklexist:
            pixel.create()
            pixel.save()
        else:
            pixel.load()

    dataset = Dataset(files)
    dataset.set_flags(mode='test', data_parallel=data_parallel,
                      frame_trunc_length=frame_trunc_length,
                      pretrained=pretrained, spatial=spatial)
    dataset.set_pad_indices(vocab)
    dataset.create(vocab)
    dataset.add_glove_vecs(glove)
    if pretrained:
        dataset.add_video_vecs(pixel)

    dataloader = data.DataLoader(dataset, batch_size=batch_size,
                                 collate_fn=dataset.collate_fn,
                                 num_workers=num_workers)
    return dataloader
def get_train_data(files, pklpath, glove_file, glove_embdim, batch_size=1,
                   shuffle=True, num_workers=0, pretrained=False, pklexist=False,
                   data_parallel=True, frame_trunc_length=45, spatial=False):
    start_time = time.time()

    vocab = Vocab(files)
    vocab.add_begend_vocab()
    vocab.create()
    vocab_time = time.time()

    glove = Glove(glove_file, glove_embdim)
    glove.create(vocab)
    glove_time = time.time()

    if pretrained:
        pixel = Pixel(files, pklpath)
        if not pklexist:
            pixel.create()
            pixel.save()
        else:
            pixel.load()
    pixel_time = time.time()

    dataset = Dataset(files)
    dataset.set_flags(mode='train', data_parallel=data_parallel,
                      frame_trunc_length=frame_trunc_length,
                      pretrained=pretrained, spatial=spatial)
    dataset.set_pad_indices(vocab)
    dataset.create(vocab)
    dataset.add_glove_vecs(glove)
    if pretrained:
        dataset.add_video_vecs(pixel)
    dataset_time = time.time()

    print('Vocab : {0}, Glove : {1}, Pixel : {2}, Dataset : {3}'.format(
        vocab_time - start_time, glove_time - vocab_time,
        pixel_time - glove_time, dataset_time - pixel_time))

    dataloader = data.DataLoader(dataset, batch_size=batch_size,
                                 collate_fn=dataset.collate_fn,
                                 shuffle=shuffle, num_workers=num_workers)
    return dataloader, vocab, glove, len(dataset)
        print("Number of removed nodes in this iteration: {}".format(num_removed))
        print("Number of nodes left: {}".format(len(G.nodes())))
        iter += 1
    return G


def _save_graph(G, output_dir):
    with open(output_dir, "w+") as file:
        res = json_graph.node_link_data(G)
        file.write(json.dumps(res))


if __name__ == "__main__":
    args = parse_args()
    source_dataset = Dataset(args.source_dataset)
    target_dataset = Dataset(args.target_dataset)
    groundtruth = graph_utils.load_gt(args.groundtruth, source_dataset.id2idx,
                                      target_dataset.id2idx, 'dict')
    source_groundtruth_nodes = list(groundtruth.keys())
    target_groundtruth_nodes = list(groundtruth.values())
    source_idx2id = {v: k for k, v in source_dataset.id2idx.items()}
    target_idx2id = {v: k for k, v in target_dataset.id2idx.items()}
    source_gt_id = [source_idx2id[node] for node in source_groundtruth_nodes]
    target_gt_id = [target_idx2id[node] for node in target_groundtruth_nodes]
    source_care_deg = source_dataset.get_nodes_degrees()[source_groundtruth_nodes]
    target_care_deg = target_dataset.get_nodes_degrees()[target_groundtruth_nodes]
    print("Number of nodes in groundtruth: {}".format(len(groundtruth)))