def get_data_and_targets(schema, neg_data, data_indices, paper, cites, content):
    '''
    Build the citation and content matrices, padded with random negative
    samples, together with the class targets.

    inputs:
        schema: schema whose entities[0] are papers and entities[1] are words
        neg_data: number of negative samples to draw per positive entry
        data_indices: currently unused (only referenced by disabled code)
        paper: array whose second row holds the class label of each paper
        cites: (2, n_cites) array of positive citation indices
        content: (2, n_content) array of positive paper-word indices
    returns:
        data: SparseMatrixData with relation 0 = cites and relation 1 = content
        train_targets: LongTensor built from paper[1]
    '''
    n_papers = schema.entities[0].n_instances
    n_words = schema.entities[1].n_instances
    train_targets = torch.LongTensor(paper[1])

    # Citations: positives carry value 1, uniformly sampled pairs carry 0.
    # Coalescing afterwards merges duplicated index pairs.
    n_cites_pos = cites.shape[1]
    n_cites_neg = int(neg_data * n_cites_pos)
    cites_neg = np.random.randint(0, n_papers, (2, n_cites_neg))
    cites_indices = np.concatenate((cites, cites_neg), axis=1)
    cites_values = torch.cat((torch.ones(n_cites_pos, 1),
                              torch.zeros(n_cites_neg, 1)))
    cites_matrix = SparseMatrix(indices=torch.LongTensor(cites_indices),
                                values=cites_values,
                                shape=(n_papers, n_papers, 1)).coalesce()

    # Content: same scheme, but the second index ranges over words.
    n_content_pos = content.shape[1]
    n_content_neg = int(neg_data * n_content_pos)
    neg_papers = np.random.randint(0, n_papers, (n_content_neg, ))
    neg_words = np.random.randint(0, n_words, (n_content_neg, ))
    content_neg = np.stack((neg_papers, neg_words))
    content_indices = np.concatenate((content, content_neg), axis=1)
    content_values = torch.cat((torch.ones(n_content_pos, 1),
                                torch.zeros(n_content_neg, 1)))
    content_matrix = SparseMatrix(indices=torch.LongTensor(content_indices),
                                  values=content_values,
                                  shape=(n_papers, n_words, 1)).coalesce()

    data = SparseMatrixData(schema)
    data[0] = cites_matrix
    data[1] = content_matrix
    return data, train_targets
def make_flat_target_matrix(full_relation, rel_ids, pos_heads, pos_tails,
                            neg_heads, neg_tails, device):
    '''
    Stack the target matrices of several relations as channels of one matrix
    that is indexed by the union of all their positive and negative links.

    inputs:
        full_relation: relation whose two entities give the matrix shape
        rel_ids: relation IDs used to index the head/tail dicts
        pos_heads, pos_tails: dicts of ID : positive head / tail indices
        neg_heads, neg_tails: dicts of ID : negative head / tail indices
        device: device the matrices are moved to
    returns:
        matrix_out: matrix with one channel per entry of rel_ids
    '''
    # Gather all index arrays first and join them once. The empty int32
    # seeds keep the result well defined when rel_ids is empty.
    head_parts = [np.array([], dtype=np.int32)]
    tail_parts = [np.array([], dtype=np.int32)]
    for rel_id in rel_ids:
        head_parts.extend((pos_heads[rel_id], neg_heads[rel_id]))
        tail_parts.extend((pos_tails[rel_id], neg_tails[rel_id]))
    full_heads = np.concatenate(head_parts)
    full_tails = np.concatenate(tail_parts)

    n_rels = len(rel_ids)
    indices = torch.LongTensor(np.vstack((full_heads, full_tails)))
    n_rows = full_relation.entities[0].n_instances
    n_cols = full_relation.entities[1].n_instances
    full_matrix = SparseMatrix(indices=indices,
                               values=torch.zeros((indices.shape[1], n_rels)),
                               shape=(n_rows, n_cols, n_rels))
    full_matrix = full_matrix.to(device).coalesce_()

    # Start with zero channels and append one channel per relation.
    matrix_out = SparseMatrix.from_other_sparse_matrix(full_matrix, 0)
    for rel_id in rel_ids:
        rel_matrix = make_target_matrix(full_relation,
                                        pos_heads[rel_id], pos_tails[rel_id],
                                        neg_heads[rel_id], neg_tails[rel_id],
                                        device)
        # Adding to a single-channel copy of the full index set spreads this
        # relation's values over the shared indices.
        template = SparseMatrix.from_other_sparse_matrix(full_matrix, 1)
        rel_matrix_full = template + rel_matrix
        matrix_out.values = torch.cat(
            [matrix_out.values, rel_matrix_full.values], 1)
        matrix_out.n_channels += 1
    return matrix_out
def combine_matrices_flat(full_relation, a_pos_heads, a_pos_tails,
                          a_neg_heads, a_neg_tails, ids, b_matrix, device):
    '''
    inputs:
        full_relation: relation whose two entities give the matrix shape
        a_pos_heads, a_neg_heads: dicts of ID : head indices (positive / negative)
        a_pos_tails, a_neg_tails: dicts of ID : tail indices (positive / negative)
        ids: IDs with which to access the indices of A
        b_matrix: a matrix whose indices we want to include in output
        device: device the intermediate matrices are moved to
    returns:
        out_matrix: matrix with indices & values of A as well as indices of B,
            one channel per entry of ids
        masks: a dict of id : 1-D tensor of positions in out_matrix that
            belong to that relation of A
    '''
    # Gather all index arrays first and join them once. The empty int32
    # seeds keep the result well defined when ids is empty.
    head_parts = [np.array([], dtype=np.int32)]
    tail_parts = [np.array([], dtype=np.int32)]
    for rel_id in ids:
        head_parts.extend((a_pos_heads[rel_id], a_neg_heads[rel_id]))
        tail_parts.extend((a_pos_tails[rel_id], a_neg_tails[rel_id]))
    full_heads = np.concatenate(head_parts)
    full_tails = np.concatenate(tail_parts)

    indices = torch.LongTensor(np.vstack((full_heads, full_tails)))
    values = torch.zeros((indices.shape[1], 1))
    shape = (full_relation.entities[0].n_instances,
             full_relation.entities[1].n_instances, 1)
    full_a_matrix = SparseMatrix(indices=indices, values=values, shape=shape)
    full_a_matrix = full_a_matrix.to(device).coalesce_()

    # Channel 0 of the index matrix marks the entries of B with a 1.
    b_idx_matrix = SparseMatrix.from_other_sparse_matrix(b_matrix, 1)
    b_idx_matrix.values += 1
    out_idx_matrix = b_idx_matrix + full_a_matrix

    out_matrix = SparseMatrix.from_other_sparse_matrix(out_idx_matrix, 0)
    for rel_id in ids:
        rel_matrix = make_target_matrix(full_relation,
                                        a_pos_heads[rel_id],
                                        a_pos_tails[rel_id],
                                        a_neg_heads[rel_id],
                                        a_neg_tails[rel_id], device)

        # Value channel: this relation's targets spread over all indices.
        rel_full_matrix = SparseMatrix.from_other_sparse_matrix(
            out_idx_matrix, 1) + rel_matrix
        out_matrix.values = torch.cat(
            [out_matrix.values, rel_full_matrix.values], 1)
        out_matrix.n_channels += 1

        # Indicator channel: 1 wherever this relation has an entry.
        rel_idx_matrix = SparseMatrix.from_other_sparse_matrix(rel_matrix, 1)
        rel_idx_matrix.values += 1
        rel_idx_full_matrix = SparseMatrix.from_other_sparse_matrix(
            out_idx_matrix, 1) + rel_idx_matrix
        out_idx_matrix.values = torch.cat(
            [out_idx_matrix.values, rel_idx_full_matrix.values], 1)
        out_idx_matrix.n_channels += 1

    masks = {}
    for channel_i, rel_id in enumerate(ids):
        # Channel 0 belongs to B, so relation i lives in channel i + 1.
        # squeeze(1) drops only the column axis of the (k, 1) nonzero() result;
        # a bare squeeze() would turn a single match into a 0-d tensor.
        masks[rel_id] = out_idx_matrix.values[:, channel_i + 1] \
            .nonzero().squeeze(1)
    return out_matrix, masks
def make_target_matrix(relation, pos_head, pos_tail, neg_head, neg_tail,
                       device):
    '''
    Build a single-channel target matrix holding 1 at the positive links and
    0 at the negative links.

    inputs:
        relation: relation whose two entities give the matrix shape
        pos_head, pos_tail: index arrays of the positive links
        neg_head, neg_tail: index arrays of the negative links
        device: device the matrix is moved to before coalescing
    returns:
        data_target: coalesced SparseMatrix on the given device
    '''
    n_pos = pos_head.shape[0]
    n_neg = neg_head.shape[0]

    # Positives come first, negatives second, in both indices and values.
    link_indices = np.concatenate((np.vstack((pos_head, pos_tail)),
                                   np.vstack((neg_head, neg_tail))), 1)
    link_values = np.concatenate((np.ones((n_pos, 1)),
                                  np.zeros((n_neg, 1))), 0)

    n_rows = relation.entities[0].n_instances
    n_cols = relation.entities[1].n_instances
    data_target = SparseMatrix(indices=torch.LongTensor(link_indices),
                               values=torch.FloatTensor(link_values),
                               shape=(n_rows, n_cols, 1))
    return data_target.to(device).coalesce_()
def binary_relation_to_matrix(self, relation, typedict, raw_vals,
                              ent_id_to_idx_dict, ent_n_id_str, ent_m_id_str):
    '''
    Convert the raw columns of a non-set relation into a SparseMatrix.

    inputs:
        relation: relation between two entities; must not be a set relation
        typedict: dict of column name : type, where type is one of
            'id', 'ordinal', 'categorical' or 'binary'
        raw_vals: dict of column name : raw column values
        ent_id_to_idx_dict: dict of entity name : (dict of raw ID : index)
        ent_n_id_str: column holding the IDs of the first entity
        ent_m_id_str: column holding the IDs of the second entity
    returns:
        SparseMatrix with one row of values per entry of the ID columns. If
        there are no non-ID columns, a single channel of ones is used.
    raises:
        ValueError: if typedict contains an unrecognised type
    '''
    assert not relation.is_set
    ent_n, ent_m = relation.entities[0], relation.entities[1]
    ent_n_name = entity_names[ent_n.id]
    ent_m_name = entity_names[ent_m.id]
    instances_n = ent_n.n_instances
    instances_m = ent_m.n_instances

    # Converters are looked up by name so that only those actually needed
    # have to exist on self.
    converter_names = {
        'ordinal': 'ordinal_to_tensor',
        'categorical': 'categorical_to_tensor',
        'binary': 'binary_to_tensor',
    }
    tensor_list = []
    for key, val in typedict.items():
        if val == 'id':
            continue
        if val not in converter_names:
            # Previously an unknown type silently reused the converter chosen
            # for the preceding column (or hit an unbound local).
            raise ValueError(
                'Unknown attribute type {!r} for key {!r}'.format(val, key))
        func = getattr(self, converter_names[val])
        tensor_list.append(func(raw_vals[key]))

    n_ids = raw_vals[ent_n_id_str]
    m_ids = raw_vals[ent_m_id_str]
    if tensor_list:
        values = torch.cat(tensor_list, 1)
    else:
        values = torch.ones(len(n_ids), 1)

    # Translate raw entity IDs to contiguous matrix indices.
    indices_n = torch.LongTensor(
        [ent_id_to_idx_dict[ent_n_name][ent_i] for ent_i in n_ids])
    indices_m = torch.LongTensor(
        [ent_id_to_idx_dict[ent_m_name][ent_i] for ent_i in m_ids])
    return SparseMatrix(indices=torch.stack((indices_n, indices_m)),
                        values=values,
                        shape=(instances_n, instances_m, values.shape[1]))
def set_relation_to_matrix(self, relation, typedict, raw_vals):
    '''
    Convert the raw columns of a set relation (an entity related to itself)
    into a SparseMatrix whose entries lie on the diagonal.

    inputs:
        relation: set relation whose two entities are equal
        typedict: dict of column name : type, where type is one of
            'id', 'ordinal', 'categorical' or 'binary'
        raw_vals: dict of column name : raw column values
    returns:
        SparseMatrix with one diagonal entry per instance. The column named
        self.TARGET_KEY and all 'id' columns are left out of the values. If
        no columns remain, a single channel of ones is used.
    raises:
        ValueError: if typedict contains an unrecognised type
    '''
    assert relation.entities[0] == relation.entities[1]
    assert relation.is_set
    n_instances = relation.entities[0].n_instances

    # Converters are looked up by name so that only those actually needed
    # have to exist on self.
    converter_names = {
        'ordinal': 'ordinal_to_tensor',
        'categorical': 'categorical_to_tensor',
        'binary': 'binary_to_tensor',
    }
    tensor_list = []
    for key, val in typedict.items():
        if key == self.TARGET_KEY or val == 'id':
            continue
        if val not in converter_names:
            # Previously an unknown type silently reused the converter chosen
            # for the preceding column (or hit an unbound local).
            raise ValueError(
                'Unknown attribute type {!r} for key {!r}'.format(val, key))
        func = getattr(self, converter_names[val])
        tensor_list.append(func(raw_vals[key]))

    if tensor_list:
        values = torch.cat(tensor_list, 1)
    else:
        values = torch.ones(n_instances, 1)

    # Row index == column index for every instance.
    indices = torch.arange(n_instances).repeat(2, 1)
    return SparseMatrix(indices=indices,
                        values=values,
                        shape=(n_instances, n_instances, values.shape[1]))
def generate_target_matrix(true_matrix, n_samples, pos_rate, device):
    '''
    Generate a target matrix with n_samples indices, of which a proportion
    pos_rate are true positives sampled from true_matrix and the remaining
    1 - pos_rate are randomly generated links with value 0.

    inputs:
        true_matrix: a matrix containing all true positive links
        n_samples: total number of links requested
        pos_rate: proportion of the requested links that are true positives
        device: device on which the result is built
    returns:
        coalesced single-channel SparseMatrix

    If true_matrix holds fewer links than the requested number of positives,
    all of them are used and the result has fewer than n_samples entries.
    The number of negatives is not increased to compensate.

    Note that the randomly generated links have a ~99.99% chance of being
    negative, but some may coincide with true links
    (1e-4 sparsity for each relation).
    '''
    n_n = true_matrix.n
    n_m = true_matrix.m
    n_channels = 1

    n_pos_requested = int(pos_rate * n_samples)
    n_neg_samples = n_samples - n_pos_requested

    perm = torch.randperm(true_matrix.nnz())
    pos_sample_idx = perm[:n_pos_requested]
    # Move the positives to the target device so they can be concatenated
    # with the negatives, which are created there.
    pos_indices = true_matrix.indices[:, pos_sample_idx].to(device)
    # Count what was actually sampled: the slice above is shorter than
    # requested when true_matrix has too few links.
    n_pos_samples = pos_indices.shape[1]
    pos_values = torch.ones(n_pos_samples).to(device)

    neg_indices_n = torch.randint(0, n_n, [n_neg_samples]).to(device)
    neg_indices_m = torch.randint(0, n_m, [n_neg_samples]).to(device)
    neg_indices = torch.stack((neg_indices_n, neg_indices_m))
    neg_values = torch.zeros(n_neg_samples).to(device)

    return SparseMatrix(indices=torch.cat((pos_indices, neg_indices), 1),
                        values=torch.cat((pos_values, neg_values),
                                         0).unsqueeze(1),
                        shape=(n_n, n_m, n_channels)).coalesce()
def setUp(self):
    '''
    Create the 4x4 sparse fixtures X, Y (one channel) and X2, Y2 (two
    channels), plus the pooled vectors used by the broadcast tests.
    '''
    # X:
    #   1ooo
    #   2o3o
    #   oooo
    #   4oo5
    x_values = torch.arange(1, 6, dtype=torch.float32).view(5, 1)
    x_indices = torch.LongTensor([[0, 0], [1, 0], [1, 2], [3, 0], [3, 3]]).T
    self.X = SparseMatrix(x_indices, x_values, (4, 4, 1))

    # Y:
    #   o1o2
    #   oo3o
    #   oo4o
    #   5ooo
    y_values = torch.arange(1, 6, dtype=torch.float32).view(5, 1)
    y_indices = torch.LongTensor([[0, 1], [0, 3], [1, 2], [2, 2], [3, 0]]).T
    self.Y = SparseMatrix(y_indices, y_values, (4, 4, 1))

    self.pooled = torch.arange(1, 5, dtype=torch.float32).view(4, 1)

    # Two-channeled versions (X stands for the value 10).
    # X2:
    #   1ooo  6ooo
    #   2o3o  7o8o
    #   oooo  oooo
    #   4oo5  9ooX
    x2_values = torch.arange(1, 11, dtype=torch.float32).view(2, 5).T
    x2_indices = torch.LongTensor([[0, 0], [1, 0], [1, 2], [3, 0], [3, 3]]).T
    self.X2 = SparseMatrix(x2_indices, x2_values, (4, 4, 2))

    # Y2:
    #   o1o2  o6o7
    #   oo3o  oo8o
    #   oo4o  oo9o
    #   5ooo  Xooo
    y2_values = torch.arange(1, 11, dtype=torch.float32).view(2, 5).T
    y2_indices = torch.LongTensor([[0, 1], [0, 3], [1, 2], [2, 2], [3, 0]]).T
    self.Y2 = SparseMatrix(y2_indices, y2_values, (4, 4, 2))

    self.pooled2 = torch.arange(1, 9, dtype=torch.float32).view(2, 4).T
def to_sparse_matrix(self):
    '''
    Return a SparseMatrixData in which every dense relation tensor has been
    converted with SparseMatrix.from_dense_tensor.
    '''
    sparse = {
        rel_id: SparseMatrix.from_dense_tensor(self.rel_tensors[rel_id])
        for rel_id in self.schema.relations
    }
    return SparseMatrixData(self.schema, sparse, batch_size=self.batch_size)
def forward(self, matrix):
    '''
    Apply self.linear to the values of the matrix and return a new
    SparseMatrix with the same indices and the resulting channel count.
    '''
    transformed = self.linear(matrix.values)
    n_channels_out = transformed.shape[1]
    return SparseMatrix(indices=matrix.indices,
                        values=transformed,
                        shape=(matrix.n, matrix.m, n_channels_out),
                        indices_diag=matrix.indices_diag,
                        is_set=matrix.is_set)
def test_broadcast_col(self):
    '''Broadcasting self.pooled with "col" onto the sparsity pattern of X.'''
    template = SparseMatrix.from_other_sparse_matrix(self.X, n_channels=1)
    out = template.broadcast(self.pooled, "col")
    # Expected result:
    #   1ooo
    #   2o2o
    #   oooo
    #   4oo4
    expected = np.array([[1, 2, 2, 4, 4]]).T
    self.assertSameValues(out.values, expected)
def test_broadcast_row(self):
    '''Broadcasting self.pooled with "row" onto the sparsity pattern of X.'''
    template = SparseMatrix.from_other_sparse_matrix(self.X, n_channels=1)
    out = template.broadcast(self.pooled, "row")
    # Expected result:
    #   1ooo
    #   1o3o
    #   oooo
    #   1oo4
    expected = np.array([[1, 1, 3, 1, 4]]).T
    self.assertSameValues(out.values, expected)
def test_broadcast_all(self):
    '''Broadcasting one value with "all" fills every stored entry of X.'''
    template = SparseMatrix.from_other_sparse_matrix(self.X, n_channels=1)
    out = template.broadcast(torch.Tensor([5.]), "all")
    # Expected result:
    #   5ooo
    #   5o5o
    #   oooo
    #   5oo5
    expected = np.array([[5, 5, 5, 5, 5]]).T
    self.assertSameValues(out.values, expected)
def test_broadcast_diag(self):
    '''Broadcasting with "diag" fills only the diagonal entries of X.'''
    template = SparseMatrix.from_other_sparse_matrix(self.X, n_channels=1)
    out = template.broadcast(torch.Tensor([5., 2]), "diag")
    # Expected first channel (the second has 2 in place of 5):
    #   5ooo
    #   0o0o
    #   oooo
    #   0oo5
    expected = np.array([[5, 0, 0, 0, 5], [2, 0, 0, 0, 2]]).T
    self.assertSameValues(out.values, expected)
def forward(self, X_in, X_out, indices_identity, indices_trans):
    '''
    Apply every (input op, output op) pair with its weight to X_in and sum
    the results into a tensor shaped like X_out.

    X_in: Source sparse tensor
    X_out: Corresponding sparse tensor for target relation
    indices_identity: pair of masks; [0] selects the entries of X_in and [1]
        the entries of X_out used by the identity operation
    indices_trans: same as indices_identity, for the transpose operation
    returns: Y, the sum over all operations
    '''
    self.logger.info("n_params: {}".format(self.n_params))
    if isinstance(X_out, SparseMatrix):
        # Start from a zero matrix with the index set of X_out
        Y = SparseMatrix.from_other_sparse_matrix(X_out, self.out_dim)
    else:
        Y = X_out.clone()
    #TODO: can add a cache for input operations here
    for i in range(self.n_params):
        op_inp, op_out = self.all_ops[i]
        weight = self.weights[i]
        device = weight.device
        if op_inp is None:
            # No input operation: multiply X_in directly
            X_mul = torch.matmul(X_in, weight)
            X_op_out = self.output_op(op_out, X_out, X_mul, device)
        elif op_out is None:
            # No output operation: the product is used as is
            X_op_inp = self.input_op(op_inp, X_in, device)
            X_mul = torch.matmul(X_op_inp, weight)
            X_op_out = X_mul
        elif op_out[0] == "i":
            # Identity
            X_intersection_vals = X_in.gather_mask(indices_identity[0])
            X_mul = X_intersection_vals @ weight
            X_op_out = X_out.broadcast_from_mask(X_mul, indices_identity[1],
                                                 device)
        elif op_out[0] == "t":
            # Transpose
            X_T_intersection_vals = X_in.gather_transpose(indices_trans[0])
            X_mul = X_T_intersection_vals @ weight
            X_op_out = X_out.broadcast_from_mask(X_mul, indices_trans[1],
                                                 device)
        else:
            # Pool or Gather or Do Nothing
            X_op_inp = self.input_op(op_inp, X_in, device)
            # Multiply values by weight
            X_mul = torch.matmul(X_op_inp, weight)
            # Broadcast or Embed Diag or Transpose
            X_op_out = self.output_op(op_out, X_out, X_mul, device)
        #assert X_op_out.nnz() == X_out.nnz()
        #assert Y.nnz() == X_out.nnz(), "Y: {}, X_out: {}".format(Y.nnz(), X_out.nnz())
        #assert Y.nnz() == X_op_out.nnz(), "Y: {}, X_op_out: {}".format(Y.nnz(), X_op_out.nnz())
        Y = Y + X_op_out
    return Y
def make_entity_embeddings(cls, entities, embedding_dim):
    '''
    Create an instance holding one zero-initialised diagonal embedding matrix
    per entity, each with embedding_dim channels, under a schema that has one
    set relation per entity.
    '''
    data = {}
    relations = {}
    for ent in entities:
        n_ent = ent.n_instances
        # One diagonal entry per instance: row index == column index.
        diag_indices = torch.arange(n_ent, dtype=torch.int64).repeat(2, 1)
        zero_values = torch.zeros([n_ent, embedding_dim])
        data[ent.id] = SparseMatrix(indices=diag_indices,
                                    values=zero_values,
                                    shape=(n_ent, n_ent, embedding_dim),
                                    is_set=True)
        relations[ent.id] = Relation(ent.id, [ent, ent], is_set=True)
    return cls(DataSchema(entities, relations), data)
def select_features(data, schema, feats_type, target_ent):
    '''
    Select which node attributes are kept in data.

    feats_type 0 keeps all node attributes. feats_type 1 replaces the
    attributes of every entity other than target_ent with a 10-channel matrix
    built by SparseMatrix.from_other_sparse_matrix. Any other value leaves
    data untouched.

    returns:
        data: the (possibly modified) input data
        in_dims: dict of relation ID : number of channels

    TODO: feats_type 2 and 3 (one-hot attributes) are not implemented.
    '''
    # Attribute relations are stored after the ordinary relations, at
    # num_relations + entity id.
    num_relations = len(schema.relations) - len(schema.entities)

    if feats_type == 1:
        # 10 dimensions for some reason
        n_dim = 10
        for ent_i in schema.entities:
            if ent_i.id == target_ent:
                continue
            rel_id = num_relations + ent_i.id
            data[rel_id] = SparseMatrix.from_other_sparse_matrix(
                data[rel_id], n_dim)

    in_dims = {rel_id: data[rel_id].n_channels
               for rel_id in schema.relations}
    return data, in_dims
# Generate a small synthetic citation dataset: random paper classes, random
# citation links and random paper-word links.
set_seed(args.seed)
n_papers = 200
n_words = 300
n_classes = 4

# Each sampled link is kept as a positive (value 1) with probability
# 1 / (1 + neg_data), and is otherwise a negative (value 0).
value_dist = torch.distributions.bernoulli.Bernoulli(
    probs=1. / (1. + args.neg_data))

# Row 0: paper index, row 1: randomly assigned class.
paper = np.stack(
    [np.arange(n_papers),
     np.random.randint(0, n_classes, n_papers)])

n_cites = 2 * int(n_papers * 3)
cites = np.unique(np.random.randint(0, n_papers, (2, n_cites)), axis=1)
cites_matrix = SparseMatrix(indices=torch.LongTensor(cites),
                            values=value_dist.sample((cites.shape[1], 1)),
                            shape=(n_papers, n_papers, 1)).coalesce()

n_content = 2 * int(0.2 * n_papers * n_words)
content = np.stack([
    np.random.randint(0, n_papers, (n_content)),
    np.random.randint(0, n_words, (n_content))
])
# Deduplicate the sampled paper-word pairs. This used to deduplicate `cites`
# instead, which threw away the content sample and reused citation indices.
content = np.unique(content, axis=1)
content_matrix = SparseMatrix(indices=torch.LongTensor(content),
                              values=value_dist.sample(
                                  (content.shape[1], 1)),
                              shape=(n_papers, n_words, 1)).coalesce()

ent_papers = Entity(0, n_papers)
#ent_classes = Entity(1, n_classes)
def make_target_matrix_test(relation, left, right, labels, device):
    '''
    Build a single-channel target matrix on the given device, with entries at
    (left, right) whose values are the given labels. The matrix is not
    coalesced.
    '''
    n_rows = relation.entities[0].n_instances
    n_cols = relation.entities[1].n_instances
    link_indices = torch.LongTensor(np.vstack((left, right)))
    link_values = torch.FloatTensor(labels).unsqueeze(1)
    target = SparseMatrix(indices=link_indices,
                          values=link_values,
                          shape=(n_rows, n_cols, 1))
    return target.to(device)
# Schema: three movie-to-X relations plus a set relation that holds the
# movie features.
rel_movie_actor = Relation(0, [ent_movie, ent_actor])
rel_movie_director = Relation(1, [ent_movie, ent_director])
rel_movie_keyword = Relation(2, [ent_movie, ent_keyword])
rel_movie_feature = Relation(3, [ent_movie, ent_movie], is_set=True)
relations = [
    rel_movie_actor,
    rel_movie_director,
    rel_movie_keyword,
    rel_movie_feature,
]
schema = DataSchema(entities, relations)
# The output schema has a single set relation over movies.
schema_out = DataSchema([ent_movie],
                        [Relation(0, [ent_movie, ent_movie], is_set=True)])

data = SparseMatrixData(schema)
for rel_i, rel_name in enumerate(relation_names):
    raw_rel = raw_data[rel_name]
    if rel_name != 'movie_feature':
        data[rel_i] = SparseMatrix.from_scipy_sparse(raw_rel)
        continue
    # Movie features are preprocessed and embedded on the diagonal.
    values = preprocess_features(raw_rel)
    data[rel_i] = SparseMatrix.from_embed_diag(values)
data = data.to(device)

indices_identity, indices_transpose = data.calculate_indices()
input_channels = {rel.id: data[rel.id].n_channels for rel in relations}
data_target = Data(schema_out)
n_movies = ent_movie.n_instances

# The label is the second comma-separated field of every line.
with open(data_file_dir + 'index_label.txt', 'r') as label_file:
    labels = [int(line.rstrip().split(',')[1]) for line in label_file]
# Shift the labels so that the smallest one becomes 0.
labels = torch.LongTensor(labels).to(device) - min(labels)
schema = dataloader.schema data = dataloader.data.to(device) indices_identity, indices_transpose = data.calculate_indices() embedding_entity = schema.entities[TARGET_NODE_TYPE] input_channels = { rel.id: data[rel.id].n_channels for rel in schema.relations } embedding_schema = DataSchema( schema.entities, Relation(0, [embedding_entity, embedding_entity], is_set=True)) n_instances = embedding_entity.n_instances data_embedding = SparseMatrixData(embedding_schema) data_embedding[0] = SparseMatrix( indices=torch.arange(n_instances, dtype=torch.int64).repeat(2, 1), values=torch.zeros([n_instances, args.embedding_dim]), shape=(n_instances, n_instances, args.embedding_dim), is_set=True) data_embedding.to(device) target_schema = DataSchema(schema.entities, schema.relations[TARGET_REL_ID]) target_node_idx_to_id = dataloader.target_node_idx_to_id #%% net = SparseMatrixAutoEncoder(schema, input_channels, layers=args.layers, embedding_dim=args.embedding_dim, embedding_entities=[embedding_entity], activation=eval('nn.%s()' % args.act_fn), final_activation=nn.Sigmoid(), dropout=args.dropout_rate,
def __init__(self, use_node_attrs=True):
    '''
    Load the schema and data from NODE_FILE_STR and LINK_FILE_STR.

    use_node_attrs: if True, every entity gets an extra set relation with ID
        10 + entity ID that holds its node attributes on the diagonal.

    Sets self.schema, self.node_id_to_idx (dict of node type : dict of raw
    node ID : index), self.target_node_idx_to_id and self.data.
    '''
    entities = [
        Entity(entity_id, n_instances)
        for entity_id, n_instances in ENTITY_N_INSTANCES.items()
    ]
    relations = [
        Relation(rel_id, [entities[entity_i], entities[entity_j]])
        for rel_id, (entity_i, entity_j) in RELATION_IDX.items()
    ]
    if use_node_attrs:
        for entity_id in ENTITY_N_INSTANCES.keys():
            rel = Relation(10 + entity_id,
                           [entities[entity_id], entities[entity_id]],
                           is_set=True)
            relations.append(rel)
    self.schema = DataSchema(entities, relations)

    # Read the node file once; both passes below work on these lines.
    with open(NODE_FILE_STR, 'r') as node_file:
        node_lines = node_file.readlines()

    # First pass: give every node a contiguous index within its type, in
    # file order.
    self.node_id_to_idx = {ent_i: {} for ent_i in range(len(entities))}
    node_counter = {ent_i: 0 for ent_i in range(len(entities))}
    for line in node_lines:
        node_id, node_name, node_type, values = line.rstrip().split('\t')
        node_id = int(node_id)
        node_type = int(node_type)
        node_idx = node_counter[node_type]
        self.node_id_to_idx[node_type][node_id] = node_idx
        node_counter[node_type] += 1

    target_node_id_to_idx = self.node_id_to_idx[TARGET_NODE_TYPE]
    self.target_node_idx_to_id = {
        idx: raw_id
        for raw_id, idx in target_node_id_to_idx.items()
    }

    # Key the raw data by the actual relation IDs. Attribute relations use
    # 10 + entity ID, which range(len(relations)) only covers when there are
    # exactly 10 ordinary relations.
    raw_data_indices = {rel.id: [] for rel in relations}
    raw_data_values = {rel.id: [] for rel in relations}

    if use_node_attrs:
        # Second pass: node attributes go on the diagonal of the entity's
        # attribute relation.
        for line in node_lines:
            node_id, node_name, node_type, values = line.rstrip().split(
                '\t')
            node_type = int(node_type)
            node_id = self.node_id_to_idx[node_type][int(node_id)]
            values = list(map(float, values.split(',')))
            raw_data_indices[10 + node_type].append([node_id, node_id])
            raw_data_values[10 + node_type].append(values)

    with open(LINK_FILE_STR, 'r') as link_file:
        for line in link_file:
            node_i, node_j, rel_num, val = line.rstrip().split('\t')
            rel_num = int(rel_num)
            node_i_type, node_j_type = RELATION_IDX[rel_num]
            node_i = self.node_id_to_idx[node_i_type][int(node_i)]
            node_j = self.node_id_to_idx[node_j_type][int(node_j)]
            val = float(val)
            raw_data_indices[rel_num].append([node_i, node_j])
            raw_data_values[rel_num].append([val])

    self.data = SparseMatrixData(self.schema)
    for rel in relations:
        # pop() releases the raw lists as soon as they have been converted.
        indices = torch.LongTensor(raw_data_indices.pop(rel.id)).T
        values = torch.Tensor(raw_data_values.pop(rel.id))
        n = rel.entities[0].n_instances
        m = rel.entities[1].n_instances
        n_channels = values.shape[1]
        data_matrix = SparseMatrix(indices=indices,
                                   values=values,
                                   shape=np.array([n, m, n_channels]),
                                   is_set=rel.is_set)
        self.data[rel.id] = data_matrix
train_start = int(args.val_pct * (n_targets / 100.)) val_indices_idx = shuffled_indices_idx[val_start:train_start] val_indices = target_indices[val_indices_idx] train_indices_idx = shuffled_indices_idx[train_start:] train_indices = target_indices[train_indices_idx] #%% train_targets = targets[train_indices_idx] val_targets = targets[val_indices_idx] n_output_classes = len(targets.unique()) data_target[0] = SparseMatrix( indices=torch.arange(n_outputs, dtype=torch.int64).repeat(2, 1), values=torch.zeros([n_outputs, n_output_classes]), shape=(n_outputs, n_outputs, n_output_classes), is_set=True).to(device) #%% net = SparseMatrixEntityPredictor(schema, input_channels, layers=args.layers, fc_layers=args.fc_layers, activation=eval('nn.%s()' % args.act_fn), final_activation=nn.Identity(), target_entities=schema_out.entities, dropout=args.dropout_rate, output_dim=n_output_classes, norm=args.norm, pool_op=args.pool_op,
def __init__(self):
    '''
    Load the person / course CSV files, build the schema with two set
    relations (person, course) and two binary relations (advisedBy,
    taughtBy), and prepare the targets and the output matrix over persons.
    '''
    # One list per column, for every relation in the schema description.
    data_raw = {
        rel_name: {key: [] for key in columns}
        for rel_name, columns in schema_dict.items()
    }

    for relation_name in relation_names:
        with open(csv_file_str.format(relation_name)) as file:
            keys = schema_dict[relation_name].keys()
            columns = data_raw[relation_name]
            for cols in csv.reader(file):
                for key, col in zip(keys, cols):
                    columns[key].append(col)

    person_ids = data_raw['person']['p_id']
    course_ids = data_raw['course']['course_id']
    ent_person = Entity(0, len(person_ids))
    ent_course = Entity(1, len(course_ids))
    entities = [ent_person, ent_course]

    relations = [
        Relation(0, [ent_person, ent_person], is_set=True),  # person
        Relation(1, [ent_course, ent_course], is_set=True),  # course
        Relation(2, [ent_person, ent_person]),  # advisedBy
        Relation(3, [ent_course, ent_person]),  # taughtBy
    ]

    self.schema = DataSchema(entities, relations)
    self.data = SparseMatrixData(self.schema)

    ent_id_to_idx_dict = {
        'person': self.id_to_idx(person_ids),
        'course': self.id_to_idx(course_ids)
    }

    for relation in relations:
        relation_name = relation_names[relation.id]
        print(relation_name)
        rel_types = schema_dict[relation_name]
        rel_raw = data_raw[relation_name]
        if relation.is_set:
            data_matrix = self.set_relation_to_matrix(
                relation, rel_types, rel_raw)
        else:
            # Names of the columns that hold the IDs of both entities.
            if relation_name == 'advisedBy':
                ent_n_id_str = 'p_id'
                ent_m_id_str = 'p_id_dummy'
            elif relation_name == 'taughtBy':
                ent_n_id_str = 'course_id'
                ent_m_id_str = 'p_id'
            data_matrix = self.binary_relation_to_matrix(
                relation, rel_types, rel_raw, ent_id_to_idx_dict,
                ent_n_id_str, ent_m_id_str)
        self.data[relation.id] = data_matrix

    raw_targets = data_raw[self.TARGET_RELATION][self.TARGET_KEY]
    self.target = self.get_targets(
        raw_targets, schema_dict[self.TARGET_RELATION][self.TARGET_KEY])

    # The output is a set relation over persons with one channel per class.
    self.target_rel_id = 0
    rel_out = Relation(self.target_rel_id, [ent_person, ent_person],
                       is_set=True)
    self.schema_out = DataSchema([ent_person], [rel_out])
    self.data_target = Data(self.schema_out)

    n_output_classes = len(
        np.unique(data_raw[self.TARGET_RELATION][self.TARGET_KEY]))
    self.output_dim = n_output_classes

    n_person = ent_person.n_instances
    diag_indices = torch.arange(n_person, dtype=torch.int64).repeat(2, 1)
    self.data_target[self.target_rel_id] = SparseMatrix(
        indices=diag_indices,
        values=torch.zeros([n_person, n_output_classes]),
        shape=(n_person, n_person, n_output_classes))
def load_data_flat(prefix, use_node_attrs=True, use_edge_data=True,
                   node_val='one'):
    '''
    Load data into one matrix with all relations, reproducing Maron 2019.

    The first [# relation types] channels are adjacency matrices, while the
    next [sum of feature dimensions per entity type] channels have node
    attributes on the relevant segment of their diagonals if
    use_node_attrs=True. For entities without node features, node_val
    ('zero', 'rand', anything else = ones) selects a one-channel filler.

    returns: schema, data, dl
    '''
    dl = data_loader(DATA_FILE_DIR + prefix)
    total_n_nodes = dl.nodes['total']
    entities = [Entity(0, total_n_nodes)]
    relations = {0: Relation(0, [entities[0], entities[0]])}
    schema = DataSchema(entities, relations)

    # Index template: the union of all links plus the full diagonal, zeroed.
    data_full = sum(dl.links['data'].values()).tocoo()
    node_range = np.arange(total_n_nodes)
    data_diag = scipy.sparse.coo_matrix(
        (np.ones(total_n_nodes), (node_range, np.arange(total_n_nodes))),
        (total_n_nodes, total_n_nodes))
    data_full += data_diag
    data_full = SparseMatrix.from_scipy_sparse(data_full.tocoo()).zero_()
    data_out = SparseMatrix.from_other_sparse_matrix(data_full, 0)

    # One channel per relation, in order of relation ID.
    for rel_id in sorted(dl.links['data'].keys()):
        data_rel = SparseMatrix.from_scipy_sparse(
            dl.links['data'][rel_id].tocoo())
        if not use_edge_data:
            # Use only adjacency information
            data_rel.values = torch.ones(data_rel.values.shape)
        template = SparseMatrix.from_other_sparse_matrix(data_full, 1)
        data_rel_full = template + data_rel
        data_out.values = torch.cat(
            [data_out.values, data_rel_full.values], 1)
        data_out.n_channels += 1

    attr_items = dl.nodes['attr'].items() if use_node_attrs else ()
    for ent_id, attr_matrix in attr_items:
        start_i = dl.nodes['shift'][ent_id]
        n_instances = dl.nodes['count'][ent_id]
        if attr_matrix is None:
            # No features for this entity: fall back to the filler.
            if node_val == 'zero':
                attr_matrix = np.zeros((n_instances, 1))
            elif node_val == 'rand':
                attr_matrix = np.random.randn(n_instances, 1)
            else:
                attr_matrix = np.ones((n_instances, 1))
        n_channels = attr_matrix.shape[1]
        # Diagonal segment that belongs to this entity type.
        indices = torch.arange(
            start_i, start_i + n_instances).unsqueeze(0).repeat(2, 1)
        data_rel = SparseMatrix(
            indices=indices,
            values=torch.FloatTensor(attr_matrix),
            shape=np.array([total_n_nodes, total_n_nodes, n_channels]),
            is_set=True)
        template = SparseMatrix.from_other_sparse_matrix(
            data_full, n_channels)
        data_rel_full = template + data_rel
        data_out.values = torch.cat(
            [data_out.values, data_rel_full.values], 1)
        data_out.n_channels += n_channels

    data = SparseMatrixData(schema)
    data[0] = data_out
    return schema, data, dl
def load_data(prefix, use_node_attrs=True, use_edge_data=True,
              use_other_edges=True, node_val='one'):
    '''
    Load the dataset at DATA_FILE_DIR + prefix into one SparseMatrix per
    relation.

    use_node_attrs: add one set relation per entity, with ID
        max relation ID + 1 + entity ID, holding its node attributes
    use_edge_data: if False, all link values are replaced by ones
    use_other_edges: if False, only the relations in dl.test_types and the
        entities of the first of them are kept
    node_val: filler for entities without attributes ('zero', 'rand',
        anything else = ones)

    returns: schema, data, dl
    '''
    dl = data_loader(DATA_FILE_DIR + prefix)
    all_entities = [
        Entity(entity_id, n_instances)
        for entity_id, n_instances in sorted(dl.nodes['count'].items())
    ]
    test_types = dl.test_types

    relations = {}
    if use_other_edges:
        for rel_id, (entity_i, entity_j) in sorted(dl.links['meta'].items()):
            relations[rel_id] = Relation(
                rel_id, [all_entities[entity_i], all_entities[entity_j]])
        entities = all_entities
    else:
        for rel_id in test_types:
            entity_i, entity_j = dl.links['meta'][rel_id]
            relations[rel_id] = Relation(
                rel_id, [all_entities[entity_i], all_entities[entity_j]])
        entities = list(np.unique(relations[test_types[0]].entities))

    # Attribute relations are numbered after the largest relation ID.
    max_relation = max(relations) + 1
    if use_node_attrs:
        # Create fake relations to represent node attributes
        for entity in entities:
            attr_rel_id = max_relation + entity.id
            relations[attr_rel_id] = Relation(attr_rel_id, [entity, entity],
                                              is_set=True)
    schema = DataSchema(entities, relations)

    data = SparseMatrixData(schema)
    shifts, counts = dl.nodes['shift'], dl.nodes['count']
    for rel_id, data_matrix in dl.links['data'].items():
        if not (use_other_edges or rel_id in test_types):
            continue
        # Get subset belonging to entities in relation
        relation = relations[rel_id]
        row_ent = relation.entities[0].id
        col_ent = relation.entities[1].id
        start_i = shifts[row_ent]
        end_i = start_i + counts[row_ent]
        start_j = shifts[col_ent]
        end_j = start_j + counts[col_ent]
        rel_matrix = data_matrix[start_i:end_i, start_j:end_j]
        data[rel_id] = SparseMatrix.from_scipy_sparse(rel_matrix.tocoo())
        if not use_edge_data:
            # Use only adjacency information
            data[rel_id].values = torch.ones(data[rel_id].values.shape)

    if use_node_attrs:
        for ent in entities:
            ent_id = ent.id
            attr_matrix = dl.nodes['attr'][ent_id]
            n_instances = dl.nodes['count'][ent_id]
            if attr_matrix is None:
                # No features for this entity: fall back to the filler.
                if node_val == 'zero':
                    attr_matrix = np.zeros((n_instances, 1))
                elif node_val == 'rand':
                    attr_matrix = np.random.randn(n_instances, 1)
                else:
                    attr_matrix = np.ones((n_instances, 1))
            n_channels = attr_matrix.shape[1]
            rel_id = ent_id + max_relation
            # Attributes sit on the diagonal of the entity's own relation.
            diag_indices = torch.arange(n_instances).unsqueeze(0).repeat(2, 1)
            data[rel_id] = SparseMatrix(
                indices=diag_indices,
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([n_instances, n_instances, n_channels]),
                is_set=True)

    return schema, data, dl
def load_data(prefix='DBLP', use_node_attrs=True, use_edge_data=True,
              feats_type=0):
    '''
    Load a node classification dataset from DATA_FILE_DIR + prefix.

    use_node_attrs: add one set relation per entity, with ID
        number of relations + entity ID, holding its node attributes
    use_edge_data: if False, all link values are replaced by ones
    feats_type: not used by this function

    returns: schema, schema_out, data, data_target, labels,
        train_val_test_idx (dict with 'train_idx', 'val_idx', 'test_idx'), dl

    The labelled training nodes are shuffled and 20% of them are held out for
    validation. Entity 0 is the target entity. Unless prefix is 'IMDB', the
    labels are converted to class indices with argmax.
    '''
    dl = data_loader(DATA_FILE_DIR + prefix)
    counts, shifts = dl.nodes['count'], dl.nodes['shift']

    # Create Schema
    entities = [
        Entity(entity_id, n_instances)
        for entity_id, n_instances in sorted(counts.items())
    ]
    relations = {}
    for rel_id, (entity_i, entity_j) in sorted(dl.links['meta'].items()):
        relations[rel_id] = Relation(
            rel_id, [entities[entity_i], entities[entity_j]])
    num_relations = len(relations)
    if use_node_attrs:
        # Create fake relations to represent node attributes
        for entity in entities:
            attr_rel_id = num_relations + entity.id
            relations[attr_rel_id] = Relation(attr_rel_id, [entity, entity],
                                              is_set=True)
    schema = DataSchema(entities, relations)

    # Collect data
    data = SparseMatrixData(schema)
    for rel_id, data_matrix in dl.links['data'].items():
        # Get subset belonging to entities in relation
        row_ent = relations[rel_id].entities[0].id
        col_ent = relations[rel_id].entities[1].id
        start_i = shifts[row_ent]
        end_i = start_i + counts[row_ent]
        start_j = shifts[col_ent]
        end_j = start_j + counts[col_ent]
        rel_matrix = data_matrix[start_i:end_i, start_j:end_j]
        data[rel_id] = SparseMatrix.from_scipy_sparse(rel_matrix.tocoo())
        if not use_edge_data:
            # Use only adjacency information
            data[rel_id].values = torch.ones(data[rel_id].values.shape)

    target_entity = 0

    if use_node_attrs:
        for ent_id, attr_matrix in dl.nodes['attr'].items():
            if attr_matrix is None:
                # Attribute for each node is a single 1
                attr_matrix = np.ones(dl.nodes['count'][ent_id])[:, None]
            n_channels = attr_matrix.shape[1]
            rel_id = ent_id + num_relations
            n_instances = dl.nodes['count'][ent_id]
            # Attributes sit on the diagonal of the entity's own relation.
            diag_indices = torch.arange(n_instances).unsqueeze(0).repeat(2, 1)
            data[rel_id] = SparseMatrix(
                indices=diag_indices,
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([n_instances, n_instances, n_channels]),
                is_set=True)

    # Output: a zero diagonal matrix over the target entity with one channel
    # per class.
    n_outputs = dl.nodes['count'][target_entity]
    n_output_classes = dl.labels_train['num_classes']
    target_ent = entities[target_entity]
    schema_out = DataSchema(
        [target_ent], [Relation(0, [target_ent, target_ent], is_set=True)])
    data_target = SparseMatrixData(schema_out)
    data_target[0] = SparseMatrix(
        indices=torch.arange(n_outputs, dtype=torch.int64).repeat(2, 1),
        values=torch.zeros([n_outputs, n_output_classes]),
        shape=(n_outputs, n_outputs, n_output_classes),
        is_set=True)

    labels = np.zeros((dl.nodes['count'][0], dl.labels_train['num_classes']),
                      dtype=int)

    # Split the labelled training nodes into train and validation sets.
    val_ratio = 0.2
    train_idx = np.nonzero(dl.labels_train['mask'])[0]
    np.random.shuffle(train_idx)
    split = int(train_idx.shape[0] * val_ratio)
    val_idx = np.sort(train_idx[:split])
    train_idx = np.sort(train_idx[split:])
    test_idx = np.nonzero(dl.labels_test['mask'])[0]

    labels[train_idx] = dl.labels_train['data'][train_idx]
    labels[val_idx] = dl.labels_train['data'][val_idx]
    if prefix != 'IMDB':
        labels = labels.argmax(axis=1)

    train_val_test_idx = {
        'train_idx': train_idx,
        'val_idx': val_idx,
        'test_idx': test_idx,
    }
    return (schema, schema_out, data, data_target, labels,
            train_val_test_idx, dl)
if args.training_data == 'val': train_data = val_data indices_identity = idx_id_val indices_transpose = idx_trans_val elif args.training_data == 'test': train_data = test_data indices_identity = idx_id_test indices_transpose = idx_trans_test val_data = test_data idx_id_val = indices_identity idx_trans_val = indices_transpose data_target = Data(schema_out) data_target[0] = SparseMatrix( indices=torch.arange(len(paper_names), dtype=torch.int64).repeat(2, 1), values=torch.zeros([len(paper_names), n_classes]), shape=(len(paper_names), len(paper_names), n_classes)) data_target = data_target.to(device) #%% # Loss function: def classification_loss(data_pred, data_true): return F.cross_entropy(data_pred, data_true) n_channels = 1 net = SparseMatrixEntityPredictor(schema, n_channels, layers=args.layers, fc_layers=args.fc_layers, activation=eval('nn.%s()' % args.act_fn),
def load_data_flat(prefix, use_node_attrs=True, use_edge_data=True, node_val='zero', feats_type=0):
    '''
    Load a dataset as one flat relation over all nodes, reproducing Maron 2019.

    Every node type is folded into a single entity of dl.nodes['total']
    instances, and all data is stacked channel-wise into one sparse matrix
    built on a common template (sum of all link matrices plus the diagonal).
    Channel order:
      1. one channel per relation type, in sorted relation-id order;
      2. if use_node_attrs is True, the attribute channels of each node type,
         placed on that type's segment of the diagonal.

    Args:
        prefix: dataset name, appended to DATA_FILE_DIR to locate the data.
            When it is 'IMDB' the labels are left as a 2-D array; otherwise
            they are reduced to class ids with argmax.
        use_node_attrs: if True, append node attribute channels.
        use_edge_data: if False, every link value is replaced by 1 so only
            adjacency information is kept.
        node_val: filler for node types whose attribute matrix is None:
            'zero' -> zeros, 'rand' -> standard normal samples,
            anything else -> ones (a single channel in every case).
        feats_type: when equal to 1, every non-target node type gets a
            10-channel all-zero attribute matrix instead of its own.

    Returns:
        Tuple (schema, schema_out, data, data_target, labels,
        train_val_test_idx, dl), where train_val_test_idx is a dict with keys
        'train_idx', 'val_idx' and 'test_idx'. 20% of the shuffled training
        indices are held out for validation; labels are filled in only for
        training and validation nodes.
    '''
    dl = data_loader(DATA_FILE_DIR + prefix)
    total_n_nodes = dl.nodes['total']

    # One entity holding every node, related to itself by a single relation.
    entities = [Entity(0, total_n_nodes)]
    all_nodes = entities[0]
    relations = {0: Relation(0, [all_nodes, all_nodes])}
    schema = DataSchema(entities, relations)

    link_matrices = dl.links['data']

    # Template matrix: all link matrices summed, plus the full diagonal.
    # NOTE(review): zero_() presumably clears the values while keeping the
    # indices, so the template only carries the sparsity pattern - confirm.
    template = sum(link_matrices.values()).tocoo()
    diagonal = scipy.sparse.coo_matrix(
        (np.ones(total_n_nodes),
         (np.arange(total_n_nodes), np.arange(total_n_nodes))),
        (total_n_nodes, total_n_nodes))
    template += diagonal
    template = SparseMatrix.from_scipy_sparse(template.tocoo()).zero_()

    # Start with zero channels and append one block of channels at a time.
    flat = SparseMatrix.from_other_sparse_matrix(template, 0)

    # Relation channels, in sorted relation-id order.
    for rel_id in sorted(link_matrices):
        rel_channel = SparseMatrix.from_scipy_sparse(link_matrices[rel_id].tocoo())
        if not use_edge_data:
            # Keep adjacency only: overwrite every stored value with 1.
            rel_channel.values = torch.ones(rel_channel.values.shape)
        # Adding to a one-channel copy of the template puts this relation on
        # the shared index set so its values can be concatenated below.
        on_template = SparseMatrix.from_other_sparse_matrix(template, 1) + rel_channel
        flat.values = torch.cat([flat.values, on_template.values], 1)
        flat.n_channels += 1

    target_entity = 0

    # Node attribute channels, one diagonal segment per node type.
    if use_node_attrs:
        for ent_id, attr_matrix in dl.nodes['attr'].items():
            start_i = dl.nodes['shift'][ent_id]
            n_instances = dl.nodes['count'][ent_id]

            if attr_matrix is None:
                # No features for this node type: use a one-channel filler.
                if node_val == 'rand':
                    attr_matrix = np.random.randn(n_instances, 1)
                elif node_val == 'zero':
                    attr_matrix = np.zeros((n_instances, 1))
                else:
                    attr_matrix = np.ones((n_instances, 1))

            if feats_type == 1 and ent_id != target_entity:
                # To keep same behaviour as non-LGNN model, use 10 dimensions
                attr_matrix = np.zeros((n_instances, 10))

            n_channels = attr_matrix.shape[1]
            segment = torch.arange(start_i, start_i + n_instances)
            attr_block = SparseMatrix(
                indices=segment.unsqueeze(0).repeat(2, 1),
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([total_n_nodes, total_n_nodes, n_channels]),
                is_set=True)
            on_template = SparseMatrix.from_other_sparse_matrix(
                template, n_channels) + attr_block
            flat.values = torch.cat([flat.values, on_template.values], 1)
            flat.n_channels += n_channels

    data = SparseMatrixData(schema)
    data[0] = flat

    # Prediction target: one all-zero row of class scores per node, stored on
    # the diagonal.
    n_outputs = total_n_nodes
    n_output_classes = dl.labels_train['num_classes']
    target_type = entities[target_entity]
    schema_out = DataSchema(
        [target_type],
        [Relation(0, [target_type, target_type], is_set=True)])
    data_target = SparseMatrixData(schema_out)
    data_target[0] = SparseMatrix(
        indices=torch.arange(n_outputs, dtype=torch.int64).repeat(2, 1),
        values=torch.zeros([n_outputs, n_output_classes]),
        shape=(n_outputs, n_outputs, n_output_classes),
        is_set=True)

    labels = np.zeros((dl.nodes['count'][0], dl.labels_train['num_classes']),
                      dtype=int)

    # Shuffle the labelled training nodes, hold out the first 20% for
    # validation, then sort each part.
    val_ratio = 0.2
    shuffled = np.nonzero(dl.labels_train['mask'])[0]
    np.random.shuffle(shuffled)
    n_val = int(shuffled.shape[0] * val_ratio)
    val_idx = np.sort(shuffled[:n_val])
    train_idx = np.sort(shuffled[n_val:])
    test_idx = np.nonzero(dl.labels_test['mask'])[0]

    # Only training and validation rows receive labels; the rest stay zero.
    labels[train_idx] = dl.labels_train['data'][train_idx]
    labels[val_idx] = dl.labels_train['data'][val_idx]
    if prefix != 'IMDB':
        labels = labels.argmax(axis=1)

    train_val_test_idx = {
        'train_idx': train_idx,
        'val_idx': val_idx,
        'test_idx': test_idx,
    }

    return (schema, schema_out, data, data_target, labels,
            train_val_test_idx, dl)