def make_data(self, tucker):
    # Build one tensor per relation by applying calculate_relation to the
    # participating entities' embeddings; `tucker` is forwarded to that call.
    data = Data(self.schema)
    for rel in self.schema.relations:
        embeddings = [self.embeddings[ent.id] for ent in rel.entities]
        rel_data = self.calculate_relation(tucker, *embeddings)
        data[rel.id] = torch.tensor(rel_data, dtype=torch.float32).unsqueeze(0)
        if self.batch_dim:
            data[rel.id] = data[rel.id].unsqueeze(0)
    return data
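
# Illustration of the Tucker-style contraction that calculate_relation is
# presumably applying above for a binary relation: a core tensor couples one
# embedding per participating entity. This is a self-contained sketch under
# that assumption, not the actual calculate_relation implementation.
import numpy as np
core = np.random.normal(size=(5, 5))     # Tucker core for a binary relation
emb_a = np.random.normal(size=(4, 5))    # (n_instances_a, n_dim_ent)
emb_b = np.random.normal(size=(6, 5))    # (n_instances_b, n_dim_ent)
rel_data = emb_a @ core @ emb_b.T        # (n_instances_a, n_instances_b)
assert rel_data.shape == (4, 6)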
def forward(self, data):
    # Apply each (input relation, output relation) block and sum all
    # contributions that target the same output relation.
    data_out = Data(self.schema)
    for relation_i, relation_j in self.relation_pairs:
        X = data[relation_i.id]
        layer = self.block_modules[str((relation_i.id, relation_j.id))]
        out = layer(X)
        if relation_j.id not in data_out:
            data_out[relation_j.id] = out
        else:
            data_out[relation_j.id] = data_out[relation_j.id] + out
    return data_out
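
# The accumulate-or-initialize pattern above, isolated as a pure-Python toy:
# contributions from every input relation i targeting the same output
# relation j are summed, which is the per-block sum of an equivariant linear
# map. No dependence on the surrounding classes.
pairs = [(0, 0), (1, 0), (1, 1)]
contributions = {(0, 0): 1.0, (1, 0): 2.0, (1, 1): 5.0}
out = {}
for i, j in pairs:
    out[j] = out.get(j, 0.0) + contributions[(i, j)]
assert out == {0: 3.0, 1: 5.0}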
def forward(self, data):
    # Pool each relation's tensor over all dimensions not belonging to the
    # target entity to produce per-entity encodings.
    out = Data(self.enc_schema, batch_size=data.batch_size)
    for entity in self.schema.entities:
        for relation in self.schema.relations.values():
            if entity not in relation.entities:
                continue
            pooling_dims = self.get_pooling_dims(entity, relation)
            data_rel = data[relation.id]
            entity_out = self.pool_tensor(data_rel, pooling_dims)
            entity_out = self.pool_tensor_diag(entity_out)
            out[entity.id] = entity_out
    return out
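
# A self-contained sketch of pooling a relation tensor down to one entity's
# dimension, assuming pool_tensor takes a mean over the pooled dimensions
# (the real reduction may differ).
import torch
rel_tensor = torch.randn(2, 1, 4, 6)    # (batch, channel, n_person, n_course)
entity_out = rel_tensor.mean(dim=(3,))  # pool out the course dimension
assert entity_out.shape == (2, 1, 4)    # one encoding per person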
def forward(self, data):
    # For each (input, output) relation pair, update the output relation's
    # tensor from both the input and current output tensors, summing
    # contributions that target the same output relation.
    data_out = Data(self.schema)
    for relation_i, relation_j in self.relation_pairs:
        self.logger.info("Relation: ({}, {})".format(relation_i.id,
                                                     relation_j.id))
        X_in = data[relation_i.id]
        Y_in = data[relation_j.id]
        layer = self.block_modules[str((relation_i.id, relation_j.id))]
        Y_out = layer(X_in, Y_in)
        if relation_j.id not in data_out:
            data_out[relation_j.id] = Y_out
        else:
            data_out[relation_j.id] = data_out[relation_j.id] + Y_out
    return data_out
def generate_data(self, n_dim_ent=5, batch_size=1):
    if self.embeddings is None:
        self.generate_embeddings(n_dim_ent, batch_size)

    # Synthetic functions mapping entity embeddings to relation values.
    # TODO: make two-channeled
    def rel_student_fn(embedding):
        return 100 * np.mean(np.abs(np.sin(embedding)), 1)

    def rel_courses_fn(embedding):
        return 100 * np.round(np.sum(np.arctan(np.exp(embedding)), 1))

    def rel_professor_fn(embedding):
        return np.sum(np.sign(embedding), 1) + n_dim_ent

    def rel_takes_fn(embedding_student, embedding_course):
        return 100 / (1 + np.exp(embedding_student @ embedding_course.T))

    def rel_ref_fn(embedding_student, embedding_professor):
        return np.sign(embedding_student @ embedding_professor.T)

    def rel_teaches_fn(embed_professor, embed_course):
        return 50 + 50 * (np.sin(embed_professor) @ np.cos(embed_course).T)

    def rel_prereq_fn(embed_course1, embed_course2):
        return 50 * np.pi * np.arctan(embed_course1 @ embed_course2.T)

    # Map relation ids to their generating functions.
    rel_fns = {0: rel_takes_fn,
               1: rel_ref_fn,
               2: rel_teaches_fn,
               3: rel_prereq_fn,
               4: rel_student_fn,
               5: rel_courses_fn,
               6: rel_professor_fn}

    # TODO: change sparsity
    data = Data(self.schema)
    for relation in self.schema.relations:
        entities = relation.entities
        relation_data = torch.zeros(batch_size, 1, *relation.get_shape())
        for batch in range(batch_size):
            ent_embeddings = [self.embeddings[ent.id][batch]
                              for ent in entities]
            relation_data[batch] = torch.tensor(
                rel_fns[relation.id](*ent_embeddings))
        data[relation.id] = relation_data
    return data
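
# Standalone check of one of the synthetic relation functions: rel_takes_fn
# maps a pair of embeddings to values in (0, 100) via a scaled logistic of
# their pairwise inner products. Shapes follow the per-batch slices used above.
import numpy as np
emb_student = np.random.normal(size=(3, 5))  # (n_students, n_dim_ent)
emb_course = np.random.normal(size=(7, 5))   # (n_courses, n_dim_ent)
takes = 100 / (1 + np.exp(emb_student @ emb_course.T))
assert takes.shape == (3, 7)
assert (takes > 0).all() and (takes < 100).all()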
def make_observed(self, sparsity, n_channels=1, min_val=-2, max_val=2):
    # Sample a random sparse subset of each relation's entries, with values
    # drawn uniformly from [min_val, max_val].
    data = Data(self.schema)
    for rel in self.schema.relations:
        n_entries = int(sparsity * rel.get_n_entries())
        # Indices are sampled with replacement; coalesce() merges duplicates.
        indices = np.zeros((len(rel.entities), n_entries), dtype=np.int64)
        for i, entity in enumerate(rel.entities):
            indices[i] = np.random.randint(0, entity.n_instances, n_entries)
        values = np.single(np.random.uniform(min_val, max_val,
                                             (n_channels, n_entries)))
        shape = np.array(rel.get_shape())
        data[rel.id] = SparseTensor(indices, values, shape).coalesce()
    return data
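
# Toy illustration of the index-sampling step above for a binary relation:
# the number of sampled entries is sparsity times the relation's total size,
# and duplicates can occur, which is why the result is coalesce()d.
# Self-contained, numpy only.
import numpy as np
n_instances = [4, 6]               # entity sizes
sparsity = 0.5
n_entries = int(sparsity * 4 * 6)  # sparsity * rel.get_n_entries()
indices = np.stack([np.random.randint(0, n, n_entries) for n in n_instances])
assert indices.shape == (2, n_entries)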
def __init__(self):
    # Read each relation's CSV file into a dict of column-name -> value list.
    data_raw = {rel_name: {key: list() for key in schema_dict[rel_name].keys()}
                for rel_name in schema_dict.keys()}
    for relation_name in relation_names:
        with open(csv_file_str.format(relation_name)) as file:
            reader = csv.reader(file)
            keys = schema_dict[relation_name].keys()
            for cols in reader:
                for key, col in zip(keys, cols):
                    data_raw[relation_name][key].append(col)

    ent_person = Entity(0, len(data_raw['person']['p_id']))
    ent_course = Entity(1, len(data_raw['course']['course_id']))
    entities = [ent_person, ent_course]

    rel_person = Relation(0, [ent_person, ent_person], is_set=True)
    rel_course = Relation(1, [ent_course, ent_course], is_set=True)
    rel_advisedBy = Relation(2, [ent_person, ent_person])
    rel_taughtBy = Relation(3, [ent_course, ent_person])
    relations = [rel_person, rel_course, rel_advisedBy, rel_taughtBy]

    self.schema = DataSchema(entities, relations)
    self.data = SparseMatrixData(self.schema)

    ent_id_to_idx_dict = {'person': self.id_to_idx(data_raw['person']['p_id']),
                          'course': self.id_to_idx(data_raw['course']['course_id'])}

    # Convert each relation's raw columns to a sparse matrix.
    for relation in relations:
        relation_name = relation_names[relation.id]
        print(relation_name)
        if relation.is_set:
            data_matrix = self.set_relation_to_matrix(
                relation, schema_dict[relation_name], data_raw[relation_name])
        else:
            if relation_name == 'advisedBy':
                ent_n_id_str = 'p_id'
                ent_m_id_str = 'p_id_dummy'
            elif relation_name == 'taughtBy':
                ent_n_id_str = 'course_id'
                ent_m_id_str = 'p_id'
            data_matrix = self.binary_relation_to_matrix(
                relation, schema_dict[relation_name], data_raw[relation_name],
                ent_id_to_idx_dict, ent_n_id_str, ent_m_id_str)
        self.data[relation.id] = data_matrix

    self.target = self.get_targets(
        data_raw[self.TARGET_RELATION][self.TARGET_KEY],
        schema_dict[self.TARGET_RELATION][self.TARGET_KEY])
    self.target_rel_id = 0
    rel_out = Relation(self.target_rel_id, [ent_person, ent_person],
                       is_set=True)
    self.schema_out = DataSchema([ent_person], [rel_out])
    self.data_target = Data(self.schema_out)
    n_output_classes = len(
        np.unique(data_raw[self.TARGET_RELATION][self.TARGET_KEY]))
    self.output_dim = n_output_classes
    n_person = ent_person.n_instances
    # Diagonal sparse matrix holding the per-person class outputs.
    self.data_target[self.target_rel_id] = SparseMatrix(
        indices=torch.arange(n_person, dtype=torch.int64).repeat(2, 1),
        values=torch.zeros([n_person, n_output_classes]),
        shape=(n_person, n_person, n_output_classes))
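
# A minimal stand-in for the id_to_idx helper used above, assuming it maps
# raw string ids to contiguous zero-based indices in first-seen order (the
# real method may differ):
def id_to_idx_sketch(raw_ids):
    return {raw_id: i for i, raw_id in enumerate(dict.fromkeys(raw_ids))}

assert id_to_idx_sketch(['p7', 'p2', 'p9']) == {'p7': 0, 'p2': 1, 'p9': 2}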
def __init__(self):
    self.target_relation = 'advisedBy'
    # Read each relation's CSV file into a dict of column-name -> value list.
    data_raw = {rel_name: {key: list() for key in schema_dict[rel_name].keys()}
                for rel_name in schema_dict.keys()}
    for relation_name in relation_names:
        with open(csv_file_str.format(relation_name)) as file:
            reader = csv.reader(file)
            keys = schema_dict[relation_name].keys()
            for cols in reader:
                for key, col in zip(keys, cols):
                    data_raw[relation_name][key].append(col)

    ent_person = Entity(0, len(data_raw['person']['p_id']))
    ent_course = Entity(1, len(data_raw['course']['course_id']))
    entities = [ent_person, ent_course]

    # Matrix (set) versions of the entity relations, plus the binary relations.
    rel_person_matrix = Relation(0, [ent_person, ent_person], is_set=True)
    rel_person = Relation(0, [ent_person])
    rel_course_matrix = Relation(1, [ent_course, ent_course], is_set=True)
    rel_course = Relation(1, [ent_course])
    rel_advisedBy = Relation(2, [ent_person, ent_person])
    rel_taughtBy = Relation(3, [ent_course, ent_person])
    relations_matrix = [rel_person_matrix, rel_course_matrix,
                        rel_advisedBy, rel_taughtBy]
    # The target relation (advisedBy) is excluded from the input schema.
    relations = [rel_person, rel_course, rel_taughtBy]
    self.target_rel_id = 2
    self.schema = DataSchema(entities, relations)
    schema_matrix = DataSchema(entities, relations_matrix)
    matrix_data = SparseMatrixData(schema_matrix)

    ent_id_to_idx_dict = {'person': self.id_to_idx(data_raw['person']['p_id']),
                          'course': self.id_to_idx(data_raw['course']['course_id'])}

    for relation in relations_matrix:
        relation_name = relation_names[relation.id]
        print(relation_name)
        if relation.is_set:
            data_matrix = self.set_relation_to_matrix(
                relation, schema_dict[relation_name], data_raw[relation_name])
        else:
            if relation_name == 'advisedBy':
                ent_n_id_str = 'p_id'
                ent_m_id_str = 'p_id_dummy'
            elif relation_name == 'taughtBy':
                ent_n_id_str = 'course_id'
                ent_m_id_str = 'p_id'
            data_matrix = self.binary_relation_to_matrix(
                relation, schema_dict[relation_name], data_raw[relation_name],
                ent_id_to_idx_dict, ent_n_id_str, ent_m_id_str)
        matrix_data[relation.id] = data_matrix

    rel_out = Relation(2, [ent_person, ent_person])
    self.schema_out = DataSchema([ent_person], [rel_out])
    self.output_dim = 1

    # Densify the sparse matrices; set relations keep only their diagonal.
    data = Data(self.schema)
    for rel_matrix in schema_matrix.relations:
        for rel in self.schema.relations:
            if rel_matrix.id == rel.id:
                data_matrix = matrix_data[rel_matrix.id]
                if rel_matrix.is_set:
                    dense_data = torch.diagonal(data_matrix.to_dense(),
                                                0, 1, 2).unsqueeze(0)
                else:
                    dense_data = data_matrix.to_dense().unsqueeze(0)
                data[rel.id] = dense_data
    self.data = data
    self.target = matrix_data[self.target_rel_id].to_dense().squeeze()
def forward(self, encodings):
    # Decode per-entity encodings back into one tensor per relation.
    data_out = Data(self.schema)
    for relation in self.schema.relations.values():
        data_out[relation.id] = self.make_relation(encodings, relation)
    return data_out
#ent_classes = Entity(1, n_classes)
ent_words = Entity(1, n_words)
rel_paper = Relation(0, [ent_papers, ent_papers], is_set=True)
rel_cites = Relation(0, [ent_papers, ent_papers])
rel_content = Relation(1, [ent_papers, ent_words])
schema = DataSchema([ent_papers, ent_words], [rel_cites, rel_content])
schema_out = DataSchema([ent_papers], [rel_paper])

targets = torch.LongTensor(paper[1])
data = SparseMatrixData(schema)
data[0] = cites_matrix
data[1] = content_matrix

indices_identity, indices_transpose = data.calculate_indices()

data_target = Data(schema_out)
data_target[0] = SparseMatrix(
    indices=torch.arange(n_papers, dtype=torch.int64).repeat(2, 1),
    values=torch.zeros([n_papers, n_classes]),
    shape=(n_papers, n_papers, n_classes))
data_target = data_target.to(device)

#%%
# Loss function:
def classification_loss(data_pred, data_true):
    return F.cross_entropy(data_pred, data_true)

n_channels = 1
net = SparseMatrixEntityPredictor(schema, n_channels,
for i in progress:
    optimizer.zero_grad()
    data_out = net.forward(data_hidden)
    train_loss = loss_fcn(data_out, data_hidden, observed)
    train_loss.backward()
    optimizer.step()
    with torch.no_grad():
        val_loss = loss_fcn(data_out, data, missing)
        sched.step(val_loss)
    progress.set_description("Train: {:.4f}, Val: {:.4f}".format(
        train_loss.item(), val_loss.item()))

#%%
# Predict means (i.e. 0)
fake_data_out = Data(schema, {key: torch.zeros_like(val)
                              for key, val in data.rel_tensors.items()})
print(loss_fcn(fake_data_out, data, observed))

#%%
encoding_size = net.get_encoding_size()
total_encoding_size = sum(enc[0] * enc[1] for enc in encoding_size.values())
num_els = {key: val.numel() for key, val in data.items()}
print("Total datapoints: ", sum(d.numel() for d in data.values()))
print("Total params: ",
      sum(p.numel() for p in net.parameters() if p.requires_grad))

def std_mean_per_entity(data_pred, data_true):
    '''For each entity instance in each relation, get the difference between
    the predicted and actual means. Return mean and std of these differences
def load_data():
    paper_names = []
    classes = []
    word_names = ['word' + str(i + 1) for i in range(1433)]

    # Load papers and their class labels.
    with open(csv_file_str.format('paper')) as paperfile:
        reader = csv.reader(paperfile)
        for paper_name, class_name in reader:
            paper_names.append(paper_name)
            classes.append(class_name)
    class_names = list(np.unique(classes))
    class_name_to_idx = {class_name: i
                         for i, class_name in enumerate(class_names)}
    paper_name_to_idx = {paper_name: i
                         for i, paper_name in enumerate(paper_names)}
    paper = np.array([[paper_name_to_idx[paper_name]
                       for paper_name in paper_names],
                      [class_name_to_idx[class_name]
                       for class_name in classes]])

    # Load citation edges as (citer, citee) index pairs.
    cites = []
    with open(csv_file_str.format('cites')) as citesfile:
        reader = csv.reader(citesfile)
        for citer, citee in reader:
            cites.append([paper_name_to_idx[citer], paper_name_to_idx[citee]])
    cites = np.array(cites).T

    # Load (paper, word) occurrence pairs.
    content = []

    def word_to_idx(word):
        '''Words are all formatted like "word1328".'''
        return int(word[4:]) - 1

    with open(csv_file_str.format('content')) as contentfile:
        reader = csv.reader(contentfile)
        for paper_name, word_name in reader:
            content.append([paper_name_to_idx[paper_name],
                            word_to_idx(word_name)])
    content = np.array(content).T

    n_papers = len(paper_names)
    n_classes = len(class_names)
    n_words = len(word_names)

    ent_papers = Entity(0, n_papers)
    ent_classes = Entity(1, n_classes)
    ent_words = Entity(2, n_words)
    entities = [ent_papers, ent_classes, ent_words]
    rel_paper = Relation(0, [ent_papers, ent_classes])
    rel_cites = Relation(1, [ent_papers, ent_papers])
    rel_content = Relation(2, [ent_papers, ent_words])
    relations = [rel_paper, rel_cites, rel_content]
    schema = DataSchema(entities, relations)

    class_targets = torch.LongTensor(paper[1])

    # Build dense indicator matrices by setting one entry per (row, column)
    # index pair. Indexing with the row and column arrays separately is
    # required; indexing with the 2-D array itself would fill whole rows.
    paper_matrix = torch.zeros(n_papers, n_classes)
    paper_matrix[paper[0], paper[1]] = 1
    cites_matrix = torch.zeros(n_papers, n_papers)
    cites_matrix[cites[0], cites[1]] = 1
    content_matrix = torch.zeros(n_papers, n_words)
    content_matrix[content[0], content[1]] = 1

    data = Data(schema)
    data[0] = paper_matrix.unsqueeze(0).unsqueeze(0)
    data[1] = cites_matrix.unsqueeze(0).unsqueeze(0)
    data[2] = content_matrix.unsqueeze(0).unsqueeze(0)
    return data, schema, class_targets
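
# Self-contained illustration of the indicator-matrix construction above:
# each (row, column) index pair sets exactly one entry to 1.
import torch
idx = torch.tensor([[0, 1, 2], [2, 0, 2]])  # e.g. paper indices, class indices
m = torch.zeros(3, 3)
m[idx[0], idx[1]] = 1
assert m.sum() == 3 and m[1, 0] == 1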