def __init__(self,
                 entity_counts,
                 sparsity=0.5,
                 embedding_dims=2,
                 tucker=False,
                 batch_dim=True):
        """Build a synthetic student/course/professor dataset.

        Args:
            entity_counts: indexable whose items 0, 1 and 2 are the number of
                students, courses and professors respectively.
            sparsity: value forwarded to ``make_observed``.
            embedding_dims: value forwarded to ``make_embeddings``.
            tucker: flag forwarded to ``make_data``.
            batch_dim: whether to include a batch dimension.
        """
        self.n_student = entity_counts[0]
        self.n_course = entity_counts[1]
        self.n_professor = entity_counts[2]
        self.sparsity = sparsity
        # Assigned once; the original set this attribute a second time with
        # the same value further down.
        self.embedding_dims = embedding_dims
        self.tucker = tucker
        # Whether to include a batch dimension
        self.batch_dim = batch_dim

        ent_students = Entity(0, self.n_student)
        ent_courses = Entity(1, self.n_course)
        ent_professors = Entity(2, self.n_professor)
        entities = [ent_students, ent_courses, ent_professors]
        # Five binary relations; relations 3 and 4 are both course-course.
        relations = [
            Relation(0, [ent_students, ent_courses], 1),
            Relation(1, [ent_students, ent_professors], 1),
            Relation(2, [ent_professors, ent_courses], 1),
            Relation(3, [ent_courses, ent_courses], 1),
            Relation(4, [ent_courses, ent_courses], 1),
        ]

        self.schema = DataSchema(entities, relations)
        # Fixed seed for NumPy's global RNG before generating anything;
        # presumably so the embeddings/data/mask below are reproducible.
        np.random.seed(0)
        self.embeddings = self.make_embeddings(self.embedding_dims)
        self.data = self.make_data(self.tucker)
        self.observed = self.make_observed(self.sparsity)
    def get_node_classification_data(self):
        """Prepare the output schema and label tensors for node classification.

        Reads LABEL_FILE_STR, where each line holds four tab-separated fields
        (node id, node name, node type, node label). Each raw node id is
        translated through ``self.node_id_to_idx`` for its node type.

        Sets ``schema_out``, ``target_indices``, ``targets``, ``n_outputs``
        and ``data_target`` on the instance; returns nothing.
        """
        target_entity = self.schema.entities[TARGET_NODE_TYPE]

        # Output schema: a single set relation over the target entity
        out_relation = Relation(0, [target_entity, target_entity], is_set=True)
        self.schema_out = DataSchema([target_entity], [out_relation])

        label_indices = []
        label_values = []
        with open(LABEL_FILE_STR, 'r') as label_file:
            for row in label_file:
                raw_id, _name, raw_type, raw_label = row.rstrip().split('\t')
                type_idx = int(raw_type)
                node_idx = self.node_id_to_idx[type_idx][int(raw_id)]
                label = int(raw_label)
                label_indices.append(node_idx)
                label_values.append(label)

        self.target_indices = torch.LongTensor(label_indices)
        self.targets = torch.LongTensor(label_values)
        self.n_outputs = target_entity.n_instances
        self.data_target = SparseMatrixData(self.schema_out)
    def __init__(self, n_student, n_course, n_professor):
        """Define the student/course/professor schema; no data is generated.

        Args:
            n_student: number of student instances.
            n_course: number of course instances.
            n_professor: number of professor instances.
        """
        self.n_student = n_student
        self.n_course = n_course
        self.n_professor = n_professor

        students = Entity(0, self.n_student)
        courses = Entity(1, self.n_course)
        professors = Entity(2, self.n_professor)

        #TODO: Fix student self-relation to have two channels
        relations = [
            Relation(0, [students, courses], 1),      # Takes
            Relation(1, [students, professors], 1),   # Reference
            Relation(2, [professors, courses], 1),    # Teaches
            Relation(3, [courses, courses], 1),       # Prereq
            Relation(4, [students], 1),               # Student
            Relation(5, [courses], 1),                # Course
            Relation(6, [professors], 1),             # Professor
        ]

        # pick n dimensions
        # Draw from n-dimensional normal dist to get encodings for each entity
        self.schema = DataSchema([students, courses, professors], relations)
        # Embeddings are not created here
        self.embeddings = None
Beispiel #4
0
 def __init__(self, schema, dims):
     """Store the schema and dims, and derive a per-entity encoding schema.

     schema: schema whose entities are pooled over
     dims: stored as-is on the instance
     """
     super(EntityPooling, self).__init__()
     self.schema = schema
     self.dims = dims
     entities = self.schema.entities
     # Number of instances of every entity, in schema order
     self.out_shape = [entity.n_instances for entity in entities]
     # Encoding "schema": one single-entity relation per entity position
     enc_relations = {}
     for position in range(len(entities)):
         enc_relations[position] = Relation(position, [entities[position]])
     self.enc_schema = DataSchema(entities, enc_relations)
Beispiel #5
0
 def __init__(self,
              schema,
              input_dim=1,
              output_dim=1,
              entities=None,
              pool_op='mean'):
     '''
     Layer mapping data on `schema` to per-entity encodings.

     schema: input schema, passed to the parent as its schema
     input_dim: either a rel_id: dimension dict, or an integer for all relations
     output_dim: either a rel_id: dimension dict, or an integer for all relations
     entities: entities to produce encodings for; defaults to schema.entities
     pool_op: forwarded unchanged to the parent constructor
     '''
     # Identity check: `== None` may dispatch to a custom/elementwise __eq__
     if entities is None:
         entities = schema.entities
     # One set relation (entity x entity) per entity, keyed by the entity id
     enc_relations = {
         entity.id: Relation(entity.id, [entity, entity], is_set=True)
         for entity in entities
     }
     encodings_schema = DataSchema(entities, enc_relations)
     super().__init__(schema,
                      input_dim,
                      output_dim,
                      schema_out=encodings_schema,
                      pool_op=pool_op)
Beispiel #6
0
 def __init__(self,
              schema,
              input_dim=1,
              output_dim=1,
              entities=None,
              pool_op='mean'):
     '''
     Layer mapping per-entity encodings onto `schema`.

     schema: schema to broadcast to
     input_dim: either a rel_id: dimension dict, or an integer for all relations
     output_dim: either a rel_id: dimension dict, or an integer for all relations
     entities: if specified, these are the input entities for the encodings;
         defaults to schema.entities
     pool_op: forwarded unchanged to the parent constructor
     '''
     # Identity check: `== None` may dispatch to a custom/elementwise __eq__
     if entities is None:
         entities = schema.entities
     # One set relation (entity x entity) per entity, keyed by the entity id
     enc_relations = {
         entity.id: Relation(entity.id, [entity, entity], is_set=True)
         for entity in entities
     }
     encodings_schema = DataSchema(entities, enc_relations)
     # The encodings schema is the input here; `schema` is the output
     super().__init__(encodings_schema,
                      input_dim,
                      output_dim,
                      schema_out=schema,
                      pool_op=pool_op)
    def __init__(self, entity_counts, sparsity=0.5, n_channels=1):
        """Define a student/course/professor schema and its observed mask.

        Args:
            entity_counts: indexable whose items 0, 1 and 2 are the number of
                students, courses and professors.
            sparsity: upper estimate of sparsity, forwarded to
                ``make_observed``.
            n_channels: forwarded to ``make_observed``.
        """
        self.n_student = entity_counts[0]
        self.n_course = entity_counts[1]
        self.n_professor = entity_counts[2]
        # Upper estimate of sparsity
        self.sparsity = sparsity

        students = Entity(0, self.n_student)
        courses = Entity(1, self.n_course)
        professors = Entity(2, self.n_professor)

        relations = [
            Relation(0, [students, courses], 1),
            Relation(1, [students, professors], 1),
            Relation(2, [professors, courses], 1),
            # Ternary relation over all three entity types
            Relation(3, [students, professors, courses], 1),
            Relation(4, [courses, courses], 1),
            Relation(5, [students], 1),
            #Relation(6, [students, students, students, courses], 1),
        ]

        self.schema = DataSchema([students, courses, professors], relations)

        self.observed = self.make_observed(self.sparsity, n_channels)
Beispiel #8
0
    args = get_hyperparams(argv)
    print(args)
    set_seed(args.seed)

    dataloader = PubMedData(args.node_labels)
    schema = dataloader.schema
    data = dataloader.data.to(device)
    indices_identity, indices_transpose = data.calculate_indices()
    embedding_entity = schema.entities[TARGET_NODE_TYPE]
    input_channels = {
        rel.id: data[rel.id].n_channels
        for rel in schema.relations
    }
    embedding_schema = DataSchema(
        schema.entities,
        Relation(0, [embedding_entity, embedding_entity], is_set=True))
    n_instances = embedding_entity.n_instances
    data_embedding = SparseMatrixData(embedding_schema)
    data_embedding[0] = SparseMatrix(
        indices=torch.arange(n_instances, dtype=torch.int64).repeat(2, 1),
        values=torch.zeros([n_instances, args.embedding_dim]),
        shape=(n_instances, n_instances, args.embedding_dim),
        is_set=True)
    data_embedding.to(device)
    target_schema = DataSchema(schema.entities,
                               schema.relations[TARGET_REL_ID])
    target_node_idx_to_id = dataloader.target_node_idx_to_id
    #%%
    net = SparseMatrixAutoEncoder(schema,
                                  input_channels,
                                  layers=args.layers,
Beispiel #9
0
    def __init__(self):
        """Load the person/course CSV relations and build dense input data.

        Every relation named in ``relation_names`` is read from
        ``csv_file_str.format(name)`` into column lists keyed by the keys of
        ``schema_dict[name]``. All four relations are first materialised as
        sparse matrices; the input schema (``self.schema``) then keeps only
        the person, course and taughtBy relations, while the advisedBy
        relation (id 2) becomes the prediction target.

        Sets ``target_relation``, ``target_rel_id``, ``schema``,
        ``schema_out``, ``output_dim``, ``data`` and ``target``.
        """
        self.target_relation = 'advisedBy'

        # One empty column list per (relation, key) pair
        data_raw = {
            rel_name: {key: list()
                       for key in schema_dict[rel_name].keys()}
            for rel_name in schema_dict.keys()
        }

        # Fill the column lists row by row; columns are matched to keys by
        # position
        for relation_name in relation_names:
            with open(csv_file_str.format(relation_name)) as file:
                reader = csv.reader(file)
                keys = schema_dict[relation_name].keys()
                for cols in reader:
                    for key, col in zip(keys, cols):
                        data_raw[relation_name][key].append(col)

        # Entity sizes are the number of rows read for each entity table
        ent_person = Entity(0, len(data_raw['person']['p_id']))
        ent_course = Entity(1, len(data_raw['course']['course_id']))
        entities = [ent_person, ent_course]

        # Each entity table exists in two forms sharing one id: a set
        # relation (entity x entity) for the matrix schema and a
        # single-entity relation for the dense input schema
        rel_person_matrix = Relation(0, [ent_person, ent_person], is_set=True)
        rel_person = Relation(0, [ent_person])
        rel_course_matrix = Relation(1, [ent_course, ent_course], is_set=True)
        rel_course = Relation(1, [ent_course])
        rel_advisedBy = Relation(2, [ent_person, ent_person])
        rel_taughtBy = Relation(3, [ent_course, ent_person])
        relations_matrix = [
            rel_person_matrix, rel_course_matrix, rel_advisedBy, rel_taughtBy
        ]
        # advisedBy is left out of the input relations: it is the target
        relations = [rel_person, rel_course, rel_taughtBy]

        self.target_rel_id = 2
        self.schema = DataSchema(entities, relations)
        schema_matrix = DataSchema(entities, relations_matrix)
        matrix_data = SparseMatrixData(schema_matrix)

        ent_id_to_idx_dict = {
            'person': self.id_to_idx(data_raw['person']['p_id']),
            'course': self.id_to_idx(data_raw['course']['course_id'])
        }

        for relation in relations_matrix:
            # relation ids double as positions in relation_names
            relation_name = relation_names[relation.id]
            print(relation_name)
            if relation.is_set:
                data_matrix = self.set_relation_to_matrix(
                    relation, schema_dict[relation_name],
                    data_raw[relation_name])
            else:
                # Column names holding the row / column entity ids.
                # NOTE(review): for any other non-set relation name these two
                # variables are unbound (or stale from a previous iteration).
                if relation_name == 'advisedBy':
                    ent_n_id_str = 'p_id'
                    ent_m_id_str = 'p_id_dummy'
                elif relation_name == 'taughtBy':
                    ent_n_id_str = 'course_id'
                    ent_m_id_str = 'p_id'
                data_matrix = self.binary_relation_to_matrix(
                    relation, schema_dict[relation_name],
                    data_raw[relation_name], ent_id_to_idx_dict, ent_n_id_str,
                    ent_m_id_str)
            matrix_data[relation.id] = data_matrix

        rel_out = Relation(2, [ent_person, ent_person])
        self.schema_out = DataSchema([ent_person], [rel_out])

        self.output_dim = 1
        data = Data(self.schema)
        # Copy every matrix whose id also appears in the input schema into
        # the dense Data container
        for rel_matrix in schema_matrix.relations:
            for rel in self.schema.relations:
                if rel_matrix.id == rel.id:
                    data_matrix = matrix_data[rel_matrix.id]
                    if rel_matrix.is_set:
                        # Set relations: keep only the diagonal taken over
                        # dims 1 and 2 of the dense tensor, then add a
                        # leading dimension
                        dense_data = torch.diagonal(data_matrix.to_dense(), 0,
                                                    1, 2).unsqueeze(0)
                    else:
                        dense_data = data_matrix.to_dense().unsqueeze(0)
                    data[rel.id] = dense_data
        self.data = data

        # Dense target matrix with size-1 dimensions removed
        self.target = matrix_data[self.target_rel_id].to_dense().squeeze()
        features = sp.csr_matrix(features, dtype=np.float32)
        rowsum = np.array(features.sum(1))
        r_inv = np.power(rowsum, -1).flatten()
        r_inv[np.isinf(r_inv)] = 0.
        r_mat_inv = sp.diags(r_inv)
        features = r_mat_inv.dot(features)
        return torch.Tensor(features.todense())

    ent_movie = Entity(0, raw_data['movie_feature'].shape[0])
    ent_actor = Entity(1, raw_data['movie_actor'].shape[1])
    ent_director = Entity(2, raw_data['movie_director'].shape[1])
    ent_keyword = Entity(3, raw_data['movie_keyword'].shape[1])
    entities = [ent_movie, ent_actor, ent_director, ent_keyword]

    relations = []
    rel_movie_actor = Relation(0, [ent_movie, ent_actor])
    rel_movie_director = Relation(1, [ent_movie, ent_director])
    rel_movie_keyword = Relation(2, [ent_movie, ent_keyword])
    rel_movie_feature = Relation(3, [ent_movie, ent_movie], is_set=True)
    relations = [rel_movie_actor, rel_movie_director, rel_movie_keyword, rel_movie_feature]

    schema = DataSchema(entities, relations)
    schema_out = DataSchema([ent_movie], [Relation(0, [ent_movie, ent_movie], is_set=True)])

    data = SparseMatrixData(schema)
    for rel_i, rel_name in enumerate(relation_names):
        if rel_name == 'movie_feature':
            values = preprocess_features(raw_data[rel_name])
            data[rel_i] = SparseMatrix.from_embed_diag(values)
        else:
            data[rel_i] = SparseMatrix.from_scipy_sparse(raw_data[rel_name])
    #print("X_out: ", X_out)
    expected_shape = [batch_size] + [out_dim] + relation_j.get_shape()
    print("Out shape: ", list(X_out.shape))
    print("Expected shape: ", expected_shape)
    assert(list(X_out.shape) == expected_shape)


#%%
if __name__ == '__main__':
    # Example 1
    # Ri = {n1, n2, n3}
    # Rj = {m1, m2}
    # Each Entity pairs an entity index with its number of instances
    entities = [Entity(0, 3), Entity(1, 2), Entity(2, 5)]
    relation_i = Relation(0, [entities[1], entities[1], entities[0]])
    relation_j = Relation(1, [entities[0], entities[1]])
    # 12 consecutive floats laid out as (1, 1, 2, 2, 3)
    X = torch.tensor(np.arange(12, dtype=np.float32)).view(1, 1, 2, 2, 3)

    test_layer_single_block(X, entities, relation_i, relation_j, 1, 7, 1)

    # %%
    # Example 2
    # Ri = {n1, n2}
    # Rj = {m1, m2, m3}
    entities = [Entity(0, 3), Entity(1, 2), Entity(2, 4)]
    relation_i = Relation(0, [entities[2], entities[2]])
    relation_j = Relation(1, [entities[0], entities[1], entities[1]])
    # 16 consecutive floats laid out as (1, 1, 4, 4)
    X = torch.tensor(np.arange(16, dtype=np.float32)).view(1, 1, 4, 4)

    test_layer_single_block(X, entities, relation_i, relation_j, 1, 3, 1)
    
Beispiel #12
0
def load_data_flat(prefix,
                   use_node_attrs=True,
                   use_edge_data=True,
                   node_val='one'):
    '''
    Load data into one matrix with all relations, reproducing Maron 2019
    The first [# relation types] channels are adjacency matrices,
    while the next [sum of feature dimensions per entity type] channels have
    node attributes on the relevant segment of their diagonals if use_node_attrs=True.
    If node features aren't included, then node_val is used instead.

    prefix: dataset name appended to DATA_FILE_DIR
    use_node_attrs: if True, append node-attribute channels
    use_edge_data: if False, edge values are replaced by ones
    node_val: fill for entity types without attributes: 'zero', 'rand',
        anything else gives ones

    Returns (schema, data, dl), where dl is the underlying data_loader.
    '''
    dl = data_loader(DATA_FILE_DIR + prefix)
    total_n_nodes = dl.nodes['total']
    # A single entity covering every node, with one relation over it
    entities = [Entity(0, total_n_nodes)]
    relations = {0: Relation(0, [entities[0], entities[0]])}
    schema = DataSchema(entities, relations)

    # Sparse Matrix containing all data
    data_full = sum(dl.links['data'].values()).tocoo()
    # Add the diagonal so node-attribute entries have a slot as well
    data_diag = scipy.sparse.coo_matrix(
        (np.ones(total_n_nodes),
         (np.arange(total_n_nodes), np.arange(total_n_nodes))),
        (total_n_nodes, total_n_nodes))
    data_full += data_diag
    # presumably zero_() clears the values while keeping the indices, making
    # data_full a template of the combined sparsity pattern -- TODO confirm
    data_full = SparseMatrix.from_scipy_sparse(data_full.tocoo()).zero_()
    # Output starts with 0 channels; channels are appended below
    data_out = SparseMatrix.from_other_sparse_matrix(data_full, 0)
    # Load up all edge data
    for rel_id in sorted(dl.links['data'].keys()):
        data_matrix = dl.links['data'][rel_id]
        data_rel = SparseMatrix.from_scipy_sparse(data_matrix.tocoo())
        if not use_edge_data:
            # Use only adjacency information
            data_rel.values = torch.ones(data_rel.values.shape)
        # Add the relation onto a 1-channel copy of the template so every
        # channel shares the same indices
        data_rel_full = SparseMatrix.from_other_sparse_matrix(data_full,
                                                              1) + data_rel
        data_out.values = torch.cat([data_out.values, data_rel_full.values], 1)
        data_out.n_channels += 1

    if use_node_attrs:
        for ent_id, attr_matrix in dl.nodes['attr'].items():
            # Offset and size of this entity type within the global node range
            start_i = dl.nodes['shift'][ent_id]
            n_instances = dl.nodes['count'][ent_id]
            if attr_matrix is None:
                # No attributes available: use a single placeholder channel
                if node_val == 'zero':
                    attr_matrix = np.zeros((n_instances, 1))
                elif node_val == 'rand':
                    attr_matrix = np.random.randn(n_instances, 1)
                else:
                    attr_matrix = np.ones((n_instances, 1))
            n_channels = attr_matrix.shape[1]
            # Diagonal positions (i, i) for this entity type's nodes
            indices = torch.arange(start_i,
                                   start_i + n_instances).unsqueeze(0).repeat(
                                       2, 1)
            data_rel = SparseMatrix(
                indices=indices,
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([total_n_nodes, total_n_nodes, n_channels]),
                is_set=True)
            data_rel_full = SparseMatrix.from_other_sparse_matrix(
                data_full, n_channels) + data_rel
            data_out.values = torch.cat(
                [data_out.values, data_rel_full.values], 1)
            data_out.n_channels += n_channels

    data = SparseMatrixData(schema)
    data[0] = data_out

    return schema,\
           data, \
           dl
Beispiel #13
0
def load_data():
    '''
    Load the paper / cites / content CSV files into dense relation tensors.

    Each file is read from csv_file_str.format(<name>):
      paper:   rows of (paper name, class name)
      cites:   rows of (citing paper name, cited paper name)
      content: rows of (paper name, word name), words formatted like "word1328"

    Returns (data, schema, class_targets):
      data: Data holding one 0/1 tensor of shape (1, 1, n, m) per relation
      schema: DataSchema over papers, classes and words
      class_targets: LongTensor with the class index of every paper
    '''
    paper_names = []
    classes = []
    word_names = ['word'+str(i+1) for i in range(1433)]

    with open(csv_file_str.format('paper')) as paperfile:
        reader = csv.reader(paperfile)
        for paper_name, class_name in reader:
            paper_names.append(paper_name)
            classes.append(class_name)

    class_names = list(np.unique(classes))
    class_name_to_idx = {class_name : i for i, class_name in enumerate(class_names)}
    paper_name_to_idx = {paper_name: i for i, paper_name in enumerate(paper_names)}
    # Row 0: paper indices, row 1: class indices
    paper = np.array([[paper_name_to_idx[paper_name] for paper_name in paper_names],
                      [class_name_to_idx[class_name] for class_name in classes]])

    cites = []
    with open(csv_file_str.format('cites')) as citesfile:
        reader = csv.reader(citesfile)
        for citer, citee in reader:
            cites.append([paper_name_to_idx[citer], paper_name_to_idx[citee]])
    # Transposed so that row 0 holds citers and row 1 holds citees
    cites = np.array(cites).T

    content = []
    def word_to_idx(word):
        '''
        words all formatted like: "word1328"
        '''
        return int(word[4:]) - 1

    with open(csv_file_str.format('content')) as contentfile:
        reader = csv.reader(contentfile)
        for paper_name, word_name in reader:
            content.append([paper_name_to_idx[paper_name],
                            word_to_idx(word_name)])
    content = np.array(content).T

    n_papers = len(paper_names)
    n_classes = len(class_names)
    n_words = len(word_names)
    ent_papers = Entity(0, n_papers)
    ent_classes = Entity(1, n_classes)
    ent_words = Entity(2, n_words)
    entities = [ent_papers, ent_classes, ent_words]
    rel_paper = Relation(0, [ent_papers, ent_classes])
    rel_cites = Relation(1, [ent_papers, ent_papers])
    rel_content = Relation(2, [ent_papers, ent_words])
    relations = [rel_paper, rel_cites, rel_content]
    schema = DataSchema(entities, relations)

    class_targets = torch.LongTensor(paper[1])

    def indicator_matrix(n_rows, n_cols, index_pairs):
        '''
        Dense n_rows x n_cols matrix with ones at the (row, col) pairs given
        as the two rows of index_pairs.
        '''
        matrix = torch.zeros(n_rows, n_cols)
        # Index rows and columns explicitly. Passing the (2, N) array as a
        # single index relies on legacy sequence-as-tuple handling and may
        # otherwise be read as row indices only. reshape(2, -1) also turns an
        # empty edge list into two empty index vectors.
        rows, cols = torch.as_tensor(index_pairs,
                                     dtype=torch.long).reshape(2, -1)
        matrix[rows, cols] = 1
        return matrix

    paper_matrix = indicator_matrix(n_papers, n_classes, paper)
    cites_matrix = indicator_matrix(n_papers, n_papers, cites)
    content_matrix = indicator_matrix(n_papers, n_words, content)

    # Two leading singleton dimensions are added to every relation tensor
    data = Data(schema)
    data[0] = paper_matrix.unsqueeze(0).unsqueeze(0)
    data[1] = cites_matrix.unsqueeze(0).unsqueeze(0)
    data[2] = content_matrix.unsqueeze(0).unsqueeze(0)
    return data, schema, class_targets
Beispiel #14
0
def load_data(prefix,
              use_node_attrs=True,
              use_edge_data=True,
              use_other_edges=True,
              node_val='one'):
    '''
    Build a schema and sparse data for the dataset at DATA_FILE_DIR + prefix.

    prefix: dataset name appended to DATA_FILE_DIR
    use_node_attrs: if True, add one set relation per entity holding its
        node attributes
    use_edge_data: if False, edge values are replaced by ones
    use_other_edges: if False, keep only the relations in dl.test_types and
        the entities of the first of them
    node_val: fill for entities without attributes: 'zero', 'rand',
        anything else gives ones

    Returns (schema, data, dl), where dl is the underlying data_loader.
    '''
    dl = data_loader(DATA_FILE_DIR + prefix)

    all_entities = []
    for entity_id, n_instances in sorted(dl.nodes['count'].items()):
        all_entities.append(Entity(entity_id, n_instances))

    def make_relation(rel_id, endpoints):
        # endpoints are positions into all_entities
        entity_i, entity_j = endpoints
        return Relation(rel_id,
                        [all_entities[entity_i], all_entities[entity_j]])

    def default_attributes(n_rows):
        # Single-channel placeholder attributes selected by node_val
        if node_val == 'zero':
            return np.zeros((n_rows, 1))
        if node_val == 'rand':
            return np.random.randn(n_rows, 1)
        return np.ones((n_rows, 1))

    relations = {}
    test_types = dl.test_types
    if use_other_edges:
        for rel_id, endpoints in sorted(dl.links['meta'].items()):
            relations[rel_id] = make_relation(rel_id, endpoints)
        entities = all_entities
    else:
        for rel_id in test_types:
            relations[rel_id] = make_relation(rel_id, dl.links['meta'][rel_id])
        entities = list(np.unique(relations[test_types[0]].entities))

    # Attribute relations are numbered after the largest edge relation id
    max_relation = max(relations) + 1
    if use_node_attrs:
        # Create fake relations to represent node attributes
        for entity in entities:
            attr_rel_id = max_relation + entity.id
            relations[attr_rel_id] = Relation(attr_rel_id, [entity, entity],
                                              is_set=True)
    schema = DataSchema(entities, relations)

    data = SparseMatrixData(schema)
    for rel_id, data_matrix in dl.links['data'].items():
        if not (use_other_edges or rel_id in test_types):
            continue
        # Get subset belonging to entities in relation
        relation = relations[rel_id]
        row_entity_id = relation.entities[0].id
        row_start = dl.nodes['shift'][row_entity_id]
        row_end = row_start + dl.nodes['count'][row_entity_id]
        col_entity_id = relation.entities[1].id
        col_start = dl.nodes['shift'][col_entity_id]
        col_end = col_start + dl.nodes['count'][col_entity_id]
        block = data_matrix[row_start:row_end, col_start:col_end]
        data[rel_id] = SparseMatrix.from_scipy_sparse(block.tocoo())
        if not use_edge_data:
            # Use only adjacency information
            data[rel_id].values = torch.ones(data[rel_id].values.shape)

    if use_node_attrs:
        for ent in entities:
            ent_id = ent.id
            attr_matrix = dl.nodes['attr'][ent_id]
            n_instances = dl.nodes['count'][ent_id]
            if attr_matrix is None:
                attr_matrix = default_attributes(n_instances)
            n_channels = attr_matrix.shape[1]
            # Attributes sit on the diagonal (i, i) of a set relation
            diagonal = torch.arange(n_instances).unsqueeze(0).repeat(2, 1)
            data[ent_id + max_relation] = SparseMatrix(
                indices=diagonal,
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([n_instances, n_instances, n_channels]),
                is_set=True)

    return schema, data, dl
Beispiel #15
0
def load_data(prefix='DBLP',
              use_node_attrs=True,
              use_edge_data=True,
              feats_type=0):
    '''
    Load a node-classification dataset from DATA_FILE_DIR + prefix.

    prefix: dataset name appended to DATA_FILE_DIR; for any prefix other
        than 'IMDB' the label rows are reduced with argmax
    use_node_attrs: if True, add one set relation per entity holding its
        node attributes
    use_edge_data: if False, edge values are replaced by ones
    feats_type: not used in this function

    Returns (schema, schema_out, data, data_target, labels,
    train_val_test_idx, dl). 20% of the shuffled training indices are held
    out as validation indices.
    '''
    dl = data_loader(DATA_FILE_DIR + prefix)

    # Create Schema
    entities = []
    for entity_id, n_instances in sorted(dl.nodes['count'].items()):
        entities.append(Entity(entity_id, n_instances))
    relations = {}
    for rel_id, (entity_i, entity_j) in sorted(dl.links['meta'].items()):
        relations[rel_id] = Relation(rel_id,
                                     [entities[entity_i], entities[entity_j]])
    num_relations = len(relations)
    if use_node_attrs:
        # Create fake relations to represent node attributes
        for entity in entities:
            attr_rel_id = num_relations + entity.id
            relations[attr_rel_id] = Relation(attr_rel_id, [entity, entity],
                                              is_set=True)
    schema = DataSchema(entities, relations)

    # Collect data
    data = SparseMatrixData(schema)
    for rel_id, data_matrix in dl.links['data'].items():
        # Get subset belonging to entities in relation
        row_entity_id = relations[rel_id].entities[0].id
        row_start = dl.nodes['shift'][row_entity_id]
        row_end = row_start + dl.nodes['count'][row_entity_id]
        col_entity_id = relations[rel_id].entities[1].id
        col_start = dl.nodes['shift'][col_entity_id]
        col_end = col_start + dl.nodes['count'][col_entity_id]
        block = data_matrix[row_start:row_end, col_start:col_end]
        data[rel_id] = SparseMatrix.from_scipy_sparse(block.tocoo())
        if not use_edge_data:
            # Use only adjacency information
            data[rel_id].values = torch.ones(data[rel_id].values.shape)

    target_entity = 0

    if use_node_attrs:
        for ent_id, attr_matrix in dl.nodes['attr'].items():
            if attr_matrix is None:
                # Attribute for each node is a single 1
                attr_matrix = np.ones(dl.nodes['count'][ent_id])[:, None]
            n_channels = attr_matrix.shape[1]
            n_instances = dl.nodes['count'][ent_id]
            # Attributes sit on the diagonal (i, i) of a set relation
            diagonal = torch.arange(n_instances).unsqueeze(0).repeat(2, 1)
            data[ent_id + num_relations] = SparseMatrix(
                indices=diagonal,
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([n_instances, n_instances, n_channels]),
                is_set=True)

    n_outputs = dl.nodes['count'][target_entity]
    n_output_classes = dl.labels_train['num_classes']
    # Output schema: one set relation over the target entity
    out_entity = entities[target_entity]
    schema_out = DataSchema(
        [out_entity], [Relation(0, [out_entity, out_entity], is_set=True)])
    data_target = SparseMatrixData(schema_out)
    # Zero-valued diagonal placeholder with one channel per class
    data_target[0] = SparseMatrix(
        indices=torch.arange(n_outputs, dtype=torch.int64).repeat(2, 1),
        values=torch.zeros([n_outputs, n_output_classes]),
        shape=(n_outputs, n_outputs, n_output_classes),
        is_set=True)

    labels = np.zeros((n_outputs, n_output_classes), dtype=int)
    val_ratio = 0.2
    # Shuffle the labelled training nodes, then split off the first
    # val_ratio share as validation nodes
    shuffled = np.nonzero(dl.labels_train['mask'])[0]
    np.random.shuffle(shuffled)
    split = int(shuffled.shape[0] * val_ratio)
    val_idx = np.sort(shuffled[:split])
    train_idx = np.sort(shuffled[split:])
    test_idx = np.nonzero(dl.labels_test['mask'])[0]
    labels[train_idx] = dl.labels_train['data'][train_idx]
    labels[val_idx] = dl.labels_train['data'][val_idx]
    if prefix != 'IMDB':
        labels = labels.argmax(axis=1)
    train_val_test_idx = {
        'train_idx': train_idx,
        'val_idx': val_idx,
        'test_idx': test_idx,
    }
    return (schema, schema_out, data, data_target, labels,
            train_val_test_idx, dl)
Beispiel #16
0
    def __init__(self):
        """Load the person/course CSV relations as sparse matrices.

        Every relation named in ``relation_names`` is read from
        ``csv_file_str.format(name)`` into column lists keyed by the keys of
        ``schema_dict[name]``, then converted to a matrix and stored in
        ``self.data``. Targets come from column ``self.TARGET_KEY`` of
        relation ``self.TARGET_RELATION``.

        Sets ``schema``, ``data``, ``target``, ``target_rel_id``,
        ``schema_out``, ``data_target`` and ``output_dim``.
        """
        # One empty column list per (relation, key) pair
        data_raw = {
            rel_name: {key: list()
                       for key in schema_dict[rel_name].keys()}
            for rel_name in schema_dict.keys()
        }

        # Fill the column lists row by row; columns are matched to keys by
        # position
        for relation_name in relation_names:
            with open(csv_file_str.format(relation_name)) as file:
                reader = csv.reader(file)
                keys = schema_dict[relation_name].keys()
                for cols in reader:
                    for key, col in zip(keys, cols):
                        data_raw[relation_name][key].append(col)

        # Entity sizes are the number of rows read for each entity table
        ent_person = Entity(0, len(data_raw['person']['p_id']))
        ent_course = Entity(1, len(data_raw['course']['course_id']))
        entities = [ent_person, ent_course]

        # Entity tables are set relations (entity x entity); the other two
        # are ordinary binary relations
        rel_person = Relation(0, [ent_person, ent_person], is_set=True)
        rel_course = Relation(1, [ent_course, ent_course], is_set=True)
        rel_advisedBy = Relation(2, [ent_person, ent_person])
        rel_taughtBy = Relation(3, [ent_course, ent_person])
        relations = [rel_person, rel_course, rel_advisedBy, rel_taughtBy]

        self.schema = DataSchema(entities, relations)
        self.data = SparseMatrixData(self.schema)

        ent_id_to_idx_dict = {
            'person': self.id_to_idx(data_raw['person']['p_id']),
            'course': self.id_to_idx(data_raw['course']['course_id'])
        }

        for relation in relations:
            # relation ids double as positions in relation_names
            relation_name = relation_names[relation.id]
            print(relation_name)
            if relation.is_set:
                data_matrix = self.set_relation_to_matrix(
                    relation, schema_dict[relation_name],
                    data_raw[relation_name])
            else:
                # Column names holding the row / column entity ids.
                # NOTE(review): for any other non-set relation name these two
                # variables are unbound (or stale from a previous iteration).
                if relation_name == 'advisedBy':
                    ent_n_id_str = 'p_id'
                    ent_m_id_str = 'p_id_dummy'
                elif relation_name == 'taughtBy':
                    ent_n_id_str = 'course_id'
                    ent_m_id_str = 'p_id'
                data_matrix = self.binary_relation_to_matrix(
                    relation, schema_dict[relation_name],
                    data_raw[relation_name], ent_id_to_idx_dict, ent_n_id_str,
                    ent_m_id_str)
            self.data[relation.id] = data_matrix

        self.target = self.get_targets(
            data_raw[self.TARGET_RELATION][self.TARGET_KEY],
            schema_dict[self.TARGET_RELATION][self.TARGET_KEY])
        self.target_rel_id = 0
        # Output schema: a single set relation over persons
        rel_out = Relation(self.target_rel_id, [ent_person, ent_person],
                           is_set=True)
        self.schema_out = DataSchema([ent_person], [rel_out])
        self.data_target = Data(self.schema_out)
        # Number of distinct raw values in the target column
        n_output_classes = len(
            np.unique(data_raw[self.TARGET_RELATION][self.TARGET_KEY]))
        self.output_dim = n_output_classes
        n_person = ent_person.n_instances
        # Zero-valued diagonal placeholder with one channel per class.
        # NOTE(review): rel_out is declared with is_set=True but is_set is
        # not passed to SparseMatrix here -- confirm this is intended.
        self.data_target[self.target_rel_id] = SparseMatrix(
            indices=torch.arange(n_person, dtype=torch.int64).repeat(2, 1),
            values=torch.zeros([n_person, n_output_classes]),
            shape=(n_person, n_person, n_output_classes))
    def __init__(self, use_node_attrs=True):
        """Load nodes, links and (optionally) node attributes as sparse data.

        Reads NODE_FILE_STR (tab-separated: node id, name, type,
        comma-separated values) and LINK_FILE_STR (tab-separated: node i,
        node j, relation number, value).

        Args:
            use_node_attrs: if True, add one set relation per entity type,
                with id ``10 + entity id``, holding the node values.

        Sets ``schema``, ``node_id_to_idx``, ``target_node_idx_to_id`` and
        ``data``.
        """
        # Attribute relations are numbered starting from this offset
        attr_rel_offset = 10

        entities = [
            Entity(entity_id, n_instances)
            for entity_id, n_instances in ENTITY_N_INSTANCES.items()
        ]
        relations = [
            Relation(rel_id, [entities[entity_i], entities[entity_j]])
            for rel_id, (entity_i, entity_j) in RELATION_IDX.items()
        ]
        if use_node_attrs:
            for entity_id in ENTITY_N_INSTANCES:
                relations.append(
                    Relation(attr_rel_offset + entity_id,
                             [entities[entity_id], entities[entity_id]],
                             is_set=True))
        self.schema = DataSchema(entities, relations)

        # Map every raw node id to a consecutive per-type index, assigned in
        # file order
        self.node_id_to_idx = {ent_i: {} for ent_i in range(len(entities))}
        node_counter = {ent_i: 0 for ent_i in range(len(entities))}
        with open(NODE_FILE_STR, 'r') as node_file:
            for line in node_file:
                node_id, _node_name, node_type, _values = line.rstrip().split(
                    '\t')
                node_id = int(node_id)
                node_type = int(node_type)
                self.node_id_to_idx[node_type][node_id] = node_counter[
                    node_type]
                node_counter[node_type] += 1
        target_node_id_to_idx = self.node_id_to_idx[TARGET_NODE_TYPE]
        # Inverse mapping for the target node type (avoids shadowing `id`)
        self.target_node_idx_to_id = {
            node_idx: raw_id
            for raw_id, node_idx in target_node_id_to_idx.items()
        }

        # Buckets are keyed by relation id, which is how they are filled and
        # read below, rather than by position in `relations`
        raw_data_indices = {rel.id: [] for rel in relations}
        raw_data_values = {rel.id: [] for rel in relations}
        if use_node_attrs:
            with open(NODE_FILE_STR, 'r') as node_file:
                for line in node_file:
                    node_id, _node_name, node_type, values = line.rstrip(
                    ).split('\t')
                    node_type = int(node_type)
                    node_idx = self.node_id_to_idx[node_type][int(node_id)]
                    attr_rel_id = attr_rel_offset + node_type
                    # Attributes sit on the diagonal (i, i) of a set relation
                    raw_data_indices[attr_rel_id].append([node_idx, node_idx])
                    raw_data_values[attr_rel_id].append(
                        list(map(float, values.split(','))))

        with open(LINK_FILE_STR, 'r') as link_file:
            for line in link_file:
                node_i, node_j, rel_num, val = line.rstrip().split('\t')
                rel_num = int(rel_num)
                node_i_type, node_j_type = RELATION_IDX[rel_num]
                node_i = self.node_id_to_idx[node_i_type][int(node_i)]
                node_j = self.node_id_to_idx[node_j_type][int(node_j)]
                raw_data_indices[rel_num].append([node_i, node_j])
                raw_data_values[rel_num].append([float(val)])

        self.data = SparseMatrixData(self.schema)
        for rel in relations:
            # pop() releases each raw list as soon as it has been converted.
            # NOTE(review): a relation with no entries yields a 1-D empty
            # tensor, so values.shape[1] below would raise -- confirm every
            # relation has data.
            indices = torch.LongTensor(raw_data_indices.pop(rel.id)).T
            values = torch.Tensor(raw_data_values.pop(rel.id))
            n = rel.entities[0].n_instances
            m = rel.entities[1].n_instances
            n_channels = values.shape[1]
            self.data[rel.id] = SparseMatrix(indices=indices,
                                             values=values,
                                             shape=np.array([n, m, n_channels]),
                                             is_set=rel.is_set)
Beispiel #18
0
    n_content = 2 * int(0.2 * n_papers * n_words)

    content = np.stack([
        np.random.randint(0, n_papers, (n_content)),
        np.random.randint(0, n_words, (n_content))
    ])
    content = np.unique(cites, axis=1)
    content_matrix = SparseMatrix(indices=torch.LongTensor(content),
                                  values=value_dist.sample(
                                      (content.shape[1], 1)),
                                  shape=(n_papers, n_words, 1)).coalesce()

    ent_papers = Entity(0, n_papers)
    #ent_classes = Entity(1, n_classes)
    ent_words = Entity(1, n_words)
    rel_paper = Relation(0, [ent_papers, ent_papers], is_set=True)
    rel_cites = Relation(0, [ent_papers, ent_papers])
    rel_content = Relation(1, [ent_papers, ent_words])
    schema = DataSchema([ent_papers, ent_words], [rel_cites, rel_content])
    schema_out = DataSchema([ent_papers], [rel_paper])
    targets = torch.LongTensor(paper[1])

    data = SparseMatrixData(schema)
    data[0] = cites_matrix
    data[1] = content_matrix

    indices_identity, indices_transpose = data.calculate_indices()

    data_target = Data(schema_out)
    data_target[0] = SparseMatrix(indices=torch.arange(
        n_papers, dtype=torch.int64).repeat(2, 1),
Beispiel #19
0
def load_data():
    '''
    Load the paper / cites / content CSV files into sparse relational data.

    Reads the files csv_file_str.format('paper') (rows: paper name, class
    name), csv_file_str.format('cites') (rows: citing paper, cited paper) and
    csv_file_str.format('content') (rows: paper name, word name such as
    "word1328").

    Returns a tuple (data, schema, class_targets):
        data: SparseTensorData over schema, where
            data[0] holds one entry for every (paper, class) pair, set to 1
                    for the paper's own class and 0 elsewhere,
            data[1] holds the citation pairs (value 1) together with an equal
                    number of uniformly random pairs (value 0),
            data[2] holds the paper-word pairs (value 1) together with an
                    equal number of uniformly random pairs (value 0).
        schema: DataSchema with entities papers (0), classes (1), words (2)
                and relations paper-class (0), cites (1), content (2).
        class_targets: LongTensor with the class index of each paper, in the
                       order the papers appear in the paper file.
    '''
    paper_names = []
    classes = []
    # The vocabulary size is fixed; words are named word1 ... word1433
    word_names = ['word' + str(i + 1) for i in range(1433)]

    # newline='' lets the csv module handle line endings itself
    with open(csv_file_str.format('paper'), newline='') as paperfile:
        reader = csv.reader(paperfile)
        for paper_name, class_name in reader:
            paper_names.append(paper_name)
            classes.append(class_name)

    class_names = list(np.unique(classes))
    class_name_to_idx = {
        class_name: i
        for i, class_name in enumerate(class_names)
    }
    paper_name_to_idx = {
        paper_name: i
        for i, paper_name in enumerate(paper_names)
    }
    # Row 0: paper indices, row 1: class index of each paper
    paper = np.array(
        [[paper_name_to_idx[paper_name] for paper_name in paper_names],
         [class_name_to_idx[class_name] for class_name in classes]])

    cites = []
    with open(csv_file_str.format('cites'), newline='') as citesfile:
        reader = csv.reader(citesfile)
        for citer, citee in reader:
            cites.append([paper_name_to_idx[citer], paper_name_to_idx[citee]])
    cites = np.array(cites).T

    content = []

    def word_to_idx(word):
        '''
        words all formatted like: "word1328"
        '''
        return int(word[4:]) - 1

    with open(csv_file_str.format('content'), newline='') as contentfile:
        reader = csv.reader(contentfile)
        for paper_name, word_name in reader:
            content.append(
                [paper_name_to_idx[paper_name],
                 word_to_idx(word_name)])
    content = np.array(content).T

    n_papers = len(paper_names)
    n_classes = len(class_names)
    n_words = len(word_names)
    ent_papers = Entity(0, n_papers)
    ent_classes = Entity(1, n_classes)
    ent_words = Entity(2, n_words)
    entities = [ent_papers, ent_classes, ent_words]
    rel_paper = Relation(0, [ent_papers, ent_classes])
    rel_cites = Relation(1, [ent_papers, ent_papers])
    rel_content = Relation(2, [ent_papers, ent_words])
    relations = [rel_paper, rel_cites, rel_content]
    schema = DataSchema(entities, relations)

    # For each paper, get a random negative sample. The offset lies in
    # [1, n_classes), so the negative class always differs from the true one.
    # NOTE(review): paper_matrix is never stored in `data`; it is kept so the
    # numpy random stream consumed before the cites/content negatives below
    # stays the same.
    random_class_offset = np.random.randint(1, n_classes, (n_papers, ))
    paper_neg = np.stack(
        (paper[0], (paper[1] + random_class_offset) % n_classes))
    paper_matrix = SparseTensor(
        indices=torch.LongTensor(np.concatenate((paper, paper_neg), axis=1)),
        values=torch.cat((torch.ones(
            1, paper.shape[1]), torch.zeros(1, paper_neg.shape[1])), 1),
        shape=np.array([n_papers, n_classes])).coalesce()

    class_targets = torch.LongTensor(paper[1])

    # Randomly fill in values and coalesce to remove duplicates
    cites_neg = np.random.randint(0, n_papers, cites.shape)
    cites_matrix = SparseTensor(
        indices=torch.LongTensor(np.concatenate((cites, cites_neg), axis=1)),
        values=torch.cat((torch.ones(
            1, cites.shape[1]), torch.zeros(1, cites_neg.shape[1])), 1),
        shape=np.array([n_papers, n_papers])).coalesce()

    # For each paper, randomly fill in values and coalesce to remove duplicates
    content_neg = np.stack(
        (np.random.randint(0, n_papers, (content.shape[1], )),
         np.random.randint(0, n_words, (content.shape[1], ))))
    content_matrix = SparseTensor(
        indices=torch.LongTensor(np.concatenate((content, content_neg),
                                                axis=1)),
        values=torch.cat((torch.ones(
            1, content.shape[1]), torch.zeros(1, content_neg.shape[1])), 1),
        shape=np.array([n_papers, n_words])).coalesce()

    # Every (paper, class) pair in row-major order: the paper index varies
    # slowest, so the entry for (paper_i, class_i) sits at position
    # paper_i * n_classes + class_i, which is the position written below.
    paper_dense_indices = np.array([
        np.repeat(range(n_papers), n_classes),
        np.tile(range(n_classes), n_papers)
    ])
    paper_dense_values = torch.zeros(paper_dense_indices.shape[1])
    for paper_i, class_name in enumerate(classes):
        class_i = class_name_to_idx[class_name]
        paper_dense_values[paper_i * n_classes + class_i] = 1
    paper_dense_matrix = SparseTensor(
        indices=torch.LongTensor(paper_dense_indices),
        values=torch.Tensor(paper_dense_values).unsqueeze(0),
        shape=np.array([n_papers, n_classes]))

    data = SparseTensorData(schema)
    data[0] = paper_dense_matrix
    data[1] = cites_matrix
    data[2] = content_matrix
    return data, schema, class_targets
Beispiel #20
0
def load_data_flat(prefix,
                   use_node_attrs=True,
                   use_edge_data=True,
                   node_val='zero',
                   feats_type=0):
    '''
    Load a dataset as a single node-by-node sparse matrix holding every
    relation, reproducing Maron 2019.

    All channels share one sparsity pattern: the union of every link matrix
    plus the full diagonal. The channels are, in order:
      * one channel per relation type (in sorted relation id order), holding
        the link values, or ones if use_edge_data=False;
      * if use_node_attrs=True, the attribute channels of each entity type,
        placed on that type's segment of the diagonal. An entity type whose
        attribute matrix is None gets a single channel instead, filled
        according to node_val: 'zero' -> zeros, 'rand' -> standard normal
        samples, anything else -> ones. If feats_type == 1, every entity
        type other than the target (type 0) gets 10 zero channels instead.

    Returns the tuple
        (schema, schema_out, data, data_target, labels,
         train_val_test_idx, dl)
    where data_target is a zero-valued diagonal matrix with one channel per
    output class, labels is filled in for training and validation nodes only
    (reduced to class indices with argmax unless prefix is 'IMDB'), and
    train_val_test_idx maps 'train_idx', 'val_idx' and 'test_idx' to index
    arrays. A random 20% of the labelled training nodes is held out as the
    validation set.
    '''
    dl = data_loader(DATA_FILE_DIR + prefix)
    n_nodes = dl.nodes['total']
    entities = [Entity(0, n_nodes)]
    relations = {0: Relation(0, [entities[0], entities[0]])}
    schema = DataSchema(entities, relations)

    links = dl.links['data']

    # Shared sparsity pattern: all links plus the diagonal, values zeroed
    pattern = sum(links.values()).tocoo()
    pattern += scipy.sparse.coo_matrix(
        (np.ones(n_nodes), (np.arange(n_nodes), np.arange(n_nodes))),
        (n_nodes, n_nodes))
    pattern = SparseMatrix.from_scipy_sparse(pattern.tocoo()).zero_()

    # Output matrix starts with zero channels and grows as blocks are added
    data_out = SparseMatrix.from_other_sparse_matrix(pattern, 0)

    def append_channels(block, width):
        # Spread block over the shared pattern, then concatenate its
        # channels onto the output matrix
        block_full = SparseMatrix.from_other_sparse_matrix(pattern,
                                                           width) + block
        data_out.values = torch.cat([data_out.values, block_full.values], 1)
        data_out.n_channels += width

    # One channel per relation type
    for rel_id in sorted(links):
        rel_block = SparseMatrix.from_scipy_sparse(links[rel_id].tocoo())
        if not use_edge_data:
            # Keep only the adjacency structure
            rel_block.values = torch.ones(rel_block.values.shape)
        append_channels(rel_block, 1)

    target_entity = 0

    if use_node_attrs:
        for ent_id, attr_matrix in dl.nodes['attr'].items():
            first = dl.nodes['shift'][ent_id]
            count = dl.nodes['count'][ent_id]
            if attr_matrix is None:
                # No attributes for this entity type: use a placeholder
                if node_val == 'zero':
                    attr_matrix = np.zeros((count, 1))
                elif node_val == 'rand':
                    attr_matrix = np.random.randn(count, 1)
                else:
                    attr_matrix = np.ones((count, 1))
            if feats_type == 1 and ent_id != target_entity:
                # To keep same behaviour as non-LGNN model, use 10 dimensions
                attr_matrix = np.zeros((count, 10))
            width = attr_matrix.shape[1]
            # Diagonal positions of this entity type's nodes
            diagonal = torch.arange(first, first + count).repeat(2, 1)
            attr_block = SparseMatrix(
                indices=diagonal,
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([n_nodes, n_nodes, width]),
                is_set=True)
            append_channels(attr_block, width)

    data = SparseMatrixData(schema)
    data[0] = data_out

    # Target: one zero-valued diagonal entry per node, one channel per class
    n_outputs = n_nodes
    n_output_classes = dl.labels_train['num_classes']
    target_ent = entities[target_entity]
    schema_out = DataSchema(
        [target_ent], [Relation(0, [target_ent, target_ent], is_set=True)])
    data_target = SparseMatrixData(schema_out)
    data_target[0] = SparseMatrix(
        indices=torch.arange(n_outputs, dtype=torch.int64).repeat(2, 1),
        values=torch.zeros([n_outputs, n_output_classes]),
        shape=(n_outputs, n_outputs, n_output_classes),
        is_set=True)

    labels = np.zeros((dl.nodes['count'][0], n_output_classes), dtype=int)

    # Hold out a random fraction of the labelled training nodes
    val_ratio = 0.2
    labelled = np.nonzero(dl.labels_train['mask'])[0]
    np.random.shuffle(labelled)
    n_val = int(labelled.shape[0] * val_ratio)
    val_idx = np.sort(labelled[:n_val])
    train_idx = np.sort(labelled[n_val:])
    test_idx = np.nonzero(dl.labels_test['mask'])[0]

    for known_idx in (train_idx, val_idx):
        labels[known_idx] = dl.labels_train['data'][known_idx]
    if prefix != 'IMDB':
        labels = labels.argmax(axis=1)

    train_val_test_idx = {
        'train_idx': train_idx,
        'val_idx': val_idx,
        'test_idx': test_idx,
    }

    return (schema, schema_out, data, data_target, labels,
            train_val_test_idx, dl)