Beispiel #1
0
    def process(self):
        """Build the processed dataset file from the raw directory.

        Loader options are taken from ``self.meta_info``. The graph is read
        with ``read_heterograph_pyg`` when ``self.is_hetero`` is true and with
        ``read_graph_pyg`` otherwise; only the first element of the reader's
        result is used. ``self.pre_transform`` is applied when it is set, and
        ``self.collate([data])`` is written to ``self.processed_paths[0]``
        with ``torch.save``.
        """
        meta = self.meta_info

        # Flags and "no value" markers are compared against the literal
        # strings 'True' and 'None'.
        add_inverse_edge = meta['add_inverse_edge'] == 'True'

        node_files_entry = meta['additional node files']
        extra_node_files = ([] if node_files_entry == 'None'
                            else node_files_entry.split(','))

        edge_files_entry = meta['additional edge files']
        extra_edge_files = ([] if edge_files_entry == 'None'
                            else edge_files_entry.split(','))

        # Both readers are called with the same arguments, so pick the
        # function first and issue a single call.
        reader = read_heterograph_pyg if self.is_hetero else read_graph_pyg
        data = reader(self.raw_dir,
                      add_inverse_edge=add_inverse_edge,
                      additional_node_files=extra_node_files,
                      additional_edge_files=extra_edge_files,
                      binary=self.binary)[0]

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        print('Saving...')
        collated = self.collate([data])
        torch.save(collated, self.processed_paths[0])
Beispiel #2
0
    def process(self):
        """Read all raw graphs, attach graph-level labels and save them.

        Loader options are taken from ``self.meta_info`` and every graph is
        read with ``read_graph_pyg``. For the ``'subtoken prediction'`` task
        each graph's ``y`` is a list of strings obtained by splitting the
        first column of ``graph-label.csv.gz`` on spaces. For every other
        task ``y`` is a tensor viewed as ``(1, -1)``: ``torch.long`` for
        classification tasks whose labels contain no NaN, ``torch.float32``
        otherwise. ``self.pre_transform`` is applied per graph when set, and
        the collated ``(data, slices)`` pair is written to
        ``self.processed_paths[0]`` with ``torch.save``.
        """
        def _meta_file_list(key):
            # The literal string 'None' marks "no additional files".
            entry = self.meta_info[key]
            return [] if entry == 'None' else entry.split(',')

        ### read pyg graph list
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'
        extra_node_files = _meta_file_list('additional node files')
        extra_edge_files = _meta_file_list('additional edge files')

        data_list = read_graph_pyg(
            self.raw_dir,
            add_inverse_edge=add_inverse_edge,
            additional_node_files=extra_node_files,
            additional_edge_files=extra_edge_files,
            binary=self.binary)

        if self.task_type == 'subtoken prediction':
            label_rows = pd.read_csv(
                osp.join(self.raw_dir, 'graph-label.csv.gz'),
                compression='gzip',
                header=None).values
            # Column 0 of each row is split on single spaces into tokens.
            graph_label = [str(row[0]).split(' ') for row in label_rows]

            for idx, graph in enumerate(data_list):
                graph.y = graph_label[idx]

        else:
            if self.binary:
                label_archive = np.load(
                    osp.join(self.raw_dir, 'graph-label.npz'))
                graph_label = label_archive['graph_label']
            else:
                graph_label = pd.read_csv(
                    osp.join(self.raw_dir, 'graph-label.csv.gz'),
                    compression='gzip',
                    header=None).values

            has_nan = np.isnan(graph_label).any()

            # The dtype is the same for every graph, so decide it once:
            # integer labels only for NaN-free classification targets.
            if 'classification' in self.task_type and not has_nan:
                label_dtype = torch.long
            else:
                label_dtype = torch.float32

            for idx, graph in enumerate(data_list):
                row = torch.from_numpy(graph_label[idx])
                graph.y = row.view(1, -1).to(label_dtype)

        if self.pre_transform is not None:
            data_list = list(map(self.pre_transform, data_list))

        collated_data, slice_index = self.collate(data_list)

        print('Saving...')
        torch.save((collated_data, slice_index), self.processed_paths[0])
Beispiel #3
0
    def process(self):
        """Read the raw graph, attach node labels and save the processed file.

        Loader options are taken from ``self.meta_info``. Depending on
        ``self.is_hetero`` the graph is read with ``read_heterograph_pyg`` or
        ``read_graph_pyg``; only the first element of the reader's result is
        used. Labels come from ``node-label.npz`` when ``self.binary`` is
        true; otherwise from ``read_node_label_hetero`` (heterogeneous) or
        ``node-label.csv.gz`` (homogeneous). They are stored in
        ``data.y_dict`` (one tensor per key of the label dict) or ``data.y``.

        Label dtype is ``torch.long`` for classification tasks whose labels
        contain no NaN, and ``torch.float32`` in every other case.

        ``self.pre_transform`` is applied when set, and ``self.collate([data])``
        is written to ``self.processed_paths[0]`` with ``torch.save``.
        """
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        # The literal string 'None' marks "no additional files".
        if self.meta_info['additional node files'] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info[
                'additional node files'].split(',')

        if self.meta_info['additional edge files'] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info[
                'additional edge files'].split(',')

        is_classification = 'classification' in self.task_type

        def _as_label_tensor(node_label):
            # NaN cannot be stored in an integer tensor, so classification
            # labels containing NaN stay float32. The NaN scan is skipped
            # entirely for non-classification tasks.
            if is_classification and not np.isnan(node_label).any():
                return torch.from_numpy(node_label).to(torch.long)
            return torch.from_numpy(node_label).to(torch.float32)

        if self.is_hetero:
            data = read_heterograph_pyg(
                self.raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files,
                binary=self.binary)[0]

            if self.binary:
                # Open the archive in a ``with`` block so its file handle is
                # closed deterministically once every array has been read.
                with np.load(osp.join(self.raw_dir,
                                      'node-label.npz')) as label_archive:
                    node_label_dict = {
                        key: label_archive[key]
                        for key in label_archive.keys()
                    }
            else:
                node_label_dict = read_node_label_hetero(self.raw_dir)

            data.y_dict = {
                nodetype: _as_label_tensor(node_label)
                for nodetype, node_label in node_label_dict.items()
            }

        else:
            data = read_graph_pyg(self.raw_dir,
                                  add_inverse_edge=add_inverse_edge,
                                  additional_node_files=additional_node_files,
                                  additional_edge_files=additional_edge_files,
                                  binary=self.binary)[0]

            ### adding prediction target
            if self.binary:
                # Same as above: do not leave the archive handle open.
                with np.load(osp.join(self.raw_dir,
                                      'node-label.npz')) as label_archive:
                    node_label = label_archive['node_label']
            else:
                node_label = pd.read_csv(osp.join(self.raw_dir,
                                                  'node-label.csv.gz'),
                                         compression='gzip',
                                         header=None).values

            data.y = _as_label_tensor(node_label)

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        print('Saving...')
        torch.save(self.collate([data]), self.processed_paths[0])
Beispiel #4
0
    def process(self):
        """Read all raw graphs, attach graph-level labels and save as ``.npz``.

        Loader options are taken from ``self.meta_info`` and every graph is
        read with ``read_graph_pyg``. For the ``'subtoken prediction'`` task
        each graph's ``y`` is a list of strings obtained by splitting the
        first column of ``graph-label.csv.gz`` on spaces. For every other
        task ``y`` is a tensor viewed as ``(1, -1)``: ``torch.long`` for
        classification tasks whose labels contain no NaN, ``torch.float32``
        otherwise.

        When ``self.pre_transform`` is set, the collated graphs are wrapped
        in ``TensorSliceDataset`` and passed through
        ``_process_dataset_pretransform_torch``. The final collated dataset
        is converted with ``_dataset_to_dict`` and written to
        ``self.processed_paths[0]`` via ``np.savez``.
        """
        from ogb.io.read_graph_pyg import read_graph_pyg

        ### read pyg graph list
        meta = self.meta_info
        add_inverse_edge = meta['add_inverse_edge'] == 'True'

        # The literal string 'None' marks "no additional files".
        node_files_entry = meta['additional node files']
        extra_node_files = ([] if node_files_entry == 'None'
                            else node_files_entry.split(','))

        edge_files_entry = meta['additional edge files']
        extra_edge_files = ([] if edge_files_entry == 'None'
                            else edge_files_entry.split(','))

        data_list = read_graph_pyg(self.raw_dir,
                                   add_inverse_edge=add_inverse_edge,
                                   additional_node_files=extra_node_files,
                                   additional_edge_files=extra_edge_files,
                                   binary=self.binary)

        if self.task_type == 'subtoken prediction':
            csv_path = os.path.join(self.raw_dir, 'graph-label.csv.gz')
            label_rows = pd.read_csv(csv_path,
                                     compression='gzip',
                                     header=None).values
            # Column 0 of each row is split on single spaces into tokens.
            graph_label = [str(row[0]).split(' ') for row in label_rows]

            for idx, graph in enumerate(data_list):
                graph.y = graph_label[idx]

        else:
            if self.binary:
                npz_path = os.path.join(self.raw_dir, 'graph-label.npz')
                graph_label = np.load(npz_path)['graph_label']
            else:
                csv_path = os.path.join(self.raw_dir, 'graph-label.csv.gz')
                graph_label = pd.read_csv(csv_path,
                                          compression='gzip',
                                          header=None).values

            has_nan = np.isnan(graph_label).any()

            # The dtype is the same for every graph, so decide it once:
            # integer labels only for NaN-free classification targets.
            use_long = 'classification' in self.task_type and not has_nan
            label_dtype = torch.long if use_long else torch.float32

            for idx, graph in enumerate(data_list):
                row = torch.from_numpy(graph_label[idx])
                graph.y = row.view(1, -1).to(label_dtype)

        if self.pre_transform is not None:
            # The pre-transform helper takes a TensorSliceDataset built from
            # the collated result, so collate once before transforming.
            collated = self.collate(data_list)

            data_list = _process_dataset_pretransform_torch(
                self.pre_transform, TensorSliceDataset(*collated))

        collated = self.collate(data_list)
        np.savez(self.processed_paths[0], **_dataset_to_dict(collated))