Ejemplo n.º 1
0
    def one_hot_process(self):
        """Build one-hot encoded molecular graphs from the raw SMILES CSV.

        Reads ``self.raw_paths[0]`` as a comma-separated file (every column
        parsed as a string), takes the SMILES strings from ``self.smile_col``
        and, when ``self.available_prop`` is truthy, the target values from
        ``self.prop_name``. Each molecule is kekulized and, unless it has
        more than ``self.num_max_node`` atoms (such molecules are skipped),
        converted into a ``Data`` object carrying:

        * ``x``: int32 one-hot matrix of shape
          ``(len(self.atom_list), self.num_max_node)``; when ``self.one_shot``
          is truthy an extra last row is appended that is 1 on the unused
          (padding) node slots and 0 on slots occupied by real atoms.
        * ``adj``: float32 tensor of shape
          ``(4, self.num_max_node, self.num_max_node)``; the channel of each
          bond is looked up in ``bond_type_to_int`` and the last channel is
          overwritten with ``1 - sum over all channels``.
        * ``num_atom``: number of atoms in the molecule.
        * ``y``: one-element float tensor holding the property value of the
          same CSV row (only when ``self.available_prop`` is truthy).

        Side effect: ``self.all_smiles`` is set to the full list of SMILES
        read from the file, including those of molecules that are skipped.

        Returns:
            The ``(data, slices)`` pair produced by ``self.collate``.
        """
        input_path = self.raw_paths[0]
        input_df = pd.read_csv(input_path, sep=',', dtype='str')
        smile_list = list(input_df[self.smile_col])
        if self.available_prop:
            prop_list = list(input_df[self.prop_name])

        self.all_smiles = smile_list
        data_list = []

        for mol_idx, smile in enumerate(smile_list):
            # NOTE(review): RDKit's MolFromSmiles returns None for SMILES it
            # cannot parse, in which case Kekulize raises. Confirm the input
            # file is pre-validated.
            mol = Chem.MolFromSmiles(smile)
            Chem.Kekulize(mol)
            num_atom = mol.GetNumAtoms()
            if num_atom > self.num_max_node:
                # Does not fit into the fixed-size padded representation.
                continue

            # atoms
            atom_array = np.zeros(
                (len(self.atom_list), self.num_max_node), dtype=np.int32)
            if self.one_shot:
                virtual_node = np.ones((1, self.num_max_node), dtype=np.int32)

            for atom_idx, atom in enumerate(mol.GetAtoms()):
                atom_feature = atom.GetAtomicNum()
                # Row = position of the atomic number in self.atom_list.
                atom_array[self.atom_list.index(atom_feature), atom_idx] = 1
                if self.one_shot:
                    # This slot holds a real atom, so it is not padding.
                    virtual_node[0, atom_idx] = 0

            if self.one_shot:
                x = torch.tensor(
                    np.concatenate((atom_array, virtual_node), axis=0))
            else:
                x = torch.tensor(atom_array)

            # bonds
            adj_array = np.zeros(
                [4, self.num_max_node, self.num_max_node], dtype=np.float32)
            for bond in mol.GetBonds():
                ch = bond_type_to_int[bond.GetBondType()]
                # Use dedicated names for the bond end points: reusing the
                # molecule loop index here would make the property lookup
                # below read the wrong CSV row.
                begin = bond.GetBeginAtomIdx()
                end = bond.GetEndAtomIdx()
                adj_array[ch, begin, end] = 1.0
                adj_array[ch, end, begin] = 1.0
            # Last channel becomes the complement of all channels summed
            # (presumably the "no bond" indicator -- confirm against
            # bond_type_to_int).
            adj_array[-1, :, :] = 1 - np.sum(adj_array, axis=0)

            data = Data(x=x)
            data.adj = torch.tensor(adj_array)
            data.num_atom = num_atom
            if self.available_prop:
                data.y = torch.tensor([float(prop_list[mol_idx])])
            data_list.append(data)

        data, slices = self.collate(data_list)
        return data, slices
Ejemplo n.º 2
0
    def get(self, idx):
        """Return the graph sample for ``idx``, building and caching it on first use.

        The record ``self.input_ds[idx]`` must provide the keys ``'adj'``,
        ``'vect_feat'``, ``'input_mask'``, ``'pred_mask'``, ``'vals'`` and
        ``'coords'``. ``adj`` is used as a 3-D torch tensor indexed as
        ``adj[channel, i, j]``.

        A node pair ``(i, j)`` becomes an edge when the maximum of
        ``adj[:, i, j]`` over the channel dimension is non-zero. For every
        edge, ``edge_attr`` holds the channel vector ``adj[:, i, j]`` and
        ``edge_type`` is derived from the non-zero channel positions of that
        vector.

        The returned ``GeomData`` carries ``x``, ``edge_index``, ``edge_type``,
        ``input_idx``, ``pos``, ``edge_attr`` and ``y``, plus the extra
        attributes ``pred_mask``, ``input_mask`` and ``adj``. The object is
        stored in ``self.cache`` so later calls with the same ``idx`` return
        the very same instance.

        :param idx: index into ``self.input_ds``; also used as the cache key.
        :return: the ``GeomData`` object for this sample.
        """
        if idx in self.cache:
            return self.cache[idx]

        d = self.input_ds[idx]
        adj = d['adj']
        vect_feat = d['vect_feat']
        input_mask = d['input_mask']
        pred_mask = d['pred_mask']
        vals = d['vals']
        coords = d['coords']

        # Collapse the channel dimension: a non-zero maximum marks an edge.
        have_edge, _ = adj.max(dim=0)
        edges = np.argwhere(have_edge.numpy())

        edge_attr = [adj[:, i, j] for i, j in edges]
        edge_type = torch.Tensor(
            [np.argwhere(adj[:, i, j])[0] for i, j in edges]).long()

        if edge_attr:
            edge_attr = torch.stack(edge_attr)
        else:
            # torch.stack raises on an empty list, so a sample without any
            # edge gets an explicit (0, num_channels) tensor instead.
            edge_attr = adj.new_zeros((0, adj.shape[0]))

        # Rows of ``edges`` are (i, j) pairs; transpose to the (2, E) layout.
        edge_index_tensor = torch.Tensor(edges.T).long()

        y = torch.Tensor(vals)

        # FIXME maybe we need to select non-padded nodes here?
        data = GeomData(x=torch.Tensor(vect_feat),
                        edge_index=edge_index_tensor,
                        edge_type=edge_type,
                        input_idx=idx,
                        pos=coords,
                        edge_attr=edge_attr,
                        y=y)

        data.pred_mask = torch.Tensor(pred_mask)
        data.input_mask = torch.Tensor(input_mask)
        data.adj = torch.Tensor(adj)

        self.cache[idx] = data
        return data
Ejemplo n.º 3
0
def create_and_save_data(scale, fs_subjects_dir_path, atlas_sheet_path,
                         fsl_subjects_dir_path, atlas_dir_path,
                         correlation_measure, r, tmp_dir_path, subject):
    """
    Create the torch_geometric :obj:`Data` for one subject and pickle it.

    A single sample is built per subject: node attributes come from
    ``to_node_attr_array``, the time series from ``to_time_series`` and the
    connectivity matrix from ``correlation_measure.fit_transform``. The graph
    is fully connected (every ``(i, j)`` pair of the connectivity matrix,
    self-loops included) with all-ones placeholder edge attributes; the
    connectivity matrix itself is attached as ``data.adj`` and the time
    series as ``data.ts``. The label ``y`` is 0 when the subject id starts
    with ``'2'`` and 1 otherwise.

    The one-element list of :obj:`Data` is written to
    ``<tmp_dir_path>/<subject>.pickle``.

    :param scale: forwarded to ``to_node_attr_array`` and ``to_time_series``
    :param fs_subjects_dir_path: forwarded to ``to_node_attr_array``
    :param atlas_sheet_path: forwarded to ``to_node_attr_array``
    :param fsl_subjects_dir_path: forwarded to ``to_time_series``
    :param atlas_dir_path: forwarded to ``to_time_series``
    :param correlation_measure: object whose ``fit_transform([time_series])``
        returns a sequence with the connectivity matrix as first element
    :param r: N choose r, for re-sampling; currently unused because the
        re-sampling augmentation is disabled (kept for interface stability)
    :param tmp_dir_path: directory receiving the per-subject pickle file
    :param subject: subject id string; names the output file and sets the label
    :return: None
    """
    node_attr_array = to_node_attr_array(subject, fs_subjects_dir_path,
                                         atlas_sheet_path, scale)
    time_series = to_time_series(subject, fsl_subjects_dir_path,
                                 atlas_dir_path, scale)

    # TODO: the N-choose-r re-sampling augmentation (one sample per
    # combination of time-series blocks) is disabled; only one sample per
    # subject is produced.

    corr = correlation_measure.fit_transform([time_series])[0]

    # Fully connected graph: every entry of an all-ones matrix shaped like
    # ``corr`` is an edge, listed in row-major order. Computed directly with
    # NumPy rather than via networkx, whose ``to_scipy_sparse_matrix`` was
    # removed in NetworkX 3.0.
    rows, cols = np.nonzero(np.ones_like(corr))
    edge_index = np.stack([rows, cols])
    edge_attr = np.ones((len(rows), 1))  # add edge_attr later

    label = 0 if subject.startswith('2') else 1
    data = Data(
        x=torch.tensor(node_attr_array, dtype=torch.float),
        edge_index=torch.tensor(edge_index, dtype=torch.long),
        edge_attr=torch.tensor(edge_attr, dtype=torch.float),
        y=torch.tensor([label]))
    data.adj = torch.tensor(corr, dtype=torch.float)
    data.ts = torch.tensor(time_series)
    data_list = [data]

    subject_tmp_file_path = osp.join(tmp_dir_path, '{}.pickle'.format(subject))
    print("Saving {} samples on subject {}".format(len(data_list), subject))
    with open(subject_tmp_file_path, 'wb') as pfile:
        pickle.dump(data_list, pfile, protocol=pickle.HIGHEST_PROTOCOL)