def process(self):
        print('processing data from ({}) and saving it to ({})'.format(
            self.directory, os.path.join(self.directory, 'processed')))

        with open(os.path.join(self.directory, "summary_qm9.json"), "r") as f:
            summary = json.load(f)

        atom_slices = [0]
        edge_slices = [0]
        total_eigvecs = []
        total_eigvals = []
        all_atom_features = []
        all_edge_features = []
        targets = {
            'ensembleenergy': [],
            'ensembleentropy': [],
            'ensemblefreeenergy': [],
            'lowestenergy': [],
            'poplowestpct': [],
            'temperature': [],
            'uniqueconfs': []
        }
        edge_indices = []  # edges of each molecule in coo format
        atomic_number_long = []
        n_atoms_list = []

        coordinates = []
        smiles_list = []
        total_atoms = 0
        total_edges = 0
        avg_degree = 0  # average degree in the dataset
        for smiles, sub_dic in tqdm(list(summary.items())):
            pickle_path = os.path.join(self.directory,
                                       sub_dic.get("pickle_path", ""))
            if os.path.isfile(pickle_path):
                pickle_file = open(pickle_path, 'rb')
                mol_dict = pickle.load(pickle_file)
                if 'ensembleenergy' in mol_dict:
                    conformers = mol_dict['conformers']
                    mol = conformers[0]['rd_mol']
                    n_atoms = len(mol.GetAtoms())
                    atom_features_list = []
                    for atom in mol.GetAtoms():
                        atom_features_list.append(atom_to_feature_vector(atom))
                    all_atom_features.append(
                        torch.tensor(atom_features_list, dtype=torch.long))

                    adj = GetAdjacencyMatrix(mol, useBO=False, force=True)
                    max_freqs = 10
                    adj = torch.tensor(adj).float()
                    D = torch.diag(adj.sum(dim=0))
                    L = D - adj
                    N = adj.sum(dim=0)**-0.5
                    L_sym = torch.eye(n_atoms) - N * L * N
                    try:
                        eig_vals, eig_vecs = torch.symeig(L_sym,
                                                          eigenvectors=True)
                    except Exception as e:  # if we have disconnected components
                        deg = adj.sum(dim=0)
                        deg[deg == 0] = 1
                        N = deg**-0.5
                        L_sym = torch.eye(n_atoms) - N * L * N
                        eig_vals, eig_vecs = torch.symeig(L_sym,
                                                          eigenvectors=True)
                    idx = eig_vals.argsort(
                    )[0:
                      max_freqs]  # Keep up to the maximum desired number of frequencies
                    eig_vals, eig_vecs = eig_vals[idx], eig_vecs[:, idx]

                    # Sort, normalize and pad EigenVectors
                    eig_vecs = eig_vecs[:,
                                        eig_vals.argsort()]  # increasing order
                    eig_vecs = F.normalize(eig_vecs,
                                           p=2,
                                           dim=1,
                                           eps=1e-12,
                                           out=None)
                    if n_atoms < max_freqs:
                        eig_vecs = F.pad(eig_vecs, (0, max_freqs - n_atoms),
                                         value=float('nan'))
                        eig_vals = F.pad(eig_vals, (0, max_freqs - n_atoms),
                                         value=float('nan'))

                    total_eigvecs.append(eig_vecs)
                    total_eigvals.append(eig_vals.unsqueeze(0))

                    edges_list = []
                    edge_features_list = []
                    for bond in mol.GetBonds():
                        i = bond.GetBeginAtomIdx()
                        j = bond.GetEndAtomIdx()
                        edge_feature = bond_to_feature_vector(bond)

                        # add edges in both directions
                        edges_list.append((i, j))
                        edge_features_list.append(edge_feature)
                        edges_list.append((j, i))
                        edge_features_list.append(edge_feature)
                    # Graph connectivity in COO format with shape [2, num_edges]
                    edge_index = torch.tensor(edges_list, dtype=torch.long).T
                    edge_features = torch.tensor(edge_features_list,
                                                 dtype=torch.long)

                    avg_degree += (len(edges_list) / 2) / n_atoms

                    targets['ensembleenergy'].append(
                        mol_dict['ensembleenergy'])
                    targets['ensembleentropy'].append(
                        mol_dict['ensembleentropy'])
                    targets['ensemblefreeenergy'].append(
                        mol_dict['ensemblefreeenergy'])
                    targets['lowestenergy'].append(mol_dict['lowestenergy'])
                    targets['poplowestpct'].append(mol_dict['poplowestpct'])
                    targets['temperature'].append(mol_dict['temperature'])
                    targets['uniqueconfs'].append(mol_dict['uniqueconfs'])
                    conformers = [
                        torch.tensor(
                            conformer['rd_mol'].GetConformer().GetPositions(),
                            dtype=torch.float) for conformer in conformers[:10]
                    ]
                    if len(
                            conformers
                    ) < 10:  # if there are less than 10 conformers we add the first one a few times
                        conformers.extend([conformers[0]] *
                                          (10 - len(conformers)))

                    all_edge_features.append(edge_features)
                    coordinates.append(torch.cat(conformers, dim=1))
                    edge_indices.append(edge_index)
                    total_edges += len(edges_list)
                    total_atoms += n_atoms
                    smiles_list.append(smiles)
                    edge_slices.append(total_edges)
                    atom_slices.append(total_atoms)
                    n_atoms_list.append(n_atoms)

        for key, value in targets.items():
            targets[key] = torch.tensor(value)[:, None]
        data_dict = {
            'smiles':
            smiles_list,
            'n_atoms':
            torch.tensor(n_atoms_list, dtype=torch.long),
            'atom_slices':
            torch.tensor(atom_slices, dtype=torch.long),
            'edge_slices':
            torch.tensor(edge_slices, dtype=torch.long),
            'atom_features':
            torch.cat(all_atom_features, dim=0),
            'edge_features':
            torch.cat(all_edge_features, dim=0),
            'atomic_number_long':
            torch.tensor(atomic_number_long, dtype=torch.long),
            'edge_indices':
            torch.cat(edge_indices, dim=1),
            'coordinates':
            torch.cat(coordinates, dim=0).float(),
            'targets':
            targets,
            'avg_degree':
            avg_degree / len(n_atoms_list)
        }
        data_dict.update(targets)
        if not os.path.exists(os.path.join(self.directory, 'processed')):
            os.mkdir(os.path.join(self.directory, 'processed'))
        torch.save(
            data_dict,
            os.path.join(self.directory, 'processed', self.processed_file))
Ejemplo n.º 2
0
    def process(self):
        print('processing data from ({}) and saving it to ({})'.format(self.qm9_directory,
                                                                       os.path.join(self.qm9_directory, 'processed')))

        # load qm9 data with spatial coordinates
        data_qm9 = dict(np.load(os.path.join(self.qm9_directory, self.raw_spatial_data), allow_pickle=True))
        coordinates = torch.tensor(data_qm9['R'], dtype=torch.float)
        # Read the QM9 data with SMILES information
        molecules_df = pd.read_csv(os.path.join(self.qm9_directory, self.raw_qm9_file))

        atom_slices = [0]
        edge_slices = [0]
        total_eigvecs = []
        total_eigvals = []
        all_atom_features = []
        all_edge_features = []
        edge_indices = []  # edges of each molecule in coo format
        targets = []  # the 19 properties that should be predicted for the QM9 dataset
        total_atoms = 0
        total_edges = 0
        avg_degree = 0  # average degree in the dataset
        # go through all molecules in the npz file
        for mol_idx, n_atoms in tqdm(enumerate(data_qm9['N'])):
            # get the molecule using the smiles representation from the csv file
            mol = Chem.MolFromSmiles(molecules_df['smiles'][data_qm9['id'][mol_idx]])
            # add hydrogen bonds to molecule because they are not in the smiles representation
            mol = Chem.AddHs(mol)

            atom_features_list = []
            for atom in mol.GetAtoms():
                atom_features_list.append(atom_to_feature_vector(atom))
            all_atom_features.append(torch.tensor(atom_features_list, dtype=torch.long))

            adj = GetAdjacencyMatrix(mol, useBO=False, force=True)
            max_freqs = 10
            adj = torch.tensor(adj).float()
            D = torch.diag(adj.sum(dim=0))
            L = D - adj
            N = adj.sum(dim=0) ** -0.5
            L_sym = torch.eye(n_atoms) - N * L * N
            eig_vals, eig_vecs = torch.symeig(L_sym, eigenvectors=True)
            idx = eig_vals.argsort()[0: max_freqs]  # Keep up to the maximum desired number of frequencies
            eig_vals, eig_vecs = eig_vals[idx], eig_vecs[:, idx]

            # Sort, normalize and pad EigenVectors
            eig_vecs = eig_vecs[:, eig_vals.argsort()]  # increasing order
            eig_vecs = F.normalize(eig_vecs, p=2, dim=1, eps=1e-12, out=None)
            if n_atoms < max_freqs:
                eig_vecs = F.pad(eig_vecs, (0, max_freqs - n_atoms), value=float('nan'))
                eig_vals = F.pad(eig_vals, (0, max_freqs - n_atoms), value=float('nan'))

            total_eigvecs.append(eig_vecs)
            total_eigvals.append(eig_vals.unsqueeze(0))

            edges_list = []
            edge_features_list = []
            for bond in mol.GetBonds():
                i = bond.GetBeginAtomIdx()
                j = bond.GetEndAtomIdx()
                edge_feature = bond_to_feature_vector(bond)

                # add edges in both directions
                edges_list.append((i, j))
                edge_features_list.append(edge_feature)
                edges_list.append((j, i))
                edge_features_list.append(edge_feature)
            # Graph connectivity in COO format with shape [2, num_edges]
            edge_index = torch.tensor(edges_list, dtype=torch.long).T
            edge_features = torch.tensor(edge_features_list, dtype=torch.long)

            avg_degree += (len(edges_list) / 2) / n_atoms

            # get all 19 attributes that should be predicted, so we drop the first two entries (name and smiles)
            target = torch.tensor(molecules_df.iloc[data_qm9['id'][mol_idx]][2:], dtype=torch.float)
            targets.append(target)
            edge_indices.append(edge_index)
            all_edge_features.append(edge_features)

            total_edges += len(edges_list)
            total_atoms += n_atoms
            edge_slices.append(total_edges)
            atom_slices.append(total_atoms)

        # convert targets to eV units
        targets = torch.stack(targets) * torch.tensor(list(self.unit_conversion.values()))[None, :]
        data_dict = {'mol_id': data_qm9['id'],
                     'n_atoms': torch.tensor(data_qm9['N'], dtype=torch.long),
                     'atom_slices': torch.tensor(atom_slices, dtype=torch.long),
                     'edge_slices': torch.tensor(edge_slices, dtype=torch.long),
                     'eig_vecs': torch.cat(total_eigvecs).float(),
                     'eig_vals': torch.cat(total_eigvals).float(),
                     'edge_indices': torch.cat(edge_indices, dim=1),
                     'atom_features': torch.cat(all_atom_features, dim=0),
                     'edge_features': torch.cat(all_edge_features, dim=0),
                     'atomic_number_long': torch.tensor(data_qm9['Z'], dtype=torch.long)[:, None],
                     'coordinates': coordinates,
                     'targets': targets,
                     'avg_degree': avg_degree / len(data_qm9['id'])
                     }

        if not os.path.exists(os.path.join(self.qm9_directory, 'processed')):
            os.mkdir(os.path.join(self.qm9_directory, 'processed'))
        torch.save(data_dict, os.path.join(self.qm9_directory, 'processed', self.processed_file))