def process(self):
    """Featurize every molecule listed in ``summary_qm9.json`` and save the result.

    For each summary entry whose pickle file exists and whose unpickled dict
    contains an ``'ensembleenergy'`` key, the ``rd_mol`` of the first conformer
    is used to build the atom features, the bond features and an edge list
    with both directions of every bond. The positions of up to 10 conformers
    are concatenated along dim 1; when fewer than 10 exist, the first one is
    repeated to fill up. Entries that do not meet both conditions are skipped.

    The collected tensors are written with ``torch.save`` to
    ``<self.directory>/processed/<self.processed_file>``.
    """
    processed_dir = os.path.join(self.directory, 'processed')
    print('processing data from ({}) and saving it to ({})'.format(
        self.directory, processed_dir))

    with open(os.path.join(self.directory, "summary_qm9.json"), "r") as f:
        summary = json.load(f)

    atom_slices = [0]
    edge_slices = [0]
    # NOTE(review): the eigenvectors/eigenvalues are collected below but are
    # not written to data_dict in this method - confirm whether that is intended.
    total_eigvecs = []
    total_eigvals = []
    all_atom_features = []
    all_edge_features = []
    targets = {
        'ensembleenergy': [],
        'ensembleentropy': [],
        'ensemblefreeenergy': [],
        'lowestenergy': [],
        'poplowestpct': [],
        'temperature': [],
        'uniqueconfs': []
    }
    edge_indices = []  # edges of each molecule in coo format
    # NOTE(review): never filled in this method, so it is saved as an empty tensor.
    atomic_number_long = []
    n_atoms_list = []
    coordinates = []
    smiles_list = []
    total_atoms = 0
    total_edges = 0
    avg_degree = 0  # average degree in the dataset
    max_freqs = 10  # number of eigenvalues/eigenvectors kept per molecule
    n_conformers = 10  # number of conformers stored per molecule

    for smiles, sub_dic in tqdm(list(summary.items())):
        pickle_path = os.path.join(self.directory, sub_dic.get("pickle_path", ""))
        if not os.path.isfile(pickle_path):
            continue
        # The context manager closes the handle again; a bare open() here
        # would leak one file descriptor per molecule.
        # NOTE: pickle.load can execute arbitrary code, only use trusted files.
        with open(pickle_path, 'rb') as pickle_file:
            mol_dict = pickle.load(pickle_file)
        if 'ensembleenergy' not in mol_dict:
            continue

        conformers = mol_dict['conformers']
        mol = conformers[0]['rd_mol']
        n_atoms = len(mol.GetAtoms())

        atom_features_list = []
        for atom in mol.GetAtoms():
            atom_features_list.append(atom_to_feature_vector(atom))
        all_atom_features.append(
            torch.tensor(atom_features_list, dtype=torch.long))

        adj = GetAdjacencyMatrix(mol, useBO=False, force=True)
        adj = torch.tensor(adj).float()
        D = torch.diag(adj.sum(dim=0))
        L = D - adj
        N = adj.sum(dim=0) ** -0.5
        # NOTE(review): `*` is element-wise with broadcasting here, not a
        # matrix product - confirm this is the intended normalisation.
        L_sym = torch.eye(n_atoms) - N * L * N
        # NOTE(review): torch.symeig is deprecated in newer PyTorch releases
        # in favour of torch.linalg.eigh (which reads the other triangle by
        # default); kept as is so the results do not change.
        try:
            eig_vals, eig_vecs = torch.symeig(L_sym, eigenvectors=True)
        except Exception:  # if we have disconnected components
            deg = adj.sum(dim=0)
            deg[deg == 0] = 1
            N = deg ** -0.5
            L_sym = torch.eye(n_atoms) - N * L * N
            eig_vals, eig_vecs = torch.symeig(L_sym, eigenvectors=True)

        # Keep up to the maximum desired number of frequencies
        idx = eig_vals.argsort()[0:max_freqs]
        eig_vals, eig_vecs = eig_vals[idx], eig_vecs[:, idx]

        # Sort, normalize and pad EigenVectors
        eig_vecs = eig_vecs[:, eig_vals.argsort()]  # increasing order
        eig_vecs = F.normalize(eig_vecs, p=2, dim=1, eps=1e-12, out=None)
        if n_atoms < max_freqs:
            eig_vecs = F.pad(eig_vecs, (0, max_freqs - n_atoms),
                             value=float('nan'))
            eig_vals = F.pad(eig_vals, (0, max_freqs - n_atoms),
                             value=float('nan'))
        total_eigvecs.append(eig_vecs)
        total_eigvals.append(eig_vals.unsqueeze(0))

        edges_list = []
        edge_features_list = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_feature = bond_to_feature_vector(bond)
            # add edges in both directions
            edges_list.append((i, j))
            edge_features_list.append(edge_feature)
            edges_list.append((j, i))
            edge_features_list.append(edge_feature)

        # Graph connectivity in COO format with shape [2, num_edges]
        edge_index = torch.tensor(edges_list, dtype=torch.long).T
        edge_features = torch.tensor(edge_features_list, dtype=torch.long)

        # every bond was added twice, hence the division by 2
        avg_degree += (len(edges_list) / 2) / n_atoms

        # the target names are exactly the keys read from the pickled dict
        for key in targets:
            targets[key].append(mol_dict[key])

        conformer_positions = [
            torch.tensor(conformer['rd_mol'].GetConformer().GetPositions(),
                         dtype=torch.float)
            for conformer in conformers[:n_conformers]
        ]
        # if there are less than 10 conformers we add the first one a few times
        if len(conformer_positions) < n_conformers:
            conformer_positions.extend(
                [conformer_positions[0]]
                * (n_conformers - len(conformer_positions)))

        all_edge_features.append(edge_features)
        coordinates.append(torch.cat(conformer_positions, dim=1))
        edge_indices.append(edge_index)
        total_edges += len(edges_list)
        total_atoms += n_atoms
        smiles_list.append(smiles)
        edge_slices.append(total_edges)
        atom_slices.append(total_atoms)
        n_atoms_list.append(n_atoms)

    # turn every list of per-molecule values into a column tensor
    for key, value in targets.items():
        targets[key] = torch.tensor(value)[:, None]

    data_dict = {
        'smiles': smiles_list,
        'n_atoms': torch.tensor(n_atoms_list, dtype=torch.long),
        'atom_slices': torch.tensor(atom_slices, dtype=torch.long),
        'edge_slices': torch.tensor(edge_slices, dtype=torch.long),
        'atom_features': torch.cat(all_atom_features, dim=0),
        'edge_features': torch.cat(all_edge_features, dim=0),
        'atomic_number_long': torch.tensor(atomic_number_long, dtype=torch.long),
        'edge_indices': torch.cat(edge_indices, dim=1),
        'coordinates': torch.cat(coordinates, dim=0).float(),
        'targets': targets,
        'avg_degree': avg_degree / len(n_atoms_list)
    }
    # the individual targets are additionally stored as top-level entries
    data_dict.update(targets)

    # exist_ok avoids the check-then-create race of exists() + mkdir()
    os.makedirs(processed_dir, exist_ok=True)
    torch.save(data_dict, os.path.join(processed_dir, self.processed_file))
def process(self):
    """Featurize the QM9 molecules and save them as one processed file.

    The spatial data is read from ``self.raw_spatial_data`` (npz) and the
    SMILES strings and target values from ``self.raw_qm9_file`` (csv), both
    located in ``self.qm9_directory``. For every entry of the npz file the
    molecule is rebuilt from its SMILES string, explicit hydrogens are added,
    and atom features, bond features, an edge list with both directions of
    every bond and the eigen-decomposition of ``L_sym`` are computed.

    The collected tensors are written with ``torch.save`` to
    ``<self.qm9_directory>/processed/<self.processed_file>``.
    """
    processed_dir = os.path.join(self.qm9_directory, 'processed')
    print('processing data from ({}) and saving it to ({})'.format(
        self.qm9_directory, processed_dir))

    # load qm9 data with spatial coordinates; dict() reads every array into
    # memory, so the archive can be closed again right away
    spatial_path = os.path.join(self.qm9_directory, self.raw_spatial_data)
    with np.load(spatial_path, allow_pickle=True) as npz_file:
        data_qm9 = dict(npz_file)
    coordinates = torch.tensor(data_qm9['R'], dtype=torch.float)

    # Read the QM9 data with SMILES information
    molecules_df = pd.read_csv(
        os.path.join(self.qm9_directory, self.raw_qm9_file))

    atom_slices = [0]
    edge_slices = [0]
    total_eigvecs = []
    total_eigvals = []
    all_atom_features = []
    all_edge_features = []
    edge_indices = []  # edges of each molecule in coo format
    targets = []  # the 19 properties that should be predicted for the QM9 dataset
    total_atoms = 0
    total_edges = 0
    avg_degree = 0  # average degree in the dataset
    max_freqs = 10  # number of eigenvalues/eigenvectors kept per molecule

    # go through all molecules in the npz file; enumerate() has no length,
    # so the total is passed explicitly for the progress bar
    for mol_idx, n_atoms in tqdm(enumerate(data_qm9['N']),
                                 total=len(data_qm9['N'])):
        csv_row = data_qm9['id'][mol_idx]
        # get the molecule using the smiles representation from the csv file
        mol = Chem.MolFromSmiles(molecules_df['smiles'][csv_row])
        # add explicit hydrogen atoms because they are not in the smiles representation
        mol = Chem.AddHs(mol)

        atom_features_list = []
        for atom in mol.GetAtoms():
            atom_features_list.append(atom_to_feature_vector(atom))
        all_atom_features.append(
            torch.tensor(atom_features_list, dtype=torch.long))

        adj = GetAdjacencyMatrix(mol, useBO=False, force=True)
        adj = torch.tensor(adj).float()
        D = torch.diag(adj.sum(dim=0))
        L = D - adj
        N = adj.sum(dim=0) ** -0.5
        # NOTE(review): `*` is element-wise with broadcasting here, not a
        # matrix product - confirm this is the intended normalisation.
        L_sym = torch.eye(n_atoms) - N * L * N
        # NOTE(review): torch.symeig is deprecated in newer PyTorch releases
        # in favour of torch.linalg.eigh (which reads the other triangle by
        # default); kept as is so the results do not change.
        eig_vals, eig_vecs = torch.symeig(L_sym, eigenvectors=True)

        # Keep up to the maximum desired number of frequencies
        idx = eig_vals.argsort()[0:max_freqs]
        eig_vals, eig_vecs = eig_vals[idx], eig_vecs[:, idx]

        # Sort, normalize and pad EigenVectors
        eig_vecs = eig_vecs[:, eig_vals.argsort()]  # increasing order
        eig_vecs = F.normalize(eig_vecs, p=2, dim=1, eps=1e-12, out=None)
        if n_atoms < max_freqs:
            eig_vecs = F.pad(eig_vecs, (0, max_freqs - n_atoms),
                             value=float('nan'))
            eig_vals = F.pad(eig_vals, (0, max_freqs - n_atoms),
                             value=float('nan'))
        total_eigvecs.append(eig_vecs)
        total_eigvals.append(eig_vals.unsqueeze(0))

        edges_list = []
        edge_features_list = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_feature = bond_to_feature_vector(bond)
            # add edges in both directions
            edges_list.append((i, j))
            edge_features_list.append(edge_feature)
            edges_list.append((j, i))
            edge_features_list.append(edge_feature)

        # Graph connectivity in COO format with shape [2, num_edges]
        edge_index = torch.tensor(edges_list, dtype=torch.long).T
        edge_features = torch.tensor(edge_features_list, dtype=torch.long)

        # every bond was added twice, hence the division by 2
        avg_degree += (len(edges_list) / 2) / n_atoms

        # get all 19 attributes that should be predicted, so we drop the
        # first two entries (name and smiles)
        target = torch.tensor(molecules_df.iloc[csv_row][2:],
                              dtype=torch.float)
        targets.append(target)
        edge_indices.append(edge_index)
        all_edge_features.append(edge_features)

        total_edges += len(edges_list)
        total_atoms += n_atoms
        edge_slices.append(total_edges)
        atom_slices.append(total_atoms)

    # convert targets to eV units
    conversion = torch.tensor(list(self.unit_conversion.values()))[None, :]
    targets = torch.stack(targets) * conversion

    data_dict = {
        'mol_id': data_qm9['id'],
        'n_atoms': torch.tensor(data_qm9['N'], dtype=torch.long),
        'atom_slices': torch.tensor(atom_slices, dtype=torch.long),
        'edge_slices': torch.tensor(edge_slices, dtype=torch.long),
        'eig_vecs': torch.cat(total_eigvecs).float(),
        'eig_vals': torch.cat(total_eigvals).float(),
        'edge_indices': torch.cat(edge_indices, dim=1),
        'atom_features': torch.cat(all_atom_features, dim=0),
        'edge_features': torch.cat(all_edge_features, dim=0),
        'atomic_number_long': torch.tensor(data_qm9['Z'], dtype=torch.long)[:, None],
        'coordinates': coordinates,
        'targets': targets,
        'avg_degree': avg_degree / len(data_qm9['id'])
    }

    # exist_ok avoids the check-then-create race of exists() + mkdir()
    os.makedirs(processed_dir, exist_ok=True)
    torch.save(data_dict, os.path.join(processed_dir, self.processed_file))