def prepare_mol(self, mol: rdchem.Mol) -> Tuple[str, rdchem.Mol]:
    """Standardize a molecule and report its canonical SMILES.

    The input is converted to canonical SMILES and a fresh molecule is
    rebuilt from that string, so the returned object always derives from the
    canonical form. This method should be called before `get_input_feats`.

    Params:
    -------
    mol: rdkit.Chem.rdchem.Mol
        Molecule of interest.

    Returns:
    --------
    canonical_smiles: str
        Canonical SMILES representation of the molecule.

    mol: rdkit.Chem.rdchem.Mol
        Molecule rebuilt from the canonical SMILES, with Hs added when
        `self.add_Hs` is set and kekulized when `self.kekulize` is set.
    """
    smiles = rdmolfiles.MolToSmiles(mol, canonical=True)
    rebuilt = rdmolfiles.MolFromSmiles(smiles)

    if self.add_Hs:
        rebuilt = rdmolops.AddHs(rebuilt)

    # Kekulize modifies the molecule in place; its return value is not used.
    if self.kekulize:
        rdmolops.Kekulize(rebuilt)

    return smiles, rebuilt
def import_csv_data(self, file):
    """Reload a previously exported, already split dataset from a CSV file.

    The instance is reset through ``__init__`` and flagged as split. Every
    row is converted to a graph with ``GraphBuilder.molToGraph`` and its
    ``Index`` value is filed under the positive/negative lists (by label)
    and under the train/test lists (by the ``In Train`` column).

    Params:
        file: path of the CSV file. The columns read are ``Smiles``,
            ``Label``, ``Index`` and ``In Train``.

    Returns:
        A 4-tuple: the result of the parent class' ``get_trainTestGraph()``,
        the largest node count seen, and the node and edge feature
        dimensions of the first graph.
    """
    print("Load Processed Data")
    self.__init__()
    self.split = True

    with open(file) as handle:
        for record in csv.DictReader(handle):
            smiles = record['Smiles']
            graph = GraphBuilder.molToGraph(rf.MolFromSmiles(smiles))
            graph.smiles = smiles
            graph.label = float(record['Label'])
            self.graphList.append(graph)

            # A label of exactly 1.0 marks a positive sample.
            is_positive = graph.label == 1.0
            class_indices = self.positiveInd if is_positive else self.negativeInd
            class_indices.append(int(record['Index']))

            if graph.nodeNum > self.maxNodes:
                self.maxNodes = graph.nodeNum

            # Only the literal string 'True' selects the training split.
            in_train = record['In Train'] == 'True'
            split_indices = self.trainInd if in_train else self.testInd
            split_indices.append(int(record['Index']))

    print("Number of Samples in the DataSet: " + str(len(self.graphList)))
    print("Node Number: " + str(self.maxNodes))
    first_graph = self.graphList[0]
    print("Node Feature Dimension: " + str(first_graph.nodeFeatureDim))
    print("Edge Feature Dimension: " + str(first_graph.edgeFeatureDim))
    print("Number of Positive Samples: " + str(len(self.positiveInd)))
    print("Number of Negative Samples: " + str(len(self.negativeInd)))

    # The first graph is looked up again after the split call, exactly as
    # the tuple elements are evaluated left to right.
    return (super().get_trainTestGraph(), self.maxNodes,
            self.graphList[0].nodeFeatureDim,
            self.graphList[0].edgeFeatureDim)
def load_csv(self, file='', col_1='smiles', col_2='label'):
    """Append graphs built from a CSV of SMILES strings and labels.

    Each row's SMILES (column ``col_1``) is parsed with ``rf.MolFromSmiles``
    and converted with ``GraphBuilder.molToGraph``. Rows with an empty or
    unparsable SMILES, or whose graph has no edges, are reported on stdout
    and skipped. Accepted graphs are appended to ``self.graphList`` with
    ``smiles``, ``label`` and ``scaled_label`` set (both labels hold the raw
    float of column ``col_2``), and ``self.maxNodes`` is raised to the
    largest node count seen.

    Params:
        file: path of the CSV file to read.
        col_1: header of the column holding the SMILES strings.
        col_2: header of the column holding the numeric label.

    Returns:
        None. Results are stored on the instance.

    Raises:
        KeyError: if ``col_1`` (or ``col_2`` for an accepted row) is not a
            column of the file.
        ValueError: if the label of an accepted row is not a valid float.
    """
    print("Load Data")
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation asks for files handed to a reader.
    with open(file, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        # Row numbers in the messages are 1-based and do not count the header.
        for counter, row in enumerate(reader, start=1):
            smiles = row[col_1]
            # Parse only non-empty SMILES; an empty or missing cell is
            # reported the same way as an unparsable one.
            mol = rf.MolFromSmiles(smiles) if smiles else None
            if not mol:
                print("Row " + str(counter) + " has no smiles/mol. \n")
                continue

            graph = GraphBuilder.molToGraph(mol)
            if len(graph.edges) == 0:
                print("Row " + str(counter) + " has no edge. \n")
                continue

            graph.smiles = smiles
            graph.label = float(row[col_2])
            graph.scaled_label = float(row[col_2])
            self.graphList.append(graph)
            if graph.nodeNum > self.maxNodes:
                self.maxNodes = graph.nodeNum

    print("Number of Samples in the DataSet: " + str(len(self.graphList)))
    print("Node Number: " + str(self.maxNodes))
    # The feature dimensions are read from the first graph, so they can only
    # be reported when at least one row was accepted.
    if self.graphList:
        first_graph = self.graphList[0]
        print("Node Feature Dimension: " + str(first_graph.nodeFeatureDim))
        print("Edge Feature Dimension: " + str(first_graph.edgeFeatureDim))
def import_csv_data(self, file):
    """Reload a previously exported, already split regression dataset.

    The instance is reset through ``__init__`` and flagged as split. Every
    row is converted to a graph with ``GraphBuilder.molToGraph``, carrying
    both its raw and its scaled label, and its ``Index`` value is filed
    under the train or test index list. Mean and variance are then
    recomputed from the loaded graphs.

    Params:
        file: path of the CSV file. The columns read are ``Smiles``,
            ``Label``, ``Scaled Label``, ``Index`` and ``In Train``.

    Returns:
        A 4-tuple: the result of ``self.get_trainTestGraph()``, the largest
        node count seen, and the node and edge feature dimensions of the
        first graph.
    """
    print("Load Processed Data")
    self.__init__()
    self.split = True

    with open(file) as handle:
        for record in csv.DictReader(handle):
            smiles = record['Smiles']
            graph = GraphBuilder.molToGraph(rf.MolFromSmiles(smiles))
            graph.smiles = smiles
            graph.label = float(record['Label'])
            graph.scaled_label = float(record['Scaled Label'])
            self.graphList.append(graph)

            if graph.nodeNum > self.maxNodes:
                self.maxNodes = graph.nodeNum

            # Only the literal string 'True' selects the training split.
            in_train = record['In Train'] == 'True'
            split_indices = self.trainInd if in_train else self.testInd
            split_indices.append(int(record['Index']))

    self.mean, self.var = self.__getMeanVar(self.graphList)
    sd = math.sqrt(self.var)

    print("Number of Samples in the DataSet: " + str(len(self.graphList)))
    print("Data Mean: " + str(self.mean))
    print("Data Sd: " + str(sd))
    print("Node Number: " + str(self.maxNodes))
    first_graph = self.graphList[0]
    print("Node Feature Dimension: " + str(first_graph.nodeFeatureDim))
    print("Edge Feature Dimension: " + str(first_graph.edgeFeatureDim))

    # The first graph is looked up again after the split call, exactly as
    # the tuple elements are evaluated left to right.
    return (self.get_trainTestGraph(), self.maxNodes,
            self.graphList[0].nodeFeatureDim,
            self.graphList[0].edgeFeatureDim)
def _transform_input_features(self, input_features): """Transform input features into morgan fingerprint binary vectors :param input_features: input features :type input_features: np.array :return: binary vectors of 256 elements :rtype: np.array """ mol_smiles = [ rdmolfiles.MolFromSmiles(input_feature) for input_feature in input_features ] morgan_fingerprints = [ self._get_morgan_fingerprint_as_bit_vect(mol_smile) for mol_smile in mol_smiles ] binary_vecotrs = [ np.array( self._morgan_fingerprint_to_binary_vector(morgan_fingerprint)) for morgan_fingerprint in morgan_fingerprints ] return np.array(binary_vecotrs)
def assess_retro_template(self, template, reactant_mol_list, retro_outcomes):
    """Score how many generated precursor sets contain a recorded reactant.

    Every entry of ``retro_outcomes`` is split on '.' into fragments, each
    fragment is converted to an InChI string, and the entry counts as a
    match when it shares at least one InChI with the recorded reactants.

    Stereochemistry is controlled by ``cc.USE_STEREOCHEMISTRY``, not by an
    argument. When it is ``False``, every '@' is stripped from the SMILES
    before comparison and the recorded reactants are rebuilt from
    ``self.reactant_list`` (``reactant_mol_list`` is then ignored).
    Otherwise ``reactant_mol_list`` is used as passed.

    Parameters:
        template (str): Retro-reaction template SMIRKS string. It is only
            parsed; the parsed reaction is not used for the scoring.
        reactant_mol_list (list): List of the recorded reactant molecules
            to be cross-checked as RDKit Mols.
        retro_outcomes (iterable of str): generated outcomes, each a
            '.'-separated SMILES string (e.g. the outcome list from
            'check_retro_template_outcome()').

    Returns:
        selectivity (float): the fraction of generated precursor sets
            matching the recorded precursors i.e.
            1.0 - match or match.match or match.match.match etc.
            0.5 - match.none or match.none.match.none etc.
            0.0 - none, or no outcomes were supplied at all
    """
    # The parsed reaction is not needed below; the call is kept so that a
    # template RDKit cannot parse still fails here as it did before.
    rdChemReactions.ReactionFromSmarts(template)

    ignore_stereo = cc.USE_STEREOCHEMISTRY is False

    if ignore_stereo:
        reactant_mol_list = [
            rdmolfiles.MolFromSmiles(smiles.replace('@', ''))
            for smiles in self.reactant_list
            if smiles
        ]
    # Built once: every outcome set is intersected with the same reactants.
    reactant_inchi = {
        Chem.MolToInchi(reactant) for reactant in reactant_mol_list
    }

    total = 0
    matched = 0
    for outcome_set in retro_outcomes:
        fragments = outcome_set.split('.')
        if ignore_stereo:
            fragments = [fragment.replace('@', '') for fragment in fragments]
        outcome_inchi = {
            Chem.MolToInchi(Chem.MolFromSmiles(fragment))
            for fragment in fragments
        }
        total += 1
        if outcome_inchi & reactant_inchi:
            matched += 1

    # Without any outcome there is nothing to match; report zero
    # selectivity instead of dividing by zero.
    if total == 0:
        return 0.0
    return matched / total
def test_get_morgan_fingerprint_as_bit_vect(self):
    """The fingerprint of a fixed SMILES equals the pickled reference.

    The reference object is loaded from ``data/morgan_fingerprint.pkl``
    under ``self.current_dir_path``.
    """
    smile = 'CCCCCCCC(=O)OCC(COC(=O)CCCCCCC)OC(=O)CCCCCCC'
    molecule = rdmolfiles.MolFromSmiles(smile)
    actual = self.dataset_generator._get_morgan_fingerprint_as_bit_vect(
        molecule)

    fixture_path = os.path.join(self.current_dir_path,
                                'data/morgan_fingerprint.pkl')
    with open(fixture_path, 'rb') as fixture:
        expected = pickle.load(fixture)

    self.assertEqual(actual, expected)
def test_mols():
    """Return four embedded molecules, each tagged with a random fitness.

    Every molecule is parsed from a fixed SMILES, embedded with explicit
    hydrogens using the ETKDG parameters, stripped of those hydrogens again
    and given a 'Fitness' property holding a random number as a string.
    """

    def build(smiles):
        # Hydrogens are present only for the embedding step.
        with_hs = rdmolops.AddHs(rdmolfiles.MolFromSmiles(smiles),
                                 addCoords=True)
        rdDistGeom.EmbedMolecule(with_hs, rdDistGeom.ETKDG())
        stripped = rdmolops.RemoveHs(with_hs)
        stripped.SetProp('Fitness', str(np.random.rand(1)[0]))
        return stripped

    all_smiles = (
        'CN=C=O',
        'Cc1ccccc1',
        'CC1=CC2CC(CC1)O2',
        'CCCCCCCCCCCCCCCC',
    )
    return [build(smiles) for smiles in all_smiles]
def smiles_from_seq(self, seq):
    """Calculates the SMILES of a given peptide dendrimer sequence.

    The sequence is split with ``self.split_seq_components``. The molecule
    is then grown from the C-terminal group (if any), the amino acids of
    each generation, one branching unit after each generation while any
    remain, and finally the N-terminal capping (if any). A '-' entry sets
    ``self.metbond`` to True and adds no building block itself.

    Arguments:
        seq {string} -- peptide dendrimer sequence

    Returns:
        string -- molecule_smile - SMILES of the peptide, or '' when the
        sequence yields no building block at all
    """
    gs, bs, terminal, capping = self.split_seq_components(seq)

    def extend(current, fragment_smiles):
        # The first building block starts the molecule; later ones are
        # connected to what has been built so far.
        fragment = rdmolfiles.MolFromSmiles(fragment_smiles)
        if current == '':
            return fragment
        return self.connect_mol(current, fragment)

    # '' marks "nothing built yet"; the C-terminal group, when present,
    # seeds the molecule.
    molecule = ''
    if terminal:
        molecule = rdmolfiles.MolFromSmiles(self.T_SMILES[terminal[0]])

    # creates the dendrimer structure
    for gen in gs:
        for aa in gen:
            if aa == '-':
                self.metbond = True
                continue
            molecule = extend(molecule, self.AA_SMILES[aa])

        # one branching unit is consumed after each generation
        if bs:
            if bs[0] == '-':
                self.metbond = True
                bs.pop(0)
            molecule = extend(molecule, self.B_SMILES[bs[0]])
            bs.pop(0)

    # Nothing was built: there is no molecule to cap or to clean, so return
    # an empty SMILES like the sibling sequence converters do.
    if molecule == '':
        return ''

    # adds capping to the N-terminal (the called clip function is different,
    # because the listed smiles for the capping are already without OH, so no
    # atom has to be removed after forming the new bond)
    if capping:
        molecule = attach_capping(
            molecule, rdmolfiles.MolFromSmiles(self.C_SMILES[capping[0]]))

    # clean the smile from all the tags
    for atom in molecule.GetAtoms():
        atom.SetAtomMapNum(0)

    molecule_smile = rdmolfiles.MolToSmiles(
        molecule, isomericSmiles=True).replace('[N]', 'N').replace('[C]', 'C')
    return molecule_smile
def smiles_from_seq_cyclic(seq):
    """Calculates the SMILES of the given peptide sequence and cyclizes it.

    When the sequence contains 'X', every marker listed in ``NT`` and ``CT``
    is removed from it and ``cyclize`` is called with 1; otherwise with 0.
    Branching units are not used here; when ``verbose`` is set a notice is
    printed if the sequence contains any.

    Arguments:
        seq {string} -- peptide dendrimer sequence

    Returns:
        string -- molecule_smile - SMILES of the peptide, or '' when the
        sequence yields no building block at all
    """
    cy = 1 if 'X' in seq else 0
    if cy:
        for marker in NT:
            seq = seq.replace(marker, '')
        for marker in CT:
            seq = seq.replace(marker, '')

    gs, bs, terminal, capping = split_seq_components(seq)

    # modifies the Cterminal; '' marks "nothing built yet"
    molecule = ''
    if terminal:
        molecule = rdmolfiles.MolFromSmiles(T_SMILES[terminal[0]])

    if bs and verbose:
        print(
            'dendrimer, cyclization not possible, branching unit will not be considered'
        )

    # creates the linear peptide structure
    for gen in gs:
        for aa in gen:
            # 'X' and '-' are markers, not residues. NOTE(review): the '-'
            # methylation marker is not forwarded to connect_mol here, so it
            # has no effect on the result; confirm this is intended.
            if aa == 'X' or aa == '-':
                continue
            if molecule == '':
                molecule = rdmolfiles.MolFromSmiles(AA_SMILES[aa])
            else:
                molecule = connect_mol(
                    molecule, rdmolfiles.MolFromSmiles(AA_SMILES[aa]))

    # Nothing was built: bail out before capping so that attach_capping is
    # never handed the '' placeholder.
    if molecule == '':
        return ''

    # adds capping to the N-terminal (the called clip function is different,
    # because the listed smiles for the capping are already without OH, so no
    # atom has to be removed after forming the new bond)
    if capping:
        molecule = attach_capping(
            molecule, rdmolfiles.MolFromSmiles(C_SMILES[capping[0]]))

    # cyclize
    molecule = cyclize(molecule, cy)

    # clean the smile from all the tags
    for atom in molecule.GetAtoms():
        atom.SetAtomMapNum(0)

    smiles = rdmolfiles.MolToSmiles(
        molecule, isomericSmiles=True).replace('[N]', 'N').replace('[C]', 'C')
    return smiles
def connect_mol(mol1, mol2):
    """Connect a copy of mol2 to every N-terminal of mol1.

    Atoms are tagged through string properties ('Nterm', 'Cterm', 'methyl').
    N-terminals of mol1 are the atoms whose SMARTS is '[N:2]', '[NH2:2]' or
    '[NH:2]'; the C-terminal of mol2 is the atom whose SMARTS is '[C:1]' or
    '[CH:1]'. One bond is formed per N-terminal found in mol1, each time
    against a fresh copy of mol2.

    Note that both input molecules have their atom properties overwritten.

    Arguments:
        mol1 {rdKit mol object} -- first molecule to be connected
        mol2 {rdKit mol object} -- second molecule to be connected

    Returns:
        rdKit mol object -- mol1 updated (connected with mol2, one or more)
    """
    # used internally to recognize a methylated aa:
    metbond = False
    # can be set to exclude or allow methylation,
    # it refers to the possibility of having methylation in the entire GA:
    methyl = False
    # NOTE(review): both flags are local, start as False and are never set
    # to True before they are tested, so the methylation branch below cannot
    # run in this version of the function.

    count = 0

    # detects all the N terminals in mol1
    for atom in mol1.GetAtoms():
        atom.SetProp('Cterm', 'False')
        atom.SetProp('methyl', 'False')
        if atom.GetSmarts() == '[N:2]' or atom.GetSmarts() == '[NH2:2]' or atom.GetSmarts() == '[NH:2]':
            count += 1
            atom.SetProp('Nterm', 'True')
        else:
            atom.SetProp('Nterm', 'False')

    # detects all the C terminals in mol2 (it should be one)
    for atom in mol2.GetAtoms():
        atom.SetProp('Nterm', 'False')
        atom.SetProp('methyl', 'False')
        if atom.GetSmarts() == '[C:1]' or atom.GetSmarts() == '[CH:1]':
            atom.SetProp('Cterm', 'True')
        else:
            atom.SetProp('Cterm', 'False')

    # mol2 is added to all the N terminals of mol1, one per iteration
    for i in range(count):
        combo = rdmolops.CombineMols(mol1, mol2)
        Nterm = []
        Cterm = []

        # saves in two different lists the indices of the atoms which have
        # to be connected
        for atom in combo.GetAtoms():
            if atom.GetProp('Nterm') == 'True':
                Nterm.append(atom.GetIdx())
            if atom.GetProp('Cterm') == 'True':
                Cterm.append(atom.GetIdx())

        # creates the amide bond between the first tagged N and the first
        # tagged C
        edcombo = rdchem.EditableMol(combo)
        edcombo.AddBond(Nterm[0],
                        Cterm[0],
                        order=Chem.rdchem.BondType.SINGLE)
        # drops the atom stored right after the tagged carbon; presumably
        # the leaving group of the C terminal -- TODO confirm against the
        # building-block SMILES
        edcombo.RemoveAtom(Cterm[0] + 1)
        clippedMol = edcombo.GetMol()

        # removes tags and labels from the C-term atom which reacted
        clippedMol.GetAtomWithIdx(Cterm[0]).SetProp('Cterm', 'False')
        clippedMol.GetAtomWithIdx(Cterm[0]).SetAtomMapNum(0)

        # methylates the amide bond (unreachable here, see the note at the top)
        if metbond == True and methyl == True:
            Nterm = []
            Met = []
            methyl = rdmolfiles.MolFromSmiles('[C:4]')
            for atom in methyl.GetAtoms():
                atom.SetProp('methyl', 'True')
                atom.SetProp('Nterm', 'False')
                atom.SetProp('Cterm', 'False')
            metcombo = rdmolops.CombineMols(clippedMol, methyl)
            for atom in metcombo.GetAtoms():
                if atom.GetProp('Nterm') == 'True':
                    Nterm.append(atom.GetIdx())
                if atom.GetProp('methyl') == 'True':
                    Met.append(atom.GetIdx())
            metedcombo = rdchem.EditableMol(metcombo)
            metedcombo.AddBond(Nterm[0],
                               Met[0],
                               order=Chem.rdchem.BondType.SINGLE)
            clippedMol = metedcombo.GetMol()
            clippedMol.GetAtomWithIdx(Met[0]).SetProp('methyl', 'False')
            clippedMol.GetAtomWithIdx(Met[0]).SetAtomMapNum(0)

        # removes tags and labels from the N-term atom which reacted, so the
        # next iteration picks the next free N terminal
        clippedMol.GetAtomWithIdx(Nterm[0]).SetProp('Nterm', 'False')
        clippedMol.GetAtomWithIdx(Nterm[0]).SetAtomMapNum(0)

        # updates the 'core' molecule
        mol1 = clippedMol

    metbond = False
    return mol1
def sample_mol():
    """Return the molecule 'CN=C=O' after embedding, without explicit Hs.

    Hydrogens are added for the ETKDG embedding step and removed again
    before the molecule is returned.
    """
    with_hs = rdmolops.AddHs(rdmolfiles.MolFromSmiles('CN=C=O'),
                             addCoords=True)
    rdDistGeom.EmbedMolecule(with_hs, rdDistGeom.ETKDG())
    return rdmolops.RemoveHs(with_hs)
def parse(self,
          df: pd.DataFrame,
          target_index: Optional[List[int]] = None,
          return_is_successful: bool = True) -> Dict[str, Any]:
    """Parse dataframe using the preprocessor given.

    Each row is processed independently: any exception raised while a row is
    handled is printed (with traceback), counted as a failure and the row is
    skipped, so a bad row never aborts the whole parse.

    Params:
    -------
    df: pd.DataFrame
        DataFrame to be parsed.

    target_index: list of int or None, optional, default=None
        Indices to extract. If None, then all examples (in the dataset)
        are parsed. Allows for easier batching.

    return_is_successful: bool, optional, default=True
        If True, boolean list (representing whether parsing of the sequence
        has succeeded or not) is returned in the key 'is_successful'. If
        False, `None` is returned instead.

    Returns:
    --------
    result: dict
        "dataset": list with one entry per feature column (plus one last
        entry for the labels when `self.labels` is set), each produced by
        `broadcast_array`; the list is empty when no row was parsed
        successfully. "is_successful": np.ndarray with one boolean per row,
        or None when `return_is_successful` is False.
    """
    features = None
    is_successful_list = []
    pp = self.preprocessor
    mutator = self.mutator
    # Only used in the error message printed for rows that fail.
    processed_as = 'sequence' if self.process_as_seq else 'SMILES'

    if target_index is not None:
        df = df.iloc[target_index]

    # Rows are iterated as plain tuples below, so column names are resolved
    # to positional indices once up front.
    data_index = df.columns.get_loc(self.data_col)
    pdb_index = df.columns.get_loc(
        self.pdb_col) if self.pdb_col is not None else None
    pos_index = df.columns.get_loc(
        self.pos_col) if self.pos_col is not None else None
    labels_index = [] if self.labels is None else [
        df.columns.get_loc(l) for l in self.labels
    ]

    fail_count = 0
    success_count = 0
    total_count = df.shape[0]
    for row in tqdm(df.itertuples(index=False), total=total_count):
        data: Optional[Union[str, List[str]]] = row[data_index]
        pdbid = row[pdb_index] if pdb_index is not None else None
        positions = row[pos_index] if pos_index is not None else None
        labels = [row[i] for i in labels_index]
        try:
            # Check for valid data input
            if data is None:
                raise TypeError("Invalid type: {}. Should be str or list " \
                    "of str.".format(type(data).__name__))
            elif len(data) == 0:
                # Raise error for now, if empty list or str is passed in.
                # TODO: Change how each type (molecule or sequence) feature
                # processing handles empty data. If mol.GetNumAtoms() == 0
                # or len(seq) == 0, then a respective FeatureExtractionError
                # should be raised.
                raise ValueError("Cannot process empty data.")

            # SMILES parsing
            if not self.process_as_seq:
                # A configured mutator is only warned about, not applied.
                if mutator is not None:
                    warnings.warn(
                        "SMILES string '{}' cannot be mutated.".format(
                            data))
                # SMILES string can only be processed as rdkit.Mol instance.
                mol = rdmolfiles.MolFromSmiles(data, sanitize=True)
                if mol is None:
                    raise TypeError("Invalid type: {}. Should be " \
                        "rdkit.Chem.rdchem.Mol.".format(type(mol).__name__))

                # Compute features if it is a proper molecule
                if isinstance(pp, MolPreprocessor):
                    input_feats = pp.get_input_feats(mol)
                else:
                    # The comprehension's `pp` is local to the comprehension,
                    # so `type(pp)` in the message below still refers to the
                    # configured preprocessor.
                    valid_preprocessors = [
                        pp.__name__ for pp in preprocess_method_dict.values()
                        if isinstance(pp(), MolPreprocessor)
                    ]
                    raise ValueError("{} cannot compute features for SMILES-based input " \
                        "'{}'. Choose a valid SMILES-based preprocessor: {}.".format( \
                        type(pp).__name__, data, valid_preprocessors))
            else:  # Sequence-based parsing
                if mutator is not None:
                    if pdbid is None:
                        raise ValueError(
                            "PDB ID not specified. Unable to mutate residue."
                        )
                    if positions is None:
                        raise ValueError("Positions not specified. PDBMutator needs " \
                            "residue positions to mutate residues at defined locations.")
                    else:
                        # Raise error for now, as lengths of positions and seqs need to match
                        # to work with the current implementation of mutator.
                        # TODO: Change when implementation of mutator changes.
                        # NOTE: Should we assume that if the len(positions) < len(data), then
                        # the user wants to modify those positions in the sequence?
                        if len(data) != len(positions):
                            raise ValueError("Length of input (N={}) is not the same as number " \
                                "of positions (N={}) to modify. Did you pass in the full " \
                                "sequence? Currently, mutations can only be performed with " \
                                "information about which residue position(s) to modify and the " \
                                "replacement residue(s) at those positions. If you want to " \
                                "process only the input sequence (without any mutations), " \
                                "set mutator=None.".format(len(data), len(positions)))

                    # Mutate residues (to primary or tertiary) based off mutator instance
                    replace_with = {
                        resid: data[i]
                        for i, resid in enumerate(positions)
                    }
                    data = mutator.mutate(pdbid, replace_with=replace_with)

                    # Obtain features based on which preprocessor is used
                    if isinstance(pp, tuple(preprocess_method_dict.values())):
                        input_feats = pp.get_input_feats(data)
                    else:
                        raise NotImplementedError
                else:
                    # Since it is not mutated, the data can now ONLY be a sequence
                    # (since 3D representation cannot be within a single column in a df)
                    if isinstance(pp, SequencePreprocessor):
                        input_feats = pp.get_input_feats(data)
                    else:
                        valid_preprocessors = [
                            pp.__name__ for pp in preprocess_method_dict.values()
                            if isinstance(pp(), SequencePreprocessor)
                        ]
                        raise ValueError("{} cannot compute features for sequence-based input " \
                            "'{}'. Either mutate data (by passing in PDBMutator instance) to " \
                            "'tertiary' structure or choose a valid sequence-based preprocessor: " \
                            "{}.".format(type(pp).__name__, data, valid_preprocessors))
        except Exception as e:
            # If for some reason the data cannot be parsed properly, skip
            print('Error while parsing `{}` as {}, type: {}, {}'.format(\
                data, processed_as, type(e).__name__, e.args))
            traceback.print_exc()
            fail_count += 1
            if return_is_successful:
                is_successful_list.append(False)
            continue

        # Initialize features: list of lists. The number of columns is fixed
        # by the first successfully parsed row.
        if features is None:
            num_feats = len(input_feats) if isinstance(input_feats, tuple) else 1
            if self.labels is not None:
                num_feats += 1
            features = [[] for _ in range(num_feats)]

        # Append computed features to respective cols
        if isinstance(input_feats, tuple):
            for i in range(len(input_feats)):
                features[i].append(input_feats[i])
        else:
            features[0].append(input_feats)

        # Add label values as last column, if provided
        if self.labels is not None:
            features[len(features) - 1].append(labels)

        success_count += 1
        if return_is_successful:
            is_successful_list.append(True)

    print('Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'.format(\
        fail_count, success_count, total_count))

    # Compile feature(s) into individual np.ndarray(s), padding each to max
    # dims, if necessary. NOTE: The num of examples in the dataset depends
    # on the data_format specified (represented by first/last channel).
    # `features` is still None when every row failed; that yields [].
    all_feats = [broadcast_array(feature) for feature in features] if features else []
    if P.data_format() == "batch_last":
        # Move the example axis from the front to the back.
        all_feats = [np.moveaxis(feat, 0, -1) for feat in all_feats]
    is_successful = np.array(
        is_successful_list) if return_is_successful else None
    return {"dataset": all_feats, "is_successful": is_successful}
def smiles_from_seq(seq, cyclize):
    """Calculates the SMILES of a given peptide dendrimer sequence.

    With ``cyclize`` set, only the amino acids are assembled (branching
    units are ignored and 'X' entries are skipped) and the result is passed
    to ``utils.cyclize``. Otherwise the dendrimer is assembled with one
    branching unit after each generation while any remain. A '-' entry adds
    no residue; it sets the ``metbond`` flag handed to ``utils.connect_mol``
    for a following connection.

    Arguments:
        seq {string} -- peptide dendrimer sequence
        cyclize {bool} -- whether to build a linear peptide and cyclize it

    Returns:
        tuple -- (molecule_smile, seq): SMILES of the peptide ('' when no
        building block was produced) and the sequence as passed in
    """
    #seq = seq.replace("-z","z").replace("-Z","Z").replace("-p","p").replace("-P","P")
    gs, bs, terminal, capping = split_seq_components(seq)

    # modifies the Cterminal; '' marks "nothing built yet"
    if terminal:
        molecule = rdmolfiles.MolFromSmiles(T_SMILES[terminal[0]])
    else:
        molecule = ''

    if cyclize and bs:
        print(
            'dendrimer, cyclization not possible, branching unit will not be considered'
        )

    if cyclize:
        # creates the linear peptide structure
        for gen in gs:
            metbond = False
            for aa in gen:
                if aa == 'X':
                    continue
                if aa == '-':
                    metbond = True
                    continue
                if molecule == '':
                    molecule = rdmolfiles.MolFromSmiles(AA_SMILES[aa])
                else:
                    molecule = utils.connect_mol(
                        molecule, rdmolfiles.MolFromSmiles(AA_SMILES[aa]),
                        metbond)
                    # NOTE(review): the flag is reset once it has been
                    # handed to connect_mol; confirm the reset belongs
                    # inside this branch.
                    if metbond:
                        metbond = False
    else:
        # creates the dendrimer structure
        for gen in gs:
            metbond = False
            for aa in gen:
                if aa == '-':
                    metbond = True
                    continue
                if molecule == '':
                    molecule = rdmolfiles.MolFromSmiles(AA_SMILES[aa])
                else:
                    molecule = utils.connect_mol(
                        molecule, rdmolfiles.MolFromSmiles(AA_SMILES[aa]),
                        metbond)
                    if metbond:
                        metbond = False

            # one branching unit is consumed after each generation
            if bs:
                if bs[0] == '-':
                    metbond = True
                    bs.pop(0)
                if molecule == '':
                    molecule = rdmolfiles.MolFromSmiles(B_SMILES[bs[0]])
                else:
                    molecule = utils.connect_mol(
                        molecule, rdmolfiles.MolFromSmiles(B_SMILES[bs[0]]),
                        metbond)
                    if metbond:
                        metbond = False
                bs.pop(0)

    # nothing was built: return an empty SMILES before capping or cleaning
    if molecule == '':
        smiles = ''
        return smiles, seq

    # adds capping to the N-terminal (the called clip function is different,
    # because the listed smiles for the capping are already without OH, so no
    # atom has to be removed after forming the new bond)
    if capping:
        molecule = utils.attach_capping(
            molecule, rdmolfiles.MolFromSmiles(C_SMILES[capping[0]]))

    if cyclize:
        # cy tells utils.cyclize whether the sequence is flagged as cyclic
        if is_cyclic(seq):
            cy = 1
        else:
            cy = 0
        molecule = utils.cyclize(molecule, cy)

    # clean the smile from all the tags
    for atom in molecule.GetAtoms():
        atom.SetAtomMapNum(0)
    molecule_smile = rdmolfiles.MolToSmiles(
        molecule, isomericSmiles=True).replace('[N]', 'N').replace('[C]', 'C')
    return molecule_smile, seq