Esempio n. 1
0
    def prepare_mol(self, mol: rdchem.Mol) -> Tuple[str, rdchem.Mol]:
        """Standardize a molecule and return it with its canonical SMILES.

        The input is written out as canonical SMILES and a new molecule is
        rebuilt from that string. This method should be called before
        `get_input_feats`.

        Params:
        -------
        mol: rdkit.Chem.rdchem.Mol
            Molecule of interest.

        Returns:
        --------
        canonical_smiles: str
            Canonical SMILES representation of the molecule.

        mol: rdkit.Chem.rdchem.Mol
            Molecule rebuilt from the canonical SMILES, with Hs added when
            `self.add_Hs` is set and kekulized when `self.kekulize` is set.
        """
        smiles = rdmolfiles.MolToSmiles(mol, canonical=True)
        rebuilt = rdmolfiles.MolFromSmiles(smiles)

        # Hydrogens are added first; kekulization then runs in place.
        rebuilt = rdmolops.AddHs(rebuilt) if self.add_Hs else rebuilt
        if self.kekulize:
            rdmolops.Kekulize(rebuilt)

        return smiles, rebuilt
Esempio n. 2
0
 def import_csv_data(self, file):
     """Rebuild the dataset from a CSV file of already-processed samples.

     The instance is re-initialised first, then every row is turned into a
     graph. The columns read are 'Smiles', 'Label', 'Index' and 'In Train'.
     Rows whose label equals 1.0 are recorded as positives, all others as
     negatives; rows whose 'In Train' value is the string 'True' go to the
     training indices, the rest to the test indices.

     Params:
         file: path of the CSV file to read.

     Returns:
         A 4-tuple: the result of the parent class's `get_trainTestGraph()`,
         the largest node count seen, and the node and edge feature
         dimensions of the first loaded graph.
     """
     print("Load Processed Data")
     self.__init__()
     self.split = True

     with open(file) as csvfile:
         for record in csv.DictReader(csvfile):
             smiles = record['Smiles']
             graph = GraphBuilder.molToGraph(rf.MolFromSmiles(smiles))
             graph.smiles = smiles
             graph.label = float(record['Label'])
             self.graphList.append(graph)

             sample_index = int(record['Index'])
             label_bucket = (self.positiveInd
                             if graph.label == 1.0 else self.negativeInd)
             label_bucket.append(sample_index)

             self.maxNodes = max(self.maxNodes, graph.nodeNum)

             split_bucket = (self.trainInd
                             if record['In Train'] == 'True' else self.testInd)
             split_bucket.append(sample_index)

     print("Number of Samples in the DataSet: " + str(len(self.graphList)))
     print("Node Number: " + str(self.maxNodes))
     # Feature dimensions are reported from the first graph only.
     first_graph = self.graphList[0]
     print("Node Feature Dimension: " + str(first_graph.nodeFeatureDim))
     print("Edge Feature Dimension: " + str(first_graph.edgeFeatureDim))
     print("Number of Positive Samples: " + str(len(self.positiveInd)))
     print("Number of Negative Samples: " + str(len(self.negativeInd)))

     split_graphs = super().get_trainTestGraph()
     head = self.graphList[0]
     return split_graphs, self.maxNodes, head.nodeFeatureDim, head.edgeFeatureDim
     
Esempio n. 3
0
    def load_csv(self, file = '', col_1 = 'smiles', col_2 = 'label'):
        """Load molecules from a CSV file and append their graphs to the dataset.

        Each row's SMILES string is parsed and converted to a graph. Rows
        with an empty or unparsable SMILES, or whose graph has no edges, are
        reported on stdout and skipped. Accepted graphs get `smiles`, `label`
        and `scaled_label` set (both labels hold the same float) and are
        appended to `self.graphList`; `self.maxNodes` tracks the largest
        `nodeNum` seen.

        Params:
            file: path of the CSV file to read.
            col_1: name of the column holding the SMILES strings.
            col_2: name of the column holding the numeric label.

        Returns:
            None. The dataset is updated in place.

        Raises:
            KeyError: if `col_1` or `col_2` is not a column of the file.
            ValueError: if a label cannot be converted to float.
        """
        print("Load Data")

        # newline='' is what the csv module asks for, so that quoted fields
        # containing line breaks are parsed correctly.
        with open(file, newline='') as csvfile:
            for row_number, row in enumerate(csv.DictReader(csvfile), start=1):
                smiles = row[col_1]
                # A missing/empty cell is treated like an unparsable SMILES.
                mol = rf.MolFromSmiles(smiles) if smiles else None
                if not mol:
                    print("Row " + str(row_number) + " has no smiles/mol. \n")
                    continue

                graph = GraphBuilder.molToGraph(mol)
                if len(graph.edges) == 0:
                    print("Row " + str(row_number) + " has no edge. \n")
                    continue

                label = float(row[col_2])
                graph.smiles = smiles
                graph.label = label
                graph.scaled_label = label
                self.graphList.append(graph)

                if graph.nodeNum > self.maxNodes:
                    self.maxNodes = graph.nodeNum

        print("Number of Samples in the DataSet: " + str(len(self.graphList)))
        print("Node Number: " + str(self.maxNodes))
        if self.graphList:
            first_graph = self.graphList[0]
            print("Node Feature Dimension: " + str(first_graph.nodeFeatureDim))
            print("Edge Feature Dimension: " + str(first_graph.edgeFeatureDim))
        else:
            # Every row was skipped (or the file had no rows): there is no
            # graph to read feature dimensions from, so do not index into
            # an empty list.
            print("No valid samples were loaded.")
        return
Esempio n. 4
0
 def import_csv_data(self, file):
     """Rebuild the dataset from a CSV file of already-processed samples.

     The instance is re-initialised first, then every row is turned into a
     graph. The columns read are 'Smiles', 'Label', 'Scaled Label', 'Index'
     and 'In Train'. Rows whose 'In Train' value is the string 'True' go to
     the training indices, the rest to the test indices. The label mean and
     variance are then recomputed from the loaded graphs.

     Params:
         file: path of the CSV file to read.

     Returns:
         A 4-tuple: the result of `get_trainTestGraph()`, the largest node
         count seen, and the node and edge feature dimensions of the first
         loaded graph.
     """
     print("Load Processed Data")
     self.__init__()
     self.split = True

     with open(file) as csvfile:
         for record in csv.DictReader(csvfile):
             smiles = record['Smiles']
             graph = GraphBuilder.molToGraph(rf.MolFromSmiles(smiles))
             graph.smiles = smiles
             graph.label = float(record['Label'])
             graph.scaled_label = float(record['Scaled Label'])
             self.graphList.append(graph)

             self.maxNodes = max(self.maxNodes, graph.nodeNum)

             split_bucket = (self.trainInd
                             if record['In Train'] == 'True' else self.testInd)
             split_bucket.append(int(record['Index']))

     self.mean, self.var = self.__getMeanVar(self.graphList)
     sd = math.sqrt(self.var)

     print("Number of Samples in the DataSet: " + str(len(self.graphList)))
     print("Data Mean: " + str(self.mean))
     print("Data Sd: " + str(sd))
     print("Node Number: " + str(self.maxNodes))
     # Feature dimensions are reported from the first graph only.
     first_graph = self.graphList[0]
     print("Node Feature Dimension: " + str(first_graph.nodeFeatureDim))
     print("Edge Feature Dimension: " + str(first_graph.edgeFeatureDim))

     split_graphs = self.get_trainTestGraph()
     head = self.graphList[0]
     return split_graphs, self.maxNodes, head.nodeFeatureDim, head.edgeFeatureDim
    def _transform_input_features(self, input_features):
        """Convert SMILES inputs into a stacked array of fingerprint vectors.

        Every input is parsed as SMILES, turned into a Morgan fingerprint bit
        vector, and that fingerprint is converted to a binary vector. The
        three stages run one after another over the whole input.

        :param input_features: input features (SMILES strings)
        :type input_features: np.array
        :return: one binary vector per input (256 elements each according to
            the original documentation; the length is set by the helpers)
        :rtype: np.array
        """
        molecules = [
            rdmolfiles.MolFromSmiles(smiles) for smiles in input_features
        ]
        fingerprints = [
            self._get_morgan_fingerprint_as_bit_vect(molecule)
            for molecule in molecules
        ]

        rows = []
        for fingerprint in fingerprints:
            bits = self._morgan_fingerprint_to_binary_vector(fingerprint)
            rows.append(np.array(bits))

        return np.array(rows)
	def assess_retro_template(self, template, reactant_mol_list, retro_outcomes):
		"""Return the fraction of generated precursor sets that contain a recorded reactant.

		Reactants and generated outcomes are compared through their InChI
		strings. When `cc.USE_STEREOCHEMISTRY` is False, every '@' is removed
		from the SMILES before comparison; in that mode the
		`reactant_mol_list` argument is ignored and the reactants are rebuilt
		from `self.reactant_list` (empty entries are skipped).

		Parameters:
			template (str): Retro-reaction template SMIRKS string. It is only parsed here; the
				resulting reaction object is not used for the assessment.
			reactant_mol_list (list): Recorded reactant molecules as RDKit Mols. Used only when
				`cc.USE_STEREOCHEMISTRY` is not False.
			retro_outcomes (iterable of str): Generated outcomes; each item is a SMILES string
				whose '.'-separated parts are the precursors of one outcome.

		Returns:
			selectivity (float): the fraction of outcomes sharing at least one precursor with the
				recorded reactants
				i.e. 1.0 - match or match.match or match.match.match etc.
					0.5 - match.none or match.none.match.none etc.
					0.0 - none, or no outcomes were supplied

		Notes:
			Requires an outcome list from the function 'check_retro_template_outcome()'
		"""
		# Kept from the original flow so that any error the parser raises for a
		# malformed template still surfaces here.
		rdChemReactions.ReactionFromSmarts(template)

		strip_stereo = cc.USE_STEREOCHEMISTRY is False
		if strip_stereo:
			reactant_mol_list = [
				rdmolfiles.MolFromSmiles(smiles.replace('@', ''))
				for smiles in self.reactant_list
				if smiles
			]

		reactant_inchi = {Chem.MolToInchi(reactant) for reactant in reactant_mol_list}

		matched = 0
		total = 0
		for outcome_set in retro_outcomes:
			total += 1
			outcome_inchi = set()
			for outcome in outcome_set.split('.'):
				if strip_stereo:
					outcome = outcome.replace('@', '')
				outcome_inchi.add(Chem.MolToInchi(Chem.MolFromSmiles(outcome)))
			# One shared precursor is enough for the outcome to count as a match.
			if outcome_inchi & reactant_inchi:
				matched += 1

		# No outcomes means nothing matched; avoid dividing by zero.
		if total == 0:
			return 0.0
		return matched / total
    def test_get_morgan_fingerprint_as_bit_vect(self):
        """Compare the generated Morgan fingerprint with the pickled reference."""
        molecule = rdmolfiles.MolFromSmiles(
            'CCCCCCCC(=O)OCC(COC(=O)CCCCCCC)OC(=O)CCCCCCC')
        actual = self.dataset_generator._get_morgan_fingerprint_as_bit_vect(
            molecule)

        # The expected fingerprint is stored as a pickle next to the tests.
        reference_path = os.path.join(self.current_dir_path,
                                      'data/morgan_fingerprint.pkl')
        with open(reference_path, 'rb') as handle:
            expected = pickle.load(handle)

        self.assertEqual(actual, expected)
Esempio n. 8
0
def test_mols():
    """Build four sample molecules, each tagged with a random 'Fitness' property.

    Every SMILES is parsed, given explicit hydrogens with coordinates, run
    through ETKDG embedding, stripped of hydrogens again and then assigned a
    'Fitness' string drawn from `np.random.rand`.

    Returns:
        list of the resulting RDKit molecules, in the order of the SMILES.
    """

    def _build(smiles):
        with_hs = rdmolops.AddHs(rdmolfiles.MolFromSmiles(smiles),
                                 addCoords=True)
        rdDistGeom.EmbedMolecule(with_hs, rdDistGeom.ETKDG())
        stripped = rdmolops.RemoveHs(with_hs)
        stripped.SetProp('Fitness', str(np.random.rand(1)[0]))
        return stripped

    sample_smiles = (
        'CN=C=O',
        'Cc1ccccc1',
        'CC1=CC2CC(CC1)O2',
        'CCCCCCCCCCCCCCCC',
    )
    return [_build(smiles) for smiles in sample_smiles]
Esempio n. 9
0
    def smiles_from_seq(self, seq):
        """Calculates the smiles of a given peptide dendrimer sequence

        The sequence is split into generations, branching units, terminal
        and capping. The molecule starts from the terminal (if any) and each
        amino acid / branching unit is attached with `self.connect_mol`.
        A '-' entry sets `self.metbond` and is otherwise skipped.

        Arguments:
            seq {string} -- peptide dendrimer sequence
        Returns:
            string -- molecule_smile - SMILES of the peptide, or '' when the
            sequence yields no building block at all
        """

        gs, bs, terminal, capping = self.split_seq_components(seq)

        # modifies the Cterminal; '' marks "nothing built yet"
        if terminal:
            molecule = rdmolfiles.MolFromSmiles(self.T_SMILES[terminal[0]])
        else:
            molecule = ''

        # creates the dendrimer structure
        for gen in gs:
            for aa in gen:
                if aa == '-':
                    self.metbond = True
                    continue
                residue = rdmolfiles.MolFromSmiles(self.AA_SMILES[aa])
                if molecule == '':
                    molecule = residue
                else:
                    molecule = self.connect_mol(molecule, residue)

            # one branching unit is consumed after each generation
            if bs:
                if bs[0] == '-':
                    self.metbond = True
                    bs.pop(0)
                branch = rdmolfiles.MolFromSmiles(self.B_SMILES[bs[0]])
                if molecule == '':
                    molecule = branch
                else:
                    molecule = self.connect_mol(molecule, branch)
                bs.pop(0)

        # Nothing was built: return an empty SMILES instead of calling Mol
        # methods on the '' placeholder.
        if molecule == '':
            return ''

        # adds capping to the N-terminal (the called clip function is different, because the listed smiles
        # for the capping are already without OH, it is not necessary removing any atom after forming the new bond)
        if capping:
            molecule = attach_capping(
                molecule, rdmolfiles.MolFromSmiles(self.C_SMILES[capping[0]]))

        # clean the smile from all the tags
        for atom in molecule.GetAtoms():
            atom.SetAtomMapNum(0)

        molecule_smile = rdmolfiles.MolToSmiles(molecule, isomericSmiles=True)
        return molecule_smile.replace('[N]', 'N').replace('[C]', 'C')
Esempio n. 10
0
def smiles_from_seq_cyclic(seq):
    """Calculates the smiles of the given peptide sequence and cyclize it

    If the sequence contains 'X', every entry of `NT` and `CT` is removed
    from it and the cyclization flag passed to `cyclize` is 1; otherwise the
    flag is 0. Branching units are not used (a notice is printed when
    `verbose` is set). 'X' and '-' entries inside a generation are skipped.

        Arguments:
            seq {string} -- peptide dendrimer sequence
        Returns:
            string -- molecule_smile - SMILES of the peptide, or '' when the
            sequence yields no building block at all
    """
    if 'X' in seq:
        cy = 1
        for token in NT:
            seq = seq.replace(token, '')
        for token in CT:
            seq = seq.replace(token, '')
    else:
        cy = 0

    gs, bs, terminal, capping = split_seq_components(seq)

    # modifies the Cterminal; '' marks "nothing built yet"
    if terminal:
        molecule = rdmolfiles.MolFromSmiles(T_SMILES[terminal[0]])
    else:
        molecule = ''

    if bs and verbose:
        print(
            'dendrimer, cyclization not possible, branching unit will not be considered'
        )

    # creates the linear peptide structure
    for gen in gs:
        for aa in gen:
            # 'X' is the cyclization marker and '-' the methylation marker;
            # neither is a residue, and this function does not act on '-'.
            if aa == 'X' or aa == '-':
                continue
            residue = rdmolfiles.MolFromSmiles(AA_SMILES[aa])
            if molecule == '':
                molecule = residue
            else:
                molecule = connect_mol(molecule, residue)

    # Nothing was built: bail out before capping/cyclizing so that the ''
    # placeholder is never handed to the molecule helpers.
    if molecule == '':
        return ''

    # adds capping to the N-terminal (the called clip function is different, because the listed smiles
    # for the capping are already without OH, it is not necessary removing any atom after forming the new bond)
    if capping:
        molecule = attach_capping(
            molecule, rdmolfiles.MolFromSmiles(C_SMILES[capping[0]]))

    # cyclize
    molecule = cyclize(molecule, cy)

    # clean the smile from all the tags
    for atom in molecule.GetAtoms():
        atom.SetAtomMapNum(0)

    smiles = rdmolfiles.MolToSmiles(molecule, isomericSmiles=True)
    return smiles.replace('[N]', 'N').replace('[C]', 'C')
Esempio n. 11
0
def connect_mol(mol1, mol2):
    """Attach a copy of mol2 to every tagged N-terminal atom of mol1.

        N-terminal atoms of mol1 are those whose SMARTS is '[N:2]',
        '[NH2:2]' or '[NH:2]'; the C-terminal atom of mol2 is the one whose
        SMARTS is '[C:1]' or '[CH:1]'. For each N-terminal found, mol1 and
        mol2 are combined, a single bond is added between the first tagged
        N-terminal and the first tagged C-terminal, and the atom at index
        `Cterm[0] + 1` is removed.

        Both input molecules have the string properties 'Nterm', 'Cterm'
        and 'methyl' set on all of their atoms as a side effect.

        Arguments:
            mol1 {rdKit mol object} -- first molecule to be connected
            mol2 {rdKit mol object} -- second molecule to be connected

        Returns:
            rdKit mol object -- mol1 updated (connected with mol2, one or more);
            mol1 itself is returned unchanged when it has no tagged N-terminal

        Raises:
            IndexError -- if mol1 has a tagged N-terminal but mol2 has no
            tagged C-terminal (`Cterm[0]` on an empty list)
    """
    # used internally to recognize a methylated aa:
    metbond = False
    # can be set with exclude or allow methylation,
    # it refers to the possibility of having methylation in the entire GA:
    methyl = False
    # NOTE(review): both flags are locals fixed to False here, so the
    # "methylates amide bond" branch below can never run as written.
    # Presumably they were meant to be module-level or passed in -- confirm.

    count = 0

    # detects all the N terminals in mol1 and counts them
    for atom in mol1.GetAtoms():
        atom.SetProp('Cterm', 'False')
        atom.SetProp('methyl', 'False')
        if atom.GetSmarts() == '[N:2]' or atom.GetSmarts(
        ) == '[NH2:2]' or atom.GetSmarts() == '[NH:2]':
            count += 1
            atom.SetProp('Nterm', 'True')
        else:
            atom.SetProp('Nterm', 'False')

    # detects all the C terminals in mol2 (it should be one)
    for atom in mol2.GetAtoms():
        atom.SetProp('Nterm', 'False')
        atom.SetProp('methyl', 'False')
        if atom.GetSmarts() == '[C:1]' or atom.GetSmarts() == '[CH:1]':
            atom.SetProp('Cterm', 'True')
        else:
            atom.SetProp('Cterm', 'False')

    # mol2 is added to all the N terminals of mol1, one per iteration
    for i in range(count):
        combo = rdmolops.CombineMols(mol1, mol2)
        Nterm = []
        Cterm = []
        # saves in two different lists the index of the atoms which have to be connected
        for atom in combo.GetAtoms():
            if atom.GetProp('Nterm') == 'True':
                Nterm.append(atom.GetIdx())
            if atom.GetProp('Cterm') == 'True':
                Cterm.append(atom.GetIdx())

        # creates the amide bond
        edcombo = rdchem.EditableMol(combo)
        edcombo.AddBond(Nterm[0], Cterm[0], order=Chem.rdchem.BondType.SINGLE)
        # removes the atom stored right after the C-terminal atom
        # (presumably the leaving group of the building block -- TODO confirm)
        edcombo.RemoveAtom(Cterm[0] + 1)
        clippedMol = edcombo.GetMol()

        # removes tags and labels from the C-terminal atom which reacted
        clippedMol.GetAtomWithIdx(Cterm[0]).SetProp('Cterm', 'False')
        clippedMol.GetAtomWithIdx(Cterm[0]).SetAtomMapNum(0)

        # methylates amide bond (unreachable while both flags are False above)
        if metbond == True and methyl == True:
            Nterm = []
            Met = []
            # NOTE(review): this rebinds `methyl` from the boolean flag to a molecule.
            methyl = rdmolfiles.MolFromSmiles('[C:4]')
            for atom in methyl.GetAtoms():
                atom.SetProp('methyl', 'True')
                atom.SetProp('Nterm', 'False')
                atom.SetProp('Cterm', 'False')
            metcombo = rdmolops.CombineMols(clippedMol, methyl)
            for atom in metcombo.GetAtoms():
                if atom.GetProp('Nterm') == 'True':
                    Nterm.append(atom.GetIdx())
                if atom.GetProp('methyl') == 'True':
                    Met.append(atom.GetIdx())
            metedcombo = rdchem.EditableMol(metcombo)
            metedcombo.AddBond(Nterm[0],
                               Met[0],
                               order=Chem.rdchem.BondType.SINGLE)
            clippedMol = metedcombo.GetMol()
            clippedMol.GetAtomWithIdx(Met[0]).SetProp('methyl', 'False')
            clippedMol.GetAtomWithIdx(Met[0]).SetAtomMapNum(0)

        # removes tags and labels from the N-terminal atom which reacted, so
        # the next iteration picks the next still-tagged N-terminal
        clippedMol.GetAtomWithIdx(Nterm[0]).SetProp('Nterm', 'False')
        clippedMol.GetAtomWithIdx(Nterm[0]).SetAtomMapNum(0)

        # updates the 'core' molecule
        mol1 = clippedMol
    metbond = False
    return mol1
Esempio n. 12
0
def sample_mol():
    """Return the sample molecule 'CN=C=O' after ETKDG embedding, with Hs removed."""
    parsed = rdmolfiles.MolFromSmiles('CN=C=O')
    with_hs = rdmolops.AddHs(parsed, addCoords=True)
    # Embedding runs on the molecule with explicit hydrogens; they are
    # stripped again only for the returned molecule.
    rdDistGeom.EmbedMolecule(with_hs, rdDistGeom.ETKDG())
    without_hs = rdmolops.RemoveHs(with_hs)
    return without_hs
Esempio n. 13
0
    def parse(self,
              df: pd.DataFrame,
              target_index: Optional[List[int]] = None,
              return_is_successful: bool = True) -> Dict[str, Any]:
        """Parse dataframe using the preprocessor given.

        Each row's data column is converted to input features, either as a
        SMILES string (when `self.process_as_seq` is falsy) or as a sequence.
        Rows that raise any exception during parsing are reported on stdout
        (with traceback), counted as failures and skipped.

        Params:
        -------
        df: pd.DataFrame
            DataFrame to be parsed.

        target_index: list of int or None, optional, default=None
            Indicies to extract. If None, then all examples (in the dataset)
            are parsed. Allows for easier batching.

        return_is_successful: bool, optional, default=True
            If True, boolean list (representing whether parsing of the
            sequence has succeeded or not) is returned in the key
            'is_successful'. If False, `None` is returned instead.

        Returns:
        --------
        dict with two keys:
            'dataset': list of arrays, one per feature returned by the
                preprocessor, followed by one array of labels when
                `self.labels` is set. Empty list if no row was parsed
                successfully. The first axis is moved to the last position
                when `P.data_format()` is "batch_last".
            'is_successful': np.ndarray of bool with one entry per row, or
                None when `return_is_successful` is False.
        """
        features = None
        is_successful_list = []
        pp = self.preprocessor
        mutator = self.mutator
        processed_as = 'sequence' if self.process_as_seq else 'SMILES'

        if target_index is not None:
            df = df.iloc[target_index]

        # Column positions are resolved once, because rows are iterated as
        # plain tuples (itertuples(index=False)) below.
        data_index = df.columns.get_loc(self.data_col)
        pdb_index = df.columns.get_loc(
            self.pdb_col) if self.pdb_col is not None else None
        pos_index = df.columns.get_loc(
            self.pos_col) if self.pos_col is not None else None
        labels_index = [] if self.labels is None else [
            df.columns.get_loc(l) for l in self.labels
        ]

        fail_count = 0
        success_count = 0
        total_count = df.shape[0]
        for row in tqdm(df.itertuples(index=False), total=total_count):
            data: Optional[Union[str, List[str]]] = row[data_index]
            pdbid = row[pdb_index] if pdb_index is not None else None
            positions = row[pos_index] if pos_index is not None else None
            labels = [row[i] for i in labels_index]

            try:
                # Check for valid data input
                if data is None:
                    raise TypeError("Invalid type: {}. Should be str or list " \
                        "of str.".format(type(data).__name__))
                elif len(data) == 0:
                    # Raise error for now, if empty list or str is passed in.
                    # TODO: Change how each type (molecule or sequence) feature
                    # processing handles empty data. If mol.GetNumAtoms() == 0
                    # or len(seq) == 0, then a respective FeatureExtractionError
                    # should be raised.
                    raise ValueError("Cannot process empty data.")

                # SMILES parsing
                if not self.process_as_seq:
                    # A mutator is only warned about here, not applied.
                    if mutator is not None:
                        warnings.warn(
                            "SMILES string '{}' cannot be mutated.".format(
                                data))

                    # SMILES string can only be processed as rdkit.Mol instance.
                    mol = rdmolfiles.MolFromSmiles(data, sanitize=True)
                    if mol is None:
                        raise TypeError("Invalid type: {}. Should be " \
                            "rdkit.Chem.rdchem.Mol.".format(type(mol).__name__))

                    # Compute features if its a proper molecule
                    if isinstance(pp, MolPreprocessor):
                        input_feats = pp.get_input_feats(mol)
                    else:
                        # Builds the list of usable preprocessor names for the
                        # error message; each registered class is instantiated
                        # to test its type. The comprehension's `pp` is local
                        # to the comprehension and does not rebind the outer `pp`.
                        valid_preprocessors = [
                            pp.__name__
                            for pp in preprocess_method_dict.values()
                            if isinstance(pp(), MolPreprocessor)
                        ]
                        raise ValueError("{} cannot compute features for SMILES-based input " \
                            "'{}'. Choose a valid SMILES-based preprocessor: {}.".format( \
                            type(pp).__name__, data, valid_preprocessors))
                else:
                    # Sequence-based parsing
                    if mutator is not None:
                        if pdbid is None:
                            raise ValueError(
                                "PDB ID not specified. Unable to mutate residue."
                            )

                        if positions is None:
                            raise ValueError("Positions not specified. PDBMutator needs " \
                                "residue positions to mutate residues at defined locations.")
                        else:
                            # Raise error for now, as lengths of positions and seqs need to match
                            # to work with the current implementation of mutator.
                            # TODO: Change when implementation of mutator changes.
                            # NOTE: Should we assume that if the len(positions) < len(data), then
                            # the user wants to modify those positions in the sequence?
                            if len(data) != len(positions):
                                raise ValueError("Length of input (N={}) is not the same as number " \
                                    "of positions (N={}) to modify. Did you pass in the full " \
                                    "sequence? Currently, mutations can only be performed with " \
                                    "information about which residue position(s) to modify and the " \
                                    "replacement residue(s) at those positions. If you want to " \
                                    "process only the input sequence (without any mutations), " \
                                    "set mutator=None.".format(len(data), len(positions)))
                            # Mutate residues (to primary or tertiary) based off mutator instance.
                            # The i-th entry of `data` replaces the residue at the i-th position.
                            replace_with = {
                                resid: data[i]
                                for i, resid in enumerate(positions)
                            }
                            data = mutator.mutate(pdbid,
                                                  replace_with=replace_with)

                        # Obtain features based on which preprocessor is used:
                        # any instance of a registered preprocessor class is accepted.
                        if isinstance(pp,
                                      tuple(preprocess_method_dict.values())):
                            input_feats = pp.get_input_feats(data)
                        else:
                            raise NotImplementedError
                    else:
                        # Since it is not mutated, the data can now ONLY be a sequence
                        # (since 3D representation cannot be within a single column in a df)
                        if isinstance(pp, SequencePreprocessor):
                            input_feats = pp.get_input_feats(data)
                        else:
                            # Same name-listing approach as in the SMILES branch above.
                            valid_preprocessors = [
                                pp.__name__
                                for pp in preprocess_method_dict.values()
                                if isinstance(pp(), SequencePreprocessor)
                            ]
                            raise ValueError("{} cannot compute features for sequence-based input " \
                                "'{}'. Either mutate data (by passing in PDBMutator instance) to " \
                                "'tertiary' structure or choose a valid sequence-based preprocessor: " \
                                "{}.".format(type(pp).__name__, data, valid_preprocessors))
            except Exception as e:
                # If for some reason the data cannot be parsed properly, skip.
                # The broad catch is deliberate: one bad row must not abort the
                # whole dataset; the failure is printed and recorded instead.
                print('Error while parsing `{}` as {}, type: {}, {}'.format(\
                    data, processed_as, type(e).__name__, e.args))
                traceback.print_exc()
                fail_count += 1
                if return_is_successful:
                    is_successful_list.append(False)
                continue

            # Initialize features: list of lists. The number of columns is
            # fixed by the first successfully parsed row (one per element if
            # the preprocessor returned a tuple, else one), plus one column
            # for labels when they are configured.
            if features is None:
                num_feats = len(input_feats) if isinstance(input_feats,
                                                           tuple) else 1
                if self.labels is not None:
                    num_feats += 1
                features = [[] for _ in range(num_feats)]

            # Append computed features to respective cols
            if isinstance(input_feats, tuple):
                for i in range(len(input_feats)):
                    features[i].append(input_feats[i])
            else:
                features[0].append(input_feats)

            # Add label values as last column, if provided
            if self.labels is not None:
                features[len(features) - 1].append(labels)

            success_count += 1
            if return_is_successful:
                is_successful_list.append(True)

        print('Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'.format(\
            fail_count, success_count, total_count))

        # Compile feature(s) into individual np.ndarray(s), padding each to max
        # dims, if necessary. NOTE: The num of examples in the dataset depends
        # on the data_format specified (represented by first/last channel).
        # `features` is still None when every row failed, hence the fallback to [].
        all_feats = [broadcast_array(feature)
                     for feature in features] if features else []
        if P.data_format() == "batch_last":
            all_feats = [np.moveaxis(feat, 0, -1) for feat in all_feats]
        is_successful = np.array(
            is_successful_list) if return_is_successful else None
        return {"dataset": all_feats, "is_successful": is_successful}
Esempio n. 14
0
def smiles_from_seq(seq, cyclize):
    """Calculates the smiles of a given peptide dendrimer sequence

    The molecule starts from the terminal (if any) and grows by connecting
    each amino acid of every generation. A '-' entry flags the next
    connection as a methylated bond. Without `cyclize`, one branching unit
    is attached after each generation; with `cyclize`, 'X' entries are
    skipped, branching units are ignored (a notice is printed if any are
    present) and the finished molecule is passed to `utils.cyclize`.

    Arguments:
        seq {string} -- peptide dendrimer sequence
        cyclize {bool} -- whether to build the cyclic (linear-only) variant
    Returns:
        tuple -- (molecule_smile, seq): SMILES of the peptide ('' when
        nothing could be built) and the sequence as given
    """
    gs, bs, terminal, capping = split_seq_components(seq)

    # modifies the Cterminal; '' marks "nothing built yet"
    molecule = (rdmolfiles.MolFromSmiles(T_SMILES[terminal[0]])
                if terminal else '')

    if cyclize and bs:
        print(
            'dendrimer, cyclization not possible, branching unit will not be considered'
        )

    for gen in gs:
        metbond = False
        for aa in gen:
            if aa == '-':
                metbond = True
                continue
            # 'X' is only a marker in the cyclic variant, not a residue
            if cyclize and aa == 'X':
                continue

            residue = rdmolfiles.MolFromSmiles(AA_SMILES[aa])
            if molecule == '':
                molecule = residue
            else:
                molecule = utils.connect_mol(molecule, residue, metbond)
                # the methylation flag applies to a single connection
                metbond = False

        # branching units belong to the dendrimer structure only
        if cyclize or not bs:
            continue

        if bs[0] == '-':
            metbond = True
            bs.pop(0)
        branch = rdmolfiles.MolFromSmiles(B_SMILES[bs[0]])
        if molecule == '':
            molecule = branch
        else:
            molecule = utils.connect_mol(molecule, branch, metbond)
            metbond = False
        bs.pop(0)

    if molecule == '':
        return '', seq

    # adds capping to the N-terminal (the capping smiles are already without
    # OH, so no atom has to be removed after forming the new bond)
    if capping:
        molecule = utils.attach_capping(
            molecule, rdmolfiles.MolFromSmiles(C_SMILES[capping[0]]))

    if cyclize:
        cy = 1 if is_cyclic(seq) else 0
        molecule = utils.cyclize(molecule, cy)

    # clean the smile from all the tags
    for atom in molecule.GetAtoms():
        atom.SetAtomMapNum(0)

    raw_smiles = rdmolfiles.MolToSmiles(molecule, isomericSmiles=True)
    molecule_smile = raw_smiles.replace('[N]', 'N').replace('[C]', 'C')
    return molecule_smile, seq