def __init__(self, selfies=None, selfies_file=None, vocab_file=None):
    """
    Can be initialized from a list of SELFIES, a line-delimited SELFIES
    file, or a file containing only tokens.

    Args:
        selfies (list): the complete set of SELFIES that constitute the
          training dataset
        selfies_file (string): line-delimited file containing the complete
          set of SELFIES that constitute the training dataset
        vocab_file (string): line-delimited file containing all tokens to
          be used in the vocabulary
    """
    if vocab_file is not None:
        # read tokens from file, and add to vocabulary
        all_chars = read_smiles(vocab_file)
        # wrap each token in its own list so that chain() yields whole
        # tokens rather than splitting multi-character tokens apart
        self.characters = list(set(chain(*[[char] for char in all_chars])))
    else:
        # read SELFIES
        if selfies is not None:
            self.selfies = selfies
        elif selfies_file is not None:
            self.selfies = read_smiles(selfies_file)
        else:
            raise ValueError("must provide SELFIES list or file to" + \
                             " instantiate Vocabulary")
        # tokenize all SELFIES in the input and add all tokens to vocabulary
        alphabet = sorted(list(sf.get_alphabet_from_selfies(self.selfies)))
        self.characters = alphabet
        # add padding token
        self.characters.append('<PAD>')
        # add SOS/EOS tokens
        self.characters.append('SOS')
        self.characters.append('EOS')
    # create dictionaries
    self.dictionary = {key: idx for idx, key in enumerate(self.characters)}
    self.reverse_dictionary = {value: key for key, value in \
                               self.dictionary.items()}
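# Usage sketch (hedged): the enclosing class name is not shown in this
# excerpt; 'Vocabulary' below follows the ValueError message above, but the
# full source may expose it under another name (e.g. SelfiesVocabulary).
#
#   vocab = Vocabulary(selfies=['[C][C][O]', '[C][N]'])
#   vocab.dictionary['[C]']         # token -> integer index
#   vocab.reverse_dictionary[0]     # integer index -> token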
def __init__(self, smiles=None, smiles_file=None, vocab_file=None):
    """
    Can be initialized from a list of SMILES, a line-delimited SMILES file,
    or a file containing only tokens.

    Args:
        smiles (list): the complete set of SMILES that constitute the
          training dataset
        smiles_file (string): line-delimited file containing the complete
          set of SMILES that constitute the training dataset
        vocab_file (string): line-delimited file containing all tokens to
          be used in the vocabulary
    """
    if vocab_file is not None:
        # read tokens from file, and add to vocabulary
        self.characters = read_smiles(vocab_file)
    else:
        # read SMILES
        if smiles is not None:
            self.smiles = smiles
        elif smiles_file is not None:
            self.smiles = read_smiles(smiles_file)
        else:
            raise ValueError("must provide SMILES list or file to" + \
                             " instantiate Vocabulary")
        # tokenize all SMILES in the input and add all tokens to vocabulary
        all_chars = [self.tokenize(sm) for sm in self.smiles]
        self.characters = list(set(chain(*all_chars)))
    # add padding token
    if '<PAD>' not in self.characters:
        # ... unless reading a padded vocabulary from file
        self.characters.append('<PAD>')
    # create dictionaries
    self.dictionary = {key: idx for idx, key in enumerate(self.characters)}
    self.reverse_dictionary = {value: key for key, value in \
                               self.dictionary.items()}
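# Illustrative sketch only: self.tokenize is defined elsewhere in this class
# and must split SMILES into multi-character tokens (e.g. 'Cl', 'Br', and
# bracketed atoms). A common regex-based approach, shown purely as an
# assumption about what that method does, looks like:
#
#   import re
#   pattern = re.compile(r"(\[[^\]]+\]|Br|Cl|.)")
#   tokens = pattern.findall("CC(=O)Oc1ccccc1C(=O)O")
#   # ['C', 'C', '(', '=', 'O', ')', 'O', 'c', '1', ...]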
def __init__(self, smiles=None, smiles_file=None, vocab_file=None,
             training_split=0.9):
    """
    Can be initialized from a list of SMILES or a line-delimited SMILES
    file.

    Args:
        smiles (list): the complete set of SMILES that constitute the
          training dataset
        smiles_file (string): line-delimited file containing the complete
          set of SMILES that constitute the training dataset
        vocab_file (string): line-delimited file containing all tokens to
          be used in the vocabulary
        training_split (numeric): proportion of the dataset to use for
          training; the remainder is withheld for validation loss
          calculation
    """
    if smiles:
        self.smiles = smiles
    elif smiles_file:
        self.smiles = read_smiles(smiles_file)
    else:
        raise ValueError("must provide SMILES list or file to" + \
                         " instantiate SmilesDataset")

    # create vocabulary
    if vocab_file:
        self.vocabulary = Vocabulary(vocab_file=vocab_file)
    else:
        self.vocabulary = Vocabulary(smiles=self.smiles)

    # split into training and validation sets
    np.random.seed(0)
    n_smiles = len(self.smiles)
    split = np.random.choice(range(n_smiles),
                             size=int(n_smiles * training_split),
                             replace=False)
    self.training = [self.smiles[idx] for idx in \
                     range(len(self.smiles)) if idx in split]
    self.validation = [self.smiles[idx] for idx in \
                       range(len(self.smiles)) if idx not in split]
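# Usage sketch (illustrative; the file path is hypothetical): with the
# default training_split of 0.9 and the fixed seed above, roughly 90% of the
# input SMILES end up in self.training and the remaining ~10% in
# self.validation.
#
#   dataset = SmilesDataset(smiles_file='data/train.smi')
#   len(dataset.training), len(dataset.validation)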
### CLI
parser = argparse.ArgumentParser()
parser.add_argument('--input_file', type=str)
parser.add_argument('--output_file', type=str)
parser.add_argument('--enum_factor', type=int,
                    help='factor to augment the dataset by')
args = parser.parse_args()

# create the output directory if it does not already exist
output_dir = os.path.dirname(args.output_file)
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# read SMILES
smiles = read_smiles(args.input_file)
# convert to numpy array
smiles = np.asarray(smiles)

# create enumerator
sme = SmilesEnumerator(canonical=False, enum=True)

# also store and write information about enumerated library size
summary = pd.DataFrame()

# enumerate potential SMILES
enum = []
max_tries = 200  ## maximum number of randomized SMILES to generate for each input structure
for sm_idx, sm in enumerate(tqdm(smiles)):
    tries = []
    for try_idx in range(max_tries):
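        # (Aside, not part of the original loop: a single randomized SMILES
        # can also be produced directly with RDKit via
        # Chem.MolToSmiles(mol, canonical=False, doRandom=True); the
        # SmilesEnumerator used above presumably implements the same idea of
        # writing non-canonical atom orderings.)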
                    help='calculate outcomes for molecules in DeepSMILES format',
                    action='store_true')
parser.add_argument('--sampled_files', type=str, nargs='*',
                    help='file(s) containing sampled SMILES')
parser.set_defaults(stop_if_exists=False)
args = parser.parse_args()
print(args)

# make output directories
if not os.path.isdir(args.output_dir):
    os.makedirs(args.output_dir)

# read the training set SMILES, and convert to molecules
org_smiles = read_smiles(args.original_file)
org_mols = [
    mol for mol in clean_mols(
        org_smiles, selfies=args.selfies, deepsmiles=args.deepsmiles)
    if mol
]
org_canonical = [Chem.MolToSmiles(mol) for mol in org_mols]

# define helper function to get the proportion of rotatable bonds
def pct_rotatable_bonds(mol):
    n_bonds = mol.GetNumBonds()
    if n_bonds > 0:
        rot_bonds = Lipinski.NumRotatableBonds(mol) / n_bonds
    else:
        rot_bonds = 0
    return rot_bonds
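# Usage sketch (illustrative molecule, not from the original script): for
# n-butylbenzene, 3 of the 10 bonds are rotatable, so this should return 0.3.
#
#   mol = Chem.MolFromSmiles('CCCCc1ccccc1')
#   pct_rotatable_bonds(mol)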
if args.output_dir is None:
    args.output_dir = os.path.dirname(args.smiles_file)

# optionally stop if output file already exists
filename = os.path.basename(args.smiles_file)
split = os.path.splitext(filename)
output_file = os.path.join(args.output_dir, split[0] + "-outcomes.csv.gz")
if os.path.isfile(output_file) and args.stop_if_exists:
    print("output file " + output_file + " exists: stopping early")
    sys.exit()

# create results container
res = pd.DataFrame()

# read SMILES and convert to molecules
smiles = read_smiles(args.smiles_file)
mols = [mol for mol in clean_mols(smiles, selfies=args.selfies,
                                  deepsmiles=args.deepsmiles) if mol]
canonical = [Chem.MolToSmiles(mol, isomericSmiles=False) for mol in mols]

# also read the reference file
ref_smiles = read_smiles(args.reference_file)
ref_mols = [mol for mol in clean_mols(ref_smiles) if mol]
ref_canonical = [Chem.MolToSmiles(mol, isomericSmiles=False) for mol in \
                 ref_mols]

## drop known molecules
canonical = [sm for sm in canonical if sm not in ref_canonical]
# re-parse molecules
mols = [mol for mol in clean_mols(canonical) if mol]
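# Note (illustrative, not part of the original script): writing canonical
# SMILES with isomericSmiles=False, as above, strips stereochemistry so that
# the string comparison against the reference set is stereo-agnostic, e.g.:
#
#   Chem.MolToSmiles(Chem.MolFromSmiles('C[C@H](N)C(=O)O'), isomericSmiles=False)
#   # expected to give 'CC(N)C(=O)O'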
os.chdir(python_dir)
sys.path.append(python_dir)

# import functions
from functions import clean_mols, remove_salts_solvents, read_smiles, \
    NeutraliseCharges
# import Vocabulary
from datasets import Vocabulary

# parse arguments
input_file = sys.argv[1]
output_file = sys.argv[2]

# read SMILES
basename = os.path.basename(input_file)
smiles = read_smiles(input_file)
# remove duplicated SMILES
smiles = np.unique(smiles)
# record original count
initial_count = len(smiles)
print("parsing " + str(initial_count) + " unique SMILES")

# convert to molecules
mols = clean_mols(smiles, stereochem=False)
# remove molecules that could not be parsed
mols = [mol for mol in mols if mol]
print("parsed " + str(len(mols)) + " unique, valid canonical SMILES")

# remove salts/solvents
mols = [remove_salts_solvents(mol, hac=3) for mol in tqdm(mols)]
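# Usage sketch (the script name and file paths are hypothetical; the argument
# order follows the sys.argv parsing above):
#
#   python preprocess.py data/input.smi data/cleaned.smi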