def __init__(self, selfies=None, selfies_file=None, vocab_file=None):
        """
        Can be initialized from a list of SELFIES, a line-delimited SELFIES
        file, or a vocabulary file containing only tokens.

        Args:
            selfies (list): the complete set of SELFIES that constitute the
              training dataset
            selfies_file (string): line-delimited file containing the complete
              set of SELFIES that constitute the training dataset
            vocab_file (string): line-delimited file containing all tokens to
              be used in the vocabulary
        """
        if vocab_file is not None:
            # read tokens from file, and add to vocabulary
            all_chars = read_smiles(vocab_file)
            # deduplicate tokens without splitting multi-character tokens
            # into individual characters
            self.characters = list(set(all_chars))
        else:
            # read SELFIES
            if selfies is not None:
                self.selfies = selfies
            elif selfies_file is not None:
                self.selfies = read_smiles(selfies_file)
            else:
                raise ValueError("must provide SELFIES list or file to" + \
                                 " instantiate Vocabulary")
            # extract the set of SELFIES tokens in the input and add them to the vocabulary
            alphabet = sorted(list(sf.get_alphabet_from_selfies(self.selfies)))
            self.characters = alphabet

        # add padding token
        if '<PAD>' not in self.characters:
            # ... unless reading a padded vocabulary from file
            self.characters.append('<PAD>')
        # add SOS/EOS tokens
        self.characters.append('SOS')
        self.characters.append('EOS')
        # create dictionaries
        self.dictionary = {key: idx for idx, key in enumerate(self.characters)}
        self.reverse_dictionary = {value: key for key, value in \
                                   self.dictionary.items()}
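
# A minimal usage sketch (not part of the original snippet): it assumes the
# __init__ above belongs to a class named SelfiesVocabulary and that SELFIES
# strings are tokenized with sf.split_selfies; both names are assumptions here.
vocab = SelfiesVocabulary(selfies=['[C][C][O]', '[C][=O]'])   # hypothetical class name
tokens = list(sf.split_selfies('[C][C][O]'))                  # ['[C]', '[C]', '[O]']
indices = [vocab.dictionary[token] for token in tokens]       # encode tokens as integers
decoded = [vocab.reverse_dictionary[idx] for idx in indices]  # decode back to tokens
assert ''.join(decoded) == '[C][C][O]'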
Example #2
    def __init__(self, smiles=None, smiles_file=None, vocab_file=None):
        """
        Can be initialized from a list of SMILES, a line-delimited SMILES
        file, or a vocabulary file containing only tokens.

        Args:
            smiles (list): the complete set of SMILES that constitute the
              training dataset
            smiles_file (string): line-delimited file containing the complete
              set of SMILES that constitute the training dataset
            vocab_file (string): line-delimited file containing all tokens to
              be used in the vocabulary
        """
        if vocab_file is not None:
            # read tokens from file, and add to vocabulary
            self.characters = read_smiles(vocab_file)
        else:
            # read SMILES
            if smiles is not None:
                self.smiles = smiles
            elif smiles_file is not None:
                self.smiles = read_smiles(smiles_file)
            else:
                raise ValueError("must provide SMILES list or file to" + \
                                 " instantiate Vocabulary")
            # tokenize all SMILES in the input and add all tokens to vocabulary
            all_chars = [self.tokenize(sm) for sm in self.smiles]
            self.characters = list(set(chain(*all_chars)))

        # add padding token
        if '<PAD>' not in self.characters:
            # ... unless reading a padded vocabulary from file
            self.characters.append('<PAD>')

        # create dictionaries
        self.dictionary = {key: idx for idx, key in enumerate(self.characters)}
        self.reverse_dictionary = {value: key for key, value in \
                                   self.dictionary.items()}
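
    # A hedged usage sketch (not from the original snippet); the class is
    # assumed to be named Vocabulary, matching the Vocabulary(...) calls in
    # SmilesDataset below, and tokenize() is assumed to return a token list:
    #     vocab = Vocabulary(smiles=['CCO', 'c1ccccc1'])
    #     tokens = vocab.tokenize('CCO')                          # e.g. ['C', 'C', 'O']
    #     indices = [vocab.dictionary[token] for token in tokens]
    #     decoded = [vocab.reverse_dictionary[idx] for idx in indices]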
    def __init__(self,
                 smiles=None,
                 smiles_file=None,
                 vocab_file=None,
                 training_split=0.9):
        """
        Can be initialized from either a list of SMILES or a line-delimited
        SMILES file.

        Args:
            smiles (list): the complete set of SMILES that constitute the
              training dataset
            smiles_file (string): line-delimited file containing the complete
              set of SMILES that constitute the training dataset
            vocab_file (string): line-delimited file containing all tokens to
              be used in the vocabulary
            training_split (numeric): proportion of the dataset used for
              training; the remainder is withheld for validation loss
              calculation
        """
        if smiles:
            self.smiles = smiles
        elif smiles_file:
            self.smiles = read_smiles(smiles_file)
        else:
            raise ValueError("must provide SMILES list or file to" + \
                             " instantiate SmilesDataset")

        # create vocabulary
        if vocab_file:
            self.vocabulary = Vocabulary(vocab_file=vocab_file)
        else:
            self.vocabulary = Vocabulary(smiles=self.smiles)

        # split into training and validation sets
        np.random.seed(0)
        n_smiles = len(self.smiles)
        split = set(np.random.choice(range(n_smiles),
                                     size=int(n_smiles * training_split),
                                     replace=False))
        self.training = [self.smiles[idx] for idx in \
                         range(len(self.smiles)) if idx in split]
        self.validation = [self.smiles[idx] for idx in \
                           range(len(self.smiles)) if idx not in split]
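
# A brief usage sketch (not from the original snippet): the file path is a
# placeholder, and only the attributes visible in __init__ above are used.
dataset = SmilesDataset(smiles_file='data/train.smi', training_split=0.9)
print(len(dataset.training), len(dataset.validation))    # roughly a 90/10 split
print(len(dataset.vocabulary.characters))                # vocabulary size, incl. <PAD>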
### CLI
parser = argparse.ArgumentParser()
parser.add_argument('--input_file', type=str)
parser.add_argument('--output_file', type=str)
parser.add_argument('--enum_factor', type=int,
                    help='factor to augment the dataset by')
args = parser.parse_args()

# check output directory exists
output_dir = os.path.dirname(args.output_file)
if output_dir and not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# read SMILES
smiles = read_smiles(args.input_file)
# convert to numpy array
smiles = np.asarray(smiles)

# create enumerator
sme = SmilesEnumerator(canonical=False, enum=True)

# also store and write information about enumerated library size
summary = pd.DataFrame()

# enumerate potential SMILES
enum = []
max_tries = 200 ## randomized SMILES to generate for each input structure
for sm_idx, sm in enumerate(tqdm(smiles)):
    tries = []
    for try_idx in range(max_tries):
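        # NOTE: the body of this inner loop is truncated in the snippet. A
        # hedged sketch of how it might continue, assuming SmilesEnumerator
        # exposes a randomize_smiles() method and that args.enum_factor caps
        # the number of augmented SMILES kept per input structure:
        #     this_try = sme.randomize_smiles(sm)
        #     tries.append(this_try)
        #     tries = list(np.unique(tries))
        #     if len(tries) > args.enum_factor:
        #         tries = tries[:args.enum_factor]
        #         break
        # enum.extend(tries)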
Example #5
parser.add_argument(
    '--deepsmiles',
    help='calculate outcomes for molecules in DeepSMILES format',
    action='store_true')
parser.add_argument('--sampled_files',
                    type=str,
                    nargs='*',
                    help='file(s) containing sampled SMILES')
parser.set_defaults(stop_if_exists=False)
args = parser.parse_args()
print(args)

# make output directories
if not os.path.isdir(args.output_dir):
    os.makedirs(args.output_dir)

# read the training set SMILES and convert to molecules
org_smiles = read_smiles(args.original_file)
org_mols = [
    mol for mol in clean_mols(
        org_smiles, selfies=args.selfies, deepsmiles=args.deepsmiles) if mol
]
org_canonical = [Chem.MolToSmiles(mol) for mol in org_mols]


# define helper function to get # of rotatable bonds
def pct_rotatable_bonds(mol):
    n_bonds = mol.GetNumBonds()
    if n_bonds > 0:
        rot_bonds = Lipinski.NumRotatableBonds(mol) / n_bonds
    else:
        rot_bonds = 0
    return rot_bonds
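
# Quick illustration of the helper above (the molecule is an arbitrary example,
# not taken from the original script): n-butanol has 4 heavy-atom bonds, 2 of
# which are rotatable, so the expected fraction is 0.5.
example_mol = Chem.MolFromSmiles('CCCCO')
print(pct_rotatable_bonds(example_mol))   # 0.5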
if args.output_dir is None:
    args.output_dir = os.path.dirname(args.smiles_file)

# optionally stop if output file already exists
filename = os.path.basename(args.smiles_file)
split = os.path.splitext(filename)
output_file = os.path.join(args.output_dir, split[0] + "-outcomes.csv.gz")
if os.path.isfile(output_file) and args.stop_if_exists:
    print("output file " + output_file + " exists: stopping early")
    sys.exit()

# create results container
res = pd.DataFrame()

# read SMILES and convert to molecules
smiles = read_smiles(args.smiles_file)
mols = [mol for mol in clean_mols(smiles, selfies=args.selfies,
                                  deepsmiles=args.deepsmiles) if mol]
canonical = [Chem.MolToSmiles(mol, isomericSmiles=False) for mol in mols]

# also read the reference file
ref_smiles = read_smiles(args.reference_file)
ref_mols = [mol for mol in clean_mols(ref_smiles) if mol]
ref_canonical = [Chem.MolToSmiles(mol, isomericSmiles=False) for mol in \
                 ref_mols]

## drop known molecules (use a set for O(1) membership tests)
ref_set = set(ref_canonical)
canonical = [sm for sm in canonical if sm not in ref_set]
# re-parse molecules
mols = [mol for mol in clean_mols(canonical) if mol]
os.chdir(python_dir)
sys.path.append(python_dir)

# import functions
from functions import clean_mols, remove_salts_solvents, read_smiles, \
    NeutraliseCharges
# import Vocabulary
from datasets import Vocabulary

# parse arguments
input_file = sys.argv[1]
output_file = sys.argv[2]

# read SMILES
basename = os.path.basename(input_file)
smiles = read_smiles(input_file)

# remove duplicated SMILES
smiles = np.unique(smiles)
# record original count
initial_count = len(smiles)
print("parsing " + str(initial_count) + " unique SMILES")

# convert to molecules
mols = clean_mols(smiles, stereochem=False)
# remove molecules that could not be parsed
mols = [mol for mol in mols if mol]
print("parsed " + str(len(mols)) + " unique, valid canonical SMILES")

# remove salts/solvents
mols = [remove_salts_solvents(mol, hac=3) for mol in tqdm(mols)]
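
# The snippet ends here. A hedged sketch of how the preprocessing might
# conclude, using only names already visible above (mols, np, tqdm, and
# output_file from sys.argv); Chem is imported here since no rdkit import is
# visible in the snippet. Charge neutralisation with the imported
# NeutraliseCharges would typically also happen at this stage, but its exact
# signature is not shown, so it is omitted from the sketch.
from rdkit import Chem

# drop molecules that failed salt/solvent removal (assuming
# remove_salts_solvents can return None)
mols = [mol for mol in mols if mol]
# convert back to canonical SMILES and deduplicate once more
canonical = np.unique([Chem.MolToSmiles(mol) for mol in mols])
print("writing " + str(len(canonical)) + " canonical SMILES")
# write one SMILES per line to the output file
with open(output_file, 'w') as f:
    for sm in canonical:
        f.write(sm + '\n')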