def setUp(self):
    """
    Set up for tests.

    Writes molecules and targets to files.
    """
    self.temp_dir = tempfile.mkdtemp()
    smiles = ['CC(=O)OC1=CC=CC=C1C(=O)O',
              'C[C@@H](C1=CC=C(C=C1)CC(C)C)C(=O)O']
    self.names = ['aspirin', 'ibuprofen']
    engine = conformers.ConformerGenerator(max_conformers=1)
    self.mols = []
    self.smiles = []  # use RDKit-generated SMILES
    # pair each SMILES with its name directly; iterating with
    # xrange(len(...)) and indexing is an anti-pattern
    for name, smi in zip(self.names, smiles):
        mol = Chem.MolFromSmiles(smi)
        mol.SetProp('_Name', name)
        self.mols.append(engine.generate_conformers(mol))
        self.smiles.append(
            Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True))

    # write mols to a temporary SDF file
    _, self.input_filename = tempfile.mkstemp(suffix='.sdf',
                                              dir=self.temp_dir)
    writer = serial.MolWriter()
    writer.open(self.input_filename)
    writer.write(self.mols)
    writer.close()

    # write targets (one label per molecule) to a pickle
    self.targets = [0, 1]
    _, self.targets_filename = tempfile.mkstemp(suffix='.pkl',
                                                dir=self.temp_dir)
    write_pickle(self.targets, self.targets_filename)
def main(active_filename, decoy_filename, output_filename,
         assign_stereo_from_3d=False):
    """
    Construct target files for datasets with active/decoy labels.

    Parameters
    ----------
    active_filename : str
        Active molecule filename.
    decoy_filename : str
        Decoy molecule filename.
    output_filename : str
        Output filename.
    assign_stereo_from_3d : bool, optional (default False)
        Assign stereochemistry from 3D coordinates.
    """
    actives = get_smiles(active_filename, assign_stereo_from_3d)
    decoys = get_smiles(decoy_filename, assign_stereo_from_3d)
    # actives are labeled 1, decoys 0, in the same order as the SMILES
    active_labels = np.ones(len(actives), dtype=int)
    decoy_labels = np.zeros(len(decoys), dtype=int)
    data = {'smiles': np.concatenate((actives, decoys)),
            'targets': np.concatenate((active_labels, decoy_labels))}
    write_pickle(data, output_filename)
def main(input_filename, map_filename, output_filename, column_indices=None):
    """
    Get regression targets.

    Parameters
    ----------
    input_filename : str
        PCBA data filename.
    map_filename : str
        ID->SMILES map filename.
    output_filename : str
        Output filename.
    column_indices : list, optional
        Column indices to include. If None, compounds are classified by
        activity.
    """
    parser = PcbaParser(input_filename, map_filename,
                        column_indices=column_indices)
    # when specific columns are requested, echo them for the user
    if column_indices is not None:
        print "Extracting data from the following columns:"
        for col in parser.get_column_names():
            print '\t', col
    smiles, targets = parser.get_targets()
    # print the fraction of valid assay records that were found in the map
    # (records whose PUBCHEM_CID is NaN are not valid and are excluded)
    total = np.count_nonzero(~np.isnan(parser.read_data().PUBCHEM_CID))
    print '{}/{} records matched'.format(len(targets), total)
    # save SMILES and targets
    write_pickle({'smiles': smiles, 'targets': targets}, output_filename)
def setUp(self):
    """
    Set up for tests.

    Writes molecules and targets to files.
    """
    self.temp_dir = tempfile.mkdtemp()
    smiles = ['CC(=O)OC1=CC=CC=C1C(=O)O',
              'C[C@@H](C1=CC=C(C=C1)CC(C)C)C(=O)O']
    self.names = ['aspirin', 'ibuprofen']
    engine = conformers.ConformerGenerator(max_conformers=1)
    self.mols = []
    self.smiles = []  # use RDKit-generated SMILES
    # iterate SMILES/name pairs directly rather than indexing with
    # xrange(len(...)), which is an anti-pattern (and Python 2-only)
    for name, smi in zip(self.names, smiles):
        mol = Chem.MolFromSmiles(smi)
        mol.SetProp('_Name', name)
        self.mols.append(engine.generate_conformers(mol))
        self.smiles.append(
            Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True))

    # write mols to a temporary SDF file
    _, self.input_filename = tempfile.mkstemp(suffix='.sdf',
                                              dir=self.temp_dir)
    writer = serial.MolWriter()
    writer.open(self.input_filename)
    writer.write(self.mols)
    writer.close()

    # write targets (one label per molecule) to a pickle
    self.targets = [0, 1]
    _, self.targets_filename = tempfile.mkstemp(suffix='.pkl',
                                                dir=self.temp_dir)
    write_pickle(self.targets, self.targets_filename)
def main(input_filename, map_filename, directory='.', prefix='nci60',
         suffix='pkl.gz'):
    """
    Get regression targets.

    Parameters
    ----------
    input_filename : str
        PCBA data filename.
    map_filename : str
        ID->SMILES map filename.
    directory : str, optional (default '.')
        Directory in which to write target files.
    prefix : str, optional (default 'nci60')
        Prefix for target files.
    suffix : str, optional (default 'pkl.gz')
        Suffix for target files.
    """
    parser = Nci60Parser(input_filename, map_filename)
    split_targets = parser.split_targets()
    # get total record count (records whose NSC is NaN are not valid)
    total = np.count_nonzero(~np.isnan(parser.read_data().NSC))
    # write a separate file for each dataset
    # note that split_targets is an OrderedDict, so enumeration order is
    # stable and the numeric index in each filename is reproducible
    for i, name in enumerate(split_targets.keys()):
        data = split_targets[name]
        # print the fraction of valid assay records that were found in the map
        print '{}\t{}/{} records matched'.format(
            name, len(data['targets']), total)
        write_pickle(
            data,
            os.path.join(directory,
                         '{}-{:02}-targets.{}'.format(prefix, i, suffix)))
def setUp(self):
    """Write a temporary CID->SMILES map and build an AssayDataParser."""
    self.temp_dir = tempfile.mkdtemp()
    self.map = {
        'CID645443': 'Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1',
        'CID2997889': 'CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1',
        'CID2244': 'CC(=O)Oc1ccccc1C(=O)O',
        'CID2662': 'Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1',
        'CID3672': 'CC(C)Cc1ccc(C(C)C(=O)O)cc1',
    }
    _, self.map_filename = tempfile.mkstemp(dir=self.temp_dir,
                                            suffix='.pkl')
    write_pickle(self.map, self.map_filename)
    # use a subset of AID588342; note that CID 654924 is duplicated
    this_dir = os.path.dirname(os.path.realpath(__file__))
    self.data_filename = os.path.join(this_dir, 'data/test_pcba_data.csv')
    # parser settings match the PcbaParser defaults
    self.engine = AssayDataParser(self.data_filename,
                                  self.map_filename,
                                  delimiter=',',
                                  primary_key='PUBCHEM_CID',
                                  activity_key='PUBCHEM_ACTIVITY_OUTCOME',
                                  activity_value='Active',
                                  id_prefix='CID')
def setUp(self):
    """Create a CID->SMILES map on disk and configure the test parser."""
    self.temp_dir = tempfile.mkdtemp()
    self.map = {
        "CID645443": "Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1",
        "CID2997889": "CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1",
        "CID2244": "CC(=O)Oc1ccccc1C(=O)O",
        "CID2662": "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1",
        "CID3672": "CC(C)Cc1ccc(C(C)C(=O)O)cc1",
    }
    _, self.map_filename = tempfile.mkstemp(dir=self.temp_dir,
                                            suffix=".pkl")
    write_pickle(self.map, self.map_filename)
    # data is a subset of AID588342; CID 654924 appears twice
    this_dir = os.path.dirname(os.path.realpath(__file__))
    self.data_filename = os.path.join(this_dir, "data/test_pcba_data.csv")
    # these settings mirror the PcbaParser defaults
    self.engine = AssayDataParser(
        self.data_filename,
        self.map_filename,
        delimiter=",",
        primary_key="PUBCHEM_CID",
        activity_key="PUBCHEM_ACTIVITY_OUTCOME",
        activity_value="Active",
        id_prefix="CID",
    )
def test_write_pickle_gz(self):
    """Verify write_pickle round-trips through a gzipped pickle."""
    _, path = tempfile.mkstemp(dir=self.temp_dir, suffix='.pkl.gz')
    data = {'foo': 'bar'}
    write_pickle(data, path)
    # read back with gzip + cPickle to confirm the file is a gzipped pickle
    with gzip.open(path) as handle:
        loaded = cPickle.load(handle)
    assert loaded['foo'] == 'bar'
def test_collate_mols3(self):
    """
    Test collate_mols where targets are in a different order than
    molecules.
    """
    # write targets in reversed order relative to the molecules
    shuffled = {'names': ['ibuprofen', 'aspirin'], 'y': [1, 0]}
    write_pickle(shuffled, self.targets_filename)
    # run script; both molecules should still be featurized
    self.check_output(['circular'], (2, 2048))
def test_collate_mols1(self):
    """
    Test collate_mols where molecules are pruned.
    """
    # targets only cover ibuprofen, so aspirin should be pruned
    pruned = {'names': ['ibuprofen'], 'y': [0]}
    write_pickle(pruned, self.targets_filename)
    # run script and check that only ibuprofen survives
    self.check_output(['circular'], (1, 2048),
                      targets=pruned['y'],
                      names=pruned['names'],
                      smiles=[self.smiles[1]])
def setUp(self):
    """Write a one-entry NSC->SMILES map and build an Nci60Parser."""
    self.temp_dir = tempfile.mkdtemp()
    self.map = {"NSC1": "CC1=CC(=O)C=CC1=O"}
    _, self.map_filename = tempfile.mkstemp(dir=self.temp_dir,
                                            suffix=".pkl")
    write_pickle(self.map, self.map_filename)
    # test data lives next to this module
    this_dir = os.path.dirname(os.path.realpath(__file__))
    self.data_filename = os.path.join(this_dir, "data/test_nci60_data.txt")
    # set up parser
    self.engine = Nci60Parser(self.data_filename, self.map_filename)
def setUp(self):
    """Prepare an NSC->SMILES map on disk and the parser under test."""
    self.temp_dir = tempfile.mkdtemp()
    self.map = {'NSC1': 'CC1=CC(=O)C=CC1=O'}
    _, self.map_filename = tempfile.mkstemp(dir=self.temp_dir,
                                            suffix='.pkl')
    write_pickle(self.map, self.map_filename)
    # locate the bundled test data relative to this file
    this_dir = os.path.dirname(os.path.realpath(__file__))
    self.data_filename = os.path.join(this_dir, 'data/test_nci60_data.txt')
    # construct the parser against the map and data files
    self.engine = Nci60Parser(self.data_filename, self.map_filename)
def main(input_filenames, output_filename, id_prefix=None,
         allow_duplicates=True, update=False, assign_stereo_from_3d=False):
    """
    Get SMILES for compounds and map to compound names.

    Parameters
    ----------
    input_filenames : list
        Input molecule filenames.
    output_filename : str
        Output filename.
    id_prefix : str, optional
        Prefix to prepend to IDs.
    allow_duplicates : bool, optional (default True)
        Allow duplicate SMILES.
    update : bool, optional (default False)
        Update an existing map with the same output filename. If False, a
        new map will be generated using only the input file(s).
    assign_stereo_from_3d : bool, optional (default False)
        Assign stereochemistry from 3D coordinates.
    """
    smiles = SmilesMap(prefix=id_prefix,
                       allow_duplicates=allow_duplicates,
                       assign_stereo_from_3d=assign_stereo_from_3d)

    # update existing map: seed the builder with the previously saved map
    if update:
        smiles.map = read_pickle(output_filename)

    for input_filename in input_filenames:
        print input_filename
        with serial.MolReader().open(input_filename) as reader:
            for mol in reader:
                # molecules that SmilesMap rejects (ValueError) are skipped
                # rather than aborting the whole run; report which one by
                # name when available, otherwise by its SMILES
                try:
                    smiles.add_mol(mol)
                except ValueError:
                    if mol.HasProp('_Name'):
                        print 'Skipping {}'.format(mol.GetProp('_Name'))
                    else:
                        print 'Skipping {}'.format(
                            Chem.MolToSmiles(mol, isomericSmiles=True))
    write_pickle(smiles.get_map(), output_filename)
def test_collate_mols2(self):
    """
    Test collate_mols where targets are pruned.
    """
    # targets cover both molecules...
    targets = {'names': ['aspirin', 'ibuprofen'], 'y': [0, 1]}
    write_pickle(targets, self.targets_filename)
    # ...but only the first molecule (aspirin) is written, so the
    # ibuprofen target should be pruned
    writer = serial.MolWriter()
    writer.open(self.input_filename)
    writer.write(self.mols[:1])
    writer.close()
    # run script
    self.check_output(['circular'], (1, 2048), targets=[0],
                      names=['aspirin'], smiles=[self.smiles[0]])
def write_output_file(data, output_filename, compression_level=3):
    """
    Pickle output data, possibly to a compressed file.

    Parameters
    ----------
    data : object
        Object to pickle in output file.
    output_filename : str
        Output filename. Should end with .joblib, .pkl, or .pkl.gz.
    compression_level : int, optional (default 3)
        Compression level (0-9) to use with joblib.dump.

    Raises
    ------
    NotImplementedError
        If the output filename does not end with a recognized extension.
    """
    # str.endswith accepts a tuple of suffixes, so one call covers both
    # pickle extensions instead of two chained endswith checks
    if output_filename.endswith(('.pkl', '.pkl.gz')):
        write_pickle(data, output_filename)
    elif output_filename.endswith('.joblib'):
        joblib.dump(data, output_filename, compress=compression_level)
    else:
        raise NotImplementedError('Unrecognized output file extension.')
def setUp(self):
    """Write a CID->SMILES map to disk and build a default PcbaParser."""
    self.temp_dir = tempfile.mkdtemp()
    self.map = {
        "CID645443": "Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1",
        "CID2997889": "CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1",
        "CID2244": "CC(=O)Oc1ccccc1C(=O)O",
        "CID2662": "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1",
        "CID3672": "CC(C)Cc1ccc(C(C)C(=O)O)cc1",
    }
    _, self.map_filename = tempfile.mkstemp(dir=self.temp_dir,
                                            suffix=".pkl")
    write_pickle(self.map, self.map_filename)
    # data is a subset of AID588342; note that CID 654924 is duplicated
    this_dir = os.path.dirname(os.path.realpath(__file__))
    self.data_filename = os.path.join(this_dir, "data/test_pcba_data.csv")
    # construct the parser with default settings
    self.engine = PcbaParser(self.data_filename, self.map_filename)
def main(active_filename, decoy_filename, output_filename,
         assign_stereo_from_3d=False):
    """
    Construct target files for datasets with active/decoy labels.

    Parameters
    ----------
    active_filename : str
        Active molecule filename.
    decoy_filename : str
        Decoy molecule filename.
    output_filename : str
        Output filename.
    assign_stereo_from_3d : bool, optional (default False)
        Assign stereochemistry from 3D coordinates.
    """
    active_smiles = get_smiles(active_filename, assign_stereo_from_3d)
    decoy_smiles = get_smiles(decoy_filename, assign_stereo_from_3d)
    # actives get label 1, decoys label 0, concatenated in SMILES order
    labels = (np.ones(len(active_smiles), dtype=int),
              np.zeros(len(decoy_smiles), dtype=int))
    write_pickle(
        {'smiles': np.concatenate((active_smiles, decoy_smiles)),
         'targets': np.concatenate(labels)},
        output_filename)
def main(config_filename, output_filename, pdb_filename=None):
    """
    Build meta-information: a dataframe with one row per target and a
    column each for the associated AIDs and PDBs (lists).
    """
    # read target->PDB associations; multiple PDBs can be separated by ','
    pdb_map = {}
    if pdb_filename is not None:
        with open(pdb_filename) as f:
            for line in f:
                target, code = line.split()
                pdb_map[target] = code.split(',')

    config = pd.read_csv(config_filename)

    # collect the AIDs associated with each target
    targets = {}
    for _, row in config.iterrows():
        target = row['target']
        try:
            int(target)
            # integer targets get a 'gi_' prefix
            target = 'gi_{}'.format(target)
        except ValueError:
            pass
        targets.setdefault(target, []).append(row['aid'])

    # assemble one dataframe row per target; pdbs is None when the target
    # has no PDB association
    rows = []
    for target, aids in targets.iteritems():
        rows.append({'target': target,
                     'aids': aids,
                     'pdbs': pdb_map.get(target)})
    df = pd.DataFrame(rows)
    write_pickle(df, output_filename)
def main(input_filename, map_filename, directory='.', prefix='nci60',
         suffix='pkl.gz'):
    """
    Get regression targets.

    Parameters
    ----------
    input_filename : str
        PCBA data filename.
    map_filename : str
        ID->SMILES map filename.
    directory : str, optional (default '.')
        Directory in which to write target files.
    prefix : str, optional (default 'nci60')
        Prefix for target files.
    suffix : str, optional (default 'pkl.gz')
        Suffix for target files.
    """
    parser = Nci60Parser(input_filename, map_filename)
    split_targets = parser.split_targets()
    # get total record count (records whose NSC is NaN are excluded)
    total = np.count_nonzero(~np.isnan(parser.read_data().NSC))
    # write a separate file for each dataset
    # note that split_targets is an OrderedDict, so the enumeration index
    # used in each filename is reproducible across runs
    for i, name in enumerate(split_targets.keys()):
        data = split_targets[name]
        # print the fraction of valid assay records that were found in the map
        print '{}\t{}/{} records matched'.format(name, len(data['targets']),
                                                 total)
        write_pickle(
            data,
            os.path.join(directory,
                         '{}-{:02}-targets.{}'.format(prefix, i, suffix)))
def setUp(self):
    """Persist a CID->SMILES map and construct a PcbaParser for tests."""
    self.temp_dir = tempfile.mkdtemp()
    self.map = {
        'CID645443': 'Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1',
        'CID2997889': 'CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1',
        'CID2244': 'CC(=O)Oc1ccccc1C(=O)O',
        'CID2662': 'Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1',
        'CID3672': 'CC(C)Cc1ccc(C(C)C(=O)O)cc1',
    }
    _, self.map_filename = tempfile.mkstemp(dir=self.temp_dir,
                                            suffix='.pkl')
    write_pickle(self.map, self.map_filename)
    # use a subset of AID588342; CID 654924 appears twice in the data
    this_dir = os.path.dirname(os.path.realpath(__file__))
    self.data_filename = os.path.join(this_dir, 'data/test_pcba_data.csv')
    # set up parser with default settings
    self.engine = PcbaParser(self.data_filename, self.map_filename)