def get_targets(self):
    """
    Parse the data file and return SMILES with their matching targets.

    Procedure
    ---------
    1. Read data with self.read_data().
    2. Map compound IDs (the self.primary_key column) to SMILES using the
       pickled ID map stored at self.map_filename.
    3. Extract targets from data and keep only the rows whose compound
       IDs were mapped to SMILES.

    Returns
    -------
    smiles
        SMILES as returned by self.map_ids_to_smiles.
    targets : ndarray
        Targets for the mapped rows. When self.column_indices is set,
        a float array with one column per requested data column;
        otherwise the result of comparing the self.activity_key column
        with self.activity_value.
    """
    data = self.read_data()
    id_map = read_pickle(self.map_filename)

    # get compound SMILES from map
    # matched_rows are the data rows successfully mapped to SMILES
    smiles, matched_rows = self.map_ids_to_smiles(data[self.primary_key],
                                                  id_map)

    # get targets
    if self.column_indices is None:
        # True where the activity column equals the expected value
        targets = np.asarray(
            data[self.activity_key] == self.activity_value)
    else:
        n_columns = len(self.column_indices)
        targets = np.zeros((data.shape[0], n_columns), dtype=float)
        for position, column_index in enumerate(self.column_indices):
            column_name = data.columns[column_index]
            targets[:, position] = data[column_name]

    # reduce targets to matched structures
    return smiles, targets[matched_rows]
def test_read_pickle_gz(self):
    """
    Test read_pickle with gzipped pickle.

    A dict is pickled into a gzip-compressed temporary file inside
    self.temp_dir and read back with read_pickle.
    """
    import os  # local import: only needed to release the mkstemp handle

    # mkstemp returns an open OS-level file descriptor together with the
    # path; close it immediately so the test does not leak a descriptor
    # (the file is reopened by name below).
    handle, filename = tempfile.mkstemp(dir=self.temp_dir,
                                        suffix='.pkl.gz')
    os.close(handle)

    with gzip.open(filename, 'wb') as f:
        cPickle.dump({'foo': 'bar'}, f, cPickle.HIGHEST_PROTOCOL)
    assert read_pickle(filename)['foo'] == 'bar'
def check_output(self, featurize_args, shape, targets=None, mol_ids=None,
                 smiles=None, output_suffix='.pkl'):
    """
    Run the script and check features shape, targets, mol_ids, and SMILES.

    Parameters
    ----------
    featurize_args : list
        Featurizer-specific arguments for script.
    shape : tuple
        Expected shape of features. shape[0] is compared with the number
        of output records; when more dimensions are given, shape[1:] is
        compared with the shape of the first record's features.
    targets : list, optional
        Expected targets. Defaults to self.targets.
    mol_ids : list, optional
        Expected mol_ids. Defaults to self.mol_ids.
    smiles : list, optional
        Expected SMILES. Defaults to self.smiles.
    output_suffix : str, optional (default '.pkl')
        Suffix for output files. Files ending in '.joblib' are loaded
        with joblib.load; anything else is loaded with read_pickle.

    Returns
    -------
    data
        The loaded script output, in case anything else needs to be
        checked.
    """
    import os  # local import: only needed to release the mkstemp handle

    # generate command-line arguments
    # mkstemp hands back an open file descriptor; close it so it is not
    # leaked (the script reopens the file by name).
    handle, output_filename = tempfile.mkstemp(suffix=output_suffix,
                                               dir=self.temp_dir)
    os.close(handle)
    input_args = [self.input_filename, '-t', self.targets_filename,
                  output_filename] + featurize_args

    # run script
    args = parse_args(input_args)
    main(args.klass, args.input, args.output,
         target_filename=args.targets,
         featurizer_kwargs=vars(args.featurizer_kwargs),
         include_smiles=True,
         scaffolds=args.scaffolds,
         chiral_scaffolds=args.chiral_scaffolds)

    # read output file
    if output_filename.endswith('.joblib'):
        data = joblib.load(output_filename)
    else:
        data = read_pickle(output_filename)

    # fall back to the fixture values for anything not given explicitly
    if targets is None:
        targets = self.targets
    if mol_ids is None:
        mol_ids = self.mol_ids
    if smiles is None:
        smiles = self.smiles

    # check values
    assert len(data) == shape[0]
    if len(shape) > 1:
        # .loc replaces the .ix indexer, which was removed in pandas 1.0.
        # NOTE(review): assumes the output keeps a default integer row
        # index, so label 0 is the first record -- confirm.
        assert data.loc[0, 'features'].shape == shape[1:]
    assert np.array_equal(data['y'], targets), data['y']
    assert np.array_equal(data['mol_id'], mol_ids), data['mol_id']
    assert np.array_equal(data['smiles'], smiles), data['smiles']

    # return output in case anything else needs to be checked
    return data
def test_map_ids_to_smiles(self):
    """
    Test AssayDataParser.map_ids_to_smiles.

    Two records are expected to be mapped (data rows 0 and 3), with the
    SMILES stored in self.map for CID645443 and CID2997889.
    """
    records = self.engine.read_data()
    cid_to_smiles = read_pickle(self.map_filename)
    smiles, indices = self.engine.map_ids_to_smiles(records.PUBCHEM_CID,
                                                    cid_to_smiles)

    # both outputs must describe exactly two matched records
    assert len(smiles) == 2
    assert len(indices) == 2

    # SMILES are compared in order against the reference map
    for position, cid in enumerate(('CID645443', 'CID2997889')):
        assert smiles[position] == self.map[cid]
    assert np.array_equal(indices, [0, 3])
def test_main(self):
    """
    Test main.

    Runs the script with a 'CID' prefix and checks that every input
    molecule appears in the output map under 'CID<cid>' with its
    isomeric SMILES as regenerated by RDKit.
    """
    argv = ['-i', self.input_filename,
            '-o', self.output_filename,
            '-p', 'CID']
    args = parse_args(argv)
    main(args.input, args.output, args.prefix)

    id_map = read_pickle(self.output_filename)
    assert len(id_map) == len(self.smiles)
    for smile, cid in zip(self.smiles, self.cids):
        observed = id_map['CID{}'.format(cid)]
        expected = Chem.MolToSmiles(Chem.MolFromSmiles(smile),
                                    isomericSmiles=True)
        assert observed == expected
def test_map_ids_to_smiles(self):
    """
    Test AssayDataParser.map_ids_to_smiles.

    Expects exactly two mapped records, at data rows 0 and 3, whose
    SMILES match the self.map entries for CID645443 and CID2997889.
    """
    data = self.engine.read_data()
    id_map = read_pickle(self.map_filename)
    mapped = self.engine.map_ids_to_smiles(data.PUBCHEM_CID, id_map)
    smiles, indices = mapped

    # SMILES and row indices must both have two entries
    assert len(smiles) == 2
    assert len(indices) == 2

    expected_cids = ('CID645443', 'CID2997889')
    for position, cid in enumerate(expected_cids):
        assert smiles[position] == self.map[cid]
    assert np.array_equal(indices, [0, 3])
def test_main(self):
    """
    Test main.

    The output map must hold one entry per input molecule, keyed by
    'CID<cid>', whose value equals the isomeric SMILES regenerated by
    RDKit from the input SMILES.
    """
    command_line = ['-i', self.input_filename]
    command_line += ['-o', self.output_filename]
    command_line += ['-p', 'CID']
    args = parse_args(command_line)
    main(args.input, args.output, args.prefix)

    smiles_by_id = read_pickle(self.output_filename)
    assert len(smiles_by_id) == len(self.smiles)
    for smile, cid in zip(self.smiles, self.cids):
        key = 'CID{}'.format(cid)
        assert smiles_by_id[key] == Chem.MolToSmiles(
            Chem.MolFromSmiles(smile), isomericSmiles=True)
def check_output(self, input_args):
    """
    Run main and check its output against self.smiles and self.y.

    Parameters
    ----------
    input_args : list
        Command-line arguments.

    Returns
    -------
    tuple
        The 'smiles' and 'targets' entries read from
        self.output_filename.
    """
    args = parse_args(input_args)
    main(args.actives, args.decoys, args.output, args.stereo_from_3d)

    output = read_pickle(self.output_filename)
    out_smiles = output['smiles']
    out_targets = output['targets']

    # every output SMILES must be known, and carry the target stored at
    # the same position in self.y
    for this_smiles, this_target in zip(out_smiles, out_targets):
        assert this_smiles in self.smiles
        expected_target = self.y[self.smiles.index(this_smiles)]
        assert this_target == expected_target
    return out_smiles, out_targets
def main(input_filenames, output_filename, id_prefix=None,
         allow_duplicates=True, update=False, assign_stereo_from_3d=False):
    """
    Get SMILES for compounds and map to compound names.

    Each input file is read molecule by molecule and added to a
    SmilesMap. Molecules for which SmilesMap.add_mol raises ValueError
    are reported on stdout and skipped. The resulting map is pickled to
    output_filename.

    Parameters
    ----------
    input_filenames : list
        Input molecule filenames.
    output_filename : str
        Output filename.
    id_prefix : str, optional
        Prefix to prepend to IDs.
    allow_duplicates : bool, optional (default True)
        Allow duplicate SMILES.
    update : bool, optional (default False)
        Update an existing map with the same output filename. If False, a
        new map will be generated using only the input file(s).
    assign_stereo_from_3d : bool, optional (default False)
        Assign stereochemistry from 3D coordinates.
    """
    smiles_map = SmilesMap(prefix=id_prefix,
                           allow_duplicates=allow_duplicates,
                           assign_stereo_from_3d=assign_stereo_from_3d)

    # update existing map: start from the previously written pickle
    if update:
        smiles_map.map = read_pickle(output_filename)

    for input_filename in input_filenames:
        print(input_filename)
        with serial.MolReader().open(input_filename) as reader:
            for mol in reader:
                try:
                    smiles_map.add_mol(mol)
                except ValueError:
                    # identify the rejected molecule by name when it has
                    # one, otherwise by its isomeric SMILES
                    if mol.HasProp('_Name'):
                        label = mol.GetProp('_Name')
                    else:
                        label = Chem.MolToSmiles(mol, isomericSmiles=True)
                    print('Skipping {}'.format(label))

    write_pickle(smiles_map.get_map(), output_filename)
def test_main(self):
    """
    Test main.

    Expects six '*.pkl.gz' files in self.temp_dir, each holding equally
    sized 'smiles' and 'targets' entries with integer targets.
    """
    args = get_args([self.filename, '-d', self.temp_dir])
    main(args.input, args.merge, args.dir)

    output_files = glob.glob(os.path.join(self.temp_dir, '*.pkl.gz'))

    # check for the right number of files
    assert len(output_files) == 6

    # inspect files individually
    for filename in output_files:
        data = read_pickle(filename)
        smiles = data['smiles']
        assert len(smiles) == len(data['targets'])

        # try to read SMILES (the parse result itself is not inspected)
        for this_smiles in smiles:
            Chem.MolFromSmiles(this_smiles)

        # check type of targets
        assert data['targets'].dtype == int
def test_update(self):
    """
    Test update existing map.

    A map is written first, one more molecule is appended to the input
    file, and main is run again with update enabled. The final map must
    contain every molecule, including the new one.
    """
    argv = ['-i', self.input_filename,
            '-o', self.output_filename,
            '-p', 'CID']
    args = parse_args(argv)
    main(args.input, args.output, args.prefix, args.update)

    # add another molecule
    self.smiles.append('CC(=O)NC1=CC=C(C=C1)O')
    self.cids.append(1983)
    with open(self.input_filename, 'wb') as f:
        f.writelines('{}\t{}\n'.format(smile, cid)
                     for smile, cid in zip(self.smiles, self.cids))

    # update existing map
    main(args.input, args.output, args.prefix, True)

    id_map = read_pickle(self.output_filename)
    assert len(id_map) == len(self.smiles)
    for smile, cid in zip(self.smiles, self.cids):
        observed = id_map['CID{}'.format(cid)]
        expected = Chem.MolToSmiles(Chem.MolFromSmiles(smile),
                                    isomericSmiles=True)
        assert observed == expected
def test_update(self):
    """
    Test update existing map.

    Writes an initial map, rewrites the input file with one additional
    molecule, reruns main with update enabled, and checks that the
    stored map covers all molecules.
    """
    command_line = ['-i', self.input_filename]
    command_line += ['-o', self.output_filename]
    command_line += ['-p', 'CID']
    args = parse_args(command_line)
    main(args.input, args.output, args.prefix, args.update)

    # add another molecule
    self.smiles.append('CC(=O)NC1=CC=C(C=C1)O')
    self.cids.append(1983)
    with open(self.input_filename, 'wb') as f:
        for smile, cid in zip(self.smiles, self.cids):
            line = '{}\t{}\n'.format(smile, cid)
            f.write(line)

    # update existing map
    main(args.input, args.output, args.prefix, True)

    smiles_by_id = read_pickle(self.output_filename)
    assert len(smiles_by_id) == len(self.smiles)
    for smile, cid in zip(self.smiles, self.cids):
        key = 'CID{}'.format(cid)
        assert smiles_by_id[key] == Chem.MolToSmiles(
            Chem.MolFromSmiles(smile), isomericSmiles=True)
def main(featurizer_class, input_filename, output_filename,
         target_filename=None, featurizer_kwargs=None, parallel=False,
         client_kwargs=None, view_flags=None, compression_level=3,
         smiles_hydrogens=False, include_smiles=False, scaffolds=False,
         chiral_scaffolds=False, mol_id_prefix=None):
    """
    Featurize molecules in input_filename using the given featurizer.

    The results are collected into a pandas DataFrame with 'mol_id' and
    'features' columns, plus 'y', 'smiles', and 'scaffolds' columns when
    requested, and handed to write_output_file.

    Parameters
    ----------
    featurizer_class : Featurizer
        Featurizer class.
    input_filename : str
        Filename containing molecules to be featurized.
    output_filename : str
        Output filename. When it ends with .csv or .csv.gz,
        multidimensional features are stored as space-separated strings
        (one string per molecule).
    target_filename : str, optional
        Pickle containing target values. Should either be array_like
        (one entry per molecule) or a dict containing 'mol_id' and 'y'
        keys, corresponding to molecule IDs and target values.
    featurizer_kwargs : dict, optional
        Keyword arguments passed to featurizer.
    parallel : bool, optional (default False)
        Whether to featurize in parallel using IPython.parallel. Passed
        to featurizer.featurize.
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    compression_level : int, optional (default 3)
        Compression level (0-9) passed to write_output_file.
    smiles_hydrogens : bool, optional (default False)
        Whether to keep hydrogens when generating SMILES.
    include_smiles : bool, optional (default False)
        Include SMILES in output.
    scaffolds : bool, optional (default False)
        Whether to include scaffolds in output.
    chiral_scaffolds : bool, optional (default False)
        Whether to include chirality in scaffolds.
    mol_id_prefix : str, optional
        Prefix for molecule IDs.

    Raises
    ------
    AssertionError
        If array_like targets, features, or molecule IDs do not have one
        entry per molecule.
    """
    mols, mol_ids = read_mols(input_filename, mol_id_prefix=mol_id_prefix)

    # get targets
    data = {}
    if target_filename is not None:
        targets = read_pickle(target_filename)
        if isinstance(targets, dict):
            # collate_mols returns the indices used to select matching
            # molecules and targets
            mol_indices, target_indices = collate_mols(
                mols, mol_ids, targets['y'], targets['mol_id'])
            mols = mols[mol_indices]
            mol_ids = mol_ids[mol_indices]
            targets = np.asarray(targets['y'])[target_indices]
        else:
            assert len(targets) == len(mols), (
                "Targets do not match molecules.")
        data['y'] = targets

    # featurize molecules
    print("Featurizing molecules...")
    if featurizer_kwargs is None:
        featurizer_kwargs = {}
    featurizer = featurizer_class(**featurizer_kwargs)
    features = featurizer.featurize(mols, parallel, client_kwargs,
                                    view_flags)

    # fill in data container
    print("Saving results...")
    data['mol_id'] = mol_ids
    data['features'] = features

    # sanity checks
    assert data['features'].shape[0] == len(mols), (
        "Features do not match molecules.")
    assert data['mol_id'].shape[0] == len(mols), (
        "Molecule IDs do not match molecules.")

    # smiles, scaffolds
    if include_smiles:
        smiles = SmilesGenerator(remove_hydrogens=(not smiles_hydrogens))
        data['smiles'] = np.asarray(
            [smiles.get_smiles(mol) for mol in mols])
    if scaffolds:
        data['scaffolds'] = get_scaffolds(mols, chiral_scaffolds)

    # construct a DataFrame
    # multidimensional features are split into one entry per molecule so
    # that each DataFrame row holds that molecule's features
    try:
        if data['features'].ndim > 1:
            if output_filename.endswith(('.csv', '.csv.gz')):
                # numpy arrays will be "summarized" when written as
                # strings; use str(row.tolist())[1:-1] to remove the
                # surrounding brackets and drop the commas (keeping
                # spaces) to avoid conflicts with csv
                data['features'] = [
                    str(row.tolist())[1:-1].replace(', ', ' ')
                    for row in data['features']]
            else:
                data['features'] = list(data['features'])
    except AttributeError:
        # best effort: features lacking ndim/tolist are stored unchanged
        pass
    df = pd.DataFrame(data)

    # write output file
    write_output_file(df, output_filename, compression_level)
def main(featurizer_class, input_filename, output_filename,
         target_filename=None, featurizer_kwargs=None, parallel=False,
         client_kwargs=None, view_flags=None, compression_level=3,
         smiles_hydrogens=False, names=False, scaffolds=False,
         chiral_scaffolds=False):
    """
    Featurize molecules in input_filename using the given featurizer.

    The output container is a dict with 'features', 'smiles', and 'args'
    entries, plus 'y', 'names', and 'scaffolds' when requested. It is
    handed to write_output_file.

    Parameters
    ----------
    featurizer_class : Featurizer
        Featurizer class.
    input_filename : str
        Filename containing molecules to be featurized.
    output_filename : str
        Output filename. Should end with .pkl or .pkl.gz.
    target_filename : str, optional
        Pickle containing target values. Should either be array_like or a
        dict containing 'names' and 'y' keys, corresponding to molecule
        names and target values.
    featurizer_kwargs : dict, optional
        Keyword arguments passed to featurizer.
    parallel : bool, optional (default False)
        Whether to run in parallel using IPython.parallel. Passed to
        featurizer.featurize.
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    compression_level : int, optional (default 3)
        Compression level (0-9) passed to write_output_file.
    smiles_hydrogens : bool, optional (default False)
        Whether to keep hydrogens when generating SMILES.
    names : bool, optional (default False)
        Whether to include molecule names in output.
    scaffolds : bool, optional (default False)
        Whether to include scaffolds in output.
    chiral_scaffolds : bool, optional (default False)
        Whether to include chirality in scaffolds.
    """
    mols, mol_names = read_mols(input_filename)

    # get targets
    data = {}
    if target_filename is not None:
        loaded = read_pickle(target_filename)
        if not isinstance(loaded, dict):
            # array_like targets must already line up with the molecules
            assert len(loaded) == len(mols)
            targets = loaded
        else:
            # collate_mols returns the indices used to select matching
            # molecules and targets
            mol_indices, target_indices = collate_mols(
                mols, mol_names, loaded['y'], loaded['names'])
            mols = mols[mol_indices]
            mol_names = mol_names[mol_indices]
            targets = np.asarray(loaded['y'])[target_indices]
        data['y'] = targets

    # featurize molecules
    print("Featurizing molecules...")
    if featurizer_kwargs is None:
        featurizer_kwargs = {}
    featurizer = featurizer_class(**featurizer_kwargs)
    features = featurizer.featurize(mols, parallel, client_kwargs,
                                    view_flags)

    # fill in data container
    print("Saving results...")
    data['features'] = features

    # calculate SMILES
    smiles_generator = SmilesGenerator(
        remove_hydrogens=(not smiles_hydrogens))
    mol_smiles = [smiles_generator.get_smiles(mol) for mol in mols]
    data['smiles'] = np.asarray(mol_smiles)

    # sanity checks
    n_mols = len(mols)
    assert data['features'].shape[0] == n_mols, (
        "Features do not match molecules.")
    assert data['smiles'].shape[0] == n_mols, (
        "SMILES do not match molecules.")

    # names, scaffolds, args
    if names:
        data['names'] = mol_names
    if scaffolds:
        data['scaffolds'] = get_scaffolds(mols, chiral_scaffolds)

    # record how the features were generated alongside the results
    run_args = {}
    run_args['featurizer_class'] = featurizer_class.__name__
    run_args['input_filename'] = input_filename
    run_args['target_filename'] = target_filename
    run_args['featurizer_kwargs'] = featurizer_kwargs
    run_args['chiral_scaffolds'] = chiral_scaffolds
    data['args'] = run_args

    # write output file
    write_output_file(data, output_filename, compression_level)
def main(featurizer_class, input_filename, output_filename,
         target_filename=None, featurizer_kwargs=None, parallel=False,
         client_kwargs=None, view_flags=None, compression_level=3,
         smiles_hydrogens=False, include_smiles=False, scaffolds=False,
         chiral_scaffolds=False, mol_id_prefix=None):
    """
    Featurize molecules in input_filename using the given featurizer.

    The results are collected into a pandas DataFrame with 'mol_id' and
    'features' columns, plus 'y', 'smiles', and 'scaffolds' columns when
    requested, and handed to write_output_file.

    Parameters
    ----------
    featurizer_class : Featurizer
        Featurizer class.
    input_filename : str
        Filename containing molecules to be featurized.
    output_filename : str
        Output filename. When it ends with .csv or .csv.gz,
        multidimensional features are stored as space-separated strings
        (one string per molecule).
    target_filename : str, optional
        Pickle containing target values. Should either be array_like
        (one entry per molecule) or a dict containing 'mol_id' and 'y'
        keys, corresponding to molecule IDs and target values.
    featurizer_kwargs : dict, optional
        Keyword arguments passed to featurizer.
    parallel : bool, optional (default False)
        Whether to featurize in parallel using IPython.parallel. Passed
        to featurizer.featurize.
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    compression_level : int, optional (default 3)
        Compression level (0-9) passed to write_output_file.
    smiles_hydrogens : bool, optional (default False)
        Whether to keep hydrogens when generating SMILES.
    include_smiles : bool, optional (default False)
        Include SMILES in output.
    scaffolds : bool, optional (default False)
        Whether to include scaffolds in output.
    chiral_scaffolds : bool, optional (default False)
        Whether to include chirality in scaffolds.
    mol_id_prefix : str, optional
        Prefix for molecule IDs.

    Raises
    ------
    AssertionError
        If array_like targets, features, or molecule IDs do not have one
        entry per molecule.
    """
    mols, mol_ids = read_mols(input_filename, mol_id_prefix=mol_id_prefix)

    # get targets
    data = {}
    if target_filename is not None:
        targets = read_pickle(target_filename)
        if isinstance(targets, dict):
            # collate_mols returns the indices used to select matching
            # molecules and targets
            mol_indices, target_indices = collate_mols(
                mols, mol_ids, targets['y'], targets['mol_id'])
            mols = mols[mol_indices]
            mol_ids = mol_ids[mol_indices]
            targets = np.asarray(targets['y'])[target_indices]
        else:
            assert len(targets) == len(mols), (
                "Targets do not match molecules.")
        data['y'] = targets

    # featurize molecules
    print("Featurizing molecules...")
    if featurizer_kwargs is None:
        featurizer_kwargs = {}
    featurizer = featurizer_class(**featurizer_kwargs)
    features = featurizer.featurize(mols, parallel, client_kwargs,
                                    view_flags)

    # fill in data container
    print("Saving results...")
    data['mol_id'] = mol_ids
    data['features'] = features

    # sanity checks
    assert data['features'].shape[0] == len(mols), (
        "Features do not match molecules.")
    assert data['mol_id'].shape[0] == len(mols), (
        "Molecule IDs do not match molecules.")

    # smiles, scaffolds
    if include_smiles:
        smiles = SmilesGenerator(remove_hydrogens=(not smiles_hydrogens))
        data['smiles'] = np.asarray(
            [smiles.get_smiles(mol) for mol in mols])
    if scaffolds:
        data['scaffolds'] = get_scaffolds(mols, chiral_scaffolds)

    # construct a DataFrame
    # multidimensional features are split into one entry per molecule so
    # that each DataFrame row holds that molecule's features
    try:
        if data['features'].ndim > 1:
            if output_filename.endswith(('.csv', '.csv.gz')):
                # numpy arrays will be "summarized" when written as
                # strings; use str(row.tolist())[1:-1] to remove the
                # surrounding brackets and drop the commas (keeping
                # spaces) to avoid conflicts with csv
                data['features'] = [
                    str(row.tolist())[1:-1].replace(', ', ' ')
                    for row in data['features']]
            else:
                data['features'] = list(data['features'])
    except AttributeError:
        # best effort: features lacking ndim/tolist are stored unchanged
        pass
    df = pd.DataFrame(data)

    # write output file
    write_output_file(df, output_filename, compression_level)
def check_output(self, featurize_args, shape, targets=None, names=None,
                 smiles=None, output_suffix='.pkl'):
    """
    Run the script and check features shape, targets, names, and SMILES.

    Parameters
    ----------
    featurize_args : list
        Featurizer-specific arguments for script.
    shape : tuple
        Expected shape of features. shape[0] is compared with the number
        of output records; when more dimensions are given, shape[1:] is
        compared with the shape of the first record's features.
    targets : list, optional
        Expected targets. Defaults to self.targets.
    names : list, optional
        Expected names. Defaults to self.names.
    smiles : list, optional
        Expected SMILES. Defaults to self.smiles.
    output_suffix : str, optional (default '.pkl')
        Suffix for output files. Files ending in '.joblib' are loaded
        with joblib.load; anything else is loaded with read_pickle.

    Returns
    -------
    data
        The loaded script output, in case anything else needs to be
        checked.
    """
    import os  # local import: only needed to release the mkstemp handle

    # generate command-line arguments
    # mkstemp hands back an open file descriptor; close it so it is not
    # leaked (the script reopens the file by name).
    handle, output_filename = tempfile.mkstemp(suffix=output_suffix,
                                               dir=self.temp_dir)
    os.close(handle)
    input_args = [self.input_filename, '-t', self.targets_filename,
                  output_filename, '--names'] + featurize_args

    # run script
    args = parse_args(input_args)
    main(args.klass, args.input, args.output,
         target_filename=args.targets,
         featurizer_kwargs=vars(args.featurizer_kwargs),
         names=args.names,
         scaffolds=args.scaffolds,
         chiral_scaffolds=args.chiral_scaffolds)

    # read output file
    if output_filename.endswith('.joblib'):
        data = joblib.load(output_filename)
    else:
        data = read_pickle(output_filename)

    # fall back to the fixture values for anything not given explicitly
    if targets is None:
        targets = self.targets
    if names is None:
        names = self.names
    if smiles is None:
        smiles = self.smiles

    # check values
    assert len(data) == shape[0]
    if len(shape) > 1:
        # .loc replaces the .ix indexer, which was removed in pandas 1.0.
        # NOTE(review): assumes the output keeps a default integer row
        # index, so label 0 is the first record -- confirm.
        assert data.loc[0, 'features'].shape == shape[1:]
    assert np.array_equal(data['y'], targets), data['y']
    assert np.array_equal(data['names'], names), data['names']
    assert np.array_equal(data['smiles'], smiles), data['smiles']

    # return output in case anything else needs to be checked
    return data