def prepare_dataset(sdffile, dest=None, rename=True, conformations=False, overwrite=False):
    """Prepare a dataset stored in an SDF file for later analysis.

    The prepared files are named after the input file (extension removed):
      - <name>-prepared.sdf             the (renamed / 3D) compounds
      - <name>-prepared-master.csv      the "master" table
      - <name>-prepared-saliviewer.csv  the "saliviewer" table

    Steps, in order:
      - Read all the molecules in sdffile
      - Create the destination directory if needed
      - Optionally rename the compounds (rename_mols_by_index, prefix "<name>-")
      - Optionally generate 3D conformations (pybel make3D)
      - Save the compounds and create the "master" and "saliviewer" tables

    Parameters:
      sdffile:       path to the input SDF file
      dest:          directory for the prepared files; defaults to the
                     directory that contains sdffile
      rename:        whether to rename the compounds
      conformations: whether to compute 3D conformations
      overwrite:     if False and the prepared SDF already exists, nothing is
                     recomputed

    Returns the tuple (prepared SDF path, master table path), also when the
    work is skipped because the prepared SDF already exists.

    NOTE(review): previous documentation also mentioned merging train/test and
    redirecting stdout/stderr to a "prepare.log" file; this function does
    neither.
    """
    root, name = op.split(sdffile)
    name = op.splitext(name)[0]
    if not dest:
        dest = root

    dest_sdf = op.join(dest, name + '-prepared.sdf')
    master_table = op.join(dest, name + '-prepared-master.csv')
    sali_table = op.join(dest, name + '-prepared-saliviewer.csv')

    # Only the prepared SDF is checked; the tables are assumed to go with it.
    if op.exists(dest_sdf) and not overwrite:
        print('%s is already there and not overwriting requested' % dest_sdf)
        return dest_sdf, master_table

    print('Reading %s' % sdffile)
    mols = list(pybel.readfile('sdf', sdffile))

    print('\tCreating dataset root: %s' % dest)
    # dest is '' when sdffile has no directory part and no dest was given:
    # that means "current directory", and os.makedirs('') would raise OSError.
    if dest and not op.exists(dest):
        os.makedirs(dest)

    if rename:
        print('\tRenaming the compounds to keep track of the provenance')
        rename_mols_by_index(mols, name + '-')

    if conformations:
        print('\tGenerating conformations')
        # Compounds whose title contains any of these ids are left untouched.
        # NOTE(review): presumably 3D generation is problematic for them - confirm.
        skipped_ids = ('train-3988', 'train-4205', 'dsstox-4205', 'dsstox-4206')
        for mol in mols:
            if any(skipped in mol.title for skipped in skipped_ids):
                continue
            # Best effort: a failure for one compound must not stop the others.
            try:
                print('Conformation for %s' % mol.title)
                mol.make3D()
            except Exception:
                print('Error computing a 3D conformation for %s' % mol.title)

    print('\tSaving compounds')
    save_mols(mols, dest_sdf)

    print('\tCreating \"master\" table: %s' % master_table)
    create_master_table(dest_sdf, master_table, fields=['Activity'])

    print('\tCreating \"saliviewer\" table: %s' % sali_table)
    create_saliviewer_input(master_table, sali_table)

    return dest_sdf, master_table
def aid2sdf(sdf, csv, dest=None):
    """Merge PubChem bioassay results (CSV) into the molecules of an SDF file.

    The first line of the CSV is treated as a header and ignored. In every
    other line, the third comma-separated field is the molecule id (matched
    against the molecule title in the SDF) and the sixth field is the
    activity outcome. Fields are split on plain commas (no quoting support).

    Each molecule gets an 'Activity' data field:
      'Active' -> '1', 'Inactive' -> '0', anything else -> 'Missing'

    Parameters:
      sdf:  path to the SDF file with the molecules
      csv:  path to the bioassay results CSV
      dest: if given, the annotated molecules are also saved to this path

    Returns the list of annotated molecules.

    Raises KeyError if a molecule title does not appear in the CSV.
    """
    # Read the known activities to a dictionary (molecule id -> outcome).
    # The file is closed as soon as it has been read.
    activities = {}
    with open(csv) as results:
        next(results, None)  # skip the header line (no-op on an empty file)
        for line in results:
            if not line.strip():
                continue  # blank lines carry no result
            fields = line.split(',')
            outcome = fields[5]
            activities[fields[2]] = outcome

    # Save the activity to each molecule
    labels = {'Active': '1', 'Inactive': '0'}
    mols = list(pybel.readfile('sdf', sdf))
    for mol in mols:
        mol.data['Activity'] = labels.get(activities[mol.title], 'Missing')

    if dest:
        save_mols(mols, dest)
    return mols
cas = data[0].data['CAS_NO'] if not cas in can_dupes2: can_dupes2[cas] = [data] else: can_dupes2[cas].append(data) for cas in can_dupes2.keys(): groups = can_dupes2[cas] if len(groups) > 1: print 'compound with cas=%s is considered different by OB canonical smiles'%cas for group in groups: print group[0].write('can').strip() print '-'*80 union = sorted([dupe[0] for dupe in cas_dupes.values()], key=lambda mol: mol.title) save_mols(union, op.join(root, 'mutagenicity-all-cas-union.sdf')) print '\t\tUnion size=%d' % len(union) dest_sdf = op.join(root, 'mutagenicity-all-cas-union-prepared.sdf') prepare_dataset(op.join(root, 'mutagenicity-all-cas-union.sdf'), rename=False, conformations=True) #Depict the molecules depict(dest_sdf) #Molecular descriptors print 'Computing fingerprints via JCompoundMapper' #TODO: Extract-method this jcm_fingerprint(dest_sdf, ('ECFP', 'ECFPVariant', 'PHAP3POINT2D', 'SHED', 'DFS', 'RAD2D')) jcm_fingerprint(dest_sdf, ('LSTAR', 'RAD3D', 'PHAP3POINT3D')) print 'Computing descriptors via CDKDescUI' cdkdescuiprops(dest_sdf, desc_types=('constitutional',)) print 'Computing spectrophores'