def search_cocrystals(filter_solvents=True):
    """Search the whole CSD for structures made of two different molecules.

    Every structure read from the CSD is tested against a fixed set of
    search settings (organic, non-polymeric, 3D coordinates, no disorder,
    no errors, no ions, no metals).  A structure is kept when its SMILES
    splits into exactly two distinct components.

    Args:
        filter_solvents: when true, drop every structure with a component
            whose SMILES is listed in ``clean_smiles.SOLVENT_SMILES``.

    Returns:
        The identifiers left after ``remove_polymorphs``.  As a side effect
        ``new_all_cocrystals.csv`` is written with the identifiers and
        chemical names whose name mentions neither "solvate" nor
        "clathrate".
    """
    # time.clock() was removed in Python 3.8; time.time() is available on
    # every interpreter and is adequate for reporting elapsed time.
    start_time = time.time()
    csd = MoleculeReader('CSD')
    entry_reader = EntryReader('CSD')

    settings = search.Search.Settings()
    settings.only_organic = True
    settings.not_polymeric = True
    settings.has_3d_coordinates = True
    settings.no_disorder = True
    settings.no_errors = True
    settings.no_ions = True
    settings.no_metals = True

    pairs = []
    for entry in csd:
        if not settings.test(entry):
            continue
        mol = csd.molecule(entry.identifier)
        mol.normalise_labels()
        smi = mol.smiles
        if smi is None:
            continue
        # Keep only structures that consist of two different molecules.
        if len(Remove(smi.split('.'))) == 2:
            pairs.append(mol.identifier)

    # Chemical names are cached here; the dict is created unconditionally
    # so the name lookup below also works when filter_solvents is False.
    name_dict = {}

    if filter_solvents:
        print('Solvates and hydrates will be removed')
        solvates = []
        for refcode in pairs:
            mol = csd.molecule(refcode)
            name_dict[refcode] = entry_reader.entry(refcode).chemical_name
            for component in mol.components:
                if component.smiles in clean_smiles.SOLVENT_SMILES:
                    solvates.append(mol.identifier)
        # A set gives constant-time membership tests in the filter below.
        solvate_ids = set(Remove(solvates))
        final_cocrystals = [x for x in pairs if x not in solvate_ids]
    else:
        final_cocrystals = pairs

    # Clean the list from polymorphs
    cocrystals = remove_polymorphs(final_cocrystals)
    end_time = time.time()

    names = []
    for refcode in cocrystals:
        if refcode not in name_dict:
            name_dict[refcode] = entry_reader.entry(refcode).chemical_name
        names.append(name_dict[refcode])

    cocrystals_data = pd.concat(
        [pd.DataFrame(cocrystals, columns=['csd_id']),
         pd.DataFrame(names, columns=['name'])],
        axis=1)
    cocrystals_data = cocrystals_data.dropna(axis=0)
    dataset_cocrystals = cocrystals_data[
        ~cocrystals_data['name'].str.contains("solvate")]
    dataset_cocrystals = dataset_cocrystals[
        ~dataset_cocrystals['name'].str.contains("clathrate")]

    print(end_time - start_time)
    dataset_cocrystals.to_csv('new_all_cocrystals.csv', index=False)
    return cocrystals
def get_entry(identifier, database="CSD"):
    """Return the entry stored under *identifier* in *database*.

    Args:
        identifier: identifier string of the wanted entry.
        database: argument handed to ``EntryReader``; defaults to "CSD".

    Returns:
        Whatever ``EntryReader(database).entry(identifier)`` returns.
    """
    return EntryReader(database).entry(identifier)
def __init__(self):
    """Load the subset refcodes and gather information on every CSD entry.

    ``self.drugs`` and ``self.mofs`` are filled from the two ``.gcd`` files
    through ``self.get_refcodes``.  The four accumulator lists start empty,
    then ``self.get_information`` is called once per entry read from the
    CSD and the results are stored in ``self.data``.
    """
    drug_subset_file = "CSD_Drug_Subset_updated.gcd"
    mof_subset_file = "MOF_subset.gcd"

    self.drugs = self.get_refcodes(drug_subset_file)
    self.mofs = self.get_refcodes(mof_subset_file)

    self.subset, self.refcode, self.year, self.smiles = [], [], [], []

    collected = []
    for entry in EntryReader('CSD'):
        collected.append(self.get_information(entry))
    self.data = collected
def run(self):
    """Rescore docked poses against a hotspot result and write the ranking.

    Steps:
      * read the hotspot whose identifier equals
        ``self.args.hotspot_identifier``;
      * copy the docked molecules to ``results_no_dummy.mol2`` with every
        atom whose symbol is "Unknown" removed, and point
        ``self.args.docked_mols`` at that file;
      * augment the hotspot, score every entry, sort by descending score
        and deduplicate;
      * write ``rescored.csv`` and ``rescored.sdf`` next to the input.
    """
    # inputs
    with HotspotReader(self.args.hotspot_path) as hotspot_reader:
        wanted = self.args.hotspot_identifier
        matching = [h for h in hotspot_reader.read() if h.identifier == wanted]
        hr = matching[0]

    with MoleculeReader(self.args.docked_mols) as mol_reader:
        out = os.path.join(os.path.dirname(self.args.docked_mols),
                           "results_no_dummy.mol2")
        with MoleculeWriter(out) as mol_writer:
            for mol in mol_reader:
                for atm in mol.atoms:
                    if atm.atomic_symbol == "Unknown":
                        mol.remove_atom(atm)
                mol_writer.write(mol)

    self.args.docked_mols = out
    entires = EntryReader(self.args.docked_mols)

    # outputs
    out_dir = os.path.dirname(self.args.docked_mols)
    print(out_dir)

    # process
    hr = augmentation(hr, entires)

    # 1) rescore
    rescored = {}
    for e in entires:
        rescored[e] = score(hr, e)
    ranked_items = sorted(rescored.items(),
                          key=lambda item: item[1],
                          reverse=True)
    ordered_rescored = OrderedDict(ranked_items)

    # 2) deduplicate: retain highest ranked pose only
    out_dic = deduplicate(ordered_rescored)

    # 3) output to dataframe ready for ccdc.descriptors API
    identifiers = [e.identifier for e in out_dic.keys()]
    scores = list(out_dic.values())
    activities = [activity_tag(e.identifier) for e in out_dic.keys()]
    df = pd.DataFrame({
        "identifier": identifiers,
        "score": scores,
        "activity": activities,
    })
    df.to_csv(os.path.join(out_dir, "rescored.csv"))

    with EntryWriter(os.path.join(out_dir, "rescored.sdf")) as entry_writer:
        for e in out_dic.keys():
            entry_writer.write(e)
def gatherMatches(self):
    """Summarise every search hit as a flat list of strings.

    For each hit in ``self.searchHits`` the matching CSD entry is looked up
    and one list is produced holding, in order: the refcode, the three cell
    lengths, the three cell angles (all as strings) and the crystal
    formula.

    TODO: create crystal flattener

    :return: list with one detail list per hit
    """
    # TMP: for now this just returns identifiers plus cell data.
    logger.info("Sending request for hits results\n\n")
    allHits = []

    # The CSD reader is opened once, on the first hit, instead of once per
    # hit; nothing is opened when there are no hits.
    csdReader = None

    # Quick + dirty format to just display on other side
    for hit in self.searchHits:
        logger.info(hit.molecule.identifier)
        logger.info("CCDC number: " + str(hit.entry.ccdc_number))

        if csdReader is None:
            csdReader = EntryReader('CSD')
        crystal = csdReader.entry(hit.identifier).crystal
        cellLengths = crystal.cell_lengths
        cellAngles = crystal.cell_angles

        logger.info(str(cellLengths[0]))

        detailsImp = [hit.identifier]  # refcode
        detailsImp.extend(str(cellLengths[i]) for i in range(3))
        detailsImp.extend(str(cellAngles[i]) for i in range(3))
        detailsImp.append(hit.crystal.formula)
        allHits.append(detailsImp)

    return allHits
def reportGenerator(filepath, refcode):
    """Render an HTML report for one CSD entry.

    The report is produced by filling ``simple_report_template.html``
    (located next to this module) with the entry, its molecule, a 2D
    diagram, the publication DOI, the synonyms and a few atom/bond counts.

    :param filepath: prefix of the output path; ``refcode`` and ``.html``
        are appended to it directly
    :param refcode: identifier of the CSD entry to report on
    :return: path to where generated html report can be found
    """
    entry = EntryReader('csd').entry(refcode)
    mol = entry.molecule
    atoms = mol.atoms
    bonds = mol.bonds
    img = DiagramGenerator().image(mol)

    doi = entry.publication.doi
    if doi is None:
        doi = ' '
    else:
        doi = '<a href="http://dx.doi.org/%s">%s</a>' % (doi, doi)

    template_file_name = os.path.join(
        os.path.dirname(__file__), 'simple_report_template.html')
    # Read the template inside a context manager so the handle is closed
    # instead of being left to the garbage collector.
    with open(template_file_name) as template_file:
        template = unicode(template_file.read())

    # NOTE(review): filepath is concatenated, not joined, so it must end
    # with a path separator for the report to land inside that directory.
    fileGenPath = os.path.join(filepath + refcode + '.html')

    counts = dict(
        natoms=len(atoms),
        ndonors=sum(1 for a in atoms if a.is_donor),
        nacceptors=sum(1 for a in atoms if a.is_acceptor),
        nrot_bonds=sum(1 for b in bonds if b.is_rotatable),
    )
    with open(fileGenPath, 'w') as html:
        s = template.format(
            entry=entry,
            molecule=mol,
            image=img,
            doi=doi,
            synonyms='; '.join(entry.synonyms),
            counts=counts,
        )
        html.write(s.encode('utf8'))
    return fileGenPath
def anal(self, queryTargetPath, normalizeFlag=False):
    """Perform geometrical analysis against the CCDC data source.

    Every entry read from ``queryTargetPath`` is analysed with the
    geometry engine.  The dictionary built for the last entry read is
    returned; it is empty when the file holds no entries.

    Args:
        queryTargetPath (str): path handed to ``EntryReader``
        normalizeFlag (bool, optional): assign unknown bond types and
            standardise aromatic/delocalised bonds before the analysis.

    Returns:
        dict: outlier counts and analysis lists for bonds, angles,
        torsions and rings.
    """

    def _countOutliers(items):
        # An item counts when it is flagged unusual and has enough hits.
        return sum(1 for item in items if item.unusual and item.enough_hits)

    retD = {}
    for entry in EntryReader(queryTargetPath):
        mol = entry.molecule
        if normalizeFlag:
            mol.assign_bond_types(which="unknown")
            mol.standardise_aromatic_bonds()
            mol.standardise_delocalised_bonds()
        #
        logger.info("begin analysis - for %s", queryTargetPath)
        gam = self.__engine.analyse_molecule(mol)

        nBond = _countOutliers(gam.analysed_bonds)
        nAngle = _countOutliers(gam.analysed_angles)
        nTorsion = _countOutliers(gam.analysed_torsions)
        nRing = _countOutliers(gam.analysed_rings)

        bondList = self.__getBondAnalysis(gam)
        angleList = self.__getAngleAnalysis(gam)
        torsionList = self.__getTorsionAnalysis(gam)
        ringList = self.__getRingAnalysis(gam)

        retD = {
            "bond_outliers": nBond,
            "angle_outliers": nAngle,
            "torsion_outliers": nTorsion,
            "ring_outliers": nRing,
            "bond_list": bondList,
            "angle_list": angleList,
            "torsion_list": torsionList,
            "ring_list": ringList,
        }
    return retD
def analyse_structures(user_gcd_input, user_csv_output):
    """Classify the framework dimensionality of every entry in a refcode list.

    For each entry read from ``user_gcd_input`` the heaviest component is
    inspected.  When it is polymeric its hydrogens are removed, it becomes
    the crystal's molecule and ``dimensionality(entry)`` is mapped to a
    label; otherwise the entry is reported as having no polymeric bonds.
    One CSV row (refcode, label, running index) is written per entry.

    Args:
        user_gcd_input: refcode list file passed to ``EntryReader``.
        user_csv_output: output CSV path; ``.csv`` is appended when the
            name has no extension.
    """
    if len(os.path.splitext(user_csv_output)[1]) == 0:
        user_csv_output += ".csv"

    dimension_labels = {
        0: '0D non-MOF',
        1: '1D chain',
        2: '2D sheet',
        3: '3D framework',
    }

    # The context manager closes the file; no explicit close() is needed.
    with open(user_csv_output, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(('Refcode', 'dimensionality', 'number in gcd file'))
        csd_reader = EntryReader(user_gcd_input, 'CSD')
        t2 = time.time()
        n_structures = 0
        n_mof = 0
        n_non_mof = 0

        for entry in csd_reader:
            print('CSD entry: ' + str(entry.identifier))
            n_structures += 1  # quick counter

            count_polymers = sum(
                1 for component in entry.molecule.components
                if component.is_polymeric)
            if count_polymers > 1:
                print('multiple polymer units present')

            if entry.molecule.heaviest_component.is_polymeric:
                n_mof += 1
                framework = entry.molecule.heaviest_component
                # next steps fail if any atoms in the unit do not have
                # coordinates
                framework.remove_hydrogens()
                entry.crystal.molecule = framework
                fig = dimensionality(entry)
                # An unexpected value gets an explicit label instead of
                # raising NameError or silently reusing the previous
                # entry's label.
                dimension = dimension_labels.get(
                    fig, 'unrecognised dimensionality value: %s' % (fig,))
            else:
                n_non_mof += 1
                dimension = 'no polymeric bonds detected'

            print('Framework dimensions for CSD entry % s: % s \n' %
                  (entry.identifier, dimension))
            writer.writerow((entry.identifier, dimension, n_structures))
            # Flushed per row so partial results survive an interrupted run.
            f.flush()

        print('Total MOF subset size is: % d' % n_structures)
        print('Entries recognised as polyermic is: % d' % n_mof)
        print('Entries not recognised as polymeric (and ignored) is: % d' %
              n_non_mof)
        t3 = time.time()
        overall_time_taken = str(t3 - t2)
        print('total time elapsed for script % s' % overall_time_taken)
def search(self, queryTargetId, queryTargetPath, resultPath, normalizeFlag=True, maxHits=50, searchType="similarity", suppressMetals=False):
    """Search the CCDC database for similar or substructure matches for the input query molecule.

    Args:
        queryTargetId (str): query identifier
        queryTargetPath (str): path to the query molfile (mol, sdf, mol2)
        resultPath (str): output path to match results
        normalizeFlag (bool, optional): do standard perceptions on matching molecules. Defaults to True.
        maxHits (int, optional): maximum number of matches to return. Defaults to 50.
        searchType (str, optional): search mode (substructure, similarity). Defaults to "similarity".
        suppressMetals (bool, optional): filter structures containing metals. Defaults to False.

    Returns:
        (int): number of matches, summed over every molecule in the query file
    """

    def _retitle(filePath, lineIndex, identifier):
        # Replace the "00" placeholder on the title line with the hit id.
        with open(filePath) as fin:
            lines = fin.readlines()
        lines[lineIndex] = lines[lineIndex].replace("00", identifier)
        with open(filePath, "w") as fout:
            fout.write("".join(lines))

    mU = MarshalUtil()
    logger.info("Start search for target %s path %s result path %s", queryTargetId, queryTargetPath, resultPath)
    #
    summaryList = []
    #
    targetDirPath = os.path.dirname(queryTargetPath)
    cifTargetPath = os.path.join(targetDirPath, queryTargetId + ".cif")
    #
    targetStructures = EntryReader(queryTargetPath)
    dirPath = os.path.join(resultPath, queryTargetId)
    # Running total across all query molecules. The per-molecule count is
    # kept separately so that a final molecule without hits cannot zero the
    # return value or suppress writing the index for earlier matches.
    totalHits = 0
    for ii, targetEntry in enumerate(targetStructures, 1):
        numHits = 0
        startTime = time.time()
        targetMol = targetEntry.molecule
        if normalizeFlag:
            targetMol.assign_bond_types(which="unknown")
            targetMol.standardise_aromatic_bonds()
            targetMol.standardise_delocalised_bonds()
        #
        logger.info("(%d) begin %s search - query id %s", ii, searchType, queryTargetId)
        if searchType == "similarity":
            hits = self.__similaritySearch(targetMol, suppressMetals=suppressMetals)
        elif searchType == "substructure":
            hits = self.__moleculeSubstructureSearch(targetMol, suppressMetals=suppressMetals)
        else:
            hits = []
        logger.info("(%d) completed search query id %s in %.3f seconds", ii, queryTargetId, time.time() - startTime)

        if hits:
            numHits += len(hits)
            totalHits += numHits
            logger.info("(%d) search for %s matched %d: %r", ii, queryTargetId, numHits, [targetHit.identifier for targetHit in hits])
            # The result directory does not depend on the hit; create it once.
            mU.mkdir(dirPath)
            #
            for targetHit in hits[:maxHits]:
                #
                hI = CcdcMatchIndexInst()
                hI.setCsdVersion(csd_version())
                hI.setCsdDirectory(csd_directory())
                hI.setTargetId(queryTargetId)
                hI.setTargetPath(queryTargetPath)
                if mU.exists(cifTargetPath):
                    hI.setTargetCcPath(cifTargetPath)
                hI.setIdentifier(targetHit.identifier)
                hI.setMatchType(searchType)
                try:
                    hI.setRFactor(targetHit.entry.r_factor)
                    hI.setChemicalName(targetHit.entry.chemical_name)
                    hI.setTemperature(targetHit.entry.temperature)
                    hI.setRadiationSource(targetHit.entry.radiation_source)
                    hI.setHasDisorder("N")
                    cit = targetHit.entry.publication
                    if cit.doi is not None:
                        hI.setCitationDOI(cit.doi)
                    if searchType == "similarity":
                        hI.setSimilarityScore(targetHit.similarity)
                    elif searchType == "substructure":
                        hI.setMatchedAtomLength(len(targetHit.match_atoms()))
                except Exception as err:
                    # Best effort: a hit with incomplete metadata is still
                    # recorded with whatever fields were set before the error.
                    logger.exception("Failing with %s", str(err))
                #
                if searchType == "substructure":
                    mol2L = []
                    for jj, mc in enumerate(targetHit.match_components(), 1):
                        baseName = queryTargetId + "_" + targetHit.identifier + "_%03d" % jj
                        #
                        fp = os.path.join(dirPath, baseName + ".mol2")
                        mol2L.append(fp)
                        with MoleculeWriter(fp) as ofh:
                            ofh.write(mc)
                        # The title is on the second line of the mol2 file
                        _retitle(fp, 1, targetHit.identifier)
                        #
                        fp = os.path.join(dirPath, baseName + ".sdf")
                        with MoleculeWriter(fp) as ofh:
                            ofh.write(mc)
                        # The title is on the first line of the sdf file
                        _retitle(fp, 0, targetHit.identifier)
                    #
                    #  Check for multiple generated result files -
                    #
                    for jj, fp in enumerate(mol2L, 1):
                        logger.debug("(%d) adding component fp %s", jj, fp)
                        hI.setMatchNumber(jj)
                        hI.setMol2Path(fp)
                        # ".mol2" -> ".sdf": dropping 4 characters keeps the dot
                        tt = fp[:-4] + "sdf"
                        hI.setMolPath(tt)
                        summaryList.append(copy.deepcopy(hI.get()))
                        #
                else:
                    hI.setMatchNumber(1)
                    summaryList.append(copy.deepcopy(hI.get()))
        else:
            logger.info("(%d) search for %s returns no matches", ii, targetMol.identifier)
        # Drop the reference to the hit list before the next search.
        hits = None
    #
    if totalHits > 0:
        mU.mkdir(dirPath)
        fp = os.path.join(dirPath, queryTargetId + "-index.json")
        cmI = CcdcMatchIndex(indexFilePath=fp, verbose=self.__verbose)
        cmI.load(summaryList)
        cmI.writeIndex()

    return totalHits
from ccdc.search import TextNumericSearch
from ccdc.diagram import DiagramGenerator
from ccdc.io import EntryReader

# Refcodes to draw, taken from the ``refcode`` column of ``df``.
entries = list(df.refcode)

# ``data`` was filled by a now-disabled DOI lookup (one TextNumericSearch
# per refcode, collecting hits[0].entry.publication.doi); it is kept so any
# later code that refers to it still finds the name.
data = []

# Shared diagram settings for every image.
diagram_generator = DiagramGenerator()
generator_settings = diagram_generator.settings
generator_settings.font_size = 12
generator_settings.line_width = 1.6
generator_settings.image_width = 500
generator_settings.image_height = 500

# Look every refcode up in the CSD and save one PNG per distinct entry.
csd_reader = EntryReader('CSD')
mols = {csd_reader.entry(m) for m in entries}

for i, e in enumerate(mols):
    img = diagram_generator.image(e)
    img.save("hit{}.png".format(i))
class CSD_powder:
    """Compute d-spacings, intensities and two-theta values for one crystal.

    Attributes
    -------------
    entry
        ``EntryReader('CSD')`` used to look crystals up
    crystal_name : str or None
        the name of a crystal of form 'AABHTZ'; ``None`` until set
    wavelength
        set by ``load_d_space`` to ``Wavelength_CuKa1``

    Methods
    ---------------
    __init__(name=None)
        opens the CSD reader and stores the crystal name
    get_crystal_name() / set_crystal_name(value)
        read / change the crystal name
    load_d_space()
        returns (d_space, intensity): d-spacings from Bragg's law for the
        pattern points that lie on a tick mark, and their intensities
    load_intensities()
        returns the intensities of the whole pattern
    load_two_theta()
        returns the two-theta values of the whole pattern
    get_data(option)
        prints the data for option 1 or 2, exits otherwise
    """

    def __init__(self, name=None):
        self.entry = EntryReader('CSD')
        # Initialised here so get_crystal_name() works before any setter
        # call; existing no-argument construction keeps working.
        self.crystal_name = name

    def get_crystal_name(self):
        """Return the current crystal name."""
        return self.crystal_name

    def set_crystal_name(self, value):
        """Set the crystal name used by the load_* methods."""
        self.crystal_name = value

    def _load_pattern(self):
        """Return the powder pattern of the crystal named ``crystal_name``."""
        crystal = self.entry.crystal(self.crystal_name)
        return PowderPattern.from_crystal(crystal)

    def load_d_space(self):
        """Return ``(d_space, intensity)`` for the pattern's tick-mark peaks.

        For every tick mark, the first pattern point whose two-theta lies
        within 0.01 of the tick is taken as the peak.  The d-spacing is
        wavelength / (2 * sin(theta)) with theta = two_theta / 2.
        """
        pattern = self._load_pattern()
        self.wavelength = PowderPattern.Wavelength.Wavelength_CuKa1
        two_t = pattern.two_theta
        intents = pattern.intensity  # all pattern intensities

        peak_thetas = []
        intensity = []  # intensities of the matched peaks
        for tick in pattern.tick_marks:
            tick_two_theta = tick.two_theta
            for angle, value in zip(two_t, intents):
                # the 0.01 tolerance is an assumption and may be changed
                if abs(tick_two_theta - angle) < 0.01:
                    peak_thetas.append(angle)
                    intensity.append(value)
                    break

        # theta instead of 2theta, then degrees to radians
        peak_radians = np.array(peak_thetas) / 2 * np.pi / 180
        d_space = [self.wavelength / (2 * np.sin(peak))
                   for peak in peak_radians]
        return d_space, intensity

    def load_intensities(self):
        """Return the intensities of the whole pattern."""
        return self._load_pattern().intensity

    def load_two_theta(self):
        """Return the two-theta values of the whole pattern."""
        return self._load_pattern().two_theta

    def get_data(self, option):
        """Print value pairs for *option*; exit with status 1 otherwise.

        option 1 prints d-spacing / peak intensity pairs, option 2 prints
        intensity / two-theta pairs for the whole pattern.
        """
        if option == 1:
            pairs = zip(*self.load_d_space())
        elif option == 2:
            # ``elif`` matters: with a plain ``if`` option 1 fell through
            # to the ``else`` branch and terminated the process.
            pairs = zip(self.load_intensities(), self.load_two_theta())
        else:
            sys.exit(1)
        for i, j in pairs:
            # Produces the same text on Python 2 and Python 3.
            print("%s %s" % (i, j))
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from ccdc.io import EntryReader
import pandas as pd

# Read the pharmit query results and tabulate SMILES, identifier and the
# "rmsd" attribute of every entry, then dump the table to pharmit.csv.
mols = EntryReader("./pharmit/query_results.sdf")

records = [(m.molecule.smiles, m.identifier, m.attributes["rmsd"])
           for m in mols]

smiles = [record[0] for record in records]
name = [record[1] for record in records]
rmsd = [record[2] for record in records]

df = pd.DataFrame({"smiles": smiles, "name": name, "rmsd": rmsd})
df.to_csv("pharmit.csv")

# entries= [(m.smiles, m.identifier) for m in mols]
#
# ligs =[]
#
# for i, entry in enumerate(entries):
#     lig = Chem.MolFromSmiles(entry[0])
#     name = ""
#     for e in entry[1].split(" "):
def __init__(self):
    """Open a reader on the 'CSD' database and keep it as ``self.entry``."""
    csd_reader = EntryReader('CSD')
    self.entry = csd_reader
# Build the list of wanted formulas.  Each line of formulas.txt is split
# with split_text() and consecutive pieces are glued together in pairs;
# the sorted list of pairs is the comparison key for that formula.
# The file is read inside a context manager so the handle is closed.
with open("formulas.txt", 'rb') as formulas:
    formulas_list = formulas.readlines()

new_formula_list = []
for formula_string in formulas_list:
    formula_string = formula_string.rstrip('\n')
    fully_split_up = list(split_text(formula_string))
    # Pair every even-indexed piece with the piece that follows it.  A
    # trailing unpaired piece raises IndexError, as it always did.
    output_formula_list_item = [
        fully_split_up[index] + fully_split_up[index + 1]
        for index in range(0, len(fully_split_up), 2)
    ]
    output_formula_list_item.sort()
    new_formula_list.append(output_formula_list_item)

# Single-argument print() gives the same output on Python 2 and 3.
print(new_formula_list)

# Scan every component of every CSD entry and record the ones whose
# formula matches a wanted formula.  The context manager guarantees the
# results are flushed and the file is closed when the scan ends or fails.
csd_entry_reader = EntryReader('CSD')
with open("Results.txt", 'w') as output:
    for entry in csd_entry_reader:
        for component in entry.molecule.components:
            # Strip the "(" ... ")n" wrapper before splitting on spaces.
            entry_formula = component.formula.strip("(")
            entry_formula = entry_formula.strip(")n")
            entry_formula = entry_formula.split(" ")
            entry_formula.sort()
            # Keep only the tokens that contain a letter.
            entry_formula = [i for i in entry_formula
                             if re.search('[a-zA-Z]', i)]
            print(entry.identifier)
            if entry_formula in new_formula_list:
                output.write(component.formula + "," + entry.identifier + "\n")
def __init__(self, name):
    """Open a reader on the 'CSD' database and remember the crystal name.

    ``name`` is stored unchanged as ``self.crystal_name``.
    """
    csd_reader = EntryReader('CSD')
    self.entry = csd_reader
    self.crystal_name = name
from ioAndInterfaces import ccdcCrystalToASE
from ccdc.io import EntryReader
from ase.io import write as ASEWrite

# Fetch crystal 'ABEBUF' from the CSD, convert it with ccdcCrystalToASE
# and write the result to temp.xyz.
csdEntryReader = EntryReader('CSD')
ccdcCrystal = csdEntryReader.crystal('ABEBUF')
aseCrystal = ccdcCrystalToASE(ccdcCrystal)
ASEWrite('temp.xyz', aseCrystal)
def create_dataframe(base, run_id, pdbs, n_poses=30):
    """Collect per-pose docking scores and RMSDs into one DataFrame.

    For every ``pdb`` that has a ``base/pdb/run_id`` directory, every
    sub-directory of it is treated as one scoring-function run.  The files
    ``data/ranked_<pdb>_ligand_m1_<i>.mol2`` for ``i = 1 .. n_poses`` are
    read and their attributes are turned into one row per pose.

    Args:
        base: root directory holding one directory per pdb.
        run_id: name of the run directory inside each pdb directory.
        pdbs: iterable of pdb directory names.
        n_poses: number of ranked pose files read per run (default 30).

    Returns:
        pandas.DataFrame with columns pdb, runid, pose_id, pose_rank,
        dock_func, dock_fitness, rescore_func, rescore_fitness,
        gold_score, rmsd and rmsd_rank.
    """
    # Scoring-function label -> name used in the "...Fitness" attribute key.
    format_dic = {
        "asp": "ASP",
        "chemscore": "Chemscore",
        "goldscore": "Goldscore",
        "plp": "PLP",
    }
    data = {
        "pdb": [],
        "runid": [],
        "pose_id": [],
        "pose_rank": [],
        "dock_func": [],
        "dock_fitness": [],
        "rescore_func": [],
        "rescore_fitness": [],
        "gold_score": [],
        "rmsd": [],
        "rmsd_rank": [],
    }

    pdbs = [
        pdb for pdb in pdbs if os.path.isdir(os.path.join(base, pdb, run_id))
    ]

    for pdb in tqdm(pdbs):
        dpath = os.path.join(base, pdb, run_id)
        funcs = [
            d for d in os.listdir(dpath)
            if not os.path.isfile(os.path.join(dpath, d))
        ]
        for func in funcs:
            ff_a, ff_b = process_ff_label(func)
            # Without a separate rescoring function the docking function
            # doubles as the rescoring function.
            rescore_label = ff_a if ff_b is None else ff_b

            s = []  # rescore fitness per pose, for the ranking
            r = []  # rmsd per pose, for the ranking
            for i in range(1, n_poses + 1):
                pose_path = os.path.join(
                    dpath, func, "data", f"ranked_{pdb}_ligand_m1_{i}.mol2")
                # Everything needed is extracted while the reader is open
                # so that the file handle is released before the next pose.
                with EntryReader(pose_path) as reader:
                    attr = reader[0].attributes
                    score = float(attr["Gold.Score"].split("\n")[1][:5])
                    fit_score = {
                        k.split(".")[1]: attr[k]
                        for k in attr.keys() if "Fitness" in k
                    }
                    rmsd = attr["Gold.Reference.RMSD"]

                data["pdb"].append(pdb)
                data["runid"].append(run_id)
                data["pose_id"].append(i)
                data["dock_func"].append(ff_a)
                data["dock_fitness"].append(float(fit_score[format_dic[ff_a]]))
                data["gold_score"].append(score)
                data["rescore_func"].append(rescore_label)
                r.append(float(rmsd))
                s.append(float(fit_score[format_dic[rescore_label]]))

            data["rescore_fitness"].extend(s)
            data["pose_rank"].extend(rank_array(s))
            data["rmsd"].extend(r)
            data["rmsd_rank"].extend(rank_array(r))

    return pd.DataFrame(data)
class CSD_powder:
    """Compute d-spacings, intensities and two-theta values for one crystal.

    Attributes
    -------------
    entry
        ``EntryReader('CSD')`` used to look crystals up
    crystal_name : str or None
        the name of a crystal of form 'AABHTZ'; ``None`` until set
    wavelength
        set by ``load_d_space`` to ``Wavelength_CuKa1``

    Methods
    ---------------
    __init__(name=None)
        opens the CSD reader and stores the crystal name
    get_crystal_name() / set_crystal_name(value)
        read / change the crystal name
    load_d_space()
        returns (d_space, intensity): d-spacings from Bragg's law for the
        pattern points that lie on a tick mark, and their intensities
    load_intensities()
        returns the intensities of the whole pattern
    load_two_theta()
        returns the two-theta values of the whole pattern
    get_data(option)
        prints the data for option 1 or 2, exits otherwise
    """

    def __init__(self, name=None):
        self.entry = EntryReader('CSD')
        # Initialised here so get_crystal_name() works before any setter
        # call; existing no-argument construction keeps working.
        self.crystal_name = name

    def get_crystal_name(self):
        """Return the current crystal name."""
        return self.crystal_name

    def set_crystal_name(self, value):
        """Set the crystal name used by the load_* methods."""
        self.crystal_name = value

    def _load_pattern(self):
        """Return the powder pattern of the crystal named ``crystal_name``."""
        crystal = self.entry.crystal(self.crystal_name)
        return PowderPattern.from_crystal(crystal)

    def load_d_space(self):
        """Return ``(d_space, intensity)`` for the pattern's tick-mark peaks.

        For every tick mark, the first pattern point whose two-theta lies
        within 0.01 of the tick is taken as the peak.  The d-spacing is
        wavelength / (2 * sin(theta)) with theta = two_theta / 2.
        """
        pattern = self._load_pattern()
        self.wavelength = PowderPattern.Wavelength.Wavelength_CuKa1
        two_t = pattern.two_theta
        intents = pattern.intensity  # all pattern intensities

        peak_thetas = []
        intensity = []  # intensities of the matched peaks
        for tick in pattern.tick_marks:
            tick_two_theta = tick.two_theta
            for angle, value in zip(two_t, intents):
                # the 0.01 tolerance is an assumption and may be changed
                if abs(tick_two_theta - angle) < 0.01:
                    peak_thetas.append(angle)
                    intensity.append(value)
                    break

        # theta instead of 2theta, then degrees to radians
        peak_radians = np.array(peak_thetas) / 2 * np.pi / 180
        d_space = [self.wavelength / (2 * np.sin(peak))
                   for peak in peak_radians]
        return d_space, intensity

    def load_intensities(self):
        """Return the intensities of the whole pattern."""
        return self._load_pattern().intensity

    def load_two_theta(self):
        """Return the two-theta values of the whole pattern."""
        return self._load_pattern().two_theta

    def get_data(self, option):
        """Print value pairs for *option*; exit with status 1 otherwise.

        option 1 prints d-spacing / peak intensity pairs, option 2 prints
        intensity / two-theta pairs for the whole pattern.
        """
        if option == 1:
            pairs = zip(*self.load_d_space())
        elif option == 2:
            # ``elif`` matters: with a plain ``if`` option 1 fell through
            # to the ``else`` branch and terminated the process.
            pairs = zip(self.load_intensities(), self.load_two_theta())
        else:
            sys.exit(1)
        for i, j in pairs:
            # Produces the same text on Python 2 and Python 3.
            print("%s %s" % (i, j))