def statsSummary(pdbSet, data, geos, tag):
    """Write an HTML report holding one stats summary per geometric measure.

    Args:
        pdbSet: Name of the PDB set; used in the PDB data path, the summary
            descriptions, the report title and the output file name.
        data: Data handed unchanged to ``GeoReport.addStatsSummary``.
        geos: Iterable of geometric measure names; one summary is added each.
        tag: Suffix appended to the output file name.
    """
    import matplotlib.pyplot as plt

    # Close every open figure and clear the current figure/axes before plotting.
    plt.close('all')
    plt.clf()
    plt.cla()

    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_out/' + pdbSet + '/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataK/'

    # The report is built from the supplied data only: no PDB list is given.
    georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath, ed=False, dssp=False, includePdbs=False, keepDisordered=False)
    for geo in geos:
        georep.addStatsSummary(data=data, desc=geo + ' ' + pdbSet, geoX=geo, geoY='aa', hue='ID')
    georep.printToHtml('Stats Summary , set=' + pdbSet, 2, 'StatsSummary_' + pdbSet + tag)
def diffHistograms(pdbList, tag):
    """Plot difference histograms and scatters for a list of PDB codes.

    For every code in ``pdbList`` the file ``<code>_DiffHistogram.csv`` is
    read from the ccp4_out directory; the frames are concatenated and plotted
    into a two-column HTML report named ``DiffHist_<tag>``.
    """
    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_out/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    loadPath = help.rootPath + '/ProteinDataFiles/ccp4_out/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataN/'

    # One frame per structure, stacked row-wise into a single table.
    perPdbFrames = [
        pd.read_csv(loadPath + code + '_DiffHistogram.csv')
        for code in pdbList
    ]
    realCsv = pd.concat(perPdbFrames, axis=0, sort=False)

    georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath, ed=False, dssp=False, includePdbs=False, keepDisordered=False)
    georep.addHistogram(data=realCsv, geoX='Percent', title='Difference Histograms - %', count=True, hue='PdbCode')
    georep.addHistogram(data=realCsv, geoX='Diff', title='Difference Histograms - diff', count=True, hue='PdbCode', palette='LightSeaGreen')
    georep.addScatter(data=realCsv, geoX='Main', geoY='Percent', hue='Diff', palette='jet', categorical=False, sort='RANDOM', title='Differences')
    georep.addScatter(data=realCsv, geoX='Diff', geoY='Percent', hue='PdbCode', palette='jet_r', categorical=True, sort='RANDOM', title='Differences')
    georep.printToHtml('Difference Histograms', 2, 'DiffHist_' + tag)
def scatterReports(pdbSet, data, trios, perAA=True, tag=''):
    """Write a four-column HTML report of scatter plots.

    Args:
        pdbSet: Set name, used only in the report title.
        data: Table with an ``aa`` column plus the plotted measures.
        trios: Sequence of ``[geoX, geoY, hue]`` triples.
        perAA: When true, one scatter per distinct ``aa`` value per trio;
            otherwise one scatter per trio over all of ``data``.
        tag: Suffix of the output file name.
    """
    import matplotlib.pyplot as plt

    plt.close('all')
    plt.clf()
    plt.cla()

    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_data/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataI/'

    # Distinct residue names present in the data, in sorted order.
    residueNames = sorted(set(data['aa'].values))

    georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath, ed=False, dssp=False, includePdbs=False, keepDisordered=False)
    for trio in trios:
        if perAA:
            # (subset, title prefix) for every residue type.
            panels = [(data.query("aa == '" + aa + "'"), aa + ':') for aa in residueNames]
        else:
            panels = [(data, '')]
        for subset, prefix in panels:
            georep.addScatter(data=subset, geoX=trio[0], geoY=trio[1], hue=trio[2], title=prefix + trio[0] + ':' + trio[1], palette='jet', sort='NON')
    georep.printToHtml('Scatters , set=' + pdbSet, 4, 'Defensible_Scatters_' + tag)
def makeSlicesHtmlFromValues(mainTitle, dirName, lineRuns, row, tag):
    """Render pre-computed slice files into an HTML report.

    Args:
        mainTitle: Title passed to ``printToHtml``.
        dirName: Prefix prepended to every file name in ``lineRuns``.
        lineRuns: Sequence of ``[title, palette, valuesFile, positionsFile]``.
        row: Second argument of ``printToHtml``.
        tag: Third argument of ``printToHtml``.
    """
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/SlicesI/'
    georep = psu.GeoReport([], "", "", printPath, ed=False, dssp=False)

    # The slice data was produced by density flight and pasted into text files.
    for run in lineRuns:
        sliceTitle, slicePalette = run[0], run[1]
        valuesFile, positionsFile = run[2], run[3]
        sliceValues = georep.loadSlice(dirName + valuesFile)
        slicePositions = georep.loadSlice(dirName + positionsFile)
        georep.addSlice(sliceValues, palette=slicePalette, title=sliceTitle, YellowDots=slicePositions, Contour=True)
    georep.printToHtml(mainTitle, row, tag)
def __init__(self, plotA, plotB, title, report):
    """Store a pair of plots, building a 'ghost' companion when requested.

    When ``title`` is not ``'ghost'`` the two plots are stored as given.
    When it is ``'ghost'``, ``plotA`` becomes ``self.plotB`` and ``self.plotA``
    is a new ``GeoPlot`` built from the geometry of a 'ghost' report.
    """
    self.title = title
    if title != 'ghost':
        self.plotA = plotA
        self.plotB = plotB
        return

    # Only the main plot was supplied, so the dummy plot is created here.
    from PsuGeometry import GeoReport as geor

    self.plotB = plotA
    ghostReport = geor.GeoReport(['ghost'], report.pdbDataPath, report.edDataPath, report.outDataPath, report.ed, report.dssp)

    # Request the same measures the main plot uses (geoY only when set).
    wantedGeos = [self.plotB.geoX]
    if self.plotB.geoY != '':
        wantedGeos.append(self.plotB.geoY)
    ghostdata = ghostReport.getGeoemtryCsv(wantedGeos, ['pdbCode'])
    self.plotA = GeoPlot(ghostdata, self.plotB.geoX, geoY=self.plotB.geoY, title='ghost', hue='pdbCode', palette='Greys', plot=self.plotB.plot, operation=self.plotB.operation, report=report)
def makeCsv(pdbSet, pdbListIn, geos, badAtoms, disordered):
    """Build the geometry table for the PDB files that exist on disk.

    Args:
        pdbSet: Set name; 'ADJUSTEDDEN' and 'ADJUSTEDLAP' switch the PDB root
            to ``filesDenRoot`` / ``filesLapRoot``, anything else uses
            ``filesPDBRoot``.
        pdbListIn: Candidate PDB codes; codes without a ``pdb<code>.ent`` file
            are reported and skipped.
        geos: Geometric measures passed to ``getGeoemtryCsv``.
        badAtoms: Passed through to ``GeoPdbs``.
        disordered: Passed to ``GeoPdbs`` and as ``keepDisordered``.

    Returns:
        Whatever ``getGeoemtryCsv`` returned, with ``rid`` cast to ``str`` and
        an ``ID`` column (pdbCode + chain + rid + aa) added when possible.
    """
    import os.path
    from PsuGeometry import GeoPdb as geopdb

    print('Getting CSV for', pdbSet)
    pdbDataPath = filesPDBRoot
    if pdbSet == 'ADJUSTEDDEN':
        pdbDataPath = filesDenRoot
    if pdbSet == 'ADJUSTEDLAP':
        pdbDataPath = filesLapRoot

    # NOTE(review): the manager is created, cleared and created again;
    # presumably this resets state shared between instances - confirm.
    pdbmanager = geopdb.GeoPdbs(pdbDataPath, edDataPath, False, False, disordered, badAtoms)
    pdbmanager.clear()
    pdbmanager = geopdb.GeoPdbs(pdbDataPath, edDataPath, False, False, disordered, badAtoms)

    # Keep only codes whose file is present, so a pdb file is never picked up
    # from some other default location.
    pdbList = []
    for pdb in pdbListIn:
        filePdb = pdbDataPath + 'pdb' + pdb + '.ent'
        # NOTE(review): the whole path, directory included, is lower-cased.
        if os.path.isfile(filePdb.lower()):
            pdbList.append(pdb.lower())
        else:
            print('No file:', pdbDataPath, pdb)
    pdbList.sort()

    hueList = ['aa', 'rid', 'bfactor', 'pdbCode', 'bfactorRatio', 'disordered', 'occupancy']
    georep = psu.GeoReport(pdbList, pdbDataPath, edDataPath, printPath, ed=False, dssp=False, includePdbs=False, keepDisordered=disordered)
    print('geoList', geos)
    dataBest = georep.getGeoemtryCsv(geos, hueList, -1, allAtoms=True)

    try:
        dataBest['rid'] = dataBest['rid'].astype(str)
        dataBest['ID'] = dataBest['pdbCode'] + dataBest['chain'] + dataBest['rid'] + dataBest['aa']
    except Exception as err:
        # Best effort: an empty result lacks these columns. Catching Exception
        # (not a bare except) lets Ctrl-C / SystemExit through, and the cause
        # is now shown.
        print('empty csv', repr(err))
    return dataBest
def clusterEdTauMaker(pdbCode, rid, chain, aa):
    """Return a CSV row with the CA, N and C coordinates of one residue.

    The row is ``pdbCode,<chain><rid>`` followed by x, y, z (rounded to three
    decimals) of the CA atom, then the N atom, then the C atom. ``None`` is
    returned when any of the three atoms is missing. ``aa`` is not used.
    """
    from PsuGeometry import GeoReport as psu
    from PsuGeometry import GeoPdb as geopdb

    pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    printPath = 'F:/Code/BbkProject/PhDThesis/0.Papers/1.TauCorrelations/Data/BestSupportedCSVs/Reports/'

    # The report object is constructed as in the original, though not used after.
    georep = psu.GeoReport([pdbCode], pdbDataPath, edDataPath, printPath, ed=False, dssp=False)
    pdbmanager = geopdb.GeoPdbs(pdbDataPath, edDataPath, ed=False, dssp=False)
    atomTable = pdbmanager.getPdb(pdbCode, True).getDataFrame()

    # Centre (CA), linear (N) and planar (C) atoms of the requested residue.
    residueFilter = 'rid==' + str(rid) + ' and chain=="' + chain + '"'
    atomRows = [
        atomTable.query(residueFilter + ' and atom=="' + atomName + '"')
        for atomName in ('CA', 'N', 'C')
    ]
    if any(len(found) == 0 for found in atomRows):
        return None

    coords = [
        str(round(found[axis].values[0], 3))
        for found in atomRows
        for axis in ('x', 'y', 'z')
    ]
    return ",".join([pdbCode, chain + str(rid)] + coords)
def evidenceReports(pdbSet, fourSetNames, dataA, dataB, dataC, dataD, trios, title, perAA=True, tag=''):
    """Write a four-column report comparing four data sets side by side.

    For each entry of ``trios``: a three-element entry gives a scatter
    (geoX, geoY, hue) per data set; any other length gives a histogram of
    ``trio[0]`` per data set, titled with the matching ``fourSetNames`` item.
    With ``perAA`` the plots are repeated for every distinct ``aa`` in
    ``dataA``. Output file is ``<pdbSet>_Defensible<tag>``.
    """
    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_data/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataI/'

    residueNames = sorted(set(dataA['aa'].values))
    georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath, ed=False, dssp=False, includePdbs=False, keepDisordered=False)

    wholeSets = (dataA, dataB, dataC, dataD)
    # Title suffixes of the whole-set scatters, in A, B, C, D order.
    wholeSetLabels = (' Unrestricted', ' Restricted', ' Restricted+cut', ' Adjusted')

    for trio in trios:
        if perAA:
            for aa in residueNames:
                aaFilter = "aa == '" + aa + "'"
                # All four cuts are taken before anything is plotted.
                cuts = [frame.query(aaFilter) for frame in wholeSets]
                wantScatter = len(trio) == 3
                for idx, cut in enumerate(cuts):
                    if wantScatter:
                        georep.addScatter(data=cut, geoX=trio[0], geoY=trio[1], hue=trio[2], title=aa + ':' + trio[0] + ':' + trio[1], palette='jet', sort='NON')
                    else:
                        georep.addHistogram(data=cut, geoX=trio[0], title=fourSetNames[idx] + ' ' + trio[0], hue='ID')
        else:
            wantScatter = len(trio) == 3
            for idx, frame in enumerate(wholeSets):
                if wantScatter:
                    georep.addScatter(data=frame, geoX=trio[0], geoY=trio[1], hue=trio[2], title=trio[0] + '|' + trio[1] + '|' + trio[2] + wholeSetLabels[idx], palette='jet', sort='NON')
                else:
                    georep.addHistogram(data=frame, geoX=trio[0], title=fourSetNames[idx] + ' ' + trio[0], hue='ID')
    georep.printToHtml(title, 4, pdbSet + '_Defensible' + tag)
def compareSets(tag):
    """Plot per-set means, sds and counts from the merged sets summary.

    Reads ``<tag>Data_SetsSummaryMerged.csv`` and, for every measure in
    ``geos``, adds six scatters: mean against set (coloured by sd) for the
    'ALL', 'GLY' and 'PRO' rows, then count, mean and sd against aa (coloured
    by set) for the rows with ``count > 0`` and ``aa != "ALL"``. The
    three-column report is written as ``<tag>Compare_EH_Sets``.
    """
    import matplotlib.pyplot as plt

    plt.close('all')
    plt.clf()
    plt.cla()

    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_data/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    loadPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataD/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataE/'

    # NOTE(review): 'CA:C:N+1' appears twice, so its six plots are emitted
    # twice; kept as in the original - confirm whether that is intended.
    geos = ['N:CA', 'CA:C', 'C:O', 'C:N+1', 'TAU', 'C-1:N:CA', 'CA:C:N+1', 'CA:C:O', 'O:C:N+1', 'CA:C:N+1']

    fileName = tag + 'Data_SetsSummaryMerged.csv'
    data = pd.read_csv(loadPath + fileName)
    georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath, ed=False, dssp=False, includePdbs=False, keepDisordered=False)

    for geo in geos:
        dataCut = data.query('geo == "' + geo + '"')
        dataCutCount = dataCut.query('count > 0').query('aa != "ALL"')
        dataCutAll = dataCut.query('aa == "ALL"')
        dataCutPRO = dataCut.query('aa == "PRO"')
        dataCutGLY = dataCut.query('aa == "GLY"')

        # Mean per set, coloured by standard deviation.
        for subset, label in ((dataCutAll, ' ALL (exc gly/pro)'), (dataCutGLY, ' GLY'), (dataCutPRO, ' PRO')):
            georep.addScatter(data=subset, geoX='mean', geoY='set', hue='sd', title=geo + label, palette='jet', categorical=False, sort='NON')

        # Per-residue statistics, coloured by set.
        for column, label in (('count', ' Best Supported counts per aa'), ('mean', ' Best Supported means per aa'), ('sd', ' Best Supported sd per aa')):
            georep.addScatter(data=dataCutCount, geoX=column, geoY='aa', hue='set', title=geo + label, palette='jet_r', categorical=True, sort='NON')

    georep.printToHtml('Best Supported and Engh&Huber Compare', 3, tag + 'Compare_EH_Sets')
# Script: print the slow density-peaks and density-points reports per PDB.
pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/density/'

### split list in 2 for memory purposes
#pdbList = ['1ejg','1us0','1tt8','1i1w','1ucs','1yk4','1yk4','1hje','1r6j']
#pdbList = ['2bw4','3nir','3x2m','2VB1','3A39','2b97','2OV0','2WFI']
#pdbList = ['4ZM7','4REK','4ZM7','5D8V','5NW3','5qkw']
pdbList = ['6jvv', '6rr2', '6E6O', '6S2M', '6shk', '6fgz', '6ctd', '6fwf', '6q53']
#pdbList = ['1ejg','1us0','1tt8','1i1w','1ucs','1yk4','1yk4','1hje','1r6j','2bw4','3nir','3x2m','2VB1','3A39','2b97','2OV0','2WFI','3o4p','1pjx']
#pdbList = ['4ZM7','4REK','4ZM7','5D8V','5NW3','5qkw','6jvv','6rr2','6E6O','6S2M','6shk','6fgz','6ctd','6fwf','6q53']

### split list in 2 for memory purposes
#peaksList=['1ejg','1us0','1tt8','1i1w','1ucs','6jvv','5nqo']
peaksList = []  #['1us0','1tt8']
#peaksList=['1i1w','1ucs','5nqo']
#pointsList=['6fwf','6q53','6ctd','6fgz','6rr2','6shk','6rr2']
pointsList = ['1ejg']

# (codes, report name, file-name suffix): peaks are processed before points.
reportRuns = (
    (peaksList, 'Slow_DensityPeaksPerPdb', '_denpk'),
    (pointsList, 'Slow_DensityPointsPerPdb', '_denpt'),
)
for codes, reportName, suffix in reportRuns:
    for pdb in codes:
        georep = geor.GeoReport([pdb], pdbDataPath, edDataPath, printPath)
        georep.printReport(reportName, pdb + suffix)
dic = {} A1 = randomOnSphere(A1_atom, 0.01) #7) A2 = randomOnSphere(A2_atom, 0.01) #5) A3 = randomOnSphere(A3_atom, 0.01) #7) a1a2a3 = calcs.angle(A1[0], A1[1], A1[2], A2[0], A2[1], A2[2], A3[0], A3[1], A3[2]) a1a2 = calcs.distance(A1[0], A1[1], A1[2], A2[0], A2[1], A2[2]) a2a3 = calcs.distance(A2[0], A2[1], A2[2], A3[0], A3[1], A3[2]) dic['pdbCode'] = 'Iter_' + str(count) dic['chain'] = 'A' dic['rid'] = 1 dic['ANGLE'] = a1a2a3 dic['A1:A2'] = a1a2 dic['A2:A3'] = a2a3 vals.append(dic) dataFrame = pd.DataFrame.from_dict(vals) georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath, ed=False, dssp=False, includePdbs=False, keepDisordered=False) georep.addHistogram(data=dataFrame, geoX='ANGLE', title='') georep.addHistogram(data=dataFrame, geoX='A1:A2', title='') georep.addHistogram(data=dataFrame, geoX='A2:A3', title='') georep.printToHtml('Simulated Atoms', 3, 'SimReport')
import pandas as pd
import Ch000_Functions as help
from PsuGeometry import GeoReport as psu

# Script: build a per-residue dssp table for the list from help.getPDBList100().
pdblist = help.getPDBList100()
pdblist.sort()
#pdblist = pdblist[:10]

hueList = ['aa', 'rid', 'bfactor', 'pdbCode', 'bfactorRatio', 'disordered', 'occupancy', 'dssp']
dsspPrintPath = '../../PdbLists/'

georep = psu.GeoReport(
    pdblist,
    help.pdbDataPathLx,
    help.edDataPath,
    dsspPrintPath,
    ed=False,
    dssp=True,
    includePdbs=False,
    keepDisordered=True,
)

# A single measure is requested; only the identifying columns and dssp are kept.
keptColumns = ['pdbCode', 'chain', 'rid', 'aa', 'dssp']
datacsv = georep.getGeoemtryCsv(['N:CA'], hueList)
datacsv = datacsv[keptColumns]
print(datacsv)

if False:  # don't accidentally run this and replace the saved file
    datacsv.to_csv(dsspPrintPath + 'dssp.csv', index=False)
# NOTE(review): the source layout does not show whether this print belongs
# inside the disabled branch above; it is kept at top level here.
print(datacsv)
pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/' edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/' printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/bad/' # Create the geoemtric data geoPsi = ['N:O', 'CB:O', 'N:CA:C:N+1'] geoListMain = ['CA:C', 'N:CA', 'C:O'] hueList = ['dssp', 'aa', 'bfactor', '2FoFc', 'rid'] # note the hues are the sum od the atoms pdbList = ['2lc9', '2lcb', '2cnq', '1i1w'] # structures with errors for pdbCode in pdbList: georep = psu.GeoReport([pdbCode], pdbDataPath, edDataPath, printPath) dataPsi = georep.getGeoemtryCsv(geoPsi, hueList) dataMain = georep.getGeoemtryCsv(geoListMain, hueList) #Create the geoplots printList = [] georep.addHistogram(geoX='N:CA', title='N-CA', ghost=True, hue='rid') georep.addHistogram(geoX='CA:C', title='C-CA', ghost=True, hue='rid') georep.addHistogram(geoX='CA:CA+1', title='CA-CA+1', ghost=True, hue='rid') georep.addHistogram(data=dataMain, geoX='N:CA', title='N-CA', ghost=True, splitKey='pdbCode')
################################################################################### pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/' edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/' printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/Paper02/' dsspHue='dssp' includeDSSP = False if myWindowsLaptop: pdbDataPath = 'F:/Code/ProteinDataFiles/pdb_data/' edDataPath = 'F:/Code/ProteinDataFiles/ccp4_data/' printPath = 'F:/Code/ProteinDataFiles/results_psu/Paper02/' includeDSSP = False # on my windows computer ########################################################################################### georepData = psu.GeoReport(pdbList1000, pdbDataPath, edDataPath, printPath, ed=False, dssp=includeDSSP, includePdbs=True) geoList = [] for geo in dihs: geoList.append(geo) for geo in distances: geoList.append(geo) for geo in angles: geoList.append(geo) count = 0 length = len(pdbList1000) for pdb in pdbList1000: print(pdb,' ', count,'/',length) count += 1
def maximaCompareReal(pdbSet, pdbList, tag):
    """Plot maxima differences for real PDB structures.

    Takes the first table returned by ``help.getMaximaDiffs(pdbSet, pdbList,
    False)`` and plots it three ways: unfiltered; restricted to
    ``Occupancy == 1`` and ``BFactor < 10``; and additionally restricted to
    ``Difference <= 0.05``. Each gets a histogram, a Difference/BFactor
    scatter and a Difference/Width scatter. The three-column report is
    written as ``Maxima_<pdbSet><tag>``.
    """
    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_out/' + pdbSet
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataM/'

    realCsv, badRealCsv, occRealCsv = help.getMaximaDiffs(pdbSet, pdbList, False)
    realOccOne = realCsv.query("Occupancy == 1")
    realCutDown = realOccOne.query("BFactor < 10")
    realCutDown5 = realCutDown.query("Difference <= 0.05")
    realCol = 'brg'  #'RdPu'

    georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath, ed=False, dssp=False, includePdbs=False, keepDisordered=False)

    # NOTE(review): the first subset is the unfiltered table although its
    # scatter titles say 'Occ=1' - confirm which was intended.
    subsets = (realCsv, realCutDown, realCutDown5)

    histTitles = (
        'PDB structures',
        'PDB structures, Occ=1, BFact<10',
        'PDB structures, Occ=1, BFact<10, Difference<=0.05',
    )
    for subset, histTitle in zip(subsets, histTitles):
        georep.addHistogram(data=subset, geoX='Difference', title=histTitle, count=True, hue='pdbCode')

    for subset, plotTitle in zip(subsets, ('Occ=1', 'bfactor<=10', 'Diff<=0.05')):
        georep.addScatter(data=subset, geoX='Difference', geoY='BFactor', hue='BGridDistance', palette=realCol, sort='RANDOM', title=plotTitle)

    # The last title read 'Diif<=0.05' (typo); corrected to 'Diff<=0.05'.
    for subset, plotTitle in zip(subsets, ('Occ=1', 'BFactor<=10', 'Diff<=0.05')):
        georep.addScatter(data=subset, geoX='Difference', geoY='Width', hue='BFactor', categorical=False, palette=realCol, sort='RANDOM', title=plotTitle)

    georep.printToHtml('Maxima differences in PDB Structures, set=' + pdbSet, 3, 'Maxima_' + pdbSet + tag)
# Script: four probability plots of the same two dihedrals in different
# palettes ('Protein Warhol').
pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/'

from PsuGeometry import GeoReport as geor

# Create the report object; the second assignment narrows it to a single pdb.
pdbs = ['2bw4', '1ejg', '1us0']
pdbs = ['1ejg']
georep = geor.GeoReport(pdbs, pdbDataPath, edDataPath, printPath, False, False)

# Palette (and optional amino-acid restriction) for each of the four panels.
title = 'Protein Warhol'
panelOptions = [
    {'palette': 'Spectral', 'restrictions': {'aa': 'PRO,ALA'}},
    {'palette': 'twilight_shifted', 'restrictions': {'aa': 'ALA'}},
    {'palette': 'inferno', 'restrictions': {'aa': 'PRO'}},
    {'palette': 'viridis_r'},
]
for options in panelOptions:
    georep.addProbability(geoX='N:CA:C:CB', geoY='N:CA:C:N+1', title='', **options)

# And finally create the report with a file name of choice
georep.printToHtml(title, 2, 'warhol')

# Disabled alternative report, kept as an inert string literal.
'''
title = 'Protein Halo'
georep.addProbability(geoX='N:O',geoY='CB:O',title='', palette='Spectral')
georep.addProbability(geoX='N:O',geoY='CB:O',title='', palette='twilight_shifted')
georep.addProbability(geoX='N:O',geoY='CB:O',title='', palette='inferno')
georep.addProbability(geoX='N:O',geoY='CB:O',title='', palette='nipy_spectral_r')
# And finally create the report with a file name of choice
georep.printToHtml(title,2,'angel')
'''
def createBadDensitySlices(pdbSet, atomCe, atomLi, atomPl):
    """Write slice-definition files for residues listed in ``.bad`` files.

    For every PDB code in ``../../PdbLists/Pdbs_Evidenced.csv`` the file
    ``pdb<code>.bad`` of the given set is parsed; each distinct
    (pdb, chain, rid) found is looked up in the original structure and, when
    all three named atoms exist, one row
    ``pdb,<chain><rid>,cx,cy,cz,lx,ly,lz,px,py,pz`` is produced. Rows are
    printed and written to ``BadSlice_<pdbSet>_<pdb>_<atoms>.txt``.

    Args:
        pdbSet: Name of the sub-directory of ``pdb_out`` holding ``.bad`` files.
        atomCe: Atom name used for the first coordinate triple.
        atomLi: Atom name used for the second coordinate triple.
        atomPl: Atom name used for the third coordinate triple.
    """
    import os.path
    import matplotlib.pyplot as plt

    plt.close('all')
    plt.clf()
    plt.cla()

    pdbOriginalPath = help.rootPath + '/ProteinDataFiles/pdb_data/'
    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_out/' + pdbSet + '/'
    edDataPath = help.rootPath + '/ProteinDataFiles/ccp4_data/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/SlicesF/'

    # The list of pdbs to process.
    pdbdata = pd.read_csv('../../PdbLists/Pdbs_Evidenced.csv')
    pdbListIn = pdbdata['PDB'].tolist()[0:]
    tag = atomCe + atomLi + atomPl

    for pdb in pdbListIn:
        slicesList = []
        fileNameIn = (pdbDataPath + 'pdb' + pdb + '.bad').lower()
        print(fileNameIn)
        if os.path.isfile(fileNameIn):
            # 'with' closes the handle even if reading fails.
            with open(fileNameIn, "r") as text_file:
                lines = text_file.read().split('\n')
            print(len(lines))
            for line in lines:
                # Fixed-column fields of a .bad line.
                atom = line[12:14].strip()
                aa = line[14:20].strip()
                chain = line[20:22].strip()
                rid = line[22:27].strip()
                print(pdb, atom, aa, chain, rid, line)
                if rid != '' and [pdb, chain, rid] not in slicesList:
                    slicesList.append([pdb, chain, rid])

        rows = []
        for sl in slicesList:
            print(sl)
            # Constructed per slice as in the original, though not used after.
            georep = psu.GeoReport([sl[0]], pdbOriginalPath, edDataPath, printPath, ed=False, dssp=False)
            pdbmanager = geopdb.GeoPdbs(pdbOriginalPath, edDataPath, ed=False, dssp=False)
            pdbcsv = pdbmanager.getPdb(sl[0], True).getDataFrame()

            residueFilter = 'rid==' + str(sl[2]) + ' and chain=="' + sl[1] + '"'
            atomRows = [
                pdbcsv.query(residueFilter + ' and atom=="' + atomName + '"')
                for atomName in (atomCe, atomLi, atomPl)
            ]
            if all(len(found) > 0 for found in atomRows):
                coords = [
                    str(round(found[axis].values[0], 3))
                    for found in atomRows
                    for axis in ('x', 'y', 'z')
                ]
                row = ",".join([sl[0], sl[1] + str(sl[2])] + coords)
                print(row)
                rows.append(row)

        if len(slicesList) > 0:
            bigstring = "".join(row + '\n' for row in rows)
            print("########RESULTS#########")
            print("")
            print(bigstring)
            outName = printPath + 'BadSlice_' + pdbSet + '_' + pdb + '_' + tag + '.txt'
            with open(outName, "w") as f:
                f.write(bigstring)
edSlicePath = 'F:/Code/ProteinDataFiles/ccp4_out/' #printPath = 'F:/Code/ProteinDataFiles/results_psu/Paper02/' #We are going to load the data that has been created by density flight edSlicePath += FileDir + "/" print(edSlicePath + "_Results.csv") inputdata = pd.read_csv(edSlicePath + "_Results.csv") #PdbCode,Tag, # CentreX,CentreY,CentreZ,# LinearX,LinearY,LinearZ,PlanarX,PlanarY,PlanarZ, # CentreV,LinearV,PlanarV,Angle, # BCentreX,BCentreY,BCentreZ,BLinearX,BLinearY,BLinearZ,BPlanarX,BPlanarY,BPlanarZ, # BCentreC,BLinearV,BPlanarV,BAngle georep = psu.GeoReport([], pdbDataPath, edDataPath, edSlicePath, ed=False, dssp=False) #georepPrint = psu.GeoReport([],pdbDataPath,edDataPath,edSlicePath,ed=False,dssp=False) pdbs = inputdata['PdbCode'].values tags = inputdata['Tag'].values taus = inputdata['Angle'].values btaus = inputdata['BAngle'].values origs = [] betters = [] radiants = [] brads = [] # Once the app has created the data we can load it
It runs correlation reports on proline, which it colours on the hue of CHI1 and CA-1:CA These geometric measures are proxies for up/down pucker of the proline run (up-pucker=-ve CHI1) And cis-trans proline, where pre-omega means proline, which corresponds to short CA-1:CA ''' pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/' edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/' printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/1000Structures/' pdbList = [] pdbdata = pd.read_csv('structures09.csv') pdbList = pdbdata['pdb_code'] georep = geor.GeoReport(pdbList, pdbDataPath, edDataPath, printPath, ed=False, dssp=False) geoList = [ 'PHI', 'PSI', 'TAU', 'C-1:C', 'C-1:N:CA', 'CHI1', 'CA-1:CA', 'OMEGA', 'CA:CA+1', 'C-1:N:CA:C' ] hueList = ['aa', 'bfactor', 'rid', 'resolution', 'pdbCode'] data = georep.getGeoemtryCsv(geoList, hueList) georep.addScatter(data=data, geoX='PHI', geoY='PSI', hue='CHI1',
def makeSlicesHtml(setName, fileNameOrig, fileNameAdj, title, titleType, tag):
    """Build a six-column HTML report of density slices for one set.

    ``<ccp4_out>/<setName>/_Results.csv`` lists the slices (PdbCode, Tag,
    Angle). For each one, the rows of the 'adjusted' values file with the
    same pdbCode are scanned; when ``aa + chain + rid`` of a row occurs in
    the slice tag, the value, radiant and magnitude slices are added. The
    value plot title shows the original value when the tag contains '_O' and
    the adjusted value otherwise. Output file is
    ``<titleType>_<setName>_<tag>``.
    """
    import matplotlib.pyplot as plt

    plt.close('all')
    plt.clf()
    plt.cla()

    firstRow = 1
    outfileName = titleType + '_' + setName + "_" + tag
    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_data/'
    edDataPath = help.rootPath + '/ProteinDataFiles/ccp4_data/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/SlicesH/'

    # Load the data that has been created by density flight.
    edSlicePath = help.rootPath + '/ProteinDataFiles/ccp4_out/' + setName + "/"
    resultsFile = edSlicePath + "_Results.csv"
    print(resultsFile)
    inputdata = pd.read_csv(resultsFile)
    pdbs = inputdata['PdbCode'].values
    tags = inputdata['Tag'].values
    taus = inputdata['Angle'].values

    valsPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/SlicesC/'
    print(valsPath)
    inputValsOrig = pd.read_csv(valsPath + fileNameOrig)
    inputValsAdj = pd.read_csv(valsPath + fileNameAdj)

    georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath, ed=False, dssp=False)

    for pdb, sliceTag, tau in zip(pdbs, tags, taus):
        pdbFilter = "pdbCode == '" + pdb + "'"
        origRows = inputValsOrig.query(pdbFilter)
        adjRows = inputValsAdj.query(pdbFilter)
        valsOrig = origRows['value'].values
        valsAdj = adjRows['value'].values
        vpdbs = adjRows['pdbCode'].values
        vaas = adjRows['aa'].values
        rids = adjRows['rid'].values
        chains = adjRows['chain'].values

        for j, (vpdb, vaa, vchain, vrid) in enumerate(zip(vpdbs, vaas, chains, rids)):
            newTag = vaa + vchain + str(vrid)
            print(vpdb, pdb, sliceTag, newTag)
            if pdb != vpdb or newTag not in sliceTag:
                continue

            slicePrefix = edSlicePath + pdb + sliceTag
            sliceOrigVal = georep.loadSlice(slicePrefix + "value_slice.csv")
            sliceOrigRad = georep.loadSlice(slicePrefix + "radiant_slice.csv")
            sliceOrigMag = georep.loadSlice(slicePrefix + "magnitude_slice.csv")
            # Colour-map reference kept from the original:
            # https://stackoverflow.com/questions/16400241/how-to-redefine-a-color-for-a-specific-value-in-a-matplotlib-colormap/16401183#16401183
            sliceOrigPos = georep.loadSlice(slicePrefix + "poses_slice.csv")

            plotLabel = pdb + ' ' + sliceTag
            # Title from the electron-density angle; replaced just below by the
            # value taken from the separately created outliers files.
            imtitle = plotLabel + ' value, angle=' + str(round(tau, 3))
            chosenVals = valsOrig if '_O' in sliceTag else valsAdj
            imtitle = plotLabel + ' value=' + str(round(chosenVals[j], 3))

            georep.addSlice(sliceOrigVal, palette='cubehelix_r', title=imtitle, YellowDots=sliceOrigPos, Contour=True)
            georep.addSlice(sliceOrigRad, palette='bone', title=plotLabel + ' radiant', Contour=False, YellowDots=sliceOrigPos)
            georep.addSlice(sliceOrigMag, palette='bone', title=plotLabel + ' magnitude', Contour=True, YellowDots=sliceOrigPos)
            firstRow += 1

    georep.printToHtml(title, 6, outfileName)
''' This script looks at electron density correlations '''
pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/Paper01/'

pdbList = ['2bw4', '5nqo', '1ejg', '6q53']
geoList = ['PHI', 'PSI', 'TAU']
hueList = ['aa', 'bfactor', 'rid', 'resolution', 'pdbCode', '2FoFc', 'dssp']

# georep2 collects the plots of every structure; a separate report object per
# pdb supplies the data.
georep2 = psu.GeoReport(pdbList, pdbDataPath, edDataPath, printPath)

# (hue, title suffix, extra plot options) for the four scatters per pdb.
hueSettings = [
    ('TAU', ' Backbone tau', {'palette': 'jet', 'sort': 'NON', 'vmin': 106, 'vmax': 116}),
    ('PHI', ' Backbone phi', {'palette': 'jet', 'sort': 'NON', 'vmin': -170, 'vmax': 170}),
    ('PSI', ' Backbone psi', {'palette': 'jet', 'sort': 'NON', 'vmin': -170, 'vmax': 170}),
    ('dssp', ' Backbone dssp', {'palette': 'jet_r', 'sort': 'NON'}),
]

for pdb in pdbList:
    georep = psu.GeoReport([pdb], pdbDataPath, edDataPath, printPath)
    #georep.printReport('DataPerPdb', 'Results9_data')
    #georep.printReport('Slow_DensityPeaksPerPdb', 'Results9_density')
    data = georep.getGeoemtryCsv(geoList, hueList)
    for hueName, titleSuffix, extra in hueSettings:
        georep2.addScatter(data=data, geoX='ridx', geoY='2FoFc', hue=hueName, title=pdb + titleSuffix, **extra)
mergedDataSet['CID'] = mergedDataSet['pdbCode'] + mergedDataSet[ 'chain'] + mergedDataSet['rid'] mergedDataSet = mergedDataSet.set_index('CID').join( ccContacts.set_index('CID')) mergedDataSet.to_csv(help.loadPath + "MergedWithContacts.csv", index=False) mergedDataSet = mergedDataSet.dropna() #qu = "dssp == 'E' or dssp == 'B' or dssp == '-' " #qu = qu + "or dssp == 'T' or dssp == 'S' or dssp == 'H' or dssp == 'G' or dssp == 'I')" #mergedDataSet = mergedDataSet.query(qu) # create a report based on contacts georep = psu.GeoReport([], "", "", help.printPath, ed=False, dssp=False, includePdbs=False, keepDisordered=False) print('### Creating reports ###') georep.addScatter(data=mergedDataSet, geoX='N:CA_Orig', geoY='Contacts', title='N:CA Original', hue='dssp', categorical=True, sort='NON', palette='jet_r') georep.addScatter(data=mergedDataSet, geoX='N:CA_Diff',
def maximaCompareFake(pdbSet, pdbList, tag, reduce):
    """Write the HTML report of maxima differences for the fake-density data.

    Args:
        pdbSet: name of the pdb set; used in the pdb data path, the report
            title and the output file name.
        pdbList: pdb codes; passed to help.getMaximaDiffs and used for the
            per-structure plots.
        tag: suffix appended to the output file name.
        reduce: when truthy, only rows with Difference <= 0.05 are plotted.
    """
    print('Plotting fake maxima differences', pdbSet)
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataM/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_out/' + pdbSet

    # Four frames come back; only the third (fake) one is plotted here
    _realCsv, _badRealCsv, fakeCsv, _badFakeCsv = help.getMaximaDiffs(
        pdbSet, pdbList, True)

    if reduce:
        # Earlier experiments cut on bands of BGridDistance; the only cut
        # currently active is on the size of the difference
        fakeCsv = fakeCsv.query('Difference <= 0.05')
    print(fakeCsv)

    fakeCol = 'jet_r'  #'GnBu'
    georep = psu.GeoReport([],
                           pdbDataPath,
                           edDataPath,
                           printPath,
                           ed=False,
                           dssp=False,
                           includePdbs=False,
                           keepDisordered=False)

    # --- plots over the whole fake data set ---
    georep.addHistogram(data=fakeCsv,
                        geoX='Difference',
                        title='Fake PDB structures, Occ=1, BFact=2',
                        count=True,
                        hue='pdbCode',
                        palette='LightSeaGreen')

    # (geoX, geoY, hue, categorical, palette) in report order
    scatterSpecs = [
        ('Difference', 'pdbCode', 'Width', False, fakeCol),
        ('Difference', 'Width', 'pdbCode', True, 'tab20'),
        ('Difference', 'GridDistance', 'BGridDistance', False, fakeCol),
        ('Difference', 'BGridDistance', 'GridDistance', False, fakeCol),
        ('GridDistance', 'BGridDistance', 'Difference', False, fakeCol),
    ]
    for xCol, yCol, hueCol, isCategorical, colours in scatterSpecs:
        georep.addScatter(data=fakeCsv,
                          geoX=xCol,
                          geoY=yCol,
                          hue=hueCol,
                          categorical=isCategorical,
                          palette=colours,
                          sort='RANDOM',
                          title='')

    probabilityPairs = [
        ('Difference', 'GridDistance'),
        ('Difference', 'BGridDistance'),
        ('GridDistance', 'BGridDistance'),
    ]
    for xCol, yCol in probabilityPairs:
        georep.addProbability(data=fakeCsv,
                              geoX=xCol,
                              geoY=yCol,
                              palette='cubehelix_r')

    # --- plots restricted to one structure at a time ---
    for pdb in pdbList:
        print(pdb)
        onlyThisPdb = {'pdbCode': pdb}
        for histHue in ('AtomNo', 'Reason'):
            georep.addHistogram(data=fakeCsv,
                                geoX='Difference',
                                title='Fake Density, Occ=1, BFact=10',
                                hue=histHue,
                                count=True,
                                restrictions={'pdbCode': pdb},
                                palette='LightSeaGreen')
        georep.addScatter(data=fakeCsv,
                          geoX='AtomNo',
                          geoY='Difference',
                          hue='AtomType',
                          categorical=True,
                          restrictions=onlyThisPdb,
                          palette='tab20',
                          sort='RANDOM',
                          title='')

    georep.printToHtml('Maxima differences, set=' + pdbSet, 3,
                       'Maxima_' + pdbSet + tag)
# Extra columns requested alongside the geometry measures
hueList = ['dssp', 'aa', 'rid', 'bfactor']
# Amino acids to report on; one set of plots is produced per entry
aas = ['GLY']
###################################################################################
# Default (Linux) data locations ...
pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/Paper02/'
# ... replaced by the F: drive locations when the myWindowsLaptop flag is set
if myWindowsLaptop:
    pdbDataPath = 'F:/Code/ProteinDataFiles/pdb_data/'
    edDataPath = 'F:/Code/ProteinDataFiles/ccp4_data/'
    printPath = 'F:/Code/ProteinDataFiles/results_psu/Paper02/'
###########################################################################################
georep = psu.GeoReport(pdbList1000, pdbDataPath, edDataPath, printPath, ed=False, dssp=False, keepDisordered=keepDisordered, includePdbs=False)
data = georep.getGeoemtryCsv(geoList, hueList, bfactorFactor)
# Disabled TAU range filter, kept for reference
#data = data.query('TAU > 100')
#data = data.query('TAU < 125')
# Subset with PSI strictly between -50 and 50
dataPsiRange = data.query('PSI > -50')
dataPsiRange = dataPsiRange.query('PSI < 50')
for aa in aas:
    # The same amino-acid filter is applied to the full data and to the PSI subset
    sql = 'aa == "' + aa + '"'
    dataaa = data.query(sql)
    dataPsiRangeaa = dataPsiRange.query(sql)
    # NOTE(review): this call is cut off at this point in the file
    georep.addScatter(data=dataaa,
def compareAtomsPdbAdjusted(dataCombined, geos, pdbSet, tag):
    """Write the HTML report comparing original and adjusted values per geo.

    For every geo a '<geo>_Diff' column (original minus adjusted) is added to
    dataCombined in place, followed by histograms, hex-bin counts and scatter
    plots of the '<geo>_Orig' and '<geo>_Adj' columns.

    Args:
        dataCombined: dataframe holding '<geo>_Orig' and '<geo>_Adj' columns
            for each geo, plus the 'pdbCode', 'RES' and 'SOFTWARE' columns
            used by the plots. Modified in place.
        geos: geometry measure names to compare.
        pdbSet: name of the pdb set; used in the path, title and file name.
        tag: suffix appended to the output file name.
    """
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataM/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_out/' + pdbSet
    georep = psu.GeoReport([],
                           pdbDataPath,
                           edDataPath,
                           printPath,
                           ed=False,
                           dssp=False,
                           includePdbs=False,
                           keepDisordered=False)

    # (hue, palette, sort, categorical, title prefix) for the Orig-vs-Adj scatters
    scatterVariants = [
        ('SOFTWARE', 'jet_r', 'RANDOM', True, 'Software and atom positions '),
        ('RES', 'viridis_r', 'DESC', False, 'Resolution and atom positions '),
    ]

    for geo in geos:
        origCol = geo + '_Orig'
        adjCol = geo + '_Adj'
        diffCol = geo + '_Diff'
        countTitle = 'Count ' + geo

        dataCombined[diffCol] = dataCombined[origCol] - dataCombined[adjCol]

        for histCol, histTitle in ((origCol, 'Pdb Atoms'),
                                   (adjCol, 'Adjusted Atoms')):
            georep.addHistogram(data=dataCombined,
                                geoX=histCol,
                                title=histTitle,
                                count=True,
                                hue='pdbCode')

        # Scatter plots of the difference against RES / SOFTWARE were tried
        # here previously and are disabled; only the hex-bin count remains
        georep.addHexBins(data=dataCombined,
                          geoX=diffCol,
                          geoY='RES',
                          title=countTitle,
                          hue='count',
                          palette='cubehelix_r')

        for hueCol, colours, ordering, isCategorical, titlePrefix in scatterVariants:
            georep.addScatter(data=dataCombined,
                              geoX=origCol,
                              geoY=adjCol,
                              hue=hueCol,
                              palette=colours,
                              sort=ordering,
                              categorical=isCategorical,
                              title=titlePrefix + geo)

        georep.addHexBins(data=dataCombined,
                          geoX=origCol,
                          geoY=adjCol,
                          title=countTitle,
                          hue='count',
                          palette='cubehelix_r')

    # Per-pdb histograms and scatters of the maxima differences used to follow
    # here; they are disabled
    georep.printToHtml(
        'Comparing atom positions: PDB vs maxima, set=' + pdbSet, 3,
        'Compare_' + pdbSet + tag)
def getCsv(pdbSet,
           pdbListIn,
           geos,
           badAtoms,
           reloadPdb,
           reloadCsv,
           aa='ALL',
           includeCis=False,
           allAtoms=False,
           bFactorFactor=1.3,
           cutoff=0):
    """Return the geometry dataframe for a pdb set, recalculated or from csv.

    When reloadCsv is truthy the geometry is recalculated through
    psu.GeoReport for every entry of pdbListIn whose 'pdb<code>.ent' file
    exists in the pdb data directory; otherwise the previously saved
    'Data_DefensibleWithGeosALL_<pdbSet>.csv' is read.

    Args:
        pdbSet: name of the pdb set; 'PDB' selects the pdb_data directory,
            any other value selects pdb_out/<pdbSet>/.
        pdbListIn: pdb codes to include when recalculating.
        geos: geometry measures to calculate. When includeCis is set while
            recalculating, 'CA-1:C-1:N:CA' is added to this list in place
            (once) because the cis relabelling below needs that column.
        badAtoms: passed through to GeoPdbs.
        reloadPdb: when truthy the GeoPdbs manager is cleared and recreated.
        reloadCsv: recalculate (truthy) or load the saved csv (falsy).
        aa: amino acid to restrict to, or 'ALL' for no restriction.
        includeCis: relabel the 'aa' column through applyCis using the
            'CA-1:C-1:N:CA' column.
        allAtoms: passed to GeoReport (keepDisordered) and getGeoemtryCsv.
        bFactorFactor: passed to getGeoemtryCsv.
        cutoff: currently unused; kept for backward compatibility.

    Returns:
        The dataframe, filtered to `aa` unless aa == 'ALL'.
    """
    import os.path

    print('Getting CSV for', pdbSet)
    pdbDataPath = rootPath + '/ProteinDataFiles/pdb_out/' + pdbSet + '/'
    if pdbSet == 'PDB':
        pdbDataPath = rootPath + '/ProteinDataFiles/pdb_data/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    loadPath = rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataB/'
    printPath = rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataK/'
    fileName = 'Data_DefensibleWithGeosALL_' + pdbSet + '.csv'

    if reloadCsv:
        from PsuGeometry import GeoPdb as geopdb
        pdbmanager = geopdb.GeoPdbs(pdbDataPath, edDataPath, False, False,
                                    False, badAtoms)
        if reloadPdb:
            pdbmanager.clear()
            pdbmanager = geopdb.GeoPdbs(pdbDataPath, edDataPath, False, False,
                                        False, badAtoms)

        pdbList = []
        for pdb in pdbListIn:
            # Only the pdb code is lower-cased. Lower-casing the whole path
            # also rewrites the directory names, which fails on
            # case-sensitive filesystems.
            pdbCode = pdb.lower()
            filePdb = pdbDataPath + 'pdb' + pdbCode + '.ent'
            if os.path.isfile(filePdb):
                pdbList.append(pdbCode)
            else:
                print('No file:', pdbDataPath, pdb)
        pdbList.sort()

        hueList = [
            'aa', 'rid', 'bfactor', 'pdbCode', 'bfactorRatio', 'disordered'
        ]
        georep = psu.GeoReport(pdbList,
                               pdbDataPath,
                               edDataPath,
                               printPath,
                               ed=False,
                               dssp=False,
                               includePdbs=False,
                               keepDisordered=allAtoms)
        # Guard against adding the omega measure again when the caller reuses
        # the same geos list across several calls.
        if includeCis and 'CA-1:C-1:N:CA' not in geos:
            geos.append('CA-1:C-1:N:CA')
        print('geoList', geos)
        dataBest = georep.getGeoemtryCsv(geos,
                                         hueList,
                                         bFactorFactor,
                                         allAtoms=allAtoms,
                                         restrictedAa=aa)
        try:
            dataBest['rid'] = dataBest['rid'].astype(str)
            dataBest['ID'] = dataBest['pdbCode'] + dataBest[
                'chain'] + dataBest['rid'] + dataBest['aa']
        except Exception as err:
            # Best effort, as before: report and carry on. Unlike a bare
            # except this lets KeyboardInterrupt/SystemExit through.
            print('empty csv', err)
    else:
        dataBest = pd.read_csv(loadPath + fileName)

    if includeCis:
        dataBest['aa'] = dataBest.apply(
            lambda row: applyCis(row['aa'], row['CA-1:C-1:N:CA']), axis=1)
    if aa != 'ALL':
        dataBest = dataBest.query('aa == "' + aa + '"')
    return dataBest
###################################################################################
# Data locations and DSSP availability depend on the machine the script runs on
if myWindowsLaptop:
    pdbDataPath = 'F:/Code/ProteinDataFiles/pdb_data/'
    edDataPath = 'F:/Code/ProteinDataFiles/ccp4_data/'
    printPath = 'F:/Code/ProteinDataFiles/results_psu/Paper02/'
    includeDSSP = False  # on my windows computer
else:
    pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/Paper02/'
    includeDSSP = True
###########################################################################################
georep = psu.GeoReport(pdbList1000,
                       pdbDataPath,
                       edDataPath,
                       printPath,
                       ed=False,
                       dssp=includeDSSP,
                       includePdbs=False)
data = georep.getGeoemtryCsv(geoList, hueList)
#datacorr = data.corr()
#sns.heatmap(datacorr, annot=True, cmap="vlag", vmin=-1, vmax=1)
#plt.show()

#Clean the data: remove the identifier columns one at a time, in this order
for column in ['pdbCode', 'chain', 'rid', 'aa', 'ridx', 'atomNo']:
    data = data.drop(column, axis=1)
geoLists.append(['5HB', ['N:O-2','C:O-2','N:CA:C:O-2','N:CA:N+1:O-2']])
# Hydrogen bond distances and dihedrals nearest O
geoLists.append(['6HBO', ['N:{O}','C:{O}','N:CA:C:{O}','N:CA:N+1:{O}']])
# Water
geoLists.append(['7WAT', ['N:HOH','C:HOH','N:CA:C:HOH','N:CA:N+1:HOH']])
# Other!
geoLists.append(['8XTRA', ['N:HETATM']])

hueList = ['aa', 'rid', 'bfactor','pdbCode','bfactorRatio','disordered','dssp']
aas = ['ALL']

print('Creating CSV files anew')
# Each geoLists entry is [set name, list of geometry measures]; one csv named
# CsvGeos_Set<set name><aa>.csv is written per entry and amino acid
for geoListT in geoLists:
    geoList = geoListT[1]
    # Named setName rather than `set` so the builtin set() is not shadowed
    setName = geoListT[0]
    for aa in aas:
        tag = 'Set' + setName + aa
        georep = psu.GeoReport(pdbList, pdbDataPath, edDataPath, printPath, ed=False, dssp=includeDSSP, includePdbs=False,keepDisordered=True)
        print('Create unrestricted csv', geoList)
        dataUnrestricted = georep.getGeoemtryCsv(geoList, hueList, -1,allAtoms=True,restrictedAa=aa)
        dataUnrestricted.to_csv(printPath + 'CsvGeos_' + tag + '.csv', index=False)

print('----------Finished----------')
# Report the elapsed wall-clock time since startx as "<minutes>m <seconds>s"
endx = time.time()
time_diff = endx - startx
timestring = str(int(time_diff / 60)) + "m " + str(int(time_diff % 60)) + "s"
print(timestring)
def _ehTitle(label, geo, ehRows, ehGeo, compareSets):
    """Build the multi-line histogram title for one residue group.

    One line is produced per entry of compareSets, showing the mean (rounded
    to 3 decimals) and the standard deviation read from the first matching
    row of ehRows.
    """
    title = ''
    for comp in compareSets:
        ehSet = ehRows.query("EH_SET == '" + comp + "'")
        mean = round(ehSet[ehGeo].values[0], 3)
        sd = ehSet[ehGeo + '_SD'].values[0]
        title += label + ' ' + geo + ' ' + comp + ' Mean=' + str(
            mean) + ' (' + str(sd) + ')\n'
    return title


def EHCompare(pdbSet):
    """Write the HTML report comparing a pdb set with the Engh & Huber values.

    Reads 'Data_EH.csv' and 'Data_DefensibleWithGeosALL_<pdbSet>.csv', then
    for every geometry measure adds four histograms of the pdb-set data
    (all residues except GLY/PRO, GLY, trans PRO, cis PRO). Each histogram
    title lists the 1991 and 2001 E&H mean and standard deviation for the
    matching group ('ALL', 'GLY', 'PRO', 'CIS').

    Args:
        pdbSet: name of the pdb set; used in the paths, the report title and
            the output file name.
    """
    import matplotlib.pyplot as plt
    plt.close('all')
    plt.clf()
    plt.cla()

    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_out/' + pdbSet + '/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    loadPathEH = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/Data/'
    loadPathCsv = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataB/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataC/'
    #EH_SET,aa,N:CA,N:CA_SD,CA:C,CA:C_SD,C:O,C:O_SD,C:N+1,C:N+1_SD,N:CA:C,N:CA:C_SD,CA:C:N+1,CA:C:N+1_SD,CA:C:O,CA:C:O_SD,O:C:N+1,O:C:N+1_SD,C-1:N:CA,C-1:N:CA_SD
    EHFileName = 'Data_EH.csv'
    BestFileName = 'Data_DefensibleWithGeosALL_' + pdbSet + '.csv'
    dataEH = pd.read_csv(loadPathEH + EHFileName)
    dataBest = pd.read_csv(loadPathCsv + BestFileName)

    geos = [
        'N:CA', 'CA:C', 'C:O', 'C:N+1', 'TAU', 'CA:C:N+1', 'CA:C:O', 'O:C:N+1',
        'C-1:N:CA'
    ]
    #geos = ['N:CA','CA:C','C:O','C:N+1','N:CA:C']
    compareSets = ['1991', '2001']

    #specifically looking at the mean and sd of the parameters in comparison to EH
    print(dataEH)
    # A per-amino-acid variant of this report used to be parked here as
    # disabled code; only the summary report is produced.
    georepSummary = psu.GeoReport([],
                                  pdbDataPath,
                                  edDataPath,
                                  printPath,
                                  ed=False,
                                  dssp=False,
                                  includePdbs=False,
                                  keepDisordered=False)

    # None of the subsets below depend on the geo, so they are built once
    # instead of once per loop iteration.
    #Cut on ALL PRO and GLY
    ehALL = dataEH.query("aa == 'ALL'")
    ehGLY = dataEH.query("aa == 'GLY'")
    ehPRO = dataEH.query("aa == 'PRO'")
    ehCIS = dataEH.query("aa == 'CIS'")
    bestALLCut = dataBest.query("aa != 'GLY'")
    bestALLCut = bestALLCut.query("aa != 'PRO'")
    bestGLYCut = dataBest.query("aa == 'GLY'")
    # .copy() so that adding the ABSOMEGA column below writes to an
    # independent frame rather than to a query result (SettingWithCopy)
    bestPROCut = dataBest.query("aa == 'PRO'").copy()
    print(pdbSet, bestPROCut)
    # Split proline on |omega|: below 120 is treated as cis, otherwise trans
    bestPROCut['ABSOMEGA'] = abs(bestPROCut['CA-1:C-1:N:CA'])
    bestPROCis = bestPROCut.query("ABSOMEGA < 120")
    bestPROTrans = bestPROCut.query("ABSOMEGA >= 120")

    for geo in geos:
        # The pdb-set csv names the N:CA:C angle 'TAU'
        bestGeo = 'TAU' if geo == 'N:CA:C' else geo
        # NOTE(review): the column list above shows the E&H file using
        # 'N:CA:C'; fall back to that name when it has no 'TAU' column so
        # the lookup does not raise KeyError - confirm against Data_EH.csv
        ehGeo = geo
        if geo == 'TAU' and 'TAU' not in dataEH.columns:
            ehGeo = 'N:CA:C'

        titleALL = _ehTitle('ALL', geo, ehALL, ehGeo, compareSets)
        titleGLY = _ehTitle('GLY', geo, ehGLY, ehGeo, compareSets)
        titlePRO = _ehTitle('PRO', geo, ehPRO, ehGeo, compareSets)
        titleCIS = _ehTitle('CIS', geo, ehCIS, ehGeo, compareSets)
        print(titleGLY, titlePRO, titleALL, titleCIS)

        georepSummary.addHistogram(data=bestALLCut,
                                   geoX=bestGeo,
                                   title=titleALL)
        georepSummary.addHistogram(data=bestGLYCut,
                                   geoX=bestGeo,
                                   title=titleGLY)
        georepSummary.addHistogram(data=bestPROTrans,
                                   geoX=bestGeo,
                                   title=titlePRO)
        georepSummary.addHistogram(data=bestPROCis,
                                   geoX=bestGeo,
                                   title=titleCIS)

    georepSummary.printToHtml(
        'Best Supported Engh&Huber Compare, set=' + pdbSet, 4,
        'Defensible_EH_' + pdbSet)
# -- ©Rachel Alcraft 2020, PsuGeometry -- from PsuGeometry import GeoReport as geor ''' This script runs a correlation report on a few structures to demonstrate the use of 2Fo-Fc as a hue The data is precalculated into a datafame ''' pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/' edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/' printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/Levels/' pdbList = ['1ejg', '1us0', '1tt8', '1i1w', '1ucs', '6jvv', '5nqo'] georep = geor.GeoReport(pdbList, pdbDataPath, edDataPath, printPath, ed=True, dssp=True) geoList = [ 'PHI', 'PSI', 'TAU', 'C-1:C', 'C-1:N:CA', 'CHI1', 'CA-1:CA', 'OMEGA', 'CA:CA+1', 'C-1:N:CA:C' ] hueList = ['aa', 'bfactor', 'rid', 'resolution', 'pdbCode', '2FoFc', 'dssp'] data = georep.getGeoemtryCsv(geoList, hueList) georep.addScatter(data=data, geoX='PHI', geoY='PSI', hue='2FoFc',