예제 #1
0
def statsSummary(pdbSet, data, geos,tag):
    """Write an HTML report with one stats-summary panel per geometric measure.

    Args:
        pdbSet: Name of the PDB set. It selects the ``pdb_out`` sub-directory
            and appears in the panel descriptions, the page title and the
            output file name.
        data: Data handed unchanged to ``GeoReport.addStatsSummary``.
        geos: Iterable of geometric measure names; one panel is added per
            entry, plotted against 'aa' and coloured by 'ID'.
        tag: Suffix appended to the output file name.
    """
    import matplotlib.pyplot as plt
    # Reset matplotlib's global figure state before building the report.
    plt.close('all')
    plt.clf()
    plt.cla()

    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_out/' + pdbSet + '/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataK/'

    # No structures are loaded: the report only renders the data passed in.
    georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath, ed=False, dssp=False, includePdbs=False, keepDisordered=False)

    for geo in geos:
        georep.addStatsSummary(data=data, desc=geo + ' ' + pdbSet, geoX=geo, geoY='aa', hue='ID')

    georep.printToHtml('Stats Summary , set=' + pdbSet, 2, 'StatsSummary_' + pdbSet + tag)
예제 #2
0
def diffHistograms(pdbList, tag):
    """Build the 'Difference Histograms' HTML report from per-PDB CSV files.

    For every code in ``pdbList`` the file ``<code>_DiffHistogram.csv`` is read
    from the ``ccp4_out`` directory. The frames are stacked into one table,
    which is rendered as two histograms and two scatter plots.

    Args:
        pdbList: Iterable of PDB codes (strings).
        tag: Suffix appended to the output file name ``DiffHist_``.
    """
    root = help.rootPath
    pdbDataPath = root + '/ProteinDataFiles/pdb_out/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    loadPath = root + '/ProteinDataFiles/ccp4_out/'
    printPath = root + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataN/'

    # One frame per structure, stacked row-wise into a single table.
    frames = [pd.read_csv(loadPath + code + '_DiffHistogram.csv') for code in pdbList]
    realCsv = pd.concat(frames, axis=0, sort=False)

    georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath,
                           ed=False, dssp=False, includePdbs=False, keepDisordered=False)

    # Both histograms count rows and are coloured per structure.
    histCommon = dict(data=realCsv, count=True, hue='PdbCode')
    georep.addHistogram(geoX='Percent', title='Difference Histograms - %', **histCommon)
    georep.addHistogram(geoX='Diff', title='Difference Histograms - diff',
                        palette='LightSeaGreen', **histCommon)

    # Both scatters share the y-axis, the sort mode and the title.
    scatterCommon = dict(data=realCsv, geoY='Percent', sort='RANDOM', title='Differences')
    georep.addScatter(geoX='Main', hue='Diff', palette='jet', categorical=False, **scatterCommon)
    georep.addScatter(geoX='Diff', hue='PdbCode', palette='jet_r', categorical=True, **scatterCommon)

    georep.printToHtml('Difference Histograms', 2, 'DiffHist_' + tag)
예제 #3
0
def scatterReports(pdbSet, data, trios, perAA=True, tag=''):
    """Write an HTML page of scatter plots, one per (x, y, hue) trio.

    Args:
        pdbSet: Set name, shown in the page title.
        data: DataFrame-like table with an 'aa' column plus the columns named
            in ``trios``.
        trios: Iterable of sequences ``[geoX, geoY, hue]``.
        perAA: When true, each trio is plotted once per distinct 'aa' value
            (in sorted order); otherwise once for the whole table.
        tag: Suffix of the output file name ``Defensible_Scatters_``.
    """
    import matplotlib.pyplot as plt
    # Reset matplotlib's global figure state before building the report.
    plt.close('all')
    plt.clf()
    plt.cla()

    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_data/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataI/'

    # Distinct residue names present in the data, alphabetically.
    residueNames = sorted(set(data['aa'].values))

    georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath,
                           ed=False, dssp=False, includePdbs=False, keepDisordered=False)

    def addPanel(frame, titlePrefix, trio):
        # One scatter of trio[0] against trio[1], coloured by trio[2].
        georep.addScatter(data=frame,
                          geoX=trio[0],
                          geoY=trio[1],
                          hue=trio[2],
                          title=titlePrefix + trio[0] + ':' + trio[1],
                          palette='jet',
                          sort='NON')

    for trio in trios:
        if not perAA:
            addPanel(data, '', trio)
            continue
        for aa in residueNames:
            addPanel(data.query("aa ==  '" + aa + "'"), aa + ':', trio)

    georep.printToHtml('Scatters , set=' + pdbSet, 4,
                       'Defensible_Scatters_' + tag)
예제 #4
0
def makeSlicesHtmlFromValues(mainTitle, dirName, lineRuns,row,tag):
    """Render previously exported density slices into one HTML page.

    The slice values were produced by density flight and pasted into text
    files; this function only loads and plots them.

    Args:
        mainTitle: Page title.
        dirName: Directory prefix prepended to every file name in ``lineRuns``.
        lineRuns: Iterable of sequences
            ``[title, palette, valuesFileName, positionsFileName]``.
        row: Second argument of ``GeoReport.printToHtml``.
        tag: Third argument of ``GeoReport.printToHtml``.
    """
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/SlicesI/'
    georep = psu.GeoReport([], "", "", printPath, ed=False, dssp=False)

    for run in lineRuns:
        sliceTitle, slicePalette, valuesFile, positionsFile = run[0], run[1], run[2], run[3]
        sliceValues = georep.loadSlice(dirName + valuesFile)
        slicePositions = georep.loadSlice(dirName + positionsFile)
        georep.addSlice(sliceValues,
                        palette=slicePalette,
                        title=sliceTitle,
                        YellowDots=slicePositions,
                        Contour=True)

    georep.printToHtml(mainTitle,row,tag)
예제 #5
0
 def __init__(self,plotA, plotB, title,report):
     """Store a pair of plots, building the first one when title is 'ghost'.

     For any title other than 'ghost' the two plots are stored as given.
     For 'ghost', only the main plot is supplied (as ``plotA``); it becomes
     ``self.plotB`` and a matching grey companion plot is generated from a
     'ghost' report and stored as ``self.plotA``.
     """
     self.title = title
     if title != 'ghost':
         self.plotA = plotA
         self.plotB = plotB
         return

     # Only the main plot exists, so create the dummy companion plot.
     from PsuGeometry import GeoReport as geor
     self.plotB = plotA
     mainPlot = plotA
     ghostReport = geor.GeoReport(['ghost'],
                                  report.pdbDataPath,
                                  report.edDataPath,
                                  report.outDataPath,
                                  report.ed,
                                  report.dssp)
     # Request the same measures the main plot uses (geoY may be empty).
     geoList = [mainPlot.geoX]
     if mainPlot.geoY != '':
         geoList.append(mainPlot.geoY)
     ghostdata = ghostReport.getGeoemtryCsv(geoList, ['pdbCode'])
     self.plotA = GeoPlot(ghostdata,
                          mainPlot.geoX,
                          geoY=mainPlot.geoY,
                          title='ghost',
                          hue='pdbCode',
                          palette='Greys',
                          plot=mainPlot.plot,
                          operation=mainPlot.operation,
                          report=report)
예제 #6
0
def makeCsv(pdbSet, pdbListIn, geos, badAtoms, disordered):
    """Collect the requested geometry for every locally available structure.

    Args:
        pdbSet: Set name. 'ADJUSTEDDEN' and 'ADJUSTEDLAP' switch the input
            directory to ``filesDenRoot`` / ``filesLapRoot``; any other value
            reads from ``filesPDBRoot``.
        pdbListIn: Iterable of PDB codes to consider. Codes whose
            ``pdb<code>.ent`` file is missing are reported and skipped.
        geos: Geometric measures passed to ``GeoReport.getGeoemtryCsv``.
        badAtoms: Forwarded to ``GeoPdbs``.
        disordered: Forwarded to ``GeoPdbs`` and used as ``keepDisordered``.

    Returns:
        The result of ``getGeoemtryCsv``. When the expected columns exist,
        'rid' is cast to str and an 'ID' column (pdbCode + chain + rid + aa)
        is added; otherwise 'empty csv' is printed and the result is returned
        as it came.
    """
    import os.path

    print('Getting CSV for', pdbSet)
    pdbDataPath = filesPDBRoot
    if pdbSet == 'ADJUSTEDDEN':
        pdbDataPath = filesDenRoot
    if pdbSet == 'ADJUSTEDLAP':
        pdbDataPath = filesLapRoot

    from PsuGeometry import GeoPdb as geopdb
    # NOTE(review): the manager is built, cleared and built again; presumably
    # GeoPdbs keeps shared state that clear() resets - confirm before simplifying.
    pdbmanager = geopdb.GeoPdbs(pdbDataPath, edDataPath, False, False,
                                disordered, badAtoms)
    pdbmanager.clear()
    pdbmanager = geopdb.GeoPdbs(pdbDataPath, edDataPath, False, False,
                                disordered, badAtoms)

    # Only keep codes with a local file, so nothing is fetched from an
    # unexpected location.
    pdbList = []
    for pdb in pdbListIn:
        filePdb = pdbDataPath + 'pdb' + pdb + '.ent'
        if os.path.isfile(filePdb.lower()):
            pdbList.append(pdb.lower())
        else:
            print('No file:', pdbDataPath, pdb)
    pdbList.sort()

    hueList = [
        'aa', 'rid', 'bfactor', 'pdbCode', 'bfactorRatio', 'disordered',
        'occupancy'
    ]
    georep = psu.GeoReport(pdbList,
                           pdbDataPath,
                           edDataPath,
                           printPath,
                           ed=False,
                           dssp=False,
                           includePdbs=False,
                           keepDisordered=disordered)
    print('geoList', geos)
    dataBest = georep.getGeoemtryCsv(geos, hueList, -1, allAtoms=True)
    try:
        dataBest['rid'] = dataBest['rid'].astype(str)
        dataBest['ID'] = dataBest['pdbCode'] + dataBest['chain'] + dataBest[
            'rid'] + dataBest['aa']
    except Exception:
        # Best effort: an empty result lacks these columns. Catching Exception
        # rather than everything lets Ctrl-C and SystemExit propagate.
        print('empty csv')
    return dataBest
예제 #7
0
def clusterEdTauMaker(pdbCode, rid, chain, aa):
    """Return a CSV row with the CA, N and C coordinates of one residue.

    The row is ``pdbCode,<chain><rid>`` followed by the x, y, z coordinates
    (rounded to 3 decimals) of the CA, N and C atoms, in that order.

    Args:
        pdbCode: PDB code of the structure to load.
        rid: Residue number.
        chain: Chain identifier (string).
        aa: Not used by this function.

    Returns:
        The comma-separated row, or None when any of the three atoms is
        missing for that residue.
    """
    from PsuGeometry import GeoReport as psu
    from PsuGeometry import GeoPdb as geopdb

    pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    printPath = 'F:/Code/BbkProject/PhDThesis/0.Papers/1.TauCorrelations/Data/BestSupportedCSVs/Reports/'

    # The report object is not used below; it is still constructed as before.
    georep = psu.GeoReport([pdbCode], pdbDataPath, edDataPath, printPath,
                           ed=False, dssp=False)
    pdbmanager = geopdb.GeoPdbs(pdbDataPath, edDataPath, ed=False, dssp=False)
    pdbcsv = pdbmanager.getPdb(pdbCode, True).getDataFrame()

    # One row selection per backbone atom of the requested residue.
    atomRows = [
        pdbcsv.query('rid==' + str(rid) + ' and chain=="' + chain + '"' +
                     ' and atom=="' + atomName + '"')
        for atomName in ('CA', 'N', 'C')
    ]
    if any(len(rows) == 0 for rows in atomRows):
        return None

    coords = [
        str(round(rows[axis].values[0], 3))
        for rows in atomRows
        for axis in ('x', 'y', 'z')
    ]
    return ",".join([pdbCode, chain + str(rid)] + coords)
예제 #8
0
def evidenceReports(pdbSet,  fourSetNames, dataA, dataB, dataC, dataD ,trios, title,perAA=True, tag=''):
    """Write an HTML page comparing four data sets side by side.

    Each entry of ``trios`` yields four panels (A, B, C, D in that order):
    scatter plots when the entry has exactly three items (x, y, hue),
    otherwise histograms of its first item coloured by 'ID'.

    Args:
        pdbSet: Set name, used as prefix of the output file name.
        fourSetNames: Names of the four sets, used in histogram titles.
        dataA, dataB, dataC, dataD: DataFrame-like tables with an 'aa' column.
        trios: Iterable of sequences describing the panels.
        title: Page title.
        perAA: When true, panels are produced per distinct 'aa' value found
            in ``dataA`` (sorted); otherwise once for the whole tables.
        tag: Suffix of the output file name.
    """
    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_data/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataI/'

    residueNames = sorted(set(dataA['aa'].values))

    georep = psu.GeoReport([],pdbDataPath, edDataPath, printPath, ed=False, dssp=False, includePdbs=False,keepDisordered=False)

    wholeSets = (dataA, dataB, dataC, dataD)
    # Fixed captions used for the whole-table scatter plots.
    wholeSetLabels = (' Unrestricted', ' Restricted', ' Restricted+cut', ' Adjusted')

    def addHistograms(frames, trio):
        # One histogram per set, titled with the set's name.
        for idx, frame in enumerate(frames):
            georep.addHistogram(data=frame, geoX=trio[0],
                                title=fourSetNames[idx] + ' ' + trio[0], hue='ID')

    for trio in trios:
        if perAA:
            for aa in residueNames:
                aaFilter = "aa ==  '" + aa + "'"
                # Cut all four sets before adding any panel.
                cuts = [frame.query(aaFilter) for frame in wholeSets]
                if len(trio) == 3:
                    for cut in cuts:
                        georep.addScatter(data=cut, geoX=trio[0], geoY=trio[1], hue=trio[2],
                                          title=aa + ':' + trio[0] + ':' + trio[1],
                                          palette='jet', sort='NON')
                else:
                    addHistograms(cuts, trio)
        elif len(trio) == 3:
            for frame, label in zip(wholeSets, wholeSetLabels):
                georep.addScatter(data=frame, geoX=trio[0], geoY=trio[1], hue=trio[2],
                                  title=trio[0] + '|' + trio[1] + '|' + trio[2] + label,
                                  palette='jet', sort='NON')
        else:
            addHistograms(wholeSets, trio)

    georep.printToHtml(title, 4, pdbSet + '_Defensible' + tag)
예제 #9
0
def compareSets(tag):
    """Write the 'Compare_EH_Sets' HTML page from a merged summary CSV.

    Reads ``<tag>Data_SetsSummaryMerged.csv`` and, for every geometric
    measure, adds three scatters of mean against set (rows with aa 'ALL',
    'GLY' and 'PRO') followed by three per-residue scatters (count, mean and
    sd against aa, restricted to rows with count > 0 and aa other than 'ALL').

    Args:
        tag: Prefix of both the input CSV name and the output file name.
    """
    import matplotlib.pyplot as plt
    # Reset matplotlib's global figure state before building the report.
    plt.close('all')
    plt.clf()
    plt.cla()

    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_data/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    loadPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataD/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataE/'

    # NOTE(review): 'CA:C:N+1' is listed twice, so its six panels are emitted
    # twice - confirm whether another measure was intended.
    geos = ['N:CA', 'CA:C', 'C:O', 'C:N+1', 'TAU', 'C-1:N:CA', 'CA:C:N+1', 'CA:C:O', 'O:C:N+1', 'CA:C:N+1']
    fileName = tag + 'Data_SetsSummaryMerged.csv'
    data = pd.read_csv(loadPath + fileName)

    georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath, ed=False, dssp=False, includePdbs=False, keepDisordered=False)

    for geo in geos:
        dataCut = data.query('geo == "' + geo + '"')
        dataCutCount = dataCut.query('count > 0').query('aa != "ALL"')
        dataCutAll = dataCut.query('aa == "ALL"')
        dataCutPRO = dataCut.query('aa == "PRO"')
        dataCutGLY = dataCut.query('aa == "GLY"')

        # Mean per set, coloured by standard deviation.
        for frame, label in ((dataCutAll, ' ALL (exc gly/pro)'),
                             (dataCutGLY, ' GLY'),
                             (dataCutPRO, ' PRO')):
            georep.addScatter(data=frame, geoX='mean', geoY='set', hue='sd', title=geo + label, palette='jet', categorical=False, sort='NON')

        # Per-residue statistics, coloured by set.
        for column, label in (('count', ' Best Supported counts per aa'),
                              ('mean', ' Best Supported means per aa'),
                              ('sd', ' Best Supported sd per aa')):
            georep.addScatter(data=dataCutCount, geoX=column, geoY='aa', hue='set', title=geo + label, palette='jet_r', categorical=True, sort='NON')

    georep.printToHtml('Best Supported and Engh&Huber Compare', 3, tag + 'Compare_EH_Sets')
# Input and output locations for the density reports.
pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/density/'

# The structure list is split into batches for memory purposes; swap the
# active assignment for one of the commented ones to run another batch.
#pdbList = ['1ejg','1us0','1tt8','1i1w','1ucs','1yk4','1yk4','1hje','1r6j']
#pdbList = ['2bw4','3nir','3x2m','2VB1','3A39','2b97','2OV0','2WFI']
#pdbList = ['4ZM7','4REK','4ZM7','5D8V','5NW3','5qkw']
#pdbList = ['1ejg','1us0','1tt8','1i1w','1ucs','1yk4','1yk4','1hje','1r6j','2bw4','3nir','3x2m','2VB1','3A39','2b97','2OV0','2WFI','3o4p','1pjx']
#pdbList = ['4ZM7','4REK','4ZM7','5D8V','5NW3','5qkw','6jvv','6rr2','6E6O','6S2M','6shk','6fgz','6ctd','6fwf','6q53']
pdbList = ['6jvv', '6rr2', '6E6O', '6S2M', '6shk', '6fgz', '6ctd', '6fwf', '6q53']

# Structures that get a density-peaks report (currently none).
#peaksList=['1ejg','1us0','1tt8','1i1w','1ucs','6jvv','5nqo']
#peaksList=['1i1w','1ucs','5nqo']
peaksList = []  #['1us0','1tt8']

# Structures that get a density-points report.
#pointsList=['6fwf','6q53','6ctd','6fgz','6rr2','6shk','6rr2']
pointsList = ['1ejg']

# One report per structure: peaks first, then points.
for codes, reportName, suffix in ((peaksList, 'Slow_DensityPeaksPerPdb', '_denpk'),
                                  (pointsList, 'Slow_DensityPointsPerPdb', '_denpt')):
    for pdb in codes:
        georep = geor.GeoReport([pdb], pdbDataPath, edDataPath, printPath)
        georep.printReport(reportName, pdb + suffix)
예제 #11
0
    dic = {}

    A1 = randomOnSphere(A1_atom, 0.01)  #7)
    A2 = randomOnSphere(A2_atom, 0.01)  #5)
    A3 = randomOnSphere(A3_atom, 0.01)  #7)

    a1a2a3 = calcs.angle(A1[0], A1[1], A1[2], A2[0], A2[1], A2[2], A3[0],
                         A3[1], A3[2])
    a1a2 = calcs.distance(A1[0], A1[1], A1[2], A2[0], A2[1], A2[2])
    a2a3 = calcs.distance(A2[0], A2[1], A2[2], A3[0], A3[1], A3[2])
    dic['pdbCode'] = 'Iter_' + str(count)
    dic['chain'] = 'A'
    dic['rid'] = 1
    dic['ANGLE'] = a1a2a3
    dic['A1:A2'] = a1a2
    dic['A2:A3'] = a2a3
    vals.append(dic)

# Turn the collected per-iteration records into a table and plot one
# histogram per simulated measure.
dataFrame = pd.DataFrame.from_dict(vals)
georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath,
                       ed=False, dssp=False, includePdbs=False, keepDisordered=False)
for measure in ('ANGLE', 'A1:A2', 'A2:A3'):
    georep.addHistogram(data=dataFrame, geoX=measure, title='')
georep.printToHtml('Simulated Atoms', 3, 'SimReport')
예제 #12
0
import pandas as pd
import Ch000_Functions as help
from PsuGeometry import GeoReport as psu
# Sorted list of the PDB codes to process.
pdblist = help.getPDBList100()
pdblist.sort()
#pdblist = pdblist[:10]

hueList = [
    'aa', 'rid', 'bfactor', 'pdbCode', 'bfactorRatio', 'disordered',
    'occupancy', 'dssp',
]
dsspPrintPath = '../../PdbLists/'

georep = psu.GeoReport(pdblist,
                       help.pdbDataPathLx,
                       help.edDataPath,
                       dsspPrintPath,
                       ed=False,
                       dssp=True,
                       includePdbs=False,
                       keepDisordered=True)

# Only the residue identity and its dssp value are kept.
keepColumns = ['pdbCode','chain','rid','aa','dssp']
datacsv = georep.getGeoemtryCsv(['N:CA'],hueList)[keepColumns]
print(datacsv)
if False:  # switched off so the existing dssp.csv is not replaced by accident
    datacsv.to_csv(dsspPrintPath + 'dssp.csv', index=False)
print(datacsv)
예제 #13
0
# Input and output locations for the per-structure reports.
pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/bad/'

# Geometric measures and hues to extract
geoPsi = ['N:O', 'CB:O', 'N:CA:C:N+1']
geoListMain = ['CA:C', 'N:CA', 'C:O']
hueList = ['dssp', 'aa', 'bfactor', '2FoFc',
           'rid']  # note the hues are the sum of the atoms

pdbList = ['2lc9', '2lcb', '2cnq', '1i1w']  # structures with errors

# A separate report object is built for each structure.
for pdbCode in pdbList:

    georep = psu.GeoReport([pdbCode], pdbDataPath, edDataPath, printPath)

    # NOTE(review): dataPsi is not used in the visible code; only dataMain is
    # plotted below - confirm whether later code relies on it.
    dataPsi = georep.getGeoemtryCsv(geoPsi, hueList)
    dataMain = georep.getGeoemtryCsv(geoListMain, hueList)

    # Create the geoplots
    # NOTE(review): printList is not used in the visible code.
    printList = []
    # These three histograms pass no data argument; presumably the report
    # computes the measure itself - TODO confirm against GeoReport.
    georep.addHistogram(geoX='N:CA', title='N-CA', ghost=True, hue='rid')
    georep.addHistogram(geoX='CA:C', title='C-CA', ghost=True, hue='rid')
    georep.addHistogram(geoX='CA:CA+1', title='CA-CA+1', ghost=True, hue='rid')

    # Same N:CA measure, this time drawn from the extracted table and split
    # by 'pdbCode'.
    georep.addHistogram(data=dataMain,
                        geoX='N:CA',
                        title='N-CA',
                        ghost=True,
                        splitKey='pdbCode')
예제 #14
0
###################################################################################
# Default (Linux) locations; overridden below on the Windows laptop.
pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/Paper02/'
dsspHue='dssp'
includeDSSP = False
if myWindowsLaptop:
    pdbDataPath = 'F:/Code/ProteinDataFiles/pdb_data/'
    edDataPath = 'F:/Code/ProteinDataFiles/ccp4_data/'
    printPath = 'F:/Code/ProteinDataFiles/results_psu/Paper02/'
    includeDSSP = False  # on my windows computer

###########################################################################################

georepData = psu.GeoReport(pdbList1000,
                           pdbDataPath,
                           edDataPath,
                           printPath,
                           ed=False,
                           dssp=includeDSSP,
                           includePdbs=True)

# All requested measures in one list: dihedrals, then distances, then angles.
geoList = []
for group in (dihs, distances, angles):
    for geo in group:
        geoList.append(geo)

# Progress output: one line per structure with its running index.
count  = 0
length = len(pdbList1000)

for pdb in pdbList1000:
    print(pdb,' ', count,'/',length)
    count += 1
예제 #15
0
def maximaCompareReal(pdbSet, pdbList, tag):
    """Write the 'Maxima_' HTML report of maxima differences for a PDB set.

    The table returned by ``help.getMaximaDiffs`` is plotted three times in
    each of three panel groups: unfiltered, restricted to Occupancy == 1 and
    BFactor < 10, and additionally restricted to Difference <= 0.05.

    Args:
        pdbSet: Set name; selects the ``pdb_out`` sub-directory and appears in
            the page title and output file name.
        pdbList: PDB codes, forwarded to ``help.getMaximaDiffs``.
        tag: Suffix of the output file name.
    """
    # Trailing '/' added so the directory matches how the sibling reports
    # build the same path.
    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_out/' + pdbSet + '/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataM/'

    # Only the first of the three returned tables is plotted.
    realCsv, _badRealCsv, _occRealCsv = help.getMaximaDiffs(
        pdbSet, pdbList, False)

    realCutDown = realCsv.query("Occupancy == 1").query("BFactor < 10")
    realCutDown5 = realCutDown.query("Difference <= 0.05")
    realCol = 'brg'  #'RdPu'

    georep = psu.GeoReport([],
                           pdbDataPath,
                           edDataPath,
                           printPath,
                           ed=False,
                           dssp=False,
                           includePdbs=False,
                           keepDisordered=False)

    # Histograms of the difference, coloured per structure.
    for frame, panelTitle in (
            (realCsv, 'PDB structures'),
            (realCutDown, 'PDB structures, Occ=1, BFact<10'),
            (realCutDown5, 'PDB structures, Occ=1, BFact<10, Difference<=0.05')):
        georep.addHistogram(data=frame,
                            geoX='Difference',
                            title=panelTitle,
                            count=True,
                            hue='pdbCode')

    # NOTE(review): the first panel of each scatter group is titled 'Occ=1'
    # but plots the unfiltered table - confirm the intended data or title.
    for frame, panelTitle in ((realCsv, 'Occ=1'),
                              (realCutDown, 'bfactor<=10'),
                              (realCutDown5, 'Diff<=0.05')):
        georep.addScatter(data=frame,
                          geoX='Difference',
                          geoY='BFactor',
                          hue='BGridDistance',
                          palette=realCol,
                          sort='RANDOM',
                          title=panelTitle)

    # Last title previously read 'Diif<=0.05' (typo).
    for frame, panelTitle in ((realCsv, 'Occ=1'),
                              (realCutDown, 'BFactor<=10'),
                              (realCutDown5, 'Diff<=0.05')):
        georep.addScatter(data=frame,
                          geoX='Difference',
                          geoY='Width',
                          hue='BFactor',
                          categorical=False,
                          palette=realCol,
                          sort='RANDOM',
                          title=panelTitle)

    georep.printToHtml('Maxima differences in PDB Structures, set=' + pdbSet,
                       3, 'Maxima_' + pdbSet + tag)
예제 #16
0
# Input and output locations.
pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/'

from PsuGeometry import GeoReport as geor

# The report covers a single structure; a wider alternative list is
# ['2bw4','1ejg','1us0'].
pdbs = ['1ejg']
georep = geor.GeoReport(pdbs,pdbDataPath,edDataPath,printPath,False,False)

# Four probability plots of the same pair of measures, differing only in
# palette and in the residue restriction (the last one is unrestricted).
title = 'Protein Warhol'
panels = (
    {'palette': 'Spectral', 'restrictions': {'aa': 'PRO,ALA'}},
    {'palette': 'twilight_shifted', 'restrictions': {'aa': 'ALA'}},
    {'palette': 'inferno', 'restrictions': {'aa': 'PRO'}},
    {'palette': 'viridis_r'},
)
for panel in panels:
    georep.addProbability(geoX='N:CA:C:CB', geoY='N:CA:C:N+1', title='', **panel)

# Finally create the report with a file name of choice.
georep.printToHtml(title,2,'warhol')

# Disabled alternative report, kept as an inert string literal.
'''
title = 'Protein Halo'
georep.addProbability(geoX='N:O',geoY='CB:O',title='', palette='Spectral')
georep.addProbability(geoX='N:O',geoY='CB:O',title='', palette='twilight_shifted')
georep.addProbability(geoX='N:O',geoY='CB:O',title='', palette='inferno')
georep.addProbability(geoX='N:O',geoY='CB:O',title='', palette='nipy_spectral_r')
# And finally create the reort with a file name of choice
georep.printToHtml(title,2,'angel')
'''
예제 #17
0
def createBadDensitySlices(pdbSet, atomCe, atomLi, atomPl):
    """Export atom coordinates for every residue listed in the '.bad' files.

    For each PDB code in ``../../PdbLists/Pdbs_Evidenced.csv`` the file
    ``pdb<code>.bad`` of the given set is read when it exists. Every distinct
    (chain, residue number) found there is looked up in the original
    structure, and the coordinates of the three named atoms are written as
    one CSV row per residue to ``BadSlice_<pdbSet>_<code>_<atoms>.txt``.

    Args:
        pdbSet: Name of the set; selects the ``pdb_out`` sub-directory and is
            part of the output file name.
        atomCe: Name of the first atom whose coordinates are exported.
        atomLi: Name of the second atom.
        atomPl: Name of the third atom.
    """
    import os.path
    import matplotlib.pyplot as plt
    # Reset matplotlib's global figure state.
    plt.close('all')
    plt.clf()
    plt.cla()

    pdbOriginalPath = help.rootPath + '/ProteinDataFiles/pdb_data/'
    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_out/' + pdbSet + '/'
    edDataPath = help.rootPath + '/ProteinDataFiles/ccp4_data/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/SlicesF/'

    # This is a list of pdbs <= 1.1A non homologous to 90%
    pdbdata = pd.read_csv('../../PdbLists/Pdbs_Evidenced.csv')
    pdbListIn = pdbdata['PDB'].tolist()[0:]

    for pdb in pdbListIn:
        slicesList = []
        seen = set()  # O(1) duplicate check; slicesList keeps file order
        fileNameIn = (pdbDataPath + 'pdb' + pdb + '.bad').lower()
        print(fileNameIn)
        if os.path.isfile(fileNameIn):
            # The context manager closes the file even if reading fails.
            with open(fileNameIn, "r") as text_file:
                lines = text_file.read().split('\n')
            print(len(lines))
            for line in lines:
                # Fixed-width fields of a '.bad' line.
                atom = line[12:14].strip()
                aa = line[14:20].strip()
                chain = line[20:22].strip()
                rid = line[22:27].strip()
                print(pdb, atom, aa, chain, rid, line)
                if rid != '' and (chain, rid) not in seen:
                    seen.add((chain, rid))
                    slicesList.append([pdb, chain, rid])

        rows = []

        for sl in slicesList:
            print(sl)
            # NOTE(review): this report object is never used; it is kept in
            # case constructing it has side effects - confirm and remove.
            georep = psu.GeoReport([sl[0]],
                                   pdbOriginalPath,
                                   edDataPath,
                                   printPath,
                                   ed=False,
                                   dssp=False)
            pdbmanager = geopdb.GeoPdbs(pdbOriginalPath,
                                        edDataPath,
                                        ed=False,
                                        dssp=False)
            apdb = pdbmanager.getPdb(sl[0], True)
            pdbcsv = apdb.getDataFrame()

            residueFilter = 'rid==' + str(sl[2]) + ' and chain=="' + sl[1] + '"'
            dataC = pdbcsv.query(residueFilter + ' and atom=="' + atomCe + '"')
            dataL = pdbcsv.query(residueFilter + ' and atom=="' + atomLi + '"')
            dataP = pdbcsv.query(residueFilter + ' and atom=="' + atomPl + '"')

            # Residues lacking any of the three atoms are skipped.
            if len(dataC) > 0 and len(dataL) > 0 and len(dataP) > 0:
                coords = [
                    str(round(frame[axis].values[0], 3))
                    for frame in (dataC, dataL, dataP)
                    for axis in ('x', 'y', 'z')
                ]
                row = ",".join([sl[0], sl[1] + str(sl[2])] + coords)
                print(row)
                rows.append(row)

        if len(slicesList) > 0:
            bigstring = "".join(row + '\n' for row in rows)
            print("########RESULTS#########")
            print("")
            print(bigstring)

            tag = atomCe + atomLi + atomPl
            outName = (printPath + 'BadSlice_' + pdbSet + '_' + pdb + '_' + tag +
                       '.txt')
            with open(outName, "w") as f:
                f.write(bigstring)
예제 #18
0
    edSlicePath = 'F:/Code/ProteinDataFiles/ccp4_out/'
    #printPath = 'F:/Code/ProteinDataFiles/results_psu/Paper02/'

# Load the results table that density flight wrote into the slice directory.
edSlicePath += FileDir + "/"
resultsFile = edSlicePath + "_Results.csv"
print(resultsFile)
inputdata = pd.read_csv(resultsFile)
# Columns of _Results.csv:
#   PdbCode,Tag,
#   CentreX,CentreY,CentreZ,LinearX,LinearY,LinearZ,PlanarX,PlanarY,PlanarZ,
#   CentreV,LinearV,PlanarV,Angle,
#   BCentreX,BCentreY,BCentreZ,BLinearX,BLinearY,BLinearZ,BPlanarX,BPlanarY,BPlanarZ,
#   BCentreC,BLinearV,BPlanarV,BAngle

georep = psu.GeoReport([], pdbDataPath, edDataPath, edSlicePath,
                       ed=False, dssp=False)
#georepPrint = psu.GeoReport([],pdbDataPath,edDataPath,edSlicePath,ed=False,dssp=False)

# Column arrays used further down.
pdbs, tags, taus, btaus = (
    inputdata[column].values for column in ('PdbCode', 'Tag', 'Angle', 'BAngle')
)

# Accumulators, filled later.
origs, betters, radiants, brads = [], [], [], []

# Once the app has created the data we can load it
예제 #19
0
It runs correlation reports on proline, which it colours on the hue of CHI1 and CA-1:CA
These geometric measures are proxies for up/down pucker of the proline ring (up-pucker=-ve CHI1)
And cis-trans proline, where pre-omega means proline, which corresponds to short CA-1:CA
'''

# Input directories for the structure and density files, and the directory
# handed to GeoReport as its print path.
pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/1000Structures/'

# The pdb codes come from the 'pdb_code' column of structures09.csv, read
# relative to the current working directory.
# NOTE(review): the empty-list assignment is overwritten two lines later, and
# pdbList ends up as a pandas Series rather than a list.
pdbList = []
pdbdata = pd.read_csv('structures09.csv')
pdbList = pdbdata['pdb_code']

# Electron density and DSSP are both switched off for this report.
georep = geor.GeoReport(pdbList,
                        pdbDataPath,
                        edDataPath,
                        printPath,
                        ed=False,
                        dssp=False)

# Geometric measures to calculate and the extra columns requested alongside
# them for colouring the plots.
geoList = [
    'PHI', 'PSI', 'TAU', 'C-1:C', 'C-1:N:CA', 'CHI1', 'CA-1:CA', 'OMEGA',
    'CA:CA+1', 'C-1:N:CA:C'
]
hueList = ['aa', 'bfactor', 'rid', 'resolution', 'pdbCode']

# Frame used as the data source of the plots below.
data = georep.getGeoemtryCsv(geoList, hueList)

georep.addScatter(data=data,
                  geoX='PHI',
                  geoY='PSI',
                  hue='CHI1',
예제 #20
0
def makeSlicesHtml(setName, fileNameOrig,fileNameAdj, title,titleType,tag):
    '''
    Write an HTML report showing the density slices of one data set.

    For every row of <ccp4_out>/<setName>/_Results.csv the residues listed
    for the same pdb code in the adjusted values file are scanned. When a
    residue label (aa + chain + rid) occurs inside the slice tag, the value,
    radiant and magnitude slices are loaded and added to the report, each
    with the positions slice passed as YellowDots.

    setName      -- sub-directory of ccp4_out holding the slice csv files
    fileNameOrig -- csv in SlicesC with the original values
    fileNameAdj  -- csv in SlicesC with the adjusted values
    title        -- title passed to printToHtml
    titleType    -- first part of the output file name
    tag          -- last part of the output file name

    Returns nothing; the report is written through georep.printToHtml.
    '''
    import matplotlib.pyplot as plt
    # Drop any figure state left behind by a previous report.
    plt.close('all')
    plt.clf()
    plt.cla()

    outfileName = titleType + '_' + setName + "_" + tag

    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_data/'
    edDataPath = help.rootPath + '/ProteinDataFiles/ccp4_data/'
    edSlicePath = help.rootPath + '/ProteinDataFiles/ccp4_out/' + setName + "/"
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/SlicesH/'

    # Load the data that has been created by density flight.
    print(edSlicePath + "_Results.csv")
    inputdata = pd.read_csv(edSlicePath + "_Results.csv")

    valsPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/SlicesC/'
    print(valsPath)

    inputValsOrig = pd.read_csv(valsPath + fileNameOrig)
    inputValsAdj = pd.read_csv(valsPath + fileNameAdj)

    georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath, ed=False, dssp=False)

    # The per-slice tag gets its own name so it cannot be confused with the
    # `tag` argument used for the output file name.
    for pdb, sliceTag in zip(inputdata['PdbCode'].values, inputdata['Tag'].values):
        pdbQuery = "pdbCode == '" + pdb + "'"
        valsOrig = inputValsOrig.query(pdbQuery)['value'].values
        pdbInputValsAdj = inputValsAdj.query(pdbQuery)
        valsAdj = pdbInputValsAdj['value'].values

        residues = zip(pdbInputValsAdj['pdbCode'].values,
                       pdbInputValsAdj['aa'].values,
                       pdbInputValsAdj['chain'].values,
                       pdbInputValsAdj['rid'].values)

        for j, (vpdb, aa, chain, rid) in enumerate(residues):
            newTag = aa + chain + str(rid)

            print(vpdb,pdb,sliceTag,newTag)

            if pdb != vpdb or newTag not in sliceTag:
                continue

            slicePrefix = edSlicePath + pdb + sliceTag
            sliceOrigVal = georep.loadSlice(slicePrefix + "value_slice.csv")
            sliceOrigRad = georep.loadSlice(slicePrefix + "radiant_slice.csv")
            sliceOrigMag = georep.loadSlice(slicePrefix + "magnitude_slice.csv")
            # Colour map background:
            # https://stackoverflow.com/questions/16400241/how-to-redefine-a-color-for-a-specific-value-in-a-matplotlib-colormap/16401183#16401183
            sliceOrigPos = georep.loadSlice(slicePrefix + "poses_slice.csv")

            # The title value comes from the separately created outliers
            # files: the original file for '_O' tags, the adjusted one
            # otherwise.
            # NOTE(review): valsOrig is indexed with the row position taken
            # from the adjusted file, which assumes both files list the same
            # residues in the same order for this pdb - confirm.
            titleVals = valsOrig if '_O' in sliceTag else valsAdj
            imtitle = pdb + ' ' + sliceTag + ' value=' + str(round(titleVals[j], 3))

            georep.addSlice(sliceOrigVal, palette='cubehelix_r', title=imtitle, YellowDots=sliceOrigPos,Contour=True)
            georep.addSlice(sliceOrigRad, palette='bone',title=pdb + ' ' + sliceTag + ' radiant',Contour=False,YellowDots=sliceOrigPos)
            georep.addSlice(sliceOrigMag, palette='bone', title=pdb + ' ' + sliceTag + ' magnitude', Contour=True,YellowDots=sliceOrigPos)

    georep.printToHtml(title,6,outfileName)
예제 #21
0
'''
This script looks at electron density correlations
'''


# Input directories for the structure and density files, and the directory
# handed to GeoReport as its print path.
pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/Paper01/'

pdbList = ['2bw4','5nqo','1ejg','6q53']

# Measures to calculate and the extra columns requested with them; '2FoFc'
# and 'dssp' are used as plot axes/hues below.
geoList = ['PHI','PSI','TAU']
hueList = ['aa','bfactor','rid','resolution','pdbCode','2FoFc','dssp']


# georep2 collects the plots for all structures; inside the loop a separate
# single-structure georep only supplies the data frame.
georep2 = psu.GeoReport(pdbList, pdbDataPath, edDataPath, printPath)

for pdb in pdbList:
    georep = psu.GeoReport([pdb], pdbDataPath, edDataPath, printPath)
    #georep.printReport('DataPerPdb', 'Results9_data')
    #georep.printReport('Slow_DensityPeaksPerPdb', 'Results9_density')


    data = georep.getGeoemtryCsv(geoList, hueList)

    # Four scatters per structure: 'ridx' against '2FoFc', coloured in turn by
    # tau, phi, psi (each with a fixed colour range) and dssp.
    georep2.addScatter(data=data, geoX='ridx', geoY='2FoFc', hue='TAU', title=pdb + ' Backbone tau', palette='jet',sort='NON', vmin=106, vmax=116)
    georep2.addScatter(data=data, geoX='ridx', geoY='2FoFc', hue='PHI', title=pdb + ' Backbone phi', palette='jet',sort='NON', vmin=-170, vmax=170)
    georep2.addScatter(data=data, geoX='ridx', geoY='2FoFc', hue='PSI', title=pdb + ' Backbone psi', palette='jet',sort='NON', vmin=-170, vmax=170)
    georep2.addScatter(data=data, geoX='ridx', geoY='2FoFc', hue='dssp', title=pdb + ' Backbone dssp', palette='jet_r', sort='NON')

예제 #22
0
# Build a per-residue key (pdb code + chain + residue id) and use it to join
# the contacts table onto the merged geometry data.
# NOTE(review): the concatenation assumes 'rid' already holds strings - confirm.
mergedDataSet['CID'] = mergedDataSet['pdbCode'] + mergedDataSet[
    'chain'] + mergedDataSet['rid']
mergedDataSet = mergedDataSet.set_index('CID').join(
    ccContacts.set_index('CID'))
# NOTE(review): CID is the index at this point, so index=False leaves it out
# of the saved file - confirm that is intended.
mergedDataSet.to_csv(help.loadPath + "MergedWithContacts.csv", index=False)

# Rows with any missing value are dropped after the csv has been written, so
# the saved file still contains them.
mergedDataSet = mergedDataSet.dropna()
#qu = "dssp == 'E' or dssp == 'B' or dssp == '-' "
#qu = qu + "or dssp == 'T' or dssp == 'S' or dssp == 'H' or dssp == 'G' or dssp == 'I')"
#mergedDataSet = mergedDataSet.query(qu)

# Create a report based on contacts. No pdb list or data directories are
# given: every plot takes its data from mergedDataSet.
georep = psu.GeoReport([],
                       "",
                       "",
                       help.printPath,
                       ed=False,
                       dssp=False,
                       includePdbs=False,
                       keepDisordered=False)

print('### Creating reports ###')
# Original N:CA value against the 'Contacts' column, coloured by dssp.
georep.addScatter(data=mergedDataSet,
                  geoX='N:CA_Orig',
                  geoY='Contacts',
                  title='N:CA Original',
                  hue='dssp',
                  categorical=True,
                  sort='NON',
                  palette='jet_r')
georep.addScatter(data=mergedDataSet,
                  geoX='N:CA_Diff',
예제 #23
0
def maximaCompareFake(pdbSet, pdbList, tag, reduce):
    '''
    Write the HTML report of maxima differences for the fake density data.

    The frames come from help.getMaximaDiffs(pdbSet, pdbList, True); only
    the third one (the fake data) is plotted.

    pdbSet  -- name of the data set, used in paths, title and file name
    pdbList -- pdb codes; each gets its own group of plots
    tag     -- suffix of the output file name
    reduce  -- when true, keep only rows with Difference <= 0.05

    Returns nothing; the report is written through georep.printToHtml.
    '''
    print('Plotting fake maxima differences', pdbSet)

    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_out/' + pdbSet
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataM/'

    # Four frames are returned; the real and "bad" ones are not used here.
    _, _, fakeCsv, _ = help.getMaximaDiffs(pdbSet, pdbList, True)

    if reduce:
        # Earlier experiments cut on BGridDistance bands instead.
        fakeCsv = fakeCsv.query('Difference <= 0.05')

    print(fakeCsv)

    fakeCol = 'jet_r'  # alternative tried: 'GnBu'

    georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath,
                           ed=False, dssp=False, includePdbs=False,
                           keepDisordered=False)

    georep.addHistogram(data=fakeCsv,
                        geoX='Difference',
                        title='Fake PDB structures, Occ=1, BFact=2',
                        count=True,
                        hue='pdbCode',
                        palette='LightSeaGreen')

    # Whole-set scatters: (x column, y column, hue column, categorical, palette)
    overviewScatters = (
        ('Difference', 'pdbCode', 'Width', False, fakeCol),
        ('Difference', 'Width', 'pdbCode', True, 'tab20'),
        ('Difference', 'GridDistance', 'BGridDistance', False, fakeCol),
        ('Difference', 'BGridDistance', 'GridDistance', False, fakeCol),
        ('GridDistance', 'BGridDistance', 'Difference', False, fakeCol),
    )
    for xCol, yCol, hueCol, isCategorical, colours in overviewScatters:
        georep.addScatter(data=fakeCsv, geoX=xCol, geoY=yCol, hue=hueCol,
                          categorical=isCategorical, palette=colours,
                          sort='RANDOM', title='')

    # Whole-set probability plots over the same column pairs.
    probabilityPairs = (
        ('Difference', 'GridDistance'),
        ('Difference', 'BGridDistance'),
        ('GridDistance', 'BGridDistance'),
    )
    for xCol, yCol in probabilityPairs:
        georep.addProbability(data=fakeCsv, geoX=xCol, geoY=yCol,
                              palette='cubehelix_r')

    # Per-structure plots: two histograms of Difference and one scatter.
    for pdb in pdbList:
        print(pdb)
        for hueCol in ('AtomNo', 'Reason'):
            georep.addHistogram(data=fakeCsv,
                                geoX='Difference',
                                title='Fake Density, Occ=1, BFact=10',
                                hue=hueCol,
                                count=True,
                                restrictions={'pdbCode': pdb},
                                palette='LightSeaGreen')
        georep.addScatter(data=fakeCsv, geoX='AtomNo', geoY='Difference',
                          hue='AtomType', categorical=True,
                          restrictions={'pdbCode': pdb}, palette='tab20',
                          sort='RANDOM', title='')

    reportTitle = 'Maxima differences, set=' + pdbSet
    georep.printToHtml(reportTitle, 3, 'Maxima_' + pdbSet + tag)
# Extra columns requested with the geometry, and the amino acids reported on.
hueList = ['dssp', 'aa', 'rid', 'bfactor']
aas = ['GLY']
###################################################################################
# Default (Linux) locations; replaced by the F: drive paths when the
# myWindowsLaptop flag is set.
pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/Paper02/'
if myWindowsLaptop:
    pdbDataPath = 'F:/Code/ProteinDataFiles/pdb_data/'
    edDataPath = 'F:/Code/ProteinDataFiles/ccp4_data/'
    printPath = 'F:/Code/ProteinDataFiles/results_psu/Paper02/'

###########################################################################################
# pdbList1000, keepDisordered, geoList and bfactorFactor are defined earlier
# in the script, outside this fragment.
georep = psu.GeoReport(pdbList1000,
                       pdbDataPath,
                       edDataPath,
                       printPath,
                       ed=False,
                       dssp=False,
                       keepDisordered=keepDisordered,
                       includePdbs=False)
data = georep.getGeoemtryCsv(geoList, hueList, bfactorFactor)
#data = data.query('TAU > 100')
#data = data.query('TAU < 125')
# Subset with PSI strictly between -50 and 50, built in two steps.
dataPsiRange = data.query('PSI > -50')
dataPsiRange = dataPsiRange.query('PSI < 50')

for aa in aas:
    sql = 'aa == "' + aa + '"'
    dataaa = data.query(sql)
    dataPsiRangeaa = dataPsiRange.query(sql)

    georep.addScatter(data=dataaa,
예제 #25
0
def compareAtomsPdbAdjusted(dataCombined, geos, pdbSet, tag):
    '''
    Write an HTML report comparing original and adjusted atom geometry.

    For every geo the column <geo>_Diff (= <geo>_Orig - <geo>_Adj) is
    written into dataCombined itself, then histograms, hex-bin counts and
    scatters of the _Orig/_Adj/_Diff columns are added to the report.

    dataCombined -- frame with <geo>_Orig, <geo>_Adj, 'RES', 'SOFTWARE' and
                    'pdbCode' columns; modified in place
    geos         -- geometric measures to compare
    pdbSet       -- data set name, used in the path, title and file name
    tag          -- suffix of the output file name

    Returns nothing; the report is written through georep.printToHtml.
    '''
    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_out/' + pdbSet
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataM/'

    georep = psu.GeoReport([], pdbDataPath, edDataPath, printPath,
                           ed=False, dssp=False, includePdbs=False,
                           keepDisordered=False)

    # Original-vs-adjusted scatters: (hue, palette, sort, categorical, title stem)
    positionScatters = (
        ('SOFTWARE', 'jet_r', 'RANDOM', True, 'Software and atom positions '),
        ('RES', 'viridis_r', 'DESC', False, 'Resolution and atom positions '),
    )

    for geo in geos:
        origCol = geo + '_Orig'
        adjCol = geo + '_Adj'
        diffCol = geo + '_Diff'
        countTitle = 'Count ' + geo

        # The difference column is stored on the caller's frame.
        dataCombined[diffCol] = dataCombined[origCol] - dataCombined[adjCol]

        for column, heading in ((origCol, 'Pdb Atoms'),
                                (adjCol, 'Adjusted Atoms')):
            georep.addHistogram(data=dataCombined, geoX=column, title=heading,
                                count=True, hue='pdbCode')

        # Difference against resolution, as counts.
        georep.addHexBins(data=dataCombined, geoX=diffCol, geoY='RES',
                          title=countTitle, hue='count',
                          palette='cubehelix_r')

        for hueCol, colours, ordering, isCategorical, stem in positionScatters:
            georep.addScatter(data=dataCombined, geoX=origCol, geoY=adjCol,
                              hue=hueCol, palette=colours, sort=ordering,
                              categorical=isCategorical, title=stem + geo)

        # Original against adjusted, as counts.
        georep.addHexBins(data=dataCombined, geoX=origCol, geoY=adjCol,
                          title=countTitle, hue='count',
                          palette='cubehelix_r')

    reportTitle = 'Comparing atom positions: PDB vs maxima, set=' + pdbSet
    georep.printToHtml(reportTitle, 3, 'Compare_' + pdbSet + tag)
예제 #26
0
def getCsv(pdbSet,
           pdbListIn,
           geos,
           badAtoms,
           reloadPdb,
           reloadCsv,
           aa='ALL',
           includeCis=False,
           allAtoms=False,
           bFactorFactor=1.3,
           cutoff=0):
    '''
    Return the geometry data frame for one pdb set.

    With reloadCsv the frame is recalculated from the .ent files of the
    set; otherwise it is read from Data_DefensibleWithGeosALL_<pdbSet>.csv.

    pdbSet        -- data set name; 'PDB' reads from pdb_data, anything else
                     from pdb_out/<pdbSet>/
    pdbListIn     -- pdb codes to include (only used when reloadCsv is set)
    geos          -- geometric measures to calculate; not modified
    badAtoms      -- passed through to geopdb.GeoPdbs
    reloadPdb     -- clear the GeoPdbs manager and create it again
    reloadCsv     -- recalculate instead of loading the saved csv
    aa            -- restrict the result to one amino acid, or 'ALL'
    includeCis    -- also use 'CA-1:C-1:N:CA' and relabel 'aa' via applyCis
    allAtoms      -- passed to GeoReport (keepDisordered) and getGeoemtryCsv
    bFactorFactor -- passed to getGeoemtryCsv
    cutoff        -- currently unused

    Returns the resulting pandas DataFrame.
    '''
    import os.path

    print('Getting CSV for', pdbSet)
    pdbDataPath = rootPath + '/ProteinDataFiles/pdb_out/' + pdbSet + '/'
    if pdbSet == 'PDB':
        pdbDataPath = rootPath + '/ProteinDataFiles/pdb_data/'

    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    loadPath = rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataB/'
    printPath = rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataK/'

    fileName = 'Data_DefensibleWithGeosALL_' + pdbSet + '.csv'
    cisGeo = 'CA-1:C-1:N:CA'

    if reloadCsv:
        from PsuGeometry import GeoPdb as geopdb
        pdbmanager = geopdb.GeoPdbs(pdbDataPath, edDataPath, False, False,
                                    False, badAtoms)
        if reloadPdb:
            pdbmanager.clear()
            pdbmanager = geopdb.GeoPdbs(pdbDataPath, edDataPath, False, False,
                                        False, badAtoms)

        # Keep only the structures whose file exists. Only the pdb code is
        # lower-cased; the directory part keeps its case so the check also
        # works on case-sensitive file systems.
        pdbList = []
        for pdb in pdbListIn:
            pdbCode = pdb.lower()
            if os.path.isfile(pdbDataPath + 'pdb' + pdbCode + '.ent'):
                pdbList.append(pdbCode)
            else:
                print('No file:', pdbDataPath, pdb)

        pdbList.sort()

        hueList = [
            'aa', 'rid', 'bfactor', 'pdbCode', 'bfactorRatio', 'disordered'
        ]
        georep = psu.GeoReport(pdbList,
                               pdbDataPath,
                               edDataPath,
                               printPath,
                               ed=False,
                               dssp=False,
                               includePdbs=False,
                               keepDisordered=allAtoms)

        # Work on a copy so the caller's list does not grow by one entry on
        # every call made with includeCis.
        geoList = list(geos)
        if includeCis and cisGeo not in geoList:
            geoList.append(cisGeo)

        print('geoList', geoList)
        dataBest = georep.getGeoemtryCsv(geoList,
                                         hueList,
                                         bFactorFactor,
                                         allAtoms=allAtoms,
                                         restrictedAa=aa)
        try:
            dataBest['rid'] = dataBest['rid'].astype(str)
            dataBest['ID'] = dataBest['pdbCode'] + dataBest[
                'chain'] + dataBest['rid'] + dataBest['aa']
        except Exception:
            # Best effort: a result without these columns is reported and
            # returned as it is. KeyboardInterrupt and SystemExit are no
            # longer swallowed here.
            print('empty csv')
    else:
        dataBest = pd.read_csv(loadPath + fileName)

    if includeCis:
        dataBest['aa'] = dataBest.apply(
            lambda row: applyCis(row['aa'], row[cisGeo]), axis=1)

    if aa != 'ALL':
        dataBest = dataBest.query('aa == "' + aa + '"')

    return dataBest
예제 #27
0
# DSSP is requested by default and switched off together with the path
# override when the myWindowsLaptop flag is set.
includeDSSP = True
###################################################################################
pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/Paper02/'
if myWindowsLaptop:
    pdbDataPath = 'F:/Code/ProteinDataFiles/pdb_data/'
    edDataPath = 'F:/Code/ProteinDataFiles/ccp4_data/'
    printPath = 'F:/Code/ProteinDataFiles/results_psu/Paper02/'
    includeDSSP = False  # on my windows computer

###########################################################################################
# pdbList1000, geoList and hueList are defined earlier in the script, outside
# this fragment.
georep = psu.GeoReport(pdbList1000,
                       pdbDataPath,
                       edDataPath,
                       printPath,
                       ed=False,
                       dssp=includeDSSP,
                       includePdbs=False)
data = georep.getGeoemtryCsv(geoList, hueList)
#datacorr = data.corr()
#sns.heatmap(datacorr, annot=True, cmap="vlag", vmin=-1, vmax=1)
#plt.show()

# Clean the data: remove the identifier columns one at a time. Each drop
# raises KeyError if its column is missing from the frame.
data = data.drop('pdbCode', axis=1)
data = data.drop('chain', axis=1)
data = data.drop('rid', axis=1)
data = data.drop('aa', axis=1)
data = data.drop('ridx', axis=1)
data = data.drop('atomNo', axis=1)
예제 #28
0
# Each geoLists entry is [set name, list of geometric measures].
geoLists.append(['5HB', ['N:O-2','C:O-2','N:CA:C:O-2','N:CA:N+1:O-2']])
# Hydrogen bond distances and dihedrals nearest O
geoLists.append(['6HBO', ['N:{O}','C:{O}','N:CA:C:{O}','N:CA:N+1:{O}']])
# Water
geoLists.append(['7WAT', ['N:HOH','C:HOH','N:CA:C:HOH','N:CA:N+1:HOH']])
# Other!
geoLists.append(['8XTRA', ['N:HETATM']])


hueList = ['aa', 'rid', 'bfactor','pdbCode','bfactorRatio','disordered','dssp']
aas = ['ALL']

# One csv per (set, amino acid) pair, written as CsvGeos_Set<set><aa>.csv in
# printPath.
print('Creating CSV files anew')
for geoListT in geoLists:
    # Named setName rather than `set` so the builtin set type is not
    # replaced for the rest of the module.
    setName = geoListT[0]
    geoList = geoListT[1]
    for aa in aas:
        tag = 'Set' + setName + aa
        georep = psu.GeoReport(pdbList, pdbDataPath, edDataPath, printPath, ed=False, dssp=includeDSSP, includePdbs=False,keepDisordered=True)
        print('Create unrestricted csv', geoList)
        dataUnrestricted = georep.getGeoemtryCsv(geoList, hueList, -1,allAtoms=True,restrictedAa=aa)
        dataUnrestricted.to_csv(printPath + 'CsvGeos_' + tag + '.csv', index=False)


# Report the elapsed time since startx (set earlier in the script) as
# whole minutes and seconds.
print('----------Finished----------')
endx = time.time()
time_diff = endx - startx
timestring = str(int(time_diff / 60)) + "m " + str(int(time_diff % 60)) + "s"
print(timestring)

예제 #29
0
def EHCompare(pdbSet):
    '''
    Write an HTML report comparing a data set with the Engh & Huber values.

    For every geometric measure four histograms are added, taken from
    Data_DefensibleWithGeosALL_<pdbSet>.csv:
      ALL -- residues that are neither GLY nor PRO
      GLY -- glycine
      PRO -- proline with |CA-1:C-1:N:CA| >= 120
      CIS -- proline with |CA-1:C-1:N:CA| < 120
    Each title quotes the mean and SD that Data_EH.csv lists for that group
    in the '1991' and '2001' EH_SET rows.

    pdbSet -- data set name, used in paths, the csv name and the report name

    Returns nothing; the report is written through printToHtml.
    '''
    import matplotlib.pyplot as plt
    # Drop any figure state left behind by a previous report.
    plt.close('all')
    plt.clf()
    plt.cla()

    pdbDataPath = help.rootPath + '/ProteinDataFiles/pdb_out/' + pdbSet + '/'
    edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
    loadPathEH = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/Data/'
    loadPathCsv = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataB/'
    printPath = help.rootPath + '/BbkProject/PhDThesis/0.Papers/3.DefensibleGeometry/EvidencedSet/DataC/'

    # Data_EH.csv columns, as listed by the original author:
    # EH_SET,aa,N:CA,N:CA_SD,CA:C,CA:C_SD,C:O,C:O_SD,C:N+1,C:N+1_SD,N:CA:C,N:CA:C_SD,CA:C:N+1,CA:C:N+1_SD,CA:C:O,CA:C:O_SD,O:C:N+1,O:C:N+1_SD,C-1:N:CA,C-1:N:CA_SD
    EHFileName = 'Data_EH.csv'
    BestFileName = 'Data_DefensibleWithGeosALL_' + pdbSet + '.csv'

    dataEH = pd.read_csv(loadPathEH + EHFileName)
    dataBest = pd.read_csv(loadPathCsv + BestFileName)

    geos = [
        'N:CA', 'CA:C', 'C:O', 'C:N+1', 'TAU', 'CA:C:N+1', 'CA:C:O', 'O:C:N+1',
        'C-1:N:CA'
    ]
    compareSets = ['1991', '2001']
    groupLabels = ['ALL', 'GLY', 'PRO', 'CIS']

    # Specifically looking at the mean and sd of the parameters in
    # comparison to EH.
    print(dataEH)

    # NOTE(review): georepAA was only used by a per-amino-acid report that
    # is disabled; it is still created in case GeoReport construction has
    # side effects - confirm and remove.
    georepAA = psu.GeoReport([],
                             pdbDataPath,
                             edDataPath,
                             printPath,
                             ed=False,
                             dssp=False,
                             includePdbs=False,
                             keepDisordered=False)
    georepSummary = psu.GeoReport([],
                                  pdbDataPath,
                                  edDataPath,
                                  printPath,
                                  ed=False,
                                  dssp=False,
                                  includePdbs=False,
                                  keepDisordered=False)

    # The E&H reference rows per group do not depend on the geo, so they are
    # selected once. They are only read in this function.
    ehRowsByGroup = [(label, dataEH.query("aa ==  '" + label + "'"))
                     for label in groupLabels]

    for geo in geos:
        # The cuts are rebuilt for every geo so each histogram receives a
        # frame of its own, as before.
        bestALLCut = dataBest.query("aa !=  'GLY'")
        bestALLCut = bestALLCut.query("aa !=  'PRO'")
        bestGLYCut = dataBest.query("aa ==  'GLY'")
        # Explicit copy: a column is added below, and assigning into a bare
        # query() result is chained assignment (SettingWithCopyWarning).
        bestPROCut = dataBest.query("aa ==  'PRO'").copy()
        print(pdbSet, bestPROCut)
        bestPROCut['ABSOMEGA'] = abs(bestPROCut['CA-1:C-1:N:CA'])
        bestPROCis = bestPROCut.query("ABSOMEGA < 120")
        bestPROTrans = bestPROCut.query("ABSOMEGA >= 120")

        # One title per group, one line per E&H set.
        titles = {label: '' for label in groupLabels}
        for comp in compareSets:
            for label, ehRows in ehRowsByGroup:
                ehSet = ehRows.query("EH_SET ==  '" + comp + "'")
                mean = round(ehSet[geo].values[0], 3)
                sd = ehSet[geo + '_SD'].values[0]
                titles[label] += label + ' ' + geo + ' ' + comp + ' Mean=' + str(
                    mean) + ' (' + str(sd) + ')\n'
            print(titles['GLY'], titles['PRO'], titles['ALL'], titles['CIS'])

        # 'N:CA:C' is plotted from the 'TAU' column of the data set.
        histCol = 'TAU' if geo == 'N:CA:C' else geo
        groupCuts = [('ALL', bestALLCut), ('GLY', bestGLYCut),
                     ('PRO', bestPROTrans), ('CIS', bestPROCis)]
        for label, cut in groupCuts:
            georepSummary.addHistogram(data=cut,
                                       geoX=histCol,
                                       title=titles[label])

    georepSummary.printToHtml(
        'Best Supported Engh&Huber Compare, set=' + pdbSet, 4,
        'Defensible_EH_' + pdbSet)
# -- ©Rachel Alcraft 2020, PsuGeometry --
from PsuGeometry import GeoReport as geor
'''
This script runs a correlation report on a few structures to demonstrate the use of 2Fo-Fc as a hue
The data is precalculated into a dataframe
'''

# Input directories for the structure and density files, and the directory
# handed to GeoReport as its print path.
pdbDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/pdb_data/'
edDataPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/ccp4_data/'
printPath = '/home/rachel/Documents/Bioinformatics/ProteinDataFiles/results_psu/Levels/'

pdbList = ['1ejg', '1us0', '1tt8', '1i1w', '1ucs', '6jvv', '5nqo']

# Unlike the other reports in this file, electron density and DSSP are both
# switched on; hueList below asks for '2FoFc' and 'dssp'.
georep = geor.GeoReport(pdbList,
                        pdbDataPath,
                        edDataPath,
                        printPath,
                        ed=True,
                        dssp=True)

# Geometric measures to calculate and the extra columns requested alongside
# them for colouring the plots.
geoList = [
    'PHI', 'PSI', 'TAU', 'C-1:C', 'C-1:N:CA', 'CHI1', 'CA-1:CA', 'OMEGA',
    'CA:CA+1', 'C-1:N:CA:C'
]
hueList = ['aa', 'bfactor', 'rid', 'resolution', 'pdbCode', '2FoFc', 'dssp']

# Frame used as the data source of the plots below.
data = georep.getGeoemtryCsv(geoList, hueList)

georep.addScatter(data=data,
                  geoX='PHI',
                  geoY='PSI',
                  hue='2FoFc',