def main():

    d_MV = {}

    path = '/data/mmCIF'
    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:
        if dn == 'mmCIF.py':
            continue
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        l_fn = os.listdir('%s/%s' % (path, dn))
        for fn in l_fn:
            pdb = fn[:4]
            ##            if pdb.upper() not in s_pdbs:
            ##                continue
            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks={'_exptl.method': 'SOLUTION NMR'},
                l_data_categories=[
                    '_cell',
                    '_entity',
                    '_exptl',
                    '_exptl_crystal',
                    '_entity_poly',
                    '_symmetry',
                    ## virus
                    '_pdbx_struct_assembly',
                    ## split structure
                    '_pdbx_database_related',
                ],
            )

            ## x-ray structure
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            ## polymer present
            if not '_entity_poly.type' in d_mmCIF.keys():
                continue

            ## only polymer present is protein
            if d_mmCIF['_entity_poly.type'] != len(
                    d_mmCIF['_entity_poly.type']) * ['polypeptide(L)']:
                continue

            if not '_pdbx_struct_assembly.oligomeric_count' in d_mmCIF.keys():
                continue

            if d_mmCIF['_pdbx_struct_assembly.oligomeric_count'] == len(
                    d_mmCIF['_pdbx_struct_assembly.oligomeric_count']) * ['?']:
                continue

            ## virus
            if int(d_mmCIF['_pdbx_struct_assembly.oligomeric_count']
                   [0]) % 60 == 0:
                continue

            ## not monomer
            if d_mmCIF['_pdbx_struct_assembly.oligomeric_count'] != len(
                    d_mmCIF['_pdbx_struct_assembly.oligomeric_count']) * ['1']:
                continue

            ## split structure
            if '_pdbx_database_related' in d_mmCIF.keys():
                if 'split' in d_mmCIF['_pdbx_database_related']:
                    continue
                if 'SPLIT' in d_mmCIF['_pdbx_database_related']:
                    print pdb
                    stop

            if not '_cell.Z_PDB' in d_mmCIF.keys():
                continue

            if pdb in [
                    ## treshold
                    '1e54',
                    '1e9i',
                    ## difference between calculated MV and MV in mmCIF
                    '3eiq',
                    ## The crystals diffracted to 1.7Angstrom and appeared to be I centered tetragonal with
                    ## unit cell dimension a=198.42Angstrom and c=396.6Angstrom, however the data only merged successfully in P1
                    ## unit cell a=196.61 b=196.48 c=240.63 alpha=65.91 beta=65.91 gamma=90.01.
                    ## Toscana has published with Hellinga...
                    '2cjf',
                    '2bt4',
            ]:
                continue

##            if not ''.join(d_mmCIF['_symmetry.space_group_name_H-M']) in [
##                'P 1','P 43 21 2','P 21 3','P 42 3 2','C 1 2 1','F 2 3','P 64 2 2','H 3',
##                ]:
##                continue ## tmp!!!

            a = float(d_mmCIF['_cell.length_a'][0])
            b = float(d_mmCIF['_cell.length_b'][0])
            c = float(d_mmCIF['_cell.length_c'][0])
            alpha = float(d_mmCIF['_cell.angle_alpha'][0])
            beta = float(d_mmCIF['_cell.angle_beta'][0])
            gamma = float(d_mmCIF['_cell.angle_gamma'][0])
            Z = int(d_mmCIF['_cell.Z_PDB'][0])
            mw = 0
            for i in range(len(d_mmCIF['_entity.id'])):
                ##                if d_mmCIF['_entity.type'][i] == 'polymer':
                s = d_mmCIF['_entity.formula_weight'][i]
                ## unknown ligand
                if s == '?':
                    continue
                mw += float(s)

            MV = matthews_coefficient.main(a, b, c, alpha, beta, gamma, mw, Z)

            spacegroup = ''.join(d_mmCIF['_symmetry.space_group_name_H-M'])

            if spacegroup not in [
                    'F 4 3 2',
                    'F 41 3 2',
                    'I 41 3 2',
            ]:
                continue  ## tmp!!!

            if MV > 10:
                print pdb
                print 'mw', mw
                print 'MV', MV, d_mmCIF['_exptl_crystal.density_Matthews']
                print 'Z', Z
                import math
                alpha *= math.pi / 180.
                beta *= math.pi / 180.
                gamma *= math.pi / 180.
                V = a * b * c * math.sqrt(
                    1 - math.cos(alpha)**2 - math.cos(beta)**2 -
                    math.cos(gamma)**2 + 2 *
                    (math.cos(alpha) * math.cos(beta) * math.cos(gamma)))
                print 'V', V
                continue
                stop_treshold
                stop
            if '_exptl_crystal.density_Matthews' in d_mmCIF.keys():
                if d_mmCIF['_exptl_crystal.density_Matthews'] not in [
                    ['?'],
                        len(d_mmCIF['_exptl_crystal.density_Matthews']) *
                    ['?'],
                ]:
                    if abs(MV -
                           float(d_mmCIF['_exptl_crystal.density_Matthews'][0])
                           ) > 1:
                        print 'MV', MV
                        print 'MV', d_mmCIF['_exptl_crystal.density_Matthews']
                        print 'mw', mw
                        print 'Z', Z
                        continue
                        stop_difference

            if not spacegroup in d_MV.keys():
                d_MV[spacegroup] = []
            d_MV[spacegroup] += [MV]

            print pdb, round(MV, 2), spacegroup


##    fd = open('MV_v_spacegroup.txt','w')
##    fd.write(str(d_MV))
##    fd.close()

    l = ['# MV_average MV_stddev n spacegroup\n']
    for spacegroup in d_MV.keys():
        l_MV = d_MV[spacegroup]
        if len(l_MV) <= 1:
            continue
        average, stddev = statistics.do_stddev(l_MV)
        average, stderr = statistics.do_stderr(l_MV)
        ##        l += ['%s %s %s %s\n' %(average,stddev,len(l_MV),spacegroup,)]
        l += ['%s %s %s %s\n' % (
            average,
            stderr,
            len(l_MV),
            spacegroup,
        )]

    fd = open('MV_v_spacegroup.txt', 'w')
    fd.writelines(l)
    fd.close()

    return
# Example #2
# 0
def main():
    """Bin RMSD values by distance from the mutation and write gnuplot data.

    Reads 'sphere/out_<suffix>_allresidues.txt', skips lines whose first or
    second field (upper-cased) is in the module-level s_exclude, and stores
    the RMSD columns of each measure (CA, heavy, phipsi, chi1) and each
    sphere radius in l_radii under the integer distance taken from field 8.

    Output:
    - 'histogram.gnuplotdata': number of CA RMSDs at radius 10 per distance,
      plotted with plot_histogram.
    - '<key>_<radius>.gnuplotdata': distance, and the two values returned by
      statistics.do_stderr (average, stderr), for distances with at least
      1000 RMSDs; plotted with plot_scatter and plot_scatter_combined.

    The per-measure histogram section is currently disabled by an
    unconditional break at the top of its loop.

    Relies on the module-level names s_exclude, statistics, plot_histogram,
    plot_scatter and plot_scatter_combined.
    """

    ## gnuplot label fragments per measure (octal escapes in the strings)
    d_units = {'CA':'{\305}','heavy':'\305','phipsi':'/Symbol \260','chi1':'/Symbol \260'} ## octals
    d_octals = {'CA':'C_{/Symbol a}','heavy':'heavy atom','phipsi':'{/Symbol f}{/Symbol y}','chi1':'{/Symbol c}_1'} ## octals
    l_radii = [10,20,40,]
    ## index of the first field of each measure in an input line; the fields
    ## for the successive radii follow it (see l[col+i_radii] below)
    d_columns = {'CA':9,'heavy':13,'phipsi':17,'chi1':21,}
    l_distances = [0,10,20,40,]
##    suffix = 'sameSG_sameauth'
    suffix = 'sameSG'

    ## d_rmsds[radius][key][dist_from_mut] -> list of RMSDs.
    ## Only distances 0..83 are created here, so a larger integer distance
    ## in the input raises KeyError further down.
    d_rmsds = {}
    for radius in l_radii:
        d_rmsds[radius] = {}
        for key in d_columns.keys():
            d_rmsds[radius][key] = {}
            for dist_from_mut in range(0,83+1):
                d_rmsds[radius][key][dist_from_mut] = []

    ## the function sphere in quakes.py wrote this file when using the -singlemutants flag
    fn = 'sphere/out_%s_allresidues.txt' %(suffix)
    fd = open(fn,'r')
    lines = fd.readlines()
    fd.close()
    print 'read', fn

    print 'looping over lines'
    for line in lines:
        l = line.split()
        ## exclude HEWL and T4L
        if l[0].upper() in s_exclude or l[1].upper() in s_exclude:
            continue
        ## truncate the distance to an integer bin
        dist_from_mut = int(float(l[8]))
        for key, col in d_columns.items():
            for i_radii in range(len(l_radii)):
                radius = l_radii[i_radii]
                rmsd = l[col+i_radii]
                ## the literal string 'None' marks a missing value
                if rmsd == 'None':
                    continue
                else:
                    d_rmsds[radius][key][dist_from_mut] += [float(rmsd)]
    print 'looped over lines'


    ##
    ## histogram
    ##
    ## number of CA RMSDs (radius 10) per distance from the mutation
    lines = []
    for dist_from_mut in d_rmsds[10]['CA'].keys():
        lines += ['%s %s\n' %(dist_from_mut, len(d_rmsds[10]['CA'][dist_from_mut]))]
    fd = open('histogram.gnuplotdata','w')
    fd.writelines(lines)
    fd.close()
##    plot_histogram('histogram',suffix,'distance (\305) from mutation','count of RMSDs',)
    plot_histogram('histogram',suffix,'distance ({\305}) from mutation','count of RMSDs',)


    ##
    ## histograms
    ##
    ## divide 1Angstrom into smaller units/tics
##    d_divs = {'CA':1000.,'heavy':500.,'phipsi':50.,'chi1':20.}
    d_divs = {'CA':500.,'heavy':500.,'phipsi':50.,'chi1':20.}
    for key in d_columns.keys():

        ## the unconditional break below leaves the loop on the first
        ## iteration, so the rest of this loop body is never executed
        print 'temporarily dont plot histograms'
        break

        xlabel = '%s RMSD (%s)' %(d_octals[key],d_units[key])
        div = d_divs[key]
        l_plot_files = []
        for radius in l_radii:

            d_l_rmsds_histogram = {'all':[],}
            for dist in l_distances:
                d_l_rmsds_histogram[dist] = []

##            l_rmsds = []
            ## organize data
            ## each distance goes into the first l_distances bin that is >= it
            ## (distances above the largest bin only go into 'all')
            for dist_from_mut in d_rmsds[radius][key].keys():
                for dist in l_distances:
                    if dist_from_mut <= dist:
                        d_l_rmsds_histogram[dist] += d_rmsds[radius][key][dist_from_mut]
                        break
##                l_rmsds += d_rmsds[radius][key][dist_from_mut]
                d_l_rmsds_histogram['all'] += d_rmsds[radius][key][dist_from_mut]

            ## count (yvalues)
            ## d_count[dist][int(div*rmsd)] -> number of RMSDs in that tic
            d_count = {'all':{},}
            l_range = []
            for dist in l_distances:
                d_count[dist] = {}
            for dist in d_count.keys():
                ## ???
                for rmsd in range(int(div*min(d_l_rmsds_histogram[dist])),int(div*max(d_l_rmsds_histogram[dist]))+1):
                    d_count[dist][rmsd] = 0
                for rmsd in d_l_rmsds_histogram[dist]:
                    d_count[dist][int(div*rmsd)] += 1
                ## only tics with more than 10 counts define the plotted range
                for rmsd,count in d_count[dist].items():
                    if count > 10:
                        l_range += [rmsd]

            ## xtics
            d_xtics = {}
            for rmsd in range(
                int(min(l_range)/div),
                int(max(l_range)/div)+1,
                ):
                d_xtics['%s' %(rmsd)] = rmsd

            ## convert dict to txt
            for dist in d_count.keys():
                lines = []
                for rmsd in range(
                    int(min(l_range)),
                    int(max(l_range))+1,
                    ):
                    if not rmsd in d_count[dist].keys():
                        count = 0
                    else:
                        count = d_count[dist][rmsd]
                    lines += ['%s %s\n' %(rmsd/div, count)]
                fn = 'histogram_%s_distfrommut%s_radius%s.gnuplotdata' %(key, dist, radius)
                fd = open(fn,'w')
                fd.writelines(lines)
                fd.close()


##            plot_histogram(prefix,suffix,'RMSD (Angstrom)','count of RMSDs',d_xtics=d_xtics)
        ## one plot per distance bin, combining the data files of all radii
        ## (d_count here is the one left over from the last radius above)
        for dist in d_count.keys():
            l_plot_files = []
            for radius in l_radii:
                fn = 'histogram_%s_distfrommut%s_radius%s.gnuplotdata' %(key, dist, radius)
                l_plot_files += [fn]
            prefix = 'histogram_%s_distfrommut%s' %(key,dist,)
            if dist == 'all':
                title = 'all distances from site of mutation'
            else:
                title = '%s {\305} from site of mutation' %(dist)
            plot_histogram(prefix,suffix,xlabel,'count of RMSDs',l_plot_files=l_plot_files,title=title)

    ## this loop only rebinds l_plot_files, which is not read afterwards
    for key in d_columns.keys():
        l_plot_files = []
        

    ##        
    ## average RMSD as a function of distance from the mutation
    ##
    for key in d_columns.keys():
##        xlabel = 'distance {Symbol \305} from mutation'
        xlabel = 'distance ({\305}) from mutation'
        ## NOTE(review): the d_units values already contain braces or
        ## '/Symbol', and are nested inside '{Symbol ...}' here - confirm
        ## that the resulting label renders as intended
        ylabel = 'RMSD ({Symbol %s}) within sphere' %(d_units[key],)
        title = '%s' %(d_octals[key])
        for i_radii in range(len(l_radii)):
            radius = l_radii[i_radii]
            lines = []
            for dist_from_mut in d_rmsds[radius][key].keys():
                l_rmsds = d_rmsds[radius][key][dist_from_mut]
##                if len(l_rmsds) == 0:
##                if len(l_rmsds) < 100:
##                if len(l_rmsds) < 400:
                ## skip distances with fewer than 1000 values; the skip is
                ## only reported for distances below 10
                if len(l_rmsds) < 1000:
                    if dist_from_mut < 10:
                        print 'skipping', dist_from_mut, key, radius
                    continue
                average,stderr = statistics.do_stderr(l_rmsds)
                ## x is shifted by 0.1 per radius index (presumably so the
                ## series of the different radii do not overlap in the plot)
                lines += ['%s %s %s\n' %(dist_from_mut+0.1*i_radii,average,stderr)]
            fd = open('%s_%s.gnuplotdata' %(key,radius),'w')
            fd.writelines(lines)
            fd.close()
        plot_scatter(key,xlabel,ylabel,l_radii,title=title,)

    plot_scatter_combined(suffix)

    return
# Example #3
# 0
def main():
    """Print per-residue statistics of normalised backbone B-factors.

    Stage 1 (currently disabled: both of its loops start with an
    unconditional continue) reads the PDB files below `path`, keeps the
    B-factors of backbone atoms (N, CA, C, O) of the residue names known to
    init_bfactor_dic(), classifies each atom with ATOMrecord2SS using the
    HELIX/SHEET records, divides every B-factor by the average over the
    file and appends the result to 'bfac_<res_name>_<ss>_<atom_name>.txt'.

    Stage 2 reads those text files back for every residue name and for the
    classes HELIX, SHEET and OTHER, and prints one tab-separated line per
    residue name: name, number of values, their mean, and the stderr value
    returned by statistics.do_stderr.

    Relies on the module-level names init_bfactor_dic, SSrecord2dic,
    ATOMrecord2SS and statistics.  The bare name `stop` used below is not
    defined in this function; NOTE(review): evaluating it presumably raises
    NameError and acts as a deliberate halt - confirm.
    """

    import os
    d_bfactors1 = init_bfactor_dic()

    ## the second assignment overrides the first
    path = '/oxygenase_local/data/pdb'
    path = '/media/WDMyBook1TB/2TB/pdb'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i in range(len(l_dns)):
        ## this continue skips the whole loop body, so no PDB file is read
        continue
        dn = l_dns[i]
        ##        if dn != 'e7':
        ##            continue
        ##        if dn < 'xk':
        ##            continue
        print '%s/%s %s' % (i + 1, len(l_dns), dn)
        l_fns = os.listdir('%s/%s' % (path, dn))
        for fn in l_fns:
            continue
            ## characters 3-6 of the file name are used as the PDB ID
            pdb = fn[3:7]
            if pdb in [
                    '2wto',
                    '2wtp',
            ]:  ## remediation
                continue
            l_bfactors = []
            d_bfactors2 = init_bfactor_dic()
            d_ss = {}
            fd = open('%s/%s/%s' % (path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()
            for line in lines:
                record = line[:6].strip()
                ## stop reading the file unless the method is X-ray
                ## diffraction only; unrecognised methods are printed first
                if record == 'EXPDTA':
                    if line[10:13] == 'NMR':
                        break
                    elif line[10:22] == 'SOLUTION NMR':
                        break
                    elif line[10:25] == 'SOLID-STATE NMR':
                        break
                    elif line[
                            10:
                            41] == 'X-RAY DIFFRACTION; SOLUTION NMR':  ## 1iob
                        break
                    elif line[
                            10:
                            43] == 'SOLUTION SCATTERING; SOLUTION NMR':  ## 2klj
                        break
                    elif line[10:29] == 'ELECTRON MICROSCOPY':
                        break
                    elif line[10:27] != 'X-RAY DIFFRACTION':
                        print line
                        break
                    ## an NMR mention that got past the checks above
                    if 'NMR' in line:
                        print fn
                        print line
                        stop
                elif record == 'HEADER':
                    ## last two characters of columns 50-58, as an integer
                    year = int(line[50:59][-2:])
                elif record == 'HELIX':
                    ## halt if the two chain columns of the record differ
                    if line[19] != line[31]:
                        print line
                        print pdb
                        stop
                    chain = line[19]
                    res_no1 = int(line[21:25])
                    iCode1 = line[25]
                    res_no2 = int(line[33:37])
                    iCode2 = line[37]
                    d_ss = SSrecord2dic(record, chain, res_no1, res_no2,
                                        iCode1, iCode2, d_ss)
                elif record == 'SHEET':
                    ## halt if the two chain columns of the record differ
                    if line[21] != line[32]:
                        print line
                        print pdb
                        stop
                    chain = line[21]
                    res_no1 = int(line[22:26])
                    iCode1 = line[26]
                    res_no2 = int(line[33:37])
                    iCode2 = line[37]
                    d_ss = SSrecord2dic(record, chain, res_no1, res_no2,
                                        iCode1, iCode2, d_ss)
                elif record == 'TURN':
                    ## TURN records halt immediately; the parsing below the
                    ## first `stop` is therefore not reached
                    stop
                    if line[19] != line[30]:
                        stop
                    chain = line[19]
                    res_no1 = int(line[20:24])
                    iCode1 = line[24]
                    res_no2 = int(line[31:35])
                    iCode2 = line[35]
                    d_ss = SSrecord2dic(record, chain, res_no1, res_no2,
                                        iCode1, iCode2, d_ss)
                elif record == 'ATOM':

                    res_name = line[17:20]
                    ## amino acid residues only
                    if res_name not in d_bfactors2.keys():
                        continue

                    atom_name = line[12:16].strip()
                    ## backbone atoms only
                    if atom_name not in [
                            'N',
                            'CA',
                            'C',
                            'O',
                    ]:
                        continue

                    chain = line[21]
                    res_no = int(line[22:26])
                    iCode = line[26]
                    bfactor = float(line[60:66])

                    l_bfactors += [bfactor]
                    ## secondary structure class of this residue, looked up
                    ## in the dictionary built from the HELIX/SHEET records
                    ss = ATOMrecord2SS(
                        chain,
                        res_no,
                        iCode,
                        d_ss,
                    )
                    d_bfactors2[res_name][ss][atom_name] += [bfactor]

            ## no amino acids (or break)
            if len(l_bfactors) == 0:
                continue
            average = sum(l_bfactors) / len(l_bfactors)
            ## NMR structure
            if average == 0:
                continue
            ## identical temperature factors for all atoms
            if len(l_bfactors) * [average] == l_bfactors:
                print 'all bfactors same', pdb, 'year', year, 'bfac', average
                continue
            ## halt on an average that is exactly one of these round values
            if average in [
                    2, 3, 4, 5, 6, 7, 8, 9, 99, 90, 50, 20, 25, 1, 100, 10, 0
            ]:
                print average
                print fn
                stop

            ## append the B-factors, divided by the file average, to one
            ## text file per residue name / class / atom name
            for res_name in d_bfactors2.keys():
                for ss in d_bfactors2[res_name].keys():
                    for atom_name in d_bfactors2[res_name][ss].keys():
                        l_bfactors = d_bfactors2[res_name][ss][atom_name]
                        if len(l_bfactors) == 0:
                            continue
                        lines_out = [
                            '%s\n' % (bfac / average) for bfac in l_bfactors
                        ]
                        fd = open(
                            'bfac_%s_%s_%s.txt' % (res_name, ss, atom_name),
                            'a')
                        fd.writelines(lines_out)
                        fd.close()
##                        for bfactor in d_bfactors2[res_name][ss][atom_name]:
##                            bfactor_normalized = bfactor/average
##                            d_bfactors1[res_name][ss][atom_name] += [bfactor_normalized]

    ## stage 2: pool the values stored on disk per residue name.
    ## d_bfactors1 is only used for its keys here.
    for res_name in d_bfactors1.keys():
        l_bfactors_res = []
        for ss in [
                'HELIX',
                'SHEET',
                'OTHER',
        ]:
            for atom_name in d_bfactors1[res_name][ss].keys():
                ## open() raises if the file was never written
                fd = open('bfac_%s_%s_%s.txt' % (res_name, ss, atom_name), 'r')
                lines = fd.readlines()
                fd.close()
                l_bfactors = [float(line) for line in lines]
                l_bfactors_res += l_bfactors
##                average, stderr = statistics.do_stderr(l_bfactors)
##                print '%s\t%s\t%s\t%s\t%s\t%s' %(
##                    res_name, ss, atom_name,
##                    len(l_bfactors),
##                    sum(l_bfactors)/len(l_bfactors),
##                    stderr,
##                    )
        average, stderr = statistics.do_stderr(l_bfactors_res)
        ## the mean is recomputed here; an empty list would divide by zero
        print '%s\t%s\t%s\t%s' % (
            res_name,
            len(l_bfactors_res),
            sum(l_bfactors_res) / len(l_bfactors_res),
            stderr,
        )
# Example #4
# 0
def main():
    """Bin RMSD values by distance from the mutation and write gnuplot data.

    Reads 'sphere/out_<suffix>_allresidues.txt', skips lines whose first or
    second field (upper-cased) is in the module-level s_exclude, and stores
    the RMSD columns of each measure (CA, heavy, phipsi, chi1) and each
    sphere radius in l_radii under the integer distance taken from field 8.

    Output:
    - 'histogram.gnuplotdata': number of CA RMSDs at radius 10 per distance,
      plotted with plot_histogram.
    - '<key>_<radius>.gnuplotdata': distance, and the two values returned by
      statistics.do_stderr (average, stderr), for distances with at least
      1000 RMSDs; plotted with plot_scatter and plot_scatter_combined.

    The per-measure histogram section is currently disabled by an
    unconditional break at the top of its loop.

    Relies on the module-level names s_exclude, statistics, plot_histogram,
    plot_scatter and plot_scatter_combined.
    """

    ## gnuplot label fragments per measure (octal escapes in the strings)
    d_units = {
        'CA': '{\305}',
        'heavy': '\305',
        'phipsi': '/Symbol \260',
        'chi1': '/Symbol \260'
    }  ## octals
    d_octals = {
        'CA': 'C_{/Symbol a}',
        'heavy': 'heavy atom',
        'phipsi': '{/Symbol f}{/Symbol y}',
        'chi1': '{/Symbol c}_1'
    }  ## octals
    l_radii = [
        10,
        20,
        40,
    ]
    ## index of the first field of each measure in an input line; the fields
    ## for the successive radii follow it (see l[col + i_radii] below)
    d_columns = {
        'CA': 9,
        'heavy': 13,
        'phipsi': 17,
        'chi1': 21,
    }
    l_distances = [
        0,
        10,
        20,
        40,
    ]
    ##    suffix = 'sameSG_sameauth'
    suffix = 'sameSG'

    ## d_rmsds[radius][key][dist_from_mut] -> list of RMSDs.
    ## Only distances 0..83 are created here, so a larger integer distance
    ## in the input raises KeyError further down.
    d_rmsds = {}
    for radius in l_radii:
        d_rmsds[radius] = {}
        for key in d_columns.keys():
            d_rmsds[radius][key] = {}
            for dist_from_mut in range(0, 83 + 1):
                d_rmsds[radius][key][dist_from_mut] = []

    ## the function sphere in quakes.py wrote this file when using the -singlemutants flag
    fn = 'sphere/out_%s_allresidues.txt' % (suffix)
    fd = open(fn, 'r')
    lines = fd.readlines()
    fd.close()
    print 'read', fn

    print 'looping over lines'
    for line in lines:
        l = line.split()
        ## exclude HEWL and T4L
        if l[0].upper() in s_exclude or l[1].upper() in s_exclude:
            continue
        ## truncate the distance to an integer bin
        dist_from_mut = int(float(l[8]))
        for key, col in d_columns.items():
            for i_radii in range(len(l_radii)):
                radius = l_radii[i_radii]
                rmsd = l[col + i_radii]
                ## the literal string 'None' marks a missing value
                if rmsd == 'None':
                    continue
                else:
                    d_rmsds[radius][key][dist_from_mut] += [float(rmsd)]
    print 'looped over lines'

    ##
    ## histogram
    ##
    ## number of CA RMSDs (radius 10) per distance from the mutation
    lines = []
    for dist_from_mut in d_rmsds[10]['CA'].keys():
        lines += [
            '%s %s\n' % (dist_from_mut, len(d_rmsds[10]['CA'][dist_from_mut]))
        ]
    fd = open('histogram.gnuplotdata', 'w')
    fd.writelines(lines)
    fd.close()
    ##    plot_histogram('histogram',suffix,'distance (\305) from mutation','count of RMSDs',)
    plot_histogram(
        'histogram',
        suffix,
        'distance ({\305}) from mutation',
        'count of RMSDs',
    )

    ##
    ## histograms
    ##
    ## divide 1Angstrom into smaller units/tics
    ##    d_divs = {'CA':1000.,'heavy':500.,'phipsi':50.,'chi1':20.}
    d_divs = {'CA': 500., 'heavy': 500., 'phipsi': 50., 'chi1': 20.}
    for key in d_columns.keys():

        ## the unconditional break below leaves the loop on the first
        ## iteration, so the rest of this loop body is never executed
        print 'temporarily dont plot histograms'
        break

        xlabel = '%s RMSD (%s)' % (d_octals[key], d_units[key])
        div = d_divs[key]
        l_plot_files = []
        for radius in l_radii:

            d_l_rmsds_histogram = {
                'all': [],
            }
            for dist in l_distances:
                d_l_rmsds_histogram[dist] = []

##            l_rmsds = []
## organize data
            ## each distance goes into the first l_distances bin that is >= it
            ## (distances above the largest bin only go into 'all')
            for dist_from_mut in d_rmsds[radius][key].keys():
                for dist in l_distances:
                    if dist_from_mut <= dist:
                        d_l_rmsds_histogram[dist] += d_rmsds[radius][key][
                            dist_from_mut]
                        break
##                l_rmsds += d_rmsds[radius][key][dist_from_mut]
                d_l_rmsds_histogram['all'] += d_rmsds[radius][key][
                    dist_from_mut]

            ## count (yvalues)
            ## d_count[dist][int(div * rmsd)] -> number of RMSDs in that tic
            d_count = {
                'all': {},
            }
            l_range = []
            for dist in l_distances:
                d_count[dist] = {}
            for dist in d_count.keys():
                ## ???
                for rmsd in range(
                        int(div * min(d_l_rmsds_histogram[dist])),
                        int(div * max(d_l_rmsds_histogram[dist])) + 1):
                    d_count[dist][rmsd] = 0
                for rmsd in d_l_rmsds_histogram[dist]:
                    d_count[dist][int(div * rmsd)] += 1
                ## only tics with more than 10 counts define the plotted range
                for rmsd, count in d_count[dist].items():
                    if count > 10:
                        l_range += [rmsd]

            ## xtics
            d_xtics = {}
            for rmsd in range(
                    int(min(l_range) / div),
                    int(max(l_range) / div) + 1,
            ):
                d_xtics['%s' % (rmsd)] = rmsd

            ## convert dict to txt
            for dist in d_count.keys():
                lines = []
                for rmsd in range(
                        int(min(l_range)),
                        int(max(l_range)) + 1,
                ):
                    if not rmsd in d_count[dist].keys():
                        count = 0
                    else:
                        count = d_count[dist][rmsd]
                    lines += ['%s %s\n' % (rmsd / div, count)]
                fn = 'histogram_%s_distfrommut%s_radius%s.gnuplotdata' % (
                    key, dist, radius)
                fd = open(fn, 'w')
                fd.writelines(lines)
                fd.close()


##            plot_histogram(prefix,suffix,'RMSD (Angstrom)','count of RMSDs',d_xtics=d_xtics)
        ## one plot per distance bin, combining the data files of all radii
        ## (d_count here is the one left over from the last radius above)
        for dist in d_count.keys():
            l_plot_files = []
            for radius in l_radii:
                fn = 'histogram_%s_distfrommut%s_radius%s.gnuplotdata' % (
                    key, dist, radius)
                l_plot_files += [fn]
            prefix = 'histogram_%s_distfrommut%s' % (
                key,
                dist,
            )
            if dist == 'all':
                title = 'all distances from site of mutation'
            else:
                title = '%s {\305} from site of mutation' % (dist)
            plot_histogram(prefix,
                           suffix,
                           xlabel,
                           'count of RMSDs',
                           l_plot_files=l_plot_files,
                           title=title)

    ## this loop only rebinds l_plot_files, which is not read afterwards
    for key in d_columns.keys():
        l_plot_files = []

    ##
    ## average RMSD as a function of distance from the mutation
    ##
    for key in d_columns.keys():
        ##        xlabel = 'distance {Symbol \305} from mutation'
        xlabel = 'distance ({\305}) from mutation'
        ## NOTE(review): the d_units values already contain braces or
        ## '/Symbol', and are nested inside '{Symbol ...}' here - confirm
        ## that the resulting label renders as intended
        ylabel = 'RMSD ({Symbol %s}) within sphere' % (d_units[key], )
        title = '%s' % (d_octals[key])
        for i_radii in range(len(l_radii)):
            radius = l_radii[i_radii]
            lines = []
            for dist_from_mut in d_rmsds[radius][key].keys():
                l_rmsds = d_rmsds[radius][key][dist_from_mut]
                ##                if len(l_rmsds) == 0:
                ##                if len(l_rmsds) < 100:
                ##                if len(l_rmsds) < 400:
                ## skip distances with fewer than 1000 values; the skip is
                ## only reported for distances below 10
                if len(l_rmsds) < 1000:
                    if dist_from_mut < 10:
                        print 'skipping', dist_from_mut, key, radius
                    continue
                average, stderr = statistics.do_stderr(l_rmsds)
                ## x is shifted by 0.1 per radius index (presumably so the
                ## series of the different radii do not overlap in the plot)
                lines += [
                    '%s %s %s\n' %
                    (dist_from_mut + 0.1 * i_radii, average, stderr)
                ]
            fd = open('%s_%s.gnuplotdata' % (key, radius), 'w')
            fd.writelines(lines)
            fd.close()
        plot_scatter(
            key,
            xlabel,
            ylabel,
            l_radii,
            title=title,
        )

    plot_scatter_combined(suffix)

    return
# Example #5
# 0
def main():
    """Print per-residue statistics of normalised backbone B-factors.

    Stage 1 (currently disabled: both of its loops start with an
    unconditional continue) reads the PDB files below `path`, keeps the
    B-factors of backbone atoms (N, CA, C, O) of the residue names known to
    init_bfactor_dic(), classifies each atom with ATOMrecord2SS using the
    HELIX/SHEET records, divides every B-factor by the average over the
    file and appends the result to 'bfac_<res_name>_<ss>_<atom_name>.txt'.

    Stage 2 reads those text files back for every residue name and for the
    classes HELIX, SHEET and OTHER, and prints one tab-separated line per
    residue name: name, number of values, their mean, and the stderr value
    returned by statistics.do_stderr.

    Relies on the module-level names init_bfactor_dic, SSrecord2dic,
    ATOMrecord2SS and statistics.  The bare name `stop` used below is not
    defined in this function; NOTE(review): evaluating it presumably raises
    NameError and acts as a deliberate halt - confirm.
    """

    import os
    d_bfactors1 = init_bfactor_dic()

    ## the second assignment overrides the first
    path = '/oxygenase_local/data/pdb'
    path = '/media/WDMyBook1TB/2TB/pdb'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i in range(len(l_dns)):
        ## this continue skips the whole loop body, so no PDB file is read
        continue
        dn = l_dns[i]
##        if dn != 'e7':
##            continue
##        if dn < 'xk':
##            continue
        print '%s/%s %s' %(i+1,len(l_dns), dn)
        l_fns = os.listdir('%s/%s' %(path,dn))
        for fn in l_fns:
            continue
            ## characters 3-6 of the file name are used as the PDB ID
            pdb = fn[3:7]
            if pdb in ['2wto','2wtp',]: ## remediation
                continue
            l_bfactors = []
            d_bfactors2 = init_bfactor_dic()
            d_ss = {}
            fd = open('%s/%s/%s' %(path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()
            for line in lines:
                record = line[:6].strip()
                ## stop reading the file unless the method is X-ray
                ## diffraction only; unrecognised methods are printed first
                if record == 'EXPDTA':
                    if line[10:13] == 'NMR':
                        break
                    elif line[10:22] == 'SOLUTION NMR':
                        break
                    elif line[10:25] == 'SOLID-STATE NMR':
                        break
                    elif line[10:41] == 'X-RAY DIFFRACTION; SOLUTION NMR': ## 1iob
                        break
                    elif line[10:43] == 'SOLUTION SCATTERING; SOLUTION NMR': ## 2klj
                        break
                    elif line[10:29] == 'ELECTRON MICROSCOPY':
                        break
                    elif line[10:27] != 'X-RAY DIFFRACTION':
                        print line
                        break
                    ## an NMR mention that got past the checks above
                    if 'NMR' in line:
                        print fn
                        print line
                        stop
                elif record == 'HEADER':
                    ## last two characters of columns 50-58, as an integer
                    year = int(line[50:59][-2:])
                elif record == 'HELIX':
                    ## halt if the two chain columns of the record differ
                    if line[19] != line[31]:
                        print line
                        print pdb
                        stop
                    chain = line[19]
                    res_no1 = int(line[21:25])
                    iCode1 = line[25]
                    res_no2 = int(line[33:37])
                    iCode2 = line[37]
                    d_ss = SSrecord2dic(record,chain,res_no1,res_no2,iCode1,iCode2,d_ss)
                elif record == 'SHEET':
                    ## halt if the two chain columns of the record differ
                    if line[21] != line[32]:
                        print line
                        print pdb
                        stop
                    chain = line[21]
                    res_no1 = int(line[22:26])
                    iCode1 = line[26]
                    res_no2 = int(line[33:37])
                    iCode2 = line[37]
                    d_ss = SSrecord2dic(record,chain,res_no1,res_no2,iCode1,iCode2,d_ss)
                elif record == 'TURN':
                    ## TURN records halt immediately; the parsing below the
                    ## first `stop` is therefore not reached
                    stop
                    if line[19] != line[30]:
                        stop
                    chain = line[19]
                    res_no1 = int(line[20:24])
                    iCode1 = line[24]
                    res_no2 = int(line[31:35])
                    iCode2 = line[35]
                    d_ss = SSrecord2dic(record,chain,res_no1,res_no2,iCode1,iCode2,d_ss)
                elif record == 'ATOM':

                    res_name = line[17:20]
                    ## amino acid residues only
                    if res_name not in d_bfactors2.keys():
                        continue

                    atom_name = line[12:16].strip()
                    ## backbone atoms only
                    if atom_name not in ['N','CA','C','O',]:
                        continue

                    chain = line[21]
                    res_no = int(line[22:26])
                    iCode = line[26]    
                    bfactor = float(line[60:66])

                    l_bfactors += [bfactor]
                    ## secondary structure class of this residue, looked up
                    ## in the dictionary built from the HELIX/SHEET records
                    ss = ATOMrecord2SS(chain,res_no,iCode,d_ss,)
                    d_bfactors2[res_name][ss][atom_name] += [bfactor]

            ## no amino acids (or break)
            if len(l_bfactors) == 0:
                continue
            average = sum(l_bfactors)/len(l_bfactors)
            ## NMR structure
            if average == 0:
                continue
            ## identical temperature factors for all atoms
            if len(l_bfactors)*[average] == l_bfactors:
                print 'all bfactors same', pdb, 'year', year, 'bfac', average
                continue
            ## halt on an average that is exactly one of these round values
            if average in [2,3,4,5,6,7,8,9,99,90,50,20,25,1,100,10,0]:
                print average
                print fn
                stop

            ## append the B-factors, divided by the file average, to one
            ## text file per residue name / class / atom name
            for res_name in d_bfactors2.keys():
                for ss in d_bfactors2[res_name].keys():
                    for atom_name in d_bfactors2[res_name][ss].keys():
                        l_bfactors = d_bfactors2[res_name][ss][atom_name]
                        if len(l_bfactors) == 0:
                            continue
                        lines_out = ['%s\n' %(bfac/average) for bfac in l_bfactors]
                        fd = open('bfac_%s_%s_%s.txt' %(res_name,ss,atom_name),'a')
                        fd.writelines(lines_out)
                        fd.close()
##                        for bfactor in d_bfactors2[res_name][ss][atom_name]:
##                            bfactor_normalized = bfactor/average
##                            d_bfactors1[res_name][ss][atom_name] += [bfactor_normalized]

    ## stage 2: pool the values stored on disk per residue name.
    ## d_bfactors1 is only used for its keys here.
    for res_name in d_bfactors1.keys():
        l_bfactors_res = []
        for ss in ['HELIX','SHEET','OTHER',]:
            for atom_name in d_bfactors1[res_name][ss].keys():
                ## open() raises if the file was never written
                fd = open('bfac_%s_%s_%s.txt' %(res_name,ss,atom_name),'r')
                lines = fd.readlines()
                fd.close()
                l_bfactors = [float(line) for line in lines]
                l_bfactors_res += l_bfactors
##                average, stderr = statistics.do_stderr(l_bfactors)
##                print '%s\t%s\t%s\t%s\t%s\t%s' %(
##                    res_name, ss, atom_name,
##                    len(l_bfactors),
##                    sum(l_bfactors)/len(l_bfactors),
##                    stderr,
##                    )
        average, stderr = statistics.do_stderr(l_bfactors_res)
        ## the mean is recomputed here; an empty list would divide by zero
        print '%s\t%s\t%s\t%s' %(
            res_name,
            len(l_bfactors_res),
            sum(l_bfactors_res)/len(l_bfactors_res),
            stderr,
            )
def main():

    d_MV = {}

    path = '/data/mmCIF'
    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:
        if dn == 'mmCIF.py':
            continue
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        l_fn = os.listdir('%s/%s' %(path,dn))
        for fn in l_fn:
            pdb = fn[:4]
##            if pdb.upper() not in s_pdbs:
##                continue
            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks = {'_exptl.method':'SOLUTION NMR'},
                l_data_categories = [
                    '_cell','_entity','_exptl','_exptl_crystal',
                    '_entity_poly',
                    '_symmetry',
                    ## virus
                    '_pdbx_struct_assembly',
                    ## split structure
                    '_pdbx_database_related',
                    ],
                )

            ## x-ray structure
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            ## polymer present
            if not '_entity_poly.type' in d_mmCIF.keys():
                continue

            ## only polymer present is protein
            if d_mmCIF['_entity_poly.type'] != len(d_mmCIF['_entity_poly.type'])*['polypeptide(L)']:
                continue

            if not '_pdbx_struct_assembly.oligomeric_count' in d_mmCIF.keys():
                continue

            if d_mmCIF['_pdbx_struct_assembly.oligomeric_count'] == len(d_mmCIF['_pdbx_struct_assembly.oligomeric_count'])*['?']:
                continue
            
            ## virus
            if int(d_mmCIF['_pdbx_struct_assembly.oligomeric_count'][0]) % 60 == 0:
                continue

            ## not monomer
            if d_mmCIF['_pdbx_struct_assembly.oligomeric_count'] != len(d_mmCIF['_pdbx_struct_assembly.oligomeric_count'])*['1']:
                continue

            ## split structure
            if '_pdbx_database_related' in d_mmCIF.keys():
                if 'split' in d_mmCIF['_pdbx_database_related']:
                    continue
                if 'SPLIT' in d_mmCIF['_pdbx_database_related']:
                    print pdb
                    stop

            if not '_cell.Z_PDB' in d_mmCIF.keys():
                continue

            if pdb in [
                ## treshold
                '1e54','1e9i',
                ## difference between calculated MV and MV in mmCIF
                '3eiq',
                ## The crystals diffracted to 1.7Angstrom and appeared to be I centered tetragonal with
                ## unit cell dimension a=198.42Angstrom and c=396.6Angstrom, however the data only merged successfully in P1
                ## unit cell a=196.61 b=196.48 c=240.63 alpha=65.91 beta=65.91 gamma=90.01.
                ## Toscana has published with Hellinga...
                '2cjf','2bt4',
                ]:
                continue

##            if not ''.join(d_mmCIF['_symmetry.space_group_name_H-M']) in [
##                'P 1','P 43 21 2','P 21 3','P 42 3 2','C 1 2 1','F 2 3','P 64 2 2','H 3',
##                ]:
##                continue ## tmp!!!

            a = float(d_mmCIF['_cell.length_a'][0])
            b = float(d_mmCIF['_cell.length_b'][0])
            c = float(d_mmCIF['_cell.length_c'][0])
            alpha = float(d_mmCIF['_cell.angle_alpha'][0])
            beta = float(d_mmCIF['_cell.angle_beta'][0])
            gamma = float(d_mmCIF['_cell.angle_gamma'][0])
            Z = int(d_mmCIF['_cell.Z_PDB'][0])
            mw = 0
            for i in range(len(d_mmCIF['_entity.id'])):
##                if d_mmCIF['_entity.type'][i] == 'polymer':
                    s = d_mmCIF['_entity.formula_weight'][i]
                    ## unknown ligand
                    if s == '?':
                        continue
                    mw += float(s)

            MV = matthews_coefficient.main(a,b,c,alpha,beta,gamma,mw,Z)

            spacegroup = ''.join(d_mmCIF['_symmetry.space_group_name_H-M'])

            if spacegroup not in [
                'F 4 3 2',
                'F 41 3 2',
                'I 41 3 2',
                ]:
                continue ## tmp!!!

            if MV > 10:
                print pdb
                print 'mw', mw
                print 'MV', MV, d_mmCIF['_exptl_crystal.density_Matthews']
                print 'Z', Z
                import math
                alpha *= math.pi/180.
                beta *= math.pi/180.
                gamma *= math.pi/180.
                V = a*b*c*math.sqrt(1-math.cos(alpha)**2-math.cos(beta)**2-math.cos(gamma)**2+2*(math.cos(alpha)*math.cos(beta)*math.cos(gamma)))
                print 'V', V
                continue
                stop_treshold
                stop
            if '_exptl_crystal.density_Matthews' in d_mmCIF.keys():
                if d_mmCIF['_exptl_crystal.density_Matthews'] not in [['?'],len(d_mmCIF['_exptl_crystal.density_Matthews'])*['?'],]:
                    if abs(MV-float(d_mmCIF['_exptl_crystal.density_Matthews'][0])) > 1:
                        print 'MV', MV
                        print 'MV', d_mmCIF['_exptl_crystal.density_Matthews']
                        print 'mw', mw
                        print 'Z', Z
                        continue
                        stop_difference


            if not spacegroup in d_MV.keys():
                d_MV[spacegroup] = []
            d_MV[spacegroup] += [MV]

            print pdb, round(MV,2), spacegroup

##    fd = open('MV_v_spacegroup.txt','w')
##    fd.write(str(d_MV))
##    fd.close()

    l = ['# MV_average MV_stddev n spacegroup\n']
    for spacegroup in d_MV.keys():
        l_MV = d_MV[spacegroup]
        if len(l_MV) <= 1:
            continue
        average, stddev = statistics.do_stddev(l_MV)
        average, stderr = statistics.do_stderr(l_MV)
##        l += ['%s %s %s %s\n' %(average,stddev,len(l_MV),spacegroup,)]
        l += ['%s %s %s %s\n' %(average,stderr,len(l_MV),spacegroup,)]

    fd = open('MV_v_spacegroup.txt','w')
    fd.writelines(l)
    fd.close()

    return