コード例 #1
0
def main():
    """Write the combined residue/neighbour records as text and as a pickle.

    The records come from ``read_pickle()`` and are iterated as
    ``(pro, phos_res, neighbor_res)`` triples.  Each triple becomes one
    fixed-width text line in the file opened through ``lt.open_file`` under
    the name 'hbplus_salt_combine_initial'; the untouched records are then
    pickled as 'hbplus_salt_combine'.
    """
    records = read_pickle()
    line_template = '{0:<8}{1:<15}{2}'

    with lt.open_file(file_name='hbplus_salt_combine_initial') as out:
        for protein, phos_res, neighbor_res in records:
            neighbor_field = ',   '.join(neighbor_res)
            # One record per line, same text the print-to-file form emits.
            out.write(
                line_template.format(protein, phos_res, neighbor_field) + '\n')

    lt.pickle_dump(records, 'hbplus_salt_combine')
コード例 #2
0
def main():
    """Gather the records of every ``.hb2`` file in a directory and save them.

    The directory is the last command-line argument.  All records returned by
    ``read_hb2`` are concatenated, passed to ``write_initial`` in full, and
    then pickled as 'hbplus' with their fourth field removed.
    """
    records = []
    for path in lt.files_in_dir(sys.argv[-1]):
        if not path.endswith('.hb2'):
            continue
        records.extend(read_hb2(path))

    write_initial(records)

    # Only the first three fields of each 4-field record are pickled.
    trimmed = []
    for pro, phos_res, neighbor_res, _unused in records:
        trimmed.append((pro, phos_res, neighbor_res))

    lt.pickle_dump(trimmed, 'hbplus')
コード例 #3
0
def mds(scores_matrix):
    """Embed a precomputed dissimilarity matrix in 2-D with MDS and plot it.

    Args:
        scores_matrix: dissimilarities handed straight to ``MDS.fit`` with
            ``dissimilarity='precomputed'`` (presumably a square, symmetric
            matrix -- confirm against the caller).

    Side effects:
        Pickles the x and y coordinates (each multiplied by 100) as 'pos_x'
        and 'pos_y' through ``lt.pickle_dump`` and saves a scatter plot of
        them to 'cluster_mds.png'.

    Returns:
        None.
    """
    # Seed the embedding so repeated runs on the same matrix give the same
    # layout; without random_state MDS starts from a fresh random
    # configuration on every call.
    seed = np.random.RandomState(seed=3)
    embedder = manifold.MDS(n_components=2, max_iter=3000000, eps=1e-9,
                            random_state=seed,
                            dissimilarity='precomputed', n_jobs=1)
    pos = embedder.fit(scores_matrix).embedding_
    x = [p[0] * 100 for p in pos]
    y = [p[1] * 100 for p in pos]

    lt.pickle_dump(x, 'pos_x')
    lt.pickle_dump(y, 'pos_y')
    plt.figure(1)
    plt.scatter(x, y)
    plt.savefig('cluster_mds.png')
    plt.close()
コード例 #4
0
def rcsb_uniprot():
    """Sweep RCSB search parameters for WD40 structures and report the hits.

    Steps:
      1. Query eight keyword sources in parallel through ``uniprot_wd40`` and
         add the accessions from ``get_wdsp_acc``, giving nine accession
         lists.
      2. For every ``beta`` in 0..23 and ``chain_len`` in 100..230 (step 10),
         call ``rcsb_acc_customreport`` with the union of all accessions and a
         resolution of 30.0, append an annotation score to each report row
         (the number of accession lists containing the row's accession), sort
         by score (highest first), pickle the rows and write them as a
         fixed-width text table.
      3. Draw a horizontal bar plot of the number of rows per parameter pair.
      4. Sort the parameter pairs by row count and, for each neighbouring pair
         where the count grows, write the rows that are new in the larger set
         to '<larger>_minus_<smaller>.txt'.

    Returns:
        None.  All output goes to files in the current directory.
    """
    keywords = [
        'pfam', 'smart', 'supfam', 'uniprot_repeat', 'uniprot_keyword',
        'prosite1', 'prosite2', 'prosite3'
    ]
    pool = Pool(8)
    try:
        result = pool.map(uniprot_wd40, keywords)
    finally:
        # Always release the worker processes, even if a query fails.
        pool.close()
        pool.join()

    # uniprot_wd40 yields (keyword, accessions) pairs; keep keyword order.
    wd40s = [v for k in keywords for r, v in result if r == k]
    wd40s.append(get_wdsp_acc())

    # Sets make both the union and the per-accession scoring cheap.
    wd40_sets = [set(w) for w in wd40s]
    total = set.union(*wd40_sets)

    def acc_score(acc):
        # If an entry appears in n different queries, its score is n.
        return sum(1 for w in wd40_sets if acc in w)

    def write_table(path, rows):
        # Rows are indexed as pdb, chain, acc, entity, resolution, chain_len,
        # release, score; the accession is printed first.
        template = '{0:<15}{1:<10}{2:<8}{3:<8}{4:<15}{5:<8}{6:<18}{7:<8}'
        with open(path, 'w') as w_f:
            w_f.write(template.format(
                'acc', 'pdb', 'chain', 'entity', 'resolution', 'chain_len',
                'release', 'score') + '\n')
            for p in rows:
                w_f.write(template.format(
                    p[2], p[0], p[1], p[3], p[4], p[5], p[6], p[7]) + '\n')

    # use acc to search rcsb
    resolution = 30.0
    results = []
    for beta in range(24):
        for chain_len in range(100, 240, 10):
            filename = str(beta) + '_' + str(chain_len)
            _, report = rcsb_acc_customreport(
                total, beta, chain_len, resolution)
            # report[0] is skipped (treated as the header row).
            pdb_scores = [row + [acc_score(row[2])] for row in report[1:]]
            pdb_scores.sort(key=lambda row: row[-1], reverse=True)
            lt.pickle_dump(pdb_scores, filename + '_pdb_scores')
            write_table(filename + '_uniprot_pdb_scores.txt', pdb_scores)
            results.append([filename, pdb_scores])

    # plot trending barplot
    keys = [r[0] for r in results]
    counts = [len(r[1]) for r in results]
    df = pd.DataFrame({'Parameters': keys, 'Num': counts})
    df = df.sort_values('Num', ascending=True)
    sns.set_color_codes('pastel')
    ax = sns.barplot(y='Parameters', x='Num', data=df, color='b')
    ax.figure.subplots_adjust(top=0.9, bottom=0.05, left=0.05, right=0.95)
    # Horizontal bars: counts run along x, parameter pairs along y.
    ax.set(xlabel='Num',
           ylabel='Parameters',
           title='WD40 Structures got with Different Parameters')
    plt.savefig('wd40_structures_got_by_different_parameters', dpi=300)
    plt.close('all')

    # get structures filtered by more strict parameters
    results.sort(key=lambda r: len(r[1]))
    for smaller, larger in zip(results[:-1], results[1:]):
        if len(smaller[1]) < len(larger[1]):
            # Rows are lists (unhashable), so compare them as tuples.
            seen = set(tuple(row) for row in smaller[1])
            added = [row for row in larger[1] if tuple(row) not in seen]
            write_table(larger[0] + '_minus_' + smaller[0] + '.txt', added)
コード例 #5
0
def main():
    """Compare the WD40 PDB ids found by four RCSB search methods.

    Runs the UniProt, SCOP, Pfam and text searches with beta=16,
    chain_len=160 and resolution=30.0, reduces each result string to a set of
    PDB ids, and then:

      * draws a heat map of pairwise overlap fractions
        (|row & column| / |row|),
      * draws Venn diagrams for several method combinations, one PNG each,
      * pickles the four sets as 'search_method' and writes them with
        ``write_lis_lis`` as 'rcsb_wd40_pdb',
      * draws a bar plot of the per-method counts on top of the size of the
        union of all four sets.

    Returns:
        None.  All output goes to files in the current directory.
    """
    beta, chain_len, resolution = 16, 160, 30.0
    uniprot_pdbids, _ = rcsb_uniprot(beta, chain_len, resolution)
    scop_pdbids = rcsb_scop(beta, chain_len, resolution)
    pfam_pdbids = rcsb_pfam(beta, chain_len, resolution)
    # NOTE(review): the "Text" ids are fetched with rcsb_pfam, so they are
    # identical to the Pfam ids.  This looks like a copy-paste slip; the
    # dedicated text-search helper is not visible here -- confirm and replace.
    txt_pdbids = rcsb_pfam(beta, chain_len, resolution)

    def pdb_ids(id_string):
        # Entries are comma separated and the last piece is discarded
        # (presumably an empty tail after a trailing comma); only the part
        # before ':' of each entry is kept.
        return set(u.split(':')[0] for u in id_string.split(',')[:-1])

    uniprot = pdb_ids(uniprot_pdbids)
    scop = pdb_ids(scop_pdbids)
    pfam = pdb_ids(pfam_pdbids)
    txt = pdb_ids(txt_pdbids)

    # plot heatmap of WD40s shared by different methods
    sns.set_color_codes('pastel')
    keys = ['Uniprot', 'SCOP', 'Pfam', 'Text']
    groups = [uniprot, scop, pfam, txt]
    table = []
    for w in groups:
        # An empty result set has no overlap fraction; report 0.0 instead of
        # dividing by zero.
        row = [len(w.intersection(wr)) * 1.0 / len(w) if w else 0.0
               for wr in groups]
        table.append(row)
    data = pd.DataFrame(table, columns=keys, index=keys)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    h = sns.heatmap(data, annot=True, fmt='.2f', cmap='Blues')
    h.figure.subplots_adjust(top=0.9, bottom=0.13, left=0.13, right=0.9)
    ax.set_xticklabels(keys, rotation=90)
    ax.set_yticklabels(keys[::-1], rotation=0)
    ax.set_title('Comaration of Different Annotation Methods')
    plt.savefig('Comaration_of Different_Annotation_Methods.png', dpi=300)
    plt.close('all')

    # Venn diagrams.  Every diagram gets its own file name so that no figure
    # overwrites another (the UniProt/Text one goes to 'uniprot_txt.png').
    sns.set_color_codes('bright')
    venn_jobs = [
        (venn2, [uniprot, scop], ['UniProt', 'SCOP'], 'uniprot_scop.png'),
        (venn2, [uniprot, pfam], ['UniProt', 'Pfam'], 'uniprot_pfam.png'),
        (venn2, [uniprot, txt], ['UniProt', 'Text'], 'uniprot_txt.png'),
        (venn2, [uniprot, scop | pfam | txt],
         ['UniProt', 'Pfam_SCOP_Text'], 'uniprot_pfam_scop_txt.png'),
        (venn2, [pfam, txt], ['Pfam', 'Text'], 'pfam_txt.png'),
        (venn2, [pfam, scop], ['Pfam', 'SCOP'], 'pfam_scop.png'),
        (venn3, [pfam, scop, txt], ['Pfam', 'SCOP', 'Text'],
         'pfam_scop_txt.png'),
    ]
    for draw, sets, labels, path in venn_jobs:
        draw(sets, labels)
        plt.savefig(path, dpi=300)
        plt.close('all')

    lt.pickle_dump([uniprot, scop, pfam, txt], 'search_method')
    write_lis_lis([uniprot, scop, pfam, txt], 'rcsb_wd40_pdb',
                  ['uniprot', 'scop', 'pfam', 'txt'])

    # plot barplot: pastel bars show the size of the union for every method,
    # muted bars drawn on top show what each method found on its own.
    f, ax = plt.subplots()
    all_ids = uniprot | scop | pfam | txt
    methods = ['UniProt', 'Pfam', 'Text', 'SCOP']
    sns.set_color_codes('pastel')
    wd = pd.DataFrame({
        'Search Method': methods,
        'Num': [len(all_ids)] * len(methods)
    })
    sns.barplot(x='Search Method', y='Num', data=wd, color='b')
    sns.set_color_codes('muted')
    wd = pd.DataFrame({
        'Search Method': methods,
        'Num': [len(s) for s in (uniprot, pfam, txt, scop)]
    })
    sns.barplot(x='Search Method', y='Num', data=wd, color='b')
    ax.set(xlabel='Search Method',
           ylabel='Num',
           title='WD40 Structures in RCSB')
    plt.savefig('wd40_in_RCSB_pdbs', dpi=300)
    plt.close('all')
コード例 #6
0
def rcsb_uniprot(beta=15, chain_len=150, resolution=3.5):
    """Find WD40 structures in RCSB from UniProt-side annotations.

    Eight keyword sources are queried in parallel through ``uniprot_wd40`` and
    the accessions from ``get_wdsp_acc`` are added, giving nine accession
    lists.  The union of all accessions is sent to ``rcsb_acc_customreport``;
    every report row gets an annotation score appended (the number of
    accession lists that contain the row's accession) and the rows are sorted
    by that score, highest first.

    Args:
        beta: forwarded to ``rcsb_acc_customreport``.
        chain_len: forwarded to ``rcsb_acc_customreport``.
        resolution: forwarded to ``rcsb_acc_customreport``.

    Side effects:
        Writes 'uniprot_pdb_scores.txt', three pickles ('pdb_scores',
        'pdb_acc_databases', 'pdb_acc_scores'), two bar plots and the matching
        ``write_lis_lis`` tables, and prints a completion message.

    Returns:
        ``(uniprot_pdbids, pdb_scores)``: the first value returned by
        ``rcsb_acc_customreport`` and the scored, sorted report rows.
    """
    keywords = [
        'pfam', 'smart', 'supfam', 'uniprot_repeat', 'uniprot_keyword',
        'prosite1', 'prosite2', 'prosite3'
    ]
    pool = Pool(8)
    try:
        result = pool.map(uniprot_wd40, keywords)
    finally:
        # Always release the worker processes, even if a query fails.
        pool.close()
        pool.join()

    # uniprot_wd40 yields (keyword, accessions) pairs; keep keyword order.
    wd40s = [v for k in keywords for r, v in result if r == k]
    wd40s.append(get_wdsp_acc())

    # Sets make both the union and the per-accession scoring cheap.
    wd40_sets = [set(w) for w in wd40s]
    total = set.union(*wd40_sets)

    def acc_score(acc):
        # If an entry appears in n different queries, its score is n.
        return sum(1 for w in wd40_sets if acc in w)

    # use acc to search rcsb
    uniprot_pdbids, report = rcsb_acc_customreport(total, beta, chain_len,
                                                   resolution)
    # report[0] is skipped (treated as the header row).
    pdb_scores = [row + [acc_score(row[2])] for row in report[1:]]
    pdb_scores.sort(key=lambda row: row[-1], reverse=True)
    lt.pickle_dump(pdb_scores, 'pdb_scores')

    # Rows are indexed as pdb, chain, acc, entity, resolution, chain_len,
    # release, score; the accession is printed first.
    template = '{0:<15}{1:<10}{2:<8}{3:<8}{4:<15}{5:<8}{6:<18}{7:<8}'
    with open('uniprot_pdb_scores.txt', 'w') as w_f:
        w_f.write(template.format(
            'acc', 'pdb', 'chain', 'entity', 'resolution', 'chain_len',
            'release', 'score') + '\n')
        for p in pdb_scores:
            w_f.write(template.format(
                p[2], p[0], p[1], p[3], p[4], p[5], p[6], p[7]) + '\n')

    # plot wd40 structures annotated by different database
    total_pdb = set(p[2] for p in pdb_scores)
    wd40s_pdb = [[a for a in w if a in total_pdb] for w in wd40s]
    lt.pickle_dump(wd40s_pdb, 'pdb_acc_databases')
    f, ax = plt.subplots()
    keys = [
        'Pfam', 'SMART', 'Superfamily', 'UniProt_repeat', 'UniProt_keyword',
        'Prosite1', 'Prosite2', 'Prosite3', 'WDSP'
    ]
    wd = pd.DataFrame({'Database': keys,
                       'Num': [len(w) for w in wd40s_pdb]})
    wd = wd.sort_values('Num', ascending=False)
    sns.set_color_codes('pastel')
    h = sns.barplot(y='Database', x='Num', data=wd, color='b')
    h.figure.subplots_adjust(top=0.9, bottom=0.05, left=0.14, right=0.95)
    # Horizontal bars: counts run along x, database names along y.
    ax.set(xlabel='Num',
           ylabel='Database',
           title='WD40 Structures Annotated by Different Database')
    plt.savefig('wd40_structures_accs_annotated_by_different_database',
                dpi=300)
    plt.close('all')
    write_lis_lis(wd40s_pdb,
                  'wd40_structures_accs_annotated_by_different_database', keys)

    # plot annotation score of wd40 structures
    n_sources = len(keys)
    score_buckets = [[] for _ in range(n_sources)]
    for p in pdb_scores:
        # Bucket i holds the accessions whose score is i + 1.
        # NOTE(review): a score of 0 would index bucket -1, i.e. the last
        # bucket; verify that every report accession is in the query set.
        score_buckets[p[-1] - 1].append(p[2])
    pdb_acc_scores = [set(bucket) for bucket in score_buckets]

    lt.pickle_dump(pdb_acc_scores, 'pdb_acc_scores')
    f, ax = plt.subplots()
    wd = pd.DataFrame({
        'Database Score': list(range(1, n_sources + 1)),
        'Num': [len(s) for s in pdb_acc_scores]
    })
    sns.set_color_codes('pastel')
    sns.barplot(x='Database Score', y='Num', data=wd, color='b')
    ax.set(xlabel='Database Score',
           ylabel='Num',
           title='Annotation Score of WD40 Structures')
    plt.savefig('wd40_structures_annotation_score_accs', dpi=300)
    plt.close('all')
    write_lis_lis(pdb_acc_scores, 'wd40_structures_annotation_score_accs',
                  [str(i) for i in range(1, n_sources + 1)])

    # Parenthesised single argument prints the same text on Python 2 and 3.
    print('uniprot search is finished')
    return uniprot_pdbids, pdb_scores
コード例 #7
0
def main():
    """Run the neighbour-filter pipeline on a pickled result and save each stage.

    The input pickle (path = last command-line argument) is iterated as
    ``(pdb, res, neighbors)`` triples whose residue and neighbour names are
    '_'-separated strings.  The code reads field 1 as residue id, field 2 as
    chain, field 3 of a neighbour as interaction type ('M' = main chain) and
    the last field of a neighbour as a hetero flag ('H').

    Stages, each written with ``write_result`` and pickled at the end:
      1. drop water ('HOH') neighbours, then entries left without neighbours;
      2. drop hetero neighbours, then entries left without neighbours;
      3. drop main-chain neighbours that are the residue itself;
      4. drop entries whose neighbour chains are exactly the residue's chain;
      4.0 (from stage 4) rename main-chain neighbours to 'GLY' and cut every
          neighbour name to three fields;
      5. (from stage 4) drop entries having any main-chain neighbour.

    NOTE(review): stage 3 keeps entries whose neighbour list became empty,
    and such entries also pass stage 4 -- confirm this is intended.
    NOTE(review): stage 4.0 uses two different output names
    ('4.0_dw_fh_fs_cm' and '4.0_dw_fh_pm_cm') -- confirm.
    """
    res_neighbors = lt.pickle_load(sys.argv[-1])

    # Stage 1: delete water.
    no_water = []
    for pdb, res, neighbors in res_neighbors:
        kept = [n for n in neighbors if 'HOH' not in n]
        if kept:
            no_water.append((pdb, res, kept))
    write_result(no_water, '1_delete_water')

    # Stage 2: filter hetero residues.
    no_hetero = []
    for pdb, res, neighbors in no_water:
        kept = [n for n in neighbors if n.split('_')[-1] != 'H']
        if kept:
            no_hetero.append((pdb, res, kept))
    write_result(no_hetero, '2_dw_filter_hetero')

    # Stage 3: a main-chain interaction with the residue itself is ignored.
    no_self_main = []
    for pdb, res, neighbors in no_hetero:
        res_id, res_chain = res.split('_')[1:3]
        kept = []
        for n in neighbors:
            words = n.split('_')
            is_own_main_chain = (words[3] == 'M' and words[2] == res_chain
                                 and words[1] == res_id)
            if not is_own_main_chain:
                kept.append(n)
        no_self_main.append((pdb, res, kept))
    write_result(no_self_main, '3_dw_fh_pm')

    # Stage 4: skip entries whose neighbours all sit on the residue's chain.
    cross_chain = []
    for pdb, res, neighbors in no_self_main:
        own_chain = {res.split("_")[2]}
        neighbor_chains = {n.split("_")[2] for n in neighbors}
        if own_chain != neighbor_chains:
            cross_chain.append((pdb, res, neighbors))
    write_result(cross_chain, '4_dw_fh_pm_fs')

    # Stage 4.0: main-chain interaction partners are relabelled as 'GLY'.
    as_gly = []
    for pdb, res, neighbors in cross_chain:
        relabelled = []
        for n in neighbors:
            words = n.split('_')
            head = 'GLY' if words[3] == 'M' else words[0]
            relabelled.append('_'.join([head] + words[1:3]))
        as_gly.append((pdb, res, relabelled))
    write_result(as_gly, '4.0_dw_fh_fs_cm')
    write_sta_result(as_gly, '4.0_dw_fh_pm_cm')

    # Stage 5: keep only entries without any main-chain interaction.
    side_chain_only = []
    for pdb, res, neighbors in cross_chain:
        kinds = [n.split('_')[3] for n in neighbors]
        if 'M' not in kinds:
            side_chain_only.append((pdb, res, neighbors))
    write_result(side_chain_only, '5_dw_fh_pm_fs_fm')
    write_sta_result(side_chain_only, '5_dw_fh_pm_fs_fm')

    # Persist every stage.
    stage_dumps = [
        (no_water, 'hbplus_salt_combine_1_dw'),
        (no_hetero, 'hbplus_salt_combine_2_dw_fh'),
        (no_self_main, 'hbplus_salt_combine_3_dw_fh_pm'),
        (cross_chain, 'hbplus_salt_combine_4_dw_fh_pm_fs'),
        (as_gly, 'hbplus_salt_combine_4.0_dw_fh_fs_cm'),
        (side_chain_only, 'hbplus_salt_combine_5_dw_fh_pm_fs_fm'),
    ]
    for stage, name in stage_dumps:
        lt.pickle_dump(stage, name)