# Build lookup structures from on-disk data:
#   * np_2_aa_seq: FASTA header (minus its first and last character) ->
#     sequence, for every record of every dat_nuccore/*.faa file.
#   * ncnc2rsds:   (nc1, nc2) -> {(np1, np2): rsd} for every dat_rsd/<a>-<b>
#     file, with each pair stored in sorted order.
#   * gene_graph:  undirected graph with one edge per (np1, np2) pair,
#     weighted by rsd.
import sys
import os
import pickle
import seqtools as st
from shared import *
from glob import *
import networkx as nx
from IPython import embed

np_2_aa_seq = {}
for fname in glob('dat_nuccore/*.faa'):
    nc = os.path.splitext(fname)[0]
    for h, s in zip(*st.read_fasta(fname)):
        # NOTE(review): this rebinds the module-level name `np`; if
        # `from shared import *` provides an `np` (e.g. numpy) it is
        # shadowed from here on -- confirm nothing later relies on it.
        np = h[1:-1]
        np_2_aa_seq[np] = s

gene_graph = nx.Graph()
ncnc2rsds = {}  # maps (nc,nc) -> {(np,np)->rsd, ...}
for fname in glob('dat_rsd/*-*'):
    nc1, nc2 = os.path.basename(fname).split('-')
    # Store the pair in sorted order.  Both arms of the conditional must be
    # parenthesised: without the parentheses the right-hand side parses as
    # the 3-tuple `nc1, (nc2 if nc1<nc2 else nc2), nc1` and the unpacking
    # into two names raises ValueError.
    nc1, nc2 = (nc1, nc2) if nc1 < nc2 else (nc2, nc1)
    rsds = {}
    ncnc2rsds[(nc1, nc2)] = rsds
    cum_rsd = 0
    cum_cnt = 1
    # `with` guarantees the file handle is closed after each file is read.
    with open(fname) as rsd_file:
        for l in rsd_file:
            l = l.split('\t')
            # Only tab-separated rows whose first field is 'OR' are used.
            if l[0] != 'OR':
                continue
            np1, np2, rsd = l[1:4]
            rsd = float(rsd)
            # Same sorted-order normalisation (and same precedence fix)
            # as for the file-level pair above.
            np1, np2 = (np1, np2) if np1 < np2 else (np2, np1)
            rsds[(np1, np2)] = rsd
            gene_graph.add_edge(np1, np2, weight=rsd)
# Cross-check the sequences in 'eugene_seqs.faa' against the aligned set.
#
# * Each FASTA header is reduced to h.strip()[1:].split(' ')[0], i.e. the
#   first space-delimited token with its leading character dropped.
# * Headers ending in '_RED' go to reh/res and headers ending in '_BLK' go
#   to beh/bes; in both cases the 4-character suffix is removed and the
#   sequence is also stored in eh2s under the shortened header.  Any other
#   header is reported with a "Couldn't classify" message.
# * The symmetric differences reh-vs-rah and beh-vs-bah are printed.
# * For every header in rah+bah, asmap[h] and eh2s[h] are compared after
#   removing '-' characters; a KeyError from either lookup is reported as a
#   missing eugene sequence and that header is skipped.
#
# NOTE(review): rah, bah and asmap are not defined in this script --
# presumably they come from `from shared import *`; confirm.
import seqtools as st from shared import * eh,es = st.read_fasta('eugene_seqs.faa') eh = [h.strip()[1:].split(' ')[0] for h in eh] eh2s = {} reh,res = [],[] beh,bes = [],[] for h,s in zip(eh,es): if h.endswith('_RED'): reh.append(h[:-4]) res.append(s) eh2s[h[:-4]]=s elif h.endswith('_BLK'): eh2s[h[:-4]]=s beh.append(h[:-4]) bes.append(s) else: print "Couldn't classify seq "+repr(h) print 'Red symmetric difference: ' + repr(list(set(reh).symmetric_difference(set(rah)))) print 'Black symmetric difference: ' + repr(list(set(beh).symmetric_difference(set(bah)))) for h in rah+bah: try: aln_seq = ''.join([c for c in asmap[h] if c!='-']) e_seq = ''.join([c for c in eh2s[h] if c!='-']) except KeyError: print "Couldn't find eugene seq corresponding to "+h continue if aln_seq!=e_seq:
YP_003262789 YP_003518975 ZP_01544314 ZP_02464292 ZP_05320398 ZP_06350196""".splitlines() alphabet = ['A','C','E','D','G','F','I','M','K','V','-','L','N','Q','P','S','R','T','W','H','Y'] alphabet3 = ['ALA', 'CYS', 'GLU', 'ASP', 'GLY', 'PHE', 'ILE', 'MET', 'LYS', 'VAL', '---', 'LEU', 'ASN', 'GLN', 'PRO', 'SER', 'ARG', 'THR', 'TRP', 'HIS', 'TYR'] la = len(alphabet) chr2idx = np.ones(256)*-1 for i in range(la): chr2idx[ord(alphabet[i])] = i idx2chr = alphabet aligned_hdrs, aligned_seqs = st.read_fasta(open('pdb_seqs_aln.faa')) aligned_hdrs = [h[1:-1] for h in aligned_hdrs] rah, ras = [],[] # Red Aligned Headers, Red Aligned Sequences bah, bas = [],[] rahp, rasp = [],[] # Red Aligned Headers from PDB bahp, basp = [],[] for h,s in zip(aligned_hdrs,aligned_seqs): if h in rnames: rah.append(h.strip()) ras.append(s.strip()) elif h in bnames: bah.append(h.strip()) bas.append(s.strip()) elif h.startswith('RED'): rahp.append(h.strip()) rasp.append(s.strip())
# For every PDB file matching the glob below, load the structure, take the
# first chain of the first model, read the B-factor of each residue's CA
# atom into ASAs, and look up the aligned sequence for the protein id
# derived from the file name.
import sys
import os
from Bio.PDB.PDBParser import PDBParser
from glob import glob
import seqtools as st
import numpy as np
from IPython import embed

aln_h, aln_s = st.read_fasta('pdb_seqs_aln.faa')
aln_h = [h[1:-1] for h in aln_h]  # Strip > and \n from '>SEQID_STR\n'
id2aln = dict(zip(aln_h, aln_s))

pdbs = glob('/mnt/hgfs/D/jon/Documents/2012/summer/pdbs/o_*')
pdbparser = PDBParser(PERMISSIVE=1)

ASA_ids = []
ASA_vals = []  # Array of (ASA array of len(aligned sequence))

# pdb_seqs.faa headers look like BLK_NP_24567
# ORDERED.fasta (Adrian) headers look like NP_24567
for fpath in pdbs:
    name = os.path.basename(fpath)
    # Drop the first and the last two '_'-separated fields of the file
    # name and upper-case what remains.
    protid = '_'.join(name.split('_')[1:-2]).upper()  # Like BLK_NP_12345
    structure = pdbparser.get_structure('struct', fpath)
    models = structure.get_list()
    # Hardcoded to get model 0, chain 0
    chain = models[0].get_list()[0]
    residues = chain.get_list()
    # The per-residue value is read from the CA atom's B-factor field.
    ASAs = [r['CA'].get_bfactor() for r in residues]
    try:
        aa_alignment = id2aln[protid]
    except KeyError:
        # Only a missing id is expected here; a bare `except:` would also
        # trap KeyboardInterrupt/SystemExit and drop into the debug shell.
        # Single-argument print(...) prints the same text on Python 2 and
        # also parses on Python 3.
        print("FAILED TO EXTRACT ASA FROM PDB")
        print("Couldn't maatch protein id %s to an alignment" % protid)
        # Open an interactive IPython shell so the mismatch can be inspected.
        embed()
Gln Q Gly G His H Ile I Leu L Lys K Met M Phe F Pro P Ser S Thr T Trp W Tyr Y Val V""".upper().splitlines()]) aln_h, aln_s = st.read_fasta('PROTEIN/ALIGNED.fasta') aln_h = [h[1:-1] for h in aln_h] # Strip > and \n from '>SEQID_STR\n' id2aln = dict(zip(aln_h,aln_s)) pdbs = glob('/mnt/hgfs/D/jon/Documents/2012/summer/pdbs/o_*') pdbparser = PDBParser(PERMISSIVE=1) AA_hdrs = [] AA_seqs = [] # Array of (ASA array of len(aligned sequence)) for fpath in pdbs: name = os.path.basename(fpath) protid = '_'.join(name.split('_')[1:-2]).upper() structure = pdbparser.get_structure('struct',fpath) models = structure.get_list() # Hardcoded to get model 0, chain 0 chain = models[0].get_list()[0] residues = chain.get_list() AAs = ''.join([three2one[r.get_resname()] for r in residues])