def create_nucleosome(outfile, save_chain=None, **chains):
    """Homology model a complete nucleosome particle.

    For each chain A-H the template sequence is aligned against the requested
    sequence, the per-chain alignments are merged (in chain order) into the
    multi-chain PIR file ``aln.pir`` and MODELLER's ``automodel`` is run on it.

    Parameters
    ----------
    outfile : file-like object
        Where to write the output.
        NOTE(review): nothing in this function writes to ``outfile`` yet;
        the model is only produced by ``a.make()``.
    save_chain : str
        Save only the structure from the chain with name ``save_chain``.
        NOTE(review): not used by the code below yet.
    chains : key, value pairs
        Key is the chain name, value is the sequence to model. Chains that
        are not given are modelled with the template's own sequence.

    Returns
    -------
    None
    """
    chain_names = "ABCDEFGH"

    # NOTE(review): SeqIO.parse() requires the template sequence file and its
    # format; neither is available here, so this call still has to be
    # completed before the function can run.
    template = {
        sequence.id.split("_")[1]: sequence
        for sequence in SeqIO.parse()
    }

    seq_aln = {}
    # Walk the chains in a fixed order instead of unpacking the dict keys
    # (which iterated over key strings and relied on undefined names).
    for chain_name in chain_names:
        template_seq = template[chain_name]
        template_seq.id = "template"
        # If the chain was not specified in the input, just use the sequence
        # from the template.
        input_chain = SeqRecord(
            Seq(chains.get(chain_name, str(template_seq.seq))), id="model")
        seqlist = [template_seq, input_chain]
        aln_pir = L_fasta2pir.aln(muscle_aln(seqlist))
        aln_pir.add_pir_info('template', 'structureX', 'template_nucleosome',
                             'FIRST', chain_name, 'LAST', chain_name)
        aln_pir.add_pir_info('model', 'sequence', 'model_nucleosome')
        seq_aln[chain_name] = aln_pir

    # Merge in chain order; dict ordering is not guaranteed.
    mult_aln = L_fasta2pir.aln_mult_chains(
        [seq_aln[chain_name] for chain_name in chain_names])
    mult_aln.write('aln.pir')

    # Now let's do MODELLER
    env = environ()  # create a new MODELLER environment to build this model in
    env.io.atom_files_directory = [os.path.sep, "HistoneDB"]
    a = automodel(env, alnfile='aln.pir', knowns='template', sequence='model')
    a.starting_model = 1
    a.ending_model = 1
    a.rand_method = None
    a.max_sc_mc_distance = 10
    a.max_sc_sc_distance = 10
    a.md_level = None  # We will do MD anyway, and the structures are similar.
    a.make()
def create_nucleosome(outfile, save_chain=None, **chains):
    """Homology model a complete nucleosome particle.

    For each chain A-H the template sequence is aligned against the requested
    sequence, the per-chain alignments are merged (in chain order) into the
    multi-chain PIR file ``aln.pir`` and MODELLER's ``automodel`` is run on it.

    Parameters
    ----------
    outfile : file-like object
        Where to write the output.
        NOTE(review): nothing in this function writes to ``outfile`` yet;
        the model is only produced by ``a.make()``.
    save_chain : str
        Save only the structure from the chain with name ``save_chain``.
        NOTE(review): not used by the code below yet.
    chains : key, value pairs
        Key is the chain name, value is the sequence to model. Chains that
        are not given are modelled with the template's own sequence.

    Returns
    -------
    None
    """
    chain_names = "ABCDEFGH"

    # NOTE(review): SeqIO.parse() requires the template sequence file and its
    # format; neither is available here, so this call still has to be
    # completed before the function can run.
    template = {sequence.id.split("_")[1]: sequence
                for sequence in SeqIO.parse()}

    seq_aln = {}
    # Walk the chains in a fixed order instead of unpacking the dict keys
    # (which iterated over key strings and relied on undefined names).
    for chain_name in chain_names:
        template_seq = template[chain_name]
        template_seq.id = "template"
        # If the chain was not specified in the input, just use the sequence
        # from the template.
        model_sequence = chains.get(chain_name, str(template_seq.seq))
        input_chain = SeqRecord(Seq(model_sequence), id="model")
        seqlist = [template_seq, input_chain]
        aln_pir = L_fasta2pir.aln(muscle_aln(seqlist))
        aln_pir.add_pir_info('template', 'structureX', 'template_nucleosome',
                             'FIRST', chain_name, 'LAST', chain_name)
        aln_pir.add_pir_info('model', 'sequence', 'model_nucleosome')
        seq_aln[chain_name] = aln_pir

    # Merge in chain order; dict ordering is not guaranteed.
    ordered_alignments = [seq_aln[chain_name] for chain_name in chain_names]
    mult_aln = L_fasta2pir.aln_mult_chains(ordered_alignments)
    mult_aln.write('aln.pir')

    # Now let's do MODELLER
    env = environ()  # create a new MODELLER environment to build this model in
    env.io.atom_files_directory = [os.path.sep, "HistoneDB"]
    a = automodel(env, alnfile='aln.pir', knowns='template', sequence='model')
    a.starting_model = 1
    a.ending_model = 1
    a.rand_method = None
    a.max_sc_mc_distance = 10
    a.max_sc_sc_distance = 10
    a.md_level = None  # We will do MD anyway, and the structures are similar.
    a.make()
def annotate_hist_msa(msa, htype, variant=None):
    """Return a new MSA whose first row is the feature line from features.json.

    The "General<htype>" entry of ``inp_data/features.json`` provides a
    reference sequence and a feature string. The reference is aligned to the
    consensus of ``msa``; its features are then laid out over the columns of
    ``msa`` (gap columns get "-") and prepended as a record with id
    ``gi|features|id``.

    Parameters
    ----------
    msa : alignment object accepted by ``SummaryInfo`` and
        ``MultipleSeqAlignment.extend``
    htype : str
        Histone type; selects the section of features.json.
    variant : optional
        Currently unused.

    Returns
    -------
    MultipleSeqAlignment
        Feature row followed by all records of ``msa``.

    Notes
    -----
    Assumes ``feature1`` holds one character per residue of ``sequence``
    (TODO confirm against features.json).
    """
    # read json
    with open("inp_data/features.json") as ff:
        all_features = json.load(ff)
    general = all_features[htype]["General" + htype]
    genseq = general["sequence"]
    genf = general["feature1"]

    cons = SummaryInfo(msa).dumb_consensus(threshold=0.1, ambiguous="X")
    sr_c = SeqRecord(id="consensus", seq=cons)
    sr_genseq = SeqRecord(id="template", seq=Seq(genseq))
    auxmsa = muscle_aln([sr_c, sr_genseq])
    # The code relies on sort() putting "consensus" before "template"
    # (presumably ordering by record id -- verify against muscle_aln output).
    auxmsa.sort()
    gapped_cons = str(auxmsa[0].seq)
    gapped_template = str(auxmsa[1].seq)

    # Walk both gapped rows together. Every template residue consumes one
    # feature character, including residues that face a consensus gap and are
    # therefore dropped from the output; previously those residues were not
    # counted and all later features were shifted by one position.
    gapped_genf = []
    residue_index = 0
    for cons_char, template_char in zip(gapped_cons, gapped_template):
        if template_char != "-":
            feature_char = genf[residue_index]
            residue_index += 1
        else:
            feature_char = "-"
        # Only columns present in the consensus exist in the original msa.
        if cons_char != "-":
            gapped_genf.append(feature_char)
    gapped_genf = "".join(gapped_genf)

    newmsa = MultipleSeqAlignment([
        SeqRecord(id="gi|features|id", description=htype,
                  seq=Seq(gapped_genf))
    ])
    newmsa.extend(msa)
    return newmsa
def main(): #Getting data hist_df = pd.read_csv('inp_data/seqs.csv') #Histone types info fasta_dict = pickle.load(open("int_data/fasta_dict.p", "rb")) #Sequences #Getting data taxonomic tree into a linked dictionary with open('taxdmp/nodes.dmp', 'r') as f: for line in f: (k1, k2, rank) = line.split('\t|\t')[0:3] child = int(k1) parent = int(k2) if parent == child: continue if parent in tax_dict_pc: tax_dict_pc[parent].append(child) else: tax_dict_pc[parent] = [child] tax_dict_cp[child] = parent tax_ranks_dict[child] = rank #Getting data - taxonomics names df = pd.read_csv('taxdmp/names.dmp', sep='\t\|\t', converters={'type': lambda x: x[0:-2]}, header=None, names=['taxid', 'name', 'name2', 'type']) df = df[df.type == "scientific name"] taxid2names = pd.Series(df.name.values, index=df.taxid).to_dict() #taxid to taxname dict taxid2names = { key: (value.split()[0][0] + '. ' + ' '.join(value.split()[1:2])) for (key, value) in taxid2names.iteritems() } #Here we do filtering to get a set of desired gis # f_hist_df=hist_df[(hist_df['curated']==True) & (hist_df['hist_type']=='H2A')] # f_hist_df=hist_df[(hist_df['hist_type']=='H2A')] f_hist_df = hist_df[(hist_df['hist_var'] == 'canonical_H2B')] #select one variant per taxid f_hist_df = f_hist_df.drop_duplicates(['taxid', 'hist_var']) #overlay filtering by taxids parent_nodes = [1] #taxids of the parent nodes we want to include. taxids = get_tree_nodes(parent_nodes) f_hist_df = f_hist_df[f_hist_df['taxid'].isin(taxids)] seqtaxids = list(f_hist_df['taxid']) print seqtaxids #We do not want subspecies #Generate a tree from available sequence taxids using original taxonomy as a guide #and take only one representative per species print "starting tree pruning" tree_pc, tree_cp = prune_tree(seqtaxids) print(tree_pc) print(tree_cp) #We need for every species only on taxid. 
print "%d sequences filtered" % len(f_hist_df) new_taxids = prune_subspecies(tree_pc, tree_cp, seqtaxids) f_hist_df = f_hist_df[f_hist_df['taxid'].isin(new_taxids)] print "%d sequences filtered" % len(f_hist_df) # exit() #get a list of desired fasta seqs f_fasta_dict = { key: value for (key, value) in fasta_dict.iteritems() if key in list(f_hist_df['gi']) } # print f_hist_df.loc[f_hist_df.gi=='15219078','hist_type'].values[0] #Relabel sequences gi=> type and organism #with gi f_fasta_dict_rel = { key: SeqRecord( id=key, description=f_hist_df.loc[f_hist_df.gi == key, 'hist_var'].values[0] + ' ' + taxid2names[f_hist_df.loc[f_hist_df.gi == key, 'taxid'].values[0]], seq=value.seq) for (key, value) in f_fasta_dict.iteritems() } #with arbitrary index # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) } keys = str() #output taxids for (key, value) in f_fasta_dict.iteritems(): keys = keys + str(f_hist_df.loc[f_hist_df.gi == key, 'taxid'].values[0]) + ',' print keys #output patternmatch H2A.Z # for (key,value) in f_fasta_dict.iteritems(): # if(re.search('R[VI][GSA][ASG]K[SA][AGS]',str(value.seq))): # print "%s,%s"%(str(f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]),'#00ff00') # else: # if(re.search('R[VI][GSA][ASG]G[SA]P',str(value.seq))): # print "%s,%s"%(str(f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]),'#0000ff') # else: # print "%s,%s"%(str(f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]),'#ff0000') #output patternmatch H2B for (key, value) in f_fasta_dict.iteritems(): if (re.search('[^K]$', str(value.seq))): print "%s,%s" % (str(f_hist_df.loc[f_hist_df.gi == key, 'taxid'].values[0]), '#00ff00') else: print "%s,%s" % (str(f_hist_df.loc[f_hist_df.gi == key, 'taxid'].values[0]), '#ff0000') exit() #Here we construct MSA msa = 
muscle_aln(f_fasta_dict_rel.values()) AlignIO.write(msa, "int_data/msa.fasta", "fasta")
def main(): title='' #1. Getting data ######################################################## ######################################################## # df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info #Does not really seem that we need to redefine variants based on best score. df=pd.read_csv('int_data/seqs_rs.csv') #Histone types info fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences #2. Filtering - filter initial dataset by type, variant and other parameters ######################################################## ######################################################## #2.1. Narrow by variant/type ######################################################## title+='H2A' # f_df=df[(df['hist_var']=='canonical_H4')] # f_df['hist_var']='canonical_H4' f_df=df[((df['hist_var']=='canonical_H2A')|(df['hist_var']=='H2A.X'))&(df['partial']==False)&(df['non_st_aa']==False)] # f_df=df[((df['hist_var']=='H2A.Z'))&(df['partial']==False)&(df['non_st_aa']==False)] # f_df=df[(df['hist_type']=='H2A')] print "Number of seqs after narrowing by hist type/var:", len(f_df) #2.2. 
Filter by list of taxonomy clades - restrict sequences to certain taxonomic clades ######################################################### title+=' across cellular organisms' # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates #33682 - euglenozoa #6656 - arthropods # 4751 - fungi #5782 - dictostelium #This is akin manual removal of bad species del_nodes=[5782,5690] print "Selecting taxonomic subset for taxids: ",parent_nodes print "while removing taxonomic subset for taxids: ",del_nodes taxids=set(parent_nodes) for i in parent_nodes: taxids.update(ncbi.get_descendant_taxa(i,intermediate_nodes=True)) for i in del_nodes: taxids=taxids.difference(set([i])) taxids=taxids.difference(set(ncbi.get_descendant_taxa(i,intermediate_nodes=True))) f_df=f_df[f_df['taxid'].isin(taxids)] print "Number of seq after taxonomic subset: ",len(f_df) #2.3.0 Marking number of identical sequence within each species and subspecies. #This will simplify further analysis of sequence filtering on similarity #We know that all refseqs are duplicated for instance. ################################################ ident=dict() new_gis=list() tids=set(list(f_df['taxid'])) for i in tids: # print i.name, i.sci_name temp_df=f_df[(f_df['taxid']==i)] gis=list(temp_df['gi']) #this is to limit exec time # print gis if(len(gis)>1): res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=1.00) ident.update(res) else: ident.update({gis[0]:1}) f_df['ident']=[ident.get(k,1) for k in f_df['gi']] #where ident - number of identical sequnces for current sepecies/subspecies. print "Identity of sequence inside each taxid determined" #2.3.1. 
Calculate number of similar seqs for every seq in tax group ######################################################### # Use powerful method, to get rid of random errors is to identify identical sequences # if a sequence is supported by two or more entires - this is good. # Here we add a degen column to our data set - showing how many similar sequences are found # for a given sequence in its taxonomic clade (genus currently) #We will traverse the species tree by species, genus or family, and determine degeneracy level degen=dict() new_gis=list() tids=list(f_df['taxid']) t = ncbi.get_topology(tids,intermediate_nodes=True) for i in t.search_nodes(rank='family'): # print i.name, i.sci_name nodeset=list() for k in i.traverse(): nodeset.append(int(k.name)) temp_df=f_df[(f_df['taxid'].isin(nodeset))] gis=list(temp_df['gi']) #this is to limit exec time # print gis res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=1.00) degen.update(res) # print degen f_df['degen']=[degen.get(k,1) for k in f_df['gi']] #2.3.2. Remove seqs that do not have support outside their species # if they are not curated or RefSeq NP. ########################################################### f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates f_df=f_df[(f_df['degen']>f_df['ident'])|(f_df['curated']==True)|(f_df['RefSeq']==2)] print "After removing mined seqs with no support in neighboring species: ",len(f_df) #2.3.3. Shuffle sequnces, so that upon further selection, RefSeq and high degeneracy get priority ########################################################### #RefSeq and degenerate sequence get priority # title+=' 1ptax' f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates # print f_df[0:10] # f_df=f_df.drop_duplicates(['taxid','hist_var']) #2.4 Take one best representative per specific taxonomic rank (e.g. 
genus) ############################################################ pruningrank='genus' print "Pruning taxonomy by ", pruningrank title+=' , one seq. per %s'%pruningrank #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies seqtaxids=list(f_df['taxid']) #old list grouped_taxids=group_taxids(seqtaxids,rank=pruningrank) # print seqtaxids # print grouped_taxids #Now we need to take best representative #refseq NP, curated, or the one with largest degeneracy new_gis=list() for tids in grouped_taxids: t_df=f_df[f_df['taxid'].isin(tids)] #try take curated first if(len(t_df[t_df['curated']==True])>0): new_gis.append(t_df.loc[t_df.curated==True,'gi'].values[0]) continue #try take NP records nest #RefSeq 2 means NP, 1 means XP if(len(t_df[t_df['RefSeq']==2])>0): new_gis.append(t_df.loc[t_df.RefSeq==2,'gi'].values[0]) continue # take best degenerate otherwise else: t_df=t_df.sort(['degen','RefSeq'],ascending=False) new_gis.append(t_df['gi'].iloc[0]) f_df=f_df[f_df['gi'].isin(new_gis)] print "After pruning taxonomy we have: ",len(f_df) #2.5. Check seq for sanity - needs to be checked! ############################################## # title+=' seqQC ' # print "Checkig sequence quality" # newgis=list() # for i,row in f_df.iterrows(): # gi=row['gi'] # seq=fasta_dict[str(gi)].seq # hist_type=row['hist_type'] # hist_var=row['hist_var'] # if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)): # newgis.append(gi) # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe # print len(f_df) #3. 
Make a list of seq with good ids and descriptions ############################################## f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])} print len(f_fasta_dict) taxid2name = ncbi.get_taxid_translator(list(f_df['taxid'])) #Relabel sequences gi=> type and organism f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() } #with arbitrary index # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) } # exit() #4. Make MSA ################# #Here we construct MSA msa=muscle_aln(f_fasta_dict.values(),gapopen=float(-20)) AlignIO.write(msa, "int_data/example_msa.fasta", "fasta") msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')]) msa_annot.extend(msa) AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta") for i in range(len(msa)): gi=msa[i].id msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca') msa.sort(key=lambda x: x.description) #5. Visualize MSA############ aln2html(msa,'example_h2a.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A alignment",description=True,field1w=10,field2w=35) #6. Trim alignment - this is optional #6.1. Trim gaps # title+=' gaptrim' # msa_tr=trim_aln_gaps(msa,threshold=0.8) #6.2. Trim to histone core sequence msa_tr=trim_hist_aln_to_core(msa) # msa_tr=msa # print get_hist_ss_in_aln_for_shade(msa_tr,below=True) # exit() #7. 
Vizualize MSA with ete2.########## taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])} gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])} msa_dict={i.id:i.seq for i in msa_tr} t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False) a=t.add_child(name='annotation') a.add_feature('sci_name','annotation') t.sort_descendants(attr='sci_name') ts = TreeStyle() def layout(node): # print node.rank # print node.sci_name if getattr(node, "rank", None): if(node.rank in ['order','class','phylum','kingdom']): rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred") node.add_face(rank_face, column=0, position="branch-top") if node.is_leaf(): sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue") node.add_face(sciname_face, column=0, position="branch-right") if node.is_leaf() and not node.name=='annotation': s=str(msa_dict[str(taxid2gi[int(node.name)])]) seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1) add_face_to_node(seqFace, node, 0, position="aligned") gi=taxid2gi[int(node.name)] add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned") add_face_to_node(TextFace(' '+str(int(node.name))+' '),node,column=2, position = "aligned") add_face_to_node(TextFace(' '+str(gi2variant[gi])+' '),node,column=3, position = "aligned") if node.is_leaf() and node.name=='annotation': s=get_hist_ss_in_aln_as_string(msa_tr) seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1) add_face_to_node(seqFace, node, 0, position="aligned") add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned") add_face_to_node(TextFace(' '+'NCBI_TAXID'+' '),node,column=2, position = "aligned") add_face_to_node(TextFace(' '+'Variant'+' '),node,column=3, position = "aligned") ts.layout_fn = layout ts.show_leaf_name = False ts.title.add_face(TextFace(title, fsize=20), column=0) 
t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts) #10. Conservation############ ############################# features=get_hist_ss_in_aln_for_shade(msa_tr,below=True) cn=add_consensus(msa_tr,threshold=0.5)[-2:-1] # Below are three methods that we find useful. # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation') plot_prof4seq('example_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation') plot_prof4seq('example_cons_ent_unw_norm',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0,norm="T"))),cn,features,axis='conservation') # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation') plot_prof4seq('example_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation') plot_prof4seq('example_cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2,m=0)),cn,features,axis='conservation') plot_prof4seq('example_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation')
def main(): title='' #1. Getting data df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences # exit() #2. Filtering ########## #2.1. Narrow by variant/type title+='CenH3' # f_df=df[(df['hist_var']=='canonical_H4')] # f_df['hist_var']='canonical_H4' f_df=df[((df['hist_var']=='cenH3'))&(df['partial']==False)] # f_df=df[(df['hist_type']=='H2A')] # exit() print len(f_df) #2.2. Filter by list of taxonomy clades ################ title+=' across cellular organisms' # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates #33682 - euglenozoa #6656 - arthropods # 4751 - fungi #5782 - dictostelium print "Selecting taxonomic subset" taxids=list(parent_nodes) for i in parent_nodes: taxids.extend(ncbi.get_descendant_taxa(i,intermediate_nodes=True)) f_df=f_df[f_df['taxid'].isin(taxids)] print len(f_df) # exit() #2.3*. Alternative powerful method, to get rid of random errors in seqs #We need to cluster seqs, and select only if we have support by two or more similar seqs #We will traverse the species tree by species, genus or family, and determine degeneracy level degen=dict() new_gis=list() tids=list(f_df['taxid']) t = ncbi.get_topology(tids,intermediate_nodes=True) for i in t.search_nodes(rank='genus'): # print i.name, i.sci_name nodeset=list() for k in i.traverse(): nodeset.append(int(k.name)) temp_df=f_df[(f_df['taxid'].isin(nodeset))] gis=list(temp_df['gi']) #this is to limit exec time # print gis res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=0.95) degen.update(res) # exit() # for k,v in res.iteritems(): # if v>2.0: # new_gis.append(k) # f_df=f_df[f_df['gi'].isin(new_gis)] print degen f_df['degen']=[degen.get(k,1) for k in f_df['gi']] #2.4. 
#####select one variant per taxid, priority to RefSeq # title+=' 1ptax' f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates print f_df[0:10] # f_df=f_df.drop_duplicates(['taxid','hist_var']) #2.4 Take one best representative per specific taxonomic rank. ################ title+=' , one seq. per genus, trimmed' print "Pruning taxonomy" #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies seqtaxids=list(f_df['taxid']) #old list grouped_taxids=group_taxids(seqtaxids,rank='species') print seqtaxids print grouped_taxids #Now we need to take best representative #refseq NP, or the one with larges degeneracy new_gis=list() for tids in grouped_taxids: t_df=f_df[f_df['taxid'].isin(tids)] #take NP if we have it if(len(t_df[t_df['RefSeq']==2])>0): new_gis.append(t_df.loc[t_df.RefSeq==2,'gi'].values[0]) continue else: # take best degenerate t_df=t_df.sort(['degen','RefSeq'],ascending=False) # so that RefSeq record get priority on removing duplicates if(t_df['degen'].iloc[0]>100): new_gis.append(t_df['gi'].iloc[0]) f_df=f_df[f_df['gi'].isin(new_gis)] # new_seqtaxids=subsample_taxids(seqtaxids,rank='species') #new subsampled list # f_df=f_df[f_df['taxid'].isin(new_seqtaxids)] #remake the dataframe # print "---" # exit() #2.5. Check seq for sanity ################ # title+=' seqQC ' # print "Checkig sequence quality" # newgis=list() # for i,row in f_df.iterrows(): # gi=row['gi'] # seq=fasta_dict[str(gi)].seq # hist_type=row['hist_type'] # hist_var=row['hist_var'] # if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)): # newgis.append(gi) # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe # print len(f_df) #3. 
Make a list of seq with good ids and descriptions #################### f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])} print len(f_fasta_dict) taxid2name = ncbi.get_taxid_translator(list(f_df['taxid'])) #Relabel sequences gi=> type and organism f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() } #with arbitrary index # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) } # exit() #4. Make MSA ################# #Here we construct MSA msa=muscle_aln(f_fasta_dict.values(),gapopen=float(-20)) AlignIO.write(msa, "int_data/example_msa.fasta", "fasta") msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')]) msa_annot.extend(msa) AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta") for i in range(len(msa)): gi=msa[i].id msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca') msa.sort(key=lambda x: x.description) #5. Visualize MSA aln2html(msa,'example_h2a.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A alignment",description=True,field1w=10,field2w=35) #6. Trim alignment - this is optional #6.1. Trim gaps # title+=' gaptrim' # msa_tr=trim_aln_gaps(msa,threshold=0.8) #6.2. Trim to histone core sequence msa_tr=trim_hist_aln_to_core(msa) # msa_tr=msa # print get_hist_ss_in_aln_for_shade(msa_tr,below=True) # exit() #7. Vizualize MSA with ete2. 
taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])} gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])} msa_dict={i.id:i.seq for i in msa_tr} print taxid2gi t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False) a=t.add_child(name='annotation') a.add_feature('sci_name','annotation') t.sort_descendants(attr='sci_name') ts = TreeStyle() def layout(node): # print node.rank # print node.sci_name if getattr(node, "rank", None): if(node.rank in ['order','class','phylum','kingdom']): rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred") node.add_face(rank_face, column=0, position="branch-top") if node.is_leaf(): sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue") node.add_face(sciname_face, column=0, position="branch-right") if node.is_leaf() and not node.name=='annotation': s=str(msa_dict[str(taxid2gi[int(node.name)])]) seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1) add_face_to_node(seqFace, node, 0, position="aligned") gi=taxid2gi[int(node.name)] add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned") add_face_to_node(TextFace(' '+str(int(node.name))+' '),node,column=2, position = "aligned") add_face_to_node(TextFace(' '+str(gi2variant[gi])+' '),node,column=3, position = "aligned") if node.is_leaf() and node.name=='annotation': s=get_hist_ss_in_aln_as_string(msa_tr) seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1) add_face_to_node(seqFace, node, 0, position="aligned") add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned") add_face_to_node(TextFace(' '+'NCBI_TAXID'+' '),node,column=2, position = "aligned") add_face_to_node(TextFace(' '+'Variant'+' '),node,column=3, position = "aligned") ts.layout_fn = layout ts.show_leaf_name = False ts.title.add_face(TextFace(title, fsize=20), column=0) t.render("example_motifs_H2A.svg", w=6000, 
dpi=300, tree_style=ts) #10. Conservation features=get_hist_ss_in_aln_for_shade(msa_tr,below=True) cn=add_consensus(msa_tr,threshold=0.5)[-2:-1] # Below are three methods that we find useful. # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation') plot_prof4seq('example_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation') # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation') plot_prof4seq('example_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation') plot_prof4seq('example_cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2,m=0)),cn,features,axis='conservation') plot_prof4seq('example_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation')
def main(): title='' #1. Getting data df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences # exit() #2. Filtering ########## #2.1. Narrow by variant/type title+='Canonical H2A' # f_df=df[(df['hist_var']=='canonical_H4')] # f_df['hist_var']='canonical_H4' f_df=df[(df['hist_var']=='canonical_H2A')|(df['hist_var']=='H2A.1')] # f_df=df[(df['hist_type']=='H2A')] # exit() print len(f_df) #2.2. #####select one variant per taxid # title+=' 1ptax' f_df=f_df.sort(['RefSeq'],ascending=False) # so that RefSeq record get priority on removing duplicates f_df=f_df.drop_duplicates(['taxid','hist_var']) # exit() #2.3. Filter by list of taxonomy clades ################ title+=' across cellular organisms' # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates #33682 - euglenozoa #6656 - arthropods # 4751 - fungi print "Selecting taxonomic subset" taxids=list(parent_nodes) for i in parent_nodes: taxids.extend(ncbi.get_descendant_taxa(i,intermediate_nodes=True)) f_df=f_df[f_df['taxid'].isin(taxids)] print len(f_df) # exit() #2.4 Take one representative per specific taxonomic rank. ################ title+=', one sequence per order' print "Pruning taxonomy" #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies seqtaxids=list(f_df['taxid']) #old list new_seqtaxids=subsample_taxids(seqtaxids,rank='order') #new subsampled list f_df=f_df[f_df['taxid'].isin(new_seqtaxids)] #remake the dataframe # print "---" print len(f_df) # exit() #2.5. 
Check seq for sanity ################ # title+=' seqQC ' print "Checkig sequence quality" newgis=list() for i,row in f_df.iterrows(): gi=row['gi'] seq=fasta_dict[str(gi)].seq hist_type=row['hist_type'] hist_var=row['hist_var'] if(check_hist_length(seq,hist_type,hist_var,1)&check_hist_core_length(seq,hist_type,1)): newgis.append(gi) f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe print len(f_df) # print list(f_df['gi']) # exit() #3. Make a list of seq with good ids and descriptions #################### f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])} print len(f_fasta_dict) taxid2name = ncbi.get_taxid_translator(list(f_df['taxid'])) #Relabel sequences gi=> type and organism f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() } #with arbitrary index # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) } # exit() #4. Make MSA ################# #Here we construct MSA msa=muscle_aln(f_fasta_dict.values()) AlignIO.write(msa, "results/h2a_ca_cellular.fasta", "fasta") msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')]) msa_annot.extend(msa) AlignIO.write(msa_annot, "results/h2a_ca_cellular_annot.fasta", "fasta") for i in range(len(msa)): gi=msa[i].id msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca') msa.sort(key=lambda x: x.description) #5. Visualize MSA aln2html(msa,'results/h2a_ca_cellular.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A in cellular organisms",description=True,field1w=10,field2w=35) #6. 
Trim alignment - this is optional #6.1. Trim gaps title+=', gaps removed' # msa_tr=trim_aln_gaps(msa,threshold=0.8) #6.2. Trim to histone core sequence msa_tr=trim_hist_aln_to_core(msa) #7. Vizualize MSA with ete2. taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])} gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])} msa_dict={i.id:i.seq for i in msa_tr} print taxid2gi t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False) a=t.add_child(name='annotation') a.add_feature('sci_name','annotation') t.sort_descendants(attr='sci_name') ts = TreeStyle() def layout(node): # print node.rank # print node.sci_name if getattr(node, "rank", None): if(node.rank in ['order','class','phylum','kingdom']): rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred") node.add_face(rank_face, column=0, position="branch-top") if node.is_leaf(): sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue") node.add_face(sciname_face, column=0, position="branch-right") if node.is_leaf() and not node.name=='annotation': s=str(msa_dict[str(taxid2gi[int(node.name)])]) seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1) add_face_to_node(seqFace, node, 0, position="aligned") gi=taxid2gi[int(node.name)] add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned") add_face_to_node(TextFace(' '+str(int(node.name))+' '),node,column=2, position = "aligned") add_face_to_node(TextFace(' '+str(gi2variant[gi])+' '),node,column=3, position = "aligned") if node.is_leaf() and node.name=='annotation': s=get_hist_ss_in_aln_as_string(msa_tr) seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1) add_face_to_node(seqFace, node, 0, position="aligned") add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned") add_face_to_node(TextFace(' '+'NCBI_TAXID'+' '),node,column=2, position = "aligned") 
add_face_to_node(TextFace(' '+'Variant'+' '),node,column=3, position = "aligned") ts.layout_fn = layout ts.show_leaf_name = False ts.title.add_face(TextFace(title, fsize=20), column=0) t.render("results/h2a_ca_cellular.svg", w=6000, dpi=300, tree_style=ts) #10. Conservation features=get_hist_ss_in_aln_for_shade(msa_tr,below=True) cn=add_consensus(msa_tr,threshold=0.5)[-2:-1] # Below are three methods that we find useful. # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation') plot_prof4seq('results/h2a_ca_cellular_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms') # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation') plot_prof4seq('results/h2a_ca_cellular_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms') plot_prof4seq('results/h2a_ca_cellular_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms')
# Disabled: extraction of the yeast chain sequences from the 1id3 structure.
# s = p.get_structure('1id3', '1id3.pdb')
# ppb=PPBuilder()
# seqs_yeast=dict()
# for i in ['A','B','C','D','E','F','G','H']:
#     seqs_yeast[i]=ppb.build_peptides(s[0][i])[0].get_sequence()
# print nucl_yeast

# Biopython aligns the sequences and prepares them for PIR format.
# Force gaps to be with high penalty.
seq_aln = {}
for i in ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'):
    # Pair the template (xen) chain with the target (yeast) chain.
    seqlist = []
    seqlist.append(SeqRecord(seqs_xen[i], id='xen'))
    seqlist.append(SeqRecord(seqs_yeast[i], id='yeast'))
    aln_pir = L_fasta2pir.aln(muscle_aln(seqlist))
    # The template row spans the whole chain i of the structure.
    aln_pir.add_pir_info('xen', 'structureX', 'xen_nucl',
                         'FIRST', i, 'LAST', i)
    aln_pir.add_pir_info('yeast', 'sequence', 'nucl_yeast')
    seq_aln[i] = aln_pir

# Merge the per-chain alignments in chain order A..H and write the PIR file.
mult_aln = L_fasta2pir.aln_mult_chains(
    [seq_aln[i] for i in ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H')])
mult_aln.write('aln.pir')

#####
# Now let's do MODELLER
# create a new MODELLER environment to build this model in
env = environ()
# change to folder with data files
env.io.atom_files_directory = ['.', 'data']
def _report(*parts):
    """Print *parts* separated by single spaces.

    Produces the same text as a Python 2 ``print a, b`` statement while
    remaining valid syntax on Python 3.
    """
    print(" ".join(str(part) for part in parts))


def _sort_descending(frame, columns):
    """Return *frame* sorted by *columns* in descending order.

    ``DataFrame.sort`` was replaced by ``DataFrame.sort_values``; prefer the
    newer method when the installed pandas provides it and fall back to the
    old one otherwise.
    """
    if hasattr(frame, 'sort_values'):
        return frame.sort_values(columns, ascending=False)
    return frame.sort(columns, ascending=False)


def main():
    """Build, visualise and score an alignment of canonical H2A / H2A.X.

    Steps performed:

    1. Load the histone table (``int_data/seqs_rs.csv``) and the pickled
       sequence dictionary (``int_data/fasta_dict.p``).
    2. Filter by variant, taxonomy, sequence support and keep one
       representative per genus.
    3. Relabel the surviving sequences, align them with ``muscle_aln`` and
       write ``int_data/example_msa.fasta`` and
       ``int_data/example_msa_annot.fasta``.
    4. Render ``example_h2a.html`` and the tree figure
       ``example_motifs_H2A.svg``.
    5. Plot conservation profiles via ``plot_prof4seq`` (``example_cons_*``).

    The function takes no arguments and returns nothing; all results are
    written to files or printed.
    """
    title = ''

    #1. Getting data
    ########################################################
    # df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info
    # Does not really seem that we need to redefine variants based on best score.
    df = pd.read_csv('int_data/seqs_rs.csv')  # Histone types info
    # NOTE(review): pickle must only be used on trusted files; this one is an
    # intermediate file presumably produced by this project - confirm.
    with open("int_data/fasta_dict.p", "rb") as fasta_file:
        fasta_dict = pickle.load(fasta_file)  # Sequences

    #2. Filtering - filter initial dataset by type, variant and other parameters
    ########################################################
    #2.1. Narrow by variant/type
    ########################################################
    title += 'H2A'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    is_h2a = (df['hist_var'] == 'canonical_H2A') | (df['hist_var'] == 'H2A.X')
    f_df = df[is_h2a & (df['partial'] == False) & (df['non_st_aa'] == False)]
    # f_df=df[((df['hist_var']=='H2A.Z'))&(df['partial']==False)&(df['non_st_aa']==False)]
    # f_df=df[(df['hist_type']=='H2A')]
    _report("Number of seqs after narrowing by hist type/var:", len(f_df))

    #2.2. Filter by list of taxonomy clades - restrict sequences to certain taxonomic clades
    #########################################################
    title += ' across cellular organisms'
    # parent_nodes=[9443]
    # 131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    # 33682 - euglenozoa, 6656 - arthropods, 4751 - fungi, 5782 - dictostelium
    parent_nodes = [131567]
    # This is akin to manual removal of bad species
    del_nodes = [5782, 5690]
    _report("Selecting taxonomic subset for taxids: ", parent_nodes)
    _report("while removing taxonomic subset for taxids: ", del_nodes)
    taxids = set(parent_nodes)
    for parent in parent_nodes:
        taxids.update(ncbi.get_descendant_taxa(parent, intermediate_nodes=True))
    for removed in del_nodes:
        taxids.discard(removed)
        taxids.difference_update(
            ncbi.get_descendant_taxa(removed, intermediate_nodes=True))
    # .copy() so that the column assignments below act on an independent frame
    # rather than on a filtered view of df (avoids SettingWithCopyWarning).
    f_df = f_df[f_df['taxid'].isin(taxids)].copy()
    _report("Number of seq after taxonomic subset: ", len(f_df))

    #2.3.0 Marking number of identical sequences within each species and subspecies.
    # This will simplify further analysis of sequence filtering on similarity.
    # We know that all refseqs are duplicated for instance.
    ################################################
    ident = dict()
    for taxid in set(f_df['taxid']):
        gis = list(f_df.loc[f_df['taxid'] == taxid, 'gi'])
        if len(gis) > 1:
            # Clustering is only run when there is more than one sequence;
            # this is to limit execution time.
            ident.update(cluster_seq_support(
                {gi: fasta_dict[str(gi)] for gi in gis}, ident_thresh=1.00))
        else:
            ident[gis[0]] = 1
    # ident - number of identical sequences for the current species/subspecies.
    f_df['ident'] = [ident.get(gi, 1) for gi in f_df['gi']]
    _report("Identity of sequence inside each taxid determined")

    #2.3.1. Calculate number of similar seqs for every seq in tax group
    #########################################################
    # A powerful way to get rid of random errors is to identify identical
    # sequences: if a sequence is supported by two or more entries, this is good.
    # Here we add a degen column to our data set, showing how many similar
    # sequences are found for a given sequence in its taxonomic clade.
    # The species tree is traversed per node of rank 'family'.
    degen = dict()
    t = ncbi.get_topology(list(f_df['taxid']), intermediate_nodes=True)
    for family in t.search_nodes(rank='family'):
        nodeset = [int(member.name) for member in family.traverse()]
        gis = list(f_df.loc[f_df['taxid'].isin(nodeset), 'gi'])
        degen.update(cluster_seq_support(
            {gi: fasta_dict[str(gi)] for gi in gis}, ident_thresh=1.00))
    f_df['degen'] = [degen.get(gi, 1) for gi in f_df['gi']]

    #2.3.2. Remove seqs that do not have support outside their species
    # if they are not curated or RefSeq NP.
    ###########################################################
    # Sorted so that RefSeq records get priority on removing duplicates.
    f_df = _sort_descending(f_df, ['RefSeq', 'degen'])
    f_df = f_df[(f_df['degen'] > f_df['ident'])
                | (f_df['curated'] == True)
                | (f_df['RefSeq'] == 2)]
    _report("After removing mined seqs with no support in neighboring species: ",
            len(f_df))

    #2.3.3. Shuffle sequences, so that upon further selection,
    # RefSeq and high degeneracy get priority
    ###########################################################
    # title+=' 1ptax'
    f_df = _sort_descending(f_df, ['RefSeq', 'degen'])
    # f_df=f_df.drop_duplicates(['taxid','hist_var'])

    #2.4 Take one best representative per specific taxonomic rank (e.g. genus)
    ############################################################
    pruningrank = 'genus'
    _report("Pruning taxonomy by ", pruningrank)
    title += ' , one seq. per %s' % pruningrank
    # Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-
    # family-subfamily-genus-species-subspecies
    grouped_taxids = group_taxids(list(f_df['taxid']), rank=pruningrank)
    # Preference order inside a group: curated, then RefSeq NP, then the record
    # with the largest degeneracy.
    new_gis = list()
    for tids in grouped_taxids:
        t_df = f_df[f_df['taxid'].isin(tids)]
        curated_gis = t_df.loc[t_df.curated == True, 'gi'].values
        if len(curated_gis) > 0:
            new_gis.append(curated_gis[0])
            continue
        # RefSeq 2 means NP, 1 means XP
        refseq_np_gis = t_df.loc[t_df.RefSeq == 2, 'gi'].values
        if len(refseq_np_gis) > 0:
            new_gis.append(refseq_np_gis[0])
            continue
        best_first = _sort_descending(t_df, ['degen', 'RefSeq'])
        new_gis.append(best_first['gi'].iloc[0])
    f_df = f_df[f_df['gi'].isin(new_gis)]
    _report("After pruning taxonomy we have: ", len(f_df))

    #2.5. Check seq for sanity - needs to be checked! (disabled)
    ##############################################
    # title+=' seqQC '
    # print "Checkig sequence quality"
    # newgis=list()
    # for i,row in f_df.iterrows():
    #     gi=row['gi']
    #     seq=fasta_dict[str(gi)].seq
    #     hist_type=row['hist_type']
    #     hist_var=row['hist_var']
    #     if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)):
    #         newgis.append(gi)
    # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    # print len(f_df)

    #3. Make a list of seq with good ids and descriptions
    ##############################################
    # Build the lookup set once instead of re-creating a list for every key.
    kept_gis = set(f_df['gi'])
    f_fasta_dict = {
        key: value
        for (key, value) in fasta_dict.iteritems() if int(key) in kept_gis
    }
    _report(len(f_fasta_dict))
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    # Relabel sequences: gi => histone variant and organism name
    f_fasta_dict = {
        key: SeqRecord(
            id=key,
            description=f_df.loc[f_df.gi == int(key), 'hist_var'].values[0]
            + ' '
            + taxid2name[f_df.loc[f_df.gi == int(key), 'taxid'].values[0]],
            seq=value.seq)
        for (key, value) in f_fasta_dict.iteritems()
    }
    # with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }

    #4. Make MSA
    #################
    msa = muscle_aln(f_fasta_dict.values(), gapopen=float(-20))
    AlignIO.write(msa, "int_data/example_msa.fasta", "fasta")
    # Second copy of the alignment with an extra first row named 'annotation',
    # built from get_hist_ss_in_aln_as_string with blanks turned into gaps.
    annotation_row = ''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ', '-')
    msa_annot = MultipleSeqAlignment(
        [SeqRecord(Seq(annotation_row), id='annotation', name='')])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta")

    for record in msa:
        record.description = f_fasta_dict[record.id].description.replace(
            'canonical', 'ca')
    msa.sort(key=lambda rec: rec.description)

    #5. Visualize MSA
    aln2html(msa,
             'example_h2a.html',
             features=get_hist_ss_in_aln_for_html(msa, 'H2A', 0),
             title="canonical H2A alignment",
             description=True,
             field1w=10,
             field2w=35)

    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    # title+=' gaptrim'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)
    #6.2. Trim to histone core sequence
    msa_tr = trim_hist_aln_to_core(msa)
    # msa_tr=msa

    #7. Visualize MSA with ete2.
    taxid2gi = {
        f_df.loc[f_df.gi == int(gi), 'taxid'].values[0]: gi
        for gi in f_df['gi']
    }
    gi2variant = {
        gi: f_df.loc[f_df.gi == int(gi), 'hist_var'].values[0]
        for gi in f_df['gi']
    }
    msa_dict = {record.id: record.seq for record in msa_tr}
    t = ncbi.get_topology(list(f_df['taxid']), intermediate_nodes=False)
    # Extra leaf that carries the annotation string and the column headers.
    a = t.add_child(name='annotation')
    a.add_feature('sci_name', 'annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()

    def layout(node):
        """Attach name, sequence and text faces to *node* for rendering."""
        if getattr(node, "rank", None):
            if node.rank in ['order', 'class', 'phylum', 'kingdom']:
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if not node.is_leaf():
            return
        sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
        node.add_face(sciname_face, column=0, position="branch-right")
        if node.name == 'annotation':
            # Header row: annotation string plus column titles.
            s = get_hist_ss_in_aln_as_string(msa_tr)
            labels = ['NCBI_GI', 'NCBI_TAXID', 'Variant']
        else:
            # Data row: leaf names are taxids; map them back to gi and sequence.
            gi = taxid2gi[int(node.name)]
            s = str(msa_dict[str(gi)])
            labels = [str(gi), str(int(node.name)), str(gi2variant[gi])]
        seqFace = SeqMotifFace(
            s, [[0, len(s), "seq", 10, 10, None, None, None]], scale_factor=1)
        add_face_to_node(seqFace, node, 0, position="aligned")
        for column, label in enumerate(labels, 1):
            add_face_to_node(TextFace(' ' + label + ' '),
                             node,
                             column=column,
                             position="aligned")

    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts)

    #10. Conservation
    #############################
    features = get_hist_ss_in_aln_for_shade(msa_tr, below=True)
    cn = add_consensus(msa_tr, threshold=0.5)[-2:-1]
    # Below are the methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_ent_unw',
                  [log(20) + float(x) for x in cons_prof(msa_tr, f=0, c=0)],
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_ent_unw_norm',
                  [log(20) + float(x)
                   for x in cons_prof(msa_tr, f=0, c=0, norm="T")],
                  cn,
                  features,
                  axis='conservation')
    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw_renorm1',
                  [float(x) for x in cons_prof(msa_tr, f=0, c=2, m=1)],
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_sofp_unw',
                  [float(x) for x in cons_prof(msa_tr, f=0, c=2, m=0)],
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_sofp_psic_renorm1',
                  [float(x) for x in cons_prof(msa_tr, f=2, c=2, m=1)],
                  cn,
                  features,
                  axis='conservation')
# Chains D and H are assigned the same hard-coded sequence.
for h2b_chain in ('D', 'H'):
    seqs_yeast[h2b_chain] = Seq(
        'RKETYSSYIYKVLKQTHPDTGISQKSMSILNSFVNDIFERIATEASKLAAYNKKSTISAREIQTAVRLILPGELAKHAVSEGTRAVTKYSSSTQA')

# Disabled alternative: pull the yeast chain sequences out of the 1id3 PDB
# structure instead of using the hard-coded sequences.
# s = p.get_structure('1id3', '1id3.pdb')
# ppb=PPBuilder()
# seqs_yeast=dict()
# for i in ['A','B','C','D','E','F','G','H']:
#     seqs_yeast[i]=ppb.build_peptides(s[0][i])[0].get_sequence()
# print nucl_yeast

# Pairwise-align every chain (A-H) of the 'xen' template against the matching
# 'yeast' target chain and annotate each alignment for PIR output.
# Original note: gaps should be forced to carry a high penalty.
seq_aln = {}
for i in 'ABCDEFGH':
    template_record = SeqRecord(seqs_xen[i], id='xen')
    target_record = SeqRecord(seqs_yeast[i], id='yeast')
    aln_pir = L_fasta2pir.aln(muscle_aln([template_record, target_record]))
    # The template entry points at chain i of the 'xen_nucl' structure.
    aln_pir.add_pir_info('xen', 'structureX', 'xen_nucl', 'FIRST', i, 'LAST', i)
    aln_pir.add_pir_info('yeast', 'sequence', 'nucl_yeast')
    seq_aln[i] = aln_pir

# Join the per-chain alignments in chain order A..H and write the PIR file.
mult_aln = L_fasta2pir.aln_mult_chains([seq_aln[i] for i in 'ABCDEFGH'])
mult_aln.write('aln.pir')

#####
# Now let's do MODELLER
env = environ()  # create a new MODELLER environment to build this model in
# change to folder with data files
env.io.atom_files_directory = ['.', 'data']
log.verbose()  # request verbose output

# directories for input atom files
# Keep non-curated canonical H2B records, one row per (taxid, hist_var) pair,
# limited to the first 200 rows.
f_hist_df = hist_df[(hist_df["hist_var"] == "canonical_H2B")
                    & (hist_df["curated"] == False)]
f_hist_df = f_hist_df.drop_duplicates(["taxid", "hist_var"])[0:200]

# get fasta dict; the gi lookup set is built once rather than re-creating a
# list for every key of fasta_dict.
selected_gis = set(f_hist_df["gi"])
f_fasta_dict = {
    key: value
    for (key, value) in fasta_dict.iteritems() if key in selected_gis
}
# relabel with arbitrary index
f_fasta_dict_rel = {
    key: SeqRecord(id=str(index), seq=f_fasta_dict[key].seq)
    for (index, key) in enumerate(f_fasta_dict)
}
print(len(f_fasta_dict))

# 2. Make MSA using my function
#################
# muscle_aln takes a list of sequence records!
# msa=muscle_aln(f_fasta_dict_rel.values())  #ACTIVATE FOR TEX
msa = muscle_aln(f_fasta_dict.values())
AlignIO.write(msa, "int_data/msa.fasta", "fasta")

# 3. Get an annotated PDF of histone alignment using TEXSHADE - old way
##############
# get_pdf(hist_name,align,title,shading_modes=['similar'],logo=False,hideseqs=False,splitN=20,setends=[],ruler=False):
# The sequence names should be unique and without '|'
if 0:  # deliberately disabled; change to 1 to produce the PDF
    get_pdf("H2B", msa, "H2B aln", logo=True, ruler=True)

# 4. Output to html
aln2html(msa,
         "int_data/h2b.html",
         features=get_hist_ss_in_aln_for_html(msa, "H2B", 1))

# 5. TEST IT: Annotate our MSA using features.json - new experimental way
#################