예제 #1
0
def main():
    title=''
    #1. Getting data
    ########################################################
    ########################################################
    # df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info #Does not really seem that we need to redefine variants based on best score.
    df=pd.read_csv('int_data/seqs_rs.csv') #Histone types info
    fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences
    
    #2. Filtering - filter initial dataset by type, variant and other parameters
    ########################################################
    ########################################################

    #2.1. Narrow by variant/type
    ########################################################
    title+='H2A'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    f_df=df[((df['hist_var']=='canonical_H2A')|(df['hist_var']=='H2A.X'))&(df['partial']==False)&(df['non_st_aa']==False)]
    # f_df=df[((df['hist_var']=='H2A.Z'))&(df['partial']==False)&(df['non_st_aa']==False)]

    # f_df=df[(df['hist_type']=='H2A')]

    print "Number of seqs after narrowing by hist type/var:", len(f_df)
    

    #2.2. Filter by list of taxonomy clades - restrict sequences to certain taxonomic clades
    #########################################################
    title+=' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    #5782 - dictostelium
    #This is akin manual removal of bad species
    del_nodes=[5782,5690]

    print "Selecting taxonomic subset for taxids: ",parent_nodes
    print "while removing taxonomic subset for taxids: ",del_nodes

    taxids=set(parent_nodes)
    for i in parent_nodes:
        taxids.update(ncbi.get_descendant_taxa(i,intermediate_nodes=True))
    for i in del_nodes:
        taxids=taxids.difference(set([i]))
        taxids=taxids.difference(set(ncbi.get_descendant_taxa(i,intermediate_nodes=True)))

    f_df=f_df[f_df['taxid'].isin(taxids)]
    print "Number of seq after taxonomic subset: ",len(f_df)
    


    #2.3.0 Marking number of identical sequence within each species and subspecies.
    #This will simplify further analysis of sequence filtering on similarity
    #We know that all refseqs are duplicated for instance.
    ################################################
    ident=dict()
    new_gis=list()
    tids=set(list(f_df['taxid']))
    for i in tids:
        # print i.name, i.sci_name
        temp_df=f_df[(f_df['taxid']==i)]
        gis=list(temp_df['gi']) #this is to limit exec time
        # print gis
        if(len(gis)>1):
            res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=1.00)
            ident.update(res)
        else:
            ident.update({gis[0]:1})

    f_df['ident']=[ident.get(k,1) for k in f_df['gi']]
    #where ident - number of identical sequnces for current sepecies/subspecies.
    print "Identity of sequence inside each taxid determined"

    #2.3.1. Calculate number of similar seqs for every seq in tax group
    #########################################################
    # Use powerful method, to get rid of random errors is to identify identical sequences
    # if a sequence is supported by two or more entires - this is good.
    # Here we add a degen column to our data set - showing how many similar sequences are found
    # for a given sequence in its taxonomic clade (genus currently) 

    #We will traverse the species tree by species, genus or family, and determine degeneracy level
    degen=dict()
    new_gis=list()
    tids=list(f_df['taxid']) 
    t = ncbi.get_topology(tids,intermediate_nodes=True)
    for i in t.search_nodes(rank='family'):
        # print i.name, i.sci_name
        nodeset=list()
        for k in i.traverse():
            nodeset.append(int(k.name))
        temp_df=f_df[(f_df['taxid'].isin(nodeset))]
        gis=list(temp_df['gi']) #this is to limit exec time
        # print gis
        res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=1.00)
        degen.update(res)

    # print degen
    f_df['degen']=[degen.get(k,1) for k in f_df['gi']]

    #2.3.2. Remove seqs that do not have support outside their species
    # if they are not curated or RefSeq NP.
    ###########################################################

    f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates
    f_df=f_df[(f_df['degen']>f_df['ident'])|(f_df['curated']==True)|(f_df['RefSeq']==2)]
    print "After removing mined seqs with no support in neighboring species: ",len(f_df)

    #2.3.3. Shuffle sequnces, so that upon further selection, RefSeq and high degeneracy get priority
    ###########################################################
    #RefSeq and degenerate sequence get priority
    # title+=' 1ptax'
    f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates
    # print f_df[0:10]
    # f_df=f_df.drop_duplicates(['taxid','hist_var'])


    #2.4 Take one best representative per specific taxonomic rank (e.g. genus)
    ############################################################
    pruningrank='genus'
    print "Pruning taxonomy by ", pruningrank
    
    title+=' , one seq. per %s'%pruningrank
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids=list(f_df['taxid']) #old list
    grouped_taxids=group_taxids(seqtaxids,rank=pruningrank)
    # print seqtaxids
    # print grouped_taxids
    #Now we need to take best representative
    #refseq NP, curated, or the one with largest degeneracy
    new_gis=list()
    for tids in grouped_taxids:
        t_df=f_df[f_df['taxid'].isin(tids)]
        #try take curated first
        if(len(t_df[t_df['curated']==True])>0):
            new_gis.append(t_df.loc[t_df.curated==True,'gi'].values[0])
            continue
        #try take NP records nest
        #RefSeq 2 means NP, 1 means XP
        if(len(t_df[t_df['RefSeq']==2])>0):
            new_gis.append(t_df.loc[t_df.RefSeq==2,'gi'].values[0])
            continue
        # take best degenerate otherwise
        else:
            t_df=t_df.sort(['degen','RefSeq'],ascending=False) 
            new_gis.append(t_df['gi'].iloc[0])

    f_df=f_df[f_df['gi'].isin(new_gis)]

    print "After pruning taxonomy we have: ",len(f_df)


    #2.5. Check seq for sanity - needs to be checked!
    ##############################################
    # title+=' seqQC '

    # print "Checkig sequence quality"
    # newgis=list()
    # for i,row in f_df.iterrows():
    #     gi=row['gi']
    #     seq=fasta_dict[str(gi)].seq
    #     hist_type=row['hist_type']
    #     hist_var=row['hist_var']
    #     if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)):
    #         newgis.append(gi)
    # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    # print len(f_df)

    #3. Make a list of seq with good ids and descriptions
    ##############################################

    f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])}
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa=muscle_aln(f_fasta_dict.values(),gapopen=float(-20))
    AlignIO.write(msa, "int_data/example_msa.fasta", "fasta")

    msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta")

    for i in range(len(msa)):
        gi=msa[i].id
        msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca')
    msa.sort(key=lambda x: x.description)


    #5. Visualize MSA############
    aln2html(msa,'example_h2a.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A alignment",description=True,field1w=10,field2w=35)

    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    # title+=' gaptrim'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr=trim_hist_aln_to_core(msa)
    # msa_tr=msa
    # print get_hist_ss_in_aln_for_shade(msa_tr,below=True)

    # exit()

    #7. Vizualize MSA with ete2.##########
    taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])}
    gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])}

    msa_dict={i.id:i.seq for i in msa_tr}
    t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False)
    a=t.add_child(name='annotation')
    a.add_feature('sci_name','annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()
    def layout(node):
        # print node.rank
        # print node.sci_name
        if getattr(node, "rank", None):
            if(node.rank in ['order','class','phylum','kingdom']):   
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        if node.is_leaf() and not node.name=='annotation':
            s=str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi=taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('      '+str(int(node.name))+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('      '+str(gi2variant[gi])+' '),node,column=3, position = "aligned")

        if node.is_leaf() and node.name=='annotation':
            s=get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('       '+'NCBI_TAXID'+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('       '+'Variant'+'       '),node,column=3, position = "aligned")



    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts)

    #10. Conservation############
    #############################
    features=get_hist_ss_in_aln_for_shade(msa_tr,below=True)
    cn=add_consensus(msa_tr,threshold=0.5)[-2:-1]
    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation')
    plot_prof4seq('example_cons_ent_unw_norm',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0,norm="T"))),cn,features,axis='conservation')
    
    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2,m=0)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation')
def main():
    title=''
    #1. Getting data
    df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info
    fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences
    # exit()
    
    #2. Filtering
    ##########
    #2.1. Narrow by variant/type
    title+='CenH3'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    f_df=df[((df['hist_var']=='cenH3'))&(df['partial']==False)]
    # f_df=df[(df['hist_type']=='H2A')]
    # exit()

    print len(f_df)
    


    #2.2. Filter by list of taxonomy clades   
    ################
    title+=' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    #5782 - dictostelium
    print "Selecting taxonomic subset"
    taxids=list(parent_nodes)
    for i in parent_nodes:
        taxids.extend(ncbi.get_descendant_taxa(i,intermediate_nodes=True))
    f_df=f_df[f_df['taxid'].isin(taxids)]
    print len(f_df)
    # exit()
    
    #2.3*. Alternative powerful method, to get rid of random errors in seqs
    #We need to cluster seqs, and select only if we have support by two or more similar seqs 

    #We will traverse the species tree by species, genus or family, and determine degeneracy level
    degen=dict()
    new_gis=list()
    tids=list(f_df['taxid']) 
    t = ncbi.get_topology(tids,intermediate_nodes=True)
    for i in t.search_nodes(rank='genus'):
        # print i.name, i.sci_name
        nodeset=list()
        for k in i.traverse():
            nodeset.append(int(k.name))
        temp_df=f_df[(f_df['taxid'].isin(nodeset))]
        gis=list(temp_df['gi']) #this is to limit exec time
        # print gis
        res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=0.95)
        degen.update(res)
        # exit()
        # for k,v in res.iteritems():
            # if v>2.0:
                # new_gis.append(k)
    # f_df=f_df[f_df['gi'].isin(new_gis)]
    print degen
    f_df['degen']=[degen.get(k,1) for k in f_df['gi']]

    #2.4. #####select one variant per taxid, priority to RefSeq
    # title+=' 1ptax'
    f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates
    print f_df[0:10]
    # f_df=f_df.drop_duplicates(['taxid','hist_var'])


    #2.4 Take one best representative per specific taxonomic rank.
    ################
    title+=' , one seq. per genus, trimmed'
    print "Pruning taxonomy"
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids=list(f_df['taxid']) #old list
    grouped_taxids=group_taxids(seqtaxids,rank='species')
    print seqtaxids
    print grouped_taxids
    #Now we need to take best representative
    #refseq NP, or the one with larges degeneracy
    new_gis=list()
    for tids in grouped_taxids:
        t_df=f_df[f_df['taxid'].isin(tids)]
        #take NP if we have it
        if(len(t_df[t_df['RefSeq']==2])>0):
            new_gis.append(t_df.loc[t_df.RefSeq==2,'gi'].values[0])
            continue
        else: # take best degenerate
            t_df=t_df.sort(['degen','RefSeq'],ascending=False) # so that RefSeq record get priority on removing duplicates
            if(t_df['degen'].iloc[0]>100):
                new_gis.append(t_df['gi'].iloc[0])

    f_df=f_df[f_df['gi'].isin(new_gis)]

    # new_seqtaxids=subsample_taxids(seqtaxids,rank='species') #new subsampled list
    # f_df=f_df[f_df['taxid'].isin(new_seqtaxids)] #remake the dataframe
    # print "---"

    # exit()


    #2.5. Check seq for sanity
    ################
    # title+=' seqQC '

    # print "Checkig sequence quality"
    # newgis=list()
    # for i,row in f_df.iterrows():
    #     gi=row['gi']
    #     seq=fasta_dict[str(gi)].seq
    #     hist_type=row['hist_type']
    #     hist_var=row['hist_var']
    #     if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)):
    #         newgis.append(gi)
    # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    # print len(f_df)

    #3. Make a list of seq with good ids and descriptions
    ####################
    f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])}
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa=muscle_aln(f_fasta_dict.values(),gapopen=float(-20))
    AlignIO.write(msa, "int_data/example_msa.fasta", "fasta")

    msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta")

    for i in range(len(msa)):
        gi=msa[i].id
        msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca')
    msa.sort(key=lambda x: x.description)


    #5. Visualize MSA
    aln2html(msa,'example_h2a.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A alignment",description=True,field1w=10,field2w=35)

    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    # title+=' gaptrim'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr=trim_hist_aln_to_core(msa)
    # msa_tr=msa
    # print get_hist_ss_in_aln_for_shade(msa_tr,below=True)

    # exit()
    #7. Vizualize MSA with ete2.
    taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])}
    gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])}

    msa_dict={i.id:i.seq for i in msa_tr}
    print taxid2gi
    t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False)
    a=t.add_child(name='annotation')
    a.add_feature('sci_name','annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()
    def layout(node):
        # print node.rank
        # print node.sci_name
        if getattr(node, "rank", None):
            if(node.rank in ['order','class','phylum','kingdom']):   
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        if node.is_leaf() and not node.name=='annotation':
            s=str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi=taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('      '+str(int(node.name))+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('      '+str(gi2variant[gi])+' '),node,column=3, position = "aligned")

        if node.is_leaf() and node.name=='annotation':
            s=get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('       '+'NCBI_TAXID'+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('       '+'Variant'+'       '),node,column=3, position = "aligned")



    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts)


    #10. Conservation
    features=get_hist_ss_in_aln_for_shade(msa_tr,below=True)
    cn=add_consensus(msa_tr,threshold=0.5)[-2:-1]
    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation')
    
    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2,m=0)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation')
예제 #3
0
def main():
    title = ''
    #1. Getting data
    ########################################################
    ########################################################
    # df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info #Does not really seem that we need to redefine variants based on best score.
    df = pd.read_csv('int_data/seqs_rs.csv')  #Histone types info
    fasta_dict = pickle.load(open("int_data/fasta_dict.p", "rb"))  #Sequences

    #2. Filtering - filter initial dataset by type, variant and other parameters
    ########################################################
    ########################################################

    #2.1. Narrow by variant/type
    ########################################################
    title += 'H2A'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    f_df = df[(
        (df['hist_var'] == 'canonical_H2A') | (df['hist_var'] == 'H2A.X'))
              & (df['partial'] == False) & (df['non_st_aa'] == False)]
    # f_df=df[((df['hist_var']=='H2A.Z'))&(df['partial']==False)&(df['non_st_aa']==False)]

    # f_df=df[(df['hist_type']=='H2A')]

    print "Number of seqs after narrowing by hist type/var:", len(f_df)

    #2.2. Filter by list of taxonomy clades - restrict sequences to certain taxonomic clades
    #########################################################
    title += ' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes = [
        131567
    ]  #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    #5782 - dictostelium
    #This is akin manual removal of bad species
    del_nodes = [5782, 5690]

    print "Selecting taxonomic subset for taxids: ", parent_nodes
    print "while removing taxonomic subset for taxids: ", del_nodes

    taxids = set(parent_nodes)
    for i in parent_nodes:
        taxids.update(ncbi.get_descendant_taxa(i, intermediate_nodes=True))
    for i in del_nodes:
        taxids = taxids.difference(set([i]))
        taxids = taxids.difference(
            set(ncbi.get_descendant_taxa(i, intermediate_nodes=True)))

    f_df = f_df[f_df['taxid'].isin(taxids)]
    print "Number of seq after taxonomic subset: ", len(f_df)

    #2.3.0 Marking number of identical sequence within each species and subspecies.
    #This will simplify further analysis of sequence filtering on similarity
    #We know that all refseqs are duplicated for instance.
    ################################################
    ident = dict()
    new_gis = list()
    tids = set(list(f_df['taxid']))
    for i in tids:
        # print i.name, i.sci_name
        temp_df = f_df[(f_df['taxid'] == i)]
        gis = list(temp_df['gi'])  #this is to limit exec time
        # print gis
        if (len(gis) > 1):
            res = cluster_seq_support({gi: fasta_dict[str(gi)]
                                       for gi in gis},
                                      ident_thresh=1.00)
            ident.update(res)
        else:
            ident.update({gis[0]: 1})

    f_df['ident'] = [ident.get(k, 1) for k in f_df['gi']]
    #where ident - number of identical sequnces for current sepecies/subspecies.
    print "Identity of sequence inside each taxid determined"

    #2.3.1. Calculate number of similar seqs for every seq in tax group
    #########################################################
    # Use powerful method, to get rid of random errors is to identify identical sequences
    # if a sequence is supported by two or more entires - this is good.
    # Here we add a degen column to our data set - showing how many similar sequences are found
    # for a given sequence in its taxonomic clade (genus currently)

    #We will traverse the species tree by species, genus or family, and determine degeneracy level
    degen = dict()
    new_gis = list()
    tids = list(f_df['taxid'])
    t = ncbi.get_topology(tids, intermediate_nodes=True)
    for i in t.search_nodes(rank='family'):
        # print i.name, i.sci_name
        nodeset = list()
        for k in i.traverse():
            nodeset.append(int(k.name))
        temp_df = f_df[(f_df['taxid'].isin(nodeset))]
        gis = list(temp_df['gi'])  #this is to limit exec time
        # print gis
        res = cluster_seq_support({gi: fasta_dict[str(gi)]
                                   for gi in gis},
                                  ident_thresh=1.00)
        degen.update(res)

    # print degen
    f_df['degen'] = [degen.get(k, 1) for k in f_df['gi']]

    #2.3.2. Remove seqs that do not have support outside their species
    # if they are not curated or RefSeq NP.
    ###########################################################

    f_df = f_df.sort(
        ['RefSeq', 'degen'], ascending=False
    )  # so that RefSeq record get priority on removing duplicates
    f_df = f_df[(f_df['degen'] > f_df['ident']) | (f_df['curated'] == True) |
                (f_df['RefSeq'] == 2)]
    print "After removing mined seqs with no support in neighboring species: ", len(
        f_df)

    #2.3.3. Shuffle sequnces, so that upon further selection, RefSeq and high degeneracy get priority
    ###########################################################
    #RefSeq and degenerate sequence get priority
    # title+=' 1ptax'
    f_df = f_df.sort(
        ['RefSeq', 'degen'], ascending=False
    )  # so that RefSeq record get priority on removing duplicates
    # print f_df[0:10]
    # f_df=f_df.drop_duplicates(['taxid','hist_var'])

    #2.4 Take one best representative per specific taxonomic rank (e.g. genus)
    ############################################################
    pruningrank = 'genus'
    print "Pruning taxonomy by ", pruningrank

    title += ' , one seq. per %s' % pruningrank
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids = list(f_df['taxid'])  #old list
    grouped_taxids = group_taxids(seqtaxids, rank=pruningrank)
    # print seqtaxids
    # print grouped_taxids
    #Now we need to take best representative
    #refseq NP, curated, or the one with largest degeneracy
    new_gis = list()
    for tids in grouped_taxids:
        t_df = f_df[f_df['taxid'].isin(tids)]
        #try take curated first
        if (len(t_df[t_df['curated'] == True]) > 0):
            new_gis.append(t_df.loc[t_df.curated == True, 'gi'].values[0])
            continue
        #try take NP records nest
        #RefSeq 2 means NP, 1 means XP
        if (len(t_df[t_df['RefSeq'] == 2]) > 0):
            new_gis.append(t_df.loc[t_df.RefSeq == 2, 'gi'].values[0])
            continue
        # take best degenerate otherwise
        else:
            t_df = t_df.sort(['degen', 'RefSeq'], ascending=False)
            new_gis.append(t_df['gi'].iloc[0])

    f_df = f_df[f_df['gi'].isin(new_gis)]

    print "After pruning taxonomy we have: ", len(f_df)

    #2.5. Check seq for sanity - needs to be checked!
    ##############################################
    # title+=' seqQC '

    # print "Checkig sequence quality"
    # newgis=list()
    # for i,row in f_df.iterrows():
    #     gi=row['gi']
    #     seq=fasta_dict[str(gi)].seq
    #     hist_type=row['hist_type']
    #     hist_var=row['hist_var']
    #     if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)):
    #         newgis.append(gi)
    # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    # print len(f_df)

    #3. Make a list of seq with good ids and descriptions
    ##############################################

    f_fasta_dict = {
        key: value
        for (key, value) in fasta_dict.iteritems()
        if int(key) in list(f_df['gi'])
    }
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    f_fasta_dict = {
        key: SeqRecord(
            id=key,
            description=f_df.loc[f_df.gi == int(key), 'hist_var'].values[0] +
            ' ' + taxid2name[f_df.loc[f_df.gi == int(key), 'taxid'].values[0]],
            seq=value.seq)
        for (key, value) in f_fasta_dict.iteritems()
    }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa = muscle_aln(f_fasta_dict.values(), gapopen=float(-20))
    AlignIO.write(msa, "int_data/example_msa.fasta", "fasta")

    msa_annot = MultipleSeqAlignment([
        SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(
            ' ', '-')),
                  id='annotation',
                  name='')
    ])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta")

    for i in range(len(msa)):
        gi = msa[i].id
        msa[i].description = f_fasta_dict[gi].description.replace(
            'canonical', 'ca')
    msa.sort(key=lambda x: x.description)

    #5. Visualize MSA############
    aln2html(msa,
             'example_h2a.html',
             features=get_hist_ss_in_aln_for_html(msa, 'H2A', 0),
             title="canonical H2A alignment",
             description=True,
             field1w=10,
             field2w=35)

    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    # title+=' gaptrim'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr = trim_hist_aln_to_core(msa)
    # msa_tr=msa
    # print get_hist_ss_in_aln_for_shade(msa_tr,below=True)

    # exit()

    #7. Vizualize MSA with ete2.##########
    taxid2gi = {
        f_df.loc[f_df.gi == int(gi), 'taxid'].values[0]: gi
        for gi in list(f_df['gi'])
    }
    gi2variant = {
        gi: f_df.loc[f_df.gi == int(gi), 'hist_var'].values[0]
        for gi in list(f_df['gi'])
    }

    msa_dict = {i.id: i.seq for i in msa_tr}
    t = ncbi.get_topology(list(f_df['taxid']), intermediate_nodes=False)
    a = t.add_child(name='annotation')
    a.add_feature('sci_name', 'annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()

    def layout(node):
        # print node.rank
        # print node.sci_name
        if getattr(node, "rank", None):
            if (node.rank in ['order', 'class', 'phylum', 'kingdom']):
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        if node.is_leaf() and not node.name == 'annotation':
            s = str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(
                s, [[0, len(s), "seq", 10, 10, None, None, None]],
                scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi = taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' ' + str(gi) + ' '),
                             node,
                             column=1,
                             position="aligned")
            add_face_to_node(TextFace('      ' + str(int(node.name)) + ' '),
                             node,
                             column=2,
                             position="aligned")
            add_face_to_node(TextFace('      ' + str(gi2variant[gi]) + ' '),
                             node,
                             column=3,
                             position="aligned")

        if node.is_leaf() and node.name == 'annotation':
            s = get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(
                s, [[0, len(s), "seq", 10, 10, None, None, None]],
                scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' ' + 'NCBI_GI' + ' '),
                             node,
                             column=1,
                             position="aligned")
            add_face_to_node(TextFace('       ' + 'NCBI_TAXID' + ' '),
                             node,
                             column=2,
                             position="aligned")
            add_face_to_node(TextFace('       ' + 'Variant' + '       '),
                             node,
                             column=3,
                             position="aligned")

    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts)

    #10. Conservation############
    #############################
    features = get_hist_ss_in_aln_for_shade(msa_tr, below=True)
    cn = add_consensus(msa_tr, threshold=0.5)[-2:-1]
    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_ent_unw',
                  map(lambda x: log(20) + x,
                      map(float, cons_prof(msa_tr, f=0, c=0))),
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_ent_unw_norm',
                  map(lambda x: log(20) + x,
                      map(float, cons_prof(msa_tr, f=0, c=0, norm="T"))),
                  cn,
                  features,
                  axis='conservation')

    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw_renorm1',
                  map(float, cons_prof(msa_tr, f=0, c=2, m=1)),
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_sofp_unw',
                  map(float, cons_prof(msa_tr, f=0, c=2, m=0)),
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_sofp_psic_renorm1',
                  map(float, cons_prof(msa_tr, f=2, c=2, m=1)),
                  cn,
                  features,
                  axis='conservation')
def main():
    title=''
    #1. Getting data
    df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info
    fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences
    # exit()
    
    #2. Filtering
    ##########
    #2.1. Narrow by variant/type
    title+='Canonical H2A'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    f_df=df[(df['hist_var']=='canonical_H2A')|(df['hist_var']=='H2A.1')]
    # f_df=df[(df['hist_type']=='H2A')]
    # exit()
    print len(f_df)
    #2.2. #####select one variant per taxid
    # title+=' 1ptax'
    f_df=f_df.sort(['RefSeq'],ascending=False) # so that RefSeq record get priority on removing duplicates
    f_df=f_df.drop_duplicates(['taxid','hist_var'])


    # exit()
    #2.3. Filter by list of taxonomy clades   
    ################
    title+=' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    print "Selecting taxonomic subset"
    taxids=list(parent_nodes)
    for i in parent_nodes:
        taxids.extend(ncbi.get_descendant_taxa(i,intermediate_nodes=True))
    f_df=f_df[f_df['taxid'].isin(taxids)]
    print len(f_df)
    # exit()
    
    #2.4 Take one representative per specific taxonomic rank.
    ################
    title+=', one sequence per order'
    print "Pruning taxonomy"
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids=list(f_df['taxid']) #old list
    new_seqtaxids=subsample_taxids(seqtaxids,rank='order') #new subsampled list
    f_df=f_df[f_df['taxid'].isin(new_seqtaxids)] #remake the dataframe
    # print "---"
    print len(f_df)
    # exit()


    #2.5. Check seq for sanity
    ################
    # title+=' seqQC '

    print "Checkig sequence quality"
    newgis=list()
    for i,row in f_df.iterrows():
        gi=row['gi']
        seq=fasta_dict[str(gi)].seq
        hist_type=row['hist_type']
        hist_var=row['hist_var']
        if(check_hist_length(seq,hist_type,hist_var,1)&check_hist_core_length(seq,hist_type,1)):
            newgis.append(gi)
    f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    print len(f_df)
    # print list(f_df['gi'])
    # exit()

    #3. Make a list of seq with good ids and descriptions
    ####################
    f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])}
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa=muscle_aln(f_fasta_dict.values())
    AlignIO.write(msa, "results/h2a_ca_cellular.fasta", "fasta")

    msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "results/h2a_ca_cellular_annot.fasta", "fasta")

    for i in range(len(msa)):
        gi=msa[i].id
        msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca')
    msa.sort(key=lambda x: x.description)


    #5. Visualize MSA
    aln2html(msa,'results/h2a_ca_cellular.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A in cellular organisms",description=True,field1w=10,field2w=35)


    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    title+=', gaps removed'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr=trim_hist_aln_to_core(msa)


    #7. Vizualize MSA with ete2.
    taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])}
    gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])}

    msa_dict={i.id:i.seq for i in msa_tr}
    print taxid2gi
    t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False)
    a=t.add_child(name='annotation')
    a.add_feature('sci_name','annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()
    def layout(node):
        # print node.rank
        # print node.sci_name
        if getattr(node, "rank", None):
            if(node.rank in ['order','class','phylum','kingdom']):   
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        if node.is_leaf() and not node.name=='annotation':
            s=str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi=taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('      '+str(int(node.name))+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('      '+str(gi2variant[gi])+' '),node,column=3, position = "aligned")

        if node.is_leaf() and node.name=='annotation':
            s=get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('       '+'NCBI_TAXID'+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('       '+'Variant'+'       '),node,column=3, position = "aligned")



    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("results/h2a_ca_cellular.svg", w=6000, dpi=300, tree_style=ts)


    #10. Conservation
    features=get_hist_ss_in_aln_for_shade(msa_tr,below=True)
    cn=add_consensus(msa_tr,threshold=0.5)[-2:-1]

    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    plot_prof4seq('results/h2a_ca_cellular_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms')
    
    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('results/h2a_ca_cellular_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms')
    plot_prof4seq('results/h2a_ca_cellular_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms')
    # 2. Make MSA using my function
    #################
    # msa=muscle_aln(f_fasta_dict_rel.values()) #function takes a list of sequence records!!! #ACTIVATE FOR TEX
    msa = muscle_aln(f_fasta_dict.values())  # function takes a list of sequence records!!! #ACTIVATE FOR TEX
    AlignIO.write(msa, "int_data/msa.fasta", "fasta")

    # 3. Get an annotated PDF of histone alignment using TEXSHADE - old way
    ##############
    # get_pdf(hist_name,align,title,shading_modes=['similar'],logo=False,hideseqs=False,splitN=20,setends=[],ruler=False):
    # The sequence names should be unique and without '|'
    if 0:
        get_pdf("H2B", msa, "H2B aln", logo=True, ruler=True)

    # 4.output to html

    aln2html(msa, "int_data/h2b.html", features=get_hist_ss_in_aln_for_html(msa, "H2B", 1))

    # 5. TEST IT: Annotate our MSA using features.json - new experimental way
    #################
    if 0:
        annot_msa = annotate_hist_msa(msa, "H2B")
        mview_plot(annot_msa, "int_data/h2b.html")

    # 6. An example of how to map alignment on one sequence
    if 0:
        templ_H2B = Seq(
            "AKSAPAPKKGSKKAVTKTQKKDGKKRRKTRKESYAIYVYKVLKQVHPDTGISSKAMSIMNSFVNDVFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSAK",
            IUPAC.protein,
        )
        newmsa = trim_aln_to_seq(msa, templ_H2B)
        annot_msa = annotate_hist_msa(newmsa, "H2B")