Esempio n. 1
0
def create_nucleosome(outfile, save_chain=None, **chains):
    """Homology model a complete nucleosome particle.

    Every template chain is aligned (via ``muscle_aln``) against the sequence
    requested for that chain, or against itself when no sequence was given.
    The per-chain alignments are merged into ``aln.pir`` and MODELLER's
    ``automodel`` is run once on that file.

    Parameters:
    -----------
    outfile: File-like object
        Where to write the output.
        NOTE(review): the body below never references ``outfile`` -- confirm
        where the resulting model is supposed to be written.
    save_chain: str
        Save only the structure from chain with name save_chain.
        NOTE(review): currently not referenced by the body either.
    chains: key, value pairs
        key is chain name, value is sequence to model

    Return:
    -------
    None. ``aln.pir`` is written to the current working directory and
    ``automodel.make()`` is invoked.
    """
    # NOTE(review): SeqIO.parse() requires the template sequence file and its
    # format. Neither is visible from this block, so the call is left as
    # found and has to be completed before the function can run.
    template = {
        sequence.id.split("_")[1]: sequence
        for sequence in SeqIO.parse()
    }

    seq_aln = {}
    # Iterating a dict directly yields keys only; .items() is needed to get
    # the (chain name, template record) pairs unpacked here.
    for chain_name, template_seq in template.items():
        template_seq.id = "template"

        # If chain was not specified in input, just use sequence from template
        input_chain = SeqRecord(
            Seq(chains.get(chain_name, str(template_seq.seq))),
            id="model")

        seqlist = [template_seq, input_chain]
        aln_pir = L_fasta2pir.aln(muscle_aln(seqlist))
        # The chain name is used as the chain id of the FIRST/LAST residue
        # range -- presumably what the undefined ``i`` stood for; verify
        # against L_fasta2pir.add_pir_info.
        aln_pir.add_pir_info('template', 'structureX', 'template_nucleosome',
                             'FIRST', chain_name, 'LAST', chain_name)
        aln_pir.add_pir_info('model', 'sequence', 'model_nucleosome')
        seq_aln[chain_name] = aln_pir

    mult_aln = L_fasta2pir.aln_mult_chains(list(seq_aln.values()))
    mult_aln.write('aln.pir')

    # Now let's do MODELLER
    env = environ()  # create a new MODELLER environment to build this model in
    env.io.atom_files_directory = [os.path.sep, "HistoneDB"]

    a = automodel(env, alnfile='aln.pir', knowns='template', sequence='model')
    # Build exactly one model.
    a.starting_model = 1
    a.ending_model = 1
    a.rand_method = None
    a.max_sc_mc_distance = 10
    a.max_sc_sc_distance = 10
    a.md_level = None  # We will do MD anyway, and the structures are similar.

    a.make()
Esempio n. 2
0
def create_nucleosome(outfile, save_chain=None, **chains):
    """Homology model a complete nucleosome particle.

    For each chain of the template, the template sequence is aligned with
    ``muscle_aln`` to the sequence supplied for that chain (falling back to
    the template sequence itself). All chain alignments are combined into
    ``aln.pir``, which is then handed to MODELLER's ``automodel``.

    Parameters:
    -----------
    outfile : File-like object
        Where to write the output.
        NOTE(review): not referenced anywhere in the body -- confirm the
        intended output handling.
    save_chain : str
        Save only the structure from chain with name save_chain.
        NOTE(review): not referenced anywhere in the body.
    chains: key, value pairs
        key is chain name, value is sequence to model

    Return:
    -------
    None. ``aln.pir`` is written to the current working directory and
    ``automodel.make()`` is invoked.
    """
    # NOTE(review): SeqIO.parse() is called without the template file and
    # format it requires; the correct arguments are not visible here, so the
    # call is kept as found and must be completed.
    template = {
        sequence.id.split("_")[1]: sequence
        for sequence in SeqIO.parse()
    }

    seq_aln = {}
    # .items() is required: plain dict iteration would yield only the keys.
    for chain_name, template_seq in template.items():
        template_seq.id = "template"

        # If chain was not specified in input, just use sequence from template
        model_sequence = chains.get(chain_name, str(template_seq.seq))
        input_chain = SeqRecord(Seq(model_sequence), id="model")

        aln_pir = L_fasta2pir.aln(muscle_aln([template_seq, input_chain]))
        # Chain name doubles as the chain id for the FIRST/LAST residue
        # range -- presumably the meaning of the formerly undefined ``i``;
        # verify against L_fasta2pir.add_pir_info.
        aln_pir.add_pir_info('template', 'structureX', 'template_nucleosome',
                             'FIRST', chain_name, 'LAST', chain_name)
        aln_pir.add_pir_info('model', 'sequence', 'model_nucleosome')
        seq_aln[chain_name] = aln_pir

    mult_aln = L_fasta2pir.aln_mult_chains(list(seq_aln.values()))
    mult_aln.write('aln.pir')

    # Now let's do MODELLER
    env = environ()  # create a new MODELLER environment to build this model in
    env.io.atom_files_directory = [os.path.sep, "HistoneDB"]

    a = automodel(env,
                  alnfile='aln.pir',
                  knowns='template',
                  sequence='model')
    # Build exactly one model.
    a.starting_model = 1
    a.ending_model = 1
    a.rand_method = None
    a.max_sc_mc_distance = 10
    a.max_sc_sc_distance = 10
    a.md_level = None  # We will do MD anyway, and the structures are similar.

    a.make()
def annotate_hist_msa(msa, htype, variant=None):
    """Return a new alignment: a feature row followed by the rows of *msa*.

    The reference sequence and its feature string for ``htype`` are read from
    ``inp_data/features.json``. The reference is aligned to the consensus of
    *msa*, columns where the consensus has a gap are dropped, and the feature
    string is stretched with ``-`` wherever the reference itself is gapped.
    ``variant`` is accepted but not used.
    """
    with open("inp_data/features.json") as handle:
        features = json.load(handle)
    type_features = features[htype]
    reference_seq = type_features["General" + htype]["sequence"]
    reference_feature = type_features["General" + htype]["feature1"]

    consensus = SummaryInfo(msa).dumb_consensus(threshold=0.1, ambiguous="X")
    pairwise = muscle_aln([
        SeqRecord(id="consensus", seq=consensus),
        SeqRecord(id="template", seq=Seq(reference_seq)),
    ])
    # After sorting, row 0 is read as the consensus and row 1 as the template.
    pairwise.sort()
    consensus_row = str(pairwise[0].seq)
    template_row = str(pairwise[1].seq)

    # Keep only the template characters in columns where the consensus has a
    # residue, so the result lines up with the columns of ``msa``.
    projected_template = "".join(
        template_char
        for consensus_char, template_char in zip(consensus_row, template_row)
        if consensus_char != "-"
    )

    # Now gap the feature string: one feature character is consumed per
    # template residue, and a "-" is emitted for every template gap.
    feature_chars = []
    consumed = 0
    for template_char in projected_template:
        if template_char == "-":
            feature_chars.append("-")
        else:
            feature_chars.append(reference_feature[consumed])
            consumed += 1
    feature_row = "".join(feature_chars)

    annotated = MultipleSeqAlignment([
        SeqRecord(id="gi|features|id", description=htype, seq=Seq(feature_row))
    ])
    annotated.extend(msa)
    return annotated
Esempio n. 4
0
def main():
    """Filter canonical H2B sequences by taxonomy and print per-taxid colours.

    Steps, in order:
      1. Load histone metadata from ``inp_data/seqs.csv`` and the pickled
         sequence dictionary from ``int_data/fasta_dict.p``.
      2. Read ``taxdmp/nodes.dmp`` into ``tax_dict_pc`` (parent -> children),
         ``tax_dict_cp`` (child -> parent) and ``tax_ranks_dict``; these
         dictionaries are defined outside this function.
      3. Read scientific names from ``taxdmp/names.dmp`` and abbreviate them
         to "<first letter>. <second word>".
      4. Keep ``canonical_H2B`` rows, one per (taxid, hist_var), restricted
         to the taxonomy subtree(s) in ``parent_nodes`` and pruned with
         ``prune_tree`` / ``prune_subspecies``.
      5. Print the selected taxids and, for each sequence, a colour that
         depends on whether the sequence ends in ``K``.

    The function then calls ``exit()``; the MSA construction after that call
    is currently unreachable.

    Prints use the single-argument parenthesised form so the output is
    identical on Python 2 and Python 3.
    """
    # Getting data
    hist_df = pd.read_csv('inp_data/seqs.csv')  # Histone types info
    # Use a context manager so the pickle file is closed deterministically.
    # pickle must only be used on trusted, locally produced files.
    with open("int_data/fasta_dict.p", "rb") as handle:
        fasta_dict = pickle.load(handle)  # Sequences

    # Getting data taxonomic tree into a linked dictionary
    with open('taxdmp/nodes.dmp', 'r') as f:
        for line in f:
            (k1, k2, rank) = line.split('\t|\t')[0:3]
            child = int(k1)
            parent = int(k2)
            # Self-parented entries are skipped.
            if parent == child:
                continue
            if parent in tax_dict_pc:
                tax_dict_pc[parent].append(child)
            else:
                tax_dict_pc[parent] = [child]

            tax_dict_cp[child] = parent
            tax_ranks_dict[child] = rank

    # Getting data - taxonomic names
    # Raw string: the separator is a regular expression and contains "\|".
    df = pd.read_csv('taxdmp/names.dmp',
                     sep=r'\t\|\t',
                     converters={'type': lambda x: x[0:-2]},
                     header=None,
                     names=['taxid', 'name', 'name2', 'type'])
    df = df[df.type == "scientific name"]
    taxid2names = pd.Series(df.name.values,
                            index=df.taxid).to_dict()  # taxid to taxname dict
    # Abbreviate "Homo sapiens ..." to "H. sapiens".
    taxid2names = {
        key: (value.split()[0][0] + '. ' + ' '.join(value.split()[1:2]))
        for (key, value) in taxid2names.items()
    }

    # Here we do filtering to get a set of desired gis
    f_hist_df = hist_df[(hist_df['hist_var'] == 'canonical_H2B')]

    # select one variant per taxid
    f_hist_df = f_hist_df.drop_duplicates(['taxid', 'hist_var'])

    # overlay filtering by taxids
    parent_nodes = [1]  # taxids of the parent nodes we want to include.
    taxids = get_tree_nodes(parent_nodes)
    f_hist_df = f_hist_df[f_hist_df['taxid'].isin(taxids)]

    seqtaxids = list(f_hist_df['taxid'])
    print(seqtaxids)

    # We do not want subspecies
    # Generate a tree from available sequence taxids using original taxonomy
    # as a guide and take only one representative per species
    print("starting tree pruning")
    tree_pc, tree_cp = prune_tree(seqtaxids)
    print(tree_pc)
    print(tree_cp)
    # We need for every species only one taxid.
    print("%d sequences filtered" % len(f_hist_df))

    new_taxids = prune_subspecies(tree_pc, tree_cp, seqtaxids)
    f_hist_df = f_hist_df[f_hist_df['taxid'].isin(new_taxids)]

    print("%d sequences filtered" % len(f_hist_df))

    # get a list of desired fasta seqs
    # The gi column is turned into a set once, instead of rebuilding a list
    # for every key of fasta_dict.
    wanted_gis = set(f_hist_df['gi'])
    f_fasta_dict = {
        key: value
        for (key, value) in fasta_dict.items()
        if key in wanted_gis
    }

    # Relabel sequences gi => type and organism
    # with gi
    f_fasta_dict_rel = {
        key: SeqRecord(
            id=key,
            description=f_hist_df.loc[f_hist_df.gi == key,
                                      'hist_var'].values[0] + ' ' +
            taxid2names[f_hist_df.loc[f_hist_df.gi == key, 'taxid'].values[0]],
            seq=value.seq)
        for (key, value) in f_fasta_dict.items()
    }

    # output taxids (comma-terminated, all on one line)
    keys = ''.join(
        str(f_hist_df.loc[f_hist_df.gi == key, 'taxid'].values[0]) + ','
        for key in f_fasta_dict
    )
    print(keys)

    # An analogous colouring for H2A.Z was previously done here with
    #   'R[VI][GSA][ASG]K[SA][AGS]' -> '#00ff00'
    #   'R[VI][GSA][ASG]G[SA]P'     -> '#0000ff'
    #   otherwise                   -> '#ff0000'

    # output patternmatch H2B: green when the last residue is not K
    for (key, value) in f_fasta_dict.items():
        taxid = str(f_hist_df.loc[f_hist_df.gi == key, 'taxid'].values[0])
        if re.search('[^K]$', str(value.seq)):
            colour = '#00ff00'
        else:
            colour = '#ff0000'
        print("%s,%s" % (taxid, colour))

    # NOTE(review): this exit() stops the script here, so the MSA step below
    # never runs. It is kept because removing it would start running the
    # alignment; confirm whether that is wanted.
    exit()

    # Here we construct MSA
    msa = muscle_aln(list(f_fasta_dict_rel.values()))
    AlignIO.write(msa, "int_data/msa.fasta", "fasta")
Esempio n. 5
0
def main():
    # Build, annotate and plot an alignment of canonical H2A / H2A.X sequences.
    #
    # Pipeline, as implemented below:
    #   1. load the sequence table (int_data/seqs_rs.csv) and the pickled
    #      sequence dictionary (int_data/fasta_dict.p);
    #   2. filter by variant and taxonomic clade, attach the 'ident' and
    #      'degen' columns from cluster_seq_support, and keep one
    #      representative per genus;
    #   3-4. relabel the records and align them with muscle_aln;
    #   5-7. write an HTML view, trim the alignment and render an ete tree
    #      with the aligned sequences to example_motifs_H2A.svg;
    #   10. plot conservation profiles with plot_prof4seq.
    #
    # The code uses Python 2 syntax (print statement, dict.iteritems).
    # `title` is accumulated along the way and used as the tree figure title.
    title=''
    #1. Getting data
    ########################################################
    ########################################################
    # df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info #Does not really seem that we need to redefine variants based on best score.
    df=pd.read_csv('int_data/seqs_rs.csv') #Histone types info
    # NOTE(review): the handle returned by open() is never closed explicitly.
    fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences
    
    #2. Filtering - filter initial dataset by type, variant and other parameters
    ########################################################
    ########################################################

    #2.1. Narrow by variant/type
    ########################################################
    title+='H2A'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    # Keep canonical_H2A or H2A.X rows whose 'partial' and 'non_st_aa' flags are both False.
    f_df=df[((df['hist_var']=='canonical_H2A')|(df['hist_var']=='H2A.X'))&(df['partial']==False)&(df['non_st_aa']==False)]
    # f_df=df[((df['hist_var']=='H2A.Z'))&(df['partial']==False)&(df['non_st_aa']==False)]

    # f_df=df[(df['hist_type']=='H2A')]

    print "Number of seqs after narrowing by hist type/var:", len(f_df)
    

    #2.2. Filter by list of taxonomy clades - restrict sequences to certain taxonomic clades
    #########################################################
    title+=' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    #5782 - dictostelium
    #This is akin to manual removal of bad species
    del_nodes=[5782,5690]

    print "Selecting taxonomic subset for taxids: ",parent_nodes
    print "while removing taxonomic subset for taxids: ",del_nodes

    # Allowed taxids = parent nodes plus their descendants, minus each
    # excluded node together with its descendants.
    taxids=set(parent_nodes)
    for i in parent_nodes:
        taxids.update(ncbi.get_descendant_taxa(i,intermediate_nodes=True))
    for i in del_nodes:
        taxids=taxids.difference(set([i]))
        taxids=taxids.difference(set(ncbi.get_descendant_taxa(i,intermediate_nodes=True)))

    f_df=f_df[f_df['taxid'].isin(taxids)]
    print "Number of seq after taxonomic subset: ",len(f_df)
    


    #2.3.0 Marking number of identical sequence within each species and subspecies.
    #This will simplify further analysis of sequence filtering on similarity
    #We know that all refseqs are duplicated for instance.
    ################################################
    ident=dict()
    # NOTE(review): new_gis is not used in this step; it is re-created before
    # its first real use in step 2.4.
    new_gis=list()
    tids=set(list(f_df['taxid']))
    for i in tids:
        # print i.name, i.sci_name
        temp_df=f_df[(f_df['taxid']==i)]
        gis=list(temp_df['gi']) #this is to limit exec time
        # print gis
        # Clustering is only run when a taxid has more than one sequence;
        # a single sequence gets the value 1 directly.
        if(len(gis)>1):
            res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=1.00)
            ident.update(res)
        else:
            ident.update({gis[0]:1})

    # NOTE(review): f_df is a filtered selection of df, so this column
    # assignment may raise pandas' SettingWithCopyWarning -- confirm.
    f_df['ident']=[ident.get(k,1) for k in f_df['gi']]
    #where ident - number of identical sequences for current species/subspecies.
    print "Identity of sequence inside each taxid determined"

    #2.3.1. Calculate number of similar seqs for every seq in tax group
    #########################################################
    # Use powerful method, to get rid of random errors is to identify identical sequences
    # if a sequence is supported by two or more entries - this is good.
    # Here we add a degen column to our data set - showing how many similar sequences are found
    # for a given sequence in its taxonomic clade (the loop below uses rank 'family')

    #We will traverse the species tree by species, genus or family, and determine degeneracy level
    degen=dict()
    new_gis=list()
    tids=list(f_df['taxid']) 
    t = ncbi.get_topology(tids,intermediate_nodes=True)
    for i in t.search_nodes(rank='family'):
        # print i.name, i.sci_name
        # Collect the taxids of every node under this family node.
        nodeset=list()
        for k in i.traverse():
            nodeset.append(int(k.name))
        temp_df=f_df[(f_df['taxid'].isin(nodeset))]
        gis=list(temp_df['gi']) #this is to limit exec time
        # print gis
        res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=1.00)
        degen.update(res)

    # print degen
    # Sequences not covered by any family node default to degen == 1.
    f_df['degen']=[degen.get(k,1) for k in f_df['gi']]

    #2.3.2. Remove seqs that do not have support outside their species
    # if they are not curated or RefSeq NP.
    ###########################################################

    # NOTE(review): DataFrame.sort(columns, ...) is the legacy pandas API;
    # newer pandas releases provide sort_values instead.
    f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates
    # Keep a row if degen exceeds ident, or it is curated, or RefSeq == 2.
    f_df=f_df[(f_df['degen']>f_df['ident'])|(f_df['curated']==True)|(f_df['RefSeq']==2)]
    print "After removing mined seqs with no support in neighboring species: ",len(f_df)

    #2.3.3. Shuffle sequences, so that upon further selection, RefSeq and high degeneracy get priority
    ###########################################################
    #RefSeq and degenerate sequence get priority
    # title+=' 1ptax'
    f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates
    # print f_df[0:10]
    # f_df=f_df.drop_duplicates(['taxid','hist_var'])


    #2.4 Take one best representative per specific taxonomic rank (e.g. genus)
    ############################################################
    pruningrank='genus'
    print "Pruning taxonomy by ", pruningrank
    
    title+=' , one seq. per %s'%pruningrank
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids=list(f_df['taxid']) #old list
    grouped_taxids=group_taxids(seqtaxids,rank=pruningrank)
    # print seqtaxids
    # print grouped_taxids
    #Now we need to take best representative
    #refseq NP, curated, or the one with largest degeneracy
    new_gis=list()
    for tids in grouped_taxids:
        t_df=f_df[f_df['taxid'].isin(tids)]
        #try take curated first
        if(len(t_df[t_df['curated']==True])>0):
            new_gis.append(t_df.loc[t_df.curated==True,'gi'].values[0])
            continue
        #try take NP records next
        #RefSeq 2 means NP, 1 means XP
        if(len(t_df[t_df['RefSeq']==2])>0):
            new_gis.append(t_df.loc[t_df.RefSeq==2,'gi'].values[0])
            continue
        # take best degenerate otherwise
        else:
            t_df=t_df.sort(['degen','RefSeq'],ascending=False) 
            new_gis.append(t_df['gi'].iloc[0])

    f_df=f_df[f_df['gi'].isin(new_gis)]

    print "After pruning taxonomy we have: ",len(f_df)


    #2.5. Check seq for sanity - needs to be checked!
    ##############################################
    # title+=' seqQC '

    # print "Checking sequence quality"
    # newgis=list()
    # for i,row in f_df.iterrows():
    #     gi=row['gi']
    #     seq=fasta_dict[str(gi)].seq
    #     hist_type=row['hist_type']
    #     hist_var=row['hist_var']
    #     if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)):
    #         newgis.append(gi)
    # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    # print len(f_df)

    #3. Make a list of seq with good ids and descriptions
    ##############################################

    # fasta_dict keys are converted with int() before being compared to the
    # 'gi' column.
    # NOTE(review): list(f_df['gi']) is rebuilt for every key of fasta_dict.
    f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])}
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa=muscle_aln(f_fasta_dict.values(),gapopen=float(-20))
    AlignIO.write(msa, "int_data/example_msa.fasta", "fasta")

    # Same alignment with an extra first record ('annotation') built from
    # get_hist_ss_in_aln_as_string, with spaces replaced by '-'.
    msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta")

    # Copy the descriptions back onto the aligned records (shortening
    # 'canonical' to 'ca') and order the alignment by description.
    for i in range(len(msa)):
        gi=msa[i].id
        msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca')
    msa.sort(key=lambda x: x.description)


    #5. Visualize MSA############
    aln2html(msa,'example_h2a.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A alignment",description=True,field1w=10,field2w=35)

    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    # title+=' gaptrim'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr=trim_hist_aln_to_core(msa)
    # msa_tr=msa
    # print get_hist_ss_in_aln_for_shade(msa_tr,below=True)

    # exit()

    #7. Visualize MSA with ete2.##########
    # Lookup tables used by the tree layout: taxid -> gi and gi -> variant.
    # If several rows share a taxid, the later gi overwrites the earlier one.
    taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])}
    gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])}

    msa_dict={i.id:i.seq for i in msa_tr}
    t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False)
    # Extra leaf named 'annotation' that carries the header/annotation row.
    a=t.add_child(name='annotation')
    a.add_feature('sci_name','annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()
    def layout(node):
        # Layout callback assigned to ts.layout_fn below; decorates one node.
        # print node.rank
        # print node.sci_name
        # Nodes of selected higher ranks get their scientific name on the branch.
        if getattr(node, "rank", None):
            if(node.rank in ['order','class','phylum','kingdom']):
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        # Regular leaves: aligned sequence plus gi, taxid and variant columns.
        if node.is_leaf() and not node.name=='annotation':
            s=str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi=taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('      '+str(int(node.name))+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('      '+str(gi2variant[gi])+' '),node,column=3, position = "aligned")

        # The 'annotation' leaf: annotation string plus column headers.
        if node.is_leaf() and node.name=='annotation':
            s=get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('       '+'NCBI_TAXID'+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('       '+'Variant'+'       '),node,column=3, position = "aligned")



    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts)

    #10. Conservation############
    #############################
    features=get_hist_ss_in_aln_for_shade(msa_tr,below=True)
    # The [-2:-1] slice keeps only the second-to-last item of what
    # add_consensus returns.
    cn=add_consensus(msa_tr,threshold=0.5)[-2:-1]
    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    # For the two profiles below, log(20) is added to every value before plotting.
    plot_prof4seq('example_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation')
    plot_prof4seq('example_cons_ent_unw_norm',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0,norm="T"))),cn,features,axis='conservation')
    
    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2,m=0)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation')
def main():
    # Build, annotate and plot an alignment of cenH3 sequences.
    #
    # Pipeline, as implemented below:
    #   1. load int_data/seqs_rs_redef.csv and the pickled sequence
    #      dictionary (int_data/fasta_dict.p);
    #   2. keep non-partial cenH3 rows inside the chosen clade, attach a
    #      'degen' column from cluster_seq_support (per genus, threshold
    #      0.95) and pick representatives per group from group_taxids;
    #   3-4. relabel the records and align them with muscle_aln;
    #   5-7. write an HTML view, trim the alignment and render an ete tree;
    #   10. plot conservation profiles with plot_prof4seq.
    #
    # The code uses Python 2 syntax (print statement, dict.iteritems).
    # NOTE(review): output file names and titles further down still say
    # "H2A" although cenH3 is selected here -- confirm this is intended.
    title=''
    #1. Getting data
    df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info
    # NOTE(review): the handle returned by open() is never closed explicitly.
    fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences
    # exit()
    
    #2. Filtering
    ##########
    #2.1. Narrow by variant/type
    title+='CenH3'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    f_df=df[((df['hist_var']=='cenH3'))&(df['partial']==False)]
    # f_df=df[(df['hist_type']=='H2A')]
    # exit()

    print len(f_df)
    


    #2.2. Filter by list of taxonomy clades   
    ################
    title+=' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    #5782 - dictostelium
    print "Selecting taxonomic subset"
    # Allowed taxids = the parent nodes plus all of their descendants.
    taxids=list(parent_nodes)
    for i in parent_nodes:
        taxids.extend(ncbi.get_descendant_taxa(i,intermediate_nodes=True))
    f_df=f_df[f_df['taxid'].isin(taxids)]
    print len(f_df)
    # exit()
    
    #2.3*. Alternative powerful method, to get rid of random errors in seqs
    #We need to cluster seqs, and select only if we have support by two or more similar seqs 

    #We will traverse the species tree by species, genus or family, and determine degeneracy level
    degen=dict()
    # NOTE(review): new_gis is not used in this step; it is re-created before
    # its first real use in step 2.4.
    new_gis=list()
    tids=list(f_df['taxid']) 
    t = ncbi.get_topology(tids,intermediate_nodes=True)
    for i in t.search_nodes(rank='genus'):
        # print i.name, i.sci_name
        # Collect the taxids of every node under this genus node.
        nodeset=list()
        for k in i.traverse():
            nodeset.append(int(k.name))
        temp_df=f_df[(f_df['taxid'].isin(nodeset))]
        gis=list(temp_df['gi']) #this is to limit exec time
        # print gis
        res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=0.95)
        degen.update(res)
        # exit()
        # for k,v in res.iteritems():
            # if v>2.0:
                # new_gis.append(k)
    # f_df=f_df[f_df['gi'].isin(new_gis)]
    print degen
    # Sequences not covered by any genus node default to degen == 1.
    # NOTE(review): f_df is a filtered selection of df, so this column
    # assignment may raise pandas' SettingWithCopyWarning -- confirm.
    f_df['degen']=[degen.get(k,1) for k in f_df['gi']]

    #2.4. #####select one variant per taxid, priority to RefSeq
    # title+=' 1ptax'
    # NOTE(review): DataFrame.sort(columns, ...) is the legacy pandas API;
    # newer pandas releases provide sort_values instead.
    f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates
    print f_df[0:10]
    # f_df=f_df.drop_duplicates(['taxid','hist_var'])


    #2.4 Take one best representative per specific taxonomic rank.
    ################
    # NOTE(review): the title says "per genus" but group_taxids below is
    # called with rank='species' -- confirm which one is intended.
    title+=' , one seq. per genus, trimmed'
    print "Pruning taxonomy"
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids=list(f_df['taxid']) #old list
    grouped_taxids=group_taxids(seqtaxids,rank='species')
    print seqtaxids
    print grouped_taxids
    #Now we need to take best representative
    #refseq NP, or the one with largest degeneracy
    new_gis=list()
    for tids in grouped_taxids:
        t_df=f_df[f_df['taxid'].isin(tids)]
        #take NP if we have it
        if(len(t_df[t_df['RefSeq']==2])>0):
            new_gis.append(t_df.loc[t_df.RefSeq==2,'gi'].values[0])
            continue
        else: # take best degenerate
            t_df=t_df.sort(['degen','RefSeq'],ascending=False) # so that RefSeq record get priority on removing duplicates
            # Without a RefSeq==2 record, a group contributes a sequence
            # only when its best 'degen' value exceeds 100.
            # NOTE(review): confirm that the 100 cut-off is intentional.
            if(t_df['degen'].iloc[0]>100):
                new_gis.append(t_df['gi'].iloc[0])

    f_df=f_df[f_df['gi'].isin(new_gis)]

    # new_seqtaxids=subsample_taxids(seqtaxids,rank='species') #new subsampled list
    # f_df=f_df[f_df['taxid'].isin(new_seqtaxids)] #remake the dataframe
    # print "---"

    # exit()


    #2.5. Check seq for sanity
    ################
    # title+=' seqQC '

    # print "Checking sequence quality"
    # newgis=list()
    # for i,row in f_df.iterrows():
    #     gi=row['gi']
    #     seq=fasta_dict[str(gi)].seq
    #     hist_type=row['hist_type']
    #     hist_var=row['hist_var']
    #     if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)):
    #         newgis.append(gi)
    # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    # print len(f_df)

    #3. Make a list of seq with good ids and descriptions
    ####################
    # fasta_dict keys are converted with int() before being compared to the
    # 'gi' column.
    # NOTE(review): list(f_df['gi']) is rebuilt for every key of fasta_dict.
    f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])}
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa=muscle_aln(f_fasta_dict.values(),gapopen=float(-20))
    AlignIO.write(msa, "int_data/example_msa.fasta", "fasta")

    # Same alignment with an extra first record ('annotation') built from
    # get_hist_ss_in_aln_as_string, with spaces replaced by '-'.
    msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta")

    # Copy the descriptions back onto the aligned records (shortening
    # 'canonical' to 'ca') and order the alignment by description.
    for i in range(len(msa)):
        gi=msa[i].id
        msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca')
    msa.sort(key=lambda x: x.description)


    #5. Visualize MSA
    # NOTE(review): file name, feature type ('H2A') and title refer to H2A.
    aln2html(msa,'example_h2a.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A alignment",description=True,field1w=10,field2w=35)

    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    # title+=' gaptrim'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr=trim_hist_aln_to_core(msa)
    # msa_tr=msa
    # print get_hist_ss_in_aln_for_shade(msa_tr,below=True)

    # exit()
    #7. Visualize MSA with ete2.
    # Lookup tables used by the tree layout: taxid -> gi and gi -> variant.
    # If several rows share a taxid, the later gi overwrites the earlier one.
    taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])}
    gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])}

    msa_dict={i.id:i.seq for i in msa_tr}
    print taxid2gi
    t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False)
    # Extra leaf named 'annotation' that carries the header/annotation row.
    a=t.add_child(name='annotation')
    a.add_feature('sci_name','annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()
    def layout(node):
        # Layout callback assigned to ts.layout_fn below; decorates one node.
        # print node.rank
        # print node.sci_name
        # Nodes of selected higher ranks get their scientific name on the branch.
        if getattr(node, "rank", None):
            if(node.rank in ['order','class','phylum','kingdom']):
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        # Regular leaves: aligned sequence plus gi, taxid and variant columns.
        if node.is_leaf() and not node.name=='annotation':
            s=str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi=taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('      '+str(int(node.name))+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('      '+str(gi2variant[gi])+' '),node,column=3, position = "aligned")

        # The 'annotation' leaf: annotation string plus column headers.
        if node.is_leaf() and node.name=='annotation':
            s=get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('       '+'NCBI_TAXID'+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('       '+'Variant'+'       '),node,column=3, position = "aligned")



    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    # NOTE(review): output name still refers to H2A.
    t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts)


    #10. Conservation
    features=get_hist_ss_in_aln_for_shade(msa_tr,below=True)
    # The [-2:-1] slice keeps only the second-to-last item of what
    # add_consensus returns.
    cn=add_consensus(msa_tr,threshold=0.5)[-2:-1]
    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    # For the profile below, log(20) is added to every value before plotting.
    plot_prof4seq('example_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation')
    
    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2,m=0)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation')
def main():
    title=''
    #1. Getting data
    df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info
    fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences
    # exit()
    
    #2. Filtering
    ##########
    #2.1. Narrow by variant/type
    title+='Canonical H2A'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    f_df=df[(df['hist_var']=='canonical_H2A')|(df['hist_var']=='H2A.1')]
    # f_df=df[(df['hist_type']=='H2A')]
    # exit()
    print len(f_df)
    #2.2. #####select one variant per taxid
    # title+=' 1ptax'
    f_df=f_df.sort(['RefSeq'],ascending=False) # so that RefSeq record get priority on removing duplicates
    f_df=f_df.drop_duplicates(['taxid','hist_var'])


    # exit()
    #2.3. Filter by list of taxonomy clades   
    ################
    title+=' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    print "Selecting taxonomic subset"
    taxids=list(parent_nodes)
    for i in parent_nodes:
        taxids.extend(ncbi.get_descendant_taxa(i,intermediate_nodes=True))
    f_df=f_df[f_df['taxid'].isin(taxids)]
    print len(f_df)
    # exit()
    
    #2.4 Take one representative per specific taxonomic rank.
    ################
    title+=', one sequence per order'
    print "Pruning taxonomy"
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids=list(f_df['taxid']) #old list
    new_seqtaxids=subsample_taxids(seqtaxids,rank='order') #new subsampled list
    f_df=f_df[f_df['taxid'].isin(new_seqtaxids)] #remake the dataframe
    # print "---"
    print len(f_df)
    # exit()


    #2.5. Check seq for sanity
    ################
    # title+=' seqQC '

    print "Checkig sequence quality"
    newgis=list()
    for i,row in f_df.iterrows():
        gi=row['gi']
        seq=fasta_dict[str(gi)].seq
        hist_type=row['hist_type']
        hist_var=row['hist_var']
        if(check_hist_length(seq,hist_type,hist_var,1)&check_hist_core_length(seq,hist_type,1)):
            newgis.append(gi)
    f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    print len(f_df)
    # print list(f_df['gi'])
    # exit()

    #3. Make a list of seq with good ids and descriptions
    ####################
    f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])}
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa=muscle_aln(f_fasta_dict.values())
    AlignIO.write(msa, "results/h2a_ca_cellular.fasta", "fasta")

    msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "results/h2a_ca_cellular_annot.fasta", "fasta")

    for i in range(len(msa)):
        gi=msa[i].id
        msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca')
    msa.sort(key=lambda x: x.description)


    #5. Visualize MSA
    aln2html(msa,'results/h2a_ca_cellular.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A in cellular organisms",description=True,field1w=10,field2w=35)


    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    title+=', gaps removed'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr=trim_hist_aln_to_core(msa)


    #7. Vizualize MSA with ete2.
    taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])}
    gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])}

    msa_dict={i.id:i.seq for i in msa_tr}
    print taxid2gi
    t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False)
    a=t.add_child(name='annotation')
    a.add_feature('sci_name','annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()
    def layout(node):
        # print node.rank
        # print node.sci_name
        if getattr(node, "rank", None):
            if(node.rank in ['order','class','phylum','kingdom']):   
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        if node.is_leaf() and not node.name=='annotation':
            s=str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi=taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('      '+str(int(node.name))+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('      '+str(gi2variant[gi])+' '),node,column=3, position = "aligned")

        if node.is_leaf() and node.name=='annotation':
            s=get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('       '+'NCBI_TAXID'+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('       '+'Variant'+'       '),node,column=3, position = "aligned")



    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("results/h2a_ca_cellular.svg", w=6000, dpi=300, tree_style=ts)


    #10. Conservation
    features=get_hist_ss_in_aln_for_shade(msa_tr,below=True)
    cn=add_consensus(msa_tr,threshold=0.5)[-2:-1]

    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    plot_prof4seq('results/h2a_ca_cellular_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms')
    
    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('results/h2a_ca_cellular_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms')
    plot_prof4seq('results/h2a_ca_cellular_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms')
Esempio n. 8
0
# s = p.get_structure('1id3', '1id3.pdb')
# ppb=PPBuilder()
# seqs_yeast=dict()
# for i in ['A','B','C','D','E','F','G','H']:
# seqs_yeast[i]=ppb.build_peptides(s[0][i])[0].get_sequence()
# print nucl_yeast

# Align each 'xen' chain with the matching 'yeast' chain and attach the PIR
# header fields to every pairwise alignment.
# NOTE(review): an earlier comment asked for a high gap penalty, but
# muscle_aln is called here without any gap parameters -- confirm.
chain_ids = 'ABCDEFGH'

seq_aln = {}
for i in chain_ids:
    seqlist = [SeqRecord(seqs_xen[i], id='xen'),
               SeqRecord(seqs_yeast[i], id='yeast')]
    aln_pir = L_fasta2pir.aln(muscle_aln(seqlist))
    # 'xen' is the structure entry limited to chain i; 'yeast' is the
    # sequence entry.
    aln_pir.add_pir_info('xen', 'structureX', 'xen_nucl',
                         'FIRST', i, 'LAST', i)
    aln_pir.add_pir_info('yeast', 'sequence', 'nucl_yeast')
    seq_aln[i] = aln_pir

# Combine the per-chain alignments, in chain order, into one multi-chain
# alignment and store it as a PIR file.
per_chain_alns = [seq_aln[i] for i in chain_ids]
mult_aln = L_fasta2pir.aln_mult_chains(per_chain_alns)
mult_aln.write('aln.pir')

#####
# Now let's do MODELLER
env = environ()  # new MODELLER environment to build this model in
env.io.atom_files_directory = ['.', 'data']
# change to folder with data files
Esempio n. 9
0
def main():
    title = ''
    #1. Getting data
    ########################################################
    ########################################################
    # df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info #Does not really seem that we need to redefine variants based on best score.
    df = pd.read_csv('int_data/seqs_rs.csv')  #Histone types info
    fasta_dict = pickle.load(open("int_data/fasta_dict.p", "rb"))  #Sequences

    #2. Filtering - filter initial dataset by type, variant and other parameters
    ########################################################
    ########################################################

    #2.1. Narrow by variant/type
    ########################################################
    title += 'H2A'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    f_df = df[(
        (df['hist_var'] == 'canonical_H2A') | (df['hist_var'] == 'H2A.X'))
              & (df['partial'] == False) & (df['non_st_aa'] == False)]
    # f_df=df[((df['hist_var']=='H2A.Z'))&(df['partial']==False)&(df['non_st_aa']==False)]

    # f_df=df[(df['hist_type']=='H2A')]

    print "Number of seqs after narrowing by hist type/var:", len(f_df)

    #2.2. Filter by list of taxonomy clades - restrict sequences to certain taxonomic clades
    #########################################################
    title += ' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes = [
        131567
    ]  #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    #5782 - dictostelium
    #This is akin manual removal of bad species
    del_nodes = [5782, 5690]

    print "Selecting taxonomic subset for taxids: ", parent_nodes
    print "while removing taxonomic subset for taxids: ", del_nodes

    taxids = set(parent_nodes)
    for i in parent_nodes:
        taxids.update(ncbi.get_descendant_taxa(i, intermediate_nodes=True))
    for i in del_nodes:
        taxids = taxids.difference(set([i]))
        taxids = taxids.difference(
            set(ncbi.get_descendant_taxa(i, intermediate_nodes=True)))

    f_df = f_df[f_df['taxid'].isin(taxids)]
    print "Number of seq after taxonomic subset: ", len(f_df)

    #2.3.0 Marking number of identical sequence within each species and subspecies.
    #This will simplify further analysis of sequence filtering on similarity
    #We know that all refseqs are duplicated for instance.
    ################################################
    ident = dict()
    new_gis = list()
    tids = set(list(f_df['taxid']))
    for i in tids:
        # print i.name, i.sci_name
        temp_df = f_df[(f_df['taxid'] == i)]
        gis = list(temp_df['gi'])  #this is to limit exec time
        # print gis
        if (len(gis) > 1):
            res = cluster_seq_support({gi: fasta_dict[str(gi)]
                                       for gi in gis},
                                      ident_thresh=1.00)
            ident.update(res)
        else:
            ident.update({gis[0]: 1})

    f_df['ident'] = [ident.get(k, 1) for k in f_df['gi']]
    #where ident - number of identical sequnces for current sepecies/subspecies.
    print "Identity of sequence inside each taxid determined"

    #2.3.1. Calculate number of similar seqs for every seq in tax group
    #########################################################
    # Use powerful method, to get rid of random errors is to identify identical sequences
    # if a sequence is supported by two or more entires - this is good.
    # Here we add a degen column to our data set - showing how many similar sequences are found
    # for a given sequence in its taxonomic clade (genus currently)

    #We will traverse the species tree by species, genus or family, and determine degeneracy level
    degen = dict()
    new_gis = list()
    tids = list(f_df['taxid'])
    t = ncbi.get_topology(tids, intermediate_nodes=True)
    for i in t.search_nodes(rank='family'):
        # print i.name, i.sci_name
        nodeset = list()
        for k in i.traverse():
            nodeset.append(int(k.name))
        temp_df = f_df[(f_df['taxid'].isin(nodeset))]
        gis = list(temp_df['gi'])  #this is to limit exec time
        # print gis
        res = cluster_seq_support({gi: fasta_dict[str(gi)]
                                   for gi in gis},
                                  ident_thresh=1.00)
        degen.update(res)

    # print degen
    f_df['degen'] = [degen.get(k, 1) for k in f_df['gi']]

    #2.3.2. Remove seqs that do not have support outside their species
    # if they are not curated or RefSeq NP.
    ###########################################################

    f_df = f_df.sort(
        ['RefSeq', 'degen'], ascending=False
    )  # so that RefSeq record get priority on removing duplicates
    f_df = f_df[(f_df['degen'] > f_df['ident']) | (f_df['curated'] == True) |
                (f_df['RefSeq'] == 2)]
    print "After removing mined seqs with no support in neighboring species: ", len(
        f_df)

    #2.3.3. Shuffle sequnces, so that upon further selection, RefSeq and high degeneracy get priority
    ###########################################################
    #RefSeq and degenerate sequence get priority
    # title+=' 1ptax'
    f_df = f_df.sort(
        ['RefSeq', 'degen'], ascending=False
    )  # so that RefSeq record get priority on removing duplicates
    # print f_df[0:10]
    # f_df=f_df.drop_duplicates(['taxid','hist_var'])

    #2.4 Take one best representative per specific taxonomic rank (e.g. genus)
    ############################################################
    pruningrank = 'genus'
    print "Pruning taxonomy by ", pruningrank

    title += ' , one seq. per %s' % pruningrank
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids = list(f_df['taxid'])  #old list
    grouped_taxids = group_taxids(seqtaxids, rank=pruningrank)
    # print seqtaxids
    # print grouped_taxids
    #Now we need to take best representative
    #refseq NP, curated, or the one with largest degeneracy
    new_gis = list()
    for tids in grouped_taxids:
        t_df = f_df[f_df['taxid'].isin(tids)]
        #try take curated first
        if (len(t_df[t_df['curated'] == True]) > 0):
            new_gis.append(t_df.loc[t_df.curated == True, 'gi'].values[0])
            continue
        #try take NP records nest
        #RefSeq 2 means NP, 1 means XP
        if (len(t_df[t_df['RefSeq'] == 2]) > 0):
            new_gis.append(t_df.loc[t_df.RefSeq == 2, 'gi'].values[0])
            continue
        # take best degenerate otherwise
        else:
            t_df = t_df.sort(['degen', 'RefSeq'], ascending=False)
            new_gis.append(t_df['gi'].iloc[0])

    f_df = f_df[f_df['gi'].isin(new_gis)]

    print "After pruning taxonomy we have: ", len(f_df)

    #2.5. Check seq for sanity - needs to be checked!
    ##############################################
    # title+=' seqQC '

    # print "Checkig sequence quality"
    # newgis=list()
    # for i,row in f_df.iterrows():
    #     gi=row['gi']
    #     seq=fasta_dict[str(gi)].seq
    #     hist_type=row['hist_type']
    #     hist_var=row['hist_var']
    #     if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)):
    #         newgis.append(gi)
    # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    # print len(f_df)

    #3. Make a list of seq with good ids and descriptions
    ##############################################

    f_fasta_dict = {
        key: value
        for (key, value) in fasta_dict.iteritems()
        if int(key) in list(f_df['gi'])
    }
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    f_fasta_dict = {
        key: SeqRecord(
            id=key,
            description=f_df.loc[f_df.gi == int(key), 'hist_var'].values[0] +
            ' ' + taxid2name[f_df.loc[f_df.gi == int(key), 'taxid'].values[0]],
            seq=value.seq)
        for (key, value) in f_fasta_dict.iteritems()
    }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa = muscle_aln(f_fasta_dict.values(), gapopen=float(-20))
    AlignIO.write(msa, "int_data/example_msa.fasta", "fasta")

    msa_annot = MultipleSeqAlignment([
        SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(
            ' ', '-')),
                  id='annotation',
                  name='')
    ])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta")

    for i in range(len(msa)):
        gi = msa[i].id
        msa[i].description = f_fasta_dict[gi].description.replace(
            'canonical', 'ca')
    msa.sort(key=lambda x: x.description)

    #5. Visualize MSA############
    aln2html(msa,
             'example_h2a.html',
             features=get_hist_ss_in_aln_for_html(msa, 'H2A', 0),
             title="canonical H2A alignment",
             description=True,
             field1w=10,
             field2w=35)

    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    # title+=' gaptrim'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr = trim_hist_aln_to_core(msa)
    # msa_tr=msa
    # print get_hist_ss_in_aln_for_shade(msa_tr,below=True)

    # exit()

    #7. Vizualize MSA with ete2.##########
    taxid2gi = {
        f_df.loc[f_df.gi == int(gi), 'taxid'].values[0]: gi
        for gi in list(f_df['gi'])
    }
    gi2variant = {
        gi: f_df.loc[f_df.gi == int(gi), 'hist_var'].values[0]
        for gi in list(f_df['gi'])
    }

    msa_dict = {i.id: i.seq for i in msa_tr}
    t = ncbi.get_topology(list(f_df['taxid']), intermediate_nodes=False)
    a = t.add_child(name='annotation')
    a.add_feature('sci_name', 'annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()

    def layout(node):
        # print node.rank
        # print node.sci_name
        if getattr(node, "rank", None):
            if (node.rank in ['order', 'class', 'phylum', 'kingdom']):
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        if node.is_leaf() and not node.name == 'annotation':
            s = str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(
                s, [[0, len(s), "seq", 10, 10, None, None, None]],
                scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi = taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' ' + str(gi) + ' '),
                             node,
                             column=1,
                             position="aligned")
            add_face_to_node(TextFace('      ' + str(int(node.name)) + ' '),
                             node,
                             column=2,
                             position="aligned")
            add_face_to_node(TextFace('      ' + str(gi2variant[gi]) + ' '),
                             node,
                             column=3,
                             position="aligned")

        if node.is_leaf() and node.name == 'annotation':
            s = get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(
                s, [[0, len(s), "seq", 10, 10, None, None, None]],
                scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' ' + 'NCBI_GI' + ' '),
                             node,
                             column=1,
                             position="aligned")
            add_face_to_node(TextFace('       ' + 'NCBI_TAXID' + ' '),
                             node,
                             column=2,
                             position="aligned")
            add_face_to_node(TextFace('       ' + 'Variant' + '       '),
                             node,
                             column=3,
                             position="aligned")

    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts)

    #10. Conservation############
    #############################
    features = get_hist_ss_in_aln_for_shade(msa_tr, below=True)
    cn = add_consensus(msa_tr, threshold=0.5)[-2:-1]
    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_ent_unw',
                  map(lambda x: log(20) + x,
                      map(float, cons_prof(msa_tr, f=0, c=0))),
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_ent_unw_norm',
                  map(lambda x: log(20) + x,
                      map(float, cons_prof(msa_tr, f=0, c=0, norm="T"))),
                  cn,
                  features,
                  axis='conservation')

    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw_renorm1',
                  map(float, cons_prof(msa_tr, f=0, c=2, m=1)),
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_sofp_unw',
                  map(float, cons_prof(msa_tr, f=0, c=2, m=0)),
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_sofp_psic_renorm1',
                  map(float, cons_prof(msa_tr, f=2, c=2, m=1)),
                  cn,
                  features,
                  axis='conservation')
# Chains D and H are assigned the same yeast sequence.
yeast_dh_sequence = 'RKETYSSYIYKVLKQTHPDTGISQKSMSILNSFVNDIFERIATEASKLAAYNKKSTISAREIQTAVRLILPGELAKHAVSEGTRAVTKYSSSTQA'
seqs_yeast['D'] = Seq(yeast_dh_sequence)
seqs_yeast['H'] = Seq(yeast_dh_sequence)

# s = p.get_structure('1id3', '1id3.pdb')
# ppb=PPBuilder()
# seqs_yeast=dict()
# for i in ['A','B','C','D','E','F','G','H']:
#     seqs_yeast[i]=ppb.build_peptides(s[0][i])[0].get_sequence()
# print nucl_yeast

# Align each 'xen' chain with the matching 'yeast' chain and attach the PIR
# header fields to every pairwise alignment.
# NOTE(review): an earlier comment asked for a high gap penalty, but
# muscle_aln is called here without any gap parameters -- confirm.
chain_ids = 'ABCDEFGH'

seq_aln = {}
for i in chain_ids:
    seqlist = [SeqRecord(seqs_xen[i], id='xen'),
               SeqRecord(seqs_yeast[i], id='yeast')]
    aln_pir = L_fasta2pir.aln(muscle_aln(seqlist))
    # 'xen' is the structure entry limited to chain i; 'yeast' is the
    # sequence entry.
    aln_pir.add_pir_info('xen', 'structureX', 'xen_nucl',
                         'FIRST', i, 'LAST', i)
    aln_pir.add_pir_info('yeast', 'sequence', 'nucl_yeast')
    seq_aln[i] = aln_pir

# Combine the per-chain alignments, in chain order, into one multi-chain
# alignment and store it as a PIR file.
per_chain_alns = [seq_aln[i] for i in chain_ids]
mult_aln = L_fasta2pir.aln_mult_chains(per_chain_alns)
mult_aln.write('aln.pir')

#####
# Now let's do MODELLER
env = environ()  # new MODELLER environment to build this model in
env.io.atom_files_directory = ['.', 'data']
# change to folder with data files
log.verbose()    # request verbose output
# directories for input atom files
    f_hist_df = hist_df[(hist_df["hist_var"] == "canonical_H2B") & (hist_df["curated"] == False)]
    f_hist_df = f_hist_df.drop_duplicates(["taxid", "hist_var"])[0:200]
    f_fasta_dict = {
        key: value for (key, value) in fasta_dict.iteritems() if key in list(f_hist_df["gi"])
    }  # get fasta dict
    # relabel with arbitrary index
    f_fasta_dict_rel = {
        key: SeqRecord(id=str(index), seq=f_fasta_dict[key].seq) for (index, key) in enumerate(f_fasta_dict)
    }

    print len(f_fasta_dict)

    # 2. Make MSA using my function
    #################
    # msa=muscle_aln(f_fasta_dict_rel.values()) #function takes a list of sequence records!!! #ACTIVATE FOR TEX
    msa = muscle_aln(f_fasta_dict.values())  # function takes a list of sequence records!!! #ACTIVATE FOR TEX
    AlignIO.write(msa, "int_data/msa.fasta", "fasta")

    # 3. Get an annotated PDF of histone alignment using TEXSHADE - old way
    ##############
    # get_pdf(hist_name,align,title,shading_modes=['similar'],logo=False,hideseqs=False,splitN=20,setends=[],ruler=False):
    # The sequence names should be unique and without '|'
    if 0:
        get_pdf("H2B", msa, "H2B aln", logo=True, ruler=True)

    # 4.output to html

    aln2html(msa, "int_data/h2b.html", features=get_hist_ss_in_aln_for_html(msa, "H2B", 1))

    # 5. TEST IT: Annotate our MSA using features.json - new experimental way
    #################