Exemple #1
0
def get_features_in_aln(alignment, variant, save_dir="", save_gff=True):
    """Return the variant features computed for the consensus of an alignment.

    The consensus is built with ``SummaryInfo.dumb_consensus`` (threshold 0.1,
    ambiguous columns reported as 'X'), wrapped in a ``Sequence`` named
    "Consensus" and handed to ``get_variant_features``.

    :param alignment: alignment passed to ``SummaryInfo``.
    :param variant: value stored as ``variant_id`` of the consensus ``Sequence``.
    :param save_dir: forwarded unchanged to ``get_variant_features``.
    :param save_gff: forwarded unchanged to ``get_variant_features``.
    :return: whatever ``get_variant_features`` returns.
    """
    #Let's extract consensus
    summary = SummaryInfo(alignment)
    cons = summary.dumb_consensus(threshold=0.1, ambiguous='X')
    # str() replaces the obsolete Seq.tostring(), which newer Biopython
    # releases no longer provide.
    seq = Sequence(id="Consensus", variant_id=variant, taxonomy_id=1, sequence=str(cons))
    return get_variant_features(seq, save_dir=save_dir, save_gff=save_gff)
    def test_proteins(self):
        a = MultipleSeqAlignment([
            SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-"), id="ID001"),
            SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*"), id="ID002"),
            SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*"), id="ID003")
        ])
        self.assertEqual(32, a.get_alignment_length())

        s = SummaryInfo(a)

        c = s.dumb_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")

        c = s.gap_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")

        m = s.pos_specific_score_matrix(chars_to_ignore=["-", "*"], axis_seq=c)
        self.assertEqual(
            str(m),
            """    A   D   E   F   G   H   I   K   L   M   N   P   Q   R   S   W   Y
M  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
H  0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0
X  2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
F  0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
L  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
K  0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
R  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
P  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0
E  0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
N  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
W  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
""")

        letters = IUPACData.protein_letters
        base_freq = 1.0 / len(letters)
        e_freq_table = {letter: base_freq for letter in letters}
        ic = s.information_content(e_freq_table=e_freq_table,
                                   chars_to_ignore=["-", "*"])
        self.assertAlmostEqual(ic, 133.061475107, places=6)
Exemple #3
0
def trim_aln_to_seq_length(alignment,sequence):
    """Trim alignment to a sequence, i.e. leave only positions that correspond to this sequence span.

    The consensus of ``alignment`` (``dumb_consensus`` with threshold 0.1 and
    'X' for ambiguous columns) is globally aligned against ``sequence`` with
    EMBOSS needle. The column range is read from the second row of that
    pairwise alignment (the ``sequence`` row): it starts at the first
    occurrence of ``sequence[0]`` and ends after the last occurrence of
    ``sequence[-1]``.

    :param alignment: multiple sequence alignment to trim.
    :param sequence: key sequence defining the span to keep.
    :return: ``alignment[:, begin:end]``.
    """

    n1=str(uuid.uuid4())
    n2=str(uuid.uuid4())
    cons_fasta=n1+'.fasta'
    key_fasta=n2+'.fasta'
    needle_out=n1+'.txt'
    #Get consensus
    a=SummaryInfo(alignment)
    cons=a.dumb_consensus(threshold=0.1, ambiguous='X')
    try:
        #Needle it
        SeqIO.write([SeqRecord(cons,id='CONS',name='CONS')],cons_fasta,'fasta')
        SeqIO.write([SeqRecord(sequence,id='KEY',name='KEY')],key_fasta,'fasta')

        #Now we will redo it with Needleman-Wunsch - the global alignment
        needle_cline = NeedleCommandline(asequence=cons_fasta, bsequence=key_fasta,gapopen=10, gapextend=0.5, outfile=needle_out)
        needle_cline()

        align = AlignIO.read(needle_out, "emboss")
    finally:
        # Always remove the temporary files, even when needle or parsing fails.
        # Removal is best effort: a file that was never created is not an error.
        for path in (cons_fasta, key_fasta, needle_out):
            try:
                os.remove(path)
            except OSError:
                pass
    #first seq is consensus, we need to get borders using the second one.
    seq=str(align[1,:].seq)
    begin=seq.index(str(sequence[0]))
    end=len(seq)-seq[::-1].index(str(sequence[-1]))
    # print() with a single argument prints the same text on Python 2 and 3.
    print(begin)
    print(end)
    return alignment[:,begin:end]
def trim_hist_aln_to_core(msa):
    """Trim a histone alignment to the core region of its histone type.

    The histone type is obtained by passing the alignment consensus to
    ``identify_hist_type``; the matching core template is then used as the
    key sequence for ``trim_aln_to_seq_length``.
    """
    # Per histone type: (full-length template, core start, core end).
    full_templates = {
        "H3": (
            "ARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYRPGTVALREIRRYQKSTELLIRKLPFQRLVREIAQDFKTDLRFQSSAVMALQEASEAYLVALFEDTNLCAIHAKRVTIMPKDIQLARRIRGERA",
            43,
            114,
        ),
        "H4": (
            "SGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKRISGLIYEETRGVLKVFLENVIRDAVTYTEHAKRKTVTAMDVVYALKRQGRTLYGFGG",
            23,
            93,
        ),
        "H2A": (
            "SGRGKQGGKTRAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERVGAGAPVYLAAVLEYLTAEILELAGNAARDNKKTRIIPRHLQLAVRNDEELNKLLGRVTIAQGGVLPNIQSVLLPKKTESSKSKSK",
            15,
            119,
        ),
        "H2B": (
            "AKSAPAPKKGSKKAVTKTQKKDGKKRRKTRKESYAIYVYKVLKQVHPDTGISSKAMSIMNSFVNDVFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSAK",
            33,
            120,
        ),
    }

    # Slice each full template down to its core span.
    templ = {}
    for hist_type, (residues, core_start, core_end) in full_templates.items():
        templ[hist_type] = Seq(residues, IUPAC.protein)[core_start:core_end]

    consensus = SummaryInfo(msa).dumb_consensus(threshold=0.1, ambiguous="X")
    core_template = templ[identify_hist_type(consensus)]
    return trim_aln_to_seq_length(msa, core_template)
Exemple #5
0
def trim_aln_to_seq(alignment,sequence):
    """Trim alignment to a sequence, i.e. leave only positions that correspond to this sequence.

    Note that the sequence should be incorporable into the alignment without
    additional gaps in the alignment.

    The alignment consensus is globally aligned against ``sequence`` with
    EMBOSS needle. The original alignment rows are appended to that pairwise
    result, the consensus row is dropped, and ``trim_aln_to_key_seq`` does the
    trimming; the key row is removed from what it returns.

    :param alignment: multiple sequence alignment to trim.
    :param sequence: key sequence to trim to.
    :return: ``trim_aln_to_key_seq(...)`` result without its first row.
    """

    n1=str(uuid.uuid4())
    n2=str(uuid.uuid4())
    cons_fasta=n1+'.fasta'
    key_fasta=n2+'.fasta'
    needle_out=n1+'.txt'
    #Get consensus
    a=SummaryInfo(alignment)
    cons=a.dumb_consensus(threshold=0.1, ambiguous='X')
    try:
        #Needle it
        SeqIO.write([SeqRecord(cons,id='CONS',name='CONS')],cons_fasta,'fasta')
        SeqIO.write([SeqRecord(sequence,id='KEY',name='KEY')],key_fasta,'fasta')

        #Now we will redo it with Needleman-Wunsch - the global alignment
        needle_cline = NeedleCommandline(asequence=cons_fasta, bsequence=key_fasta,gapopen=10, gapextend=0.5, outfile=needle_out)
        needle_cline()

        align = AlignIO.read(needle_out, "emboss")
    finally:
        # Always remove the temporary files, even when needle or parsing fails.
        # Removal is best effort: a file that was never created is not an error.
        for path in (cons_fasta, key_fasta, needle_out):
            try:
                os.remove(path)
            except OSError:
                pass
    align.extend(alignment)
    # Drop the consensus row so the key sequence becomes the first row.
    keyed=align[1:,:]

    return trim_aln_to_key_seq(keyed,sequence)[1:,:]
Exemple #6
0
    def test_nucleotides(self):
        filename = "GFF/multi.fna"
        format = "fasta"
        alignment = AlignIO.read(filename, format, alphabet=unambiguous_dna)
        summary = SummaryInfo(alignment)

        c = summary.dumb_consensus(ambiguous="N")
        self.assertEqual(str(c), "NNNNNNNN")

        c = summary.gap_consensus(ambiguous="N")
        self.assertEqual(str(c), "NNNNNNNN")

        expected = {"A": 0.25, "G": 0.25, "T": 0.25, "C": 0.25}

        m = summary.pos_specific_score_matrix(chars_to_ignore=["-"],
                                              axis_seq=c)
        self.assertEqual(
            str(m), """    A   C   G   T
N  2.0 0.0 1.0 0.0
N  1.0 1.0 1.0 0.0
N  1.0 0.0 2.0 0.0
N  0.0 1.0 1.0 1.0
N  1.0 2.0 0.0 0.0
N  0.0 2.0 1.0 0.0
N  1.0 2.0 0.0 0.0
N  0.0 2.0 1.0 0.0
""")

        # Have a generic alphabet, without a declared gap char, so must tell
        # provide the frequencies and chars to ignore explicitly.
        ic = summary.information_content(e_freq_table=expected,
                                         chars_to_ignore=["-"])
        self.assertAlmostEqual(ic, 7.32029999423075, places=6)
Exemple #7
0
def calcAbundance(alignmentFile, consensusFile, abundanceFile, abundancePercentFile, verbose):
    """Write the residue abundance matrix of an alignment, absolute and in percent.

    The alignment is read from ``alignmentFile`` (FASTA). If the record in
    ``consensusFile`` has the alignment's length it is used as the matrix
    axis; otherwise ``consensusFile`` is overwritten with a freshly computed
    dumb consensus and the matrix is built without an explicit axis. The
    absolute matrix goes to ``abundanceFile``; the same matrix rescaled to
    percent of the number of alignment rows goes to ``abundancePercentFile``.
    With ``verbose`` both matrices are also printed.
    """
    print('Calculating the abundance matrix...')
    alignment = AlignIO.read(alignmentFile, "fasta")
    summary = SummaryInfo(alignment)
    consensusSeq = SeqIO.read(consensusFile, 'fasta')

    lengths_differ = len(consensusSeq) != alignment.get_alignment_length()
    if lengths_differ:
        # Stored consensus does not fit this alignment: regenerate it on disk.
        with open(consensusFile, "w") as consensus_handle:
            fresh_record = SeqRecord(summary.dumb_consensus(), id='consensus')
            SeqIO.write(fresh_record, consensus_handle, "fasta")
        abundanceMatrix = summary.pos_specific_score_matrix()
    else:
        abundanceMatrix = summary.pos_specific_score_matrix(consensusSeq)

    if verbose:
        print("Abundance matrix (absolute values):")
        print(str(abundanceMatrix))
    with open(abundanceFile, 'w') as absolute_handle:
        absolute_handle.write(str(abundanceMatrix))

    # Rescale every count, in place, to a percentage of the alignment rows.
    row_count = float(len(alignment))
    for column, counts in enumerate(abundanceMatrix):
        for residue, count in counts.items():
            abundanceMatrix[column][residue] = 100.0 * float(count) / row_count

    if verbose:
        print("Abundance matrix (percentages):")
        print(str(abundanceMatrix))
    with open(abundancePercentFile, 'w') as percent_handle:
        percent_handle.write(str(abundanceMatrix))
    print('OK')
Exemple #8
0
def trim_aln_to_seq_length(alignment, sequence):
    """Trim alignment to a sequence, i.e. leave only positions that correspond to this sequence span.

    The consensus of ``alignment`` (``dumb_consensus`` with threshold 0.1 and
    'X' for ambiguous columns) is globally aligned against ``sequence`` with
    EMBOSS needle. The column range is read from the second row of that
    pairwise alignment (the ``sequence`` row): it starts at the first
    occurrence of ``sequence[0]`` and ends after the last occurrence of
    ``sequence[-1]``.

    :param alignment: multiple sequence alignment to trim.
    :param sequence: key sequence defining the span to keep.
    :return: ``alignment[:, begin:end]``.
    """

    n1 = str(uuid.uuid4())
    n2 = str(uuid.uuid4())
    cons_fasta = n1 + '.fasta'
    key_fasta = n2 + '.fasta'
    needle_out = n1 + '.txt'
    #Get consensus
    a = SummaryInfo(alignment)
    cons = a.dumb_consensus(threshold=0.1, ambiguous='X')
    try:
        #Needle it
        SeqIO.write([SeqRecord(cons, id='CONS', name='CONS')], cons_fasta,
                    'fasta')
        SeqIO.write([SeqRecord(sequence, id='KEY', name='KEY')], key_fasta,
                    'fasta')

        #Now we will redo it with Needleman-Wunsch - the global alignment
        needle_cline = NeedleCommandline(asequence=cons_fasta,
                                         bsequence=key_fasta,
                                         gapopen=10,
                                         gapextend=0.5,
                                         outfile=needle_out)
        needle_cline()

        align = AlignIO.read(needle_out, "emboss")
    finally:
        # Always remove the temporary files, even when needle or parsing fails.
        # Removal is best effort: a file that was never created is not an error.
        for path in (cons_fasta, key_fasta, needle_out):
            try:
                os.remove(path)
            except OSError:
                pass
    #first seq is consensus, we need to get borders using the second one.
    seq = str(align[1, :].seq)
    begin = seq.index(str(sequence[0]))
    end = len(seq) - seq[::-1].index(str(sequence[-1]))
    # print() with a single argument prints the same text on Python 2 and 3.
    print(begin)
    print(end)
    return alignment[:, begin:end]
Exemple #9
0
def trim_aln_to_seq(alignment, sequence):
    """Trim alignment to a sequence, i.e. leave only positions that correspond to this sequence.

    Note that the sequence should be incorporable into the alignment without
    additional gaps in the alignment.

    The alignment consensus is globally aligned against ``sequence`` with
    EMBOSS needle. The original alignment rows are appended to that pairwise
    result, the consensus row is dropped, and ``trim_aln_to_key_seq`` does the
    trimming; the key row is removed from what it returns.

    :param alignment: multiple sequence alignment to trim.
    :param sequence: key sequence to trim to.
    :return: ``trim_aln_to_key_seq(...)`` result without its first row.
    """

    n1 = str(uuid.uuid4())
    n2 = str(uuid.uuid4())
    cons_fasta = n1 + '.fasta'
    key_fasta = n2 + '.fasta'
    needle_out = n1 + '.txt'
    #Get consensus
    a = SummaryInfo(alignment)
    cons = a.dumb_consensus(threshold=0.1, ambiguous='X')
    try:
        #Needle it
        SeqIO.write([SeqRecord(cons, id='CONS', name='CONS')], cons_fasta,
                    'fasta')
        SeqIO.write([SeqRecord(sequence, id='KEY', name='KEY')], key_fasta,
                    'fasta')

        #Now we will redo it with Needleman-Wunsch - the global alignment
        needle_cline = NeedleCommandline(asequence=cons_fasta,
                                         bsequence=key_fasta,
                                         gapopen=10,
                                         gapextend=0.5,
                                         outfile=needle_out)
        needle_cline()

        align = AlignIO.read(needle_out, "emboss")
    finally:
        # Always remove the temporary files, even when needle or parsing fails.
        # Removal is best effort: a file that was never created is not an error.
        for path in (cons_fasta, key_fasta, needle_out):
            try:
                os.remove(path)
            except OSError:
                pass
    align.extend(alignment)
    # Drop the consensus row so the key sequence becomes the first row.
    keyed = align[1:, :]

    return trim_aln_to_key_seq(keyed, sequence)[1:, :]
    def test_proteins(self):
        alpha = HasStopCodon(Gapped(generic_protein, "-"), "*")
        a = MultipleSeqAlignment([
            SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-", alpha),
                      id="ID001"),
            SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*", alpha),
                      id="ID002"),
            SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*", alpha),
                      id="ID003")
        ])
        self.assertEqual(32, a.get_alignment_length())

        s = SummaryInfo(a)

        c = s.dumb_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")

        c = s.gap_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")

        m = s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
        self.assertEqual(
            str(m),
            """    A   D   E   F   G   H   I   K   L   M   N   P   Q   R   S   W   Y
M  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
H  0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0
X  2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
F  0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
L  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
K  0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
R  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
P  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0
E  0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
N  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
W  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
""")

        ic = s.information_content(chars_to_ignore=['-', '*'])
        self.assertAlmostEqual(ic, 133.061475107, places=6)
    def test_proteins(self):
        alpha = HasStopCodon(Gapped(generic_protein, "-"), "*")
        a = MultipleSeqAlignment([
                SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-", alpha), id="ID001"),
                SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*", alpha), id="ID002"),
                SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*", alpha), id="ID003")])
        self.assertEqual(32, a.get_alignment_length())

        s = SummaryInfo(a)

        c = s.dumb_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")

        c = s.gap_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")

        m = s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
        self.assertEqual(str(m), """    A   D   E   F   G   H   I   K   L   M   N   P   Q   R   S   W   Y
M  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
H  0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0
X  2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
F  0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
L  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
K  0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
R  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
P  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0
E  0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
N  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
W  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
""")

        ic = s.information_content(chars_to_ignore=['-', '*'])
        self.assertAlmostEqual(ic, 133.061475107, places=6)
Exemple #12
0
def get_aln_and_features(request, ids=None):
    """Align up to 50 sequences and return them with their features as JSON.

    :param request: Django request; when ``ids`` is None the ids are taken
        from the repeated ``id`` GET parameter.
    :param ids: optional explicit list of sequence ids.
    :return: the string ``"false"`` when no ids are available,
        ``(None, None)`` when no stored sequence matches, otherwise a
        ``JsonResponse`` with ``seqs`` and ``features`` (GFF text).
    """
    from tools.hist_ss import templ, get_hist_ss_in_aln
    import subprocess
    import StringIO
    from Bio.Align import MultipleSeqAlignment
    from Bio.Align.AlignInfo import SummaryInfo

    # Only fall back to the request when the caller did not pass ids; the
    # previous combined condition rejected every explicitly passed id list.
    if ids is None:
        if request.method == "GET" and "id" in request.GET:
            ids = request.GET.getlist("id")
        else:
            #Returning 'false' stops Bootstrap table
            return "false"

    sequences = Sequence.objects.filter(id__in=ids[:50])
    if len(sequences) == 0:
        return None, None
    elif len(sequences) == 1:
        #Already aligned to core histone
        # Keep the model instance before ``sequences`` is rebound to a list;
        # calling .first() on that list raised AttributeError.
        single = sequences.first()
        canonical = {"name":"canonical{}".format(single.variant.core_type), "seq":str(templ[single.variant.core_type].seq)}
        features = single.features
        # NOTE(review): ``to_dict`` is called on the ``sequence`` attribute here
        # but on the Sequence object itself in the multi-sequence branch below
        # (``cons.to_dict()``) - confirm which one is intended.
        sequences = [canonical, single.sequence.to_dict()]
    else:
        try:
            # Pick the core histone type shared by most of the selected sequences.
            hist_type = max(
               [(hist, sequences.filter(variant__core_type_id=hist).count()) for hist in ["H2A", "H2B", "H3", "H4", "H1"]],
               key=lambda x:x[1]
               )[0]
        except ValueError:
            hist_type = "Unknown"

        # muscle is expected next to the running Python interpreter.
        muscle = os.path.join(os.path.dirname(sys.executable), "muscle")
        process = subprocess.Popen([muscle], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        sequences = "\n".join([s.format() for s in sequences])
        aln, error = process.communicate(sequences)
        seqFile = StringIO.StringIO()
        seqFile.write(aln)
        seqFile.seek(0)
        sequences = list(SeqIO.parse(seqFile, "fasta")) #Not in same order, but does it matter?
        msa = MultipleSeqAlignment(sequences)

        save_dir = os.path.join(os.path.sep, "tmp", "HistoneDB")
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        hv,ss = get_hist_ss_in_aln(msa, hist_type=hist_type, save_dir=save_dir, debug=False)
        a = SummaryInfo(msa)
        # str() replaces the obsolete Seq.tostring().
        cons = Sequence(id="consensus", sequence=str(a.dumb_consensus(threshold=0.1, ambiguous='X')))
        features = Features.from_dict(cons, ss)

        sequences = [{"name":s.id, "seq":str(s.seq)} for s in sequences]
        # The consensus is shown as the first row.
        sequences.insert(0, cons.to_dict())

    result = {"seqs":sequences, "features":features.full_gff()}
    return JsonResponse(result, safe=False)
 def lymphocyte_factory( self ):
     """Rebuild ``self.lymphocytes`` from the consensus of ``self.friendly``.

     The dumb consensus is stored on ``self.consensus``; then
     ``self.num_lymphocytes`` candidates are derived from its data via
     ``guess_gaps`` followed by ``scramble``, each wrapped in a
     ``Lymphocyte``. ``compute_accum_weight`` is called at the end.
     """
     wanted = self.num_lymphocytes
     self.lymphocytes = []
     consensus = SummaryInfo( self.friendly ).dumb_consensus()
     self.consensus = consensus
     for _ in range( wanted ):
         candidate = self.scramble( self.guess_gaps( consensus.data ) )
         self.lymphocytes.append( Lymphocyte( candidate ) )
     self.compute_accum_weight()
Exemple #14
0
def get_features_in_aln(alignment, variant, save_dir="", save_gff=True):
    """Return the variant features computed for the consensus of an alignment.

    The consensus (threshold 0.1, ambiguous columns as 'X') is wrapped in a
    ``Sequence`` named "Consensus" and passed to ``get_variant_features``
    together with ``save_dir`` and ``save_gff``.
    """
    consensus = SummaryInfo(alignment).dumb_consensus(threshold=0.1, ambiguous='X')
    consensus_record = Sequence(
        id="Consensus", variant_id=variant, taxonomy_id=1, sequence=str(consensus)
    )
    return get_variant_features(
        consensus_record, save_dir=save_dir, save_gff=save_gff
    )
Exemple #15
0
def get_hist_ss_in_aln(alignment, hist_type='Unknown', save_dir="", debug=True, save_censesus=False):
    """Returns sequence elements in histone alignment, all numbers assume first element in seq has number 0!!! Not like in PDB

    The elements are computed by ``get_hist_ss`` on the alignment consensus
    (``dumb_consensus`` with threshold 0.1, ambiguous columns as 'X').

    :param alignment: alignment passed to ``SummaryInfo``.
    :param hist_type: forwarded to ``get_hist_ss``.
    :param save_dir: forwarded to ``get_hist_ss``.
    :param debug: when true, print the alignment and the consensus.
    :param save_censesus: when true, the consensus is returned as a third item.
    :return: ``(hv, ss)`` or ``(hv, ss, cons)``.
    """

    #Let's extract consensus
    # print() with a single argument prints the same text on Python 2 and 3,
    # unlike the print statement, which is a syntax error on Python 3.
    if debug:
        print(alignment)
    a=SummaryInfo(alignment)
    cons=a.dumb_consensus(threshold=0.1, ambiguous='X')
    if debug:
        print("Consensus")
        print(cons)
    # NOTE(review): the fourth positional argument is always True here,
    # independent of ``debug`` - confirm against the get_hist_ss signature.
    hv, ss = get_hist_ss(cons,hist_type,save_dir,True)
    if save_censesus:
        return hv,ss,cons
    return hv,ss
def annotate_hist_msa(msa, htype, variant=None):
    """Prepend a feature line from features.json to a histone alignment.

    The "General<htype>" template sequence and its ``feature1`` string are
    read from ``inp_data/features.json``. The template is aligned to the
    alignment consensus with ``muscle_aln``, and the feature string is gapped
    so that it lines up with the consensus positions. The result is a new
    alignment whose first row is the gapped feature line, followed by the
    rows of ``msa``.
    """
    # read json
    with open("inp_data/features.json") as handle:
        per_type = json.load(handle)[htype]
    genseq = per_type["General" + htype]["sequence"]
    genf = per_type["General" + htype]["feature1"]

    consensus = SummaryInfo(msa).dumb_consensus(threshold=0.1, ambiguous="X")
    auxmsa = muscle_aln([
        SeqRecord(id="consensus", seq=consensus),
        SeqRecord(id="template", seq=Seq(genseq)),
    ])
    auxmsa.sort()

    gapped_template = str(auxmsa[1].seq)
    gapped_cons = str(auxmsa[0].seq)

    # Keep only the template columns where the consensus has a residue.
    template_on_cons = "".join(
        template_char
        for cons_char, template_char in zip(gapped_cons, gapped_template)
        if cons_char != "-"
    )

    # now we need to gap feature
    feature_chars = []
    gaps_so_far = 0
    for position, template_char in enumerate(template_on_cons):
        if template_char == "-":
            gaps_so_far = gaps_so_far + 1
            feature_chars.append("-")
        else:
            # Index into the ungapped feature string: skip the gaps seen so far.
            feature_chars.append(genf[position - gaps_so_far])
    feature_line = "".join(feature_chars)

    annotated = MultipleSeqAlignment(
        [SeqRecord(id="gi|features|id", description=htype, seq=Seq(feature_line))]
    )
    annotated.extend(msa)
    return annotated
    def test_nucleotides(self):
        filename = "GFF/multi.fna"
        format = "fasta"
        alignment = AlignIO.read(filename, format, alphabet=unambiguous_dna)
        summary = SummaryInfo(alignment)

        c = summary.dumb_consensus(ambiguous="N")
        self.assertEqual(str(c), 'NNNNNNNN')
        self.assertNotEqual(c.alphabet, unambiguous_dna)
        self.assertTrue(isinstance(c.alphabet, DNAAlphabet))

        c = summary.gap_consensus(ambiguous="N")
        self.assertEqual(str(c), 'NNNNNNNN')
        self.assertNotEqual(c.alphabet, unambiguous_dna)
        self.assertTrue(isinstance(c.alphabet, DNAAlphabet))

        expected = FreqTable({"A": 0.25, "G": 0.25, "T": 0.25, "C": 0.25},
                             FREQ, unambiguous_dna)

        m = summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                              axis_seq=c)
        self.assertEqual(str(m), """    A   C   G   T
N  2.0 0.0 1.0 0.0
N  1.0 1.0 1.0 0.0
N  1.0 0.0 2.0 0.0
N  0.0 1.0 1.0 1.0
N  1.0 2.0 0.0 0.0
N  0.0 2.0 1.0 0.0
N  1.0 2.0 0.0 0.0
N  0.0 2.0 1.0 0.0
""")

        # Have a generic alphabet, without a declared gap char, so must tell
        # provide the frequencies and chars to ignore explicitly.
        ic = summary.information_content(e_freq_table=expected,
                                         chars_to_ignore=['-'])
        self.assertAlmostEqual(ic, 7.32029999423075, places=6)
Exemple #18
0
def get_seed_aln_and_features(request, seed):
    """Serve the seed alignment of a histone type or variant with its features.

    ``seed`` is looked up first as a ``Histone`` id and then as a ``Variant``
    id; the seed files live under ``STATIC_ROOT_AUX/browse/seeds``.

    GET parameters read:
      * ``download`` - "true" adds an attachment Content-Disposition header.
      * ``format`` - "fasta", "gff", "pdf"; anything else yields JSON.
      * ``limit`` - integer, non-integer values fall back to 0.
      * ``consensus`` - "limit" or "all" prepend a consensus row; any other
        non-empty value is treated as "all".

    :return: ``HttpResponseNotFound`` when ``seed`` matches neither model,
        otherwise an ``HttpResponse`` in the requested format.
    """
    from Bio.Align import MultipleSeqAlignment
    from Bio.Align.AlignInfo import SummaryInfo

    seed_file = os.path.join(settings.STATIC_ROOT_AUX, "browse", "seeds")
    try:
        histone = Histone.objects.get(id=seed)
        seed_file = os.path.join(seed_file, "{}".format(histone.id))
    except Histone.DoesNotExist:
        try:
            variant = Variant.objects.get(id=seed)
            seed_file = os.path.join(seed_file, variant.hist_type.id, "{}".format(variant.id))
            # the default names for canonical are with underscores, so we do not need to convert back. ALEXEY, 30/12/15
        except Variant.DoesNotExist:
            return HttpResponseNotFound('<h1>No histone variant with name {}</h1>'.format(seed))

    download = request.GET.get("download", False) == "true"

    format = request.GET.get("format", "json")

    try:
        limit = int(request.GET.get("limit", 0))
    except ValueError:
        limit = 0

    consensus = request.GET.get("consensus", False)

    if consensus not in ["limit", "all", False]:
        consensus = "all"

    response = HttpResponse(content_type='text')

    if download:
        response['Content-Disposition'] = 'attachment; filename="{}.{}"'.format(seed, format)

    sequences = SeqIO.parse("{}.fasta".format(seed_file), "fasta")

    if consensus:
        sequences = [s for i, s in enumerate(sequences) if consensus == "all" or (consensus == "limit" and i < limit)]
        msa = MultipleSeqAlignment(sequences)
        a = SummaryInfo(msa)
        sequences.insert(0, SeqRecord(id="Consensus", description="", seq=a.dumb_consensus(threshold=0.1, ambiguous='X')))
        # One extra slot for the consensus row that was just prepended.
        limit = limit+1 if limit > 0 else 0

    def limited_seqs():
        # NOTE(review): with consensus == "all" and limit == 0 nothing is
        # yielded, and without consensus the limit is ignored - confirm intent.
        for i, seq in enumerate(sequences):
            if not consensus or consensus == "limit" or (limit > 0 and i < limit):
                yield seq

    with open("{}.gff".format(seed_file)) as feature_file:
        features = feature_file.read()

    if format == "fasta":
        SeqIO.write(limited_seqs(), response, "fasta")
    elif format == "gff":
        response.write(features)
    elif format == "pdf":
        # PDF is binary data: text mode may corrupt it or fail to decode.
        with open("{}.pdf".format(seed_file), "rb") as pdf:
            response.write(pdf.read())
    else:
        #Default format is json
        # str() replaces the obsolete Seq.tostring().
        sequences = [{"name":seq.id, "seq":str(seq.seq)} for seq in limited_seqs()]
        result = {"seqs":sequences, "features":features}
        response.write(json.dumps(result))

    return response
Exemple #19
0
def get_aln_and_features(request, ids=None):
    """Align selected sequences and return them with features, or serve a download.

    Two modes, chosen from the request:

    * calculation (``ids`` is None and the GET request carries ``id``): up to
      50 sequences are loaded, aligned with muscle, a consensus is built and
      ``get_variant_features`` is called on it. The result is stored in the
      session and returned as a ``JsonResponse`` with ``seqs`` and ``features``.
    * download (GET ``download=true``): the sequences and features stored in
      the session by a previous calculation are written out as fasta, gff,
      pdf or json according to GET ``format``.

    Returns the string ``"false"`` when neither mode applies and
    ``(None, None)`` when no stored sequence matches the ids.

    NOTE(review): when ``ids`` is passed explicitly the first condition below
    is false, so the call only proceeds if ``download=true``; otherwise it
    returns "false". Confirm this is intended.
    """
    from tools.hist_ss import get_variant_features
    from tools.L_shade_hist_aln import write_alignments
    import subprocess
    import StringIO
    from Bio.Align import MultipleSeqAlignment
    from Bio.Align.AlignInfo import SummaryInfo
    from Bio.SeqRecord import SeqRecord

    # Working directory for generated files, made world-accessible on creation.
    save_dir = os.path.join(os.path.sep, "tmp", "HistoneDB")
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        os.chmod(save_dir,0o777)

    if ids is None and request.method == "GET" and "id" in request.GET:
        ids = request.GET.getlist("id")
        sequences = Sequence.objects.filter(id__in=ids[:50])
        download = False
        upload = False
    elif request.GET.get("download", False) == "true":
        # NOTE(review): ``sequences`` is not assigned on this path, but the
        # upload handling below reads it.
        download = True
        upload = False
    else:
        #Returning 'false' stops Bootstrap table
        return "false"

    if request.GET.get("upload", False) == "true":
        # Sequences uploaded by the user are kept in the session.
        uploaded_sequence = request.session.get("uploaded_sequences", [])
        if len(uploaded_sequence) > 0:
            try:
                variant = Variant.objects.get(id=uploaded_sequence[0]["variant"])
            except:
                # Fall back to the variant of the first selected sequence.
                if len(sequences) > 0:
                    variant = sequences[0].variant
                else:
                    return "false"

            uploaded_sequence = Sequence(
                id=uploaded_sequence[0]["id"],
                variant=variant,
                sequence=uploaded_sequence[0]["sequence"],
                taxonomy=Taxonomy.objects.get(name=uploaded_sequence[0]["taxonomy"]))
            # An upload always switches to calculation mode.
            upload = True
            download = False


    if not download:
        if len(sequences) == 0:
            return None, None
        elif len(sequences) == 1:
            #Already aligned to core histone
            seq = sequences[0]
            hist_type = seq.variant.hist_type.id
            variants = [seq.variant]
            if upload:
                sequences = [uploaded_sequence, seq]
            else:
                #let's load the corresponding canonical
                try:
                    if(("canonical" in str(seq.variant)) or ("generic" in str(seq.variant))):
                        canonical=seq
                    elif(str(seq.variant.hist_type)=="H1"):
                        canonical=Sequence.objects.filter(variant_id='generic_'+str(seq.variant.hist_type),reviewed=True,taxonomy=seq.taxonomy)[0]
                    else:
                        canonical=Sequence.objects.filter(variant_id='canonical_'+str(seq.variant.hist_type),reviewed=True,taxonomy=seq.taxonomy)[0]
                except:
                    try: #try H2A.X as a substitute for canonical
                        if(str(seq.variant.hist_type)=='H2A'):
                            canonical=Sequence.objects.filter(variant_id='H2A.X',reviewed=True,taxonomy=seq.taxonomy)[0]
                        elif(str(seq.variant.hist_type)=='H3'): #Try H3.3
                            canonical=Sequence.objects.filter(variant_id='H3.3',reviewed=True,taxonomy=seq.taxonomy)[0]
                        elif(str(seq.variant.id)=='scH1'):
                            canonical=seq
                        else:
                            # Re-raise the lookup failure so the outer handler
                            # below applies the final fallback.
                            raise

                    except:
                        canonical=seq # by default do not show a canonical sequence: supplying the sequence itself means only one line will be displayed
                        #default Xenopus
                        # if(str(seq.variant.hist_type)=="H1"):
                        #     canonical = Sequence(id="0000|xenopus|generic{}".format(hist_type), sequence=str(TemplateSequence.objects.get(variant="General{}".format(hist_type)).get_sequence().seq))
                        # else:
                        #     canonical = Sequence(id="0000|xenopus|canonical{}".format(hist_type), sequence=str(TemplateSequence.objects.get(variant="General{}".format(hist_type)).get_sequence().seq))
                sequences = [canonical, seq]
            sequence_label = seq.short_description

        else:
            seq = sequences[0]
            variants = list(Variant.objects.filter(id__in=sequences.values_list("variant", flat=True).distinct()))
            sequence_label = "Consensus"

        # muscle is expected next to the running Python interpreter; the
        # sequences are fed on stdin and the alignment is read from stdout.
        muscle = os.path.join(os.path.dirname(sys.executable), "muscle")
        process = subprocess.Popen([muscle], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        sequences = "\n".join([s.format() for s in sequences])
        aln, error = process.communicate(sequences)
        seqFile = StringIO.StringIO()
        seqFile.write(aln)
        seqFile.seek(0)
        sequences = list(SeqIO.parse(seqFile, "fasta")) #Not in same order, but does it matter?
        msa = MultipleSeqAlignment(sequences)
        a = SummaryInfo(msa)
        # Consensus of the alignment; ambiguous columns are reported as 'X'.
        cons = Sequence(id=sequence_label, variant_id=variants[0].id, taxonomy_id=1, sequence=a.dumb_consensus(threshold=0.1, ambiguous='X').tostring())

        # Same directory as at the top of the function, re-checked here.
        save_dir = os.path.join(os.path.sep, "tmp", "HistoneDB")
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        features = get_variant_features(cons, variants=variants, save_dir=save_dir)

        #A hack to avoid two canonical seqs
        unique_sequences = [sequences[0]] if len(sequences) == 2 and sequences[0].id == sequences[1].id else sequences
        # doing the Sequence.short_description work
        #Note that the gffs are also generated with the short description not
        sequences = [{"name":"QUERY" if "QUERY" in s.id else Sequence.long_to_short_description(s.id), "seq":s.seq.tostring()} for s in unique_sequences]
        # sequences = [{"name":s.id, "seq":s.seq.tostring()} for s in sequences]

        # The consensus row is only added when several sequences were aligned.
        if sequence_label == "Consensus":
            sequences.insert(0, cons.to_dict(id=True))

        # Remember the result so a later download request can reuse it.
        request.session["calculated_msa_seqs"] = sequences
        request.session["calculated_msa_features"] = features#.to_ict() if features else {}

        result = {"seqs":sequences, "features":features} #.full_gff() if features else ""}
        return JsonResponse(result, safe=False)
    else:
        format = request.GET.get("format", "json")
        response = HttpResponse(content_type='text')
        response['Content-Disposition'] = 'attachment; filename="sequences.{}"'.format(format)


        # Reuse what the last calculation stored in the session.
        sequences = request.session.get("calculated_msa_seqs", [])
        features = request.session.get("calculated_msa_features", "")
        #features = Features.from_dict(Sequence("Consensus"), features_dict) if features_dict else None

        if format == "fasta":
            for s in sequences:
                # Python 2 print-to-file syntax: writes one FASTA record to the response.
                print >> response, ">{}\n{}".format(s["name"], s["seq"])
        elif format == "gff":
            response.write(features) #.full_gff() if features else "")
        elif format == "pdf":
            # The first stored row is left out of the rendered alignment.
            aln = MultipleSeqAlignment([SeqRecord(Seq(s["seq"]), id=s["name"]) for s in sequences[1:]])
            result_pdf = write_alignments(
                [aln],
                save_dir = save_dir
            )

            with open(result_pdf) as pdf:
                response.write(pdf.read())

            #Cleanup
            os.remove(result_pdf)
        else:
            #Default format is json
            result = {"seqs":sequences, "features":features} #.full_gff() if features else ""}
            response.write(json.dumps(result))

        return response
Exemple #20
0
def get_seed_aln_and_features(request, seed):
    """Serve the stored seed alignment and feature file for a histone type or variant.

    ``seed`` is looked up first as a ``Histone`` id and then as a ``Variant``
    id.  The matching files are read from
    ``settings.STATIC_ROOT_AUX/browse/seeds`` as ``<name>.fasta``,
    ``<name>.gff`` and ``<name>.pdf`` (variants live in a sub-directory named
    after their histone type).

    Query parameters read from ``request.GET``:
        download:  ``"true"`` adds a ``Content-Disposition`` attachment header.
        format:    ``"fasta"``, ``"gff"`` or ``"pdf"``; any other value yields
                   JSON of the form ``{"seqs": [...], "features": <gff text>}``.
        limit:     integer; missing or non-numeric values are treated as 0.
        consensus: ``"limit"`` builds a consensus from the first ``limit``
                   sequences, ``"all"`` builds it from every sequence but only
                   emits the consensus plus the first ``limit`` sequences.
                   Any other supplied value is treated as ``"all"``.

    Returns:
        An ``HttpResponse`` carrying the requested representation, or an
        ``HttpResponseNotFound`` when ``seed`` matches neither model.
    """
    from Bio.Align import MultipleSeqAlignment
    from Bio.Align.AlignInfo import SummaryInfo
    from django.utils.html import escape

    seed_file = os.path.join(settings.STATIC_ROOT_AUX, "browse", "seeds")
    try:
        histone = Histone.objects.get(id=seed)
        seed_file = os.path.join(seed_file, "{}".format(histone.id))
    except Histone.DoesNotExist:
        try:
            variant = Variant.objects.get(id=seed)
            # The default names for canonical variants already contain
            # underscores, so no conversion of the id is needed here.
            seed_file = os.path.join(seed_file, variant.hist_type.id, "{}".format(variant.id))
        except Variant.DoesNotExist:
            # ``seed`` comes straight from the URL; escape it before putting
            # it into an HTML body.
            return HttpResponseNotFound('<h1>No histone variant with name {}</h1>'.format(escape(seed)))

    download = request.GET.get("download", False) == "true"
    out_format = request.GET.get("format", "json")

    try:
        limit = int(request.GET.get("limit", 0))
    except ValueError:
        limit = 0

    consensus = request.GET.get("consensus", False)
    if consensus not in ["limit", "all", False]:
        consensus = "all"

    response = HttpResponse(content_type='text')
    if download:
        response['Content-Disposition'] = 'attachment; filename="{}.{}"'.format(seed, out_format)

    sequences = SeqIO.parse("{}.fasta".format(seed_file), "fasta")

    if consensus:
        # "all" keeps every record for the consensus; "limit" keeps only the
        # first ``limit`` records.
        sequences = [s for i, s in enumerate(sequences) if consensus == "all" or i < limit]
        summary = SummaryInfo(MultipleSeqAlignment(sequences))
        sequences.insert(0, SeqRecord(id="Consensus", description="", seq=summary.dumb_consensus(threshold=0.1, ambiguous='X')))
        # Account for the consensus row that now sits in front.
        limit = limit + 1 if limit > 0 else 0

    def limited_seqs():
        # Without a consensus, or with consensus == "limit" (already
        # truncated above), every record is emitted.  Only consensus == "all"
        # is cut down to the first ``limit`` rows here.
        # NOTE(review): with consensus == "all" and limit == 0 nothing is
        # emitted at all -- confirm whether 0 was meant to mean "no limit".
        for i, seq in enumerate(sequences):
            if consensus != "all" or i < limit:
                yield seq

    if out_format == "fasta":
        SeqIO.write(limited_seqs(), response, "fasta")
    elif out_format == "pdf":
        # PDF is binary data; read it as bytes so it is passed through intact.
        with open("{}.pdf".format(seed_file), "rb") as pdf:
            response.write(pdf.read())
    else:
        # Only the gff and json outputs need the feature annotation.
        with open("{}.gff".format(seed_file)) as feature_file:
            features = feature_file.read()

        if out_format == "gff":
            response.write(features)
        else:
            # Default format is json
            seq_dicts = [{"name": seq.id, "seq": str(seq.seq)} for seq in limited_seqs()]
            response.write(json.dumps({"seqs": seq_dicts, "features": features}))

    return response
Exemple #21
0
def _reference_sequence_for(seq):
    """Pick the reviewed sequence shown next to a single selected ``seq``.

    Canonical/generic sequences are their own reference.  Otherwise the
    reviewed ``generic_<type>`` (for H1) or ``canonical_<type>`` sequence of
    the same taxonomy is used; if that lookup fails, H2A falls back to H2A.X
    and H3 to H3.3.  When nothing suitable is found ``seq`` itself is
    returned, so that only one line is displayed.
    """
    try:
        variant_name = str(seq.variant)
        if "canonical" in variant_name or "generic" in variant_name:
            return seq
        hist_type = str(seq.variant.hist_type)
        prefix = "generic_" if hist_type == "H1" else "canonical_"
        return Sequence.objects.filter(variant_id=prefix + hist_type, reviewed=True, taxonomy=seq.taxonomy)[0]
    except Exception:
        # No reviewed canonical for this taxonomy (or the lookup failed):
        # try a closely related variant as a substitute.
        substitutes = {"H2A": "H2A.X", "H3": "H3.3"}
        try:
            substitute_id = substitutes.get(str(seq.variant.hist_type))
            if substitute_id is None:
                return seq
            return Sequence.objects.filter(variant_id=substitute_id, reviewed=True, taxonomy=seq.taxonomy)[0]
        except Exception:
            return seq


def get_aln_and_features(request, ids=None):
    """Align the selected sequences with MUSCLE and return them with features.

    Two modes, selected from ``request.GET``:

    * ``id=<...>`` (repeatable, only honoured when ``ids`` is ``None``): the
      matching ``Sequence`` rows are aligned by piping them through the
      ``muscle`` executable located next to ``sys.executable``.  A single
      sequence is aligned against the uploaded sequence (``upload=true`` and
      one is stored in the session) or against a reference sequence; several
      sequences get a "Consensus" row prepended.  The result is cached in the
      session under ``calculated_msa_seqs`` / ``calculated_msa_features`` and
      returned as a ``JsonResponse`` ``{"seqs": [...], "features": ...}``.
    * ``download=true``: the cached result is returned as an attachment in
      the requested ``format`` (``fasta``, ``gff``, ``pdf``; anything else
      yields JSON).

    Returns:
        ``JsonResponse`` or ``HttpResponse`` as described above.
        NOTE(review): three early exits return non-response values
        (``"false"`` twice, ``(None, None)`` once) -- confirm callers cope
        with these.
    """
    from tools.hist_ss import get_variant_features
    from tools.L_shade_hist_aln import write_alignments
    import subprocess
    import StringIO
    from Bio.Align import MultipleSeqAlignment
    from Bio.Align.AlignInfo import SummaryInfo
    from Bio.SeqRecord import SeqRecord

    # Scratch directory shared by feature generation and PDF rendering.
    save_dir = os.path.join(os.path.sep, "tmp", "HistoneDB")
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        os.chmod(save_dir, 0o777)

    if ids is None and request.method == "GET" and "id" in request.GET:
        ids = request.GET.getlist("id")
        sequences = Sequence.objects.filter(id__in=ids)
        download = False
        upload = False
    elif request.GET.get("download", False) == "true":
        download = True
        upload = False
    else:
        #Returning 'false' stops Bootstrap table
        return "false"

    if request.GET.get("upload", False) == "true":
        uploaded = request.session.get("uploaded_sequences", [])
        if len(uploaded) > 0:
            try:
                variant = Variant.objects.get(id=uploaded[0]["variant"])
            except Exception:
                # Unknown variant for the upload: borrow the variant of the
                # first selected sequence, if there is one.
                if len(sequences) > 0:
                    variant = sequences[0].variant
                else:
                    return "false"

            uploaded_sequence = Sequence(
                id=uploaded[0]["id"],
                variant=variant,
                sequence=uploaded[0]["sequence"],
                taxonomy=Taxonomy.objects.get(name=uploaded[0]["taxonomy"]))
            upload = True
            download = False

    if download:
        out_format = request.GET.get("format", "json")
        response = HttpResponse(content_type='text')
        response['Content-Disposition'] = 'attachment; filename="sequences.{}"'.format(out_format)

        # Serve whatever the last alignment request stored in the session.
        sequences = request.session.get("calculated_msa_seqs", [])
        features = request.session.get("calculated_msa_features", "")

        if out_format == "fasta":
            for s in sequences:
                response.write(">{}\n{}\n".format(s["name"], s["seq"]))
        elif out_format == "gff":
            response.write(features)
        elif out_format == "pdf":
            # NOTE(review): the first stored entry is skipped; it is the
            # consensus row only when one was inserted (multi-sequence case)
            # -- confirm this is intended for the single-sequence case.
            aln = MultipleSeqAlignment([SeqRecord(Seq(s["seq"]), id=s["name"]) for s in sequences[1:]])
            result_pdf = write_alignments(
                [aln],
                save_dir=save_dir
            )

            # PDF is binary data; read it as bytes so it is passed through intact.
            with open(result_pdf, "rb") as pdf:
                response.write(pdf.read())

            #Cleanup
            os.remove(result_pdf)
        else:
            #Default format is json
            response.write(json.dumps({"seqs": sequences, "features": features}))

        return response

    if len(sequences) == 0:
        return None, None
    elif len(sequences) == 1:
        #Already aligned to core histone
        seq = sequences[0]
        variants = [seq.variant]
        if upload:
            sequences = [uploaded_sequence, seq]
        else:
            #let's load the corresponding canonical
            sequences = [_reference_sequence_for(seq), seq]
        sequence_label = seq.short_description
    else:
        variants = list(Variant.objects.filter(id__in=sequences.values_list("variant", flat=True).distinct()))
        sequence_label = "Consensus"

    # Pipe the sequences through MUSCLE (stdin -> stdout).
    muscle = os.path.join(os.path.dirname(sys.executable), "muscle")
    process = subprocess.Popen([muscle], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    muscle_input = "\n".join([s.format() for s in sequences])
    # stderr is not piped, so the second element is always None.
    aln, _ = process.communicate(muscle_input)
    aligned = list(SeqIO.parse(StringIO.StringIO(aln), "fasta")) #Not in same order, but does it matter?
    summary = SummaryInfo(MultipleSeqAlignment(aligned))
    cons = Sequence(id=sequence_label, variant_id=variants[0].id, taxonomy_id=1, sequence=str(summary.dumb_consensus(threshold=0.1, ambiguous='X')))

    features = get_variant_features(cons, variants=variants, save_dir=save_dir)

    #A hack to avoid two canonical seqs
    unique_sequences = [aligned[0]] if len(aligned) == 2 and aligned[0].id == aligned[1].id else aligned
    # Display names use the short description (the gffs are generated with
    # the short description as well); uploaded queries are shown as "QUERY".
    seq_dicts = [{"name": "QUERY" if "QUERY" in s.id else Sequence.long_to_short_description(s.id), "seq": str(s.seq)} for s in unique_sequences]

    if sequence_label == "Consensus":
        seq_dicts.insert(0, cons.to_dict(id=True))

    # Cache for a later download=true request.
    request.session["calculated_msa_seqs"] = seq_dicts
    request.session["calculated_msa_features"] = features

    return JsonResponse({"seqs": seq_dicts, "features": features}, safe=False)