Ejemplo n.º 1
0
    def translate_alignment(self):
        '''
        Translate the nucleotide alignment and calculate the amino acid consensus.

        Reads ``self.aln`` and the coding-region description ``self.cds``
        (keys ``'begin'``, ``'end'`` and ``'pad'``).  A negative ``'end'`` is
        resolved relative to the end of the alignment.  Writes ``self.aa_aln``,
        ``self.aa_summary_info`` and ``self.aa_consensus``, then calls
        ``self.calculate_aa_allele_frequencies()``.
        '''
        self.aa_aln = MultipleSeqAlignment([])
        if self.cds['end'] >= 0:
            last_base = self.cds['end']
        else:
            last_base = self.aln.get_alignment_length() + self.cds['end'] + 1
        # Length of the translated region, computed from the resolved end so
        # that the fallback below is also correct when cds['end'] is negative.
        cds_length = last_base - self.cds['begin']

        # translate, add cds['pad'] Xs at the beginning
        # TODO: make translation gap-tolerant
        for seq in self.aln:
            try:
                tmp_seq = 'X'*self.cds['pad'] + seq.seq[self.cds['begin']:last_base].translate()
            except Exception:
                # Best effort: an untranslatable sequence becomes a run of X
                # with the same length the translation would have had.
                tmp_seq = Seq.Seq('X'*(self.cds['pad'] + cds_length//3), generic_protein)
                print(self.cds)
            if self.cds['end']-self.cds['begin'] == 0:
                tmp_seq = Seq.Seq('X', generic_protein)

            self.aa_aln.append(SeqRecord.SeqRecord(seq=tmp_seq,
                               name=seq.name, id=seq.id))
        # process amino acid alignment
        self.aa_summary_info = AlignInfo.SummaryInfo(self.aa_aln)
        self.aa_consensus = self.aa_summary_info.dumb_consensus()
        self.calculate_aa_allele_frequencies()
Ejemplo n.º 2
0
def get_consensus(seqs, counts):
    """Align reads with Clustal Omega and return their consensus as a string.

    Each sequence in ``seqs`` is repeated ``counts[i]`` times so that the
    alignment reflects read abundance.  The FASTA text is piped to clustalo
    through stdin and the aligned FASTA is read back from stdout.

    Gap characters are removed from the consensus and trailing ``N`` are
    trimmed.

    Raises:
        ValueError: if clustalo exits with a non-zero status, or if its
            output cannot be parsed as a single alignment.
    """
    # followed advice from here:
    #  http://stackoverflow.com/questions/18860962/run-clustalw2-without-input-fasta-file

    clustalo = '/home/mchan/software/clustalo-1.2.3-Ubuntu-x86_64'

    # Multiple Sequence Alignment
    expanded = []  # reads expanded to represent accurate counts
    for seq, readCount in zip(seqs, counts):
        expanded.extend([seq] * readCount)
    fasta_text = ''.join('>seq' + str(i) + '\n' + read + '\n'
                         for i, read in enumerate(expanded))

    # stderr is captured separately so diagnostics never end up in the FASTA
    # text that is parsed below.
    proc = Popen([clustalo, '--infile=-', '--threads=4'],
                 stdout=PIPE,
                 stdin=PIPE,
                 stderr=PIPE)
    stdout, stderr = proc.communicate(input=fasta_text.encode())
    if proc.returncode != 0:
        raise ValueError('clustalo failed with exit status %d: %s'
                         % (proc.returncode,
                            stderr.decode(errors='replace').strip()))

    alignment = Bio.AlignIO.read(StringIO(stdout.decode()), 'fasta')
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensusSeq = summary_align.gap_consensus(threshold=0.6, ambiguous='N')
    consensusSeq = str(consensusSeq).replace("-", "")
    return re.sub('N+$', '', consensusSeq)  # trim trailing Ns
Ejemplo n.º 3
0
 def loadAlignment( self, path ):
    """Load the alignment file at `path` (.aln format) into this object.

    Stores the sequences in `self.allseq`, a summary object in
    `self.summary` and the alignment length in `self.l`, then calls
    `self.insertLoadedBioAlignment()`.
    """
    parsed = Clustalw.parse_file( path )
    self.allseq = parsed.get_all_seqs()
    self.summary = AlignInfo.SummaryInfo( parsed )
    self.l = parsed.get_alignment_length()
    self.insertLoadedBioAlignment()
Ejemplo n.º 4
0
def process(key):
    """Collapse the records stored under ``key`` into a single SeqRecord.

    The records in the module-level ``records[key]`` are renamed in place to
    ``record_1``, ``record_2``, ...  Depending on the module-level ``method``:

    * ``'consensus'``: several records are aligned with
      ``sequtils.clustalw`` in a temporary directory and their dumb consensus
      is used; a single record is used as is.  The result is passed through
      ``tagmatch.character_strip(peptide, 'X')``.
    * ``'concatenated'``: the distinct sequences are joined with ``'-'``.

    Returns:
        A SeqRecord with ``id=key`` holding the resulting sequence.

    Raises:
        ValueError: if ``method`` is neither of the two supported values.
    """
    options = records[key]
    for count, option in enumerate(options, 1):
        option.id = 'record_' + str(count)
    seqs = [str(i.seq) for i in options]

    if method == 'consensus':

        if len(seqs) > 1:
            temp = tempfile.mkdtemp()
            try:
                aln, tree = sequtils.clustalw(temp + '/temp.fasta', options)
                summary_align = AlignInfo.SummaryInfo(aln)
                peptide = str(summary_align.dumb_consensus())
            finally:
                # Always clean up, even when the alignment step fails.
                shutil.rmtree(temp)
        else:
            peptide = seqs[0]

        peptide = tagmatch.character_strip(peptide, 'X')

    elif method == 'concatenated':
        peptide = '-'.join(list(set(seqs)))

    else:
        raise ValueError("unsupported method: %r" % (method,))

    return SeqRecord(id=key, seq=Seq(peptide))
Ejemplo n.º 5
0
def get_consensus(msa, threshold=0.5):
    """
    Return the consensus sequence of the MSA.

    It returns the amino-acid residue that appears in more than `threshold`
    fraction of the sequences without counting sequences that have gaps in
    that column. If no residue is present in more sequences than the
    `threshold`, it introduces an `X` to indicate the ambiguity.
    Where the gap-aware consensus (computed with a fixed threshold of 0.5)
    is a gap, the residue is displayed in lowercase.

    >>> from Bio.Alphabet import IUPAC
    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> from Bio.Align import MultipleSeqAlignment
    >>> seq_1 = SeqRecord(Seq("SEEEEACCC", IUPAC.protein), id="I")
    >>> seq_2 = SeqRecord(Seq("SSEEEGCC-", IUPAC.protein), id="II")
    >>> seq_3 = SeqRecord(Seq("SSSEEKC--", IUPAC.protein), id="III")
    >>> seq_4 = SeqRecord(Seq("SSSSEH---", IUPAC.protein), id="IIII")
    >>> msa = MultipleSeqAlignment([seq_1, seq_2, seq_3, seq_4])
    >>> get_consensus(msa)
    'SSXEEXCCc'
    """
    protein_alphabet = IUPAC.protein
    summary = AlignInfo.SummaryInfo(msa)
    residues_only = summary.dumb_consensus(consensus_alpha=protein_alphabet,
                                           threshold=threshold)
    with_gaps = summary.gap_consensus(consensus_alpha=protein_alphabet,
                                      threshold=0.5)
    letters = []
    for residue, gap_call in zip(residues_only, with_gaps):
        if gap_call == '-':
            # gap-dominated column: keep the residue but mark it lowercase
            letters.append(residue.lower())
        else:
            letters.append(residue)
    return ''.join(letters)
Ejemplo n.º 6
0
def summary_aln(aln_params):
    '''
    Reduce a window of an alignment to its variable sites (SNVs).

    aln_params is an (alignment, start, end) triple.  The columns
    start:end are cut out, a position specific score matrix is computed for
    that window (using the first sequence as the axis), and every column in
    which more than one of A, C, T, G has a positive count is kept.  The kept
    single-column alignments are concatenated and returned.

    Returns None when the window contains no variable site.
    '''
    aln, i, j = aln_params
    window = aln[:, i:j]
    pssm = AlignInfo.SummaryInfo(window).pos_specific_score_matrix(
        window[0].seq)
    dna_bases = ['A', 'C', 'T', 'G']
    variant_sites = [
        pos for pos, base_dict in enumerate(pssm)
        if len([k for k in dna_bases if base_dict[k] > 0]) > 1
    ]
    if not variant_sites:
        return None
    variant_cols = [window[:, s:s + 1] for s in variant_sites]
    snv_aln = MultipleSeqAlignment(variant_cols[0])
    for column in variant_cols[1:]:
        snv_aln += column
    return snv_aln
Ejemplo n.º 7
0
    def out():
        """Compute a consensus, save it to a file and show it in a window.

        The input path is taken from entry widget ``e1`` (a Clustal
        alignment) and the output path from ``e2``.  The consensus is
        printed, written to the output file, read back, and displayed in a
        scrollable text widget.
        """
        win.filename = e1.get()
        alignment = AlignIO.read(win.filename, "clustal")
        consensus = AlignInfo.SummaryInfo(alignment).dumb_consensus()
        print(consensus)

        win.filename = e2.get()
        filename = win.filename
        out_dir = os.path.dirname(filename)
        # A bare file name has an empty directory part, and os.makedirs('')
        # would raise, so only create the directory when there is one.
        if out_dir and not os.path.exists(out_dir):
            os.makedirs(out_dir)

        with open(filename, "w") as f:
            f.write(str(consensus))

        with open(e2.get(), 'r') as written:
            r = written.read()
        root = Tk()
        S = Scrollbar(root)
        T = Text(root, height=50, width=500)
        S.pack(side=RIGHT, fill=Y)
        T.pack(side=LEFT, fill=Y)
        # S is packed as a vertical bar, so it is wired to vertical scrolling
        # only; also binding it to xview would override this command.
        S.config(command=T.yview)
        T.config(yscrollcommand=S.set)
        T.insert(END, r, 'color')
        mainloop()
def Find_SNP_locations(alignment, startinglocations):
    """Return the positions in ``startinglocations`` that are variable.

    A position is reported when, ignoring gaps (``'-'``) and letter case, at
    least two different characters occur in that column of ``alignment``.
    Positions are returned in the order in which they were supplied.

    Progress is written to stdout: a percentage after every 10000 positions
    (relative to ``alignment.get_alignment_length()``) and a final
    "100.00% complete" line.
    """
    SNPlocations = []

    sys.stdout.write("Identifying SNP locations\n")
    sys.stdout.flush()

    count = 0
    total = 0.0

    for x in startinglocations:

        count = count + 1
        if count == 10000:
            total = total + count
            count = 0
            # carriage return, no newline: the next update overwrites this one
            sys.stdout.write("%.2f%% complete\r" % (
                100 * (total / alignment.get_alignment_length())))
            sys.stdout.flush()

        foundbases = set()
        for record in alignment:
            base = record.seq[x].upper()

            if base != '-':
                foundbases.add(base)
            if len(foundbases) > 1:
                # second distinct base seen: no need to scan further records
                SNPlocations.append(x)
                break

    sys.stdout.write("100.00% complete\n\n")
    sys.stdout.flush()
    return SNPlocations
Ejemplo n.º 9
0
def phylip_writer(handle_in,handle_out,nloci,full_list,loci_list,sample,read_len):
    """Write one sequence per locus in ``full_list`` to ``handle_out``.

    ``handle_in`` is read as FASTA-like text whose header lines start with
    ``'>'`` and carry the locus number in their second ``'_'``-separated
    field (e.g. ``..._locus12_...``).  Only loci below ``nloci`` are kept.
    ``handle_in`` is closed once it has been read.

    For every locus in ``full_list``:

    * not seen in the input, or seen but with zero coverage in
      ``loci_list[sample]``: ``read_len`` gap characters are written;
    * coverage 1: the stored read(s) are written;
    * coverage > 1: the reads are written to ``temp.phylip``, read back as an
      alignment, and the dumb consensus is written.

    Returns:
        The total number of ``'X'`` characters in the consensus sequences.
    """
    N = 0
    alignment = {}
    line = handle_in.readline()
    while line:
        if line.startswith('>'):
            locus = int(line.split('_')[1].lstrip('locus'))
            if locus < nloci:
                seq = handle_in.readline().rstrip()
                alignment.setdefault(locus, []).append(seq)
        line = handle_in.readline()
    handle_in.close()

    missing = '-' * read_len
    for i in full_list:
        # Default to gaps so that a locus with zero coverage never reuses the
        # sequence written for the previous locus.
        seq = missing
        if i in alignment:
            cov = loci_list[sample].count(i)
            if cov > 1:
                with open('temp.phylip', 'w') as phylip:
                    phylip.write('%d\t%d\n' % (cov, read_len))
                    for j in range(cov):
                        phylip.write('Sample%d\t%s\n' % (j, alignment[i][j]))
                with open('temp.phylip') as phylip:
                    alignments = AlignIO.read(phylip, 'phylip-relaxed')
                summary_align = AlignInfo.SummaryInfo(alignments)
                seq = str(summary_align.dumb_consensus())
                N = N + seq.count('X')
            elif cov == 1:
                seq = ''.join(alignment[i])
        handle_out.write(seq)
    return N
Ejemplo n.º 10
0
    def genebank_seq(self):
        """Fetch GenBank records, align them with MUSCLE, return the consensus.

        ``self.data['genebank-seq']`` holds one record identifier per line
        and ``self.data['sequence-type']`` names the Entrez database to query.
        Blank lines and surrounding whitespace (for example a trailing
        carriage return) are ignored.

        Returns:
            A FASTA-formatted string: a header line giving the consensus
            length, followed by the gap-aware consensus (threshold 0.55,
            ambiguous character ``N``).
        """
        Entrez.email = "*****@*****.**"

        type_of_seq = self.data['sequence-type']
        stripped = (line.strip() for line in self.data['genebank-seq'].split('\n'))
        seq_id_list = [seq_id for seq_id in stripped if seq_id]

        fasta_entries = []
        for seq_id in seq_id_list:
            with Entrez.efetch(db=type_of_seq,
                               rettype="gb",
                               retmode="text",
                               id=seq_id) as handle:
                seq_record = SeqIO.read(handle, "gb")
            fasta_entries.append(">" + seq_id + "\n" + str(seq_record.seq) + "\n")

        filename = create_seq_file("".join(fasta_entries))
        try:
            muscle = MuscleCommandline(input=filename)
            stdout, stderr = muscle()
        finally:
            # The alignment comes back on stdout, so the input file can go
            # as soon as MUSCLE has finished, whether or not it succeeded.
            remove_temp_file(filename)

        align = AlignIO.read(StringIO(stdout), "fasta")

        summary_align = AlignInfo.SummaryInfo(align)
        consensus = summary_align.gap_consensus(threshold=0.55, ambiguous='N')

        return f'>consensus sequence {len(consensus)} bp\n' + str(consensus)
Ejemplo n.º 11
0
def design_primers(source_dir, target_dir, settings, logfile):
    """Run PriFi primer design on every readable alignment in `source_dir`.

    `target_dir` is purged first.  Each ``*.fasta`` file in `source_dir` that
    can be read as a FASTA alignment is copied into `target_dir`; files that
    fail are reported to `logfile` and skipped.  Every copied alignment is
    then handed to ``prifipy.findprimers`` and, when primer pairs are found,
    ``prifipy.writePrimersToFiles``.  All progress messages go to `logfile`.
    """
    print("\nDesigning primers using PriFi...\n", file=logfile)
    # get rid of previous files
    utils.purge_dir(target_dir)
    candidates = glob(os.path.join(source_dir, '*.fasta'))
    print("\tChecking for empty alignments...", file=logfile)
    for src_path in candidates:
        try:
            # the parsed result is not needed: reading is only a validity check
            AlignIO.read(src_path, 'fasta')
            destination = os.path.join(target_dir, os.path.basename(src_path))
            shutil.copyfile(src_path, destination)
        except Exception:
            print("[WARNING] Empty alignment file?! (%s)" % src_path,
                  file=logfile)

    # call PriFi for actual primer design
    for fasta_path in glob(os.path.join(target_dir, '*.fasta')):
        aln = AlignIO.read(fasta_path, 'fasta')
        summary = AlignInfo.SummaryInfo(aln)
        aln_length = aln.get_alignment_length()
        primerpairs = prifipy.findprimers(0, list(aln), summary, aln_length,
                                          settings, logfile)
        if primerpairs:
            print(
                '%s: Found %d primer pair suggestions. Writing primer files:' %
                (fasta_path, len(primerpairs)),
                file=logfile)
            prifipy.writePrimersToFiles(fasta_path, primerpairs, 1, logfile)
        else:
            print("%s: No valid primer pair found" % fasta_path, file=logfile)
Ejemplo n.º 12
0
def main():
    """Collect reference sequences that agree with their per-file consensus.

    Germline sequences are loaded from ``fasta/full_ref/*.fasta`` and keyed
    by the second ``|``-separated field of their id (with ``*`` replaced by
    ``-``).  For each ``fasta/IG*.fasta`` file the matching reference is cut
    to the consensus length, upper-cased and globally aligned against the
    file's dumb consensus.  References whose best score is at least
    ``length - 1`` are written to ``fasta/references.fasta``; the others are
    printed for manual inspection.
    """
    # load germline sequences
    ref_seqs = {}
    for ref_path in glob("fasta/full_ref/*.fasta"):
        for record in SeqIO.parse(ref_path, "fasta"):
            allele = record.id.split("|")[1].replace("*", "-")
            record.id = record.name = record.description = allele
            ref_seqs[record.id] = record

    # check and collect references
    references = []
    for fasta_path in glob("fasta/IG*.fasta"):
        name = re.sub("^.*/|[.].*$", "", fasta_path)

        ref = ref_seqs[name]
        seqs = list(SeqIO.parse(fasta_path, "fasta"))

        # get consensus sequence from file
        consensus = AlignInfo.SummaryInfo(
            MultipleSeqAlignment(seqs)).dumb_consensus()

        # cut reference seq to match other seqs length
        seq_len = len(consensus)
        ref = ref[:seq_len].upper()

        best = pairwise2.align.globalxs(ref.seq, consensus, -5, -2)[0]
        # check score... left 1 base for errors with consensus
        if best[-3] < seq_len - 1:
            print("Hey, check {}".format(fasta_path))
            print(pairwise2.format_alignment(*best))
            continue
        references.append(ref)

    SeqIO.write(references, "fasta/references.fasta", "fasta-2line")
Ejemplo n.º 13
0
def sequence_consensus(sequence_list, vir_typer_pk):
    """Return a consensus for the sequences in `sequence_list`.

    `sequence_list` is a list of ``{code: sequence}`` dictionaries.

    With two or more entries, all sequences are written to
    ``olc_webportalv2/media/vir_typer/<vir_typer_pk>/alignment.fasta``, read
    back as an alignment, and the dumb consensus is returned.  With fewer
    than two entries the last sequence encountered is returned unchanged, or
    an empty list when there is none.
    """
    from Bio.Align import AlignInfo
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import IUPAC
    from Bio import AlignIO, SeqIO
    from Bio.Seq import Seq
    consensus = list()
    if len(sequence_list) < 2:
        for sequence_dict in sequence_list:
            for sequence in sequence_dict.values():
                consensus = sequence
        return consensus

    records = [
        SeqRecord(Seq(sequence, IUPAC.unambiguous_dna),
                  id=seq_code,
                  description='')
        for sequence_dict in sequence_list
        for seq_code, sequence in sequence_dict.items()
    ]
    file_path = 'olc_webportalv2/media/vir_typer/{pk}/'.format(
        pk=vir_typer_pk)
    try:
        os.makedirs(file_path)
    except FileExistsError:
        pass
    fasta_path = file_path + 'alignment.fasta'
    with open(fasta_path, 'w') as alignment_file:
        SeqIO.write(records, alignment_file, 'fasta')
    with open(fasta_path, 'r') as alignment_file:
        alignment = AlignIO.read(alignment_file, 'fasta')
    return AlignInfo.SummaryInfo(alignment).dumb_consensus()
Ejemplo n.º 14
0
def parse_alleles_file(inpath):
    """Return one consensus SeqRecord per locus block found in `inpath`.

    Lines are accumulated until a line containing ``//`` is met.  The
    accumulated block is then given a PHYLIP header (number of lines, length
    of the last field of the first line), parsed as relaxed PHYLIP, and its
    dumb consensus (threshold 0.5, ambiguous ``N``) is appended to the
    result.  Records are numbered from 0 in order of appearance.  Lines that
    consist only of a newline are skipped.
    """
    seqlist = []
    with open(inpath, 'r') as allfile:
        block = ''
        for line in allfile:
            if '//' not in line and line.lstrip('\n'):
                block += line
                continue
            if not line.rstrip('\n '):  # sometimes there are blank lines...
                continue

            # separator line reached: turn the collected block into PHYLIP
            first_line = block.split('\n')[0]
            nchar = len(first_line.split()[-1])
            ntaxa = block.count('\n')
            block = str(ntaxa) + '\t' + str(nchar) + '\n' + block
            phylip = StringIO.StringIO(block)
            alignment = AlignIO.read(phylip,
                                     "phylip-relaxed",
                                     alphabet=IUPACAmbiguousDNA())
            consensus = AlignInfo.SummaryInfo(alignment).dumb_consensus(
                threshold=0.5, ambiguous='N')
            record_id = str(len(seqlist))
            seqlist.append(SeqRecord(consensus, id=record_id, description=''))
            phylip.close()
            block = ''  # start collecting the next block
    return seqlist
Ejemplo n.º 15
0
def ConsensusSequences(clusters):
    """Return a ``[consensus, cluster_size]`` pair for every cluster of reads.

    Each cluster is written to a temporary FASTA file (randomly subsampled to
    ``CdrExtractionOptions.MAX_READS_FOR_CONSENSUS`` reads when larger),
    aligned with MUSCLE (``maxiters=2``), and summarised with
    ``gap_consensus(ambiguous='')``.  ``cluster_size`` is the size of the
    full cluster, not of the subsample.  The temporary files are removed even
    when alignment fails.
    """
    reads = []

    # go through each cluster of sequencing reads provided
    for g in clusters:
        # temporary files; the input file is opened in text mode because the
        # reads are written as str
        tmp_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
        tmp_file2 = tempfile.NamedTemporaryFile(delete=False)
        tmp_file2.close()
        try:
            # if group is larger than MAX_READS_FOR_CONSENSUS reads, select
            # random sample of MAX_READS_FOR_CONSENSUS
            if len(g) > CdrExtractionOptions.MAX_READS_FOR_CONSENSUS:
                subsample = random.sample(
                    g, CdrExtractionOptions.MAX_READS_FOR_CONSENSUS)
            else:
                subsample = g  # use all available reads
            # store cluster in temporary file
            with tmp_file:
                for i, read in enumerate(subsample):
                    tmp_file.write('>' + str(i) + '\n')
                    tmp_file.write(read + '\n')
            # generate consensus read using muscle
            muscle_cline = MuscleCommandline(input=tmp_file.name,
                                             out=tmp_file2.name,
                                             maxiters=2)
            muscle_cline()
            align = AlignIO.read(tmp_file2.name, 'fasta')
            summary_align = AlignInfo.SummaryInfo(align)
            # store consensus read in list
            reads.append([summary_align.gap_consensus(ambiguous=''), len(g)])
        finally:
            # remove tmp files
            tmp_file.close()
            os.unlink(tmp_file.name)
            os.unlink(tmp_file2.name)

    return reads
Ejemplo n.º 16
0
def get_ic(path, pdb):
    """
    Process the Stockholm MSA stored at `path`.

    Returns a tuple ``(consensus, info_content, aligned_pdb_seq)``:

    * ``consensus``: the dumb consensus of the alignment;
    * ``info_content``: one information-content value per consensus
      position, computed against the frequencies returned by
      ``count_freqs`` for the aligned sequences;
    * ``aligned_pdb_seq``: the alignment record whose id is the one that
      ``get_pdb2alignmentid(path)`` maps `pdb` to, or None if no record
      has that id.
    """

    alignment = AlignIO.read(path, "stockholm")
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = summary_align.dumb_consensus()

    alignid = get_pdb2alignmentid(path)[pdb]
    aligned_pdb_seq = next(
        (seq for seq in alignment if seq.id == alignid), None)

    freqs = count_freqs([str(a.seq) for a in alignment])
    # The expected-frequency table is the same for every column, so build it
    # once instead of once per position.
    expected_freqs = FreqTable.FreqTable(freqs, dict_type=FreqTable.FREQ)

    info_content = [
        summary_align.information_content(start=pos,
                                          end=pos + 1,
                                          e_freq_table=expected_freqs)
        for pos in range(len(consensus))
    ]

    return consensus, info_content, aligned_pdb_seq
def fetch_additional_data(MSA):
    """Print and return the dumb consensus and the PSSM of the alignment.

    Returns:
        A ``(consensus, PSSM)`` tuple.
    """
    summary = AlignInfo.SummaryInfo(MSA)

    consensus = summary.dumb_consensus()
    print(consensus)

    PSSM = summary.pos_specific_score_matrix()
    print(PSSM)

    return consensus, PSSM
def createScoreMatrixFromAlignment(filename, output):
    """ Generate a substitution matrix from a file containing an alignment.

    The replacement counts of the alignment (ignoring ``*``) are turned into
    a matrix, every entry is incremented by one pseudocount, and the log
    odds matrix derived from it is pickled to `output`.

    Args:
        filename (str): path to a fasta file containing aligned sequences
        output (str): path to a pickle file for the matrix to be stored

    Returns:
        my_lom (array): log odds matrix
    """

    c_align = AlignIO.read(filename, "fasta")
    summary_align = AlignInfo.SummaryInfo(c_align)
    replace_info = summary_align.replacement_dictionary(["*"])
    my_arm = SubsMat.SeqMat(replace_info)

    # Add pseudocounts
    for m in my_arm:
        my_arm[m] += 1
    my_lom = SubsMat.make_log_odds_matrix(my_arm)

    # Close the output file deterministically so the pickle is fully flushed.
    with open(output, "wb") as handle:
        pickle.dump(my_lom, handle)
    return my_lom
def consensusFromSequencesCnf(sequences, threshold, ambiguous,
                              require_multiple):
    """ Compute the naive consensus sequence from an alignment of sequences.

    The sequences are exported to ``temp/forconsensus.fasta``, read back as
    an alignment, and summarised with ``dumb_consensus``.

    Args:
        sequences (array): the array of aligned sequences
        threshold (float): forwarded to ``dumb_consensus``; presumably the
                        column agreement a letter needs in order to be
                        assigned to the column (confirm against Biopython)
        ambiguous (char): forwarded to ``dumb_consensus``; the letter
                        assigned to a column whose agreement does not reach
                        the threshold
        require_multiple: see biopython.org/DIST/docs/api/Bio.Align.AlignInfo.SummaryInfo-class.html

    Return:
        consensus: the consensus sequence as returned by ``dumb_consensus``
    """

    fasta_path = "temp/forconsensus.fasta"
    io.exportGroupOfSequencesToFASTA(sequences, fasta_path)
    summary = AlignInfo.SummaryInfo(AlignIO.read(fasta_path, "fasta"))
    return summary.dumb_consensus(threshold=threshold,
                                  ambiguous=ambiguous,
                                  consensus_alpha=None,
                                  require_multiple=require_multiple)
Ejemplo n.º 20
0
    def test_read_fasta(self):
        path = os.path.join(os.curdir, "Quality", "example.fasta")
        alignment = AlignIO.read(path,
                                 "fasta",
                                 alphabet=Alphabet.Gapped(IUPAC.ambiguous_dna))
        self.assertEqual(len(alignment), 3)
        seq_record = alignment[0]
        self.assertEqual(seq_record.description, "EAS54_6_R1_2_1_413_324")
        self.assertEqual(seq_record.seq, "CCCTTCTTGTCTTCAGCGTTTCTCC")
        seq_record = alignment[1]
        self.assertEqual(seq_record.description, "EAS54_6_R1_2_1_540_792")
        self.assertEqual(seq_record.seq, "TTGGCAGGCCAAGGCCGATGGATCA")
        seq_record = alignment[2]
        self.assertEqual(seq_record.description, "EAS54_6_R1_2_1_443_348")
        self.assertEqual(seq_record.seq, "GTTGCTTCTGGCGTGGGTGGGGGGG")
        self.assertEqual(alignment.get_alignment_length(), 25)
        align_info = AlignInfo.SummaryInfo(alignment)
        consensus = align_info.dumb_consensus(ambiguous="N", threshold=0.6)
        self.assertIsInstance(consensus, Seq)
        self.assertEqual(consensus, "NTNGCNTNNNNNGNNGGNTGGNTCN")
        self.assertEqual(
            str(alignment), """\
Gapped(IUPACAmbiguousDNA(), '-') alignment with 3 rows and 25 columns
CCCTTCTTGTCTTCAGCGTTTCTCC EAS54_6_R1_2_1_413_324
TTGGCAGGCCAAGGCCGATGGATCA EAS54_6_R1_2_1_540_792
GTTGCTTCTGGCGTGGGTGGGGGGG EAS54_6_R1_2_1_443_348""")
Ejemplo n.º 21
0
def printAlignmentInfo(alignment, alphabet):
    """Print a summary of `alignment` computed in two different ways.

    A motif is built from the aligned sequences with the given `alphabet`;
    its normalised count matrix and consensus are printed.  The same
    alignment is also summarised with ``AlignInfo.SummaryInfo``, whose
    position specific score matrix (using the motif consensus as the axis,
    ignoring ``N``) and dumb consensus are printed.  Returns None.
    """
    sequences = [record.seq for record in alignment]

    motif = motifs.create(sequences, alphabet)
    pwm = motif.counts.normalize()
    motif_consensus = pwm.consensus

    summary = AlignInfo.SummaryInfo(alignment)

    summary_consensus = summary.dumb_consensus()
    pssm = summary.pos_specific_score_matrix(motif_consensus,
                                             chars_to_ignore=['N'])

    print(alignment)

    first_record = alignment[0]
    print('first description: %s' % first_record.description)
    print('first sequence: %s' % alignment[0].seq)
    print('length %i' % alignment.get_alignment_length())

    print('matrix pwm %s' % pwm)
    print('consensus (motifs) %s' % motif_consensus)

    print('matrix pssm %s' % pssm)
    print('consensus (AlignInfo.SummaryInfo) %s' % summary_consensus)
Ejemplo n.º 22
0
def get_consensus_seq(fasta_file, q_id):
    """Return the gap-aware consensus of `fasta_file` as a SeqRecord.

    The record gets `q_id` as its id and an empty description.  A
    confirmation message is printed.
    """
    alignment = AlignIO.read(fasta_file, 'fasta')
    consensus = AlignInfo.SummaryInfo(alignment).gap_consensus()
    record = SeqRecord(consensus, id=q_id, description='')
    message = ("'" + q_id + "' consensus sequence generated (original: " +
               fasta_file + ')')
    print(message)
    return record
Ejemplo n.º 23
0
def obtain_consensus(pair):
    """Obtain a consensus from the multiple sequence alignment.

    Reads the Clustal alignment in ``mafft_temp.out``, computes its dumb
    consensus (threshold 0.51, ambiguous ``N``) and appends it to
    ``consensus_sequences.fasta`` as a FASTA entry named
    ``<pair[0]>,<pair[1]>``.
    """
    align = AlignIO.read("mafft_temp.out", "clustal")
    summary_align = AlignInfo.SummaryInfo(align)
    consensus = summary_align.dumb_consensus(threshold=0.51, ambiguous='N')
    new_sequence_name = ">{0},{1}\n{2}\n".format(pair[0], pair[1], consensus)
    # Close the output file explicitly so the entry is flushed to disk.
    with open("consensus_sequences.fasta", "a") as consensus_sequences:
        consensus_sequences.write(new_sequence_name)
Ejemplo n.º 24
0
def build_cons_seq(infile):
    """Return the dumb consensus of the FASTA alignment stored in `infile`."""
    # https://www.biostars.org/p/14026/
    from Bio import AlignIO
    from Bio.Align import AlignInfo

    # Use a context manager so the input file is closed after parsing.
    with open(infile) as handle:
        alignment = AlignIO.read(handle, "fasta")
    summary_align = AlignInfo.SummaryInfo(alignment)
    return summary_align.dumb_consensus()
Ejemplo n.º 25
0
def closestToConsensus(linIt):
    """Pick one representative sequence record for every lineage in `linIt`.

    A lineage with a single member is represented by the record in its
    ``<name>.fa`` file.  Otherwise the lineage is clustered and aligned with
    vsearch, the first cluster of the resulting MSA is extracted, each of its
    sequences is scored against the size-weighted position specific score
    matrix of that cluster, and the best-scoring sequence is returned with
    gaps and the vsearch annotations removed.

    Returns:
        A list with one record per lineage, in input order.
    """
    results = []
    print( "Starting vsearch on a new chunk..." )

    for lineage in linIt:
        lineage_fasta = "%s/%s.fa" % (prj_tree.lineage, lineage['name'])

        # save time on singletons (if they weren't excluded by minSeq)
        if len(lineage['desc']) == 1:
            with open(lineage_fasta, "r") as handle:
                results.append( SeqIO.read(handle,'fasta') )
            continue

        msa_all = "%s/%s_msa.fa" % (prj_tree.lineage, lineage['name'])
        msa_biggest = "%s/%s_msaBiggest.fa" % (prj_tree.lineage, lineage['name'])

        # cluster and rapid align with vsearch; its messages are discarded
        # so they don't clutter up the output
        with open(os.devnull, 'w') as FNULL:
            subprocess.call([vsearch, "-cluster_size", lineage_fasta,
                             "-id", "0.97", "-sizein", "-sizeout",
                             "-msaout", msa_all, "-clusterout_sort"],
                            stdout=FNULL, stderr=subprocess.STDOUT)

        # extract biggest cluster: skip the first line, then copy everything
        # up to the first line mentioning "consensus"
        with open(msa_all, "r") as allClusters:
            with open(msa_biggest, "w") as biggestOnly:
                next( allClusters )
                for line in allClusters:
                    if "consensus" in line:
                        break
                    biggestOnly.write(line)

        # open the msa
        with open(msa_biggest, "r") as handle:
            aln = AlignIO.read(handle, "fasta")

        # add derep size to alignment as weighting
        for rec in aln:
            rec.annotations['weight'] = int( rec.id.split(";")[1].split("=")[1] )

        summary_align = AlignInfo.SummaryInfo(aln)
        pssm = summary_align.pos_specific_score_matrix()

        # score each sequence and save the best one
        scores = dict()
        for record in aln:
            scores[record.id] = sum(pssm[i][l] for i, l in enumerate(record))

        # max() keeps the first record among ties, like a stable descending sort
        best = max(aln, key=lambda rec: scores[rec.id])
        best.seq = best.seq.ungap("-") #remove gaps
        best.id = best.id.split(";")[0] #remove vsearch size annotation
        best.id = re.sub(r"^\*", "", best.id) #get rid of possible annotation from vsearch
        best.description = lineage['desc'][best.id] #restore original info

        results.append( best )

    return results
Ejemplo n.º 26
0
    def find_sqce_consensus( list_of_sequences, sqce_type=Constants.SEQUENCE_TYPE_DNA,
                             threshold=Constants.DEFAULT_SQCE_CONSENSUS_AMBIG_THRESHOLD,
                             fasta_end_name = '' ):
        """Align the given sequences with Muscle and return their consensus.

        Args:
            list_of_sequences: the sequences to align, as strings.
            sqce_type: Constants.SEQUENCE_TYPE_DNA or
                Constants.SEQUENCE_TYPE_PROT; selects the alphabet and the
                ambiguous character used in the consensus.
            threshold: threshold forwarded to ``gap_consensus``.
            fasta_end_name: suffix appended to the names of the temporary
                fasta files.

        Returns:
            The gap-aware consensus as a string.

        Raises:
            DenCellORFException: if `sqce_type` is not one of the two
                supported types.
        """
        if (sqce_type == Constants.SEQUENCE_TYPE_DNA):
            alphabet = generic_dna
            ambiguous = Constants.SEQUENCE_AMBIGUOUS_DNA_BASE

        elif (sqce_type == Constants.SEQUENCE_TYPE_PROT):
            alphabet = generic_protein
            ambiguous = Constants.SEQUENCE_AMBIGUOUS_PROT_AA

        else:
            raise DenCellORFException(
                'MergeStrategy.find_sqce_consensus(): The type of sequence provided'
                + ' has to be ' + Constants.SEQUENCE_TYPE_DNA + ' or ' +
                Constants.SEQUENCE_TYPE_PROT + ' (provided type: ' +
                str(sqce_type) + ').')

        # Store the input sequences in a fasta file in order to run Muscle
        input_sequences = (SeqRecord(Seq(s, alphabet))
                           for s in list_of_sequences)

        if (not os.path.exists(DefaultTemporaryFolder.TEMPORARY_FOLDER)):
            os.makedirs(DefaultTemporaryFolder.TEMPORARY_FOLDER)

        input_sequences_file = os.path.join(
            DefaultTemporaryFolder.TEMPORARY_FOLDER,
            'input_sequences' + fasta_end_name + '.fasta')
        aligned_sequences_file = os.path.join(
            DefaultTemporaryFolder.TEMPORARY_FOLDER,
            'aligned_sequences' + fasta_end_name + '.fasta')

        try:
            SeqIO.write(input_sequences, input_sequences_file, 'fasta')

            # Perform the multiple sequences alignment and
            # store the output in a fasta file
            muscle_cline = MuscleCommandline(cmd='/bin/muscle',
                                             input=input_sequences_file,
                                             out=aligned_sequences_file)
            muscle_cline()

            # Read the fasta file containing aligned sequences
            align = AlignIO.read(aligned_sequences_file, 'fasta')
        finally:
            # Remove the temporary fasta files, including after a failure;
            # the aligned file may not exist if Muscle did not run.
            for temporary_file in (input_sequences_file, aligned_sequences_file):
                if os.path.exists(temporary_file):
                    os.remove(temporary_file)

        summary_align = AlignInfo.SummaryInfo(align)

        # Compute the consensus
        consensus = summary_align.gap_consensus(threshold=threshold,
                                                ambiguous=ambiguous)

        return str(consensus)
Ejemplo n.º 27
0
 def substitution_matrices(self):
     """Return log odds matrices for the kinase and the peptide MSA.

     Each of `self.kinase_msa` and `self.peptide_msa` is read in "tab"
     format; its replacement dictionary is converted to a matrix and then to
     a log odds matrix.  Returns ``(kinase_matrix, peptide_matrix)``.
     """
     def log_odds(msa):
         summary = AlignInfo.SummaryInfo(AlignIO.read(msa, "tab"))
         accepted = SubsMat.SeqMat(summary.replacement_dictionary())
         return SubsMat.make_log_odds_matrix(accepted)

     kinase_lom, peptide_lom = [
         log_odds(msa) for msa in [self.kinase_msa, self.peptide_msa]
     ]
     return kinase_lom, peptide_lom
Ejemplo n.º 28
0
def compute_ic_content(alignment):
    """Return the per-column information content of `alignment` as a list.

    ``information_content()`` is called for its side effect of filling
    ``ic_vector`` on the summary object.  When ``ic_vector`` is a dict
    (which, per the original note, depends on the Biopython version), it is
    converted to a dense array indexed by its keys.
    """
    summary = AlignInfo.SummaryInfo(alignment)
    summary.information_content()
    raw_vector = summary.ic_vector
    if not isinstance(raw_vector, dict):
        return list(raw_vector)

    # biopython wrong version hack: position -> value mapping
    dense = np.zeros(len(summary.ic_vector))
    for position, value in summary.ic_vector.items():
        dense[position] = value
    return list(dense)
Ejemplo n.º 29
0
def get_consensus_sequence(input_fasta, output_fasta):
    """Write the dumb consensus of `input_fasta` to `output_fasta`.

    The consensus is stored as a single FASTA record with id ``consensus``
    and an empty description.
    """
    summary = AlignInfo.SummaryInfo(AlignIO.read(input_fasta, 'fasta'))
    record = SeqRecord(summary.dumb_consensus(),
                       id='consensus',
                       description='')
    SeqIO.write(record, output_fasta, 'fasta')
Ejemplo n.º 30
0
def get_consensus(filename):
    ''' (str) -> str
    Read the aligned fasta file `filename` and return its consensus.

    The consensus is computed with dumb_consensus using a threshold of 0.7
    (70%); positions below the threshold are reported as 'N'.
    '''
    # https://www.biostars.org/p/284637/
    summary = AlignInfo.SummaryInfo(AlignIO.read(filename, 'fasta'))
    return str(summary.dumb_consensus(threshold=0.7, ambiguous='N'))