def test_append_proteins(self):
    """Append three protein Seq objects and check test_chars holds 7 items."""
    extra_proteins = (
        Seq.Seq("K", Alphabet.generic_protein),
        Seq.Seq("K-", Alphabet.Gapped(Alphabet.generic_protein, "-")),
        Seq.Seq("K@", Alphabet.Gapped(IUPAC.protein, "@")),
    )
    for protein in extra_proteins:
        self.test_chars.append(protein)
    self.assertEqual(7, len(self.test_chars))
def test_ungap(self):
    """Check ungap("-") on UnknownSeq objects built with a gapped DNA alphabet."""
    # Seven unknown letters with the default character: nothing to strip.
    unknown_bases = Seq.UnknownSeq(
        7, alphabet=Alphabet.Gapped(Alphabet.DNAAlphabet(), "-"))
    ungapped_text = str(unknown_bases.ungap("-"))
    self.assertEqual("NNNNNNN", ungapped_text)

    # Twenty gap characters: everything is stripped.
    only_gaps = Seq.UnknownSeq(
        20,
        alphabet=Alphabet.Gapped(Alphabet.DNAAlphabet(), "-"),
        character='-')
    self.assertEqual("", only_gaps.ungap("-"))
def test_exception_when_added_protein_has_more_than_one_stop_codon_type(
        self):
    """Adding proteins whose alphabets use stop symbols '@' and '*' raises ValueError."""
    at_stop_alphabet = Alphabet.HasStopCodon(
        Alphabet.Gapped(IUPAC.extended_protein, "-"), "@")
    star_stop_alphabet = Alphabet.Gapped(
        Alphabet.HasStopCodon(IUPAC.extended_protein, "*"), "-")
    left = Seq.Seq("MEDG-KRXR@", at_stop_alphabet)
    right = Seq.Seq("MEDG-KRXR*", star_stop_alphabet)
    with self.assertRaises(ValueError):
        left + right
def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)):
    """Initialise the base Alignment and the two extra string fields.

    Arguments:
    o alphabet - alphabet handed to Alignment.__init__ (defaults to a
    gapped ambiguous DNA alphabet).
    """
    Alignment.__init__(self, alphabet)

    self._version = ''
    # represent all of those stars in the aln output format
    self._star_info = ''
def replace_stop_codons_with_gapps(aln_file, in_format="fasta", output=None):
    """
    replaces every stop codon (TAA, TAG, TGA) in a codon alignment with "---"
    the sequences are walked in steps of three starting at position 0;
    codons containing a gap are copied unchanged (the match is case sensitive)
    the result is always written in fasta format
    if output is None - rewrites on the input file
    :param aln_file: input alignment file path
    :param in_format: input alignment format (default: fasta)
    :param output: output file path (default: None)
    :return: None; prints the number of replaced stop codons, not counting
             a stop codon that ends a sequence
    """
    aln_file = check_filename(aln_file)
    if output is None:
        output = aln_file
    else:
        output = check_filename(output, Truefile=False)
    aln = AlignIO.read(aln_file, in_format, alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))
    stop_codons = ("TAA", "TAG", "TGA")
    stop_codon_count = 0
    for seq in aln:
        seq_length = len(seq.seq)
        # Start from an empty slice so the rebuilt sequence is always a Seq
        # with the record's alphabet, even if the first codon is replaced.
        new_seq = seq.seq[:0]
        for i in range(0, seq_length, 3):
            codon = seq.seq[i:i + 3]
            if str(codon) in stop_codons:
                new_seq += "---"
                # the final stop codon is masked as well but is not counted
                if seq_length - i != 3:
                    stop_codon_count += 1
            else:
                # gapped codons and ordinary codons are kept as they are
                new_seq += codon
        seq.seq = new_seq
    SeqIO.write(aln, output, "fasta")
    print("%i replacments of stop codons to ---" % stop_codon_count)
def remove_gapped_positions_codon(aln_file, output=None, in_format="fasta"):
    """
    removes codons (triplets of columns) in which every column is all gaps
    a codon is kept as soon as one of its columns holds a non-gap character
    the result is always written in fasta format
    if output is None - rewrites on the input file
    :param aln_file: input alignment file path
    :param output: output file path (default: None)
    :param in_format: input format (default: fasta)
    :return: output file path
    """
    aln_file = check_filename(aln_file)
    if output is None:
        output = aln_file
    else:
        output = check_filename(output, Truefile=False)
    aln = AlignIO.read(aln_file, in_format, alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))
    len_aln = len(aln[0])
    only_gaps = set("-")
    new_aln = None
    for i in range(0, len_aln, 3):
        # The last codon may be shorter when the length is not a multiple of 3.
        codon_end = min(i + 3, len_aln)
        # aln[:, j] is column j as a string; aln[j] would be a row (record),
        # so the columns have to be addressed with the two-index form.
        if all(set(aln[:, j]) == only_gaps for j in range(i, codon_end)):
            continue
        codon_block = aln[:, i:i + 3]
        if new_aln is None:
            new_aln = codon_block
        else:
            new_aln = new_aln + codon_block
    if new_aln is None:
        # Every codon was all gaps: write the records with empty sequences
        # instead of handing None to AlignIO.write.
        new_aln = aln[:, 0:0]
    AlignIO.write(new_aln, output, "fasta")
    return output
def test_read_fasta(self):
    """Read Quality/example.fasta as an alignment and check rows, consensus and str()."""
    path = os.path.join(os.curdir, "Quality", "example.fasta")
    gapped_dna = Alphabet.Gapped(IUPAC.ambiguous_dna)
    alignment = AlignIO.read(path, "fasta", alphabet=gapped_dna)
    self.assertEqual(len(alignment), 3)

    # (description, sequence) expected for each row, in file order
    expected_rows = [
        ("EAS54_6_R1_2_1_413_324", "CCCTTCTTGTCTTCAGCGTTTCTCC"),
        ("EAS54_6_R1_2_1_540_792", "TTGGCAGGCCAAGGCCGATGGATCA"),
        ("EAS54_6_R1_2_1_443_348", "GTTGCTTCTGGCGTGGGTGGGGGGG"),
    ]
    for row_index, (description, letters) in enumerate(expected_rows):
        seq_record = alignment[row_index]
        self.assertEqual(seq_record.description, description)
        self.assertEqual(seq_record.seq, letters)
    self.assertEqual(alignment.get_alignment_length(), 25)

    summary = AlignInfo.SummaryInfo(alignment)
    consensus = summary.dumb_consensus(ambiguous="N", threshold=0.6)
    self.assertIsInstance(consensus, Seq)
    self.assertEqual(consensus, "NTNGCNTNNNNNGNNGGNTGGNTCN")

    expected_text = """\
Gapped(IUPACAmbiguousDNA(), '-') alignment with 3 rows and 25 columns
CCCTTCTTGTCTTCAGCGTTTCTCC EAS54_6_R1_2_1_413_324
TTGGCAGGCCAAGGCCGATGGATCA EAS54_6_R1_2_1_540_792
GTTGCTTCTGGCGTGGGTGGGGGGG EAS54_6_R1_2_1_443_348"""
    self.assertEqual(str(alignment), expected_text)
def parse_file(file_name, type = 'DNA'):
    """Parse the given file into a FastaAlignment object.

    Arguments:
    o file_name - The location of the file to parse.
    o type - The type of information contained in the file. One of
    'DNA', 'RNA' or 'PROTEIN' (matched case-insensitively).

    Raises ValueError for any other type. The input file is closed
    before returning, also when parsing fails.
    """
    kind = type.upper()
    if kind == 'DNA':
        alphabet = IUPAC.ambiguous_dna
    elif kind == 'RNA':
        alphabet = IUPAC.ambiguous_rna
    elif kind == 'PROTEIN':
        alphabet = IUPAC.protein
    else:
        raise ValueError("Invalid type %s passed. Need DNA, RNA or PROTEIN"
                         % type)

    # create a new alignment object
    fasta_align = FastaAlignment(Alphabet.Gapped(alphabet))

    # now parse the file and fill up the alignment object
    align_file = open(file_name, 'r')
    try:
        parser = Fasta.RecordParser()
        iterator = Fasta.Iterator(align_file, parser)

        # the loop stops on the first false value returned by next()
        cur_align = iterator.next()
        while cur_align:
            fasta_align.add_sequence(cur_align.title, cur_align.sequence)
            cur_align = iterator.next()
    finally:
        # release the handle even if a record fails to parse
        align_file.close()

    return fasta_align
def action(arguments):
    """
    Trim the alignment as specified
    """
    # Determine file format for input and output
    source_format = (arguments.source_format or
                     fileformat.from_handle(arguments.source_file))
    output_format = (arguments.output_format or
                     fileformat.from_handle(arguments.output_file))

    def parse_source():
        # A fresh parser over the source handle, from its current position.
        return SeqIO.parse(arguments.source_file, source_format,
                           alphabet=Alphabet.Gapped(
                               Alphabet.single_letter_alphabet))

    # The parsers are consumed while the source handle is still open.
    with arguments.source_file:
        # First pass: locate primers
        forward_span, reverse_span = locate_primers(
            parse_source(), arguments.forward_primer,
            arguments.reverse_primer, arguments.reverse_complement,
            arguments.max_hamming_distance)
        forward_start, forward_end = forward_span
        reverse_start, reverse_end = reverse_span

        # Generate slice indexes
        if arguments.include_primers:
            start, end = forward_start, reverse_end + 1
        else:
            start, end = forward_end + 1, reverse_start

        # Second pass: rewind the input file and apply the transformation
        arguments.source_file.seek(0)
        prune_action = _ACTIONS[arguments.prune_action]
        transformed_sequences = prune_action(parse_source(), start, end)

        with arguments.output_file:
            SeqIO.write(transformed_sequences, arguments.output_file,
                        output_format)
def setUp(self):
    """Create the DNA, RNA, nucleotide, protein and character fixtures."""
    # Short aliases for the constructors used throughout the fixtures.
    make_seq = Seq.Seq
    make_mutable = Seq.MutableSeq
    gapped = Alphabet.Gapped
    with_stop = Alphabet.HasStopCodon

    self.s = make_seq("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna)

    self.dna = [
        make_seq("ATCG", IUPAC.ambiguous_dna),
        make_seq("gtca", Alphabet.generic_dna),
        make_mutable("GGTCA", Alphabet.generic_dna),
        make_seq("CTG-CA", gapped(IUPAC.unambiguous_dna, "-")),
    ]

    self.rna = [
        make_seq("AUUUCG", IUPAC.ambiguous_rna),
        make_mutable("AUUCG", IUPAC.ambiguous_rna),
        make_seq("uCAg", Alphabet.generic_rna),
        make_mutable("UC-AG", gapped(Alphabet.generic_rna, "-")),
        make_seq("U.CAG", gapped(Alphabet.generic_rna, ".")),
    ]

    self.nuc = [make_seq("ATCG", Alphabet.generic_nucleotide)]

    self.protein = [
        make_seq("ATCGPK", IUPAC.protein),
        make_seq("atcGPK", Alphabet.generic_protein),
        make_seq("T.CGPK", gapped(IUPAC.protein, ".")),
        make_seq("T-CGPK", gapped(IUPAC.protein, "-")),
        make_seq(
            "MEDG-KRXR*",
            gapped(with_stop(IUPAC.extended_protein, "*"), "-")),
        make_mutable(
            "ME-K-DRXR*XU",
            gapped(with_stop(IUPAC.extended_protein, "*"), "-")),
        make_seq(
            "MEDG-KRXR@",
            with_stop(gapped(IUPAC.extended_protein, "-"), "@")),
        make_seq(
            "ME-KR@",
            with_stop(gapped(IUPAC.protein, "-"), "@")),
        make_seq(
            "MEDG.KRXR@",
            gapped(with_stop(IUPAC.extended_protein, "@"), ".")),
    ]

    self.test_chars = ["-", make_seq("-"), make_seq("*"), "-X@"]
def read_fasta(filename):
    """
    Read a FASTA alignment into an array with one row per record.

    Input: filename - name of the file
    Output: ndarray built from the per-record letter lists
    """
    gapped_protein = Alphabet.Gapped(Alphabet.IUPAC.protein)
    msa = AlignIO.read(filename, 'fasta', alphabet=gapped_protein)
    rows = []
    for rec in msa:
        # iterating a record yields its letters one at a time
        rows.append(list(rec))
    return np.array(rows, np.character)
def test_to_alignment(self):
    """to_alignment() is empty at first and has one row per tip once sequences are added."""
    tree = self.phyloxml.phylogenies[0]
    empty_aln = tree.to_alignment()
    self.assertTrue(isinstance(empty_aln, MultipleSeqAlignment))
    self.assertEqual(len(empty_aln), 0)

    # Add sequences to the terminals
    alphabet = Alphabet.Gapped(Alphabet.generic_dna)
    tip_letters = ('AA--TTA', 'AA--TTG', 'AACCTTC')
    for tip, seqstr in zip(tree.get_terminals(), tip_letters):
        record = SeqRecord(Seq(seqstr, alphabet), id=str(tip))
        tip.sequences.append(PX.Sequence.from_seqrecord(record))

    # Check the alignment
    filled_aln = tree.to_alignment()
    self.assertTrue(isinstance(filled_aln, MultipleSeqAlignment))
    self.assertEqual(len(filled_aln), 3)
    self.assertEqual(filled_aln.get_alignment_length(), 7)
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (MultipleSeqAlignment)"""
    # One (initially empty) row per key found at the first position.
    rows = dict((key, '') for key in align_dict.abs(1).pos_align_dict)

    # Walk the positions 1..len(align_dict) and extend each row by one letter.
    for position in range(1, len(align_dict) + 1):
        column = align_dict.abs(position).pos_align_dict
        for key in column:
            rows[key] += column[key].aa

    alpha = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = MultipleSeqAlignment([], alphabet=alpha)
    for key in sorted(rows):
        summary = sum_dict[key]
        record_id = summary.pdb2 + summary.chain2
        fssp_align.append(SeqRecord(Seq(rows[key], alpha), record_id))
    return fssp_align
def count_gaps_and_characters(aln_file, file_format = "fasta"):
    """
    counts the gap ("-") and non-gap characters over all records of an alignment
    :param aln_file: input alignment file
    :param file_format: input file format (default: fasta)
    :return: len(aln) (the number of records), number of gap chars, number of non-gap chars
    """
    aln_file = check_filename(aln_file)
    gapped_dna = Alphabet.Gapped(IUPAC.unambiguous_dna)
    aln = AlignIO.read(aln_file, file_format, alphabet=gapped_dna)
    total_gaps = 0
    total_letters = 0
    for record in aln:
        total_gaps += record.seq.count("-")
        total_letters += len(record.seq)
    # everything that is not a gap counts as a character
    total_not_gaps = total_letters - total_gaps
    return len(aln), total_gaps, total_not_gaps
def get_major_and_minor_consensus(aln_file, in_format="fasta"):
    """
    calculates major and minor consensus and each position's frequency
    - major consensus - the most prominent character (including "-")
    - minor consensus - the most prominent character (not including "-")
    both frequencies of a column are divided by the number of non-gap
    characters in that column, so a column made only of gaps raises
    ZeroDivisionError
    :param aln_file: alignment file path
    :param in_format: input alignment format (default: fasta)
    :return: major_consensus, major_freqs, minor_consensus, minor_freqs
             (two strings and two lists of frequencies rounded to 2 digits)
    """
    aln_file = check_filename(aln_file)
    aln = AlignIO.read(aln_file, in_format, alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))
    len_aln = len(aln[0])
    num_of_seq = len(aln)
    major_consensus = ""
    major_freqs = []
    minor_consensus = ""
    minor_freqs = []
    for i in range(len_aln):
        # character -> number of occurrences in column i
        counter = collections.Counter(aln[:, i])
        major_count = 0
        minor_count = 0
        major_char = ""
        minor_char = ""
        for j in counter:
            if counter[j] > major_count:
                major_count = counter[j]
                major_char = j
                # a non-gap character that takes the major lead also becomes
                # the minor candidate, recorded verbatim (no "N" substitution)
                if j != "-":
                    minor_count = counter[j]
                    minor_char = j
            # strict ">" keeps the first character seen when counts tie
            if counter[j] > minor_count and j != "-":
                # anything other than A/C/G/T is reported as "N" here
                if j not in ["A", "C", "G", "T"]:
                    minor_count = counter[j]
                    minor_char = "N"
                else:
                    minor_count = counter[j]
                    minor_char = j
        # Counter returns 0 for a character that does not occur
        gap_count = counter["-"]
        major_consensus += major_char
        # the denominator excludes gaps even when the major character is "-"
        major_freqs.append(round(major_count / (num_of_seq - gap_count), 2))
        minor_consensus += minor_char
        minor_freqs.append(round(minor_count / (num_of_seq - gap_count), 2))
    return major_consensus, major_freqs, minor_consensus, minor_freqs
def format_changer(filename, out_format, outfile= None, in_format="fasta"):
    """
    sequence file format changer
    reads a single alignment from filename and writes it in out_format
    :param filename: input sequence filename
    :param out_format: output format; also used as the file extension
                       when outfile is not given
    :param outfile: output file (default: None - the input path with its
                    extension replaced by out_format)
    :param in_format: input format (default: fasta)
    :return: out file path in out format
    """
    filename = check_filename(filename)
    if outfile is not None:
        outfile = check_filename(outfile, Truefile=False)
    else:
        outfile = path.splitext(filename)[0] + "." + out_format
    alignment = AlignIO.read(filename, in_format, alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))
    AlignIO.write(alignment, outfile, out_format)
    print("saved %s in format %s" % (outfile, out_format))
    return outfile
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (Bio.Align.Generic)"""
    # One (initially empty) row per key found at the first position.
    rows = {}
    for key in align_dict.abs(1).pos_align_dict:
        rows[key] = ''

    # Walk the positions 1..len(align_dict) and extend each row by one letter.
    last_position = len(align_dict)
    for position in range(1, last_position + 1):
        column = align_dict.abs(position).pos_align_dict
        for key in column:
            rows[key] += column[key].aa

    gapped_protein = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = Generic.Alignment(gapped_protein)
    for key in sorted(rows):
        summary = sum_dict[key]
        fssp_align.add_sequence(summary.pdb2 + summary.chain2, rows[key])
    # fssp_align._add_numbering_table()
    return fssp_align
def get_longest_sequence_name_in_fasta(aln_file, in_format="fasta"):
    """
    finds the record with the most non-gap characters in the alignment
    the first record wins on ties; "" is returned when no record has any
    non-gap character
    :param aln_file: input alignment file path
    :param in_format: input format (default = fasta)
    :return: name of the longest sequence in the alignment
    """
    aln_file = check_filename(aln_file)
    gapped_dna = Alphabet.Gapped(IUPAC.unambiguous_dna)
    aln = AlignIO.read(aln_file, in_format, alphabet=gapped_dna)
    best_length = 0
    best_name = ""
    for record in aln:
        ungapped_length = len(str(record.seq).replace("-", ""))
        if ungapped_length > best_length:
            best_length, best_name = ungapped_length, record.name
    return best_name
def cut_alignemnt_by_coordinates(aln_file, coor=[], perfix="cut", in_format="fasta"):
    """
    writes the columns coor[0]:coor[1] of an alignment to a new fasta file
    attention - the coordinates must be normalized to the specific alignment
    :param aln_file: input alignment file
    :param coor: input coordinates, start and end column (default: [])
    :param perfix: tag inserted into the output file name (default: cut)
    :param in_format: input alignment format (default: fasta)
    :return: output filename of cut alignment
    """
    if coor == []:
        raise Exception("no coordinates")
    aln_file = check_filename(aln_file)
    gapped_dna = Alphabet.Gapped(IUPAC.unambiguous_dna)
    aln = AlignIO.read(aln_file, in_format, alphabet=gapped_dna)
    first_column, end_column = coor[0], coor[1]
    new_aln = aln[:, first_column:end_column]
    # output name: everything before the first ".aln", then "_<perfix>.aln"
    base_name = aln_file.split(".aln")[0]
    suffix = "_%s.aln" % perfix
    output = base_name + suffix
    AlignIO.write(new_aln, output, "fasta")
    print("wrote cut alignemnt in %s" % output)
    return output
def parse_file(file_name, alphabet = IUPAC.unambiguous_dna, debug_level = 0):
    """Parse the given file into a clustal aligment object.

    Arguments:
    o file_name - The name of the file to parse.
    o alphabet - The type of alphabet to use for the alignment sequences.
    This should correspond to the type of information contained in the file.
    Defaults to be unambiguous_dna sequence. It is wrapped in
    Alphabet.Gapped unless it already is a Gapped alphabet.

    There is a deprecated optional argument debug_level which has no effect.

    Since Biopython 1.46, this has called Bio.AlignIO internally.
    The input file is closed before returning, also when parsing fails.
    """
    # Avoid code duplication by calling Bio.AlignIO to do this for us.
    from Bio import AlignIO
    handle = open(file_name, 'r')
    try:
        generic_alignment = AlignIO.read(handle, "clustal")
    finally:
        # release the handle even if the file is not valid clustal output
        handle.close()

    #Force this generic alignment into a ClustalAlignment... nasty hack
    if isinstance(alphabet, Alphabet.Gapped):
        alpha = alphabet
    else:
        alpha = Alphabet.Gapped(alphabet)
    clustal_alignment = ClustalAlignment(alpha)
    # the records are shared with (not copied from) the generic alignment
    clustal_alignment._records = generic_alignment._records
    for record in clustal_alignment._records:
        record.seq.alphabet = alpha

    try:
        clustal_alignment._version = generic_alignment._version
    except AttributeError:
        #Missing the version, could be a 3rd party tool's output
        pass

    try:
        clustal_alignment._star_info = generic_alignment._star_info
    except AttributeError:
        #Missing the consensus, again, this is not always present
        pass

    return clustal_alignment
def unalign(filename, in_format="fasta", gap = "-", outfile = None):
    """
    unaligns file - removes the gap character from every sequence
    the result is always written in fasta format
    :param filename: input alignment filename
    :param in_format: input format (default: fasta)
    :param gap: gap type (default: - )
    :param outfile: output file path (default: None - the input path with
                    its extension replaced by "-unaligned.fasta")
    :return: out file path without gaps
    """
    filename = check_filename(filename)
    alignment = AlignIO.read(filename, in_format, alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))
    for seq in alignment:
        seq.seq = seq.seq.ungap(gap)
    if outfile is None:
        outfile = path.splitext(filename)[0] + "-unaligned.fasta"
    else:
        # NOTE(review): the sibling helpers pass Truefile=False here;
        # confirm check_filename treats None the same way.
        outfile = check_filename(outfile, Truefile=None)
    # the records no longer share one length, so SeqIO is used for writing
    SeqIO.write(alignment, outfile, "fasta")
    print("saved unaligned %s" % outfile)
    return outfile
def get_consensus_from_alignment(aln_file, in_format="fasta"):
    """
    builds a consensus from the most frequent character of every column
    columns whose most frequent character is "-" add nothing to the result
    :param aln_file: alignment file
    :param in_format: file format (default: fasta)
    :return: consensus sequence
    """
    aln_file = check_filename(aln_file)
    gapped_dna = Alphabet.Gapped(IUPAC.unambiguous_dna)
    aln = AlignIO.read(aln_file, in_format, alphabet=gapped_dna)
    consensus_chars = []
    for column_index in range(len(aln[0])):
        column_counts = collections.Counter(aln[:, column_index])
        # max() keeps the first character seen when counts tie
        if column_counts:
            top_char = max(column_counts, key=column_counts.get)
        else:
            top_char = ""
        if top_char != "-":
            consensus_chars.append(top_char)
    return "".join(consensus_chars)
def get_consensus_percentage(aln_file, in_format="fasta"):
    """
    gets alignment file and returns the consensus and the percentage of each position in the alignment
    the percentage calculation ignores gaps
    for every column the share of the most frequent non-gap character among
    the non-gap characters is rounded to one digit and counted in a
    histogram; the histogram is finally divided by the alignment length
    columns made only of gaps add nothing to the consensus or the histogram
    :param aln_file: input alignment file path
    :param in_format: input file format (default: fasta)
    :return: consensus sequence and consensus percentage (dict mapping the
             rounded share to the fraction of columns, rounded to 3 digits)
    """
    aln_file = check_filename(aln_file)
    aln = AlignIO.read(aln_file, in_format, alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))
    len_aln = len(aln[0])
    num_of_seq = len(aln)
    consensus_percentage = {1:0, 0.9:0, 0.8:0, 0.7:0, 0.6:0, 0.5:0, 0.4:0, 0.3:0, 0.2:0}
    consensus = ""
    for i in range(len_aln):
        counter = collections.Counter(aln[:, i])
        count = 0
        max_char = ""
        for j in counter:
            # strict ">" keeps the first character seen when counts tie
            if j != "-" and counter[j] > count:
                count = counter[j]
                max_char = j
        # Counter returns 0 for a character that does not occur
        non_gap_count = num_of_seq - counter["-"]
        if non_gap_count == 0:
            # all-gap column: there is no consensus character and the share
            # is undefined, so skip it instead of dividing by zero
            continue
        percentage = round(count / non_gap_count, 1)
        # shares below 0.2 have no preset bin; create it on demand
        consensus_percentage[percentage] = consensus_percentage.get(percentage, 0) + 1
        consensus += max_char
    if len_aln:
        for n in consensus_percentage:
            consensus_percentage[n] = round(consensus_percentage[n] / len_aln, 3)
    return consensus, consensus_percentage
def setUp(self):
    """Create the DNA, RNA, nucleotide and protein fixtures (Seq objects plus plain strings)."""
    # Short aliases for the constructors used throughout the fixtures.
    make_seq = Seq.Seq
    make_mutable = Seq.MutableSeq
    gapped = Alphabet.Gapped
    with_stop = Alphabet.HasStopCodon

    self.dna = [
        make_seq("ATCG", IUPAC.ambiguous_dna),
        make_seq("gtca", Alphabet.generic_dna),
        make_mutable("GGTCA", Alphabet.generic_dna),
        make_seq("CTG-CA", gapped(IUPAC.unambiguous_dna, "-")),
        "TGGTCA",
    ]

    self.rna = [
        make_seq("AUUUCG", IUPAC.ambiguous_rna),
        make_mutable("AUUCG", IUPAC.ambiguous_rna),
        make_seq("uCAg", Alphabet.generic_rna),
        make_mutable("UC-AG", gapped(Alphabet.generic_rna, "-")),
        make_seq("U.CAG", gapped(Alphabet.generic_rna, ".")),
        "UGCAU",
    ]

    self.nuc = [
        make_seq("ATCG", Alphabet.generic_nucleotide),
        "UUUTTTACG",
    ]

    self.protein = [
        make_seq("ATCGPK", IUPAC.protein),
        make_seq("atcGPK", Alphabet.generic_protein),
        make_seq("T.CGPK", gapped(IUPAC.protein, ".")),
        make_seq("T-CGPK", gapped(IUPAC.protein, "-")),
        make_seq(
            "MEDG-KRXR*",
            gapped(with_stop(IUPAC.extended_protein, "*"), "-")),
        make_mutable(
            "ME-K-DRXR*XU",
            gapped(with_stop(IUPAC.extended_protein, "*"), "-")),
        "TEDDF",
    ]
def get_alphabet(self):
    """Look up the alphabet for self.type, gapped when mol_seq is aligned."""
    # unknown types fall back to the generic alphabet
    alph = self.alphabets.get(self.type, Alphabet.generic_alphabet)
    is_aligned = self.mol_seq and self.mol_seq.is_aligned
    return Alphabet.Gapped(alph) if is_aligned else alph
print repr(test_seq[1::3]) print repr(test_seq[2::3]) print "Setting wobble codon to N (set slice with stride 3):" test_seq[2::3] = "N" * len(test_seq[2::3]) print repr(test_seq) ########################################################################### print print "Testing Seq addition" print "====================" dna = [ Seq.Seq("ATCG", IUPAC.ambiguous_dna), Seq.Seq("gtca", Alphabet.generic_dna), Seq.MutableSeq("GGTCA", Alphabet.generic_dna), Seq.Seq("CTG-CA", Alphabet.Gapped(IUPAC.unambiguous_dna, "-")), "TGGTCA" ] rna = [ Seq.Seq("AUUUCG", IUPAC.ambiguous_rna), Seq.MutableSeq("AUUCG", IUPAC.ambiguous_rna), Seq.Seq("uCAg", Alphabet.generic_rna), Seq.MutableSeq("UC-AG", Alphabet.Gapped(Alphabet.generic_rna, "-")), Seq.Seq("U.CAG", Alphabet.Gapped(Alphabet.generic_rna, ".")), "UGCAU" ] nuc = [Seq.Seq("ATCG", Alphabet.generic_nucleotide), "UUUTTTACG"] protein = [ Seq.Seq("ATCGPK", IUPAC.protein), Seq.Seq("atcGPK", Alphabet.generic_protein), Seq.Seq("T.CGPK", Alphabet.Gapped(IUPAC.protein, ".")), Seq.Seq("T-CGPK", Alphabet.Gapped(IUPAC.protein, "-")), Seq.Seq(
def get_alphabet(self):
    """Return the alphabet matching self.type, gapped for aligned sequences."""
    base_alphabet = self.alphabets.get(self.type, Alphabet.generic_alphabet)
    # only a present and aligned mol_seq gets the gapped wrapper
    if not (self.mol_seq and self.mol_seq.is_aligned):
        return base_alphabet
    return Alphabet.Gapped(base_alphabet)
lineno, file=None, line=None): #TODO - Have Biopython DataLossWarning? if category in [UserWarning]: print "%s - %s" % (category.__name__, message) warnings.showwarning = send_warnings_to_stdout protein_alphas = [Alphabet.generic_protein] dna_alphas = [Alphabet.generic_dna] rna_alphas = [Alphabet.generic_rna] nucleotide_alphas = [ Alphabet.generic_nucleotide, Alphabet.Gapped(Alphabet.generic_nucleotide) ] no_alpha_formats = [ "fasta", "clustal", "phylip", "phylip-relaxed", "phylip-sequential", "tab", "ig", "stockholm", "emboss", "fastq", "fastq-solexa", "fastq-illumina", "qual" ] possible_unknown_seq_formats = ["qual", "genbank", "gb", "embl", "imgt"] #List of formats including alignment only file formats we can read AND write. #The list is initially hard coded to preserve the original order of the unit #test output, with any new formats added since appended to the end. test_write_read_alignment_formats = [ "fasta", "clustal", "phylip", "stockholm", "phylip-relaxed" ] for format in sorted(SeqIO._FormatToWriter):
print consensus
# Same summary again, this time with the gap-aware consensus.
consensus = summary.gap_consensus(ambiguous="N")
print consensus
print
print summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                        axis_seq=consensus)
print
#Have a generic alphabet, without a declared gap char, so must
#provide the frequencies and chars to ignore explicitly.
print summary.information_content(e_freq_table=expected,
                                  chars_to_ignore=['-'])
print
print "Trying a protein sequence with gaps and stops"

# Protein alphabet declaring both a gap ("-") and a stop ("*") symbol.
alpha = Alphabet.HasStopCodon(
    Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
a = Alignment(alpha)
a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
print a
# Ruler as wide as the alignment.
print "=" * a.get_alignment_length()

s = SummaryInfo(a)
c = s.dumb_consensus(ambiguous="X")
print c
c = s.gap_consensus(ambiguous="X")
print c
print
# Both the gap and the stop symbol are left out of the matrix.
print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'],
                                  axis_seq=c)
#!/usr/bin/env python #coding: utf-8 from Bio import AlignIO, SeqIO, Align, Alphabet import pandas as pd import os, re, sys from copy import deepcopy aln_alphabet = Alphabet.Gapped(Alphabet.IUPAC.ambiguous_dna) aln_folder = '/work/abg_tree/concatenated_trees/3rd_try/alignments' output_folder = '/work/abg_tree/concatenated_trees/3rd_try' genomes = {} for aln in os.listdir(aln_folder): alignment = AlignIO.read('%s/%s' %(aln_folder, aln), 'fasta') genomes[aln] = set() for entry in alignment: if re.match('GC[AF]_', entry.name): genome, gene = entry.name.split('|') else: genome, gene = entry.name.split('_') if genome in genomes[aln]: sys.exit('\t**Error, duplicated genome in %s: %s' %(aln, genome)) genomes[aln].add(genome) genome_union = set.union(*genomes.values()) missing_genes = {} # just to keep track of the number of missing marker genes in each genome concatenation = {}