def writeNoGaps(self, outputfile):
    """Write all records of ``self.pathToFasta`` to ``outputfile`` without gaps.

    Every record is parsed with a gapped extended-IUPAC protein alphabet,
    its "-" characters are removed, and the resulting sequences are written
    in FASTA format. Only the record id is carried over.
    """
    from Bio import AlignIO, SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import IUPAC, Gapped

    gapped_protein = Gapped(IUPAC.ExtendedIUPACProtein(), "-")
    ungapped_records = []
    for entry in SeqIO.parse(self.pathToFasta, "fasta",
                             alphabet=gapped_protein):
        residues = str(entry.seq).replace("-", "")
        ungapped = Seq(residues, alphabet=IUPAC.ExtendedIUPACProtein())
        ungapped_records.append(SeqRecord(seq=ungapped, id=entry.id))
    SeqIO.write(ungapped_records, outputfile, "fasta")
def getFastaEntry(self, entryNum, resultFolder):
    """Write one ungapped entry of the alignment to its own FASTA file.

    :param entryNum: 1-based index of the wanted entry within the first
        alignment found in ``self.pathToFasta``.
    :param resultFolder: directory receiving ``Query_<entryNum>.faa``.
    :returns: path of the written file, or ``None`` when ``entryNum`` does
        not address an existing entry.
    """
    from Bio import AlignIO, SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import IUPAC, Gapped

    fastaFile = self.pathToFasta
    alignmentIterator = AlignIO.parse(
        fastaFile, "fasta",
        alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
    queryFasta = resultFolder + "/" + "Query_%d.faa" % (entryNum, )

    # The next() builtin works on Python 2.6+ and 3; the iterator's .next()
    # method only exists on Python 2.
    alignment = next(alignmentIterator)

    # Indexes below 1 are rejected too: 0 or a negative value would
    # otherwise wrap around through negative indexing and return the wrong
    # entry without any error.
    if entryNum < 1 or entryNum > len(alignment):
        return None

    entry = alignment[entryNum - 1]
    desiredSeqString = str(entry.seq).replace("-", "")
    seqNoGaps = Seq(desiredSeqString, alphabet=IUPAC.ExtendedIUPACProtein())
    seqRecNoGaps = SeqRecord(seq=seqNoGaps, id=entry.id)
    SeqIO.write(seqRecNoGaps, queryFasta, "fasta")
    return queryFasta
def main():
    """Scan one test sequence or a FASTA file with every loaded PWM.

    Options come from ``getoptions()``. When ``opts.testseq`` is given, that
    single sequence is scanned; otherwise every record of the FASTA file
    named by ``args[0]`` is scanned. RNA input (``opts.seqtype == 'RNA'``)
    is back-transcribed to DNA first. Each result table is printed to
    stdout as tab-separated text; progress messages go to stderr.
    """
    (opts, args) = getoptions()

    # Load PWMs
    pssms = load_motifs(opts.pwm_dir, opts.pseudocount)

    def as_dna(seq):
        # Back-transcribe RNA and relabel the alphabet as unambiguous DNA.
        seq = seq.back_transcribe()
        seq.alphabet = IUPAC.IUPACUnambiguousDNA()
        return seq

    if opts.testseq is not None:
        if opts.seqtype == 'RNA':
            seq = as_dna(Seq(opts.testseq, IUPAC.IUPACUnambiguousRNA()))
        else:
            seq = Seq(opts.testseq, IUPAC.IUPACUnambiguousDNA())
        final = scan_all(pssms, seq, opts)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(final.to_csv(sep="\t", index=False))
    else:
        # Scan in sequence
        sys.stderr.write("Scanning sequences ")
        sys.stderr.flush()
        tic = time.time()
        # The context manager closes the input file even if scanning fails.
        with open(args[0]) as handle:
            for seqrecord in SeqIO.parse(handle, "fasta"):
                seq = seqrecord.seq
                if opts.seqtype == "RNA":
                    seq = as_dna(seq)
                final = scan_all(pssms, seq, opts)
                print(final.to_csv(sep="\t", index=False))
        toc = time.time()
        sys.stderr.write("done in %0.2f seconds!\n" % (float(toc - tic)))
def setUp(self):
    """Build one codon alignment per fixture and store them in ``self.alns``.

    Each fixture is indexed as ``fixture[0]`` (file paths) and
    ``fixture[1]`` (how the nucleotide file is loaded: "parse", "index"
    or "id").
    """
    self.aln_file = [TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3,
                     TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6]
    built = []
    for fixture in self.aln_file:
        if fixture[1] == "parse":
            nucl = SeqIO.parse(fixture[0][0], "fasta",
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(fixture[0][1], "clustal",
                                alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                caln = codonalign.build(
                    prot, nucl,
                    alphabet=codonalign.default_codon_alphabet)
        elif fixture[1] == "index":
            # Deliberately using a fancy protein alphabet for testing:
            nucl = SeqIO.index(fixture[0][0], "fasta",
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(fixture[0][1], "clustal",
                                alphabet=generic_protein)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                caln = codonalign.build(
                    prot, nucl,
                    alphabet=codonalign.default_codon_alphabet,
                    max_score=20)
            nucl.close()  # Close the indexed FASTA file
        elif fixture[1] == "id":
            nucl = SeqIO.parse(fixture[0][0], "fasta",
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(fixture[0][1], "clustal",
                                alphabet=IUPAC.protein)
            # Two-column file: first field maps to second field.
            id_map = {}
            with open(fixture[0][2]) as handle:
                for line in handle:
                    fields = line.split()
                    id_map[fields[0]] = fields[1]
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                caln = codonalign.build(
                    prot, nucl, corr_dict=id_map,
                    alphabet=codonalign.default_codon_alphabet)
        built.append(caln)
    self.alns = built
def get_seq_record(record, start, stop, description):
    """Return a SeqRecord for query between start and stop.

    Given a sam record, find query sequences that cover the [start, stop]
    interval completely and create a SeqRecord object with sequence in the
    reference orientation.

    :param record: aligned read providing ``get_aligned_pairs``,
        ``get_forward_sequence``, ``is_reverse``, ``query_name`` and
        ``query_length`` (presumably a pysam AlignedSegment; verify
        against the caller).
    :param start: reference position that must be mapped by the read.
    :param stop: reference position that must be mapped by the read.
    :param description: text placed before the direction tag in the
        SeqRecord description.
    :returns: a SeqRecord whose description is ``"<description>|f"`` for
        forward reads or ``"<description>|rc"`` for reverse reads, or
        ``None`` when either endpoint is not mapped by the read.
    """
    # get the query positions of the bases mapped to start, stop
    # currently only extracts from reads where both endpoints are mapped
    positions = record.get_aligned_pairs(matches_only=True)
    first_position = [item[0] for item in positions if item[1] == start]
    last_position = [item[0] for item in positions if item[1] == stop]

    if not (first_position and last_position):
        return None

    # fetch and reorient sequence
    name = record.query_name
    if not record.is_reverse:
        seq = Seq(
            record.get_forward_sequence()
            [first_position[0]:last_position[0]],
            IUPAC.IUPACUnambiguousDNA())
        direction = 'f'
    else:
        # get_forward_sequence() is indexed from the opposite end for a
        # reverse read, so mirror the coordinates before slicing and then
        # reverse-complement the slice.
        length = record.query_length
        seq = Seq(
            record.get_forward_sequence()[length - last_position[0]:length -
                                          first_position[0]],
            IUPAC.IUPACUnambiguousDNA()).reverse_complement()
        direction = 'rc'

    # Use the tag itself: indexing the string with record.is_reverse
    # turned 'rc' into 'c' for reverse reads.
    return SeqRecord(seq, id=name,
                     description=('|').join([description, direction]))
def chgAlpha(self, newAlpha):
    """Accepts 'DNA' 'RNA' or 'protein' or an alphabet object

    Rebuilds ``self.seq`` with the selected alphabet, records the alphabet
    in ``self.typ`` and then calls ``self.checkAlpha()``.

    :raises NameError: if ``newAlpha`` is a string other than 'DNA', 'RNA'
        or 'protein'.
    """
    from Bio.Seq import Seq
    from Bio.Alphabet import IUPAC

    try:
        string_types = basestring  # Python 2: covers str and unicode
    except NameError:
        string_types = str

    if newAlpha == "DNA":
        alpha = IUPAC.IUPACUnambiguousDNA()
    elif newAlpha == "RNA":
        # Previously this selected the DNA alphabet by mistake.
        alpha = IUPAC.IUPACUnambiguousRNA()
    elif newAlpha == "protein":
        alpha = IUPAC.IUPACProtein()
    elif isinstance(newAlpha, string_types):
        raise NameError("type not 'DNA', 'RNA', or 'protein'")
    else:
        # Anything that is not a string is taken to be an alphabet object,
        # as promised by the docstring.
        alpha = newAlpha

    self.typ = alpha
    # str() yields the plain sequence text; Seq.tostring() has been removed
    # from newer Biopython releases.
    self.seq = Seq(str(self.seq), alpha)
    self.checkAlpha()
def setUp(self):
    """Build one codon alignment per fixture and store them in ``self.alns``.

    Each fixture is indexed as ``fixture[0]`` (file paths) and
    ``fixture[1]`` (how the nucleotide file is loaded: 'parse', 'index'
    or 'id').
    """
    self.aln_file = [TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3,
                     TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6]
    built = []
    for fixture in self.aln_file:
        if fixture[1] == 'parse':
            nucl = SeqIO.parse(fixture[0][0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(fixture[0][1], 'clustal',
                                alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl,
                    alphabet=codonalign.default_codon_alphabet)
        elif fixture[1] == 'index':
            # Deliberately using a fancy protein alphabet for testing:
            nucl = SeqIO.index(fixture[0][0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(
                fixture[0][1], 'clustal',
                alphabet=Gapped(IUPAC.ExtendedIUPACProtein()))
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl,
                    alphabet=codonalign.default_codon_alphabet,
                    max_score=20)
        elif fixture[1] == 'id':
            nucl = SeqIO.parse(fixture[0][0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(fixture[0][1], 'clustal',
                                alphabet=IUPAC.protein)
            # Two-column file: first field maps to second field.
            id_map = {}
            with open(fixture[0][2]) as handle:
                for line in handle:
                    fields = line.split()
                    id_map[fields[0]] = fields[1]
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, corr_dict=id_map,
                    alphabet=codonalign.default_codon_alphabet)
        built.append(caln)
        nucl.close()  # Close the indexed FASTA file
    self.alns = built
def load_csv_file(file, delimiter=";"):
    """
    This function loads a "Primer" file.

    The first row must contain exactly the seven column names listed in
    ``pos`` below, each once and in any order. Data rows with a wrong
    number of fields, or whose primer pair fails
    ``check_primer_pair_integrity``, are skipped with a logged warning.
    The reverse primer is stored reverse-complemented.

    @param file: path of the CSV file.
    @param delimiter: field separator, ";" by default.
    @returns: dict mapping each primer pair id to its PrimerPair instance
    @raises ValueError: if the header row is missing, has the wrong number
        of columns, repeats a column or contains an unknown column name.
    """
    pos = {
        "id": 0,
        "forwardPrimer": 0,
        "reversePrimer": 0,
        "fPDNA": 0,
        "rPDNA": 0,
        "ampliconMinLength": 0,
        "ampliconMaxLength": 0,
    }
    header_len = len(pos)
    primer_dict = {}

    with open(file, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=delimiter)

        # An empty file yields no header row; report it like any other
        # malformed header instead of leaking StopIteration.
        headers = next(csvreader, None)
        if headers is None or len(headers) != header_len:
            raise ValueError("Wrong header")
        for column, header in enumerate(headers):
            if header not in pos:
                raise ValueError("Unknown header " + header)
            pos[header] = column
        # Seven known names with a repeat means another column is missing
        # and would silently stay mapped to index 0.
        if len(set(headers)) != header_len:
            raise ValueError("Wrong header")

        # The header is line 1, so data rows are numbered from 2.
        for line_number, row in enumerate(csvreader, start=2):
            if len(row) != header_len:
                logging.warning("Wrong primer pair in line "
                                + str(line_number))
                continue

            fprimer = SeqRecord(Seq(row[pos["fPDNA"]],
                                    IUPAC.IUPACAmbiguousDNA()))
            fprimer.id = row[pos["forwardPrimer"]]

            rprimer = SeqRecord(Seq(row[pos["rPDNA"]],
                                    IUPAC.IUPACAmbiguousDNA()))
            if (True):  # TODO
                rprimer = rprimer.reverse_complement()
            rprimer.id = row[pos["reversePrimer"]]

            primer_pair = PrimerPair(row[pos["id"]], fprimer, rprimer,
                                     int(row[pos["ampliconMinLength"]]),
                                     int(row[pos["ampliconMaxLength"]]))
            if check_primer_pair_integrity(primer_pair):
                primer_dict[row[pos["id"]]] = primer_pair
            else:
                logging.warning("Skipping primer pair " + primer_pair.id
                                + ", bad sequence")

    return primer_dict
def setUp(self):
    """Create two protein alignments and their matching nucleotide lists.

    Stored as ``self.aln1`` / ``self.seqlist1`` and ``self.aln2`` /
    ``self.seqlist2``.
    """
    def dna(letters, name):
        # Nucleotide record with an unambiguous DNA alphabet.
        return SeqRecord(
            Seq(letters, alphabet=IUPAC.IUPACUnambiguousDNA()), id=name)

    def protein(letters, name):
        # Protein record with the IUPAC protein alphabet.
        return SeqRecord(Seq(letters, alphabet=IUPAC.protein), id=name)

    # Test set 1
    seq1 = dna('TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG', 'pro1')
    seq2 = dna('TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG', 'pro2')
    pro1 = protein('SGTARTKLLLLLAALCAAGGALE', 'pro1')
    pro2 = protein('SGTSRTKRLLLLAALGAAGGALE', 'pro2')
    self.aln1 = MultipleSeqAlignment([pro1, pro2])
    self.seqlist1 = [seq1, seq2]

    # Test set 2
    # M K K H E L(F)L C Q G T S N K L T Q(L)L G T F E D H F L S L Q R M F N N C E V V
    # NOTE(review): seq4 and seq5 appear to be copies of seq3 with a few
    # bases removed -- confirm against the tests that use them.
    seq3 = dna('ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC', 'pro1')
    seq4 = dna('ATGAAAAAGCACGAGTTCTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAATGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC', 'pro2')
    seq5 = dna('ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC', 'pro3')
    pro3 = protein('MKKHELLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL', 'pro1')
    pro4 = protein('MKKHEFLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL', 'pro2')
    pro5 = protein('MKKHELLCQGTSNKLTLLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL', 'pro3')
    self.aln2 = MultipleSeqAlignment([pro3, pro4, pro5])
    self.seqlist2 = [seq3, seq4, seq5]
def sequence(ungapped, position, length):
    """
    Given an ungapped sequence and a positive or negative number (position),
    return the nucleotide at that position plus [length] nucleotides in the
    positive direction.

    For a negative position the sliced text is passed through
    ``getNegative`` before being returned.
    """
    dna = Seq(str(ungapped.seq), IUPAC.IUPACUnambiguousDNA())
    window = str(dna[position:position + length])
    if position < 0:
        return getNegative(window)
    return window
def writeFastas(self, pathToCompleteFastaAlignment, outputPath):
    """
    Writes individual clades as fasta alignments, where the files are stored
    in the output path and the name of each file is composed of
    <clade_name>.fas .

    Data is obtained from the complete fasta alignment provided
    (pathToCompleteFastaAlignment) and the mappings given to the
    constructor. The mappings are a dictionary of fasta ids to clade names.
    Elements in the alignment not present in the mappings won't end in any
    clade fasta file.

    :param pathToCompleteFastaAlignment: path of the FASTA alignment to split.
    :param outputPath: directory receiving one ``<clade_name>.fas`` per clade.
    """
    from Bio import SeqIO, AlignIO
    from Bio.Alphabet import IUPAC, Gapped

    try:
        # create file handles for each clade; set() ensures each clade file
        # is opened once even though many ids map to the same clade.
        for cladeName in set(self._mappings.values()):
            # "w": the files are written to, so read mode cannot be used.
            self._clade2fileHandle[cladeName] = open(
                outputPath + "/" + cladeName + ".fas", "w")

        # read complete alignment
        alignment = AlignIO.read(
            pathToCompleteFastaAlignment, "fasta",
            alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))

        for record in alignment:
            # Entries without a mapping are skipped, as documented.
            if record.id not in self._mappings:
                continue
            handle = self._clade2fileHandle[self._mappings[record.id]]
            SeqIO.write(record, handle, "fasta")
    finally:
        # Close every handle even when reading or writing failed.
        for handle in self._clade2fileHandle.values():
            handle.close()
def gb(self):
    """Return this object formatted as GenBank text.

    The record id and name are the first eight characters of
    ``self.name``. On a circular shape (``self.shape == 'c'``), a feature
    whose end lies beyond the sequence length is split into two locations
    joined as a compound location.
    """
    total_length = self.length()
    record = SeqRecord(
        Seq(self.sequence(), IUPAC.IUPACUnambiguousDNA()),
        id=self.name[0:8],
        name=self.name[0:8],
        description=self.description,
    )
    record.features = []

    for feat in self.features():
        strand = 1 if feat.direction == 'f' else -1

        if self.shape == 'c' and feat.end > total_length:
            # Piece running up to the end of the sequence, and piece
            # continuing from position 0.
            head = FeatureLocation(ExactPosition(feat.start),
                                   ExactPosition(total_length), strand)
            tail = FeatureLocation(ExactPosition(0),
                                   ExactPosition(feat.end - total_length),
                                   strand)
            ordered = [head, tail] if strand == 1 else [tail, head]
            location = CompoundLocation(ordered)
        else:
            location = FeatureLocation(ExactPosition(feat.start),
                                       ExactPosition(feat.end), strand)

        qualifiers = {q.name: q.data for q in feat.qualifiers.all()}
        record.features.append(
            SeqFeature(location, feat.type, qualifiers=qualifiers))

    return record.format('genbank')
def genome_to_seqrecord(phage_genome):
    """Creates a SeqRecord object from a pdm_utils Genome object.

    :param phage_genome: A pdm_utils Genome object.
    :type phage_genome: Genome
    :returns: A BioPython SeqRecord object
    :rtype: SeqRecord
    :raises AssertionError: if ``phage_genome`` is None.
    :raises AttributeError: re-raised, after printing a message, when the
        record cannot be built from the genome's ``seq`` attribute.
    """
    # Identity check: "!= None" would go through any custom comparison
    # defined on the genome object.
    assert phage_genome is not None,\
        "Genome object passed is None and not initialized"

    try:
        record = SeqRecord(phage_genome.seq)
        record.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
    except AttributeError:
        print("Genome object failed to be converted to SeqRecord.",
              "Genome valid attribute 'seq' is required to",
              "convert to SeqRecord object.")
        raise

    record.name = phage_genome.name
    # Keep the SeqRecord's default id when the genome has no accession.
    if phage_genome.accession != "":
        record.id = phage_genome.accession
    record.features = get_seqrecord_features(phage_genome)
    record.description = get_seqrecord_description(phage_genome)
    record.annotations = get_seqrecord_annotations(phage_genome)

    return record
def testLimit(self, list_seqs, start):
    """
    Extract the aa sequences in the window and flag each column.

    list_seqs is the list of sequence id in the alignment (not the id
    associated with the Bio.Seq object).
    start is the index of the start of the window.

    Returns one boolean per amino-acid column of the window: True when the
    most common residue's share among the selected sequences reaches
    ``self.min_aa_ratio``.
    """
    frame = start % 3
    aa_window_length = int(self.window_length / 3)
    begin = int((start - frame) / 3)
    end = int(begin + aa_window_length)

    # Pick the translated alignment matching the reading frame.
    if frame == 0:
        t_align = self.t_align0
    else:
        t_align = self.t_align1 if frame == 1 else self.t_align2

    sub_align = MulAlign([], Gapped(IUPAC.ExtendedIUPACProtein(), "N"))
    for idx in list_seqs:
        sub_align.append(t_align[idx][begin:end])

    result = []
    for column_index in range(aa_window_length):
        tallies = Counter(sub_align[:, column_index])
        # count the most common aa
        nbr_most_common = tallies.most_common(1)[0][1]
        conserved = nbr_most_common / len(list_seqs) >= self.min_aa_ratio
        result.append(True if conserved else False)
    return result
def stage_one_trimming(alignment, window_size, proportion, threshold, min_len):
    """
    ---------------------------------------------------------------------
    MODIFIED FUNCTION FROM PHYLUCE: generic_align.py
    ---------------------------------------------------------------------
    First stage alignment trimming to find and trim edges of a given
    alignment. Calls running_average function above to determine reasonable
    alignment start and end trimming for the entire alignment block.

    Returns the trimmed MultipleSeqAlignment, or None as soon as the
    trimming window is unusable or any trimmed sequence consists only of
    '-', only of '?', or is shorter than ``min_len``.
    Note: the alphabet of every visited input sequence is set in place.
    """
    start, end = running_average(alignment, window_size, proportion, threshold)
    s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    for sequence in alignment:
        sequence.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
        if not (start >= 0 and end):
            return None
        trim = sequence[start:end]
        residues = set(trim)
        # Compare against sets on both sides: the previous comparison with
        # the list ['?'] was always unequal, so all-'?' rows slipped through.
        if residues == {'-'} or residues == {'?'} or len(trim) < min_len:
            return None
        s1_trimmed.append(trim)
    return s1_trimmed
def find_gapped_columns(align, cfg):
    """Remove all columns that contain more gaps than the cfg setting.

    Each column is evaluated on its own: the share of "N", "n" and "-"
    characters is compared with ``cfg["max_gap_proportion"]``.

    :param align: alignment supporting ``len()``, ``get_alignment_length()``,
        column access via ``align[:, index]`` and iteration over records.
    :param cfg: mapping providing ``"max_gap_proportion"``.
    :returns: ``align`` itself when no column exceeds the limit; otherwise
        a list of new SeqRecord objects without the offending columns
        (name and id kept, description emptied).
    """
    max_gap_proportion = cfg["max_gap_proportion"]
    nbr_sequences = len(align)
    len_align = align.get_alignment_length()

    # A set gives O(1) membership tests when filtering the kept columns.
    columns_to_remove = set()
    for index in range(len_align):
        column = str(align[:, index]).replace("n", "N")
        gap_freq = (column.count("N") + column.count("-")) / nbr_sequences
        if gap_freq > max_gap_proportion:
            columns_to_remove.add(index)

    if not columns_to_remove:
        return align

    idxs = [x for x in range(len_align) if x not in columns_to_remove]
    trimmed_records = []
    for rec in align:
        letters = str(rec.seq)
        new_seq = "".join([letters[i] for i in idxs])
        new_rec = SeqRecord(
            Seq(new_seq, IUPAC.IUPACAmbiguousDNA()),
            name=rec.name,
            id=rec.id,
            description="",
        )
        trimmed_records.append(new_rec)
    return trimmed_records
def setUp(self):
    """Build the codon alignment for TEST_ALIGN_FILE6 into ``self.aln``.

    The third fixture file supplies the id correspondence passed as
    ``corr_dict``.
    """
    paths = TEST_ALIGN_FILE6[0]
    nucl = SeqIO.parse(paths[0], 'fasta',
                       alphabet=IUPAC.IUPACUnambiguousDNA())
    prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)

    # Two-column file: first field maps to second field.
    id_corr = {}
    with open(paths[2]) as handle:
        for line in handle:
            fields = line.split()
            id_corr[fields[0]] = fields[1]

    self.aln = CodonAlign.build(prot, nucl, corr_dict=id_corr,
                                alphabet=CodonAlign.default_codon_alphabet)
def generateFastaWithOutEntry(self, entryNumToRemove, resultFolder):
    """Write the alignment minus one entry to a new FASTA file.

    :param entryNumToRemove: 1-based index (anything ``int()`` accepts) of
        the entry to leave out of the first alignment found in
        ``self.pathToFasta``. An index outside the alignment removes nothing.
    :param resultFolder: directory receiving ``WithOutEntry_<n>.faa``.
    :returns: path of the written file.
    """
    from Bio import AlignIO
    from Bio.Alphabet import IUPAC, Gapped
    from Bio.Align import MultipleSeqAlignment

    fastaFile = self.pathToFasta
    entryToTest = int(entryNumToRemove)

    alignmentIterator = AlignIO.parse(
        fastaFile, "fasta",
        alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
    # The next() builtin works on Python 2.6+ and 3; the iterator's .next()
    # method only exists on Python 2.
    alignment = next(alignmentIterator)

    pathToNewFile = resultFolder + "/" + "WithOutEntry_%d.faa" % (
        entryToTest, )

    # Here we remove the desired element
    keptRecords = [record for index, record in enumerate(alignment)
                   if index != entryToTest - 1]
    newAlignment = MultipleSeqAlignment(keptRecords)
    AlignIO.write(newAlignment, pathToNewFile, "fasta")
    return pathToNewFile
def main():
    """Generate mutated sequence fragments from a FASTA file.

    Fragments are cut with ``subselect_sequence`` and altered with
    ``mutate``, using a mutation rate drawn from a Gaussian, until
    ``--num_fragments`` records exist; they are then written in FASTA
    format to ``--output``.
    """
    logging.basicConfig()

    parser = argparse.ArgumentParser()
    parser.add_argument('--fasta', dest='fasta_file', metavar='STRING',
                        required=True, type=str)
    parser.add_argument('--num_fragments', dest='num_fragments',
                        metavar='int', required=True, type=int)
    parser.add_argument('--mean_frag_size', dest='frag_size_mu',
                        metavar='int', required=True, type=int)
    parser.add_argument('--frag_size_std', dest='frag_size_sigma',
                        metavar='int', required=True, type=int)
    parser.add_argument('--mean_mutation_rate', dest='mutation_rate_mu',
                        metavar='float', required=True, type=float)
    parser.add_argument('--mutation_rate_std', dest='mutation_rate_sigma',
                        metavar='float', required=True, type=float)
    parser.add_argument('--output', dest='output_file', metavar='string',
                        required=True, type=str)
    args = parser.parse_args()

    # The context manager closes the output file even when generation fails.
    with open(args.output_file, 'w') as outhandle:
        generated_seqs = []
        for record in SeqIO.parse(args.fasta_file, 'fasta'):
            base_id = record.id
            base_seq = str(record.seq)
            # NOTE(review): the quota is global, so once the first record
            # has filled it, later records contribute nothing. A step that
            # fails on every attempt would keep this loop running forever.
            while len(generated_seqs) < args.num_fragments:
                try:
                    mutation_rate = rnd.gauss(args.mutation_rate_mu,
                                              args.mutation_rate_sigma)
                    subsequence = subselect_sequence(base_seq,
                                                     args.frag_size_mu,
                                                     args.frag_size_sigma)
                    mutated_subsequence = mutate(subsequence, mutation_rate)
                    new_id = '%s__mut_%.2f__len_%i' % (
                        base_id, mutation_rate, len(subsequence))
                    generated_seqs.append(
                        SeqRecord(Seq(mutated_subsequence,
                                      IUPAC.IUPACAmbiguousDNA()),
                                  id=new_id, name=new_id, description=''))
                except Exception as e:
                    print(e)
        SeqIO.write(generated_seqs, outhandle, 'fasta')
def get_sines(sine_fname):
    """As given in file + reverse complements.

    Generator: for every FASTA record, yields the sequence text and then
    the text of its reverse complement, printing both afterwards.
    """
    for sine_record in SeqIO.parse(sine_fname, "fasta"):
        forward = Seq(str(sine_record.seq), IUPAC.IUPACAmbiguousDNA())
        yield str(forward)
        reverse = forward.reverse_complement()
        yield str(reverse)
        print(forward, reverse, '''\n ======================''')
def translate(self, align, offset):
    """
    Translate the alignment according to the selected frame which is set
    according to 'offset' value.

    Gaps ("-") are replaced by "N" before translation, and the sequence is
    cut to a whole number of codons starting at ``offset``.
    """
    usable_length = align.get_alignment_length() - offset
    end = (usable_length // 3) * 3 + offset

    t_align = MulAlign([], Gapped(IUPAC.ExtendedIUPACProtein(), "N"))
    for rec in align:
        nucleotides = str(rec.seq).upper().replace("-", "N").replace("n", "N")
        in_frame = Seq(nucleotides, IUPAC.IUPACAmbiguousDNA())[offset:end]
        t_align.append(
            SeqRecord(in_frame.translate(), name=rec.name, id=rec.id,
                      description=""))
    return t_align
def get_sine_forward(sine_fname):
    """Only in direction given in file.

    The file must hold exactly one FASTA record; unpacking fails otherwise.
    """
    parsed = SeqIO.parse(sine_fname, "fasta",
                         alphabet=IUPAC.IUPACAmbiguousDNA())
    (sine_record,) = parsed
    # TODO: If we return it as dumb string, why did we bother about the alphabet?
    # TODO: The reference SINEs do contain a couple ambiguous chars - N, Y.
    return str(sine_record.seq)
def setUp(self):
    """Build the codon alignment for TEST_ALIGN_FILE6 into ``self.aln``.

    The third fixture file supplies the id correspondence passed as
    ``corr_dict``; Biopython warnings are silenced during the build.
    """
    paths = TEST_ALIGN_FILE6[0]
    nucl = SeqIO.parse(paths[0], 'fasta',
                       alphabet=IUPAC.IUPACUnambiguousDNA())
    prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)

    # Two-column file: first field maps to second field.
    id_corr = {}
    with open(paths[2]) as handle:
        for line in handle:
            fields = line.split()
            id_corr[fields[0]] = fields[1]

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', BiopythonWarning)
        built = codonalign.build(prot, nucl, corr_dict=id_corr,
                                 alphabet=codonalign.default_codon_alphabet)
    self.aln = built
def test_reverse_complements(self):
    """Test double reverse complement preserves the sequence."""
    rna_letters = "".join(sorted(ambiguous_rna_values))
    dna_letters = "".join(sorted(ambiguous_dna_values))

    cases = [
        Seq.Seq(rna_letters),
        Seq.Seq(dna_letters),
        Seq.Seq(rna_letters, Alphabet.generic_rna),
        Seq.Seq(dna_letters, Alphabet.generic_dna),
        Seq.Seq(rna_letters.replace("X", ""), IUPAC.IUPACAmbiguousRNA()),
        Seq.Seq(dna_letters.replace("X", ""), IUPAC.IUPACAmbiguousDNA()),
        Seq.Seq("AWGAARCKG"),  # Note no U or T
    ]
    for sequence in cases:
        round_trip = sequence.reverse_complement().reverse_complement()
        self.assertEqual(str(sequence), str(round_trip))
def set_primer_seqs(self, fwd_sequence, rev_sequence):
    """Set the primer sequences.

    Stores the forward and reverse primers as a pair of ambiguous-DNA
    sequences and logs both input strings.

    Parameters
    ----------
    fwd_sequence : string
        forward primer sequence - ambiguities allowed.
    rev_sequence : string
        reverse primer sequence - ambiguities allowed.
    """
    primers = tuple(
        [Seq(text, IUPAC.IUPACAmbiguousDNA())
         for text in (fwd_sequence, rev_sequence)])

    for message in ("Setting foward primer to " + fwd_sequence,
                    "Setting reverse primer to " + rev_sequence):
        self.logger.info(message)

    self._primer_pair = primers
def setUp(self):
    """Build one codon alignment per fixture and store them in ``self.alns``.

    Each fixture is indexed as ``i[0]`` (file paths) and ``i[1]`` (how the
    nucleotide file is loaded: 'parse', 'index' or 'id').
    """
    self.aln_file = [
        TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3,
        TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6
    ]
    alns = []
    for i in self.aln_file:
        if i[1] == 'parse':
            nucl = SeqIO.parse(i[0][0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = CodonAlign.build(
                    prot, nucl, alphabet=CodonAlign.default_codon_alphabet)
        elif i[1] == 'index':
            nucl = SeqIO.index(i[0][0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = CodonAlign.build(
                    prot, nucl,
                    alphabet=CodonAlign.default_codon_alphabet,
                    max_score=20)
        elif i[1] == 'id':
            nucl = SeqIO.parse(i[0][0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
            # Read the two-column id file inside a context manager so the
            # handle is closed instead of being left to the garbage
            # collector.
            with open(i[0][2]) as handle:
                id_corr = dict((line.split()[0], line.split()[1])
                               for line in handle)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = CodonAlign.build(
                    prot, nucl, corr_dict=id_corr,
                    alphabet=CodonAlign.default_codon_alphabet)
        alns.append(caln)
    self.alns = alns
def gapCdsToProteins(proteinAlignment, extraDnaSeqs=None):
    """ to replace proteinToCodonAlignment()

    Build a DNA alignment that mirrors the gaps of ``proteinAlignment``.

    DNA for each protein id is fetched through
    ``patric_api.getSequenceOfFeatures``; entries of ``extraDnaSeqs``
    (id -> record with a ``.seq``) override or supplement it. Each protein
    gap becomes '---' and each residue consumes the next three DNA letters.

    :returns: the codon-level alignment as read by ``AlignIO.read``.
    :raises Exception: if the protein and DNA id sets differ.
    """
    protSeqDict = {}
    for seqRecord in proteinAlignment:
        protSeqDict[seqRecord.id] = seqRecord
    dnaFasta = patric_api.getSequenceOfFeatures(protSeqDict.keys(), 'dna')
    dnaSeqDict = SeqIO.to_dict(
        SeqIO.parse(StringIO(dnaFasta),
                    "fasta",
                    alphabet=IUPAC.IUPACAmbiguousDNA()))
    for seqId in protSeqDict:
        if extraDnaSeqs and seqId in extraDnaSeqs:
            dnaSeqDict[seqId] = extraDnaSeqs[seqId]
            if Debug:
                LOG.write("appending extra DNA seq %s\n" % seqId)
    if set(dnaSeqDict.keys()) != set(protSeqDict.keys()):
        raise Exception(
            "Protein and DNA sets differ:\nProteins: %s\nDNA: %s\n" %
            (", ".join(sorted(protSeqDict)), ", ".join(sorted(dnaSeqDict))))

    dnaAlignFasta = StringIO()
    prot_align_len = proteinAlignment.get_alignment_length()
    for seqId in dnaSeqDict:
        dnaSeq = dnaSeqDict[seqId].seq
        orig_dna_len = len(dnaSeq)
        if orig_dna_len < 3 * prot_align_len:
            # this is to handle cases where protein exists but DNA does not
            # (or is too short): pad with gaps up to three letters per
            # alignment column so that every codon slice below is complete.
            # The previous padding length was only right for empty DNA.
            dnaSeq += '-' * (3 * prot_align_len - orig_dna_len)
        protSeq = protSeqDict[seqId].seq
        dnaAlignFasta.write(">" + seqId + "\n")
        dnaSeqPos = 0
        for residue in protSeq:
            if residue == '-':
                codon = '---'
            else:
                # TODO: in future use a codon table to check correct matching
                codon = str(dnaSeq[dnaSeqPos:dnaSeqPos + 3])
                dnaSeqPos += 3
            dnaAlignFasta.write(codon)
        # Number of protein columns written; should be prot_align_len.
        protPos = len(protSeq)
        if Debug:
            LOG.write(
                seqId +
                " protPos={0}, dnaSeqPos={1}, orig_DNA_len={2}, orig_prot_len={3}\n"
                .format(protPos, dnaSeqPos, orig_dna_len, len(protSeq)))
        if protPos < prot_align_len:
            dnaAlignFasta.write("---" * (prot_align_len - protPos))
            LOG.write(
                "padding short seq {0}, of {1} pos out to {2}, orig_DNA_len={3}, orig_prot_len={4}\n"
                .format(seqId, protPos, prot_align_len, orig_dna_len,
                        len(protSeq)))
        dnaAlignFasta.write("\n")

    dnaAlignFasta_text = dnaAlignFasta.getvalue()
    retval = AlignIO.read(StringIO(dnaAlignFasta_text), 'fasta')
    return retval
def test_translate(self):
    """Test that a dna open reading frame is translated correctly."""
    orf = 'ATGTGGAGACGGAAACATCCGAGGACATCCGGAGGAACCCGGGGAGTTCTGAGTGGTAATTAG'
    protein_alphabet = HasStopCodon(IUPAC.ExtendedIUPACProtein(), '*')
    expected_primers = Seq('MWRRKHPRTSGGTRGVLSGN*', protein_alphabet)

    result_primers = translate(orf)

    self.assertEqual(result_primers, expected_primers)
    self.assertEqual(len(result_primers), 21)
    is_seq = isinstance(result_primers, Seq)
    self.assertEqual(is_seq, True)
def cast_sequence(ungapped_sequence):
    """
    ungapped_sequence: a list with the sequence and id for all the species
    in a file.

    Returns a list with one unambiguous-DNA Seq per input record, built
    from the text of each record's ``seq``.
    """
    return [
        Seq(str(record.seq), IUPAC.IUPACUnambiguousDNA())
        for record in ungapped_sequence
    ]
def test_compute_background_1(self):
    """Check background frequencies computed with an RNA alphabet to 3 places."""
    expected = {'A': 0.1944, 'C': 0.1388, 'U': 0.5277, 'G': 0.1388}
    target = ms.compute_background(self.fastas,
                                   IUPAC.IUPACUnambiguousRNA(),
                                   verbose=False)
    for base in expected:
        self.assertAlmostEqual(target[base], expected[base], 3)