def test_get_representatives(self):
    """get_representatives should yield the representatives as Sequences."""
    # NOTE(review): the observed value below is joined with "\n"; confirm
    # that the line breaks inside this expected literal match that output.
    expected_fasta = """>1: 5 ABABABA >3: 1 BABA >4: 1 ABABAA >8: 2 BABBA"""

    labelled_seqs = self.data.iteritems
    prefix_map = self.mapping
    representatives = list(get_representatives(prefix_map, labelled_seqs()))
    observed_fasta = "\n".join(rep.toFasta() for rep in representatives)
    self.assertEqual(observed_fasta, expected_fasta)

    # second scenario: 'a' is not a key of the mapping, so only the
    # sequences labelled '1' and '2' are expected back
    prefix_map = {'1': ('a', 'b', 'c'),
                  '2': ('d', 'e', 'f')}
    labelled_seqs = [('1', "ACGT"), ('2', "TAGC"), ('a', "TTTTT")]

    observed = list(get_representatives(prefix_map, labelled_seqs))
    expected = [Sequence(name=">1", seq="ACGT"),
                Sequence(name='2', seq="TAGC")]
    self.assertEqual(observed, expected)
def test_hasTerminalStop(self):
    """test check for terminal stop codons"""
    # sequence whose last codon is TAA: a terminal stop must be reported
    seq = Sequence(DNA, seq='ACTTAA')
    self.assertTrue(seq.hasTerminalStop())

    # sequence whose last codon is TAT: no terminal stop must be reported.
    # (Previously the comparison result was assigned to `seq` and never
    # asserted, so this case was not actually checked.)
    seq = Sequence(DNA, seq='ACTTAT')
    self.assertFalse(seq.hasTerminalStop())

    try:
        # only sequences with length divisible by 3 should work
        seq = Sequence(DNA, seq='ACTTA')
        seq.hasTerminalStop()
    except AssertionError:
        # an AssertionError is tolerated here but, as written, not required
        pass
def test_reversecomplement(self):
    """reversecomplement should reverse and complement a sequence"""
    cases = (
        # (input, expected reverse complement)
        ('ACTGTAA', 'TTACAGT'),
        # a gap character is kept in the mirrored position
        ('ACTG-TAA', 'TTA-CAGT'),
        # try ambiguities
        ('ACHNRTAA', 'TTAYNDGT'),
    )
    for forward, expected in cases:
        revcomp = Sequence(DNA, seq=forward).reversecomplement()
        self.assertEqual(str(revcomp), expected)
def test_hasTerminalStop(self):
    """test check for terminal stop codons"""
    # sequence whose last codon is TAA: a terminal stop must be reported
    seq = Sequence(DNA, seq='ACTTAA')
    self.assertTrue(seq.hasTerminalStop())

    # sequence whose last codon is TAT: no terminal stop must be reported.
    # (Previously the comparison result was assigned to `seq` and never
    # asserted, so this case was not actually checked.)
    seq = Sequence(DNA, seq='ACTTAT')
    self.assertFalse(seq.hasTerminalStop())

    # for sequence not divisible by 3
    seq = Sequence(DNA, seq='ACTTA')
    # fail
    self.assertRaises(ValueError, seq.hasTerminalStop)
    # unless explicitly over-ride length issue using allow_partial
    # in which case, returns False
    self.assertFalse(seq.hasTerminalStop(allow_partial=True))
def test_withoutTerminalStopCodon(self):
    """withoutTerminalStopCodon should drop a trailing stop codon"""
    # standard code: the trailing TAA is removed
    complete = Sequence(DNA, seq='ACTTAA')
    self.assertEqual(str(complete.withoutTerminalStopCodon()), "ACT")

    # a length that is not a multiple of 3 is rejected by default
    partial = Sequence(DNA, seq='ACTTA')
    self.assertRaises(ValueError, partial.withoutTerminalStopCodon)

    # allow_partial=True over-rides the length check; this call is only
    # verified not to raise, its result is not inspected
    partial.withoutTerminalStopCodon(allow_partial=True)
def write_Fasta_from_name_seq_pairs(name_seqs, fh):
    """writes a list of (name,seqs) to filehandle.

    name_seqs: (name,seqs) pair such as from MinimalFASTAParser
    fh: an open filehandle

    Raises ValueError when fh is None.
    """
    # identity test: `== None` can be fooled by objects overriding __eq__
    if fh is None:
        # call-style raise is valid on both Python 2 and Python 3
        raise ValueError("Need open file handle to write to.")

    for (name, seq) in name_seqs:
        # one FASTA record per pair, each terminated by a newline
        fh.write("%s\n" % Sequence(name=name, seq=seq).toFasta())
def write_Fasta_from_name_seq_pairs(name_seqs, fh):
    """Write every (name, seq) pair to fh as a FASTA record.

    name_seqs: (name,seqs) pairs such as from parse_fasta
    fh: an open filehandle

    Raises ValueError when fh is None.
    """
    if fh is None:
        raise ValueError("Need open file handle to write to.")

    for pair in name_seqs:
        label, residues = pair
        record = Sequence(name=label, seq=residues)
        # one record per pair, each terminated by a newline
        fh.write("%s\n" % record.toFasta())
def get_representatives(mapping, seqs):
    """Returns representative seqs.

    mapping: The prefix mapping dict, keyed by sequence label
    seqs: iterable of (label, seq) pairs

    Yields one Sequence per pair whose label is a key of mapping; pairs
    with any other label are skipped. Each yielded Sequence is named
    "<label>: <n>" where n is len(mapping[label]) + 1 (presumably the
    mapped members plus the representative itself).
    """
    for (label, seq) in seqs:
        # `in` works on Python 2 and 3; dict.has_key is Python 2 only
        if label in mapping:
            cluster_size = len(mapping[label]) + 1
            yield Sequence(name="%s: %d" % (label, cluster_size), seq=seq)
def degap_fasta_aln(seqs):
    """Yield the degapped version of each sequence of a Fasta alignment.

    seqs: list of label,seq pairs
    """
    for pair in seqs:
        label, aligned = pair
        gapped = Sequence(moltype=DNA_with_more_gaps, seq=aligned,
                          name=label)
        ungapped = gapped.degap()
        # put the original label back on the degapped sequence
        ungapped.Name = label
        yield ungapped
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                     otu_picker_otu_map_fh, out_dir):
    """Combine denoiser and OTU picker mapping file, replace flowgram IDs.

    fasta_fh: a fasta file with labels as produced by Qiime's
              split_libraries.py, used to replace flowgram id with the
              unique se_sample_id
    mapping_fh: The cluster mapping from the denoiser.py
    denoised_seqs_fh: the Fasta output files from denoiser.py
    otu_picker_otu_map_fh: cluster map from otu picker on denoised_seqs_fh
    out_dir: output directory

    Writes two files into out_dir:
      denoised_otu_map.txt  the combined otu_picker + denoiser map
      denoised_all.fasta    the denoised seqs, relabelled "<sample_id> <id>"

    Prints a message and exits if an id of the denoiser output has no
    entry in the split_libraries FASTA.
    """
    # read in mapping from split_library file
    labels = (record[0] for record in MinimalFastaParser(fasta_fh))
    # mapping from seq_id to sample_id
    sample_id_mapping = extract_read_to_sample_mapping(labels)
    # bound once; it does not change between OTU lines
    get_sample_id = sample_id_mapping.get

    denoiser_mapping = read_denoiser_mapping(mapping_fh)

    # read in cd_hit otu map
    # and write out combined otu_picker+denoiser map
    with open(out_dir + "/denoised_otu_map.txt", "w") as otu_fh:
        for otu_line in otu_picker_otu_map_fh:
            otu_split = otu_line.split()
            otu = otu_split[0]
            ids = otu_split[1:]

            # concat lists
            # make sure the biggest one is first for pick_repr
            all_ids = sort_ids(ids, denoiser_mapping)
            # collect the members first, then extend, so the result does
            # not depend on whether all_ids aliases one of the mapped lists
            members = [member
                       for seq_id in ids
                       for member in denoiser_mapping[seq_id]]
            all_ids.extend(members)

            try:
                sample_ids = "\t".join(map(get_sample_id, all_ids))
            except TypeError:
                # get returns None if denoiser_mapping id not present in
                # sample_id_mapping, and join rejects None
                print("Found id in denoiser output, which was not found in split_libraries " +
                      "output FASTA file. Wrong file?")
                exit()
            otu_fh.write("%s\t" % otu + sample_ids + "\n")

    with open(out_dir + "/denoised_all.fasta", "w") as fasta_out_fh:
        for label, seq in MinimalFastaParser(denoised_seqs_fh):
            seq_id = label.split()[0]
            newlabel = "%s %s" % (sample_id_mapping[seq_id], seq_id)
            fasta_out_fh.write(
                Sequence(name=newlabel, seq=seq).toFasta() + "\n")
def ipynast_seqs(candidate_sequences, template_alignment,
                 max_hits=30, min_pct=75.0, min_len=1000,
                 align_unaligned_seqs_f=None, log_fp=None, logger=None,
                 temp_dir=get_pynast_temp_dir(), **kwargs):
    """Iterator that yields results of pynast on candidate_sequences

    This function yields the sequence and exit status of the alignment
    step, as (sequence, exit status) tuples.

    Status values can be:
       0 : indicates a successful alignment, in which case the sequence
            will be aligned
       1 : indicates unsuccessful sequence search, in which case the
            sequence will be unaligned
       2 : indicates alignment did not meet minimum requirements, in which
            case the sequence will be unaligned

    All sequences are returned as DNA sequence objects.

    candidate_sequences
        an iterable object (e.g., a list) containing tuples of
        (seq_id, sequence) pairs (e.g., as returned by MinimalFastaParser)
        or a fasta filepath
    template_alignment
        a PyCogent alignment object containing the template alignment
        or a fasta filepath
    max_hits
        Maximum number of uclust hits to return (accepted for interface
        compatibility; not referenced by this function's body)
    min_pct
        minimum % identity for best database match
    min_len
        minimum length of match for alignment
    align_unaligned_seqs_f
        Function to align sequences. Must be of the form:
         align_unaligned_seqs(seqs, moltype, params=None)
         see cogent.app.muscle_v38.align_unaligned_seqs
    log_fp
        Optional path to log file
    logger
        Optional NastLogger object, takes precedence over log_fp

    Raises ValueError if a candidate sequence contains gaps, and IOError
    if a template alignment filepath cannot be opened. The temporary
    files written here are removed when the iterator finishes, is closed
    early, or fails.
    """
    deprecation_warning(kwargs)

    files_to_remove = []
    pw_alignment_iterator = None
    try:
        # filepath provided for candidate sequences
        candidate_source_f = None
        if isinstance(candidate_sequences, str):
            candidate_source_f = open(candidate_sequences)
            candidate_sequences = MinimalFastaParser(candidate_source_f)

        # sequence list provided for candidate sequence -- write
        # the seqs to a temp file to pass to uclust. This is done in all
        # cases to convert the sequences to uppercase in case they're not
        # already. The bad handling of upper versus lower-cased sequences
        # is a uclust issue.
        # Note that delete = False here because we don't want these to
        # be deleted when they are closed (since we need to pass
        # the filepaths around after we write and close them). The files
        # are deleted explicitly at the end of this function.
        try:
            with NamedTemporaryFile(prefix='pynast_candidate',
                                    suffix='.fasta',
                                    dir=temp_dir,
                                    delete=False) as candidate_fasta_f:
                candidate_fasta_filepath = candidate_fasta_f.name
                # registered immediately so a failed write is cleaned up
                files_to_remove.append(candidate_fasta_filepath)
                for seq_id, seq in candidate_sequences:
                    candidate_fasta_f.write(
                        '>%s\n%s\n' % (seq_id, str(seq).upper()))
        finally:
            if candidate_source_f is not None:
                candidate_source_f.close()

        # degap the template alignment for the sequence searching step and
        # write it to file. See above comment about delete=False
        with NamedTemporaryFile(prefix='pynast_template',
                                suffix='.fasta',
                                dir=temp_dir,
                                delete=False) as template_fasta_f:
            template_fasta_filepath = template_fasta_f.name
            files_to_remove.append(template_fasta_filepath)

            if isinstance(template_alignment, str):
                # the template alignment was received as a filepath
                try:
                    template_alignment_f = open(template_alignment)
                except IOError:
                    raise IOError("Cannot open specified filepath: %s"
                                  % template_alignment)
                # template alignment provided as filepath -- process it
                # iteratively to handle potentially massive
                # template_alignments
                with template_alignment_f:
                    template_alignment = {}
                    for seq_id, seq in MinimalFastaParser(
                            template_alignment_f):
                        # keep the gapped string for the spacing step below
                        template_alignment[seq_id] = seq
                        seq = Sequence(seq=seq, moltype=DNA)
                        template_fasta_f.write(
                            '>%s\n%s\n' % (seq_id, seq.degap()))
            else:
                # the template alignment was received as an alignment
                # object
                template_fasta_f.write(
                    template_alignment.degap().toFasta())

        # Set up logging. NastLogger object takes precedence over log
        # file path, if both are provided.
        if logger is None:
            if log_fp is not None:
                logger = NastLogger(log_fp)
            else:
                logger = NastLogger()

        # min_pct is given as a percentage; the search takes a fraction
        min_pct /= 100.

        # get the alignment iterator
        pw_alignment_iterator = uclust_search_and_align_from_fasta_filepath(
            candidate_fasta_filepath,
            template_fasta_filepath,
            percent_ID=min_pct,
            enable_rev_strand_matching=True,
            tmp_dir=temp_dir)

        # None means "no search result pending"
        current_result = next(pw_alignment_iterator, None)

        with open(candidate_fasta_filepath) as candidate_f:
            for seq_id, seq in MinimalFastaParser(candidate_f):
                if '-' in seq:
                    # the finally clause below performs the clean-up
                    raise ValueError("Candidate sequence contains gaps. "
                                     "This is not supported.")

                matched = False
                if current_result is not None:
                    (candidate_seq_id, template_seq_id,
                     pw_aligned_candidate, pw_aligned_template,
                     pct_identity) = current_result
                    # compare on the first whitespace-delimited field
                    matched = (seq_id.split()[0] ==
                               candidate_seq_id.split()[0])

                if not matched:
                    # a suitable match was not found - don't align the
                    # sequence; log the failure
                    logger.record(
                        seq_id,  # input sequence identifier
                        len(seq),  # input sequence length
                        "No search results.")
                    # yield the unaligned sequence and failure code
                    yield DNA.makeSequence(seq, Name=seq_id), 1
                    continue

                # this sequence was aligned
                if align_unaligned_seqs_f:
                    # if an alternate pairwise aligner was specified,
                    # unalign and re-align the sequences.
                    pw_aligned_template, pw_aligned_candidate = \
                        align_two_seqs(
                            pw_aligned_template.replace('-', ''),
                            pw_aligned_candidate.replace('-', ''),
                            align_unaligned_seqs_f)

                # Cast the pairwise alignments to DNA sequence objects
                pw_aligned_candidate = DNA.makeSequence(
                    pw_aligned_candidate, Name=candidate_seq_id)
                pw_aligned_template = DNA.makeSequence(
                    pw_aligned_template, Name=template_seq_id)

                # Remove any terminal gaps that were introduced into the
                # template sequence
                pw_aligned_candidate, pw_aligned_template = \
                    remove_template_terminal_gaps(
                        pw_aligned_candidate, pw_aligned_template)
                candidate_seq_id = pw_aligned_candidate.Name

                # get the aligned template sequence from the template
                # alignment
                try:
                    template_aligned_seq = \
                        template_alignment.getGappedSeq(template_seq_id)
                except AttributeError:
                    # template_alignment is the dict built from a filepath
                    template_aligned_seq = Sequence(
                        seq=template_alignment[template_seq_id],
                        moltype=DNA)

                # reintroduce the gap spacing from the template alignment
                pw_aligned_template, pw_aligned_candidate, new_gaps = \
                    reintroduce_template_spacing(
                        template_aligned_seq,
                        pw_aligned_template,
                        pw_aligned_candidate)

                # delete any new gaps that were introduced during the
                # pairwise alignment step
                pw_aligned_template, pw_aligned_candidate = \
                    adjust_alignment(
                        pw_aligned_template, pw_aligned_candidate, new_gaps)

                # reintroduce any terminal gaps that were present in the
                # template
                result = introduce_terminal_gaps(
                    template_aligned_seq,
                    pw_aligned_template,
                    pw_aligned_candidate)

                unaligned_length = len(result.degap())
                if unaligned_length < min_len:
                    # alignment is too short - log this as a failure,
                    # reporting the length that was actually tested
                    error = ("Alignment does not meet minimum length "
                             "requirement for alignment (%d < %d)"
                             % (unaligned_length, min_len))
                    logger.record(
                        seq_id,  # input sequence identifier
                        len(seq),  # input sequence length
                        error)
                    # yield the unaligned sequence and failure code
                    yield DNA.makeSequence(seq, Name=seq_id), 2
                else:
                    # log the alignment
                    logger.record(
                        seq_id,  # input sequence identifier
                        len(seq),  # input sequence length
                        '',  # Errors
                        template_seq_id,  # best template match id
                        '%3.2f' % pct_identity,  # pct id to template
                        unaligned_length,  # post alignment sequence length
                    )
                    # yield the aligned sequence and success code
                    yield DNA.makeSequence(result, Name=candidate_seq_id), 0

                # get the next alignment
                # NOTE(review): the pending result is advanced only after a
                # candidate has consumed it; confirm this nesting against
                # the upstream source.
                # Running out of results is not the end of the input fasta
                # file: later candidates are reported as having no search
                # result rather than being compared to a stale one.
                current_result = next(pw_alignment_iterator, None)
    finally:
        try:
            # closing the search iterator gives it the chance to clean up
            # any temporary files of its own
            if pw_alignment_iterator is not None:
                pw_alignment_iterator.close()
        finally:
            # clean-up temporary files if any were created
            remove_files(files_to_remove, error_on_missing=False)
def test_ambig_translate(self):
    """getTranslation should cope with ambiguous, '?' and gap codons"""
    ambiguous_dna = Sequence(DNA, 'CGNTGN???---')
    protein = ambiguous_dna.getTranslation()
    # one translated character is expected per input codon
    self.assertEqual(str(protein), 'RX?-')
def test_translate(self):
    """getTranslation should translate an unambiguous DNA sequence"""
    coding_dna = Sequence(DNA, 'ATGACGTTGCGTAGCATAGCTCGA')
    protein = coding_dna.getTranslation()
    self.assertEqual(str(protein), 'MTLRSIAR')
def setUp(self):
    """Create the DNA sequence fixture stored on self.seq."""
    fixture = Sequence(DNA, 'ATGACGTTGCGTAGCATAGCTCGA')
    self.seq = fixture
def test_withoutTerminalStopCodon(self):
    """withoutTerminalStopCodon should drop a trailing stop codon"""
    # for standard code: the trailing TAA is removed
    with_stop = Sequence(DNA, seq='ACTTAA')
    trimmed = with_stop.withoutTerminalStopCodon()
    self.assertEqual(str(trimmed), "ACT")