def testRnaRevCom(self):
    """Check rnaRevCom() on a sequence without and with a quality string."""
    # Each scenario pairs the constructor arguments of the input sequence
    # with the constructor arguments of the expected reverse complement.
    scenarios = [
        # Without quality
        (("id1", "AuMGCUN"), ("id1", "NAGCKaU")),
        # With quality: the quality string is expected to be reversed too
        (("id1", "AuMGCUN", "", "18AAEGH"), ("id1", "NAGCKaU", "", "HGEAA81")),
    ]
    for input_args, expected_args in scenarios:
        observed = Sequence(*input_args).rnaRevCom()
        expected = Sequence(*expected_args)
        self.assertTrue(cmpSequences(observed, expected))
def setUp(self):
    """
    Create the temporary FASTA fixtures and the records expected from them.

    Three input files are written: a multi-line FASTA, a gzipped copy of the
    multi-line FASTA and a mono-line FASTA. The path of an output file
    (``tmp_out``) is reserved but the file itself is not created here.
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_multi_line = os.path.join(tmp_folder, unique_id + "_multi.fasta")
    self.tmp_multi_line_gz = os.path.join(tmp_folder, unique_id + "_multi.fasta.gz")
    self.tmp_mono_line = os.path.join(tmp_folder, unique_id + "_mono.fasta")
    self.tmp_out = os.path.join(tmp_folder, unique_id + "_out.fasta")

    # Expected records
    self.expected_rec = [
        Sequence(
            "seq1",
            "ATAGATAGCATCCCCCCNATACATAGATAGCATCCCCCCNATACATAGATAGCATCCCCCCNATAC",
            "test description"),
        Sequence("seq2", ""),
        Sequence("seq3", "ATGAAAAAAAAAAAAANTGATGAAAAAAAAAAAAANTG", "test description 2"),
        Sequence("seq4", "", "trimmed")
    ]

    # NOTE(review): the line breaks of both FASTA contents are written
    # explicitly; empty records (seq2, seq4) are assumed to have no sequence
    # line at all - confirm against the intended fixture layout.

    # Multi line content: each sequence is wrapped on several lines
    multi_line_content = (
        ">seq1 test description\n"
        "ATAGATAGCATCCCCCCNATAC\n"
        "ATAGATAGCATCCCCCCNATAC\n"
        "ATAGATAGCATCCCCCCNATAC\n"
        ">seq2\n"
        ">seq3 test description 2\n"
        "ATGAAAAAAAAAAAAANTG\n"
        "ATGAAAAAAAAAAAAANTG\n"
        ">seq4 trimmed\n"
    )

    # Mono line content: each sequence is stored on a single line
    mono_line_content = (
        ">seq1 test description\n"
        "ATAGATAGCATCCCCCCNATACATAGATAGCATCCCCCCNATACATAGATAGCATCCCCCCNATAC\n"
        ">seq2\n"
        ">seq3 test description 2\n"
        "ATGAAAAAAAAAAAAANTGATGAAAAAAAAAAAAANTG\n"
        ">seq4 trimmed\n"
    )

    # Create multi line
    with open(self.tmp_multi_line, "w") as FH_out:
        FH_out.write(multi_line_content)

    # Create mono line
    with open(self.tmp_mono_line, "w") as FH_out:
        FH_out.write(mono_line_content)

    # Create gzipped multi line. The two contents have distinct names so
    # the gzipped fixture really holds the multi-line layout announced by
    # its file name (a single reused variable made it mono-line).
    with gzip.open(self.tmp_multi_line_gz, "wt") as FH_out:
        FH_out.write(multi_line_content)
def nextSeq(self):
    """
    Return the next sequence.

    A record is read as four consecutive lines: header, sequence,
    separator and quality.

    :return: The next sequence or None if it is the end of file.
    :rtype: anacore.sequence.Sequence
    :raises IOError: If the record cannot be parsed.
    """
    seq_record = None
    try:
        # readline() returns an empty string only at end of file (a blank
        # line is still "\n"). Testing it avoids tell(), which fails on
        # non-seekable handles and is costly on text streams.
        header = self.file_handle.readline()
        if header:
            # Header: the first character is skipped, the first word is
            # the ID and the remainder is the optional description
            fields = header.strip()[1:].split(None, 1)
            seq_id = fields[0]
            seq_desc = fields[1] if len(fields) == 2 else None
            self.current_line_nb += 1
            # Sequence
            seq_str = self.file_handle.readline().strip()
            self.current_line_nb += 1
            # Separator: its content is ignored
            self.file_handle.readline()
            self.current_line_nb += 1
            # Quality
            seq_qual = self.file_handle.readline().strip()
            self.current_line_nb += 1
            # Record
            seq_record = Sequence(seq_id, seq_str, seq_desc, seq_qual)
    except Exception as error:
        # Keep the original cause attached to ease debugging
        raise IOError('The line {} in "{}" cannot be parsed by {}.'.format(
            self.current_line_nb, self.filepath, self.__class__.__name__)) from error
    return seq_record
def setUp(self):
    """
    Create the temporary FASTQ fixtures and the records expected from them.

    The same content is written in a plain file and in a gzipped file. The
    path of an output file (``tmp_out``) is reserved but the file itself is
    not created here.
    """
    # Every temporary path shares the same unique prefix
    path_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))
    self.tmp_seq = path_prefix + "_ref.fastq"
    self.tmp_seq_gz = path_prefix + "_ref.fastq.gz"
    self.tmp_out = path_prefix + "_out.fastq"

    # Expected
    self.expected_rec = [
        Sequence("seq1", "ATAGATAGCATCCCCCCNATAC", "test description", "@?>=<;:9876543210##-,+"),
        Sequence("seq2", "", None, ""),
        Sequence("seq3", "ATGAAAAAAAAAAAAANTG", "test description 2", "@?>=<;:987654321#/."),
        Sequence("seq4", "", "trimmed", ""),
    ]

    # File content: four lines per record (header, sequence, separator,
    # quality). Empty records keep blank sequence and quality lines.
    # NOTE(review): the final blank quality line is assumed - confirm
    # against the intended fixture layout.
    fastq_lines = [
        "@seq1 test description",
        "ATAGATAGCATCCCCCCNATAC",
        "+",
        "@?>=<;:9876543210##-,+",
        "@seq2",
        "",
        "+",
        "",
        "@seq3 test description 2",
        "ATGAAAAAAAAAAAAANTG",
        "+",
        "@?>=<;:987654321#/.",
        "@seq4 trimmed",
        "",
        "+",
        "",
    ]
    fastq_content = "\n".join(fastq_lines) + "\n"

    # Create plain file
    with open(self.tmp_seq, "w") as FH_out:
        FH_out.write(fastq_content)

    # Create gzipped file
    with gzip.open(self.tmp_seq_gz, "wt") as FH_out:
        FH_out.write(fastq_content)
def setUp(self):
    """
    Create a temporary FASTA file, its index and the records expected from them.

    The index describes, for each sequence, its length, the offset of its
    first base in the FASTA file, the number of bases per line and the
    number of bytes per line.
    """
    # Both temporary paths share the same unique prefix
    path_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))
    self.tmp_fasta_idx = path_prefix + ".fasta.fai"
    self.tmp_fasta = path_prefix + ".fasta"

    # Expected
    self.expected_rec = {
        "one": Sequence(
            "one",
            "ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCAT"
        ),
        "two": Sequence("two", "ATGCATGCATGCAT" "GCATGCATGCATGC", "another chromosome"),
    }

    # Sequence file: no newline after the last line
    fasta_lines = [
        ">one",
        "ATGCATGCATGCATGCATGCATGCATGCAT",
        "GCATGCATGCATGCATGCATGCATGCATGC",
        "ATGCAT",
        ">two another chromosome",
        "ATGCATGCATGCAT",
        "GCATGCATGCATGC",
    ]
    with open(self.tmp_fasta, "w") as FH_out:
        FH_out.write("\n".join(fasta_lines))

    # Index: one row per sequence, no newline after the last row.
    # NOTE(review): columns are assumed to be tab-separated as in the
    # faidx format - confirm against the intended fixture.
    index_rows = [
        ("one", "66", "5", "30", "31"),
        ("two", "28", "98", "14", "15"),
    ]
    with open(self.tmp_fasta_idx, "w") as FH_out:
        FH_out.write("\n".join("\t".join(row) for row in index_rows))
def get(self, id):
    """
    Return the sequence from file.

    The last returned sequence is served from the cache when it has the
    requested ID. Otherwise the whole sequence is retrieved with getSub()
    and stored in the cache if caching is enabled.

    :param id: The sequence ID.
    :type id: str
    :return: The sequence selected from the file.
    :rtype: anacore.sequence.Sequence
    """
    cached_seq = self.cached
    # Cache hit: the requested sequence is the one kept in memory
    if cached_seq is not None and cached_seq.id == id:
        return cached_seq
    # Cache miss: retrieve the sequence from its first position to its end
    whole_seq = self.getSub(id, 1, None)
    record = Sequence(id, whole_seq)
    if self.use_cache:
        self.cached = record
    return record
def nextSeq(self):
    """
    Return the next sequence.

    The header of a record is read together with the previous record (it is
    the line that ends that record's sequence), so it is kept in
    ``_next_id`` between two calls.

    :return: The next sequence or None if it is the end of file.
    :rtype: anacore.sequence.Sequence
    :raises IOError: If the record cannot be parsed.
    """
    if self._end_of_file:
        return None
    seq_record = None
    line = ""
    try:
        # First line in file
        if self.current_line_nb == 1:
            first_line = self.file_handle.readline()
            if not first_line:
                # Empty file: there is no record to parse, so report the
                # end of file instead of failing on a missing header
                self._end_of_file = True
                return None
            self._next_id = first_line.strip()
            self.current_line_nb += 1
        # Sequence: gather lines until the next header or the end of file.
        # Chunks are joined once at the end rather than concatenated at
        # each line.
        seq_chunks = []
        while not line.startswith('>'):
            seq_chunks.append(line.strip())
            line = self.file_handle.readline()
            if not line:
                line = None
                self._end_of_file = True
                break
            self.current_line_nb += 1
        # Header: the first character is skipped, the first word is the ID
        # and the remainder is the optional description
        fields = self._next_id[1:].split(None, 1)
        seq_id = fields[0]
        seq_desc = fields[1].strip() if len(fields) == 2 else None
        seq_record = Sequence(seq_id, "".join(seq_chunks), seq_desc)
        self._next_id = line  # next seq_id
    except Exception as error:
        # Keep the original cause attached to ease debugging
        raise IOError(
            'The line {} in "{}" cannot be parsed by {}.\ncontent: {}'.
            format(self.current_line_nb, self.filepath,
                   self.__class__.__name__, line)) from error
    return seq_record