Esempio n. 1
0
 def testRnaRevCom(self):
     """
     Check rnaRevCom() on a mixed-case RNA string holding IUPAC codes.

     The record returned by rnaRevCom() is compared, through cmpSequences(),
     with a hand-written expected record: first for a sequence built without
     quality, then for a sequence built with a quality string (expected to
     come back reversed, like the nucleotides).
     """
     # Sequence built from an ID and a string only
     observed = Sequence("id1", "AuMGCUN").rnaRevCom()
     self.assertTrue(
         cmpSequences(observed, Sequence("id1", "NAGCKaU"))
     )
     # Sequence built with an empty description and a quality string
     observed = Sequence("id1", "AuMGCUN", "", "18AAEGH").rnaRevCom()
     self.assertTrue(
         cmpSequences(observed, Sequence("id1", "NAGCKaU", "", "HGEAA81"))
     )
Esempio n. 2
0
    def setUp(self):
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())

        # Temporary files
        self.tmp_multi_line = os.path.join(tmp_folder,
                                           unique_id + "_multi.fasta")
        self.tmp_multi_line_gz = os.path.join(tmp_folder,
                                              unique_id + "_multi.fasta.gz")
        self.tmp_mono_line = os.path.join(tmp_folder,
                                          unique_id + "_mono.fasta")
        self.tmp_out = os.path.join(tmp_folder, unique_id + "_out.fasta")

        # Expected records
        self.expected_rec = [
            Sequence(
                "seq1",
                "ATAGATAGCATCCCCCCNATACATAGATAGCATCCCCCCNATACATAGATAGCATCCCCCCNATAC",
                "test description"),
            Sequence("seq2", ""),
            Sequence("seq3", "ATGAAAAAAAAAAAAANTGATGAAAAAAAAAAAAANTG",
                     "test description 2"),
            Sequence("seq4", "", "trimmed")
        ]

        # Create multi line
        content = """>seq1 test description
ATAGATAGCATCCCCCCNATAC
ATAGATAGCATCCCCCCNATAC
ATAGATAGCATCCCCCCNATAC
>seq2

>seq3 test description 2
ATGAAAAAAAAAAAAANTG
ATGAAAAAAAAAAAAANTG
>seq4 trimmed

"""
        with open(self.tmp_multi_line, "w") as FH_out:
            FH_out.write(content)

        # Create mono line
        content = """>seq1 test description
ATAGATAGCATCCCCCCNATACATAGATAGCATCCCCCCNATACATAGATAGCATCCCCCCNATAC
>seq2

>seq3 test description 2
ATGAAAAAAAAAAAAANTGATGAAAAAAAAAAAAANTG
>seq4 trimmed

"""
        with open(self.tmp_mono_line, "w") as FH_out:
            FH_out.write(content)
        with gzip.open(self.tmp_multi_line_gz, "wt") as FH_out:
            FH_out.write(content)
Esempio n. 3
0
    def nextSeq(self):
        """
        Return the next sequence.

        A record is read as four consecutive lines: header, sequence,
        separator and quality. The first character of the header is dropped
        and the remainder is split on the first whitespace run into the ID and
        the optional description.

        :return: The next sequence or None if it is the end of file.
        :rtype: anacore.sequence.Sequence
        :raises IOError: If the record cannot be parsed. The original error is chained as the cause.
        """
        seq_record = None
        try:
            # A readline() that does not move the cursor means that the end of
            # file is reached (a blank line still moves it by its line break).
            prev_file_pos = self.file_handle.tell()
            header = self.file_handle.readline().strip()
            new_file_pos = self.file_handle.tell()
            if prev_file_pos != new_file_pos:
                # Header
                fields = header[1:].split(None, 1)
                seq_id = fields[0]
                seq_desc = fields[1] if len(fields) == 2 else None
                self.current_line_nb += 1
                # Sequence
                seq_str = self.file_handle.readline().strip()
                self.current_line_nb += 1
                # Separator (its content is ignored)
                self.file_handle.readline()
                self.current_line_nb += 1
                # Quality
                seq_qual = self.file_handle.readline().strip()
                self.current_line_nb += 1
                # Record
                seq_record = Sequence(seq_id, seq_str, seq_desc, seq_qual)
        except Exception as error:
            # Keep the root cause attached to ease the debugging of bad files
            raise IOError('The line {} in "{}" cannot be parsed by {}.'.format(
                self.current_line_nb, self.filepath,
                self.__class__.__name__)) from error
        return seq_record
Esempio n. 4
0
    def setUp(self):
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())

        # Temporary files
        self.tmp_seq = os.path.join(tmp_folder, unique_id + "_ref.fastq")
        self.tmp_seq_gz = os.path.join(tmp_folder, unique_id + "_ref.fastq.gz")
        self.tmp_out = os.path.join(tmp_folder, unique_id + "_out.fastq")

        # Expected
        self.expected_rec = [
            Sequence("seq1", "ATAGATAGCATCCCCCCNATAC", "test description",
                     "@?>=<;:9876543210##-,+"),
            Sequence("seq2", "", None, ""),
            Sequence("seq3", "ATGAAAAAAAAAAAAANTG", "test description 2",
                     "@?>=<;:987654321#/."),
            Sequence("seq4", "", "trimmed", "")
        ]

        # Create file
        content = """@seq1 test description
ATAGATAGCATCCCCCCNATAC
+
@?>=<;:9876543210##-,+
@seq2

+

@seq3 test description 2
ATGAAAAAAAAAAAAANTG
+
@?>=<;:987654321#/.
@seq4 trimmed

+

"""
        with open(self.tmp_seq, "w") as FH_out:
            FH_out.write(content)
        with gzip.open(self.tmp_seq_gz, "wt") as FH_out:
            FH_out.write(content)
Esempio n. 5
0
    def setUp(self):
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())

        # Temporary files
        self.tmp_fasta_idx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
        self.tmp_fasta = os.path.join(tmp_folder, unique_id + ".fasta")

        # Expected
        self.expected_rec = {
            "one":
            Sequence(
                "one",
                "ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCAT"
            ),
            "two":
            Sequence("two", "ATGCATGCATGCATGCATGCATGCATGC",
                     "another chromosome")
        }

        # Create sequence file
        content_fasta = """>one
ATGCATGCATGCATGCATGCATGCATGCAT
GCATGCATGCATGCATGCATGCATGCATGC
ATGCAT
>two another chromosome
ATGCATGCATGCAT
GCATGCATGCATGC"""
        with open(self.tmp_fasta, "w") as FH_out:
            FH_out.write(content_fasta)

        # Create index
        content_fasta_idx = """one	66	5	30	31
two	28	98	14	15"""
        with open(self.tmp_fasta_idx, "w") as FH_out:
            FH_out.write(content_fasta_idx)
Esempio n. 6
0
    def get(self, id):
        """
        Return the sequence from file.

        The cached record is returned as is when its ID matches. Otherwise the
        string is fetched with getSub(id, 1, None) and wrapped in a new
        Sequence built from the ID and the string only; this record replaces
        the cached one when use_cache is enabled.

        :param id: The sequence ID.
        :type id: str
        :return: The sequence selected from the file.
        :rtype: anacore.sequence.Sequence
        """
        in_cache = self.cached
        if in_cache is not None and in_cache.id == id:
            # Cache hit: nothing to read
            return in_cache
        # Cache miss: read from the file
        seq_str = self.getSub(id, 1, None)
        record = Sequence(id, seq_str)
        if self.use_cache:
            self.cached = record
        return record
Esempio n. 7
0
    def nextSeq(self):
        """
        Return the next sequence.

        The header of the record to return has been read by the previous call
        (or is read here for the first record). The lines are then consumed up
        to the next header, which is kept in self._next_id for the following
        call, or up to the end of file.

        :return: The next sequence or None if it is the end of file (an empty file included).
        :rtype: anacore.sequence.Sequence
        :raises IOError: If the record cannot be parsed. The original error is chained as the cause.
        """
        seq_record = None
        if not self._end_of_file:
            line = ""
            try:
                # First line in file
                if self.current_line_nb == 1:
                    first_line = self.file_handle.readline()
                    if not first_line:
                        # Empty file: there is no record to return and this is
                        # not a parsing error
                        self._end_of_file = True
                        return None
                    self._next_id = first_line.strip()
                    self.current_line_nb += 1
                # Sequence: the chunks are joined once at the end rather than
                # concatenated line after line
                seq_chunks = []
                while not line.startswith('>'):
                    seq_chunks.append(line.strip())
                    line = self.file_handle.readline()
                    if not line:
                        line = None
                        self._end_of_file = True
                        break
                    self.current_line_nb += 1
                seq_str = "".join(seq_chunks)
                # Header: ID and optional description, without the first character
                fields = self._next_id[1:].split(None, 1)
                seq_id = fields[0]
                seq_desc = fields[1].strip() if len(fields) == 2 else None
                seq_record = Sequence(seq_id, seq_str, seq_desc)
                self._next_id = line  # next seq_id
            except Exception as error:
                # Keep the root cause attached to ease the debugging of bad files
                raise IOError(
                    'The line {} in "{}" cannot be parsed by {}.\ncontent: {}'.
                    format(self.current_line_nb, self.filepath,
                           self.__class__.__name__, line)) from error
        return seq_record