Example #1
0
class TestFastaParser(unittest.TestCase):
    """Unit tests for FastaParser entry iteration and header handling."""

    def setUp(self):
        # Fresh parser and fixture data for every test case.
        self.fasta_parser = FastaParser()
        self.example_data = ExampleData()

    def test_parse_1(self):
        # Multi-entry input: each header is paired with its
        # concatenated sequence lines.
        expected = [
            ('test_1 a random sequence', 'TTTAGAAATTACACA'),
            ('test_2 another random sequence', 'ACGAGAAATTAAATTAAATT'),
            ('test_3 another random sequence', 'TAGAGACATTGGATTTTATT'),
        ]
        handle = StringIO(self.example_data.fasta_seqs_1)
        self.assertEqual(list(self.fasta_parser.entries(handle)), expected)

    def test_parse_empty_file(self):
        # An empty file must yield no entries at all.
        handle = StringIO("")
        self.assertEqual(list(self.fasta_parser.entries(handle)), [])

    def test_single_entry_file_header(self):
        handle = StringIO(self.example_data.fasta_seqs_2)
        self.assertEqual(
            self.fasta_parser.single_entry_file_header(handle),
            "test_4 a random sequence")

    def test_header_id_1(self):
        # The ID is the part of the header before the first whitespace.
        self.assertEqual(
            self.fasta_parser.header_id("seq_10101 An important protein"),
            "seq_10101")

    def test_header_id_2(self):
        # A tab separator must be treated the same as a space.
        self.assertEqual(
            self.fasta_parser.header_id("seq_10101\tAn important protein"),
            "seq_10101")
Example #2
0
 def _ref_ids_to_file(self, ref_seq_paths):
     """Map each reference sequence ID to its file's base name.

     Reads the single-entry header of every FASTA file in
     *ref_seq_paths* and returns {header id: file base name}.
     """
     parser = FastaParser()
     id_to_file = {}
     for path in ref_seq_paths:
         with open(path) as fasta_fh:
             header = parser.single_entry_file_header(fasta_fh)
         id_to_file[parser.header_id(header)] = os.path.basename(path)
     return id_to_file
Example #3
0
 def _ref_ids_to_file(self, ref_seq_paths):
     """Translate reference sequence IDs to their source file names.

     Returns a dict {header id: base name of the containing file},
     derived from the single-entry header of each FASTA file.
     """
     parser = FastaParser()
     mapping = {}
     for seq_path in ref_seq_paths:
         with open(seq_path) as seq_fh:
             ref_id = parser.header_id(
                 parser.single_entry_file_header(seq_fh))
         mapping[ref_id] = os.path.basename(seq_path)
     return mapping
def test_fasta_parser():
    """Exercise FastaParser on multi-entry, empty and single-entry input."""
    parser = FastaParser()
    fasta_seqs_1 = """>test_1 a random sequence
TTTAG
AAATT
ACACA
>test_2 another random sequence
ACGAG
AAATT
AAATT
AAATT
>test_3 another random sequence
TAGAG
ACATT
GGATT
TTATT
"""
    fasta_seqs_2 = """>test_4 a random sequence
TTTAG
AAATT
ACACA
"""

    # Multi-entry parsing: headers paired with concatenated sequences.
    expected_entries = [
        ('test_1 a random sequence', 'TTTAGAAATTACACA'),
        ('test_2 another random sequence', 'ACGAGAAATTAAATTAAATT'),
        ('test_3 another random sequence', 'TAGAGACATTGGATTTTATT'),
    ]
    assert list(parser.entries(StringIO(fasta_seqs_1))) == expected_entries

    # An empty file yields no entries.
    assert list(parser.entries(StringIO(""))) == []

    # Header of a single-entry file is returned verbatim (without ">").
    assert (parser.single_entry_file_header(StringIO(fasta_seqs_2))
            == "test_4 a random sequence")

    # Header ID extraction: split on space and on tab.
    assert parser.header_id("seq_10101 An important protein") == "seq_10101"
    assert parser.header_id("seq_10101\tAn important protein") == "seq_10101"
 def __init__(self):
     # Parser reused by every FASTA-reading helper on this object.
     self.fasta_parser = FastaParser()
class ReadAlignerStats(object):
    """Gather statistics about a read-aligner run.

    count() combines countings derived from the BAM file of aligned
    reads with the number of entries in the FASTA file of unaligned
    reads and returns them as a nested dict.
    """

    def __init__(self):
        # Used to iterate the FASTA entries of the unaligned reads.
        self.fasta_parser = FastaParser()

    def count(self, read_alignment_result_bam_path, unaligned_reads_path):
        """Return {"stats_per_reference": ..., "stats_total": ...}."""
        self._stats = {}
        # Order matters: this call creates self._stats["stats_total"],
        # which _count_unaligned_reads() extends below.
        self._count_aligned_reads_and_alignments(
            read_alignment_result_bam_path)
        self._count_unaligned_reads(unaligned_reads_path)
        return self._stats

    def _count_unaligned_reads(self, unaligned_read_paths):
        # Every FASTA entry in the file is counted as one unaligned read.
        with open(unaligned_read_paths) as fasta_fh:
            self._stats["stats_total"][
                "no_of_unaligned_reads"] = self._count_fasta_entries(fasta_fh)

    def _count_fasta_entries(self, fasta_fh):
        # Count the entries lazily instead of materialising them in a list.
        return reduce(lambda x, y: x + 1,
                      self.fasta_parser.entries(fasta_fh), 0)

    def _count_aligned_reads_and_alignments(
            self, read_alignment_result_bam_path):
        """Fill self._stats from the alignments in the given BAM file."""
        bam = pysam.Samfile(read_alignment_result_bam_path)
        stats_per_ref = defaultdict(dict)
        no_of_hits_per_read_freq = {}
        for ref_id in bam.references:
            self._init_counting_dict(stats_per_ref, ref_id)
        for entry in bam.fetch():
            ref_id = bam.getrname(entry.tid)
            try:
                self._count_alignment(
                    entry, ref_id, stats_per_ref, no_of_hits_per_read_freq)
            except KeyError:
                # An alignment referenced a sequence that is not listed
                # in the BAM header.
                sys.stderr.write(
                    "SAM entry with unspecified reference found! Stoping\n")
                sys.exit(2)
        self._stats["stats_per_reference"] = stats_per_ref
        # The hit frequencies were counted per alignment; scale them
        # down to read level (see _calc_down_to_read).
        for ref_id, stats in stats_per_ref.items():
            stats_per_ref[ref_id][
                "no_of_hits_per_read_and_freqs"] = self._calc_down_to_read(
                stats_per_ref[ref_id]["no_of_hits_per_read_and_freqs"])
        self._stats["stats_total"] = self._sum_countings(stats_per_ref)

    def _sum_countings(self, stats_per_ref):
        """Sum the per-reference countings into a single totals dict."""
        total_stats = {}
        for ref_id, stats in stats_per_ref.items():
            for attribute, value in stats.items():
                if type(value) is int or type(value) is float:
                    total_stats.setdefault(attribute, 0)
                    total_stats[attribute] += value
                elif type(value) is dict:
                    # Frequency dicts are merged by summing per key.
                    total_stats.setdefault(attribute, {})
                    for value_int, freq in value.items():
                        total_stats[attribute].setdefault(value_int, 0)
                        total_stats[attribute][value_int] += freq
        return total_stats

    def _calc_down_to_read(self, no_of_hits_per_read_freq):
        """As the frequencies were determined via the alignments we need
        to normalize each frequency value down to the read by
        dividing the frequency by the number of hits per read.
        """
        return dict((no_of_hits_per_read, freq/no_of_hits_per_read)
                    for no_of_hits_per_read, freq in
                    no_of_hits_per_read_freq.items())

    def _init_counting_dict(self, stats_per_ref, ref_id):
        # The bare lookups below materialise the keys in the defaultdict
        # so that every reference reports 0.0 even if it never receives
        # an alignment.
        stats_per_ref[ref_id] = defaultdict(float)
        stats_per_ref[ref_id]["no_of_alignments"]
        stats_per_ref[ref_id]["no_of_aligned_reads"]
        stats_per_ref[ref_id]["no_of_split_alignments"]
        stats_per_ref[ref_id]["no_of_uniquely_aligned_reads"]
        stats_per_ref[ref_id][
            "alignment_length_and_freqs"] = defaultdict(int)
        stats_per_ref[ref_id][
            "no_of_hits_per_read_and_freqs"] = defaultdict(int)

    def _count_alignment(self, entry, ref_id, stats_per_ref,
                         no_of_hits_per_read_freq):
        """Update the counting dicts for a single BAM entry.

        Raises KeyError if ref_id has no counting dict (handled by the
        caller).
        """
        entry_tags_dict = dict(entry.tags)
        # "NH" is read as the number of hits of this read; a present
        # "XL" tag marks a split read, and each part then contributes
        # only the fraction 1/no_of_splits to the countings.
        no_of_hits = entry_tags_dict["NH"]
        # Consider split reads
        no_of_splits = float(entry_tags_dict.get("XL", 1))
        stats_per_ref[ref_id]["no_of_hits_per_read_and_freqs"][
            no_of_hits] += 1
        if "XL" in entry_tags_dict:
            stats_per_ref[ref_id]["no_of_split_alignments"] += 1.0/no_of_splits
        stats_per_ref[ref_id]["no_of_alignments"] += 1.0/no_of_splits
        stats_per_ref[
            ref_id]["no_of_aligned_reads"] += 1.0/(
            float(no_of_hits) * no_of_splits)
        if no_of_hits == 1:
            stats_per_ref[ref_id][
                "no_of_uniquely_aligned_reads"] += 1.0/no_of_splits
        stats_per_ref[ref_id][
            "alignment_length_and_freqs"][entry.alen] += 1
Example #7
0
 def setUp(self):
     # Fresh parser and fixture data for every test case.
     self.fasta_parser = FastaParser()
     self.example_data = ExampleData()
Example #8
0
 def setUp(self):
     # Fresh parser and fixture data for every test case.
     self.fasta_parser = FastaParser()
     self.example_data = ExampleData()
Example #9
0
 def __init__(self):
     # Parser reused by every FASTA-reading helper on this object.
     self.fasta_parser = FastaParser()
Example #10
0
class ReadAlignerStats(object):
    def __init__(self):
        self.fasta_parser = FastaParser()

    def count(self, read_alignment_result_bam_path, unaligned_reads_path):
        self._stats = {}
        if unaligned_reads_path == "NA":
            self._count_aligned_reads_and_alignments(
                read_alignment_result_bam_path)
        else:
            self._count_aligned_reads_and_alignments(
                read_alignment_result_bam_path)
            self._count_unaligned_reads(unaligned_reads_path)
        return self._stats

    def _count_unaligned_reads(self, unaligned_read_paths):
        if os.path.isfile(unaligned_read_paths):
            with open(unaligned_read_paths) as fasta_fh:
                self._stats["stats_total"][
                    "no_of_unaligned_reads"] = self._count_fasta_entries(
                        fasta_fh)

    def _count_fasta_entries(self, fasta_fh):
        return reduce(lambda x, y: x + 1, self.fasta_parser.entries(fasta_fh),
                      0)

    def _count_aligned_reads_and_alignments(self,
                                            read_alignment_result_bam_path):
        bam = pysam.Samfile(read_alignment_result_bam_path)
        stats_per_ref = defaultdict(dict)
        no_of_hits_per_read_freq = {}
        for ref_id in bam.references:
            self._init_counting_dict(stats_per_ref, ref_id)
        for entry in bam.fetch():
            ref_id = bam.get_reference_name(entry.tid)
            try:
                self._count_alignment(entry, ref_id, stats_per_ref,
                                      no_of_hits_per_read_freq)
            except KeyError:
                sys.stderr.write(
                    "SAM entry with unspecified reference found! Stoping\n")
                sys.exit(2)
        self._stats["stats_per_reference"] = stats_per_ref
        for ref_id, stats in stats_per_ref.items():
            stats_per_ref[ref_id][
                "no_of_hits_per_read_and_freqs"] = self._calc_down_to_read(
                    stats_per_ref[ref_id]["no_of_hits_per_read_and_freqs"])
        self._stats["stats_total"] = self._sum_countings(stats_per_ref)

    def _sum_countings(self, stats_per_ref):
        total_stats = {}
        for ref_id, stats in stats_per_ref.items():
            for attribute, value in stats.items():
                if type(value) is int or type(value) is float:
                    total_stats.setdefault(attribute, 0)
                    total_stats[attribute] += value
                elif type(value) is dict:
                    total_stats.setdefault(attribute, {})
                    for value_int, freq in value.items():
                        total_stats[attribute].setdefault(value_int, 0)
                        total_stats[attribute][value_int] += freq
        return total_stats

    def _calc_down_to_read(self, no_of_hits_per_read_freq):
        """As the frequencies were determined via the alignments we need
        to normalized each frequency value down to the read by
        dividing the frequencig by the number of hits per read.
        """
        return dict(
            (no_of_hits_per_read, freq / no_of_hits_per_read)
            for no_of_hits_per_read, freq in no_of_hits_per_read_freq.items())

    def _init_counting_dict(self, stats_per_ref, ref_id):
        stats_per_ref[ref_id] = defaultdict(float)
        stats_per_ref[ref_id]["no_of_alignments"]
        stats_per_ref[ref_id]["no_of_aligned_reads"]
        stats_per_ref[ref_id]["no_of_split_alignments"]
        stats_per_ref[ref_id]["no_of_uniquely_aligned_reads"]
        stats_per_ref[ref_id]["alignment_length_and_freqs"] = defaultdict(int)
        stats_per_ref[ref_id]["no_of_hits_per_read_and_freqs"] = defaultdict(
            int)

    def _count_alignment(self, entry, ref_id, stats_per_ref,
                         no_of_hits_per_read_freq):
        entry_tags_dict = dict(entry.tags)
        no_of_hits = entry_tags_dict["NH"]
        # Consider split reads
        no_of_splits = float(entry_tags_dict.get("XL", 1))
        stats_per_ref[ref_id]["no_of_hits_per_read_and_freqs"][no_of_hits] += 1
        if "XL" in entry_tags_dict:
            stats_per_ref[ref_id][
                "no_of_split_alignments"] += 1.0 / no_of_splits
        stats_per_ref[ref_id]["no_of_alignments"] += 1.0 / no_of_splits
        stats_per_ref[ref_id]["no_of_aligned_reads"] += 1.0 / (
            float(no_of_hits) * no_of_splits)
        if no_of_hits == 1:
            stats_per_ref[ref_id][
                "no_of_uniquely_aligned_reads"] += 1.0 / no_of_splits
        stats_per_ref[ref_id]["alignment_length_and_freqs"][
            entry.query_length] += 1