Example #1
0
    def test_nucleotide_search_unaligned_reads_read_count_unaligned_minimize_memory_use(
            self):
        """
        Test nucleotide.unaligned_reads with a bowtie2/sam output file
        Test the reads store is created with minimize memory use
        Test the number of reads stored matches the expected unaligned total
        """

        # stores handed to the function under test
        hit_store = store.Alignments()
        read_store = store.Reads(minimize_memory_use=True)

        # process the sam file; the call returns the two files it wrote
        fasta_path, aligned_path = nucleotide.unaligned_reads(
            cfg.sam_file_unaligned_reads,
            hit_store,
            read_store,
            keep_sam=True)

        # the output files are not needed for this check
        for temp_path in (fasta_path, aligned_path):
            utils.remove_temp_file(temp_path)

        # compare the stored read count with the expected unaligned total
        observed_count = read_store.count_reads()
        self.assertEqual(observed_count,
                         cfg.sam_file_unaligned_reads_total_unaligned)
Example #2
0
    def test_nucleotide_search_unaligned_reads_output_fasta_format(self):
        """
        Test the unaligned reads and the store alignments
        Test with a bowtie2/sam output file
        Test output file is of fasta format
        Test sam file is not removed
        """

        # create a set of alignments
        alignments = store.Alignments()
        unaligned_reads_store = store.Reads()

        # read in the aligned and unaligned reads
        [unaligned_reads_file_fasta, reduced_aligned_reads_file
         ] = nucleotide.unaligned_reads(cfg.sam_file_unaligned_reads,
                                        alignments,
                                        unaligned_reads_store,
                                        keep_sam=True)

        # determine the format while the output file still exists
        file_format = utilities.determine_file_format(
            unaligned_reads_file_fasta)

        # remove temp files before asserting so that a failed assertion
        # does not leave them behind
        utils.remove_temp_file(unaligned_reads_file_fasta)
        utils.remove_temp_file(reduced_aligned_reads_file)

        # check for fasta output file format
        self.assertEqual("fasta", file_format)
Example #3
0
    def test_nucleotide_search_unaligned_reads_annotations_reference(self):
        """
        Test nucleotide.unaligned_reads with the annotations bowtie2/sam file
        Test the reference annotation formats resolve to the expected gene
        Test two hits are stored for gene "UniRef50"
        """

        # stores handed to the function under test
        hit_store = store.Alignments()
        read_store = store.Reads()

        # process the sam file; the call returns the two files it wrote
        fasta_path, aligned_path = nucleotide.unaligned_reads(
            cfg.sam_file_annotations,
            hit_store,
            read_store,
            keep_sam=True)

        # the output files are not needed for this check
        for temp_path in (fasta_path, aligned_path):
            utils.remove_temp_file(temp_path)

        # count the hits recorded for gene "UniRef50"
        uniref50_hits = hit_store.hits_for_gene("UniRef50")
        self.assertEqual(len(uniref50_hits), 2)
Example #4
0
    def test_nucleotide_search_unaligned_reads_output_blast_format(self):
        """
        Test nucleotide.unaligned_reads with the annotations bowtie2/sam file
        Test the reduced aligned reads file is reported as blastm8 format
        """

        # stores handed to the function under test
        hit_store = store.Alignments()
        read_store = store.Reads()

        # NOTE(review): the basename is set here and not restored afterwards;
        # confirm whether other tests depend on this value
        config.file_basename = "TEST"

        # process the sam file; the call returns the two files it wrote
        fasta_path, aligned_path = nucleotide.unaligned_reads(
            cfg.sam_file_annotations,
            hit_store,
            read_store,
            keep_sam=True)

        # inspect the aligned reads file before it is deleted
        detected_format = utilities.determine_file_format(aligned_path)

        # clean up the output files prior to asserting
        for temp_path in (fasta_path, aligned_path):
            utils.remove_temp_file(temp_path)

        self.assertEqual(detected_format, "blastm8")
Example #5
0
    def test_nucleotide_search_unaligned_reads_scores(self):
        """
        Test the unaligned reads and the store alignments
        Test with a bowtie2/sam output file
        Test the scores are based on percent identities
        """

        # create a set of alignments
        alignments = store.Alignments()
        unaligned_reads_store = store.Reads()

        # read in the aligned and unaligned reads
        [unaligned_reads_file_fasta, reduced_aligned_reads_file
         ] = nucleotide.unaligned_reads(cfg.sam_file_annotations,
                                        alignments,
                                        unaligned_reads_store,
                                        keep_sam=True)

        # remove temp files
        utils.remove_temp_file(unaligned_reads_file_fasta)
        utils.remove_temp_file(reduced_aligned_reads_file)

        # there should be 4 hits identified; asserting this keeps the
        # per-hit score check below from passing on an empty hit list
        all_hits = alignments.get_hit_list()
        self.assertEqual(len(all_hits), 4)

        # every hit is expected to carry the same score: 151 raised to
        # the configured match power
        expected_score = math.pow(151.0, config.match_power)

        # each hit is a five item sequence; only the score is checked here
        for _query, _bug, _reference, score, _length in all_hits:
            self.assertEqual(score, expected_score)
Example #6
0
    def test_nucleotide_search_unaligned_reads_read_count_aligned(self):
        """
        Test nucleotide.unaligned_reads with a bowtie2/sam output file
        Test the number of stored hits matches the expected aligned total
        """

        # stores handed to the function under test
        hit_store = store.Alignments()
        read_store = store.Reads()

        # process the sam file; the call returns the two files it wrote
        fasta_path, aligned_path = nucleotide.unaligned_reads(
            cfg.sam_file_unaligned_reads,
            hit_store,
            read_store,
            keep_sam=True)

        # the output files are not needed for this check
        for temp_path in (fasta_path, aligned_path):
            utils.remove_temp_file(temp_path)

        # compare the number of stored hits with the expected aligned total
        aligned_total = len(hit_store.get_hit_list())
        self.assertEqual(aligned_total,
                         cfg.sam_file_unaligned_reads_total_aligned)
    def test_translated_search_unaligned_reads_annotations_bug(self):
        """
        Test the unaligned reads and the store alignments
        Test with a rapsearch2 output file
        Test the different annotation formats are recognized for bug
        Test without the coverage filter
        """

        # create a set of alignments
        alignments = store.Alignments()
        unaligned_reads_store = store.Reads()

        # set the coverage threshold to zero so as to not test with filter on
        current_coverage_threshold = config.translated_subject_coverage_threshold
        config.translated_subject_coverage_threshold = 0

        try:
            # load the rapsearch2 output with the unaligned reads function
            unaligned_file_fasta = translated.unaligned_reads(
                unaligned_reads_store, cfg.rapsearch_file_annotations,
                alignments)

            # remove temp file
            utils.remove_temp_file(unaligned_file_fasta)
        finally:
            # reset the coverage threshold even if the call above raised,
            # so the modified setting does not leak into other tests
            config.translated_subject_coverage_threshold = current_coverage_threshold

        # there should be one bug name and the other should be unclassified
        self.assertEqual(
            sorted(alignments.bug_list()),
            sorted(["s__Bacteroides_xylanisolvens", "unclassified"]))
    def test_translated_search_unaligned_reads_annotations_reference(self):
        """
        Test the unaligned reads and the store alignments
        Test with a rapsearch2 output file
        Test the different annotation formats are recognized for reference
        Test without the coverage filter
        """

        # create a set of alignments
        alignments = store.Alignments()
        unaligned_reads_store = store.Reads()

        # set the coverage threshold to zero so as to not test with filter on
        current_coverage_threshold = config.translated_subject_coverage_threshold
        config.translated_subject_coverage_threshold = 0

        try:
            # load the rapsearch2 output with the unaligned reads function
            unaligned_file_fasta = translated.unaligned_reads(
                unaligned_reads_store, cfg.rapsearch_file_annotations,
                alignments)

            # remove temp file
            utils.remove_temp_file(unaligned_file_fasta)
        finally:
            # reset the coverage threshold even if the call above raised,
            # so the modified setting does not leak into other tests
            config.translated_subject_coverage_threshold = current_coverage_threshold

        # three of the hits should be for gene "UniRef50"
        hits = alignments.hits_for_gene("UniRef50")
        self.assertEqual(len(hits), 3)
Example #9
0
    def test_nucleotide_search_unaligned_reads_read_count_aligned_identity_threshold(
            self):
        """
        Test the unaligned reads and the store alignments
        Test with a bowtie2/sam output file
        Test for aligned read counts
        Test the identity threshold does filter alignments
        """

        # create a set of alignments
        alignments = store.Alignments()
        unaligned_reads_store = store.Reads()

        # update the identity threshold to a number larger than those in the alignments
        original_identity_threshold = config.identity_threshold
        config.identity_threshold = 101.0

        try:
            # read in the aligned and unaligned reads
            [unaligned_reads_file_fasta, reduced_aligned_reads_file
             ] = nucleotide.unaligned_reads(cfg.sam_file_unaligned_reads,
                                            alignments,
                                            unaligned_reads_store,
                                            keep_sam=True)

            # remove temp files
            utils.remove_temp_file(unaligned_reads_file_fasta)
            utils.remove_temp_file(reduced_aligned_reads_file)
        finally:
            # reset the identity threshold back to the original even if the
            # call above raised, so the setting does not leak into other tests
            config.identity_threshold = original_identity_threshold

        # check the aligned reads count (it should be zero as none should pass the threshold)
        self.assertEqual(len(alignments.get_hit_list()), 0)
Example #10
0
    def test_nucleotide_search_unaligned_reads_read_count_aligned_evalue_threshold(
            self):
        """
        Test the unaligned reads and the store alignments
        Test with a bowtie2/sam output file
        Test for aligned read counts
        Test the evalue threshold does not filter alignments
        """

        # create a set of alignments
        alignments = store.Alignments()
        unaligned_reads_store = store.Reads()

        # update the evalue threshold to a number less than those for the alignment file
        original_evalue_threshold = config.evalue_threshold
        config.evalue_threshold = 1e-15

        try:
            # read in the aligned and unaligned reads
            [unaligned_reads_file_fasta, reduced_aligned_reads_file
             ] = nucleotide.unaligned_reads(cfg.sam_file_unaligned_reads,
                                            alignments,
                                            unaligned_reads_store,
                                            keep_sam=True)

            # remove temp files
            utils.remove_temp_file(unaligned_reads_file_fasta)
            utils.remove_temp_file(reduced_aligned_reads_file)
        finally:
            # reset the evalue threshold back to the original even if the
            # call above raised, so the setting does not leak into other tests
            config.evalue_threshold = original_evalue_threshold

        # check the aligned reads count (all reads should be aligned even though they do not
        # meet the threshold as the evalue threshold is not applied for this type of alignment)
        self.assertEqual(len(alignments.get_hit_list()),
                         cfg.sam_file_unaligned_reads_total_aligned)
Example #11
0
 def test_Read_print_fasta_count_reads(self):
     """
     Read class: Build a reads store from the small fasta file
     Test count_reads reports the expected total for that file
     """

     loaded_reads = store.Reads(cfg.small_fasta_file)

     # compare the stored read count with the expected total
     observed_total = loaded_reads.count_reads()
     self.assertEqual(observed_total, cfg.small_fasta_file_total_sequences)
Example #12
0
 def test_Read_print_fasta_count_reads_minimize_memory_use(self):
     """
     Read class: Build a reads store from the small fasta file
     Test the store is created with minimize memory use
     Test count_reads reports the expected total for that file
     """

     loaded_reads = store.Reads(cfg.small_fasta_file, minimize_memory_use=True)

     # compare the stored read count with the expected total
     observed_total = loaded_reads.count_reads()
     self.assertEqual(observed_total, cfg.small_fasta_file_total_sequences)
    def test_translated_search_unaligned_reads_blastm8(self):
        """
        Test the unaligned reads and the store alignments
        Test with a blastm8-like output file
        Test with empty reads structure
        Test that function does not require gene lengths in reference id
        Test without the coverage filter
        """

        # create a set of alignments
        alignments = store.Alignments()

        # set the coverage threshold to zero so as to not test with filter on
        current_coverage_threshold = config.translated_subject_coverage_threshold
        config.translated_subject_coverage_threshold = 0

        try:
            # load the blastm8-like output by hand to build the expected hits;
            # the context manager closes the file even if a line fails to parse
            with open(cfg.rapsearch2_output_file_without_header) as file_handle:
                for line in file_handle:
                    # skip comment/header lines
                    if line.startswith("#"):
                        continue

                    data = line.strip().split(config.blast_delimiter)

                    referenceid = data[config.blast_reference_index]
                    queryid = data[config.blast_query_index]
                    identity = float(data[config.blast_identity_index])
                    alignment_length = float(
                        data[config.blast_aligned_length_index])

                    alignments.add(referenceid, 0, queryid,
                                   identity / 100.0 * alignment_length,
                                   "unclassified", alignment_length)

            alignments_test = store.Alignments()
            unaligned_reads_store = store.Reads()

            # load the blastm8-like output with the unaligned reads function
            unaligned_file_fasta = translated.unaligned_reads(
                unaligned_reads_store,
                cfg.rapsearch2_output_file_without_header, alignments_test)

            # remove temp file
            utils.remove_temp_file(unaligned_file_fasta)
        finally:
            # reset the coverage threshold even if a step above raised,
            # so the modified setting does not leak into other tests
            config.translated_subject_coverage_threshold = current_coverage_threshold

        # check the values are unchanged
        self.assertEqual(sorted(alignments.get_hit_list()),
                         sorted(alignments_test.get_hit_list()))
Example #14
0
 def test_Read_add_count_reads_duplicate(self):
     """
     Read class: Add three reads where one id is added twice
     Test count_reads reports two reads
     """

     reads = store.Reads()

     # "id1" is added twice with the same sequence
     entries = (("id1", "ATCG"), ("id2", "ATTG"), ("id1", "ATCG"))
     for read_id, sequence in entries:
         reads.add(read_id, sequence)

     self.assertEqual(reads.count_reads(), 2)
Example #15
0
    def test_Read_add_count_reads_duplicate_minimize_memory_use(self):
        """
        Read class: Add three reads where one id is added twice
        Test the store is created with minimize memory use
        Test count_reads reports two reads
        """

        reads = store.Reads(minimize_memory_use=True)

        # "id1" is added twice with the same sequence
        entries = (("id1", "ATCG"), ("id2", "ATTG"), ("id1", "ATCG"))
        for read_id, sequence in entries:
            reads.add(read_id, sequence)

        self.assertEqual(reads.count_reads(), 2)
Example #16
0
 def test_Read_delete_id(self):
     """
     Read class: Test the deleting of ids
     Test that after removing all but one id only that id is listed
     """

     reads = store.Reads(cfg.small_fasta_file)

     # take one id off the end of the list to keep; the rest are removed
     ids_to_remove = reads.id_list()
     surviving_id = ids_to_remove.pop()

     for read_id in ids_to_remove:
         reads.remove_id(read_id)

     # only the id that was kept should still be listed
     self.assertEqual(reads.id_list(), [surviving_id])
Example #17
0
 def test_Read_print_fasta_id_list_minimize_memory_use(self):
     """
     Read class: Test the loading of a full fasta file
     Test the expected ids are loaded
     Test with minimize memory use

     The fasta produced by the reads store and the reference file of
     single line sequences are both parsed into one dictionary keyed by
     id line, so each id should end up with two identical sequences
     """

     reads_store = store.Reads(cfg.small_fasta_file, minimize_memory_use=True)

     # join then split what get_fasta produces so it is handled line by line
     stored_lines = "\n".join(reads_store.get_fasta()).split("\n")

     compare_fasta = {}

     def collect_sequences(lines):
         # record the sequence line found for each id line in lines
         read_id = ""
         sequence = ""
         for line in lines:
             if ">" in line:
                 # store prior id
                 if read_id and sequence:
                     compare_fasta.setdefault(read_id, []).append(sequence)
                 read_id = line.strip()
                 sequence = ""
             else:
                 # only the latest sequence line is kept for an id
                 sequence = line.strip()

         # store the last sequence found
         if read_id and sequence:
             compare_fasta.setdefault(read_id, []).append(sequence)

     # organize the fasta from the read class and the
     # file of correct fasta output; the context manager closes
     # the file even if parsing raises
     with open(cfg.small_fasta_file_single_line_sequences) as file_handle:
         for fasta_lines in (stored_lines, file_handle):
             collect_sequences(fasta_lines)

     # check there are still the same number of ids
     self.assertEqual(len(compare_fasta), cfg.small_fasta_file_total_sequences)

     # check the sequences match (one from the store, one from the file)
     for sequences in compare_fasta.values():
         self.assertEqual(len(sequences), 2)
         self.assertEqual(sequences[0], sequences[1])
    def test_translated_search_unaligned_reads_annotations_gene_length(self):
        """
        Test the unaligned reads and the store alignments
        Test with a rapsearch2 output file
        Test the different annotation formats are recognized for gene length
        Test without the coverage filter
        """

        # create a set of alignments
        alignments = store.Alignments()
        unaligned_reads_store = store.Reads()

        # set the coverage threshold to zero so as to not test with filter on
        current_coverage_threshold = config.translated_subject_coverage_threshold
        config.translated_subject_coverage_threshold = 0

        try:
            # load the rapsearch2 output with the unaligned reads function
            unaligned_file_fasta = translated.unaligned_reads(
                unaligned_reads_store, cfg.rapsearch_file_annotations,
                alignments)

            # remove temp file
            utils.remove_temp_file(unaligned_file_fasta)
        finally:
            # reset the coverage threshold even if the call above raised,
            # so the modified setting does not leak into other tests
            config.translated_subject_coverage_threshold = current_coverage_threshold

        # there should be 4 hits identified
        all_hits = alignments.get_hit_list()
        self.assertEqual(len(all_hits), 4)

        # expected lengths for a gene length of 2000 (reference "UniRef50")
        # and of 1000 (all other references), with a read length of 50
        read_length = 50
        expected_length_uniref50 = (abs(2000 - read_length) + 1) / 1000.0
        expected_length_other = (abs(1000 - read_length) + 1) / 1000.0

        # check for set and default gene lengths
        for _query, _bug, reference, _score, length in all_hits:
            if reference == "UniRef50":
                self.assertEqual(length, expected_length_uniref50)
            else:
                self.assertEqual(length, expected_length_other)
Example #19
0
    def test_nucleotide_search_unaligned_reads_annotations_gene_length(self):
        """
        Test nucleotide.unaligned_reads with the annotations bowtie2/sam file
        Test four hits are stored
        Test the stored length of each hit against values computed from a
        read length of 151 and a gene length of 2000 for "UniRef50" or
        1000 for any other reference
        """

        # stores handed to the function under test
        hit_store = store.Alignments()
        read_store = store.Reads()

        # process the sam file; the call returns the two files it wrote
        fasta_path, aligned_path = nucleotide.unaligned_reads(
            cfg.sam_file_annotations,
            hit_store,
            read_store,
            keep_sam=True)

        # the output files are not needed for this check
        for temp_path in (fasta_path, aligned_path):
            utils.remove_temp_file(temp_path)

        # four hits are expected in total
        stored_hits = hit_store.get_hit_list()
        self.assertEqual(len(stored_hits), 4)

        read_length = 151

        def expected_length(gene_length):
            # length expected to be stored for a hit to a gene of this length
            return (abs(gene_length - read_length) + 1) / 1000.0

        uniref50_length = expected_length(2000)
        other_length = expected_length(1000)

        # each hit is a five item sequence; only reference and length are checked
        for _query, _bug, reference, _score, length in stored_hits:
            expected = uniref50_length if reference == "UniRef50" else other_length
            self.assertEqual(length, expected)
Example #20
0
    def test_nucleotide_search_unaligned_reads_annotations_bug(self):
        """
        Test nucleotide.unaligned_reads with the annotations bowtie2/sam file
        Test the only bug recorded for the alignments is "unclassified"
        """

        # stores handed to the function under test
        hit_store = store.Alignments()
        read_store = store.Reads()

        # process the sam file; the call returns the two files it wrote
        fasta_path, aligned_path = nucleotide.unaligned_reads(
            cfg.sam_file_annotations,
            hit_store,
            read_store,
            keep_sam=True)

        # the output files are not needed for this check
        for temp_path in (fasta_path, aligned_path):
            utils.remove_temp_file(temp_path)

        # a single bug, "unclassified", is expected
        observed_bugs = hit_store.bug_list()
        self.assertEqual(observed_bugs, ["unclassified"])
    def test_translated_search_unaligned_reads_identity_threshold(self):
        """
        Test the unaligned reads function
        Test with a rapsearch output file
        Test the identity threshold filtering
        Test without the coverage filter
        """

        # create a set of alignments
        alignments = store.Alignments()

        # remember both settings this test changes so they can be restored
        current_coverage_threshold = config.translated_subject_coverage_threshold
        original_identity_threshold = config.identity_threshold

        # set the coverage threshold to zero so as to not test with filter on
        config.translated_subject_coverage_threshold = 0

        # set a new threshold that will select 3 of the 5 alignments
        config.identity_threshold = 60.0

        try:
            # load the rapsearch output by hand to build the expected hits;
            # the context manager closes the file even if a line fails to parse
            with open(cfg.rapsearch2_output_file_with_header) as file_handle:
                for line in file_handle:
                    # skip comment/header lines
                    if line.startswith("#"):
                        continue

                    data = line.strip().split(config.blast_delimiter)

                    referenceid = data[config.blast_reference_index]
                    queryid = data[config.blast_query_index]
                    identity = float(data[config.blast_identity_index])
                    alignment_length = float(
                        data[config.blast_aligned_length_index])

                    # only store those alignments with identities that meet threshold
                    if identity > config.identity_threshold:
                        alignments.add(referenceid, 0, queryid,
                                       identity / 100.0 * alignment_length,
                                       "unclassified", alignment_length)

            alignments_test = store.Alignments()
            unaligned_reads_store = store.Reads()

            # load the rapsearch output with the unaligned reads function
            unaligned_file_fasta = translated.unaligned_reads(
                unaligned_reads_store, cfg.rapsearch2_output_file_with_header,
                alignments_test)

            # remove temp file
            utils.remove_temp_file(unaligned_file_fasta)
        finally:
            # restore both settings even if a step above raised,
            # so the modified values do not leak into other tests
            config.translated_subject_coverage_threshold = current_coverage_threshold
            config.identity_threshold = original_identity_threshold

        # check the total number of alignments is the same
        self.assertEqual(len(alignments.get_hit_list()),
                         len(alignments_test.get_hit_list()))
    def test_translated_search_unaligned_reads_blastm8_coverage_filter(self):
        """
        Test the unaligned reads and the store alignments
        Test with a blastm8-like output file
        Test with empty reads structure
        Test that function does not require gene lengths in reference id
        Test with the coverage filter
        Test with query length annotations
        Test that an alignment with query start larger than query end is not filtered
        """

        # create a set of alignments
        alignments = store.Alignments()

        # set the coverage threshold to a small value so as to have some alignments pass
        current_coverage_threshold = config.translated_subject_coverage_threshold
        config.translated_subject_coverage_threshold = 0.50

        try:
            # get the set of allowed proteins
            allowed_proteins = blastx_coverage.blastx_coverage(
                cfg.rapsearch2_output_file_without_header_coverage,
                config.translated_subject_coverage_threshold, alignments, True)

            # load the blastm8-like output by hand to build the expected hits;
            # the context manager closes the file even if a line fails to parse
            with open(cfg.rapsearch2_output_file_without_header_coverage
                      ) as file_handle:
                for line in file_handle:
                    # skip comment/header lines
                    if line.startswith("#"):
                        continue

                    data = line.strip().split(config.blast_delimiter)

                    referenceid = data[config.blast_reference_index]
                    gene, length, bug = alignments.process_reference_annotation(
                        referenceid)
                    # the query length annotation is parsed off the id but
                    # not needed for the expected hits
                    queryid, _query_length = utilities.get_length_annotation(
                        data[config.blast_query_index])
                    identity = float(data[config.blast_identity_index])
                    alignment_length = float(
                        data[config.blast_aligned_length_index])

                    # only genes in the allowed set are expected as hits
                    if gene in allowed_proteins:
                        alignments.add(gene, length, queryid,
                                       identity / 100.0 * alignment_length,
                                       bug, alignment_length)

            alignments_test = store.Alignments()
            unaligned_reads_store = store.Reads()

            # load the blastm8-like output with the unaligned reads function
            unaligned_file_fasta = translated.unaligned_reads(
                unaligned_reads_store,
                cfg.rapsearch2_output_file_without_header_coverage,
                alignments_test)

            # remove temp file
            utils.remove_temp_file(unaligned_file_fasta)
        finally:
            # reset the coverage threshold even if a step above raised,
            # so the modified setting does not leak into other tests
            config.translated_subject_coverage_threshold = current_coverage_threshold

        # check the values are unchanged
        self.assertEqual(sorted(alignments.get_hit_list()),
                         sorted(alignments_test.get_hit_list()))