def test_translated_search_unaligned_reads_blastm8(self):
    """
    Test the unaligned reads and the store alignments
    Test with a blastm8-like output file
    Test with empty reads structure
    Test that function does not require gene lengths in reference id
    Test without the coverage filter

    The expected alignments are built by parsing the file by hand and are
    compared with the alignments stored by translated.unaligned_reads.
    """

    # create a set of alignments holding the expected hits
    alignments = store.Alignments()

    # set the coverage threshold to zero so as to not test with filter on
    current_coverage_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0

    try:
        # load the blastm8-like output
        # (the context manager closes the file even if a line fails to parse)
        with open(cfg.rapsearch2_output_file_without_header) as file_handle:
            for line in file_handle:
                if not line.startswith("#"):
                    data = line.strip().split(config.blast_delimiter)

                    referenceid = data[config.blast_reference_index]
                    queryid = data[config.blast_query_index]
                    identity = float(data[config.blast_identity_index])
                    alignment_length = float(
                        data[config.blast_aligned_length_index])

                    # the stored matches value is the identity fraction
                    # multiplied by the aligned length
                    alignments.add(referenceid, 0, queryid,
                                   identity / 100.0 * alignment_length,
                                   "unclassified", alignment_length)

        alignments_test = store.Alignments()
        unaligned_reads_store = store.Reads()

        # load the blastm8-like output with the unaligned reads function
        unaligned_file_fasta = translated.unaligned_reads(
            unaligned_reads_store, cfg.rapsearch2_output_file_without_header,
            alignments_test)
    finally:
        # reset the coverage threshold even if the code under test raised,
        # so the override does not carry over into other tests
        config.translated_subject_coverage_threshold = current_coverage_threshold

    # remove temp file
    utils.remove_temp_file(unaligned_file_fasta)

    # check the values are unchanged
    self.assertEqual(sorted(alignments.get_hit_list()),
                     sorted(alignments_test.get_hit_list()))
def test_translated_search_unaligned_reads_annotations_bug(self):
    """
    Test the unaligned reads and the store alignments
    Test with a rapsearch2 output file
    Test the different annotation formats are recognized for bug
    Test without the coverage filter
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # set the coverage threshold to zero so as to not test with filter on
    current_coverage_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0

    try:
        # load the rapsearch2 output with the unaligned reads function
        unaligned_file_fasta = translated.unaligned_reads(
            unaligned_reads_store, cfg.rapsearch_file_annotations, alignments)
    finally:
        # reset the coverage threshold even if the call above raised,
        # so the override does not carry over into other tests
        config.translated_subject_coverage_threshold = current_coverage_threshold

    # remove temp file
    utils.remove_temp_file(unaligned_file_fasta)

    # there should be one bug name and the other should be unclassified
    self.assertEqual(
        sorted(alignments.bug_list()),
        sorted(["s__Bacteroides_xylanisolvens", "unclassified"]))
def test_Alignments_id_mapping_half_hits_with_temp_alignment_file(self):
    """
    Test the store_id_mapping function
    Test the add_annotated and process_reference_annotation with id mapping
    Test the lengths are mapped correctly with only some references included
    in those provided for id mapping
    Test with the temp alignment file
    """

    alignments_store = store.Alignments(minimize_memory_use=True)

    try:
        # load in the id_mapping file
        alignments_store.process_id_mapping(cfg.id_mapping_file)

        # store some alignments
        alignments_store.add_annotated("query1", 1, "ref1")
        alignments_store.add_annotated("query2", 1, "ref2")
        alignments_store.add_annotated("query3", 1, "ref1|100")
        alignments_store.add_annotated("query3", 1, "200|ref2")

        hit_list = alignments_store.get_hit_list()
    finally:
        # delete the temp alignment file, also when one of the calls above
        # raised, so a failing run does not leave the file behind
        alignments_store.delete_temp_alignments_file()

    # test the lengths are correct (the length is the last item of each hit)
    stored_lengths = [item[-1] for item in hit_list]
    self.assertEqual(
        sorted(stored_lengths),
        sorted([1 / 1000.0, 100 / 1000.0, 200 / 1000.0, 1000 / 1000.0]))
def test_nucleotide_search_unaligned_reads_output_fasta_format(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test output file is of fasta format
    Test sam file is not removed
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # read in the aligned and unaligned reads
    [unaligned_reads_file_fasta, reduced_aligned_reads_file
     ] = nucleotide.unaligned_reads(cfg.sam_file_unaligned_reads, alignments,
                                    unaligned_reads_store, keep_sam=True)

    try:
        # check for fasta output file format
        file_format = utilities.determine_file_format(
            unaligned_reads_file_fasta)
        self.assertEqual("fasta", file_format)
    finally:
        # remove temp files even when the format check fails
        utils.remove_temp_file(unaligned_reads_file_fasta)
        utils.remove_temp_file(reduced_aligned_reads_file)
def test_gene_families_tsv_output_with_names(self):
    """
    Test the gene families function and the blast config indexes
    Test UniRef50_unknown is read in and used for gene scores but not printed
    Test the tsv output
    Test that gene families have names applied to them
    Test unmapped reads total is written with the same precision as other lines
    """

    # update the max decimals to allow for rounding
    # NOTE(review): output_max_decimals, output_format and genefamilies_file
    # stay modified after this test, as before; confirm whether other tests
    # rely on that before restoring them here.
    config.output_max_decimals = 7

    # set to a smaller mapping file
    original_gene_family_mapping_file = config.gene_family_name_mapping_file
    config.gene_family_name_mapping_file = cfg.gene_families_to_names_file

    # stays None until a temp output file exists and needs cleaning up
    gene_families_file = None

    try:
        # create a set of alignments
        alignments = store.Alignments()

        # load the usearch output
        # (the context manager closes the file even if a line fails to parse)
        with open(cfg.usearch_uniref50_file) as file_handle:
            for line in file_handle:
                if not line.startswith("#"):
                    data = line.strip().split(config.blast_delimiter)

                    # the reference id is split on "|": item 0 is passed as
                    # the bug and item 1 as the gene
                    referenceids = data[config.blast_reference_index].split("|")
                    queryid = data[config.blast_query_index]
                    identity = float(data[config.blast_identity_index])

                    alignments.add(referenceids[1], 1, queryid, identity,
                                   referenceids[0])

        # set the output format
        config.output_format = "tsv"

        # set the location of the file to write to as a temp file
        file_out, gene_families_file = tempfile.mkstemp()
        os.close(file_out)
        config.genefamilies_file = gene_families_file

        # create gene_scores instance
        gene_scores = store.GeneScores()

        # obtain the gene families
        gene_families_file = families.gene_families(alignments, gene_scores, 1)

        # check the gene families output is as expected
        self.assertTrue(
            filecmp.cmp(gene_families_file,
                        cfg.gene_familes_uniref50_with_names_file,
                        shallow=False))
    finally:
        # reset the mapping file, also when the comparison above fails
        config.gene_family_name_mapping_file = original_gene_family_mapping_file

        # delete the temp file if one was created
        if gene_families_file is not None:
            utils.remove_temp_file(gene_families_file)
def test_nucleotide_search_unaligned_reads_read_count_aligned_identity_threshold(
        self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test for aligned read counts
    Test the identity threshold does filter alignments
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # update the identity threshold to a number larger than those in the alignments
    original_identity_threshold = config.identity_threshold
    config.identity_threshold = 101.0

    try:
        # read in the aligned and unaligned reads
        [unaligned_reads_file_fasta, reduced_aligned_reads_file
         ] = nucleotide.unaligned_reads(cfg.sam_file_unaligned_reads,
                                        alignments, unaligned_reads_store,
                                        keep_sam=True)
    finally:
        # reset the identity threshold back to the original even if the call
        # raised; a leaked 101.0 threshold would affect other tests
        config.identity_threshold = original_identity_threshold

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # check the aligned reads count (it should be zero as none should pass the threshold)
    self.assertEqual(len(alignments.get_hit_list()), 0)
def test_nucleotide_search_unaligned_reads_read_count_unaligned_minimize_memory_use(
        self):
    """
    Nucleotide search: unaligned_reads with a bowtie2/sam output file
    Check the number of unaligned reads held by the reads store
    The reads store is created with minimize memory use turned on
    """

    # stores filled in by the function under test
    hits = store.Alignments()
    reads = store.Reads(minimize_memory_use=True)

    # process the sam file; two output file names are returned
    output_files = nucleotide.unaligned_reads(
        cfg.sam_file_unaligned_reads, hits, reads, keep_sam=True)
    unaligned_fasta, reduced_aligned = output_files

    # the output files are not needed for the count check
    for temp_file in (unaligned_fasta, reduced_aligned):
        utils.remove_temp_file(temp_file)

    # compare the stored count with the known total for this sam file
    unaligned_total = reads.count_reads()
    self.assertEqual(unaligned_total,
                     cfg.sam_file_unaligned_reads_total_unaligned)
def test_nucleotide_search_unaligned_reads_output_blast_format(self):
    """
    Nucleotide search: unaligned_reads with a bowtie2/sam output file
    Check the reduced aligned reads file is written in the blastm8 format
    """

    # stores filled in by the function under test
    hits = store.Alignments()
    reads = store.Reads()

    # NOTE(review): the basename is left set to "TEST" after this test;
    # confirm whether other tests depend on it before restoring it here
    config.file_basename = "TEST"

    # process the sam file; two output file names are returned
    output_files = nucleotide.unaligned_reads(
        cfg.sam_file_annotations, hits, reads, keep_sam=True)
    unaligned_fasta, reduced_aligned = output_files

    # detect the format before the file is deleted
    detected_format = utilities.determine_file_format(reduced_aligned)

    # clean up both output files
    for temp_file in (unaligned_fasta, reduced_aligned):
        utils.remove_temp_file(temp_file)

    self.assertEqual(detected_format, "blastm8")
def test_gene_families_gene_list(self):
    """
    Test the gene families function and the blast config indexes
    Test UniRef50_unknown is read in and used for gene scores but not printed
    Test the gene list
    """

    # create a set of alignments
    alignments = store.Alignments()

    # load the usearch output
    # (the context manager closes the file even if a line fails to parse)
    with open(cfg.usearch_file) as file_handle:
        for line in file_handle:
            if not line.startswith("#"):
                data = line.strip().split(config.blast_delimiter)

                # the reference id is split on "|": item 0 is passed as the
                # bug and item 1 as the gene
                referenceids = data[config.blast_reference_index].split("|")
                queryid = data[config.blast_query_index]
                identity = float(data[config.blast_identity_index])

                alignments.add(referenceids[1], 1, queryid, identity,
                               referenceids[0])

    # check the genes were loaded correctly
    self.assertEqual(sorted(cfg.usearch_file_gene_list),
                     sorted(alignments.gene_list()))
def test_nucleotide_search_unaligned_reads_scores(self):
    """
    Nucleotide search: unaligned_reads with a bowtie2/sam output file
    Check every stored hit has the score 151.0 raised to config.match_power
    """

    # stores filled in by the function under test
    hits_store = store.Alignments()
    reads = store.Reads()

    # process the sam file; two output file names are returned
    output_files = nucleotide.unaligned_reads(
        cfg.sam_file_annotations, hits_store, reads, keep_sam=True)
    unaligned_fasta, reduced_aligned = output_files

    # the output files are not needed for the score check
    for temp_file in (unaligned_fasta, reduced_aligned):
        utils.remove_temp_file(temp_file)

    # NOTE(review): the number of hits is not asserted, so an empty hit
    # list would pass this test without checking any score
    stored_hits = hits_store.get_hit_list()

    expected = math.pow(151.0, config.match_power)

    # each hit holds five items with the score in the fourth position
    for _query, _bug, _reference, score, _length in stored_hits:
        self.assertEqual(score, expected)
def test_translated_search_unaligned_reads_annotations_reference(self):
    """
    Test the unaligned reads and the store alignments
    Test with a rapsearch2 output file
    Test the different annotation formats are recognized for reference
    Test without the coverage filter
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # set the coverage threshold to zero so as to not test with filter on
    current_coverage_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0

    try:
        # load the rapsearch2 output with the unaligned reads function
        unaligned_file_fasta = translated.unaligned_reads(
            unaligned_reads_store, cfg.rapsearch_file_annotations, alignments)
    finally:
        # reset the coverage threshold even if the call above raised,
        # so the override does not carry over into other tests
        config.translated_subject_coverage_threshold = current_coverage_threshold

    # remove temp file
    utils.remove_temp_file(unaligned_file_fasta)

    # three of the hits should be for gene "UniRef50"
    hits = alignments.hits_for_gene("UniRef50")
    self.assertEqual(len(hits), 3)
def test_nucleotide_search_unaligned_reads_read_count_aligned(self):
    """
    Nucleotide search: unaligned_reads with a bowtie2/sam output file
    Check the number of hits stored for the aligned reads
    """

    # stores filled in by the function under test
    hits_store = store.Alignments()
    reads = store.Reads()

    # process the sam file; two output file names are returned
    output_files = nucleotide.unaligned_reads(
        cfg.sam_file_unaligned_reads, hits_store, reads, keep_sam=True)
    unaligned_fasta, reduced_aligned = output_files

    # the output files are not needed for the count check
    for temp_file in (unaligned_fasta, reduced_aligned):
        utils.remove_temp_file(temp_file)

    # compare the stored hit count with the known total for this sam file
    aligned_total = len(hits_store.get_hit_list())
    self.assertEqual(aligned_total,
                     cfg.sam_file_unaligned_reads_total_aligned)
def test_nucleotide_search_unaligned_reads_read_count_aligned_evalue_threshold(
        self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test for aligned read counts
    Test the evalue threshold does not filter alignments
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # update the evalue threshold to a number less than those for the alignment file
    original_evalue_threshold = config.evalue_threshold
    config.evalue_threshold = 1e-15

    try:
        # read in the aligned and unaligned reads
        [unaligned_reads_file_fasta, reduced_aligned_reads_file
         ] = nucleotide.unaligned_reads(cfg.sam_file_unaligned_reads,
                                        alignments, unaligned_reads_store,
                                        keep_sam=True)
    finally:
        # reset the evalue threshold back to the original even if the call
        # raised, so the strict threshold does not leak into other tests
        config.evalue_threshold = original_evalue_threshold

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # check the aligned reads count (all reads should be aligned even though they do not
    # meet the threshold as the evalue threshold is not applied for this type of alignment)
    self.assertEqual(len(alignments.get_hit_list()),
                     cfg.sam_file_unaligned_reads_total_aligned)
def test_Alignments_id_mapping_all_bug_list_with_temp_alignment_file(self):
    """
    Test the store_id_mapping function
    Test the add_annotated and process_reference_annotation with id mapping
    Test the bugs are mapped correctly
    Test with the temp alignment file
    """

    alignments_store = store.Alignments(minimize_memory_use=True)

    try:
        # load in the id_mapping file
        alignments_store.process_id_mapping(cfg.id_mapping_file)

        # store some alignments
        alignments_store.add_annotated("query1", 1, "ref1")
        alignments_store.add_annotated("query2", 1, "ref2")
        alignments_store.add_annotated("query3", 1, "ref3")

        bug_list = alignments_store.bug_list()
    finally:
        # delete the temp alignment file, also when one of the calls above
        # raised, so a failing run does not leave the file behind
        alignments_store.delete_temp_alignments_file()

    # test the bugs are correct
    self.assertEqual(sorted(bug_list), sorted(["bug3", "unclassified"]))
def test_nucleotide_search_unaligned_reads_annotations_reference(self):
    """
    Nucleotide search: unaligned_reads with a bowtie2/sam output file
    Check the reference is recognized across the annotation formats
    by counting the hits stored for gene "UniRef50"
    """

    # stores filled in by the function under test
    hits_store = store.Alignments()
    reads = store.Reads()

    # process the sam file; two output file names are returned
    output_files = nucleotide.unaligned_reads(
        cfg.sam_file_annotations, hits_store, reads, keep_sam=True)
    unaligned_fasta, reduced_aligned = output_files

    # the output files are not needed for the check
    for temp_file in (unaligned_fasta, reduced_aligned):
        utils.remove_temp_file(temp_file)

    # exactly two hits are expected for this gene
    uniref50_hits = hits_store.hits_for_gene("UniRef50")
    self.assertEqual(len(uniref50_hits), 2)
def test_Alignments_process_chocophlan_length(self):
    """
    Alignments class: process_chocophlan_length
    Check a single location in the standard "start-end" format
    """

    # the location "1-100" is expected to give a length of 100
    computed_length = store.Alignments().process_chocophlan_length(
        "1-100", "gene")

    self.assertEqual(computed_length, 100)
def test_Alignments_process_chocophlan_length_multiple(self):
    """
    Alignments class: process_chocophlan_length
    Check two comma-separated locations are summed
    One of the locations is on the reverse strand ("c:" prefix)
    """

    # two locations of 100 each are expected to give a length of 200
    computed_length = store.Alignments().process_chocophlan_length(
        "c:100-1,1-100", "gene")

    self.assertEqual(computed_length, 200)
def test_Alignments_process_reference_annotation_gene_length_reversed(self):
    """
    Alignments class: process_reference_annotation
    Check an annotation with the length placed before the gene
    """

    # gene, length and bug expected from the "length|gene" annotation
    expected = ["gene", 3000, "unclassified"]

    parsed = store.Alignments().process_reference_annotation("3000|gene")

    self.assertEqual(expected, parsed)
def test_Alignments_process_reference_annotation_unknown_annotations_three_items_length_string(self):
    """
    Alignments class: process_reference_annotation
    Check an unknown annotation format with three items where the item
    in the length position is a string
    """

    annotation = "UniRef90_W1Q3F0|UniRef50_P59787|5000"

    # the full annotation is expected back as the gene, with a length of
    # zero and the bug set to unclassified
    expected = [annotation, 0, "unclassified"]

    parsed = store.Alignments().process_reference_annotation(annotation)

    self.assertEqual(expected, parsed)
def test_Alignments_process_reference_annotation_unknown_annotations_four_items(self):
    """
    Alignments class: process_reference_annotation
    Check an unknown annotation format with four items
    """

    annotation = "UniRef90_W1Q3F0|UniRef50_P59787|5000|bug"

    # the full annotation is expected back as the gene, with a length of
    # zero and the bug set to unclassified
    expected = [annotation, 0, "unclassified"]

    parsed = store.Alignments().process_reference_annotation(annotation)

    self.assertEqual(expected, parsed)
def test_Alignments_process_reference_annotation_gene_length_with_bug(self):
    """
    Alignments class: process_reference_annotation
    Check an annotation holding a gene, a length and a bug
    """

    # gene, length and bug expected from the "gene|length|bug" annotation
    expected = ["gene", 3000, "bug"]

    parsed = store.Alignments().process_reference_annotation("gene|3000|bug")

    self.assertEqual(expected, parsed)
def test_Alignments_process_reference_annotation_unknown_annotations_three_items_bug_int(self):
    """
    Alignments class: process_reference_annotation
    Check an unknown annotation format with three items where the item
    in the bug position is an int
    """

    annotation = "UniRef90_W1Q3F0|5000|5000"

    # the full annotation is expected back as the gene, with a length of
    # zero and the bug set to unclassified
    expected = [annotation, 0, "unclassified"]

    parsed = store.Alignments().process_reference_annotation(annotation)

    self.assertEqual(expected, parsed)
def test_Alignments_process_reference_annotation_numerical_gene_length(self):
    """
    Alignments class: process_reference_annotation
    Check an annotation where the gene name is itself a number,
    followed by the length
    """

    # the gene is expected back as a string and the length as an int
    expected = ["59787", 5000, "unclassified"]

    parsed = store.Alignments().process_reference_annotation("59787|5000")

    self.assertEqual(expected, parsed)
def test_Alignments_compute_gene_scores_double_gene_double_query_with_temp_alignment_file(
        self):
    """
    Test the compute_gene_scores function
    Test two hits to gene with more than one hit per query
    Test with the temp alignment file

    The gene checked is gene2, which is hit by query1 and query2.
    query1 also hits gene1, so its contribution to gene2 is divided by
    the sum of the scores of both of its hits.
    """

    # create a set of hits
    matches1 = 41.0
    matches2 = 57.1
    matches3 = 61.0
    matches4 = 72.1

    gene1_length = 2
    gene2_length = 3
    gene3_length = 4

    # Create a set of alignments
    alignments_store = store.Alignments(minimize_memory_use=True)

    try:
        alignments_store.add("gene1", gene1_length, "query1", matches1, "bug1")
        alignments_store.add("gene2", gene2_length, "query1", matches2, "bug1")
        alignments_store.add("gene2", gene2_length, "query2", matches3, "bug1")
        alignments_store.add("gene3", gene3_length, "query3", matches4, "bug1")

        gene_scores_store = store.GeneScores()

        # compute gene scores
        alignments_store.convert_alignments_to_gene_scores(gene_scores_store)

        actual_gene_score = gene_scores_store.get_score("bug1", "gene2")
    finally:
        # delete the temp alignment file, also when one of the calls above
        # raised, so a failing run does not leave the file behind
        alignments_store.delete_temp_alignments_file()

    # query1 hits gene1 and gene2
    hit1_score = math.pow(matches1, config.match_power)
    hit2_score = math.pow(matches2, config.match_power)
    query1_sum = hit1_score + hit2_score

    # convert lengths to per kb
    gene2_length = gene2_length / 1000.0

    # query2 hits gene2 only
    hit3_score = math.pow(matches3, config.match_power)
    query2_sum = hit3_score

    expected_gene_score = hit3_score / query2_sum / gene2_length + hit2_score / query1_sum / gene2_length

    self.assertAlmostEqual(actual_gene_score, expected_gene_score, places=7)
def test_Alignments_process_reference_annotation_new_chocophlan_annotations(self):
    """
    Alignments class: process_reference_annotation
    Check an annotation written in the new chocophlan format
    """

    annotation = "gi|554771211|gb|ACIN03000006.1|:c1189-5|46125|g__Abiotrophia.s__Abiotrophia_defectiva|UniRef90_W1Q3F0|UniRef50_P59787|5000"

    # gene, length and bug expected from the annotation above
    expected = ["UniRef50_P59787", 5000,
                "g__Abiotrophia.s__Abiotrophia_defectiva"]

    parsed = store.Alignments().process_reference_annotation(annotation)

    self.assertEqual(expected, parsed)
def test_Alignments_add_gene_count(self):
    """
    Alignments class: add
    Check the total number of genes after adding four hits
    that cover three different genes
    """

    hits_store = store.Alignments()

    # (gene, query, bug) for each hit; gene1 is added twice
    hits_to_add = (
        ("gene2", "Q3", "bug1"),
        ("gene1", "Q1", "bug2"),
        ("gene3", "Q2", "bug3"),
        ("gene1", "Q1", "bug1"),
    )
    for gene, query, bug in hits_to_add:
        hits_store.add(gene, 1, query, 0.01, bug, 1)

    # the repeated gene is expected to be counted once
    self.assertEqual(hits_store.count_genes(), 3)
def test_Alignments_add_gene_list(self):
    """
    Alignments class: add
    Check the list of genes after adding four hits
    that cover three different genes
    """

    hits_store = store.Alignments()

    # (gene, query, bug) for each hit; gene1 is added twice
    hits_to_add = (
        ("gene2", "Q3", "bug1"),
        ("gene1", "Q1", "bug2"),
        ("gene3", "Q2", "bug3"),
        ("gene1", "Q1", "bug1"),
    )
    for gene, query, bug in hits_to_add:
        hits_store.add(gene, 1, query, 0.01, bug, 1)

    # each gene is expected to be listed once
    stored_genes = sorted(hits_store.gene_list())
    self.assertEqual(stored_genes, ["gene1", "gene2", "gene3"])
def test_Alignments_add_gene_lengths(self):
    """
    Alignments class: add
    Check the gene lengths stored with the hits
    The last hit is added with a gene length of zero
    """

    hits_store = store.Alignments()

    # (gene, length, query, bug) for each hit
    hits_to_add = (
        ("gene2", 10, "Q3", "bug1"),
        ("gene1", 100, "Q1", "bug2"),
        ("gene3", 1000, "Q2", "bug3"),
        ("gene1", 0, "Q1", "bug1"),
    )
    for gene, gene_length, query, bug in hits_to_add:
        hits_store.add(gene, gene_length, query, 0.01, bug, 1)

    # the length is the last item of each stored hit
    stored_lengths = sorted(hit[-1] for hit in hits_store.get_hit_list())

    # the expected values are the lengths divided by 1000; the zero length
    # hit is expected to be stored with the same value as a length of 1000
    expected_lengths = sorted(
        length / 1000.0 for length in (10, 100, 1000, 1000))

    self.assertEqual(stored_lengths, expected_lengths)
def test_Alignments_compute_gene_scores_single_gene_single_query_with_temp_alignment_file(
        self):
    """
    Test the compute_gene_scores function
    Test one hit for gene with one hit for query
    Test with the temp alignment file

    The gene checked is gene3, which is hit only by query3, and query3
    has no other hits.
    """

    # create a set of hits
    matches1 = 41.0
    matches2 = 57.1
    matches3 = 61.0
    matches4 = 72.1

    gene1_length = 2
    gene2_length = 3
    gene3_length = 4

    # Create a set of alignments
    alignments_store = store.Alignments(minimize_memory_use=True)

    try:
        alignments_store.add("gene1", gene1_length, "query1", matches1, "bug1")
        alignments_store.add("gene2", gene2_length, "query1", matches2, "bug1")
        alignments_store.add("gene2", gene2_length, "query2", matches3, "bug1")
        alignments_store.add("gene3", gene3_length, "query3", matches4, "bug1")

        gene_scores_store = store.GeneScores()

        # compute gene scores
        alignments_store.convert_alignments_to_gene_scores(gene_scores_store)

        actual_gene_score = gene_scores_store.get_score("bug1", "gene3")
    finally:
        # delete the temp alignment file, also when one of the calls above
        # raised, so a failing run does not leave the file behind
        alignments_store.delete_temp_alignments_file()

    # convert lengths to per kb
    gene3_length = gene3_length / 1000.0

    # gene3
    hit4_score = math.pow(matches4, config.match_power)
    query3_sum = hit4_score
    expected_gene_score = hit4_score / query3_sum / gene3_length

    # compare floats with a tolerance, as the double gene / double query
    # version of this test does, instead of requiring exact equality
    self.assertAlmostEqual(actual_gene_score, expected_gene_score, places=7)
def test_translated_search_unaligned_reads_annotations_gene_length(self):
    """
    Test the unaligned reads and the store alignments
    Test with a rapsearch2 output file
    Test the different annotation formats are recognized for gene length
    Test without the coverage filter
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # set the coverage threshold to zero so as to not test with filter on
    current_coverage_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0

    try:
        # load the rapsearch2 output with the unaligned reads function
        unaligned_file_fasta = translated.unaligned_reads(
            unaligned_reads_store, cfg.rapsearch_file_annotations, alignments)
    finally:
        # reset the coverage threshold even if the call above raised,
        # so the override does not carry over into other tests
        config.translated_subject_coverage_threshold = current_coverage_threshold

    # remove temp file
    utils.remove_temp_file(unaligned_file_fasta)

    # there should be 4 hits identified
    all_hits = alignments.get_hit_list()
    self.assertEqual(len(all_hits), 4)

    # expected lengths for a gene length of 2000 ("UniRef50") and of 1000
    # (all other references), each adjusted by the read length and
    # converted to per kb
    read_length = 50
    expected_length_uniref50 = (abs(2000 - read_length) + 1) / 1000.0
    expected_length_other = (abs(1000 - read_length) + 1) / 1000.0

    # check for set and default gene lengths
    # (each hit holds five items: query, bug, reference, score, length)
    for _query, _bug, reference, _score, length in all_hits:
        if reference == "UniRef50":
            self.assertEqual(length, expected_length_uniref50)
        else:
            self.assertEqual(length, expected_length_other)