def test_translated_search_unaligned_reads_blastm8(self):
    """
    Test the unaligned reads and the store alignments
    Test with a blastm8-like output file
    Test with empty reads structure
    Test that function does not require gene lengths in reference id
    Test without the coverage filter

    The expected alignments are built by parsing the file by hand; the
    alignments under test are filled by translated.unaligned_reads. Both
    hit lists must match once sorted.
    """

    # expected alignments, built directly from the file
    alignments = store.Alignments()

    # set the coverage threshold to zero so as to not test with filter on
    current_coverage_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0

    try:
        # the context manager closes the file even if a line fails to parse
        with open(cfg.rapsearch2_output_file_without_header) as file_handle:
            for line in file_handle:
                # skip comment lines
                if line.startswith("#"):
                    continue

                data = line.strip().split(config.blast_delimiter)
                referenceid = data[config.blast_reference_index]
                queryid = data[config.blast_query_index]
                identity = float(data[config.blast_identity_index])
                alignment_length = float(
                    data[config.blast_aligned_length_index])

                # matches are the percent identity scaled by the alignment length
                alignments.add(referenceid, 0, queryid,
                               identity / 100.0 * alignment_length,
                               "unclassified", alignment_length)

        alignments_test = store.Alignments()
        unaligned_reads_store = store.Reads()

        # load the blastm8-like output with the unaligned reads function
        unaligned_file_fasta = translated.unaligned_reads(
            unaligned_reads_store,
            cfg.rapsearch2_output_file_without_header, alignments_test)

        # remove temp file
        utils.remove_temp_file(unaligned_file_fasta)
    finally:
        # always reset the coverage threshold so a failure here cannot
        # leak the modified setting into other tests
        config.translated_subject_coverage_threshold = current_coverage_threshold

    # check the values are unchanged
    self.assertEqual(sorted(alignments.get_hit_list()),
                     sorted(alignments_test.get_hit_list()))
def test_Alignments_id_mapping_all_bug_list_with_temp_alignment_file(self):
    """
    Test the store_id_mapping function
    Test the add_annotated and process_reference_annotation with id mapping
    Test the bugs are mapped correctly
    Test with the temp alignment file
    """

    # store backed by a temp alignment file, with the id mapping loaded
    alignments_store = store.Alignments(minimize_memory_use=True)
    alignments_store.process_id_mapping(cfg.id_mapping_file)

    # one annotated alignment per query/reference pair
    annotated_pairs = (
        ("query1", "ref1"),
        ("query2", "ref2"),
        ("query3", "ref3"),
    )
    for query_name, reference_name in annotated_pairs:
        alignments_store.add_annotated(query_name, 1, reference_name)

    # collect the bugs first, then remove the temp alignment file
    found_bugs = alignments_store.bug_list()
    alignments_store.delete_temp_alignments_file()

    # test the bugs are correct
    expected_bugs = ["bug3", "unclassified"]
    self.assertEqual(sorted(found_bugs), sorted(expected_bugs))
def test_Alignments_id_mapping_half_hits_with_temp_alignment_file(self):
    """
    Test the store_id_mapping function
    Test the add_annotated and process_reference_annotation with id mapping
    Test the lengths are mapped correctly with only some references included
    in those provided for id mapping
    Test with the temp alignment file
    """

    # store backed by a temp alignment file, with the id mapping loaded
    alignments_store = store.Alignments(minimize_memory_use=True)
    alignments_store.process_id_mapping(cfg.id_mapping_file)

    # two plain references followed by two references carrying "|" fields
    annotated_pairs = (
        ("query1", "ref1"),
        ("query2", "ref2"),
        ("query3", "ref1|100"),
        ("query3", "200|ref2"),
    )
    for query_name, reference_name in annotated_pairs:
        alignments_store.add_annotated(query_name, 1, reference_name)

    # read the hits before the temp alignment file is removed
    hit_list = alignments_store.get_hit_list()
    alignments_store.delete_temp_alignments_file()

    # the last item of each hit is the stored length
    stored_lengths = sorted(hit[-1] for hit in hit_list)
    expected_lengths = sorted(
        length / 1000.0 for length in (1, 100, 200, 1000))

    # test the lengths are correct
    self.assertEqual(stored_lengths, expected_lengths)
def test_gene_families_gene_list(self):
    """
    Test the gene families function and the blast config indexes
    Test UniRef50_unknown is read in and used for gene scores but not printed
    Test the gene list
    """

    # create a set of alignments
    alignments = store.Alignments()

    # load the usearch output; the context manager closes the file even
    # if a line fails to parse
    with open(cfg.usearch_file) as file_handle:
        for line in file_handle:
            # skip comment lines
            if line.startswith("#"):
                continue

            data = line.strip().split(config.blast_delimiter)
            # the reference column holds two "|"-separated fields
            referenceids = data[config.blast_reference_index].split("|")
            queryid = data[config.blast_query_index]
            identity = float(data[config.blast_identity_index])

            alignments.add(referenceids[1], 1, queryid, identity,
                           referenceids[0])

    # check the genes were loaded correctly
    self.assertEqual(sorted(cfg.usearch_file_gene_list),
                     sorted(alignments.gene_list()))
def test_gene_families_tsv_output_with_names(self):
    """
    Test the gene families function and the blast config indexes
    Test UniRef50_unknown is read in and used for gene scores but not printed
    Test the tsv output
    Test that gene families have names applied to them
    Test unmapped reads total is written with the same precision as other lines
    """

    # update the max decimals to allow for rounding
    # NOTE(review): output_max_decimals, output_format and genefamilies_file
    # are left modified after the test, as before — confirm whether other
    # tests rely on that
    config.output_max_decimals = 7

    # set to a smaller mapping file
    original_gene_family_mapping_file = config.gene_family_name_mapping_file
    config.gene_family_name_mapping_file = cfg.gene_families_to_names_file

    # path of the temp output file; stays None until it has been created
    gene_families_file = None

    try:
        # create a set of alignments
        alignments = store.Alignments()

        # load the usearch output; the context manager closes the file
        # even if a line fails to parse
        with open(cfg.usearch_uniref50_file) as file_handle:
            for line in file_handle:
                # skip comment lines
                if line.startswith("#"):
                    continue

                data = line.strip().split(config.blast_delimiter)
                referenceids = data[config.blast_reference_index].split("|")
                queryid = data[config.blast_query_index]
                identity = float(data[config.blast_identity_index])

                alignments.add(referenceids[1], 1, queryid, identity,
                               referenceids[0])

        # set the output format
        config.output_format = "tsv"

        # set the location of the file to write to as a temp file
        file_out, gene_families_file = tempfile.mkstemp()
        os.close(file_out)
        config.genefamilies_file = gene_families_file

        # create gene_scores instance
        gene_scores = store.GeneScores()

        # obtain the gene families
        gene_families_file = families.gene_families(alignments, gene_scores, 1)

        # check the gene families output is as expected
        self.assertTrue(
            filecmp.cmp(gene_families_file,
                        cfg.gene_familes_uniref50_with_names_file,
                        shallow=False))
    finally:
        # reset the mapping file and delete the temp file even when the
        # comparison fails, so nothing leaks into other tests
        config.gene_family_name_mapping_file = original_gene_family_mapping_file
        if gene_families_file is not None:
            utils.remove_temp_file(gene_families_file)
def test_nucleotide_search_unaligned_reads_read_count_unaligned_minimize_memory_use(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test for unaligned read counts
    Test with minimize memory use
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads(minimize_memory_use=True)

    # turn off query/subject filtering
    config.nucleotide_subject_coverage_threshold = 0
    config.nucleotide_query_coverage_threshold = 0

    try:
        # read in the aligned and unaligned reads
        unaligned_reads_file_fasta, reduced_aligned_reads_file = nucleotide.unaligned_reads(
            cfg.sam_file_unaligned_reads, alignments, unaligned_reads_store,
            keep_sam=True)
    finally:
        # reset query/subject filtering even if the search raises, so the
        # zeroed thresholds cannot leak into other tests
        config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # check the unaligned reads count
    self.assertEqual(unaligned_reads_store.count_reads(),
                     cfg.sam_file_unaligned_reads_total_unaligned)
def test_nucleotide_search_unaligned_reads_annotations_reference(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test the different annotation formats are recognized for reference
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # turn off query/subject filtering
    config.nucleotide_subject_coverage_threshold = 0
    config.nucleotide_query_coverage_threshold = 0

    try:
        # read in the aligned and unaligned reads
        unaligned_reads_file_fasta, reduced_aligned_reads_file = nucleotide.unaligned_reads(
            cfg.sam_file_annotations, alignments, unaligned_reads_store,
            keep_sam=True)
    finally:
        # reset query/subject filtering even if the search raises, so the
        # zeroed thresholds cannot leak into other tests
        config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # two of the hits should be for gene "UniRef50"
    hits = alignments.hits_for_gene("UniRef50")
    self.assertEqual(len(hits), 2)
def test_nucleotide_search_unaligned_reads_annotations_bug(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test the different annotation formats are recognized for bug
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # turn off query/subject filtering
    config.nucleotide_subject_coverage_threshold = 0
    config.nucleotide_query_coverage_threshold = 0

    try:
        # read in the aligned and unaligned reads
        unaligned_reads_file_fasta, reduced_aligned_reads_file = nucleotide.unaligned_reads(
            cfg.sam_file_annotations, alignments, unaligned_reads_store,
            keep_sam=True)
    finally:
        # reset query/subject filtering even if the search raises, so the
        # zeroed thresholds cannot leak into other tests
        config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # there should be one bug which is unclassified
    self.assertEqual(alignments.bug_list(), ["unclassified"])
def test_translated_search_unaligned_reads_annotations_bug(self):
    """
    Test the unaligned reads and the store alignments
    Test with a rapsearch2 output file
    Test the different annotation formats are recognized for bug
    Test without the coverage filter
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # set the coverage threshold to zero so as to not test with filter on
    current_coverage_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0

    try:
        # load the rapsearch2 output with the unaligned reads function
        unaligned_file_fasta = translated.unaligned_reads(
            unaligned_reads_store, cfg.rapsearch_file_annotations, alignments)

        # remove temp file
        utils.remove_temp_file(unaligned_file_fasta)
    finally:
        # always reset the coverage threshold so a failure here cannot
        # leak the modified setting into other tests
        config.translated_subject_coverage_threshold = current_coverage_threshold

    # there should be one bug name and the other should be unclassified
    expected_bugs = [
        "g__Bacteroides.s__Bacteroides_xylanisolvens",
        "unclassified",
    ]
    self.assertEqual(sorted(alignments.bug_list()), sorted(expected_bugs))
def test_translated_search_unaligned_reads_annotations_reference(self):
    """
    Test the unaligned reads and the store alignments
    Test with a rapsearch2 output file
    Test the different annotation formats are recognized for reference
    Test without the coverage filter
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # set the coverage threshold to zero so as to not test with filter on
    current_coverage_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0

    try:
        # load the rapsearch2 output with the unaligned reads function
        unaligned_file_fasta = translated.unaligned_reads(
            unaligned_reads_store, cfg.rapsearch_file_annotations, alignments)

        # remove temp file
        utils.remove_temp_file(unaligned_file_fasta)
    finally:
        # always reset the coverage threshold so a failure here cannot
        # leak the modified setting into other tests
        config.translated_subject_coverage_threshold = current_coverage_threshold

    # three of the hits should be for gene "UniRef50"
    hits = alignments.hits_for_gene("UniRef50")
    self.assertEqual(len(hits), 3)
def test_nucleotide_search_unaligned_reads_output_fasta_format(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test output file is of fasta format
    Test sam file is not removed
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # turn off query/subject filtering
    config.nucleotide_subject_coverage_threshold = 0
    config.nucleotide_query_coverage_threshold = 0

    try:
        # read in the aligned and unaligned reads
        unaligned_reads_file_fasta, reduced_aligned_reads_file = nucleotide.unaligned_reads(
            cfg.sam_file_unaligned_reads, alignments, unaligned_reads_store,
            keep_sam=True)
    finally:
        # reset query/subject filtering even if the search raises, so the
        # zeroed thresholds cannot leak into other tests
        config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # determine the output format while the file still exists
    file_format = utilities.determine_file_format(unaligned_reads_file_fasta)

    # remove temp files before asserting so a failed assertion does not
    # leave them behind
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # check for fasta output file format
    self.assertEqual("fasta", file_format)
def test_nucleotide_search_unaligned_reads_output_blast_format(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test the aligned reads file created is of the blastm8 format
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # turn off query/subject filtering
    config.nucleotide_subject_coverage_threshold = 0
    config.nucleotide_query_coverage_threshold = 0

    # NOTE(review): file_basename is not restored afterwards, as before —
    # confirm whether other tests depend on its value
    config.file_basename = "TEST"

    try:
        # read in the aligned and unaligned reads
        unaligned_reads_file_fasta, reduced_aligned_reads_file = nucleotide.unaligned_reads(
            cfg.sam_file_annotations, alignments, unaligned_reads_store,
            keep_sam=True)
    finally:
        # reset query/subject filtering even if the search raises, so the
        # zeroed thresholds cannot leak into other tests
        config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # determine the format while the file still exists
    file_format = utilities.determine_file_format(reduced_aligned_reads_file)

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # test file is of the blastm8 format
    self.assertEqual(file_format, "blastm8")
def test_nucleotide_search_unaligned_reads_read_count_aligned_subject_coverage(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test for aligned read counts
    Test with subject coverage filtering
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # turn off query filtering only; the subject coverage threshold is
    # left at its current value
    config.nucleotide_query_coverage_threshold = 0

    try:
        # read in the aligned and unaligned reads
        unaligned_reads_file_fasta, reduced_aligned_reads_file = nucleotide.unaligned_reads(
            cfg.sam_file_unaligned_reads, alignments, unaligned_reads_store,
            keep_sam=True)
    finally:
        # reset query filtering even if the search raises, so the zeroed
        # threshold cannot leak into other tests
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # check the aligned reads count
    self.assertEqual(len(alignments.get_hit_list()),
                     cfg.sam_file_unaligned_reads_total_aligned_subject_coverage)
def test_Alignments_compute_gene_scores_double_gene_double_query_with_temp_alignment_file(
        self):
    """
    Test the compute_gene_scores function
    Test two hits to gene with more than one hit per query
    Test with the temp alignment file
    """

    # gene lengths and the hits as (gene, query, matches)
    gene_lengths = {"gene1": 2, "gene2": 3, "gene3": 4}
    hits = [
        ("gene1", "query1", 41.0),
        ("gene2", "query1", 57.1),
        ("gene2", "query2", 61.0),
        ("gene3", "query3", 72.1),
    ]

    # store the hits, all for the same bug, using the temp alignment file
    alignments_store = store.Alignments(minimize_memory_use=True)
    for gene_name, query_name, match_count in hits:
        alignments_store.add(gene_name, gene_lengths[gene_name], query_name,
                             match_count, "bug1")

    # compute gene scores
    gene_scores_store = store.GeneScores()
    alignments_store.convert_alignments_to_gene_scores(gene_scores_store)

    # query1 hits gene1 and gene2, so its total covers both hit scores
    query1_gene1_score = math.pow(41.0, config.match_power)
    query1_gene2_score = math.pow(57.1, config.match_power)
    query1_total = query1_gene1_score + query1_gene2_score

    # query2 hits gene2 only
    query2_gene2_score = math.pow(61.0, config.match_power)
    query2_total = query2_gene2_score

    # gene2 length converted to per kb
    gene2_kb = gene_lengths["gene2"] / 1000.0

    # gene2 receives a normalized share from each of its two queries
    expected_gene_score = (query2_gene2_score / query2_total / gene2_kb +
                           query1_gene2_score / query1_total / gene2_kb)

    actual_gene_score = gene_scores_store.get_score("bug1", "gene2")

    # delete the temp alignment file
    alignments_store.delete_temp_alignments_file()

    self.assertAlmostEqual(actual_gene_score, expected_gene_score, places=7)
def test_blastx_coverage(self):
    """
    Test the coverage filter
    Test with one protein with one alignment passing threshold
    Test with one protein with two alignments passing threshold (does not pass with only one alignment)
    Test with other proteins with one or more alignments not passing threshold
    """

    # create a set of alignments
    alignments = store.Alignments()

    # set the coverage threshold to a small value so as to have some alignments pass
    current_coverage_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 50.0

    try:
        # get the set of allowed proteins
        allowed_proteins = blastx_coverage.blastx_coverage(
            cfg.rapsearch2_output_file_without_header_coverage,
            config.translated_subject_coverage_threshold, alignments, True)

        # collect the expected proteins from the blastm8-like output; the
        # context manager closes the file even if parsing fails
        found_proteins = set()
        with open(cfg.rapsearch2_output_file_without_header_coverage) as file_handle:
            for line in file_handle:
                # skip comment lines
                if line.startswith("#"):
                    continue

                data = line.strip().split(config.blast_delimiter)
                referenceid = data[config.blast_reference_index]
                # only the gene name is needed from the annotation
                gene, _, _ = alignments.process_reference_annotation(
                    referenceid)

                # the proteins that pass have "_coverage50" as part of their names
                if "_coverage50" in gene:
                    found_proteins.add(gene)
    finally:
        # always reset the coverage threshold so a failure here cannot
        # leak the modified setting into other tests
        config.translated_subject_coverage_threshold = current_coverage_threshold

    # check the allowed proteins are exactly those expected to pass
    self.assertEqual(sorted(allowed_proteins), sorted(found_proteins))
def test_Alignments_compute_gene_scores_single_gene_single_query_with_temp_alignment_file(
        self):
    """
    Test the compute_gene_scores function
    Test one hit for gene with one hit for query
    Test with the temp alignment file
    """

    # gene lengths and the hits as (gene, query, matches)
    gene_lengths = {"gene1": 2, "gene2": 3, "gene3": 4}
    hits = [
        ("gene1", "query1", 41.0),
        ("gene2", "query1", 57.1),
        ("gene2", "query2", 61.0),
        ("gene3", "query3", 72.1),
    ]

    # store the hits, all for the same bug, using the temp alignment file
    alignments_store = store.Alignments(minimize_memory_use=True)
    for gene_name, query_name, match_count in hits:
        alignments_store.add(gene_name, gene_lengths[gene_name], query_name,
                             match_count, "bug1")

    # compute gene scores
    gene_scores_store = store.GeneScores()
    alignments_store.convert_alignments_to_gene_scores(gene_scores_store)

    # gene3 length converted to per kb
    gene3_kb = gene_lengths["gene3"] / 1000.0

    # query3 hits gene3 only, so the query total is the single hit score
    query3_gene3_score = math.pow(72.1, config.match_power)
    query3_total = query3_gene3_score
    expected_gene_score = query3_gene3_score / query3_total / gene3_kb

    actual_gene_score = gene_scores_store.get_score("bug1", "gene3")

    # delete the temp alignment file
    alignments_store.delete_temp_alignments_file()

    self.assertEqual(actual_gene_score, expected_gene_score)
def test_nucleotide_search_unaligned_reads_annotations_gene_length(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test the different annotation formats are recognized for gene length
    Test the gene length uses the read length from the sam file
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # turn off query/subject filtering
    config.nucleotide_subject_coverage_threshold = 0
    config.nucleotide_query_coverage_threshold = 0

    try:
        # read in the aligned and unaligned reads
        unaligned_reads_file_fasta, reduced_aligned_reads_file = nucleotide.unaligned_reads(
            cfg.sam_file_annotations, alignments, unaligned_reads_store,
            keep_sam=True)
    finally:
        # reset query/subject filtering even if the search raises, so the
        # zeroed thresholds cannot leak into other tests
        config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # there should be 4 hits identified
    all_hits = alignments.get_hit_list()
    self.assertEqual(len(all_hits), 4)

    # expected lengths in kb: 2000 for "UniRef50" and 1000 otherwise,
    # each adjusted by the read length
    read_length = 151
    expected_length_uniref50 = (abs(2000 - read_length) + 1) / 1000.0
    expected_length_other = (abs(1000 - read_length) + 1) / 1000.0

    # check for set and default gene lengths
    for hit in all_hits:
        query, bug, reference, score, length = hit
        if reference == "UniRef50":
            self.assertEqual(length, expected_length_uniref50)
        else:
            self.assertEqual(length, expected_length_other)
def test_blastx_coverage_gene_names_id_mapping(self):
    """
    Test the blastx_coverage function
    Test the gene names with chocophlan annotations
    Test without filter
    """

    # create a set of alignments
    alignments = store.Alignments()

    # process the id mapping
    alignments.process_id_mapping(cfg.coverage_id_mapping_file)

    # set the coverage threshold to zero so as to not test with filter on
    current_coverage_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0

    try:
        # get the set of allowed proteins
        allowed_proteins = blastx_coverage.blastx_coverage(
            cfg.rapsearch2_output_file_without_header_coverage,
            config.translated_subject_coverage_threshold,
            alignments,
            log_messages=True)

        # collect every protein named in the blastm8-like output; the
        # context manager makes sure the file is closed
        all_proteins = set()
        with open(cfg.rapsearch2_output_file_without_header_coverage) as file_handle:
            for line in file_handle:
                # skip comment lines
                if line.startswith("#"):
                    continue

                data = line.strip().split(config.blast_delimiter)

                # just like the id mapping, remove the UniRef50_
                protein_name = data[config.blast_reference_index].split(
                    config.chocophlan_delimiter)[0]
                protein_name = protein_name.replace("UniRef50_", "")

                all_proteins.add(protein_name)
    finally:
        # always reset the coverage threshold so a failure here cannot
        # leak the modified setting into other tests
        config.translated_subject_coverage_threshold = current_coverage_threshold

    # check the expected proteins are found
    self.assertEqual(sorted(all_proteins), sorted(allowed_proteins))
def test_translated_search_unaligned_reads_annotations_gene_length(self):
    """
    Test the unaligned reads and the store alignments
    Test with a rapsearch2 output file
    Test the different annotation formats are recognized for gene length
    Test without the coverage filter
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # set the coverage threshold to zero so as to not test with filter on
    current_coverage_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0

    try:
        # load the rapsearch2 output with the unaligned reads function
        unaligned_file_fasta = translated.unaligned_reads(
            unaligned_reads_store, cfg.rapsearch_file_annotations, alignments)

        # remove temp file
        utils.remove_temp_file(unaligned_file_fasta)
    finally:
        # always reset the coverage threshold so a failure here cannot
        # leak the modified setting into other tests
        config.translated_subject_coverage_threshold = current_coverage_threshold

    # there should be 4 hits identified
    all_hits = alignments.get_hit_list()
    self.assertEqual(len(all_hits), 4)

    # expected lengths in kb: 2000 for "UniRef50" and 1000 otherwise,
    # each adjusted by the read length
    read_length = 50
    expected_length_uniref50 = (abs(2000 - read_length) + 1) / 1000.0
    expected_length_other = (abs(1000 - read_length) + 1) / 1000.0

    # check for set and default gene lengths
    for hit in all_hits:
        query, bug, reference, score, length = hit
        if reference == "UniRef50":
            self.assertEqual(length, expected_length_uniref50)
        else:
            self.assertEqual(length, expected_length_other)
def test_Alignments_id_mapping_all_bug_list(self):
    """
    Test the store_id_mapping function
    Test the add_annotated and process_reference_annotation with id mapping
    Test the bugs are mapped correctly
    """

    # in-memory store with the id mapping loaded
    alignments_store = store.Alignments()
    alignments_store.process_id_mapping(cfg.id_mapping_file)

    # one annotated alignment per query/reference pair
    annotated_pairs = (
        ("query1", "ref1"),
        ("query2", "ref2"),
        ("query3", "ref3"),
    )
    for query_name, reference_name in annotated_pairs:
        alignments_store.add_annotated(query_name, 1, reference_name)

    # test the bugs are correct
    found_bugs = sorted(alignments_store.bug_list())
    expected_bugs = sorted(["bug3", "unclassified"])
    self.assertEqual(found_bugs, expected_bugs)
def test_Alignments_compute_gene_scores_single_gene_double_query(self):
    """
    Test the compute_gene_scores function
    Test one hit for gene with more than one hit per query
    """

    # gene lengths and the hits as (gene, query, matches)
    gene_lengths = {"gene1": 2, "gene2": 3, "gene3": 4}
    hits = [
        ("gene1", "query1", 41.0),
        ("gene2", "query1", 57.1),
        ("gene2", "query2", 61.0),
        ("gene3", "query3", 72.1),
    ]

    # store the hits, all for the same bug
    alignments_store = store.Alignments()
    for gene_name, query_name, match_count in hits:
        alignments_store.add(gene_name, gene_lengths[gene_name], query_name,
                             match_count, "bug1")

    # compute gene scores
    gene_scores_store = store.GeneScores()
    alignments_store.convert_alignments_to_gene_scores(gene_scores_store)

    # gene1 length converted to per kb
    gene1_kb = gene_lengths["gene1"] / 1000.0

    # query1 hits gene1 and gene2, so its total covers both hit scores
    query1_gene1_score = math.pow(41.0, config.match_power)
    query1_gene2_score = math.pow(57.1, config.match_power)
    query1_total = query1_gene1_score + query1_gene2_score

    # gene1 is hit by query1 only
    gene_score = query1_gene1_score / query1_total / gene1_kb

    self.assertEqual(gene_scores_store.get_score("bug1", "gene1"),
                     gene_score)
def test_Alignments_id_mapping_all_hits(self):
    """
    Test the store_id_mapping function
    Test the add_annotated and process_reference_annotation with id mapping
    Test the lengths are mapped correctly
    """

    # in-memory store with the id mapping loaded
    alignments_store = store.Alignments()
    alignments_store.process_id_mapping(cfg.id_mapping_file)

    # one annotated alignment per query/reference pair
    annotated_pairs = (
        ("query1", "ref1"),
        ("query2", "ref2"),
        ("query3", "ref3"),
    )
    for query_name, reference_name in annotated_pairs:
        alignments_store.add_annotated(query_name, 1, reference_name)

    # the last item of each hit is the stored length
    stored_lengths = sorted(
        hit[-1] for hit in alignments_store.get_hit_list())
    expected_lengths = sorted(length / 1000.0 for length in (1, 10, 1000))

    # test the lengths are correct
    self.assertEqual(stored_lengths, expected_lengths)
def test_nucleotide_search_unaligned_reads_read_count_aligned_evalue_threshold(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test for aligned read counts
    Test the evalue threshold does not filter alignments
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # turn off query/subject filtering
    config.nucleotide_subject_coverage_threshold = 0
    config.nucleotide_query_coverage_threshold = 0

    # update the evalue threshold to a number less than those for the alignment file
    original_evalue_threshold = config.evalue_threshold
    config.evalue_threshold = 1e-15

    try:
        # read in the aligned and unaligned reads
        unaligned_reads_file_fasta, reduced_aligned_reads_file = nucleotide.unaligned_reads(
            cfg.sam_file_unaligned_reads, alignments, unaligned_reads_store,
            keep_sam=True)
    finally:
        # reset the filtering and the evalue threshold even if the search
        # raises, so the modified settings cannot leak into other tests
        config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold
        config.evalue_threshold = original_evalue_threshold

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # check the aligned reads count (all reads should be aligned even though they do not
    # meet the threshold as the evalue threshold is not applied for this type of alignment)
    self.assertEqual(len(alignments.get_hit_list()),
                     cfg.sam_file_unaligned_reads_total_aligned)
def test_nucleotide_search_unaligned_reads_read_count_aligned_identity_threshold(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test for aligned read counts
    Test the identity threshold does filter alignments
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # turn off query/subject filtering
    config.nucleotide_subject_coverage_threshold = 0
    config.nucleotide_query_coverage_threshold = 0

    # update the identity threshold to a number larger than those in the alignments
    original_identity_threshold = config.identity_threshold
    config.identity_threshold = 101.0

    try:
        # read in the aligned and unaligned reads
        unaligned_reads_file_fasta, reduced_aligned_reads_file = nucleotide.unaligned_reads(
            cfg.sam_file_unaligned_reads, alignments, unaligned_reads_store,
            keep_sam=True)
    finally:
        # reset the filtering and the identity threshold even if the search
        # raises, so the modified settings cannot leak into other tests
        config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold
        config.identity_threshold = original_identity_threshold

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # check the aligned reads count (it should be two as both should pass the threshold)
    # NOTE(review): the docstring says the identity threshold filters
    # alignments, yet two hits are expected with a threshold of 101.0 —
    # confirm which behavior is intended
    self.assertEqual(len(alignments.get_hit_list()), 2)
def test_nucleotide_search_unaligned_reads_scores(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test the scores are based on percent identities
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # turn off query/subject filtering
    config.nucleotide_subject_coverage_threshold = 0
    config.nucleotide_query_coverage_threshold = 0

    try:
        # read in the aligned and unaligned reads
        unaligned_reads_file_fasta, reduced_aligned_reads_file = nucleotide.unaligned_reads(
            cfg.sam_file_annotations, alignments, unaligned_reads_store,
            keep_sam=True)
    finally:
        # reset query/subject filtering even if the search raises, so the
        # zeroed thresholds cannot leak into other tests
        config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # there should be 4 hits identified; asserting it keeps the score
    # check below from passing vacuously on an empty hit list
    all_hits = alignments.get_hit_list()
    self.assertEqual(len(all_hits), 4)

    # every hit is expected to carry the same score
    expected_score = math.pow(151.0, config.match_power)
    for hit in all_hits:
        query, bug, reference, score, length = hit
        self.assertEqual(score, expected_score)
def blastx_coverage_parallel(
        alignment_file_tsv,
        min_coverage,
        alignments=None,
        log_messages=None,
        apply_filter=None,
        nucleotide=False,
        query_coverage_threshold=config.translated_query_coverage_threshold,
        identity_threshold=config.nucleotide_identity_threshold):
    """
    Compute in parallel the set of proteins whose coverage meets min_coverage.

    Parallel computation that merges the original blastx_coverage and
    utilities.get_filtered_translated_alignments: the alignment file is read
    in batches of lines, each line is handed to line_process in a worker
    pool, and the covered positions per protein are accumulated here.

    Args:
        alignment_file_tsv: path of the alignment file; opened in text mode.
        min_coverage: minimum percent coverage (50 means 50% of the protein
            is covered) for a protein to be returned.
        alignments: accepted for interface compatibility only. It is NOT
            used: a fresh store.Alignments() is handed to the workers.
        log_messages: if truthy the summary goes to logger.info, otherwise
            it is printed to stdout.
        apply_filter: forwarded unchanged to line_process.
        nucleotide: forwarded unchanged to line_process.
        query_coverage_threshold: forwarded unchanged to line_process.
        identity_threshold: forwarded to line_process; when None,
            config.identity_threshold is used instead.

    Returns:
        set of protein names whose coverage is >= min_coverage.
    """
    from itertools import repeat
    from multiprocessing import Pool

    # if identity threshold is not set, use the config default
    if identity_threshold is None:
        identity_threshold = config.identity_threshold

    # store protein lengths
    protein_lengths = {}
    # store unique positions hit in each protein as sets
    protein_hits = defaultdict(set)
    # track proteins with sufficient coverage
    allowed = set()
    # track alignments unable to compute coverage
    # NOTE: never incremented at present (see TODO below), so the summary
    # always reports 0 for this counter
    no_coverage = 0

    with Pool(config.threads) as pool, \
            open(alignment_file_tsv, "rt") as file_handle:
        # NOTE(review): the caller-supplied ``alignments`` is ignored and an
        # empty store is sent to every worker instead. Presumably this keeps
        # the per-task payload small/picklable, but any state held by the
        # caller's store is not visible to line_process -- confirm intent.
        worker_alignments = store.Alignments()

        # size hint given to readlines so each batch scales with the pool size
        chunk_size = 1000000 * config.threads

        while True:
            lines = file_handle.readlines(chunk_size)
            if not lines:
                break

            results = pool.starmap(
                line_process,
                zip(lines,
                    repeat(worker_alignments),
                    repeat(identity_threshold),
                    repeat(query_coverage_threshold),
                    repeat(apply_filter),
                    repeat(nucleotide)))

            # falsy results are skipped; the rest are indexed by the three
            # keys used below
            for result in filter(None, results):
                protein_hits[result['protein_name']].update(
                    result['protein_range'])
                protein_lengths[result['protein_name']] = result['gene_length']

    # TODO: complete the logging. The per-line filter statistics (alignments
    # without coverage, percent identity / alignment length / e-value
    # conversion errors, and alignments rejected on e-value, percent identity
    # or query coverage) are not yet aggregated from the worker results, so
    # they are neither counted nor logged here.

    # track proteins without lengths
    no_length = 0
    for protein_name, hit_positions in protein_hits.items():
        try:
            # compute coverage, with 50 indicating that 50% of the protein is covered
            coverage = len(hit_positions) / float(
                protein_lengths[protein_name]) * 100
        except ZeroDivisionError:
            coverage = 0
            no_length += 1

        if coverage >= min_coverage:
            allowed.add(protein_name)

    output_messages = [
        "Total alignments without coverage information: " + str(no_coverage),
        "Total proteins in blastx output: " + str(len(protein_lengths)),
        "Total proteins without lengths: " + str(no_length),
        "Proteins with coverage greater than threshold (" +
        str(min_coverage) + "): " + str(len(allowed)),
    ]

    # write out informational messages to log or stdout, depending on input parameters
    if log_messages:
        for message in output_messages:
            logger.info(message)
    else:
        print("\n".join(output_messages))

    return allowed
def test_translated_search_unaligned_reads_identity_threshold(self):
    """
    Test the unaligned reads function
    Test with a rapsearch output file
    Test the identity threshold filtering
    Test without the coverage filter
    """

    # create a set of alignments
    alignments = store.Alignments()

    # remember the shared config values this test changes
    current_coverage_threshold = config.translated_subject_coverage_threshold
    original_identity_threshold = config.identity_threshold

    try:
        # set the coverage threshold to zero so as to not test with filter on
        config.translated_subject_coverage_threshold = 0

        # set a new threshold that will select 3 of the 5 alignments
        # (counts per the original author's note about the fixture file)
        config.identity_threshold = 60.0

        # load the rapsearch output; the context manager closes the file even
        # if a line fails to parse
        with open(cfg.rapsearch2_output_file_with_header) as file_handle:
            for line in file_handle:
                # skip header/comment lines
                if re.search("^#", line):
                    continue

                data = line.strip().split(config.blast_delimiter)

                referenceid = data[config.blast_reference_index]
                queryid = data[config.blast_query_index]
                identity = float(data[config.blast_identity_index])
                alignment_length = float(
                    data[config.blast_aligned_length_index])

                # only store those alignments with identities that meet threshold
                if identity > config.identity_threshold:
                    alignments.add(referenceid, 0, queryid,
                                   identity / 100.0 * alignment_length,
                                   "unclassified", alignment_length)

        alignments_test = store.Alignments()
        unaligned_reads_store = store.Reads()

        # load the rapsearch output with the unaligned reads function
        unaligned_file_fasta = translated.unaligned_reads(
            unaligned_reads_store, cfg.rapsearch2_output_file_with_header,
            alignments_test)
    finally:
        # config is shared module state: restore it even when something above
        # raises, so later tests do not inherit the modified thresholds
        config.translated_subject_coverage_threshold = current_coverage_threshold
        config.identity_threshold = original_identity_threshold

    # remove temp file
    utils.remove_temp_file(unaligned_file_fasta)

    # check the total number of alignments is the same
    self.assertEqual(len(alignments.get_hit_list()),
                     len(alignments_test.get_hit_list()))
def test_translated_search_unaligned_reads_blastm8_coverage_filter(self):
    """
    Test the unaligned reads and the store alignments
    Test with a blastm8-like output file
    Test with empty reads structure
    Test that function does not require gene lengths in reference id
    Test with the coverage filter
    Test with query length annotations
    Test that an alignment with query start larger than query end is not filtered
    """

    # create a set of alignments
    alignments = store.Alignments()

    # remember the shared config value this test changes
    current_coverage_threshold = config.translated_subject_coverage_threshold

    try:
        # set the coverage threshold to a small value so as to have some alignments pass
        config.translated_subject_coverage_threshold = 0.50

        # get the set of allowed proteins (the final True turns on log_messages)
        allowed_proteins = blastx_coverage.blastx_coverage(
            cfg.rapsearch2_output_file_without_header_coverage,
            config.translated_subject_coverage_threshold, alignments, True)

        # load the blastm8-like output; the context manager closes the file
        # even if a line fails to parse
        with open(cfg.rapsearch2_output_file_without_header_coverage) as file_handle:
            for line in file_handle:
                # skip header/comment lines
                if re.search("^#", line):
                    continue

                data = line.strip().split(config.blast_delimiter)

                referenceid = data[config.blast_reference_index]
                gene, length, bug = alignments.process_reference_annotation(
                    referenceid)
                # the query length annotation is split off but not needed here
                queryid, _query_length = utilities.get_length_annotation(
                    data[config.blast_query_index])
                identity = float(data[config.blast_identity_index])
                alignment_length = float(
                    data[config.blast_aligned_length_index])

                # build the expected store from the allowed proteins only
                if gene in allowed_proteins:
                    alignments.add(gene, length, queryid,
                                   identity / 100.0 * alignment_length, bug,
                                   alignment_length)

        alignments_test = store.Alignments()
        unaligned_reads_store = store.Reads()

        # load the blastm8-like output with the unaligned reads function
        unaligned_file_fasta = translated.unaligned_reads(
            unaligned_reads_store,
            cfg.rapsearch2_output_file_without_header_coverage,
            alignments_test)
    finally:
        # config is shared module state: restore it even when something above
        # raises, so later tests do not inherit the modified threshold
        config.translated_subject_coverage_threshold = current_coverage_threshold

    # remove temp file
    utils.remove_temp_file(unaligned_file_fasta)

    # check the values are unchanged
    self.assertEqual(sorted(alignments.get_hit_list()),
                     sorted(alignments_test.get_hit_list()))
def blastx_coverage(
        blast6out,
        min_coverage,
        alignments=None,
        log_messages=None,
        apply_filter=None,
        nucleotide=False,
        query_coverage_threshold=config.translated_query_coverage_threshold,
        identity_threshold=config.nucleotide_identity_threshold):
    """
    Return the set of proteins whose alignment coverage is at least min_coverage.

    Alignments are taken from utilities.get_filtered_translated_alignments.
    For each protein the distinct subject positions hit by its alignments are
    counted and divided by the protein length to give a percent coverage.

    Args:
        blast6out: alignment file handed to get_filtered_translated_alignments.
        min_coverage: minimum percent coverage (50 means 50% of the protein
            is covered) for a protein to be returned.
        alignments: store.Alignments instance; a new one is created when None.
        log_messages: passed on as log_filter; if truthy the summary goes to
            logger.info, otherwise it is printed to stdout.
        apply_filter: passed on to get_filtered_translated_alignments.
        nucleotide: when false the gene length is divided by 3 to obtain the
            protein length from the nucleotide length.
        query_coverage_threshold: passed on to get_filtered_translated_alignments.
        identity_threshold: passed on to get_filtered_translated_alignments.

    Returns:
        set of protein names with coverage >= min_coverage.
    """
    # fall back to a fresh alignment store when the caller gave none
    if alignments is None:
        alignments = store.Alignments()

    # protein name -> length used as the coverage denominator
    lengths_by_protein = {}
    # protein name -> hit ranges kept as "start-stop;" text
    ranges_by_protein = defaultdict(str)
    # alignments whose subject range is empty
    no_coverage = 0

    filtered_alignments = utilities.get_filtered_translated_alignments(
        blast6out,
        alignments,
        apply_filter=apply_filter,
        log_filter=log_messages,
        query_coverage_threshold=query_coverage_threshold,
        identity_threshold=identity_threshold)

    for (protein_name, gene_length, _queryid, _matches, _bug,
         _alignment_length, subject_start_index,
         subject_stop_index) in filtered_alignments:

        # the length is recorded for every alignment, covered or not
        lengths_by_protein[protein_name] = (
            gene_length if nucleotide else gene_length / 3)

        # an empty range (start >= stop) contributes no positions
        if not range(subject_start_index, subject_stop_index):
            no_coverage += 1
            continue

        ranges_by_protein[protein_name] += "{0}-{1};".format(
            subject_start_index, subject_stop_index)

    # proteins that reach the requested coverage
    allowed = set()
    # proteins whose stored length is zero
    no_length = 0

    for protein_name, encoded_ranges in ranges_by_protein.items():
        # expand the stored text back into the set of distinct positions;
        # the last split element is the empty text after the final ";"
        covered_positions = set()
        for encoded in encoded_ranges.split(";")[:-1]:
            start_text, stop_text = encoded.split("-")
            covered_positions.update(range(int(start_text), int(stop_text)))

        try:
            # percent of the protein covered, 50 meaning 50%
            coverage = len(covered_positions) / float(
                lengths_by_protein[protein_name]) * 100
        except ZeroDivisionError:
            coverage = 0
            no_length += 1

        if coverage >= min_coverage:
            allowed.add(protein_name)

    output_messages = [
        "Total alignments without coverage information: " + str(no_coverage),
        "Total proteins in blastx output: " + str(len(lengths_by_protein)),
        "Total proteins without lengths: " + str(no_length),
        "Proteins with coverage greater than threshold (" +
        str(min_coverage) + "): " + str(len(allowed)),
    ]

    # the summary goes to stdout unless logging was requested
    if not log_messages:
        print("\n".join(output_messages))
    else:
        for message in output_messages:
            logger.info(message)

    return allowed