def test_gene_families_gene_list(self):
    """
    Test the blast config indexes when loading a usearch output file
    Test the gene list stored in the alignments matches the expected list
    """

    # create a set of alignments
    alignments = store.Alignments()

    # load the usearch output; the context manager guarantees the file is
    # closed even if a line fails to parse
    with open(cfg.usearch_file) as file_handle:
        for line in file_handle:
            # skip comment lines
            if line.startswith("#"):
                continue

            data = line.strip().split(config.blast_delimiter)
            # the reference column is "|" delimited: the first field is
            # passed to the store as the bug, the second as the gene
            referenceids = data[config.blast_reference_index].split("|")
            queryid = data[config.blast_query_index]
            evalue = float(data[config.blast_evalue_index])

            alignments.add(referenceids[1], 1, queryid, evalue,
                           referenceids[0])

    # check the genes were loaded correctly (order is not significant)
    self.assertEqual(sorted(cfg.usearch_file_gene_list),
                     sorted(alignments.gene_list()))
def test_translated_search_unaligned_reads_annotations_gene_length(self):
    """
    Test the unaligned reads and the store alignments
    Test with a rapsearch2 output file
    Test the different annotation formats are recognized for gene length
    """

    # stores filled in by the unaligned reads function
    alignments = store.Alignments()
    reads = store.Reads()

    # parse the rapsearch2 output with the unaligned reads function
    fasta_path = translated_search.unaligned_reads(
        reads, cfg.rapsearch_file_annotations, alignments)

    # the fasta file written by the function is not needed
    utils.remove_temp_file(fasta_path)

    # there should be 4 hits identified
    hits = alignments.get_hit_list()
    self.assertEqual(len(hits), 4)

    # hits to "UniRef50" should have length 2000, all others 1000
    for _query, _bug, reference, _evalue, length in hits:
        expected_length = 2000 if reference == "UniRef50" else 1000
        self.assertEqual(length, expected_length)
def test_Alignments_process_chocophlan_length(self):
    """
    Test process_chocophlan_length with a single "start-end" location
    """

    # the location "1-100" should produce a length of 100
    computed = store.Alignments().process_chocophlan_length("1-100", "gene")

    self.assertEqual(computed, 100)
def test_Alignments_process_chocophlan_length_multiple(self):
    """
    Test process_chocophlan_length with two comma-separated locations
    Test with one of the locations on the reverse strand ("c:" prefix)
    """

    # two locations of 100 each should produce a total length of 200
    computed = store.Alignments().process_chocophlan_length(
        "c:100-1,1-100", "gene")

    self.assertEqual(computed, 200)
def test_translated_search_unaligned_reads_rapsearch_log(self):
    """
    Test the unaligned reads function
    Test with a rapsearch output file
    Test that log of evalue is taken
    """

    # create a set of alignments holding the values exactly as they
    # appear in the file
    alignments = store.Alignments()

    # load the rapsearch output; the context manager guarantees the file
    # is closed even if a line fails to parse
    with open(cfg.rapsearch2_output_file_with_header) as file_handle:
        for line in file_handle:
            # skip comment/header lines
            if line.startswith("#"):
                continue

            data = line.strip().split(config.blast_delimiter)
            referenceid = data[config.blast_reference_index]
            queryid = data[config.blast_query_index]
            evalue = float(data[config.blast_evalue_index])

            alignments.add(referenceid, 0, queryid, evalue, "unclassified")

    alignments_test = store.Alignments()
    unaligned_reads_store = store.Reads()

    # load the rapsearch output with the unaligned reads function
    unaligned_file_fasta = translated_search.unaligned_reads(
        unaligned_reads_store, cfg.rapsearch2_output_file_with_header,
        alignments_test)

    # remove temp file
    utils.remove_temp_file(unaligned_file_fasta)

    # check the evalues are changed: compare the first sorted hit of each
    # store using the second-to-last hit field
    # NOTE(review): this presumably relies on the store keeping
    # exp(-evalue) in that field, so -log() recovers the value that was
    # added; 10 ** (file value) should then equal the value stored by
    # unaligned_reads -- confirm against store.Alignments.add
    hit1_evalue = sorted(alignments.get_hit_list())[0][-2]
    hit1_evalue_test = sorted(alignments_test.get_hit_list())[0][-2]
    self.assertAlmostEqual(
        math.pow(10.0, math.log(hit1_evalue) * -1),
        math.log(hit1_evalue_test) * -1, places=7)
def test_translated_search_unaligned_reads_blastm8(self):
    """
    Test the unaligned reads and the store alignments
    Test with a blastm8-like output file
    Test with empty reads structure
    Test that log of evalue is not taken
    Test that function does not require gene lengths in reference id
    """

    # create a set of alignments holding the values exactly as they
    # appear in the file
    alignments = store.Alignments()

    # load the blastm8-like output; the context manager guarantees the
    # file is closed even if a line fails to parse
    with open(cfg.rapsearch2_output_file_without_header) as file_handle:
        for line in file_handle:
            # skip comment lines
            if line.startswith("#"):
                continue

            data = line.strip().split(config.blast_delimiter)
            referenceid = data[config.blast_reference_index]
            queryid = data[config.blast_query_index]
            evalue = float(data[config.blast_evalue_index])

            alignments.add(referenceid, 0, queryid, evalue, "unclassified")

    alignments_test = store.Alignments()
    unaligned_reads_store = store.Reads()

    # load the blastm8-like output with the unaligned reads function
    unaligned_file_fasta = translated_search.unaligned_reads(
        unaligned_reads_store, cfg.rapsearch2_output_file_without_header,
        alignments_test)

    # remove temp file
    utils.remove_temp_file(unaligned_file_fasta)

    # check the evalues are unchanged: both stores must hold identical hits
    self.assertEqual(sorted(alignments.get_hit_list()),
                     sorted(alignments_test.get_hit_list()))
def test_gene_families_tsv_output(self):
    """
    Test the gene families function and the blast config indexes
    Test UniRef50_unknown is read in and used for gene scores but not printed
    Test the tsv output
    """

    # create a set of alignments
    alignments = store.Alignments()

    # load the usearch output; the context manager guarantees the file is
    # closed even if a line fails to parse
    with open(cfg.usearch_file) as file_handle:
        for line in file_handle:
            # skip comment lines
            if line.startswith("#"):
                continue

            data = line.strip().split(config.blast_delimiter)
            # the reference column is "|" delimited: the first field is
            # passed to the store as the bug, the second as the gene
            referenceids = data[config.blast_reference_index].split("|")
            queryid = data[config.blast_query_index]
            evalue = float(data[config.blast_evalue_index])

            alignments.add(referenceids[1], 1, queryid, evalue,
                           referenceids[0])

    # set the output format
    # NOTE: this and genefamilies_file below modify the shared config
    # module and are not restored after the test
    config.output_format = "tsv"

    # set the location of the file to write to as a temp file
    file_out, gene_families_file = tempfile.mkstemp()
    os.close(file_out)
    config.genefamilies_file = gene_families_file

    # create gene_scores instance
    gene_scores = store.GeneScores()

    try:
        # obtain the gene families
        gene_families_file = quantify_families.gene_families(
            alignments, gene_scores)

        # check the gene families output is as expected
        self.assertTrue(
            filecmp.cmp(gene_families_file, cfg.gene_familes_file,
                        shallow=False))
    finally:
        # delete the temp file even when the run or the comparison fails;
        # if gene_families raised, this is still the mkstemp path
        utils.remove_temp_file(gene_families_file)
def test_Alignments_add_gene_list(self):
    """
    Alignments class: Test add function
    Test the gene list
    """

    # (gene, query, bug) for each hit; gene1 is added twice
    hits = (
        ("gene2", "Q3", "bug1"),
        ("gene1", "Q1", "bug2"),
        ("gene3", "Q2", "bug3"),
        ("gene1", "Q1", "bug1"),
    )

    alignments_store = store.Alignments()
    for gene, query, bug in hits:
        alignments_store.add(gene, 1, query, 0.01, bug)

    # each gene should be listed once
    stored_genes = sorted(alignments_store.gene_list())
    self.assertEqual(stored_genes, ["gene1", "gene2", "gene3"])
def test_Alignments_add_gene_count(self):
    """
    Alignments class: Test add function
    Test the total genes
    """

    # (gene, query, bug) for each hit; gene1 is added twice
    hits = (
        ("gene2", "Q3", "bug1"),
        ("gene1", "Q1", "bug2"),
        ("gene3", "Q2", "bug3"),
        ("gene1", "Q1", "bug1"),
    )

    alignments_store = store.Alignments()
    for gene, query, bug in hits:
        alignments_store.add(gene, 1, query, 0.01, bug)

    # four hits were added but only three distinct genes
    total_genes = alignments_store.count_genes()
    self.assertEqual(total_genes, 3)
def test_Alignments_compute_gene_scores_double_gene_double_query(self):
    """
    Test the compute_gene_scores function
    Test two hits to gene with more than one hit per query
    """

    # evalues, named by the query and gene of the hit they belong to
    evalue_q1_g1 = 1e-4
    evalue_q1_g2 = 3e-7
    evalue_q2_g2 = 2e-10
    evalue_q3_g3 = 2e-10

    # gene lengths
    length_g1, length_g2, length_g3 = 2, 3, 4

    # store the four hits, all for the same bug
    alignments_store = store.Alignments()
    hits = (
        ("gene1", length_g1, "query1", evalue_q1_g1),
        ("gene2", length_g2, "query1", evalue_q1_g2),
        ("gene2", length_g2, "query2", evalue_q2_g2),
        ("gene3", length_g3, "query3", evalue_q3_g3),
    )
    for gene, length, query, evalue in hits:
        alignments_store.add(gene, length, query, evalue, "bug1")

    # compute gene scores
    gene_scores_store = store.GeneScores()
    alignments_store.convert_alignments_to_gene_scores(gene_scores_store)

    # expected per-hit scores and per-query totals
    score_q1_g1 = math.exp(-evalue_q1_g1)
    score_q1_g2 = math.exp(-evalue_q1_g2)
    score_q2_g2 = math.exp(-evalue_q2_g2)
    total_q1 = score_q1_g1 + score_q1_g2
    total_q2 = score_q2_g2

    # convert the gene2 length to per kb
    length_g2_kb = length_g2 / 1000.0

    # gene2 is hit by query2 and query1: each hit is weighted by its
    # share of the query total and divided by the gene length
    expected_score = (score_q2_g2 / total_q2 / length_g2_kb
                      + score_q1_g2 / total_q1 / length_g2_kb)

    self.assertAlmostEqual(gene_scores_store.get_score("bug1", "gene2"),
                           expected_score, places=12)
def test_Alignments_id_mapping_all_hits(self):
    """
    Test the store_id_mapping function
    Test the add_annotated and process_reference_annotation with id mapping
    Test the lengths are mapped correctly
    """

    alignments_store = store.Alignments()

    # the id mapping has to be loaded before the hits are added
    alignments_store.process_id_mapping(cfg.id_mapping_file)

    # store one hit per reference
    for query, reference in (("query1", "ref1"),
                             ("query2", "ref2"),
                             ("query3", "ref3")):
        alignments_store.add_annotated(query, 1, reference)

    # the length is the last field of each stored hit
    stored_lengths = sorted(
        hit[-1] for hit in alignments_store.get_hit_list())
    self.assertEqual(stored_lengths, [1, 10, 1000])
def test_Alignments_id_mapping_all_bug_list(self):
    """
    Test the store_id_mapping function
    Test the add_annotated and process_reference_annotation with id mapping
    Test the bugs are mapped correctly
    """

    alignments_store = store.Alignments()

    # the id mapping has to be loaded before the hits are added
    alignments_store.process_id_mapping(cfg.id_mapping_file)

    # store one hit per reference
    for query, reference in (("query1", "ref1"),
                             ("query2", "ref2"),
                             ("query3", "ref3")):
        alignments_store.add_annotated(query, 1, reference)

    # the stored bugs should be "bug3" and "unclassified"
    stored_bugs = sorted(alignments_store.bug_list())
    self.assertEqual(stored_bugs, ["bug3", "unclassified"])
def test_translated_search_unaligned_reads_annotations_bug(self):
    """
    Test the unaligned reads and the store alignments
    Test with a rapsearch2 output file
    Test the different annotation formats are recognized for bug
    """

    # stores filled in by the unaligned reads function
    alignments = store.Alignments()
    reads = store.Reads()

    # parse the rapsearch2 output with the unaligned reads function
    fasta_path = translated_search.unaligned_reads(
        reads, cfg.rapsearch_file_annotations, alignments)

    # the fasta file written by the function is not needed
    utils.remove_temp_file(fasta_path)

    # there should be one bug name and the other should be unclassified
    stored_bugs = sorted(alignments.bug_list())
    self.assertEqual(stored_bugs,
                     ["s__Bacteroides_xylanisolvens", "unclassified"])
def test_translated_search_unaligned_reads_annotations_reference(self):
    """
    Test the unaligned reads and the store alignments
    Test with a rapsearch2 output file
    Test the different annotation formats are recognized for reference
    """

    # stores filled in by the unaligned reads function
    alignments = store.Alignments()
    reads = store.Reads()

    # parse the rapsearch2 output with the unaligned reads function
    fasta_path = translated_search.unaligned_reads(
        reads, cfg.rapsearch_file_annotations, alignments)

    # the fasta file written by the function is not needed
    utils.remove_temp_file(fasta_path)

    # three of the hits should be for gene "UniRef50"
    uniref50_hits = alignments.hits_for_gene("UniRef50")
    self.assertEqual(len(uniref50_hits), 3)
def test_Alignments_compute_gene_scores_single_gene_single_query(self):
    """
    Test the compute_gene_scores function
    Test one hit for gene with one hit for query
    """

    # create a set of hits
    eval1 = 1e-4
    eval2 = 3e-7
    eval3 = 2e-10
    eval4 = 2e-10

    gene1_length = 2
    gene2_length = 3
    gene3_length = 4

    # Create a set of alignments
    alignments_store = store.Alignments()
    alignments_store.add("gene1", gene1_length, "query1", eval1, "bug1")
    alignments_store.add("gene2", gene2_length, "query1", eval2, "bug1")
    alignments_store.add("gene2", gene2_length, "query2", eval3, "bug1")
    alignments_store.add("gene3", gene3_length, "query3", eval4, "bug1")

    gene_scores_store = store.GeneScores()

    # compute gene scores
    alignments_store.convert_alignments_to_gene_scores(gene_scores_store)

    # convert lengths to per kb
    gene3_length = gene3_length / 1000.0

    # gene3 has a single hit from query3, which has no other hits, so the
    # hit carries the whole query total
    hit4_score = math.exp(-eval4)
    query3_sum = hit4_score
    gene_score = hit4_score / query3_sum / gene3_length

    # compare with a tolerance rather than exact float equality, matching
    # the double_gene_double_query test; equal values still pass
    self.assertAlmostEqual(gene_scores_store.get_score("bug1", "gene3"),
                           gene_score, places=12)