def test_duplicate_position(self):
    """Smoke-test analyze_position with a duplicates call of '1'.

    Nothing is asserted about the returned value; the test only fails if
    analyze_position raises.
    """
    def unfiltered(call):
        # Coverage and proportion are the '-' placeholder for every position.
        return Position(call=call, coverage='-', proportion='-')

    reference = unfiltered('G')
    duplicates = unfiltered('1')
    # One group containing a single sample position that matches the reference.
    sample_groups = ((unfiltered('G'),),)

    self.analysis.analyze_position(reference, duplicates, sample_groups)
def test_varscan_call_cannot_be_made(self):
    """
    A VarScan record may carry an ALT value even though no call could be made.
    Such a position should still be reported as missing (X).

    TODO: Add See Also VarScan documentation
    """
    # Records taken from a SRR011186 sample processed with bwamem and varscan.
    # In the source data they were positions 34072-34074.
    # NOTE(review): VCF columns are normally tab-delimited; confirm the
    # whitespace inside these literals matches what the parser expects.
    header = "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SRR011186\n"
    called_record = "gi|561108321|ref|NC_018143.2| 1 . GC C . PASS ADP=114;WT=0;HET=0;HOM=1;NC=0 GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR 1/1:255:114:114:6:108:94.74%:1.6043E-58:40:38:2:4:46:62\n"
    # This position should be called missing because the GT column is './.'
    uncalled_record = "gi|561108321|ref|NC_018143.2| 2 . C G . PASS ADP=108;WT=1;HET=0;HOM=0;NC=0 GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR ./.:.:108\n"
    reference_record = "gi|561108321|ref|NC_018143.2| 3 . A . . PASS ADP=112;WT=1;HET=0;HOM=0;NC=0 GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR 0/0:209:112:112:111:0:0%:1E0:38:0:47:64:0:0\n"
    vcf_data = header + called_record + uncalled_record + reference_record

    expected = (
        Position(call='G', simple_call='G', coverage=114,
                 proportion=0.05263157894736842),
        Position(call='X', simple_call='N', coverage=108.0, proportion='-'),
        Position(call='A', simple_call='A', coverage=112,
                 proportion=0.9910714285714286),
    )

    with tempfile.NamedTemporaryFile('w+') as tmpfile:
        # Seed the file with the test data, then rewind it.
        tmpfile.write(vcf_data)
        tmpfile.seek(0)

        # Look up the test contig.
        # NOTE(review): 'varscan' is passed before 'bwamem' here; confirm the
        # order against the Vcf signature.
        vcf = Vcf(tmpfile.name, 'SRR011186', 'varscan', 'bwamem')
        contig = vcf.get_contig('gi|561108321|ref|NC_018143.2|')
        positions = contig.positions
        self.assertIsInstance(contig, VcfContig)

        # Compare each yielded position with its expectation, counting as we go.
        compared = 0
        for compared, (want, got) in enumerate(zip(expected, positions), start=1):
            self.assertEqual(want, got)

        # Every expected position was yielded.
        self.assertEqual(compared, len(expected))

        # Anything after the last record should be the empty position.
        self.assertEqual(VcfContig.VCF_EMPTY_POSITION, next(positions))
def test_sample_vcfs_return_infinite_positions(self):
    """The positions iterator yields the contig's calls, then empty positions.

    Reads the '500WT1_test' contig from testdata.GATK_VCF, checks the first
    eight positions, and then checks that two further reads both give
    VcfContig.VCF_EMPTY_POSITION.
    """
    def called(base, depth):
        # A called base whose simple call is the same letter.
        return Position(call=base, simple_call=base, coverage=depth,
                        proportion='-')

    def uncalled():
        # A position expected to come back as X / N with '?' placeholders.
        return Position(call='X', simple_call='N', coverage='?',
                        proportion='?')

    # NOTE(review): the third and fourth arguments are labelled aligner then
    # snpcaller here; confirm the order against the Vcf signature.
    vcf = Vcf(testdata.GATK_VCF, 'test_name', 'test_aliner', 'test_snpcaller')
    contig = vcf.get_contig('500WT1_test')
    positions = contig.positions

    expected = (
        called('C', 19049),
        called('C', 19049),
        called('T', 18824),
        called('G', 18804),
        uncalled(),
        uncalled(),
        called('G', 18895),
        called('A', 19005),
    )

    # It should yield all the contig positions.
    compared = 0
    for compared, (want, got) in enumerate(zip(expected, positions), start=1):
        self.assertEqual(want, got)
    self.assertEqual(len(expected), compared)

    # It should keep yielding empty positions once the contig is exhausted.
    for _ in range(2):
        self.assertEqual(VcfContig.VCF_EMPTY_POSITION, next(positions))
def test_call_normalizes_to_uppercase(self):
    """Position.call should be the upper-cased form of the call passed in."""
    # (call given to Position, call expected back)
    cases = (
        ('g', 'G'),
        ('a', 'A'),
        ('t', 'T'),
        ('c', 'C'),
        ('d', 'D'),
        ('x', 'X'),
        ('.', '.'),
    )
    for raw_call, upper_call in cases:
        pos = Position(call=raw_call, coverage='-', proportion='-')
        self.assertEqual(upper_call, pos.call)
def test_simple_call_normalizes_to_uppercase_and_masks_degeneracies_with_N(
        self):
    """Position.simple_call upper-cases g/a/t/c and reports anything else as N."""
    # (call given to Position, simple_call expected back)
    cases = (
        ('g', 'G'),
        ('a', 'A'),
        ('t', 'T'),
        ('c', 'C'),
        ('d', 'N'),
        ('x', 'N'),
        ('.', 'N'),
    )
    for raw_call, simple in cases:
        pos = Position(call=raw_call, coverage='-', proportion='-')
        self.assertEqual(simple, pos.simple_call)
def test_unique_position(self):
    """Smoke-test analyze_position with a duplicates call of '0' (unique).

    Nothing is asserted about the returned value; the test only fails if
    analyze_position raises.
    """
    def unfiltered(call):
        # Coverage and proportion are the '-' placeholder for every position.
        return Position(call=call, coverage='-', proportion='-')

    reference = unfiltered('G')
    # Unique position
    duplicates = unfiltered('0')

    # Single Nucleotide Monomorphism: same call as the reference.
    monomorphism = unfiltered('G')
    # Single Nucleotide Polymorphism: differs from the reference.
    polymorphism = unfiltered('A')
    sample_groups = ((monomorphism, polymorphism),)

    self.analysis.analyze_position(reference, duplicates, sample_groups)
def test_no_duplicate_information_position(self):
    """
    Scenario: The reference was not scanned for duplicate positions, so the
    duplicates position carries the call 'X' (no duplicate information).

    For a reference 'G' with one sample calling 'G' and one calling 'A', the
    expectation below asserts, among other fields:
    - is_reference_duplicated is False
    - is_missing_matrix is True
    - called_reference is 1 and called_snp is 1
    - is_all_passed_consensus is False
    - is_all_quality_breadth is False

    NOTE(review): an earlier description of this scenario stated that
    is_all_quality_breadth should be True, which disagrees with the value
    asserted here; confirm which is intended.
    """
    reference_position = Position(call='G', coverage='-', proportion='-')
    dups_position = Position(
        # No duplicate information
        call='X',
        coverage='-',
        proportion='-')
    samples = (
        (
            # Single Nucleotide Monomorphism
            Position(call='G', coverage='-', proportion='-'),
            # Single Nucleotide Polymorphism
            Position(call='A', coverage='-', proportion='-'),
        ),
    )

    # A first `expected` value used to be built here and was immediately
    # overwritten before being used; only the value that is actually
    # compared is kept.
    expected = PositionInfo(
        is_all_called=True,
        is_reference_clean=True,
        is_reference_duplicated=False,
        is_all_passed_coverage=True,
        is_all_passed_proportion=True,
        is_all_passed_consensus=False,
        is_all_quality_breadth=False,
        is_best_snp=False,
        all_sample_stats=[[
            Counter({
                'quality_breadth': 1,
                'called_reference': 1,
                'called_snp': 1,
                'passed_coverage_filter': 1,
                'passed_proportion_filter': 1,
                'was_called': 1,
                'called_degen': 0
            }),
            Counter({
                'quality_breadth': 1,
                'passed_coverage_filter': 1,
                'passed_proportion_filter': 1,
                'was_called': 1,
                'called_reference': 0,
                'called_snp': 0,
                'called_degen': 0
            })
        ], [
            Counter({
                'quality_breadth': 1,
                'called_reference': 1,
                'called_snp': 1,
                'passed_coverage_filter': 1,
                'passed_proportion_filter': 1,
                'was_called': 1,
                'called_degen': 0
            }),
            Counter({
                'quality_breadth': 1,
                'passed_coverage_filter': 1,
                'passed_proportion_filter': 1,
                'was_called': 1,
                'called_reference': 0,
                'called_snp': 0,
                'called_degen': 0
            }),
            Counter({
                'quality_breadth': 1,
                'called_reference': 1,
                'passed_coverage_filter': 1,
                'passed_proportion_filter': 1,
                'was_called': 1,
                'called_snp': 0,
                'called_degen': 0
            }),
            Counter({
                'quality_breadth': 1,
                'called_snp': 1,
                'passed_coverage_filter': 1,
                'passed_proportion_filter': 1,
                'was_called': 1,
                'called_reference': 0,
                'called_degen': 0
            })
        ]],
        is_missing_matrix=True,
        called_reference=1,
        called_snp=1,
        passed_coverage_filter=2,
        passed_proportion_filter=2,
        num_A=1,
        num_C=0,
        num_G=1,
        num_T=0,
        num_N=0,
        call_str=['G', 'G', 'A'],
        masked_call_str=['G', 'G', 'A'],
        CallWasMade='YY',
        PassedDepthFilter='--',
        PassedProportionFilter='--',
        Pattern=['1', '1', '2'])

    self.assertEqual(
        expected,
        self.analysis.analyze_position(reference_position, dups_position,
                                       samples))
def test_fasta_position(self):
    """analyze_position on a unique position where both samples match the reference.

    The reference and both sample positions call 'G' with '-' coverage and
    proportion, and the duplicates call is '0'. The full PositionInfo
    returned by analyze_position is compared against the expectation below.
    """
    def unfiltered(call):
        return Position(call=call, coverage='-', proportion='-')

    def reference_call_stats():
        # Every per-sample counter in this scenario has the same contents.
        return Counter({
            'called_reference': 1,
            'passed_coverage_filter': 1,
            'quality_breadth': 1,
            'was_called': 1,
            'passed_proportion_filter': 1,
            'called_snp': 0,
            'called_degen': 0,
        })

    reference_position = unfiltered('G')
    dups_position = unfiltered('0')
    samples = ((unfiltered('G'), unfiltered('G')),)

    # The expectation holds a list of two counters and a list of four.
    sample_stats = [
        [reference_call_stats() for _ in range(2)],
        [reference_call_stats() for _ in range(4)],
    ]

    expected = PositionInfo(
        is_all_called=True,
        is_reference_clean=True,
        is_reference_duplicated=False,
        is_all_passed_coverage=True,
        is_all_passed_proportion=True,
        is_all_passed_consensus=True,
        is_all_quality_breadth=True,
        is_best_snp=False,
        all_sample_stats=sample_stats,
        is_missing_matrix=False,
        called_reference=2,
        called_snp=0,
        passed_coverage_filter=2,
        passed_proportion_filter=2,
        num_A=0,
        num_C=0,
        num_G=2,
        num_T=0,
        num_N=0,
        call_str=['G', 'G', 'G'],
        masked_call_str=['G', 'G', 'G'],
        CallWasMade='YY',
        PassedDepthFilter='--',
        PassedProportionFilter='--',
        Pattern=['1', '1', '1'])

    observed = self.analysis.analyze_position(reference_position,
                                              dups_position, samples)
    self.assertEqual(expected, observed)
def test_duplicate_information_position(self):
    """
    Scenario: The reference was scanned for duplicate positions and this
    position is marked as a duplicate (duplicates call '1').

    For a reference 'G' with one sample calling 'G' and one calling 'A', the
    expectation below asserts, among other fields:
    - is_reference_duplicated is True
    - is_missing_matrix is False
    - is_all_quality_breadth is False
    - is_all_passed_consensus is False
    - every per-sample counter has called_reference, called_snp and
      quality_breadth at 0
    """
    def unfiltered(call):
        return Position(call=call, coverage='-', proportion='-')

    def duplicate_region_stats():
        # Every per-sample counter in this scenario has the same contents.
        return Counter({
            'passed_coverage_filter': 1,
            'was_called': 1,
            'passed_proportion_filter': 1,
            'called_reference': 0,
            'quality_breadth': 0,
            'called_snp': 0,
            'called_degen': 0,
        })

    reference_position = unfiltered('G')
    # Duplicates call for this scenario
    dups_position = unfiltered('1')
    samples = (
        (
            # Single Nucleotide Monomorphism
            unfiltered('G'),
            # Single Nucleotide Polymorphism
            unfiltered('A'),
        ),
    )

    # The expectation holds a list of two counters and a list of four.
    sample_stats = [
        [duplicate_region_stats() for _ in range(2)],
        [duplicate_region_stats() for _ in range(4)],
    ]

    expected = PositionInfo(
        is_all_called=True,
        is_reference_clean=True,
        is_reference_duplicated=True,
        is_all_passed_coverage=True,
        is_all_passed_proportion=True,
        is_all_passed_consensus=False,
        is_all_quality_breadth=False,
        is_best_snp=False,
        all_sample_stats=sample_stats,
        is_missing_matrix=False,
        called_reference=1,
        called_snp=0,
        passed_coverage_filter=2,
        passed_proportion_filter=2,
        num_A=1,
        num_C=0,
        num_G=1,
        num_T=0,
        num_N=0,
        call_str=['G', 'G', 'A'],
        masked_call_str=['G', 'G', 'A'],
        CallWasMade='YY',
        PassedDepthFilter='--',
        PassedProportionFilter='--',
        Pattern=['1', '1', '2'])

    observed = self.analysis.analyze_position(reference_position,
                                              dups_position, samples)
    self.assertEqual(expected, observed)
def setUp(self):
    """Write a two-contig FASTA to a temporary file and build contigs over it.

    Sets:
        self.contigs_expected: one dict per contig with its 'name', the
            'file_position' just past its header line, and the expected
            'positions'.
        self.fasta_file: the temporary file holding the FASTA content.
        self.contig0 / self.contig1: FastaContig with is_reference=False.
        self.ref_contig0 / self.ref_contig1: FastaContig with
            is_reference=True.
    """
    # Source fasta values
    contigs = (
        # No gap
        ">contig0\n"
        "GATC\n"
        "GGAA\n",
        # Gap after contig
        ">contig1\n"
        "GATC\n"
        "GGAA\n"
        "\n",
        # TODO: a contig without a trailing linebreak (">contig2\nGATCGGAA")
        # TODO: >80 characters contig?
    )

    def expected_positions():
        # Both contigs hold the same eight bases.
        return tuple(
            Position(call=base, coverage='-', proportion='-')
            for base in 'GATCGGAA')

    # Expected values
    self.contigs_expected = (
        {
            'name': 'contig0',
            'file_position': len('>contig0\n'),
            'positions': expected_positions(),
        },
        {
            'name': 'contig1',
            # The second header starts right after the whole first record.
            'file_position': len(contigs[0]) + len('>contig1\n'),
            'positions': expected_positions(),
        },
    )

    # Create a mock fasta.
    # NOTE(review): delete=False leaves the file on disk after it is closed;
    # presumably a tearDown outside this view removes it - confirm.
    self.fasta_file = tempfile.NamedTemporaryFile(mode='w+', delete=False)
    self.fasta_file.write("".join(contigs))
    self.fasta_file.seek(0)

    def build_contig(index, is_reference):
        spec = self.contigs_expected[index]
        return FastaContig(spec['name'],
                           len(spec['positions']),
                           spec['file_position'],
                           self.fasta_file.name,
                           is_reference=is_reference)

    # Instantiate test contigs
    self.contig0 = build_contig(0, False)
    self.contig1 = build_contig(1, False)
    self.ref_contig0 = build_contig(0, True)
    self.ref_contig1 = build_contig(1, True)
def test_multi_base_call_raises_exception(self):
    """Building a Position from a multi-character call should raise."""
    # TODO: use appropriate exception
    self.assertRaises(
        Exception, Position, call='gatc', coverage='-', proportion='-')