def setup_class(self):
    """Prepare the mutation database shared by the tests in this class.

    Genes are parsed from the bundled BED file for the reference named
    "hxb2_pol"; the resulting genes and the bundled TSV file are then used
    to build the ``MutationDB`` stored on ``self.mutation_db``.
    """
    bed_path = TEST_PATH + "/data/hxb2_pol.bed"
    tsv_path = TEST_PATH + "/data/mutation_db.tsv"

    self.mutation_db = MutationDB(
        tsv_path, parse_genes_file(bed_path, "hxb2_pol"))
def cli(ctx, bam, reference, genes_file, output):
    """Emit the result of ``AACensus.coverage`` for the given alignment.

    Args:
        ctx: Context object supplied by the caller; not used in this body.
        bam: Alignment file passed to ``parse_mapped_reads_from_bam``.
        reference: FASTA file passed to ``parse_references_from_fasta`` and
            to the ``AACensus`` constructor.
        genes_file: Genes file, parsed against the name of the first
            reference.
        output: Writable handle. When truthy the result is written to it
            and the handle is closed; otherwise the result is echoed with
            ``click.echo``.
    """
    references = parse_references_from_fasta(reference)

    # One mapped-read collection per reference sequence.
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam) for ref in references]

    gene_map = parse_genes_file(genes_file, references[0].name)

    # Distinct reading frames used by the parsed genes.
    reading_frames = {gene_map[name]["frame"] for name in gene_map}

    census = AACensus(reference, read_collections, gene_map, reading_frames)
    report = census.coverage(reading_frames)

    if not output:
        click.echo(report)
        return

    output.write(report)
    output.close()
def __init__(self, id, output_dir, reads, reference, genes_file,
             mutation_db, quiet, consensus_pct):
    """Store the run configuration and prepare the output directory.

    Args:
        id: Identifier stored on ``self.id``.
        output_dir: Directory for outputs; created if it is not already a
            directory.
        reads: Reads input, stored on ``self.reads``.
        reference: FASTA file parsed into ``self.references``.
        genes_file: Genes file parsed against the first reference's name.
        mutation_db: Stored as-is on ``self.mutation_db``.
        quiet: Stored as-is on ``self.quiet``.
        consensus_pct: Stored as-is on ``self.consensus_pct``.
    """
    # Caller-supplied settings, kept verbatim.
    self.id = id
    self.reads = reads
    self.reference = reference
    self.genes_file = genes_file
    self.mutation_db = mutation_db
    self.output_dir = output_dir
    self.quiet = quiet
    self.consensus_pct = consensus_pct

    # Tallies keyed by category, all starting at zero.
    self.filtered = {"status": 0, "length": 0, "score": 0, "ns": 0}

    # Start from zero, then let determine_input_size() do its work.
    self.input_size = 0
    self.determine_input_size()

    self.references = parse_references_from_fasta(self.reference)
    self.genes = parse_genes_file(genes_file, self.references[0].name)

    self.filtered_reads = "%s/filtered.fastq" % output_dir

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
def test_invalid_genes_file(self):
    """Tests to make sure that an exception is raised when attempting to
    parse an invalid genes file.

    A scratch BED file is written next to this test module (under
    ``data/``) with two records whose reference names differ, and
    ``parse_genes_file`` is expected to raise ``ValueError`` for it. The
    scratch file is removed whether or not the expectation holds.
    """
    # Create an invalid genes file
    invalid_genes_file = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "data",
        "invalid_genes_file.bed")

    ref_name = "ref1"

    try:
        with open(invalid_genes_file, "w+") as f:
            f.write("%s\t0\t100\t0\n" % ref_name)

            # Add a genes reference name that doesn't match
            # This should raise a ValueError
            f.write("different_reference\t101\t200\t2")

        with pytest.raises(ValueError):
            parse_genes_file(invalid_genes_file, ref_name)
    finally:
        # Clean up even when the expectation above fails, so a failing run
        # does not leave the scratch file behind. The existence check keeps
        # a failed open() from being masked by a second error here.
        if os.path.exists(invalid_genes_file):
            os.remove(invalid_genes_file)
def cli(ctx, bam, reference, variants, genes_file, min_freq, mutation_db,
        reporting_threshold, output):
    """Emit the drug-resistance mutation report for an alignment.

    Args:
        ctx: Context object supplied by the caller; not used in this body.
        bam: Alignment file passed to ``parse_mapped_reads_from_bam``.
        reference: FASTA file of reference sequences.
        variants: VCF file passed to ``parse_nt_variants_from_vcf``.
        genes_file: Genes file, parsed against the first reference's name.
        min_freq: Frequency threshold used to build the filter id and the
            filter expression.
        mutation_db: Mutation database file passed to ``MutationDB``.
        reporting_threshold: Forwarded to ``report_dr_mutations``.
        output: Writable handle. When truthy the report is written to it
            and the handle is closed; otherwise the report is echoed with
            ``click.echo``.
    """
    references = parse_references_from_fasta(reference)

    # One mapped-read collection per reference sequence.
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam) for ref in references]

    nt_variants = parse_nt_variants_from_vcf(variants, references)

    # Mask the unconfident differences in every collection.
    for collection in read_collections:
        collection.mask_unconfident_differences(nt_variants)

    gene_map = parse_genes_file(genes_file, references[0].name)

    # Distinct reading frames used by the parsed genes.
    reading_frames = {gene_map[name]['frame'] for name in gene_map}

    census = AACensus(reference, read_collections, gene_map, reading_frames)
    aa_variants = AAVariantCollection.from_aacensus(census)

    # Filter for mutant frequency.
    cutoff = str(min_freq)
    aa_variants.filter('mf' + cutoff, 'freq<' + cutoff, True)

    database = MutationDB(mutation_db, gene_map)
    report = aa_variants.report_dr_mutations(database, reporting_threshold)

    if not output:
        click.echo(report)
        return

    output.write(report)
    output.close()
def test_from_aacensus(self):
    """Check the codon variants derived from an AACensus.

    Builds an ``AACensus`` from the bundled alignment and genes files,
    derives a ``CodonVariantCollection`` from it, and asserts that every
    reported gene is one of the parsed genes and that no reported codon
    equals the lower-cased three-character slice of the first reference
    sequence starting at the same position.
    """
    bam = TEST_PATH + "/data/align.bam"
    genes_file = TEST_PATH + "/data/hxb2_pol.bed"
    mapped_read_collection_arr = []
    error_rate = 0.0038

    # Create a MappedReadCollection object
    for r in self.references:
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    variants = NTVariantCollection.from_mapped_read_collections(
        error_rate, self.references, *mapped_read_collection_arr)

    # Apply the QUAL, AC and DP filters before masking.
    variants.filter('q30', 'QUAL<30', True)
    variants.filter('ac5', 'AC<5', True)
    variants.filter('dp100', 'DP<100', True)

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants)

    # Parse the genes from the gene file
    genes = parse_genes_file(genes_file, self.references[0].name)

    # Determine which frames our genes are in
    frames = set()

    for gene in genes:
        frames.add(genes[gene]['frame'])

    aa_census = AACensus(self.reference, mapped_read_collection_arr, genes,
                         frames)

    test_variants = CodonVariantCollection.from_aacensus(aa_census)
    ref_seq = self.references[0].seq

    for gene in test_variants.variants:
        # Every gene key in the collection must come from the genes file.
        assert gene in genes
        for pos in test_variants.variants[gene]:
            for frame in frames:
                # NOTE(review): if this runs under Python 3, `/` is true
                # division and nt_pos is a float -- confirm that is intended.
                nt_pos = pos/3 - frame
                # NOTE(review): joined with `or`, this holds for any nt_pos
                # whenever start <= end, so it cannot fail; confirm whether
                # `and` was intended.
                assert nt_pos >= genes[gene]['start'] or nt_pos <= genes[gene]['end']
            for codon in test_variants.variants[gene][pos]:
                # A reported variant codon must differ from the reference
                # slice at the same position.
                ref_codon = ref_seq[(pos):(pos) + 3].lower()
                assert codon != ref_codon
def test_valid_genes_file(self):
    """Tests to make sure that valid genes files (bed files) are parsed
    properly.

    A scratch BED file is written next to this test module (under
    ``data/``), parsed with ``parse_genes_file``, and each parsed gene is
    compared against the values that were written. The scratch file is
    removed whether or not the assertions hold.
    """
    # Create a valid genes file
    valid_genes_file = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "data",
        "valid_genes_file.bed")

    ref_name = "ref1"

    genes = {
        "gene1": {"start": 0, "end": 100},
        "gene 2": {"start": 101, "end": 200},  # Spaces are allowed in the gene name
        "gene3": {"start": 201, "end": 300},
    }

    try:
        with open(valid_genes_file, "w+") as f:
            for gene in genes:
                f.write(
                    "%s\t%s\t%s\t%s\n" % (ref_name, genes[gene]["start"],
                                          genes[gene]["end"], gene))

        parsed_genes = parse_genes_file(valid_genes_file, ref_name)

        for gene in parsed_genes:
            assert gene in genes
            assert parsed_genes[gene]["start"] == genes[gene]["start"]
            assert parsed_genes[gene]["end"] == genes[gene]["end"]
            # The frame is expected to be the start coordinate modulo 3.
            assert parsed_genes[gene]["frame"] == genes[gene]["start"] % 3
    finally:
        # Clean up even when parsing or an assertion fails, so a failing
        # run does not leave the scratch file behind. The existence check
        # keeps a failed open() from being masked by a second error here.
        if os.path.exists(valid_genes_file):
            os.remove(valid_genes_file)
def aavar(bam, reference, variants, genes_file, min_freq, mutation_db,
          output):
    """Emit the amino acid variants found in an alignment as HMCF text.

    Args:
        bam: Alignment file passed to ``parse_mapped_reads_from_bam``.
        reference: FASTA file of reference sequences.
        variants: VCF file passed to ``parse_nt_variants_from_vcf``.
        genes_file: Genes file, parsed against the first reference's name.
        min_freq: Frequency threshold used to build the filter id and the
            filter expression. ``None`` falls back to 0.01, the value this
            function previously hard-coded.
        mutation_db: Mutation database file, or ``None`` to skip applying
            a mutation database to the variants.
        output: Writable handle. When truthy the HMCF text is written to
            it; otherwise the text is echoed with ``click.echo``.
    """
    rs = parse_references_from_fasta(reference)

    # Create a MappedReadCollection object per reference
    mapped_read_collection_arr = [
        parse_mapped_reads_from_bam(r, bam) for r in rs]

    variants_obj = parse_nt_variants_from_vcf(variants, rs)

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_genes_file(genes_file, rs[0].name)

    # Determine which frames our genes are in
    frames = {genes[gene]['frame'] for gene in genes}

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    # Create AAVar collection and print the hmcf file
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency. The threshold comes from the caller
    # rather than a literal, so the min_freq argument takes effect; for
    # the 0.01 value the filter id and expression are unchanged.
    threshold = 0.01 if min_freq is None else min_freq
    aa_vars.filter('mf' + str(threshold), 'freq<' + str(threshold), True)

    # Build the mutation database and update collection
    if mutation_db is not None:
        mutation_db = MutationDB(mutation_db, genes)
        aa_vars.apply_mutation_db(mutation_db)

    hmcf = aa_vars.to_hmcf_file(CONFIDENT)

    if output:
        output.write(hmcf)
    else:
        click.echo(hmcf)
def setup(self):
    """Build the amino acid variant collection and mutation database.

    Uses the bundled reference, alignment, genes and mutation database
    files together with the module-level ``VARIANTS_FILE``. The results
    are stored on ``self.aa_collection`` and ``self.mutation_db``.
    """
    reference = TEST_PATH + "/data/hxb2_pol.fas"
    bam = TEST_PATH + "/data/align.bam"
    genes_file = TEST_PATH + "/data/hxb2_pol.bed"
    mutation_db = TEST_PATH + "/data/mutation_db.tsv"

    rs = parse_references_from_fasta(reference)

    # Create a MappedReadCollection object per reference
    mapped_read_collection_arr = [
        parse_mapped_reads_from_bam(r, bam) for r in rs]

    variants_obj = parse_nt_variants_from_vcf(VARIANTS_FILE, rs)

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_genes_file(genes_file, rs[0].name)

    # Determine which frames our genes are in
    frames = {genes[gene]['frame'] for gene in genes}

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    # Find the AA mutations
    self.aa_collection = AAVariantCollection.from_aacensus(aa_census)

    # Build the mutation database
    self.mutation_db = MutationDB(mutation_db, genes)
def setup_class(self):
    """Build the AACensus fixture shared by the tests in this class.

    The set of gene reading frames is stored on ``self.frames`` and the
    census built from the bundled reference, alignment and genes files is
    stored on ``self.aa_census``.
    """
    fasta_path = TEST_PATH + "/data/hxb2_pol.fas"
    bam_path = TEST_PATH + "/data/align.bam"
    bed_path = TEST_PATH + "/data/hxb2_pol.bed"

    references = parse_references_from_fasta(fasta_path)

    # One mapped-read collection per reference sequence.
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam_path) for ref in references]

    gene_map = parse_genes_file(bed_path, references[0].name)

    # Distinct reading frames used by the parsed genes.
    self.frames = {gene_map[name]["frame"] for name in gene_map}

    self.aa_census = AACensus(
        fasta_path, read_collections, gene_map, self.frames)
def setup(self):
    """Build the codon variant collection used by the tests.

    Nucleotide variants are called from the bundled alignment, filtered,
    and used to mask the mapped reads; the resulting ``AACensus`` is
    turned into the ``CodonVariantCollection`` stored on
    ``self.codon_variants``.
    """
    bam_path = TEST_PATH + "/data/align.bam"
    fasta_path = TEST_PATH + "/data/hxb2_pol.fas"
    bed_path = TEST_PATH + "/data/hxb2_pol.bed"
    assumed_error_rate = 0.0038

    references = parse_references_from_fasta(fasta_path)

    # One mapped-read collection per reference sequence.
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam_path) for ref in references]

    nt_variants = NTVariantCollection.from_mapped_read_collections(
        assumed_error_rate, references, *read_collections)

    # Apply the QUAL, AC and DP filters, in that order.
    for filter_id, expression in (('q30', 'QUAL<30'),
                                  ('ac5', 'AC<5'),
                                  ('dp100', 'DP<100')):
        nt_variants.filter(filter_id, expression, True)

    # Mask the unconfident differences in every collection.
    for collection in read_collections:
        collection.mask_unconfident_differences(nt_variants)

    gene_map = parse_genes_file(bed_path, references[0].name)

    # Distinct reading frames used by the parsed genes.
    reading_frames = {gene_map[name]['frame'] for name in gene_map}

    census = AACensus(fasta_path, read_collections, gene_map, reading_frames)

    self.codon_variants = CodonVariantCollection.from_aacensus(census)
def codonvar(bam, reference, offset, genes_file, error_rate, output):
    """Emit the codon variants found in an alignment as CSV text.

    Args:
        bam: Alignment file passed to ``parse_mapped_reads_from_bam``.
        reference: FASTA file of reference sequences.
        offset: Forwarded to ``CodonVariantCollection.to_csv_file``.
        genes_file: Genes file, parsed against the first reference's name.
        error_rate: Forwarded to
            ``NTVariantCollection.from_mapped_read_collections``.
        output: Writable handle. When truthy the CSV text is written to it
            and the handle is closed; otherwise the text is echoed with
            ``click.echo``.
    """
    references = parse_references_from_fasta(reference)

    # One mapped-read collection per reference sequence.
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam) for ref in references]

    nt_variants = NTVariantCollection.from_mapped_read_collections(
        error_rate, references, *read_collections)

    # Apply the QUAL, AC and DP filters, in that order.
    for filter_id, expression in (('q30', 'QUAL<30'),
                                  ('ac5', 'AC<5'),
                                  ('dp100', 'DP<100')):
        nt_variants.filter(filter_id, expression, True)

    # Mask the unconfident differences in every collection.
    for collection in read_collections:
        collection.mask_unconfident_differences(nt_variants)

    gene_map = parse_genes_file(genes_file, references[0].name)

    # Distinct reading frames used by the parsed genes.
    reading_frames = {gene_map[name]['frame'] for name in gene_map}

    census = AACensus(reference, read_collections, gene_map, reading_frames)
    codon_variants = CodonVariantCollection.from_aacensus(census)

    csv_text = codon_variants.to_csv_file(offset)

    if not output:
        click.echo(csv_text)
        return

    output.write(csv_text)
    output.close()