Example #1
0
    def setup_class(self):
        """Build the shared ``MutationDB`` fixture from the test data files.

        The gene definitions in ``hxb2_pol.bed`` are parsed against the
        reference name ``hxb2_pol`` and passed, together with the path to
        ``mutation_db.tsv``, to ``MutationDB``. The result is stored on
        ``self.mutation_db``.
        """
        bed_path = TEST_PATH + "/data/hxb2_pol.bed"
        db_path = TEST_PATH + "/data/mutation_db.tsv"

        self.mutation_db = MutationDB(
            db_path, parse_genes_file(bed_path, "hxb2_pol"))
Example #2
0
def cli(ctx, bam, reference, genes_file, output):
    # Build an AACensus for the reads in `bam` and emit the value returned
    # by its coverage() method for the frames used by the genes.
    #
    # ctx        -- accepted but not used in this body
    # bam        -- passed to parse_mapped_reads_from_bam for each reference
    # reference  -- FASTA input parsed into references; also given to AACensus
    # genes_file -- parsed against the name of the first reference
    # output     -- if truthy, the result is written to it and it is closed;
    #               otherwise the result is echoed with click
    references = parse_references_from_fasta(reference)

    # One MappedReadCollection per reference
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam) for ref in references]

    # Genes are looked up under the first reference's name
    gene_map = parse_genes_file(genes_file, references[0].name)

    # Distinct frames our genes are in
    frames = {gene_map[name]["frame"] for name in gene_map}

    census = AACensus(reference, read_collections, gene_map, frames)
    coverage_report = census.coverage(frames)

    if not output:
        click.echo(coverage_report)
        return

    output.write(coverage_report)
    output.close()
Example #3
0
    def __init__(self, id, output_dir, reads, reference,
                 genes_file, mutation_db, quiet, consensus_pct):
        """Record the run configuration and make sure the output directory
        exists.

        All arguments are stored on the instance under the same names. In
        addition the constructor:

        * initialises the ``filtered`` counters (``status``, ``length``,
          ``score``, ``ns``) to zero,
        * resets ``input_size`` to zero and calls
          ``determine_input_size()``,
        * parses ``reference`` into ``self.references`` and parses
          ``genes_file`` against the first reference's name into
          ``self.genes``,
        * sets ``self.filtered_reads`` to ``<output_dir>/filtered.fastq``,
        * creates ``output_dir`` if it is not already a directory.
        """
        # Caller-supplied configuration
        self.id = id
        self.output_dir = output_dir
        self.reads = reads
        self.reference = reference
        self.mutation_db = mutation_db
        self.genes_file = genes_file

        self.quiet = quiet
        self.consensus_pct = consensus_pct

        # Per-category counters, all starting at zero
        self.filtered = dict.fromkeys(("status", "length", "score", "ns"), 0)

        self.input_size = 0
        self.determine_input_size()

        self.references = parse_references_from_fasta(self.reference)
        first_reference = self.references[0]
        self.genes = parse_genes_file(genes_file, first_reference.name)

        self.filtered_reads = "%s/filtered.fastq" % output_dir

        if os.path.isdir(output_dir):
            return
        os.mkdir(output_dir)
Example #4
0
    def test_invalid_genes_file(self):
        """Tests to make sure that an exception is raised when attempting to
        parse an invalid genes file.

        A temporary bed file is written next to this test module (in its
        ``data`` directory) whose second record names a different reference
        than the one passed to ``parse_genes_file``; parsing it is expected
        to raise ``ValueError``. The temporary file is removed afterwards
        whether or not the expectation holds.
        """

        # Create an invalid genes file
        invalid_genes_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "data",
            "invalid_genes_file.bed")

        ref_name = "ref1"

        try:
            with open(invalid_genes_file, "w+") as f:
                f.write("%s\t0\t100\t0\n" % ref_name)
                # Add a genes reference name that doesn't match
                # This should raise a ValueError
                f.write("different_reference\t101\t200\t2")

            with pytest.raises(ValueError):
                parse_genes_file(invalid_genes_file, ref_name)
        finally:
            # Clean up even when the expectation above fails, so a failing
            # run does not leave the generated file in the data directory.
            if os.path.exists(invalid_genes_file):
                os.remove(invalid_genes_file)
Example #5
0
def cli(ctx, bam, reference, variants, genes_file, min_freq, mutation_db,
        reporting_threshold, output):
    # Build amino acid variants from the reads in `bam` and emit the value
    # returned by report_dr_mutations() for them.
    #
    # ctx                 -- accepted but not used in this body
    # variants            -- VCF input parsed into nucleotide variants
    # min_freq            -- used to build the 'mf<min_freq>' filter
    # mutation_db         -- passed to MutationDB together with the genes
    # reporting_threshold -- forwarded to report_dr_mutations()
    # output              -- if truthy, the report is written to it and it is
    #                        closed; otherwise the report is echoed with click
    references = parse_references_from_fasta(reference)

    # One MappedReadCollection per reference
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam) for ref in references]

    nt_variants = parse_nt_variants_from_vcf(variants, references)

    # Mask the unconfident differences
    for collection in read_collections:
        collection.mask_unconfident_differences(nt_variants)

    # Genes are looked up under the first reference's name
    gene_map = parse_genes_file(genes_file, references[0].name)

    # Distinct frames our genes are in
    frames = {gene_map[name]['frame'] for name in gene_map}

    census = AACensus(reference, read_collections, gene_map, frames)

    aa_variants = AAVariantCollection.from_aacensus(census)

    # Filter for mutant frequency
    freq_text = str(min_freq)
    aa_variants.filter('mf' + freq_text, 'freq<' + freq_text, True)

    # Build the mutation database and generate the report once
    database = MutationDB(mutation_db, gene_map)
    report = aa_variants.report_dr_mutations(database, reporting_threshold)

    if not output:
        click.echo(report)
        return

    output.write(report)
    output.close()
Example #6
0
    def test_from_aacensus(self):
        """Check the codon variants that ``CodonVariantCollection`` builds
        from an ``AACensus``.

        Reads from ``align.bam`` are mapped against ``self.references``,
        nucleotide variants are called and filtered, unconfident differences
        are masked, and an ``AACensus`` is built for the genes in
        ``hxb2_pol.bed``. Every gene in the resulting collection must be a
        known gene, and no reported codon may equal the reference codon at
        its position.
        """
        bam = TEST_PATH + "/data/align.bam"
        genes_file = TEST_PATH + "/data/hxb2_pol.bed"
        error_rate = 0.0038

        # Create a MappedReadCollection object for every reference
        mapped_read_collection_arr = [
            parse_mapped_reads_from_bam(r, bam) for r in self.references]

        # Call and filter the variants once, after every collection has been
        # built, rather than repeating the work inside the per-reference loop.
        variants = NTVariantCollection.from_mapped_read_collections(
            error_rate, self.references, *mapped_read_collection_arr)
        variants.filter('q30', 'QUAL<30', True)
        variants.filter('ac5', 'AC<5', True)
        variants.filter('dp100', 'DP<100', True)

        # Mask the unconfident differences
        for mrc in mapped_read_collection_arr:
            mrc.mask_unconfident_differences(variants)

        # Parse the genes from the gene file
        genes = parse_genes_file(genes_file, self.references[0].name)

        # Determine which frames our genes are in
        frames = set()

        for gene in genes:
            frames.add(genes[gene]['frame'])

        aa_census = AACensus(self.reference, mapped_read_collection_arr,
                             genes, frames)

        test_variants = CodonVariantCollection.from_aacensus(aa_census)
        ref_seq = self.references[0].seq

        for gene in test_variants.variants:
            assert gene in genes
            for pos in test_variants.variants[gene]:
                for frame in frames:
                    nt_pos = pos/3 - frame
                    # NOTE(review): with `or` this holds for any nt_pos as
                    # long as start <= end, so it does not constrain the
                    # position. Presumably `and` was intended, but the
                    # pos -> nt_pos mapping should be confirmed first.
                    assert (nt_pos >= genes[gene]['start'] or
                            nt_pos <= genes[gene]['end'])
                for codon in test_variants.variants[gene][pos]:
                    # A reported variant codon must differ from the reference
                    ref_codon = ref_seq[(pos):(pos) + 3].lower()
                    assert codon != ref_codon
Example #7
0
    def test_valid_genes_file(self):
        """Tests to make sure that valid genes files (bed files) are parsed
        properly.

        A temporary bed file with three genes is written next to this test
        module (in its ``data`` directory) and parsed. The parsed result must
        contain exactly the genes that were written, with matching ``start``
        and ``end`` values and a ``frame`` equal to ``start % 3``. The
        temporary file is removed afterwards whether or not the checks pass.
        """

        # Create a valid genes file
        valid_genes_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "data",
            "valid_genes_file.bed")

        ref_name = "ref1"

        genes = {
            "gene1": {
                "start": 0,
                "end": 100
            },
            "gene 2": {
                "start": 101,
                "end": 200
            },  # Spaces are allowed in the gene name
            "gene3": {
                "start": 201,
                "end": 300
            }
        }

        try:
            with open(valid_genes_file, "w+") as f:
                for gene in genes:
                    f.write(
                        "%s\t%s\t%s\t%s\n" %
                        (ref_name, genes[gene]["start"], genes[gene]["end"],
                         gene))

            parsed_genes = parse_genes_file(valid_genes_file, ref_name)

            # Without this check the loop below would pass vacuously if the
            # parser dropped genes or returned nothing at all.
            assert set(parsed_genes) == set(genes)

            for gene in parsed_genes:
                assert gene in genes
                assert parsed_genes[gene]["start"] == genes[gene]["start"]
                assert parsed_genes[gene]["end"] == genes[gene]["end"]
                assert parsed_genes[gene]["frame"] == genes[gene]["start"] % 3
        finally:
            # Clean up even when a check above fails, so a failing run does
            # not leave the generated file in the data directory.
            if os.path.exists(valid_genes_file):
                os.remove(valid_genes_file)
Example #8
0
def aavar(bam, reference, variants, genes_file, min_freq, mutation_db, output):
    # Build amino acid variants from the reads in `bam` and emit them via
    # AAVariantCollection.to_hmcf_file(CONFIDENT).
    #
    # bam         -- passed to parse_mapped_reads_from_bam for each reference
    # reference   -- FASTA input parsed into references; also given to AACensus
    # variants    -- VCF input parsed into nucleotide variants
    # genes_file  -- parsed against the name of the first reference
    # min_freq    -- mutant frequency threshold used for the 'mf...' filter;
    #                None falls back to the previous fixed value of 0.01
    # mutation_db -- if not None, a MutationDB is built from it and applied
    #                to the variant collection
    # output      -- if truthy, the result is written to it; otherwise it is
    #                echoed with click
    rs = parse_references_from_fasta(reference)

    # Create a MappedReadCollection object for every reference
    mapped_read_collection_arr = [
        parse_mapped_reads_from_bam(r, bam) for r in rs]

    variants_obj = parse_nt_variants_from_vcf(variants, rs)

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_genes_file(genes_file, rs[0].name)

    # Determine which frames our genes are in
    frames = {genes[gene]['frame'] for gene in genes}

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    # Create AAVar collection and print the hmcf file
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency. The threshold used to be hard-coded to
    # 0.01, which silently ignored the min_freq argument; with
    # min_freq == 0.01 the filter id and expression are unchanged.
    freq_cutoff = 0.01 if min_freq is None else min_freq
    aa_vars.filter('mf' + str(freq_cutoff), 'freq<' + str(freq_cutoff), True)

    # Build the mutation database and update collection
    if mutation_db is not None:
        mutation_db = MutationDB(mutation_db, genes)
        aa_vars.apply_mutation_db(mutation_db)

    if output:
        output.write(aa_vars.to_hmcf_file(CONFIDENT))
    else:
        click.echo(aa_vars.to_hmcf_file(CONFIDENT))
Example #9
0
    def setup(self):
        """Build the amino acid variant collection and mutation database
        used by the tests.

        Reads from ``align.bam`` are mapped against the references in
        ``hxb2_pol.fas``, unconfident differences are masked using the
        variants parsed from ``VARIANTS_FILE``, and an ``AACensus`` is built
        for the genes in ``hxb2_pol.bed``. The resulting
        ``AAVariantCollection`` is stored on ``self.aa_collection`` and a
        ``MutationDB`` built from ``mutation_db.tsv`` on
        ``self.mutation_db``.
        """
        reference = TEST_PATH + "/data/hxb2_pol.fas"
        bam = TEST_PATH + "/data/align.bam"
        genes_file = TEST_PATH + "/data/hxb2_pol.bed"
        mutation_db = TEST_PATH + "/data/mutation_db.tsv"

        rs = parse_references_from_fasta(reference)

        # Create a MappedReadCollection object for every reference
        mapped_read_collection_arr = [
            parse_mapped_reads_from_bam(r, bam) for r in rs]

        variants_obj = parse_nt_variants_from_vcf(VARIANTS_FILE, rs)

        # Mask the unconfident differences
        for mrc in mapped_read_collection_arr:
            mrc.mask_unconfident_differences(variants_obj)

        # Parse the genes from the gene file
        genes = parse_genes_file(genes_file, rs[0].name)

        # Determine which frames our genes are in
        frames = {genes[gene]['frame'] for gene in genes}

        # Create an AACensus object
        aa_census = AACensus(reference, mapped_read_collection_arr, genes,
                             frames)

        # Find the AA mutations
        self.aa_collection = AAVariantCollection.from_aacensus(aa_census)

        # Build the mutation database
        self.mutation_db = MutationDB(mutation_db, genes)
Example #10
0
    def setup_class(self):
        """Build the shared ``AACensus`` fixture from the test data files.

        Reads from ``align.bam`` are mapped against each reference in
        ``hxb2_pol.fas`` and the genes in ``hxb2_pol.bed`` are parsed
        against the first reference's name. The set of gene frames is
        stored on ``self.frames`` and the census built from all of the
        above on ``self.aa_census``.
        """
        fasta_path = TEST_PATH + "/data/hxb2_pol.fas"
        bam_path = TEST_PATH + "/data/align.bam"
        bed_path = TEST_PATH + "/data/hxb2_pol.bed"

        references = parse_references_from_fasta(fasta_path)

        # One MappedReadCollection per reference
        read_collections = [
            parse_mapped_reads_from_bam(ref, bam_path) for ref in references]

        gene_map = parse_genes_file(bed_path, references[0].name)

        # Distinct frames our genes are in
        self.frames = {gene_map[name]["frame"] for name in gene_map}

        self.aa_census = AACensus(
            fasta_path, read_collections, gene_map, self.frames)
Example #11
0
    def setup(self):
        """Build the ``CodonVariantCollection`` used by the tests.

        Reads from ``align.bam`` are mapped against each reference in
        ``hxb2_pol.fas``; nucleotide variants are called with an error rate
        of 0.0038 and passed through the ``q30``, ``ac5`` and ``dp100``
        filters; unconfident differences are then masked in every read
        collection. An ``AACensus`` is built for the genes in
        ``hxb2_pol.bed`` and the codon variants derived from it are stored
        on ``self.codon_variants``.
        """
        bam_path = TEST_PATH + "/data/align.bam"
        fasta_path = TEST_PATH + "/data/hxb2_pol.fas"
        bed_path = TEST_PATH + "/data/hxb2_pol.bed"
        error_rate = 0.0038

        references = parse_references_from_fasta(fasta_path)

        # One MappedReadCollection per reference
        read_collections = [
            parse_mapped_reads_from_bam(ref, bam_path) for ref in references]

        nt_variants = NTVariantCollection.from_mapped_read_collections(
            error_rate, references, *read_collections)
        for filter_id, expression in (('q30', 'QUAL<30'),
                                      ('ac5', 'AC<5'),
                                      ('dp100', 'DP<100')):
            nt_variants.filter(filter_id, expression, True)

        # Mask the unconfident differences
        for collection in read_collections:
            collection.mask_unconfident_differences(nt_variants)

        # Genes are looked up under the first reference's name
        gene_map = parse_genes_file(bed_path, references[0].name)

        # Distinct frames our genes are in
        frames = {gene_map[name]['frame'] for name in gene_map}

        census = AACensus(fasta_path, read_collections, gene_map, frames)

        self.codon_variants = CodonVariantCollection.from_aacensus(census)
Example #12
0
def codonvar(bam, reference, offset, genes_file, error_rate, output):
    # Call codon variants for the reads in `bam` and emit the value returned
    # by CodonVariantCollection.to_csv_file(offset).
    #
    # bam        -- passed to parse_mapped_reads_from_bam for each reference
    # reference  -- FASTA input parsed into references; also given to AACensus
    # offset     -- forwarded to to_csv_file()
    # genes_file -- parsed against the name of the first reference
    # error_rate -- forwarded to the nucleotide variant caller
    # output     -- if truthy, the CSV is written to it and it is closed;
    #               otherwise the CSV is echoed with click
    references = parse_references_from_fasta(reference)

    # One MappedReadCollection per reference
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam) for ref in references]

    nt_variants = NTVariantCollection.from_mapped_read_collections(
        error_rate, references, *read_collections)
    for filter_id, expression in (('q30', 'QUAL<30'),
                                  ('ac5', 'AC<5'),
                                  ('dp100', 'DP<100')):
        nt_variants.filter(filter_id, expression, True)

    # Mask the unconfident differences
    for collection in read_collections:
        collection.mask_unconfident_differences(nt_variants)

    # Genes are looked up under the first reference's name
    gene_map = parse_genes_file(genes_file, references[0].name)

    # Distinct frames our genes are in
    frames = {gene_map[name]['frame'] for name in gene_map}

    census = AACensus(reference, read_collections, gene_map, frames)

    csv_text = CodonVariantCollection.from_aacensus(census).to_csv_file(
        offset)

    if not output:
        click.echo(csv_text)
        return

    output.write(csv_text)
    output.close()