Example #1
    def __init__(self, id, output_dir, reads, reference,
                 genes_file, mutation_db, quiet, consensus_pct):
        self.id = id
        self.output_dir = output_dir
        self.reads = reads
        self.reference = reference
        self.mutation_db = mutation_db
        self.genes_file = genes_file

        self.quiet = quiet
        self.consensus_pct = consensus_pct

        self.filtered = {"status": 0, "length": 0, "score": 0, "ns": 0}

        self.input_size = 0
        self.determine_input_size()

        self.references = parse_references_from_fasta(self.reference)
        self.genes = parse_genes_file(genes_file, self.references[0].name)

        self.filtered_reads = "%s/filtered.fastq" % output_dir

        if not os.path.isdir(output_dir):
            os.mkdir(output_dir)
Example #2
def cli(ctx, bam, reference, genes_file, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    # Parse the genes from the gene file
    genes = parse_genes_file(genes_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()

    for gene in genes:
        frames.add(genes[gene]["frame"])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    if output:
        output.write(aa_census.coverage(frames))
        output.close()
    else:
        click.echo(aa_census.coverage(frames))
Example #3
    def setup_class(self):
        self.references = parse_references_from_fasta('tests/data/ref1.fasta')
        self.variant_collection = NTVariantCollection(self.references)

        self.variant_collection.variants['ref1']['3']['t'] = NTVariant(
            chrom='ref1',
            pos=3,
            ref='c',
            alt='t',
            qual=30,
            info={
                'DP': 400,
                'AC': 12,
                'AF': 0.03
            })
        self.variant_collection.variants['ref1']['10']['a'] = NTVariant(
            chrom='ref1',
            pos=10,
            ref='a',
            alt='t',
            qual=23,
            info={
                'DP': 200,
                'AC': 7,
                'AF': 0.035
            })
Example #4
def cli(ctx, bam, reference, bed4_file, output):
    """This script builds an amino acid census and returns its coverage.
    The BAM alignment file corresponds to a pileup of sequences aligned to
    the REFERENCE. A BAM index file (.bai) must also be present and, except
    for the extension, have the same name as the BAM file. The REFERENCE must
    be in FASTA format. The BED4_FILE must be a BED file with at least 4
    columns and specify the gene locations within the REFERENCE.

    The output is in CSV format."""

    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    # Parse the genes from the gene file
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()

    for gene in genes:
        frames.add(genes[gene]["frame"])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    if output:
        output.write(aa_census.coverage(frames))
        output.close()
    else:
        click.echo(aa_census.coverage(frames))
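Several of the examples on this page collect reading frames by iterating over the parsed genes. A minimal sketch of that pattern, using a hypothetical genes mapping shaped like the dicts these examples iterate over (gene name mapped to a record with a "frame" field); the gene names and coordinates below are made up for illustration:

# Hypothetical genes mapping; real ones come from parse_BED4_file or
# parse_genes_file. Names and coordinates are made up for illustration.
genes = {
    "gag": {"start": 1309, "end": 2841, "frame": 1},
    "pol": {"start": 2085, "end": 5096, "frame": 0},
}

# Collect the distinct reading frames, exactly as the examples above do.
frames = set()
for gene in genes:
    frames.add(genes[gene]["frame"])

print(sorted(frames))  # [0, 1]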
Example #5
def cli(ctx, bam, reference, percentage, id, output):
    rs = parse_references_from_fasta(reference)
    # pysam.Samfile is a deprecated alias; AlignmentFile plus to_dict()
    # yields a plain dict header.
    bam_header = pysam.AlignmentFile(bam, "rb").header.to_dict()

    if id:
        fasta_id = id
    else:
        fasta_id = os.path.basename(bam).split('.')[0]

    for r in rs:
        mrc = parse_mapped_reads_from_bam(r, bam)

        conseq = mrc.to_consensus(percentage)

        # A dict header needs a membership test, not hasattr; 'RG' maps to
        # a list of read-group dicts, so take the first group's ID.
        if 'RG' in bam_header:
            fasta_id = bam_header['RG'][0]['ID']

        if output:
            output.write('>{0}_{1}_{2}\n{3}'.format(fasta_id, percentage,
                                                    r.name, conseq))
        else:
            click.echo('>{0}_{1}_{2}\n{3}'.format(fasta_id, percentage, r.name,
                                                  conseq))
    if output:
        output.close()
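The consensus loop above emits one FASTA record per reference, joining the sample id, the consensus percentage, and the reference name with underscores in the header line. A quick sketch of that layout with made-up values:

# Made-up values for illustration only.
fasta_id, percentage, ref_name = "sample1", 20, "hxb2_pol"
conseq = "ATGGCT"  # placeholder consensus sequence

print('>{0}_{1}_{2}\n{3}'.format(fasta_id, percentage, ref_name, conseq))
# >sample1_20_hxb2_pol
# ATGGCT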
Example #6
def ntvar(bam, reference, error_rate, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # create MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    variants = NTVariantCollection.from_mapped_read_collections(
        error_rate, rs, *mapped_read_collection_arr)

    variants.filter('q30', 'QUAL<30', True)
    variants.filter('ac5', 'AC<5', True)
    variants.filter('dp100', 'DP<100', True)

    if output:
        output.write(variants.to_vcf_file())
        output.close()
    else:
        click.echo(variants.to_vcf_file())
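The three filter() calls above each pair a filter id with a "FIELD<value" expression; the VCF written in a test further down this page declares matching ##FILTER headers (q30, ac5, dp100), which suggests variants satisfying an expression are tagged with that id rather than PASS. A minimal sketch of evaluating such expressions under that assumed semantics; fails_filter and the variant record are illustrative, not quasitools API:

# Illustrative only: a simplified "FIELD<value" evaluator, not the
# quasitools implementation.
def fails_filter(variant, expression):
    field, threshold = expression.split("<")
    return float(variant[field]) < float(threshold)

variant = {"QUAL": 23, "AC": 7, "DP": 200}  # made-up variant record

for filter_id, expression in [("q30", "QUAL<30"),
                              ("ac5", "AC<5"),
                              ("dp100", "DP<100")]:
    if fails_filter(variant, expression):
        print(filter_id)  # only q30 prints: QUAL 23 < 30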
Example #7
    def setup(self):
        csv = TEST_PATH + "/data/output/mutant_types.csv"
        reference = TEST_PATH + "/data/hxb2_pol.fas"
        self.offset = 1269

        rs = parse_references_from_fasta(reference)
        self.ref_seq = rs[0].seq

        self.codon_variants = parse_codon_variants(csv, rs)
Example #8
def cli(ctx, csv, reference, offset, output):
    rs = parse_references_from_fasta(reference)
    ref_seq = rs[0].seq

    codon_variants = parse_codon_variants(csv, rs)

    if output:
        output.write(codon_variants.report_dnds_values(ref_seq, offset))
    else:
        click.echo(codon_variants.report_dnds_values(ref_seq, offset))
Example #9
def cli(ctx, bam, reference, variants, bed4_file, min_freq, mutation_db,
        reporting_threshold, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    variants_obj = parse_nt_variants_from_vcf(variants, rs)

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()

    for gene in genes:
        frames.add(genes[gene]['frame'])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    # Create AAVar collection and print the aavf file
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency
    aa_vars.filter('mf' + str(min_freq), 'freq<' + str(min_freq), True)

    # Build the mutation database
    mutation_db = MutationDB(mutation_db, genes)

    # Generate the mutation report
    if output:
        output.write(
            aa_vars.report_dr_mutations(mutation_db, reporting_threshold))
        output.close()
    else:
        click.echo(
            aa_vars.report_dr_mutations(mutation_db, reporting_threshold))
Example #10
def aavar(bam, reference, variants, genes_file, min_freq, mutation_db, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    variants_obj = parse_nt_variants_from_vcf(variants, rs)

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_genes_file(genes_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()

    for gene in genes:
        frames.add(genes[gene]['frame'])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    # Create AAVar collection and print the hmcf file
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency
    aa_vars.filter('mf0.01', 'freq<0.01', True)

    # Build the mutation database and update collection
    if mutation_db is not None:
        mutation_db = MutationDB(mutation_db, genes)
        aa_vars.apply_mutation_db(mutation_db)

    if output:
        output.write(aa_vars.to_hmcf_file(CONFIDENT))
    else:
        click.echo(aa_vars.to_hmcf_file(CONFIDENT))
Example #11
def codonvar(bam, reference, offset, bed4_file, variants, error_rate, output):
    rs = parse_references_from_fasta(reference)
    mapped_read_collection_arr = []

    # Create a MappedReadCollection object
    for r in rs:
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    if variants:
        variants_obj = parse_nt_variants_from_vcf(variants, rs)
    else:
        variants = NTVariantCollection.from_mapped_read_collections(
            error_rate, rs, *mapped_read_collection_arr)
        variants.filter('q30', 'QUAL<30', True)
        variants.filter('ac5', 'AC<5', True)
        variants.filter('dp100', 'DP<100', True)
        variants_obj = variants

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()

    for gene in genes:
        frames.add(genes[gene]['frame'])

    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    codon_variants = CodonVariantCollection.from_aacensus(aa_census)

    if output:
        output.write(codon_variants.to_csv_file(offset))
        output.close()
    else:
        click.echo(codon_variants.to_csv_file(offset))
Example #12
    def setup_class(self):
        self.reference = TEST_PATH + "/data/hxb2_pol.fas"
        self.references = parse_references_from_fasta(self.reference)
        self.variant_collection = CodonVariantCollection(self.references)
        self.offset = 1269

        self.variant_collection.variants['gag']['3']['aTa'] = CodonVariant(
            chrom="hxb2_pol",
            pos=1,
            gene="gag",
            nt_start_gene=1309,
            nt_end_gene=2841,
            nt_start=2077,
            nt_end=2079,
            ref_codon="ata",
            mutant_codon="aTa",
            ref_aa="I",
            mutant_aa="K",
            coverage=563,
            mutant_freq=1.60,
            mutant_type="S",
            ns_count=1.0000,
            s_count=1.5000)
        self.variant_collection.variants['tat']['10']['aAa'] = CodonVariant(
            chrom="hxb2_pol",
            pos=2,
            gene="tat",
            nt_start_gene=3309,
            nt_end_gene=4841,
            nt_start=4000,
            nt_end=4002,
            ref_codon="ata",
            mutant_codon="aAa",
            ref_aa="I",
            mutant_aa="K",
            coverage=563,
            mutant_freq=1.60,
            mutant_type="S",
            ns_count=1.0000,
            s_count=1.5000)
Example #13
    def setup(self):
        reference = TEST_PATH + "/data/hxb2_pol.fas"
        bam = TEST_PATH + "/data/align.bam"
        genes_file = TEST_PATH + "/data/hxb2_pol.bed"
        mutation_db = TEST_PATH + "/data/mutation_db.tsv"
        min_freq = 0.01

        rs = parse_references_from_fasta(reference)

        mapped_read_collection_arr = []
        for r in rs:
            # Create a MappedReadCollection object
            mapped_read_collection_arr.append(
                parse_mapped_reads_from_bam(r, bam))

        variants_obj = parse_nt_variants_from_vcf(VARIANTS_FILE, rs)

        # Mask the unconfident differences
        for mrc in mapped_read_collection_arr:
            mrc.mask_unconfident_differences(variants_obj)

        # Parse the genes from the gene file
        genes = parse_genes_file(genes_file, rs[0].name)

        # Determine which frames our genes are in
        frames = set()

        for gene in genes:
            frames.add(genes[gene]['frame'])

        # Create an AACensus object
        aa_census = AACensus(reference, mapped_read_collection_arr, genes,
                             frames)

        # Find the AA mutations
        self.aa_collection = AAVariantCollection.from_aacensus(aa_census)

        # Build the mutation database
        self.mutation_db = MutationDB(mutation_db, genes)
Example #14
    def setup_class(self):
        reference = TEST_PATH + "/data/hxb2_pol.fas"
        bam = TEST_PATH + "/data/align.bam"
        BED4_file = TEST_PATH + "/data/hxb2_pol.bed"

        rs = parse_references_from_fasta(reference)

        mapped_read_collection_arr = []
        for r in rs:
            # create MappedReadCollection object
            mapped_read_collection_arr.append(
                parse_mapped_reads_from_bam(r, bam))

        genes = parse_BED4_file(BED4_file, rs[0].name)

        # Determine which frames our genes are in
        self.frames = set()

        for gene in genes:
            self.frames.add(genes[gene]["frame"])

        self.aa_census = AACensus(reference, mapped_read_collection_arr, genes,
                                  self.frames)
Example #15
    def setup(self):
        bam = TEST_PATH + "/data/align.bam"
        reference = TEST_PATH + "/data/hxb2_pol.fas"
        genes_file = TEST_PATH + "/data/hxb2_pol.bed"
        error_rate = 0.0038

        rs = parse_references_from_fasta(reference)
        mapped_read_collection_arr = []

        # Create a MappedReadCollection object
        for r in rs:
            mapped_read_collection_arr.append(
                parse_mapped_reads_from_bam(r, bam))

        variants = NTVariantCollection.from_mapped_read_collections(
            error_rate, rs, *mapped_read_collection_arr)
        variants.filter('q30', 'QUAL<30', True)
        variants.filter('ac5', 'AC<5', True)
        variants.filter('dp100', 'DP<100', True)

        # Mask the unconfident differences
        for mrc in mapped_read_collection_arr:
            mrc.mask_unconfident_differences(variants)

        # Parse the genes from the gene file
        genes = parse_genes_file(genes_file, rs[0].name)

        # Determine which frames our genes are in
        frames = set()

        for gene in genes:
            frames.add(genes[gene]['frame'])

        aa_census = AACensus(reference, mapped_read_collection_arr, genes,
                             frames)

        self.codon_variants = CodonVariantCollection.from_aacensus(aa_census)
Example #16
    def __init__(self, id, output_dir, reads, reference, BED4_file,
                 mutation_db, quiet, consensus_pct):
        self.id = id
        self.output_dir = output_dir
        self.reads = reads
        self.reference = reference
        self.mutation_db = mutation_db
        self.BED4_file = BED4_file

        self.quiet = quiet
        self.consensus_pct = consensus_pct

        self.input_size = 0
        self.determine_input_size()

        self.references = parse_references_from_fasta(self.reference)
        self.genes = parse_BED4_file(BED4_file, self.references[0].name)

        self.quality = QualityControl()

        if not os.path.isdir(output_dir):
            os.mkdir(output_dir)

        self.filtered_reads_dir = "%s/filtered.fastq" % output_dir
Example #17
    def test_valid_vcf_file(self):
        """Tests to ensure that valid vcf files are parsed properly."""

        reference = TEST_PATH + "/data/hxb2_pol.fas"
        bam = TEST_PATH + "/data/align.bam"

        rs = parse_references_from_fasta(reference)

        mapped_read_collection_arr = []
        for r in rs:
            # Create a MappedReadCollection object
            mapped_read_collection_arr.append(
                parse_mapped_reads_from_bam(r, bam))

        variants_obj = NTVariantCollection(rs)

        for i in range(0, 20):
            variant = NTVariant(chrom="hxb2_pol",
                                pos=i,
                                id=".",
                                ref='a',
                                alt='t',
                                qual="50",
                                filter="PASS",
                                info={
                                    "DP": "300",
                                    "AC": "1",
                                    "AF": "0.0025"
                                })

            variants_obj.variants["hxb2_pol"][i]['t'] = variant

        # Create a valid vcf file
        valid_vcf_file = TEST_PATH + "/data/valid_vcf_file.vcf"

        with open(valid_vcf_file, "w+") as f:
            f.write(
                "##fileformat=VCFv4.2\n"
                "##fileDate=20171005\n"
                "##source=quasitools\n"
                "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n"
                "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele Count\">\n"
                "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">\n"
                "##FILTER=<ID=q30,Description=\"Quality below 30\">\n"
                "##FILTER=<ID=dp100,Description=\"Read depth below 100\">\n"
                "##FILTER=<ID=ac5,Description=\"Allele count below 5\">\n"
                "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO")

            for rid in variants_obj.variants:
                for pos in variants_obj.variants[rid]:
                    for alt in variants_obj.variants[rid][pos]:
                        variant = variants_obj.variants[rid][pos][alt]
                        f.write("\n%s\t%i\t%s\t%s\t%s\t%s\t%s" %
                                (variant.chrom, int(
                                    variant.pos), variant.id, variant.ref,
                                 variant.alt, variant.qual, variant.filter))
                        f.write(
                            "\tDP=%i;AC=%i;AF=%0.4f" %
                            (int(variant.info["DP"]), int(variant.info["AC"]),
                             float(variant.info["AF"])))

        parsed_nt_var = parse_nt_variants_from_vcf(valid_vcf_file, rs)

        # Check equality of parsed NTVariantCollection vs. the valid NTVariantCollection
        for rid in parsed_nt_var.variants:
            for pos in parsed_nt_var.variants[rid]:
                for alt in parsed_nt_var.variants[rid][pos]:
                    parsed_variant = parsed_nt_var.variants[rid][pos][alt]
                    variant = variants_obj.variants[rid][pos][alt]

                    assert parsed_variant.chrom == variant.chrom
                    assert parsed_variant.pos == variant.pos
                    assert parsed_variant.id == variant.id
                    assert parsed_variant.ref == variant.ref
                    assert parsed_variant.alt == variant.alt
                    assert parsed_variant.qual == variant.qual
                    assert parsed_variant.filter == variant.filter
                    assert parsed_variant.info["DP"] == variant.info["DP"]
                    assert parsed_variant.info["AC"] == variant.info["AC"]
                    assert parsed_variant.info["AF"] == variant.info["AF"]

        os.remove(valid_vcf_file)
Example #18
    def setup_class(self):
        self.bam1 = TEST_PATH + '/data/quasi1.bam'
        self.bam2 = TEST_PATH + '/data/quasi2.bam'
        self.test_cp_files = (self.bam1, self.bam2)
        self.test_cp_ref = TEST_PATH + '/data/hxb2_pol.fas'
        self.references = parse_references_from_fasta(self.test_cp_ref)
Example #19
def dist(ctx, reference, bam, normalize, output_distance, startpos, endpos,
         output, no_coverage):
    """
    dist - Performs the main part of the program

    INPUT:
        [CONTEXT] [ctx]
        [FASTA FILE LOCATION] [reference]
        [BAM FILE LOCATION] [bam]
        [BOOL] [normalize/dont_normalize]
        [BOOL] [output_distance/output_similarity]
        [INT] [startpos]
        [INT] [endpos]
        [STRING] [output]
            Output the CSV-formatted matrix output in a file
            instead of in the terminal.
        [STRING] [truncate/remove_no_coverage/keep_no_coverage]
            Options to truncate low-coverage regions on the ends of the pileup,
            ignore all low coverage regions, or keep all low coverage regions

    RETURN:
        None.

    POST:
        The distance matrix is printed out unless an error message was raised.

    """

    if len(bam) < 2:
        raise click.UsageError("At least two bam file locations are required" +
                               " to perform quasispecies distance comparison")
    # Indicate if the start or end position is < 1 or otherwise invalid.
    if isinstance(startpos, int) and startpos < 1:
        raise click.UsageError("Start position must be >= 1.")
    if isinstance(endpos, int) and endpos < 1:
        raise click.UsageError("End position must be >= 1.")
    if (isinstance(startpos, int) and isinstance(endpos, int)
            and startpos > endpos):
        raise click.UsageError("Start position must be <= end position.")

    # Build the reference object.
    references = parse_references_from_fasta(reference)

    pileups = Pileup_List.construct_pileup_list(bam, references)

    if startpos is None:
        startpos = 1
    if endpos is None:
        endpos = pileups.get_pileup_length()

    if pileups.get_pileup_length() == 0:
        raise click.UsageError("Empty pileup was produced from BAM files." +
                               "Halting program")

    click.echo("The start position is %d." % startpos)
    click.echo("The end position is %d." % endpos)
    click.echo("Constructed pileup from reference.")
    # Echo the number of positions in the pileup.
    click.echo("The pileup covers %d positions before modifications." %
               pileups.get_pileup_length())

    # indicate whether the user-specified start and end position is out
    # of bounds (comparing to actual number of positions in pileup)
    if startpos > pileups.get_pileup_length():
        raise click.UsageError("Start position must be less than or" +
                               " equal to the number of nucleotide base " +
                               "positions in pileup (%s)." %
                               pileups.get_pileup_length())
    if endpos > pileups.get_pileup_length():
        raise click.UsageError("End position must be less than or equal to " +
                               "the number of nucleotide base positions in " +
                               "pileup (%s)." % pileups.get_pileup_length())

    # we convert the start and end positions from one-based indexing to
    # zero-based indexing which is expected by distance.py and pileup.py
    startpos -= 1
    endpos -= 1

    # If there are no errors so far, proceed with running the program.
    modified = modify_pileups(ctx, normalize, startpos, endpos, no_coverage,
                              pileups)

    if no_coverage != 'keep_no_coverage' and len(modified) == 0:
        raise click.UsageError("Entire pileup was truncated due to " +
                               "lack of coverage. Halting program.")

    dist = DistanceMatrix(modified, bam)

    if output_distance:
        click.echo("Outputting an angular cosine distance matrix.")
        if output:
            output.write(dist.get_distance_matrix_as_csv())
        else:
            click.echo(dist.get_distance_matrix_as_csv())

    else:
        click.echo("Outputting a cosine similarity matrix.")
        if output:
            output.write(dist.get_similarity_matrix_as_csv())
        else:
            click.echo(dist.get_similarity_matrix_as_csv())
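The start/end handling in this example converts the user's one-based, inclusive positions to the zero-based indexing expected by distance.py and pileup.py. A quick sanity check of that conversion with made-up positions:

# Made-up one-based, inclusive user input.
startpos, endpos = 1, 10

# Convert to zero-based indexing, as the example does.
startpos -= 1
endpos -= 1

positions = list(range(100))             # stand-in for pileup positions
window = positions[startpos:endpos + 1]  # inclusive of endpos
print(len(window))                       # 10 positions, as requested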
Example #20
def aavar(bam, reference, bed4_file, variants, mutation_db, min_freq,
          error_rate, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    if variants:
        variants_obj = parse_nt_variants_from_vcf(variants, rs)
    else:
        variants = NTVariantCollection.from_mapped_read_collections(
            error_rate, rs, *mapped_read_collection_arr)
        variants.filter('q30', 'QUAL<30', True)
        variants.filter('ac5', 'AC<5', True)
        variants.filter('dp100', 'DP<100', True)
        variants_obj = variants

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()

    for gene in genes:
        frames.add(genes[gene]['frame'])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    # Create AAVar collection and print the aavf file
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency
    aa_vars.filter('mf0.01', 'freq<0.01', True)

    # Build the mutation database and update collection
    if mutation_db is not None:
        mutation_db = MutationDB(mutation_db, genes)
        aa_vars.apply_mutation_db(mutation_db)

    aavf_obj = aa_vars.to_aavf_obj("aavar", os.path.basename(reference),
                                   CONFIDENT)
    records = list(aavf_obj)

    if output:
        writer = parser.Writer(output, aavf_obj)
    else:
        writer = parser.Writer(sys.stdout, aavf_obj)

    for record in records:
        writer.write_record(record)

    if output:
        output.close()

    writer.close()
Example #21
def bam(reference_location, bam_location, k, haplotype_filter,
        output_location):
    '''
    Reports the per-amplicon (FASTA) or per-k-mer (BAM and reference)
    complexity of a quasispecies pileup, for each k-mer position in the
    reference, using several measures outlined in the following work:

    Gregori, Josep, et al. "Viral quasispecies complexity measures."
    Virology 493 (2016): 227-237.
    '''
    """
    # ========================================================================

    BAM COMPLEXITY


    PURPOSE
    -------

    Create a report of the k-mer complexity of the pileup, for each k-mer
    position in the reference.


    INPUT
    -----

    [(BAM) FILE LOCATION] [bam_location]
        The file location of a bam file.
    [(REFERENCE) FILE LOCATION] [reference_location]
        The file location of the reference file.
    [INT] k
        Provides the sequence length for our reads from a given starting
        position.
    [FLOAT] haplotype_filter:
        User-defined filter between 0 and 100; haplotypes below this
        percentage will be removed from each positional list. Default is
        0 (i.e., no filtering).

    [(OUTPUT) FILE LOCATION] [output_location]
        The location of the output file.

    RETURN
    ------

    [NONE]


    POST
    ----

    The complexity computation will be completed and the results will be
    stored in a CSV file or written to stdout.

    # ========================================================================
    """
    k = int(k)

    references = parse_references_from_fasta(reference_location)
    # A list where each position contains a list of haplotypes of length k
    # starting at that position in the reference.
    haplotype_list = parse_haplotypes_from_bam(references, reference_location,
                                               bam_location, k)

    measurements_list = []

    for i in range(len(haplotype_list)):

        haplotypes = haplotype_list[i]
        # Remove haplotypes below threshold.

        # Get total number of haplotypes for each position.
        total_haplotypes = haplotype.calculate_total_clones(haplotypes)

        # Add haplotypes within threshold to new haplotypes list
        haplotypes_within_filter = [
            hap for hap in haplotypes
            if (float(hap.count) / float(total_haplotypes) *
                100) >= haplotype_filter
        ]

        measurements = measure_complexity(haplotypes_within_filter)
        measurements_list.append(measurements)

    # If output_location is specified, open it as complexity_file;
    # otherwise complexity_file is sys.stdout.
    with open(output_location, 'w') if output_location else sys.stdout as \
            complexity_file:
        measurement_to_csv(measurements_list, complexity_file)
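The list comprehension above keeps a haplotype only when its count is at least haplotype_filter percent of the positional total. A worked check of that predicate with made-up counts:

# Made-up counts to illustrate the percentage filter.
total_haplotypes = 200
haplotype_filter = 1.0  # percent

# A haplotype seen once is 0.5% of the pileup and is dropped;
# one seen five times is 2.5% and is kept.
for count in (1, 5):
    kept = (float(count) / float(total_haplotypes) * 100) >= haplotype_filter
    print(count, kept)  # 1 False, then 5 True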
Example #22
    def test_valid_csv_file(self):
        """Tests to make sure that a valid codon variant csv file is properly
        parsed into a CodonVariantCollection object.
        """

        reference = TEST_PATH + "/data/hxb2_pol.fas"
        rs = parse_references_from_fasta(reference)

        var_obj = CodonVariantCollection(rs)

        for i in range(0, 30):
            variant = CodonVariant(chrom="hxb2_pol",
                                   pos=i,
                                   gene="gag",
                                   nt_start_gene=1309 + i,
                                   nt_end_gene=2841 + i,
                                   nt_start=2077 + i,
                                   nt_end=2079 + i,
                                   ref_codon="ata",
                                   mutant_codon="aAa",
                                   ref_aa="I",
                                   mutant_aa="K",
                                   coverage=563 + i,
                                   mutant_freq=1.60 + i,
                                   mutant_type="S",
                                   ns_count=1.0000,
                                   s_count=1.5000)

            pos = int(variant.nt_start) - int(variant.nt_start_gene)
            var_obj.variants["gag"][pos]["aAa"] = variant

        valid_csv = TEST_PATH + "/data/valid_csv.csv"

        with open(valid_csv, "w+") as f:
            f.write("#gene,nt position (gene),nt start position,"
                    "nt end position,ref codon,mutant codon,ref AA,mutant AA,"
                    "coverage,mutant frequency,mutant type,NS count,S count")

            for gene in var_obj.variants:
                for pos in var_obj.variants[gene]:
                    for codon in var_obj.variants[gene][pos]:
                        variant = var_obj.variants[gene][pos][codon]

                        f.write(
                            "%s,%i-%i,%i,%i,%s,%s,%s,%s,%i,%.2f,%s,%0.4f,%0.4f\n"
                            % (variant.gene, variant.nt_start_gene,
                               variant.nt_end_gene, variant.nt_start,
                               variant.nt_end, variant.ref_codon,
                               variant.mutant_codon, variant.ref_aa,
                               variant.mutant_aa, variant.coverage,
                               variant.mutant_freq, variant.mutant_type,
                               variant.ns_count, variant.s_count))

        parsed_codon_variants = parse_codon_variants(valid_csv, rs)

        for gene in parsed_codon_variants.variants:
            for pos in parsed_codon_variants.variants[gene]:
                for codon in parsed_codon_variants.variants[gene][pos]:
                    parsed_variant = \
                        parsed_codon_variants.variants[gene][pos][codon]
                    variant = var_obj.variants[gene][pos][codon]

                    assert parsed_variant.chrom == variant.chrom
                    assert parsed_variant.nt_start_gene == variant.nt_start_gene
                    assert parsed_variant.nt_end_gene == variant.nt_end_gene
                    assert parsed_variant.nt_start == variant.nt_start
                    assert parsed_variant.nt_end == variant.nt_end
                    assert parsed_variant.ref_codon == variant.ref_codon
                    assert parsed_variant.mutant_codon == variant.mutant_codon
                    assert parsed_variant.ref_aa == variant.ref_aa
                    assert parsed_variant.mutant_aa == variant.mutant_aa
                    assert parsed_variant.coverage == variant.coverage
                    assert parsed_variant.mutant_freq == variant.mutant_freq
                    assert parsed_variant.mutant_type == variant.mutant_type
                    assert parsed_variant.ns_count == variant.ns_count
                    assert parsed_variant.s_count == variant.s_count

        os.remove(valid_csv)