Esempio n. 1
0
def generate_runlength_frequency_matrix(runlength_ref_sequence_path, assembly_vs_ref_bam_path,
                                        runlength_ref_sequences, runlength_read_data):
    """
    Lazily produce one matrix of true vs observed run lengths per reference chromosome.

    The alignment of RLE sequences (BAM) is combined with the run lengths that were excluded from the BAM. For every
    chromosome name in `runlength_ref_sequences` a zeroed float64 array of shape
    [2, 4, MAX_RUNLENGTH+1, MAX_RUNLENGTH+1] is allocated and handed to `parse_reads` along with the reads fetched
    for that chromosome (presumably `parse_reads` accumulates into the array in place).

    :param runlength_ref_sequence_path: path handed to FastaHandler (the run-length encoded reference)
    :param assembly_vs_ref_bam_path: path handed to BamHandler (alignments against that reference)
    :param runlength_ref_sequences: mapping of chromosome name -> record that is indexed with LENGTHS
    :param runlength_read_data: forwarded untouched to `parse_reads`
    :return: generator of (chromosome_name, matrix) tuples; a chromosome for which `parse_reads` reports no reads is
    not yielded and a message is written to stderr instead
    """
    for chromosome_name in runlength_ref_sequences:
        matrix = numpy.zeros([2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1], dtype=numpy.float64)

        # Fresh handlers are opened for every chromosome
        bam_handler = BamHandler(assembly_vs_ref_bam_path)
        fasta_handler = FastaHandler(runlength_ref_sequence_path)

        # Request reads over the full extent of the chromosome
        reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                      start=0,
                                      stop=fasta_handler.get_chr_sequence_length(chromosome_name))

        ref_runlengths = runlength_ref_sequences[chromosome_name][LENGTHS]

        n_reads = parse_reads(chromosome_name=chromosome_name,
                              fasta_handler=fasta_handler,
                              reads=reads,
                              complete_ref_runlengths=ref_runlengths,
                              runlength_read_data=runlength_read_data,
                              matrix=matrix)

        if n_reads <= 0:
            sys.stderr.write("No reads found for chromosome: %s\n" % chromosome_name)
            continue

        yield (chromosome_name, matrix)
def get_aligned_contig_lengths(bam_path, aligned_assembly_contigs):
    """
    Collect the reference-aligned length of every usable alignment in a BAM and store the result, longest first.

    An alignment is used only if it is not secondary and its mapping quality is greater than 5. Each used alignment
    is also printed to stdout as "<read_id> <ref_length>".

    :param bam_path: path to the BAM file; also used as the key under which the result is stored
    :param aligned_assembly_contigs: dict-like container, updated in place with
    bam_path -> list of [read_id, ref_length] sorted by descending length
    :return: None (the result is written into `aligned_assembly_contigs`)
    """
    bam_handler = BamHandler(bam_file_path=bam_path)

    # No chromosome/start/stop restriction is passed to the handler
    reads = bam_handler.get_reads(chromosome_name=None, start=None, stop=None)

    aligned_lengths = list()

    for read in reads:
        # Skip secondary alignments and low-confidence mappings
        if read.is_secondary or read.mapping_quality <= 5:
            continue

        read_id = read.query_name

        # Span covered on the reference, from alignment start to the stop position computed by the helper
        ref_length = get_read_stop_position(read) - read.reference_start

        aligned_lengths.append([read_id, ref_length])

        print(read_id, ref_length)

    # Longest alignments first; the sort is stable, so ties keep BAM iteration order
    aligned_lengths.sort(key=lambda x: x[LENGTH], reverse=True)

    aligned_assembly_contigs[bam_path] = aligned_lengths
Esempio n. 3
0
def process_bam(bam_path, reference_path):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a plot of alignments.

    For each reference contig (currently only the hard-coded name "gi") the per-read summary rows returned by
    `parse_reads` are printed, an alignment-length-weighted identity is computed, and `plot_contigs` is called with
    "plots/" as the output directory.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :return: None
    """
    print("\n" + bam_path + "\n")

    output_dir = "plots/"
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = ["gi"]

    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        # Fetch reads across the full length of the contig
        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=chromosome_length)

        read_data = parse_reads(reads=reads, fasta_handler=fasta_handler, chromosome_name=chromosome_name)

        print("chromosome_name:\t", chromosome_name)
        print("chromosome_length:\t", chromosome_length)
        for data in read_data:
            # Each row must hold exactly these 11 fields; read_length and contig_length are not reported
            read_id, reversal_status, ref_alignment_start, alignment_length, read_length, contig_length, \
                n_initial_clipped_bases, n_total_mismatches, n_total_deletes, n_total_inserts, identity = data
            print()
            print(read_id)
            print("reversed:\t", reversal_status)
            print("alignment_start:\t", ref_alignment_start)
            print("alignment_length:\t", alignment_length)
            print("n_initial_clipped_bases:", n_initial_clipped_bases)
            print("n_total_mismatches:\t", n_total_mismatches)
            print("n_total_deletes:\t", n_total_deletes)
            print("n_total_inserts:\t", n_total_inserts)
            print("identity:\t", identity)

        total_weighted_identity = sum(x[ALIGNMENT_LENGTH] * x[IDENTITY] for x in read_data)
        total_alignment_bases = sum(x[ALIGNMENT_LENGTH] for x in read_data)

        # With no aligned bases (e.g. no reads for this contig) the denominator would be zero; clamp it so the
        # identity degrades to 0 instead of raising ZeroDivisionError
        total_identity = total_weighted_identity / max(1e-9, total_alignment_bases)

        print("\nTOTAL IDENTITY:\t", total_identity)

        plot_contigs(output_dir=output_dir,
                     read_data=read_data,
                     chromosome_name=chromosome_name,
                     chromosome_length=chromosome_length,
                     total_identity=total_identity,
                     bam_path=bam_path,
                     y_min=-1,
                     y_max=4,
                     show=False)
Esempio n. 4
0
def generate_runlength_frequency_matrix(runlength_ref_sequence_path,
                                        read_vs_ref_bam_path,
                                        runlength_ref_sequences,
                                        runlength_read_sequences):
    """
    Lazily produce one matrix of true vs observed run lengths per reference chromosome.

    The alignment of RLE sequences (BAM) is combined with the run lengths that were excluded from the BAM. For every
    chromosome name in `runlength_ref_sequences` a zeroed float64 array of shape
    [2, 4, MAX_RUNLENGTH+1, MAX_RUNLENGTH+1] is allocated and handed to `parse_reads` along with the reads fetched
    for that chromosome (presumably `parse_reads` accumulates into the array in place).

    :param runlength_ref_sequence_path: path handed to FastaHandler (the run-length encoded reference)
    :param read_vs_ref_bam_path: path handed to BamHandler (reads aligned against that reference)
    :param runlength_ref_sequences: mapping of chromosome name -> record that is indexed with LENGTHS
    :param runlength_read_sequences: forwarded untouched to `parse_reads`
    :return: generator of (chromosome_name, matrix) tuples; a chromosome for which `parse_reads` reports no reads is
    not yielded and a message is written to stderr instead
    """
    for chromosome_name in runlength_ref_sequences:
        matrix = numpy.zeros([2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1], dtype=numpy.float64)

        # Fresh handlers are opened for every chromosome
        bam_handler = BamHandler(read_vs_ref_bam_path)
        fasta_handler = FastaHandler(runlength_ref_sequence_path)

        # No coordinate window is given (start/stop are None)
        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=None, stop=None)

        ref_runlengths = runlength_ref_sequences[chromosome_name][LENGTHS]

        n_reads = parse_reads(chromosome_name=chromosome_name,
                              fasta_handler=fasta_handler,
                              reads=reads,
                              complete_ref_runlengths=ref_runlengths,
                              runlength_read_sequences=runlength_read_sequences,
                              matrix=matrix)

        if n_reads <= 0:
            sys.stderr.write("No reads found for chromosome: %s\n" % chromosome_name)
            continue

        yield (chromosome_name, matrix)
Esempio n. 5
0
def process_bam(bam_path, reference_path, output_dir=None):
    """
    Extract inserts, deletes and mismatches per reference contig from a BAM and export them to CSV.

    Output goes into a new timestamped subdirectory ("variants_<datetime>") of `output_dir`.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: parent directory for the timestamped output folder; "variants/" when None
    :return: None
    """
    print("\n" + bam_path)

    parent_dir = "variants/" if output_dir is None else output_dir

    # Every run writes into its own timestamped subdirectory
    output_dir = os.path.join(parent_dir, "variants_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = sort_chromosome_names(names=fasta_handler.get_contig_names(), prefix="chr")

    print("ref contig names:", chromosome_names)

    for chromosome_name in chromosome_names:
        print("Parsing alignments for ref contig:", chromosome_name)

        # Fetch reads across the full length of the contig
        reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                      start=0,
                                      stop=fasta_handler.get_chr_sequence_length(chromosome_name))

        inserts, deletes, mismatches = parse_reads(reads=reads,
                                                   fasta_handler=fasta_handler,
                                                   chromosome_name=chromosome_name)

        export_variants_to_csv(output_dir=output_dir,
                               chromosome_name=chromosome_name,
                               mismatches=mismatches,
                               inserts=inserts,
                               deletes=deletes,
                               merge=True)
Esempio n. 6
0
def process_bam(bam_path, reference_path, bac_path, output_dir=None):
    """
    Gather per-read alignment data for every reference contig, group it by read, aggregate it and export it to CSV.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param bac_path: FASTA whose contig names are printed for inspection (not otherwise used here)
    :param output_dir: where to save stats; "stats/" when None
    :return: None
    """
    output_dir = "stats/" if output_dir is None else output_dir

    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_handler = FastaHandler(reference_path)
    bac_fasta_handler = FastaHandler(bac_path)

    chromosome_names = ref_fasta_handler.get_contig_names()

    print(chromosome_names)
    print(bac_fasta_handler.get_contig_names())

    # First element of each row from parse_reads (presumably the read/BAC name - confirm) -> list of rows
    data_per_bac = defaultdict(list)

    for chromosome_name in chromosome_names:
        chromosome_length = ref_fasta_handler.get_chr_sequence_length(chromosome_name)

        # Both handlers are re-opened for each contig
        ref_fasta_handler = FastaHandler(reference_file_path=reference_path)
        bam_handler = BamHandler(bam_file_path=bam_path)

        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=chromosome_length)

        rows = parse_reads(reads=reads,
                           fasta_handler=ref_fasta_handler,
                           chromosome_name=chromosome_name)

        for row in rows:
            # Prefix each row with the contig it was aligned to before grouping by its first field
            data_per_bac[row[0]].append([chromosome_name] + row)

    aggregated_data = aggregate_bac_data(data_per_bac)

    export_bac_data_to_csv(read_data=aggregated_data,
                           output_dir=output_dir,
                           bam_path=bam_path)
Esempio n. 7
0
def main(bam_file_path, cutoff, contig_name):
    """
    Tally run lengths per character over the sequences of the first `cutoff` usable reads of a BAM and print them.

    A read is usable when its mapping quality is above 5 and it is not secondary, unmapped or QC-failed.
    Progress is reported on stderr; the result is printed to stdout as a ">character" header followed by
    "length count" lines in ascending length order.

    :param bam_file_path: path to the BAM file to read
    :param cutoff: maximum number of usable reads to process
    :param contig_name: chromosome/contig name passed to the BAM handler
    :return: None
    """
    reads = BamHandler(bam_file_path).get_reads(chromosome_name=contig_name, start=None, stop=None)

    # character -> Counter (filled from count_runlength_per_character results)
    all_counts = defaultdict(Counter)

    sys.stderr.write("reading file...\n")
    sys.stderr.flush()

    n_usable = 0
    for read in reads:
        is_usable = read.mapping_quality > 5 and not (read.is_secondary or read.is_unmapped or read.is_qcfail)
        if not is_usable:
            continue

        n_usable += 1

        if n_usable % 100 == 0:
            sys.stderr.write("\rParsed %d reads" % n_usable)

        # Stop once more than `cutoff` usable reads have been seen; this read is not processed
        if n_usable > cutoff:
            break

        character_counts = count_runlength_per_character(read.query_sequence)

        for character, counts in character_counts.items():
            all_counts[character] += counts

    sys.stderr.write("\n")

    for character in sorted(all_counts):
        print(">%s" % character)

        length_counts = all_counts[character]
        for length in sorted(length_counts):
            print(length, length_counts[length])
Esempio n. 8
0
def generate_runlength_frequency_matrix(runlength_ref_sequence_path,
                                        assembly_vs_ref_bam_path,
                                        runlength_ref_sequences,
                                        runlength_assembly_sequences):
    """
    Lazily produce one matrix of true vs observed run lengths per reference chromosome.

    The alignment of RLE sequences (BAM) is combined with the run lengths that were excluded from the BAM. For every
    chromosome name in `runlength_ref_sequences` a zeroed float64 array of shape
    [2, 4, MAX_RUNLENGTH+1, MAX_RUNLENGTH+1] is allocated and handed to `parse_reads` along with the reads fetched
    for that chromosome (presumably `parse_reads` accumulates into the array in place).

    :param runlength_ref_sequence_path: path handed to FastaHandler (the run-length encoded reference)
    :param assembly_vs_ref_bam_path: path handed to BamHandler (assembly aligned against that reference)
    :param runlength_ref_sequences: mapping of chromosome name -> record that is indexed with LENGTHS
    :param runlength_assembly_sequences: forwarded untouched to `parse_reads`
    :return: generator yielding one matrix per chromosome, in iteration order of `runlength_ref_sequences`; a matrix
    is yielded for every chromosome regardless of what `parse_reads` returns
    """
    for chromosome_name in runlength_ref_sequences:
        matrix = numpy.zeros([2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1], dtype=numpy.float64)

        # Fresh handlers are opened for every chromosome
        bam_handler = BamHandler(assembly_vs_ref_bam_path)
        fasta_handler = FastaHandler(runlength_ref_sequence_path)

        # Request reads over the full extent of the chromosome
        reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                      start=0,
                                      stop=fasta_handler.get_chr_sequence_length(chromosome_name))

        ref_runlengths = runlength_ref_sequences[chromosome_name][LENGTHS]

        # The return value is deliberately ignored in this variant
        parse_reads(chromosome_name=chromosome_name,
                    fasta_handler=fasta_handler,
                    reads=reads,
                    complete_ref_runlengths=ref_runlengths,
                    runlength_assembly_sequences=runlength_assembly_sequences,
                    matrix=matrix)

        yield matrix
Esempio n. 9
0
def parse_bam(bam_path, reference_path):
    """
    Walk a BAM contig by contig and accumulate summary statistics over all of its alignments.

    The running totals are threaded through `count_cigar_operations`: each call receives the current values and
    returns the updated ones.

    :param bam_path: path to the BAM file
    :param reference_path: path to the reference FASTA; supplies the contig names and lengths
    :return: tuple of (chromosomal_cigar_counts, n_alignments, n_primary, n_supplementary, n_secondary,
    map_qualities)
    """
    fasta_handler = FastaHandler(reference_path)
    chromosome_names = fasta_handler.get_contig_names()

    # Nested counter: a missing key at either level starts at 0
    cigar_counts = defaultdict(lambda: defaultdict(int))

    n_alignments = n_primary = n_supplementary = n_secondary = 0

    map_qualities = IterativeHistogram(start=0, stop=60, n_bins=6)

    for chromosome_name in chromosome_names:
        # A fresh BAM handler is opened for every contig
        bam_handler = BamHandler(bam_path)

        reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                      start=0,
                                      stop=fasta_handler.get_chr_sequence_length(chromosome_name))

        (cigar_counts,
         n_alignments,
         n_primary,
         n_supplementary,
         n_secondary,
         map_qualities) = count_cigar_operations(reads=reads,
                                                 chromosome_name=chromosome_name,
                                                 chromosomal_cigar_counts=cigar_counts,
                                                 n_alignments=n_alignments,
                                                 n_primary=n_primary,
                                                 n_supplementary=n_supplementary,
                                                 n_secondary=n_secondary,
                                                 map_qualities=map_qualities)

    return cigar_counts, n_alignments, n_primary, n_supplementary, n_secondary, map_qualities
Esempio n. 10
0
def get_chromosome_stats(genome_data, reference_path, chromosome_name, start,
                         stop, output_dir, bam_path):
    """
    Parse the reads of one chromosome region, record its summary and export it to CSV.

    :param genome_data: list-like accumulator; the chromosome summary is appended to it in place
    :param reference_path: path to the reference FASTA
    :param chromosome_name: name of the chromosome to process
    :param start: start coordinate passed to the BAM handler
    :param stop: stop coordinate passed to the BAM handler
    :param output_dir: destination directory passed to the CSV export
    :param bam_path: path to the BAM file
    :return: None
    """
    fasta_handler = FastaHandler(reference_file_path=reference_path)
    bam_handler = BamHandler(bam_file_path=bam_path)

    # parse_reads gets the full chromosome length even though reads are fetched only for [start, stop]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    region_reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=start, stop=stop)

    read_data, chromosome_data = parse_reads(reads=region_reads,
                                             chromosome_name=chromosome_name,
                                             chromosome_length=chromosome_length,
                                             fasta_handler=fasta_handler)

    genome_data.append(chromosome_data)

    export_chromosome_summary_to_csv(read_data=read_data,
                                     chromosome_data=chromosome_data,
                                     output_dir=output_dir,
                                     bam_path=bam_path,
                                     chromosome_name=chromosome_name)
Esempio n. 11
0
def get_chromosome_data(bam_path, reference_path, chromosome_name, output_dir,
                        centromere_table_path, gap_table_path,
                        segdup_table_path, genome_data):
    """
    Parse all reads of one chromosome, export a CSV summary and draw the contig plot with optional annotations.

    :param bam_path: path to the BAM file
    :param reference_path: path to the reference FASTA
    :param chromosome_name: name of the chromosome to process
    :param output_dir: destination directory passed to the CSV export and the plot
    :param centromere_table_path: centromere table path, or None to plot without centromere coordinates
    :param gap_table_path: gap table path, or None to plot without gap coordinates
    :param segdup_table_path: segmental duplication table path (read with `read_gap_table` and size_cutoff=10000),
    or None to plot without them
    :param genome_data: list-like accumulator; the chromosome summary is appended to it in place
    :return: None
    """
    fasta_handler = FastaHandler(reference_path)
    bam_handler = BamHandler(bam_file_path=bam_path)

    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    # Fetch reads across the full length of the chromosome
    reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=chromosome_length)

    read_data, chromosome_data = parse_reads(reads=reads,
                                             fasta_handler=fasta_handler,
                                             chromosome_name=chromosome_name,
                                             chromosome_length=chromosome_length)

    genome_data.append(chromosome_data)

    # Alignment-length-weighted identity; the denominator is clamped so that no aligned bases gives 0, not an error
    weighted_identity = sum(x[ALIGNMENT_LENGTH] * x[SEQUENCE_IDENTITY] for x in read_data)
    aligned_bases = sum(x[ALIGNMENT_LENGTH] for x in read_data)
    total_identity = round(weighted_identity / max(1e-9, aligned_bases), 6)

    export_chromosome_summary_to_csv(read_data=read_data,
                                     chromosome_data=chromosome_data,
                                     output_dir=output_dir,
                                     bam_path=bam_path,
                                     chromosome_name=chromosome_name)

    # Each annotation table is optional; a missing path means that annotation is passed to the plot as None
    centromere_coordinates = (
        read_centromere_table(centromere_table_path=centromere_table_path,
                              target_chromosome_name=chromosome_name)
        if centromere_table_path is not None else None)

    gap_coordinates = (
        read_gap_table(table_path=gap_table_path,
                       target_chromosome_name=chromosome_name)
        if gap_table_path is not None else None)

    segdup_coordinates = (
        read_gap_table(table_path=segdup_table_path,
                       target_chromosome_name=chromosome_name,
                       size_cutoff=10000)
        if segdup_table_path is not None else None)

    figure, _ = plot_contigs(output_dir=output_dir,
                             read_data=read_data,
                             chromosome_name=chromosome_name,
                             chromosome_length=chromosome_length,
                             total_identity=total_identity,
                             bam_path=bam_path,
                             centromere_coordinates=centromere_coordinates,
                             gap_coordinates=gap_coordinates,
                             segdup_coordinates=segdup_coordinates,
                             show=False)

    # Close the figure once plot_contigs has returned so it does not stay open
    pyplot.close(figure)