Exemple #1
0
def runlength_encode_parallel(fasta_sequence_path, contig_name,
                              runlength_sequences, min_length):
    """
    Run-length encode a single contig of a FASTA file and store the result in a shared mapping.

    The contig is fetched in full (start=None, stop=None). Consecutive identical characters are collapsed into one
    character, and the number of repeats of each run is recorded in a parallel list.

    :param fasta_sequence_path: path of the FASTA file, passed to FastaHandler
    :param contig_name: name of the contig to fetch and encode
    :param runlength_sequences: dict-like object; on success it receives
        runlength_sequences[contig_name] = (collapsed_sequence_string, list_of_run_lengths)
    :param min_length: contigs whose sequence is shorter than this are skipped and nothing is stored
    :return: None (also returns early, storing nothing, if fetching the sequence raises ValueError)
    """
    fasta_handler = FastaHandler(fasta_sequence_path)

    try:
        sequence = fasta_handler.get_sequence(chromosome_name=contig_name,
                                              start=None,
                                              stop=None)
    except ValueError as e:
        # Report the failure and skip this contig instead of propagating the error to the caller
        print(e)
        print("ERROR: pysam fetch failed on contig: %s" % contig_name)
        return

    if len(sequence) < min_length:
        return

    # Plain Python lists: one entry per run of identical characters
    character_sequence = list()
    character_counts = list()
    current_character = ""

    for character in sequence:
        if character != current_character:
            # A new run starts here
            character_sequence.append(character)
            character_counts.append(1)
        else:
            # Still inside the current run
            character_counts[-1] += 1

        current_character = character

    character_sequence = ''.join(character_sequence)

    runlength_sequences[contig_name] = (character_sequence, character_counts)

    # "\r" rewrites the same terminal line so progress does not scroll
    sys.stderr.write("\rRun length encoded %s            " % contig_name)
Exemple #2
0
def run_parameter_comparison():
    """
    Launch encode_region_parallel over the full length of one contig using hard-coded inputs.

    All paths and the contig name are fixed in the body. The region spans [0, contig length], and the output goes
    to a timestamped "spoa_pileup_generation_anchored_*" directory under "output/".

    :return: None
    """
    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    # -------------------------------------------------------------------------

    windows_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003279.8_0_15072434_2018_10_12_10_58_56_199382"
    chromosome_name = "NC_003279.8"

    # The region to encode is the whole contig
    reference = FastaHandler(reference_file_path)
    whole_contig = [0, reference.get_chr_sequence_length(chromosome_name)]

    run_directory = "spoa_pileup_generation_anchored_" + get_current_timestamp()
    output_dir = os.path.join("output/", run_directory)

    encode_region_parallel(bam_file_path=bam_file_path,
                           reference_file_path=reference_file_path,
                           chromosome_name=chromosome_name,
                           region=whole_contig,
                           window_size=20,
                           output_dir=output_dir,
                           runlength=True,
                           max_threads=30,
                           windows_path=windows_path,
                           sort_sequences_by_length=False,
                           reverse_sort=False,
                           two_pass=True)
Exemple #3
0
def runlength_encode_fasta_parallel(fasta_sequence_path,
                                    max_threads=None,
                                    min_length=0):
    """
    Run-length encode every contig of a FASTA file using a pool of worker processes.

    One runlength_encode_parallel task is submitted per contig; the workers write their results into a shared
    Manager dict which is returned.

    :param fasta_sequence_path: path of the FASTA file to encode
    :param max_threads: maximum number of worker processes; None means max(1, cpu_count() - 2). The value is capped
        at the number of contigs and never drops below 1
    :param min_length: forwarded to each worker; a warning is printed when it is greater than 0
    :return: Manager dict proxy filled in by the workers, keyed by contig name
    """
    if min_length > 0:
        print("WARNING: excluding all sequences less than length %d" %
              min_length)

    fasta_handler = FastaHandler(fasta_sequence_path)

    contig_names = fasta_handler.get_contig_names()

    manager = Manager()
    runlength_sequences = manager.dict()

    # One argument list per contig, in the positional order runlength_encode_parallel expects
    args = [[fasta_sequence_path, contig_name, runlength_sequences, min_length]
            for contig_name in contig_names]

    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    # No point in more workers than tasks, but Pool() raises ValueError for processes < 1, which previously
    # happened whenever the FASTA contained no contigs
    max_threads = max(1, min(max_threads, len(args)))

    with Pool(processes=max_threads, maxtasksperchild=40) as pool:
        pool.starmap(runlength_encode_parallel, args, chunksize=1)

    # Terminate the "\r" progress line written by the workers
    sys.stderr.write("\n")

    return runlength_sequences
Exemple #4
0
def generate_runlength_frequency_matrix(runlength_ref_sequence_path, assembly_vs_ref_bam_path,
                                        runlength_ref_sequences, runlength_read_data):
    """
    Yield one run-length frequency matrix per reference contig that has at least one parsed read.

    For every contig name in runlength_ref_sequences, a zeroed float64 matrix of shape
    [2, 4, MAX_RUNLENGTH+1, MAX_RUNLENGTH+1] is created and handed to parse_reads together with the alignments
    covering the whole contig. Contigs for which parse_reads reports no reads are skipped with a message on stderr.

    :param runlength_ref_sequence_path: path of the run-length encoded reference FASTA
    :param assembly_vs_ref_bam_path: path of the BAM containing alignments against that reference
    :param runlength_ref_sequences: mapping of contig name -> record indexable by LENGTHS
    :param runlength_read_data: forwarded unchanged to parse_reads
    :return: generator of (chromosome_name, matrix) tuples
    """
    for chromosome_name in runlength_ref_sequences:
        side = MAX_RUNLENGTH+1
        matrix = numpy.zeros([2, 4, side, side], dtype=numpy.float64)

        # Fresh handlers are opened for every contig
        bam_handler = BamHandler(assembly_vs_ref_bam_path)
        fasta_handler = FastaHandler(runlength_ref_sequence_path)

        contig_end = fasta_handler.get_chr_sequence_length(chromosome_name)
        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=contig_end)

        ref_runlengths = runlength_ref_sequences[chromosome_name][LENGTHS]

        n_reads = parse_reads(chromosome_name=chromosome_name,
                              fasta_handler=fasta_handler,
                              reads=reads,
                              complete_ref_runlengths=ref_runlengths,
                              runlength_read_data=runlength_read_data,
                              matrix=matrix)

        if not (n_reads > 0):
            sys.stderr.write("No reads found for chromosome: %s\n" % chromosome_name)
            continue

        yield (chromosome_name, matrix)
Exemple #5
0
def main():
    """
    Locate repeats in one hard-coded contig, relate them to a saved set of windows, and plot the results.

    The reference sequence of the contig is scanned with find_repeats (repeat_threshold=1), the repeat positions are
    compared against windows loaded from disk, and the split ratios and the pileups of split repeats are plotted.

    :return: None
    """
    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    # -------------------------------------------------------------------------

    reference = FastaHandler(reference_file_path)

    # Alternative window set (kernel method): "output/window_selection/NC_003279.8_0_15072434_2018_10_1_20_1"
    # The path below is the transition-method window set
    chromosomal_window_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003279.8_0_15072434_2018_10_12_10_58_56_199382"
    chromosome_name = "NC_003279.8"

    # Fetch the whole contig
    contig_end = reference.get_chr_sequence_length(chromosome_name)
    reference_sequence = reference.get_sequence(chromosome_name=chromosome_name,
                                                start=0,
                                                stop=contig_end)

    windows = load_windows(chromosomal_window_path)

    repeat_positions = find_repeats(sequence=reference_sequence, repeat_threshold=1)

    repeat_summary = locate_repeats_in_anchored_windows(windows=windows,
                                                        repeat_positions=repeat_positions)
    split_counts_per_length, split_repeat_windows, unsplit_repeat_windows = repeat_summary

    plot_split_ratios_per_length(split_counts_per_length)
    plot_pileups_for_split_repeats(split_repeat_windows=split_repeat_windows,
                                   bam_file_path=bam_file_path,
                                   reference_file_path=reference_file_path,
                                   chromosome_name=chromosome_name)
Exemple #6
0
def process_bam(bam_path, reference_path):
    """
    Print per-alignment summary data for a BAM of contigs aligned to a reference, then plot the alignments.

    For each reference contig considered (currently only "gi"), the alignments are parsed into records, every record
    is printed, an alignment-length-weighted mean identity is printed, and plot_contigs is called with the output
    directory "plots/". A contig with no aligned bases is reported and skipped, because its weighted identity is
    undefined (this used to raise ZeroDivisionError).

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :return: None
    """
    print("\n" + bam_path + "\n")

    output_dir = "plots/"
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = ["gi"]

    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        # Query the whole contig
        start = 0
        stop = chromosome_length

        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=start, stop=stop)

        read_data = parse_reads(reads=reads, fasta_handler=fasta_handler, chromosome_name=chromosome_name)

        print("chromosome_name:\t", chromosome_name)
        print("chromosome_length:\t", chromosome_length)
        for data in read_data:
            # Each record is an 11-field sequence; unpacking doubles as a check of the record length
            read_id, reversal_status, ref_alignment_start, alignment_length, read_length, contig_length, \
                n_initial_clipped_bases, n_total_mismatches, n_total_deletes, n_total_inserts, identity = data
            print()
            print(read_id)
            print("reversed:\t", reversal_status)
            print("alignment_start:\t", ref_alignment_start)
            print("alignment_length:\t", alignment_length)
            print("n_initial_clipped_bases:", n_initial_clipped_bases)
            print("n_total_mismatches:\t", n_total_mismatches)
            print("n_total_deletes:\t", n_total_deletes)
            print("n_total_inserts:\t", n_total_inserts)
            print("identity:\t", identity)

        total_alignment_bases = sum(x[ALIGNMENT_LENGTH] for x in read_data)

        if total_alignment_bases == 0:
            # Nothing aligned to this contig: the weighted mean below would divide by zero
            print("\nWARNING: no aligned bases found for %s, skipping identity summary and plot" % chromosome_name)
            continue

        # Mean identity over all alignments, weighted by alignment length
        total_weighted_identity = sum(x[ALIGNMENT_LENGTH] * x[IDENTITY] for x in read_data)
        total_identity = total_weighted_identity/total_alignment_bases

        print("\nTOTAL IDENTITY:\t", total_identity)

        plot_contigs(output_dir=output_dir,
                     read_data=read_data,
                     chromosome_name=chromosome_name,
                     chromosome_length=chromosome_length,
                     total_identity=total_identity,
                     bam_path=bam_path,
                     y_min=-1,
                     y_max=4,
                     show=False)
Exemple #7
0
def main():
    """
    Open the hard-coded reference FASTA, print its contig names and lengths, and look up one contig's length.

    Several alternative data sets are kept below as commented-out configuration; only the E. coli paths are active.

    NOTE(review): the visible body ends right after the final length lookup; output_dir, bam_file_path and
    chromosome_length are assigned but not used within it.

    :return: None
    """
    output_dir = os.path.join("output/", "spoa_pileup_generation_" + get_current_timestamp())

    # ---- Illumina (laptop) --------------------------------------------------
    # bam_file_path = "/Users/saureous/data/Platinum/chr1.sorted.bam"
    # reference_file_path = "/Users/saureous/data/Platinum/chr1.fa"
    # vcf_path = "/Users/saureous/data/Platinum/NA12878_S1.genome.vcf.gz"
    # bed_path = "/Users/saureous/data/Platinum/chr1_confident.bed"

    # ---- GIAB (dev machine) -------------------------------------------------
    # bam_file_path = "/home/ryan/data/GIAB/NA12878_GIAB_30x_GRCh37.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh37_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh37.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    # reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    # windows_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003283.11_0_20924180_2018_9_28_10_56"

    # ---- Nanopore GUPPY - E. Coli - (dev machine) -------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli.contigs.fasta.reads.sorted.bam"
    # reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli.contigs.fasta"
    bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli_reads_vs_ref.bam"
    reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"

    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    # Other contig choices used with the data sets above:
    #   "NC_003279.8"   celegans chr1
    #   "NC_003283.11"  celegans chr5
    #   "1", optionally prefixed with "chr"
    chromosome_name = "gi"  # E coli

    # One length per contig, in the same order as contig_names
    lengths = [fasta_handler.get_chr_sequence_length(name) for name in contig_names]

    print('\t'.join(contig_names))
    print('\t\t'.join(str(length) for length in lengths))

    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
Exemple #8
0
def process_bam(bam_path, reference_path, output_dir=None):
    """
    Parse the alignments on every reference contig and export the inserts, deletes and mismatches found to CSV.

    Output is written into a new "variants_<datetime>" subdirectory of output_dir, which is created if needed.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: parent directory for the output subdirectory; defaults to "variants/"
    :return: None
    """
    print("\n" + bam_path)

    parent_dir = "variants/" if output_dir is None else output_dir

    # Every run gets its own timestamped subdirectory
    output_dir = os.path.join(parent_dir, "variants_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = sort_chromosome_names(names=fasta_handler.get_contig_names(),
                                             prefix="chr")

    print("ref contig names:", chromosome_names)

    for chromosome_name in chromosome_names:
        print("Parsing alignments for ref contig:", chromosome_name)

        # Query the whole contig
        contig_end = fasta_handler.get_chr_sequence_length(chromosome_name)
        reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                      start=0,
                                      stop=contig_end)

        variants = parse_reads(reads=reads,
                               fasta_handler=fasta_handler,
                               chromosome_name=chromosome_name)
        inserts, deletes, mismatches = variants

        export_variants_to_csv(output_dir=output_dir,
                               chromosome_name=chromosome_name,
                               mismatches=mismatches,
                               inserts=inserts,
                               deletes=deletes,
                               merge=True)
Exemple #9
0
def process_bam(bam_path, reference_path, max_threads, output_dir=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities/matches/mismatches/indels

    One get_chromosome_stats task is run per reference contig in a process pool; the workers share a Manager list
    (genome_data) which is afterwards passed to export_genome_summary_to_csv.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param max_threads: maximum number of worker processes; None means max(1, cpu_count() - 2). The value is capped
        at the number of contigs and never drops below 1
    :param output_dir: where to save stats; defaults to "stats/"
    :return: None
    """
    if output_dir is None:
        output_dir = "stats/"

    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    process_manager = Manager()
    genome_data = process_manager.list()

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_path)

    chromosome_names = fasta_handler.get_contig_names()

    arguments = list()

    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(
            chromosome_name)

        # Each task covers its whole contig
        start = 0
        stop = chromosome_length

        arguments.append([
            genome_data, reference_path, chromosome_name, start, stop,
            output_dir, bam_path
        ])

    if len(arguments) < max_threads:
        print("Fewer jobs than threads")
        max_threads = len(arguments)

    # Pool() raises ValueError for processes < 1, which previously happened when the reference had no contigs
    max_threads = max(1, max_threads)

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_chromosome_stats, arguments)

    print("genome_data", genome_data)

    export_genome_summary_to_csv(bam_path=bam_path,
                                 output_dir=output_dir,
                                 genome_data=genome_data)
Exemple #10
0
def process_bam(bam_path,
                reference_path,
                output_dir=None,
                centromere_table_path=None,
                gap_table_path=None,
                segdup_table_path=None,
                max_threads=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a plot of alignments

    One get_chromosome_data task is run per reference contig in a process pool; the workers share a Manager list
    (genome_data) which is afterwards passed to export_genome_summary_to_csv.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: where to save plots; defaults to "plots/"
    :param centromere_table_path: forwarded unchanged to get_chromosome_data
    :param gap_table_path: forwarded unchanged to get_chromosome_data
    :param segdup_table_path: forwarded unchanged to get_chromosome_data
    :param max_threads: maximum number of worker processes; None means max(1, cpu_count() - 2). The value is capped
        at the number of contigs and never drops below 1
    :return: None
    """
    print("\n" + bam_path)

    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    if output_dir is None:
        output_dir = "plots/"

    process_manager = Manager()
    genome_data = process_manager.list()

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_path)

    chromosome_names = fasta_handler.get_contig_names()

    # One argument list per contig, in the positional order get_chromosome_data expects
    arguments = [[bam_path, reference_path, chromosome_name, output_dir,
                  centromere_table_path, gap_table_path, segdup_table_path,
                  genome_data]
                 for chromosome_name in chromosome_names]

    # No point in more workers than tasks, but Pool() raises ValueError for processes < 1, which previously
    # happened when the reference had no contigs
    max_threads = max(1, min(max_threads, len(arguments)))

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_chromosome_data, arguments)

    export_genome_summary_to_csv(bam_path=bam_path,
                                 output_dir=output_dir,
                                 genome_data=genome_data)
Exemple #11
0
def main(sequences_path, cutoff):
    """
    Write a shell script with one AssembleSegment.py command per sequence longer than a cutoff.

    The script is written to "assemble_long_segments.sh" in the current working directory (overwriting any existing
    file), and the name and length of every selected sequence are also printed.

    :param sequences_path: path of the FASTA file whose sequence names and lengths are inspected
    :param cutoff: only sequences strictly longer than this are written to the script
    :return: None
    """
    fasta = FastaHandler(sequences_path)
    names = fasta.get_contig_names()

    with open("assemble_long_segments.sh", "w") as script:
        for name in names:
            length = fasta.get_chr_sequence_length(name)

            if length > cutoff:
                print(name, length)
                script.write("../build/shasta-install/bin/AssembleSegment.py " +
                             name + "\n")
Exemple #12
0
def generate_window_fasta(bam_file_path,
                          reference_file_path,
                          chromosome_name,
                          window,
                          output_dir,
                          exclude_loose_ends=True):
    """
    Write the aligned read segments and the reference segment of one window to two FASTA files.

    The files are named "<chromosome>_<start>_<end>.fasta" (reads) and "<chromosome>_<start>_<end>_ref.fasta"
    (reference) inside output_dir. The length of every read segment and of the reference segment is printed.

    :param bam_file_path: path of the BAM to read alignments from
    :param reference_file_path: path of the reference FASTA
    :param chromosome_name: contig the window lies on
    :param window: indexable pair; window[0] is the pileup start and window[1] the pileup end
    :param output_dir: directory for the FASTA files, created if needed
    :param exclude_loose_ends: forwarded unchanged to get_aligned_segments
    :return: True if get_aligned_segments returned sequences, False if it returned None for them
    """
    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    window_start, window_end = window[0], window[1]  # add random variation to the end here ?

    segments = get_aligned_segments(fasta_handler=fasta_handler,
                                    bam_handler=bam_handler,
                                    chromosome_name=chromosome_name,
                                    pileup_start=window_start,
                                    pileup_end=window_end,
                                    include_ref=True,
                                    exclude_loose_ends=exclude_loose_ends)
    ref_sequence, read_ids, sequences, reversal_statuses = segments

    # Nothing to write for this window
    if sequences is None:
        return False

    for read_segment in sequences:
        print(len(read_segment))

    print(len(ref_sequence))

    FileManager.ensure_directory_exists(output_dir)

    # Both files share the "<chromosome>_<start>_<end>" stem
    stem = '_'.join([chromosome_name, str(window_start), str(window_end)])

    sequences_output_path = os.path.join(output_dir, stem + ".fasta")
    FastaWriter(sequences_output_path).write_sequences(sequences)

    ref_output_path = os.path.join(output_dir, stem + "_ref" + ".fasta")
    FastaWriter(ref_output_path).write_sequences([ref_sequence])

    return True
def get_contig_lengths(assembly_path, assembly_contigs):
    """
    Collect [name, length] for every contig of an assembly and store the list, longest first, in a shared mapping.

    Contigs are visited in sorted name order, and the sort by length is stable, so contigs of equal length stay in
    name order.

    :param assembly_path: path of the assembly FASTA; also used as the key under which the result is stored
    :param assembly_contigs: dict-like object that receives assembly_contigs[assembly_path] = list of [name, length]
    :return: None
    """
    handler = FastaHandler(assembly_path)

    contigs = [[name, handler.get_chr_sequence_length(name)]
               for name in sorted(handler.get_contig_names())]

    # Longest contig first
    contigs.sort(key=lambda entry: entry[LENGTH], reverse=True)

    print("Assembly parsed: %s" % assembly_path)

    assembly_contigs[assembly_path] = contigs
Exemple #14
0
def process_bam(bam_path, reference_path, bac_path, output_dir=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities/matches/mismatches/indels

    Alignments on every reference contig are parsed into records, the records are grouped by their first field,
    aggregated with aggregate_bac_data and exported with export_bac_data_to_csv.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param bac_path: path of a FASTA file; only its contig names are read here, and printed
    :param output_dir: where to save stats; defaults to "stats/"
    :return: None
    """
    if output_dir is None:
        output_dir = "stats/"

    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_handler = FastaHandler(reference_path)
    bac_fasta_handler = FastaHandler(bac_path)

    chromosome_names = ref_fasta_handler.get_contig_names()
    bac_names = bac_fasta_handler.get_contig_names()

    print(chromosome_names)
    print(bac_names)

    # Records grouped by their first field (presumably the read/BAC name; confirm against parse_reads)
    data_per_bac = defaultdict(list)

    for chromosome_name in chromosome_names:
        chromosome_length = ref_fasta_handler.get_chr_sequence_length(
            chromosome_name)

        # Query the whole contig
        start = 0
        stop = chromosome_length

        # The reference handler opened above is reused; it used to be re-created on every iteration.
        # A fresh BAM handler per contig is kept as in the original, since handler reuse cannot be verified here.
        bam_handler = BamHandler(bam_file_path=bam_path)

        reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                      start=start,
                                      stop=stop)

        read_data = parse_reads(reads=reads,
                                fasta_handler=ref_fasta_handler,
                                chromosome_name=chromosome_name)

        for data in read_data:
            # Prefix each record with the contig it aligned to
            data_per_bac[data[0]].append([chromosome_name] + data)

    # filtered_data = filter_supplementaries_by_largest(data_per_bac)
    filtered_data = aggregate_bac_data(data_per_bac)

    export_bac_data_to_csv(read_data=filtered_data,
                           output_dir=output_dir,
                           bam_path=bam_path)
Exemple #15
0
def _runlength_fasta_path(fasta_path, output_dir):
    """Return the path "<output_dir>/<basename without its last extension>_rle.fasta" for a FASTA path."""
    # splitext keeps the whole name when there is no extension, so extension-less inputs no longer all
    # collapse to the same "_rle.fasta" output
    prefix = os.path.splitext(os.path.basename(fasta_path))[0]
    return os.path.join(output_dir, prefix + "_rle.fasta")


def _write_runlength_fasta(input_fasta_path, output_fasta_path):
    """Run-length encode every sequence of input_fasta_path and write the collapsed sequences as FASTA."""
    with open(output_fasta_path, "w") as file:
        fasta_handler = FastaHandler(input_fasta_path)

        for name in fasta_handler.get_contig_names():
            sequence = fasta_handler.get_sequence(chromosome_name=name,
                                                  start=None,
                                                  stop=None)
            # Only the collapsed sequence is written; the run lengths are discarded here
            sequence, _ = runlength_encode(sequence)

            file.write(">" + name + " RLE\n")
            file.write(sequence + "\n")


def iteratively_align_as_RLE(ref_fasta_path, read_fasta_path, output_dir):
    """
    Given 2 fasta files for reads and reference, iterate them, runlength encode their sequences, and write the RLE
    sequences to a new file, then align them with minimap2

    The encoded files are written to output_dir as "<input name without its last extension>_rle.fasta".

    :param ref_fasta_path: path of the reference FASTA
    :param read_fasta_path: path of the reads FASTA
    :param output_dir: directory receiving the encoded FASTA files; also passed to align_minimap
    :return: path of the BAM file reported by align_minimap
    """
    runlength_ref_fasta_path = _runlength_fasta_path(ref_fasta_path, output_dir)
    runlength_read_fasta_path = _runlength_fasta_path(read_fasta_path, output_dir)

    print("SAVING run length fasta file:", runlength_ref_fasta_path)
    print("SAVING run length fasta file:", runlength_read_fasta_path)

    _write_runlength_fasta(ref_fasta_path, runlength_ref_fasta_path)
    _write_runlength_fasta(read_fasta_path, runlength_read_fasta_path)

    output_sam_file_path, output_bam_file_path = align_minimap(
        output_dir=output_dir,
        ref_sequence_path=runlength_ref_fasta_path,
        reads_sequence_path=runlength_read_fasta_path)

    return output_bam_file_path
Exemple #16
0
def generate_runlength_frequency_matrix(runlength_ref_sequence_path,
                                        assembly_vs_ref_bam_path,
                                        runlength_ref_sequences,
                                        runlength_assembly_sequences):
    """
    Yield one run-length frequency matrix for every reference contig.

    For every contig name in runlength_ref_sequences, a zeroed float64 matrix of shape
    [2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1] is created and handed to parse_reads together with the alignments
    covering the whole contig. The matrix is yielded whether or not any reads were parsed.

    :param runlength_ref_sequence_path: path of the run-length encoded reference FASTA
    :param assembly_vs_ref_bam_path: path of the BAM containing alignments against that reference
    :param runlength_ref_sequences: mapping of contig name -> record indexable by LENGTHS
    :param runlength_assembly_sequences: forwarded unchanged to parse_reads
    :return: generator of matrices, one per contig, in iteration order of runlength_ref_sequences
    """
    for chromosome_name in runlength_ref_sequences:
        side = MAX_RUNLENGTH + 1
        matrix = numpy.zeros([2, 4, side, side], dtype=numpy.float64)

        # Fresh handlers are opened for every contig
        bam_handler = BamHandler(assembly_vs_ref_bam_path)
        fasta_handler = FastaHandler(runlength_ref_sequence_path)

        contig_end = fasta_handler.get_chr_sequence_length(chromosome_name)
        reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                      start=0,
                                      stop=contig_end)

        ref_runlengths = runlength_ref_sequences[chromosome_name][LENGTHS]

        parse_reads(chromosome_name=chromosome_name,
                    fasta_handler=fasta_handler,
                    reads=reads,
                    complete_ref_runlengths=ref_runlengths,
                    runlength_assembly_sequences=runlength_assembly_sequences,
                    matrix=matrix)

        # To inspect a matrix while debugging: plot_base_matrices(matrix=matrix, cutoff=40)

        yield matrix
Exemple #17
0
def parse_bam(bam_path, reference_path):
    """
    Accumulate summary statistics of a BAM file over every contig of its reference.

    The running totals are threaded through count_cigar_operations once per contig: each call receives the current
    totals and returns the updated ones.

    :param bam_path: path of the BAM to summarise
    :param reference_path: path of the reference FASTA; supplies contig names and lengths
    :return: tuple (chromosomal_cigar_counts, n_alignments, n_primary, n_supplementary, n_secondary, map_qualities)
    """
    fasta_handler = FastaHandler(reference_path)
    chromosome_names = fasta_handler.get_contig_names()

    # Starting values for the running totals
    chromosomal_cigar_counts = defaultdict(lambda: defaultdict(int))
    n_alignments = n_primary = n_supplementary = n_secondary = 0
    map_qualities = IterativeHistogram(start=0, stop=60, n_bins=6)

    for chromosome_name in chromosome_names:
        # A fresh BAM handler is opened for every contig
        bam_handler = BamHandler(bam_path)

        contig_end = fasta_handler.get_chr_sequence_length(chromosome_name)
        reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                      start=0,
                                      stop=contig_end)

        updated_totals = count_cigar_operations(reads=reads,
                                                chromosome_name=chromosome_name,
                                                chromosomal_cigar_counts=chromosomal_cigar_counts,
                                                n_alignments=n_alignments,
                                                n_primary=n_primary,
                                                n_supplementary=n_supplementary,
                                                n_secondary=n_secondary,
                                                map_qualities=map_qualities)

        (chromosomal_cigar_counts, n_alignments, n_primary,
         n_supplementary, n_secondary, map_qualities) = updated_totals

    return chromosomal_cigar_counts, n_alignments, n_primary, n_supplementary, n_secondary, map_qualities
Exemple #18
0
def generate_collapsed_data(bam_file_path,
                            reference_file_path,
                            vcf_path,
                            bed_path,
                            chromosome_name,
                            start_position,
                            end_position,
                            generate_from_vcf=False):
    """
    Generate pileups from BAM data, and collapse sequences to have no explicitly repeated characters. Additionally
    encode a repeat channel that describes the number of repeats observed per base.

    NOTE: the interpreter exits (SystemExit) after the first window of the first contig has been printed, so as
    written this processes exactly one window.

    :param bam_file_path: path of the BAM to read alignments from
    :param reference_file_path: path of the reference FASTA
    :param vcf_path: path of the VCF used to select windows
    :param bed_path: path of the BED file; only used when generate_from_vcf is False
    :param chromosome_name: contig to select windows on
    :param start_position: start of the range to select windows in
    :param end_position: end of the range to select windows in
    :param generate_from_vcf: if True use get_variant_windows, otherwise get_non_variant_windows
    :return: None
    """
    # Local import keeps this block self-contained regardless of the module's import header
    import sys

    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    if generate_from_vcf:
        chromosomal_windows = get_variant_windows(
            vcf_path=vcf_path,
            chromosome_name=chromosome_name,
            start_position=start_position,
            end_position=end_position)

    else:
        chromosomal_windows = get_non_variant_windows(
            vcf_path=vcf_path,
            bed_path=bed_path,
            chromosome_name=chromosome_name,
            start_position=start_position,
            end_position=end_position)

    # A separate loop name avoids silently overwriting the chromosome_name parameter
    for window_chromosome in chromosomal_windows:
        for w, window in enumerate(chromosomal_windows[window_chromosome]):
            pileup_start = window[0]
            pileup_end = window[1]  # add random variation here

            print(pileup_start, pileup_end)

            ref_sequence, read_ids, sequences = get_aligned_segments(
                fasta_handler=fasta_handler,
                bam_handler=bam_handler,
                chromosome_name=window_chromosome,
                pileup_start=pileup_start,
                pileup_end=pileup_end)

            character_sequences, character_counts = collapse_repeats(sequences)
            print_collapsed_segments(character_sequences, character_counts)

            if w == 0:
                # Deliberate stop after the first window. sys.exit() replaces the site-provided exit(), which is
                # meant for interactive sessions and is not guaranteed to exist.
                sys.exit()
Exemple #19
0
def runlength_encode_fasta(fasta_sequence_path):
    """
    Run-length encode every contig of a FASTA file, one contig after another.

    Progress is reported on stderr, rewriting a single line per contig.

    :param fasta_sequence_path: path of the FASTA file to encode
    :return: dict mapping contig name -> (bases, lengths) as returned by runlength_encode
    """
    fasta_handler = FastaHandler(fasta_sequence_path)
    encoded_by_contig = dict()

    for contig_name in fasta_handler.get_contig_names():
        # Fetch the whole contig
        full_sequence = fasta_handler.get_sequence(chromosome_name=contig_name, start=None, stop=None)

        collapsed_bases, run_lengths = runlength_encode(full_sequence)
        encoded_by_contig[contig_name] = (collapsed_bases, run_lengths)

        sys.stderr.write("\rRun length encoded %s            " % contig_name)

    # Terminate the "\r" progress line
    sys.stderr.write("\n")

    return encoded_by_contig
Exemple #20
0
def generate_runlength_frequency_matrix(runlength_ref_sequence_path,
                                        read_vs_ref_bam_path,
                                        runlength_ref_sequences,
                                        runlength_read_sequences):
    """
    Take an alignment of RLE sequences (in BAM format, using minimap as an aligner) in combination with the series of
    lengths (which have been excluded from the BAM) and aligned observations from Benedict's model to generate a matrix
    of true vs observed lengths.

    This is a generator. For every chromosome key of runlength_ref_sequences a fresh zeroed matrix is handed to
    parse_reads; the matrix is yielded only if parse_reads reports more than zero reads, otherwise a message is
    written to stderr and the chromosome is skipped.
    :param runlength_ref_sequence_path: path of the reference FASTA, opened with FastaHandler
    :param read_vs_ref_bam_path: path of the BAM, opened with BamHandler
    :param runlength_ref_sequences: mapping of chromosome name -> record; its [LENGTHS] entry is passed to parse_reads
    :param runlength_read_sequences: forwarded unchanged to parse_reads
    :return: yields (chromosome_name, matrix) tuples
    """
    for chromosome_name in runlength_ref_sequences.keys():
        side = MAX_RUNLENGTH + 1
        frequency_matrix = numpy.zeros([2, 4, side, side], dtype=numpy.float64)

        # Handlers are rebuilt for every chromosome
        bam_handler = BamHandler(read_vs_ref_bam_path)
        fasta_handler = FastaHandler(runlength_ref_sequence_path)

        # No explicit bounds: start and stop are both None
        chromosome_reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                                 start=None,
                                                 stop=None)

        ref_lengths = runlength_ref_sequences[chromosome_name][LENGTHS]

        n_reads = parse_reads(chromosome_name=chromosome_name,
                              fasta_handler=fasta_handler,
                              reads=chromosome_reads,
                              complete_ref_runlengths=ref_lengths,
                              runlength_read_sequences=runlength_read_sequences,
                              matrix=frequency_matrix)

        if n_reads > 0:
            yield (chromosome_name, frequency_matrix)
            continue

        sys.stderr.write("No reads found for chromosome: %s\n" % chromosome_name)
Exemple #21
0
def main():
    """
    Count run lengths per character for the contigs of a hard-coded reference FASTA.

    As far as this function is shown, the per-contig result of count_runlength_per_character is assigned but not
    used any further, and output_dir / filename_prefix / all_counts are set up without being consumed.
    :return:
    """
    output_dir = "output/ref_run_lengths/"
    filename_prefix = "ref_runlength_distribution"

    reference_file_path = "/home/ryan/data/Nanopore/Human/paolo/LC2019/kishwar/shasta_assembly_GM24385_chr20.fasta"

    threshold = 5

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    all_counts = defaultdict(Counter)

    sys.stderr.write("reading fasta file...\n")
    sys.stderr.flush()

    has_multiple_contigs = len(contig_names) > 1

    c = 0
    for chromosome_name in contig_names:
        # NOTE(review): the original condition was `not chromosome_name != "chr1"`, i.e. "chr1" is the one contig
        # that gets SKIPPED when the file has several contigs. Confirm that this is intended and not an inverted
        # "only process chr1" filter.
        if has_multiple_contigs and chromosome_name == "chr1":
            continue

        c += 1

        contig_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                        start=0,
                                                        stop=contig_length)

        character_counts = count_runlength_per_character(sequence=reference_sequence,
                                                         threshold=threshold,
                                                         chromosome_name=chromosome_name)
Exemple #22
0
def runlength_encode_fasta(fasta_sequence_path):
    """
    Run-length encode every contig of a FASTA file, printing the size of each encoding.
    :param fasta_sequence_path: path handed to FastaHandler
    :return: dict keyed by contig name; each value is the (bases, lengths) pair returned by runlength_encode
    """
    handler = FastaHandler(fasta_sequence_path)
    encoded_by_contig = dict()

    for name in handler.get_contig_names():
        # Request the span from 0 up to the length reported by the handler
        contig_length = handler.get_chr_sequence_length(name)
        full_sequence = handler.get_sequence(chromosome_name=name,
                                             start=0,
                                             stop=contig_length)

        rle_bases, rle_lengths = runlength_encode(full_sequence)
        encoded_by_contig[name] = (rle_bases, rle_lengths)

        print(name, len(rle_bases), len(rle_lengths))

    return encoded_by_contig
Exemple #23
0
def generate_data(bam_file_path,
                  reference_file_path,
                  vcf_path,
                  bed_path,
                  chromosome_name,
                  start_position,
                  end_position,
                  generate_from_vcf=False):
    """
    Generate pileup for read segments aligned between two genomic coordinates.

    Windows come from get_variant_windows when generate_from_vcf is set, otherwise from get_non_variant_windows.
    Each window is printed and passed to get_aligned_segments (whose results are not used further here). The
    process is terminated with exit() right after the window with index 10 has been handled.
    :param bam_file_path: path opened with BamHandler
    :param reference_file_path: path opened with FastaHandler
    :param vcf_path: forwarded to the window selection function
    :param bed_path: forwarded to get_non_variant_windows only
    :param chromosome_name: forwarded to the window selection function
    :param start_position: forwarded to the window selection function
    :param end_position: forwarded to the window selection function
    :param generate_from_vcf: choose variant windows (True) or non-variant windows (False)
    :return:
    """
    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    # Arguments common to both window selection functions
    selection_kwargs = dict(vcf_path=vcf_path,
                            chromosome_name=chromosome_name,
                            start_position=start_position,
                            end_position=end_position)

    if generate_from_vcf:
        chromosomal_windows = get_variant_windows(**selection_kwargs)
    else:
        chromosomal_windows = get_non_variant_windows(bed_path=bed_path, **selection_kwargs)

    for window_chromosome in chromosomal_windows:
        for w, window in enumerate(chromosomal_windows[window_chromosome]):
            pileup_start, pileup_end = window[0], window[1]  # add random variation here

            print(pileup_start, pileup_end)

            ref_sequence, read_ids, sequences = get_aligned_segments(
                fasta_handler=fasta_handler,
                bam_handler=bam_handler,
                chromosome_name=window_chromosome,
                pileup_start=pileup_start,
                pileup_end=pileup_end)

            # Hard stop after the 11th window (index 10)
            if w == 10:
                exit()
Exemple #24
0
def main():
    """
    Summarise the read length distribution of the file at READS_PATH.

    Reads are iterated from a .fastq (FastqReader) or .fasta (FastaHandler) file; any other extension exits with
    an error message. The lengths are binned into a histogram which is printed, plotted and summarised, followed
    by the total number of bases and that total divided by 5.4 million (labelled as E. Coli coverage).
    :return:
    """
    is_fastq = READS_PATH.endswith(".fastq")
    is_fasta = READS_PATH.endswith(".fasta")

    if is_fastq:
        reads = FastqReader().iterate_file(path=READS_PATH)
    elif is_fasta:
        reads = FastaHandler(READS_PATH).iterate_file()
    else:
        exit("Improper file format: %s" % READS_PATH)

    lengths = list()

    for i, item in enumerate(reads):
        # fastq records carry an additional quality field
        if is_fastq:
            header, sequence, quality = item
        elif is_fasta:
            header, sequence = item

        lengths.append(len(sequence))

        # Progress counter rewritten in place
        sys.stdout.write("\r%d" % i)

    print()

    n_reads = len(lengths)
    length_sum = sum(lengths)

    # ---- Plotting ----

    step = 500  # bin size
    max_length = 50000  # end of histogram

    bins = numpy.arange(0, max_length + step, step=step)
    frequencies, _ = numpy.histogram(lengths, bins=bins)

    print(bins, frequencies)

    plot_length_distribution(step=step, bins=bins, frequencies=frequencies)

    # ---- Printing ----

    genome_size = 5.4 * 1000000

    print_stats(step=step, frequencies=frequencies, n_reads=n_reads)
    print("total bp:\t%d" % length_sum)
    print("coverage (E. Coli):\t%f" % (length_sum / genome_size))
Exemple #25
0
def get_chromosome_stats(genome_data, reference_path, chromosome_name, start,
                         stop, output_dir, bam_path):
    """
    Parse the reads of one chromosome region, record the chromosome summary and export it to CSV.
    :param genome_data: list-like accumulator; the chromosome summary returned by parse_reads is appended to it
    :param reference_path: reference FASTA path, opened with FastaHandler
    :param chromosome_name: chromosome whose reads are fetched
    :param start: start coordinate passed to BamHandler.get_reads
    :param stop: stop coordinate passed to BamHandler.get_reads
    :param output_dir: forwarded to export_chromosome_summary_to_csv
    :param bam_path: BAM path, opened with BamHandler and also forwarded to the CSV export
    :return:
    """
    reference = FastaHandler(reference_file_path=reference_path)
    alignments = BamHandler(bam_file_path=bam_path)

    length = reference.get_chr_sequence_length(chromosome_name)
    region_reads = alignments.get_reads(chromosome_name=chromosome_name,
                                        start=start,
                                        stop=stop)

    read_data, chromosome_data = parse_reads(reads=region_reads,
                                             chromosome_name=chromosome_name,
                                             chromosome_length=length,
                                             fasta_handler=reference)

    # The caller's accumulator is mutated in place
    genome_data.append(chromosome_data)

    export_chromosome_summary_to_csv(read_data=read_data,
                                     chromosome_data=chromosome_data,
                                     output_dir=output_dir,
                                     bam_path=bam_path,
                                     chromosome_name=chromosome_name)
Exemple #26
0
def genomic_run():
    """
    Run encode_region_parallel for each hard-coded window selection path, writing into a timestamped output dir.

    The chromosome name is recovered from the window path: the first two underscore-separated tokens of its last
    path component.
    :return:
    """
    output_dir = os.path.join("output/", "spoa_pileup_generation_" + get_current_timestamp())

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    # -------------------------------------------------------------------------

    # Not used below; the construction itself is retained
    fasta_handler = FastaHandler(reference_file_path)

    chromosomal_window_paths = [
        "output/window_selection/NC_003283.11_0_20924180_2018_10_2_1_22"
    ]

    for path in chromosomal_window_paths:
        window_dir_name = path.split("/")[-1]
        chromosome_name = "_".join(window_dir_name.split("_")[:2])
        print("STARTING", chromosome_name)

        encode_region_parallel(bam_file_path=bam_file_path,
                               reference_file_path=reference_file_path,
                               chromosome_name=chromosome_name,
                               region=[-1, -1],
                               window_size=20,
                               output_dir=output_dir,
                               runlength=True,
                               max_threads=30,
                               windows_path=path,
                               sort_sequences_by_length=False,
                               reverse_sort=False,
                               two_pass=True)
Exemple #27
0
def test_window(bam_file_path,
                reference_file_path,
                chromosome_name,
                window,
                output_dir,
                save_data=True,
                print_results=False):
    """
    Run the pileup generator for a single specified window
    :param bam_file_path: path opened with BamHandler
    :param reference_file_path: path opened with FastaHandler
    :param chromosome_name: chromosome the window belongs to
    :param window: indexable whose items 0 and 1 are the pileup start and end
    :param output_dir: directory receiving "test_<pileup_start>.fasta" when save_data is set
    :param save_data: write the aligned sequences with FastaWriter
    :param print_results: print the reference and aligned sequences with print_segments
    :return:
    """
    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    pileup_start, pileup_end = window[0], window[1]  # add random variation here ?

    ref_sequence, read_ids, sequences = get_aligned_segments(
        fasta_handler=fasta_handler,
        bam_handler=bam_handler,
        chromosome_name=chromosome_name,
        pileup_start=pileup_start,
        pileup_end=pileup_end)

    if print_results:
        print_segments(ref_sequence, sequences)

    if not save_data:
        return

    output_path = os.path.join(output_dir, "test_" + str(pileup_start) + ".fasta")

    if not os.path.exists(output_dir):
        FileManager.ensure_directory_exists(output_dir)

    FastaWriter(output_path).write_sequences(sequences)
Exemple #28
0
def main():
    """
    Show an image of the aligned segments covering the first 1000 positions of the first contig.

    Paths to the BAM and the RLE reference FASTA are hard-coded. Every alignment returned by get_aligned_segments
    (reference included) is mapped through get_encoding, negated and displayed with pyplot.imshow.
    :return:
    """
    bam_file_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_output_2019_4_8_17_33_14_191911/runnie_subset_test_60x_10kb_rle_VS_refEcoli_rle.sorted.bam"
    ref_fasta_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_output_2019_4_8_17_33_14_191911/refEcoli_rle.fasta"
    # -------------------------------------------------------------------------

    # A single FastaHandler serves both the contig lookup and the pileup
    fasta_handler = FastaHandler(ref_fasta_path)
    bam_handler = BamHandler(bam_file_path)

    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]

    pileup_start = 0
    pileup_end = pileup_start + 1000  # add random variation here ?

    aligned_segments = get_aligned_segments(fasta_handler=fasta_handler,
                                            bam_handler=bam_handler,
                                            chromosome_name=chromosome_name,
                                            pileup_start=pileup_start,
                                            pileup_end=pileup_end,
                                            include_ref=True)

    encoding = [list(map(get_encoding, alignment)) for alignment in aligned_segments.values()]

    # numpy.float was merely an alias of the builtin float and has been removed from NumPy (1.24+),
    # so the builtin is used directly
    encoding = -numpy.array(encoding, dtype=float)

    pyplot.imshow(encoding)
    pyplot.show()
    pyplot.close()
Exemple #29
0
def main():
    """
    Align runnie run-length reads to an RLE reference and plot a pileup of one small region.

    Paths and the pileup region (6000-6050) are hard-coded. The reads are loaded with RunlengthHandler, the
    reference is run-length encoded, both are aligned via align_as_RLE, and the aligned sequences together with
    their scale/shape parameters (and the modes derived from them) are handed to plot_runlength_pileup.
    :return:
    """
    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    pileup_start = 6000
    pileup_end = 6050

    output_parent_dir = "output/"
    output_dir = "runlength_pileup_test_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output file names: input base name without its last extension, plus "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)

    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    # Index all reads by their id
    read_data = {read.id: read for read in reads}

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]

    aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
        get_aligned_segments(fasta_handler=fasta_handler,
                             bam_handler=bam_handler,
                             chromosome_name=chromosome_name,
                             pileup_start=pileup_start,
                             pileup_end=pileup_end,
                             runlength_ref_sequences=runlength_ref_sequences,
                             read_data=read_data)

    sequence_encoding = list()
    scale_encoding = list()
    shape_encoding = list()
    modes_encoding = list()

    print(len(aligned_sequences.keys()))

    print("REF\t", "".join(aligned_ref_sequence))
    for read_id in aligned_sequences.keys():
        read_sequence = aligned_sequences[read_id]
        read_scales = aligned_scales[read_id]
        read_shapes = aligned_shapes[read_id]

        print("READ\t%s\t%s" % (read_id, "".join(read_sequence)))
        sequence_encoding.append(list(map(get_encoding, read_sequence)))
        scale_encoding.append(read_scales)
        shape_encoding.append(read_shapes)
        modes_encoding.append(list(map(map_parameters_to_mode, zip(read_scales, read_shapes))))

    # numpy.float was merely an alias of the builtin float and has been removed from NumPy (1.24+),
    # so the builtin is used directly
    sequence_encoding = -numpy.array(sequence_encoding, dtype=float)
    scale_encoding = numpy.array(scale_encoding, dtype=float)
    shape_encoding = numpy.array(shape_encoding, dtype=float)
    modes_encoding = numpy.array(modes_encoding, dtype=float)

    plot_runlength_pileup(sequences=sequence_encoding,
                          scales=scale_encoding,
                          shapes=shape_encoding,
                          modes=modes_encoding)
Exemple #30
0
def main(reference_file_path):
    """
    Plot and print run length distributions for every contig of a reference FASTA.

    For each contig, count_runlength_per_character is applied to the full sequence and one histogram per
    character is saved to "<output_dir>/ref_runlength_distribution_<contig>.png". Counts are also pooled over all
    contigs: per character (for the printed summaries) and into A/T and C/G lists (for the "_genomic.png" figure).
    :param reference_file_path: path of the reference FASTA; its base name up to the first "." names the output
    sub-directory of "output/ref_run_lengths/"
    :return:
    """
    input_prefix_name = os.path.basename(reference_file_path).split(".")[0]
    output_dir = os.path.join("output/ref_run_lengths/", input_prefix_name)
    filename_prefix = "ref_runlength_distribution"

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    print(contig_names)
    print(sorted([(x, fasta_handler.get_chr_sequence_length(x)) for x in contig_names], key=lambda x: x[1]))

    all_counts = defaultdict(lambda: Counter())
    raw_counts_AT = list()
    raw_counts_GC = list()

    sys.stderr.write("reading fasta file...\n")
    sys.stderr.flush()

    max_count = 100
    step = 1
    for chromosome_name in contig_names:
        sys.stderr.write("Parsing chromosome %s\n" % chromosome_name)
        sys.stderr.flush()

        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name, start=0, stop=chromosome_length)
        character_counts = count_runlength_per_character(reference_sequence)

        # pyplot.subplots rejects nrows=0, so a contig without any counted character cannot be plotted
        if len(character_counts) == 0:
            sys.stderr.write("WARNING: no run lengths counted for chromosome %s, skipping plot\n" % chromosome_name)
            continue

        # squeeze=False always yields a 2D array of axes; without it a single row is returned as a bare Axes
        # object and indexing it with axes[k] fails
        figure, axes = pyplot.subplots(nrows=len(character_counts), sharex=True, squeeze=False)
        axes = axes[:, 0]
        figure.set_size_inches(6, 12)

        for k, key in enumerate(character_counts.keys()):
            counts = character_counts[key]
            all_counts[key] += Counter(counts)

            if key in {"C", "G"}:
                raw_counts_GC += counts

            if key in {"A", "T"}:
                raw_counts_AT += counts

            plot_counts_as_histogram(axes=axes[k], counts=counts, max_count=max_count, step=step)

            axes[k].set_ylabel(str(key))
            axes[k].set_ylim([-0.5, 10])

        axes[0].set_title(chromosome_name)

        filename = filename_prefix + "_" + chromosome_name + ".png"
        file_path = os.path.join(output_dir, filename)
        figure.savefig(file_path)
        # pyplot.show()
        pyplot.close()

    # Pooled figure: A/T on top, C/G below
    figure, axes = pyplot.subplots(nrows=2)

    filename = filename_prefix + "_genomic.png"
    file_path = os.path.join(output_dir, filename)

    plot_counts_as_histogram(axes=axes[0], counts=raw_counts_AT, max_count=max_count, step=step)
    plot_counts_as_histogram(axes=axes[1], counts=raw_counts_GC, max_count=max_count, step=step)
    axes[0].set_ylabel("AT Log10 Frequency")
    axes[1].set_ylabel("GC Log10 Frequency")

    figure.savefig(file_path)
    # pyplot.show()
    pyplot.close()

    print_all_counts_as_shasta_matrix(all_counts, max_count=50)
    print_all_counts(all_counts, output_dir)