Ejemplo n.º 1
0
def main():
    """
    Build a timestamped output path, open the selected reference FASTA and print its contig names and lengths.

    The E. coli dataset is the active configuration; the other datasets are kept as commented-out presets.
    """
    output_dir = os.path.join("output/", "spoa_pileup_generation_" + get_current_timestamp())

    # ---- Illumina (laptop) --------------------------------------------------
    # bam_file_path = "/Users/saureous/data/Platinum/chr1.sorted.bam"
    # reference_file_path = "/Users/saureous/data/Platinum/chr1.fa"
    # vcf_path = "/Users/saureous/data/Platinum/NA12878_S1.genome.vcf.gz"
    # bed_path = "/Users/saureous/data/Platinum/chr1_confident.bed"

    # ---- GIAB (dev machine) -------------------------------------------------
    # NOTE(review): this GRCh37 preset points at a GRCh38 confident bed - confirm before re-enabling
    # bam_file_path = "/home/ryan/data/GIAB/NA12878_GIAB_30x_GRCh37.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh37_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh37.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    # reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    # windows_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003283.11_0_20924180_2018_9_28_10_56"

    # ---- Nanopore GUPPY - E. Coli - (dev machine) ---------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli.contigs.fasta.reads.sorted.bam"
    # reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli.contigs.fasta"
    bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli_reads_vs_ref.bam"
    reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"

    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    # Contig to work on; alternatives for the other presets are listed underneath
    chromosome_name = "gi"  # E coli
    # chromosome_name = "NC_003279.8"     # celegans chr1
    # chromosome_name = "NC_003283.11"     # celegans chr5
    # chromosome_name = "1"
    # chromosome_name = "chr" + chromosome_name

    # One length per contig, in the same order as contig_names
    lengths = [fasta_handler.get_chr_sequence_length(name) for name in contig_names]

    print('\t'.join(contig_names))
    print('\t\t'.join(str(length) for length in lengths))

    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
Ejemplo n.º 2
0
def run_parameter_comparison():
    """
    Run encode_region_parallel over the whole of contig NC_003279.8 using a precomputed window set.

    All inputs are hard-coded dev-machine paths. Output goes to a timestamped
    "spoa_pileup_generation_anchored_*" directory under "output/".
    """
    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    bam_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    ref_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    windows_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003279.8_0_15072434_2018_10_12_10_58_56_199382"
    contig = "NC_003279.8"
    # -------------------------------------------------------------------------

    # The encoded region spans the full contig, so its length is read from the reference
    contig_length = FastaHandler(ref_path).get_chr_sequence_length(contig)

    run_dir = os.path.join("output/", "spoa_pileup_generation_anchored_" + get_current_timestamp())

    encode_region_parallel(
        bam_file_path=bam_path,
        reference_file_path=ref_path,
        chromosome_name=contig,
        region=[0, contig_length],
        window_size=20,
        output_dir=run_dir,
        runlength=True,
        max_threads=30,
        windows_path=windows_path,
        sort_sequences_by_length=False,
        reverse_sort=False,
        two_pass=True,
    )
Ejemplo n.º 3
0
def generate_runlength_frequency_matrix(runlength_ref_sequence_path, assembly_vs_ref_bam_path,
                                        runlength_ref_sequences, runlength_read_data):
    """
    Yield one (chromosome_name, matrix) pair for every reference contig that parse_reads reports reads for.

    A fresh zero matrix of shape [2, 4, MAX_RUNLENGTH+1, MAX_RUNLENGTH+1] is created per contig and handed to
    parse_reads together with that contig's alignments and reference run lengths. Contigs for which parse_reads
    returns no reads are reported on stderr and skipped.

    :param runlength_ref_sequence_path: path to the run-length encoded reference FASTA
    :param assembly_vs_ref_bam_path: path to the BAM of run-length encoded sequences aligned to that reference
    :param runlength_ref_sequences: mapping of contig name -> run-length data, indexed here with LENGTHS
    :param runlength_read_data: per-read run-length data, passed through to parse_reads
    :return: generator of (chromosome_name, matrix) tuples
    """
    for contig_name in runlength_ref_sequences:
        side = MAX_RUNLENGTH + 1
        matrix = numpy.zeros([2, 4, side, side], dtype=numpy.float64)

        bam_handler = BamHandler(assembly_vs_ref_bam_path)
        fasta_handler = FastaHandler(runlength_ref_sequence_path)

        # Fetch every alignment on this contig, from position 0 to its end
        contig_length = fasta_handler.get_chr_sequence_length(contig_name)
        reads = bam_handler.get_reads(chromosome_name=contig_name, start=0, stop=contig_length)

        ref_runlengths = runlength_ref_sequences[contig_name][LENGTHS]

        n_reads = parse_reads(chromosome_name=contig_name,
                              fasta_handler=fasta_handler,
                              reads=reads,
                              complete_ref_runlengths=ref_runlengths,
                              runlength_read_data=runlength_read_data,
                              matrix=matrix)

        if n_reads > 0:
            yield (contig_name, matrix)
            continue

        sys.stderr.write("No reads found for chromosome: %s\n" % contig_name)
Ejemplo n.º 4
0
def main():
    """
    Find repeats in contig NC_003279.8, check how a precomputed window set splits them, and plot the results.

    Plots the split ratio per repeat length and the pileups of the windows that split a repeat.
    """
    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    # -------------------------------------------------------------------------

    # Window sets produced by the two selection methods; the transition method is active
    # chromosomal_window_path = "output/window_selection/NC_003279.8_0_15072434_2018_10_1_20_1"   # kernel method
    chromosomal_window_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003279.8_0_15072434_2018_10_12_10_58_56_199382"  # transition method
    chromosome_name = "NC_003279.8"

    fasta_handler = FastaHandler(reference_file_path)

    # Pull the whole contig sequence
    reference_sequence = fasta_handler.get_sequence(
        chromosome_name=chromosome_name,
        start=0,
        stop=fasta_handler.get_chr_sequence_length(chromosome_name))

    windows = load_windows(chromosomal_window_path)

    repeat_positions = find_repeats(sequence=reference_sequence, repeat_threshold=1)

    split_counts_per_length, split_repeat_windows, unsplit_repeat_windows = \
        locate_repeats_in_anchored_windows(windows=windows, repeat_positions=repeat_positions)

    plot_split_ratios_per_length(split_counts_per_length)

    plot_pileups_for_split_repeats(split_repeat_windows=split_repeat_windows,
                                   bam_file_path=bam_file_path,
                                   reference_file_path=reference_file_path,
                                   chromosome_name=chromosome_name)
Ejemplo n.º 5
0
def process_bam(bam_path, reference_path):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a plot of alignments

    For each contig in the hard-coded list (currently only "gi"), the per-read alignment fields are printed, an
    alignment-length-weighted identity is computed, and the contigs are plotted into "plots/". A contig with no
    aligned bases is reported and skipped instead of failing on a division by zero.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :return: None
    """
    print("\n" + bam_path + "\n")

    output_dir = "plots/"
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = ["gi"]

    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        # Fetch alignments across the entire contig
        start = 0
        stop = chromosome_length

        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=start, stop=stop)

        read_data = parse_reads(reads=reads, fasta_handler=fasta_handler, chromosome_name=chromosome_name)

        print("chromosome_name:\t", chromosome_name)
        print("chromosome_length:\t", chromosome_length)
        for data in read_data:
            # The read and contig lengths are part of each record but are not printed here
            (read_id, reversal_status, ref_alignment_start, alignment_length, _read_length, _contig_length,
             n_initial_clipped_bases, n_total_mismatches, n_total_deletes, n_total_inserts, identity) = data
            print()
            print(read_id)
            print("reversed:\t", reversal_status)
            print("alignment_start:\t", ref_alignment_start)
            print("alignment_length:\t", alignment_length)
            print("n_initial_clipped_bases:", n_initial_clipped_bases)
            print("n_total_mismatches:\t", n_total_mismatches)
            print("n_total_deletes:\t", n_total_deletes)
            print("n_total_inserts:\t", n_total_inserts)
            print("identity:\t", identity)

        total_alignment_bases = sum(x[ALIGNMENT_LENGTH] for x in read_data)

        # Without aligned bases the weighted identity is undefined (0/0), so there is nothing to report or plot
        if total_alignment_bases == 0:
            print("\nNo aligned bases found for chromosome:\t", chromosome_name)
            continue

        # Identity of each alignment weighted by its length, normalised by the total aligned length
        total_weighted_identity = sum(x[ALIGNMENT_LENGTH] * x[IDENTITY] for x in read_data)
        total_identity = total_weighted_identity/total_alignment_bases

        print("\nTOTAL IDENTITY:\t", total_identity)

        plot_contigs(output_dir=output_dir,
                     read_data=read_data,
                     chromosome_name=chromosome_name,
                     chromosome_length=chromosome_length,
                     total_identity=total_identity,
                     bam_path=bam_path,
                     y_min=-1,
                     y_max=4,
                     show=False)
Ejemplo n.º 6
0
def process_bam(bam_path, reference_path, output_dir=None):
    """
    Parse the alignments of every reference contig and export the inserts, deletes and mismatches to CSV

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: parent directory for the output; "variants/" when None. A timestamped "variants_*"
    subdirectory is always created inside it
    :return: None
    """
    print("\n" + bam_path)

    parent_dir = "variants/" if output_dir is None else output_dir

    # Each run writes into its own timestamped subdirectory
    output_dir = os.path.join(parent_dir, "variants_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    contigs = sort_chromosome_names(names=fasta_handler.get_contig_names(), prefix="chr")

    print("ref contig names:", contigs)

    for contig in contigs:
        print("Parsing alignments for ref contig:", contig)

        # Fetch alignments across the entire contig
        contig_length = fasta_handler.get_chr_sequence_length(contig)
        reads = bam_handler.get_reads(chromosome_name=contig, start=0, stop=contig_length)

        inserts, deletes, mismatches = parse_reads(reads=reads,
                                                   fasta_handler=fasta_handler,
                                                   chromosome_name=contig)

        export_variants_to_csv(output_dir=output_dir,
                               chromosome_name=contig,
                               mismatches=mismatches,
                               inserts=inserts,
                               deletes=deletes,
                               merge=True)
Ejemplo n.º 7
0
def process_bam(bam_path, reference_path, max_threads, output_dir=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities/matches/mismatches/indels

    One get_chromosome_stats job is run per reference contig in a process pool. Each job appends its result to a
    shared manager list, which is exported as a genome-wide summary once all jobs have finished.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param max_threads: maximum number of worker processes; None means (cpu count - 2), with a minimum of 1
    :param output_dir: where to save stats, "stats/" when None
    :return: None
    """
    if output_dir is None:
        output_dir = "stats/"

    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    # Manager list so that worker processes can append their results to a shared container
    process_manager = Manager()
    genome_data = process_manager.list()

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_path)

    chromosome_names = fasta_handler.get_contig_names()

    # One job per contig, each covering the contig from position 0 to its end
    arguments = list()

    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(
            chromosome_name)

        start = 0
        stop = chromosome_length

        arguments.append([
            genome_data, reference_path, chromosome_name, start, stop,
            output_dir, bam_path
        ])

    if len(arguments) < max_threads:
        print("Fewer jobs than threads")
        # Pool() raises ValueError for processes < 1, so keep one worker even if the reference has no contigs
        max_threads = max(1, len(arguments))

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_chromosome_stats, arguments)

    print("genome_data", genome_data)

    export_genome_summary_to_csv(bam_path=bam_path,
                                 output_dir=output_dir,
                                 genome_data=genome_data)
Ejemplo n.º 8
0
def process_bam(bam_path, reference_path, bac_path, output_dir=None):
    """
    Collect the parsed alignment records of every reference contig, group them by their first field, aggregate
    them with aggregate_bac_data and export the result to CSV

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param bac_path: FASTA whose contig names are printed next to the reference contig names
    :param output_dir: where to save stats, "stats/" when None
    :return: None
    """
    if output_dir is None:
        output_dir = "stats/"

    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_handler = FastaHandler(reference_path)
    bac_fasta_handler = FastaHandler(bac_path)

    chromosome_names = ref_fasta_handler.get_contig_names()
    bac_names = bac_fasta_handler.get_contig_names()

    print(chromosome_names)
    print(bac_names)

    # Records keyed by their first field (presumably the BAC/read name - confirm against parse_reads)
    data_per_bac = defaultdict(list)

    for contig in chromosome_names:
        contig_length = ref_fasta_handler.get_chr_sequence_length(contig)

        # NOTE(review): both handlers are rebuilt on every iteration; kept as-is, confirm whether this is needed
        ref_fasta_handler = FastaHandler(reference_file_path=reference_path)
        bam_handler = BamHandler(bam_file_path=bam_path)

        # Fetch alignments across the entire contig
        reads = bam_handler.get_reads(chromosome_name=contig, start=0, stop=contig_length)

        records = parse_reads(reads=reads,
                              fasta_handler=ref_fasta_handler,
                              chromosome_name=contig)

        # Prefix each record with the contig it aligned to
        for record in records:
            data_per_bac[record[0]].append([contig] + record)

    # filtered_data = filter_supplementaries_by_largest(data_per_bac)
    filtered_data = aggregate_bac_data(data_per_bac)

    export_bac_data_to_csv(read_data=filtered_data,
                           output_dir=output_dir,
                           bam_path=bam_path)
Ejemplo n.º 9
0
def main(sequences_path, cutoff):
    """
    Write "assemble_long_segments.sh" with one AssembleSegment.py command per sequence longer than cutoff

    Each selected sequence name and its length are also printed.

    :param sequences_path: path to the FASTA of sequences to inspect
    :param cutoff: only sequences strictly longer than this are written to the script
    :return: None
    """
    fasta = FastaHandler(sequences_path)
    names = fasta.get_contig_names()

    with open("assemble_long_segments.sh", "w") as script:
        for name in names:
            length = fasta.get_chr_sequence_length(name)

            if length > cutoff:
                print(name, length)
                script.write("../build/shasta-install/bin/AssembleSegment.py " +
                             name + "\n")
def get_contig_lengths(assembly_path, assembly_contigs):
    """
    Store the [name, length] pairs of an assembly, longest first, in assembly_contigs[assembly_path]

    :param assembly_path: path to the assembly FASTA
    :param assembly_contigs: mapping that receives the contig list, keyed by assembly_path
    :return: None (the result is written into assembly_contigs)
    """
    handler = FastaHandler(assembly_path)

    # Start from name order so that contigs of equal length keep a stable, name-sorted order
    contigs = [[name, handler.get_chr_sequence_length(name)]
               for name in sorted(handler.get_contig_names())]

    contigs.sort(key=lambda contig: contig[LENGTH], reverse=True)

    print("Assembly parsed: %s" % assembly_path)

    assembly_contigs[assembly_path] = contigs
Ejemplo n.º 11
0
def generate_runlength_frequency_matrix(runlength_ref_sequence_path,
                                        assembly_vs_ref_bam_path,
                                        runlength_ref_sequences,
                                        runlength_assembly_sequences):
    """
    Yield one matrix per reference contig, filled in by parse_reads from that contig's alignments.

    A fresh zero matrix of shape [2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1] is created for every contig and
    passed to parse_reads together with the contig's alignments and reference run lengths.

    :param runlength_ref_sequence_path: path to the run-length encoded reference FASTA
    :param assembly_vs_ref_bam_path: path to the BAM of run-length encoded sequences aligned to that reference
    :param runlength_ref_sequences: mapping of contig name -> run-length data, indexed here with LENGTHS
    :param runlength_assembly_sequences: run-length data of the aligned sequences, passed through to parse_reads
    :return: generator of matrices, one per contig in runlength_ref_sequences
    """
    for contig_name in runlength_ref_sequences:
        side = MAX_RUNLENGTH + 1
        matrix = numpy.zeros([2, 4, side, side], dtype=numpy.float64)

        bam_handler = BamHandler(assembly_vs_ref_bam_path)
        fasta_handler = FastaHandler(runlength_ref_sequence_path)

        # Fetch every alignment on this contig, from position 0 to its end
        contig_length = fasta_handler.get_chr_sequence_length(contig_name)
        reads = bam_handler.get_reads(chromosome_name=contig_name, start=0, stop=contig_length)

        ref_runlengths = runlength_ref_sequences[contig_name][LENGTHS]

        parse_reads(chromosome_name=contig_name,
                    fasta_handler=fasta_handler,
                    reads=reads,
                    complete_ref_runlengths=ref_runlengths,
                    runlength_assembly_sequences=runlength_assembly_sequences,
                    matrix=matrix)

        # plot_base_matrices(matrix=matrix, cutoff=40)

        yield matrix
Ejemplo n.º 12
0
def parse_bam(bam_path, reference_path):
    """
    Walk every reference contig of a BAM and accumulate summary statistics with count_cigar_operations

    :param bam_path: path to the BAM to summarise
    :param reference_path: path to the reference FASTA, used for the contig names and lengths
    :return: tuple of (chromosomal_cigar_counts, n_alignments, n_primary, n_supplementary, n_secondary,
    map_qualities)
    """
    fasta_handler = FastaHandler(reference_path)
    contig_names = fasta_handler.get_contig_names()

    # Running totals; each contig's call receives the current values and returns the updated ones
    cigar_counts = defaultdict(lambda: defaultdict(int))
    n_alignments = n_primary = n_supplementary = n_secondary = 0
    map_qualities = IterativeHistogram(start=0, stop=60, n_bins=6)

    for contig_name in contig_names:
        bam_handler = BamHandler(bam_path)

        contig_length = fasta_handler.get_chr_sequence_length(contig_name)
        reads = bam_handler.get_reads(chromosome_name=contig_name, start=0, stop=contig_length)

        updated_totals = count_cigar_operations(reads=reads,
                                                chromosome_name=contig_name,
                                                chromosomal_cigar_counts=cigar_counts,
                                                n_alignments=n_alignments,
                                                n_primary=n_primary,
                                                n_supplementary=n_supplementary,
                                                n_secondary=n_secondary,
                                                map_qualities=map_qualities)

        cigar_counts, n_alignments, n_primary, n_supplementary, n_secondary, map_qualities = updated_totals

    return cigar_counts, n_alignments, n_primary, n_supplementary, n_secondary, map_qualities
Ejemplo n.º 13
0
def main():
    """
    Read a reference FASTA and count run lengths per character for each processed contig.

    When the FASTA holds more than one contig, the contig named "chr1" is skipped.
    """
    # Output settings; neither is read in the visible part of this function
    output_dir = "output/ref_run_lengths/"
    filename_prefix = "ref_runlength_distribution"

    reference_file_path = "/home/ryan/data/Nanopore/Human/paolo/LC2019/kishwar/shasta_assembly_GM24385_chr20.fasta"

    # ---- GIAB E. Coli - (dev machine) -------------------------
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # reference_file_path = "/home/ryan/data/Nanopore/ecoli/refEcoli.fasta"
    # -------------------------------------------------------------------------

    threshold = 5

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    all_counts = defaultdict(Counter)

    sys.stderr.write("reading fasta file...\n")
    sys.stderr.flush()

    # Number of contigs that were actually processed
    c = 0
    for chromosome_name in contig_names:
        # NOTE(review): this skips "chr1" in multi-contig files; it may have been meant to keep only "chr1" - confirm
        if len(contig_names) > 1 and chromosome_name == "chr1":
            continue
        c += 1

        # sys.stderr.write("Parsing chromosome %s\n" % chromosome_name)
        # sys.stderr.flush()

        # Pull the whole contig sequence
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                        start=0,
                                                        stop=chromosome_length)

        character_counts = count_runlength_per_character(sequence=reference_sequence,
                                                         threshold=threshold,
                                                         chromosome_name=chromosome_name)
Ejemplo n.º 14
0
def runlength_encode_fasta(fasta_sequence_path):
    """
    Run-length encode every contig of a FASTA file

    The contig name and the number of encoded bases and lengths are printed for each contig.

    :param fasta_sequence_path: path to the FASTA to encode
    :return: dict of contig name -> (bases, lengths) as returned by runlength_encode
    """
    handler = FastaHandler(fasta_sequence_path)

    encoded = dict()

    for name in handler.get_contig_names():
        # Encode the contig from position 0 to its end
        sequence = handler.get_sequence(chromosome_name=name,
                                        start=0,
                                        stop=handler.get_chr_sequence_length(name))

        bases, lengths = runlength_encode(sequence)
        encoded[name] = (bases, lengths)

        print(name, len(bases), len(lengths))

    return encoded
Ejemplo n.º 15
0
def get_chromosome_stats(genome_data, reference_path, chromosome_name, start, stop, output_dir, bam_path):
    """
    Parse the alignments of one contig region, record the contig summary and export it to CSV

    The handlers are opened inside the function from the given paths rather than passed in.

    :param genome_data: list-like container that the contig summary is appended to
    :param reference_path: path to the reference FASTA
    :param chromosome_name: contig to process
    :param start: start coordinate of the region to fetch reads from
    :param stop: stop coordinate of the region to fetch reads from
    :param output_dir: where to save the per-contig CSV
    :param bam_path: path to the BAM to read alignments from
    :return: None
    """
    fasta_handler = FastaHandler(reference_file_path=reference_path)
    bam_handler = BamHandler(bam_file_path=bam_path)

    # parse_reads is given the full contig length, independent of the start/stop window that is fetched
    contig_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=start, stop=stop)

    read_data, chromosome_data = parse_reads(reads=reads,
                                             chromosome_name=chromosome_name,
                                             chromosome_length=contig_length,
                                             fasta_handler=fasta_handler)

    genome_data.append(chromosome_data)

    export_chromosome_summary_to_csv(read_data=read_data,
                                     chromosome_data=chromosome_data,
                                     output_dir=output_dir,
                                     bam_path=bam_path,
                                     chromosome_name=chromosome_name)
Ejemplo n.º 16
0
def main():
    """
    Show the first 1000 positions of the pileup on the first reference contig as an image.

    The aligned segments (including the reference row) are converted with get_encoding, negated and displayed
    with pyplot.imshow.
    """
    # bam_file_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_sequence_2019_3_27_14_59_24_409353/sequence_subset_test_60x_10kb_rle_VS_refEcoli_rle.sorted.bam"
    # ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"

    bam_file_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_output_2019_4_8_17_33_14_191911/runnie_subset_test_60x_10kb_rle_VS_refEcoli_rle.sorted.bam"
    ref_fasta_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_output_2019_4_8_17_33_14_191911/refEcoli_rle.fasta"
    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(ref_fasta_path)
    bam_handler = BamHandler(bam_file_path)

    # Only the first contig of the reference is inspected
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]

    pileup_start = 0
    pileup_end = pileup_start + 1000  # add random variation here ?

    aligned_segments = get_aligned_segments(fasta_handler=fasta_handler,
                                            bam_handler=bam_handler,
                                            chromosome_name=chromosome_name,
                                            pileup_start=pileup_start,
                                            pileup_end=pileup_end,
                                            include_ref=True)

    # One row of encoded values per aligned segment
    encoding = [list(map(get_encoding, alignment)) for alignment in aligned_segments.values()]

    # The numpy.float alias was removed in NumPy 1.24; builtin float is the same dtype (float64)
    encoding = -numpy.array(encoding, dtype=float)

    pyplot.imshow(encoding)
    pyplot.show()
    pyplot.close()
Ejemplo n.º 17
0
def _plot_confusion_matrix(confusion, plot_path):
    """
    Show a log10-scaled image of a confusion matrix (x: predicted, y: true) and save the figure to plot_path
    """
    figure = pyplot.figure()

    axes = pyplot.axes()
    axes.set_xlabel("Predicted")
    axes.set_ylabel("True")

    pyplot.imshow(numpy.log10(confusion))
    pyplot.show()
    figure.savefig(plot_path)

    pyplot.close()


def main():
    """
    Predict run lengths from runnie output for a few reference windows and compare two classifiers.

    Steps:
      1. load the runnie reads and run-length encode the reference
      2. align the reads to the reference as RLE sequences
      3. for windows 10..19 of the first contig (1000 positions each), build the pileup encodings and call a
         consensus with the modal classifier and with the Weibull classifier
      4. accumulate a confusion matrix (max length 10) per classifier, print both accuracies and save both
         matrices as plots in a timestamped directory under "output/"

    A window that raises during step 3 is reported with print() and skipped.
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runnie_test_2019_4_8_14_33_30_333396_ref.fasta"
    # runlength_path = "/home/ryan/code/runlength_analysis/data/synthetic_runnie_test_2019_4_8_14_33_30_333396_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    # WG ecoli 60x
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_WG_train_60x_guppy_2019_4_23/probability_matrices_2019_4_23_15_9_14_837893.csv"
    raw_matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_WG_train_60x_guppy_2019_4_23/frequency_matrices_2019_4_23_15_9_14_833128.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_prediction_from_runnie_output_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # The RLE FASTA files are named after their inputs, with the last extension replaced by "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(
        os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir,
                                            runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(
        os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(
        output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)

    reads = handler.iterate_file(sequence_cutoff=sys.maxsize,
                                 print_status=True)

    # Index the reads by id so that aligned segments can be matched back to their run-length data
    read_data = dict()
    for read in reads:
        read_data[read.id] = read

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(
        fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(
        runlength_reference_path=runlength_ref_fasta_path,
        runlength_ref_sequences=runlength_ref_sequences,
        runlength_read_path=runlength_assembly_fasta_path,
        runlength_read_sequences=read_data,
        output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the RLE reference is evaluated
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length,
                                           chunk_size=1000)

    # Empty confusion matrices to accumulate the per-window results into
    total_confusion = get_runlength_confusion([], [], 10)
    total_confusion_weibull = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)
    # length_classifier_weibull = WeibullRunlengthClassifier(matrix_path)
    length_classifier_weibull = WeibullRunlengthClassifier(
        raw_matrix_path, normalize_matrix=True, pseudocount=0.05)

    print("reading BAM")
    for pileup_start, pileup_end in windows[10:20]:
        sys.stderr.write("\r%s" % pileup_start)
        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=read_data)

        sequence_encoding = list()
        scale_encoding = list()
        shape_encoding = list()
        modes_encoding = list()
        reversal_encoding = list()

        # No reads here?
        if len(aligned_sequences) == 0:
            continue

        # Best effort per window: a failure is printed and the window is skipped.
        # The dtypes are builtins because the numpy.int/float/bool aliases were removed in NumPy 1.24; with the
        # aliases, every window would raise AttributeError here and be skipped silently.
        try:
            # print("REF\t", "".join(aligned_ref_sequence))
            for read_id in aligned_sequences.keys():
                # print("READ\t%s\t%s" % (read_id,"".join(aligned_sequences[read_id])))
                sequence_encoding.append(
                    list(map(get_encoding, aligned_sequences[read_id])))
                scale_encoding.append(aligned_scales[read_id])
                shape_encoding.append(aligned_shapes[read_id])
                # One mode per position, derived from that position's (scale, shape) pair
                modes_encoding.append(
                    list(
                        map(
                            map_parameters_to_mode,
                            zip(aligned_scales[read_id],
                                aligned_shapes[read_id]))))
                reversal_encoding.append(reversal_statuses[read_id])

            ref_sequence_encoding = [
                list(map(get_encoding, aligned_ref_sequence))
            ]
            ref_lengths_encoding = [aligned_ref_lengths]

            ref_sequence_encoding = numpy.atleast_2d(
                numpy.array(ref_sequence_encoding, dtype=int))
            ref_length_encoding = numpy.atleast_2d(
                numpy.array(ref_lengths_encoding, dtype=int))
            sequence_encoding = numpy.atleast_2d(
                numpy.array(sequence_encoding, dtype=int))
            scale_encoding = numpy.atleast_2d(
                numpy.array(scale_encoding, dtype=float))
            shape_encoding = numpy.atleast_2d(
                numpy.array(shape_encoding, dtype=float))
            modes_encoding = numpy.atleast_2d(
                numpy.array(modes_encoding, dtype=int))
            reversal_encoding = numpy.array(reversal_encoding,
                                            dtype=bool)

            consensus_sequence, consensus_lengths = \
                get_consensus_from_modal_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=modes_encoding,
                                                         reversal_encoding=reversal_encoding)

            weibull_consensus_sequence, weibull_consensus_lengths = \
                get_consensus_from_weibull_pileup_encoding(length_classifier=length_classifier_weibull,
                                                           sequence_encoding=sequence_encoding,
                                                           scale_encoding=scale_encoding,
                                                           shape_encoding=shape_encoding,
                                                           reversal_encoding=reversal_encoding)

            # The sequence encodings are negated for plotting only
            plot_runlength_pileup(
                sequences=-sequence_encoding,
                scales=scale_encoding,
                shapes=shape_encoding,
                modes=modes_encoding,
                ref_sequence=-ref_sequence_encoding,
                ref_lengths=ref_length_encoding,
                predicted_sequence=-numpy.atleast_2d(
                    numpy.array(weibull_consensus_sequence, dtype=int)),
                predicted_lengths=numpy.atleast_2d(
                    numpy.array(weibull_consensus_lengths, dtype=int)))

            print()
            print("PREDICTED\t", weibull_consensus_lengths[:10])
            print("TRUE\t\t", aligned_ref_lengths[:10])

            confusion = get_runlength_confusion(
                true_lengths=aligned_ref_lengths,
                predicted_lengths=consensus_lengths,
                max_length=10)

            confusion_weibull = get_runlength_confusion(
                true_lengths=aligned_ref_lengths,
                predicted_lengths=weibull_consensus_lengths,
                max_length=10)

            total_confusion += confusion
            total_confusion_weibull += confusion_weibull

        except Exception as e:
            print(e)
            continue
    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)

    print("Modal: ", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_confusion_weibull)

    print("Full: ", accuracy)

    _plot_confusion_matrix(total_confusion,
                           os.path.join(output_dir, "confusion.png"))

    _plot_confusion_matrix(total_confusion_weibull,
                           os.path.join(output_dir, "confusion_weibull.png"))
Ejemplo n.º 18
0
def main():
    """
    Build and plot a run-length pileup for one small reference window.

    Reads are loaded from a runlength file with RunlengthHandler, handed to
    align_as_RLE together with the run-length encoded reference, and the BAM
    it returns is queried through get_aligned_segments for the window
    pileup_start..pileup_end on the first contig. The per-read sequences,
    scales, shapes and modes are stacked into float arrays and passed to
    plot_runlength_pileup.

    All input paths and the window coordinates are hard-coded below.
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    pileup_start = 6000
    pileup_end = 6050

    output_parent_dir = "output/"
    output_dir = "runlength_pileup_test_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output FASTA names: input base name minus its last extension, plus "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)

    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    # Index every read by its id so it can be looked up after alignment
    read_data = {read.id: read for read in reads}

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the reference is examined
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]

    aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
        get_aligned_segments(fasta_handler=fasta_handler,
                             bam_handler=bam_handler,
                             chromosome_name=chromosome_name,
                             pileup_start=pileup_start,
                             pileup_end=pileup_end,
                             runlength_ref_sequences=runlength_ref_sequences,
                             read_data=read_data)

    sequence_encoding = list()
    scale_encoding = list()
    shape_encoding = list()
    modes_encoding = list()

    print(len(aligned_sequences))

    print("REF\t", "".join(aligned_ref_sequence))
    for read_id, aligned_sequence in aligned_sequences.items():
        print("READ\t%s\t%s" % (read_id, "".join(aligned_sequence)))

        read_scales = aligned_scales[read_id]
        read_shapes = aligned_shapes[read_id]

        sequence_encoding.append(list(map(get_encoding, aligned_sequence)))
        scale_encoding.append(read_scales)
        shape_encoding.append(read_shapes)
        modes_encoding.append(list(map(map_parameters_to_mode, zip(read_scales, read_shapes))))

    # The builtin float replaces the numpy.float alias, which was removed in NumPy 1.24.
    # The sequence encoding is negated before plotting, as in the original call.
    sequence_encoding = -numpy.array(sequence_encoding, dtype=float)
    scale_encoding = numpy.array(scale_encoding, dtype=float)
    shape_encoding = numpy.array(shape_encoding, dtype=float)
    modes_encoding = numpy.array(modes_encoding, dtype=float)

    plot_runlength_pileup(sequences=sequence_encoding,
                          scales=scale_encoding,
                          shapes=shape_encoding,
                          modes=modes_encoding)
Ejemplo n.º 19
0
def main(reference_file_path):
    """
    Plot and tabulate per-character run-length counts for every contig of a FASTA file.

    For each contig the full sequence is fetched, run lengths are counted per
    character with count_runlength_per_character, and one histogram row per
    character is saved as "<prefix>_<contig>.png". Counts are also pooled over
    all contigs: A/T and C/G run lengths feed a two-row "genomic" figure, and
    the per-character Counters are passed to print_all_counts_as_shasta_matrix
    and print_all_counts.

    :param reference_file_path: path to the reference FASTA. The part of its
        base name before the first "." names the output subdirectory under
        "output/ref_run_lengths/".
    """
    # basename() already strips the directories, so no further "/" split is needed
    input_prefix_name = os.path.basename(reference_file_path).split(".")[0]
    output_dir = os.path.join("output/ref_run_lengths/", input_prefix_name)
    filename_prefix = "ref_runlength_distribution"

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    print(contig_names)
    print(sorted([(x, fasta_handler.get_chr_sequence_length(x)) for x in contig_names], key=lambda x: x[1]))

    all_counts = defaultdict(Counter)
    raw_counts_AT = list()
    raw_counts_GC = list()

    sys.stderr.write("reading fasta file...\n")
    sys.stderr.flush()

    max_count = 100
    step = 1
    for chromosome_name in contig_names:
        # if len(contig_names) > 1:
        #     if not chromosome_name.startswith("chr") or "alt" in chromosome_name or "v" in chromosome_name:
        #         print("WARNING: SKIPPING CHROMOSOME %s" % chromosome_name)
        #         continue

        sys.stderr.write("Parsing chromosome %s\n" % chromosome_name)
        sys.stderr.flush()

        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name, start=0, stop=chromosome_length)
        character_counts = count_runlength_per_character(reference_sequence)

        # squeeze=False always yields a 2D array of Axes; without it a contig with a
        # single counted character returns one bare Axes and axes[k] would fail.
        figure, axes = pyplot.subplots(nrows=len(character_counts), sharex=True, squeeze=False)
        axes = axes[:, 0]
        figure.set_size_inches(6, 12)

        for k, key in enumerate(character_counts):
            counts = character_counts[key]
            all_counts[key] += Counter(counts)

            if key in {"C", "G"}:
                raw_counts_GC += counts

            if key in {"A", "T"}:
                raw_counts_AT += counts

            plot_counts_as_histogram(axes=axes[k], counts=counts, max_count=max_count, step=step)

            axes[k].set_ylabel(str(key))
            axes[k].set_ylim([-0.5, 10])

        axes[0].set_title(chromosome_name)

        filename = filename_prefix + "_" + chromosome_name + ".png"
        file_path = os.path.join(output_dir, filename)
        figure.savefig(file_path)
        # pyplot.show()
        pyplot.close(figure)

    # Pooled figure: row 0 for A/T run lengths, row 1 for C/G run lengths
    figure, axes = pyplot.subplots(nrows=2)

    filename = filename_prefix + "_genomic.png"
    file_path = os.path.join(output_dir, filename)

    plot_counts_as_histogram(axes=axes[0], counts=raw_counts_AT, max_count=max_count, step=step)
    plot_counts_as_histogram(axes=axes[1], counts=raw_counts_GC, max_count=max_count, step=step)
    axes[0].set_ylabel("AT Log10 Frequency")
    axes[1].set_ylabel("GC Log10 Frequency")

    figure.savefig(file_path)
    # pyplot.show()
    pyplot.close(figure)

    print_all_counts_as_shasta_matrix(all_counts, max_count=50)
    print_all_counts(all_counts, output_dir)
Ejemplo n.º 20
0
def get_chromosome_data(bam_path, reference_path, chromosome_name, output_dir,
                        centromere_table_path, gap_table_path,
                        segdup_table_path, genome_data):
    """
    Summarise and plot the alignments of one chromosome.

    Reads covering the whole chromosome are fetched from the BAM and parsed
    with parse_reads. The resulting chromosome summary is appended to
    genome_data, a CSV summary is exported, and a contig plot is drawn with
    plot_contigs and then closed.

    :param bam_path: path of the BAM file to read alignments from
    :param reference_path: path of the reference FASTA
    :param chromosome_name: name of the chromosome to process
    :param output_dir: directory handed to the CSV export and the plot
    :param centromere_table_path: centromere table path, or None to skip it
    :param gap_table_path: gap table path, or None to skip it
    :param segdup_table_path: table path read with read_gap_table using
        size_cutoff=10000, or None to skip it
    :param genome_data: list that receives this chromosome's summary (mutated in place)
    """
    reference = FastaHandler(reference_path)
    alignments = BamHandler(bam_file_path=bam_path)

    chromosome_length = reference.get_chr_sequence_length(chromosome_name)

    # Query the full extent of the chromosome
    reads = alignments.get_reads(chromosome_name=chromosome_name,
                                 start=0,
                                 stop=chromosome_length)

    read_data, chromosome_data = parse_reads(reads=reads,
                                             fasta_handler=reference,
                                             chromosome_name=chromosome_name,
                                             chromosome_length=chromosome_length)

    genome_data.append(chromosome_data)

    # Length-weighted mean identity; the 1e-9 floor keeps the division defined
    # (and the result approximately 0) when nothing aligned.
    weighted_identity_sum = sum(
        record[ALIGNMENT_LENGTH] * record[SEQUENCE_IDENTITY] for record in read_data)
    aligned_base_count = sum(record[ALIGNMENT_LENGTH] for record in read_data)
    total_identity = round(weighted_identity_sum / max(1e-9, aligned_base_count), 6)

    export_chromosome_summary_to_csv(read_data=read_data,
                                     chromosome_data=chromosome_data,
                                     output_dir=output_dir,
                                     bam_path=bam_path,
                                     chromosome_name=chromosome_name)

    # Optional annotation tracks: each stays None when no table path was given
    centromere_coordinates = None
    if centromere_table_path is not None:
        centromere_coordinates = read_centromere_table(
            centromere_table_path=centromere_table_path,
            target_chromosome_name=chromosome_name)

    gap_coordinates = None
    if gap_table_path is not None:
        gap_coordinates = read_gap_table(table_path=gap_table_path,
                                         target_chromosome_name=chromosome_name)

    segdup_coordinates = None
    if segdup_table_path is not None:
        segdup_coordinates = read_gap_table(table_path=segdup_table_path,
                                            target_chromosome_name=chromosome_name,
                                            size_cutoff=10000)

    figure, _ = plot_contigs(output_dir=output_dir,
                             read_data=read_data,
                             chromosome_name=chromosome_name,
                             chromosome_length=chromosome_length,
                             total_identity=total_identity,
                             bam_path=bam_path,
                             centromere_coordinates=centromere_coordinates,
                             gap_coordinates=gap_coordinates,
                             segdup_coordinates=segdup_coordinates,
                             show=False)

    pyplot.close(figure)
Ejemplo n.º 21
0
def main():
    """
    Run generate_window_run_length_encoding on one hard-coded test window.

    The BAM, reference, chromosome name and window coordinates are all fixed
    in the body; alternative datasets and windows are kept as commented-out
    lines. Results are plotted and printed but not saved (save_data=False).
    """
    output_dir = os.path.join("output/", "spoa_pileup_generation_" + get_current_timestamp())

    # ---- Illumina (laptop) --------------------------------------------------
    # bam_file_path = "/Users/saureous/data/Platinum/chr1.sorted.bam"
    # reference_file_path = "/Users/saureous/data/Platinum/chr1.fa"
    # vcf_path = "/Users/saureous/data/Platinum/NA12878_S1.genome.vcf.gz"
    # bed_path = "/Users/saureous/data/Platinum/chr1_confident.bed"

    # ---- GIAB (dev machine) -------------------------------------------------
    # bam_file_path = "/home/ryan/data/GIAB/NA12878_GIAB_30x_GRCh37.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh37_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh37.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    # Active dataset. windows_path is assigned but not used further in this function.
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    windows_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003283.11_0_20924180_2018_9_28_10_56"
    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    chromosome_name = "NC_003279.8"  # celegans chr1
    # chromosome_name = "NC_003283.11"     # celegans chr5
    # chromosome_name = "1"
    # chromosome_name = "chr" + chromosome_name

    # Looked up as in the original flow; the value is not consumed below.
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    # ---- TEST window --------------------------------------------------------

    # window = [762580, 762600]       # nanopore broken alignment region...  POAPY ONLY
    # window = [748460, 748480]       # nanopore broken alignment region...  POAPY ONLY
    # window = [767240, 767260]       # nanopore broken alignment region...  SPOA NOOOOooOOoooo
    # window = [727360, 767280]       # nanopore broken alignment region...  very high loss in CNNRNN
    # window = [727200, 727220]       # nanopore broken alignment region...  very high loss in CNNRNN
    # window = [748220, 748240]       # nanopore broken alignment region...  very high loss in CNNRNN
    # window = [1105084, 1105104]   # very messy alignment even with spoa... why?
    # window = [246567, 246587]     # previously failing test case for collapsed reads
    # window = [800000, 800020]

    # test sites for misalignment
    # window = [10029532, 10029532+83]
    # window = [10031827, 10031827+34]
    # window = [10039004, 10039004+25]
    # window = [10040234, 10040234+61]
    # window = [1004298, 1004298+109]
    # window = [10037167, 10037167+82]
    window_start = 10044514
    window = [window_start, window_start + 54]

    # test_window(bam_file_path=bam_file_path,
    #             reference_file_path=reference_file_path,
    #             chromosome_name=chromosome_name,
    #             window=window,
    #             output_dir=output_dir,
    #             print_results=True,
    #             save_data=True)

    # Keyword arguments for the single encoding run
    encoding_options = dict(bam_file_path=bam_file_path,
                            reference_file_path=reference_file_path,
                            chromosome_name=chromosome_name,
                            window=window,
                            output_dir=output_dir,
                            sort_sequences_by_length=True,
                            reverse_sort=False,
                            two_pass=True,
                            plot_results=True,
                            print_results=True,
                            save_data=False)

    generate_window_run_length_encoding(**encoding_options)
Ejemplo n.º 22
0
def _save_log_confusion_plot(confusion_matrix, plot_path):
    """Draw log10 of a confusion matrix as an image, save it to plot_path, then display it."""
    figure = pyplot.figure()
    axes = pyplot.axes()
    axes.set_xlabel("Predicted")
    axes.set_ylabel("True")

    pyplot.imshow(numpy.log10(confusion_matrix))

    # Write the file before the (possibly blocking) interactive show(), so the
    # saved image does not depend on what happens to the window afterwards.
    figure.savefig(plot_path)
    pyplot.show()

    pyplot.close(figure)


def main():
    """
    Compare run-length consensus with and without the Bayesian classifier.

    The reference and reads are run-length encoded and passed to align_as_RLE.
    For each of the first 10 windows (chunk_size=1000) of the first contig,
    aligned segments are turned into a pileup encoding, and
    get_consensus_from_runlength_pileup_encoding is called twice: once with
    its default settings and once with bayesian=False. Predicted lengths are
    accumulated into confusion matrices against the aligned reference
    lengths, the accuracies are printed, and both matrices are saved as
    log10 images in the output directory.

    All input paths are hard-coded below.
    """
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_reads.fasta"
    # matrix_path = "/home/ryan/code/runnie_parser/output/runlength_matrix_from_assembly_contigs_2019_3_19_13_29_14_657613/probability_matrices_2019_3_19_13_29_19_362916.csv"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/code/runlength_analysis/data/sequence_subset_ecoli_guppy-runnie_60x_test.fastq"
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_sequence_2019_4_5_15_29_28_403950/probability_matrices_2019_4_5_15_35_57_920301.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_matrix_from_sequence_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output FASTA names: input base name minus its last extension, plus "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(
        os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir,
                                            runlength_ref_fasta_filename)

    read_fasta_filename_prefix = ".".join(
        os.path.basename(read_fasta_path).split(".")[:-1])
    runlength_read_fasta_filename = read_fasta_filename_prefix + "_rle.fasta"
    runlength_read_fasta_path = os.path.join(output_dir,
                                             runlength_read_fasta_filename)

    runlength_ref_sequences = runlength_encode_fasta(
        fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(
        fasta_sequence_path=read_fasta_path)

    read_vs_ref_bam_path = align_as_RLE(
        runlength_reference_path=runlength_ref_fasta_path,
        runlength_ref_sequences=runlength_ref_sequences,
        runlength_read_path=runlength_read_fasta_path,
        runlength_read_sequences=runlength_read_sequences,
        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the reference is evaluated
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length,
                                           chunk_size=1000)

    # Initialize empty confusion matrices
    total_confusion = get_runlength_confusion([], [], 10)
    total_modal_confusion = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)

    print("reading BAM")
    for pileup_start, pileup_end in windows[:10]:
        print("window", pileup_start, pileup_end)

        sys.stderr.write("\r%s" % pileup_start)
        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_lengths, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=runlength_read_sequences)

        # No reads here?
        if len(aligned_sequences) == 0:
            continue

        sequence_encoding = list()
        length_encoding = list()
        reversal_encoding = list()

        # print("REF\t", "".join(aligned_ref_sequence))
        for read_id, aligned_sequence in aligned_sequences.items():
            # print("READ\t","".join(aligned_sequence))
            sequence_encoding.append(list(map(get_encoding, aligned_sequence)))
            length_encoding.append(aligned_lengths[read_id])
            reversal_encoding.append(reversal_statuses[read_id])

        # Builtin int/float/bool replace the numpy.int/numpy.float/numpy.bool
        # aliases, which were removed in NumPy 1.24.
        # The reference encodings only feed the optional debug plot below.
        ref_sequence_encoding = numpy.atleast_2d(
            numpy.array([list(map(get_encoding, aligned_ref_sequence))], dtype=int))
        ref_length_encoding = numpy.atleast_2d(
            numpy.array([aligned_ref_lengths], dtype=int))

        sequence_encoding = numpy.atleast_2d(
            numpy.array(sequence_encoding, dtype=int))
        length_encoding = numpy.atleast_2d(
            numpy.array(length_encoding, dtype=float))
        reversal_encoding = numpy.array(reversal_encoding, dtype=bool)

        # plot_runlength_pileup(sequences=-sequence_encoding,
        #                       lengths=length_encoding,
        #                       ref_sequence=-ref_sequence_encoding,
        #                       ref_lengths=ref_length_encoding)

        # Only the predicted lengths are scored; the consensus sequences are discarded
        _, consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding)

        _, modal_consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding,
                                                         bayesian=False)

        print()
        print("PREDICTED\t", consensus_lengths[:10])
        print("TRUE\t\t", aligned_ref_lengths[:10])

        total_confusion += get_runlength_confusion(
            true_lengths=aligned_ref_lengths,
            predicted_lengths=consensus_lengths,
            max_length=10)

        total_modal_confusion += get_runlength_confusion(
            true_lengths=aligned_ref_lengths,
            predicted_lengths=modal_consensus_lengths,
            max_length=10)

    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)

    print("Bayes:", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_modal_confusion)

    print("No Bayes", accuracy)

    _save_log_confusion_plot(confusion_matrix=total_confusion,
                             plot_path=os.path.join(output_dir, "confusion.png"))

    _save_log_confusion_plot(confusion_matrix=total_modal_confusion,
                             plot_path=os.path.join(output_dir, "modal_confusion.png"))
Ejemplo n.º 23
0
def main():
    """
    Print the first read segments found in a 100-position window of the first contig.

    The reference and read FASTA files are run-length encoded and passed to
    align_as_RLE; the BAM it returns is then queried with get_read_segments
    for positions 100000 to 100100. For every key returned, the key and the
    first 10 entries of its sequence and lengths are printed.

    All input paths are hard-coded below.
    """
    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/guppy/subsampled/11-29/r94_ec_rad2.30x-30kb.fasta"

    # read_fasta_path = "/home/ryan/data/Nanopore/ecoli/runnie/v2/rad2_pass_runnie_0_1_10_11_12_13_v2.fa"
    # read_fasta_path = "/home/ryan/software/shasta/output/run_2019_3_23_14_29_ecoli_wg_guppy_NO_BAYES/Assembly.fasta"
    # read_fasta_path = "/home/ryan/software/shasta/output/run_2019_3_23_15_40_ecoli_wg_guppy_BAYES/Assembly.fasta"
    # read_fasta_path = "/home/ryan/data/Nanopore/ecoli/runnie/rad2_pass_runnie_0_v2.fa"

    # ---- TEST DATA ----
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_reads.fasta"
    # -------------------

    window_start = 100000
    window_length = 100

    output_dir = os.path.join(
        "output/",
        "runlength_matrix_from_sequence_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    def rle_output_path(input_path):
        # Input base name minus its last extension, suffixed with "_rle.fasta"
        stem = ".".join(os.path.basename(input_path).split(".")[:-1])
        return os.path.join(output_dir, stem + "_rle.fasta")

    runlength_ref_fasta_path = rle_output_path(ref_fasta_path)
    runlength_read_fasta_path = rle_output_path(read_fasta_path)

    sys.stderr.write("RL encoding fasta...\n")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(fasta_sequence_path=read_fasta_path)

    sys.stderr.write("Aligning RLE fasta...\n")

    read_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                        runlength_ref_sequences=runlength_ref_sequences,
                                        runlength_read_path=runlength_read_fasta_path,
                                        runlength_read_sequences=runlength_read_sequences,
                                        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Work on the first contig only
    chromosome_name = fasta_handler.get_contig_names()[0]

    print(fasta_handler.get_chr_sequence_length(chromosome_name))

    sequences, lengths = get_read_segments(fasta_handler=fasta_handler,
                                           bam_handler=bam_handler,
                                           chromosome_name=chromosome_name,
                                           pileup_start=window_start,
                                           pileup_end=window_start + window_length,
                                           runlength_ref_sequences=runlength_ref_sequences,
                                           read_data=runlength_read_sequences)

    # Show a short preview of every returned segment
    for read_key in sequences:
        print(read_key)
        print(sequences[read_key][:10])
        print(lengths[read_key][:10])
Ejemplo n.º 24
0
def main():
    """
    Run select_windows in parallel over every contig of the reference.

    For each contig except "NC_001328.1", the full reference sequence is
    loaded, the region from 1,000,000 past the start to 1,000,000 before the
    end is split into chunks of window_size with chunk_region, and the chunks
    are grouped by generate_argument_pools. Each group is then dispatched to
    a fresh multiprocessing Pool via starmap.

    A single multiprocessing Manager is shared by all contigs and shut down
    on exit; each contig gets its own counter Value from it. Input paths are
    hard-coded below.
    """
    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.filtered2820.bam"
    # reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"

    # ---- Nanopore GUPPY - E. Coli - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli_reads_vs_ref.bam"
    reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"

    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(reference_file_path)
    try:
        contig_names = fasta_handler.get_contig_names()
    finally:
        fasta_handler.close()

    # chromosome_name = "NC_003279.8"     # celegans chr1
    # chromosome_name = "NC_003283.11"     # celegans chr5

    # Settings shared by every contig
    max_threads = 30
    window_size = 10000
    min_size = 20
    max_size = 80
    region_margin = 1000000

    # One manager for the whole run, shut down when the block exits. Creating a
    # manager per contig and never shutting it down would leave a server
    # process behind for each contig.
    with multiprocessing.Manager() as manager:
        for chromosome_name in contig_names:
            if chromosome_name == "NC_001328.1":    # mitochondrial
                continue

            print("STARTING:", chromosome_name)
            fasta_handler = FastaHandler(reference_file_path)
            try:
                chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
                reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                                start=0,
                                                                stop=chromosome_length)
            finally:
                # Release the handle even if the sequence lookup fails
                fasta_handler.close()

            # NOTE(review): for contigs shorter than 2 * region_margin the start
            # exceeds the stop; how chunk_region treats that is not visible here.
            region = [region_margin, chromosome_length - region_margin]

            # Fresh counter for this contig, handed to the workers
            counter = manager.Value('i', 0)

            region_windows = chunk_region(region=region, size=window_size)

            n_chunks = len(region_windows)

            print("subregions: ", n_chunks)

            output_dir = "output/window_selection/" + "_".join([str(chromosome_name),
                                                                str(region[0]),
                                                                str(region[1]),
                                                                FileManager.get_datetime_string()])
            print(output_dir)

            pooled_args = generate_argument_pools(pool_size=max_threads,
                                                  bam_file_path=bam_file_path,
                                                  chromosome_name=chromosome_name,
                                                  region_windows=region_windows,
                                                  reference_sequence=reference_sequence,
                                                  min_size=min_size,
                                                  max_size=max_size,
                                                  output_dir=output_dir,
                                                  counter=counter,
                                                  n_chunks=n_chunks)

            for arg_pool in pooled_args:
                # Collect garbage, then run this group in a new worker pool
                gc.collect()
                with Pool(processes=max_threads) as pool:
                    pool.starmap(select_windows, arg_pool)

    print()
Ejemplo n.º 25
0
def main():
    """
    Print the first read segments found in a 100-position window of the first contig.

    Reads are loaded from a runlength file with RunlengthHandler and passed,
    with the run-length encoded reference, to align_as_RLE; the BAM it
    returns is then queried with get_read_segments for positions 100000 to
    100100. For every key returned, the key and the first 10 entries of its
    sequence, scales and shapes are printed.

    All input paths are hard-coded below.
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/test/rad2_pass_runnie_4_5_6_7.out"

    window_start = 100000
    window_length = 100

    output_dir = os.path.join(
        "output/",
        "runlength_matrix_from_runnie_output_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    def rle_output_path(input_path):
        # Input base name minus its last extension, suffixed with "_rle.fasta"
        stem = ".".join(os.path.basename(input_path).split(".")[:-1])
        return os.path.join(output_dir, stem + "_rle.fasta")

    runlength_ref_fasta_path = rle_output_path(ref_fasta_path)
    runlength_read_fasta_path = rle_output_path(runlength_path)

    handler = RunlengthHandler(runlength_path)
    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    # Index every read by its id
    read_data = {read.id: read for read in reads}

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_read_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Work on the first contig only
    chromosome_name = fasta_handler.get_contig_names()[0]

    # Looked up as in the original flow; the value is not consumed below.
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    sequences, scales, shapes = get_read_segments(fasta_handler=fasta_handler,
                                                  bam_handler=bam_handler,
                                                  chromosome_name=chromosome_name,
                                                  pileup_start=window_start,
                                                  pileup_end=window_start + window_length,
                                                  runlength_ref_sequences=runlength_ref_sequences,
                                                  read_data=read_data)

    # Show a short preview of every returned segment
    for read_key in sequences:
        print(read_key)
        print(sequences[read_key][:10])
        print(scales[read_key][:10])
        print(shapes[read_key][:10])
Ejemplo n.º 26
0
def main():
    """Plot a histogram of run lengths for each key of the run-length counts.

    Reads one chromosome from the hard-coded reference FASTA, passes the full
    sequence to ``count_runlength_per_character`` and draws one bar chart per
    key of the returned mapping (stacked subplots sharing both axes).  The
    bin edges and frequencies of every histogram are also printed.

    NOTE(review): this assumes ``count_runlength_per_character`` returns a
    mapping of key -> non-empty collection of integer run lengths; confirm
    against its definition.
    """
    # output_root_dir = "output/"
    # instance_dir = "spoa_pileup_generation_" + get_current_timestamp()
    # output_dir = os.path.join(output_root_dir, instance_dir)

    # ---- Illumina (laptop) --------------------------------------------------
    # bam_file_path = "/Users/saureous/data/Platinum/chr1.sorted.bam"
    # reference_file_path = "/Users/saureous/data/Platinum/chr1.fa"
    # vcf_path = "/Users/saureous/data/Platinum/NA12878_S1.genome.vcf.gz"
    # bed_path = "/Users/saureous/data/Platinum/chr1_confident.bed"

    # ---- GIAB (dev machine) -------------------------------------------------
    # bam_file_path = "/home/ryan/data/GIAB/NA12878_GIAB_30x_GRCh37.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh37_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh37.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"

    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(reference_file_path)

    chromosome_name = "NC_003279.8"  # celegans chr1
    # chromosome_name = "NC_003283.11"     # celegans chr5
    # chromosome_name = "1"
    # chromosome_name = "chr" + chromosome_name

    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    reference_sequence = fasta_handler.get_sequence(
        chromosome_name=chromosome_name, start=0, stop=chromosome_length)

    character_counts = count_runlength_per_character(reference_sequence)

    # squeeze=False always yields a 2-D array of Axes, so indexing works even
    # when there is only a single key (nrows == 1 would otherwise return a
    # bare Axes object that cannot be subscripted).
    figure, axes = pyplot.subplots(nrows=len(character_counts),
                                   sharex=True,
                                   sharey=True,
                                   squeeze=False)

    step = 1

    for axis, (key, counts) in zip(axes[:, 0], character_counts.items()):
        max_count = max(counts)

        # One extra edge past max_count: numpy's last bin is closed on the
        # right, so without it the values equal to max_count would be merged
        # into the (max_count - 1) bar.
        bins = numpy.arange(0, max_count + 2 * step, step=step)

        # The removed `normed` keyword is intentionally not passed; the
        # default already returns raw counts.
        frequencies, bins = numpy.histogram(counts, bins=bins)

        print(bins)
        print(frequencies)

        print(bins.shape)

        # Each bin [v, v + step) is drawn as a bar centred on its left edge v,
        # which is the run length it represents.
        center = bins[:-1]

        axis.bar(center, frequencies, width=step, align="center")
        axis.set_ylabel(str(key))
        axis.set_xticks(numpy.arange(0, max_count + 1))

    pyplot.show()