Esempio n. 1
0
def save_directional_frequency_matrices_as_delimited_text(
        output_dir,
        frequency_matrices,
        delimiter=",",
        log_normalize=False,
        plot=False):
    """
    Write every (direction, base) matrix to one timestamped delimited text file.

    For each of the 2 entries on the first axis (suffix "F" for index 0, "R" for index 1) and each of the 4 base
    indexes, a section is written: a header line ">BASE_SUFFIX", one delimited line per matrix row, then a
    blank line.

    :param output_dir: directory in which the output file is written
    :param frequency_matrices: array indexed as [reversal, base_index, row, column]
    :param delimiter: string placed between the values of a row
    :param log_normalize: if True each matrix is passed through normalize(matrix, pseudocount=15), values are
    written as floats and the filename starts with "probability_matrices_directional_"; otherwise values are cast
    to int and the filename starts with "frequency_matrices_directional_"
    :param plot: if True each matrix is displayed with pyplot.imshow before it is written
    :return:
    """
    if log_normalize:
        filename_prefix = "probability_matrices_directional_"
        cast = float
    else:
        filename_prefix = "frequency_matrices_directional_"
        cast = int

    filename = filename_prefix + FileManager.get_datetime_string() + ".csv"
    output_path = os.path.join(output_dir, filename)

    reversal_suffixes = ["F", "R"]

    # The context manager guarantees the handle is closed even if normalizing, plotting or casting raises
    with open(output_path, "w") as file:
        for reversal, suffix in enumerate(reversal_suffixes):
            for base_index in range(4):
                base = INDEX_TO_BASE[base_index]

                matrix = numpy.squeeze(frequency_matrices[reversal, base_index, :, :])

                if log_normalize:
                    matrix = normalize(matrix, pseudocount=15)

                if plot:
                    pyplot.imshow(matrix)
                    pyplot.show()
                    pyplot.close()

                file.write(">" + "_".join([base, suffix]) + "\n")

                for row in matrix:
                    file.write(delimiter.join(str(cast(x)) for x in row) + "\n")

                file.write("\n")
Esempio n. 2
0
def main(max_threads=None):
    """
    Process every item produced by arg_iterator in a worker pool, then merge the resulting .fasta files.

    A timestamped output directory is created under "output/version_comparison/mode/". After the pool has
    finished, all ".fasta" files found in that directory are concatenated into one file named after the
    runlength input, and the individual files are deleted.

    :param max_threads: number of worker processes; when None, cpu_count - 2 is used (but never less than 1)
    :return:
    """
    # Alternative input: "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_runnie_0.out"
    runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_runnie_0_1_10_11_12_13.out"

    output_dir = os.path.join(
        "output/version_comparison/mode/",
        "runlength_matrix_from_assembly_contigs_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    handler = RunlengthHandler(runlength_path)

    n_processes = max_threads
    if n_processes is None:
        n_processes = max(1, multiprocessing.cpu_count()-2)

    with multiprocessing.Pool(processes=n_processes) as pool:
        jobs = arg_iterator(handler=handler, output_dir=output_dir)

        # Results are only counted, to show progress on a single console line
        for n_done, _ in enumerate(pool.imap(arg_unpacker, jobs)):
            sys.stdout.write("\r%d" % n_done)
    print()

    print("Concatenating files...")
    partial_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=output_dir,
                                                           file_extension=".fasta")

    merged_name = os.path.basename(runlength_path).split(".")[0] + ".fasta"
    merged_path = os.path.join(output_dir, merged_name)

    print("Saving to file: %s" % merged_path)

    FileManager.concatenate_files(file_paths=partial_paths, output_file_path=merged_path)
    FileManager.delete_files(partial_paths)
Esempio n. 3
0
def run_batch_training_from_tuples():
    """
    Collect all ".pkl" files under the configured directories, train a model on their counts and pickle it.

    :return:
    """
    # Earlier runs used six directories named 2018_10_20_13_47_* under
    # /home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/
    chr_paths = ["output/joint_runlength_base_model/2018_11_12_14_23_56_638745/"]

    trainer = JointClassifierTrainer()

    pickle_paths = list()
    for directory in chr_paths:
        pickle_paths += FileManager.get_all_file_paths_by_type(parent_directory_path=directory,
                                                               file_extension=".pkl")

    distribution = trainer.train_model(trainer.get_counts_from_tuples(paths=pickle_paths))

    save_dir = "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/distribution/"
    save_name = "distribution_" + FileManager.get_datetime_string()

    print("\nSAVING: ", os.path.join(save_dir, save_name))

    FileManager.save_object_pickle(object=distribution, filename=save_name, output_dir=save_dir)
Esempio n. 4
0
def main():
    """
    Make a synthetic reference and a set of observations and save them under "data/".

    Two timestamped files are written: the observations as a tab-separated file
    ("synthetic_coverage_data_marginpolish_<datetime>.tsv") and the reference as a single-record FASTA with the
    header ">ref_0" ("synthetic_coverage_data_marginpolish_<datetime>_ref.fasta").
    :return:
    """
    output_dir = "data/"
    FileManager.ensure_directory_exists(output_dir)

    n_coverage = 2

    ref_max_runlength = 50
    read_max_runlength = 50

    ref_sequence, observations = generate_sequences(
        ref_max_runlength=ref_max_runlength,
        read_max_runlength=read_max_runlength,
        n_coverage=n_coverage,
        scale_coverage=True)

    # Both files share one timestamp so that they can be paired up later
    datetime_string = FileManager.get_datetime_string()
    filename = "synthetic_coverage_data_marginpolish_" + datetime_string + ".tsv"
    output_path = os.path.join(output_dir, filename)

    # newline="" is what the csv module expects; without it Windows output gets "\r\r\n" line endings.
    # The context manager closes the handle even if a row fails to serialize.
    with open(output_path, "w", newline="") as file:
        writer = csv.writer(file, delimiter="\t")
        writer.writerows(observations)

    filename = "synthetic_coverage_data_marginpolish_" + datetime_string + "_ref.fasta"
    output_path = os.path.join(output_dir, filename)

    with open(output_path, "w") as file:
        file.write(">ref_0\n")
        file.write(ref_sequence)
Esempio n. 5
0
def train_joint_model_from_tuples(tuples_path):
    """
    Load training tuples (with cutoff=16), train a model on them and pickle the resulting distribution.

    :param tuples_path: passed to load_training_tuples as the location of the training tuples
    :return:
    """
    tuples = load_training_tuples(tuples_path, cutoff=16)

    print("training tuples loaded: ", len(tuples))

    trained_distribution = train_model(data=tuples)

    FileManager.save_object_pickle(
        object=trained_distribution,
        filename="distribution_" + FileManager.get_datetime_string(),
        output_dir="/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/distribution/")
Esempio n. 6
0
def process_bam(bam_path, reference_path, output_dir=None):
    """
    Parse the alignments of every reference contig into inserts, deletes and mismatches and export them to CSV
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: parent directory of the timestamped "variants_<datetime>" subdirectory that receives the
    output; "variants/" is used when None
    :return:
    """
    print("\n" + bam_path)

    parent_dir = "variants/" if output_dir is None else output_dir

    # Everything produced by this call goes into its own timestamped subdirectory
    output_dir = os.path.join(parent_dir, "variants_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    contig_names = sort_chromosome_names(names=fasta_handler.get_contig_names(), prefix="chr")

    print("ref contig names:", contig_names)

    for contig_name in contig_names:
        print("Parsing alignments for ref contig:", contig_name)

        contig_length = fasta_handler.get_chr_sequence_length(contig_name)

        # Fetch reads over the full extent of the contig
        reads = bam_handler.get_reads(chromosome_name=contig_name, start=0, stop=contig_length)

        inserts, deletes, mismatches = parse_reads(reads=reads,
                                                   fasta_handler=fasta_handler,
                                                   chromosome_name=contig_name)

        export_variants_to_csv(output_dir=output_dir,
                               chromosome_name=contig_name,
                               mismatches=mismatches,
                               inserts=inserts,
                               deletes=deletes,
                               merge=True)
Esempio n. 7
0
def main():
    """
    Partition a list of read names into train/test, extract the matching runnie and fastq reads for each
    partition, then run the runnie/fastq intersection on each extracted pair.

    :return:
    """
    output_dir = "output/" + "read_names_" + FileManager.get_datetime_string()
    output_path = os.path.join(output_dir, "read_names.txt")
    FileManager.ensure_directory_exists(output_dir)

    # STEP 1
    # Find union of read names within runnie and fastq files
    # (disabled: find_intersection_of_runnie_and_fastq on the two full inputs; a saved result is loaded in STEP 2)
    fastq_path = "/home/ryan/data/Nanopore/ecoli/guppy/r94_ec_guppy_rad2.fastq"
    runnie_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_all.out"

    # STEP 2
    # Split sequence names into train/test partition
    name_intersection_path = "/home/ryan/code/runlength_analysis/output/read_names_2019_3_26_11_50_guppy_runnie_intersection/read_names.txt"
    names_train, names_test = partition_names(read_names_from_file(name_intersection_path))

    # STEP 3
    # Extract names and write to files (runnie first, then fastq, for each partition)
    subset_paths = dict()
    for suffix, subset_names in (("train", names_train), ("test", names_test)):
        runnie_subset_path = extract_runnie_reads_by_name(runnie_path=runnie_path,
                                                          output_dir=output_dir,
                                                          output_filename_suffix=suffix,
                                                          names=subset_names)

        fastq_subset_path = extract_fastq_reads_by_name(fastq_path=fastq_path,
                                                        output_dir=output_dir,
                                                        output_filename_suffix=suffix,
                                                        names=subset_names)

        subset_paths[suffix] = (fastq_subset_path, runnie_subset_path)

    # STEP 4
    # Verify (both runs write to the same output_path)
    for suffix in ("train", "test"):
        fastq_subset_path, runnie_subset_path = subset_paths[suffix]

        name_intersection_path = find_intersection_of_runnie_and_fastq(output_path=output_path,
                                                                       fastq_path=fastq_subset_path,
                                                                       runnie_path=runnie_subset_path)
Esempio n. 8
0
def run_generate_tuples_from_pileups():
    """
    Build one argument list per data directory and run generate_training_data on them in a process pool.

    The pool size is the number of argument lists, capped at 6.

    :return:
    """
    max_threads = 6

    # An earlier run used one directory per C. elegans sequence under
    # /home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/ :
    #   NC_003279.8 (chromosome I), NC_003280.10 (II), NC_003281.10 (III), NC_003282.8 (IV),
    #   NC_003283.11 (V), NC_003284.9 (X); NC_001328.1 is the mitochondrion
    data_path = ["/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-11-12-14-8-24-0-316/gi"]

    batch_size = 1

    args = []
    for directory in data_path:
        gap_filterer = GapFilterer()

        npz_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=directory,
                                                           file_extension=".npz")

        data_loader = DataLoader(npz_paths, batch_size=batch_size, parse_batches=False)

        consensus_caller = ConsensusCaller(sequence_to_index=sequence_to_index,
                                           sequence_to_float=sequence_to_float)

        output_dir = "output/joint_runlength_base_model/" + FileManager.get_datetime_string()

        # The last path component labels the output of this directory
        filename_suffix = directory.rsplit("/", 1)[-1]
        print(filename_suffix)

        args.append([data_loader, batch_size, consensus_caller, output_dir, filename_suffix, gap_filterer])

        # Drop the local reference before collecting; the object stays alive inside args
        del gap_filterer

        gc.collect()

    n_threads = min(len(args), max_threads)

    for arg in args:
        print(arg)
    print(n_threads)

    with Pool(processes=n_threads) as pool:
        pool.starmap(generate_training_data, args)
Esempio n. 9
0
def plot_joint_distribution(distribution, save=False):
    """
    Collapse a joint distribution into one (true length x observed length) matrix per base and plot each one.

    Keys of the distribution are looked up as ((observed_base, observed_repeat), (true_base, true_repeat)) for
    repeats 0..50. The value is added at [true_repeat, observed_repeat] of:
      - the true base's matrix, when the true base is not "-"
      - the observed base's matrix, when only the true base is "-"
      - all four of A, G, T, C, when both are "-"

    :param distribution: mapping from the key tuples described above to a numeric value
    :param save: if True the per-base matrices are also passed to save_numpy_matrices
    :return:
    """
    max_runlength = 50
    size = max_runlength + 1

    base_distributions = defaultdict(lambda: numpy.zeros([size, size]))

    print(len(distribution))

    symbols = ["A", "G", "T", "C", "-"]

    for true_base in symbols:
        for observed_base in symbols:
            for observed_repeat in range(size):
                for true_repeat in range(size):
                    key = ((observed_base, observed_repeat), (true_base, true_repeat))

                    if key not in distribution:
                        continue

                    probability = distribution[key]

                    # Decide which per-base matrices receive this entry
                    if true_base != "-":
                        targets = [true_base]
                    elif observed_base != "-":
                        targets = [observed_base]
                    else:
                        targets = ["A", "G", "T", "C"]

                    for target in targets:
                        base_distributions[target][true_repeat, observed_repeat] += probability

    for base, matrix in base_distributions.items():
        axes = pyplot.axes()

        # NOTE(review): the normalized matrix is never used; the plot below shows log10 of the raw
        # accumulated values. Confirm which of the two was meant to be displayed.
        base_distribution = normalize_frequency_matrix(matrix, log_scale=True)

        pyplot.title(base + ":" + base + " Log probabilities")
        pyplot.imshow(numpy.log10(matrix))

        axes.set_xlabel("Observed length")
        axes.set_ylabel("True length")
        pyplot.show()
        pyplot.close()

    if save:
        output_dir = "/home/ryan/code/nanopore_assembly/models/parameters/"
        filename = "runlength_frequency_matrices_per_base_" + FileManager.get_datetime_string()

        print("SAVING: ", output_dir + filename)

        save_numpy_matrices(output_dir=output_dir, filename=filename, matrices=base_distributions)
Esempio n. 10
0
def write_chromosomal_summary_data_to_csv(summary_headers,
                                          summary_data,
                                          output_dir,
                                          sample_name=None):
    """
    Write a header row followed by the summary rows to "aggregate_summary_<sample_name>.csv"

    :param summary_headers: sequence of column names, written as the first row
    :param summary_data: iterable of rows, each written as one CSV line
    :param output_dir: directory in which the file is written
    :param sample_name: label used in the filename; the current datetime string is used when None
    :return:
    """
    if sample_name is None:
        sample_name = FileManager.get_datetime_string()

    filename = "aggregate_summary_" + sample_name + ".csv"
    file_path = os.path.join(output_dir, filename)

    print("Saving aggregate data to: %s" % os.path.abspath(file_path))

    # newline="" lets the csv writer control line endings; without it Windows output gets "\r\r\n"
    with open(file_path, "w", newline="") as file:
        writer = csv.writer(file)

        writer.writerow(summary_headers)
        writer.writerows(summary_data)
Esempio n. 11
0
def write_joint_distribution_to_file(distribution, output_dir):
    """
    Write a joint distribution to a timestamped tab-separated file, one line per key in sorted key order.

    Each key is indexed as pair[0][0], pair[0][1], pair[1][0], pair[1][1]; those four fields and the value
    stored under the key form the five columns of a line.

    :param distribution: mapping from nested 2x2 key tuples to a value
    :param output_dir: directory that receives the file; created if it does not exist
    :return: path of the file that was written
    """
    FileManager.ensure_directory_exists(output_dir)

    filename = "joint_distribution" + "_" + FileManager.get_datetime_string() + ".tsv"
    path = os.path.join(output_dir, filename)

    # newline="" lets the csv writer control line endings; without it Windows output gets "\r\r\n"
    with open(path, 'w', newline="") as file:
        writer = csv.writer(file, delimiter="\t")

        for pair in sorted(distribution):
            (first_a, first_b), (second_a, second_b) = pair[0][:2], pair[1][:2]

            writer.writerow([first_a, first_b, second_a, second_b, distribution[pair]])

    return path
Esempio n. 12
0
def main():
    """
    Align runnie reads to a run-length encoded reference and plot the pileup of one fixed window.

    All reads of the runlength file are loaded into memory keyed by read id, the reference is run-length encoded,
    both are aligned with align_as_RLE, and the aligned segments requested for positions 6000 to 6050 of the
    first contig are converted to arrays and passed to plot_runlength_pileup.

    :return:
    """
    # Alternative (synthetic) inputs:
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    pileup_start = 6000
    pileup_end = 6050

    output_parent_dir = "output/"
    output_dir = "runlength_pileup_test_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output names reuse the input basenames with the final extension replaced by "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)

    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    read_data = dict()

    for read in reads:
        read_data[read.id] = read

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]

    # NOTE(review): the length is not used below; the call is kept in case it has side effects -- confirm
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
        get_aligned_segments(fasta_handler=fasta_handler,
                             bam_handler=bam_handler,
                             chromosome_name=chromosome_name,
                             pileup_start=pileup_start,
                             pileup_end=pileup_end,
                             runlength_ref_sequences=runlength_ref_sequences,
                             read_data=read_data)

    sequence_encoding = list()
    scale_encoding = list()
    shape_encoding = list()
    modes_encoding = list()

    print(len(aligned_sequences.keys()))

    print("REF\t", "".join(aligned_ref_sequence))
    for read_id in aligned_sequences.keys():
        print("READ\t%s\t%s" % (read_id, "".join(aligned_sequences[read_id])))
        sequence_encoding.append(list(map(get_encoding, aligned_sequences[read_id])))
        scale_encoding.append(aligned_scales[read_id])
        shape_encoding.append(aligned_shapes[read_id])
        modes_encoding.append(list(map(map_parameters_to_mode, zip(aligned_scales[read_id], aligned_shapes[read_id]))))

    # The numpy.float alias was removed in NumPy 1.24; the builtin float selects the same float64 dtype
    sequence_encoding = -numpy.array(sequence_encoding, dtype=float)
    scale_encoding = numpy.array(scale_encoding, dtype=float)
    shape_encoding = numpy.array(shape_encoding, dtype=float)
    modes_encoding = numpy.array(modes_encoding, dtype=float)

    plot_runlength_pileup(sequences=sequence_encoding,
                          scales=scale_encoding,
                          shapes=shape_encoding,
                          modes=modes_encoding)
Esempio n. 13
0
def main():
    """
    Predict run lengths from runnie pileups with two classifiers and compare both against the reference.

    Reads are aligned to a run-length encoded reference. For windows 10 to 19 (chunks of 1000) of the first
    contig, a consensus is computed once from the modal lengths (RunlengthClassifier) and once from the scale and
    shape parameters (WeibullRunlengthClassifier). Each window's pileup is plotted, confusion matrices against
    the reference lengths are accumulated over all windows, and finally both accuracies are printed and both
    confusion matrices are shown and saved as PNG in the output directory.

    A window that raises any exception is reported with print and skipped.

    :return:
    """
    # Alternative (synthetic) inputs:
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runnie_test_2019_4_8_14_33_30_333396_ref.fasta"
    # runlength_path = "/home/ryan/code/runlength_analysis/data/synthetic_runnie_test_2019_4_8_14_33_30_333396_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    # WG ecoli 60x
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_WG_train_60x_guppy_2019_4_23/probability_matrices_2019_4_23_15_9_14_837893.csv"
    raw_matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_WG_train_60x_guppy_2019_4_23/frequency_matrices_2019_4_23_15_9_14_833128.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_prediction_from_runnie_output_" + FileManager.get_datetime_string(
    )
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output names reuse the input basenames with the final extension replaced by "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(
        os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir,
                                            runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(
        os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(
        output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)

    reads = handler.iterate_file(sequence_cutoff=sys.maxsize,
                                 print_status=True)
    read_data = dict()

    for read in reads:
        read_data[read.id] = read

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(
        fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(
        runlength_reference_path=runlength_ref_fasta_path,
        runlength_ref_sequences=runlength_ref_sequences,
        runlength_read_path=runlength_assembly_fasta_path,
        runlength_read_sequences=read_data,
        output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length,
                                           chunk_size=1000)

    # Empty inputs give the starting matrices that the per-window results are added to
    total_confusion = get_runlength_confusion([], [], 10)
    total_confusion_weibull = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)
    # length_classifier_weibull = WeibullRunlengthClassifier(matrix_path)
    length_classifier_weibull = WeibullRunlengthClassifier(
        raw_matrix_path, normalize_matrix=True, pseudocount=0.05)

    print("reading BAM")
    for pileup_start, pileup_end in windows[10:20]:
        sys.stderr.write("\r%s" % pileup_start)
        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=read_data)

        sequence_encoding = list()
        scale_encoding = list()
        shape_encoding = list()
        modes_encoding = list()
        reversal_encoding = list()

        # No reads here?
        if len(aligned_sequences) == 0:
            continue

        try:
            # print("REF\t", "".join(aligned_ref_sequence))
            for read_id in aligned_sequences.keys():
                # print("READ\t%s\t%s" % (read_id,"".join(aligned_sequences[read_id])))
                sequence_encoding.append(
                    list(map(get_encoding, aligned_sequences[read_id])))
                scale_encoding.append(aligned_scales[read_id])
                shape_encoding.append(aligned_shapes[read_id])
                modes_encoding.append(
                    list(
                        map(
                            map_parameters_to_mode,
                            zip(aligned_scales[read_id],
                                aligned_shapes[read_id]))))
                reversal_encoding.append(reversal_statuses[read_id])

            ref_sequence_encoding = [
                list(map(get_encoding, aligned_ref_sequence))
            ]
            ref_lengths_encoding = [aligned_ref_lengths]

            # The numpy.int / numpy.float / numpy.bool aliases were removed in NumPy 1.24. Because this code
            # sits inside the broad except below, the resulting AttributeError would silently skip every
            # window. The builtin types select the same dtypes.
            ref_sequence_encoding = numpy.atleast_2d(
                numpy.array(ref_sequence_encoding, dtype=int))
            ref_length_encoding = numpy.atleast_2d(
                numpy.array(ref_lengths_encoding, dtype=int))
            sequence_encoding = numpy.atleast_2d(
                numpy.array(sequence_encoding, dtype=int))
            scale_encoding = numpy.atleast_2d(
                numpy.array(scale_encoding, dtype=float))
            shape_encoding = numpy.atleast_2d(
                numpy.array(shape_encoding, dtype=float))
            modes_encoding = numpy.atleast_2d(
                numpy.array(modes_encoding, dtype=int))
            reversal_encoding = numpy.array(reversal_encoding,
                                            dtype=bool)

            consensus_sequence, consensus_lengths = \
                get_consensus_from_modal_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=modes_encoding,
                                                         reversal_encoding=reversal_encoding)

            weibull_consensus_sequence, weibull_consensus_lengths = \
                get_consensus_from_weibull_pileup_encoding(length_classifier=length_classifier_weibull,
                                                           sequence_encoding=sequence_encoding,
                                                           scale_encoding=scale_encoding,
                                                           shape_encoding=shape_encoding,
                                                           reversal_encoding=reversal_encoding)

            plot_runlength_pileup(
                sequences=-sequence_encoding,
                scales=scale_encoding,
                shapes=shape_encoding,
                modes=modes_encoding,
                ref_sequence=-ref_sequence_encoding,
                ref_lengths=ref_length_encoding,
                predicted_sequence=-numpy.atleast_2d(
                    numpy.array(weibull_consensus_sequence, dtype=int)),
                predicted_lengths=numpy.atleast_2d(
                    numpy.array(weibull_consensus_lengths, dtype=int)))

            print()
            print("PREDICTED\t", weibull_consensus_lengths[:10])
            print("TRUE\t\t", aligned_ref_lengths[:10])

            confusion = get_runlength_confusion(
                true_lengths=aligned_ref_lengths,
                predicted_lengths=consensus_lengths,
                max_length=10)

            confusion_weibull = get_runlength_confusion(
                true_lengths=aligned_ref_lengths,
                predicted_lengths=weibull_consensus_lengths,
                max_length=10)

            total_confusion += confusion
            total_confusion_weibull += confusion_weibull

        except Exception as e:
            # Best effort: report the failure for this window and move on to the next one
            print(e)
            continue
    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)

    print("Modal: ", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_confusion_weibull)

    print("Full: ", accuracy)

    # Show and save both confusion matrices the same way, modal first
    confusion_plots = [("confusion.png", total_confusion),
                       ("confusion_weibull.png", total_confusion_weibull)]

    for plot_filename, confusion_matrix in confusion_plots:
        plot_path = os.path.join(output_dir, plot_filename)

        figure = pyplot.figure()

        axes = pyplot.axes()
        axes.set_xlabel("Predicted")
        axes.set_ylabel("True")

        pyplot.imshow(numpy.log10(confusion_matrix))
        pyplot.show()
        figure.savefig(plot_path)

        pyplot.close()
Esempio n. 14
0
def main():
    """Compare default and ``bayesian=False`` run-length consensus calls.

    Run-length encodes the hard-coded reference and read files, aligns them
    with ``align_as_RLE`` and then, for the first ten windows returned by
    ``chunk_chromosome_coordinates(chunk_size=1000)`` on the first contig,
    builds a pileup and calls ``get_consensus_from_runlength_pileup_encoding``
    twice: once with its defaults and once with ``bayesian=False``. Each call
    is scored against the aligned reference run lengths and accumulated into
    its own confusion matrix. Both accuracies are printed and both matrices
    are shown and saved as PNG files in a timestamped directory under
    ``output/``.
    """
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_reads.fasta"
    # matrix_path = "/home/ryan/code/runnie_parser/output/runlength_matrix_from_assembly_contigs_2019_3_19_13_29_14_657613/probability_matrices_2019_3_19_13_29_19_362916.csv"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/code/runlength_analysis/data/sequence_subset_ecoli_guppy-runnie_60x_test.fastq"
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_sequence_2019_4_5_15_29_28_403950/probability_matrices_2019_4_5_15_35_57_920301.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_matrix_from_sequence_" + FileManager.get_datetime_string(
    )
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output names are the input file names with the last extension replaced
    # by "_rle.fasta".
    ref_fasta_filename_prefix = ".".join(
        os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir,
                                            runlength_ref_fasta_filename)

    read_fasta_filename_prefix = ".".join(
        os.path.basename(read_fasta_path).split(".")[:-1])
    runlength_read_fasta_filename = read_fasta_filename_prefix + "_rle.fasta"
    runlength_read_fasta_path = os.path.join(output_dir,
                                             runlength_read_fasta_filename)

    runlength_ref_sequences = runlength_encode_fasta(
        fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(
        fasta_sequence_path=read_fasta_path)

    read_vs_ref_bam_path = align_as_RLE(
        runlength_reference_path=runlength_ref_fasta_path,
        runlength_ref_sequences=runlength_ref_sequences,
        runlength_read_path=runlength_read_fasta_path,
        runlength_read_sequences=runlength_read_sequences,
        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig is evaluated.
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length,
                                           chunk_size=1000)

    # Initialize empty confusion matrices
    total_confusion = get_runlength_confusion([], [], 10)
    total_modal_confusion = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)

    print("reading BAM")
    for pileup_start, pileup_end in windows[:10]:
        print("window", pileup_start, pileup_end)

        sys.stderr.write("\r%s" % pileup_start)
        (aligned_ref_sequence, aligned_ref_lengths, aligned_sequences,
         aligned_lengths, reversal_statuses) = get_aligned_segments(
             fasta_handler=fasta_handler,
             bam_handler=bam_handler,
             chromosome_name=chromosome_name,
             pileup_start=pileup_start,
             pileup_end=pileup_end,
             runlength_ref_sequences=runlength_ref_sequences,
             read_data=runlength_read_sequences)

        sequence_encoding = list()
        length_encoding = list()
        reversal_encoding = list()

        # No reads here?
        if len(aligned_sequences) == 0:
            continue

        # One pileup row per aligned read.
        for read_id in aligned_sequences.keys():
            sequence_encoding.append(
                list(map(get_encoding, aligned_sequences[read_id])))
            length_encoding.append(aligned_lengths[read_id])
            reversal_encoding.append(reversal_statuses[read_id])

        ref_sequence_encoding = [list(map(get_encoding, aligned_ref_sequence))]
        ref_lengths_encoding = [aligned_ref_lengths]

        # The builtin int/float/bool are used as dtypes: the numpy.int,
        # numpy.float and numpy.bool aliases (which were these same builtins)
        # no longer exist in NumPy >= 1.24.
        ref_sequence_encoding = numpy.array(ref_sequence_encoding, dtype=int)
        ref_length_encoding = numpy.array(ref_lengths_encoding, dtype=int)
        sequence_encoding = numpy.array(sequence_encoding, dtype=int)
        length_encoding = numpy.array(length_encoding, dtype=float)
        reversal_encoding = numpy.array(reversal_encoding, dtype=bool)

        ref_sequence_encoding = numpy.atleast_2d(ref_sequence_encoding)
        ref_length_encoding = numpy.atleast_2d(ref_length_encoding)
        sequence_encoding = numpy.atleast_2d(sequence_encoding)
        length_encoding = numpy.atleast_2d(length_encoding)

        # plot_runlength_pileup(sequences=-sequence_encoding,
        #                       lengths=length_encoding,
        #                       ref_sequence=-ref_sequence_encoding,
        #                       ref_lengths=ref_length_encoding)

        consensus_sequence, consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(
                length_classifier=length_classifier,
                sequence_encoding=sequence_encoding,
                length_encoding=length_encoding,
                reversal_encoding=reversal_encoding)

        modal_consensus_sequence, modal_consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(
                length_classifier=length_classifier,
                sequence_encoding=sequence_encoding,
                length_encoding=length_encoding,
                reversal_encoding=reversal_encoding,
                bayesian=False)

        print()
        print("PREDICTED\t", consensus_lengths[:10])
        print("TRUE\t\t", aligned_ref_lengths[:10])

        confusion = get_runlength_confusion(
            true_lengths=aligned_ref_lengths,
            predicted_lengths=consensus_lengths,
            max_length=10)

        total_confusion += confusion

        modal_confusion = get_runlength_confusion(
            true_lengths=aligned_ref_lengths,
            predicted_lengths=modal_consensus_lengths,
            max_length=10)

        total_modal_confusion += modal_confusion

    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)

    print("Bayes:", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_modal_confusion)

    print("No Bayes", accuracy)

    # Show and save both matrices the same way, on a log10 colour scale.
    plots = (("confusion.png", total_confusion),
             ("modal_confusion.png", total_modal_confusion))

    for plot_filename, confusion_matrix in plots:
        plot_path = os.path.join(output_dir, plot_filename)

        figure = pyplot.figure()
        axes = pyplot.axes()
        axes.set_xlabel("Predicted")
        axes.set_ylabel("True")

        pyplot.imshow(numpy.log10(confusion_matrix))
        pyplot.show()
        figure.savefig(plot_path)

        pyplot.close()
def generate_ngx_plot(assembly_contigs,
                      input_dir,
                      genome_size=None,
                      y_max=180,
                      title="NGx",
                      figure=None,
                      axes=None):
    """Draw one NGx step curve per assembly and save the figure as PNG and PDF.

    Args:
        assembly_contigs: mapping of assembly path -> iterable of contig
            records; ``contig[LENGTH]`` is read as the contig length.
            Assemblies are plotted in sorted path order, and contigs in the
            order given (presumably longest first - confirm with the caller).
        input_dir: its last path component is used as the output file prefix.
        genome_size: divisor that turns contig lengths into cumulative
            coverage; defaults to ``3.23 * 1000**3`` with a warning.
        y_max: only defaulted and reported here; it currently has no effect
            on the plot.
        title: axes title.
        figure: figure to save. If omitted it is created, or taken from
            ``axes`` when only ``axes`` is given.
        axes: axes to draw on. If omitted it is created, or taken from
            ``figure`` when only ``figure`` is given.

    The files are written to ``output/<prefix>_<datetime>.png`` and ``.pdf``.
    """
    # Parallel lists: entry i of colors/alphas/zorders styles samples[i].
    samples = [
        "03492", "03098", "02723", "02080", "02055", "01243", "01109", "00733",
        "24385", "24149", "24143", "CHM13", "hg38_no_alts"
    ]

    colors = [
        (175 / 256.0, 48 / 256.0, 51 / 256.0),  # red
        (224 / 256.0, 99 / 256.0, 58 / 256.0),  # orange
        (215 / 256.0, 219 / 256.0, 84 / 256.0),  # yellow
        (110 / 256.0, 170 / 256.0, 100 / 256.0),  # light green
        (80 / 256.0, 180 / 256.0, 150 / 256.0),  # green
        (100 / 256.0, 189 / 256.0, 197 / 256.0),  # green-blue
        (0 / 256.0, 170 / 256.0, 231 / 256.0),  # turquoise
        (51 / 256.0, 87 / 256.0, 182 / 256.0),  # blue
        (37 / 256.0, 36 / 256.0, 93 / 256.0),  # indigo
        (95 / 256.0, 51 / 256.0, 139 / 256.0),  # purple
        (200 / 256.0, 53 / 256.0, 93 / 256.0),  # pink
        (224 / 256.0, 99 / 256.0, 58 / 256.0),
        (110 / 256.0, 170 / 256.0, 100 / 256.0)
    ]

    alphas = [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 1.0, 0.3, 0.3, 0.3, 1.0, 1.0]
    zorders = [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1]

    # Optional display-name overrides keyed by sample name.
    labels = {}

    if genome_size is None:
        print("WARNING: genome_size unspecified, using human as default")
        genome_size = 3.23 * 1000**3

    if y_max is None:
        print("WARNING: y_max unspecified, using 180Mbp as default")
        y_max = 180

    # Fill in whichever of figure/axes the caller did not supply, so that a
    # partially specified pair no longer leaves one of them as None.
    if figure is None and axes is None:
        figure = pyplot.figure()
        axes = pyplot.axes()
    elif axes is None:
        axes = figure.gca()
    elif figure is None:
        figure = axes.figure

    # Effectively a solid line; identical for every curve.
    dashes = [1, 0, 1, 0]

    legend_names = list()
    for path, contigs in sorted(assembly_contigs.items(), key=lambda x: x[0]):
        print("Plotting assembly: %s" % path)

        # If several sample names occur in the path, the last one wins.
        sample_index = None
        lowered_path = path.lower()
        for index, name in enumerate(samples):
            if name.lower() in lowered_path:
                sample_index = index

        if sample_index is None:
            print("ERROR: color not found for %s" % path)
            sample_index = 0
            sample_name = os.path.basename(path).split(".")[0]
        else:
            sample_name = samples[sample_index]

        color = colors[sample_index]
        alpha = alphas[sample_index]
        zorder = zorders[sample_index]

        label = labels.get(sample_name, sample_name)

        x1 = 0
        y_prev = None

        x_coords = list()
        y_coords = list()

        for contig in contigs:
            y = contig[LENGTH]
            width = contig[LENGTH] / genome_size
            x2 = x1 + width

            # Vertical riser from the previous contig's height to this one.
            if y_prev is not None:
                x_coords.extend([x1, x1])
                y_coords.extend([y_prev, y])

            # Horizontal tread spanning this contig's share of the genome.
            x_coords.extend([x1, x2])
            y_coords.extend([y, y])

            x1 = x2
            y_prev = y

        if not y_coords:
            # Nothing to draw; previously this raised IndexError below.
            print("WARNING: no contigs found for %s, skipping" % path)
            continue

        # Drop the curve to zero at its right edge.
        if y_coords[-1] != 0:
            y_coords.append(0)
            x_coords.append(x_coords[-1])

        if "hifi" in path.lower():
            label = "Canu CCS"

        if "shasta" in path:
            label = "Shasta Nanopore"

        # Attach the label to the line itself so the legend cannot drift out
        # of step with the plotted lines; repeated labels are hidden.
        if label in legend_names:
            line_label = "_nolegend_"
        else:
            legend_names.append(label)
            line_label = label

        axes.plot(x_coords,
                  y_coords,
                  color=color,
                  alpha=alpha,
                  zorder=zorder,
                  dashes=dashes,
                  linewidth=0.6,
                  label=line_label)

    if legend_names:
        axes.legend()

    # Marker at 50% cumulative coverage.
    axes.axvline(0.5, linestyle="--", alpha=0.3, linewidth=0.7, zorder=-1)

    axes.set_title(title)
    axes.set_ylabel("Contig/scaffold size (Mbp)")
    axes.set_xlabel("Cumulative coverage")

    FileManager.ensure_directory_exists("output")

    output_dir = "output/"
    filename = input_dir.rstrip("/").split(
        "/")[-1] + "_" + FileManager.get_datetime_string()
    file_path = os.path.abspath(os.path.join(output_dir, filename))

    print("SAVING FIGURE: %s" % file_path)
    figure.savefig(file_path + ".png", dpi=300)
    figure.savefig(file_path + ".pdf", dpi=300)

    pyplot.close()
Esempio n. 16
0
def save_directional_frequency_matrices_as_delimited_text(
        output_dir,
        frequency_matrices,
        chromosome_name=None,
        delimiter=",",
        log_normalize=False,
        plot=False,
        pseudocount=1e-12,
        diagonal_bias=0,
        default_type=int):
    """Write the eight strand/base matrices to one delimited text file.

    Args:
        output_dir: directory that receives the file.
        frequency_matrices: array indexed as ``[reversal, base_index, :, :]``
            with ``reversal`` in (0, 1) and ``base_index`` in ``range(4)``.
        chromosome_name: optional name inserted into the file name.
        delimiter: separator placed between the values of a row.
        log_normalize: if true, each matrix is passed through ``normalize``
            before writing, values are written as ``float`` and the file name
            starts with ``probability_matrices_directional_`` instead of
            ``frequency_matrices_directional_``.
        plot: if true, show each matrix with ``pyplot.imshow``.
        pseudocount: forwarded to ``normalize``.
        diagonal_bias: forwarded to ``normalize``.
        default_type: callable applied to every value when ``log_normalize``
            is false.

    Each matrix is preceded by a ``>BASE_F`` or ``>BASE_R`` header line and
    followed by a blank line. The header always ends in " likelihood",
    whether or not ``log_normalize`` is set.
    """
    name_suffix = "" if chromosome_name is None else chromosome_name + "_"

    if log_normalize:
        filename_prefix = "probability_matrices_directional_"
    else:
        filename_prefix = "frequency_matrices_directional_"

    filename = filename_prefix + name_suffix + FileManager.get_datetime_string(
    ) + ".csv"

    reversal_suffixes = ["F", "R"]
    output_path = os.path.join(output_dir, filename)

    # The context manager closes the file even if normalising, plotting or
    # converting a value raises part-way through.
    with open(output_path, "w") as output_file:
        print("SAVING: %s" % output_path)

        for reversal in [0, 1]:
            suffix = reversal_suffixes[reversal]

            for base_index in range(4):
                base = INDEX_TO_BASE[base_index]

                matrix = numpy.squeeze(frequency_matrices[reversal,
                                                          base_index, :, :])

                value_type = default_type
                if log_normalize:
                    matrix = normalize(matrix,
                                       pseudocount=pseudocount,
                                       diagonal_bias=diagonal_bias)
                    value_type = float

                if plot:
                    pyplot.imshow(matrix)
                    pyplot.show()
                    pyplot.close()

                matrix_name = "_".join([base, suffix])
                output_file.write(">" + matrix_name + " likelihood\n")

                for r in range(matrix.shape[0]):
                    values = [str(value_type(x)) for x in matrix[r]]
                    output_file.write(delimiter.join(values) + "\n")

                output_file.write("\n")
Esempio n. 17
0
def save_nondirectional_frequency_matrices_as_delimited_text(
        output_dir,
        frequency_matrices,
        chromosome_name=None,
        delimiter=",",
        log_normalize=False,
        pseudocount=1e-12,
        diagonal_bias=0,
        plot=False,
        default_type=int,
        filename=None):
    """Write the four per-base matrices to one delimited text file.

    Args:
        output_dir: directory that receives the file.
        frequency_matrices: array indexed as ``[base_index, :, :]`` with
            ``base_index`` in ``range(4)``.
        chromosome_name: optional name inserted into the generated file name;
            ignored when ``filename`` is given.
        delimiter: separator placed between the values of a row.
        log_normalize: if true, each matrix is passed through ``normalize``
            before writing, values are written as ``float`` and the header
            gains a " likelihood" suffix.
        pseudocount: forwarded to ``normalize``.
        diagonal_bias: forwarded to ``normalize``.
        plot: if true, show each matrix with ``pyplot.imshow``.
        default_type: callable applied to every value when ``log_normalize``
            is false.
        filename: explicit output file name; when ``None`` a timestamped
            ``probability_matrices_*.csv`` or ``frequency_matrices_*.csv``
            name is generated.

    Each matrix is preceded by a ``>BASE`` header line and followed by a
    blank line.
    """
    if filename is None:
        name_suffix = "" if chromosome_name is None else chromosome_name + "_"

        if log_normalize:
            filename_prefix = "probability_matrices_"
        else:
            filename_prefix = "frequency_matrices_"

        filename = filename_prefix + name_suffix + FileManager.get_datetime_string(
        ) + ".csv"

    output_path = os.path.join(output_dir, filename)

    # The context manager closes the file even if normalising, plotting or
    # converting a value raises part-way through.
    with open(output_path, "w") as output_file:
        for base_index in range(4):
            base = INDEX_TO_BASE[base_index]

            matrix = numpy.squeeze(frequency_matrices[base_index, :, :])

            value_type = default_type
            if log_normalize:
                matrix = normalize(matrix,
                                   pseudocount=pseudocount,
                                   diagonal_bias=diagonal_bias)
                value_type = float

            if plot:
                pyplot.imshow(matrix)
                pyplot.show()
                pyplot.close()

            matrix_name = base

            if log_normalize:
                matrix_name += " likelihood"

            output_file.write(">" + matrix_name + "\n")

            for r in range(matrix.shape[0]):
                values = [str(value_type(x)) for x in matrix[r]]
                output_file.write(delimiter.join(values) + "\n")

            output_file.write("\n")
Esempio n. 18
0
def main():
    """Print the read segments overlapping one 100-position reference window.

    Run-length encodes the hard-coded reference and read FASTA files, aligns
    them with ``align_as_RLE``, fetches the read segments for positions
    100000-100100 of the first contig and prints, for every returned key, the
    key and the first ten entries of its sequence and of its lengths.
    """
    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/guppy/subsampled/11-29/r94_ec_rad2.30x-30kb.fasta"

    # Alternative read inputs used previously:
    # read_fasta_path = "/home/ryan/data/Nanopore/ecoli/runnie/v2/rad2_pass_runnie_0_1_10_11_12_13_v2.fa"
    # read_fasta_path = "/home/ryan/software/shasta/output/run_2019_3_23_14_29_ecoli_wg_guppy_NO_BAYES/Assembly.fasta"
    # read_fasta_path = "/home/ryan/software/shasta/output/run_2019_3_23_15_40_ecoli_wg_guppy_BAYES/Assembly.fasta"
    # read_fasta_path = "/home/ryan/data/Nanopore/ecoli/runnie/rad2_pass_runnie_0_v2.fa"

    # ---- TEST DATA ----
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_reads.fasta"
    # -------------------

    # Timestamped working directory under output/.
    output_dir = os.path.join(
        "output/",
        "runlength_matrix_from_sequence_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    # Each RLE file is named after its input with the final extension
    # replaced by "_rle.fasta".
    ref_stem = os.path.basename(ref_fasta_path).rpartition(".")[0]
    runlength_ref_fasta_path = os.path.join(output_dir,
                                            ref_stem + "_rle.fasta")

    read_stem = os.path.basename(read_fasta_path).rpartition(".")[0]
    runlength_read_fasta_path = os.path.join(output_dir,
                                             read_stem + "_rle.fasta")

    sys.stderr.write("RL encoding fasta...\n")

    runlength_ref_sequences = runlength_encode_fasta(
        fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(
        fasta_sequence_path=read_fasta_path)

    sys.stderr.write("Aligning RLE fasta...\n")

    read_vs_ref_bam_path = align_as_RLE(
        runlength_reference_path=runlength_ref_fasta_path,
        runlength_ref_sequences=runlength_ref_sequences,
        runlength_read_path=runlength_read_fasta_path,
        runlength_read_sequences=runlength_read_sequences,
        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig is inspected.
    chromosome_name = fasta_handler.get_contig_names()[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    print(chromosome_length)

    window_start = 100000
    sequences, lengths = get_read_segments(
        fasta_handler=fasta_handler,
        bam_handler=bam_handler,
        chromosome_name=chromosome_name,
        pileup_start=window_start,
        pileup_end=window_start + 100,
        runlength_ref_sequences=runlength_ref_sequences,
        read_data=runlength_read_sequences)

    for key in sequences:
        print(key)
        print(sequences[key][:10])
        print(lengths[key][:10])
Esempio n. 19
0
def run_base_frequency_matrix_generation_from_tuples(filter_mismatch=False):
    """Tally (true length, observed length) counts per base from pickled tuples.

    Every ``.pkl`` file below the hard-coded directories is loaded; each
    record's first element is unpacked as ``(observed_base, observed_length)``
    and its second as ``(true_base, true_length)``. Lengths are capped at 50
    and one count is added at ``[true_length, observed_length]`` in the
    matrix of the relevant base. The matrices are printed, saved with
    ``save_numpy_matrices`` and plotted.

    Args:
        filter_mismatch: accepted but not used by this function.
    """
    max_runlength = 50
    bases = ["A", "G", "T", "C"]

    # Earlier runs pointed at timestamped directories under
    # /home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/.
    directories = ["output/joint_runlength_base_model/2018_11_12_14_23_56_638745/"]

    all_paths = list()
    for directory in directories:
        all_paths.extend(
            FileManager.get_all_file_paths_by_type(parent_directory_path=directory,
                                                   file_extension=".pkl"))
        print(len(all_paths))

    # include 0 as a possible runlength
    matrix_size = max_runlength + 1
    frequency_matrices = {
        base: numpy.zeros([matrix_size, matrix_size])
        for base in bases
    }

    print("loaded paths: ", len(all_paths))

    # Index of the last file to process; sys.maxsize means "all of them".
    cutoff = sys.maxsize
    for p, path in enumerate(all_paths):
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # this at trusted files.
        with open(path, 'rb') as pickle_file:
            print(p)
            records = pickle.load(pickle_file)

        for record in records:
            observed_base, observed_length = record[0]
            true_base, true_length = record[1]

            observed_length = min(observed_length, max_runlength)
            true_length = min(true_length, max_runlength)

            if true_base != "-":
                targets = [true_base]
            elif observed_base != "-":
                # true side is a gap: credit the observed base
                targets = [observed_base]
            else:
                # add 0:0 counts to all bases
                targets = bases

            # prefer [y,x] convention, and it plots correctly
            for base in targets:
                frequency_matrices[base][true_length, observed_length] += 1

        if p == cutoff:
            break

    for base in bases:
        print(base)
        print(frequency_matrices[base])

    # plot_frequency_matrices(frequency_matrices)

    output_dir = "/home/ryan/code/nanopore_assembly/models/parameters/"
    filename = "runlength_frequency_matrices_per_base_" + FileManager.get_datetime_string()

    print("SAVING: ", output_dir+filename)

    save_numpy_matrices(output_dir=output_dir, filename=filename, matrices=frequency_matrices)

    # frequency_matrices = load_base_frequency_matrices(os.path.join(output_dir,filename+".npz"))

    plot_frequency_matrices(frequency_matrices)
Esempio n. 20
0
def main(output_dir="data/"):
    """Write a synthetic reference FASTA and a matching runnie-style read file.

    The reference is built from 30 repeats; in each repeat every base in the
    pool receives each run length from 8 down to 1 exactly once, visiting the
    bases in a freshly shuffled order per round. Twelve synthetic reads are
    then emitted, one line per reference run, holding the base plus the hex
    form of a (scale, shape) pair drawn at random from
    ``read_weibull_params()`` for that run length.

    Args:
        output_dir: directory that receives ``*_ref.fasta`` and
            ``*_runnie.out``.
    """
    file_stem = "synthetic_runnie_test_" + FileManager.get_datetime_string()
    runlength_reference_path = os.path.join(output_dir,
                                            file_stem + "_ref.fasta")
    runnie_output_path = os.path.join(output_dir, file_stem + "_runnie.out")

    modal_parameters = read_weibull_params()

    n_repeats = 30
    coverage = 12
    ref_max_runlength = 8
    base_pool = ["A", "T", "G", "C"]
    ref_sequence_name = "synthetic_ref_0"

    ref_runs = list()  # one string of repeated bases per run
    ref_lengths = list()
    ref_bases = list()

    for _ in range(n_repeats):
        # Lengths still to be handed out in this repeat; pop() takes the
        # longest remaining one first.
        remaining_lengths = {
            base: list(range(1, ref_max_runlength + 1))
            for base in base_pool
        }

        for _ in range(ref_max_runlength):
            base_order = copy(base_pool)
            random.shuffle(base_order)

            # Reshuffle until the first base differs from the previous run's
            # base, so two neighbouring runs never share a base.
            while ref_bases and base_order[0] == ref_bases[-1]:
                random.shuffle(base_order)

            for base in base_order:
                run_length = remaining_lengths[base].pop()

                ref_runs.append(base * run_length)
                ref_lengths.append(run_length)
                ref_bases.append(base)

    ref_sequence = "".join(ref_runs)

    read_output_lines = list()
    for read_index in range(coverage):
        read_output_lines.append("# synthetic_read_%d" % read_index)

        for run_length, base in zip(ref_lengths, ref_bases):
            scale, shape = random.choice(modal_parameters[run_length])

            hex_scale = scale.hex()
            hex_shape = shape.hex()

            # Column order is base, shape, scale.
            line = "\t".join(map(str, (base, hex_shape, hex_scale)))
            read_output_lines.append(line)

            print(line)

    print(ref_sequence)

    print("saving file:", runlength_reference_path)
    with open(runlength_reference_path, "w") as file:
        file.write(">" + ref_sequence_name + "\n")
        file.write(ref_sequence + "\n")

    print("saving file:", runnie_output_path)
    with open(runnie_output_path, "w") as file:
        for line in read_output_lines:
            file.write(line + "\n")
Esempio n. 21
0
def main():
    """Build, save and plot run-length matrices from runnie reads vs. a reference.

    Reads runnie output through ``RunlengthHandler`` (with
    ``sequence_cutoff=12``), converts every read to a (sequence, lengths)
    pair, aligns the reads to the run-length-encoded reference with
    ``align_as_RLE`` and passes the result to
    ``generate_runlength_frequency_matrix``. For every matrix returned, the
    directional and the complement-summed matrices are each written twice
    (raw and with ``log_normalize=True``), then the residual and base-matrix
    plots are produced.
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/output/guppy_vs_runnie_ecoli_rad2_train_test_sequences/runnie_subset_train_60x_10kb.out"

    # Timestamped working directory under output/.
    output_dir = os.path.join(
        "output/", "runlength_matrix_from_assembly_contigs_" +
        FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    # Each RLE file is named after its input with the final extension
    # replaced by "_rle.fasta".
    ref_stem = os.path.basename(ref_fasta_path).rpartition(".")[0]
    runlength_ref_fasta_path = os.path.join(output_dir,
                                            ref_stem + "_rle.fasta")

    assembly_stem = os.path.basename(runlength_path).rpartition(".")[0]
    runlength_assembly_fasta_path = os.path.join(output_dir,
                                                 assembly_stem + "_rle.fasta")

    handler = RunlengthHandler(runlength_path)

    # read id -> [sequence, lengths]
    read_runlength_sequences = dict()
    for read in handler.iterate_file(sequence_cutoff=12):
        data = read.data
        read_id = read.id

        sequence, lengths = RunlengthHandler.convert_runnie_data_to_rle_sequence(
            data)

        read_runlength_sequences[read_id] = [sequence, lengths]

    runlength_ref_sequences = runlength_encode_fasta(
        fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(
        runlength_reference_path=runlength_ref_fasta_path,
        runlength_ref_sequences=runlength_ref_sequences,
        runlength_assembly_path=runlength_assembly_fasta_path,
        runlength_assembly_sequences=read_runlength_sequences,
        output_dir=output_dir)

    chromosomal_matrices = generate_runlength_frequency_matrix(
        runlength_ref_sequence_path=runlength_ref_fasta_path,
        assembly_vs_ref_bam_path=assembly_vs_ref_bam_path,
        runlength_ref_sequences=runlength_ref_sequences,
        runlength_assembly_sequences=read_runlength_sequences)

    for matrix in chromosomal_matrices:
        # Raw counts first, then the log-normalised version.
        for log_normalize in (False, True):
            save_directional_frequency_matrices_as_delimited_text(
                output_dir=output_dir,
                frequency_matrices=matrix,
                log_normalize=log_normalize,
                plot=False)

        nondirectional_matrix = sum_complementary_matrices(matrix)

        for log_normalize in (False, True):
            save_nondirectional_frequency_matrices_as_delimited_text(
                output_dir=output_dir,
                frequency_matrices=nondirectional_matrix,
                log_normalize=log_normalize,
                plot=False)

        # zero_mask = (matrix == 0)
        # nonzero_mask = numpy.invert(zero_mask)
        # matrix[zero_mask] += numpy.min(matrix[nonzero_mask])

        plot_directional_residuals(matrix)
        for normalize_matrices in (False, True):
            plot_base_matrices(matrix,
                               test_spot=False,
                               normalize_matrices=normalize_matrices)
Esempio n. 22
0
def main():
    """Align Runnie run-length reads to an RLE reference and print sample read segments.

    Steps, in order:
      1. Create a timestamped output directory under ``output/``.
      2. Load every read yielded by ``RunlengthHandler.iterate_file`` into a
         dict keyed by ``read.id``.
      3. Run-length encode the reference FASTA and align the reads against it
         via ``align_as_RLE``, which returns the path handed to ``BamHandler``.
      4. Fetch read segments for a 100-position window on the first contig and
         print the first 10 entries of the sequence, scale and shape data for
         each key returned by ``get_read_segments``.
    """
    # Alternative (synthetic) inputs:
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/test/rad2_pass_runnie_4_5_6_7.out"

    # Timestamped directory so repeated runs never overwrite each other.
    run_dir_name = "runlength_matrix_from_runnie_output_" + FileManager.get_datetime_string()
    output_dir = os.path.join("output/", run_dir_name)
    FileManager.ensure_directory_exists(output_dir)

    # Output FASTA names: input basename minus its final extension, plus "_rle.fasta".
    ref_stem = os.path.basename(ref_fasta_path).rpartition(".")[0]
    runlength_ref_fasta_path = os.path.join(output_dir, ref_stem + "_rle.fasta")

    read_stem = os.path.basename(runlength_path).rpartition(".")[0]
    runlength_read_fasta_path = os.path.join(output_dir, read_stem + "_rle.fasta")

    handler = RunlengthHandler(runlength_path)

    # Index all reads by id; a later read with a repeated id replaces the earlier one.
    read_data = {
        read.id: read
        for read in handler.iterate_file(sequence_cutoff=sys.maxsize,
                                         print_status=True)
    }

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(
        runlength_reference_path=runlength_ref_fasta_path,
        runlength_ref_sequences=runlength_ref_sequences,
        runlength_read_path=runlength_read_fasta_path,
        runlength_read_sequences=read_data,
        output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the RLE reference is inspected.
    chromosome_name = fasta_handler.get_contig_names()[0]
    # The length is not used below; the lookup is kept so the call sequence is unchanged.
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    pileup_start = 100000
    pileup_end = pileup_start + 100

    sequences, scales, shapes = get_read_segments(
        fasta_handler=fasta_handler,
        bam_handler=bam_handler,
        chromosome_name=chromosome_name,
        pileup_start=pileup_start,
        pileup_end=pileup_end,
        runlength_ref_sequences=runlength_ref_sequences,
        read_data=read_data)

    # Dump a short prefix of each segment for eyeballing.
    for key in sequences:
        print(key)
        print(sequences[key][:10])
        print(scales[key][:10])
        print(shapes[key][:10])
Esempio n. 23
0
def main(output_dir="data/"):
    """Write a synthetic reference FASTA and a FASTA of synthetic reads.

    The reference is built from ``n_repeats`` rounds. In each round every base
    in ``base_pool`` is emitted once at each run length from
    ``ref_max_runlength`` down to 1, with the base order shuffled so that two
    consecutive runs never share a base. Each read repeats the reference runs
    with a fixed per-base length offset (A:+0, T:+1, G:+2, C:+3); when
    ``reverse_complement`` is set, the result of
    ``complement_sequence(..., reverse=True)`` on that read is written as well.

    Args:
        output_dir: Directory receiving ``<prefix>_ref.fasta`` and
            ``<prefix>_reads.fasta``. It is created if it does not exist.
    """
    filename_prefix = "synthetic_runlength_test_" + FileManager.get_datetime_string()
    runlength_reference_path = os.path.join(output_dir, filename_prefix + "_ref.fasta")
    runlength_reads_path = os.path.join(output_dir, filename_prefix + "_reads.fasta")

    # Without this, open(..., "w") below raises FileNotFoundError on a fresh
    # checkout. An empty output_dir means "current directory": nothing to create.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    reverse_complement = True

    n_repeats = 12
    coverage = 12

    ref_max_runlength = 8

    base_pool = ["A", "T", "G", "C"]

    base_length_offsets = {"A": 0, "T": 1, "G": 2, "C": 3}

    # Parallel lists describing the reference as (base, run length) pairs.
    ref_lengths = list()
    ref_bases = list()

    ref_sequence_name = "synthetic_ref_0"
    for _ in range(n_repeats):
        # Each base gets every length 1..ref_max_runlength once per repeat;
        # pop() takes them from the end, i.e. longest first.
        ref_runlengths = {b: list(range(1, ref_max_runlength + 1)) for b in base_pool}

        for _ in range(ref_max_runlength):
            bases = list(base_pool)
            random.shuffle(bases)

            # Reshuffle until the first base differs from the previous run's
            # base, so adjacent runs never merge into one longer run.
            if ref_bases:
                while bases[0] == ref_bases[-1]:
                    random.shuffle(bases)

            for base in bases:
                ref_lengths.append(ref_runlengths[base].pop())
                ref_bases.append(base)

    ref_sequence = "".join(
        base * length for base, length in zip(ref_bases, ref_lengths))

    # Every read is the same deterministic distortion of the reference, so
    # build it once instead of once per coverage iteration.
    read_sequence = "".join(
        base * (length + base_length_offsets[base])
        for base, length in zip(ref_bases, ref_lengths))

    read_output_lines = list()
    for c in range(coverage):
        read_output_lines.append(">synthetic_read_%d" % c)
        read_output_lines.append(read_sequence)

        if reverse_complement:
            read_output_lines.append(">synthetic_read_reverse_%d" % c)
            reversed_read = complement_sequence(sequence=read_sequence, reverse=True)
            read_output_lines.append("".join(reversed_read))

    print("saving file:", runlength_reference_path)
    with open(runlength_reference_path, "w") as file:
        file.write(">" + ref_sequence_name + "\n")
        file.write(ref_sequence + "\n")

    print("saving file:", runlength_reads_path)
    with open(runlength_reads_path, "w") as file:
        for line in read_output_lines:
            file.write(line + "\n")
Esempio n. 24
0
def main():
    """Run ``select_windows`` in parallel over fixed-size windows of each contig.

    For every contig in the reference (except ``NC_001328.1``), the full
    contig sequence is loaded, the region from 1 Mbp after the start to 1 Mbp
    before the end is split into ``window_size`` chunks via ``chunk_region``,
    and the chunks are dispatched to ``select_windows`` through process pools
    of ``max_threads`` workers. A manager-backed integer ``counter`` is passed
    to the workers together with the total chunk count.
    """
    # output_root_dir = "output/"
    # instance_dir = "spoa_pileup_generation_" + get_current_timestamp()
    # output_dir = os.path.join(output_root_dir, instance_dir)

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.filtered2820.bam"
    # reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"

    # ---- Nanopore GUPPY - E. Coli - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli_reads_vs_ref.bam"
    reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"

    # -------------------------------------------------------------------------

    # Tuning constants; identical for every contig, so defined once.
    max_threads = 30
    window_size = 10000
    min_size = 20
    max_size = 80

    fasta_handler = FastaHandler(reference_file_path)
    try:
        contig_names = fasta_handler.get_contig_names()
    finally:
        fasta_handler.close()

    # chromosome_name = "NC_003279.8"     # celegans chr1
    # chromosome_name = "NC_003283.11"     # celegans chr5

    for chromosome_name in contig_names:
        if chromosome_name == "NC_001328.1":    # mitochondrial
            continue

        print("STARTING:", chromosome_name)

        # Re-open the reference per contig; close it even if a read fails.
        fasta_handler = FastaHandler(reference_file_path)
        try:
            chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
            reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                            start=0,
                                                            stop=chromosome_length)
        finally:
            fasta_handler.close()

        # Skip the first and last 1 Mbp of the contig.
        region = [0 + 1000000, chromosome_length - 1000000]

        region_windows = chunk_region(region=region, size=window_size)
        n_chunks = len(region_windows)

        print("subregions: ", n_chunks)

        output_dir = "output/window_selection/" + "_".join([
            str(chromosome_name),
            str(region[0]),
            str(region[1]),
            FileManager.get_datetime_string(),
        ])
        print(output_dir)

        # The manager starts a server process; scope it so that process is
        # shut down deterministically once this contig's pools have finished.
        with multiprocessing.Manager() as manager:
            counter = manager.Value('i', 0)

            pooled_args = generate_argument_pools(pool_size=max_threads,
                                                  bam_file_path=bam_file_path,
                                                  chromosome_name=chromosome_name,
                                                  region_windows=region_windows,
                                                  reference_sequence=reference_sequence,
                                                  min_size=min_size,
                                                  max_size=max_size,
                                                  output_dir=output_dir,
                                                  counter=counter,
                                                  n_chunks=n_chunks)

            for arg_pool in pooled_args:
                # Free memory from the previous batch before forking new workers.
                gc.collect()
                with Pool(processes=max_threads) as pool:
                    pool.starmap(select_windows, arg_pool)

    print()