Esempio n. 1
0
def main():
    """
    Make a synthetic reference and a set of reads, write the per-read
    observations to a timestamped TSV, and write the reference sequence to a
    timestamped single-record FASTA, both under the "data/" directory.
    :return:
    """
    output_dir = "data/"
    FileManager.ensure_directory_exists(output_dir)

    n_coverage = 2

    ref_max_runlength = 50
    read_max_runlength = 50

    ref_sequence, observations = generate_sequences(
        ref_max_runlength=ref_max_runlength,
        read_max_runlength=read_max_runlength,
        n_coverage=n_coverage,
        scale_coverage=True)

    datetime_string = FileManager.get_datetime_string()
    filename = "synthetic_coverage_data_marginpolish_" + datetime_string + ".tsv"
    output_path = os.path.join(output_dir, filename)

    # Context manager guarantees the handle is closed even if a write fails
    with open(output_path, "w") as file:
        writer = csv.writer(file, delimiter="\t")
        for line in observations:
            writer.writerow(line)

    filename = "synthetic_coverage_data_marginpolish_" + datetime_string + "_ref.fasta"
    output_path = os.path.join(output_dir, filename)

    with open(output_path, "w") as file:
        file.write(">ref_0\n")
        file.write(ref_sequence)
Esempio n. 2
0
def run_batch_training_from_tuples():
    """
    Collect every pickled training-tuple file under the configured
    directories, count the observations, train a distribution from the
    counts, and pickle the trained distribution to the output directory.
    """
    chr_paths = ["output/joint_runlength_base_model/2018_11_12_14_23_56_638745/"]

    trainer = JointClassifierTrainer()

    # Gather all .pkl tuple files below each configured directory
    all_file_paths = list()
    for path in chr_paths:
        file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=path, file_extension=".pkl")
        all_file_paths.extend(file_paths)

    counts = trainer.get_counts_from_tuples(paths=all_file_paths)

    distribution = trainer.train_model(counts)

    distribution_output_dir = "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/distribution/"
    distribution_filename = "distribution_" + FileManager.get_datetime_string()

    print("\nSAVING: ", os.path.join(distribution_output_dir, distribution_filename))

    FileManager.save_object_pickle(object=distribution, filename=distribution_filename, output_dir=distribution_output_dir)
Esempio n. 3
0
def plot_kernel_distribution(pdf,
                             cdf,
                             bins,
                             save=False,
                             output_dir=None,
                             filename=None):
    """
    Render the kernel CDF as a line plot and the PDF as a bar chart on two
    stacked axes, then either save the figure as
    "<filename>_distributions.png" in output_dir or show it interactively.
    """
    step_count = 100
    bar_width = float(1.0 / step_count)
    # Midpoints of the histogram bins, shifted left by half a bar width
    bin_centers = (bins[:-1] + bins[1:]) / 2 - bar_width / 2

    figure, axes = pyplot.subplots(nrows=2)
    axes[0].plot(cdf)
    axes[1].bar(bin_centers, pdf, width=bar_width, align="center")
    axes[1].set_ylabel("kernel sum")

    if not save:
        pyplot.show()
    else:
        FileManager.ensure_directory_exists(output_dir)
        figure_path = os.path.join(output_dir, filename + "_distributions.png")
        pyplot.savefig(figure_path)

    pyplot.close()
Esempio n. 4
0
def save_run_length_training_data(output_dir, pileup_matrix, reference_matrix,
                                  pileup_repeat_matrix,
                                  reference_repeat_matrix, reversal_matrix,
                                  chromosome_name, start):
    """
    Write one window of run-length training arrays to a compressed .npz
    file named "<chromosome>_<start>_matrix.npz" inside a per-chromosome
    subdirectory of output_dir.
    """
    # Per-chromosome subdirectory; create it only when missing
    chromosome_dir = os.path.join(output_dir, chromosome_name)
    if not os.path.exists(chromosome_dir):
        FileManager.ensure_directory_exists(chromosome_dir)

    # Window name is unique per contig + start coordinate
    window_name = chromosome_name + "_" + str(start)
    data_path = os.path.join(chromosome_dir, window_name) + "_matrix" + ".npz"

    # Bundle all five arrays into one compressed archive
    numpy.savez_compressed(data_path,
                           x_pileup=pileup_matrix,
                           y_pileup=reference_matrix,
                           x_repeat=pileup_repeat_matrix,
                           y_repeat=reference_repeat_matrix,
                           reversal=reversal_matrix)
Esempio n. 5
0
def main(summary_glob, output_dir, filter_decoys, args):
    """
    Aggregate alignment summary files matched by summary_glob, print
    identity statistics per file and for all data combined, and optionally
    plot per-file identity violins — comparing against a second set of
    summaries when args.comparison_glob is given.

    :param summary_glob: glob pattern matching the summary files to load
    :param output_dir: directory where plots are saved
    :param filter_decoys: if True, drop summaries from decoy chromosomes
    :param args: parsed CLI namespace; fields read here are sample, plot,
        comparison_glob and min_read_length (the last is mutated below)
    """
    FileManager.ensure_directory_exists(output_dir)

    summary_file_paths = glob.glob(summary_glob)
    if len(summary_file_paths) == 0:
        print("No files matched '{}'".format(summary_glob))
        sys.exit(1)

    if filter_decoys:
        print("Filtering decoy chromosomes")
        summary_file_paths = filter_decoys_from_paths(summary_file_paths)

    summary_headers, summary_data, identities, identities_per_file, read_lengths_per_file, read_len_to_identity = \
        aggregate_summary_data(summary_file_paths, args)

    # all_read_lengths = list()
    # for rli in read_len_to_identity:
    #     all_read_lengths.append(rli[0])
    # all_read_lengths.sort()
    # print("top 15 read lengths: {}".format(all_read_lengths[:-15]))

    # Print summary stats (presumably min/max/mean — confirm against mmm())
    # for each input file, then for the union of all files
    for file in identities_per_file.keys():
        mmm(identities_per_file[file], file)
    mmm(identities, "All Data")

    sample_name = args.sample
    if sample_name is None:
        # Derive a filesystem-safe sample name from the glob pattern
        sample_name = summary_glob.rstrip('/').replace('/', "_").replace(
            '*', "_")  # replace this with sample name extractor function?

    # plots
    if args.plot:
        pass

        # plot_identity_histogram(identities, title=sample_name, output_location=os.path.join(output_dir, "{}.all_identities.png".format(sample_name)))
        # plot_read_len_to_identity(read_len_to_identity, title=sample_name, output_base=os.path.join(output_dir, "{}.read_len_to_identity".format(sample_name)))
        # plot_per_file_identity_curve(identities_per_file, output_base=os.path.join(output_dir, sample_name))
        if args.comparison_glob is None:
            plot_per_file_identity_violin(identities_per_file,
                                          title=sample_name,
                                          output_base=os.path.join(
                                              output_dir, sample_name))
        else:
            comparison_paths = glob.glob(args.comparison_glob)
            if len(comparison_paths) == 0:
                raise Exception("No comparison files found for '{}'".format(
                    args.comparison_glob))

            #TODO only for rle experiment
            # NOTE(review): this mutates the caller's args namespace in place
            args.min_read_length *= 0.7
            _, _, _, comparison_identities_per_file, comparison_lengths_per_file, _ = aggregate_summary_data(
                comparison_paths, args)
            plot_identity_comparison_violin(identities_per_file,
                                            comparison_identities_per_file,
                                            read_lengths_per_file,
                                            comparison_lengths_per_file,
                                            title=sample_name,
                                            output_base=os.path.join(
                                                output_dir, sample_name))
Esempio n. 6
0
def write_windows_to_file(windows, output_dir, filename):
    """
    Pickle the windows object to "<filename>_windows.pkl" inside output_dir,
    creating the directory first if necessary.
    """
    FileManager.ensure_directory_exists(output_dir)

    destination = os.path.join(output_dir, filename + "_windows.pkl")

    with open(destination, 'wb') as output:
        pickle.dump(windows, output, pickle.HIGHEST_PROTOCOL)
Esempio n. 7
0
def save_model(output_directory, model):
    """
    Serialize the model's state dict to a timestamped "model_<time>" file
    inside output_directory, creating the directory if needed.
    """
    FileManager.ensure_directory_exists(output_directory)

    model_path = os.path.join(output_directory,
                              "model_" + get_timestamp_string())

    print("SAVING MODEL:", model_path)
    torch.save(model.state_dict(), model_path)
Esempio n. 8
0
def process_bam(bam_path, reference_path):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a plot of alignments
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :return:
    """
    print("\n" + bam_path + "\n")

    output_dir = "plots/"
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    # NOTE(review): hard-coded to a single ref contig named "gi"; any other
    # contigs present in the BAM/reference are ignored — confirm intent
    chromosome_names = ["gi"]

    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        # Fetch reads over the full contig span
        start = 0
        stop = chromosome_length

        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=start, stop=stop)

        read_data = parse_reads(reads=reads, fasta_handler=fasta_handler, chromosome_name=chromosome_name)

        print("chromosome_name:\t", chromosome_name)
        print("chromosome_length:\t", chromosome_length)
        # Dump per-read alignment stats; field order matches parse_reads output
        for data in read_data:
            read_id, reversal_status, ref_alignment_start, alignment_length, read_length, contig_length, n_initial_clipped_bases, n_total_mismatches, n_total_deletes, n_total_inserts, identity = data
            print()
            print(read_id)
            print("reversed:\t", reversal_status)
            print("alignment_start:\t", ref_alignment_start)
            print("alignment_length:\t", alignment_length)
            print("n_initial_clipped_bases:", n_initial_clipped_bases)
            print("n_total_mismatches:\t", n_total_mismatches)
            print("n_total_deletes:\t", n_total_deletes)
            print("n_total_inserts:\t", n_total_inserts)
            print("identity:\t", identity)

        # Overall identity = per-read identity weighted by alignment length
        total_weighted_identity = sum([x[ALIGNMENT_LENGTH] * x[IDENTITY] for x in read_data])
        total_alignment_bases = sum([x[ALIGNMENT_LENGTH] for x in read_data])
        total_identity = total_weighted_identity/total_alignment_bases

        print("\nTOTAL IDENTITY:\t", total_identity)

        plot_contigs(output_dir=output_dir,
                     read_data=read_data,
                     chromosome_name=chromosome_name,
                     chromosome_length=chromosome_length,
                     total_identity=total_identity,
                     bam_path=bam_path,
                     y_min=-1,
                     y_max=4,
                     show=False)
Esempio n. 9
0
def extract_runnie_reads_by_name(runnie_path, output_dir, output_filename_suffix, names):
    """
    Extract the runnie reads whose ids appear in `names` into a new
    "runnie_subset_<suffix>.out" file in output_dir and return its path.
    """
    subset_path = os.path.join(output_dir,
                               "runnie_subset_" + output_filename_suffix + ".out")
    FileManager.ensure_directory_exists(output_dir)

    handler = RunlengthHandler(runnie_path)
    handler.extract_reads_by_id(id_set=names, output_path=subset_path, print_status=True)

    return subset_path
Esempio n. 10
0
def extract_fastq_reads_by_name(fastq_path, output_dir, output_filename_suffix, names):
    """
    Extract the fastq reads whose ids appear in `names` into a new
    "sequence_subset_<suffix>.fastq" file in output_dir and return its path.
    """
    subset_path = os.path.join(output_dir,
                               "sequence_subset_" + output_filename_suffix + ".fastq")
    FileManager.ensure_directory_exists(output_dir)

    handler = FastqHandler(fastq_path)
    handler.extract_reads_by_id(id_set=names, output_path=subset_path, print_status=True)

    return subset_path
Esempio n. 11
0
def main(reads_file_path,
         true_ref_sequence_path=None,
         output_dir=None,
         n_passes=0):
    """
    Assemble reads with wtdbg2, then iteratively polish the assembly with
    racon for n_passes rounds, re-aligning reads to the latest polished
    reference each round. When a true reference is supplied, the assembly
    and each polished reference are also aligned to it for evaluation.

    :param reads_file_path: path to the input reads (fasta/fastq)
    :param true_ref_sequence_path: optional ground-truth reference to
        evaluate against
    :param output_dir: where outputs are written (defaults to cwd)
    :param n_passes: number of racon polishing rounds; 0 skips polishing
        (previously defaulted to False, which range() treats as 0 — an int
        default states the intent)
    """
    if output_dir is None:
        output_dir = "./"
    else:
        FileManager.ensure_directory_exists(output_dir)

    assembly_sequence_path = assemble_wtdbg2(output_dir=output_dir,
                                             input_file_path=reads_file_path)

    reads_vs_ref_sam_path, reads_vs_ref_bam_path = align_minimap(
        output_dir=output_dir,
        ref_sequence_path=assembly_sequence_path,
        reads_sequence_path=reads_file_path)

    # Evaluate the raw assembly against the true reference if one was given
    if true_ref_sequence_path is not None:
        assembled_vs_true_ref_sam_path, assembled_vs_true_ref_bam_path = align_minimap(
            output_dir=output_dir,
            ref_sequence_path=true_ref_sequence_path,
            reads_sequence_path=assembly_sequence_path)

    polished_ref_paths = list()

    for i in range(n_passes):
        # Each round writes into its own "<i+1>x" subdirectory
        suffix = str(i + 1) + "x"
        polish_output_dir = join(output_dir, suffix)
        FileManager.ensure_directory_exists(polish_output_dir)

        # Round 0 polishes the raw assembly; later rounds polish the
        # previous round's output
        if i == 0:
            ref_sequence_path = assembly_sequence_path
        else:
            ref_sequence_path = polished_ref_paths[i - 1]

        reads_vs_polished_ref_sam_path, reads_vs_polished_ref_bam_path = align_minimap(
            output_dir=polish_output_dir,
            ref_sequence_path=ref_sequence_path,
            reads_sequence_path=reads_file_path)

        repolished_ref_sequence_path = polish_racon(
            output_dir=polish_output_dir,
            reads_file_path=reads_file_path,
            reads_vs_ref_sam_path=reads_vs_polished_ref_sam_path,
            ref_sequence_path=ref_sequence_path,
            suffix=suffix)

        polished_ref_paths.append(repolished_ref_sequence_path)

        # Evaluate this round's polished reference against the truth
        if true_ref_sequence_path is not None:
            repolished_vs_true_ref_sam_path, repolished_vs_true_ref_bam_path = \
                align_minimap(output_dir=polish_output_dir,
                              ref_sequence_path=true_ref_sequence_path,
                              reads_sequence_path=repolished_ref_sequence_path)
Esempio n. 12
0
def train_joint_model_from_tuples(tuples_path):
    """
    Load training tuples from disk, fit the joint runlength/base model, and
    pickle the trained distribution to the output directory.
    """
    training_tuples = load_training_tuples(tuples_path, cutoff=16)

    print("training tuples loaded: ", len(training_tuples))

    distribution = train_model(data=training_tuples)

    output_dir = "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/distribution/"
    output_filename = "distribution_" + FileManager.get_datetime_string()

    FileManager.save_object_pickle(object=distribution, filename=output_filename, output_dir=output_dir)
Esempio n. 13
0
def main(ref_sequence_path, reads_sequence_path, minimap_preset, output_dir=None):
    """
    Align reads to a reference with minimap (using the given preset), then
    summarize the resulting BAM. Outputs default to the current directory.
    """
    if output_dir is None:
        output_dir = "./"
    else:
        FileManager.ensure_directory_exists(output_dir)

    sam_path, bam_path = align_minimap(output_dir=output_dir,
                                       ref_sequence_path=ref_sequence_path,
                                       reads_sequence_path=reads_sequence_path,
                                       preset=minimap_preset)

    process_bam(bam_path=bam_path, reference_path=ref_sequence_path, output_dir=output_dir)
Esempio n. 14
0
def process_bam(bam_path, reference_path, output_dir=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a plot of alignments
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: where to save plots
    :return:
    """
    print("\n" + bam_path)

    if output_dir is None:
        output_dir = "variants/"

    # Make a subdirectory to contain everything
    datetime_string = FileManager.get_datetime_string()
    output_subdirectory = "variants_" + datetime_string
    output_dir = os.path.join(output_dir, output_subdirectory)
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    # Sort ref contigs so output order is deterministic ("chr"-prefix aware)
    chromosome_names = fasta_handler.get_contig_names()
    chromosome_names = sort_chromosome_names(names=chromosome_names,
                                             prefix="chr")

    print("ref contig names:", chromosome_names)

    for chromosome_name in chromosome_names:
        print("Parsing alignments for ref contig:", chromosome_name)

        chromosome_length = fasta_handler.get_chr_sequence_length(
            chromosome_name)

        # Fetch reads over the full contig span
        start = 0
        stop = chromosome_length

        reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                      start=start,
                                      stop=stop)

        # Collect candidate variants (inserts/deletes/mismatches) per contig
        inserts, deletes, mismatches = parse_reads(
            reads=reads,
            fasta_handler=fasta_handler,
            chromosome_name=chromosome_name)

        export_variants_to_csv(output_dir=output_dir,
                               chromosome_name=chromosome_name,
                               mismatches=mismatches,
                               inserts=inserts,
                               deletes=deletes,
                               merge=True)
Esempio n. 15
0
def main():
    """
    Partition the read names shared by a runnie file and a fastq file into
    train/test subsets, extract each subset into its own files, and verify
    the extracted subsets by re-intersecting their names.
    """
    output_dir = "output/" + "read_names_" + FileManager.get_datetime_string()
    output_filename = "read_names.txt"
    output_path = os.path.join(output_dir, output_filename)
    FileManager.ensure_directory_exists(output_dir)

    # STEP 1
    # Find union of read names within runnie and fastq files
    fastq_path = "/home/ryan/data/Nanopore/ecoli/guppy/r94_ec_guppy_rad2.fastq"
    runnie_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_all.out"

    # name_intersection_path = find_intersection_of_runnie_and_fastq(output_path=output_path,
    #                                                                fastq_path=fastq_path,
    #                                                                runnie_path=runnie_path)

    # STEP 2
    # Split sequence names into train/test partition
    # NOTE(review): STEP 1 is commented out and a precomputed intersection
    # file is hard-coded here instead
    name_intersection_path = "/home/ryan/code/runlength_analysis/output/read_names_2019_3_26_11_50_guppy_runnie_intersection/read_names.txt"
    names = read_names_from_file(name_intersection_path)
    names_train, names_test = partition_names(names)

    # STEP 3
    # Extract names and write to files
    runnie_train_subset_path = extract_runnie_reads_by_name(runnie_path=runnie_path,
                                                            output_dir=output_dir,
                                                            output_filename_suffix="train",
                                                            names=names_train)

    fastq_train_subset_path = extract_fastq_reads_by_name(fastq_path=fastq_path,
                                                          output_dir=output_dir,
                                                          output_filename_suffix="train",
                                                          names=names_train)

    runnie_test_subset_path = extract_runnie_reads_by_name(runnie_path=runnie_path,
                                                           output_dir=output_dir,
                                                           output_filename_suffix="test",
                                                           names=names_test)

    fastq_test_subset_path = extract_fastq_reads_by_name(fastq_path=fastq_path,
                                                         output_dir=output_dir,
                                                         output_filename_suffix="test",
                                                         names=names_test)

    # STEP 4
    # Verify
    # NOTE(review): both calls write to the same output_path, so the second
    # overwrites the first — confirm whether separate paths were intended
    name_intersection_path = find_intersection_of_runnie_and_fastq(output_path=output_path,
                                                                   fastq_path=fastq_train_subset_path,
                                                                   runnie_path=runnie_train_subset_path)

    name_intersection_path = find_intersection_of_runnie_and_fastq(output_path=output_path,
                                                                   fastq_path=fastq_test_subset_path,
                                                                   runnie_path=runnie_test_subset_path)
Esempio n. 16
0
def save_numpy_matrix(output_dir, filename, matrix):
    """
    Save a single matrix to "<output_dir>/<filename>.npz" (stored under the
    key "a"), creating the directory first if it does not already exist.
    """
    # Create the output directory only when missing
    if not os.path.exists(output_dir):
        FileManager.ensure_directory_exists(output_dir)

    destination = os.path.join(output_dir, filename) + ".npz"

    # Single-array compressed archive, keyed "a"
    numpy.savez_compressed(destination, a=matrix)
Esempio n. 17
0
    def __init__(self):
        """
        Create a timestamped "training_<time>" subdirectory under "output/"
        and initialize the checkpoint counter.
        """
        # Timestamp fields (year through weekday/yearday) joined with dashes;
        # the final timetuple entry (isdst) is dropped.
        time_fields = datetime.datetime.now().timetuple()
        self.datetime_string = '-'.join(str(field) for field in list(time_fields)[:-1])
        self.subdirectory_name = "training_" + self.datetime_string

        self.output_directory_name = "output/"
        self.directory = path.join(self.output_directory_name,
                                   self.subdirectory_name)

        # Number of checkpoints saved so far
        self.n_checkpoints = 0

        FileManager.ensure_directory_exists(self.directory)
Esempio n. 18
0
def run_generate_tuples_from_pileups():
    """
    Build one training-data-generation job per pileup directory (data
    loader, consensus caller, gap filterer, output dir) and run the jobs in
    a multiprocessing pool.
    """
    max_threads = 6

    # NC_003279.8         Caenorhabditis elegans chromosome I
    # NC_003280.10     Caenorhabditis elegans chromosome II
    # NC_003281.10     Caenorhabditis elegans chromosome III
    # NC_003282.8         Caenorhabditis elegans chromosome IV
    # NC_003283.11    Caenorhabditis elegans chromosome V
    # NC_003284.9        Caenorhabditis elegans chromosome X
    # NC_001328.1        Caenorhabditis elegans mitochondrion, complete genome

    # data_path = ["/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003280.10",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003281.10",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003282.8",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003283.11",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003284.9"]

    data_path = ["/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-11-12-14-8-24-0-316/gi"]

    # Assemble one argument list per input directory
    args = list()
    for path in data_path:
        gap_filterer = GapFilterer()

        batch_size = 1

        file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=path, file_extension=".npz")

        data_loader = DataLoader(file_paths, batch_size=batch_size, parse_batches=False)

        consensus_caller = ConsensusCaller(sequence_to_index=sequence_to_index, sequence_to_float=sequence_to_float)

        # Each run writes to its own timestamped output directory
        output_dir = "output/joint_runlength_base_model/" + FileManager.get_datetime_string()

        # Last path component labels the output files
        filename_suffix = path.split("/")[-1]
        print(filename_suffix)

        args.append([data_loader, batch_size, consensus_caller, output_dir, filename_suffix, gap_filterer])

        # Drop the local reference and collect before building the next job
        gap_filterer = None

        gc.collect()

    # Never spawn more workers than there are jobs
    n_threads = min(len(args), max_threads)

    for arg in args:
        print(arg)
    print(n_threads)

    with Pool(processes=n_threads) as pool:
        pool.starmap(generate_training_data, args)
Esempio n. 19
0
def process_bam(bam_path, reference_path, bac_path, output_dir=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities/matches/mismatches/indels
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param bac_path: fasta of BAC sequences; only its contig names are read here
    :param output_dir: where to save stats
    :return:
    """
    if output_dir is None:
        output_dir = "stats/"

    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_handler = FastaHandler(reference_path)
    bac_fasta_handler = FastaHandler(bac_path)

    chromosome_names = ref_fasta_handler.get_contig_names()
    bac_names = bac_fasta_handler.get_contig_names()

    print(chromosome_names)
    print(bac_names)

    # Read records grouped by read id (presumably the BAC name — confirm
    # against parse_reads' first output field)
    data_per_bac = defaultdict(list)

    for chromosome_name in chromosome_names:
        chromosome_length = ref_fasta_handler.get_chr_sequence_length(
            chromosome_name)

        # Fetch reads over the full contig span
        start = 0
        stop = chromosome_length

        # Fresh handlers per contig
        ref_fasta_handler = FastaHandler(reference_file_path=reference_path)
        bam_handler = BamHandler(bam_file_path=bam_path)

        reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                      start=start,
                                      stop=stop)

        read_data = parse_reads(reads=reads,
                                fasta_handler=ref_fasta_handler,
                                chromosome_name=chromosome_name)

        # Group each record by its id, keeping the source contig name
        for data in read_data:
            data_per_bac[data[0]].append([chromosome_name] + data)

    # filtered_data = filter_supplementaries_by_largest(data_per_bac)
    filtered_data = aggregate_bac_data(data_per_bac)

    export_bac_data_to_csv(read_data=filtered_data,
                           output_dir=output_dir,
                           bam_path=bam_path)
Esempio n. 20
0
def save_directional_frequency_matrices_as_delimited_text(
        output_dir,
        frequency_matrices,
        delimiter=",",
        log_normalize=False,
        plot=False):
    """
    Write the 8 directional matrices (4 bases x forward/reverse strand) to a
    single delimited text file in output_dir, one ">BASE_SUFFIX" header per
    matrix followed by its rows.

    :param output_dir: directory the output file is written into
    :param frequency_matrices: array indexed as [reversal, base_index, :, :]
    :param delimiter: field separator between row values
    :param log_normalize: if True, normalize each matrix (pseudocount=15)
        and write floats; otherwise write raw integer counts
    :param plot: if True, display each matrix with pyplot before writing
    """
    if log_normalize:
        filename = "probability_matrices_directional_" + FileManager.get_datetime_string(
        ) + ".csv"
    else:
        filename = "frequency_matrices_directional_" + FileManager.get_datetime_string(
        ) + ".csv"

    reversal_suffixes = ["F", "R"]
    output_path = os.path.join(output_dir, filename)

    # Context manager ensures the file is closed even if a write fails
    with open(output_path, "w") as file:
        for reversal in [0, 1]:
            for base_index in range(4):
                base = INDEX_TO_BASE[base_index]
                suffix = reversal_suffixes[reversal]

                matrix = numpy.squeeze(frequency_matrices[reversal,
                                                          base_index, :, :])

                # Raw counts are written as ints; normalized values as floats.
                # ("cast" rather than "type" to avoid shadowing the builtin.)
                cast = int
                if log_normalize:
                    matrix = normalize(matrix, pseudocount=15)
                    cast = float

                if plot:
                    pyplot.imshow(matrix)
                    pyplot.show()
                    pyplot.close()

                matrix_name = "_".join([base, suffix])
                header = ">" + matrix_name + "\n"

                file.write(header)
                for r in range(matrix.shape[0]):
                    row = [str(cast(x)) for x in matrix[r]]

                    row = delimiter.join(row) + "\n"

                    file.write(row)

                file.write("\n")
Esempio n. 21
0
def process_bam(bam_path,
                reference_path,
                output_dir=None,
                centromere_table_path=None,
                gap_table_path=None,
                segdup_table_path=None,
                max_threads=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a plot of alignments
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: where to save plots
    :return:
    """
    print("\n" + bam_path)

    # Leave two cores free by default, but always use at least one worker
    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    if output_dir is None:
        output_dir = "plots/"

    # Shared list so worker processes can append their per-contig results
    process_manager = Manager()
    genome_data = process_manager.list()

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_path)

    chromosome_names = fasta_handler.get_contig_names()

    # One job per ref contig
    arguments = list()

    for chromosome_name in chromosome_names:
        arguments.append([
            bam_path, reference_path, chromosome_name, output_dir,
            centromere_table_path, gap_table_path, segdup_table_path,
            genome_data
        ])

    # Never spawn more workers than there are jobs
    if len(arguments) < max_threads:
        max_threads = len(arguments)

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_chromosome_data, arguments)

    export_genome_summary_to_csv(bam_path=bam_path,
                                 output_dir=output_dir,
                                 genome_data=genome_data)
Esempio n. 22
0
def process_bam(bam_path, reference_path, max_threads, output_dir=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities/matches/mismatches/indels
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param max_threads: worker process cap; None means cpu_count() - 2
    :param output_dir: where to save stats
    :return:
    """
    if output_dir is None:
        output_dir = "stats/"

    # Leave two cores free by default, but always use at least one worker
    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    # Shared list so worker processes can append their per-contig stats
    process_manager = Manager()
    genome_data = process_manager.list()

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_path)

    chromosome_names = fasta_handler.get_contig_names()

    # One job per ref contig, spanning the full contig
    arguments = list()

    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(
            chromosome_name)

        start = 0
        stop = chromosome_length

        arguments.append([
            genome_data, reference_path, chromosome_name, start, stop,
            output_dir, bam_path
        ])

    # Never spawn more workers than there are jobs
    if len(arguments) < max_threads:
        print("Fewer jobs than threads")
        max_threads = len(arguments)

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_chromosome_stats, arguments)

    print("genome_data", genome_data)

    export_genome_summary_to_csv(bam_path=bam_path,
                                 output_dir=output_dir,
                                 genome_data=genome_data)
Esempio n. 23
0
def main(max_threads=None):
    """
    Convert a runnie runlength file to per-read fasta files in parallel,
    then concatenate them into one fasta and delete the per-read files.

    :param max_threads: worker process cap; None means cpu_count() - 2
    """
    # runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_runnie_0.out"
    runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_runnie_0_1_10_11_12_13.out"

    # Timestamped output directory under the version-comparison parent
    output_parent_dir = "output/version_comparison/mode/"
    output_dir = "runlength_matrix_from_assembly_contigs_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    handler = RunlengthHandler(runlength_path)

    # Leave two cores free by default, but always use at least one worker
    if max_threads is None:
        max_threads = max(1, multiprocessing.cpu_count()-2)

    # Progress counter printed in place as reads complete
    with multiprocessing.Pool(processes=max_threads) as pool:
        for r,read_id in enumerate(pool.imap(arg_unpacker, arg_iterator(handler=handler, output_dir=output_dir))):
            sys.stdout.write("\r%d" % r)
    print()

    print("Concatenating files...")
    output_file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=output_dir, file_extension=".fasta")

    # Output filename mirrors the input runlength file's base name
    concatenated_filename = os.path.basename(runlength_path).split(".")[0] + ".fasta"
    concatenated_file_path = os.path.join(output_dir, concatenated_filename)

    print("Saving to file: %s" % concatenated_file_path)

    # Merge the per-read fastas, then remove the intermediates
    FileManager.concatenate_files(file_paths=output_file_paths, output_file_path=concatenated_file_path)
    FileManager.delete_files(output_file_paths)
Esempio n. 24
0
def main(reads_file_path, genome_size=None, output_dir=None):
    """
    Assemble the given reads with wtdbg2. The genome size defaults to human
    ("3g") with a warning when not specified; output defaults to the cwd.
    """
    if output_dir is None:
        output_dir = "./"
    else:
        FileManager.ensure_directory_exists(output_dir)

    if genome_size is None:
        genome_size = "3g"
        print("WARNING: genome size flag not specified, defaulting to human size (3g)")

    assembly_sequence_path = assemble_wtdbg2(output_dir=output_dir,
                                             input_file_path=reads_file_path,
                                             genome_size=genome_size)
Esempio n. 25
0
def run():
    """
    Evaluate consensus calling with gap filtering over a directory of
    pileup matrices (.npz files).
    """
    # directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8"  # one-hot with anchors and reversal matrix chr1 celegans
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-11-12-14-8-24-0-316/gi"  # one-hot with anchors and reversal matrix E. Coli

    file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=directory,
                                                        file_extension=".npz",
                                                        sort=False)

    # Training parameters
    batch_size_train = 1
    n_batches = 1000

    data_loader = DataLoader(file_paths=file_paths,
                             batch_size=batch_size_train,
                             parse_batches=False)

    gap_filterer = GapFilterer(threshold=0.003)
    consensus_caller = ConsensusCaller(sequence_to_index, sequence_to_float)

    print(len(data_loader))

    test_consensus(consensus_caller=consensus_caller,
                   data_loader=data_loader,
                   n_batches=n_batches,
                   gap_filterer=gap_filterer,
                   plot_mismatches=False)
def get_all_aligned_lengths(bam_path, recursive=False):
    """
    Collect aligned contig lengths from one BAM file, or from every BAM
    file found under a directory, in parallel.

    :param bam_path: path to a single .bam file, or a directory to search
    :param recursive: when bam_path is a directory, also search subdirectories
    :return: a Manager dict populated by get_aligned_contig_lengths workers
    """
    if os.path.isdir(bam_path):
        bam_paths = FileManager.get_all_file_paths_by_type(
            parent_directory_path=bam_path,
            file_extension=".bam",
            recursive=recursive)

        print(bam_paths)
    else:
        bam_paths = [bam_path]

    manager = Manager()
    assembly_contigs = manager.dict()

    arguments = [[path, assembly_contigs] for path in bam_paths]

    # No BAMs found: bail out early; otherwise max_threads would be clamped
    # to 0 below and Pool(processes=0) raises ValueError.
    if not arguments:
        return assembly_contigs

    max_threads = max(1, cpu_count() - 2)

    if len(arguments) < max_threads:
        print("Fewer jobs than threads")
        max_threads = len(arguments)

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_aligned_contig_lengths, arguments)

    return assembly_contigs
Esempio n. 27
0
def run():
    """
    Exercise the gap filter (loaded from a saved model checkpoint) against
    a directory of pileup matrices.
    """
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8"  # one-hot with anchors and reversal matrix Chr1 filtered 2820
    model_state_path = "output/training_2018-10-17-15-1-39-2-290/model_checkpoint_9"

    file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=directory,
                                                        file_extension=".npz",
                                                        sort=False)

    # Training parameters
    batch_size_train = 1
    n_batches = 1000
    threshold = 0.005

    data_loader = DataLoader(file_paths=file_paths,
                             batch_size=batch_size_train,
                             parse_batches=False,
                             convert_to_distributions=False,
                             use_gpu=False)

    gap_filterer = GapFilterer(model_state_path=model_state_path,
                               threshold=threshold)

    test_filter(gap_filterer=gap_filterer,
                data_loader=data_loader,
                n_batches=n_batches)
Esempio n. 28
0
def run():
    """
    Evaluate consensus calling (with a default-configured gap filter)
    over a directory of pileup matrices.
    """
    # directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-2-10-43-22-1-275/NC_003282.8"  # one-hot with anchors and reversal matrix Chr4
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8"  # one hot with anchors and reversal matrix chr1

    file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=directory,
                                                        file_extension=".npz",
                                                        sort=False)

    # Training parameters
    batch_size_train = 1
    checkpoint_interval = 300
    n_batches = 1000

    data_loader = DataLoader(file_paths=file_paths,
                             batch_size=batch_size_train,
                             parse_batches=False)

    consensus_caller = ConsensusCaller(sequence_to_index, sequence_to_float)
    gap_filterer = GapFilterer()

    print(len(data_loader))

    test_consensus(consensus_caller=consensus_caller,
                   data_loader=data_loader,
                   plot_mismatches=False,
                   gap_filterer=gap_filterer,
                   n_batches=n_batches)
Esempio n. 29
0
def get_contig_lengths_from_phaseblock_csvs(parent_directory):
    """
    Parse phase-block CSVs found under parent_directory and return, per
    file, its block (name, length) pairs sorted by length (descending).

    Each CSV row is expected to carry the block name in column 1 and its
    start/stop coordinates in the last two columns.

    :param parent_directory: directory to search for csv files
    :return: dict mapping csv path -> list of (name, length) tuples
    """
    print(parent_directory)
    paths = FileManager.get_all_file_paths_by_type(
        parent_directory_path=parent_directory, file_extension="csv")
    print(paths)

    assembly_contigs = dict()

    for path in paths:
        # Reset per file: previously these lists were shared across all
        # paths, so each file's result accumulated every prior file's rows.
        names = list()
        lengths = list()

        with open(path, "r") as file:
            for l, line in enumerate(file):
                print(line)

                # Iterated lines keep their newline, so len(line) == 0 never
                # triggered; test the stripped text to skip blank lines.
                if not line.strip():
                    continue

                items = line.strip().split(",")
                length = int(items[-1]) - int(items[-2])

                print(length)
                lengths.append(length)
                names.append(items[1])

        contigs = sorted(zip(names, lengths), key=lambda x: x[1], reverse=True)
        assembly_contigs[path] = contigs

    return assembly_contigs
Esempio n. 30
0
def plot_kernels_and_column_frequencies(kernel_sums,
                                        passing_indices,
                                        column_frequencies,
                                        slice_range=None,
                                        save=False,
                                        output_dir=None,
                                        filename=None):
    """
    Render thresholded indices, convolution sums, and column frequencies as
    three stacked image rows sharing an x axis; show the figure or save it.

    :param kernel_sums: 2D array of convolution sums
    :param passing_indices: 2D array of thresholded (passing) positions
    :param column_frequencies: 2D array of per-column frequencies
    :param slice_range: optional (start, stop) column window to display
    :param save: when True, write the figure to output_dir instead of showing
    :param output_dir: destination directory, required when save is True
    :param filename: base filename; "_kernels.png" is appended
    """
    if slice_range is not None:
        kernel_sums = kernel_sums[:, slice_range[0]:slice_range[1]]
        passing_indices = passing_indices[:, slice_range[0]:slice_range[1]]
        column_frequencies = column_frequencies[:,
                                                slice_range[0]:slice_range[1]]

        # NOTE(review): the original code called .reshape(...) here and
        # discarded the results (reshape returns a new array; it does not
        # modify in place), so the calls had no effect. Removed as dead code;
        # the sliced arrays are already 2D.

    fig, axes = pyplot.subplots(nrows=3, sharex=True)
    fig.set_size_inches(16, 4)

    # Top-to-bottom: thresholded mask, raw convolution, base frequencies
    axes[0].imshow(passing_indices)
    axes[1].imshow(kernel_sums)
    axes[2].imshow(column_frequencies)

    axes[0].set_ylabel("Thresholded")
    axes[1].set_ylabel("Convolution")
    axes[2].set_ylabel("Frequencies")

    # Rows are categorical, so hide y ticks entirely
    for axis in axes:
        axis.set_yticklabels([])
        axis.set_yticks([])

    if save:
        FileManager.ensure_directory_exists(output_dir)
        filename = filename + "_kernels.png"
        path = os.path.join(output_dir, filename)
        pyplot.savefig(path)

    else:
        pyplot.show()

    pyplot.close()