def main():
    """
    Make a synthetic reference and a set of reads, then save the observations as a TSV file
    and the reference as a fasta file under data/
    :return:
    """
    output_dir = "data/"
    FileManager.ensure_directory_exists(output_dir)

    n_coverage = 2
    ref_max_runlength = 50
    read_max_runlength = 50

    ref_sequence, observations = generate_sequences(ref_max_runlength=ref_max_runlength,
                                                    read_max_runlength=read_max_runlength,
                                                    n_coverage=n_coverage,
                                                    scale_coverage=True)

    datetime_string = FileManager.get_datetime_string()

    # Write one observation per row, tab-delimited
    filename = "synthetic_coverage_data_marginpolish_" + datetime_string + ".tsv"
    output_path = os.path.join(output_dir, filename)

    with open(output_path, "w") as file:
        writer = csv.writer(file, delimiter="\t")
        for line in observations:
            writer.writerow(line)

    # Write the reference sequence as a single-entry fasta
    filename = "synthetic_coverage_data_marginpolish_" + datetime_string + "_ref.fasta"
    output_path = os.path.join(output_dir, filename)

    with open(output_path, "w") as file:
        file.write(">ref_0\n")
        file.write(ref_sequence)
def run_batch_training_from_tuples():
    # chr_paths = ["/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_40_980920/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_42_138805/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_43_176010/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_44_574894/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_46_366545/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_47_822627/"]

    chr_paths = ["output/joint_runlength_base_model/2018_11_12_14_23_56_638745/"]

    trainer = JointClassifierTrainer()

    all_file_paths = list()
    for path in chr_paths:
        file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=path, file_extension=".pkl")
        all_file_paths.extend(file_paths)

    counts = trainer.get_counts_from_tuples(paths=all_file_paths)
    distribution = trainer.train_model(counts)

    distribution_output_dir = "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/distribution/"
    distribution_filename = "distribution_" + FileManager.get_datetime_string()

    print("\nSAVING: ", os.path.join(distribution_output_dir, distribution_filename))

    FileManager.save_object_pickle(object=distribution,
                                   filename=distribution_filename,
                                   output_dir=distribution_output_dir)
def plot_kernel_distribution(pdf, cdf, bins, save=False, output_dir=None, filename=None):
    n_steps = 100
    step = float(1.0 / n_steps)

    center = (bins[:-1] + bins[1:]) / 2 - step / 2

    fig, axes = pyplot.subplots(nrows=2)
    axes[0].plot(cdf)
    axes[1].bar(center, pdf, width=step, align="center")
    axes[1].set_ylabel("kernel sum")

    if save:
        FileManager.ensure_directory_exists(output_dir)
        filename = filename + "_distributions.png"
        path = os.path.join(output_dir, filename)
        pyplot.savefig(path)
    else:
        pyplot.show()

    pyplot.close()
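# Hedged usage sketch (assumption, not in the original source): builds a pdf/cdf
# from synthetic kernel sums with numpy.histogram and hands them to
# plot_kernel_distribution. The data and bin layout here are illustrative.
def example_plot_kernel_distribution():
    kernel_sums = numpy.random.rand(10000)          # placeholder data in [0, 1)
    pdf, bins = numpy.histogram(kernel_sums, bins=100, range=(0, 1))
    pdf = pdf / pdf.sum()                           # normalize counts to a discrete pdf
    cdf = numpy.cumsum(pdf)
    plot_kernel_distribution(pdf=pdf, cdf=cdf, bins=bins, save=False)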
def save_run_length_training_data(output_dir, pileup_matrix, reference_matrix, pileup_repeat_matrix,
                                  reference_repeat_matrix, reversal_matrix, chromosome_name, start):
    array_file_extension = ".npz"

    # ensure chromosomal directory exists
    chromosomal_output_dir = os.path.join(output_dir, chromosome_name)
    if not os.path.exists(chromosomal_output_dir):
        FileManager.ensure_directory_exists(chromosomal_output_dir)

    # generate unique filename and path
    filename = chromosome_name + "_" + str(start)
    output_path_prefix = os.path.join(chromosomal_output_dir, filename)
    data_path = output_path_prefix + "_matrix" + array_file_extension

    # write numpy arrays
    numpy.savez_compressed(data_path,
                           x_pileup=pileup_matrix,
                           y_pileup=reference_matrix,
                           x_repeat=pileup_repeat_matrix,
                           y_repeat=reference_repeat_matrix,
                           reversal=reversal_matrix)
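# Hedged sketch (assumption, not in the original source): the inverse of
# save_run_length_training_data, reading the five arrays back by the savez keys
# used above.
def load_run_length_training_data(data_path):
    data = numpy.load(data_path)
    return data["x_pileup"], data["y_pileup"], data["x_repeat"], data["y_repeat"], data["reversal"]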
def main(summary_glob, output_dir, filter_decoys, args):
    FileManager.ensure_directory_exists(output_dir)

    summary_file_paths = glob.glob(summary_glob)
    if len(summary_file_paths) == 0:
        print("No files matched '{}'".format(summary_glob))
        sys.exit(1)

    if filter_decoys:
        print("Filtering decoy chromosomes")
        summary_file_paths = filter_decoys_from_paths(summary_file_paths)

    summary_headers, summary_data, identities, identities_per_file, read_lengths_per_file, read_len_to_identity = \
        aggregate_summary_data(summary_file_paths, args)

    # all_read_lengths = list()
    # for rli in read_len_to_identity:
    #     all_read_lengths.append(rli[0])
    # all_read_lengths.sort()
    # print("top 15 read lengths: {}".format(all_read_lengths[:-15]))

    for file in identities_per_file.keys():
        mmm(identities_per_file[file], file)
    mmm(identities, "All Data")

    sample_name = args.sample
    if sample_name is None:
        # replace this with a sample name extractor function?
        sample_name = summary_glob.rstrip('/').replace('/', "_").replace('*', "_")

    # plots
    if args.plot:
        pass
        # plot_identity_histogram(identities, title=sample_name,
        #                         output_location=os.path.join(output_dir, "{}.all_identities.png".format(sample_name)))
        # plot_read_len_to_identity(read_len_to_identity, title=sample_name,
        #                           output_base=os.path.join(output_dir, "{}.read_len_to_identity".format(sample_name)))
        # plot_per_file_identity_curve(identities_per_file, output_base=os.path.join(output_dir, sample_name))

    if args.comparison_glob is None:
        plot_per_file_identity_violin(identities_per_file,
                                      title=sample_name,
                                      output_base=os.path.join(output_dir, sample_name))
    else:
        comparison_paths = glob.glob(args.comparison_glob)
        if len(comparison_paths) == 0:
            raise Exception("No comparison files found for '{}'".format(args.comparison_glob))

        # TODO: only for the RLE experiment
        args.min_read_length *= 0.7

        _, _, _, comparison_identities_per_file, comparison_lengths_per_file, _ = \
            aggregate_summary_data(comparison_paths, args)

        plot_identity_comparison_violin(identities_per_file,
                                        comparison_identities_per_file,
                                        read_lengths_per_file,
                                        comparison_lengths_per_file,
                                        title=sample_name,
                                        output_base=os.path.join(output_dir, sample_name))
def write_windows_to_file(windows, output_dir, filename):
    FileManager.ensure_directory_exists(output_dir)

    filename = filename + "_windows.pkl"
    path = os.path.join(output_dir, filename)

    with open(path, 'wb') as output:
        pickle.dump(windows, output, pickle.HIGHEST_PROTOCOL)
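# Hedged sketch (assumption, not in the original source): reading back a windows
# pickle written by write_windows_to_file; the path follows the naming scheme above.
def read_windows_from_file(path):
    with open(path, 'rb') as input_file:
        return pickle.load(input_file)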
def save_model(output_directory, model):
    FileManager.ensure_directory_exists(output_directory)

    timestamp = get_timestamp_string()
    filename = "model_" + timestamp
    path = os.path.join(output_directory, filename)

    print("SAVING MODEL:", path)

    torch.save(model.state_dict(), path)
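# Hedged sketch (assumption, not in the original source): restoring a checkpoint
# written by save_model. The caller must construct the same architecture first;
# `model` is any torch.nn.Module matching the saved state dict.
def load_model(path, model):
    model.load_state_dict(torch.load(path))
    model.eval()    # inference mode; omit this line when resuming training
    return model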
def process_bam(bam_path, reference_path):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a plot of alignments
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :return:
    """
    print("\n" + bam_path + "\n")

    output_dir = "plots/"
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = ["gi"]

    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        start = 0
        stop = chromosome_length

        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=start, stop=stop)

        read_data = parse_reads(reads=reads, fasta_handler=fasta_handler, chromosome_name=chromosome_name)

        print("chromosome_name:\t", chromosome_name)
        print("chromosome_length:\t", chromosome_length)

        for data in read_data:
            read_id, reversal_status, ref_alignment_start, alignment_length, read_length, contig_length, \
                n_initial_clipped_bases, n_total_mismatches, n_total_deletes, n_total_inserts, identity = data

            print()
            print(read_id)
            print("reversed:\t", reversal_status)
            print("alignment_start:\t", ref_alignment_start)
            print("alignment_length:\t", alignment_length)
            print("n_initial_clipped_bases:", n_initial_clipped_bases)
            print("n_total_mismatches:\t", n_total_mismatches)
            print("n_total_deletes:\t", n_total_deletes)
            print("n_total_inserts:\t", n_total_inserts)
            print("identity:\t", identity)

        # alignment-length-weighted mean identity across all reads
        total_weighted_identity = sum([x[ALIGNMENT_LENGTH] * x[IDENTITY] for x in read_data])
        total_alignment_bases = sum([x[ALIGNMENT_LENGTH] for x in read_data])
        total_identity = total_weighted_identity / total_alignment_bases

        print("\nTOTAL IDENTITY:\t", total_identity)

        plot_contigs(output_dir=output_dir,
                     read_data=read_data,
                     chromosome_name=chromosome_name,
                     chromosome_length=chromosome_length,
                     total_identity=total_identity,
                     bam_path=bam_path,
                     y_min=-1,
                     y_max=4,
                     show=False)
def extract_runnie_reads_by_name(runnie_path, output_dir, output_filename_suffix, names):
    output_filename = "runnie_subset_" + output_filename_suffix + ".out"
    output_path = os.path.join(output_dir, output_filename)

    FileManager.ensure_directory_exists(output_dir)

    runnie_handler = RunlengthHandler(runnie_path)

    runnie_handler.extract_reads_by_id(id_set=names, output_path=output_path, print_status=True)

    return output_path
def extract_fastq_reads_by_name(fastq_path, output_dir, output_filename_suffix, names):
    output_filename = "sequence_subset_" + output_filename_suffix + ".fastq"
    output_path = os.path.join(output_dir, output_filename)

    FileManager.ensure_directory_exists(output_dir)

    fastq_handler = FastqHandler(fastq_path)

    fastq_handler.extract_reads_by_id(id_set=names, output_path=output_path, print_status=True)

    return output_path
def main(reads_file_path, true_ref_sequence_path=None, output_dir=None, n_passes=0):
    # n_passes counts polishing rounds; the previous default of False behaved as 0
    if output_dir is None:
        output_dir = "./"
    else:
        FileManager.ensure_directory_exists(output_dir)

    assembly_sequence_path = assemble_wtdbg2(output_dir=output_dir, input_file_path=reads_file_path)

    reads_vs_ref_sam_path, reads_vs_ref_bam_path = align_minimap(output_dir=output_dir,
                                                                 ref_sequence_path=assembly_sequence_path,
                                                                 reads_sequence_path=reads_file_path)

    if true_ref_sequence_path is not None:
        assembled_vs_true_ref_sam_path, assembled_vs_true_ref_bam_path = \
            align_minimap(output_dir=output_dir,
                          ref_sequence_path=true_ref_sequence_path,
                          reads_sequence_path=assembly_sequence_path)

    polished_ref_paths = list()

    for i in range(n_passes):
        suffix = str(i + 1) + "x"
        polish_output_dir = join(output_dir, suffix)
        FileManager.ensure_directory_exists(polish_output_dir)

        # each pass polishes the previous pass's output (or the raw assembly on pass 0)
        if i == 0:
            ref_sequence_path = assembly_sequence_path
        else:
            ref_sequence_path = polished_ref_paths[i - 1]

        reads_vs_polished_ref_sam_path, reads_vs_polished_ref_bam_path = \
            align_minimap(output_dir=polish_output_dir,
                          ref_sequence_path=ref_sequence_path,
                          reads_sequence_path=reads_file_path)

        repolished_ref_sequence_path = polish_racon(output_dir=polish_output_dir,
                                                    reads_file_path=reads_file_path,
                                                    reads_vs_ref_sam_path=reads_vs_polished_ref_sam_path,
                                                    ref_sequence_path=ref_sequence_path,
                                                    suffix=suffix)

        polished_ref_paths.append(repolished_ref_sequence_path)

        if true_ref_sequence_path is not None:
            repolished_vs_true_ref_sam_path, repolished_vs_true_ref_bam_path = \
                align_minimap(output_dir=polish_output_dir,
                              ref_sequence_path=true_ref_sequence_path,
                              reads_sequence_path=repolished_ref_sequence_path)
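# Hedged usage note (assumption, not in the original source): a typical invocation
# of the assemble/polish pipeline above; the paths and pass count are illustrative.
#
#   main(reads_file_path="reads.fastq",
#        true_ref_sequence_path="true_ref.fasta",
#        output_dir="polish_run/",
#        n_passes=2)
#
# Each pass i re-aligns reads to the previous consensus and writes racon output
# under an "<i>x/" subdirectory of output_dir.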
def train_joint_model_from_tuples(tuples_path):
    training_tuples = load_training_tuples(tuples_path, cutoff=16)

    print("training tuples loaded: ", len(training_tuples))

    distribution = train_model(data=training_tuples)

    distribution_output_dir = "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/distribution/"
    distribution_filename = "distribution_" + FileManager.get_datetime_string()

    FileManager.save_object_pickle(object=distribution,
                                   filename=distribution_filename,
                                   output_dir=distribution_output_dir)
def main(ref_sequence_path, reads_sequence_path, minimap_preset, output_dir=None):
    if output_dir is None:
        output_dir = "./"
    else:
        FileManager.ensure_directory_exists(output_dir)

    output_sam_file_path, output_bam_file_path = align_minimap(output_dir=output_dir,
                                                               ref_sequence_path=ref_sequence_path,
                                                               reads_sequence_path=reads_sequence_path,
                                                               preset=minimap_preset)

    process_bam(bam_path=output_bam_file_path, reference_path=ref_sequence_path, output_dir=output_dir)
def process_bam(bam_path, reference_path, output_dir=None):
    """
    Parse alignments from a bam and export the observed variants (mismatches, inserts, deletes)
    to per-chromosome CSV files
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: where to save variant CSVs
    :return:
    """
    print("\n" + bam_path)

    if output_dir is None:
        output_dir = "variants/"

    # Make a subdirectory to contain everything
    datetime_string = FileManager.get_datetime_string()
    output_subdirectory = "variants_" + datetime_string
    output_dir = os.path.join(output_dir, output_subdirectory)
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = fasta_handler.get_contig_names()
    chromosome_names = sort_chromosome_names(names=chromosome_names, prefix="chr")

    print("ref contig names:", chromosome_names)

    for chromosome_name in chromosome_names:
        print("Parsing alignments for ref contig:", chromosome_name)

        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        start = 0
        stop = chromosome_length

        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=start, stop=stop)

        inserts, deletes, mismatches = parse_reads(reads=reads,
                                                   fasta_handler=fasta_handler,
                                                   chromosome_name=chromosome_name)

        export_variants_to_csv(output_dir=output_dir,
                               chromosome_name=chromosome_name,
                               mismatches=mismatches,
                               inserts=inserts,
                               deletes=deletes,
                               merge=True)
def main():
    output_dir = "output/" + "read_names_" + FileManager.get_datetime_string()
    output_filename = "read_names.txt"
    output_path = os.path.join(output_dir, output_filename)

    FileManager.ensure_directory_exists(output_dir)

    # STEP 1
    # Find intersection of read names within runnie and fastq files
    fastq_path = "/home/ryan/data/Nanopore/ecoli/guppy/r94_ec_guppy_rad2.fastq"
    runnie_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_all.out"

    # name_intersection_path = find_intersection_of_runnie_and_fastq(output_path=output_path,
    #                                                                fastq_path=fastq_path,
    #                                                                runnie_path=runnie_path)

    # STEP 2
    # Split sequence names into train/test partition
    name_intersection_path = "/home/ryan/code/runlength_analysis/output/read_names_2019_3_26_11_50_guppy_runnie_intersection/read_names.txt"

    names = read_names_from_file(name_intersection_path)
    names_train, names_test = partition_names(names)

    # STEP 3
    # Extract names and write to files
    runnie_train_subset_path = extract_runnie_reads_by_name(runnie_path=runnie_path,
                                                            output_dir=output_dir,
                                                            output_filename_suffix="train",
                                                            names=names_train)

    fastq_train_subset_path = extract_fastq_reads_by_name(fastq_path=fastq_path,
                                                          output_dir=output_dir,
                                                          output_filename_suffix="train",
                                                          names=names_train)

    runnie_test_subset_path = extract_runnie_reads_by_name(runnie_path=runnie_path,
                                                           output_dir=output_dir,
                                                           output_filename_suffix="test",
                                                           names=names_test)

    fastq_test_subset_path = extract_fastq_reads_by_name(fastq_path=fastq_path,
                                                         output_dir=output_dir,
                                                         output_filename_suffix="test",
                                                         names=names_test)

    # STEP 4
    # Verify that each train/test subset pair still shares the same read names
    name_intersection_path = find_intersection_of_runnie_and_fastq(output_path=output_path,
                                                                   fastq_path=fastq_train_subset_path,
                                                                   runnie_path=runnie_train_subset_path)

    name_intersection_path = find_intersection_of_runnie_and_fastq(output_path=output_path,
                                                                   fastq_path=fastq_test_subset_path,
                                                                   runnie_path=runnie_test_subset_path)
def save_numpy_matrix(output_dir, filename, matrix):
    array_file_extension = ".npz"

    # ensure output directory exists
    if not os.path.exists(output_dir):
        FileManager.ensure_directory_exists(output_dir)

    output_path_prefix = os.path.join(output_dir, filename)
    output_path = output_path_prefix + array_file_extension

    # write numpy array under key "a"
    numpy.savez_compressed(output_path, a=matrix)
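# Hedged sketch (assumption, not in the original source): reading back a matrix
# written by save_numpy_matrix, using the "a" key chosen above.
def load_numpy_matrix(path):
    return numpy.load(path)["a"]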
def __init__(self):
    # e.g. "2018-10-17-15-1-39-2-290": year-month-day-hour-minute-second-weekday-yearday
    # (timetuple's trailing tm_isdst field is dropped by the [:-1] slice)
    self.datetime_string = '-'.join(list(map(str, datetime.datetime.now().timetuple()))[:-1])
    self.subdirectory_name = "training_" + self.datetime_string
    self.output_directory_name = "output/"
    self.directory = path.join(self.output_directory_name, self.subdirectory_name)
    self.n_checkpoints = 0

    FileManager.ensure_directory_exists(self.directory)
def run_generate_tuples_from_pileups():
    max_threads = 6

    # NC_003279.8   Caenorhabditis elegans chromosome I
    # NC_003280.10  Caenorhabditis elegans chromosome II
    # NC_003281.10  Caenorhabditis elegans chromosome III
    # NC_003282.8   Caenorhabditis elegans chromosome IV
    # NC_003283.11  Caenorhabditis elegans chromosome V
    # NC_003284.9   Caenorhabditis elegans chromosome X
    # NC_001328.1   Caenorhabditis elegans mitochondrion, complete genome

    # data_path = ["/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003280.10",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003281.10",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003282.8",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003283.11",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003284.9"]

    data_path = ["/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-11-12-14-8-24-0-316/gi"]

    args = list()

    for path in data_path:
        gap_filterer = GapFilterer()

        batch_size = 1

        file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=path, file_extension=".npz")

        data_loader = DataLoader(file_paths, batch_size=batch_size, parse_batches=False)

        consensus_caller = ConsensusCaller(sequence_to_index=sequence_to_index, sequence_to_float=sequence_to_float)

        output_dir = "output/joint_runlength_base_model/" + FileManager.get_datetime_string()

        filename_suffix = path.split("/")[-1]
        print(filename_suffix)

        args.append([data_loader, batch_size, consensus_caller, output_dir, filename_suffix, gap_filterer])

    gap_filterer = None
    gc.collect()

    n_threads = min(len(args), max_threads)

    for arg in args:
        print(arg)

    print(n_threads)

    with Pool(processes=n_threads) as pool:
        pool.starmap(generate_training_data, args)
def process_bam(bam_path, reference_path, bac_path, output_dir=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities/matches/mismatches/indels
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: where to save stats
    :return:
    """
    if output_dir is None:
        output_dir = "stats/"

    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_handler = FastaHandler(reference_path)
    bac_fasta_handler = FastaHandler(bac_path)

    chromosome_names = ref_fasta_handler.get_contig_names()
    bac_names = bac_fasta_handler.get_contig_names()

    print(chromosome_names)
    print(bac_names)

    data_per_bac = defaultdict(list)

    for chromosome_name in chromosome_names:
        chromosome_length = ref_fasta_handler.get_chr_sequence_length(chromosome_name)

        start = 0
        stop = chromosome_length

        ref_fasta_handler = FastaHandler(reference_file_path=reference_path)
        bam_handler = BamHandler(bam_file_path=bam_path)

        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=start, stop=stop)

        read_data = parse_reads(reads=reads, fasta_handler=ref_fasta_handler, chromosome_name=chromosome_name)

        for data in read_data:
            data_per_bac[data[0]].append([chromosome_name] + data)

    # filtered_data = filter_supplementaries_by_largest(data_per_bac)
    filtered_data = aggregate_bac_data(data_per_bac)

    export_bac_data_to_csv(read_data=filtered_data, output_dir=output_dir, bam_path=bam_path)
def save_directional_frequency_matrices_as_delimited_text(output_dir, frequency_matrices, delimiter=",",
                                                          log_normalize=False, plot=False):
    if log_normalize:
        filename = "probability_matrices_directional_" + FileManager.get_datetime_string() + ".csv"
    else:
        filename = "frequency_matrices_directional_" + FileManager.get_datetime_string() + ".csv"

    reversal_suffixes = ["F", "R"]
    output_path = os.path.join(output_dir, filename)

    with open(output_path, "w") as file:
        for reversal in [0, 1]:
            for base_index in range(4):
                base = INDEX_TO_BASE[base_index]
                suffix = reversal_suffixes[reversal]

                matrix = numpy.squeeze(frequency_matrices[reversal, base_index, :, :])

                # raw counts are written as ints; normalized matrices as floats
                # (renamed from "type" to avoid shadowing the builtin)
                cast_type = int
                if log_normalize:
                    matrix = normalize(matrix, pseudocount=15)
                    cast_type = float

                if plot:
                    pyplot.imshow(matrix)
                    pyplot.show()
                    pyplot.close()

                matrix_name = "_".join([base, suffix])
                header = ">" + matrix_name + "\n"

                file.write(header)

                for r in range(matrix.shape[0]):
                    row = [str(cast_type(x)) for x in matrix[r]]
                    row = delimiter.join(row) + "\n"
                    file.write(row)

                file.write("\n")
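# Hedged sketch (assumption, not in the original source): reading the delimited
# matrices back. Each block written above is a ">A_F"-style header followed by
# delimited rows and a terminating blank line; the keys and layout mirror the writer.
def load_directional_matrices_from_delimited_text(path, delimiter=","):
    matrices = dict()
    name = None
    rows = list()

    with open(path, "r") as file:
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                name = line[1:]     # start of a new matrix block
                rows = list()
            elif line == "":
                if name is not None and len(rows) > 0:
                    matrices[name] = numpy.array(rows)
                name = None
            else:
                rows.append([float(x) for x in line.split(delimiter)])

    return matrices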
def process_bam(bam_path, reference_path, output_dir=None, centromere_table_path=None, gap_table_path=None,
                segdup_table_path=None, max_threads=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a plot of alignments
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: where to save plots
    :return:
    """
    print("\n" + bam_path)

    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    if output_dir is None:
        output_dir = "plots/"

    process_manager = Manager()
    genome_data = process_manager.list()

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_path)
    chromosome_names = fasta_handler.get_contig_names()

    arguments = list()

    for chromosome_name in chromosome_names:
        arguments.append([bam_path,
                          reference_path,
                          chromosome_name,
                          output_dir,
                          centromere_table_path,
                          gap_table_path,
                          segdup_table_path,
                          genome_data])

    if len(arguments) < max_threads:
        max_threads = len(arguments)

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_chromosome_data, arguments)

    export_genome_summary_to_csv(bam_path=bam_path, output_dir=output_dir, genome_data=genome_data)
def process_bam(bam_path, reference_path, max_threads, output_dir=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities/matches/mismatches/indels
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: where to save stats
    :return:
    """
    if output_dir is None:
        output_dir = "stats/"

    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    process_manager = Manager()
    genome_data = process_manager.list()

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_path)
    chromosome_names = fasta_handler.get_contig_names()

    arguments = list()

    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        start = 0
        stop = chromosome_length

        arguments.append([genome_data, reference_path, chromosome_name, start, stop, output_dir, bam_path])

    if len(arguments) < max_threads:
        print("Fewer jobs than threads")
        max_threads = len(arguments)

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_chromosome_stats, arguments)

    print("genome_data", genome_data)

    export_genome_summary_to_csv(bam_path=bam_path, output_dir=output_dir, genome_data=genome_data)
def main(max_threads=None):
    # runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_runnie_0.out"
    runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_runnie_0_1_10_11_12_13.out"

    output_parent_dir = "output/version_comparison/mode/"
    output_dir = "runlength_matrix_from_assembly_contigs_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    handler = RunlengthHandler(runlength_path)

    if max_threads is None:
        max_threads = max(1, multiprocessing.cpu_count() - 2)

    with multiprocessing.Pool(processes=max_threads) as pool:
        for r, read_id in enumerate(pool.imap(arg_unpacker, arg_iterator(handler=handler, output_dir=output_dir))):
            sys.stdout.write("\r%d" % r)

    print()
    print("Concatenating files...")

    output_file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=output_dir,
                                                               file_extension=".fasta")

    concatenated_filename = os.path.basename(runlength_path).split(".")[0] + ".fasta"
    concatenated_file_path = os.path.join(output_dir, concatenated_filename)

    print("Saving to file: %s" % concatenated_file_path)

    FileManager.concatenate_files(file_paths=output_file_paths, output_file_path=concatenated_file_path)
    FileManager.delete_files(output_file_paths)
def main(reads_file_path, genome_size=None, output_dir=None):
    if output_dir is None:
        output_dir = "./"
    else:
        FileManager.ensure_directory_exists(output_dir)

    if genome_size is None:
        genome_size = "3g"
        print("WARNING: genome size flag not specified, defaulting to human size (3g)")

    assembly_sequence_path = assemble_wtdbg2(output_dir=output_dir,
                                             input_file_path=reads_file_path,
                                             genome_size=genome_size)
def run():
    # directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8"  # one-hot with anchors and reversal matrix, chr1, C. elegans
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-11-12-14-8-24-0-316/gi"  # one-hot with anchors and reversal matrix, E. coli

    file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=directory,
                                                        file_extension=".npz",
                                                        sort=False)

    # file_paths = ["/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_9699291_matrix.npz",
    #               "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_4172039_matrix.npz",
    #               "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_4552073_matrix.npz",
    #               "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_7332035_matrix.npz",
    #               "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_12807084_matrix.npz",
    #               "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_7773028_matrix.npz"]

    # Training parameters
    batch_size_train = 1
    n_batches = 1000

    data_loader = DataLoader(file_paths=file_paths, batch_size=batch_size_train, parse_batches=False)

    gap_filterer = GapFilterer(threshold=0.003)

    consensus_caller = ConsensusCaller(sequence_to_index, sequence_to_float)

    print(len(data_loader))

    test_consensus(consensus_caller=consensus_caller,
                   data_loader=data_loader,
                   n_batches=n_batches,
                   gap_filterer=gap_filterer,
                   plot_mismatches=False)
def get_all_aligned_lengths(bam_path, recursive=False):
    if os.path.isdir(bam_path):
        bam_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=bam_path,
                                                           file_extension=".bam",
                                                           recursive=recursive)
        print(bam_paths)
    else:
        bam_paths = [bam_path]

    manager = Manager()
    assembly_contigs = manager.dict()

    max_threads = max(1, cpu_count() - 2)

    arguments = list()
    for path in bam_paths:
        arguments.append([path, assembly_contigs])

    if len(arguments) < max_threads:
        print("Fewer jobs than threads")
        max_threads = len(arguments)

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_aligned_contig_lengths, arguments)

    return assembly_contigs
def run():
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8"  # one-hot with anchors and reversal matrix, chr1, filtered 2820
    model_state_path = "output/training_2018-10-17-15-1-39-2-290/model_checkpoint_9"

    file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=directory,
                                                        file_extension=".npz",
                                                        sort=False)

    # Training parameters
    batch_size_train = 1
    n_batches = 1000
    threshold = 0.005

    data_loader = DataLoader(file_paths=file_paths,
                             batch_size=batch_size_train,
                             parse_batches=False,
                             convert_to_distributions=False,
                             use_gpu=False)

    gap_filterer = GapFilterer(model_state_path=model_state_path, threshold=threshold)

    test_filter(gap_filterer=gap_filterer, data_loader=data_loader, n_batches=n_batches)
def run():
    # directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-2-10-43-22-1-275/NC_003282.8"  # one-hot with anchors and reversal matrix, chr4
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8"  # one-hot with anchors and reversal matrix, chr1

    file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=directory,
                                                        file_extension=".npz",
                                                        sort=False)

    # Training parameters
    batch_size_train = 1
    checkpoint_interval = 300
    n_batches = 1000

    data_loader = DataLoader(file_paths=file_paths, batch_size=batch_size_train, parse_batches=False)

    consensus_caller = ConsensusCaller(sequence_to_index, sequence_to_float)

    gap_filterer = GapFilterer()

    print(len(data_loader))

    test_consensus(consensus_caller=consensus_caller,
                   data_loader=data_loader,
                   plot_mismatches=False,
                   gap_filterer=gap_filterer,
                   n_batches=n_batches)
def get_contig_lengths_from_phaseblock_csvs(parent_directory):
    print(parent_directory)

    paths = FileManager.get_all_file_paths_by_type(parent_directory_path=parent_directory,
                                                   file_extension="csv")
    print(paths)

    assembly_contigs = dict()

    for path in paths:
        # accumulate (name, length) pairs per csv so each file gets its own entry
        names = list()
        lengths = list()

        with open(path, "r") as file:
            for l, line in enumerate(file):
                print(line)
                if len(line) == 0:
                    continue

                items = line.strip().split(",")
                length = int(items[-1]) - int(items[-2])
                print(length)

                lengths.append(length)
                names.append(items[1])

        contigs = list(zip(names, lengths))
        contigs = sorted(contigs, key=lambda x: x[1], reverse=True)

        assembly_contigs[path] = contigs

    return assembly_contigs
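# Hedged note (assumption, not in the original source): the parser above expects
# comma-delimited rows whose second column is a contig name and whose last two
# columns are start/end coordinates of a phase block, e.g.:
#
#   chunk_0,contig_1,phase_0,1000000,1500000
#
# which would contribute the pair ("contig_1", 500000).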
def plot_kernels_and_column_frequencies(kernel_sums, passing_indices, column_frequencies, slice_range=None,
                                        save=False, output_dir=None, filename=None):
    if slice_range is not None:
        kernel_sums = kernel_sums[:, slice_range[0]:slice_range[1]]
        passing_indices = passing_indices[:, slice_range[0]:slice_range[1]]
        column_frequencies = column_frequencies[:, slice_range[0]:slice_range[1]]

    # numpy reshape is not in-place, so the results must be reassigned
    kernel_sums = kernel_sums.reshape(1, kernel_sums.shape[1])
    passing_indices = passing_indices.reshape(1, passing_indices.shape[1])
    column_frequencies = column_frequencies.reshape(column_frequencies.shape[0], column_frequencies.shape[1])

    fig, axes = pyplot.subplots(nrows=3, sharex=True)
    fig.set_size_inches(16, 4)

    axes[0].imshow(passing_indices)
    axes[1].imshow(kernel_sums)
    axes[2].imshow(column_frequencies)

    axes[0].set_ylabel("Thresholded")
    axes[1].set_ylabel("Convolution")
    axes[2].set_ylabel("Frequencies")

    axes[0].set_yticklabels([])
    axes[1].set_yticklabels([])
    axes[2].set_yticklabels([])

    axes[0].set_yticks([])
    axes[1].set_yticks([])
    axes[2].set_yticks([])

    if save:
        FileManager.ensure_directory_exists(output_dir)
        filename = filename + "_kernels.png"
        path = os.path.join(output_dir, filename)
        pyplot.savefig(path)
    else:
        pyplot.show()

    pyplot.close()
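# Hedged usage sketch (assumption, not in the original source): synthetic inputs
# for plot_kernels_and_column_frequencies. Shapes follow what the reshapes above
# expect: 1 x n row vectors for the kernel sums and thresholded indices, and a
# c x n matrix of per-column frequencies.
def example_plot_kernels_and_column_frequencies():
    n = 200
    kernel_sums = numpy.random.rand(1, n)                   # placeholder convolution output
    passing_indices = (kernel_sums > 0.5).astype(int)       # placeholder threshold mask
    column_frequencies = numpy.random.rand(5, n)            # placeholder per-column frequencies

    plot_kernels_and_column_frequencies(kernel_sums=kernel_sums,
                                        passing_indices=passing_indices,
                                        column_frequencies=column_frequencies,
                                        slice_range=(0, 100),
                                        save=False)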