def _rle_output_path(fasta_path, output_dir):
    """
    Build "<output_dir>/<input file name without its last extension>_rle.fasta"

    :param fasta_path: path of the input sequence file
    :param output_dir: directory the run length encoded copy will be written to
    :return: path of the run length encoded output file
    """
    basename = os.path.basename(fasta_path)

    # rsplit keeps the full name when the file has no extension, so two extension-less inputs no longer
    # both collapse to the same "_rle.fasta" output path
    prefix = basename.rsplit(".", 1)[0]

    return os.path.join(output_dir, prefix + "_rle.fasta")


def _write_rle_fasta(input_fasta_path, output_fasta_path):
    """
    Run length encode every contig of one fasta and write the encoded bases as a new fasta

    :param input_fasta_path: fasta to read sequences from
    :param output_fasta_path: file to (over)write with one ">name RLE" record per contig
    :return:
    """
    with open(output_fasta_path, "w") as file:
        fasta_handler = FastaHandler(input_fasta_path)

        for name in fasta_handler.get_contig_names():
            sequence = fasta_handler.get_sequence(chromosome_name=name, start=None, stop=None)

            # Only the collapsed bases are written; the run lengths are not needed for alignment
            bases, _ = runlength_encode(sequence)

            file.write(">" + name + " RLE\n")
            file.write(bases + "\n")


def iteratively_align_as_RLE(ref_fasta_path, read_fasta_path, output_dir):
    """
    Given 2 fasta files for reads and reference, iterate them, runlength encode their sequences, and write the RLE
    sequences to a new file, then align them with minimap2

    :param ref_fasta_path: fasta containing the reference sequence(s)
    :param read_fasta_path: fasta containing the read sequences
    :param output_dir: directory that receives both "*_rle.fasta" files and is passed on to align_minimap
    :return: the BAM path returned by align_minimap
    """
    runlength_ref_fasta_path = _rle_output_path(ref_fasta_path, output_dir)
    runlength_read_fasta_path = _rle_output_path(read_fasta_path, output_dir)

    print("SAVING run length fasta file:", runlength_ref_fasta_path)
    print("SAVING run length fasta file:", runlength_read_fasta_path)

    _write_rle_fasta(ref_fasta_path, runlength_ref_fasta_path)
    _write_rle_fasta(read_fasta_path, runlength_read_fasta_path)

    # align_minimap returns (sam path, bam path); only the BAM is handed back to the caller
    _, output_bam_file_path = align_minimap(output_dir=output_dir,
                                            ref_sequence_path=runlength_ref_fasta_path,
                                            reads_sequence_path=runlength_read_fasta_path)

    return output_bam_file_path
def process_bam(bam_path, reference_path, bac_path, output_dir=None):
    """
    Collect the per-read records produced by parse_reads for every reference contig, group them by the first field
    of each record, aggregate the groups and export the result as CSV

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param bac_path: fasta whose contig names are read and printed (not used otherwise in this function)
    :param output_dir: where to save stats, "stats/" when None
    :return:
    """
    output_dir = "stats/" if output_dir is None else output_dir

    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_handler = FastaHandler(reference_path)
    bac_fasta_handler = FastaHandler(bac_path)

    chromosome_names = ref_fasta_handler.get_contig_names()
    bac_names = bac_fasta_handler.get_contig_names()

    print(chromosome_names)
    print(bac_names)

    # Key: first field of each parsed record (presumably the BAC/read name -- verify against parse_reads)
    data_per_bac = defaultdict(list)

    for chromosome_name in chromosome_names:
        # Whole contig: [0, contig length)
        contig_end = ref_fasta_handler.get_chr_sequence_length(chromosome_name)

        # Fresh handlers are opened for every contig, as in the original implementation
        ref_fasta_handler = FastaHandler(reference_file_path=reference_path)
        bam_handler = BamHandler(bam_file_path=bam_path)

        contig_reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                             start=0,
                                             stop=contig_end)

        parsed_records = parse_reads(reads=contig_reads,
                                     fasta_handler=ref_fasta_handler,
                                     chromosome_name=chromosome_name)

        for record in parsed_records:
            # Prefix each record with the contig it was aligned to
            data_per_bac[record[0]].append([chromosome_name] + record)

    # filtered_data = filter_supplementaries_by_largest(data_per_bac)
    filtered_data = aggregate_bac_data(data_per_bac)

    export_bac_data_to_csv(read_data=filtered_data,
                           output_dir=output_dir,
                           bam_path=bam_path)
def runlength_encode_fasta_parallel(fasta_sequence_path, max_threads=None, min_length=0):
    """
    Run runlength_encode_parallel once per contig of a fasta file, using a process pool

    :param fasta_sequence_path: fasta whose contigs are encoded
    :param max_threads: maximum number of worker processes, defaults to max(1, cpu_count() - 2); never more than
    the number of contigs and never fewer than 1
    :param min_length: forwarded to every worker; a warning is printed when it is > 0 (the filtering itself is
    presumably done inside runlength_encode_parallel -- not visible here)
    :return: the Manager dict that was handed to every worker
    """
    if min_length > 0:
        print("WARNING: excluding all sequences less than length %d" % min_length)

    fasta_handler = FastaHandler(fasta_sequence_path)
    contig_names = fasta_handler.get_contig_names()

    # Shared between processes so workers can write into it
    manager = Manager()
    runlength_sequences = manager.dict()

    args = [[fasta_sequence_path, contig_name, runlength_sequences, min_length]
            for contig_name in contig_names]

    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    # No more workers than jobs, but at least one: Pool(processes=0) raises ValueError, which used to crash
    # this function for a fasta without contigs
    max_threads = max(1, min(max_threads, len(args)))

    with Pool(processes=max_threads, maxtasksperchild=40) as pool:
        pool.starmap(runlength_encode_parallel, args, chunksize=1)

    sys.stderr.write("\n")

    return runlength_sequences
def main():
    """
    Print the contig names and lengths of the hard-coded reference, then look up the length of the selected contig

    All input paths are hard-coded; the commented-out groups are alternative data sets.
    """
    output_dir = os.path.join("output/", "spoa_pileup_generation_" + get_current_timestamp())

    # ---- Illumina (laptop) --------------------------------------------------
    # bam_file_path = "/Users/saureous/data/Platinum/chr1.sorted.bam"
    # reference_file_path = "/Users/saureous/data/Platinum/chr1.fa"
    # vcf_path = "/Users/saureous/data/Platinum/NA12878_S1.genome.vcf.gz"
    # bed_path = "/Users/saureous/data/Platinum/chr1_confident.bed"

    # ---- GIAB (dev machine) -------------------------------------------------
    # bam_file_path = "/home/ryan/data/GIAB/NA12878_GIAB_30x_GRCh37.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh37_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh37.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    # reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    # windows_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003283.11_0_20924180_2018_9_28_10_56"

    # ---- Nanopore GUPPY - E. Coli - (dev machine) -------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli.contigs.fasta.reads.sorted.bam"
    bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli_reads_vs_ref.bam"
    reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    # reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli.contigs.fasta"

    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    chromosome_name = "gi"  # E coli
    # chromosome_name = "NC_003279.8"   # celegans chr1
    # chromosome_name = "NC_003283.11"  # celegans chr5
    # chromosome_name = "1"
    # chromosome_name = "chr" + chromosome_name

    # One length per contig, in the same order as contig_names
    lengths = [fasta_handler.get_chr_sequence_length(name) for name in contig_names]

    print('\t'.join(contig_names))
    print('\t\t'.join(map(str, lengths)))

    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
def process_bam(bam_path, reference_path, output_dir=None):
    """
    For every contig of the reference, parse the reads aligned to it into inserts, deletes and mismatches and
    export them as CSV into a fresh, timestamped subdirectory

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: parent directory for the "variants_<datetime>" subdirectory, "variants/" when None
    :return:
    """
    print("\n" + bam_path)

    parent_dir = "variants/" if output_dir is None else output_dir

    # Make a subdirectory to contain everything
    output_dir = os.path.join(parent_dir, "variants_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = sort_chromosome_names(names=fasta_handler.get_contig_names(), prefix="chr")

    print("ref contig names:", chromosome_names)

    for chromosome_name in chromosome_names:
        print("Parsing alignments for ref contig:", chromosome_name)

        # Fetch reads over the whole contig: [0, contig length)
        contig_end = fasta_handler.get_chr_sequence_length(chromosome_name)

        contig_reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                             start=0,
                                             stop=contig_end)

        inserts, deletes, mismatches = parse_reads(reads=contig_reads,
                                                   fasta_handler=fasta_handler,
                                                   chromosome_name=chromosome_name)

        export_variants_to_csv(output_dir=output_dir,
                               chromosome_name=chromosome_name,
                               mismatches=mismatches,
                               inserts=inserts,
                               deletes=deletes,
                               merge=True)
def process_bam(bam_path, reference_path, max_threads, output_dir=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities/matches/mismatches/indels

    get_chromosome_stats is run once per reference contig in a process pool; every job receives the same shared
    list, which is exported as a genome-wide CSV summary afterwards.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param max_threads: maximum number of worker processes, max(1, cpu_count() - 2) when None; never more than
    the number of contigs and never fewer than 1
    :param output_dir: where to save stats, "stats/" when None
    :return:
    """
    if output_dir is None:
        output_dir = "stats/"

    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    # Shared between processes so that every job can add to the same list
    process_manager = Manager()
    genome_data = process_manager.list()

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_path)
    chromosome_names = fasta_handler.get_contig_names()

    arguments = list()

    for chromosome_name in chromosome_names:
        # Each job covers one whole contig: [0, contig length)
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        arguments.append([genome_data, reference_path, chromosome_name, 0, chromosome_length, output_dir, bam_path])

    if len(arguments) < max_threads:
        print("Fewer jobs than threads")

        # At least one worker: Pool(processes=0) raises ValueError, which used to crash here when the reference
        # had no contigs
        max_threads = max(1, len(arguments))

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_chromosome_stats, arguments)

    print("genome_data", genome_data)

    export_genome_summary_to_csv(bam_path=bam_path, output_dir=output_dir, genome_data=genome_data)
def process_bam(bam_path, reference_path, output_dir=None, centromere_table_path=None, gap_table_path=None,
                segdup_table_path=None, max_threads=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a plot of alignments

    get_chromosome_data is run once per reference contig in a process pool; every job receives the same shared
    list, which is exported as a genome-wide CSV summary afterwards.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: where to save plots, "plots/" when None
    :param centromere_table_path: forwarded unchanged to get_chromosome_data
    :param gap_table_path: forwarded unchanged to get_chromosome_data
    :param segdup_table_path: forwarded unchanged to get_chromosome_data
    :param max_threads: maximum number of worker processes, max(1, cpu_count() - 2) when None; never more than
    the number of contigs and never fewer than 1
    :return:
    """
    print("\n" + bam_path)

    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    if output_dir is None:
        output_dir = "plots/"

    # Shared between processes so that every job can add to the same list
    process_manager = Manager()
    genome_data = process_manager.list()

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_path)
    chromosome_names = fasta_handler.get_contig_names()

    arguments = [[bam_path, reference_path, chromosome_name, output_dir, centromere_table_path, gap_table_path,
                  segdup_table_path, genome_data]
                 for chromosome_name in chromosome_names]

    if len(arguments) < max_threads:
        # At least one worker: Pool(processes=0) raises ValueError, which used to crash here when the reference
        # had no contigs
        max_threads = max(1, len(arguments))

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_chromosome_data, arguments)

    export_genome_summary_to_csv(bam_path=bam_path, output_dir=output_dir, genome_data=genome_data)
def get_contig_lengths(assembly_path, assembly_contigs):
    """
    Store the [name, length] pairs of all contigs of one assembly, longest first, in a caller-provided mapping

    :param assembly_path: fasta of the assembly; also used as the key in assembly_contigs
    :param assembly_contigs: mapping that is updated in place with assembly_path -> list of [name, length]
    :return:
    """
    handler = FastaHandler(assembly_path)

    # Start from name order so that contigs of equal length stay alphabetically ordered after the stable sort
    contigs = [[name, handler.get_chr_sequence_length(name)]
               for name in sorted(handler.get_contig_names())]

    contigs.sort(key=lambda contig: contig[LENGTH], reverse=True)

    print("Assembly parsed: %s" % assembly_path)

    assembly_contigs[assembly_path] = contigs
def main(sequences_path, cutoff):
    """
    Write "assemble_long_segments.sh" with one AssembleSegment.py command per sequence longer than cutoff

    :param sequences_path: fasta containing the sequences to screen
    :param cutoff: only sequences with length strictly greater than this are printed and written
    :return:
    """
    fasta = FastaHandler(sequences_path)
    names = fasta.get_contig_names()

    with open("assemble_long_segments.sh", "w") as script:
        for name in names:
            length = fasta.get_chr_sequence_length(name)

            if not length > cutoff:
                continue

            print(name, length)
            script.write("../build/shasta-install/bin/AssembleSegment.py " + name + "\n")
def parse_bam(bam_path, reference_path):
    """
    Iterate a BAM file contig by contig and accumulate summary stats with count_cigar_operations

    :param bam_path: BAM to read alignments from
    :param reference_path: fasta that provides the contig names and lengths to iterate
    :return: chromosomal_cigar_counts, n_alignments, n_primary, n_supplementary, n_secondary, map_qualities as
    returned by the last call to count_cigar_operations (or their initial values if there are no contigs)
    """
    fasta_handler = FastaHandler(reference_path)
    chromosome_names = fasta_handler.get_contig_names()

    # Running totals; each one is passed into count_cigar_operations and replaced by what it returns
    chromosomal_cigar_counts = defaultdict(lambda: defaultdict(int))
    n_alignments = 0
    n_primary = 0
    n_supplementary = 0
    n_secondary = 0
    map_qualities = IterativeHistogram(start=0, stop=60, n_bins=6)

    for chromosome_name in chromosome_names:
        # A new BAM handler is opened for every contig, as in the original implementation
        bam_handler = BamHandler(bam_path)

        contig_end = fasta_handler.get_chr_sequence_length(chromosome_name)

        contig_reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                             start=0,
                                             stop=contig_end)

        updated_stats = count_cigar_operations(reads=contig_reads,
                                               chromosome_name=chromosome_name,
                                               chromosomal_cigar_counts=chromosomal_cigar_counts,
                                               n_alignments=n_alignments,
                                               n_primary=n_primary,
                                               n_supplementary=n_supplementary,
                                               n_secondary=n_secondary,
                                               map_qualities=map_qualities)

        (chromosomal_cigar_counts,
         n_alignments,
         n_primary,
         n_supplementary,
         n_secondary,
         map_qualities) = updated_stats

    return chromosomal_cigar_counts, n_alignments, n_primary, n_supplementary, n_secondary, map_qualities
def runlength_encode_fasta(fasta_sequence_path):
    """
    Run length encode every contig of a fasta file, reporting progress on stderr

    :param fasta_sequence_path: fasta to read
    :return: dict of contig name -> (bases, lengths) as returned by runlength_encode
    """
    handler = FastaHandler(fasta_sequence_path)

    encoded_by_name = {}

    for contig_name in handler.get_contig_names():
        # start/stop of None are passed through to the handler as-is
        raw_sequence = handler.get_sequence(chromosome_name=contig_name, start=None, stop=None)

        bases, lengths = runlength_encode(raw_sequence)
        encoded_by_name[contig_name] = (bases, lengths)

        # "\r" rewrites the same terminal line for every contig
        sys.stderr.write("\rRun length encoded %s " % contig_name)

    sys.stderr.write("\n")

    return encoded_by_name
def main():
    """
    Count run lengths per character for the contigs of the hard-coded reference fasta

    The per-contig result of count_runlength_per_character is not used further inside this function.
    """
    output_dir = "output/ref_run_lengths/"
    filename_prefix = "ref_runlength_distribution"

    reference_file_path = "/home/ryan/data/Nanopore/Human/paolo/LC2019/kishwar/shasta_assembly_GM24385_chr20.fasta"

    # ---- GIAB E. Coli - (dev machine) -------------------------
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # reference_file_path = "/home/ryan/data/Nanopore/ecoli/refEcoli.fasta"
    # -------------------------------------------------------------------------

    threshold = 5

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    all_counts = defaultdict(Counter)

    sys.stderr.write("reading fasta file...\n")
    sys.stderr.flush()

    has_multiple_contigs = len(contig_names) > 1

    n_parsed = 0
    for chromosome_name in contig_names:
        # NOTE(review): for multi-contig references this skips "chr1" and parses everything else (the original
        # condition was `not chromosome_name != "chr1"`) -- confirm that this is the intended selection
        if has_multiple_contigs and chromosome_name == "chr1":
            continue

        n_parsed += 1

        # sys.stderr.write("Parsing chromosome %s\n" % chromosome_name)
        # sys.stderr.flush()

        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        # Whole contig: [0, contig length)
        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                        start=0,
                                                        stop=chromosome_length)

        character_counts = count_runlength_per_character(sequence=reference_sequence,
                                                         threshold=threshold,
                                                         chromosome_name=chromosome_name)
def runlength_encode_fasta(fasta_sequence_path):
    """
    Run length encode every contig of a fasta file, printing the name and encoded sizes of each contig

    :param fasta_sequence_path: fasta to read
    :return: dict of contig name -> (bases, lengths) as returned by runlength_encode
    """
    handler = FastaHandler(fasta_sequence_path)

    encoded_by_name = {}

    for contig_name in handler.get_contig_names():
        # Fetch the whole contig: [0, contig length)
        contig_end = handler.get_chr_sequence_length(contig_name)
        raw_sequence = handler.get_sequence(chromosome_name=contig_name, start=0, stop=contig_end)

        bases, lengths = runlength_encode(raw_sequence)
        encoded_by_name[contig_name] = (bases, lengths)

        print(contig_name, len(bases), len(lengths))

    return encoded_by_name
def main():
    """
    Show the first 1000 reference positions of the hard-coded BAM as an image of encoded aligned segments

    Every aligned segment returned by get_aligned_segments (reference included) becomes one image row; each
    element is mapped through get_encoding and the whole matrix is negated before plotting.
    """
    # bam_file_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_sequence_2019_3_27_14_59_24_409353/sequence_subset_test_60x_10kb_rle_VS_refEcoli_rle.sorted.bam"
    # ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"

    bam_file_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_output_2019_4_8_17_33_14_191911/runnie_subset_test_60x_10kb_rle_VS_refEcoli_rle.sorted.bam"
    ref_fasta_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_output_2019_4_8_17_33_14_191911/refEcoli_rle.fasta"

    # -------------------------------------------------------------------------

    # One handler is enough; the original opened the same fasta twice and looked up an unused contig length
    fasta_handler = FastaHandler(ref_fasta_path)
    bam_handler = BamHandler(bam_file_path)

    # Only the first contig of the reference is plotted
    chromosome_name = fasta_handler.get_contig_names()[0]

    pileup_start = 0
    pileup_end = pileup_start + 1000  # add random variation here ?

    aligned_segments = get_aligned_segments(fasta_handler=fasta_handler,
                                            bam_handler=bam_handler,
                                            chromosome_name=chromosome_name,
                                            pileup_start=pileup_start,
                                            pileup_end=pileup_end,
                                            include_ref=True)

    encoding = [list(map(get_encoding, alignment)) for alignment in aligned_segments.values()]

    # numpy.float was only an alias of the builtin float and has been removed from NumPy (AttributeError on
    # current releases); the builtin gives the same float64 dtype
    encoding = -numpy.array(encoding, dtype=float)

    pyplot.imshow(encoding)
    pyplot.show()
    pyplot.close()
def main(reference_file_path):
    """
    Plot run length histograms per character for every contig of a reference, plus genome-wide AT and GC
    histograms, and print the accumulated counts

    Plots are written to "output/ref_run_lengths/<file name up to its first '.'>/".

    :param reference_file_path: fasta to analyse
    :return:
    """
    # basename already strips the directories, so no further splitting on "/" is needed
    input_prefix_name = os.path.basename(reference_file_path).split(".")[0]
    output_dir = os.path.join("output/ref_run_lengths/", input_prefix_name)
    filename_prefix = "ref_runlength_distribution"

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    print(contig_names)
    print(sorted([(x, fasta_handler.get_chr_sequence_length(x)) for x in contig_names], key=lambda x: x[1]))

    # Genome-wide accumulators: a Counter per character, and the raw run lengths pooled as AT and GC
    all_counts = defaultdict(lambda: Counter())
    raw_counts_AT = list()
    raw_counts_GC = list()

    sys.stderr.write("reading fasta file...\n")
    sys.stderr.flush()

    max_count = 100
    step = 1

    for chromosome_name in contig_names:
        # if len(contig_names) > 1:
        #     if not chromosome_name.startswith("chr") or "alt" in chromosome_name or "v" in chromosome_name:
        #         print("WARNING: SKIPPING CHROMOSOME %s" % chromosome_name)
        #         continue

        sys.stderr.write("Parsing chromosome %s\n" % chromosome_name)
        sys.stderr.flush()

        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                        start=0,
                                                        stop=chromosome_length)

        character_counts = count_runlength_per_character(reference_sequence)

        # squeeze=False always returns a 2D array of axes; without it a contig with a single character yields a
        # bare Axes object and the axes[k] indexing below fails
        figure, axes_grid = pyplot.subplots(nrows=len(character_counts.keys()), sharex=True, squeeze=False)
        axes = axes_grid[:, 0]

        figure.set_size_inches(6, 12)

        for k, key in enumerate(character_counts.keys()):
            counts = character_counts[key]
            counter = Counter(counts)
            all_counts[key] += counter

            if key in {"C", "G"}:
                raw_counts_GC += counts

            if key in {"A", "T"}:
                raw_counts_AT += counts

            plot_counts_as_histogram(axes=axes[k], counts=counts, max_count=max_count, step=step)

            axes[k].set_ylabel(str(key))
            axes[k].set_ylim([-0.5, 10])

        axes[0].set_title(chromosome_name)

        filename = filename_prefix + "_" + chromosome_name + ".png"
        file_path = os.path.join(output_dir, filename)
        figure.savefig(file_path)
        # pyplot.show()
        pyplot.close()

    # Genome-wide figure: AT on top, GC below
    figure, axes = pyplot.subplots(nrows=2)

    filename = filename_prefix + "_genomic.png"
    file_path = os.path.join(output_dir, filename)

    plot_counts_as_histogram(axes=axes[0], counts=raw_counts_AT, max_count=max_count, step=step)
    plot_counts_as_histogram(axes=axes[1], counts=raw_counts_GC, max_count=max_count, step=step)

    axes[0].set_ylabel("AT Log10 Frequency")
    axes[1].set_ylabel("GC Log10 Frequency")

    figure.savefig(file_path)
    # pyplot.show()
    pyplot.close()

    print_all_counts_as_shasta_matrix(all_counts, max_count=50)
    print_all_counts(all_counts, output_dir)
def main():
    """
    Run generate_window_run_length_encoding on one hard-coded window of the hard-coded C. elegans data set

    All input paths are hard-coded; the commented-out groups are alternative data sets and test windows.
    """
    output_dir = os.path.join("output/", "spoa_pileup_generation_" + get_current_timestamp())

    # ---- Illumina (laptop) --------------------------------------------------
    # bam_file_path = "/Users/saureous/data/Platinum/chr1.sorted.bam"
    # reference_file_path = "/Users/saureous/data/Platinum/chr1.fa"
    # vcf_path = "/Users/saureous/data/Platinum/NA12878_S1.genome.vcf.gz"
    # bed_path = "/Users/saureous/data/Platinum/chr1_confident.bed"

    # ---- GIAB (dev machine) -------------------------------------------------
    # bam_file_path = "/home/ryan/data/GIAB/NA12878_GIAB_30x_GRCh37.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh37_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh37.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    windows_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003283.11_0_20924180_2018_9_28_10_56"

    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    chromosome_name = "NC_003279.8"  # celegans chr1
    # chromosome_name = "NC_003283.11"  # celegans chr5
    # chromosome_name = "1"
    # chromosome_name = "chr" + chromosome_name

    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    # ---- TEST window --------------------------------------------------------

    # window = [762580, 762600]     # nanopore broken alignment region... POAPY ONLY
    # window = [748460, 748480]     # nanopore broken alignment region... POAPY ONLY
    # window = [767240, 767260]     # nanopore broken alignment region... SPOA NOOOOooOOoooo
    # window = [727360, 767280]     # nanopore broken alignment region... very high loss in CNNRNN
    # window = [727200, 727220]     # nanopore broken alignment region... very high loss in CNNRNN
    # window = [748220, 748240]     # nanopore broken alignment region... very high loss in CNNRNN
    # window = [1105084, 1105104]   # very messy alignment even with spoa... why?
    # window = [246567, 246587]     # previously failing test case for collapsed reads
    # window = [800000, 800020]

    # test sites for misalignment
    # window = [10029532, 10029532+83]
    # window = [10031827, 10031827+34]
    # window = [10039004, 10039004+25]
    # window = [10040234, 10040234+61]
    # window = [1004298, 1004298+109]
    window_start = 10044514
    window = [window_start, window_start + 54]
    # window = [10037167, 10037167+82]

    # test_window(bam_file_path=bam_file_path,
    #             reference_file_path=reference_file_path,
    #             chromosome_name=chromosome_name,
    #             window=window,
    #             output_dir=output_dir,
    #             print_results=True,
    #             save_data=True)

    generate_window_run_length_encoding(bam_file_path=bam_file_path,
                                        reference_file_path=reference_file_path,
                                        chromosome_name=chromosome_name,
                                        window=window,
                                        output_dir=output_dir,
                                        sort_sequences_by_length=True,
                                        reverse_sort=False,
                                        two_pass=True,
                                        plot_results=True,
                                        print_results=True,
                                        save_data=False)
def _plot_confusion_matrix(confusion, plot_path):
    """
    Save and then show the log10 of a confusion matrix as an image (x axis "Predicted", y axis "True")

    :param confusion: confusion matrix to plot
    :param plot_path: file the figure is saved to
    :return:
    """
    figure = pyplot.figure()

    axes = pyplot.axes()
    axes.set_xlabel("Predicted")
    axes.set_ylabel("True")

    pyplot.imshow(numpy.log10(confusion))

    # Save first so the file is written regardless of what happens during the interactive show()
    figure.savefig(plot_path)
    pyplot.show()
    pyplot.close()


def main():
    """
    Run length encode the hard-coded reference and reads, align them, and compare consensus run lengths against
    the reference for the first 10 windows of 1000 positions of the first reference contig

    Two consensus variants are evaluated per window (the default call and one with bayesian=False); their
    confusion matrices are summed over the windows, the accuracies printed and both matrices plotted.
    """
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_reads.fasta"
    # matrix_path = "/home/ryan/code/runnie_parser/output/runlength_matrix_from_assembly_contigs_2019_3_19_13_29_14_657613/probability_matrices_2019_3_19_13_29_19_362916.csv"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/code/runlength_analysis/data/sequence_subset_ecoli_guppy-runnie_60x_test.fastq"
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_sequence_2019_4_5_15_29_28_403950/probability_matrices_2019_4_5_15_35_57_920301.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_matrix_from_sequence_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output names: input file name without its last extension + "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    read_fasta_filename_prefix = ".".join(os.path.basename(read_fasta_path).split(".")[:-1])
    runlength_read_fasta_filename = read_fasta_filename_prefix + "_rle.fasta"
    runlength_read_fasta_path = os.path.join(output_dir, runlength_read_fasta_filename)

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(fasta_sequence_path=read_fasta_path)

    read_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                        runlength_ref_sequences=runlength_ref_sequences,
                                        runlength_read_path=runlength_read_fasta_path,
                                        runlength_read_sequences=runlength_read_sequences,
                                        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the encoded reference is evaluated
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length, chunk_size=1000)

    # Initialize empty confusion matrices
    total_confusion = get_runlength_confusion([], [], 10)
    total_modal_confusion = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)

    print("reading BAM")
    for pileup_start, pileup_end in windows[:10]:
        print("window", pileup_start, pileup_end)
        sys.stderr.write("\r%s" % pileup_start)

        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_lengths, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=runlength_read_sequences)

        sequence_encoding = list()
        length_encoding = list()
        reversal_encoding = list()

        # No reads here?
        if len(aligned_sequences) == 0:
            continue

        # print("REF\t", "".join(aligned_ref_sequence))
        for read_id in aligned_sequences.keys():
            # print("READ\t","".join(aligned_sequences[read_id]))
            sequence_encoding.append(list(map(get_encoding, aligned_sequences[read_id])))
            length_encoding.append(aligned_lengths[read_id])
            reversal_encoding.append(reversal_statuses[read_id])

        ref_sequence_encoding = [list(map(get_encoding, aligned_ref_sequence))]
        ref_lengths_encoding = [aligned_ref_lengths]

        # numpy.int / numpy.float / numpy.bool were only aliases of the builtins and have been removed from
        # NumPy (AttributeError on current releases); the builtins select the same dtypes
        ref_sequence_encoding = numpy.array(ref_sequence_encoding, dtype=int)
        ref_length_encoding = numpy.array(ref_lengths_encoding, dtype=int)
        sequence_encoding = numpy.array(sequence_encoding, dtype=int)
        length_encoding = numpy.array(length_encoding, dtype=float)
        reversal_encoding = numpy.array(reversal_encoding, dtype=bool)

        ref_sequence_encoding = numpy.atleast_2d(ref_sequence_encoding)
        ref_length_encoding = numpy.atleast_2d(ref_length_encoding)
        sequence_encoding = numpy.atleast_2d(sequence_encoding)
        length_encoding = numpy.atleast_2d(length_encoding)

        # plot_runlength_pileup(sequences=-sequence_encoding,
        #                       lengths=length_encoding,
        #                       ref_sequence=-ref_sequence_encoding,
        #                       ref_lengths=ref_length_encoding)

        consensus_sequence, consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding)

        modal_consensus_sequence, modal_consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding,
                                                         bayesian=False)

        print()
        print("PREDICTED\t", consensus_lengths[:10])
        print("TRUE\t\t", aligned_ref_lengths[:10])

        confusion = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                            predicted_lengths=consensus_lengths,
                                            max_length=10)

        total_confusion += confusion

        modal_confusion = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                  predicted_lengths=modal_consensus_lengths,
                                                  max_length=10)

        total_modal_confusion += modal_confusion

        # except Exception as e:
        #     print(e)
        #     continue

    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)
    print("Bayes:", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_modal_confusion)
    print("No Bayes", accuracy)

    _plot_confusion_matrix(total_confusion, os.path.join(output_dir, "confusion.png"))
    _plot_confusion_matrix(total_modal_confusion, os.path.join(output_dir, "modal_confusion.png"))
def main():
    """
    Run length encode the hard-coded reference and reads, align them, and print the first 10 elements of the read
    segments that get_read_segments returns for positions 100000-100100 of the first reference contig
    """
    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/guppy/subsampled/11-29/r94_ec_rad2.30x-30kb.fasta"
    # read_fasta_path = "/home/ryan/data/Nanopore/ecoli/runnie/v2/rad2_pass_runnie_0_1_10_11_12_13_v2.fa"
    # read_fasta_path = "/home/ryan/software/shasta/output/run_2019_3_23_14_29_ecoli_wg_guppy_NO_BAYES/Assembly.fasta"
    # read_fasta_path = "/home/ryan/software/shasta/output/run_2019_3_23_15_40_ecoli_wg_guppy_BAYES/Assembly.fasta"
    # read_fasta_path = "/home/ryan/data/Nanopore/ecoli/runnie/rad2_pass_runnie_0_v2.fa"

    # ---- TEST DATA ----
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_reads.fasta"
    # -------------------

    output_dir = os.path.join("output/", "runlength_matrix_from_sequence_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    # Output names: input file name without its last extension + "_rle.fasta"
    ref_name_parts = os.path.basename(ref_fasta_path).split(".")
    runlength_ref_fasta_path = os.path.join(output_dir, ".".join(ref_name_parts[:-1]) + "_rle.fasta")

    read_name_parts = os.path.basename(read_fasta_path).split(".")
    runlength_read_fasta_path = os.path.join(output_dir, ".".join(read_name_parts[:-1]) + "_rle.fasta")

    sys.stderr.write("RL encoding fasta...\n")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(fasta_sequence_path=read_fasta_path)

    sys.stderr.write("Aligning RLE fasta...\n")

    read_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                        runlength_ref_sequences=runlength_ref_sequences,
                                        runlength_read_path=runlength_read_fasta_path,
                                        runlength_read_sequences=runlength_read_sequences,
                                        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the encoded reference is inspected
    chromosome_name = fasta_handler.get_contig_names()[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    print(chromosome_length)

    window_start = 100000

    sequences, lengths = get_read_segments(fasta_handler=fasta_handler,
                                           bam_handler=bam_handler,
                                           chromosome_name=chromosome_name,
                                           pileup_start=window_start,
                                           pileup_end=window_start + 100,
                                           runlength_ref_sequences=runlength_ref_sequences,
                                           read_data=runlength_read_sequences)

    for key in sequences:
        print(key)
        print(sequences[key][:10])
        print(lengths[key][:10])
def main():
    """
    Predict reference run lengths from Runnie output and score the predictions.

    Reads are loaded from a Runnie output file, aligned (as RLE sequences) to
    the RLE reference, and piled up in 1000 bp windows. For each of windows
    10-19 two consensus calls are made (modal and Weibull), plotted, and
    accumulated into confusion matrices against the reference run lengths.
    Accuracies are printed and both confusion matrices are saved as PNGs in a
    timestamped directory under output/.
    :return:
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runnie_test_2019_4_8_14_33_30_333396_ref.fasta"
    # runlength_path = "/home/ryan/code/runlength_analysis/data/synthetic_runnie_test_2019_4_8_14_33_30_333396_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    # WG ecoli 60x
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_WG_train_60x_guppy_2019_4_23/probability_matrices_2019_4_23_15_9_14_837893.csv"
    raw_matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_WG_train_60x_guppy_2019_4_23/frequency_matrices_2019_4_23_15_9_14_833128.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_prediction_from_runnie_output_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)
    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    # Index every read by its id
    read_data = {read.id: read for read in reads}

    print("\nRLE encoding reference sequence...")
    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the encoded reference is evaluated
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length, chunk_size=1000)

    # Start both accumulators from the confusion matrix of an empty comparison
    total_confusion = get_runlength_confusion([], [], 10)
    total_confusion_weibull = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)
    # length_classifier_weibull = WeibullRunlengthClassifier(matrix_path)
    length_classifier_weibull = WeibullRunlengthClassifier(raw_matrix_path, normalize_matrix=True, pseudocount=0.05)

    print("reading BAM")
    for pileup_start, pileup_end in windows[10:20]:
        sys.stderr.write("\r%s" % pileup_start)

        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=read_data)

        # No reads here?
        if not aligned_sequences:
            continue

        sequence_encoding = list()
        scale_encoding = list()
        shape_encoding = list()
        modes_encoding = list()
        reversal_encoding = list()

        try:
            # print("REF\t", "".join(aligned_ref_sequence))
            for read_id in aligned_sequences:
                # print("READ\t%s\t%s" % (read_id,"".join(aligned_sequences[read_id])))
                sequence_encoding.append(list(map(get_encoding, aligned_sequences[read_id])))
                scale_encoding.append(aligned_scales[read_id])
                shape_encoding.append(aligned_shapes[read_id])
                modes_encoding.append(list(map(map_parameters_to_mode,
                                               zip(aligned_scales[read_id], aligned_shapes[read_id]))))
                reversal_encoding.append(reversal_statuses[read_id])

            ref_sequence_encoding = [list(map(get_encoding, aligned_ref_sequence))]
            ref_lengths_encoding = [aligned_ref_lengths]

            # The numpy.int / numpy.float / numpy.bool aliases were removed from
            # NumPy; the builtin types they aliased select the same dtypes.
            ref_sequence_encoding = numpy.atleast_2d(numpy.array(ref_sequence_encoding, dtype=int))
            ref_length_encoding = numpy.atleast_2d(numpy.array(ref_lengths_encoding, dtype=int))
            sequence_encoding = numpy.atleast_2d(numpy.array(sequence_encoding, dtype=int))
            scale_encoding = numpy.atleast_2d(numpy.array(scale_encoding, dtype=float))
            shape_encoding = numpy.atleast_2d(numpy.array(shape_encoding, dtype=float))
            modes_encoding = numpy.atleast_2d(numpy.array(modes_encoding, dtype=int))
            reversal_encoding = numpy.array(reversal_encoding, dtype=bool)

            consensus_sequence, consensus_lengths = \
                get_consensus_from_modal_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=modes_encoding,
                                                         reversal_encoding=reversal_encoding)

            weibull_consensus_sequence, weibull_consensus_lengths = \
                get_consensus_from_weibull_pileup_encoding(length_classifier=length_classifier_weibull,
                                                           sequence_encoding=sequence_encoding,
                                                           scale_encoding=scale_encoding,
                                                           shape_encoding=shape_encoding,
                                                           reversal_encoding=reversal_encoding)

            # Sequence encodings are negated before being handed to the plot
            plot_runlength_pileup(sequences=-sequence_encoding,
                                  scales=scale_encoding,
                                  shapes=shape_encoding,
                                  modes=modes_encoding,
                                  ref_sequence=-ref_sequence_encoding,
                                  ref_lengths=ref_length_encoding,
                                  predicted_sequence=-numpy.atleast_2d(
                                      numpy.array(weibull_consensus_sequence, dtype=int)),
                                  predicted_lengths=numpy.atleast_2d(
                                      numpy.array(weibull_consensus_lengths, dtype=int)))

            print()
            print("PREDICTED\t", weibull_consensus_lengths[:10])
            print("TRUE\t\t", aligned_ref_lengths[:10])

            confusion = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                predicted_lengths=consensus_lengths,
                                                max_length=10)

            confusion_weibull = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                        predicted_lengths=weibull_consensus_lengths,
                                                        max_length=10)

            total_confusion += confusion
            total_confusion_weibull += confusion_weibull

        except Exception as e:
            # Best effort: a failing window is reported and skipped so the
            # remaining windows still contribute. Name the window so a
            # systematic failure is not mistaken for an empty result.
            print("Skipping window %s-%s: %r" % (pileup_start, pileup_end, e))
            continue

    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)
    print("Modal: ", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_confusion_weibull)
    print("Full: ", accuracy)

    for plot_filename, confusion_matrix in (("confusion.png", total_confusion),
                                            ("confusion_weibull.png", total_confusion_weibull)):
        plot_path = os.path.join(output_dir, plot_filename)

        figure = pyplot.figure()
        axes = pyplot.axes()
        axes.set_xlabel("Predicted")
        axes.set_ylabel("True")

        pyplot.imshow(numpy.log10(confusion_matrix))

        # Save before show() so the file does not depend on the state of the
        # figure after an interactive window has been dismissed
        figure.savefig(plot_path)
        pyplot.show()
        pyplot.close()
def main():
    """
    Load reads from a Runnie output file, align them (as RLE sequences) to the
    run-length encoded reference, and print the first few entries of the
    sequence, scale and shape segments returned for a fixed 100 bp window of
    the first contig.
    :return:
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/test/rad2_pass_runnie_4_5_6_7.out"

    # Each run writes into its own timestamped directory under output/
    run_dir_name = "runlength_matrix_from_runnie_output_" + FileManager.get_datetime_string()
    output_dir = os.path.join("output/", run_dir_name)
    FileManager.ensure_directory_exists(output_dir)

    def rle_fasta_path(source_path):
        # <output_dir>/<basename without its last extension>_rle.fasta
        stem = os.path.basename(source_path).rpartition(".")[0]
        return os.path.join(output_dir, stem + "_rle.fasta")

    runlength_ref_fasta_path = rle_fasta_path(ref_fasta_path)
    runlength_read_fasta_path = rle_fasta_path(runlength_path)

    runnie_handler = RunlengthHandler(runlength_path)
    read_iterator = runnie_handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    # Index every read by its id
    read_data = {read.id: read for read in read_iterator}

    print("\nRLE encoding reference sequence...")
    encoded_reference = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                            runlength_ref_sequences=encoded_reference,
                            runlength_read_path=runlength_read_fasta_path,
                            runlength_read_sequences=read_data,
                            output_dir=output_dir)

    bam_handler = BamHandler(bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the encoded reference is inspected
    chromosome_name = fasta_handler.get_contig_names()[0]

    # The length is looked up but not used further in this script
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    window_start = 100000
    window_end = window_start + 100

    sequences, scales, shapes = get_read_segments(fasta_handler=fasta_handler,
                                                  bam_handler=bam_handler,
                                                  chromosome_name=chromosome_name,
                                                  pileup_start=window_start,
                                                  pileup_end=window_end,
                                                  runlength_ref_sequences=encoded_reference,
                                                  read_data=read_data)

    # Show the key and the first 10 entries of each returned segment
    for read_key in sequences:
        print(read_key)
        print(sequences[read_key][:10])
        print(scales[read_key][:10])
        print(shapes[read_key][:10])
def main():
    """
    Plot a run-length pileup for one small reference window.

    Reads are loaded from a Runnie output file, aligned (as RLE sequences) to
    the RLE reference, and the segments overlapping [pileup_start, pileup_end]
    on the first contig are encoded as arrays and passed to
    plot_runlength_pileup. The reference and each aligned read are also
    printed as text.
    :return:
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    pileup_start = 6000
    pileup_end = 6050

    output_parent_dir = "output/"
    output_dir = "runlength_pileup_test_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)
    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    # Index every read by its id
    read_data = {read.id: read for read in reads}

    print("\nRLE encoding reference sequence...")
    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the encoded reference is plotted
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]

    aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
        get_aligned_segments(fasta_handler=fasta_handler,
                             bam_handler=bam_handler,
                             chromosome_name=chromosome_name,
                             pileup_start=pileup_start,
                             pileup_end=pileup_end,
                             runlength_ref_sequences=runlength_ref_sequences,
                             read_data=read_data)

    sequence_encoding = list()
    scale_encoding = list()
    shape_encoding = list()
    modes_encoding = list()

    print(len(aligned_sequences))
    print("REF\t", "".join(aligned_ref_sequence))
    for read_id in aligned_sequences:
        print("READ\t%s\t%s" % (read_id, "".join(aligned_sequences[read_id])))
        sequence_encoding.append(list(map(get_encoding, aligned_sequences[read_id])))
        scale_encoding.append(aligned_scales[read_id])
        shape_encoding.append(aligned_shapes[read_id])
        modes_encoding.append(list(map(map_parameters_to_mode,
                                       zip(aligned_scales[read_id], aligned_shapes[read_id]))))

    # The numpy.float alias was removed from NumPy; the builtin float it
    # aliased selects the same dtype. Sequence encodings are negated before
    # being handed to the plot.
    sequence_encoding = -numpy.array(sequence_encoding, dtype=float)
    scale_encoding = numpy.array(scale_encoding, dtype=float)
    shape_encoding = numpy.array(shape_encoding, dtype=float)
    modes_encoding = numpy.array(modes_encoding, dtype=float)

    plot_runlength_pileup(sequences=sequence_encoding,
                          scales=scale_encoding,
                          shapes=shape_encoding,
                          modes=modes_encoding)
def main():
    """
    Plot, for one reference chromosome, a histogram of run lengths for every
    character reported by count_runlength_per_character (one subplot row per
    character, unit-width bins). Bin edges and frequencies are also printed.
    :return:
    """
    # output_root_dir = "output/"
    # instance_dir = "spoa_pileup_generation_" + get_current_timestamp()
    # output_dir = os.path.join(output_root_dir, instance_dir)

    # ---- Illumina (laptop) --------------------------------------------------
    # bam_file_path = "/Users/saureous/data/Platinum/chr1.sorted.bam"
    # reference_file_path = "/Users/saureous/data/Platinum/chr1.fa"
    # vcf_path = "/Users/saureous/data/Platinum/NA12878_S1.genome.vcf.gz"
    # bed_path = "/Users/saureous/data/Platinum/chr1_confident.bed"

    # ---- GIAB (dev machine) -------------------------------------------------
    # bam_file_path = "/home/ryan/data/GIAB/NA12878_GIAB_30x_GRCh37.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh37_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh37.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    # bam_file_path is not read by this script; it is kept so each dataset
    # block above and below stays a complete bam/reference pair.
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"

    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(reference_file_path)

    chromosome_name = "NC_003279.8"     # celegans chr1
    # chromosome_name = "NC_003283.11"     # celegans chr5

    # chromosome_name = "1"
    # chromosome_name = "chr" + chromosome_name

    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                    start=0,
                                                    stop=chromosome_length)

    character_counts = count_runlength_per_character(reference_sequence)

    # squeeze=False always yields a 2D array of Axes, so indexing also works
    # when there is only a single character (a single subplot row)
    figure, axes = pyplot.subplots(nrows=len(character_counts), sharex=True, sharey=True, squeeze=False)

    step = 1
    for axis, (key, counts) in zip(axes[:, 0], character_counts.items()):
        max_count = max(counts)

        bins = numpy.arange(0, max_count + step, step=step)

        # The `normed` keyword was removed from numpy.histogram; leaving it out
        # returns raw counts, which is what normed=False used to request.
        frequencies, bins = numpy.histogram(counts, bins=bins)

        print(bins)
        print(frequencies)
        print(bins.shape)

        # Bar positions: bin midpoints shifted left by half a bin width
        center = (bins[:-1] + bins[1:]) / 2 - step / 2

        axis.bar(center, frequencies, width=step, align="center")
        axis.set_ylabel(str(key))
        axis.set_xticks(numpy.arange(0, max_count + 1))

    pyplot.show()
def main():
    """
    For every contig of the reference (except the one excluded by name below),
    cut the region that lies 1 Mb inside both ends into 10 kb windows and run
    select_windows over those windows in a process pool, one pool per batch of
    arguments produced by generate_argument_pools.
    :return:
    """
    # output_root_dir = "output/"
    # instance_dir = "spoa_pileup_generation_" + get_current_timestamp()
    # output_dir = os.path.join(output_root_dir, instance_dir)

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.filtered2820.bam"
    # reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"

    # ---- Nanopore GUPPY - E. Coli - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli_reads_vs_ref.bam"
    reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"

    # -------------------------------------------------------------------------

    # Run parameters, identical for every contig
    edge_margin = 1000000
    max_threads = 30
    window_size = 10000
    min_size = 20
    max_size = 80

    name_reader = FastaHandler(reference_file_path)
    contig_names = name_reader.get_contig_names()
    name_reader.close()

    # chromosome_name = "NC_003279.8"     # celegans chr1
    # chromosome_name = "NC_003283.11"     # celegans chr5

    for chromosome_name in contig_names:
        if chromosome_name == "NC_001328.1":     # mitochondrial
            continue

        print("STARTING:", chromosome_name)

        # A fresh handler is opened and closed for each contig
        fasta_handler = FastaHandler(reference_file_path)
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                        start=0,
                                                        stop=chromosome_length)
        fasta_handler.close()

        region = [edge_margin, chromosome_length - edge_margin]

        # Manager-backed integer handed to every worker call
        manager = multiprocessing.Manager()
        counter = manager.Value('i', 0)

        region_windows = chunk_region(region=region, size=window_size)
        n_chunks = len(region_windows)

        print("subregions: ", n_chunks)

        run_label = "_".join([str(chromosome_name),
                              str(region[0]),
                              str(region[1]),
                              FileManager.get_datetime_string()])
        output_dir = "output/window_selection/" + run_label
        print(output_dir)

        pooled_args = generate_argument_pools(pool_size=max_threads,
                                              bam_file_path=bam_file_path,
                                              chromosome_name=chromosome_name,
                                              region_windows=region_windows,
                                              reference_sequence=reference_sequence,
                                              min_size=min_size,
                                              max_size=max_size,
                                              output_dir=output_dir,
                                              counter=counter,
                                              n_chunks=n_chunks)

        for batch in pooled_args:
            # Collect garbage, then start a new pool for this batch
            gc.collect()
            with Pool(processes=max_threads) as worker_pool:
                worker_pool.starmap(select_windows, batch)

        print()