def main():
    """
    Configure the inputs of a SPOA pileup generation run and report the contigs of the reference.

    Builds a timestamped output directory path under "output/" (the directory is not created here),
    opens the reference FASTA, prints every contig name with its length, and looks up the length of
    the selected contig. Alternative datasets are kept below as commented-out presets.
    :return:
    """
    output_dir = os.path.join("output/", "spoa_pileup_generation_" + get_current_timestamp())

    # ---- Illumina (laptop) --------------------------------------------------
    # bam_file_path = "/Users/saureous/data/Platinum/chr1.sorted.bam"
    # reference_file_path = "/Users/saureous/data/Platinum/chr1.fa"
    # vcf_path = "/Users/saureous/data/Platinum/NA12878_S1.genome.vcf.gz"
    # bed_path = "/Users/saureous/data/Platinum/chr1_confident.bed"

    # ---- GIAB (dev machine) -------------------------------------------------
    # bam_file_path = "/home/ryan/data/GIAB/NA12878_GIAB_30x_GRCh37.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh37_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh37.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    # reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    # windows_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003283.11_0_20924180_2018_9_28_10_56"

    # ---- Nanopore GUPPY - E. Coli - (dev machine) ---------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli.contigs.fasta.reads.sorted.bam"
    bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli_reads_vs_ref.bam"
    reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    # reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli.contigs.fasta"

    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    chromosome_name = "gi"  # E coli
    # chromosome_name = "NC_003279.8"   # celegans chr1
    # chromosome_name = "NC_003283.11"  # celegans chr5
    # chromosome_name = "1"
    # chromosome_name = "chr" + chromosome_name

    # One length per contig, in the same order as contig_names, so the two printed rows line up
    lengths = [fasta_handler.get_chr_sequence_length(name) for name in contig_names]

    print('\t'.join(contig_names))
    print('\t\t'.join(map(str, lengths)))

    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
def run_parameter_comparison():
    """
    Run encode_region_parallel over the full length of contig NC_003279.8 of the C. elegans reference,
    using a precomputed window selection, with run-length mode and two-pass mode enabled.

    Output goes to a timestamped "spoa_pileup_generation_anchored_*" directory under "output/".
    :return:
    """
    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"

    # -------------------------------------------------------------------------

    chromosomal_window_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003279.8_0_15072434_2018_10_12_10_58_56_199382"
    chromosome_name = "NC_003279.8"

    # The region to encode spans the entire contig
    reference = FastaHandler(reference_file_path)
    contig_end = reference.get_chr_sequence_length(chromosome_name)

    output_dir = os.path.join("output/", "spoa_pileup_generation_anchored_" + get_current_timestamp())

    encode_region_parallel(bam_file_path=bam_file_path,
                           reference_file_path=reference_file_path,
                           chromosome_name=chromosome_name,
                           region=[0, contig_end],
                           window_size=20,
                           output_dir=output_dir,
                           runlength=True,
                           max_threads=30,
                           windows_path=chromosomal_window_path,
                           sort_sequences_by_length=False,
                           reverse_sort=False,
                           two_pass=True)
def generate_runlength_frequency_matrix(runlength_ref_sequence_path, assembly_vs_ref_bam_path, runlength_ref_sequences, runlength_read_data):
    """
    Generator: for each reference contig, fill a matrix of true vs observed run lengths from an alignment
    of run-length-encoded sequences (a BAM whose lengths were stripped before alignment) and yield it.

    :param runlength_ref_sequence_path: path of the run-length-encoded reference FASTA
    :param assembly_vs_ref_bam_path: path of the BAM of RLE reads aligned to the RLE reference
    :param runlength_ref_sequences: mapping of contig name -> RLE data, indexed with LENGTHS to get the
    contig's run lengths
    :param runlength_read_data: per-read run-length data, forwarded unchanged to parse_reads
    :return: yields (chromosome_name, matrix) for every contig where parse_reads reports more than zero
    reads; contigs with none are reported on stderr and skipped. The matrix has shape
    [2, 4, MAX_RUNLENGTH+1, MAX_RUNLENGTH+1] (presumably strand x base x true x observed - confirm
    against parse_reads)
    """
    for chromosome_name in runlength_ref_sequences:
        n_lengths = MAX_RUNLENGTH + 1
        matrix = numpy.zeros([2, 4, n_lengths, n_lengths], dtype=numpy.float64)

        # Handlers are re-opened for every contig, as in the original implementation
        bam_handler = BamHandler(assembly_vs_ref_bam_path)
        fasta_handler = FastaHandler(runlength_ref_sequence_path)

        contig_end = fasta_handler.get_chr_sequence_length(chromosome_name)
        contig_reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=contig_end)

        # parse_reads accumulates into `matrix` in place and returns the number of reads it handled
        n_reads = parse_reads(chromosome_name=chromosome_name,
                              fasta_handler=fasta_handler,
                              reads=contig_reads,
                              complete_ref_runlengths=runlength_ref_sequences[chromosome_name][LENGTHS],
                              runlength_read_data=runlength_read_data,
                              matrix=matrix)

        if n_reads > 0:
            yield (chromosome_name, matrix)
            continue

        sys.stderr.write("No reads found for chromosome: %s\n" % chromosome_name)
def main():
    """
    Check how repeats in contig NC_003279.8 of the C. elegans reference fall relative to a set of
    precomputed anchored windows, then plot the split ratios per length and pileups of the split repeats.
    :return:
    """
    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"

    # -------------------------------------------------------------------------

    # kernel method
    # chromosomal_window_path = "output/window_selection/NC_003279.8_0_15072434_2018_10_1_20_1"
    # transition method
    chromosomal_window_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003279.8_0_15072434_2018_10_12_10_58_56_199382"

    chromosome_name = "NC_003279.8"

    # Pull the complete contig sequence from the reference
    fasta_handler = FastaHandler(reference_file_path)
    contig_end = fasta_handler.get_chr_sequence_length(chromosome_name)
    reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                    start=0,
                                                    stop=contig_end)

    windows = load_windows(chromosomal_window_path)
    long_repeat_positions = find_repeats(sequence=reference_sequence, repeat_threshold=1)

    repeat_summary = locate_repeats_in_anchored_windows(windows=windows,
                                                        repeat_positions=long_repeat_positions)
    split_counts_per_length, split_repeat_windows, unsplit_repeat_windows = repeat_summary

    plot_split_ratios_per_length(split_counts_per_length)

    plot_pileups_for_split_repeats(split_repeat_windows=split_repeat_windows,
                                   bam_file_path=bam_file_path,
                                   reference_file_path=reference_file_path,
                                   chromosome_name=chromosome_name)
def process_bam(bam_path, reference_path):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a plot of
    alignments.

    For each reference contig in the hard-coded list (currently only "gi"), every parsed alignment is
    printed, an alignment-length-weighted total identity is computed, and plot_contigs is called with
    "plots/" as its output directory. A contig with no aligned bases is reported and skipped, because its
    total identity is undefined.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :return:
    """
    print("\n" + bam_path + "\n")

    output_dir = "plots/"
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = ["gi"]

    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        start = 0
        stop = chromosome_length

        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=start, stop=stop)

        read_data = parse_reads(reads=reads, fasta_handler=fasta_handler, chromosome_name=chromosome_name)

        print("chromosome_name:\t", chromosome_name)
        print("chromosome_length:\t", chromosome_length)

        for data in read_data:
            # Full unpack kept so that a record of unexpected width fails loudly here
            read_id, reversal_status, ref_alignment_start, alignment_length, read_length, contig_length, \
                n_initial_clipped_bases, n_total_mismatches, n_total_deletes, n_total_inserts, identity = data

            print()
            print(read_id)
            print("reversed:\t", reversal_status)
            print("alignment_start:\t", ref_alignment_start)
            print("alignment_length:\t", alignment_length)
            print("n_initial_clipped_bases:", n_initial_clipped_bases)
            print("n_total_mismatches:\t", n_total_mismatches)
            print("n_total_deletes:\t", n_total_deletes)
            print("n_total_inserts:\t", n_total_inserts)
            print("identity:\t", identity)

        total_weighted_identity = sum(x[ALIGNMENT_LENGTH] * x[IDENTITY] for x in read_data)
        total_alignment_bases = sum(x[ALIGNMENT_LENGTH] for x in read_data)

        # Without any aligned bases the weighted mean would divide by zero
        if total_alignment_bases == 0:
            print("\nWARNING: no aligned bases for %s, skipping identity and plot" % chromosome_name)
            continue

        total_identity = total_weighted_identity/total_alignment_bases

        print("\nTOTAL IDENTITY:\t", total_identity)

        plot_contigs(output_dir=output_dir,
                     read_data=read_data,
                     chromosome_name=chromosome_name,
                     chromosome_length=chromosome_length,
                     total_identity=total_identity,
                     bam_path=bam_path,
                     y_min=-1,
                     y_max=4,
                     show=False)
def process_bam(bam_path, reference_path, output_dir=None):
    """
    Parse the alignments on every contig of the reference and export the inserts, deletes and mismatches
    found by parse_reads, one export_variants_to_csv call per contig.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: parent directory for the results, "variants/" when None. A datetime-stamped
    "variants_*" subdirectory is always created inside it
    :return:
    """
    print("\n" + bam_path)

    # Everything for this run goes into its own datetime-stamped subdirectory
    parent_dir = "variants/" if output_dir is None else output_dir
    output_dir = os.path.join(parent_dir, "variants_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = sort_chromosome_names(names=fasta_handler.get_contig_names(), prefix="chr")

    print("ref contig names:", chromosome_names)

    for chromosome_name in chromosome_names:
        print("Parsing alignments for ref contig:", chromosome_name)

        # Fetch every alignment over the full length of the contig
        contig_end = fasta_handler.get_chr_sequence_length(chromosome_name)
        contig_reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=contig_end)

        inserts, deletes, mismatches = parse_reads(reads=contig_reads,
                                                   fasta_handler=fasta_handler,
                                                   chromosome_name=chromosome_name)

        export_variants_to_csv(output_dir=output_dir,
                               chromosome_name=chromosome_name,
                               mismatches=mismatches,
                               inserts=inserts,
                               deletes=deletes,
                               merge=True)
def process_bam(bam_path, reference_path, max_threads, output_dir=None):
    """
    Find useful summary data from a bam that can be represented as a table of
    identities/matches/mismatches/indels.

    One get_chromosome_stats job is dispatched per reference contig through a process pool. Each job
    receives a manager-backed list (genome_data), which is passed to export_genome_summary_to_csv once
    all jobs have finished.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param max_threads: upper bound on worker processes; None selects cpu_count() - 2 (at least 1). The
    value is lowered to the number of jobs when there are fewer jobs, and never drops below 1
    :param output_dir: where to save stats, "stats/" when None
    :return:
    """
    if output_dir is None:
        output_dir = "stats/"

    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    process_manager = Manager()
    genome_data = process_manager.list()

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_path)
    chromosome_names = fasta_handler.get_contig_names()

    # One argument list per contig, covering the contig from 0 to its full length
    arguments = list()
    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        start = 0
        stop = chromosome_length

        arguments.append([genome_data, reference_path, chromosome_name, start, stop, output_dir, bam_path])

    if len(arguments) < max_threads:
        print("Fewer jobs than threads")
        max_threads = len(arguments)

    # Pool rejects a process count below 1, which previously happened for a reference with no contigs
    max_threads = max(1, max_threads)

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_chromosome_stats, arguments)

    print("genome_data", genome_data)

    export_genome_summary_to_csv(bam_path=bam_path, output_dir=output_dir, genome_data=genome_data)
def process_bam(bam_path, reference_path, bac_path, output_dir=None):
    """
    Collect the parsed alignment records of every reference contig, group them by the first field of
    each record, aggregate the groups with aggregate_bac_data and export the result.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param bac_path: fasta of the aligned sequences; only its contig names are read here (and printed)
    :param output_dir: where to save stats, "stats/" when None
    :return:
    """
    if output_dir is None:
        output_dir = "stats/"

    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_handler = FastaHandler(reference_path)
    bac_fasta_handler = FastaHandler(bac_path)

    chromosome_names = ref_fasta_handler.get_contig_names()
    bac_names = bac_fasta_handler.get_contig_names()

    print(chromosome_names)
    print(bac_names)

    # Keyed on record[0] (presumably the read/BAC name - confirm against parse_reads)
    data_per_bac = defaultdict(list)

    for chromosome_name in chromosome_names:
        contig_end = ref_fasta_handler.get_chr_sequence_length(chromosome_name)

        # Fresh handlers are opened for every contig, as in the original implementation
        ref_fasta_handler = FastaHandler(reference_file_path=reference_path)
        bam_handler = BamHandler(bam_file_path=bam_path)

        contig_reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=contig_end)

        read_data = parse_reads(reads=contig_reads,
                                fasta_handler=ref_fasta_handler,
                                chromosome_name=chromosome_name)

        # Prefix each record with the contig it was aligned to
        for record in read_data:
            data_per_bac[record[0]].append([chromosome_name] + record)

    # filtered_data = filter_supplementaries_by_largest(data_per_bac)
    filtered_data = aggregate_bac_data(data_per_bac)

    export_bac_data_to_csv(read_data=filtered_data,
                           output_dir=output_dir,
                           bam_path=bam_path)
def main(sequences_path, cutoff, output_path="assemble_long_segments.sh"):
    """
    Write a shell script containing one AssembleSegment.py invocation for every sequence in a FASTA that
    is longer than a cutoff. The name and length of each selected sequence are also printed.

    :param sequences_path: path of the FASTA whose sequences are inspected
    :param cutoff: sequences with a length strictly greater than this are written to the script
    :param output_path: path of the script to write (overwritten if it exists)
    :return:
    """
    fasta = FastaHandler(sequences_path)
    names = fasta.get_contig_names()

    with open(output_path, "w") as script:
        for name in names:
            length = fasta.get_chr_sequence_length(name)

            if length > cutoff:
                print(name, length)
                script.write("../build/shasta-install/bin/AssembleSegment.py " + name + "\n")
def get_contig_lengths(assembly_path, assembly_contigs):
    """
    Record the [name, length] of every contig in an assembly FASTA, longest first.

    :param assembly_path: path of the assembly FASTA; also used as the key under which results are stored
    :param assembly_contigs: mapping that receives the result, modified in place
    :return:
    """
    handler = FastaHandler(assembly_path)

    # Start from name order; the stable length sort below then keeps ties in name order
    ordered_names = sorted(handler.get_contig_names())
    contigs = [[name, handler.get_chr_sequence_length(name)] for name in ordered_names]
    contigs.sort(key=lambda entry: entry[LENGTH], reverse=True)

    print("Assembly parsed: %s" % assembly_path)

    assembly_contigs[assembly_path] = contigs
def generate_runlength_frequency_matrix(runlength_ref_sequence_path, assembly_vs_ref_bam_path, runlength_ref_sequences, runlength_assembly_sequences):
    """
    Generator: for each reference contig, fill a matrix of true vs observed run lengths from an alignment
    of run-length-encoded sequences (a BAM whose lengths were stripped before alignment) and yield it.

    :param runlength_ref_sequence_path: path of the run-length-encoded reference FASTA
    :param assembly_vs_ref_bam_path: path of the BAM of the RLE assembly aligned to the RLE reference
    :param runlength_ref_sequences: mapping of contig name -> RLE data, indexed with LENGTHS to get the
    contig's run lengths
    :param runlength_assembly_sequences: RLE data of the aligned sequences, forwarded unchanged to
    parse_reads
    :return: yields one matrix of shape [2, 4, MAX_RUNLENGTH+1, MAX_RUNLENGTH+1] per contig, whether or
    not any reads were found for it
    """
    for chromosome_name in runlength_ref_sequences:
        n_lengths = MAX_RUNLENGTH + 1
        matrix = numpy.zeros([2, 4, n_lengths, n_lengths], dtype=numpy.float64)

        # print(assembly_vs_ref_bam_path)

        # Handlers are re-opened for every contig, as in the original implementation
        bam_handler = BamHandler(assembly_vs_ref_bam_path)
        fasta_handler = FastaHandler(runlength_ref_sequence_path)

        contig_end = fasta_handler.get_chr_sequence_length(chromosome_name)
        contig_reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=contig_end)

        # parse_reads accumulates into `matrix` in place; its return value is not used
        parse_reads(chromosome_name=chromosome_name,
                    fasta_handler=fasta_handler,
                    reads=contig_reads,
                    complete_ref_runlengths=runlength_ref_sequences[chromosome_name][LENGTHS],
                    runlength_assembly_sequences=runlength_assembly_sequences,
                    matrix=matrix)

        # plot_base_matrices(matrix=matrix, cutoff=40)

        yield matrix
def parse_bam(bam_path, reference_path):
    """
    Iterate a BAM file contig by contig and accumulate summary stats with count_cigar_operations.

    :param bam_path: path of the BAM to summarise
    :param reference_path: path of the reference FASTA, used for contig names and lengths
    :return: tuple of (chromosomal_cigar_counts, n_alignments, n_primary, n_supplementary, n_secondary,
    map_qualities) as last returned by count_cigar_operations, or the initial empty values if the
    reference has no contigs
    """
    fasta_handler = FastaHandler(reference_path)
    chromosome_names = fasta_handler.get_contig_names()

    # Running totals: each one is handed to count_cigar_operations and replaced by what it returns
    chromosomal_cigar_counts = defaultdict(lambda: defaultdict(int))
    n_alignments = n_primary = n_supplementary = n_secondary = 0
    map_qualities = IterativeHistogram(start=0, stop=60, n_bins=6)

    for chromosome_name in chromosome_names:
        bam_handler = BamHandler(bam_path)

        contig_end = fasta_handler.get_chr_sequence_length(chromosome_name)
        contig_reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=contig_end)

        (chromosomal_cigar_counts,
         n_alignments,
         n_primary,
         n_supplementary,
         n_secondary,
         map_qualities) = count_cigar_operations(reads=contig_reads,
                                                 chromosome_name=chromosome_name,
                                                 chromosomal_cigar_counts=chromosomal_cigar_counts,
                                                 n_alignments=n_alignments,
                                                 n_primary=n_primary,
                                                 n_supplementary=n_supplementary,
                                                 n_secondary=n_secondary,
                                                 map_qualities=map_qualities)

    return chromosomal_cigar_counts, n_alignments, n_primary, n_supplementary, n_secondary, map_qualities
def main():
    """
    Count run lengths per character for the contigs of a hard-coded reference FASTA by calling
    count_runlength_per_character on each contig's full sequence.
    :return:
    """
    output_dir = "output/ref_run_lengths/"
    filename_prefix = "ref_runlength_distribution"

    reference_file_path = "/home/ryan/data/Nanopore/Human/paolo/LC2019/kishwar/shasta_assembly_GM24385_chr20.fasta"

    # ---- GIAB E. Coli - (dev machine) -------------------------
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # reference_file_path = "/home/ryan/data/Nanopore/ecoli/refEcoli.fasta"

    # -------------------------------------------------------------------------

    threshold = 5

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    all_counts = defaultdict(Counter)

    sys.stderr.write("reading fasta file...\n")
    sys.stderr.flush()

    c = 0
    for chromosome_name in contig_names:
        # NOTE(review): for a multi-contig reference this skips "chr1" and processes every other contig.
        # The original spelled it as a double negative (`not name != "chr1"`) - confirm that skipping
        # chr1, rather than selecting only chr1, is what was intended.
        if len(contig_names) > 1 and chromosome_name == "chr1":
            continue

        c += 1

        # sys.stderr.write("Parsing chromosome %s\n" % chromosome_name)
        # sys.stderr.flush()

        contig_end = fasta_handler.get_chr_sequence_length(chromosome_name)
        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                        start=0,
                                                        stop=contig_end)

        character_counts = count_runlength_per_character(sequence=reference_sequence,
                                                         threshold=threshold,
                                                         chromosome_name=chromosome_name)
def runlength_encode_fasta(fasta_sequence_path):
    """
    Run-length encode every contig of a FASTA file, printing each contig name with the sizes of its
    encoded bases and lengths.

    :param fasta_sequence_path: path of the FASTA to encode
    :return: dict of contig name -> (bases, lengths) as produced by runlength_encode
    """
    fasta_handler = FastaHandler(fasta_sequence_path)

    runlength_sequences = {}

    for contig_name in fasta_handler.get_contig_names():
        # Encode the whole contig, from position 0 to its full length
        contig_end = fasta_handler.get_chr_sequence_length(contig_name)
        sequence = fasta_handler.get_sequence(chromosome_name=contig_name, start=0, stop=contig_end)

        bases, lengths = runlength_encode(sequence)
        runlength_sequences[contig_name] = (bases, lengths)

        print(contig_name, len(bases), len(lengths))

    return runlength_sequences
def get_chromosome_stats(genome_data, reference_path, chromosome_name, start, stop, output_dir, bam_path):
    """
    Parse the alignments of one region of a reference contig, append the contig-level summary to
    genome_data and export the per-read and contig-level data.

    :param genome_data: list-like collection that receives this contig's chromosome_data (appended)
    :param reference_path: path of the reference FASTA
    :param chromosome_name: name of the reference contig to process
    :param start: start coordinate of the region whose reads are fetched
    :param stop: stop coordinate of the region whose reads are fetched
    :param output_dir: directory passed on to export_chromosome_summary_to_csv
    :param bam_path: path of the BAM to read alignments from
    :return:
    """
    # Handlers are opened inside the function rather than passed in
    reference = FastaHandler(reference_file_path=reference_path)
    alignments = BamHandler(bam_file_path=bam_path)

    # Always the full contig length, independent of start/stop
    contig_length = reference.get_chr_sequence_length(chromosome_name)

    region_reads = alignments.get_reads(chromosome_name=chromosome_name, start=start, stop=stop)

    read_data, chromosome_data = parse_reads(reads=region_reads,
                                             chromosome_name=chromosome_name,
                                             chromosome_length=contig_length,
                                             fasta_handler=reference)

    genome_data.append(chromosome_data)

    export_chromosome_summary_to_csv(read_data=read_data,
                                     chromosome_data=chromosome_data,
                                     output_dir=output_dir,
                                     bam_path=bam_path,
                                     chromosome_name=chromosome_name)
def main():
    """
    Show the first 1000 positions of an RLE pileup as an image: the aligned segments (reference
    included) of the first contig are encoded with get_encoding, negated, and drawn with imshow.
    :return:
    """
    # bam_file_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_sequence_2019_3_27_14_59_24_409353/sequence_subset_test_60x_10kb_rle_VS_refEcoli_rle.sorted.bam"
    # ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"

    bam_file_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_output_2019_4_8_17_33_14_191911/runnie_subset_test_60x_10kb_rle_VS_refEcoli_rle.sorted.bam"
    ref_fasta_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_output_2019_4_8_17_33_14_191911/refEcoli_rle.fasta"

    # -------------------------------------------------------------------------

    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(ref_fasta_path)

    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    pileup_start = 0
    pileup_end = pileup_start + 1000    # add random variation here ?

    aligned_segments = get_aligned_segments(fasta_handler=fasta_handler,
                                            bam_handler=bam_handler,
                                            chromosome_name=chromosome_name,
                                            pileup_start=pileup_start,
                                            pileup_end=pileup_end,
                                            include_ref=True)

    encoding = [list(map(get_encoding, alignment)) for alignment in aligned_segments.values()]

    # The numpy.float alias no longer exists in current NumPy; builtin float is the same dtype
    encoding = -numpy.array(encoding, dtype=float)

    pyplot.imshow(encoding)
    pyplot.show()
    pyplot.close()
def _save_confusion_plot(confusion, plot_path):
    """
    Show a log10-scaled image of a run-length confusion matrix (x: predicted, y: true) and save it.

    :param confusion: confusion matrix to draw
    :param plot_path: file path the figure is saved to
    :return:
    """
    figure = pyplot.figure()
    axes = pyplot.axes()
    axes.set_xlabel("Predicted")
    axes.set_ylabel("True")

    pyplot.imshow(numpy.log10(confusion))
    pyplot.show()
    figure.savefig(plot_path)
    pyplot.close()


def main():
    """
    Predict run lengths from runnie output over a handful of pileup windows and compare two classifiers.

    The runnie reads are aligned to the run-length-encoded reference, then windows 10 to 19 (1000 bases
    each) of the first contig are piled up. For each window a consensus is called from the modal run
    lengths (RunlengthClassifier) and from the Weibull scale/shape parameters
    (WeibullRunlengthClassifier), the pileup is plotted, and both consensus results are scored against
    the reference lengths. Accumulated confusion matrices are printed as accuracies and saved as plots.
    :return:
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runnie_test_2019_4_8_14_33_30_333396_ref.fasta"
    # runlength_path = "/home/ryan/code/runlength_analysis/data/synthetic_runnie_test_2019_4_8_14_33_30_333396_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    # WG ecoli 60x
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_WG_train_60x_guppy_2019_4_23/probability_matrices_2019_4_23_15_9_14_837893.csv"
    raw_matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_WG_train_60x_guppy_2019_4_23/frequency_matrices_2019_4_23_15_9_14_833128.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_prediction_from_runnie_output_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # RLE fasta names are the input names with their last extension replaced by "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)
    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)
    read_data = {read.id: read for read in reads}

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length, chunk_size=1000)

    # Empty confusion matrices to accumulate into
    total_confusion = get_runlength_confusion([], [], 10)
    total_confusion_weibull = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)
    # length_classifier_weibull = WeibullRunlengthClassifier(matrix_path)
    length_classifier_weibull = WeibullRunlengthClassifier(raw_matrix_path, normalize_matrix=True, pseudocount=0.05)

    print("reading BAM")
    for pileup_start, pileup_end in windows[10:20]:
        sys.stderr.write("\r%s" % pileup_start)

        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=read_data)

        sequence_encoding = list()
        scale_encoding = list()
        shape_encoding = list()
        modes_encoding = list()
        reversal_encoding = list()

        # No reads here?
        if not aligned_sequences:
            continue

        # Best effort: a window that fails anywhere below is reported and skipped
        try:
            # print("REF\t", "".join(aligned_ref_sequence))
            for read_id in aligned_sequences:
                # print("READ\t%s\t%s" % (read_id,"".join(aligned_sequences[read_id])))
                sequence_encoding.append(list(map(get_encoding, aligned_sequences[read_id])))
                scale_encoding.append(aligned_scales[read_id])
                shape_encoding.append(aligned_shapes[read_id])
                modes_encoding.append(list(map(map_parameters_to_mode,
                                               zip(aligned_scales[read_id], aligned_shapes[read_id]))))
                reversal_encoding.append(reversal_statuses[read_id])

            ref_sequence_encoding = [list(map(get_encoding, aligned_ref_sequence))]
            ref_lengths_encoding = [aligned_ref_lengths]

            # Builtin int/float/bool replace the numpy.int/numpy.float/numpy.bool aliases, which no
            # longer exist in current NumPy and made every window fail inside this try block
            ref_sequence_encoding = numpy.atleast_2d(numpy.array(ref_sequence_encoding, dtype=int))
            ref_length_encoding = numpy.atleast_2d(numpy.array(ref_lengths_encoding, dtype=int))
            sequence_encoding = numpy.atleast_2d(numpy.array(sequence_encoding, dtype=int))
            scale_encoding = numpy.atleast_2d(numpy.array(scale_encoding, dtype=float))
            shape_encoding = numpy.atleast_2d(numpy.array(shape_encoding, dtype=float))
            modes_encoding = numpy.atleast_2d(numpy.array(modes_encoding, dtype=int))
            reversal_encoding = numpy.array(reversal_encoding, dtype=bool)

            consensus_sequence, consensus_lengths = \
                get_consensus_from_modal_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=modes_encoding,
                                                         reversal_encoding=reversal_encoding)

            weibull_consensus_sequence, weibull_consensus_lengths = \
                get_consensus_from_weibull_pileup_encoding(length_classifier=length_classifier_weibull,
                                                           sequence_encoding=sequence_encoding,
                                                           scale_encoding=scale_encoding,
                                                           shape_encoding=shape_encoding,
                                                           reversal_encoding=reversal_encoding)

            plot_runlength_pileup(
                sequences=-sequence_encoding,
                scales=scale_encoding,
                shapes=shape_encoding,
                modes=modes_encoding,
                ref_sequence=-ref_sequence_encoding,
                ref_lengths=ref_length_encoding,
                predicted_sequence=-numpy.atleast_2d(numpy.array(weibull_consensus_sequence, dtype=int)),
                predicted_lengths=numpy.atleast_2d(numpy.array(weibull_consensus_lengths, dtype=int)))

            print()
            print("PREDICTED\t", weibull_consensus_lengths[:10])
            print("TRUE\t\t", aligned_ref_lengths[:10])

            confusion = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                predicted_lengths=consensus_lengths,
                                                max_length=10)

            confusion_weibull = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                        predicted_lengths=weibull_consensus_lengths,
                                                        max_length=10)

            total_confusion += confusion
            total_confusion_weibull += confusion_weibull

        except Exception as e:
            print(e)
            continue

    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)
    print("Modal: ", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_confusion_weibull)
    print("Full: ", accuracy)

    _save_confusion_plot(total_confusion, os.path.join(output_dir, "confusion.png"))
    _save_confusion_plot(total_confusion_weibull, os.path.join(output_dir, "confusion_weibull.png"))
def main():
    """
    Align runnie output to the run-length-encoded reference and plot the pileup of one small window
    (positions 6000 to 6050 of the first contig), printing the aligned reference and read sequences.
    :return:
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    pileup_start = 6000
    pileup_end = 6050

    output_parent_dir = "output/"
    output_dir = "runlength_pileup_test_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # RLE fasta names are the input names with their last extension replaced by "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)
    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)
    read_data = {read.id: read for read in reads}

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
        get_aligned_segments(fasta_handler=fasta_handler,
                             bam_handler=bam_handler,
                             chromosome_name=chromosome_name,
                             pileup_start=pileup_start,
                             pileup_end=pileup_end,
                             runlength_ref_sequences=runlength_ref_sequences,
                             read_data=read_data)

    sequence_encoding = list()
    scale_encoding = list()
    shape_encoding = list()
    modes_encoding = list()

    print(len(aligned_sequences.keys()))
    print("REF\t", "".join(aligned_ref_sequence))
    for read_id in aligned_sequences:
        print("READ\t%s\t%s" % (read_id, "".join(aligned_sequences[read_id])))
        sequence_encoding.append(list(map(get_encoding, aligned_sequences[read_id])))
        scale_encoding.append(aligned_scales[read_id])
        shape_encoding.append(aligned_shapes[read_id])
        modes_encoding.append(list(map(map_parameters_to_mode,
                                       zip(aligned_scales[read_id], aligned_shapes[read_id]))))

    # The numpy.float alias no longer exists in current NumPy; builtin float is the same dtype
    sequence_encoding = -numpy.array(sequence_encoding, dtype=float)
    scale_encoding = numpy.array(scale_encoding, dtype=float)
    shape_encoding = numpy.array(shape_encoding, dtype=float)
    modes_encoding = numpy.array(modes_encoding, dtype=float)

    plot_runlength_pileup(sequences=sequence_encoding,
                          scales=scale_encoding,
                          shapes=shape_encoding,
                          modes=modes_encoding)
def main(reference_file_path):
    """Plot and print run-length distributions for every contig of a reference fasta.

    For each contig, the run lengths returned by
    ``count_runlength_per_character`` are drawn as one histogram per character
    and saved as ``ref_runlength_distribution_<contig>.png``.  Counts are also
    pooled across contigs into an A/T and a G/C histogram
    (``ref_runlength_distribution_genomic.png``) and passed to
    ``print_all_counts_as_shasta_matrix`` and ``print_all_counts``.

    Args:
        reference_file_path: path to the reference fasta.  The part of its
            base name before the first "." names the output subdirectory
            under ``output/ref_run_lengths/``.
    """
    # basename() already strips directories, so no extra split on "/" is needed.
    input_prefix_name = os.path.basename(reference_file_path).split(".")[0]
    output_dir = os.path.join("output/ref_run_lengths/", input_prefix_name)
    filename_prefix = "ref_runlength_distribution"

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    print(contig_names)
    print(sorted([(x, fasta_handler.get_chr_sequence_length(x)) for x in contig_names], key=lambda x: x[1]))

    all_counts = defaultdict(Counter)
    raw_counts_AT = list()
    raw_counts_GC = list()

    sys.stderr.write("reading fasta file...\n")
    sys.stderr.flush()

    max_count = 100
    step = 1

    for chromosome_name in contig_names:
        # Optional filter for references with alt/unplaced contigs:
        # if len(contig_names) > 1:
        #     if not chromosome_name.startswith("chr") or "alt" in chromosome_name or "v" in chromosome_name:
        #         print("WARNING: SKIPPING CHROMOSOME %s" % chromosome_name)
        #         continue

        sys.stderr.write("Parsing chromosome %s\n" % chromosome_name)
        sys.stderr.flush()

        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                        start=0,
                                                        stop=chromosome_length)

        character_counts = count_runlength_per_character(reference_sequence)

        # squeeze=False always yields a 2D array of Axes, so indexing works
        # even when the contig contains a single distinct character.
        figure, axes = pyplot.subplots(nrows=len(character_counts), sharex=True, squeeze=False)
        axes = axes[:, 0]
        figure.set_size_inches(6, 12)

        for k, key in enumerate(character_counts.keys()):
            counts = character_counts[key]
            all_counts[key] += Counter(counts)

            if key in {"C", "G"}:
                raw_counts_GC += counts

            if key in {"A", "T"}:
                raw_counts_AT += counts

            plot_counts_as_histogram(axes=axes[k], counts=counts, max_count=max_count, step=step)

            axes[k].set_ylabel(str(key))
            axes[k].set_ylim([-0.5, 10])

        axes[0].set_title(chromosome_name)

        filename = filename_prefix + "_" + chromosome_name + ".png"
        file_path = os.path.join(output_dir, filename)
        figure.savefig(file_path)
        # pyplot.show()
        pyplot.close(figure)

    # Genome-wide summary: pooled A/T runs on top, pooled G/C runs below.
    figure, axes = pyplot.subplots(nrows=2)

    filename = filename_prefix + "_genomic.png"
    file_path = os.path.join(output_dir, filename)

    plot_counts_as_histogram(axes=axes[0], counts=raw_counts_AT, max_count=max_count, step=step)
    plot_counts_as_histogram(axes=axes[1], counts=raw_counts_GC, max_count=max_count, step=step)

    axes[0].set_ylabel("AT Log10 Frequency")
    axes[1].set_ylabel("GC Log10 Frequency")

    figure.savefig(file_path)
    # pyplot.show()
    pyplot.close(figure)

    print_all_counts_as_shasta_matrix(all_counts, max_count=50)
    print_all_counts(all_counts, output_dir)
def get_chromosome_data(bam_path, reference_path, chromosome_name, output_dir, centromere_table_path, gap_table_path, segdup_table_path, genome_data):
    """Summarise and plot all alignments on one chromosome.

    Fetches every read on ``chromosome_name`` from the BAM, parses them with
    ``parse_reads``, appends the per-chromosome result to ``genome_data``
    (mutated in place), exports a CSV summary and draws the contig plot.

    Args:
        bam_path: path of the BAM file to read alignments from.
        reference_path: path of the reference fasta.
        chromosome_name: contig to process.
        output_dir: directory handed to the CSV export and the plot.
        centromere_table_path: optional table path; ``None`` skips it.
        gap_table_path: optional table path; ``None`` skips it.
        segdup_table_path: optional table path; ``None`` skips it.  It is read
            with ``read_gap_table`` and a ``size_cutoff`` of 10000.
        genome_data: list that receives this chromosome's data.
    """
    reference = FastaHandler(reference_path)
    alignments = BamHandler(bam_file_path=bam_path)

    chromosome_length = reference.get_chr_sequence_length(chromosome_name)

    # The whole chromosome is queried: [0, chromosome_length].
    reads = alignments.get_reads(chromosome_name=chromosome_name, start=0, stop=chromosome_length)

    read_data, chromosome_data = parse_reads(reads=reads,
                                             fasta_handler=reference,
                                             chromosome_name=chromosome_name,
                                             chromosome_length=chromosome_length)

    genome_data.append(chromosome_data)

    # Length-weighted mean identity; the denominator is floored at 1e-9 so an
    # empty chromosome yields 0 instead of dividing by zero.
    weighted_identity_sum = sum(entry[ALIGNMENT_LENGTH] * entry[SEQUENCE_IDENTITY] for entry in read_data)
    aligned_base_count = sum(entry[ALIGNMENT_LENGTH] for entry in read_data)
    total_identity = round(weighted_identity_sum / max(1e-9, aligned_base_count), 6)

    export_chromosome_summary_to_csv(read_data=read_data,
                                     chromosome_data=chromosome_data,
                                     output_dir=output_dir,
                                     bam_path=bam_path,
                                     chromosome_name=chromosome_name)

    # Each annotation table is optional and is loaded only when a path is given.
    centromere_coordinates = None if centromere_table_path is None else read_centromere_table(
        centromere_table_path=centromere_table_path,
        target_chromosome_name=chromosome_name)

    gap_coordinates = None if gap_table_path is None else read_gap_table(
        table_path=gap_table_path,
        target_chromosome_name=chromosome_name)

    segdup_coordinates = None if segdup_table_path is None else read_gap_table(
        table_path=segdup_table_path,
        target_chromosome_name=chromosome_name,
        size_cutoff=10000)

    figure, _ = plot_contigs(output_dir=output_dir,
                             read_data=read_data,
                             chromosome_name=chromosome_name,
                             chromosome_length=chromosome_length,
                             total_identity=total_identity,
                             bam_path=bam_path,
                             centromere_coordinates=centromere_coordinates,
                             gap_coordinates=gap_coordinates,
                             segdup_coordinates=segdup_coordinates,
                             show=False)

    pyplot.close(figure)
def main():
    """Generate a run-length encoding for one hard-coded window of C. elegans chr1.

    Output goes to a timestamped directory under ``output/``.  The dataset
    paths, contig name and window are configured by (un)commenting the
    alternatives below.
    """
    timestamped_dir = "spoa_pileup_generation_" + get_current_timestamp()
    output_dir = os.path.join("output/", timestamped_dir)

    # ---- Illumina (laptop) --------------------------------------------------
    # bam_file_path = "/Users/saureous/data/Platinum/chr1.sorted.bam"
    # reference_file_path = "/Users/saureous/data/Platinum/chr1.fa"
    # vcf_path = "/Users/saureous/data/Platinum/NA12878_S1.genome.vcf.gz"
    # bed_path = "/Users/saureous/data/Platinum/chr1_confident.bed"

    # ---- GIAB (dev machine) -------------------------------------------------
    # bam_file_path = "/home/ryan/data/GIAB/NA12878_GIAB_30x_GRCh37.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh37_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh37.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    windows_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003283.11_0_20924180_2018_9_28_10_56"

    # -------------------------------------------------------------------------

    chromosome_name = "NC_003279.8"     # celegans chr1
    # chromosome_name = "NC_003283.11"  # celegans chr5
    # chromosome_name = "1"
    # chromosome_name = "chr" + chromosome_name

    # These lookups are not used further down; they are kept so the reference
    # is still opened and queried exactly as before.
    reference = FastaHandler(reference_file_path)
    contig_names = reference.get_contig_names()
    chromosome_length = reference.get_chr_sequence_length(chromosome_name)

    # ---- TEST window --------------------------------------------------------
    # window = [762580, 762600]       # nanopore broken alignment region... POAPY ONLY
    # window = [748460, 748480]       # nanopore broken alignment region... POAPY ONLY
    # window = [767240, 767260]       # nanopore broken alignment region... SPOA NOOOOooOOoooo
    # window = [727360, 767280]       # nanopore broken alignment region... very high loss in CNNRNN
    # window = [727200, 727220]       # nanopore broken alignment region... very high loss in CNNRNN
    # window = [748220, 748240]       # nanopore broken alignment region... very high loss in CNNRNN
    # window = [1105084, 1105104]     # very messy alignment even with spoa... why?
    # window = [246567, 246587]       # previously failing test case for collapsed reads
    # window = [800000, 800020]

    # test sites for misalignment
    # window = [10029532, 10029532+83]
    # window = [10031827, 10031827+34]
    # window = [10039004, 10039004+25]
    # window = [10040234, 10040234+61]
    # window = [1004298, 1004298+109]
    # window = [10037167, 10037167+82]
    window_start = 10044514
    window = [window_start, window_start + 54]

    # test_window(bam_file_path=bam_file_path,
    #             reference_file_path=reference_file_path,
    #             chromosome_name=chromosome_name,
    #             window=window,
    #             output_dir=output_dir,
    #             print_results=True,
    #             save_data=True)

    encoding_settings = {
        "bam_file_path": bam_file_path,
        "reference_file_path": reference_file_path,
        "chromosome_name": chromosome_name,
        "window": window,
        "output_dir": output_dir,
        "sort_sequences_by_length": True,
        "reverse_sort": False,
        "two_pass": True,
        "plot_results": True,
        "print_results": True,
        "save_data": False,
    }
    generate_window_run_length_encoding(**encoding_settings)
def main():
    """Compare Bayesian and modal run-length consensus against the reference.

    Reference and reads are run-length encoded and passed to ``align_as_RLE``.
    The first contig is split into 1000-long windows and only the first ten
    are evaluated.  For every window with reads, a consensus is computed twice
    with ``get_consensus_from_runlength_pileup_encoding`` (default arguments,
    and ``bayesian=False``); each is scored against the aligned reference run
    lengths in a confusion matrix with ``max_length=10``.  Accuracies are
    printed and both matrices are shown and saved as PNGs in the output
    directory.

    All inputs are hard-coded paths; edit them below to run on other data.
    """
    # Alternative (synthetic) inputs:
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_reads.fasta"
    # matrix_path = "/home/ryan/code/runnie_parser/output/runlength_matrix_from_assembly_contigs_2019_3_19_13_29_14_657613/probability_matrices_2019_3_19_13_29_19_362916.csv"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/code/runlength_analysis/data/sequence_subset_ecoli_guppy-runnie_60x_test.fastq"
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_sequence_2019_4_5_15_29_28_403950/probability_matrices_2019_4_5_15_35_57_920301.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_matrix_from_sequence_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output file names reuse the input base names with the last extension
    # replaced by "_rle.fasta".
    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    read_fasta_filename_prefix = ".".join(os.path.basename(read_fasta_path).split(".")[:-1])
    runlength_read_fasta_filename = read_fasta_filename_prefix + "_rle.fasta"
    runlength_read_fasta_path = os.path.join(output_dir, runlength_read_fasta_filename)

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(fasta_sequence_path=read_fasta_path)

    read_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                        runlength_ref_sequences=runlength_ref_sequences,
                                        runlength_read_path=runlength_read_fasta_path,
                                        runlength_read_sequences=runlength_read_sequences,
                                        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the RLE reference is evaluated.
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length, chunk_size=1000)

    # Initialize empty confusion matrices
    total_confusion = get_runlength_confusion([], [], 10)
    total_modal_confusion = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)

    print("reading BAM")
    for pileup_start, pileup_end in windows[:10]:
        print("window", pileup_start, pileup_end)

        sys.stderr.write("\r%s" % pileup_start)
        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_lengths, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=runlength_read_sequences)

        # No reads here?
        if len(aligned_sequences) == 0:
            continue

        sequence_encoding = list()
        length_encoding = list()
        reversal_encoding = list()

        # print("REF\t", "".join(aligned_ref_sequence))
        for read_id in aligned_sequences:
            # print("READ\t","".join(aligned_sequences[read_id]))
            sequence_encoding.append([get_encoding(base) for base in aligned_sequences[read_id]])
            length_encoding.append(aligned_lengths[read_id])
            reversal_encoding.append(reversal_statuses[read_id])

        ref_sequence_encoding = [[get_encoding(base) for base in aligned_ref_sequence]]
        ref_lengths_encoding = [aligned_ref_lengths]

        # The numpy.int / numpy.float / numpy.bool aliases were removed in
        # NumPy 1.24; the builtins select the same dtypes.
        ref_sequence_encoding = numpy.array(ref_sequence_encoding, dtype=int)
        ref_length_encoding = numpy.array(ref_lengths_encoding, dtype=int)
        sequence_encoding = numpy.array(sequence_encoding, dtype=int)
        length_encoding = numpy.array(length_encoding, dtype=float)
        reversal_encoding = numpy.array(reversal_encoding, dtype=bool)

        ref_sequence_encoding = numpy.atleast_2d(ref_sequence_encoding)
        ref_length_encoding = numpy.atleast_2d(ref_length_encoding)
        sequence_encoding = numpy.atleast_2d(sequence_encoding)
        length_encoding = numpy.atleast_2d(length_encoding)

        # plot_runlength_pileup(sequences=-sequence_encoding,
        #                       lengths=length_encoding,
        #                       ref_sequence=-ref_sequence_encoding,
        #                       ref_lengths=ref_length_encoding)

        consensus_sequence, consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding)

        modal_consensus_sequence, modal_consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding,
                                                         bayesian=False)

        print()
        print("PREDICTED\t", consensus_lengths[:10])
        print("TRUE\t\t", aligned_ref_lengths[:10])

        total_confusion += get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                   predicted_lengths=consensus_lengths,
                                                   max_length=10)

        total_modal_confusion += get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                         predicted_lengths=modal_consensus_lengths,
                                                         max_length=10)

    # Terminates the "\r" progress line written to stderr inside the loop.
    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)
    print("Bayes:", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_modal_confusion)
    print("No Bayes", accuracy)

    _show_and_save_confusion(total_confusion, os.path.join(output_dir, "confusion.png"))
    _show_and_save_confusion(total_modal_confusion, os.path.join(output_dir, "modal_confusion.png"))


def _show_and_save_confusion(confusion, plot_path):
    """Display the log10 of a confusion matrix as an image, then save it to plot_path."""
    figure = pyplot.figure()
    axes = pyplot.axes()
    axes.set_xlabel("Predicted")
    axes.set_ylabel("True")

    pyplot.imshow(numpy.log10(confusion))
    pyplot.show()
    figure.savefig(plot_path)
    pyplot.close()
def main():
    """Print the first entries of the read segments over a 100-long window.

    Reference and reads are run-length encoded and passed to ``align_as_RLE``.
    ``get_read_segments`` is then queried for 100000..100100 on the first
    contig, and for each key the first ten sequence and length entries are
    printed.

    All inputs are hard-coded paths; edit them below to run on other data.
    """
    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/guppy/subsampled/11-29/r94_ec_rad2.30x-30kb.fasta"
    # read_fasta_path = "/home/ryan/data/Nanopore/ecoli/runnie/v2/rad2_pass_runnie_0_1_10_11_12_13_v2.fa"
    # read_fasta_path = "/home/ryan/software/shasta/output/run_2019_3_23_14_29_ecoli_wg_guppy_NO_BAYES/Assembly.fasta"
    # read_fasta_path = "/home/ryan/software/shasta/output/run_2019_3_23_15_40_ecoli_wg_guppy_BAYES/Assembly.fasta"
    # read_fasta_path = "/home/ryan/data/Nanopore/ecoli/runnie/rad2_pass_runnie_0_v2.fa"

    # ---- TEST DATA ----
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_reads.fasta"
    # -------------------

    run_dir_name = "runlength_matrix_from_sequence_" + FileManager.get_datetime_string()
    output_dir = os.path.join("output/", run_dir_name)
    FileManager.ensure_directory_exists(output_dir)

    # Output names are the input base names with the final extension swapped
    # for "_rle.fasta".
    ref_stem = os.path.basename(ref_fasta_path).rpartition(".")[0]
    runlength_ref_fasta_path = os.path.join(output_dir, ref_stem + "_rle.fasta")

    read_stem = os.path.basename(read_fasta_path).rpartition(".")[0]
    runlength_read_fasta_path = os.path.join(output_dir, read_stem + "_rle.fasta")

    sys.stderr.write("RL encoding fasta...\n")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(fasta_sequence_path=read_fasta_path)

    sys.stderr.write("Aligning RLE fasta...\n")

    read_vs_ref_bam_path = align_as_RLE(
        runlength_reference_path=runlength_ref_fasta_path,
        runlength_ref_sequences=runlength_ref_sequences,
        runlength_read_path=runlength_read_fasta_path,
        runlength_read_sequences=runlength_read_sequences,
        output_dir=output_dir,
    )

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Work on the first contig only.
    chromosome_name = fasta_handler.get_contig_names()[0]
    print(fasta_handler.get_chr_sequence_length(chromosome_name))

    segment_start = 100000
    sequences, lengths = get_read_segments(
        fasta_handler=fasta_handler,
        bam_handler=bam_handler,
        chromosome_name=chromosome_name,
        pileup_start=segment_start,
        pileup_end=segment_start + 100,
        runlength_ref_sequences=runlength_ref_sequences,
        read_data=runlength_read_sequences,
    )

    for key in sequences:
        print(key)
        print(sequences[key][:10])
        print(lengths[key][:10])
def main():
    """Run ``select_windows`` over every contig of the reference in worker pools.

    For each contig (except the one named "NC_001328.1"), the region from
    1 Mbp after the start to 1 Mbp before the end is cut into 10000-long
    chunks by ``chunk_region``.  ``generate_argument_pools`` groups the
    per-chunk arguments, and each group is dispatched with ``Pool.starmap``
    on a fresh pool of ``max_threads`` processes.
    """
    # output_root_dir = "output/"
    # instance_dir = "spoa_pileup_generation_" + get_current_timestamp()
    # output_dir = os.path.join(output_root_dir, instance_dir)

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.filtered2820.bam"
    # reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"

    # ---- Nanopore GUPPY - E. Coli - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli_reads_vs_ref.bam"
    reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"

    # -------------------------------------------------------------------------

    # Run parameters shared by every contig.
    flank = 1000000
    max_threads = 30
    window_size = 10000
    min_size = 20
    max_size = 80

    name_reader = FastaHandler(reference_file_path)
    contig_names = name_reader.get_contig_names()
    name_reader.close()

    # chromosome_name = "NC_003279.8"     # celegans chr1
    # chromosome_name = "NC_003283.11"    # celegans chr5

    for chromosome_name in contig_names:
        if chromosome_name == "NC_001328.1":    # mitochondrial
            continue

        print("STARTING:", chromosome_name)

        # The handler is reopened per contig and closed again as soon as the
        # sequence has been read.
        sequence_reader = FastaHandler(reference_file_path)
        chromosome_length = sequence_reader.get_chr_sequence_length(chromosome_name)
        reference_sequence = sequence_reader.get_sequence(chromosome_name=chromosome_name,
                                                          start=0,
                                                          stop=chromosome_length)
        sequence_reader.close()

        region = [flank, chromosome_length - flank]

        # Manager-backed integer passed to every worker call as `counter`.
        manager = multiprocessing.Manager()
        counter = manager.Value('i', 0)

        region_windows = chunk_region(region=region, size=window_size)

        n_chunks = len(region_windows)

        print("subregions: ", n_chunks)

        output_dir = "output/window_selection/" + "_".join([str(chromosome_name),
                                                            str(region[0]),
                                                            str(region[1]),
                                                            FileManager.get_datetime_string()])
        print(output_dir)

        pooled_args = generate_argument_pools(pool_size=max_threads,
                                              bam_file_path=bam_file_path,
                                              chromosome_name=chromosome_name,
                                              region_windows=region_windows,
                                              reference_sequence=reference_sequence,
                                              min_size=min_size,
                                              max_size=max_size,
                                              output_dir=output_dir,
                                              counter=counter,
                                              n_chunks=n_chunks)

        for arg_pool in pooled_args:
            # Collect garbage before starting the next pool of workers.
            gc.collect()
            with Pool(processes=max_threads) as pool:
                pool.starmap(select_windows, arg_pool)

        print()
def main():
    """Print the first entries of runnie read segments over a 100-long window.

    Reads are loaded from a runnie output file, the reference fasta is
    run-length encoded, and both are passed to ``align_as_RLE``.
    ``get_read_segments`` is then queried for 100000..100100 on the first
    contig, and for each key the first ten sequence, scale and shape entries
    are printed.

    All inputs are hard-coded paths; edit them below to run on other data.
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/test/rad2_pass_runnie_4_5_6_7.out"

    run_dir_name = "runlength_matrix_from_runnie_output_" + FileManager.get_datetime_string()
    output_dir = os.path.join("output/", run_dir_name)
    FileManager.ensure_directory_exists(output_dir)

    # Output names are the input base names with the final extension swapped
    # for "_rle.fasta".
    ref_stem = os.path.basename(ref_fasta_path).rpartition(".")[0]
    runlength_ref_fasta_path = os.path.join(output_dir, ref_stem + "_rle.fasta")

    read_stem = os.path.basename(runlength_path).rpartition(".")[0]
    runlength_read_fasta_path = os.path.join(output_dir, read_stem + "_rle.fasta")

    runnie_reader = RunlengthHandler(runlength_path)

    # Index every read by its id.
    read_data = {
        read.id: read
        for read in runnie_reader.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)
    }

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(
        runlength_reference_path=runlength_ref_fasta_path,
        runlength_ref_sequences=runlength_ref_sequences,
        runlength_read_path=runlength_read_fasta_path,
        runlength_read_sequences=read_data,
        output_dir=output_dir,
    )

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Work on the first contig only; its length is looked up but not used.
    chromosome_name = fasta_handler.get_contig_names()[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    segment_start = 100000
    sequences, scales, shapes = get_read_segments(
        fasta_handler=fasta_handler,
        bam_handler=bam_handler,
        chromosome_name=chromosome_name,
        pileup_start=segment_start,
        pileup_end=segment_start + 100,
        runlength_ref_sequences=runlength_ref_sequences,
        read_data=read_data,
    )

    for key in sequences:
        print(key)
        print(sequences[key][:10])
        print(scales[key][:10])
        print(shapes[key][:10])
def main():
    """Show a histogram of run lengths per character for one reference contig.

    The full sequence of the hard-coded contig is fetched and passed to
    ``count_runlength_per_character``.  One bar chart is drawn per character,
    with unit-width bins from 0 to that character's maximum run length, and
    the figure is displayed with ``pyplot.show()``.
    """
    # output_root_dir = "output/"
    # instance_dir = "spoa_pileup_generation_" + get_current_timestamp()
    # output_dir = os.path.join(output_root_dir, instance_dir)

    # ---- Illumina (laptop) --------------------------------------------------
    # reference_file_path = "/Users/saureous/data/Platinum/chr1.fa"

    # ---- GIAB (dev machine) -------------------------------------------------
    # reference_file_path = "/home/ryan/data/GIAB/GRCh37_WG.fa"

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"

    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(reference_file_path)

    chromosome_name = "NC_003279.8"     # celegans chr1
    # chromosome_name = "NC_003283.11"  # celegans chr5
    # chromosome_name = "1"
    # chromosome_name = "chr" + chromosome_name

    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                    start=0,
                                                    stop=chromosome_length)

    character_counts = count_runlength_per_character(reference_sequence)

    # squeeze=False always yields a 2D array of Axes, so axes[k] also works
    # when only one character is present.
    figure, axes = pyplot.subplots(nrows=len(character_counts), sharex=True, sharey=True, squeeze=False)
    axes = axes[:, 0]

    step = 1
    for k, key in enumerate(character_counts.keys()):
        counts = character_counts[key]
        max_count = max(counts)

        bins = numpy.arange(0, max_count + step, step=step)

        # The `normed` keyword was removed from numpy.histogram in NumPy 1.24;
        # raw (unnormalised) counts are the default.
        frequencies, bins = numpy.histogram(counts, bins=bins)

        print(bins)
        print(frequencies)
        print(bins.shape)

        # Shift bin midpoints back by half a step so each bar is centred on
        # the left edge of its bin.
        center = (bins[:-1] + bins[1:]) / 2 - step / 2

        axes[k].bar(center, frequencies, width=step, align="center")
        axes[k].set_ylabel(str(key))
        axes[k].set_xticks(numpy.arange(0, max_count + 1))

    pyplot.show()