def __init__(self, chromosome_name, bam_file_path, reference_file_path, vcf_path, train_mode, confident_tree):
    """
    Initialize a manager object
    :param chromosome_name: Name of the chromosome
    :param bam_file_path: Path to the BAM file
    :param reference_file_path: Path to the reference FASTA file
    :param vcf_path: Path to the VCF file
    :param train_mode: If True, images are labeled against the VCF for training
    :param confident_tree: Dictionary of confident intervals keyed by chromosome. None if not provided.
    """
    # --- initialize handlers ---
    # create objects to handle different files and query
    self.bam_path = bam_file_path
    self.fasta_path = reference_file_path
    self.vcf_path = vcf_path
    self.bam_handler = FRIDAY.BAM_handler(bam_file_path)
    self.fasta_handler = FRIDAY.FASTA_handler(reference_file_path)
    self.train_mode = train_mode
    self.confident_tree = confident_tree[chromosome_name] if confident_tree else None
    self.interval_tree = IntervalTree(self.confident_tree) if confident_tree else None

    # --- initialize names ---
    # name of the chromosome
    self.chromosome_name = chromosome_name
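
# A minimal sketch, not part of the pipeline: how a confident-region structure
# like self.interval_tree can gate candidate positions during training. The
# plain list scan below stands in for whatever query the project's IntervalTree
# class exposes; no method of that class is assumed here.
def sketch_in_confident_region(confident_intervals, position):
    # confident_intervals: list of (start, end) tuples, end exclusive
    return any(start <= position < end for start, end in confident_intervals)

# usage sketch with hypothetical coordinates:
# sketch_in_confident_region([(100, 200), (500, 900)], 150)  # -> True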
def find_haplotypes(self, reads):
    # get the reference sequence for this region from the FASTA file
    reference_sequence = self.fasta_handler.get_reference_sequence(self.contig,
                                                                   self.region_start,
                                                                   self.region_end)
    min_k, max_k = FRIDAY.DeBruijnGraph.find_min_k_from_ref(reference_sequence,
                                                            DeBruijnGraphOptions.MIN_K,
                                                            DeBruijnGraphOptions.MAX_K,
                                                            DeBruijnGraphOptions.STEP_K)
    # no k-mer size in [MIN_K, MAX_K] builds the reference without a cycle
    if min_k == -1:
        return None, None

    # try increasing k-mer sizes until a graph yields haplotypes
    for kmer_size in range(min_k, max_k + 1, DeBruijnGraphOptions.STEP_K):
        dbg_graph = FRIDAY.DeBruijnGraph(self.region_start, self.region_end)
        haplotypes = dbg_graph.generate_haplotypes(reference_sequence, reads, kmer_size)
        if haplotypes:
            return reference_sequence, haplotypes

    return reference_sequence, []
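
# A hedged sketch of the idea behind find_min_k_from_ref: GATK-style assemblers
# pick the smallest k at which every k-mer of the reference is unique, since a
# repeated k-mer sends the reference path back through the graph. Whether the
# FRIDAY extension uses exactly this criterion is an assumption; this pure-Python
# stand-in is illustrative only.
def sketch_find_min_k(reference, min_k, max_k, step_k):
    for k in range(min_k, max_k + 1, step_k):
        kmers = [reference[i:i + k] for i in range(len(reference) - k + 1)]
        if len(kmers) == len(set(kmers)):
            return k  # smallest k whose k-mers are all distinct
    return -1  # mirrors the sentinel checked above

# sketch_find_min_k("ACGTACGTT", 3, 9, 1) -> 5, since some 3- and 4-mers repeat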
def perform_local_alignment(self, region_with_reads):
    if not region_with_reads.reads:
        return []

    # widen the window to cover every read, then pad both sides with safe bases
    ref_start = min(region_with_reads.min_read_start,
                    region_with_reads.region_start) - AlingerOptions.ALIGNMENT_SAFE_BASES
    ref_end = max(region_with_reads.max_read_end,
                  region_with_reads.region_end) + AlingerOptions.ALIGNMENT_SAFE_BASES

    if ref_end <= region_with_reads.region_end:
        return region_with_reads.reads

    # stitch the padded reference: prefix + region + suffix
    ref_suffix = self.fasta_handler.get_reference_sequence(self.chromosome_name,
                                                           region_with_reads.region_end,
                                                           ref_end)
    ref_prefix = self.fasta_handler.get_reference_sequence(self.chromosome_name,
                                                           ref_start,
                                                           region_with_reads.region_start)
    ref = self.fasta_handler.get_reference_sequence(self.chromosome_name,
                                                    region_with_reads.region_start,
                                                    region_with_reads.region_end)
    ref_seq = ref_prefix + ref + ref_suffix

    # pad each haplotype the same way so coordinates line up with ref_seq
    haplotypes = [ref_prefix + hap + ref_suffix for hap in region_with_reads.haplotypes]

    aligner = FRIDAY.ReadAligner(ref_start, ref_end, ref_seq)
    haplotypes = sorted(set(haplotypes))

    # nothing to realign against if the only haplotype is the reference itself
    if not haplotypes or haplotypes == [ref_seq]:
        return region_with_reads.reads

    realigned_reads = aligner.align_reads(haplotypes, region_with_reads.reads)
    return realigned_reads
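
# A runnable sketch of the window-padding arithmetic above, with made-up
# coordinates; ALIGNMENT_SAFE_BASES is assumed positive.
def sketch_padded_window(region_start, region_end, min_read_start, max_read_end, safe_bases):
    # widen the window to cover every read, then pad both sides
    return (min(min_read_start, region_start) - safe_bases,
            max(max_read_end, region_end) + safe_bases)

# sketch_padded_window(1000, 2000, 980, 2050, 20) -> (960, 2070)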
def find_active_region(self, reads):
    # find the active regions of this window
    active_region_finder = FRIDAY.ActiveRegionFinder(self.reference_sequence,
                                                     self.contig,
                                                     self.region_start,
                                                     self.region_end)
    active_regions = active_region_finder.find_active_region(reads)
    return active_regions
def generate_pileup(self, reads, windows, positional_candidates, vcf_path, train_mode):
    # pad the reference around the span covered by all windows
    ref_start = max(0, windows[0][0] - CandidateFinderOptions.SAFE_BASES)
    ref_end = windows[-1][1] + CandidateFinderOptions.SAFE_BASES
    reference_sequence = self.fasta_handler.get_reference_sequence(self.chromosome_name,
                                                                   ref_start,
                                                                   ref_end)
    # image generator object
    image_generator = FRIDAY.ImageGenerator(reference_sequence,
                                            self.chromosome_name,
                                            ref_start,
                                            ref_end,
                                            positional_candidates)

    if train_mode:
        # fetch VCF records with a 20-base buffer on each side for labeling
        vcf_handler = FRIDAY.VCF_handler(vcf_path)
        positional_vcf = vcf_handler.get_positional_vcf_records(self.chromosome_name,
                                                                ref_start - 20,
                                                                ref_end + 20)
        image_generator.set_positional_vcf(positional_vcf)

    pileup_images = image_generator.create_window_pileups(windows, reads, train_mode)

    return pileup_images
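
# A hedged usage sketch of how the pileup images produced above are consumed.
# The attribute names (image_alt1, label_alt1, image_alt2, label_alt2) come
# from the collection loop in chromosome_level_parallelization below; nothing
# else is assumed.
def sketch_collect_images(pileup_images, train_mode):
    # every window contributes one image per alternate allele
    images, labels = [], []
    for image in pileup_images:
        images.append(image.image_alt1)
        if train_mode:
            labels.append(image.label_alt1)
        images.append(image.image_alt2)
        if train_mode:
            labels.append(image.label_alt2)
    return images, labels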
def find_candidates(self, reads):
    # pad the reference slice twice as wide as the candidate search window
    ref_start = max(0, self.region_start - (CandidateFinderOptions.SAFE_BASES * 2))
    ref_end = self.region_end + (CandidateFinderOptions.SAFE_BASES * 2)
    reference_sequence = self.fasta_handler.get_reference_sequence(self.contig,
                                                                   ref_start,
                                                                   ref_end)
    # candidate finder object
    candidate_finder = FRIDAY.CandidateFinder(reference_sequence,
                                              self.contig,
                                              max(0, self.region_start - CandidateFinderOptions.SAFE_BASES),
                                              self.region_end + CandidateFinderOptions.SAFE_BASES,
                                              ref_start,
                                              ref_end)
    # find candidates
    candidate_positions, candidate_map = candidate_finder.find_candidates(reads)

    # keep only candidates that fall inside the requested region
    filtered_positions = [candidate_position
                          for candidate_position in candidate_positions
                          if self.region_start <= candidate_position <= self.region_end]

    return filtered_positions, candidate_map
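
# A small sketch of the nested padding used above, with hypothetical
# coordinates: the search window is the region padded by SAFE_BASES, and the
# reference slice is padded by 2 * SAFE_BASES so the finder never reads past
# the end of its sequence.
def sketch_candidate_windows(region_start, region_end, safe_bases):
    search_window = (max(0, region_start - safe_bases), region_end + safe_bases)
    reference_slice = (max(0, region_start - 2 * safe_bases), region_end + 2 * safe_bases)
    return search_window, reference_slice

# sketch_candidate_windows(5000, 6000, 20) -> ((4980, 6020), (4960, 6040))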
def chromosome_level_parallelization(chr_list, bam_file, ref_file, vcf_file, confident_intervals, output_path,
                                     image_path, total_threads, thread_id, train_mode, max_size=1000):
    """
    Chunk each chromosome into fixed-size intervals and process the chunks assigned to this thread.
    :param chr_list: List of chromosomes to be processed
    :param bam_file: Path to the BAM file
    :param ref_file: Path to the reference FASTA file
    :param vcf_file: Path to the VCF file
    :param confident_intervals: Dictionary of confident intervals keyed by chromosome, or None
    :param output_path: Path to the output directory
    :param image_path: Path to the directory where images are saved
    :param total_threads: Total number of threads
    :param thread_id: Id of this thread; selects every total_threads-th interval
    :param train_mode: If True, label the images using the VCF
    :param max_size: Maximum size of a segment
    :return:
    """
    fasta_handler = FRIDAY.FASTA_handler(ref_file)

    for chr_name in chr_list:
        interval_start, interval_end = (0, fasta_handler.get_chromosome_sequence_length(chr_name) + 1)

        # chop the chromosome into segments of at most max_size bases
        all_intervals = []
        for pos in range(interval_start, interval_end, max_size):
            all_intervals.append((pos, min(interval_end, pos + max_size - 1)))

        # round-robin assignment: this thread takes every total_threads-th interval
        intervals = [r for i, r in enumerate(all_intervals) if i % total_threads == thread_id]

        view = View(chromosome_name=chr_name,
                    bam_file_path=bam_file,
                    reference_file_path=ref_file,
                    vcf_path=vcf_file,
                    train_mode=train_mode,
                    confident_tree=confident_intervals)

        smry = None
        image_file_name = image_path + chr_name + "_" + str(thread_id) + ".h5py"
        if intervals:
            smry = open(output_path + chr_name + "_" + str(thread_id) + "_summary.csv", 'w')

        start_time = time.time()
        total_reads_processed = 0
        total_windows = 0
        all_images = []
        all_labels = []
        global_index = 0

        for interval in intervals:
            _start, _end = interval
            n_reads, n_windows, images, candidate_map = view.parse_region(start_position=_start,
                                                                          end_position=_end)
            total_reads_processed += n_reads
            total_windows += n_windows

            if not images or not candidate_map:
                continue

            # save the candidate dictionary
            dictionary_file_path = image_path + chr_name + "_" + str(_start) + "_" + str(_end) + "_" + \
                str(thread_id) + ".pkl"
            with open(dictionary_file_path, 'wb') as f:
                pickle.dump(candidate_map, f, pickle.HIGHEST_PROTOCOL)

            # collect the images; each window yields one image per alternate allele
            for image in images:
                record = (image.chromosome_name, image.start_pos, image.end_pos)

                all_images.append(image.image_alt1)
                if train_mode:
                    all_labels.append(image.label_alt1)
                # write in summary file
                summary_string = image_file_name + "," + str(global_index) + "," + dictionary_file_path + "," + \
                    ' '.join(map(str, record)) + " 1\n"
                smry.write(summary_string)
                global_index += 1

                all_images.append(image.image_alt2)
                if train_mode:
                    all_labels.append(image.label_alt2)
                summary_string = image_file_name + "," + str(global_index) + "," + dictionary_file_path + "," + \
                    ' '.join(map(str, record)) + " 2\n"
                smry.write(summary_string)
                global_index += 1

        if smry:
            smry.close()

        hdf5_file = h5py.File(image_file_name, mode='w')
        # the image dataset we save; the dataset name in h5py is "images"
        img_dset = hdf5_file.create_dataset("images",
                                            (len(all_images),) + (ImageSizeOptions.IMAGE_HEIGHT,
                                                                  ImageSizeOptions.SEQ_LENGTH,
                                                                  ImageSizeOptions.IMAGE_CHANNELS),
                                            np.uint8,
                                            compression='gzip')
        label_dataset = hdf5_file.create_dataset("labels",
                                                 (len(all_labels),) + (ImageSizeOptions.SEQ_LENGTH,),
                                                 np.uint8)
        # save the images and labels to the h5py file
        img_dset[...] = all_images
        label_dataset[...] = all_labels
        hdf5_file.close()

        print("CHROMOSOME: ", chr_name,
              "THREAD ID: ", thread_id,
              "READS: ", total_reads_processed,
              "WINDOWS: ", total_windows,
              "TOTAL TIME ELAPSED: ", int(math.floor(time.time() - start_time) / 60), "MINS",
              math.ceil(time.time() - start_time) % 60, "SEC")
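
# A minimal, runnable sketch of the interval striping used above. The
# chromosome length (10_000 here) is hypothetical.
def sketch_thread_intervals(chromosome_length, total_threads, thread_id, max_size=1000):
    all_intervals = [(pos, min(chromosome_length + 1, pos + max_size - 1))
                     for pos in range(0, chromosome_length + 1, max_size)]
    # each thread takes every total_threads-th chunk, so work interleaves evenly
    return [r for i, r in enumerate(all_intervals) if i % total_threads == thread_id]

# sketch_thread_intervals(10_000, total_threads=4, thread_id=1)
# -> [(1000, 1999), (5000, 5999), (9000, 9999)]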