def __init__(self, chromosome_name, bam_file_path, draft_file_path, truth_bam, hp_tag, train_mode):
    """
    Set up the file handlers and run settings for one chromosome.
    :param chromosome_name: Name of the chromosome
    :param bam_file_path: Path to the BAM file
    :param draft_file_path: Path to the reference FASTA file
    :param truth_bam: Path to the truth sequence to reference mapping file;
                      only retained on the instance when train_mode is truthy
    :param hp_tag: Value stored as self.hp_tag
    :param train_mode: Value stored as self.train_mode
    """
    # --- file paths and the handlers used to query them ---
    self.bam_path = bam_file_path
    self.fasta_path = draft_file_path
    self.bam_handler = PEPPER_HP.BAM_handler(bam_file_path)
    self.fasta_handler = PEPPER_HP.FASTA_handler(draft_file_path)

    # --- run settings ---
    self.train_mode = train_mode
    self.hp_tag = hp_tag
    self.downsample_rate = 1.0
    # the truth alignment is only kept while training
    self.truth_bam = truth_bam if self.train_mode else None

    # --- name of the chromosome this object works on ---
    self.chromosome_name = chromosome_name
def __init__(self, reference_file_path, contigs, sample_name, output_dir, filename):
    """
    Open '<output_dir><filename>.vcf' for writing with a freshly built header.
    :param reference_file_path: Path to the reference FASTA file
    :param contigs: Contigs passed to get_vcf_header and stored as self.contigs
    :param sample_name: Sample name passed to get_vcf_header
    :param output_dir: Directory prefix of the output file (concatenated as-is, no separator is added)
    :param filename: Output file name without the '.vcf' extension
    """
    self.fasta_handler = PEPPER_HP.FASTA_handler(reference_file_path)
    self.contigs = contigs

    header = self.get_vcf_header(sample_name, contigs)
    vcf_path = output_dir + filename + '.vcf'
    self.vcf_file = VariantFile(vcf_path, 'w', header=header)
def reads_to_reference_realignment(self, region_start, region_end, reads):
    """
    Locally realign a set of reads to the reference.
    :param region_start: Start position of the region
    :param region_end: End position of the region; the reference window is
                       extended past it by AlingerOptions.ALIGNMENT_SAFE_BASES
    :param reads: Reads to realign
    :return: The realigned reads, or an empty list when no reads were given
    """
    if reads:
        window_end = region_end + AlingerOptions.ALIGNMENT_SAFE_BASES
        reference_window = self.fasta_handler.get_reference_sequence(
            self.chromosome_name, region_start, window_end)
        read_aligner = PEPPER_HP.ReadAligner(region_start, window_end, reference_window)
        return read_aligner.align_reads_to_reference(reads)

    # nothing to realign
    return []
def __init__(self, vcf1, vcf2, ref_fasta, only_overlapping=True, discard_phase=False, detailed_info=False):
    """Prepare the merge of two haploid VCFs into a diploid VCF.

    Variants in one file which overlap with variants in the other will
    have their alts padded.

    .. warning::
        Variants in a single vcf file should not overlap with each other.

    :param vcf1, vcf2: paths to haploid vcf files.
    :param ref_fasta: path to reference.fasta file.
    :param only_overlapping: bool, merge only overlapping variants (not adjacent ones).
    :param discard_phase: bool, if False, preserve phase, else output unphased variants.
    :param detailed_info: stored as self.detailed_info.
    """
    self.only_overlapping = only_overlapping
    self.discard_phase = discard_phase
    self.detailed_info = detailed_info

    self.vcfs = [VCFReader(path) for path in (vcf1, vcf2)]
    for reader in self.vcfs:
        reader.index()  # create tree
    self.fasta = pysam.FastaFile(ref_fasta)

    # every contig that appears in at least one of the two VCFs
    vcf_contigs = set(itertools.chain.from_iterable([reader.chroms for reader in self.vcfs]))
    vcf_contigs = sorted(list(vcf_contigs), key=natural_key)

    # keep (name, length) for each reference contig that carries variants,
    # in the order the reference lists its contigs
    fasta_handler = PEPPER_HP.FASTA_handler(ref_fasta)
    reference_contigs = fasta_handler.get_chromosome_names()
    self.chroms = []
    for contig in reference_contigs:
        if contig in vcf_contigs:
            contig_length = fasta_handler.get_chromosome_sequence_length(contig)
            self.chroms.append((contig, contig_length))
def create_summary(self, truth_bam, hp_tag, train_mode, realignment_flag):
    """
    Generate the image summaries for the region handled by this object.

    In train mode the truth reads of the region are fetched (and optionally
    realigned), conflicting truth regions are removed, and one labelled
    summary is generated per kept truth region. Otherwise a single unlabelled
    summary is generated for the whole region.

    :param truth_bam: Path to the truth BAM; only opened when train_mode is truthy
    :param hp_tag: Haplotype tag forwarded to the summary generator
    :param train_mode: If truthy, generate training summaries from the truth reads
    :param realignment_flag: If truthy, locally realign reads to the reference first
    :return: Five lists (images, labels, positions, chunk ids, reference sequences);
             all empty when there is nothing to summarize
    """
    all_images = []
    all_labels = []
    all_positions = []
    all_image_chunk_ids = []
    all_ref_seq = []

    def fetch_reads(start, end):
        # Fetch the filtered reads of [start, end], capped at MAX_READS_IN_REGION.
        reads = self.bam_handler.get_reads(self.chromosome_name,
                                           start,
                                           end,
                                           ReadFilterOptions.INCLUDE_SUPPLEMENTARY,
                                           ReadFilterOptions.MIN_MAPQ,
                                           ReadFilterOptions.MIN_BASEQ)
        if len(reads) <= AlingerOptions.MAX_READS_IN_REGION:
            return reads

        # https://github.com/google/nucleus/blob/master/nucleus/util/utils.py
        # reservoir_sample method utilized here; the generator is re-seeded on
        # every call so the same region always yields the same sample
        rng = np.random.RandomState(AlingerOptions.RANDOM_SEED)
        sample = []
        for i, read in enumerate(reads):
            if len(sample) < AlingerOptions.MAX_READS_IN_REGION:
                sample.append(read)
            else:
                j = rng.randint(0, i + 1)
                if j < AlingerOptions.MAX_READS_IN_REGION:
                    sample[j] = read
        return sample

    def collect(image_summary):
        # Append one chunked summary to the accumulated output lists.
        all_images.extend(image_summary.images)
        all_labels.extend(image_summary.labels)
        all_positions.extend(image_summary.positions)
        all_image_chunk_ids.extend(image_summary.chunk_ids)
        all_ref_seq.extend(image_summary.refs)

    if train_mode:
        # truth read filters: keep supplementary reads, require MQ 60, no base-quality filter
        include_truth_supp = True
        truth_read_mq = 60
        truth_read_baseq = 0

        # get the truth reads from the bam file
        truth_bam_handler = PEPPER_HP.BAM_handler(truth_bam)
        truth_reads = truth_bam_handler.get_reads(self.chromosome_name,
                                                  self.region_start_position,
                                                  self.region_end_position,
                                                  include_truth_supp,
                                                  truth_read_mq,
                                                  truth_read_baseq)

        # do a local realignment of truth reads to reference
        if realignment_flag:
            truth_reads = self.reads_to_reference_realignment(self.region_start_position,
                                                              self.region_end_position,
                                                              truth_reads)

        # each entry is [start, end, read, is_kept]; these are all the regions
        # we will generate summaries from, so the realignment has to happen first
        truth_regions = [[read.pos, read.pos_end - 1, read, True] for read in truth_reads]
        truth_regions = self.remove_conflicting_regions(truth_regions)

        if not truth_regions:
            return [], [], [], [], []

        for region in truth_regions:
            region_start, region_end, truth_read, is_kept = tuple(region)

            if not is_kept:
                continue

            ref_start = region_start
            ref_end = region_end + 1
            # ref_seq should contain region_end_position base
            ref_seq = self.fasta_handler.get_reference_sequence(self.chromosome_name,
                                                                ref_start,
                                                                ref_end)

            read_start = max(0, region_start)
            read_end = region_end
            all_reads = fetch_reads(read_start, read_end)

            if len(all_reads) == 0:
                continue

            if realignment_flag:
                all_reads = self.reads_to_reference_realignment(read_start,
                                                                read_end,
                                                                all_reads)

            summary_generator = PEPPER_HP.SummaryGenerator(ref_seq,
                                                           self.chromosome_name,
                                                           ref_start,
                                                           ref_end)
            summary_generator.generate_train_summary(all_reads,
                                                     region_start,
                                                     region_end,
                                                     truth_read,
                                                     hp_tag)

            collect(summary_generator.chunk_image_train(ImageSizeOptions.SEQ_LENGTH,
                                                        ImageSizeOptions.SEQ_OVERLAP,
                                                        ImageSizeOptions.IMAGE_HEIGHT))
    else:
        # HERE REALIGN THE READS TO THE REFERENCE THEN GENERATE THE SUMMARY TO GET A POLISHED HAPLOTYPE
        read_start = max(0, self.region_start_position)
        read_end = self.region_end_position
        all_reads = fetch_reads(read_start, read_end)

        if len(all_reads) == 0:
            return [], [], [], [], []

        if realignment_flag:
            all_reads = self.reads_to_reference_realignment(self.region_start_position,
                                                            self.region_end_position,
                                                            all_reads)

        # ref_seq should contain region_end_position base
        ref_seq = self.fasta_handler.get_reference_sequence(self.chromosome_name,
                                                            self.region_start_position,
                                                            self.region_end_position + 1)

        summary_generator = PEPPER_HP.SummaryGenerator(ref_seq,
                                                       self.chromosome_name,
                                                       self.region_start_position,
                                                       self.region_end_position)
        summary_generator.generate_summary(all_reads,
                                           self.region_start_position,
                                           self.region_end_position,
                                           hp_tag)

        collect(summary_generator.chunk_image(ImageSizeOptions.SEQ_LENGTH,
                                              ImageSizeOptions.SEQ_OVERLAP,
                                              ImageSizeOptions.IMAGE_HEIGHT))

    # internal sanity check: one label, chunk id and reference sequence per image
    assert (len(all_images) == len(all_labels) == len(all_image_chunk_ids) == len(all_ref_seq))

    return all_images, all_labels, all_positions, all_image_chunk_ids, all_ref_seq
def call_variant(bam_filepath, fasta_filepath, output_dir, threads, region, model_path, batch_size, gpu_mode,
                 callers_per_gpu, device_ids, num_workers, sample_name):
    """
    Run all the sub-modules of the haplotype-aware variant calling pipeline.

    The inputs are validated first; then, for haplotype 1 and haplotype 2 in turn,
    images are generated, inference is run and candidates are processed. Finally
    the two per-haplotype VCFs are merged into output_dir.

    :param bam_filepath: Path to the BAM file
    :param fasta_filepath: Path to the reference FASTA file
    :param output_dir: Output directory (normalized by UserInterfaceSupport.handle_output_directory)
    :param threads: Number of threads, must be > 0
    :param region: Region forwarded to make_images
    :param model_path: Path to the model file
    :param batch_size: Inference batch size, must be > 0
    :param gpu_mode: If truthy, CUDA availability is verified before running
    :param callers_per_gpu: Forwarded to run_inference
    :param device_ids: Comma-separated GPU device ids, or None
    :param num_workers: Number of data-loader workers, must be >= 0
    :param sample_name: Sample name forwarded to process_candidates
    :raises SystemExit: With status 1 when any validation fails
    """
    def log(message):
        # Write one timestamped line to stderr.
        sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') + "] " + message + "\n")

    def abort(message):
        # Report a fatal validation error and stop with a non-zero status.
        # sys.exit is used because the bare exit() builtin only exists when
        # the site module has been loaded.
        sys.stderr.write(message)
        sys.exit(1)

    start_time = time.time()

    # check the bam file
    if not os.path.isfile(bam_filepath) or not PEPPER_HP.BAM_handler(bam_filepath):
        abort("ERROR: CAN NOT LOCATE BAM FILE.\n")

    # check the fasta file
    if not os.path.isfile(fasta_filepath):
        abort("ERROR: CAN NOT LOCATE FASTA FILE.\n")

    # check the model file
    if not os.path.isfile(model_path):
        abort("ERROR: CAN NOT LOCATE MODEL FILE.\n")

    # check number of threads (zero is rejected as well, so the message says > 0)
    if threads <= 0:
        abort("ERROR: THREAD NEEDS TO BE >0.\n")

    # check batch_size
    if batch_size <= 0:
        abort("ERROR: batch_size NEEDS TO BE >0.\n")

    # check num_workers
    if num_workers < 0:
        abort("ERROR: num_workers NEEDS TO BE >=0.\n")

    # check if gpu inference can be done
    if gpu_mode and not torch.cuda.is_available():
        sys.stderr.write("ERROR: TORCH IS NOT BUILT WITH CUDA.\n")
        abort("SEE TORCH CAPABILITY:\n$ python3\n"
              ">>> import torch \n"
              ">>> torch.cuda.is_available()\n If true then cuda is avilable")

    # check if all devices are available
    # NOTE(review): this check is applied whenever device_ids is given, not only
    # in gpu_mode - confirm that this matches the intended nesting.
    if device_ids is not None:
        device_ids = [int(i) for i in device_ids.split(',')]
        for device_id in device_ids:
            major_capable, minor_capable = torch.cuda.get_device_capability(device=device_id)
            if major_capable < 0:
                sys.stderr.write("ERROR: GPU DEVICE: " + str(device_id) + " IS NOT CUDA CAPABLE.\n")
                abort("Try running: $ python3\n"
                      ">>> import torch \n"
                      ">>> torch.cuda.get_device_capability(device=" + str(device_id) + ")\n")
            else:
                sys.stderr.write("INFO: CAPABILITY OF GPU#" + str(device_id)
                                 + ":\t" + str(major_capable) + "-" + str(minor_capable) + "\n")

    # the timestamp doubles as the run id and keeps the outputs of separate runs apart
    timestr = time.strftime("%m%d%Y_%H%M%S")
    output_dir = UserInterfaceSupport.handle_output_directory(output_dir)

    image_output_directory_hp1 = output_dir + "images_" + str(timestr) + "/hp1_images/"
    image_output_directory_hp2 = output_dir + "images_" + str(timestr) + "/hp2_images/"

    prediction_output_directory_hp1 = output_dir + "predictions_" + str(timestr) + "/hp1/"
    prediction_output_directory_hp2 = output_dir + "predictions_" + str(timestr) + "/hp2/"

    candidate_output_directory_hp1 = output_dir + "candidate_variants_" + str(timestr) + "/hp1/"
    candidate_output_directory_hp2 = output_dir + "candidate_variants_" + str(timestr) + "/hp2/"

    # STEP 1: image generation, one pass per haplotype
    log("INFO: RUN-ID: " + str(timestr))
    log("INFO: IMAGE OUTPUT: " + str(image_output_directory_hp1))
    log("STEP 1.1: GENERATING IMAGES FOR HAPLOTYPE 1")
    make_images(bam_filepath,
                fasta_filepath,
                region,
                image_output_directory_hp1,
                1,
                threads)

    log("INFO: RUN-ID: " + str(timestr))
    log("INFO: IMAGE OUTPUT: " + str(image_output_directory_hp2))
    log("STEP 1.2: GENERATING IMAGES FOR HAPLOTYPE 2")
    make_images(bam_filepath,
                fasta_filepath,
                region,
                image_output_directory_hp2,
                2,
                threads)

    # STEP 2: inference
    log("STEP 2.1: RUNNING INFERENCE ON HAPLOTYPE 1")
    log("INFO: OUTPUT: " + str(prediction_output_directory_hp1))
    run_inference(image_output_directory_hp1,
                  model_path,
                  batch_size,
                  num_workers,
                  prediction_output_directory_hp1,
                  device_ids,
                  callers_per_gpu,
                  gpu_mode,
                  threads)

    log("STEP 2.2: RUNNING INFERENCE ON HAPLOTYPE 2")
    # report the directory haplotype 2 is actually written to
    log("INFO: OUTPUT: " + str(prediction_output_directory_hp2))
    run_inference(image_output_directory_hp2,
                  model_path,
                  batch_size,
                  num_workers,
                  prediction_output_directory_hp2,
                  device_ids,
                  callers_per_gpu,
                  gpu_mode,
                  threads)

    # STEP 3: candidate processing
    log("STEP 3.1: CALLING VARIANTS ON HAPLOTYPE 1")
    log("INFO: OUTPUT: " + str(candidate_output_directory_hp1))
    process_candidates(prediction_output_directory_hp1,
                       fasta_filepath,
                       sample_name,
                       candidate_output_directory_hp1,
                       threads)

    log("STEP 3.2: CALLING VARIANTS ON HAPLOTYPE 2")
    log("INFO: OUTPUT: " + str(candidate_output_directory_hp2))
    process_candidates(prediction_output_directory_hp2,
                       fasta_filepath,
                       sample_name,
                       candidate_output_directory_hp2,
                       threads)

    # STEP 4: merge the two haploid call sets
    log("STEP 4: MERGING VARIANTS.")
    log("INFO: OUTPUT: " + str(output_dir))
    haploid2diploid(candidate_output_directory_hp1 + 'candidates_as_variants.vcf',
                    candidate_output_directory_hp2 + 'candidates_as_variants.vcf',
                    fasta_filepath,
                    output_dir)

    end_time = time.time()
    mins = int((end_time - start_time) / 60)
    secs = int((end_time - start_time)) % 60
    log("TOTAL ELAPSED TIME FOR VARIANT CALLING: " + str(mins) + " Min " + str(secs) + " Sec")
def get_chromosome_list(chromosome_names, ref_file, bam_file, region_bed):
    """
    PARSES THROUGH THE CHROMOSOME PARAMETER TO FIND OUT WHICH REGIONS TO PROCESS

    Three sources are supported, checked in this order:
      1. neither chromosome_names nor region_bed given: every contig shared by the
         BAM and the FASTA (minus EXCLUDED_HUMAN_CONTIGS), each without a region;
      2. region_bed given: one entry per BED line, region sorted as [start, end];
      3. chromosome_names: comma-separated entries of the form 'name',
         'name:start-end' or a numeric range such as 'chr1-5'.

    :param chromosome_names: NAME OF CHROMOSOME
    :param ref_file: PATH TO THE REFERENCE FILE
    :param bam_file: PATH TO BAM FILE
    :param region_bed: PATH TO A TAB-SEPARATED BED FILE (chromosome, start, end)
    :return: LIST OF (chromosome_name, region) TUPLES; region IS None OR [start, end]
    :raises SystemExit: With status 1 on an invalid region or when no common contig exists
    """
    if not chromosome_names and not region_bed:
        fasta_handler = PEPPER_HP.FASTA_handler(ref_file)
        bam_handler = PEPPER_HP.BAM_handler(bam_file)
        bam_contigs = bam_handler.get_chromosome_sequence_names()
        fasta_contigs = fasta_handler.get_chromosome_names()
        common_contigs = (set(fasta_contigs) & set(bam_contigs)) - set(EXCLUDED_HUMAN_CONTIGS)

        if not common_contigs:
            sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') + "] "
                             + "ERROR: NO COMMON CONTIGS FOUND BETWEEN THE BAM FILE AND THE FASTA FILE.")
            sys.stderr.flush()
            sys.exit(1)

        common_contigs = sorted(common_contigs, key=UserInterfaceSupport.natural_key)
        sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') + "] INFO: COMMON CONTIGS FOUND: "
                         + str(common_contigs) + "\n")
        sys.stderr.flush()

        return [(contig_name, None) for contig_name in common_contigs]

    if region_bed:
        chromosome_name_list = []
        with open(region_bed) as fp:
            for line in fp:
                # blank lines and '#' comments carry no region
                if not line.strip() or line.startswith('#'):
                    continue
                line_to_list = line.rstrip().split('\t')
                chr_name, start_pos, end_pos = line_to_list[0], int(line_to_list[1]), int(line_to_list[2])
                chromosome_name_list.append((chr_name, sorted([start_pos, end_pos])))
        return chromosome_name_list

    chromosome_name_list = []
    for name in (part.strip() for part in chromosome_names.strip().split(',')):
        # split on region
        region = None
        if ':' in name:
            name_region = name.split(':')
            if len(name_region) != 2:
                sys.stderr.write("ERROR: --region INVALID value.\n")
                sys.exit(1)

            name, region_text = name_region
            try:
                region = [int(pos) for pos in region_text.strip().split('-')]
            except ValueError:
                # non-numeric bound: report it through the validation below
                region = []

            if len(region) != 2 or not region[0] <= region[1]:
                sys.stderr.write("ERROR: --region INVALID value.\n")
                sys.exit(1)

        range_split = name.split('-')
        digit_parts = [''.join(ch for ch in item if ch.isdigit()) for item in range_split]

        # a hyphen only denotes a numeric range (e.g. chr1-5) when every part has a
        # number in it; otherwise the name is taken literally (e.g. HLA-A)
        if len(range_split) > 1 and all(digit_parts):
            # everything before the first digit is the shared prefix
            chr_prefix = ''
            for p in name:
                if p.isdigit():
                    break
                chr_prefix = chr_prefix + p

            int_ranges = sorted(int(digits) for digits in digit_parts)
            for chr_seq in range(int_ranges[0], int_ranges[-1] + 1):
                chromosome_name_list.append((chr_prefix + str(chr_seq), region))
        else:
            chromosome_name_list.append((name, region))

    return chromosome_name_list
def chromosome_level_parallelization(chr_list, bam_file, draft_file, truth_bam, hp_tag, output_path,
                                     total_threads, train_mode, realignment_flag):
    """
    Split the requested regions into fixed-size intervals and generate images in parallel.

    :param chr_list: List of (chromosome_name, region) tuples; region is falsy for a whole contig
    :param bam_file: Path to the BAM file
    :param draft_file: Path to the reference FASTA file
    :param truth_bam: Path to the truth BAM, forwarded to the workers
    :param hp_tag: Haplotype tag, forwarded to the workers
    :param output_path: Output location, forwarded to the workers
    :param total_threads: Number of worker processes to start
    :param train_mode: Forwarded to the workers
    :param realignment_flag: Forwarded to the workers
    """
    # interval length; the same value is used for training and for inference
    max_size = 10000

    start_time = time.time()
    fasta_handler = PEPPER_HP.FASTA_handler(draft_file)

    # first calculate all the intervals that we need to process
    all_intervals = []
    for chr_name, region in chr_list:
        if region:
            # clamp the requested region to the contig
            interval_start, interval_end = tuple(region)
            interval_start = max(0, interval_start)
            interval_end = min(interval_end, fasta_handler.get_chromosome_sequence_length(chr_name) - 1)
        else:
            # no region requested: take the whole contig
            interval_start = 0
            interval_end = fasta_handler.get_chromosome_sequence_length(chr_name) - 1

        # every interval is widened by MIN_IMAGE_OVERLAP on both sides, within the region bounds
        for pos in range(interval_start, interval_end, max_size):
            padded_start = max(interval_start, pos - ImageSizeOptions.MIN_IMAGE_OVERLAP)
            padded_end = min(interval_end, pos + max_size + ImageSizeOptions.MIN_IMAGE_OVERLAP)
            all_intervals.append((chr_name, padded_start, padded_end))

    # all intervals calculated now
    # contig update message
    sys.stderr.write("[{}] INFO: TOTAL CONTIGS: {} TOTAL INTERVALS: {}\n".format(
        datetime.now().strftime('%m-%d-%Y %H:%M:%S'), len(chr_list), len(all_intervals)))
    sys.stderr.flush()

    args = (output_path, bam_file, draft_file, truth_bam, hp_tag, train_mode, realignment_flag)
    with concurrent.futures.ProcessPoolExecutor(max_workers=total_threads) as executor:
        futures = []
        for thread_id in range(0, total_threads):
            futures.append(executor.submit(UserInterfaceSupport.image_generator,
                                           args, all_intervals, total_threads, thread_id))

        for fut in concurrent.futures.as_completed(futures):
            error = fut.exception()
            if error is not None:
                sys.stderr.write("ERROR: " + str(error) + "\n")
            else:
                # the worker hands back its own thread id
                finished_id = fut.result()
                sys.stderr.write("[{}] INFO: THREAD {} FINISHED SUCCESSFULLY.\n".format(
                    datetime.now().strftime('%m-%d-%Y %H:%M:%S'), finished_id))
            fut._result = None  # python issue 27144

    elapsed = time.time() - start_time
    mins = int(elapsed / 60)
    secs = int(elapsed) % 60
    sys.stderr.write("[{}] INFO: FINISHED IMAGE GENERATION\n".format(
        datetime.now().strftime('%m-%d-%Y %H:%M:%S')))
    sys.stderr.write("[{}] INFO: ELAPSED TIME: {} Min {} Sec\n".format(
        datetime.now().strftime('%m-%d-%Y %H:%M:%S'), mins, secs))