def __init__(self, chromosome_name, bam_file_path, draft_file_path, truth_bam, train_mode):
    """
    Set up the file handlers and bookkeeping needed for one chromosome.

    :param chromosome_name: Name of the chromosome this object works on
    :param bam_file_path: Path to the BAM file
    :param draft_file_path: Path to the reference (draft) FASTA file
    :param truth_bam: Path to the truth-to-reference BAM file; it is only
                      opened when train_mode is truthy
    :param train_mode: Truthy to also create a handler for truth_bam
    """
    # --- file locations ---
    self.bam_path = bam_file_path
    self.fasta_path = draft_file_path

    # --- handlers used to query the files ---
    # The BAM handler is built before the FASTA handler, as in the original
    # construction order.
    self.bam_handler = PEPPER.BAM_handler(bam_file_path)
    self.fasta_handler = PEPPER.FASTA_handler(draft_file_path)

    # --- run configuration ---
    self.train_mode = train_mode
    self.downsample_rate = 1.0

    # The truth handler stays None unless training was requested.
    self.truth_bam_handler = (
        PEPPER.BAM_handler(truth_bam) if self.train_mode else None
    )

    # --- names ---
    self.chromosome_name = chromosome_name
def get_chromosome_list(chromosome_names, ref_file, bam_file, region_bed):
    """
    Work out which contigs (and optional sub-regions) should be processed.

    Three input modes are supported, checked in this order:

    1. Neither ``chromosome_names`` nor ``region_bed`` is given: every contig
       name reported by both the BAM handler and the FASTA handler is
       selected, sorted with ``UserInterfaceSupport.natural_key``, with no
       region restriction.
    2. ``region_bed`` is given: each data line of the tab-separated file
       contributes one ``(name, [start, end])`` entry. Blank lines and
       lines starting with ``#`` are skipped.
    3. Otherwise ``chromosome_names`` is parsed as a comma-separated list.
       Each entry may carry a ``:start-end`` region suffix, and a name such
       as ``chr1-5`` is expanded to ``chr1`` ... ``chr5``. A name containing
       ``-`` where some part has no digits (e.g. ``HLA-A``) is kept as a
       literal contig name.

    :param chromosome_names: Comma-separated contig/region specification, or
                             a falsy value
    :param ref_file: Path to the reference FASTA file (only used in mode 1)
    :param bam_file: Path to the BAM file (only used in mode 1)
    :param region_bed: Path to a BED file of regions, or a falsy value
    :return: List of ``(contig_name, region)`` tuples where ``region`` is
             ``None`` or a two-element ``[start, end]`` list
    :raises SystemExit: With exit status 1 when no common contig exists or a
                        region specification is invalid
    """
    # --- mode 1: no selection given, use every contig shared by BAM and FASTA ---
    if not chromosome_names and not region_bed:
        fasta_handler = PEPPER.FASTA_handler(ref_file)
        bam_handler = PEPPER.BAM_handler(bam_file)
        bam_contigs = bam_handler.get_chromosome_sequence_names()
        fasta_contigs = fasta_handler.get_chromosome_names()
        common_contigs = list(set(fasta_contigs) & set(bam_contigs))

        if not common_contigs:
            sys.stderr.write(
                "[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') + "] " +
                "ERROR: NO COMMON CONTIGS FOUND BETWEEN THE BAM FILE AND THE FASTA FILE.\n")
            sys.stderr.flush()
            sys.exit(1)

        common_contigs = sorted(common_contigs, key=UserInterfaceSupport.natural_key)
        sys.stderr.write(
            "[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') +
            "] INFO: COMMON CONTIGS FOUND: " + str(common_contigs) + "\n")
        sys.stderr.flush()

        return [(contig_name, None) for contig_name in common_contigs]

    # --- mode 2: regions come from a BED file ---
    if region_bed:
        chromosome_name_list = []
        with open(region_bed) as bed_file:
            for raw_line in bed_file:
                line = raw_line.rstrip()
                # Blank and comment lines carry no region; indexing their
                # fields would fail.
                if not line or line.startswith('#'):
                    continue
                fields = line.split('\t')
                chr_name = fields[0]
                # sorted() tolerates start/end given in either order.
                region = sorted([int(fields[1]), int(fields[2])])
                chromosome_name_list.append((chr_name, region))
        return chromosome_name_list

    # --- mode 3: comma-separated names, optional ':start-end' and 'chrA-B' ranges ---
    chromosome_name_list = []
    for name in chromosome_names.split(','):
        name = name.strip()
        if not name:
            # An empty entry (e.g. from a trailing comma) names no contig.
            continue

        region = None
        if ':' in name:
            name_region = name.split(':')
            if len(name_region) != 2:
                sys.stderr.write("ERROR: --region INVALID value.\n")
                sys.exit(1)
            name, region_text = name_region
            name = name.strip()
            try:
                region = [int(pos) for pos in region_text.strip().split('-')]
            except ValueError:
                # Non-numeric bounds are reported through the same error
                # path as any other malformed region.
                region = []
            if len(region) != 2 or region[0] > region[1]:
                sys.stderr.write("ERROR: --region INVALID value.\n")
                sys.exit(1)

        range_parts = name.split('-')
        digit_parts = [
            ''.join(ch for ch in part if ch.isdigit()) for part in range_parts
        ]
        # Only expand when every side of the '-' carries a number; otherwise
        # the '-' is part of the contig name itself.
        if len(range_parts) > 1 and all(digit_parts):
            # The prefix is everything before the first digit of the name.
            prefix_len = next(
                (idx for idx, ch in enumerate(name) if ch.isdigit()), len(name))
            chr_prefix = name[:prefix_len]
            bounds = sorted(int(digits) for digits in digit_parts)
            for chr_seq in range(bounds[0], bounds[-1] + 1):
                chromosome_name_list.append((chr_prefix + str(chr_seq), region))
        else:
            chromosome_name_list.append((name, region))

    return chromosome_name_list
def chromosome_level_parallelization(chr_list, bam_file, draft_file, truth_bam, output_path, total_threads, train_mode):
    """
    Cut the requested contigs into overlapping windows and run the image
    generator over them in a pool of worker processes.

    :param chr_list: Iterable of (contig_name, region) pairs. A falsy region
                     selects the whole contig; otherwise it must unpack to
                     (start, end)
    :param bam_file: Path to the BAM file, forwarded to the workers
    :param draft_file: Path to the draft FASTA file; also used here to look
                       up contig lengths
    :param truth_bam: Path to the truth BAM file, forwarded to the workers
    :param output_path: Output location, forwarded to the workers
    :param total_threads: Size of the process pool and number of tasks
                          submitted
    :param train_mode: Training flag, forwarded to the workers
    :return: None. Progress, errors and elapsed time are written to stderr
    """
    # Training and inference use the same window size.
    max_size = 1000

    start_time = time.time()
    fasta_handler = PEPPER.FASTA_handler(draft_file)

    def timestamp():
        # Timestamp prefix shared by every log line below.
        return datetime.now().strftime('%m-%d-%Y %H:%M:%S')

    def last_position(contig):
        # Index of the final base of the contig (0-based).
        return fasta_handler.get_chromosome_sequence_length(str(contig)) - 1

    contigs = set()
    all_intervals = []

    # First pass: enumerate every window that has to be processed.
    for chr_name, region in chr_list:
        contigs.add(str(chr_name))

        if region:
            # Clamp the requested region to the contig boundaries.
            interval_start, interval_end = tuple(region)
            interval_start = max(0, interval_start)
            interval_end = min(interval_end, last_position(chr_name))
        else:
            interval_start, interval_end = 0, last_position(chr_name)

        # Each window spans max_size positions, padded on both sides by
        # MIN_IMAGE_OVERLAP and clipped to the interval.
        all_intervals.extend(
            (chr_name,
             max(interval_start, pos - ImageSizeOptions.MIN_IMAGE_OVERLAP),
             min(interval_end, pos + max_size + ImageSizeOptions.MIN_IMAGE_OVERLAP))
            for pos in range(interval_start, interval_end, max_size)
        )

    # All windows are known now; report the workload.
    sys.stderr.write("[{}] INFO: TOTAL CONTIGS: {} TOTAL INTERVALS: {}\n".format(
        timestamp(), str(len(contigs)), str(len(all_intervals))))
    sys.stderr.flush()

    args = (output_path, bam_file, draft_file, truth_bam, train_mode)

    with concurrent.futures.ProcessPoolExecutor(max_workers=total_threads) as executor:
        # Every task receives the complete window list together with its own
        # thread_id and the total thread count.
        futures = []
        for thread_id in range(total_threads):
            futures.append(
                executor.submit(UserInterfaceSupport.image_generator,
                                args, all_intervals, total_threads, thread_id))

        for fut in concurrent.futures.as_completed(futures):
            failure = fut.exception()
            if failure is not None:
                sys.stderr.write("[{}] ERROR: {}\n".format(
                    str(timestamp()), str(failure)))
            else:
                # A successful task returns its thread id.
                thread_id = fut.result()
                sys.stderr.write("[{}] INFO: THREAD {} FINISHED SUCCESSFULLY.\n".format(
                    timestamp(), str(thread_id)))
            # Drop the stored result (workaround referenced as python
            # issue 27144).
            fut._result = None

    end_time = time.time()
    elapsed = end_time - start_time
    mins = int(elapsed / 60)
    secs = int(elapsed) % 60

    sys.stderr.write("[{}] INFO: FINISHED IMAGE GENERATION\n".format(str(timestamp())))
    sys.stderr.write("[{}] INFO: ELAPSED TIME: {} Min {} Sec\n".format(
        str(timestamp()), str(mins), str(secs)))