Example #1
0
    def __init__(self, chromosome_name, bam_file_path, draft_file_path,
                 truth_bam, train_mode):
        """
        Set up the file handlers and bookkeeping needed to work on one chromosome.
        :param chromosome_name: Name of the chromosome
        :param bam_file_path: Path to the BAM file
        :param draft_file_path: Path to the FASTA file given to the FASTA handler
        :param truth_bam: Path to the truth-to-reference BAM; only opened when
                          train_mode is truthy
        :param train_mode: When truthy, a BAM handler for truth_bam is created too
        """
        # --- remember where the inputs live ---
        self.bam_path = bam_file_path
        self.fasta_path = draft_file_path

        # --- open the handlers used to query the BAM and the FASTA ---
        self.bam_handler = PEPPER.BAM_handler(bam_file_path)
        self.fasta_handler = PEPPER.FASTA_handler(draft_file_path)

        # --- run configuration ---
        self.train_mode = train_mode
        self.downsample_rate = 1.0

        # the truth handler stays None unless we are in training mode
        self.truth_bam_handler = None
        if train_mode:
            self.truth_bam_handler = PEPPER.BAM_handler(truth_bam)

        # --- the chromosome this instance is responsible for ---
        self.chromosome_name = chromosome_name
Example #2
0
    def get_chromosome_list(chromosome_names, ref_file, bam_file, region_bed):
        """
        Work out which contigs (and optional sub-regions) have to be processed.

        The inputs are checked in this order:
        1. Neither chromosome_names nor region_bed is given: every contig name that
           appears in both the BAM file and the FASTA file is used, ordered with
           UserInterfaceSupport.natural_key.
        2. region_bed is given (it wins over chromosome_names): one entry per data
           line of the tab-separated BED file. Blank lines and lines starting with
           '#' are skipped.
        3. Otherwise chromosome_names is parsed as a comma separated list. Each item
           is a contig name ("chr20") or a numeric range of contigs ("chr1-5"),
           optionally followed by ":start-end".

        Invalid input is reported on stderr and terminates the process with exit
        status 1 (SystemExit).

        :param chromosome_names: Comma separated contig names/ranges, or None/empty
        :param ref_file: Path to the reference FASTA file (only opened in case 1)
        :param bam_file: Path to the BAM file (only opened in case 1)
        :param region_bed: Path to a BED file with the regions, or None/empty
        :return: List of (contig_name, region) tuples where region is None for a
                 whole contig or a two-element [start, end] list
        """

        def _abort(message):
            # Fatal input problems must end with a non-zero status so that calling
            # scripts and pipelines can detect the failure.
            sys.stderr.write(message)
            sys.stderr.flush()
            sys.exit(1)

        if not chromosome_names and not region_bed:
            fasta_handler = PEPPER.FASTA_handler(ref_file)
            bam_handler = PEPPER.BAM_handler(bam_file)
            bam_contigs = bam_handler.get_chromosome_sequence_names()
            fasta_contigs = fasta_handler.get_chromosome_names()
            common_contigs = set(fasta_contigs) & set(bam_contigs)

            if not common_contigs:
                _abort(
                    "[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') + "] " +
                    "ERROR: NO COMMON CONTIGS FOUND BETWEEN THE BAM FILE AND THE FASTA FILE.\n"
                )

            common_contigs = sorted(common_contigs,
                                    key=UserInterfaceSupport.natural_key)
            sys.stderr.write("[" +
                             datetime.now().strftime('%m-%d-%Y %H:%M:%S') +
                             "] INFO: COMMON CONTIGS FOUND: " +
                             str(common_contigs) + "\n")
            sys.stderr.flush()

            return [(contig_name, None) for contig_name in common_contigs]

        if region_bed:
            chromosome_name_list = []
            with open(region_bed) as bed_file:
                for line in bed_file:
                    line = line.rstrip()
                    # blank lines and '#' comments carry no region
                    if not line or line.startswith('#'):
                        continue
                    fields = line.split('\t')
                    chr_name = fields[0]
                    # sorted() tolerates records whose start/end are swapped
                    region = sorted([int(fields[1]), int(fields[2])])
                    chromosome_name_list.append((chr_name, region))
            return chromosome_name_list

        chromosome_name_list = []
        for name in chromosome_names.strip().split(','):
            name = name.strip()
            if not name:
                # a stray or trailing comma must not produce an empty contig name
                continue

            # split off the optional ":start-end" part
            region = None
            if ':' in name:
                name_region = name.split(':')
                if len(name_region) != 2:
                    _abort("ERROR: --region INVALID value.\n")

                name, region_text = name_region
                name = name.strip()
                try:
                    region = [int(pos) for pos in region_text.strip().split('-')]
                except ValueError:
                    # non-numeric or missing coordinate, e.g. "chr1:a-b" or "chr1:5-"
                    _abort("ERROR: --region INVALID value.\n")

                if len(region) != 2 or region[0] > region[1]:
                    _abort("ERROR: --region INVALID value.\n")

            # "chr1-5" style ranges: only expand when every part carries a number,
            # otherwise the hyphen belongs to the contig name itself ("contig-A").
            range_split = name.split('-')
            is_numeric_range = len(range_split) > 1 and all(
                any(ch.isdigit() for ch in part) for part in range_split)
            if not is_numeric_range:
                chromosome_name_list.append((name, region))
                continue

            # the prefix is everything before the first digit of the item
            prefix_end = next(
                (idx for idx, ch in enumerate(name) if ch.isdigit()), len(name))
            chr_prefix = name[:prefix_end]

            int_ranges = sorted(
                int(''.join(ch for ch in part if ch.isdigit()))
                for part in range_split)

            for chr_seq in range(int_ranges[0], int_ranges[-1] + 1):
                chromosome_name_list.append((chr_prefix + str(chr_seq), region))

        return chromosome_name_list
Example #3
0
    def chromosome_level_parallelization(chr_list, bam_file, draft_file,
                                         truth_bam, output_path, total_threads,
                                         train_mode):
        """
        Cut the requested contigs/regions into overlapping windows and hand them to
        UserInterfaceSupport.image_generator running in a pool of worker processes.
        Progress, worker errors and the elapsed time are written to stderr.
        :param chr_list: Iterable of (contig_name, region) pairs; a falsy region means
                         the whole contig, otherwise region unpacks to (start, end)
        :param bam_file: Path to the BAM file, forwarded to the workers
        :param draft_file: Path to the FASTA file; also used here to look up contig lengths
        :param truth_bam: Forwarded to the workers unchanged
        :param output_path: Forwarded to the workers unchanged
        :param total_threads: Size of the process pool; one task is submitted per worker id
        :param train_mode: Forwarded to the workers unchanged
        :return: None
        """

        def timestamp():
            # same timestamp layout for every log line below
            return datetime.now().strftime('%m-%d-%Y %H:%M:%S')

        # training and inference use the same window length
        window_size = 1000

        started = time.time()
        fasta_handler = PEPPER.FASTA_handler(draft_file)
        seen_contigs = set()
        windows = []

        # work out every window that has to be processed
        for chr_name, region in chr_list:
            contig = str(chr_name)
            seen_contigs.add(contig)

            if region:
                # clamp the requested region to the contig boundaries
                first, last = tuple(region)
                first = max(0, first)
                last = min(
                    last,
                    fasta_handler.get_chromosome_sequence_length(contig) - 1)
            else:
                # no region given: cover the contig from its first to its last base
                first = 0
                last = fasta_handler.get_chromosome_sequence_length(contig) - 1

            # windows start every window_size bases and are padded on both sides
            # by MIN_IMAGE_OVERLAP, without leaving [first, last]
            for anchor in range(first, last, window_size):
                window_start = max(
                    first, anchor - ImageSizeOptions.MIN_IMAGE_OVERLAP)
                window_end = min(
                    last,
                    anchor + window_size + ImageSizeOptions.MIN_IMAGE_OVERLAP)
                windows.append((chr_name, window_start, window_end))

        # summary of the work that is about to be distributed
        sys.stderr.write("[" + timestamp() + "] " + "INFO: TOTAL CONTIGS: " +
                         str(len(seen_contigs)) + " TOTAL INTERVALS: " +
                         str(len(windows)) + "\n")
        sys.stderr.flush()

        worker_args = (output_path, bam_file, draft_file, truth_bam, train_mode)
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=total_threads) as pool:
            # every worker receives the full window list plus its own id
            # (presumably it picks its share from those two values; confirm in image_generator)
            pending = [
                pool.submit(UserInterfaceSupport.image_generator, worker_args,
                            windows, total_threads, worker_id)
                for worker_id in range(total_threads)
            ]

            for done in concurrent.futures.as_completed(pending):
                failure = done.exception()
                if failure is not None:
                    # a failed worker is reported but does not stop the others
                    sys.stderr.write("[" + str(timestamp()) + "] ERROR: " +
                                     str(failure) + "\n")
                else:
                    finished_id = done.result()
                    sys.stderr.write("[" + timestamp() + "] " +
                                     "INFO: THREAD " + str(finished_id) +
                                     " FINISHED SUCCESSFULLY.\n")
                done._result = None  # python issue 27144

        elapsed = time.time() - started
        mins = int(elapsed / 60)
        secs = int(elapsed) % 60
        sys.stderr.write("[" + str(timestamp()) +
                         "] INFO: FINISHED IMAGE GENERATION\n")
        sys.stderr.write("[" + str(timestamp()) + "] INFO: ELAPSED TIME: " +
                         str(mins) + " Min " + str(secs) + " Sec\n")