Example #1
    def __init__(self, chromosome_name, bam_file_path, reference_file_path,
                 vcf_path, train_mode, confident_tree):
        """
        Initialize a manager object
        :param chromosome_name: Name of the chromosome
        :param bam_file_path: Path to the BAM file
        :param reference_file_path: Path to the reference FASTA file
        :param vcf_path: Path to the VCF file
        :param train_mode: If True, run in training mode and generate labels from the VCF
        :param confident_tree: Dictionary of confident intervals keyed by chromosome. None if not provided.
        """
        # --- initialize handlers ---
        # create objects to handle different files and query
        self.bam_path = bam_file_path
        self.fasta_path = reference_file_path
        self.vcf_path = vcf_path
        self.bam_handler = FRIDAY.BAM_handler(bam_file_path)
        self.fasta_handler = FRIDAY.FASTA_handler(reference_file_path)
        self.train_mode = train_mode
        self.confident_tree = confident_tree[chromosome_name] if confident_tree else None
        self.interval_tree = IntervalTree(self.confident_tree) if confident_tree else None

        # --- initialize names ---
        # name of the chromosome
        self.chromosome_name = chromosome_name
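
A minimal construction sketch, assuming the manager is the View class instantiated in Example #7 below; the file paths here are placeholders, not real files:

view = View(chromosome_name="chr20",
            bam_file_path="alignments.bam",      # placeholder path
            reference_file_path="reference.fa",  # placeholder path
            vcf_path="variants.vcf",             # placeholder path
            train_mode=False,
            confident_tree=None)                 # no confident intervals supplied
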
Example #2
    def find_haplotypes(self, reads):
        # get the reference from the fasta file
        reference_sequence = self.fasta_handler.get_reference_sequence(
            self.contig, self.region_start, self.region_end)
        min_k, max_k = FRIDAY.DeBruijnGraph.find_min_k_from_ref(
            reference_sequence, DeBruijnGraphOptions.MIN_K,
            DeBruijnGraphOptions.MAX_K, DeBruijnGraphOptions.STEP_K)
        # no k-mer size in [MIN_K, MAX_K] builds the reference without a cycle
        if min_k == -1:
            return None, None

        # sweep k upward until a cycle-free graph yields haplotypes
        for kmer_size in range(min_k, max_k + 1, DeBruijnGraphOptions.STEP_K):
            dbg_graph = FRIDAY.DeBruijnGraph(self.region_start,
                                             self.region_end)
            haplotypes = dbg_graph.generate_haplotypes(reference_sequence,
                                                       reads, kmer_size)
            if haplotypes:
                return reference_sequence, haplotypes

        return reference_sequence, []
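
A hedged usage sketch for the k-mer sweep above, assuming the method lives on the same manager object (view) built in Example #1's sketch; reads is assumed to be fetched beforehand from the BAM handler (the fetching call itself is not shown here):

ref_seq, haplotypes = view.find_haplotypes(reads)
if ref_seq is None:
    # no k-mer size produced a cycle-free reference graph; skip this region
    pass
elif not haplotypes:
    # a graph was built at some k, but no alternative haplotypes survived
    pass
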
Example #3
    def perform_local_alignment(self, region_with_reads):
        if not region_with_reads.reads:
            return []
        # pad the alignment window on both sides with safe bases
        ref_start = min(region_with_reads.min_read_start,
                        region_with_reads.region_start) - AlingerOptions.ALIGNMENT_SAFE_BASES
        ref_end = max(region_with_reads.max_read_end,
                      region_with_reads.region_end) + AlingerOptions.ALIGNMENT_SAFE_BASES

        # if the padding did not extend past the region, realignment is unnecessary
        if ref_end <= region_with_reads.region_end:
            return region_with_reads.reads

        ref_suffix = self.fasta_handler.get_reference_sequence(self.chromosome_name,
                                                               region_with_reads.region_end,
                                                               ref_end)

        ref_prefix = self.fasta_handler.get_reference_sequence(self.chromosome_name,
                                                               ref_start,
                                                               region_with_reads.region_start)
        ref = self.fasta_handler.get_reference_sequence(self.chromosome_name,
                                                        region_with_reads.region_start,
                                                        region_with_reads.region_end)
        ref_seq = ref_prefix + ref + ref_suffix
        haplotypes = [ref_prefix + hap + ref_suffix for hap in region_with_reads.haplotypes]

        haplotypes = sorted(set(haplotypes))

        # nothing to realign against if the only haplotype is the reference itself
        if not haplotypes or haplotypes == [ref_seq]:
            return region_with_reads.reads

        aligner = FRIDAY.ReadAligner(ref_start, ref_end, ref_seq)

        realigned_reads = aligner.align_reads(haplotypes, region_with_reads.reads)

        return realigned_reads
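
region_with_reads is only accessed by attribute in this method, so a hypothetical stand-in with exactly the fields it reads could look like the following (the real container in the codebase may differ):

from collections import namedtuple

# hypothetical container; field names mirror the attribute accesses above
RegionWithReads = namedtuple("RegionWithReads",
                             ["reads", "region_start", "region_end",
                              "min_read_start", "max_read_end", "haplotypes"])
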
Example #4
    def find_active_region(self, reads):
        # find the active region
        active_region_finder = FRIDAY.ActiveRegionFinder(
            self.reference_sequence, self.contig, self.region_start,
            self.region_end)

        # find active regions
        active_regions = active_region_finder.find_active_region(reads)

        return active_regions
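
A sketch of how the pieces above might chain together in a caller, assuming a region-by-region loop (the exact plumbing between active regions and the later steps is not shown in these examples):

active_regions = view.find_active_region(reads)
for region in active_regions:
    # each active region would then be re-assembled into haplotypes
    # and the reads realigned against them; see Examples #2 and #3
    ...
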
Example #5
    def generate_pileup(self, reads, windows, positional_candidates, vcf_path,
                        train_mode):
        ref_start = max(0, windows[0][0] - CandidateFinderOptions.SAFE_BASES)
        ref_end = windows[-1][1] + CandidateFinderOptions.SAFE_BASES
        reference_sequence = self.fasta_handler.get_reference_sequence(
            self.chromosome_name, ref_start, ref_end)

        # image generator object
        image_generator = FRIDAY.ImageGenerator(reference_sequence,
                                                self.chromosome_name,
                                                ref_start, ref_end,
                                                positional_candidates)
        if train_mode:
            vcf_handler = FRIDAY.VCF_handler(vcf_path)
            # pull VCF records with a small margin around the window for labeling
            positional_vcf = vcf_handler.get_positional_vcf_records(
                self.chromosome_name, ref_start - 20, ref_end + 20)
            image_generator.set_positional_vcf(positional_vcf)

        pileup_images = image_generator.create_window_pileups(
            windows, reads, train_mode)

        return pileup_images
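
A hedged call sketch; the window bounds and VCF path are placeholders, and positional_candidates is assumed to be the candidate map produced by a candidate finder such as the one in Example #6:

pileup_images = view.generate_pileup(reads=reads,
                                     windows=[(260000, 260999)],  # placeholder window
                                     positional_candidates=candidate_map,
                                     vcf_path="variants.vcf",     # placeholder path
                                     train_mode=True)
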
Example #6
    def find_candidates(self, reads):
        ref_start = max(
            0, self.region_start - (CandidateFinderOptions.SAFE_BASES * 2))
        ref_end = self.region_end + (CandidateFinderOptions.SAFE_BASES * 2)

        reference_sequence = self.fasta_handler.get_reference_sequence(
            self.contig, ref_start, ref_end)
        # candidate finder object over a padded reference window
        candidate_finder = FRIDAY.CandidateFinder(
            reference_sequence, self.contig,
            max(0, self.region_start - CandidateFinderOptions.SAFE_BASES),
            self.region_end + CandidateFinderOptions.SAFE_BASES, ref_start,
            ref_end)

        # find candidates
        candidate_positions, candidate_map = candidate_finder.find_candidates(
            reads)

        # keep only candidates that fall inside the requested region
        filtered_positions = [pos for pos in candidate_positions
                              if self.region_start <= pos <= self.region_end]

        return filtered_positions, candidate_map
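
The reference is fetched with twice the padding that bounds the finder's own search window, and the results are then filtered back to the original region. A worked sketch with illustrative numbers (SAFE_BASES = 50 is an assumption, not the real constant):

# illustrative values only
region_start, region_end, SAFE_BASES = 1000, 2000, 50
ref_start = max(0, region_start - SAFE_BASES * 2)  # 900: reference fetch window
ref_end = region_end + SAFE_BASES * 2              # 2100
finder_start = max(0, region_start - SAFE_BASES)   # 950: candidate search window
finder_end = region_end + SAFE_BASES               # 2050
# candidates outside [1000, 2000] are discarded at the end
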
Example #7
def chromosome_level_parallelization(chr_list,
                                     bam_file,
                                     ref_file,
                                     vcf_file,
                                     confident_intervals,
                                     output_path,
                                     image_path,
                                     total_threads,
                                     thread_id,
                                     train_mode,
                                     max_size=1000):
    """
    This method takes one chromosome name as parameter and chunks that chromosome in max_threads.
    :param chr_list: List of chromosomes to be processed
    :param bam_file: path to BAM file
    :param ref_file: path to reference FASTA file
    :param vcf_file: path to VCF file
    :param max_size: Maximum size of a segment
    :param output_path: path to output directory
    :return:
    """
    # handler used to query chromosome lengths from the reference
    fasta_handler = FRIDAY.FASTA_handler(ref_file)

    for chr_name in chr_list:
        interval_start, interval_end = (
            0, fasta_handler.get_chromosome_sequence_length(chr_name) + 1)

        all_intervals = []
        for pos in range(interval_start, interval_end, max_size):
            all_intervals.append((pos, min(interval_end, pos + max_size - 1)))

        intervals = [
            r for i, r in enumerate(all_intervals)
            if i % total_threads == thread_id
        ]

        view = View(chromosome_name=chr_name,
                    bam_file_path=bam_file,
                    reference_file_path=ref_file,
                    vcf_path=vcf_file,
                    train_mode=train_mode,
                    confident_tree=confident_intervals)

        smry = None
        image_file_name = image_path + chr_name + "_" + str(
            thread_id) + ".h5py"
        if intervals:
            smry = open(
                output_path + chr_name + "_" + str(thread_id) + "_summary.csv",
                'w')

        start_time = time.time()
        total_reads_processed = 0
        total_windows = 0
        all_images = []
        all_labels = []
        global_index = 0
        for interval in intervals:
            _start, _end = interval
            n_reads, n_windows, images, candidate_map = view.parse_region(
                start_position=_start, end_position=_end)
            total_reads_processed += n_reads
            total_windows += n_windows

            if not images or not candidate_map:
                continue
            # save the dictionary
            dictionary_file_path = image_path + chr_name + "_" + str(_start) + "_" + str(_end) + "_" + str(thread_id) \
                                   + ".pkl"
            with open(dictionary_file_path, 'wb') as f:
                pickle.dump(candidate_map, f, pickle.HIGHEST_PROTOCOL)

            # save the images
            for i, image in enumerate(images):
                record = (image.chromosome_name, image.start_pos,
                          image.end_pos)

                all_images.append(image.image_alt1)
                if train_mode:
                    all_labels.append(image.label_alt1)

                # write to the summary file; the trailing 1 marks the alt-1 image
                summary_string = image_file_name + "," + str(global_index) + "," + dictionary_file_path + "," + \
                                 ' '.join(map(str, record)) + " 1\n"
                smry.write(summary_string)
                global_index += 1

                all_images.append(image.image_alt2)
                if train_mode:
                    all_labels.append(image.label_alt2)

                # the trailing 2 marks the alt-2 image
                summary_string = image_file_name + "," + str(global_index) + "," + dictionary_file_path + "," + \
                                 ' '.join(map(str, record)) + " 2\n"
                smry.write(summary_string)

                global_index += 1

        if smry:
            smry.close()

        hdf5_file = h5py.File(image_file_name, mode='w')
        # image dataset, stored in the h5py file under the key "images"
        img_dset = hdf5_file.create_dataset(
            "images", (len(all_images), ) +
            (ImageSizeOptions.IMAGE_HEIGHT, ImageSizeOptions.SEQ_LENGTH,
             ImageSizeOptions.IMAGE_CHANNELS),
            np.uint8,
            compression='gzip')
        label_dataset = hdf5_file.create_dataset(
            "labels", (len(all_labels), ) + (ImageSizeOptions.SEQ_LENGTH, ),
            np.uint8)
        # save the images (and, in train mode, the labels) to the h5py file
        if all_images:
            img_dset[...] = all_images
        if train_mode and all_labels:
            label_dataset[...] = all_labels
        hdf5_file.close()

        print("CHROMOSOME: ", chr_name, "THREAD ID: ", thread_id, "READS: ",
              total_reads_processed, "WINDOWS: ", total_windows,
              "TOTAL TIME ELAPSED: ",
              int(math.floor(time.time() - start_time) / 60), "MINS",
              math.ceil(time.time() - start_time) % 60, "SEC")
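
A hedged sketch of fanning this function out across workers with the standard library; the worker count, chromosome list, and paths are assumptions, not part of the original code:

from multiprocessing import Process

total_threads = 4  # assumed worker count
workers = []
for thread_id in range(total_threads):
    p = Process(target=chromosome_level_parallelization,
                args=(["chr20"], "alignments.bam", "reference.fa",
                      "variants.vcf", None, "out/", "images/",
                      total_threads, thread_id, False))
    p.start()
    workers.append(p)
for p in workers:
    p.join()
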