Example #1
    def process_db(self, entry_name, profile_db_path, bam_file_path):
        """Function that does everything.

        `entry_name` is the entry name in bams and profiles file.
        """

        self.progress.new(f"Processing '{entry_name}'")

        ################################################################################
        self.progress.update("Recovering the coverage data")
        ################################################################################

        profile_db = dbops.ProfileSuperclass(argparse.Namespace(
            profile_db=profile_db_path, contigs_db=self.contigs_db_path),
                                             r=run_quiet,
                                             p=progress_quiet)
        sample_id = profile_db.p_meta['sample_id']

        # here we open our BAM file, optionally with an 'inversions' fetch filter.
        # we will access it later when it is time to get the FWD/FWD and
        # REV/REV reads.
        bam_file = bamops.BAMFileObject(bam_file_path, 'rb')

        if self.process_only_inverted_reads:
            bam_file.fetch_filter = 'inversions'
        else:
            bam_file.fetch_filter = None
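        # NOTE: per the comment above, the 'inversions' fetch filter is meant to restrict
        # iteration over this BAM object to FWD/FWD and REV/REV read pairs (the read
        # orientations that can support an inversion); with the filter set to None,
        # every read is visited.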

        ################################################################################
        self.progress.update("Computing coverage stretches")
        ################################################################################
        # populate coverage stretches in contigs based on coverage data in this
        # particular profile_db. we will then go through each stretch to find
        # those that include palindromic sequences
        contig_coverages = {}
        coverage_stretches_in_contigs = {}
        for contig_name in self.contig_names:
            contig_coverage = np.array([])

            split_names = self.contig_name_to_split_names[contig_name]

            # stitch the contig coverage back together from its split
            # coverages, in split order
            for split_name in split_names:
                split_coverages = auxiliarydataops.AuxiliaryDataForSplitCoverages(
                    profile_db.auxiliary_data_path,
                    profile_db.p_meta['contigs_db_hash']).get(split_name)
                contig_coverage = np.concatenate(
                    (contig_coverage, split_coverages[sample_id]), axis=None)

            # now we know the `contig_coverage`. it is time to break it into stretches
            # of 'high coverage' regions (as in coverage >= `self.min_coverage_to_define_stretches`),
            # and store that information in the dictionary `coverage_stretches_in_contigs`
            coverage_stretches_in_contigs[contig_name] = []

            # we also know the contig length here, so let's keep that in mind:
            contig_length = len(contig_coverage)

            # to find regions of high coverage, we first need to 'pad' our array to ensure it always
            # starts and ends with 'low coverage'.
            regions_of_contig_covered_enough = np.hstack(
                [[False],
                 contig_coverage >= self.min_coverage_to_define_stretches,
                 [False]])

            regions_of_contig_covered_enough_diff = np.diff(
                regions_of_contig_covered_enough.astype(int))
            cov_stretch_start_positions = np.where(
                regions_of_contig_covered_enough_diff == 1)[0]
            cov_stretch_end_positions = np.where(
                regions_of_contig_covered_enough_diff == -1)[0]

            # at this stage, `cov_stretch_start_positions` and `cov_stretch_end_positions` contain pairs of
            # positions that mark the beginning and end of stretches. we will remove those that are too
            # short to be considered, and store the start/end positions for the remaining stretches of
            # high coverage in the dictionary `coverage_stretches_in_contigs`
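            # a tiny worked example of the padding/diff trick above (hypothetical values,
            # with `min_coverage_to_define_stretches` = 5):
            #
            #   contig_coverage  = [0, 9, 9, 9, 0]
            #   padded booleans  = [F, F, T, T, T, F, F]
            #   diff of ints     = [0, 1, 0, 0, -1, 0]
            #   start positions  = [1]   (where the diff is 1)
            #   end positions    = [4]   (where the diff is -1)
            #
            # so `contig_coverage[1:4]` is the single stretch of high coverage.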
            for cov_stretch_start, cov_stretch_end in zip(
                    cov_stretch_start_positions, cov_stretch_end_positions):
                if (cov_stretch_end -
                        cov_stretch_start) >= self.min_stretch_length:
                    coverage_stretches_in_contigs[contig_name].append(
                        (cov_stretch_start, cov_stretch_end))

            # now it is time to merge those stretches of coverage if they are close to one another, to avoid
            # over-splitting areas of coverage due to short low-coverage regions in the middle. in a case
            # like this, we wish to identify A and B together as a single stretch:
            #
            #                A         B
            #
            #                -         -
            #               ---        --
            #              -----      -----
            #             --------   --------
            #           -----------------------
            # -----------------------------------------------
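            # a hypothetical example of what the merge is expected to do: with
            # `min_distance_between_independent_stretches` = 10, the stretches
            # [(100, 200), (205, 300), (500, 600)] would become [(100, 300), (500, 600)]:
            # the first gap (5 nts) is closed, the second one is large enough to keep.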
            coverage_stretches_in_contigs[contig_name] = utils.merge_stretches(
                coverage_stretches_in_contigs[contig_name],
                min_distance_between_independent_stretches=self.min_distance_between_independent_stretches)
            # extend start and stop positions of merged stretches to ENSURE we are not
            # missing important information because bioinformatics.
            coverage_stretches_in_contigs[contig_name] = \
                [(max(0, start - self.num_nts_to_pad_a_stretch),
                  min(contig_length, stop + self.num_nts_to_pad_a_stretch))
                     for start, stop in coverage_stretches_in_contigs[contig_name]]
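            # e.g., with `num_nts_to_pad_a_stretch` = 100 (hypothetical), a stretch
            # (40, 950) in a 1000 nt contig becomes (0, 1000): padded on both sides,
            # clipped at the contig boundaries.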

            contig_coverages[contig_name] = contig_coverage

        ################################################################################
        self.progress.update("Getting ready to process stretches")
        ################################################################################
        # time to go through each stretch and look for palindromes
        # first, we will set up the Palindromes class
        _args = argparse.Namespace(
            min_palindrome_length=self.min_palindrome_length,
            max_num_mismatches=self.max_num_mismatches,
            min_distance=self.min_distance_palindrome)

        P = Palindromes(_args, run=run_quiet, progress=progress_quiet)
        P.verbose = False

        # now we can go through all the stretches to look for palindromes. this is a LOOOOOONG loop.
        # down below, we will go through each contig name, find stretches of good coverage of FWD/FWD
        # and REV/REV reads (since their coverage values are stored in the profile db of 'inversions'
        # type), find palindromes in those sequences that match to those coverage stretches, build some
        # constructs, and then go through every FWD/FWD and REV/REV read from the BAM file to see if
        # our constructs occur in any of them, which is the only 100% proof of an active inversion.
        for contig_name in coverage_stretches_in_contigs:
            contig_sequence = self.contig_sequences[contig_name]['sequence']
            for start, stop in coverage_stretches_in_contigs[contig_name]:
                stretch_sequence_coverage = contig_coverages[contig_name][
                    start:stop]
                stretch_sequence = contig_sequence[start:stop]
                sequence_name = f"{contig_name}_{start}_{stop}"

                # if the user wants to learn about only a single sequence, we only
                # focus on that one and prematurely go to the next stretch unless
                # there is a match
                if self.only_report_from and sequence_name != self.only_report_from:
                    continue

                # before we go any further, let's print out the sequence in consideration
                # for the user if they used `--verbose`
                if anvio.DEBUG or self.verbose:
                    self.progress.reset()
                    self.run.warning(None,
                                     header=f"Palindromes in {sequence_name}",
                                     lc='yellow',
                                     nl_before=3)
                    self.run.info_single(f"Sequence {stretch_sequence}",
                                         cut_after=0)
                    self.run.info_single("Coverage:", nl_before=1, nl_after=1)
                    self.plot_coverage(f"{sequence_name}",
                                       stretch_sequence_coverage)

                ################################################################################
                self.progress.update(f"{contig_name}: looking for palindromes")
                ################################################################################
                P.find(stretch_sequence,
                       sequence_name=sequence_name,
                       display_palindromes=False)
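
                # `P.find` populates `P.palindromes[sequence_name]` with a (possibly
                # empty) list of palindrome hits for this stretch, which we inspect below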

                if not len(P.palindromes[sequence_name]):
                    # there is no palindrome in this one
                    if anvio.DEBUG or self.verbose:
                        self.progress.reset()
                        self.run.info_single("No palindromes in this one :/",
                                             mc="red")
                    continue
                else:
                    if anvio.DEBUG or self.verbose:
                        self.progress.reset()
                        self.run.info_single(
                            f"The sequence has {PL('palindrome', len(P.palindromes[sequence_name]))}:",
                            mc="green")

                ################################################################################
                self.progress.update(f"{contig_name}: building constructs")
                ################################################################################
                # this is important. here we are getting ready to test each of our inversion
                # candidates by reconstructing Florian's imaginary sequences. in the next step
                # we will see if any of these sequences occur in any of the FWD/FWD or REV/REV reads
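                # schematically, relative to `stretch_sequence` (the 6 nt flanks A-D are
                # read straight off the reference around the two palindrome arms):
                #
                #   ...[ A ][ first_sequence ][ B ]...[ C ][ second_sequence ][ D ]...
                #
                # the constructs below then model the two possible recombined (inverted)
                # versions of this locus, e.g. v1_left = A + first_sequence + rev_comp(C)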
                inversion_candidates = []
                for inversion_candidate in P.palindromes[sequence_name]:
                    region_A_start = inversion_candidate.first_start - 6
                    region_A_end = inversion_candidate.first_start
                    region_A = stretch_sequence[region_A_start:region_A_end]

                    region_B_start = inversion_candidate.first_end
                    region_B_end = inversion_candidate.first_end + 6
                    region_B = stretch_sequence[region_B_start:region_B_end]

                    region_C_start = inversion_candidate.second_start - 6
                    region_C_end = inversion_candidate.second_start
                    region_C = stretch_sequence[region_C_start:region_C_end]

                    region_D_start = inversion_candidate.second_end
                    region_D_end = inversion_candidate.second_end + 6
                    region_D = stretch_sequence[region_D_start:region_D_end]

                    construct_v1_left = region_A + inversion_candidate.first_sequence + utils.rev_comp(
                        region_C)
                    construct_v1_right = utils.rev_comp(
                        region_B) + utils.rev_comp(
                            inversion_candidate.second_sequence) + region_D

                    construct_v2_left = region_A + inversion_candidate.second_sequence + utils.rev_comp(
                        region_C)
                    construct_v2_right = utils.rev_comp(
                        region_B) + utils.rev_comp(
                            inversion_candidate.first_sequence) + region_D

                    # update the palindrome instance with its constructs
                    inversion_candidate.v1_left = construct_v1_left
                    inversion_candidate.v1_right = construct_v1_right
                    inversion_candidate.v2_left = construct_v2_left
                    inversion_candidate.v2_right = construct_v2_right

                    if anvio.DEBUG or self.verbose:
                        self.progress.reset()
                        inversion_candidate.display()
                        self.run.info("Construct v1 left",
                                      construct_v1_left,
                                      mc="cyan")
                        self.run.info("Construct v1 right",
                                      construct_v1_right,
                                      mc="cyan")
                        self.run.info("Construct v2 left",
                                      construct_v2_left,
                                      mc="cyan")
                        self.run.info("Construct v2 right",
                                      construct_v2_right,
                                      mc="cyan")

                    inversion_candidates.append(inversion_candidate)

                # at this point, for a given `contig_name` and the `start`/`stop` positions
                # of a stretch in it, we have our inversion candidates in hand. time to test them.
                ################################################################################
                self.progress.update(
                    f"{contig_name}[{start}:{stop}]: testing constructs")
                ################################################################################
                true_inversion = None

                for read in bam_file.fetch_only(contig_name,
                                                start=start,
                                                end=stop):
                    for inversion_candidate in inversion_candidates:
                        if (inversion_candidate.v1_left in read.query_sequence
                                or inversion_candidate.v1_right in read.query_sequence
                                or inversion_candidate.v2_left in read.query_sequence
                                or inversion_candidate.v2_right in read.query_sequence):
                            true_inversion = inversion_candidate
                            break

                    if true_inversion:
                        # a single perfect match to any construct is all the
                        # proof we need, so we can stop going through reads
                        break

                if anvio.DEBUG or self.verbose:
                    if true_inversion:
                        self.progress.reset()
                        self.run.info_single(
                            f"Of the {PL('inversion candidate', len(inversion_candidates))} above, "
                            f"the one below had at least one perfect match to its constructs in REV/REV or "
                            f"FWD/FWD reads from the BAM file:",
                            mc="green",
                            nl_before=1)

                        true_inversion.display()
                    else:
                        self.progress.reset()
                        self.run.info_single(
                            f"No true inversions in this one: none of the REV/REV or FWD/FWD reads "
                            f"had any of the constructs in {PL('inversion candidate', len(inversion_candidates))}.",
                            mc="red",
                            nl_before=1)

        self.progress.end()
Example #2
    def populate_misc_data_tables(self):
        self.run.info_single("Additional data and layer orders...",
                             nl_before=1,
                             nl_after=1,
                             mc="blue")

        essential_fields = [
            f for f in self.atomic_data_fields
            if constants.IS_ESSENTIAL_FIELD(f)
        ]

        # initialize views.
        args = argparse.Namespace(profile_db=self.merged_profile_db_path)
        profile_db_super = dbops.ProfileSuperclass(args)
        profile_db_super.load_views(omit_parent_column=True)
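        # after `load_views`, `profile_db_super.views[field]['dict']` holds per-item,
        # per-sample values for each view; `transpose=True` below is what makes the
        # clustering group samples rather than items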

        # figure out layer orders dictionary
        layer_orders_data_dict = {}
        failed_attempts = []
        self.progress.new('Working on layer orders')
        for essential_field in essential_fields:
            self.progress.update('recovering order for "%s"' %
                                 (essential_field))
            try:
                data_value = clustering.get_newick_tree_data_for_dict(
                    profile_db_super.views[essential_field]['dict'],
                    distance=self.distance,
                    linkage=self.linkage,
                    transpose=True)

                layer_orders_data_dict[essential_field] = {
                    'data_value': data_value,
                    'data_type': 'newick'
                }
            except Exception:
                failed_attempts.append(essential_field)
        self.progress.end()

        if not len(layer_orders_data_dict):
            self.run.warning(
                "This may or may not be important: anvi'o attempted to generate orders for your\
                              samples based on the view data, however, it failed :/"
            )
            return

        if len(failed_attempts):
            self.run.warning("While anvi'o was trying to generate clusterings of samples based on view data\
                              available in the merged profile, clustering of some of the essential data\
                              failed. It is likely not a very big deal, but you shall be the judge of it.\
                              Anvi'o now proceeds to store layers order information for those view items\
                              the clustering in fact worked. Here is the list of stuff that failed: '%s'"\
                              % (', '.join(failed_attempts)))

        # add the layer orders quietly
        TableForLayerOrders(
            args, r=terminal.Run(verbose=False)).add(layer_orders_data_dict)
        self.run.warning(None, header="Layer orders added", lc='cyan')
        for layer_order in layer_orders_data_dict:
            self.run.info_single(layer_order, mc='cyan')

        # done with layer orders. let's add our layer additional data and call it a day.
        for data_group_name in self.layer_additional_data_dict:
            args.target_data_group = data_group_name
            TableForLayerAdditionalData(
                args, r=terminal.Run(verbose=False)).add(
                    self.layer_additional_data_dict[data_group_name],
                    list(self.layer_additional_data_keys[data_group_name]))

        self.run.warning(None, header="Data groups added", lc='cyan')
        for data_group in self.layer_additional_data_dict:
            self.run.info_single(
                '%s (w/%d items)' %
                (data_group, len(self.layer_additional_data_keys[data_group])),
                mc='cyan')
Example #3
    def gen_samples_db_for_the_merged_profile(self):
        """Geenrate a samples db for the merged profile.

           We use the ProfileSuperclass to load all the views we added into the meged profile,
           and generate clusterings of samples for each view to generate a default samples database."""

        self.run.info_single("SAMPLES.db stuff...",
                             nl_before=1,
                             nl_after=1,
                             mc="blue")

        essential_fields = [
            f for f in self.atomic_data_fields
            if constants.IS_ESSENTIAL_FIELD(f)
        ]

        args = argparse.Namespace(profile_db=self.merged_profile_db_path)

        # initialize views.
        profile_db_super = dbops.ProfileSuperclass(args)
        profile_db_super.load_views(omit_parent_column=True)

        # figure out sample orders dictionary
        sample_orders = {}
        failed_attempts = []
        self.progress.new('Working on SAMPLES.db')
        for essential_field in essential_fields:
            self.progress.update('recovering samples order for "%s"' %
                                 (essential_field))
            try:
                sample_orders[essential_field] = \
                        clustering.get_newick_tree_data_for_dict(profile_db_super.views[essential_field]['dict'],
                                                                 distance=self.distance,
                                                                 linkage=self.linkage,
                                                                 transpose=True)
            except Exception:
                failed_attempts.append(essential_field)
        self.progress.end()

        if not len(sample_orders):
            self.run.warning(
                "This may or may not be important: anvi'o attempted to generate a samples\
                              database for this merged profile, however, all attempts to cluster samples\
                              based on view data available in the merged profile failed. No samples db\
                              for you :/")
            return

        if len(failed_attempts):
            self.run.warning("While anvi'o was trying to generate clusterings of samples based on view data\
                              available in the merged profile, clustering of some of the essential data\
                              failed. It is likely not a very big deal, but you shall be the judge of it.\
                              Anvi'o now proceeds to generate a samples db with clusterings it generated\
                              using the view data that worked. Here is the list of stuff that failed: '%s'"\
                              % (', '.join(failed_attempts)))

        # generate the samples order file
        samples_order_file_path = filesnpaths.get_temp_file_path()
        with open(samples_order_file_path, 'w') as samples_order_file:
            samples_order_file.write('attributes\tbasic\tnewick\n')
            for sample_order in sample_orders:
                samples_order_file.write(
                    '%s\t%s\t%s\n' %
                    (sample_order, '', sample_orders[sample_order]))
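        # for the record, each row written above is the order name, an empty 'basic'
        # column, and a newick tree over the samples; a (hypothetical) row might look
        # like "mean_coverage\t\t((s1,s2),s3);"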

        # figure out samples information stuff
        samples_information = {}
        headers = []
        for sample_name in self.sample_ids_found_in_input_dbs:
            samples_information[sample_name] = {}

        self.progress.new('Working on SAMPLES.db')
        self.progress.update('...')

        # figure out num reads mapped per sample:
        for sample_name in self.sample_ids_found_in_input_dbs:
            samples_information[sample_name][
                'num_mapped_reads'] = self.total_reads_mapped_per_sample[
                    sample_name]

        self.progress.end()
        # generate the samples information file
        samples_information_file_path = filesnpaths.get_temp_file_path()
        utils.store_dict_as_TAB_delimited_file(samples_information,
                                               samples_information_file_path,
                                               headers=headers)

        # generate the samples database
        samples_db = dbops.SamplesInformationDatabase(self.samples_db_path,
                                                      quiet=False)
        samples_db.create(
            samples_order_path=samples_order_file_path,
            samples_information_path=samples_information_file_path)

        os.remove(samples_order_file_path)
        os.remove(samples_information_file_path)

        self.run.info('Samples database', self.samples_db_path)