Example #1
0
 def read_loci_meta_counts(self, fivepseq_out):
     """Load the loci meta counts of a fivepseq output, if the file exists.

     :param fivepseq_out: output handle used to resolve the path of
         FivePSeqOut.LOCI_PAUSES_FILE
     :return: whatever CountManager.read_meta_counts returns for that file,
         or None when the file is not present on disk
     """
     loci_path = fivepseq_out.get_file_path(FivePSeqOut.LOCI_PAUSES_FILE)
     if not os.path.exists(loci_path):
         # The file is optional: its absence is not reported.
         return None

     self.logger.info("Loci count file found")
     return CountManager.read_meta_counts(loci_path)
Example #2
0
 def read_fft_signal_term(self, fivepseq_out):
     """Read the FivePSeqOut.FFT_SIGNALS_TERM file of a fivepseq output.

     :param fivepseq_out: output handle used to resolve the file path
     :return: whatever CountManager.read_meta_counts returns for the file,
         or None when reading it failed for any reason
     """
     file = fivepseq_out.get_file_path(FivePSeqOut.FFT_SIGNALS_TERM)
     try:
         fft_signals_term = CountManager.read_meta_counts(file)
     except Exception:
         # Best effort: a missing/unreadable file only disables the plots.
         # `Exception` (not a bare except) lets Ctrl-C and SystemExit through.
         self.logger.warning(
             "The file %s not found, plots for this will be skipped." %
             str(file))
         fft_signals_term = None
     return fft_signals_term
Example #3
0
 def read_frame_count_start(self, fivepseq_out):
     """Read the FivePSeqOut.FRAME_COUNTS_START_FILE of a fivepseq output.

     :param fivepseq_out: output handle used to resolve the file path
     :return: whatever CountManager.read_frame_counts returns for the file,
         or None when reading it failed for any reason
     """
     file = fivepseq_out.get_file_path(FivePSeqOut.FRAME_COUNTS_START_FILE)
     try:
         frame_count_start = CountManager.read_frame_counts(file)
     except Exception:
         # Best effort: a missing/unreadable file only disables the plots.
         # `Exception` (not a bare except) lets Ctrl-C and SystemExit through.
         self.logger.warning(
             "The file %s not found, plots for this will be skipped." %
             str(file))
         frame_count_start = None
     return frame_count_start
Example #4
0
 def read_count_vector_list_start(self, fivepseq_out):
     """Read the FivePSeqOut.COUNT_START_FILE of a fivepseq output.

     :param fivepseq_out: output handle used to resolve the file path
     :return: whatever CountManager.read_counts_as_list returns for the file,
         or None when reading it failed for any reason
     """
     file = fivepseq_out.get_file_path(FivePSeqOut.COUNT_START_FILE)
     try:
         count_vector_list_start = CountManager.read_counts_as_list(file)
     except Exception:
         # Best effort: a missing/unreadable file only disables the plots.
         # `Exception` (not a bare except) lets Ctrl-C and SystemExit through.
         self.logger.warning(
             "The file %s not found, plots for this will be skipped." %
             str(file))
         count_vector_list_start = None
     return count_vector_list_start
Example #5
0
 def read_meta_count_term(self, fivepseq_out):
     """Read the FivePSeqOut.META_COUNT_TERM_FILE of a fivepseq output.

     :param fivepseq_out: output handle used to resolve the file path
     :return: whatever CountManager.read_meta_counts returns for the file,
         or None when reading it failed for any reason
     """
     file = fivepseq_out.get_file_path(FivePSeqOut.META_COUNT_TERM_FILE)
     try:
         meta_count_term = CountManager.read_meta_counts(file)
     except Exception:
         # Best effort: a missing/unreadable file only disables the plots.
         # `Exception` (not a bare except) lets Ctrl-C and SystemExit through.
         self.logger.warning(
             "The file %s not found, plots for this will be skipped." %
             str(file))
         meta_count_term = None
     return meta_count_term
Example #6
0
def bokeh_transcript_scatter_plot(title,
                                  transcript_count_list_dict,
                                  transcript_assembly,
                                  color_dict,
                                  align_to,
                                  span_size,
                                  index_filter,
                                  min_count=0,
                                  max_count=1000,
                                  save_plot=True):
    """Plot one line per selected transcript, for every sample, in one figure.

    :param title: figure title; also the stem of the output HTML file name
    :param transcript_count_list_dict: maps a sample key to its list of
        per-transcript count vectors
    :param transcript_assembly: object whose ``ID`` and ``gene`` members are
        indexed by transcript index and hold strings containing
        "transcript:" / "gene:" prefixes
    :param color_dict: maps a sample key to a colour whose first three items
        are passed to RGB()
    :param align_to: forwarded to CountManager.count_vector_to_series and
        shown in the x-axis label
    :param span_size: forwarded to count_vector_to_series as ``tail``
    :param index_filter: transcript indices to plot, or None to plot every
        transcript of each sample
    :param min_count: exclusive lower bound on a transcript's total count
    :param max_count: exclusive upper bound on a transcript's total count
    :param save_plot: when true, the figure is handed to bokeh's show()
    :return: the bokeh figure
    """
    logger = logging.getLogger(config.FIVEPSEQ_PLOT_LOGGER)

    # index_filter may legitimately be None (handled below), so it must not
    # be passed to len() unconditionally.
    n_filtered = 0 if index_filter is None else len(index_filter)
    logger.info(
        "Plotting transcript specific counts. %d filtered transcript indices specified"
        % n_filtered)
    output_file(title + "_minCount-" + str(min_count) + "_maxCount-" +
                str(max_count) + ".html",
                mode="cdn")

    p = figure(title=title,
               y_range=(0, 200),
               x_axis_label="position from %s" % align_to,
               y_axis_label="5'seq read counts")

    for key, count_vector_list in transcript_count_list_dict.items():
        # Resolve the indices per sample instead of overwriting the argument,
        # so a None filter always spans the current sample's own list.
        if index_filter is None:
            indices = range(len(count_vector_list))
        else:
            indices = index_filter

        for index in indices:
            count_vector = count_vector_list[index]
            if not min_count < sum(count_vector) < max_count:
                continue

            count_series = CountManager.count_vector_to_series(
                count_vector, align_to, tail=span_size)
            transcript = transcript_assembly.ID[index].split(
                "transcript:")[1]
            gene = transcript_assembly.gene[index].split("gene:")[1]

            c = color_dict.get(key)
            if c is None:
                logger.warning("Color not set for sample %s" % key)
                # cl.scales holds 'rgb(r,g,b)' strings; convert to numeric
                # tuples so c[0], c[1], c[2] are colour components rather
                # than the characters 'r', 'g', 'b'.
                c = cl.to_numeric(cl.scales['9']['qual']['Set3'])[1]
            # NOTE(review): `legend=` is the older bokeh keyword; newer
            # releases expect `legend_label=` - confirm the pinned version.
            line = p.line(count_series.index,
                          count_series.values,
                          legend=key,
                          line_color=RGB(c[0], c[1], c[2]))

            # One hover tool per line so each shows its own transcript/gene.
            p.add_tools(
                HoverTool(tooltips=[('position', '@x'), ('count', '@y'),
                                    ('transcript', transcript),
                                    ('gene', gene)],
                          renderers=[line]))
    p.legend.click_policy = "hide"
    if save_plot:
        show(p)
    return p
Example #7
0
    def read_amino_acid_df(self, fivepseq_out, full=False):
        """Read the FivePSeqOut.AMINO_ACID_PAUSES_FILE of a fivepseq output.

        :param fivepseq_out: output handle used to resolve the file path
        :param full: when False and the table has more columns than
            ``self.dist_for_amino_acid_heatmaps``, only the columns labelled
            "-dist" ... "-1" are kept; when True the table is returned as read
        :return: whatever CountManager.read_amino_acid_df returns (possibly
            column-trimmed), or None when reading or trimming failed
        """
        file = fivepseq_out.get_file_path(FivePSeqOut.AMINO_ACID_PAUSES_FILE)
        try:
            amino_acid_df = CountManager.read_amino_acid_df(file)

            dist = self.dist_for_amino_acid_heatmaps
            if not full and amino_acid_df.shape[1] > dist:
                # Build a concrete list of labels: a lazy `map` object is a
                # one-shot iterator and a fragile argument for .loc.
                colrange = [str(pos) for pos in np.arange(-1 * dist, 0)]
                amino_acid_df = amino_acid_df.loc[:, colrange]

        except Exception:
            # Best effort: a missing/unreadable file only disables the plots.
            # `Exception` (not a bare except) lets Ctrl-C and SystemExit through.
            self.logger.warning(
                "The file %s not found, plots for this will be skipped." %
                str(file))
            amino_acid_df = None
        return amino_acid_df
Example #8
0
 def read_codon_df(self, fivepseq_out, basesort=False):
     """Read the FivePSeqOut.CODON_PAUSES_FILE of a fivepseq output.

     :param fivepseq_out: output handle used to resolve the file path
     :param basesort: when True, the first two "_"-separated parts of every
         index label are swapped and the rows are reordered by the sorted
         new labels
     :return: whatever CountManager.read_amino_acid_df returns for the file
         (possibly re-indexed), or None when reading or re-indexing failed
     """
     file = fivepseq_out.get_file_path(FivePSeqOut.CODON_PAUSES_FILE)
     try:
         codon_df = CountManager.read_amino_acid_df(file)
         if basesort:
             # Labels presumably look like "<aa>_<codon>"; they become
             # "<codon>_<aa>" so that sorting orders rows by codon.
             sorted_index = []
             for label in codon_df.index:
                 parts = label.split("_")
                 sorted_index.append(parts[1] + "_" + parts[0])
             codon_df.index = sorted_index
             codon_df = codon_df.reindex(sorted(sorted_index))
     except Exception:
         # Best effort: a missing/unreadable file only disables the plots.
         # `Exception` (not a bare except) lets Ctrl-C and SystemExit through.
         self.logger.warning(
             "The file %s not found, plots for this will be skipped." %
             str(file))
         codon_df = None
     return codon_df
Example #9
0
    def run_transcript_and_count_descriptors(self):
        """Write gene sets and transcript descriptors; load or write the count distribution.

        :return: True when the count distribution is empty (the caller should
            cancel the remaining calculations), False otherwise
        """
        out = self.fivepseq_out
        counts = self.fivepseq_counts

        # Gene sets are always re-written, because they can change at each run.
        if counts.annotation.gs_transcript_dict is not None:
            out.write_geneset_transcript_dict_to_file(
                counts.annotation.gs_transcript_dict, out.GENESET_FILE)

        # Only when self.skip() is truthy for BOTH the transcript descriptors
        # and the count distribution files is the stored distribution reused;
        # in every other case both files are generated.
        if self.skip(out.get_file_path(out.TRANSCRIPT_DESCRIPTORS_FILE)) and \
                self.skip(out.get_file_path(out.COUNT_DISTRIBUTION_FILE)):
            stored_distribution = CountManager.read_count_dict(
                out.get_file_path(out.COUNT_DISTRIBUTION_FILE))
            counts.set_count_distribution_dict(stored_distribution)
        else:
            out.write_df_to_file(counts.get_transcript_descriptors(),
                                 out.TRANSCRIPT_DESCRIPTORS_FILE)
            out.write_dict(counts.get_count_distribution_dict(),
                           out.COUNT_DISTRIBUTION_FILE)

        if len(counts.get_count_distribution()) == 0:
            self.logger.warning(
                "No reads found in coding regions. Fivepseq will skip the rest of calculations."
            )
            return True

        return False
Example #10
0
    def compute_fpi_per_transcript(self):
        """
        Build and store the per-transcript frame preference table.

        One row per row of the START frame counts table, with the columns:
        total count | preferred frame ("F") | count in that frame |
        frame percentage | FPI

        Nothing is computed when config.args.conflicts equals config.ADD_FILES
        and the output file already exists.

        :return:
        """
        reuse_existing = (
            config.args.conflicts == config.ADD_FILES
            and os.path.exists(
                self.fivepseq_out.get_file_path(
                    FivePSeqOut.TRANSCRIPT_FPI_FILE)))
        if reuse_existing:
            self.logger.info(
                "Skipping per-transcript frame preference calculation: file %s exists"
                % FivePSeqOut.TRANSCRIPT_FPI_FILE)
            return

        frame_counts_df = self.fivepseq_counts.get_frame_counts_df(
            FivePSeqCounts.START)
        self.logger.info(
            "Computing per-transcript frame preference statistics")

        column_names = [
            self.COUNT, "F", self.FRAME_COUNT, self.FRAME_PERC, self.FPI
        ]
        fpi_table = pd.DataFrame(index=frame_counts_df.index,
                                 columns=column_names)

        for transcript, frame_counts in frame_counts_df.iterrows():
            fpi, fmax, f_perc = CountManager.fpi_stats_from_frame_counts(
                frame_counts)

            fpi_table.at[transcript, self.COUNT] = sum(frame_counts)
            fpi_table.at[transcript, 'F'] = fmax
            fpi_table.at[transcript, self.FRAME_COUNT] = frame_counts[fmax]
            fpi_table.at[transcript, self.FRAME_PERC] = f_perc
            fpi_table.at[transcript, self.FPI] = fpi

        self.transcript_fpi_df = fpi_table
        self.fivepseq_out.write_df_to_file(fpi_table,
                                           FivePSeqOut.TRANSCRIPT_FPI_FILE)
0
    def run(self):
        """Generate every count output file for the current bam file.

        Each output is written only when ``self.skip(path)`` is falsy for its
        path. When the count distribution turns out to be empty, everything
        after the transcript descriptors is skipped. A sanity check of the
        output runs at the end in both cases and its result is logged.
        """
        cancel = False

        # info
        self.logger.info(
            "\n\nFivepseq started for bam file %s\n\n" %
            os.path.basename(self.fivepseq_counts.alignment.bam_file))

        #   transcript assembly
        if not self.skip(
                self.fivepseq_out.get_file_path(
                    self.fivepseq_out.TRANSCRIPT_ASSEMBLY_FILE)):
            self.fivepseq_out.write_transcript_assembly_to_file(
                self.fivepseq_counts.annotation.
                get_transcript_assembly_default_filter(0),
                self.fivepseq_out.TRANSCRIPT_ASSEMBLY_FILE)

        # Transcript descriptors and count distribution are handled together:
        # - if either file is not to be skipped, both are (re)generated;
        # - otherwise the stored count distribution is read back and set on
        #   fivepseq_counts.

        # transcript descriptors
        if not self.skip(self.fivepseq_out.get_file_path(self.fivepseq_out.TRANSCRIPT_DESCRIPTORS_FILE)) or \
                not self.skip(self.fivepseq_out.get_file_path(self.fivepseq_out.COUNT_DISTRIBUTION_FILE)):

            self.fivepseq_out.write_df_to_file(
                self.fivepseq_counts.get_transcript_descriptors(),
                self.fivepseq_out.TRANSCRIPT_DESCRIPTORS_FILE)
            self.fivepseq_out.write_vector(
                self.fivepseq_counts.get_count_distribution(),
                self.fivepseq_out.COUNT_DISTRIBUTION_FILE)

        else:
            count_distribution = CountManager.read_count_vector(
                self.fivepseq_out.get_file_path(
                    self.fivepseq_out.COUNT_DISTRIBUTION_FILE))
            self.fivepseq_counts.set_count_distribution(count_distribution)

        # An empty count distribution cancels all remaining calculations.
        count_distribution = self.fivepseq_counts.get_count_distribution()
        if len(count_distribution) == 0:
            self.logger.warning(
                "No reads found in coding regions. Fivepseq will skip the rest of calculations."
            )
            cancel = True

        if not cancel:
            # NOTE(review): the count distribution is written here again,
            # unconditionally, even if it was just written or read above -
            # confirm whether this second write is intended.
            self.fivepseq_out.write_vector(
                self.fivepseq_counts.get_count_distribution(),
                self.fivepseq_out.COUNT_DISTRIBUTION_FILE)
            # The lower outlier value is wrapped in a list for write_vector.
            self.fivepseq_out.write_vector(
                [self.fivepseq_counts.get_outlier_lower()],
                self.fivepseq_out.OUTLIER_LOWER_FILE)

            #   start codon dictionary
            if not self.skip(
                    self.fivepseq_out.get_file_path(
                        self.fivepseq_out.START_CODON_DICT_FILE)):
                self.fivepseq_out.write_dict(
                    self.fivepseq_counts.get_start_codon_dict(),
                    self.fivepseq_out.START_CODON_DICT_FILE)

            #   stop codon dictionary
            if not self.skip(
                    self.fivepseq_out.get_file_path(
                        self.fivepseq_out.TERM_CODON_DICT_FILE)):
                self.fivepseq_out.write_dict(
                    self.fivepseq_counts.get_stop_codon_dict(),
                    self.fivepseq_out.TERM_CODON_DICT_FILE)

            #   load or generate full length counts
            if not self.skip(
                    self.fivepseq_out.get_file_path(
                        self.fivepseq_out.COUNT_FULL_FILE)):
                self.fivepseq_out.write_vector_list(
                    self.fivepseq_counts.get_count_vector_list(
                        FivePSeqCounts.FULL_LENGTH),
                    self.fivepseq_out.COUNT_FULL_FILE)

            #   outliers table
            if not self.skip(
                    self.fivepseq_out.get_file_path(
                        self.fivepseq_out.OUTLIERS_DF)):
                self.fivepseq_out.write_df_to_file(
                    self.fivepseq_counts.get_outliers_df(),
                    self.fivepseq_out.OUTLIERS_DF)

            # count stats (always run; not guarded by self.skip here)
            count_stats = CountStats(self.fivepseq_counts, self.fivepseq_out,
                                     config)
            count_stats.count_stats()

            #   terminal counts
            if not self.skip(
                    self.fivepseq_out.get_file_path(
                        self.fivepseq_out.COUNT_TERM_FILE)):
                term_counts = self.fivepseq_counts.get_count_vector_list(
                    FivePSeqCounts.TERM)
                self.fivepseq_out.write_vector_list(
                    term_counts, self.fivepseq_out.COUNT_TERM_FILE)

            #   start counts
            if not self.skip(
                    self.fivepseq_out.get_file_path(
                        self.fivepseq_out.COUNT_START_FILE)):
                start_counts = self.fivepseq_counts.get_count_vector_list(
                    FivePSeqCounts.START)
                self.fivepseq_out.write_vector_list(
                    start_counts, self.fivepseq_out.COUNT_START_FILE)

            #   meta counts term
            if not self.skip(
                    self.fivepseq_out.get_file_path(
                        self.fivepseq_out.META_COUNT_TERM_FILE)):
                self.fivepseq_out.write_series_to_file(
                    self.fivepseq_counts.get_meta_count_series(
                        FivePSeqCounts.TERM),
                    self.fivepseq_out.META_COUNT_TERM_FILE)

            #   meta counts start
            if not self.skip(
                    self.fivepseq_out.get_file_path(
                        self.fivepseq_out.META_COUNT_START_FILE)):
                self.fivepseq_out.write_series_to_file(
                    self.fivepseq_counts.get_meta_count_series(
                        FivePSeqCounts.START),
                    self.fivepseq_out.META_COUNT_START_FILE)

            #   frame counts start
            if not self.skip(
                    self.fivepseq_out.get_file_path(
                        self.fivepseq_out.FRAME_COUNTS_START_FILE)):
                frame_count_df_start = self.fivepseq_counts.get_frame_counts_df(
                    FivePSeqCounts.START)
                self.fivepseq_out.write_df_to_file(
                    frame_count_df_start,
                    self.fivepseq_out.FRAME_COUNTS_START_FILE)

            #   frame counts term
            if not self.skip(
                    self.fivepseq_out.get_file_path(
                        self.fivepseq_out.FRAME_COUNTS_TERM_FILE)):
                frame_count_df_term = self.fivepseq_counts.get_frame_counts_df(
                    FivePSeqCounts.TERM)
                self.fivepseq_out.write_df_to_file(
                    frame_count_df_term,
                    self.fivepseq_out.FRAME_COUNTS_TERM_FILE)

            #   canonical transcript indices
            if not self.skip(
                    self.fivepseq_out.get_file_path(
                        self.fivepseq_out.CANONICAL_TRANSCRIPT_INDEX_FILE)):
                self.fivepseq_out.write_vector(
                    self.fivepseq_counts.canonical_transcript_index,
                    self.fivepseq_out.CANONICAL_TRANSCRIPT_INDEX_FILE)

            #   amino acid pauses
            if not self.skip(
                    self.fivepseq_out.get_file_path(
                        self.fivepseq_out.AMINO_ACID_PAUSES_FILE)):
                self.fivepseq_out.write_df_to_file(
                    self.fivepseq_counts.get_amino_acid_pauses(
                        50),  # generate more than needed for visualization
                    self.fivepseq_out.AMINO_ACID_PAUSES_FILE)

            #   codon pauses

            if not self.skip(
                    self.fivepseq_out.get_file_path(
                        self.fivepseq_out.CODON_PAUSES_FILE)):
                self.fivepseq_out.write_df_to_file(
                    self.fivepseq_counts.get_codon_pauses(),
                    self.fivepseq_out.CODON_PAUSES_FILE)

            #   loci pauses (only written when a loci file was supplied)
            if not self.skip(
                    self.fivepseq_out.get_file_path(
                        self.fivepseq_out.LOCI_PAUSES_FILE)):
                if self.fivepseq_counts.loci_file is not None:
                    self.fivepseq_out.write_series_to_file(
                        self.fivepseq_counts.get_pauses_from_loci(
                            self.fivepseq_counts.loci_file),
                        self.fivepseq_out.LOCI_PAUSES_FILE)
        # sanity check (runs even when the calculations were cancelled)
        success = self.fivepseq_out.sanity_check_for_counts()
        if success:
            self.logger.info(
                "\n\nFivepseq successfully finished for bam file %s\n\n" %
                os.path.basename(self.fivepseq_counts.alignment.bam_file))
        else:
            self.logger.info(
                "\n\nFivepseq finished for bam file %s.\n Some files failed to be generated. Check those in %s.\n\n"
                % (os.path.basename(self.fivepseq_counts.alignment.bam_file),
                   self.fivepseq_out.get_file_path(
                       FivePSeqOut.FAILED_COUNT_FILES_LIST)))
Example #12
0
import os

import colorlover as cl
import numpy as np
import pandas as pd

from fivepseq.logic.structures.fivepseq_counts import CountManager, FivePSeqCounts
from fivepseq.viz.bokeh_plots import bokeh_transcript_scatter_plot

# Root directory of the human 5PSeq analysis outputs.
dir_5pseq_human = "/proj/sllstore2017018/lilit/5pseq_human"

# Transcript assembly table of the Hela-rep1 run (tab separated).
transcript_assembly = pd.read_csv(os.path.join(dir_5pseq_human,
                                               "fivepseq_Hela-rep1",
                                               "transcript_assembly.txt"),
                                  sep="\t")

# Full-length count vectors of the HelaCHX-rep1 run.
hela_chx_counts = CountManager.read_counts_as_list(
    os.path.join(dir_5pseq_human, "fivepseq_HelaCHX-rep1",
                 "counts_FULL_LENGTH.txt"))

high_transcripts = pd.read_csv(
    "/proj/sllstore2017018/lilit/5pseq_human/resources/top1000.transcripts.txt"
)
# `x in DataFrame` tests the column labels, not the cell values, so the ids
# are collected into a set for the membership test. The column labels are
# included as well: if the file has no header row, read_csv turned its first
# id into a label. NOTE(review): assumes the ids are in the first column.
high_transcript_ids = set(high_transcripts.iloc[:, 0].astype(str))
high_transcript_ids.update(str(label) for label in high_transcripts.columns)

t_ids = transcript_assembly.iloc[:, 0]
t_ids = [w.replace("transcript:", "") for w in t_ids]
# Positions, within the assembly, of the transcripts listed as highly populated.
chx_high_ind = [
    i for i, item in enumerate(t_ids) if item in high_transcript_ids
]

bokeh_transcript_scatter_plot(
    "HelaCHX-rep1", {"HelaCHX-rep1": hela_chx_counts},
    transcript_assembly,
    {"HelaCHX-rep1": cl.to_numeric(cl.scales['9']['qual']['Set3'])[2]},
    FivePSeqCounts.TERM,
    500,
Example #13
0
    def update_dicts(self, sample, directory):
        """Read all count tables of one sample and register them under its name.

        Every per-sample dictionary of this object gets an entry for
        ``sample``. If ``self.args.tf`` names a known transcript filter, the
        transcript index is recomputed. Whenever ``self.transcript_index`` is
        set, the frame counts and count vector lists are restricted to it, the
        meta counts are recomputed from the restricted lists, and the frame
        stats entry is reset to None.

        :param sample: sample name used as the dictionary key
        :param directory: fivepseq output directory of that sample
        """
        self.logger.info("reading counts for sample: %s" % sample)
        fivepseq_out = FivePSeqOut(directory)

        self.data_summary_dict.update(
            {sample: self.read_data_summary(fivepseq_out)})
        self.meta_count_start_dict.update(
            {sample: self.read_meta_count_start(fivepseq_out)})
        self.meta_count_term_dict.update(
            {sample: self.read_meta_count_term(fivepseq_out)})

        self.frame_count_term_dict.update(
            {sample: self.read_frame_count_term(fivepseq_out)})
        self.frame_count_start_dict.update(
            {sample: self.read_frame_count_start(fivepseq_out)})
        self.frame_stats_df_dict.update(
            {sample: self.read_frame_stats_df(fivepseq_out)})
        self.amino_acid_df_dict.update(
            {sample: self.read_amino_acid_df(fivepseq_out, full=False)})
        self.amino_acid_df_full_dict.update(
            {sample: self.read_amino_acid_df(fivepseq_out, full=True)})
        self.codon_df_dict.update(
            {sample: self.read_codon_df(fivepseq_out, basesort=False)})
        self.codon_basesorted_df_dict.update(
            {sample: self.read_codon_df(fivepseq_out, basesort=True)})

        self.fft_signal_start_dict.update(
            {sample: self.read_fft_signal_start(fivepseq_out)})
        self.fft_signal_term_dict.update(
            {sample: self.read_fft_signal_term(fivepseq_out)})

        self.count_vector_list_start_dict.update(
            {sample: self.read_count_vector_list_start(fivepseq_out)})
        self.count_vector_list_term_dict.update(
            {sample: self.read_count_vector_list_term(fivepseq_out)})

        self.loci_meta_counts_dict.update(
            {sample: self.read_loci_meta_counts(fivepseq_out)})

        # Local name avoids shadowing the builtin `filter`.
        transcript_filter = self.args.tf
        if transcript_filter is not None:
            if transcript_filter == self.FILTER_TOP_POPULATED:
                self.logger.info("Applying filter %s" % transcript_filter)
                self.transcript_index = CountManager.top_populated_count_vector_indices(
                    self.count_vector_list_term_dict.get(sample),
                    self.args.span, 10000)
            elif transcript_filter == self.FILTER_CANONICAL_TRANSCRIPTS:
                self.logger.info("Applying filter %s" % transcript_filter)
                self.transcript_index = CountManager.canonical_transcript_indices(
                    directory)

        if self.transcript_index is not None:
            self.logger.info("Number of filtered transcripts: %d" %
                             len(self.transcript_index))
            self.frame_count_term_dict[sample] = self.frame_count_term_dict[
                sample].iloc[self.transcript_index, ]
            self.frame_count_start_dict[sample] = self.frame_count_start_dict[
                sample].iloc[self.transcript_index, ]
            # Frame stats read from file describe the unfiltered set.
            self.frame_stats_df_dict[sample] = None

            self.count_vector_list_term_dict[sample] = [
                self.count_vector_list_term_dict[sample][i]
                for i in self.transcript_index
            ]
            self.count_vector_list_start_dict[sample] = [
                self.count_vector_list_start_dict[sample][i]
                for i in self.transcript_index
            ]
            # Meta counts are rebuilt from the filtered vectors, each aligned
            # to its own region.
            self.meta_count_term_dict[
                sample] = CountManager.count_vector_to_df(
                    CountManager.compute_meta_counts(
                        self.count_vector_list_term_dict[sample]),
                    FivePSeqCounts.TERM, self.args.span)
            # The start meta counts must be aligned to START (was TERM,
            # apparently copied from the statement above).
            self.meta_count_start_dict[
                sample] = CountManager.count_vector_to_df(
                    CountManager.compute_meta_counts(
                        self.count_vector_list_start_dict[sample]),
                    FivePSeqCounts.START, self.args.span)