Ejemplo n.º 1
0
def bam_filter_counts(bam_name,
                      alignment,
                      annotation,
                      genome,
                      bam_out_dir,
                      count_folders,
                      success_values,
                      downsample_constant=None,
                      filter_name="protein_coding",
                      is_geneset=False,
                      filter=None,
                      loci_file=None):
    """
    Run the count pipeline for one bam sample under one filter / gene set.

    The results are written to ``<bam_out_dir>/<filter_name>``. The outcome of the
    sanity check is recorded in ``success_values`` and, on success, the output
    directory is appended to ``count_folders`` (both arguments are mutated in place).

    :param bam_name: sample name, used in log messages and in the success key
    :param alignment: alignment object passed on to FivePSeqCounts
    :param annotation: annotation object passed on to FivePSeqCounts; the geneset
        filter is applied to it for the duration of the run and removed afterwards
    :param genome: genome object passed on to FivePSeqCounts
    :param bam_out_dir: output directory of the sample
    :param count_folders: list collecting the directories that passed the sanity check
    :param success_values: dictionary collecting "SUCCESS" / "FAILURE" entries
    :param downsample_constant: passed on to FivePSeqCounts
    :param filter_name: name of the filter / gene set; also the output sub-directory name
    :param is_geneset: passed on to FivePSeqCounts
    :param filter: when not None, the geneset filter named filter_name is applied
        to the annotation before counting
    :param loci_file: stored on the FivePSeqCounts object as ``loci_file``
    :return: the FivePSeqCounts object used for the run
    """
    logging.getLogger(config.FIVEPSEQ_LOGGER). \
        info("\n##################\nCounting for sample %s and gene set %s\n##################\n"
             % (bam_name, filter_name))

    filter_out_dir = os.path.join(bam_out_dir, filter_name)
    if not os.path.exists(filter_out_dir):
        os.mkdir(filter_out_dir)

    if filter is not None:
        annotation.apply_geneset_filter(filter_name)

    # The annotation object is shared by the caller across samples and gene sets:
    # make sure the filter is removed even if counting raises, so that a failure
    # here cannot leave the annotation in a filtered state.
    try:
        # combine objects into FivePSeqCounts object
        fivepseq_counts = FivePSeqCounts(
            alignment,
            annotation,
            genome,
            # outlier_probability=config.args.op,
            config=config,
            downsample_constant=downsample_constant,
            is_geneset=is_geneset)
        fivepseq_counts.loci_file = loci_file

        # set up fivepseq out object for this bam
        fivepseq_out = FivePSeqOut(filter_out_dir, config.args.conflicts)

        # run
        fivepseq_pipeline = CountPipeline(fivepseq_counts, fivepseq_out)
        fivepseq_pipeline.run()
    finally:
        annotation.remove_geneset_filter()

    success = fivepseq_out.sanity_check_for_counts()
    if success:
        success_values.update({bam_name + "_GS_" + filter_name: "SUCCESS"})
        count_folders.append(filter_out_dir)
    else:
        # NOTE(review): failures are keyed by the output directory while successes
        # are keyed by "<bam_name>_GS_<filter_name>" - confirm this is intended
        # before unifying, since the summary file may be read elsewhere.
        success_values.update({filter_out_dir: "FAILURE"})

    return fivepseq_counts
Ejemplo n.º 2
0
    def apply_geneset_filter(self, geneset_filter, attribute):
        """
        Keep only those transcripts of the default-filter assembly that belong to a gene set.

        A transcript is kept when the value of its attribute is in the gene set, either
        as is, or truncated at the first "-" or at the first ".".
        The result is stored in self.transcript_assembly_dict under the current
        transcript filter, for span size 0.

        :param geneset_filter: container of gene identifiers to keep
        :param attribute: name of the transcript attribute holding the identifier
        :raise Exception: if no transcript matches the gene set
        """

        def in_geneset(value):
            # exact match first; the truncated forms are only tried when it fails
            # TODO this is not a universal solution, but when the transcripts have names with -1 in the end this works
            return (value in geneset_filter
                    or value.split("-")[0] in geneset_filter
                    or value.split(".")[0] in geneset_filter)

        matching_transcripts = [
            candidate
            for candidate in self.get_transcript_assembly_default_filter()
            if in_geneset(FivePSeqOut.get_transcript_attr(candidate, attribute))
        ]

        if not matching_transcripts:
            raise Exception("None of the genes in the geneset filter were present in the annotation file")

        # TODO check if the following line suits: if transcript filters were applied prior, those will be preserved
        filtered_entry = {self.transcript_filter: {0: matching_transcripts}}
        self.transcript_assembly_dict.update(filtered_entry)
Ejemplo n.º 3
0
    def apply_permanent_gene_filter(self, gene_filter, attribute):
        """
        Replace the span-0 assembly of the default filter with the transcripts
        whose attribute value is listed in the gene filter.

        A transcript is kept when its attribute value is in the gene filter as is,
        or when the part after the first ":" (IDs of the form gene:xxx), the part
        before the first "-" or the part before the first "." is in the gene filter.

        :param gene_filter: container of gene identifiers to keep
        :param attribute: name of the transcript attribute holding the identifier
        :raise Exception: if a transcript does not have the attribute,
            or if no transcript matches the gene filter
        """
        gene_filtered_assembly = []
        for transcript in self.get_transcript_assembly(span_size=0):
            # validate before reading the attribute, so that a missing attribute
            # always ends in the explanatory error below
            if attribute not in transcript.attr:
                attrs = "".join(a + ';' for a in transcript.attr)
                raise Exception(
                    "The genefilter attribute %s is not present in the annotation file. Try any of the following: %s"
                    % (attribute, attrs))

            attr_value = FivePSeqOut.get_transcript_attr(transcript, attribute)

            # TODO this is not a universal solution, but when the transcripts have names with -1 in the end this works
            if (attr_value in gene_filter
                    # gene_id filtering results in IDs in the form gene:xxx
                    or (len(attr_value.split(":")) > 1
                        and attr_value.split(":")[1] in gene_filter)
                    or attr_value.split("-")[0] in gene_filter
                    or attr_value.split(".")[0] in gene_filter):
                gene_filtered_assembly.append(transcript)

        if len(gene_filtered_assembly) == 0:
            raise Exception(
                "None of the genes in the geneset filter were present in the annotation file"
            )

        # NOTE the gene filter is permanently applied in the beginning,
        # thus span sizes other than 0 are not taken into consideration
        self.transcript_assembly_dict.update(
            {self.default_filter: {
                0: gene_filtered_assembly
            }})
Ejemplo n.º 4
0
    def update_dicts(self, sample, directory):
        """
        Read the count files of one sample and store them in the per-sample dictionaries.

        If a transcript filter is set in self.args.tf, the indices of the transcripts
        to keep are computed and stored in self.transcript_index. Whenever
        self.transcript_index is not None, the frame counts and count vectors of the
        sample are subset to those indices, the meta counts are recomputed from the
        subset vectors, and the frame stats of the sample are reset to None.

        :param sample: sample name, used as the key in all dictionaries
        :param directory: directory with the count files of the sample
        """
        self.logger.info("reading counts for sample: %s" % sample)
        fivepseq_out = FivePSeqOut(directory)

        self.data_summary_dict.update(
            {sample: self.read_data_summary(fivepseq_out)})
        self.meta_count_start_dict.update(
            {sample: self.read_meta_count_start(fivepseq_out)})
        self.meta_count_term_dict.update(
            {sample: self.read_meta_count_term(fivepseq_out)})

        self.frame_count_term_dict.update(
            {sample: self.read_frame_count_term(fivepseq_out)})
        self.frame_count_start_dict.update(
            {sample: self.read_frame_count_start(fivepseq_out)})
        self.frame_stats_df_dict.update(
            {sample: self.read_frame_stats_df(fivepseq_out)})
        self.amino_acid_df_dict.update(
            {sample: self.read_amino_acid_df(fivepseq_out, full=False)})
        self.amino_acid_df_full_dict.update(
            {sample: self.read_amino_acid_df(fivepseq_out, full=True)})
        self.codon_df_dict.update(
            {sample: self.read_codon_df(fivepseq_out, basesort=False)})
        self.codon_basesorted_df_dict.update(
            {sample: self.read_codon_df(fivepseq_out, basesort=True)})

        self.fft_signal_start_dict.update(
            {sample: self.read_fft_signal_start(fivepseq_out)})
        self.fft_signal_term_dict.update(
            {sample: self.read_fft_signal_term(fivepseq_out)})

        self.count_vector_list_start_dict.update(
            {sample: self.read_count_vector_list_start(fivepseq_out)})
        self.count_vector_list_term_dict.update(
            {sample: self.read_count_vector_list_term(fivepseq_out)})

        self.loci_meta_counts_dict.update(
            {sample: self.read_loci_meta_counts(fivepseq_out)})

        if self.args.tf is not None:
            transcript_filter = self.args.tf
            if transcript_filter == self.FILTER_TOP_POPULATED:
                self.logger.info("Applying filter %s" % transcript_filter)
                # NOTE(review): 10000 is presumably the number of transcripts to keep - confirm
                self.transcript_index = CountManager.top_populated_count_vector_indices(
                    self.count_vector_list_term_dict.get(sample),
                    self.args.span, 10000)
            elif transcript_filter == self.FILTER_CANONICAL_TRANSCRIPTS:
                self.logger.info("Applying filter %s" % transcript_filter)
                self.transcript_index = CountManager.canonical_transcript_indices(
                    directory)

        if self.transcript_index is not None:
            self.logger.info("Number of filtered transcripts: %d" %
                             len(self.transcript_index))
            self.frame_count_term_dict[sample] = self.frame_count_term_dict[
                sample].iloc[self.transcript_index, ]
            self.frame_count_start_dict[sample] = self.frame_count_start_dict[
                sample].iloc[self.transcript_index, ]
            # the stored frame stats describe the unfiltered transcript set
            self.frame_stats_df_dict[sample] = None

            self.count_vector_list_term_dict[sample] = [
                self.count_vector_list_term_dict[sample][i]
                for i in self.transcript_index
            ]
            self.count_vector_list_start_dict[sample] = [
                self.count_vector_list_start_dict[sample][i]
                for i in self.transcript_index
            ]
            self.meta_count_term_dict[
                sample] = CountManager.count_vector_to_df(
                    CountManager.compute_meta_counts(
                        self.count_vector_list_term_dict[sample]),
                    FivePSeqCounts.TERM, self.args.span)
            # the start meta counts must be labelled as the START region
            # (they were previously converted with the TERM region)
            self.meta_count_start_dict[
                sample] = CountManager.count_vector_to_df(
                    CountManager.compute_meta_counts(
                        self.count_vector_list_start_dict[sample]),
                    FivePSeqCounts.START, self.args.span)
Ejemplo n.º 5
0
def generate_and_store_fivepseq_counts(plot=False):
    """
    Run the count pipeline on every bam file matching config.args.b and store the results.

    For each bam file the counts are written to
    ``<config.out_dir>/<FIVEPSEQ_COUNTS_DIR>/<bam name without .bam>``.
    A per-sample SUCCESS / FAILURE summary is written to config.out_dir.
    If plot is True and at least one sample passed the sanity check, config.args.o
    and config.out_dir are redirected to the plots directory and generate_plots is
    called with the successful count directories.

    :param plot: whether to generate plots after counting
    :return: None (also returned early when no bam files are found)
    """
    logger = logging.getLogger(config.FIVEPSEQ_COUNT_LOGGER)
    logger.info("Fivepseq count started")

    # process bam input

    # print() with one parenthesised argument prints the same text under Python 2 and 3
    print("%s" % (pad_spaces("\tInput bam files:")))
    bam_files = []
    for bam in glob.glob(config.args.b):
        if bam.endswith(".bam"):
            bam_files.append(bam)
            print("%s" % pad_spaces("\t%s" % bam))
        else:
            logger.info("non bam file found: %s" % bam)

    if len(bam_files) == 0:
        err_msg = "No bam files found at %s" % config.args.b
        logger.error(err_msg)
        return None

    # set up annotation

    annotation_reader = AnnotationReader(
        config.annot)  # set the break for human
    annotation = annotation_reader.annotation

    annotation.set_default_span_size(config.span_size)

    if hasattr(config.args, 'tf') and config.args.tf is not None:
        annotation.set_default_transcript_filter(config.args.tf)

    if hasattr(config.args, 's') and config.args.s is not None:
        annotation.set_gene_set_filter(config.args.s)

    # set up genome

    fasta_reader = FastaReader(config.genome)

    success_values = {}
    count_folders = []

    for bam in bam_files:
        # set up bam input and output
        bam_reader = BamReader(bam)
        bam_name = os.path.basename(bam)
        if bam_name.endswith(".bam"):
            bam_name = bam_name[:-len(".bam")]

        bam_out_dir = os.path.join(config.out_dir, FIVEPSEQ_COUNTS_DIR,
                                   bam_name)

        # makedirs also creates the counts directory itself if it is missing
        if not os.path.exists(bam_out_dir):
            os.makedirs(bam_out_dir)

        # combine objects into FivePSeqCounts object
        fivepseq_counts = FivePSeqCounts(bam_reader.alignment,
                                         annotation,
                                         fasta_reader.genome,
                                         downsample_constant=config.args.ds)
        if hasattr(config.args, "loci_file"):
            fivepseq_counts.loci_file = config.args.loci_file

        # set up fivepseq out object for this bam
        fivepseq_out = FivePSeqOut(bam_out_dir, config.args.conflicts)

        # run
        fivepseq_pipeline = CountPipeline(fivepseq_counts, fivepseq_out)
        fivepseq_pipeline.run()

        success = fivepseq_out.sanity_check_for_counts()
        if success:
            success_values.update({bam_name: "SUCCESS"})
            count_folders.append(bam_out_dir)
        else:
            success_values.update({bam_name: "FAILURE"})

    # check if all the files in all directories are in place and store a summary of all runs
    fivepseq_out = FivePSeqOut(
        config.out_dir, config.OVERWRITE
    )  # overwrite is for always removing existing summary file

    fivepseq_out.write_dict(success_values, FivePSeqOut.BAM_SUCCESS_SUMMARY)

    logger.info("\n##################")
    logger.info("\n# Finished counting successfully! Proceeding to plotting")
    logger.info("\n##################")

    if plot:
        if len(count_folders) == 0:
            err_msg = "None of the count directories succeeded. Plots will not be generated."
            logging.getLogger(config.FIVEPSEQ_PLOT_LOGGER).error(err_msg)
        else:
            # set up job title if none is provided:
            # derived from the bam pattern, with the extension and wildcards removed
            if not hasattr(config.args, 't') or config.args.t is None:
                config.args.t = os.path.basename(os.path.dirname(
                    config.args.b)) + "_" + os.path.basename(config.args.b)
                config.args.t = config.args.t.replace(".bam", "")
                config.args.t = config.args.t.replace("_*", "")
                config.args.t = config.args.t.replace("*", "")

            # FIXME currently all count folders in the output directory are used for plotting.
            # FIXME this introduces conflicts with pre-existing count files in the folder
            # FIXME the adding of the count_folders list should fix this: needs testing
            config.args.o = config.out_dir = os.path.join(
                config.out_dir, FIVEPSEQ_PLOTS_DIR)
            generate_plots(count_folders)
Ejemplo n.º 6
0
def generate_and_store_fivepseq_counts(plot=False):
    """
    Run the count pipeline on every bam file matching config.args.b, for the main
    transcript type and for every stored gene set, and store the results.

    For each bam file, bam_filter_counts is first called for
    config.args.transcript_type. If gene sets are stored on the annotation, it is
    called again for each gene set, with the downsample constant set to the lower
    outlier value obtained from the main run.
    A SUCCESS / FAILURE summary is written to config.out_dir.
    If plot is True and at least one main run passed the sanity check,
    config.args.o and config.out_dir are redirected to the plots directory and
    generate_plots is called.

    :param plot: whether to generate plots after counting
    :return: None (also returned early when no bam files are found)
    :raise Exception: if more than 12 gene sets are provided
    """
    logger = logging.getLogger(config.FIVEPSEQ_LOGGER)
    logger.info("FIVEPSEQ STARTED")
    logger.info("________________")

    # process bam input

    logger.info("%s" % (pad_spaces("\tInput bam files:")))
    bam_files = []
    for bam in sorted(glob.glob(config.args.b)):
        if bam.endswith(".bam"):
            bam_files.append(bam)
            logger.info("%s" % pad_spaces("\t%s" % bam))
        else:
            logger.info("non bam file found: %s" % bam)

    if len(bam_files) == 0:
        err_msg = "No bam files found at %s" % config.args.b
        logger.error(err_msg)
        return None

    # set up annotation

    if hasattr(config.args, "subset") and config.args.subset is not None:
        annotation_reader = AnnotationReader(
            config.annot,
            transcript_type=config.args.transcript_type,
            break_after=config.args.subset)
    else:
        annotation_reader = AnnotationReader(
            config.annot, transcript_type=config.args.transcript_type)

    # set the break for human
    annotation = annotation_reader.annotation

    annotation.set_default_span_size(config.span_size)

    if hasattr(config.args, 'gf') and config.args.gf is not None:
        annotation.set_permanent_gene_filter(config.args.gf)

    if hasattr(config.args, 'gs') and config.args.gs is not None:
        annotation.store_gene_sets(config.args.gs)
        # report the size of the same dictionary that is checked
        geneset_count = len(annotation.gs_transcriptInd_dict)
        if geneset_count > 12:
            err_msg = "Too many genesets (%d) provided: should not exceed 12" % geneset_count
            logger.error(err_msg)
            raise Exception(err_msg)

    # set up genome

    fasta_reader = FastaReader(config.genome)

    success_values = {}
    count_folders = []
    count_folders_gs = []

    # the loci file option may be absent from the arguments
    loci_file = getattr(config.args, "loci_file", None)

    for bam in bam_files:
        # set up bam input and output
        bam_reader = BamReader(bam)
        bam_name = os.path.basename(bam)
        if bam_name.endswith(".bam"):
            bam_name = bam_name[:-len(".bam")]

        bam_out_dir = os.path.join(config.out_dir, FIVEPSEQ_COUNTS_DIR,
                                   bam_name)

        # makedirs also creates the counts directory itself if it is missing
        if not os.path.exists(bam_out_dir):
            os.makedirs(bam_out_dir)

        main_type_counts = bam_filter_counts(
            bam_name,
            bam_reader.alignment,
            annotation,
            fasta_reader.genome,
            bam_out_dir,
            count_folders,
            success_values,
            filter_name=config.args.transcript_type,
            loci_file=loci_file)

        # run on genesets
        if annotation.gs_transcriptInd_dict is not None:
            outlier_lower = main_type_counts.get_outlier_lower()
            for gs, gs_transcript_indices in annotation.gs_transcriptInd_dict.items():
                bam_filter_counts(bam_name,
                                  bam_reader.alignment,
                                  annotation,
                                  fasta_reader.genome,
                                  bam_out_dir,
                                  count_folders_gs,
                                  success_values,
                                  downsample_constant=outlier_lower,
                                  filter_name=gs,
                                  filter=gs_transcript_indices,
                                  is_geneset=True)

    # check if all the files in all directories are in place and store a summary of all runs
    fivepseq_out = FivePSeqOut(
        config.out_dir, config.OVERWRITE
    )  # overwrite is for always removing existing summary file

    fivepseq_out.write_dict(success_values, FivePSeqOut.BAM_SUCCESS_SUMMARY)

    logger.info("\n##################")
    logger.info("\n# Finished counting successfully! Proceeding to plotting")
    logger.info("\n##################")

    if plot:
        if len(count_folders) == 0:
            err_msg = "None of the count directories succeeded. Plots will not be generated."
            logger.error(err_msg)
        else:
            # set up job title if none is provided:
            # derived from the bam pattern, with the extension and wildcards removed
            if not hasattr(config.args, 't') or config.args.t is None:
                config.args.t = os.path.basename(os.path.dirname(
                    config.args.b)) + "_" + os.path.basename(config.args.b)
                config.args.t = config.args.t.replace(".bam", "")
                config.args.t = config.args.t.replace("_*", "")
                config.args.t = config.args.t.replace("*", "")

            # FIXME currently all count folders in the output directory are used for plotting.
            # FIXME this introduces conflicts with pre-existing count files in the folder
            # FIXME the adding of the count_folders list should fix this: needs testing
            config.args.o = config.out_dir = os.path.join(
                config.out_dir, FIVEPSEQ_PLOTS_DIR)
            generate_plots(count_folders, annotation.gs_transcriptInd_dict)
Ejemplo n.º 7
0
    def store_gene_sets(self, geneset_file):
        """
        Read a gene - gene set mapping file and store the gene sets on this object.

        The file should be a tab delimited file, with the first column containing gene IDs and the second one - gene sets.
        The header of the first column should be equal to the attribute in the gff file, which has been used for gene ID specification.
        The header of the second column should be named "geneset"

        GFF_ATTRIBUTE_NAME(e.g.: gene_id)  geneset
        gene1   GS1
        gene2   GS1
        gene3   GS2
        gene4   GS2

        Empty lines are ignored. A transcript is matched to a gene ID when its
        attribute value equals the gene ID, or when the part after the first ":"
        does (IDs of the form gene:xxx). If several transcripts match the same
        gene ID, the last one is kept.

        Results are stored on the object and nothing is returned:
        self.gs_geneID_attribute holds the attribute name from the header,
        self.transcript_assembly_dict[geneset][0] holds the matched transcripts, and
        self.gs_transcriptInd_dict maps each gene set to the indices of its
        transcripts in the span-0 transcript assembly.

        :param geneset_file: a tab-delimited file containing gene-geneset mapping
        :raise Exception: if the file does not exist, a line does not have exactly
            two columns, a transcript lacks the attribute, or no gene ID matches
        """

        # check file
        if not os.path.exists(geneset_file):
            raise Exception("The gene set file %s does not exist" %
                            geneset_file)

        # read genesets, store in GS:[geneIDs] dictionary
        gs_dict = {}
        geneIDs = []
        with open(geneset_file) as handle:
            header = handle.readline()
            tokens = header.split("\t")
            if len(tokens) != 2:
                raise Exception(
                    "The geneset file should have exactly two columns. Found %d instead."
                    % len(tokens))
            attribute = tokens[0]
            self.gs_geneID_attribute = attribute
            self.logger.info(
                "The attribute %s will be used to read gff for setting the geneset dictionary"
                % attribute)
            # count is the 1-based number of the line after the header
            for count, line in enumerate(handle, 1):
                # tolerate empty lines (e.g. trailing newlines at the end of the file)
                if not line.strip("\r\n"):
                    continue
                tokens = line.split('\t')
                if len(tokens) != 2:
                    raise Exception(
                        "The geneset file should have exactly two columns. Found %d in line %d instead."
                        % (len(tokens), count))
                geneID = tokens[0]
                geneIDs.append(geneID)
                geneset = tokens[1].rstrip("\n\r")
                gs_dict.setdefault(geneset, []).append(geneID)

        # check if the gene IDs are in the default transcript assembly
        # for those that are, create geneID:transcript dictionary

        known_geneIDs = set(geneIDs)  # constant-time membership tests in the loop below
        geneID_transcript_dict = {}
        geneID_transcriptInd_dict = {}
        example_values = []  # a few attribute values, only used in the error message
        for transcript_ind, transcript in enumerate(
                self.get_transcript_assembly(span_size=0)):
            # validate before reading the attribute, so that a missing attribute
            # always ends in the explanatory error below
            if attribute not in transcript.attr:
                attrs = "".join(a + ';' for a in transcript.attr)
                raise Exception(
                    "The geneset attribute %s is not present in the annotation file. Try any of the following: %s"
                    % (attribute, attrs))

            attr_value = FivePSeqOut.get_transcript_attr(transcript, attribute)
            if len(example_values) < 3:
                example_values.append(attr_value)

            # TODO this is not a universal solution, but when the transcripts have names with -1 in the end this works
            geneID = None
            if attr_value in known_geneIDs:
                geneID = attr_value
            else:
                # gene_id attributes may come in the form gene:xxx
                id_parts = attr_value.split(":")
                if len(id_parts) > 1 and id_parts[1] in known_geneIDs:
                    geneID = id_parts[1]

            if geneID is not None:
                geneID_transcript_dict.update({geneID: transcript})
                geneID_transcriptInd_dict.update({geneID: transcript_ind})

        if len(geneID_transcript_dict) == 0:
            # there may be fewer than three transcripts: list only what is available
            examples = ", ".join("%s" % value for value in example_values)
            raise Exception(
                "None of the genes in the geneset file were present in the annotation file. The attribure "
                "%s should contain values like this: %s, etc." %
                (attribute, examples))

        # with those geneIDs that mapped to actual transcripts,
        # store the transcripts and their indices per gene set
        gs_transcriptInd_dict = {}
        for gs in gs_dict:
            self.transcript_assembly_dict.update({gs: {0: []}})
            gs_transcriptInd_dict.update({gs: []})
            for geneID in gs_dict[gs]:
                if geneID in geneID_transcript_dict:
                    self.transcript_assembly_dict[gs][0].append(
                        geneID_transcript_dict[geneID])
                    gs_transcriptInd_dict[gs].append(
                        geneID_transcriptInd_dict[geneID])

        self.gs_transcriptInd_dict = gs_transcriptInd_dict

        geneset_summary = "".join(gs + "\n" for gs in gs_transcriptInd_dict)

        self.logger.info(
            "Genesets processed. %d out of %d unique geneIDs matched corresponding transcripts."
            % (len(geneID_transcript_dict), len(known_geneIDs)))
        self.logger.info("Found genesets:\n%s\n" % geneset_summary)