コード例 #1
0
    def __init__(self, psl_alignment, sorted_exons_attr, strand_specific,
                 sqlite3_db_genes, type_isoforms):
        """Wrap one PSL alignment and collect the annotated isoforms inside it.

        Args:
            psl_alignment: alignment object; this constructor reads its
                ``target_fragment`` (``start``, ``end``, ``starts``, ``ends``,
                ``name``), ``query_fragment.size``, ``blocks_sizes`` and
                ``strand``.
            sorted_exons_attr: forwarded unchanged to
                ``UtilsCoverage.get_internal_exons_faster``.
            strand_specific: when truthy, the alignment strand is used for the
                exon lookup; otherwise the string ``'None'`` is passed.
            sqlite3_db_genes: annotation database, or ``None`` when no
                annotation is available (presumably a gffutils ``FeatureDB``
                -- TODO confirm).
            type_isoforms: forwarded unchanged to
                ``UtilsCoverage.get_internal_isoforms``.
        """
        self.alignment = psl_alignment

        # Span of the alignment on the reference (inclusive coordinates).
        target = psl_alignment.target_fragment
        self.reference_len = target.end - target.start + 1

        # Aligned bases and their share of the whole query sequence.
        self.alignment_len = sum(psl_alignment.blocks_sizes)
        self.fraction = self.alignment_len * 1.0 / psl_alignment.query_fragment.size

        # Defaults that stay in place when there is no annotation database.
        self.internal_isoforms = None
        self.ids_internal_isoforms = set()
        self.children_exons_dict = {}
        self.ids_children_exons_dict = {}

        if sqlite3_db_genes is None:
            return

        # The exon lookup receives the strand as given by the alignment, or
        # the literal text 'None' when strands are ignored.
        strand = psl_alignment.strand if strand_specific else str(None)

        internal_exons = UtilsCoverage.get_internal_exons_faster(
            sqlite3_db_genes, sorted_exons_attr, target.starts, target.ends,
            strand, target.name)
        self.internal_isoforms = list(
            UtilsCoverage.get_internal_isoforms(
                sqlite3_db_genes, type_isoforms, internal_exons))

        for isoform in self.internal_isoforms:
            isoform_id = isoform.id
            self.ids_internal_isoforms.add(isoform_id)

            exons = list(
                sqlite3_db_genes.children(
                    isoform_id,
                    featuretype=UtilsAnnotations.default_type_exons,
                    order_by='start'))
            # for prokaryotes: an isoform without exon children stands in as
            # its own single exon.
            if not exons:
                exons = [isoform]

            self.children_exons_dict[isoform_id] = exons
            self.ids_children_exons_dict[isoform_id] = {exon.id for exon in exons}
コード例 #2
0
    def get_transcript_coverage(self, aligned_transcript, internal_isoforms_coverage, WELL_FULLY_COVERAGE_THRESHOLDS):
        """Finalize the per-isoform coverage statistics of one aligned transcript.

        For every isoform in ``aligned_transcript.internal_isoforms`` the
        per-block covered bases already stored in ``self.covered_bases_blocks``
        are summed up and turned into fractions; blocks whose covered fraction
        reaches the "well" / "fully" thresholds are counted. The per-isoform
        accumulators are updated with ``+=``, so they must already hold an
        entry for each isoform id before this is called.

        Args:
            aligned_transcript: object providing ``alignment`` (with
                ``query_fragment.size`` and ``blocks_sizes``),
                ``alignment_len`` and ``internal_isoforms``.
            internal_isoforms_coverage: object whose ``assembled_fraction`` is
                handed to ``UtilsCoverage.get_ids_best_mapped``.
            WELL_FULLY_COVERAGE_THRESHOLDS: object providing
                ``well_block_threshold`` and ``fully_block_threshold``.
        """
        alignment = aligned_transcript.alignment
        isoforms = aligned_transcript.internal_isoforms
        thresholds = WELL_FULLY_COVERAGE_THRESHOLDS

        self.transcript_len = alignment.query_fragment.size
        self.alignment_len = aligned_transcript.alignment_len

        for isoform in isoforms:
            isoform_id = isoform.id
            num_blocks = self.blocks_num

            bases_per_block = self.covered_bases_blocks[isoform_id]
            fraction_per_block = self.covered_fraction_blocks[isoform_id]

            # Per-block pass: accumulate covered bases and block fractions.
            for i_block in range(num_blocks):
                block_bases = bases_per_block[i_block]
                self.covered_bases[isoform_id] += block_bases

                block_fraction = block_bases * 1.0 / alignment.blocks_sizes[i_block]
                fraction_per_block[i_block] = block_fraction
                self.avg_covered_fraction_block[isoform_id] += block_fraction

                if block_fraction >= thresholds.well_block_threshold:
                    self.num_well_covered_blocks[isoform_id] += 1
                if block_fraction >= thresholds.fully_block_threshold:
                    self.num_fully_covered_blocks[isoform_id] += 1

            total_bases = self.covered_bases[isoform_id]
            self.covered_fraction_whole_transcript[isoform_id] = total_bases * 1.0 / self.transcript_len
            self.covered_fraction_aligned_part[isoform_id] = total_bases * 1.0 / self.alignment_len

            # Turn the running sum of block fractions into a mean.
            self.avg_covered_fraction_block[isoform_id] /= num_blocks

            # Despite the attribute names these are fractions in [0, 1].
            self.percentage_well_covered_blocks[isoform_id] = \
                self.num_well_covered_blocks[isoform_id] * 1.0 / num_blocks
            self.percentage_fully_covered_blocks[isoform_id] = \
                self.num_fully_covered_blocks[isoform_id] * 1.0 / num_blocks

        # get mapped to transcript isoform (TEMPORARY: can have better solution):
        # the first id returned by the helper is taken as the mapped isoform.
        if len(isoforms) > 0:
            best_ids = UtilsCoverage.get_ids_best_mapped(
                self.covered_bases, internal_isoforms_coverage.assembled_fraction)
            self.id_mapped_isoform = best_ids[0]
コード例 #3
0
    def get_database_coverage_by_reads(
            self, sam_path, args_tophat, reference_path, single_reads,
            left_reads, right_reads, reference_dict, sqlite3_db_genes,
            type_isoforms, sorted_exons_attr, strand_specific,
            tot_isoforms_len, genome_len, output_dir, threads,
            WELL_FULLY_COVERAGE_THRESHOLDS, logger, log_dir):
        """Measure how much of the annotation is covered by read alignments.

        When ``sam_path`` is ``None`` a SAM file is requested from
        ``UtilsTools.get_sam_by_tophat`` (``args_tophat`` truthy) or from
        ``UtilsTools.get_sam_by_STAR``; if no path results, the method returns
        without touching any statistics.

        Every non-header SAM line is parsed into an alignment; for each
        isoform found inside that alignment the per-position read counters in
        ``self.num_reads_covered_pos[isoform_id][exon_id]`` are incremented
        over the ranges reported by ``UtilsCoverage.get_coverage_positions``.
        Afterwards exon and isoform expressed fractions are derived from the
        counters, the well/fully expressed id sets and their sizes are
        updated, and ``self.fraction_annotation_mapped_by_reads`` is set when
        ``tot_isoforms_len`` is non-zero.

        Args:
            sam_path: path to a SAM file, or ``None`` to run an aligner.
            args_tophat: truthy selects TopHat, falsy selects STAR.
            reference_path, single_reads, left_reads, right_reads, output_dir,
            threads, genome_len, log_dir: forwarded to the aligner helpers.
            reference_dict: forwarded to the SAM line parser.
            sqlite3_db_genes: annotation database queried with ``children``,
                ``parents`` and item lookup (presumably a gffutils
                ``FeatureDB`` -- TODO confirm).
            type_isoforms: forwarded to ``UtilsCoverage.get_internal_isoforms``.
            sorted_exons_attr: forwarded to
                ``UtilsCoverage.get_internal_exons_faster``.
            strand_specific: when truthy the read strand is used for the exon
                lookup, otherwise ``None`` is (both passed through ``str``).
            tot_isoforms_len: denominator for the annotation-wide fraction.
            WELL_FULLY_COVERAGE_THRESHOLDS: object providing the
                ``well_*`` / ``fully_*`` exon and isoform thresholds.
            logger: object providing ``print_timestamp()`` and ``info(msg)``.
        """
        if sam_path is None:
            if args_tophat:
                sam_path = UtilsTools.get_sam_by_tophat(
                    None, reference_path, single_reads, left_reads, right_reads,
                    output_dir, threads, logger, log_dir)
            else:
                sam_path = UtilsTools.get_sam_by_STAR(
                    threads, reference_path, None, single_reads, left_reads, right_reads,
                    output_dir, None, None, genome_len, logger, log_dir)

            # The aligner could not provide alignments: nothing to measure.
            if sam_path is None:
                return

        logger.print_timestamp()
        logger.info('Getting database coverage by reads...')

        thresholds = WELL_FULLY_COVERAGE_THRESHOLDS

        # Pass 1: count, per exon position, how many reads cover it.
        with open(sam_path, 'r') as sam_handle:
            for sam_line in sam_handle:
                # SAM header lines start with '@'.
                if sam_line.startswith('@'):
                    continue

                read_alignment = Alignment.SAMFileAlignment.get_alignment_from_sam_line(
                    sam_line, logger, reference_dict)

                strand_text = str(read_alignment.strand if strand_specific else None)
                target = read_alignment.target_fragment

                internal_exons = UtilsCoverage.get_internal_exons_faster(
                    sqlite3_db_genes, sorted_exons_attr, target.starts,
                    target.ends, strand_text, target.name)

                internal_isoforms = list(
                    UtilsCoverage.get_internal_isoforms(
                        sqlite3_db_genes, type_isoforms, internal_exons))

                for isoform in internal_isoforms:
                    exons = list(
                        sqlite3_db_genes.children(
                            isoform.id,
                            featuretype=UtilsAnnotations.default_type_exons,
                            order_by='start'))
                    # for prokaryotes: the isoform itself acts as its only exon.
                    if not exons:
                        exons = [isoform]

                    exon_starts = [exon.start for exon in exons]
                    exon_ends = [exon.end for exon in exons]
                    exon_ids = [exon.id for exon in exons]

                    # Only the target-side positions are needed here.
                    target_cov_pos, _ = UtilsCoverage.get_coverage_positions(
                        exon_ids, exon_starts, exon_ends,
                        range(len(target.starts)), target.starts, target.ends)

                    exon_counters = self.num_reads_covered_pos.setdefault(isoform.id, {})
                    for id_exon, covered_ranges in target_cov_pos.items():
                        if id_exon not in exon_counters:
                            # One counter per position of the exon feature.
                            exon_counters[id_exon] = [0] * len(sqlite3_db_genes[id_exon])
                        position_counts = exon_counters[id_exon]

                        # Each range is an inclusive (first, last) index pair.
                        for covered_range in covered_ranges:
                            for i_pos in range(covered_range[0], covered_range[1] + 1):
                                position_counts[i_pos] += 1

        # Pass 2: turn the counters into expressed fractions and id sets.
        for id_isoform in self.num_reads_covered_pos:
            parent_genes = list(
                sqlite3_db_genes.parents(
                    id_isoform,
                    featuretype=UtilsAnnotations.default_type_genes))
            # An isoform without a parent gene is reported under its own id.
            parent_gene_id = parent_genes[0].id if parent_genes else id_isoform

            self.expressed_fraction_isoform[id_isoform] = 0.0
            len_isoform = 0
            for id_exon, position_counts in self.num_reads_covered_pos[id_isoform].items():
                len_exon = len(position_counts)
                num_expressed_bases = len_exon - position_counts.count(0)

                # An exon shared by several isoforms is classified only once.
                if id_exon not in self.expressed_fraction_exon:
                    exon_fraction = num_expressed_bases * 1.0 / len_exon
                    self.expressed_fraction_exon[id_exon] = exon_fraction

                    if exon_fraction >= thresholds.well_exon_threshold:
                        self.ids_well_expressed_exons.add(id_exon)

                    if exon_fraction >= thresholds.fully_exon_threshold:
                        self.ids_fully_expressed_exons.add(id_exon)

                len_isoform += len_exon
                self.num_expressed_pos_at_least_one_by_reads += num_expressed_bases
                self.expressed_fraction_isoform[id_isoform] += num_expressed_bases

            self.expressed_fraction_isoform[id_isoform] /= len_isoform
            isoform_fraction = self.expressed_fraction_isoform[id_isoform]

            if isoform_fraction >= thresholds.well_isoform_threshold:
                self.ids_well_expressed_genes.add(parent_gene_id)
                self.ids_well_expressed_isoforms.add(id_isoform)

            if isoform_fraction >= thresholds.fully_isoform_threshold:
                self.ids_fully_expressed_genes.add(parent_gene_id)
                self.ids_fully_expressed_isoforms.add(id_isoform)

        self.num_well_expressed_genes = len(self.ids_well_expressed_genes)
        self.num_well_expressed_isoforms = len(self.ids_well_expressed_isoforms)
        self.num_well_expressed_exons = len(self.ids_well_expressed_exons)

        self.num_fully_expressed_genes = len(self.ids_fully_expressed_genes)
        self.num_fully_expressed_isoforms = len(self.ids_fully_expressed_isoforms)
        self.num_fully_expressed_exons = len(self.ids_fully_expressed_exons)

        if tot_isoforms_len != 0:
            self.fraction_annotation_mapped_by_reads = \
                self.num_expressed_pos_at_least_one_by_reads * 1.0 / tot_isoforms_len

        logger.info('Done.')
コード例 #4
0
    def get_aligned_transcript_and_coverages(self, psl_alignment,
                                             sorted_exons_attr,
                                             strand_specific, sqlite3_db_genes,
                                             type_isoforms,
                                             WELL_FULLY_COVERAGE_THRESHOLDS):
        """Build the aligned transcript for one alignment plus its coverages.

        The coverage objects are only created when both
        ``self.assembly_correctness_metrics.transcripts_coverage`` and
        ``self.assembly_completeness_metrics.isoforms_coverage`` are not
        ``None``; otherwise ``None`` is returned in their place.

        Args:
            psl_alignment, sorted_exons_attr, strand_specific,
            sqlite3_db_genes, type_isoforms: forwarded to the
                ``AlignedTranscript`` constructor; ``sqlite3_db_genes`` is
                also handed to the isoform coverage update.
            WELL_FULLY_COVERAGE_THRESHOLDS: forwarded to
                ``get_transcript_coverage`` of the transcript coverage.

        Returns:
            Tuple ``(aligned_transcript, aligned_transcript_coverage,
            internal_isoforms_coverage, elapsed_transcript_time)`` where the
            last item is the time spent constructing the aligned transcript.
        """
        # CREATE ALIGNED TRANSCRIPT (timed on its own):
        construction_started = datetime.now()
        aligned_transcript = AlignedTranscript.AlignedTranscript(
            psl_alignment, sorted_exons_attr, strand_specific,
            sqlite3_db_genes, type_isoforms)
        elapsed_transcript_time = datetime.now() - construction_started

        # Without both coverage metrics enabled there is nothing else to do.
        if self.assembly_correctness_metrics.transcripts_coverage is None or \
                self.assembly_completeness_metrics.isoforms_coverage is None:
            return aligned_transcript, None, None, elapsed_transcript_time

        # GET COVERAGES:
        alignment = aligned_transcript.alignment
        aligned_transcript_coverage = OneTranscriptCoverage.OneTranscriptCoverage(
            aligned_transcript.ids_internal_isoforms, alignment.blocks_num)
        internal_isoforms_coverage = InternalIsoformsCoverage.InternalIsoformsCoverage(
            aligned_transcript.internal_isoforms)

        # set exons and blocks overlap (covered bases):
        for internal_isoform in aligned_transcript.internal_isoforms:
            isoform_id = internal_isoform.id
            exons = aligned_transcript.children_exons_dict[isoform_id]

            # exon.start, exon.end: 1-based coordinates; start must be <= end.
            # Both are shifted down by one before being compared with the
            # alignment block coordinates.
            exons_starts = [exon.start - 1 for exon in exons]
            exons_ends = [exon.end - 1 for exon in exons]
            exons_ids = [exon.id for exon in exons]

            target_cov_pos, query_cov_pos = UtilsCoverage.get_coverage_positions(
                exons_ids, exons_starts, exons_ends,
                range(alignment.blocks_num),
                alignment.target_fragment.starts,
                alignment.target_fragment.ends)

            internal_isoforms_coverage.update_internal_isoforms_coverage(
                sqlite3_db_genes, isoform_id, target_cov_pos)
            aligned_transcript_coverage.update_transcript_coverage(
                isoform_id, query_cov_pos)

        # Finalize: isoform side first, since the transcript side reads it.
        internal_isoforms_coverage.get_internal_isoforms_coverage(
            aligned_transcript.internal_isoforms,
            aligned_transcript.children_exons_dict)
        aligned_transcript_coverage.get_transcript_coverage(
            aligned_transcript, internal_isoforms_coverage,
            WELL_FULLY_COVERAGE_THRESHOLDS)

        return aligned_transcript, aligned_transcript_coverage, internal_isoforms_coverage, elapsed_transcript_time
コード例 #5
0
    def get_best_mapped_from_best_aligned(self, best_lines, best_alignments,
                                          sorted_exons_attr, strand_specific,
                                          sqlite3_db_genes, type_isoforms,
                                          WELL_FULLY_COVERAGE_THRESHOLDS):
        """Select, among the best alignments, those best mapped to the annotation.

        An aligned transcript (and, when enabled, its coverages) is built for
        every entry of ``best_lines`` / ``best_alignments``. Without an
        annotation database, or when no alignment maps to an annotated
        isoform, all inputs are returned unchanged; otherwise only the
        alignments chosen by ``UtilsCoverage.get_ids_best_mapped`` are kept.

        Args:
            best_lines: alignment lines, parallel to ``best_alignments``.
            best_alignments: alignment objects, one per line.
            sorted_exons_attr, strand_specific, type_isoforms,
            WELL_FULLY_COVERAGE_THRESHOLDS: forwarded to
                ``get_aligned_transcript_and_coverages``.
            sqlite3_db_genes: annotation database, or ``None`` when there is
                no annotation.

        Returns:
            Tuple ``(lines, alignments, aligned_transcripts,
            transcripts_coverages, internal_isoforms_coverages, elapsed_time,
            elapsed_transcript_time)``. ``elapsed_transcript_time`` is the
            value reported for the last processed alignment, or a zero
            duration when ``best_lines`` is empty.
        """
        start_time = datetime.now()

        # Zero-length timedelta: keeps the return value defined when the loop
        # below never runs (previously an UnboundLocalError for empty input).
        elapsed_transcript_time = start_time - start_time

        best_aligned_transcripts = []
        best_aligned_transcripts_coverages = []
        best_aligned_internal_isoforms_coverages = []
        for i_line_alignment in range(len(best_lines)):
            curr_aligned_transcript, curr_aligned_transcript_coverage, curr_internal_isoforms_coverage, \
            elapsed_transcript_time = \
                self.get_aligned_transcript_and_coverages(best_alignments[i_line_alignment], sorted_exons_attr,
                                                          strand_specific, sqlite3_db_genes, type_isoforms,
                                                          WELL_FULLY_COVERAGE_THRESHOLDS)

            best_aligned_transcripts.append(curr_aligned_transcript)
            best_aligned_transcripts_coverages.append(
                curr_aligned_transcript_coverage)
            best_aligned_internal_isoforms_coverages.append(
                curr_internal_isoforms_coverage)

        # IN CASE WHEN WE HAVEN'T ANNOTATION: nothing to choose between.
        if sqlite3_db_genes is None:
            elapsed_time = datetime.now() - start_time

            return best_lines, best_alignments, best_aligned_transcripts, \
                   best_aligned_transcripts_coverages, best_aligned_internal_isoforms_coverages, \
                   elapsed_time, elapsed_transcript_time

        # IN CASE WHEN WE HAVE ANNOTATION:
        # choose best annotated transcript alignments over all best alignments,
        # keyed by the index of the alignment.
        transcripts_covered_bases = {}
        isoforms_covered_fraction = {}
        for i_alignment, transcript_coverage in enumerate(
                best_aligned_transcripts_coverages):
            curr_id_isoform = transcript_coverage.id_mapped_isoform
            # Alignments without a mapped isoform do not take part.
            if curr_id_isoform is None:
                continue

            transcripts_covered_bases[i_alignment] = \
                transcript_coverage.covered_bases[curr_id_isoform]
            isoforms_covered_fraction[i_alignment] = \
                best_aligned_internal_isoforms_coverages[
                    i_alignment].assembled_fraction[curr_id_isoform]

        curr_max_keys = UtilsCoverage.get_ids_best_mapped(
            transcripts_covered_bases, isoforms_covered_fraction)

        # for transcripts all aligned to unannotated regions:
        if not curr_max_keys:
            elapsed_time = datetime.now() - start_time

            return best_lines, best_alignments, best_aligned_transcripts, \
                   best_aligned_transcripts_coverages, best_aligned_internal_isoforms_coverages, \
                   elapsed_time, elapsed_transcript_time

        # form lines, alignments, transcripts and coverages corresponded best mapping:
        best_mapped_lines = [best_lines[i] for i in curr_max_keys]
        best_mapped_alignments = [best_alignments[i] for i in curr_max_keys]
        best_mapped_aligned_transcripts = [
            best_aligned_transcripts[i] for i in curr_max_keys]
        best_mapped_aligned_transcripts_coverages = [
            best_aligned_transcripts_coverages[i] for i in curr_max_keys]
        best_mapped_internal_isoforms_coverages = [
            best_aligned_internal_isoforms_coverages[i] for i in curr_max_keys]

        elapsed_time = datetime.now() - start_time

        return best_mapped_lines, best_mapped_alignments, best_mapped_aligned_transcripts, \
               best_mapped_aligned_transcripts_coverages, best_mapped_internal_isoforms_coverages, \
               elapsed_time, elapsed_transcript_time