def __init__(self, psl_alignment, sorted_exons_attr, strand_specific, sqlite3_db_genes, type_isoforms):
    """Wrap a PSL alignment and, if an annotation is given, collect the isoforms it overlaps.

    Args:
        psl_alignment: alignment object. This constructor reads its
            ``target_fragment`` (``start``, ``end``, ``starts``, ``ends``,
            ``name``), ``query_fragment.size``, ``blocks_sizes`` and ``strand``.
        sorted_exons_attr: forwarded unchanged to
            ``UtilsCoverage.get_internal_exons_faster``.
        strand_specific: when truthy the alignment strand is used for the exon
            lookup; otherwise the string ``'None'`` is passed instead.
        sqlite3_db_genes: annotation database exposing ``children(...)``
            (presumably a gffutils FeatureDB -- confirm), or ``None`` to skip
            the annotation lookup entirely.
        type_isoforms: forwarded to ``UtilsCoverage.get_internal_isoforms``.
    """
    self.alignment = psl_alignment
    target = psl_alignment.target_fragment

    # Span of the alignment on the reference, both ends inclusive.
    self.reference_len = target.end - target.start + 1
    # Number of aligned bases and the share of the query they represent.
    self.alignment_len = sum(psl_alignment.blocks_sizes)
    self.fraction = self.alignment_len * 1.0 / psl_alignment.query_fragment.size

    # Annotation-derived state; stays in this "empty" form without a database.
    self.internal_isoforms = None
    self.ids_internal_isoforms = set()
    self.children_exons_dict = {}
    self.ids_children_exons_dict = {}

    if sqlite3_db_genes is None:
        return

    strand = psl_alignment.strand if strand_specific else str(None)
    internal_exons = UtilsCoverage.get_internal_exons_faster(
        sqlite3_db_genes, sorted_exons_attr, target.starts, target.ends,
        strand, target.name)
    self.internal_isoforms = list(
        UtilsCoverage.get_internal_isoforms(
            sqlite3_db_genes, type_isoforms, internal_exons))

    for isoform in self.internal_isoforms:
        isoform_id = isoform.id
        self.ids_internal_isoforms.add(isoform_id)

        exons = list(
            sqlite3_db_genes.children(
                isoform_id,
                featuretype=UtilsAnnotations.default_type_exons,
                order_by='start'))
        # for prokaryotes: an isoform without exon children acts as its own exon
        if not exons:
            exons = [isoform]

        self.children_exons_dict[isoform_id] = exons
        self.ids_children_exons_dict[isoform_id] = {exon.id for exon in exons}
def get_transcript_coverage(self, aligned_transcript, internal_isoforms_coverage, WELL_FULLY_COVERAGE_THRESHOLDS):
    """Finalize per-isoform coverage metrics of one aligned transcript.

    Reads the per-block covered-base counts already stored in
    ``self.covered_bases_blocks`` and fills, for every internal isoform of
    ``aligned_transcript``, the totals, per-block fractions, averages and the
    well/fully covered block counters and percentages. Finally picks
    ``self.id_mapped_isoform`` when at least one internal isoform exists.

    Args:
        aligned_transcript: object providing ``alignment`` (with
            ``query_fragment.size`` and ``blocks_sizes``), ``alignment_len``
            and ``internal_isoforms`` (items with an ``id``).
        internal_isoforms_coverage: object whose ``assembled_fraction`` is
            handed to ``UtilsCoverage.get_ids_best_mapped``.
        WELL_FULLY_COVERAGE_THRESHOLDS: object with ``well_block_threshold``
            and ``fully_block_threshold``.
    """
    alignment = aligned_transcript.alignment
    self.transcript_len = alignment.query_fragment.size
    self.alignment_len = aligned_transcript.alignment_len

    isoforms = aligned_transcript.internal_isoforms
    for isoform in isoforms:
        isoform_id = isoform.id

        # get metrics of coverage excluded covered bases of blocks:
        for block_idx in range(self.blocks_num):
            block_bases = self.covered_bases_blocks[isoform_id][block_idx]
            self.covered_bases[isoform_id] += block_bases

            fractions = self.covered_fraction_blocks[isoform_id]
            fractions[block_idx] = block_bases * 1.0 / alignment.blocks_sizes[block_idx]
            # Read the stored value back so the comparisons below see exactly
            # what the container holds.
            block_fraction = fractions[block_idx]

            self.avg_covered_fraction_block[isoform_id] += block_fraction
            if block_fraction >= WELL_FULLY_COVERAGE_THRESHOLDS.well_block_threshold:
                self.num_well_covered_blocks[isoform_id] += 1
            if block_fraction >= WELL_FULLY_COVERAGE_THRESHOLDS.fully_block_threshold:
                self.num_fully_covered_blocks[isoform_id] += 1

        total_bases = self.covered_bases[isoform_id]
        self.covered_fraction_whole_transcript[isoform_id] = total_bases * 1.0 / self.transcript_len
        self.covered_fraction_aligned_part[isoform_id] = total_bases * 1.0 / self.alignment_len

        self.avg_covered_fraction_block[isoform_id] /= self.blocks_num
        self.percentage_well_covered_blocks[isoform_id] = \
            self.num_well_covered_blocks[isoform_id] * 1.0 / self.blocks_num
        self.percentage_fully_covered_blocks[isoform_id] = \
            self.num_fully_covered_blocks[isoform_id] * 1.0 / self.blocks_num

    if len(isoforms) == 0:
        return

    # get mapped to transcript isoform (TEMPORARY: can have better solution):
    self.id_mapped_isoform = UtilsCoverage.get_ids_best_mapped(
        self.covered_bases, internal_isoforms_coverage.assembled_fraction)[0]
def get_database_coverage_by_reads(
        self, sam_path, args_tophat, reference_path, single_reads, left_reads,
        right_reads, reference_dict, sqlite3_db_genes, type_isoforms,
        sorted_exons_attr, strand_specific, tot_isoforms_len, genome_len,
        output_dir, threads, WELL_FULLY_COVERAGE_THRESHOLDS, logger, log_dir):
    """Compute how much of the annotation is covered by read alignments.

    If ``sam_path`` is ``None`` the reads are first aligned through
    ``UtilsTools.get_sam_by_tophat`` (when ``args_tophat`` is truthy) or
    ``UtilsTools.get_sam_by_STAR``; if that also yields ``None`` the method
    returns without touching any state. Otherwise every non-header SAM line
    is turned into an alignment, the isoforms it hits are looked up and the
    per-position read counters in ``self.num_reads_covered_pos`` are
    incremented. Afterwards the expressed fractions, the well/fully expressed
    id sets and their counts are derived from those counters.

    Args:
        sam_path: path to a SAM file, or ``None`` to run an aligner.
        args_tophat: selects TopHat (truthy) or STAR (falsy) when aligning.
        reference_path, single_reads, left_reads, right_reads, genome_len,
            output_dir, threads, log_dir: forwarded to the aligner helpers.
        reference_dict: forwarded to the SAM line parser.
        sqlite3_db_genes: annotation database; used via ``children``,
            ``parents`` and item lookup by exon id (presumably a gffutils
            FeatureDB -- confirm).
        type_isoforms, sorted_exons_attr: forwarded to ``UtilsCoverage``.
        strand_specific: when truthy the read strand is used for the exon
            lookup, otherwise ``'None'`` is passed.
        tot_isoforms_len: denominator of
            ``self.fraction_annotation_mapped_by_reads``; the attribute is
            left untouched when this is 0.
        WELL_FULLY_COVERAGE_THRESHOLDS: object with ``well_exon_threshold``,
            ``fully_exon_threshold``, ``well_isoform_threshold`` and
            ``fully_isoform_threshold``.
        logger: object with ``print_timestamp()`` and ``info(msg)``.

    Returns:
        None. Results are stored on ``self``.
    """
    if sam_path is None:
        if args_tophat:
            sam_path = UtilsTools.get_sam_by_tophat(
                None, reference_path, single_reads, left_reads, right_reads,
                output_dir, threads, logger, log_dir)
        else:
            sam_path = UtilsTools.get_sam_by_STAR(
                threads, reference_path, None, single_reads, left_reads,
                right_reads, output_dir, None, None, genome_len, logger,
                log_dir)
    if sam_path is None:
        return

    logger.print_timestamp()
    logger.info('Getting database coverage by reads...')

    with open(sam_path, 'r') as in_handle:
        for line in in_handle:
            # SAM header lines start with '@' and carry no alignment.
            if line.startswith('@'):
                continue

            curr_sam_alignment = Alignment.SAMFileAlignment.get_alignment_from_sam_line(
                line, logger, reference_dict)
            target_fragment = curr_sam_alignment.target_fragment

            strand = curr_sam_alignment.strand if strand_specific else None

            internal_exons = UtilsCoverage.get_internal_exons_faster(
                sqlite3_db_genes, sorted_exons_attr, target_fragment.starts,
                target_fragment.ends, str(strand), target_fragment.name)

            # Materialized before iterating because the loop body issues
            # further queries against the same database.
            internal_isoforms = list(
                UtilsCoverage.get_internal_isoforms(
                    sqlite3_db_genes, type_isoforms, internal_exons))

            for isoform in internal_isoforms:
                children_exons = list(
                    sqlite3_db_genes.children(
                        isoform.id,
                        featuretype=UtilsAnnotations.default_type_exons,
                        order_by='start'))
                # for prokaryotes: the isoform itself plays the role of an exon
                if not children_exons:
                    children_exons = [isoform]

                exon_starts = [exon.start for exon in children_exons]
                exon_ends = [exon.end for exon in children_exons]
                exon_ids = [exon.id for exon in children_exons]

                target_cov_pos, query_cov_pos = UtilsCoverage.get_coverage_positions(
                    exon_ids, exon_starts, exon_ends,
                    range(len(target_fragment.starts)),
                    target_fragment.starts, target_fragment.ends)

                if isoform.id not in self.num_reads_covered_pos:
                    self.num_reads_covered_pos[isoform.id] = {}
                isoform_counters = self.num_reads_covered_pos[isoform.id]

                for id_exon in target_cov_pos:
                    if id_exon not in isoform_counters:
                        # one read counter per exon position
                        isoform_counters[id_exon] = [0] * len(sqlite3_db_genes[id_exon])
                    exon_counters = isoform_counters[id_exon]

                    # each interval is an inclusive (first, last) index pair
                    for interval in target_cov_pos[id_exon]:
                        for i_pos in range(interval[0], interval[1] + 1):
                            exon_counters[i_pos] += 1

    thresholds = WELL_FULLY_COVERAGE_THRESHOLDS
    for id_isoform in self.num_reads_covered_pos:
        parent_genes = list(
            sqlite3_db_genes.parents(
                id_isoform, featuretype=UtilsAnnotations.default_type_genes))
        # An isoform without a gene parent is counted under its own id.
        parent_gene_id = parent_genes[0].id if parent_genes else id_isoform

        expressed_bases_isoform = 0.0
        len_isoform = 0
        for id_exon in self.num_reads_covered_pos[id_isoform]:
            exon_counters = self.num_reads_covered_pos[id_isoform][id_exon]
            len_exon = len(exon_counters)
            num_expressed_bases = len_exon - exon_counters.count(0)

            # The exon fraction is fixed by the first isoform that reports it.
            if id_exon not in self.expressed_fraction_exon:
                self.expressed_fraction_exon[id_exon] = num_expressed_bases * 1.0 / len_exon

                if self.expressed_fraction_exon[id_exon] >= thresholds.well_exon_threshold:
                    self.ids_well_expressed_exons.add(id_exon)
                if self.expressed_fraction_exon[id_exon] >= thresholds.fully_exon_threshold:
                    self.ids_fully_expressed_exons.add(id_exon)

            len_isoform += len_exon
            self.num_expressed_pos_at_least_one_by_reads += num_expressed_bases
            expressed_bases_isoform += num_expressed_bases

        # An isoform can be registered without any exon counters when no
        # coverage positions were reported for it; treat it as unexpressed
        # instead of dividing by zero.
        if len_isoform != 0:
            self.expressed_fraction_isoform[id_isoform] = expressed_bases_isoform / len_isoform
        else:
            self.expressed_fraction_isoform[id_isoform] = 0.0

        if self.expressed_fraction_isoform[id_isoform] >= thresholds.well_isoform_threshold:
            self.ids_well_expressed_genes.add(parent_gene_id)
            self.ids_well_expressed_isoforms.add(id_isoform)
        if self.expressed_fraction_isoform[id_isoform] >= thresholds.fully_isoform_threshold:
            self.ids_fully_expressed_genes.add(parent_gene_id)
            self.ids_fully_expressed_isoforms.add(id_isoform)

    self.num_well_expressed_genes = len(self.ids_well_expressed_genes)
    self.num_well_expressed_isoforms = len(self.ids_well_expressed_isoforms)
    self.num_well_expressed_exons = len(self.ids_well_expressed_exons)

    self.num_fully_expressed_genes = len(self.ids_fully_expressed_genes)
    self.num_fully_expressed_isoforms = len(self.ids_fully_expressed_isoforms)
    self.num_fully_expressed_exons = len(self.ids_fully_expressed_exons)

    if tot_isoforms_len != 0:
        self.fraction_annotation_mapped_by_reads = \
            self.num_expressed_pos_at_least_one_by_reads * 1.0 / tot_isoforms_len

    logger.info('Done.')
def get_aligned_transcript_and_coverages(self, psl_alignment, sorted_exons_attr, strand_specific,
                                         sqlite3_db_genes, type_isoforms, WELL_FULLY_COVERAGE_THRESHOLDS):
    """Build the aligned transcript for one PSL alignment and, if enabled, its coverages.

    Coverages are computed only when both
    ``self.assembly_correctness_metrics.transcripts_coverage`` and
    ``self.assembly_completeness_metrics.isoforms_coverage`` are not ``None``;
    otherwise both coverage slots of the result are ``None``.

    Returns:
        Tuple ``(aligned_transcript, aligned_transcript_coverage,
        internal_isoforms_coverage, elapsed_transcript_time)`` where the last
        item is the time spent constructing the aligned transcript.
    """
    # CREATE ALIGNED TRANSCRIPT (timed on its own):
    construction_started = datetime.now()
    aligned_transcript = AlignedTranscript.AlignedTranscript(
        psl_alignment, sorted_exons_attr, strand_specific, sqlite3_db_genes,
        type_isoforms)
    elapsed_transcript_time = datetime.now() - construction_started

    # GET COVERAGES (skipped unless both metric containers are present):
    if self.assembly_correctness_metrics.transcripts_coverage is None or \
            self.assembly_completeness_metrics.isoforms_coverage is None:
        return aligned_transcript, None, None, elapsed_transcript_time

    alignment = aligned_transcript.alignment
    transcript_coverage = OneTranscriptCoverage.OneTranscriptCoverage(
        aligned_transcript.ids_internal_isoforms, alignment.blocks_num)
    isoforms_coverage = InternalIsoformsCoverage.InternalIsoformsCoverage(
        aligned_transcript.internal_isoforms)

    # set exons and blocks overlap (covered bases):
    for isoform in aligned_transcript.internal_isoforms:
        exons = aligned_transcript.children_exons_dict[isoform.id]

        # exon.start, exon.end: 1-based coordinates; start must be <= end.
        # Both are shifted by one before being compared with the blocks.
        starts = []
        ends = []
        ids = []
        for exon in exons:
            starts.append(exon.start - 1)
            ends.append(exon.end - 1)
            ids.append(exon.id)

        target_cov_pos, query_cov_pos = UtilsCoverage.get_coverage_positions(
            ids, starts, ends, range(alignment.blocks_num),
            alignment.target_fragment.starts, alignment.target_fragment.ends)

        isoforms_coverage.update_internal_isoforms_coverage(
            sqlite3_db_genes, isoform.id, target_cov_pos)
        transcript_coverage.update_transcript_coverage(isoform.id, query_cov_pos)

    isoforms_coverage.get_internal_isoforms_coverage(
        aligned_transcript.internal_isoforms,
        aligned_transcript.children_exons_dict)
    transcript_coverage.get_transcript_coverage(
        aligned_transcript, isoforms_coverage, WELL_FULLY_COVERAGE_THRESHOLDS)

    return aligned_transcript, transcript_coverage, isoforms_coverage, elapsed_transcript_time
def get_best_mapped_from_best_aligned(self, best_lines, best_alignments, sorted_exons_attr, strand_specific,
                                      sqlite3_db_genes, type_isoforms, WELL_FULLY_COVERAGE_THRESHOLDS):
    """Among the best alignments of a transcript keep those best mapped to the annotation.

    For every entry of ``best_lines`` the aligned transcript and its coverages
    are built through ``self.get_aligned_transcript_and_coverages``. Without an
    annotation (``sqlite3_db_genes is None``), or when
    ``UtilsCoverage.get_ids_best_mapped`` selects nothing, all inputs are
    returned unfiltered. Otherwise only the selected indices are kept.

    Args:
        best_lines: alignment lines; ``best_alignments`` is indexed in
            parallel with it.
        best_alignments: alignment objects matching ``best_lines``.
        sorted_exons_attr, strand_specific, sqlite3_db_genes, type_isoforms,
            WELL_FULLY_COVERAGE_THRESHOLDS: forwarded to
            ``get_aligned_transcript_and_coverages``.

    Returns:
        Tuple ``(lines, alignments, aligned_transcripts,
        transcripts_coverages, internal_isoforms_coverages, elapsed_time,
        elapsed_transcript_time)``. ``elapsed_transcript_time`` is the value
        reported for the LAST processed alignment (not a sum), or a zero
        duration when ``best_lines`` is empty.
    """
    start_time = datetime.now()

    # Zero-length duration so the name is bound even when the loop below does
    # not run (empty input used to raise UnboundLocalError on return).
    elapsed_transcript_time = start_time - start_time

    best_aligned_transcripts = []
    best_aligned_transcripts_coverages = []
    best_aligned_internal_isoforms_coverages = []
    for i_line_alignment in range(len(best_lines)):
        curr_aligned_transcript, curr_aligned_transcript_coverage, curr_internal_isoforms_coverage, \
            elapsed_transcript_time = \
            self.get_aligned_transcript_and_coverages(best_alignments[i_line_alignment], sorted_exons_attr,
                                                      strand_specific, sqlite3_db_genes, type_isoforms,
                                                      WELL_FULLY_COVERAGE_THRESHOLDS)

        best_aligned_transcripts.append(curr_aligned_transcript)
        best_aligned_transcripts_coverages.append(curr_aligned_transcript_coverage)
        best_aligned_internal_isoforms_coverages.append(curr_internal_isoforms_coverage)

    # IN CASE WHEN WE HAVEN'T ANNOTATION:
    if sqlite3_db_genes is None:
        elapsed_time = datetime.now() - start_time
        return best_lines, best_alignments, best_aligned_transcripts, \
            best_aligned_transcripts_coverages, best_aligned_internal_isoforms_coverages, \
            elapsed_time, elapsed_transcript_time

    # IN CASE WHEN WE HAVE ANNOTATION:
    # choose best annotated transcript alignments over all best alignments:
    transcripts_covered_bases = {}
    isoforms_covered_fraction = {}
    for i_alignment, curr_transcript_coverage in enumerate(best_aligned_transcripts_coverages):
        # Coverages are None when coverage metrics are disabled; such an
        # alignment simply cannot take part in the ranking.
        if curr_transcript_coverage is None:
            continue

        curr_id_isoform = curr_transcript_coverage.id_mapped_isoform
        if curr_id_isoform is None:
            continue

        curr_isoforms_coverage = best_aligned_internal_isoforms_coverages[i_alignment]
        transcripts_covered_bases[i_alignment] = curr_transcript_coverage.covered_bases[curr_id_isoform]
        isoforms_covered_fraction[i_alignment] = curr_isoforms_coverage.assembled_fraction[curr_id_isoform]

    curr_max_keys = UtilsCoverage.get_ids_best_mapped(transcripts_covered_bases, isoforms_covered_fraction)

    # for transcripts all aligned to unannotated regions:
    if not curr_max_keys:
        elapsed_time = datetime.now() - start_time
        return best_lines, best_alignments, best_aligned_transcripts, \
            best_aligned_transcripts_coverages, best_aligned_internal_isoforms_coverages, \
            elapsed_time, elapsed_transcript_time

    # form lines, alignments, transcripts and coverages corresponded best mapping:
    best_mapped_lines = [best_lines[i] for i in curr_max_keys]
    best_mapped_alignments = [best_alignments[i] for i in curr_max_keys]
    best_mapped_aligned_transcripts = [best_aligned_transcripts[i] for i in curr_max_keys]
    best_mapped_aligned_transcripts_coverages = [best_aligned_transcripts_coverages[i] for i in curr_max_keys]
    best_mapped_internal_isoforms_coverages = [best_aligned_internal_isoforms_coverages[i] for i in curr_max_keys]

    elapsed_time = datetime.now() - start_time
    return best_mapped_lines, best_mapped_alignments, best_mapped_aligned_transcripts, \
        best_mapped_aligned_transcripts_coverages, best_mapped_internal_isoforms_coverages, \
        elapsed_time, elapsed_transcript_time