def ribo_seq_read_counting_raw(gene, sqlite_path_organism, sqlite_path_reads, count_type="range", unique=True):
    """Return the raw read counts over the ORF regions of a gene's coding transcripts.

    Args:
        gene: Gene identifier, passed through to the lookup helpers.
        sqlite_path_organism: Path to the organism sqlite database.
        sqlite_path_reads: Path to the reads sqlite database.
        count_type: Which part of a read is used for counting; one of
            'range', 'fiveprime' or 'asite'.
        unique: When true, the ORF regions are reduced with
            ``get_unique_regions`` before counting.

    Returns:
        The ``counts`` object produced by the counting helper (elsewhere in
        this file it is iterated by transcript and each entry is summed, so
        presumably a mapping of transcript -> per-region counts), or the
        string "ERROR" when ``count_type`` is not recognised.

    Side effects:
        Prints the counts (or the error message) to stdout.
    """
    # Validate up front so an invalid count_type fails fast, before any
    # database access is attempted.
    if count_type not in ("range", "fiveprime", "asite"):
        print(
            "The count type must be one of 'range', 'fiveprime' or 'asite'. "
            "count_type refers to the part of the read that is used in the feature counting process"
        )
        return "ERROR"

    supported = get_protein_coding_transcript_ids(gene, sqlite_path_organism)
    exclude = True  # always forwarded as filter= to the read-fetching helpers

    orf_regions = genomic_orf_coordinate_ranges(gene, sqlite_path_organism, supported)
    if unique:
        orf_regions = get_unique_regions(orf_regions)

    if count_type == "range":
        genomic_read_ranges = get_read_ranges_genomic_location(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(
            orf_regions, genomic_read_ranges)
    elif count_type == "fiveprime":
        genomic_read_positions = get_reads_per_genomic_location_fiveprime(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            orf_regions, genomic_read_positions)
    else:  # count_type == "asite"
        genomic_read_positions = get_reads_per_genomic_location_asite(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            orf_regions, genomic_read_positions)

    print(counts)
    return counts
def ribo_seq_read_counting(gene, sqlite_path_organism, sqlite_path_reads, count_type="range", unique=True):
    """Return the average ORF coverage per coding transcript of a gene.

    Reads are counted over the ORF regions of the gene's protein-coding
    transcripts, converted to per-region coverage with
    ``get_coverage_per_region`` and then summarised per transcript with
    ``average_coverage_per_transcript``.

    Args:
        gene: Gene identifier, passed through to the lookup helpers.
        sqlite_path_organism: Path to the organism sqlite database.
        sqlite_path_reads: Path to the reads sqlite database.
        count_type: Which part of a read is used for counting; one of
            'range', 'fiveprime' or 'asite'.
        unique: When true, the ORF regions are reduced with
            ``get_unique_regions`` before counting.

    Returns:
        The result of ``average_coverage_per_transcript``, or the string
        "ERROR" (after printing a message) when ``count_type`` is not
        recognised.
    """
    # Validate up front so an invalid count_type fails fast, before any
    # database access is attempted.
    if count_type not in ("range", "fiveprime", "asite"):
        print("The count type must be one of 'range', 'fiveprime' or 'asite'. "
              "count_type refers to the part of the read that is used in the feature counting process")
        return "ERROR"

    supported = get_protein_coding_transcript_ids(gene, sqlite_path_organism)
    exclude = True  # always forwarded as filter= to the read-fetching helpers

    orf_regions = genomic_orf_coordinate_ranges(gene, sqlite_path_organism, supported)
    if unique:
        orf_regions = get_unique_regions(orf_regions)

    if count_type == "range":
        genomic_read_ranges = get_read_ranges_genomic_location(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(orf_regions, genomic_read_ranges)
    elif count_type == "fiveprime":
        genomic_read_positions = get_reads_per_genomic_location_fiveprime(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(orf_regions, genomic_read_positions)
    else:  # count_type == "asite"
        genomic_read_positions = get_reads_per_genomic_location_asite(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(orf_regions, genomic_read_positions)

    region_coverage = get_coverage_per_region(orf_regions, counts)
    coverage = average_coverage_per_transcript(region_coverage)
    return coverage
def rna_seq_read_counting(gene, sqlite_path_organism, sqlite_path_reads, exclude=True, count_type="range"):
    """Return the transcripts of a gene needed to explain its RNA-Seq reads.

    The list starts from the transcripts selected by
    ``explain_exon_junctions`` and is extended with every transcript that
    has at least one read in its unique exon regions.

    Args:
        gene: Gene identifier, passed through to the lookup helpers.
        sqlite_path_organism: Path to the organism sqlite database.
        sqlite_path_reads: Path to the reads sqlite database.
        exclude: When true, only transcripts returned by
            ``filter_unsupported_transcripts`` are considered; otherwise all
            transcripts from ``get_gene_info`` are used. Also forwarded as
            ``filter=`` to the read-fetching helpers.
        count_type: Which part of a read is used for counting; one of
            'range', 'fiveprime' or 'asite'.

    Returns:
        The list of transcripts explaining the reads, or the string "ERROR"
        (after printing a message) when ``count_type`` is not recognised.

    Side effects:
        Prints the transcript with the highest unique-region read count.
    """
    # Validate up front so an invalid count_type fails fast, before any
    # database access is attempted.
    if count_type not in ("range", "fiveprime", "asite"):
        print("The count type must be one of 'range', 'fiveprime' or 'asite'. "
              "count_type refers to the part of the read that is used in the feature counting process")
        return "ERROR"

    if exclude:
        supported = filter_unsupported_transcripts(gene, sqlite_path_organism, sqlite_path_reads)
    else:
        gene_info = get_gene_info(gene, sqlite_path_organism)
        supported = [i[0] for i in gene_info]

    genomic_exon_coordinates = genomic_exon_coordinate_ranges(gene, sqlite_path_organism, supported)
    unique_regions = get_unique_regions(genomic_exon_coordinates)
    all_junctions, unique_junctions = unique_exon_junctions(genomic_exon_coordinates)
    junction_scores = get_scores_per_exonjunction_for_gene(
        gene, sqlite_path_organism, sqlite_path_reads, supported, filter=exclude)
    transcripts_to_explain_reads = explain_exon_junctions(junction_scores, all_junctions, unique_junctions)

    if count_type == "range":
        genomic_read_ranges = get_read_ranges_genomic_location(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(unique_regions, genomic_read_ranges)
    elif count_type == "fiveprime":
        genomic_read_positions = get_reads_per_genomic_location_fiveprime(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(unique_regions, genomic_read_positions)
    else:  # count_type == "asite"
        genomic_read_positions = get_reads_per_genomic_location_asite(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(unique_regions, genomic_read_positions)

    maximum_sum = 0
    # Stays None when no transcript has a positive count; previously the name
    # was left unbound in that case and the final print raised.
    maximum_transcript = None
    for transcript in counts:
        transcript_sum = sum(counts[transcript])
        if (transcript_sum > 0) and (transcript not in transcripts_to_explain_reads):
            transcripts_to_explain_reads.append(str(transcript))

        if transcript_sum > maximum_sum:
            maximum_sum = transcript_sum
            maximum_transcript = transcript

    if maximum_transcript is None:
        print("No transcript has uniquely mapped reads")
    else:
        print("The transcript with most uniquely mapped reads is {maximum_transcript} with a score of {maximum_sum}".format(
            maximum_transcript=str(maximum_transcript), maximum_sum=maximum_sum))

    return transcripts_to_explain_reads
def classify_regions_pos_neg(gene, sqlite_path_reads, sqlite_path_organism, regions, supported, exclude=True, count_type="range"):
    """Classify genomic coordinate ranges as read-supported ('pos') or not ('neg').

    A region is 'pos' when one or more reads support it, otherwise 'neg'.

    Args:
        gene: Gene identifier, passed through to the read-fetching helpers.
        sqlite_path_reads: Path to the reads sqlite database.
        sqlite_path_organism: Path to the organism sqlite database.
        regions: Mapping of transcript -> sequence of regions; it is indexed
            by transcript and zipped against that transcript's counts.
        supported: Transcripts to consider, forwarded to the helpers.
        exclude: Forwarded as ``filter=`` to the read-fetching helpers.
        count_type: Which part of a read is used for counting; one of
            'range', 'fiveprime' or 'asite'.

    Returns:
        A nested dictionary: the outer keys are transcripts, the inner keys
        are 'pos' and 'neg', each holding a list of regions.

    Raises:
        ValueError: If ``count_type`` is not one of the supported values.
    """
    if count_type == "range":
        genomic_read_ranges = get_read_ranges_genomic_location(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(
            regions, genomic_read_ranges)
    elif count_type == "fiveprime":
        genomic_read_positions = get_reads_per_genomic_location_fiveprime(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            regions, genomic_read_positions)
    elif count_type == "asite":
        genomic_read_positions = get_reads_per_genomic_location_asite(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            regions, genomic_read_positions)
    else:
        # Without this branch `counts` would be unbound below and the caller
        # would see an opaque UnboundLocalError.
        raise ValueError(
            "The count type must be one of 'range', 'fiveprime' or 'asite', got {!r}".format(count_type)
        )

    classified = {}
    for transcript in counts:
        buckets = classified.setdefault(transcript, {'pos': [], 'neg': []})
        for region, read_count in zip(regions[transcript], counts[transcript]):
            if read_count > 0:
                buckets['pos'].append(region)
            else:
                buckets['neg'].append(region)

    return classified