def orfQuant_OPM(gene, sqlite_path_organism, sqlite_path_reads, supported, counts, filter=True):
    """Compute ORFs-per-million (OPM) values for the supported transcripts of a gene.

    Args:
        gene: gene identifier to quantify.
        sqlite_path_organism: path to the organism annotation sqlite file.
        sqlite_path_reads: list of read (alignment) sqlite file paths.
        supported: transcript ids of this gene to quantify over.
        counts: per-transcript read counts over unique exon regions
            (e.g. from count_read_supporting_regions_per_transcript).
        filter: forwarded to junction scoring to filter unsupported transcripts.
            (Name shadows the builtin, kept for caller compatibility.)

    Returns:
        Dict mapping transcript id -> ORFs-per-million value.
    """
    # Genomic features (exons, junctions, ORF coordinates) for all transcripts.
    exons = genomic_exon_coordinate_ranges(gene, sqlite_path_organism, supported)
    junctions = genomic_junction_positions(gene, sqlite_path_organism, supported)
    orf_coordinates = genomic_orf_coordinate_ranges(gene, sqlite_path_organism, supported)
    orf_junctions = get_orf_coordinate_junctions(orf_coordinates)
    # NOTE(review): return value discarded — presumably called for side effects; confirm.
    features_per_orf(orf_coordinates, orf_junctions, exons, junctions)
    # BUG FIX: was hard-coded filter=True, silently ignoring the `filter` argument.
    junction_scores = genomic_junction_scores(gene, sqlite_path_organism, sqlite_path_reads, supported, filter=filter)
    # Split features into those unique to one transcript vs shared by several.
    unique_shared_exons = classify_regions_shared_unique(exons)
    unique_shared_junctions = classify_regions_shared_unique(junctions)
    coverage_exons = region_coverage(exons, counts)
    coverage_junctions = coverage_junction_transcript(junction_scores)
    # NOTE(review): result discarded — kept in case shared_coverage mutates its
    # arguments; confirm and remove if it is pure.
    shared_coverage(coverage_exons, coverage_junctions)
    average_unique = average_unique_coverage(unique_shared_exons, unique_shared_junctions, coverage_junctions, coverage_exons)
    average_all = all_feature_average(coverage_junctions, coverage_exons)
    # Baseline per-transcript ratio of unique-feature to all-feature coverage.
    cORF = cORF_ratio(average_unique, average_all)
    all_shared = True
    for transcript in supported:
        if (unique_shared_junctions[transcript]["unique"] == []
                and unique_shared_exons[transcript]["unique"] == []):
            # Transcript has no unique exon or junction: fall back to an
            # adjusted coverage estimate instead of the (undefined) unique ratio.
            cORF[transcript] = adjusted_coverage_for_non_unique_orfs(
                transcript, coverage_exons, coverage_junctions, average_all)
        else:
            all_shared = False
    if all_shared:
        # No transcript has any unique feature: use shared coverage wholesale.
        cORF = shared_coverage(coverage_exons, coverage_junctions)
    # BUG FIX: the former `else: cORF = cORF_ratio(average_unique, average_all)`
    # recomputed the ratio wholesale, discarding the per-transcript adjustments
    # made in the loop above; cORF already holds the correct mixed values here.
    adjusted_a_sites = aORF(cORF, counts)
    orf_lengths = lORF(supported, sqlite_path_organism)
    orfs_per_million = ORFs_per_million(adjusted_a_sites, orf_lengths)
    return orfs_per_million
def rna_seq_read_counting(gene, sqlite_path_organism, sqlite_path_reads, exclude=True, count_type="range"):
    """Count RNA-seq reads supporting each transcript of a gene and report
    which transcripts are required to explain the observed reads.

    Args:
        gene: gene identifier.
        sqlite_path_organism: path to the organism annotation sqlite file.
        sqlite_path_reads: list of read (alignment) sqlite file paths.
        exclude: if True, restrict to transcripts supported by the reads;
            also forwarded as `filter` to the scoring/read-fetch helpers.
        count_type: which part of each read to count against features —
            one of "range", "fiveprime" or "asite".

    Returns:
        List of transcript ids needed to explain the reads, or the string
        "ERROR" for an invalid count_type (kept for caller compatibility).
    """
    if exclude:
        supported = filter_unsupported_transcripts(gene, sqlite_path_organism, sqlite_path_reads)
    else:
        gene_info = get_gene_info(gene, sqlite_path_organism)
        supported = [i[0] for i in gene_info]
    genomic_exon_coordinates = genomic_exon_coordinate_ranges(gene, sqlite_path_organism, supported)
    unique_regions = get_unique_regions(genomic_exon_coordinates)
    all_junctions, unique_junctions = unique_exon_junctions(genomic_exon_coordinates)
    junction_scores = get_scores_per_exonjunction_for_gene(gene, sqlite_path_organism, sqlite_path_reads, supported, filter=exclude)
    # Transcripts already required to explain observed exon junctions.
    transcripts_to_explain_reads = explain_exon_junctions(junction_scores, all_junctions, unique_junctions)
    # Membership test instead of the former try/except around list.index().
    if count_type not in ("range", "fiveprime", "asite"):
        print("The count type must be one of 'range', 'fiveprime' or 'asite'. "
              "count_type refers to the part of the read that is used in the feature counting process")
        return "ERROR"
    if count_type == "range":
        genomic_read_ranges = get_read_ranges_genomic_location(gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(unique_regions, genomic_read_ranges)
    elif count_type == "fiveprime":
        genomic_read_positions = get_reads_per_genomic_location_fiveprime(gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(unique_regions, genomic_read_positions)
    elif count_type == "asite":
        genomic_read_positions = get_reads_per_genomic_location_asite(gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(unique_regions, genomic_read_positions)
    sum_of_exon_counts = {}
    maximum_sum = 0
    # BUG FIX: maximum_transcript was unbound (NameError at the print below)
    # when counts was empty or every transcript summed to zero.
    maximum_transcript = None
    for transcript in counts:
        transcript_sum = sum(counts[transcript])
        # BUG FIX: the dedup check compared the raw key while str(transcript)
        # is what gets appended; compare the same representation.
        if (transcript_sum > 0) and (str(transcript) not in transcripts_to_explain_reads):
            transcripts_to_explain_reads.append(str(transcript))
        sum_of_exon_counts[transcript] = transcript_sum
        if transcript_sum > maximum_sum:
            maximum_sum = transcript_sum
            maximum_transcript = transcript
    if maximum_transcript is not None:
        print("The transcript with most uniquely mapped reads is {maximum_transcript} with a score of {maximum_sum}".format(
            maximum_transcript=str(maximum_transcript), maximum_sum=maximum_sum))
    return transcripts_to_explain_reads
from tripsSplicepy2 import get_protein_coding_transcript_ids
from tripsSplicepy2 import get_reads_per_genomic_location_asite
from tripsCountpy2 import ribo_seq_read_counting
from tripsCountpy2 import ribo_seq_read_counting_raw

# Ad-hoc benchmark driver: times orfQuant_OPM on one sample gene / read file.
# NOTE(review): `time`, genomic_exon_coordinate_ranges,
# count_read_supporting_regions_per_transcript and orfQuant_OPM are not
# imported above — presumably defined or imported elsewhere in this file;
# verify before running this script standalone.
if __name__ == "__main__":
    start = time.time()
    gene = "phpt1"
    # Local annotation and read databases used for the benchmark.
    sqlite_path_organism = "homo_sapiens.v2.sqlite"
    sqlite_path_reads = ["SRR2433794.sqlite"]
    coding = get_protein_coding_transcript_ids(gene, sqlite_path_organism)
    # A-site-mapped read positions per genomic location for coding transcripts.
    genomic_read_positions = get_reads_per_genomic_location_asite(
        gene, sqlite_path_reads, sqlite_path_organism, coding, filter=True)
    # genomic_read_positions = get_reads_per_genomic_location_fiveprime(gene, sqlite_path_reads, sqlite_path_organism, coding, filter=True)
    exons = genomic_exon_coordinate_ranges(gene, sqlite_path_organism, coding)
    counts = count_read_supporting_regions_per_transcript(
        exons, genomic_read_positions)
    orfQuant_res = orfQuant_OPM(gene, sqlite_path_organism, sqlite_path_reads,
                                coding, counts, filter=True)
    end = time.time()
    print("ORFquant OPM time: " + str(end - start))
    # Second timing run begins here; its continuation is outside this view.
    start = time.time()
    gene = "phpt1"
def query():
    """Flask endpoint body: parse a plot request and return either rendered
    plot output or a "TRANSCRIPTS:..." listing string.

    Reads parameters from `request.data` (a python-literal dict), resolves
    the organism annotation sqlite, resolves the entered name as a transcript
    id (or a gene, possibly with several transcripts), applies per-user
    display settings, and delegates plotting to riboflask.generate_plot.

    NOTE(review): every SQL statement below is built with str.format on
    request-supplied values (tran, organism, transcriptome) — SQL injection
    risk; these should be parameterized (cursor.execute(..., (param,))).
    """
    global user_short_passed
    tran_dict = {}   # NOTE(review): unused in the visible code
    gene_dict = {}   # NOTE(review): unused in the visible code
    ribo_user_files = {}   # NOTE(review): unused in the visible code
    # Request payload arrives as a python-literal dict string.
    data = ast.literal_eval(request.data)
    tran = data['transcript'].upper().strip()
    readscore = data['readscore']
    secondary_readscore = data['secondary_readscore']
    minread = int(data['minread'])
    maxread = int(data['maxread'])
    minfiles = int(data['minfiles'])
    organism = data['organism']
    seqhili = data['seqhili'].split(",")
    hili_start = int(data['hili_start'])
    hili_stop = int(data['hili_stop'])
    transcriptome = data['transcriptome']
    advanced = data["advanced"]
    # Send file_list (a list of integers intentionally encoded as strings due
    # to javascript), to be converted to a dictionary with riboseq/rnaseq
    # lists of file paths.
    file_paths_dict = fetch_file_paths(data["file_list"], organism)
    primetype = data["primetype"]
    user_hili_starts = data["user_hili_starts"]
    user_hili_stops = data["user_hili_stops"]
    user_short = data["user_short"]
    # Look up the transcriptome's owner to locate its annotation sqlite file.
    connection = sqlite3.connect('{}/trips.sqlite'.format(config.SCRIPT_LOC))
    connection.text_factory = str
    cursor = connection.cursor()
    cursor.execute("SELECT owner FROM organisms WHERE organism_name = '{}' and transcriptome_list = '{}';".format(organism, transcriptome))
    owner = (cursor.fetchone())[0]
    if owner == 1:
        # Owner 1: site-provided annotation under ANNOTATION_DIR.
        if os.path.isfile("{0}{1}/{1}.{2}.sqlite".format(config.ANNOTATION_DIR, organism, transcriptome)):
            sqlite_path_organism = "{0}{1}/{1}.{2}.sqlite".format(config.ANNOTATION_DIR, organism, transcriptome)
            transhelve = sqlite3.connect("{0}{1}/{1}.{2}.sqlite".format(config.ANNOTATION_DIR, organism, transcriptome))
        else:
            return "Cannot find annotation file {}.{}.sqlite".format(organism, transcriptome)
    else:
        # Otherwise a user-uploaded transcriptome under UPLOADS_DIR.
        sqlite_path_organism = "{0}transcriptomes/{1}/{2}/{3}/{2}_{3}.v2.sqlite".format(config.UPLOADS_DIR, owner, organism, transcriptome)
        transhelve = sqlite3.connect("{0}transcriptomes/{1}/{2}/{3}/{2}_{3}.v2.sqlite".format(config.UPLOADS_DIR, owner, organism, transcriptome))
    cursor = transhelve.cursor()
    # First try the entered name as a transcript id.
    cursor.execute("SELECT * from transcripts WHERE transcript = '{}'".format(tran))
    result = cursor.fetchone()
    inputtran = True
    if result != None:
        newtran = result[0]  # NOTE(review): newtran is never used afterwards
    else:
        inputtran = False
    if inputtran == False:
        # Not a transcript id — try it as a gene name instead.
        cursor.execute("SELECT * from transcripts WHERE gene = '{}'".format(tran))
        result = cursor.fetchall()
        if result != []:
            if len(result) == 1:
                # Exactly one transcript for this gene: plot it directly below.
                tran = str(result[0][0])
            else:
                # Several transcripts: return a "TRANSCRIPTS:..." listing,
                # annotated with per-transcript ORF coverage, for the client
                # to present as a choice.
                return_str = "TRANSCRIPTS"
                f = open("logfile.txt", "w")  # NOTE(review): opened but never written to
                coding = get_protein_coding_transcript_ids(tran, sqlite_path_organism)
                genomic_read_positions = get_reads_per_genomic_location_asite(tran, file_paths_dict["riboseq"].values(), sqlite_path_organism, coding, filter=True)
                exons = genomic_exon_coordinate_ranges(tran, sqlite_path_organism, coding)
                counts = count_read_supporting_regions_per_transcript(exons, genomic_read_positions)
                orfQuant_res = orfQuant(tran, sqlite_path_organism, file_paths_dict["riboseq"].values(), coding, counts, filter=True)
                f.close()
                for transcript in result:
                    cursor.execute("SELECT length,cds_start,cds_stop,principal,version from transcripts WHERE transcript = '{}'".format(transcript[0]))
                    tran_result = cursor.fetchone()
                    tranlen = tran_result[0]
                    cds_start = tran_result[1]
                    cds_stop = tran_result[2]
                    if str(tran_result[3]) == "1":
                        principal = "principal"
                    else:
                        principal = ""
                    version = tran_result[4]
                    # cds_start may be stored as the string "NULL" or SQL NULL.
                    if cds_start == "NULL" or cds_start == None:
                        cdslen = "NULL"
                        threeutrlen = "NULL"
                    else:
                        cdslen = cds_stop-cds_start
                        threeutrlen = tranlen - cds_stop
                    if transcript[0] in orfQuant_res:
                        coverage = orfQuant_res[transcript[0]]
                    else:
                        coverage = "NULL"
                    return_str += (":{},{},{},{},{},{},{}".format(transcript[0], version, tranlen, cds_start, cdslen, threeutrlen, coverage))
                return return_str
        else:
            return "ERROR! Could not find any transcript corresponding to {}".format(tran)
    transhelve.close()
    # Presence-style checkbox flags: key present in the payload == enabled.
    if 'varlite' in data:
        lite = "y"
    else:
        lite = "n"
    if 'preprocess' in data:
        preprocess = True
    else:
        preprocess = False
    if 'uga_diff' in data:
        uga_diff = True
    else:
        uga_diff = False
    if 'color_readlen_dist' in data:
        color_readlen_dist = True
    else:
        color_readlen_dist = False
    if 'ribocoverage' in data:
        ribocoverage = True
    else:
        ribocoverage = False
    if "nucseq" in data:
        nucseq = True
    else:
        nucseq = False
    if "mismatches" in data:
        mismatches = True
    else:
        mismatches = False
    if "ambiguous" in data:
        ambiguous = "ambig"
    else:
        ambiguous = "unambig"
    if "pcr" in data:
        pcr = True
    else:
        pcr = False
    if "noisered" in data:
        noisered = True
    else:
        noisered = False
    if "mismatch" in data:
        mismatch = True
    else:
        mismatch = False
    # Reuse a user-supplied short URL code once; otherwise generate one.
    if data["user_short"] == "None" or user_short_passed == True:
        short_code = generate_short_code(data, organism, data["transcriptome"], "interactive_plot")
    else:
        short_code = data["user_short"]
        user_short_passed = True
    try:
        user = current_user.name
    except:
        user = None  # not logged in (or no Flask-Login context)
    connection = sqlite3.connect('{}/trips.sqlite'.format(config.SCRIPT_LOC))
    connection.text_factory = str
    cursor = connection.cursor()
    # Site-default display settings; overridden per-user below when logged in.
    background_col = config.BACKGROUND_COL
    uga_col = config.UGA_COL
    uag_col = config.UAG_COL
    uaa_col = config.UAA_COL
    title_size = config.TITLE_SIZE
    subheading_size = config.SUBHEADING_SIZE
    axis_label_size = config.AXIS_LABEL_SIZE
    marker_size = config.MARKER_SIZE
    cds_marker_size = config.CDS_MARKER_SIZE
    cds_marker_colour = config.CDS_MARKER_COLOUR
    legend_size = config.LEGEND_SIZE
    ribo_linewidth = config.RIBO_LINEWIDTH
    # Put any publicly available seq types (apart from riboseq and rnaseq) here
    seq_rules = {"proteomics": {"frame_breakdown": 1}, "conservation": {"frame_breakdown": 1}, "tcpseq": {"frame_breakdown": 0}}
    # get user_id
    if user != None:
        # NOTE(review): the '******' literal looks like a redacted placeholder —
        # .format(user) has no {} to fill, so as written this query cannot
        # match the current user. Confirm against the original source.
        cursor.execute("SELECT user_id from users WHERE username = '******';".format(user))
        result = (cursor.fetchone())
        user_id = result[0]
        # get a list of organism id's this user can access
        cursor.execute("SELECT background_col,uga_col,uag_col,uaa_col,title_size,subheading_size,axis_label_size,marker_size,cds_marker_width,cds_marker_colour,legend_size,ribo_linewidth from user_settings WHERE user_id = '{}';".format(user_id))
        result = (cursor.fetchone())
        background_col = result[0]
        uga_col = result[1]
        uag_col = result[2]
        uaa_col = result[3]
        title_size = result[4]
        subheading_size = result[5]
        axis_label_size = result[6]
        marker_size = result[7]
        cds_marker_size = result[8]
        cds_marker_colour = result[9]
        legend_size = result[10]
        ribo_linewidth = result[11]
        # get rules for all custom seq types
        cursor.execute("SELECT * from seq_rules WHERE user_id = {};".format(user_id))
        result = (cursor.fetchall())
        for row in result:
            seq_name = row[1]
            frame_breakdown = row[2]
            seq_rules[seq_name] = {"frame_breakdown": frame_breakdown}
    connection.close()
    if tran != "":
        # Delegate the actual plotting to riboflask with all gathered settings.
        x = riboflask.generate_plot(tran, ambiguous, minread, maxread, lite, ribocoverage, organism, readscore, noisered, primetype, minfiles, nucseq, user_hili_starts, user_hili_stops, uga_diff, file_paths_dict, short_code, color_readlen_dist, background_col, uga_col, uag_col, uaa_col, advanced, config.ANNOTATION_DIR, seqhili, seq_rules, title_size, subheading_size, axis_label_size, marker_size, transcriptome, config.UPLOADS_DIR, cds_marker_size, cds_marker_colour, legend_size, ribo_linewidth, secondary_readscore, pcr, mismatches, hili_start, hili_stop)
    else:
        x = "ERROR! Could not find any transcript corresponding to whatever you entered"
    return x
def orfQuant_signal(gene, sqlite_path_organism, sqlite_path_reads, supported, counts, filter=True):
    """Compute each ORF's share of the gene's ribosome-profiling signal.

    Same pipeline as orfQuant_OPM, but the final step reports the percentage
    of gene signal per ORF instead of ORFs-per-million.

    Args:
        gene: gene identifier to quantify.
        sqlite_path_organism: path to the organism annotation sqlite file.
        sqlite_path_reads: list of read (alignment) sqlite file paths.
        supported: transcript ids of this gene to quantify over.
        counts: per-transcript read counts over unique exon regions.
        filter: forwarded to junction scoring to filter unsupported transcripts.
            (Name shadows the builtin, kept for caller compatibility.)

    Returns:
        Dict mapping transcript id -> percentage of gene signal for its ORF.
    """
    # Genomic features (exons, junctions, ORF coordinates) for all transcripts.
    exons = genomic_exon_coordinate_ranges(gene, sqlite_path_organism, supported)
    junctions = genomic_junction_positions(gene, sqlite_path_organism, supported)
    orf_coordinates = genomic_orf_coordinate_ranges(gene, sqlite_path_organism, supported)
    orf_junctions = get_orf_coordinate_junctions(orf_coordinates)
    # NOTE(review): return value discarded — presumably called for side effects; confirm.
    features_per_orf(orf_coordinates, orf_junctions, exons, junctions)
    # BUG FIX: was hard-coded filter=True, silently ignoring the `filter` argument.
    junction_scores = genomic_junction_scores(gene, sqlite_path_organism, sqlite_path_reads, supported, filter=filter)
    # Split features into those unique to one transcript vs shared by several.
    unique_shared_exons = classify_regions_shared_unique(exons)
    unique_shared_junctions = classify_regions_shared_unique(junctions)
    coverage_exons = region_coverage(exons, counts)
    coverage_junctions = coverage_junction_transcript(junction_scores)
    # NOTE(review): result discarded — kept in case shared_coverage mutates its
    # arguments; confirm and remove if it is pure.
    shared_coverage(coverage_exons, coverage_junctions)
    average_unique = average_unique_coverage(unique_shared_exons, unique_shared_junctions, coverage_junctions, coverage_exons)
    average_all = all_feature_average(coverage_junctions, coverage_exons)
    # Baseline per-transcript ratio of unique-feature to all-feature coverage.
    cORF = cORF_ratio(average_unique, average_all)
    all_shared = True
    for transcript in supported:
        if (unique_shared_junctions[transcript]["unique"] == []
                and unique_shared_exons[transcript]["unique"] == []):
            # Transcript has no unique exon or junction: fall back to an
            # adjusted coverage estimate instead of the (undefined) unique ratio.
            cORF[transcript] = adjusted_coverage_for_non_unique_orfs(
                transcript, coverage_exons, coverage_junctions, average_all)
        else:
            all_shared = False
    if all_shared:
        # No transcript has any unique feature: use shared coverage wholesale.
        cORF = shared_coverage(coverage_exons, coverage_junctions)
    # BUG FIX: the former `else: cORF = cORF_ratio(average_unique, average_all)`
    # recomputed the ratio wholesale, discarding the per-transcript adjustments
    # made in the loop above; cORF already holds the correct mixed values here.
    adjusted_a_sites = aORF(cORF, counts)
    orf_lengths = lORF(supported, sqlite_path_organism)
    gene_signal_per_orf = pct_gene_signal_per_orf(adjusted_a_sites)
    return gene_signal_per_orf


# if __name__ == "__main__":
#     start= time.time()
#     gene = "igf2"
#     sqlite_path_organism =
"homo_sapiens.v2.sqlite" # sqlite_path_reads = ["SRR2433794.sqlite"] # coding = get_protein_coding_transcript_ids(gene, sqlite_path_organism) # genomic_read_positions = get_reads_per_genomic_location_asite(gene, sqlite_path_reads, sqlite_path_organism, # coding, filter=True) # # genomic_read_positions = get_reads_per_genomic_location_fiveprime(gene, sqlite_path_reads, sqlite_path_organism, coding, filter=True) # exons = genomic_exon_coordinate_ranges(gene, sqlite_path_organism, coding) # # counts = count_read_supporting_regions_per_transcript(exons, genomic_read_positions) # # orfQuant_res = orfQuant_OPM(gene, sqlite_path_organism, sqlite_path_reads, coding, counts, filter=True) # end = time.time() # print(end - start) # rankings = rank_based_on_dict_values(orfQuant_res) # print rankings