def ribo_seq_read_counting_raw(gene, sqlite_path_organism, sqlite_path_reads, count_type="range", unique=True):
    """Return raw ribo-seq read counts per ORF region for each transcript of a gene.

    Parameters:
        gene: gene identifier whose transcripts are counted.
        sqlite_path_organism: path to the organism annotation sqlite file.
        sqlite_path_reads: path(s) to the ribo-seq read sqlite file(s).
        count_type: which part of each read is used for feature counting;
            one of "range", "fiveprime" or "asite".
        unique: if True, restrict counting to regions unique to one transcript.

    Returns:
        Dict mapping each transcript to its per-region read counts, or the
        string "ERROR" when count_type is invalid.
    """
    # Validate up front so a bad count_type fails before any database work.
    if count_type not in ("range", "fiveprime", "asite"):
        print(
            "The count type must be one of 'range', 'fiveprime' or 'asite'. "
            "count_type refers to the part of the read that is used in the feature counting process")
        return "ERROR"
    supported = get_protein_coding_transcript_ids(gene, sqlite_path_organism)
    exclude = True
    orf_regions = genomic_orf_coordinate_ranges(gene, sqlite_path_organism, supported)
    if unique:
        orf_regions = get_unique_regions(orf_regions)
    if count_type == "range":
        # Count using the full genomic span of each read.
        genomic_read_ranges = get_read_ranges_genomic_location(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(
            orf_regions, genomic_read_ranges)
    elif count_type == "fiveprime":
        # Count using the 5' end position of each read.
        genomic_read_positions = get_reads_per_genomic_location_fiveprime(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            orf_regions, genomic_read_positions)
    else:  # count_type == "asite"
        # Count using the inferred A-site position of each read.
        genomic_read_positions = get_reads_per_genomic_location_asite(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            orf_regions, genomic_read_positions)
    return counts
def ribo_seq_read_counting(gene, sqlite_path_organism, sqlite_path_reads, count_type="range", unique=True):
    """Return the average ribo-seq coverage per transcript for a gene's ORF regions.

    Parameters:
        gene: gene identifier whose transcripts are scored.
        sqlite_path_organism: path to the organism annotation sqlite file.
        sqlite_path_reads: path(s) to the ribo-seq read sqlite file(s).
        count_type: which part of each read is used for feature counting;
            one of "range", "fiveprime" or "asite".
        unique: if True, restrict counting to regions unique to one transcript.

    Returns:
        Dict mapping each transcript to its average region coverage, or the
        string "ERROR" when count_type is invalid.
    """
    # Validate up front so a bad count_type fails before any database work.
    if count_type not in ("range", "fiveprime", "asite"):
        print(
            "The count type must be one of 'range', 'fiveprime' or 'asite'. "
            "count_type refers to the part of the read that is used in the feature counting process")
        return "ERROR"
    supported = get_protein_coding_transcript_ids(gene, sqlite_path_organism)
    exclude = True
    orf_regions = genomic_orf_coordinate_ranges(gene, sqlite_path_organism, supported)
    if unique:
        orf_regions = get_unique_regions(orf_regions)
    if count_type == "range":
        # Count using the full genomic span of each read.
        genomic_read_ranges = get_read_ranges_genomic_location(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(
            orf_regions, genomic_read_ranges)
    elif count_type == "fiveprime":
        # Count using the 5' end position of each read.
        genomic_read_positions = get_reads_per_genomic_location_fiveprime(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            orf_regions, genomic_read_positions)
    else:  # count_type == "asite"
        # Count using the inferred A-site position of each read.
        genomic_read_positions = get_reads_per_genomic_location_asite(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            orf_regions, genomic_read_positions)
    # Convert raw counts into per-region coverage, then average per transcript.
    region_coverage = get_coverage_per_region(orf_regions, counts)
    coverage = average_coverage_per_transcript(region_coverage)
    return coverage
def rna_seq_read_counting(gene, sqlite_path_organism, sqlite_path_reads, exclude=True, count_type="range"):
    """Identify the minimal transcript set needed to explain the RNA-seq reads of a gene.

    Combines exon-junction evidence with counts over transcript-unique exon
    regions: any transcript with at least one uniquely mapped read is added to
    the set produced by explain_exon_junctions.

    Parameters:
        gene: gene identifier whose transcripts are examined.
        sqlite_path_organism: path to the organism annotation sqlite file.
        sqlite_path_reads: path(s) to the RNA-seq read sqlite file(s).
        exclude: if True, only transcripts supported by the reads are used.
        count_type: which part of each read is used for feature counting;
            one of "range", "fiveprime" or "asite".

    Returns:
        List of transcript identifiers required to explain the reads, or the
        string "ERROR" when count_type is invalid.
    """
    # Validate up front so a bad count_type fails before any database work.
    if count_type not in ("range", "fiveprime", "asite"):
        print(
            "The count type must be one of 'range', 'fiveprime' or 'asite'. "
            "count_type refers to the part of the read that is used in the feature counting process")
        return "ERROR"
    if exclude:
        supported = filter_unsupported_transcripts(gene, sqlite_path_organism, sqlite_path_reads)
    else:
        gene_info = get_gene_info(gene, sqlite_path_organism)
        supported = [i[0] for i in gene_info]
    genomic_exon_coordinates = genomic_exon_coordinate_ranges(gene, sqlite_path_organism, supported)
    unique_regions = get_unique_regions(genomic_exon_coordinates)
    all_junctions, unique_junctions = unique_exon_junctions(genomic_exon_coordinates)
    junction_scores = get_scores_per_exonjunction_for_gene(
        gene, sqlite_path_organism, sqlite_path_reads, supported, filter=exclude)
    transcripts_to_explain_reads = explain_exon_junctions(junction_scores, all_junctions, unique_junctions)
    if count_type == "range":
        # Count using the full genomic span of each read.
        genomic_read_ranges = get_read_ranges_genomic_location(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(unique_regions, genomic_read_ranges)
    elif count_type == "fiveprime":
        # Count using the 5' end position of each read.
        genomic_read_positions = get_reads_per_genomic_location_fiveprime(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(unique_regions, genomic_read_positions)
    else:  # count_type == "asite"
        # Count using the inferred A-site position of each read.
        genomic_read_positions = get_reads_per_genomic_location_asite(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(unique_regions, genomic_read_positions)
    sum_of_exon_counts = {}
    maximum_sum = 0
    # Bug fix: maximum_transcript was previously unbound when every transcript
    # sum was 0, crashing with a NameError at the summary print below.
    maximum_transcript = None
    for transcript in counts:
        transcript_sum = sum(counts[transcript])
        if (transcript_sum > 0) and (transcript not in transcripts_to_explain_reads):
            transcripts_to_explain_reads.append(str(transcript))
        sum_of_exon_counts[transcript] = transcript_sum
        if transcript_sum > maximum_sum:
            maximum_sum = transcript_sum
            maximum_transcript = transcript
    if maximum_transcript is not None:
        print("The transcript with most uniquely mapped reads is {maximum_transcript} with a score of {maximum_sum}".format(
            maximum_transcript=str(maximum_transcript), maximum_sum=maximum_sum))
    return transcripts_to_explain_reads
def classify_regions_pos_neg(gene, sqlite_path_reads, sqlite_path_organism, regions, supported, exclude=True, count_type="range"):
    """Classify each genomic coordinate range as positive or negative per transcript.

    A region is 'pos' when one or more reads support it, 'neg' otherwise.

    Parameters:
        gene: gene identifier whose reads are counted.
        sqlite_path_reads: path(s) to the read sqlite file(s).
        sqlite_path_organism: path to the organism annotation sqlite file.
        regions: dict of transcript -> list of genomic coordinate ranges.
        supported: transcript identifiers to include.
        exclude: passed through as `filter` to the read-fetching helpers.
        count_type: which part of each read is used for feature counting;
            one of "range", "fiveprime" or "asite".

    Returns:
        Nested dict: {transcript: {"pos": [ranges...], "neg": [ranges...]}}.

    Raises:
        ValueError: if count_type is not a recognised value (previously an
            invalid count_type crashed later with an opaque NameError on
            `counts`).
    """
    if count_type == "range":
        # Count using the full genomic span of each read.
        genomic_read_ranges = get_read_ranges_genomic_location(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(
            regions, genomic_read_ranges)
    elif count_type == "fiveprime":
        # Count using the 5' end position of each read.
        genomic_read_positions = get_reads_per_genomic_location_fiveprime(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            regions, genomic_read_positions)
    elif count_type == "asite":
        # Count using the inferred A-site position of each read.
        genomic_read_positions = get_reads_per_genomic_location_asite(
            gene, sqlite_path_reads, sqlite_path_organism, supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            regions, genomic_read_positions)
    else:
        raise ValueError(
            "count_type must be one of 'range', 'fiveprime' or 'asite'")
    classified = {}
    for transcript in counts:
        # counts is a dict, so each transcript key is seen exactly once;
        # no membership guard is needed before initialising the entry.
        classified[transcript] = {'pos': [], 'neg': []}
        for region, count in zip(regions[transcript], counts[transcript]):
            if count > 0:
                classified[transcript]['pos'].append(region)
            else:
                classified[transcript]['neg'].append(region)
    return classified
import time
from tripsCountpy2 import count_read_supporting_regions_per_transcript
from tripsSplicepy2 import genomic_exon_coordinate_ranges
from tripsSplicepy2 import get_protein_coding_transcript_ids
from tripsSplicepy2 import get_reads_per_genomic_location_asite
from tripsCountpy2 import ribo_seq_read_counting
from tripsCountpy2 import ribo_seq_read_counting_raw

# Manual benchmark driver: times an ORFquant-style OPM computation for a
# single hard-coded gene against local sqlite annotation/read files.
if __name__ == "__main__":
    start = time.time()
    gene = "phpt1"
    sqlite_path_organism = "homo_sapiens.v2.sqlite"
    sqlite_path_reads = ["SRR2433794.sqlite"]
    coding = get_protein_coding_transcript_ids(gene, sqlite_path_organism)
    # A-site positions of reads restricted to the coding transcripts.
    genomic_read_positions = get_reads_per_genomic_location_asite(
        gene, sqlite_path_reads, sqlite_path_organism, coding, filter=True)
    # genomic_read_positions = get_reads_per_genomic_location_fiveprime(gene, sqlite_path_reads, sqlite_path_organism, coding, filter=True)
    exons = genomic_exon_coordinate_ranges(gene, sqlite_path_organism, coding)
    counts = count_read_supporting_regions_per_transcript(
        exons, genomic_read_positions)
    # NOTE(review): orfQuant_OPM is called but never imported above — this
    # script will raise NameError at runtime. Presumably it lives in
    # tripsCountpy2 alongside the other counting helpers; confirm the module
    # and add the import.
    orfQuant_res = orfQuant_OPM(gene, sqlite_path_organism,
                                sqlite_path_reads, coding, counts, filter=True)
    end = time.time()
    print("ORFquant OPM time: " + str(end - start))
def query():
    """Handle an interactive-plot query request (Flask-style endpoint).

    Reads plot parameters from `request.data`, resolves the input transcript
    (or gene) against the selected transcriptome's annotation sqlite, applies
    per-user display settings from trips.sqlite, and returns either the
    rendered plot HTML, a "TRANSCRIPTS:..." listing when the gene has several
    transcripts, or an error string.

    NOTE(review): every SQL statement below is built with str.format on
    user-supplied values — SQL injection risk; should use parameterized
    queries (`?` placeholders).
    """
    global user_short_passed
    tran_dict = {}
    gene_dict = {}
    ribo_user_files = {}
    # request.data is a Python-literal payload from the frontend.
    data = ast.literal_eval(request.data)
    tran = data['transcript'].upper().strip()
    readscore = data['readscore']
    secondary_readscore = data['secondary_readscore']
    minread = int(data['minread'])
    maxread = int(data['maxread'])
    minfiles = int(data['minfiles'])
    organism = data['organism']
    seqhili = data['seqhili'].split(",")
    hili_start = int(data['hili_start'])
    hili_stop = int(data['hili_stop'])
    transcriptome = data['transcriptome']
    advanced = data["advanced"]
    # Send file_list (a list of integers intentionally encoded as strings due to javascript), to be converted to a dictionary with riboseq/rnaseq lists of file paths.
    file_paths_dict = fetch_file_paths(data["file_list"], organism)
    primetype = data["primetype"]
    user_hili_starts = data["user_hili_starts"]
    user_hili_stops = data["user_hili_stops"]
    user_short = data["user_short"]
    connection = sqlite3.connect('{}/trips.sqlite'.format(config.SCRIPT_LOC))
    connection.text_factory = str
    cursor = connection.cursor()
    cursor.execute("SELECT owner FROM organisms WHERE organism_name = '{}' and transcriptome_list = '{}';".format(organism, transcriptome))
    owner = (cursor.fetchone())[0]
    # owner == 1 means a built-in annotation shipped under ANNOTATION_DIR;
    # otherwise the transcriptome was uploaded by a user under UPLOADS_DIR.
    if owner == 1:
        if os.path.isfile("{0}{1}/{1}.{2}.sqlite".format(config.ANNOTATION_DIR, organism, transcriptome)):
            sqlite_path_organism = "{0}{1}/{1}.{2}.sqlite".format(config.ANNOTATION_DIR, organism, transcriptome)
            transhelve = sqlite3.connect("{0}{1}/{1}.{2}.sqlite".format(config.ANNOTATION_DIR, organism, transcriptome))
        else:
            return "Cannot find annotation file {}.{}.sqlite".format(organism, transcriptome)
    else:
        sqlite_path_organism = "{0}transcriptomes/{1}/{2}/{3}/{2}_{3}.v2.sqlite".format(config.UPLOADS_DIR, owner, organism, transcriptome)
        transhelve = sqlite3.connect("{0}transcriptomes/{1}/{2}/{3}/{2}_{3}.v2.sqlite".format(config.UPLOADS_DIR, owner, organism, transcriptome))
    cursor = transhelve.cursor()
    # First try the input as a transcript id; fall back to treating it as a gene.
    cursor.execute("SELECT * from transcripts WHERE transcript = '{}'".format(tran))
    result = cursor.fetchone()
    inputtran = True
    if result != None:
        newtran = result[0]  # NOTE(review): assigned but never used.
    else:
        inputtran = False
    if inputtran == False:
        cursor.execute("SELECT * from transcripts WHERE gene = '{}'".format(tran))
        result = cursor.fetchall()
        if result != []:
            if len(result) == 1:
                # Single transcript for this gene: continue with it below.
                tran = str(result[0][0])
            else:
                # Multiple transcripts: return a "TRANSCRIPTS:..." listing,
                # annotated with ORFquant coverage per transcript.
                return_str = "TRANSCRIPTS"
                f = open("logfile.txt", "w")  # NOTE(review): opened but never written to.
                coding = get_protein_coding_transcript_ids(tran, sqlite_path_organism)
                genomic_read_positions = get_reads_per_genomic_location_asite(tran, file_paths_dict["riboseq"].values(), sqlite_path_organism, coding, filter=True)
                exons = genomic_exon_coordinate_ranges(tran, sqlite_path_organism, coding)
                counts = count_read_supporting_regions_per_transcript(exons, genomic_read_positions)
                orfQuant_res = orfQuant(tran, sqlite_path_organism, file_paths_dict["riboseq"].values(), coding, counts, filter=True)
                f.close()
                for transcript in result:
                    cursor.execute("SELECT length,cds_start,cds_stop,principal,version from transcripts WHERE transcript = '{}'".format(transcript[0]))
                    tran_result = cursor.fetchone()
                    tranlen = tran_result[0]
                    cds_start = tran_result[1]
                    cds_stop = tran_result[2]
                    if str(tran_result[3]) == "1":
                        principal = "principal"  # NOTE(review): computed but never used.
                    else:
                        principal = ""
                    version = tran_result[4]
                    # Non-coding transcripts store NULL/None for cds_start.
                    if cds_start == "NULL" or cds_start == None:
                        cdslen = "NULL"
                        threeutrlen = "NULL"
                    else:
                        cdslen = cds_stop - cds_start
                        threeutrlen = tranlen - cds_stop
                    if transcript[0] in orfQuant_res:
                        coverage = orfQuant_res[transcript[0]]
                    else:
                        coverage = "NULL"
                    return_str += (":{},{},{},{},{},{},{}".format(transcript[0], version, tranlen, cds_start, cdslen, threeutrlen, coverage))
                return return_str
        else:
            return "ERROR! Could not find any transcript corresponding to {}".format(tran)
    transhelve.close()
    # Translate the presence/absence of frontend checkbox keys into flags.
    if 'varlite' in data:
        lite = "y"
    else:
        lite = "n"
    if 'preprocess' in data:
        preprocess = True
    else:
        preprocess = False
    if 'uga_diff' in data:
        uga_diff = True
    else:
        uga_diff = False
    if 'color_readlen_dist' in data:
        color_readlen_dist = True
    else:
        color_readlen_dist = False
    if 'ribocoverage' in data:
        ribocoverage = True
    else:
        ribocoverage = False
    if "nucseq" in data:
        nucseq = True
    else:
        nucseq = False
    if "mismatches" in data:
        mismatches = True
    else:
        mismatches = False
    if "ambiguous" in data:
        ambiguous = "ambig"
    else:
        ambiguous = "unambig"
    if "pcr" in data:
        pcr = True
    else:
        pcr = False
    if "noisered" in data:
        noisered = True
    else:
        noisered = False
    if "mismatch" in data:
        mismatch = True
    else:
        mismatch = False
    # Reuse the caller's short code if supplied, otherwise generate one; the
    # global user_short_passed forces regeneration on subsequent calls.
    if data["user_short"] == "None" or user_short_passed == True:
        short_code = generate_short_code(data, organism, data["transcriptome"], "interactive_plot")
    else:
        short_code = data["user_short"]
        user_short_passed = True
    # NOTE(review): bare except — silently treats any failure (not just an
    # anonymous session) as "no logged-in user".
    try:
        user = current_user.name
    except:
        user = None
    connection = sqlite3.connect('{}/trips.sqlite'.format(config.SCRIPT_LOC))
    connection.text_factory = str
    cursor = connection.cursor()
    # Site-wide display defaults; overridden below by per-user settings.
    background_col = config.BACKGROUND_COL
    uga_col = config.UGA_COL
    uag_col = config.UAG_COL
    uaa_col = config.UAA_COL
    title_size = config.TITLE_SIZE
    subheading_size = config.SUBHEADING_SIZE
    axis_label_size = config.AXIS_LABEL_SIZE
    marker_size = config.MARKER_SIZE
    cds_marker_size = config.CDS_MARKER_SIZE
    cds_marker_colour = config.CDS_MARKER_COLOUR
    legend_size = config.LEGEND_SIZE
    ribo_linewidth = config.RIBO_LINEWIDTH
    #Put any publicly available seq types (apart from riboseq and rnaseq) here
    seq_rules = {"proteomics": {"frame_breakdown": 1}, "conservation": {"frame_breakdown": 1}, "tcpseq": {"frame_breakdown": 0}}
    #get user_id
    if user != None:
        # NOTE(review): the username literal '******' appears redacted — the
        # format call supplies `user` but the string has no '{}' placeholder;
        # confirm against the original source.
        cursor.execute("SELECT user_id from users WHERE username = '******';".format(user))
        result = (cursor.fetchone())
        user_id = result[0]
        #get a list of organism id's this user can access
        cursor.execute("SELECT background_col,uga_col,uag_col,uaa_col,title_size,subheading_size,axis_label_size,marker_size,cds_marker_width,cds_marker_colour,legend_size,ribo_linewidth from user_settings WHERE user_id = '{}';".format(user_id))
        result = (cursor.fetchone())
        background_col = result[0]
        uga_col = result[1]
        uag_col = result[2]
        uaa_col = result[3]
        title_size = result[4]
        subheading_size = result[5]
        axis_label_size = result[6]
        marker_size = result[7]
        cds_marker_size = result[8]
        cds_marker_colour = result[9]
        legend_size = result[10]
        ribo_linewidth = result[11]
        #get rules for all custom seq types
        cursor.execute("SELECT * from seq_rules WHERE user_id = {};".format(user_id))
        result = (cursor.fetchall())
        for row in result:
            seq_name = row[1]
            frame_breakdown = row[2]
            seq_rules[seq_name] = {"frame_breakdown": frame_breakdown}
    connection.close()
    if tran != "":
        x = riboflask.generate_plot(tran, ambiguous, minread, maxread, lite, ribocoverage, organism, readscore, noisered, primetype, minfiles, nucseq, user_hili_starts, user_hili_stops, uga_diff, file_paths_dict, short_code, color_readlen_dist, background_col, uga_col, uag_col, uaa_col, advanced, config.ANNOTATION_DIR, seqhili, seq_rules, title_size, subheading_size, axis_label_size, marker_size, transcriptome, config.UPLOADS_DIR, cds_marker_size, cds_marker_colour, legend_size, ribo_linewidth, secondary_readscore, pcr, mismatches, hili_start, hili_stop)
    else:
        x = "ERROR! Could not find any transcript corresponding to whatever you entered"
    return x