def control_handler(cnx, exon_type, summary, regulation="down"):
    """
    Load the control dictionaries, building any missing part on the fly.

    :param cnx: database connection handed to the helper functions
    :param exon_type: (str) the type of control exons
    :param summary: (str) name of the summary; it is part of the control \
    file name and is forwarded to the summarising helpers
    :param regulation: (str) regulation used to select the exons to remove
    :return: (tuple) the summarised control dictionary and the full \
    (non summarised) one
    """
    here = os.path.dirname(os.path.realpath(__file__))
    control_folder = "%s/control" % here
    control_file = "%s/control_%s.py" % (control_folder, summary)
    control_full = "%s/control_full.pkl" % control_folder

    ctrl_list, tmp = exon_control_handler.get_control_information(
        exon_type, control_file, control_full)
    exon2remove = union_dataset_function.get_exon_regulated_by_sf(
        cnx, regulation)

    # Step 1: nothing could be loaded, so build the base control data.
    if ctrl_list is None:
        print("Control dictionary was not found !")
        print("Creating control information")
        names, rows = exon_control_handler.get_control_exon_information(
            cnx, exon_type, exon2remove, regulation)
        rows = exon_control_handler.remove_redundant_gene_information(rows)
        tmp = exon_control_handler.create_a_temporary_dictionary(names, rows)
        ctrl_list = exon_control_handler.get_summary_dictionaries(
            names, tmp, summary)
        exon_control_handler.write_control_file(
            exon_type, control_file, str(ctrl_list))
        exon_control_handler.write_pickle(control_full, tmp)

    # Step 2: the relative exon/intron sizes are already there, we are done.
    if "rel_exon_intron_up" in ctrl_list.keys():
        return ctrl_list, tmp

    print("relative exon_intron size where not found.")
    print("getting relative exon_intron size")
    size_rows = get_control_exon_size_information(cnx, exon_type,
                                                  exon2remove)
    print("Relative size calculation...")
    size_dic = tmp_dic_creator(size_rows)
    tmp = dict(tmp, **size_dic)
    print("summarizing")
    size_summary = get_summary_dictionaries(size_dic, summary)
    ctrl_list = dict(ctrl_list, **size_summary)
    print("writting...")
    write_adapted_dic(control_file, exon_type, str(ctrl_list))
    exon_control_handler.write_pickle(control_full, tmp)
    return ctrl_list, tmp
def control_dictionaries_creator():
    """
    Create the control file containing the minimum free energy (mfe) \
    values computed at the 3'ss and 5'ss of every CCE control exon.

    The result is written in ``control_dictionaries/CCE_mfe.py`` next to \
    this file, as two assignments (``mfe_3ss`` and ``mfe_5ss``).
    """
    exon_class.set_debug(0)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    fasterdb = dir_path.replace("src/minimum_free_energy",
                                "data/fasterDB_lite.db")
    seddb = dir_path.replace("src/minimum_free_energy", "data/sed.db")
    ctrl_dir = dir_path + "/control_dictionaries/"
    exon_type = "CCE"
    cnx = sqlite3.connect(fasterdb)
    cnx_sed = sqlite3.connect(seddb)
    try:
        # exist_ok avoids failing when the folder already exists.
        os.makedirs(ctrl_dir, exist_ok=True)
        exon2remove = union_dataset_function.get_exon_regulated_by_sf(
            cnx_sed, "down")
        ctrl_exon_list = get_control_exon_information(cnx, exon_type,
                                                      exon2remove)
        print("retrieving upstream intron sequence")
        list_exon = [
            exon_class.ExonClass(cnx, exon[0], exon[1], exon[2])
            for exon in ctrl_exon_list
        ]
        print("calculating mfe")
        mfe_list_3ss, mfe_list_5ss = function.mfe_calculator(list_exon)
        # The context manager closes the file even if a write fails.
        with open(ctrl_dir + exon_type + "_mfe.py", "w") as cur_file:
            cur_file.write("mfe_3ss=" + str(mfe_list_3ss) + "\n")
            cur_file.write("mfe_5ss=" + str(mfe_list_5ss) + "\n")
    finally:
        # Both connections were previously left open.
        cnx_sed.close()
        cnx.close()
def irimia_analysis(cnx, exon_type, output, regulation, size_threshold,
                    target_column, dic_bed):
    """
    Compare the GC content of big control exons with the GC content of \
    the small exons given in ``dic_bed``.

    Histograms of the exon size and of the GC content of both groups are \
    created, then the test result is written by ``write_test_result``.

    :param cnx: database connection to the Sed database
    :param exon_type: (str) the type of control exons to analyse
    :param output: (str) folder where the results will be created
    :param regulation: (str) the regulation
    :param size_threshold: (int) the threshold separating small and big exons
    :param target_column: (str) the feature of interest
    :param dic_bed: (dict) the small exons (Irimia et al.); it is only \
    handed to ``get_size`` and ``get_gc_content`` here
    """
    regulated = udf.get_exon_regulated_by_sf(cnx, regulation)
    control_exons = get_control_exon(cnx, exon_type, regulated, regulation)
    dic_size = get_list_of_value(cnx, control_exons, target_column)
    # Only the big control exons are kept: the small group comes from dic_bed.
    _, big_exons = get_two_groups_of_exon(dic_size, size_threshold,
                                          target_column)

    irimia_sizes = get_size(dic_bed)
    make_histogram(irimia_sizes, output, "hist_of_Irimia_exon_size",
                   target_column, log=True)
    print(" nb exons having a size below/equal to %s nt (Irimia) : %s" %
          (size_threshold, len(dic_bed.keys())))
    print(" nb exons having a size greater to %s nt : %s" %
          (size_threshold, len(big_exons)))

    small_gc = get_gc_content(dic_bed)
    big_gc = get_list_of_value_iupac_dnt(cnx, big_exons, "iupac_exon", "S")
    small_title = "GC content Irimia micro-exons (<= %s nt)" % size_threshold
    big_title = "GC content exons > %s nt" % size_threshold
    make_histogram(small_gc, output, "gc_content_small_Irimia_exon",
                   small_title)
    make_histogram(big_gc, output, "gc_content_big_%s_exon" % exon_type,
                   big_title)
    write_test_result(big_gc, small_gc, size_threshold, output, exon_type)
def main():
    """
    Write the list of CCE control exons in ``data/input/CCE_exons.txt``.

    The exons down-regulated by a splicing factor (read from the Sed \
    database) are removed from the control exons read from FasterDB.
    """
    base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    fasterdb = base_dir + "/data/fasterDB_lite.db"
    seddb = base_dir + "/data/sed.db"
    cnx = sqlite3.connect(fasterdb)
    cnx_sed = sqlite3.connect(seddb)
    try:
        exon2remove = udf.get_exon_regulated_by_sf(cnx_sed, "down")
        my_exons = get_control_exon_information(cnx, "CCE", exon2remove)
    finally:
        # Both connections were previously left open.
        cnx_sed.close()
        cnx.close()
    # NOTE(review): this path is relative to the current working directory,
    # unlike the database paths above - confirm this is intended.
    # NOTE(review): the join assumes every item of my_exons is a str.
    with open("data/input/CCE_exons.txt", "w") as outfile:
        outfile.write("\n".join(my_exons) + "\n")
def main():
    """
    Create two bed files with the data of GC-AT exons.

    The first one (``data_for_regulated_exons``) describes the exons \
    down-regulated by a splicing factor; the second one \
    (``data_for_regulated_CCE_exons``) describes the CCE control exons \
    together with those regulated exons.
    """
    # The project root is three folders above this file.
    base = os.path.realpath(__file__)
    for _ in range(3):
        base = os.path.dirname(base)
    seddb = "%s/data/sed.db" % base
    fasterdb = "%s/data/fasterDB_lite.db" % base
    output = "%s/result/correlation_GC-AT-exons_TAD" % base
    if not os.path.isdir(output):
        os.mkdir(output)

    # --- exons regulated by a splicing factor ---
    cnx = sqlite3.connect(seddb)
    u1_factors = ["SNRPC", "SNRNP70", "DDX5_DDX17"]
    u2_factors = ["U2AF2", "SF1", "SF3A3", "SF3B4"]
    u1_exons = [[int(val) for val in exon]
                for exon in get_exons_list(cnx, u1_factors, "down")]
    u2_exons = [[int(val) for val in exon]
                for exon in get_exons_list(cnx, u2_factors, "down")]
    print("U1-exons : %s exons" % len(u1_exons))
    print("U2-exons : %s exons" % len(u2_exons))
    regulated_exons = udf.get_exon_regulated_by_sf(cnx, "down")
    print("Getting exon data ...")
    regulated_data = get_exon_info(cnx, seddb, fasterdb, regulated_exons,
                                   u1_exons, u2_exons)
    print("Writing bed")
    write_bed(output, regulated_data, "data_for_regulated_exons")
    cnx.close()

    # --- CCE control exons + regulated exons ---
    cnx = sqlite3.connect(seddb)
    exon2remove = udf.get_exon_regulated_by_sf(cnx, "down")
    cnx_fasterdb = sqlite3.connect(fasterdb)
    control_exons = get_control_exon_information(cnx_fasterdb, "CCE",
                                                 exon2remove)
    all_exons = control_exons + exon2remove
    cnx_fasterdb.close()
    print("CCE exons + regulated exons : %s" % len(all_exons))
    print("Getting CCE + regulated exon data ...")
    all_data = get_exon_info(cnx, seddb, fasterdb, all_exons, u1_exons,
                             u2_exons)
    print("Writing bed")
    write_bed(output, all_data, "data_for_regulated_CCE_exons")
    cnx.close()
def my_level_analysis(cnx, exon_type, output, regulation, size_threshold,
                      target_column, level="control"):
    """
    Create the histogram of the size of exons and make a levene test \
    to test if the variance of GC content of a group of big exons \
    and a group of small exons is different.

    :param cnx: database connection to the Sed database
    :param exon_type: (str) the type of control exons to analyse
    :param output: (str) folder where the results will be created
    :param regulation: (str) the regulation
    :param size_threshold: (int) the threshold separating small and big exons
    :param target_column: (str) the feature of interest
    :param level: (str) ``"control"`` to analyse the control exons; any \
    other value analyses the exons returned by ``udf.get_exon_regulated``
    """
    # Computed before exon_type may be replaced by "SF-down" below.
    sizefig = "hist_of_%s_exon_size" % exon_type
    if level == "control":
        exon_2_remove = udf.get_exon_regulated_by_sf(cnx, regulation)
        exon_list = get_control_exon(cnx, exon_type, exon_2_remove,
                                     regulation)
    else:
        exon_list = udf.get_exon_regulated(cnx, regulation)
        exon_type = "SF-down"
    dic_size = get_list_of_value(cnx, exon_list, target_column)
    # The values are read with the requested column rather than the
    # hard-coded "exon_size" key, consistently with the other calls below
    # that are all driven by target_column.
    # NOTE(review): assumes get_list_of_value keys its result by
    # target_column - confirm against its definition.
    list_size = np.array(dic_size[target_column])
    print(" min : %s" % min(list_size))
    print(" max : %s" % max(list_size))
    print(" nb exons having a size below/equal %s : %s" %
          (size_threshold, len(list_size[list_size <= size_threshold])))
    make_histogram(list_size, output, sizefig, target_column, log=True)
    small_exons, big_exons = get_two_groups_of_exon(dic_size, size_threshold,
                                                    target_column)
    print(" nb exons having a size below/equal to %s nt : %s" %
          (size_threshold, len(small_exons)))
    print(" nb exons having a size greater to %s nt : %s" %
          (size_threshold, len(big_exons)))
    small_gc = get_list_of_value_iupac_dnt(cnx, small_exons, "iupac_exon",
                                           "S")
    big_gc = get_list_of_value_iupac_dnt(cnx, big_exons, "iupac_exon", "S")
    make_histogram(small_gc, output, "gc_content_small_%s_exon" % exon_type,
                   "GC content exons <= %s nt" % size_threshold)
    make_histogram(big_gc, output, "gc_content_big_%s_exon" % exon_type,
                   "GC content exons > %s nt" % size_threshold)
    write_test_result(big_gc, small_gc, size_threshold, output, exon_type)
def main(branch_point_file, name_bp_file, at_file, gc_file, fasterdb, seddb,
         output, exon_type="CCE"):
    """
    Create the GC/AT barplots with experimental branch points.

    The intermediate table is cached in ``output``: when it already \
    exists it is reloaded instead of being computed again.

    :param branch_point_file: (str) a bed file containing branch point
    :param name_bp_file: (str) the name of the experimental bp file
    :param at_file: (str) a file containing AT exons
    :param gc_file: (str) a file containing GC exons
    :param fasterdb: (str) path to fasterdb database
    :param seddb: (str) path to sed database
    :param output: (str) folder where the figures will be created
    :param exon_type: (str) the type of control exons
    """
    result_file = "%s/intron_experimental_%s_table.txt" % (output,
                                                           name_bp_file)
    if os.path.isfile(result_file):
        print("Recovering %s" % result_file)
        df = pd.read_csv(result_file, sep="\t")
    else:
        at_exon = read_file(at_file)
        gc_exon = read_file(gc_file)
        list_bp = read_file(branch_point_file)
        print(len(list_bp))
        cnx = sqlite3.connect(fasterdb)
        cnx_sed = sqlite3.connect(seddb)
        regulated = udf.get_exon_regulated_by_sf(cnx_sed, "down")
        exon2remove = [[int(val) for val in exon] for exon in regulated]
        ctrl_exons = get_ctrl_exons(cnx, exon_type, exon2remove)
        # One label per exon, in the same order as exon_list.
        exon_list = gc_exon + at_exon + ctrl_exons
        type_exon = ["GC-exons"] * len(gc_exon)
        type_exon = type_exon + ["AT-exons"] * len(at_exon)
        type_exon = type_exon + ["%s-exons" % exon_type] * len(ctrl_exons)
        intron_data = get_intron_coordinates(cnx, exon_list, type_exon)
        df = get_intron_bp_data(intron_data, list_bp)
        print(df.head())
        df.to_csv(result_file, sep="\t", index=False)
        cnx.close()
        cnx_sed.close()
    figure_name = os.path.basename(result_file).replace(".txt", "")
    create_barplot(df, output, figure_name)
def main_1d(list_file, name_file, seddb, exon_type, regulation, output, nt):
    """
    Create the 1.D figures with a custom list of exons.

    The control exons are always added as the last group. The lists \
    given by the caller are not modified.

    :param list_file: (list of str) list of exons files in the form \
    of GC_rich_exon file.
    :param name_file: (list of str) the name of each file of exons \
    given in ``list_file``
    :param seddb: (str) path to sed database
    :param exon_type: (str) the control exons
    :param regulation: (str) the regulation wanted up or down
    :param output: (str) path where the result will be created
    :param nt: (str) the nt we want to use for the figure 1.1D
    """
    # Work on copies: appending to the caller's lists made them grow at
    # every call.
    all_files = list(list_file) + [None]
    all_names = list(name_file) + ["%s_exons" % exon_type]
    my_level = "exons"
    list_gc_content = []
    list_intron_size = []
    cnx = sqlite3.connect(seddb)
    try:
        exon2remove = union_dataset_function.get_exon_regulated_by_sf(
            cnx, regulation)
        for index, cur_name in enumerate(all_names):
            if exon_type not in cur_name:
                # A group of exons described by a file.
                cur_file = all_files[index]
                list_gc_content.append(
                    boxplot_gc_content_maker.
                    extract_exon_gc_content_from_file(cnx, cur_file, nt))
                list_intron_size.append(
                    boxplot_flanking_intron_size.
                    extract_exon_min_flanking_intron_size_from_file(
                        cnx, cur_file))
            else:
                # A name containing exon_type designates the control exons.
                list_gc_content.append(
                    boxplot_gc_content_maker.get_exon_control_gc_content(
                        cnx, exon_type, exon2remove, nt))
                list_intron_size.append(
                    boxplot_flanking_intron_size.
                    get_exon_control_min_flanking_intron_size(
                        cnx, exon_type, exon2remove))
        create_figure(list_gc_content, all_names, output, regulation,
                      "1.1D_%s_content" % nt, my_level)
        dataframe_creator(list_gc_content, all_names, output, regulation,
                          "1.1D_%s_content" % nt, my_level)
        create_figure(list_intron_size, all_names, output, regulation,
                      "1.2D_min_intron_size", my_level)
        dataframe_creator(list_intron_size, all_names, output, regulation,
                          "1.2D_min_intron_size", my_level)
    finally:
        # Release the connection even when a step above fails.
        cnx.close()
def control_handler(cnx, exon_type, regulation):
    """
    Load the control dictionaries, creating them when they do not exist.

    :param cnx: database connection handed to the helper functions
    :param exon_type: (str) the type of control exons
    :param regulation: (str) regulation used to select the exons to remove
    :return: (tuple) the summarised control dictionary and the full \
    (non summarised) one
    """
    control_folder = "%s/control" % os.path.dirname(
        os.path.realpath(__file__))
    control_file = "%s/control.py" % control_folder
    control_full = "%s/control_full.pkl" % control_folder

    ctrl_list, tmp = get_control_information(exon_type, control_file,
                                             control_full)
    if ctrl_list is not None:
        return ctrl_list, tmp

    print("Control dictionary was not found !")
    print("Creating control information")
    exon2remove = union_dataset_function.get_exon_regulated_by_sf(
        cnx, regulation)
    names, rows = get_control_exon_information(cnx, exon_type, exon2remove,
                                               regulation)
    rows = remove_redundant_gene_information(rows)
    tmp = create_a_temporary_dictionary(names, rows)
    ctrl_list = get_summary_dictionaries(names, tmp)
    # Both files are written so the next call can simply reload them.
    write_control_file(exon_type, control_file, str(ctrl_list))
    write_pickle(control_full, tmp)
    return ctrl_list, tmp
def control_dictionaries_creator(window_size):
    """
    Create the control files containing the windowed metagene vectors of \
    the control exons.

    One file named ``<exon_type>_metagene_windowsed.py`` is written in \
    the ``control_dictionaries`` folder (next to this file) for the ACE \
    and the CCE exons.

    :param window_size: (int) the size of the window we want to use to \
    create the control metagene windowsed dictionaries
    """
    exon_class_metaexon.set_debug(0)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    fasterdb = dir_path.replace("src/metaexon_figure",
                                "data/fasterDB_lite.db")
    seddb = dir_path.replace("src/metaexon_figure", "data/sed.db")
    ctrl_dir = dir_path + "/control_dictionaries/"
    cnx = sqlite3.connect(fasterdb)
    cnx_sed = sqlite3.connect(seddb)
    try:
        # exist_ok avoids failing when the folder already exists.
        os.makedirs(ctrl_dir, exist_ok=True)
        exon2remove = union_dataset_function.get_exon_regulated_by_sf(
            cnx_sed, "down")
        for cur_exon_type in ["ACE", "CCE"]:
            ctrl_exon_list = get_control_exon_information(
                cnx, cur_exon_type, exon2remove)
            list_exon = [
                exon_class_metaexon.ExonClass(cnx, exon[0], exon[1], exon[2],
                                              window_size)
                for exon in ctrl_exon_list
            ]
            print("creating metagene windowsed")
            final_res_5p, final_res_3p, p5_analyzed, p3_analyzed = \
                exon_class_metaexon.get_metagene_vectors_windowsed(
                    list_exon, window_size)
            print(final_res_5p)
            out_name = ctrl_dir + cur_exon_type + "_metagene_windowsed.py"
            # The context manager closes the file even if a write fails.
            with open(out_name, "w") as cur_file:
                cur_file.write("final_res_5p=" + str(final_res_5p) + "\n")
                cur_file.write("final_res_3p=" + str(final_res_3p) + "\n")
                cur_file.write("# " + str(p5_analyzed) +
                               " sequences 5' analysees\n")
                cur_file.write("# " + str(p3_analyzed) +
                               " sequences 3' analysees\n")
    finally:
        # Both connections were previously left open.
        cnx_sed.close()
        cnx.close()
def main():
    """
    Create a bed file of the predicted branch points of the GC, AT and \
    CCE control exons.

    The file ``predicted_branch_points.bed`` is written in the \
    ``result/experimental_branch_point`` folder of the project.
    """
    regulation = "down"
    exon_class_bp.set_debug(0)
    # The project root is three folders above this file.
    base = os.path.abspath(__file__)
    for _ in range(3):
        base = os.path.dirname(base)
    output = "%s/result/experimental_branch_point" % base
    at_exon_file = "%s/result/AT_rich_exons" % base
    gc_exon_file = "%s/result/GC_rich_exons" % base
    fasterdb = "%s/data/fasterDB_lite.db" % base
    seddb = "%s/data/sed.db" % base
    cnx = sqlite3.connect(fasterdb)
    cnx_sed = sqlite3.connect(seddb)
    exon_type = "CCE"
    at_exon = read_file(at_exon_file)
    gc_exon = read_file(gc_exon_file)
    regulated = udf.get_exon_regulated_by_sf(cnx_sed, regulation)
    exon2remove = [[int(val) for val in exon] for exon in regulated]
    ctrl_exons = get_ctrl_exons(cnx, exon_type, exon2remove)
    # One label per exon, in the same order as exon_list.
    exon_list = gc_exon + at_exon + ctrl_exons
    type_exon = ["GC-exons"] * len(gc_exon)
    type_exon = type_exon + ["AT-exons"] * len(at_exon)
    type_exon = type_exon + ["%s-exons" % exon_type] * len(ctrl_exons)
    tot = len(exon_list)
    count_none = 0
    print("Creating bed of predicted branch points")
    with open("%s/predicted_branch_points.bed" % output, "w") as outf:
        for count, (coord, name_exon) in enumerate(
                zip(exon_list, type_exon), start=1):
            cur_exon = exon_class_bp.ExonClass(cnx, str(coord[0]), coord[0],
                                               coord[1])
            # Only the positions are used; the first returned value is not.
            _, list_pos = function_bp.goob_bp_only(cur_exon)
            if list_pos is None:
                count_none += 1
            else:
                for line in list_pos:
                    line[3] += "_" + name_exon
                    line[0] = "chr" + str(line[0])
                    outf.write("\t".join([str(val) for val in line]) + "\n")
            # Progress: exons done / total (exons without positions).
            sys.stdout.write("%s/%s (%s) \r" % (count, tot, count_none))
    cnx.close()
    cnx_sed.close()
def control_dictionaries_creator():
    """
    Create the control files containing the values corresponding to the \
    score of bp and ppt for every control exons.

    One file named ``<exon_type>_<size>_bp_ppt_score.py`` is written in \
    the ``control_dictionaries`` folder (next to this file) for each \
    size in 100, 50, 35 and 25.
    """
    exon_class_bp.set_debug(0)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    fasterdb = dir_path.replace("src/make_control_files_bp_ppt",
                                "data/fasterDB_lite.db")
    seddb = dir_path.replace("src/make_control_files_bp_ppt", "data/sed.db")
    ctrl_dir = dir_path + "/control_dictionaries/"
    sizes = [100, 50, 35, 25]
    cnx = sqlite3.connect(fasterdb)
    cnx_sed = sqlite3.connect(seddb)
    try:
        exon2remove = union_dataset_function.get_exon_regulated_by_sf(
            cnx_sed, "down")
        # exist_ok avoids failing when the folder already exists.
        os.makedirs(ctrl_dir, exist_ok=True)
        for cur_exon_type in ["CCE"]:
            ctrl_exon_list = get_control_exon_information(
                cnx, cur_exon_type, exon2remove)
            print("retrieving upstream intron sequence")
            list_exon = [
                exon_class_bp.ExonClass(cnx, exon[0], exon[1], exon[2])
                for exon in ctrl_exon_list
            ]
            for size in sizes:
                print("calculating bp and ppt score")
                (bp_score_list, ppt_score_list, nb_bp_list, nb_good_bp_list,
                 sequence_list, ag_count_list, hbound_list, uaa_list,
                 una_list) = function_bp.bp_ppt_calculator(list_exon, size)
                out_name = (ctrl_dir + cur_exon_type + "_" + str(size) +
                            "_bp_ppt_score.py")
                # The context manager closes the file even if a write fails.
                with open(out_name, "w") as cur_file:
                    cur_file.write("bp_score=" + str(bp_score_list) + "\n")
                    cur_file.write("ppt_score=" + str(ppt_score_list) + "\n")
                    cur_file.write("nb_bp=" + str(nb_bp_list) + "\n")
                    cur_file.write("nb_good_bp=" + str(nb_good_bp_list) +
                                   "\n")
                    cur_file.write("bp_seq=" + str(sequence_list) + "\n")
                    cur_file.write("ag_count=" + str(ag_count_list) + "\n")
                    cur_file.write("hbound=" + str(hbound_list) + "\n")
                    cur_file.write("uaa_count=" + str(uaa_list) + "\n")
                    cur_file.write("una_count=" + str(una_list) + "\n")
    finally:
        # Both connections were previously left open.
        cnx_sed.close()
        cnx.close()
def main():
    """
    Create the bed files of small exons, of small exons down-regulated \
    by a splicing factor and of control exons.

    The files are written in ``result/variance_analysis/bed_file/``, \
    along with ``small.txt`` that contains the fourth field of every \
    small exon.
    """
    base = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    seddb = base + "/data/sed.db"
    fasterdb = base + "/data/fasterDB_lite.db"
    output = base + "/result/variance_analysis/bed_file/"
    # The output folder is nested: makedirs also creates the missing
    # intermediate "variance_analysis" folder, which os.mkdir cannot do.
    os.makedirs(output, exist_ok=True)
    exon_type = "CCE"
    regulation = "down"
    threshold = 27
    cnx = sqlite3.connect(seddb)
    try:
        exon2remove = udf.get_exon_regulated_by_sf(cnx, regulation)
        print("Get every FasterDB small exons : (exons with a size greater than 2"
              " nucleotides and lower than %s nucleotides" % threshold)
        small_exons = get_small_exons(cnx, seddb, fasterdb, threshold,
                                      exon_type, exon2remove)
        print("\tSmall exons found : %s" % len(small_exons))
        print("Getting small exons down-regulated by a splicing factor ...")
        r = [exon[3] for exon in small_exons]
        with open("%s/small.txt" % output, "w") as ouf:
            ouf.write("\n".join(r) + "\n")
        small_down_sf_exon = get_small_sf_down_exons(cnx, seddb, fasterdb,
                                                     threshold)
        print("\tSmall exons downregulated by a splicing factor found : %s" %
              len(small_down_sf_exon))
        print("Getting control exons ...")
        ctrl_exon = get_control_exons(cnx, seddb, fasterdb, threshold,
                                      exon2remove, exon_type)
        print("\tControl exons found : %s" % len(ctrl_exon))
    finally:
        # Release the connection even when a step above fails.
        cnx.close()
    print("Writing results")
    write_bed_file(small_exons, output,
                   "small_%s_exons_(3-%snt).bed" % (exon_type, threshold))
    write_bed_file(small_down_sf_exon, output,
                   "small_sf-downregulated_exons_(3-%snt).bed" % threshold)
    write_bed_file(ctrl_exon, output,
                   "%s_exons_%snt+.bed" % (exon_type, threshold))
def main():
    """
    Create the boxplot figures and tables comparing GC exons, AT exons \
    and control (CCE) exons.

    For the "exons" level, the GC content, the minimum flanking intron \
    size and the exon size are collected for each group, then a figure \
    and a dataframe are created for each of those features.
    """
    exon_type = "CCE"
    regulation = "down"
    # Every path is derived from the location of this file.
    output = os.path.realpath(os.path.dirname(__file__)).replace(
        "src/boxplot_GC_content_and_flanking_intron_size",
        "result/boxplot_gc_content_and_flanking_intron_size/")
    if not os.path.isdir(output):
        os.mkdir(output)
    path = os.path.realpath(os.path.dirname(__file__)).replace(
        "src/boxplot_GC_content_and_flanking_intron_size", "result/")
    seddb = os.path.realpath(os.path.dirname(__file__)).replace(
        "src/boxplot_GC_content_and_flanking_intron_size", "data/sed.db")
    # NOTE(review): this connection is never closed in this function.
    cnx = sqlite3.connect(seddb)
    exon2remove = union_dataset_function.get_exon_regulated_by_sf(
        cnx, regulation)
    # The first field of each exon to remove is used as a gene identifier.
    gene2remove = [exon[0] for exon in exon2remove]
    at_file_pure = "%sAT_rich_exons" % path
    gc_file_pure = "%sGC_rich_exons" % path
    gene2remove_at_gc = get_common_genes(at_file_pure, gc_file_pure, output)
    # Only the "exons" level is enabled: the "genes" branches below are not
    # reached with this list.
    # levels = ["exons", "genes"]
    levels = ["exons"]
    for my_level in levels:
        # name_file and list_file are parallel lists; the control group has
        # no file (None) and is recognised by exon_type in its name.
        name_file = [
            "GC_pure_%s" % my_level,
            "AT_pure_%s" % my_level,
            "%s_%s" % (exon_type, my_level)
        ]
        list_file = [gc_file_pure, at_file_pure, None]
        list_gc_content = []
        list_intron_size = []
        list_gene_size = []
        list_exon_size = []
        for i in range(len(name_file)):
            if "exons" in name_file[i]:
                if exon_type not in name_file[i]:
                    # Group of exons described by a file.
                    list_gc_content.append(
                        boxplot_gc_content_maker.
                        extract_exon_gc_content_from_file(cnx, list_file[i]))
                    list_intron_size.append(
                        boxplot_flanking_intron_size.
                        extract_exon_min_flanking_intron_size_from_file(
                            cnx, list_file[i]))
                    list_exon_size.append(
                        boxplot_flanking_intron_size.
                        extract_exon_size_from_file(cnx, list_file[i]))
                else:
                    # Control exons.
                    list_gc_content.append(
                        boxplot_gc_content_maker.get_exon_control_gc_content(
                            cnx, exon_type, exon2remove))
                    list_intron_size.append(
                        boxplot_flanking_intron_size.
                        get_exon_control_min_flanking_intron_size(
                            cnx, exon_type, exon2remove))
                    list_exon_size.append(
                        boxplot_flanking_intron_size.get_exon_control_size(
                            cnx, exon_type, exon2remove))
            if "genes" in name_file[i]:
                print(name_file[i])
                if exon_type not in name_file[i]:
                    # Group of genes described by a file.
                    list_gc_content.append(
                        boxplot_gc_content_maker.
                        extract_gene_gc_content_from_file(
                            cnx, list_file[i], gene2remove_at_gc))
                    list_intron_size.append(
                        boxplot_flanking_intron_size.
                        extract_gene_median_intron_size_from_file(
                            cnx, list_file[i], gene2remove_at_gc))
                    list_gene_size.append(
                        boxplot_gene_size.extract_gene_size_from_file(
                            cnx, list_file[i], gene2remove_at_gc))
                else:
                    # Control genes.
                    list_gc_content.append(
                        boxplot_gc_content_maker.get_gene_control_gc_content(
                            cnx, exon_type, gene2remove))
                    list_intron_size.append(
                        boxplot_flanking_intron_size.
                        get_gene_control_median_flanking_intron_size(
                            cnx, exon_type, gene2remove))
                    list_gene_size.append(
                        boxplot_gene_size.get_control_gene_size(
                            cnx, exon_type, gene2remove))
        if my_level == "exons":
            # One figure and one table per feature.
            create_figure(list_gc_content, name_file, output, regulation,
                          "GC_content", my_level)
            dataframe_creator(list_gc_content, name_file, output, regulation,
                              "GC_content", my_level)
            create_figure(list_intron_size, name_file, output, regulation,
                          "min_intron_size", my_level)
            dataframe_creator(list_intron_size, name_file, output,
                              regulation, "min_intron_size", my_level)
            create_figure(list_exon_size, name_file, output, regulation,
                          "exon_size", my_level)
            dataframe_creator(list_exon_size, name_file, output, regulation,
                              "exon_size", my_level)
        if my_level == "genes":
            # create_figure(list_intron_size, name_file, output, regulation, "median_intron_size", my_level)
            # dataframe_creator(list_intron_size, name_file, output, regulation, "median_intron_size", my_level)
            # create_figure(list_gene_size, name_file, output, regulation, "gene_size", my_level)
            # dataframe_creator(list_gene_size, name_file, output, regulation, "gene_size", my_level)
            # create_figure(list_gc_content, name_file, output, regulation, "GC_content", my_level)
            dataframe_creator2(list_gc_content, list_gene_size, name_file,
                               output, regulation, "GC_content", "gene_size",
                               my_level)
            dataframe_creator2(list_intron_size, list_gc_content, name_file,
                               output, regulation, "median_intron_size",
                               "GC_content", my_level)
def control_handler(cnx, exon_type, size=None, regulation="down"):
    """
    Load the control dictionaries and complete them with every missing \
    piece of information.

    The missing parts are handled in this order: the base control data, \
    the relative exon/intron sizes, the branch point data for ``size`` \
    and the mfe data. When a branch point or mfe control file does not \
    exist, its creation is launched but its content is not loaded by \
    this call.

    :param cnx: database connection handed to the helper functions
    :param exon_type: (str) the type of control exons
    :param size: (int or None) the size used in the name of the bp/ppt \
    control file; None skips the branch point step
    :param regulation: (str) regulation used to select the exons to remove
    :return: (tuple) the summarised control dictionary and the full \
    (non summarised) one
    """
    control_folder = "%s/control" % os.path.dirname(
        os.path.realpath(__file__))
    control_file = "%s/control.py" % control_folder
    control_full = "%s/control_full.pkl" % control_folder

    ctrl_list, tmp = exon_control_handler.get_control_information(
        exon_type, control_file, control_full)
    exon2remove = union_dataset_function.get_exon_regulated_by_sf(
        cnx, regulation)

    # --- base control data ---
    if ctrl_list is None:
        print("Control dictionary was not found !")
        print("Creating control information")
        names, rows = exon_control_handler.get_control_exon_information(
            cnx, exon_type, exon2remove, regulation)
        rows = exon_control_handler.remove_redundant_gene_information(rows)
        tmp = exon_control_handler.create_a_temporary_dictionary(names, rows)
        ctrl_list = exon_control_handler.get_summary_dictionaries(names, tmp)
        exon_control_handler.write_control_file(exon_type, control_file,
                                                str(ctrl_list))
        exon_control_handler.write_pickle(control_full, tmp)

    # --- relative exon/intron sizes ---
    if "rel_exon_intron_up" not in ctrl_list:
        print("relative exon_intron size where not found.")
        print("getting relative exon_intron size")
        size_rows = get_control_exon_size_information(cnx, exon_type,
                                                      exon2remove)
        print("Relative size calculation...")
        size_dic = tmp_dic_creator(size_rows)
        tmp = dict(tmp, **size_dic)
        print("summarizing")
        size_summary = get_summary_dictionaries(size_dic)
        ctrl_list = dict(ctrl_list, **size_summary)
        print("writting...")
        write_adapted_dic(control_file, exon_type, str(ctrl_list))
        exon_control_handler.write_pickle(control_full, tmp)

    # --- branch point data for the wanted size ---
    if size is not None:
        bp_key = "nb_good_bp_%s" % size
        if bp_key not in ctrl_list:
            file_name = "%s_%s_bp_ppt_score.py" % (exon_type, size)
            ctrl_file = control_folder + "/" + file_name
            print(ctrl_file)
            if not os.path.isfile(ctrl_file):
                print("Creating control file, this operation will take some time")
                control_bp_ppt.control_dictionaries_creator(exon_type, size)
            else:
                print("Loading %s file to add nb_ggod_pb in control dic !" %
                      ctrl_file)
                # The control file is a python module: import it by name.
                sys.path.insert(0, control_folder)
                bp_module = __import__(file_name.replace(".py", ""))
                bp_fields = ((bp_key, "nb_good_bp"),
                             ("ag_count", "ag_count"),
                             ("hbound", "hbound"))
                for key, attribute in bp_fields:
                    values = getattr(bp_module, attribute)
                    ctrl_list[key] = np.median(values)
                    tmp[key] = values
                write_adapted_dic(control_file, exon_type, str(ctrl_list))
                exon_control_handler.write_pickle(control_full, tmp)

    # --- minimum free energy data ---
    if "mfe_3ss" not in ctrl_list:
        file_name = "%s_mfe.py" % exon_type
        ctrl_file = control_folder + "/" + file_name
        if not os.path.isfile(ctrl_file):
            print("Creating control file, for mfe.")
            control_mfe.control_dictionaries_creator()
        else:
            print(
                "Loading %s file to add mfe_3ss and mfe_5ss in control dic !"
                % ctrl_file)
            sys.path.insert(0, control_folder)
            mfe_module = __import__(file_name.replace(".py", ""))
            for key in ("mfe_3ss", "mfe_5ss"):
                values = getattr(mfe_module, key)
                ctrl_list[key] = np.median(values)
                tmp[key] = values
            write_adapted_dic(control_file, exon_type, str(ctrl_list))
            exon_control_handler.write_pickle(control_full, tmp)
    return ctrl_list, tmp