def extract_data(cnx, cnx_sed, list_files, list_names, pos, regulation):
    """
    Return the exon set found at index ``pos``.

    When ``list_files`` is not empty the exons are read from
    ``list_files[pos]``. Otherwise ``list_names[pos]`` must be
    ``"U1-factors"`` or ``"U2-factors"`` (any other name raises a
    ``KeyError``) and the exons are gathered from the sed database for
    every splicing factor belonging to that group.

    :param cnx: (sqlite3 connect object) connection to fasterDB lite
    :param cnx_sed: (sqlite3 connect object) connection to sed
    :param list_files: (list of string) list of files containing exon set
    :param list_names: (list of string) the name of exon set
    :param pos: (int) the position of interest within the list \
    ``list_files`` and ``list_names``. Those 2 lists must have the same length
    :param regulation: (string) up or down
    :return: (list of ExonClass object) list of exon.
    """
    if list_files:
        exons = extract_exon_files(cnx, list_files[pos])
    else:
        factors_by_group = {
            "U1-factors": ["SNRPC", "SNRNP70", "DDX5_DDX17"],
            "U2-factors": ["U2AF2", "SF1", "SF3A3", "SF3B4"]
        }
        # Pool the events of every factor of the group before cleaning them.
        events = []
        for factor in factors_by_group[list_names[pos]]:
            events.extend(
                union_dataset_function.get_every_events_4_a_sl(
                    cnx_sed, factor, regulation))
        events = union_dataset_function.washing_events_all(events)
        # Each event starts with (gene id, exon position); the gene id is
        # also given as a string for the first ExonClass argument.
        exons = [
            exon_class_bp.ExonClass(cnx, str(event[0]), int(event[0]),
                                    int(event[1]))
            for event in events
        ]
    summary = "%s : %s %s exons" % (list_names[pos], len(exons), regulation)
    print(summary)
    return exons
def extract_exon_files(cnx, filename):
    """
    Read a tab-separated exon file and build one exon object per line.

    Only the first two columns of each line are used: the first one is
    given to ``ExonClass`` both as a string and as an int, the second one
    as an int. Blank lines (e.g. a trailing empty line at the end of the
    file) are skipped instead of crashing on ``int("")``.

    :param cnx: (sqlite3 connect object) connection to fasterDB lite
    :param filename: (string) the name of a file containing exons
    :return: (list of Exonclass object) list of exons
    """
    exon_list = []
    with open(filename, "r") as infile:
        for line in infile:
            # Nothing to parse on an empty / whitespace-only line.
            if not line.strip():
                continue
            fields = line.replace("\n", "").split("\t")
            exon_list.append(
                exon_class_bp.ExonClass(cnx, str(fields[0]), int(fields[0]),
                                        int(fields[1])))
    return exon_list
def svm_bp_finder_launcher(cnx, exon_list, output):
    """
    Run the branch point finder on every exon of ``exon_list``.

    For each exon a fasta input file is written in ``output`` with
    ``function_bp.fasta_writer`` and handed to ``run_svs_bp_finder``; the
    resulting dataframes are concatenated into a single one.

    :param cnx: (sqlite3 connect object) connection to fasterDB database
    :param exon_list: (list of list of 1 str 2 int) list of exons.
    :param output: (str) folder where the input will be created
    :return: (pandas dataframe) list of the branch point of interest
    """
    frames = []
    for exon in exon_list:
        gene_name, gene_id, exon_pos = exon[0], exon[1], exon[2]
        exon_obj = exon_class_bp.ExonClass(cnx, gene_name, gene_id, exon_pos)
        fasta_path = function_bp.fasta_writer(exon_obj, output, 100)
        frames.append(
            run_svs_bp_finder(fasta_path, gene_id, exon_pos,
                              exon_obj.upstream_intron.sequence_proxi))
    # ignore_index: the concatenated frame gets a fresh 0..n-1 index.
    return pd.concat(frames, ignore_index=True)
def main():
    """
    Write a bed file of the predicted branch points of GC-rich, AT-rich
    and control (CCE) exons.

    The exon sets are read from ``result/GC_rich_exons`` and
    ``result/AT_rich_exons``; the control exons come from
    ``get_ctrl_exons`` after removing the exons returned by
    ``udf.get_exon_regulated_by_sf`` for the "down" regulation. The bed
    file is written to
    ``result/experimental_branch_point/predicted_branch_points.bed``
    (the folder is not created here and presumably must already exist).

    Both database connections are closed even if an error occurs.
    """
    regulation = "down"
    exon_class_bp.set_debug(0)
    # Project root: three directory levels above this file.
    base = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    output = base + "/result/experimental_branch_point"
    at_exon_file = base + "/result/AT_rich_exons"
    gc_exon_file = base + "/result/GC_rich_exons"
    fasterdb = base + "/data/fasterDB_lite.db"
    seddb = base + "/data/sed.db"
    cnx = sqlite3.connect(fasterdb)
    cnx_sed = sqlite3.connect(seddb)
    try:
        exon_type = "CCE"
        at_exon = read_file(at_exon_file)
        gc_exon = read_file(gc_exon_file)
        exon2remove = [
            list(map(int, exon))
            for exon in udf.get_exon_regulated_by_sf(cnx_sed, regulation)
        ]
        ctrl_exons = get_ctrl_exons(cnx, exon_type, exon2remove)
        exon_list = gc_exon + at_exon + ctrl_exons
        # One label per exon, in the same order as ``exon_list``.
        type_exon = ["GC-exons"] * len(gc_exon) + \
                    ["AT-exons"] * len(at_exon) + \
                    ["%s-exons" % exon_type] * len(ctrl_exons)
        tot = len(exon_list)
        count = 0
        count_none = 0  # exons for which no branch point list was returned
        print("Creating bed of predicted branch points")
        with open("%s/predicted_branch_points.bed" % output, "w") as outf:
            for exon, name_exon in zip(exon_list, type_exon):
                exon = exon_class_bp.ExonClass(cnx, str(exon[0]), exon[0],
                                               exon[1])
                _, list_pos = function_bp.goob_bp_only(exon)
                if list_pos is not None:
                    for line in list_pos:
                        # Tag the 4th field with the exon set and prefix
                        # the 1st field with "chr".
                        line[3] += "_" + name_exon
                        line[0] = "chr" + str(line[0])
                        outf.write("\t".join(list(map(str, line))) + "\n")
                else:
                    count_none += 1
                count += 1
                sys.stdout.write("%s/%s (%s)        \r" %
                                 (count, tot, count_none))
                # Make the progress line visible immediately.
                sys.stdout.flush()
    finally:
        cnx.close()
        cnx_sed.close()
def control_dictionaries_creator():
    """
    Create the control dictionary containing the values corresponding to
    the score of bp and ppt for every control exons.

    For the "CCE" exon type and for each size in (100, 50, 35, 25), the
    lists returned by ``function_bp.bp_ppt_calculator`` are written as
    python assignments in
    ``control_dictionaries/<exon_type>_<size>_bp_ppt_score.py``, next to
    this file. The folder is created if it does not exist.

    Output files and both database connections are closed even when an
    error occurs.
    """
    exon_class_bp.set_debug(0)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    # The databases live in the ``data`` folder of the project, found by
    # swapping this module's folder in the path.
    fasterdb = dir_path.replace("src/make_control_files_bp_ppt",
                                "data/fasterDB_lite.db")
    seddb = dir_path.replace("src/make_control_files_bp_ppt", "data/sed.db")
    ctrl_dir = dir_path + "/control_dictionaries/"
    cnx = sqlite3.connect(fasterdb)
    cnx_sed = sqlite3.connect(seddb)
    try:
        exon2remove = union_dataset_function.get_exon_regulated_by_sf(
            cnx_sed, "down")
        if not os.path.isdir(ctrl_dir):
            os.mkdir(ctrl_dir)
        exon_type = ["CCE"]
        sizes = [100, 50, 35, 25]
        for cur_exon_type in exon_type:
            ctrl_exon_list = get_control_exon_information(
                cnx, cur_exon_type, exon2remove)
            print("retrieving upstream intron sequence")
            list_exon = [
                exon_class_bp.ExonClass(cnx, exon[0], exon[1], exon[2])
                for exon in ctrl_exon_list
            ]
            for size in sizes:
                print("calculating bp and ppt score")
                bp_score_list, ppt_score_list, nb_bp_list, nb_good_bp_list, \
                    sequence_list, ag_count_list, hbound_list, uaa_list, \
                    una_list = function_bp.bp_ppt_calculator(list_exon, size)
                out_name = ctrl_dir + cur_exon_type + "_" + str(size) + \
                    "_bp_ppt_score.py"
                # One "name=<list repr>" assignment per line.
                with open(out_name, "w") as cur_file:
                    cur_file.write("bp_score=" + str(bp_score_list) + "\n")
                    cur_file.write("ppt_score=" + str(ppt_score_list) + "\n")
                    cur_file.write("nb_bp=" + str(nb_bp_list) + "\n")
                    cur_file.write("nb_good_bp=" + str(nb_good_bp_list) +
                                   "\n")
                    cur_file.write("bp_seq=" + str(sequence_list) + "\n")
                    cur_file.write("ag_count=" + str(ag_count_list) + "\n")
                    cur_file.write("hbound=" + str(hbound_list) + "\n")
                    cur_file.write("uaa_count=" + str(uaa_list) + "\n")
                    cur_file.write("una_count=" + str(una_list) + "\n")
    finally:
        cnx.close()
        cnx_sed.close()
def _exon_feature_dict(cnx, gene_id, exon_pos, exon_key, iupac_exon,
                       upstream_intron_size, downstream_intron_size,
                       u1_exons, u2_exons):
    """
    Compute the per-exon features stored in the last column of a row.

    :param cnx: (sqlite3 connect object) connexion to fasterdb
    :param gene_id: the gene id of the exon
    :param exon_pos: the position of the exon on the gene
    :param exon_key: the (gene id, exon position) pair tested against \
    ``u1_exons`` and ``u2_exons``
    :param iupac_exon: (str) ';'-separated string whose 5th field is \
    reported as the GC content
    :param upstream_intron_size: value reported as upstream intron size
    :param downstream_intron_size: value reported as downstream intron size
    :param u1_exons: (list of list of 2 int) list of exons regulated by U1
    :param u2_exons: (list of list of 2 int) list of exons regulated by U2
    :return: (dict) the features of the exon
    """
    cexon = exon_class_bp.ExonClass(cnx, str(gene_id), gene_id, exon_pos)
    exon_data = bp_ppt_calculator([cexon])
    mexon = exon_class.ExonClass(cnx, str(gene_id), gene_id, exon_pos)
    mfe_5ss, mfe_3ss = mfe_calculator([mexon])
    stretch = catch_index_error(stretch_counter([cexon])["T"], 0)
    # NOTE(review): indices 8, 6 and 3 of ``exon_data`` are presumably the
    # UNA count, hbound and good-bp lists of bp_ppt_calculator - confirm.
    return {
        "GC_content": iupac_exon.split(";")[4],
        "upstream_intron_size": upstream_intron_size,
        "downstream_intron_size": downstream_intron_size,
        "UNA_count": catch_index_error(exon_data[8], 0),
        "Hbound_count": catch_index_error(exon_data[6], 0),
        "good_bp": catch_index_error(exon_data[3], 0),
        "MFE_5SS": catch_index_error(mfe_5ss, 0),
        "MFE_3SS": catch_index_error(mfe_3ss, 0),
        "T_stretch": stretch,
        "U1-regulated": is_in(exon_key, u1_exons),
        "U2-regulated": is_in(exon_key, u2_exons),
    }


def get_exon_info(cnx, sedb, fasterdb_file, exon_list, u1_exons, u2_exons):
    """
    Build one bed-like row per exon, with its features in the last column.

    Each row is ``[chromosome, start - 1, end, "<gene>_<pos>", "0",
    strand ("+" or "-"), str(feature dict)]``. When ``exon_list`` is None,
    every exon whose sed ``exon_type`` contains ``CCE`` is described;
    otherwise only the exons of ``exon_list`` are.

    The two databases are attached to ``cnx`` as ``sed`` and ``fasterdb``
    and are left attached when the function returns.

    :param cnx: (sqlite3 connect object) connexion to fasterdb
    :param sedb: (str) path to sed database
    :param fasterdb_file: (str) an sqlite3 database file
    :param exon_list: (list of 2 int) list of exons, or None
    :param u1_exons: (list of list of 2 int) list of exons regulated by U1
    :param u2_exons: (list of list of 2 int) list of exons regulated by U2
    :return: (list of list of value) list of data
    :raises IndexError: if an exon of ``exon_list`` matches no row or \
    more than one row
    """
    dic = {-1: "-", 1: "+"}
    cursor = cnx.cursor()
    cursor.execute("ATTACH DATABASE ? as sed", (sedb, ))
    cursor.execute("ATTACH DATABASE ? as fasterdb", (fasterdb_file, ))
    if exon_list is None:
        query = """
        SELECT t1.id_gene, t1.pos_on_gene, t1.chromosome,
               t1.start_on_chromosome, t1.end_on_chromosome, t2.strand,
               t3.iupac_exon, t3.upstream_intron_size,
               t3.downstream_intron_size
        FROM fasterdb.exons as t1, fasterdb.genes as t2, sed.sed as t3
        WHERE t3.gene_id = t1.id_gene
        AND t3.exon_pos = t1.pos_on_gene
        AND t1.id_gene = t2.id
        AND t3.exon_type LIKE '%CCE%'
        """
        cursor.execute(query)
        new_res = []
        for row in cursor.fetchall():
            exon = list(row)
            # The start coordinate is shifted by one in the output row.
            exon[3] = int(exon[3]) - 1
            dic_info = _exon_feature_dict(cnx, exon[0], exon[1], exon[0:2],
                                          exon[6], exon[7], exon[8],
                                          u1_exons, u2_exons)
            new_res.append(exon[2:5] + ["%s_%s" % (exon[0], exon[1])] +
                           ["0", dic[exon[5]]] + [str(dic_info)])
        return new_res
    # The query is the same for every exon: build it once and bind the
    # gene id / exon position instead of formatting them into the SQL.
    query = """
    SELECT t1.chromosome, t1.start_on_chromosome, t1.end_on_chromosome,
           t2.strand, t3.iupac_exon, t3.upstream_intron_size,
           t3.downstream_intron_size
    FROM fasterdb.exons as t1, fasterdb.genes as t2, sed.sed as t3
    WHERE t3.gene_id = t1.id_gene
    AND t3.exon_pos = t1.pos_on_gene
    AND t1.id_gene = t2.id
    AND t3.gene_id = ?
    AND t3.exon_pos = ?
    """
    tot = len(exon_list)
    result = []
    for count, exon in enumerate(exon_list, 1):
        cursor.execute(query, (int(exon[0]), int(exon[1])))
        res = cursor.fetchall()
        # ``(exon,)`` keeps the formatting valid when exon is a tuple.
        if not res:
            raise IndexError("Error no row returned for %s" % (exon,))
        if len(res) > 1:
            raise IndexError("Error only one row shoud be return for %s" %
                             (exon,))
        tmp = list(res[0])
        # The start coordinate is shifted by one in the output row.
        tmp[1] = int(tmp[1]) - 1
        dic_info = _exon_feature_dict(cnx, exon[0], exon[1], exon[0:2],
                                      tmp[4], tmp[5], tmp[6],
                                      u1_exons, u2_exons)
        result.append(tmp[0:3] + ["%s_%s" % (exon[0], exon[1])] +
                      ["0", dic[tmp[3]]] + [str(dic_info)])
        sys.stdout.write("Processing %s/%s\t\t\t\r" % (count, tot))
        sys.stdout.flush()
    return result