def redundant_ag_at_and_u1_u2(cnx, regulation):
    """
    Compute the exons common to the AT-rich and GC-rich exon lists and the \
    exons common to the U1 and U2 exon lists.

    The results are stored in the module-level globals ``redundant_gc_at``
    and ``redundant_u1_u2`` (the function returns nothing); the size of each
    overlap is printed.

    :param cnx: (sqlite3 connect object) allow connection to sed database
    :param regulation: (string) the regulation we want for the common exons
    """
    def collect(sf_names):
        # Union of the ``regulation`` exons of every factor, de-duplicated
        # by washing_events_all.
        exons = []
        for sf_name in sf_names:
            exons += union_dataset_function.get_every_events_4_a_sl(
                cnx, sf_name, regulation)
        return union_dataset_function.washing_events_all(exons)

    exon_at = collect(group_factor.at_rich_down)
    exon_gc = collect(group_factor.gc_rich_down)
    # Set-based membership: O(1) per lookup instead of the quadratic
    # ``exon in list`` scan; tuples keep element-wise equality semantics.
    gc_set = {tuple(exon) for exon in exon_gc}
    global redundant_gc_at
    redundant_gc_at = [exon for exon in exon_at if tuple(exon) in gc_set]
    print("redundant exon GC and AT rich : %s" % len(redundant_gc_at))

    exon_u1 = collect(group_factor.u1_factors)
    exon_u2 = collect(group_factor.u2_factors)
    u2_set = {tuple(exon) for exon in exon_u2}
    global redundant_u1_u2
    redundant_u1_u2 = [exon for exon in exon_u1 if tuple(exon) in u2_set]
    print("redundant exon U1 and U2 rich : %s" % len(redundant_u1_u2))
def get_exon_list(cnx, annotation_name, regulation):
    """
    Get the wanted exon list.

    :param cnx: (sqlite3 connect object) connection to sed database
    :param annotation_name: (string) GC-AT or a sf_name
    :param regulation: (string) the regulation of an exon list by a factor(s)
    :return: (list of 2 int) gene id and exon_pos
    """
    # The part before the first underscore identifies the exon set.
    base_name = annotation_name.split("_")[0]
    if "GC" in annotation_name or "AT" in annotation_name:
        # GC/AT exon sets are read from flat files shipped in the data folder.
        folder = os.path.realpath(os.path.dirname(__file__)).replace(
            "src", "data/")
        return extract_exon_list("%s%s_rich_exons" % (folder, base_name))
    if "U1-FACTORS" in annotation_name or "U2-FACTORS" in annotation_name:
        # Spliceosome groups: union of the exons regulated by each member.
        dic_name = {
            "U1-FACTORS": ["SNRPC", "SNRNP70", "DDX5_DDX17"],
            "U2-FACTORS": ["U2AF2", "SF1", "SF3A3", "SF3B4"]
        }
        exon_list = []
        for sf_name in dic_name[base_name]:
            exon_list += union_dataset_function.get_every_events_4_a_sl(
                cnx, sf_name, regulation)
        return union_dataset_function.washing_events_all(exon_list)
    # Single splicing factor: normalize aliases to the database name.
    sf_name = base_name.upper()
    for old, new in (("SFRS", "SRSF"), ("TRA2A", "TRA2A_B"),
                     ("DDX5-17", "DDX5_DDX17")):
        sf_name = sf_name.replace(old, new)
    return union_dataset_function.get_every_events_4_a_sl(
        cnx, sf_name, regulation)
def bed_creator(cnx, cnx_sed, dest_folder, sf_name, regulation, chrom_size_file):
    """
    Create a bed file containing all the exons regulated by ``sf_name`` with ``regulation``.

    :param cnx: (sqlite3 connector object) connection to fasterDB lite database
    :param cnx_sed: (sqlite3 connector object) connection to sed database
    :param dest_folder: (string) path where the bed will be created
    :param sf_name: (string) the name of a splicing factor
    :param regulation: (string) up or down
    :param chrom_size_file: (string) a file containing chromosome size
    :return: (string) the name of the bed file created
    """
    sf_name = sf_name.upper().replace("SFRS", "SRSF")
    # Aliases used in the sed database.
    dic_fact = {"TRA2A": "TRA2A_B"}
    strand_dic = {-1: "-", 1: "+"}
    exon_list = union_dataset_function.get_every_events_4_a_sl(
        cnx_sed, dic_fact.get(sf_name, sf_name), regulation)
    # Red if the factor regulates no exon, green otherwise.
    color = "\033[0;31m" if len(exon_list) == 0 else "\033[0;32m"
    print("%s exon %s : %s %s" % (color, sf_name, len(exon_list), "\033[0m"))
    cursor = cnx.cursor()
    exon_info = []
    # Parameterized query: no string-built SQL, and the statement is
    # prepared once outside the loop.
    query = """SELECT t1.chromosome, t1.start_on_chromosome,
                      t1.end_on_chromosome, t2.official_symbol,
                      t1.pos_on_gene, t2.strand,
                      t1.end_on_chromosome - t1.start_on_chromosome + 1
               FROM exons t1, genes t2
               WHERE t1.id_gene = t2.id
               AND t1.id_gene = ?
               AND t1.pos_on_gene = ?
               ORDER BY t1.chromosome ASC, t1.start_on_chromosome ASC"""
    for exon in exon_list:
        cursor.execute(query, (exon[0], exon[1]))
        res = cursor.fetchall()
        # ``!= 1`` also catches the empty-result case, which previously
        # crashed with an IndexError below instead of a clear message.
        if len(res) != 1:
            print("Error, only one exon should be found for %s_%s exon"
                  % (exon[0], exon[1]))
            exit(1)
        row = res[0]
        # BED fields: chrom, start, end, name (symbol_pos), score, strand, length.
        my_exon = list(row[0:3]) + ["%s_%s" % (row[3], row[4]), ".",
                                    strand_dic[row[5]], row[6]]
        exon_info.append("\t".join(map(str, my_exon)))
    filename = "%sSF_%s_exons-union.bed" % (dest_folder, regulation)
    final_name = filename.replace(".bed", "_add200nt.bed")
    with open(filename, "w") as bedfile:
        bedfile.write("\n".join(exon_info) + "\n")
    fasterdb_bed_add_exon_type.add_intron_sequence(filename, final_name,
                                                   chrom_size_file)
    return final_name
def extract_data(cnx, cnx_sed, list_files, list_names, pos, regulation):
    """
    Build the exon set at position ``pos``, either from a file or from the
    factors regulated in the sed database.

    :param cnx: (sqlite3 connect object) connection to fasterDB lite
    :param cnx_sed: (sqlite3 connect object) connection to sed
    :param list_files: (list of string) list of files containing exon set
    :param list_names: (list of string) the name of exon set
    :param pos: (int) the position of interest within the list ``list_files`` and ``list_names``. \
    Those 2 lists must have the same length
    :param regulation: (string) up or down
    :return: (list of ExonClass object) list of exon.
    """
    if list_files:
        exon_list = extract_exon_files(cnx, list_files[pos])
    else:
        # Named groups of spliceosome-associated factors.
        dic_name = {
            "U1-factors": ["SNRPC", "SNRNP70", "DDX5_DDX17"],
            "U2-factors": ["U2AF2", "SF1", "SF3A3", "SF3B4"]
        }
        raw_events = []
        for factor in dic_name[list_names[pos]]:
            raw_events.extend(union_dataset_function.get_every_events_4_a_sl(
                cnx_sed, factor, regulation))
        raw_events = union_dataset_function.washing_events_all(raw_events)
        exon_list = [
            exon_class_bp.ExonClass(cnx, str(event[0]), int(event[0]),
                                    int(event[1]))
            for event in raw_events
        ]
    print("%s : %s %s exons" % (list_names[pos], len(exon_list), regulation))
    return exon_list
def get_values_for_many_projects_iupac_dnt(cnx, id_projects_sf_names, target_column, regulation, nt_dnt, union):
    """
    Return the frequency of the nucleotide ``nt`` of ``target_column`` for each ``regulation`` \
    exons for projects in ``id_projects``.

    :param cnx: (sqlite3 connection object) connexion to sed database
    :param id_projects_sf_names: (list of str or int) list project id if union is none. List of sf_name \
    else
    :param target_column: (string) the column for which we want to get information on exons.
    :param regulation: (string) up or down
    :param nt_dnt: (string) a nucleotide or a di-nucleotide
    :param union: (None or string) None if we want to work project by project, anything else to work \
    with exons regulation by a particular splicing factor.
    :return: (list of list of float) each sublist of float corresponds to the values of ``target_column`` \
    for every regulated exon in a given project.
    """
    # Only the way the exon list is fetched differs between the two modes.
    if not union:
        fetch_exons = lambda ident: get_ase_events(cnx, ident, regulation)
    else:
        fetch_exons = lambda ident: union_dataset_function.get_every_events_4_a_sl(
            cnx, ident, regulation)
    results = []
    for identifier in id_projects_sf_names:
        exon_list = fetch_exons(identifier)
        results.append(get_list_of_value_iupac_dnt(cnx, exon_list,
                                                   target_column, nt_dnt))
    return results
def _flanking_intron_values(cnx, exon_list, combine):
    """
    Combine upstream and downstream intron sizes exon by exon.

    :param cnx: (sqlite3 connection object) connexion to sed database
    :param exon_list: (list of list of 2 int) the exons of interest
    :param combine: (callable) nan-aware reducer, e.g. ``np.nanmedian`` or ``np.nanmin``
    :return: (numpy array of float) one combined value per exon
    """
    up = np.array(get_redundant_list_of_value(cnx, exon_list,
                                              "upstream_intron_size"),
                  dtype=float)
    down = np.array(get_redundant_list_of_value(cnx, exon_list,
                                                "downstream_intron_size"),
                    dtype=float)
    return np.array([combine([up[i], down[i]]) for i in range(len(up))])


def get_values_for_many_projects(cnx, cnx_fasterdb, id_projects_sf_names, target_column, regulation, output_bp_file, union):
    """
    Return the value of ``target_column`` for each ``regulation`` exons for projects in ``id_projects``.

    :param cnx: (sqlite3 connection object) connexion to sed database
    :param cnx_fasterdb: (sqlite3 connection object) connexion to fasterdb database
    :param id_projects_sf_names: (list of str or int) list project id if union is none. List of sf_name \
    else
    :param target_column: (string) the column for which we want to get information on exons.
    :param regulation: (string) up or down
    :param output_bp_file: (string) path where the bp files will be created
    :param union: (None or string) None if we want to work project by project, anything else to work \
    with exons regulation by a particular splicing factor.
    :return: (list of list of float) each sublist of float corresponds to the values of ``target_column`` \
    for every regulated exon in a given project.
    """
    results = []
    if not union:
        for id_project in id_projects_sf_names:
            exon_list = get_ase_events(cnx, id_project, regulation)
            if target_column == "median_flanking_intron_size":
                results.append(_flanking_intron_values(cnx, exon_list,
                                                       np.nanmedian))
            elif target_column in ["nb_good_bp", "hbound", "ag_count"]:
                results.append(handle_nb_bp_recovering(
                    cnx_fasterdb, exon_list, output_bp_file, str(id_project),
                    regulation, target_column))
            elif "mfe" in target_column:
                results.append(handle_mfe_recovering(
                    cnx_fasterdb, exon_list, output_bp_file, str(id_project),
                    regulation, target_column))
            else:
                results.append(get_list_of_value(cnx, exon_list,
                                                 target_column))
    else:
        for sf_name in id_projects_sf_names:
            exon_list = union_dataset_function.get_every_events_4_a_sl(
                cnx, sf_name, regulation)
            if target_column == "median_flanking_intron_size":
                results.append(_flanking_intron_values(cnx, exon_list,
                                                       np.nanmedian))
            elif target_column == "min_flanking_intron_size":
                # NOTE(review): only this (union) branch handles
                # min_flanking_intron_size, exactly as in the original code —
                # confirm whether the project branch should handle it too.
                results.append(_flanking_intron_values(cnx, exon_list,
                                                       np.nanmin))
            elif target_column in ["nb_good_bp", "hbound", "ag_count"]:
                results.append(handle_nb_bp_recovering(
                    cnx_fasterdb, exon_list, output_bp_file, sf_name,
                    regulation, target_column))
            elif "mfe" in target_column:
                results.append(handle_mfe_recovering(
                    cnx_fasterdb, exon_list, output_bp_file, sf_name,
                    regulation, target_column))
            else:
                results.append(get_list_of_value(cnx, exon_list,
                                                 target_column))
    return results
def difference(cnx, list1, list2, regulation):
    """
    Return the exons regulated by the factors in ``list1`` that are not
    regulated by the factors in ``list2``.

    :param cnx: (sqlite3 connect object) connection to sed database
    :param list1: (list of string) list of splicing factors
    :param list2: (list of string) list of splicing factors
    :param regulation: (string) the regulation ``regulation`` of the exons \
    regulated by the splicing factors in ``list1`` or ``list2``
    :return: (list of list of 2 int) the exons of ``list1`` absent from ``list2``
    """
    def regulated_exons(sf_list):
        # Union of every factor's exons, de-duplicated by washing_events_all.
        exons = []
        for sf_name in sf_list:
            exons += union_dataset_function.get_every_events_4_a_sl(
                cnx, sf_name, regulation)
        return union_dataset_function.washing_events_all(exons)

    exon_list1 = regulated_exons(list1)
    # Set membership is O(1) per exon instead of the quadratic
    # ``exon not in list`` scan of the original.
    excluded = {tuple(exon) for exon in regulated_exons(list2)}
    return [exon for exon in exon_list1 if tuple(exon) not in excluded]
def get_gene_values(cnx, sf_list, target_column1, target_column2, regulation):
    """
    Return the values of two target columns in every ``regulation`` exon \
    regulated by a splicing factor in (one or multiple) cell lines.

    :param cnx: (sqlite3 connexion object) allow connexion to sed database
    :param sf_list: (list of string) the list of splicing factor studied, \
    or directly a list of exons (list of list of 2 int)
    :param target_column1: (string) the value for which we want to get the median value for the ``regulation`` \
    exon. A ``$NT`` suffix selects the frequency of nucleotide ``NT``.
    :param target_column2: (string) the value for which we want to get the median value for the ``regulation`` \
    exon. A ``$NT`` suffix selects the frequency of nucleotide ``NT``.
    :param regulation: (list of string) up or down or up + down
    :return: 3 lists :

        * values1 : (list of float) the values of ``target_column1`` for the exons
        * values2 : (list of float) the values of ``target_column2`` for the exons
        * gene_name : (list of string) the names of the (unique) genes hosting \
        those exons
    """
    if "$" in target_column1:
        target_column1, nt1 = target_column1.split("$")
    else:
        nt1 = None
    if "$" in target_column2:
        target_column2, nt2 = target_column2.split("$")
    else:
        nt2 = None
    if isinstance(sf_list[0], str):
        exon_list = []
        for sf_name in sf_list:
            exon_list += union_dataset_function.get_every_events_4_a_sl(
                cnx, sf_name, regulation)
        exon_list = union_dataset_function.washing_events_all(exon_list)
    else:
        # ``sf_list`` is already an exon list.
        exon_list = sf_list
    # Order-preserving de-duplication of gene ids; replaces the quadratic
    # ``if val[0] not in gene_id`` list scan.
    gene_id = list(dict.fromkeys(val[0] for val in exon_list))
    gene_name = [union_dataset_function.get_gene_name(cnx, my_id)
                 for my_id in gene_id]
    if nt1:
        values1 = get_list_of_value_iupac_dnt(cnx, exon_list, target_column1,
                                              nt1)
    else:
        values1 = functions.get_list_of_value(cnx, exon_list, target_column1)
    if nt2:
        values2 = get_list_of_value_iupac_dnt(cnx, exon_list, target_column2,
                                              nt2)
    else:
        values2 = functions.get_list_of_value(cnx, exon_list, target_column2)
    return values1, values2, gene_name
def mydiff(cnx, exon_list, sf_type, regulation):
    """
    Remove every exon of ``exon_list`` also present in ``sf_type``.

    :param cnx: (sqlite3 connect object) connection to sed database
    :param exon_list: (list of list of 2 int) a list of exons
    :param sf_type: (str) the type of splicing factor for which we don't \
    want to display any exons. Must name a list attribute of ``group_factor``.
    :param regulation: (string) up or down
    :return: (list of list of 2 int) the ``exon_list`` without the \
    exons regulated by that kind of splicing factor.
    """
    exon_sf = []
    # getattr replaces the original eval(): same attribute lookup without
    # executing a string as code.
    for sf_name in getattr(group_factor, sf_type):
        exon_sf += union_dataset_function.get_every_events_4_a_sl(
            cnx, sf_name, regulation)
    # Set membership avoids the quadratic ``exon not in list`` scan.
    exon_sf = {tuple(exon) for exon in exon_sf}
    return [exon for exon in exon_list if tuple(exon) not in exon_sf]
def main(exon_file, name_table, list_sf, sed, fasterdb, output, ss="5'ss"):
    """
    Create a table showing for the exon commons in exon_files files \
    their surrounding introns length and their MFE at their 5'ss.

    :param exon_file: (str) a file containing gc/at exons
    :param name_table: (str) the name of the resulting table
    :param list_sf: (List(vtype=str)) list of sf name
    :param sed: (str) path to sed database
    :param fasterdb: (str) path to fasterdb database
    :param output: (str) file were the output will be created
    :param ss: (str) the splicing site of interest
    """
    sf_names = "_".join([name_table] + list_sf)
    exon_class.set_debug(1)
    exon_class_bp.set_debug(debug=1)
    cnx_sed = sqlite3.connect(sed)
    cnx_fasterdb = sqlite3.connect(fasterdb)
    print("Getting exon from file")
    exon_sets = [get_exon(exon_file)]
    print("Getting regulated exons")
    for sf in list_sf:
        events = udf.get_every_events_4_a_sl(cnx_sed, sf, "down")
        events = [[int(evt[0]), int(evt[1])] for evt in events]
        exon_sets.append(events)
        print("\t%s : %s down-regulated exons" % (sf, len(events)))
    # Intersect all the exon sets pairwise.
    common_exons = reduce(get_union_exon, exon_sets)
    print("Commons exons : %s" % len(common_exons))
    print("Getting commons exons data !")
    df = get_exon_data(cnx_sed, common_exons, ss)
    if ss == "5'ss":
        noutput = output + "/rnafold_" + sf_names + "_commons_down_exons/"
        print("Computing MFE")
        df = computing_mfe(cnx_fasterdb, df, noutput)
    else:
        # 3'ss analysis: compute the number of good branch points instead.
        print("Computing Good branch point")
        nexon_list = df[["gene_name", "gene_id", "pos"]].values
        df2 = svm_bp_finder_launcher(cnx_fasterdb, nexon_list, output)
        print(df2.head())
        print(df.head())
        df = pd.merge(df, df2, how="right", on=["gene_id", "pos"])
    print("Writing results !")
    df.to_csv("%s/%s_commons_down_exons.csv" % (output, sf_names),
              sep="\t", index=False)
def get_exons_list(cnx, sf_list, regulation):
    """
    Return every non-redundant exon regulated (with the regulation \
    ``regulation``) by at least one factor in ``sf_list``.

    :param cnx: (sqlite3 connexion object) allow connexion to sed database
    :param sf_list: (list of string) the list of splicing factor studied
    :param regulation: (list of string) up or down or up + down
    :return: (list of list of int) list of exons showing the regulation \
    ``regulation`` for at least one factor in ``sf_list``
    """
    collected = []
    for factor in sf_list:
        collected.extend(union_dataset_function.get_every_events_4_a_sl(
            cnx, factor, regulation))
    # washing_events_all removes the duplicates across factors.
    return union_dataset_function.washing_events_all(collected)
def get_exons_values(cnx, sf_list, target_column1, target_column2, regulation):
    """
    Return the values of two target columns in every ``regulation`` exon \
    regulated by a splicing factor in (one or multiple) cell lines.

    :param cnx: (sqlite3 connexion object) allow connexion to sed database
    :param sf_list: (list of string) the list of splicing factor studied, \
    or directly a list of exons
    :param target_column1: (string) the value for which we want to get the median value for the ``regulation`` exon. \
    A ``$NT`` suffix selects the frequency of nucleotide ``NT``.
    :param target_column2: (string) the value for which we want to get the median value for the ``regulation`` exon. \
    A ``$NT`` suffix selects the frequency of nucleotide ``NT``.
    :param regulation: (list of string) up or down or up + down
    :return: 3 lists :

        * values1 : (list of float) the values of ``target_column1`` for every exon
        * values2 : (list of float) the values of ``target_column2`` for every exon
        * exon_name : (list of string) the name of every exon, in the same order
    """
    def split_column(col):
        # Separate an optional "$NT" nucleotide suffix from the column name.
        if "$" in col:
            name, nt = col.split("$")
        else:
            name, nt = col, None
        return name, nt

    target_column1, nt1 = split_column(target_column1)
    target_column2, nt2 = split_column(target_column2)
    if isinstance(sf_list[0], str):
        exon_list = []
        for factor in sf_list:
            exon_list += union_dataset_function.get_every_events_4_a_sl(
                cnx, factor, regulation)
        exon_list = union_dataset_function.washing_events_all(exon_list)
    else:
        # ``sf_list`` already is an exon list.
        exon_list = sf_list
    print(len(exon_list))
    exon_name = ["%s_%s" % (union_dataset_function.get_gene_name(cnx, exon[0]),
                            exon[1])
                 for exon in exon_list]
    values1 = get_interest_values(cnx, exon_list, target_column1, nt1)
    values2 = get_interest_values(cnx, exon_list, target_column2, nt2)
    # Twice as many values as exons means the column yields one value for
    # each flanking intron: duplicate the names accordingly.
    if len(exon_name) * 2 == len(values1):
        exon_name = (["%s_upstream" % name for name in exon_name]
                     + ["%s_downstream" % name for name in exon_name])
    return values1, values2, exon_name
def extract_data(cnx, cnx_sed, list_files, list_names, pos):
    """
    Build the exon set at position ``pos``, either from a file or from the
    down-regulated exons of a factor in the sed database.

    :param cnx: (sqlite3 connect object) connection to fasterDB lite
    :param cnx_sed: (sqlite3 connect object) connection to sed
    :param list_files: (list of string) list of files containing exon set
    :param list_names: (list of string) the name of exon set
    :param pos: (int) the position of interest within the list ``list_files`` and ``list_names``. \
    Those 2 lists must have the same length
    :return: (list of ExonClass object) list of exon.
    """
    if list_files:
        exon_list = extract_exon_files(cnx, list_files[pos])
    else:
        events = union_dataset_function.get_every_events_4_a_sl(
            cnx_sed, list_names[pos], "down")
        exon_list = [
            exon_class.ExonClass(cnx, str(evt[0]), int(evt[0]), int(evt[1]))
            for evt in events
        ]
    print("%s : %s exons" % (list_names[pos], len(exon_list)))
    return exon_list
def get_median_value(cnx, id_projects_sf_name, target_column, control_dic, regulation, operation, representation, nt=None):
    """
    Return the median value of target_column in ``regulation`` exons of every ``id_projects_sf_name``.

    :param cnx: (sqlite3 connexion object) allow connexion to sed database
    :param id_projects_sf_name: (list of int) the splicing lore id projects of every projects of interest \
    (or a list of sf names)
    :param target_column: (string) the value for which we want to get the median value for the ``regulation`` \
    exon.
    :param control_dic: (dictionnary of list of float) median value of each possible control exons of \
    each feature in sed database.
    :param regulation: (list of string) up or down or up + down
    :param operation: (string) mean or median
    :param representation: (string) relative or absolute
    :param nt: (string) the nt of interest
    :return: (float) the relative median value (compared to control exons) of ``target_column`` for every \
    ``regulation`` exons in every projects ``id_projects``
    """
    # Nothing to compute for an empty input; also avoids the IndexError the
    # type sniff below would raise on id_projects_sf_name[0].
    if not id_projects_sf_name:
        return []
    # Project ids are ints (or int-like strings); sf names are not.
    try:
        int(id_projects_sf_name[0])
        sf_type = "project"
    except ValueError:
        sf_type = "sf"
    values_list = []
    for identifier in id_projects_sf_name:
        if sf_type == "project":
            exon_list = functions.get_ase_events(cnx, identifier, regulation)
        else:
            exon_list = union_dataset_function.get_every_events_4_a_sl(
                cnx, identifier, regulation)
        values_list.append(get_relative_value_of_a_project_or_sf(
            cnx, exon_list, target_column, control_dic, nt, operation,
            representation))
    return values_list
def create_matrix(cnx, id_projects, names, target_columns, control_dic, ctrl_full, regulations, operation, union=None, sf_type=None):
    """
    Create a matrix of relative medians (toward control) for iupac characteristics of an exon set.

    :param cnx: (sqlite3 connect object) connexion to sed database
    :param id_projects: (list of ints) the list of splicing lore project id.
    :param names: (list of strings) the list of projects name (corresponding - in the same order - of the projects \
    in ``id_projects``) or if union is not none: list of sf_name.
    :param target_columns: (list of strings) list of interest characteristics for a set of exons.
    :param control_dic: (dictionary of float) dictionary storing medians values for every characteristics of \
    the control set of exons.
    :param ctrl_full: (dictionary of list of float) dictionary storing every values for every characteristics of \
    the control set of exons.
    :param regulations: (list of strings) the strings can be "up" or "down" only for up or down-regulated exons.
    :param operation: (string) the type of heatmp we want to produce : i.e heatmap of the mean or median
    :param union: (None or string) None if we want to work project by project, anything else to work \
    with exons regulation by a particular splicing factor.
    :param sf_type: (string) the type of splicing factor we want to display in the final figures
    :return: (lif of list of float) the medians value for a project (line) for every characteristic of \
    interests (number of value in one line corresponding to a project).
    """
    print(regulations)
    new_targets = create_columns_names(target_columns)
    project_names = []      # one label per (project, regulation) pair
    projects_tab = []       # matrix rows: one list of relative values per pair
    project_abs = []        # raw (absolute) values, accumulated column-wise
    project_vect_names = [] # project label repeated for each raw value
    project_col = []        # feature name repeated for each raw value
    for i in range(len(names)):
        for regulation in regulations:
            project_res = []
            # Suffix the label with the regulation only when several are shown.
            if len(regulations) == 1:
                project_names.append("%s" % (names[i]))
            else:
                project_names.append("%s_%s" % (names[i], regulation))
            if not union:
                exon_list = functions.get_ase_events(cnx, id_projects[i],
                                                     regulation)
                print("Splicing factor : %s, project %s - exons %s - reg %s"
                      % (names[i], id_projects[i], len(exon_list), regulation))
                exon_list = difference(exon_list, names[i], sf_type)
            else:
                exon_list = union_dataset_function.get_every_events_4_a_sl(
                    cnx, names[i], regulation)
                print("Splicing factor : %s - exons %s - reg %s"
                      % (names[i], len(exon_list), regulation))
                exon_list = difference(exon_list, names[i], sf_type)
            for j in range(len(new_targets)):
                if "_nt_" in new_targets[j]:
                    # Column like "G_nt_<col>": frequency of nucleotide nt
                    # inside the iupac_<col> column.
                    nt = new_targets[j].split("_")[0]
                    name_col = new_targets[j].replace("%s_nt" % nt, "iupac")
                    if "mean_intron" in new_targets[j]:
                        values1 = np.array(
                            functions.get_list_of_value_iupac_dnt(
                                cnx, exon_list, "iupac_upstream_intron", nt))
                        values2 = np.array(
                            functions.get_list_of_value_iupac_dnt(
                                cnx, exon_list, "iupac_downstream_intron",
                                nt))
                        # NOTE: the comprehension's ``i`` is scoped to the
                        # comprehension and does not clobber the outer loop.
                        values = np.array([
                            np.nanmean([values1[i], values2[i]])
                            for i in range(len(values1))
                        ])
                    else:
                        values = np.array(
                            functions.get_list_of_value_iupac_dnt(
                                cnx, exon_list, name_col, nt))
                    # Debug trace kept for the QKI / G case.
                    if names[i] == "QKI" and nt == "G":
                        print(exon_list)
                        print(values)
                    list_val = values[~np.isnan(values)]
                    # ``operation`` is "mean" or "median" (np.mean/np.median).
                    val_obs = eval("np.%s(list_val)" % operation)
                    # Relative difference (%) toward the control value.
                    final_value = float(val_obs - control_dic[name_col][nt]) / \
                        control_dic[name_col][nt] * 100
                    # ctrl_val = np.array(ctrl_full[name_col][nt], dtype=float)
                else:
                    if new_targets[j] == "median_flanking_intron_size":
                        # Per-exon nan-median of the two flanking intron sizes.
                        values1 = np.array(
                            functions.get_redundant_list_of_value(
                                cnx, exon_list, "upstream_intron_size"),
                            dtype=float)
                        values2 = np.array(
                            functions.get_redundant_list_of_value(
                                cnx, exon_list, "downstream_intron_size"),
                            dtype=float)
                        values = np.array([
                            np.nanmedian([values1[i], values2[i]])
                            for i in range(len(values1))
                        ])
                    elif new_targets[j] == "min_flanking_intron_size":
                        # Per-exon nan-min of the two flanking intron sizes.
                        values1 = np.array(
                            functions.get_redundant_list_of_value(
                                cnx, exon_list, "upstream_intron_size"),
                            dtype=float)
                        values2 = np.array(
                            functions.get_redundant_list_of_value(
                                cnx, exon_list, "downstream_intron_size"),
                            dtype=float)
                        values = np.array([
                            np.nanmin([values1[i], values2[i]])
                            for i in range(len(values1))
                        ])
                    else:
                        values = np.array(
                            functions.get_list_of_value(
                                cnx, exon_list, new_targets[j]))
                    list_val = values[~np.isnan(values)]
                    val_obs = eval("np.%s(values[~np.isnan(values)])"
                                   % operation)
                    final_value = float(val_obs - control_dic[new_targets[j]]) / \
                        control_dic[new_targets[j]] * 100
                    # ctrl_val = np.array(ctrl_full[new_targets[j]], dtype=float)
                project_abs.append(list_val)
                project_vect_names.append([project_names[-1]] * len(list_val))
                project_col.append([new_targets[j]] * len(list_val))
                project_res.append(final_value)
            projects_tab.append(project_res)
    # Append the control ("CCE") values once, after all projects.
    for j in range(len(new_targets)):
        if "_nt_" in new_targets[j]:
            nt = new_targets[j].split("_")[0]
            name_col = new_targets[j].replace("%s_nt" % nt, "iupac")
            mctrl = np.array(ctrl_full[name_col][nt], dtype=float)
            mctrl = list(mctrl[~np.isnan(mctrl)])
        else:
            mctrl = np.array(ctrl_full[new_targets[j]], dtype=float)
            mctrl = list(mctrl[~np.isnan(mctrl)])
        project_abs.append(mctrl)
        project_vect_names.append(["CCE"] * len(mctrl))
        project_col.append([new_targets[j]] * len(mctrl))
    # Long-format dataframe of every absolute value with its project/feature.
    df = pd.DataFrame({
        "values": list(np.hstack(project_abs)),
        "project": list(np.hstack(project_vect_names)),
        "features": list(np.hstack(project_col))
    })
    return projects_tab, df, project_names, new_targets
def get_values_for_many_projects_iupac_dnt(cnx, id_projects_sf_names, target_columns, regulation, ctrl_full, exon_type):
    """
    Return the frequency of the nucleotide ``nt`` of ``target_column`` for each ``regulation`` \
    exons for projects in ``id_projects``.

    :param cnx: (sqlite3 connection object) connexion to sed database
    :param id_projects_sf_names: (list of str or int) list project id if union is none. List of sf_name \
    else
    :param target_columns: (list of string) the list of target columns of interest
    :param regulation: (string) up or down
    :param ctrl_full: (dictionary of list of float) the list of float for every exons for each features of interest.
    :param exon_type: (string) the control exon type used
    :return: (pandas DataFrame) one row per exon (regulated exons of every \
    factor, then control exons) with one column per target column plus a \
    "project" column naming the factor (or ``exon_type`` for controls).
    """
    my_len = None
    # One list of values per target column, plus the "project" label column.
    results = {target_column: [] for target_column in target_columns}
    results["project"] = []
    for sf_name in id_projects_sf_names:
        exon_list = union_dataset_function.get_every_events_4_a_sl(
            cnx, sf_name, regulation)
        for target_column in target_columns:
            if "_nt_" in target_column:
                # Column like "G_nt_<col>": nucleotide frequency within
                # the iupac_<col> column.
                my_nt, target_column_name = target_column.split("_nt_")
                target_column_name = "iupac_" + target_column_name
                results[target_column] += get_list_of_value_iupac_dnt(
                    cnx, exon_list, target_column_name, my_nt)
            else:
                results[target_column] += get_list_of_value(
                    cnx, exon_list, target_column)
        # One project label per exon, keeping rows aligned across columns.
        results["project"] += [sf_name] * len(exon_list)
        # print("SF : %s" % sf_name)
        # for target_column in target_columns:
        #     print("%s len : %s" % (target_column, len(results[target_column])))
        # print("project len : %s" % len(results["project"]))
    # Append the control values for every target column.
    for target_column in target_columns:
        print(target_column)
        if "_nt_" in target_column:
            my_nt, target_column_name = target_column.split("_nt_")
            target_column_name = "iupac_" + target_column_name
            print("test len %s : %s" % (target_column_name, len(
                ctrl_full[target_column_name][my_nt])))
            results[target_column] += ctrl_full[target_column_name][my_nt]
            my_len = len(ctrl_full[target_column_name][my_nt])
        else:
            results[target_column] += ctrl_full[target_column]
            my_len = len(ctrl_full[target_column])
    # NOTE(review): my_len is taken from the LAST target column; this assumes
    # every control list in ctrl_full has the same length — confirm.
    results["project"] += [exon_type] * my_len
    # print("CCE")
    # print(ctrl_full.keys())
    # print(len(ctrl_full["iupac_upstream_intron"]["S"]))
    # print(type(ctrl_full["iupac_upstream_intron"]["S"]))
    for target_column in target_columns:
        print("%s len : %s" % (target_column, len(results[target_column])))
    print("project len : %s" % len(results["project"]))
    df = pd.DataFrame(results)
    return df
def create_matrix(cnx, id_projects, names, target_columns, control_dic, regulations, union=None, sf_type=None):
    """
    Create a matrix of relative medians (toward control) for iupac characteristics of an exon set.

    :param cnx: (sqlite3 connect object) connexion to sed database
    :param id_projects: (list of ints) the list of splicing lore project id.
    :param names: (list of strings) the list of projects name (corresponding - in the same order - of the projects \
    in ``id_projects``) or if union is not none: list of sf_name.
    :param target_columns: (list of strings) list of interest characteristics for a set of exons.
    :param control_dic: (dictionary of float) dictionary storing medians values for every characteristics of \
    the control set of exons.
    :param regulations: (list of strings) the strings can be "up" or "down" only for up or down-regulated exons.
    :param union: (None or string) None if we want to work project by project, anything else to work \
    with exons regulation by a particular splicing factor.
    :param sf_type: (string) the type of splicing factor we want to display in the final figures
    :return: (lif of list of float) the medians value for a project (line) for every characteristic of \
    interests (number of value in one line corresponding to a project).
    """
    new_targets = create_columns_names(target_columns)
    project_names = []  # one label per (project, regulation) pair
    projects_tab = []   # matrix rows: one list of relative values per pair
    for i in range(len(names)):
        for regulation in regulations:
            project_res = []
            # Suffix the label with the regulation only when several are shown.
            if len(regulations) > 1:
                project_names.append("%s_%s" % (names[i], regulation))
            else:
                project_names.append("%s" % (names[i]))
            if not union:
                exon_list = figure_producer.get_ase_events(cnx, id_projects[i],
                                                           regulation)
                print("Splicing factor : %s, project %s - exons %s"
                      % (names[i], id_projects[i], len(exon_list)))
                exon_list = difference(cnx, exon_list, names[i], regulation,
                                       sf_type)
            else:
                exon_list = union_dataset_function.get_every_events_4_a_sl(
                    cnx, names[i], regulation)
                print("Splicing factor : %s - exons %s"
                      % (names[i], len(exon_list)))
                exon_list = difference(cnx, exon_list, names[i], regulation,
                                       sf_type)
            for j in range(len(new_targets)):
                if "_nt_" in new_targets[j]:
                    # Column like "G_nt_<col>": frequency of nucleotide nt
                    # inside the iupac_<col> column.
                    nt = new_targets[j].split("_")[0]
                    name_col = new_targets[j].replace("%s_nt" % nt, "iupac")
                    if "mean_intron" in new_targets[j]:
                        values1 = np.array(
                            figure_producer.get_list_of_value_iupac_dnt(
                                cnx, exon_list, "iupac_upstream_intron", nt))
                        values2 = np.array(
                            figure_producer.get_list_of_value_iupac_dnt(
                                cnx, exon_list, "iupac_downstream_intron",
                                nt))
                        # NOTE: the comprehension's ``i`` is scoped to the
                        # comprehension and does not clobber the outer loop.
                        values = np.array([np.nanmedian([values1[i],
                                                         values2[i]])
                                           for i in range(len(values1))])
                    else:
                        values = np.array(
                            figure_producer.get_list_of_value_iupac_dnt(
                                cnx, exon_list, name_col, nt))
                    median_obs = np.median(values[~np.isnan(values)])
                    # Relative difference (%) toward the control value.
                    final_value = float(median_obs - control_dic[name_col][nt]) / \
                        control_dic[name_col][nt] * 100
                else:
                    if new_targets[j] == "median_flanking_intron_size":
                        # Per-exon nan-median of the two flanking intron sizes.
                        values1 = np.array(
                            figure_producer.get_redundant_list_of_value(
                                cnx, exon_list, "upstream_intron_size"),
                            dtype=float)
                        values2 = np.array(
                            figure_producer.get_redundant_list_of_value(
                                cnx, exon_list, "downstream_intron_size"),
                            dtype=float)
                        values = np.array([np.nanmedian([values1[i],
                                                         values2[i]])
                                           for i in range(len(values1))])
                    elif new_targets[j] in ["nb_good_bp", "hbound", "ag_count"]:
                        # NOTE(review): cnx_fasterdb and output_bp are not
                        # defined in this function — presumably module-level
                        # globals; confirm they are set before this is called.
                        if union:
                            values = np.array(
                                figure_producer.handle_nb_bp_recovering(
                                    cnx_fasterdb, exon_list, output_bp,
                                    names[i], regulation, new_targets[j]))
                        else:
                            values = np.array(
                                figure_producer.handle_nb_bp_recovering(
                                    cnx_fasterdb, exon_list, output_bp,
                                    str(id_projects[i]), regulation,
                                    new_targets[j]))
                    elif "mfe" in new_targets[j]:
                        if union:
                            values = np.array(
                                figure_producer.handle_mfe_recovering(
                                    cnx_fasterdb, exon_list, output_bp,
                                    names[i], regulation, new_targets[j]))
                        else:
                            values = np.array(
                                figure_producer.handle_mfe_recovering(
                                    cnx_fasterdb, exon_list, output_bp,
                                    str(id_projects[i]), regulation,
                                    new_targets[j]))
                    else:
                        values = np.array(figure_producer.get_list_of_value(
                            cnx, exon_list, new_targets[j]))
                    if new_targets[j] in figure_producer.log_columns:
                        # Log-scaled columns: compare in log10 space.
                        median_obs = np.median(values[~np.isnan(values)])
                        final_value = float(math.log10(median_obs) -
                                            math.log10(control_dic[new_targets[j]])) / math.log10(control_dic[
                                                new_targets[j]]) * 100
                    else:
                        median_obs = np.median(values[~np.isnan(values)])
                        final_value = float(median_obs - control_dic[new_targets[j]]) / control_dic[new_targets[j]] * 100
                project_res.append(final_value)
            projects_tab.append(project_res)
    return projects_tab, project_names, new_targets