Example no. 1
def redundant_ag_at_and_u1_u2(cnx, regulation):
    """
    Create the list of redundant exons between the AT and GC rich list of exons and \
    between the U1 and U2 list of exons

    :param cnx: (sqlite3 connect object) allow connection to sed database
    :param regulation: (string) the regulation we want for the common exons
    :return: (list of list of 2 int) list of exons identified by their gene id and their exons position
    """
    exon_at = []
    for sf_name in group_factor.at_rich_down:
        exon_at += union_dataset_function.get_every_events_4_a_sl(cnx, sf_name, regulation)
    exon_at = union_dataset_function.washing_events_all(exon_at)
    exon_gc = []
    for sf_name in group_factor.gc_rich_down:
        exon_gc += union_dataset_function.get_every_events_4_a_sl(cnx, sf_name, regulation)
    exon_gc = union_dataset_function.washing_events_all(exon_gc)
    global redundant_gc_at
    redundant_gc_at = [exon for exon in exon_at if exon in exon_gc]
    print("redundant exon GC and AT rich : %s" % len(redundant_gc_at))

    exon_u1 = []
    for sf_name in group_factor.u1_factors:
        exon_u1 += union_dataset_function.get_every_events_4_a_sl(cnx, sf_name, regulation)
    exon_u1 = union_dataset_function.washing_events_all(exon_u1)
    exon_u2 = []
    for sf_name in group_factor.u2_factors:
        exon_u2 += union_dataset_function.get_every_events_4_a_sl(cnx, sf_name, regulation)
    exon_u2 = union_dataset_function.washing_events_all(exon_u2)
    global redundant_u1_u2
    redundant_u1_u2 = [exon for exon in exon_u1 if exon in exon_u2]
    print("redundant exon U1 and U2 rich : %s" % len(redundant_u1_u2))
def get_exon_list(cnx, annotation_name, regulation):
    """
    Get the exon_list wanted.

    :param cnx: (sqlite3 connect object) connection to sed database
    :param annotation_name: (string) GC-AT or a sf_name
    :param regulation: (string) the regulation of an exon list by a factor(s)
    :return: (list of 2 int) gene id and exon_pos
    """
    if "GC" in annotation_name or "AT" in annotation_name:
        annotation_name = annotation_name.split("_")[0]
        folder = os.path.realpath(os.path.dirname(__file__)).replace(
            "src", "data/")
        my_file = "%s%s_rich_exons" % (folder, annotation_name)
        exon_list = extract_exon_list(my_file)
    elif "U1-FACTORS" in annotation_name or "U2-FACTORS" in annotation_name:
        annotation_name = annotation_name.split("_")[0]
        dic_name = {
            "U1-FACTORS": ["SNRPC", "SNRNP70", "DDX5_DDX17"],
            "U2-FACTORS": ["U2AF2", "SF1", "SF3A3", "SF3B4"]
        }
        exon_list = []
        for sf_name in dic_name[annotation_name]:
            exon_list += union_dataset_function.get_every_events_4_a_sl(
                cnx, sf_name, regulation)
        exon_list = union_dataset_function.washing_events_all(exon_list)
    else:
        annotation_name = annotation_name.split("_")[0]
        sf_name = annotation_name.upper()
        sf_name = sf_name.replace("SFRS",
                                  "SRSF").replace("TRA2A", "TRA2A_B").replace(
                                      "DDX5-17", "DDX5_DDX17")
        exon_list = union_dataset_function.get_every_events_4_a_sl(
            cnx, sf_name, regulation)
    return exon_list
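
A hedged usage sketch (not from the original source) of the three branches of ``get_exon_list`` above; the database path and the annotation strings are placeholders chosen for illustration, not values taken from the project.

import sqlite3

def example_get_exon_list(sed_db_path):
    # sed_db_path is a placeholder path to the sed database
    cnx = sqlite3.connect(sed_db_path)
    gc_exons = get_exon_list(cnx, "GC_rich", "down")      # file-based GC/AT branch
    u1_exons = get_exon_list(cnx, "U1-FACTORS", "down")   # multi-factor union branch
    srsf1_exons = get_exon_list(cnx, "SRSF1", "down")     # single splicing factor branch
    cnx.close()
    return gc_exons, u1_exons, srsf1_exons
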
def bed_creator(cnx, cnx_sed, dest_folder, sf_name, regulation,
                chrom_size_file):
    """
    Create a bed file containing all the exons regulated by ``sf_name`` with ``regulation``.

    :param cnx: (sqlite3 connector object) connection to fasterDB lite database
    :param cnx_sed: (sqlite3 connector object) connection to sed database
    :param dest_folder: (string) path where the bed will be created
    :param sf_name: (string) the name of a splicing factor
    :param regulation: (string) up or down
    :param chrom_size_file: (string) a file containing chromosome sizes
    :return: (string) the name of the bed file created
    """
    sf_name = sf_name.upper()
    sf_name = sf_name.replace("SFRS", "SRSF")
    dic_fact = {"TRA2A": "TRA2A_B"}
    strand_dic = {-1: "-", 1: "+"}
    if sf_name in dic_fact:
        exon_list = union_dataset_function.get_every_events_4_a_sl(
            cnx_sed, dic_fact[sf_name], regulation)
    else:
        exon_list = union_dataset_function.get_every_events_4_a_sl(
            cnx_sed, sf_name, regulation)
    if len(exon_list) == 0:
        print("%s exon %s : %s %s" %
              ("\033[0;31m", sf_name, len(exon_list), "\033[0m"))
    else:
        print("%s exon %s : %s %s" %
              ("\033[0;32m", sf_name, len(exon_list), "\033[0m"))
    cursor = cnx.cursor()
    exon_info = []
    for exon in exon_list:
        query = """SELECT t1.chromosome, t1.start_on_chromosome, t1.end_on_chromosome, t2.official_symbol, t1.pos_on_gene,
                   t2.strand, t1.end_on_chromosome - t1.start_on_chromosome + 1
                   FROM exons t1, genes t2
                   WHERE t1.id_gene = t2.id
                   AND t1.id_gene = %s
                   AND t1.pos_on_gene = %s
                   ORDER BY t1.chromosome ASC, t1.start_on_chromosome ASC
                """ % (exon[0], exon[1])
        cursor.execute(query)
        res = cursor.fetchall()
        if len(res) != 1:
            print("Error, exactly one exon should be found for %s_%s exon (got %s)" %
                  (exon[0], exon[1], len(res)))
            exit(1)
        my_exon = list(res[0][0:3]) + ["%s_%s" % (res[0][3], res[0][4])] + [
            "."
        ] + [strand_dic[res[0][5]], res[0][6]]
        exon_info.append("\t".join(list(map(str, my_exon))))
    bed_content = "\n".join(exon_info)
    filename = "%sSF_%s_exons-union.bed" % (dest_folder, regulation)
    final_name = filename.replace(".bed", "_add200nt.bed")
    with open(filename, "w") as bedfile:
        bedfile.write(bed_content + "\n")
    fasterdb_bed_add_exon_type.add_intron_sequence(filename, final_name,
                                                   chrom_size_file)
    return final_name
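
The query and formatting above produce one BED-like line per exon: chromosome, start, end, a "<gene_symbol>_<exon_position>" name, a "." score placeholder, the strand and the exon length. A minimal, runnable sketch of that layout; the record used in the example comment is hypothetical.

def format_bed_line(chrom, start, end, gene_symbol, exon_pos, strand_code, length):
    # strand_code is -1 or 1, as stored in fasterDB
    strand_dic = {-1: "-", 1: "+"}
    fields = [chrom, start, end, "%s_%s" % (gene_symbol, exon_pos), ".",
              strand_dic[strand_code], length]
    return "\t".join(map(str, fields))

# Example (hypothetical record):
# format_bed_line("18", 32073458, 32073580, "MYO5B", 7, -1, 123)
# -> "18\t32073458\t32073580\tMYO5B_7\t.\t-\t123"
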
def extract_data(cnx, cnx_sed, list_files, list_names, pos, regulation):
    """

    :param cnx: (sqlite3 connect object) connection to fasterDB lite
    :param cnx_sed: (sqlite3 connect object) connection to sed
    :param list_files: (list of string) list of files containing exon set
    :param list_names: (list of string) the name of exon set
    :param pos: (int) the position of interest within the list ``list_files`` and ``list_names``. \
    Those 2 lists must have the same length
    :param regulation: (string) up or down
    :return: (list of ExonClass object) list of exons.
    """
    if list_files:
        exon_list = extract_exon_files(cnx, list_files[pos])
    else:
        dic_name = {
            "U1-factors": ["SNRPC", "SNRNP70", "DDX5_DDX17"],
            "U2-factors": ["U2AF2", "SF1", "SF3A3", "SF3B4"]
        }
        exon_list_tmp = []
        for sf_name in dic_name[list_names[pos]]:
            exon_list_tmp += union_dataset_function.get_every_events_4_a_sl(
                cnx_sed, sf_name, regulation)
        exon_list_tmp = union_dataset_function.washing_events_all(
            exon_list_tmp)
        exon_list = [
            exon_class_bp.ExonClass(cnx, str(exon[0]), int(exon[0]),
                                    int(exon[1])) for exon in exon_list_tmp
        ]
    print("%s : %s %s exons" % (list_names[pos], len(exon_list), regulation))
    return exon_list
Example no. 5
def get_values_for_many_projects_iupac_dnt(cnx, id_projects_sf_names, target_column, regulation, nt_dnt, union):
    """
    Return the frequency of the nucleotide or di-nucleotide ``nt_dnt`` in ``target_column`` for every \
    ``regulation`` exon of each project or splicing factor in ``id_projects_sf_names``.

    :param cnx: (sqlite3 connection object) connection to sed database
    :param id_projects_sf_names: (list of int or str) list of project ids if ``union`` is None, \
    otherwise a list of splicing factor names
    :param target_column: (string) the column for which we want to get information on exons.
    :param regulation: (string) up or down
    :param nt_dnt: (string) a nucleotide or a di-nucleotide
    :param union: (None or string) None if we want to work project by project, anything else to work \
    with exons regulated by a particular splicing factor.
    :return: (list of list of float) each sublist of float corresponds to the values of ``target_column`` \
    for every regulated exon in a given project.
    """

    results = []
    if not union:
        for id_project in id_projects_sf_names:
            exon_list = get_ase_events(cnx, id_project, regulation)
            results.append(get_list_of_value_iupac_dnt(cnx, exon_list, target_column, nt_dnt))

    else:
        for sf_name in id_projects_sf_names:
            exon_list = union_dataset_function.get_every_events_4_a_sl(cnx, sf_name, regulation)
            results.append(get_list_of_value_iupac_dnt(cnx, exon_list, target_column, nt_dnt))
    return results
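
A hedged usage sketch (not from the original source) of the two calling modes selected by ``union``; the database path, project ids, factor names and the "iupac_exon" column name are placeholders/assumptions.

import sqlite3

def example_iupac_values(sed_db_path):
    cnx = sqlite3.connect(sed_db_path)
    # project-by-project mode: splicing lore project ids, union left to None
    by_project = get_values_for_many_projects_iupac_dnt(
        cnx, [17, 42], "iupac_exon", "down", "S", None)
    # union mode: one merged exon list per splicing factor name
    by_factor = get_values_for_many_projects_iupac_dnt(
        cnx, ["SRSF1", "SRSF2"], "iupac_exon", "down", "S", "union")
    cnx.close()
    return by_project, by_factor
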
Example no. 6
def get_values_for_many_projects(cnx, cnx_fasterdb, id_projects_sf_names, target_column,
                                 regulation, output_bp_file, union):
    """
    Return the value of ``target_column`` for every ``regulation`` exon of each project or splicing \
    factor in ``id_projects_sf_names``.

    :param cnx: (sqlite3 connection object) connection to sed database
    :param cnx_fasterdb: (sqlite3 connection object) connection to fasterdb database
    :param id_projects_sf_names: (list of int or str) list of project ids if ``union`` is None, \
    otherwise a list of splicing factor names
    :param target_column: (string) the column for which we want to get information on exons.
    :param regulation: (string) up or down
    :param output_bp_file: (string) path where the bp files will be created
    :param union: (None or string) None if we want to work project by project, anything else to work \
    with exons regulated by a particular splicing factor.
    :return: (list of list of float) each sublist of float corresponds to the values of ``target_column`` \
    for every regulated exon in a given project.
    """
    results = []
    if not union:
        for id_project in id_projects_sf_names:
            exon_list = get_ase_events(cnx, id_project, regulation)
            if target_column == "median_flanking_intron_size":
                values1 = np.array(get_redundant_list_of_value(cnx, exon_list, "upstream_intron_size"), dtype=float)
                values2 = np.array(get_redundant_list_of_value(cnx, exon_list, "downstream_intron_size"), dtype=float)
                values = np.array([np.nanmedian([values1[i], values2[i]]) for i in range(len(values1))])
                results.append(values)
            elif target_column in ["nb_good_bp", "hbound", "ag_count"]:
                results.append(handle_nb_bp_recovering(cnx_fasterdb, exon_list,
                                                       output_bp_file, str(id_project), regulation,
                                                       target_column))
            elif "mfe" in target_column:
                results.append(handle_mfe_recovering(cnx_fasterdb, exon_list, output_bp_file,
                                                     str(id_project), regulation, target_column))
            else:
                results.append(get_list_of_value(cnx, exon_list, target_column))

    else:
        for sf_name in id_projects_sf_names:
            exon_list = union_dataset_function.get_every_events_4_a_sl(cnx, sf_name, regulation)
            if target_column == "median_flanking_intron_size":
                values1 = np.array(get_redundant_list_of_value(cnx, exon_list, "upstream_intron_size"), dtype=float)
                values2 = np.array(get_redundant_list_of_value(cnx, exon_list, "downstream_intron_size"), dtype=float)
                values = np.array([np.nanmedian([values1[i], values2[i]]) for i in range(len(values1))])
                results.append(values)
            elif target_column == "min_flanking_intron_size":
                values1 = np.array(get_redundant_list_of_value(cnx, exon_list, "upstream_intron_size"), dtype=float)
                values2 = np.array(get_redundant_list_of_value(cnx, exon_list, "downstream_intron_size"),
                                   dtype=float)
                values = np.array([np.nanmin([values1[i], values2[i]]) for i in range(len(values1))])
                results.append(values)
            elif target_column in ["nb_good_bp", "hbound", "ag_count"]:
                results.append(handle_nb_bp_recovering(cnx_fasterdb, exon_list, output_bp_file, sf_name,
                                                       regulation, target_column))
            elif "mfe" in target_column:
                results.append(handle_mfe_recovering(cnx_fasterdb, exon_list, output_bp_file, sf_name,
                                                     regulation, target_column))
            else:
                results.append(get_list_of_value(cnx, exon_list, target_column))
    return results
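
The "median_flanking_intron_size" and "min_flanking_intron_size" branches above reduce, for each exon, its upstream and downstream intron sizes to a single number while ignoring missing values. A minimal runnable sketch of that per-exon reduction; the toy size arrays are hypothetical.

import numpy as np

def flanking_intron_stat(upstream_sizes, downstream_sizes, stat=np.nanmedian):
    # Pair the upstream/downstream intron size of each exon and reduce the pair
    # with ``stat`` (np.nanmedian or np.nanmin), skipping NaN values.
    up = np.array(upstream_sizes, dtype=float)
    down = np.array(downstream_sizes, dtype=float)
    return np.array([stat([u, d]) for u, d in zip(up, down)])

# Example: flanking_intron_stat([100, np.nan, 50], [300, 80, np.nan])
# -> array([200., 80., 50.])
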
def difference(cnx, list1, list2, regulation):
    """
    Return the exons regulated by the factors in list1 if they are not regulated by the factors in list2
    :param cnx: (sqlite3 connect object) connection to sed database
    :param list1: (list of string) list of splicing factors
    :param list2:  (list of strings) list of splicing factros
    :param regulation: (string) the exons with the regulation ``regulation`` regulated by the splicing factors in \
    ``list1`` or ``list2``
    :return:(list of list of 2 int
    """
    exon_list1 = []
    exon_list2 = []
    for sf_name in list1:
        exon_list1 += union_dataset_function.get_every_events_4_a_sl(cnx, sf_name, regulation)
    exon_list1 = union_dataset_function.washing_events_all(exon_list1)
    for sf_name in list2:
        exon_list2 += union_dataset_function.get_every_events_4_a_sl(cnx, sf_name, regulation)
    exon_list2 = union_dataset_function.washing_events_all(exon_list2)
    return [exon for exon in exon_list1 if exon not in exon_list2]
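
The membership test above (like the ``exon in exon_gc`` / ``exon in exon_u2`` tests of redundant_ag_at_and_u1_u2) is quadratic on lists of [gene_id, exon_pos] pairs. A minimal sketch, assuming that exon encoding, of an equivalent set-based difference with constant-time lookups.

def difference_fast(exon_list1, exon_list2):
    # Same result as the return statement above, but with O(1) membership tests.
    excluded = {tuple(exon) for exon in exon_list2}
    return [exon for exon in exon_list1 if tuple(exon) not in excluded]

# Example: difference_fast([[1, 2], [3, 4]], [[3, 4]]) -> [[1, 2]]
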
def get_gene_values(cnx, sf_list, target_column1, target_column2, regulation):
    """
    Return the values of ``target_column1`` and ``target_column2`` for every \
    ``regulation`` exon regulated by a splicing factor in (one or multiple) cell lines.

    :param cnx: (sqlite3 connection object) allows connection to sed database
    :param sf_list: (list of string) the list of splicing factors studied
    :param target_column1: (string) the first value for which we want to get the median value for the \
    ``regulation`` exons.
    :param target_column2: (string) the second value for which we want to get the median value for the \
    ``regulation`` exons.
    :param regulation: (list of string) up or down or up + down
    :return: 3 lists:

        * values1 : (list of float) the values of ``target_column1`` for every exon regulated by \
        the factors in ``sf_list``
        * values2 : (list of float) the values of ``target_column2`` for the same exons
        * gene_name : (list of string) the names of the genes hosting the regulated exons
    """
    if "$" in target_column1:
        target_column1, nt1 = target_column1.split("$")
    else:
        nt1 = None
    if "$" in target_column2:
        target_column2, nt2 = target_column2.split("$")
    else:
        nt2 = None
    exon_list = []
    if isinstance(sf_list[0], str):
        for sf_name in sf_list:
            exon_list += union_dataset_function.get_every_events_4_a_sl(cnx, sf_name, regulation)
        exon_list = union_dataset_function.washing_events_all(exon_list)
    else:
        exon_list = sf_list
    gene_id = []
    for val in exon_list:
        if val[0] not in gene_id:
            gene_id.append(val[0])
    gene_name = [union_dataset_function.get_gene_name(cnx, my_id) for my_id in gene_id]
    if nt1:
        values1 = get_list_of_value_iupac_dnt(cnx, exon_list, target_column1, nt1)
    else:
        values1 = functions.get_list_of_value(cnx, exon_list, target_column1)

    if nt2:
        values2 = get_list_of_value_iupac_dnt(cnx, exon_list, target_column2, nt2)
    else:
        values2 = functions.get_list_of_value(cnx, exon_list, target_column2)

    return values1, values2, gene_name
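
A small sketch of the "column$nt" convention parsed at the top of get_gene_values (and of get_exons_values further down): the part after "$" selects a (di-)nucleotide and is None when absent. The column names in the example comment are hypothetical.

def split_target_column(target_column):
    # "iupac_exon$S" -> ("iupac_exon", "S"); "exon_size" -> ("exon_size", None)
    if "$" in target_column:
        column, nt = target_column.split("$")
        return column, nt
    return target_column, None
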
Example no. 9
def mydiff(cnx, exon_list, sf_type, regulation):
    """
    Remove every exon of ``exon_list`` also persent in ``sf_type``.

    :param exon_list: (list of list of 2 int) a list of exons
    :param sf_type: (str) the type of splicing factor for which we don't \
    want to display any exons.
    :return:  (list of list of 2 int) the ``exon_list` without the \
    exon regulated by a kind of splicing factor.
    """
    exon_sf = []
    for sf_name in eval("group_factor.%s" % sf_type):
        exon_sf += union_dataset_function.get_every_events_4_a_sl(cnx, sf_name,
                                                                  regulation)
    return [exon for exon in exon_list if exon not in exon_sf]
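
The ``eval("group_factor.%s" % sf_type)`` call above only performs an attribute lookup on the group_factor module; a sketch of the equivalent getattr form, using types.SimpleNamespace as a toy stand-in for group_factor (which is not shown in this snippet).

from types import SimpleNamespace

def factors_of_type(group_factor_module, sf_type):
    # Same lookup as eval("group_factor.%s" % sf_type), without evaluating a string.
    return getattr(group_factor_module, sf_type)

# Example with a toy stand-in:
# factors_of_type(SimpleNamespace(u1_factors=["SNRPC", "SNRNP70"]), "u1_factors")
# -> ["SNRPC", "SNRNP70"]
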
Example no. 10
def main(exon_file, name_table, list_sf, sed, fasterdb, output, ss="5'ss"):
    """
    Create a table showing, for the exons common to ``exon_file`` and to the exons down-regulated \
    by the factors in ``list_sf``, their surrounding intron lengths and their MFE at their 5'ss.

    :param exon_file: (str) a file containing gc/at exons
    :param name_table: (str) the name of the resulting table
    :param list_sf: (List(vtype=str)) list of sf names
    :param sed: (str) path to sed database
    :param fasterdb: (str) path to fasterdb database
    :param output: (str) folder where the output will be created
    :param ss: (str) the splicing site of interest
    """
    sf_names = "_".join([name_table] + list_sf)
    exon_class.set_debug(1)
    exon_class_bp.set_debug(debug=1)
    cnx_sed = sqlite3.connect(sed)
    cnx_fasterdb = sqlite3.connect(fasterdb)
    exon_list = []
    print("Getting exon from file")
    exon_list.append(get_exon(exon_file))
    print("Getting regulated exons")
    for sf in list_sf:
        tmp = udf.get_every_events_4_a_sl(cnx_sed, sf, "down")
        tmp = [[int(v[0]), int(v[1])] for v in tmp]
        exon_list.append(tmp)
        print("\t%s : %s down-regulated exons" % (sf, len(tmp)))
    new_exon_list = reduce(get_union_exon, exon_list)
    print("Commons exons : %s" % len(new_exon_list))
    print("Getting commons exons data !")
    df = get_exon_data(cnx_sed, new_exon_list, ss)
    if ss == "5'ss":
        noutput = output + "/rnafold_" + sf_names + "_commons_down_exons/"
        print("Computing MFE")
        df = computing_mfe(cnx_fasterdb, df, noutput)
    else:
        # Code to compute number of good branch point
        print("Computing Good branch point")
        nexon_list = df[["gene_name", "gene_id", "pos"]].values
        df2 = svm_bp_finder_launcher(cnx_fasterdb, nexon_list, output)
        print(df2.head())
        print(df.head())
        df = pd.merge(df, df2, how="right", on=["gene_id", "pos"])
    print("Writing results !")
    df.to_csv("%s/%s_commons_down_exons.csv" % (output, sf_names),
              sep="\t",
              index=False)
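
In the branch-point branch, the ``how="right"`` merge keeps only the (gene_id, pos) pairs present in ``df2`` (the SVM-BP-finder results) and attaches the exon data from ``df`` to them. A minimal runnable sketch with toy frames; apart from the gene_id/pos merge keys, the column names are hypothetical placeholders.

import pandas as pd

toy_df = pd.DataFrame({"gene_id": [1, 1, 2], "pos": [3, 4, 7],
                       "iupac_exon": [0.4, 0.5, 0.6]})   # exon data
toy_df2 = pd.DataFrame({"gene_id": [1, 2], "pos": [4, 7],
                        "nb_good_bp": [2, 1]})            # branch point results
merged = pd.merge(toy_df, toy_df2, how="right", on=["gene_id", "pos"])
# merged contains only the (1, 4) and (2, 7) exons, with both the exon data
# and the branch point columns.
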
Example no. 11
def get_exons_list(cnx, sf_list, regulation):
    """
    Return every non-redundant exon regulated by at least one factor in ``sf_list`` (with the regulation \
    ``regulation``).

    :param cnx: (sqlite3 connection object) allows connection to sed database
    :param sf_list: (list of string) the list of splicing factors studied
    :param regulation: (list of string) up or down or up + down
    :return: (list of list of int) list of exons showing the regulation ``regulation`` for at least one factor \
    in ``sf_list``
    """
    exon_list = []
    for sf_name in sf_list:
        exon_list += union_dataset_function.get_every_events_4_a_sl(
            cnx, sf_name, regulation)
    exon_list = union_dataset_function.washing_events_all(exon_list)
    return exon_list
def get_exons_values(cnx, sf_list, target_column1, target_column2, regulation):
    """
    Return the values of ``target_column1`` and ``target_column2`` for every \
    ``regulation`` exon regulated by a splicing factor in (one or multiple) cell lines.

    :param cnx: (sqlite3 connection object) allows connection to sed database
    :param sf_list: (list of string) the list of splicing factors studied
    :param target_column1: (string) the first value for which we want to get the median value for the ``regulation`` exons.
    :param target_column2: (string) the second value for which we want to get the median value for the ``regulation`` exons.
    :param regulation: (list of string) up or down or up + down
    :return: 3 lists:

        * values1 : (list of float) the values of ``target_column1`` for every exon regulated by \
        the factors in ``sf_list``
        * values2 : (list of float) the values of ``target_column2`` for the same exons
        * exon_name : (list of string) the name ("gene-name_exon-position") of every regulated exon; when a \
        target column yields two values per exon, the names are doubled with "_upstream"/"_downstream" suffixes

    """
    if "$" in target_column1:
        target_column1, nt1 = target_column1.split("$")
    else:
        nt1 = None
    if "$" in target_column2:
        target_column2, nt2 = target_column2.split("$")
    else:
        nt2 = None
    exon_list = []
    if isinstance(sf_list[0], str):
        for sf_name in sf_list:
            exon_list += union_dataset_function.get_every_events_4_a_sl(cnx, sf_name, regulation)
        exon_list = union_dataset_function.washing_events_all(exon_list)
    else:
        exon_list = sf_list
    print(len(exon_list))
    exon_name = ["%s_%s" % (union_dataset_function.get_gene_name(cnx, a[0]), a[1]) for a in exon_list]
    values1 = get_interest_values(cnx, exon_list, target_column1, nt1)
    values2 = get_interest_values(cnx, exon_list, target_column2, nt2)
    if len(exon_name) * 2 == len(values1):
        exon_name = ["%s_upstream" % a for a in exon_name] + \
                    ["%s_downstream" % a for a in exon_name]
    return values1, values2, exon_name
def extract_data(cnx, cnx_sed, list_files, list_names, pos):
    """

    :param cnx: (sqlite3 connect object) connection to fasterDB lite
    :param cnx_sed: (sqlite3 connect object) connection to sed
    :param list_files: (list of string) list of files containing exon set
    :param list_names: (list of string) the name of exon set
    :param pos: (int) the position of interest within the list ``list_files`` and ``list_names``. \
    Those 2 lists must hace the same lenght
    :return: (list of ExonClass object) list of exon.
    """
    if list_files:
        exon_list = extract_exon_files(cnx, list_files[pos])
    else:
        exon_list_tmp = union_dataset_function.get_every_events_4_a_sl(
            cnx_sed, list_names[pos], "down")
        exon_list = [
            exon_class.ExonClass(cnx, str(exon[0]), int(exon[0]), int(exon[1]))
            for exon in exon_list_tmp
        ]
    print("%s : %s exons" % (list_names[pos], len(exon_list)))
    return exon_list
def get_median_value(cnx, id_projects_sf_name, target_column, control_dic, regulation, operation,
                     representation, nt=None):
    """
    Return the median value of ``target_column`` in the ``regulation`` exons of every project or factor \
    in ``id_projects_sf_name``.

    :param cnx: (sqlite3 connection object) allows connection to sed database
    :param id_projects_sf_name: (list of int or str) the splicing lore project ids or splicing factor names \
    of interest
    :param target_column: (string) the value for which we want to get the median value for the ``regulation`` \
    exons.
    :param control_dic: (dictionary of list of float) median value of each possible control exon for \
    each feature in the sed database.
    :param regulation: (list of string) up or down or up + down
    :param operation: (string) mean or median
    :param representation: (string) relative or absolute
    :param nt: (string) the nt of interest
    :return: (list of float) the relative median value (compared to control exons) of ``target_column`` for the \
    ``regulation`` exons of each project or factor in ``id_projects_sf_name``
    """
    values_list = []
    try:
        int(id_projects_sf_name[0])
        sf_type = "project"
    except ValueError:
        sf_type = "sf"
    if sf_type == "project":
        for i in range(len(id_projects_sf_name)):
            exon_list = functions.get_ase_events(cnx, id_projects_sf_name[i], regulation)
            final_value = get_relative_value_of_a_project_or_sf(cnx, exon_list, target_column, control_dic, nt,
                                                                operation, representation)
            values_list.append(final_value)
    else:
        for sf_name in id_projects_sf_name:
            exon_list = union_dataset_function.get_every_events_4_a_sl(cnx, sf_name, regulation)
            final_value = get_relative_value_of_a_project_or_sf(cnx, exon_list, target_column, control_dic, nt,
                                                                operation, representation)
            values_list.append(final_value)

    return values_list
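
A tiny sketch of the dispatch used at the top of get_median_value: if the first element of ``id_projects_sf_name`` parses as an int it is treated as a splicing lore project id, otherwise as a splicing factor name.

def detect_id_type(id_projects_sf_name):
    # Mirrors the try/except ValueError block above.
    try:
        int(id_projects_sf_name[0])
        return "project"
    except ValueError:
        return "sf"

# Example: detect_id_type([17, 42]) -> "project"; detect_id_type(["SRSF1"]) -> "sf"
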
def create_matrix(cnx,
                  id_projects,
                  names,
                  target_columns,
                  control_dic,
                  ctrl_full,
                  regulations,
                  operation,
                  union=None,
                  sf_type=None):
    """
    Create a matrix of relative medians (toward control) for iupac characteristics of an exon set.

    :param cnx: (sqlite3 connect object) connexion to sed database
    :param id_projects: (list of ints) the list of splicing lore project id.
    :param names: (list of strings) the list of project names (corresponding - in the same order - to the projects \
    in ``id_projects``) or, if ``union`` is not None, the list of sf_names.
    :param target_columns: (list of strings) list of interest characteristics for a set of exons.
    :param control_dic: (dictionary of float) dictionary storing median values for every characteristic of \
    the control set of exons.
    :param ctrl_full: (dictionary of list of float) dictionary storing every value for every characteristic of \
    the control set of exons.
    :param regulations: (list of strings) the strings can be "up" or "down" only, for up- or down-regulated exons.
    :param operation: (string) the type of heatmap we want to produce, i.e. a heatmap of the mean or of the median
    :param union: (None or string) None if we want to work project by project, anything else to work \
    with exons regulated by a particular splicing factor.
    :param sf_type: (string) the type of splicing factor we want to display in the final figures
    :return: 4 objects:

        * projects_tab : (list of list of float) the relative mean/median values for each project or factor \
        (one line) for every characteristic of interest
        * df : (pandas.DataFrame) the absolute values in long format, with "values", "project" and \
        "features" columns
        * project_names : (list of string) the name of each project/factor line
        * new_targets : (list of string) the expanded list of target columns
    """
    print(regulations)
    new_targets = create_columns_names(target_columns)
    project_names = []
    projects_tab = []
    project_abs = []
    project_vect_names = []
    project_col = []
    for i in range(len(names)):
        for regulation in regulations:
            project_res = []
            if len(regulations) == 1:
                project_names.append("%s" % (names[i]))
            else:
                project_names.append("%s_%s" % (names[i], regulation))
            if not union:
                exon_list = functions.get_ase_events(cnx, id_projects[i],
                                                     regulation)
                print("Splicing factor : %s, project %s  - exons %s - reg %s" %
                      (names[i], id_projects[i], len(exon_list), regulation))
                exon_list = difference(exon_list, names[i], sf_type)
            else:
                exon_list = union_dataset_function.get_every_events_4_a_sl(
                    cnx, names[i], regulation)
                print("Splicing factor : %s - exons %s - reg %s" %
                      (names[i], len(exon_list), regulation))
                exon_list = difference(exon_list, names[i], sf_type)
            for j in range(len(new_targets)):
                if "_nt_" in new_targets[j]:
                    nt = new_targets[j].split("_")[0]
                    name_col = new_targets[j].replace("%s_nt" % nt, "iupac")
                    if "mean_intron" in new_targets[j]:
                        values1 = np.array(
                            functions.get_list_of_value_iupac_dnt(
                                cnx, exon_list, "iupac_upstream_intron", nt))
                        values2 = np.array(
                            functions.get_list_of_value_iupac_dnt(
                                cnx, exon_list, "iupac_downstream_intron", nt))
                        values = np.array([
                            np.nanmean([values1[i], values2[i]])
                            for i in range(len(values1))
                        ])
                    else:
                        values = np.array(
                            functions.get_list_of_value_iupac_dnt(
                                cnx, exon_list, name_col, nt))
                        if names[i] == "QKI" and nt == "G":
                            print(exon_list)
                            print(values)
                    list_val = values[~np.isnan(values)]
                    val_obs = eval("np.%s(list_val)" % operation)
                    final_value = float(val_obs - control_dic[name_col][nt]) / \
                        control_dic[name_col][nt] * 100
                    # ctrl_val = np.array(ctrl_full[name_col][nt], dtype=float)
                else:
                    if new_targets[j] == "median_flanking_intron_size":
                        values1 = np.array(
                            functions.get_redundant_list_of_value(
                                cnx, exon_list, "upstream_intron_size"),
                            dtype=float)
                        values2 = np.array(
                            functions.get_redundant_list_of_value(
                                cnx, exon_list, "downstream_intron_size"),
                            dtype=float)
                        values = np.array([
                            np.nanmedian([values1[i], values2[i]])
                            for i in range(len(values1))
                        ])
                    elif new_targets[j] == "min_flanking_intron_size":
                        values1 = np.array(
                            functions.get_redundant_list_of_value(
                                cnx, exon_list, "upstream_intron_size"),
                            dtype=float)
                        values2 = np.array(
                            functions.get_redundant_list_of_value(
                                cnx, exon_list, "downstream_intron_size"),
                            dtype=float)
                        values = np.array([
                            np.nanmin([values1[i], values2[i]])
                            for i in range(len(values1))
                        ])
                    else:

                        values = np.array(
                            functions.get_list_of_value(
                                cnx, exon_list, new_targets[j]))
                    list_val = values[~np.isnan(values)]
                    val_obs = eval("np.%s(values[~np.isnan(values)])" %
                                   operation)
                    final_value = float(val_obs - control_dic[new_targets[j]]) / \
                        control_dic[new_targets[j]] * 100
                    # ctrl_val = np.array(ctrl_full[new_targets[j]], dtype=float)
                project_abs.append(list_val)
                project_vect_names.append([project_names[-1]] * len(list_val))
                project_col.append([new_targets[j]] * len(list_val))
                project_res.append(final_value)
            projects_tab.append(project_res)
    for j in range(len(new_targets)):
        if "_nt_" in new_targets[j]:
            nt = new_targets[j].split("_")[0]
            name_col = new_targets[j].replace("%s_nt" % nt, "iupac")
            mctrl = np.array(ctrl_full[name_col][nt], dtype=float)
            mctrl = list(mctrl[~np.isnan(mctrl)])
        else:
            mctrl = np.array(ctrl_full[new_targets[j]], dtype=float)
            mctrl = list(mctrl[~np.isnan(mctrl)])
        project_abs.append(mctrl)
        project_vect_names.append(["CCE"] * len(mctrl))
        project_col.append([new_targets[j]] * len(mctrl))
    df = pd.DataFrame({
        "values": list(np.hstack(project_abs)),
        "project": list(np.hstack(project_vect_names)),
        "features": list(np.hstack(project_col))
    })
    return projects_tab, df, project_names, new_targets
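
Each cell of ``projects_tab`` above is the observed mean/median of a feature in the exon set, expressed as a percentage deviation from the control value, after discarding NaNs. A minimal runnable sketch of that computation; the numbers in the example comment are made up.

import numpy as np

def relative_deviation(values, control_value, stat=np.median):
    # (observed - control) / control * 100, ignoring NaN values
    values = np.asarray(values, dtype=float)
    observed = stat(values[~np.isnan(values)])
    return float(observed - control_value) / control_value * 100

# Example: relative_deviation([48.0, 52.0, float("nan")], 40.0) -> 25.0
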
Example no. 16
def get_values_for_many_projects_iupac_dnt(cnx, id_projects_sf_names,
                                           target_columns, regulation,
                                           ctrl_full, exon_type):
    """
    Return, for every ``regulation`` exon of each splicing factor in ``id_projects_sf_names``, the values \
    of every column in ``target_columns``, together with the control exons of type ``exon_type``.

    :param cnx: (sqlite3 connection object) connection to sed database
    :param id_projects_sf_names: (list of str) list of splicing factor names
    :param target_columns: (list of string) the list of target columns of interest
    :param regulation: (string) up or down
    :param ctrl_full: (dictionary of list of float) the list of float for every exon for each feature of interest.
    :param exon_type: (string) the control exon type used
    :return: (pandas.DataFrame) one column per target column plus a "project" column giving the splicing factor \
    (or ``exon_type`` for control exons) of each value.
    """
    my_len = None
    results = {target_column: [] for target_column in target_columns}
    results["project"] = []
    for sf_name in id_projects_sf_names:
        exon_list = union_dataset_function.get_every_events_4_a_sl(
            cnx, sf_name, regulation)
        for target_column in target_columns:
            if "_nt_" in target_column:
                my_nt, target_column_name = target_column.split("_nt_")
                target_column_name = "iupac_" + target_column_name
                results[target_column] += get_list_of_value_iupac_dnt(
                    cnx, exon_list, target_column_name, my_nt)
            else:
                results[target_column] += get_list_of_value(
                    cnx, exon_list, target_column)
        results["project"] += [sf_name] * len(exon_list)
        # print("SF : %s" % sf_name)
        # for target_column in target_columns:
        #     print("%s len : %s"  % (target_column, len(results[target_column])))
        # print("project len : %s" % len(results["project"]))
    for target_column in target_columns:
        print(target_column)
        if "_nt_" in target_column:
            my_nt, target_column_name = target_column.split("_nt_")
            target_column_name = "iupac_" + target_column_name
            print("test len %s : %s" %
                  (target_column_name, len(
                      ctrl_full[target_column_name][my_nt])))
            results[target_column] += ctrl_full[target_column_name][my_nt]
            my_len = len(ctrl_full[target_column_name][my_nt])

        else:
            results[target_column] += ctrl_full[target_column]
            my_len = len(ctrl_full[target_column])
    results["project"] += [exon_type] * my_len
    # print("CCE")
    # print(ctrl_full.keys())
    # print(len(ctrl_full["iupac_upstream_intron"]["S"]))
    # print(type(ctrl_full["iupac_upstream_intron"]["S"]))
    for target_column in target_columns:
        print("%s len : %s" % (target_column, len(results[target_column])))
    print("project len : %s" % len(results["project"]))
    df = pd.DataFrame(results)
    return df
def create_matrix(cnx, id_projects, names, target_columns, control_dic, regulations, union=None, sf_type=None):
    """
    Create a matrix of relative medians (toward control) for iupac characteristics of an exon set.

    :param cnx: (sqlite3 connect object) connexion to sed database
    :param id_projects: (list of ints) the list of splicing lore project id.
    :param names: (list of strings) the list of project names (corresponding - in the same order - to the projects \
    in ``id_projects``) or, if ``union`` is not None, the list of sf_names.
    :param target_columns: (list of strings) list of interest characteristics for a set of exons.
    :param control_dic: (dictionary of float) dictionary storing median values for every characteristic of \
    the control set of exons.
    :param regulations: (list of strings) the strings can be "up" or "down" only, for up- or down-regulated exons.
    :param union: (None or string) None if we want to work project by project, anything else to work \
    with exons regulated by a particular splicing factor.
    :param sf_type: (string) the type of splicing factor we want to display in the final figures
    :return: 3 objects: ``projects_tab`` (list of list of float) the relative median values for each project or \
    factor (one line) for every characteristic of interest, ``project_names`` (list of string) the name of \
    each line, and ``new_targets`` (list of string) the expanded list of target columns.
    """
    new_targets = create_columns_names(target_columns)
    project_names = []
    projects_tab = []
    for i in range(len(names)):
        for regulation in regulations:
            project_res = []
            if len(regulations) > 1:
                project_names.append("%s_%s" % (names[i], regulation))
            else:
                project_names.append("%s" % (names[i]))
            if not union:
                exon_list = figure_producer.get_ase_events(cnx, id_projects[i], regulation)
                print("Splicing factor : %s, project %s  - exons %s" % (names[i], id_projects[i], len(exon_list)))
                exon_list = difference(cnx, exon_list, names[i], regulation, sf_type)
            else:
                exon_list = union_dataset_function.get_every_events_4_a_sl(cnx, names[i], regulation)
                print("Splicing factor : %s - exons %s" % (names[i], len(exon_list)))
                exon_list = difference(cnx, exon_list, names[i], regulation, sf_type)
            for j in range(len(new_targets)):
                if "_nt_" in new_targets[j]:
                    nt = new_targets[j].split("_")[0]
                    name_col = new_targets[j].replace("%s_nt" % nt, "iupac")
                    if "mean_intron" in new_targets[j]:
                        values1 = np.array(figure_producer.get_list_of_value_iupac_dnt(cnx, exon_list, "iupac_upstream_intron", nt))
                        values2 = np.array(figure_producer.get_list_of_value_iupac_dnt(cnx, exon_list, "iupac_downstream_intron", nt))
                        values = np.array([np.nanmedian([values1[i], values2[i]]) for i in range(len(values1))])
                    else:
                        values = np.array(figure_producer.get_list_of_value_iupac_dnt(cnx, exon_list, name_col, nt))

                    median_obs = np.median(values[~np.isnan(values)])
                    final_value = float(median_obs - control_dic[name_col][nt]) / \
                        control_dic[name_col][nt] * 100
                else:
                    if new_targets[j] == "median_flanking_intron_size":
                        values1 = np.array(
                            figure_producer.get_redundant_list_of_value(cnx, exon_list, "upstream_intron_size"),
                            dtype=float)
                        values2 = np.array(
                            figure_producer.get_redundant_list_of_value(cnx, exon_list, "downstream_intron_size"),
                            dtype=float)
                        values = np.array([np.nanmedian([values1[i], values2[i]]) for i in range(len(values1))])
                    elif new_targets[j] in ["nb_good_bp", "hbound", "ag_count"]:
                        if union:
                            values = np.array(figure_producer.handle_nb_bp_recovering(cnx_fasterdb, exon_list, output_bp, names[i], regulation, new_targets[j]))
                        else:
                            values = np.array(figure_producer.handle_nb_bp_recovering(cnx_fasterdb, exon_list, output_bp, str(id_projects[i]), regulation, new_targets[j]))
                    elif "mfe" in new_targets[j]:
                        if union:
                            values = np.array(figure_producer.handle_mfe_recovering(cnx_fasterdb, exon_list, output_bp, names[i], regulation, new_targets[j]))
                        else:
                            values = np.array(figure_producer.handle_mfe_recovering(cnx_fasterdb, exon_list, output_bp, str(id_projects[i]), regulation, new_targets[j]))
                    else:
                        values = np.array(figure_producer.get_list_of_value(cnx, exon_list, new_targets[j]))
                    if new_targets[j] in figure_producer.log_columns:
                        median_obs = np.median(values[~np.isnan(values)])
                        final_value = float(math.log10(median_obs) - math.log10(control_dic[new_targets[j]])) / math.log10(control_dic[
                            new_targets[j]]) * 100
                    else:
                        median_obs = np.median(values[~np.isnan(values)])
                        final_value = float(median_obs - control_dic[new_targets[j]]) / control_dic[new_targets[j]] * 100
                project_res.append(final_value)
            projects_tab.append(project_res)
    return projects_tab, project_names, new_targets