Esempio n. 1
0
def redundant_ag_at_and_u1_u2(cnx, regulation):
    """
    Compute the exons shared by the AT-rich and GC-rich exon sets, and the exons
    shared by the U1 and U2 exon sets.

    For each factor group (``group_factor.at_rich_down``, ``group_factor.gc_rich_down``,
    ``group_factor.u1_factors`` and ``group_factor.u2_factors``) the events of every
    factor are pooled and passed through ``union_dataset_function.washing_events_all``.

    The function does not return anything: the two intersections are stored in the
    module-level globals ``redundant_gc_at`` and ``redundant_u1_u2`` and their sizes
    are printed on stdout.

    :param cnx: (sqlite3 connect object) allow connection to sed database
    :param regulation: (string) the regulation we want for the common exons
    :return: None
    """
    global redundant_gc_at, redundant_u1_u2

    def pooled_exons(factor_names):
        # Pool the events of every factor of the group, then clean the pooled list.
        pooled = []
        for sf_name in factor_names:
            pooled += union_dataset_function.get_every_events_4_a_sl(cnx, sf_name, regulation)
        return union_dataset_function.washing_events_all(pooled)

    def shared_exons(first, second):
        # Hash ``second`` once instead of scanning it for every exon of ``first``.
        # Assumes each exon is a flat sequence of hashable values (gene id, exon
        # position), as the docstrings of this module describe.
        second_keys = {tuple(exon) for exon in second}
        # Order (and any repetition) of ``first`` is kept, as with a list scan.
        return [exon for exon in first if tuple(exon) in second_keys]

    exon_at = pooled_exons(group_factor.at_rich_down)
    exon_gc = pooled_exons(group_factor.gc_rich_down)
    redundant_gc_at = shared_exons(exon_at, exon_gc)
    print("redundant exon GC and AT rich : %s" % len(redundant_gc_at))

    exon_u1 = pooled_exons(group_factor.u1_factors)
    exon_u2 = pooled_exons(group_factor.u2_factors)
    redundant_u1_u2 = shared_exons(exon_u1, exon_u2)
    print("redundant exon U1 and U2 rich : %s" % len(redundant_u1_u2))
def get_exon_list(cnx, annotation_name, regulation):
    """
    Return the exon list matching ``annotation_name``.

    Only the part of ``annotation_name`` located before its first ``_`` is used as
    the actual name. Three cases are handled, in this order:

    * the name contains ``GC`` or ``AT``: the exons are read with ``extract_exon_list``
      from the file ``<name>_rich_exons`` located in the folder obtained by replacing
      ``src`` with ``data/`` in the real path of this module's directory ;
    * the name contains ``U1-FACTORS`` or ``U2-FACTORS``: the events of every factor
      of the group are pooled and passed through ``washing_events_all`` ;
    * otherwise the name is upper-cased, a few aliases are rewritten
      (``SFRS`` -> ``SRSF``, ``TRA2A`` -> ``TRA2A_B``, ``DDX5-17`` -> ``DDX5_DDX17``)
      and the events of that single factor are returned.

    :param cnx: (sqlite3 connect object) connection to sed database
    :param annotation_name: (string) GC-AT or a sf_name
    :param regulation: (string) the regulation of an exon list by a factor(s)
    :return: (list of 2 int) gene id and exon_pos
    """
    factor_groups = {
        "U1-FACTORS": ["SNRPC", "SNRNP70", "DDX5_DDX17"],
        "U2-FACTORS": ["U2AF2", "SF1", "SF3A3", "SF3B4"]
    }
    # The branch is chosen on the full name, the lookups use the truncated one.
    short_name = annotation_name.split("_")[0]

    if "GC" in annotation_name or "AT" in annotation_name:
        here = os.path.realpath(os.path.dirname(__file__))
        data_dir = here.replace("src", "data/")
        return extract_exon_list("%s%s_rich_exons" % (data_dir, short_name))

    if "U1-FACTORS" in annotation_name or "U2-FACTORS" in annotation_name:
        pooled = []
        for factor in factor_groups[short_name]:
            pooled.extend(
                union_dataset_function.get_every_events_4_a_sl(cnx, factor, regulation))
        return union_dataset_function.washing_events_all(pooled)

    factor = short_name.upper()
    aliases = (("SFRS", "SRSF"), ("TRA2A", "TRA2A_B"), ("DDX5-17", "DDX5_DDX17"))
    for old, new in aliases:
        factor = factor.replace(old, new)
    return union_dataset_function.get_every_events_4_a_sl(cnx, factor, regulation)
def extract_data(cnx, cnx_sed, list_files, list_names, pos, regulation):
    """
    Build the list of exons of the exon set located at position ``pos``.

    When ``list_files`` is not empty the exons are read from ``list_files[pos]`` with
    ``extract_exon_files``. Otherwise ``list_names[pos]`` must be ``U1-factors`` or
    ``U2-factors``: the events of every factor of that group are fetched from the
    sed database, passed through ``washing_events_all`` and wrapped into
    ``ExonClass`` objects. The number of exons found is printed before returning.

    :param cnx: (sqlite3 connect object) connection to fasterDB lite
    :param cnx_sed: (sqlite3 connect object) connection to sed
    :param list_files: (list of string) list of files containing exon set
    :param list_names: (list of string) the name of exon set
    :param pos: (int) the position of interest within the list ``list_files`` and
        ``list_names``. Those 2 lists must have the same length
    :param regulation: (string) up or down
    :return: (list of ExonClass object) list of exon.
    """
    if not list_files:
        factor_groups = {
            "U1-factors": ["SNRPC", "SNRNP70", "DDX5_DDX17"],
            "U2-factors": ["U2AF2", "SF1", "SF3A3", "SF3B4"]
        }
        raw_exons = []
        for factor in factor_groups[list_names[pos]]:
            raw_exons.extend(
                union_dataset_function.get_every_events_4_a_sl(cnx_sed, factor, regulation))
        raw_exons = union_dataset_function.washing_events_all(raw_exons)

        exon_list = []
        for exon in raw_exons:
            # The gene id is given twice: once as a string, once as an int.
            exon_list.append(
                exon_class_bp.ExonClass(cnx, str(exon[0]), int(exon[0]), int(exon[1])))
    else:
        exon_list = extract_exon_files(cnx, list_files[pos])

    print("%s : %s %s exons" % (list_names[pos], len(exon_list), regulation))
    return exon_list
def difference(cnx, list1, list2, regulation):
    """
    Return the exons regulated by the factors in ``list1`` that are not regulated
    by the factors in ``list2``.

    For each list, the events of every factor are pooled and passed through
    ``union_dataset_function.washing_events_all`` before the comparison.

    :param cnx: (sqlite3 connect object) connection to sed database
    :param list1: (list of string) list of splicing factors
    :param list2: (list of string) list of splicing factors
    :param regulation: (string) the exons with the regulation ``regulation`` regulated
        by the splicing factors in ``list1`` or ``list2``
    :return: (list of list of 2 int) the exons found for ``list1`` that are absent
        from the exons found for ``list2``, in the order they have for ``list1``
    """
    def pooled_exons(factor_names):
        # Pool the events of every factor, then clean the pooled list.
        pooled = []
        for sf_name in factor_names:
            pooled += union_dataset_function.get_every_events_4_a_sl(cnx, sf_name, regulation)
        return union_dataset_function.washing_events_all(pooled)

    exon_list1 = pooled_exons(list1)
    exon_list2 = pooled_exons(list2)

    # Hash the exons to exclude once instead of scanning ``exon_list2`` for every
    # exon of ``exon_list1``. Assumes each exon is a flat sequence of hashable
    # values (gene id, exon position), as documented for this module.
    excluded = {tuple(exon) for exon in exon_list2}
    return [exon for exon in exon_list1 if tuple(exon) not in excluded]
def get_gene_values(cnx, sf_list, target_column1, target_column2, regulation):
    """
    Return the values of two target columns for a set of exons, along with the
    names of the genes hosting those exons.

    A target column may be written ``<column>$<nt>``: the part after ``$`` is then
    forwarded to ``get_list_of_value_iupac_dnt``; without it (or when it is empty)
    the values come from ``functions.get_list_of_value``.

    :param cnx: (sqlite3 connexion object) allow connexion to sed database
    :param sf_list: (list of string) the list of splicing factor studied. If its
        first item is not a string, ``sf_list`` is used directly as the exon list.
    :param target_column1: (string) the first column of interest
    :param target_column2: (string) the second column of interest
    :param regulation: (list of string) up or down or up + down
    :return: 3 lists :

        * values1 : the values obtained for ``target_column1``
        * values2 : the values obtained for ``target_column2``
        * gene_name : (list) the name of every distinct gene id found in the exon
          list, in order of first appearance
    """
    def split_nt(column):
        # "<column>$<nt>" -> (column, nt) ; plain "<column>" -> (column, None)
        if "$" not in column:
            return column, None
        name, nt = column.split("$")
        return name, nt

    def column_values(column, nt):
        if nt:
            return get_list_of_value_iupac_dnt(cnx, exon_list, column, nt)
        return functions.get_list_of_value(cnx, exon_list, column)

    target_column1, nt1 = split_nt(target_column1)
    target_column2, nt2 = split_nt(target_column2)

    if not isinstance(sf_list[0], str):
        # Already a list of exons.
        exon_list = sf_list
    else:
        pooled = []
        for factor in sf_list:
            pooled.extend(
                union_dataset_function.get_every_events_4_a_sl(cnx, factor, regulation))
        exon_list = union_dataset_function.washing_events_all(pooled)

    # Distinct gene ids, in order of first appearance.
    distinct_ids = []
    for exon in exon_list:
        if exon[0] in distinct_ids:
            continue
        distinct_ids.append(exon[0])
    gene_name = []
    for current_id in distinct_ids:
        gene_name.append(union_dataset_function.get_gene_name(cnx, current_id))

    values1 = column_values(target_column1, nt1)
    values2 = column_values(target_column2, nt2)
    return values1, values2, gene_name
Esempio n. 6
0
def get_exons_list(cnx, sf_list, regulation):
    """
    Pool the exons regulated (with the regulation ``regulation``) by every factor
    in ``sf_list`` and return the pooled list once it has been passed through
    ``union_dataset_function.washing_events_all``.

    :param cnx: (sqlite3 connexion object) allow connexion to sed database
    :param sf_list: (list of string) the list of splicing factor studied
    :param regulation: (list of string) up or down or up + down
    :return: (list of list of int) list of exons showing the regulation
        ``regulation`` at least for a factor in ``sf_list``
    """
    pooled = []
    for factor in sf_list:
        events = union_dataset_function.get_every_events_4_a_sl(cnx, factor, regulation)
        pooled.extend(events)
    return union_dataset_function.washing_events_all(pooled)
def get_exons_values(cnx, sf_list, target_column1, target_column2, regulation):
    """
    Return the values of two target columns for a set of exons, along with a name
    for each exon.

    A target column may be written ``<column>$<nt>``: the part after ``$`` (``None``
    when absent) is forwarded to ``get_interest_values`` with the column name.
    The size of the exon list is printed on stdout.

    :param cnx: (sqlite3 connexion object) allow connexion to sed database
    :param sf_list: (list of string) the list of splicing factor studied. If its
        first item is not a string, ``sf_list`` is used directly as the exon list.
    :param target_column1: (string) the first column of interest
    :param target_column2: (string) the second column of interest
    :param regulation: (list of string) up or down or up + down
    :return: 3 lists :

        * values1 : the values obtained for ``target_column1``
        * values2 : the values obtained for ``target_column2``
        * exon_name : (list of string) ``<gene name>_<exon position>`` for every exon.
          When ``values1`` holds exactly twice as many items as there are exons,
          the names are given twice: first with the suffix ``_upstream``, then
          with the suffix ``_downstream``

    """
    def split_nt(column):
        # "<column>$<nt>" -> (column, nt) ; plain "<column>" -> (column, None)
        if "$" not in column:
            return column, None
        name, nt = column.split("$")
        return name, nt

    target_column1, nt1 = split_nt(target_column1)
    target_column2, nt2 = split_nt(target_column2)

    if not isinstance(sf_list[0], str):
        # Already a list of exons.
        exon_list = sf_list
    else:
        pooled = []
        for factor in sf_list:
            pooled.extend(
                union_dataset_function.get_every_events_4_a_sl(cnx, factor, regulation))
        exon_list = union_dataset_function.washing_events_all(pooled)
    print(len(exon_list))

    exon_name = []
    for exon in exon_list:
        gene = union_dataset_function.get_gene_name(cnx, exon[0])
        exon_name.append("%s_%s" % (gene, exon[1]))

    values1 = get_interest_values(cnx, exon_list, target_column1, nt1)
    values2 = get_interest_values(cnx, exon_list, target_column2, nt2)

    # Two values per exon: label the first half upstream, the second downstream.
    if len(values1) == 2 * len(exon_name):
        upstream = ["%s_upstream" % name for name in exon_name]
        downstream = ["%s_downstream" % name for name in exon_name]
        exon_name = upstream + downstream
    return values1, values2, exon_name