def main():
    """
    Compute the stretch scores of every group of exons and create the \
    corresponding figures and tables.

    Three analyses are run, in this order: ("exon", "down"), \
    ("spliceosome", "down") and ("spliceosome", "up"). For each of them the \
    groups of exons are given by ``initiate_list_of_factor``. The scores of \
    the control group (``exon_type``) are not computed: they are read from \
    the module ``<exon_type>_stretches`` located in the \
    ``control_dictionaries`` folder next to this file.

    Every path (result folder, databases) is derived from the location of \
    this file by replacing ``src/stretch_calculator``.
    """
    exon_class_bp.set_debug(0)
    exon_type = "CCE"
    src_dir = os.path.realpath(os.path.dirname(__file__))
    output = src_dir.replace("src/stretch_calculator",
                             "result/stretch_calculator/")
    file_dir = src_dir.replace("src/stretch_calculator", "result/")
    if not os.path.isdir(output):
        os.mkdir(output)
    # The control dictionaries are plain python modules: their folder must
    # be importable.
    ctrl_dir = src_dir + "/control_dictionaries/"
    sys.path.insert(0, ctrl_dir)
    fasterdb = src_dir.replace("src/stretch_calculator",
                               "data/fasterDB_lite.db")
    seddb = src_dir.replace("src/stretch_calculator", "data/sed.db")
    type_factors = ["exon", "spliceosome", "spliceosome"]
    regulations = ["down", "down", "up"]

    cnx = sqlite3.connect(fasterdb)
    cnx_sed = None
    try:
        cnx_sed = sqlite3.connect(seddb)
        for type_analysis, regulation in zip(type_factors, regulations):
            name_file, list_file = initiate_list_of_factor(
                file_dir, exon_type, type_analysis)
            # One list of values per group of exons, for every stretch
            # definition and every nucleotide.
            dict_stretch_3ss = {
                "X".join(map(str, stretch_data)):
                {nt: [] for nt in config.nt_list}
                for stretch_data in config.stretches
            }
            for j, cur_name in enumerate(name_file):
                if cur_name != exon_type:
                    exon_list = extract_data(cnx, cnx_sed, list_file,
                                             name_file, j, regulation)
                    for stretch_data in config.stretches:
                        st_name = "X".join(map(str, stretch_data))
                        stretch_dic = get_stretch_score_list(
                            exon_list, stretch_data)
                        for nt in config.nt_list:
                            dict_stretch_3ss[st_name][nt].append(
                                stretch_dic[nt])
                else:
                    # Control group: pre-computed values stored as module
                    # attributes named ``stretch_<st_name>``.
                    mod = __import__("%s_stretches" % exon_type)
                    for stretch_data in config.stretches:
                        st_name = "X".join(map(str, stretch_data))
                        ctrl_dic = getattr(mod, "stretch_%s" % st_name)
                        for nt in config.nt_list:
                            dict_stretch_3ss[st_name][nt].append(
                                ctrl_dic[nt])

            for stretch_data in config.stretches:
                st_name = "X".join(map(str, stretch_data))
                for nt in config.nt_list:
                    fig_name = "nb_stretch_%s-%s_%s_nt" % (
                        stretch_data[1], stretch_data[0], nt)
                    create_figure(dict_stretch_3ss[st_name][nt], name_file,
                                  output, regulation, fig_name,
                                  type_analysis)
                    dataframe_creator(dict_stretch_3ss[st_name][nt],
                                      name_file, output, regulation,
                                      fig_name, type_analysis)
    finally:
        # Release the database handles even if an analysis fails.
        if cnx_sed is not None:
            cnx_sed.close()
        cnx.close()
def main_2g(list_file, name_file, exon_type, fasterdb, seddb, output):
    """
    Create the stretch figures and tables (prefixed by ``2.2G_``) for the \
    groups of exons given in ``list_file`` and for the control exons.

    The control group ``exon_type`` is added after the given groups; its \
    values are read from the module ``<exon_type>_stretches`` located in \
    the ``control_dictionaries`` folder next to this file. The lists given \
    by the caller are not modified.

    :param list_file: (list of str) list of exons files in the form \
    of GC_rich_exon file.
    :param name_file: (list of str) the name of each files of exons \
    given in ``list_file``
    :param exon_type: (str) the control exons
    :param fasterdb: (str) path to fasterdb database
    :param seddb: (str) path to sed database
    :param output: (str) folder where the result will be created
    """
    # Work on copies: appending to the lists received as arguments would
    # add the control group once more each time the caller reuses them.
    list_file = list(list_file) + [None]
    name_file = list(name_file) + [exon_type]
    exon_class_bp.set_debug(0)
    ctrl_dir = os.path.realpath(os.path.dirname(__file__)) + \
        "/control_dictionaries/"
    sys.path.insert(0, ctrl_dir)

    type_analysis = "exon"
    regulation = "down"
    # One list of values per group of exons, for every stretch definition
    # and every nucleotide.
    dict_stretch_3ss = {
        "X".join(map(str, stretch_data)): {nt: [] for nt in config.nt_list}
        for stretch_data in config.stretches
    }

    cnx = sqlite3.connect(fasterdb)
    cnx_sed = None
    try:
        cnx_sed = sqlite3.connect(seddb)
        for j, cur_name in enumerate(name_file):
            if cur_name != exon_type:
                exon_list = extract_data(cnx, cnx_sed, list_file, name_file,
                                         j, regulation)
                for stretch_data in config.stretches:
                    st_name = "X".join(map(str, stretch_data))
                    stretch_dic = get_stretch_score_list(exon_list,
                                                         stretch_data)
                    for nt in config.nt_list:
                        dict_stretch_3ss[st_name][nt].append(stretch_dic[nt])
            else:
                # Control group: pre-computed values stored as module
                # attributes named ``stretch_<st_name>``.
                mod = __import__("%s_stretches" % exon_type)
                for stretch_data in config.stretches:
                    st_name = "X".join(map(str, stretch_data))
                    ctrl_dic = getattr(mod, "stretch_%s" % st_name)
                    for nt in config.nt_list:
                        dict_stretch_3ss[st_name][nt].append(ctrl_dic[nt])

        for stretch_data in config.stretches:
            st_name = "X".join(map(str, stretch_data))
            for nt in config.nt_list:
                fig_name = "2.2G_nb_stretch_%s-%s_%s_nt" % (
                    stretch_data[1], stretch_data[0], nt)
                create_figure(dict_stretch_3ss[st_name][nt], name_file,
                              output, regulation, fig_name, type_analysis)
                dataframe_creator(dict_stretch_3ss[st_name][nt], name_file,
                                  output, regulation, fig_name,
                                  type_analysis)
    finally:
        # Release the database handles even if the analysis fails.
        if cnx_sed is not None:
            cnx_sed.close()
        cnx.close()
Esempio n. 3
0
def main(exon_file, name_table, list_sf, sed, fasterdb, output, ss="5'ss"):
    """
    Create a table describing the exons shared by ``exon_file`` and by \
    the exons down-regulated by every factor of ``list_sf``.

    When ``ss`` is ``"5'ss"`` the table is completed by ``computing_mfe``; \
    for any other value the result of ``svm_bp_finder_launcher`` is merged \
    to the table (right join on ``gene_id`` and ``pos``). The table is \
    written, tab separated, in \
    ``<output>/<name_table>_<sf1>_..._commons_down_exons.csv``.

    :param exon_file: (str) a file containing gc/at exons
    :param name_table: (str) the name of the resulting table
    :param list_sf: (List(vtype=str)) list of sf name
    :param sed: (str) path to sed database
    :param fasterdb: (str) path to fasterdb database
    :param output: (str) folder where the output will be created
    :param ss: (str) the splicing site of interest
    """
    sf_names = "_".join([name_table] + list_sf)
    exon_class.set_debug(1)
    exon_class_bp.set_debug(debug=1)
    cnx_sed = sqlite3.connect(sed)
    cnx_fasterdb = None
    try:
        cnx_fasterdb = sqlite3.connect(fasterdb)
        print("Getting exon from file")
        exon_list = [get_exon(exon_file)]
        print("Getting regulated exons")
        for sf in list_sf:
            regulated = [
                [int(v[0]), int(v[1])]
                for v in udf.get_every_events_4_a_sl(cnx_sed, sf, "down")
            ]
            exon_list.append(regulated)
            print("\t%s : %s down-regulated exons" % (sf, len(regulated)))
        # Fold every list of exons into one with get_union_exon.
        new_exon_list = reduce(get_union_exon, exon_list)
        print("Commons exons : %s" % len(new_exon_list))
        print("Getting commons exons data !")
        df = get_exon_data(cnx_sed, new_exon_list, ss)
        if ss == "5'ss":
            noutput = output + "/rnafold_" + sf_names + "_commons_down_exons/"
            print("Computing MFE")
            df = computing_mfe(cnx_fasterdb, df, noutput)
        else:
            # Code to compute number of good branch point
            print("Computing Good branch point")
            nexon_list = df[["gene_name", "gene_id", "pos"]].values
            df2 = svm_bp_finder_launcher(cnx_fasterdb, nexon_list, output)
            print(df2.head())
            print(df.head())
            df = pd.merge(df, df2, how="right", on=["gene_id", "pos"])
        print("Writing results !")
        df.to_csv("%s/%s_commons_down_exons.csv" % (output, sf_names),
                  sep="\t",
                  index=False)
    finally:
        # Release the database handles on success as well as on failure.
        if cnx_fasterdb is not None:
            cnx_fasterdb.close()
        cnx_sed.close()
def main():
    """
    Write ``predicted_branch_points.bed`` in \
    ``result/experimental_branch_point``.

    The exons read from ``result/GC_rich_exons`` and \
    ``result/AT_rich_exons`` and the control exons returned by \
    ``get_ctrl_exons`` are given one by one to \
    ``function_bp.goob_bp_only``. Every line it returns is written in the \
    bed file after adding the name of the group of the exon to its fourth \
    field and the ``chr`` prefix to its first field. A progress counter is \
    displayed on stdout.
    """
    regulation = "down"
    exon_class_bp.set_debug(0)
    # Root of the project: three folders above this file.
    this_file = os.path.abspath(__file__)
    base = os.path.dirname(os.path.dirname(os.path.dirname(this_file)))
    output = base + "/result/experimental_branch_point"
    at_exon_file = base + "/result/AT_rich_exons"
    gc_exon_file = base + "/result/GC_rich_exons"
    fasterdb = base + "/data/fasterDB_lite.db"
    seddb = base + "/data/sed.db"
    cnx = sqlite3.connect(fasterdb)
    cnx_sed = sqlite3.connect(seddb)
    exon_type = "CCE"
    at_exon = read_file(at_exon_file)
    gc_exon = read_file(gc_exon_file)
    exon2remove = []
    for regulated in udf.get_exon_regulated_by_sf(cnx_sed, regulation):
        exon2remove.append(list(map(int, regulated)))
    ctrl_exons = get_ctrl_exons(cnx, exon_type, exon2remove)

    # Exons and their group label, kept in the same order.
    exon_list = gc_exon + at_exon + ctrl_exons
    type_exon = []
    for label, group in (("GC-exons", gc_exon), ("AT-exons", at_exon),
                         ("%s-exons" % exon_type, ctrl_exons)):
        type_exon.extend([label] * len(group))

    tot = len(exon_list)
    count_none = 0
    print("Creating bed of predicted branch points")
    with open("%s/predicted_branch_points.bed" % output, "w") as outf:
        numbered = enumerate(zip(exon_list, type_exon), start=1)
        for count, (coord, name_exon) in numbered:
            exon = exon_class_bp.ExonClass(cnx, str(coord[0]), coord[0],
                                           coord[1])
            _, list_pos = function_bp.goob_bp_only(exon)
            if list_pos is None:
                # Counted and shown in the progress display.
                count_none += 1
            else:
                for line in list_pos:
                    line[3] += "_" + name_exon
                    line[0] = "chr" + str(line[0])
                    outf.write("\t".join(map(str, line)) + "\n")
            sys.stdout.write("%s/%s  (%s)              \r" %
                             (count, tot, count_none))
    cnx.close()
    cnx_sed.close()
def control_dictionaries_creator():
    """
    Create the control dictionaries containing the values corresponding \
    to the score of bp and ppt for every control exons.

    For each control exon type (only ``CCE``) and each size in \
    ``[100, 50, 35, 25]``, the nine lists returned by \
    ``function_bp.bp_ppt_calculator`` are written as python assignments in \
    ``control_dictionaries/<exon_type>_<size>_bp_ppt_score.py`` next to \
    this file. The exons returned by ``get_exon_regulated_by_sf`` for the \
    ``down`` regulation are given to ``get_control_exon_information`` as \
    exons to remove.
    """
    exon_class_bp.set_debug(0)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    fasterdb = dir_path.replace("src/make_control_files_bp_ppt",
                                "data/fasterDB_lite.db")
    seddb = dir_path.replace("src/make_control_files_bp_ppt", "data/sed.db")
    ctrl_dir = dir_path + "/control_dictionaries/"
    exon_type = ["CCE"]
    sizes = [100, 50, 35, 25]

    cnx = sqlite3.connect(fasterdb)
    cnx_sed = None
    try:
        cnx_sed = sqlite3.connect(seddb)
        exon2remove = union_dataset_function.get_exon_regulated_by_sf(
            cnx_sed, "down")
        if not os.path.isdir(ctrl_dir):
            os.mkdir(ctrl_dir)
        for cur_exon_type in exon_type:
            ctrl_exon_list = get_control_exon_information(
                cnx, cur_exon_type, exon2remove)
            print("retrieving upstream intron sequence")
            list_exon = [
                exon_class_bp.ExonClass(cnx, exon[0], exon[1], exon[2])
                for exon in ctrl_exon_list
            ]
            for size in sizes:
                print("calculating bp and ppt score")
                bp_score_list, ppt_score_list, nb_bp_list, \
                    nb_good_bp_list, sequence_list, ag_count_list, \
                    hbound_list, uaa_list, una_list = \
                    function_bp.bp_ppt_calculator(list_exon, size)
                # Name of each variable in the generated module and its
                # value, in the order they are written.
                content = [
                    ("bp_score", bp_score_list),
                    ("ppt_score", ppt_score_list),
                    ("nb_bp", nb_bp_list),
                    ("nb_good_bp", nb_good_bp_list),
                    ("bp_seq", sequence_list),
                    ("ag_count", ag_count_list),
                    ("hbound", hbound_list),
                    ("uaa_count", uaa_list),
                    ("una_count", una_list),
                ]
                file_name = ctrl_dir + cur_exon_type + "_" + str(size) + \
                    "_bp_ppt_score.py"
                # The context manager closes the file even if a write fails.
                with open(file_name, "w") as cur_file:
                    for var_name, values in content:
                        cur_file.write(var_name + "=" + str(values) + "\n")
    finally:
        # Release the database handles on success as well as on failure.
        if cnx_sed is not None:
            cnx_sed.close()
        cnx.close()
"""
Description:
    Create a bed file containing every GC and AT exons and their GC frequency
"""

import sqlite3
import os
import sys
mydir = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, mydir)
import union_dataset_function as udf
from figure_creator import get_exons_list
bp_dir = mydir + "/make_control_files_bp_ppt"
sys.path.insert(0, bp_dir)
import exon_class_bp
exon_class_bp.set_debug(0)
from function_bp import bp_ppt_calculator
mfe_dir = mydir + "/minimum_free_energy"
sys.path.insert(0, mfe_dir)
import exon_class
exon_class.set_debug(0)
from function import mfe_calculator
stretch_dir = mydir + "/stretch_calculator"
sys.path.insert(0, stretch_dir)
from stretch_calculator import stretch_counter


def get_exon_from_file(exon_file):
    """
    Get evey exon stored into ``exon_file``.
Esempio n. 7
0
def main_2efg(list_file, name_file, exon_type, seddb, fasterdb, output):
    """
    Create the branch point figures and tables (prefixed by ``2.1E``, \
    ``2.1F``, ``2.1G``, ``2.2E`` and ``2.2F``) for the groups of exons \
    given in ``list_file`` and for the control exons.

    The control group ``exon_type`` is added after the given groups; its \
    values are read from the modules ``<exon_type>_<size>_bp_ppt_score`` \
    located in the ``control_dictionaries`` folder next to this file. The \
    lists given by the caller are not modified.

    :param list_file: (list of str) list of exons files in the form \
    of GC_rich_exon file.
    :param name_file: (list of str) the name of each files of exons \
    given in ``list_file``
    :param exon_type: (str) the control exons of interest
    :param seddb: (str) path to sed database
    :param fasterdb: (str) path to fasterdb database
    :param output: (str) path where the result will be created
    """
    # Work on copies: appending to the lists received as arguments would
    # add the control group once more each time the caller reuses them.
    list_file = list(list_file) + [None]
    name_file = list(name_file) + [exon_type]
    exon_class_bp.set_debug(0)
    src_dir = os.path.realpath(os.path.dirname(__file__))
    ctrl_output = src_dir.replace("src/make_control_files_bp_ppt",
                                  "result/make_control_files_bp_ppt/")
    if not os.path.isdir(ctrl_output):
        os.mkdir(ctrl_output)
    # The control dictionaries are plain python modules: their folder must
    # be importable.
    ctrl_dir = src_dir + "/control_dictionaries/"
    sys.path.insert(0, ctrl_dir)
    type_analysis = "exon"
    regulation = "down"

    score_names = ("bp_score_list", "ppt_score_list", "nb_bp_list",
                   "gc_weblogo", "nb_good_bp_list", "ag_count", "hbound",
                   "uaa_count", "una_count")
    # For every window size: one list of values per group of exons.
    dict_score_3ss = {
        size: {score: [] for score in score_names}
        for size in [100, 50, 25]
    }

    cnx = sqlite3.connect(fasterdb)
    cnx_sed = None
    try:
        cnx_sed = sqlite3.connect(seddb)
        for i, cur_name in enumerate(name_file):
            if cur_name != exon_type:
                exon_list = extract_data(cnx, cnx_sed, list_file, name_file,
                                         i, regulation)
                for size, scores in dict_score_3ss.items():
                    bp_score_list, ppt_score_list, nb_bp_list, \
                        nb_good_bp_list, bp_seq_list, ag_count_list, \
                        hbound_list, uaa_list, una_list = \
                        get_bp_ppt_score_list(ctrl_output, exon_list,
                                              cur_name, size, regulation)
                    scores["nb_good_bp_list"].append(nb_good_bp_list)
                    if size == 25:
                        web_logo_creator(
                            bp_seq_list, "2.1F_%s_%s_exons_%s_nt" %
                            (cur_name, regulation, size), output)
                    scores["gc_weblogo"].append(
                        get_weblogo_gc_count(bp_seq_list))
                    scores["hbound"].append(hbound_list)
                    scores["una_count"].append(una_list)
            else:
                # Control group: pre-computed values read from one module
                # per window size.
                for size, scores in dict_score_3ss.items():
                    mod = __import__("%s_%s_bp_ppt_score" %
                                     (exon_type, size))
                    scores["nb_good_bp_list"].append(mod.nb_good_bp)
                    scores["gc_weblogo"].append(
                        get_weblogo_gc_count(mod.bp_seq))
                    scores["hbound"].append(mod.hbound)
                    scores["una_count"].append(mod.una_count)

        # Each window size is used by a different set of figures.
        for size, scores in dict_score_3ss.items():
            if size == 25:
                hbound_name = "2.2E_nb_h_bound_%s_nt" % size
                create_figure(scores["hbound"], name_file, output,
                              regulation, hbound_name, type_analysis,
                              "violin")
                dataframe_creator(scores["hbound"], name_file, output,
                                  regulation, hbound_name, type_analysis)
                weblogo_name = "2.2F_gc_weblogo_%s_nt" % size
                create_figure(scores["gc_weblogo"], name_file, output,
                              regulation, weblogo_name, type_analysis,
                              "box")
                dataframe_creator(scores["gc_weblogo"], name_file, output,
                                  regulation, weblogo_name, type_analysis)
            elif size == 50:
                una_name = "2.1G_UNA_count(%snt)" % size
                create_figure(scores["una_count"], name_file, output,
                              regulation, una_name, type_analysis, "box")
                dataframe_creator(scores["una_count"], name_file, output,
                                  regulation, una_name, type_analysis)
            elif size == 100:
                good_bp_name = "2.1E_prop_nb_good_branch_point_(%snt)" % size
                create_barplot(scores["nb_good_bp_list"], name_file, output,
                               regulation, good_bp_name, type_analysis)
                write_proportion_pvalues(scores["nb_good_bp_list"],
                                         name_file, output, regulation,
                                         good_bp_name, type_analysis)
    finally:
        # Release the database handles on success as well as on failure.
        if cnx_sed is not None:
            cnx_sed.close()
        cnx.close()
Esempio n. 8
0
def main():
    """
    Create the branch point figures and tables of every analysis in \
    ``result/bp_ppt_score/``.

    Three analyses are run, in this order: ("exon", "down"), \
    ("spliceosome", "down") and ("spliceosome", "up"). For each of them \
    the groups of exons are given by ``initiate_list_of_factor`` and the \
    values are collected for the window sizes 100, 50, 35 and 25. The \
    values of the control group (``exon_type``) are read from the modules \
    ``<exon_type>_<size>_bp_ppt_score`` located in the \
    ``control_dictionaries`` folder next to this file.

    Every path (result folders, databases) is derived from the location \
    of this file by replacing ``src/make_control_files_bp_ppt``.
    """
    exon_class_bp.set_debug(0)
    exon_type = "CCE"
    ctrl_output = os.path.realpath(os.path.dirname(__file__)).replace(
        "src/make_control_files_bp_ppt", "result/make_control_files_bp_ppt/")
    output = os.path.realpath(os.path.dirname(__file__)).replace(
        "src/make_control_files_bp_ppt", "result/bp_ppt_score/")
    file_dir = os.path.realpath(os.path.dirname(__file__)).replace(
        "src/make_control_files_bp_ppt", "result/")
    if not os.path.isdir(ctrl_output):
        os.mkdir(ctrl_output)
    if not os.path.isdir(output):
        os.mkdir(output)
    # The control dictionaries are imported by name below: their folder
    # must be on sys.path.
    ctrl_dir = os.path.realpath(
        os.path.dirname(__file__)) + "/control_dictionaries/"
    sys.path.insert(0, ctrl_dir)
    fasterdb = os.path.realpath(os.path.dirname(__file__)).replace(
        "src/make_control_files_bp_ppt", "data/fasterDB_lite.db")
    seddb = os.path.realpath(os.path.dirname(__file__)).replace(
        "src/make_control_files_bp_ppt", "data/sed.db")
    cnx = sqlite3.connect(fasterdb)
    cnx_sed = sqlite3.connect(seddb)
    # Parallel lists: analysis j is (type_factors[j], regulations[j]).
    type_factors = ["exon", "spliceosome", "spliceosome"]
    regulations = ["down", "down", "up"]

    for j in range(len(type_factors)):
        type_analysis = type_factors[j]
        regulation = regulations[j]
        name_file, list_file = initiate_list_of_factor(file_dir, exon_type,
                                                       type_analysis)
        # For every window size: one list of values per group of exons.
        # Only nb_good_bp_list, gc_weblogo, hbound and una_count are filled
        # below; the other keys stay empty.
        dict_score_3ss = {
            k: {
                "bp_score_list": [],
                "ppt_score_list": [],
                "nb_bp_list": [],
                "gc_weblogo": [],
                "nb_good_bp_list": [],
                "ag_count": [],
                "hbound": [],
                "uaa_count": [],
                "una_count": []
            }
            for k in [100, 50, 35, 25]
        }
        # list_force_acceptor = []
        # list_force_donor = []
        for i in range(len(name_file)):
            if name_file[i] != exon_type:
                exon_list = extract_data(cnx, cnx_sed, list_file, name_file, i,
                                         regulation)
                # list_force_acceptor.append(get_redundant_list_of_value(cnx_sed, exon_list, "force_acceptor"))
                # list_force_donor.append(get_redundant_list_of_value(cnx_sed, exon_list, "force_donor"))
                for size in dict_score_3ss.keys():
                    bp_score_list, ppt_score_list, nb_bp_list, nb_good_bp_list, bp_seq_list, ag_count_list,\
                        hbound_list, uaa_list, una_list = \
                        get_bp_ppt_score_list(ctrl_output, exon_list, name_file[i], size, regulation)
                    # dict_score_3ss[size]["bp_score_list"].append(bp_score_list)
                    # dict_score_3ss[size]["ppt_score_list"].append(ppt_score_list)
                    # dict_score_3ss[size]["nb_bp_list"].append(nb_bp_list)
                    dict_score_3ss[size]["nb_good_bp_list"].append(
                        nb_good_bp_list)
                    web_logo_creator(
                        bp_seq_list,
                        "%s_%s_exons_%s_nt" % (name_file[i], regulation, size),
                        output)
                    dict_score_3ss[size]["gc_weblogo"].append(
                        get_weblogo_gc_count(bp_seq_list))
                    # dict_score_3ss[size]["ag_count"].append(ag_count_list)
                    dict_score_3ss[size]["hbound"].append(hbound_list)
                    # dict_score_3ss[size]["uaa_count"].append(uaa_list)
                    dict_score_3ss[size]["una_count"].append(una_list)

            else:
                # Control group: the values are not computed here but read
                # from one module per window size.
                for size in dict_score_3ss.keys():
                    mod = __import__("%s_%s_bp_ppt_score" % (exon_type, size))
                    # dict_score_3ss[size]["bp_score_list"].append(mod.bp_score)
                    # dict_score_3ss[size]["ppt_score_list"].append(mod.ppt_score)
                    # dict_score_3ss[size]["nb_bp_list"].append(mod.nb_bp)
                    dict_score_3ss[size]["nb_good_bp_list"].append(
                        mod.nb_good_bp)
                    web_logo_creator(
                        mod.bp_seq,
                        "%s_%s_exons_%s_nt" % (name_file[i], regulation, size),
                        output)
                    dict_score_3ss[size]["gc_weblogo"].append(
                        get_weblogo_gc_count(mod.bp_seq))
                    # dict_score_3ss[size]["ag_count"].append(mod.ag_count)
                    dict_score_3ss[size]["hbound"].append(mod.hbound)
                    # dict_score_3ss[size]["uaa_count"].append(mod.uaa_count)
                    dict_score_3ss[size]["una_count"].append(mod.una_count)
                # list_force_acceptor.append(get_control_exon_information(cnx_sed, exon_type, "force_acceptor"))
                # list_force_donor.append(get_control_exon_information(cnx_sed, exon_type, "force_donor"))

        # One figure and one table per collected value and window size.
        for size in dict_score_3ss.keys():
            print("------------> %s nt " % size)
            create_barplot(dict_score_3ss[size]["nb_good_bp_list"], name_file,
                           output, regulation,
                           "prop_nb_good_branch_point_(%snt)" % size,
                           type_analysis)
            # create_distplot_bp(dict_score_3ss[size]["nb_good_bp_list"], name_file, output, regulation,
            #                    "prop_nb_good_branch_point_(%snt)" % size, type_analysis)
            # write_proportion_pvalues(dict_score_3ss[size]["nb_good_bp_list"], name_file, output, regulation,
            #                          "prop_nb_good_branch_point_(%snt)" % size, type_analysis)
            dataframe_creator(dict_score_3ss[size]["nb_good_bp_list"],
                              name_file, output, regulation,
                              "prop_nb_good_branch_point_(%snt)" % size,
                              type_analysis)
            # create_barplot(dict_score_3ss[size]["ag_count"], name_file, output, regulation,
            #                "AG_count_downstream_bp(%snt)" % size, type_analysis)
            # write_proportion_pvalues(dict_score_3ss[size]["ag_count"], name_file, output, regulation,
            #                          "AG_count_downstream_bp(%snt)" % size, type_analysis)

            # create_barplot(dict_score_3ss[size]["uaa_count"], name_file, output, regulation,
            #                "UAA_count(%snt)" % size, type_analysis)
            # write_proportion_pvalues(dict_score_3ss[size]["uaa_count"], name_file, output, regulation,
            #                          "UAA_count(%snt)" % size, type_analysis)
            create_figure(dict_score_3ss[size]["una_count"], name_file, output,
                          regulation, "UNA_count(%snt)" % size, type_analysis,
                          "box")
            dataframe_creator(dict_score_3ss[size]["una_count"], name_file,
                              output, regulation, "UNA_count(%snt)" % size,
                              type_analysis)
            # write_proportion_pvalues(dict_score_3ss[size]["una_count"], name_file, output, regulation,
            #                          "UNA_count(%snt)" % size, type_analysis)
            create_figure(dict_score_3ss[size]["hbound"], name_file, output,
                          regulation, "nb_h_bound_%s_nt" % size, type_analysis,
                          "violin")
            dataframe_creator(dict_score_3ss[size]["hbound"], name_file,
                              output, regulation, "nb_h_bound_%s_nt" % size,
                              type_analysis)

            create_figure(dict_score_3ss[size]["gc_weblogo"], name_file,
                          output, regulation, "gc_weblogo_%s_nt" % size,
                          type_analysis, "box")
            dataframe_creator(dict_score_3ss[size]["gc_weblogo"], name_file,
                              output, regulation, "gc_weblogo_%s_nt" % size,
                              type_analysis)