Ejemplo n.º 1
0
def main_2d(list_file,
            name_file,
            exon_type,
            output,
            seddb,
            fasterdb,
            fig_nums=("2.1D", "2.2D")):
    """
    Create the figure 2.1D and 2.2D of the article with a given list of exons.

    The control exons (``exon_type``) are added as the last condition. Their
    MFE values are not recomputed: they are read from the module
    ``<exon_type>_mfe`` located in the ``control_dictionaries`` folder next
    to this file (attributes ``mfe_3ss`` and ``mfe_5ss``).

    :param list_file: (list of str) list of exons files in the form
    of GC_rich_exon file. The list is not modified.
    :param name_file: (list of str) the name of each files of exons
    given in ``list_file``. The list is not modified.
    :param exon_type: (str) the control exons
    :param output: (str) folder where the result will be created
    :param seddb: (str) path to sed database
    :param fasterdb: (str) path to fasterdb database
    :param fig_nums: (list of str) list of figure names; the first one is
    used for the 5'ss figure/table, the second one for the 3'ss ones
    :return: None
    """
    from contextlib import closing

    exon_class.set_debug(0)
    # Work on copies: appending directly to the arguments would leak the
    # control entry into the caller's lists (and duplicate it on each call).
    list_file = list(list_file) + [None]
    name_file = list(name_file) + [exon_type]

    script_dir = os.path.realpath(os.path.dirname(__file__))
    ctrl_output = script_dir.replace(
        "src/minimum_free_energy", "result/minimum_free_energy/")
    if not os.path.isdir(ctrl_output):
        # makedirs: the parent "result" folder may not exist yet.
        os.makedirs(ctrl_output)
    ctrl_dir = script_dir + "/control_dictionaries/"
    # Needed so that the control module can be imported by name below.
    sys.path.insert(0, ctrl_dir)
    type_analysis = "exon"

    mfe_3ss_score = []
    mfe_5ss_score = []
    # closing() guarantees both connections are released even on error.
    with closing(sqlite3.connect(fasterdb)) as cnx, \
            closing(sqlite3.connect(seddb)) as cnx_sed:
        for i, cur_name in enumerate(name_file):
            if cur_name != exon_type:
                exon_list = extract_data(cnx, cnx_sed, list_file,
                                         name_file, i)
                mfe_3ss, mfe_5ss = get_mfe_score_list(ctrl_output, exon_list,
                                                      cur_name)
                mfe_3ss_score.append(mfe_3ss)
                mfe_5ss_score.append(mfe_5ss)
            else:
                # Control exons: load the pre-computed values.
                mod = __import__("%s_mfe" % exon_type)
                mfe_3ss_score.append(mod.mfe_3ss)
                mfe_5ss_score.append(mod.mfe_5ss)

    create_figure(mfe_5ss_score, name_file, output, "down", "5SS",
                  type_analysis, fig_nums[0])
    dataframe_creator(mfe_5ss_score, name_file, output, "down", "5SS",
                      type_analysis, fig_nums[0])
    create_figure(mfe_3ss_score, name_file, output, "down", "3SS",
                  type_analysis, fig_nums[1])
    dataframe_creator(mfe_3ss_score, name_file, output, "down", "3SS",
                      type_analysis, fig_nums[1])
Ejemplo n.º 2
0
def control_dictionaries_creator():
    """
    Create the control dictionary containing the minimum free energy (MFE)
    values at the 3'ss and 5'ss of every control exons.

    The values are written in ``control_dictionaries/CCE_mfe.py`` (next to
    this file) as two python variables: ``mfe_3ss`` and ``mfe_5ss``.

    :return: None
    """
    from contextlib import closing

    exon_class.set_debug(0)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    fasterdb = dir_path.replace(
        "src/minimum_free_energy", "data/fasterDB_lite.db")
    seddb = dir_path.replace(
        "src/minimum_free_energy", "data/sed.db")
    ctrl_dir = dir_path + "/control_dictionaries/"
    if not os.path.isdir(ctrl_dir):
        os.mkdir(ctrl_dir)
    exon_type = "CCE"
    # closing() guarantees both connections are released even on error.
    with closing(sqlite3.connect(fasterdb)) as cnx, \
            closing(sqlite3.connect(seddb)) as cnx_sed:
        # Exons passed to get_control_exon_information as ``exon2remove``.
        exon2remove = union_dataset_function.get_exon_regulated_by_sf(
            cnx_sed, "down")
        ctrl_exon_list = get_control_exon_information(cnx, exon_type,
                                                      exon2remove)
        print("retrieving upstream intron sequence")
        list_exon = [
            exon_class.ExonClass(cnx, exon[0], exon[1], exon[2])
            for exon in ctrl_exon_list
        ]
        print("calculating mfe")
        # Kept inside the ``with`` block: the exon objects were built from
        # ``cnx`` and may still need it here.
        mfe_list_3ss, mfe_list_5ss = function.mfe_calculator(list_exon)
    # The ``with`` statement closes the file even if a write fails.
    with open(ctrl_dir + exon_type + "_mfe.py", "w") as cur_file:
        cur_file.write("mfe_3ss=" + str(mfe_list_3ss) + "\n")
        cur_file.write("mfe_5ss=" + str(mfe_list_5ss) + "\n")
def main():
    """
    Create the 3'ss and 5'ss MFE figures and tables for each type of
    factors ("exon" and "spliceosome").

    For each type of analysis, the list of conditions is given by
    ``initiate_list_of_factor``. The values of the control exons ("CCE")
    are read from the ``CCE_mfe`` module located in the
    ``control_dictionaries`` folder next to this file.

    :return: None
    """
    from contextlib import closing

    exon_class.set_debug(0)
    exon_type = "CCE"
    script_dir = os.path.realpath(os.path.dirname(__file__))
    # The control output and the output folder are the same location.
    output = script_dir.replace(
        "src/minimum_free_energy", "result/minimum_free_energy/")
    ctrl_output = output
    file_dir = script_dir.replace(
        "src/minimum_free_energy", "result/")
    if not os.path.isdir(output):
        # makedirs: the parent "result" folder may not exist yet.
        os.makedirs(output)
    ctrl_dir = script_dir + "/control_dictionaries/"
    # Needed so that the control module can be imported by name below.
    sys.path.insert(0, ctrl_dir)
    fasterdb = script_dir.replace(
        "src/minimum_free_energy", "data/fasterDB_lite.db")
    seddb = script_dir.replace(
        "src/minimum_free_energy", "data/sed.db")
    type_factors = ["exon", "spliceosome"]
    # closing() guarantees both connections are released even on error.
    with closing(sqlite3.connect(fasterdb)) as cnx, \
            closing(sqlite3.connect(seddb)) as cnx_sed:
        for type_analysis in type_factors:
            name_file, list_file = initiate_list_of_factor(file_dir, exon_type,
                                                           type_analysis)
            mfe_3ss_score = []
            mfe_5ss_score = []
            for i, cur_name in enumerate(name_file):
                if cur_name != exon_type:
                    exon_list = extract_data(cnx, cnx_sed, list_file,
                                             name_file, i)
                    mfe_3ss, mfe_5ss = get_mfe_score_list(ctrl_output,
                                                          exon_list, cur_name)
                    mfe_3ss_score.append(mfe_3ss)
                    mfe_5ss_score.append(mfe_5ss)
                else:
                    # Control exons: load the pre-computed values.
                    mod = __import__("%s_mfe" % exon_type)
                    mfe_3ss_score.append(mod.mfe_3ss)
                    mfe_5ss_score.append(mod.mfe_5ss)

            create_figure(mfe_3ss_score, name_file, output, "down", "3SS",
                          type_analysis)
            dataframe_creator(mfe_3ss_score, name_file, output, "down", "3SS",
                              type_analysis)
            # create_figure_error_bar(mfe_3ss_score, name_file, output, "down", "3SS", type_analysis)
            # write_proportion_pvalues(mfe_3ss_score, name_file, output, "3SS", type_analysis)
            create_figure(mfe_5ss_score, name_file, output, "down", "5SS",
                          type_analysis)
            dataframe_creator(mfe_5ss_score, name_file, output, "down", "5SS",
                              type_analysis)
Ejemplo n.º 4
0
def main(exon_file, name_table, list_sf, sed, fasterdb, output, ss="5'ss"):
    """
    Create a table showing, for the exons obtained by combining the exons
    of ``exon_file`` with the exons down-regulated by each factor of
    ``list_sf``, their data at the splicing site of interest.

    If ``ss`` is "5'ss", the MFE is computed (RNAfold results are stored in
    a sub-folder of ``output``). For any other value, the branch point data
    returned by ``svm_bp_finder_launcher`` are merged into the table.

    The table is written in
    ``<output>/<name_table>_<sf1>_..._commons_down_exons.csv``
    (tab-separated).

    :param exon_file: (str) a file containing gc/at exons
    :param name_table: (str) the name of the resulting table
    :param list_sf: (List(vtype=str)) list of sf name
    :param sed: (str) path to sed database
    :param fasterdb: (str) path to fasterdb database
    :param output: (str) folder where the output will be created
    :param ss: (str) the splicing site of interest
    :return: None
    """
    from contextlib import closing

    sf_names = "_".join([name_table] + list_sf)
    exon_class.set_debug(1)
    exon_class_bp.set_debug(debug=1)
    # closing() guarantees both connections are released even on error.
    with closing(sqlite3.connect(sed)) as cnx_sed, \
            closing(sqlite3.connect(fasterdb)) as cnx_fasterdb:
        print("Getting exon from file")
        exon_list = [get_exon(exon_file)]
        print("Getting regulated exons")
        for sf in list_sf:
            tmp = udf.get_every_events_4_a_sl(cnx_sed, sf, "down")
            # Only the two first fields of each event are kept, as integers.
            tmp = [[int(v[0]), int(v[1])] for v in tmp]
            exon_list.append(tmp)
            print("\t%s : %s down-regulated exons" % (sf, len(tmp)))
        # Fold every list of exons into a single one.
        new_exon_list = reduce(get_union_exon, exon_list)
        print("Commons exons : %s" % len(new_exon_list))
        print("Getting commons exons data !")
        df = get_exon_data(cnx_sed, new_exon_list, ss)
        if ss == "5'ss":
            noutput = output + "/rnafold_" + sf_names + "_commons_down_exons/"
            print("Computing MFE")
            df = computing_mfe(cnx_fasterdb, df, noutput)
        else:
            # Code to compute number of good branch point
            print("Computing Good branch point")
            nexon_list = df[["gene_name", "gene_id", "pos"]].values
            df2 = svm_bp_finder_launcher(cnx_fasterdb, nexon_list, output)
            print(df2.head())
            print(df.head())
            # Right join: every row of the branch point table is kept.
            df = pd.merge(df, df2, how="right", on=["gene_id", "pos"])
    print("Writing results !")
    df.to_csv("%s/%s_commons_down_exons.csv" % (output, sf_names),
              sep="\t",
              index=False)
Ejemplo n.º 5
0
def control_dictionaries_creator():
    """
    Create the control dictionary containing the values corresponding to
    the score of bp and ppt for every control exons.

    For each size in ``[100, 25, 50, 35]``, one file named
    ``control_dictionaries/CCE_<size>_bp_ppt_score.py`` is written next to
    this file. It defines the python variables ``bp_score``, ``ppt_score``,
    ``nb_bp``, ``nb_good_bp``, ``bp_seq``, ``ag_count`` and ``hbound``.

    :return: None
    """
    from contextlib import closing

    exon_class.set_debug(0)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    fasterdb = dir_path.replace("src", "data/fasterDB_lite.db")
    seddb = dir_path.replace("src", "data/sed.db")
    ctrl_dir = dir_path + "/control_dictionaries/"
    if not os.path.isdir(ctrl_dir):
        os.mkdir(ctrl_dir)
    exon_type = "CCE"
    sizes = [100, 25, 50, 35]
    # closing() guarantees both connections are released even on error.
    with closing(sqlite3.connect(fasterdb)) as cnx, \
            closing(sqlite3.connect(seddb)) as cnx_sed:
        # Exons passed to get_control_exon_information as ``exon2remove``.
        exon2remove = union_dataset_function.get_exon_regulated_by_sf(
            cnx_sed, "down")
        ctrl_exon_list = get_control_exon_information(cnx, exon_type,
                                                      exon2remove, "down")
        for size in sizes:
            print("size : %s" % size)
            print("retrieving upstream intron sequence")
            # NOTE(review): the exon objects are rebuilt for every size;
            # they could be built once if bp_ppt_calculator does not
            # modify them - to confirm before hoisting.
            list_exon = [
                exon_class.ExonClass(cnx, exon[0], exon[1], exon[2])
                for exon in ctrl_exon_list
            ]
            print("calculating bp and ppt score")
            bp_score_list, ppt_score_list, nb_bp_list, nb_good_bp_list, \
                sequence_list, ag_count_list, hbound_list = \
                function.bp_ppt_calculator(list_exon, size)
            # The ``with`` statement closes the file even if a write fails.
            with open(ctrl_dir + exon_type + "_" + str(size) +
                      "_bp_ppt_score.py", "w") as cur_file:
                cur_file.write("bp_score=" + str(bp_score_list) + "\n")
                cur_file.write("ppt_score=" + str(ppt_score_list) + "\n")
                cur_file.write("nb_bp=" + str(nb_bp_list) + "\n")
                cur_file.write("nb_good_bp=" + str(nb_good_bp_list) + "\n")
                cur_file.write("bp_seq=" + str(sequence_list) + "\n")
                cur_file.write("ag_count=" + str(ag_count_list) + "\n")
                cur_file.write("hbound=" + str(hbound_list) + "\n")
def get_exon_info(cnx, info_list, debug):
    """
    Build one ExonClass object per exon described in ``info_list``.

    A progression line is written on the standard output after each exon.

    :param cnx: (sqlite3 connection object) connection to FasterDB lite
    :param info_list: (list of list of string and int and int) each sublist
    contains a string : gene_symbol and 2 int : the gene_id and the exon
    position on gene respectively
    :param debug: (int) 0 no debug, 1 debug mode
    :return: (a list of ExonClass object) list of exons
    """
    print("Getting exons information !")
    exon_class.set_debug(debug)
    total = len(info_list)
    total_str = str(total)
    exons = []
    for done, info in enumerate(info_list, 1):
        exons.append(exon_class.ExonClass(cnx, info[0], info[1], info[2]))
        percent = round(float(done) / total * 100, 1)
        # "\r" sends the cursor back to the start of the line so that the
        # next progression message overwrites this one.
        progress = "".join(["Progression : ", str(done), " / ", total_str,
                            " - ", str(percent), " %\r"])
        sys.stdout.write(progress)
        sys.stdout.flush()
    return exons
Ejemplo n.º 7
0
import function
import function_mfe
import exon_class
import exon_class_mfe
import statistical_analysis
import rpy2.robjects as robj
from rpy2.robjects.packages import importr
import rpy2.robjects.vectors as v
import pandas as pd
# Index associated to each nucleotide symbol.
nt_dic = dict(zip("ACGTSWRY", range(8)))
# Index associated to each dinucleotide.
dnt_dic = {
    "AA": 0, "AC": 1, "AG": 2, "AT": 3,
    "CA": 4, "CC": 5, "CG": 6, "CT": 7,
    "GA": 8, "GC": 9, "GG": 10, "GT": 11,
    "TA": 12, "TC": 13, "TG": 14, "TT": 15,
}
# NOTE(review): presumably the columns that are log-transformed - confirm
# against the code using this list.
log_columns = [
    "nb_intron_gene",
    "downstream_intron_size",
    "upstream_intron_size",
    "median_flanking_intron_size",
    "min_flanking_intron_size",
    "exon_size",
]
# Disable the debug mode of both exon classes.
exon_class.set_debug(0)
exon_class_mfe.set_debug(0)
size_bp_up_seq = 100
# "result/bp_files/" folder located two levels above this file's path.
output_bp = "/".join(
    os.path.realpath(__file__).split("/")[:-2]) + "/result/bp_files/"


# Functions
def connexion(seddb):
    """
    Open a connection to the SED database.

    :param seddb: (string) path to sed database
    :return: (sqlite3 connection object) the connection to sed database
    """
    cnx = sqlite3.connect(seddb)
    return cnx