コード例 #1
0
def extract_data(cnx, cnx_sed, list_files, list_names, pos, regulation):
    """
    Build the exon list of the exon set located at index ``pos``.

    When ``list_files`` is not empty, the exons are read from the file
    ``list_files[pos]``. Otherwise ``list_names[pos]`` must be "U1-factors"
    or "U2-factors": the events of every factor of that group are retrieved
    from ``cnx_sed``, cleaned with ``washing_events_all`` and turned into
    exon objects.

    :param cnx: (sqlite3 connect object) connection to fasterDB lite
    :param cnx_sed: (sqlite3 connect object) connection to sed
    :param list_files: (list of string) list of files containing exon set
    :param list_names: (list of string) the name of exon set
    :param pos: (int) the position of interest within the lists
        ``list_files`` and ``list_names``. Those 2 lists must have the
        same length
    :param regulation: (string) up or down
    :return: (list of ExonClass object) list of exon.
    """
    if not list_files:
        # No file given: the exon set is defined by a group of factors.
        factors_by_group = {
            "U1-factors": ["SNRPC", "SNRNP70", "DDX5_DDX17"],
            "U2-factors": ["U2AF2", "SF1", "SF3A3", "SF3B4"]
        }
        events = []
        for factor in factors_by_group[list_names[pos]]:
            events.extend(
                union_dataset_function.get_every_events_4_a_sl(
                    cnx_sed, factor, regulation))
        events = union_dataset_function.washing_events_all(events)
        exon_list = []
        for event in events:
            gene_id, exon_pos = event[0], event[1]
            exon_list.append(
                exon_class_bp.ExonClass(cnx, str(gene_id), int(gene_id),
                                        int(exon_pos)))
    else:
        exon_list = extract_exon_files(cnx, list_files[pos])
    summary = (list_names[pos], len(exon_list), regulation)
    print("%s : %s %s exons" % summary)
    return exon_list
コード例 #2
0
def extract_exon_files(cnx, filename):
    """
    Read a tab-separated exon file and build one exon object per line.

    The first two columns of every non-blank line must be integers; they
    are given to ``exon_class_bp.ExonClass`` (presumably the gene id and
    the exon position within the gene - confirm against ExonClass).
    Blank lines, including trailing ones, are skipped instead of failing
    on ``int("")``.

    :param cnx: (sqlite3 connect object) connection to fasterDB lite
    :param filename: (string) the name of a file containing exons
    :return: (list of Exonclass object) list of exons
    """
    exon_list = []
    with open(filename, "r") as infile:
        for line in infile:
            if not line.strip():
                # Nothing to parse on an empty line.
                continue
            fields = line.replace("\n", "").split("\t")
            exon_list.append(
                exon_class_bp.ExonClass(cnx, str(fields[0]), int(fields[0]),
                                        int(fields[1])))
    return exon_list
コード例 #3
0
def svm_bp_finder_launcher(cnx, exon_list, output):
    """
    Run the branch point finder on every exon of ``exon_list`` and merge \
    the per-exon results into a single table.

    :param cnx: (sqlite3 connect object) connection to fasterDB database
    :param exon_list: (list of list of 1 str 2 int) list of exons.
    :param output: (str) folder given to ``function_bp.fasta_writer``, \
    where the input files will be created
    :return: (pandas dataframe) the dataframes returned for each exon, \
    concatenated with a new index
    """

    def _exon_table(exon_data):
        # One exon: build the object, write its fasta input, run the finder.
        exon_obj = exon_class_bp.ExonClass(cnx, exon_data[0], exon_data[1],
                                           exon_data[2])
        fasta = function_bp.fasta_writer(exon_obj, output, 100)
        return run_svs_bp_finder(fasta, exon_data[1], exon_data[2],
                                 exon_obj.upstream_intron.sequence_proxi)

    tables = [_exon_table(exon_data) for exon_data in exon_list]
    return pd.concat(tables, ignore_index=True)
コード例 #4
0
def main():
    """
    Write a BED file of the predicted branch points of GC, AT and \
    control exons.

    The GC-rich and AT-rich exons are read from the ``result`` folder, the
    control exons ("CCE") are obtained with ``get_ctrl_exons`` after
    removing the exons returned by ``udf.get_exon_regulated_by_sf`` for the
    "down" regulation. For every exon, each position returned by
    ``function_bp.goob_bp_only`` is written as one tab-separated line in
    ``result/experimental_branch_point/predicted_branch_points.bed``, with
    "chr" prepended to the first column and the exon group appended to the
    fourth one.

    Both database connections are closed even if an error occurs.
    """
    # Local import: keeps this block self-contained.
    from contextlib import closing

    regulation = "down"
    exon_type = "CCE"
    exon_class_bp.set_debug(0)
    base = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    output = base + "/result/experimental_branch_point"
    at_exon_file = base + "/result/AT_rich_exons"
    gc_exon_file = base + "/result/GC_rich_exons"
    fasterdb = base + "/data/fasterDB_lite.db"
    seddb = base + "/data/sed.db"
    with closing(sqlite3.connect(fasterdb)) as cnx, \
            closing(sqlite3.connect(seddb)) as cnx_sed:
        at_exon = read_file(at_exon_file)
        gc_exon = read_file(gc_exon_file)
        exon2remove = [
            list(map(int, exon))
            for exon in udf.get_exon_regulated_by_sf(cnx_sed, regulation)
        ]
        ctrl_exons = get_ctrl_exons(cnx, exon_type, exon2remove)
        exon_list = gc_exon + at_exon + ctrl_exons
        # One group label per exon, in the same order as ``exon_list``.
        type_exon = ["GC-exons"] * len(gc_exon) + \
                    ["AT-exons"] * len(at_exon) + \
                    ["%s-exons" % exon_type] * len(ctrl_exons)
        tot = len(exon_list)
        count_none = 0
        print("Creating bed of predicted branch points")
        with open("%s/predicted_branch_points.bed" % output, "w") as outf:
            labelled_exons = zip(exon_list, type_exon)
            for count, (exon, name_exon) in enumerate(labelled_exons, 1):
                exon = exon_class_bp.ExonClass(cnx, str(exon[0]), exon[0],
                                               exon[1])
                _, list_pos = function_bp.goob_bp_only(exon)
                if list_pos is not None:
                    for line in list_pos:
                        line[3] += "_" + name_exon
                        line[0] = "chr" + str(line[0])
                        outf.write("\t".join(map(str, line)) + "\n")
                else:
                    # Exon for which no position list was returned.
                    count_none += 1
                sys.stdout.write("%s/%s  (%s)              \r" %
                                 (count, tot, count_none))
                # The progress line has no newline: flush it explicitly.
                sys.stdout.flush()
コード例 #5
0
def control_dictionaries_creator():
    """
    Create the control files containing the values corresponding to the \
    score of bp and ppt for every control exons.

    For each exon type (only "CCE") and each size in (100, 50, 35, 25), the
    values returned by ``function_bp.bp_ppt_calculator`` are written to
    ``control_dictionaries/<exon_type>_<size>_bp_ppt_score.py`` next to this
    script, one ``name=value`` assignment per line. The folder is created
    when missing.

    The database connections and the output files are closed even if an
    error occurs.
    """
    # Local import: keeps this block self-contained.
    from contextlib import closing

    exon_class_bp.set_debug(0)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    # The databases are located from the script folder by path substitution.
    fasterdb = dir_path.replace("src/make_control_files_bp_ppt",
                                "data/fasterDB_lite.db")
    seddb = dir_path.replace("src/make_control_files_bp_ppt", "data/sed.db")
    ctrl_dir = dir_path + "/control_dictionaries/"
    exon_type = ["CCE"]
    sizes = [100, 50, 35, 25]
    with closing(sqlite3.connect(fasterdb)) as cnx, \
            closing(sqlite3.connect(seddb)) as cnx_sed:
        exon2remove = union_dataset_function.get_exon_regulated_by_sf(
            cnx_sed, "down")
        if not os.path.isdir(ctrl_dir):
            os.mkdir(ctrl_dir)
        for cur_exon_type in exon_type:
            ctrl_exon_list = get_control_exon_information(
                cnx, cur_exon_type, exon2remove)
            print("retrieving upstream intron sequence")
            list_exon = [
                exon_class_bp.ExonClass(cnx, exon[0], exon[1], exon[2])
                for exon in ctrl_exon_list
            ]
            for size in sizes:
                print("calculating bp and ppt score")
                (bp_score_list, ppt_score_list, nb_bp_list, nb_good_bp_list,
                 sequence_list, ag_count_list, hbound_list, uaa_list,
                 una_list) = function_bp.bp_ppt_calculator(list_exon, size)
                # Variable name written in the file -> values to store.
                variables = (
                    ("bp_score", bp_score_list),
                    ("ppt_score", ppt_score_list),
                    ("nb_bp", nb_bp_list),
                    ("nb_good_bp", nb_good_bp_list),
                    ("bp_seq", sequence_list),
                    ("ag_count", ag_count_list),
                    ("hbound", hbound_list),
                    ("uaa_count", uaa_list),
                    ("una_count", una_list),
                )
                file_name = ctrl_dir + cur_exon_type + "_" + str(size) + \
                    "_bp_ppt_score.py"
                with open(file_name, "w") as cur_file:
                    for name, values in variables:
                        cur_file.write(name + "=" + str(values) + "\n")
コード例 #6
0
def _exon_bed_row(cnx, exon_key, db_row, u1_exons, u2_exons):
    """
    Build the output row of one exon.

    :param cnx: (sqlite3 connect object) connexion to fasterdb
    :param exon_key: (sequence of 2 values) gene id and exon position
    :param db_row: (sequence of 7 values) chromosome, start, end, strand, \
    iupac_exon, upstream intron size and downstream intron size, as \
    selected by the queries of ``get_exon_info``
    :param u1_exons: (list of list of 2 int) list of exons regulated by U1
    :param u2_exons: (list of list of 2 int) list of exons regulated by U2
    :return: (list of 7 values) chromosome, start - 1, end, \
    "<gene id>_<exon position>", "0", strand symbol and the stringified \
    dictionary of exon features
    """
    strand_symbol = {-1: "-", 1: "+"}
    gene_id, exon_pos = exon_key[0], exon_key[1]
    row = list(db_row)
    # Presumably converts the 1-based start to a 0-based BED start.
    row[1] = int(row[1]) - 1
    cexon = exon_class_bp.ExonClass(cnx, str(gene_id), gene_id, exon_pos)
    exon_data = bp_ppt_calculator([cexon])
    mexon = exon_class.ExonClass(cnx, str(gene_id), gene_id, exon_pos)
    mfe_5ss, mfe_3ss = mfe_calculator([mexon])
    stretch = catch_index_error(stretch_counter([cexon])["T"], 0)
    dic_info = {
        "GC_content": row[4].split(";")[4],
        "upstream_intron_size": row[5],
        "downstream_intron_size": row[6],
        "UNA_count": catch_index_error(exon_data[8], 0),
        "Hbound_count": catch_index_error(exon_data[6], 0),
        "good_bp": catch_index_error(exon_data[3], 0),
        "MFE_5SS": catch_index_error(mfe_5ss, 0),
        "MFE_3SS": catch_index_error(mfe_3ss, 0),
        "T_stretch": stretch,
        "U1-regulated": is_in(exon_key, u1_exons),
        "U2-regulated": is_in(exon_key, u2_exons),
    }
    return row[0:3] + ["%s_%s" % (gene_id, exon_pos)] + \
        ["0", strand_symbol[row[3]]] + [str(dic_info)]


def get_exon_info(cnx, sedb, fasterdb_file, exon_list, u1_exons, u2_exons):
    """
    Get the location and the features of exons.

    When ``exon_list`` is None, every exon whose ``exon_type`` contains
    "CCE" in the sed database is returned. Otherwise one row is built for
    each exon of ``exon_list`` and the progress is displayed on stdout.

    The two databases are attached to ``cnx`` under the names ``sed`` and
    ``fasterdb`` and are left attached when the function returns.
    NOTE(review): a second call on the same connection would presumably
    fail because those names are already in use - confirm with callers.

    :param cnx: (sqlite3 connect object) connexion to fasterdb
    :param sedb: (str) path to sed database
    :param fasterdb_file: (str) an sqlite3 database file
    :param exon_list:  (list of 2 int) list of exons
    :param u1_exons: (list of list of 2 int) list of exons regulated by U1
    :param u2_exons: (list of list of 2 int) list of exons regulated by U2
    :return: (list of list of value) list of data
    :raises IndexError: if an exon of ``exon_list`` does not match exactly \
    one row in the databases
    """
    cursor = cnx.cursor()
    cursor.execute("ATTACH DATABASE ? as sed", (sedb, ))
    cursor.execute("ATTACH DATABASE ? as fasterdb", (fasterdb_file, ))
    if exon_list is None:
        query = """
                SELECT t1.id_gene, t1.pos_on_gene, t1.chromosome,
                       t1.start_on_chromosome, t1.end_on_chromosome,
                       t2.strand, t3.iupac_exon, t3.upstream_intron_size,
                       t3.downstream_intron_size
                FROM fasterdb.exons as t1, fasterdb.genes as t2, sed.sed as t3
                WHERE t3.gene_id =  t1.id_gene
                AND   t3.exon_pos = t1.pos_on_gene
                AND   t1.id_gene = t2.id
                AND   t3.exon_type LIKE '%CCE%'
                """
        cursor.execute(query)
        # The first two columns identify the exon, the others describe it.
        # The key is a list so that it compares like the original slices.
        return [
            _exon_bed_row(cnx, list(exon[0:2]), exon[2:], u1_exons, u2_exons)
            for exon in cursor.fetchall()
        ]
    # Placeholders let sqlite3 bind the values instead of formatting them
    # into the SQL text.
    query = """
            SELECT t1.chromosome, t1.start_on_chromosome,
                   t1.end_on_chromosome, t2.strand, t3.iupac_exon,
                   t3.upstream_intron_size, t3.downstream_intron_size
            FROM fasterdb.exons as t1, fasterdb.genes as t2, sed.sed as t3
            WHERE t3.gene_id =  t1.id_gene
            AND   t3.exon_pos = t1.pos_on_gene
            AND   t1.id_gene = t2.id
            AND   t3.gene_id = ?
            AND   t3.exon_pos = ?
            """
    tot = len(exon_list)
    result = []
    for count, exon in enumerate(exon_list, 1):
        # int() keeps the numeric comparison of the original query and
        # accepts integer-like values that sqlite3 cannot bind directly.
        cursor.execute(query, (int(exon[0]), int(exon[1])))
        res = cursor.fetchall()
        if len(res) != 1:
            raise IndexError(
                "Error only one row should be returned for %s (%s found)" %
                (exon, len(res)))
        result.append(
            _exon_bed_row(cnx, exon[0:2], res[0], u1_exons, u2_exons))
        sys.stdout.write("Processing %s/%s\t\t\t\r" % (count, tot))
        sys.stdout.flush()
    return result