Ejemplo n.º 1
0
def main():
    """Run the guide-reference pipeline from input files to exported rows.

    Steps, in order:
      1. Append the strings "1" through "29" to the module-level
         FILE_NAME_LIST.
      2. Read the description data for those names through
         LogicPrep.LogicsPrep.read_dat_file_for_description.
      3. Read the CDS file into a dictionary through
         LogicPrep.LogicsPrep.read_gtf_file_by_line_to_dict.
      4. Pass the CDS dictionary to Logic.Logics.get_guide_ref.
    """
    prep = LogicPrep.LogicsPrep()
    guide_logic = Logic.Logics()

    # range(1, 30) ends at 29.  The list is extended in place, so calling
    # main() more than once adds the same names again.
    FILE_NAME_LIST.extend(str(file_no) for file_no in range(1, 30))

    descriptions = prep.read_dat_file_for_description(
        WORK_DIR + CDS_EACH_FNAME, FILE_NAME_LIST)

    # The reader returns a pair; only the first element is used here.
    cds_by_key, _ = prep.read_gtf_file_by_line_to_dict(
        WORK_DIR + CDS_FNAME, descriptions)

    guide_logic.get_guide_ref(cds_by_key, WORK_DIR + SEQ_FNAME, INITIAL_SEQ)
Ejemplo n.º 2
0
def merge_off_target_total():
    """Merge off-target data for every input file into one combined output.

    Appends "1" through "29" to the module-level FILE_NAME_LIST, then for each
    name in that list reads the matching sheet, merges it with the off-target
    dictionary loaded from INITIAL_MRGE_OFF_TRGT, and applies every
    combination of CAS_SCORE_OPT and FILTER_OUT_OPT through
    Logic.Logics.filter_out_by_rule.  The filtered result accumulates across
    all files and is written once under the label "total", followed by the two
    tab-separated text outputs.
    """
    utils = Util.Utils()
    prep = LogicPrep.LogicsPrep()
    rules = Logic.Logics()

    known_off_targets = utils.read_txt_to_dict(INITIAL_MRGE_OFF_TRGT)

    # The list is extended in place, so repeated calls add duplicates.
    FILE_NAME_LIST.extend(str(file_no) for file_no in range(1, 30))

    # Every input and output path shares this prefix.
    excel_base = WORK_DIR + CAS_OFF_EXCEL

    merged = {}
    filtered = {}
    for sheet in FILE_NAME_LIST:
        print("starting with file [%s]" % (sheet,))
        frame = utils.read_excel_2_dataframe(excel_base, sheet)

        merged = prep.merge_excel_n_off_trgt(frame, known_off_targets)

        # The filtered result is threaded through every rule combination.
        for score_opt in CAS_SCORE_OPT:
            for off_target_opt in FILTER_OUT_OPT:
                filtered = rules.filter_out_by_rule(
                    score_opt, off_target_opt, MAX_SEQ_NUM, merged, filtered)

    seq_counts = utils.make_excel_w_off_trgt(
        excel_base + "off_trgt_", "total", filtered, {})

    # NOTE(review): `merged` holds only the last file's merge at this point,
    # while `filtered` covers every file - confirm this is intended.
    retry_off_targets = rules.get_re_off_target_seq(
        MAX_SEQ_NUM, seq_counts, merged, {})

    utils.make_tab_txt_seq_cnt_group_by_crpt_id(
        excel_base + "seq_cnt_group_by_crpt_id_", seq_counts)

    utils.make_tab_txt_re_off_target_seq(
        excel_base + "re_off_target_seq_", retry_off_targets)
    def merge_off_target_indi(self, init_off_trgt, f_nm_list, wrk_dir,
                              excel_path, max_num, filt_out_opt):
        """Merge and filter off-target data one input file at a time.

        Args:
            init_off_trgt: Path handed to Util.Utils.read_txt_to_dict to load
                the off-target dictionary.
            f_nm_list: Iterable of file/sheet identifiers to process.
            wrk_dir: Prefix joined with ``excel_path`` using ``+``.
            excel_path: Path component shared by the input and all outputs.
            max_num: Limit forwarded to filter_out_by_rule and
                get_re_off_target_seq.
            filt_out_opt: Indexable pair; element 0 is iterated as the
                off-target options and element 1 as the cas score options.

        A filtered workbook is written for each file.  The sequence counts and
        the re-off-target dictionary carry over from file to file and are
        written as tab-separated text after the last file.
        """
        utils = Util.Utils()
        prep = LogicPrep.LogicsPrep()

        off_target_rules = filt_out_opt[0]
        cas_score_rules = filt_out_opt[1]

        known_off_targets = utils.read_txt_to_dict(init_off_trgt)

        # Every input and output path shares this prefix.
        excel_base = wrk_dir + excel_path

        seq_counts = {}
        retry_off_targets = {}
        for sheet in f_nm_list:
            print("starting with file [%s]" % (sheet,))
            frame = utils.read_excel_2_dataframe(excel_base, sheet)

            merged = prep.merge_excel_n_off_trgt(frame, known_off_targets)

            # The filtered result starts empty for each file and is threaded
            # through every rule combination.
            filtered = {}
            for score_rule in cas_score_rules:
                for off_target_rule in off_target_rules:
                    filtered = self.filter_out_by_rule(
                        score_rule, off_target_rule, max_num, merged,
                        filtered)

            seq_counts = utils.make_excel_w_off_trgt(
                excel_base + "off_trgt_", sheet, filtered, seq_counts)

            retry_off_targets = self.get_re_off_target_seq(
                max_num, seq_counts, merged, retry_off_targets)
            print("done with file [%s]\n" % (sheet,))

        utils.make_tab_txt_seq_cnt_group_by_crpt_id(
            excel_base + "seq_cnt_group_by_crpt_id_", seq_counts)

        utils.make_tab_txt_re_off_target_seq(
            excel_base + "re_off_target_seq_", retry_off_targets)
    def get_guide_ref(self, cds_dict, path, init):
        """Collect sequence hits that fall inside CDS ranges and export them.

        For each key of ``cds_dict`` the two position lookups are loaded with
        ``LogicPrep.LogicsPrep.read_seq_dict(path, key, init)``.  Every entry
        under that key that has a 'CDS' list is scanned.  A CDS item is a
        space-separated string: the first two fields are integer bounds and
        the third is stored as 'Exon Number'.  Each position from
        ``start + 1`` through ``end`` that appears in a lookup yields one row.
        The rows for a key are written with ``Util.Utils.make_excel``.

        Args:
            cds_dict: Mapping of key -> mapping of entries (dicts).
            path: Forwarded to ``read_seq_dict``.
            init: Forwarded to ``read_seq_dict`` and ``make_excel``.

        Row numbers keep increasing across keys; only the row container is
        reset for each key.
        """
        util = Util.Utils()
        logic_pre = LogicPrep.LogicsPrep()

        def build_row(info, pos, exon_fields, low, high, covered_len,
                      cds_total, context, hit_strand):
            # Assemble one output row; key insertion order is kept on purpose.
            row = {
                'Target gene name': info['Target gene name'],
                'Ensembl transcript ID': info['Ensembl transcript ID'],
                'Ensembl Gene ID': info['Ensembl Gene ID'],
            }
            if 'Description' in info:
                row['Description'] = info['Description']
            row['Position of Base After cut'] = pos
            if hit_strand == '+':
                row['Target context sequence'] = context
            else:
                # Minus-lookup values hold two space-separated sequences.
                pieces = context.split(" ")
                row['Target context sequence'] = pieces[0]
                row['Target context anti sequence'] = pieces[1]
            row['Strand'] = hit_strand
            row['Exon Number'] = exon_fields[2]
            # The ratio follows the entry's own strand, not the hit's strand.
            if '+' == info['Strand']:
                remainder = high - pos + 1
            else:
                remainder = pos - low + 1
            row['Ratio'] = (covered_len - remainder) / cds_total
            return row

        row_no = 1
        for seq_key, entries in cds_dict.items():
            rows = {}
            plus_hits, minus_hits = logic_pre.read_seq_dict(
                path, seq_key, init)

            for info in entries.values():
                if 'CDS' not in info:
                    continue
                cds_items = info['CDS']

                # Combined length of all CDS ranges (bounds are inclusive).
                cds_total = sum(
                    int(bounds[1]) - int(bounds[0]) + 1
                    for bounds in (item.split(" ") for item in cds_items))

                # Running length up to and including the current range.
                covered_len = 0
                for item in cds_items:
                    exon_fields = item.split(" ")
                    low = int(exon_fields[0])
                    high = int(exon_fields[1])
                    covered_len += high - low + 1

                    for pos in range(low + 1, high + 1):
                        if pos in plus_hits:
                            rows[row_no] = build_row(
                                info, pos, exon_fields, low, high,
                                covered_len, cds_total, plus_hits[pos], '+')
                            row_no += 1
                        if pos in minus_hits:
                            rows[row_no] = build_row(
                                info, pos, exon_fields, low, high,
                                covered_len, cds_total, minus_hits[pos], '-')
                            row_no += 1

            print("DONE file_" + seq_key)
            util.make_excel(rows, init, seq_key)