Example #1
def multi_process():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    brcd_df = util.read_excel_to_df(WORK_DIR + IN + BRCD_FL)
    # key : TTTTTT+barcode, val : Target length
    brcd_dict = logic_prep.make_df_to_dict(brcd_df, 0, 1)
    logic = Logic.Logics(INIT, brcd_dict)

    sources = util.get_files_from_dir(WORK_DIR + FASTQ + "*.fastq")

    for path in sources:
        fastq_list = util.make_fastq_file_to_list(path)

        # divide data_list by MULTI_CNT
        splited_fastq_list = np.array_split(fastq_list, MULTI_CNT)
        print("platform.system() : ", SYSTEM_NM)
        print("total cpu_count : ", str(TOTAL_CPU))
        print("will use : ", str(MULTI_CNT))
        pool = mp.Pool(processes=MULTI_CNT)

        pool_list = pool.map(logic.get_brcd_umi_frequency_from_FASTQ, splited_fastq_list)
        pool.close()  # release the workers before processing the next file

        result_dict, brcd_result_dict = logic_prep.merge_dict_pool_list(pool_list)

        res_list = logic_prep.make_dict_to_list(result_dict, brcd_result_dict)
        sorted_res_list = logic_prep.sort_list_by_ele(res_list, 0)
        header = ["barcode", "#tot_freq_barcode", "umi", "#freq_umi"]
        util.make_tsv(path.replace("FASTQ", "output").replace(".fastq", "_result.txt"), header, sorted_res_list)
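All of the multiprocessing drivers in this listing follow the same split/map/merge fan-out. Below is a minimal, self-contained sketch of that pattern; the count_tokens worker and the dict merge are hypothetical stand-ins for logic.get_brcd_umi_frequency_from_FASTQ and logic_prep.merge_dict_pool_list:

import multiprocessing as mp

import numpy as np


def count_tokens(chunk):
    """Toy worker: count token frequencies in one chunk of lines."""
    freq = {}
    for line in chunk:
        for token in line.split():
            freq[token] = freq.get(token, 0) + 1
    return freq


if __name__ == "__main__":
    data = ["AAA TTT AAA", "TTT CCC", "AAA GGG"] * 1000
    n_proc = min(4, mp.cpu_count())

    # np.array_split tolerates len(data) not being divisible by n_proc
    chunks = np.array_split(data, n_proc)

    with mp.Pool(processes=n_proc) as pool:
        partials = pool.map(count_tokens, chunks)

    # merge the per-process dicts, mirroring merge_dict_pool_list
    merged = {}
    for part in partials:
        for key, cnt in part.items():
            merged[key] = merged.get(key, 0) + cnt
    print(sorted(merged.items()))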
Example #2
def make_filtered_ccds_current_file_by_shortest_cdn():
    print('make_filtered_ccds_current_file_by_shortest_cdn')
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    ccds_list = []
    if SYSTEM_NM == 'Linux':
        ccds_list.extend(
            util.read_tsv_ignore_N_line(WORK_DIR + IN + NON_FLT_CDS_INFO,
                                        n_line=0))
    else:
        # ccds_list.extend(util.read_csv_ignore_N_line(WORK_DIR + IN + NON_FLT_CDS_INFO, n_line=0)[:3000])
        ccds_list.extend(
            util.read_tsv_ignore_N_line(WORK_DIR + IN + NON_FLT_CDS_INFO,
                                        n_line=0))

    # start plan A : filter out non-Public, non-Identical entries
    ccds_list = logic_prep.get_data_with_trgt_strng(ccds_list, 'Public', 5)
    ccds_list = logic_prep.get_data_with_trgt_strng(ccds_list, 'Identical', -1)

    ccds_hg38_form_list = logic_prep.transform_mouse_ccds_form_to_hg38_refFlat(
        ccds_list)

    filted_ccds_list = logic_prep.get_shortest_cdn_among_same_gen_id(
        ccds_hg38_form_list)  # 20201201
    # end plan A

    header = [
        'GeneSym', 'NMID', 'Chrom', 'Strand', 'Transcript_Start', 'End',
        'ORFStart', 'End', '#Exon', 'ExonS_list', 'ExonE_list'
    ]
    util.make_tsv(WORK_DIR + IN + 'shortest_cdn_' + FLTD_CDS_INFO, header,
                  filted_ccds_list)
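get_data_with_trgt_strng is not shown in this listing; judging by the calls above, it keeps only rows whose idx-th column contains the target string, with a negative idx addressing columns from the right. A plausible reimplementation, for illustration only:

def get_data_with_trgt_strng(data_list, trgt_strng, idx):
    """Hypothetical version: keep rows whose idx-th column contains
    trgt_strng; the real LogicPreps method may differ."""
    return [row for row in data_list if trgt_strng in row[idx]]


rows = [
    ["CCDS1.1", "chr1", "gene_a", "1", "100", "Public", "Identical"],
    ["CCDS2.1", "chr1", "gene_b", "2", "200", "Withdrawn", "Identical"],
]
print(get_data_with_trgt_strng(rows, "Public", 5))      # keeps CCDS1.1 only
print(get_data_with_trgt_strng(rows, "Identical", -1))  # keeps both rows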
Example #3
    def sort_n_merge_by_chr_one_file(self, init_merge, init_be):
        ref_path = init_merge[0]
        cdf_file = init_merge[1]
        a_or_c_idx = init_merge[2]
        a_c_rule = init_merge[3]
        work_dir = init_merge[4]
        top_n = init_merge[5]
        f_nm = init_be[5]

        logic_prep = LogicPrep.LogicPreps()
        util = Util.Utils()

        trgt_seq_dict = logic_prep.get_target_seq_with_clvg_site(ref_path + cdf_file, init_be)
        chr_dict = logic_prep.target_seq_with_clvg_site_group_by_chromosome(trgt_seq_dict)

        cs9_score_dict = {}
        cs9_score_dict.update(logic_prep.get_deep_cas9_tupl(work_dir + "deep_cas_9/", "RANK_final_DeepCas9_0.txt",
                                                            "sample_0.txt"))
        cs9_score_dict.update(logic_prep.get_deep_cas9_tupl(work_dir + "deep_cas_9/", "RANK_final_DeepCas9_1.txt",
                                                            "sample_1.txt"))

        top_n_list = []
        for chr_key, trnscrpt_list in chr_dict.items():
            result_list = []
            result_list = logic_prep.merge_cas9_abe_cbe_to_list(chr_key, [trnscrpt_list, {}, {},
                                                                          cs9_score_dict], result_list)

            sort_by_cas9_list = logic_prep.sort_by_idx_element(result_list, -3, [])

            top_n_list.extend(sort_by_cas9_list[:top_n + 1])

        # make tsv file result
        util.make_tsv_after_sorting(work_dir + "output/" + f_nm + "_seq_sorted_by_CAS9_top_" + str(top_n), top_n_list, init_be)
        # make excel result
        util.make_excel_after_sorting(work_dir + "output/" + f_nm + "_seq_sorted_by_CAS9_top_" + str(top_n), top_n_list, init_be)
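sort_by_idx_element is not shown either. A hedged stand-in, assuming it sorts rows in descending order of a numeric score stored near the end of each row (hence the -3 index for the DeepCas9 score above):

def sort_by_idx_element(rows, idx, out):
    """Hypothetical version: append rows to out, sorted descending by
    the idx-th element."""
    out.extend(sorted(rows, key=lambda r: float(r[idx]), reverse=True))
    return out


chr_dict = {
    "chr1": [["g1", 0.2], ["g2", 0.9], ["g3", 0.5]],
    "chr2": [["g4", 0.7], ["g5", 0.1]],
}
top_n = 1
top_n_list = []
for chr_key, rows in chr_dict.items():
    ranked = sort_by_idx_element(rows, -1, [])
    top_n_list.extend(ranked[:top_n + 1])  # top_n + 1 rows, as in the code above
print(top_n_list)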
Example #4
def main_by_list_w_filenames():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()

    # file_num_list = []
    # for j in range(964):
    #     file_num_list.append(j)

    header = ['chr', 'tot_seq', 'fam_nm', 'index', 'strand', 'trns_flag']
    # for i in file_num_list:
    for i in range(964):
        path = WORK_DIR + IN + TE_info_fl.replace(
            ".txt", "") + "/Genome_TandemRepeat_TRD_" + str(i) + ".txt"
        te_inf_list = util.read_csv_ignore_N_line(path, "\t", 0)

        splited_te_inf_list = np.array_split(te_inf_list, MULTI_CNT)

        print("platform.system() : ", SYSTEM_NM)
        print("total cpu_count : ", str(TOTAL_CPU))
        print("will use : ", str(MULTI_CNT))
        pool = mp.Pool(processes=MULTI_CNT)

        pool_list = pool.map(start_multi_processing, splited_te_inf_list)
        pool.close()
        splited_te_inf_list[:] = []

        result_list = logic_prep.merge_multi_list(pool_list)
        pool_list.clear()

        util.make_csv(WORK_DIR + "output2/TE_trgt_20210330_" + str(i) + ".txt",
                      header, result_list, 0, '\t')
        result_list.clear()
Example #5
def merge_cas9_abe_cbe():
    logic = Logic.Logics()
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    trgt_seq_dict = logic_prep.get_target_seq_with_clvg_site(
        REF_PATH + CDS_FILE, INIT_BE)
    chr_dict, aqia_chr_dict = logic_prep.target_seq_with_clvg_site_group_by_chromosome(
        trgt_seq_dict, ":Macaca_fascicularis_5.0:", IGNORE_CHR_LIST)

    a_c_dict = logic.filter_out_by_ACGTU_rule(chr_dict, A_or_C_IDX, ACTG_RULE)
    aqia_a_c_dict = logic.filter_out_by_ACGTU_rule(aqia_chr_dict, A_or_C_IDX,
                                                   ACTG_RULE)

    abe_score_dict = logic_prep.get_deep_base_ed_score(
        WORK_DIR + "deep_ABE/ABE_Efficiency.txt")
    cbe_score_dict = logic_prep.get_deep_base_ed_score(
        WORK_DIR + "deep_CBE/CBE_Efficiency.txt")
    cs9_score_dict = logic_prep.get_deep_cas9_tupl(
        WORK_DIR + "deep_cas_9/", "RANK_final_DeepCas9_Final.txt",
        "sample.txt")

    util.make_merge_excel_by_chr(
        WORK_DIR + "merge_cas9_abe_cbe/crab_eating_monkey_merge_abe_cbe_cas9",
        [a_c_dict, abe_score_dict, cbe_score_dict, cs9_score_dict], INIT_BE)

    util.make_merge_excel(
        WORK_DIR +
        "merge_cas9_abe_cbe/crab_eating_monkey_merge_abe_cbe_cas9_AQIA",
        [aqia_a_c_dict, abe_score_dict, cbe_score_dict, cs9_score_dict],
        INIT_BE)
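None of the score loaders are shown. A plausible minimal version of get_deep_base_ed_score, assuming one header line followed by tab-separated sequence/score pairs; the real LogicPreps parser may use a different layout:

def get_deep_base_ed_score(path):
    """Hypothetical loader: map target sequence -> predicted efficiency."""
    score_dict = {}
    with open(path) as f:
        next(f)  # skip the header line
        for line in f:
            fields = line.rstrip("\n").split("\t")
            if len(fields) >= 2:
                score_dict[fields[0]] = float(fields[1])
    return score_dict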
Example #6
def multi_processing_1():
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    # CHROM	POS	ID	REF	ALT	mut_length	CLNVC	CLNSIG
    # POS - 1 = index of .fa sequence
    # [['1', '930188', '846933', 'G', 'A', '1', 'substitution', 'Uncertain_significance'],...]
    mut_list = []
    if SYSTEM_NM == 'Linux':
        mut_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/" + MUT_INFO, "\t"))
    else:
        mut_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/" + MUT_INFO, "\t")[:300])

    splited_mut_list = np.array_split(mut_list, MULTI_CNT)

    print("platform.system() : ", SYSTEM_NM)
    print("total cpu_count : ", str(TOTAL_CPU))
    print("will use : ", str(MULTI_CNT))
    pool = mp.Pool(processes=MULTI_CNT)

    pool_list = pool.map(get_PAM_within_N_bp_of_POS, splited_mut_list)
    result_list = logic_prep.merge_multi_list(pool_list)

    header = ['CHROM', 'PAM', str(SEQ_WIN_SIZE[0]) + ' + PAM + ' + str(SEQ_WIN_SIZE[1]), 'PAM_POS', 'STRAND']
    try:
        os.remove(WORK_DIR + "input/" + multi_processing_1_FILE)
    except Exception as err:
        print('os.remove(WORK_DIR + "input/" + multi_processing_1_FILE) : ', str(err))
    util.make_csv(WORK_DIR + "input/" + multi_processing_1_FILE, header, result_list, 0, "\t")
    util.make_excel(WORK_DIR + "output/ClinVar_hg38_result", header, result_list)
Example #7
def main_20201117():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    logic = Logic.Logics()

    input_list = util.read_tsv_ignore_N_line(INPUT_LIST)

    result_list = []
    for val_arr in input_list:
        ori_seq = val_arr[0].upper()
        n_of_mismatch = int(val_arr[1])
        n_of_sub_seq = int(val_arr[2])

        idx_set = logic_prep.make_seq_idx_set(0, len(ori_seq))

        rand_idx_list = []
        for i in range(n_of_sub_seq):
            rand_idx_list.append([random.sample(sorted(idx_set), n_of_mismatch)])  # sample needs a sequence on Python >= 3.11

        for idx_list in rand_idx_list:
            sub_seq = ori_seq
            for idx_arr in idx_list:
                for i in idx_arr:
                    tmp_set = BASE_NT - {ori_seq[i].lower()}
                    sub_seq = logic.swap_char_in_string(
                        sub_seq, i,
                        random.sample(sorted(tmp_set), 1)[0])
                result_list.append([ori_seq, sub_seq, len(idx_arr)])

    header = ['ori_seq', 'sub_seq', '#_of_mismatch']
    try:
        util.make_excel(WORK_DIR + '/output/result', header, result_list)
    except Exception as err:
        print('util.make_excel failed, writing tsv instead : ', str(err))
        util.make_tsv(WORK_DIR + '/output/result', header, result_list)
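swap_char_in_string is the only mutation primitive this example needs. Python strings are immutable, so a plausible implementation rebuilds the string around the swapped index; this sketch is an assumption, not necessarily the repository's actual method:

import random


def swap_char_in_string(seq, idx, new_char):
    """Hypothetical version: return seq with the idx-th character replaced."""
    return seq[:idx] + new_char + seq[idx + 1:]


BASE_NT = {'a', 'c', 'g', 't'}
ori_seq = "ACGTACGT"
i = random.randrange(len(ori_seq))
# pick any base other than the original one at position i
new_nt = random.choice(sorted(BASE_NT - {ori_seq[i].lower()}))
print(swap_char_in_string(ori_seq, i, new_nt))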
Example #8
def main():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps(WEB_DRV, TARGET_URL)
    logic = Logic.Logics()

    for input_file in INPUT_TXT:
        needle_result_list = []
        input_list = util.read_tb_txt(WORK_DIR + input_file)

        for val_arr in input_list:
            final_idx = val_arr[1]
            asequence = val_arr[3]  # NGS read
            bsequence = val_arr[4]  # Reference
            logic_prep.go_to_url(TARGET_URL)
            logic_prep.input_data_by_id("pn_stype", "dna")
            logic_prep.input_data_by_id("asequence", asequence)
            logic_prep.input_data_by_id("bsequence", bsequence)

            logic_prep.scroll_down()
            logic_prep.get_by_xpath("//div[@id='jd_submitButtonPanel']/input[@type='submit']", False).click()

            logic_prep.go_to_url(WEB_DRV.current_url)
            # the first lookup waits for the alignment element to render
            logic_prep.get_by_xpath("//pre[@id='alignmentContent']", False)
            crwl_txt = logic_prep.get_by_xpath("//pre[@id='alignmentContent']", False).text

            a_seq_name, crwl_txt_arr = logic.extract_data(crwl_txt)

            logic.add_needle_result(final_idx, a_seq_name, crwl_txt_arr, needle_result_list)

        util.make_excel(WORK_DIR + "crawler_output/result_" + input_file.replace(".txt", ""), needle_result_list)
Example #9
def multi_processing_for_whole_pam_ClinVar():
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    # CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
    # POS - 1 = index of .fa sequence
    # [['1', '1338000', '208047', 'CT', 'C', '.', '.', '"ALLELEID=204306;CLNDISDB=MONDO:MONDO:0014591,MedGen:C4225363,OMIM:616331;CLNDN=Robinow_syndrome,_autosomal_dominant_2;CLNHGVS=NC_000001.11:g.1338001del;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Pathogenic;CLNVC=Deletion;CLNVCSO=SO:0000159;GENEINFO=DVL1:1855;MC=SO:0001589|frameshift_variant;ORIGIN=33;RS=797044837"'],...]
    mut_list = []
    if SYSTEM_NM == 'Linux':
        mut_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/" + MUT_INFO, "\t"))
    else:
        mut_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/" + MUT_INFO, "\t")[:300])

    splited_mut_list = np.array_split(mut_list, MULTI_CNT)

    print("platform.system() : ", SYSTEM_NM)
    print("total cpu_count : ", str(TOTAL_CPU))
    print("will use : ", str(MULTI_CNT))
    pool = mp.Pool(processes=MULTI_CNT)

    pool_list = pool.map(get_seq_by_pam_after_mut, splited_mut_list)

    result_list = logic_prep.merge_multi_list(pool_list)

    header = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
              'P_REF_SEQ_[' + str(WIN_SIZE[0]) + '], M_REF_SEQ_[' + str(WIN_SIZE[1]) + ']']
    for pam_nm in OTHOLOG:
        for strand in ['+', '-']:
            header.append(pam_nm + strand)
    util.make_excel(WORK_DIR + "output/SY_Dominant_result_by_spacer", header, result_list)
Example #10
def multi_processing_test():
    util = Util.Utils()
    ref_val = [
        '76967',
        'TTTGACTCATCTCGTCACTACAGACATGCATCGCATACTCTCCCTATGTTCCAGCTTCCTGGGTCTGCAGGTCCAGCCGAGTCGCCAAATAAGTGCCATCTACTCTACC'
    ]
    logic_prep = LogicPrep.LogicPreps([ref_val, 0, 0])

    ngs_read = util.read_tb_txt_wo_header(WORK_DIR + NGS_read_DIR +
                                          ref_val[0] + ".txt")
    splited_ngs_read = np.array_split(ngs_read, MULTI_CNT)

    print("total cpu_count : " + str(TOTAL_CPU))
    print("will use : " + str(MULTI_CNT))
    pool = mp.Pool(processes=MULTI_CNT)

    pool_list = pool.map(logic_prep.get_pairwise2_needle_dict_simple,
                         splited_ngs_read)

    merge_dict, _ = logic_prep.merge_multi_dict_from_simple(pool_list)
    result_dict = logic_prep.get_sub_ins_del_list_dict_from_simple(merge_dict)

    util.make_excel_simple(
        WORK_DIR + "output/multi_p_result_" + ref_val[0] + "_" +
        str(time.perf_counter()), result_dict)
Example #11
def multi_processing():
    util = Util.Utils()

    ref_seq_list = util.read_tb_txt_wo_header(WORK_DIR + REF_SEQ)

    for ref_val in ref_seq_list:
        logic_prep = LogicPrep.LogicPreps([ref_val, 0, 0])
        try:
            ngs_read = util.read_tb_txt_wo_header(WORK_DIR + NGS_read_DIR +
                                                  ref_val[0] + ".txt")
            splited_ngs_read = np.array_split(ngs_read, MULTI_CNT)

            print("total cpu_count : " + str(TOTAL_CPU))
            print("will use : " + str(MULTI_CNT))
            pool = mp.Pool(processes=MULTI_CNT)

            pool_list = pool.map(logic_prep.get_pairwise2_needle_dict_simple,
                                 splited_ngs_read)
            pool.close()  # release the workers before the next reference

            merge_dict, _ = logic_prep.merge_multi_dict_from_simple(pool_list)
            result_dict = logic_prep.get_sub_ins_del_list_dict_from_simple(
                merge_dict)

            util.make_excel_simple(
                WORK_DIR + "output/multi_p_result_" + ref_val[0], result_dict)
        except FileNotFoundError:
            print(ref_val[0] + ".txt : FileNotFoundError")
            continue
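get_pairwise2_needle_dict_simple is not shown; its name suggests Biopython's pairwise2 module doing a needle-style (Needleman-Wunsch) global alignment of each NGS read against the reference. A minimal sketch with made-up scoring parameters (note that Biopython 1.80+ deprecates pairwise2 in favor of Bio.Align.PairwiseAligner):

from Bio import pairwise2
from Bio.pairwise2 import format_alignment

ref_seq = "TTTGACTCATCTCGTCACTACAGACATG"
ngs_read = "TTTGACTCATCGTCACTACAGACATG"

# global alignment: match +2, mismatch -1, gap open -2, gap extend -0.5
alignments = pairwise2.align.globalms(ref_seq, ngs_read, 2, -1, -2, -0.5)
print(format_alignment(*alignments[0]))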
Example #12
def anlyze_indel_by_MAIN_to_SUB():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    logic = Logic.Logics()

    brcd_list = util.csv_to_list_ignr_header(WORK_DIR + INPUT + BRCD_FILE)
    brcd_arr = logic_prep.make_arr_list_to_list(brcd_list)

    trgt_list = []
    trgt_err_list = []
    for path in [MAIN_DIR, SUB_DIR]:
        csv_list = util.csv_to_list_ignr_header(
            WORK_DIR + INPUT + path + F_TABLE_FILE, "\t")
        result_list, err_list = logic_prep.get_data_by_cell_id(
            csv_list, brcd_arr, CONST_INIT)
        trgt_list.append(result_list)
        trgt_err_list.append(err_list)

    # result_dict = logic.count_len_arr_mut_non_mut_by_main_list(trgt_list[0], trgt_list[1], brcd_arr)
    result_dict = logic.count_cell_mut_non_mut_by_main_list(
        trgt_list[0], trgt_list[1])
    util.make_excel_indel_frequency_by_cell_id(
        WORK_DIR + "output/result_indel_" + MAIN_SUB_NAME[0] + "_" +
        MAIN_SUB_NAME[1], result_dict, MAIN_SUB_NAME)

    for idx in range(len(trgt_err_list)):
        sorted_err_list = logic_prep.sort_list_by_ele(trgt_err_list[idx], -1)
        logic.count_num_by_err(sorted_err_list)
        util.make_excel_err_list(
            WORK_DIR + "output/" + MAIN_SUB_NAME[idx] + "_error_list",
            sorted_err_list)
Example #13
def main1():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    logic = Logic.Logics()

    seq_record = util.get_seq_record_from_genbank(WORK_DIR + NCBI +
                                                  genbank_file_name + ".gb")
    cds_idx_list = logic_prep.get_cds_idx_arr_to_list(seq_record)

    init_rule = INIT
    pam_seq = init_rule[2]
    plus_strand_list, minus_strand_list = logic.get_idx_of_matching_seq(
        seq_record.seq, pam_seq)

    plus_idx_list = logic.get_idx_in_list(plus_strand_list, cds_idx_list)
    minus_idx_list = logic.get_idx_in_list(minus_strand_list, cds_idx_list,
                                           False)

    filtered_plus_idx_list = logic_prep.filter_out_dupl(plus_idx_list)
    filtered_minus_idx_list = logic_prep.filter_out_dupl(minus_idx_list)

    plus_seq_list = logic.get_trgt_seq_in_idx_list(seq_record.seq,
                                                   filtered_plus_idx_list,
                                                   init_rule)
    minus_seq_list = logic.get_trgt_seq_in_idx_list(seq_record.seq,
                                                    filtered_minus_idx_list,
                                                    init_rule, False)

    merge_list = logic_prep.merge_list([plus_seq_list, minus_seq_list])
    tot_list = logic_prep.sort_list_by_ele(merge_list, 0)

    header = ["sequence", "strand"]
Example #14
def multi_step_1():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()

    fl_num = 0
    dfam_info = util.read_csv_ignore_N_line(DFAM_ANNO + str(fl_num), '\t', 0)

    if SYSTEM_NM != 'Linux':
        dfam_info = dfam_info[:100]

    dfam_dict = logic_prep.make_list_to_dict_by_ele_as_key(dfam_info, 0)

    header = ['chr', 'tot_seq', 'fam_nm', 'index', 'strand', 'trns_flag']
    for key, val_list in dfam_dict.items():
        splited_dfam_list = np.array_split(val_list, MULTI_CNT)

        print("platform.system() : ", SYSTEM_NM)
        print("total cpu_count : ", str(TOTAL_CPU))
        print("will use : ", str(MULTI_CNT))
        pool = mp.Pool(processes=MULTI_CNT)

        pool_list = pool.map(get_trgt, splited_dfam_list)
        pool.close()  # release the workers before the next key
        result_list = logic_prep.merge_multi_list(pool_list)
        print(type(result_list))

        util.make_excel(WORK_DIR + "output/TE_trgt_" + str(fl_num) + "_" + key,
                        header, result_list)
Example #15
def make_filtered_hg38_refFlat():
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    # GeneSym   NMID    Chrom   Strand  Transcript_Start   End ORFStart    End #Exon   ExonS_list  ExonE_list
    # ['MIR6859-1', 'NR_106918', 'chr1', '-', '17368', '17436', '17436', '17436', '1', '17368,', '17436,']
    # ['WASH7P', 'NR_024540', 'chr1', '-', '14361', '29370', '29370', '29370', '11', '14361,14969,15795,16606,16857,17232,17605,17914,18267,24737,29320,', '14829,15038,15947,16765,17055,17368,17742,18061,18366,24891,29370,']
    cds_list = []
    if SYSTEM_NM == 'Linux':
        cds_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/" + CDS_INFO, "\t", 0))
    else:
        cds_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/" + CDS_INFO, "\t", 0)[:3000])

    NM_cds_list = logic_prep.filter_out_NON_NM_id_in_cds_list(cds_list)
    # filter_out_cds_wout_strt_cdn(NM_cds_list)

    splited_cds_list = np.array_split(NM_cds_list, MULTI_CNT)

    print("platform.system() : ", SYSTEM_NM)
    print("total cpu_count : ", str(TOTAL_CPU))
    print("will use : ", str(MULTI_CNT))
    pool = mp.Pool(processes=MULTI_CNT)

    pool_cds_idx_list = pool.map(filter_out_cds_wout_strt_cdn, splited_cds_list)
    result_list = logic_prep.merge_multi_list(pool_cds_idx_list)

    header = ['GeneSym', 'NMID', 'Chrom', 'Strand', 'Transcript_Start', 'End', 'ORFStart', 'End', '#Exon', 'ExonS_list',
              'ExonE_list']
    try:
        os.remove(WORK_DIR + "input/" + FILTERED_CDS_INFO)
    except Exception as err:
        print('os.remove(WORK_DIR + "input/" + FILTERED_CDS_INFO) : ', str(err))
    util.make_csv(WORK_DIR + "input/" + FILTERED_CDS_INFO, header, result_list, 0, "\t")
    util.make_excel(WORK_DIR + "output/filtered_hg38_refFlat", header, result_list)
Example #16
def recount_motif_error():
    util = Util.Utils()
    logic = Logic.Logics()
    logic_prep = LogicPrep.LogicPreps()

    for err_fl_path in MOTIF_ERROR_FL:
        motif_err_fl = util.read_tsv_ignore_N_line(WORK_DIR + IN + err_fl_path)

        # filter out missing values
        flted_1_motif_err_fl = logic_prep.filterout_ele_w_trgt_str(
            motif_err_fl, 2, '-')
        motif_err_fl.clear()
        # rows whose seq contains 'N' (e.g. Excel's '#NAME?' artifacts) are removed
        flted_2_motif_err_fl = logic_prep.filterout_ele_w_trgt_str(
            flted_1_motif_err_fl, 2, 'N')
        flted_1_motif_err_fl.clear()
        flted_3_motif_err_fl = logic_prep.filterout_ele_w_trgt_str(
            flted_2_motif_err_fl, 2, 'n')
        flted_2_motif_err_fl.clear()

        motif_err_dict = logic_prep.make_list_to_dict_by_elekey(
            flted_3_motif_err_fl, 0)

        result_list = logic.recount_total_proportion_by_dictkey(
            motif_err_dict, 3)

        # head = ['Filename', 'INDEX', 'seq', 'Motif', 'Count', 'Total_cnt', 'Proportion', 'Substitution']
        head = [
            'Filename', 'seq', 'Motif', 'Count', 'Total_cnt', 'Proportion',
            'Substitution'
        ]
        util.make_excel(
            WORK_DIR + OU + 'new_' + err_fl_path.replace('.txt', ''), head,
            result_list)
Example #17
def test():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()

    excel_arr = []
    csv_list = [[x.upper() for x in tmp_arr] for tmp_arr in
                util.read_csv_ignore_N_line(WORK_DIR + INPUT + GUIDE_BARCODE_CSV)]
    # csv_list
    excel_arr.append(csv_list)
    # index_list
    excel_arr.append(logic_prep.make_1_arr_list_to_list(0, csv_list))
    # guide_list
    excel_arr.append(logic_prep.make_1_arr_list_to_list(2, csv_list))
    # barcd_randBP_list
    excel_arr.append(logic_prep.make_2_arr_list_to_list_after_slice(6, 7, POS_SLICE_RAND_BP, csv_list))

    # hoist the repeated make_2_arr_list_to_list calls out of the loop
    barcd_full_list = logic_prep.make_2_arr_list_to_list(6, 7, csv_list)
    barcd_sliced_list = logic_prep.make_2_arr_list_to_list_after_slice(6, 7, POS_SLICE_RAND_BP, csv_list)
    for tmp_idx in range(len(barcd_full_list)):
        print(barcd_full_list[tmp_idx])
        print(barcd_sliced_list[tmp_idx])
    # trgt_list
    excel_arr.append(logic_prep.make_1_arr_list_to_list(8, csv_list))
    # d0_seq_wo_scaf_list
    excel_arr.append(logic_prep.make_3_arr_list_to_list(3, 4, 5, csv_list))
    # barcd_list
    excel_arr.append(logic_prep.make_1_arr_list_to_list(6, csv_list))
    # randBP_list
    excel_arr.append(logic_prep.make_1_arr_list_to_list(7, csv_list))

    for d0_d4_idx in range(len(D0_D4_FLAG_ARR)):
        logic = Logic.Logics(INIT, excel_arr, D0_D4_FLAG_ARR[d0_d4_idx])
Example #18
def make_filtered_ccds_current_file_by_shortest_cdn():
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    ccds_list = []
    if SYSTEM_NM == 'Linux':
        ccds_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/201130_CCDS_" + TYPE + "_current.txt", "\t", 0))
    else:
        # ccds_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/CCDS.current.txt", "\t", 0)[:3000])
        ccds_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/201130_CCDS_" + TYPE + "_current.txt", "\t", 0))

    # st plan A : filter out non Public, non Identical
    ccds_list = logic_prep.get_data_with_trgt_strng(ccds_list, 'Public', 5)
    ccds_list = logic_prep.get_data_with_trgt_strng(ccds_list, 'Identical', -1)

    ccds_hg38_form_list = logic_prep.transform_mouse_ccds_form_to_hg38_refFlat(ccds_list)

    filted_ccds_list = logic_prep.get_shortest_cdn_among_same_gen_id(ccds_hg38_form_list)  # 20201201
    # en plan A

    header = ['GeneSym', 'NMID', 'Chrom', 'Strand', 'Transcript_Start', 'End', 'ORFStart', 'End', '#Exon', 'ExonS_list',
              'ExonE_list']

    try:
        os.remove(WORK_DIR + "input/filtered_shortest_cdn_CCDS_" + TYPE + ".txt")
    except Exception as err:
        print('os.remove(WORK_DIR + "input/filtered_shortest_cdn_CCDS_" + TYPE + ".txt") : ', str(err))
    util.make_csv(WORK_DIR + "input/filtered_shortest_cdn_CCDS_" + TYPE + ".txt", header, filted_ccds_list, 0, "\t")
Example #19
def make_filtered_out_ClinVar_pos_in_cds_or_not():
    logic = Logic.Logics()
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    cds_info = util.read_csv_ignore_N_line(WORK_DIR + "input/" + ALL_CDS_INFO, "\t")
    cds_dict_by_chr = {}
    for cds_arr in cds_info:
        chrom = cds_arr[2]
        start_idx_arr, end_idx_arr = logic_prep.get_orf_strt_end_idx_arr(cds_arr)
        idx_list = logic_prep.get_idx_num_frm_strt_to_end_list(start_idx_arr, end_idx_arr)
        if chrom in cds_dict_by_chr:
            cds_dict_by_chr[chrom].append(idx_list)
        else:
            cds_dict_by_chr.update({chrom: [idx_list]})

    mut_dict = logic_prep.get_dict_from_list_by_ele_key(
        util.read_csv_ignore_N_line(WORK_DIR + "input/" + MUT_INFO, "\t"), 0)

    not_in_cds_list = []
    in_cds_list = []
    for chr_num, mut_list in mut_dict.items():
        cds_idx_list = cds_dict_by_chr['chr' + chr_num]
        for mut_arr in mut_list:
            pos = int(mut_arr[1]) + ADJ_REF_IDX
            tmp_id = int(mut_arr[2])
            if not logic.check_seq_in_cds(cds_idx_list, pos):
                not_in_cds_list.append(mut_arr)
            else:
                in_cds_list.append(mut_arr)
    print(len(not_in_cds_list))
    header = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
    util.make_csv(WORK_DIR + '/input/ClinVar_dominant_mutation_on_CDS.txt', header, in_cds_list, deli='\t')
    util.make_csv(WORK_DIR + '/input/ClinVar_dominant_mutation_not_on_CDS.txt', header, not_in_cds_list, deli='\t')
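check_seq_in_cds decides the on-CDS/off-CDS split above. A plausible membership test, assuming cds_idx_list holds one collection of exonic indices per transcript (the real Logics method may scan ranges instead):

def check_seq_in_cds(cds_idx_list, pos):
    """Hypothetical version: pos is in a CDS if any transcript's
    exonic index collection contains it."""
    return any(pos in idx_list for idx_list in cds_idx_list)


# one transcript with exons covering [100, 105) and [200, 203)
cds_idx_list = [set(range(100, 105)) | set(range(200, 203))]
print(check_seq_in_cds(cds_idx_list, 102))  # True
print(check_seq_in_cds(cds_idx_list, 150))  # False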
Example #20
def indel_frequency_by_1500x1500_cell_id_w_output_dir():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    logic = Logic.Logics()

    brcd_list = util.csv_to_list_ignr_header(WORK_DIR + INPUT + BRCD_FILE)
    brcd_arr = logic_prep.make_arr_list_to_list(brcd_list)
    cell_id_list = logic_prep.make_cell_id(brcd_arr, "^")

    var_list = util.csv_to_list_ignr_header(WORK_DIR + INPUT + "var_list.txt",
                                            "\t")

    for idx in range(int(len(var_list) / 2)):
        main_idx = 2 * idx
        sub_idx = 2 * idx + 1
        main_arr = var_list[main_idx]
        sub_arr = var_list[sub_idx]

        main_sub_nm = [main_arr[0], sub_arr[0]]
        main_path = main_arr[5] + "/CRISPResso_on_" + main_arr[1].replace(
            ".fastq", "") + "_join"
        sub_path = sub_arr[5] + "/CRISPResso_on_" + sub_arr[1].replace(
            ".fastq", "") + "_join"
        path_arr = [main_path, sub_path]

        trgt_list = []
        trgt_err_list = []
        for path in path_arr:
            csv_list = util.csv_to_list_ignr_header(
                WORK_DIR + INPUT + SUBPATH + path + F_TABLE_FILE, "\t")
            tmp_list, err_list = logic_prep.get_data_by_cell_id(
                csv_list, brcd_arr, CONST_INIT)
            trgt_list.append(tmp_list)
            trgt_err_list.append(err_list)

        result_list, cnt_hom_hete_wt, junk_arr = logic.get_num_of_reads_percent_of_read_by_cell(
            trgt_list, cell_id_list, THRESHOLD_ARR)

        # make output path
        os.makedirs(WORK_DIR + 'output/' + SUBPATH, exist_ok=True)
        util.make_excel_by_list(
            WORK_DIR + "output/" + SUBPATH + "tot_read_by_cell_homo_hetero_" +
            main_sub_nm[0] + "_" + main_sub_nm[1] + "_" + str(idx),
            result_list, cnt_hom_hete_wt)

        for tmp_idx in range(len(trgt_err_list)):
            sorted_err_list = logic_prep.sort_list_by_ele(
                trgt_err_list[tmp_idx], -1)
            logic.count_num_by_err(sorted_err_list)
            util.make_excel_err_list(
                WORK_DIR + "output/" + SUBPATH + main_sub_nm[tmp_idx] +
                "_error_list_" + str(idx), sorted_err_list)

        junk_file_nm = ['cell_non_junk', 'non_cell_junk']
        for idx_junk in range(len(junk_arr)):
            util.make_excel_by_arr_list(
                WORK_DIR + "output/" + SUBPATH + junk_file_nm[idx_junk] + "_" +
                main_sub_nm[0] + "_" + main_sub_nm[1] + "_" + str(idx),
                junk_arr[idx_junk])
Example #21
def split_TE_1_fl_n_by_1_right_away():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()

    header = ['sequence', '#duple', '#trnscprt',
              'chromosome:23bp(spacer + PAM)index range:strand:transcription:fam_name']

    fl_nm_f = WORK_DIR + "output/loop/TE_trgt_cyc"
    cyc_num = 3
    fl_nm_b = "_fln_by_1"

    res_f_num = 0

    # tm_arr = [[1, 2, 3, 5], [0, 4, 6, 7]]
    # tm_arr = [[0, 3], [1, 2]]
    tm_arr = [[0, 2]]
    for i_a in range(len(tm_arr)):
        result_dict = {}
        for i in tm_arr[i_a]:
            print(fl_nm_f + str(cyc_num) + "_" + str(i) + fl_nm_b)
            with open(fl_nm_f + str(cyc_num) + "_" + str(i) + fl_nm_b) as f:
                print(f.readline())
                while True:
                    tmp_line = f.readline().replace("\n", "")
                    if tmp_line == '':
                        break
                    dfam_arr = tmp_line.split('\t')
                    tot_seq = dfam_arr[0]
                    res_key = tot_seq
                    if res_key in result_dict:
                        result_dict[res_key].update(dfam_arr[3].replace(" ", "").split(',')[:-1])
                    else:
                        tmp_set = set(dfam_arr[3].replace(" ", "").split(',')[:-1])
                        result_dict.update({res_key: tmp_set})

        result0_list = []
        result1_list = []
        for res_key, val_set in result_dict.items():
            tmp_str = ""
            cnt_trpt = 0
            for tmp_val in val_set:
                if 'True' in tmp_val:
                    cnt_trpt += 1
                tmp_str += tmp_val + ", "
            if len(val_set) > 1:
                result0_list.append([res_key, len(val_set), cnt_trpt, tmp_str])
            else:
                result1_list.append([res_key, len(val_set), cnt_trpt, tmp_str])

        result_dict.clear()
        sorted_result0_list = logic_prep.sort_list_by_ele(result0_list, 1)
        result0_list.clear()

        util.make_csv(fl_nm_f + str(cyc_num + 1) + "_" + str(res_f_num) + "_fln_by_1", header, sorted_result0_list, 0, '\t')
        res_f_num += 1
        util.make_csv(fl_nm_f + str(cyc_num + 1) + "_" + str(res_f_num) + "_fln_by_1", header, result1_list, 0, '\t')
        res_f_num += 1
        sorted_result0_list.clear()
        result1_list.clear()
Example #22
def multi_processing_split_big_files_then_find_seq_from_FASTQ():
    print('multi_processing_split_big_files_then_find_seq_from_FASTQ')
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()

    brcd_list = util.read_tb_txt(WORK_DIR + BARCD_SEQ_FILE)
    logic = Logic.Logics(brcd_list)

    # fastq file name without ext
    big_fastq_fl_nm_list = ["19k_ramu", "19k_my"]
    fastq_ext = '.fastq'

    for fastq_fl_nm in big_fastq_fl_nm_list:
        # split big file
        split_init = {
            'big_file_path': WORK_DIR + FASTQ + fastq_fl_nm + fastq_ext,
            'num_row': 4000000,
            'splited_files_dir': WORK_DIR + FASTQ + fastq_fl_nm + "/",
            'output_file_nm': fastq_fl_nm,
            'output_file_ext': fastq_ext
        }
        util.split_big_file_by_row(split_init)

        # get splited_files path
        sources = util.get_files_from_dir(split_init['splited_files_dir'] +
                                          '*.fastq')

        result_dict = {}
        for splited_fastq_fl in sources:
            print("get_FASTQ_seq_list :", splited_fastq_fl)
            fastq_list = util.get_FASTQ_seq_list(splited_fastq_fl)

            # divide data_list by MULTI_CNT
            splited_fastq_list = np.array_split(fastq_list, MULTI_CNT)
            fastq_list.clear()

            print("platform.system() : ", SYSTEM_NM)
            print("total cpu_count : ", str(TOTAL_CPU))
            print("will use : ", str(MULTI_CNT))
            pool = mp.Pool(processes=MULTI_CNT)
            ## analyze FASTQ seq after barcode seq
            pool_list = pool.map(logic.get_dict_multi_p_seq_from_FASTQ,
                                 splited_fastq_list)
            ## analyze whole FASTQ seq
            # pool_list = pool.map(logic.get_dict_multi_p_seq_from_whole_FASTQ, splited_fastq_list)

            print("merge_pool_list_to_result_dict")
            logic.merge_pool_list_to_result_dict(pool_list, result_dict)
            pool.close()
            pool_list[:] = []

        logic_prep.add_missing_brcd_to_dict(brcd_list, result_dict)
        print("make excel file")
        util.make_dict_to_excel(
            WORK_DIR + "output/result_" + fastq_fl_nm + "_" +
            BARCD_SEQ_FILE.replace("barcode_seq/", "").replace(".txt", ""),
            result_dict)
        result_dict.clear()
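split_big_file_by_row is the one piece of this pipeline that is not shown. A plausible streaming implementation under the same init dict layout, starting a new chunk file every num_row lines (4,000,000 rows = 1,000,000 four-line FASTQ records):

import os


def split_big_file_by_row(init):
    """Hypothetical version of Utils.split_big_file_by_row."""
    os.makedirs(init['splited_files_dir'], exist_ok=True)
    out_f, chunk_idx = None, 0
    with open(init['big_file_path']) as big_f:
        for row_idx, line in enumerate(big_f):
            if row_idx % init['num_row'] == 0:
                if out_f:
                    out_f.close()
                out_path = (init['splited_files_dir'] +
                            init['output_file_nm'] + "_" + str(chunk_idx) +
                            init['output_file_ext'])
                out_f = open(out_path, 'w')
                chunk_idx += 1
            out_f.write(line)
    if out_f:
        out_f.close()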
Example #23
    def get_seq_by_pam_after_mut(self, path, mut_list, win_arr, init):
        logic_prep = LogicPrep.LogicPreps()

        pam_arr = init[1]
        len_f_pam_arr = init[2]
        len_b_pam_arr = init[3]
        adj_ref_idx = init[4]

        for mut_arr in mut_list:
            chr_num = mut_arr[0]
            pos = int(mut_arr[1]) + adj_ref_idx
            ref_p_seq = mut_arr[3]
            alt_p_seq = mut_arr[4]

            ref_m_seq = ""
            alt_m_seq = ""
            try:
                ref_m_seq += self.make_complement_string(ref_p_seq)
                if alt_p_seq == '.':
                    alt_p_seq = ""
                else:
                    alt_m_seq += self.make_complement_string(alt_p_seq)
            except Exception as err:
                print("make_complement_string ::: ", err)
                print(ref_p_seq, " : ref_p_seq")
                print(alt_p_seq, " : alt_p_seq")
                print(str(mut_arr))

            # note: this rereads the whole chromosome FASTA for every mutation;
            # caching per chr_num would be much faster
            seq_record = SeqIO.read(path + "chr" + chr_num + ".fa", "fasta")
            p_seq = str(seq_record.seq).upper()
            m_seq = str(seq_record.seq.complement()).upper()

            ori_win_flag = True

            for idx in range(len(pam_arr)):
                pam = pam_arr[idx]
                len_f_pam = len_f_pam_arr[idx]
                len_b_pam = len_b_pam_arr[idx]

                ref_p_dict, p_ori_win_seq = self.get_matched_pam_p_seq_dict(
                    p_seq, pos, win_arr, ref_p_seq, pam, len_f_pam, len_b_pam)
                ref_m_dict, m_ori_win_seq = self.get_matched_pam_m_seq_dict(
                    m_seq, pos, win_arr, ref_m_seq, pam, len_f_pam, len_b_pam)

                mut_p_dict, _ = self.get_matched_pam_p_seq_dict(
                    p_seq, pos, win_arr, alt_p_seq, pam, len_f_pam, len_b_pam)
                mut_m_dict, _ = self.get_matched_pam_m_seq_dict(
                    m_seq, pos, win_arr, alt_m_seq, pam, len_f_pam, len_b_pam)

                self.remove_dict_val_by_key(mut_p_dict, ref_p_dict.keys())
                self.remove_dict_val_by_key(mut_m_dict, ref_m_dict.keys())

                if ori_win_flag:
                    mut_arr.append(p_ori_win_seq + " , " + m_ori_win_seq)
                    ori_win_flag = False

                logic_prep.add_result_seq_to_arr(mut_arr, mut_p_dict)
                logic_prep.add_result_seq_to_arr(mut_arr, mut_m_dict)
Example #24
def get_GRCh38_Regulatory_Build_regulatory_features_by_ClinVar_dominant_mutation_not_on_CDS():
    clin_var_not_cds_fl_nm = 'ClinVar_dominant_mutation_not_on_CDS.txt'
    GRCh38_features_fl_nm = 'homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.20190329.gff'

    logic = Logic.Logics()
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    clin_var_fl = util.read_csv_ignore_N_line(WORK_DIR + IN + clin_var_not_cds_fl_nm, '\t')
    GRCh38_features_fl = util.read_csv_ignore_N_line(WORK_DIR + IN + GRCh38_features_fl_nm, '\t', n_line=0)
    GRCh38_features_dict = logic_prep.get_dict_from_list_by_ele_key(GRCh38_features_fl, 0)

    result_dict = {}
    no_chr_key_list = []
    for clin_var_arr in clin_var_fl:
        tmp_clin_var_key = tuple(clin_var_arr[:5])
        chr_key = clin_var_arr[0]
        pos = int(clin_var_arr[1])
        if chr_key in GRCh38_features_dict:
            result_dict.update({tmp_clin_var_key: []})
            GRCh38_features_list = GRCh38_features_dict[chr_key]
            for GRCh38_features_arr in GRCh38_features_list:
                tmp_type = GRCh38_features_arr[2]
                tmp_info = GRCh38_features_arr[-1]
                st_idx = int(GRCh38_features_arr[3])
                en_idx = int(GRCh38_features_arr[4])
                if st_idx < pos < en_idx:
                    result_dict[tmp_clin_var_key].append([tmp_type, tmp_info])

        else:
            no_chr_key_list.append(tmp_clin_var_key)

    with open(WORK_DIR + OU + 'in_cds.txt', 'w') as in_cds_f:
        with open(WORK_DIR + OU + 'not_in_cds.txt', 'w') as not_cds_f:
            for f_key, val_list in result_dict.items():
                tmp_str = ""
                tmp_blnk = ""
                for tmp_f in f_key:
                    tmp_str = tmp_str + tmp_f + '\t'
                    tmp_blnk = tmp_blnk + '-' + '\t'

                if len(val_list) == 0:
                    not_cds_f.write(tmp_str[:-1] + '\n')
                else:
                    for idx in range(len(val_list)):
                        add_str = ""
                        for tm_f in val_list[idx]:
                            add_str = add_str + tm_f + '\t'

                        if idx == 0:
                            in_cds_f.write(tmp_str + add_str[:-1] + '\n')
                        else:
                            in_cds_f.write(tmp_blnk + add_str[:-1] + '\n')

    util.make_csv(WORK_DIR + OU + 'no_chr_key_list.txt', [], no_chr_key_list)
Example #25
def make_deep_cas9_input():
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    trgt_seq_dict = logic_prep.get_target_seq_with_clvg_site_fr_fasta(
        REF_PATH + CDS_FILE, INIT_BE)
    chr_dict = logic_prep.target_seq_with_clvg_site_group_by_chromosome(
        trgt_seq_dict)

    util.make_deep_cas9_input(WORK_DIR + "deep_cas_9/sample", [chr_dict],
                              INIT_BE, BATCH_SIZE)
Example #26
def merge_multi_processing_4seq_excel_result():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    txt_sources = util.get_files_from_dir(WORK_DIR + "output/" + SUB_OUT_DIR + '*.txt')

    total_list = []
    for txt_file in txt_sources:
        total_list.extend(util.read_tb_txt(txt_file))

    merge_dict = logic_prep.make_4seq_list_to_dict(total_list)

    util.make_4seq_dict_to_excel(WORK_DIR + "output/" + SUB_OUT_DIR + "merge_result_count", merge_dict)
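make_4seq_list_to_dict presumably collapses the concatenated rows into per-combination counts. A hedged sketch using collections.Counter over row tuples:

from collections import Counter


def make_4seq_list_to_dict(total_list):
    """Hypothetical version: count how often each row occurs across
    all of the partial result files."""
    return Counter(tuple(row) for row in total_list)


rows = [["A", "C", "G", "T"], ["A", "C", "G", "T"], ["T", "T", "T", "T"]]
print(make_4seq_list_to_dict(rows))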
Example #27
def filter_out_cds_wout_strt_cdn(cds_list):
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    logic = Logic.Logics()

    print("start filter_out_cds_wout_strt_cdn!!!!")
    result_list = []
    for cds_arr in cds_list:
        gene_sym = cds_arr[0]
        nm_id = cds_arr[1]
        chr_nm = cds_arr[2]
        strand = cds_arr[3]
        orf_strt_pos = int(cds_arr[6])
        orf_end_pos = int(cds_arr[7])

        p_seq, m_seq = util.read_file_by_biopython(REF_DIR + chr_nm + ".fa", "fasta")

        if strand == '+':
            strt_codon = p_seq[orf_strt_pos: orf_strt_pos + 3]
            if strt_codon in STRT_CD_ARR:
                end_codon = p_seq[orf_end_pos - 3: orf_end_pos]
                if end_codon in END_CD_ARR:
                    start_idx_arr, end_idx_arr = logic_prep.get_orf_strt_end_idx_arr(cds_arr)

                    p_cds_seq = logic_prep.get_seq_by_idx_arr(p_seq, start_idx_arr, end_idx_arr)
                    if len(p_cds_seq) % 3 != 0:
                        continue
                    if logic.exist_another_orf_end_codon_in_cds_seq(p_cds_seq):
                        continue

                    result_list.append(list(cds_arr))  # append a copy of the row

        else:
            strt_codon = m_seq[orf_end_pos - 3: orf_end_pos][::-1]
            if strt_codon in STRT_CD_ARR:
                end_codon = m_seq[orf_strt_pos: orf_strt_pos + 3][::-1]
                if end_codon in END_CD_ARR:
                    start_idx_arr, end_idx_arr = logic_prep.get_orf_strt_end_idx_arr(cds_arr)

                    m_cds_seq = logic_prep.get_seq_by_idx_arr(m_seq, start_idx_arr, end_idx_arr)
                    if len(m_cds_seq) % 3 != 0:
                        continue
                    if logic.exist_another_orf_end_codon_in_cds_seq(m_cds_seq, False):
                        continue

                    result_list.append(list(cds_arr))  # append a copy of the row

    print("DONE filter_out_cds_wout_strt_cdn!!!!")
    return result_list
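exist_another_orf_end_codon_in_cds_seq guards against premature stop codons. A plausible plus-strand version that walks the CDS codon by codon and flags a stop anywhere before the final codon (the minus-strand handling selected by the False argument above is omitted from this sketch):

END_CD_ARR = ["TAA", "TAG", "TGA"]


def exist_another_orf_end_codon_in_cds_seq(cds_seq, is_plus_strand=True):
    """Hypothetical version, assuming cds_seq is already a 5'->3'
    coding sequence; is_plus_strand is kept only for signature parity."""
    for i in range(0, len(cds_seq) - 3, 3):
        if cds_seq[i:i + 3] in END_CD_ARR:
            return True
    return False


print(exist_another_orf_end_codon_in_cds_seq("ATGAAATAA"))     # False
print(exist_another_orf_end_codon_in_cds_seq("ATGTAAAAATAA"))  # True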
Example #28
def main_YG():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    logic = Logic.Logics()

    mut_sum_dict = util.read_txt_dvd_by_tab(WORK_DIR + MUT_FILE)
    sorted_mut_dict = logic_prep.sort_dict(mut_sum_dict)

    result_dict = logic.get_seqs_bfr_aft_trgt_idx(sorted_mut_dict,
                                                  INITIAL_MAIN)

    util.make_excel(WORK_DIR + "analyze_hg19", sorted_mut_dict, result_dict)
Example #29
def main_20201127():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    logic = Logic.Logics()

    df = util.read_excel_to_df(WORK_DIR + IN + IN_EXCEL, SHEET_NAME)
    len_df = len(df[df.columns[0]])

    result_list = []
    for i in range(len_df):
        cnt = 0
        ori_seq = df.loc[i][0]
        n_of_mismatch = int(df.loc[i][1])
        n_of_sub_seq = int(df.loc[i][2])

        bp_f = ori_seq[:ORI_SEQ_STRCTR[0]]
        bf_spacer_n_fr_ngg = ori_seq[ORI_SEQ_STRCTR[0]:ORI_SEQ_STRCTR[1]]
        gg_fr_ngg = ori_seq[ORI_SEQ_STRCTR[1]:ORI_SEQ_STRCTR[2]]
        bf_rtt_only = ori_seq[ORI_SEQ_STRCTR[2]:ORI_SEQ_STRCTR[3]]
        bp_b = ori_seq[ORI_SEQ_STRCTR[3]:]

        bf_mm_seq = bf_spacer_n_fr_ngg + bf_rtt_only

        idx_set = logic_prep.make_seq_idx_set(0, len(bf_mm_seq))
        mm_seq_set = set()
        # loop until enough distinct mismatched sequences are collected
        while len(mm_seq_set) < n_of_sub_seq:
            # random.sample needs a sequence (set sampling was removed in Python 3.11)
            mm_idx_list = random.sample(sorted(idx_set), n_of_mismatch)
            af_mm_seq = bf_mm_seq
            for j in mm_idx_list:
                tmp_set = BASE_NT - {bf_mm_seq[j].lower()}
                af_mm_seq = logic.swap_char_in_string(
                    af_mm_seq, j,
                    random.sample(sorted(tmp_set), 1)[0])
            mm_seq_set.add(af_mm_seq)
        af_mm_seq_list = list(mm_seq_set)
        for tmp_seq in af_mm_seq_list[:n_of_sub_seq]:
            if cnt == 0:
                result_list.append([
                    ori_seq, bp_f + tmp_seq[:len(bf_spacer_n_fr_ngg)] +
                    gg_fr_ngg + tmp_seq[len(bf_spacer_n_fr_ngg):] + bp_b,
                    n_of_mismatch
                ])
            else:
                result_list.append([
                    '', bp_f + tmp_seq[:len(bf_spacer_n_fr_ngg)] + gg_fr_ngg +
                    tmp_seq[len(bf_spacer_n_fr_ngg):] + bp_b, n_of_mismatch
                ])
            cnt += 1

    util.make_excel(WORK_DIR + OU + SHEET_NAME + '_result',
                    ['ori_seq', 'sub_seq', '#_of_mismatch'], result_list)
Example #30
    def sort_n_merge_by_chr(self, init_merge, init_be):
        ref_path = init_merge[0]
        cdf_file = init_merge[1]
        a_or_c_idx = init_merge[2]
        a_c_rule = init_merge[3]
        work_dir = init_merge[4]
        top_n = init_merge[5]

        logic_prep = LogicPrep.LogicPreps()
        util = Util.Utils()

        trgt_seq_dict = logic_prep.get_target_seq_with_clvg_site(
            ref_path + cdf_file, init_be)
        chr_dict = logic_prep.target_seq_with_clvg_site_group_by_chromosome(
            trgt_seq_dict, "primary_assembly:ASM275486v1:")
        a_c_dict = self.filter_out_by_ACGTU_rule(chr_dict, a_or_c_idx,
                                                 a_c_rule)

        abe_score_dict = logic_prep.get_deep_base_ed_score(
            work_dir + "deep_ABE/ABE_Efficiency.txt")
        cbe_score_dict = logic_prep.get_deep_base_ed_score(
            work_dir + "deep_CBE/CBE_Efficiency.txt")
        cs9_score_dict = logic_prep.get_deep_cas9_tupl(
            work_dir + "deep_cas_9/", "RANK_final_DeepCas9_Final.txt",
            "sample.txt")

        top_n_abe_list = []
        top_n_cbe_list = []
        for chr_key, trnscrpt_list in a_c_dict.items():
            result_list = []
            result_list = logic_prep.merge_cas9_abe_cbe_to_list(
                chr_key, [
                    trnscrpt_list, abe_score_dict, cbe_score_dict,
                    cs9_score_dict
                ], result_list)

            sort_by_abe_list = logic_prep.sort_by_idx_element(
                result_list, -2, [])
            sort_by_cbe_list = logic_prep.sort_by_idx_element(
                result_list, -1, [])
            # extend the TOP N lists (top_n_abe_list, top_n_cbe_list);
            # TODO: filter out the same context seq appearing in different transcripts
            top_n_abe_list.extend(sort_by_abe_list[:top_n])
            top_n_cbe_list.extend(sort_by_cbe_list[:top_n])

        util.make_excel_after_sorting(
            work_dir + "merge_cas9_abe_cbe_top_N/merge_by_ABE_top_" +
            str(top_n), top_n_abe_list, init_be)
        util.make_excel_after_sorting(
            work_dir + "merge_cas9_abe_cbe_top_N/merge_by_CBE_top_" +
            str(top_n), top_n_cbe_list, init_be)