def main():
    """Drive the guide-reference pipeline.

    Appends the strings "1" .. "29" to the module-level ``FILE_NAME_LIST``,
    reads the descriptions for those names, parses the GTF file into a CDS
    dictionary and hands that dictionary to ``Logic.Logics.get_guide_ref``.

    Side effects: ``FILE_NAME_LIST`` is extended in place on every call.
    """
    prep = LogicPrep.LogicsPrep()
    guide_logic = Logic.Logics()

    # Same 29 entries, in the same order, as appending one by one.
    FILE_NAME_LIST.extend(str(file_no) for file_no in range(1, 30))

    descriptions = prep.read_dat_file_for_description(
        WORK_DIR + CDS_EACH_FNAME, FILE_NAME_LIST)

    # The reader returns a pair; only the first element is used here.
    cds_by_key, _second_result = prep.read_gtf_file_by_line_to_dict(
        WORK_DIR + CDS_FNAME, descriptions)

    guide_logic.get_guide_ref(cds_by_key, WORK_DIR + SEQ_FNAME, INITIAL_SEQ)
def merge_off_target_total():
    """Filter every input workbook and export one combined ("total") result.

    Reads the initial off-target table, merges it with each workbook named
    in ``FILE_NAME_LIST``, applies every (Cas score option, filter option)
    combination through ``Logic.Logics.filter_out_by_rule`` while carrying
    ``result_dict`` from one call to the next, and finally writes one Excel
    file plus two tab-separated text files.

    Takes no arguments and returns nothing; all inputs come from
    module-level constants.

    NOTE(review): this definition was recovered from text whose indentation
    had been lost. The nesting below (exports executed once, after the
    per-file loop) is the reading suggested by the "total" label and by
    ``first_merge_dict`` / ``result_dict`` being initialised before the
    loop - confirm against the original file.
    """
    util = Util.Utils()
    logic_prep = LogicPrep.LogicsPrep()
    logic = Logic.Logics()

    off_trgt_dict = util.read_txt_to_dict(INITIAL_MRGE_OFF_TRGT)

    # Appends "1" .. "29" to the module-level list on every call, so a
    # second call (or an earlier main() call on the same list) leaves
    # duplicate names in it.
    for i in range(1, 30):
        FILE_NAME_LIST.append(str(i))

    seq_cnt_group_by_crpt_id = {}
    re_off_trgt_dict = {}
    first_merge_dict = {}
    result_dict = {}
    for f_num in FILE_NAME_LIST:
        print("starting with file [" + str(f_num) + "]")
        df_obj = util.read_excel_2_dataframe(WORK_DIR + CAS_OFF_EXCEL, f_num)
        first_merge_dict = logic_prep.merge_excel_n_off_trgt(
            df_obj, off_trgt_dict)

        # result_dict is threaded through every call, so it accumulates
        # across option combinations and across files.
        for cas_scr_opt in CAS_SCORE_OPT:
            for off_trg_opt_arr in FILTER_OUT_OPT:
                result_dict = logic.filter_out_by_rule(cas_scr_opt, off_trg_opt_arr, MAX_SEQ_NUM, first_merge_dict, result_dict)

    seq_cnt_group_by_crpt_id = util.make_excel_w_off_trgt(
        WORK_DIR + CAS_OFF_EXCEL + "off_trgt_", "total", result_dict, seq_cnt_group_by_crpt_id)

    # NOTE(review): with the nesting above, first_merge_dict here is the
    # merge result of the LAST file only - verify that this is intended.
    re_off_trgt_dict = logic.get_re_off_target_seq(MAX_SEQ_NUM, seq_cnt_group_by_crpt_id, first_merge_dict, re_off_trgt_dict)

    util.make_tab_txt_seq_cnt_group_by_crpt_id(
        WORK_DIR + CAS_OFF_EXCEL + "seq_cnt_group_by_crpt_id_", seq_cnt_group_by_crpt_id)
    util.make_tab_txt_re_off_target_seq(
        WORK_DIR + CAS_OFF_EXCEL + "re_off_target_seq_", re_off_trgt_dict)
def merge_off_target_indi(self, init_off_trgt, f_nm_list, wrk_dir, excel_path, max_num, filt_out_opt):
    """Filter each input workbook on its own and export per-file results.

    For every name in ``f_nm_list`` the workbook is read, merged with the
    initial off-target table, filtered with every (Cas score option,
    off-target option) combination, and written to its own Excel file.
    The sequence counts and the re-off-target dictionary are carried from
    file to file and written as tab-separated text once all files are done.

    Args:
        init_off_trgt: passed to ``Util.Utils.read_txt_to_dict``.
        f_nm_list: iterable of file names / numbers to process.
        wrk_dir: prefix concatenated with ``excel_path`` to build paths.
        excel_path: second part of the path prefix.
        max_num: forwarded to ``filter_out_by_rule`` and
            ``get_re_off_target_seq``.
        filt_out_opt: indexable; element 0 is iterated as the off-target
            options, element 1 as the Cas score options.

    Returns:
        None.
    """
    io_util = Util.Utils()
    prep = LogicPrep.LogicsPrep()

    off_target_rules, cas_score_rules = filt_out_opt[0], filt_out_opt[1]
    known_off_targets = io_util.read_txt_to_dict(init_off_trgt)

    seq_counts = {}
    re_off_targets = {}
    for file_no in f_nm_list:
        print("starting with file [" + str(file_no) + "]")

        sheet = io_util.read_excel_2_dataframe(wrk_dir + excel_path, file_no)
        merged = prep.merge_excel_n_off_trgt(sheet, known_off_targets)

        # Start from an empty result for each file; every rule combination
        # receives the result produced by the previous one.
        filtered = {}
        for cas_rule in cas_score_rules:
            for off_target_rule in off_target_rules:
                filtered = self.filter_out_by_rule(
                    cas_rule, off_target_rule, max_num, merged, filtered)

        seq_counts = io_util.make_excel_w_off_trgt(
            wrk_dir + excel_path + "off_trgt_", file_no, filtered, seq_counts)
        re_off_targets = self.get_re_off_target_seq(
            max_num, seq_counts, merged, re_off_targets)

        print("done with file [" + str(file_no) + "]\n")

    io_util.make_tab_txt_seq_cnt_group_by_crpt_id(
        wrk_dir + excel_path + "seq_cnt_group_by_crpt_id_", seq_counts)
    io_util.make_tab_txt_re_off_target_seq(
        wrk_dir + excel_path + "re_off_target_seq_", re_off_targets)
def get_guide_ref(self, cds_dict, path, init):
    """Build a guide-reference table per key of ``cds_dict`` and export it.

    For each key, the plus- and minus-strand lookups are loaded with
    ``LogicPrep.LogicsPrep.read_seq_dict(path, key, init)``. Every entry
    under that key that has a ``'CDS'`` list is walked segment by segment;
    each position from ``start + 1`` to ``end`` (inclusive) that is present
    in a lookup produces one row. A position present in both lookups
    produces two rows, plus strand first. The rows of a key are written
    with ``Util.Utils.make_excel(rows, init, key)``.

    Args:
        cds_dict: mapping of key -> mapping of entries. Keys must be
            strings (they are concatenated into the progress message).
            An entry with a ``'CDS'`` list must also provide
            ``'Target gene name'``, ``'Ensembl transcript ID'``,
            ``'Ensembl Gene ID'`` and ``'Strand'``; ``'Description'`` is
            optional. Each CDS item is a space-separated string whose
            first three fields are start, end and exon number.
        path: forwarded to ``read_seq_dict``.
        init: forwarded to ``read_seq_dict`` and ``make_excel``.

    Returns:
        None. Results are written by ``make_excel``.

    Raises:
        KeyError / IndexError / ValueError: if an entry lacks a required
            key or a CDS / minus-strand string is malformed.
    """
    util = Util.Utils()
    logic_pre = LogicPrep.LogicsPrep()

    def guide_row(val_dict, pos, strand, context):
        """Return the row columns up to and including 'Strand'."""
        row = {
            'Target gene name': val_dict['Target gene name'],
            'Ensembl transcript ID': val_dict['Ensembl transcript ID'],
            'Ensembl Gene ID': val_dict['Ensembl Gene ID'],
        }
        if 'Description' in val_dict:
            row['Description'] = val_dict['Description']
        row['Position of Base After cut'] = pos
        if strand == '+':
            row['Target context sequence'] = context
        else:
            # Minus-strand lookup values hold two space-separated fields:
            # the sequence and its anti sequence.
            seq_pair = context.split(" ")
            row['Target context sequence'] = seq_pair[0]
            row['Target context anti sequence'] = seq_pair[1]
        row['Strand'] = strand
        return row

    # Row numbers keep increasing across keys; they are not reset per file.
    idx = 1
    for key, vals in cds_dict.items():
        result_dict = {}
        tmp_p_dict, tmp_m_dict = logic_pre.read_seq_dict(path, key, init)
        strand_lookups = (('+', tmp_p_dict), ('-', tmp_m_dict))

        for val_dict in vals.values():
            if 'CDS' not in val_dict:
                continue

            # Parse every segment once: (start, end, raw fields).
            segments = []
            for cds_seq in val_dict['CDS']:
                fields = cds_seq.split(" ")
                segments.append((int(fields[0]), int(fields[1]), fields))
            total_cds_len = sum(end - start + 1 for start, end, _ in segments)

            # CDS length accumulated up to and including the current segment.
            prent_cds_len = 0
            for start, end, fields in segments:
                prent_cds_len += end - start + 1

                for pos in range(start + 1, end + 1):
                    for strand, seq_by_pos in strand_lookups:
                        if pos not in seq_by_pos:
                            continue

                        row = guide_row(val_dict, pos, strand, seq_by_pos[pos])
                        row['Exon Number'] = fields[2]

                        # The position ratio over the total CDS depends on
                        # the strand the gene itself lies on, not on the
                        # strand of the guide.
                        if '+' == val_dict['Strand']:
                            this_len = prent_cds_len - (end - pos + 1)
                        else:
                            this_len = prent_cds_len - (pos - start + 1)
                        row['Ratio'] = this_len / total_cds_len

                        result_dict[idx] = row
                        idx += 1

        print("DONE file_" + key)
        util.make_excel(result_dict, init, key)