def get_gse_gsm_dict(fn, gsm_raw_dict, gsm_dict):
    """Return a dict mapping every series id to the list of samples citing it.

    The result is cached on disk at ``f'{fn}.pkl'``: when that file exists it
    is loaded with ``load_table_dict_pkl`` and returned as-is; otherwise the
    mapping is built and written with ``save_table_dict_pkl``.

    Args:
        fn: Cache path without the ``.pkl`` extension.
        gsm_raw_dict: Column-oriented table; its ``'gsm'`` entry lists the
            sample ids and its ``"series_id"`` entry holds comma-separated
            series ids (one string per row).
        gsm_dict: Per-column lookup keyed by sample id;
            ``gsm_dict["series_id"][gsm]`` is the comma-separated series
            string of that sample.

    Returns:
        ``{series_id: [gsm, ...]}`` with samples appended in the order they
        appear in ``gsm_raw_dict['gsm']``.
    """
    # Read the sample list first so a missing 'gsm' column fails even when
    # a cache file is present.
    sample_ids = gsm_raw_dict['gsm']

    cache_path = f'{fn}.pkl'
    if os.path.isfile(cache_path):
        return load_table_dict_pkl(cache_path)

    # Every distinct series id mentioned anywhere in the raw table.
    series_ids = {
        series_id
        for joined in gsm_raw_dict["series_id"]
        for series_id in joined.split(',')
    }

    # Start each series with its own empty list, then distribute the samples.
    samples_by_series = {series_id: [] for series_id in series_ids}
    for sample_id in sample_ids:
        for series_id in gsm_dict["series_id"][sample_id].split(','):
            samples_by_series[series_id].append(sample_id)

    save_table_dict_pkl(cache_path, samples_by_series)
    return samples_by_series
def get_raw_dict(fn):
    """Load the raw table stored under the base path ``fn``.

    A pickle at ``f'{fn}.pkl'`` is preferred when it exists. Otherwise the
    table is read from ``f'{fn}.xlsx'`` with ``load_table_dict_xlsx`` and a
    pickle copy is written next to it via ``save_table_dict_pkl``.

    Args:
        fn: Path of the table without a file extension.

    Returns:
        Whatever ``load_table_dict_pkl`` / ``load_table_dict_xlsx`` return
        for that table.
    """
    pickle_path = f'{fn}.pkl'
    if os.path.isfile(pickle_path):
        return load_table_dict_pkl(pickle_path)

    # No cache yet: read the spreadsheet, then store a pickle copy.
    raw_table = load_table_dict_xlsx(f'{fn}.xlsx')
    save_table_dict_pkl(pickle_path, raw_table)
    return raw_table
def get_gsm_dict(fn, gsm_raw_dict):
    """Re-key every column of the raw table by sample id.

    The result is cached at ``f'{fn}.pkl'``; an existing cache file is loaded
    with ``load_table_dict_pkl`` and returned without rebuilding.

    Args:
        fn: Cache path without the ``.pkl`` extension.
        gsm_raw_dict: Column-oriented table whose ``'gsm'`` entry lists the
            sample ids; every column is indexed by row position.

    Returns:
        ``{column: {gsm: value}}`` where ``value`` is the column entry at the
        row position of ``gsm`` in ``gsm_raw_dict['gsm']``.
    """
    # Read the sample list first so a missing 'gsm' column fails even when
    # a cache file is present.
    sample_ids = gsm_raw_dict['gsm']

    cache_path = f'{fn}.pkl'
    if os.path.isfile(cache_path):
        return load_table_dict_pkl(cache_path)

    by_column = {}
    for column_name in tqdm(gsm_raw_dict, desc='gsm_dict processing'):
        column = gsm_raw_dict[column_name]
        # Positional indexing (not zip) so a column shorter than the sample
        # list still raises instead of being silently truncated.
        by_column[column_name] = {
            sample_id: column[row] for row, sample_id in enumerate(sample_ids)
        }

    save_table_dict_pkl(cache_path, by_column)
    return by_column
passed_words.append(word) return passed_words GPL = '21145' suffix = '22_09_20' gsm_key = 'gsm' gse_key = 'series_id' source_key = 'source_name_ch1' characteristics_key = 'characteristics_ch1' fn_xlsx = f'{get_data_path()}/GPL{GPL}/GPL{GPL}_gsm_table_{suffix}.xlsx' fn_pkl = f'{get_data_path()}/GPL{GPL}/GPL{GPL}_gsm_table_{suffix}.pkl' if os.path.isfile(fn_pkl): gsm_raw_dict = load_table_dict_pkl(fn_pkl) else: gsm_raw_dict = load_table_dict_xlsx(fn_xlsx) save_table_dict_pkl(fn_pkl, gsm_raw_dict) gsms = gsm_raw_dict[gsm_key] fn = f'{get_data_path()}/GPL{GPL}/bad_words.txt' f = open(fn) bad_words = set(f.read().splitlines()) f.close() fn = f'{get_data_path()}/GPL{GPL}/target_chs.txt' f = open(fn) target_chs = set(f.read().splitlines()) f.close()