def get_logreg_params(self, pval_thresh=5e-2, hq_mods=pd.Series(dtype=float)):
    """Collect the significant PCA components for each high-quality model sample.

    pval_thresh: p-value cutoff for calling a logistic-regression PCA component
        significant. The intercept ("const") term is always excluded.
    hq_mods: pandas Series of information-criterion values for the high-quality
        models (index: sampled map ids; name: e.g. "BIC_<pheno_id>"). Falls back
        to self.hq_models when empty.
    """
    signif_pca_comps = {}
    if self.hq_models.empty and hq_mods.empty:
        print("Pass hq_mods or run get_hq_samples() first")
        return None
    elif not hq_mods.empty:
        self.hq_models = hq_mods
    # e.g. "BIC_isoniazid" -> "isoniazid"
    pheno_id = "_".join(self.hq_models.name.split("_")[1:])
    for sampled_map_num, ic_val in self.hq_models.items():
        landscape_sample_num = sampled_map_num.split("_")[-1]
        sample_id = "sample_" + landscape_sample_num + "_map_assess.json"
        landscape_assess_sample_file = self.assess_file_loc + sample_id
        if path.exists(landscape_assess_sample_file):
            landscape_assess_sample = load_json_obj(landscape_assess_sample_file)
            # nan p-values are treated as non-significant (1.0).
            pval_dict = ast.literal_eval(landscape_assess_sample["p_values_" + pheno_id].replace("nan", "1.0"))
            coef_dict = ast.literal_eval(landscape_assess_sample["coefs_" + pheno_id])
            comp_dict = ast.literal_eval(landscape_assess_sample["PCA_comp_dict_" + pheno_id])
            # rename PCA components (0 -> "x1", 1 -> "x2", ...) to match the regression term names.
            comp_dict = {"x" + str(k + 1): v for k, v in comp_dict.items()}
            signif_pca_comps[sampled_map_num] = {}
            for pca_comp, p_val in pval_dict.items():
                if p_val < pval_thresh and pca_comp != "const":
                    signif_pca_comps[sampled_map_num].update({pca_comp: {
                        "p_val": p_val, "coef": coef_dict[pca_comp],
                        "pca_load": comp_dict[pca_comp]}})
    self.signif_pca = signif_pca_comps
    return signif_pca_comps
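
# Usage sketch for get_logreg_params (object and value names are hypothetical;
# the enclosing class is not shown in this excerpt). Assumes get_hq_samples()
# stored a Series of BIC values named e.g. "BIC_isoniazid", indexed by sampled map:
#
#   ens.get_hq_samples()                      # hypothetical; populates ens.hq_models
#   signif = ens.get_logreg_params(pval_thresh=5e-2)
#   # signif -> {"sampled_map_3": {"x1": {"p_val": 0.01, "coef": 1.7,
#   #                                     "pca_load": {...}}, ...}, ...}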
def load_samples_assess_df(ENSEMBLE_MAP_ASSESS, pheno_list):
    ### -------------- LOAD 2 -----------------
    print("...loading SAMPLES_ASSESS_DF to identify minimum BIC or AIC MNCs")
    onlyfiles = [f for f in listdir(ENSEMBLE_MAP_ASSESS)
                 if os.path.isfile(os.path.join(ENSEMBLE_MAP_ASSESS, f))]
    onlyfiles = [f for f in onlyfiles if f != ".DS_Store"]
    samplesAfter = [f for f in onlyfiles if "sample_" in f]
    ### Options for what we want in SAMPLES_ASSESS_DF
    wanted_keys = []
    for pheno_id in pheno_list:
        wanted_keys.extend(["AIC_" + pheno_id, "BIC_" + pheno_id])
    SAMPLES_ASSESS_DF = {}
    for landscape_sample_name in tqdm(samplesAfter):
        landscape_sample_num = landscape_sample_name.split("_")[1]
        sample_id = "sampled_map_" + str(landscape_sample_num)
        landscape_assess_sample_file = ENSEMBLE_MAP_ASSESS + landscape_sample_name
        if os.path.exists(landscape_assess_sample_file):
            landscape_assess_sample = load_json_obj(landscape_assess_sample_file)
            SAMPLES_ASSESS_DF[sample_id] = {}
            SAMPLES_ASSESS_DF[sample_id].update(
                dict((k, landscape_assess_sample[k])
                     for k in wanted_keys if k in landscape_assess_sample))
    # transform to a pandas DataFrame
    SAMPLES_ASSESS_DF = pd.DataFrame.from_dict(SAMPLES_ASSESS_DF, orient="index")
    print("\t... SAMPLES_ASSESS_DF shape: (samples: %d, assess_cols: %d)" %
          (SAMPLES_ASSESS_DF.shape[0], SAMPLES_ASSESS_DF.shape[1]))
    return SAMPLES_ASSESS_DF
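
# Usage sketch (directory name follows the ENSEMBLE_DIR convention used below;
# the phenotype is illustrative): load the per-sample information-criterion
# scores and pick the sampled model with the minimum BIC for a phenotype.
#
#   assess_df = load_samples_assess_df(
#       "ens_strains375_alleles237_actions4/popfva_assessment/", ["isoniazid"])
#   best_sample = assess_df["BIC_isoniazid"].idxmin()   # e.g. "sampled_map_12"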
def get_sample_constraints(variant_dec_file):
    """Load the allele-constraint map for a particular model sample."""
    if path.exists(variant_dec_file):
        variant_dec_dict = load_json_obj(variant_dec_file)
    else:
        print("variant_dec_dict does not exist: ", variant_dec_file)
        variant_dec_dict = {}
    return variant_dec_dict
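
# Usage sketch (the file name is hypothetical; the JSON is assumed to map allele
# ids to the bound-change decision applied to them in that sampled model):
#
#   variant_dec = get_sample_constraints(POPFVA_SAMPLES_DIR + "sample_3_varDecision.json")
#   if variant_dec:
#       print(len(variant_dec), "allele constraints loaded")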
def load_landscape_sample(fva_landscape_file):
    """Load the popFVA landscape for a particular model sample."""
    fva_landscape_dict = load_json_obj(fva_landscape_file)
    obj_val_list = {}
    for strain_id, strain_fva_dict in fva_landscape_dict.items():
        obj_val_list[strain_id] = {}
        for rxn, max_min_dict in strain_fva_dict.items():
            obj_val_list[strain_id].update({
                rxn + "_max": float(format(max_min_dict["maximum"], '.10f')),
                rxn + "_min": float(format(max_min_dict["minimum"], '.10f'))})
    fva_landscape_df = pd.DataFrame.from_dict(obj_val_list, orient="index")
    return fva_landscape_df
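
# Illustrative input/output, inferred from the loop above: the popFVA JSON maps
# strain -> reaction -> {"maximum", "minimum"}, and each reaction becomes a
# "_max"/"_min" column pair (strain and reaction names here are made up).
#
#   fva_landscape_dict = {"strain_A": {"PFK": {"maximum": 7.2, "minimum": 0.1}}}
#   # -> fva_landscape_df: index ["strain_A"], columns ["PFK_max", "PFK_min"]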
def load_ensemble_data(self, STRAIN_NUM=375, ALLELE_NUM=237, ACTION_NUM=4, ADD_NO_CHANGE=False,
                       pheno_list=["ethambutol", "isoniazid", "rifampicin", "4-aminosalicylic_acid",
                                   "pyrazinamide", "ethionamide", "ofloxacin", "cycloserine"],
                       STANDARDIZE=False, FILTER_RXN_DIR=False, test_set=True):
    """Load the data describing a particular ensemble."""
    self.action_num = ACTION_NUM
    self.add_no_change = ADD_NO_CHANGE
    ENSEMBLE_DIR = "ens_strains" + str(STRAIN_NUM) + "_alleles" + str(ALLELE_NUM) + "_actions" + str(ACTION_NUM)
    if not path.exists(ENSEMBLE_DIR):
        raise ValueError('\t... directory "%s" does not exist' % (ENSEMBLE_DIR))
    else:
        print("dir ensemble: %s" % (ENSEMBLE_DIR))

    POPFVA_SAMPLES_DIR = ENSEMBLE_DIR + "/popfva_samples/"
    if not path.exists(POPFVA_SAMPLES_DIR):
        raise ValueError('\t... directory "%s" does not exist' % (POPFVA_SAMPLES_DIR))
    else:
        print("dir popfva samples: %s" % (POPFVA_SAMPLES_DIR))
    self.popfva_file_loc = POPFVA_SAMPLES_DIR

    ### Create folders to save different types of sample assessments.
    if STANDARDIZE == True:
        ENSEMBLE_MAP_ASSESS = ENSEMBLE_DIR + "/popfva_assessment_std/"
        ENSEMBLE_MAP_ANOVA = ENSEMBLE_DIR + "/popfva_anova_std/"  ### Save ANOVA F-test enrichments.
    else:
        ENSEMBLE_MAP_ASSESS = ENSEMBLE_DIR + "/popfva_assessment/"
        ENSEMBLE_MAP_ANOVA = ENSEMBLE_DIR + "/popfva_anova/"
    if FILTER_RXN_DIR == True:
        ENSEMBLE_MAP_ASSESS = ENSEMBLE_DIR + "/popfva_assessment_rxnfilt_std" + str(STANDARDIZE) + "/"
        ENSEMBLE_MAP_ANOVA = ENSEMBLE_DIR + "/popfva_anova_rxnfilt_std" + str(STANDARDIZE) + "/"
    self.assess_file_loc = ENSEMBLE_MAP_ASSESS
    ENSEMBLE_MAP_COMPRESS = ENSEMBLE_DIR + "/popfva_compress/"  ### Save numpy array versions of landscapes.

    ### -------------- LOAD 1 -----------------
    print("(1) load COBRA_MODEL, base_flux_samples, pheno_to_data2d_dict, pheno_to_Y_dict ...")
    MODEL_SAMPLES_FILE = ENSEMBLE_DIR + "/base_flux_samples.csv"
    base_flux_samples = pd.read_csv(MODEL_SAMPLES_FILE, index_col=0)
    self.flux_samples = base_flux_samples
    ENSEMBLE_BASEMODEL_FILE = ENSEMBLE_DIR + "/base_cobra_model.json"
    COBRA_MODEL = load_json_model(ENSEMBLE_BASEMODEL_FILE)
    self.base_cobra_model = COBRA_MODEL

    ### Load in the genetic variant matrix and AMR phenotypes for each case.
    pheno_to_data2d_dict = {}
    pheno_to_Y_dict = {}
    ALLELE_PHENO_FILE = ENSEMBLE_DIR + "/allele_pheno_data/"
    for pheno_id in pheno_list:
        G_VARIANT_MATRIX_FILE = ALLELE_PHENO_FILE + "/allele_df_" + pheno_id + ".csv"
        PHENO_MATRIX_FILE = ALLELE_PHENO_FILE + "/pheno_df_" + pheno_id + ".csv"
        pheno_to_data2d_dict.update({pheno_id: pd.read_csv(G_VARIANT_MATRIX_FILE, index_col=0)})
        pheno_to_Y_dict.update({pheno_id: pd.read_csv(PHENO_MATRIX_FILE, index_col=0)[pheno_id]})  ## to make a Series
    self.x_allele_dict = pheno_to_data2d_dict
    self.y_pheno_dict = pheno_to_Y_dict
    self.pheno_list = pheno_list

    ### -------------- LOAD 2 -----------------
    print("(2) load SAMPLES_ASSESS_DF ...")
    onlyfiles = [f for f in listdir(ENSEMBLE_MAP_ASSESS) if path.isfile(path.join(ENSEMBLE_MAP_ASSESS, f))]
    onlyfiles = [f for f in onlyfiles if f != ".DS_Store"]
    if test_set == True:
        samplesAfter = [f for f in onlyfiles if "sample_" in f][:20]  # keep only 20 samples so test files stay small
    else:
        samplesAfter = [f for f in onlyfiles if "sample_" in f]
    wanted_keys = []
    ### Options for what we want in SAMPLES_ASSESS_DF are as follows
    ### (look in 02_ass_ensemble.py for more info):
    ### "AIC_", "BIC_", "prsquared_", "loglikelihood_", "LLR_pval_", "p_values_", "coefs_", "std_err_", "PCA_comp_dict_"
    for pheno_id in pheno_list:
        wanted_keys.extend(["AIC_" + pheno_id, "BIC_" + pheno_id, "prsquared_" + pheno_id,
                            "std_err_" + pheno_id, "loglikelihood_" + pheno_id,
                            "LLR_pval_" + pheno_id, "cv_score_mean_" + pheno_id])
    SAMPLES_ASSESS_DF = {}
    for landscape_sample_name in tqdm(samplesAfter):
        landscape_sample_num = landscape_sample_name.split("_")[1]
        sample_id = "sampled_map_" + str(landscape_sample_num)
        landscape_assess_sample_file = ENSEMBLE_MAP_ASSESS + landscape_sample_name
        if path.exists(landscape_assess_sample_file):
            landscape_assess_sample = load_json_obj(landscape_assess_sample_file)
            SAMPLES_ASSESS_DF[sample_id] = {}
            SAMPLES_ASSESS_DF[sample_id].update(
                dict((k, landscape_assess_sample[k]) for k in wanted_keys if k in landscape_assess_sample))
    # transform to a pandas DataFrame
    SAMPLES_ASSESS_DF = pd.DataFrame.from_dict(SAMPLES_ASSESS_DF, orient="index")
    print("\t... SAMPLES_ASSESS_DF shape: (samples: %d, assess_cols: %d)" %
          (SAMPLES_ASSESS_DF.shape[0], SAMPLES_ASSESS_DF.shape[1]))
    self.assess_df = SAMPLES_ASSESS_DF

    ### -------------- LOAD 3 -----------------
    print("(3) load SAMPLES_ANOVA_DICT ...")
    SAMPLES_ANOVA_DF = {}
    for pheno_id in pheno_list:
        SAMPLES_ANOVA_DF[pheno_id] = {}
    for landscape_sample_name in tqdm(samplesAfter):
        landscape_sample_num = landscape_sample_name.split("_")[1]
        sample_id = "sample_" + landscape_sample_num + "_map_anova.json"
        landscape_anova_sample_file = ENSEMBLE_MAP_ANOVA + sample_id
        if path.exists(landscape_anova_sample_file):
            landscape_anova_sample = load_json_obj(landscape_anova_sample_file)
            for pheno_id in pheno_list:
                SAMPLES_ANOVA_DF[pheno_id]["sampled_map_" + landscape_sample_num] = {}
                SAMPLES_ANOVA_DF[pheno_id]["sampled_map_" + landscape_sample_num].update(
                    landscape_anova_sample[pheno_id]["pVal"])
    print("\t... generating SAMPLES_ANOVA_DICT")
    SAMPLES_ANOVA_DICT = {}
    for pheno_id in tqdm(pheno_list):
        SAMPLES_ANOVA_DICT.update({pheno_id: pd.DataFrame.from_dict(SAMPLES_ANOVA_DF[pheno_id], orient="index")})
    self.anova_dict = SAMPLES_ANOVA_DICT

    ### -------------- LOAD 4 -----------------
    print("(4) load SAMPLES_AC_DF ...")
    allele_col_ids = [x for x in pheno_to_data2d_dict[pheno_list[0]].columns]
    SAMPLES_AC_DF = {}
    for landscape_sample_name in tqdm(samplesAfter):
        landscape_sample_num = landscape_sample_name.split("_")[1]
        sample_id = "sampled_map_" + str(landscape_sample_num)
        landscape_assess_sample_file = ENSEMBLE_MAP_ASSESS + landscape_sample_name
        if path.exists(landscape_assess_sample_file):
            landscape_assess_sample = load_json_obj(landscape_assess_sample_file)
            SAMPLES_AC_DF[sample_id] = {}
            SAMPLES_AC_DF[sample_id].update(
                dict((k, landscape_assess_sample[k]) for k in allele_col_ids if k in landscape_assess_sample))
    SAMPLES_AC_DF = pd.DataFrame.from_dict(SAMPLES_AC_DF, orient="index")
    print("\t... SAMPLES_AC_DF shape: (samples: %d, assess_cols: %d)" %
          (SAMPLES_AC_DF.shape[0], SAMPLES_AC_DF.shape[1]))
    self.constraint_df = SAMPLES_AC_DF
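
# Usage sketch (the object name is hypothetical; sizes match the defaults above):
#
#   ens.load_ensemble_data(STRAIN_NUM=375, ALLELE_NUM=237, ACTION_NUM=4,
#                          pheno_list=["isoniazid"], test_set=True)
#   ens.assess_df.head()          # per-sample regression scores (AIC, BIC, ...)
#   ens.anova_dict["isoniazid"]   # per-sample ANOVA p-values for popFVA features
#   ens.constraint_df.head()      # per-sample allele-constraint actions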
    print('\t... creating ensemble directory: ' + ENSEMBLE_DIR + "/")
    os.makedirs(ENSEMBLE_DIR + "/")
    print('\t... saving parameters to ensemble directory')
    with open(ENSEMBLE_DIR + '/mnc_ensemble_args.txt', 'w') as f:
        f.write('\n'.join(sys.argv[1:]))
    ### save to json
    args_dict = {
        "action_num": args.action_num,
        "nabound": args.add_na_bound,
        "popFVA_STANDARDIZE": args.popFVA_STANDARDIZE,
        "testsize": args.testsize
    }
    save_json_obj(args_dict, ENSEMBLE_DIR + "/mnc_ensemble_args.json")
else:
    exit_script = False
    args_dict = load_json_obj(ENSEMBLE_DIR + "/mnc_ensemble_args.json")
    if str(args_dict["nabound"]) != str(args.add_na_bound):
        print("--nabound argument is different!")
        exit_script = True
    if args_dict["action_num"] != args.action_num:
        print("--action_num argument is different!")
        exit_script = True
    if args_dict["popFVA_STANDARDIZE"] != args.popFVA_STANDARDIZE:
        print("--popfvascale argument is different!",
              args_dict["popFVA_STANDARDIZE"], str(args.popFVA_STANDARDIZE))
        exit_script = True
    if args_dict["testsize"] != args.testsize:
        print("--testsize argument is different!")
        exit_script = True
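
# Illustrative mnc_ensemble_args.json written above (values are hypothetical;
# the actual contents depend on the CLI arguments passed to this script):
#
#   {"action_num": 4, "nabound": false, "popFVA_STANDARDIZE": false, "testsize": 0.2}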
    '--bicthresh', type=int, dest='bic_threshold', default=10,
    help='Delta BIC determines the set of high-quality models to perform the analysis on '
         '(default: 10). See the Burnham and Anderson 2002 book on model selection for '
         'further information.')

### load args
args = parser.parse_args()
ENSEMBLE_DIR = args.mnc_dir
TESTSET = args.train_test
BIC_cutoff = args.bic_threshold
ensemble_args_dict = load_json_obj(ENSEMBLE_DIR + "/mnc_ensemble_args.json")
action_num = ensemble_args_dict["action_num"]  # 4
ADD_NA_BOUND = ensemble_args_dict["nabound"]  # False
STANDARDIZE_ = ensemble_args_dict["popFVA_STANDARDIZE"]  # False
print("action_num (%d), nabound (%s), standardize (%s)" % (action_num, str(ADD_NA_BOUND), str(STANDARDIZE_)))

SCALE_POPFVA_ = True
pval_threshold = 1.0
load_threshold = 0.0
fdr_correction = False
save_data = True

#### IMPORTANT: top_models = pd.read_csv(ENSEMBLE_DIR+"/tables/best_mncs_"+pheno_id+".csv", index_col=0)
#### will fail if the best_mncs_ file has not been generated beforehand!
#### TODO: write code for getting the list of best MNCs for each phenotype.

if TESTSET == False:
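
# Sketch of the delta-BIC filter implied by --bicthresh (variable names are
# hypothetical; assumes a SAMPLES_ASSESS_DF as returned by load_samples_assess_df).
# Keep every sampled model whose BIC lies within BIC_cutoff of the minimum BIC,
# following the Burnham and Anderson delta-IC convention:
#
#   bic = SAMPLES_ASSESS_DF["BIC_" + pheno_id]
#   hq_models = bic[(bic - bic.min()) <= BIC_cutoff].sort_values()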