# more on args
if args.bgen_index is None:
    args.bgen_index = args.bgen_path + '.idx2'

# some hail environment logging before run
logging.info('echo $PYSPARK_SUBMIT_ARGS')
os.system('echo $PYSPARK_SUBMIT_ARGS')

# initialize hail
logging.info('Initialize hail')
hl.init(log=args.hail_log)

# read in GWAS sum stats and clumped variants
logging.info('Read subset and GWAS / LD-clumping YAML')
myinputs = gwas_helper.read_yaml(args.subset_and_gwas, args.google_cloud_project)

if args.gwas_ht is None:
    gwas_ht = None
else:
    logging.info('Read GWAS hail Table')
    gwas_ht = hl.read_table(args.gwas_ht)
    gwas_ht = gwas_ht.key_by('rsid')
    gwas_ht = gwas_ht.repartition(40)
    gwas_ht = gwas_ht.cache()

if args.mode != 'skip_subset':
    ## collect clump variants
    clump_var_files = []
    for i in list(myinputs.keys()):
        for j in list(myinputs[i]['GWASs'].keys()):
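# For context, a minimal sketch of what gwas_helper.read_yaml might look like,
# assuming it loads a YAML file either locally or from Google Cloud Storage when
# the path is a gs:// URI and a project is given; the signature mirrors the calls
# above, but the body is an assumption, not the actual helper.
import yaml

def read_yaml(path, google_cloud_project=None):
    '''Load a YAML file into a dict; fetch from GCS first if path is a gs:// URI.'''
    if path.startswith('gs://') and google_cloud_project is not None:
        from google.cloud import storage
        client = storage.Client(project=google_cloud_project)
        bucket_name, blob_name = path[len('gs://'):].split('/', 1)
        blob = client.bucket(bucket_name).blob(blob_name)
        return yaml.safe_load(blob.download_as_text())
    with open(path, 'r') as f:
        return yaml.safe_load(f)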
logging.info('Loading trait table')
filename, trait_colname = parse_input(args.trait_table, 'eid')
df_trait = ghelper.tsv_to_pd_df(filename, indiv_col=trait_colname)
indiv_pool = np.array(df_trait[trait_colname].to_list())
logging.info('--> Current sample size = {}'.format(indiv_pool.shape[0]))

# read in covariate table
logging.info('Loading covariate table')
filename, covar_colname = parse_input(args.covar_table, 'eid')
df_covar = ghelper.tsv_to_pd_df(filename, indiv_col=covar_colname)
indiv_pool = np.intersect1d(indiv_pool, df_covar[covar_colname].to_list())
logging.info('--> Current sample size = {}'.format(indiv_pool.shape[0]))

# read in predictor matrices
logging.info('Loading predictor matrices. Looping over all inputs')
predictor_tables_dic = gwas_helper.read_yaml(args.predictor_table_yaml)
print(predictor_tables_dic)
predictor_table_keys = list(predictor_tables_dic.keys())
ntotal = len(predictor_table_keys)

if args.mode == 'naive':
    # stack the predictor tables row-wise across all inputs
    piled_pred_expr = None
    for i in range(ntotal):
        filename = (args.predictor_table_prefix
                    + predictor_tables_dic[predictor_table_keys[i]]
                    + args.predictor_table_suffix)
        colname = args.predictor_gene_column
        df_pred_expr = ghelper.tsv_to_pd_df(filename, indiv_col=colname)
        df_pred_expr = df_pred_expr.drop(columns=[colname])
        if piled_pred_expr is None:
            piled_pred_expr = df_pred_expr
        else:
            piled_pred_expr = pd.concat((piled_pred_expr, df_pred_expr), ignore_index=True)
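# A minimal sketch of the ghelper.tsv_to_pd_df call used above, assuming it reads
# a (possibly gzipped) TSV into pandas and casts the individual-ID column to string
# so IDs can be matched across tables; the signature mirrors the usage here, but
# the body is an assumption.
import pandas as pd

def tsv_to_pd_df(filename, indiv_col='eid'):
    '''Read a TSV into a DataFrame with the individual-ID column as string.'''
    df = pd.read_csv(filename, sep='\t', compression='infer')
    df[indiv_col] = df[indiv_col].astype(str)
    return df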
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stderr,
    format='%(asctime)s %(message)s',
    datefmt='%Y-%m-%d %I:%M:%S %p')

# some hail environment logging before run
logging.info('echo $PYSPARK_SUBMIT_ARGS')
os.system('echo $PYSPARK_SUBMIT_ARGS')

# initialize hail
logging.info('Initialize hail')
hl.init()

# load phenotypes and covariates
logging.info('Start loading phenotypes and covariates (the full table)')
pheno_covar_dic = helper.read_yaml(args.pheno_covar_yaml)
covar_names = pheno_covar_dic['covar_names']  # e.g. 'age_recruitment,sex,pc1,pc2'
pheno_names = pheno_covar_dic['pheno_names']  # e.g. 'ht,mcv,mch'
indiv_id = pheno_covar_dic['indiv_id']        # e.g. 'eid'
int_names = pheno_covar_dic['int_names']      # e.g. 'age_recruitment,sex'
str_names = pheno_covar_dic['str_names']      # e.g. 'eid'

logging.info('--> Read in CSV file as data.frame')
tstart = time.time()
covar, trait = hail_helper.read_and_split_phenotype_csv(
    args.pheno_covar_csv,
    pheno_names=pheno_names,
    covar_names=covar_names,
    indiv_id=indiv_id,
    int_names=int_names,
    str_names=str_names)
tend = time.time()
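# A minimal sketch of hail_helper.read_and_split_phenotype_csv under the assumption
# that it reads the CSV with pandas, coerces the listed integer/string columns, and
# returns (covariate, trait) data frames keyed by the individual-ID column, with the
# comma-separated name strings split as suggested by the comments above; the actual
# helper may differ.
import pandas as pd

def read_and_split_phenotype_csv(csv_path, pheno_names, covar_names,
                                 indiv_id, int_names, str_names):
    '''Split one phenotype/covariate CSV into covariate and trait DataFrames.'''
    pheno_cols = pheno_names.split(',')
    covar_cols = covar_names.split(',')
    df = pd.read_csv(csv_path)
    for col in int_names.split(','):
        df[col] = df[col].astype(int)
    for col in str_names.split(','):
        df[col] = df[col].astype(str)
    covar = df[[indiv_id] + covar_cols]
    trait = df[[indiv_id] + pheno_cols]
    return covar, trait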