# more on args
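# if no index is given, assume the '.idx2' file sitting next to the BGEN
# (the extension written by Hail's hl.index_bgen); this default is an assumption
# about how the pipeline's index files are laid out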
if args.bgen_index is None:
    args.bgen_index = args.bgen_path + '.idx2'

# some hail environment logging before run
logging.info('echo $PYSPARK_SUBMIT_ARGS')
os.system('echo $PYSPARK_SUBMIT_ARGS')

# initialize hail
logging.info('Initialize hail')
hl.init(log=args.hail_log)

# read in GWAS sum stats and clumped variants
logging.info('Read subset and GWAS / LD-clumping YAML')
myinputs = gwas_helper.read_yaml(args.subset_and_gwas,
                                 args.google_cloud_project)
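# `myinputs` is a nested dict keyed by subset name, each entry carrying a 'GWASs'
# mapping (see the loop below); a minimal sketch of the assumed YAML layout:
#   subset1:
#     GWASs:
#       trait_a: <per-GWAS entry>
#       trait_b: <per-GWAS entry>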

if args.gwas_ht is None:
    gwas_ht = None
else:
    logging.info('Read GWAS hail Table')
    gwas_ht = hl.read_table(args.gwas_ht)
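    # key by rsid, repartition, and cache so downstream lookups against the GWAS table are fast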
    gwas_ht = gwas_ht.key_by('rsid')
    gwas_ht = gwas_ht.repartition(40)
    gwas_ht = gwas_ht.cache()

if args.mode != 'skip_subset':
    ## collect clump variant files
    clump_var_files = []
    for i in list(myinputs.keys()):
        for j in list(myinputs[i]['GWASs'].keys()):
            # gather the clump-variant entry listed for this subset/GWAS pair
            # (the exact layout of each GWAS entry is an assumption)
            clump_var_files.append(myinputs[i]['GWASs'][j])
logging.info('Loading trait table')
filename, trait_colname = parse_input(args.trait_table, 'eid')
df_trait = ghelper.tsv_to_pd_df(filename, indiv_col = trait_colname)
indiv_pool = np.array(df_trait[trait_colname].to_list())
logging.info('--> Current sample size = {}'.format(indiv_pool.shape[0]))

# read in covariate table
logging.info('Loading covariate table')
filename, covar_colname = parse_input(args.covar_table, 'eid')
df_covar = ghelper.tsv_to_pd_df(filename, indiv_col = covar_colname)
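# keep only individuals present in both the trait and covariate tables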
indiv_pool = np.intersect1d(indiv_pool, df_covar[covar_colname].to_list())
logging.info('--> Current sample size = {}'.format(indiv_pool.shape[0]))

# read in predictor matrix
logging.info('Loading predictor matrices; looping over all inputs')
predictor_tables_dic = gwas_helper.read_yaml(args.predictor_table_yaml)  # args.predictor_table_list.split('::')
logging.info('Predictor table YAML content: {}'.format(predictor_tables_dic))
predictor_table_keys = list(predictor_tables_dic.keys())
ntotal = len(predictor_table_keys)
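# each YAML value is the middle piece of a predictor-table path, assembled below as
# predictor_table_prefix + value + predictor_table_suffix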
if args.mode == 'naive':
    piled_pred_expr = None
    for i in range(ntotal):
        filename = args.predictor_table_prefix + predictor_tables_dic[predictor_table_keys[i]] + args.predictor_table_suffix
        colname = args.predictor_gene_column
        df_pred_expr = ghelper.tsv_to_pd_df(filename, indiv_col = colname)
        df_pred_expr = df_pred_expr.drop(columns = [colname])
        if piled_pred_expr is None:
            piled_pred_expr = df_pred_expr
        else:
            piled_pred_expr = pd.concat((piled_pred_expr, df_pred_expr), ignore_index = True)
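    # in naive mode, piled_pred_expr now holds every predictor table stacked row-wise,
    # with the args.predictor_gene_column column dropped from each table before stacking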

# set up logging
logging.basicConfig(level=logging.INFO,
                    stream=sys.stderr,
                    format='%(asctime)s  %(message)s',
                    datefmt='%Y-%m-%d %I:%M:%S %p')

# some hail environment logging before run
logging.info('echo $PYSPARK_SUBMIT_ARGS')
os.system('echo $PYSPARK_SUBMIT_ARGS')

# initialize hail
logging.info('Initialize hail')
hl.init()

# load phenotypes and covariates
logging.info('Start loading phenotypes and covariates (the full table)')
pheno_covar_dic = helper.read_yaml(args.pheno_covar_yaml)
covar_names = pheno_covar_dic['covar_names']  # 'age_recruitment,sex,pc1,pc2'
pheno_names = pheno_covar_dic['pheno_names']  # 'ht,mcv,mch'
indiv_id = pheno_covar_dic['indiv_id']  # 'eid'
int_names = pheno_covar_dic['int_names']  # 'age_recruitment,sex'
str_names = pheno_covar_dic['str_names']  # 'eid'
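# for reference, a pheno_covar_yaml matching the fields above might look like this
# (values taken from the inline examples; the exact file layout is an assumption):
#   covar_names: age_recruitment,sex,pc1,pc2
#   pheno_names: ht,mcv,mch
#   indiv_id: eid
#   int_names: age_recruitment,sex
#   str_names: eid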
logging.info('--> Read in CSV file as DataFrame')
tstart = time.time()
covar, trait = hail_helper.read_and_split_phenotype_csv(
    args.pheno_covar_csv,
    pheno_names=pheno_names,
    covar_names=covar_names,
    indiv_id=indiv_id,
    int_names=int_names,
    str_names=str_names)
tend = time.time()