# NOTE(review): chunk starts mid-line — 'import trans' looks truncated
# (transpose_df is used below; verify the full import statement upstream).
import trans

if args.ncores is not None:
    # Cap torch's intra-op thread pool to the requested core count.
    import torch
    torch.set_num_threads(args.ncores)

logging.info('Loading tables.')
df_pheno = load_phenotype(args.phenotype_table)

# Collect the individual lists whose intersection defines the analysis set.
if args.covariate_table is not None:
    df_covar = load_covariate(args.covariate_table, args.covariate_yaml)
    indiv_lists = [df_covar.indiv.to_list(), df_pheno.indiv.to_list()]
else:
    df_covar = None
    indiv_lists = [df_pheno.indiv.to_list()]
if args.individual_list is not None:
    indiv_lists.append(load_list(args.individual_list))
indiv_list = take_intersect(indiv_lists)
if args.individual_list_exclude is not None:
    indiv_list = exclude_b_from_a(
        a=indiv_list, b=load_list(args.individual_list_exclude))
# Sort for a deterministic individual ordering across all tables.
indiv_list = sorted(indiv_list)

# Align both tables to the final individual ordering.
df_pheno = rearrange_rows(df_pheno, indiv_list)
df_pheno = transpose_df(df_pheno, col='indiv')
if df_covar is not None:
    df_covar = rearrange_rows(df_covar, indiv_list)
    df_covar.set_index('indiv', inplace=True)

# FIX: typo in log message ('individauls' -> 'individuals').
# NOTE(review): original chunk ends mid-call after '.format(';
# len(indiv_list) is the presumed argument — confirm against the full file.
logging.info('There are {} individuals being included.'.format(
    len(indiv_list)))
# Configure logging: timestamped messages to stderr.
logging.basicConfig(
    level=logging.INFO, stream=sys.stderr,
    format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %I:%M:%S %p')

from tqdm import tqdm
from pyutil import load_list, intersection

# Optional phenotype-renaming map loaded from YAML.
if args.rename_yaml is not None:
    from pyutil import read_yaml
    rename_dict = read_yaml(args.rename_yaml)
else:
    rename_dict = None

# Load and stack per-model performance tables.
out = []
model_files = load_list(args.model_list)
logging.info('There are {} model files to load.'.format(len(model_files)))
for model_path in tqdm(model_files):
    model = load_perf(model_path)
    out.append(model)
out = pd.concat(out, axis=0)

# clean rename dict so that it only contains keys occurring in out
# FIX: guard against rename_dict being None (no --rename_yaml given);
# the original iterated rename_dict.keys() unconditionally and crashed.
if rename_dict is not None:
    # Hoist the membership set out of the loop (was `k in list(...)`
    # per key, i.e. an O(n*m) scan).
    phenotypes = set(out.phenotype)
    rename_dict = {k: v for k, v in rename_dict.items() if k in phenotypes}

new_pheno = []
# NOTE(review): original chunk ends at this loop header; the loop body
# lies beyond this view.
for i in range(out.shape[0]):
    ...
# NOTE(review): chunk starts mid-help-string; the preceding argument is
# presumably --input (args.input is read below) — verify upstream.
parser.add_argument('--input', help='''
    input h5
''')
parser.add_argument('--gene_list', help='''
    path to gene list
''')
# NOTE(review): default=0 for an output path looks suspicious — confirm
# whether a string default was intended.
parser.add_argument('--output', default=0, help='''
    output csv
''')
args = parser.parse_args()

import h5py
import numpy as np
import pandas as pd
from pyutil import load_list

gene_list = load_list(args.gene_list)

# Map each requested gene id to its row index(es) in the HDF5 'genes'
# dataset, silently dropping genes that are absent.
with h5py.File(args.input, 'r') as f:
    genes = f['genes'][:].astype(str)

gene_idxs = []
gene_list_new = []
for gene_id in gene_list:
    gene_idx = np.where(np.isin(genes, gene_id))[0]
    if len(gene_idx) == 0:
        continue
    gene_idxs.append(gene_idx)
    # FIX: repeat the gene id once per matched row so the two columns stay
    # the same length even if a gene occurs more than once in 'genes'
    # (the original appended it once and the DataFrame constructor would
    # raise on duplicates). Identical behavior when genes are unique.
    gene_list_new.extend([gene_id] * len(gene_idx))
# NOTE(review): np.concatenate raises on an empty list, i.e. when no
# requested gene is present in the h5 file.
gene_idxs = np.concatenate(gene_idxs, axis=0)
gene_list = gene_list_new
df_tmp = pd.DataFrame({'idx': gene_idxs, 'gene': gene_list})
# NOTE(review): chunk starts mid-help-string; presumably the --input_bim
# argument (args.input_bim is read below) — verify upstream.
parser.add_argument('--input_bim', help='''
    A plink BIM file.
''')
parser.add_argument('--output', help='''
    Output SNP list.
''')
args = parser.parse_args()

import logging, time, sys, os

# configing util
logging.basicConfig(
    level=logging.INFO, stream=sys.stderr,
    format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %I:%M:%S %p')

import pandas as pd
from pyutil import load_list, intersection, write_list

snp_list = load_list(args.input)
# FIX: raw string for the regex separator — '\s' is an invalid escape
# sequence in a plain string literal (DeprecationWarning, and a syntax
# error in future Python versions).
df_bim = pd.read_csv(args.input_bim, sep=r'\s+', header=None)
df_bim.columns = ['chr', 'rsid', 'placeholder', 'pos', 'a1', 'a2']
# Drop strand-ambiguous SNPs, then keep only SNPs present in the BIM.
df_bim = filter_out_ambiguious_snps(df_bim)
snp_list = intersection(snp_list, list(df_bim.rsid))

# before output
# I found that there are some SNPs with ID like: AFFX-SNP_XXX__rsYYY or SNP_A-ZZZ
# I cleaned them up by removing all SNPs with SNP_A-ZZZ as ID and
# keep rsYYY from AFFX-SNP_XXX__rsYYY
snp_list = clean_up_rsid(snp_list)

write_list(snp_list, args.output)
import pyemma

# Cache file for the GRM eigen-decomposition; the suffix records whether
# the REML or MLE code path produced it. No --grm_cache means no caching.
if args.grm_cache is None:
    grm_cache = None
else:
    suffix = '.reml.pkl.gz' if args.reml is True else '.mle.pkl.gz'
    grm_cache = args.grm_cache + suffix

logging.info('Loading phenotype table.')
df_y = read_table(args.y_table[0], indiv_col=args.y_table[1])
pheno_indiv = df_y.indiv.to_list()
if args.y_list is not None:
    # Keep only the requested phenotype columns (plus the indiv key).
    y_list = load_list(args.y_list)
    df_y = df_y[['indiv'] + y_list]
pheno_list = df_y.columns[1:].to_list()

# Recompute the eigen-decomposition unless a cache file already exists.
if grm_cache is None or not file_exists(grm_cache):
    logging.info('Loading GRM from scratch.')
    grm, grm_indiv = pyemma.load_grm(
        args.grm + '.grm.gz', args.grm + '.grm.id')
    # Restrict GRM and phenotypes to their shared individuals.
    common_indiv = intersection(grm_indiv, pheno_indiv)
    grm, indiv = subset_grm(grm, grm_indiv, common_indiv)
    ymat = subset_y(df_y, indiv)
    logging.info('Starting GRM EVD.')
    eig_val, eig_vec = pyemma.pyemma_mle_mat_fac(grm)
    to_cache = {'vec': eig_vec, 'val': eig_val, 'indiv': indiv}