# Beispiel #1 (Example 1), score 0 — scrape-site marker; the snippet below is
# the truncated interior of a CLI entry point from another file.
    import trans

    if args.ncores is not None:
        import torch
        torch.set_num_threads(args.ncores)

    logging.info('Loading tables.')
    df_pheno = load_phenotype(args.phenotype_table)
    if args.covariate_table is not None:
        df_covar = load_covariate(args.covariate_table, args.covariate_yaml)
        indiv_lists = [df_covar.indiv.to_list(), df_pheno.indiv.to_list()]
    else:
        df_covar = None
        indiv_lists = [df_pheno.indiv.to_list()]
    if args.individual_list is not None:
        indiv_lists.append(load_list(args.individual_list))
    indiv_list = take_intersect(indiv_lists)

    if args.individual_list_exclude is not None:
        indiv_list = exclude_b_from_a(a=indiv_list,
                                      b=load_list(
                                          args.individual_list_exclude))

    indiv_list = sorted(indiv_list)

    df_pheno = rearrange_rows(df_pheno, indiv_list)
    df_pheno = transpose_df(df_pheno, col='indiv')
    if df_covar is not None:
        df_covar = rearrange_rows(df_covar, indiv_list)
        df_covar.set_index('indiv', inplace=True)
    logging.info('There are {} individauls being included.'.format(
# Beispiel #2 (Example 2), score 0 — scrape-site marker; the snippet below is
# the truncated interior of another script's main body.
    logging.basicConfig(level=logging.INFO,
                        stream=sys.stderr,
                        format='%(asctime)s  %(message)s',
                        datefmt='%Y-%m-%d %I:%M:%S %p')

    from tqdm import tqdm
    from pyutil import load_list, intersection

    if args.rename_yaml is not None:
        from pyutil import read_yaml
        rename_dict = read_yaml(args.rename_yaml)
    else:
        rename_dict = None

    out = []
    model_files = load_list(args.model_list)
    logging.info('There are {} model files to load.'.format(len(model_files)))
    for model_path in tqdm(model_files):
        model = load_perf(model_path)
        out.append(model)
    out = pd.concat(out, axis=0)

    # clean rename dict so that it only contains keys occurring in out
    rename_dict_new = {}
    for k in rename_dict.keys():
        if k in list(out.phenotype):
            rename_dict_new[k] = rename_dict[k]
    rename_dict = rename_dict_new

    new_pheno = []
    for i in range(out.shape[0]):
    input h5
''')
parser.add_argument('--gene_list', help='''
    path to gene list
''')
parser.add_argument('--output', default=0, help='''
    output csv
''')
args = parser.parse_args()

import h5py
import numpy as np
import pandas as pd
from pyutil import load_list

gene_list = load_list(args.gene_list)
with h5py.File(args.input, 'r') as f:
    genes = f['genes'][:].astype(str)
    gene_idxs = []
    gene_list_new = []
    for gene_id in gene_list:
        gene_idx = np.where(np.isin(genes, gene_id))[0]
        if len(gene_idx) == 0:
            continue
        else:
            gene_idxs.append(gene_idx)
            gene_list_new.append(gene_id)
    gene_idxs = np.concatenate(gene_idxs, axis=0)
    gene_list = gene_list_new

df_tmp = pd.DataFrame({'idx': gene_idxs, 'gene': gene_list})
        A plink BIM file.
    ''')
    parser.add_argument('--output', help='''
        Output SNP list.
    ''')
    args = parser.parse_args()

    import logging, time, sys, os
    # configing util
    logging.basicConfig(level=logging.INFO,
                        stream=sys.stderr,
                        format='%(asctime)s  %(message)s',
                        datefmt='%Y-%m-%d %I:%M:%S %p')

    import pandas as pd
    from pyutil import load_list, intersection, write_list
    snp_list = load_list(args.input)
    df_bim = pd.read_csv(args.input_bim, sep='\s+', header=None)
    df_bim.columns = ['chr', 'rsid', 'placeholder', 'pos', 'a1', 'a2']

    df_bim = filter_out_ambiguious_snps(df_bim)
    snp_list = intersection(snp_list, list(df_bim.rsid))

    # before output
    # I found that there are some SNPs with ID like: AFFX-SNP_XXX__rsYYY or SNP_A-ZZZ
    # I cleaned them up by removing all SNPs with SNP_A-ZZZ as ID and
    # keep rsYYY from AFFX-SNP_XXX__rsYYY
    snp_list = clean_up_rsid(snp_list)

    write_list(snp_list, args.output)
# Beispiel #5 (Example 5), score 0 — scrape-site marker; the snippet below is
# the truncated interior of a GRM/mixed-model script's main body.
    import pyemma

    if args.grm_cache is not None:
        if args.reml is True:
            grm_cache = args.grm_cache + '.reml.pkl.gz'
        else:
            grm_cache = args.grm_cache + '.mle.pkl.gz'
    else:
        grm_cache = None

    logging.info('Loading phenotype table.')
    df_y = read_table(args.y_table[0], indiv_col=args.y_table[1])
    pheno_indiv = df_y.indiv.to_list()
    if args.y_list is not None:
        y_list = load_list(args.y_list)
        df_y = df_y[['indiv'] + y_list]
    pheno_list = df_y.columns[1:].to_list()

    if grm_cache is None or not file_exists(grm_cache):
        logging.info('Loading GRM from scratch.')
        grm, grm_indiv = pyemma.load_grm(args.grm + '.grm.gz',
                                         args.grm + '.grm.id')
        common_indiv = intersection(grm_indiv, pheno_indiv)
        grm, indiv = subset_grm(grm, grm_indiv, common_indiv)
        ymat = subset_y(df_y, indiv)

        logging.info('Starting GRM EVD.')

        eig_val, eig_vec = pyemma.pyemma_mle_mat_fac(grm)
        to_cache = {'vec': eig_vec, 'val': eig_val, 'indiv': indiv}