コード例 #1
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_eig_full_post_proc(cv_obj, src_path, cmd, cores=2, skip=False):
    """Format, and unless skipped run, a command template for the full data set.

    The value substituted for ``{eig_k}`` is the integer found on the first
    line of the file registered under ``cv_obj.full['eig']['eig.k']``.

    Args:
        cv_obj: Object whose ``full`` mapping holds an ``'eig'`` entry and a
            ``'samples'`` entry, and which exposes ``X_bin``, ``Y_file`` and
            ``Y_name``.
        src_path: Value substituted for ``{src_path}``.
        cmd: Command template using named ``str.format`` placeholders.
        cores: Value substituted for ``{cores}``.
        skip: When true, the command is formatted but not executed.

    Returns:
        Tuple ``(cmd, stdout, status)``; stdout is ``'SKIPPED'`` and status
        is ``0`` when the command was not executed.
    """
    assert 'eig' in cv_obj.full

    eig_files = cv_obj.full['eig']
    res_file = eig_files['eig.res']
    snps_file = eig_files['eig.snps']
    ev_file = eig_files['eig.ev']
    k_path = eig_files['eig.k']

    # Only the first line of the file is relevant: it holds the chosen k.
    with open(k_path, 'r') as k_handle:
        num_pcs = int(k_handle.readline().rstrip("\n"))

    full_cmd = cmd.format(
        src_path=src_path,
        eig_res=res_file,
        eig_snps=snps_file,
        eig_ev=ev_file,
        eig_k=num_pcs,
        geno_file=cv_obj.X_bin,
        pheno_file=cv_obj.Y_file,
        pheno_name=cv_obj.Y_name,
        samples=cv_obj.full['samples'],
        cores=cores,
    )

    if skip:
        return (full_cmd, 'SKIPPED', 0)
    out, status = run_cmd(full_cmd)
    return (full_cmd, out, status)
コード例 #2
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_eig_pca(cv_obj, eig_path, cmd, k=10, rep=None, fold=None, skip=False):
    """Format, and unless skipped run, a command template using the 'eig' PCA files.

    The file paths are taken from ``cv_obj.full`` when ``rep`` or ``fold`` is
    ``None``, otherwise from ``cv_obj.cv[rep][fold]``.

    Args:
        cv_obj: Object exposing ``full`` and ``cv[rep][fold]`` mappings, each
            holding an ``'eig'`` mapping of file paths.
        eig_path: Value substituted for ``{eig_path}``.
        cmd: Command template using named ``str.format`` placeholders.
        k: Value substituted for ``{eig_k}``.
        rep: Repetition key, or ``None`` for the full data set.
        fold: Fold key, or ``None`` for the full data set.
        skip: When true, the command is formatted but not executed.

    Returns:
        Tuple ``(rep, fold, cmd, stdout, status)``; stdout is ``'SKIPPED'``
        and status is ``0`` when the command was not executed.
    """
    # Both keys are needed to address a single fold; otherwise use the full set.
    use_full = rep is None or fold is None
    subset = cv_obj.full if use_full else cv_obj.cv[rep][fold]
    assert 'eig' in subset

    pca_file = subset['eig']['eig.pca']
    ev_file = subset['eig']['eig.ev']
    pca_log_file = subset['eig']['eig.pca.log']
    pca_plot_file = subset['eig']['eig.pca.plot']
    geno_file = subset['eig']['eig.geno']
    snps_file = subset['eig']['eig.snps']
    snps_rm_file = subset['eig']['eig.snps.rm']
    pheno_file = subset['eig']['eig.pheno']

    full_cmd = cmd.format(
        eig_path=eig_path,
        eig_k=k,
        eig_geno=geno_file,
        eig_snps=snps_file,
        eig_snps_rm=snps_rm_file,
        eig_pheno=pheno_file,
        eig_pca=pca_file,
        eig_ev=ev_file,
        eig_pca_log=pca_log_file,
        eig_pca_plot=pca_plot_file,
    )

    if skip:
        return (rep, fold, full_cmd, 'SKIPPED', 0)
    out, status = run_cmd(full_cmd)
    return (rep, fold, full_cmd, out, status)
コード例 #3
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_eig_post_proc(cv_obj, src_path, cmd, rep, fold, skip=False):
    """Format, and unless skipped run, a command template for one CV fold.

    The value substituted for ``{eig_k}`` is the integer found on the first
    line of the file registered under ``cv_obj.cv[rep][fold]['eig']['eig.k']``.

    Args:
        cv_obj: Object whose ``cv[rep][fold]`` mapping holds an ``'eig'``
            mapping of file paths and a ``'features_sel'`` entry.
        src_path: Value substituted for ``{src_path}``.
        cmd: Command template using named ``str.format`` placeholders.
        rep: Repetition key into ``cv_obj.cv``.
        fold: Fold key into ``cv_obj.cv[rep]``.
        skip: When true, the command is formatted but not executed.

    Returns:
        Tuple ``(rep, fold, cmd, stdout, status)``; stdout is ``'SKIPPED'``
        and status is ``0`` when the command was not executed.
    """
    assert 'eig' in cv_obj.cv[rep][fold]

    fold_entry = cv_obj.cv[rep][fold]
    res_file = fold_entry['eig']['eig.res']
    snps_file = fold_entry['eig']['eig.snps']
    ev_file = fold_entry['eig']['eig.ev']
    k_path = fold_entry['eig']['eig.k']
    selected_features = fold_entry['features_sel']

    # Only the first line of the file is relevant: it holds the chosen k.
    with open(k_path, 'r') as k_handle:
        num_pcs = int(k_handle.readline().rstrip("\n"))

    full_cmd = cmd.format(
        src_path=src_path,
        eig_res=res_file,
        eig_snps=snps_file,
        eig_ev=ev_file,
        eig_k=num_pcs,
        features_sel=selected_features,
    )

    if skip:
        return (rep, fold, full_cmd, 'SKIPPED', 0)
    out, status = run_cmd(full_cmd)
    return (rep, fold, full_cmd, out, status)
コード例 #4
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_eig_twstats(cv_obj,
                   eig_path,
                   cmd,
                   rep=None,
                   fold=None,
                   k_max=10,
                   alpha=0.05,
                   skip=False):
    """Format, and unless skipped run, a command template, then derive k from its output.

    The file paths are taken from ``cv_obj.full`` when ``rep`` or ``fold`` is
    ``None``, otherwise from ``cv_obj.cv[rep][fold]``. The p-value file is
    always ``<odir>/eig.pca.pv`` inside that subset's output directory.

    When the exit status is zero, ``cv_eig_twstats_k`` is called on the
    p-value file to obtain ``eig_k``. This includes the skipped case, where
    the status defaults to ``0``, so a skipped call expects the p-value file
    of an earlier run to be usable by ``cv_eig_twstats_k``.

    Args:
        cv_obj: Object exposing ``full`` and ``cv[rep][fold]`` mappings; the
            selected mapping must contain ``'eig'``, ``'other'`` (with key
            ``'eig_ev'``) and ``'odir'``.
        eig_path: Value substituted for ``{eig_path}``.
        cmd: Command template using named ``str.format`` placeholders.
        rep: Repetition key, or ``None`` for the full data set.
        fold: Fold key, or ``None`` for the full data set.
        k_max: Passed through to ``cv_eig_twstats_k``.
        alpha: Passed through to ``cv_eig_twstats_k``.
        skip: When true, the command is formatted but not executed.

    Returns:
        Tuple ``(rep, fold, files, cmd, stdout, status)`` where ``files`` is
        ``{'eig_pca_pv': <path>, 'eig_k': <k or None>}``; ``eig_k`` is
        ``None`` when the command finished with a non-zero status.
    """
    eig_k = None
    p_stdout = 'SKIPPED'
    p_status = 0

    if rep is None or fold is None:
        cv_obj_sub = cv_obj.full
    else:
        cv_obj_sub = cv_obj.cv[rep][fold]
    assert 'eig' in cv_obj_sub

    eig_ev = cv_obj_sub['other']['eig_ev']
    eig_pca_pv = path.join(cv_obj_sub['odir'], 'eig.pca.pv')
    cmd = cmd.format(eig_path=eig_path, eig_ev=eig_ev, eig_pca_pv=eig_pca_pv)

    if not skip:
        p_stdout, p_status = run_cmd(cmd)

    # The exit status is an integer elsewhere in this file (compared with 0),
    # so the former comparison with the string '0' never matched and eig_k
    # stayed None. The string form is still accepted for compatibility.
    if p_status in (0, '0'):
        eig_k = cv_eig_twstats_k(eig_pca_pv, k_max, alpha)

    return (rep, fold, {
        'eig_pca_pv': eig_pca_pv,
        'eig_k': eig_k
    }, cmd, p_stdout, p_status)
コード例 #5
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_preproc_X_gds(cv_obj,
                     cmd,
                     cmd_filter,
                     src_path,
                     rep=None,
                     fold=None,
                     skip=False):
    """Format, and unless skipped run, the feature pre-processing command for "vcf" input.

    Output file and sample list come from ``cv_obj.full`` (keys
    ``'features_pr'`` and ``'samples'``) when ``rep`` or ``fold`` is ``None``,
    otherwise from ``cv_obj.cv[rep][fold]`` (keys ``'features_pr'`` and
    ``'samples_train'``).

    Args:
        cv_obj: Object whose ``X_type`` must equal ``"vcf"`` and which
            exposes ``X_gds``, ``full`` and ``cv``.
        cmd: Command template using named ``str.format`` placeholders.
        cmd_filter: Value substituted for ``{proc_filter}``.
        src_path: Value substituted for ``{src_path}``.
        rep: Repetition key, or ``None`` for the full data set.
        fold: Fold key, or ``None`` for the full data set.
        skip: When true, the command is formatted but not executed.

    Returns:
        Tuple ``(rep, fold, cmd, stdout, status)``; stdout is ``'SKIPPED'``
        and status is ``0`` when the command was not executed.
    """
    assert cv_obj.X_type == "vcf", "Expected X type \"vcf\" but have %s" % cv_obj.X_type

    if rep is not None and fold is not None:
        fold_entry = cv_obj.cv[rep][fold]
        out_features = fold_entry['features_pr']
        sample_file = fold_entry['samples_train']
    else:
        out_features = cv_obj.full['features_pr']
        sample_file = cv_obj.full['samples']

    full_cmd = cmd.format(
        src_path=src_path,
        gds_file=cv_obj.X_gds,
        o_f=out_features,
        in_s=sample_file,
        proc_filter=cmd_filter,
    )

    if skip:
        return (rep, fold, full_cmd, 'SKIPPED', 0)
    out, status = run_cmd(full_cmd)
    return (rep, fold, full_cmd, out, status)
コード例 #6
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_convert_X_vcf(cmd, src_path, vcf_file, gds_file, bin_file, skip=False):
    """Format, and unless skipped run, the VCF conversion command template.

    Args:
        cmd: Command template with the named placeholders ``{src_path}``,
            ``{v_file}``, ``{g_file}`` and ``{b_file}``.
        src_path: Value substituted for ``{src_path}``.
        vcf_file: Value substituted for ``{v_file}``.
        gds_file: Value substituted for ``{g_file}``.
        bin_file: Value substituted for ``{b_file}``.
        skip: When true, the command is formatted but not executed.

    Returns:
        Tuple ``(cmd, stdout, status)``; stdout is ``'SKIPPED'`` and status
        is ``0`` when the command was not executed.
    """
    placeholders = {
        'src_path': src_path,
        'v_file': vcf_file,
        'g_file': gds_file,
        'b_file': bin_file,
    }
    full_cmd = cmd.format(**placeholders)

    if skip:
        return (full_cmd, 'SKIPPED', 0)
    out, status = run_cmd(full_cmd)
    return (full_cmd, out, status)
コード例 #7
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_fold_sum(cv_obj, src_path, model, rep, skip=False):
    """Build, and unless skipped run, the ``cv_fold_sum.R`` call for one repetition.

    The ``"pred.csv"`` entry of ``model`` is collected from every fold of
    ``cv_obj.cv[rep]``; the output file is ``<model>_rep<rep>_perf.csv``
    inside ``cv_obj.odir``.

    Args:
        cv_obj: Object exposing ``cv[rep]`` (mapping of folds) and ``odir``.
        src_path: Directory containing the ``utils`` scripts.
        model: Key of the model entry inside each fold mapping.
        rep: Repetition key; formatted with ``%d`` in the output file name.
        skip: When true, the command is built but not executed.

    Returns:
        Tuple ``(rep, cmd, stdout, status)``; stdout is ``'SKIPPED'`` and
        status is ``0`` when the command was not executed.
    """
    fold_predictions = []
    for fold_entry in cv_obj.cv[rep].values():
        fold_predictions.append(fold_entry[model]["pred.csv"])

    template = "Rscript {src_path}/utils/cv_fold_sum.R -pred_files {pred_files} -o_file {o_file} --plot --src_path {src_path}/utils --verbose"
    joined_predictions = ' '.join(fold_predictions)
    perf_file = path.join(cv_obj.odir, "%s_rep%d_perf.csv" % (model, rep))
    full_cmd = template.format(src_path=src_path,
                               pred_files=joined_predictions,
                               o_file=perf_file)

    if skip:
        return (rep, full_cmd, 'SKIPPED', 0)
    out, status = run_cmd(full_cmd)
    return (rep, full_cmd, out, status)
コード例 #8
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_model(cv_obj,
             src_path,
             model,
             model_cmd,
             model_params,
             rep=None,
             fold=None,
             skip=False):
    """Format, and unless skipped run, a model command for the full set or one fold.

    When ``rep`` or ``fold`` is ``None`` the paths come from ``cv_obj.full``
    and no ``samples_test`` value is supplied to the template. Otherwise they
    come from ``cv_obj.cv[rep][fold]`` and ``samples_test`` is supplied too.

    Args:
        cv_obj: Object exposing ``X_bin``, ``Y_file``, ``Y_name``, ``full``
            and ``cv``.
        src_path: Value substituted for ``{src_path}``.
        model: Value substituted for ``{o_bname}``.
        model_cmd: Command template using named ``str.format`` placeholders.
        model_params: Value substituted for ``{model_params}``.
        rep: Repetition key, or ``None`` for the full data set.
        fold: Fold key, or ``None`` for the full data set.
        skip: When true, the command is formatted but not executed.

    Returns:
        Tuple ``(rep, fold, cmd, stdout, status)``; stdout is ``'SKIPPED'``
        and status is ``0`` when the command was not executed.
    """
    # Placeholders shared by both variants of the command.
    fields = {
        'src_path': src_path,
        'x_file': cv_obj.X_bin,
        'y_file': cv_obj.Y_file,
        'y_pheno': cv_obj.Y_name,
        'o_bname': model,
        'model_params': model_params,
    }

    if rep is None or fold is None:
        full_set = cv_obj.full
        fields['o_dir'] = full_set['odir']
        fields['samples_train'] = full_set['samples']
        fields['features'] = full_set['features_sel']
    else:
        fold_entry = cv_obj.cv[rep][fold]
        fields['o_dir'] = fold_entry['odir']
        fields['samples_train'] = fold_entry['samples_train']
        # Test samples exist only for a fold, never for the full data set.
        fields['samples_test'] = fold_entry['samples_test']
        fields['features'] = fold_entry['features_sel']

    full_cmd = model_cmd.format(**fields)

    if skip:
        return (rep, fold, full_cmd, 'SKIPPED', 0)
    out, status = run_cmd(full_cmd)
    return (rep, fold, full_cmd, out, status)
コード例 #9
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_combi_sel_features(cv_obj, src_path, skip=False):
    """Build, and unless skipped run, the ``combine_sel_features.R`` call.

    The ``'features_sel'`` entry of every fold in every repetition of
    ``cv_obj.cv`` is passed as input; the output file is
    ``cv_obj.full['features_sel']``.

    Args:
        cv_obj: Object exposing ``cv`` (repetition -> fold -> mapping) and
            ``full``.
        src_path: Directory containing the ``utils`` scripts.
        skip: When true, the command is built but not executed.

    Returns:
        Tuple ``(cmd, stdout, status)``; stdout is ``'SKIPPED'`` and status
        is ``0`` when the command was not executed.
    """
    selected_files = [
        cv_obj.cv[rep][fold]['features_sel']
        for rep in cv_obj.cv.keys()
        for fold in cv_obj.cv[rep].keys()
    ]

    template = "Rscript {src_path}/utils/combine_sel_features.R -sel_features_files {i_files} -o_file {o_file} --src_path {src_path}/utils --verbose"
    full_cmd = template.format(
        src_path=src_path,
        i_files=' '.join(selected_files),
        o_file=cv_obj.full['features_sel'],
    )

    if skip:
        return (full_cmd, 'SKIPPED', 0)
    out, status = run_cmd(full_cmd)
    return (full_cmd, out, status)
コード例 #10
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_samples_intersect(cv_obj, src_path):
    """Build and run the ``process_x_y_samples.R`` call, unless already done.

    Nothing is executed when ``cv_obj.done['samples']`` is truthy. Otherwise
    ``--include_samples`` / ``--exclude_samples`` are appended for every
    sample list on ``cv_obj`` that is not ``None``.

    Args:
        cv_obj: Object exposing ``done``, ``Y_file``, ``X_bin``, ``Y_name``,
            ``samples``, ``Y_stat['file']``, ``in_samples`` and
            ``ex_samples``.
        src_path: Directory containing the ``utils`` scripts.

    Returns:
        Tuple ``(cmd, stdout, status)``. When the sample file was already
        created, the first element is a "SKIP" message instead of a command,
        stdout is empty and status is ``0``.
    """
    if cv_obj.done['samples']:
        return ("SKIP: Sample file already created", "", 0)

    template = "Rscript {src_path}/utils/process_x_y_samples.R -geno_file {g} -pheno_file {p} -pheno_name {n} -o_samples {o} -o_stats {s} --rm_miss --miss_value NA --src_path {src_path}/utils --include_full --verbose"
    pieces = [
        template.format(src_path=src_path,
                        p=cv_obj.Y_file,
                        g=cv_obj.X_bin,
                        n=cv_obj.Y_name,
                        o=cv_obj.samples,
                        s=cv_obj.Y_stat['file'])
    ]
    # Optional sample restrictions are appended in include/exclude order.
    if cv_obj.in_samples is not None:
        pieces.append(" --include_samples %s" % cv_obj.in_samples)
    if cv_obj.ex_samples is not None:
        pieces.append(" --exclude_samples %s" % cv_obj.ex_samples)
    full_cmd = ''.join(pieces)

    out, status = run_cmd(full_cmd)
    return (full_cmd, out, status)
コード例 #11
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_rep_sum(cv_obj, src_path, model, skip=False):
    """Build, and unless skipped run, the ``cv_rep_sum.R`` call for one model.

    One input file ``<model>_rep<rep>_perf.csv`` inside ``cv_obj.odir`` is
    listed per key of ``cv_obj.cv``; the output file is
    ``<model>_total_perf.csv`` in the same directory.

    Args:
        cv_obj: Object exposing ``odir``, ``cv`` (keyed by repetition) and
            ``reps``.
        src_path: Directory containing the ``utils`` scripts.
        model: Model name used as file name prefix.
        skip: When true, the command is built but not executed.

    Returns:
        Tuple ``(cmd, stdout, status)``; stdout is ``'SKIPPED'`` and status
        is ``0`` when the command was not executed.
    """
    perf_files = []
    for rep in cv_obj.cv.keys():
        perf_files.append(
            path.join(cv_obj.odir, "%s_rep%d_perf.csv" % (model, rep)))

    template = "Rscript {src_path}/utils/cv_rep_sum.R -perf_files {i_files} -o_file {o_file} --src_path {src_path}/utils --reps {reps} --min_rep_pct 50.0 --verbose"
    total_file = path.join(cv_obj.odir, "%s_total_perf.csv" % model)
    full_cmd = template.format(src_path=src_path,
                               i_files=' '.join(perf_files),
                               o_file=total_file,
                               reps=cv_obj.reps)

    if skip:
        return (full_cmd, 'SKIPPED', 0)
    out, status = run_cmd(full_cmd)
    return (full_cmd, out, status)
コード例 #12
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_merge_Xs(src_path,
                ofile,
                in_samples=None,
                ex_samples=None,
                skip=False,
                *mat_files):
    """Build, and unless skipped run, the ``merge_tables.R`` call.

    Args:
        src_path: Directory containing the ``utils`` scripts.
        ofile: Value passed to ``-ofile``.
        in_samples: If not ``None``, appended as ``--include_samples``.
        ex_samples: If not ``None``, appended as ``--exclude_samples``.
        skip: When true, the command is built but not executed.
        *mat_files: Input files, space-joined after ``-mat_files``. Being
            positional varargs, they can only be given after all five
            preceding parameters have been passed positionally.

    Returns:
        Tuple ``(cmd, stdout, status)``; stdout is ``'SKIPPED'`` and status
        is ``0`` when the command was not executed.
    """
    template = "Rscript {src_path}/utils/merge_tables.R -mat_files {mat_files} -ofile {ofile} --verbose --src_path {src_path}/utils"
    pieces = [
        template.format(src_path=src_path,
                        mat_files=' '.join(mat_files),
                        ofile=ofile)
    ]
    for flag, sample_file in (("include", in_samples), ("exclude", ex_samples)):
        if sample_file is not None:
            pieces.append(" --%s_samples %s" % (flag, sample_file))
    full_cmd = ''.join(pieces)

    if skip:
        return (full_cmd, 'SKIPPED', 0)
    out, status = run_cmd(full_cmd)
    return (full_cmd, out, status)
コード例 #13
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_pheno_sum(cv_objs, src_path, odir, model, skip=False):
    """Build, and unless skipped run, the ``cv_y_sum.R`` call across CV objects.

    Only objects with a truthy ``Y_check`` contribute. For each of them the
    performance file ``<model>_total_perf.csv`` (inside its ``odir``), its
    ``Y_name`` and its ``full[model]['combi.txt']`` entry are passed to the
    script. The output file is ``<model>_CV_summary.csv`` inside ``odir``.

    Args:
        cv_objs: Iterable of objects exposing ``Y_check``, ``odir``,
            ``Y_name`` and ``full``. It is consumed exactly once, so one-shot
            iterables (generators, iterators) are handled correctly.
        src_path: Directory containing the ``utils`` scripts.
        odir: Directory receiving the summary file.
        model: Model name used as file name prefix and as key into ``full``.
        skip: When true, the command is built but not executed.

    Returns:
        Tuple ``(cmd, stdout, status)``; stdout is ``'SKIPPED'`` and status
        is ``0`` when the command was not executed.
    """
    p_stdout = 'SKIPPED'
    p_status = 0

    # Filter once: iterating cv_objs three times would leave the name and
    # model lists empty for a one-shot iterable and misalign the arguments.
    checked = [cv_obj for cv_obj in cv_objs if cv_obj.Y_check]

    i_perf = [
        path.join(cv_obj.odir, "%s_total_perf.csv" % model)
        for cv_obj in checked
    ]
    i_names = [cv_obj.Y_name for cv_obj in checked]
    i_mods = [cv_obj.full[model]['combi.txt'] for cv_obj in checked]
    o_file = path.join(odir, '%s_CV_summary.csv' % model)

    cmd = "Rscript {src_path}/utils/cv_y_sum.R -y_perf_files {i_perf} -y_names {i_names} -y_models {i_mods} -o_file {o_file} --verbose"

    cmd = cmd.format(src_path=src_path,
                     i_perf=' '.join(i_perf),
                     i_names=' '.join(i_names),
                     i_mods=' '.join(i_mods),
                     o_file=o_file)
    if not skip:
        p_stdout, p_status = run_cmd(cmd)
    return (cmd, p_stdout, p_status)
コード例 #14
0
     write_log(info, logging, args.verbose)
     res = cv_convert_X_vcf(cmd=args.xvcf_convert_cmd,
                            src_path=args.src_path,
                            vcf_file=args.X_file2,
                            gds_file=args.X_gds,
                            bin_file=args.X_bin,
                            skip=vcf_gds_skip)
     assert res[2] == 0
     info = "CMD: %s : %s\n%s\n" % (res[0], res[2], res[1])
     write_log(info, logging, args.verbose)
     # VCF/GDS Bin + Bin
     info = 'Merging X VCF/GDS BIN and X BIN\n'
     write_log(info, logging, args.verbose)
     # VCF/GDS BIN copy
     res = run_cmd("cp %s %s" %
                   (args.X_bin, args.X_bin +
                    '.tmp'))  # make copy of VCF/GDS BIN but name ".tmp"
     assert res[1] == 0
     merge_bin_skip = path.isfile(args.X_bin +
                                  '.gds')  # skip if copy exists with ".gds"
     res = cv_merge_Xs(args.src_path, args.X_bin, args.in_samples,
                       args.ex_samples, merge_bin_skip, args.X_file,
                       args.X_bin)
     info = "CMD: %s : %s\n%s\n" % (res[0], res[2], res[1])
     write_log(info, logging, args.verbose)
     assert res[2] == 0
     res = run_cmd(
         "mv %s %s" %
         (args.X_bin + '.tmp', args.X_bin + '.gds'))  # rename copy
     assert res[1] == 0
 else:
コード例 #15
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_eig_sel_k(cv_obj,
                 eig_path,
                 eig_assoc_cmd,
                 eig_lambda_cmd,
                 k_max,
                 k_step=1,
                 l_min=1.0,
                 rep=None,
                 fold=None,
                 skip=False):
    """Search for a number of components k by repeatedly running two commands.

    Starting at k = 1 and increasing by ``k_step`` (capped at ``k_max``), each
    iteration runs ``eig_assoc_cmd`` and then ``eig_lambda_cmd``, reads the
    lambda value from the ``'eig.lambda'`` file and appends ``k<TAB>lambda``
    to the ``'eig.select'`` file. The loop ends when lambda drops below
    ``l_min`` or to 1.0 or less, when ``k_max`` is reached, or when either
    command returns a non-zero status.

    If the loop ends at ``k_max`` while lambda is still above both
    thresholds, the smallest tested k with the lowest recorded lambda is
    chosen and both commands are run once more for that k.

    The final k is written to the ``'eig.k'`` file. This also happens after
    a failed command, in which case k is the value reached at the failure.

    File paths are taken from ``cv_obj.full['eig']`` when ``rep`` or ``fold``
    is ``None``, otherwise from ``cv_obj.cv[rep][fold]['eig']``.

    Args:
        cv_obj: Object exposing ``full`` and ``cv[rep][fold]`` mappings.
        eig_path: Value substituted for ``{eig_path}`` in both templates.
        eig_assoc_cmd: Template of the first command run per k.
        eig_lambda_cmd: Template of the command producing the lambda file.
        k_max: Largest k to try.
        k_step: Increment applied to k after the first iteration.
        l_min: Lambda threshold below which the search stops.
        rep: Repetition key, or ``None`` for the full data set.
        fold: Fold key, or ``None`` for the full data set.
        skip: When true, nothing is run or written.

    Returns:
        Tuple ``(rep, fold, info)`` where ``info`` is a log of every command
        with its status and output, or ``'\\nSKIPPED\\n'`` when skipped.
    """
    info = ''
    cv_obj_sub = None
    # Parallel lists: command line, its output and its exit status.
    p_cmds = []
    p_os = []
    p_ss = []
    # Lines of the lambda file that carry the value start with "lambda".
    l_p = re.compile('^lambda.*')
    k = 0
    # Initial lambda is deliberately large so that the loop is entered.
    l = 100.0
    if rep is None or fold is None:
        cv_obj_sub = cv_obj.full
    else:
        cv_obj_sub = cv_obj.cv[rep][fold]
    assert 'eig' in cv_obj_sub
    eig_sel = cv_obj_sub['eig']['eig.select']
    eig_lambda = cv_obj_sub['eig']['eig.lambda']
    eig_res = cv_obj_sub['eig']['eig.res']
    eig_log = cv_obj_sub['eig']['eig.log']
    eig_pca = cv_obj_sub['eig']['eig.pca']
    eig_geno = cv_obj_sub['eig']['eig.geno']
    eig_snps = cv_obj_sub['eig']['eig.snps']
    eig_pheno = cv_obj_sub['eig']['eig.pheno']
    eig_k = cv_obj_sub['eig']['eig.k']

    # Lambda recorded for every k that completed both commands.
    k_l = {}
    if not skip:
        with open(eig_sel, 'w') as res:
            while l >= l_min and l > 1.0 and k < k_max:  # stop if l <= 1.0 OR l < l_min OR reached max. number of PCs [l should not be < 1.0 w.r.t. EIGENSTRAT]
                # First iteration uses k = 1; later ones advance by k_step, capped at k_max.
                if k == 0: k = 1
                else: k = min(k_max, k + k_step)
                # EIGENSTRAT
                cmd = eig_assoc_cmd.format(eig_path=eig_path,
                                           eig_geno=eig_geno,
                                           eig_snps=eig_snps,
                                           eig_pheno=eig_pheno,
                                           eig_pca=eig_pca,
                                           eig_res=eig_res,
                                           eig_log=eig_log,
                                           eig_k=k)
                p_stdout, p_status = run_cmd(cmd)
                p_cmds.append(cmd)
                p_os.append(p_stdout)
                p_ss.append(p_status)
                # A failed command ends the search; k keeps the value that failed.
                if p_status != 0:
                    stdout.write('break eig: %s' % p_status)
                    break
                # LAMBDA
                cmd = eig_lambda_cmd.format(eig_path=eig_path,
                                            eig_res=eig_res,
                                            eig_lambda=eig_lambda)
                p_stdout, p_status = run_cmd(cmd)
                p_cmds.append(cmd)
                p_os.append(p_stdout)
                p_ss.append(p_status)
                if p_status != 0:
                    stdout.write('break lambda: %s' % p_status)
                    break
                # Get LAMBDA: second space-separated token of the first matching
                # line, with the "lambda=" prefix removed. If no line matches,
                # l silently keeps its previous value.
                with open(eig_lambda, 'r') as l_file:
                    for line in l_file:
                        if line and l_p.match(line) is not None:
                            #stdout.write("k=%d: %s" %(k,line))
                            l = line.rstrip("\n").split(' ')[1]
                            l = float(l.replace('lambda=', ''))
                            break
                res.write("%d\t%.5f\n" % (k, l))
                res.flush()
                k_l[k] = l
        # Reached max PC num. but lambda still "bad" -> get k with min. lambda
        # NOTE(review): k_l[1] raises KeyError when no lambda was recorded for
        # k = 1, e.g. k_max <= 0 (with default l_min), or k_max == 1 and a
        # command failed in the first iteration -- confirm callers avoid this.
        if k == k_max and l >= l_min and l > 1.0:
            k = 1
            l = k_l[k]
            # Ascending scan with a strict "<" keeps the smallest k among ties.
            for k_ in sorted(k_l.keys()):
                if k_l[k_] < k_l[k]: k = k_
            info += "Minimal k with minimal lambda is %d with %.5f\n" % (
                k, k_l[k])
            # Re-run both commands so the result files correspond to the chosen k.
            # EIGENSTRAT
            cmd = eig_assoc_cmd.format(eig_path=eig_path,
                                       eig_geno=eig_geno,
                                       eig_snps=eig_snps,
                                       eig_pheno=eig_pheno,
                                       eig_pca=eig_pca,
                                       eig_res=eig_res,
                                       eig_log=eig_log,
                                       eig_k=k)
            p_stdout, p_status = run_cmd(cmd)
            p_cmds.append(cmd)
            p_os.append(p_stdout)
            p_ss.append(p_status)
            # Failures here are only reported on stdout; execution continues.
            if p_status != 0: stdout.write('break eig: %s' % p_status)
            # LAMBDA
            cmd = eig_lambda_cmd.format(eig_path=eig_path,
                                        eig_res=eig_res,
                                        eig_lambda=eig_lambda)
            p_stdout, p_status = run_cmd(cmd)
            p_cmds.append(cmd)
            p_os.append(p_stdout)
            p_ss.append(p_status)
            if p_status != 0: stdout.write('break lambda: %s' % p_status)
        # One log entry per executed command: "<cmd> : <status>\n<output>\n".
        info += "\n".join([
            '%s : %s\n%s\n' % (p_cmds[i], p_ss[i], p_os[i])
            for i in range(0, len(p_cmds))
        ])
        # Persist the selected k (written even if a command failed above).
        with open(eig_k, 'w') as output:
            output.write("%d\n" % k)
    else:
        info = '\nSKIPPED\n'
    return (rep, fold, info)
コード例 #16
0
ファイル: cv_utils.py プロジェクト: VGalata/gear_base_scripts
def cv_preproc_X_bin_vcf(cv_obj,
                         cmd_bin,
                         cmd_filter_bin,
                         cmd_gds,
                         cmd_filter_gds,
                         src_path,
                         rep=None,
                         fold=None,
                         cores=2,
                         skip=False):
    """Pre-process "bin_vcf" input in two steps and concatenate the feature lists.

    Three steps are executed in order, stopping at the first non-zero status:

    1. ``cmd_gds`` writing to ``<features_pr>.vcf``,
    2. ``cmd_bin`` writing to ``<features_pr>.bin`` (its ``{mat}`` is the
       first comma-separated entry of ``cv_obj.X_file``),
    3. ``cat`` of both files into ``<features_pr>``.

    ``features_pr`` and the sample list come from ``cv_obj.full`` (keys
    ``'features_pr'``, ``'samples'``) when ``rep`` or ``fold`` is ``None``,
    otherwise from ``cv_obj.cv[rep][fold]`` (keys ``'features_pr'``,
    ``'samples_train'``).

    Args:
        cv_obj: Object whose ``X_type`` must equal ``"bin_vcf"`` and which
            exposes ``X_gds``, ``X_file``, ``full`` and ``cv``.
        cmd_bin: Template of the second command.
        cmd_filter_bin: Value substituted for ``{proc_filter}`` in ``cmd_bin``.
        cmd_gds: Template of the first command.
        cmd_filter_gds: Value substituted for ``{proc_filter}`` in ``cmd_gds``.
        src_path: Value substituted for ``{src_path}`` in both templates.
        rep: Repetition key, or ``None`` for the full data set.
        fold: Fold key, or ``None`` for the full data set.
        cores: Value substituted for ``{cores}`` in ``cmd_bin``.
        skip: When true, both templates are still formatted (so template
            errors surface) but nothing is executed.

    Returns:
        Tuple ``(rep, fold, cmd, stdout, status)``. ``cmd`` and ``stdout``
        are the newline-joined commands and outputs of the steps executed so
        far; the output of the ``cat`` step is its stderr. When skipped the
        result is ``(rep, fold, '', 'SKIPPED', 0)``. The calling code should
        check the exit status.
    """
    assert cv_obj.X_type == "bin_vcf", "Expected X type \"bin_vcf\" but have %s" % cv_obj.X_type

    if rep is None or fold is None:
        o_f = cv_obj.full['features_pr']
        in_s = cv_obj.full['samples']
    else:
        o_f = cv_obj.cv[rep][fold]['features_pr']
        in_s = cv_obj.cv[rep][fold]['samples_train']
    o_f_1 = o_f + ".vcf"
    o_f_2 = o_f + ".bin"

    # Commands and outputs of the steps that were actually executed. Starting
    # from empty lists keeps the 'SKIPPED' marker out of real output.
    executed = []
    outputs = []

    # VCF/GDS preproc
    cmd_gds = cmd_gds.format(src_path=src_path,
                             gds_file=cv_obj.X_gds,
                             o_f=o_f_1,
                             in_s=in_s,
                             proc_filter=cmd_filter_gds)
    if not skip:
        p_so, p_status = run_cmd(cmd_gds)
        executed.append(cmd_gds)
        outputs.append(p_so)
        if p_status != 0:
            return (rep, fold, "\n".join(executed), "\n".join(outputs),
                    p_status)

    # Bin. preproc
    cmd_bin = cmd_bin.format(src_path=src_path,
                             mat=cv_obj.X_file.split(',')[0],
                             o_f=o_f_2,
                             in_s=in_s,
                             proc_filter=cmd_filter_bin,
                             cores=cores)
    if not skip:
        p_so, p_status = run_cmd(cmd_bin)
        executed.append(cmd_bin)
        outputs.append(p_so)
        if p_status != 0:
            return (rep, fold, "\n".join(executed), "\n".join(outputs),
                    p_status)

    if skip:
        return (rep, fold, '', 'SKIPPED', 0)

    # Concat both feature lists. cat runs once, writing straight into the
    # target file. The argument list is passed as-is so that paths containing
    # whitespace are not split.
    with open(o_f, "w") as f:
        p = subprocess.Popen(["cat", o_f_1, o_f_2],
                             stdout=f,
                             stderr=subprocess.PIPE)
        # communicate() drains stderr and waits for the process to exit.
        p_err = p.communicate()[1]
    executed.append("cat %s %s" % (o_f_1, o_f_2))
    outputs.append(p_err.decode())
    return (rep, fold, "\n".join(executed), "\n".join(outputs), p.returncode)