def cv_eig_full_post_proc(cv_obj, src_path, cmd, cores=2, skip=False):
    """Format and (optionally) run the EIGENSTRAT post-processing command on the full data.

    The number of components ``k`` is read from the first line of the file
    stored under ``cv_obj.full['eig']['eig.k']``; this file is read even when
    ``skip`` is true because the value is needed to format the command.

    Args:
        cv_obj: Object exposing ``full`` (mapping with 'eig' and 'samples'),
            ``X_bin``, ``Y_file`` and ``Y_name``.
        src_path: Value substituted for ``{src_path}`` in ``cmd``.
        cmd: Command template consumed with ``str.format``.
        cores: Value substituted for ``{cores}``.
        skip: When true, the command is only formatted, not executed.

    Returns:
        Tuple ``(cmd, p_stdout, p_status)``; ``('SKIPPED', 0)`` are the last
        two items when ``skip`` is true.
    """
    full_data = cv_obj.full
    assert 'eig' in full_data

    eig_files = full_data['eig']
    res_file = eig_files['eig.res']
    snps_file = eig_files['eig.snps']
    ev_file = eig_files['eig.ev']
    k_file = eig_files['eig.k']

    # The selected number of components is stored as the first line of the file.
    with open(k_file, 'r') as k_handle:
        n_comp = int(k_handle.readline().rstrip("\n"))

    cmd = cmd.format(
        src_path=src_path,
        eig_res=res_file,
        eig_snps=snps_file,
        eig_ev=ev_file,
        eig_k=n_comp,
        geno_file=cv_obj.X_bin,
        pheno_file=cv_obj.Y_file,
        pheno_name=cv_obj.Y_name,
        samples=full_data['samples'],
        cores=cores,
    )

    if skip:
        return (cmd, 'SKIPPED', 0)
    out, status = run_cmd(cmd)
    return (cmd, out, status)
def cv_eig_pca(cv_obj, eig_path, cmd, k=10, rep=None, fold=None, skip=False):
    """Format and (optionally) run the PCA command for the full data or one CV fold.

    Args:
        cv_obj: Object exposing ``full`` and ``cv[rep][fold]`` mappings, each
            holding an 'eig' mapping of file paths.
        eig_path: Value substituted for ``{eig_path}`` in ``cmd``.
        cmd: Command template consumed with ``str.format``.
        k: Value substituted for ``{eig_k}``.
        rep: Repetition key; the full data set is used if ``rep`` or ``fold``
            is None.
        fold: Fold key; see ``rep``.
        skip: When true, the command is only formatted, not executed.

    Returns:
        Tuple ``(rep, fold, cmd, p_stdout, p_status)``.
    """
    use_full = rep is None or fold is None
    subset = cv_obj.full if use_full else cv_obj.cv[rep][fold]
    assert 'eig' in subset

    eig_files = subset['eig']
    # Each key maps onto a template field named like the key with '.' -> '_'.
    wanted = (
        'eig.pca',
        'eig.ev',
        'eig.pca.log',
        'eig.pca.plot',
        'eig.geno',
        'eig.snps',
        'eig.snps.rm',
        'eig.pheno',
    )
    fields = {key.replace('.', '_'): eig_files[key] for key in wanted}
    cmd = cmd.format(eig_path=eig_path, eig_k=k, **fields)

    if skip:
        return (rep, fold, cmd, 'SKIPPED', 0)
    out, status = run_cmd(cmd)
    return (rep, fold, cmd, out, status)
def cv_eig_post_proc(cv_obj, src_path, cmd, rep, fold, skip=False):
    """Format and (optionally) run the EIGENSTRAT post-processing command for one CV fold.

    The number of components ``k`` is read from the first line of the file
    stored under ``cv_obj.cv[rep][fold]['eig']['eig.k']``; this happens even
    when ``skip`` is true because the value is needed to format the command.

    Args:
        cv_obj: Object exposing ``cv[rep][fold]`` with 'eig' and
            'features_sel' entries.
        src_path: Value substituted for ``{src_path}`` in ``cmd``.
        cmd: Command template consumed with ``str.format``.
        rep: Repetition key.
        fold: Fold key.
        skip: When true, the command is only formatted, not executed.

    Returns:
        Tuple ``(rep, fold, cmd, p_stdout, p_status)``.
    """
    fold_data = cv_obj.cv[rep][fold]
    assert 'eig' in fold_data

    eig_files = fold_data['eig']
    res_file = eig_files['eig.res']
    snps_file = eig_files['eig.snps']
    ev_file = eig_files['eig.ev']
    k_file = eig_files['eig.k']
    sel_file = fold_data['features_sel']

    # The selected number of components is stored as the first line of the file.
    with open(k_file, 'r') as k_handle:
        n_comp = int(k_handle.readline().rstrip("\n"))

    cmd = cmd.format(
        src_path=src_path,
        eig_res=res_file,
        eig_snps=snps_file,
        eig_ev=ev_file,
        eig_k=n_comp,
        features_sel=sel_file,
    )

    if skip:
        return (rep, fold, cmd, 'SKIPPED', 0)
    out, status = run_cmd(cmd)
    return (rep, fold, cmd, out, status)
def cv_eig_twstats(cv_obj, eig_path, cmd, rep=None, fold=None, k_max=10, alpha=0.05, skip=False):
    """Format and (optionally) run the twstats command and derive ``k`` from its output.

    Args:
        cv_obj: Object exposing ``full`` and ``cv[rep][fold]`` mappings with
            'eig', 'other' and 'odir' entries.
        eig_path: Value substituted for ``{eig_path}`` in ``cmd``.
        cmd: Command template consumed with ``str.format`` (fields
            ``eig_path``, ``eig_ev``, ``eig_pca_pv``).
        rep: Repetition key; the full data set is used if ``rep`` or ``fold``
            is None.
        fold: Fold key; see ``rep``.
        k_max: Passed through to ``cv_eig_twstats_k``.
        alpha: Passed through to ``cv_eig_twstats_k``.
        skip: When true, the command is only formatted, not executed, and
            ``eig_k`` stays None.

    Returns:
        Tuple ``(rep, fold, {'eig_pca_pv': ..., 'eig_k': ...}, cmd, p_stdout,
        p_status)``. ``eig_k`` is None unless the command was run and
        finished with status 0.
    """
    p_stdout = 'SKIPPED'
    p_status = 0
    eig_k = None

    if rep is None or fold is None:
        cv_obj_sub = cv_obj.full
    else:
        cv_obj_sub = cv_obj.cv[rep][fold]
    assert 'eig' in cv_obj_sub

    # NOTE(review): the eigenvalue file is read from ['other']['eig_ev'] here,
    # whereas sibling functions use ['eig']['eig.ev'] -- confirm this is intended.
    eig_ev = cv_obj_sub['other']['eig_ev']
    eig_pca_pv = path.join(cv_obj_sub['odir'], 'eig.pca.pv')
    cmd = cmd.format(eig_path=eig_path, eig_ev=eig_ev, eig_pca_pv=eig_pca_pv)

    if not skip:
        p_stdout, p_status = run_cmd(cmd)
        # The exit status is compared with the integer 0 everywhere else in this
        # file; the former comparison with the string '0' could never match an
        # integer status, so eig_k was never computed.
        if p_status == 0:
            eig_k = cv_eig_twstats_k(eig_pca_pv, k_max, alpha)

    return (rep, fold, {'eig_pca_pv': eig_pca_pv, 'eig_k': eig_k}, cmd, p_stdout, p_status)
def cv_preproc_X_gds(cv_obj, cmd, cmd_filter, src_path, rep=None, fold=None, skip=False):
    """Format and (optionally) run the GDS pre-processing command.

    Args:
        cv_obj: Object exposing ``X_type`` (must equal "vcf"), ``X_gds`` and
            the ``full`` / ``cv[rep][fold]`` mappings.
        cmd: Command template consumed with ``str.format`` (fields
            ``src_path``, ``gds_file``, ``o_f``, ``in_s``, ``proc_filter``).
        cmd_filter: Value substituted for ``{proc_filter}``.
        src_path: Value substituted for ``{src_path}``.
        rep: Repetition key; the full data set is used if ``rep`` or ``fold``
            is None.
        fold: Fold key; see ``rep``.
        skip: When true, the command is only formatted, not executed.

    Returns:
        Tuple ``(rep, fold, cmd, p_stdout, p_status)``.
    """
    assert cv_obj.X_type == "vcf", "Expected X type \"vcf\" but have %s" % cv_obj.X_type

    # Full data uses all samples; a CV fold uses its training samples only.
    if rep is None or fold is None:
        subset, samples_key = cv_obj.full, 'samples'
    else:
        subset, samples_key = cv_obj.cv[rep][fold], 'samples_train'
    features_out = subset['features_pr']
    samples_in = subset[samples_key]

    cmd = cmd.format(
        src_path=src_path,
        gds_file=cv_obj.X_gds,
        o_f=features_out,
        in_s=samples_in,
        proc_filter=cmd_filter,
    )

    if skip:
        return (rep, fold, cmd, 'SKIPPED', 0)
    out, status = run_cmd(cmd)
    return (rep, fold, cmd, out, status)
def cv_convert_X_vcf(cmd, src_path, vcf_file, gds_file, bin_file, skip=False):
    """Format and (optionally) run the VCF conversion command.

    Args:
        cmd: Command template consumed with ``str.format`` (fields
            ``src_path``, ``v_file``, ``g_file``, ``b_file``).
        src_path: Value substituted for ``{src_path}``.
        vcf_file: Value substituted for ``{v_file}``.
        gds_file: Value substituted for ``{g_file}``.
        bin_file: Value substituted for ``{b_file}``.
        skip: When true, the command is only formatted, not executed.

    Returns:
        Tuple ``(cmd, p_stdout, p_status)``.
    """
    fields = {
        'src_path': src_path,
        'v_file': vcf_file,
        'g_file': gds_file,
        'b_file': bin_file,
    }
    cmd = cmd.format(**fields)

    if skip:
        return (cmd, 'SKIPPED', 0)
    out, status = run_cmd(cmd)
    return (cmd, out, status)
def cv_fold_sum(cv_obj, src_path, model, rep, skip=False):
    """Build and (optionally) run the per-repetition fold summary R script.

    The prediction files of all folds of repetition ``rep`` are passed to
    ``cv_fold_sum.R``; the output file is ``<odir>/<model>_rep<rep>_perf.csv``.

    Args:
        cv_obj: Object exposing ``cv[rep]`` (mapping of folds) and ``odir``.
        src_path: Directory containing ``utils/cv_fold_sum.R``.
        model: Key of the model entry inside each fold mapping.
        rep: Repetition key.
        skip: When true, the command is only built, not executed.

    Returns:
        Tuple ``(rep, cmd, p_stdout, p_status)``.
    """
    fold_preds = []
    for fold_data in cv_obj.cv[rep].values():
        fold_preds.append(fold_data[model]["pred.csv"])

    template = "Rscript {src_path}/utils/cv_fold_sum.R -pred_files {pred_files} -o_file {o_file} --plot --src_path {src_path}/utils --verbose"
    perf_file = path.join(cv_obj.odir, "%s_rep%d_perf.csv" % (model, rep))
    cmd = template.format(
        src_path=src_path,
        pred_files=' '.join(fold_preds),
        o_file=perf_file,
    )

    if skip:
        return (rep, cmd, 'SKIPPED', 0)
    out, status = run_cmd(cmd)
    return (rep, cmd, out, status)
def cv_model(cv_obj, src_path, model, model_cmd, model_params, rep=None, fold=None, skip=False):
    """Format and (optionally) run the model command for the full data or one CV fold.

    For the full data set the template is formatted without a
    ``samples_test`` field; for a CV fold ``samples_test`` is supplied as well.

    Args:
        cv_obj: Object exposing ``X_bin``, ``Y_file``, ``Y_name`` and the
            ``full`` / ``cv[rep][fold]`` mappings.
        src_path: Value substituted for ``{src_path}``.
        model: Value substituted for ``{o_bname}``.
        model_cmd: Command template consumed with ``str.format``.
        model_params: Value substituted for ``{model_params}``.
        rep: Repetition key; the full data set is used if ``rep`` or ``fold``
            is None.
        fold: Fold key; see ``rep``.
        skip: When true, the command is only formatted, not executed.

    Returns:
        Tuple ``(rep, fold, cmd, p_stdout, p_status)``.
    """
    on_full = rep is None or fold is None
    subset = cv_obj.full if on_full else cv_obj.cv[rep][fold]

    fields = {
        'src_path': src_path,
        'x_file': cv_obj.X_bin,
        'y_file': cv_obj.Y_file,
        'y_pheno': cv_obj.Y_name,
        'o_dir': subset['odir'],
        'o_bname': model,
        'samples_train': subset['samples' if on_full else 'samples_train'],
    }
    if not on_full:
        # Test samples exist only for CV folds.
        fields['samples_test'] = subset['samples_test']
    fields['features'] = subset['features_sel']
    fields['model_params'] = model_params

    cmd = model_cmd.format(**fields)

    if skip:
        return (rep, fold, cmd, 'SKIPPED', 0)
    out, status = run_cmd(cmd)
    return (rep, fold, cmd, out, status)
def cv_combi_sel_features(cv_obj, src_path, skip=False):
    """Build and (optionally) run the R script combining selected features of all folds.

    The 'features_sel' files of every repetition/fold are the inputs; the
    output file is ``cv_obj.full['features_sel']``.

    Args:
        cv_obj: Object exposing ``cv`` (rep -> fold -> mapping) and ``full``.
        src_path: Directory containing ``utils/combine_sel_features.R``.
        skip: When true, the command is only built, not executed.

    Returns:
        Tuple ``(cmd, p_stdout, p_status)``.
    """
    sel_files = [
        fold_data['features_sel']
        for rep_folds in cv_obj.cv.values()
        for fold_data in rep_folds.values()
    ]

    template = "Rscript {src_path}/utils/combine_sel_features.R -sel_features_files {i_files} -o_file {o_file} --src_path {src_path}/utils --verbose"
    cmd = template.format(
        src_path=src_path,
        i_files=' '.join(sel_files),
        o_file=cv_obj.full['features_sel'],
    )

    if skip:
        return (cmd, 'SKIPPED', 0)
    out, status = run_cmd(cmd)
    return (cmd, out, status)
def cv_samples_intersect(cv_obj, src_path):
    """Build and run the R script that writes the sample file for X and Y.

    Nothing is executed if ``cv_obj.done['samples']`` is truthy.

    Args:
        cv_obj: Object exposing ``done``, ``Y_file``, ``X_bin``, ``Y_name``,
            ``samples``, ``Y_stat`` (mapping with key 'file'), ``in_samples``
            and ``ex_samples``.
        src_path: Directory containing ``utils/process_x_y_samples.R``.

    Returns:
        Tuple ``(cmd, p_stdout, p_status)``; if the sample file was already
        created, ``("SKIP: Sample file already created", "", 0)``.
    """
    if cv_obj.done['samples']:
        return ("SKIP: Sample file already created", "", 0)

    template = "Rscript {src_path}/utils/process_x_y_samples.R -geno_file {g} -pheno_file {p} -pheno_name {n} -o_samples {o} -o_stats {s} --rm_miss --miss_value NA --src_path {src_path}/utils --include_full --verbose"
    pieces = [
        template.format(
            src_path=src_path,
            p=cv_obj.Y_file,
            g=cv_obj.X_bin,
            n=cv_obj.Y_name,
            o=cv_obj.samples,
            s=cv_obj.Y_stat['file'],
        )
    ]
    # Optional sample include/exclude lists are appended as extra flags.
    if cv_obj.in_samples is not None:
        pieces.append(" --include_samples %s" % cv_obj.in_samples)
    if cv_obj.ex_samples is not None:
        pieces.append(" --exclude_samples %s" % cv_obj.ex_samples)
    cmd = "".join(pieces)

    out, status = run_cmd(cmd)
    return (cmd, out, status)
def cv_rep_sum(cv_obj, src_path, model, skip=False):
    """Build and (optionally) run the R script summarising all repetitions of a model.

    Inputs are the ``<odir>/<model>_rep<rep>_perf.csv`` files of every
    repetition key in ``cv_obj.cv``; the output is
    ``<odir>/<model>_total_perf.csv``.

    Args:
        cv_obj: Object exposing ``cv``, ``odir`` and ``reps``.
        src_path: Directory containing ``utils/cv_rep_sum.R``.
        model: Model name used in the file names.
        skip: When true, the command is only built, not executed.

    Returns:
        Tuple ``(cmd, p_stdout, p_status)``.
    """
    out_dir = cv_obj.odir
    perf_files = []
    for rep in cv_obj.cv.keys():
        perf_files.append(path.join(out_dir, "%s_rep%d_perf.csv" % (model, rep)))

    template = "Rscript {src_path}/utils/cv_rep_sum.R -perf_files {i_files} -o_file {o_file} --src_path {src_path}/utils --reps {reps} --min_rep_pct 50.0 --verbose"
    cmd = template.format(
        src_path=src_path,
        i_files=' '.join(perf_files),
        o_file=path.join(out_dir, "%s_total_perf.csv" % model),
        reps=cv_obj.reps,
    )

    if skip:
        return (cmd, 'SKIPPED', 0)
    out, status = run_cmd(cmd)
    return (cmd, out, status)
def cv_merge_Xs(src_path, ofile, in_samples=None, ex_samples=None, skip=False, *mat_files):
    """Build and (optionally) run the R script merging several matrix files.

    Args:
        src_path: Directory containing ``utils/merge_tables.R``.
        ofile: Output file passed as ``-ofile``.
        in_samples: If not None, appended as ``--include_samples``.
        ex_samples: If not None, appended as ``--exclude_samples``.
        skip: When true, the command is only built, not executed.
        *mat_files: Input matrix files passed as ``-mat_files``.

    Returns:
        Tuple ``(cmd, p_stdout, p_status)``.
    """
    template = "Rscript {src_path}/utils/merge_tables.R -mat_files {mat_files} -ofile {ofile} --verbose --src_path {src_path}/utils"
    pieces = [template.format(src_path=src_path, mat_files=' '.join(mat_files), ofile=ofile)]
    # Optional sample include/exclude lists are appended as extra flags.
    if in_samples is not None:
        pieces.append(" --include_samples %s" % in_samples)
    if ex_samples is not None:
        pieces.append(" --exclude_samples %s" % ex_samples)
    cmd = "".join(pieces)

    if skip:
        return (cmd, 'SKIPPED', 0)
    out, status = run_cmd(cmd)
    return (cmd, out, status)
def cv_pheno_sum(cv_objs, src_path, odir, model, skip=False):
    """Build and (optionally) run the R script summarising a model over phenotypes.

    Only objects whose ``Y_check`` attribute is truthy contribute. For each
    of them the total performance file, the phenotype name and the
    'combi.txt' model file are passed to ``cv_y_sum.R``.

    Args:
        cv_objs: Iterable of objects exposing ``odir``, ``Y_check``,
            ``Y_name`` and ``full[model]['combi.txt']``.
        src_path: Directory containing ``utils/cv_y_sum.R``.
        odir: Directory receiving ``<model>_CV_summary.csv``.
        model: Model name.
        skip: When true, the command is only built, not executed.

    Returns:
        Tuple ``(cmd, p_stdout, p_status)``.
    """
    perf_files, pheno_names, model_files = [], [], []
    for obj in cv_objs:
        if not obj.Y_check:
            continue
        perf_files.append(path.join(obj.odir, "%s_total_perf.csv" % model))
        pheno_names.append(obj.Y_name)
        model_files.append(obj.full[model]['combi.txt'])

    summary_file = path.join(odir, '%s_CV_summary.csv' % model)
    template = "Rscript {src_path}/utils/cv_y_sum.R -y_perf_files {i_perf} -y_names {i_names} -y_models {i_mods} -o_file {o_file} --verbose"
    cmd = template.format(
        src_path=src_path,
        i_perf=' '.join(perf_files),
        i_names=' '.join(pheno_names),
        i_mods=' '.join(model_files),
        o_file=summary_file,
    )

    if skip:
        return (cmd, 'SKIPPED', 0)
    out, status = run_cmd(cmd)
    return (cmd, out, status)
write_log(info, logging, args.verbose) res = cv_convert_X_vcf(cmd=args.xvcf_convert_cmd, src_path=args.src_path, vcf_file=args.X_file2, gds_file=args.X_gds, bin_file=args.X_bin, skip=vcf_gds_skip) assert res[2] == 0 info = "CMD: %s : %s\n%s\n" % (res[0], res[2], res[1]) write_log(info, logging, args.verbose) # VCF/GDS Bin + Bin info = 'Merging X VCF/GDS BIN and X BIN\n' write_log(info, logging, args.verbose) # VCF/GDS BIN copy res = run_cmd("cp %s %s" % (args.X_bin, args.X_bin + '.tmp')) # make copy of VCF/GDS BIN but name ".tmp" assert res[1] == 0 merge_bin_skip = path.isfile(args.X_bin + '.gds') # skip if copy exists with ".gds" res = cv_merge_Xs(args.src_path, args.X_bin, args.in_samples, args.ex_samples, merge_bin_skip, args.X_file, args.X_bin) info = "CMD: %s : %s\n%s\n" % (res[0], res[2], res[1]) write_log(info, logging, args.verbose) assert res[2] == 0 res = run_cmd( "mv %s %s" % (args.X_bin + '.tmp', args.X_bin + '.gds')) # rename copy assert res[1] == 0 else:
def cv_eig_sel_k(cv_obj, eig_path, eig_assoc_cmd, eig_lambda_cmd, k_max, k_step=1, l_min=1.0, rep=None, fold=None, skip=False):
    """Select the number of principal components k by repeatedly running EIGENSTRAT.

    Starting with k = 1, the association command and the lambda command are
    run for increasing k until the lambda value read from the 'eig.lambda'
    file drops below ``l_min`` or to 1.0 or less, k reaches ``k_max``, or a
    command returns a non-zero status. Every (k, lambda) pair is written to
    the 'eig.select' file and the final k is written to the 'eig.k' file.

    Args:
        cv_obj: Object exposing ``full`` and ``cv[rep][fold]`` mappings, each
            holding an 'eig' mapping of file paths.
        eig_path: Value substituted for ``{eig_path}`` in both templates.
        eig_assoc_cmd: Template of the association command (``str.format``).
        eig_lambda_cmd: Template of the lambda command (``str.format``).
        k_max: Largest k that is tried.
        k_step: Increment of k after the first iteration (which uses k = 1).
        l_min: Lambda threshold; the loop continues while lambda >= l_min.
        rep: Repetition key; the full data set is used if ``rep`` or ``fold``
            is None.
        fold: Fold key; see ``rep``.
        skip: When true, nothing is run or written.

    Returns:
        Tuple ``(rep, fold, info)`` where ``info`` is a log string built from
        the executed commands, their statuses and outputs, or
        ``'\\nSKIPPED\\n'`` when ``skip`` is true.
    """
    info = ''
    cv_obj_sub = None
    # Executed commands, their outputs and their exit statuses (parallel lists).
    p_cmds = []
    p_os = []
    p_ss = []
    # Matches the line of the lambda file that starts with "lambda".
    l_p = re.compile('^lambda.*')
    k = 0
    # Initial lambda; 100.0 satisfies the loop condition unless l_min > 100.
    l = 100.0
    if rep is None or fold is None:
        cv_obj_sub = cv_obj.full
    else:
        cv_obj_sub = cv_obj.cv[rep][fold]
    assert 'eig' in cv_obj_sub
    eig_sel = cv_obj_sub['eig']['eig.select']
    eig_lambda = cv_obj_sub['eig']['eig.lambda']
    eig_res = cv_obj_sub['eig']['eig.res']
    eig_log = cv_obj_sub['eig']['eig.log']
    eig_pca = cv_obj_sub['eig']['eig.pca']
    eig_geno = cv_obj_sub['eig']['eig.geno']
    eig_snps = cv_obj_sub['eig']['eig.snps']
    eig_pheno = cv_obj_sub['eig']['eig.pheno']
    eig_k = cv_obj_sub['eig']['eig.k']
    # Lambda value recorded for each k that completed successfully.
    k_l = {}
    if not skip:
        with open(eig_sel, 'w') as res:
            while l >= l_min and l > 1.0 and k < k_max:  # stop if l <= 1.0 OR l < l_min OR reached max. number of PCs [l should not be < 1.0 w.r.t. EIGENSTRAT]
                # First iteration uses k = 1, later ones advance by k_step (capped at k_max).
                if k == 0:
                    k = 1
                else:
                    k = min(k_max, k + k_step)
                # EIGENSTRAT
                cmd = eig_assoc_cmd.format(eig_path=eig_path, eig_geno=eig_geno, eig_snps=eig_snps, eig_pheno=eig_pheno, eig_pca=eig_pca, eig_res=eig_res, eig_log=eig_log, eig_k=k)
                p_stdout, p_status = run_cmd(cmd)
                p_cmds.append(cmd)
                p_os.append(p_stdout)
                p_ss.append(p_status)
                if p_status != 0:
                    stdout.write('break eig: %s' % p_status)
                    break
                # LAMBDA
                cmd = eig_lambda_cmd.format(eig_path=eig_path, eig_res=eig_res, eig_lambda=eig_lambda)
                p_stdout, p_status = run_cmd(cmd)
                p_cmds.append(cmd)
                p_os.append(p_stdout)
                p_ss.append(p_status)
                if p_status != 0:
                    stdout.write('break lambda: %s' % p_status)
                    break
                # Get LAMBDA: second space-separated token of the first matching
                # line, with the "lambda=" prefix removed.
                # NOTE(review): if no line matches, l keeps its previous value.
                with open(eig_lambda, 'r') as l_file:
                    for line in l_file:
                        if line and l_p.match(line) is not None:
                            #stdout.write("k=%d: %s" %(k,line))
                            l = line.rstrip("\n").split(' ')[1]
                            l = float(l.replace('lambda=', ''))
                            break
                res.write("%d\t%.5f\n" % (k, l))
                res.flush()
                k_l[k] = l
        # Reached max PC num. but lambda still "bad" -> get k with min. lambda
        # NOTE(review): k_l[1] is looked up unconditionally; if a command failed
        # before any lambda was recorded while k == k_max (e.g. k_max == 1) this
        # would raise KeyError -- confirm whether that case can occur.
        if k == k_max and l >= l_min and l > 1.0:
            k = 1
            l = k_l[k]
            # Strict '<' keeps the smallest k among equal minimal lambdas.
            for k_ in sorted(k_l.keys()):
                if k_l[k_] < k_l[k]:
                    k = k_
            info += "Minimal k with minimal lambda is %d with %.5f\n" % ( k, k_l[k])
            # Re-run both commands so the result files correspond to the chosen k.
            # EIGENSTRAT
            cmd = eig_assoc_cmd.format(eig_path=eig_path, eig_geno=eig_geno, eig_snps=eig_snps, eig_pheno=eig_pheno, eig_pca=eig_pca, eig_res=eig_res, eig_log=eig_log, eig_k=k)
            p_stdout, p_status = run_cmd(cmd)
            p_cmds.append(cmd)
            p_os.append(p_stdout)
            p_ss.append(p_status)
            if p_status != 0:
                stdout.write('break eig: %s' % p_status)
            # LAMBDA
            cmd = eig_lambda_cmd.format(eig_path=eig_path, eig_res=eig_res, eig_lambda=eig_lambda)
            p_stdout, p_status = run_cmd(cmd)
            p_cmds.append(cmd)
            p_os.append(p_stdout)
            p_ss.append(p_status)
            if p_status != 0:
                stdout.write('break lambda: %s' % p_status)
        # One "<cmd> : <status>\n<output>\n" entry per executed command.
        info += "\n".join([ '%s : %s\n%s\n' % (p_cmds[i], p_ss[i], p_os[i]) for i in range(0, len(p_cmds)) ])
        # NOTE(review): k is written even if a command failed; only `info`
        # carries the exit statuses back to the caller.
        with open(eig_k, 'w') as output:
            output.write("%d\n" % k)
    else:
        info = '\nSKIPPED\n'
    return (rep, fold, info)
def cv_preproc_X_bin_vcf(cv_obj, cmd_bin, cmd_filter_bin, cmd_gds, cmd_filter_gds, src_path, rep=None, fold=None, cores=2, skip=False):
    """Pre-process combined binary + VCF/GDS features and concatenate the feature lists.

    Three steps are performed in order; the function returns early with the
    status of the first step that fails:

    1. the GDS pre-processing command, writing ``<features_pr>.vcf``;
    2. the binary pre-processing command on the first comma-separated entry
       of ``cv_obj.X_file``, writing ``<features_pr>.bin``;
    3. ``cat`` of both files into ``<features_pr>``.

    Args:
        cv_obj: Object exposing ``X_type`` (must equal "bin_vcf"), ``X_gds``,
            ``X_file`` and the ``full`` / ``cv[rep][fold]`` mappings.
        cmd_bin: Template of the binary pre-processing command (fields
            ``src_path``, ``mat``, ``o_f``, ``in_s``, ``proc_filter``,
            ``cores``).
        cmd_filter_bin: Value substituted for ``{proc_filter}`` in ``cmd_bin``.
        cmd_gds: Template of the GDS pre-processing command (fields
            ``src_path``, ``gds_file``, ``o_f``, ``in_s``, ``proc_filter``).
        cmd_filter_gds: Value substituted for ``{proc_filter}`` in ``cmd_gds``.
        src_path: Value substituted for ``{src_path}``.
        rep: Repetition key; the full data set is used if ``rep`` or ``fold``
            is None.
        fold: Fold key; see ``rep``.
        cores: Value substituted for ``{cores}`` in ``cmd_bin``.
        skip: When true, the commands are only formatted, not executed.

    Returns:
        Tuple ``(rep, fold, cmd, p_stdout, p_status)``. ``cmd`` holds the
        newline-joined commands formatted so far; ``p_stdout`` holds the
        newline-joined outputs of the executed commands (stderr for the
        ``cat`` step), or ``'SKIPPED'`` when ``skip`` is true.
    """
    assert cv_obj.X_type == "bin_vcf", "Expected X type \"bin_vcf\" but have %s" % cv_obj.X_type

    # Full data uses all samples; a CV fold uses its training samples only.
    if rep is None or fold is None:
        o_f = cv_obj.full['features_pr']
        in_s = cv_obj.full['samples']
    else:
        o_f = cv_obj.cv[rep][fold]['features_pr']
        in_s = cv_obj.cv[rep][fold]['samples_train']
    o_f_1 = o_f + ".vcf"
    o_f_2 = o_f + ".bin"

    p_status = 0
    # Outputs of the commands that were actually run; 'SKIPPED' is reported
    # only when nothing was run (it used to prefix real output as well).
    outputs = []

    def _result(cmds):
        p_stdout = 'SKIPPED' if skip else "\n".join(outputs)
        return (rep, fold, "\n".join(cmds), p_stdout, p_status)

    # VCF/GDS preproc
    cmd_gds = cmd_gds.format(src_path=src_path, gds_file=cv_obj.X_gds, o_f=o_f_1, in_s=in_s, proc_filter=cmd_filter_gds)
    cmds = [cmd_gds]
    if not skip:
        p_so, p_status = run_cmd(cmd_gds)
        outputs.append(p_so)
        if p_status != 0:
            # non-zero status -> return: calling code should check the exit status
            return _result(cmds)

    # Bin. preproc
    cmd_bin = cmd_bin.format(src_path=src_path, mat=cv_obj.X_file.split(',')[0], o_f=o_f_2, in_s=in_s, proc_filter=cmd_filter_bin, cores=cores)
    cmds.append(cmd_bin)
    if not skip:
        p_so, p_status = run_cmd(cmd_bin)
        outputs.append(p_so)
        if p_status != 0:
            # non-zero status -> return: calling code should check the exit status
            return _result(cmds)

    # Concat both feature lists
    cmd_cat = "cat %s %s" % (o_f_1, o_f_2)
    cmds.append(cmd_cat)
    if not skip:
        # Run `cat` exactly once with its stdout redirected into the target
        # file. The argument list is passed directly so that paths containing
        # whitespace are not split.
        with open(o_f, "w") as f:
            p = subprocess.Popen(["cat", o_f_1, o_f_2], stdout=f, stderr=subprocess.PIPE)
            _, p_err = p.communicate()
        p_status = p.returncode
        outputs.append(p_err.decode())

    return _result(cmds)