cluster.runCmd(job_name=job, cmd=cmd, logfile=log_file)

# set up config for association test
config = deepcopy(configdict)
config["assoc_type"] = assoc_type
config["null_model_file"] = configdict["data_prefix"] + "_null_model.RData"
if assoc_type == "aggregate":
    config["aggregate_variant_file"] = configdict["data_prefix"] + "_aggregate_list_chr .RData"
config["out_prefix"] = configdict["data_prefix"] + "_" + assocScript
config["segment_file"] = segment_file
configfile = configdict["config_prefix"] + "_" + assocScript + ".config"
TopmedPipeline.writeConfig(config, configfile)

# get segments for each chromosome
chrom_list = TopmedPipeline.parseChromosomes(chromosomes).split(" ")
segment_list = TopmedPipeline.getChromSegments(segment_file, chrom_list)
segment_str = ["-".join([str(i) for i in s]) for s in segment_list]
segments = dict(zip(chrom_list, segment_str))

# run association tests
holdids_combine = []
for chromosome in chrom_list:
    job_assoc = assocScript + "_chr" + chromosome
    rscript = os.path.join(pipeline, "R", assocScript + ".R")
    args = ["-s", rscript, configfile, "--chromosome " + chromosome]
    # no email for jobs by segment
    jobid = cluster.submitJob(job_name=job_assoc, cmd=driver, args=args, holdid=holdids,
                              array_range=segments[chromosome], print_only=print_only)
# per-chromosome variant pruning, restricted to the unrelated sample set
config = deepcopy(configdict)
config["sample_include_file"] = configdict["data_prefix"] + "_unrelated.RData"
config["out_file"] = configdict["data_prefix"] + "_pruned_variants_chr .RData"
configfile = configdict["config_prefix"] + "_" + job + ".config"
TopmedPipeline.writeConfig(config, configfile)

jobid = cluster.submitJob(job_name=job, cmd=driver, args=["-c", rscript, configfile, version],
                          holdid=[jobid], array_range=chromosomes, email=email, print_only=print_only)

# combine pruned variants from all chromosomes into a single file
job = "combine_variants"
rscript = os.path.join(pipeline, "R", job + ".R")
config = dict()
config["chromosomes"] = TopmedPipeline.parseChromosomes(chromosomes)
config["in_file"] = configdict["data_prefix"] + "_pruned_variants_chr .RData"
config["out_file"] = configdict["data_prefix"] + "_pruned_variants.RData"
configfile = configdict["config_prefix"] + "_" + job + ".config"
TopmedPipeline.writeConfig(config, configfile)

jobid = cluster.submitJob(job_name=job, cmd=driver, args=[rscript, configfile, version],
                          holdid=[jobid], email=email, print_only=print_only)

# PCA using the related/unrelated sample partition
job = "pca_byrel"
rscript = os.path.join(pipeline, "R", job + ".R")
config = deepcopy(configdict)
config["related_file"] = configdict["data_prefix"] + "_related.RData"
config["unrelated_file"] = configdict["data_prefix"] + "_unrelated.RData"
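# Hedged note on the "_chr .RData" file-name templates above: the space after
# "chr" acts as a per-chromosome placeholder, and appears to be filled in with
# the same 'chr ' -> 'chr<N>' substitution used for the GDS files later in this
# document. A minimal sketch with a made-up prefix (the example_* names are
# hypothetical, not pipeline variables):
example_template = "mydata" + "_pruned_variants_chr .RData"
example_chr22 = example_template.replace("chr ", "chr22")
assert example_chr22 == "mydata_pruned_variants_chr22.RData"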
if run_null_model:
    config["null_model_file"] = configdict["data_prefix"] + "_null_model.RData"
    config["phenotype_file"] = configdict["data_prefix"] + "_phenotypes.RData"

if assoc_type == "aggregate":
    config["aggregate_variant_file"] = configdict["data_prefix"] + "_aggregate_list_chr .RData"

assocScript = "assoc_" + assoc_type
config["out_prefix"] = configdict["data_prefix"] + "_" + assocScript
config["segment_file"] = segment_file
configfile = configdict["config_prefix"] + "_" + assocScript + ".config"
TopmedPipeline.writeConfig(config, configfile)

# get segments for each chromosome
chrom_list = TopmedPipeline.parseChromosomes(chromosomes).split(" ")
segment_list = TopmedPipeline.getChromSegments(segment_file, chrom_list)
segment_str = ["-".join([str(i) for i in s]) for s in segment_list]
segments = dict(zip(chrom_list, segment_str))

# run association tests
hold_combine = []
for chromosome in chrom_list:
    job_assoc = assocScript + "_chr" + chromosome
    rscript = os.path.join(pipeline, "R", assocScript + ".R")
    args = ["-s", rscript, configfile, "--chromosome " + chromosome, version]
    # no email for jobs by segment
    submitID = cluster.submitJob(job_name=job_assoc, cmd=driver, args=args, holdid=hold_null_agg,
                                 array_range=segments[chromosome], print_only=print_only)

    combScript = "assoc_combine"
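# Hedged illustration of the array_range plumbing above, with made-up segment
# boundaries; getChromSegments() is assumed to return (first, last) segment
# indices per chromosome, which the comprehension joins into the "start-end"
# strings passed to cluster.submitJob(array_range=...). The example_* names are
# hypothetical:
example_segment_list = [(1, 12), (13, 20)]
example_segment_str = ["-".join([str(i) for i in s]) for s in example_segment_list]
example_segments = dict(zip(["1", "2"], example_segment_str))
assert example_segments == {"1": "1-12", "2": "13-20"}
# i.e. the chr1 association job would be submitted as an array over task IDs 1-12.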
for subdir in ['config', 'log']:
    if not os.path.exists(configdict['output_file'] + '/' + subdir):
        os.mkdir(configdict['output_file'] + '/' + subdir)
#configdict = TopmedPipeline.directorySetup(configdict, subdirs=["config", "log"])

job = "vcf2gds"
rscript = os.path.join(pipeline, "R", job + ".R")

# parsing bcf files relies on streaming bcftools output, so can't run in parallel
if os.path.splitext(configdict["vcf_file"])[1] == ".bcf":
    ncores = None

chrom_string = TopmedPipeline.parseChromosomes(chromosomes)
chrom_list = chrom_string.split(' ')
for chrom in chrom_list:
    # only convert chromosomes whose GDS file does not already exist
    if not os.path.isfile(configdict['gds_file'].replace('chr ', 'chr' + chrom)):
        cmd = " ".join(['bsub -q big -n 4', "-R 'rusage[mem=45000]'",
                        'Rscript', rscript, config_dir, '--chromosome ' + chrom])
        print(cmd)
        os.system(cmd)
#jobid = cluster.submitJob(job_name=job, cmd=driver, args=["-c", rscript, configfile], array_range=chromosomes, request_cores=ncores)

job = "merge_gds"
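# Hedged illustration of the command assembled in the vcf2gds loop above, with
# made-up paths standing in for rscript and config_dir; the real values come
# from the pipeline install location and the configuration directory:
example_bsub = " ".join(['bsub -q big -n 4', "-R 'rusage[mem=45000]'",
                         'Rscript', '/path/to/pipeline/R/vcf2gds.R',
                         '/path/to/config', '--chromosome 22'])
# -> bsub -q big -n 4 -R 'rusage[mem=45000]' Rscript /path/to/pipeline/R/vcf2gds.R /path/to/config --chromosome 22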