def run_example_cluster():
    """ run a set of jobs on cluster """
    print ""
    print ""
    print "====================================="
    print "======== Submit and Wait ========"
    print "====================================="
    print ""
    print ""

    functionJobs = make_jobs()

    print "output ret field in each job before sending it onto the cluster"
    for (i, job) in enumerate(functionJobs):
        print "Job #", i, "- ret: ", job.ret
    print ""

    print "sending function jobs to cluster"
    print ""
    processedFunctionJobs = process_jobs(functionJobs)

    print "ret fields AFTER execution on cluster"
    for (i, job) in enumerate(processedFunctionJobs):
        print "Job #", i, "- ret: ", str(job.ret)[0:10]
def run_example_local_multithreading():
    """ run a set of jobs on local machine using several cores """
    print "====================================="
    print "====== Local Multithreading ======="
    print "====================================="
    print ""
    print ""

    print "generating function jobs"
    functionJobs = make_jobs()

    ## KybJob objects start out with an empty ret field, which is only filled after execution
    print "output ret field in each job before multithreaded computation"
    for (i, job) in enumerate(functionJobs):
        print "Job #", i, "- ret: ", job.ret
    print ""

    print "executing jobs on local machine using 1 thread"
    processedFunctionJobs = process_jobs(functionJobs, local=True, maxNumThreads=1)

    print "ret fields AFTER execution on local machine"
    for (i, job) in enumerate(processedFunctionJobs):
        print "Job #", i, "- ret: ", str(job.ret)[0:10]
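## both examples above call make_jobs(); a minimal sketch of it is given
## below, assuming pygrid's KybJob(function, args) constructor is imported
## alongside process_jobs. The compute_factorial payload is a hypothetical
## stand-in for any picklable function you want to distribute.
def compute_factorial(n):
    """ toy payload: compute n! iteratively """
    ret = 1
    for i in range(2, n + 1):
        ret *= i
    return ret

def make_jobs():
    """ create a small list of KybJob objects; ret stays empty until execution """
    inputs = [5, 10, 15]
    return [KybJob(compute_factorial, [n]) for n in inputs]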
def align_rnaseq_reads(yaml_config):
    """ wrapper for aligning rnaseq reads using STAR """
    operation_selected = "3"
    orgdb = expdb.experiment_db(yaml_config, operation_selected)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        lib_type = 'SE' if len(det['fastq']) == 1 else 'PE'

        ## library insert size
        lib_insert_size = 100000
        num_cpu = 3

        arg = [[det, lib_type, lib_insert_size, num_cpu]]

        job = pg.cBioJob(call_align_reads, arg)
        job.mem = "90gb"
        job.vmem = "90gb"
        job.pmem = "30gb"
        job.pvmem = "30gb"
        job.nodes = 1
        job.ppn = num_cpu
        job.walltime = "48:00:00"

        Jobs.append(job)

    print
    print "sending read alignment with STAR jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs, local=False)
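## call_align_reads is not shown in this section; a hedged sketch following
## the call_filter_genes wrapper pattern further below is given here. The
## rnaseq_align_assembly.star_align_rna module and its run_star() signature
## are assumptions, not confirmed API.
def call_align_reads(args_list):
    """ wrapper for submitting STAR alignment jobs to pygrid """
    from rnaseq_align_assembly import star_align_rna as align_tool  ## assumed module
    det, lib_type, lib_insert_size, num_cpu = args_list
    align_tool.run_star(det, lib_type, lib_insert_size, num_cpu)  ## assumed function
    return "done"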
def transcript_prediction_trsk(yaml_config):
    """ transcript prediction using TranscriptSkimmer """
    operation_selected = "4"
    orgdb = expdb.experiment_db(yaml_config, operation_selected)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [det]

        job = pg.cBioJob(call_transcript_prediction_trsk, arg)

        ## native specifications
        job.mem = "32gb"
        job.vmem = "32gb"
        job.pmem = "32gb"
        job.pvmem = "32gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "9:00:00"

        Jobs.append(job)

    print
    print "sending transcript assembly trsk jobs to worker"
    print
    local = True  ## cluster compute switch
    processedJobs = pg.process_jobs(Jobs, local=local)
def alignment_filter(yaml_config):
    """ run multimapper resolution program """
    operation_selected = "m"
    orgdb = expdb.experiment_db(yaml_config, operation_selected)

    Jobs = []
    for org_name, det in orgdb.items():
        num_cpus = 5

        ## arguments to pygrid
        arg = [[det['short_name'], det['read_map_dir'], num_cpus]]

        job = pg.cBioJob(call_alignment_filter, arg)

        ## native specifications
        job.mem = "90gb"
        job.vmem = "90gb"
        job.pmem = "90gb"
        job.pvmem = "90gb"
        job.nodes = 1
        job.ppn = num_cpus
        job.walltime = "48:00:00"

        Jobs.append(job)

    print
    print "sending multi map resolution jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
def transcript_prediction_cuff(yaml_config):
    """ transcript prediction using cufflinks """
    operation_selected = "c"
    orgdb = expdb.experiment_db(yaml_config, operation_selected)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [[det, 4]]

        job = pg.cBioJob(call_transcript_prediction_cuff, arg)

        ## native specifications
        job.mem = "96gb"
        job.vmem = "96gb"
        job.pmem = "24gb"
        job.pvmem = "24gb"
        job.nodes = 1
        job.ppn = 4
        job.walltime = "32:00:00"

        Jobs.append(job)

    print
    print "sending transcript assembly cufflinks jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
def create_genome_index(yaml_config):
    """ wrapper for calling genome index function """
    operation_selected = "2"
    orgdb = expdb.experiment_db(yaml_config, operation_selected)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        num_cpus = 4
        arg = [[det['fasta'], det['genome_index_dir'], det['gtf'], num_cpus, det['read_length'] - 1]]

        job = pg.cBioJob(call_genome_index, arg)
        job.mem = "46gb"
        job.vmem = "46gb"
        job.pmem = "46gb"
        job.pvmem = "46gb"
        job.nodes = 1
        job.ppn = num_cpus
        job.walltime = "24:00:00"

        Jobs.append(job)

    print
    print "sending star genome index jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
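## call_genome_index is not shown in this section; a hedged sketch is given
## below. The STAR genomeGenerate flags are standard CLI usage, but the
## pipeline's actual wrapper may differ.
def call_genome_index(args_list):
    """ wrapper for submitting STAR genome indexing jobs to pygrid """
    import subprocess
    fasta_file, index_dir, gtf_file, num_cpus, sj_overhang = args_list
    ## sj_overhang is read_length - 1, the value STAR recommends for --sjdbOverhang
    subprocess.check_call(["STAR", "--runMode", "genomeGenerate",
                           "--genomeDir", index_dir,
                           "--genomeFastaFiles", fasta_file,
                           "--sjdbGTFfile", gtf_file,
                           "--sjdbOverhang", str(sj_overhang),
                           "--runThreadN", str(num_cpus)])
    return "done"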
def download_sra_data(yaml_config):
    """ download sra file for the working organism """
    operation_selected = "1"
    orgdb = expdb.experiment_db(yaml_config, operation_selected)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [[det['sra_run_id'], det['fastq_path']]]

        job = pg.cBioJob(call_download_sra_file, arg)
        job.mem = "2gb"
        job.vmem = "2gb"
        job.pmem = "2gb"
        job.pvmem = "2gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "1:00:00"

        Jobs.append(job)

    print
    print "sending download SRA file jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
def transcript_prediction_stringtie(yaml_config):
    """ transcript prediction using StringTie """
    operation_selected = "5"
    orgdb = expdb.experiment_db(yaml_config, operation_selected)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [[det["read_map_dir"], det["short_name"], det["read_assembly_dir"]]]

        job = pg.cBioJob(call_transcript_prediction_stringtie, arg)

        cpus = 1
        ## native specifications
        job.mem = "12gb"
        job.vmem = "12gb"
        job.pmem = "12gb"
        job.pvmem = "12gb"
        job.nodes = 1
        job.ppn = cpus
        job.walltime = "24:00:00"

        Jobs.append(job)

    print("\nsending transcript assembly stringtie jobs to worker\n")
    local_compute = False  ## switching between local multithreading and cluster computing
    processedJobs = pg.process_jobs(Jobs, local=local_compute)
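## call_transcript_prediction_stringtie is not shown in this section; a
## hedged sketch is given below. The stringtie -o flag is standard CLI
## usage, but the BAM/GTF file naming here is an assumption about the
## pipeline's conventions.
def call_transcript_prediction_stringtie(args_list):
    """ wrapper for submitting StringTie assembly jobs to pygrid """
    import subprocess
    read_map_dir, short_name, read_assembly_dir = args_list
    bam_file = "%s/%s_Aligned.sortedByCoord.out.bam" % (read_map_dir, short_name)  ## assumed file name
    out_gtf = "%s/transcripts.gtf" % read_assembly_dir  ## assumed file name
    subprocess.check_call(["stringtie", bam_file, "-o", out_gtf])
    return "done"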
def filter_genes(yaml_config, data_method):
    """ filter out invalid gene models from the provided genome annotation """
    operation_selected = "f"
    orgdb = expdb.experiment_db(yaml_config, operation_selected)

    Jobs = []
    for org_name, det in orgdb.items():
        if data_method == "cufflinks":
            gff_file = "%s/transcripts.gtf" % det["read_assembly_dir"]  ## cufflinks run output file
            outFile = "%s/%s_cufflinks_genes.gff" % (det["read_assembly_dir"], org_name)  ## example: A_thaliana_cufflinks_genes.gff
        elif data_method == "trsk":
            gff_file = "%s/tmp_trsk_genes.gff" % det["read_assembly_dir"]  ## trsk run output file
            outFile = "%s/%s_trsk_genes.gff" % (det["read_assembly_dir"], org_name)  ## example: A_thaliana_trsk_genes.gff
        else:
            gff_file = det["gtf"]  ## public database genome annotation file
            outFile = "%s/%s_%s.gff" % (det["read_assembly_dir"], org_name, det["genome_release_db"])  ## example: A_thaliana_arabidopsis-tair10.gff

        ## arguments to pygrid
        arg = [[gff_file, det["fasta"], outFile]]

        job = pg.cBioJob(call_filter_genes, arg)

        ## native specifications
        job.mem = "6gb"
        job.vmem = "6gb"
        job.pmem = "6gb"
        job.pvmem = "6gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"

        Jobs.append(job)

    print
    print "sending filter gene models jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
def download_gtf(yaml_config):
    """ download gtf/gff file from remote data publishing services """
    operation_selected = "a"
    orgdb = expdb.experiment_db(yaml_config, operation_selected)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [[det['release_nb'], det['long_name'], det['genome_dir']]]

        if det['release_db'] == 'ensembl_metazoa_genome':
            job = pg.cBioJob(call_metazoa_gtf, arg)
        elif det['release_db'] == 'phytozome_genome':
            job = pg.cBioJob(call_phytozome_gtf, arg)
        elif det['release_db'] == 'ensembl_genome':
            job = pg.cBioJob(call_ensembl_gtf, arg)
        elif det['release_db'] == 'ensembl_fungi_genome':
            job = pg.cBioJob(call_fungi_gtf, arg)
        elif det['release_db'] == 'ensembl_protists_genome':
            job = pg.cBioJob(call_protists_gtf, arg)
        else:
            exit("error: download gtf plugin for %s not available, module works with ensembl_genome, ensembl_metazoa_genome, ensembl_fungi_genome, ensembl_protists_genome and phytozome_genome servers." % det['release_db'])

        job.mem = "2gb"
        job.vmem = "2gb"
        job.pmem = "2gb"
        job.pvmem = "2gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"

        Jobs.append(job)

    print
    print "sending gtf download job to worker"
    print
    local_compute = True
    processedJobs = pg.process_jobs(Jobs, local=local_compute)
def download_fasta(yaml_config):
    """ download fasta file from remote data publishing services """
    operation_selected = "g"
    orgdb = expdb.experiment_db(yaml_config, operation_selected)

    Jobs = []
    for org_name, det in orgdb.items():
        ## arguments to pygrid
        arg = [[det["release_nb"], det["long_name"], det["genome_dir"]]]

        if det["release_db"] == "ensembl_metazoa_genome":
            job = pg.cBioJob(call_metazoa_fasta, arg)
        elif det["release_db"] == "phytozome_genome":
            job = pg.cBioJob(call_phytozome_fasta, arg)
        elif det["release_db"] == "ensembl_genome":
            job = pg.cBioJob(call_ensembl_fasta, arg)
        elif det["release_db"] == "ensembl_fungi_genome":
            job = pg.cBioJob(call_fungi_fasta, arg)
        else:
            exit("error: download fasta plugin for %s not available, module works with ensembl_genome, ensembl_metazoa_genome, ensembl_fungi_genome and phytozome_genome servers." % det["release_db"])

        job.mem = "2gb"
        job.vmem = "2gb"
        job.pmem = "2gb"
        job.pvmem = "2gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"

        Jobs.append(job)

    print
    print "sending fasta download job to worker"
    print
    local_compute = True
    processedJobs = pg.process_jobs(Jobs, local=local_compute)
def main(yaml_config):
    """ distribute model training jobs for each organism in the config """
    config_map = yaml.safe_load(open(yaml_config, "rU"))
    exp_path_pfx = config_map['experiment_data_path']['dir']

    org_db = defaultdict()
    for ent in config_map['experiment']:
        species_name = ent['organism_name']
        genus, species = species_name.strip().split("_")
        short_name = "%s_%s" % (genus[0].upper(), species)

        org_db[short_name] = dict(short_name=short_name)
        org_db[short_name]['work_dir'] = "%s/%s/set_union_refix" % (exp_path_pfx, short_name)
        org_db[short_name]['data_dir'] = "%s/%s/set_2" % (exp_path_pfx, short_name)

    ## prepare jobs
    Jobs = []
    for org_name, det in org_db.items():
        arg = [[org_name, det['work_dir'], det['data_dir']]]

        job = pg.cBioJob(distribute_model_train, arg)
        job.mem = "4gb"
        job.vmem = "4gb"
        job.pmem = "4gb"
        job.pvmem = "4gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "2:00:00"

        Jobs.append(job)

    compute_local = True
    print "sending jobs to worker"
    processedJobs = pg.process_jobs(Jobs, local=compute_local)
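## an illustrative layout of the YAML file consumed by main(); only the
## experiment_data_path/dir and experiment/organism_name keys are read
## here, and the concrete values below are assumptions for illustration:
##
##   experiment_data_path:
##       dir: /data/experiments
##   experiment:
##       - organism_name: arabidopsis_thaliana
##       - organism_name: oryza_sativa
##
## organism_name is expected in genus_species form; "arabidopsis_thaliana"
## becomes the short name "A_thaliana" used to build the working paths.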
def decompose_sra_file(yaml_config):
    """ decompress the .sra file from ncbi sra """
    operation_selected = "d"
    orgdb = expdb.experiment_db(yaml_config, operation_selected)

    Jobs = []
    for org_name, det in orgdb.items():
        sra_file = "%s/%s.sra" % (det['fastq_path'], det['sra_run_id'])
        if not os.path.isfile(sra_file):  ## check that the file is present
            sys.exit("error: missing sequencing read file %s" % sra_file)

        ## TODO: consider exposing these as yaml config options
        library_type = "pe"
        compress_format = "gzip"

        ## arguments to pygrid
        arg = [[sra_file, det['fastq_path']]]

        job = pg.cBioJob(call_decompose_sra_file, arg)
        job.mem = "6gb"
        job.vmem = "6gb"
        job.pmem = "6gb"
        job.pvmem = "6gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "24:00:00"

        Jobs.append(job)

    print
    print "sending decompress SRA file jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)
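## call_decompose_sra_file is not shown in this section; a hedged sketch
## using NCBI's fastq-dump is given below. The --split-files/--gzip flags
## mirror the paired-end library type and gzip compression selected above,
## but the pipeline's actual tool invocation is an assumption.
def call_decompose_sra_file(args_list):
    """ wrapper for submitting sra decompression jobs to pygrid """
    import subprocess
    sra_file, out_dir = args_list
    ## --split-files writes mates to separate fastq files; -O sets the output directory
    subprocess.check_call(["fastq-dump", "--split-files", "--gzip",
                           "-O", out_dir, sra_file])
    return "done"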
        job = pg.cBioJob(call_fetch_db_signals, arg)

        ## native specifications
        job.mem = "5gb"
        job.vmem = "5gb"
        job.pmem = "5gb"
        job.pvmem = "5gb"
        job.nodes = 1
        job.ppn = 1
        job.walltime = "1:00:00"

        Jobs.append(job)

    print
    print "sending genomic signal fetch jobs to worker"
    print
    processedJobs = pg.process_jobs(Jobs)

def call_filter_genes(args_list):
    """ wrapper for submitting jobs to pygrid """
    from rnaseq_align_assembly import refine_transcript_models as filter_tool
    gtf_file, fasta_file, result_file = args_list
    filter_tool.filter_gene_models(gtf_file, fasta_file, result_file)
    return "done"