def run_phasedBam2bed(doe_csv, in_dir, out_dir, header_name_of_exp_id, extension, logs_dir, sort, execute): myos.check_if_directory_exists_create_it(out_dir) qname = 'regevlab' mem_usage = '5000' if extension is not None: bam_fn_dict = doe_reader.create_experiment_fns(doe_csv, header_name_of_exp_id, in_dir, extension) else: bam_fn_dict = doe_reader.create_experiment_fns(doe_csv, header_name_of_exp_id, in_dir, '.bam') for exp_name, bam_in_fn in bam_fn_dict.iteritems(): bsubcmd = myos.create_bsub_string_no_rm_logs_dir(logs_dir, exp_name, qname = qname, mem_usage = mem_usage) bed_p_fn = os.path.join(out_dir, exp_name+'.p.bed') bed_m_fn = os.path.join(out_dir, exp_name+'.m.bed') runcmd = execs_commands.phasedBam2bed(bam_in_fn, bed_p_fn, bed_m_fn) if sort: sort_p_cmd = execs_commands.bedops().sortbed(bed_p_fn, os.path.splitext(bed_p_fn)[0]+'.sorted.bed', '--max-mem 4G') rm_p_cmd = 'rm %s' %(bed_p_fn) sort_m_cmd = execs_commands.bedops().sortbed(bed_m_fn, os.path.splitext(bed_m_fn)[0]+'.sorted.bed', '--max-mem 4G') rm_m_cmd = 'rm %s' %(bed_m_fn) runcmd = runcmd+';'+sort_p_cmd+';'+rm_p_cmd+';'+sort_m_cmd+';'+rm_m_cmd fullcmd = bsubcmd+'\"'+runcmd+'\"' print fullcmd else: fullcmd = bsubcmd+'\"'+runcmd+'\"' print fullcmd if execute: os.system(fullcmd) return 0
def write_gemma_jobs(gemma_options, output_dir, logs_dir, input_dir, genotype_fn, execute):
    """Write one batched job script per group of phenotype files in input_dir,
    appending a gemma command per phenotype, and optionally submit each script.

    With batch_size = 1 every phenotype file becomes its own job script,
    named after the (extension-stripped) phenotype filename.
    Returns 0, including on the missing-input early exit.
    """
    qname = "serial_requeue"
    mem_usage = "1000"
    myos.check_if_directory_exists_create_it(output_dir)
    myos.check_if_directory_exists_create_it(logs_dir)
    batch_number = 1  # counter of batch
    batch_size = 1  # how many runs per batch
    job_number_within_batch = 0  # counter of job within batch
    pheno_files = os.listdir(input_dir)
    for pheno_n in pheno_files:
        pheno_fn = os.path.join(input_dir, pheno_n)
        # Start of a new batch: name the job after its first phenotype file and
        # build the bsub submission string (which also fixes the script path).
        if job_number_within_batch == 0:
            job_name = os.path.splitext(pheno_n)[0]
            bsubcmd = myos.write_bsub_string_no_rm_logs_dir(logs_dir, job_name, qname = qname, mem_usage = mem_usage, time = '1438')
        if os.path.exists(pheno_fn) is False or os.path.exists(genotype_fn) is False:
            print 'Oooops, One of these files to process does not exist!!! %s %s' %(pheno_fn, genotype_fn)
            return 0
        output_prefix = job_name
        exec_cmd = execs_commands.gemma(output_dir, gemma_options, pheno_fn, genotype_fn, output_prefix)
        print exec_cmd
        # NOTE(review): assumes the job-script path is the final space-separated
        # token of the bsub string -- confirm against
        # myos.write_bsub_string_no_rm_logs_dir.
        job_script_fn = bsubcmd.split(' ')[-1]
        with open(job_script_fn, "a") as job_script_f:
            print bsubcmd
            job_script_f.write('echo \"%s\"\n' %(exec_cmd))
            job_script_f.write(exec_cmd+'\n')
        job_number_within_batch += 1
        # Batch is full: submit it (when requested) and reset for the next one.
        if job_number_within_batch == batch_size:
            if execute:
                os.system(bsubcmd)
            batch_number += 1
            job_number_within_batch = 0
    # Submit a trailing partially-filled batch, if one remains.
    if job_number_within_batch > 0 and job_number_within_batch < batch_size:
        if execute:
            os.system(bsubcmd)
    return 0
def write_InferDPB_jobs(output_dir, logs_dir, input_dir, ncore='4', threadnum=4, execute=False): qname = "serial_requeue" mem_usage = "1000" #threadnum = 4 myos.check_if_directory_exists_create_it(output_dir) myos.check_if_directory_exists_create_it(logs_dir) #batch_number = 1 # counter of batch #batch_size = 1 # how many runs per batch #job_number_within_batch = 0 # counter of job within batch fn_array = np.array([0.1, 0.2, 0.4, 0.8, 0.9], float) fp_array = np.array([0.0001, 0.001, 0.002, 0.01], float) tmp = 1 for fn in fn_array: for fp in fp_array: tmp = tmp + 1 for i in range(1, 6): d2s = os.path.join(input_dir, 'Drug_Sub') p2d = os.path.join(input_dir, 'Protein_Domain') d2p = os.path.join(input_dir, 'Drug_Protein_' + str(i)) s2d_in = os.path.join(input_dir, 'Sub_Domain_' + str(i)) outname = 'Sub_Domain_Result_' + str(tmp) + '_' + str(i) job_name = 's2d_job' + '_' + str(tmp) + '_' + str(i) bsubcmd = myos.write_bsub_string_no_rm_logs_dir( logs_dir, job_name, qname=qname, mem_usage=mem_usage, ncores=ncore, time='1438') if os.path.exists(d2s) is False or os.path.exists( p2d) is False or os.path.exists( d2p) is False or os.path.exists(s2d_in) is False: print "Cannot find some input files!" return 0 exec_cmd = execs_commands.inferDPB_fnfp( output_dir, fn, fp, threadnum, d2s, p2d, d2p, s2d_in, outname) print exec_cmd job_script_fn = bsubcmd.split(' ')[-1] with open(job_script_fn, 'a') as job_script_f: print bsubcmd job_script_f.write('echo \"%s\"\n' % (exec_cmd)) job_script_f.write(exec_cmd + '\n') if execute: os.system(bsubcmd) return 0
def run_bedmap(in_reference_fn, doe_csv, header_name_of_exp_id, in_map_dir, extension, bedmap_options, out_dir, logs_dir, execute): myos.check_if_directory_exists_create_it(out_dir) qname = 'regevlab' mem_usage = '5000' bed_map_fn_dict = doe_reader.create_experiment_fns(doe_csv, header_name_of_exp_id, in_map_dir, extension) for exp_name, bed_map_fn in bed_map_fn_dict.iteritems(): bsubcmd = myos.create_bsub_string_no_rm_logs_dir(logs_dir, myos.basename_no_ext(bed_map_fn), qname = qname, mem_usage = mem_usage) out_fn = os.path.join(out_dir, myos.basename_no_ext(bed_map_fn)+'_mapped2_'+myos.basename_no_ext(in_reference_fn)+'.bedmap') runcmd = execs_commands.bedops().bedmap(bedmap_options, in_reference_fn, bed_map_fn, out_fn) fullcmd = bsubcmd+'\"'+runcmd+'\"' print fullcmd if execute: os.system(fullcmd) return 0
def run_cp_chris2rawdata(logs_dir, out_dir, doe_csv_fn, header_name_of_in_fn): qname = 'regevlab' mem_usage = '5000' dict_fq_fns = doe_reader.read_experiment_field(doe_csv_fn, 'name', header_name_of_in_fn) myos.remove_all_files_given_dir(out_dir) myos.check_if_directory_exists_create_it(out_dir) for exp_name, in_fn in dict_fq_fns.iteritems(): bsubcmd = myos.create_bsub_string_no_rm_logs_dir(logs_dir, exp_name, qname = qname, mem_usage = mem_usage) if os.path.splitext(in_fn)[1] == '.gz': cp_cmd = 'cp %s %s; gunzip %s' %(in_fn, os.path.join(out_dir, exp_name+'.fq.gz'), os.path.join(out_dir, exp_name+'.fq.gz')) else: cp_cmd = 'cp %s %s' %(in_fn, os.path.join(out_dir, exp_name+'.fq')) fullcmd = bsubcmd+'\"'+cp_cmd+'\"' print fullcmd os.system(fullcmd) return 0
def run_fastqc(doe_csv_fn, out_dir, logs_dir, header_name_of_in_fn, extension_name, in_dir, execute): myos.check_if_directory_exists_create_it(out_dir) qname = 'regevlab' mem_usage = '5000' if extension_name is not None: dict_fq_fns = doe_reader.create_experiment_fns(doe_csv_fn, 'name', in_dir, extension_name) elif header_name_of_in_fn is not None: dict_fq_fns = doe_reader.read_experiment_field(doe_csv_fn, 'name', header_name_of_in_fn) out_dir = out_dir dep_cmd = myos.load_dependencies_cmd(dependencies_list) for exp_name, in_fn in dict_fq_fns.iteritems(): bsubcmd = myos.create_bsub_string_no_rm_logs_dir(logs_dir, exp_name, qname = qname, mem_usage = mem_usage) runcmd = execs_commands.fastqc(in_fn, out_dir) fullcmd = bsubcmd+'\"'+runcmd+'\"' print fullcmd if execute: os.system(fullcmd) return 0
def write_gemma_jobs(gemma_options, output_dir, logs_dir, input_dir, genotype_fn, execute): qname = "serial_requeue" mem_usage = "1000" myos.check_if_directory_exists_create_it(output_dir) myos.check_if_directory_exists_create_it(logs_dir) batch_number = 1 # counter of batch batch_size = 1 # how many runs per batch job_number_within_batch = 0 # counter of job within batch pheno_files = os.listdir(input_dir) for pheno_n in pheno_files: pheno_fn = os.path.join(input_dir, pheno_n) if job_number_within_batch == 0: job_name = os.path.splitext(pheno_n)[0] bsubcmd = myos.write_bsub_string_no_rm_logs_dir( logs_dir, job_name, qname=qname, mem_usage=mem_usage, time='1438') if os.path.exists(pheno_fn) is False or os.path.exists( genotype_fn) is False: print 'Oooops, One of these files to process does not exist!!! %s %s' % ( pheno_fn, genotype_fn) return 0 output_prefix = job_name exec_cmd = execs_commands.gemma(output_dir, gemma_options, pheno_fn, genotype_fn, output_prefix) print exec_cmd job_script_fn = bsubcmd.split(' ')[-1] with open(job_script_fn, "a") as job_script_f: print bsubcmd job_script_f.write('echo \"%s\"\n' % (exec_cmd)) job_script_f.write(exec_cmd + '\n') job_number_within_batch += 1 if job_number_within_batch == batch_size: if execute: os.system(bsubcmd) batch_number += 1 job_number_within_batch = 0 if job_number_within_batch > 0 and job_number_within_batch < batch_size: if execute: os.system(bsubcmd) return 0
def write_InferDPB_jobs(output_dir, logs_dir, input_dir, ncore="4", threadnum=4, execute=False):
    """Sweep a grid of false-negative x false-positive rates and, for each of
    five replicate input sets, write an InferDPB job script and optionally
    submit it.

    NOTE(review): duplicate of an earlier write_InferDPB_jobs definition in
    this module; being defined later, this one wins at import time.
    Returns 0; exits early (also 0) if any required input file is missing.
    """
    qname = "serial_requeue"
    mem_usage = "1000"
    # threadnum = 4
    myos.check_if_directory_exists_create_it(output_dir)
    myos.check_if_directory_exists_create_it(logs_dir)
    # batch_number = 1  # counter of batch
    # batch_size = 1  # how many runs per batch
    # job_number_within_batch = 0  # counter of job within batch
    fn_array = np.array([0.1, 0.2, 0.4, 0.8, 0.9], float)
    fp_array = np.array([0.0001, 0.001, 0.002, 0.01], float)
    tmp = 1  # running (fn, fp) grid-cell index; the first cell gets 2
    for fn in fn_array:
        for fp in fp_array:
            tmp = tmp + 1
            for i in range(1, 6):
                d2s = os.path.join(input_dir, "Drug_Sub")
                p2d = os.path.join(input_dir, "Protein_Domain")
                d2p = os.path.join(input_dir, "Drug_Protein_" + str(i))
                s2d_in = os.path.join(input_dir, "Sub_Domain_" + str(i))
                outname = "Sub_Domain_Result_" + str(tmp) + "_" + str(i)
                job_name = "s2d_job" + "_" + str(tmp) + "_" + str(i)
                bsubcmd = myos.write_bsub_string_no_rm_logs_dir(
                    logs_dir, job_name, qname=qname, mem_usage=mem_usage, ncores=ncore, time="1438"
                )
                # All four inputs must exist before a job is emitted.
                if (
                    os.path.exists(d2s) is False
                    or os.path.exists(p2d) is False
                    or os.path.exists(d2p) is False
                    or os.path.exists(s2d_in) is False
                ):
                    print "Cannot find some input files!"
                    return 0
                exec_cmd = execs_commands.inferDPB_fnfp(output_dir, fn, fp, threadnum, d2s, p2d, d2p, s2d_in, outname)
                print exec_cmd
                # NOTE(review): assumes the job-script path is the final
                # space-separated token of the bsub string -- confirm against
                # myos.write_bsub_string_no_rm_logs_dir.
                job_script_fn = bsubcmd.split(" ")[-1]
                with open(job_script_fn, "a") as job_script_f:
                    print bsubcmd
                    job_script_f.write('echo "%s"\n' % (exec_cmd))
                    job_script_f.write(exec_cmd + "\n")
                if execute:
                    os.system(bsubcmd)
    return 0
def run_trim_galore(logs_dir, out_dir, doe_csv_fn, header_name_of_in_fn, header_name_of_adapter_seq, header_name_of_read_length, trim_galore_options, rm_shorter_than_space, tissue):
    """Submit one bsub trim_galore job per experiment fastq, with a per-experiment
    adapter sequence and a --length threshold derived from the read length.

    out_dir is emptied first; every generated command is printed, written to the
    logs directory via myos.write_fullcmd, and always executed (no dry-run flag).
    Returns 0.
    """
    # Extract the value following '--clip_R1' in the options string.
    # NOTE(review): assumes general.index_in_unique_list returns the position of
    # '--clip_R1' (and that the flag is present, exactly once, space-separated)
    # -- confirm; a missing flag would make this pick the wrong token or raise.
    trim_galore_options_list = trim_galore_options.split(' ')
    clip_R1_value_index = general.index_in_unique_list(trim_galore_options_list, '--clip_R1')+1
    print trim_galore_options_list[clip_R1_value_index]
    clip_R1_value = int(trim_galore_options_list[clip_R1_value_index])
    qname = 'regevlab'
    mem_usage = '5000'
    # Per-experiment inputs keyed by the 'name' column of the DOE CSV.
    dict_fq_fns = doe_reader.read_experiment_field(doe_csv_fn, 'name', header_name_of_in_fn)
    dict_adapter_seq = doe_reader.read_experiment_field(doe_csv_fn, 'name', header_name_of_adapter_seq)
    dict_read_length = doe_reader.read_experiment_field(doe_csv_fn, 'name', header_name_of_read_length)
    myos.remove_all_files_given_dir(out_dir)
    myos.check_if_directory_exists_create_it(out_dir)
    for exp_name, in_fn in dict_fq_fns.iteritems():
        adapter_seq = dict_adapter_seq[exp_name]
        read_length = int(dict_read_length[exp_name])
        bsubcmd = myos.create_bsub_string_no_rm_logs_dir(logs_dir, exp_name+'_'+tissue, qname = qname, mem_usage = mem_usage)
        # Minimum kept-read length = read length minus the clipped 5' bases
        # minus the rm_shorter_than_space slack.
        runcmd_tgf = execs_commands.trim_galore_filter(adapter_seq, trim_galore_options+' --length %s' %(read_length-clip_R1_value-rm_shorter_than_space), in_fn, out_dir)
        fullcmd = bsubcmd+'\"'+runcmd_tgf+'\"'
        print fullcmd
        myos.write_fullcmd(fullcmd, logs_dir, exp_name+'_'+tissue)
        os.system(fullcmd)
    return 0
def write_DECODE_jobs(logs_dir, input_dir, gene_residual_file, tissuenm, outdir, splitnum=20, execute=False): qname = "serial_requeue" mem_usage = "25000" myos.check_if_directory_exists_create_it(logs_dir) myos.check_if_directory_exists_create_it(outdir) genetotalnum = myos.wccount(input_dir + "genelocsnp") taskseq = splitinteger(genetotalnum, splitnum) for i in range(0, splitnum): job_name = 'gtex_decode' + '_' + str(taskseq[i][0]) + '_' + str( taskseq[i][1]) bsubcmd = myos.write_bsub_string_no_rm_logs_dir(logs_dir, job_name, qname=qname, mem_usage=mem_usage, time='300') if os.path.exists(gene_residual_file) is False: print "Cannot find some input files!" return 0 exec_cmd = execs_commands.gtex_decode(gene_residual_file, taskseq[i][0], taskseq[i][1], tissuenm, outdir) print exec_cmd job_script_fn = bsubcmd.split(' ')[-1] with open(job_script_fn, 'a') as job_script_f: print bsubcmd job_script_f.write('echo \"%s\"\n' % (exec_cmd)) job_script_f.write(exec_cmd + '\n') if execute: os.system(bsubcmd) return 0