Exemple #1
0
def run_phasedBam2bed(doe_csv, in_dir, out_dir, header_name_of_exp_id, extension, logs_dir, sort, execute):
    """Build, print and optionally submit one bsub job per experiment for phasedBam2bed.

    For every experiment returned by ``doe_reader.create_experiment_fns`` a
    shell command is assembled that runs ``execs_commands.phasedBam2bed`` with
    ``<out_dir>/<exp>.p.bed`` and ``<out_dir>/<exp>.m.bed`` as its two BED
    paths.  When ``sort`` is truthy, each BED file is additionally passed to
    ``bedops().sortbed`` (target ``<name>.sorted.bed``) and the unsorted file
    is removed afterwards.

    Args:
        doe_csv: design-of-experiment CSV handed to ``doe_reader``.
        in_dir: directory handed to ``doe_reader`` to locate the input files.
        out_dir: directory for the BED paths; created through ``myos`` if needed.
        header_name_of_exp_id: CSV column name handed to ``doe_reader``.
        extension: input file extension; ``None`` falls back to ``'.bam'``.
        logs_dir: logs directory handed to the bsub string helper.
        sort: when truthy, append the sort + cleanup steps.
        execute: when truthy, run each full command with ``os.system``.

    Returns:
        Always ``0``.
    """
    myos.check_if_directory_exists_create_it(out_dir)
    qname = 'regevlab'
    mem_usage = '5000'
    # Fall back to plain BAM files when the caller gives no extension.
    bam_extension = extension if extension is not None else '.bam'
    bam_fn_dict = doe_reader.create_experiment_fns(doe_csv, header_name_of_exp_id, in_dir, bam_extension)
    for exp_name, bam_in_fn in bam_fn_dict.iteritems():
        bsubcmd = myos.create_bsub_string_no_rm_logs_dir(logs_dir, exp_name, qname = qname, mem_usage = mem_usage)
        bed_p_fn = os.path.join(out_dir, exp_name+'.p.bed')
        bed_m_fn = os.path.join(out_dir, exp_name+'.m.bed')
        steps = [execs_commands.phasedBam2bed(bam_in_fn, bed_p_fn, bed_m_fn)]
        if sort:
            for bed_fn in (bed_p_fn, bed_m_fn):
                sorted_fn = os.path.splitext(bed_fn)[0]+'.sorted.bed'
                sort_cmd = execs_commands.bedops().sortbed(bed_fn, sorted_fn, '--max-mem 4G')
                # '&&' instead of ';': only delete the unsorted file once the
                # sort succeeded, otherwise a failed sort would destroy the
                # only copy of the data.
                steps.append('%s && rm %s' % (sort_cmd, bed_fn))
        # The whole pipeline is handed to bsub as one double-quoted string.
        fullcmd = bsubcmd+'\"'+';'.join(steps)+'\"'
        print(fullcmd)
        if execute:
            os.system(fullcmd)
    return 0
Exemple #2
0
def write_gemma_jobs(gemma_options, output_dir, logs_dir, input_dir, genotype_fn, execute):
    """Append one gemma command per phenotype file to a job script and optionally submit it.

    Every entry of ``os.listdir(input_dir)`` is treated as a phenotype file.
    Jobs are grouped in batches of ``runs_per_batch`` (fixed to 1 here): the
    first file of a batch names the job and triggers
    ``myos.write_bsub_string_no_rm_logs_dir``; the last whitespace-separated
    token of the returned string is used as the job script path, to which the
    gemma command (preceded by an ``echo`` of it) is appended.

    Args:
        gemma_options: options forwarded to ``execs_commands.gemma``.
        output_dir: output directory, created through ``myos`` if needed.
        logs_dir: logs directory, created through ``myos`` if needed.
        input_dir: directory whose entries are the phenotype files.
        genotype_fn: genotype file forwarded to ``execs_commands.gemma``.
        execute: when truthy, run the bsub command with ``os.system``.

    Returns:
        Always ``0``, including the early exit taken when the phenotype or
        genotype file is missing at the start of a batch.
    """
    queue = "serial_requeue"
    memory = "1000"
    myos.check_if_directory_exists_create_it(output_dir)
    myos.check_if_directory_exists_create_it(logs_dir)
    runs_per_batch = 1  # how many runs share one job script
    queued_in_batch = 0  # runs already appended to the current job script
    for pheno_basename in os.listdir(input_dir):
        pheno_path = os.path.join(input_dir, pheno_basename)
        if queued_in_batch == 0:
            # First run of a batch: it names the job and creates the bsub string.
            job_name = os.path.splitext(pheno_basename)[0]
            bsubcmd = myos.write_bsub_string_no_rm_logs_dir(logs_dir, job_name, qname=queue, mem_usage=memory, time='1438')
            if not (os.path.exists(pheno_path) and os.path.exists(genotype_fn)):
                print('Oooops, One of these files to process does not exist!!! %s %s' % (pheno_path, genotype_fn))
                return 0
        gemma_cmd = execs_commands.gemma(output_dir, gemma_options, pheno_path, genotype_fn, job_name)
        print(gemma_cmd)
        # The job script path is the last token of the bsub command string.
        script_path = bsubcmd.split(' ')[-1]
        with open(script_path, "a") as script:
            print(bsubcmd)
            script.write('echo "%s"\n' % (gemma_cmd) + gemma_cmd + '\n')
            queued_in_batch += 1
        if queued_in_batch == runs_per_batch:
            if execute:
                os.system(bsubcmd)
            queued_in_batch = 0
    # Submit a trailing, partially filled batch.
    if 0 < queued_in_batch < runs_per_batch and execute:
        os.system(bsubcmd)
    return 0
def write_InferDPB_jobs(output_dir,
                        logs_dir,
                        input_dir,
                        ncore='4',
                        threadnum=4,
                        execute=False):
    """Write (and optionally submit) InferDPB jobs over a grid of fn/fp values.

    For each (fn, fp) pair of the two hard-coded arrays and each index 1..5,
    a bsub string is obtained from ``myos.write_bsub_string_no_rm_logs_dir``
    and the command built by ``execs_commands.inferDPB_fnfp`` is appended to
    the job script, whose path is the last whitespace-separated token of the
    bsub string.

    Args:
        output_dir: output directory, created through ``myos`` if needed.
        logs_dir: logs directory, created through ``myos`` if needed.
        input_dir: directory holding ``Drug_Sub``, ``Protein_Domain``,
            ``Drug_Protein_<i>`` and ``Sub_Domain_<i>``.
        ncore: forwarded to the bsub helper as ``ncores``.
        threadnum: forwarded to ``execs_commands.inferDPB_fnfp``.
        execute: when truthy, run each bsub command with ``os.system``.

    Returns:
        Always ``0``, including the early exit taken when an input is missing.
    """
    queue = "serial_requeue"
    memory = "1000"
    myos.check_if_directory_exists_create_it(output_dir)
    myos.check_if_directory_exists_create_it(logs_dir)

    fn_values = np.array([0.1, 0.2, 0.4, 0.8, 0.9], float)
    fp_values = np.array([0.0001, 0.001, 0.002, 0.01], float)
    # These two inputs are shared by every job.
    drug_sub = os.path.join(input_dir, 'Drug_Sub')
    protein_domain = os.path.join(input_dir, 'Protein_Domain')
    # Incremented before first use, so the first (fn, fp) pair is labelled 2.
    combo_id = 1
    for fn in fn_values:
        for fp in fp_values:
            combo_id += 1
            for i in range(1, 6):
                drug_protein = os.path.join(input_dir, 'Drug_Protein_%d' % i)
                sub_domain = os.path.join(input_dir, 'Sub_Domain_%d' % i)
                result_name = 'Sub_Domain_Result_%d_%d' % (combo_id, i)
                job_name = 's2d_job_%d_%d' % (combo_id, i)
                bsubcmd = myos.write_bsub_string_no_rm_logs_dir(
                    logs_dir,
                    job_name,
                    qname=queue,
                    mem_usage=memory,
                    ncores=ncore,
                    time='1438')
                required = (drug_sub, protein_domain, drug_protein, sub_domain)
                if not all(os.path.exists(path) for path in required):
                    print("Cannot find some input files!")
                    return 0
                infer_cmd = execs_commands.inferDPB_fnfp(
                    output_dir, fn, fp, threadnum, drug_sub, protein_domain,
                    drug_protein, sub_domain, result_name)
                print(infer_cmd)
                # The job script path is the last token of the bsub string.
                script_path = bsubcmd.split(' ')[-1]
                with open(script_path, 'a') as script:
                    print(bsubcmd)
                    script.write('echo "%s"\n' % (infer_cmd))
                    script.write(infer_cmd + '\n')
                if execute:
                    os.system(bsubcmd)
    return 0
Exemple #4
0
def run_bedmap(in_reference_fn, doe_csv, header_name_of_exp_id, in_map_dir, extension, bedmap_options, out_dir, logs_dir, execute):
    """Build, print and optionally submit one bsub bedmap job per experiment.

    For each map file returned by ``doe_reader.create_experiment_fns`` the
    command from ``execs_commands.bedops().bedmap`` is wrapped in a bsub
    string; the job is named after the map file and the output path is
    ``<out_dir>/<map>_mapped2_<reference>.bedmap`` (both names taken from
    ``myos.basename_no_ext``).

    Args:
        in_reference_fn: reference file forwarded to bedmap.
        doe_csv: design-of-experiment CSV handed to ``doe_reader``.
        header_name_of_exp_id: CSV column name handed to ``doe_reader``.
        in_map_dir: directory handed to ``doe_reader`` to locate map files.
        extension: map file extension handed to ``doe_reader``.
        bedmap_options: options forwarded to bedmap.
        out_dir: output directory, created through ``myos`` if needed.
        logs_dir: logs directory handed to the bsub string helper.
        execute: when truthy, run each full command with ``os.system``.

    Returns:
        Always ``0``.
    """
    myos.check_if_directory_exists_create_it(out_dir)
    queue = 'regevlab'
    memory = '5000'
    map_fns_by_exp = doe_reader.create_experiment_fns(doe_csv, header_name_of_exp_id, in_map_dir, extension)
    for _exp_name, map_fn in map_fns_by_exp.iteritems():
        map_label = myos.basename_no_ext(map_fn)
        submit_prefix = myos.create_bsub_string_no_rm_logs_dir(logs_dir, map_label, qname=queue, mem_usage=memory)
        out_basename = ''.join([map_label, '_mapped2_', myos.basename_no_ext(in_reference_fn), '.bedmap'])
        out_path = os.path.join(out_dir, out_basename)
        bedmap_cmd = execs_commands.bedops().bedmap(bedmap_options, in_reference_fn, map_fn, out_path)
        # The command is handed to bsub as one double-quoted string.
        fullcmd = submit_prefix + '"' + bedmap_cmd + '"'
        print(fullcmd)
        if execute:
            os.system(fullcmd)
    return 0
Exemple #5
0
def run_cp_chris2rawdata(logs_dir, out_dir, doe_csv_fn, header_name_of_in_fn):
    """Submit one bsub copy job per experiment, gunzipping ``.gz`` inputs.

    The source path of each experiment is read from the design CSV via
    ``doe_reader.read_experiment_field``.  Inputs whose extension is ``.gz``
    are copied to ``<out_dir>/<exp>.fq.gz`` and then gunzipped; everything
    else is copied to ``<out_dir>/<exp>.fq``.  Unlike its siblings this
    function has no ``execute`` switch: every command is run immediately.

    Args:
        logs_dir: logs directory handed to the bsub string helper.
        out_dir: destination directory; ``myos.remove_all_files_given_dir``
            is called on it before it is (re)created.
        doe_csv_fn: design-of-experiment CSV.
        header_name_of_in_fn: CSV column holding the source file paths.

    Returns:
        Always ``0``.
    """
    queue = 'regevlab'
    memory = '5000'
    source_fns = doe_reader.read_experiment_field(doe_csv_fn, 'name', header_name_of_in_fn)
    myos.remove_all_files_given_dir(out_dir)
    myos.check_if_directory_exists_create_it(out_dir)
    for exp_name, src_fn in source_fns.iteritems():
        submit_prefix = myos.create_bsub_string_no_rm_logs_dir(logs_dir, exp_name, qname=queue, mem_usage=memory)
        is_gzipped = os.path.splitext(src_fn)[1] == '.gz'
        if is_gzipped:
            dest_fn = os.path.join(out_dir, exp_name + '.fq.gz')
            copy_cmd = 'cp %s %s; gunzip %s' % (src_fn, dest_fn, dest_fn)
        else:
            dest_fn = os.path.join(out_dir, exp_name + '.fq')
            copy_cmd = 'cp %s %s' % (src_fn, dest_fn)
        # The command is handed to bsub as one double-quoted string.
        fullcmd = submit_prefix + '"' + copy_cmd + '"'
        print(fullcmd)
        os.system(fullcmd)
    return 0
Exemple #6
0
def run_fastqc(doe_csv_fn, out_dir, logs_dir, header_name_of_in_fn, extension_name, in_dir, execute):
    """Build, print and optionally submit one bsub fastqc job per experiment.

    The input files come from the design CSV in one of two ways:
    ``extension_name`` (takes precedence) makes ``doe_reader`` build the file
    names from ``in_dir``; otherwise ``header_name_of_in_fn`` names the CSV
    column that holds the paths.

    Args:
        doe_csv_fn: design-of-experiment CSV.
        out_dir: output directory forwarded to ``execs_commands.fastqc``;
            created through ``myos`` if needed.
        logs_dir: logs directory handed to the bsub string helper.
        header_name_of_in_fn: CSV column holding input paths, or ``None``.
        extension_name: input file extension, or ``None``.
        in_dir: directory used together with ``extension_name``.
        execute: when truthy, run each full command with ``os.system``.

    Returns:
        Always ``0``.

    Raises:
        ValueError: if both ``extension_name`` and ``header_name_of_in_fn``
            are ``None`` (previously this surfaced as an unbound-variable
            error further down).
    """
    # Validate before touching the file system: without either selector there
    # is no way to find the input files.
    if extension_name is None and header_name_of_in_fn is None:
        raise ValueError('run_fastqc needs either extension_name or header_name_of_in_fn')
    myos.check_if_directory_exists_create_it(out_dir)
    qname = 'regevlab'
    mem_usage = '5000'
    if extension_name is not None:
        dict_fq_fns = doe_reader.create_experiment_fns(doe_csv_fn, 'name', in_dir, extension_name)
    else:
        dict_fq_fns = doe_reader.read_experiment_field(doe_csv_fn, 'name', header_name_of_in_fn)
    # NOTE(review): the original bound this result to an unused local; it was
    # presumably meant to be prepended to the fastqc command - confirm.  The
    # call is kept so any effect of the helper is preserved.
    myos.load_dependencies_cmd(dependencies_list)
    for exp_name, in_fn in dict_fq_fns.iteritems():
        bsubcmd = myos.create_bsub_string_no_rm_logs_dir(logs_dir, exp_name, qname = qname, mem_usage = mem_usage)
        runcmd = execs_commands.fastqc(in_fn, out_dir)
        # The command is handed to bsub as one double-quoted string.
        fullcmd = bsubcmd+'\"'+runcmd+'\"'
        print(fullcmd)
        if execute:
            os.system(fullcmd)
    return 0
Exemple #7
0
def write_gemma_jobs(gemma_options, output_dir, logs_dir, input_dir,
                     genotype_fn, execute):
    """Queue one gemma run per file found in ``input_dir``.

    Runs are grouped into batches (batch size fixed to 1).  The first file of
    each batch provides the job name and triggers
    ``myos.write_bsub_string_no_rm_logs_dir``; the gemma command for every
    file is appended (after an ``echo`` of itself) to the job script whose
    path is the last whitespace-separated token of that bsub string.  A full
    batch is submitted with ``os.system`` when ``execute`` is truthy.

    Args:
        gemma_options: options forwarded to ``execs_commands.gemma``.
        output_dir: output directory, created through ``myos`` if needed.
        logs_dir: logs directory, created through ``myos`` if needed.
        input_dir: directory listing the phenotype files.
        genotype_fn: genotype file forwarded to ``execs_commands.gemma``.
        execute: when truthy, submit the bsub commands.

    Returns:
        Always ``0``; also returned early when the phenotype or genotype file
        does not exist at the start of a batch.
    """

    def submit_if_requested(command):
        # Only hand the job to the scheduler when the caller asked for it.
        if execute:
            os.system(command)

    myos.check_if_directory_exists_create_it(output_dir)
    myos.check_if_directory_exists_create_it(logs_dir)
    batch_capacity = 1  # how many runs per batch
    pending = 0  # runs written to the current batch so far
    for entry in os.listdir(input_dir):
        pheno_path = os.path.join(input_dir, entry)
        if pending == 0:
            job_name = os.path.splitext(entry)[0]
            bsubcmd = myos.write_bsub_string_no_rm_logs_dir(
                logs_dir,
                job_name,
                qname="serial_requeue",
                mem_usage="1000",
                time='1438')
            pheno_missing = not os.path.exists(pheno_path)
            if pheno_missing or not os.path.exists(genotype_fn):
                print(
                    'Oooops, One of these files to process does not exist!!! %s %s'
                    % (pheno_path, genotype_fn))
                return 0
        gemma_cmd = execs_commands.gemma(output_dir, gemma_options,
                                         pheno_path, genotype_fn, job_name)
        print(gemma_cmd)
        # The job script path is the trailing token of the bsub string.
        script_path = bsubcmd.split(' ')[-1]
        with open(script_path, "a") as script:
            print(bsubcmd)
            for line in ('echo "%s"\n' % (gemma_cmd), gemma_cmd + '\n'):
                script.write(line)
            pending += 1
        if pending == batch_capacity:
            submit_if_requested(bsubcmd)
            pending = 0
    # Flush a trailing batch that never reached full capacity.
    if 0 < pending < batch_capacity:
        submit_if_requested(bsubcmd)
    return 0
Exemple #8
0
def write_InferDPB_jobs(output_dir, logs_dir, input_dir, ncore="4", threadnum=4, execute=False):
    """Create one InferDPB job per (fn, fp, index) combination and optionally submit it.

    The fn and fp values come from two hard-coded arrays; the index runs
    from 1 to 5.  For each combination the bsub string is requested from
    ``myos.write_bsub_string_no_rm_logs_dir`` and the command built by
    ``execs_commands.inferDPB_fnfp`` is appended to the job script named by
    the last whitespace-separated token of that string.

    Args:
        output_dir: output directory, created through ``myos`` if needed.
        logs_dir: logs directory, created through ``myos`` if needed.
        input_dir: directory with ``Drug_Sub``, ``Protein_Domain``,
            ``Drug_Protein_<i>`` and ``Sub_Domain_<i>``.
        ncore: forwarded to the bsub helper as ``ncores``.
        threadnum: forwarded to ``execs_commands.inferDPB_fnfp``.
        execute: when truthy, run each bsub command with ``os.system``.

    Returns:
        Always ``0``; also returned early as soon as one input is missing.
    """
    myos.check_if_directory_exists_create_it(output_dir)
    myos.check_if_directory_exists_create_it(logs_dir)

    fn_grid = np.array([0.1, 0.2, 0.4, 0.8, 0.9], float)
    fp_grid = np.array([0.0001, 0.001, 0.002, 0.01], float)
    pair_label = 1  # bumped before first use: labels start at 2
    for fn in fn_grid:
        for fp in fp_grid:
            pair_label = pair_label + 1
            for i in range(1, 6):
                inputs = [
                    os.path.join(input_dir, "Drug_Sub"),
                    os.path.join(input_dir, "Protein_Domain"),
                    os.path.join(input_dir, "Drug_Protein_{0}".format(i)),
                    os.path.join(input_dir, "Sub_Domain_{0}".format(i)),
                ]
                outname = "Sub_Domain_Result_{0}_{1}".format(pair_label, i)
                bsubcmd = myos.write_bsub_string_no_rm_logs_dir(
                    logs_dir,
                    "s2d_job_{0}_{1}".format(pair_label, i),
                    qname="serial_requeue",
                    mem_usage="1000",
                    ncores=ncore,
                    time="1438",
                )
                # Stop at the first missing input, checked in argument order.
                for path in inputs:
                    if not os.path.exists(path):
                        print("Cannot find some input files!")
                        return 0
                d2s, p2d, d2p, s2d_in = inputs
                infer_cmd = execs_commands.inferDPB_fnfp(output_dir, fn, fp, threadnum, d2s, p2d, d2p, s2d_in, outname)
                print(infer_cmd)
                # The job script path is the trailing token of the bsub string.
                script_path = bsubcmd.split(" ")[-1]
                with open(script_path, "a") as script:
                    print(bsubcmd)
                    script.write('echo "%s"\n' % (infer_cmd))
                    script.write(infer_cmd + "\n")
                if execute:
                    os.system(bsubcmd)
    return 0
Exemple #9
0
def run_trim_galore(logs_dir, out_dir, doe_csv_fn, header_name_of_in_fn, header_name_of_adapter_seq, header_name_of_read_length, trim_galore_options, rm_shorter_than_space, tissue):
    """Submit one bsub trim_galore job per experiment listed in the design CSV.

    The value following ``--clip_R1`` in ``trim_galore_options`` is parsed as
    an integer; for each experiment a ``--length`` option equal to
    ``read_length - clip_R1 - rm_shorter_than_space`` is appended to the
    options.  Each full command is printed, recorded with
    ``myos.write_fullcmd`` and run immediately (there is no ``execute``
    switch).

    Args:
        logs_dir: logs directory handed to the ``myos`` helpers.
        out_dir: output directory; ``myos.remove_all_files_given_dir`` is
            called on it before it is (re)created.
        doe_csv_fn: design-of-experiment CSV.
        header_name_of_in_fn: CSV column with the input file paths.
        header_name_of_adapter_seq: CSV column with the adapter sequences.
        header_name_of_read_length: CSV column with the read lengths.
        trim_galore_options: space-separated option string; its
            ``--clip_R1`` value is read here.
        rm_shorter_than_space: subtracted from the read length (used in
            integer arithmetic).
        tissue: suffix appended to the experiment name to form the job name.

    Returns:
        Always ``0``.
    """
    option_tokens = trim_galore_options.split(' ')
    # The clip value is the token right after the '--clip_R1' flag.
    clip_value_pos = general.index_in_unique_list(option_tokens, '--clip_R1') + 1
    clip_token = option_tokens[clip_value_pos]
    print(clip_token)
    clip_r1 = int(clip_token)
    queue = 'regevlab'
    memory = '5000'
    fq_by_exp = doe_reader.read_experiment_field(doe_csv_fn, 'name', header_name_of_in_fn)
    adapter_by_exp = doe_reader.read_experiment_field(doe_csv_fn, 'name', header_name_of_adapter_seq)
    length_by_exp = doe_reader.read_experiment_field(doe_csv_fn, 'name', header_name_of_read_length)
    myos.remove_all_files_given_dir(out_dir)
    myos.check_if_directory_exists_create_it(out_dir)
    for exp_name, in_fn in fq_by_exp.iteritems():
        adapter = adapter_by_exp[exp_name]
        read_length = int(length_by_exp[exp_name])
        job_label = exp_name + '_' + tissue
        submit_prefix = myos.create_bsub_string_no_rm_logs_dir(logs_dir, job_label, qname=queue, mem_usage=memory)
        min_length = read_length - clip_r1 - rm_shorter_than_space
        options_with_length = trim_galore_options + ' --length %s' % (min_length)
        trim_cmd = execs_commands.trim_galore_filter(adapter, options_with_length, in_fn, out_dir)
        # The command is handed to bsub as one double-quoted string.
        fullcmd = submit_prefix + '"' + trim_cmd + '"'
        print(fullcmd)
        myos.write_fullcmd(fullcmd, logs_dir, job_label)
        os.system(fullcmd)
    return 0
Exemple #10
0
def write_DECODE_jobs(logs_dir,
                      input_dir,
                      gene_residual_file,
                      tissuenm,
                      outdir,
                      splitnum=20,
                      execute=False):
    """Write (and optionally submit) ``splitnum`` gtex_decode jobs.

    The count returned by ``myos.wccount`` for ``<input_dir>/genelocsnp`` is
    split by ``splitinteger`` into ``splitnum`` entries; the first two items
    of each entry are forwarded to ``execs_commands.gtex_decode`` and used in
    the job name.  The command is appended to the job script whose path is
    the last whitespace-separated token of the string returned by
    ``myos.write_bsub_string_no_rm_logs_dir``.

    Args:
        logs_dir: logs directory, created through ``myos`` if needed.
        input_dir: directory containing the ``genelocsnp`` file; a trailing
            path separator is no longer required.
        gene_residual_file: input file forwarded to ``gtex_decode``; must
            exist.
        tissuenm: forwarded to ``gtex_decode``.
        outdir: output directory, created through ``myos`` if needed.
        splitnum: number of jobs to create.
        execute: when truthy, run each bsub command with ``os.system``.

    Returns:
        Always ``0``; also returned early when ``gene_residual_file`` is
        missing.
    """
    qname = "serial_requeue"
    mem_usage = "25000"
    myos.check_if_directory_exists_create_it(logs_dir)
    myos.check_if_directory_exists_create_it(outdir)
    # The residual file is the same for every job: check it once, before any
    # job script is created, instead of after the first one.
    if not os.path.exists(gene_residual_file):
        print("Cannot find some input files!")
        return 0
    # os.path.join instead of string concatenation, so input_dir works with
    # or without a trailing separator.
    genetotalnum = myos.wccount(os.path.join(input_dir, "genelocsnp"))
    taskseq = splitinteger(genetotalnum, splitnum)
    for i in range(splitnum):
        first, second = taskseq[i][0], taskseq[i][1]
        job_name = 'gtex_decode_%s_%s' % (first, second)
        bsubcmd = myos.write_bsub_string_no_rm_logs_dir(logs_dir,
                                                        job_name,
                                                        qname=qname,
                                                        mem_usage=mem_usage,
                                                        time='300')
        exec_cmd = execs_commands.gtex_decode(gene_residual_file, first,
                                              second, tissuenm, outdir)
        print(exec_cmd)
        # The job script path is the last token of the bsub string.
        job_script_fn = bsubcmd.split(' ')[-1]
        with open(job_script_fn, 'a') as job_script_f:
            print(bsubcmd)
            job_script_f.write('echo \"%s\"\n' % (exec_cmd))
            job_script_f.write(exec_cmd + '\n')
        if execute:
            os.system(bsubcmd)
    return 0