Example #1
0
def read_clust_conf():
    """Load the cluster job-submission settings for the current user.

    Reads the cluster name from ~/picopili.conf, then parses the matching
    <cluster>.conf file from the package's cluster_templates directory.

    Returns: dict of cluster settings as produced by read_conf().
    """

    import os

    # which cluster is this user configured for?
    user_conf = read_conf(os.environ['HOME'] + "/picopili.conf")
    cluster_name = user_conf['cluster']

    # template directory sits one level above the directory holding this file
    script_dir = os.path.dirname(os.path.realpath(__file__))
    template_dir = os.path.dirname(script_dir) + '/cluster_templates'

    assert os.path.isdir(template_dir), \
        "Unable to find cluster job submission template directory %s" % str(template_dir)

    # load queue configuration info
    # - submission syntax, queue names, job holds
    return read_conf(str(template_dir) + '/' + str(cluster_name) + '.conf')
Example #2
0
# assemble the shapeit command line for one chromosome
# (chrstem/map_arg/hap_arg/leg_arg/samp_arg/duo_txt/outstem are set earlier
# in this script; duo_txt is presumably an optional pre-built flag string —
# TODO confirm against the code that defines it)
shape_call = [
    shapeit_ex, '--input-bed', chrstem + '.bed', chrstem + '.bim',
    chrstem + '.fam', '--input-map', map_arg, '--input-ref', hap_arg, leg_arg,
    samp_arg, '--window',
    str(args.window),
    str(duo_txt), '--thread',
    str(args.threads), '--seed',
    str(args.shape_seed), '--output-max', outstem + '.phased.haps',
    outstem + '.phased.sample', '--output-log', outstem + '.shape.log'
]

# echo the full command for the run log
print ' '.join(shape_call) + '\n'

# setup naming from task index
# read the cluster's task-index placeholder (log_task_id) so per-chromosome
# logs get distinct names when submitted as a 22-task array below
configs = read_conf(os.environ['HOME'] + '/picopili.conf')
clust_confdir = os.path.dirname(str(rp_bin)) + '/cluster_templates/'
clust_conf = read_conf(clust_confdir + str(configs['cluster'] + '.conf'))
task_id = str(clust_conf['log_task_id'])

# submit
# one array job with 22 tasks (one per autosome); memory request is
# converted from GB (args.mem_req) to MB for send_job
jobres = send_job(jobname='shape.' + str(outdot),
                  cmd=' '.join(shape_call),
                  logname='shape.' + str(outdot) + '.chr' + task_id +
                  '.sub.log',
                  mem=int(args.mem_req) * 1000,
                  walltime=30,
                  njobs=22,
                  threads=int(args.threads),
                  sleep=str(args.sleep))
Example #3
0
# echo remaining parsed arguments for the run log
print '--chr_info_file '+str(args.chr_info_file)

print '\nCluster settings:'
print '--sleep '+str(args.sleep)
if args.full_pipe:
    print '--full-pipe'


#############
print '\n...Checking dependencies...'
#############

# get cluster configuration
# needed for specifying logfile names with clust_conf['log_task_id']
conf_file = os.environ['HOME']+"/picopili.conf"
configs = read_conf(conf_file)
cluster = configs['cluster']
clust_conf = read_clust_conf()

# from config
# resolve impute2/shapeit executables via the configured install locations
impute_ex = find_exec('impute2',key='i2loc')
shapeit_ex = find_exec('shapeit',key='shloc')

# get directory containing current script
# (to get absolute path for scripts)
rp_bin = os.path.dirname(os.path.realpath(__file__))
chunker_ex = rp_bin+'/chunk_snps.py'
test_exec(chunker_ex,'picopili chunking script')
if args.ref_dir is not None:
	# verify exists
Example #4
0
def send_job(
        jobname,
        arrayfile=None,
        cmd=None,
        logname=None,
        logloc=None,
        mem=None,
        walltime=None,
        #             week=None,
        njobs=None,
        maxpar=10000,
        threads=None,
        wait_file=None,
        wait_name=None,
        wait_num=None,
        cluster=None,
        sleep=30,
        testonly=False,
        forcearray=False):
    """Build and submit a cluster job (single job or task array).

    Fills the cluster-specific submission template (<cluster>.sub.sh from the
    package's cluster_templates directory) with the job settings, writes
    <jobname>.sub.sh in the current working directory, post-processes the
    optional template sections with sed, and launches the script with the
    cluster's submit command.

    Args:
        jobname: base name used for the job, the generated script and the log.
        arrayfile: file with one command per line, run as a task array
            (mutually exclusive with cmd).
        cmd: single command string to run (mutually exclusive with arrayfile);
            may contain a {task} placeholder when run as an array.
        logname: log file name; derived from jobname if None.
        logloc: directory for the log file (default: cwd; created if missing).
        mem: memory request in MB (default 2000).
        walltime: wall time in hours; selects the queue (1h/2h/4h/day/long).
        njobs: number of tasks; >1 (or forcearray) triggers array-job setup.
        maxpar: max tasks running in parallel (values < 1 reset to 10000).
        threads: threads per task (default 1).
        wait_file: file whose first line names a job to hold on.
        wait_name: name of a job to hold on (takes precedence over wait_file).
        wait_num: job number substituted into the cluster's hold flag.
        cluster: cluster name; read from ~/picopili.conf if None.
        sleep: value substituted for the template's sleep placeholder.
        testonly: if True, write and print the submission but do not run it.
        forcearray: treat the job as an array even when njobs <= 1.

    Returns:
        Submit command stdout on success, or 0 if testonly.

    Raises:
        ValueError: if neither or both of arrayfile/cmd are given.
        EnvironmentError: if the submit command exits non-zero.
    """

    # validate args
    if arrayfile is None and cmd is None:
        raise ValueError("Require either array file or command.")

    elif arrayfile is not None and cmd is not None:
        raise ValueError("Require either array file or command, not both.")

    if logloc is None:
        logloc = os.getcwd()

    if not os.path.isdir(logloc):
        os.mkdir(logloc)

    if maxpar < 1:
        maxpar = 10000

    # get cluster queue name
    if cluster is None:
        conf_file = os.environ['HOME'] + "/picopili.conf"
        configs = read_conf(conf_file)
        cluster = configs['cluster']

    # get queue template
    pico_bin = os.path.dirname(os.path.realpath(__file__))
    clust_dir = os.path.dirname(pico_bin) + '/cluster_templates'

    assert os.path.isdir(
        clust_dir
    ), "Unable to find cluster job submission template directory %s" % str(
        clust_dir)

    # load queue configuration info
    # - submission syntax, queue names, job holds
    clust_conf = read_conf(str(clust_dir) + '/' + str(cluster) + '.conf')

    # basic template
    with open(str(clust_dir) + '/' + str(cluster) + '.sub.sh',
              'r') as single_templ:
        templ = single_templ.read()

    # setup memory args
    if mem is None:
        mem = 2000
    mem_mb = str(int(mem))
    if int(mem) > 1000:
        # Python 2 integer division: rounds the GB figure down
        mem_gb = str(int(mem) / 1000)
    else:
        mem_gb = str(1)

    # extra submission text for large-memory jobs (cluster-specific)
    if mem > 30000:
        mem_txt = str(clust_conf['big_mem_txt'])
    else:
        mem_txt = ""

    # multithreading arguments
    if threads is None:
        threads = 1

    # queue picking from job length
    if walltime is None:
        walltime = 1
        queue_name = clust_conf['hour_q']
    elif walltime <= 1.0:
        queue_name = clust_conf['hour_q']
    elif walltime <= 2.0:
        queue_name = clust_conf['hour2_q']
    elif walltime <= 4.0:
        queue_name = clust_conf['hour4_q']
    elif walltime <= 24.0:
        queue_name = clust_conf['day_q']
    else:
        queue_name = clust_conf['long_q']

    # job dependencies
    # wait_name takes precedence over wait_file when both are given
    if wait_name is not None:
        hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name),
                                                  hold_num=str(wait_num))

    elif wait_file is not None:
        with open(wait_file, 'r') as wait_fi:
            # NOTE(review): readline() keeps the trailing newline in
            # wait_name — confirm the cluster's hold_flag tolerates it
            wait_name = wait_fi.readline()
            hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name),
                                                      hold_num=str(wait_num))

    else:
        hold_str = ""

    # load base template

    # for single jobs
    if cmd is not None and (njobs is None or njobs <= 1) and not forcearray:

        njobs = 1
        tot_threads = int(threads)

        # log name
        if logname is None:
            logname = str(jobname) + '.sub.log'

        # command line
        cmd_str = cmd

        # dummy task array args for dict
        array_jobs = njobs
        j_per_core = 1

    # for array jobs
    else:

        # setup indexing tasks
        # j_per_node == 1: use the scheduler's own task-id variable directly;
        # otherwise tasks are launched by the wrapper below and take the
        # task index as their first shell argument ($1)
        j_per_core = int(clust_conf['j_per_node'])
        if j_per_core == 1:
            task_index = str(clust_conf['task_id'])
        else:
            task_index = "$1"

        # cmd or array file spec
        if cmd is not None:
            cmd_line = cmd.format(task=task_index)
            tot_threads = int(njobs) * int(threads)

        else:
            assert os.path.isfile(
                arrayfile), "Job array file %s not found." % str(arrayfile)

            # one task per line of the array file
            njobs = file_len(arrayfile)
            tot_threads = int(njobs) * int(threads)

            # shell snippet: pick line {task} of the array file and run it
            cmd_tmp = dedent("""\
                cline=`head -n {task} {fi} | tail -n 1`
                echo $cline
                $cline
            """)
            cmd_line = cmd_tmp.format(task=task_index, fi=arrayfile)

        # parallelization of array jobs on a node
        if j_per_core > 1:

            from math import floor, ceil

            # max simul tasks with memory limit
            node_mem = float(clust_conf['array_mem_mb'])
            task_mem_lim = int(floor((node_mem - 1.0) / float(mem)))

            # max simul tasks with threading
            if task_mem_lim > floor(int(j_per_core) / int(threads)):
                task_mem_lim = floor(int(j_per_core) / int(threads))

            if task_mem_lim < 1:
                task_mem_lim = 1

            # number of jobs to cover all tasks
            array_jobs = int(ceil(float(njobs) / float(task_mem_lim)))

            # convert multi-line command to script
            # (the while-loop wrapper below can only invoke a one-line
            # command, so multi-line bodies are written to an executable
            # helper script first)
            if len(cmd_line.splitlines()) > 1:
                tmp_script = open('temp_cmd.' + str(jobname) + '.sh', 'w')
                tmp_script.write(cmd_line)
                tmp_script.close()
                os.chmod(tmp_script.name,
                         stat.S_IEXEC | stat.S_IREAD | stat.S_IWRITE)
                cmd_line = './' + tmp_script.name

            # setup to do task_mem_lim jobs on each node
            # note: specified above that cmd_line uses $1 (first arg) as task index
            # we manage that here with ${tid}
            par_tmp = dedent("""\
                # array index for this job            
                jj={job_index}
                
                # number of jobs to run on node
                nodej={nodej}
                
                # total number of jobs to run in task array
                maxj={njobs}
                
                # task index of first task on this node
                tid=$(($nodej * ($jj - 1) + 1))
                
                # find index of last task for this node
                # - from either node task limit (nodej)
                #   or total number of tasks (maxj)
                if [ "$tid" -le $(($maxj - $nodej + 1)) ]; then
                    last_task=$(($tid + $nodej - 1))
                else
                    last_task=$(($maxj))
                fi
                
                # start the tasks
                while [ "$tid" -le "$last_task" ]; do
                    {cmd_line} $tid &
                    tid=$(($tid+1))
                done
                
                # let all tasks finish
                wait
            """)

            cmd_str = par_tmp.format(njobs=str(njobs),
                                     nodej=str(task_mem_lim),
                                     job_index=str(clust_conf['task_id']),
                                     cmd_line=cmd_line)

        else:
            array_jobs = njobs
            cmd_str = cmd_line

        # log name
        if logname is None:
            logname = str(jobname) + '.sub.' + str(
                clust_conf['log_task_id']) + '.log'

    # fill in template
    jobdict = {
        "job_name": str(jobname),
        "cmd_string": cmd_str,  # formatted elsewhere
        "log_name": str(logloc) + '/' + str(logname),
        "mem_in_mb": str(mem_mb),
        "mem_in_gb": str(mem_gb),
        "big_mem_txt": str(mem_txt),
        "threads": str(threads),
        "total_threads": str(tot_threads),
        "wall_hours": str(walltime),
        "njobs": str(njobs),
        "array_jobs": str(array_jobs),
        "array_max": str(maxpar),
        "core_par": str(j_per_core),
        "task_id": str(clust_conf['task_id']),
        "log_task_id": str(clust_conf['log_task_id']),
        "queue_name": str(queue_name),
        "sleep_time": str(sleep),
        "project": str(clust_conf['project']),
        "workdir": os.getcwd()
    }

    # write job script
    sub_file = open(str(jobname) + '.sub.sh', 'w')
    sub_file.write(templ.format(**jobdict))
    sub_file.close()

    # finalize or remove optional lines
    # template lines prefixed with ::PICO_*_ONLY:: are either stripped of
    # the prefix (keep them) or deleted wholesale, depending on job type
    if njobs <= 1 and not forcearray:
        subprocess.check_call(
            ['sed', '-i', '/^::PICO_ARRAY_ONLY::/d',
             str(sub_file.name)])
    else:
        subprocess.check_call(
            ['sed', '-i', 's/^::PICO_ARRAY_ONLY:://',
             str(sub_file.name)])

    if threads <= 1:
        subprocess.check_call(
            ['sed', '-i', '/^::PICO_THREAD_ONLY::/d',
             str(sub_file.name)])
    else:
        subprocess.check_call(
            ['sed', '-i', 's/^::PICO_THREAD_ONLY:://',
             str(sub_file.name)])

    if njobs <= 1 and not forcearray and threads <= 1:
        subprocess.check_call(
            ['sed', '-i', '/^::PICO_THREADARRAY_ONLY::/d',
             str(sub_file.name)])
    else:
        subprocess.check_call([
            'sed', '-i', 's/^::PICO_THREADARRAY_ONLY:://',
            str(sub_file.name)
        ])

    # command to run
    if hold_str != "":
        launch_str = clust_conf['sub_cmd'] + ' ' + hold_str + ' ' + str(
            sub_file.name)
    else:
        launch_str = clust_conf['sub_cmd'] + ' ' + str(sub_file.name)

    # record
    print launch_str

    # run
    if not testonly:
        p = subprocess.Popen(launch_str.split(),
                             stderr=subprocess.STDOUT,
                             stdout=subprocess.PIPE)
        out, err = p.communicate()
        # returncode None (still running after communicate) is treated as
        # success alongside 0
        if p.returncode is None or p.returncode == 0:
            return out
        else:
            raise EnvironmentError((p.returncode, err, out))

    else:
        return 0
Example #5
0


if str(args.addout) != '' and args.addout is not None:
    outdot = str(args.out)+'.'+str(args.addout)
else:
    outdot = str(args.out)


#############
print '\n...Reading ricopili config file...'
#############

### read plink, shapeit loc from config
conf_file = os.environ['HOME']+"/ricopili.conf"
configs = read_conf(conf_file)

plinkx = configs['p2loc']+"plink"
shapeit_ex = configs['shloc'] + '/bin/shapeit'



#############
print '\n...Checking dependencies...'
#############



# TODO: here