Example 1
    def __init__(self,
                 namespace=None,
                 throttle=0,
                 memsize=20,
                 time_limit=48,
                 ssh_key=None,
                 local_workdir='.'):

        self.config = Config()

        if namespace is None:
            self.namespace = str(os.getpid())
        else:
            self.namespace = namespace

        # These will default to the config cluster working directory.
        self.runner = ClusterJobRunner()
        self.submitter = ClusterJobSubmitter()

        self.memsize = memsize  # expressed in GB
        self.time_limit = time_limit
        self.throttle = throttle
        self.ssh_key = ssh_key

        local_workdir = os.path.abspath(local_workdir)
        if not os.path.exists(local_workdir):
            os.mkdir(local_workdir)
        self.local_workdir = local_workdir
Example 2
    def run_hicup(self):

        # Copy files
        # NB! There is a vulnerability below, as we assume the input file follows the Odom lab naming convention.
        code = self.fq1.split('_')[0]
        destination = "%s@%s:%s" % (self.conf.user, self.conf.cluster,
                                    self.conf.clusterworkdir)
        LOGGER.info("Copying %s to cluster ..." % self.fq1)
        transfer_file(self.fq1, destination)
        if self.fq2 is not None:
            LOGGER.info("Copying %s to cluster ..." % self.fq2)
            transfer_file(self.fq2, destination)

        # Submit a bsub job to run hicup with the correct number of threads requested.
        submitter = ClusterJobSubmitter()

        # FIXME: Yes, it's bad practice to hard-code dependencies, but this is a temporary fix: for some reason hicup cannot be found even though it is on the path.
        #        Moreover, for some reason softlinking hicup into bin does not seem to be enough, probably because of the way dependencies in the hicup main program are implemented.
        cmd = "mkdir %s && sleep 1 && cd %s && ~/software/external/hicup_v0.5.10/hicup --config %s && rm %s && rm %s" % (
            self.hicup_output_dir, self.conf.clusterworkdir,
            self.hicup_conf_fname, self.fq2, self.hicup_conf_fname)
        jobid = submitter.submit_command(cmd=cmd,
                                         mem=self.conf.clustermem,
                                         auto_requeue=False,
                                         threads=self.conf.num_threads)
        LOGGER.info("Hicup execution job id = %s" % jobid)
        # Submit the post-processing job, dependent on completion of the hicup run.
        cmd = "cd %s && cs_run_hicup_postprocess.py --fq1 %s" % (
            self.conf.clusterworkdir, self.fq1)
        jobid = submitter.submit_command(cmd=cmd,
                                         mem=self.conf.clustermem,
                                         auto_requeue=False,
                                         threads=self.conf.num_threads,
                                         depend_jobs=[jobid])
        LOGGER.info("Hicup post process job id = %s" % jobid)
Example 3
    def ed_run_post_process(self,
                            merged_fn,
                            bams,
                            source_path,
                            merged_path,
                            compressed_path,
                            print_commands_only=False):

        # Setup cluster job submitter
        submitter = ClusterJobSubmitter()

        cleanup = ""
        if self.cleanup:
            cleanup = " && rm %s" % (" ".join(bams))

        # Execute merge job
        merge_cmd = "cd %s && samtools merge -u -@ 8 %s %s%s" % (
            source_path, os.path.join(merged_path,
                                      merged_fn), " ".join(bams), cleanup)
        print merge_cmd
        if not print_commands_only:
            jobid_merge = submitter.submit_command(cmd=merge_cmd,
                                                   mem=8000,
                                                   mincpus=8,
                                                   auto_requeue=False)

        # Execute duplicate marking
        if self.cleanup:
            cleanup = " && rm %s" % merged_fn
        dupmark_fn = merged_fn + "_dupmark.bam"
        dupmark_log = merged_fn + "_dupmark.log"
        d_cmd = "cd %s && picard --Xmx 32g MarkDuplicates I=%s O=%s M=%s VALIDATION_STRINGENCY=SILENT ASSUME_SORTED=True COMPRESSION_LEVEL=0 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1024" % (
            merged_path, merged_fn, dupmark_fn, dupmark_log)
        d_cmd = d_cmd + cleanup
        print d_cmd
        if not print_commands_only:
            jobid_d = submitter.submit_command(cmd=d_cmd,
                                               mem=50000,
                                               auto_requeue=False,
                                               depend_jobs=[jobid_merge])

        # Set up coverage and flagstat computations for the duplicate-marked files.
        coverage_fn = dupmark_fn + ".coverage"
        flagstat_fn = dupmark_fn + ".flagstat"

        genome_size_fn = None
        try:
            # Note that filter() never raises DoesNotExist; a missing genome
            # surfaces as an IndexError on the [0] lookup below, so we catch
            # both.
            genome = Genome.objects.filter(
                library__code=merged_fn.split('_')[0])
            fasta = genome[0].fasta
            genome_size_fn = os.path.join(DBCONF.clustergenomedir,
                                          fasta + '.size')
        except (Genome.DoesNotExist, IndexError), _err:
            LOGGER.error("No reference genome for %s." % (merged_fn))
            sys.exit(1)
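The excerpt ends after setting up the output names; the coverage and flagstat submissions themselves are not shown. As a hedged sketch, a flagstat follow-up could be chained onto the duplicate-marking job using the same dependency pattern as the earlier steps (samtools flagstat is a standard command; the coverage step is omitted because the excerpt does not show which tool the pipeline uses):

        # Sketch only (not part of the original excerpt): a flagstat job
        # chained onto the duplicate-marking step. The coverage submission
        # would follow the same shape once the intended coverage tool is known.
        flagstat_cmd = "cd %s && samtools flagstat %s > %s" % (
            merged_path, dupmark_fn, flagstat_fn)
        print flagstat_cmd
        if not print_commands_only:
            submitter.submit_command(cmd=flagstat_cmd,
                                     mem=4000,
                                     auto_requeue=False,
                                     depend_jobs=[jobid_d])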
Example 4
    def ed_download(self, print_download_commands_only=False):
        '''Downloads all project related files.'''

        # Set up the job submitter for sending download jobs to the cluster.
        submitter = ClusterJobSubmitter()

        # Collect information about file locations, ids, etc.
        self.ed_check()

        # Set some variables for throttled downloading.
        # We control the number of concurrent downloads by making each new
        # download job depend on the completion of an earlier one.
        # The assumption is that, on average, all download jobs take about
        # the same time to run. (A distilled sketch of this throttling
        # pattern follows this example.)
        tnr = 0
        jobids = []
        newids = []

        for edstem in self.edfiles:

            status = self.edfiles[edstem].status

            if status != 'new':
                if status in ('downloaded', 'processed'):
                    LOGGER.info("Skipping %s (%s)", edstem, status)
                continue

            ## Prepare the download command for submission to the cluster.
            cmd = 'cs_edinburgh_download.py -a --file1 %s --file2 %s --file1_md5 %s --file2_md5 %s -p %s -l %d' % (
                self.edfiles[edstem].file1, self.edfiles[edstem].file2,
                self.edfiles[edstem].file1_md5, self.edfiles[edstem].file2_md5,
                self.project, self.edfiles[edstem].laneid)
            if print_download_commands_only:
                print cmd
                continue
            # Submit download job
            if jobids:
                jobid = submitter.submit_command(cmd=cmd,
                                                 mem=1000,
                                                 auto_requeue=False,
                                                 depend_jobs=[jobids[tnr]])
            else:
                jobid = submitter.submit_command(cmd=cmd,
                                                 mem=1000,
                                                 auto_requeue=False)
            LOGGER.info("Setting up downloads for %s ... (jobid=%s)", edstem,
                        jobid)
            newids.append(int(jobid))
            tnr += 1
            if tnr == self.athreads:
                jobids = newids
                newids = []
                tnr = 0
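Distilled out of the loop above, the throttling scheme keeps at most athreads dependency chains alive: the first athreads jobs start immediately, and every later job depends on the job that occupied the same slot in the previous wave. A hedged, standalone sketch of that pattern (the commands and athreads value are hypothetical; the submitter interface is the one used above):

# Sketch only: hypothetical commands, same submit_command() interface as above.
submitter = ClusterJobSubmitter()
athreads = 4
download_commands = ["./fetch_chunk.sh %d" % i for i in range(10)]

jobids, newids, tnr = [], [], 0
for cmd in download_commands:
    if jobids:
        # Slot 'tnr' of this wave waits for slot 'tnr' of the previous wave.
        jobid = submitter.submit_command(cmd=cmd, mem=1000,
                                         auto_requeue=False,
                                         depend_jobs=[jobids[tnr]])
    else:
        # First wave: nothing to depend on yet.
        jobid = submitter.submit_command(cmd=cmd, mem=1000,
                                         auto_requeue=False)
    newids.append(int(jobid))
    tnr += 1
    if tnr == athreads:
        # This wave is full; the next wave will chain onto it, slot by slot.
        jobids, newids, tnr = newids, [], 0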
Example 5
    def ed_process(self, print_commands_only=False):
        '''Process files that have been labeled as downloaded.'''

        # Set up the job submitter for sending processing jobs to the cluster.
        submitter = ClusterJobSubmitter()

        # Check the project files first
        self.ed_check()

        django.setup()

        # Fetch archive location in the file system.
        try:
            archive = ArchiveLocation.objects.get(name=self.aname)
        except ArchiveLocation.DoesNotExist, _err:
            raise SystemExit("Archive location \'%s\' does not exist!" %
                             self.aname)
Example 6
class ClusterJobManager(object):
    '''
  Moderately abstract base class providing some methods and attributes
  commonly used in higher-level cluster process management classes
  (e.g. GsnapManager, LastzManager).
  '''
    __slots__ = ('namespace', 'submitter', 'runner', 'config', 'throttle',
                 'memsize', 'ssh_key', 'local_workdir', 'time_limit')

    def __init__(self,
                 namespace=None,
                 throttle=0,
                 memsize=20,
                 time_limit=48,
                 ssh_key=None,
                 local_workdir='.'):

        self.config = Config()

        if namespace is None:
            self.namespace = str(os.getpid())
        else:
            self.namespace = namespace

        # These will default to the config cluster working directory.
        self.runner = ClusterJobRunner()
        self.submitter = ClusterJobSubmitter()

        self.memsize = memsize  # expressed in GB
        self.time_limit = time_limit
        self.throttle = throttle
        self.ssh_key = ssh_key

        local_workdir = os.path.abspath(local_workdir)
        if not os.path.exists(local_workdir):
            os.mkdir(local_workdir)
        self.local_workdir = local_workdir

    def submit_command(self, cmd, *args, **kwargs):
        '''
    Submit a command to be run via bsub on the cluster. Returns the ID
    of the launched job. To wait on the completion of the submitted
    job, see the wait_on_cluster method.
    '''
        return self.submitter.submit_command(cmd, *args, **kwargs)

    def run_command(self, cmd, *args, **kwargs):
        '''
    Run a command directly on the cluster head node, waiting for the
    result to be returned. Returns a file descriptor containing the
    stdout of the job. This method is typically used for commands
    which should complete almost immediately.
    '''
        return self.runner.run_command(cmd, *args, **kwargs)

    def cluster_file_exists(self, file):
        '''
    Test whether a file exists in the configured cluster working directory.
    '''
        cmd = ("if [ -e %s ]; then echo yes; else echo no; fi" % file)
        LOGGER.debug(cmd)

        # This runs the test command in the cluster working directory.
        with self.runner.run_command(cmd) as ofh:
            first_line = ofh.readline()
            first_line = first_line.rstrip('\n')

        return first_line == 'yes'

    def cluster_jobs_count(self):
        '''
    Return a count of the jobs currently running on the cluster for
    the configured user.
    '''
        cmd = ("bjobs -u %s" % self.config.clusteruser)
        LOGGER.debug(cmd)
        count = 0

        with self.runner.run_command(cmd) as ofh:
            for line in ofh:
                count += 1

        # Account for the extra header line.
        return count - 1

    def return_file_to_localhost(self,
                                 clusterout,
                                 outfile,
                                 execute=True,
                                 donefile=False):
        '''
    If execute is False, returns a command string that can be used to
    transfer a cluster output file back to our local working
    directory. If execute is True, the command will also be run on the
    cluster.
    '''
        myhost = getfqdn()
        myuser = getuser()
        sshcmd = "scp"

        # Transferring the files back to localhost requires an appropriate
        # passwordless ssh key to be given access on our localhost. The
        # alternative is some horrendous pexpect hack which is only a
        # little more secure (see: sshSangerTunnel.py).
        if self.ssh_key is not None:
            sshcmd += " -i %s" % self.ssh_key

        # Note that we need quoting of e.g. file paths containing
        # spaces. Also, the initial './' allows filenames to contain
        # colons.
        if not os.path.isabs(clusterout):
            clusterout = './%s' % (clusterout, )
        sshcmd += (
            r' %s %s@%s:\"' % (bash_quote(clusterout), myuser, myhost) +
            bash_quote(bash_quote(self.local_workdir + r'/%s' % outfile)) +
            r'\"')

        if donefile:
            sshcmd += " && ssh"
            if self.ssh_key is not None:
                sshcmd += " -i %s" % self.ssh_key
            sshcmd += (r' %s@%s touch ' % (myuser, myhost) + bash_quote(
                bash_quote(self.local_workdir + r'/%s.done' % outfile)))

        if execute is True:
            # This *should* die on failure.
            self.runner.run_command(sshcmd)

        return sshcmd

    def wait_on_cluster(self, jobs, cleanup_cmd=None):
        '''
    Wait for the alignment jobs running on the cluster to contact a
    designated socket file location to indicate that the jobs have
    finished.
    '''
        # Grab a unique temporary file name to use as the notification socket
        # path; the file itself is deleted when the context exits, leaving the
        # name free for the socket bind below.
        with NamedTemporaryFile() as sobj:
            socketfile = sobj.name

        # The nc utility is pretty commonly installed; if it is not, this
        # will not work.
        LOGGER.info("Submitting monitor job to the cluster.")
        cmd = "ssh"
        if self.ssh_key is not None:
            cmd += " -i %s" % self.ssh_key
        cmd += (" %s@%s 'echo OK | nc -U %s'" %
                (getuser(), getfqdn(), socketfile))
        monjob = self.submitter.submit_command(cmd,
                                               depend_jobs=jobs,
                                               auto_requeue=False)

        # Optional clean-up job, typically used to delete temporary files.
        if cleanup_cmd is not None:
            LOGGER.info("Submitting clean-up job to the cluster.")
            self.submitter.submit_command(cleanup_cmd,
                                          depend_jobs=[monjob],
                                          auto_requeue=False)

        # Set up a socket server and wait for the cluster to get back to us.
        LOGGER.info("Waiting on a reply from the cluster...")
        sock = socket(AF_UNIX, SOCK_STREAM)
        sock.bind(socketfile)
        sock.listen(1)
        (conn, _addr) = sock.accept()

        message = ''
        while 1:
            data = conn.recv(1024)
            if not data:
                break
            message += data
        conn.close()
        os.unlink(socketfile)

        LOGGER.info("Cluster reply received: %s", message)

        return
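A brief, hedged usage sketch of the class above, pulling the pieces together (all paths, commands and the job script name are hypothetical; constructor defaults come from __init__ as shown):

# Sketch only: hypothetical commands and file names.
manager = ClusterJobManager(namespace='demo',
                            memsize=8,          # GB, as noted in __init__
                            time_limit=12,
                            local_workdir='./demo_work')

# Submit a job, then a dependent transfer of its output back to this host,
# and finally block until the cluster's monitor job reports completion.
jobid = manager.submit_command("./run_alignment.sh chunk_01.fq")

copy_cmd = manager.return_file_to_localhost("chunk_01.bam", "chunk_01.bam",
                                            execute=False)
copyjob = manager.submit_command(copy_cmd, depend_jobs=[jobid])

if manager.cluster_jobs_count() > 100:
    LOGGER.warning("Cluster queue for this user is getting long.")

manager.wait_on_cluster(jobs=[copyjob])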
Example 7
    def __init__(self, *args, **kwargs):
        self.job = ClusterJobSubmitter(*args, **kwargs)
        super(StarClusterJobSubmitter, self).__init__(*args, **kwargs)
Example 8
class StarClusterJobSubmitter(AlignmentJobRunner):
    '''
  Class representing the submission of a STAR job to the
  cluster. This class works similarly to BwaClusterJobSubmitter.
  '''
    def __init__(self, *args, **kwargs):
        self.job = ClusterJobSubmitter(*args, **kwargs)
        super(StarClusterJobSubmitter, self).__init__(*args, **kwargs)

    def submit(self,
               filenames,
               is_paired=False,
               destnames=None,
               cleanup=True,
               *args,
               **kwargs):
        '''
    Actually submit the job. The optional destnames argument can be
    used to name files on the cluster differently to the source. This
    is occasionally useful.
    '''
        paired_sanity_check(filenames, is_paired)

        # First, copy the files across and uncompress on the server. We
        # remove commas here because otherwise tophat is a little too keen
        # to split on them (quoting doesn't work).
        LOGGER.info("Copying files to the cluster.")
        if destnames is None:
            destnames = filenames
        destnames = [
            re.sub(',+', '_', os.path.basename(fname)) for fname in destnames
        ]
        destnames = self.job.transfer_data(filenames, destnames)

        # Next, create flag for cleanup
        if cleanup:
            cleanupflag = '--cleanup'
        else:
            cleanupflag = ''

        if self.samplename:
            sampleflag = '--sample %s' % self.samplename
        else:
            sampleflag = ''

        # This now searches directly on the cluster.
        progpath = self.job.find_remote_executable('cs_runStarWithSplit.py',
                                                   path=self.conf.clusterpath)

        # Next, submit the actual jobs on the actual cluster.
        fnlist = " ".join([quote(x) for x in destnames])
        cmd = ("python %s --loglevel %d %s --rcp %s:%s %s %s %s" %
               (progpath, LOGGER.getEffectiveLevel(), cleanupflag,
                self.conf.datahost, self.finaldir, sampleflag, self.genome,
                fnlist))

        LOGGER.info("Submitting STAR job to cluster.")
        self.job.submit_command(cmd, *args, **kwargs)

    @classmethod
    def build_genome_index_path(cls, genome, *args, **kwargs):

        # Import here rather than at module level, as otherwise cluster operations fail.
        from ..models import Program

        conf = Config()

        # Get information about default aligner, check that the program is
        # in path and try to predict its version.
        alignerinfo = ProgramSummary('STAR',
                                     ssh_host=conf.cluster,
                                     ssh_port=conf.clusterport,
                                     ssh_user=conf.clusteruser,
                                     ssh_path=conf.clusterpath)
        indexdir = None

        # Check that the version of aligner has been registered in
        # repository.
        try:
            Program.objects.get(program=alignerinfo.program,
                                version=alignerinfo.version,
                                current=True)
            indexdir = "%s_%s" % ('STAR', alignerinfo.version)

        except Program.DoesNotExist, _err:
            sys.exit((
                """Aligner "%s" version "%s" found at path "%s" """ %
                (alignerinfo.program, alignerinfo.version, alignerinfo.path)) +
                     "not recorded as current in repository! Quitting.")

        # Build the path to the STAR genome directory. Note that STAR takes
        # the directory name only, without the trailing fasta filename.
        gpath = genome_fasta_path(genome,
                                  indexdir=indexdir,
                                  genomedir=conf.clustergenomedir)
        # A bit of an ugly hack: strip the trailing fasta filename from the
        # path created by genome_fasta_path, leaving the directory.
        gpath = os.path.split(gpath)[0]

        return gpath
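The suffix-stripping hack above relies on os.path.split() dropping the final path component. A tiny sketch with a hypothetical path, just to make the transformation concrete:

import os.path

# Hypothetical path of the kind genome_fasta_path() is assumed to return.
gpath = '/data/genomes/mm10/STAR_2.5.2b/mm10.fa'
star_genome_dir = os.path.split(gpath)[0]
# star_genome_dir is now '/data/genomes/mm10/STAR_2.5.2b', the
# directory-only form that STAR expects.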
Example 9
class BwaClusterJobSubmitter(AlignmentJobRunner):
    '''Class representing the submission of a bwa job to the
  cluster. This class uploads the fastq file, gunzips it if
  necessary, and then submits a job to split the fastq file into
  chunks and run the alignments as secondary jobs. It also spawns one
  last job which waits for those to complete before merging the
  output and copying it back to the source server.'''
    def __init__(self, *args, **kwargs):
        self.job = ClusterJobSubmitter(*args, **kwargs)
        super(BwaClusterJobSubmitter, self).__init__(*args, **kwargs)

    def submit(self,
               filenames,
               is_paired=False,
               destnames=None,
               cleanup=True,
               nocc=None,
               bwa_algorithm='aln',
               fileshost=None,
               nosplit=False,
               rcp=None,
               lcp=None,
               *args,
               **kwargs):
        '''Actually submit the job. The optional destnames argument can be
    used to name files on the cluster differently to the source. This
    is occasionally useful.'''

        assert (bwa_algorithm in ('aln', 'mem'))
        paired_sanity_check(filenames, is_paired)

        # by lukk01:
        # NB! Copying files to the cluster is no longer necessary as long as
        # the hostflag = '--fileshost %s' line is uncommented below.
        # However, this would be a pull rather than a push, and we should then
        # pull from the archive for process_file.py.
        # It would require, though, rewriting the data processing and alignment
        # orders in process_file.py. Just a thought.
        #

        # First, copy the files across and uncompress on the server.
        LOGGER.info("Copying files to the cluster.")
        destnames = self.job.transfer_data(filenames, destnames)

        # Next, create flag for cleanup
        cleanupflag = '--cleanup' if cleanup else ''

        # Next, create flag for number of non-unique reads to keep in samse/sampe
        noccflag = ('--n_occ %s' % (nocc, )) if nocc else ''

        # Sample names containing spaces are bad on the command line,
        # and potentially problematic in bam read groups.
        sampleflag = '--sample %s' % self.samplename if self.samplename else ''

        # Whether to run bwa mem or aln.
        algoflag = '--algorithm %s' % bwa_algorithm

        # Deal with default values for fileshost and rcp/lcp, i.e. figure out
        # whether the files are already on the cluster and whether the results
        # need to be copied elsewhere.
        cpflag = ''
        hostflag = ''
        filehost = gethostname()
        if filehost != self.conf.cluster:
            # hostflag  = '--fileshost %s' % filehost
            cpflag = '--rcp %s:%s' % (self.conf.datahost, self.finaldir)
        else:
            # The files are already on this host. Override cleanup to prevent
            # the source files from being deleted.
            LOGGER.info(
                "Input files are local. Overriding --cleanup to prevent files being deleted."
            )
            cleanupflag = ''
            cpflag = '--lcp %s' % self.finaldir

        # If fileshost has been specified, override default
        if fileshost is not None:
            hostflag = '--fileshost %s' % fileshost
        # If rcp has been specified, override default
        if rcp is not None:
            cpflag = '--rcp %s' % rcp
        # If lcp has been specified, override default
        if lcp is not None:
            cpflag = '--lcp %s' % lcp
        # If nosplit has been set, forward it. Note that nosplit defaults to
        # False, so we test its truth value rather than comparing against None
        # (the latter would always pass --no-split).
        splitflag = ''
        if nosplit:
            splitflag = '--no-split'

        # This now searches directly on the cluster.
        progpath = self.job.find_remote_executable('cs_runBwaWithSplit.py',
                                                   path=self.conf.clusterpath)

        if progpath is None:
            raise StandardError(
                "cs_runBwaWithSplit.py not found on clusterpath. Possible misconfiguration?"
            )

        # Next, submit the actual jobs on the actual cluster.
        if is_paired:
            LOGGER.debug("Running bwa on paired-end sequencing input.")
            fnlist = " ".join([quote(x) for x in filenames])
            # fnlist = " ".join([ quote(x) for x in destnames ])
            ## FIXME think about ways this could be improved.
            ## In the submitted command:
            ##   --rcp       is where cs_runBwaWithSplit_Merge.py eventually copies
            ##                 the reassembled bam file (via scp).
            cmd = ("python %s --loglevel %d %s %s %s %s %s %s %s %s %s" %
                   (progpath, LOGGER.getEffectiveLevel(), cleanupflag,
                    hostflag, noccflag, cpflag, splitflag, sampleflag,
                    algoflag, self.genome, fnlist))

        else:
            LOGGER.debug("Running bwa on single-end sequencing input.")
            fnlist = quote(filenames[0])
            # fnlist = quote(destnames[0])
            cmd = ("python %s --loglevel %d %s %s %s %s %s %s %s %s %s" %
                   (progpath, LOGGER.getEffectiveLevel(), cleanupflag,
                    hostflag, noccflag, cpflag, splitflag, sampleflag,
                    algoflag, self.genome, fnlist))

        LOGGER.info("Submitting bwa job to cluster.")
        self.job.submit_command(cmd, *args, **kwargs)

    @classmethod
    def build_genome_index_path(cls, genome, *args, **kwargs):

        # Import here rather than at module level, as otherwise cluster operations fail.
        from ..models import Program

        conf = Config()

        # Get information about default aligner, check that the program is
        # in path and try to predict its version.
        alignerinfo = ProgramSummary(conf.aligner,
                                     ssh_host=conf.cluster,
                                     ssh_port=conf.clusterport,
                                     ssh_user=conf.clusteruser,
                                     ssh_path=conf.clusterpath)
        indexdir = None

        # Check that the version of aligner has been registered in
        # repository.
        try:
            Program.objects.get(program=alignerinfo.program,
                                version=alignerinfo.version,
                                current=True)
            indexdir = "%s-%s" % (alignerinfo.program, alignerinfo.version)

        # If aligner version is missing, try to insert it into the database
        # (FIXME not yet implemented while we see how this works).
        except Program.DoesNotExist, _err:
            sys.exit((
                """Aligner "%s" version "%s" found at path "%s" """ %
                (alignerinfo.program, alignerinfo.version, alignerinfo.path)) +
                     "not recorded as current in repository! Quitting.")

        gpath = genome_fasta_path(genome,
                                  indexdir=indexdir,
                                  genomedir=conf.clustergenomedir)

        return gpath
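For orientation, here is a hedged sketch of the command string the paired-end branch of submit() assembles, rendered with invented values (the empty hostflag and splitflag simply leave extra whitespace, which is harmless once the shell splits the command into arguments):

# Sketch only: every value below is invented for illustration.
progpath   = '/cluster/bin/cs_runBwaWithSplit.py'
cleanupflag, hostflag, splitflag = '--cleanup', '', ''
noccflag   = '--n_occ 1'
cpflag     = '--rcp datahost.example.org:/data/final'
sampleflag = '--sample SAMPLE01'
algoflag   = '--algorithm aln'
genome     = '/cluster/genomes/mm10/bwa-0.7.15/mm10.fa'
fnlist     = 'sample01_R1.fq.gz sample01_R2.fq.gz'

cmd = ("python %s --loglevel %d %s %s %s %s %s %s %s %s %s" %
       (progpath, 20, cleanupflag, hostflag, noccflag, cpflag, splitflag,
        sampleflag, algoflag, genome, fnlist))
# cmd ->
# python /cluster/bin/cs_runBwaWithSplit.py --loglevel 20 --cleanup  --n_occ 1
# --rcp datahost.example.org:/data/final  --sample SAMPLE01 --algorithm aln
# /cluster/genomes/mm10/bwa-0.7.15/mm10.fa sample01_R1.fq.gz sample01_R2.fq.gz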