Example 1
    def start( self ):
        """
        Invoked at boot time or when the mesosbox service is started.
        """
        while not os.path.exists( '/tmp/cloud-init.done' ):
            log.info( "Waiting for cloud-init to finish ..." )
            time.sleep( 1 )
        log.info( "Starting mesosbox" )
        self.__patch_etc_hosts( { 'mesos-master': self.master_ip } )
        self.__mount_ebs_volume( )
        self.__create_lazy_dirs( )

        if self.master_ip == self.node_ip:
            node_type = 'master'
        else:
            node_type = 'slave'

        self._copy_dir_from_master( shared_dir )

        log_path = '/var/log/mesosbox/mesos{}'.format( node_type )
        mkdir_p( log_path )
        os.chown( log_path, self.uid, self.gid )

        log.info( "Starting %s services" % node_type )
        check_call( [ initctl, 'emit', 'mesosbox-start-%s' % node_type ] )
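All of these snippets call a small mkdir_p helper instead of using os.makedirs directly. The helper itself is not shown on this page; the sketch below is the common errno-based idiom such a helper usually wraps (an assumption about its implementation, not code taken from these projects): create the directory and any missing parents, and treat an already existing directory as success.

import errno
import os

def mkdir_p(path):
    # Create 'path' and any missing parent directories.
    try:
        os.makedirs(path)
    except OSError as e:
        # An existing directory is not an error; re-raise anything else
        # (permission problems, a regular file in the way, etc.).
        if e.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise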
Example 2
    def __mount_ebs_volume(self):
        """
        Attach, format (if necessary) and mount the EBS volume with the same cluster ordinal as
        this node.
        """
        ebs_volume_size = self.instance_tag('ebs_volume_size') or '0'
        ebs_volume_size = int(ebs_volume_size)
        if ebs_volume_size:
            instance_name = self.instance_tag('Name')
            cluster_ordinal = int(self.instance_tag('cluster_ordinal'))
            volume_name = '%s__%d' % (instance_name, cluster_ordinal)
            volume = EC2VolumeHelper(ec2=self.ec2,
                                     availability_zone=self.availability_zone,
                                     name=volume_name,
                                     size=ebs_volume_size,
                                     volume_type="gp2")
            # TODO: handle case where volume is already attached
            device_ext = '/dev/sdf'
            device = '/dev/xvdf'
            volume.attach(self.instance_id, device_ext)

            # Wait for the inode to appear and make sure it's a block device
            while True:
                try:
                    assert stat.S_ISBLK(os.stat(device).st_mode)
                    break
                except OSError as e:
                    if e.errno == errno.ENOENT:
                        time.sleep(1)
                    else:
                        raise

            # Only format empty volumes
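            # 'file -sL <device>' typically reports just '<device>: data' when no
            # filesystem or partition signature is present, which is what the
            # check below uses to decide whether the volume needs mkfs.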
            volume_label = volume_label_hash(volume_name)
            if check_output(['file', '-sL',
                             device]).strip() == device + ': data':
                check_call(['mkfs', '-t', 'ext4', device])
                check_call(['e2label', device, volume_label])
            else:
                # If the volume is not empty, verify the file system label
                actual_label = check_output(['e2label', device]).strip()
                if actual_label != volume_label:
                    raise AssertionError(
                        "Expected volume label '%s' (derived from '%s') but got '%s'"
                        % (volume_label, volume_name, actual_label))
            current_mount_point = self.__mount_point(device)
            if current_mount_point is None:
                mkdir_p(self.persistent_dir)
                check_call(['mount', device, self.persistent_dir])
            elif current_mount_point == self.persistent_dir:
                pass
            else:
                raise RuntimeError(
                    "Can't mount device %s on '%s' since it is already mounted on '%s'"
                    % (device, self.persistent_dir, current_mount_point))
        else:
            # No persistent volume is attached and the root volume is off limits, so we will need
            # to place persistent data on the ephemeral volume.
            self.persistent_dir = self.ephemeral_dir
Example 3
def download_sample_and_align(job, sample, inputs, ids):
    """
    Downloads the sample and runs BWA-kit

    :param JobFunctionWrappingJob job: Passed by Toil automatically
    :param tuple(str, list) sample: UUID and URLS for sample
    :param Namespace inputs: Contains input arguments
    :param dict ids: FileStore IDs for shared inputs
    """
    uuid, urls = sample
    r1_url, r2_url = urls if len(urls) == 2 else (urls[0], None)
    job.fileStore.logToMaster(
        'Downloaded sample: {0}. R1 {1}\nR2 {2}\nStarting BWA Run'.format(
            uuid, r1_url, r2_url))
    # Read fastq samples from file store
    ids['r1'] = job.addChildJobFn(download_url_job,
                                  r1_url,
                                  s3_key_path=inputs.ssec,
                                  disk=inputs.file_size).rv()
    if r2_url:
        ids['r2'] = job.addChildJobFn(download_url_job,
                                      r2_url,
                                      s3_key_path=inputs.ssec,
                                      disk=inputs.file_size).rv()
    else:
        ids['r2'] = None
    # Create config for bwakit
    inputs.cores = min(inputs.maxCores, multiprocessing.cpu_count())
    inputs.uuid = uuid
    config = dict(
        **vars(inputs)
    )  # Create config as a copy of inputs since it has values we want
    config.update(ids)  # Overwrite attributes with the FileStoreIDs from ids
    config = argparse.Namespace(**config)
    # Define and wire job functions
    bam_id = job.wrapJobFn(run_bwakit,
                           config,
                           sort=inputs.sort,
                           trim=inputs.trim,
                           disk=inputs.file_size,
                           cores=inputs.cores)
    job.addFollowOn(bam_id)
    output_name = uuid + '.bam' + str(
        inputs.suffix) if inputs.suffix else uuid + '.bam'
    if urlparse(inputs.output_dir).scheme == 's3':
        bam_id.addChildJobFn(s3am_upload_job,
                             file_id=bam_id.rv(),
                             file_name=output_name,
                             s3_dir=inputs.output_dir,
                             s3_key_path=inputs.ssec,
                             cores=inputs.cores,
                             disk=inputs.file_size)
    else:
        mkdir_p(inputs.output_dir)
        bam_id.addChildJobFn(copy_file_job,
                             name=output_name,
                             file_id=bam_id.rv(),
                             output_dir=inputs.output_dir,
                             disk=inputs.file_size)
Example 4
    def __mount_ebs_volume( self ):
        """
        Attach, format (if necessary) and mount the EBS volume with the same cluster ordinal as
        this node.
        """
        ebs_volume_size = self.__get_instance_tag( self.instance_id, 'ebs_volume_size' ) or '0'
        ebs_volume_size = int( ebs_volume_size )
        if ebs_volume_size:
            instance_name = self.__get_instance_tag( self.instance_id, 'Name' )
            cluster_ordinal = int( self.__get_instance_tag( self.instance_id, 'cluster_ordinal' ) )
            volume_name = '%s__%d' % (instance_name, cluster_ordinal)
            volume = EC2VolumeHelper( ec2=self.ec2,
                                      availability_zone=self.availability_zone,
                                      name=volume_name,
                                      size=ebs_volume_size,
                                      volume_type="gp2")

            # TODO: handle case where volume is already attached
            device_ext = '/dev/sdf'
            device = '/dev/xvdf'
            volume.attach( self.instance_id, device_ext )

            # Wait for the inode to appear and make sure it's a block device
            while True:
                try:
                    assert stat.S_ISBLK( os.stat( device ).st_mode )
                    break
                except OSError as e:
                    if e.errno == errno.ENOENT:
                        time.sleep( 1 )
                    else:
                        raise

            # Only format empty volumes
            volume_label = volume_label_hash( volume_name )
            if check_output( [ 'file', '-sL', device ] ).strip( ) == device + ': data':
                check_call( [ 'mkfs', '-t', 'ext4', device ] )
                check_call( [ 'e2label', device, volume_label ] )
            else:
                # if the volume is not empty, verify the file system label
                actual_label = check_output( [ 'e2label', device ] ).strip( )
                if actual_label != volume_label:
                    raise AssertionError(
                        "Expected volume label '%s' (derived from '%s') but got '%s'" %
                        (volume_label, volume_name, actual_label) )
            current_mount_point = self.__mount_point( device )
            if current_mount_point is None:
                mkdir_p( self.persistent_dir )
                check_call( [ 'mount', device, self.persistent_dir ] )
            elif current_mount_point == self.persistent_dir:
                pass
            else:
                raise RuntimeError(
                    "Can't mount device %s on '%s' since it is already mounted on '%s'" % (
                        device, self.persistent_dir, current_mount_point) )
        else:
            # No persistent volume is attached and the root volume is off limits, so we will need
            # to place persistent data on the ephemeral volume.
            self.persistent_dir = self.ephemeral_dir
Example 5
 def setUpClass(cls):
     super(ToilTest, cls).setUpClass()
     cls._tempDirs = []
     tempBaseDir = os.environ.get('TOIL_TEST_TEMP', None)
     if tempBaseDir is not None and not os.path.isabs(tempBaseDir):
         tempBaseDir = os.path.abspath(os.path.join(cls._projectRootPath(), tempBaseDir))
         mkdir_p(tempBaseDir)
     cls._tempBaseDir = tempBaseDir
Example 6
 def setUpClass(cls):
     super(ToilTest, cls).setUpClass()
     cls._tempDirs = []
     tempBaseDir = os.environ.get('TOIL_TEST_TEMP', None)
     if tempBaseDir is not None and not os.path.isabs(tempBaseDir):
         tempBaseDir = os.path.abspath(os.path.join(cls._projectRootPath(), tempBaseDir))
         mkdir_p(tempBaseDir)
     cls._tempBaseDir = tempBaseDir
Example 7
 def __create_lazy_dirs( self ):
     log.info( "Bind-mounting directory structure" )
     for (parent, name, persistent) in self.lazy_dirs:
         assert parent[ 0 ] == os.path.sep
         location = self.persistent_dir if persistent else self.ephemeral_dir
         physical_path = os.path.join( location, parent[ 1: ], name )
         mkdir_p( physical_path )
         os.chown( physical_path, self.uid, self.gid )
         logical_path = os.path.join( parent, name )
         check_call( [ 'mount', '--bind', physical_path, logical_path ] )
Example 8
 def __create_lazy_dirs( self ):
     log.info( "Bind-mounting directory structure" )
     for (parent, name, persistent) in self.lazy_dirs:
         assert parent[ 0 ] == os.path.sep
         location = self.persistent_dir if persistent else self.ephemeral_dir
         physical_path = os.path.join( location, parent[ 1: ], name )
         mkdir_p( physical_path )
         os.chown( physical_path, self.uid, self.gid )
         logical_path = os.path.join( parent, name )
         check_call( [ 'mount', '--bind', physical_path, logical_path ] )
Example 9
def consolidate_output(job, config, mutect, pindel, muse):
    """
    Combine the contents of separate tarball outputs into one via streaming

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str mutect: MuTect tarball FileStoreID
    :param str pindel: Pindel tarball FileStoreID
    :param str muse: MuSe tarball FileStoreID
    """
    work_dir = job.fileStore.getLocalTempDir()
    mutect_tar, pindel_tar, muse_tar = None, None, None
    if mutect:
        mutect_tar = job.fileStore.readGlobalFile(
            mutect, os.path.join(work_dir, 'mutect.tar.gz'))
    if pindel:
        pindel_tar = job.fileStore.readGlobalFile(
            pindel, os.path.join(work_dir, 'pindel.tar.gz'))
    if muse:
        muse_tar = job.fileStore.readGlobalFile(
            muse, os.path.join(work_dir, 'muse.tar.gz'))
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [mutect_tar, pindel_tar, muse_tar] if x is not None]
    with tarfile.open(os.path.join(work_dir, out_tar), 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar is mutect_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'mutect',
                                os.path.basename(tarinfo.name))
                        elif tar is pindel_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'pindel',
                                os.path.basename(tarinfo.name))
                        else:
                            tarinfo.name = os.path.join(
                                config.uuid, 'muse',
                                os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(
            config.uuid, config.output_dir))
        s3am_upload(job=job,
                    fpath=out_tar,
                    s3_dir=config.output_dir,
                    num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(
            config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)
Example 10
def consolidate_output_tarballs(job, inputs, vcqc_id, spladder_id):
    """
    Combine the contents of separate tarballs into one.

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str vcqc_id: FileStore ID of variant calling and QC tarball
    :param str spladder_id: FileStore ID of spladder tarball
    """
    job.fileStore.logToMaster('Consolidating files and uploading: {}'.format(
        inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve IDs
    uuid = inputs.uuid
    # Unpack IDs
    # Retrieve output file paths to consolidate
    vcqc_tar = job.fileStore.readGlobalFile(
        vcqc_id, os.path.join(work_dir, 'vcqc.tar.gz'))
    spladder_tar = job.fileStore.readGlobalFile(
        spladder_id, os.path.join(work_dir, 'spladder.tar.gz'))
    # I/O
    fname = uuid + '.tar.gz' if not inputs.improper_pair else 'IMPROPER_PAIR' + uuid + '.tar.gz'
    out_tar = os.path.join(work_dir, fname)
    # Consolidate separate tarballs into one
    with tarfile.open(os.path.join(work_dir, out_tar), 'w:gz') as f_out:
        for tar in [vcqc_tar, spladder_tar]:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar == vcqc_tar:
                            tarinfo.name = os.path.join(
                                uuid, 'variants_and_qc',
                                os.path.basename(tarinfo.name))
                        else:
                            tarinfo.name = os.path.join(
                                uuid, 'spladder',
                                os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output directory
    if inputs.output_dir:
        mkdir_p(inputs.output_dir)
        shutil.copy(out_tar,
                    os.path.join(inputs.output_dir, os.path.basename(out_tar)))
    # Upload to S3
    if inputs.output_s3_dir:
        out_id = job.fileStore.writeGlobalFile(out_tar)
        job.addChildJobFn(s3am_upload_job,
                          file_id=out_id,
                          s3_dir=inputs.output_s3_dir,
                          file_name=fname,
                          key_path=inputs.ssec,
                          cores=inputs.cores)
Example 11
 def __create_lazy_dirs( self ):
     log.info( "Bind-mounting directory structure" )
     for (parent, name, persistent) in self.lazy_dirs:
         assert parent[ 0 ] == os.path.sep
         logical_path = os.path.join( parent, name )
         if persistent is None:
             tag = 'persist' + logical_path.replace( os.path.sep, '_' )
             persistent = less_strict_bool( self.instance_tag( tag ) )
         location = self.persistent_dir if persistent else self.ephemeral_dir
         physical_path = os.path.join( location, parent[ 1: ], name )
         mkdir_p( physical_path )
         os.chown( physical_path, self.uid, self.gid )
         check_call( [ 'mount', '--bind', physical_path, logical_path ] )
Example 12
 def __create_lazy_dirs(self):
     log.info("Bind-mounting directory structure")
     for (parent, name, persistent) in self.lazy_dirs:
         assert parent[0] == os.path.sep
         logical_path = os.path.join(parent, name)
         if persistent is None:
             tag = 'persist' + logical_path.replace(os.path.sep, '_')
             persistent = less_strict_bool(self.instance_tag(tag))
         location = self.persistent_dir if persistent else self.ephemeral_dir
         physical_path = os.path.join(location, parent[1:], name)
         mkdir_p(physical_path)
         os.chown(physical_path, self.uid, self.gid)
         check_call(['mount', '--bind', physical_path, logical_path])
Example 13
def consolidate_output(job, config, kallisto_output, graphical_output):
    """
    Combines the contents of the outputs into one tarball and places in output directory or s3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str kallisto_output: FileStoreID for Kallisto output
    :param str graphical_output: FileStoreID for output of graphing step
    """
    job.fileStore.logToMaster('Consolidating output: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    graphical_tar, kallisto_tar = None, None
    # Retrieve output file paths to consolidate
    if kallisto_output:
        kallisto_tar = job.fileStore.readGlobalFile(
            kallisto_output, os.path.join(work_dir, 'kallisto_output.tar.gz'))
    if graphical_output:
        graphical_tar = job.fileStore.readGlobalFile(
            graphical_output, os.path.join(work_dir,
                                           'single_cell_plots.tar.gz'))
    # I/O
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [graphical_tar, kallisto_tar] if x is not None]
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar == kallisto_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, os.path.basename(tarinfo.name))
                        elif tar == graphical_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'plots',
                                os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(
            config.uuid, config.output_dir))
        s3am_upload(fpath=out_tar,
                    s3_dir=config.output_dir,
                    num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(
            config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(
            file_paths=[os.path.join(work_dir, config.uuid + '.tar.gz')],
            output_dir=config.output_dir)
Example 14
def consolidate_output(job, config, kallisto_output, rsem_output, fastqc_output):
    """
    Combines the contents of the outputs into one tarball and places in output directory or s3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str kallisto_output: FileStoreID for Kallisto output
    :param tuple(str, str) rsem_output: FileStoreIDs for RSEM output
    :param str fastqc_output: FileStoreID for FastQC output
    """
    job.fileStore.logToMaster('Consolidating input: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve output file paths to consolidate
    rsem_tar, hugo_tar, kallisto_tar, fastqc_tar = None, None, None, None
    if rsem_output:
        rsem_id, hugo_id = rsem_output
        rsem_tar = job.fileStore.readGlobalFile(rsem_id, os.path.join(work_dir, 'rsem.tar.gz'))
        hugo_tar = job.fileStore.readGlobalFile(hugo_id, os.path.join(work_dir, 'rsem_hugo.tar.gz'))
    if kallisto_output:
        kallisto_tar = job.fileStore.readGlobalFile(kallisto_output, os.path.join(work_dir, 'kallisto.tar.gz'))
    if fastqc_output:
        fastqc_tar = job.fileStore.readGlobalFile(fastqc_output, os.path.join(work_dir, 'fastqc.tar.gz'))
    # I/O
    if not config.paired:
        config.uuid = 'SINGLE-END.{}'.format(config.uuid)
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [rsem_tar, hugo_tar, kallisto_tar, fastqc_tar] if x is not None]
    with tarfile.open(os.path.join(work_dir, out_tar), 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar == rsem_tar:
                            tarinfo.name = os.path.join(config.uuid, 'RSEM', os.path.basename(tarinfo.name))
                        elif tar == hugo_tar:
                            tarinfo.name = os.path.join(config.uuid, 'RSEM', 'Hugo', os.path.basename(tarinfo.name))
                        elif tar == kallisto_tar:
                            tarinfo.name = os.path.join(config.uuid, 'Kallisto', os.path.basename(tarinfo.name))
                        else:
                            tarinfo.name = os.path.join(config.uuid, 'QC', os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output directory
    if config.output_dir:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[os.path.join(work_dir, config.uuid + '.tar.gz')], output_dir=config.output_dir)
    # Upload to S3
    if config.s3_output_dir:
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.s3_output_dir))
        s3am_upload(fpath=out_tar, s3_dir=config.s3_output_dir, num_cores=config.cores)
Example 15
 def _copy_dir_from_master( self, dir ):
     if dir:
         mkdir_p( dir )
         while True:
             try:
                 check_call( [ 'sudo', '-u', 'mesosbox', 'rsync', '-r', '-e',
                                 'ssh -o StrictHostKeyChecking=no', "mesos-master:" + dir,
                                 dir ] )
             except:
                 log.warning( "Failed to rsync specified directory, trying again in 10 sec" )
                 time.sleep( 10 )
             else:
                 break
         os.chown( dir, self.uid, self.gid )
Example 16
    def start(self):
        while not os.path.exists( '/tmp/cloud-init.done' ):
            log.info( "Waiting for cloud-init to finish ..." )
            time.sleep( 1 )

        self.__patch_etc_hosts( { 'mesos-master': self.master_ip } )

        if self.master_ip == self.node_ip:
            node_type = 'master'
        else:
            node_type = 'slave'

        log_path = '/var/log/mesosbox/mesos{}'.format(node_type)
        mkdir_p(log_path)
        os.chown( log_path, self.uid, self.gid )

        log.info( "Starting %s services" % node_type )
        check_call( [initctl, 'emit', 'mesosbox-start-%s' % node_type ] )
Example 17
 def _testExternal(self, moduleName, pyFiles):
     dirPath = self._createTempDir()
     pycFiles = set(pyFile + 'c' for pyFile in pyFiles)
     for relPath in pyFiles:
         path = os.path.join(dirPath, relPath)
         mkdir_p(os.path.dirname(path))
         with open(path, 'w') as f:
             f.write('pass\n')
     sys.path.append(dirPath)
     try:
         userScript = importlib.import_module(moduleName)
         try:
             self._test(userScript.__name__, expectedContents=pycFiles)
         finally:
             del userScript
             del sys.modules[moduleName]
         self.assertFalse(moduleName in sys.modules)
     finally:
         sys.path.remove(dirPath)
Example 18
 def _testExternal(self, moduleName, pyFiles):
     dirPath = self._createTempDir()
     pycFiles = set(pyFile + 'c' for pyFile in pyFiles)
     for relPath in pyFiles:
         path = os.path.join(dirPath, relPath)
         mkdir_p(os.path.dirname(path))
         with open(path, 'w') as f:
             f.write('pass\n')
     sys.path.append(dirPath)
     try:
         userScript = importlib.import_module(moduleName)
         try:
             self._test(userScript.__name__, expectedContents=pycFiles)
         finally:
             del userScript
             del sys.modules[moduleName]
         self.assertFalse(moduleName in sys.modules)
     finally:
         sys.path.remove(dirPath)
Example 19
def consolidate_output(job, config, mutect, pindel, muse):
    """
    Combine the contents of separate tarball outputs into one via streaming

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str mutect: MuTect tarball FileStoreID
    :param str pindel: Pindel tarball FileStoreID
    :param str muse: MuSe tarball FileStoreID
    """
    work_dir = job.fileStore.getLocalTempDir()
    mutect_tar, pindel_tar, muse_tar = None, None, None
    if mutect:
        mutect_tar = job.fileStore.readGlobalFile(mutect, os.path.join(work_dir, 'mutect.tar.gz'))
    if pindel:
        pindel_tar = job.fileStore.readGlobalFile(pindel, os.path.join(work_dir, 'pindel.tar.gz'))
    if muse:
        muse_tar = job.fileStore.readGlobalFile(muse, os.path.join(work_dir, 'muse.tar.gz'))
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [mutect_tar, pindel_tar, muse_tar] if x is not None]
    with tarfile.open(os.path.join(work_dir, out_tar), 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar is mutect_tar:
                            tarinfo.name = os.path.join(config.uuid, 'mutect', os.path.basename(tarinfo.name))
                        elif tar is pindel_tar:
                            tarinfo.name = os.path.join(config.uuid, 'pindel', os.path.basename(tarinfo.name))
                        else:
                            tarinfo.name = os.path.join(config.uuid, 'muse', os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.output_dir))
        s3am_upload(job=job, fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)
Example 20
def consolidate_output_tarballs(job, inputs, vcqc_id, spladder_id):
    """
    Combine the contents of separate tarballs into one.

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str vcqc_id: FileStore ID of variant calling and QC tarball
    :param str spladder_id: FileStore ID of spladder tarball
    """
    job.fileStore.logToMaster('Consolidating files and uploading: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve IDs
    uuid = inputs.uuid
    # Unpack IDs
    # Retrieve output file paths to consolidate
    vcqc_tar = job.fileStore.readGlobalFile(vcqc_id, os.path.join(work_dir, 'vcqc.tar.gz'))
    spladder_tar = job.fileStore.readGlobalFile(spladder_id, os.path.join(work_dir, 'spladder.tar.gz'))
    # I/O
    fname = uuid + '.tar.gz' if not inputs.improper_pair else 'IMPROPER_PAIR' + uuid + '.tar.gz'
    out_tar = os.path.join(work_dir, fname)
    # Consolidate separate tarballs into one
    with tarfile.open(os.path.join(work_dir, out_tar), 'w:gz') as f_out:
        for tar in [vcqc_tar, spladder_tar]:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar == vcqc_tar:
                            tarinfo.name = os.path.join(uuid, 'variants_and_qc', os.path.basename(tarinfo.name))
                        else:
                            tarinfo.name = os.path.join(uuid, 'spladder', os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output directory
    if inputs.output_dir:
        mkdir_p(inputs.output_dir)
        shutil.copy(out_tar, os.path.join(inputs.output_dir, os.path.basename(out_tar)))
    # Upload to S3
    if inputs.output_s3_dir:
        out_id = job.fileStore.writeGlobalFile(out_tar)
        job.addChildJobFn(s3am_upload_job, file_id=out_id, s3_dir=inputs.output_s3_dir,
                          file_name=fname, key_path=inputs.ssec, cores=inputs.cores)
Example 21
    def _testExternal(self, moduleName, pyFiles, virtualenv=False):
        dirPath = self._createTempDir()
        if virtualenv:
            self.assertTrue(inVirtualEnv())
            # --never-download prevents silent upgrades to pip, wheel and setuptools
            check_call(['virtualenv', '--never-download', dirPath])
            sitePackages = os.path.join(dirPath, 'lib', 'python2.7',
                                        'site-packages')
            # tuple assignment is necessary to make this line immediately precede the try:
            oldPrefix, sys.prefix, dirPath = sys.prefix, dirPath, sitePackages
        else:
            oldPrefix = None
        try:
            pycFiles = set(pyFile + 'c' for pyFile in pyFiles)
            for relPath in pyFiles:
                path = os.path.join(dirPath, relPath)
                mkdir_p(os.path.dirname(path))
                with open(path, 'w') as f:
                    f.write('pass\n')
            sys.path.append(dirPath)
            try:
                userScript = importlib.import_module(moduleName)
                try:
                    self._test(userScript.__name__,
                               expectedContents=pycFiles,
                               allowExtraContents=virtualenv)
                finally:
                    del userScript
                    while moduleName:
                        del sys.modules[moduleName]
                        self.assertFalse(moduleName in sys.modules)
                        moduleName = '.'.join(moduleName.split('.')[:-1])

            finally:
                sys.path.remove(dirPath)
        finally:
            if oldPrefix:
                sys.prefix = oldPrefix
Example 22
def output_file_job(job, filename, file_id, output_dir, s3_key_path=None):
    """
    Uploads a file from the FileStore to an output directory on the local filesystem or S3.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str filename: basename for file
    :param str file_id: FileStoreID
    :param str output_dir: Amazon S3 URL or local path
    :param str s3_key_path: (OPTIONAL) Path to 32-byte key to be used for SSE-C encryption
    :return:
    """
    job.fileStore.logToMaster('Writing {} to {}'.format(filename, output_dir))
    work_dir = job.fileStore.getLocalTempDir()
    filepath = job.fileStore.readGlobalFile(file_id, os.path.join(work_dir, filename))
    if urlparse(output_dir).scheme == 's3':
        s3am_upload(fpath=os.path.join(work_dir, filepath),
                    s3_dir=output_dir,
                    s3_key_path=s3_key_path)
    elif os.path.exists(os.path.join(output_dir, filename)):
        job.fileStore.logToMaster("File already exists: {}".format(filename))
    else:
        mkdir_p(output_dir)
        copy_files([filepath], output_dir)
Example 23
def output_file_job(job, filename, file_id, output_dir, s3_key_path=None):
    """
    Uploads a file from the FileStore to an output directory on the local filesystem or S3.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str filename: basename for file
    :param str file_id: FileStoreID
    :param str output_dir: Amazon S3 URL or local path
    :param str s3_key_path: (OPTIONAL) Path to 32-byte key to be used for SSE-C encryption
    :return:
    """
    job.fileStore.logToMaster('Writing {} to {}'.format(filename, output_dir))
    work_dir = job.fileStore.getLocalTempDir()
    filepath = job.fileStore.readGlobalFile(file_id, os.path.join(work_dir, filename))
    if urlparse(output_dir).scheme == 's3':
        s3am_upload(job=job, fpath=os.path.join(work_dir, filepath),
                    s3_dir=output_dir,
                    s3_key_path=s3_key_path)
    elif os.path.exists(os.path.join(output_dir, filename)):
        job.fileStore.logToMaster("File already exists: {}".format(filename))
    else:
        mkdir_p(output_dir)
        copy_files([filepath], output_dir)
Example 24
    def _testExternal(self, moduleName, pyFiles, virtualenv=False):
        dirPath = self._createTempDir()
        if virtualenv:
            self.assertTrue(inVirtualEnv())
            # --never-download prevents silent upgrades to pip, wheel and setuptools
            check_call(['virtualenv', '--never-download', dirPath])
            sitePackages = os.path.join(dirPath, 'lib', 'python2.7', 'site-packages')
            # tuple assignment is necessary to make this line immediately precede the try:
            oldPrefix, sys.prefix, dirPath = sys.prefix, dirPath, sitePackages
        else:
            oldPrefix = None
        try:
            pycFiles = set(pyFile + 'c' for pyFile in pyFiles)
            for relPath in pyFiles:
                path = os.path.join(dirPath, relPath)
                mkdir_p(os.path.dirname(path))
                with open(path, 'w') as f:
                    f.write('pass\n')
            sys.path.append(dirPath)
            try:
                userScript = importlib.import_module(moduleName)
                try:
                    self._test(userScript.__name__,
                               expectedContents=pycFiles,
                               allowExtraContents=virtualenv)
                finally:
                    del userScript
                    while moduleName:
                        del sys.modules[moduleName]
                        self.assertFalse(moduleName in sys.modules)
                        moduleName = '.'.join(moduleName.split('.')[:-1])

            finally:
                sys.path.remove(dirPath)
        finally:
            if oldPrefix:
                sys.prefix = oldPrefix
Example 25
def download_sample_and_align(job, sample, inputs, ids):
    """
    Downloads the sample and runs BWA-kit

    :param JobFunctionWrappingJob job: Passed by Toil automatically
    :param tuple(str, list) sample: UUID and URLS for sample
    :param Namespace inputs: Contains input arguments
    :param dict ids: FileStore IDs for shared inputs
    """
    uuid, urls = sample
    r1_url, r2_url = urls if len(urls) == 2 else (urls[0], None)
    job.fileStore.logToMaster('Downloaded sample: {0}. R1 {1}\nR2 {2}\nStarting BWA Run'.format(uuid, r1_url, r2_url))
    # Read fastq samples from file store
    ids['r1'] = job.addChildJobFn(download_url_job, r1_url, s3_key_path=inputs.ssec, disk=inputs.file_size).rv()
    if r2_url:
        ids['r2'] = job.addChildJobFn(download_url_job, r2_url, s3_key_path=inputs.ssec, disk=inputs.file_size).rv()
    else:
        ids['r2'] = None
    # Create config for bwakit
    inputs.cores = min(inputs.maxCores, multiprocessing.cpu_count())
    inputs.uuid = uuid
    config = dict(**vars(inputs))  # Create config as a copy of inputs since it has values we want
    config.update(ids)  # Overwrite attributes with the FileStoreIDs from ids
    config = argparse.Namespace(**config)
    # Define and wire job functions
    bam_id = job.wrapJobFn(run_bwakit, config, sort=inputs.sort, trim=inputs.trim,
                           disk=inputs.file_size, cores=inputs.cores)
    job.addFollowOn(bam_id)
    output_name = uuid + '.bam' + str(inputs.suffix) if inputs.suffix else uuid + '.bam'
    if urlparse(inputs.output_dir).scheme == 's3':
        bam_id.addChildJobFn(s3am_upload_job, file_id=bam_id.rv(), file_name=output_name, s3_dir=inputs.output_dir,
                             s3_key_path=inputs.ssec, cores=inputs.cores, disk=inputs.file_size)
    else:
        mkdir_p(inputs.output_dir)
        bam_id.addChildJobFn(copy_file_job, name=output_name, file_id=bam_id.rv(), output_dir=inputs.output_dir,
                                    disk=inputs.file_size)
Example 26
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    MarginPhase pipeline

    =======================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """

    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')

    # Generate subparsers
    subparsers.add_parser(
        'generate-config',
        help='Generates an editable config in the current working directory.')
    subparsers.add_parser(
        'generate-manifest',
        help='Generates an editable manifest in the current working directory.'
    )
    subparsers.add_parser(
        'generate',
        help='Generates a config and manifest in the current working directory.'
    )

    # Run subparser
    parser_run = subparsers.add_parser('run',
                                       help='Runs the MarginPhase pipeline')
    group = parser_run.add_mutually_exclusive_group()
    parser_run.add_argument(
        '--config',
        default=DEFAULT_CONFIG_NAME,
        type=str,
        help=
        'Path to the (filled in) config file, generated with "generate-config". '
        '\nDefault value: "%(default)s"')
    group.add_argument(
        '--manifest',
        default=DEFAULT_MANIFEST_NAME,
        type=str,
        help=
        'Path to the (filled in) manifest file, generated with "generate-manifest". '
        '\nDefault value: "%(default)s"')

    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()

    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_CONFIG_NAME), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_MANIFEST_NAME),
                      generate_manifest)

    # Pipeline execution
    elif args.command == 'run':
        # sanity check
        require(
            os.path.exists(args.config), '{} not found. Please run '
            '"toil-marginphase generate-config"'.format(args.config))
        require(
            os.path.exists(args.manifest),
            '{} not found and no samples provided. Please '
            'run "toil-marginphase generate-manifest"'.format(args.manifest))

        # Parse config
        parsed_config = {
            x.replace('-', '_'): y
            for x, y in yaml.load(open(args.config).read()).iteritems()
        }
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxsize
        config.defaultCores = int(min(MP_CPU, config.maxCores))
        config.maxDisk = int(args.maxDisk) if args.maxDisk else sys.maxint
        config.maxMemory = sys.maxint
        # fix parsing of GB to int
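        # Illustrative conversions (assumed inputs): '8G' or '8GB' -> 8 * 1024**3
        # = 8589934592 bytes, '512M' -> 536870912, '64K' -> 65536; a bare integer
        # string is used as a byte count unchanged.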
        if args.maxMemory:
            args.maxMemory = args.maxMemory.upper()
            if args.maxMemory.endswith('B'):
                args.maxMemory = args.maxMemory.rstrip('B')
            # actual parsing
            if args.maxMemory.endswith('G'):
                config.maxMemory = int(
                    args.maxMemory.rstrip('G')) * 1024 * 1024 * 1024
            elif args.maxMemory.endswith('M'):
                config.maxMemory = int(
                    args.maxMemory.rstrip('M')) * 1024 * 1024
            elif args.maxMemory.endswith('K'):
                config.maxMemory = int(args.maxMemory.rstrip('K')) * 1024
            else:
                config.maxMemory = int(args.maxMemory)

        # Config sanity checks
        require(config.output_dir, 'No output location specified')
        if urlparse(config.output_dir).scheme != "s3":
            config.output_dir = config.output_dir.replace("file://", "", 1)
            mkdir_p(config.output_dir)
        if not config.output_dir.endswith('/'):
            config.output_dir += '/'
        require(config.partition_size,
                "Configuration parameter partition-size is required")
        require(config.partition_margin,
                "Configuration parameter partition-margin is required")

        if 'save_intermediate_files' not in config or not config.save_intermediate_files:
            config.intermediate_file_location = None
        elif urlparse(config.output_dir).scheme == "s3":
            raise UserError(
                "Config parameter 'save_intermediate_files' cannot be used with s3 output directory"
            )
        else:
            intermediate_location = os.path.join(
                config.output_dir, "intermediate",
                datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
            mkdir_p(intermediate_location)
            config.intermediate_file_location = intermediate_location
        if "margin_phase_image" not in config or len(
                config.margin_phase_image) == 0:
            config.margin_phase_image = DOCKER_MARGIN_PHASE_IMG_DEFAULT
        if "margin_phase_tag" not in config or len(
                config.margin_phase_tag) == 0:
            config.margin_phase_tag = DOCKER_MARGIN_PHASE_TAG_DEFAULT
        if "cpecan_image" not in config or len(config.cpecan_image) == 0:
            config.cpecan_image = DOCKER_CPECAN_IMG_DEFAULT
        if "cpecan_tag" not in config or len(config.cpecan_tag) == 0:
            config.cpecan_tag = DOCKER_CPECAN_TAG_DEFAULT
        if "unittest" not in config:
            config.unittest = False
        if "minimal_output" not in config:
            config.minimal_output = False
        if "minimal_cpecan_output" not in config:
            config.minimal_cpecan_output = False
        if "cpecan_probabilities" not in config:
            config.cpecan_probabilities = False

        # get samples
        samples = parse_samples(config, args.manifest)

        # Program checks
        for program in ['docker']:
            require(
                next(which(program), None),
                program + ' must be installed on every node.'.format(program))

        # Start the workflow
        Job.Runner.startToil(
            Job.wrapJobFn(map_job, prepare_input, samples, config), args)
Example 27
def consolidate_output(job, config, chunk_infos):
    #prep
    start = time.time()
    uuid = config.uuid
    work_dir = job.fileStore.getLocalTempDir()
    out_tar = os.path.join(work_dir, '{}.tar.gz'.format(config.uuid))

    log(job, "{}".format(datetime.datetime.now()), uuid, 'consolidate_output')
    log(job, "consolidating {} files".format(len(chunk_infos)), uuid,
        'consolidate_output')

    # build tarball
    out_tars = [out_tar]
    output_file_count = 0
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for ci in chunk_infos:
            file_id = ci[CI_OUTPUT_FILE_ID]
            tar_file = os.path.join(work_dir,
                                    "{}.tar.gz".format(ci[CI_CHUNK_INDEX]))
            job.fileStore.readGlobalFile(file_id, tar_file)
            out_tars.append(tar_file)
            with tarfile.open(tar_file, 'r') as f_in:
                for tarinfo in f_in:
                    if config.minimal_output and (
                        (tarinfo.name.endswith("bam")
                         or tarinfo.name.endswith("sam")
                         or tarinfo.name.endswith("bai"))
                            and ID_MERGED not in tarinfo.name):
                        log(
                            job,
                            "(Minimal Output) Skipping output file: {}".format(
                                tarinfo.name), uuid, 'consolidate_output')
                        continue
                    if config.minimal_cpecan_output and tarinfo.name.endswith(
                            "gz"):
                        log(
                            job,
                            "(Minimal cPecan Output) Skipping output file: {}".
                            format(tarinfo.name), uuid, 'consolidate_output')
                        continue
                    log(job, "file {}".format(tarinfo.name), uuid,
                        'consolidate_output')
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        f_out.addfile(tarinfo, fileobj=f_in_file)
                        output_file_count += 1
    log(
        job,
        "Consolidated {} files in {} tarballs".format(output_file_count,
                                                      len(out_tars)), uuid,
        'consolidate_output')

    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        log(job, "Uploading {} to S3: {}".format(out_tar, config.output_dir),
            uuid, 'consolidate_output')
        s3am_upload(fpath=out_tar,
                    s3_dir=config.output_dir,
                    num_cores=config.maxCores)
    else:
        log(job, "Moving {} to output dir: {}".format(out_tar,
                                                      config.output_dir), uuid,
            'consolidate_output')
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)

    # log
    log_generic_job_debug(job,
                          config.uuid,
                          "consolidate_output",
                          work_dir=work_dir)
    log_time(job, "consolidate_output", start, config.uuid)
    log(job, "{}".format(datetime.datetime.now()), uuid, 'END')

    # return location (calculated the same whether s3:// or file://)
    return os.path.join(config.output_dir, os.path.basename(out_tar))
Example 28
def prepare_input(job, sample, config, enqueue_consolidation=True):

    # job prep
    config = argparse.Namespace(**vars(config))
    uuid, url, contig_name, reference_url, params_url = sample
    config.uuid = uuid
    config.contig_name = contig_name
    config.reference_url = reference_url
    config.params_url = params_url
    if config.intermediate_file_location is not None:
        config.intermediate_file_location = os.path.join(
            config.intermediate_file_location, uuid)
        mkdir_p(config.intermediate_file_location)
    work_dir = job.fileStore.getLocalTempDir()
    start = time.time()
    log(job, "{}".format(datetime.datetime.now()), config.uuid, 'START')
    log(
        job,
        "Preparing input with URL:{}, contig:{}, reference_url:{}, params_url:{}"
        .format(url, contig_name, reference_url,
                params_url), uuid, 'prepare_input')

    # todo global resource estimation
    config.maxCores = min(config.maxCores, multiprocessing.cpu_count())
    config.defaultCores = min(MP_CPU, config.maxCores)
    config.maxMemory = min(config.maxMemory, int(physicalMemory() * .95))
    #config.disk

    # download references - TOIL_JOBSTORE_PROTOCOL queries are so this function can be imported

    #ref fasta
    if reference_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        ref_genome_fileid = reference_url.replace(TOIL_JOBSTORE_PROTOCOL, '',
                                                  1)
        ref_genome_filename = "{}.reference.{}.fa".format(uuid, contig_name)
        job.fileStore.readGlobalFile(
            ref_genome_fileid, os.path.join(work_dir, ref_genome_filename))
    else:
        download_url(reference_url, work_dir=work_dir)
        ref_genome_filename = os.path.basename(reference_url)
        ref_genome_fileid = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, ref_genome_filename))
    ref_genome_size = os.stat(os.path.join(work_dir,
                                           ref_genome_filename)).st_size
    config.reference_genome_fileid = ref_genome_fileid

    #params
    if params_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        params_fileid = params_url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1)
    else:
        download_url(params_url, work_dir=work_dir)
        params_filename = os.path.basename(params_url)
        params_fileid = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, params_filename))
    config.params_fileid = params_fileid

    # download bam
    if url.startswith(TOIL_JOBSTORE_PROTOCOL):
        bam_filename = "{}.input.{}.bam".format(uuid, contig_name)
        job.fileStore.readGlobalFile(
            url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1),
            os.path.join(work_dir, bam_filename))
    else:
        download_url(url, work_dir=work_dir)
        bam_filename = os.path.basename(url)
    data_bam_location = os.path.join("/data", bam_filename)
    workdir_bam_location = os.path.join(work_dir, bam_filename)

    # index the bam
    _index_bam(job, config, work_dir, bam_filename)

    # sanity check
    workdir_bai_location = os.path.join(work_dir, bam_filename + ".bai")
    if not os.path.isfile(workdir_bai_location):
        raise UserError("BAM index file not created for {}: {}".format(
            bam_filename, workdir_bai_location))

    # get start and end location
    start_idx = sys.maxint
    end_idx = 0
    with closing(
            pysam.AlignmentFile(
                workdir_bam_location,
                'rb' if bam_filename.endswith("bam") else 'r')) as aln:
        for read in aln.fetch():
            align_start = read.reference_start
            align_end = read.reference_end
            start_idx = min([start_idx, align_start])
            end_idx = max([end_idx, align_end])
    log(job, "start_pos:{}, end_pos:{}".format(config.uuid, start_idx,
                                               end_idx), uuid, 'prepare_input')

    # get reads from positions
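    # Chunking sketch with illustrative numbers (not pipeline defaults): with
    # partition_size=50000 and partition_margin=5000, the chunk boundaries advance
    # in disjoint 50 kb steps while the reads fetched for each chunk extend 5 kb
    # past both boundaries, so neighbouring chunks overlap by 2 * partition_margin
    # bases.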
    chunk_infos = list()
    idx = start_idx
    while idx < end_idx:
        ci = {CI_UUID: uuid}
        ci[CI_CHUNK_BOUNDARY_START] = idx
        chunk_start = idx - config.partition_margin
        ci[CI_CHUNK_START] = chunk_start
        idx += config.partition_size
        ci[CI_CHUNK_BOUNDARY_END] = idx
        chunk_end = idx + config.partition_margin
        ci[CI_CHUNK_END] = chunk_end
        chunk_infos.append(ci)

    # enqueue jobs
    log(job, "Enqueueing {} jobs".format(len(chunk_infos)), uuid,
        'prepare_input')
    idx = 0
    enqueued_jobs = 0
    returned_tarballs = list()
    for ci in chunk_infos:
        #prep
        ci[CI_CHUNK_INDEX] = idx
        chunk_start = ci[CI_CHUNK_START]
        chunk_end = ci[CI_CHUNK_END]
        chunk_position_description = "{}:{}-{}".format(config.contig_name,
                                                       chunk_start, chunk_end)
        bam_split_command = [
            "view", "-b", data_bam_location, chunk_position_description
        ]
        chunk_name = "{}.{}.bam".format(config.uuid, idx)

        #write chunk
        chunk_location = os.path.join(work_dir, chunk_name)
        with open(chunk_location, 'w') as out:
            docker_call(job,
                        config,
                        work_dir,
                        bam_split_command,
                        DOCKER_SAMTOOLS_IMG,
                        DOCKER_SAMTOOLS_TAG,
                        outfile=out)

        #document read count
        chunk_size = os.stat(chunk_location).st_size
        ci[CI_CHUNK_SIZE] = chunk_size
        ci[CI_REF_FA_SIZE] = ref_genome_size
        read_count = prepare_input__get_bam_read_count(job, work_dir,
                                                       chunk_name)
        ci[CI_READ_COUNT] = read_count
        log(
            job,
            "chunk from {} for idx {} is {}b ({}mb) and has {} reads".format(
                chunk_position_description, idx, chunk_size,
                int(chunk_size / 1024 / 1024),
                read_count), uuid, 'prepare_input')
        if config.intermediate_file_location is not None:
            copy_files(file_paths=[chunk_location],
                       output_dir=config.intermediate_file_location)

        # enqueue marginPhase job
        if read_count > 0:
            chunk_fileid = job.fileStore.writeGlobalFile(chunk_location)
            mp_cores = config.defaultCores
            mp_mem = int(
                min(
                    int(chunk_size * MP_MEM_BAM_FACTOR +
                        ref_genome_size * MP_MEM_REF_FACTOR),
                    config.maxMemory))
            mp_disk = int(
                min(
                    int(chunk_size * MP_DSK_BAM_FACTOR +
                        ref_genome_size * MP_DSK_REF_FACTOR +
                        (0 if config.cpecan_probabilities else
                         MP_DSK_CPECAN_FACTOR) * chunk_size), config.maxDisk))
            log(
                job,
                "requesting {} cores, {}b ({}mb) disk, {}b ({}gb) mem".format(
                    mp_cores, mp_disk, int(mp_disk / 1024 / 1024), mp_mem,
                    int(mp_mem / 1024 / 1024 / 1024)),
                "{}.{}".format(uuid, idx), 'prepare_input')
            mp_mem = str(int(mp_mem / 1024)) + "K"
            mp_disk = str(int(mp_disk / 1024)) + "K"
            margin_phase_job = job.addChildJobFn(run_margin_phase,
                                                 config,
                                                 chunk_fileid,
                                                 ci,
                                                 memory=mp_mem,
                                                 cores=mp_cores,
                                                 disk=mp_disk)
            returned_tarballs.append(margin_phase_job.rv())
            enqueued_jobs += 1
        idx += 1

    log(job, "Enqueued {} jobs".format(enqueued_jobs), uuid, 'prepare_input')

    # enqueue merging and consolidation job
    merge_job = job.addFollowOnJobFn(merge_chunks, config, returned_tarballs)
    final_return_value = merge_job.rv()
    if enqueue_consolidation:
        consolidation_job = merge_job.addFollowOnJobFn(consolidate_output,
                                                       config, merge_job.rv())
        final_return_value = consolidation_job.rv()

    # log
    log_generic_job_debug(job, config.uuid, 'prepare_input', work_dir=work_dir)
    log_time(job, "prepare_input", start, config.uuid)

    # return appropriate output
    return final_return_value
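
The memory and disk requests above are linear estimates from the chunk and reference sizes, clamped to the configured maxima and handed to Toil as "K"-suffixed strings. A minimal sketch of the same arithmetic, using made-up factor values in place of the MP_* constants:

# The factor values below are made-up stand-ins for MP_MEM_BAM_FACTOR and
# MP_MEM_REF_FACTOR; the real pipeline defines its own constants elsewhere.
MEM_BAM_FACTOR = 512   # assumed: bytes of RAM requested per byte of BAM chunk
MEM_REF_FACTOR = 1     # assumed: bytes of RAM requested per byte of reference

def estimate_chunk_memory(chunk_size, ref_genome_size, max_memory):
    """Linear estimate for one chunk, clamped to the configured ceiling."""
    estimate = int(chunk_size * MEM_BAM_FACTOR + ref_genome_size * MEM_REF_FACTOR)
    return min(estimate, max_memory)

# Toil accepts human-readable size strings, so the byte count is handed over
# with a "K" suffix, just as the job above does.
mem_bytes = estimate_chunk_memory(chunk_size=64 * 1024 * 1024,
                                  ref_genome_size=3 * 10**9,
                                  max_memory=32 * 1024**3)
mem_string = str(int(mem_bytes / 1024)) + "K"
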
Esempio n. 29
0
    def testDockerClean(self,
                        disableCaching=True,
                        detached=True,
                        rm=True,
                        deferParam=None):
        """
        Run the test container that creates a file in the work dir, and sleeps
        for 5 minutes.
        Ensure that the calling job gets SIGKILLed after a minute, leaving
        behind the spooky/ghost/zombie container. Ensure that the container is
        killed on batch system shutdown (through the deferParam mechanism).
        """

        # We need to test the behaviour of `deferParam` with `rm` and
        # `detached`. We do not look at the case where `rm` and `detached` are
        # both True.  This is the truth table for the different combinations at
        # the end of the test. R = Running, X = Does not exist, E = Exists but
        # not running.
        #              None     FORGO     STOP    RM
        #    rm        X         R         X      X
        # detached     R         R         E      X
        #  Neither     R         R         E      X

        data_dir = os.path.join(self.tempDir, 'data')
        working_dir = os.path.join(self.tempDir, 'working')
        test_file = os.path.join(working_dir, 'test.txt')

        mkdir_p(data_dir)
        mkdir_p(working_dir)

        options = Job.Runner.getDefaultOptions(
            os.path.join(self.tempDir, 'jobstore'))
        options.logLevel = self.dockerTestLogLevel
        options.workDir = working_dir
        options.clean = 'always'
        options.disableCaching = disableCaching

        # No base64 logic since it might create a name starting with a `-`.
        container_name = uuid.uuid4().hex
        A = Job.wrapJobFn(_testDockerCleanFn, working_dir, detached, rm,
                          deferParam, container_name)
        try:
            Job.Runner.startToil(A, options)
        except FailedJobsException:
            # The file created by spooky_container would remain in the directory
            # and since it was created inside the container, it would have had
            # uid and gid == 0 (root) which may cause problems when docker
            # attempts to clean up the jobstore.
            file_stats = os.stat(test_file)
            assert file_stats.st_gid != 0
            assert file_stats.st_uid != 0

            if (rm and (deferParam != FORGO)) or deferParam == RM:
                # These containers should not exist
                assert containerIsRunning(container_name) is None, \
                    'Container was not removed.'

            elif deferParam == STOP:
                # These containers should exist but be non-running
                assert containerIsRunning(container_name) == False, \
                    'Container was not stopped.'

            else:
                # These containers will be running
                assert containerIsRunning(container_name) == True, \
                    'Container was not running.'
        client = docker.from_env(version='auto')
        dockerKill(container_name, client)
        try:
            os.remove(test_file)
        except OSError:
            # The test file may already be gone; that is fine here.
            pass
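
The truth table in the comment is what the closing assertions encode: whether the container ends up removed, stopped, or still running depends only on the rm flag and the defer parameter. A small sketch, with placeholder values standing in for Toil's FORGO/STOP/RM constants, that spells out the same mapping:

FORGO, STOP, RM = 'forgo', 'stop', 'rm'   # placeholders; the test imports Toil's own constants

def expected_container_state(rm, defer):
    """Return the container state the assertions above expect."""
    if (rm and defer != FORGO) or defer == RM:
        return 'removed'   # containerIsRunning(name) is None
    if defer == STOP:
        return 'stopped'   # containerIsRunning(name) == False
    return 'running'       # containerIsRunning(name) == True
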
Esempio n. 30
0
def _populate_keys_from_metadata_server(self):
    global _populate_keys_from_metadata_server_orig
    path = os.path.expanduser(cache_path)
    tmp_path = path + '.tmp'
    while True:
        log.debug('Attempting to read cached credentials from %s.', path)
        try:
            with open(path, 'r') as f:
                content = f.read()
                if content:
                    record = content.split('\n')
                    assert len(record) == 4
                    self._access_key = record[0]
                    self._secret_key = record[1]
                    self._security_token = record[2]
                    self._credential_expiry_time = str_to_datetime(record[3])
                else:
                    log.debug('%s is empty. Credentials are not temporary.',
                              path)
                    return
        except IOError as e:
            if e.errno == errno.ENOENT:
                log.debug('Cached credentials are missing.')
                dir_path = os.path.dirname(path)
                if not os.path.exists(dir_path):
                    log.debug('Creating parent directory %s', dir_path)
                    # A race would be ok at this point
                    mkdir_p(dir_path)
            else:
                raise
        else:
            if self._credentials_need_refresh():
                log.debug('Cached credentials are expired.')
            else:
                log.debug('Cached credentials exist and are still fresh.')
                return
        # We get here if credentials are missing or expired
        log.debug('Racing to create %s.', tmp_path)
        # Only one process, the winner, will succeed
        try:
            fd = os.open(tmp_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o600)
        except OSError as e:
            if e.errno == errno.EEXIST:
                log.debug(
                    'Lost the race to create %s. Waiting on winner to remove it.',
                    tmp_path)
                while os.path.exists(tmp_path):
                    time.sleep(.1)
                log.debug('Winner removed %s. Trying from the top.', tmp_path)
            else:
                raise
        else:
            try:
                log.debug(
                    'Won the race to create %s. '
                    'Requesting credentials from metadata service.', tmp_path)
                _populate_keys_from_metadata_server_orig(self)
            except:
                os.close(fd)
                fd = None
                log.debug('Failed to obtain credentials, removing %s.',
                          tmp_path)
                # This unblocks the losers.
                os.unlink(tmp_path)
                # Bail out. It's too likely to happen repeatedly
                raise
            else:
                if self._credential_expiry_time is None:
                    os.close(fd)
                    fd = None
                    log.debug(
                        'Credentials are not temporary. '
                        'Leaving %s empty and renaming it to %s.', tmp_path,
                        path)
                else:
                    log.debug('Writing credentials to %s.', tmp_path)
                    with os.fdopen(fd, 'w') as fh:
                        fd = None
                        fh.write('\n'.join([
                            self._access_key, self._secret_key,
                            self._security_token,
                            datetime_to_str(self._credential_expiry_time)
                        ]))
                    log.debug('Wrote credentials to %s. '
                              'Renaming it to %s.', tmp_path, path)
                os.rename(tmp_path, path)
                return
            finally:
                if fd is not None:
                    os.close(fd)
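
The caching logic above leans on os.open with O_CREAT | O_EXCL being atomic: exactly one process wins the race to create the .tmp file, fetches fresh credentials, and renames the file into place, while the losers wait for the tmp file to disappear and then re-read the cache. A stripped-down sketch of that pattern (the path handling and the refresh callable are placeholders, not the boto internals used above):

import errno
import os
import time

def refresh_once(path, refresh):
    """Let exactly one process run refresh() and publish its result at path."""
    tmp_path = path + '.tmp'
    try:
        # Atomic on POSIX: only one process can create the file exclusively.
        fd = os.open(tmp_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o600)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
        # Lost the race; wait for the winner, then let the caller re-read the cache.
        while os.path.exists(tmp_path):
            time.sleep(0.1)
        return False
    try:
        with os.fdopen(fd, 'w') as fh:
            fh.write(refresh())
        os.rename(tmp_path, path)   # publish the fresh content atomically
        return True
    except Exception:
        os.unlink(tmp_path)         # unblock the losers before re-raising
        raise
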
Esempio n. 31
0
 def testDockerClean(self, caching=True):
     """
     Run the test container that creates a file in the work dir, and sleeps for 5 minutes.  Ensure
     that the calling job gets SIGKILLed after a minute, leaving behind the spooky/ghost/zombie
     container. Ensure that the container is killed on batch system shutdown (through the defer
     mechanism).
     This inherently also tests _docker
     :returns: None
     """
     # We need to test the behaviour of `defer` with `rm` and `detached`. We do not look at the case
     # where `rm` and `detached` are both True.  This is the truth table for the different
     # combinations at the end of the test. R = Running, X = Does not exist, E = Exists but not
     # running.
     #              None     FORGO     STOP    RM
     #    rm        X         R         X      X
     # detached     R         R         E      X
     #  Neither     R         R         E      X
     assert os.getuid() != 0, "Cannot test this if the user is root."
     data_dir = os.path.join(self.tempDir, 'data')
     work_dir = os.path.join(self.tempDir, 'working')
     test_file = os.path.join(data_dir, 'test.txt')
     mkdir_p(data_dir)
     mkdir_p(work_dir)
     options = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, 'jobstore'))
     options.logLevel = 'INFO'
     options.workDir = work_dir
     options.clean = 'always'
     if not caching:
         options.disableCaching = True
     for rm in (True, False):
         for detached in (True, False):
             if detached and rm:
                 continue
             for defer in (FORGO, STOP, RM, None):
                 # Not using base64 logic here since it might create a name starting with a `-`.
                 container_name = uuid.uuid4().hex
                 A = Job.wrapJobFn(_testDockerCleanFn, data_dir, detached, rm, defer,
                                   container_name)
                 try:
                     Job.Runner.startToil(A, options)
                 except FailedJobsException:
                     # The file created by spooky_container would remain in the directory, and since
                     # it was created inside the container, it would have had uid and gid == 0 (root)
                     # upon creation. If the defer mechanism worked, it should now be non-zero and we
                     # check for that.
                     file_stats = os.stat(test_file)
                     assert file_stats.st_gid != 0
                     assert file_stats.st_uid != 0
                     if (rm and defer != FORGO) or defer == RM:
                         # These containers should not exist
                         assert _containerIsRunning(container_name) is None, \
                             'Container was not removed.'
                     elif defer == STOP:
                         # These containers should exist but be non-running
                         assert _containerIsRunning(container_name) == False, \
                             'Container was not stopped.'
                     else:
                         # These containers will be running
                         assert _containerIsRunning(container_name) == True, \
                             'Container was not running.'
                 finally:
                     # Prepare for the next test.
                     _dockerKill(container_name, RM)
                     os.remove(test_file)
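
The three nested loops enumerate every rm/detached/defer combination except the rm-and-detached case that the comment rules out. The same enumeration can be written with itertools.product; a sketch with placeholder values for Toil's defer constants:

import itertools

FORGO, STOP, RM = 'forgo', 'stop', 'rm'   # placeholders for Toil's defer constants

combinations = [
    (rm, detached, defer)
    for rm, detached in itertools.product((True, False), repeat=2)
    if not (rm and detached)                   # the excluded case from the comment
    for defer in (FORGO, STOP, RM, None)
]
assert len(combinations) == 12                 # 3 surviving flag pairs x 4 defer values
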
Esempio n. 33
0
def consolidate_output(job, config, kallisto_output, rsem_star_output,
                       fastqc_output):
    """
    Combines the contents of the outputs into one tarball and places in output directory or s3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param FileID kallisto_output: FileStoreID for Kallisto output
    :param tuple(FileID, FileID, FileID)|tuple(FileID, FileID, FileID, bool, FileID) rsem_star_output:
            FileStoreIDs for RSEM and STAR output, and a flag/FileID if run with bamQC
    :param FileID fastqc_output: FileStoreID for FastQC output
    """
    job.fileStore.logToMaster('Consolidating output: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    config.uuid = 'SINGLE-END.' + config.uuid if not config.paired else config.uuid
    # Retrieve output file paths to consolidate
    rsem_tar, hugo_tar, kallisto_tar, fastqc_tar, bamqc_tar, star_tar = None, None, None, None, None, None
    if rsem_star_output:
        if config.bamqc:
            rsem_id, hugo_id, star_id, fail_flag, bamqc_id = flatten(
                rsem_star_output)
            bamqc_tar = job.fileStore.readGlobalFile(
                bamqc_id, os.path.join(work_dir, 'bamqc.tar.gz'))
            config.uuid = 'FAIL.' + config.uuid if fail_flag else config.uuid
        else:
            rsem_id, hugo_id, star_id = flatten(rsem_star_output)
        rsem_tar = job.fileStore.readGlobalFile(
            rsem_id, os.path.join(work_dir, 'rsem.tar.gz'))
        hugo_tar = job.fileStore.readGlobalFile(
            hugo_id, os.path.join(work_dir, 'rsem_hugo.tar.gz'))
        star_tar = job.fileStore.readGlobalFile(
            star_id, os.path.join(work_dir, 'star.tar.gz'))
    if kallisto_output:
        kallisto_tar = job.fileStore.readGlobalFile(
            kallisto_output, os.path.join(work_dir, 'kallisto.tar.gz'))
    if fastqc_output:
        fastqc_tar = job.fileStore.readGlobalFile(
            fastqc_output, os.path.join(work_dir, 'fastqc.tar.gz'))
    # I/O
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [
        x for x in
        [rsem_tar, hugo_tar, kallisto_tar, fastqc_tar, bamqc_tar, star_tar]
        if x is not None
    ]
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar == rsem_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'RSEM',
                                os.path.basename(tarinfo.name))
                        elif tar == hugo_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'RSEM', 'Hugo',
                                os.path.basename(tarinfo.name))
                        elif tar == kallisto_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'Kallisto',
                                os.path.basename(tarinfo.name))
                        elif tar == bamqc_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'QC', 'bamQC',
                                os.path.basename(tarinfo.name))
                        elif tar == fastqc_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'QC', 'fastQC',
                                os.path.basename(tarinfo.name))
                        elif tar == star_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'QC', 'STAR',
                                os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(
            config.uuid, config.output_dir))
        s3am_upload(fpath=out_tar,
                    s3_dir=config.output_dir,
                    num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(
            config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(
            file_paths=[os.path.join(work_dir, config.uuid + '.tar.gz')],
            output_dir=config.output_dir)
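
The consolidation loop above never extracts anything to disk: it re-roots each member's name and streams the member bytes directly from the input tarball into the gzipped output. A minimal, self-contained sketch of that technique with placeholder paths:

import os
import tarfile
from contextlib import closing

def merge_tarballs(out_path, inputs):
    """Copy every member of the input tarballs into one gzipped tarball,
    prefixing each member with the basename of the tarball it came from."""
    with tarfile.open(out_path, 'w:gz') as f_out:
        for tar_path in inputs:
            prefix = os.path.basename(tar_path).split('.')[0]
            with tarfile.open(tar_path, 'r') as f_in:
                for tarinfo in f_in:
                    if not tarinfo.isfile():
                        continue  # skip directories; extractfile() returns None for them
                    with closing(f_in.extractfile(tarinfo)) as member:
                        tarinfo.name = os.path.join(prefix, os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=member)

# e.g. merge_tarballs('consolidated.tar.gz', ['rsem.tar.gz', 'kallisto.tar.gz'])
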