def assert_bam_is_paired_end(job, bam_path, region='chr6'):
    """
    Confirm that a BAM is paired-end and not single-end. Raises an error if the BAM is not paired-end.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam_path: Path to BAM
    :param str region: Region of the genome to select
    """
    # Check if BAM index exists, otherwise index BAM
    bam_no_ext = os.path.splitext(bam_path)[0]
    if not os.path.exists(bam_no_ext + '.bai') and not os.path.exists(bam_no_ext + '.bam.bai'):
        index_bam(job, bam_path)
    docker_bam_path = docker_path(bam_path)
    work_dir = os.path.dirname(os.path.abspath(bam_path))

    # Check both the "chr" and no-"chr" naming of the region
    results = []
    regions = [region, 'chr' + region] if 'chr' not in region else [region, region.lstrip('chr')]
    for r in regions:
        # Count reads with the "read paired" flag set (-f 1).
        # chr6 chosen for testing; any region with reads will work.
        parameters = ['view', '-c', '-f', '1', docker_bam_path, r]
        out = dockerCheckOutput(job, workDir=work_dir, parameters=parameters, tool=samtools_version)
        results.append(int(out.strip()))
    assert any(x for x in results if x != 0), 'BAM is not paired-end, aborting run.'
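# A minimal usage sketch, not part of the original module: how the validator above
# might be driven from a Toil workflow. The wrapper job name and the BAM path are
# hypothetical; Job.wrapJobFn and the Toil context manager are standard Toil APIs,
# and `samtools_version` plus the docker helpers are assumed to be defined at module level.
from toil.common import Toil
from toil.job import Job

def _validate_bam_example(job, bam_path):
    # Delegates to the validator above; raises if the BAM is single-end.
    assert_bam_is_paired_end(job, bam_path)

def run_validation_example(options, bam_path='/data/sample.bam'):
    # `options` would come from Job.Runner.getDefaultOptions or an argument parser.
    with Toil(options) as workflow:
        return workflow.start(Job.wrapJobFn(_validate_bam_example, bam_path))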
def _testSubprocessDockerPipeChainFn(job):
    """
    Return the result of a simple pipe chain. Should be 2.
    """
    parameters = [['printf', 'x\n y\n'], ['wc', '-l']]
    return dockerCheckOutput(job, tool='quay.io/ucsc_cgl/spooky_test', parameters=parameters)
def _testDockerPipeChainErrorFn(job):
    """
    Return True if the command exit 1 | wc -l raises a CalledProcessError when run through
    the docker interface.
    """
    parameters = [['exit', '1'], ['wc', '-l']]
    try:
        # The first command exits non-zero, so this call is expected to raise.
        dockerCheckOutput(job, tool='quay.io/ucsc_cgl/spooky_test', parameters=parameters)
    except CalledProcessError:
        return True
    return False
def _testSubprocessDockerPipeChainErrorFn(job):
    """
    Return True if the command exit 1 | wc -l raises a CalledProcessError when run through
    the docker interface.
    """
    parameters = [['exit', '1'], ['wc', '-l']]
    try:
        # The first command exits non-zero, so this call is expected to raise.
        dockerCheckOutput(job, tool='quay.io/ucsc_cgl/spooky_test', parameters=parameters)
    except subprocess.CalledProcessError:
        return True
    return False
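# A hedged sketch of how the pipe-chain test functions above are typically exercised:
# wrap one in a Toil job, run the workflow, and assert on the returned value. The
# jobstore paths are hypothetical; the expected values follow the docstrings
# (printf 'x\n y\n' | wc -l prints 2, and the error variants return True when the
# CalledProcessError is raised).
from toil.job import Job

def check_pipe_chain_example(jobstore='./pipe-chain-jobstore'):
    options = Job.Runner.getDefaultOptions(jobstore)
    rv = Job.Runner.startToil(Job.wrapJobFn(_testSubprocessDockerPipeChainFn), options)
    assert rv.strip() == '2'

    options = Job.Runner.getDefaultOptions(jobstore + '-error')
    rv = Job.Runner.startToil(Job.wrapJobFn(_testSubprocessDockerPipeChainErrorFn), options)
    assert rv is True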
def start(self, job):
    """
    Start the Spark master and HDFS namenode containers.

    :param job: The underlying job.
    """
    if self.hostname is None:
        self.hostname = subprocess.check_output(["hostname", "-f"])[:-1]

    self.sparkContainerID = dockerCheckOutput(
        job=job,
        defer=STOP,
        workDir=os.getcwd(),
        tool="quay.io/ucsc_cgl/apache-spark-master:2.1.1--acd08bd3e5670502636fb6842b777eb2aff6def7",
        dockerParameters=["--net=host",
                          "-d",
                          "-v", "/mnt/ephemeral/:/ephemeral/:rw",
                          "-e", "SPARK_MASTER_IP=" + self.hostname,
                          "-e", "SPARK_LOCAL_DIRS=/ephemeral/spark/local",
                          "-e", "SPARK_WORKER_DIR=/ephemeral/spark/work"],
        parameters=[self.hostname])[:-1]
    _log.info("Started Spark master container.")

    self.hdfsContainerID = dockerCheckOutput(
        job=job,
        defer=STOP,
        workDir=os.getcwd(),
        tool="quay.io/ucsc_cgl/apache-hadoop-master:2.7.4--9a9cdee76b65a50e63ad2ca8f9db6da2bd11965c",
        dockerParameters=["--net=host", "-d"],
        parameters=[self.hostname])[:-1]
    _log.info("Started HDFS namenode container.")

    return self.hostname
def __start_datanode(self, job):
    """
    Launches the Hadoop datanode.

    :param job: The underlying job.
    """
    self.hdfsContainerID = dockerCheckOutput(
        job=job,
        defer=STOP,
        workDir=os.getcwd(),
        tool="quay.io/ucsc_cgl/apache-hadoop-worker:2.7.4--9a9cdee76b65a50e63ad2ca8f9db6da2bd11965c",
        dockerParameters=["--net=host",
                          "-d",
                          "-v", "/mnt/ephemeral/:/ephemeral/:rw"],
        parameters=[self.masterIP])[:-1]
def _testDockerPipeChainFn(job):
    """
    Return the result of a simple pipe chain. Should be 2.
    """
    parameters = [['printf', 'x\n y\n'], ['wc', '-l']]
    return dockerCheckOutput(job, tool='quay.io/ucsc_cgl/spooky_test', parameters=parameters)
def start(self, job):
    """
    Start the Spark worker and HDFS datanode containers.

    :param job: The underlying job.
    """
    # Start the Spark worker and our datanode
    self.sparkContainerID = dockerCheckOutput(
        job=job,
        defer=STOP,
        workDir=os.getcwd(),
        tool="quay.io/ucsc_cgl/apache-spark-worker:2.1.1--acd08bd3e5670502636fb6842b777eb2aff6def7",
        dockerParameters=["--net=host",
                          "-d",
                          "-v", "/mnt/ephemeral/:/ephemeral/:rw",
                          "-e", "\"SPARK_MASTER_IP=" + self.masterIP + ":" + _SPARK_MASTER_PORT + "\"",
                          "-e", "SPARK_LOCAL_DIRS=/ephemeral/spark/local",
                          "-e", "SPARK_WORKER_DIR=/ephemeral/spark/work"],
        parameters=[self.masterIP + ":" + _SPARK_MASTER_PORT])[:-1]
    self.__start_datanode(job)

    # Fake do/while to check whether HDFS came up cleanly
    hdfs_down = True
    retries = 0
    while hdfs_down and (retries < 5):
        _log.info("Sleeping 30 seconds before checking HDFS startup.")
        time.sleep(30)
        clusterID = ""
        try:
            clusterID = subprocess.check_output(["docker", "exec", self.hdfsContainerID,
                                                 "grep", "clusterID", "-R",
                                                 "/opt/apache-hadoop/logs"])
        except subprocess.CalledProcessError:
            # grep returns a non-zero exit code if the pattern is not found;
            # we expect to not find the pattern, so a non-zero code is OK
            pass

        if "Incompatible" in clusterID:
            _log.warning("Hadoop datanode failed to start with: %s", clusterID)
            _log.warning("Retrying container startup, retry #%d.", retries)
            retries += 1

            _log.warning("Removing ephemeral hdfs directory.")
            subprocess.check_call(["docker", "exec", self.hdfsContainerID,
                                   "rm", "-rf", "/ephemeral/hdfs"])

            _log.warning("Killing container %s.", self.hdfsContainerID)
            subprocess.check_call(["docker", "kill", self.hdfsContainerID])

            # todo: this is copied code. clean up!
            _log.info("Restarting datanode.")
            self.__start_datanode(job)
        else:
            _log.info("HDFS datanode started up OK!")
            hdfs_down = False

    if retries >= 5:
        raise RuntimeError("Failed %d times trying to start HDFS datanode." % retries)

    return
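# A hedged sketch of how the master and worker start() methods above are typically
# wired into a Toil workflow as services. The class names MasterService/WorkerService
# and their constructor arguments are assumptions (the classes themselves are not
# shown in this excerpt); job.addService and its parentService keyword are standard
# Toil APIs, and addService returns a promise of the service's start() return value
# (here, the master hostname).
def launch_spark_cluster_example(job, num_workers):
    master = MasterService()            # hypothetical service wrapping the master start() above
    master_ip = job.addService(master)  # promise of MasterService.start()'s return value
    for _ in range(num_workers):
        # Workers are parented to the master so they are torn down before it.
        job.addService(WorkerService(master_ip), parentService=master)
    return master_ip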