Example #1
def assert_bam_is_paired_end(job, bam_path, region='chr6'):
    """
    Confirm that a BAM is paired-end and not single-end. Raises an error if the BAM is not paired-end.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam_path: Path to BAM
    :param str region: Region of the genome to select
    """
    # Check whether a BAM index already exists; otherwise index the BAM
    bam_no_ext = os.path.splitext(bam_path)[0]
    if not os.path.exists(bam_no_ext + '.bai') and not os.path.exists(bam_no_ext + '.bam.bai'):
        index_bam(job, bam_path)

    docker_bam_path = docker_path(bam_path)
    work_dir = os.path.dirname(os.path.abspath(bam_path))

    # Count paired reads for both the "chr"-prefixed and unprefixed forms of the region.
    # Note: str.lstrip strips a set of characters, not a prefix, so slice off 'chr' instead.
    if region.startswith('chr'):
        regions = [region, region[len('chr'):]]
    else:
        regions = [region, 'chr' + region]
    results = []
    for r in regions:
        # chr6 is only the default; any region that contains reads will work.
        # 'samtools view -c -f 1' counts reads whose "read paired" flag is set.
        parameters = ['view', '-c', '-f', '1', docker_bam_path, r]
        out = dockerCheckOutput(job,
                                workDir=work_dir,
                                parameters=parameters,
                                tool=samtools_version)
        results.append(int(out.strip()))
    assert any(results), 'BAM is not paired-end, aborting run.'
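As a usage sketch (not part of the original source), a job function like the one above would typically be scheduled through Toil's wrapping API; the job store path and BAM path below are placeholders.

# Minimal sketch, assuming assert_bam_is_paired_end is importable from the current module.
from toil.common import Toil
from toil.job import Job

if __name__ == '__main__':
    options = Job.Runner.getDefaultOptions('./toil-jobstore')  # hypothetical job store location
    root = Job.wrapJobFn(assert_bam_is_paired_end, '/data/sample.bam', region='chr6')  # hypothetical BAM path
    with Toil(options) as workflow:
        workflow.start(root)  # raises AssertionError if the BAM is not paired-end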
Example #2
def _testSubprocessDockerPipeChainFn(job):
    """
    Return the result of a simple pipe chain; the output should be 2.
    """
    parameters = [['printf', 'x\n y\n'], ['wc', '-l']]
    return dockerCheckOutput(job,
                             tool='quay.io/ucsc_cgl/spooky_test',
                             parameters=parameters)
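For reference, the nested parameter list above is the containerized equivalent of the shell pipeline printf 'x\n y\n' | wc -l; a quick local sanity check (a sketch, run outside Docker) looks like this:

# Minimal sketch: the same pipeline run on the host should also count two lines.
import subprocess
out = subprocess.check_output("printf 'x\\n y\\n' | wc -l", shell=True)
assert int(out.strip()) == 2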
Example #3
def _testDockerPipeChainErrorFn(job):
    """
    Return True if the command exit 1 | wc -l raises a CalledProcessError when run through 
    the docker interface
    """
    parameters = [ ['exit', '1'], ['wc', '-l'] ]
    try:
        return dockerCheckOutput(job, tool='quay.io/ucsc_cgl/spooky_test', parameters=parameters)
    except CalledProcessError:
        return True
    return False
Example #4
def _testSubprocessDockerPipeChainErrorFn(job):
    """
    Return True if the command exit 1 | wc -l raises a CalledProcessError when run through 
    the docker interface
    """
    parameters = [ ['exit', '1'], ['wc', '-l'] ]
    try:
        return dockerCheckOutput(job, tool='quay.io/ucsc_cgl/spooky_test', parameters=parameters)
    except subprocess.CalledProcessError:
        return True
    return False
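The two error-handling tests above are identical except for how CalledProcessError is referenced (a bare import versus the subprocess module). A test harness might launch one of them roughly as follows; the job store path is a placeholder, and Job.Runner.startToil returns the root job's return value.

# Minimal sketch, assuming _testDockerPipeChainErrorFn is defined in the current module.
from toil.job import Job

def run_pipe_chain_error_test():
    options = Job.Runner.getDefaultOptions('./toil-jobstore')  # hypothetical job store location
    options.clean = 'always'
    root = Job.wrapJobFn(_testDockerPipeChainErrorFn)
    result = Job.Runner.startToil(root, options)
    assert result is True, "exit 1 | wc -l should raise CalledProcessError inside the container"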
Example #5
    def start(self, job):
        """
        Start spark and hdfs master containers

        :param job: The underlying job.
        """

        if self.hostname is None:
            self.hostname = subprocess.check_output([
                "hostname",
                "-f",
            ])[:-1]

        _log.info("Started Spark master container.")
        self.sparkContainerID = dockerCheckOutput(
            job=job,
            defer=STOP,
            workDir=os.getcwd(),
            tool=
            "quay.io/ucsc_cgl/apache-spark-master:2.1.1--acd08bd3e5670502636fb6842b777eb2aff6def7",
            dockerParameters=[
                "--net=host", "-d", "-v", "/mnt/ephemeral/:/ephemeral/:rw",
                "-e", "SPARK_MASTER_IP=" + self.hostname, "-e",
                "SPARK_LOCAL_DIRS=/ephemeral/spark/local", "-e",
                "SPARK_WORKER_DIR=/ephemeral/spark/work"
            ],
            parameters=[self.hostname])[:-1]
        _log.info("Started HDFS Datanode.")
        self.hdfsContainerID = dockerCheckOutput(
            job=job,
            defer=STOP,
            workDir=os.getcwd(),
            tool=
            "quay.io/ucsc_cgl/apache-hadoop-master:2.7.4--9a9cdee76b65a50e63ad2ca8f9db6da2bd11965c",
            dockerParameters=["--net=host", "-d"],
            parameters=[self.hostname])[:-1]

        return self.hostname
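In Toil, an object with a start method like the one above is normally a Job.Service subclass, attached with job.addService; the value returned by start (the master hostname here) comes back as a promise that downstream jobs can consume. The class name MasterService below is assumed purely for illustration.

# Minimal sketch, assuming the service above is exposed as a class named MasterService (hypothetical).
def launch_cluster(job):
    master = MasterService()                   # hypothetical constructor; the real one may take arguments
    master_hostname = job.addService(master)   # promise for the value returned by start()
    job.addChildJobFn(use_master, master_hostname)

def use_master(job, master_hostname):
    # The promise has been resolved to the concrete hostname by the time this child runs.
    return master_hostname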
Example #6
    def __start_datanode(self, job):
        """
        Launches the Hadoop datanode.

        :param job: The underlying job.
        """
        self.hdfsContainerID = dockerCheckOutput(
            job=job,
            defer=STOP,
            workDir=os.getcwd(),
            tool="quay.io/ucsc_cgl/apache-hadoop-worker:2.7.4--9a9cdee76b65a50e63ad2ca8f9db6da2bd11965c",
            dockerParameters=[
                "--net=host", "-d", "-v", "/mnt/ephemeral/:/ephemeral/:rw"
            ],
            parameters=[self.masterIP])[:-1]
Example #7
def _testDockerPipeChainFn(job):
    """
    Return the result of a simple pipe chain; the output should be 2.
    """
    parameters = [['printf', 'x\n y\n'], ['wc', '-l']]
    return dockerCheckOutput(job, tool='quay.io/ucsc_cgl/spooky_test', parameters=parameters)
Example #8
    def start(self, job):
        """
        Start the Spark and HDFS worker containers.

        :param job: The underlying job.
        """

        # start spark and our datanode
        self.sparkContainerID = dockerCheckOutput(
            job=job,
            defer=STOP,
            workDir=os.getcwd(),
            tool="quay.io/ucsc_cgl/apache-spark-worker:2.1.1--acd08bd3e5670502636fb6842b777eb2aff6def7",
            dockerParameters=[
                "--net=host", "-d", "-v", "/mnt/ephemeral/:/ephemeral/:rw",
                "-e", "\"SPARK_MASTER_IP=" + self.masterIP + ":" + _SPARK_MASTER_PORT + "\"",
                "-e", "SPARK_LOCAL_DIRS=/ephemeral/spark/local",
                "-e", "SPARK_WORKER_DIR=/ephemeral/spark/work"
            ],
            parameters=[self.masterIP + ":" + _SPARK_MASTER_PORT])[:-1]
        self.__start_datanode(job)

        # fake do/while to check if HDFS is up
        hdfs_down = True
        retries = 0
        while hdfs_down and (retries < 5):

            _log.info("Sleeping 30 seconds before checking HDFS startup.")
            time.sleep(30)
            clusterID = ""
            try:
                clusterID = subprocess.check_output([
                    "docker", "exec", self.hdfsContainerID, "grep",
                    "clusterID", "-R", "/opt/apache-hadoop/logs"
                ])
            except subprocess.CalledProcessError:
                # grep returns a non-zero exit code if the pattern is not found
                # we expect to not find the pattern, so a non-zero code is OK
                pass

            if "Incompatible" in clusterID:
                _log.warning("Hadoop Datanode failed to start with: %s",
                             clusterID)
                _log.warning("Retrying container startup, retry #%d.", retries)
                retries += 1

                _log.warning("Removing ephemeral hdfs directory.")
                subprocess.check_call([
                    "docker", "exec", self.hdfsContainerID, "rm", "-rf",
                    "/ephemeral/hdfs"
                ])

                _log.warning("Killing container %s.", self.hdfsContainerID)
                subprocess.check_call(["docker", "kill", self.hdfsContainerID])

                # todo: this is copied code. clean up!
                _log.info("Restarting datanode.")
                self.__start_datanode(job)

            else:
                _log.info("HDFS datanode started up OK!")
                hdfs_down = False

        if retries >= 5:
            raise RuntimeError(
                "Failed %d times trying to start HDFS datanode." % retries)

        return
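One portability caveat when reusing the loop above: on Python 3, subprocess.check_output returns bytes, so the "Incompatible" in clusterID check needs the output decoded to str first. A small helper sketch (the function name is hypothetical):

# Minimal sketch: return the decoded grep output, or '' when the pattern is absent.
import subprocess

def _grep_cluster_id(hdfs_container_id):
    try:
        raw = subprocess.check_output([
            "docker", "exec", hdfs_container_id, "grep",
            "clusterID", "-R", "/opt/apache-hadoop/logs"
        ])
    except subprocess.CalledProcessError:
        # grep exits non-zero when the pattern is not found, which is the expected case.
        return ""
    return raw.decode("utf-8", errors="replace")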