Esempio n. 1
0
 def findMesosBinary(self, name):
     """
     Locate the executable called ``name`` and return the first match.

     ``which(name)`` is consulted first. If it yields nothing, '/usr/local/sbin' is searched
     explicitly: that is where Homebrew installs the binary on OS X, and it is hard to set
     PATH for PyCharm (or any GUI app) there.

     :param name: file name of the binary to look for
     :return: the first item yielded by ``which``
     :raises RuntimeError: if neither lookup yields a match
     """
     not_found = object()
     location = next(which(name), not_found)
     if location is not_found:
         # Fallback for users of GUI apps on OS X whose PATH lacks the Homebrew directory
         location = next(which(name, path=['/usr/local/sbin']), not_found)
     if location is not_found:
         raise RuntimeError("Cannot find the '%s' binary. Make sure Mesos is installed "
                            "and it's 'bin' directory is present on the PATH." % name)
     return location
Esempio n. 2
0
def needs_gridengine(test_item):
    """
    Decorator for test classes or methods that should only run when GridEngine is installed.

    The item is first tagged via ``_mark_test('gridengine', ...)``. If ``which('qsub')``
    yields no match, the tagged item is wrapped in ``unittest.skip``; otherwise it is
    returned as is.
    """
    marked = _mark_test('gridengine', test_item)
    qsub = next(which('qsub'), None)
    if not qsub:
        return unittest.skip("Install GridEngine to include this test.")(marked)
    return marked
Esempio n. 3
0
def needs_parasol(test_item):
    """
    Decorator for tests that should only run when Parasol is installed.

    The item is first tagged via ``_mark_test('parasol', ...)``. If ``which('parasol')``
    yields no match, the tagged item is wrapped in ``unittest.skip``; otherwise it is
    returned as is.
    """
    marked = _mark_test('parasol', test_item)
    parasol = next(which('parasol'), None)
    if not parasol:
        return unittest.skip("Install Parasol to include this test.")(marked)
    return marked
Esempio n. 4
0
def needs_slurm(test_item):
    """
    Decorator for test classes or methods that should only run when Slurm is installed.

    The item is first tagged via ``_mark_test('slurm', ...)``. If ``which('squeue')``
    yields no match, the tagged item is wrapped in ``unittest.skip``; otherwise it is
    returned as is.
    """
    marked = _mark_test('slurm', test_item)
    squeue = next(which('squeue'), None)
    if not squeue:
        return unittest.skip("Install Slurm to include this test.")(marked)
    return marked
Esempio n. 5
0
def main():
    """
    This Toil pipeline aligns reads and performs alternative splicing analysis.

    Please read the README.md located in the same directory for run instructions.
    """
    # NOTE: the docstring above is also the command line description (main.__doc__ is handed
    # to argparse below), so editing it changes the --help output.

    # Define Parser object and add to toil
    url_prefix = 'https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/'
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--config', required=True,
                        help='Path to configuration file for samples, one per line. UUID,URL_to_bamfile. '
                             'The URL may be a standard "http://", a "file://<abs_path>", or "s3://<bucket>/<key>"')
    parser.add_argument('--gtf', help='URL to annotation GTF file',
                        default=url_prefix + 'rnaseq_cgl/gencode.v23.annotation.gtf')
    parser.add_argument('--gtf-pickle', help='Pickled GTF file',
                        default=url_prefix + 'spladder/gencode.v23.annotation.gtf.pickle')
    parser.add_argument('--gtf-m53', help='M53 preprocessing annotation table',
                        default=url_prefix + 'spladder/gencode.v23.annotation.gtf.m53')
    parser.add_argument('--positions', help='URL to SNP positions over genes file (TSV)',
                        default=url_prefix + 'spladder/positions_fixed.tsv')
    parser.add_argument('--genome', help='URL to Genome fasta',
                        default=url_prefix + 'rnaseq_cgl/hg38_no_alt.fa')
    parser.add_argument('--genome-index', help='Index file (fai) of genome',
                        default=url_prefix + 'spladder/hg38_no_alt.fa.fai')
    parser.add_argument('--ssec', default=None, help='Path to master key used for downloading encrypted files.')
    parser.add_argument('--output-s3-dir', default=None, help='S3 Directory of the form: s3://bucket/directory')
    parser.add_argument('--output-dir', default=None, help='full path where final results will be output')
    parser.add_argument('--sudo', action='store_true', default=False,
                        help='Set flag if sudo is required to run Docker.')
    parser.add_argument('--star-index', help='URL to download STAR Index built from HG38/gencodev23 annotation.',
                        default=url_prefix + 'rnaseq_cgl/starIndex_hg38_no_alt.tar.gz')
    parser.add_argument('--fwd-3pr-adapter', help="Sequence for the FWD 3' Read Adapter.", default='AGATCGGAAGAG')
    parser.add_argument('--rev-3pr-adapter', help="Sequence for the REV 3' Read Adapter.", default='AGATCGGAAGAG')
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()

    # Sanity Checks (kept as assertions so the raised exception type is unchanged)
    if args.config:
        assert os.path.isfile(args.config), 'Config not found at: {}'.format(args.config)
    if args.ssec:
        # Report the path of the missing key itself, not the path of the config file
        assert os.path.isfile(args.ssec), 'Encryption key not found at: {}'.format(args.ssec)
    if args.output_s3_dir:
        assert args.output_s3_dir.startswith('s3://'), 'Wrong format for output s3 directory'

    # Program checks
    for program in ['curl', 'docker']:
        # which() hands back an iterator of matches, and an iterator object is truthy even
        # when it is empty. Pull the first match so a missing program is actually detected.
        assert next(which(program), None), 'Program "{}" must be installed on every node.'.format(program)

    Job.Runner.startToil(Job.wrapJobFn(parse_input_samples, args), args)
Esempio n. 6
0
def needs_appliance(test_item):
    """
    Decorator for tests that need Docker and a locally available appliance image.

    The item is tagged via ``_mark_test('appliance', ...)``. It is wrapped in ``unittest.skip``
    when ``which('docker')`` yields nothing, or when ``docker inspect`` reports no image whose
    'RepoTags' contain the name returned by ``applianceSelf()``.
    """
    import json
    test_item = _mark_test('appliance', test_item)

    if not next(which('docker'), None):
        return unittest.skip('Install Docker to include this test.')(test_item)

    image = applianceSelf()
    matching_ids = set()
    try:
        inspect_output = check_output(['docker', 'inspect', image])
    except CalledProcessError:
        # A failing `docker inspect` is treated the same as "no matching image"
        pass
    else:
        for entry in json.loads(inspect_output):
            if image in entry['RepoTags']:
                matching_ids.add(entry['Id'])

    found = len(matching_ids)
    if found == 0:
        return unittest.skip("Cannot find appliance image %s. Be sure to run 'make docker' "
                             "prior to running this test." % image)(test_item)
    if found == 1:
        return test_item
    assert False, 'Expected `docker inspect` to return zero or one image.'
Esempio n. 7
0
    def __init__(self, config, maxCores, maxMemory, maxDisk):
        """
        Initialise the Parasol batch system state and start its background worker thread.

        :param config: configuration object; ``parasolCommand``, ``jobStore`` and
               ``parasolMaxBatches`` are read from it
        :param maxCores: forwarded to the superclass constructor
        :param maxMemory: forwarded to the superclass constructor; a warning is logged if it is
               anything other than ``sys.maxint``
        :param maxDisk: forwarded to the superclass constructor
        :raises RuntimeError: if ``config.parasolCommand`` contains no path separator and
                ``which`` yields no match for it
        """
        super(ParasolBatchSystem, self).__init__(config, maxCores, maxMemory, maxDisk)
        if maxMemory != sys.maxint:
            logger.warning('The Parasol batch system does not support maxMemory.')

        # A bare command name (no path separator) is resolved with which(); anything
        # containing a separator is used verbatim.
        command = config.parasolCommand
        if os.path.sep not in command:
            try:
                command = next(which(command))
            except StopIteration:
                raise RuntimeError("Can't find %s on PATH." % command)
        logger.info('Using Parasol at %s', command)
        self.parasolCommand = command
        # Fresh temporary directory inside the job store; presumably where the Parasol results
        # files live (the original note says the name is kept for the pstat2 command).
        self.parasolResultsDir = tempfile.mkdtemp(dir=config.jobStore)

        # In Parasol, each results file corresponds to a separate batch, and all jobs in a batch
        # have the same cpu and memory requirements. The keys to this dictionary are the (cpu,
        # memory) tuples for each batch. A new batch is created whenever a job has a new unique
        # combination of cpu and memory requirements.
        self.resultsFiles = dict()
        self.maxBatches = config.parasolMaxBatches

        # Allows the worker process to send back the IDs of jobs that have finished, so the batch
        #  system can decrease its used cpus counter
        self.cpuUsageQueue = Queue()

        # Also stores finished job IDs, but is read by getUpdatedJobIDs().
        self.updatedJobsQueue = Queue()

        self.usedCpus = 0
        self.jobIDsToCpu = {}

        # Set of jobs that have been issued but aren't known to have finished or been killed yet.
        #  Jobs that end by themselves are removed in getUpdatedJob, and jobs that are killed are
        #  removed in killBatchJobs.
        self.runningJobs = set()

        # Use this to stop the worker when shutting down
        self.running = True

        # Start the worker only after every attribute has been assigned, so the thread can
        # never observe a partially initialised instance.
        self.worker = Thread(target=self.updatedJobWorker, args=())
        self.worker.start()
Esempio n. 8
0
def needs_appliance(test_item):
    """
    Decorator for tests that need Docker and a locally available appliance image.

    The item is tagged via ``_mark_test('appliance', ...)``. It is wrapped in ``unittest.skip``
    when ``which('docker')`` yields nothing, or when ``docker inspect`` reports no image whose
    'RepoTags' contain the name returned by ``applianceSelf()``.
    """
    import json
    test_item = _mark_test('appliance', test_item)

    docker = next(which('docker'), None)
    if not docker:
        return unittest.skip('Install Docker to include this test.')(test_item)

    image = applianceSelf()
    try:
        raw = subprocess.check_output(['docker', 'inspect', image])
    except subprocess.CalledProcessError:
        # A failing `docker inspect` is treated the same as "no matching image"
        ids = set()
    else:
        ids = set(entry['Id'] for entry in json.loads(raw)
                  if image in entry['RepoTags'])

    if len(ids) == 0:
        return unittest.skip(
            "Cannot find appliance image %s. Be sure to run 'make docker' "
            "prior to running this test." % image)(test_item)
    if len(ids) == 1:
        return test_item
    assert False, 'Expected `docker inspect` to return zero or one image.'
Esempio n. 9
0
def main():
    """
    GATK germline pipeline with variant filtering and annotation.

    Command line entry point. The 'generate-config', 'generate-manifest' and 'generate'
    sub-commands write editable templates into the current working directory. The 'run'
    sub-command validates the config and the samples and then starts the Toil workflow.
    """
    # Define Parser object and add to jobTree
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawTextHelpFormatter)

    # Generate subparsers
    subparsers = parser.add_subparsers(dest='command')
    subparsers.add_parser(
        'generate-config',
        help='Generates an editable config in the current working directory.')
    subparsers.add_parser(
        'generate-manifest',
        help='Generates an editable manifest in the current working directory.'
    )
    subparsers.add_parser(
        'generate',
        help='Generates a config and manifest in the current working directory.'
    )

    # Run subparser
    parser_run = subparsers.add_parser('run',
                                       help='Runs the GATK germline pipeline')
    parser_run.add_argument(
        '--config',
        required=True,
        type=str,
        help='Path to the (filled in) config file, generated with '
        '"generate-config".')
    parser_run.add_argument(
        '--manifest',
        type=str,
        help='Path to the (filled in) manifest file, generated with '
        '"generate-manifest".\nDefault value: "%(default)s".')
    parser_run.add_argument(
        '--sample',
        default=None,
        nargs=2,
        type=str,
        help='Input sample identifier and BAM file URL or local path')
    parser_run.add_argument('--output-dir',
                            default=None,
                            help='Path/URL to output directory')
    parser_run.add_argument(
        '-s',
        '--suffix',
        default=None,
        help='Additional suffix to add to the names of the output files')
    parser_run.add_argument('--preprocess-only',
                            action='store_true',
                            help='Only runs preprocessing steps')

    Job.Runner.addToilOptions(parser_run)
    options = parser.parse_args()

    cwd = os.getcwd()
    # 'generate' matches both of the following conditions and therefore writes both files
    if options.command == 'generate-config' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-germline.yaml'),
                      generate_config)
    if options.command == 'generate-manifest' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-germline.tsv'),
                      generate_manifest)
    elif options.command == 'run':
        # Program checks
        for program in ['curl', 'docker']:
            # Supply a default to next() so that a missing program reaches require() with a
            # readable message instead of escaping as a bare StopIteration.
            require(
                next(which(program), None),
                '{} must be installed on every node.'.format(program))

        require(
            os.path.exists(options.config),
            '{} not found. Please run "generate-config"'.format(
                options.config))

        # Read sample manifest
        samples = []
        if options.manifest:
            samples.extend(parse_manifest(options.manifest))

        # Add BAM sample from command line
        if options.sample:
            uuid, url = options.sample
            # samples tuple: (uuid, url, paired_url, rg_line)
            # BAM samples should not have as paired URL or read group line
            samples.append(GermlineSample(uuid, url, None, None))

        require(
            len(samples) > 0,
            'No samples were detected in the manifest or on the command line')

        # Parse inputs; the context manager makes sure the config file is closed again
        with open(options.config) as config_file:
            # NOTE(review): yaml.load can construct arbitrary Python objects. Consider
            # yaml.safe_load if the config file may come from an untrusted source.
            raw_config = yaml.load(config_file.read())
        inputs = {x.replace('-', '_'): y for x, y in raw_config.items()}

        required_fields = {
            'genome_fasta', 'output_dir', 'run_bwa', 'sorted',
            'snp_filter_annotations', 'indel_filter_annotations', 'preprocess',
            'preprocess_only', 'run_vqsr', 'joint_genotype', 'run_oncotator',
            'cores', 'file_size', 'xmx', 'suffix'
        }

        input_fields = set(inputs.keys())
        # '>=' (superset) rather than '>' (strict superset): a config that holds exactly the
        # required keys and nothing else is valid.
        require(
            input_fields >= required_fields,
            'Missing config parameters:\n{}'.format(
                ', '.join(sorted(required_fields - input_fields))))

        # Values left empty in the config fall back to the command line options
        if inputs['output_dir'] is None:
            inputs['output_dir'] = options.output_dir

        require(inputs['output_dir'] is not None,
                'Missing output directory PATH/URL')

        if inputs['suffix'] is None:
            inputs['suffix'] = options.suffix if options.suffix else ''

        if inputs['preprocess_only'] is None:
            inputs['preprocess_only'] = options.preprocess_only

        if inputs['run_vqsr']:
            # Check that essential VQSR parameters are present
            vqsr_fields = {'g1k_snp', 'mills', 'dbsnp', 'hapmap', 'omni'}
            require(
                input_fields >= vqsr_fields,
                'Missing parameters for VQSR:\n{}'.format(
                    ', '.join(sorted(vqsr_fields - input_fields))))

        # Check that hard filtering parameters are present. If only running preprocessing steps, then we do
        # not need filtering information.
        elif not inputs['preprocess_only']:
            hard_filter_fields = {
                'snp_filter_name', 'snp_filter_expression',
                'indel_filter_name', 'indel_filter_expression'
            }
            require(
                input_fields >= hard_filter_fields,
                'Missing parameters for hard filtering:\n{}'.format(
                    ', '.join(sorted(hard_filter_fields - input_fields))))

            # Check for falsey hard filtering parameters
            for hard_filter_field in hard_filter_fields:
                require(
                    inputs[hard_filter_field],
                    'Missing %s value for hard filtering, '
                    'got %s.' % (hard_filter_field, inputs[hard_filter_field]))

        # Set resource parameters
        inputs['xmx'] = human2bytes(inputs['xmx'])
        inputs['file_size'] = human2bytes(inputs['file_size'])
        inputs['cores'] = int(inputs['cores'])

        inputs['annotations'] = set(inputs['snp_filter_annotations'] +
                                    inputs['indel_filter_annotations'])

        # HaplotypeCaller test data for testing
        inputs['hc_output'] = inputs.get('hc_output', None)

        # It is a toil-scripts convention to store input parameters in a Namespace object
        config = argparse.Namespace(**inputs)

        root = Job.wrapJobFn(run_gatk_germline_pipeline, samples, config)
        Job.Runner.startToil(root, options)
Esempio n. 10
0
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    MarginPhase pipeline

    =======================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """
    # NOTE: the docstring above is also the command line description (main.__doc__ is handed
    # to argparse below), so editing it changes the --help output.

    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')

    # Generate subparsers
    subparsers.add_parser(
        'generate-config',
        help='Generates an editable config in the current working directory.')
    subparsers.add_parser(
        'generate-manifest',
        help='Generates an editable manifest in the current working directory.'
    )
    subparsers.add_parser(
        'generate',
        help='Generates a config and manifest in the current working directory.'
    )

    # Run subparser
    parser_run = subparsers.add_parser('run',
                                       help='Runs the MarginPhase pipeline')
    group = parser_run.add_mutually_exclusive_group()
    parser_run.add_argument(
        '--config',
        default=DEFAULT_CONFIG_NAME,
        type=str,
        help=
        'Path to the (filled in) config file, generated with "generate-config". '
        '\nDefault value: "%(default)s"')
    group.add_argument(
        '--manifest',
        default=DEFAULT_MANIFEST_NAME,
        type=str,
        help=
        'Path to the (filled in) manifest file, generated with "generate-manifest". '
        '\nDefault value: "%(default)s"')

    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()

    # Parse subparsers related to generation of config and manifest.
    # 'generate' matches both of the following conditions and therefore writes both files.
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_CONFIG_NAME), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_MANIFEST_NAME),
                      generate_manifest)

    # Pipeline execution
    elif args.command == 'run':
        # sanity check
        require(
            os.path.exists(args.config), '{} not found. Please run '
            '"toil-marginphase generate-config"'.format(args.config))
        require(
            os.path.exists(args.manifest),
            '{} not found and no samples provided. Please '
            'run "toil-marginphase generate-manifest"'.format(args.manifest))

        # Parse config; the context manager makes sure the config file is closed again
        with open(args.config) as config_file:
            # NOTE(review): yaml.load can construct arbitrary Python objects. Consider
            # yaml.safe_load if the config file may come from an untrusted source.
            raw_config = yaml.load(config_file.read())
        parsed_config = {
            x.replace('-', '_'): y
            for x, y in raw_config.items()
        }
        config = argparse.Namespace(**parsed_config)

        # sys.maxsize is used throughout as the "no limit" value
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxsize
        config.defaultCores = int(min(MP_CPU, config.maxCores))
        config.maxDisk = int(args.maxDisk) if args.maxDisk else sys.maxsize
        config.maxMemory = sys.maxsize
        # fix parsing of GB to int
        if args.maxMemory:
            args.maxMemory = args.maxMemory.upper()
            # Drop an optional trailing 'B' so that e.g. '4GB' and '4G' are treated alike
            if args.maxMemory.endswith('B'):
                args.maxMemory = args.maxMemory.rstrip('B')
            # actual parsing
            if args.maxMemory.endswith('G'):
                config.maxMemory = int(
                    args.maxMemory.rstrip('G')) * 1024 * 1024 * 1024
            elif args.maxMemory.endswith('M'):
                config.maxMemory = int(
                    args.maxMemory.rstrip('M')) * 1024 * 1024
            elif args.maxMemory.endswith('K'):
                config.maxMemory = int(args.maxMemory.rstrip('K')) * 1024
            else:
                config.maxMemory = int(args.maxMemory)

        # Config sanity checks. getattr() is used so that a key that is absent from the config
        # produces the require() message below rather than an AttributeError.
        require(getattr(config, 'output_dir', None),
                'No output location specified')
        if urlparse(config.output_dir).scheme != "s3":
            config.output_dir = config.output_dir.replace("file://", "", 1)
            mkdir_p(config.output_dir)
        if not config.output_dir.endswith('/'):
            config.output_dir += '/'
        require(getattr(config, 'partition_size', None),
                "Configuration parameter partition-size is required")
        require(getattr(config, 'partition_margin', None),
                "Configuration parameter partition-margin is required")

        if not getattr(config, 'save_intermediate_files', False):
            config.intermediate_file_location = None
        elif urlparse(config.output_dir).scheme == "s3":
            raise UserError(
                "Config parameter 'save_intermediate_files' cannot be used with s3 output directory"
            )
        else:
            intermediate_location = os.path.join(
                config.output_dir, "intermediate",
                datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
            mkdir_p(intermediate_location)
            config.intermediate_file_location = intermediate_location

        # Docker image settings fall back to their defaults when absent or empty. A null
        # value in the config is treated like an empty one.
        docker_defaults = (
            ('margin_phase_image', DOCKER_MARGIN_PHASE_IMG_DEFAULT),
            ('margin_phase_tag', DOCKER_MARGIN_PHASE_TAG_DEFAULT),
            ('cpecan_image', DOCKER_CPECAN_IMG_DEFAULT),
            ('cpecan_tag', DOCKER_CPECAN_TAG_DEFAULT),
        )
        for key, default in docker_defaults:
            if not getattr(config, key, None):
                setattr(config, key, default)

        # Optional flags default to False when the config does not mention them
        for flag in ('unittest', 'minimal_output', 'minimal_cpecan_output',
                     'cpecan_probabilities'):
            if flag not in config:
                setattr(config, flag, False)

        # get samples
        samples = parse_samples(config, args.manifest)

        # Program checks
        for program in ['docker']:
            require(
                next(which(program), None),
                '{} must be installed on every node.'.format(program))

        # Start the workflow
        Job.Runner.startToil(
            Job.wrapJobFn(map_job, prepare_input, samples, config), args)
Esempio n. 11
0
def main():
    """
    This Toil pipeline aligns reads and performs alternative splicing analysis.

    Please read the README.md located in the same directory for run instructions.
    """
    # NOTE: the docstring above is also the command line description (main.__doc__ is handed
    # to argparse below), so editing it changes the --help output.

    # Define Parser object and add to toil
    url_prefix = 'https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/'
    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        '--config',
        required=True,
        help=
        'Path to configuration file for samples, one per line. UUID,URL_to_bamfile. '
        'The URL may be a standard "http://", a "file://<abs_path>", or "s3://<bucket>/<key>"'
    )
    parser.add_argument('--gtf',
                        help='URL to annotation GTF file',
                        default=url_prefix +
                        'rnaseq_cgl/gencode.v23.annotation.gtf')
    parser.add_argument('--gtf-pickle',
                        help='Pickled GTF file',
                        default=url_prefix +
                        'spladder/gencode.v23.annotation.gtf.pickle')
    parser.add_argument('--gtf-m53',
                        help='M53 preprocessing annotation table',
                        default=url_prefix +
                        'spladder/gencode.v23.annotation.gtf.m53')
    parser.add_argument('--positions',
                        help='URL to SNP positions over genes file (TSV)',
                        default=url_prefix + 'spladder/positions_fixed.tsv')
    parser.add_argument('--genome',
                        help='URL to Genome fasta',
                        default=url_prefix + 'rnaseq_cgl/hg38_no_alt.fa')
    parser.add_argument('--genome-index',
                        help='Index file (fai) of genome',
                        default=url_prefix + 'spladder/hg38_no_alt.fa.fai')
    parser.add_argument(
        '--ssec',
        default=None,
        help='Path to master key used for downloading encrypted files.')
    parser.add_argument('--output-s3-dir',
                        default=None,
                        help='S3 Directory of the form: s3://bucket/directory')
    parser.add_argument('--output-dir',
                        default=None,
                        help='full path where final results will be output')
    parser.add_argument('--sudo',
                        action='store_true',
                        default=False,
                        help='Set flag if sudo is required to run Docker.')
    parser.add_argument(
        '--star-index',
        help=
        'URL to download STAR Index built from HG38/gencodev23 annotation.',
        default=url_prefix + 'rnaseq_cgl/starIndex_hg38_no_alt.tar.gz')
    parser.add_argument('--fwd-3pr-adapter',
                        help="Sequence for the FWD 3' Read Adapter.",
                        default='AGATCGGAAGAG')
    parser.add_argument('--rev-3pr-adapter',
                        help="Sequence for the REV 3' Read Adapter.",
                        default='AGATCGGAAGAG')
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()

    # Sanity Checks (kept as assertions so the raised exception type is unchanged)
    if args.config:
        assert os.path.isfile(args.config), 'Config not found at: {}'.format(
            args.config)
    if args.ssec:
        # Report the path of the missing key itself, not the path of the config file
        assert os.path.isfile(
            args.ssec), 'Encryption key not found at: {}'.format(args.ssec)
    if args.output_s3_dir:
        assert args.output_s3_dir.startswith(
            's3://'), 'Wrong format for output s3 directory'

    # Program checks
    for program in ['curl', 'docker']:
        # which() hands back an iterator of matches, and an iterator object is truthy even
        # when it is empty. Pull the first match so a missing program is actually detected.
        assert next(
            which(program),
            None), 'Program "{}" must be installed on every node.'.format(
                program)

    Job.Runner.startToil(Job.wrapJobFn(parse_input_samples, args), args)
Esempio n. 12
0
def main():
    """
    GATK germline pipeline with variant filtering and annotation.

    Command line entry point. The 'generate-config', 'generate-manifest' and 'generate'
    sub-commands write editable templates into the current working directory. The 'run'
    sub-command validates the config and the samples and then starts the Toil workflow.
    """
    # Define Parser object and add to jobTree
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter)

    # Generate subparsers
    subparsers = parser.add_subparsers(dest='command')
    subparsers.add_parser('generate-config',
                          help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest',
                          help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate',
                          help='Generates a config and manifest in the current working directory.')

    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the GATK germline pipeline')
    parser_run.add_argument('--config',
                            required=True,
                            type=str,
                            help='Path to the (filled in) config file, generated with '
                                 '"generate-config".')
    parser_run.add_argument('--manifest',
                            type=str,
                            help='Path to the (filled in) manifest file, generated with '
                                 '"generate-manifest".\nDefault value: "%(default)s".')
    parser_run.add_argument('--sample',
                            default=None,
                            nargs=2,
                            type=str,
                            help='Input sample identifier and BAM file URL or local path')
    parser_run.add_argument('--output-dir',
                            default=None,
                            help='Path/URL to output directory')
    parser_run.add_argument('-s', '--suffix',
                            default=None,
                            help='Additional suffix to add to the names of the output files')
    parser_run.add_argument('--preprocess-only',
                            action='store_true',
                            help='Only runs preprocessing steps')

    Job.Runner.addToilOptions(parser_run)
    options = parser.parse_args()

    cwd = os.getcwd()
    # 'generate' matches both of the following conditions and therefore writes both files
    if options.command == 'generate-config' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-germline.yaml'), generate_config)
    if options.command == 'generate-manifest' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-germline.tsv'), generate_manifest)
    elif options.command == 'run':
        # Program checks
        for program in ['curl', 'docker']:
            # Supply a default to next() so that a missing program reaches require() with a
            # readable message instead of escaping as a bare StopIteration.
            require(next(which(program), None),
                    '{} must be installed on every node.'.format(program))

        require(os.path.exists(options.config), '{} not found. Please run "generate-config"'.format(options.config))

        # Read sample manifest
        samples = []
        if options.manifest:
            samples.extend(parse_manifest(options.manifest))

        # Add BAM sample from command line
        if options.sample:
            uuid, url = options.sample
            # samples tuple: (uuid, url, paired_url, rg_line)
            # BAM samples should not have as paired URL or read group line
            samples.append(GermlineSample(uuid, url, None, None))

        require(len(samples) > 0,
                'No samples were detected in the manifest or on the command line')

        # Parse inputs; the context manager makes sure the config file is closed again
        with open(options.config) as config_file:
            # NOTE(review): yaml.load can construct arbitrary Python objects. Consider
            # yaml.safe_load if the config file may come from an untrusted source.
            raw_config = yaml.load(config_file.read())
        inputs = {x.replace('-', '_'): y for x, y in raw_config.items()}

        required_fields = {'genome_fasta',
                           'output_dir',
                           'run_bwa',
                           'sorted',
                           'snp_filter_annotations',
                           'indel_filter_annotations',
                           'preprocess',
                           'preprocess_only',
                           'run_vqsr',
                           'joint_genotype',
                           'run_oncotator',
                           'cores',
                           'file_size',
                           'xmx',
                           'suffix'}

        input_fields = set(inputs.keys())
        # '>=' (superset) rather than '>' (strict superset): a config that holds exactly the
        # required keys and nothing else is valid.
        require(input_fields >= required_fields,
                'Missing config parameters:\n{}'.format(', '.join(sorted(required_fields - input_fields))))

        # Values left empty in the config fall back to the command line options
        if inputs['output_dir'] is None:
            inputs['output_dir'] = options.output_dir

        require(inputs['output_dir'] is not None,
                'Missing output directory PATH/URL')

        if inputs['suffix'] is None:
            inputs['suffix'] = options.suffix if options.suffix else ''

        if inputs['preprocess_only'] is None:
            inputs['preprocess_only'] = options.preprocess_only

        if inputs['run_vqsr']:
            # Check that essential VQSR parameters are present
            vqsr_fields = {'g1k_snp', 'mills', 'dbsnp', 'hapmap', 'omni'}
            require(input_fields >= vqsr_fields,
                    'Missing parameters for VQSR:\n{}'.format(', '.join(sorted(vqsr_fields - input_fields))))

        # Check that hard filtering parameters are present. If only running preprocessing steps, then we do
        # not need filtering information.
        elif not inputs['preprocess_only']:
            hard_filter_fields = {'snp_filter_name', 'snp_filter_expression',
                                  'indel_filter_name', 'indel_filter_expression'}
            require(input_fields >= hard_filter_fields,
                    'Missing parameters for hard filtering:\n{}'.format(
                        ', '.join(sorted(hard_filter_fields - input_fields))))

            # Check for falsey hard filtering parameters
            for hard_filter_field in hard_filter_fields:
                require(inputs[hard_filter_field], 'Missing %s value for hard filtering, '
                                                   'got %s.' % (hard_filter_field, inputs[hard_filter_field]))

        # Set resource parameters
        inputs['xmx'] = human2bytes(inputs['xmx'])
        inputs['file_size'] = human2bytes(inputs['file_size'])
        inputs['cores'] = int(inputs['cores'])

        inputs['annotations'] = set(inputs['snp_filter_annotations'] + inputs['indel_filter_annotations'])

        # HaplotypeCaller test data for testing
        inputs['hc_output'] = inputs.get('hc_output', None)

        # It is a toil-scripts convention to store input parameters in a Namespace object
        config = argparse.Namespace(**inputs)

        root = Job.wrapJobFn(run_gatk_germline_pipeline, samples, config)
        Job.Runner.startToil(root, options)
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil exome pipeline

    Perform variant / indel analysis given a pair of tumor/normal BAM files.
    Samples are optionally preprocessed (indel realignment and base quality score recalibration)
    The output of this pipeline is a tarball containing results from MuTect, MuSe, and Pindel.

    General usage:
    1. Type "toil-exome generate" to create an editable manifest and config in the current working directory.
    2. Parameterize the pipeline by editing the config.
    3. Fill in the manifest with information pertaining to your samples.
    4. Type "toil-exome run [jobStore]" to execute the pipeline.

    Please read the README.md located in the source directory or at:
    https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/exome_variant_pipeline

    Structure of variant pipeline (per sample)

           1 2 3 4          14 -------
           | | | |          |        |
        0 --------- 5 ----- 15 -------- 17
                    |       |        |
                   ---      16 -------
                   | |
                   6 7
                   | |
                   8 9
                   | |
                  10 11
                   | |
                  12 13

    0 = Start node
    1 = reference index
    2 = reference dict
    3 = normal bam index
    4 = tumor bam index
    5 = pre-processing node / DAG declaration
    6,7 = RealignerTargetCreator
    8,9 = IndelRealigner
    10,11 = BaseRecalibration
    12,13 = PrintReads
    14 = MuTect
    15 = Pindel
    16 = MuSe
    17 = Consolidate Output and move/upload results
    ==================================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """
    # The docstring above is passed verbatim as the CLI description (RawTextHelpFormatter keeps its
    # layout), so it is written as help text rather than as a conventional API docstring.
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the CGL exome pipeline')
    parser_run.add_argument('--config', default='config-toil-exome.yaml', type=str,
                            help='Path to the (filled in) config file, generated with "generate-config". '
                                 '\nDefault value: "%(default)s"')
    parser_run.add_argument('--manifest', default='manifest-toil-exome.tsv', type=str,
                            help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                                 '\nDefault value: "%(default)s"')
    parser_run.add_argument('--normal', default=None, type=str,
                            help='URL for the normal BAM. URLs can take the form: http://, ftp://, file://, s3://, '
                                 'and gnos://. The UUID for the sample must be given with the "--uuid" flag.')
    parser_run.add_argument('--tumor', default=None, type=str,
                            help='URL for the tumor BAM. URLs can take the form: http://, ftp://, file://, s3://, '
                                 'and gnos://. The UUID for the sample must be given with the "--uuid" flag.')
    # A space separates the two literals so the help does not render as 'the"--tumor"'.
    parser_run.add_argument('--uuid', default=None, type=str, help='Provide the UUID of a sample when using the '
                                                                   '"--tumor" and "--normal" option')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # 'generate' produces both files, which is why the two checks below are independent `if`s
    # and only the second one chains into the `run` branch.
    cwd = os.getcwd()
    if args.command in ('generate-config', 'generate'):
        generate_file(os.path.join(cwd, 'config-toil-exome.yaml'), generate_config)
    if args.command in ('generate-manifest', 'generate'):
        generate_file(os.path.join(cwd, 'manifest-toil-exome.tsv'), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        # The hint names this tool (toil-exome), matching the usage text in the description.
        require(os.path.exists(args.config), '{} not found. Please run '
                                             '"toil-exome generate-config"'.format(args.config))
        # A single sample may be given on the command line; otherwise samples come from the manifest.
        if args.normal or args.tumor or args.uuid:
            require(args.normal and args.tumor and args.uuid, '"--tumor", "--normal" and "--uuid" must all be supplied')
            samples = [[args.uuid, args.normal, args.tumor]]
        else:
            samples = parse_manifest(args.manifest)
        # Parse config; the context manager guarantees the file handle is closed.
        with open(args.config) as config_file:
            # NOTE(review): yaml.load can construct arbitrary Python objects. Consider
            # yaml.safe_load if the config file may come from an untrusted source.
            raw_config = yaml.load(config_file.read())
        # Config keys use dashes; convert them to underscores so they are valid attribute names.
        parsed_config = {key.replace('-', '_'): value for key, value in raw_config.iteritems()}
        config = argparse.Namespace(**parsed_config)
        # Falls back to sys.maxint (Python 2) when args.maxCores is unset or falsey.
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        # Exome pipeline sanity checks
        if config.preprocessing:
            require(config.reference and config.phase and config.mills and config.dbsnp,
                    'Missing inputs for preprocessing, check config file.')
        if config.run_mutect:
            require(config.reference and config.dbsnp and config.cosmic,
                    'Missing inputs for MuTect, check config file.')
        if config.run_pindel:
            require(config.reference, 'Missing input (reference) for Pindel.')
        if config.run_muse:
            require(config.reference and config.dbsnp,
                    'Missing inputs for MuSe, check config file.')
        require(config.output_dir, 'No output location specified: {}'.format(config.output_dir))
        # Program checks: next(..., None) yields None when which() finds no match.
        for program in ['curl', 'docker']:
            require(next(which(program), None), '{} must be installed on every node.'.format(program))

        # Launch Pipeline
        Job.Runner.startToil(Job.wrapJobFn(download_shared_files, samples, config), args)
Esempio n. 14
0
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil RNA-seq pipeline

    RNA-seq fastqs are combined, aligned, and quantified with 2 different methods (RSEM and Kallisto)

    General usage:
    1. Type "toil-rnaseq generate" to create an editable manifest and config in the current working directory.
    2. Parameterize the pipeline by editing the config.
    3. Fill in the manifest with information pertaining to your samples.
    4. Type "toil-rnaseq run [jobStore]" to execute the pipeline.

    Please read the README.md located in the source directory or at:
    https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/rnaseq_cgl

    Structure of RNA-Seq Pipeline (per sample)

                  8
                  |
                  3 -- 4 -- 5
                 /          |
      0 -- 1 -- 2 ---- 7 -- 9
                |           |
                6 -----------

    0 = Download sample
    1 = Unpack/Merge fastqs
    2 = CutAdapt (adapter trimming)
    3 = STAR Alignment
    4 = RSEM Quantification
    5 = RSEM Post-processing
    6 = FastQC
    7 = Kallisto
    8 = BamQC (as specified by CKCC at UC Santa Cruz)
    9 = Consolidate output and upload to S3
    =======================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL, needed for upload to S3)
    """
    # The docstring above is passed verbatim as the CLI description (RawTextHelpFormatter keeps
    # its layout), so it is written as help text rather than as a conventional API docstring.
    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser(
        'generate-config',
        help='Generates an editable config in the current working directory.')
    subparsers.add_parser(
        'generate-manifest',
        help='Generates an editable manifest in the current working directory.'
    )
    subparsers.add_parser(
        'generate',
        help='Generates a config and manifest in the current working directory.'
    )
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the RNA-seq pipeline')
    # --manifest and --samples are alternative sample sources, so they share an exclusive group.
    group = parser_run.add_mutually_exclusive_group()
    parser_run.add_argument(
        '--config',
        default='config-toil-rnaseq.yaml',
        type=str,
        help=
        'Path to the (filled in) config file, generated with "generate-config". '
        '\nDefault value: "%(default)s"')
    group.add_argument(
        '--manifest',
        default='manifest-toil-rnaseq.tsv',
        type=str,
        help=
        'Path to the (filled in) manifest file, generated with "generate-manifest". '
        '\nDefault value: "%(default)s"')
    # A space follows "from the file." so the concatenated help does not read "file.Samples".
    group.add_argument(
        '--samples',
        default=None,
        nargs='+',
        type=str,
        help=
        'Space delimited sample URLs (any number). Samples must be tarfiles/tarballs that contain '
        'fastq files. URLs follow the format: http://foo.com/sample.tar, '
        'file:///full/path/to/file.tar. The UUID for the sample will be derived from the file. '
        'Samples passed in this way will be assumed to be paired end, if using single-end data, '
        'please use the manifest option.')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # 'generate' produces both files, which is why the two checks below are independent `if`s
    # and only the second one chains into the `run` branch.
    cwd = os.getcwd()
    if args.command in ('generate-config', 'generate'):
        generate_file(os.path.join(cwd, 'config-toil-rnaseq.yaml'),
                      generate_config)
    if args.command in ('generate-manifest', 'generate'):
        generate_file(os.path.join(cwd, 'manifest-toil-rnaseq.tsv'),
                      generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        require(
            os.path.exists(args.config), '{} not found. Please run '
            '"toil-rnaseq generate-config"'.format(args.config))
        # Sample URLs on the command line take precedence; otherwise the manifest must exist.
        if not args.samples:
            require(
                os.path.exists(args.manifest),
                '{} not found and no samples provided. Please '
                'run "toil-rnaseq generate-manifest"'.format(args.manifest))
            samples = parse_samples(path_to_manifest=args.manifest)
        else:
            samples = parse_samples(sample_urls=args.samples)
        # Parse config; the context manager guarantees the file handle is closed.
        with open(args.config) as config_file:
            # NOTE(review): yaml.load can construct arbitrary Python objects. Consider
            # yaml.safe_load if the config file may come from an untrusted source.
            raw_config = yaml.load(config_file.read())
        # Config keys use dashes; convert them to underscores so they are valid attribute names.
        parsed_config = {
            key.replace('-', '_'): value
            for key, value in raw_config.iteritems()
        }
        config = argparse.Namespace(**parsed_config)
        # Falls back to sys.maxint (Python 2) when args.maxCores is unset or falsey.
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        # Config sanity checks
        require(
            config.kallisto_index or config.star_index,
            'URLs not provided for Kallisto or STAR, so there is nothing to do!'
        )
        # STAR and RSEM inputs are only accepted as a pair.
        if config.star_index or config.rsem_ref:
            require(
                config.star_index and config.rsem_ref,
                'Input provided for STAR or RSEM but not both. STAR: '
                '{}, RSEM: {}'.format(config.star_index, config.rsem_ref))
        require(config.output_dir,
                'No output location specified: {}'.format(config.output_dir))
        # Only the inputs that were actually provided are checked for a recognised URL scheme.
        provided_urls = [
            url for url in
            (config.kallisto_index, config.star_index, config.rsem_ref) if url
        ]
        for url in provided_urls:
            require(
                urlparse(url).scheme in schemes,
                'Input in config must have the appropriate URL prefix: {}'.
                format(schemes))
        # Normalise the output location so it always ends with a slash.
        if not config.output_dir.endswith('/'):
            config.output_dir += '/'
        # Program checks: next(..., None) yields None when which() finds no match.
        for program in ['curl', 'docker']:
            require(
                next(which(program), None),
                '{} must be installed on every node.'.format(program))

        # Start the workflow, calling map_job() to run the pipeline for each sample
        with Toil(args) as toil:
            toil.start(Job.wrapJobFn(map_job, download_sample, samples,
                                     config))
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil RNA-seq pipeline

    RNA-seq fastqs are combined, aligned, and quantified with 2 different methods (RSEM and Kallisto)

    General usage:
    1. Type "toil-rnaseq generate" to create an editable manifest and config in the current working directory.
    2. Parameterize the pipeline by editing the config.
    3. Fill in the manifest with information pertaining to your samples.
    4. Type "toil-rnaseq run [jobStore]" to execute the pipeline.

    Please read the README.md located in the source directory or at:
    https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/rnaseq_cgl

    Structure of RNA-Seq Pipeline (per sample)

                  3 -- 4 -- 5
                 /          |
      0 -- 1 -- 2 ---- 6 -- 7

    0 = Download sample
    1 = Unpack/Merge fastqs
    2 = CutAdapt (adapter trimming)
    3 = STAR Alignment
    4 = RSEM Quantification
    5 = RSEM Post-processing
    6 = Kallisto
    7 = Consolidate output and upload to S3
    =======================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """
    # The docstring above is passed verbatim as the CLI description (RawTextHelpFormatter keeps its
    # layout), so it is written as help text rather than as a conventional API docstring.
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the RNA-seq pipeline')
    # NOTE(review): with required=True argparse insists that --manifest or --samples is given
    # explicitly, so the --manifest default below is never used on its own. Confirm this is intended.
    group = parser_run.add_mutually_exclusive_group(required=True)
    parser_run.add_argument('--config', default='config-toil-rnaseq.yaml', type=str,
                            help='Path to the (filled in) config file, generated with "generate-config". '
                                 '\nDefault value: "%(default)s"')
    group.add_argument('--manifest', default='manifest-toil-rnaseq.tsv', type=str,
                       help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                            '\nDefault value: "%(default)s"')
    # A space follows "from the file." so the concatenated help does not read "file.Samples".
    group.add_argument('--samples', default=None, nargs='+', type=str,
                       help='Space delimited sample URLs (any number). Samples must be tarfiles/tarballs that contain '
                            'fastq files. URLs follow the format: http://foo.com/sample.tar, '
                            'file:///full/path/to/file.tar. The UUID for the sample will be derived from the file. '
                            'Samples passed in this way will be assumed to be paired end, if using single-end data, '
                            'please use the manifest option.')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # 'generate' produces both files, which is why the two checks below are independent `if`s
    # and only the second one chains into the `run` branch.
    cwd = os.getcwd()
    if args.command in ('generate-config', 'generate'):
        generate_file(os.path.join(cwd, 'config-toil-rnaseq.yaml'), generate_config)
    if args.command in ('generate-manifest', 'generate'):
        generate_file(os.path.join(cwd, 'manifest-toil-rnaseq.tsv'), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run '
                                             '"toil-rnaseq generate-config"'.format(args.config))
        # Sample URLs on the command line take precedence; otherwise the manifest must exist.
        if not args.samples:
            require(os.path.exists(args.manifest), '{} not found and no samples provided. Please '
                                                   'run "toil-rnaseq generate-manifest"'.format(args.manifest))
            samples = parse_samples(path_to_manifest=args.manifest)
        else:
            samples = parse_samples(sample_urls=args.samples)
        # Parse config; the context manager guarantees the file handle is closed.
        with open(args.config) as config_file:
            # NOTE(review): yaml.load can construct arbitrary Python objects. Consider
            # yaml.safe_load if the config file may come from an untrusted source.
            raw_config = yaml.load(config_file.read())
        # Config keys use dashes; convert them to underscores so they are valid attribute names.
        parsed_config = {key.replace('-', '_'): value for key, value in raw_config.iteritems()}
        config = argparse.Namespace(**parsed_config)
        # Falls back to sys.maxint (Python 2) when args.maxCores is unset or falsey.
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        # Config sanity checks
        require(config.kallisto_index or config.star_index,
                'URLs not provided for Kallisto or STAR, so there is nothing to do!')
        # STAR and RSEM inputs are only accepted as a pair.
        if config.star_index or config.rsem_ref:
            require(config.star_index and config.rsem_ref, 'Input provided for STAR or RSEM but not both. STAR: '
                                                           '{}, RSEM: {}'.format(config.star_index, config.rsem_ref))
        require(config.output_dir or config.s3_output_dir, 'output-dir AND/OR s3-output-dir need to be defined, '
                                                           'otherwise sample output is not stored anywhere!')
        # Only the inputs that were actually provided are checked for a recognised URL scheme.
        provided_urls = [url for url in (config.kallisto_index, config.star_index, config.rsem_ref) if url]
        for url in provided_urls:
            require(urlparse(url).scheme in schemes,
                    'Input in config must have the appropriate URL prefix: {}'.format(schemes))
        # Program checks: next(..., None) yields None when which() finds no match.
        for program in ['curl', 'docker']:
            require(next(which(program), None), '{} must be installed on every node.'.format(program))

        # Start the workflow by using map_job() to run the pipeline for each sample
        Job.Runner.startToil(Job.wrapJobFn(map_job, download_sample, samples, config), args)
Esempio n. 16
0
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil exome pipeline

    Perform variant / indel analysis given a pair of tumor/normal BAM files.
    Samples are optionally preprocessed (indel realignment and base quality score recalibration)
    The output of this pipeline is a tarball containing results from MuTect, MuSe, and Pindel.

    General usage:
    1. Type "toil-exome generate" to create an editable manifest and config in the current working directory.
    2. Parameterize the pipeline by editing the config.
    3. Fill in the manifest with information pertaining to your samples.
    4. Type "toil-exome run [jobStore]" to execute the pipeline.

    Please read the README.md located in the source directory or at:
    https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/exome_variant_pipeline

    Structure of variant pipeline (per sample)

           1 2 3 4          14 -------
           | | | |          |        |
        0 --------- 5 ----- 15 -------- 17
                    |       |        |
                   ---      16 -------
                   | |
                   6 7
                   | |
                   8 9
                   | |
                  10 11
                   | |
                  12 13

    0 = Start node
    1 = reference index
    2 = reference dict
    3 = normal bam index
    4 = tumor bam index
    5 = pre-processing node / DAG declaration
    6,7 = RealignerTargetCreator
    8,9 = IndelRealigner
    10,11 = BaseRecalibration
    12,13 = PrintReads
    14 = MuTect
    15 = Pindel
    16 = MuSe
    17 = Consolidate Output and move/upload results
    ==================================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """
    # The docstring above is passed verbatim as the CLI description (RawTextHelpFormatter keeps
    # its layout), so it is written as help text rather than as a conventional API docstring.
    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser(
        'generate-config',
        help='Generates an editable config in the current working directory.')
    subparsers.add_parser(
        'generate-manifest',
        help='Generates an editable manifest in the current working directory.'
    )
    subparsers.add_parser(
        'generate',
        help='Generates a config and manifest in the current working directory.'
    )
    # Run subparser
    parser_run = subparsers.add_parser('run',
                                       help='Runs the CGL exome pipeline')
    parser_run.add_argument(
        '--config',
        default='config-toil-exome.yaml',
        type=str,
        help=
        'Path to the (filled in) config file, generated with "generate-config". '
        '\nDefault value: "%(default)s"')
    parser_run.add_argument(
        '--manifest',
        default='manifest-toil-exome.tsv',
        type=str,
        help=
        'Path to the (filled in) manifest file, generated with "generate-manifest". '
        '\nDefault value: "%(default)s"')
    parser_run.add_argument(
        '--normal',
        default=None,
        type=str,
        help=
        'URL for the normal BAM. URLs can take the form: http://, ftp://, file://, s3://, '
        'and gnos://. The UUID for the sample must be given with the "--uuid" flag.'
    )
    parser_run.add_argument(
        '--tumor',
        default=None,
        type=str,
        help=
        'URL for the tumor BAM. URLs can take the form: http://, ftp://, file://, s3://, '
        'and gnos://. The UUID for the sample must be given with the "--uuid" flag.'
    )
    # A space separates the two literals so the help does not render as 'the"--tumor"'.
    parser_run.add_argument('--uuid',
                            default=None,
                            type=str,
                            help='Provide the UUID of a sample when using the '
                            '"--tumor" and "--normal" option')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # 'generate' produces both files, which is why the two checks below are independent `if`s
    # and only the second one chains into the `run` branch.
    cwd = os.getcwd()
    if args.command in ('generate-config', 'generate'):
        generate_file(os.path.join(cwd, 'config-toil-exome.yaml'),
                      generate_config)
    if args.command in ('generate-manifest', 'generate'):
        generate_file(os.path.join(cwd, 'manifest-toil-exome.tsv'),
                      generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        # The hint names this tool (toil-exome), matching the usage text in the description.
        require(
            os.path.exists(args.config), '{} not found. Please run '
            '"toil-exome generate-config"'.format(args.config))
        # A single sample may be given on the command line; otherwise samples come from the manifest.
        if args.normal or args.tumor or args.uuid:
            require(args.normal and args.tumor and args.uuid,
                    '"--tumor", "--normal" and "--uuid" must all be supplied')
            samples = [[args.uuid, args.normal, args.tumor]]
        else:
            samples = parse_manifest(args.manifest)
        # Parse config; the context manager guarantees the file handle is closed.
        with open(args.config) as config_file:
            # NOTE(review): yaml.load can construct arbitrary Python objects. Consider
            # yaml.safe_load if the config file may come from an untrusted source.
            raw_config = yaml.load(config_file.read())
        # Config keys use dashes; convert them to underscores so they are valid attribute names.
        parsed_config = {
            key.replace('-', '_'): value
            for key, value in raw_config.iteritems()
        }
        config = argparse.Namespace(**parsed_config)
        # Falls back to sys.maxint (Python 2) when args.maxCores is unset or falsey.
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        # Exome pipeline sanity checks
        if config.preprocessing:
            require(
                config.reference and config.phase and config.mills
                and config.dbsnp,
                'Missing inputs for preprocessing, check config file.')
        if config.run_mutect:
            require(config.reference and config.dbsnp and config.cosmic,
                    'Missing inputs for MuTect, check config file.')
        if config.run_pindel:
            require(config.reference, 'Missing input (reference) for Pindel.')
        if config.run_muse:
            require(config.reference and config.dbsnp,
                    'Missing inputs for MuSe, check config file.')
        require(config.output_dir,
                'No output location specified: {}'.format(config.output_dir))
        # Program checks: next(..., None) yields None when which() finds no match.
        for program in ['curl', 'docker']:
            require(
                next(which(program), None),
                '{} must be installed on every node.'.format(program))

        # Launch Pipeline
        Job.Runner.startToil(
            Job.wrapJobFn(download_shared_files, samples, config), args)
Esempio n. 17
0
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil RNA-seq single cell pipeline

    =======================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """
    # The docstring above is passed verbatim as the CLI description (RawTextHelpFormatter keeps
    # its layout), so it is written as help text rather than as a conventional API docstring.
    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser(
        'generate-config',
        help='Generates an editable config in the current working directory.')
    subparsers.add_parser(
        'generate-manifest',
        help='Generates an editable manifest in the current working directory.'
    )
    subparsers.add_parser(
        'generate',
        help='Generates a config and manifest in the current working directory.'
    )
    # Run subparser
    parser_run = subparsers.add_parser(
        'run', help='Runs the RNA-seq single cell pipeline')
    # The exclusive group currently holds only --manifest.
    group = parser_run.add_mutually_exclusive_group()
    parser_run.add_argument(
        '--config',
        default=DEFAULT_CONFIG_NAME,
        type=str,
        help=
        'Path to the (filled in) config file, generated with "generate-config". '
        '\nDefault value: "%(default)s"')
    group.add_argument(
        '--manifest',
        default=DEFAULT_MANIFEST_NAME,
        type=str,
        help=
        'Path to the (filled in) manifest file, generated with "generate-manifest". '
        '\nDefault value: "%(default)s"')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # 'generate' produces both files, which is why the two checks below are independent `if`s
    # and only the second one chains into the `run` branch.
    cwd = os.getcwd()
    if args.command in ('generate-config', 'generate'):
        generate_file(os.path.join(cwd, DEFAULT_CONFIG_NAME), generate_config)
    if args.command in ('generate-manifest', 'generate'):
        generate_file(os.path.join(cwd, DEFAULT_MANIFEST_NAME),
                      generate_manifest)

    # Pipeline execution
    elif args.command == 'run':
        # sanity check
        # NOTE(review): both hints name "toil-rnaseq"; confirm that this is the console script
        # users of the single cell pipeline actually invoke.
        require(
            os.path.exists(args.config), '{} not found. Please run '
            '"toil-rnaseq generate-config"'.format(args.config))
        require(
            os.path.exists(args.manifest),
            '{} not found and no samples provided. Please '
            'run "toil-rnaseq generate-manifest"'.format(args.manifest))
        # get samples
        samples = parse_samples(path_to_manifest=args.manifest)
        # Parse config; the context manager guarantees the file handle is closed.
        with open(args.config) as config_file:
            # NOTE(review): yaml.load can construct arbitrary Python objects. Consider
            # yaml.safe_load if the config file may come from an untrusted source.
            raw_config = yaml.load(config_file.read())
        # Config keys use dashes; convert them to underscores so they are valid attribute names.
        parsed_config = {
            key.replace('-', '_'): value
            for key, value in raw_config.iteritems()
        }
        config = argparse.Namespace(**parsed_config)
        # Falls back to sys.maxint (Python 2) when args.maxCores is unset or falsey.
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        # Config sanity checks
        require(
            config.kallisto_index,
            'URLs not provided for Kallisto index, so there is nothing to do!')
        require(config.output_dir,
                'No output location specified: {}'.format(config.output_dir))
        require(
            urlparse(config.kallisto_index).scheme in SCHEMES,
            'Kallisto index in config must have the appropriate URL prefix: {}'
            .format(SCHEMES))
        # Normalise the output location so it always ends with a slash.
        if not config.output_dir.endswith('/'):
            config.output_dir += '/'
        # Program checks: next(..., None) yields None when which() finds no match.
        for program in ['curl', 'docker']:
            require(
                next(which(program), None),
                '{} must be installed on every node.'.format(program))

        # Start the workflow
        Job.Runner.startToil(
            Job.wrapJobFn(map_job, run_single_cell, samples, config), args)