def parseManifest(path_to_manifest): require( os.path.exists(path_to_manifest), "[parseManifest]Didn't find manifest file, looked " "{}".format(path_to_manifest)) allowed_file_types = ["fq", "bam"] #allowed_file_types = ["fq-gzp", "fq", "fa-gzp", "fa", "f5-tar", "bam"] def parse_line(line): # double check input, shouldn't need to though require(not line.isspace() and not line.startswith("#"), "[parse_line]Invalid {}".format(line)) sample = line.strip().split("\t") require( len(sample) == 4, "[parse_line]Invalid, len(line) != 4, offending {}".format(line)) file_type, sample_url, sample_label, sample_filesize = sample # check the file_type and the URL require(file_type in allowed_file_types, "[parse_line]Unrecognized file type {}".format(file_type)) require( urlparse(sample_url).scheme and urlparse(sample_url), "Invalid URL passed for {}".format(sample_url)) return Sample(file_type=file_type, URL=sample_url, label=sample_label, file_size=human2bytes(sample_filesize)) with open(path_to_manifest, "r") as fH: return map( parse_line, [x for x in fH if (not x.isspace() and not x.startswith("#"))])
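# Illustrative sketch only: the manifest consumed by parseManifest above is a
# tab-separated file with four columns (file type, URL, label, size). The file
# name, URLs and labels below are hypothetical; Sample, require and human2bytes
# are assumed to come from the surrounding module.
#
#   fq    file:///data/sample01.fq        sample01    750M
#   bam   s3://my-bucket/sample02.bam     sample02    2G
#
# samples = parseManifest("manifest-toil-nanopore.tsv")
# for s in samples:
#     print(s.label, s.file_type, s.file_size)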
def joint_genotype_and_filter(job, gvcfs, config): """ Checks for enough disk space for joint genotyping, then calls the genotype and filter pipeline function. :param JobFunctionWrappingJob job: passed automatically by Toil :param dict gvcfs: Dictionary of GVCFs {Sample ID: FileStoreID} :param Namespace config: Input parameters and reference FileStoreIDs Requires the following config attributes: config.genome_fasta FilesStoreID for reference genome fasta file config.genome_fai FilesStoreID for reference genome fasta index file config.genome_dict FilesStoreID for reference genome sequence dictionary file config.available_disk Total available disk space :returns: FileStoreID for the joint genotyped and filtered VCF file :rtype: str """ # Get the total size of genome reference files genome_ref_size = config.genome_fasta.size + config.genome_fai.size + config.genome_dict.size # Require at least 2.5x the sum of the individual GVCF files cohort_size = sum(gvcf.size for gvcf in gvcfs.values()) require(int(2.5 * cohort_size + genome_ref_size) < config.available_disk, 'There is not enough disk space to joint ' 'genotype samples:\n{}'.format('\n'.join(gvcfs.keys()))) job.fileStore.logToMaster('Merging cohort into a single GVCF file') return job.addChildJobFn(genotype_and_filter, gvcfs, config).rv()
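# Worked example of the disk check above (all numbers hypothetical): for a cohort of
# three GVCFs of 4 GiB each (cohort_size = 12 GiB) and roughly 3 GiB of genome
# reference files, joint genotyping is only scheduled when config.available_disk
# exceeds int(2.5 * 12 GiB + 3 GiB) = 33 GiB.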
def parse_manifest(manifest_path): """ Parse manifest file :param str manifest_path: Path to manifest file :return: samples :rtype: list[str, list] """ samples = [] with open(manifest_path, 'r') as f: for line in f: if not line.isspace() and not line.startswith('#'): sample = line.strip().split('\t') require( 2 <= len(sample) <= 3, 'Bad manifest format! ' 'Expected UUID\tURL1\t[URL2] (tab separated), got: {}'. format(sample)) uuid = sample[0] urls = sample[1:] for url in urls: require( urlparse(url).scheme and urlparse(url), 'Invalid URL passed for {}'.format(url)) samples.append([uuid, urls]) return samples
def marginAlignRootJobFunction(job, config, sample): def cull_sample_files(): if sample.file_type == "fq": config["sample_FileStoreID"] = job.addChildJobFn( urlDownlodJobFunction, sample.URL, disk=sample.file_size).rv() return None elif sample.file_type == "bam": bwa_alignment_fid = job.addChildJobFn(urlDownlodJobFunction, sample.URL, disk=sample.file_size).rv() config["sample_FileStoreID"] = job.addChildJobFn( getFastqFromBam, sample, disk=(2 * sample.file_size)).rv() return bwa_alignment_fid else: raise RuntimeError( "[marginAlignRootJobFunction]Unsupported sample file type %s" % sample.file_type) # download/import the reference config["reference_FileStoreID"] = job.addChildJobFn( urlDownlodJobFunction, config["ref"], disk=config["ref_size"]).rv() # cull the sample, which can be a fastq or a BAM this will be None if we are doing BWA alignment alignment_fid = cull_sample_files() # checks if we're doing alignments or variant calling if config["realign"] or config["caller"]: # download the input model, if given. Fail if no model is given and we're performing HMM realignment without # doing EM if config["hmm_file"] is not None: config["input_hmm_FileStoreID"] = job.addChildJobFn( urlDownlodJobFunction, config["hmm_file"], disk="10M").rv() else: if config["realign"]: require( config["EM"], "[marginAlignRootJobFunction]Need to specify an input model or " "set EM to True to perform HMM realignment") config["input_hmm_FileStoreID"] = None # initialize key in config for trained model if we're performing EM if config["EM"]: config["normalized_trained_model_FileStoreID"] = None config["sample_label"] = sample.label config["reference_label"] = config["ref"] job.fileStore.logToMaster("[run_tool]Processing sample:{}".format( config["sample_label"])) job.fileStore.logToMaster("[run_tool]Chaining :{}".format( config["chain"])) job.fileStore.logToMaster("[run_tool]Realign :{}".format( config["realign"])) job.fileStore.logToMaster("[run_tool]Caller :{}".format( config["caller"])) job.fileStore.logToMaster("[run_tool]Stats :{}".format( config["stats"])) job.addFollowOnJobFn(marginAlignJobFunction, config, alignment_fid)
def signalAlignCheckInputJobFunction(job, config, sample): require(config["ref"], "[signalAlignCheckInputJobFunction]Missing reference URL") require(config["ledger_url"], "[signalAlignCheckInputJobFunction]Missing ledger URL") require(config["HMM_file"], "[signalAlignCheckInputJobFunction]Missing HMM file URL") require(config["HDP_file"], "[signalAlignCheckInputJobFunction]Missing HDP file URL") if config["degenerate"]: require(checkDegenerate(config["degenerate"]), "[signalAlignJobFunction]Degenerate %s not allowed" % config["degenerate"]) job.addFollowOnJobFn(signalAlignRootJobFunction, config, sample)
def kmer_dag(job,
             input_file,
             output_path,
             kmer_length,
             spark_conf,
             workers,
             cores,
             memory,
             sudo):
    '''
    Optionally launches a Spark cluster and then runs ADAM to count k-mers on an
    input file.

    :param job: Toil job
    :param input_file: URL/path to input file to count k-mers on
    :param output_path: URL/path to save k-mer counts at
    :param kmer_length: The length of k-mer substrings to count.
    :param spark_conf: Optional Spark configuration. If set, workers should \
    not be set.
    :param workers: Optional number of Spark workers to launch. If set, \
    spark_conf should not be set, and cores and memory should be set.
    :param cores: Number of cores per Spark worker. Must be set if workers is \
    set.
    :param memory: Amount of memory to provide to Spark workers. Must be set \
    if workers is set.
    :param sudo: Whether or not to run Spark containers with sudo.

    :type job: toil.Job
    :type input_file: string
    :type output_path: string
    :type kmer_length: int or string
    :type spark_conf: string or None
    :type workers: int or None
    :type cores: int or None
    :type memory: int or None
    :type sudo: boolean
    '''
    # exactly one mode is allowed: either a Spark configuration, or a worker count
    # together with cores and memory
    require((spark_conf is not None and workers is None) or
            (workers is not None and cores is not None and memory is not None and spark_conf is None),
            "Either worker count (--workers) must be defined or user must pass in Spark configuration (--spark-conf).")

    # if we do not have a spark configuration, then we must spawn a cluster
    if spark_conf is None:
        master_hostname = spawn_spark_cluster(job, sudo, workers, cores)
    else:
        spark_conf = shlex.split(spark_conf)
        # assumption: when a Spark configuration is supplied, the master address is
        # taken from that configuration and no hostname is passed along
        master_hostname = None

    job.addChildJobFn(download_count_upload,
                      master_hostname, input_file, output_path, kmer_length,
                      spark_conf, memory, sudo)
def generate_file(file_path, generate_func):
    """
    Checks for file existence, generates the file, and prints a message

    :param str file_path: File location to generate file
    :param function generate_func: Function used to generate file
    """
    require(not os.path.exists(file_path), file_path + ' already exists!')
    with open(file_path, 'w') as f:
        f.write(generate_func())
    print('\t{} has been generated in the current working directory.'.format(os.path.basename(file_path)))
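# Minimal usage sketch for generate_file, assuming a generate_config function that
# returns the config text (as the pipelines below do); the filename is hypothetical.
#
# generate_file(os.path.join(os.getcwd(), 'config-toil-example.yaml'), generate_config)
#
# Calling it again with the same path fails the require() check, because the file now exists.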
def _make_parameters(master_ip, default_parameters, memory, arguments, override_parameters): """ Makes a Spark Submit style job submission line. :param masterIP: The Spark leader IP address. :param default_parameters: Application specific Spark configuration parameters. :param memory: The memory to allocate to each Spark driver and executor. :param arguments: Arguments to pass to the submitted job. :param override_parameters: Parameters passed by the user, that override our defaults. :type masterIP: MasterAddress :type default_parameters: list of string :type arguments: list of string :type memory: int or None :type override_parameters: list of string or None """ # python doesn't support logical xor? # anywho, exactly one of memory or override_parameters must be defined require((override_parameters is not None or memory is not None) and ( override_parameters is None or memory is None ), "Either the memory setting must be defined or you must provide Spark configuration parameters." ) # if the user hasn't provided overrides, set our defaults parameters = [] if memory is not None: parameters = [ "--conf", "spark.driver.memory=%sg" % memory, "--conf", "spark.executor.memory=%sg" % memory ] else: parameters.extend(override_parameters) if master_ip: parameters.extend([ "--master", "spark://%s:%s" % (master_ip, SPARK_MASTER_PORT), "--conf", ("spark.hadoop.fs.default.name=hdfs://%s:%s" % (master_ip, HDFS_MASTER_PORT)) ]) # add the tool specific spark parameters parameters.extend(default_parameters) # spark submit expects a '--' to split the spark conf arguments from tool arguments parameters.append('--') # now add the tool arguments and return parameters.extend(arguments) return parameters
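# Illustrative sketch of the list _make_parameters builds when only `memory` is set
# (all values hypothetical; SPARK_MASTER_PORT and HDFS_MASTER_PORT come from the module):
#
# _make_parameters('10.0.0.1', ['--class', 'org.example.Tool'], 8, ['in.bam', 'out.bam'], None)
# would return, in order:
#   ['--conf', 'spark.driver.memory=8g',
#    '--conf', 'spark.executor.memory=8g',
#    '--master', 'spark://10.0.0.1:<SPARK_MASTER_PORT>',
#    '--conf', 'spark.hadoop.fs.default.name=hdfs://10.0.0.1:<HDFS_MASTER_PORT>',
#    '--class', 'org.example.Tool',
#    '--',
#    'in.bam', 'out.bam']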
def test_pipeline_output_with_graphs(tmpdir): uuid = "test_rnaseqsc_g" output_dir = os.path.join(str(tmpdir), uuid) if not os.path.isdir(output_dir): os.makedirs(output_dir) output_file = os.path.join(output_dir, uuid + ".tar.gz") jobstore = os.path.join(str(tmpdir), uuid + "_jobstore") input = "file://" + _get_test_fastq_files(tmpdir, tarball=True) config = _generate_config(tmpdir, output_dir, generate_graphs=True) manifest = _generate_manifest(tmpdir, [[uuid, "pseudo", input]]) subprocess.check_call([ 'toil-rnaseq-sc', 'run', '--config', config, '--manifest', manifest, '--maxCores', "1", jobstore ]) # ensure file and directories exist require(os.path.isfile(output_file), "expected outputfile to exist: " + output_file) subprocess.check_call(['tar', '-xvf', output_file, '-C', output_dir]) require( os.path.isfile(os.path.join(output_dir, uuid, "kallisto", "matrix.tsv")), "matrix.tsv file should exist in output tarball") require(os.path.isdir(os.path.join(output_dir, uuid, "kallisto", "plots")), "plots directory should exist in output tarball") require( len(os.listdir(os.path.join(output_dir, uuid, "kallisto", "plots"))) > 0, "plots directory should not be empty in output tarball")
def parse_samples(path_to_manifest): """ Parses samples, specified in either a manifest or listed with --samples :param str path_to_manifest: Path to configuration file :return: Samples and their attributes as defined in the manifest :rtype: list[list] """ samples = [] with open(path_to_manifest, 'r') as f: for line in f.readlines(): if line.isspace() or line.startswith('#'): continue sample = line.strip().split('\t') if len(sample) != 2: raise UserError( 'Bad manifest format! Expected 2 tab separated columns, got: {}' .format(sample)) # If a directory is passed in, use all samples in that directory uuid, url = sample if urlparse(url).scheme == '': url = [ 'file://' + os.path.join(url, x) for x in os.listdir(url) ] # If url is a tarball elif url.endswith('tar.gz') or url.endswith('tar'): require( urlparse(url).scheme in SCHEMES, 'URL "{}" not valid. Schemes:{}'.format(url, SCHEMES)) url = [url] # If URL is a fastq or series of fastqs elif url.endswith('fastq.gz') or url.endswith( 'fastq') or url.endswith('fq.gz') or url.endswith('fq'): url = url.split(',') [ require( urlparse(x).scheme in SCHEMES, 'URL "{}" not valid. Schemes:{}'.format(url, SCHEMES)) for x in url ] else: raise UserError( 'URL does not have approved extension: .tar.gz, .tar, .fastq.gz, .fastq, .fq.gz, .fq' ) sample = [uuid, url] samples.append(sample) return samples
def run_cutadapt(job, r1_id, r2_id, fwd_3pr_adapter, rev_3pr_adapter): """ Adapter trimming for RNA-seq data :param JobFunctionWrappingJob job: passed automatically by Toil :param str r1_id: FileStoreID of fastq read 1 :param str r2_id: FileStoreID of fastq read 2 (if paired data) :param str fwd_3pr_adapter: Adapter sequence for the forward 3' adapter :param str rev_3pr_adapter: Adapter sequence for the reverse 3' adapter (second fastq pair) :return: R1 and R2 FileStoreIDs :rtype: tuple """ work_dir = job.fileStore.getLocalTempDir() if r2_id: require(rev_3pr_adapter, "Paired end data requires a reverse 3' adapter sequence.") # Retrieve files parameters = ['-a', fwd_3pr_adapter, '-m', '35'] if r1_id and r2_id: job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq')) job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq')) parameters.extend([ '-A', rev_3pr_adapter, '-o', '/data/R1_cutadapt.fastq', '-p', '/data/R2_cutadapt.fastq', '/data/R1.fastq', '/data/R2.fastq' ]) else: job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq')) parameters.extend(['-o', '/data/R1_cutadapt.fastq', '/data/R1.fastq']) # Call: CutAdapt docker_call( job=job, tool= 'quay.io/ucsc_cgl/cutadapt:1.9--6bd44edd2b8f8f17e25c5a268fedaab65fa851d2', work_dir=work_dir, parameters=parameters) # Write to fileStore if r1_id and r2_id: r1_cut_id = job.fileStore.writeGlobalFile( os.path.join(work_dir, 'R1_cutadapt.fastq')) r2_cut_id = job.fileStore.writeGlobalFile( os.path.join(work_dir, 'R2_cutadapt.fastq')) else: r1_cut_id = job.fileStore.writeGlobalFile( os.path.join(work_dir, 'R1_cutadapt.fastq')) r2_cut_id = None return r1_cut_id, r2_cut_id
def test_quant_to_pseudo(tmpdir):
    # path resolution assumes this test is invoked from the root of the toil-rnaseq-sc directory
    input = os.path.abspath("testdata/input")
    output = os.path.join(str(tmpdir), "output")
    expected = os.path.abspath("testdata/expected")
    os.mkdir(output)
    quant_to_pseudo(job=None, input_dir=input, output_dir=output)
    filenames = os.listdir(expected)
    for file in filenames:
        with open(os.path.join(expected, file)) as expected_file, open(os.path.join(output, file)) as output_file:
            expected_read = expected_file.read()
            output_read = output_file.read()
            require(expected_read == output_read,
                    "expected {} did not match actual {}".format(expected_read, output_read))
def parseManifest(path_to_manifest): require(os.path.exists(path_to_manifest), "[parseManifest]Didn't find manifest file, looked " "{}".format(path_to_manifest)) def parse_line(line): # double check input, shouldn't need to though require(not line.isspace() and not line.startswith("#"), "[parse_line]Invalid {}".format(line)) sample_line = line.strip().split("\t") require(len(sample_line) == 3, "[parse_line]Invalid, len(line) != 3, offending {}".format(line)) url, sample_label, size = sample_line # check alignment URL require(urlparse(url).scheme and urlparse(url), "Invalid URL passed for {}".format(url)) return SignalAlignSample(URL=url, size=size, sample_label=sample_label) with open(path_to_manifest, "r") as fH: return map(parse_line, [x for x in fH if (not x.isspace() and not x.startswith("#"))])
def makeNanoporeReadLedgerJobFunction(job, tar_fid, batchsize, readstore_dir): workdir = job.fileStore.getLocalTempDir() minion_archive = job.fileStore.readGlobalFile(tar_fid) tar_handle = tarfile.open(minion_archive, "r:gz") members = tar_handle.getmembers() member_paths = [os.path.join(workdir, m.name) for m in members] tar_handle.extractall(path=workdir) require(batchsize <= len(member_paths), "[makeNanoporeReadLedgerJobFunction]Cannot split %s members into batches of %s" % (len(member_paths), batchsize)) member_iter = [member_paths[i:i + batchsize] for i in range(0, len(member_paths), batchsize)] tar_fids = [archiveBatchAndUploadToFileStore(job, b, workdir) for b in member_iter] ledger_shards = [job.addChildJobFn(makeNanoporeReadsJobFunction, fid, readstore_dir, cores=0.5).rv() for fid in tar_fids] tar_handle.close() return job.addFollowOnJobFn(consolidateLedgerShardsJobFunction, ledger_shards).rv()
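# Batching sketch for makeNanoporeReadLedgerJobFunction above (hypothetical numbers):
# with 100 extracted archive members and batchsize=30, member_iter yields batches of
# 30, 30, 30 and 10 paths; each batch is re-tarred and uploaded, handed to its own
# makeNanoporeReadsJobFunction child job, and the resulting ledger shards are merged
# by the consolidateLedgerShardsJobFunction follow-on.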
def s3am_upload(fpath, s3_dir, num_cores=1, s3_key_path=None):
    """
    Uploads a file to S3 via S3AM
    S3AM binary must be on the PATH to use this function
    For SSE-C encryption: provide a path to a 32-byte file

    :param str fpath: Path to file to upload
    :param str s3_dir: Output S3 path. Format: s3://bucket/[directory]
    :param int num_cores: Number of cores to use for up/download with S3AM
    :param str s3_key_path: (OPTIONAL) Path to 32-byte key to be used for SSE-C encryption
    """
    require(s3_dir.startswith('s3://'), 'Format of s3_dir (s3://) is incorrect: {}'.format(s3_dir))
    s3_dir = os.path.join(s3_dir, os.path.basename(fpath))
    _s3am_with_retry(num_cores, file_path=fpath, s3_url=s3_dir, mode='upload', s3_key_path=s3_key_path)
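# Usage sketch for s3am_upload (bucket and paths are hypothetical); the uploaded
# object keeps the file's basename under the given S3 prefix:
#
# s3am_upload('/data/sample01.bam', 's3://my-bucket/alignments', num_cores=4,
#             s3_key_path='/keys/sse-c.key')
# # uploads to s3://my-bucket/alignments/sample01.bam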
def processReferenceSequence(ref_seq, workdir, motif_key=None, sub_char="X", parent_job=None): # make the forward and backward sequences, substituting the necessary motifs if motif_key is not None: motif, ok = getMotif(motif_key, ref_seq) require( ok, "[processReferenceSequence]Illegal motif_key given %s" % motif_key) if parent_job is not None: parent_job.fileStore.logToMaster( "[processReferenceSequence]Made %s substitutions" % motif.substitutionPositionCount()) try: fw_refseq = motif.forwardSubstitutedSequence(sub_char) bw_refseq = motif.complementSubstitutedSequence(sub_char) except AssertionError: return None, None, False else: fw_refseq = ref_seq.upper() bw_refseq = _reverseComplement(fw_refseq, reverse=False, complement=True) fw_refseqfile = LocalFile(workdir=workdir) bw_refseqfile = LocalFile(workdir=workdir) sequences = [fw_refseq, bw_refseq] sequence_files = [fw_refseqfile, bw_refseqfile] for f, s in zip(sequence_files, sequences): _h = open(f.fullpathGetter(), "w") _h.write(s + "\n") _h.close() [ require(os.path.exists(f.fullpathGetter()), "[processReferenceSequence]Missing %s" % f.filenameGetter()) for f in sequence_files ] return fw_refseqfile, bw_refseqfile, True
def parse_manifest(path_to_manifest): """ Parses samples, specified in either a manifest or listed with --samples :param str path_to_manifest: Path to configuration file :return: Samples and their attributes as defined in the manifest :rtype: list[list] """ samples = [] with open(path_to_manifest, 'r') as f: for line in f.readlines(): if not line.isspace() and not line.startswith('#'): sample = line.strip().split('\t') require(len(sample) == 3, 'Bad manifest format! ' 'Expected 3 tab separated columns, got: {}'.format(sample)) uuid, normal, tumor = sample for url in [normal, tumor]: require(urlparse(url).scheme and urlparse(url), 'Invalid URL passed for {}'.format(url)) samples.append(sample) return samples
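# Illustrative manifest line for parse_manifest above: three tab-separated columns,
# UUID followed by the normal and the tumor BAM URL (values hypothetical):
#
#   patient-01    s3://my-bucket/patient-01-normal.bam    s3://my-bucket/patient-01-tumor.bam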
def download_sample(job, sample, config):
    """
    Download sample and store unique attributes

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param list(str, str, str, str) sample: Sample information: filetype, paired/unpaired, UUID, and URL
    :param Namespace config: Argparse Namespace object containing argument inputs
    """
    # Create copy of config that is sample specific
    config = argparse.Namespace(**vars(config))
    config.file_type, config.paired, config.uuid, config.url = sample
    config.paired = True if config.paired == 'paired' else False
    config.cores = min(config.maxCores, multiprocessing.cpu_count())
    disk = '2G' if config.ci_test else '20G'
    job.fileStore.logToMaster('UUID: {}\nURL: {}\nPaired: {}\nFile Type: {}\nCores: {}\nCIMode: {}'.format(
        config.uuid, config.url, config.paired, config.file_type, config.cores, config.ci_test))
    # Download or locate local file and place in the jobStore
    tar_id, fastq_ids = None, []
    if config.file_type == 'tar':
        tar_id = job.addChildJobFn(download_url_job, config.url, cghub_key_path=config.gtkey,
                                   s3_key_path=config.ssec, disk=disk).rv()
    else:
        urls = config.url.split(',')
        if config.paired:
            require(len(urls) % 2 == 0, 'Fastq pairs must have multiples of 2 URLs separated by comma')
        config.gz = True if urls[0].endswith('gz') else None
        for url in urls:
            fastq_ids.append(job.addChildJobFn(download_url_job, url, cghub_key_path=config.gtkey,
                                               s3_key_path=config.ssec, disk=disk).rv())
    job.addFollowOnJobFn(preprocessing_declaration, config, tar_id, fastq_ids)
def parseManifestReadstore(path_to_manifest): require(os.path.exists(path_to_manifest), "[parseManifest]Didn't find manifest file, looked " "{}".format(path_to_manifest)) allowed_file_types = ("tar", "gz-tar") def parse_line(line): # double check input, shouldn't need to though require(not line.isspace() and not line.startswith("#"), "[parse_line]Invalid {}".format(line)) sample_line = line.strip().split("\t") require(len(sample_line) == 4, "[parse_line]Invalid, len(line) != 4, offending {}".format(line)) filetype, url, sample_label, size = sample_line # checks: # check filetype require(filetype in allowed_file_types, "[parse_line]Unrecognized file type {}".format(filetype)) # check URL require(urlparse(url).scheme and urlparse(url), "Invalid URL passed for {}".format(url)) return ReadstoreSample(file_type=filetype, URL=url, size=human2bytes(size), sample_label=sample_label) with open(path_to_manifest, "r") as fH: return map(parse_line, [x for x in fH if (not x.isspace() and not x.startswith("#"))])
def _SignalMachine(read_label, cigar, nanopore_read): guide_aln = LocalFile(workdir=workdir) _handle = open(guide_aln.fullpathGetter(), "w") _handle.write(cigar) _handle.close() require(os.path.exists(guide_aln.fullpathGetter()), "NO guide aln file") signalMachine_args = [ "--sm3Hdp", "-s", "1", "-o", "%s" % degenerate_enum, "-L", "%s" % read_label, "-T", "%s%s" % (DOCKER_DIR, models.localFileName(hmmfid)), "-q", "%s%s" % (DOCKER_DIR, nanopore_read.filenameGetter()), "-f", "%s%s" % (DOCKER_DIR, fw_seqfile.filenameGetter()), "-b", "%s%s" % (DOCKER_DIR, bw_seqfile.filenameGetter()), "-p", "%s%s" % (DOCKER_DIR, guide_aln.filenameGetter()), "-u", "%s%s" % (DOCKER_DIR, posteriors.filenameGetter()), "-v", "%s%s" % (DOCKER_DIR, models.localFileName(hdpfid)), ] try: docker_call(job=job, tool=signalMachine_image, parameters=signalMachine_args, work_dir=(workdir + "/")) except subprocess.CalledProcessError: pass
def makeNanoporeReadsJobFunction(job, tar_fid, readstore_dir): def makeNanoporeRead(f5_path): # here we load the NanoporeRead and write it to a file np = NanoporeRead(fast_five_file=f5_path, twoD=False) # make this a config arg ok = np.Initialize(job) if not ok: return None _l = np.read_label tF = job.fileStore.getLocalTempFile() fH = open(tF, "w") ok = np.Write(job, fH, initialize=False) if not ok: fH.close() return None fH.close() # then we gzip it and deliver it to the readstore and return the ledger line fn = LocalFile(workdir=workdir, filename="%s.np.gz" % _l) fH = open(tF, "rb") gz = gzip.open(fn.fullpathGetter(), "wb") shutil.copyfileobj(fH, gz) fH.close() gz.close() try: deliverOutput(job, fn, readstore_dir) except RuntimeError: job.fileStore.logToMaster("[makeNanoporeReadsJobFunction]Read %s failed to upload" % _l) return None return (_l, "%s%s\n" % (readstore_dir, fn.filenameGetter())) def write_ledger_line(line, fH): l = "%s\t%s" % (line[0], line[1]) # read_label, npread URL fH.write(l) workdir = job.fileStore.getLocalTempDir() tar = job.fileStore.readGlobalFile(tar_fid) tar_handle = tarfile.open(tar, "r:gz") members = tar_handle.getmembers() members = [os.path.join(workdir, m.name) for m in members] tar_handle.extractall(path=workdir) [require(os.path.exists(m), "[makeNanoporeReadsJobFunction]Missing member %s" % m) for m in members] ledger_lines = list(map(makeNanoporeRead, members)) tar_handle.close() ledger_shard = job.fileStore.getLocalTempFile() _handle = open(ledger_shard, "w") [write_ledger_line(l, _handle) for l in ledger_lines if l is not None] _handle.close() return job.fileStore.writeGlobalFile(ledger_shard)
def getFastqFromBam(job, bam_sample, samtools_image="quay.io/ucsc_cgl/samtools"):
    # n.b. this is NOT a jobFunctionWrappingJob, it just takes the parent job as
    # an argument to have access to the job store
    # download the BAM to the local directory, use a uid to avoid conflicts
    uid = uuid.uuid4().hex
    work_dir = job.fileStore.getLocalTempDir()
    local_bam = LocalFile(workdir=work_dir, filename="bam_{}.bam".format(uid))
    fastq_reads = LocalFile(workdir=work_dir, filename="fastq_reads{}.fq".format(uid))

    urlDownload(parent_job=job, source_url=bam_sample.URL, destination_file=local_bam)

    require(not os.path.exists(fastq_reads.fullpathGetter()), "[getFastqFromBam]fastq file already exists")

    # run samtools to get the reads from the BAM
    # TODO use DOCKER_DIR and clean this up. idea: make globals.py or something
    samtools_parameters = ["fastq", "/data/{}".format(local_bam.filenameGetter())]
    with open(fastq_reads.fullpathGetter(), 'w') as fH:
        docker_call(job=job, tool=samtools_image, parameters=samtools_parameters,
                    work_dir=work_dir, outfile=fH)

    require(os.path.exists(fastq_reads.fullpathGetter()), "[getFastqFromBam]didn't generate reads")

    # upload fastq to fileStore
    return job.fileStore.writeGlobalFile(fastq_reads.fullpathGetter())
def _get_mount_path(self): """ Returns the path of the mount point of the current container. If this method is invoked outside of a Docker container a NotInsideContainerError is raised. Likewise if the docker daemon is unreachable from inside the container a UserError is raised. This method is idempotent. """ if self._mount_path is None: name = current_docker_container_id() if dockerd_is_reachable(): # Get name of mounted volume blob = json.loads( subprocess.check_output(['docker', 'inspect', name])) mounts = blob[0]['Mounts'] # Ensure docker.sock is mounted correctly sock_mnt = [ x['Source'] == x['Destination'] for x in mounts if 'docker.sock' in x['Source'] ] require( len(sock_mnt) == 1, 'Missing socket mount. Requires the following: ' 'docker run -v /var/run/docker.sock:/var/run/docker.sock') # Ensure formatting of command for 2 mount points if len(mounts) == 2: require( all(x['Source'] == x['Destination'] for x in mounts), 'Docker Src/Dst mount points, invoked with the -v argument, ' 'must be the same if only using one mount point aside from the docker ' 'socket.') work_mount = [ x['Source'] for x in mounts if 'docker.sock' not in x['Source'] ] else: # Ensure only one mirror mount exists aside from docker.sock mirror_mounts = [ x['Source'] for x in mounts if x['Source'] == x['Destination'] ] work_mount = [ x for x in mirror_mounts if 'docker.sock' not in x ] require( len(work_mount) == 1, 'Wrong number of mirror mounts provided, see ' 'documentation.') self._mount_path = work_mount[0] log.info('The work mount is: %s', self._mount_path) else: raise UserError( 'Docker daemon is not reachable, ensure Docker is being run with: ' '"-v /var/run/docker.sock:/var/run/docker.sock" as an argument.' ) return self._mount_path
def main(): parser = argparse.ArgumentParser() subparsers = parser.add_subparsers(dest='command') # Generate subparsers subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.') # Run subparser parser_run = subparsers.add_parser('run', help='Runs the ADAM preprocessing pipeline') parser_run.add_argument('--config', default='adam_preprocessing.config', type=str, help='Path to the (filled in) config file, generated with "generate-config". ' '\nDefault value: "%(default)s"') parser_run.add_argument('--sample', help='The S3 URL or local path to the input SAM or BAM file.' 'NOTE: unlike other pipelines, we do not support ftp://, gnos://, etc. schemes.') parser_run.add_argument('--output-dir', required=True, default=None, help='full path where final results will be output') parser_run.add_argument('-s', '--suffix', default='', help='Additional suffix to add to the names of the output files') Job.Runner.addToilOptions(parser_run) args = parser.parse_args() cwd = os.getcwd() if args.command == 'generate-config': generate_file(os.path.join(cwd, 'adam-preprocessing.config'), generate_config) # Pipeline execution elif args.command == 'run': require(os.path.exists(args.config), '{} not found. Please run ' 'generate-config'.format(args.config)) # Parse config parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()} inputs = argparse.Namespace(**parsed_config) require(not (inputs.master_ip and inputs.num_nodes), 'Only one of master_ip and num_nodes can be provided.') if not hasattr(inputs, 'master_ip'): require(inputs.num_nodes > 1, 'num_nodes allocates one Spark/HDFS master and n-1 workers, and ' 'thus must be greater than 1. %d was passed.' % inputs.num_nodes) for arg in [inputs.dbsnp, inputs.memory]: require(arg, 'Required argument {} missing from config'.format(arg)) Job.Runner.startToil(Job.wrapJobFn(static_adam_preprocessing_dag, inputs, args.sample, args.output_dir), args)
def parse_samples(path_to_manifest=None, sample_urls=None): """ Parses samples, specified in either a manifest or listed with --samples :param str path_to_manifest: Path to configuration file :param list[str] sample_urls: Sample URLs :return: Samples and their attributes as defined in the manifest :rtype: list[list] """ samples = [] if sample_urls: for url in sample_urls: samples.append( ['tar', 'paired', os.path.basename(url.split('.')[0]), url]) elif path_to_manifest: with open(path_to_manifest, 'r') as f: for line in f.readlines(): if not line.isspace() and not line.startswith('#'): sample = line.strip().split('\t') require( len(sample) == 4, 'Bad manifest format! ' 'Expected 4 tab separated columns, got: {}'.format( sample)) file_type, paired, uuid, url = sample require( file_type == 'tar' or file_type == 'fq', '1st column must be "tar" or "fq": {}'.format( sample[0])) require( paired == 'paired' or paired == 'single', '2nd column must be "paired" or "single": {}'.format( sample[1])) if file_type == 'fq' and paired == 'paired': require( len(url.split(',')) == 2, 'Fastq pair requires two URLs separated' ' by a comma: {}'.format(url)) samples.append(sample) return samples
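# Illustrative manifest lines for parse_samples above: four tab-separated columns
# (file type, paired/single, UUID, URL); a paired "fq" entry carries exactly two
# comma-separated URLs. All values below are hypothetical.
#
#   tar   paired   sample-a   s3://my-bucket/sample-a.tar
#   fq    paired   sample-b   s3://my-bucket/b_R1.fastq.gz,s3://my-bucket/b_R2.fastq.gz
#   fq    single   sample-c   file:///data/sample-c.fastq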
def parse_manifest(path_to_manifest): """ Parses manifest file for Toil Germline Pipeline :param str path_to_manifest: Path to sample manifest file :return: List of GermlineSample namedtuples :rtype: list[GermlineSample] """ bam_re = r"^(?P<uuid>\S+)\s(?P<url>\S+[bsc][r]?am)" fq_re = r"^(?P<uuid>\S+)\s(?P<url>\S+)\s(?P<paired_url>\S+)?\s?(?P<rg_line>@RG\S+)" samples = [] with open(path_to_manifest, 'r') as f: for line in f.readlines(): line = line.strip() if line.startswith('#'): continue bam_match = re.match(bam_re, line) fastq_match = re.match(fq_re, line) if bam_match: uuid = bam_match.group('uuid') url = bam_match.group('url') paired_url = None rg_line = None require('.bam' in url.lower(), 'Expected .bam extension:\n{}:\t{}'.format(uuid, url)) elif fastq_match: uuid = fastq_match.group('uuid') url = fastq_match.group('url') paired_url = fastq_match.group('paired_url') rg_line = fastq_match.group('rg_line') require('.fq' in url.lower() or '.fastq' in url.lower(), 'Expected .fq extension:\n{}:\t{}'.format(uuid, url)) else: raise ValueError('Could not parse entry in manifest: %s\n%s' % (f.name, line)) # Checks that URL has a scheme require( urlparse(url).scheme, 'Invalid URL passed for {}'.format(url)) samples.append(GermlineSample(uuid, url, paired_url, rg_line)) return samples
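# Illustrative manifest lines for the germline parse_manifest above: a BAM entry is
# "UUID  URL-ending-in-.bam"; a FASTQ entry is "UUID  URL  [paired URL]  @RG-line",
# with the read group written using literal "\t" separators so it contains no
# whitespace (all values hypothetical):
#
#   sample-01   s3://my-bucket/sample-01.bam
#   sample-02   file:///data/sample-02_1.fq   file:///data/sample-02_2.fq   @RG\tID:foo\tSM:sample-02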
def main(): """toil-signalAlign master script """ def parse_args(): parser = argparse.ArgumentParser(description=print_help.__doc__, formatter_class=argparse.RawTextHelpFormatter) subparsers = parser.add_subparsers(dest="command") # parsers for running the full pipeline run_parser = subparsers.add_parser("run", help="runs full workflow on a BAM") run_parser.add_argument('--config', default='config-toil-signalAlign.yaml', type=str, help='Path to the (filled in) config file, generated with "generate".') run_parser.add_argument('--manifest', default='manifest-toil-signalAlign.tsv', type=str, help='Path to the (filled in) manifest file, generated with "generate". ' '\nDefault value: "%(default)s".') subparsers.add_parser("generate", help="generates a config file for your run, do this first") # parsers for running the readstore pipeline readstore_parser = subparsers.add_parser("run-readstore", help="generates a readstore from a tar of .fast5s") readstore_parser.add_argument('--config', default='config-toil-signalAlign-readstore.yaml', type=str, help='Path to the (filled in) config file, generated with "generate".') readstore_parser.add_argument('--manifest', default='manifest-toil-signalAlign-readstore.tsv', type=str, help='Path to the (filled in) manifest file, generated with "generate". ' '\nDefault value: "%(default)s".') subparsers.add_parser("generate-readstore", help="generates a config file for making a readstore") Job.Runner.addToilOptions(run_parser) Job.Runner.addToilOptions(readstore_parser) return parser.parse_args() def exitBadInput(message=None): if message is not None: print(message, file=sys.stderr) sys.exit(1) if len(sys.argv) == 1: exitBadInput(print_help()) cwd = os.getcwd() args = parse_args() if args.command == "generate" or args.command == "generate-readstore": if args.command == "generate": config_filename = "config-toil-signalAlign.yaml" manifest_filename = "manifest-toil-signalAlign.tsv" else: config_filename = "config-toil-signalAlign-readstore.yaml" manifest_filename = "manifest-toil-signalAlign-readstore.tsv" configGenerator = partial(generateConfig, command=args.command) manifestGenerator = partial(generateManifest, command=args.command) try: config_path = os.path.join(cwd, config_filename) generate_file(config_path, configGenerator) except UserError: print("[toil-nanopore]NOTICE using existing config file {}".format(config_path)) pass try: manifest_path = os.path.join(cwd, manifest_filename) generate_file(manifest_path, manifestGenerator) except UserError: print("[toil-nanopore]NOTICE using existing manifest {}".format(manifest_path)) elif args.command == "run": require(os.path.exists(args.config), "{config} not found run generate".format(config=args.config)) # Parse config config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()} samples = parseManifest(args.manifest) for sample in samples: with Toil(args) as toil: if not toil.options.restart: root_job = Job.wrapJobFn(signalAlignCheckInputJobFunction, config, sample) return toil.start(root_job) else: toil.restart() elif args.command == "run-readstore": require(os.path.exists(args.config), "{config} not found run generate-readstore".format(config=args.config)) # Parse config config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()} samples = parseManifestReadstore(args.manifest) with Toil(args) as toil: if not toil.options.restart: root_job = Job.wrapJobFn(makeReadstoreJobFunction, config, samples) return toil.start(root_job) else: 
toil.restart()
def main(): """ Computational Genomics Lab, Genomics Institute, UC Santa Cruz Toil exome pipeline Perform variant / indel analysis given a pair of tumor/normal BAM files. Samples are optionally preprocessed (indel realignment and base quality score recalibration) The output of this pipeline is a tarball containing results from MuTect, MuSe, and Pindel. General usage: 1. Type "toil-exome generate" to create an editable manifest and config in the current working directory. 2. Parameterize the pipeline by editing the config. 3. Fill in the manifest with information pertaining to your samples. 4. Type "toil-exome run [jobStore]" to execute the pipeline. Please read the README.md located in the source directory or at: https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/exome_variant_pipeline Structure of variant pipeline (per sample) 1 2 3 4 14 ------- | | | | | | 0 --------- 5 ----- 15 -------- 17 | | | --- 16 ------- | | 6 7 | | 8 9 | | 10 11 | | 12 13 0 = Start node 1 = reference index 2 = reference dict 3 = normal bam index 4 = tumor bam index 5 = pre-processing node / DAG declaration 6,7 = RealignerTargetCreator 8,9 = IndelRealigner 10,11 = BaseRecalibration 12,13 = PrintReads 14 = MuTect 15 = Pindel 16 = MuSe 17 = Consolidate Output and move/upload results ================================================== Dependencies Curl: apt-get install curl Docker: wget -qO- https://get.docker.com/ | sh Toil: pip install toil Boto: pip install boto (OPTIONAL) """ parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter) subparsers = parser.add_subparsers(dest='command') # Generate subparsers subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.') subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.') subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.') # Run subparser parser_run = subparsers.add_parser('run', help='Runs the CGL exome pipeline') parser_run.add_argument('--config', default='config-toil-exome.yaml', type=str, help='Path to the (filled in) config file, generated with "generate-config". ' '\nDefault value: "%(default)s"') parser_run.add_argument('--manifest', default='manifest-toil-exome.tsv', type=str, help='Path to the (filled in) manifest file, generated with "generate-manifest". ' '\nDefault value: "%(default)s"') parser_run.add_argument('--normal', default=None, type=str, help='URL for the normal BAM. URLs can take the form: http://, ftp://, file://, s3://, ' 'and gnos://. The UUID for the sample must be given with the "--uuid" flag.') parser_run.add_argument('--tumor', default=None, type=str, help='URL for the tumor BAM. URLs can take the form: http://, ftp://, file://, s3://, ' 'and gnos://. 
The UUID for the sample must be given with the "--uuid" flag.') parser_run.add_argument('--uuid', default=None, type=str, help='Provide the UUID of a sample when using the' '"--tumor" and "--normal" option') # If no arguments provided, print full help menu if len(sys.argv) == 1: parser.print_help() sys.exit(1) # Add Toil options Job.Runner.addToilOptions(parser_run) args = parser.parse_args() # Parse subparsers related to generation of config and manifest cwd = os.getcwd() if args.command == 'generate-config' or args.command == 'generate': generate_file(os.path.join(cwd, 'config-toil-exome.yaml'), generate_config) if args.command == 'generate-manifest' or args.command == 'generate': generate_file(os.path.join(cwd, 'manifest-toil-exome.tsv'), generate_manifest) # Pipeline execution elif args.command == 'run': require(os.path.exists(args.config), '{} not found. Please run ' '"toil-rnaseq generate-config"'.format(args.config)) if args.normal or args.tumor or args.uuid: require(args.normal and args.tumor and args.uuid, '"--tumor", "--normal" and "--uuid" must all be supplied') samples = [[args.uuid, args.normal, args.tumor]] else: samples = parse_manifest(args.manifest) # Parse config parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()} config = argparse.Namespace(**parsed_config) config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint # Exome pipeline sanity checks if config.preprocessing: require(config.reference and config.phase and config.mills and config.dbsnp, 'Missing inputs for preprocessing, check config file.') if config.run_mutect: require(config.reference and config.dbsnp and config.cosmic, 'Missing inputs for MuTect, check config file.') if config.run_pindel: require(config.reference, 'Missing input (reference) for Pindel.') if config.run_muse: require(config.reference and config.dbsnp, 'Missing inputs for MuSe, check config file.') require(config.output_dir, 'No output location specified: {}'.format(config.output_dir)) # Program checks for program in ['curl', 'docker']: require(next(which(program), None), program + ' must be installed on every node.'.format(program)) # Launch Pipeline Job.Runner.startToil(Job.wrapJobFn(download_shared_files, samples, config), args)
def setup_and_run_bwakit(job, uuid, url, rg_line, config, paired_url=None): """ Downloads and runs bwakit for BAM or FASTQ files :param JobFunctionWrappingJob job: passed automatically by Toil :param str uuid: Unique sample identifier :param str url: FASTQ or BAM file URL. BAM alignment URL must have .bam extension. :param Namespace config: Input parameters and shared FileStoreIDs Requires the following config attributes: config.genome_fasta FilesStoreID for reference genome fasta file config.genome_fai FilesStoreID for reference genome fasta index file config.cores Number of cores for each job config.trim If True, trim adapters using bwakit config.amb FileStoreID for BWA index file prefix.amb config.ann FileStoreID for BWA index file prefix.ann config.bwt FileStoreID for BWA index file prefix.bwt config.pac FileStoreID for BWA index file prefix.pac config.sa FileStoreID for BWA index file prefix.sa config.alt FileStoreID for alternate contigs file or None :param str|None paired_url: URL to paired FASTQ :param str|None rg_line: Read group line (i.e. @RG\tID:foo\tSM:bar) :return: BAM FileStoreID :rtype: str """ bwa_config = deepcopy(config) bwa_config.uuid = uuid bwa_config.rg_line = rg_line # bwa_alignment uses a different naming convention bwa_config.ref = config.genome_fasta bwa_config.fai = config.genome_fai # Determine if sample is a FASTQ or BAM file using the file extension basename, ext = os.path.splitext(url) ext = ext.lower() if ext == '.gz': _, ext = os.path.splitext(basename) ext = ext.lower() # The pipeline currently supports FASTQ and BAM files require(ext in ['.fq', '.fastq', '.bam'], 'Please use .fq or .bam file extensions:\n%s' % url) # Download fastq files samples = [] input1 = job.addChildJobFn(download_url_job, url, name='file1', s3_key_path=config.ssec, disk=config.file_size) samples.append(input1.rv()) # If the extension is for a BAM file, then configure bwakit to realign the BAM file. if ext == '.bam': bwa_config.bam = input1.rv() else: bwa_config.r1 = input1.rv() # Download the paired FASTQ URL if paired_url: input2 = job.addChildJobFn(download_url_job, paired_url, name='file2', s3_key_path=config.ssec, disk=config.file_size) samples.append(input2.rv()) bwa_config.r2 = input2.rv() # The bwakit disk requirement depends on the size of the input files and the index # Take the sum of the input files and scale it by a factor of 4 bwa_index_size = sum([getattr(config, index_file).size for index_file in ['amb', 'ann', 'bwt', 'pac', 'sa', 'alt'] if getattr(config, index_file, None) is not None]) bwakit_disk = PromisedRequirement(lambda lst, index_size: int(4 * sum(x.size for x in lst) + index_size), samples, bwa_index_size) return job.addFollowOnJobFn(run_bwakit, bwa_config, sort=False, # BAM files are sorted later in the pipeline trim=config.trim, cores=config.cores, disk=bwakit_disk).rv()
def gatk_germline_pipeline(job, samples, config): """ Runs the GATK best practices pipeline for germline SNP and INDEL discovery. Steps in Pipeline 0: Generate and preprocess BAM - Uploads processed BAM to output directory 1: Call Variants using HaplotypeCaller - Uploads GVCF 2: Genotype VCF - Uploads VCF 3: Filter Variants using either "hard filters" or VQSR - Uploads filtered VCF :param JobFunctionWrappingJob job: passed automatically by Toil :param list[GermlineSample] samples: List of GermlineSample namedtuples :param Namespace config: Input parameters and reference FileStoreIDs Requires the following config attributes: config.genome_fasta FilesStoreID for reference genome fasta file config.genome_fai FilesStoreID for reference genome fasta index file config.genome_dict FilesStoreID for reference genome sequence dictionary file config.cores Number of cores for each job config.xmx Java heap size in bytes config.suffix Suffix added to output filename config.output_dir URL or local path to output directory config.ssec Path to key file for SSE-C encryption config.joint_genotype If True, then joint genotype and filter cohort config.hc_output URL or local path to HaplotypeCaller output for testing :return: Dictionary of filtered VCF FileStoreIDs :rtype: dict """ require(len(samples) > 0, 'No samples were provided!') # Get total size of genome reference files. This is used for configuring disk size. genome_ref_size = config.genome_fasta.size + config.genome_fai.size + config.genome_dict.size # 0: Generate processed BAM and BAI files for each sample # group preprocessing and variant calling steps in empty Job instance group_bam_jobs = Job() gvcfs = {} for sample in samples: # 0: Generate processed BAM and BAI files for each sample get_bam = group_bam_jobs.addChildJobFn(prepare_bam, sample.uuid, sample.url, config, paired_url=sample.paired_url, rg_line=sample.rg_line) # 1: Generate per sample gvcfs {uuid: gvcf_id} # The HaplotypeCaller disk requirement depends on the input bam, bai, the genome reference # files, and the output GVCF file. The output GVCF is smaller than the input BAM file. hc_disk = PromisedRequirement(lambda bam, bai, ref_size: 2 * bam.size + bai.size + ref_size, get_bam.rv(0), get_bam.rv(1), genome_ref_size) get_gvcf = get_bam.addFollowOnJobFn(gatk_haplotype_caller, get_bam.rv(0), get_bam.rv(1), config.genome_fasta, config.genome_fai, config.genome_dict, annotations=config.annotations, cores=config.cores, disk=hc_disk, memory=config.xmx, hc_output=config.hc_output) # Store cohort GVCFs in dictionary gvcfs[sample.uuid] = get_gvcf.rv() # Upload individual sample GVCF before genotyping to a sample specific output directory vqsr_name = '{}{}.g.vcf'.format(sample.uuid, config.suffix) get_gvcf.addChildJobFn(output_file_job, vqsr_name, get_gvcf.rv(), os.path.join(config.output_dir, sample.uuid), s3_key_path=config.ssec, disk=PromisedRequirement(lambda x: x.size, get_gvcf.rv())) # VQSR requires many variants in order to train a decent model. GATK recommends a minimum of # 30 exomes or one large WGS sample: # https://software.broadinstitute.org/gatk/documentation/article?id=3225 filtered_vcfs = {} if config.joint_genotype: # Need to configure joint genotype in a separate function to resolve promises filtered_vcfs = group_bam_jobs.addFollowOnJobFn(joint_genotype_and_filter, gvcfs, config).rv() # If not joint genotyping, then iterate over cohort and genotype and filter individually. 
else: for uuid, gvcf_id in gvcfs.iteritems(): filtered_vcfs[uuid] = group_bam_jobs.addFollowOnJobFn(genotype_and_filter, {uuid: gvcf_id}, config).rv() job.addChild(group_bam_jobs) return filtered_vcfs
def main(): """ Computational Genomics Lab, Genomics Institute, UC Santa Cruz Toil BWA pipeline Alignment of fastq reads via BWA-kit General usage: 1. Type "toil-bwa generate" to create an editable manifest and config in the current working directory. 2. Parameterize the pipeline by editing the config. 3. Fill in the manifest with information pertaining to your samples. 4. Type "toil-bwa run [jobStore]" to execute the pipeline. Please read the README.md located in the source directory or at: https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/bwa_alignment Structure of the BWA pipeline (per sample) 0 --> 1 0 = Download sample 1 = Run BWA-kit =================================================================== :Dependencies: cURL: apt-get install curl Toil: pip install toil Docker: wget -qO- https://get.docker.com/ | sh Optional: S3AM: pip install --s3am (requires ~/.boto config file) Boto: pip install boto """ # Define Parser object and add to Toil parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter) subparsers = parser.add_subparsers(dest='command') # Generate subparsers subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.') subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.') subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.') # Run subparser parser_run = subparsers.add_parser('run', help='Runs the BWA alignment pipeline') group = parser_run.add_mutually_exclusive_group() parser_run.add_argument('--config', default='config-toil-bwa.yaml', type=str, help='Path to the (filled in) config file, generated with "generate-config".') group.add_argument('--manifest', default='manifest-toil-bwa.tsv', type=str, help='Path to the (filled in) manifest file, generated with "generate-manifest". ' '\nDefault value: "%(default)s".') group.add_argument('--sample', nargs='+', action=required_length(2, 3), help='Space delimited sample UUID and fastq files in the format: uuid url1 [url2].') # Print docstring help if no arguments provided if len(sys.argv) == 1: parser.print_help() sys.exit(1) Job.Runner.addToilOptions(parser_run) args = parser.parse_args() # Parse subparsers related to generation of config and manifest cwd = os.getcwd() if args.command == 'generate-config' or args.command == 'generate': generate_file(os.path.join(cwd, 'config-toil-bwa.yaml'), generate_config) if args.command == 'generate-manifest' or args.command == 'generate': generate_file(os.path.join(cwd, 'manifest-toil-bwa.tsv'), generate_manifest) # Pipeline execution elif args.command == 'run': require(os.path.exists(args.config), '{} not found. Please run generate-config'.format(args.config)) if not args.sample: args.sample = None require(os.path.exists(args.manifest), '{} not found and no sample provided. 
' 'Please run "generate-manifest"'.format(args.manifest)) # Parse config parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()} config = argparse.Namespace(**parsed_config) config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint samples = [args.sample[0], args.sample[1:]] if args.sample else parse_manifest(args.manifest) # Sanity checks require(config.ref, 'Missing URL for reference file: {}'.format(config.ref)) require(config.output_dir, 'No output location specified: {}'.format(config.output_dir)) # Launch Pipeline Job.Runner.startToil(Job.wrapJobFn(download_reference_files, config, samples), args)
def archiveBatchAndUploadToFileStore(parent_job, batch, workdir):
    tarname = "%s.tmp" % uuid.uuid4().hex
    tarpath = os.path.join(workdir, tarname)
    tarball_files(tar_name=tarname, file_paths=batch, output_dir=workdir)
    require(os.path.exists(tarpath), "[archiveBatchAndUploadToFileStore]Didn't make smaller tar")
    return parent_job.fileStore.writeGlobalFile(tarpath)
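A hedged usage sketch of the helper above. The job function name, file list, and batch size below are illustrative only; the real pipeline decides its own batching:

def stageFileBatches(job, file_paths, batch_size=100):
    # Illustrative only: archive local files in fixed-size batches and collect
    # the FileStoreIDs returned by archiveBatchAndUploadToFileStore.
    workdir = job.fileStore.getLocalTempDir()
    file_store_ids = []
    for start in range(0, len(file_paths), batch_size):
        batch = file_paths[start:start + batch_size]
        file_store_ids.append(archiveBatchAndUploadToFileStore(job, batch, workdir))
    return file_store_ids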
def main(): """ Computational Genomics Lab, Genomics Institute, UC Santa Cruz MarginPhase pipeline ======================================= Dependencies Curl: apt-get install curl Docker: wget -qO- https://get.docker.com/ | sh Toil: pip install toil Boto: pip install boto (OPTIONAL) """ parser = argparse.ArgumentParser( description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter) subparsers = parser.add_subparsers(dest='command') # Generate subparsers subparsers.add_parser( 'generate-config', help='Generates an editable config in the current working directory.') subparsers.add_parser( 'generate-manifest', help='Generates an editable manifest in the current working directory.' ) subparsers.add_parser( 'generate', help='Generates a config and manifest in the current working directory.' ) # Run subparser parser_run = subparsers.add_parser('run', help='Runs the MarginPhase pipeline') group = parser_run.add_mutually_exclusive_group() parser_run.add_argument( '--config', default=DEFAULT_CONFIG_NAME, type=str, help= 'Path to the (filled in) config file, generated with "generate-config". ' '\nDefault value: "%(default)s"') group.add_argument( '--manifest', default=DEFAULT_MANIFEST_NAME, type=str, help= 'Path to the (filled in) manifest file, generated with "generate-manifest". ' '\nDefault value: "%(default)s"') # If no arguments provided, print full help menu if len(sys.argv) == 1: parser.print_help() sys.exit(1) # Add Toil options Job.Runner.addToilOptions(parser_run) args = parser.parse_args() # Parse subparsers related to generation of config and manifest cwd = os.getcwd() if args.command == 'generate-config' or args.command == 'generate': generate_file(os.path.join(cwd, DEFAULT_CONFIG_NAME), generate_config) if args.command == 'generate-manifest' or args.command == 'generate': generate_file(os.path.join(cwd, DEFAULT_MANIFEST_NAME), generate_manifest) # Pipeline execution elif args.command == 'run': # sanity check require( os.path.exists(args.config), '{} not found. Please run ' '"toil-marginphase generate-config"'.format(args.config)) require( os.path.exists(args.manifest), '{} not found and no samples provided. 
Please ' 'run "toil-marginphase generate-manifest"'.format(args.manifest)) # Parse config parsed_config = { x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems() } config = argparse.Namespace(**parsed_config) config.maxCores = int(args.maxCores) if args.maxCores else sys.maxsize config.defaultCores = int(min(MP_CPU, config.maxCores)) config.maxDisk = int(args.maxDisk) if args.maxDisk else sys.maxint config.maxMemory = sys.maxint # fix parsing of GB to int if args.maxMemory: args.maxMemory = args.maxMemory.upper() if args.maxMemory.endswith('B'): args.maxMemory = args.maxMemory.rstrip('B') # actual parsing if args.maxMemory.endswith('G'): config.maxMemory = int( args.maxMemory.rstrip('G')) * 1024 * 1024 * 1024 elif args.maxMemory.endswith('M'): config.maxMemory = int( args.maxMemory.rstrip('M')) * 1024 * 1024 elif args.maxMemory.endswith('K'): config.maxMemory = int(args.maxMemory.rstrip('K')) * 1024 else: config.maxMemory = int(args.maxMemory) # Config sanity checks require(config.output_dir, 'No output location specified') if urlparse(config.output_dir).scheme != "s3": config.output_dir = config.output_dir.replace("file://", "", 1) mkdir_p(config.output_dir) if not config.output_dir.endswith('/'): config.output_dir += '/' require(config.partition_size, "Configuration parameter partition-size is required") require(config.partition_margin, "Configuration parameter partition-margin is required") if 'save_intermediate_files' not in config or not config.save_intermediate_files: config.intermediate_file_location = None elif urlparse(config.output_dir).scheme == "s3": raise UserError( "Config parameter 'save_intermediate_files' cannot be used with s3 output directory" ) else: intermediate_location = os.path.join( config.output_dir, "intermediate", datetime.datetime.now().strftime("%Y%m%d_%H%M%S")) mkdir_p(intermediate_location) config.intermediate_file_location = intermediate_location if "margin_phase_image" not in config or len( config.margin_phase_image) == 0: config.margin_phase_image = DOCKER_MARGIN_PHASE_IMG_DEFAULT if "margin_phase_tag" not in config or len( config.margin_phase_tag) == 0: config.margin_phase_tag = DOCKER_MARGIN_PHASE_TAG_DEFAULT if "cpecan_image" not in config or len(config.cpecan_image) == 0: config.cpecan_image = DOCKER_CPECAN_IMG_DEFAULT if "cpecan_tag" not in config or len(config.cpecan_tag) == 0: config.cpecan_tag = DOCKER_CPECAN_TAG_DEFAULT if "unittest" not in config: config.unittest = False if "minimal_output" not in config: config.minimal_output = False if "minimal_cpecan_output" not in config: config.minimal_cpecan_output = False if "cpecan_probabilities" not in config: config.cpecan_probabilities = False # get samples samples = parse_samples(config, args.manifest) # Program checks for program in ['docker']: require( next(which(program), None), program + ' must be installed on every node.'.format(program)) # Start the workflow Job.Runner.startToil( Job.wrapJobFn(map_job, prepare_input, samples, config), args)
def main(): """ GATK germline pipeline with variant filtering and annotation. """ # Define Parser object and add to jobTree parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter) # Generate subparsers subparsers = parser.add_subparsers(dest='command') subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.') subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.') subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.') # Run subparser parser_run = subparsers.add_parser('run', help='Runs the GATK germline pipeline') parser_run.add_argument('--config', required=True, type=str, help='Path to the (filled in) config file, generated with ' '"generate-config".') parser_run.add_argument('--manifest', type=str, help='Path to the (filled in) manifest file, generated with ' '"generate-manifest".\nDefault value: "%(default)s".') parser_run.add_argument('--sample', default=None, nargs=2, type=str, help='Input sample identifier and BAM file URL or local path') parser_run.add_argument('--output-dir', default=None, help='Path/URL to output directory') parser_run.add_argument('-s', '--suffix', default=None, help='Additional suffix to add to the names of the output files') parser_run.add_argument('--preprocess-only', action='store_true', help='Only runs preprocessing steps') Job.Runner.addToilOptions(parser_run) options = parser.parse_args() cwd = os.getcwd() if options.command == 'generate-config' or options.command == 'generate': generate_file(os.path.join(cwd, 'config-toil-germline.yaml'), generate_config) if options.command == 'generate-manifest' or options.command == 'generate': generate_file(os.path.join(cwd, 'manifest-toil-germline.tsv'), generate_manifest) elif options.command == 'run': # Program checks for program in ['curl', 'docker']: require(next(which(program)), program + ' must be installed on every node.'.format(program)) require(os.path.exists(options.config), '{} not found. 
Please run "generate-config"'.format(options.config)) # Read sample manifest samples = [] if options.manifest: samples.extend(parse_manifest(options.manifest)) # Add BAM sample from command line if options.sample: uuid, url = options.sample # samples tuple: (uuid, url, paired_url, rg_line) # BAM samples should not have as paired URL or read group line samples.append(GermlineSample(uuid, url, None, None)) require(len(samples) > 0, 'No samples were detected in the manifest or on the command line') # Parse inputs inputs = {x.replace('-', '_'): y for x, y in yaml.load(open(options.config).read()).iteritems()} required_fields = {'genome_fasta', 'output_dir', 'run_bwa', 'sorted', 'snp_filter_annotations', 'indel_filter_annotations', 'preprocess', 'preprocess_only', 'run_vqsr', 'joint_genotype', 'run_oncotator', 'cores', 'file_size', 'xmx', 'suffix'} input_fields = set(inputs.keys()) require(input_fields > required_fields, 'Missing config parameters:\n{}'.format(', '.join(required_fields - input_fields))) if inputs['output_dir'] is None: inputs['output_dir'] = options.output_dir require(inputs['output_dir'] is not None, 'Missing output directory PATH/URL') if inputs['suffix'] is None: inputs['suffix'] = options.suffix if options.suffix else '' if inputs['preprocess_only'] is None: inputs['preprocess_only'] = options.preprocess_only if inputs['run_vqsr']: # Check that essential VQSR parameters are present vqsr_fields = {'g1k_snp', 'mills', 'dbsnp', 'hapmap', 'omni'} require(input_fields > vqsr_fields, 'Missing parameters for VQSR:\n{}'.format(', '.join(vqsr_fields - input_fields))) # Check that hard filtering parameters are present. If only running preprocessing steps, then we do # not need filtering information. elif not inputs['preprocess_only']: hard_filter_fields = {'snp_filter_name', 'snp_filter_expression', 'indel_filter_name', 'indel_filter_expression'} require(input_fields > hard_filter_fields, 'Missing parameters for hard filtering:\n{}'.format(', '.join(hard_filter_fields - input_fields))) # Check for falsey hard filtering parameters for hard_filter_field in hard_filter_fields: require(inputs[hard_filter_field], 'Missing %s value for hard filtering, ' 'got %s.' % (hard_filter_field, inputs[hard_filter_field])) # Set resource parameters inputs['xmx'] = human2bytes(inputs['xmx']) inputs['file_size'] = human2bytes(inputs['file_size']) inputs['cores'] = int(inputs['cores']) inputs['annotations'] = set(inputs['snp_filter_annotations'] + inputs['indel_filter_annotations']) # HaplotypeCaller test data for testing inputs['hc_output'] = inputs.get('hc_output', None) # It is a toil-scripts convention to store input parameters in a Namespace object config = argparse.Namespace(**inputs) root = Job.wrapJobFn(run_gatk_germline_pipeline, samples, config) Job.Runner.startToil(root, options)
def generate_file(file_path, generate_func):
    require(not os.path.exists(file_path), file_path + ' already exists!')
    with open(file_path, 'w') as f:
        f.write(generate_func())
    print('\t{} has been generated in the current working directory.'.format(os.path.basename(file_path)))
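For illustration, generate_file can be driven by any zero-argument function that returns the file contents as a string. The file name and YAML body below are placeholders, not values from any of the pipelines above:

import os

def generate_example_config():
    # Placeholder content; the real pipelines supply their own generate_config().
    return 'output-dir: /tmp/example-output\nref: s3://example-bucket/example.fa\n'

generate_file(os.path.join(os.getcwd(), 'config-example.yaml'), generate_example_config)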