def main(logger):
    # get the argument parser and parse args
    parser = get_parser()
    args = parser.parse_args()

    # use the logger
    logger.info('Attempting to echo the message...')

    # run a subprocess and log the attempt
    log_command(logger, 'echo {}'.format(args.message), shell=True)
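# The scripts below all rely on a `log_command` helper (imported directly here
# and as `ut_log.log_command` in later modules). Its implementation isn't shown
# in this section; a minimal sketch consistent with how it is called -- it joins
# list commands for shell=True, streams output to the logger, and returns a
# truthy "failed" flag (some older scripts instead expect it to raise
# subprocess.CalledProcessError) -- might look like this:
import subprocess


def log_command(logger, command, **kwargs):
    """Run `command`, log its output, and return True if it failed."""
    if isinstance(command, list):
        # shell=True expects a single command string
        command = " ".join(command)

    logger.info(command)

    kwargs.setdefault("stdout", subprocess.PIPE)
    kwargs.setdefault("stderr", subprocess.STDOUT)
    kwargs.setdefault("universal_newlines", True)

    proc = subprocess.run(command, **kwargs)
    if proc.stdout:
        for line in proc.stdout.splitlines():
            logger.debug(line)

    return proc.returncode != 0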
def run_htseq(dest_dir, sjdb_gtf, id_attr, logger):
    command = [
        HTSEQ,
        "-r", "name",
        "-s", "no",
        "-f", "bam",
        f"--idattr={id_attr}",
        "-m", "intersection-nonempty",
        os.path.join(dest_dir, "results", "Pass1",
                     "Aligned.out.sorted-byname.bam"),
        sjdb_gtf,
        ">", "htseq-count.txt",
    ]

    failed = ut_log.log_command(
        logger,
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results"),
    )

    return failed
def run_sample(
    sample_key,
    mask_path,
    gtf_path,
    s3_input_bucket,
    s3_output_bucket,
    s3_output_prefix,
    run_dir,
    logger,
):
    t_config = TransferConfig(num_download_attempts=25)

    sample_name = os.path.basename(sample_key)
    sample_id = sample_name.split(".")[0]  # this is brittle!

    local_sample = os.path.join(run_dir, "input", sample_name)
    s3c.download_file(
        Bucket=s3_input_bucket,
        Key=sample_key,
        Filename=local_sample,
        Config=t_config,
    )

    velocyto_command = [
        "velocyto", "run-smartseq2",
        "-o", run_dir,
        "-m", mask_path,
        "-e", sample_id,
        local_sample,
        gtf_path,
    ]

    if ut_log.log_command(
        logger,
        velocyto_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
    ):
        logger.info(f"velocyto failed on {sample_id}")
        os.remove(local_sample)
        return

    output_file = os.path.join(run_dir, f"{sample_id}.loom")

    logger.info("Uploading {}".format(output_file))
    time.sleep(10)

    s3c.upload_file(
        Filename=output_file,
        Bucket=s3_output_bucket,
        Key=os.path.join(s3_output_prefix, f"{sample_id}.loom"),
        Config=t_config,
    )

    os.remove(local_sample)
    os.remove(output_file)
def run_htseq(dest_dir, sjdb_gtf, id_attr, logger):
    """ Run an htseq-count job on the STAR alignment output.

        dest_dir - Path local to the EC2 machine under which alignment
                   results are stored before being uploaded to S3.
                   Child path of run_dir/sample_name
        sjdb_gtf - Path of the reference genome .gtf file used to detect
                   splice junctions
         id_attr - GTF attribute that determines row naming in the count
                   file; differs between genomes
          logger - Logger object that exposes the interface the code
                   directly uses

        Returns FAILED, a boolean that is True if the htseq-count run failed
    """
    htseq_command = [
        HTSEQ,
        "-r", "name",
        "-s", "no",
        "-f", "bam",
        f"--idattr={id_attr}",
        "-m", "intersection-nonempty",
        os.path.join(dest_dir, "results", "Pass1",
                     "Aligned.out.sorted-byname.bam"),
        sjdb_gtf,
        ">", "htseq-count.txt",
    ]

    failed = ut_log.log_command(
        logger,
        htseq_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results"),
    )

    return failed
def main(logger):
    parser = get_parser()
    args = parser.parse_args()

    if os.environ.get('AWS_BATCH_JOB_ID'):
        args.root_dir = os.path.join(args.root_dir,
                                     os.environ['AWS_BATCH_JOB_ID'])

    # local directories
    sample_id = os.path.basename(args.s3_input_dir)
    result_path = os.path.join(args.root_dir, 'data', 'hca', sample_id)
    fastq_path = os.path.join(result_path, 'fastqs')
    os.makedirs(fastq_path)

    genome_base_dir = os.path.join(args.root_dir, "genome", "cellranger")
    os.makedirs(genome_base_dir)

    if args.taxon == 'homo':
        genome_name = 'HG38-PLUS'
    elif args.taxon == 'mus':
        genome_name = 'MM10-PLUS'
    else:
        raise ValueError("unknown taxon {}".format(args.taxon))

    # files that should be uploaded outside of the massive tgz;
    # paths should be relative to the run folder
    files_to_upload = [
        'outs/raw_gene_bc_matrices_h5.h5',
        'outs/raw_gene_bc_matrices/{}/genes.tsv'.format(genome_name),
        'outs/raw_gene_bc_matrices/{}/barcodes.tsv'.format(genome_name),
        'outs/raw_gene_bc_matrices/{}/matrix.mtx'.format(genome_name),
        'outs/web_summary.html',
        'outs/metrics_summary.csv'
    ]

    genome_tar_source = os.path.join('s3://czi-hca/ref-genome/cellranger/',
                                     genome_name + '.tgz')
    genome_dir = os.path.join(genome_base_dir, genome_name)

    # download the ref genome data
    command = ['aws', 's3', 'cp', '--quiet',
               genome_tar_source, genome_base_dir]
    log_command(logger, command, shell=True)

    genome_tar_file = os.path.basename(genome_tar_source)
    logger.debug('Extracting {}'.format(genome_tar_file))
    with tarfile.open(os.path.join(genome_base_dir, genome_tar_file)) as tf:
        tf.extractall(path=genome_base_dir)

    sys.stdout.flush()

    # download the fastq files
    command = ['aws', 's3', 'cp', '--no-progress', '--recursive',
               '--force-glacier-transfer' if args.glacier else '',
               args.s3_input_dir, fastq_path]
    log_command(logger, command, shell=True)

    # Run cellranger
    os.chdir(result_path)
    command = [CELLRANGER, 'count',
               '--localmem=240',
               '--nosecondary',
               '--disable-ui',
               '--expect-cells={}'.format(args.cell_count),
               '--id={}'.format(sample_id),
               '--fastqs={}'.format(fastq_path),
               '--transcriptome={}'.format(genome_dir)]
    log_command(logger, command, shell=True,
                stderr=subprocess.STDOUT, universal_newlines=True)

    # Move results (web summary, cell-gene table, tarball) back to S3
    for file_name in files_to_upload:
        command = ['aws', 's3', 'cp', '--quiet',
                   os.path.join(result_path, sample_id, file_name),
                   '{}/'.format(args.s3_output_dir)]
        for i in range(S3_RETRY):
            try:
                log_command(logger, command, shell=True)
                break
            except subprocess.CalledProcessError:
                logger.info("retrying cp {}".format(file_name))
        else:
            raise RuntimeError("couldn't sync {}".format(file_name))

    command = ['tar', 'cvzf',
               '{}.tgz'.format(os.path.join(result_path, sample_id)),
               sample_id]
    log_command(logger, command, shell=True)

    command = ['aws', 's3', 'cp', '--quiet',
               '{}.tgz'.format(os.path.join(result_path, sample_id)),
               '{}/'.format(args.s3_output_dir)]
    for i in range(S3_RETRY):
        try:
            log_command(logger, command, shell=True)
            break
        except subprocess.CalledProcessError:
            logger.info("retrying cp {}.tgz".format(sample_id))
    else:
        raise RuntimeError("couldn't sync {}.tgz".format(sample_id))
def main(logger):
    """ Download reference genome, run alignment jobs, and upload results to S3.

        logger - Logger object that exposes the interface the code directly uses
    """
    parser = get_parser()
    args = parser.parse_args()

    if os.environ.get("AWS_BATCH_JOB_ID"):
        root_dir = os.path.join("/mnt", os.environ["AWS_BATCH_JOB_ID"])
    else:
        root_dir = "/mnt"

    # local directories
    if args.s3_input_path.endswith("/"):
        args.s3_input_path = args.s3_input_path[:-1]

    run_dir = os.path.join(root_dir, "data")
    os.makedirs(run_dir)

    # check if the input genome and region are valid
    if args.taxon in reference_genomes:
        if args.taxon in deprecated:
            logger.warning(
                f"The name '{args.taxon}' will be removed in the future,"
                f" start using '{deprecated[args.taxon]}'")
        genome_name = reference_genomes[args.taxon]
    else:
        raise ValueError(f"unknown taxon {args.taxon}")

    if args.taxon == "gencode.vM19" or args.taxon == "gencode.vM19.ERCC":
        id_attr = "gene_name"
    else:
        id_attr = "gene_id"

    genome_dir = os.path.join(root_dir, "genome", "STAR", genome_name)
    ref_genome_star_file = f"STAR/{genome_name}.tgz"
    sjdb_gtf = os.path.join(root_dir, f"{genome_name}.gtf")

    if args.region != "west" and genome_name not in ("HG38-PLUS", "MM10-PLUS"):
        raise ValueError(f"you must use --region west for {genome_name}")

    if args.region == "east":
        ref_genome_star_file = os.path.join("ref-genome", ref_genome_star_file)

    s3_input_bucket, s3_input_prefix = s3u.s3_bucket_and_key(
        args.s3_input_path)

    logger.info(
        f"""Run Info: partition {args.partition_id} out of {args.num_partitions}
                 genome_dir:\t{genome_dir}
       ref_genome_star_file:\t{ref_genome_star_file}
                   sjdb_gtf:\t{sjdb_gtf}
                    id_attr:\t{id_attr}
                      taxon:\t{args.taxon}
              s3_input_path:\t{args.s3_input_path}""")

    s3 = boto3.resource("s3")

    # download the reference genome data
    os.mkdir(os.path.join(root_dir, "genome"))

    logger.info("Downloading and extracting gtf data {}".format(sjdb_gtf))
    s3c.download_file(
        Bucket=S3_REFERENCE["west"],  # just always download this from us-west-2
        Key=f"velocyto/{genome_name}.gtf",
        Filename=sjdb_gtf,
    )

    os.mkdir(os.path.join(root_dir, "genome", "STAR"))
    logger.info("Downloading and extracting STAR data {}".format(
        ref_genome_star_file))

    s3_object = s3.Object(S3_REFERENCE[args.region], ref_genome_star_file)

    with tarfile.open(fileobj=s3_object.get()["Body"], mode="r|gz") as tf:
        tf.extractall(path=os.path.join(root_dir, "genome", "STAR"))

    # Load Genome Into Memory
    command = [STAR, "--genomeDir", genome_dir, "--genomeLoad", "LoadAndExit"]
    if ut_log.log_command(logger, command,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          shell=True):
        raise RuntimeError("Failed to load genome into memory")

    sample_re = re.compile(r"([^/]+)_R\d(?:_\d+)?\.fastq\.gz$")

    s3_output_bucket, s3_output_prefix = s3u.s3_bucket_and_key(
        args.s3_output_path)

    logger.info("Running partition {} of {}".format(args.partition_id,
                                                    args.num_partitions))

    # Check the output folder for existing runs
    if not args.force_realign:
        output = s3u.prefix_gen(s3_output_bucket, s3_output_prefix,
                                lambda r: (r["LastModified"], r["Key"]))
    else:
        output = []

    output_files = {
        tuple(os.path.basename(fn).rsplit(".", 2)[0].split(".", 1)[:2])
        for dt, fn in output
        if fn.endswith(".htseq-count.txt") and dt > CURR_MIN_VER
    }

    logger.info("Skipping {} existing results".format(len(output_files)))

    sample_files = [
        (fn, s) for fn, s in s3u.get_size(s3_input_bucket, s3_input_prefix)
        if fn.endswith("fastq.gz")
    ]

    sample_lists = defaultdict(list)
    sample_sizes = defaultdict(list)

    for fn, s in sample_files:
        matched = sample_re.search(os.path.basename(fn))
        if matched:
            sample_lists[matched.group(1)].append(fn)
            sample_sizes[matched.group(1)].append(s)

    logger.info(f"number of samples: {len(sample_lists)}")

    for sample_name in sorted(
            sample_lists)[args.partition_id::args.num_partitions]:
        if (sample_name, args.taxon) in output_files:
            logger.debug(f"{sample_name} already exists, skipping")
            continue

        if sum(sample_sizes[sample_name]) < args.min_size:
            logger.info(f"{sample_name} is below min_size, skipping")
            continue

        failed, dest_dir = run_sample(
            s3_input_bucket,
            sample_name,
            sorted(sample_lists[sample_name]),
            genome_dir,
            run_dir,
            args.star_proc,
            logger,
        )

        failed = failed or run_htseq(dest_dir, sjdb_gtf, id_attr, logger)

        if not failed:
            upload_results(sample_name, args.taxon, dest_dir,
                           args.s3_output_path, logger)

        command = ["rm", "-rf", dest_dir]
        ut_log.log_command(logger, command, shell=True)

        time.sleep(30)

    logger.info("Job completed")
def run_sample(s3_input_bucket, sample_name, sample_fns, genome_dir, run_dir,
               star_proc, logger):
    """ Run an alignment job with STAR.

        s3_input_bucket - Name of the bucket with input fastq files to align
            sample_name - Sequenced sample name (joined by "_")
             sample_fns - Sample file names. Each is sample_name, "_R1_" or
                          "_R2_", a number, and ".fastq.gz" concatenated
             genome_dir - Path to the reference genome
                run_dir - Path local to the EC2 machine under which alignment
                          results are stored before being uploaded to S3
              star_proc - Number of processes to give to each STAR run
                 logger - Logger object that exposes the interface the code
                          directly uses

        Returns two values. FAILED is a boolean that is True if the alignment
        run failed. DEST_DIR is the path under which the STAR results are
        stored; it is local to the EC2 machine running the alignment, and the
        results are copied from there to S3 later.
    """
    t_config = TransferConfig(use_threads=False, num_download_attempts=25)

    dest_dir = os.path.join(run_dir, sample_name)
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
        os.mkdir(os.path.join(dest_dir, "rawdata"))
        os.mkdir(os.path.join(dest_dir, "results"))
        os.mkdir(os.path.join(dest_dir, "results", "Pass1"))

    for sample_fn in sample_fns:
        s3c.download_file(
            Bucket=s3_input_bucket,
            Key=sample_fn,
            Filename=os.path.join(dest_dir, os.path.basename(sample_fn)),
            Config=t_config,
        )

    # start running STAR
    # gather the input files first
    reads = sorted(
        os.path.join(dest_dir, os.path.basename(sample_fn))
        for sample_fn in sample_fns)

    input_command = COMMON_PARS[:]
    input_command.extend((
        "--runThreadN", str(star_proc),
        "--genomeDir", genome_dir,
        "--readFilesIn", " ".join(reads),
    ))
    failed = ut_log.log_command(
        logger,
        input_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results", "Pass1"),
    )

    # sort the aligned bam with samtools
    sample_command = [
        SAMTOOLS, "sort",
        "-m", "6000000000",
        "-o", "./Pass1/Aligned.out.sorted.bam",
        "./Pass1/Aligned.out.bam",
    ]
    failed = failed or ut_log.log_command(
        logger,
        sample_command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        cwd=os.path.join(dest_dir, "results"),
    )

    # index the sorted bam (samtools index -b)
    sample_index_command = [SAMTOOLS, "index", "-b", "Aligned.out.sorted.bam"]
    failed = failed or ut_log.log_command(
        logger,
        sample_index_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results", "Pass1"),
    )

    # sort by name to generate input for htseq-count
    output_command = [
        SAMTOOLS, "sort",
        "-m", "6000000000",
        "-n",
        "-o", "./Pass1/Aligned.out.sorted-byname.bam",
        "./Pass1/Aligned.out.sorted.bam",
    ]
    failed = failed or ut_log.log_command(
        logger,
        output_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results"),
    )

    return failed, dest_dir
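# COMMON_PARS is referenced above but not defined in this section. It is
# assumed to hold the STAR arguments shared by every run. A hypothetical
# sketch (these are real STAR flags, but the values here are illustrative,
# not the repo's actual settings; STAR is the binary-path constant used above):
COMMON_PARS = [
    STAR,
    "--outFilterType", "BySJout",      # filter splice junctions by output
    "--outSAMtype", "BAM", "Unsorted",  # emit Aligned.out.bam for samtools sort
    "--readFilesCommand", "zcat",       # inputs are gzipped fastqs
    "--genomeLoad", "LoadAndKeep",      # reuse the genome loaded into memory
]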
def main(logger):
    parser = get_parser()
    args = parser.parse_args()

    if os.environ.get("AWS_BATCH_JOB_ID"):
        args.root_dir = os.path.join(args.root_dir,
                                     os.environ["AWS_BATCH_JOB_ID"])

    if args.sample_sheet_name is None:
        args.sample_sheet_name = "{}.csv".format(args.exp_id)

    # local directories
    result_path = os.path.join(args.root_dir, "data", "hca", args.exp_id)
    bcl_path = os.path.join(result_path, "bcl")
    output_path = os.path.join(result_path, "fastqs")

    os.makedirs(result_path)
    os.mkdir(bcl_path)

    # download sample sheet
    command = [
        "aws", "s3", "cp", "--quiet",
        os.path.join(args.s3_sample_sheet_dir, args.sample_sheet_name),
        result_path,
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying s3 copy")
    else:
        raise RuntimeError("couldn't download sample sheet {}".format(
            os.path.join(args.s3_sample_sheet_dir, args.sample_sheet_name)))

    # download the bcl files
    command = [
        "aws", "s3", "sync", "--quiet",
        os.path.join(args.s3_input_dir, args.exp_id),
        bcl_path,
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying s3 sync bcl")
    else:
        raise RuntimeError("couldn't sync {}".format(
            os.path.join(args.s3_input_dir, args.exp_id)))

    # Run cellranger mkfastq
    command = [
        CELLRANGER, "mkfastq",
        "--localmem=60",
        "--sample-sheet={}".format(
            os.path.join(result_path, args.sample_sheet_name)),
        "--run={}".format(bcl_path),
        "--output-dir={}".format(output_path),
    ]
    if log_command(logger, command,
                   stdout=subprocess.PIPE,
                   stderr=subprocess.STDOUT,
                   shell=True):
        raise RuntimeError("cellranger mkfastq failed")

    # upload fastq files to destination folder
    command = [
        "aws", "s3", "sync", "--quiet",
        output_path,
        os.path.join(args.s3_output_dir, args.exp_id),
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying sync fastq")
    else:
        raise RuntimeError("couldn't sync fastqs")
def main(logger):
    parser = get_parser()
    args = parser.parse_args()

    if os.environ.get('AWS_BATCH_JOB_ID'):
        args.root_dir = os.path.join(args.root_dir,
                                     os.environ['AWS_BATCH_JOB_ID'])

    if args.sample_sheet_name is None:
        args.sample_sheet_name = '{}.csv'.format(args.exp_id)

    # local directories
    result_path = os.path.join(args.root_dir, 'data', 'hca', args.exp_id)
    bcl_path = os.path.join(result_path, 'bcl')
    output_path = os.path.join(result_path, 'fastqs')

    os.makedirs(result_path)
    os.mkdir(bcl_path)

    # download sample sheet
    command = ['aws', 's3', 'cp', '--quiet',
               os.path.join(args.s3_sample_sheet_dir, args.sample_sheet_name),
               result_path]
    for i in range(S3_RETRY):
        try:
            log_command(logger, command, shell=True)
            break
        except subprocess.CalledProcessError:
            logger.info("retrying s3 copy")
    else:
        raise RuntimeError("couldn't download sample sheet {}".format(
            os.path.join(args.s3_sample_sheet_dir, args.sample_sheet_name)))

    # download the bcl files
    command = ['aws', 's3', 'sync', '--quiet',
               os.path.join(args.s3_input_dir, args.exp_id),
               bcl_path]
    for i in range(S3_RETRY):
        try:
            log_command(logger, command, shell=True)
            break
        except subprocess.CalledProcessError:
            logger.info("retrying s3 sync bcl")
    else:
        raise RuntimeError("couldn't sync {}".format(
            os.path.join(args.s3_input_dir, args.exp_id)))

    # Run cellranger mkfastq
    command = [CELLRANGER, 'mkfastq',
               '--localmem=60',
               '--sample-sheet={}'.format(
                   os.path.join(result_path, args.sample_sheet_name)),
               '--run={}'.format(bcl_path),
               '--output-dir={}'.format(output_path)]
    log_command(logger, command, shell=True)

    # upload fastq files to destination folder
    command = ['aws', 's3', 'sync', '--quiet',
               output_path,
               os.path.join(args.s3_output_dir, args.exp_id)]
    for i in range(S3_RETRY):
        try:
            log_command(logger, command, shell=True)
            break
        except subprocess.CalledProcessError:
            logger.info("retrying sync fastq")
    else:
        raise RuntimeError("couldn't sync fastqs")
def main(logger):
    parser = get_parser()
    args = parser.parse_args()

    if os.environ.get('AWS_BATCH_JOB_ID'):
        root_dir = os.path.join('/mnt', os.environ['AWS_BATCH_JOB_ID'])
    else:
        root_dir = '/mnt'

    run_dir = os.path.join(root_dir, 'data', 'hca')
    os.makedirs(run_dir)

    if args.taxon == 'homo':
        genome_dir = os.path.join(root_dir, "genome/STAR/HG38-PLUS/")
        ref_genome_file = 'hg38-plus.tgz'
        ref_genome_star_file = 'STAR/HG38-PLUS.tgz'
        sjdb_gtf = os.path.join(root_dir, 'genome', 'hg38-plus',
                                'hg38-plus.gtf')
    elif args.taxon == 'mus':
        genome_dir = os.path.join(root_dir, "genome/STAR/MM10-PLUS/")
        ref_genome_file = 'mm10-plus.tgz'
        ref_genome_star_file = 'STAR/MM10-PLUS.tgz'
        sjdb_gtf = os.path.join(root_dir, 'genome', 'mm10-plus',
                                'mm10-plus.gtf')
    else:
        raise ValueError('Invalid taxon {}'.format(args.taxon))

    if args.star_proc > mp.cpu_count():
        raise ValueError('Not enough CPUs to give {} processes to STAR'.format(
            args.star_proc))

    s3_input_bucket, s3_input_prefix = s3u.s3_bucket_and_key(
        args.s3_input_path)

    logger.info(
        '''Run Info: partition {} out of {}
                  star_proc:\t{}
                 htseq_proc:\t{}
                 genome_dir:\t{}
            ref_genome_file:\t{}
       ref_genome_star_file:\t{}
                   sjdb_gtf:\t{}
                      taxon:\t{}
              s3_input_path:\t{}
                 input_dirs:\t{}'''.format(
            args.partition_id, args.num_partitions,
            args.star_proc, args.htseq_proc,
            genome_dir, ref_genome_file, ref_genome_star_file,
            sjdb_gtf, args.taxon, args.s3_input_path,
            ', '.join(args.input_dirs)))

    s3 = boto3.resource('s3')

    # download the genome data
    os.mkdir(os.path.join(root_dir, 'genome'))
    logger.info('Downloading and extracting genome data {}'.format(
        ref_genome_file))

    s3_object = s3.Object('czbiohub-reference', ref_genome_file)

    with tarfile.open(fileobj=s3_object.get()['Body'], mode='r|gz') as tf:
        tf.extractall(path=os.path.join(root_dir, 'genome'))

    # download STAR stuff
    os.mkdir(os.path.join(root_dir, 'genome', 'STAR'))
    logger.info('Downloading and extracting STAR data {}'.format(
        ref_genome_star_file))

    s3_object = s3.Object('czbiohub-reference', ref_genome_star_file)

    with tarfile.open(fileobj=s3_object.get()['Body'], mode='r|gz') as tf:
        tf.extractall(path=os.path.join(root_dir, 'genome', 'STAR'))

    # Load Genome Into Memory
    command = [STAR, '--genomeDir', genome_dir, '--genomeLoad', 'LoadAndExit']
    ut_log.log_command(logger, command, shell=True)

    log_queue, log_thread = ut_log.get_thread_logger(logger)

    star_queue = mp.Queue()
    htseq_queue = mp.Queue()

    n_star_procs = mp.cpu_count() // args.star_proc

    star_args = (star_queue, htseq_queue, log_queue, s3_input_bucket,
                 genome_dir, run_dir, args.star_proc)
    star_procs = [mp.Process(target=run_sample, args=star_args)
                  for i in range(n_star_procs)]

    for p in star_procs:
        p.start()

    htseq_args = (htseq_queue, log_queue, args.s3_input_path,
                  args.s3_output_path, args.taxon, sjdb_gtf)
    htseq_procs = [mp.Process(target=run_htseq, args=htseq_args)
                   for i in range(args.htseq_proc)]

    for p in htseq_procs:
        p.start()

    sample_re = re.compile(r"([^/]+)_R\d_\d+\.fastq\.gz$")

    for input_dir in args.input_dirs:
        if args.s3_output_path is None:
            s3_output_path = os.path.join(args.s3_input_path, input_dir,
                                          'results')
        else:
            s3_output_path = args.s3_output_path

        s3_output_bucket, s3_output_prefix = s3u.s3_bucket_and_key(
            s3_output_path)

        # Check the input_dir folder for existing runs
        if not args.force_realign:
            output = s3u.prefix_gen(s3_output_bucket, s3_output_prefix,
                                    lambda r: (r['LastModified'], r['Key']))
        else:
            output = []

        output_files = {tuple(os.path.basename(fn).split('.')[:2])
                        for dt, fn in output
                        if fn.endswith('htseq-count.txt')
                        and dt > CURR_MIN_VER}

        logger.info("Skipping {} existing results".format(len(output_files)))

        logger.info("Running partition {} of {} for {}".format(
            args.partition_id, args.num_partitions, input_dir))

        output = [
            fn for fn in s3u.get_files(s3_input_bucket,
                                       os.path.join(s3_input_prefix,
                                                    input_dir))
            if fn.endswith('fastq.gz')
        ]

        logger.info("number of fastq.gz files: {}".format(len(output)))

        sample_lists = defaultdict(list)

        for fn in output:
            matched = sample_re.search(os.path.basename(fn))
            if matched:
                sample_lists[matched.group(1)].append(fn)

        for sample_name in sorted(
                sample_lists)[args.partition_id::args.num_partitions]:
            if (sample_name, args.taxon) in output_files:
                logger.info("{} already exists, skipping".format(sample_name))
                continue

            logger.info("Adding sample {} to queue".format(sample_name))
            star_queue.put((input_dir, sample_name,
                            sorted(sample_lists[sample_name])))

    for i in range(n_star_procs):
        star_queue.put('STOP')

    for p in star_procs:
        p.join()

    for i in range(args.htseq_proc):
        htseq_queue.put('STOP')

    for p in htseq_procs:
        p.join()

    log_queue.put('STOP')
    log_thread.join()

    # Remove Genome from Memory
    command = [STAR, '--genomeDir', genome_dir, '--genomeLoad', 'Remove']
    ut_log.log_command(logger, command, shell=True)

    logger.info('Job completed')
def main(logger):
    parser = get_parser()
    args = parser.parse_args()

    if os.environ.get("AWS_BATCH_JOB_ID"):
        args.root_dir = os.path.join(args.root_dir,
                                     os.environ["AWS_BATCH_JOB_ID"])

    # local directories
    if args.s3_input_dir.endswith("/"):
        args.s3_input_dir = args.s3_input_dir[:-1]

    sample_id = os.path.basename(args.s3_input_dir)
    result_path = os.path.join(args.root_dir, "data", "hca", sample_id)
    fastq_path = os.path.join(result_path, "fastqs")
    os.makedirs(fastq_path)

    genome_base_dir = os.path.join(args.root_dir, "genome", "cellranger")
    os.makedirs(genome_base_dir)

    # check if the input genome is valid
    if args.taxon in reference_genomes:
        if args.taxon in deprecated:
            logger.warning(f"'{args.taxon}' will be removed in the future,"
                           f" use '{reference_genomes[args.taxon]}'")
        genome_name = reference_genomes[args.taxon]
    else:
        raise ValueError(f"unknown taxon {args.taxon}")

    if args.region != "west" and genome_name not in ("HG38-PLUS", "MM10-PLUS"):
        raise ValueError(f"you must use --region west for {genome_name}")

    s3 = boto3.resource("s3")

    # download the ref genome data
    logger.info(f"Downloading and extracting genome data {genome_name}")

    if args.region == "east":
        s3_object = s3.Object(S3_REFERENCE[args.region],
                              f"ref-genome/cellranger/{genome_name}.tgz")
    else:
        s3_object = s3.Object(S3_REFERENCE[args.region],
                              f"cellranger/{genome_name}.tgz")

    with tarfile.open(fileobj=s3_object.get()["Body"], mode="r|gz") as tf:
        tf.extractall(path=genome_base_dir)

    genome_dir = os.path.join(genome_base_dir, genome_name)

    sys.stdout.flush()

    # download the fastq files
    command = [
        "aws", "s3", "cp", "--no-progress", "--recursive",
        "--force-glacier-transfer" if args.glacier else "",
        args.s3_input_dir,
        fastq_path,
    ]
    log_command(logger, command, shell=True)

    # Run cellranger
    os.chdir(result_path)
    command = [
        CELLRANGER, "count",
        "--localmem=240",
        "--nosecondary",
        "--disable-ui",
        f"--expect-cells={args.cell_count}",
        f"--id={sample_id}",
        f"--fastqs={fastq_path}",
        f"--transcriptome={genome_dir}",
    ]
    failed = log_command(
        logger,
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    if failed:
        raise RuntimeError("cellranger count failed")

    # Move outs folder to S3
    command = [
        "aws", "s3", "sync", "--no-progress",
        os.path.join(result_path, sample_id, "outs"),
        args.s3_output_dir,
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying sync")
    else:
        raise RuntimeError("couldn't sync output")
def run_sample(s3_input_bucket, sample_name, sample_fns, genome_dir, run_dir,
               star_proc, logger):
    t_config = TransferConfig(use_threads=False, num_download_attempts=25)

    dest_dir = os.path.join(run_dir, sample_name)
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
        os.mkdir(os.path.join(dest_dir, "rawdata"))
        os.mkdir(os.path.join(dest_dir, "results"))
        os.mkdir(os.path.join(dest_dir, "results", "Pass1"))

    for sample_fn in sample_fns:
        s3c.download_file(
            Bucket=s3_input_bucket,
            Key=sample_fn,
            Filename=os.path.join(dest_dir, os.path.basename(sample_fn)),
            Config=t_config,
        )

    # start running STAR
    # gather the input files first
    reads = sorted(
        os.path.join(dest_dir, os.path.basename(sample_fn))
        for sample_fn in sample_fns)

    command = COMMON_PARS[:]
    command.extend((
        "--runThreadN", str(star_proc),
        "--genomeDir", genome_dir,
        "--readFilesIn", " ".join(reads),
    ))
    failed = ut_log.log_command(
        logger,
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results", "Pass1"),
    )

    # sort the aligned bam with samtools
    command = [
        SAMTOOLS, "sort",
        "-m", "6000000000",
        "-o", "./Pass1/Aligned.out.sorted.bam",
        "./Pass1/Aligned.out.bam",
    ]
    failed = failed or ut_log.log_command(
        logger,
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        cwd=os.path.join(dest_dir, "results"),
    )

    # index the sorted bam (samtools index -b)
    command = [SAMTOOLS, "index", "-b", "Aligned.out.sorted.bam"]
    failed = failed or ut_log.log_command(
        logger,
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results", "Pass1"),
    )

    # sort by name to generate input for htseq-count
    command = [
        SAMTOOLS, "sort",
        "-m", "6000000000",
        "-n",
        "-o", "./Pass1/Aligned.out.sorted-byname.bam",
        "./Pass1/Aligned.out.sorted.bam",
    ]
    failed = failed or ut_log.log_command(
        logger,
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results"),
    )

    return failed, dest_dir
def main(logger):
    parser = get_parser()
    args = parser.parse_args()

    if os.environ.get("AWS_BATCH_JOB_ID"):
        root_dir = os.path.join("/mnt", os.environ["AWS_BATCH_JOB_ID"])
    else:
        root_dir = "/mnt"

    if args.sample_sheet_name is None:
        args.sample_sheet_name = "{}.csv".format(args.exp_id)

    # local directories
    result_path = os.path.join(root_dir, "data", "hca", args.exp_id)
    bcl_path = os.path.join(result_path, "bcl")
    output_path = os.path.join(result_path, "fastqs")

    os.makedirs(result_path)
    os.mkdir(bcl_path)

    # download sample sheet
    command = [
        "aws", "s3", "cp", "--quiet",
        os.path.join(args.s3_sample_sheet_dir, args.sample_sheet_name),
        result_path,
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying s3 copy")
    else:
        raise RuntimeError("couldn't download sample sheet {}".format(
            os.path.join(args.s3_sample_sheet_dir, args.sample_sheet_name)))

    # download the bcl files
    command = [
        "aws", "s3", "sync", "--quiet",
        "--force-glacier-transfer" if args.force_glacier else "",
        os.path.join(args.s3_input_dir, args.exp_id),
        bcl_path,
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying s3 sync bcl")
    else:
        raise RuntimeError("couldn't sync {}".format(
            os.path.join(args.s3_input_dir, args.exp_id)))

    # monitor memory and disk usage in the background
    command = (
        "while true;"
        ' do echo "memory usage" `cat /sys/fs/cgroup/memory/memory.usage_in_bytes`;'
        ' echo "disk usage" `df -h | grep "/mnt"`;'
        " sleep 300;"
        " done")
    p = subprocess.Popen([command], shell=True)

    # Run bcl2fastq
    command = [
        BCL2FASTQ,
        " ".join(args.bcl2fastq_options),
        "--sample-sheet",
        os.path.join(result_path, args.sample_sheet_name),
        "-R", bcl_path,
        "-o", output_path,
    ]
    failed = log_command(logger, command,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         shell=True)
    if failed:
        p.kill()
        raise RuntimeError("bcl2fastq failed, see above for error")

    # fix the directory structure of the files *before* syncing!
    fastqgz_files = glob.glob(os.path.join(output_path, "*fastq.gz"))
    logger.debug("all fastq.gz files\n{}\n\n".format("\n".join(fastqgz_files)))

    for fastq_file in fastqgz_files:
        if args.skip_undetermined and os.path.basename(fastq_file).startswith(
                "Undetermined"):
            logger.info("removing {}".format(os.path.basename(fastq_file)))
            os.remove(fastq_file)
        elif args.star_structure:
            m = re.match(r"(.+)(_R[12]_001\.fastq\.gz)",
                         os.path.basename(fastq_file))
            if m:
                sample = m.group(1)
                if not os.path.exists(os.path.join(output_path, sample)):
                    logger.debug("creating {}".format(
                        os.path.join(output_path, sample)))
                    os.mkdir(os.path.join(output_path, sample))
                logger.debug("moving {}".format(fastq_file))
                os.rename(
                    fastq_file,
                    os.path.join(output_path, sample,
                                 os.path.basename(fastq_file)),
                )
            else:
                logger.warning(
                    "Warning: regex didn't match {}".format(fastq_file))

    sys.stdout.flush()

    # upload fastq files to destination folder
    command = [
        "aws", "s3", "sync", "--quiet",
        output_path,
        os.path.join(args.s3_output_dir, args.exp_id),
        "--exclude", '"*"',
        "--include", '"*fastq.gz"',
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying sync fastq")
    else:
        raise RuntimeError("couldn't sync fastqs")

    # Move reports data back to S3
    reports_path = subprocess.check_output(
        "ls -d {}".format(
            os.path.join(output_path, "Reports", "html", "*",
                         "all", "all", "all")),
        shell=True,
    ).rstrip()

    command = [
        "aws", "s3", "cp", "--quiet",
        reports_path.decode(),
        os.path.join(args.s3_report_dir, args.exp_id),
        "--recursive",
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying cp reports")
    else:
        raise RuntimeError("couldn't cp reports")

    p.kill()
def main(logger):
    parser = get_parser()
    args = parser.parse_args()

    if os.environ.get('AWS_BATCH_JOB_ID'):
        root_dir = os.path.join(ROOT_DIR_PATH, os.environ['AWS_BATCH_JOB_ID'])
    else:
        root_dir = ROOT_DIR_PATH

    if args.sample_sheet_name is None:
        args.sample_sheet_name = '{}.csv'.format(args.exp_id)

    # local directories
    result_path = os.path.join(root_dir, 'data', 'hca', args.exp_id)
    bcl_path = os.path.join(result_path, 'bcl')
    output_path = os.path.join(result_path, 'fastqs')

    if not args.no_s3_download:
        # only make dirs if they don't exist yet
        if not os.path.isdir(result_path):
            os.makedirs(result_path)
        if not os.path.isdir(bcl_path):
            os.mkdir(bcl_path)

        # download sample sheet
        command = ['aws', 's3', 'cp', '--quiet',
                   os.path.join(args.s3_sample_sheet_dir,
                                args.sample_sheet_name),
                   result_path]
        for i in range(S3_RETRY):
            try:
                log_command(logger, command, shell=True)
                break
            except subprocess.CalledProcessError:
                logger.info("retrying s3 copy")
        else:
            raise RuntimeError("couldn't download sample sheet {}".format(
                os.path.join(args.s3_sample_sheet_dir,
                             args.sample_sheet_name)))

        # check the sample inputs to make sure we can get run IDs from all
        # of them. Change this if the Illumina sample sheet output ever
        # changes; otherwise this line has the headers
        _SAMPLE_SHEET_STARTING_LINE = 21
        df_csv = pd.read_csv(
            os.path.join(result_path, args.sample_sheet_name),
            header=_SAMPLE_SHEET_STARTING_LINE)

        samples_not_matching_run_ids = [
            sample_name for sample_name in df_csv['Sample_ID']
            if not _check_for_run_information(sample_name)
        ]
        if len(samples_not_matching_run_ids) > 0:
            raise ValueError(
                'Found sample names that I could not extract run ID values'
                ' (of the form RunXX_YY) from: {}'.format(
                    samples_not_matching_run_ids))

        # download the bcl files
        command = ['aws', 's3', 'sync', '--quiet',
                   '--force-glacier-transfer' if args.force_glacier else '',
                   os.path.join(args.s3_input_dir, args.exp_id),
                   bcl_path]
        for i in range(S3_RETRY):
            try:
                log_command(logger, command, shell=True)
                break
            except subprocess.CalledProcessError:
                logger.info("retrying s3 sync bcl")
        else:
            raise RuntimeError("couldn't sync {}".format(
                os.path.join(args.s3_input_dir, args.exp_id)))

    # this is actually awful because the process forks and you have to go
    # kill it yourself
    command = ('while true;'
               ' do memusage=`cat /sys/fs/cgroup/memory/memory.usage_in_bytes`;'
               ' memgb=`echo "${memusage}/(1000000000)" | bc -l'
               ' | xargs -I {} printf "%.2f\\n" {}`;'
               ' echo "memory usage: ${memgb}GB";'
               ' echo "disk usage: " `df -h | grep -e "/$"'
               ' | awk \'{print $(NF-4)" "$(NF-3)" "$(NF-2)" "$(NF-1)" "$NF}\'`;'
               ' sleep 90;'
               ' done')
    p = subprocess.Popen([command], shell=True)

    # Run bcl2fastq
    command = [BCL2FASTQ,
               ' '.join(args.bcl2fastq_options),
               '--sample-sheet',
               os.path.join(result_path, args.sample_sheet_name),
               '-R', bcl_path,
               '-o', output_path]
    log_command(logger, command, shell=True)

    # fix the directory structure of the files *before* syncing!
    fastqgz_files = glob.glob(os.path.join(output_path, '*fastq.gz'))
    logger.debug('all fastq.gz files\n{}\n\n'.format('\n'.join(fastqgz_files)))

    # TODO(dstone): organize the run based on the
    # TraceGenomics/RunXX/RunXX_YY/*.fastq.gz and do our usual rearrangement
    for fastq_file in fastqgz_files:
        if (args.skip_undetermined
                and os.path.basename(fastq_file).startswith('Undetermined')):
            logger.info("removing {}".format(os.path.basename(fastq_file)))
            os.remove(fastq_file)
        elif args.group_by_sample:
            # exclude the sample number (_S[numbers])
            m = re.match(r"(.+)(_S\d+_R[12]_001\.fastq\.gz)",
                         os.path.basename(fastq_file))
            if m:
                sample = m.group(1)  # should be of the form RunX_Y
                if not re.match(r'^Run\d+_\d+$', sample):
                    # shouldn't actually be able to get here, because there is
                    # a check above at the sample sheet level, but just in case
                    raise ValueError(
                        'Was expecting to find a sample name of the form'
                        ' RunXX_YY, could not find in {} sample name!'.format(
                            sample))
                run = sample.split('_')[0]
                # organizes as RunX/RunX_Y/[sample stuff]
                grouped_sample_path = os.path.join(output_path, run, sample)
                if not os.path.exists(grouped_sample_path):
                    logger.debug("creating {}".format(grouped_sample_path))
                    os.makedirs(grouped_sample_path)
                logger.debug("moving {}".format(fastq_file))
                os.rename(fastq_file,
                          os.path.join(grouped_sample_path,
                                       os.path.basename(fastq_file)))
            else:
                logger.warning(
                    "Warning: regex didn't match {}".format(fastq_file))

    sys.stdout.flush()

    if not args.no_s3_upload:
        # upload fastq files to destination folder
        command = ['aws', 's3', 'sync', '--quiet',
                   output_path,
                   args.s3_output_dir,
                   # this doesn't fit our output structure:
                   # os.path.join(args.s3_output_dir, args.exp_id, 'rawdata'),
                   '--exclude', '"*"',
                   '--include', '"*fastq.gz"']
        for i in range(S3_RETRY):
            try:
                log_command(logger, command, shell=True)
                break
            except subprocess.CalledProcessError:
                logger.info("retrying sync fastq")
        else:
            raise RuntimeError("couldn't sync fastqs")

        # check fastq upload
        command = ['aws', 's3', 'ls', '--recursive',
                   args.s3_output_dir]
        # os.path.join(args.s3_output_dir, args.exp_id, 'rawdata')]
        log_command(logger, command, shell=True)

        # Move reports data back to S3
        reports_path = subprocess.check_output(
            "ls -d {}".format(os.path.join(output_path, 'Reports', 'html',
                                           '*', 'all', 'all', 'all')),
            shell=True).rstrip()

        command = ['aws', 's3', 'cp', '--quiet',
                   reports_path.decode(),
                   os.path.join(args.s3_report_dir, args.exp_id),
                   '--recursive']
        for i in range(S3_RETRY):
            try:
                log_command(logger, command, shell=True)
                break
            except subprocess.CalledProcessError:
                logger.info("retrying cp reports")
        else:
            raise RuntimeError("couldn't cp reports")

    p.kill()
def main(logger):
    """ Download reference genome, run alignment jobs, and upload results to S3.

        logger - Logger object that exposes the interface the code directly uses
    """
    parser = get_parser()
    args = parser.parse_args()

    args.root_dir = pathlib.Path(args.root_dir)

    if os.environ.get("AWS_BATCH_JOB_ID"):
        args.root_dir = args.root_dir / os.environ["AWS_BATCH_JOB_ID"]

    # local directories
    if args.s3_input_path.endswith("/"):
        args.s3_input_path = args.s3_input_path[:-1]

    sample_id = os.path.basename(args.s3_input_path)
    result_path = args.root_dir / "data" / sample_id
    if args.dobby:
        fastq_path = result_path
    else:
        fastq_path = result_path / "fastqs"
    fastq_path.mkdir(parents=True)

    genome_base_dir = args.root_dir / "genome" / "cellranger"
    genome_base_dir.mkdir(parents=True)

    # check if the input genome and region are valid
    if args.taxon in reference_genomes:
        if args.taxon in deprecated:
            logger.warning(
                f"The name '{args.taxon}' will be removed in the future,"
                f" start using '{deprecated[args.taxon]}'")
        genome_name = reference_genomes[args.taxon]
    else:
        raise ValueError(f"unknown taxon {args.taxon}")

    genome_dir = genome_base_dir / genome_name
    ref_genome_10x_file = f"cellranger/{genome_name}.tgz"

    if args.region != "west" and genome_name not in ("HG38-PLUS", "MM10-PLUS"):
        raise ValueError(f"you must use --region west for {genome_name}")

    if args.region == "east":
        ref_genome_10x_file = f"ref-genome/{ref_genome_10x_file}"

    logger.info(
        f"""Run Info: partition {args.partition_id} out of {args.num_partitions}
                 genome_dir:\t{genome_dir}
        ref_genome_10x_file:\t{ref_genome_10x_file}
                      taxon:\t{args.taxon}
              s3_input_path:\t{args.s3_input_path}""")

    s3 = boto3.resource("s3")

    # download the reference genome data
    logger.info(f"Downloading and extracting genome data {genome_name}")

    s3_object = s3.Object(S3_REFERENCE[args.region], ref_genome_10x_file)

    with tarfile.open(fileobj=s3_object.get()["Body"], mode="r|gz") as tf:
        tf.extractall(path=genome_base_dir)

    sys.stdout.flush()

    # download the fastq files
    command = [
        "aws", "s3", "cp", "--no-progress", "--recursive",
        "--force-glacier-transfer" if args.glacier else "",
        args.s3_input_path,
        f"{fastq_path}",
    ]
    log_command(logger, command, shell=True)

    logger.info(
        f"Running partition {args.partition_id} of {args.num_partitions}")

    # derive the sample name from the downloaded fastq files
    sample_name = {
        os.path.basename(fn).rsplit("_", 4)[0]
        for fn in fastq_path.glob("*fastq.gz")
    }
    assert len(sample_name) == 1, "Should only have one sample name to process"
    sample_name = sample_name.pop()

    # Run cellranger
    os.chdir(result_path)
    command = [
        CELLRANGER, "count",
        "--localmem=240",
        "--nosecondary",
        "--disable-ui",
        f"--expect-cells={args.cell_count}",
        f"--id={sample_id}",
        f"--fastqs={fastq_path}",
        f"--transcriptome={genome_dir}",
    ]
    if args.dobby:
        command.append(f"--sample={sample_name}")

    failed = log_command(
        logger,
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    if failed:
        raise RuntimeError("cellranger count failed")

    # Move outs folder to S3
    command = [
        "aws", "s3", "sync", "--no-progress",
        os.path.join(result_path, sample_id, "outs"),
        args.s3_output_path,
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying sync")
    else:
        raise RuntimeError("couldn't sync output")
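# A hypothetical entry point (the actual CLI wiring lives elsewhere in the
# repo): each script's main() is assumed to be invoked with a configured
# logger, e.g.
if __name__ == "__main__":
    import logging

    logging.basicConfig(level=logging.INFO)
    main(logging.getLogger(__name__))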