def download_fastq_files(fastq1_s3_path, fastq2_s3_path, working_dir):
    """Download the fastq files.

    :param fastq1_s3_path: S3 path containing FASTQ with read1
    :param fastq2_s3_path: S3 path containing FASTQ with read2
    :param working_dir: working directory
    :return: local path to the folder containing the fastqs
    """
    fastq_folder = os.path.join(working_dir, 'fastq')

    try:
        os.mkdir(fastq_folder)
    except Exception:
        # folder may already exist
        pass

    local_fastq1_path = download_file(fastq1_s3_path, fastq_folder)
    local_fastq2_path = download_file(fastq2_s3_path, fastq_folder)

    # Isaac requires the fastqs to be symlinked as lane1_read1.fastq.gz and lane1_read2.fastq.gz
    os.symlink(local_fastq1_path, os.path.join(fastq_folder, 'lane1_read1.fastq.gz'))
    os.symlink(local_fastq2_path, os.path.join(fastq_folder, 'lane1_read2.fastq.gz'))

    return fastq_folder
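# The functions in this document rely on a download_file(s3_path, local_dir) utility that is
# not shown in this excerpt. The sketch below is a minimal, assumed implementation based on
# boto3; the argument order and return value (the local file path) mirror how the call sites
# here use it, but the real helper may differ.
import os
from urllib.parse import urlparse

import boto3


def download_file(s3_path, local_folder):
    """Download s3_path (e.g. s3://bucket/key) into local_folder and return the local path."""
    parsed = urlparse(s3_path)
    bucket = parsed.netloc
    key = parsed.path.lstrip('/')
    local_path = os.path.join(local_folder, os.path.basename(key))
    boto3.client('s3').download_file(bucket, key, local_path)
    return local_path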
def main():
    argparser = ArgumentParser()

    file_path_group = argparser.add_argument_group(title='File paths')
    file_path_group.add_argument('--bam_s3_path', type=str, help='BAM s3 path', required=True)
    file_path_group.add_argument('--bai_s3_path', type=str, help='BAM Index s3 path', required=True)
    file_path_group.add_argument('--vcf_s3_path', type=str, help='VCF s3 path', required=True)
    file_path_group.add_argument('--reference_s3_path', type=str, help='Reference file s3 path', required=True)
    file_path_group.add_argument('--reference_index_s3_path', type=str, help='Reference file index s3 path', required=True)

    run_group = argparser.add_argument_group(title='Run command args')
    run_group.add_argument('--memory', type=str, help='Memory (in GB) for strelka to use', default=28)
    run_group.add_argument('--cmd_args', type=str, help='Additional arguments for strelka', default='')

    argparser.add_argument('--working_dir', type=str, default='/scratch')

    args = argparser.parse_args()

    working_dir = generate_working_dir(args.working_dir)

    print("Downloading bam")
    local_bam_path = download_file(args.bam_s3_path, working_dir)
    local_bai_path = download_file(args.bai_s3_path, working_dir)
    print("BAM and index downloaded to %s and %s" % (local_bam_path, local_bai_path))

    print("Downloading reference")
    local_reference_path = download_file(args.reference_s3_path, working_dir)
    local_reference_index_path = download_file(args.reference_index_s3_path, working_dir)
    print("Reference downloaded to %s. Index to %s" % (local_reference_path, local_reference_index_path))

    print("Running Strelka")
    local_vcf_path = run_strelka(local_bam_path, local_reference_path, args.memory, args.cmd_args, working_dir)

    print("Uploading %s to %s" % (local_vcf_path, args.vcf_s3_path))
    upload_folder(args.vcf_s3_path, local_vcf_path)

    print('Cleaning up working dir')
    delete_working_dir(working_dir)

    print("Completed")
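# The entry points above also depend on generate_working_dir() and delete_working_dir(),
# which are not defined in this excerpt. The versions below are a minimal sketch of what
# they are assumed to do (create and remove a unique scratch subdirectory); the real
# helpers may behave differently.
import os
import shutil
import uuid


def generate_working_dir(working_dir_base):
    """Create and return a unique working directory under working_dir_base."""
    working_dir = os.path.join(working_dir_base, str(uuid.uuid4()))
    os.makedirs(working_dir, exist_ok=True)
    return working_dir


def delete_working_dir(working_dir):
    """Remove the working directory and everything beneath it."""
    shutil.rmtree(working_dir, ignore_errors=True)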
def main():
    argparser = ArgumentParser()
    argparser.add_argument('--vcf_s3_path', type=str, help='VCF s3 path', required=True)
    argparser.add_argument('--annotated_vcf_s3_path', type=str, help='Annotated vcf s3 path', required=True)
    argparser.add_argument('--working_dir', type=str, default='/scratch')
    argparser.add_argument('--cmd_args', type=str, help='arguments/options for snpeff', default='-t')

    args = argparser.parse_args()

    working_dir = generate_working_dir(args.working_dir)

    print('Downloading vcf')
    local_vcf_path = download_file(args.vcf_s3_path, working_dir)

    print('Running snpeff')
    annotated_vcf_path = run_snpeff(local_vcf_path, args.cmd_args, working_dir)

    print('Uploading %s to %s' % (annotated_vcf_path, args.annotated_vcf_s3_path))
    upload_file(args.annotated_vcf_s3_path, annotated_vcf_path)

    print('Cleaning up working dir')
    delete_working_dir(working_dir)

    print('Completed')
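# Several of the entry points in this document call upload_file(s3_path, local_path), and
# the Strelka main() calls upload_folder(). Neither is defined in this excerpt; the sketch
# below covers upload_file only, as an assumed boto3-based implementation, with the
# (s3_destination, local_source) argument order inferred from the call sites.
from urllib.parse import urlparse

import boto3


def upload_file(s3_path, local_path):
    """Upload local_path to the given s3://bucket/key destination and return s3_path."""
    parsed = urlparse(s3_path)
    boto3.client('s3').upload_file(local_path, parsed.netloc, parsed.path.lstrip('/'))
    return s3_path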
def download_required_files(*args):
    fList = []
    for f in args:
        print("Downloading {}".format(f))
        downloaded_path = download_file(f, '/')
        print("file downloaded to {}".format(downloaded_path))
        fList.append(downloaded_path)
    return fList
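# Example use of download_required_files. The S3 paths here are hypothetical placeholders,
# shown only to illustrate that the function accepts any number of S3 paths and returns the
# corresponding local paths in the same order.
# bam, bai, vcf = download_required_files('s3://my-bucket/sample.bam',
#                                         's3://my-bucket/sample.bam.bai',
#                                         's3://my-bucket/sample.vcf.gz')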
def main():
    argparser = ArgumentParser()

    file_path_group = argparser.add_argument_group(title='File paths')
    file_path_group.add_argument('--bam_s3_path', type=str, help='BAM s3 path', required=True)
    file_path_group.add_argument('--reference_s3_path', type=str, help='Reference file s3 path', required=True)
    file_path_group.add_argument('--bam_stats_s3_path', type=str, help='S3 path to upload stats', required=True)

    run_group = argparser.add_argument_group(title='Run command args')
    run_group.add_argument('--cmd_args', type=str, help='Additional arguments for samtools stats', default='')

    argparser.add_argument('--working_dir', type=str, default='/scratch')

    args = argparser.parse_args()

    working_dir = generate_working_dir(args.working_dir)

    print("Downloading bam")
    local_bam_path = download_file(args.bam_s3_path, working_dir)
    print("BAM downloaded to %s" % local_bam_path)

    print("Downloading reference")
    local_reference_path = download_file(args.reference_s3_path, working_dir)
    print("Reference downloaded to %s." % local_reference_path)

    print("Running samtools stats")
    local_stats_path = run_samtools_stats(local_bam_path, local_reference_path, args.cmd_args, working_dir)

    print("Uploading %s to %s" % (local_stats_path, args.bam_stats_s3_path))
    upload_file(args.bam_stats_s3_path, local_stats_path)

    print('Cleaning up working dir')
    delete_working_dir(working_dir)

    print("Completed")
def download_fastq_file(fastq1_s3_path, working_dir):
    """Download a single fastq file.

    :param fastq1_s3_path: S3 path containing the FASTQ file
    :param working_dir: working directory
    :return: local path to the downloaded fastq
    """
    fastq_folder = os.path.join(working_dir, 'fastq')

    try:
        os.mkdir(fastq_folder)
    except Exception:
        # folder may already exist
        pass

    local_fastq_path = download_file(fastq1_s3_path, fastq_folder)

    return local_fastq_path
def download_fastq_file(fastq1_s3_path, working_dir):
    """Download the input file.

    :param fastq1_s3_path: S3 path containing our FASTQ file
    :param working_dir: working directory
    :return: local path to the downloaded fastq
    """
    fastq_folder = os.path.join(working_dir, 'fastq')

    try:
        os.mkdir(fastq_folder)
    except Exception:
        print('Error occurred while creating the fastq download folder')

    local_fastq_path = download_file(fastq1_s3_path, fastq_folder)

    return local_fastq_path
def main():
    args = parseArguments()
    logging.basicConfig(level=args.log_level)
    logger.info("Run cohort-matcher Docker CLI v%s", __version__)
    logger.info(args)

    working_dir = generate_working_dir(args.working_dir)

    # Download bam sheets
    logger.info('Downloading bam sheets')
    set1_bamsheet = download_file(args.set1_s3_path, working_dir)
    set2_bamsheet = download_file(args.set2_s3_path, working_dir)

    # Download reference bundles
    if args.set1_reference == 'hg19' or args.set2_reference == 'hg19':
        logger.info("Downloading hg19 reference bundle")
        download_file('s3://bmsrd-ngs-repo/reference/hg19-cohort-matcher.tar.bz2', working_dir)
        logger.info("Uncompressing hg19 reference bundle")
        uncompress(os.path.join(working_dir, 'hg19-cohort-matcher.tar.bz2'), working_dir)
    if args.set1_reference == 'GRCh37' or args.set2_reference == 'GRCh37':
        logger.info("Downloading GRCh37 reference bundle")
        download_file('s3://bmsrd-ngs-repo/reference/GRCh37-cohort-matcher.tar.bz2', working_dir)
        logger.info("Uncompressing GRCh37 reference bundle")
        uncompress(os.path.join(working_dir, 'GRCh37-cohort-matcher.tar.bz2'), working_dir)

    # Run cohort-matcher
    logger.info('Running cohort-matcher')
    if args.max_jobs is None:
        max_jobs = multiprocessing.cpu_count()
    else:
        max_jobs = args.max_jobs
    output_folder_path = run_cohort_matcher(args.log_level, set1_bamsheet, set2_bamsheet,
                                            args.set1_reference, args.set2_reference,
                                            working_dir, args.output_prefix, max_jobs)

    logger.info('Uploading results to %s', args.s3_output_folder_path)
    #upload_bam(args.bam_s3_folder_path, bam_folder_path)

    logger.info('Cleaning up working dir')
    delete_working_dir(working_dir)

    logger.info('Completed')
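# main() above calls uncompress(archive_path, destination) to unpack the reference bundles.
# That helper is not shown in this excerpt; a minimal sketch, assuming it simply extracts a
# .tar.bz2 archive with the standard-library tarfile module, might look like this.
import tarfile


def uncompress(archive_path, destination_dir):
    """Extract a .tar.bz2 archive into destination_dir."""
    with tarfile.open(archive_path, 'r:bz2') as tar:
        tar.extractall(path=destination_dir)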
def run_vcf2tiledb_no_s3(workdir, idx, loader_path, callset_path, vid_path, contig):
    """
    Fallback to downloading each file. This method uses tabix to read the intervals
    from each file and saves the sliced vcf as a local file.
    """
    print("Performing no-s3-callset vcf2tiledb")
    exportSession()

    with open(vid_path) as vid_file:
        hg = json.load(vid_file)
    offset = hg['contigs'][contig]['tiledb_column_offset']

    # edit loader file to point to correct callset filename
    srchStr = subprocess.check_output('grep -Po "/callset.+.json" %s' % (loader_path), shell=True)
    subprocess.check_call('sed -i "s|%s|%s|" %s' % (srchStr.rstrip(), callset_path, loader_path), shell=True)

    # extract start/end for this partition
    with open(loader_path) as loader_file:
        ldr = json.load(loader_file)
    start = ldr['column_partitions'][idx]['begin'] - offset - 1
    end = ldr['column_partitions'][idx]['end'] - offset + 1
    del ldr
    if start < 0:
        start = 0
    pos = "%s:%s-%s" % (contig, start, end)  # tabix region

    # Download VCF slices to workdir
    # assumes callset has S3 paths for filenames
    with open(callset_path) as callset_fp:
        fList = json.load(callset_fp)
    fListNew = fList

    print("Downloading slices")
    for SM in fList['callsets']:
        #download_file(fList['callsets'][SM]['filename'], workdir)
        s3path = fList['callsets'][SM]['filename']
        fName = os.path.basename(s3path)
        gzFile = '%s/%s' % (workdir, fName)
        retries = 0
        while True:
            if retries > 0:
                print("Retrying download for {}".format(fName))
            if retries > 3:
                print('Downloading entire file')
                download_file(s3path, workdir)
                break
            cmd = "/bin/bash -o pipefail -c 'timeout 30 tabix -h %s %s | bgzip > %s'" % (s3path, pos, gzFile)
            try:
                subprocess.check_call(cmd, shell=True)
                if not os.path.exists(gzFile) or os.stat(gzFile).st_size < 29:
                    print("Download file size error")
                    retries += 1
                else:
                    break
            except subprocess.CalledProcessError as e:
                print("Caught Exception: CalledProcessError")
                status = e.returncode
                print("status:={}".format(status))
                retries += 1
def main():
    argparser = ArgumentParser()

    file_path_group = argparser.add_argument_group(title='File paths')
    file_path_group.add_argument('--vcf_s3_path', type=str, help='VCF s3 path', required=True)
    file_path_group.add_argument('--bam_s3_path', type=str, help='BAM s3 path', required=True)
    file_path_group.add_argument('--bai_s3_path', type=str, help='BAI s3 path', required=True)
    file_path_group.add_argument('--results_s3_path', type=str, help='S3 path to upload stats', required=True)

    run_group = argparser.add_argument_group(title='Run command args')
    run_group.add_argument('--cmd_args', type=str, help='Additional Arguments', default=None,
                           nargs='*', action='store', dest='opt_list')

    #argparser.add_argument('--working_dir', type=str, default='/scratch')

    args = argparser.parse_args()

    total_size = 0
    for obj in [args.vcf_s3_path, args.bam_s3_path, args.bai_s3_path]:
        total_size += get_size(obj)
    # add more for formatting loss
    total_size += 2e9
    total_size = int(total_size)
    print("Total Size := {0}".format(total_size))

    # Declare expected disk usage, triggers host's EBS script (ecs-ebs-manager)
    with open("/TOTAL_SIZE", "w") as text_file:
        text_file.write("{0}".format(total_size))

    print("Waiting EBS")
    # Wait for EBS to appear
    while not os.path.isdir('/scratch'):
        time.sleep(5)
    # Wait for mount verification
    while not os.path.ismount('/scratch'):
        time.sleep(1)

    working_dir = generate_working_dir('/scratch')

    print("Downloading vcf")
    local_vcf_path = download_file(args.vcf_s3_path, working_dir)
    print("VCF downloaded to %s" % local_vcf_path)

    print("Downloading bam")
    local_bam_path = download_file(args.bam_s3_path, working_dir)
    print("BAM downloaded to %s" % local_bam_path)

    print("Downloading bam index")
    local_bam_index_path = download_file(args.bai_s3_path, working_dir)
    print("BAM index downloaded to %s" % local_bam_index_path)

    print("Running verifybamid")
    local_stats_path = run_verifybamid_basic(local_vcf_path, local_bam_path, local_bam_index_path,
                                             args.opt_list, working_dir)

    for ext in ['.selfSM', '.bestSM', '.depthSM', '.log']:
        if os.path.exists(local_stats_path + ext):
            print("Uploading %s to %s" % (local_stats_path + ext, args.results_s3_path + ext))
            upload_file(args.results_s3_path + ext, local_stats_path + ext)

    print('Cleaning up working dir')
    delete_working_dir(working_dir)

    print("Completed")
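# main() above estimates disk needs with get_size(s3_path), which is not defined in this
# excerpt. A minimal sketch, assuming it returns the object's size in bytes via an S3 HEAD
# request, could look like the following.
from urllib.parse import urlparse

import boto3


def get_size(s3_path):
    """Return the size in bytes of the object at s3://bucket/key."""
    parsed = urlparse(s3_path)
    response = boto3.client('s3').head_object(Bucket=parsed.netloc, Key=parsed.path.lstrip('/'))
    return response['ContentLength']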