def run_germline(execution, max_cores, max_attempts, target_bed, input_path=None, s3fs=None):
    """
    Executes the germline variant calling pipeline

    :type execution: Execution
    :param int max_cores: The maximum number of cores to use at once
    :param int max_attempts: The number of times to retry a failed task
    :param str target_bed: The target bed to call variants in
    :param str input_path: The path to the input_file tsv of fastq files
    :param str s3fs: An s3 path to mirror output to; if set, commands are wrapped for s3 i/o
    """
    target_bed = os.path.abspath(os.path.expanduser(target_bed))
    if input_path:
        input_path = os.path.abspath(os.path.expanduser(input_path))

    # Copy the target.bed to the output_dir
    assert os.path.exists(target_bed), '%s does not exist' % target_bed
    cp_target_bed_task = execution.add_task(
        lambda drm='local', out_bed=out_dir('target.bed'): 'cp %s %s' % (target_bed, out_bed),
        out_dir='', stage_name='Copy_Target_Bed')

    # Split the target bed by contig so downstream steps can be parallelized per contig
    target_bed_tasks = [execution.add_task(bed.filter_bed_by_contig, dict(contig=contig), [cp_target_bed_task],
                                           'work/contigs/{contig}')
                        for contig in util.get_bed_contigs(target_bed)]

    fastq_tasks = list(util.gen_fastq_tasks(execution, input_path))
    # fastq_tasks = [execution.add_task(load_input, dict(in_file=fastq_path, **tags), stage_name='Load_Fastqs')
    #                for fastq_path, tags in parse_inputs(input_path)]

    fastqc_tasks = many2one(fastqc.fastqc, fastq_tasks, ['sample_name', 'library'],
                            out_dir='SM_{sample_name}/qc/LB_{library}')
    # fastq_tasks = split_large_fastq_files(execution, fastq_tasks)  # not working yet

    aligned_tasks = align(execution, fastq_tasks, target_bed_tasks)
    call_task = variant_call(execution, aligned_tasks, target_bed_tasks)

    # Run the task DAG; wrap commands for s3 i/o when outputs are mirrored to s3
    execution.run(max_cores=max_cores, max_attempts=max_attempts,
                  cmd_wrapper=make_s3_cmd_fxn_wrapper(s3fs) if s3fs else shared_fs_cmd_fxn_wrapper)

    if execution.successful:
        execution.log.info('Final vcf: %s' % opj(s3fs if s3fs else execution.output_dir, call_task.output_files[0]))

    # Copy the sqlite db to s3
    dburl = env.config['gk']['database_url']
    if s3fs and dburl.startswith('sqlite'):
        # TODO implement so there is a 1-to-1 relationship between a sqlite database and an Execution.
        # TODO Currently this is pushing way too much information, but will soon be replaced.
        # TODO Alternative: use amazon RDS!  Or perhaps both?  Could do a sqlalchemy merge and save
        # TODO to sqlite, or implement cosmos multiverse.
        s3cmd.cp(dburl.replace('sqlite:///', ''), opj(s3fs, 'sqlite.db.backup'))
def download_from_s3(in_file, out_file=out_dir('{in_file}')):
    """
    Downloads a file from s3 into a task's output directory
    """
    assert in_file.startswith('s3://'), '%s is not an s3 path' % in_file
    return s3cmd.cp(in_file, out_file)
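# A minimal sketch of driving this pipeline from a script.  The Execution
# construction below is hypothetical: this excerpt does not show how Executions
# are created, so the `cosmos_app.start(...)` name and its parameters are
# assumptions, not this project's confirmed API.
#
# if __name__ == '__main__':
#     execution = cosmos_app.start(name='germline', output_dir='analysis/germline')
#     run_germline(execution,
#                  max_cores=16,
#                  max_attempts=2,
#                  target_bed='~/beds/exome_targets.bed',
#                  input_path='~/fastqs/input.tsv',
#                  s3fs=None)  # or e.g. 's3://my-bucket/run1' to mirror outputs to s3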