import csv
import os
import subprocess
import sys

# Package-local names used below (open_db, B38Preprocessor, LsfJob, QcTable,
# QcDirectory, AnalysisDirectory, AnalysisSvDirectory, ComputeWorkflowSample,
# OldbandandRewriteGvcfCmd, force_make_dirs, force_symlink, logger,
# chromosomes, ext_chromosomes) are assumed to be imported or defined
# elsewhere in this module.


def ingest(app, csv_file, output_dir, force=False):
    Session = open_db(app.database)
    default_job_options = {
        'memory_in_gb': 5,
        'queue': app.queue,
        'docker': app.docker,
    }
    if app.job_group is not None:
        default_job_options['group'] = app.job_group
    preprocessor = B38Preprocessor(
        output_dir, job_runner=LsfJob(default_job_options), force=force)
    # Map CSV header names to database column names.
    columns = {
        'Compute Workflow Execution': 'compute_workflow_execution',
        'Work Order': 'work_order',
        'DNA': 'ingest_sample_name',
        'WOI': 'woi',
        'Working Directory': 'source_directory',
    }
    seen = set()
    with open(csv_file) as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            output_json = {columns[key]: row[key] for key in columns}
            # Skip exact duplicates of (directory, sample, work order).
            seen_key = (output_json['source_directory'],
                        output_json['ingest_sample_name'],
                        output_json['work_order'])
            if seen_key in seen:
                logger.info('Duplicate row with identical source directory, '
                            'sample name and work order. Skipping...')
                continue
            seen.add(seen_key)
            outdir = preprocessor(output_json['source_directory'])
            is_valid = outdir is not None
            analysis_cram_path = outdir if is_valid else None
            analysis_gvcf_path = outdir if is_valid else None
            session = Session()
            session.add(
                ComputeWorkflowSample(
                    source_work_order=output_json['work_order'],
                    ingest_sample_name=output_json['ingest_sample_name'],
                    source_directory=output_json['source_directory'],
                    woi=output_json['woi'],
                    valid_source_directory=is_valid,
                    analysis_cram_path=analysis_cram_path,
                    analysis_gvcf_path=analysis_gvcf_path))
            session.commit()
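# Illustration: `ingest` expects a comma-separated file whose header row uses
# the exact names in `columns` above. A minimal sketch of the input (the
# values are hypothetical, shown for shape only):
#
#     Compute Workflow Execution,Work Order,DNA,WOI,Working Directory
#     12345,2801234,H_XYZ-sample1,67890,/path/to/source_dir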
def generate(app, workorders):
    Session = open_db(app.database)
    table = QcTable()
    for wo in workorders:
        session = Session()
        for sample in session.query(ComputeWorkflowSample).filter(
                ComputeWorkflowSample.source_work_order == wo):
            if sample.analysis_cram_verifyed:
                qc_dir = QcDirectory(
                    os.path.join(sample.analysis_gvcf_path, 'qc'))
                if qc_dir.is_complete:
                    logger.info('Adding qc for {0}'.format(
                        sample.analysis_gvcf_path))
                    table.add(qc_dir.sample_name(), qc_dir,
                              sample.ingest_sample_name)
    table.write(sys.stdout)
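# Hypothetical programmatic invocation of `generate` (a sketch; it assumes
# `app` exposes a `.database` attribute as used above, and that the QC table
# is captured by redirecting stdout):
#
#     from argparse import Namespace
#     generate(Namespace(database='/path/to/samples.db'), ['2801234'])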
def check_analysis_dir(app):
    Session = open_db(app.database)
    session = Session()
    for sample in session.query(ComputeWorkflowSample):
        if (sample.analysis_cram_path is None
                or sample.analysis_gvcf_path is None):
            logger.warning(
                'No analysis directory in database for {0}. '
                'Is the source directory invalid?'.format(
                    sample.source_directory))
            continue
        # None and False both count as unverified.
        if (not sample.analysis_cram_verifyed
                or not sample.analysis_gvcfs_verified):
            directory = AnalysisDirectory(sample.analysis_gvcf_path)
            is_complete = directory.is_complete
            sample.analysis_gvcfs_verified = is_complete
            sample.analysis_cram_verifyed = is_complete
            if not is_complete:
                # Print source_directory so we can attempt to process again.
                logger.warning(
                    '{0} should be examined and a new attempt made to '
                    'pre-process the gvcfs etc.'.format(
                        sample.source_directory))
        else:
            qc_directory = QcDirectory(
                os.path.join(sample.analysis_gvcf_path, 'qc'))
            qc_synced = qc_directory.is_complete
            if not qc_synced:
                logger.warning(
                    '{0} has a missing or incomplete qc directory. '
                    'Attempt to resync.'.format(sample.source_directory))
            else:
                logger.info('{0} complete.'.format(sample.source_directory))
    session.commit()
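# Sketch: after a check pass, the samples that still need attention can be
# listed with a query like the one below (illustrative only; it reuses the
# same ComputeWorkflowSample fields updated above):
#
#     session = Session()
#     for s in session.query(ComputeWorkflowSample).filter(
#             ComputeWorkflowSample.analysis_gvcfs_verified == False):
#         print(s.source_directory)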
def call_svs(app, workorders):
    Session = open_db(app.database)
    for workorder in workorders:
        session = Session()
        for sample in session.query(ComputeWorkflowSample).filter(
                ComputeWorkflowSample.source_work_order == workorder):
            if (sample.analysis_cram_verifyed
                    and sample.analysis_sv_verified is not True):
                if sample.analysis_sv_path is None:
                    sample.analysis_sv_path = os.path.join(
                        sample.analysis_cram_path, 'sv')
                directory = AnalysisDirectory(sample.analysis_gvcf_path)
                cram_file = directory.output_file_dict['*.cram'][0]
                filename = os.path.basename(cram_file)
                sample_name = filename.split('.cram')[0]
                sv_directory = AnalysisSvDirectory(sample.analysis_sv_path)
                complete = True
                if not sv_directory.staging_complete():
                    # Stage the SV working directory with symlinks to the
                    # CRAM and its index.
                    force_make_dirs(sample.analysis_sv_path)
                    force_symlink(
                        cram_file,
                        os.path.join(sample.analysis_sv_path, filename))
                    force_symlink(
                        cram_file + '.crai',
                        os.path.join(sample.analysis_sv_path,
                                     filename + '.crai'))
                # The pipeline scripts write their outputs into the current
                # working directory.
                os.chdir(sample.analysis_sv_path)
                # cnvnator runs independently of the
                # extract -> lumpy -> svtyper chain below.
                if not sv_directory.cnvnator_complete():
                    complete = False
                    print(
                        subprocess.check_output([
                            '/bin/bash',
                            '/gscuser/dlarson/src/internal-sv-pipeline/cnvnator_histogram.sh',
                            filename
                        ]))
                if not sv_directory.extract_complete():
                    complete = False
                    print(
                        subprocess.check_output([
                            '/bin/bash',
                            '/gscuser/dlarson/src/internal-sv-pipeline/extract_sv_reads.sh',
                            filename
                        ]))
                elif not sv_directory.lumpy_complete():
                    # lumpy only runs once read extraction has finished.
                    complete = False
                    subprocess.call([
                        '/bin/bash',
                        '/gscuser/dlarson/src/internal-sv-pipeline/lumpy.sh',
                        filename
                    ])
                elif not sv_directory.svtyper_complete():
                    # svtyper only runs once lumpy has finished.
                    complete = False
                    subprocess.call([
                        '/bin/bash',
                        '/gscuser/dlarson/src/internal-sv-pipeline/genotype.sh',
                        filename
                    ])
                sample.analysis_sv_verified = complete
                session.commit()
                if complete:
                    logger.info('{0} complete'.format(sample_name))
        session.close()
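# NOTE: `force_make_dirs` and `force_symlink` are assumed to be helpers
# defined elsewhere in this package. A minimal sketch of the symlink helper's
# presumed semantics (replace the link if it already exists):
#
#     import errno
#
#     def force_symlink(source, link_name):
#         try:
#             os.symlink(source, link_name)
#         except OSError as e:
#             if e.errno != errno.EEXIST:
#                 raise
#             os.remove(link_name)
#             os.symlink(source, link_name)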
def oldband(app, output_dir, workorders):
    os.environ['LSF_NO_INHERIT_ENVIRONMENT'] = 'true'
    default_job_options = {
        'memory_in_gb': 10,
        'queue': app.queue,
        'docker': 'registry.gsc.wustl.edu/genome/gatk-3.5-0-g36282e4:1',
    }
    if app.job_group is not None:
        default_job_options['group'] = app.job_group
    job_runner = LsfJob(default_job_options)
    logdir = os.path.join(output_dir, 'log')
    Session = open_db(app.database)
    cmd = OldbandandRewriteGvcfCmd(
        java='/usr/bin/java',
        max_mem='8G',
        max_stack='8G',
        gatk_jar='/opt/GenomeAnalysisTK.jar',
        reference='/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa',
        break_multiple=1000000)
    for wo in workorders:
        session = Session()
        for sample in session.query(ComputeWorkflowSample).filter(
                ComputeWorkflowSample.source_work_order == wo):
            if sample.analysis_cram_verifyed:
                cram_path = sample.analysis_cram_path
                sample_name = os.path.basename(cram_path)
                cram_file = os.path.join(cram_path,
                                         '{}.cram'.format(sample_name))
                oldband_path = os.path.join(sample.analysis_gvcf_path,
                                            'oldbanded_gvcfs')
                force_make_dirs(oldband_path)
                stdout_dir = os.path.join(logdir, sample_name)
                # One rebanding job per primary chromosome; skip outputs
                # that already exist with their index.
                for chrom in chromosomes:
                    new_gzvcf = '{0}.{1}.g.vcf.gz'.format(sample_name, chrom)
                    output_gzvcf = os.path.join(oldband_path, new_gzvcf)
                    if (not os.path.exists(output_gzvcf)
                            or not os.path.exists(output_gzvcf + '.tbi')):
                        stdout = os.path.join(stdout_dir,
                                              new_gzvcf + '.oldbanded.log')
                        cmdline = cmd(cram_file, output_gzvcf, chrom)
                        lsf_options = {'stdout': stdout}
                        job_runner.launch(cmdline, lsf_options)
                # The extended chromosomes run together as a single job,
                # launched via a self-deleting wrapper script.
                chrom_string = ' -L '.join(ext_chromosomes)
                new_gzvcf = '{0}.extChr.g.vcf.gz'.format(sample_name)
                output_gzvcf = os.path.join(oldband_path, new_gzvcf)
                if (not os.path.exists(output_gzvcf)
                        or not os.path.exists(output_gzvcf + '.tbi')):
                    script = os.path.join(oldband_path, 'oldband_extChr.sh')
                    cmdline = cmd(cram_file, output_gzvcf, chrom_string)
                    cmdline += ' && rm -f {0}'.format(script)
                    with open(script, 'w') as f:
                        f.write('#!/bin/bash\n')
                        f.write(cmdline)
                        f.write('\n')
                    stdout = os.path.join(stdout_dir,
                                          new_gzvcf + '.oldbanded.log')
                    lsf_options = {'stdout': stdout}
                    job_runner.launch('/bin/bash {0}'.format(script),
                                      lsf_options)
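# For reference, the self-deleting wrapper written above has this shape
# (the GATK command line is whatever OldbandandRewriteGvcfCmd renders; it is
# shown here only as a placeholder):
#
#     #!/bin/bash
#     <rendered OldbandandRewriteGvcfCmd command> && rm -f <oldband_path>/oldband_extChr.sh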