def test_fetch_db_data_and_prepare_gviz_json(self): pp = Project_pooling_info(dbconfig_file=self.dbconfig) temp_dir = get_temp_dir() temp_file = os.path.join(temp_dir, 'test.json') pp.fetch_db_data_and_prepare_gviz_json(output_file_path=temp_file) self.assertTrue(os.path.exists(temp_file)) remove_dir(temp_dir)
def tearDown(self): remove_dir(dir_path=self.results_dir) if os.path.exists(self.output_tar_file): os.remove(self.output_tar_file) if os.path.exists(self.output_targz_file): os.remove(self.output_targz_file)
def extract_cellranger_count_metrics_summary( cellranger_tar, collection_name=None, collection_type=None, attribute_name='attribute_name', attribute_value='attribute_value', attribute_prefix=None, target_filename='metrics_summary.csv'): ''' A function for extracting the metrics summary file from a cellranger output tar and parsing it. Optionally it can add the collection name and type info to the output dictionary. :param cellranger_tar: A cellranger output tar file :param target_filename: A filename for metrics summary file lookup, default metrics_summary.csv :param collection_name: Optional collection name, default None :param collection_type: Optional collection type, default None :param attribute_name: Output column name for attribute names, default attribute_name :param attribute_value: Output column name for attribute values, default attribute_value :param attribute_prefix: An optional string to add as prefix of the attribute names, default None :returns: A list of dictionaries containing the metrics values ''' try: check_file_path(cellranger_tar) temp_work_dir = get_temp_dir(use_ephemeral_space=False) metrics_file = None with tarfile.open(cellranger_tar, mode='r') as tar: for file_name in tar.getnames(): if os.path.basename(file_name) == target_filename: tar.extract(file_name, path=temp_work_dir) metrics_file = os.path.join(temp_work_dir, file_name) if metrics_file is None: raise IOError('Required file {0} not found in tar {1}'.\ format(target_filename,cellranger_tar)) attribute_data = pd.read_csv(metrics_file).T.\ reset_index() attribute_data.columns = [attribute_name, attribute_value] if attribute_prefix is None: attribute_data[attribute_name] = \ attribute_data[attribute_name].\ map(lambda x: x.replace(' ','_')) else: attribute_data[attribute_name] = \ attribute_data[attribute_name].\ map(lambda x: \ '{0}_{1}'.format(\ attribute_prefix, x.replace(' ','_'))) if collection_name is not None: attribute_data['name'] = collection_name if collection_type is not None: attribute_data['type'] = collection_type attribute_data = attribute_data.\ to_dict(orient='records') remove_dir(temp_work_dir) return attribute_data except: raise
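# A minimal usage sketch for extract_cellranger_count_metrics_summary; the tar path
# and collection values below are hypothetical placeholders.
metrics = \
  extract_cellranger_count_metrics_summary(
    cellranger_tar='/path/to/cellranger_output.tar',
    collection_name='IGF_EXP_001',
    collection_type='CELLRANGER_METRICS',
    attribute_prefix='cellranger')
# metrics is a list of records, e.g.
# [{'attribute_name': 'cellranger_Estimated_Number_of_Cells',
#   'attribute_value': '5,000',
#   'name': 'IGF_EXP_001', 'type': 'CELLRANGER_METRICS'}, ...]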
def generate_ipynb_from_template(template_ipynb_path, output_dir, param_dictionary, date_tag='date_tag', use_ephemeral_space=False): ''' A function for generating a notebook IPYNB file from a template file with param substitution :param template_ipynb_path: A template IPYNB file path :param output_dir: Output path :param param_dictionary: A dictionary containing the params for final notebook :param date_tag: A text for date tag name, default date_tag :param use_ephemeral_space: Toggle for using ephemeral space for temp dir, default False :returns: None ''' try: check_file_path(template_ipynb_path) check_file_path(output_dir) if not isinstance(param_dictionary, dict): raise TypeError( "Expecting a dictionary, got {0}".\ format(type(param_dictionary))) date_tag_value = \ datetime.\ strftime( datetime.now(), '%Y-%b-%d %H:%M') # date tag value param_dictionary.\ update({date_tag: date_tag_value}) # adding date tag value to params under the configured tag name temp_dir = \ get_temp_dir( use_ephemeral_space=use_ephemeral_space) temp_output = \ os.path.join( temp_dir, os.path.basename(template_ipynb_path)) final_output = \ os.path.join( output_dir, os.path.basename(template_ipynb_path)) template_env = \ Environment( loader=\ FileSystemLoader( searchpath=os.path.dirname(template_ipynb_path)), autoescape=select_autoescape(['html', 'xml'])) notebook = \ template_env.\ get_template( os.path.basename(template_ipynb_path)) notebook.\ stream(**param_dictionary).\ dump(temp_output) # write temp ipynb file with param substitution copy_local_file(temp_output, final_output) remove_dir(temp_dir) except Exception as e: raise ValueError( "Failed to generate ipynb file from template {0}, error: {1}".\ format(template_ipynb_path,e))
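# A usage sketch for generate_ipynb_from_template; the template path, output dir
# and params are hypothetical placeholders. The function fills in the date_tag
# key itself before rendering the Jinja2 template.
generate_ipynb_from_template(
  template_ipynb_path='/path/to/template.ipynb',
  output_dir='/path/to/reports',
  param_dictionary={'project_name': 'ProjectA', 'genome_build': 'HG38'},
  use_ephemeral_space=False)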
def run_HaplotypeCaller(self, input_bam, output_vcf_path, dbsnp_vcf, emit_gvcf=True, force=False, dry_run=False, gatk_param_list=None): ''' A method for running GATK HaplotypeCaller :param input_bam: An input bam file :param output_vcf_path: An output vcf filepath :param dbsnp_vcf: A dbsnp vcf file :param emit_gvcf: A toggle for GVCF generation, default True :param force: Overwrite output file, if force is True :param dry_run: Return GATK command without running it, default False :param gatk_param_list: List of additional params for HaplotypeCaller, default None :returns: GATK commandline ''' try: self._run_gatk_checks() # run initial checks check_file_path(input_bam) check_file_path(dbsnp_vcf) temp_dir = \ get_temp_dir(use_ephemeral_space=self.use_ephemeral_space) # get temp dir temp_output = \ os.path.join( temp_dir, os.path.basename(output_vcf_path)) gatk_cmd = [ quote(self.gatk_exe), "HaplotypeCaller", "-I", quote(input_bam), "-O", quote(temp_output), "--reference", quote(self.ref_fasta), "--dbsnp", quote(dbsnp_vcf), "--java-options", quote(self.java_param) ] if emit_gvcf: gatk_cmd.extend(["--emit-ref-confidence", "GVCF"]) if gatk_param_list is not None and \ isinstance(gatk_param_list,list) and \ len(gatk_param_list) > 0: gatk_cmd.extend(gatk_param_list) # additional params gatk_cmd = ' '.join(gatk_cmd) if dry_run: return gatk_cmd subprocess.check_call(gatk_cmd, shell=True) copy_local_file(source_path=temp_output, destinationa_path=output_vcf_path, force=force) remove_dir(temp_dir) return gatk_cmd except Exception as e: raise ValueError( "Failed to run GATK HaplotypeCaller, error: {0}".\ format(e))
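# A dry-run sketch for run_HaplotypeCaller, assuming `gatk` is an instance of the
# enclosing GATK wrapper class (its constructor is not shown in this excerpt);
# all file paths are placeholders and must exist, as the method checks them first.
gatk_cmd = \
  gatk.run_HaplotypeCaller(
    input_bam='/path/to/sample.bam',
    output_vcf_path='/path/to/sample.g.vcf.gz',
    dbsnp_vcf='/path/to/dbsnp.vcf.gz',
    emit_gvcf=True,
    dry_run=True)  # returns the GATK command line without running it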
def singularity_run(image_path, path_bind, args_list, container_dir='/tmp', return_results=True, use_ephemeral_space=False, dry_run=False): ''' A wrapper function for running singularity based containers :param image_path: Singularity image path :param path_bind: Path to bind to the singularity container_dir :param container_dir: Container path for binding path_bind, default /tmp :param args_list: List of args for singularity run :param return_results: Return singularity run results, default True :param use_ephemeral_space: Toggle for using ephemeral space for temp dir, default False :param dry_run: Return the singularity command without run, default False :returns: A response from container run and a string containing singularity command line ''' try: check_file_path(image_path) check_file_path(path_bind) temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space) res = None temp_image_path = \ os.path.join( temp_dir, os.path.basename(image_path)) copy_local_file(image_path, temp_image_path) # copy image to tmp dir if not isinstance(args_list, list) or \ len(args_list) == 0: raise ValueError( 'No args provided for singularity run') # safemode args = ' '.join(args_list) # flatten args singularity_run_cmd = \ 'singularity run {0} --bind {1}:{2} {3}'.\ format( temp_image_path, path_bind, container_dir, args) if dry_run: return res, singularity_run_cmd else: res = \ Client.run( image=temp_image_path, bind='{0}:{1}'.format(path_bind,container_dir), args=args, return_result=return_results) remove_dir(temp_dir) # remove copied image after run return res, singularity_run_cmd except Exception as e: raise ValueError( 'Failed to run image {0}, error: {1}'.\ format(image_path,e))
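# A minimal sketch for singularity_run; the image and bind paths are placeholders.
# With dry_run=True the container is not executed and res stays None.
res, run_cmd = \
  singularity_run(
    image_path='/path/to/tool.sif',
    path_bind='/path/to/data',
    args_list=['tool', '--version'],
    dry_run=True)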
def run_AnalyzeCovariates(self, before_report_file, after_report_file, output_pdf_path, force=False, dry_run=False, gatk_param_list=None): ''' A method for running GATK AnalyzeCovariates tool :param before_report_file: A file containing bqsr output before recalibration :param after_report_file: A file containing bqsr output after recalibration :param output_pdf_path: An output pdf filepath :param force: Overwrite output file, if force is True :param dry_run: Return GATK command without running it, default False :param gatk_param_list: List of additional params for AnalyzeCovariates, default None :returns: GATK commandline ''' try: self._run_gatk_checks() # run initial checks check_file_path(before_report_file) check_file_path(after_report_file) temp_dir = \ get_temp_dir(use_ephemeral_space=self.use_ephemeral_space) # get temp dir temp_output = \ os.path.join( temp_dir, os.path.basename(output_pdf_path)) gatk_cmd = [ quote(self.gatk_exe), "AnalyzeCovariates", "--before-report-file", quote(before_report_file), "--after-report-file", quote(after_report_file), "--plots-report-file", quote(temp_output), "--java-options", quote(self.java_param) ] if gatk_param_list is not None and \ isinstance(gatk_param_list,list) and \ len(gatk_param_list) > 0: gatk_cmd.extend(gatk_param_list) # additional params gatk_cmd = ' '.join(gatk_cmd) if dry_run: return gatk_cmd subprocess.check_call(gatk_cmd, shell=True) copy_local_file(source_path=temp_output, destinationa_path=output_pdf_path, force=force) remove_dir(temp_dir) return gatk_cmd except Exception as e: raise ValueError( "Failed to run GATK AnalyzeCovariates, error: {0}".\ format(e))
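# A dry-run sketch for run_AnalyzeCovariates, again assuming `gatk` is an instance
# of the enclosing GATK wrapper class; the report and output paths are placeholders.
gatk_cmd = \
  gatk.run_AnalyzeCovariates(
    before_report_file='/path/to/bqsr_before.table',
    after_report_file='/path/to/bqsr_after.table',
    output_pdf_path='/path/to/bqsr_plots.pdf',
    dry_run=True)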
def run(self): try: project_igf_id = self.param_required('project_igf_id') sample_igf_id = self.param_required('sample_igf_id') remote_project_path = self.param_required('remote_project_path') igf_session_class = self.param_required('igf_session_class') remote_user = self.param_required('remote_user') remote_host = self.param_required('remote_host') status_data_json = self.param('status_data_json') demultiplexing_pipeline_name = self.param_required('demultiplexing_pipeline_name') analysis_pipeline_name = self.param_required('analysis_pipeline_name') use_ephemeral_space = self.param('use_ephemeral_space') temp_work_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space) # get a temp dir ps = \ Project_status(\ igf_session_class=igf_session_class, project_igf_id=project_igf_id) temp_status_output = \ os.path.join(\ temp_work_dir, status_data_json) # get path for temp status file remote_project_dir = \ os.path.join(\ remote_project_path, project_igf_id) # get remote project directory path ps.generate_gviz_json_file(\ output_file=temp_status_output, demultiplexing_pipeline=demultiplexing_pipeline_name, analysis_pipeline=analysis_pipeline_name) # write data to output json file remote_file_path = \ os.path.join(\ remote_project_dir, status_data_json) self._check_and_copy_remote_file(\ remote_user=remote_user, remote_host=remote_host, source_file=temp_status_output, remote_file=remote_file_path) # copy file to remote self.param('dataflow_params', {'remote_project_info':'done'}) remove_dir(temp_work_dir) # remove temp dir except Exception as e: message = \ 'project: {2}, sample:{3}, Error in {0}: {1}'.\ format(\ self.__class__.__name__, e, project_igf_id, sample_igf_id) self.warning(message) self.post_message_to_slack(message,reaction='fail') # post msg to slack for failed jobs raise
def _notify_about_new_user_account(self,data,user_col='username',\ password_col='password',hpc_user_col='hpc_username',\ name_col='name',email_id_col='email_id'): ''' An internal method for sending mail to a new user with their password :param data: A pandas series containing user data :param user_col: Column name for username, default username :param password_col: Column name for password, default password :param hpc_user_col: Column name for hpc_username, default hpc_username :param name_col: Column name for name, default name :param email_id_col: Column name for email id, default email_id ''' try: if not isinstance(data, pd.Series): raise ValueError('Expecting a pandas series and got {0}'.\ format(type(data))) username = data[user_col] fullname = data[name_col] password = data[password_col] email_id = data[email_id_col] if hpc_user_col not in data or pd.isnull( data[hpc_user_col]): # send email only to non-hpc users template_dir = os.path.dirname(self.user_account_template) template_env=Environment(loader=FileSystemLoader(searchpath=template_dir), \ autoescape=select_autoescape(['html','xml'])) # set template env template_file=template_env.\ get_template(os.path.basename(self.user_account_template)) temp_work_dir = get_temp_dir() # get a temp dir report_output_file = os.path.join(temp_work_dir, 'email_template.txt') template_file.\ stream(userEmail=email_id, \ fullName=fullname,\ userName=username,\ userPass=password,\ ).\ dump(report_output_file) read_cmd = ['cat', quote(report_output_file)] proc = subprocess.Popen(read_cmd, stdout=subprocess.PIPE) sendmail_cmd = [self.sendmail_exe, '-t'] subprocess.check_call(sendmail_cmd, stdin=proc.stdout) proc.stdout.close() if proc.wait() != 0: raise ValueError('Failed running command {0}:{1}'.format(read_cmd,\ proc.returncode)) remove_dir(temp_work_dir) except: raise
def run(self): try: seqrun_igf_id = self.param_required('seqrun_igf_id') path = self.param_required('path') cleanup_status = self.param_required('cleanup_status') message = None if cleanup_status: if not os.path.exists(path): raise IOError('path {0} is not accessible'.format(path)) if os.path.isdir(path): remove_dir(path) message = 'removed dir {0}'.format(path) elif os.path.isfile(path): os.remove(path) message = 'removed file {0}'.format(path) else: message = 'path {0} is not file or directory, skipped removing'.\ format(path) else: message = 'Not removing path {0} as cleanup_status is not True'.\ format(path) self.param('dataflow_params', { 'path': path, 'cleanup_status': cleanup_status }) # set dataflow params if message: self.post_message_to_slack( message, reaction='pass') # send msg to slack self.comment_asana_task(task_name=seqrun_igf_id, comment=message) # send msg to asana except Exception as e: message = \ 'seqrun: {2}, Error in {0}: {1}'.\ format(\ self.__class__.__name__, e, seqrun_igf_id) self.warning(message) self.post_message_to_slack( message, reaction='fail') # post msg to slack for failed jobs raise
def validation_home(): form = ValidationForm() if form.validate_on_submit(): temp_dir = get_temp_dir() new_metadata_list = list() counter = 0 for file in form.metadata_file.data: counter += 1 filename = secure_filename(file.filename) file.save(\ os.path.join(\ temp_dir, '{0}_{1}'.format(counter,filename))) new_metadata_list.\ append(\ os.path.join(\ temp_dir, '{0}_{1}'.format(counter,filename))) samplesheet_filename = \ secure_filename(form.samplesheet_file.data.filename) form.samplesheet_file.\ data.save(\ os.path.join(\ temp_dir, samplesheet_filename)) new_samplesheet = \ os.path.join(\ temp_dir, samplesheet_filename) logging.warning(form.recaptcha.errors) vp = \ Validate_project_and_samplesheet_metadata(\ samplesheet_file=new_samplesheet, metadata_files=new_metadata_list, samplesheet_schema=app.config.get('SAMPLESHEET_SCHEMA'), metadata_schema=app.config.get('METADATA_SCHEMA')) json_data = vp.convert_errors_to_gviz() remove_dir(temp_dir) return render_template('validation/results.html', jsonData=json_data) else: if request.method == 'POST': flash('Failed input validation check') return render_template('validation/validate_metadata.html', form=form)
def metadata_home(): try: csv_data = '' form = MetadataForm() if form.validate_on_submit(): temp_dir = get_temp_dir() metadata_filename = \ secure_filename(form.metadata_file.data.filename) form.metadata_file.\ data.save(\ os.path.join(\ temp_dir, metadata_filename)) new_metadata_file = \ os.path.join(\ temp_dir, metadata_filename) try: csv_data = \ run_metadata_reformatting(\ metadata_file=new_metadata_file,\ output_dir=temp_dir) except Exception as e: flash('Failed metadata file reformatting') logging.warning(e) remove_dir(temp_dir) if csv_data != '': return \ Response(\ csv_data, mimetype="text/csv", headers={"Content-disposition": "attachment; filename=reformatted_metadata.csv"}) else: if request.method == 'POST': flash('Failed file type validation check') except Exception as e: logging.warning('Failed metadata reformatting, error: {0}'.format(e)) return render_template('metadata/metadata_reformat.html', form=form)
def run_ppqt(self, input_bam, output_dir, output_spp_name, output_pdf_name): ''' A method for running PPQT on input bam :param input_bam: Input bam file :param output_spp_name: Output spp out file :param output_pdf_name: Output pdf plot :param output_dir: Destination output dir :returns: PPQT run command as list, spp and pdf output paths, and a list or dictionary for the spp.out metrics ''' try: temp_dir = \ get_temp_dir(use_ephemeral_space=self.use_ephemeral_space) run_cmd = \ self._pre_process(\ input_bam=input_bam, output_spp_name=output_spp_name, output_pdf_name=output_pdf_name, output_dir=temp_dir, temp_dir=temp_dir) # preprocess and fetch run cmd subprocess.check_call(\ ' '.join(run_cmd), shell=True) # run ppqt and capture stdout spp_output, pdf_output = \ self._post_process(\ output_spp_name=output_spp_name, output_pdf_name=output_pdf_name, output_dir=output_dir, temp_dir=temp_dir) # copy files from temp dir remove_dir(temp_dir) # clean up temp dir spp_data = self._parse_spp_output(spp_file=spp_output) return run_cmd, spp_output, pdf_output, spp_data except: raise
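# A usage sketch for run_ppqt, assuming `ppqt` is an instance of the enclosing
# PPQT wrapper class; paths and output names are placeholders.
run_cmd, spp_output, pdf_output, spp_data = \
  ppqt.run_ppqt(
    input_bam='/path/to/sample.bam',
    output_dir='/path/to/results',
    output_spp_name='sample.spp.out',
    output_pdf_name='sample.pdf')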
def merge_multiple_bam(samtools_exe, input_bam_list, output_bam_path, sorted_by_name=False, use_ephemeral_space=0, threads=1, force=False, dry_run=False, index_output=True): ''' A function for merging multiple input bams to a single output bam :param samtools_exe: samtools executable path :param input_bam_list: A file containing list of bam filepath :param output_bam_path: A bam output filepath :param sorted_by_name: Sort bam file by read_name, default False (for coordinate sorted bams) :param threads: Number of threads to use for merging, default 1 :param force: Output bam file will be overwritten if force is True, default False :param index_output: Index output bam, default True :param use_ephemeral_space: A toggle for temp dir settings, default 0 :param dry_run: A toggle for returning the samtools command without actually running it, default False :return: samtools command ''' try: check_file_path(samtools_exe) check_file_path(input_bam_list) with open(input_bam_list, 'r') as fp: for bam in fp: check_file_path(bam.strip()) temp_dir = \ get_temp_dir(use_ephemeral_space=use_ephemeral_space) temp_bam = \ os.path.join(\ temp_dir, os.path.basename(output_bam_path)) merge_cmd = \ [quote(samtools_exe), 'merge', '--output-fmt','BAM', '--threads',quote(str(threads)), '-b',quote(input_bam_list) ] if sorted_by_name: merge_cmd.append('-n') # Input files are sorted by read name merge_cmd.append(temp_bam) if dry_run: return merge_cmd subprocess.check_call(merge_cmd) # run samtools merge copy_local_file(\ source_path=temp_bam, destinationa_path=output_bam_path, force=force) # copy bamfile remove_dir(temp_dir) # remove temp dir _check_bam_file(output_bam_path) if index_output and \ not sorted_by_name: index_bam_or_cram(\ samtools_exe=samtools_exe, input_path=output_bam_path, threads=threads) return merge_cmd except: raise
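# A dry-run sketch for merge_multiple_bam; the list file is a placeholder and
# should contain one existing bam path per line.
merge_cmd = \
  merge_multiple_bam(
    samtools_exe='/path/to/samtools',
    input_bam_list='/path/to/bam_list.txt',
    output_bam_path='/path/to/merged.bam',
    threads=4,
    dry_run=True)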
def run_sort_bam(samtools_exe, input_bam_path, output_bam_path, sort_by_name=False, use_ephemeral_space=0, threads=1, force=False, dry_run=False, cram_out=False, index_output=True): ''' A function for sorting an input bam file and generating an output bam :param samtools_exe: samtools executable path :param input_bam_path: A bam filepath :param output_bam_path: A bam output filepath :param sort_by_name: Sort bam file by read_name, default False (for coordinate sorting) :param threads: Number of threads to use for sorting, default 1 :param force: Output bam file will be overwritten if force is True, default False :param cram_out: Output cram file, default False :param index_output: Index output bam, default True :param use_ephemeral_space: A toggle for temp dir settings, default 0 :param dry_run: A toggle for returning the samtools command without actually running it, default False :return: samtools command as list for dry runs, otherwise None ''' try: check_file_path(samtools_exe) _check_bam_file(bam_file=input_bam_path) sort_cmd = \ [quote(samtools_exe), 'sort', '-@{0}'.format(quote(str(threads))) ] if sort_by_name: sort_cmd.append('-n') # sorting by read name if cram_out: sort_cmd.append('--output-fmt CRAM') else: sort_cmd.append('--output-fmt BAM') temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space) temp_bam = \ os.path.join(\ temp_dir, os.path.basename(output_bam_path)) sort_cmd.extend(['-o', quote(temp_bam)]) sort_cmd.append(quote(input_bam_path)) if dry_run: return sort_cmd subprocess.check_call(\ ' '.join(sort_cmd), shell=True) # run samtools sort copy_local_file(\ source_path=temp_bam, destinationa_path=output_bam_path, force=force) # copy output bam remove_dir(temp_dir) # remove temp dir if cram_out: _check_cram_file(output_bam_path) else: _check_bam_file(output_bam_path) if index_output: index_bam_or_cram(\ samtools_exe=samtools_exe, input_path=output_bam_path, threads=threads) except: raise
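# A dry-run sketch for run_sort_bam producing a coordinate-sorted bam; all paths
# are placeholders.
sort_cmd = \
  run_sort_bam(
    samtools_exe='/path/to/samtools',
    input_bam_path='/path/to/sample.bam',
    output_bam_path='/path/to/sample.sorted.bam',
    threads=4,
    dry_run=True)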
time_tuple.tm_mday, time_tuple.tm_hour, time_tuple.tm_min, time_tuple.tm_sec) file_name = \ 'samplesheet_metadata_check_failed_{0}.txt'.\ format(time_stamp) file_name = os.path.join(msg_tmp_dir, file_name) with open(file_name, 'w') as fp: fp.write(message) # write message file for slack message = 'samplesheet metadata check message : {0}'.format( time_stamp) slack_obj.post_file_to_channel( filepath=file_name, message=message ) # post samplesheet metadata check results to slack remove_dir(msg_tmp_dir) # remove temp dir if len(new_seqruns.keys()) > 0: temp_dir = get_temp_dir() # create temp dir new_seqruns,error_files = \ validate_samplesheet_for_seqrun( seqrun_info=new_seqruns, schema_json=samplesheet_json_schema, output_dir=temp_dir)# validate samplesheet for seqruns if len(error_files.keys()) > 0: for seqrun_name, error_file_path in error_files.items(): message = \ 'Samplesheet validation failed for run {0}'.\ format(seqrun_name) slack_obj.post_file_to_channel( filepath=error_file_path,
def run(self): try: seqrun_igf_id = self.param_required('seqrun_igf_id') demultiplexing_stats_file = self.param_required('demultiplexing_stats_file') qc_files = self.param_required('qc_files') fastq_dir = self.param_required('fastq_dir') multiqc_exe = self.param('multiqc_exe') multiqc_options = self.param('multiqc_options') multiqc_dir_label = self.param('multiqc_dir_label') force_overwrite = self.param('force_overwrite') base_results_dir = self.param_required('base_results_dir') project_name = self.param_required('project_name') seqrun_date = self.param_required('seqrun_date') flowcell_id = self.param_required('flowcell_id') tag = self.param_required('tag') multiqc_template_file = self.param_required('multiqc_template_file') tool_order_list = self.param('tool_order_list') model_name = self.param('model_name') use_ephemeral_space = self.param('use_ephemeral_space') if tag not in ['known','undetermined']: raise ValueError('unknown status tag {0}'.format(tag)) # check valid status tags lane_index_info = os.path.basename(fastq_dir) # get lane and index info fastqc_files = list() fastqscreen_files = list() fastqc_files.\ extend([fqc_dir for fqc_dir in qc_files['fastqc']]) fastqscreen_files.\ extend([fsr_dir for fsr_dir in qc_files['fastqscreen']]) multiqc_result_dir = \ os.path.join(\ base_results_dir, project_name, seqrun_date, flowcell_id, lane_index_info, tag, multiqc_dir_label) # get multiqc final output path if os.path.exists(multiqc_result_dir) and \ force_overwrite: remove_dir(multiqc_result_dir) # remove existing output dir if force_overwrite is true if not os.path.exists(multiqc_result_dir): os.makedirs(multiqc_result_dir,mode=0o775) # create output dir if it's not present temp_work_dir = \ get_temp_dir(use_ephemeral_space=use_ephemeral_space) # get a temp work dir multiqc_input_list = \ os.path.join(\ temp_work_dir, 'multiqc_input_file.txt') # get name of multiqc input file demultiplexing_stats_file = \ os.path.join(\ fastq_dir, demultiplexing_stats_file) with open(multiqc_input_list,'w') as multiqc_input_file: # writing multiqc input if not os.path.exists(demultiplexing_stats_file): raise IOError('demultiplexing stats file {0} not found'.\ format(demultiplexing_stats_file)) # check demultiplexing stats file multiqc_input_file.write('{}\n'.format(demultiplexing_stats_file)) # add demultiplexing stat to list for fastqc_file in fastqc_files: fastqc_zip = fastqc_file['fastqc_zip'] if not os.path.exists(fastqc_zip): raise IOError('fastqc file {0} not found'.\ format(fastqc_zip)) # check fastqc file multiqc_input_file.write('{}\n'.format(fastqc_zip)) # add fastqc file to list for fastqscreen_file in fastqscreen_files: fastqscreen_stat = fastqscreen_file['fastqscreen_stat'] if not os.path.exists(fastqscreen_stat): raise IOError('fastqscreen file {0} not found'.\ format(fastqscreen_stat)) # check fastqscreen file multiqc_input_file.write('{}\n'.format(fastqscreen_stat)) # add fastqscreen file to list multiqc_report_title = \ 'Project:{0},Sequencing_date:{1},Flowcell_lane:{2},status:{3}'.\ format( project_name, seqrun_date, lane_index_info, tag) # get multiqc report title and filename multiqc_param = self.format_tool_options(multiqc_options) # format multiqc params date_stamp = datetime.now().strftime('%d-%b-%Y %H:%M:%S') check_file_path(multiqc_template_file) multiqc_conf_file = \ os.path.join( temp_work_dir, os.path.basename(multiqc_template_file)) template_env = \ Environment(\ loader=\ FileSystemLoader( searchpath=os.path.dirname(multiqc_template_file)), 
autoescape=select_autoescape(['html', 'xml'])) multiqc_conf = \ template_env.\ get_template(os.path.basename(multiqc_template_file)) multiqc_conf.\ stream(\ project_igf_id=project_name, flowcell_id=flowcell_id, platform_name=model_name, tag_name='{0} {1}'.format(lane_index_info,tag), date_stamp=date_stamp, tool_order_list=tool_order_list).\ dump(multiqc_conf_file) multiqc_cmd = \ [multiqc_exe, '--file-list',quote(multiqc_input_list), '--outdir',temp_work_dir, '--title',quote(multiqc_report_title), '--config',quote(multiqc_conf_file) ] # multiqc base parameters multiqc_cmd.extend(multiqc_param) # add additional parameters subprocess.check_call(' '.join(multiqc_cmd),shell=True) # run multiqc multiqc_html = None multiqc_data = None for root, _,files in os.walk(top=temp_work_dir): for file in files: if fnmatch.fnmatch(file, '*.html'): copy2(os.path.join(root,file),multiqc_result_dir) multiqc_html = os.path.join(multiqc_result_dir,file) # get multiqc html path elif fnmatch.fnmatch(file, '*.zip'): copy2(os.path.join(root,file),multiqc_result_dir) multiqc_data = os.path.join(multiqc_result_dir,file) # get multiqc data path self.param('dataflow_params', {'multiqc_html':multiqc_html, 'multiqc_data':multiqc_data, 'lane_index_info':lane_index_info}) except Exception as e: message = \ 'seqrun: {2}, Error in {0}: {1}'.\ format( self.__class__.__name__, e, seqrun_igf_id) self.warning(message) self.post_message_to_slack(message,reaction='fail') # post msg to slack for failed jobs raise
def run(self): ''' A method for running picard commands :param project_igf_id: A project igf id :param sample_igf_id: A sample igf id :param experiment_igf_id: An experiment igf id :param igf_session_class: A database session class :param reference_type: Reference genome collection type, default GENOME_FASTA :param reference_refFlat: Reference genome collection type, default GENE_REFFLAT :param ribosomal_interval_type: Collection type for ribosomal interval list, default RIBOSOMAL_INTERVAL :param species_name: Species name tag used for reference genome lookup :param java_exe: Java path :param java_param: Java run parameters :param picard_jar: Picard jar path :param picard_command: Picard command :param base_work_dir: Base work directory :param copy_input: A toggle for copying input file to temp, 1 for True, default 0 for False :param use_ephemeral_space: A toggle for temp dir setting, default 0 :param patterned_flowcell_list: A list of patterned flowcells, default ['HISEQ4000','NEXTSEQ'] ''' try: temp_output_dir = False project_igf_id = self.param_required('project_igf_id') experiment_igf_id = self.param_required('experiment_igf_id') sample_igf_id = self.param_required('sample_igf_id') java_exe = self.param_required('java_exe') java_param = self.param_required('java_param') picard_jar = self.param_required('picard_jar') input_files = self.param_required('input_files') picard_command = self.param_required('picard_command') igf_session_class = self.param_required('igf_session_class') species_name = self.param('species_name') reference_type = self.param('reference_type') reference_refFlat = self.param('reference_refFlat') ribosomal_interval_type = self.param('ribosomal_interval_type') base_work_dir = self.param_required('base_work_dir') analysis_files = self.param_required('analysis_files') picard_option = self.param('picard_option') patterned_flowcell_list = self.param('patterned_flowcell_list') platform_name = self.param_required('platform_name') output_prefix = self.param('output_prefix') load_metrics_to_cram = self.param('load_metrics_to_cram') cram_collection_type = self.param('cram_collection_type') seed_date_stamp = self.param_required('date_stamp') use_ephemeral_space = self.param('use_ephemeral_space') seed_date_stamp = get_datestamp_label(seed_date_stamp) if output_prefix is not None: output_prefix = \ '{0}_{1}'.\ format( output_prefix, seed_date_stamp) # adding seed datestamp to output prefix work_dir_prefix = \ os.path.join( base_work_dir, project_igf_id, sample_igf_id, experiment_igf_id) work_dir = \ self.get_job_work_dir(work_dir=work_dir_prefix) # get a run work dir temp_output_dir = \ get_temp_dir(use_ephemeral_space=use_ephemeral_space) # get temp work dir ref_genome = \ Reference_genome_utils( genome_tag=species_name, dbsession_class=igf_session_class, genome_fasta_type=reference_type, gene_reflat_type=reference_refFlat, ribosomal_interval_type=ribosomal_interval_type) # setup ref genome utils genome_fasta = ref_genome.get_genome_fasta() # get genome fasta ref_flat_file = ref_genome.get_gene_reflat() # get refFlat file ribosomal_interval_file = ref_genome.get_ribosomal_interval() # get ribosomal interval file patterned_flowcell = False if platform_name in patterned_flowcell_list: # check for patterned flowcell patterned_flowcell = True if load_metrics_to_cram and \ not cram_collection_type: raise ValueError( 'Cram file collection type is required for loading picard metrics to db' ) picard=\ Picard_tools(\ java_exe=java_exe, java_param=java_param, picard_jar=picard_jar, input_files=input_files, 
output_dir=temp_output_dir, ref_fasta=genome_fasta, patterned_flowcell=patterned_flowcell, ref_flat_file=ref_flat_file, picard_option=picard_option, output_prefix=output_prefix, use_ephemeral_space=use_ephemeral_space, ribisomal_interval=ribosomal_interval_file) # setup picard tool temp_output_files,picard_command_line,picard_metrics = \ picard.run_picard_command(command_name=picard_command) # run picard command output_file_list = list() for source_path in temp_output_files: dest_path=\ os.path.join( work_dir, os.path.basename(source_path)) # get destination filepath move_file(source_path=source_path, destinationa_path=dest_path, force=True) # move files to work dir output_file_list.append(dest_path) remove_dir(temp_output_dir) analysis_files.extend(output_file_list) bam_files = list() for file in output_file_list: if file.endswith('.bam'): bam_files.append(file) if load_metrics_to_cram and \ len(picard_metrics)>0: ca = CollectionAdaptor(**{'session_class': igf_session_class}) attribute_data = \ ca.prepare_data_for_collection_attribute( collection_name=experiment_igf_id, collection_type=cram_collection_type, data_list=picard_metrics) # format data for collection attribute table ca.start_session() try: ca.create_or_update_collection_attributes(\ data=attribute_data, autosave=False ) # load data to collection attribute table ca.commit_session() ca.close_session() except: ca.rollback_session() ca.close_session() raise self.param( 'dataflow_params', { 'analysis_files': analysis_files, 'bam_files': bam_files, 'seed_date_stamp': seed_date_stamp }) # pass on picard output list message = \ 'finished picard {0} for {1} {2}'.\ format( picard_command, project_igf_id, sample_igf_id) self.post_message_to_slack(message, reaction='pass') # send log to slack message = \ 'Picard {0} command: {1}'.\ format( picard_command, picard_command_line) #self.comment_asana_task(task_name=project_igf_id, comment=message) # send commandline to Asana except Exception as e: if temp_output_dir and \ os.path.exists(temp_output_dir): remove_dir(temp_output_dir) message = \ 'project: {2}, sample:{3}, Error in {0}: {1}'.\ format( self.__class__.__name__, e, project_igf_id, sample_igf_id) self.warning(message) self.post_message_to_slack( message, reaction='fail') # post msg to slack for failed jobs raise
def run(self): try: seqrun_igf_id = self.param_required('seqrun_igf_id') seqrun_date = self.param_required('seqrun_date') flowcell_id = self.param_required('flowcell_id') flowcell_lane = self.param_required('flowcell_lane') project_name = self.param_required('project_name') index_length = self.param_required('index_length') seqrun_local_dir = self.param_required('seqrun_local_dir') bases_mask = self.param_required('basesmask') base_work_dir = self.param_required('base_work_dir') base_fastq_dir = self.param_required('base_fastq_dir') samplesheet_file = self.param_required('samplesheet') bcl2fastq_exe = self.param_required('bcl2fastq_exe') runinfo_filename = self.param('runinfo_filename') bcl2fastq_options = self.param('bcl2fastq_options') singlecell_options = self.param_required('singlecell_options') singlecell_tag = self.param('singlecell_tag') force_overwrite = self.param('force_overwrite') fastq_dir_label = self.param('fastq_dir_label') samplesheet_filename = self.param('samplesheet_filename') use_ephemeral_space = self.param('use_ephemeral_space') model_name = self.param('model_name') reset_mask_short_adapter_reads = self.param( 'reset_mask_short_adapter_reads') project_type = '' # default single cell status is empty seqrun_dir = os.path.join(seqrun_local_dir, seqrun_igf_id) # local seqrun dir runinfo_file = os.path.join( seqrun_dir, runinfo_filename) # seqrun runinfo file if not os.path.exists(samplesheet_file): raise IOError('samplesheet file {0} not found'.\ format(samplesheet_file)) samplesheet_sc = SampleSheet( infile=samplesheet_file ) # read samplesheet for single cell check samplesheet_sc.\ filter_sample_data(\ condition_key='Description', condition_value=singlecell_tag, method='include') if len(samplesheet_sc._data) > 0: project_type = singlecell_tag # set single cell status as true if it's present in samplesheet if not os.path.exists(runinfo_file): raise IOError('Runinfo file {0} not found'.\ format(runinfo_file)) lane_index = '{0}_{1}'.format( flowcell_lane, index_length) # get label for lane and index length output_dir_label = \ os.path.join( project_name, fastq_dir_label, seqrun_date, flowcell_id, lane_index) # output dir label output_fastq_dir = \ os.path.join(base_fastq_dir,output_dir_label) # output fastq dir if os.path.exists(output_fastq_dir) and force_overwrite: remove_dir(output_fastq_dir) # remove fastq directory if it's already present message = \ 'started fastq conversion for {0}, {1} : {2}_{3}'.\ format( seqrun_igf_id, project_name, flowcell_lane, index_length) self.post_message_to_slack(message, reaction='pass') # send log to slack seqrun_temp_dir = \ get_temp_dir(use_ephemeral_space=use_ephemeral_space) # create a new input directory in TMPDIR move_file = \ moveBclFilesForDemultiplexing(\ input_dir=seqrun_dir, output_dir=seqrun_temp_dir, samplesheet=samplesheet_file, run_info_xml=runinfo_file, platform_model=model_name) # get lists of files to move to TMPDIR move_file.copy_bcl_files() # move files to TMPDIR job_name = self.job_name() output_temp_dir = \ get_temp_dir(use_ephemeral_space=use_ephemeral_space) # create tmp directory in TMPDIR for cluster report_dir = \ os.path.join(\ base_work_dir, seqrun_igf_id, job_name, 'Reports') # creating report directory in main storage if not os.path.exists(report_dir): os.makedirs(report_dir, mode=0o770) stats_dir = \ os.path.join(\ base_work_dir, seqrun_igf_id, job_name, 'Stats') # create stats directory in main storage if not os.path.exists(stats_dir): os.makedirs(stats_dir, mode=0o770) bcl2fastq_cmd = \ 
[quote(bcl2fastq_exe), '--runfolder-dir',quote(seqrun_temp_dir), '--sample-sheet',quote(samplesheet_file), '--output-dir',quote(output_temp_dir), '--reports-dir',quote(report_dir), '--use-bases-mask',quote(bases_mask), '--stats-dir',quote(stats_dir)] # bcl2fastq base parameters bcl2fastq_param = \ self.format_tool_options(bcl2fastq_options) # format bcl2fastq params bcl2fastq_cmd.extend(bcl2fastq_param) # add additional parameters if reset_mask_short_adapter_reads and \ '--mask-short-adapter-reads' not in bcl2fastq_options: read_pattern = re.compile(r'^y(\d+)n?\d?') read_values = [ int(re.match(read_pattern, i).group(1)) for i in bases_mask.split(',') if i.startswith('y') and re.match(read_pattern, i) if int(re.match(read_pattern, i).group(1)) < 22 ] # hack for checking if reads are lower than the Illumina thresholds if len(read_values) > 0 and \ min(read_values) > 5: bcl2fastq_cmd.\ append("--mask-short-adapter-reads={0}".\ format(quote(str(min(read_values))))) message = \ 'Setting masked bases length for {0},{1}:{2}_{3}, value: {4}'.\ format( seqrun_igf_id, project_name, flowcell_lane, index_length, min(read_values)) self.post_message_to_slack( message, reaction='pass') # send log to slack self.comment_asana_task(\ task_name=seqrun_igf_id, comment=message) # send log to asana if project_type == singlecell_tag: sc_bcl2fastq_param = self.format_tool_options( singlecell_options) # format singlecell bcl2fastq params bcl2fastq_cmd.extend( sc_bcl2fastq_param) # add additional parameters message = ' '.join(bcl2fastq_cmd) self.post_message_to_slack( message, reaction='pass') # send bcl2fastq command to Slack self.comment_asana_task( task_name=seqrun_igf_id, comment=message) # send bcl2fastq command to Asana subprocess.check_call(' '.join(bcl2fastq_cmd), shell=True) # run bcl2fastq copytree(output_temp_dir, output_fastq_dir) # copy output from TMPDIR copy2(\ samplesheet_file, os.path.join(\ output_fastq_dir, samplesheet_filename)) # add samplesheet to output dir move(report_dir, output_fastq_dir) # move report directory to project dir move(stats_dir, output_fastq_dir) # move stats directory to project dir self.param('dataflow_params', { 'fastq_dir': output_fastq_dir, 'bcl2fq_project_type': project_type }) # set dataflow params message = \ 'Fastq conversion done for {0},{1}:{2}_{3}, fastq: {4}'.\ format( seqrun_igf_id, project_name, flowcell_lane, index_length, output_fastq_dir) self.post_message_to_slack(message, reaction='pass') # send log to slack self.comment_asana_task(\ task_name=seqrun_igf_id, comment=message) # send log to asana remove_dir(seqrun_temp_dir) remove_dir(output_temp_dir) # remove temp dirs except Exception as e: message = \ 'seqrun: {2}, Error in {0}: {1}'.\ format( self.__class__.__name__, e, seqrun_igf_id) self.warning(message) self.post_message_to_slack( message, reaction='fail') # post msg to slack for failed jobs raise
def run_samtools_view(samtools_exe, input_file, output_file, reference_file=None, force=True, cram_out=False, threads=1, samtools_params=None, index_output=True, dry_run=False, use_ephemeral_space=0): ''' A function for running samtools view command :param samtools_exe: samtools executable path :param input_file: An input bam filepath with / without index. Index file will be created if it's missing :param output_file: An output file path :param reference_file: Reference genome fasta filepath, default None :param force: Output file will be overwritten if force is True, default True :param cram_out: Output cram file, default False :param threads: Number of threads to use for conversion, default 1 :param samtools_params: List of samtools param, default None :param index_output: Index output file, default True :param dry_run: A toggle for returning the samtools command without actually running it, default False :param use_ephemeral_space: A toggle for temp dir settings, default 0 :returns: Samtools command as list ''' try: check_file_path(samtools_exe) _check_bam_file(bam_file=input_file) # check bam file if not dry_run: _check_bam_index(\ samtools_exe=samtools_exe, bam_file=input_file) # check bam index temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space) temp_file = \ os.path.join(\ temp_dir, os.path.basename(output_file)) # get temp output file path view_cmd = \ [quote(samtools_exe), 'view', '-o',quote(temp_file) ] # samtools view base command if reference_file is not None: check_file_path(reference_file) view_cmd.extend(['-T', quote(reference_file)]) if threads is not None: view_cmd.append('-@{0}'.format(quote(str(threads)))) if cram_out: view_cmd.append('-C') if reference_file is None: raise ValueError('Reference file is required for cram output') else: view_cmd.append('-b') if samtools_params is not None and \ isinstance(samtools_params, list) and \ len(samtools_params) > 0: view_cmd.extend(\ [quote(i) for i in samtools_params]) # add additional params view_cmd.append(quote(input_file)) if dry_run: return view_cmd subprocess.check_call(\ ' '.join(view_cmd), shell=True) if cram_out: _check_cram_file(cram_path=temp_file) # check cram output copy_local_file(\ source_path=temp_file, destinationa_path=output_file, force=force) # copy output file to destination path remove_dir(temp_dir) # remove temp directory if index_output: index_bam_or_cram(\ samtools_exe=samtools_exe, input_path=output_file, threads=threads) return view_cmd except: raise
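# A dry-run sketch for run_samtools_view converting bam to cram; all paths are
# placeholders. cram_out=True requires a reference fasta.
view_cmd = \
  run_samtools_view(
    samtools_exe='/path/to/samtools',
    input_file='/path/to/sample.bam',
    output_file='/path/to/sample.cram',
    reference_file='/path/to/genome.fa',
    cram_out=True,
    threads=4,
    dry_run=True)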
def tearDown(self): Base.metadata.drop_all(self.engine) os.remove(self.dbname) if os.path.exists(self.temp_dir): remove_dir(dir_path=self.temp_dir)
def nbconvert_execute_in_singularity(image_path, ipynb_path, input_list, output_dir, output_format='html', output_file_map=None, timeout=600, kernel='python3', use_ephemeral_space=False, allow_errors=False, dry_run=False): ''' A function for running jupyter nbconvert within singularity containers :param image_path: A singularity image path :param ipynb_path: A notebook file path to run in the singularity container :param input_list: A list of input files for notebook run :param output_dir: Path to copy output files :param output_format: Notebook output format, default html :param output_file_map: A dictionary with output file tag and name as key and value, to copy to output_dir from tmp dir, default None :param timeout: Timeout setting for notebook execution, default 600s :param kernel: Kernel name for notebook execution, default python3 :param allow_errors: A toggle for running notebook with errors, default False :param use_ephemeral_space: Toggle for using ephemeral space for temp dir, default False :param dry_run: Return the notebook command without run, default False :returns: output file map and notebook cmd ''' try: check_file_path(image_path) check_file_path(ipynb_path) if output_file_map is None: output_file_map = dict() # default output map is an empty dictionary if not isinstance(input_list, list) or \ len(input_list) == 0: raise ValueError("Missing input files for notebook run") tmp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space) # this will be mounted in the container at /tmp tmp_input_list = list() for f in input_list: check_file_path(f) temp_path = \ os.path.join( tmp_dir, os.path.basename(f)) copy_local_file(f, temp_path) # copy input files to temp dir tmp_input_list.append(temp_path) temp_ipynb_path = \ os.path.join( tmp_dir, os.path.basename(ipynb_path)) copy_local_file(ipynb_path, temp_ipynb_path) # copy ipynb file to tmp dir args_list = [ 'jupyter', 'nbconvert', '{0}'.format(quote(temp_ipynb_path)), '--to={0}'.format(quote(output_format)), '--execute', '--ExecutePreprocessor.enabled=True', '--ExecutePreprocessor.timeout={0}'.format(quote(str(timeout))), '--ExecutePreprocessor.kernel_name={0}'.format(quote(kernel)) ] # prepare notebook cmd for run if allow_errors: args_list.append('--allow-errors') # run notebooks with errors try: res = None res, run_cmd = \ singularity_run( image_path=image_path, path_bind=tmp_dir, use_ephemeral_space=use_ephemeral_space, args_list=args_list, dry_run=dry_run) # run notebook in singularity container except Exception as e: raise ValueError("Failed to run jupyter command in singularity, error {0}, response: {1}".\ format(e,res)) if output_file_map is not None and \ isinstance(output_file_map,dict): for tag, output in output_file_map.items(): output_path = output_dir temp_output = \ os.path.join( tmp_dir, os.path.basename(output)) # just get base name if not dry_run: check_file_path( temp_output) # skip output file check for dry run if os.path.isfile(temp_output): output_path = \ os.path.join( output_path, os.path.basename(output)) # need file name when copying files if not dry_run: copy_local_file( temp_output, output_path) # copy file or dir to output path if os.path.isdir(temp_output): output_path = \ os.path.join( output_path, os.path.basename(output)) # adding dir name to output path, once copy is over output_file_map.\ update({tag:output_path}) if output_format == 'html': temp_ipynb_path = \ temp_ipynb_path.replace('.ipynb','.html') elif output_format == 'markdown': temp_ipynb_path = \ temp_ipynb_path.replace('.ipynb','.md') elif output_format == 
'notebook': temp_ipynb_path = temp_ipynb_path elif output_format == 'pdf': temp_ipynb_path = \ temp_ipynb_path.replace('.ipynb','.pdf') elif output_format == 'python': temp_ipynb_path = \ temp_ipynb_path.replace('.ipynb','.py') elif output_format == 'slide': temp_ipynb_path = \ temp_ipynb_path.replace('.ipynb','.html') if not dry_run: check_file_path(temp_ipynb_path) # check output file path output_ipynb_path = \ os.path.join( output_dir, os.path.basename(temp_ipynb_path)) if not dry_run: copy_local_file(temp_ipynb_path, output_ipynb_path) # copy output notebook output_file_map.\ update({'notebook':output_ipynb_path}) # add notebook output to dataflow remove_dir(tmp_dir) return output_file_map, run_cmd except Exception as e: raise ValueError( "Failed to run nbconvert in singularity, error: {0}".\ format(e))
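# A dry-run sketch for nbconvert_execute_in_singularity; the image, notebook and
# input paths are placeholders, and the output_file_map keys are arbitrary tags.
output_map, run_cmd = \
  nbconvert_execute_in_singularity(
    image_path='/path/to/jupyter.sif',
    ipynb_path='/path/to/analysis.ipynb',
    input_list=['/path/to/counts.csv'],
    output_dir='/path/to/reports',
    output_format='html',
    output_file_map={'counts_plot': 'counts_plot.png'},
    dry_run=True)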
def run(self): try: fastq_dir = self.param_required('fastq_dir') seqrun_igf_id = self.param_required('seqrun_igf_id') project_name = self.param_required('project_name') igf_session_class = self.param_required('igf_session_class') irods_exe_dir = self.param_required('irods_exe_dir') flowcell_id = self.param_required('flowcell_id') samplesheet_filename = self.param('samplesheet_filename') manifest_name = self.param_required('manifest_name') report_html = self.param('report_html') use_ephemeral_space = self.param('use_ephemeral_space') pa = ProjectAdaptor(**{'session_class':igf_session_class}) pa.start_session() user = \ pa.fetch_data_authority_for_project(\ project_igf_id=project_name) # fetch user info from db pa.close_session() if user is None: raise ValueError('No user found for project {0}'.\ format(project_name)) username = user.username # get username for irods report_htmlname = os.path.basename(report_html) seqrun_date = seqrun_igf_id.split('_')[0] # collect seqrun date from igf id seqrun_date = datetime.datetime.strptime(seqrun_date,'%y%m%d').date() # identify actual date seqrun_date = str(seqrun_date) # convert object to string irods_upload = IGF_irods_uploader(irods_exe_dir) # create instance for irods upload base_seq_dir = os.path.basename(fastq_dir) # get base name for the source dir tarfile_name = \ '{0}_{1}_{2}.tar'.\ format(\ project_name, base_seq_dir, seqrun_date) # construct name of the tarfile temp_work_dir = \ get_temp_dir(use_ephemeral_space=use_ephemeral_space) # get a temp dir tarfile_name = \ os.path.join( temp_work_dir, tarfile_name) # create tarfile in the temp dir with tarfile.open(tarfile_name, "w") as tar: for root,_, files in os.walk(top=fastq_dir): if samplesheet_filename in files: samplesheet_file = \ os.path.join(os.path.abspath(root), samplesheet_filename) # get samplesheet filepath tmp_samplesheet_file = \ os.path.join( temp_work_dir, '{0}_{1}_{2}_{3}'.\ format( project_name, base_seq_dir, seqrun_date, samplesheet_filename)) copy2( samplesheet_file, tmp_samplesheet_file) # change samplesheet filename tar.add( tmp_samplesheet_file, arcname=\ os.path.relpath( tmp_samplesheet_file, start=temp_work_dir)) # add samplesheet file to tar if report_htmlname in files: for file in files: if fnmatch.fnmatch(os.path.join(root,file),report_html): report_file = os.path.join(os.path.abspath(root),file) # get filepath for the report tmp_report_file = \ os.path.join(\ temp_work_dir, '{0}_{1}_{2}_{3}'.\ format(\ project_name, base_seq_dir, seqrun_date, os.path.basename(report_file))) # change report name copy2(report_file, tmp_report_file) # copy report file to temp tar.add(tmp_report_file, arcname=os.path.relpath(tmp_report_file, start=temp_work_dir)) # add demultiplexing report to tar if manifest_name in files: manifest_file = \ os.path.join(os.path.abspath(root), manifest_name) # get samplesheet filepath tmp_manifest_file = \ os.path.join(\ temp_work_dir, '{0}_{1}_{2}_{3}'.\ format(\ project_name, base_seq_dir, seqrun_date, manifest_name)) # change manifest name copy2(manifest_file,tmp_manifest_file) # copy manifest to temp tar.add(tmp_manifest_file, arcname=os.path.relpath(tmp_manifest_file, start=temp_work_dir)) # add samplesheet file to tar for file in files: if fnmatch.fnmatch(file, '*.fastq.gz') and \ not fnmatch.fnmatch(file, 'Undetermined_*'): fastq_file_path = os.path.join(os.path.abspath(root),file) # get filepath for the fastq files tar.add(fastq_file_path, arcname=os.path.relpath(fastq_file_path, start=fastq_dir)) # add fastq file to tar irods_upload.\ 
upload_fastqfile_and_create_collection(\ filepath=tarfile_name, irods_user=username, project_name=project_name, run_igf_id=seqrun_igf_id, flowcell_id=flowcell_id, run_date=seqrun_date) # upload fastq data to irods remove_dir(temp_work_dir) # remove temp dir once data upload is done except Exception as e: message = \ 'seqrun: {2}, Error in {0}: {1}'.\ format(\ self.__class__.__name__, e, seqrun_igf_id) self.warning(message) self.post_message_to_slack(message,reaction='fail') # post msg to slack for failed jobs raise
def run(self): ''' A method for running the cellranger count for a given sample using ehive pipeline :param project_igf_id: A project igf id :param experiment_igf_id: An experiment igf id :param sample_igf_id: A sample igf id :param biomaterial_type: Biomaterial type for samples, required for nuclei samples :param nuclei_biomaterial_type: Required keywords for nuclei samples, default 'SINGLE_NUCLEI' :param igf_session_class: A database session class :param cellranger_exe: Cellranger executable path :param cellranger_options: Cellranger parameters List of default parameters --jobmode=pbspro --localcores=1 --localmem=4 --mempercore=4 --maxjobs=20 :param base_work_dir: Base work directory path :param fastq_collection_type: Collection type name for input fastq files, default demultiplexed_fastq :param species_name: Reference genome collection name :param reference_type: Reference genome collection type, default TRANSCRIPTOME_TENX :param nuclei_reference_type: Reference genome collection type for pre-mRNA samples, default TRANSCRIPTOME_TENX_NUCLEI :param job_timeout: Timeout for cellranger job, default 24hrs :returns: Adding cellranger_output to the dataflow_params ''' try: project_igf_id = self.param_required('project_igf_id') experiment_igf_id = self.param_required('experiment_igf_id') sample_igf_id = self.param_required('sample_igf_id') igf_session_class = self.param_required('igf_session_class') cellranger_exe = self.param_required('cellranger_exe') cellranger_options = self.param_required('cellranger_options') base_work_dir = self.param_required('base_work_dir') fastq_collection_type = self.param_required( 'fastq_collection_type') biomaterial_type = self.param_required('biomaterial_type') job_timeout = self.param_required('job_timeout') nuclei_biomaterial_type = self.param('nuclei_biomaterial_type') species_name = self.param('species_name') reference_type = self.param('reference_type') nuclei_reference_type = self.param('nuclei_reference_type') # setup work dir for run work_dir = False work_dir_prefix = \ os.path.join(\ base_work_dir, project_igf_id, sample_igf_id, experiment_igf_id) work_dir = self.get_job_work_dir( work_dir=work_dir_prefix ) # replace this with temp dir while running in queue # setup env for run os.chdir(work_dir) # move to work dir os.environ['PATH'] += '{0}{1}'.format( os.pathsep, os.path.dirname( cellranger_exe)) # add cellranger location to env PATH # collect reference genome for run if biomaterial_type == nuclei_biomaterial_type: ref_genome = \ Reference_genome_utils(\ genome_tag=species_name, dbsession_class=igf_session_class, tenx_ref_type=nuclei_reference_type) # fetch ref genome for pre-mRNA samples else: ref_genome = \ Reference_genome_utils(\ genome_tag=species_name, dbsession_class=igf_session_class, tenx_ref_type=reference_type) # collect fastq input for run cellranger_ref_transcriptome = ref_genome.get_transcriptome_tenx( ) # fetch tenx ref transcriptome from db input_fastq_dirs = \ get_cellranger_count_input_list(\ db_session_class=igf_session_class, experiment_igf_id=experiment_igf_id, fastq_collection_type=fastq_collection_type) # fetch fastq dir paths as list for run # configure cellranger count command for run cellranger_options = \ self.format_tool_options(\ cellranger_options, separator='=') cellranger_cmd = \ [cellranger_exe, 'count', '{0}={1}'.format('--fastqs', quote(','.join(input_fastq_dirs))), '{0}={1}'.format('--id', quote(experiment_igf_id)), '{0}={1}'.format('--transcriptome', quote(cellranger_ref_transcriptome)), ] # set initial parameters 
cellranger_cmd.extend( cellranger_options) # add optional parameters # log before job submission message = \ 'started cellranger count for {0}, {1} {2}'.\ format(\ project_igf_id, sample_igf_id, experiment_igf_id) self.post_message_to_slack(message, reaction='pass') # send log to slack self.comment_asana_task(task_name=project_igf_id, comment=message) # send comment to Asana message = ' '.join(cellranger_cmd) self.comment_asana_task( task_name=project_igf_id, comment=message) # send cellranger command to Asana # start job execution cellranger_cmd = ' '.join( cellranger_cmd) # create shell command string subprocess.\ check_call(\ cellranger_cmd, shell=True, timeout=job_timeout) # run cellranger count using shell # prepare output after cellranger run cellranger_output = \ os.path.join(\ work_dir, experiment_igf_id, 'outs') # get cellranger output path message = \ 'finished cellranger count for {0}, {1} {2} : {3}'.\ format(\ project_igf_id, sample_igf_id, experiment_igf_id, cellranger_output) self.post_message_to_slack(message, reaction='pass') # send log to slack self.comment_asana_task(task_name=project_igf_id, comment=message) # send comment to Asana # validate output files after cellranger run check_cellranger_count_output( output_path=cellranger_output) # check output file cellranger_report = \ os.path.join(\ cellranger_output, 'web_summary.html') check_file_path(cellranger_report) self.param('dataflow_params',\ {'cellranger_output':cellranger_output, 'cellranger_report':cellranger_report}) # pass on cellranger output path except Exception as e: message = \ 'project: {2}, sample:{3}, Error in {0}: {1}'.\ format(\ self.__class__.__name__, e, project_igf_id, sample_igf_id) self.warning(message) self.post_message_to_slack( message, reaction='fail') # post msg to slack for failed jobs if work_dir: remove_dir(work_dir) raise
def merge_fastq_per_lane_per_sample(self): ''' A method for merging single cell fastq files present in input fastq_dir per lane per sample basis ''' try: sample_data = \ self._fetch_lane_and_sample_info_from_samplesheet() # get sample and lane information from samplesheet sample_files, samples_info = \ self._group_singlecell_fastq( sample_data, self.fastq_dir) # get file groups all_intermediate_files = list() # empty list for intermediate files s_count = 0 # initial count for fastq S value for lane_id in sorted(sample_files.keys()): if self.platform_name == 'NEXTSEQ': s_count = 0 # nextseq is weird, reset counter for each lane for sample_id in sorted(sample_files[lane_id].keys()): s_count += 1 # assign new S value for fastq files sample_name = samples_info.get(sample_id)['sample_name'] project_id = samples_info.get(sample_id)['project_id'] # get sample and project info output_path = \ os.path.join( self.fastq_dir, project_id, sample_id) # output location is under input fastq_dir if not os.path.exists(output_path): os.makedirs(output_path, mode=0o770) # create output directory for read_type in sample_files[lane_id][sample_id].keys(): # merge per read type output_filename = \ '{0}_S{1}_L00{2}_{3}_001.fastq.gz'.\ format( sample_name, s_count, lane_id, read_type) # assign new output filename final_path = os.path.join( output_path, output_filename) # assign final output path if not self.force_overwrite and os.path.exists( final_path): raise ValueError('Failed to overwrite existing file {0}'.\ format(final_path)) input_list = list() for sc_fragment, file_path in \ sorted(sample_files[lane_id][sample_id][read_type].items()): input_list.extend( file_path ) # create list of input fastqs for merge if len(input_list) != 4: raise ValueError(\ 'expecting 4 files, got {0} for sample {1}, lane {2}, read type {3}'.\ format( len(input_list), sample_id, lane_id, read_type)) # checking input files list temp_dir = \ get_temp_dir(use_ephemeral_space=self.use_ephemeral_space) # get a temp dir temp_file = os.path.join( temp_dir, output_filename) # assign temp filename cmd = ["cat"] + input_list + [ ">", temp_file ] # shell command for merging fastq.gz files subprocess.check_call( " ".join(cmd), shell=True ) # exact same command for fastq merge as 10x pipeline shutil.copy(temp_file, final_path) # copy file to final location remove_dir(temp_dir) # remove temp dir for file_path in input_list: all_intermediate_files.append( file_path) # add fastq to intermediate list for file_path in all_intermediate_files: os.remove( file_path ) # remove intermediate files once merging is complete except: raise
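# An illustration of the merged-fastq naming scheme built above; the values are
# hypothetical. For sample_name='SAMPLE1', s_count=2, lane_id='1', read_type='R1'
# the output filename becomes:
# 'SAMPLE1_S2_L001_R1_001.fastq.gz'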
  def generate_report(self):
    '''
    A method for generating html report from scanpy analysis and, if
    self.cellbrowser_h5ad is set, exporting the AnnData object as an
    h5ad file for cellbrowser
    '''
    try:
      os.chdir(self.work_dir)
      if os.path.exists(os.path.join(self.work_dir, 'cache')):
        remove_dir(os.path.join(self.work_dir, 'cache'))
      date_stamp = datetime.now().strftime('%d-%b-%Y %H:%M:%S')
      # step 1: read input files
      temp_input_dir = \
        get_temp_dir(use_ephemeral_space=self.use_ephemeral_space)      # copy inputs to a local temp dir, workaround for HPC storage
      local_matrix_file = \
        os.path.join(
          temp_input_dir,
          os.path.basename(self.matrix_file))
      local_barcode_tsv = \
        os.path.join(
          temp_input_dir,
          os.path.basename(self.barcode_tsv))
      local_features_tsv = \
        os.path.join(
          temp_input_dir,
          os.path.basename(self.features_tsv))
      copy_local_file(
        source_path=self.matrix_file,
        destinationa_path=local_matrix_file)
      copy_local_file(
        source_path=self.barcode_tsv,
        destinationa_path=local_barcode_tsv)
      copy_local_file(
        source_path=self.features_tsv,
        destinationa_path=local_features_tsv)
      adata = \
        sc.read_10x_mtx(
          temp_input_dir,
          var_names='gene_symbols',
          cache=True)                                                   # read input files
      adata.var_names_make_unique()
      sc.pl.highest_expr_genes(
        adata,
        n_top=30,
        save='.png')                                                    # genes that yield the highest fraction of counts in each single cell, across all cells
      highest_gene_expr = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/highest_expr_genes.png'))                        # encode highest gene expr plot
      # step 2: filter data based on cell and gene counts
      sc.pp.filter_cells(
        adata,
        min_genes=self.min_gene_count)
      sc.pp.filter_genes(
        adata,
        min_cells=self.min_cell_count)
      # step 3: fetch mitochondrial genes
      mt_genes = \
        self._fetch_mitochondrial_genes(species_name='hsapiens')
      mt_genes = \
        [name
           for name in adata.var_names
             if name in mt_genes]                                       # keep only the mito genes present in the data
      # step 4: calculate mitochondrial read percentage
      adata.obs['percent_mito'] = \
        np.sum(adata[:, mt_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
      adata.obs['n_counts'] = \
        adata.X.sum(axis=1).A1                                          # add the total counts per cell as observations-annotation to adata
      sc.pl.violin(
        adata,
        ['n_genes', 'n_counts', 'percent_mito'],
        jitter=0.4,
        multi_panel=True,
        show=True,
        save='.png')                                                    # violin plot of the computed quality measures, saved as figures/violin.png
      mito_plot_data = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/violin.png'))
      sc.pl.scatter(
        adata,
        x='n_counts',
        y='percent_mito',
        show=True,
        save='.png')                                                    # scatter plot for data quality 1
      mito_plot_scatter1 = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/scatter.png'))
      sc.pl.scatter(
        adata,
        x='n_counts',
        y='n_genes',
        save='.png')                                                    # scatter plot for data quality 2, overwrites figures/scatter.png after plot 1 is encoded
      mito_plot_scatter2 = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/scatter.png'))
      # step 5: filter data based on percent mito
      adata = adata[adata.obs['n_genes'] < 2500, :]
      adata = adata[adata.obs['percent_mito'] < 0.05, :]
      # step 6: normalise and filter data
      sc.pp.normalize_per_cell(adata)                                   # total-count normalize (library-size correct) the data matrix, so that counts become comparable among cells
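      # note: with default arguments, normalize_per_cell scales each cell to
      # the median of total counts per cell rather than a fixed target; pass
      # counts_per_cell_after=1e4 for the common 10k-counts convention (this
      # refers to the older scanpy API used here; newer scanpy versions
      # replace it with sc.pp.normalize_total)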
      sc.pp.log1p(adata)                                                # logarithmize the data
      adata.raw = adata                                                 # freeze the normalised data for later use
      sc.pp.highly_variable_genes(
        adata,
        min_mean=0.0125,
        max_mean=3,
        min_disp=0.5)                                                   # identify highly-variable genes
      sc.pl.highly_variable_genes(adata, save='.png')
      genes_dispersion_data = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/filter_genes_dispersion.png'))                   # plot highly-variable genes
      adata = adata[:, adata.var['highly_variable']]                    # filter highly-variable genes
      # step 7: analyze data
      sc.pp.regress_out(
        adata,
        ['n_counts', 'percent_mito'])                                   # regress out effects of total counts per cell and the percentage of mitochondrial genes expressed
      sc.pp.scale(
        adata,
        max_value=10)                                                   # scale the data to unit variance
      sc.tl.pca(
        adata,
        svd_solver='arpack')                                            # run pca
      sc.pl.pca_loadings(
        adata,
        show=True,
        save='.png')                                                    # plot pca loadings graph
      pca_data = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/pca_loadings.png'))                              # load pca loadings graph
      sc.pl.pca_variance_ratio(
        adata,
        log=True,
        save='.png')                                                    # save pca variance ratio plot
      pca_var_data = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/pca_variance_ratio.png'))                        # load pca variance ratio graph
      sc.tl.tsne(
        adata,
        random_state=2,
        n_pcs=10)                                                       # legacy tsne
      sc.pp.neighbors(
        adata,
        n_neighbors=10,
        n_pcs=40)                                                       # compute neighborhood graph
      # step 7.5: plot 3D UMAP
      sc.tl.umap(
        adata,
        n_components=3)                                                 # generate UMAP with 3 components
      sc.tl.louvain(adata)                                              # louvain graph clustering
      dict_map = {
        '0':'#4682B4', '1':'#A233A2', '2':'#FF7F50', '3':'#6787E7',
        '4':'#B75555', '5':'#2E8B57', '6':'#191970', '7':'#DB7093',
        '8':'#90EE90', '9':'#00FFFF', '10':'#FFD700', '11':'#DC143C',
        '12':'#B0C4DE', '13':'#00FA9A', '14':'#FA8072', '15':'#FFF0F5',
        '16':'#DB7093'}                                                 # fixed color per louvain cluster
      louvain_series = deepcopy(adata.obs['louvain'])
      color_map = louvain_series.map(dict_map).values
      labels = list(adata.obs.index)
      hovertext = \
        ['cluster: {0}, barcode: {1}'.format(grp, labels[index])
           for index, grp in enumerate(louvain_series.values)]
      threeDUmapDiv = \
        plot(
          [go.Scatter3d(
            x=adata.obsm['X_umap'][:, 0],
            y=adata.obsm['X_umap'][:, 1],
            z=adata.obsm['X_umap'][:, 2],
            mode='markers',
            marker=dict(color=color_map, size=5),
            opacity=0.6,
            text=labels,
            hovertext=hovertext)],
          output_type='div',
          include_plotlyjs='cdn')                                       # capture 3d div for umap plot
      sc.tl.umap(
        adata,
        n_components=2)                                                 # recompute 2D UMAP for the standard static plots
      sc.pl.tsne(
        adata,
        color='louvain',
        show=True,
        save='.png')                                                    # plot tSNE data
      tsne_data = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/tsne.png'))                                      # load t-SNE plot
      # step 8: finding marker genes
      sc.pl.umap(
        adata,
        color=['louvain'],
        save='.png')                                                    # plot umap
      umap_data = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/umap.png'))                                      # load umap plot
      sc.tl.rank_genes_groups(
        adata,
        'louvain',
        method='t-test')                                                # compute a ranking of the highly differential genes in each cluster
      sc.pl.rank_genes_groups(
        adata,
        n_genes=20,
        show=True,
        sharey=False,
        save='.png')                                                    # plot diff genes in each cluster
      rank_genes_groups_data = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/rank_genes_groups_louvain.png'))                 # load ranking plot
      sc.pl.rank_genes_groups_stacked_violin(
        adata,
        n_genes=10,
        save='.png')                                                    # ranked genes group stacked violin plot
      rank_genes_groups_stacked_violin = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/stacked_violin.png'))                            # load stacked violin plot
      sc.pl.rank_genes_groups_dotplot(
        adata,
        n_genes=10,
        color_map='bwr',
        dendrogram='dendrogram_louvain',
        save='.png')
      # ranked genes group dot plot
      rank_genes_groups_dotplot = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/dotplot.png'))                                   # load dot plot
      sc.pl.rank_genes_groups_matrixplot(
        adata,
        n_genes=10,
        save='.png')                                                    # ranked genes group matrix plot
      rank_genes_groups_matrixplot = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/matrixplot.png'))                                # load matrix plot
      sc.pl.rank_genes_groups_heatmap(
        adata,
        n_genes=10,
        show_gene_labels=True,
        save='.png')                                                    # ranked gene heatmap plot
      rank_genes_groups_heatmap = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/heatmap.png'))                                   # load heatmap plot
      sc.pl.rank_genes_groups_tracksplot(
        adata,
        n_genes=10,
        cmap='bwr',
        save='.png')                                                    # ranked gene tracks plot
      rank_genes_groups_tracksplot = \
        self._encode_png_image(
          png_file=\
            os.path.join(
              self.work_dir,
              'figures/tracksplot.png'))                                # load tracks plot
      project_name = self.project_name
      project_name = \
        project_name[0] \
          if isinstance(project_name, tuple) \
            else project_name                                           # unpack project_name if it is a tuple
      template_env = \
        Environment(
          loader=\
            FileSystemLoader(
              searchpath=os.path.dirname(self.html_template_file)),
          autoescape=select_autoescape(['xml']))
      template_file = \
        template_env.\
          get_template(
            os.path.basename(self.html_template_file))
      template_file.\
        stream(
          ProjectName=project_name,
          SampleName=self.sample_name,
          Date_stamp=date_stamp,
          Highest_gene_expr=highest_gene_expr,
          MitoPlot=mito_plot_data,
          MitoScatter1=mito_plot_scatter1,
          MitoScatter2=mito_plot_scatter2,
          GenesDispersion=genes_dispersion_data,
          Pca=pca_data,
          Pca_var_data=pca_var_data,
          Tsne=tsne_data,
          Umap3DDiv=threeDUmapDiv,
          Umap_data=umap_data,
          RankGenesGroups=rank_genes_groups_data,
          Rank_genes_groups_stacked_violin=rank_genes_groups_stacked_violin,
          Rank_genes_groups_dotplot=rank_genes_groups_dotplot,
          Rank_genes_groups_matrixplot=rank_genes_groups_matrixplot,
          Rank_genes_groups_heatmap=rank_genes_groups_heatmap,
          Rank_genes_groups_tracksplot=rank_genes_groups_tracksplot).\
        dump(os.path.join(self.work_dir, 'test.html'))                  # render report to a temp html file
      copy_local_file(
        os.path.join(self.work_dir, 'test.html'),
        self.output_file,
        force=self.force_overwrite)                                     # copy report to final output path
      if self.cellbrowser_h5ad is not None:
        try:
          if not os.path.exists(os.path.dirname(self.cellbrowser_h5ad)):
            os.makedirs(os.path.dirname(self.cellbrowser_h5ad))
          temp_h5ad = \
            os.path.join(
              self.work_dir,
              os.path.basename(self.cellbrowser_h5ad))
          adata.write_h5ad(filename=temp_h5ad)                          # dump AnnData as h5ad for cellbrowser
          copy_local_file(
            source_path=temp_h5ad,
            destinationa_path=self.cellbrowser_h5ad,
            force=True)
        except Exception as e:
          raise ValueError(
            'Failed to export Scanpy h5ad, error: {0}'.\
            format(e))
      remove_dir(temp_input_dir)
      remove_dir(self.work_dir)
    except:
      raise
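# The report embeds every figure directly, so the final html file is
# self-contained. A minimal sketch of what a helper like _encode_png_image
# could look like (the helper's actual name and return format in this
# codebase are assumptions):

def _example_encode_png_image(png_file):
    import base64
    with open(png_file, 'rb') as fh:
        encoded = base64.b64encode(fh.read()).decode('utf-8')           # base64-encode the raw PNG bytes
    return 'data:image/png;base64,{0}'.format(encoded)                  # data URI usable in an <img src=...> tag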
def tearDown(self): remove_dir(self.tmp_dir)
def tearDown(self): if os.path.exists(self.fastq_dir): remove_dir(self.fastq_dir)
  disk_path = args.disk_path
  copy_to_remoter = args.copy_to_remoter
  remote_server = args.remote_server
  output_path = args.output_path
  try:
    if copy_to_remoter and not remote_server:
      parser.print_help()
      raise ValueError(
        'Remote server address is required for copying files.')
    storage_stats = \
      get_storage_stats_in_gb(disk_path)                                # calculate disk usage stats
    temp_dir = get_temp_dir()
    temp_file = os.path.join(temp_dir, 'disk_usage.json')               # get temp file path
    with open(temp_file, 'w') as j_data:
      json.dump(storage_stats, j_data, indent=4)                        # write disk usage to temp json file
    if copy_to_remoter:
      copy_remote_file(
        source_path=temp_file,
        destinationa_path=output_path,
        destination_address=remote_server)                              # copy json file to remote server
    else:
      shutil.copy2(temp_file, output_path)                              # copy json file to local output path
    remove_dir(temp_dir)                                                # remove temp dir
  except Exception as e:
    print('Error: {0}'.format(e))
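# A minimal sketch of what get_storage_stats_in_gb might compute per disk
# path, using only the standard library (the key names in the returned
# dictionaries are assumptions, not the pipeline's actual schema):

def _example_storage_stats_in_gb(disk_paths):
    import shutil
    gb = 1024 ** 3
    stats = list()
    for path in disk_paths:
        usage = shutil.disk_usage(path)                                 # total/used/free in bytes
        stats.append({
          'disk_path': path,
          'total_gb': round(usage.total / gb, 2),
          'used_gb': round(usage.used / gb, 2),
          'free_gb': round(usage.free / gb, 2)})
    return stats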
  def nbconvert_singularity(self, singularity_image_path, dry_run=False):
    '''
    A method for generating notebook from template and executing it in a singularity container

    :param singularity_image_path: A singularity image path
    :param dry_run: A toggle for dry run, default False
    :returns: A response str from singularity, the run command and a dictionary of output params for dataflow
    '''
    try:
      output_params = dict()
      new_input_map = \
        self._substitute_input_path_and_copy_files_to_tempdir()         # get modified input map and copy files to mount dir
      if not isinstance(new_input_map, dict):
        raise TypeError(
          "Expecting a dictionary and got {0}".\
          format(type(new_input_map)))
      date_stamp = self._get_date_stamp()                               # get date stamp
      new_input_map.\
        update({self.date_tag: date_stamp})                             # update input map with datestamp
      temp_notebook = \
        self._generate_ipynb_from_template(param_map=new_input_map)     # generate new notebook after param substitution
      container_notebook_path = \
        os.path.join(
          self.container_dir_prefix,
          os.path.basename(temp_notebook))
      args_list = [
        'jupyter',
        'nbconvert',
        '{0}'.format(quote(container_notebook_path)),
        '--to={0}'.format(quote(self.output_format)),
        '--execute',
        '--ExecutePreprocessor.enabled=True',
        '--ExecutePreprocessor.timeout={0}'.format(quote(str(self.timeout))),
        '--ExecutePreprocessor.kernel_name={0}'.format(quote(self.kernel))] # prepare notebook cmd for run
      if self.allow_errors:
        args_list.append('--allow-errors')                              # run notebooks with errors
      try:
        res = None
        res, run_cmd = \
          singularity_run(
            image_path=singularity_image_path,
            path_bind=self.temp_dir,
            use_ephemeral_space=self.use_ephemeral_space,
            args_list=args_list,
            dry_run=dry_run)                                            # run notebook in singularity container
      except Exception as e:
        raise ValueError(
          "Failed to run jupyter command in singularity, error {0}, response: {1}".\
          format(e, res))
      if dry_run:
        return res, run_cmd, output_params                              # return the singularity cmd without execution
      output_params = \
        self._copy_container_output_and_update_map(
          temp_notebook_path=temp_notebook)                             # move files to output dir
      remove_dir(self.temp_dir)                                         # clean up temp dir
      return res, run_cmd, output_params
    except Exception as e:
      raise ValueError(
        "Failed to execute notebook in singularity container, error: {0}".\
        format(e))
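# A rough sketch of the singularity invocation a helper like singularity_run
# is expected to build. The `exec` and `--bind` flags follow the singularity
# CLI; the bind target inside the container and the helper's return values
# are assumptions for illustration:

def _example_singularity_run(image_path, path_bind, args_list):
    import subprocess
    run_cmd = [
      'singularity',
      'exec',
      '--bind', '{0}:/tmp'.format(path_bind),                           # mount host temp dir inside the container
      image_path] + args_list
    res = subprocess.check_output(run_cmd).decode('utf-8')              # capture stdout from the container run
    return res, ' '.join(run_cmd)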