except: raise  # tail of a function starting before this chunk — left untouched

if __name__ == '__main__':
    # Ad-hoc smoke test: rebuild a fresh database from 'data/dbconfig.json'
    # and load two ALIGNMENT_CRAM collection records via CollectionAdaptor.
    from igf_data.igfdb.igfTables import Base
    from igf_data.utils.dbutils import read_dbconf_json
    from igf_data.utils.fileutils import get_temp_dir
    from igf_data.utils.fileutils import remove_dir
    dbparams = read_dbconf_json('data/dbconfig.json')
    dbname = dbparams['dbname']
    if os.path.exists(dbname):
        os.remove(dbname)  # start from a clean sqlite db file
    temp_dir = get_temp_dir()
    base = BaseAdaptor(**dbparams)
    Base.metadata.create_all(base.engine)  # create all tables in the new db
    base.start_session()
    # two experiment-level CRAM collections used as fixture data
    collection_data = [{
        'name': 'IGF001_MISEQ',
        'type': 'ALIGNMENT_CRAM',
        'table': 'experiment'
    }, {
        'name': 'IGF002_MISEQ',
        'type': 'ALIGNMENT_CRAM',
        'table': 'experiment'
    }]
    ca = CollectionAdaptor(**{'session': base.session})
    ca.store_collection_and_attribute_data(data=collection_data, autosave=True)
def check_lane_effect_and_log_report(self, project_name, sample_name, output_file):
  '''
  A function for generating batch effect report for a sample and project

  Reads per-lane STAR gene count files listed in self.input_json_file, merges
  them, runs an external R script for CPM normalisation, draws clustermap,
  correlation and PCA plots, renders them into an HTML template and copies the
  report to output_file.

  :param project_name: A project name for the report file
  :param sample_name: A sample name for the report file
  :param output_file: Path of the output report file
  :raises ValueError: If strand info is not allowed, an input entry misses
                      file/flowcell/lane, or fewer than two inputs are given
  '''
  if self.strand_info not in self.allowed_strands:
    raise ValueError('{0} is not a valid strand'.format(self.strand_info))
  temp_dir = get_temp_dir(use_ephemeral_space=False)
  temp_merged_output = os.path.join(temp_dir, 'merged.csv')
  temp_cpm_output = os.path.join(temp_dir, 'merged_cpm.csv')
  temp_png_output = os.path.join(temp_dir, 'plot.png')
  temp_clustermap = os.path.join(temp_dir, 'clustermap.png')
  temp_corr = os.path.join(temp_dir, 'corr.png')
  temp_pca_flowcell = os.path.join(temp_dir, 'pca_flowcell.png')
  temp_pca_flowcell_lane = os.path.join(temp_dir, 'pca_flowcell_lane.png')
  temp_html_report = os.path.join(temp_dir, os.path.basename(self.template_file))
  check_file_path(self.input_json_file)
  check_file_path(self.rscript_path)
  check_file_path(self.batch_effect_rscript_path)
  with open(self.input_json_file, 'r') as json_data:
    input_list = json.load(json_data)
  if len(input_list) < 2:
    raise ValueError('Minimum two input files are required for lane level batch effect checking')
  gene_name_label = 'gene_name'
  final_df = pd.DataFrame()
  for entry in input_list:
    file = entry.get('file')
    flowcell = entry.get('flowcell')
    lane = entry.get('lane')
    if file is None or \
       flowcell is None or \
       lane is None:
      raise ValueError('Missing required info for batch effect check: {0}'.\
                       format(entry))
    # one count column per strand orientation, labelled per flowcell and lane
    unstranded_label = 'unstranded_{0}_{1}'.format(flowcell, lane)
    reverse_strand_label = 'reverse_strand_{0}_{1}'.format(flowcell, lane)
    forward_strand_label = 'forward_strand_{0}_{1}'.format(flowcell, lane)
    data = pd.read_csv(
      file,
      sep='\t',
      header=None,
      skiprows=4,
      index_col=False,
      names=[gene_name_label,
             unstranded_label,
             forward_strand_label,
             reverse_strand_label])
    # keep only the configured strand column and drop low count genes
    if self.strand_info == 'reverse_strand':
      data = data[[gene_name_label, reverse_strand_label]]
      data = data[data[reverse_strand_label] > self.read_threshold]
    elif self.strand_info == 'forward_strand':
      data = data[[gene_name_label, forward_strand_label]]
      data = data[data[forward_strand_label] > self.read_threshold]
    elif self.strand_info == 'unstranded':
      data = data[[gene_name_label, unstranded_label]]
      data = data[data[unstranded_label] > self.read_threshold]
    if len(final_df.index) == 0:
      final_df = copy(data)
    else:
      final_df = final_df.merge(data, how='outer', on=gene_name_label)
  final_df = final_df.dropna().set_index(gene_name_label)  # remove any row with NA values from df
  final_df.\
    astype(float).\
    to_csv(temp_merged_output, index=True)  # dump raw counts as csv file
  rscript_cmd = [
    quote(self.rscript_path),
    quote(self.batch_effect_rscript_path),
    quote(temp_merged_output),
    quote(temp_cpm_output),
    quote(temp_png_output)]
  subprocess.check_call(' '.join(rscript_cmd), shell=True)  # run r script for cpm counts
  check_file_path(temp_cpm_output)  # check output file
  mod_data = pd.read_csv(temp_cpm_output).\
    rename(columns={'Unnamed: 0': gene_name_label}).\
    set_index(gene_name_label)  # read output file
  sns_fig = sns.clustermap(mod_data, figsize=(10, 10))
  sns_fig.fig.savefig(temp_clustermap)
  check_file_path(temp_clustermap)  # plot clustermap
  corr_df = mod_data.corr()
  cmap = sns.diverging_palette(220, 10, as_cmap=True)
  fig, ax = plt.subplots(figsize=(7, 7))
  sns.heatmap(corr_df,
              cmap=cmap,
              square=True,
              linewidths=.5,
              cbar_kws={"shrink": .4},
              ax=ax)
  plt.savefig(temp_corr)
  check_file_path(temp_corr)  # plot correlation values
  pca = PCA(n_components=2)
  X_r = pca.fit(mod_data.T).transform(mod_data.T)
  # FIX: labels are built with the 'reverse_strand' prefix above, but these
  # patterns previously matched 'rev_strand' — reverse-strand columns never
  # matched, so the raw label leaked into the flowcell/flowcell_lane groups
  pattern1 = re.compile(
    r'(reverse_strand|forward_strand|unstranded)_(\S+)_([1-8])')
  pattern2 = re.compile(
    r'(reverse_strand|forward_strand|unstranded)_(\S+_[1-8])')
  results_df = pd.DataFrame(
    {'PCA1': X_r[:, 0],
     'PCA2': X_r[:, 1],
     'flowcell': [re.match(pattern1, label).group(2)
                    if re.match(pattern1, label) else label
                      for label in mod_data.T.index],
     'flowcell_lane': [re.match(pattern2, label).group(2)
                         if re.match(pattern2, label) else label
                           for label in mod_data.T.index]})
  pca_plot = sns.lmplot('PCA1', 'PCA2', hue='flowcell', data=results_df, fit_reg=False)
  pca_plot.fig.savefig(temp_pca_flowcell)  # plot flowcell level pca
  pca_plot = sns.lmplot('PCA1', 'PCA2', hue='flowcell_lane', data=results_df, fit_reg=False)
  pca_plot.fig.savefig(temp_pca_flowcell_lane)  # plot flowcell-lane level pca
  template_env = Environment(
    loader=FileSystemLoader(searchpath=os.path.dirname(self.template_file)),
    autoescape=select_autoescape(['xml']))
  template_file = template_env.get_template(os.path.basename(self.template_file))
  template_file.\
    stream(ProjectName=project_name,
           SampleName=sample_name,
           mdsPlot=self._encode_png_image(png_file=temp_png_output),
           clustermapPlot=self._encode_png_image(png_file=temp_clustermap),
           corrPlot=self._encode_png_image(png_file=temp_corr),
           pca1Plot=self._encode_png_image(png_file=temp_pca_flowcell),
           pca2Plot=self._encode_png_image(png_file=temp_pca_flowcell_lane),
           ).\
    dump(temp_html_report)
  copy_local_file(temp_html_report, output_file, force=True)
def run(self):
  '''
  Validate sequencing index barcode stats for a run and seed the dataflow.

  Seeds 'dataflow_params' with {'barcode_qc_stats': 'PASS'} when validation
  succeeds, or 'FAIL' when an IndexBarcodeValidationError is raised; the
  failure message (and any plots) are posted to Slack and Asana.
  '''
  try:
    samplesheet_file = self.param_required('original_samplesheet')
    seqrun_igf_id = self.param_required('seqrun_igf_id')
    fastq_dir = self.param_required('fastq_dir')
    model_name = self.param_required('model_name')
    project_name = self.param_required('project_name')
    stats_filename = self.param('stats_filename')
    strict_check = self.param('strict_check')
    use_ephemeral_space = self.param('use_ephemeral_space')
    # temp working area for the validator
    work_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space)
    stats_json_file = os.path.join(fastq_dir, stats_filename)  # stats file path
    validator = \
      CheckSequenceIndexBarcodes(
        stats_json_file=stats_json_file,
        samplesheet_file=samplesheet_file,
        platform_name=model_name)
    validator.validate_barcode_stats(
      work_dir=work_dir,
      strict_check=strict_check)  # raises IndexBarcodeValidationError on bad stats
    self.param('dataflow_params',
               {'barcode_qc_stats': 'PASS'})  # seed dataflow for qc passed lanes
  except IndexBarcodeValidationError as e:
    self.param('dataflow_params',
               {'barcode_qc_stats': 'FAIL'})  # seed dataflow for failed lanes
    message = \
      'project: {0}, message:{1}'.format(project_name, e.message)
    if e.plots:
      # post each diagnostic plot to slack and asana
      for plot_file in e.plots:
        self.post_file_to_slack(message=message, filepath=plot_file)
        self.upload_file_to_asana_task(
          task_name=seqrun_igf_id,
          filepath=plot_file,
          comment=message)
    else:
      # no plots available: report the bare error text only
      self.post_message_to_slack(message=e.message, reaction='fail')
      self.comment_asana_task(task_name=seqrun_igf_id, comment=e.message)
  except Exception as e:
    message = \
      'seqrun: {2}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, seqrun_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')  # post msg to slack for failed jobs
    raise
def run_mem(self, mem_cmd='mem', parameter_options=("-M", ""), samtools_cmd='view', dry_run=False):
  '''
  A method for running Bwa mem and generate output alignment

  :param mem_cmd: Bwa mem command, default mem
  :param parameter_options: Bwa mem options as a flat tuple or dict, default ("-M", "")
  :param samtools_cmd: Samtools view command, default view
  :param dry_run: A toggle for returning the bwa cmd without running it, default False
  :returns: A alignment file path and bwa run cmd; for dry runs only the
            command list(s) are returned
  :raises IOError: If the temporary alignment output is missing
  :raises subprocess.CalledProcessError: If bwa or samtools exit non-zero
  '''
  self._run_checks()  # check input params
  read1_list, read2_list = \
    identify_fastq_pair(input_list=self.input_fastq_list)  # fetch input files
  temp_dir = get_temp_dir(use_ephemeral_space=self.use_ephemeral_space)
  bwa_cmd = [
    quote(self.bwa_exe),
    quote(mem_cmd),
    '-t',
    quote(str(self.thread))]
  if isinstance(parameter_options, tuple) and \
     len(parameter_options) > 0:
    # convert the flat ('-M', '') style tuple to a dict of flag -> value
    parameter_options = {
      item: parameter_options[index + 1]
        for index, item in enumerate(parameter_options)
          if index % 2 == 0}
  if isinstance(parameter_options, dict) and \
     len(parameter_options) > 0:
    # flatten the dict, dropping empty values
    parameter_options = [
      quote(str(field))
        for key, val in parameter_options.items()
          for field in [key, val]
            if field != '']
    bwa_cmd.extend(parameter_options)  # add mem specific options
  bwa_cmd.append(quote(self.ref_genome))
  bwa_cmd.append(quote(read1_list[0]))  # add read 1
  if len(read2_list) > 0:
    bwa_cmd.append(quote(read2_list[0]))  # add read 2
  if self.bam_output:
    temp_output_path = \
      os.path.join(temp_dir, '{0}.bam'.format(self.output_prefix))  # bam output
    samtools_view_cmd = [
      quote(self.samtools_exe),
      quote(samtools_cmd),
      quote('--threads'),
      quote(str(self.thread)),
      quote('-bo'),
      temp_output_path]
    if dry_run:
      return bwa_cmd, samtools_view_cmd  # return bwa and samtools cmd
    with subprocess.Popen(bwa_cmd, stdout=subprocess.PIPE) as proc:
      samtools_proc = \
        subprocess.Popen(
          ' '.join(samtools_view_cmd),
          shell=True,
          stdin=proc.stdout)
      proc.stdout.close()  # let bwa receive SIGPIPE if samtools dies early
      # FIX: previously the samtools process was never waited on, so a
      # partially written bam could be copied to the output dir
      samtools_proc.communicate()
      if samtools_proc.returncode != 0:
        raise subprocess.CalledProcessError(
          samtools_proc.returncode, samtools_view_cmd)
    if proc.returncode != 0:
      raise subprocess.CalledProcessError(proc.returncode, bwa_cmd)
  else:
    temp_output_path = \
      os.path.join(temp_dir, '{0}.sam'.format(self.output_prefix))  # sam output
    if dry_run:
      return bwa_cmd
    with open(temp_output_path, 'w') as sam:
      with subprocess.Popen(bwa_cmd, stdout=subprocess.PIPE) as proc:
        sam.write(proc.stdout.read().decode('utf-8'))  # writing sam output
      if proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, bwa_cmd)
  if os.path.exists(temp_output_path):
    final_output_file = \
      os.path.join(self.output_dir, os.path.basename(temp_output_path))
    copy_local_file(source_path=temp_output_path,
                    destinationa_path=final_output_file)
  else:
    raise IOError('Alignment temp output missing')
  return final_output_file, bwa_cmd
def run(self):
    '''
    Build a QC HTML page (project / sample / undetermined) from a jinja2
    template, copy it to the remote web host and seed 'dataflow_params'
    with the collected qc_file_info. Failures are posted to Slack and
    re-raised.
    '''
    try:
      seqrun_igf_id = self.param_required('seqrun_igf_id')
      project_name = self.param_required('project_name')
      seqrun_date = self.param_required('seqrun_date')
      flowcell_id = self.param_required('flowcell_id')
      remote_project_path = self.param_required('remote_project_path')
      remote_user = self.param_required('remote_user')
      remote_host = self.param_required('remote_host')
      template_dir = self.param_required('template_dir')
      page_type = self.param_required('page_type')
      fastq_dir = self.param('fastq_dir')
      multiqc_remote_file = self.param('multiqc_remote_file')
      lane_index_info = self.param('lane_index_info')
      qc_template_path = self.param('qc_template_path')
      project_template = self.param('project_template')
      undetermined_template = self.param('undetermined_template')
      sample_template = self.param('sample_template')
      project_filename = self.param('project_filename')
      sample_filename = self.param('sample_filename')
      undetermined_filename = self.param('undetermined_filename')
      report_html = self.param('report_html')
      remote_ftp_base = self.param('remote_ftp_base')
      use_ephemeral_space = self.param('use_ephemeral_space')
      if page_type not in ['project', 'sample', 'undetermined']:
        raise ValueError(
          'Project type {0} is not defined yet'.format(page_type))
      qc_template_path = \
        os.path.join(template_dir, qc_template_path)
      # remote dir layout: <base>/<project>/<date>/<flowcell>[/<lane_index>]
      remote_file_path = \
        os.path.join(
          remote_project_path,
          project_name,
          seqrun_date,
          flowcell_id)
      if lane_index_info is not None:
        remote_file_path = \
          os.path.join(
            remote_file_path,
            lane_index_info)  # generic remote path, lane info is none for project
      template_env = \
        Environment(
          loader=FileSystemLoader(searchpath=qc_template_path),
          autoescape=select_autoescape(['xml']))  # set template env
      #remote_chk_cmd=['ssh',\
      #                '{0}@{1}'.\
      #                format(remote_user,\
      #                       remote_host),\
      #                'ls']
      #remote_rm_cmd=['ssh',\
      #               '{0}@{1}'.\
      #               format(remote_user,\
      #                      remote_host),\
      #               'rm', \
      #               '-f']
      temp_work_dir = \
        get_temp_dir(use_ephemeral_space=use_ephemeral_space)  # get a temp dir
      report_output_file = None
      qc_file_info = dict()
      qc_file_info.\
        update({
          'project_name': project_name,
          'flowcell': flowcell_id,
        })
      if page_type == 'project':  # prepare project page
        (headerdata, qcmain) = self._process_projects_data()  # get required data for project qc page
        template_file = \
          template_env.get_template(project_template)
        report_output_file = \
          os.path.join(
            temp_work_dir,
            project_filename)
        template_file.\
          stream(
            ProjectName=project_name,
            SeqrunDate=seqrun_date,
            FlowcellId=flowcell_id,
            headerdata=headerdata,
            qcmain=qcmain).\
          dump(report_output_file)
        os.chmod(report_output_file, mode=0o754)  # rwxr-xr-- for web serving
        #remote_chk_cmd.append(os.path.join(remote_file_path,project_filename))
        #remote_rm_cmd.append(os.path.join(remote_file_path,project_filename))
      elif page_type == 'undetermined':  # prepare undetermined fastq page
        (headerdata, qcmain) = \
          self._process_undetermined_data(remote_file_path)  # get required data for undetermined qc page
        template_file = \
          template_env.get_template(undetermined_template)
        report_output_file = \
          os.path.join(
            temp_work_dir,
            undetermined_filename)
        template_file.\
          stream(
            ProjectName=project_name,
            SeqrunDate=seqrun_date,
            FlowcellId=flowcell_id,
            headerdata=headerdata,
            qcmain=qcmain).\
          dump(report_output_file)
        os.chmod(report_output_file, mode=0o754)
        #remote_chk_cmd.append(os.path.join(remote_file_path,undetermined_filename))
        #remote_rm_cmd.append(os.path.join(remote_file_path,undetermined_filename))
      elif page_type == 'sample':  # prepare sample page
        if lane_index_info is None:
          raise ValueError('Missing lane and index information')
        if fastq_dir is None:
          raise ValueError('Missing required fastq_dir')
        (headerdata, qcmain) = \
          self._process_samples_data()  # get required data for sample qc page
        (lane_id, index_length) = \
          lane_index_info.split('_', 1)  # get lane and index info
        template_file = \
          template_env.get_template(sample_template)  # get template file
        report_output_file = \
          os.path.join(
            temp_work_dir,
            sample_filename)
        template_file.\
          stream(
            ProjectName=project_name,
            SeqrunDate=seqrun_date,
            FlowcellId=flowcell_id,
            Lane=lane_id,
            IndexBarcodeLength=index_length,
            headerdata=headerdata,
            qcmain=qcmain).\
          dump(report_output_file)  # dump data to template file
        os.chmod(report_output_file, mode=0o754)
        #remote_chk_cmd.append(os.path.join(remote_file_path,sample_filename))
        #remote_rm_cmd.append(os.path.join(remote_file_path,sample_filename))
        remote_sample_qc_path = \
          os.path.join(
            remote_file_path,
            os.path.basename(report_output_file))
        if multiqc_remote_file is None:
          raise ValueError('required a valid path for remote multiqc')
        remote_path = \
          os.path.join(
            remote_project_path,
            project_name,
            seqrun_date,
            flowcell_id)  # get remote base path
        remote_sample_qc_path = \
          os.path.relpath(
            remote_sample_qc_path,
            start=remote_path)  # relative path for sample qc
        multiqc_remote_file = \
          os.path.relpath(
            multiqc_remote_file,
            start=remote_path)  # relative path for multiqc
        report_htmlname = os.path.basename(report_html)
        reports = list()
        # collect all demultiplexing html reports under fastq_dir matching
        # the configured report_html glob pattern
        for root, _, files in os.walk(top=fastq_dir):
          if report_htmlname in files:
            reports.\
              extend([os.path.join(os.path.abspath(root), file) \
                        for file in files \
                          if fnmatch.fnmatch(os.path.join(root, file), report_html)])  # get all html reports
        if len(reports) == 0:
          raise ValueError('No demultiplexing report found for fastq dir {0}'.\
                           format(fastq_dir))
        os.chmod(reports[0], mode=0o774)  # added read permission for report html
        copy_remote_file(source_path=reports[0],
                         destinationa_path=remote_file_path,
                         destination_address='{0}@{1}'.format(
                           remote_user, remote_host))  # copy file to remote
        remote_report_file = \
          os.path.join(
            remote_file_path,
            os.path.basename(reports[0]))  # get remote path for report file
        remote_report_file = \
          os.path.relpath(
            remote_report_file,
            start=remote_path)  # get relative path for demultiplexing report
        # sample pages replace the generic qc_file_info with a richer record
        qc_file_info = \
          {'lane_id': lane_id,
           'index_length': index_length,
           'sample_qc_page': remote_sample_qc_path,
           'multiqc_page': multiqc_remote_file,
           'demultiplexing_report': remote_report_file,
           'fastq_dir': fastq_dir,
           'project_name': project_name,
           }
      #response=subprocess.call(remote_chk_cmd)
      #if response!=0:
      #  subprocess.check_call(remote_rm_cmd) # remove existing remote file
      if not os.path.exists(report_output_file):
        raise IOError('file {0} not found'.format(report_output_file))
      copy_remote_file(
        source_path=report_output_file,
        destinationa_path=remote_file_path,
        destination_address='{0}@{1}'.format(remote_user, remote_host))  # copy file to remote
      remote_qc_page = \
        os.path.join(
          remote_file_path,
          os.path.basename(report_output_file))
      qc_file_info.\
        update({'remote_qc_page': remote_qc_page})
      self.param('dataflow_params', {'qc_file_info': qc_file_info})
      remote_url_path = \
        'http://{0}/{1}'.\
        format(remote_host,
               os.path.relpath(
                 remote_qc_page,
                 start=remote_ftp_base))
      message = \
        'QC page {0}, {1},{2}: {3}'.\
        format(
          seqrun_igf_id,
          project_name,
          page_type,
          remote_url_path)
      self.post_message_to_slack(message, reaction='pass')  # send msg to slack
      self.comment_asana_task(
        task_name=seqrun_igf_id,
        comment=message)  # send msg to asana
    except Exception as e:
      message = \
        'seqrun: {2}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          seqrun_igf_id)
      self.warning(message)
      self.post_message_to_slack(
        message, reaction='fail')  # post msg to slack for failed jobs
      raise
def run(self):
    '''
    Run fastq_screen for one fastq file, copy the txt/html/png outputs to the
    results dir, optionally register known-sample outputs as a db collection,
    and seed 'dataflow_params'. Failures are posted to Slack and re-raised.
    '''
    try:
      fastq_file = self.param_required('fastq_file')
      fastq_dir = self.param_required('fastq_dir')
      igf_session_class = self.param_required('igf_session_class')
      seqrun_igf_id = self.param_required('seqrun_igf_id')
      base_results_dir = self.param_required('base_results_dir')
      project_name = self.param_required('project_name')
      seqrun_date = self.param_required('seqrun_date')
      flowcell_id = self.param_required('flowcell_id')
      fastqscreen_exe = self.param_required('fastqscreen_exe')
      fastqscreen_conf = self.param_required('fastqscreen_conf')
      tag = self.param_required('tag')
      lane_index_info = self.param_required('lane_index_info')
      sample_name = self.param('sample_name')
      fastqscreen_options = self.param('fastqscreen_options')
      force_overwrite = self.param('force_overwrite')
      fastqscreen_dir_label = self.param('fastqscreen_dir_label')
      fastqs_collection_type = self.param('fastqs_collection_type')
      hpc_location = self.param('hpc_location')
      store_file = self.param('store_file')
      required_collection_table = self.param('required_collection_table')
      use_ephemeral_space = self.param('use_ephemeral_space')
      if lane_index_info is None:
        lane_index_info = os.path.basename(
          fastq_dir)  # get the lane and index length info
      fastq_file_label = \
        os.path.basename(fastq_file).replace('.fastq.gz', '')
      if tag == 'known' and store_file:  # fetch sample name for known fastq, if its not defined
        base = BaseAdaptor(**{'session_class': igf_session_class})
        base.start_session()  # connect to db
        ca = CollectionAdaptor(**{'session': base.session})
        (collection_name, collection_table) = \
          ca.fetch_collection_name_and_table_from_file_path(
            file_path=fastq_file)  # fetch collection name and table info
        if collection_table != required_collection_table:
          raise ValueError(
            'Expected collection table {0} and got {1}, {2}'.\
            format(
              required_collection_table,
              collection_table,
              fastq_file))
        ra = RunAdaptor(**{'session': base.session})
        sample = ra.fetch_sample_info_for_run(
          run_igf_id=collection_name)
        sample_name = sample['sample_igf_id']
        base.close_session()
      # results layout: <base>/<project>/<date>/<flowcell>/<lane_index>/<tag>
      fastqscreen_result_dir = \
        os.path.join(
          base_results_dir,
          project_name,
          seqrun_date,
          flowcell_id,
          lane_index_info,
          tag)  # result dir path is generic
      if sample_name is not None:
        fastqscreen_result_dir = \
          os.path.join(
            fastqscreen_result_dir,
            sample_name)  # add sample name to dir path only if its available
      fastqscreen_result_dir = \
        os.path.join(
          fastqscreen_result_dir,
          fastq_file_label,
          fastqscreen_dir_label)  # keep multiple files under same dir
      if os.path.exists(fastqscreen_result_dir) and force_overwrite:
        remove_dir(
          fastqscreen_result_dir
        )  # remove existing output dir if force_overwrite is true
      if not os.path.exists(fastqscreen_result_dir):
        os.makedirs(fastqscreen_result_dir,
                    mode=0o775)  # create output dir if its not present
      temp_work_dir = \
        get_temp_dir(use_ephemeral_space=use_ephemeral_space)  # get a temp work dir
      if not os.path.exists(fastq_file):
        raise IOError('fastq file {0} not readable'.format(
          fastq_file))  # raise if fastq file path is not readable
      fastqscreen_output = os.path.join(temp_work_dir, fastq_file_label)
      os.mkdir(fastqscreen_output)  # create fastqc output dir
      fastqscreen_param = self.format_tool_options(
        fastqscreen_options)  # format fastqc params
      fastqscreen_cmd = \
        [fastqscreen_exe,
         '-conf', fastqscreen_conf,
         '--outdir', fastqscreen_output,
         ]  # fastqscreen base parameters
      fastqscreen_cmd.extend(
        fastqscreen_param)  # add additional parameters
      fastqscreen_cmd.append(fastq_file)  # fastqscreen input file
      subprocess.check_call(fastqscreen_cmd)  # run fastqscreen
      # walk the temp output and copy each txt/html/png file to results dir
      fastqscreen_stat = None
      fastqscreen_html = None
      fastqscreen_png = None
      for root, _, files in os.walk(top=fastqscreen_output):
        for file in files:
          if fnmatch.fnmatch(file, '*.txt'):
            input_fastqs_txt = os.path.join(root, file)
            copy2(input_fastqs_txt, fastqscreen_result_dir)
            fastqscreen_stat = os.path.join(
              fastqscreen_result_dir, file)
          if fnmatch.fnmatch(file, '*.html'):
            input_fastqs_html = os.path.join(root, file)
            copy2(input_fastqs_html,
                  fastqscreen_result_dir)
            fastqscreen_html = os.path.join(
              fastqscreen_result_dir, file)
          if fnmatch.fnmatch(file, '*.png'):
            input_fastqs_png = os.path.join(root, file)
            copy2(input_fastqs_png, fastqscreen_result_dir)
            fastqscreen_png = os.path.join(fastqscreen_result_dir, file)
      if fastqscreen_stat is None or fastqscreen_html is None or \
         fastqscreen_png is None:
        raise ValueError('Missing required file, stat: {0}, html: {1}, png: {2}'.\
                         format(fastqscreen_stat,
                                fastqscreen_html,
                                fastqscreen_png))
      if tag == 'known' and store_file:
        # register the three outputs under the run's collection
        fastqs_files = \
          [{'name': collection_name,
            'type': fastqs_collection_type,
            'table': required_collection_table,
            'file_path': fastqscreen_stat,
            'location': hpc_location},
           {'name': collection_name,
            'type': fastqs_collection_type,
            'table': required_collection_table,
            'file_path': fastqscreen_html,
            'location': hpc_location},
           {'name': collection_name,
            'type': fastqs_collection_type,
            'table': required_collection_table,
            'file_path': fastqscreen_png,
            'location': hpc_location},
           ]
        ca = CollectionAdaptor(**{'session_class': igf_session_class})
        ca.start_session()
        ca.load_file_and_create_collection(
          data=fastqs_files)  # store fastqs files to db
        ca.close_session()
      self.param('dataflow_params',
                 {'fastqscreen_html': fastqscreen_html,
                  'lane_index_info': lane_index_info,
                  'sample_name': sample_name,
                  'fastqscreen': \
                    {'fastq_dir': fastq_dir,
                     'fastqscreen_stat': fastqscreen_stat,
                     'fastqscreen_html': fastqscreen_html,
                     }})  # set dataflow params
    except Exception as e:
      message = \
        'seqrun: {2}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          seqrun_igf_id)
      self.warning(message)
      self.post_message_to_slack(
        message, reaction='fail')  # post msg to slack for failed jobs
      raise
def run(self):
    '''
    Generate a lane-level batch effect report for a sample's STAR gene count
    files and load it to disk and db via Analysis_collection_utils. Requires
    at least three input files; with fewer the dataflow is seeded with an
    empty list. Failures are posted to Slack and re-raised.
    '''
    try:
      project_igf_id = self.param_required('project_igf_id')
      experiment_igf_id = self.param_required('experiment_igf_id')
      sample_igf_id = self.param_required('sample_igf_id')
      input_files = self.param_required('input_files')
      igf_session_class = self.param_required('igf_session_class')
      template_report_file = self.param_required('template_report_file')
      rscript_path = self.param_required('rscript_path')
      batch_effect_rscript_path = self.param_required('batch_effect_rscript_path')
      base_result_dir = self.param_required('base_result_dir')
      strand_info = self.param('strand_info')
      read_threshold = self.param('read_threshold')
      collection_type = self.param('collection_type')
      collection_table = self.param('collection_table')
      analysis_name = self.param('analysis_name')
      tag_name = self.param('tag_name')
      use_ephemeral_space = self.param('use_ephemeral_space')
      output_file_list = None
      if len(input_files) == 0:
        # NOTE(review): 'bactch' typo kept — runtime message text
        raise ValueError('No input files found for bactch effect checking')
      elif len(input_files) < 3:
        output_file_list = ''  # can't run batch effect checking on less than 3 lanes
      else:
        for file in input_files:
          check_file_path(file)  # check input filepath
        # build the per-file flowcell/lane metadata required by the report
        file_data = list()
        ra = RunAdaptor(**{'session_class': igf_session_class})
        ra.start_session()
        for file in input_files:
          run_igf_id = \
            os.path.basename(file).\
            replace('ReadsPerGene.out.tab', '')  # using simple string match to fetch run igf ids
          flowcell_id, lane_id = \
            ra.fetch_flowcell_and_lane_for_run(run_igf_id=run_igf_id)  # fetch flowcell id and lane info
          file_data.append({'file': file,
                            'flowcell': flowcell_id,
                            'lane': lane_id
                            })
        ra.close_session()
        temp_dir = \
          get_temp_dir(use_ephemeral_space=use_ephemeral_space)
        temp_json_file = \
          os.path.join(temp_dir, 'star_gene_counts.json')  # temp json file path
        temp_output_file = \
          os.path.join(
            temp_dir,
            os.path.basename(template_report_file))  # temp report file path
        with open(temp_json_file, 'w') as jp:
          json.dump(file_data, jp, indent=2)  # dumping json output
        br = Batch_effect_report(
          input_json_file=temp_json_file,
          template_file=template_report_file,
          rscript_path=rscript_path,
          batch_effect_rscript_path=batch_effect_rscript_path,
          strand_info=strand_info,
          read_threshold=read_threshold
        )  # set up batch effect run
        br.check_lane_effect_and_log_report(
          project_name=project_igf_id,
          sample_name=sample_igf_id,
          output_file=temp_output_file
        )  # generate report file
        au = Analysis_collection_utils(
          dbsession_class=igf_session_class,
          analysis_name=analysis_name,
          base_path=base_result_dir,
          tag_name=tag_name,
          collection_name=experiment_igf_id,
          collection_type=collection_type,
          collection_table=collection_table
        )  # prepare to load file
        output_file_list = \
          au.load_file_to_disk_and_db(
            input_file_list=[temp_output_file])  # load file to db and disk
        self.param('dataflow_params',
                   {'batch_effect_reports': output_file_list})  # populating data flow only if report is present
    except Exception as e:
      message = \
        'project: {2}, sample:{3}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
      self.warning(message)
      self.post_message_to_slack(message, reaction='fail')  # post msg to slack for failed jobs
      raise
def run(self):
    '''
    A method for running samtools commands

    Dispatches on samtools_command (idxstats / flagstat / stats / merge /
    view_bamToCram / view_filterBam), moves outputs into the job work dir or
    registers them as db collections, then seeds 'dataflow_params' with
    analysis_files and output_bam_cram_list.

    :param project_igf_id: A project igf id
    :param sample_igf_id: A sample igf id
    :param experiment_igf_id: A experiment igf id
    :param igf_session_class: A database session class
    :param reference_type: Reference genome collection type, default GENOME_FASTA
    :param threads: Number of threads to use for Bam to Cram conversion, default 4
    :param base_work_dir: Base workd directory
    :param samtools_command: Samtools command
    :param samFlagInclude: Sam flags to include in filtered bam, default None
    :param samFlagExclude: Sam flags to exclude from the filtered bam, default None
    :param mapq_threshold: Skip alignments with MAPQ smaller than this value, default None
    :param use_encode_filter: For samtools filter, use Encode epigenome filter, i.e. samFlagExclude 1804(PE) / 1796(SE), default False
    :param encodePeExcludeFlag: For samtools filter, Encode exclude flag for PE reads, default 1804
    :param encodeSeExcludeFlag: For samtools filter, Encode exclude flag for PE reads, default 1796
    :param use_ephemeral_space: A toggle for temp dir settings, default 0
    :param copy_input: A toggle for copying input file to temp, 1 for True default 0 for False
    '''
    try:
      temp_output_dir = False
      project_igf_id = self.param_required('project_igf_id')
      sample_igf_id = self.param_required('sample_igf_id')
      experiment_igf_id = self.param_required('experiment_igf_id')
      igf_session_class = self.param_required('igf_session_class')
      input_files = self.param_required('input_files')
      samtools_exe = self.param_required('samtools_exe')
      reference_type = self.param('reference_type')
      threads = self.param('threads')
      base_work_dir = self.param_required('base_work_dir')
      samtools_command = self.param_required('samtools_command')
      analysis_files = self.param_required('analysis_files')
      output_prefix = self.param_required('output_prefix')
      load_metrics_to_cram = self.param('load_metrics_to_cram')
      cram_collection_type = self.param('cram_collection_type')
      collection_table = self.param('collection_table')
      base_result_dir = self.param('base_result_dir')
      analysis_name = self.param('analysis_name')
      force_overwrite = self.param('force_overwrite')
      samFlagInclude = self.param('samFlagInclude')
      samFlagExclude = self.param('samFlagExclude')
      mapq_threshold = self.param('mapq_threshold')
      library_layout = self.param_required('library_layout')
      use_encode_filter = self.param('use_encode_filter')
      species_name = self.param_required('species_name')
      seed_date_stamp = self.param_required('date_stamp')
      use_ephemeral_space = self.param('use_ephemeral_space')
      seed_date_stamp = get_datestamp_label(seed_date_stamp)
      if output_prefix is not None:
        output_prefix = \
          '{0}_{1}'.\
          format(
            output_prefix,
            seed_date_stamp)  # adding datestamp to the output file prefix
      if use_encode_filter:
        # ENCODE filtering overrides any include flag; exclude flag depends
        # on the library layout (1804 for PE, 1796 for SE)
        samFlagInclude = None
        if library_layout == 'PAIRED':
          samFlagExclude = 1804
        else:
          samFlagExclude = 1796
      if not isinstance(input_files, list) or \
         len(input_files) == 0:
        raise ValueError('No input file found')
      if len(input_files) > 1:
        raise ValueError('More than one input file found: {0}'.\
                         format(input_files))
      output_bam_cram_list = list()
      input_file = input_files[0]
      temp_output_dir = \
        get_temp_dir(
          use_ephemeral_space=use_ephemeral_space)  # get temp work dir
      work_dir_prefix = \
        os.path.join(
          base_work_dir,
          project_igf_id,
          sample_igf_id,
          experiment_igf_id)
      work_dir = \
        self.get_job_work_dir(work_dir=work_dir_prefix)  # get a run work dir
      samtools_cmdline = ''
      temp_output = None
      if samtools_command == 'idxstats':
        temp_output, samtools_cmdline = \
          run_bam_idxstat(
            samtools_exe=samtools_exe,
            bam_file=input_file,
            output_dir=temp_output_dir,
            output_prefix=output_prefix,
            force=True)  # run samtools idxstats
      elif samtools_command == 'flagstat':
        temp_output, samtools_cmdline = \
          run_bam_flagstat(
            samtools_exe=samtools_exe,
            bam_file=input_file,
            output_dir=temp_output_dir,
            output_prefix=output_prefix,
            threads=threads,
            force=True)  # run samtools flagstat
      elif samtools_command == 'stats':
        temp_output, samtools_cmdline, stats_metrics = \
          run_bam_stats(
            samtools_exe=samtools_exe,
            bam_file=input_file,
            output_dir=temp_output_dir,
            output_prefix=output_prefix,
            threads=threads,
            force=True)  # run samtools stats
        if load_metrics_to_cram and \
           len(stats_metrics) > 0:
          # attach stats metrics to the experiment's CRAM collection,
          # rolling back the db session on any failure
          ca = CollectionAdaptor(
            **{'session_class': igf_session_class})
          attribute_data = \
            ca.prepare_data_for_collection_attribute(
              collection_name=experiment_igf_id,
              collection_type=cram_collection_type,
              data_list=stats_metrics)
          ca.start_session()
          try:
            ca.create_or_update_collection_attributes(
              data=attribute_data,
              autosave=False)
            ca.commit_session()
            ca.close_session()
          except Exception as e:
            ca.rollback_session()
            ca.close_session()
            raise ValueError('Failed to load data to db: {0}'.\
                             format(e))
      elif samtools_command == 'merge':
        if output_prefix is None:
          raise ValueError(
            'Missing output filename prefix for merged bam')
        sorted_by_name = self.param('sorted_by_name')
        # merged bam goes straight to the work dir, not the temp dir
        temp_output = \
          os.path.join(
            work_dir,
            '{0}_merged.bam'.format(output_prefix))
        samtools_cmdline = \
          merge_multiple_bam(
            samtools_exe=samtools_exe,
            input_bam_list=input_file,
            output_bam_path=temp_output,
            sorted_by_name=sorted_by_name,
            threads=threads,
            use_ephemeral_space=use_ephemeral_space,
            force=True)
      elif samtools_command == 'view_bamToCram':
        if base_result_dir is None:
          raise ValueError(
            'base_result_dir is required for CRAM file loading')
        if analysis_name is None:
          raise ValueError(
            'analysis_name is required for CRAM file loading')
        ref_genome = \
          Reference_genome_utils(
            genome_tag=species_name,
            dbsession_class=igf_session_class,
            genome_fasta_type=reference_type)
        genome_fasta = ref_genome.get_genome_fasta()  # get genome fasta
        cram_file = \
          os.path.basename(input_file).\
          replace('.bam', '.cram')  # get base cram file name
        cram_file = os.path.join(
          temp_output_dir, cram_file)  # get cram file path in work dir
        samtools_cmdline = \
          convert_bam_to_cram(
            samtools_exe=samtools_exe,
            bam_file=input_file,
            reference_file=genome_fasta,
            cram_path=cram_file,
            use_ephemeral_space=use_ephemeral_space,
            threads=threads,
            force=True,
            dry_run=False)
        au = \
          Analysis_collection_utils(
            dbsession_class=igf_session_class,
            analysis_name=analysis_name,
            tag_name=species_name,
            collection_name=experiment_igf_id,
            collection_type=cram_collection_type,
            collection_table=collection_table,
            base_path=base_result_dir)
        temp_output_bam_cram_list = \
          au.load_file_to_disk_and_db(
            input_file_list=[cram_file],
            file_suffix='cram',
            withdraw_exisitng_collection=force_overwrite)  # load file to db and disk
        for cram in temp_output_bam_cram_list:
          # index each loaded cram and report both file and index downstream
          index_bam_or_cram(
            samtools_exe=samtools_exe,
            input_path=cram,
            threads=threads,
            dry_run=False)
          index_path = '{0}.crai'.format(cram)
          output_bam_cram_list.append(cram)
          output_bam_cram_list.append(index_path)
        if len(output_bam_cram_list) == 0:
          raise ValueError('No output cram file found')
      elif samtools_command == 'view_filterBam':
        temp_output_bam = \
          os.path.join(
            temp_output_dir,
            os.path.basename(input_file).replace('.bam', '.filtered.bam'))
        samtools_cmdline = \
          filter_bam_file(
            samtools_exe=samtools_exe,
            input_bam=input_file,
            output_bam=temp_output_bam,
            samFlagInclude=samFlagInclude,
            samFlagExclude=samFlagExclude,
            threads=threads,
            mapq_threshold=mapq_threshold,
            index_output=False,
            dry_run=False)
        dest_path = \
          os.path.join(
            work_dir,
            os.path.basename(temp_output_bam))
        move_file(
          source_path=temp_output_bam,
          destinationa_path=dest_path,
          force=True)
        index_bam_or_cram(
          samtools_exe=samtools_exe,
          input_path=dest_path,
          threads=threads,
          dry_run=False)
        index_path = '{0}.bai'.format(dest_path)
        output_bam_cram_list.append(dest_path)
        output_bam_cram_list.append(index_path)
      else:
        raise ValueError('Samtools command {0} not supported'.\
                         format(samtools_command))
      if temp_output is not None:
        # relocate any temp output to the job work dir before reporting it
        dest_path = \
          os.path.join(
            work_dir,
            os.path.basename(temp_output))
        if dest_path != temp_output:
          move_file(
            source_path=temp_output,
            destinationa_path=dest_path,
            force=True)
        analysis_files.append(dest_path)
      self.param(
        'dataflow_params', {
          'analysis_files': analysis_files,
          'output_bam_cram_list': output_bam_cram_list
        })  # pass on samtools output list
      message = \
        'finished samtools {0} for {1} {2}'.\
        format(
          samtools_command,
          project_igf_id,
          sample_igf_id)
      self.post_message_to_slack(message, reaction='pass')  # send log to slack
      message = \
        'finished samtools {0} for {1} {2}: {3}'.\
        format(
          samtools_command,
          project_igf_id,
          sample_igf_id,
          samtools_cmdline)
      #self.comment_asana_task(task_name=project_igf_id, comment=message) # send comment to Asana
    except Exception as e:
      message = \
        'project: {2}, sample:{3}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
      self.warning(message)
      self.post_message_to_slack(
        message, reaction='fail')  # post msg to slack for failed jobs
      raise
def run(self):
  '''
  Build project-level report files and copy them to the remote web server.

  Generates, in a temp dir: a gviz json of per-sample read counts, a csv of
  the same counts, a seqrun info json and a pipeline-status gviz json, then
  copies each to <remote_project_path>/<project_name> via
  self._check_and_copy_remote_file. On success sets 'dataflow_params' and
  removes the temp dir; on any error warns, posts to slack and re-raises.

  NOTE(review): looks like an eHive runnable — params come from
  self.param / self.param_required; confirm against the base class.
  '''
  try:
    # pipeline parameters (required first, then optional with defaults)
    seqrun_igf_id = self.param_required('seqrun_igf_id')
    project_name = self.param_required('project_name')
    remote_project_path = self.param_required('remote_project_path')
    igf_session_class = self.param_required('igf_session_class')
    remote_user = self.param_required('remote_user')
    remote_host = self.param_required('remote_host')
    seqruninfofile = self.param('seqruninfofile')
    samplereadcountfile = self.param('samplereadcountfile')
    samplereadcountcsvfile = self.param('samplereadcountcsvfile')
    status_data_json = self.param('status_data_json')
    pipeline_name = self.param_required('pipeline_name')
    analysis_pipeline_name = self.param_required(
      'analysis_pipeline_name')
    sample_column = self.param('sample_column')
    use_ephemeral_space = self.param('use_ephemeral_space')
    temp_work_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)       # get a temp dir
    temp_read_count_output = \
      os.path.join(
        temp_work_dir,
        samplereadcountfile)                                      # path for temp read count json file
    temp_read_count_csv_output = \
      os.path.join(
        temp_work_dir,
        samplereadcountcsvfile)                                   # path for temp read count csv file
    temp_seqrun_info = \
      os.path.join(
        temp_work_dir,
        seqruninfofile)                                           # path for temp seqrun info file
    raw_read_count = \
      get_project_read_count(
        session_class=igf_session_class,
        project_igf_id=project_name)                              # get raw read count for project
    (description, read_count_data, column_order) = \
      convert_project_data_gviz_data(input_data=raw_read_count)   # convert read count to gviz requirements
    convert_to_gviz_json_for_display(
      description=description,
      data=read_count_data,
      columns_order=column_order,
      output_file=temp_read_count_output)                         # write data to output json file
    read_count_data = pd.DataFrame(read_count_data)
    # NOTE(review): this check can never fail right after the
    # pd.DataFrame() conversion above — kept as a defensive guard
    if not isinstance(read_count_data, pd.DataFrame):
      raise ValueError('Expecting a pandas dataframe, and got {0}'.\
                       format(type(read_count_data)))
    read_count_data.\
      set_index(sample_column).\
      to_csv(
        temp_read_count_csv_output,
        index=True)                                               # create csv output for project data
    seqrun_data = \
      get_seqrun_info_for_project(
        session_class=igf_session_class,
        project_igf_id=project_name)                              # fetch seqrun info for each projects
    add_seqrun_path_info(
      input_data=seqrun_data,
      output_file=temp_seqrun_info)                               # write seqrun info json
    remote_project_dir = \
      os.path.join(
        remote_project_path,
        project_name)                                             # get remote project directory path
    self._check_and_copy_remote_file(
      remote_user=remote_user,
      remote_host=remote_host,
      source_file=temp_seqrun_info,
      remote_file=os.path.join(remote_project_dir,
                               seqruninfofile))                   # copy seqrun info file to remote
    self._check_and_copy_remote_file(
      remote_user=remote_user,
      remote_host=remote_host,
      source_file=temp_read_count_output,
      remote_file=os.path.join(remote_project_dir,
                               samplereadcountfile))              # copy sample read count json file to remote
    os.chmod(temp_read_count_csv_output, mode=0o754)              # change file permission before copy
    self._check_and_copy_remote_file(
      remote_user=remote_user,
      remote_host=remote_host,
      source_file=temp_read_count_csv_output,
      remote_file=os.path.join(remote_project_dir,
                               samplereadcountcsvfile))           # copy sample read count csv file to remote
    ps = Project_status(
      igf_session_class=igf_session_class,
      project_igf_id=project_name)
    temp_status_output = \
      os.path.join(
        temp_work_dir,
        status_data_json)                                         # path for temp status file
    ps.generate_gviz_json_file(
      output_file=temp_status_output,
      demultiplexing_pipeline=pipeline_name,
      analysis_pipeline=analysis_pipeline_name,
      active_seqrun_igf_id=seqrun_igf_id)                         # write status data to output json file
    self._check_and_copy_remote_file(
      remote_user=remote_user,
      remote_host=remote_host,
      source_file=temp_status_output,
      remote_file=os.path.join(remote_project_dir,
                               status_data_json))                 # copy project status file to remote
    self.param('dataflow_params', {'remote_project_info': 'done'})
    remove_dir(temp_work_dir)                                     # remove temp dir
  except Exception as e:
    # report failure to hive log and slack, then propagate
    message = \
      'seqrun: {2}, Error in {0}: {1}'.\
      format(
        self.__class__.__name__,
        e,
        seqrun_igf_id)
    self.warning(message)
    self.post_message_to_slack(
      message, reaction='fail')                                   # post msg to slack for failed jobs
    raise
def run(self):
  '''
  Copy a list of files or directories to a remote host and optionally
  register the remote paths as a file collection in the db.

  Each entry of file_list is staged in a temp dir, chmod-ed (files 0o764,
  dirs 0o775), then rsync/scp-ed via copy_remote_file to
  <remote_project_path>/<project_igf_id>[/<dir_labels...>]. When
  collect_remote_file is set, existing collection-group info for
  (collection_name, collection_type) is replaced by the new remote paths
  in one db transaction. On error warns, posts to slack and re-raises.
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    file_list = self.param_required('file_list')
    remote_user = self.param_required('remote_user')
    remote_host = self.param_required('remote_host')
    remote_project_path = self.param_required('remote_project_path')
    dir_labels = self.param_required('dir_labels')
    igf_session_class = self.param_required('igf_session_class')
    force_overwrite = self.param('force_overwrite')
    collect_remote_file = self.param('collect_remote_file')
    collection_name = self.param('collection_name')
    collection_type = self.param('collection_type')
    collection_table = self.param('collection_table')
    file_location = self.param('file_location')
    use_ephemeral_space = self.param('use_ephemeral_space')
    destination_output_path = \
      os.path.join(
        remote_project_path,
        project_igf_id)                                           # get base destination path
    if isinstance(dir_labels, list) and \
       len(dir_labels) > 0:
      # optional sub-directory labels under the project dir
      destination_output_path = \
        os.path.join(destination_output_path,
                     *dir_labels)
    if collect_remote_file:
      # fail early: db registration needs both name and type
      if collection_name is None or \
         collection_type is None:
        raise ValueError('Name and type are required for db collection')
    output_file_list = list()
    temp_work_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)       # get temp dir
    for file in file_list:
      if not os.path.exists(file):
        raise IOError('file {0} not found'.\
                      format(file))
      if os.path.isfile(file):
        copy2(
          file,
          os.path.join(
            temp_work_dir,
            os.path.basename(file)))                              # copy file to a temp dir
        dest_file_path = \
          os.path.join(
            destination_output_path,
            os.path.basename(file))                               # get destination file path
        os.chmod(
          os.path.join(
            temp_work_dir,
            os.path.basename(file)),
          mode=0o764)                                             # set file permission
      elif os.path.isdir(file):
        copytree(
          file,
          os.path.join(
            temp_work_dir,
            os.path.basename(file)))                              # copy dir to a temp dir
        dest_file_path = destination_output_path
        # NOTE(review): walks the whole temp dir, so earlier entries get
        # re-chmod-ed too — harmless since the modes are identical
        for root, dirs, files in os.walk(temp_work_dir):
          for dir_name in dirs:
            os.chmod(
              os.path.join(root, dir_name),
              mode=0o775)
          for file_name in files:
            os.chmod(
              os.path.join(root, file_name),
              mode=0o764)                                         # changing file and dir permissions for remote files
      else:
        raise ValueError('Unknown source file type: {0}'.\
                         format(file))
      #os.chmod(
      #  os.path.join(
      #    temp_work_dir,
      #    os.path.basename(file)),
      #  mode=0o754)                                              # set file permission
      copy_remote_file(
        source_path=os.path.join(temp_work_dir,
                                 os.path.basename(file)),
        destinationa_path=dest_file_path,
        destination_address='{0}@{1}'.format(remote_user, remote_host),
        force_update=force_overwrite
      )                                                           # copy file to remote
      if os.path.isdir(file):
        # for dir input the remote path gains the dir's basename
        dest_file_path = \
          os.path.join(
            dest_file_path,
            os.path.basename(file))                               # fix for dir input
      output_file_list.append(dest_file_path)
    remove_dir(dir_path=temp_work_dir)                            # remove temp dir
    self.param('dataflow_params',
               {'status': 'done',
                'output_list': output_file_list})                 # add dataflow params
    if collect_remote_file:
      # register the remote paths as a db collection, replacing any
      # existing group info for the same (name, type) atomically
      data = list()
      remove_data_list = [{'name': collection_name,
                           'type': collection_type}]
      for file in output_file_list:
        data.append(
          {'name': collection_name,
           'type': collection_type,
           'table': collection_table,
           'file_path': file,
           'location': file_location
          })
      ca = CollectionAdaptor(**{'session_class': igf_session_class})
      ca.start_session()
      try:
        ca.remove_collection_group_info(
          data=remove_data_list,
          autosave=False)                                         # remove existing data before loading new collection
        ca.load_file_and_create_collection(
          data=data,
          autosave=False,
          calculate_file_size_and_md5=False)                      # load remote files to db
        ca.commit_session()                                       # commit changes
        ca.close_session()
      except:
        ca.rollback_session()                                     # rollback changes
        ca.close_session()
        raise
  except Exception as e:
    # report failure to hive log and slack, then propagate
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.\
      format(
        self.__class__.__name__,
        e,
        project_igf_id,
        sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')          # post msg to slack for failed jobs
    raise
def run(self):
  '''
  Generate project analysis data files (analysis json, chart json and csv)
  via Project_analysis and copy each to the remote project directory.

  Writes the three outputs to a temp dir, copies them to
  <remote_project_path>/<project_igf_id>/ with
  self._check_and_copy_remote_file, then sets 'dataflow_params' with the
  remote path of the analysis json. On error warns, posts to slack and
  re-raises.
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    collection_type_list = self.param_required('collection_type_list')
    analysis_data_json = self.param_required('analysis_data_json')
    igf_session_class = self.param_required('igf_session_class')
    remote_project_path = self.param_required('remote_project_path')
    remote_user = self.param_required('remote_user')
    remote_host = self.param_required('remote_host')
    remote_analysis_dir = self.param('remote_analysis_dir')
    pipeline_name = self.param_required('pipeline_name')
    attribute_collection_file_type = self.param(
      'attribute_collection_file_type')
    pipeline_seed_table = self.param('pipeline_seed_table')
    pipeline_finished_status = self.param('pipeline_finished_status')
    chart_data_json = self.param('chart_data_json')
    chart_data_csv = self.param('chart_data_csv')
    sample_id_label = self.param('sample_id_label')
    use_ephemeral_space = self.param('use_ephemeral_space')
    temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space)
    # local output paths inside the temp dir
    output_file = os.path.join(temp_dir, analysis_data_json)
    chart_json_output_file = os.path.join(temp_dir, chart_data_json)
    csv_output_file = os.path.join(temp_dir, chart_data_csv)
    prj_data = \
      Project_analysis(
        igf_session_class=igf_session_class,
        collection_type_list=collection_type_list,
        remote_analysis_dir=remote_analysis_dir,
        attribute_collection_file_type=attribute_collection_file_type,
        pipeline_name=pipeline_name,
        pipeline_seed_table=pipeline_seed_table,
        pipeline_finished_status=pipeline_finished_status,
        use_ephemeral_space=use_ephemeral_space,
        sample_id_label=sample_id_label)
    prj_data.\
      get_analysis_data_for_project(
        project_igf_id=project_igf_id,
        output_file=output_file,
        chart_json_output_file=chart_json_output_file,
        csv_output_file=csv_output_file)                          # build the three analysis outputs
    remote_file_path = \
      os.path.join(
        remote_project_path,
        project_igf_id,
        analysis_data_json)
    self._check_and_copy_remote_file(
      remote_user=remote_user,
      remote_host=remote_host,
      source_file=output_file,
      remote_file=remote_file_path)                               # copy analysis json to remote
    remote_chart_file_path = \
      os.path.join(
        remote_project_path,
        project_igf_id,
        chart_data_json)
    self._check_and_copy_remote_file(
      remote_user=remote_user,
      remote_host=remote_host,
      source_file=chart_json_output_file,
      remote_file=remote_chart_file_path)                         # copy chart json to remote
    remote_csv_file_path = \
      os.path.join(
        remote_project_path,
        project_igf_id,
        chart_data_csv)
    self._check_and_copy_remote_file(
      remote_user=remote_user,
      remote_host=remote_host,
      source_file=csv_output_file,
      remote_file=remote_csv_file_path)                           # copy chart csv to remote
    self.param('dataflow_params',
               {'remote_file_path': remote_file_path})
  except Exception as e:
    # report failure to hive log and slack, then propagate
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.\
      format(
        self.__class__.__name__,
        e,
        project_igf_id,
        sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(
      message, reaction='fail')                                   # post msg to slack for failed jobs
    raise
def run_plotCoverage(bam_files, output_raw_counts, plotcov_stdout,
                     output_plot=None, blacklist_file=None, thread=1,
                     params_list=None, dry_run=False, use_ephemeral_space=0):
  '''
  A function for running Deeptools plotCoverage

  Builds the plotCoverage argument list, runs the tool with stdout captured
  to a log file, copies raw counts / log / optional plot from a temp dir to
  the requested paths, and returns the full command list.

  :param bam_files: A list of indexed bam files
  :param output_raw_counts: Output raw count filepath
  :param plotcov_stdout: Output path of plotCoverage stdout logs
  :param output_plot: Output plots filepath, default None
  :param blacklist_file: Input blacklist region filepath, default None
  :param thread: Number of threads to use, default 1
  :param params_list: Additional deeptools plotCoverage params as list, default None
  :param dry_run: Return Deeptools command list without running it
  :param use_ephemeral_space: A toggle for tmp dir settings, default 0
  :raises ValueError: If bam_files is empty
  :returns: Deeptools command list
  '''
  if len(bam_files) == 0:
    raise ValueError(
      'No bamfiles found to generate coverage plot data')
  plotcov_args = ['--bamfiles']                                   # prepare to add input bams to args
  for path in bam_files:
    check_file_path(path)                                         # check input bams
    plotcov_args.append(quote(path))                              # adding input bams
  temp_dir = \
    get_temp_dir(
      use_ephemeral_space=use_ephemeral_space)
  temp_output_raw_counts = \
    os.path.join(
      temp_dir,
      os.path.basename(output_raw_counts))                        # path for temp raw counts
  temp_plotcov_stdout = \
    os.path.join(
      temp_dir,
      os.path.basename(plotcov_stdout))                           # path for temp stdout log (comment fixed: was mislabelled raw counts)
  plotcov_args.extend([
    "--numberOfProcessors", quote(str(thread)),
    "--outRawCounts", temp_output_raw_counts])
  if output_plot is not None:
    temp_output_plot = \
      os.path.join(
        temp_dir,
        os.path.basename(output_plot))                            # path for temp plot file
    plotcov_args.extend(["--plotFile", temp_output_plot])
  if blacklist_file is not None:
    check_file_path(blacklist_file)
    plotcov_args.extend(["--blackListFileName", quote(blacklist_file)])
  # FIX: original guard `params_list is not None or params_list != ''` was a
  # tautology (`or` instead of `and`); the effective check was always the
  # isinstance + length test, so only that is kept
  if isinstance(params_list, list) and len(params_list) > 0:
    plotcov_args.extend(
      quote(param) for param in params_list)                      # add additional params to the list
  if dry_run:
    return plotcov_args
  from deeptools.plotCoverage import main as plotCoverage_main
  f = io.StringIO()
  with redirect_stdout(f):                                        # capture the tool's stdout for the log file
    plotCoverage_main(plotcov_args)
  stdout_logs = f.getvalue()
  with open(temp_plotcov_stdout, 'w') as fp:
    fp.write(stdout_logs)
  copy_local_file(source_path=temp_plotcov_stdout,
                  destinationa_path=plotcov_stdout)
  copy_local_file(source_path=temp_output_raw_counts,
                  destinationa_path=output_raw_counts)
  if output_plot is not None:
    copy_local_file(source_path=temp_output_plot,
                    destinationa_path=output_plot)
  remove_dir(temp_dir)                                            # clean up temp dir
  plotcov_args.insert(0, 'plotCoverage')                          # fix for deeptools commandline
  return plotcov_args
def run_bamCoverage(bam_files, output_file, blacklist_file=None, thread=1,
                    dry_run=False, params_list=("--outFileFormat", "bigwig"),
                    use_ephemeral_space=0):
  '''
  A function for running Deeptools bamCoverage

  Builds the bamCoverage argument list, runs the tool with its output in a
  temp dir, copies the result to output_file, and returns the command list.

  :param bam_files: A list of bam files to run tool, expecting only one file
  :param output_file: Output filepath for the coverage plot
  :param blacklist_file: Input blacklist region filepath, default None
  :param thread: Number of threads to use, default 1
  :param dry_run: Return Deeptools command list without running it
  :param params_list: Additional deeptools bamCoverage params as list,
                      default ("--outFileFormat","bigwig")
  :param use_ephemeral_space: A toggle for temp dir settings, default 0
  :raises ValueError: If bam_files is empty or has more than one entry
  :returns: Deeptools command as list
  '''
  if len(bam_files) == 0:
    raise ValueError('No bamfiles found to generate coverage data')
  if len(bam_files) > 1:
    raise ValueError(
      'Expecting only one bam for bamCoverage tools, found : {0}'.\
      format(len(bam_files)))
  bamcov_args = ['--bam']                                         # prepare to add input bams to args
  for path in bam_files:
    check_file_path(path)                                         # check input bams
    bamcov_args.append(quote(path))                               # adding input bams
  temp_dir = \
    get_temp_dir(use_ephemeral_space=use_ephemeral_space)
  temp_output = \
    os.path.join(
      temp_dir,
      os.path.basename(output_file))
  bamcov_args.extend([
    "--numberOfProcessors", quote(str(thread)),
    "--outFileName", temp_output])
  if blacklist_file is not None:
    check_file_path(blacklist_file)
    bamcov_args.extend(["--blackListFileName", quote(blacklist_file)])
  # FIX: original guard `params_list is not None or params_list != ''` was a
  # tautology (`or` instead of `and`) and the inner `len(params_list) > 0`
  # re-check was redundant; the effective condition is kept below
  if isinstance(params_list, (list, tuple)) and len(params_list) > 0:
    bamcov_args.extend(
      quote(param) for param in params_list)                      # add additional params to the list
  if dry_run:
    return bamcov_args
  from deeptools.bamCoverage import main as bamCoverage_main
  bamCoverage_main(bamcov_args)                                   # generate bam coverage file
  copy_local_file(source_path=temp_output,
                  destinationa_path=output_file)                  # copy output file
  remove_dir(temp_dir)                                            # clean up temp dir
  bamcov_args.insert(0, 'bamCoverage')                            # fix for deeptools commandline
  return bamcov_args
def run(self):
  '''
  Render access-control and report pages for a project and copy them to the
  remote web server.

  Renders, from jinja2 templates in template_dir: an htaccess file (with the
  project's user list and hpc/non-hpc flag), an htpasswd file, and the
  project / status / analysis / analysis-viewer html pages, all into a temp
  dir, then copies each to <remote_project_path>/<project_name> via
  self._check_and_copy_remote_file. On success sets 'dataflow_params' and
  removes the temp dir; on any error warns, posts to slack and re-raises.
  '''
  try:
    # pipeline parameters (required first, then optional with defaults)
    seqrun_igf_id = self.param_required('seqrun_igf_id')
    project_name = self.param_required('project_name')
    remote_project_path = self.param_required('remote_project_path')
    remote_user = self.param_required('remote_user')
    remote_host = self.param_required('remote_host')
    template_dir = self.param_required('template_dir')
    igf_session_class = self.param_required('igf_session_class')
    htaccess_template_path = self.param('htaccess_template_path')
    htaccess_template = self.param('htaccess_template')
    htpasswd_template = self.param('htpasswd_template')
    htaccess_filename = self.param('htaccess_filename')
    htpasswd_filename = self.param('htpasswd_filename')
    project_template = self.param('project_template')
    status_template = self.param('status_template')
    analysis_template = self.param('analysis_template')
    analysis_viewer_template = self.param('analysis_viewer_template')
    seqruninfofile = self.param('seqruninfofile')
    samplereadcountfile = self.param('samplereadcountfile')
    samplereadcountcsvfile = self.param('samplereadcountcsvfile')
    status_data_json = self.param('status_data_json')
    analysis_data_json = self.param('analysis_data_json')
    analysis_data_csv = self.param('analysis_data_csv')
    analysis_chart_data_csv = self.param('analysis_chart_data_csv')
    analysis_chart_data_json = self.param('analysis_chart_data_json')
    analysis_view_js = self.param('analysis_view_js')
    image_height = self.param('image_height')
    sample_count_threshold = self.param('sample_count_threshold')
    use_ephemeral_space = self.param('use_ephemeral_space')
    # resolve template names to paths under template_dir
    htaccess_template_path = \
      os.path.join(
        template_dir,
        htaccess_template_path)                                   # set path for template dir
    project_template_path = \
      os.path.join(
        template_dir,
        project_template)                                         # set path for project template
    status_template_path = \
      os.path.join(
        template_dir,
        status_template)                                          # set path for project status template
    analysis_template_path = \
      os.path.join(
        template_dir,
        analysis_template)                                        # set path for project analysis template
    analysis_viewer_template = \
      os.path.join(
        template_dir,
        analysis_viewer_template)                                 # set path for analysis viewer template
    # fetch user info and sample counts for the project from db
    pa = ProjectAdaptor(**{'session_class': igf_session_class})
    pa.start_session()
    user_info = \
      pa.get_project_user_info(project_igf_id=project_name)       # fetch user info from db
    sample_counts = \
      pa.count_project_samples(
        project_igf_id=project_name,
        only_active=True)                                         # get sample counts for the project
    pa.close_session()
    image_height = \
      self._calculate_image_height(
        sample_count=sample_counts,
        height=image_height,
        threshold=sample_count_threshold)                         # change image height based on sample count
    user_info = user_info.to_dict(
      orient='records')                                           # convert dataframe to list of dictionaries
    if len(user_info) == 0:
      raise ValueError('No user found for project {0}'.\
                       format(project_name))
    user_list = list()
    user_passwd_dict = dict()
    hpc_user = True                                               # by default, load hpc user settings
    for user in user_info:
      username = user['username']                                 # get username for irods
      user_list.append(username)
      if 'ht_password' in user.keys():
        ht_passwd = user['ht_password']                           # get htaccess passwd
        user_passwd_dict.update({username: ht_passwd})
      if 'category' in user.keys() and \
         'data_authority' in user.keys() and \
         user['category'] == 'NON_HPC_USER' and \
         user['data_authority'] == 'T':
        hpc_user = False                                          # switch to non-hpc settings if primary user is non-hpc
    temp_work_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)       # get a temp dir
    # render htaccess and htpasswd files
    template_env = \
      Environment(
        loader=FileSystemLoader(
          searchpath=htaccess_template_path),
        autoescape=select_autoescape(['html', 'xml']))            # set template env
    htaccess = template_env.get_template(
      htaccess_template)                                          # read htaccess template
    htpasswd = template_env.get_template(
      htpasswd_template)                                          # read htpass template
    htaccess_output = \
      os.path.join(
        temp_work_dir,
        htaccess_filename)
    htpasswd_output = \
      os.path.join(
        temp_work_dir,
        htpasswd_filename)
    htaccess.\
      stream(
        remote_project_dir=remote_project_path,
        project_tag=project_name,
        hpcUser=hpc_user,
        htpasswd_filename=htpasswd_filename,
        customerUsernameList=' '.join(user_list)).\
      dump(htaccess_output)                                       # write new htaccess file
    htpasswd.\
      stream(userDict=user_passwd_dict).\
      dump(htpasswd_output)                                       # write new htpass file
    # render the project index page
    template_prj = \
      Environment(
        loader=FileSystemLoader(
          searchpath=os.path.dirname(project_template_path)),
        autoescape=select_autoescape(['txt', 'xml']))             # set template env for project
    project_index = \
      template_prj.\
      get_template(os.path.basename(project_template_path))       # read project template
    project_output = \
      os.path.join(
        temp_work_dir,
        os.path.basename(project_template_path))
    project_index.\
      stream(
        ProjectName=project_name,
        seqrunInfoFile=seqruninfofile,
        sampleReadCountFile=samplereadcountfile,
        sampleReadCountCsvFile=samplereadcountcsvfile,
        ImageHeight=image_height).\
      dump(project_output)                                        # write new project file
    # render the project status page
    template_status = \
      Environment(
        loader=FileSystemLoader(
          searchpath=os.path.dirname(status_template_path)),
        autoescape=select_autoescape(['txt', 'xml']))             # set template env for status
    project_status = \
      template_status.\
      get_template(os.path.basename(status_template_path))        # read status page template
    status_output = \
      os.path.join(
        temp_work_dir,
        os.path.basename(status_template_path))
    project_status.\
      stream(
        ProjectName=project_name,
        status_data_json=status_data_json).\
      dump(status_output)                                         # write new project status file
    # render the project analysis page
    template_analysis = \
      Environment(
        loader=FileSystemLoader(
          searchpath=os.path.dirname(analysis_template_path)),
        autoescape=select_autoescape(['txt', 'xml']))             # set template env for analysis
    project_analysis = \
      template_analysis.\
      get_template(os.path.basename(analysis_template_path))      # read analysis page template
    analysis_output = \
      os.path.join(
        temp_work_dir,
        os.path.basename(analysis_template_path))
    project_analysis.\
      stream(
        ProjectName=project_name,
        analysisInfoFile=analysis_data_json,
        analysisInfoCsvFile=analysis_data_csv,
        analysisCsvDataFile=analysis_chart_data_csv,
        analysisPlotFile=analysis_chart_data_json).\
      dump(analysis_output)                                       # write new project analysis file
    # render the analysis viewer page
    template_analysis_viewer = \
      Environment(
        loader=FileSystemLoader(
          searchpath=os.path.dirname(analysis_viewer_template)),
        autoescape=select_autoescape(['txt', 'xml']))             # set template env for analysis viewer
    project_analysis_viewer = \
      template_analysis_viewer.\
      get_template(os.path.basename(analysis_viewer_template))    # read analysis viewer page template
    analysis_viewer_output = \
      os.path.join(
        temp_work_dir,
        os.path.basename(analysis_viewer_template))
    project_analysis_viewer.\
      stream(
        ProjectName=project_name,
        analysisJsFile=analysis_view_js).\
      dump(analysis_viewer_output)                                # write new project analysis viewer file
    # copy all rendered files to the remote project directory
    remote_project_dir = \
      os.path.join(
        remote_project_path,
        project_name)                                             # get remote project dir path
    remote_htaccess_file = \
      os.path.join(
        remote_project_dir,
        htaccess_filename)                                        # remote htaccess filepath
    self._check_and_copy_remote_file(
      remote_user=remote_user,
      remote_host=remote_host,
      source_file=htaccess_output,
      remote_file=remote_htaccess_file)                           # copy htaccess file to remote dir
    remote_htpasswd_file = \
      os.path.join(
        remote_project_dir,
        htpasswd_filename)                                        # remote htpasswd filepath
    self._check_and_copy_remote_file(
      remote_user=remote_user,
      remote_host=remote_host,
      source_file=htpasswd_output,
      remote_file=remote_htpasswd_file)                           # copy htpasswd file to remote dir
    remote_project_output_file = \
      os.path.join(
        remote_project_dir,
        os.path.basename(project_output))                         # remote project output filepath
    self._check_and_copy_remote_file(
      remote_user=remote_user,
      remote_host=remote_host,
      source_file=project_output,
      remote_file=remote_project_output_file)                     # copy project output file to remote dir
    remote_status_output_file = \
      os.path.join(
        remote_project_dir,
        os.path.basename(status_output))                          # remote project status output filepath
    self._check_and_copy_remote_file(
      remote_user=remote_user,
      remote_host=remote_host,
      source_file=status_output,
      remote_file=remote_status_output_file)                      # copy project status output file to remote dir
    remote_analysis_output_file = \
      os.path.join(
        remote_project_dir,
        os.path.basename(analysis_output))                        # remote project analysis output filepath
    self._check_and_copy_remote_file(
      remote_user=remote_user,
      remote_host=remote_host,
      source_file=analysis_output,
      remote_file=remote_analysis_output_file)                    # copy project analysis output file to remote dir
    remote_analysis_viewer_output_file = \
      os.path.join(
        remote_project_dir,
        os.path.basename(analysis_viewer_output))                 # remote project analysis viewer output filepath
    self._check_and_copy_remote_file(
      remote_user=remote_user,
      remote_host=remote_host,
      source_file=analysis_viewer_output,
      remote_file=remote_analysis_viewer_output_file)             # copy project analysis viewer output file to remote dir
    self.param('dataflow_params', {'remote_dir_status': 'done'})
    remove_dir(temp_work_dir)
  except Exception as e:
    # report failure to hive log and slack, then propagate
    message = \
      'seqrun: {2}, Error in {0}: {1}'.\
      format(
        self.__class__.__name__,
        e,
        seqrun_igf_id)
    self.warning(message)
    self.post_message_to_slack(
      message, reaction='fail')                                   # post msg to slack for failed jobs
    raise