def test_load_file_to_disk_and_db7(self):
  au = Analysis_collection_utils(
    dbsession_class=self.session_class,
    analysis_name='AnalysisA',
    tag_name='TagA',
    collection_name='RunA',
    collection_type='AnalysisA_Files',
    collection_table='run',
    base_path=self.temp_base_dir)
  input_file_list = [
    os.path.join(self.temp_work_dir, file_name)
      for file_name in self.input_list]
  output_list = au.load_file_to_disk_and_db(
    input_file_list=input_file_list,
    withdraw_exisitng_collection=False)                                        # load all files to the same collection
  base = BaseAdaptor(**{'session_class': self.session_class})
  base.start_session()
  ca = CollectionAdaptor(**{'session': base.session})
  ca_files = ca.get_collection_files(
    collection_name='RunA',
    collection_type='AnalysisA_Files',
    output_mode='dataframe')
  file_list = list(ca_files['file_path'].to_dict().values())
  datestamp = get_datestamp_label()
  test_file = os.path.join(
    self.temp_base_dir,
    'ProjectA',
    'SampleA',
    'ExperimentA',
    'RunA',
    'AnalysisA',
    '{0}_{1}_{2}_{3}.{4}'.format('RunA', 'AnalysisA', 'TagA', datestamp, 'cram'))
  test_file = preprocess_path_name(input_path=test_file)
  self.assertTrue(test_file in file_list)
  self.assertTrue(test_file in output_list)
  base.close_session()
def get_datestamp(self):
  '''
  A method for fetching datestamp

  :returns: A padded string of format YYYYMMDD
  '''
  try:
    datestamp = get_datestamp_label()
    return datestamp
  except:
    raise
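# A minimal sketch (not the library implementation) of the behaviour that
# get_datestamp_label() is relied on for throughout this module: it accepts
# None, a date string, or a datetime, and returns a zero-padded 'YYYYMMDD'
# label, as asserted in test_get_datestamp_label further below.
from datetime import datetime
from dateutil.parser import parse

def _datestamp_label_sketch(value=None):
  if value is None:
    value = datetime.now()           # default to the current date
  elif isinstance(value, str):
    value = parse(value)             # e.g. '2018-08-23 15:15:01'
  return value.strftime('%Y%m%d')    # e.g. '20180823'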
def run(self):
  '''
  A method for fetching run-level analysis files from the accu table
  and passing them to dataflow, either as a list or as a file
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    run_igf_id = self.param_required('run_igf_id')
    accu_data = self.param_required('accu_data')
    output_mode = self.param_required('output_mode')
    base_work_dir = self.param_required('base_work_dir')
    seed_date_stamp = self.param_required('date_stamp')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    run_analysis_files = accu_data.get(run_igf_id).get(seed_date_stamp)
    if run_analysis_files is None:
      raise ValueError(
        'No data found in accu table for run {0} and date_stamp {1}'.format(
          run_igf_id, seed_date_stamp))                                        # incorrect data structure
    if isinstance(run_analysis_files, list) and \
       len(run_analysis_files) == 0:
      raise ValueError(
        'No run level file found in accu data for run {0} and date_stamp {1}'.format(
          run_igf_id, seed_date_stamp))                                        # zero input files
    if output_mode == 'list':
      self.param(
        'dataflow_params',
        {'run_chunk_list': run_analysis_files})
    elif output_mode == 'file':
      work_dir_prefix = os.path.join(
        base_work_dir,
        project_igf_id,
        sample_igf_id,
        experiment_igf_id,
        run_igf_id)
      work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)               # get a run work dir
      output_file = os.path.join(work_dir, 'run_level_chunk.txt')
      with open(output_file, 'w') as fp:
        fp.write('\n'.join(run_analysis_files))
      self.param(
        'dataflow_params',
        {'run_chunk_list_file': output_file})
    else:
      raise ValueError('Output mode {0} not supported'.format(output_mode))
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                       # post msg to slack for failed jobs
    raise
def test_load_file_to_disk_and_db8(self):
  au = Analysis_collection_utils(
    dbsession_class=self.session_class,
    analysis_name='AnalysisA',
    tag_name='TagA',
    collection_name='RunA',
    collection_type='AnalysisA_Files',
    collection_table='run')
  input_file = os.path.join(self.temp_work_dir, 'a.cram')
  input_file = preprocess_path_name(input_path=input_file)
  new_file_name = au.get_new_file_name(input_file=input_file)
  datestamp = get_datestamp_label()
  test_file_name = '{0}_{1}_{2}_{3}.{4}'.format(
    'RunA', 'AnalysisA', 'TagA', datestamp, 'cram')
  self.assertEqual(new_file_name, test_file_name)
def run(self):
  '''
  A method for converting a Scanpy h5ad file to a Cell Browser html dir
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    base_work_dir = self.param_required('base_work_dir')
    cbImportScanpy_path = self.param_required('cbImportScanpy_path')
    scanpy_h5ad_path = self.param_required('scanpy_h5ad_path')
    cellbrowser_dir_prefix = self.param('cellbrowser_dir_prefix')
    use_ephemeral_space = self.param('use_ephemeral_space')
    work_dir_prefix = os.path.join(
      base_work_dir,
      project_igf_id,
      sample_igf_id,
      experiment_igf_id)
    work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)                 # get a run work dir
    datestamp = get_datestamp_label()
    cellbrowser_dir = os.path.join(
      work_dir,
      '{0}_{1}'.format(cellbrowser_dir_prefix, datestamp))
    convert_scanpy_h5ad_to_cellbrowser_dir(
      cbImportScanpy_path=cbImportScanpy_path,
      h5ad_path=scanpy_h5ad_path,
      project_name=experiment_igf_id,
      use_ephemeral_space=use_ephemeral_space,
      cellbrowser_htmldir=cellbrowser_dir)
    self.param('dataflow_params', {'cellbrowser_dir': cellbrowser_dir})
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                       # post msg to slack for failed jobs
    raise
def get_new_file_name(self, input_file, file_suffix=None):
  '''
  A method for fetching the new file name

  :param input_file: An input filepath
  :param file_suffix: A file suffix, default None for detecting it from the input file
  :returns: A new filename string
  '''
  try:
    new_filename = self.collection_name                                        # use collection name to rename file
    if new_filename == '':
      raise ValueError(
        'New filename not found for input file {0}'.format(input_file))
    new_filename = '{0}_{1}'.format(new_filename, self.analysis_name)
    if self.tag_name is not None:
      new_filename = '{0}_{1}'.format(new_filename, self.tag_name)             # add tag name to filename
    if self.add_datestamp:
      datestamp = get_datestamp_label()                                        # collect datestamp
      new_filename = '{0}_{1}'.format(new_filename, datestamp)                 # add datestamp to filename
    if file_suffix is None:
      file_suffix = get_file_extension(input_file=input_file)                  # collect file suffix
    if file_suffix == '':
      raise ValueError(
        'Missing file extension for new file name of {0}'.format(input_file))  # raise error if no file suffix found
    new_filename = '{0}.{1}'.format(new_filename, file_suffix)                 # add file suffix to the new name
    return new_filename
  except:
    raise
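# Usage sketch for get_new_file_name, mirroring test_load_file_to_disk_and_db8
# above (the constructor arguments are the test fixtures, not production values):
#
#   au = Analysis_collection_utils(
#          dbsession_class=session_class,
#          analysis_name='AnalysisA',
#          tag_name='TagA',
#          collection_name='RunA',
#          collection_type='AnalysisA_Files',
#          collection_table='run')
#   au.get_new_file_name(input_file='/tmp/a.cram')
#   # -> 'RunA_AnalysisA_TagA_20180823.cram' (datestamp varies; only added
#   #    when add_datestamp is set)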
def run(self):
  '''
  A method for running Scanpy analysis and report generation

  :param project_igf_id: A project igf id
  :param sample_igf_id: A sample igf id
  :param experiment_igf_id: An experiment igf id
  :param igf_session_class: A database session class
  :param species_name: species_name
  :param base_result_dir: Base results directory
  :param report_template_file: A template file for writing scanpy report
  :param analysis_name: Analysis name, default scanpy
  :param species_name_lookup: A dictionary for ensembl species name lookup
  :param cellranger_collection_type: Cellranger analysis collection type, default CELLRANGER_RESULTS
  :param scanpy_collection_type: Scanpy report collection type, default SCANPY_RESULTS
  :param collection_table: Collection table name for loading scanpy report, default experiment
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    species_name = self.param_required('species_name')
    report_template_file = self.param_required('report_template_file')
    analysis_name = self.param_required('analysis_name')
    base_result_dir = self.param_required('base_result_dir')
    base_work_dir = self.param_required('base_work_dir')
    species_name_lookup = self.param('species_name_lookup')
    cellranger_collection_type = self.param('cellranger_collection_type')
    scanpy_collection_type = self.param('scanpy_collection_type')
    collection_table = self.param('collection_table')
    cellbrowser_dir_prefix = self.param('cellbrowser_dir_prefix')
    use_ephemeral_space = self.param('use_ephemeral_space')
    cellranger_tarfile = ''
    output_report = ''
    work_dir_prefix = os.path.join(
      base_work_dir,
      project_igf_id,
      sample_igf_id,
      experiment_igf_id)
    work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)                 # get a run work dir
    if species_name in species_name_lookup.keys():                             # check for human or mice
      ensembl_species_name = species_name_lookup[species_name]                 # get ensembl species name
      # fetch cellranger tar path from db
      if cellranger_tarfile == '':
        ca = CollectionAdaptor(**{'session_class': igf_session_class})
        ca.start_session()                                                     # connect to database
        cellranger_tarfiles = ca.get_collection_files(
          collection_name=experiment_igf_id,
          collection_type=cellranger_collection_type,
          output_mode='dataframe')                                             # fetch collection files
        ca.close_session()
        if len(cellranger_tarfiles.index) == 0:
          raise ValueError(
            'No cellranger analysis output found for exp {0}'.format(
              experiment_igf_id))
        cellranger_tarfile = cellranger_tarfiles['file_path'].values[0]        # select first file as analysis file
      # extract filtered metrics files from tar
      output_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space)       # get a temp dir
      datestamp = get_datestamp_label()
      cellbrowser_dir = os.path.join(
        work_dir,
        '{0}_{1}'.format(cellbrowser_dir_prefix, datestamp))
      cellbrowser_h5ad = os.path.join(cellbrowser_dir, 'scanpy.h5ad')
      output_report = os.path.join(output_dir, 'report.html')                  # get temp report path
      matrix_file, gene_file, barcode_file = \
        self._extract_cellranger_filtered_metrics(
          tar_file=cellranger_tarfile,
          output_dir=output_dir)                                               # get cellranger output files
      sp = Scanpy_tool(
        project_name=project_igf_id,
        sample_name=sample_igf_id,
        matrix_file=matrix_file,
        features_tsv=gene_file,
        barcode_tsv=barcode_file,
        html_template_file=report_template_file,
        species_name=ensembl_species_name,
        output_file=output_report,
        use_ephemeral_space=use_ephemeral_space,
        cellbrowser_h5ad=cellbrowser_h5ad)
      sp.generate_report()                                                     # generate scanpy report
      # load files to db and disk
      au = Analysis_collection_utils(
        dbsession_class=igf_session_class,
        analysis_name=analysis_name,
        tag_name=species_name,
        collection_name=experiment_igf_id,
        collection_type=scanpy_collection_type,
        collection_table=collection_table,
        base_path=base_result_dir)                                             # initiate loading of report file
      output_file_list = au.load_file_to_disk_and_db(
        input_file_list=[output_report],
        withdraw_exisitng_collection=True)                                     # load file to db and disk
      output_report = output_file_list[0]
      self.param(
        'dataflow_params',
        {'output_report': output_report,
         'scanpy_h5ad_path': cellbrowser_h5ad})                                # pass on output report filepath
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                       # post msg to slack for failed jobs
    raise
def run(self):
  '''
  A method for running picard commands

  :param project_igf_id: A project igf id
  :param sample_igf_id: A sample igf id
  :param experiment_igf_id: An experiment igf id
  :param igf_session_class: A database session class
  :param reference_type: Reference genome collection type, default GENOME_FASTA
  :param reference_refFlat: Reference refFlat collection type, default GENE_REFFLAT
  :param ribosomal_interval_type: Collection type for ribosomal interval list, default RIBOSOMAL_INTERVAL
  :param species_name: species_name
  :param java_exe: Java path
  :param java_param: Java run parameters
  :param picard_jar: Picard jar path
  :param picard_command: Picard command
  :param base_work_dir: Base work directory
  :param copy_input: A toggle for copying input file to temp, 1 for True default 0 for False
  :param use_ephemeral_space: A toggle for temp dir setting, default 0
  :param patterned_flowcell_list: A list of patterned flowcells, default ['HISEQ4000','NEXTSEQ']
  '''
  try:
    temp_output_dir = False
    project_igf_id = self.param_required('project_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    java_exe = self.param_required('java_exe')
    java_param = self.param_required('java_param')
    picard_jar = self.param_required('picard_jar')
    input_files = self.param_required('input_files')
    picard_command = self.param_required('picard_command')
    igf_session_class = self.param_required('igf_session_class')
    species_name = self.param('species_name')
    reference_type = self.param('reference_type')
    reference_refFlat = self.param('reference_refFlat')
    ribosomal_interval_type = self.param('ribosomal_interval_type')
    base_work_dir = self.param_required('base_work_dir')
    analysis_files = self.param_required('analysis_files')
    picard_option = self.param('picard_option')
    patterned_flowcell_list = self.param('patterned_flowcell_list')
    platform_name = self.param_required('platform_name')
    output_prefix = self.param('output_prefix')
    load_metrics_to_cram = self.param('load_metrics_to_cram')
    cram_collection_type = self.param('cram_collection_type')
    seed_date_stamp = self.param_required('date_stamp')
    use_ephemeral_space = self.param('use_ephemeral_space')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    if output_prefix is not None:
      output_prefix = '{0}_{1}'.format(
        output_prefix, seed_date_stamp)                                        # add seed datestamp to output prefix
    work_dir_prefix = os.path.join(
      base_work_dir,
      project_igf_id,
      sample_igf_id,
      experiment_igf_id)
    work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)                 # get a run work dir
    temp_output_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space)    # get temp work dir
    ref_genome = Reference_genome_utils(
      genome_tag=species_name,
      dbsession_class=igf_session_class,
      genome_fasta_type=reference_type,
      gene_reflat_type=reference_refFlat,
      ribosomal_interval_type=ribosomal_interval_type)                         # set up ref genome utils
    genome_fasta = ref_genome.get_genome_fasta()                               # get genome fasta
    ref_flat_file = ref_genome.get_gene_reflat()                               # get refFlat file
    ribosomal_interval_file = ref_genome.get_ribosomal_interval()              # get ribosomal interval file
    patterned_flowcell = False
    if platform_name in patterned_flowcell_list:                               # check for patterned flowcell
      patterned_flowcell = True
    if load_metrics_to_cram and \
       not cram_collection_type:
      raise ValueError(
        'Cram file collection type is required for loading picard metrics to db')
    picard = Picard_tools(
      java_exe=java_exe,
      java_param=java_param,
      picard_jar=picard_jar,
      input_files=input_files,
      output_dir=temp_output_dir,
      ref_fasta=genome_fasta,
      patterned_flowcell=patterned_flowcell,
      ref_flat_file=ref_flat_file,
      picard_option=picard_option,
      output_prefix=output_prefix,
      use_ephemeral_space=use_ephemeral_space,
      ribisomal_interval=ribosomal_interval_file)                              # set up picard tool
    temp_output_files, picard_command_line, picard_metrics = \
      picard.run_picard_command(command_name=picard_command)                   # run picard command
    output_file_list = list()
    for source_path in temp_output_files:
      dest_path = os.path.join(
        work_dir,
        os.path.basename(source_path))                                         # get destination filepath
      move_file(
        source_path=source_path,
        destinationa_path=dest_path,
        force=True)                                                            # move files to work dir
      output_file_list.append(dest_path)
    remove_dir(temp_output_dir)
    analysis_files.extend(output_file_list)
    bam_files = list()
    for file in output_file_list:
      if file.endswith('.bam'):
        bam_files.append(file)
    if load_metrics_to_cram and \
       len(picard_metrics) > 0:
      ca = CollectionAdaptor(**{'session_class': igf_session_class})
      attribute_data = ca.prepare_data_for_collection_attribute(
        collection_name=experiment_igf_id,
        collection_type=cram_collection_type,
        data_list=picard_metrics)                                              # format data for collection attribute table
      ca.start_session()
      try:
        ca.create_or_update_collection_attributes(
          data=attribute_data,
          autosave=False)                                                      # load data to collection attribute table
        ca.commit_session()
        ca.close_session()
      except:
        ca.rollback_session()
        ca.close_session()
        raise
    self.param(
      'dataflow_params',
      {'analysis_files': analysis_files,
       'bam_files': bam_files,
       'seed_date_stamp': seed_date_stamp})                                    # pass on picard output list
    message = \
      'finished picard {0} for {1} {2}'.format(
        picard_command, project_igf_id, sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')                       # send log to slack
    message = \
      'Picard {0} command: {1}'.format(
        picard_command, picard_command_line)
    #self.comment_asana_task(task_name=project_igf_id, comment=message)        # send commandline to Asana
  except Exception as e:
    if temp_output_dir and \
       os.path.exists(temp_output_dir):
      remove_dir(temp_output_dir)
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                       # post msg to slack for failed jobs
    raise
def run(self):
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    analysis_files = self.param_required('analysis_files')
    multiqc_exe = self.param('multiqc_exe')
    multiqc_options = self.param('multiqc_options')
    multiqc_dir_label = self.param('multiqc_dir_label')
    force_overwrite = self.param('force_overwrite')
    base_results_dir = self.param_required('base_results_dir')
    tag = self.param_required('tag_name')
    analysis_name = self.param_required('analysis_name')
    collection_name = self.param_required('collection_name')
    collection_type = self.param_required('collection_type')
    collection_table = self.param_required('collection_table')
    igf_session_class = self.param_required('igf_session_class')
    multiqc_template_file = self.param_required('multiqc_template_file')
    platform_name = self.param('platform_name')
    tool_order_list = self.param('tool_order_list')
    use_ephemeral_space = self.param('use_ephemeral_space')
    if not isinstance(analysis_files, list) or \
       len(analysis_files) == 0:
      raise ValueError('Failed to run MultiQC for zero analysis list')         # check analysis files
    temp_work_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space)      # get temp work dir
    multiqc_input_file = os.path.join(temp_work_dir, 'multiqc.txt')            # get temp multiqc list
    with open(multiqc_input_file, 'w') as fp:
      for file in analysis_files:
        if not os.path.exists(file):
          raise IOError(
            'File {0} not found for multiQC run'.format(file))                 # check filepath
        fp.write('{}\n'.format(file))                                          # write filepath to temp file
    date_stamp = datetime.now().strftime('%d-%b-%Y %H:%M:%S')
    check_file_path(multiqc_template_file)
    multiqc_conf_file = os.path.join(
      temp_work_dir,
      os.path.basename(multiqc_template_file))
    template_env = Environment(
      loader=FileSystemLoader(
        searchpath=os.path.dirname(multiqc_template_file)),
      autoescape=select_autoescape(['html', 'xml']))
    multiqc_conf = template_env.get_template(
      os.path.basename(multiqc_template_file))
    multiqc_conf.stream(
      project_igf_id=project_igf_id,
      sample_igf_id=sample_igf_id,
      platform_name=platform_name,
      tag_name=tag,
      date_stamp=date_stamp,
      tool_order_list=tool_order_list).dump(multiqc_conf_file)
    multiqc_report_title = 'Project:{0}'.format(project_igf_id)                # base multiqc label
    if sample_igf_id is not None:
      multiqc_report_title = '{0},Sample:{1}'.format(
        multiqc_report_title, sample_igf_id)                                   # add sample, if present
    multiqc_report_title = '{0};tag:{1};date:{2}'.format(
      multiqc_report_title, tag, get_datestamp_label())                        # add tag and date stamp
    multiqc_param = self.format_tool_options(multiqc_options)                  # format multiqc params
    multiqc_cmd = [
      multiqc_exe,
      '--file-list', quote(multiqc_input_file),
      '--outdir', quote(temp_work_dir),
      '--title', quote(multiqc_report_title),
      '-c', quote(multiqc_conf_file)]                                          # multiqc base parameters
    multiqc_param = [quote(param) for param in multiqc_param]                  # wrap params in quotes
    multiqc_cmd.extend(multiqc_param)                                          # add additional parameters
    subprocess.check_call(' '.join(multiqc_cmd), shell=True)                   # run multiqc
    multiqc_html = None
    output_list = list()
    for root, _, files in os.walk(top=temp_work_dir):
      for file in files:
        if fnmatch.fnmatch(file, '*.html'):
          multiqc_html = os.path.join(root, file)                              # get multiqc html path
          au = Analysis_collection_utils(
            dbsession_class=igf_session_class,
            analysis_name=analysis_name,
            tag_name=tag,
            collection_name=collection_name,
            collection_type=collection_type,
            collection_table=collection_table,
            base_path=base_results_dir)
          output_list = au.load_file_to_disk_and_db(
            input_file_list=[multiqc_html],
            withdraw_exisitng_collection=force_overwrite,
            force=True,
            remove_file=True)                                                  # load file to db and disk
    self.param(
      'dataflow_params',
      {'multiqc_html': output_list[0]})                                        # add output files to dataflow
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                       # post msg to slack for failed jobs
    raise
def upload_analysis_results_and_create_collection(
      self, file_list, irods_user, project_name, analysis_name='default',
      dir_path_list=None, file_tag=None):
  '''
  A method for uploading analysis files to an irods server

  :param file_list: A list of file paths to upload to irods
  :param irods_user: Irods user name
  :param project_name: Name of the project
  :param analysis_name: A string for analysis name, default is 'default'
  :param dir_path_list: A list of directory structures for the irods server, default None for using datestamp
  :param file_tag: A text string for adding a tag to the collection, default None for only project_name
  '''
  try:
    irods_exe_dir = self.irods_exe_dir
    irods_base_dir = os.path.join(
      self.zone,
      'home',
      irods_user,
      project_name)
    if dir_path_list is not None and \
       isinstance(dir_path_list, list) and \
       len(dir_path_list) > 0:
      irods_base_dir = os.path.join(
        irods_base_dir,
        os.path.sep.join(dir_path_list))                                       # use path from dir list
    else:
      datestamp = get_datestamp_label()
      irods_base_dir = os.path.join(irods_base_dir, datestamp)                 # use datestamp
    if not isinstance(dir_path_list, list) or \
       analysis_name not in dir_path_list:
      irods_base_dir = os.path.join(
        irods_base_dir,
        analysis_name)                                                         # add analysis name to the irods dir
    chk_cmd = [os.path.join(irods_exe_dir, 'ils'), irods_base_dir]
    response = subprocess.call(chk_cmd)                                        # check for existing dir in irods
    if response != 0:                                                          # create dir if response is not 0
      make_dir_cmd = [
        os.path.join(irods_exe_dir, 'imkdir'),
        '-p',
        quote(irods_base_dir)]
      subprocess.check_call(make_dir_cmd)                                      # create destination dir
      chmod_cmd = [
        os.path.join(irods_exe_dir, 'ichmod'),
        '-M',
        'own',
        quote(self.igf_user),
        quote(irods_base_dir)]
      subprocess.check_call(chmod_cmd)                                         # change directory ownership
      inherit_cmd = [
        os.path.join(irods_exe_dir, 'ichmod'),
        '-r',
        'inherit',
        quote(irods_base_dir)]
      subprocess.check_call(inherit_cmd)                                       # inherit new directory
    for filepath in file_list:
      if not os.path.exists(filepath) or os.path.isdir(filepath):
        raise IOError(
          'filepath {0} not found or its not a file'.format(filepath))         # check filepath before upload
      irods_filepath = os.path.join(
        irods_base_dir,
        os.path.basename(filepath))
      file_chk_cmd = [os.path.join(irods_exe_dir, 'ils'), irods_filepath]
      file_response = subprocess.call(file_chk_cmd)                            # check for existing file in irods
      if file_response == 0:
        file_rm_cmd = [
          os.path.join(irods_exe_dir, 'irm'),
          '-rf',
          quote(irods_filepath)]
        subprocess.check_call(file_rm_cmd)                                     # remove existing file to prevent any clash
      iput_cmd = [
        os.path.join(irods_exe_dir, 'iput'),
        '-k',
        '-f',
        '-N', '1',
        '-R', quote(self.irods_resource),
        quote(filepath),
        quote(irods_base_dir)]
      subprocess.check_call(iput_cmd)                                          # upload file to irods dir, calculate md5sum and overwrite
      if file_tag is None:
        file_meta_info = project_name
      else:
        file_meta_info = '{0} - {1}'.format(project_name, file_tag)
      meta_project_user = [
        os.path.join(irods_exe_dir, 'imeta'),
        'add',
        '-d',
        quote(irods_filepath),
        quote(file_meta_info),
        quote(irods_user),
        quote('iRODSUserTagging:Star')]
      subprocess.check_call(meta_project_user)                                 # add more metadata to file
      meta_30d = [
        os.path.join(irods_exe_dir, 'isysmeta'),
        'mod',
        quote(irods_filepath),
        quote('+30d')]
      subprocess.call(meta_30d)                                                # add expiry metadata for file
      meta_file_retention = [
        os.path.join(irods_exe_dir, 'imeta'),
        'add',
        '-d',
        quote(irods_filepath),
        quote('retention'),
        quote('30'),
        quote('days')]
      subprocess.call(meta_file_retention)                                     # add file retention info
  except:
    raise
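# For reference, the irods collection path built above resolves to the layout
# below (names in the example are hypothetical; the zone, igf_user and
# irods_resource come from instance attributes):
#
#   /<zone>/home/<irods_user>/<project_name>/<dir_path_list...|YYYYMMDD>/<analysis_name>/<file>
#
# e.g. with irods_user='userA', project_name='ProjectA' and dir_path_list=None:
#   /igfZone/home/userA/ProjectA/20180823/default/report.html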
def run(self):
  '''
  A method for running STAR alignment
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    run_igf_id = self.param('run_igf_id')
    star_exe = self.param_required('star_exe')
    run_mode = self.param_required('run_mode')
    output_prefix = self.param_required('output_prefix')
    run_thread = self.param('run_thread')
    igf_session_class = self.param_required('igf_session_class')
    species_name = self.param('species_name')
    reference_type = self.param('reference_type')
    reference_gtf_type = self.param('reference_gtf_type')
    fasta_fai_reference_type = self.param('fasta_fai_reference_type')
    star_patameters = self.param('star_patameters')
    two_pass_mode = self.param('two_pass_mode')
    seed_date_stamp = self.param_required('date_stamp')
    use_ephemeral_space = self.param('use_ephemeral_space')
    base_work_dir = self.param_required('base_work_dir')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    work_dir_prefix = os.path.join(
      base_work_dir,
      project_igf_id,
      sample_igf_id,
      experiment_igf_id)
    if run_igf_id is not None:
      work_dir_prefix = os.path.join(
        work_dir_prefix,
        run_igf_id)
    work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)                 # get a run work dir
    ref_genome = Reference_genome_utils(
      genome_tag=species_name,
      dbsession_class=igf_session_class,
      gene_gtf_type=reference_gtf_type,
      fasta_fai_type=fasta_fai_reference_type,
      star_ref_type=reference_type)                                            # set up ref genome utils
    star_ref = ref_genome.get_transcriptome_star()                             # get star ref
    gene_gtf = ref_genome.get_gene_gtf()                                       # get gtf file
    genome_fai = ref_genome.get_genome_fasta_fai()                             # fetch genomic fasta fai index
    if run_mode == 'generate_aligned_bams':
      if run_igf_id is None:
        raise ValueError('No run igf id found')
      r1_read_file = self.param_required('r1_read_file')
      r2_read_file = self.param('r2_read_file')
      input_fastq_list = list()
      input_fastq_list.append(r1_read_file[0])                                 # get the first input
      if r2_read_file is not None and \
         len(r2_read_file) > 0:
        input_fastq_list.append(r2_read_file[0])                               # get the first input
      star_obj = Star_utils(
        star_exe=star_exe,
        input_files=input_fastq_list,
        genome_dir=star_ref,
        reference_gtf=gene_gtf,
        output_dir=work_dir,
        output_prefix=output_prefix,
        use_ephemeral_space=use_ephemeral_space,
        threads=run_thread)                                                    # set up star for run
      if two_pass_mode is None:
        two_pass_mode = True
      elif two_pass_mode == 0:
        two_pass_mode = False                                                  # reset star two-pass mode
      if isinstance(star_patameters, str):
        star_patameters = json.loads(star_patameters)                          # convert string param to dict
      genomic_bam, transcriptomic_bam, star_log_file, \
      star_gene_count_file, star_cmd = \
        star_obj.generate_aligned_bams(
          two_pass_mode=two_pass_mode,
          star_patameters=star_patameters)                                     # run star cmd
      self.param(
        'dataflow_params',
        {'star_genomic_bam': genomic_bam,
         'star_transcriptomic_bam': transcriptomic_bam,
         'star_log_file': star_log_file,
         'star_gene_count_file': star_gene_count_file,
         'seed_date_stamp': seed_date_stamp})
    elif run_mode == 'generate_rna_bigwig':
      input_bam = self.param_required('input_bam')
      bedGraphToBigWig_path = self.param_required('bedGraphToBigWig_path')
      chrom_length_file = genome_fai
      stranded = self.param('stranded')
      star_obj = Star_utils(
        star_exe=star_exe,
        input_files=[input_bam],
        genome_dir=star_ref,
        reference_gtf=gene_gtf,
        output_dir=work_dir,
        output_prefix=output_prefix,
        use_ephemeral_space=use_ephemeral_space,
        threads=run_thread)                                                    # set up star for run
      output_paths, star_cmd = \
        star_obj.generate_rna_bigwig(
          bedGraphToBigWig_path=bedGraphToBigWig_path,
          chrom_length_file=chrom_length_file,
          stranded=stranded)                                                   # generate bigwig signal tracks
      self.param(
        'dataflow_params',
        {'star_bigwigs': output_paths})                                        # pass bigwig paths to dataflow
    message = \
      'finished star for {0} {1}'.format(
        project_igf_id, run_igf_id)
    self.post_message_to_slack(message, reaction='pass')                       # send log to slack
    message = \
      'STAR {0} {1} command: {2}'.format(
        run_igf_id, output_prefix, star_cmd)
    self.comment_asana_task(
      task_name=project_igf_id,
      comment=message)                                                         # send commandline to Asana
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                       # post msg to slack for failed jobs
    raise
def run(self):
  '''
  A method for running samtools commands

  :param project_igf_id: A project igf id
  :param sample_igf_id: A sample igf id
  :param experiment_igf_id: An experiment igf id
  :param igf_session_class: A database session class
  :param reference_type: Reference genome collection type, default GENOME_FASTA
  :param threads: Number of threads to use for Bam to Cram conversion, default 4
  :param base_work_dir: Base work directory
  :param samtools_command: Samtools command
  :param samFlagInclude: Sam flags to include in filtered bam, default None
  :param samFlagExclude: Sam flags to exclude from the filtered bam, default None
  :param mapq_threshold: Skip alignments with MAPQ smaller than this value, default None
  :param use_encode_filter: For samtools filter, use Encode epigenome filter, i.e. samFlagExclude 1804 (PE) / 1796 (SE), default False
  :param encodePeExcludeFlag: For samtools filter, Encode exclude flag for PE reads, default 1804
  :param encodeSeExcludeFlag: For samtools filter, Encode exclude flag for SE reads, default 1796
  :param use_ephemeral_space: A toggle for temp dir settings, default 0
  :param copy_input: A toggle for copying input file to temp, 1 for True default 0 for False
  '''
  try:
    temp_output_dir = False
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    input_files = self.param_required('input_files')
    samtools_exe = self.param_required('samtools_exe')
    reference_type = self.param('reference_type')
    threads = self.param('threads')
    base_work_dir = self.param_required('base_work_dir')
    samtools_command = self.param_required('samtools_command')
    analysis_files = self.param_required('analysis_files')
    output_prefix = self.param_required('output_prefix')
    load_metrics_to_cram = self.param('load_metrics_to_cram')
    cram_collection_type = self.param('cram_collection_type')
    collection_table = self.param('collection_table')
    base_result_dir = self.param('base_result_dir')
    analysis_name = self.param('analysis_name')
    force_overwrite = self.param('force_overwrite')
    samFlagInclude = self.param('samFlagInclude')
    samFlagExclude = self.param('samFlagExclude')
    mapq_threshold = self.param('mapq_threshold')
    library_layout = self.param_required('library_layout')
    use_encode_filter = self.param('use_encode_filter')
    species_name = self.param_required('species_name')
    seed_date_stamp = self.param_required('date_stamp')
    use_ephemeral_space = self.param('use_ephemeral_space')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    if output_prefix is not None:
      output_prefix = '{0}_{1}'.format(
        output_prefix, seed_date_stamp)                                        # add datestamp to the output file prefix
    if use_encode_filter:
      samFlagInclude = None
      if library_layout == 'PAIRED':
        samFlagExclude = 1804
      else:
        samFlagExclude = 1796
    if not isinstance(input_files, list) or \
       len(input_files) == 0:
      raise ValueError('No input file found')
    if len(input_files) > 1:
      raise ValueError('More than one input file found: {0}'.format(input_files))
    output_bam_cram_list = list()
    input_file = input_files[0]
    temp_output_dir = get_temp_dir(
      use_ephemeral_space=use_ephemeral_space)                                 # get temp work dir
    work_dir_prefix = os.path.join(
      base_work_dir,
      project_igf_id,
      sample_igf_id,
      experiment_igf_id)
    work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)                 # get a run work dir
    samtools_cmdline = ''
    temp_output = None
    if samtools_command == 'idxstats':
      temp_output, samtools_cmdline = \
        run_bam_idxstat(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          output_dir=temp_output_dir,
          output_prefix=output_prefix,
          force=True)                                                          # run samtools idxstats
    elif samtools_command == 'flagstat':
      temp_output, samtools_cmdline = \
        run_bam_flagstat(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          output_dir=temp_output_dir,
          output_prefix=output_prefix,
          threads=threads,
          force=True)                                                          # run samtools flagstat
    elif samtools_command == 'stats':
      temp_output, samtools_cmdline, stats_metrics = \
        run_bam_stats(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          output_dir=temp_output_dir,
          output_prefix=output_prefix,
          threads=threads,
          force=True)                                                          # run samtools stats
      if load_metrics_to_cram and \
         len(stats_metrics) > 0:
        ca = CollectionAdaptor(**{'session_class': igf_session_class})
        attribute_data = ca.prepare_data_for_collection_attribute(
          collection_name=experiment_igf_id,
          collection_type=cram_collection_type,
          data_list=stats_metrics)
        ca.start_session()
        try:
          ca.create_or_update_collection_attributes(
            data=attribute_data,
            autosave=False)
          ca.commit_session()
          ca.close_session()
        except Exception as e:
          ca.rollback_session()
          ca.close_session()
          raise ValueError('Failed to load data to db: {0}'.format(e))
    elif samtools_command == 'merge':
      if output_prefix is None:
        raise ValueError('Missing output filename prefix for merged bam')
      sorted_by_name = self.param('sorted_by_name')
      temp_output = os.path.join(
        work_dir,
        '{0}_merged.bam'.format(output_prefix))
      samtools_cmdline = \
        merge_multiple_bam(
          samtools_exe=samtools_exe,
          input_bam_list=input_file,
          output_bam_path=temp_output,
          sorted_by_name=sorted_by_name,
          threads=threads,
          use_ephemeral_space=use_ephemeral_space,
          force=True)
    elif samtools_command == 'view_bamToCram':
      if base_result_dir is None:
        raise ValueError('base_result_dir is required for CRAM file loading')
      if analysis_name is None:
        raise ValueError('analysis_name is required for CRAM file loading')
      ref_genome = Reference_genome_utils(
        genome_tag=species_name,
        dbsession_class=igf_session_class,
        genome_fasta_type=reference_type)
      genome_fasta = ref_genome.get_genome_fasta()                             # get genome fasta
      cram_file = os.path.basename(input_file).replace('.bam', '.cram')        # get base cram file name
      cram_file = os.path.join(temp_output_dir, cram_file)                     # get cram file path in temp dir
      samtools_cmdline = \
        convert_bam_to_cram(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          reference_file=genome_fasta,
          cram_path=cram_file,
          use_ephemeral_space=use_ephemeral_space,
          threads=threads,
          force=True,
          dry_run=False)
      au = Analysis_collection_utils(
        dbsession_class=igf_session_class,
        analysis_name=analysis_name,
        tag_name=species_name,
        collection_name=experiment_igf_id,
        collection_type=cram_collection_type,
        collection_table=collection_table,
        base_path=base_result_dir)
      temp_output_bam_cram_list = au.load_file_to_disk_and_db(
        input_file_list=[cram_file],
        file_suffix='cram',
        withdraw_exisitng_collection=force_overwrite)                          # load file to db and disk
      for cram in temp_output_bam_cram_list:
        index_bam_or_cram(
          samtools_exe=samtools_exe,
          input_path=cram,
          threads=threads,
          dry_run=False)
        index_path = '{0}.crai'.format(cram)
        output_bam_cram_list.append(cram)
        output_bam_cram_list.append(index_path)
      if len(output_bam_cram_list) == 0:
        raise ValueError('No output cram file found')
    elif samtools_command == 'view_filterBam':
      temp_output_bam = os.path.join(
        temp_output_dir,
        os.path.basename(input_file).replace('.bam', '.filtered.bam'))
      samtools_cmdline = \
        filter_bam_file(
          samtools_exe=samtools_exe,
          input_bam=input_file,
          output_bam=temp_output_bam,
          samFlagInclude=samFlagInclude,
          samFlagExclude=samFlagExclude,
          threads=threads,
          mapq_threshold=mapq_threshold,
          index_output=False,
          dry_run=False)
      dest_path = os.path.join(
        work_dir,
        os.path.basename(temp_output_bam))
      move_file(
        source_path=temp_output_bam,
        destinationa_path=dest_path,
        force=True)
      index_bam_or_cram(
        samtools_exe=samtools_exe,
        input_path=dest_path,
        threads=threads,
        dry_run=False)
      index_path = '{0}.bai'.format(dest_path)
      output_bam_cram_list.append(dest_path)
      output_bam_cram_list.append(index_path)
    else:
      raise ValueError('Samtools command {0} not supported'.format(samtools_command))
    if temp_output is not None:
      dest_path = os.path.join(
        work_dir,
        os.path.basename(temp_output))
      if dest_path != temp_output:
        move_file(
          source_path=temp_output,
          destinationa_path=dest_path,
          force=True)
      analysis_files.append(dest_path)
    self.param(
      'dataflow_params',
      {'analysis_files': analysis_files,
       'output_bam_cram_list': output_bam_cram_list})                          # pass on samtools output list
    message = \
      'finished samtools {0} for {1} {2}'.format(
        samtools_command, project_igf_id, sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')                       # send log to slack
    message = \
      'finished samtools {0} for {1} {2}: {3}'.format(
        samtools_command, project_igf_id, sample_igf_id, samtools_cmdline)
    #self.comment_asana_task(task_name=project_igf_id, comment=message)        # send comment to Asana
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                       # post msg to slack for failed jobs
    raise
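# Dispatch summary for the samtools runnable above (derived from the branches
# in run() itself, not from external docs):
#   'idxstats' / 'flagstat' / 'stats'  -> text metrics written to a temp dir, then moved into work_dir
#   'merge'                            -> '<output_prefix>_merged.bam' in work_dir
#   'view_bamToCram'                   -> CRAM plus .crai, loaded via Analysis_collection_utils
#   'view_filterBam'                   -> '<name>.filtered.bam' plus .bai in work_dir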
def run(self):
  '''
  A runnable method for running deeptools analysis
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    input_files = self.param_required('input_files')
    threads = self.param('threads')
    base_work_dir = self.param_required('base_work_dir')
    base_results_dir = self.param_required('base_results_dir')
    deeptools_command = self.param_required('deeptools_command')
    analysis_files = self.param_required('analysis_files')
    output_prefix = self.param_required('output_prefix')
    load_signal_bigwig = self.param('load_signal_bigwig')
    signal_collection_type = self.param('signal_collection_type')
    blacklist_reference_type = self.param('blacklist_reference_type')
    species_name = self.param('species_name')
    deeptools_params = self.param('deeptools_params')
    deeptools_bamCov_params = self.param('deeptools_bamCov_params')
    collection_table = self.param('collection_table')
    remove_existing_file = self.param('remove_existing_file')
    withdraw_exisitng_collection = self.param('withdraw_exisitng_collection')
    analysis_name = self.param('analysis_name')
    use_ephemeral_space = self.param('use_ephemeral_space')
    seed_date_stamp = self.param_required('date_stamp')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    if output_prefix is not None:
      output_prefix = '{0}_{1}'.format(
        output_prefix, seed_date_stamp)                                        # add datestamp to the output file prefix
    if not isinstance(input_files, list) or \
       len(input_files) == 0:
      raise ValueError('No input file found')
    signal_files = list()
    work_dir_prefix = os.path.join(
      base_work_dir,
      project_igf_id,
      sample_igf_id,
      experiment_igf_id)
    work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)                 # get a run work dir
    ref_genome = Reference_genome_utils(
      genome_tag=species_name,
      dbsession_class=igf_session_class,
      blacklist_interval_type=blacklist_reference_type)                        # set up ref genome utils
    blacklist_bed = ref_genome.get_blacklist_region_bed()                      # get blacklist bed
    if deeptools_command == 'plotCoverage':
      output_raw_counts = '{0}_{1}.raw.txt'.format(output_prefix, 'plotCoverage')
      output_raw_counts = os.path.join(work_dir, output_raw_counts)
      plotcov_stdout = '{0}_{1}.stdout.txt'.format(output_prefix, 'plotCoverage')
      plotcov_stdout = os.path.join(work_dir, plotcov_stdout)
      output_plot = '{0}_{1}.pdf'.format(output_prefix, 'plotCoverage')
      output_plot = os.path.join(work_dir, output_plot)
      deeptools_args = \
        run_plotCoverage(
          bam_files=input_files,
          output_raw_counts=output_raw_counts,
          plotcov_stdout=plotcov_stdout,
          output_plot=output_plot,
          blacklist_file=blacklist_bed,
          thread=threads,
          use_ephemeral_space=use_ephemeral_space,
          params_list=deeptools_params)
      analysis_files.extend(
        [output_raw_counts, plotcov_stdout, output_plot])
    elif deeptools_command == 'bamCoverage':
      output_file = '{0}_{1}.bw'.format(output_prefix, 'bamCoverage')
      output_file = os.path.join(work_dir, output_file)
      if deeptools_params is None:
        deeptools_params = deeptools_bamCov_params
      deeptools_args = \
        run_bamCoverage(
          bam_files=input_files,
          output_file=output_file,
          blacklist_file=blacklist_bed,
          thread=threads,
          use_ephemeral_space=use_ephemeral_space,
          params_list=deeptools_params)
      if load_signal_bigwig:
        au = Analysis_collection_utils(
          dbsession_class=igf_session_class,
          analysis_name=analysis_name,
          base_path=base_results_dir,
          tag_name=species_name,
          collection_name=experiment_igf_id,
          collection_type=signal_collection_type,
          collection_table=collection_table)                                   # initiate analysis file loading
        output_file_list = au.load_file_to_disk_and_db(
          input_file_list=[output_file],
          remove_file=remove_existing_file,
          file_suffix='bw',
          withdraw_exisitng_collection=withdraw_exisitng_collection)           # load file to db and disk
        analysis_files.extend(output_file_list)
        signal_files.extend(output_file_list)
      else:
        analysis_files.append(output_file)
    elif deeptools_command == 'plotFingerprint':
      output_raw_counts = '{0}_{1}.raw.txt'.format(output_prefix, 'plotFingerprint')
      output_raw_counts = os.path.join(work_dir, output_raw_counts)
      output_matrics = '{0}_{1}.metrics.txt'.format(output_prefix, 'plotFingerprint')
      output_matrics = os.path.join(work_dir, output_matrics)
      output_plot = '{0}_{1}.pdf'.format(output_prefix, 'plotFingerprint')
      output_plot = os.path.join(work_dir, output_plot)
      deeptools_args = \
        run_plotFingerprint(
          bam_files=input_files,
          output_raw_counts=output_raw_counts,
          output_matrics=output_matrics,
          output_plot=output_plot,
          blacklist_file=blacklist_bed,
          thread=threads,
          use_ephemeral_space=use_ephemeral_space,
          params_list=deeptools_params)
      analysis_files.extend(
        [output_raw_counts, output_matrics, output_plot])
    else:
      raise ValueError('Deeptool command {0} is not implemented yet'.format(deeptools_command))
    self.param(
      'dataflow_params',
      {'analysis_files': analysis_files,
       'signal_files': signal_files,
       'seed_date_stamp': seed_date_stamp})                                    # pass on deeptools output list
    message = \
      'finished deeptools {0} for {1} {2}'.format(
        deeptools_command, project_igf_id, sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')                       # send log to slack
    message = \
      'Deeptools {0} command: {1}'.format(
        deeptools_command, deeptools_args)
    #self.comment_asana_task(task_name=project_igf_id, comment=message)        # send commandline to Asana
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                       # post msg to slack for failed jobs
    raise
def run(self):
  '''
  A runnable method for running PPQT analysis
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    input_files = self.param_required('input_files')
    rscript_path = self.param_required('rscript_path')
    ppqt_exe = self.param_required('ppqt_exe')
    base_work_dir = self.param_required('base_work_dir')
    base_result_dir = self.param_required('base_result_dir')
    library_strategy = self.param_required('library_strategy')
    analysis_files = self.param_required('analysis_files')
    output_prefix = self.param_required('output_prefix')
    species_name = self.param_required('species_name')
    analysis_name = self.param('analysis_name')
    seed_date_stamp = self.param_required('date_stamp')
    load_metrics_to_cram = self.param('load_metrics_to_cram')
    ppqt_collection_type = self.param('ppqt_collection_type')
    cram_collection_type = self.param('cram_collection_type')
    collection_table = self.param('collection_table')
    force_overwrite = self.param('force_overwrite')
    use_ephemeral_space = self.param('use_ephemeral_space')
    threads = self.param('threads')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    if output_prefix is not None:
      output_prefix = '{0}_{1}'.format(
        output_prefix, seed_date_stamp)                                        # add datestamp to the output file prefix
    if not isinstance(input_files, list) or \
       len(input_files) == 0:
      raise ValueError('No input file found')
    if len(input_files) > 1:
      raise ValueError('More than one input file found: {0}'.format(input_files))
    if analysis_name is None:
      analysis_name = library_strategy                                         # use library_strategy as default analysis_name
    input_file = input_files[0]
    work_dir_prefix = os.path.join(
      base_work_dir,
      project_igf_id,
      sample_igf_id,
      experiment_igf_id)
    work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)                 # get a run work dir
    ppqt_obj = Ppqt_tools(
      rscript_path=rscript_path,
      ppqt_exe=ppqt_exe,
      use_ephemeral_space=use_ephemeral_space,
      threads=threads)
    ppqt_cmd, spp_output, pdf_output, spp_data = \
      ppqt_obj.run_ppqt(
        input_bam=input_file,
        output_dir=work_dir,
        output_spp_name='{0}_{1}.spp.out'.format(output_prefix, 'PPQT'),
        output_pdf_name='{0}_{1}.spp.pdf'.format(output_prefix, 'PPQT'))
    analysis_files.append(spp_output)
    au = Analysis_collection_utils(
      dbsession_class=igf_session_class,
      analysis_name=analysis_name,
      tag_name=species_name,
      collection_name=experiment_igf_id,
      collection_type=ppqt_collection_type,
      collection_table=collection_table,
      base_path=base_result_dir)
    output_ppqt_list = au.load_file_to_disk_and_db(
      input_file_list=[pdf_output],
      file_suffix='pdf',
      withdraw_exisitng_collection=force_overwrite)                            # load file to db and disk
    if load_metrics_to_cram and \
       len(spp_data) > 0:
      ca = CollectionAdaptor(**{'session_class': igf_session_class})
      attribute_data = ca.prepare_data_for_collection_attribute(
        collection_name=experiment_igf_id,
        collection_type=cram_collection_type,
        data_list=spp_data)
      ca.start_session()
      try:
        ca.create_or_update_collection_attributes(
          data=attribute_data,
          autosave=False)
        ca.commit_session()
        ca.close_session()
      except Exception as e:
        ca.rollback_session()
        ca.close_session()
        raise ValueError('Failed to load data to db: {0}'.format(e))
    self.param(
      'dataflow_params',
      {'analysis_files': analysis_files,
       'output_ppqt_list': output_ppqt_list})                                  # pass on ppqt output list
    message = \
      'finished PPQT for {0} {1}'.format(
        project_igf_id, sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')                       # send log to slack
    message = \
      'finished PPQT for {0} {1}: {2}'.format(
        project_igf_id, sample_igf_id, ppqt_cmd)
    self.comment_asana_task(
      task_name=project_igf_id,
      comment=message)                                                         # send comment to Asana
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                       # post msg to slack for failed jobs
    raise
def run(self):
  '''
  A method for running BWA alignment
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    run_igf_id = self.param_required('run_igf_id')
    bwa_exe = self.param_required('bwa_exe')
    samtools_exe = self.param_required('samtools_exe')
    r1_read_file = self.param_required('r1_read_file')
    r2_read_file = self.param('r2_read_file')
    run_thread = self.param('run_thread')
    output_prefix = self.param_required('output_prefix')
    igf_session_class = self.param_required('igf_session_class')
    species_name = self.param('species_name')
    reference_type = self.param('reference_type')
    base_work_dir = self.param_required('base_work_dir')
    parameter_options = self.param('parameter_options')
    seed_date_stamp = self.param_required('date_stamp')
    use_ephemeral_space = self.param('use_ephemeral_space')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    input_fastq_list = list()
    input_fastq_list.append(r1_read_file[0])
    if r2_read_file is not None and \
       len(r2_read_file) > 0:
      input_fastq_list.append(r2_read_file[0])
    work_dir_prefix = os.path.join(
      base_work_dir,
      project_igf_id,
      sample_igf_id,
      experiment_igf_id,
      run_igf_id)
    work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)                 # get a run work dir
    ref_genome = Reference_genome_utils(
      genome_tag=species_name,
      dbsession_class=igf_session_class,
      bwa_ref_type=reference_type)                                             # set up ref genome utils
    bwa_ref = ref_genome.get_genome_bwa()                                      # get bwa ref
    bwa_obj = BWA_util(
      bwa_exe=bwa_exe,
      samtools_exe=samtools_exe,
      ref_genome=bwa_ref,
      input_fastq_list=input_fastq_list,
      output_dir=work_dir,
      output_prefix=output_prefix,
      bam_output=True,
      use_ephemeral_space=use_ephemeral_space,
      thread=run_thread)                                                       # set up bwa for run
    if isinstance(parameter_options, str):
      parameter_options = json.loads(parameter_options)                        # convert string param to dict
    final_output_file, bwa_cmd = \
      bwa_obj.run_mem(parameter_options=parameter_options)                     # run bwa mem
    self.param(
      'dataflow_params',
      {'bwa_bam': final_output_file,
       'seed_date_stamp': seed_date_stamp})                                    # pass on bwa output
    message = \
      'finished bwa {0} {1}'.format(
        project_igf_id, run_igf_id)
    self.post_message_to_slack(message, reaction='pass')                       # send log to slack
    self.comment_asana_task(
      task_name=project_igf_id,
      comment=message)                                                         # send comment to Asana
    message = \
      'Bwa {0} {1}'.format(
        run_igf_id, bwa_cmd)
    self.comment_asana_task(
      task_name=project_igf_id,
      comment=message)                                                         # send commandline to Asana
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                       # post msg to slack for failed jobs
    raise
def run(self):
  '''
  A method for running RSEM expression analysis
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    rsem_exe_dir = self.param_required('rsem_exe_dir')
    library_layout = self.param_required('library_layout')
    reference_type = self.param_required('reference_type')
    igf_session_class = self.param_required('igf_session_class')
    output_prefix = self.param_required('output_prefix')
    base_work_dir = self.param_required('base_work_dir')
    input_bams = self.param_required('input_bams')
    strandedness = self.param('strandedness')
    threads = self.param('threads')
    use_ephemeral_space = self.param('use_ephemeral_space')
    memory_limit = self.param('memory_limit')
    rsem_options = self.param('rsem_options')
    force_overwrite = self.param('force_overwrite')
    species_name = self.param('species_name')
    seed_date_stamp = self.param_required('date_stamp')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    if not isinstance(input_bams, list) or \
       len(input_bams) != 1:
      raise ValueError(
        'Expecting one input bam for rsem and got : {0}'.format(len(input_bams)))
    work_dir_prefix = os.path.join(
      base_work_dir,
      project_igf_id,
      sample_igf_id,
      experiment_igf_id)
    work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)                 # get a run work dir
    ref_genome = Reference_genome_utils(
      genome_tag=species_name,
      dbsession_class=igf_session_class,
      gene_rsem_type=reference_type)
    rsem_ref = ref_genome.get_transcriptome_rsem()                             # fetch rsem reference
    if library_layout == 'PAIRED':
      paired_end = True
    else:
      paired_end = False
    rsem_obj = RSEM_utils(
      rsem_exe_dir=rsem_exe_dir,
      reference_rsem=rsem_ref,
      input_bam=input_bams[0],
      threads=threads,
      use_ephemeral_space=use_ephemeral_space,
      memory_limit=memory_limit)                                               # prepare rsem for run
    rsem_cmd, rsem_output_list, rsem_log_file = \
      rsem_obj.run_rsem_calculate_expression(
        output_dir=work_dir,
        output_prefix=output_prefix,
        paired_end=paired_end,
        strandedness=strandedness,
        options=rsem_options,
        force=force_overwrite)
    if not isinstance(rsem_output_list, list) or \
       len(rsem_output_list) == 0:
      raise ValueError('No RSEM output files found')                           # check output files
    self.param(
      'dataflow_params',
      {'rsem_output': rsem_output_list,
       'rsem_log_file': rsem_log_file,
       'seed_date_stamp': seed_date_stamp})                                    # pass on rsem output list
    message = \
      'Finished RSEM {0} for {1}'.format(
        project_igf_id, sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')                       # send log to slack
    message = \
      'RSEM {0} command: {1}'.format(
        experiment_igf_id, rsem_cmd)
    #self.comment_asana_task(task_name=project_igf_id, comment=message)        # send commandline to Asana
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                       # post msg to slack for failed jobs
    raise
def test_get_datestamp_label(self):
  date_str = '2018-08-23 15:15:01'
  self.assertEqual(get_datestamp_label(date_str), '20180823')
  self.assertEqual(get_datestamp_label(parse(date_str)), '20180823')
def run(self):
  '''
  A method for running featureCounts tool
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    featurecounts_exe = self.param_required('featurecounts_exe')
    input_files = self.param_required('input_files')
    reference_gtf = self.param('reference_gtf')
    base_work_dir = self.param_required('base_work_dir')
    igf_session_class = self.param_required('igf_session_class')
    species_name = self.param_required('species_name')
    parameter_options = self.param('parameter_options')
    run_thread = self.param('run_thread')
    use_ephemeral_space = self.param('use_ephemeral_space')
    output_prefix = self.param_required('output_prefix')
    seed_date_stamp = self.param_required('date_stamp')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    work_dir_prefix = os.path.join(
      base_work_dir,
      project_igf_id,
      sample_igf_id,
      experiment_igf_id)
    work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)                 # get a run work dir
    output_prefix = '{0}_{1}'.format(
      output_prefix, seed_date_stamp)
    output_file = os.path.join(
      work_dir,
      output_prefix)
    ref_genome = Reference_genome_utils(
      genome_tag=species_name,
      dbsession_class=igf_session_class,
      gene_gtf_type=reference_gtf)                                             # set up ref genome utils
    gene_gtf = ref_genome.get_gene_gtf()                                       # get gtf file
    summary_file, featureCount_cmd = \
      run_featureCounts(
        featurecounts_exe=featurecounts_exe,
        input_gtf=gene_gtf,
        input_bams=input_files,
        output_file=output_file,
        thread=run_thread,
        use_ephemeral_space=use_ephemeral_space,
        options=parameter_options)
    self.param(
      'dataflow_params',
      {'featureCounts_output': output_file,
       'featureCounts_summary': summary_file,
       'seed_date_stamp': seed_date_stamp})
    message = \
      'finished featureCounts for {0} {1}'.format(
        project_igf_id, experiment_igf_id)
    self.post_message_to_slack(message, reaction='pass')                       # send log to slack
    message = \
      'featureCounts {0} command: {1}'.format(
        experiment_igf_id, featureCount_cmd)
    self.comment_asana_task(
      task_name=project_igf_id,
      comment=message)                                                         # send commandline to Asana
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                       # post msg to slack for failed jobs
    raise
def run(self):
  '''
  A method for running Fastp commands

  :param project_igf_id: A project_igf_id from dataflow
  :param experiment_igf_id: An experiment_igf_id from dataflow
  :param sample_igf_id: A sample_igf_id from dataflow
  :param run_igf_id: A run_igf_id from dataflow
  :param fastp_exe: Fastp exe path from analysis config
  :param input_fastq_list: Input fastq list from dataflow
  :param base_work_dir: Base work dir path from analysis config
  :param run_thread: Number of threads for fastp run, default 1
  :param split_fastq: Enable splitting fastq files, default None
  :param split_by_lines_count: Number of fastq lines to be used if split_fastq is True, default 5000000
  :param fastp_options_list: A list of fastp tool options, default ['-a=auto','--qualified_quality_phred=15','--length_required=15']
  :param platform_name: Sequencing platform name from dataflow
  :param use_ephemeral_space: A toggle for temp dir setting, default 0
  :param polyg_platform_list: A list of Illumina platforms which emit poly Gs for empty cycles, default ['NextSeq','NOVASEQ6000']
  :param enable_polyg_trim: Enable Fastp poly G trim, default False
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    run_igf_id = self.param_required('run_igf_id')
    fastp_exe = self.param_required('fastp_exe')
    input_fastq_list = self.param_required('input_fastq_list')
    base_work_dir = self.param_required('base_work_dir')
    run_thread = self.param('run_thread')
    split_fastq = self.param('split_fastq')
    split_by_lines_count = self.param('split_by_lines_count')
    fastp_options_list = self.param('fastp_options_list')
    platform_name = self.param_required('platform_name')
    polyg_platform_list = self.param('polyg_platform_list')
    enable_polyg_trim = self.param('enable_polyg_trim')
    use_ephemeral_space = self.param('use_ephemeral_space')
    seed_date_stamp = self.param_required('date_stamp')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    work_dir_prefix = os.path.join(
      base_work_dir,
      project_igf_id,
      sample_igf_id,
      experiment_igf_id)
    work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)                 # get a run work dir
    split_fastq = \
      False if split_fastq is None else True                                   # set default value for split fastq
    if platform_name in polyg_platform_list:
      enable_polyg_trim = True                                                 # enable poly G trim for newer Illumina platforms
    fastp_obj = Fastp_utils(
      fastp_exe=fastp_exe,
      input_fastq_list=input_fastq_list,
      log_output_prefix=run_igf_id,
      output_dir=work_dir,
      run_thread=run_thread,
      use_ephemeral_space=use_ephemeral_space,
      enable_polyg_trim=enable_polyg_trim,
      split_by_lines_count=split_by_lines_count,
      fastp_options_list=fastp_options_list)                                   # set up fastp tool for run
    output_read1, output_read2, output_html_file, output_json_file, _ = \
      fastp_obj.run_adapter_trimming(split_fastq=split_fastq)                  # run fastp trimming
    self.param(
      'dataflow_params',
      {'output_read1': output_read1,
       'output_read2': output_read2,
       'output_html_file': output_html_file,
       'output_json_file': output_json_file,
       'seed_date_stamp': seed_date_stamp})                                    # pass on fastp output list
    message = \
      'finished fastp for {0} {1}'.format(
        project_igf_id, sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')                       # send log to slack
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                       # post msg to slack for failed jobs
    raise