def run(self):
  '''
  A method for running the cellranger count metrics extraction

  :param project_igf_id: A project igf id
  :param experiment_igf_id: An experiment igf id
  :param sample_igf_id: A sample igf id
  :param igf_session_class: A database session class
  :param analysis_output_list: Cellranger analysis tar output path
  :param collection_type: Cellranger results collection type
  :param metrics_filename: Name of the metrics file, default metrics_summary.csv
  :returns: None
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    analysis_output_list = self.param_required('analysis_output_list')
    collection_type = self.param('collection_type')
    metrics_filename = self.param('metrics_filename')
    attribute_prefix = self.param('attribute_prefix')
    for infile in analysis_output_list:
      check_file_path(infile)  # check input file paths
    cellranger_tar = analysis_output_list[0]
    cellranger_metrics = \
      extract_cellranger_count_metrics_summary(
        cellranger_tar=cellranger_tar,
        target_filename=metrics_filename,
        collection_name=experiment_igf_id,
        collection_type=collection_type,
        attribute_prefix=attribute_prefix)  # extract cellranger metrics stats as a dictionary
    ca = CollectionAdaptor(**{'session_class': igf_session_class})
    ca.start_session()
    try:
      ca.create_or_update_collection_attributes(
        data=cellranger_metrics,
        autosave=False)  # load cellranger metrics to the collection attribute table
      ca.commit_session()
      ca.close_session()
    except Exception:
      ca.rollback_session()
      ca.close_session()
      raise
    self.param('dataflow_params', {'cellranger_attribute': 'done'})
  except Exception as e:
    message = \
      'project: {2}, sample: {3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')  # post msg to slack for failed jobs
    raise
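
# A minimal standalone sketch of the commit/rollback pattern used above, based on
# the CollectionAdaptor calls shown in this runnable; the session class and the
# metrics dictionary below are illustrative placeholders, not pipeline values.
def load_collection_attributes_safely(session_class, metrics_data):
  ca = CollectionAdaptor(**{'session_class': session_class})
  ca.start_session()
  try:
    ca.create_or_update_collection_attributes(
      data=metrics_data,
      autosave=False)  # stage attribute rows without committing
    ca.commit_session()  # persist only if every row was staged cleanly
  except Exception:
    ca.rollback_session()  # discard partial writes on any failure
    raise
  finally:
    ca.close_session()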
def test_find_fastq_and_build_db_collection(self):
  ci = Collect_seqrun_fastq_to_db(
    fastq_dir=self.fastq_dir,
    session_class=self.session_class,
    seqrun_igf_id=self.seqrun_igf_id,
    flowcell_id=self.flowcell_id,
    model_name=self.model_name,
    file_location=self.file_location,
    samplesheet_file=self.samplesheet_file,
    manifest_name=self.manifest_name)
  ci.find_fastq_and_build_db_collection()
  ca = CollectionAdaptor(**{'session_class': self.session_class})
  ca.start_session()
  file_path = (
    'data/collect_fastq_dir/sc_1_8/IGFP0001_test_22-8-2017_rna_sc/'
    'IGF00001/IGF00001-1_S1_L003_R1_001.fastq.gz')
  (name, table) = \
    ca.fetch_collection_name_and_table_from_file_path(file_path)
  ca.close_session()
  self.assertEqual(name, 'IGF00001_NEXTSEQ_TESTABC_3')
def run(self):
  try:
    project_igf_id = self.param_required('project_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    run_igf_id = self.param_required('run_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    fastq_collection_type = self.param('fastq_collection_type')
    fastq_collection_table = self.param('fastq_collection_table')
    ca = CollectionAdaptor(**{'session_class': igf_session_class})
    ca.start_session()
    fastq_files = \
      ca.get_collection_files(
        collection_name=run_igf_id,
        collection_type=fastq_collection_type,
        collection_table=fastq_collection_table,
        output_mode='dataframe')
    ca.close_session()
    fastq_list = \
      list(fastq_files['file_path'].values)  # convert fastq file paths to a list
    if not isinstance(fastq_list, list) or \
       len(fastq_list) == 0:
      raise ValueError(
        'No fastq file found for run {0}'.format(run_igf_id))
    for file in fastq_list:
      if not os.path.exists(file):
        raise IOError(
          'Fastq file path {0} not found for run {1}'.format(file, run_igf_id))
    self.param(
      'dataflow_params',
      {'fastq_files_list': fastq_list})  # add fastq file paths to dataflow
  except Exception as e:
    message = \
      'project: {2}, sample: {3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(
      message, reaction='fail')  # post msg to slack for failed jobs
    raise
def _fetch_collection_files(
      self, collection_type, check_missing=False,
      unique_file=True, file_path_label='file_path'):
  '''
  An internal method for fetching collection group files from the database

  :param collection_type: Collection type information for database lookup
  :param check_missing: A toggle for raising errors for missing files, default False
  :param unique_file: A toggle for keeping only a single collection file, default True
  :param file_path_label: Name of the file_path column in the File table, default file_path
  :returns: A single file if unique_file is True, else a list of files
  '''
  try:
    ref_file = None
    ca = CollectionAdaptor(**{'session_class': self.dbsession_class})
    ca.start_session()
    collection_files = \
      ca.get_collection_files(
        collection_name=self.genome_tag,
        collection_type=collection_type,
        output_mode='dataframe')  # fetch collection files from db
    ca.close_session()
    if len(collection_files.index) > 0:
      files = list(collection_files[file_path_label].values)
      if unique_file:
        ref_file = files[0]  # select the first file from the db results
      else:
        ref_file = files
    if ref_file is None and check_missing:
      raise ValueError(
        'No file collection found for reference genome {0}:{1}'.format(
          self.genome_tag, collection_type))
    return ref_file
  except Exception:
    raise
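
# A hedged usage sketch: elsewhere in this codebase the surrounding class
# (Reference_genome_utils) is built with a genome tag and a db session class,
# and public getters such as get_genome_fasta() resolve through
# _fetch_collection_files() calls like the one above. The tag and collection
# type values here are illustrative assumptions, not canonical names.
def fetch_genome_fasta(session_class):
  ref_genome = \
    Reference_genome_utils(
      genome_tag='HG38',                # placeholder genome build tag
      dbsession_class=session_class,
      genome_fasta_type='GENOME_FASTA')
  return ref_genome.get_genome_fasta()  # a single path, via unique_file=True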
def test_find_fastq_and_build_db_collection(self):
  ci = Collect_seqrun_fastq_to_db(
    fastq_dir=self.fastq_dir,
    session_class=self.session_class,
    seqrun_igf_id=self.seqrun_igf_id,
    flowcell_id=self.flowcell_id,
    model_name=self.model_name,
    file_location=self.file_location,
    samplesheet_file=self.samplesheet_file,
    manifest_name=self.manifest_name)
  ci.find_fastq_and_build_db_collection()
  ca = CollectionAdaptor(**{'session_class': self.session_class})
  ca.start_session()
  file_path = (
    'data/collect_fastq_dir/1_16/IGFP0001_test_22-8-2017_rna/'
    'IGF00002/IGF00002-2_S1_L001_R1_001.fastq.gz')
  (name, table) = \
    ca.fetch_collection_name_and_table_from_file_path(file_path)
  ca.close_session()
  self.assertEqual(name, 'IGF00002_MISEQ_000000000-D0YLK_1')
def setUp(self):
  self.dbconfig = 'data/dbconfig.json'
  dbparam = read_dbconf_json(self.dbconfig)
  base = BaseAdaptor(**dbparam)
  self.engine = base.engine
  self.dbname = dbparam['dbname']
  Base.metadata.create_all(self.engine)
  self.session_class = base.get_session_class()
  self.json_file_path = 'data/reset_samplesheet_md5/seqrun1_file_md5.json'
  json_data = pd.DataFrame([
    {'file_md5': '1e7531158974b5a5b7cbb7dde09ac779',
     'seqrun_file_name': 'SampleSheet.csv'},
    {'file_md5': '2b22f945bc9e7e390af5432425783a03',
     'seqrun_file_name': 'RTAConfiguration.xml'}])
  with open(self.json_file_path, 'w') as jp:
    json.dump(json_data.to_dict(orient='records'), jp, indent=4)
  self.initial_json_md5 = \
    calculate_file_checksum(filepath=self.json_file_path)
  self.correct_samplesheet_md5 = '259ed03f2e8c45980de121f7c3a70565'
  self.json_collection_name = 'seqrun1'
  self.json_collection_type = 'ILLUMINA_BCL_MD5'
  self.seqrun_path = 'data/reset_samplesheet_md5'
  self.seqrun_input_list = 'data/reset_samplesheet_md5/seqrun_input_list.txt'
  ca = CollectionAdaptor(**{'session_class': self.session_class})
  ca.start_session()
  data = pd.DataFrame([
    {'name': self.json_collection_name,
     'type': self.json_collection_type,
     'table': 'seqrun',
     'file_path': self.json_file_path}])
  ca.load_file_and_create_collection(data, autosave=True, hasher='md5')
  ca.close_session()
  with open(self.seqrun_input_list, 'w') as fp:
    fp.write(self.json_collection_name)
def run(self):
  '''
  A method for generating the scanpy report

  :param project_igf_id: A project igf id
  :param sample_igf_id: A sample igf id
  :param experiment_igf_id: An experiment igf id
  :param igf_session_class: A database session class
  :param species_name: Species name
  :param base_result_dir: Base results directory
  :param report_template_file: A template file for writing the scanpy report
  :param analysis_name: Analysis name, default scanpy
  :param species_name_lookup: A dictionary for Ensembl species name lookup
  :param cellranger_collection_type: Cellranger analysis collection type, default CELLRANGER_RESULTS
  :param scanpy_collection_type: Scanpy report collection type, default SCANPY_RESULTS
  :param collection_table: Collection table name for loading the scanpy report, default experiment
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    species_name = self.param_required('species_name')
    report_template_file = self.param_required('report_template_file')
    analysis_name = self.param_required('analysis_name')
    base_result_dir = self.param_required('base_result_dir')
    base_work_dir = self.param_required('base_work_dir')
    species_name_lookup = self.param('species_name_lookup')
    cellranger_collection_type = self.param('cellranger_collection_type')
    scanpy_collection_type = self.param('scanpy_collection_type')
    collection_table = self.param('collection_table')
    cellbrowser_dir_prefix = self.param('cellbrowser_dir_prefix')
    use_ephemeral_space = self.param('use_ephemeral_space')
    cellranger_tarfile = ''
    output_report = ''
    cellbrowser_h5ad = ''
    work_dir_prefix = \
      os.path.join(
        base_work_dir,
        project_igf_id,
        sample_igf_id,
        experiment_igf_id)
    work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)  # get a run work dir
    if species_name in species_name_lookup:  # check for human or mouse
      ensembl_species_name = species_name_lookup[species_name]  # get Ensembl species name
      # fetch cellranger tar path from db
      if cellranger_tarfile == '':
        ca = CollectionAdaptor(**{'session_class': igf_session_class})
        ca.start_session()  # connect to database
        cellranger_tarfiles = \
          ca.get_collection_files(
            collection_name=experiment_igf_id,
            collection_type=cellranger_collection_type,
            output_mode='dataframe')  # fetch collection files
        ca.close_session()
        if len(cellranger_tarfiles.index) == 0:
          raise ValueError(
            'No cellranger analysis output found for exp {0}'.format(
              experiment_igf_id))
        cellranger_tarfile = \
          cellranger_tarfiles['file_path'].values[0]  # select the first file as the analysis file
      # extract filtered metrics files from tar
      output_dir = \
        get_temp_dir(use_ephemeral_space=use_ephemeral_space)  # get a temp dir
      datestamp = get_datestamp_label()
      cellbrowser_dir = \
        os.path.join(
          work_dir,
          '{0}_{1}'.format(cellbrowser_dir_prefix, datestamp))
      cellbrowser_h5ad = os.path.join(cellbrowser_dir, 'scanpy.h5ad')
      output_report = os.path.join(output_dir, 'report.html')  # get temp report path
      matrix_file, gene_file, barcode_file = \
        self._extract_cellranger_filtered_metrics(
          tar_file=cellranger_tarfile,
          output_dir=output_dir)  # get cellranger output files
      sp = \
        Scanpy_tool(
          project_name=project_igf_id,
          sample_name=sample_igf_id,
          matrix_file=matrix_file,
          features_tsv=gene_file,
          barcode_tsv=barcode_file,
          html_template_file=report_template_file,
          species_name=ensembl_species_name,
          output_file=output_report,
          use_ephemeral_space=use_ephemeral_space,
          cellbrowser_h5ad=cellbrowser_h5ad)
      sp.generate_report()  # generate scanpy report
      # load files to db and disk
      au = \
        Analysis_collection_utils(
          dbsession_class=igf_session_class,
          analysis_name=analysis_name,
          tag_name=species_name,
          collection_name=experiment_igf_id,
          collection_type=scanpy_collection_type,
          collection_table=collection_table,
          base_path=base_result_dir)  # initiate loading of the report file
      output_file_list = \
        au.load_file_to_disk_and_db(
          input_file_list=[output_report],
          withdraw_exisitng_collection=True)  # load file to db and disk
      output_report = output_file_list[0]
    self.param(
      'dataflow_params',
      {'output_report': output_report,
       'scanpy_h5ad_path': cellbrowser_h5ad})  # pass on output report file path
  except Exception as e:
    message = \
      'project: {2}, sample: {3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(
      message, reaction='fail')  # post msg to slack for failed jobs
    raise
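
# A minimal sketch of the result-loading step above, using the
# Analysis_collection_utils API as called in this runnable; every argument value
# is a placeholder. load_file_to_disk_and_db() moves the input under base_path
# and returns the final path(s), which is why output_report is reassigned above.
def load_scanpy_report(session_class, report_path):
  au = \
    Analysis_collection_utils(
      dbsession_class=session_class,
      analysis_name='scanpy',
      tag_name='HG38',                    # placeholder species tag
      collection_name='IGFQ0001_EXP1',    # placeholder experiment id
      collection_type='SCANPY_RESULTS',
      collection_table='experiment',
      base_path='/path/to/results')       # placeholder results root
  final_paths = \
    au.load_file_to_disk_and_db(
      input_file_list=[report_path],
      withdraw_exisitng_collection=True)  # replace any existing collection entry
  return final_paths[0]                   # final path under base_path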
def run(self):
  '''
  A method for running picard commands

  :param project_igf_id: A project igf id
  :param sample_igf_id: A sample igf id
  :param experiment_igf_id: An experiment igf id
  :param igf_session_class: A database session class
  :param reference_type: Reference genome collection type, default GENOME_FASTA
  :param reference_refFlat: Reference refFlat collection type, default GENE_REFFLAT
  :param ribosomal_interval_type: Collection type for ribosomal interval list, default RIBOSOMAL_INTERVAL
  :param species_name: Species name
  :param java_exe: Java path
  :param java_param: Java run parameters
  :param picard_jar: Picard jar path
  :param picard_command: Picard command
  :param base_work_dir: Base work directory
  :param copy_input: A toggle for copying input file to temp, 1 for True, default 0 for False
  :param use_ephemeral_space: A toggle for temp dir setting, default 0
  :param patterned_flowcell_list: A list of patterned flowcells, default ['HISEQ4000', 'NEXTSEQ']
  '''
  try:
    temp_output_dir = False
    project_igf_id = self.param_required('project_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    java_exe = self.param_required('java_exe')
    java_param = self.param_required('java_param')
    picard_jar = self.param_required('picard_jar')
    input_files = self.param_required('input_files')
    picard_command = self.param_required('picard_command')
    igf_session_class = self.param_required('igf_session_class')
    species_name = self.param('species_name')
    reference_type = self.param('reference_type')
    reference_refFlat = self.param('reference_refFlat')
    ribosomal_interval_type = self.param('ribosomal_interval_type')
    base_work_dir = self.param_required('base_work_dir')
    analysis_files = self.param_required('analysis_files')
    picard_option = self.param('picard_option')
    patterned_flowcell_list = self.param('patterned_flowcell_list')
    platform_name = self.param_required('platform_name')
    output_prefix = self.param('output_prefix')
    load_metrics_to_cram = self.param('load_metrics_to_cram')
    cram_collection_type = self.param('cram_collection_type')
    seed_date_stamp = self.param_required('date_stamp')
    use_ephemeral_space = self.param('use_ephemeral_space')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    if output_prefix is not None:
      output_prefix = \
        '{0}_{1}'.format(
          output_prefix,
          seed_date_stamp)  # add seed datestamp to output prefix
    work_dir_prefix = \
      os.path.join(
        base_work_dir,
        project_igf_id,
        sample_igf_id,
        experiment_igf_id)
    work_dir = \
      self.get_job_work_dir(work_dir=work_dir_prefix)  # get a run work dir
    temp_output_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)  # get temp work dir
    ref_genome = \
      Reference_genome_utils(
        genome_tag=species_name,
        dbsession_class=igf_session_class,
        genome_fasta_type=reference_type,
        gene_reflat_type=reference_refFlat,
        ribosomal_interval_type=ribosomal_interval_type)  # set up ref genome utils
    genome_fasta = ref_genome.get_genome_fasta()  # get genome fasta
    ref_flat_file = ref_genome.get_gene_reflat()  # get refFlat file
    ribosomal_interval_file = \
      ref_genome.get_ribosomal_interval()  # get ribosomal interval file
    patterned_flowcell = False
    if platform_name in patterned_flowcell_list:  # check for patterned flowcell
      patterned_flowcell = True
    if load_metrics_to_cram and \
       not cram_collection_type:
      raise ValueError(
        'Cram file collection type is required for loading picard metrics to db')
    picard = \
      Picard_tools(
        java_exe=java_exe,
        java_param=java_param,
        picard_jar=picard_jar,
        input_files=input_files,
        output_dir=temp_output_dir,
        ref_fasta=genome_fasta,
        patterned_flowcell=patterned_flowcell,
        ref_flat_file=ref_flat_file,
        picard_option=picard_option,
        output_prefix=output_prefix,
        use_ephemeral_space=use_ephemeral_space,
        ribisomal_interval=ribosomal_interval_file)  # set up picard tool
    temp_output_files, picard_command_line, picard_metrics = \
      picard.run_picard_command(command_name=picard_command)  # run picard command
    output_file_list = list()
    for source_path in temp_output_files:
      dest_path = \
        os.path.join(
          work_dir,
          os.path.basename(source_path))  # get destination file path
      move_file(
        source_path=source_path,
        destinationa_path=dest_path,
        force=True)  # move files to work dir
      output_file_list.append(dest_path)
    remove_dir(temp_output_dir)
    analysis_files.extend(output_file_list)
    bam_files = list()
    for file in output_file_list:
      if file.endswith('.bam'):
        bam_files.append(file)
    if load_metrics_to_cram and \
       len(picard_metrics) > 0:
      ca = CollectionAdaptor(**{'session_class': igf_session_class})
      attribute_data = \
        ca.prepare_data_for_collection_attribute(
          collection_name=experiment_igf_id,
          collection_type=cram_collection_type,
          data_list=picard_metrics)  # format data for collection attribute table
      ca.start_session()
      try:
        ca.create_or_update_collection_attributes(
          data=attribute_data,
          autosave=False)  # load data to collection attribute table
        ca.commit_session()
        ca.close_session()
      except Exception:
        ca.rollback_session()
        ca.close_session()
        raise
    self.param(
      'dataflow_params',
      {'analysis_files': analysis_files,
       'bam_files': bam_files,
       'seed_date_stamp': seed_date_stamp})  # pass on picard output list
    message = \
      'finished picard {0} for {1} {2}'.format(
        picard_command,
        project_igf_id,
        sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')  # send log to slack
    message = \
      'Picard {0} command: {1}'.format(
        picard_command,
        picard_command_line)
    #self.comment_asana_task(task_name=project_igf_id, comment=message)  # send command line to Asana
  except Exception as e:
    if temp_output_dir and \
       os.path.exists(temp_output_dir):
      remove_dir(temp_output_dir)
    message = \
      'project: {2}, sample: {3}, Error in {0}: {1}'.format(
        self.__class__.__name__,
        e,
        project_igf_id,
        sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(
      message, reaction='fail')  # post msg to slack for failed jobs
    raise
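
# A minimal sketch of the Picard wrapper call above, following the signature
# used in this runnable (including its 'ribisomal_interval' keyword spelling);
# all paths and the command name are placeholders, and optional keywords are
# omitted on the assumption that they have defaults.
picard = \
  Picard_tools(
    java_exe='/usr/bin/java',
    java_param='-Xmx4g',                  # placeholder JVM options
    picard_jar='/path/to/picard.jar',
    input_files=['/path/to/sample.bam'],
    output_dir='/tmp/picard_out',
    ref_fasta='/path/to/genome.fa',
    patterned_flowcell=True,
    ref_flat_file='/path/to/genes.refFlat',
    ribisomal_interval='/path/to/ribosomal.interval_list')
temp_files, cmdline, metrics = \
  picard.run_picard_command(
    command_name='CollectRnaSeqMetrics')  # placeholder Picard command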
def run(self):
  '''
  A runnable method for running PPQT analysis
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    input_files = self.param_required('input_files')
    rscript_path = self.param_required('rscript_path')
    ppqt_exe = self.param_required('ppqt_exe')
    base_work_dir = self.param_required('base_work_dir')
    base_result_dir = self.param_required('base_result_dir')
    library_strategy = self.param_required('library_strategy')
    analysis_files = self.param_required('analysis_files')
    output_prefix = self.param_required('output_prefix')
    species_name = self.param_required('species_name')
    analysis_name = self.param('analysis_name')
    seed_date_stamp = self.param_required('date_stamp')
    load_metrics_to_cram = self.param('load_metrics_to_cram')
    ppqt_collection_type = self.param('ppqt_collection_type')
    cram_collection_type = self.param('cram_collection_type')
    collection_table = self.param('collection_table')
    force_overwrite = self.param('force_overwrite')
    use_ephemeral_space = self.param('use_ephemeral_space')
    threads = self.param('threads')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    if output_prefix is not None:
      output_prefix = \
        '{0}_{1}'.format(
          output_prefix,
          seed_date_stamp)  # add datestamp to the output file prefix
    if not isinstance(input_files, list) or \
       len(input_files) == 0:
      raise ValueError('No input file found')
    if len(input_files) > 1:
      raise ValueError(
        'More than one input file found: {0}'.format(input_files))
    if analysis_name is None:
      analysis_name = library_strategy  # use library_strategy as the default analysis_name
    input_file = input_files[0]
    work_dir_prefix = \
      os.path.join(
        base_work_dir,
        project_igf_id,
        sample_igf_id,
        experiment_igf_id)
    work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)  # get a run work dir
    ppqt_obj = \
      Ppqt_tools(
        rscript_path=rscript_path,
        ppqt_exe=ppqt_exe,
        use_ephemeral_space=use_ephemeral_space,
        threads=threads)
    ppqt_cmd, spp_output, pdf_output, spp_data = \
      ppqt_obj.run_ppqt(
        input_bam=input_file,
        output_dir=work_dir,
        output_spp_name='{0}_{1}.spp.out'.format(output_prefix, 'PPQT'),
        output_pdf_name='{0}_{1}.spp.pdf'.format(output_prefix, 'PPQT'))
    analysis_files.append(spp_output)
    au = \
      Analysis_collection_utils(
        dbsession_class=igf_session_class,
        analysis_name=analysis_name,
        tag_name=species_name,
        collection_name=experiment_igf_id,
        collection_type=ppqt_collection_type,
        collection_table=collection_table,
        base_path=base_result_dir)
    output_ppqt_list = \
      au.load_file_to_disk_and_db(
        input_file_list=[pdf_output],
        file_suffix='pdf',
        withdraw_exisitng_collection=force_overwrite)  # load file to db and disk
    if load_metrics_to_cram and \
       len(spp_data) > 0:
      ca = CollectionAdaptor(**{'session_class': igf_session_class})
      attribute_data = \
        ca.prepare_data_for_collection_attribute(
          collection_name=experiment_igf_id,
          collection_type=cram_collection_type,
          data_list=spp_data)
      ca.start_session()
      try:
        ca.create_or_update_collection_attributes(
          data=attribute_data,
          autosave=False)
        ca.commit_session()
        ca.close_session()
      except Exception as e:
        ca.rollback_session()
        ca.close_session()
        raise ValueError(
          'Failed to load data to db: {0}'.format(e))
    self.param(
      'dataflow_params',
      {'analysis_files': analysis_files,
       'output_ppqt_list': output_ppqt_list})  # pass on ppqt output list
    message = \
      'finished PPQT for {0} {1}'.format(project_igf_id, sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')  # send log to slack
    message = \
      'finished PPQT for {0} {1}: {2}'.format(
        project_igf_id, sample_igf_id, ppqt_cmd)
    self.comment_asana_task(task_name=project_igf_id, comment=message)  # send comment to Asana
  except Exception as e:
    message = \
      'project: {2}, sample: {3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(
      message, reaction='fail')  # post msg to slack for failed jobs
    raise
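
# A standalone sketch of the PPQT call above, following the Ppqt_tools signature
# used in this runnable; executable paths, file names and thread count are
# placeholders.
ppqt_obj = \
  Ppqt_tools(
    rscript_path='/usr/bin/Rscript',
    ppqt_exe='/path/to/run_spp.R',  # placeholder phantompeakqualtools script
    threads=4)
ppqt_cmd, spp_output, pdf_output, spp_data = \
  ppqt_obj.run_ppqt(
    input_bam='/path/to/sample.bam',
    output_dir='/path/to/work_dir',
    output_spp_name='sample_PPQT.spp.out',
    output_pdf_name='sample_PPQT.spp.pdf')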
def run(self):
  try:
    fastq_file = self.param_required('fastq_file')
    fastq_dir = self.param_required('fastq_dir')
    igf_session_class = self.param_required('igf_session_class')
    fastqc_exe = self.param_required('fastqc_exe')
    tag = self.param_required('tag')
    seqrun_igf_id = self.param_required('seqrun_igf_id')
    seqrun_date = self.param_required('seqrun_date')
    flowcell_id = self.param_required('flowcell_id')
    fastqc_options = self.param('fastqc_options')
    base_results_dir = self.param_required('base_results_dir')
    project_name = self.param_required('project_name')
    force_overwrite = self.param('force_overwrite')
    fastqc_dir_label = self.param('fastqc_dir_label')
    required_collection_table = self.param('required_collection_table')
    sample_name = self.param('sample_name')
    hpc_location = self.param('hpc_location')
    fastqc_collection_type = self.param('fastqc_collection_type')
    use_ephemeral_space = self.param('use_ephemeral_space')
    store_file = self.param('store_file')
    lane_index_info = os.path.basename(fastq_dir)  # get the lane and index length info
    fastq_file_label = os.path.basename(fastq_file).replace('.fastq.gz', '')
    collection_name = None
    collection_table = None
    if tag == 'known' and store_file:  # fetch sample name for known fastq, if it's not defined
      base = BaseAdaptor(**{'session_class': igf_session_class})
      base.start_session()  # connect to db
      ca = CollectionAdaptor(**{'session': base.session})
      (collection_name, collection_table) = \
        ca.fetch_collection_name_and_table_from_file_path(
          file_path=fastq_file)  # fetch collection name and table info
      if collection_table != required_collection_table:
        raise ValueError(
          'Expected collection table {0} and got {1}, {2}'.format(
            required_collection_table,
            collection_table,
            fastq_file))
      ra = RunAdaptor(**{'session': base.session})
      sample = ra.fetch_sample_info_for_run(run_igf_id=collection_name)
      sample_name = sample['sample_igf_id']
      base.close_session()
    fastqc_result_dir = \
      os.path.join(
        base_results_dir,
        project_name,
        seqrun_date,
        flowcell_id,
        lane_index_info,
        tag)  # result dir path is generic
    if sample_name is not None:
      fastqc_result_dir = \
        os.path.join(
          fastqc_result_dir,
          sample_name)  # add sample name to dir path if it's available
    fastqc_result_dir = \
      os.path.join(
        fastqc_result_dir,
        fastq_file_label,
        fastqc_dir_label)  # keep multiple files under the same dir
    if os.path.exists(fastqc_result_dir) and force_overwrite:
      remove_dir(fastqc_result_dir)  # remove existing output dir if force_overwrite is True
    if not os.path.exists(fastqc_result_dir):
      os.makedirs(fastqc_result_dir, mode=0o775)  # create output dir if it's not present
    temp_work_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)  # get a temp work dir
    if not os.path.exists(fastq_file):
      raise IOError(
        'fastq file {0} not readable'.format(fastq_file))  # raise if fastq file path is not readable
    fastqc_output = \
      os.path.join(
        temp_work_dir,
        fastq_file_label)
    os.mkdir(fastqc_output)  # create fastqc output dir
    fastqc_param = \
      self.format_tool_options(fastqc_options)  # format fastqc params
    fastqc_cmd = [
      fastqc_exe,
      '-o', fastqc_output,
      '-d', temp_work_dir]  # fastqc base parameters
    fastqc_cmd.extend(fastqc_param)  # add additional parameters
    fastqc_cmd.append(fastq_file)  # fastqc input file
    subprocess.check_call(' '.join(fastqc_cmd), shell=True)  # run fastqc
    fastqc_zip = None
    fastqc_html = None
    for root, _, files in os.walk(top=fastqc_output):
      for file in files:
        if fnmatch.fnmatch(file, '*.zip'):
          input_fastqc_zip = os.path.join(root, file)
          copy2(input_fastqc_zip, fastqc_result_dir)
          fastqc_zip = os.path.join(fastqc_result_dir, file)
        if fnmatch.fnmatch(file, '*.html'):
          input_fastqc_html = os.path.join(root, file)
          copy2(input_fastqc_html, fastqc_result_dir)
          fastqc_html = os.path.join(fastqc_result_dir, file)
    if fastqc_html is None or fastqc_zip is None:
      raise ValueError(
        'Missing required values, fastqc zip: {0}, fastqc html: {1}'.format(
          fastqc_zip, fastqc_html))
    if tag == 'known' and store_file:
      if collection_name is None:
        raise ValueError(
          'couldn\'t retrieve collection name for {0}'.format(fastq_file))
      fastqc_files = [
        {'name': collection_name,
         'type': fastqc_collection_type,
         'table': required_collection_table,
         'file_path': fastqc_zip,
         'location': hpc_location},
        {'name': collection_name,
         'type': fastqc_collection_type,
         'table': required_collection_table,
         'file_path': fastqc_html,
         'location': hpc_location}]
      ca = CollectionAdaptor(**{'session_class': igf_session_class})
      ca.start_session()
      ca.load_file_and_create_collection(data=fastqc_files)  # store fastqc files to db
      ca.close_session()
    self.param(
      'dataflow_params',
      {'fastqc_html': fastqc_html,
       'lane_index_info': lane_index_info,
       'sample_name': sample_name,
       'fastqc': {
         'fastq_dir': fastq_dir,
         'fastqc_zip': fastqc_zip,
         'fastqc_html': fastqc_html}})  # set dataflow params
  except Exception as e:
    message = \
      'seqrun: {2}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, seqrun_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')  # post msg to slack for failed jobs
    raise
def run(self):
  try:
    seqrun_igf_id = self.param_required('seqrun_igf_id')
    seqrun_source = self.param_required('seqrun_source')
    seqrun_server = self.param_required('seqrun_server')
    seqrun_user = self.param_required('seqrun_user')
    igf_session_class = self.param_required('igf_session_class')
    seqrun_md5_type = self.param_required('seqrun_md5_type')
    hpc_location = self.param_required('hpc_location')
    db_file_location_label = self.param_required('db_file_location_label')
    db_file_path_label = self.param_required('db_file_path_label')
    seqrun_path = os.path.join(seqrun_source, seqrun_igf_id)  # get new seqrun path
    seqrun_server_login = \
      '{0}@{1}'.format(seqrun_user, seqrun_server)  # get remote login address
    subprocess.check_call([
      'ssh',
      seqrun_server_login,
      'ls',
      seqrun_path])  # check remote file
    ca = CollectionAdaptor(**{'session_class': igf_session_class})  # get the md5 list from db
    ca.start_session()
    files = \
      ca.get_collection_files(
        collection_name=seqrun_igf_id,
        collection_type=seqrun_md5_type)  # fetch file collection
    files = files.to_dict(orient='records')
    ca.close_session()
    if len(files) > 1:
      raise ValueError(
        'sequencing run {0} has more than one md5 json file'.format(seqrun_igf_id))
    if len(files) == 0:
      raise ValueError(
        'sequencing run {0} does not have any md5 json file'.format(seqrun_igf_id))
    md5_json_location = files[0][db_file_location_label]
    md5_json_path = files[0][db_file_path_label]
    temp_dir = None
    if md5_json_location != hpc_location:
      temp_dir = get_temp_dir(work_dir=os.getcwd())  # create a temp directory
      destination_path = \
        os.path.join(
          temp_dir,
          os.path.basename(md5_json_path))  # get destination path for md5 file
      copy_remote_file(
        source_path=md5_json_path,
        destinationa_path=destination_path,
        source_address=seqrun_server_login)  # copy remote file to local disk
      md5_json_path = destination_path  # set md5 json file path
    with open(md5_json_path) as json_data:
      md5_json = json.load(json_data)  # read json data, get all files and md5 from json file
    self.param('sub_tasks', md5_json)  # seed dataflow
    if temp_dir is not None:
      remove_dir(temp_dir)  # remove temp dir when it's no longer required
    message = \
      'seqrun: {0}, seeded {1} files for copy'.format(
        seqrun_igf_id, len(md5_json))
    self.warning(message)
    self.post_message_to_slack(message, reaction='pass')
    self.comment_asana_task(task_name=seqrun_igf_id, comment=message)
  except Exception as e:
    message = \
      'Error in {0}: {1}, seqrun: {2}'.format(
        self.__class__.__name__, e, seqrun_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')
    self.comment_asana_task(task_name=seqrun_igf_id, comment=message)
    raise
dbconfig_path = args.dbconfig_path
collection_file_data = args.collection_file_data
calculate_checksum = args.calculate_checksum

if __name__ == '__main__':
  try:
    dbconnected = False
    if not os.path.exists(dbconfig_path):
      raise IOError(
        'Dbconfig file {0} not found'.format(dbconfig_path))
    if not os.path.exists(collection_file_data):
      raise IOError(
        'Collection data json file {0} not found'.format(collection_file_data))
    dbparam = read_dbconf_json(dbconfig_path)  # read db config
    collection_data = \
      read_json_data(collection_file_data)  # read collection data json
    ca = CollectionAdaptor(**dbparam)
    ca.start_session()  # connect to database
    dbconnected = True
    ca.load_file_and_create_collection(
      data=collection_data,
      calculate_file_size_and_md5=calculate_checksum,
      autosave=True)  # load data and commit changes
    ca.close_session()
    dbconnected = False
  except Exception as e:
    if dbconnected:
      ca.rollback_session()
      ca.close_session()
    raise ValueError('Error: {0}'.format(e))
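
# A hedged example of the collection data json consumed by this script; the
# field names mirror the load_file_and_create_collection() inputs used elsewhere
# in this codebase (name, type, table, file_path, location), and every value is
# an illustrative placeholder.
#
# [
#   {
#     "name": "IGFQ0001_EXP1",
#     "type": "CELLRANGER_RESULTS",
#     "table": "experiment",
#     "file_path": "/path/to/cellranger_count_output.tar",
#     "location": "HPC_PROJECT"
#   }
# ]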
def run(self):
  '''
  A method for running samtools commands

  :param project_igf_id: A project igf id
  :param sample_igf_id: A sample igf id
  :param experiment_igf_id: An experiment igf id
  :param igf_session_class: A database session class
  :param reference_type: Reference genome collection type, default GENOME_FASTA
  :param threads: Number of threads to use for Bam to Cram conversion, default 4
  :param base_work_dir: Base work directory
  :param samtools_command: Samtools command
  :param samFlagInclude: Sam flags to include in filtered bam, default None
  :param samFlagExclude: Sam flags to exclude from the filtered bam, default None
  :param mapq_threshold: Skip alignments with MAPQ smaller than this value, default None
  :param use_encode_filter: For samtools filter, use the ENCODE epigenome filter, i.e. samFlagExclude 1804 (PE) / 1796 (SE), default False
  :param encodePeExcludeFlag: For samtools filter, ENCODE exclude flag for PE reads, default 1804
  :param encodeSeExcludeFlag: For samtools filter, ENCODE exclude flag for SE reads, default 1796
  :param use_ephemeral_space: A toggle for temp dir settings, default 0
  :param copy_input: A toggle for copying input file to temp, 1 for True, default 0 for False
  '''
  try:
    temp_output_dir = False
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    input_files = self.param_required('input_files')
    samtools_exe = self.param_required('samtools_exe')
    reference_type = self.param('reference_type')
    threads = self.param('threads')
    base_work_dir = self.param_required('base_work_dir')
    samtools_command = self.param_required('samtools_command')
    analysis_files = self.param_required('analysis_files')
    output_prefix = self.param_required('output_prefix')
    load_metrics_to_cram = self.param('load_metrics_to_cram')
    cram_collection_type = self.param('cram_collection_type')
    collection_table = self.param('collection_table')
    base_result_dir = self.param('base_result_dir')
    analysis_name = self.param('analysis_name')
    force_overwrite = self.param('force_overwrite')
    samFlagInclude = self.param('samFlagInclude')
    samFlagExclude = self.param('samFlagExclude')
    mapq_threshold = self.param('mapq_threshold')
    library_layout = self.param_required('library_layout')
    use_encode_filter = self.param('use_encode_filter')
    species_name = self.param_required('species_name')
    seed_date_stamp = self.param_required('date_stamp')
    use_ephemeral_space = self.param('use_ephemeral_space')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    if output_prefix is not None:
      output_prefix = \
        '{0}_{1}'.format(
          output_prefix,
          seed_date_stamp)  # add datestamp to the output file prefix
    if use_encode_filter:
      samFlagInclude = None
      if library_layout == 'PAIRED':
        samFlagExclude = 1804
      else:
        samFlagExclude = 1796
    if not isinstance(input_files, list) or \
       len(input_files) == 0:
      raise ValueError('No input file found')
    if len(input_files) > 1:
      raise ValueError(
        'More than one input file found: {0}'.format(input_files))
    output_bam_cram_list = list()
    input_file = input_files[0]
    temp_output_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)  # get temp work dir
    work_dir_prefix = \
      os.path.join(
        base_work_dir,
        project_igf_id,
        sample_igf_id,
        experiment_igf_id)
    work_dir = \
      self.get_job_work_dir(work_dir=work_dir_prefix)  # get a run work dir
    samtools_cmdline = ''
    temp_output = None
    if samtools_command == 'idxstats':
      temp_output, samtools_cmdline = \
        run_bam_idxstat(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          output_dir=temp_output_dir,
          output_prefix=output_prefix,
          force=True)  # run samtools idxstats
    elif samtools_command == 'flagstat':
      temp_output, samtools_cmdline = \
        run_bam_flagstat(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          output_dir=temp_output_dir,
          output_prefix=output_prefix,
          threads=threads,
          force=True)  # run samtools flagstat
    elif samtools_command == 'stats':
      temp_output, samtools_cmdline, stats_metrics = \
        run_bam_stats(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          output_dir=temp_output_dir,
          output_prefix=output_prefix,
          threads=threads,
          force=True)  # run samtools stats
      if load_metrics_to_cram and \
         len(stats_metrics) > 0:
        ca = CollectionAdaptor(**{'session_class': igf_session_class})
        attribute_data = \
          ca.prepare_data_for_collection_attribute(
            collection_name=experiment_igf_id,
            collection_type=cram_collection_type,
            data_list=stats_metrics)
        ca.start_session()
        try:
          ca.create_or_update_collection_attributes(
            data=attribute_data,
            autosave=False)
          ca.commit_session()
          ca.close_session()
        except Exception as e:
          ca.rollback_session()
          ca.close_session()
          raise ValueError(
            'Failed to load data to db: {0}'.format(e))
    elif samtools_command == 'merge':
      if output_prefix is None:
        raise ValueError(
          'Missing output filename prefix for merged bam')
      sorted_by_name = self.param('sorted_by_name')
      temp_output = \
        os.path.join(
          work_dir,
          '{0}_merged.bam'.format(output_prefix))
      samtools_cmdline = \
        merge_multiple_bam(
          samtools_exe=samtools_exe,
          input_bam_list=input_file,
          output_bam_path=temp_output,
          sorted_by_name=sorted_by_name,
          threads=threads,
          use_ephemeral_space=use_ephemeral_space,
          force=True)
    elif samtools_command == 'view_bamToCram':
      if base_result_dir is None:
        raise ValueError(
          'base_result_dir is required for CRAM file loading')
      if analysis_name is None:
        raise ValueError(
          'analysis_name is required for CRAM file loading')
      ref_genome = \
        Reference_genome_utils(
          genome_tag=species_name,
          dbsession_class=igf_session_class,
          genome_fasta_type=reference_type)
      genome_fasta = ref_genome.get_genome_fasta()  # get genome fasta
      cram_file = \
        os.path.basename(input_file).replace('.bam', '.cram')  # get base cram file name
      cram_file = os.path.join(temp_output_dir, cram_file)  # get cram file path in temp dir
      samtools_cmdline = \
        convert_bam_to_cram(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          reference_file=genome_fasta,
          cram_path=cram_file,
          use_ephemeral_space=use_ephemeral_space,
          threads=threads,
          force=True,
          dry_run=False)
      au = \
        Analysis_collection_utils(
          dbsession_class=igf_session_class,
          analysis_name=analysis_name,
          tag_name=species_name,
          collection_name=experiment_igf_id,
          collection_type=cram_collection_type,
          collection_table=collection_table,
          base_path=base_result_dir)
      temp_output_bam_cram_list = \
        au.load_file_to_disk_and_db(
          input_file_list=[cram_file],
          file_suffix='cram',
          withdraw_exisitng_collection=force_overwrite)  # load file to db and disk
      for cram in temp_output_bam_cram_list:
        index_bam_or_cram(
          samtools_exe=samtools_exe,
          input_path=cram,
          threads=threads,
          dry_run=False)
        index_path = '{0}.crai'.format(cram)
        output_bam_cram_list.append(cram)
        output_bam_cram_list.append(index_path)
      if len(output_bam_cram_list) == 0:
        raise ValueError('No output cram file found')
    elif samtools_command == 'view_filterBam':
      temp_output_bam = \
        os.path.join(
          temp_output_dir,
          os.path.basename(input_file).replace('.bam', '.filtered.bam'))
      samtools_cmdline = \
        filter_bam_file(
          samtools_exe=samtools_exe,
          input_bam=input_file,
          output_bam=temp_output_bam,
          samFlagInclude=samFlagInclude,
          samFlagExclude=samFlagExclude,
          threads=threads,
          mapq_threshold=mapq_threshold,
          index_output=False,
          dry_run=False)
      dest_path = \
        os.path.join(
          work_dir,
          os.path.basename(temp_output_bam))
      move_file(
        source_path=temp_output_bam,
        destinationa_path=dest_path,
        force=True)
      index_bam_or_cram(
        samtools_exe=samtools_exe,
        input_path=dest_path,
        threads=threads,
        dry_run=False)
      index_path = '{0}.bai'.format(dest_path)
      output_bam_cram_list.append(dest_path)
      output_bam_cram_list.append(index_path)
    else:
      raise ValueError(
        'Samtools command {0} not supported'.format(samtools_command))
    if temp_output is not None:
      dest_path = \
        os.path.join(
          work_dir,
          os.path.basename(temp_output))
      if dest_path != temp_output:
        move_file(
          source_path=temp_output,
          destinationa_path=dest_path,
          force=True)
      analysis_files.append(dest_path)
    self.param(
      'dataflow_params',
      {'analysis_files': analysis_files,
       'output_bam_cram_list': output_bam_cram_list})  # pass on samtools output list
    message = \
      'finished samtools {0} for {1} {2}'.format(
        samtools_command,
        project_igf_id,
        sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')  # send log to slack
    message = \
      'finished samtools {0} for {1} {2}: {3}'.format(
        samtools_command,
        project_igf_id,
        sample_igf_id,
        samtools_cmdline)
    #self.comment_asana_task(task_name=project_igf_id, comment=message)  # send comment to Asana
  except Exception as e:
    message = \
      'project: {2}, sample: {3}, Error in {0}: {1}'.format(
        self.__class__.__name__,
        e,
        project_igf_id,
        sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(
      message, reaction='fail')  # post msg to slack for failed jobs
    raise
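
# A note on the ENCODE exclude flags used above: 1796 = 4 (read unmapped)
# + 256 (not primary alignment) + 512 (fails QC) + 1024 (duplicate), and the
# paired-end value adds 8 (mate unmapped), giving 1796 + 8 = 1804. Below is a
# standalone sketch of the filter call with the signature used in this runnable;
# paths and the MAPQ cutoff are placeholders.
filter_bam_file(
  samtools_exe='/usr/bin/samtools',
  input_bam='/path/to/sample.bam',
  output_bam='/path/to/sample.filtered.bam',
  samFlagInclude=None,
  samFlagExclude=1804,  # ENCODE PE exclude flag
  mapq_threshold=20,    # placeholder MAPQ cutoff
  threads=4,
  index_output=False,
  dry_run=False)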
def run(self):
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    file_list = self.param_required('file_list')
    remote_user = self.param_required('remote_user')
    remote_host = self.param_required('remote_host')
    remote_project_path = self.param_required('remote_project_path')
    dir_labels = self.param_required('dir_labels')
    igf_session_class = self.param_required('igf_session_class')
    force_overwrite = self.param('force_overwrite')
    collect_remote_file = self.param('collect_remote_file')
    collection_name = self.param('collection_name')
    collection_type = self.param('collection_type')
    collection_table = self.param('collection_table')
    file_location = self.param('file_location')
    use_ephemeral_space = self.param('use_ephemeral_space')
    destination_output_path = \
      os.path.join(
        remote_project_path,
        project_igf_id)  # get base destination path
    if isinstance(dir_labels, list) and \
       len(dir_labels) > 0:
      destination_output_path = \
        os.path.join(destination_output_path, *dir_labels)
    if collect_remote_file:
      if collection_name is None or \
         collection_type is None:
        raise ValueError('Name and type are required for db collection')
    output_file_list = list()
    temp_work_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)  # get temp dir
    for file in file_list:
      if not os.path.exists(file):
        raise IOError('file {0} not found'.format(file))
      if os.path.isfile(file):
        copy2(
          file,
          os.path.join(
            temp_work_dir,
            os.path.basename(file)))  # copy file to a temp dir
        dest_file_path = \
          os.path.join(
            destination_output_path,
            os.path.basename(file))  # get destination file path
        os.chmod(
          os.path.join(
            temp_work_dir,
            os.path.basename(file)),
          mode=0o764)  # set file permission
      elif os.path.isdir(file):
        copytree(
          file,
          os.path.join(
            temp_work_dir,
            os.path.basename(file)))  # copy dir to a temp dir
        dest_file_path = destination_output_path
        for root, dirs, files in os.walk(temp_work_dir):
          for dir_name in dirs:
            os.chmod(
              os.path.join(root, dir_name),
              mode=0o775)
          for file_name in files:
            os.chmod(
              os.path.join(root, file_name),
              mode=0o764)  # change file and dir permissions for remote files
      else:
        raise ValueError(
          'Unknown source file type: {0}'.format(file))
      copy_remote_file(
        source_path=os.path.join(temp_work_dir, os.path.basename(file)),
        destinationa_path=dest_file_path,
        destination_address='{0}@{1}'.format(remote_user, remote_host),
        force_update=force_overwrite)  # copy file to remote
      if os.path.isdir(file):
        dest_file_path = \
          os.path.join(
            dest_file_path,
            os.path.basename(file))  # fix for dir input
      output_file_list.append(dest_file_path)
    remove_dir(dir_path=temp_work_dir)  # remove temp dir
    self.param(
      'dataflow_params',
      {'status': 'done',
       'output_list': output_file_list})  # add dataflow params
    if collect_remote_file:
      data = list()
      remove_data_list = [
        {'name': collection_name,
         'type': collection_type}]
      for file in output_file_list:
        data.append(
          {'name': collection_name,
           'type': collection_type,
           'table': collection_table,
           'file_path': file,
           'location': file_location})
      ca = CollectionAdaptor(**{'session_class': igf_session_class})
      ca.start_session()
      try:
        ca.remove_collection_group_info(
          data=remove_data_list,
          autosave=False)  # remove existing data before loading the new collection
        ca.load_file_and_create_collection(
          data=data,
          autosave=False,
          calculate_file_size_and_md5=False)  # load remote files to db
        ca.commit_session()  # commit changes
        ca.close_session()
      except Exception:
        ca.rollback_session()  # rollback changes
        ca.close_session()
        raise
  except Exception as e:
    message = \
      'project: {2}, sample: {3}, Error in {0}: {1}'.format(
        self.__class__.__name__,
        e,
        project_igf_id,
        sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')  # post msg to slack for failed jobs
    raise
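
# A standalone sketch of the remote transfer used above, following the
# copy_remote_file signature in this codebase (note that 'destinationa_path' is
# the parameter name as spelled throughout these runnables); user, host and
# paths are placeholders.
copy_remote_file(
  source_path='/tmp/staging/report.html',
  destinationa_path='/www/projects/IGFQ0001/report.html',
  destination_address='{0}@{1}'.format('user', 'remote.example.com'),
  force_update=True)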