def test_find_new_analysis_seeds2(self):
    '''
    Seeding with a project list file that names 'projectA'

    Checks that no available experiments are returned, that 'projectA' is
    reported among the seeded entries and that exactly one SEEDED experiment
    row ('sampleA_MISEQ') is present for the PrimaryAnalysis pipeline.
    '''
    # NOTE(review): 'base' is never used below; kept from the original in case
    # constructing the adaptor matters for the fixture - confirm and remove
    base = BaseAdaptor(**{'session_class': self.session_class})
    project_name_file = \
        os.path.join(self.temp_dir, 'project_name_list.txt')
    with open(project_name_file, 'w') as fp:
        fp.write('projectA')
    available_exps, seeded_exps = \
        find_new_analysis_seeds(
            dbconfig_path=self.dbconfig,
            pipeline_name='PrimaryAnalysis',
            project_name_file=project_name_file,
            species_name_list=['HG38'],
            fastq_type='demultiplexed_fastq',
            library_source_list=['TRANSCRIPTOMIC_SINGLE_CELL'])
    self.assertIsNone(available_exps)
    self.assertIn('projectA', seeded_exps)
    pla = PipelineAdaptor(**{'session_class': self.session_class})
    pla.start_session()
    _, exp_data = \
        pla.fetch_pipeline_seed_with_table_data(
            pipeline_name='PrimaryAnalysis',
            table_name='experiment',
            status='SEEDED')
    pla.close_session()
    exp_data = exp_data.to_dict(orient='records')
    # assertTrue(len(exp_data), 1) treated 1 as the failure message and only
    # checked truthiness; assertEqual actually verifies the record count
    self.assertEqual(len(exp_data), 1)
    self.assertEqual(exp_data[0]['experiment_igf_id'], 'sampleA_MISEQ')
def test_find_new_analysis_seeds1(self):
    '''
    Seeding with an empty project list file

    Checks that 'projectA' is reported as available, that nothing is reported
    as seeded and that no SEEDED rows exist for the PrimaryAnalysis pipeline.
    '''
    list_path = os.path.join(self.temp_dir, 'project_name_list.txt')
    with open(list_path, 'w') as handle:
        handle.write('')                                    # empty project list
    seed_args = dict(
        dbconfig_path=self.dbconfig,
        pipeline_name='PrimaryAnalysis',
        project_name_file=list_path,
        species_name_list=['HG38'],
        fastq_type='demultiplexed_fastq',
        library_source_list=['TRANSCRIPTOMIC_SINGLE_CELL'])
    available_exps, seeded_exps = find_new_analysis_seeds(**seed_args)
    self.assertTrue('projectA' in available_exps)
    self.assertTrue(seeded_exps is None)
    adaptor = PipelineAdaptor(session_class=self.session_class)
    adaptor.start_session()
    seeded_data, _ = \
        adaptor.fetch_pipeline_seed_with_table_data(
            pipeline_name='PrimaryAnalysis',
            table_name='experiment',
            status='SEEDED')
    adaptor.close_session()
    seeded_row_count = len(seeded_data.index)
    self.assertEqual(seeded_row_count, 0)
def test_load_new_pipeline_data(self):
    '''
    Load pipeline records from the data file and look one up by name
    '''
    expected_name = 'demultiplexing_fastq'
    load_new_pipeline_data(
        data_file=self.data_file,
        dbconfig=self.dbconfig)
    adaptor = PipelineAdaptor(session_class=self.session_class)
    adaptor.start_session()
    record = \
        adaptor.fetch_pipeline_records_pipeline_name(
            pipeline_name=expected_name)
    adaptor.close_session()
    self.assertEqual(record.pipeline_name, expected_name)
def test_fetch_pipeline_seed_with_table_data(self):
    '''
    Fetch seeds of 'demultiplexing_fastq' together with their table rows

    Checks that the table data converts to a list of records, that it has as
    many records as the pipeline seed data and that it carries a
    'seqrun_igf_id' column.
    '''
    adaptor = PipelineAdaptor(session_class=self.session_class)
    adaptor.start_session()
    seed_frame, table_frame = \
        adaptor.fetch_pipeline_seed_with_table_data(
            pipeline_name='demultiplexing_fastq')
    adaptor.close_session()
    table_records = table_frame.to_dict(orient='records')
    self.assertIsInstance(table_records, list)
    seed_records = seed_frame.to_dict(orient='records')
    self.assertEqual(len(table_records), len(seed_records))
    column_names = list(table_frame.columns)
    self.assertTrue('seqrun_igf_id' in column_names)
def test_seed_new_experiments(self):
    '''
    Seed new experiments for PrimaryAnalysis without a project filter

    Checks that exactly one entry, 'IGFQ000123_avik_10-4-2018_Miseq', is
    returned as the first element of seed_new_experiments' result.
    '''
    pl = PipelineAdaptor(**{'session_class': self.session_class})
    pl.start_session()
    try:
        new_exps, _ = \
            pl.seed_new_experiments(
                pipeline_name='PrimaryAnalysis',
                species_name_list=['HG38'],
                fastq_type='demultiplexed_fastq')
    finally:
        pl.close_session()                  # the original never closed this session
    self.assertEqual(len(new_exps), 1)
    self.assertEqual(new_exps[0], 'IGFQ000123_avik_10-4-2018_Miseq')
def test_create_pipeline_seed(self):
    '''
    A seed record carrying only 'seed_id' and 'seed_table' is expected to be
    rejected by create_pipeline_seed with a ValueError
    '''
    incomplete_seed = {'seed_id': '1', 'seed_table': 'seqrun'}
    adaptor = PipelineAdaptor(session_class=self.session_class)
    adaptor.start_session()
    with self.assertRaises(ValueError):
        adaptor.create_pipeline_seed(data=[incomplete_seed])
    adaptor.close_session()
def load_new_pipeline_data(data_file, dbconfig):
    '''
    A method for loading new data for pipeline table

    :param data_file: A data file, read with read_json_data
    :param dbconfig: A database configuration file, read with read_dbconf_json
    :returns: None
    '''
    formatted_data = read_json_data(data_file)
    dbparam = read_dbconf_json(dbconfig)
    pp = PipelineAdaptor(**dbparam)
    pp.start_session()
    try:
        pp.store_pipeline_data(data=formatted_data)
    finally:
        pp.close_session()                  # always release the db session, even on failure
def test_fetch_pipeline_seed_with_table_data(self):
    '''
    Fetch seeds of the 'alignment' pipeline joined to the experiment table

    Checks the record counts of both dataframes match and that the first table
    record carries the expected experiment and project igf ids.
    '''
    adaptor = PipelineAdaptor(session_class=self.session_class)
    adaptor.start_session()
    seed_frame, table_frame = \
        adaptor.fetch_pipeline_seed_with_table_data(
            pipeline_name='alignment',
            table_name='experiment')
    adaptor.close_session()
    table_records = table_frame.to_dict(orient='records')
    self.assertIsInstance(table_records, list)
    seed_records = seed_frame.to_dict(orient='records')
    self.assertEqual(len(table_records), len(seed_records))
    first_record = table_records[0]
    exp_id, project_id = \
        first_record['experiment_igf_id'], first_record['project_igf_id']
    self.assertEqual(exp_id, 'IGF00001_HISEQ4000')
    self.assertEqual(project_id, 'IGFP0001_test_22-8-2017_rna_sc')
    column_names = list(table_frame.columns)
    self.assertTrue('experiment_igf_id' in column_names)
def find_new_analysis_seeds(dbconfig_path, pipeline_name, project_name_file,
                            species_name_list, fastq_type, library_source_list):
    '''
    A utils method for finding and seeding new experiments for analysis

    :param dbconfig_path: A database configuration file
    :param pipeline_name: Pipeline name
    :param project_name_file: A file containing the list of projects for seeding pipeline,
                              one project name per line
    :param species_name_list: A list of species to consider for seeding analysis
    :param fastq_type: Fastq collection type
    :param library_source_list: A list of library source info to consider for seeding analysis
    :returns: List of available experiments or None and a list of seeded experiments or None
    :raises IOError: If project_name_file does not exist
    '''
    if not os.path.exists(project_name_file):
        raise IOError('File {0} not found'.format(project_name_file))

    with open(project_name_file, 'r') as fp:
        stripped_lines = (line.strip() for line in fp)
        # skip blank lines so a stray newline is not treated as a project named ''
        project_list = [name for name in stripped_lines if name]
    if not project_list:
        project_list = None                 # no project names found, pass None as before

    dbparam = read_dbconf_json(dbconfig_path)
    pl = PipelineAdaptor(**dbparam)
    pl.start_session()
    try:
        available_exps, seeded_exps = \
            pl.seed_new_experiments(
                pipeline_name=pipeline_name,
                species_name_list=species_name_list,
                fastq_type=fastq_type,
                project_list=project_list,
                library_source_list=library_source_list)
    finally:
        pl.close_session()                  # release the db session even if seeding fails
    return available_exps, seeded_exps
def test_update_pipeline_seed(self):
    '''
    Update seed status for the 'demultiplexing_fastq' pipeline

    A record without 'status' is expected to raise ValueError; records with
    status 'RUNNING' are applied and then read back.
    '''
    def seqrun_seed(seed_id, **extra):
        # build one seed record for the seqrun table, optional keys go last
        record = {
            'pipeline_name': 'demultiplexing_fastq',
            'seed_id': seed_id,
            'seed_table': 'seqrun'}
        record.update(extra)
        return record

    adaptor = PipelineAdaptor(session_class=self.session_class)
    adaptor.start_session()

    with self.assertRaises(ValueError):
        adaptor.update_pipeline_seed(data=[seqrun_seed('2')])

    adaptor.update_pipeline_seed(data=[seqrun_seed('2', status='RUNNING')])
    seeds_first, table_first = \
        adaptor.fetch_pipeline_seed_with_table_data(
            pipeline_name='demultiplexing_fastq')
    self.assertEqual(
        len(table_first.to_dict(orient='records')),
        len(seeds_first.to_dict(orient='records')))

    adaptor.update_pipeline_seed(data=[seqrun_seed('1', status='RUNNING')])
    running_seeds, _ = \
        adaptor.fetch_pipeline_seed_with_table_data(
            pipeline_name='demultiplexing_fastq',
            status='RUNNING')
    adaptor.close_session()

    seed_one = running_seeds.loc[running_seeds.seed_id == 1]
    self.assertEqual(seed_one['status'].values[0], 'RUNNING')
def test_seed_new_experiments1(self):
    '''
    Seed new experiments for PrimaryAnalysis restricted to one project

    Checks that the first element returned by seed_new_experiments is empty
    and that exactly one SEEDED experiment, 'IGF103923_MISEQ', is then present.
    '''
    pl = PipelineAdaptor(**{'session_class': self.session_class})
    pl.start_session()
    try:
        new_exps, _ = \
            pl.seed_new_experiments(
                pipeline_name='PrimaryAnalysis',
                species_name_list=['HG38'],
                fastq_type='demultiplexed_fastq',
                project_list=['IGFQ000123_avik_10-4-2018_Miseq'],
                library_source_list=['TRANSCRIPTOMIC_SINGLE_CELL'])
        self.assertFalse(new_exps)
    finally:
        pl.close_session()                  # close even when the assertion fails

    pl = PipelineAdaptor(**{'session_class': self.session_class})
    pl.start_session()
    try:
        _, exp_data = \
            pl.fetch_pipeline_seed_with_table_data(
                pipeline_name='PrimaryAnalysis',
                table_name='experiment',
                status='SEEDED')
    finally:
        pl.close_session()                  # the original left this second session open
    self.assertEqual(len(list(exp_data['experiment_igf_id'].values)), 1)
    self.assertEqual(exp_data['experiment_igf_id'].values[0], 'IGF103923_MISEQ')
def run(self):
    '''
    Run method for changing the status of a pipeline seed record

    Reads the following required params and updates the matching pipeline
    seed entry, then reports the change to slack and asana.

    :param igf_session_class: A database session class
    :param pipeline_name: Name of the pipeline
    :param igf_id: An id used only for labelling the error message
    :param task_id: Asana task name used for the comment
    :param seed_id: Seed id, converted to int before the update
    :param seed_table: Name of the seed table
    :param new_status: New status value, upper-cased before the update
    '''
    # pre-bound so the error handler can build its message even when a
    # required param is missing before igf_id has been read
    igf_id = None
    try:
        igf_session_class = self.param_required('igf_session_class')       # set by base class
        pipeline_name = self.param_required('pipeline_name')
        igf_id = self.param_required('igf_id')
        task_id = self.param_required('task_id')
        seed_id = self.param_required('seed_id')
        seed_table = self.param_required('seed_table')
        new_status = self.param_required('new_status')
        status_label = new_status.upper()
        pa = PipelineAdaptor(**{'session_class': igf_session_class})
        pa.start_session()                                                  # connect to db
        try:
            pa.update_pipeline_seed(
                data=[{'pipeline_name': pipeline_name,
                       'seed_id': int(seed_id),
                       'seed_table': seed_table,
                       'status': status_label}])                            # update seed record in db
        finally:
            pa.close_session()                                              # close db connection, also on failure
        message = \
            'changing status in {0} for seed {1} as {2}'.\
            format(
                pipeline_name,
                seed_id,
                status_label)                                               # format message
        self.post_message_to_slack(message, reaction='pass')                # send message to slack
        self.comment_asana_task(task_name=task_id, comment=message)         # send message to asana
    except Exception as e:
        message = \
            'seqrun: {2}, Error in {0}: {1}'.\
            format(
                self.__class__.__name__,
                e,
                igf_id)
        self.warning(message)
        self.post_message_to_slack(message, reaction='fail')                # post msg to slack for failed jobs
        raise
def test_fetch_pipeline_records_pipeline_name(self):
    '''
    Look up the 'demultiplexing_fastq' pipeline record and check its id is 1
    '''
    pl = PipelineAdaptor(**{'session_class': self.session_class})
    pl.start_session()
    try:
        pl_data = \
            pl.fetch_pipeline_records_pipeline_name(
                pipeline_name='demultiplexing_fastq')
        self.assertEqual(pl_data.pipeline_id, 1)
    finally:
        pl.close_session()                  # the original never closed this session
def run(self):
    '''
    Run method for the seed job factory class of the all pipelines

    :param igf_session_class: A database session class
    :param pipeline_name: Name of the pipeline
    :param seed_id_label: A text label for the seed_id, default seed_id
    :param seqrun_id_label: A text for seqrun_id column name, default seqrun_id
    :param seqrun_date_label: A text label for the seqrun date, default seqrun_date
    :param seqrun_igf_id_label: A text label for sequencing run igf id, default seqrun_igf_id
    :param seeded_label: A text label for the status seeded in pipeline_seed table, default SEEDED
    :param running_label: A text label for the status running in the pipeline_seed table, default RUNNING
    :param seed_status_label: A text label for the pipeline_seed status column name, default status
    :param experiment_id_label: A text label for the experiment_id, default experiment_id
    :param pipeseed_mode: A text label for pipeline mode, default demultiplexing
                          Allowed values are

                            demultiplexing
                            alignment

    :returns: A list of dictionary containing the seqrun ids or experiment_igf_ids seed for analysis
    '''
    dbconnected = False
    # pre-bound so the error handler can build its message even when a
    # required param is missing before pipeline_name has been read
    pipeline_name = None
    try:
        igf_session_class = self.param_required('igf_session_class')       # set by base class
        pipeline_name = self.param_required('pipeline_name')
        seed_id_label = self.param_required('seed_id_label')
        seqrun_id_label = self.param_required('seqrun_id_label')
        seeded_label = self.param_required('seeded_label')
        running_label = self.param_required('running_label')
        seqrun_date_label = self.param_required('seqrun_date_label')
        seqrun_igf_id_label = self.param_required('seqrun_igf_id_label')
        seed_status_label = self.param_required('seed_status_label')
        experiment_id_label = self.param_required('experiment_id_label')
        pipeseed_mode = self.param_required('pipeseed_mode')

        if pipeseed_mode not in ('demultiplexing', 'alignment'):
            raise ValueError('Pipeseed_mode {0} not supported'.format(pipeseed_mode))

        # forward the configured column labels; previously they were read but
        # never passed on, so get_pipeline_seeds always used its own defaults
        pipeseeds_data, seed_data = \
            get_pipeline_seeds(
                pipeseed_mode=pipeseed_mode,
                pipeline_name=pipeline_name,
                igf_session_class=igf_session_class,
                seed_id_label=seed_id_label,
                seqrun_date_label=seqrun_date_label,
                seqrun_id_label=seqrun_id_label,
                experiment_id_label=experiment_id_label,
                seqrun_igf_id_label=seqrun_igf_id_label)                    # fetch pipeseed data from db
        if len(seed_data.index) > 0:
            seed_data = seed_data.to_dict(orient='records')                 # convert dataframe to list of dictionaries
            self.param('sub_tasks', seed_data)                              # set sub_tasks param for the data flow
            pipeseeds_data[seed_status_label] = \
                pipeseeds_data[seed_status_label].\
                map({seeded_label: running_label})                          # update seed records in pipeseed table, changed status to RUNNING
            pa = PipelineAdaptor(**{'session_class': igf_session_class})    # get db adaptor
            pa.start_session()                                              # connect to db
            dbconnected = True
            pa.update_pipeline_seed(
                data=pipeseeds_data.to_dict(orient='records'),
                autosave=False)                                             # set pipeline seeds as running
            pa.commit_session()                                             # save changes to db
            pa.close_session()                                              # close db connection
            dbconnected = False
            message = \
                'Total {0} new job found for {1}, pipeline: {2}'.\
                format(
                    len(seed_data),
                    self.__class__.__name__,
                    pipeline_name)                                          # format msg for slack
            self.post_message_to_slack(message, reaction='pass')            # send update to slack
        else:
            message = \
                '{0}, {1}: no new job created'.\
                format(
                    self.__class__.__name__,
                    pipeline_name)                                          # format msg for failed jobs
            self.warning(message)
            self.post_message_to_slack(message, reaction='sleep')           # post about failed job to slack
    except Exception as e:
        message = \
            'Error in {0},{1}: {2}'.\
            format(
                self.__class__.__name__,
                pipeline_name,
                e)                                                          # format slack msg
        self.warning(message)
        self.post_message_to_slack(message, reaction='fail')                # send msg to slack
        if dbconnected:
            pa.rollback_session()                                           # remove changes from db
            pa.close_session()
        raise                                                               # mark worker as failed
def get_pipeline_seeds(pipeseed_mode, pipeline_name, igf_session_class,
                       seed_id_label='seed_id', seqrun_date_label='seqrun_date',
                       seqrun_id_label='seqrun_id',
                       experiment_id_label='experiment_id',
                       seqrun_igf_id_label='seqrun_igf_id'):
    '''
    A utils function for fetching pipeline seed information

    :param pipeseed_mode: A string info about pipeseed mode, allowed values are

                            demultiplexing
                            alignment

    :param pipeline_name: A string info about pipeline name
    :param igf_session_class: A database session class for pipeline seed lookup
    :param seed_id_label: Seed id column name in the pipeseed data, default seed_id
    :param seqrun_date_label: Column name for the derived seqrun date, default seqrun_date
    :param seqrun_id_label: Seqrun id column name in the table data, default seqrun_id
    :param experiment_id_label: Experiment id column name in the table data, default experiment_id
    :param seqrun_igf_id_label: Seqrun igf id column name in the table data, default seqrun_igf_id
    :returns: Two Pandas dataframes, first with pipeseed entries and second with seed info
    :raises ValueError: If pipeseed_mode is not supported
    :raises AttributeError: If the db lookup does not return two dataframes
    '''
    if pipeseed_mode not in ('demultiplexing', 'alignment'):
        raise ValueError(
            'Pipeseed_mode {0} not supported'.format(pipeseed_mode))

    is_demultiplexing = pipeseed_mode == 'demultiplexing'
    table_name = 'seqrun' if is_demultiplexing else 'experiment'

    pa = PipelineAdaptor(**{'session_class': igf_session_class})            # get db adaptor
    pa.start_session()                                                      # connect to db
    try:
        pipeseeds_data, table_data = \
            pa.fetch_pipeline_seed_with_table_data(
                pipeline_name,
                table_name=table_name)                                      # fetch seed entries and matching table rows
    finally:
        pa.close_session()                                                  # the original never closed this session

    if not isinstance(pipeseeds_data, pd.DataFrame) or \
       not isinstance(table_data, pd.DataFrame):
        # format the message itself; the original called .format on the
        # exception instance, which failed before the message was built
        raise AttributeError(
            'Expecting a pandas dataframe of pipeseed data and received {0}, {1}'.
            format(type(pipeseeds_data), type(table_data)))

    seed_data = pd.DataFrame()
    if len(pipeseeds_data.index) > 0 and \
       len(table_data.index) > 0:
        pipeseeds_data[seed_id_label] = \
            pipeseeds_data[seed_id_label].map(int)                          # convert pipeseed column type
        # the table column joined against the seed id depends on the mode
        join_label = seqrun_id_label if is_demultiplexing else experiment_id_label
        table_data[join_label] = \
            table_data[join_label].map(int)                                 # convert table id column type
        merged_data = \
            pd.merge(
                pipeseeds_data,
                table_data,
                how='inner',
                left_on=[seed_id_label],
                right_on=[join_label])                                      # join dataframes
        if is_demultiplexing:
            merged_data[seqrun_date_label] = \
                merged_data[seqrun_igf_id_label].\
                map(lambda x: _get_date_from_seqrun(seqrun_igf_id=x))       # get seqrun date from seqrun id
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1;
        # use whichever one this pandas version provides
        elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
        seed_data = elementwise(merged_data, str)                           # convert every value to string
    return pipeseeds_data, seed_data