def test_seed_new_experiments(self):
    '''Seed new experiments for PrimaryAnalysis and check that the
    expected project is reported as newly available.'''
    pl = PipelineAdaptor(**{'session_class': self.session_class})
    pl.start_session()
    try:
        new_exps, _ = \
            pl.seed_new_experiments(
                pipeline_name='PrimaryAnalysis',
                species_name_list=['HG38'],
                fastq_type='demultiplexed_fastq')
        self.assertEqual(len(new_exps), 1)
        self.assertEqual(new_exps[0], 'IGFQ000123_avik_10-4-2018_Miseq')
    finally:
        pl.close_session()  # fix: session was previously left open
def test_find_new_analysis_seeds2(self):
    '''Seed analysis for a named project and check that exactly one
    experiment gets the SEEDED status.'''
    project_name_file = os.path.join(self.temp_dir, 'project_name_list.txt')
    with open(project_name_file, 'w') as fp:
        fp.write('projectA')
    available_exps, seeded_exps = \
        find_new_analysis_seeds(
            dbconfig_path=self.dbconfig,
            pipeline_name='PrimaryAnalysis',
            project_name_file=project_name_file,
            species_name_list=['HG38'],
            fastq_type='demultiplexed_fastq',
            library_source_list=['TRANSCRIPTOMIC_SINGLE_CELL'])
    self.assertTrue(available_exps is None)
    self.assertTrue('projectA' in seeded_exps)
    pla = PipelineAdaptor(**{'session_class': self.session_class})
    pla.start_session()
    seeded_data, exp_data = pla.fetch_pipeline_seed_with_table_data(
        pipeline_name='PrimaryAnalysis',
        table_name='experiment',
        status='SEEDED')
    pla.close_session()
    exp_data = exp_data.to_dict(orient='records')
    # fix: was assertTrue(len(exp_data), 1), which treats 1 as the failure
    # message and passes for any non-empty list
    self.assertEqual(len(exp_data), 1)
    self.assertEqual(exp_data[0]['experiment_igf_id'], 'sampleA_MISEQ')
def test_find_new_analysis_seeds1(self):
    '''With an empty project list file, experiments are reported as
    available but nothing gets seeded.'''
    project_name_file = os.path.join(self.temp_dir, 'project_name_list.txt')
    with open(project_name_file, 'w') as fp:
        fp.write('')
    available_exps, seeded_exps = \
        find_new_analysis_seeds(
            dbconfig_path=self.dbconfig,
            pipeline_name='PrimaryAnalysis',
            project_name_file=project_name_file,
            species_name_list=['HG38'],
            fastq_type='demultiplexed_fastq',
            library_source_list=['TRANSCRIPTOMIC_SINGLE_CELL'])
    self.assertTrue('projectA' in available_exps)
    self.assertTrue(seeded_exps is None)
    # confirm that no pipeline seed entry was created
    pla = PipelineAdaptor(**{'session_class': self.session_class})
    pla.start_session()
    seeded_data, exp_data = pla.fetch_pipeline_seed_with_table_data(
        pipeline_name='PrimaryAnalysis',
        table_name='experiment',
        status='SEEDED')
    pla.close_session()
    self.assertEqual(len(seeded_data.index), 0)
def test_create_pipeline_seed(self):
    '''Seed data without a pipeline_name must be rejected.'''
    incomplete_seed = [
        {'seed_id': '1',
         'seed_table': 'seqrun'}]
    adaptor = PipelineAdaptor(**{'session_class': self.session_class})
    adaptor.start_session()
    with self.assertRaises(ValueError):
        adaptor.create_pipeline_seed(data=incomplete_seed)
    adaptor.close_session()
def test_load_new_pipeline_data(self):
    '''Load pipeline records from a JSON file and fetch one back by name.'''
    load_new_pipeline_data(
        data_file=self.data_file,
        dbconfig=self.dbconfig)
    adaptor = PipelineAdaptor(**{'session_class': self.session_class})
    adaptor.start_session()
    record = adaptor.fetch_pipeline_records_pipeline_name(
        pipeline_name='demultiplexing_fastq')
    adaptor.close_session()
    self.assertEqual(record.pipeline_name, 'demultiplexing_fastq')
def load_new_pipeline_data(data_file, dbconfig):
    '''
    A method for loading new data for pipeline table

    :param data_file: A JSON file containing new pipeline entries
    :param dbconfig: A database configuration file
    '''
    formatted_data = read_json_data(data_file)
    dbparam = read_dbconf_json(dbconfig)
    pp = PipelineAdaptor(**dbparam)
    pp.start_session()
    try:
        pp.store_pipeline_data(data=formatted_data)
    finally:
        # fix: close the session even if the store fails; the old
        # `except: raise` wrapper added nothing and leaked the session
        pp.close_session()
def test_fetch_pipeline_seed_with_table_data(self):
    '''Fetched seed records and seqrun table rows should pair up 1:1.'''
    adaptor = PipelineAdaptor(**{'session_class': self.session_class})
    adaptor.start_session()
    (pipe_seed, table_data) = adaptor.fetch_pipeline_seed_with_table_data(
        pipeline_name='demultiplexing_fastq')
    adaptor.close_session()
    table_records = table_data.to_dict(orient='records')
    seed_records = pipe_seed.to_dict(orient='records')
    self.assertIsInstance(table_records, list)
    self.assertEqual(len(table_records), len(seed_records))
    self.assertTrue('seqrun_igf_id' in list(table_data.columns))
def setUp(self):
    '''Build a test db with platform, seqrun and pipeline fixtures plus
    one seeded pipeline entry.'''
    self.dbconfig = 'data/dbconfig.json'
    self.platform_json = 'data/platform_db_data.json'
    self.seqrun_json = 'data/seqrun_db_data.json'
    self.pipeline_json = 'data/pipeline_data.json'
    dbparam = read_dbconf_json(self.dbconfig)
    base = BaseAdaptor(**dbparam)
    self.engine = base.engine
    self.dbname = dbparam['dbname']
    Base.metadata.create_all(self.engine)
    self.session_class = base.get_session_class()
    base.start_session()
    # load platform data
    pl = PlatformAdaptor(**{'session': base.session})
    pl.store_platform_data(data=read_json_data(self.platform_json))
    # load seqrun data
    sra = SeqrunAdaptor(**{'session': base.session})
    sra.store_seqrun_and_attribute_data(
        data=read_json_data(self.seqrun_json))
    # load pipeline data and seed one seqrun entry
    pla = PipelineAdaptor(**{'session': base.session})
    pla.store_pipeline_data(data=read_json_data(self.pipeline_json))
    pipeline_seed_data = [
        {'pipeline_name': 'demultiplexing_fastq',
         'seed_id': '1',
         'seed_table': 'seqrun'}]
    pla.create_pipeline_seed(data=pipeline_seed_data)
    base.close_session()
def test_fetch_pipeline_seed_with_table_data(self):
    '''Fetch alignment seeds joined with experiment data and check the
    first experiment and project ids.'''
    adaptor = PipelineAdaptor(**{'session_class': self.session_class})
    adaptor.start_session()
    (pipe_seed, table_data) = adaptor.fetch_pipeline_seed_with_table_data(
        pipeline_name='alignment',
        table_name='experiment')
    adaptor.close_session()
    table_records = table_data.to_dict(orient='records')
    self.assertIsInstance(table_records, list)
    self.assertEqual(len(table_records),
                     len(pipe_seed.to_dict(orient='records')))
    first_record = table_records[0]
    self.assertEqual(first_record['experiment_igf_id'], 'IGF00001_HISEQ4000')
    self.assertEqual(first_record['project_igf_id'],
                     'IGFP0001_test_22-8-2017_rna_sc')
    self.assertTrue('experiment_igf_id' in list(table_data.columns))
def find_new_analysis_seeds(dbconfig_path, pipeline_name, project_name_file,
                            species_name_list, fastq_type,
                            library_source_list):
    '''
    A utils method for finding and seeding new experiments for analysis

    :param dbconfig_path: A database configuration file
    :param pipeline_name: Pipeline name
    :param fastq_type: Fastq collection type
    :param project_name_file: A file containing the list of projects for seeding pipeline
    :param species_name_list: A list of species to consider for seeding analysis
    :param library_source_list: A list of library source info to consider for seeding analysis
    :returns: List of available experiments or None and a list of seeded
              experiments or None
    :raises IOError: If project_name_file does not exist
    '''
    if not os.path.exists(project_name_file):
        raise IOError('File {0} not found'.format(project_name_file))
    with open(project_name_file, 'r') as fp:
        # read list of projects from file, empty list means all projects
        project_list = [i.strip() for i in fp.readlines()]
    if len(project_list) == 0:
        project_list = None
    dbparam = read_dbconf_json(dbconfig_path)
    pl = PipelineAdaptor(**dbparam)
    pl.start_session()
    try:
        available_exps, seeded_exps = \
            pl.seed_new_experiments(
                pipeline_name=pipeline_name,
                species_name_list=species_name_list,
                fastq_type=fastq_type,
                project_list=project_list,
                library_source_list=library_source_list)
    finally:
        # fix: previously the session leaked when seed_new_experiments
        # raised; also dropped the stale `slack_config` docstring entry
        pl.close_session()
    return available_exps, seeded_exps
def run(self):
    '''Update a pipeline seed record to a new status and notify slack
    and asana about the change.'''
    try:
        igf_session_class = self.param_required('igf_session_class')  # set by base class
        pipeline_name = self.param_required('pipeline_name')
        igf_id = self.param_required('igf_id')
        task_id = self.param_required('task_id')
        seed_id = self.param_required('seed_id')
        seed_table = self.param_required('seed_table')
        new_status = self.param_required('new_status')
        seed_update = {
            'pipeline_name': pipeline_name,
            'seed_id': int(seed_id),
            'seed_table': seed_table,
            'status': new_status.upper()}
        pa = PipelineAdaptor(**{'session_class': igf_session_class})
        pa.start_session()                              # connect to db
        pa.update_pipeline_seed(data=[seed_update])     # update seed record in db
        pa.close_session()                              # close db connection
        message = \
            'changing status in {0} for seed {1} as {2}'.format(
                pipeline_name,
                seed_id,
                new_status.upper())                     # format message
        self.post_message_to_slack(message, reaction='pass')         # notify slack
        self.comment_asana_task(task_name=task_id, comment=message)  # notify asana
    except Exception as e:
        message = \
            'seqrun: {2}, Error in {0}: {1}'.format(
                self.__class__.__name__,
                e,
                igf_id)
        self.warning(message)
        self.post_message_to_slack(message, reaction='fail')  # post msg to slack for failed jobs
        raise
def setUp(self):
    '''Build a test db with one MISEQ platform, a project/sample pair,
    a seqrun, two pipelines, fastq files with collections, and two
    experiments with their runs.'''
    self.dbconfig = 'data/dbconfig.json'
    dbparam = read_dbconf_json(self.dbconfig)
    base = BaseAdaptor(**dbparam)
    self.engine = base.engine
    self.dbname = dbparam['dbname']
    Base.metadata.create_all(self.engine)
    self.session_class = base.get_session_class()
    base.start_session()
    # platform and flowcell barcode rule
    platform_data = [
        {"platform_igf_id": "M03291",
         "model_name": "MISEQ",
         "vendor_name": "ILLUMINA",
         "software_name": "RTA",
         "software_version": "RTA1.18.54"}]
    flowcell_rule_data = [
        {"platform_igf_id": "M03291",
         "flowcell_type": "MISEQ",
         "index_1": "NO_CHANGE",
         "index_2": "NO_CHANGE"}]
    pl = PlatformAdaptor(**{'session': base.session})
    pl.store_platform_data(data=platform_data)
    pl.store_flowcell_barcode_rule(data=flowcell_rule_data)
    # project and sample
    project_data = [{'project_igf_id': 'IGFQ000123_avik_10-4-2018_Miseq'}]
    pa = ProjectAdaptor(**{'session': base.session})
    pa.store_project_and_attribute_data(data=project_data)
    sample_data = [
        {'sample_igf_id': 'IGF103923',
         'project_igf_id': 'IGFQ000123_avik_10-4-2018_Miseq',
         'species_name': 'HG38'}]
    sa = SampleAdaptor(**{'session': base.session})
    sa.store_sample_and_attribute_data(data=sample_data)
    # sequencing run
    seqrun_data = [
        {'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'flowcell_id': '000000000-BRN47',
         'platform_igf_id': 'M03291',
         'flowcell': 'MISEQ'}]
    sra = SeqrunAdaptor(**{'session': base.session})
    sra.store_seqrun_and_attribute_data(data=seqrun_data)
    # pipelines
    pipeline_data = [
        {"pipeline_name": "PrimaryAnalysis",
         "pipeline_db": "sqlite:////bcl2fastq.db"},
        {"pipeline_name": "DemultiplexIlluminaFastq",
         "pipeline_db": "sqlite:////bcl2fastq.db"}]
    pla = PipelineAdaptor(**{'session': base.session})
    pla.store_pipeline_data(data=pipeline_data)
    # fastq files
    file_data = [
        {'file_path': '/path/S20180405S_S1_L001_R1_001.fastq.gz',
         'location': 'HPC_PROJECT',
         'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
         'size': '1528121404'},
        {'file_path': '/path/S20180405S_S1_L001_R2_001.fastq.gz',
         'location': 'HPC_PROJECT',
         'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
         'size': '1467047580'},
        {'file_path': '/path/S20180405S_S3_L001_R2_001.fastq.gz',
         'location': 'HPC_PROJECT',
         'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
         'size': '1467047580'}]
    fa = FileAdaptor(**{'session': base.session})
    fa.store_file_and_attribute_data(data=file_data)
    # fastq collections and collection-to-file groups
    collection_data = [
        {'name': 'IGF103923_MISEQ_000000000-BRN47_1',
         'type': 'demultiplexed_fastq',
         'table': 'run'},
        {'name': 'IGF103923_MISEQ1_000000000-BRN47_1',
         'type': 'demultiplexed_fastq',
         'table': 'run'}]
    collection_files_data = [
        {'name': 'IGF103923_MISEQ_000000000-BRN47_1',
         'type': 'demultiplexed_fastq',
         'file_path': '/path/S20180405S_S1_L001_R1_001.fastq.gz'},
        {'name': 'IGF103923_MISEQ_000000000-BRN47_1',
         'type': 'demultiplexed_fastq',
         'file_path': '/path/S20180405S_S1_L001_R2_001.fastq.gz'},
        {'name': 'IGF103923_MISEQ1_000000000-BRN47_1',
         'type': 'demultiplexed_fastq',
         'file_path': '/path/S20180405S_S3_L001_R2_001.fastq.gz'}]
    ca = CollectionAdaptor(**{'session': base.session})
    ca.store_collection_and_attribute_data(data=collection_data)
    ca.create_collection_group(data=collection_files_data)
    # experiments and runs
    experiment_data = [
        {'project_igf_id': 'IGFQ000123_avik_10-4-2018_Miseq',
         'sample_igf_id': 'IGF103923',
         'experiment_igf_id': 'IGF103923_MISEQ',
         'library_name': 'IGF103923',
         'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
         'library_strategy': 'RNA-SEQ',
         'experiment_type': 'TENX-TRANSCRIPTOME-3P',
         'library_layout': 'PAIRED',
         'platform_name': 'MISEQ'},
        {'project_igf_id': 'IGFQ000123_avik_10-4-2018_Miseq',
         'sample_igf_id': 'IGF103923',
         'experiment_igf_id': 'IGF103923_MISEQ1',
         'library_name': 'IGF103923_1',
         'library_source': 'GENOMIC_SINGLE_CELL',
         'library_strategy': 'WGS',
         'experiment_type': 'UNKNOWN',
         'library_layout': 'PAIRED',
         'platform_name': 'MISEQ'}]
    ea = ExperimentAdaptor(**{'session': base.session})
    ea.store_project_and_attribute_data(data=experiment_data)
    run_data = [
        {'experiment_igf_id': 'IGF103923_MISEQ',
         'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'run_igf_id': 'IGF103923_MISEQ_000000000-BRN47_1',
         'lane_number': '1'},
        {'experiment_igf_id': 'IGF103923_MISEQ1',
         'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'run_igf_id': 'IGF103923_MISEQ1_000000000-BRN47_1',
         'lane_number': '1'}]
    ra = RunAdaptor(**{'session': base.session})
    ra.store_run_and_attribute_data(data=run_data)
    base.close_session()
def setUp(self):
    '''Build a test db with one project, three samples, three
    experiments, and an alignment pipeline seeded with one experiment.'''
    self.dbconfig = 'data/dbconfig.json'
    dbparam = read_dbconf_json(self.dbconfig)
    base = BaseAdaptor(**dbparam)
    self.engine = base.engine
    self.dbname = dbparam['dbname']
    Base.metadata.create_all(self.engine)
    self.session_class = base.get_session_class()
    base.start_session()
    project_data = [
        {'project_igf_id': 'IGFP0001_test_22-8-2017_rna_sc',
         'project_name': 'test_22-8-2017_rna',
         'description': 'Its project 1',
         'project_deadline': 'Before August 2017',
         'comments': 'Some samples are treated with drug X'}]
    pa = ProjectAdaptor(**{'session': base.session})
    pa.store_project_and_attribute_data(data=project_data)
    sample_data = [
        {'sample_igf_id': 'IGF00001',
         'project_igf_id': 'IGFP0001_test_22-8-2017_rna_sc',
         'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
         'library_strategy': 'RNA-SEQ',
         'experiment_type': 'POLYA-RNA'},
        {'sample_igf_id': 'IGF00003',
         'project_igf_id': 'IGFP0001_test_22-8-2017_rna_sc',
         'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
         'experiment_type': 'POLYA-RNA'},
        {'sample_igf_id': 'IGF00002',
         'project_igf_id': 'IGFP0001_test_22-8-2017_rna_sc'}]
    sa = SampleAdaptor(**{'session': base.session})
    sa.store_sample_and_attribute_data(data=sample_data)
    experiment_data = [
        {'project_igf_id': 'IGFP0001_test_22-8-2017_rna_sc',
         'sample_igf_id': 'IGF00001',
         'experiment_igf_id': 'IGF00001_HISEQ4000',
         'library_name': 'IGF00001'},
        {'project_igf_id': 'IGFP0001_test_22-8-2017_rna_sc',
         'sample_igf_id': 'IGF00003',
         'experiment_igf_id': 'IGF00003_HISEQ4000',
         'library_name': 'IGF00001'},
        {'project_igf_id': 'IGFP0001_test_22-8-2017_rna_sc',
         'sample_igf_id': 'IGF00002',
         'experiment_igf_id': 'IGF00002_HISEQ4000',
         'library_name': 'IGF00002'}]
    ea = ExperimentAdaptor(**{'session': base.session})
    ea.store_project_and_attribute_data(data=experiment_data)
    pipeline_data = [
        {"pipeline_name": "alignment",
         "pipeline_db": "sqlite:////data/aln.db",
         "pipeline_init_conf": {
             "input_dir": "data/fastq_dir/",
             "output_dir": "data"},
         "pipeline_run_conf": {
             "output_dir": "data"}}]
    pl = PipelineAdaptor(**{'session': base.session})
    pl.store_pipeline_data(data=pipeline_data)
    pipeline_seed_data = [
        {'pipeline_name': 'alignment',
         'seed_id': '1',
         'seed_table': 'experiment'}]
    pl.create_pipeline_seed(data=pipeline_seed_data)
    base.close_session()
def test_update_pipeline_seed(self):
    '''Updating a seed without a status must fail; valid updates must
    change the stored status to RUNNING.'''
    adaptor = PipelineAdaptor(**{'session_class': self.session_class})
    adaptor.start_session()
    # missing 'status' key -> rejected
    seed_without_status = [
        {'pipeline_name': 'demultiplexing_fastq',
         'seed_id': '2',
         'seed_table': 'seqrun'}]
    with self.assertRaises(ValueError):
        adaptor.update_pipeline_seed(data=seed_without_status)
    # valid update for a new seed id
    seed_for_new_id = [
        {'pipeline_name': 'demultiplexing_fastq',
         'seed_id': '2',
         'seed_table': 'seqrun',
         'status': 'RUNNING'}]
    adaptor.update_pipeline_seed(data=seed_for_new_id)
    (pipe_seed1, table_data1) = adaptor.fetch_pipeline_seed_with_table_data(
        pipeline_name='demultiplexing_fastq')
    self.assertEqual(len(table_data1.to_dict(orient='records')),
                     len(pipe_seed1.to_dict(orient='records')))
    # valid update for an existing seed id
    seed_for_existing_id = [
        {'pipeline_name': 'demultiplexing_fastq',
         'seed_id': '1',
         'seed_table': 'seqrun',
         'status': 'RUNNING'}]
    adaptor.update_pipeline_seed(data=seed_for_existing_id)
    (pipe_seed2, _) = adaptor.fetch_pipeline_seed_with_table_data(
        pipeline_name='demultiplexing_fastq',
        status='RUNNING')
    adaptor.close_session()
    self.assertEqual(
        pipe_seed2.loc[pipe_seed2.seed_id == 1]['status'].values[0],
        'RUNNING')
def test_seed_new_experiments1(self):
    '''Seeding a project with a library-source filter should leave no
    remaining new experiments and create exactly one SEEDED entry.'''
    pl = PipelineAdaptor(**{'session_class': self.session_class})
    pl.start_session()
    new_exps, _ = \
        pl.seed_new_experiments(
            pipeline_name='PrimaryAnalysis',
            species_name_list=['HG38'],
            fastq_type='demultiplexed_fastq',
            project_list=['IGFQ000123_avik_10-4-2018_Miseq'],
            library_source_list=['TRANSCRIPTOMIC_SINGLE_CELL'])
    self.assertFalse(new_exps)
    pl.close_session()
    pl = PipelineAdaptor(**{'session_class': self.session_class})
    pl.start_session()
    try:
        (_, exp_data) = pl.fetch_pipeline_seed_with_table_data(
            pipeline_name='PrimaryAnalysis',
            table_name='experiment',
            status='SEEDED')
        self.assertEqual(len(list(exp_data['experiment_igf_id'].values)), 1)
        self.assertEqual(exp_data['experiment_igf_id'].values[0],
                         'IGF103923_MISEQ')
    finally:
        pl.close_session()  # fix: second session was previously left open
'pipeline_name': 'DemultiplexIlluminaFastq', 'seed_id': 1, 'seed_table': 'seqrun' }, { 'pipeline_name': 'DemultiplexIlluminaFastq', 'seed_id': 2, 'seed_table': 'seqrun' }, { 'pipeline_name': 'DemultiplexIlluminaFastq', 'seed_id': 3, 'seed_table': 'seqrun' }, ] pla = PipelineAdaptor(**{'session': base.session}) pla.store_pipeline_data(data=pipeline_data) pla.create_pipeline_seed(data=pipeline_seed_data) pipeline_data = [{ "pipeline_name": "PrimaryAnalysis", "pipeline_db": "sqlite:////analysis.db", }] pipeline_seed_data = [{ 'pipeline_name': 'PrimaryAnalysis', 'seed_id': 1, 'seed_table': 'experiment' }, { 'pipeline_name': 'PrimaryAnalysis', 'seed_id': 2, 'seed_table': 'experiment'
def test_reset_pipeline_seed_for_rerun(self):
    '''Mark a seed FINISHED, reset it via Modify_pipeline_seed, and
    verify it returns to SEEDED.'''
    base = BaseAdaptor(**{'session_class': self.session_class})
    base.start_session()
    sra = SeqrunAdaptor(**{'session': base.session})
    seqrun = sra.fetch_seqrun_records_igf_id(
        seqrun_igf_id='171003_M00001_0089_000000000-TEST')
    pp = PipelineAdaptor(**{'session': base.session})
    pipeline = pp.fetch_pipeline_records_pipeline_name(
        'demultiplexing_fastq')
    pipe_seed = pp.fetch_pipeline_seed(
        pipeline_id=pipeline.pipeline_id,
        seed_id=seqrun.seqrun_id,
        seed_table='seqrun')
    self.assertEqual(pipe_seed.status, 'SEEDED')
    # flip the seed to FINISHED and confirm
    pp.update_pipeline_seed(data=[
        {'pipeline_id': pipeline.pipeline_id,
         'seed_id': seqrun.seqrun_id,
         'seed_table': 'seqrun',
         'status': 'FINISHED'}])
    pipe_seed2 = pp.fetch_pipeline_seed(
        pipeline_id=pipeline.pipeline_id,
        seed_id=seqrun.seqrun_id,
        seed_table='seqrun')
    self.assertEqual(pipe_seed2.status, 'FINISHED')
    base.close_session()
    # reset via the Modify_pipeline_seed utility
    with open(self.seqrun_input_list, 'w') as fp:
        fp.write('171003_M00001_0089_000000000-TEST')
    mps = Modify_pipeline_seed(
        igf_id_list=self.seqrun_input_list,
        table_name='seqrun',
        pipeline_name='demultiplexing_fastq',
        dbconfig_file=self.dbconfig,
        log_slack=False,
        log_asana=False,
        clean_up=True)
    mps.reset_pipeline_seed_for_rerun(seeded_label='SEEDED')
    # verify the seed is back to SEEDED
    base.start_session()
    sra = SeqrunAdaptor(**{'session': base.session})
    seqrun = sra.fetch_seqrun_records_igf_id(
        seqrun_igf_id='171003_M00001_0089_000000000-TEST')
    pp = PipelineAdaptor(**{'session': base.session})
    pipeline = pp.fetch_pipeline_records_pipeline_name(
        'demultiplexing_fastq')
    pipe_seed = pp.fetch_pipeline_seed(
        pipeline_id=pipeline.pipeline_id,
        seed_id=seqrun.seqrun_id,
        seed_table='seqrun')
    self.assertEqual(pipe_seed.status, 'SEEDED')
    base.close_session()
def get_pipeline_seeds(pipeseed_mode, pipeline_name, igf_session_class,
                       seed_id_label='seed_id',
                       seqrun_date_label='seqrun_date',
                       seqrun_id_label='seqrun_id',
                       experiment_id_label='experiment_id',
                       seqrun_igf_id_label='seqrun_igf_id'):
    '''
    A utils function for fetching pipeline seed information

    :param pipeseed_mode: A string info about pipeseed mode, allowed values are
                          demultiplexing
                          alignment
    :param pipeline_name: A string infor about pipeline name
    :param igf_session_class: A database session class for pipeline seed lookup
    :returns: Two Pandas dataframes, first with pipeseed entries and second
              with seed info
    :raises ValueError: If pipeseed_mode is not supported
    :raises AttributeError: If the adaptor does not return dataframes
    '''
    if pipeseed_mode not in ('demultiplexing', 'alignment'):
        raise ValueError(
            'Pipeseed_mode {0} not supported'.format(pipeseed_mode))
    table_name = None
    if pipeseed_mode == 'demultiplexing':
        table_name = 'seqrun'
    elif pipeseed_mode == 'alignment':
        table_name = 'experiment'
    pa = PipelineAdaptor(**{'session_class': igf_session_class})  # get db adaptor
    pa.start_session()                                            # connect to db
    try:
        # fetch required entries from the table for the seeded entries
        pipeseeds_data, table_data = \
            pa.fetch_pipeline_seed_with_table_data(
                pipeline_name,
                table_name=table_name)
    finally:
        pa.close_session()  # fix: session was never closed before
    if not isinstance(pipeseeds_data, pd.DataFrame) or \
       not isinstance(table_data, pd.DataFrame):
        # fix: .format() was previously called on the AttributeError object
        # itself, which raised the wrong error with no message
        raise AttributeError(
            'Expecting a pandas dataframe of pipeseed data and received '
            '{0}, {1}'.format(type(pipeseeds_data), type(table_data)))
    seed_data = pd.DataFrame()
    if len(pipeseeds_data.index) > 0 and \
       len(table_data.index) > 0:
        pipeseeds_data[seed_id_label] = \
            pipeseeds_data[seed_id_label].map(lambda x: int(x))  # convert pipeseed column type
        if pipeseed_mode == 'demultiplexing':
            table_data[seqrun_id_label] = \
                table_data[seqrun_id_label].map(lambda x: int(x))  # convert seqrun data column type
            merged_data = pd.merge(
                pipeseeds_data,
                table_data,
                how='inner',
                on=None,
                left_on=[seed_id_label],
                right_on=[seqrun_id_label],
                left_index=False,
                right_index=False)  # join dataframes
            merged_data[seqrun_date_label] = \
                merged_data[seqrun_igf_id_label].\
                map(lambda x: _get_date_from_seqrun(seqrun_igf_id=x))  # get seqrun date from seqrun id
        elif pipeseed_mode == 'alignment':
            table_data[experiment_id_label] = \
                table_data[experiment_id_label].map(lambda x: int(x))  # convert experiment data column type
            merged_data = pd.merge(
                pipeseeds_data,
                table_data,
                how='inner',
                on=None,
                left_on=[seed_id_label],
                right_on=[experiment_id_label],
                left_index=False,
                right_index=False)  # join dataframes
        seed_data = merged_data.\
            applymap(lambda x: str(x))  # convert dataframe values to string
    return pipeseeds_data, seed_data
def setUp(self):
    '''Build a test db with three platforms, a MISEQ seqrun, and a
    demultiplexing pipeline seeded with that run; also prepare an empty
    seqrun reset-list file.'''
    self.dbconfig = 'data/dbconfig.json'
    dbparam = read_dbconf_json(self.dbconfig)
    base = BaseAdaptor(**dbparam)
    self.engine = base.engine
    self.dbname = dbparam['dbname']
    Base.metadata.create_all(self.engine)
    self.session_class = base.get_session_class()
    base.start_session()
    platform_data = [
        {"platform_igf_id": "M00001",
         "model_name": "MISEQ",
         "vendor_name": "ILLUMINA",
         "software_name": "RTA",
         "software_version": "RTA1.18.54"},
        {"platform_igf_id": "NB500000",
         "model_name": "NEXTSEQ",
         "vendor_name": "ILLUMINA",
         "software_name": "RTA",
         "software_version": "RTA2"},
        {"platform_igf_id": "K00000",
         "model_name": "HISEQ4000",
         "vendor_name": "ILLUMINA",
         "software_name": "RTA",
         "software_version": "RTA2"}]
    flowcell_rule_data = [
        {"platform_igf_id": "K00000",
         "flowcell_type": "HiSeq 3000/4000 SR",
         "index_1": "NO_CHANGE",
         "index_2": "NO_CHANGE"},
        {"platform_igf_id": "K00000",
         "flowcell_type": "HiSeq 3000/4000 PE",
         "index_1": "NO_CHANGE",
         "index_2": "REVCOMP"},
        {"platform_igf_id": "NB500000",
         "flowcell_type": "NEXTSEQ",
         "index_1": "NO_CHANGE",
         "index_2": "REVCOMP"},
        {"platform_igf_id": "M00001",
         "flowcell_type": "MISEQ",
         "index_1": "NO_CHANGE",
         "index_2": "NO_CHANGE"}]
    pl = PlatformAdaptor(**{'session': base.session})
    pl.store_platform_data(data=platform_data)
    pl.store_flowcell_barcode_rule(data=flowcell_rule_data)
    seqrun_data = [
        {'seqrun_igf_id': '171003_M00001_0089_000000000-TEST',
         'flowcell_id': '000000000-D0YLK',
         'platform_igf_id': 'M00001',
         'flowcell': 'MISEQ'}]
    sra = SeqrunAdaptor(**{'session': base.session})
    sra.store_seqrun_and_attribute_data(data=seqrun_data)
    seqrun = sra.fetch_seqrun_records_igf_id(
        seqrun_igf_id='171003_M00001_0089_000000000-TEST')
    pipeline_data = [
        {"pipeline_name": "demultiplexing_fastq",
         "pipeline_db": "sqlite:////data/bcl2fastq.db",
         "pipeline_init_conf": {
             "input_dir": "data/seqrun_dir/",
             "output_dir": "data"},
         "pipeline_run_conf": {
             "output_dir": "data"}}]
    # seed the pipeline with the stored seqrun's db id
    pipeseed_data = [
        {"pipeline_name": "demultiplexing_fastq",
         "seed_table": "seqrun",
         "seed_id": seqrun.seqrun_id}]
    pp = PipelineAdaptor(**{'session': base.session})
    pp.store_pipeline_data(data=pipeline_data)
    pp.create_pipeline_seed(
        data=pipeseed_data,
        required_columns=['pipeline_id', 'seed_id', 'seed_table'])
    base.close_session()
    self.seqrun_input_list = \
        'data/reset_samplesheet_md5/seqrun_pipeline_reset_list.txt'
    with open(self.seqrun_input_list, 'w') as fp:
        fp.write('')
def test_fetch_pipeline_records_pipeline_name(self):
    '''Fetch a pipeline record by name and check its id.'''
    pl = PipelineAdaptor(**{'session_class': self.session_class})
    pl.start_session()
    try:
        pl_data = pl.fetch_pipeline_records_pipeline_name(
            pipeline_name='demultiplexing_fastq')
        self.assertEqual(pl_data.pipeline_id, 1)
    finally:
        pl.close_session()  # fix: session was previously left open
def setUp(self):
    '''Build a test db with three platforms, two seqruns, one project
    with two samples, three experiments/runs with fastq collections, and
    two pipelines.'''
    self.data_file = 'data/pipeline_data.json'
    self.dbconfig = 'data/dbconfig.json'
    dbparam = read_dbconf_json(self.dbconfig)
    base = BaseAdaptor(**dbparam)
    self.engine = base.engine
    self.dbname = dbparam['dbname']
    self.temp_dir = get_temp_dir()
    Base.metadata.create_all(self.engine)
    self.session_class = base.get_session_class()
    base.start_session()
    platform_data = [
        {"platform_igf_id": "M03291",
         "model_name": "MISEQ",
         "vendor_name": "ILLUMINA",
         "software_name": "RTA",
         "software_version": "RTA1.18.54"},
        {"platform_igf_id": "NB501820",
         "model_name": "NEXTSEQ",
         "vendor_name": "ILLUMINA",
         "software_name": "RTA",
         "software_version": "RTA2"},
        {"platform_igf_id": "K00345",
         "model_name": "HISEQ4000",
         "vendor_name": "ILLUMINA",
         "software_name": "RTA",
         "software_version": "RTA2"}]
    flowcell_rule_data = [
        {"platform_igf_id": "K00345",
         "flowcell_type": "HiSeq 3000/4000 SR",
         "index_1": "NO_CHANGE",
         "index_2": "NO_CHANGE"},
        {"platform_igf_id": "K00345",
         "flowcell_type": "HiSeq 3000/4000 PE",
         "index_1": "NO_CHANGE",
         "index_2": "REVCOMP"},
        {"platform_igf_id": "NB501820",
         "flowcell_type": "NEXTSEQ",
         "index_1": "NO_CHANGE",
         "index_2": "REVCOMP"},
        {"platform_igf_id": "M03291",
         "flowcell_type": "MISEQ",
         "index_1": "NO_CHANGE",
         "index_2": "NO_CHANGE"}]
    pl = PlatformAdaptor(**{'session': base.session})
    pl.store_platform_data(data=platform_data)
    pl.store_flowcell_barcode_rule(data=flowcell_rule_data)
    seqrun_data = [
        {'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'flowcell_id': '000000000-BRN47',
         'platform_igf_id': 'M03291',
         'flowcell': 'MISEQ'},
        {'seqrun_igf_id': '180416_NB03291_013_000000001-BRN47',
         'flowcell_id': '000000001-BRN47',
         'platform_igf_id': 'NB501820',
         'flowcell': 'NEXTSEQ'}]
    sra = SeqrunAdaptor(**{'session': base.session})
    sra.store_seqrun_and_attribute_data(data=seqrun_data)
    project_data = [{'project_igf_id': 'projectA'}]
    pa = ProjectAdaptor(**{'session': base.session})
    pa.store_project_and_attribute_data(data=project_data)
    sample_data = [
        {'sample_igf_id': 'sampleA',
         'project_igf_id': 'projectA',
         'species_name': 'HG38'},
        {'sample_igf_id': 'sampleB',
         'project_igf_id': 'projectA',
         'species_name': 'UNKNOWN'}]
    sa = SampleAdaptor(**{'session': base.session})
    sa.store_sample_and_attribute_data(data=sample_data)
    experiment_data = [
        {'project_igf_id': 'projectA',
         'sample_igf_id': 'sampleA',
         'experiment_igf_id': 'sampleA_MISEQ',
         'library_name': 'sampleA',
         'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
         'library_strategy': 'RNA-SEQ',
         'experiment_type': 'TENX-TRANSCRIPTOME-3P',
         'library_layout': 'PAIRED',
         'platform_name': 'MISEQ'},
        {'project_igf_id': 'projectA',
         'sample_igf_id': 'sampleA',
         'experiment_igf_id': 'sampleA_NEXTSEQ',
         'library_name': 'sampleA',
         'library_source': 'UNKNOWN',
         'library_strategy': 'RNA-SEQ',
         'experiment_type': 'TENX-TRANSCRIPTOME-3P',
         'library_layout': 'PAIRED',
         'platform_name': 'NEXTSEQ'},
        {'project_igf_id': 'projectA',
         'sample_igf_id': 'sampleB',
         'experiment_igf_id': 'sampleB_MISEQ',
         'library_name': 'sampleB',
         'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
         'library_strategy': 'RNA-SEQ',
         'experiment_type': 'TENX-TRANSCRIPTOME-3P',
         'library_layout': 'PAIRED',
         'platform_name': 'MISEQ'}]
    ea = ExperimentAdaptor(**{'session': base.session})
    ea.store_project_and_attribute_data(data=experiment_data)
    run_data = [
        {'experiment_igf_id': 'sampleA_MISEQ',
         'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'run_igf_id': 'sampleA_MISEQ_000000000-BRN47_1',
         'lane_number': '1'},
        {'experiment_igf_id': 'sampleA_NEXTSEQ',
         'seqrun_igf_id': '180416_NB03291_013_000000001-BRN47',
         'run_igf_id': 'sampleA_NEXTSEQ_000000001-BRN47_2',
         'lane_number': '2'},
        {'experiment_igf_id': 'sampleB_MISEQ',
         'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'run_igf_id': 'sampleB_MISEQ_HVWN7BBXX_1',
         'lane_number': '1'}]
    ra = RunAdaptor(**{'session': base.session})
    ra.store_run_and_attribute_data(data=run_data)
    file_data = [
        {'file_path': '/path/sampleA_MISEQ_000000000-BRN47_1_R1.fastq.gz',
         'location': 'HPC_PROJECT',
         'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
         'size': '1528121404'},
        {'file_path': '/path/sampleA_NEXTSEQ_000000001-BRN47_2_R1.fastq.gz',
         'location': 'HPC_PROJECT',
         'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
         'size': '1528121404'},
        {'file_path': '/path/sampleB_MISEQ_HVWN7BBXX_1_R1.fastq.gz',
         'location': 'HPC_PROJECT',
         'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
         'size': '1528121404'}]
    fa = FileAdaptor(**{'session': base.session})
    fa.store_file_and_attribute_data(data=file_data)
    collection_data = [
        {'name': 'sampleA_MISEQ_000000000-BRN47_1',
         'type': 'demultiplexed_fastq',
         'table': 'run'},
        {'name': 'sampleA_NEXTSEQ_000000001-BRN47_2',
         'type': 'demultiplexed_fastq',
         'table': 'run'},
        {'name': 'sampleB_MISEQ_HVWN7BBXX_1',
         'type': 'demultiplexed_fastq',
         'table': 'run'}]
    collection_files_data = [
        {'name': 'sampleA_MISEQ_000000000-BRN47_1',
         'type': 'demultiplexed_fastq',
         'file_path': '/path/sampleA_MISEQ_000000000-BRN47_1_R1.fastq.gz'},
        {'name': 'sampleA_NEXTSEQ_000000001-BRN47_2',
         'type': 'demultiplexed_fastq',
         'file_path': '/path/sampleA_NEXTSEQ_000000001-BRN47_2_R1.fastq.gz'},
        {'name': 'sampleB_MISEQ_HVWN7BBXX_1',
         'type': 'demultiplexed_fastq',
         'file_path': '/path/sampleB_MISEQ_HVWN7BBXX_1_R1.fastq.gz'}]
    ca = CollectionAdaptor(**{'session': base.session})
    ca.store_collection_and_attribute_data(data=collection_data)
    ca.create_collection_group(data=collection_files_data)
    pipeline_data = [
        {"pipeline_name": "PrimaryAnalysis",
         "pipeline_db": "sqlite:////bcl2fastq.db"},
        {"pipeline_name": "DemultiplexIlluminaFastq",
         "pipeline_db": "sqlite:////bcl2fastq.db"}]
    pla = PipelineAdaptor(**{'session': base.session})
    pla.store_pipeline_data(data=pipeline_data)
    base.close_session()
def run(self):
    '''
    Run method for the seed job factory class of the all pipelines

    :param igf_session_class: A database session class
    :param pipeline_name: Name of the pipeline
    :param seed_id_label: A text label for the seed_id, default seed_id
    :param seqrun_id_label: A text for seqrun_id column name, default seqrun_id
    :param seqrun_date_label: A text label for the seqrun date, default seqrun_date
    :param seqrun_igf_id_label: A text label for sequencing run igf id, default seqrun_igf_id
    :param seeded_label: A text label for the status seeded in pipeline_seed table, default SEEDED
    :param running_label: A text label for the status running in the pipeline_seed table, default RUNNING
    :param seed_status_label: A text label for the pipeline_seed status column name, default status
    :param experiment_id_label: A text label for the experiment_id, default experiment_id
    :param pipeseed_mode: A text label for pipeline mode, default demultiplexing
                          Allowed values are
                              demultiplexing
                              alignment
    :returns: A list of dictionary containing the seqrun ids or
              experiment_igf_ids seed for analysis
    '''
    try:
        dbconnected = False
        igf_session_class = self.param_required('igf_session_class')  # set by base class
        pipeline_name = self.param_required('pipeline_name')
        seed_id_label = self.param_required('seed_id_label')
        seqrun_id_label = self.param_required('seqrun_id_label')
        seeded_label = self.param_required('seeded_label')
        running_label = self.param_required('running_label')
        seqrun_date_label = self.param_required('seqrun_date_label')
        seqrun_igf_id_label = self.param_required('seqrun_igf_id_label')
        seed_status_label = self.param_required('seed_status_label')
        experiment_id_label = self.param_required('experiment_id_label')
        pipeseed_mode = self.param_required('pipeseed_mode')
        if pipeseed_mode not in ('demultiplexing', 'alignment'):
            raise ValueError(
                'Pipeseed_mode {0} not supported'.format(pipeseed_mode))
        # fetch pipeseed data from db
        pipeseeds_data, seed_data = get_pipeline_seeds(
            pipeseed_mode=pipeseed_mode,
            pipeline_name=pipeline_name,
            igf_session_class=igf_session_class)
        if len(seed_data.index) > 0:
            seed_data = seed_data.to_dict(orient='records')  # convert dataframe to list of dictionaries
            self.param('sub_tasks', seed_data)               # set sub_tasks param for the data flow
            # flip SEEDED entries to RUNNING before dispatch
            pipeseeds_data[seed_status_label] = \
                pipeseeds_data[seed_status_label].map(
                    {seeded_label: running_label})
            pa = PipelineAdaptor(**{'session_class': igf_session_class})
            pa.start_session()
            dbconnected = True
            pa.update_pipeline_seed(
                data=pipeseeds_data.to_dict(orient='records'),
                autosave=False)     # set pipeline seeds as running
            pa.commit_session()     # save changes to db
            pa.close_session()      # close db connection
            dbconnected = False
            message = 'Total {0} new job found for {1}, pipeline: {2}'.format(
                len(seed_data), self.__class__.__name__, pipeline_name)
            self.post_message_to_slack(message, reaction='pass')  # send update to slack
        else:
            message = '{0}, {1}: no new job created'.format(
                self.__class__.__name__, pipeline_name)
            self.warning(message)
            self.post_message_to_slack(message, reaction='sleep')  # post about failed job to slack
    except Exception as e:
        message = 'Error in {0},{1}: {2}'.format(
            self.__class__.__name__, pipeline_name, e)
        self.warning(message)
        self.post_message_to_slack(message, reaction='fail')
        if dbconnected:
            pa.rollback_session()  # remove changes from db
            pa.close_session()
        raise  # mark worker as failed
def setUp(self):
    '''
    Build an in-memory test fixture: creates the schema, then stores user,
    project, sample, pipeline, platform and seqrun records read from the
    JSON files under data/.
    '''
    self.path = 'data/seqrun_dir'
    self.dbconfig = 'data/dbconfig.json'
    self.md5_out_path = 'data/md5_dir'
    self.pipeline_name = 'demultiplexing_fastq'
    seqrun_json = 'data/seqrun_db_data.json'
    platform_json = 'data/platform_db_data.json'
    pipeline_json = 'data/pipeline_data.json'
    os.mkdir(self.md5_out_path)                                   # fails if the dir already exists (no cleanup here)
    dbparam = None
    with open(self.dbconfig, 'r') as json_data:
        dbparam = json.load(json_data)
    base = BaseAdaptor(**dbparam)
    self.engine = base.engine
    self.dbname = dbparam['dbname']
    # NOTE(review): this clobbers the 'demultiplexing_fastq' value assigned
    # above — looks unintentional; confirm which value the tests expect.
    self.pipeline_name = ''
    Base.metadata.create_all(self.engine)                         # create all tables on a fresh schema
    base.start_session()
    # USER
    user_data = [
        {
            'name': 'user1',
            'email_id': '*****@*****.**',
            'username': '******'
        },
    ]
    ua = UserAdaptor(**{'session': base.session})
    ua.store_user_data(data=user_data)
    # PROJECT
    project_data = [{
        'project_igf_id': 'project_1',
        'project_name': 'test_22-8-2017_rna',
        'description': 'Its project 1',
        'project_deadline': 'Before August 2017',
        'comments': 'Some samples are treated with drug X',
    }]
    pa = ProjectAdaptor(**{'session': base.session})
    pa.store_project_and_attribute_data(data=project_data)
    project_user_data = [{
        'project_igf_id': 'project_1',
        'email_id': '*****@*****.**',
        'data_authority': True
    }]
    pa.assign_user_to_project(data=project_user_data)             # link user1 to project_1 as data authority
    # SAMPLES
    sample_data = [
        {
            'sample_igf_id': 'IGF0001',
            'project_igf_id': 'project_1',
        },
        {
            'sample_igf_id': 'IGF0002',
            'project_igf_id': 'project_1',
        },
        {
            'sample_igf_id': 'IGF0003',
            'project_igf_id': 'project_1',
        },
    ]
    sa = SampleAdaptor(**{'session': base.session})
    sa.store_sample_and_attribute_data(data=sample_data)
    base.commit_session()                                         # persist the records stored so far
    with open(pipeline_json, 'r') as json_data:                   # store pipeline data to db
        pipeline_data = json.load(json_data)
        pa = PipelineAdaptor(**{'session': base.session})
        pa.store_pipeline_data(data=pipeline_data)
    with open(platform_json, 'r') as json_data:                   # store platform data to db
        platform_data = json.load(json_data)
        pl = PlatformAdaptor(**{'session': base.session})
        pl.store_platform_data(data=platform_data)
    with open(seqrun_json, 'r') as json_data:                     # store seqrun data to db
        seqrun_data = json.load(json_data)
        sra = SeqrunAdaptor(**{'session': base.session})
        sra.store_seqrun_and_attribute_data(data=seqrun_data)
    base.close_session()
def reset_pipeline_seed_for_rerun(self, seeded_label='SEEDED',
                                  restricted_status_list=('SEEDED', 'RUNNING')):
    '''
    A method for setting the pipeline for re-run if the first run has failed
    or aborted. This method will set the pipeline_seed.status as 'SEEDED'
    only if its not already 'SEEDED' or 'RUNNING'

    :param seeded_label: A text label for seeded status, default SEEDED
    :param restricted_status_list: A list of pipeline status to exclude from
                                   the search, default ('SEEDED','RUNNING')
    :raises: Re-raises any underlying error after rolling back the session
             and notifying slack (when configured)
    '''
    try:
        db_connected = False
        restricted_status_list = list(restricted_status_list)     # copy so a caller's tuple/list is never mutated
        input_id_list = self._read_input_list(
            igf_id_list=self.igf_id_list)                         # get input ids from file
        failed_ids = list()                                       # ids with no eligible seed entry
        pass_list = list()                                        # required for logging in asana
        base = self.base_adaptor
        base.start_session()                                      # connect to database
        db_connected = True
        # Loop-invariant: the adaptor only wraps base.session, so build it
        # once instead of once per matching id (previously inside the loop).
        pl = PipelineAdaptor(**{'session': base.session})
        for igf_id in input_id_list:
            pipe_seed_data = self._fetch_pipeline_seed_entry(
                igf_id=igf_id,
                restrict_seed_status=restricted_status_list
            )                                                     # get pipe seed data for igf id
            if pipe_seed_data is None:
                failed_ids.append(igf_id)                         # add igf id to failed list
            else:
                updated_seed_data = [{
                    'pipeline_id': pipe_seed_data.pipeline_id,
                    'seed_id': pipe_seed_data.seed_id,
                    'seed_table': pipe_seed_data.seed_table,
                    'status': seeded_label
                }]                                                # set data for seed update
                pl.update_pipeline_seed(
                    data=updated_seed_data,
                    autosave=False)                               # update data to pipeline seed table
                pass_list.append(igf_id)
        base.commit_session()                                     # save data to database after all changes
        base.close_session()                                      # close database connection
        db_connected = False
        if self.clean_up:
            self._clear_input_list(
                file_path=self.igf_id_list,
                igf_list=failed_ids
            )                                                     # over write input list and add failed ids for next try
            message = 'Overwriting pipeseed input list {0}'.format(
                self.igf_id_list)
            if self.log_slack:
                self.igf_slack.post_message_to_channel(
                    message,
                    reaction='pass'
                )                                                 # comment to slack for file over writing
        if len(pass_list) > 0:
            for id_line in pass_list:
                message='Changed pipeline seed for id {0}, pipeline {1}, to {2}'.\
                        format(id_line,self.pipeline_name,seeded_label)
                if self.log_slack:
                    self.igf_slack.post_message_to_channel(
                        message, reaction='pass')                 # comment to slack channel
                if self.log_asana:
                    self.igf_asana.comment_asana_task(
                        task_name=id_line, comment=message)       # comment on asana task
    except Exception as e:
        if db_connected:
            base.rollback_session()                               # discard partial updates
            base.close_session()
        message = 'Failed to update pipeline seed, Error: {0}'.format(e)
        warnings.warn(message)
        if self.log_slack:
            self.igf_slack.post_message_to_channel(message, reaction='fail')
        raise
def setUp(self):
    '''
    Build a fresh sqlite fixture for the analysis-seed tests: platform,
    two seqruns, one project with two samples/experiments, one run, and
    two pipelines seeded against those records.
    '''
    self.dbconfig = 'data/dbconfig.json'
    dbparam = read_dbconf_json(self.dbconfig)
    base = BaseAdaptor(**dbparam)
    self.engine = base.engine
    self.dbname = dbparam['dbname']
    Base.metadata.drop_all(self.engine)                           # start from a clean schema
    if os.path.exists(self.dbname):
        os.remove(self.dbname)                                    # remove leftover sqlite file from a previous run
    Base.metadata.create_all(self.engine)
    self.session_class = base.get_session_class()
    base.start_session()
    # PLATFORM
    platform_data = [{
        "platform_igf_id": "M03291",
        "model_name": "MISEQ",
        "vendor_name": "ILLUMINA",
        "software_name": "RTA",
        "software_version": "RTA1.18.54"
    }]
    flowcell_rule_data = [{
        "platform_igf_id": "M03291",
        "flowcell_type": "MISEQ",
        "index_1": "NO_CHANGE",
        "index_2": "NO_CHANGE"
    }]
    pl = PlatformAdaptor(**{'session': base.session})
    pl.store_platform_data(data=platform_data)
    pl.store_flowcell_barcode_rule(data=flowcell_rule_data)
    # SEQRUN
    seqrun_data = [{
        'seqrun_igf_id': '180416_M03291_0139_000000000-TEST',
        'flowcell_id': '000000000-TEST',
        'platform_igf_id': 'M03291',
        'flowcell': 'MISEQ',
    }, {
        'seqrun_igf_id': '180416_M03291_0140_000000000-TEST',
        'flowcell_id': '000000000-TEST',
        'platform_igf_id': 'M03291',
        'flowcell': 'MISEQ',
    }]
    sra = SeqrunAdaptor(**{'session': base.session})
    sra.store_seqrun_and_attribute_data(data=seqrun_data)
    # PROJECT
    project_data = [{'project_igf_id': 'IGFQ000123_test_10-4-2018_Miseq'}]
    pa = ProjectAdaptor(**{'session': base.session})
    pa.store_project_and_attribute_data(data=project_data)
    # SAMPLE
    sample_data = [{
        'sample_igf_id': 'IGF00123',
        'project_igf_id': 'IGFQ000123_test_10-4-2018_Miseq'
    }, {
        'sample_igf_id': 'IGF00124',
        'project_igf_id': 'IGFQ000123_test_10-4-2018_Miseq'
    }]
    sa = SampleAdaptor(**{'session': base.session})
    sa.store_sample_and_attribute_data(data=sample_data)
    # EXPERIMENT
    experiment_data = [{
        'project_igf_id': 'IGFQ000123_test_10-4-2018_Miseq',
        'sample_igf_id': 'IGF00123',
        'experiment_igf_id': 'IGF00123_MISEQ',
        'library_name': 'IGF00123',
        'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
        'library_strategy': 'RNA-SEQ',
        'experiment_type': 'POLYA-RNA',
        'library_layout': 'PAIRED',
        'platform_name': 'MISEQ',
        'singlecell_chemistry': 'TENX'
    }, {
        'project_igf_id': 'IGFQ000123_test_10-4-2018_Miseq',
        'sample_igf_id': 'IGF00124',
        'experiment_igf_id': 'IGF00124_MISEQ',
        'library_name': 'IGF00124',
        'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
        'library_strategy': 'RNA-SEQ',
        'experiment_type': 'POLYA-RNA',
        'library_layout': 'PAIRED',
        'platform_name': 'MISEQ',
        'singlecell_chemistry': 'TENX'
    }]
    ea = ExperimentAdaptor(**{'session': base.session})
    # NOTE(review): 'store_project_and_attribute_data' on an ExperimentAdaptor
    # reads like a copy-paste of the project call — confirm against the
    # adaptor API whether this is the intended method name.
    ea.store_project_and_attribute_data(data=experiment_data)
    # RUN
    run_data = [{
        'experiment_igf_id': 'IGF00123_MISEQ',
        'seqrun_igf_id': '180416_M03291_0139_000000000-TEST',
        'run_igf_id': 'IGF00123_MISEQ_000000000-TEST_1',
        'lane_number': '1'
    }]
    ra = RunAdaptor(**{'session': base.session})
    ra.store_run_and_attribute_data(data=run_data)
    # PIPELINE
    pipeline_data = [{
        "pipeline_name": "PrimaryAnalysis",
        "pipeline_db": "sqlite:////aln.db",
    }, {
        "pipeline_name": "DemultiplexingFastq",
        "pipeline_db": "sqlite:////fastq.db",
    }]
    pipeline_seed_data = [
        {
            'pipeline_name': 'PrimaryAnalysis',
            'seed_id': 1,
            'seed_table': 'experiment'
        },
        {
            'pipeline_name': 'PrimaryAnalysis',
            'seed_id': 2,
            'seed_table': 'experiment'
        },
        {
            'pipeline_name': 'DemultiplexingFastq',
            'seed_id': 1,
            'seed_table': 'seqrun'
        },
        {
            'pipeline_name': 'DemultiplexingFastq',
            'seed_id': 2,
            'seed_table': 'seqrun'
        },
    ]
    # Mark the second seed of each pipeline FINISHED so only seed_id 1 stays SEEDED
    update_data = [{
        'pipeline_name': 'PrimaryAnalysis',
        'seed_id': 2,
        'seed_table': 'experiment',
        'status': 'FINISHED'
    }, {
        'pipeline_name': 'DemultiplexingFastq',
        'seed_id': 2,
        'seed_table': 'seqrun',
        'status': 'FINISHED'
    }]
    pla = PipelineAdaptor(**{'session': base.session})
    pla.store_pipeline_data(data=pipeline_data)
    pla.create_pipeline_seed(data=pipeline_seed_data)
    pla.update_pipeline_seed(update_data)
    base.close_session()