def setUp(self):
     self.dbconfig = 'data/dbconfig.json'
     dbparam = read_dbconf_json(self.dbconfig)
     base = BaseAdaptor(**dbparam)
     self.engine = base.engine
     self.dbname = dbparam['dbname']
     Base.metadata.create_all(self.engine)
     self.session_class = base.get_session_class()
     base.start_session()
     platform_data = [{
         "platform_igf_id": "M03291",
         "model_name": "MISEQ",
         "vendor_name": "ILLUMINA",
         "software_name": "RTA",
         "software_version": "RTA1.18.54"
     }, {
         "platform_igf_id": "NB501820",
         "model_name": "NEXTSEQ",
         "vendor_name": "ILLUMINA",
         "software_name": "RTA",
         "software_version": "RTA2"
     }, {
         "platform_igf_id": "K00345",
         "model_name": "HISEQ4000",
         "vendor_name": "ILLUMINA",
         "software_name": "RTA",
         "software_version": "RTA2"
     }]
     flowcell_rule_data = [{
         "platform_igf_id": "K00345",
         "flowcell_type": "HiSeq 3000/4000 SR",
         "index_1": "NO_CHANGE",
         "index_2": "NO_CHANGE"
     }, {
         "platform_igf_id": "K00345",
         "flowcell_type": "HiSeq 3000/4000 PE",
         "index_1": "NO_CHANGE",
         "index_2": "REVCOMP"
     }, {
         "platform_igf_id": "NB501820",
         "flowcell_type": "NEXTSEQ",
         "index_1": "NO_CHANGE",
         "index_2": "REVCOMP"
     }, {
         "platform_igf_id": "M03291",
         "flowcell_type": "MISEQ",
         "index_1": "NO_CHANGE",
         "index_2": "NO_CHANGE"
     }]
     pl = PlatformAdaptor(**{'session': base.session})
     pl.store_platform_data(data=platform_data)
     pl.store_flowcell_barcode_rule(data=flowcell_rule_data)
     seqrun_data = [{
         'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'flowcell_id': '000000000-BRN47',
         'platform_igf_id': 'M03291',
         'flowcell': 'MISEQ',
     }, {
         'seqrun_igf_id': '180416_NB03291_013_000000001-BRN47',
         'flowcell_id': '000000001-BRN47',
         'platform_igf_id': 'NB501820',
         'flowcell': 'NEXTSEQ',
     }]
     sra = SeqrunAdaptor(**{'session': base.session})
     sra.store_seqrun_and_attribute_data(data=seqrun_data)
     project_data = [{'project_igf_id': 'projectA'}]
     pa = ProjectAdaptor(**{'session': base.session})
     pa.store_project_and_attribute_data(data=project_data)
     sample_data = [
         {
             'sample_igf_id': 'sampleA',
             'project_igf_id': 'projectA',
             'species_name': 'HG38'
         },
         {
             'sample_igf_id': 'sampleB',
             'project_igf_id': 'projectA',
             'species_name': 'UNKNOWN'
         },
     ]
     sa = SampleAdaptor(**{'session': base.session})
     sa.store_sample_and_attribute_data(data=sample_data)
     experiment_data = [
         {
             'project_igf_id': 'projectA',
             'sample_igf_id': 'sampleA',
             'experiment_igf_id': 'sampleA_MISEQ',
             'library_name': 'sampleA',
             'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
             'library_strategy': 'RNA-SEQ',
             'experiment_type': 'TENX-TRANSCRIPTOME-3P',
             'library_layout': 'PAIRED',
             'platform_name': 'MISEQ',
         },
         {
             'project_igf_id': 'projectA',
             'sample_igf_id': 'sampleA',
             'experiment_igf_id': 'sampleA_NEXTSEQ',
             'library_name': 'sampleA',
             'library_source': 'UNKNOWN',
             'library_strategy': 'RNA-SEQ',
             'experiment_type': 'TENX-TRANSCRIPTOME-3P',
             'library_layout': 'PAIRED',
             'platform_name': 'NEXTSEQ',
         },
         {
             'project_igf_id': 'projectA',
             'sample_igf_id': 'sampleB',
             'experiment_igf_id': 'sampleB_MISEQ',
             'library_name': 'sampleB',
             'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
             'library_strategy': 'RNA-SEQ',
             'experiment_type': 'TENX-TRANSCRIPTOME-3P',
             'library_layout': 'PAIRED',
             'platform_name': 'MISEQ',
         },
     ]
     ea = ExperimentAdaptor(**{'session': base.session})
     ea.store_project_and_attribute_data(data=experiment_data)
     run_data = [{
         'experiment_igf_id': 'sampleA_MISEQ',
         'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'run_igf_id': 'sampleA_MISEQ_000000000-BRN47_1',
         'lane_number': '1'
     }, {
         'experiment_igf_id': 'sampleA_NEXTSEQ',
         'seqrun_igf_id': '180416_NB03291_013_000000001-BRN47',
         'run_igf_id': 'sampleA_NEXTSEQ_000000001-BRN47_2',
         'lane_number': '2'
     }, {
         'experiment_igf_id': 'sampleB_MISEQ',
         'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'run_igf_id': 'sampleB_MISEQ_HVWN7BBXX_1',
         'lane_number': '1'
     }]
     ra = RunAdaptor(**{'session': base.session})
     ra.store_run_and_attribute_data(data=run_data)
     file_data = [
         {
             'file_path':
             '/path/sampleA_MISEQ_000000000-BRN47_1_R1.fastq.gz',
             'location': 'HPC_PROJECT',
             'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
             'size': '1528121404',
         },
         {
             'file_path':
             '/path/sampleA_NEXTSEQ_000000001-BRN47_2_R1.fastq.gz',
             'location': 'HPC_PROJECT',
             'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
             'size': '1528121404',
         },
         {
             'file_path': '/path/sampleB_MISEQ_HVWN7BBXX_1_R1.fastq.gz',
             'location': 'HPC_PROJECT',
             'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
             'size': '1528121404',
         },
     ]
     fa = FileAdaptor(**{'session': base.session})
     fa.store_file_and_attribute_data(data=file_data)
     collection_data = [{
         'name': 'sampleA_MISEQ_000000000-BRN47_1',
         'type': 'demultiplexed_fastq',
         'table': 'run'
     }, {
         'name': 'sampleA_NEXTSEQ_000000001-BRN47_2',
         'type': 'demultiplexed_fastq',
         'table': 'run'
     }, {
         'name': 'sampleB_MISEQ_HVWN7BBXX_1',
         'type': 'demultiplexed_fastq',
         'table': 'run'
     }]
     collection_files_data = [{
         'name':
         'sampleA_MISEQ_000000000-BRN47_1',
         'type':
         'demultiplexed_fastq',
         'file_path':
         '/path/sampleA_MISEQ_000000000-BRN47_1_R1.fastq.gz'
     }, {
         'name':
         'sampleA_NEXTSEQ_000000001-BRN47_2',
         'type':
         'demultiplexed_fastq',
         'file_path':
         '/path/sampleA_NEXTSEQ_000000001-BRN47_2_R1.fastq.gz'
     }, {
         'name':
         'sampleB_MISEQ_HVWN7BBXX_1',
         'type':
         'demultiplexed_fastq',
         'file_path':
         '/path/sampleB_MISEQ_HVWN7BBXX_1_R1.fastq.gz'
     }]
     ca = CollectionAdaptor(**{'session': base.session})
     ca.store_collection_and_attribute_data(data=collection_data)
     ca.create_collection_group(data=collection_files_data)
     base.close_session()
Beispiel #2
0
 def setUp(self):
     self.dbconfig = 'data/dbconfig.json'
     dbparam = read_dbconf_json(self.dbconfig)
     base = BaseAdaptor(**dbparam)
     self.engine = base.engine
     self.dbname = dbparam['dbname']
     Base.metadata.create_all(self.engine)
     self.session_class = base.get_session_class()
     base.start_session()
     platform_data = [
         {
             "platform_igf_id": "M03291",
             "model_name": "MISEQ",
             "vendor_name": "ILLUMINA",
             "software_name": "RTA",
             "software_version": "RTA1.18.54"
         },
     ]
     flowcell_rule_data = [{
         "platform_igf_id": "M03291",
         "flowcell_type": "MISEQ",
         "index_1": "NO_CHANGE",
         "index_2": "NO_CHANGE"
     }]
     pl = PlatformAdaptor(**{'session': base.session})
     pl.store_platform_data(data=platform_data)
     pl.store_flowcell_barcode_rule(data=flowcell_rule_data)
     project_data = [{'project_igf_id': 'IGFQ000123_avik_10-4-2018_Miseq'}]
     pa = ProjectAdaptor(**{'session': base.session})
     pa.store_project_and_attribute_data(data=project_data)
     sample_data = [{
         'sample_igf_id': 'IGF103923',
         'project_igf_id': 'IGFQ000123_avik_10-4-2018_Miseq',
         'species_name': 'HG38'
     }]
     sa = SampleAdaptor(**{'session': base.session})
     sa.store_sample_and_attribute_data(data=sample_data)
     seqrun_data = [
         {
             'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
             'flowcell_id': '000000000-BRN47',
             'platform_igf_id': 'M03291',
             'flowcell': 'MISEQ'
         },
     ]
     sra = SeqrunAdaptor(**{'session': base.session})
     sra.store_seqrun_and_attribute_data(data=seqrun_data)
     pipeline_data = [
         {
             "pipeline_name": "PrimaryAnalysis",
             "pipeline_db": "sqlite:////bcl2fastq.db"
         },
         {
             "pipeline_name": "DemultiplexIlluminaFastq",
             "pipeline_db": "sqlite:////bcl2fastq.db"
         },
     ]
     pla = PipelineAdaptor(**{'session': base.session})
     pla.store_pipeline_data(data=pipeline_data)
     file_data = [
         {
             'file_path': '/path/S20180405S_S1_L001_R1_001.fastq.gz',
             'location': 'HPC_PROJECT',
             'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
             'size': '1528121404'
         },
         {
             'file_path': '/path/S20180405S_S1_L001_R2_001.fastq.gz',
             'location': 'HPC_PROJECT',
             'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
             'size': '1467047580'
         },
         {
             'file_path': '/path/S20180405S_S3_L001_R2_001.fastq.gz',
             'location': 'HPC_PROJECT',
             'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
             'size': '1467047580'
         },
     ]
     fa = FileAdaptor(**{'session': base.session})
     fa.store_file_and_attribute_data(data=file_data)
     collection_data = [
         {
             'name': 'IGF103923_MISEQ_000000000-BRN47_1',
             'type': 'demultiplexed_fastq',
             'table': 'run'
         },
         {
             'name': 'IGF103923_MISEQ1_000000000-BRN47_1',
             'type': 'demultiplexed_fastq',
             'table': 'run'
         },
     ]
     collection_files_data = [
         {
             'name': 'IGF103923_MISEQ_000000000-BRN47_1',
             'type': 'demultiplexed_fastq',
             'file_path': '/path/S20180405S_S1_L001_R1_001.fastq.gz'
         },
         {
             'name': 'IGF103923_MISEQ_000000000-BRN47_1',
             'type': 'demultiplexed_fastq',
             'file_path': '/path/S20180405S_S1_L001_R2_001.fastq.gz'
         },
         {
             'name': 'IGF103923_MISEQ1_000000000-BRN47_1',
             'type': 'demultiplexed_fastq',
             'file_path': '/path/S20180405S_S3_L001_R2_001.fastq.gz'
         },
     ]
     ca = CollectionAdaptor(**{'session': base.session})
     ca.store_collection_and_attribute_data(data=collection_data)
     ca.create_collection_group(data=collection_files_data)
     experiment_data = [{
         'project_igf_id': 'IGFQ000123_avik_10-4-2018_Miseq',
         'sample_igf_id': 'IGF103923',
         'experiment_igf_id': 'IGF103923_MISEQ',
         'library_name': 'IGF103923',
         'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
         'library_strategy': 'RNA-SEQ',
         'experiment_type': 'TENX-TRANSCRIPTOME-3P',
         'library_layout': 'PAIRED',
         'platform_name': 'MISEQ'
     }, {
         'project_igf_id': 'IGFQ000123_avik_10-4-2018_Miseq',
         'sample_igf_id': 'IGF103923',
         'experiment_igf_id': 'IGF103923_MISEQ1',
         'library_name': 'IGF103923_1',
         'library_source': 'GENOMIC_SINGLE_CELL',
         'library_strategy': 'WGS',
         'experiment_type': 'UNKNOWN',
         'library_layout': 'PAIRED',
         'platform_name': 'MISEQ'
     }]
     ea = ExperimentAdaptor(**{'session': base.session})
     ea.store_project_and_attribute_data(data=experiment_data)
     run_data = [{
         'experiment_igf_id': 'IGF103923_MISEQ',
         'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'run_igf_id': 'IGF103923_MISEQ_000000000-BRN47_1',
         'lane_number': '1'
     }, {
         'experiment_igf_id': 'IGF103923_MISEQ1',
         'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'run_igf_id': 'IGF103923_MISEQ1_000000000-BRN47_1',
         'lane_number': '1'
     }]
     ra = RunAdaptor(**{'session': base.session})
     ra.store_run_and_attribute_data(data=run_data)
     base.close_session()
    def _build_and_store_exp_run_and_collection_in_db(self,fastq_files_list, \
                                                      restricted_list=('10X')):
        '''
    An internal method for building db collections for the raw fastq files
    '''
        session_class = self.session_class
        db_connected = False
        try:
            restricted_list = list(restricted_list)
            dataframe = pd.DataFrame(fastq_files_list)
            # calculate additional detail
            dataframe=dataframe.apply(lambda data: \
                                      self._calculate_experiment_run_and_file_info(data,
                                                                     restricted_list),\
                                      axis=1)
            # get file data
            file_group_columns = [
                'name', 'type', 'location', 'R1', 'R1_md5', 'R1_size', 'R2',
                'R2_md5', 'R2_size'
            ]
            file_group_data = dataframe.loc[:, file_group_columns]
            file_group_data = file_group_data.drop_duplicates()
            (file_data, file_group_data) = self._reformat_file_group_data(
                data=file_group_data)
            # get base session
            base = BaseAdaptor(**{'session_class': session_class})
            base.start_session()
            db_connected = True
            # get experiment data
            experiment_columns=base.get_table_columns(table_name=Experiment, \
                                                      excluded_columns=['experiment_id',
                                                                        'project_id',
                                                                        'sample_id' ])
            experiment_columns.extend(['project_igf_id', 'sample_igf_id'])
            exp_data = dataframe.loc[:, experiment_columns]
            exp_data = exp_data.drop_duplicates()
            if exp_data.index.size > 0:
                exp_data=exp_data.apply(lambda x: \
                                        self._check_existing_data(\
                                              data=x,\
                                              dbsession=base.session,\
                                              table_name='experiment',\
                                              check_column='EXISTS'),\
                                        axis=1)
                exp_data = exp_data[exp_data['EXISTS'] ==
                                    False]  # filter existing experiments
                exp_data.drop('EXISTS', axis=1,
                              inplace=True)  # remove extra columns
                exp_data = exp_data[pd.isnull(exp_data['experiment_igf_id']) ==
                                    False]  # filter exp with null values
            # get run data
            run_columns=base.get_table_columns(table_name=Run, \
                                               excluded_columns=['run_id',
                                                                 'seqrun_id',
                                                                 'experiment_id',
                                                                 'date_created',
                                                                 'status'
                                                                ])
            run_columns.extend([
                'seqrun_igf_id', 'experiment_igf_id', 'R1_READ_COUNT',
                'R2_READ_COUNT'
            ])
            run_data = dataframe.loc[:, run_columns]
            run_data = run_data.drop_duplicates()
            if run_data.index.size > 0:
                run_data=run_data.apply(lambda x: \
                                        self._check_existing_data(\
                                              data=x,\
                                              dbsession=base.session,\
                                              table_name='run',\
                                              check_column='EXISTS'),\
                                        axis=1)
                run_data = run_data[run_data['EXISTS'] ==
                                    False]  # filter existing runs
                run_data.drop('EXISTS', axis=1,
                              inplace=True)  # remove extra columns
                run_data = run_data[pd.isnull(run_data['run_igf_id']) ==
                                    False]  # filter run with null values
            # get collection data
            collection_columns = ['name', 'type', 'table']
            collection_data = dataframe.loc[:, collection_columns]
            collection_data = collection_data.drop_duplicates()
            if collection_data.index.size > 0:
                collection_data=collection_data.apply(lambda x: \
                                        self._check_existing_data( \
                                              data=x, \
                                              dbsession=base.session, \
                                              table_name='collection', \
                                              check_column='EXISTS'), \
                                        axis=1)
                collection_data = collection_data[collection_data[
                    'EXISTS'] == False]  # filter existing collection
                collection_data.drop('EXISTS', axis=1,
                                     inplace=True)  # remove extra columns
                collection_data = collection_data[pd.isnull(
                    collection_data['name']
                ) == False]  # filter collection with null values
            # store experiment to db
            if exp_data.index.size > 0:
                ea = ExperimentAdaptor(**{'session': base.session})
                ea.store_project_and_attribute_data(data=exp_data,
                                                    autosave=False)
                base.session.flush()
            # store run to db
            if run_data.index.size > 0:
                ra = RunAdaptor(**{'session': base.session})
                ra.store_run_and_attribute_data(data=run_data, autosave=False)
                base.session.flush()
            # store file to db

            fa = FileAdaptor(**{'session': base.session})
            fa.store_file_and_attribute_data(data=file_data, autosave=False)
            base.session.flush()
            # store collection to db
            ca = CollectionAdaptor(**{'session': base.session})
            if collection_data.index.size > 0:
                ca.store_collection_and_attribute_data(data=collection_data,\
                                                       autosave=False)
                base.session.flush()
            ca.create_collection_group(data=file_group_data, autosave=False)
            base.commit_session()
            self._write_manifest_file(file_data)
        except:
            if db_connected:
                base.rollback_session()
            raise
        finally:
            if db_connected:
                base.close_session()