Python SampleSheetParserの例、flowcell_parser.classes.SampleSheetParser Pythonの例

コード例 #1

0

ファイルを表示

ファイル: hiseqx.py プロジェクト: kate-v-stepanova/hugin

 def sample_sheet(self):
     """Return the parsed sample sheet data, loading and caching it on first use.

     Lookup order:
       1. ``SampleSheet.csv`` directly under ``self.path``;
       2. ``<config['sample_sheet_path']['hiseqx']>/<self.name>/SampleSheet.csv``.

     The result is stored in ``self._sample_sheet`` so later calls return
     the cached value.

     Raises:
         RuntimeError: if the sheet is not under ``self.path`` and either
             the fallback location is not configured or no sheet exists
             there.
     """
     if self._sample_sheet is not None:
         return self._sample_sheet

     local_sheet = os.path.join(self.path, 'SampleSheet.csv')
     if os.path.exists(local_sheet):
         self._sample_sheet = SampleSheetParser(local_sheet).data
         return self._sample_sheet

     # Not in the flowcell directory: fall back to the configured location.
     logging.warning("SampleSheet.csv does not exist: {}".format(
         os.path.abspath(local_sheet)))
     fallback_root = config.get('sample_sheet_path', {}).get('hiseqx')
     if fallback_root is None:
         logging.error("'sample_sheet_path' missing in the config file")
         raise RuntimeError(
             "'sample_sheet_path' missing in the config file: {}".format(
                 config.get('config_path')))

     fallback_sheet = os.path.join(fallback_root, self.name,
                                   'SampleSheet.csv')
     if not os.path.exists(fallback_sheet):
         logging.error(
             "SampleSheet.csv does not exist at {}".format(fallback_sheet))
         raise RuntimeError(
             "SampleSheet.csv does not exist at {}".format(fallback_sheet))

     self._sample_sheet = SampleSheetParser(fallback_sheet).data
     return self._sample_sheet

コード例 #2

0

ファイルを表示

    def _copy_samplesheet(self):
        """Create the run-local SampleSheet.csv and attach its parsed form to the run parser.

        The original sample sheet (from ``self._get_samplesheet()``) is parsed,
        its samples are classified into ``self.sample_table``, and, unless
        ``<run_dir>/SampleSheet.csv`` already exists, a cleaned sheet is
        generated and written there. The resulting file is then parsed into
        ``self.runParserObj.samplesheet``.

        Returns:
            False if the cleaned sample sheet could not be generated or
            written; otherwise None (implicit).

        Raises:
            RuntimeError: if ``tenX_index_path`` or ``smartseq_index_path`` is
                missing from ``self.CONFIG['bcl2fastq']``.
        """
        ssname = self._get_samplesheet()
        ssparser = SampleSheetParser(ssname)
        indexfile = dict()
        # Loading index files
        try:
            indexfile['tenX'] = self.CONFIG['bcl2fastq']['tenX_index_path']
        except KeyError:
            logger.error(
                'Path to index file (10X) not found in the config file')
            raise RuntimeError(
                'Path to index file (10X) not found in the config file')
        try:
            indexfile['smartseq'] = self.CONFIG['bcl2fastq'][
                'smartseq_index_path']
        except KeyError:
            logger.error(
                'Path to index file (Smart-seq) not found in the config file')
            raise RuntimeError(
                'Path to index file (Smart-seq) not found in the config file')
        # Samplesheet need to be positioned in the FC directory with name SampleSheet.csv (Illumina default)
        # If this is not the case then create it and take special care of modification to be done on the SampleSheet
        samplesheet_dest = os.path.join(self.run_dir, 'SampleSheet.csv')
        # Function that goes through the original sample sheet and check for sample types
        self.sample_table = _classify_samples(indexfile, ssparser)
        # Check that the samplesheet is not already present. In this case go the next step
        if not os.path.exists(samplesheet_dest):
            try:
                # Build the content BEFORE opening the destination: opening
                # with 'w' creates the file, so a failure during generation
                # would otherwise leave an empty SampleSheet.csv behind and
                # the existence check above would skip regeneration forever.
                clean_samplesheet = _generate_clean_samplesheet(
                    ssparser,
                    indexfile,
                    rename_samples=True,
                    rename_qPCR_suffix=True,
                    fields_qPCR=[ssparser.dfield_snm])
                with open(samplesheet_dest, 'w') as fcd:
                    fcd.write(clean_samplesheet)
            except Exception as e:
                # Do not leave a partially written sheet on disk.
                if os.path.exists(samplesheet_dest):
                    os.remove(samplesheet_dest)
                logger.error(
                    'Encountered the following exception {}'.format(e))
                return False
            logger.info(
                ('Created SampleSheet.csv for Flowcell {} in {} '.format(
                    self.id, samplesheet_dest)))
        # SampleSheet.csv generated

        # When demultiplexing SampleSheet.csv is the one I need to use
        # Need to rewrite so that SampleSheet_0.csv is always used.
        self.runParserObj.samplesheet = SampleSheetParser(
            os.path.join(self.run_dir, 'SampleSheet.csv'))
        # Only fill in the parsed sheet data if the run parser has none yet.
        if not self.runParserObj.obj.get('samplesheet_csv'):
            self.runParserObj.obj[
                'samplesheet_csv'] = self.runParserObj.samplesheet.data

コード例 #3

0

ファイルを表示

ファイル: NextSeq_Runs.py プロジェクト: zhanglingfei/TACA

    def demultiplex_run(self):
        """ Demultiplex a NextSeq run:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep)
            - run bcl2fastq conversion

        Returns:
            True once the bcl2fastq command has been handed to
            ``misc.call_external_command_detached``; False if ``self.ssname``
            does not exist or SampleSheet.csv could not be written.
        """
        if not os.path.exists(self.ssname):
            # We should not get here really and this run should be defined as NON NGI-RUN
            return False
        # TODO SampleSheetParser may throw an exception
        ssparser = SampleSheetParser(self.ssname)
        # Samplesheet need to be positioned in the FC directory with name SampleSheet.csv (Illumina default)
        # if this is not the case then create it and take special care of modification to be done on the SampleSheet
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        # Check that the samplesheet is not already present. In this case go the next step
        if not os.path.exists(samplesheet_dest):
            try:
                # NOTE(review): 'wb' requires bytes on Python 3 - confirm what
                # _generate_clean_samplesheet returns before porting.
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(self._generate_clean_samplesheet(ssparser))
            except Exception as e:
                # Remove a partially written sheet so the next attempt
                # regenerates it instead of reusing a broken file.
                if os.path.exists(samplesheet_dest):
                    os.remove(samplesheet_dest)
                logger.error(e)
                return False
            logger.info(
                ("Created SampleSheet.csv for Flowcell {} in {} ".format(
                    self.id, samplesheet_dest)))
        # SampleSheet.csv generated to be used in bcl2fastq
        self.runParserObj.samplesheet = SampleSheetParser(
            os.path.join(self.run_dir, "SampleSheet.csv"))
        # Make the demux call
        with chdir(self.run_dir):
            bcl2fastq_config = self.CONFIG.get('bcl2fastq')
            cl = [bcl2fastq_config['bin']]
            # `in` instead of dict.has_key(): has_key was removed in Python 3.
            if 'options' in bcl2fastq_config:
                # Append all options that appear in the configuration file to the main command.
                for option in bcl2fastq_config['options']:
                    if isinstance(option, dict):
                        # Single-entry dict {option: value}; list() keeps the
                        # indexing valid on Python 3 where items() is a view.
                        opt, val = list(option.items())[0]
                        cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        # Bare flag without a value.
                        cl.append('--{}'.format(option))
            logger.info(
                ("BCL to FASTQ conversion and demultiplexing started for "
                 " run {} on {}".format(os.path.basename(self.id),
                                        datetime.now())))
            misc.call_external_command_detached(cl, with_log_files=True)

        return True

コード例 #4

0

ファイルを表示

 def _aggregate_demux_results(self):
     """
     Take the Stats.json files from the different demultiplexing folders and merges them into one

     The sample sheet is split into 10X and non-10X lanes and the actual
     aggregation is delegated to _aggregate_demux_results_simple_complex.

     Raises RuntimeError if 'index_path' is missing from
     self.CONFIG['bcl2fastq'].
     """
     parser = SampleSheetParser(self._get_samplesheet())
     try:
         index_path = self.CONFIG['bcl2fastq']['index_path']
     except KeyError:
         logger.error(
             "Path to index file (10X) not found in the config file")
         raise RuntimeError
     # Split the lanes by whether they carry 10X samples.
     tenx_lanes, regular_lanes = look_for_lanes_with_10X_indicies(
         index_path, parser)
     # Each lane is mapped to an initial value of 0.
     tenx_lane_map = dict.fromkeys(tenx_lanes, 0)
     regular_lane_map = dict.fromkeys(regular_lanes, 0)
     if regular_lane_map:
         self._aggregate_demux_results_simple_complex(
             regular_lane_map, tenx_lane_map)
     else:
         # Only 10X lanes present: pass them as the first argument and
         # leave the second one empty.
         self._aggregate_demux_results_simple_complex(tenx_lane_map, {})

コード例 #5

0

ファイルを表示

    def _copy_samplesheet(self):
        """Create the run-local SampleSheet.csv and attach its parsed form to the run parser.

        The original sample sheet (from ``self._get_samplesheet()``) is parsed,
        its lanes are split into ``self.lanes_10X`` / ``self.lanes_not_10X``,
        and, unless ``<run_dir>/SampleSheet.csv`` already exists, a cleaned
        sheet is generated and written there. The resulting file is then parsed
        into ``self.runParserObj.samplesheet``.

        Returns:
            False if the cleaned sample sheet could not be generated or
            written; otherwise None (implicit).

        Raises:
            RuntimeError: if ``index_path`` is missing from
                ``self.CONFIG['bcl2fastq']``.
        """
        ssname = self._get_samplesheet()
        ssparser = SampleSheetParser(ssname)
        try:
            indexfile = self.CONFIG['bcl2fastq']['index_path']
        except KeyError:
            logger.error(
                "Path to index file (10X) not found in the config file")
            raise RuntimeError(
                "Path to index file (10X) not found in the config file")
        #samplesheet need to be positioned in the FC directory with name SampleSheet.csv (Illumina default)
        #if this is not the case then create it and take special care of modification to be done on the SampleSheet
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        #Function that returns a list of which lanes contains 10X samples.
        (self.lanes_10X,
         self.lanes_not_10X) = look_for_lanes_with_10X_indicies(
             indexfile, ssparser)
        #check that the samplesheet is not already present. In this case go the next step
        if not os.path.exists(samplesheet_dest):
            try:
                # Build the content BEFORE opening the destination: opening
                # the file creates it, so a failure during generation would
                # otherwise leave an empty SampleSheet.csv behind and the
                # existence check above would skip regeneration forever.
                clean_samplesheet = _generate_clean_samplesheet(
                    ssparser,
                    indexfile,
                    rename_samples=True,
                    rename_qPCR_suffix=True,
                    fields_qPCR=[ssparser.dfield_snm])
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(clean_samplesheet)
            except Exception as e:
                # Do not leave a partially written sheet on disk.
                if os.path.exists(samplesheet_dest):
                    os.remove(samplesheet_dest)
                logger.error(
                    "encountered the following exception '{}'".format(e))
                return False
            logger.info(
                ("Created SampleSheet.csv for Flowcell {} in {} ".format(
                    self.id, samplesheet_dest)))
        ##SampleSheet.csv generated

        ##when demultiplexing SampleSheet.csv is the one I need to use
        ## Need to rewrite so that SampleSheet_0.csv is always used.
        self.runParserObj.samplesheet = SampleSheetParser(
            os.path.join(self.run_dir, "SampleSheet.csv"))
        # Only fill in the parsed sheet data if the run parser has none yet.
        if not self.runParserObj.obj.get("samplesheet_csv"):
            self.runParserObj.obj[
                "samplesheet_csv"] = self.runParserObj.samplesheet.data

コード例 #6

0

ファイルを表示

ファイル: test_hiseqx.py プロジェクト: kate-v-stepanova/hugin

    def test_sample_sheet_path(self):
        """Check that the flowcell falls back to the configured sample sheet location.

        The flowcell's own SampleSheet.csv is temporarily renamed so that
        ``fc.sample_sheet`` has to be resolved elsewhere; the result is
        compared with the sheet parsed from the test data directory.
        """
        fc = BaseFlowcell.init_flowcell(self.original_flowcell)
        sample_sheet = os.path.join(fc.path, 'SampleSheet.csv')
        sample_sheet_renamed = os.path.join(fc.path, 'SamapleSheet.csv.bckp')
        os.rename(sample_sheet, sample_sheet_renamed)
        try:
            sample_sheet_path = os.path.join(
                "tests/test_data/sample_sheets/hiseqx", fc.name,
                'SampleSheet.csv')
            sample_sheet_parser = SampleSheetParser(sample_sheet_path)
            self.assertEqual(fc.sample_sheet, sample_sheet_parser.data)
        finally:
            # Always restore the fixture, even when the assertion fails,
            # so a failing run does not break the tests that follow.
            os.rename(sample_sheet_renamed, sample_sheet)

コード例 #7

0

ファイルを表示

    def _copy_samplesheet(self):
        """Copy the original sample sheet into the run directory and write the cleaned SampleSheet.csv.

        The original sheet is copied to ``<run_dir>/<flowcell_id>.csv`` and a
        cleaned version is written to ``<run_dir>/SampleSheet.csv``
        (overwriting any existing file). The cleaned file is then parsed into
        ``self.runParserObj.samplesheet``.

        Returns:
            None if ``self._get_samplesheet()`` returns None (and, implicitly,
            on success); False if the cleaned sheet could not be written.

        Raises:
            RuntimeError: if the original sample sheet cannot be copied.
        """
        ssname = self._get_samplesheet()
        if ssname is None:
            return None
        ssparser = SampleSheetParser(ssname)
        #Copy the original samplesheet locally. Copy again if already done as there might have been changes to the samplesheet
        try:
            shutil.copy(
                ssname,
                os.path.join(self.run_dir, "{}.csv".format(self.flowcell_id)))
        except Exception:
            # `except Exception` rather than a bare `except:` so that
            # KeyboardInterrupt / SystemExit are not turned into RuntimeError.
            raise RuntimeError(
                "unable to copy file {} to destination {}".format(
                    ssname, self.run_dir))

        #this sample sheet has been created by the LIMS and copied by a sequencing operator. It is not ready
        #to be used it needs some editing
        #this will contain the samplesheet with all the renaiming to be used with bcl2fastq-2.17
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        #an existing SampleSheet.csv is overwritten; only log that fact
        if os.path.exists(samplesheet_dest):
            logger.info("SampleSheet.csv found ... overwriting it")
        try:
            with open(samplesheet_dest, 'wb') as fcd:
                fcd.write(self._generate_clean_samplesheet(ssparser))
        except Exception as e:
            logger.error(e)
            return False
        logger.info(("Created SampleSheet.csv for Flowcell {} in {} ".format(
            self.id, samplesheet_dest)))
        ##SampleSheet.csv generated
        ##when demultiplexing SampleSheet.csv is the one I need to use
        self.runParserObj.samplesheet = SampleSheetParser(
            os.path.join(self.run_dir, "SampleSheet.csv"))
        # Only fill in the parsed sheet data if the run parser has none yet.
        if not self.runParserObj.obj.get("samplesheet_csv"):
            self.runParserObj.obj[
                "samplesheet_csv"] = self.runParserObj.samplesheet.data

コード例 #8

0

ファイルを表示

def parse_samplesheet(FCID_samplesheet_origin, run_dir, is_miseq=False):
    """Parse a sample sheet and return its data rows.

    Args:
        FCID_samplesheet_origin: path of the sample sheet handed to
            SampleSheetParser.
        run_dir: run identifier, used only in log messages.
        is_miseq: when True, the sheet is accepted only if its header has a
            'Description' containing 'Production' or 'Application'.

    Returns:
        The parser's ``data``; an empty list if the sheet cannot be parsed or
        (for MiSeq) is not labelled as production/application.
    """
    try:
        ss_reader = SampleSheetParser(FCID_samplesheet_origin)
        data = ss_reader.data
    except Exception:
        # `except Exception` instead of a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate.
        logger.warning("Cannot initialize SampleSheetParser for {}. Most likely due to poor comma separation".format(run_dir))
        return []

    if is_miseq:
        if 'Description' not in ss_reader.header or not \
        ('Production' in ss_reader.header['Description'] or 'Application' in ss_reader.header['Description']):
            logger.warning("Run {} not labelled as production or application. Disregarding it.".format(run_dir))
            #skip this run
            return []
    return data

コード例 #9

0

ファイルを表示

 def _set_run_type(self):
     """Set self.run_type to "NGI-RUN" or "NON-NGI-RUN".

     The run counts as an NGI run only when
     <run_dir>/Data/Intensities/BaseCalls/SampleSheet.csv exists and its
     header 'Description' is "Production" or "Applications".
     """
     sheet_path = os.path.join(self.run_dir, 'Data', 'Intensities',
                               'BaseCalls', 'SampleSheet.csv')
     # With no sample sheet the run is assumed to be a non NGI run.
     detected_type = "NON-NGI-RUN"
     if os.path.exists(sheet_path):
         description = SampleSheetParser(sheet_path).header['Description']
         if description in ("Production", "Applications"):
             detected_type = "NGI-RUN"
     self.run_type = detected_type

コード例 #10

0

ファイルを表示

ファイル: NextSeq_Runs.py プロジェクト: zhanglingfei/TACA

 def _set_run_type(self):
     """Set self.run_type to "NGI-RUN" or "NON-NGI-RUN" from the sample sheet header.

     A run is an NGI run when self.ssname exists and its header
     'Description' is "Production", "Application" or "Private". A missing
     sample sheet or a header without 'Description' yields "NON-NGI-RUN".
     For "Private" runs self.transfer_to_analysis_server is additionally
     set to False.
     """
     if not os.path.exists(self.ssname):
         # Case in which no samplesheet is found, assume it is a non NGI run
         self.run_type = "NON-NGI-RUN"
     else:
         # it SampleSheet exists try to see if it is a NGI-run
         # TODO SampleSheetParser may throw an exception
         ssparser = SampleSheetParser(self.ssname)
         # A sheet without a 'Description' header used to raise KeyError
         # here; treat it as "no description" instead.
         try:
             description = ssparser.header['Description']
         except KeyError:
             description = None
         if description in ("Production", "Application", "Private"):
             self.run_type = "NGI-RUN"
         else:
             # otherwise this is a non NGI run
             self.run_type = "NON-NGI-RUN"
         # Jose : This is a hack so to not break the naming convention in the NGI
         # The idea is that private costumers might sequence reads and in that
         # case the demultiplexed reads should not be transfered to Uppmax
         if description == "Private":
             self.transfer_to_analysis_server = False

コード例 #11

0

ファイルを表示

    def demultiplex_run(self):
        """
           Demultiplex a Xten run:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep)
            - run bcl2fastq conversion

           Returns True once the bcl2fastq command has been handed to
           misc.call_external_command_detached; False if SampleSheet.csv
           could not be generated or a lane has more than one base mask.
        """

        ssname = self._get_samplesheet()
        ssparser = SampleSheetParser(ssname)
        #samplesheet need to be positioned in the FC directory with name SampleSheet.csv (Illumina default)
        #if this is not the case then create it and take special care of modification to be done on the SampleSheet
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        #check that the samplesheet is not already present. In this case go the next step
        if not os.path.exists(samplesheet_dest):
            try:
                # Build the content BEFORE opening the destination: opening
                # the file creates it, so a failure during generation would
                # otherwise leave an empty SampleSheet.csv behind and the
                # existence check above would skip regeneration forever.
                clean_samplesheet = _generate_clean_samplesheet(
                    ssparser,
                    fields_to_remove=['index2'],
                    rename_samples=True,
                    rename_qPCR_suffix=True,
                    fields_qPCR=[ssparser.dfield_snm])
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(clean_samplesheet)
            except Exception as e:
                # Do not leave a partially written sheet on disk.
                if os.path.exists(samplesheet_dest):
                    os.remove(samplesheet_dest)
                # Log the exception itself: generic exceptions have no
                # `.text` attribute, so `e.text` raised AttributeError here
                # and hid the real error.
                logger.error(e)
                return False
            logger.info(
                ("Created SampleSheet.csv for Flowcell {} in {} ".format(
                    self.id, samplesheet_dest)))
        ##SampleSheet.csv generated
        ##when demultiplexing SampleSheet.csv is the one I need to use
        self.runParserObj.samplesheet = SampleSheetParser(
            os.path.join(self.run_dir, "SampleSheet.csv"))

        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([
            len(per_lane_base_masks[base_masks])
            for base_masks in per_lane_base_masks
        ])
        if max_different_base_masks > 1:
            # in a HiSeqX run I cannot have different index sizes in the SAME lane
            logger.error(
                "In FC {} found one or more lane with more than one base mask "
                "(i.e., different index sizes in the same lane)".format(
                    self.id))
            return False
        #I have everything to run demultiplexing now.
        logger.info('Building bcl2fastq command')

        with chdir(self.run_dir):
            bcl2fastq_config = self.CONFIG.get('bcl2fastq')
            cl = [bcl2fastq_config['bin']]
            # `in` instead of dict.has_key(): has_key was removed in Python 3.
            if 'options' in bcl2fastq_config:
                # Append all options that appear in the configuration file to the main command.
                for option in bcl2fastq_config['options']:
                    if isinstance(option, dict):
                        # Single-entry dict {option: value}; list() keeps the
                        # indexing valid on Python 3 where items() is a view.
                        opt, val = list(option.items())[0]
                        cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        cl.append('--{}'.format(option))
            #now add the base_mask for each lane
            for lane in sorted(per_lane_base_masks):
                #iterate thorugh each lane and add the correct --use-bases-mask for that lane
                #there is a single basemask for each lane, I checked it a couple of lines above
                base_mask = [
                    per_lane_base_masks[lane][bm]['base_mask']
                    for bm in per_lane_base_masks[lane]
                ][0]  # get the base_mask
                base_mask_expr = "{}:".format(lane) + ",".join(base_mask)
                cl.extend(["--use-bases-mask", base_mask_expr])

            logger.info(
                ("BCL to FASTQ conversion and demultiplexing started for "
                 " run {} on {}".format(os.path.basename(self.id),
                                        datetime.now())))
            misc.call_external_command_detached(cl, with_log_files=True)
        return True

コード例 #12

0

ファイルを表示

ファイル: index_fixer.py プロジェクト: SciLifeLab/standalone_scripts

def main(path, swap, rc1, rc2, platform, ss):
    """Rewrite the index columns of a sample sheet and save it as ``<name>_redemux.csv``.

    Args:
        path: path of the sample sheet to read.
        swap: swap index 1 and index 2 (dual-index sheets only).
        rc1: reverse-complement index 1.
        rc2: reverse-complement index 2 (dual-index sheets only).
        platform: "hiseq" (both indexes in one 'Index' column, separated by
            '-' or '+'), "miseq" or "hiseqx" (separate index columns).
        ss: sample-swap request; not implemented, exits if truthy.

    Exits via ``sys.exit`` when index 2 / swap operations are requested on a
    single-index sheet, or when ``ss`` is set.
    """
    # Local import: the surrounding file's import block is not visible here.
    import os

    ss_reader = SampleSheetParser(path)
    ss_data = ss_reader.data
    single = True
    dual_index_re = re.compile(r'([ATCG]{4,12})[-+]([ATCG]{4,12})')

    def reverse_complement(sequence):
        # Complement every base of the reversed sequence.
        return "".join(nuc_compliment(nuc) for nuc in sequence[::-1])

    # Column names differ per platform; dual/single indexing is decided from
    # the first row only.
    if platform == "hiseq":
        index1 = 'Index'
        if re.search('[-+]', (ss_data[0][index1])):
            single = False

    elif platform == "miseq":
        index1 = 'index'
        index2 = 'index2'
        if index2 in ss_data[0]:
            single = False

    elif platform == "hiseqx":
        index1 = 'index1'
        index2 = 'index2'
        single = False

    if single:
        #Sanity check
        if rc2 or swap:
            sys.exit("Single index. Cannot change index 2, nor swap indexes")

        #Reverse compliment
        if rc1:
            for row in ss_data:
                index_in = re.match('([ATCG]{4,12})', row[index1])
                if index_in:
                    row[index1] = reverse_complement(index_in.group(1))
    else:
        #Reverse Compliment
        if rc1 or rc2:
            for row in ss_data:
                if platform == "hiseq":
                    index_in = dual_index_re.match(row[index1])
                    if not index_in:
                        # Row without a well-formed dual index: leave it
                        # untouched, as the single-index branch does.
                        continue
                    first, second = index_in.groups()
                    # Update both halves before writing back, so that rc1
                    # and rc2 can be combined (previously the rc2 write
                    # discarded the rc1 result).
                    if rc1:
                        first = reverse_complement(first)
                    if rc2:
                        second = reverse_complement(second)
                    row[index1] = '{}-{}'.format(first, second)

                elif platform == "miseq" or platform == "hiseqx":
                    # Use the per-platform column names (miseq stores index 1
                    # under 'index', not 'index1').
                    if rc1:
                        row[index1] = reverse_complement(row[index1])
                    if rc2:
                        row[index2] = reverse_complement(row[index2])
        #Swap indexes
        if swap:
            for row in ss_data:
                if platform == "hiseq":
                    index_in = dual_index_re.match(row[index1])
                    if not index_in:
                        continue
                    row[index1] = '{}-{}'.format(index_in.group(2), index_in.group(1))

                elif platform == "miseq" or platform == "hiseqx":
                    row[index1], row[index2] = row[index2], row[index1]

    #Rearrange samples
    if ss:
        #Need to catch all samples in a list prior to writing, then dump them in corrected order
        sys.exit("Sample Swap isn't implemented yet.")

    #redemux_ss = ss_reader.generate_clean_samplesheet()
    redemux_ss = generate_samplesheet(ss_reader)
    if platform == "hiseq" or platform == "hiseqx":
        # Base name without extension; also works for paths that contain no
        # directory separator.
        filename = os.path.splitext(os.path.basename(path))[0]
    else:
        filename = "SampleSheet"

    with open('{}_redemux.csv'.format(filename), 'w') as fh_out:
        fh_out.write(redemux_ss)

コード例 #13

0

ファイルを表示

    def demultiplex_run(self):
        """
           Demultiplex a Xten run:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep)
            - run bcl2fastq conversion

           Lanes are split into 10X and non-10X lanes; one detached bcl2fastq
           command is started per non-empty group.

           Returns True once the commands have been started; False if
           SampleSheet.csv could not be generated or a lane has more than one
           base mask. Raises RuntimeError if 'index_path' is missing from
           self.CONFIG['bcl2fastq'].
        """
        ssname = self._get_samplesheet()
        ssparser = SampleSheetParser(ssname)
        try:
            indexfile = self.CONFIG['bcl2fastq']['index_path']
        except KeyError:
            logger.error(
                "Path to index file (10X) not found in the config file")
            raise RuntimeError(
                "Path to index file (10X) not found in the config file")
        #samplesheet need to be positioned in the FC directory with name SampleSheet.csv (Illumina default)
        #if this is not the case then create it and take special care of modification to be done on the SampleSheet
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        #Function that returns a list of which lanes contains 10X samples.
        (lanes_10X, lanes_not_10X) = look_for_lanes_with_10X_indicies(
            indexfile, ssparser)
        #check that the samplesheet is not already present. In this case go the next step
        if not os.path.exists(samplesheet_dest):
            try:
                # Build the content BEFORE opening the destination: opening
                # the file creates it, so a failure during generation would
                # otherwise leave an empty SampleSheet.csv behind and the
                # existence check above would skip regeneration forever.
                clean_samplesheet = _generate_clean_samplesheet(
                    ssparser,
                    indexfile,
                    fields_to_remove=['index2'],
                    rename_samples=True,
                    rename_qPCR_suffix=True,
                    fields_qPCR=[ssparser.dfield_snm])
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(clean_samplesheet)
            except Exception as e:
                # Do not leave a partially written sheet on disk.
                if os.path.exists(samplesheet_dest):
                    os.remove(samplesheet_dest)
                logger.error(
                    "encountered the following exception '{}'".format(e))
                return False
            logger.info(
                ("Created SampleSheet.csv for Flowcell {} in {} ".format(
                    self.id, samplesheet_dest)))
        ##SampleSheet.csv generated

        ##when demultiplexing SampleSheet.csv is the one I need to use
        ## Need to rewrite so that SampleSheet_0.csv is always used.
        self.runParserObj.samplesheet = SampleSheetParser(
            os.path.join(self.run_dir, "SampleSheet.csv"))
        #we have 10x lane - need to split the  samples sheet and build a 10x command for bcl2fastq
        # A "complex" run has both 10X and non-10X lanes.
        Complex_run = False
        if len(lanes_10X) and len(lanes_not_10X):
            Complex_run = True

        if Complex_run:
            with chdir(self.run_dir):
                # SampleSheet_0.csv: non-10X lanes, SampleSheet_1.csv: 10X lanes.
                samplesheet_dest_not_10X = "SampleSheet_0.csv"
                with open(samplesheet_dest_not_10X, 'wb') as fcd:
                    fcd.write(
                        _generate_samplesheet_subset(
                            self.runParserObj.samplesheet, lanes_not_10X))
                samplesheet_dest_10X = "SampleSheet_1.csv"
                with open(samplesheet_dest_10X, 'wb') as fcd:
                    fcd.write(
                        _generate_samplesheet_subset(
                            self.runParserObj.samplesheet, lanes_10X))
        else:
            # Only one kind of lane: the full sheet is used as SampleSheet_0.csv.
            with chdir(self.run_dir):
                shutil.copy("SampleSheet.csv", "SampleSheet_0.csv")

        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([
            len(per_lane_base_masks[base_masks])
            for base_masks in per_lane_base_masks
        ])
        if max_different_base_masks > 1:
            # in a HiSeqX run I cannot have different index sizes in the SAME lane
            logger.error(
                "In FC {} found one or more lane with more than one base mask "
                "(i.e., different index sizes in the same lane)".format(
                    self.id))
            return False
        # Counter used to number the bcl2fastq commands and their log prefixes.
        bcl2fastq_cmd_counter = 0
        with chdir(self.run_dir):
            # create Demultiplexing dir, this changes the status to IN_PROGRESS
            if not os.path.exists("Demultiplexing"):
                os.makedirs("Demultiplexing")
        with chdir(self.run_dir):
            if lanes_not_10X:
                cmd_normal = self.generate_bcl_command(lanes_not_10X,
                                                       bcl2fastq_cmd_counter)
                misc.call_external_command_detached(
                    cmd_normal,
                    with_log_files=True,
                    prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(
                    ("BCL to FASTQ conversion and demultiplexing started for "
                     "normal run {} on {}".format(os.path.basename(self.id),
                                                  datetime.now())))
                bcl2fastq_cmd_counter += 1
            if lanes_10X:
                cmd_10X = self.generate_bcl_command(lanes_10X,
                                                    bcl2fastq_cmd_counter,
                                                    is_10X=True)
                misc.call_external_command_detached(
                    cmd_10X,
                    with_log_files=True,
                    prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(
                    ("BCL to FASTQ conversion and demultiplexing started for "
                     "10X run {} on {}".format(os.path.basename(self.id),
                                               datetime.now())))
                bcl2fastq_cmd_counter += 1
        return True

コード例 #14

0

ファイルを表示

    def demultiplex_run(self):
        """
        Demultiplex a HiSeq run:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - create multiple SampleSheets in case at least one lane have multiple indexes lengths
            - run bcl2fastq conversion

        Returns None if no sample sheet is found (and, implicitly, after the
        bcl2fastq commands have been started); False if SampleSheet.csv could
        not be written. Raises RuntimeError if the original sample sheet
        cannot be copied into the run directory.
        """

        ssname = self._get_samplesheet()
        if ssname is None:
            return None
        ssparser = SampleSheetParser(ssname)
        #Copy the original samplesheet locally. Copy again if already done as there might have been changes to the samplesheet
        try:
            shutil.copy(
                ssname,
                os.path.join(self.run_dir, "{}.csv".format(self.flowcell_id)))
        except Exception:
            # `except Exception` rather than a bare `except:` so that
            # KeyboardInterrupt / SystemExit are not turned into RuntimeError.
            raise RuntimeError(
                "unable to copy file {} to destination {}".format(
                    ssname, self.run_dir))

        #this sample sheet has been created by the LIMS and copied by a sequencing operator. It is not ready
        #to be used it needs some editing
        #this will contain the samplesheet with all the renaiming to be used with bcl2fastq-2.17
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        #an existing SampleSheet.csv is overwritten; only log that fact
        if os.path.exists(samplesheet_dest):
            logger.info("SampleSheet.csv found ... overwriting it")
        try:
            with open(samplesheet_dest, 'wb') as fcd:
                fcd.write(self._generate_clean_samplesheet(ssparser))
        except Exception as e:
            # Log the exception itself: generic exceptions have no `.text`
            # attribute, so `e.text` raised AttributeError here and hid the
            # real error.
            logger.error(e)
            return False
        logger.info(("Created SampleSheet.csv for Flowcell {} in {} ".format(
            self.id, samplesheet_dest)))
        ##SampleSheet.csv generated
        ##when demultiplexing SampleSheet.csv is the one I need to use
        self.runParserObj.samplesheet = SampleSheetParser(
            os.path.join(self.run_dir, "SampleSheet.csv"))
        #now geenrate the base masks per lane and decide how to demultiplex
        per_lane_base_masks = self._generate_per_lane_base_mask()
        # NOTE(review): the value is not used below, but max() still raises
        # ValueError when no lanes are returned - kept to preserve behaviour.
        max_different_base_masks = max([
            len(per_lane_base_masks[base_masks])
            for base_masks in per_lane_base_masks
        ])
        #if max_different is one, then I have a simple config and I can run a single command. Otherwirse I need to run multiples instances
        #extract lanes with a single base masks
        simple_lanes = {}
        complex_lanes = {}
        for lane in per_lane_base_masks:
            if len(per_lane_base_masks[lane]) == 1:
                simple_lanes[lane] = per_lane_base_masks[lane]
            else:
                complex_lanes[lane] = per_lane_base_masks[lane]
        #simple_lanes holds the lanes with exactly one base mask, complex_lanes all the others
        bcl2fastq_commands = []
        bcl2fastq_command_num = 0
        if len(simple_lanes) > 0:
            bcl2fastq_commands.append(
                self._generate_bcl2fastq_command(simple_lanes, True,
                                                 bcl2fastq_command_num))
            bcl2fastq_command_num += 1
        #compute the different masks, there will be one bcl2fastq command per mask
        base_masks_complex = [
            complex_lanes[base_masks].keys() for base_masks in complex_lanes
        ]
        different_masks = list(
            set([item for sublist in base_masks_complex for item in sublist]))
        for mask in different_masks:
            base_masks_complex_to_demux = {}
            for lane in complex_lanes:
                # `in` instead of dict.has_key(): has_key was removed in Python 3.
                if mask in complex_lanes[lane]:
                    base_masks_complex_to_demux[lane] = {}
                    base_masks_complex_to_demux[lane][mask] = complex_lanes[
                        lane][mask]
            #at this point base_masks_complex_to_demux contains only a base mask for lane. I can build the command
            bcl2fastq_commands.append(
                self._generate_bcl2fastq_command(base_masks_complex_to_demux,
                                                 True, bcl2fastq_command_num))
            bcl2fastq_command_num += 1
        #now bcl2fastq_commands contains all command to be executed. They can be executed in parallel, however run only one per time in order to avoid to overload the machine
        with chdir(self.run_dir):
            # create Demultiplexing dir, in this way the status of this run will became IN_PROGRESS
            if not os.path.exists("Demultiplexing"):
                os.makedirs("Demultiplexing")
            execution = 0
            for bcl2fastq_command in bcl2fastq_commands:
                misc.call_external_command_detached(
                    bcl2fastq_command,
                    with_log_files=True,
                    prefix="demux_{}".format(execution))
                execution += 1