Esempio n. 1
0
    def demultiplex(self):
        """Perform demultiplexing of the flowcell.

        Takes software (bcl2fastq version to use) and parameters from the
        ``analysis`` section of the configuration file, builds the bcl2fastq
        command line and hands it to ``misc.call_external_command_detached``
        from inside ``self.run_dir``.
        """
        logger.info('Building bcl2fastq command')
        bcl2fastq_config = CONFIG['analysis'].get('bcl2fastq')
        with chdir(self.run_dir):
            # The executable to run is selected by the run type
            cl = [bcl2fastq_config.get(self.run_type)]
            if 'options' in bcl2fastq_config:
                # Append all options that appear in the configuration file to the main command.
                # Options that require a value, i.e --use-bases-mask Y8,I8,Y8, are given
                # as a dictionary, while options that do not require a value,
                # i.e --no-lane-splitting, are given as a simple string
                for option in bcl2fastq_config['options']:
                    if isinstance(option, dict):
                        # Read the pair without popitem(): popping would empty the
                        # dictionary stored in the shared CONFIG, so the next call
                        # in the same process would fail on an empty dict
                        opt, val = next(iter(option.items()))
                        cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        cl.append('--{}'.format(option))

            logger.info(("BCL to FASTQ conversion and demultiplexing started for "
                         " run {} on {}".format(os.path.basename(self.id), datetime.now())))

            misc.call_external_command_detached(cl, with_log_files=True)
Esempio n. 2
0
    def demultiplex_run(self):
        """Demultiplex a Xten run.

        Steps:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep)
            - run bcl2fastq conversion

        Returns:
            True once the bcl2fastq command has been handed to
            ``misc.call_external_command_detached``; False if SampleSheet.csv
            could not be written or a lane has more than one base mask.
        """
        ssname = self._get_samplesheet()
        ssparser = SampleSheetParser(ssname)
        # The samplesheet needs to be positioned in the FC directory with name
        # SampleSheet.csv (Illumina default). If it is not there yet, create it.
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        if not os.path.exists(samplesheet_dest):
            try:
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(_generate_clean_samplesheet(ssparser,
                                                          fields_to_remove=['index2'],
                                                          rename_samples=True,
                                                          rename_qPCR_suffix=True,
                                                          fields_qPCR=['SampleName']))
            except Exception as e:
                # Do not leave a truncated SampleSheet.csv behind: the existence
                # check above would treat it as valid on the next attempt
                if os.path.exists(samplesheet_dest):
                    os.remove(samplesheet_dest)
                # Log the exception itself; generic exceptions have no ``text`` attribute
                logger.error(e)
                return False
            logger.info(("Created SampleSheet.csv for Flowcell {} in {} ".format(self.id, samplesheet_dest)))
        # SampleSheet.csv is the samplesheet to use when demultiplexing
        self.runParserObj.samplesheet = SampleSheetParser(samplesheet_dest)

        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max(len(lane_masks) for lane_masks in per_lane_base_masks.values())
        if max_different_base_masks > 1:
            # in a HiSeqX run I cannot have different index sizes in the SAME lane
            logger.error("In FC {} found one or more lane with more than one base mask "
                         "(i.e., different index sizes in the same lane)".format(self.id))
            return False
        # Everything needed to run demultiplexing is available now
        logger.info('Building bcl2fastq command')

        with chdir(self.run_dir):
            bcl2fastq_config = self.CONFIG.get('bcl2fastq')
            cl = [bcl2fastq_config['bin']]
            if 'options' in bcl2fastq_config:
                # Append all options that appear in the configuration file to the main command.
                for option in bcl2fastq_config['options']:
                    if isinstance(option, dict):
                        # Dictionary options carry the option name and its value
                        opt, val = next(iter(option.items()))
                        cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        cl.append('--{}'.format(option))
            # Add the --use-bases-mask for each lane; there is a single base mask
            # per lane, as checked above
            for lane in sorted(per_lane_base_masks):
                base_mask = next(iter(per_lane_base_masks[lane].values()))['base_mask']
                base_mask_expr = "{}:".format(lane) + ",".join(base_mask)
                cl.extend(["--use-bases-mask", base_mask_expr])

            logger.info(("BCL to FASTQ conversion and demultiplexing started for "
                         " run {} on {}".format(os.path.basename(self.id), datetime.now())))
            misc.call_external_command_detached(cl, with_log_files=True)
        return True
Esempio n. 3
0
    def demultiplex_run(self):
        """Demultiplex a Xten run, splitting 10X lanes from the other lanes.

        Steps:
            - write one sub-samplesheet per lane group (SampleSheet_0.csv and,
              when both 10X and non-10X lanes are present, SampleSheet_1.csv)
            - check that no lane has more than one base mask
            - create the Demultiplexing directory
            - launch one bcl2fastq command per lane group

        Returns:
            True once the commands have been handed to
            ``misc.call_external_command_detached``; False if a lane has more
            than one base mask.
        """
        # A run mixing 10X and non-10X lanes needs the samplesheet split in two;
        # otherwise a single samplesheet covers whichever group is present
        if self.lanes_10X and self.lanes_not_10X:
            lane_groups = [self.lanes_not_10X, self.lanes_10X]
        else:
            lane_groups = [self.lanes_10X or self.lanes_not_10X]

        with chdir(self.run_dir):
            for sheet_number, lanes in enumerate(lane_groups):
                with open("SampleSheet_{}.csv".format(sheet_number), 'wb') as fcd:
                    fcd.write(_generate_samplesheet_subset(self.runParserObj.samplesheet, lanes))

        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max(len(lane_masks) for lane_masks in per_lane_base_masks.values())
        if max_different_base_masks > 1:
            # in a HiSeqX run I cannot have different index sizes in the SAME lane
            logger.error("In FC {} found one or more lane with more than one base mask "
                         "(i.e., different index sizes in the same lane)".format(self.id))
            return False

        # The counter numbers the commands and is used for the log-file prefix
        bcl2fastq_cmd_counter = 0
        with chdir(self.run_dir):
            # create Demultiplexing dir, this changes the status to IN_PROGRESS
            if not os.path.exists("Demultiplexing"):
                os.makedirs("Demultiplexing")
            if self.lanes_not_10X:
                cmd_normal = self.generate_bcl_command(self.lanes_not_10X, bcl2fastq_cmd_counter)
                misc.call_external_command_detached(cmd_normal,
                                                    with_log_files=True,
                                                    prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(("BCL to FASTQ conversion and demultiplexing started for "
                             "normal run {} on {}".format(os.path.basename(self.id), datetime.now())))
                bcl2fastq_cmd_counter += 1
            if self.lanes_10X:
                cmd_10X = self.generate_bcl_command(self.lanes_10X, bcl2fastq_cmd_counter, is_10X=True)
                misc.call_external_command_detached(cmd_10X,
                                                    with_log_files=True,
                                                    prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(("BCL to FASTQ conversion and demultiplexing started for "
                             "10X run {} on {}".format(os.path.basename(self.id), datetime.now())))
                bcl2fastq_cmd_counter += 1
        return True
Esempio n. 4
0
 def test_call_external_command_detached(self):
     """Call external command detached.

     Runs ``touch`` through ``misc.call_external_command_detached`` and checks
     that the touched file and both log files (prefix ``test_det``) exist.
     The log files are removed even when an assertion fails, so a failing run
     does not leave them behind in the working directory.
     """
     new_file = os.path.join(self.rootdir, 'test_call_external_det')
     command = 'touch ' + new_file
     log_files = ['test_det_touch.out', 'test_det_touch.err']
     try:
         misc.call_external_command_detached(command,
                                             with_log_files=True,
                                             prefix='test_det')
         # Give the detached command a moment to finish
         time.sleep(0.1)
         self.assertTrue(os.path.isfile(new_file))
         for log_file in log_files:
             self.assertTrue(os.path.isfile(log_file))
     finally:
         for log_file in log_files:
             if os.path.exists(log_file):
                 os.remove(log_file)
Esempio n. 5
0
    def demultiplex_run(self):
        """Demultiplex a NextSeq run.

        Steps:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep)
            - run bcl2fastq conversion

        Returns:
            True once the bcl2fastq command has been handed to
            ``misc.call_external_command_detached``; False if ``self.ssname``
            does not exist or SampleSheet.csv could not be written.
        """
        if not os.path.exists(self.ssname):
            # We should not get here really and this run should be defined as NON NGI-RUN
            return False
        # TODO SampleSheetParser may throw an exception
        ssparser = SampleSheetParser(self.ssname)
        # Samplesheet need to be positioned in the FC directory with name SampleSheet.csv (Illumina default)
        # if this is not the case then create it and take special care of modification to be done on the SampleSheet
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        # Check that the samplesheet is not already present. In this case go the next step
        if not os.path.exists(samplesheet_dest):
            try:
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(self._generate_clean_samplesheet(ssparser))
            except Exception as e:
                # Remove a partially written file so the next attempt regenerates it
                if os.path.exists(samplesheet_dest):
                    os.remove(samplesheet_dest)
                logger.error(e)
                return False
            logger.info(
                ("Created SampleSheet.csv for Flowcell {} in {} ".format(
                    self.id, samplesheet_dest)))
        # SampleSheet.csv generated to be used in bcl2fastq
        self.runParserObj.samplesheet = SampleSheetParser(samplesheet_dest)
        # Make the demux call
        with chdir(self.run_dir):
            bcl2fastq_config = self.CONFIG.get('bcl2fastq')
            cl = [bcl2fastq_config['bin']]
            # ``in`` and ``next(iter(...))`` behave the same on Python 2 and 3,
            # unlike ``has_key`` and indexing the result of ``items()``
            if 'options' in bcl2fastq_config:
                # Append all options that appear in the configuration file to the main command.
                for option in bcl2fastq_config['options']:
                    if isinstance(option, dict):
                        opt, val = next(iter(option.items()))
                        cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        cl.append('--{}'.format(option))
            logger.info(
                ("BCL to FASTQ conversion and demultiplexing started for "
                 " run {} on {}".format(os.path.basename(self.id),
                                        datetime.now())))
            misc.call_external_command_detached(cl, with_log_files=True)

        return True
Esempio n. 6
0
 def demultiplex_run(self):
     """Demultiplex a NextSeq run.

     Steps:
         - find the samplesheet
         - make a local copy of the samplesheet and name it SampleSheet.csv
         - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep)
         - run bcl2fastq conversion

     Returns:
         True once the bcl2fastq command has been handed to
         ``misc.call_external_command_detached``; False if ``self.ssname``
         does not exist or SampleSheet.csv could not be written.
     """
     if not os.path.exists(self.ssname):
         # We should not get here really and this run should be defined as NON NGI-RUN
         return False
     # TODO SampleSheetParser may throw an exception
     ssparser = SampleSheetParser(self.ssname)
     # Samplesheet need to be positioned in the FC directory with name SampleSheet.csv (Illumina default)
     # if this is not the case then create it and take special care of modification to be done on the SampleSheet
     samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
     # Check that the samplesheet is not already present. In this case go the next step
     if not os.path.exists(samplesheet_dest):
         try:
             with open(samplesheet_dest, 'wb') as fcd:
                 fcd.write(self._generate_clean_samplesheet(ssparser))
         except Exception as e:
             # Remove a partially written file so the next attempt regenerates it
             if os.path.exists(samplesheet_dest):
                 os.remove(samplesheet_dest)
             logger.error(e)
             return False
         logger.info(("Created SampleSheet.csv for Flowcell {} in {} "
                      .format(self.id, samplesheet_dest)))
     # SampleSheet.csv generated to be used in bcl2fastq
     self.runParserObj.samplesheet = SampleSheetParser(samplesheet_dest)
     # Make the demux call
     with chdir(self.run_dir):
         bcl2fastq_config = self.CONFIG.get('bcl2fastq')
         cl = [bcl2fastq_config['bin']]
         # ``in`` and ``next(iter(...))`` behave the same on Python 2 and 3,
         # unlike ``has_key`` and indexing the result of ``items()``
         if 'options' in bcl2fastq_config:
             # Append all options that appear in the configuration file to the main command.
             for option in bcl2fastq_config['options']:
                 if isinstance(option, dict):
                     opt, val = next(iter(option.items()))
                     cl.extend(['--{}'.format(opt), str(val)])
                 else:
                     cl.append('--{}'.format(option))
         logger.info(("BCL to FASTQ conversion and demultiplexing started for "
              " run {} on {}".format(os.path.basename(self.id), datetime.now())))
         misc.call_external_command_detached(cl, with_log_files=True)

     return True
Esempio n. 7
0
    def demultiplex_run(self):
        """Launch the bcl2fastq conversions needed for this run.

        For every sample type found in ``self.sample_table``, and for every
        distinct index length a lane holds for that type:
            - write a sub-samplesheet with the matching samples
            - build the bcl2fastq command for that sample type and mask table
            - launch the command detached

        Returns True once every command has been launched.
        """
        # Distinct sample types, in order of first appearance
        sample_types = []
        for lane_contents in self.sample_table.values():
            for sample in lane_contents:
                found_type = sample[1]['sample_type']
                if found_type not in sample_types:
                    sample_types.append(found_type)

        # One counter across all sample types numbers samplesheets and log prefixes
        cmd_counter = 0
        for sample_type in sorted(sample_types):
            # Per lane, the distinct index lengths used by samples of this type
            index_lengths_per_lane = {}
            for lane, lane_contents in self.sample_table.items():
                for sample in lane_contents:
                    detail = sample[1]
                    current_type = detail['sample_type']
                    current_length = detail['index_length']
                    if current_type != sample_type:
                        continue
                    known_lengths = index_lengths_per_lane.setdefault(lane, [])
                    if current_length not in known_lengths:
                        known_lengths.append(current_length)

            # The lane with the most index lengths dictates how many demux rounds are needed
            rounds_needed = max(
                len(lengths) for lengths in index_lengths_per_lane.values())

            for round_number in range(rounds_needed):
                # Lane -> sample IDs to put in the sub-samplesheet
                samples_to_include = {}
                # Lane -> index length, used for generating masks
                mask_table = {}
                for lane, lane_contents in self.sample_table.items():
                    try:
                        index_length = index_lengths_per_lane[lane][round_number]
                    except (KeyError, IndexError):
                        logger.info(
                            'No corresponding mask in lane {}. Skip it.'.format(lane))
                        continue
                    mask_table[lane] = index_length
                    for sample in lane_contents:
                        sample_name = sample[0]
                        detail = sample[1]
                        current_type = detail['sample_type']
                        current_length = detail['index_length']
                        if current_type == sample_type and current_length == index_length:
                            samples_to_include.setdefault(lane, []).append(sample_name)

                with chdir(self.run_dir):
                    # Sub-samplesheet for this round
                    subset_name = 'SampleSheet_{}.csv'.format(cmd_counter)
                    with open(subset_name, 'w') as fcd:
                        fcd.write(_generate_samplesheet_subset(
                            self.runParserObj.samplesheet, samples_to_include))

                    # Create Demultiplexing dir, this changes the status to IN_PROGRESS
                    if not os.path.exists('Demultiplexing'):
                        os.makedirs('Demultiplexing')

                    # Build and launch the demultiplexing command
                    cmd = self.generate_bcl_command(
                        sample_type, mask_table, cmd_counter)
                    misc.call_external_command_detached(
                        cmd,
                        with_log_files=True,
                        prefix='demux_{}'.format(cmd_counter))
                    logger.info(
                        'BCL to FASTQ conversion and demultiplexing '
                        'started for run {} on {}'.format(
                            os.path.basename(self.id), datetime.now()))

                # One mask type has been launched; the next one gets the next number
                cmd_counter += 1
        return True
Esempio n. 8
0
    def compute_undetermined(self):
        """Return True if all demux steps are done and we can proceed to QC.

        For simple lanes with index: no check is done, everything needs to be in place.
        For complex lanes: no check is done, everything needs to be in place.
        For simple lanes and NoIndex: check if demux counts have been computed;
        if not, compute them or return False while waiting for their completion.
        """
        NoIndexLanes = [lane["Lane"] for lane in self.runParserObj.samplesheet.data if "NoIndex" in lane["index"]]
        if not NoIndexLanes:
            return True  # everything is fine I can proceed to QC

        NoIndex_Undetermiend = os.path.join(self.run_dir, "Demultiplexing_NoIndex")
        if not os.path.exists(NoIndex_Undetermiend):
            # The NoIndex demux has not been started yet
            return self._noindex_start_demux(NoIndexLanes, NoIndex_Undetermiend)

        # The NoIndex demux has already been started
        stats_xml = os.path.join(self.run_dir, "Demultiplexing_NoIndex", "Stats", "DemultiplexingStats.xml")
        if not os.path.exists(stats_xml):
            # demultiplexing of undetermined is still ongoing
            logger.info("Demux of NoIndex lanes ongoing")
            return False
        logger.info("Demux of NoIndex lanes done.")

        flag_file = os.path.join(NoIndex_Undetermiend, "ongoing")
        if os.path.exists(flag_file):
            # a previous instance of TACA is running and still processing this FC
            logger.info("Counting of undetermined indexes for NoIndex lanes ongoing")
            return False

        # Nothing to do if every NoIndex lane already has its DemuxSummary file
        stats_dir = os.path.join(self.run_dir, self.demux_dir, "Stats")
        computed = all(
            os.path.exists(os.path.join(stats_dir, "DemuxSummaryF1L{}.txt".format(lane_id)))
            for lane_id in NoIndexLanes
        )
        if computed:
            return True

        open(flag_file, "a").close()  # create the flag file indicating I am working on this
        try:
            for lane_id in NoIndexLanes:
                if not self._noindex_write_demux_summary(lane_id, NoIndex_Undetermiend):
                    return False
            self._noindex_refresh_html_reports(NoIndexLanes)
        finally:
            # Always drop the flag, also on errors: a stale flag would make every
            # later invocation report "ongoing" and never retry
            if os.path.exists(flag_file):
                os.remove(flag_file)
        return True  # return true, I have done everything I was supposed to do

    def _noindex_start_demux(self, NoIndexLanes, noindex_dir):
        """Launch the NoIndex demux (returns False), or link existing stats files (returns True)."""
        # for these lanes there are no undetermined reads, as they were demuxed without index
        per_lane_base_masks = self._generate_per_lane_base_mask()
        # store here only the NoIndex lanes
        per_lane_base_masks_NoIndex = {}
        run_with_no_index = False  # use this flag to check that we are not in the C.Daub case
        for NoIndexLane in NoIndexLanes:
            lane_masks = per_lane_base_masks[NoIndexLane]
            per_lane_base_masks_NoIndex[NoIndexLane] = lane_masks
            # first key of the lane's masks (works on Python 2 and 3, unlike keys()[0])
            base_mask_key = next(iter(lane_masks))
            base_mask = lane_masks[base_mask_key]["base_mask"]
            # NOTE(review): as written, the else branch only runs for an empty base
            # mask, so the Y/N swap below never has elements to process; confirm the
            # intended condition before changing it.
            if len(base_mask):
                # C.Daub_15_01 case, only one sample per lane and no index at all
                run_with_no_index = True
            else:
                new_base_mask = []
                for baseMask_element in base_mask:
                    if baseMask_element.startswith("Y"):
                        new_base_mask.append(baseMask_element.replace("Y", "N"))
                    elif baseMask_element.startswith("N"):
                        new_base_mask.append(baseMask_element.replace("N", "Y"))
                lane_masks[base_mask_key]["base_mask"] = new_base_mask

        if not run_with_no_index:
            os.makedirs(noindex_dir)
            command = self._generate_bcl2fastq_command(
                per_lane_base_masks_NoIndex, True, "NoIndex", mask_short_adapter_reads=True
            )
            with chdir(self.run_dir):
                misc.call_external_command_detached(command, with_log_files=True, prefix="demux_NoIndex")
            # return false, as I need to wait to finish the demux for the NoIndex case
            return False

        # No index at all: do not start a demux for the index, just softlink the
        # stats files from Demultiplexing_0 that are missing in the demux Stats dir
        missingStatsFiles = glob.glob(os.path.join(self.run_dir, "Demultiplexing_0", "Stats", "*F*L*.txt"))
        destination = os.path.join(self.run_dir, self.demux_dir, "Stats")
        for source in missingStatsFiles:
            link_name = os.path.join(destination, os.path.basename(source))
            if not os.path.exists(link_name):
                os.symlink(source, link_name)
        return True

    def _noindex_write_demux_summary(self, lane_id, noindex_dir):
        """Count index1+index2 occurrences for one NoIndex lane and write its DemuxSummary file.

        Returns False (after logging an error) when the samplesheet row found for
        the lane has an index other than "NoIndex"; True otherwise.
        """
        samplesheet = self.runParserObj.samplesheet
        # each lane corresponds to one project, a project might have multiple lanes
        current_lane = [lane for lane in samplesheet.data if lane_id == lane["Lane"]][0]
        if current_lane["index"] != "NoIndex":
            logger.error(
                "while processing run {} NoIndex lane {}, index {} found in SampleSheet".format(
                    self.id, lane_id, current_lane["index"]
                )
            )
            return False

        fastq_dir = os.path.join(
            noindex_dir,
            current_lane[samplesheet.dfield_proj],
            current_lane[samplesheet.dfield_sid],
        )
        sample_name = current_lane[samplesheet.dfield_snm]
        # Both files are assumed to be present
        indexes_fastq1 = glob.glob(
            os.path.join(fastq_dir, "{}_S?_L00{}_R2_001.fastq.gz".format(sample_name, lane_id))
        )[0]
        indexes_fastq2 = glob.glob(
            os.path.join(fastq_dir, "{}_S?_L00{}_R3_001.fastq.gz".format(sample_name, lane_id))
        )[0]

        logger.info("Computing Undetermiend indexes for NoIndex lane {}".format(lane_id))
        zcat = subprocess.Popen(["zcat", indexes_fastq1], stdout=subprocess.PIPE)
        # this command streams the two files, printing them line after line separated by a plus
        awk = subprocess.Popen(
            [
                "awk",
                'BEGIN {{OFS="+"}}{{  ("zcat " "{0} " ) | getline line ; print $0,line }}'.format(
                    indexes_fastq2
                ),
            ],
            stdout=subprocess.PIPE,
            stdin=zcat.stdout,
        )
        # now select only the 2nd line every 4 (i.e., only the index1+index2 line)
        sed = subprocess.Popen(["sed", "-n", "2~4p"], stdout=subprocess.PIPE, stdin=awk.stdout)
        zcat.stdout.close()
        awk.stdout.close()
        output = sed.communicate()[0]
        zcat.wait()
        awk.wait()

        index_counter = {}
        for barcode in output.split("\n")[:-1]:
            index_counter[barcode] = index_counter.get(barcode, 0) + 1

        demuxSummary_file = os.path.join(
            self.run_dir, self.demux_dir, "Stats", "DemuxSummaryF1L{}.txt".format(lane_id)
        )
        with open(demuxSummary_file, "w") as demuxSummary_file_fh:
            demuxSummary_file_fh.write("### Most Popular Unknown Index Sequences\n")
            demuxSummary_file_fh.write("### Columns: Index_Sequence Hit_Count\n")
            # most frequent index first
            for (index, occ) in sorted(index_counter.items(), key=operator.itemgetter(1), reverse=True):
                demuxSummary_file_fh.write("{}\t{}\n".format(index, occ))
        return True

    def _noindex_refresh_html_reports(self, NoIndexLanes):
        """Fill the NoIndex rows of the lane and laneBarcode data and regenerate both html reports."""
        undeterminedStats = DemuxSummaryParser(os.path.join(self.run_dir, self.demux_dir, "Stats"))
        demux_folder = os.path.join(self.run_dir, "Demultiplexing")
        # lane.html first, then laneBarcode.html; both get the same treatment
        for parser_attr, html_name in (("lanes", "lane.html"), ("lanebarcodes", "laneBarcode.html")):
            report = getattr(self.runParserObj, parser_attr)
            updated_rows = []
            for row in report.sample_data:
                if row["Lane"] in NoIndexLanes:
                    # Use the lane of the row being updated (the laneBarcode loop
                    # used to read a stale variable left over from the lane loop)
                    PF_clusters = undeterminedStats.TOTAL[row["Lane"]]
                    row["% One mismatchbarcode"] = "0"
                    row["% Perfectbarcode"] = "100"
                    row["% of thelane"] = "100"
                    row["PF Clusters"] = str(PF_clusters)
                updated_rows.append(row)
            report.sample_data = updated_rows
            report_dir = _create_folder_structure(
                demux_folder, ["Reports", "html", self.flowcell_id, "all", "all", "all"]
            )
            _generate_lane_html(os.path.join(report_dir, html_name), report)
Esempio n. 9
0
    def demultiplex_run(self):
        """
        Demultiplex a HiSeq run:
            - find the samplesheet
            - copy the original samplesheet into the run folder and generate a
              cleaned SampleSheet.csv next to it
            - build one bcl2fastq command for the lanes that use a single base mask
              and one command per distinct base mask for the lanes that mix several
            - launch the bcl2fastq conversions

        Returns None when no samplesheet is found and False when SampleSheet.csv
        cannot be generated. Otherwise the commands are launched and the method
        ends without an explicit return value (None).

        Raises RuntimeError when the original samplesheet cannot be copied into
        the run folder.
        """
        ssname = self._get_samplesheet()
        if ssname is None:
            return None
        ssparser = SampleSheetParser(ssname)
        # Copy the original samplesheet locally. Copy again if already done as
        # there might have been changes to the samplesheet.
        try:
            shutil.copy(ssname, os.path.join(self.run_dir, "{}.csv".format(self.flowcell_id)))
        except Exception as e:
            # "except Exception" instead of a bare except: KeyboardInterrupt and
            # SystemExit are no longer converted, and the cause is reported.
            raise RuntimeError(
                "unable to copy file {} to destination {}: {}".format(ssname, self.run_dir, e)
            )

        # This sample sheet has been created by the LIMS and copied by a
        # sequencing operator; it needs some editing before bcl2fastq can use it.
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        if os.path.exists(samplesheet_dest):
            logger.info("SampleSheet.csv found ... overwriting it")
        try:
            # Generate the content before opening the destination, so that a
            # failure here does not truncate an already existing SampleSheet.csv.
            clean_samplesheet = self._generate_clean_samplesheet(ssparser)
            with open(samplesheet_dest, "wb") as fcd:
                fcd.write(clean_samplesheet)
        except Exception as e:
            # Generic exceptions have no ".text" attribute; format the exception itself.
            logger.error("encountered the following exception '{}'".format(e))
            return False
        logger.info(("Created SampleSheet.csv for Flowcell {} in {} ".format(self.id, samplesheet_dest)))
        # From now on SampleSheet.csv is the samplesheet used for demultiplexing.
        self.runParserObj.samplesheet = SampleSheetParser(samplesheet_dest)

        # Generate the base masks per lane and decide how to demultiplex.
        per_lane_base_masks = self._generate_per_lane_base_mask()
        # simple_lanes: lanes with exactly one base mask (one shared command).
        # complex_lanes: every other lane (one command per distinct mask).
        simple_lanes = {}
        complex_lanes = {}
        for lane, lane_masks in per_lane_base_masks.items():
            if len(lane_masks) == 1:
                simple_lanes[lane] = lane_masks
            else:
                complex_lanes[lane] = lane_masks

        # The third argument of _generate_bcl2fastq_command is the running
        # command number, which always equals the number of commands built so far.
        bcl2fastq_commands = []
        if simple_lanes:
            bcl2fastq_commands.append(
                self._generate_bcl2fastq_command(simple_lanes, True, len(bcl2fastq_commands))
            )
        # Compute the different masks, there will be one bcl2fastq command per mask.
        different_masks = set(
            mask for lane_masks in complex_lanes.values() for mask in lane_masks
        )
        for mask in different_masks:
            base_masks_complex_to_demux = {}
            for lane, lane_masks in complex_lanes.items():
                if mask in lane_masks:
                    base_masks_complex_to_demux[lane] = {mask: lane_masks[mask]}
            # At this point base_masks_complex_to_demux holds a single base mask
            # per lane, so the command can be built.
            bcl2fastq_commands.append(
                self._generate_bcl2fastq_command(
                    base_masks_complex_to_demux, True, len(bcl2fastq_commands)
                )
            )

        # bcl2fastq_commands now contains every command to be executed.
        with chdir(self.run_dir):
            # Create the Demultiplexing dir, in this way the status of this run
            # will become IN_PROGRESS.
            if not os.path.exists("Demultiplexing"):
                os.makedirs("Demultiplexing")
            for execution, bcl2fastq_command in enumerate(bcl2fastq_commands):
                misc.call_external_command_detached(
                    bcl2fastq_command, with_log_files=True, prefix="demux_{}".format(execution)
                )
Esempio n. 10
0
    def demultiplex_run(self):
        """
        Demultiplex a Xten run:
            - find the samplesheet
            - generate a cleaned SampleSheet.csv in the run folder, unless one is
              already there
            - build the bcl2fastq command from the configured binary and options,
              adding one --use-bases-mask per lane
            - run bcl2fastq conversion

        Returns True once bcl2fastq has been launched. Returns False when
        SampleSheet.csv cannot be generated or when at least one lane needs more
        than one base mask.
        """
        ssname = self._get_samplesheet()
        ssparser = SampleSheetParser(ssname)
        # The samplesheet needs to be in the FC directory with name
        # SampleSheet.csv (Illumina default). If it is not there yet, create it.
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        if not os.path.exists(samplesheet_dest):
            try:
                # Generate the content before opening the destination: opening
                # first would leave an empty SampleSheet.csv behind on failure,
                # and the next call would then skip the generation step.
                clean_samplesheet = _generate_clean_samplesheet(
                    ssparser,
                    fields_to_remove=['index2'],
                    rename_samples=True,
                    rename_qPCR_suffix=True,
                    fields_qPCR=[ssparser.dfield_snm])
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(clean_samplesheet)
            except Exception as e:
                # Generic exceptions have no ".text" attribute; format the exception itself.
                logger.error(
                    "encountered the following exception '{}'".format(e))
                return False
            logger.info(
                ("Created SampleSheet.csv for Flowcell {} in {} ".format(
                    self.id, samplesheet_dest)))
        # From now on SampleSheet.csv is the samplesheet used for demultiplexing.
        self.runParserObj.samplesheet = SampleSheetParser(samplesheet_dest)

        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([
            len(per_lane_base_masks[lane]) for lane in per_lane_base_masks
        ])
        if max_different_base_masks > 1:
            # In a HiSeqX run different index sizes in the SAME lane are not allowed.
            logger.error(
                "In FC {} found one or more lane with more than one base mask "
                "(i.e., different index sizes in the same lane)".format(self.id))
            return False
        # Everything needed to run demultiplexing is available now.
        logger.info('Building bcl2fastq command')

        with chdir(self.run_dir):
            bcl2fastq_config = self.CONFIG.get('bcl2fastq')
            cl = [bcl2fastq_config['bin']]
            if 'options' in bcl2fastq_config:
                # Append all options that appear in the configuration file to
                # the main command. A dict entry is an option with a value, any
                # other entry is a plain flag.
                for option in bcl2fastq_config['options']:
                    if isinstance(option, dict):
                        # list(...)[0] works with both Python 2 lists and Python 3 views.
                        opt, val = list(option.items())[0]
                        cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        cl.append('--{}'.format(option))
            # Add the correct --use-bases-mask for each lane. There is a single
            # base mask per lane, as checked above.
            for lane in sorted(per_lane_base_masks):
                base_mask = list(per_lane_base_masks[lane].values())[0]['base_mask']
                base_mask_expr = "{}:".format(lane) + ",".join(base_mask)
                cl.extend(["--use-bases-mask", base_mask_expr])

            logger.info(
                ("BCL to FASTQ conversion and demultiplexing started for "
                 " run {} on {}".format(os.path.basename(self.id),
                                        datetime.now())))
            misc.call_external_command_detached(cl, with_log_files=True)
        return True
Esempio n. 11
0
 def demultiplex_run(self):
     """
     Demultiplex a HiSeq run:
         - generate the base masks per lane
         - build one bcl2fastq command for the lanes that use a single base mask
           and one command per distinct base mask for the lanes that mix several
         - create the Demultiplexing folder and launch the bcl2fastq conversions

     The method ends without an explicit return value (None).
     """
     # Generate the base masks per lane and decide how to demultiplex.
     per_lane_base_masks = self._generate_per_lane_base_mask()
     # simple_lanes: lanes with exactly one base mask (one shared command).
     # complex_lanes: every other lane (one command per distinct mask).
     simple_lanes = {}
     complex_lanes = {}
     for lane, lane_masks in per_lane_base_masks.items():
         if len(lane_masks) == 1:
             simple_lanes[lane] = lane_masks
         else:
             complex_lanes[lane] = lane_masks

     # The third argument of _generate_bcl2fastq_command is the running command
     # number, which always equals the number of commands built so far.
     bcl2fastq_commands = []
     if simple_lanes:
         bcl2fastq_commands.append(
             self._generate_bcl2fastq_command(simple_lanes, True,
                                              len(bcl2fastq_commands)))
     # Compute the different masks, there will be one bcl2fastq command per mask.
     different_masks = set(
         mask for lane_masks in complex_lanes.values() for mask in lane_masks)
     for mask in different_masks:
         base_masks_complex_to_demux = {}
         for lane, lane_masks in complex_lanes.items():
             # "in" replaces dict.has_key(), which only exists in Python 2.
             if mask in lane_masks:
                 base_masks_complex_to_demux[lane] = {mask: lane_masks[mask]}
         # At this point base_masks_complex_to_demux holds a single base mask
         # per lane, so the command can be built.
         bcl2fastq_commands.append(
             self._generate_bcl2fastq_command(base_masks_complex_to_demux,
                                              True, len(bcl2fastq_commands)))

     # bcl2fastq_commands now contains every command to be executed.
     with chdir(self.run_dir):
         # Create the Demultiplexing dir, in this way the status of this run
         # will become IN_PROGRESS.
         if not os.path.exists("Demultiplexing"):
             os.makedirs("Demultiplexing")
         for execution, bcl2fastq_command in enumerate(bcl2fastq_commands):
             misc.call_external_command_detached(
                 bcl2fastq_command,
                 with_log_files=True,
                 prefix="demux_{}".format(execution))
Esempio n. 12
0
    def demultiplex_run(self):
        """
        Demultiplex a Xten run that may contain 10X lanes:
            - find the samplesheet and read the 10X index file path from the config
            - split the lanes into 10X and non-10X lanes
            - generate a cleaned SampleSheet.csv in the run folder, unless one is
              already there
            - write SampleSheet_0.csv (non-10X lanes) and SampleSheet_1.csv (10X
              lanes) when both kinds are present, otherwise copy SampleSheet.csv
              to SampleSheet_0.csv
            - launch one bcl2fastq conversion per kind of lane that is present

        Returns True once the conversions have been launched. Returns False when
        SampleSheet.csv cannot be generated or when at least one lane needs more
        than one base mask.

        Raises RuntimeError when the config has no bcl2fastq index_path entry.
        """
        ssname = self._get_samplesheet()
        ssparser = SampleSheetParser(ssname)
        try:
            indexfile = self.CONFIG['bcl2fastq']['index_path']
        except KeyError:
            message = "Path to index file (10X) not found in the config file"
            logger.error(message)
            # Carry the message in the exception too, so the failure is
            # understandable without the log.
            raise RuntimeError(message)
        # The samplesheet needs to be in the FC directory with name
        # SampleSheet.csv (Illumina default). If it is not there yet, create it.
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        # Find which lanes contain 10X samples and which do not.
        (lanes_10X, lanes_not_10X) = look_for_lanes_with_10X_indicies(
            indexfile, ssparser)
        if not os.path.exists(samplesheet_dest):
            try:
                # Generate the content before opening the destination: opening
                # first would leave an empty SampleSheet.csv behind on failure,
                # and the next call would then skip the generation step.
                clean_samplesheet = _generate_clean_samplesheet(
                    ssparser,
                    indexfile,
                    fields_to_remove=['index2'],
                    rename_samples=True,
                    rename_qPCR_suffix=True,
                    fields_qPCR=[ssparser.dfield_snm])
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(clean_samplesheet)
            except Exception as e:
                logger.error(
                    "encountered the following exception '{}'".format(e))
                return False
            logger.info(
                ("Created SampleSheet.csv for Flowcell {} in {} ".format(
                    self.id, samplesheet_dest)))

        # From now on SampleSheet.csv is the samplesheet used for demultiplexing.
        # Need to rewrite so that SampleSheet_0.csv is always used.
        self.runParserObj.samplesheet = SampleSheetParser(samplesheet_dest)

        with chdir(self.run_dir):
            if lanes_10X and lanes_not_10X:
                # Both kinds of lanes are present: split the samplesheet so each
                # bcl2fastq command gets its own subset.
                for subset_name, subset_lanes in (
                        ("SampleSheet_0.csv", lanes_not_10X),
                        ("SampleSheet_1.csv", lanes_10X)):
                    with open(subset_name, 'wb') as fcd:
                        fcd.write(
                            _generate_samplesheet_subset(
                                self.runParserObj.samplesheet, subset_lanes))
            else:
                shutil.copy("SampleSheet.csv", "SampleSheet_0.csv")

        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([
            len(per_lane_base_masks[lane]) for lane in per_lane_base_masks
        ])
        if max_different_base_masks > 1:
            # In a HiSeqX run different index sizes in the SAME lane are not allowed.
            logger.error(
                "In FC {} found one or more lane with more than one base mask "
                "(i.e., different index sizes in the same lane)".format(self.id))
            return False

        bcl2fastq_cmd_counter = 0
        with chdir(self.run_dir):
            # Create the Demultiplexing dir, this changes the status to IN_PROGRESS.
            if not os.path.exists("Demultiplexing"):
                os.makedirs("Demultiplexing")
            if lanes_not_10X:
                cmd_normal = self.generate_bcl_command(lanes_not_10X,
                                                       bcl2fastq_cmd_counter)
                misc.call_external_command_detached(
                    cmd_normal,
                    with_log_files=True,
                    prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(
                    ("BCL to FASTQ conversion and demultiplexing started for "
                     "normal run {} on {}".format(os.path.basename(self.id),
                                                  datetime.now())))
                bcl2fastq_cmd_counter += 1
            if lanes_10X:
                cmd_10X = self.generate_bcl_command(lanes_10X,
                                                    bcl2fastq_cmd_counter,
                                                    is_10X=True)
                misc.call_external_command_detached(
                    cmd_10X,
                    with_log_files=True,
                    prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(
                    ("BCL to FASTQ conversion and demultiplexing started for "
                     "10X run {} on {}".format(os.path.basename(self.id),
                                               datetime.now())))
                bcl2fastq_cmd_counter += 1
        return True
Esempio n. 13
0
    def compute_undetermined(self):
        """
        Tell whether all demux steps are done and the run can proceed to QC.

        For simple lanes with index: no check is done, everything needs to be in place.
        For complex lanes: no check is done, everything needs to be in place.
        For simple lanes and NoIndex: check if the demux counts have been computed;
        if not, compute them or return False while waiting for their completion.

        Returns True when QC can proceed. Returns False when the NoIndex
        demultiplexing has just been started or is still running, when another
        instance is already counting the indexes (flag file "ongoing" present),
        or when a NoIndex lane has an unexpected index in the samplesheet.
        """
        samplesheet = self.runParserObj.samplesheet
        NoIndexLanes = [
            lane["Lane"] for lane in samplesheet.data
            if "NoIndex" in lane["index"]
        ]
        if not NoIndexLanes:
            return True  # everything is fine, QC can proceed

        NoIndex_Undetermiend = os.path.join(self.run_dir,
                                            "Demultiplexing_NoIndex")
        if not os.path.exists(NoIndex_Undetermiend):
            # These lanes were demultiplexed without index, so they have no
            # undetermined reads yet. Generate the base masks per lane and keep
            # only the ones of the NoIndex lanes.
            per_lane_base_masks = self._generate_per_lane_base_mask()
            per_lane_base_masks_NoIndex = {}
            run_with_no_index = False  # use this flag to check that we are not in the C.Daub case
            for NoIndexLane in NoIndexLanes:
                lane_masks = per_lane_base_masks[NoIndexLane]
                per_lane_base_masks_NoIndex[NoIndexLane] = lane_masks
                # list(...)[0] works with both Python 2 lists and Python 3 views.
                base_mask_key = list(lane_masks)[0]
                # NOTE(review): this condition is true for every non-empty base
                # mask, so the else branch only runs for an empty mask and its
                # loop then does nothing. Confirm whether a stricter test
                # (e.g. "== 1") was intended; behavior is left unchanged here.
                if len(lane_masks[base_mask_key]['base_mask']):
                    # C.Daub_15_01 case, only one sample per lane and no index at all
                    run_with_no_index = True
                else:
                    # Swap read and masked cycles: Y elements become N and N
                    # elements become Y; any other element is dropped.
                    new_base_mask = []
                    for baseMask_element in lane_masks[base_mask_key]['base_mask']:
                        if baseMask_element.startswith("Y"):
                            new_base_mask.append(
                                baseMask_element.replace("Y", "N"))
                        elif baseMask_element.startswith("N"):
                            new_base_mask.append(
                                baseMask_element.replace("N", "Y"))
                    lane_masks[base_mask_key]['base_mask'] = new_base_mask

            if not run_with_no_index:
                os.makedirs(NoIndex_Undetermiend)
                command = self._generate_bcl2fastq_command(
                    per_lane_base_masks_NoIndex,
                    True,
                    "NoIndex",
                    mask_short_adapter_reads=True)
                with chdir(self.run_dir):
                    misc.call_external_command_detached(command,
                                                        with_log_files=True,
                                                        prefix="demux_NoIndex")
                # Return False: the demux for the NoIndex case has to finish first.
                return False

            # There is no index at all, so no demux of the index is started.
            # Soft-link what is already in Stats instead of recomputing it.
            missingStatsFiles = glob.glob(
                os.path.join(self.run_dir, "Demultiplexing_0", "Stats",
                             "*F*L*.txt"))
            destination = os.path.join(self.run_dir, self.demux_dir, "Stats")
            for source in missingStatsFiles:
                link_name = os.path.join(destination, os.path.basename(source))
                if not os.path.exists(link_name):
                    os.symlink(source, link_name)
            return True

        # The NoIndex folder exists: the demux of the NoIndex lanes was started.
        if not os.path.exists(
                os.path.join(NoIndex_Undetermiend, 'Stats',
                             'DemultiplexingStats.xml')):
            # demultiplexing of undetermined is still ongoing
            logger.info("Demux of NoIndex lanes ongoing")
            return False
        logger.info("Demux of NoIndex lanes done.")

        # Now produce the files needed in the QC.
        flag_file = os.path.join(NoIndex_Undetermiend, "ongoing")
        if os.path.exists(flag_file):
            # A previous instance of TACA is running and still processing this FC.
            logger.info(
                "Counting of undetermined indexes for NoIndex lanes ongoing")
            return False

        stats_dir = os.path.join(self.run_dir, self.demux_dir, "Stats")
        # Check if the stats have been already computed for every NoIndex lane.
        if all(
                os.path.exists(
                    os.path.join(stats_dir,
                                 "DemuxSummaryF1L{}.txt".format(lane_id)))
                for lane_id in NoIndexLanes):
            return True

        # Otherwise compute them. The flag file marks this FC as being worked on.
        open(flag_file, 'a').close()
        try:
            for lane_id in NoIndexLanes:
                # Count the index occurrences. Each lane corresponds to one
                # project, a project might have multiple lanes.
                current_lane = [
                    lane for lane in samplesheet.data
                    if lane_id == lane["Lane"]
                ][0]
                if current_lane["index"] != "NoIndex":
                    logger.error(
                        "while processing run {} NoIndex lane {}, index {} found in SampleSheet"
                        .format(self.id, lane_id, current_lane["index"]))
                    return False
                sample_dir = os.path.join(NoIndex_Undetermiend,
                                          current_lane[samplesheet.dfield_proj],
                                          current_lane[samplesheet.dfield_sid])
                sample_name = current_lane[samplesheet.dfield_snm]
                # Both files are assumed to be always present.
                indexes_fastq1 = glob.glob(
                    os.path.join(
                        sample_dir, "{}_S?_L00{}_R2_001.fastq.gz".format(
                            sample_name, lane_id)))[0]
                indexes_fastq2 = glob.glob(
                    os.path.join(
                        sample_dir, "{}_S?_L00{}_R3_001.fastq.gz".format(
                            sample_name, lane_id)))[0]
                logger.info(
                    "Computing Undetermiend indexes for NoIndex lane {}".
                    format(lane_id))
                zcat = subprocess.Popen(['zcat', indexes_fastq1],
                                        stdout=subprocess.PIPE)
                # Stream the two files and print them line after line separated by a plus.
                awk = subprocess.Popen([
                    'awk',
                    'BEGIN {{OFS="+"}}{{  ("zcat " "{0} " ) | getline line ; print $0,line }}'
                    .format(indexes_fastq2)
                ],
                                       stdout=subprocess.PIPE,
                                       stdin=zcat.stdout)
                # Select only the 2nd line every 4 (i.e., only the index1+index2 line).
                sed = subprocess.Popen(['sed', '-n', "2~4p"],
                                       stdout=subprocess.PIPE,
                                       stdin=awk.stdout)
                zcat.stdout.close()
                awk.stdout.close()
                output = sed.communicate()[0]
                zcat.wait()
                awk.wait()
                index_counter = {}
                for barcode in output.split('\n')[:-1]:
                    index_counter[barcode] = index_counter.get(barcode, 0) + 1
                demuxSummary_file = os.path.join(
                    stats_dir, "DemuxSummaryF1L{}.txt".format(lane_id))
                with open(demuxSummary_file, 'w') as demuxSummary_file_fh:
                    demuxSummary_file_fh.write(
                        "### Most Popular Unknown Index Sequences\n")
                    demuxSummary_file_fh.write(
                        "### Columns: Index_Sequence Hit_Count\n")
                    for (index, occ) in sorted(index_counter.items(),
                                               key=operator.itemgetter(1),
                                               reverse=True):
                        demuxSummary_file_fh.write("{}\t{}\n".format(
                            index, occ))

            # Fill in the lane and laneBarcode html reports: a demux with
            # NoIndex leaves several of their values empty.
            undeterminedStats = DemuxSummaryParser(stats_dir)
            demux_folder = os.path.join(self.run_dir, "Demultiplexing")
            for report, html_name in ((self.runParserObj.lanes, "lane.html"),
                                      (self.runParserObj.lanebarcodes,
                                       "laneBarcode.html")):
                patched_rows = []
                for row in report.sample_data:
                    if row["Lane"] in NoIndexLanes:
                        # Look the total up with the row's own lane: the
                        # laneBarcode loop used to reuse the last lane of the
                        # previous loop here.
                        PF_clusters = undeterminedStats.TOTAL[row["Lane"]]
                        row["% One mismatchbarcode"] = '0'
                        row["% Perfectbarcode"] = '100'
                        row["% of thelane"] = '100'
                        row["PF Clusters"] = str(PF_clusters)
                    patched_rows.append(row)
                report.sample_data = patched_rows
                html_report_dir = _create_folder_structure(
                    demux_folder,
                    ["Reports", "html", self.flowcell_id, "all", "all", "all"])
                _generate_lane_html(os.path.join(html_report_dir, html_name),
                                    report)
        finally:
            # Always remove the flag file, also on the early return above or on
            # an error, otherwise every later iteration on this FC would report
            # the counting as ongoing and never finish.
            os.remove(flag_file)
        return True  # everything that had to be done is done
Esempio n. 14
0
    def demultiplex_run(self):
        """
        Demultiplex a HiSeq run:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - create multiple SampleSheets in case at least one lane have multiple indexes lengths
            - run bcl2fastq conversion
        """

        ssname = self._get_samplesheet()
        if ssname is None:
            return None
        ssparser = SampleSheetParser(ssname)
        #Copy the original samplesheet locally. Copy again if already done as there might have been changes to the samplesheet
        try:
            shutil.copy(
                ssname,
                os.path.join(self.run_dir, "{}.csv".format(self.flowcell_id)))
            ssname = os.path.join(self.run_dir, os.path.split(ssname)[1])
        except:
            raise RuntimeError(
                "unable to copy file {} to destination {}".format(
                    ssname, self.run_dir))

        #this sample sheet has been created by the LIMS and copied by a sequencing operator. It is not ready
        #to be used it needs some editing
        #this will contain the samplesheet with all the renaiming to be used with bcl2fastq-2.17
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        #check that the samplesheet is not already present. In this case go the next step
        if os.path.exists(samplesheet_dest):
            logger.info("SampleSheet.csv found ... overwriting it")
        try:
            with open(samplesheet_dest, 'wb') as fcd:
                fcd.write(self._generate_clean_samplesheet(ssparser))
        except Exception as e:
            logger.error(e.text)
            return False
        logger.info(("Created SampleSheet.csv for Flowcell {} in {} ".format(
            self.id, samplesheet_dest)))
        ##SampleSheet.csv generated
        ##when demultiplexing SampleSheet.csv is the one I need to use
        self.runParserObj.samplesheet = SampleSheetParser(
            os.path.join(self.run_dir, "SampleSheet.csv"))
        #now geenrate the base masks per lane and decide how to demultiplex
        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([
            len(per_lane_base_masks[base_masks])
            for base_masks in per_lane_base_masks
        ])
        #if max_different is one, then I have a simple config and I can run a single command. Otherwirse I need to run multiples instances
        #extract lanes with a single base masks
        simple_lanes = {}
        complex_lanes = {}
        for lane in per_lane_base_masks:
            if len(per_lane_base_masks[lane]) == 1:
                simple_lanes[lane] = per_lane_base_masks[lane]
            else:
                complex_lanes[lane] = per_lane_base_masks[lane]
        #simple lanes contains the lanes such that there is more than one base mask
        bcl2fastq_commands = []
        bcl2fastq_command_num = 0
        if len(simple_lanes) > 0:
            bcl2fastq_commands.append(
                self._generate_bcl2fastq_command(simple_lanes, True,
                                                 bcl2fastq_command_num))
            bcl2fastq_command_num += 1
        #compute the different masks, there will be one bcl2fastq command per mask
        base_masks_complex = [
            complex_lanes[base_masks].keys() for base_masks in complex_lanes
        ]
        different_masks = list(
            set([item for sublist in base_masks_complex for item in sublist]))
        for mask in different_masks:
            base_masks_complex_to_demux = {}
            for lane in complex_lanes:
                if complex_lanes[lane].has_key(mask):
                    base_masks_complex_to_demux[lane] = {}
                    base_masks_complex_to_demux[lane][mask] = complex_lanes[
                        lane][mask]
            #at this point base_masks_complex_to_demux contains only a base mask for lane. I can build the command
            bcl2fastq_commands.append(
                self._generate_bcl2fastq_command(base_masks_complex_to_demux,
                                                 True, bcl2fastq_command_num))
            bcl2fastq_command_num += 1
        #now bcl2fastq_commands contains all command to be executed. They can be executed in parallel, however run only one per time in order to avoid to overload the machine
        with chdir(self.run_dir):
            # create Demultiplexing dir, in this way the status of this run will became IN_PROGRESS
            if not os.path.exists("Demultiplexing"):
                os.makedirs("Demultiplexing")
            execution = 0
            for bcl2fastq_command in bcl2fastq_commands:
                misc.call_external_command_detached(
                    bcl2fastq_command,
                    with_log_files=True,
                    prefix="demux_{}".format(execution))
                execution += 1
Esempio n. 15
0
    def demultiplex_run(self):
        """Write per-command samplesheets and launch bcl2fastq for this run.

        Steps performed:
          - split the parsed samplesheet into one subset per bcl2fastq
            command: when the flowcell mixes 10X and non-10X lanes, the
            non-10X lanes go to ``SampleSheet_0.csv`` and the 10X lanes to
            ``SampleSheet_1.csv``; otherwise the single lane group goes to
            ``SampleSheet_0.csv``
          - refuse to continue if any lane has more than one base mask
          - create the ``Demultiplexing`` directory inside the run directory
          - start one detached bcl2fastq command per lane group (non-10X
            first, then 10X), numbering them from 0 so that the command
            index matches the samplesheet index

        Returns:
            True when the command(s) have been started, False when a lane
            with more than one base mask is found (an error is logged and
            nothing is started).
        """
        # A run mixing 10X and non-10X lanes needs two samplesheets (and two
        # bcl2fastq commands); otherwise a single samplesheet is enough.
        if self.lanes_10X and self.lanes_not_10X:
            samplesheet_subsets = [
                ("SampleSheet_0.csv", self.lanes_not_10X),
                ("SampleSheet_1.csv", self.lanes_10X),
            ]
        else:
            samplesheet_subsets = [
                ("SampleSheet_0.csv", self.lanes_10X or self.lanes_not_10X),
            ]

        with chdir(self.run_dir):
            for samplesheet_name, lanes in samplesheet_subsets:
                with open(samplesheet_name, 'wb') as fcd:
                    fcd.write(
                        _generate_samplesheet_subset(
                            self.runParserObj.samplesheet, lanes))

        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([
            len(per_lane_base_masks[lane]) for lane in per_lane_base_masks
        ])
        if max_different_base_masks > 1:
            # in a HiSeqX run I cannot have different index sizes in the SAME lane
            logger.error(
                "In FC {} found one or more lanes with more than one base "
                "mask (i.e., different index sizes in the same lane)".format(
                    self.id))
            return False

        bcl2fastq_cmd_counter = 0
        with chdir(self.run_dir):
            # create Demultiplexing dir, this changes the status to IN_PROGRESS
            if not os.path.exists("Demultiplexing"):
                os.makedirs("Demultiplexing")

            if self.lanes_not_10X:
                cmd_normal = self.generate_bcl_command(self.lanes_not_10X,
                                                       bcl2fastq_cmd_counter)
                misc.call_external_command_detached(
                    cmd_normal,
                    with_log_files=True,
                    prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(
                    ("BCL to FASTQ conversion and demultiplexing started for "
                     "normal run {} on {}".format(os.path.basename(self.id),
                                                  datetime.now())))
                bcl2fastq_cmd_counter += 1

            if self.lanes_10X:
                cmd_10X = self.generate_bcl_command(self.lanes_10X,
                                                    bcl2fastq_cmd_counter,
                                                    is_10X=True)
                misc.call_external_command_detached(
                    cmd_10X,
                    with_log_files=True,
                    prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(
                    ("BCL to FASTQ conversion and demultiplexing started for "
                     "10X run {} on {}".format(os.path.basename(self.id),
                                               datetime.now())))
                bcl2fastq_cmd_counter += 1
        return True