コード例 #1
0
ファイル: __init__.py プロジェクト: Galithil/TACA
    def demultiplex(self):
        """Perform demultiplexing of the flowcell.

        The bcl2fastq executable is read from the ``analysis`` section of the
        configuration, keyed by ``self.run_type``, and every entry found under
        its ``options`` key is appended to the command line. The command is
        then launched detached from inside ``self.run_dir`` with log files
        enabled.
        """
        logger.info('Building bcl2fastq command')
        config = CONFIG['analysis']
        with chdir(self.run_dir):
            cl = [config.get('bcl2fastq').get(self.run_type)]
            if 'options' in config['bcl2fastq']:
                # Options that require a value, e.g. --use-bases-mask Y8,I8,Y8,
                # come as a dictionary, while options that do not require a
                # value, e.g. --no-lane-splitting, come as a simple string.
                for option in config['bcl2fastq']['options']:
                    if isinstance(option, dict):
                        # Read the pair without popitem(): popping would empty
                        # the shared configuration entry, so any later call
                        # would fail on the now-empty dictionary.
                        opt, val = next(iter(option.items()))
                        cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        cl.append('--{}'.format(option))

            logger.info(("BCL to FASTQ conversion and demultiplexing started for "
                         " run {} on {}".format(os.path.basename(self.id), datetime.now())))

            misc.call_external_command_detached(cl, with_log_files=True)
コード例 #2
0
ファイル: HiSeqX_Runs.py プロジェクト: eriksjolund/TACA
    def demultiplex_run(self):
        """Demultiplex a HiSeq X run.

        Steps performed:
            - find the samplesheet through ``self._get_samplesheet()`` and parse it
            - unless ``SampleSheet.csv`` already exists in the run directory,
              write a cleaned copy of the samplesheet there
            - load ``SampleSheet.csv`` into ``self.runParserObj.samplesheet``
            - compute the per-lane base masks and give up if any lane has more
              than one
            - build the bcl2fastq command (configured options plus one
              ``--use-bases-mask`` per lane) and launch it detached from inside
              the run directory

        Returns:
            True once the bcl2fastq command has been launched, False when the
            samplesheet could not be written or a lane has several base masks.
        """
        ssname = self._get_samplesheet()
        ssparser = SampleSheetParser(ssname)
        # The samplesheet has to sit in the flowcell directory under the name
        # SampleSheet.csv (Illumina default); create it when it is missing.
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        if not os.path.exists(samplesheet_dest):
            try:
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(_generate_clean_samplesheet(ssparser,
                                                          fields_to_remove=['index2'],
                                                          rename_samples=True,
                                                          rename_qPCR_suffix=True,
                                                          fields_qPCR=['SampleName']))
            except Exception as e:
                # Drop a partially written file: the existence check above would
                # otherwise skip regeneration on the next attempt.
                if os.path.exists(samplesheet_dest):
                    os.remove(samplesheet_dest)
                # Log the exception itself; generic exceptions carry no ``text``
                # attribute, so reading it would raise inside this handler.
                logger.error(e)
                return False
            logger.info(("Created SampleSheet.csv for Flowcell {} in {} ".format(self.id, samplesheet_dest)))
        # From here on SampleSheet.csv is the samplesheet used for demultiplexing.
        self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, "SampleSheet.csv"))

        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([len(masks) for masks in per_lane_base_masks.values()])
        if max_different_base_masks > 1:
            # In a HiSeq X run different index sizes cannot share the SAME lane.
            logger.error("In FC {} found one or more lane with more than one base mask "
                         "(i.e., different index sizes in the same lane)".format(self.id))
            return False

        logger.info('Building bcl2fastq command')
        with chdir(self.run_dir):
            cl = [self.CONFIG.get('bcl2fastq')['bin']]
            if 'options' in self.CONFIG.get('bcl2fastq'):
                # Append all options that appear in the configuration file.
                for option in self.CONFIG['bcl2fastq']['options']:
                    if isinstance(option, dict):
                        opt, val = next(iter(option.items()))
                        cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        cl.append('--{}'.format(option))
            # Add the --use-bases-mask of every lane; the check above guarantees
            # that no lane has more than one base mask.
            for lane in sorted(per_lane_base_masks):
                base_mask = list(per_lane_base_masks[lane].values())[0]['base_mask']
                base_mask_expr = "{}:".format(lane) + ",".join(base_mask)
                cl.extend(["--use-bases-mask", base_mask_expr])

            logger.info(("BCL to FASTQ conversion and demultiplexing started for "
                 " run {} on {}".format(os.path.basename(self.id), datetime.now())))
            misc.call_external_command_detached(cl, with_log_files=True)
        return True
コード例 #3
0
ファイル: HiSeqX_Runs.py プロジェクト: senthil10/TACA
    def demultiplex_run(self):
        """Demultiplex a HiSeq X run that may mix 10X and non-10X lanes.

        Steps performed:
            - write samplesheet subsets into the run directory: when both kinds
              of lanes are present, ``SampleSheet_0.csv`` for the non-10X lanes
              and ``SampleSheet_1.csv`` for the 10X lanes; otherwise a single
              ``SampleSheet_0.csv`` for whichever lanes exist
            - compute the per-lane base masks and give up if any lane has more
              than one
            - create the ``Demultiplexing`` directory
            - launch one detached bcl2fastq command for the non-10X lanes and
              one for the 10X lanes, numbered in that order

        Returns:
            True once the commands have been launched, False when a lane has
            several base masks.
        """
        with chdir(self.run_dir):
            if self.lanes_10X and self.lanes_not_10X:
                # Mixed run: the samplesheet is split so that each kind of lane
                # gets its own bcl2fastq command.
                subsets = [("SampleSheet_0.csv", self.lanes_not_10X),
                           ("SampleSheet_1.csv", self.lanes_10X)]
            else:
                subsets = [("SampleSheet_0.csv", self.lanes_10X or self.lanes_not_10X)]
            for samplesheet_dest, lanes in subsets:
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(_generate_samplesheet_subset(self.runParserObj.samplesheet, lanes))

        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([len(masks) for masks in per_lane_base_masks.values()])
        if max_different_base_masks > 1:
            # In a HiSeq X run different index sizes cannot share the SAME lane.
            logger.error("In FC {} found one or more lane with more than one base mask "
                         "(i.e., different index sizes in the same lane)".format(self.id))
            return False

        bcl2fastq_cmd_counter = 0
        with chdir(self.run_dir):
            # Creating the Demultiplexing dir changes the run status to IN_PROGRESS.
            if not os.path.exists("Demultiplexing"):
                os.makedirs("Demultiplexing")
            if self.lanes_not_10X:
                cmd_normal = self.generate_bcl_command(self.lanes_not_10X, bcl2fastq_cmd_counter)
                misc.call_external_command_detached(cmd_normal, with_log_files=True,
                                                    prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(("BCL to FASTQ conversion and demultiplexing started for "
                             "normal run {} on {}".format(os.path.basename(self.id), datetime.now())))
                bcl2fastq_cmd_counter += 1
            if self.lanes_10X:
                cmd_10X = self.generate_bcl_command(self.lanes_10X, bcl2fastq_cmd_counter, is_10X=True)
                misc.call_external_command_detached(cmd_10X, with_log_files=True,
                                                    prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(("BCL to FASTQ conversion and demultiplexing started for "
                             "10X run {} on {}".format(os.path.basename(self.id), datetime.now())))
                bcl2fastq_cmd_counter += 1
        return True
コード例 #4
0
ファイル: NextSeq_Runs.py プロジェクト: SciLifeLab/TACA
 def demultiplex_run(self):
     """Demultiplex a NextSeq run.

     Steps performed:
         - give up when the samplesheet ``self.ssname`` does not exist
         - unless ``SampleSheet.csv`` already exists in the run directory,
           write a cleaned copy of the samplesheet there (a partially written
           file is removed on failure)
         - load ``SampleSheet.csv`` into ``self.runParserObj.samplesheet``
         - build the bcl2fastq command from the configured binary and options
           and launch it detached from inside the run directory

     Returns:
         True once the bcl2fastq command has been launched, False when the
         samplesheet is missing or ``SampleSheet.csv`` could not be written.
     """
     if not os.path.exists(self.ssname):
         # We should not really get here: such a run should have been
         # classified as a NON NGI-RUN.
         return False
     # TODO SampleSheetParser may throw an exception
     ssparser = SampleSheetParser(self.ssname)
     # The samplesheet has to sit in the flowcell directory under the name
     # SampleSheet.csv (Illumina default); create it when it is missing.
     samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
     if not os.path.exists(samplesheet_dest):
         try:
             with open(samplesheet_dest, 'wb') as fcd:
                 fcd.write(self._generate_clean_samplesheet(ssparser))
         except Exception as e:
             # Do not leave a half-written samplesheet behind.
             if os.path.exists(samplesheet_dest):
                 os.remove(samplesheet_dest)
             logger.error(e)
             return False
         logger.info(("Created SampleSheet.csv for Flowcell {} in {} "
                      .format(self.id, samplesheet_dest)))
     # SampleSheet.csv is the samplesheet handed to bcl2fastq.
     self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, "SampleSheet.csv"))
     # Make the demux call
     with chdir(self.run_dir):
         cl = [self.CONFIG.get('bcl2fastq')['bin']]
         # ``in`` and ``next(iter(...))`` replace ``has_key`` and
         # ``items()[0]``, which only exist on Python 2.
         if 'options' in self.CONFIG.get('bcl2fastq'):
             # Append all options that appear in the configuration file.
             for option in self.CONFIG['bcl2fastq']['options']:
                 if isinstance(option, dict):
                     opt, val = next(iter(option.items()))
                     cl.extend(['--{}'.format(opt), str(val)])
                 else:
                     cl.append('--{}'.format(option))
         logger.info(("BCL to FASTQ conversion and demultiplexing started for "
              " run {} on {}".format(os.path.basename(self.id), datetime.now())))
         misc.call_external_command_detached(cl, with_log_files=True)

     return True
コード例 #5
0
ファイル: HiSeq_Runs.py プロジェクト: vezzi/TACA
    def compute_undetermined(self):
        """Tell whether every demultiplexing step is finished so QC can start.

        Lanes whose samplesheet index does not contain "NoIndex" need no extra
        work. For "NoIndex" lanes the method is designed to be called
        repeatedly:

            - while ``Demultiplexing_NoIndex`` does not exist: launch a
              dedicated bcl2fastq run for those lanes and return False, or,
              when ``run_with_no_index`` gets set, only symlink the Stats files
              of ``Demultiplexing_0`` and return True
            - while that run has not produced ``DemultiplexingStats.xml``, or
              the ``ongoing`` flag file exists: return False
            - afterwards: count the index sequences of every NoIndex lane,
              write the ``DemuxSummaryF1L<lane>.txt`` files, regenerate
              ``lane.html`` and ``laneBarcode.html`` and return True

        Returns:
            True when QC can proceed; False when the caller has to wait, or
            when a NoIndex lane carries another index in the samplesheet.
        """
        samplesheet = self.runParserObj.samplesheet
        NoIndexLanes = [lane["Lane"] for lane in samplesheet.data if "NoIndex" in lane["index"]]
        if not NoIndexLanes:
            return True  # nothing special to do, QC can proceed

        noindex_dir = os.path.join(self.run_dir, "Demultiplexing_NoIndex")
        if not os.path.exists(noindex_dir):
            # These lanes were demultiplexed without index, so they have no
            # undetermined reads yet. Build the base masks of the NoIndex lanes.
            per_lane_base_masks = self._generate_per_lane_base_mask()
            per_lane_base_masks_NoIndex = {}
            run_with_no_index = False  # flags the C.Daub case
            for NoIndexLane in NoIndexLanes:
                lane_masks = per_lane_base_masks[NoIndexLane]
                per_lane_base_masks_NoIndex[NoIndexLane] = lane_masks
                base_mask_key = list(lane_masks)[0]
                # NOTE(review): the else branch iterates a base mask that this
                # very condition found to be empty, so the Y<->N swap below can
                # never run. The intended condition cannot be told from this
                # method alone; behavior is left unchanged - confirm.
                if len(lane_masks[base_mask_key]["base_mask"]):
                    # C.Daub_15_01 case, only one sample per lane and no index at all
                    run_with_no_index = True
                else:
                    new_base_mask = []
                    for baseMask_element in lane_masks[base_mask_key]["base_mask"]:
                        if baseMask_element.startswith("Y"):
                            new_base_mask.append(baseMask_element.replace("Y", "N"))
                        elif baseMask_element.startswith("N"):
                            new_base_mask.append(baseMask_element.replace("N", "Y"))
                    lane_masks[base_mask_key]["base_mask"] = new_base_mask

            if not run_with_no_index:
                os.makedirs(noindex_dir)
                command = self._generate_bcl2fastq_command(
                    per_lane_base_masks_NoIndex, True, "NoIndex", mask_short_adapter_reads=True
                )
                with chdir(self.run_dir):
                    misc.call_external_command_detached(command, with_log_files=True, prefix="demux_NoIndex")
                # The NoIndex demultiplexing has to finish before QC can start.
                return False

            # There is no index at all, so no extra demultiplexing is started.
            # Link whatever Stats files are not yet in the demux directory
            # instead of recomputing them.
            missingStatsFiles = glob.glob(os.path.join(self.run_dir, "Demultiplexing_0", "Stats", "*F*L*.txt"))
            destination = os.path.join(self.run_dir, self.demux_dir, "Stats")
            for source in missingStatsFiles:
                link_name = os.path.join(destination, os.path.basename(source))
                if not os.path.exists(link_name):
                    os.symlink(source, link_name)
            return True

        # The NoIndex demultiplexing has been started by an earlier call.
        if not os.path.exists(os.path.join(noindex_dir, "Stats", "DemultiplexingStats.xml")):
            logger.info("Demux of NoIndex lanes ongoing")
            return False
        logger.info("Demux of NoIndex lanes done.")

        flag_file = os.path.join(noindex_dir, "ongoing")
        if os.path.exists(flag_file):
            # Another instance is still processing this flowcell.
            logger.info("Counting of undetermined indexes for NoIndex lanes ongoing")
            return False

        stats_dir = os.path.join(self.run_dir, self.demux_dir, "Stats")
        if all(
            os.path.exists(os.path.join(stats_dir, "DemuxSummaryF1L{}.txt".format(lane_id)))
            for lane_id in NoIndexLanes
        ):
            # Every summary file is already there, nothing left to compute.
            return True

        open(flag_file, "a").close()  # signal that this instance is working on the flowcell
        try:
            for lane_id in NoIndexLanes:
                # Each lane corresponds to one project; a project may span several lanes.
                current_lane = [lane for lane in samplesheet.data if lane_id == lane["Lane"]][0]
                if current_lane["index"] != "NoIndex":
                    logger.error(
                        "while processing run {} NoIndex lane {}, index {} found in SampleSheet".format(
                            self.id, lane_id, current_lane["index"]
                        )
                    )
                    return False
                sample_dir = os.path.join(
                    noindex_dir,
                    current_lane[samplesheet.dfield_proj],
                    current_lane[samplesheet.dfield_sid],
                )
                sample_name = current_lane[samplesheet.dfield_snm]
                # Both files are assumed to exist (an IndexError is raised otherwise).
                indexes_fastq1 = glob.glob(
                    os.path.join(sample_dir, "{}_S?_L00{}_R2_001.fastq.gz".format(sample_name, lane_id))
                )[0]
                indexes_fastq2 = glob.glob(
                    os.path.join(sample_dir, "{}_S?_L00{}_R3_001.fastq.gz".format(sample_name, lane_id))
                )[0]
                logger.info("Computing Undetermiend indexes for NoIndex lane {}".format(lane_id))
                zcat = subprocess.Popen(["zcat", indexes_fastq1], stdout=subprocess.PIPE)
                # awk streams the second file next to the first one and prints
                # the matching lines joined by a "+".
                awk = subprocess.Popen(
                    [
                        "awk",
                        'BEGIN {{OFS="+"}}{{  ("zcat " "{0} " ) | getline line ; print $0,line }}'.format(
                            indexes_fastq2
                        ),
                    ],
                    stdout=subprocess.PIPE,
                    stdin=zcat.stdout,
                )
                # Keep only the 2nd line out of every 4, i.e. the index1+index2 line.
                sed = subprocess.Popen(["sed", "-n", "2~4p"], stdout=subprocess.PIPE, stdin=awk.stdout)
                zcat.stdout.close()
                awk.stdout.close()
                output = sed.communicate()[0]
                zcat.wait()
                awk.wait()

                index_counter = {}
                for barcode in output.split("\n")[:-1]:
                    index_counter[barcode] = index_counter.get(barcode, 0) + 1

                demuxSummary_file = os.path.join(stats_dir, "DemuxSummaryF1L{}.txt".format(lane_id))
                with open(demuxSummary_file, "w") as demuxSummary_file_fh:
                    demuxSummary_file_fh.write("### Most Popular Unknown Index Sequences\n")
                    demuxSummary_file_fh.write("### Columns: Index_Sequence Hit_Count\n")
                    for (index, occ) in sorted(index_counter.items(), key=operator.itemgetter(1), reverse=True):
                        demuxSummary_file_fh.write("{}\t{}\n".format(index, occ))

            # Demultiplexing with NoIndex leaves most values of the lane and
            # laneBarcode html reports empty, so fill them in here.
            undeterminedStats = DemuxSummaryParser(stats_dir)
            sample_data_new = []
            for lane in self.runParserObj.lanes.sample_data:
                if lane["Lane"] in NoIndexLanes:
                    PF_clusters = undeterminedStats.TOTAL[lane["Lane"]]
                    lane["% One mismatchbarcode"] = "0"
                    lane["% Perfectbarcode"] = "100"
                    lane["% of thelane"] = "100"
                    lane["PF Clusters"] = str(PF_clusters)
                sample_data_new.append(lane)
            self.runParserObj.lanes.sample_data = sample_data_new

            demux_folder = os.path.join(self.run_dir, "Demultiplexing")
            new_html_report_lane_dir = _create_folder_structure(
                demux_folder, ["Reports", "html", self.flowcell_id, "all", "all", "all"]
            )
            new_html_report_lane = os.path.join(new_html_report_lane_dir, "lane.html")
            _generate_lane_html(new_html_report_lane, self.runParserObj.lanes)

            # Same treatment for laneBarcode.
            sampleBarcode_data_new = []
            for sample in self.runParserObj.lanebarcodes.sample_data:
                if sample["Lane"] in NoIndexLanes:
                    # Look the total up by this sample's own lane; the loop
                    # variable of the lane report above would always point at
                    # the last lane processed there.
                    PF_clusters = undeterminedStats.TOTAL[sample["Lane"]]
                    sample["% One mismatchbarcode"] = "0"
                    sample["% Perfectbarcode"] = "100"
                    sample["% of thelane"] = "100"
                    sample["PF Clusters"] = str(PF_clusters)
                sampleBarcode_data_new.append(sample)
            self.runParserObj.lanebarcodes.sample_data = sampleBarcode_data_new

            demux_folder = os.path.join(self.run_dir, "Demultiplexing")
            new_html_report_sampleBarcode_dir = _create_folder_structure(
                demux_folder, ["Reports", "html", self.flowcell_id, "all", "all", "all"]
            )
            new_html_report_sampleBarcode = os.path.join(new_html_report_sampleBarcode_dir, "laneBarcode.html")
            _generate_lane_html(new_html_report_sampleBarcode, self.runParserObj.lanebarcodes)

            return True  # everything that had to be done has been done
        finally:
            # Always release the flag, also on errors and on the early return
            # above; a leftover flag would make every later call report that
            # the counting is still ongoing.
            if os.path.exists(flag_file):
                os.remove(flag_file)
コード例 #6
0
ファイル: HiSeq_Runs.py プロジェクト: vezzi/TACA
    def demultiplex_run(self):
        """Demultiplex a HiSeq run.

        Steps performed:
            - find the samplesheet and copy it into the run directory as
              ``<flowcell_id>.csv`` (copied again on every call, as it may have
              changed in the meantime)
            - write a cleaned ``SampleSheet.csv`` into the run directory,
              overwriting any previous one, and load it into
              ``self.runParserObj.samplesheet``
            - build one bcl2fastq command for all lanes with a single base
              mask, plus one command per distinct base mask found among the
              lanes with several base masks
            - create the ``Demultiplexing`` directory and launch every command
              detached

        Returns:
            None when no samplesheet is found, False when ``SampleSheet.csv``
            cannot be written. NOTE(review): no explicit value is returned
            after the commands are launched - confirm what callers expect.

        Raises:
            RuntimeError: if the samplesheet cannot be copied into the run
                directory.
        """

        ssname = self._get_samplesheet()
        if ssname is None:
            return None
        ssparser = SampleSheetParser(ssname)
        # Copy the original samplesheet locally, even if already done, as it
        # might have changed.
        try:
            shutil.copy(ssname, os.path.join(self.run_dir, "{}.csv".format(self.flowcell_id)))
            # NOTE(review): ssname is rebound to a path built from the original
            # file name, while the copy above is named after the flowcell id;
            # it is not read again in the code visible here - confirm.
            ssname = os.path.join(self.run_dir, os.path.split(ssname)[1])
        except Exception:
            # ``except Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not turned into a RuntimeError.
            raise RuntimeError("unable to copy file {} to destination {}".format(ssname, self.run_dir))

        # The samplesheet created by the LIMS is not ready to be used as is;
        # SampleSheet.csv holds the edited version handed to bcl2fastq.
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        if os.path.exists(samplesheet_dest):
            logger.info("SampleSheet.csv found ... overwriting it")
        try:
            with open(samplesheet_dest, "wb") as fcd:
                fcd.write(self._generate_clean_samplesheet(ssparser))
        except Exception as e:
            # Log the exception itself; generic exceptions carry no ``text``
            # attribute, so reading it would raise inside this handler.
            logger.error(e)
            return False
        logger.info(("Created SampleSheet.csv for Flowcell {} in {} ".format(self.id, samplesheet_dest)))
        # From here on SampleSheet.csv is the samplesheet used for demultiplexing.
        self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, "SampleSheet.csv"))
        # Generate the base masks per lane and decide how to demultiplex.
        per_lane_base_masks = self._generate_per_lane_base_mask()
        # Not read afterwards; note that max() raises ValueError when there is
        # no lane at all, which stops the method before anything is launched.
        max_different_base_masks = max([len(per_lane_base_masks[base_masks]) for base_masks in per_lane_base_masks])
        # Lanes with exactly one base mask are "simple" and share one command;
        # lanes with several base masks are "complex".
        simple_lanes = {}
        complex_lanes = {}
        for lane in per_lane_base_masks:
            if len(per_lane_base_masks[lane]) == 1:
                simple_lanes[lane] = per_lane_base_masks[lane]
            else:
                complex_lanes[lane] = per_lane_base_masks[lane]
        bcl2fastq_commands = []
        bcl2fastq_command_num = 0
        if len(simple_lanes) > 0:
            bcl2fastq_commands.append(self._generate_bcl2fastq_command(simple_lanes, True, bcl2fastq_command_num))
            bcl2fastq_command_num += 1
        # Collect the distinct masks of the complex lanes: there will be one
        # bcl2fastq command per mask.
        base_masks_complex = [complex_lanes[base_masks].keys() for base_masks in complex_lanes]
        different_masks = list(set([item for sublist in base_masks_complex for item in sublist]))
        for mask in different_masks:
            base_masks_complex_to_demux = {}
            for lane in complex_lanes:
                if mask in complex_lanes[lane]:
                    base_masks_complex_to_demux[lane] = {}
                    base_masks_complex_to_demux[lane][mask] = complex_lanes[lane][mask]
            # base_masks_complex_to_demux now holds a single base mask per lane,
            # so the command can be built.
            bcl2fastq_commands.append(
                self._generate_bcl2fastq_command(base_masks_complex_to_demux, True, bcl2fastq_command_num)
            )
            bcl2fastq_command_num += 1
        # bcl2fastq_commands now holds every command to be executed; they are
        # launched in order.
        with chdir(self.run_dir):
            # Creating the Demultiplexing dir makes the status of this run IN_PROGRESS.
            if not os.path.exists("Demultiplexing"):
                os.makedirs("Demultiplexing")
            execution = 0
            for bcl2fastq_command in bcl2fastq_commands:
                misc.call_external_command_detached(
                    bcl2fastq_command, with_log_files=True, prefix="demux_{}".format(execution)
                )
                execution += 1