Example #1
def archive_to_swestore(seconds,
                        run=None,
                        max_runs=None,
                        force=False,
                        compress_only=False):
    """Send runs (as archives) in NAS nosync to swestore for backup

    :param int seconds: Days/hours converted to seconds; age threshold to check
    :param str run: specific run to send to swestore
    :param int max_runs: number of runs to be processed simultaneously
    :param bool force: Force the archiving even if the run is not complete
    :param bool compress_only: Compress the run without sending it to swestore
    """
    # If the run is specified in the command line, check that it exists and archive it
    if run:
        run = os.path.basename(run)
        base_dir = os.path.dirname(run)
        if re.match(filesystem.RUN_RE, run):
            # If the parameter is not an absolute path, find the run in the archive_dirs
            if not base_dir:
                for archive_dir in CONFIG.get('storage').get('archive_dirs'):
                    if os.path.exists(os.path.join(archive_dir, run)):
                        base_dir = archive_dir
            if not os.path.exists(os.path.join(base_dir, run)):
                logger.error(("Run {} not found. Please make sure to specify "
                              "the absolute path or relative path being in "
                              "the correct directory.".format(run)))
            else:
                with filesystem.chdir(base_dir):
                    _archive_run((run, seconds, force, compress_only))
        else:
            logger.error(
                "The name {} doesn't look like an Illumina run".format(
                    os.path.basename(run)))
    # Otherwise find all runs in every data dir on the nosync partition
    else:
        logger.info("Archiving old runs to SWESTORE")
        for to_send_dir in CONFIG.get('storage').get('archive_dirs'):
            logger.info('Checking {} directory'.format(to_send_dir))
            with filesystem.chdir(to_send_dir):
                to_be_archived = [
                    r for r in os.listdir(to_send_dir)
                    if re.match(filesystem.RUN_RE, r) and
                    not os.path.exists("{}.archiving".format(r.split('.')[0]))
                ]
                if to_be_archived:
                    pool = Pool(processes=len(to_be_archived
                                              ) if not max_runs else max_runs)
                    pool.map_async(_archive_run,
                                   ((run, seconds, force, compress_only)
                                    for run in to_be_archived))
                    pool.close()
                    pool.join()
                else:
                    logger.info('No old runs to be archived')
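
Note: Pool.map_async above passes each item as a single argument, so _archive_run must unpack the (run, seconds, force, compress_only) tuple itself. A minimal sketch of such a worker, purely for illustration (the project's real _archive_run, which compresses the run and uploads it to swestore, is not shown in this listing):

import logging
import os
import time

logger = logging.getLogger(__name__)

def _archive_run(args):
    # Pool.map_async delivers one object per call, so unpack the tuple here.
    run, seconds, force, compress_only = args
    rta_file = os.path.join(run, 'RTAComplete.txt')
    # Only archive runs whose completion marker is older than the threshold,
    # unless archiving is forced.
    ready = os.path.exists(rta_file) and os.stat(rta_file).st_mtime < time.time() - seconds
    if not (ready or force):
        logger.info('Run %s is not ready to be archived, skipping', run)
        return
    logger.info('Would compress run %s%s', run,
                '' if compress_only else ' and send it to swestore')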
Example #2
    def demultiplex_run(self):
        """
           Demultiplex a Xten run:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep)
            - run bcl2fastq conversion
        """
        # We have a 10X lane: need to split the sample sheet and build a 10X command for bcl2fastq
        Complex_run = False
        if len(self.lanes_10X) and len(self.lanes_not_10X):
            Complex_run = True

        if Complex_run:
            with chdir(self.run_dir):
                samplesheet_dest_not_10X="SampleSheet_0.csv"
                with open(samplesheet_dest_not_10X, 'wb') as fcd:
                    fcd.write(_generate_samplesheet_subset(self.runParserObj.samplesheet, self.lanes_not_10X))
                samplesheet_dest_10X="SampleSheet_1.csv"
                with open(samplesheet_dest_10X, 'wb') as fcd:
                    fcd.write(_generate_samplesheet_subset(self.runParserObj.samplesheet, self.lanes_10X))
        else:
            with chdir(self.run_dir):
                samplesheet_dest="SampleSheet_0.csv"
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(_generate_samplesheet_subset(self.runParserObj.samplesheet, (self.lanes_10X or self.lanes_not_10X)))

        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([len(per_lane_base_masks[base_masks]) for base_masks in per_lane_base_masks])
        if max_different_base_masks > 1:
            # in a HiSeqX run I cannot have different index sizes in the SAME lane
            logger.error("In FC {} found one or more lanes with more than one base mask "
                         "(i.e., different index sizes in the same lane)".format(self.id))
            return False
        bcl2fastq_cmd_counter = 0
        with chdir(self.run_dir):
            # create Demultiplexing dir, this changes the status to IN_PROGRESS
            if not os.path.exists("Demultiplexing"):
                os.makedirs("Demultiplexing")
        with chdir(self.run_dir):
            if self.lanes_not_10X:
                cmd_normal = self.generate_bcl_command(self.lanes_not_10X, bcl2fastq_cmd_counter)
                misc.call_external_command_detached(cmd_normal, with_log_files=True, prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(("BCL to FASTQ conversion and demultiplexing started for "
                             "normal run {} on {}".format(os.path.basename(self.id), datetime.now())))
                bcl2fastq_cmd_counter += 1
            if self.lanes_10X:
                cmd_10X = self.generate_bcl_command(self.lanes_10X, bcl2fastq_cmd_counter, is_10X=True)
                misc.call_external_command_detached(cmd_10X, with_log_files=True, prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(("BCL to FASTQ conversion and demultiplexing started for "
                             "10X run {} on {}".format(os.path.basename(self.id), datetime.now())))
                bcl2fastq_cmd_counter += 1
        return True
Example #3
def archive_to_swestore(days, run=None, max_runs=None, force=False, compress_only=False):
    """Send runs (as archives) in NAS nosync to swestore for backup

    :param int days: Number of days used as the age threshold
    :param str run: specific run to send to swestore
    :param int max_runs: number of runs to be processed simultaneously
    :param bool force: Force the archiving even if the run is not complete
    :param bool compress_only: Compress the run without sending it to swestore
    """
    # If the run is specified in the command line, check that it exists and archive it
    if run:
        run = os.path.basename(run)
        base_dir = os.path.dirname(run)
        if re.match(filesystem.RUN_RE, run):
            # If the parameter is not an absolute path, find the run in the archive_dirs
            if not base_dir:
                for archive_dir in CONFIG.get("storage").get("archive_dirs"):
                    if os.path.exists(os.path.join(archive_dir, run)):
                        base_dir = archive_dir
            if not os.path.exists(os.path.join(base_dir, run)):
                logger.error(
                    (
                        "Run {} not found. Please make sure to specify "
                        "the absolute path or relative path being in "
                        "the correct directory.".format(run)
                    )
                )
            else:
                with filesystem.chdir(base_dir):
                    _archive_run((run, days, force, compress_only))
        else:
            logger.error("The name {} doesn't look like an Illumina run".format(os.path.basename(run)))
    # Otherwise find all runs in every data dir on the nosync partition
    else:
        logger.info("Archiving old runs to SWESTORE")
        for to_send_dir in CONFIG.get("storage").get("archive_dirs"):
            logger.info("Checking {} directory".format(to_send_dir))
            with filesystem.chdir(to_send_dir):
                to_be_archived = [
                    r
                    for r in os.listdir(to_send_dir)
                    if re.match(filesystem.RUN_RE, r) and not os.path.exists("{}.archiving".format(r.split(".")[0]))
                ]
                if to_be_archived:
                    pool = Pool(processes=len(to_be_archived) if not max_runs else max_runs)
                    pool.map_async(_archive_run, ((run, days, force, compress_only) for run in to_be_archived))
                    pool.close()
                    pool.join()
                else:
                    logger.info("No old runs to be archived")
Example #4
def cleanup_processing(days):
    """Cleanup runs in processing server.

    :param int days: Number of days to consider a run to be old
    """
    transfer_file = os.path.join(CONFIG.get("preprocessing", {}).get("status_dir"), "transfer.tsv")
    if not days:
        days = CONFIG.get("cleanup", {}).get("processing-server", {}).get("days", 10)
    try:
        # Move finished runs to nosync
        for data_dir in CONFIG.get("storage").get("data_dirs"):
            logger.info("Moving old runs in {}".format(data_dir))
            with filesystem.chdir(data_dir):
                for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]:
                    if filesystem.is_in_file(transfer_file, run):
                        logger.info("Moving run {} to nosync directory".format(os.path.basename(run)))
                        shutil.move(run, "nosync")
                    else:
                        logger.info(
                            ("Run {} has not been transferred to the analysis " "server yet, not archiving".format(run))
                        )
        # Remove old runs from archiving dirs
        for archive_dir in CONFIG.get("storage").get("archive_dirs").values():
            logger.info("Removing old runs in {}".format(archive_dir))
            with filesystem.chdir(archive_dir):
                for run in [r for r in os.listdir(archive_dir) if re.match(filesystem.RUN_RE, r)]:
                    rta_file = os.path.join(run, "RTAComplete.txt")
                    if os.path.exists(rta_file):
                        # 1 day == 60*60*24 seconds --> 86400
                        if os.stat(rta_file).st_mtime < time.time() - (86400 * days) and filesystem.is_in_swestore(
                            "{}.tar.bz2".format(run)
                        ):
                            logger.info("Removing run {} to nosync directory".format(os.path.basename(run)))
                            shutil.rmtree(run)
                        else:
                            logger.info(
                                "RTAComplete.txt file exists but is not older than {} day(s), skipping run {}".format(
                                    str(days), run
                                )
                            )

    except IOError:
        sbj = "Cannot archive old runs in processing server"
        msg = "Could not find transfer.tsv file, so I cannot decide if I should " "archive any run or not."
        cnt = CONFIG.get("contact", None)
        if not cnt:
            cnt = "{}@localhost".format(getpass.getuser())
        logger.error(msg)
        misc.send_mail(sbj, msg, cnt)
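
Example #4 falls back to misc.send_mail(sbj, msg, cnt) when the transfer file cannot be read. That helper is not part of this listing; a minimal sketch of what such a function could look like, assuming a plain-text mail sent through a local relay (the function name, default sender and relay host are assumptions):

import smtplib
from email.mime.text import MIMEText

def send_mail(subject, content, receiver, sender='taca@localhost'):
    # Build a plain-text message and hand it to the local SMTP relay.
    msg = MIMEText(content)
    msg['Subject'] = subject
    msg['From'] = sender
    msg['To'] = receiver
    with smtplib.SMTP('localhost') as server:
        server.sendmail(sender, [receiver], msg.as_string())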
Example #5
    def demultiplex(self):
        """Perform demultiplexing of the flowcell.

        Takes software (bcl2fastq version to use) and parameters from the configuration
        file.
        """
        logger.info('Building bcl2fastq command')
        config = CONFIG['analysis']
        with chdir(self.run_dir):
            cl = [config.get('bcl2fastq').get(self.run_type)]
            if 'options' in config['bcl2fastq']:
                cl_options = config['bcl2fastq']['options']

                # Append all options that appear in the configuration file to the main command.
                # Options that require a value, e.g. --use-bases-mask Y8,I8,Y8, are given
                # as one-item dictionaries, while options that don't require a value,
                # e.g. --no-lane-splitting, are given as plain strings.
                for option in cl_options:
                    if isinstance(option, dict):
                        opt, val = option.popitem()
                        cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        cl.append('--{}'.format(option))

            logger.info(("BCL to FASTQ conversion and demultiplexing started for "
                         " run {} on {}".format(os.path.basename(self.id), datetime.now())))

            misc.call_external_command_detached(cl, with_log_files=True)
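
The loop above flattens a mixed list of plain strings and one-item dicts from the configuration into bcl2fastq flags. A small standalone illustration of that convention (the option values are invented, and next(iter(...)) is used here instead of popitem() so the configuration is not mutated):

# Assumed shape of CONFIG['analysis']['bcl2fastq']['options']:
cl_options = [
    'no-lane-splitting',                 # option without a value
    {'use-bases-mask': 'Y151,I8,Y151'},  # option with a value
    {'loading-threads': 4},
]

cl = ['bcl2fastq']
for option in cl_options:
    if isinstance(option, dict):
        opt, val = next(iter(option.items()))
        cl.extend(['--{}'.format(opt), str(val)])
    else:
        cl.append('--{}'.format(option))

print(' '.join(cl))
# bcl2fastq --no-lane-splitting --use-bases-mask Y151,I8,Y151 --loading-threads 4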
Example #6
def cleanup_processing(seconds):
    """Cleanup runs in processing server.

    :param int seconds: Days/hours converted to seconds; threshold to consider a run old
    """
    try:
        #Remove old runs from archiving dirs
        for archive_dir in CONFIG.get('storage').get('archive_dirs').values():
            logger.info('Removing old runs in {}'.format(archive_dir))
            with filesystem.chdir(archive_dir):
                for run in [r for r in os.listdir(archive_dir) if re.match(filesystem.RUN_RE, r)]:
                    rta_file = os.path.join(run, finished_run_indicator)
                    if os.path.exists(rta_file):
                        if os.stat(rta_file).st_mtime < time.time() - seconds:
                            logger.info('Removing run {} from the archiving directory'.format(os.path.basename(run)))
                            shutil.rmtree(run)
                        else:
                            logger.info('{} file exists but is not older than given time, skipping run {}'.format(
                                        finished_run_indicator, run))
    except IOError:
        sbj = "Cannot archive old runs in processing server"
        msg = ("Could not find transfer.tsv file, so I cannot decide if I should "
               "archive any run or not.")
        cnt = CONFIG.get('contact', None)
        if not cnt:
            cnt = "{}@localhost".format(getpass.getuser())
        logger.error(msg)
        misc.send_mail(sbj, msg, cnt)
Example #7
    def generate_bcl_command(self, lanes, bcl2fastq_cmd_counter, is_10X=False):
        #I have everything to run demultiplexing now.
        logger.info('Building a bcl2fastq command')
        per_lane_base_masks = self._generate_per_lane_base_mask()
        with chdir(self.run_dir):
            cl = [self.CONFIG.get('bcl2fastq')['bin']]
            output_dir = "Demultiplexing_{}".format(bcl2fastq_cmd_counter)
            cl.extend(["--output-dir", output_dir])
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            cl_options = []
            if 'options' in self.CONFIG.get('bcl2fastq'):
                for option in self.CONFIG['bcl2fastq']['options']:
                    cl_options.extend([option])
                # Add the extra 10X command options if we have a 10X run
                if is_10X:
                    cl_options.extend(self.CONFIG['bcl2fastq']['options_10X'])
                # Append all options that appear in the configuration file to the main command.
                for option in cl_options:
                    if isinstance(option, dict):
                        opt, val = list(option.items())[0]
                        if "output-dir" not in opt:
                            cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        cl.append('--{}'.format(option))

            cl.extend(["--sample-sheet",  os.path.join(os.path.join(self.run_dir, "SampleSheet_{}.csv".format(bcl2fastq_cmd_counter)))])
            #now add the base_mask for each lane
            for lane in sorted(lanes):
                #Iterate through each lane and add the correct --use-bases-mask for that lane
                base_mask = [per_lane_base_masks[lane][bm]['base_mask'] for bm in per_lane_base_masks[lane]][0] # get the base_mask
                base_mask_expr = "{}:".format(lane) + ",".join(base_mask)
                cl.extend(["--use-bases-mask", base_mask_expr])
        return cl
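
The access pattern per_lane_base_masks[lane][bm]['base_mask'] implies a nested dict keyed by lane, then by mask name, holding a list of read/index mask strings. A self-contained illustration of how the --use-bases-mask expressions are derived from that assumed structure (the mask values are invented):

per_lane_base_masks = {
    '1': {'Y151I8Y151': {'base_mask': ['Y151', 'I8', 'Y151']}},
    '2': {'Y151I8Y151': {'base_mask': ['Y151', 'I8', 'Y151']}},
}

for lane in sorted(per_lane_base_masks):
    # each lane is assumed to carry exactly one base mask here
    base_mask = next(iter(per_lane_base_masks[lane].values()))['base_mask']
    print('--use-bases-mask', '{}:{}'.format(lane, ','.join(base_mask)))
# --use-bases-mask 1:Y151,I8,Y151
# --use-bases-mask 2:Y151,I8,Y151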
Example #8
 def create_report(self):
     """ Create a sample report and an aggregate report via a system call """
     logprefix = os.path.abspath(
         self.expand_path(os.path.join(self.logpath, "{}-{}".format(
             self.projectid, self.sampleid))))
     try:
         if not create_folder(os.path.dirname(logprefix)):
             logprefix = None
     except AttributeError:
         logprefix = None
     with chdir(self.expand_path(self.reportpath)):
         # create the ign_sample_report for this sample
         cl = self.report_sample.split(' ')
         cl.extend(["--samples",self.sampleid])
         call_external_command(
             cl,
             with_log_files=(logprefix is not None),
             prefix="{}_sample".format(logprefix))
         # estimate the delivery date for this sample to 0.5 days ahead
         cl = self.report_aggregate.split(' ')
         cl.extend([
             "--samples_extra",
             json.dumps({
                 self.sampleid: {
                     "delivered": "{}(expected)".format(
                         _timestamp(days=0.5))}})
         ])
         call_external_command(
             cl,
             with_log_files=(logprefix is not None),
             prefix="{}_aggregate".format(logprefix))
Example #9
 def create_report(self):
     """ Create a sample report and an aggregate report via a system call """
     logprefix = os.path.abspath(
         self.expand_path(
             os.path.join(self.logpath,
                          "{}-{}".format(self.projectid, self.sampleid))))
     try:
         if not create_folder(os.path.dirname(logprefix)):
             logprefix = None
     except AttributeError:
         logprefix = None
     with chdir(self.expand_path(self.reportpath)):
         # create the ign_sample_report for this sample
         cl = self.report_sample.split(' ')
         cl.extend(["--samples", self.sampleid])
         call_external_command(cl,
                               with_log_files=(logprefix is not None),
                               prefix="{}_sample".format(logprefix))
         # estimate the delivery date for this sample to 0.5 days ahead
         cl = self.report_aggregate.split(' ')
         cl.extend([
             "--samples_extra",
             json.dumps({
                 self.sampleid: {
                     "delivered":
                     "{}(expected)".format(_timestamp(days=0.5))
                 }
             })
         ])
         call_external_command(cl,
                               with_log_files=(logprefix is not None),
                               prefix="{}_aggregate".format(logprefix))
Example #10
def cleanup_processing(seconds):
    """Cleanup runs in processing server.

    :param int seconds: Days/hours converted to seconds; threshold to consider a run old
    """
    try:
        #Remove old runs from archiving dirs
        for archive_dir in CONFIG.get('storage').get('archive_dirs').values():
            logger.info('Removing old runs in {}'.format(archive_dir))
            with filesystem.chdir(archive_dir):
                for run in [
                        r for r in os.listdir(archive_dir)
                        if re.match(filesystem.RUN_RE, r)
                ]:
                    rta_file = os.path.join(run, finished_run_indicator)
                    if os.path.exists(rta_file):
                        if os.stat(rta_file).st_mtime < time.time() - seconds:
                            logger.info(
                                'Removing run {} from the archiving directory'.format(
                                    os.path.basename(run)))
                            shutil.rmtree(run)
                        else:
                            logger.info(
                                '{} file exists but is not older than given time, skipping run {}'
                                .format(finished_run_indicator, run))
    except IOError:
        sbj = "Cannot archive old runs in processing server"
        msg = (
            "Could not find transfer.tsv file, so I cannot decide if I should "
            "archive any run or not.")
        cnt = CONFIG.get('contact', None)
        if not cnt:
            cnt = "{}@localhost".format(getpass.getuser())
        logger.error(msg)
        misc.send_mail(sbj, msg, cnt)
Example #11
    def demultiplex_run(self):
        """
           Demultiplex a Xten run:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep)
            - run bcl2fastq conversion
        """

        ssname   = self._get_samplesheet()
        ssparser = SampleSheetParser(ssname)
        #the samplesheet needs to be positioned in the FC directory with name SampleSheet.csv (Illumina default)
        #if this is not the case then create it and take special care of the modifications to be done on the SampleSheet
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        #check whether the samplesheet is already present; in that case go to the next step
        if not os.path.exists(samplesheet_dest):
            try:
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(_generate_clean_samplesheet(ssparser, fields_to_remove=['index2'], rename_samples=True, rename_qPCR_suffix = True, fields_qPCR=['SampleName']))
            except Exception as e:
                logger.error(e)
                return False
            logger.info(("Created SampleSheet.csv for Flowcell {} in {} ".format(self.id, samplesheet_dest)))
        ##SampleSheet.csv generated
        ##when demultiplexing SampleSheet.csv is the one I need to use
        self.runParserObj.samplesheet  = SampleSheetParser(os.path.join(self.run_dir, "SampleSheet.csv"))

        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([len(per_lane_base_masks[base_masks]) for base_masks in per_lane_base_masks])
        if max_different_base_masks > 1:
            # in a HiSeqX run I cannot have different index sizes in the SAME lane
            logger.error("In FC {} found one or more lanes with more than one base mask "
                         "(i.e., different index sizes in the same lane)".format(self.id))
            return False
        #I have everything to run demultiplexing now.
        logger.info('Building bcl2fastq command')

        with chdir(self.run_dir):
            cl = [self.CONFIG.get('bcl2fastq')['bin']]
            if 'options' in self.CONFIG.get('bcl2fastq'):
                cl_options = self.CONFIG['bcl2fastq']['options']
                # Append all options that appear in the configuration file to the main command.
                for option in cl_options:
                    if isinstance(option, dict):
                        opt, val = list(option.items())[0]
                        cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        cl.append('--{}'.format(option))
            #now add the base_mask for each lane
            for lane in sorted(per_lane_base_masks):
                #iterate through each lane and add the correct --use-bases-mask for that lane
                #there is a single basemask for each lane, I checked it a couple of lines above
                base_mask = [per_lane_base_masks[lane][bm]['base_mask'] for bm in per_lane_base_masks[lane]][0] # get the base_mask
                base_mask_expr = "{}:".format(lane) + ",".join(base_mask)
                cl.extend(["--use-bases-mask", base_mask_expr])

            logger.info(("BCL to FASTQ conversion and demultiplexing started for "
                 " run {} on {}".format(os.path.basename(self.id), datetime.now())))
            misc.call_external_command_detached(cl, with_log_files=True)
        return True
Example #12
def cleanup_nas(seconds):
    """Will move the finished runs in NASes to nosync directory.

    :param int seconds: Days/hours converted to seconds; threshold to consider a run old
    """
    couch_info = CONFIG.get('statusdb')
    mail_recipients = CONFIG.get('mail', {}).get('recipients')
    check_demux = CONFIG.get('storage', {}).get('check_demux', False)
    host_name = os.getenv('HOSTNAME', os.uname()[1]).split('.', 1)[0]
    for data_dir in CONFIG.get('storage').get('data_dirs'):
        logger.info('Moving old runs in {}'.format(data_dir))
        with filesystem.chdir(data_dir):
            for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]:
                rta_file = os.path.join(run, finished_run_indicator)
                if os.path.exists(rta_file):
                    if check_demux:
                        if misc.run_is_demuxed(run, couch_info):
                            logger.info('Moving run {} to nosync directory'.format(os.path.basename(run)))
                            shutil.move(run, 'nosync')
                        elif os.stat(rta_file).st_mtime < time.time() - seconds:
                            logger.warn('Run {} is older than given time, but it is not demultiplexed yet'
                                        .format(run))
                            sbt = "Run not demultiplexed - {}".format(run)
                            msg = ("Run '{}' in '{}' is older then given threshold, but seems like it is not "
                                  "yet demultiplexed".format(os.path.join(data_dir, run), host_name))
                            misc.send_mail(sbt, msg, mail_recipients)
                    else:
                        if os.stat(rta_file).st_mtime < time.time() - seconds:
                            logger.info('Moving run {} to nosync directory'.format(os.path.basename(run)))
                            shutil.move(run, 'nosync')
                        else:
                            logger.info('{} file exists but is not older than given time, skipping run {}'
                                        .format(finished_run_indicator, run))
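
Every example wraps directory-sensitive work in filesystem.chdir(...) (or chdir(...)). The project's implementation is not shown here; a typical context manager of this kind, sketched with contextlib, restores the previous working directory even if the body raises:

import os
from contextlib import contextmanager

@contextmanager
def chdir(new_dir):
    # Temporarily change the working directory, restoring it afterwards.
    previous = os.getcwd()
    os.chdir(new_dir)
    try:
        yield
    finally:
        os.chdir(previous)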
Example #13
def cleanup_processing(days):
    """Cleanup runs in processing server.

    :param int days: Number of days to consider a run to be old
    """
    transfer_file = os.path.join(CONFIG.get('preprocessing', {}).get('status_dir'), 'transfer.tsv')
    if not days:
        days = CONFIG.get('cleanup', {}).get('processing-server', {}).get('days', 10)
    try:
        #Move finished runs to nosync
        for data_dir in CONFIG.get('storage').get('data_dirs'):
            logger.info('Moving old runs in {}'.format(data_dir))
            with filesystem.chdir(data_dir):
                for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]:
                    if filesystem.is_in_file(transfer_file, run):
                        logger.info('Moving run {} to nosync directory'
                                    .format(os.path.basename(run)))
                        shutil.move(run, 'nosync')
                    else:
                        logger.info(("Run {} has not been transferred to the analysis "
                            "server yet, not archiving".format(run)))
        #Remove old runs from archiving dirs
        for archive_dir in CONFIG.get('storage').get('archive_dirs').values():
            logger.info('Removing old runs in {}'.format(archive_dir))
            with filesystem.chdir(archive_dir):
                for run in [r for r in os.listdir(archive_dir) if re.match(filesystem.RUN_RE, r)]:
                    rta_file = os.path.join(run, 'RTAComplete.txt')
                    if os.path.exists(rta_file):
                        # 1 day == 60*60*24 seconds --> 86400
                        if os.stat(rta_file).st_mtime < time.time() - (86400 * days) and \
                                filesystem.is_in_swestore("{}.tar.bz2".format(run)):
                            logger.info('Removing run {} from the archiving directory'
                                        .format(os.path.basename(run)))
                            shutil.rmtree(run)
                        else:
                            logger.info('RTAComplete.txt file exists but is not older than {} day(s), skipping run {}'.format(str(days), run))

    except IOError:
        sbj = "Cannot archive old runs in processing server"
        msg = ("Could not find transfer.tsv file, so I cannot decide if I should "
               "archive any run or not.")
        cnt = CONFIG.get('contact', None)
        if not cnt:
            cnt = "{}@localhost".format(getpass.getuser())
        logger.error(msg)
        misc.send_mail(sbj, msg, cnt)
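
Example #13 gates the move on filesystem.is_in_file(transfer_file, run). A plausible minimal implementation simply scans the TSV for the run id (hypothetical; the real helper may check a specific column). Leaving the IOError uncaught matches the example above, where a missing transfer.tsv is turned into a notification mail:

def is_in_file(path, keyword):
    # True if any line of the file contains the keyword; raises IOError if the file is missing.
    with open(path) as fh:
        return any(keyword in line for line in fh)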
Example #14
    def generate_bcl_command(self, sample_type, mask_table,
                             bcl2fastq_cmd_counter):
        # I have everything to run demultiplexing now.
        logger.info('Building a bcl2fastq command')
        per_lane_base_masks = self._generate_per_lane_base_mask(
            sample_type, mask_table)
        with chdir(self.run_dir):
            cl = [self.CONFIG.get('bcl2fastq')['bin']]
            output_dir = 'Demultiplexing_{}'.format(bcl2fastq_cmd_counter)
            cl.extend(['--output-dir', output_dir])
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            cl_options = []
            if 'options' in self.CONFIG.get('bcl2fastq'):
                for option in self.CONFIG['bcl2fastq']['options']:
                    cl_options.extend([option])
                # Add the extra 10X command options if we have 10X Genomic or ATAC samples
                if sample_type == '10X_GENO' or sample_type == '10X_ATAC':
                    cl_options.extend(self.CONFIG['bcl2fastq']['options_10X'])
                # Add the extra 10X command options if we have 10X ST samples
                if sample_type == '10X_ST':
                    cl_options.extend(
                        self.CONFIG['bcl2fastq']['options_10X_ST'])
                # Add the extra command option if we have samples with IDT UMI
                if sample_type == 'IDT_UMI':
                    cl_options.extend(
                        self.CONFIG['bcl2fastq']['options_IDT_UMI'])
                # Add the extra Smart-seq command options if we have Smart-seq samples
                if sample_type == 'SMARTSEQ':
                    cl_options.extend(
                        self.CONFIG['bcl2fastq']['options_SMARTSEQ'])
                # Append all options that appear in the configuration file to the main command.
                for option in cl_options:
                    if isinstance(option, dict):
                        opt, val = list(option.items())[0]
                        if 'output-dir' not in opt:
                            cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        cl.append('--{}'.format(option))

            cl.extend([
                '--sample-sheet',
                os.path.join(self.run_dir,
                             'SampleSheet_{}.csv'.format(bcl2fastq_cmd_counter))
            ])
            # Add the base_mask for each lane
            lanes = list(mask_table.keys())
            for lane in sorted(lanes):
                # Iterate through each lane and add the correct --use-bases-mask for that lane
                base_mask = [
                    per_lane_base_masks[lane][bm]['base_mask']
                    for bm in per_lane_base_masks[lane]
                ][0]  # Get the base_mask
                base_mask_expr = '{}:'.format(lane) + ','.join(base_mask)
                cl.extend(['--use-bases-mask', base_mask_expr])
        return cl
Example #15
def cleanup_nas(seconds):
    """Will move the finished runs in NASes to nosync directory.

    :param int seconds: Days/hours converted to seconds; threshold to consider a run old
    """
    couch_info = CONFIG.get('statusdb')
    mail_recipients = CONFIG.get('mail', {}).get('recipients')
    check_demux = CONFIG.get('storage', {}).get('check_demux', False)
    host_name = os.getenv('HOSTNAME', os.uname()[1]).split('.', 1)[0]
    for data_dir in CONFIG.get('storage').get('data_dirs'):
        if not os.path.exists(data_dir) or not os.path.isdir(data_dir):
            logger.warn(
                "Data directory '{}' does not exist or is not a directory".format(
                    data_dir))
            continue
        logger.info('Moving old runs in {}'.format(data_dir))
        with filesystem.chdir(data_dir):
            for run in [
                    r for r in os.listdir(data_dir)
                    if re.match(filesystem.RUN_RE, r)
            ]:
                rta_file = os.path.join(run, finished_run_indicator)
                if os.path.exists(rta_file):
                    if check_demux:
                        if misc.run_is_demuxed(run, couch_info):
                            logger.info(
                                'Moving run {} to nosync directory'.format(
                                    os.path.basename(run)))
                            shutil.move(run, 'nosync')
                        elif 'miseq' in data_dir:
                            miseq_run = MiSeq_Run(run, CONFIG)
                            if miseq_run.get_run_type() == 'NON-NGI-RUN':
                                logger.info(
                                    'Run {} is a non-platform run, so moving it to nosync directory'
                                    .format(os.path.basename(run)))
                                shutil.move(run, 'nosync')
                        elif os.stat(
                                rta_file).st_mtime < time.time() - seconds:
                            logger.warn(
                                'Run {} is older than given time, but it is not demultiplexed yet'
                                .format(run))
                            sbt = "Run not demultiplexed - {}".format(run)
                            msg = (
                                "Run '{}' in '{}' is older than given threshold, but seems like it is not "
                                "yet demultiplexed".format(
                                    os.path.join(data_dir, run), host_name))
                            misc.send_mail(sbt, msg, mail_recipients)
                    else:
                        if os.stat(rta_file).st_mtime < time.time() - seconds:
                            logger.info(
                                'Moving run {} to nosync directory'.format(
                                    os.path.basename(run)))
                            shutil.move(run, 'nosync')
                        else:
                            logger.info(
                                '{} file exists but is not older than given time, skipping run {}'
                                .format(finished_run_indicator, run))
Example #16
def cleanup_uppmax(site, days, dry_run=False):
    """Remove project/run that have been closed more than 'days'
    from the given 'site' on uppmax

    :param str site: site where the cleanup should be performed
    :param int days: number of days to check for closed projects
    """
    days = check_days(site, days, config)
    if not days:
        return
    root_dir = CONFIG.get("cleanup").get(site).get("root")
    deleted_log = CONFIG.get("cleanup").get("deleted_log")
    assert os.path.exists(os.path.join(root_dir, deleted_log)), "Log directory {} doesn't exist in {}".format(
        deleted_log, root_dir
    )
    log_file = os.path.join(root_dir, "{fl}/{fl}.log".format(fl=deleted_log))

    # make a connection for project db #
    pcon = statusdb.ProjectSummaryConnection()
    assert pcon, "Could not connect to project database in StatusDB"

    if site != "archive":
        ## work flow for cleaning up illumina/analysis ##
        projects = [p for p in os.listdir(root_dir) if re.match(filesystem.PROJECT_RE, p)]
        list_to_delete = get_closed_projects(projects, pcon, days)
    else:
        ##work flow for cleaning archive ##
        list_to_delete = []
        archived_in_swestore = filesystem.list_runs_in_swestore(
            path=CONFIG.get("cleanup").get("swestore").get("root"), no_ext=True
        )
        runs = [r for r in os.listdir(root_dir) if re.match(filesystem.RUN_RE, r)]
        with filesystem.chdir(root_dir):
            for run in runs:
                fc_date = run.split("_")[0]
                if misc.days_old(fc_date) > days:
                    if run in archived_in_swestore:
                        list_to_delete.append(run)
                    else:
                        logger.warn(
                            "Run {} is older than {} days but not in " "swestore, so SKIPPING".format(run, days)
                        )

    ## delete and log
    for item in list_to_delete:
        if dry_run:
            logger.info("Will remove {} from {}".format(item, root_dir))
            continue
        try:
            shutil.rmtree(os.path.join(root_dir, item))
            logger.info("Removed project {} from {}".format(item, root_dir))
            with open(log_file, "a") as to_log:
                to_log.write("{}\t{}\n".format(item, datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M")))
        except OSError:
            logger.warn("Could not remove path {} from {}".format(item, root_dir))
            continue
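
The archive branch above decides by misc.days_old(fc_date), where fc_date is the leading date field of the run-folder name (e.g. 170412). A hedged sketch of such a helper, assuming the six-digit YYMMDD convention (the real misc.days_old may handle other formats):

from datetime import datetime

def days_old(date_str, date_format='%y%m%d'):
    # Days elapsed since a YYMMDD flowcell date, e.g. '170412' -> 2017-04-12.
    then = datetime.strptime(date_str, date_format)
    return (datetime.now() - then).days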
Example #17
def cleanup_uppmax(site, days, dry_run=False):
    """Remove project/run that have been closed more than 'days'
    from the given 'site' on uppmax

    :param str site: site where the cleanup should be performed
    :param int days: number of days to check for closed projects
    """
    days = check_days(site, days, config)
    if not days:
        return
    root_dir = CONFIG.get('cleanup').get(site).get('root')
    deleted_log = CONFIG.get('cleanup').get('deleted_log')
    assert os.path.exists(os.path.join(root_dir,deleted_log)), "Log directory {} doesn't exist in {}".format(deleted_log,root_dir)
    log_file = os.path.join(root_dir,"{fl}/{fl}.log".format(fl=deleted_log))

    # make a connection for project db #
    pcon = statusdb.ProjectSummaryConnection()
    assert pcon, "Could not connect to project database in StatusDB"

    if site != "archive":
        ## work flow for cleaning up illumina/analysis ##
        projects = [ p for p in os.listdir(root_dir) if re.match(filesystem.PROJECT_RE,p) ]
        list_to_delete = get_closed_projects(projects, pcon, days)
    else:
        ##work flow for cleaning archive ##
        list_to_delete = []
        archived_in_swestore = filesystem.list_runs_in_swestore(path=CONFIG.get('cleanup').get('swestore').get('root'), no_ext=True)
        runs = [ r for r in os.listdir(root_dir) if re.match(filesystem.RUN_RE,r) ]
        with filesystem.chdir(root_dir):
            for run in runs:
                fc_date = run.split('_')[0]
                if misc.days_old(fc_date) > days:
                    if run in archived_in_swestore:
                        list_to_delete.append(run)
                    else:
                        logger.warn("Run {} is older than {} days but not in "
                                    "swestore, so SKIPPING".format(run, days))

    ## delete and log
    for item in list_to_delete:
        if dry_run:
            logger.info('Will remove {} from {}'.format(item,root_dir))
            continue
        try:
            shutil.rmtree(os.path.join(root_dir,item))
            logger.info('Removed project {} from {}'.format(item,root_dir))
            with open(log_file,'a') as to_log:
                to_log.write("{}\t{}\n".format(item,datetime.strftime(datetime.now(),'%Y-%m-%d %H:%M')))
        except OSError:
            logger.warn("Could not remove path {} from {}"
                        .format(item,root_dir))
            continue
Example #18
    def demultiplex_run(self):
        """ Demultiplex a NextSeq run:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep)
            - run bcl2fastq conversion
        """
        if not os.path.exists(self.ssname):
            # We should not get here really and this run should be defined as NON NGI-RUN
            return False
        # TODO SampleSheetParser may throw an exception
        ssparser = SampleSheetParser(self.ssname)
        # The samplesheet needs to be positioned in the FC directory with name SampleSheet.csv (Illumina default)
        # if this is not the case then create it and take special care of the modifications to be done on the SampleSheet
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        # Check whether the samplesheet is already present; in that case go to the next step
        if not os.path.exists(samplesheet_dest):
            try:
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(self._generate_clean_samplesheet(ssparser))
            except Exception as e:
                if os.path.exists(samplesheet_dest):
                    os.remove(samplesheet_dest)
                logger.error(e)
                return False
            logger.info(
                ("Created SampleSheet.csv for Flowcell {} in {} ".format(
                    self.id, samplesheet_dest)))
        # SampleSheet.csv generated to be used in bcl2fastq
        self.runParserObj.samplesheet = SampleSheetParser(
            os.path.join(self.run_dir, "SampleSheet.csv"))
        # Make the demux call
        with chdir(self.run_dir):
            cl = [self.CONFIG.get('bcl2fastq')['bin']]
            if 'options' in self.CONFIG.get('bcl2fastq'):
                cl_options = self.CONFIG['bcl2fastq']['options']
                # Append all options that appear in the configuration file to the main command.
                for option in cl_options:
                    if isinstance(option, dict):
                        opt, val = list(option.items())[0]
                        cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        cl.append('--{}'.format(option))
            logger.info(
                ("BCL to FASTQ conversion and demultiplexing started for "
                 "run {} on {}".format(os.path.basename(self.id),
                                       datetime.now())))
            misc.call_external_command_detached(cl, with_log_files=True)

        return True
Example #19
 def create_report(self):
     """ Create a final aggregate report via a system call """
     logprefix = os.path.abspath(
         self.expand_path(os.path.join(self.logpath, self.projectid)))
     try:
         if not create_folder(os.path.dirname(logprefix)):
             logprefix = None
     except AttributeError:
         logprefix = None
     with chdir(self.expand_path(self.reportpath)):
         cl = self.report_aggregate.split(' ')
         call_external_command(cl,
                               with_log_files=(logprefix is not None),
                               prefix="{}_aggregate".format(logprefix))
Example #20
 def create_report(self):
     """ Create a final aggregate report via a system call """
     logprefix = os.path.abspath(
         self.expand_path(os.path.join(self.logpath, self.projectid)))
     try:
         if not create_folder(os.path.dirname(logprefix)):
             logprefix = None
     except AttributeError:
         logprefix = None
     with chdir(self.expand_path(self.reportpath)):
         cl = self.report_aggregate.split(' ')
         call_external_command(
             cl,
             with_log_files=(logprefix is not None),
             prefix="{}_aggregate".format(logprefix))
Example #21
 def demultiplex_run(self): 
     """ Demultiplex a NextSeq run:
         - find the samplesheet
         - make a local copy of the samplesheet and name it SampleSheet.csv
         - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep)
         - run bcl2fastq conversion
     """
     if not os.path.exists(self.ssname):
         # We should not get here really and this run should be defined as NON NGI-RUN
         return False
     # TODO SampleSheetParser may throw an exception
     ssparser = SampleSheetParser(self.ssname)
     # The samplesheet needs to be positioned in the FC directory with name SampleSheet.csv (Illumina default)
     # if this is not the case then create it and take special care of the modifications to be done on the SampleSheet
     samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
     # Check whether the samplesheet is already present; in that case go to the next step
     if not os.path.exists(samplesheet_dest):
         try:
             with open(samplesheet_dest, 'wb') as fcd:
                 fcd.write(self._generate_clean_samplesheet(ssparser))
         except Exception as e:
             if os.path.exists(samplesheet_dest):
                 os.remove(samplesheet_dest)
             logger.error(e)
             return False
         logger.info(("Created SampleSheet.csv for Flowcell {} in {} "
                      .format(self.id, samplesheet_dest)))
     # SampleSheet.csv generated to be used in bcl2fastq
     self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, "SampleSheet.csv"))
     # Make the demux call
     with chdir(self.run_dir):
         cl = [self.CONFIG.get('bcl2fastq')['bin']]
         if 'options' in self.CONFIG.get('bcl2fastq'):
             cl_options = self.CONFIG['bcl2fastq']['options']
             # Append all options that appear in the configuration file to the main command.
             for option in cl_options:
                 if isinstance(option, dict):
                     opt, val = list(option.items())[0]
                     cl.extend(['--{}'.format(opt), str(val)])
                 else:
                     cl.append('--{}'.format(option))
         logger.info(("BCL to FASTQ conversion and demultiplexing started for "
              " run {} on {}".format(os.path.basename(self.id), datetime.now())))
         misc.call_external_command_detached(cl, with_log_files=True)
         
     return True
Example #22
def cleanup_nas(days):
    """Will move the finished runs in NASes to nosync directory.

    :param int days: Number of days to consider a run to be old
    """
    for data_dir in CONFIG.get('storage').get('data_dirs'):
        logger.info('Moving old runs in {}'.format(data_dir))
        with filesystem.chdir(data_dir):
            for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]:
                rta_file = os.path.join(run, 'RTAComplete.txt')
                if os.path.exists(rta_file):
                    # 1 day == 60*60*24 seconds --> 86400
                    if os.stat(rta_file).st_mtime < time.time() - (86400 * days):
                        logger.info('Moving run {} to nosync directory'
                                    .format(os.path.basename(run)))
                        shutil.move(run, 'nosync')
                    else:
                        logger.info('RTAComplete.txt file exists but is not older than {} day(s), skipping run {}'.format(str(days), run))
Example #23
def cleanup_nas(days):
    """Will move the finished runs in NASes to nosync directory.

    :param int days: Number of days to consider a run to be old
    """
    for data_dir in CONFIG.get('storage').get('data_dirs'):
        logger.info('Moving old runs in {}'.format(data_dir))
        with filesystem.chdir(data_dir):
            for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]:
                rta_file = os.path.join(run, finished_run_indicator)
                if os.path.exists(rta_file):
                    # 1 day == 60*60*24 seconds --> 86400
                    if os.stat(rta_file).st_mtime < time.time() - (86400 * days):
                        logger.info('Moving run {} to nosync directory'
                                    .format(os.path.basename(run)))
                        shutil.move(run, 'nosync')
                    else:
                        logger.info('{} file exists but is not older than {} day(s), skipping run {}'.format(
                                    finished_run_indicator, str(days), run))
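
Examples #22 and #23 test the marker file's age with os.stat(rta_file).st_mtime < time.time() - (86400 * days). The same check, extracted into a small helper purely for clarity (the examples keep it inline):

import os
import time

SECONDS_PER_DAY = 86400  # 60 * 60 * 24

def is_older_than(path, days):
    # True if the file at `path` was last modified more than `days` days ago.
    return os.stat(path).st_mtime < time.time() - SECONDS_PER_DAY * days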
Example #24
    def generate_bcl_command(self, lanes, bcl2fastq_cmd_counter, is_10X=False):
        #I have everything to run demultiplexing now.
        logger.info('Building a bcl2fastq command')
        per_lane_base_masks = self._generate_per_lane_base_mask()
        with chdir(self.run_dir):
            cl = [self.CONFIG.get('bcl2fastq')['bin']]
            output_dir = "Demultiplexing_{}".format(bcl2fastq_cmd_counter)
            cl.extend(["--output-dir", output_dir])
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            cl_options = []
            if 'options' in self.CONFIG.get('bcl2fastq'):
                for option in self.CONFIG['bcl2fastq']['options']:
                    cl_options.extend([option])
                # Add the extra 10X command options if we have a 10X run
                if is_10X:
                    cl_options.extend(self.CONFIG['bcl2fastq']['options_10X'])
                # Append all options that appear in the configuration file to the main command.
                for option in cl_options:
                    if isinstance(option, dict):
                        opt, val = list(option.items())[0]
                        if "output-dir" not in opt:
                            cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        cl.append('--{}'.format(option))

            cl.extend([
                "--sample-sheet",
                os.path.join(self.run_dir,
                             "SampleSheet_{}.csv".format(bcl2fastq_cmd_counter))
            ])
            #now add the base_mask for each lane
            for lane in sorted(lanes):
                #Iterate through each lane and add the correct --use-bases-mask for that lane
                base_mask = [
                    per_lane_base_masks[lane][bm]['base_mask']
                    for bm in per_lane_base_masks[lane]
                ][0]  # get the base_mask
                base_mask_expr = "{}:".format(lane) + ",".join(base_mask)
                cl.extend(["--use-bases-mask", base_mask_expr])
        return cl
Example #25
 def pdc_put(cls, run):
     """Archive the collected runs to PDC"""
     bk = cls(run)
     bk.collect_runs(ext=".tar.gz.gpg", filter_by_ext=True)
     logger.info("In total, found {} run(s) to send PDC".format(len(bk.runs)))
     for run in bk.runs:
         run.flag = "{}.archiving".format(run.name)
         run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted)
         if run.path not in bk.archive_dirs.values():
             logger.error(("Given run is not in one of the archive directories {}. Kindly move the run {} to appropriate "
                           "archive dir before sending it to PDC".format(",".join(bk.archive_dirs.values()), run.name)))
             continue
         if not os.path.exists(run.dst_key_encrypted):
             logger.error("Encrypted key file {} is not found for file {}, skipping it".format(run.dst_key_encrypted, run.zip_encrypted))
             continue
         #skip run if being encrypted
         if os.path.exists("{}.encrypting".format(run.name)):
             logger.warn("Run {} is currently being encrypted, so skipping now".format(run.name))
             continue
         # skip run if already ongoing
         if os.path.exists(run.flag):
             logger.warn("Run {} is already being archived, so skipping now".format(run.name))
             continue
         flag = open(run.flag, 'w').close()
         with filesystem.chdir(run.path):
             if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc(run.dst_key_encrypted, silent=False):
                 logger.warn("Seems like files realted to run {} already exist in PDC, check and cleanup".format(run.name))
                 bk._clean_tmp_files([run.flag])
                 continue
             logger.info("Sending file {} to PDC".format(run.zip_encrypted))
             if bk._call_commands(cmd1="dsmc archive {}".format(run.zip_encrypted), tmp_files=[run.flag]):
                 time.sleep(15) # give some time just in case 'dsmc' needs to settle
                 if bk._call_commands(cmd1="dsmc archive {}".format(run.dst_key_encrypted), tmp_files=[run.flag]):
                     time.sleep(5) # give some time just in case 'dsmc' needs to settle
                     if bk.file_in_pdc(run.zip_encrypted) and bk.file_in_pdc(run.dst_key_encrypted):
                         logger.info("Successfully sent file {} to PDC, removing file locally from {}".format(run.zip_encrypted, run.path))
                         if bk.couch_info:
                             bk._log_pdc_statusdb(run.name)
                         bk._clean_tmp_files([run.zip_encrypted, run.dst_key_encrypted, run.flag])
                     continue
             logger.warn("Sending file {} to PDC failed".format(run.zip_encrypted))
Example #26
 def pdc_put(cls, run):
     """Archive the collected runs to PDC."""
     bk = cls(run)
     bk.collect_runs(ext='.tar.gz.gpg', filter_by_ext=True)
     logger.info('In total, found {} run(s) to send to PDC'.format(len(bk.runs)))
     for run in bk.runs:
         run.flag = '{}.archiving'.format(run.name)
         run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted)
         if run.path not in bk.archive_dirs.values():
             logger.error(('Given run is not in one of the archive directories {}. Kindly move the run {} to appropriate '
                           'archive dir before sending it to PDC'.format(','.join(list(bk.archive_dirs.values())), run.name)))
             continue
         if not os.path.exists(run.dst_key_encrypted):
             logger.error('Encrypted key file {} is not found for file {}, skipping it'.format(run.dst_key_encrypted, run.zip_encrypted))
             continue
         with filesystem.chdir(run.path):
             #skip run if being encrypted
             if os.path.exists('{}.encrypting'.format(run.name)):
                 logger.warn('Run {} is currently being encrypted, so skipping now'.format(run.name))
                 continue
             # skip run if already ongoing
             if os.path.exists(run.flag):
                 logger.warn('Run {} is already being archived, so skipping now'.format(run.name))
                 continue
             if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc(run.dst_key_encrypted, silent=False):
                 logger.warn('Seems like files related to run {} already exist in PDC, check and cleanup'.format(run.name))
                 continue
             flag = open(run.flag, 'w').close()
             logger.info('Sending file {} to PDC'.format(run.zip_encrypted))
             if bk._call_commands(cmd1='dsmc archive {}'.format(run.zip_encrypted), tmp_files=[run.flag]):
                 time.sleep(15) # give some time just in case 'dsmc' needs to settle
                 if bk._call_commands(cmd1='dsmc archive {}'.format(run.dst_key_encrypted), tmp_files=[run.flag]):
                     time.sleep(5) # give some time just in case 'dsmc' needs to settle
                     if bk.file_in_pdc(run.zip_encrypted) and bk.file_in_pdc(run.dst_key_encrypted):
                         logger.info('Successfully sent file {} to PDC, removing file locally from {}'.format(run.zip_encrypted, run.path))
                         if bk.couch_info:
                             bk._log_pdc_statusdb(run.name)
                         bk._clean_tmp_files([run.zip_encrypted, run.dst_key_encrypted, run.flag])
                     continue
             logger.warn('Sending file {} to PDC failed'.format(run.zip_encrypted))
Example #27
    def demultiplex_run(self):
        """
        Demultiplex a HiSeq run:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - create multiple SampleSheets in case at least one lane has multiple index lengths
            - run bcl2fastq conversion
        """

        ssname = self._get_samplesheet()
        if ssname is None:
            return None
        ssparser = SampleSheetParser(ssname)
        #Copy the original samplesheet locally. Copy again if already done as there might have been changes to the samplesheet
        try:
            shutil.copy(
                ssname,
                os.path.join(self.run_dir, "{}.csv".format(self.flowcell_id)))
            ssname = os.path.join(self.run_dir, os.path.split(ssname)[1])
        except Exception as e:
            raise RuntimeError(
                "unable to copy file {} to destination {}: {}".format(
                    ssname, self.run_dir, e))

        #this sample sheet has been created by the LIMS and copied by a sequencing operator. It is not ready
        #to be used as is; it needs some editing
        #samplesheet_dest will contain the samplesheet with all the renaming needed to be used with bcl2fastq-2.17
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        #check whether the samplesheet is already present; if so it will be overwritten
        if os.path.exists(samplesheet_dest):
            logger.info("SampleSheet.csv found ... overwriting it")
        try:
            with open(samplesheet_dest, 'wb') as fcd:
                fcd.write(self._generate_clean_samplesheet(ssparser))
        except Exception as e:
            logger.error(e)
            return False
        logger.info(("Created SampleSheet.csv for Flowcell {} in {} ".format(
            self.id, samplesheet_dest)))
        ##SampleSheet.csv generated
        ##when demultiplexing SampleSheet.csv is the one I need to use
        self.runParserObj.samplesheet = SampleSheetParser(
            os.path.join(self.run_dir, "SampleSheet.csv"))
        #now generate the base masks per lane and decide how to demultiplex
        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([
            len(per_lane_base_masks[base_masks])
            for base_masks in per_lane_base_masks
        ])
        #if max_different is one, then I have a simple config and I can run a single command. Otherwise I need to run multiple instances
        #extract lanes with a single base mask
        simple_lanes = {}
        complex_lanes = {}
        for lane in per_lane_base_masks:
            if len(per_lane_base_masks[lane]) == 1:
                simple_lanes[lane] = per_lane_base_masks[lane]
            else:
                complex_lanes[lane] = per_lane_base_masks[lane]
        #simple_lanes contains the lanes with exactly one base mask; complex_lanes those with more than one
        bcl2fastq_commands = []
        bcl2fastq_command_num = 0
        if len(simple_lanes) > 0:
            bcl2fastq_commands.append(
                self._generate_bcl2fastq_command(simple_lanes, True,
                                                 bcl2fastq_command_num))
            bcl2fastq_command_num += 1
        #compute the different masks, there will be one bcl2fastq command per mask
        base_masks_complex = [
            complex_lanes[base_masks].keys() for base_masks in complex_lanes
        ]
        different_masks = list(
            set([item for sublist in base_masks_complex for item in sublist]))
        for mask in different_masks:
            base_masks_complex_to_demux = {}
            for lane in complex_lanes:
                if mask in complex_lanes[lane]:
                    base_masks_complex_to_demux[lane] = {}
                    base_masks_complex_to_demux[lane][mask] = complex_lanes[
                        lane][mask]
            #at this point base_masks_complex_to_demux contains only one base mask per lane. I can build the command
            bcl2fastq_commands.append(
                self._generate_bcl2fastq_command(base_masks_complex_to_demux,
                                                 True, bcl2fastq_command_num))
            bcl2fastq_command_num += 1
        #now bcl2fastq_commands contains all the commands to be executed. They could run in parallel, but only one is run at a time to avoid overloading the machine
        with chdir(self.run_dir):
            # create Demultiplexing dir, in this way the status of this run will become IN_PROGRESS
            if not os.path.exists("Demultiplexing"):
                os.makedirs("Demultiplexing")
            execution = 0
            for bcl2fastq_command in bcl2fastq_commands:
                misc.call_external_command_detached(
                    bcl2fastq_command,
                    with_log_files=True,
                    prefix="demux_{}".format(execution))
                execution += 1
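The HiSeq demultiplex_run above groups lanes by how many base masks they carry: all lanes with a single mask share one bcl2fastq call, while each distinct mask found among the complex lanes gets its own call. A toy sketch of just that grouping step, with an invented per_lane_base_masks dict standing in for what _generate_per_lane_base_mask would return:

# Toy illustration of the lane-grouping logic used above; the dict layout
# ({lane: {mask: details}}) mirrors what _generate_per_lane_base_mask returns,
# but the values here are invented for the example.
per_lane_base_masks = {
    '1': {'Y151,I8,Y151': {}},                        # simple lane: one mask
    '2': {'Y151,I8,Y151': {}, 'Y151,I6N2,Y151': {}},  # complex lane: two masks
}

simple_lanes = {l: m for l, m in per_lane_base_masks.items() if len(m) == 1}
complex_lanes = {l: m for l, m in per_lane_base_masks.items() if len(m) > 1}

# one bcl2fastq invocation for all simple lanes, then one per distinct mask
# found among the complex lanes
distinct_masks = {mask for masks in complex_lanes.values() for mask in masks}
print(sorted(simple_lanes))    # ['1']
print(sorted(distinct_masks))  # ['Y151,I6N2,Y151', 'Y151,I8,Y151']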
Beispiel #28
0
def cleanup_milou(site, seconds, dry_run=False):
    """Remove project/run that have been closed more than given time (as seconds)
    from the given 'site' on uppmax

    :param str site: site where the cleanup should be performed
    :param int seconds: Days/hours converted as seconds to consider a run to be old
    :param bool dry_run: Will summarize what is going to be done without really doing it
    """
    seconds = check_default(site, seconds, CONFIG)
    if not seconds:
        return
    root_dir = CONFIG.get('cleanup').get('milou').get(site).get('root')
    deleted_log = CONFIG.get('cleanup').get('milou').get('deleted_log')
    assert os.path.exists(os.path.join(root_dir,deleted_log)), "Log directory {} doesn't exist in {}".format(deleted_log,root_dir)
    log_file = os.path.join(root_dir,"{fl}/{fl}.log".format(fl=deleted_log))
    list_to_delete = []

    ## get glob path patterns to search and remove from root directory
    try:
        archive_config = CONFIG['cleanup']['milou']['archive']
        ## the glob path should be relative to the run folder, like "Unaligned_*/Project_*"
        config_ppath = archive_config['proj_path']
        ## Glob path should be relative to run folder, like "Unaligned_0bp/Undetermined_indices/*/*.fastq.gz"
        config_npath = archive_config['undet_noindex']
        ## Glob path should be relative to run folder, like "Unaligned_*bp/Undetermined_indices/*/*.fastq.gz"
        config_upath = archive_config['undet_all']
    except KeyError as e:
        logger.error("Config file is missing the key {}, make sure it have all required information".format(str(e)))
        raise SystemExit

    # make a connection for project db #
    pcon = statusdb.ProjectSummaryConnection()
    assert pcon, "Could not connect to project database in StatusDB"

    if site in ["analysis", "illumina"]:
        ## work flow for cleaning up illumina/analysis ##
        projects = [ p for p in os.listdir(root_dir) if re.match(filesystem.PROJECT_RE,p) ]
        list_to_delete.extend(get_closed_projects(projects, pcon, seconds))
    elif site == "archive":
        ##work flow for cleaning archive ##
        runs = [ r for r in os.listdir(root_dir) if re.match(filesystem.RUN_RE,r) ]
        for run in runs:
            with filesystem.chdir(os.path.join(root_dir, run)):
                ## Collect all project paths from demultiplexed directories in the run folder
                all_proj_path = glob(config_ppath)
                all_proj_dict = {os.path.basename(pp).replace('Project_','').replace('__', '.'): pp for pp in all_proj_path}
                closed_projects = get_closed_projects(all_proj_dict.keys(), pcon, seconds)
                ## Only proceed with cleaning data for closed projects
                for closed_proj in closed_projects:
                    closed_proj_fq = glob("{}/*/*.fastq.gz".format(all_proj_dict[closed_proj]))
                    list_to_delete.extend([os.path.join(run, pfile) for pfile in closed_proj_fq])
                ## Always remove the undetermined fastq files for the NoIndex case
                undetermined_fastq_files = glob(config_npath)
                ## Remove undetermined fastq files for all index lengths if all projects run in the FC are closed
                if len(all_proj_dict.keys()) == len(closed_projects):
                    undetermined_fastq_files = glob(config_upath)
                list_to_delete.extend([os.path.join(run, ufile) for ufile in undetermined_fastq_files])

    ## delete and log
    for item in list_to_delete:
        if dry_run:
            logger.info('Will remove {} from {}'.format(item,root_dir))
            continue
        try:
            to_remove = os.path.join(root_dir,item)
            if os.path.isfile(to_remove):
                os.remove(to_remove)
            elif os.path.isdir(to_remove):
                shutil.rmtree(to_remove)
            logger.info('Removed {} from {}'.format(item,root_dir))
            with open(log_file,'a') as to_log:
                to_log.write("{}\t{}\n".format(to_remove,datetime.strftime(datetime.now(),'%Y-%m-%d %H:%M')))
        except OSError:
            logger.warn("Could not remove {} from {}".format(item,root_dir))
            continue
Beispiel #29
0
def cleanup_milou(site, seconds, dry_run=False):
    """Remove project/run that have been closed more than given time (as seconds)
    from the given 'site' on uppmax

    :param str site: site where the cleanup should be performed
    :param int seconds: Days/hours converted as seconds to consider a run to be old
    :param bool dry_run: Will summarize what is going to be done without really doing it
    """
    seconds = check_default(site, seconds, CONFIG)
    if not seconds:
        return
    root_dir = CONFIG.get('cleanup').get('milou').get(site).get('root')
    deleted_log = CONFIG.get('cleanup').get('milou').get('deleted_log')
    assert os.path.exists(os.path.join(
        root_dir, deleted_log)), "Log directory {} doesn't exist in {}".format(
            deleted_log, root_dir)
    log_file = os.path.join(root_dir, "{fl}/{fl}.log".format(fl=deleted_log))
    list_to_delete = []

    ## get glob path patterns to search and remove from root directory
    try:
        archive_config = CONFIG['cleanup']['milou']['archive']
        ## the glob path should be relative to the run folder, like "Unaligned_*/Project_*"
        config_ppath = archive_config['proj_path']
        ## Glob path should be relative to run folder, like "Unaligned_0bp/Undetermined_indices/*/*.fastq.gz"
        config_npath = archive_config['undet_noindex']
        ## Glob path should be relative to run folder, like "Unaligned_*bp/Undetermined_indices/*/*.fastq.gz"
        config_upath = archive_config['undet_all']
    except KeyError as e:
        logger.error(
            "Config file is missing the key {}, make sure it have all required information"
            .format(str(e)))
        raise SystemExit

    # make a connection for project db #
    pcon = statusdb.ProjectSummaryConnection()
    assert pcon, "Could not connect to project database in StatusDB"

    if site in ["analysis", "illumina"]:
        ## work flow for cleaning up illumina/analysis ##
        projects = [
            p for p in os.listdir(root_dir)
            if re.match(filesystem.PROJECT_RE, p)
        ]
        list_to_delete.extend(get_closed_projects(projects, pcon, seconds))
    elif site == "archive":
        ##work flow for cleaning archive ##
        runs = [
            r for r in os.listdir(root_dir) if re.match(filesystem.RUN_RE, r)
        ]
        for run in runs:
            with filesystem.chdir(os.path.join(root_dir, run)):
                ## Collect all project paths from demultiplexed directories in the run folder
                all_proj_path = glob(config_ppath)
                all_proj_dict = {
                    os.path.basename(pp).replace('Project_',
                                                 '').replace('__', '.'): pp
                    for pp in all_proj_path
                }
                closed_projects = get_closed_projects(all_proj_dict.keys(),
                                                      pcon, seconds)
                ## Only proceed with cleaning data for closed projects
                for closed_proj in closed_projects:
                    closed_proj_fq = glob("{}/*/*.fastq.gz".format(
                        all_proj_dict[closed_proj]))
                    list_to_delete.extend(
                        [os.path.join(run, pfile) for pfile in closed_proj_fq])
                ## Always remove the undetermined fastq files for the NoIndex case
                undetermined_fastq_files = glob(config_npath)
                ## Remove undetermined fastq files for all index lengths if all projects run in the FC are closed
                if len(all_proj_dict.keys()) == len(closed_projects):
                    undetermined_fastq_files = glob(config_upath)
                list_to_delete.extend([
                    os.path.join(run, ufile)
                    for ufile in undetermined_fastq_files
                ])

    ## delete and log
    for item in list_to_delete:
        if dry_run:
            logger.info('Will remove {} from {}'.format(item, root_dir))
            continue
        try:
            to_remove = os.path.join(root_dir, item)
            if os.path.isfile(to_remove):
                os.remove(to_remove)
            elif os.path.isdir(to_remove):
                shutil.rmtree(to_remove)
            logger.info('Removed {} from {}'.format(item, root_dir))
            with open(log_file, 'a') as to_log:
                to_log.write("{}\t{}\n".format(
                    to_remove,
                    datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M')))
        except OSError:
            logger.warn("Could not remove {} from {}".format(item, root_dir))
            continue
Beispiel #30
0
def cleanup_irma(days_fastq,
                 days_analysis,
                 only_fastq,
                 only_analysis,
                 clean_undetermined,
                 status_db_config,
                 exclude_projects,
                 list_only,
                 date,
                 dry_run=False):
    """Remove fastq/analysis data for projects that have been closed more than given 
    days (as days_fastq/days_analysis) from the given 'irma' cluster

    :param int days_fastq: Days to consider to remove fastq files for project
    :param int days_analysis: Days to consider to remove analysis data for project
    :param bool only_fastq: Remove only fastq files for closed projects
    :param bool only_analysis: Remove only analysis data for closed projects
    :param bool dry_run: Will summarize what is going to be done without really doing it
    
    Example format for the config file
    cleanup:
        irma:
            flowcell:
                ##this path is nothing but the incoming directory; multiple paths can be given
                root: 
                    - path/to/flowcells_dir
                relative_project_source: Demultiplexing
                undet_file_pattern: "Undetermined_*.fastq.gz"
    
            ##this is the path where projects are organized
            data_dir: path/to/data_dir
            analysis:
                ##directory where analyses are performed for projects
                root: path/to/analysis_dir
                #keys should exactly match the qc folder names; values are the file patterns to remove
                files_to_remove:
                    piper_ngi: 
                        - "*.bam"
    """
    try:
        config = CONFIG['cleanup']['irma']
        flowcell_dir_root = config['flowcell']['root']
        flowcell_project_source = config['flowcell']['relative_project_source']
        flowcell_undet_files = config['flowcell']['undet_file_pattern']
        data_dir = config['data_dir']
        analysis_dir = config['analysis']['root']
        analysis_data_to_remove = config['analysis']['files_to_remove']
        if date:
            date = datetime.strptime(date, '%Y-%m-%d')
    except KeyError as e:
        logger.error(
            "Config file is missing the key {}, make sure it have all required information"
            .format(str(e)))
        raise SystemExit
    except ValueError as e:
        logger.error(
            "Date given with '--date' option is not in required format, see help for more info"
        )
        raise SystemExit

    # make a connection for project db #
    pcon = statusdb.ProjectSummaryConnection(conf=status_db_config)
    assert pcon, "Could not connect to project database in StatusDB"

    # make exclude project list if provided
    exclude_list = []
    if exclude_projects:
        if os.path.isfile(exclude_projects):
            with open(exclude_projects, 'r') as in_file:
                exclude_list.extend([p.strip() for p in in_file.readlines()])
        else:
            exclude_list.extend(exclude_projects.split(','))
        # sanity check that the projects mentioned for exclusion are valid
        invalid_projects = filter(
            lambda p: p not in pcon.id_view.keys() and p not in pcon.name_view.
            keys(), exclude_list)
        if invalid_projects:
            logger.error(
                "'--exclude_projects' was called with some invalid projects '{}', "
                "provide valid project name/id".format(
                    ",".join(invalid_projects)))
            raise SystemExit

    #compile list for project to delete
    project_clean_list, project_processed_list = ({}, [])
    if not list_only and not clean_undetermined:
        logger.info("Building initial project list for removing data..")
    if only_fastq:
        logger.info(
            "Option 'only_fastq' is given, so will not look for analysis data")
    elif only_analysis:
        logger.info(
            "Option 'only_analysis' is given, so will not look for fastq data")

    if clean_undetermined:
        all_undet_files = []
        for flowcell_dir in flowcell_dir_root:
            for fc in [
                    d for d in os.listdir(flowcell_dir)
                    if re.match(filesystem.RUN_RE, d)
            ]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    if not os.path.exists(flowcell_project_source):
                        logger.warn(
                            "Flowcell {} do not contain a '{}' direcotry".
                            format(fc, flowcell_project_source))
                        continue
                    projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
                                      if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$',d) and \
                                      not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]
                    # the above check looked for project directories that are not yet cleaned,
                    # so if it could not find any project it means there is no project directory at all
                    # or all the project directories are already cleaned. Then we can remove the undetermined files
                    if len(projects_in_fc) > 0:
                        continue
                    fc_undet_files = glob(
                        os.path.join(flowcell_project_source,
                                     flowcell_undet_files))
                    if fc_undet_files:
                        logger.info(
                            "All projects was cleaned for FC {}, found {} undeterminded files"
                            .format(fc, len(fc_undet_files)))
                        all_undet_files.extend(
                            map(os.path.abspath, fc_undet_files))
        if all_undet_files:
            undet_size = _def_get_size_unit(
                sum(map(os.path.getsize, all_undet_files)))
            if misc.query_yes_no(
                    "In total found {} undetermined files which are {} in size, delete now ?"
                    .format(len(all_undet_files), undet_size),
                    default="no"):
                removed = _remove_files(all_undet_files)
        return
    elif only_analysis:
        for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \
                    not os.path.exists(os.path.join(analysis_dir, d, "cleaned"))]:
            proj_abs_path = os.path.join(analysis_dir, pid)
            proj_info = get_closed_proj_info(
                pid, pcon.get_entry(pid, use_id_view=True), date)
            if proj_info and proj_info['closed_days'] >= days_analysis:
                # move on if this project has to be excluded
                if proj_info['name'] in exclude_list or proj_info[
                        'pid'] in exclude_list:
                    continue
                analysis_data, analysis_size = collect_analysis_data_irma(
                    pid, analysis_dir, analysis_data_to_remove)
                proj_info['analysis_to_remove'] = analysis_data
                proj_info['analysis_size'] = analysis_size
                proj_info['fastq_to_remove'] = "not_selected"
                proj_info['fastq_size'] = 0
                project_clean_list[proj_info['name']] = proj_info
    else:
        for flowcell_dir in flowcell_dir_root:
            for fc in [
                    d for d in os.listdir(flowcell_dir)
                    if re.match(filesystem.RUN_RE, d)
            ]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    if not os.path.exists(flowcell_project_source):
                        logger.warn(
                            "Flowcell {} do not contain a '{}' direcotry".
                            format(fc, flowcell_project_source))
                        continue
                    projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
                                      if re.match(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$',d) and \
                                      not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]
                    for _proj in projects_in_fc:
                        proj = re.sub(r'_+', '.', _proj, 1)
                        # if a project has already been processed there is no need to fetch it again from status db
                        if proj in project_processed_list:
                            # if the project has been closed longer than the threshold days, collect the fastq files from the FC;
                            # no need to look for analysis data as it would have been collected the first time
                            if proj in project_clean_list and project_clean_list[
                                    proj]['closed_days'] >= days_fastq:
                                fc_fq_files, fq_size = collect_fastq_data_irma(
                                    fc_abs_path,
                                    os.path.join(flowcell_project_source,
                                                 _proj))
                                project_clean_list[proj]['fastq_to_remove'][
                                    'flowcells'][fc] = fc_fq_files[
                                        'flowcells'][fc]
                                project_clean_list[proj][
                                    'fastq_size'] += fq_size
                            continue
                        project_processed_list.append(proj)
                        #by default assume all projects are not old enough for deletion
                        fastq_data, analysis_data = ("young", "young")
                        fastq_size, analysis_size = (0, 0)
                        proj_info = get_closed_proj_info(
                            proj, pcon.get_entry(proj), date)
                        if proj_info:
                            # move on if this project has to be excluded
                            if proj_info['name'] in exclude_list or proj_info[
                                    'pid'] in exclude_list:
                                continue
                            # if project not old enough for fastq files and only fastq files selected move on to next project
                            if proj_info['closed_days'] >= days_fastq:
                                fastq_data, fastq_size = collect_fastq_data_irma(
                                    fc_abs_path,
                                    os.path.join(flowcell_project_source,
                                                 _proj), data_dir,
                                    proj_info['pid'])
                            if not only_fastq:
                                # if project is old enough for fastq files and not 'only_fastq' try collect analysis files
                                if proj_info['closed_days'] >= days_analysis:
                                    analysis_data, analysis_size = collect_analysis_data_irma(
                                        proj_info['pid'], analysis_dir,
                                        analysis_data_to_remove)
                                # if both fastq and analysis files are not old enough move on
                                if (analysis_data == fastq_data) or (
                                    (not analysis_data or analysis_data
                                     == "cleaned") and fastq_data == "young"):
                                    continue
                            elif fastq_data == "young":
                                continue
                            else:
                                analysis_data = "not_selected"
                            proj_info['fastq_to_remove'] = fastq_data
                            proj_info['fastq_size'] = fastq_size
                            proj_info['analysis_to_remove'] = analysis_data
                            proj_info['analysis_size'] = analysis_size
                            project_clean_list[proj] = proj_info

    if not project_clean_list:
        logger.info("There are no projects to clean")
        return

    # list only the project and exit if 'list_only' option is selected
    if list_only:
        print "Project ID\tProject Name\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size"
        for p_info in sorted(project_clean_list.values(),
                             key=lambda d: d['closed_days'],
                             reverse=True):
            print "\t".join([
                p_info['name'], p_info['pid'], p_info['bioinfo_responsible'],
                str(p_info['closed_days']), p_info['closed_date'],
                _def_get_size_unit(p_info['fastq_size']),
                _def_get_size_unit(p_info['analysis_size'])
            ])
        raise SystemExit

    logger.info("Initial list is built with {} projects {}".format(
        len(project_clean_list), get_files_size_text(project_clean_list)))
    if misc.query_yes_no("Interactively filter projects for cleanup ?",
                         default="yes"):
        filtered_project, proj_count = ([], 0)
        #go through the compiled project list and remove files
        for proj, info in project_clean_list.iteritems():
            proj_count += 1
            if not misc.query_yes_no(
                    "{}Delete files for this project ({}/{})".format(
                        get_proj_meta_info(info, days_fastq), proj_count,
                        len(project_clean_list)),
                    default="no"):
                logger.info(
                    "Will not remove files for project {}".format(proj))
                filtered_project.append(proj)
        # remove projects that were decided not to delete
        map(project_clean_list.pop, filtered_project)
        logger.info("Removed {}/{} projects from initial list".format(
            len(filtered_project), proj_count))
        if not project_clean_list:
            logger.info("There are no projects to clean after filtering")
            return
        logger.info("Final list is created with {} projects {}".format(
            len(project_clean_list), get_files_size_text(project_clean_list)))
        if not misc.query_yes_no("Proceed with cleanup ?", default="no"):
            logger.info("Aborting cleanup")
            return
    logger.info("Will start cleaning up project now")

    for proj, info in project_clean_list.iteritems():
        fastq_info = info.get('fastq_to_remove')
        if fastq_info and isinstance(fastq_info, dict):
            logger.info("Cleaning fastq files for project {}".format(proj))
            fastq_fc = fastq_info.get('flowcells', {})
            removed_fc = []
            for fc, fc_info in fastq_fc.iteritems():
                proj_fc_root = fc_info['proj_root']
                logger.info(
                    "Removing fastq files from {}".format(proj_fc_root))
                if not dry_run:
                    if _remove_files(fc_info['fq_files']):
                        logger.info(
                            "Removed fastq files from FC {} for project {}, marking it as cleaned"
                            .format(fc, proj))
                        _touch_cleaned(proj_fc_root)
                        removed_fc.append(fc)
            if len(fastq_fc) == len(removed_fc):
                try:
                    proj_data_root = fastq_info['proj_data']['proj_data_root']
                    logger.info(
                        "All flowcells cleaned for this project, marking it as cleaned in {}"
                        .format(proj_data_root))
                    _touch_cleaned(proj_data_root)
                except:
                    pass

        analysis_info = info.get('analysis_to_remove')
        if analysis_info and isinstance(analysis_info, dict):
            proj_analysis_root = analysis_info['proj_analysis_root']
            logger.info("cleaning analysis data for project {}".format(proj))
            removed_qc = []
            for qc, files in analysis_info['analysis_files'].iteritems():
                logger.info("Removing files of '{}' from {}".format(
                    qc, proj_analysis_root))
                if not dry_run:
                    if _remove_files(files):
                        removed_qc.append(qc)
                    else:
                        logger.warn(
                            "Couldn't remove some files in qc directory '{}'".
                            format(qc))
            map(analysis_info['analysis_files'].pop, removed_qc)
            if len(analysis_info['analysis_files']) == 0:
                logger.info(
                    "Removed analysis data for project {}, marking it cleaned".
                    format(proj))
                _touch_cleaned(proj_analysis_root)
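cleanup_irma reads everything from CONFIG['cleanup']['irma']; the docstring above sketches the YAML layout. For reference, a minimal sketch of the same structure as a Python dict, limited to the keys the function actually reads (all paths and patterns are placeholders):

# Minimal sketch of CONFIG['cleanup']['irma'] matching the keys read above;
# all paths and patterns are placeholders, not real values.
CONFIG = {
    'cleanup': {
        'irma': {
            'flowcell': {
                'root': ['/path/to/flowcells_dir'],  # list: several incoming dirs allowed
                'relative_project_source': 'Demultiplexing',
                'undet_file_pattern': 'Undetermined_*.fastq.gz',
            },
            'data_dir': '/path/to/data_dir',
            'analysis': {
                'root': '/path/to/analysis_dir',
                'files_to_remove': {
                    'piper_ngi': ['*.bam'],  # qc folder name -> glob patterns to delete
                },
            },
        },
    },
}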
Beispiel #31
0
    def compute_undetermined(self):
        """
        This function returns True if all demux steps are done and we can proceed to QC.
        For simple lanes with index: no check is done, everything needs to be in place.
        For complex lanes: no check is done, everything needs to be in place.
        For simple lanes with NoIndex: check if demux counts have been computed; if not, compute them or return while waiting for their completion.
        """
        NoIndexLanes = [
            lane["Lane"] for lane in self.runParserObj.samplesheet.data
            if "NoIndex" in lane["index"]
        ]
        if len(NoIndexLanes) == 0:
            return True  # everything is fine I can proceed to QC
        #otherwise proceed

        NoIndex_Undetermiend = os.path.join(self.run_dir,
                                            "Demultiplexing_NoIndex")
        if not os.path.exists(NoIndex_Undetermiend):
            #for these lanes I have no undetermined reads as I demux them without index.
            #now generate the base masks per lane
            per_lane_base_masks = self._generate_per_lane_base_mask()
            #store here only the NoIndex lanes
            per_lane_base_masks_NoIndex = {}
            run_with_no_index = False  # use this flag to check that we are not in the C.Daub case
            for NoIndexLane in NoIndexLanes:
                per_lane_base_masks_NoIndex[NoIndexLane] = per_lane_base_masks[
                    NoIndexLane]
                base_mask_key = per_lane_base_masks[NoIndexLane].keys()[0]
                new_base_mask = []
                if len(per_lane_base_masks_NoIndex[NoIndexLane][base_mask_key]
                       ['base_mask']):
                    #C.Daub_15_01 case, only one sample per lane and no index at all
                    run_with_no_index = True
                else:
                    for baseMask_element in per_lane_base_masks_NoIndex[
                            NoIndexLane][base_mask_key]['base_mask']:
                        if baseMask_element.startswith("Y"):
                            new_base_mask.append(
                                baseMask_element.replace("Y", "N"))
                        elif baseMask_element.startswith("N"):
                            new_base_mask.append(
                                baseMask_element.replace("N", "Y"))
                    per_lane_base_masks_NoIndex[NoIndexLane][base_mask_key][
                        'base_mask'] = new_base_mask
            if not run_with_no_index:
                os.makedirs(NoIndex_Undetermiend)
                command = self._generate_bcl2fastq_command(
                    per_lane_base_masks_NoIndex,
                    True,
                    "NoIndex",
                    mask_short_adapter_reads=True)
                with chdir(self.run_dir):
                    misc.call_external_command_detached(command,
                                                        with_log_files=True,
                                                        prefix="demux_NoIndex")
                #return False, as I need to wait for the NoIndex demux to finish
                return False
            else:
                #in this case I do not want to start a demux for the index, because I do not have the index at all
                #I need to softlink everything else that is in Stats as I do not want to recompute it
                missingStatsFiles = glob.glob(
                    os.path.join(self.run_dir, "Demultiplexing_0", "Stats",
                                 "*F*L*.txt"))
                destination = os.path.join(self.run_dir, self.demux_dir,
                                           "Stats")
                for source in missingStatsFiles:
                    source_file_name = os.path.basename(source)
                    if not os.path.exists(
                            os.path.join(destination, source_file_name)):
                        os.symlink(source,
                                   os.path.join(destination, source_file_name))
                return True
        else:
            #in this case it means that I have already started to demux the NoIndex
            if not os.path.exists(
                    os.path.join(self.run_dir, "Demultiplexing_NoIndex",
                                 'Stats', 'DemultiplexingStats.xml')):
                #demultiplexing of undetermined is still ongoing
                logger.info("Demux of NoIndex lanes ongoing")
                return False
            else:
                logger.info("Demux of NoIndex lanes done.")
                #now I need to produce the files needed in the QC
                flag_file = os.path.join(NoIndex_Undetermiend, "ongoing")
                if os.path.exists(flag_file):
                    #it means that a previous instance of TACA is running and still processing this FC
                    logger.info(
                        "Counting of undetermined indexes for NoIndex lanes ongoing"
                    )
                    return False
                #now check if the stats have been already computed
                computed = True
                for lane_id in NoIndexLanes:
                    demuxSummary_file = os.path.join(
                        self.run_dir, self.demux_dir, "Stats",
                        "DemuxSummaryF1L{}.txt".format(lane_id))
                    if not os.path.exists(demuxSummary_file):
                        #if it does not exist and the ongoing flag is not present, then I need to set computed to False
                        computed = False
                if computed:
                    #in this case I already computed all the demux stats that I need
                    return True
                #otherwise I need to compute them
                open(flag_file, 'a').close(
                )  # create the flag file indicating I am working on this
                for lane_id in NoIndexLanes:
                    #count the index occurrences, each lane corresponds to one project, a project might have multiple lanes
                    current_lane = [
                        lane for lane in self.runParserObj.samplesheet.data
                        if lane_id == lane["Lane"]
                    ][0]
                    if current_lane["index"] != "NoIndex":
                        logger.error(
                            "while processing run {} NoIndex lane {}, index {} found in SampleSheet"
                            .format(self.id, lane_id, current_lane["index"]))
                        return False
                    index_counter = {}
                    indexes_fastq1 = glob.glob(
                        os.path.join(
                            NoIndex_Undetermiend, current_lane[
                                self.runParserObj.samplesheet.dfield_proj],
                            current_lane[
                                self.runParserObj.samplesheet.dfield_sid],
                            "{}_S?_L00{}_R2_001.fastq.gz".format(
                                current_lane[
                                    self.runParserObj.samplesheet.dfield_snm],
                                lane_id)))[0]
                    indexes_fastq2 = glob.glob(
                        os.path.join(
                            NoIndex_Undetermiend, current_lane[
                                self.runParserObj.samplesheet.dfield_proj],
                            current_lane[
                                self.runParserObj.samplesheet.dfield_sid],
                            "{}_S?_L00{}_R3_001.fastq.gz".format(
                                current_lane[
                                    self.runParserObj.samplesheet.dfield_snm],
                                lane_id)))[0]
                    # I assume these two files are always present, maybe it is possible to have no index with a single index...
                    logger.info(
                        "Computing Undetermiend indexes for NoIndex lane {}".
                        format(lane_id))
                    zcat = subprocess.Popen(['zcat', indexes_fastq1],
                                            stdout=subprocess.PIPE)
                    #this command streams the two files, printing them line by line separated by a plus
                    awk = subprocess.Popen([
                        'awk',
                        'BEGIN {{OFS="+"}}{{  ("zcat " "{0} " ) | getline line ; print $0,line }}'
                        .format(indexes_fastq2)
                    ],
                                           stdout=subprocess.PIPE,
                                           stdin=zcat.stdout)
                    #now select only the 2nd line every 4 (i.e., only the index1+index2 line)
                    sed = subprocess.Popen(['sed', '-n', "2~4p"],
                                           stdout=subprocess.PIPE,
                                           stdin=awk.stdout)
                    zcat.stdout.close()
                    awk.stdout.close()
                    output = sed.communicate()[0]
                    zcat.wait()
                    awk.wait()
                    for barcode in output.split('\n')[:-1]:
                        try:
                            index_counter[barcode] += 1
                        except KeyError:
                            index_counter[barcode] = 1
                    demuxSummary_file = os.path.join(
                        self.run_dir, self.demux_dir, "Stats",
                        "DemuxSummaryF1L{}.txt".format(lane_id))
                    with open(demuxSummary_file, 'w') as demuxSummary_file_fh:
                        demuxSummary_file_fh.write(
                            "### Most Popular Unknown Index Sequences\n")
                        demuxSummary_file_fh.write(
                            "### Columns: Index_Sequence Hit_Count\n")
                        for (index, occ) in sorted(index_counter.items(),
                                                   key=operator.itemgetter(1),
                                                   reverse=True):
                            demuxSummary_file_fh.write("{}\t{}\n".format(
                                index, occ))

                #I need to fill in the lane and laneBarcode html reports; when I demux with NoIndex many values are not created
                undeterminedStats = DemuxSummaryParser(
                    os.path.join(self.run_dir, self.demux_dir, "Stats"))
                sample_data_old = self.runParserObj.lanes.sample_data
                sample_data_new = []
                for lane in sample_data_old:
                    if lane["Lane"] in NoIndexLanes:
                        #in this case I need to fill in new values
                        PF_clusters = undeterminedStats.TOTAL[lane["Lane"]]
                        lane["% One mismatchbarcode"] = '0'
                        lane["% Perfectbarcode"] = '100'
                        lane["% of thelane"] = '100'
                        lane["PF Clusters"] = str(PF_clusters)
                    sample_data_new.append(lane)
                self.runParserObj.lanes.sample_data = sample_data_new

                demux_folder = os.path.join(self.run_dir, "Demultiplexing")
                new_html_report_lane_dir = _create_folder_structure(
                    demux_folder,
                    ["Reports", "html", self.flowcell_id, "all", "all", "all"])
                new_html_report_lane = os.path.join(new_html_report_lane_dir,
                                                    "lane.html")
                _generate_lane_html(new_html_report_lane,
                                    self.runParserObj.lanes)
                #now do the same for laneBarcode
                sampleBarcode_data_old = self.runParserObj.lanebarcodes.sample_data
                sampleBarcode_data_new = []
                for sample in sampleBarcode_data_old:
                    if sample["Lane"] in NoIndexLanes:
                        #in this case I need to fill in new values
                        PF_clusters = undeterminedStats.TOTAL[lane["Lane"]]
                        sample["% One mismatchbarcode"] = '0'
                        sample["% Perfectbarcode"] = '100'
                        sample["% of thelane"] = '100'
                        sample["PF Clusters"] = str(PF_clusters)
                    sampleBarcode_data_new.append(sample)
                self.runParserObj.lanebarcodes.sample_data = sampleBarcode_data_new
                demux_folder = os.path.join(self.run_dir, "Demultiplexing")
                new_html_report_sampleBarcode_dir = _create_folder_structure(
                    demux_folder,
                    ["Reports", "html", self.flowcell_id, "all", "all", "all"])
                new_html_report_sampleBarcode = os.path.join(
                    new_html_report_sampleBarcode_dir, "laneBarcode.html")
                _generate_lane_html(new_html_report_sampleBarcode,
                                    self.runParserObj.lanebarcodes)

                os.remove(
                    flag_file
                )  # remove flag file to allow future iteration on this FC
                return True  #return true, I have done everything I was supposed to do
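The NoIndex branch of compute_undetermined counts index pairs by piping two gzipped FASTQ files through zcat, awk and sed. The same counting can be sketched in pure Python with gzip and collections.Counter; this is an illustrative alternative under the assumption of standard 4-line FASTQ records, not what the method above actually runs.

import gzip
from collections import Counter
from itertools import islice

def count_index_pairs(index_fastq1, index_fastq2):
    """Count 'index1+index2' pairs from two gzipped index-read FASTQ files.

    Pure-Python sketch of the zcat | awk | sed pipeline above: take the
    sequence line (the 2nd of every 4) from both files and join them with '+'.
    """
    counts = Counter()
    with gzip.open(index_fastq1, 'rt') as fh1:
        with gzip.open(index_fastq2, 'rt') as fh2:
            seqs1 = islice(fh1, 1, None, 4)  # sequence lines of file 1
            seqs2 = islice(fh2, 1, None, 4)  # sequence lines of file 2
            for line1, line2 in zip(seqs1, seqs2):
                counts[line1.strip() + '+' + line2.strip()] += 1
    return counts

# usage sketch, writing the counts in the DemuxSummary format used above:
# for index, occ in count_index_pairs('R2.fastq.gz', 'R3.fastq.gz').most_common():
#     print("{}\t{}".format(index, occ))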
Beispiel #32
0
    def demultiplex_run(self):
        """
           Demultiplex a Xten run:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep)
            - run bcl2fastq conversion
        """
        #we have 10X lanes - need to split the sample sheet and build a 10X command for bcl2fastq
        Complex_run = False
        if len(self.lanes_10X) and len(self.lanes_not_10X):
            Complex_run = True

        if Complex_run:
            with chdir(self.run_dir):
                samplesheet_dest_not_10X = "SampleSheet_0.csv"
                with open(samplesheet_dest_not_10X, 'wb') as fcd:
                    fcd.write(
                        _generate_samplesheet_subset(
                            self.runParserObj.samplesheet, self.lanes_not_10X))
                samplesheet_dest_10X = "SampleSheet_1.csv"
                with open(samplesheet_dest_10X, 'wb') as fcd:
                    fcd.write(
                        _generate_samplesheet_subset(
                            self.runParserObj.samplesheet, self.lanes_10X))
        else:
            with chdir(self.run_dir):
                samplesheet_dest = "SampleSheet_0.csv"
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(
                        _generate_samplesheet_subset(
                            self.runParserObj.samplesheet,
                            (self.lanes_10X or self.lanes_not_10X)))

        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([
            len(per_lane_base_masks[base_masks])
            for base_masks in per_lane_base_masks
        ])
        if max_different_base_masks > 1:
            # in a HiSeqX run I cannot have different index sizes in the SAME lane
            logger.error(
                "In FC {} found one or more lane with more than one base mask (i.e., different index sizes in \
                         in the same lane".format(self.id))
            return False
        bcl2fastq_cmd_counter = 0
        with chdir(self.run_dir):
            # create Demultiplexing dir, this changes the status to IN_PROGRESS
            if not os.path.exists("Demultiplexing"):
                os.makedirs("Demultiplexing")
        with chdir(self.run_dir):
            if self.lanes_not_10X:
                cmd_normal = self.generate_bcl_command(self.lanes_not_10X,
                                                       bcl2fastq_cmd_counter)
                misc.call_external_command_detached(
                    cmd_normal,
                    with_log_files=True,
                    prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(
                    ("BCL to FASTQ conversion and demultiplexing started for "
                     "normal run {} on {}".format(os.path.basename(self.id),
                                                  datetime.now())))
                bcl2fastq_cmd_counter += 1
            if self.lanes_10X:
                cmd_10X = self.generate_bcl_command(self.lanes_10X,
                                                    bcl2fastq_cmd_counter,
                                                    is_10X=True)
                misc.call_external_command_detached(
                    cmd_10X,
                    with_log_files=True,
                    prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(
                    ("BCL to FASTQ conversion and demultiplexing started for "
                     "10X run {} on {}".format(os.path.basename(self.id),
                                               datetime.now())))
                bcl2fastq_cmd_counter += 1
        return True
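The Xten demultiplex_run above relies on _generate_samplesheet_subset to write a samplesheet containing only the 10X or non-10X lanes. The helper itself is not shown, so the sketch below is hypothetical: it only filters a list of row dicts (like runParserObj.samplesheet.data) by lane and joins them back into CSV text, whereas the real helper presumably also reproduces the samplesheet section headers.

def samplesheet_subset_sketch(rows, lanes, fieldnames):
    """Return CSV text with only the samplesheet rows whose 'Lane' is in lanes.

    Hypothetical illustration of what _generate_samplesheet_subset is used for
    above; it works on a plain list of row dicts rather than a SampleSheetParser
    object and omits the samplesheet section headers.
    """
    header = ",".join(fieldnames)
    kept = [",".join(str(row[field]) for field in fieldnames)
            for row in rows if row["Lane"] in lanes]
    return "\n".join([header] + kept) + "\n"

# usage sketch:
# rows = self.runParserObj.samplesheet.data  # list of dicts with a 'Lane' key
# print(samplesheet_subset_sketch(rows, lanes_10X, fieldnames=list(rows[0].keys())))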
Beispiel #33
0
 def encrypt_runs(cls, run, force):
     """Encrypt the runs that have been collected"""
     bk = cls(run)
     bk.collect_runs(ext=".tar.gz")
     logger.info("In total, found {} run(s) to be encrypted".format(
         len(bk.runs)))
     for run in bk.runs:
         run.flag = "{}.encrypting".format(run.name)
         run.dst_key_encrypted = os.path.join(bk.keys_path,
                                              run.key_encrypted)
         tmp_files = [
             run.zip_encrypted, run.key_encrypted, run.key, run.flag
         ]
         logger.info("Encryption of run {} is now started".format(run.name))
         # Check if there is enough space and exit if not
         bk.avail_disk_space(run.path, run.name)
         # Check if the run is demultiplexed
         if not force and bk.check_demux:
             if not misc.run_is_demuxed(run.name, bk.couch_info):
                 logger.warn(
                     "Run {} is not demultiplexed yet, so skipping it".
                     format(run.name))
                 continue
             logger.info(
                 "Run {} is demultiplexed and proceeding with encryption".
                 format(run.name))
         with filesystem.chdir(run.path):
             # skip run if already ongoing
             if os.path.exists(run.flag):
                 logger.warn(
                     "Run {} is already being encrypted, so skipping now".
                     format(run.name))
                 continue
             open(run.flag, 'w').close()
             # zip the run directory
             if os.path.exists(run.zip):
                 if os.path.isdir(run.name):
                     logger.warn(
                         "Both run source and zipped archive exist for run {}, skipping run as precaution"
                         .format(run.name))
                     bk._clean_tmp_files([run.flag])
                     continue
                 logger.info(
                     "Zipped archive already exist for run {}, so using it for encryption"
                     .format(run.name))
             else:
                 logger.info("Creating zipped archive for run {}".format(
                     run.name))
                 if bk._call_commands(cmd1="tar -cf - {}".format(run.name),
                                      cmd2="pigz --fast -c -",
                                      out_file=run.zip,
                                      mail_failed=True,
                                      tmp_files=[run.zip, run.flag]):
                     logger.info(
                         "Run {} was successfully compressed, so removing the run source directory"
                         .format(run.name))
                     shutil.rmtree(run.name)
                 else:
                     logger.warn("Skipping run {} and moving on".format(
                         run.name))
                     continue
             # Remove encrypted file if already exists
             if os.path.exists(run.zip_encrypted):
                 logger.warn((
                     "Removing already existing encrypted file for run {}, this is a precaution "
                     "to make sure the file was encrypted with correct key file"
                     .format(run.name)))
                 bk._clean_tmp_files([
                     run.zip_encrypted, run.key, run.key_encrypted,
                     run.dst_key_encrypted
                 ])
             # Generate a random key to use as passphrase
             if not bk._call_commands(cmd1="gpg --gen-random 1 256",
                                      out_file=run.key,
                                      tmp_files=tmp_files):
                 logger.warn("Skipping run {} and moving on".format(
                     run.name))
                 continue
             logger.info("Generated randon phrase key for run {}".format(
                 run.name))
             # Calculate md5 sum pre encryption
             if not force:
                 logger.info("Calculating md5sum before encryption")
                 md5_call, md5_out = bk._call_commands(
                     cmd1="md5sum {}".format(run.zip),
                     return_out=True,
                     tmp_files=tmp_files)
                 if not md5_call:
                     logger.warn("Skipping run {} and moving on".format(
                         run.name))
                     continue
                 md5_pre_encrypt = md5_out.split()[0]
             # Encrypt the zipped run file
             logger.info("Encrypting the zipped run file")
             if not bk._call_commands(
                     cmd1=
                 ("gpg --symmetric --cipher-algo aes256 --passphrase-file {} --batch --compress-algo "
                  "none -o {} {}".format(run.key, run.zip_encrypted,
                                         run.zip)),
                     tmp_files=tmp_files):
                 logger.warn("Skipping run {} and moving on".format(
                     run.name))
                 continue
             # Decrypt and check for md5
             if not force:
                 logger.info("Calculating md5sum after encryption")
                 md5_call, md5_out = bk._call_commands(
                     cmd1=
                     "gpg --decrypt --cipher-algo aes256 --passphrase-file {} --batch {}"
                     .format(run.key, run.zip_encrypted),
                     cmd2="md5sum",
                     return_out=True,
                     tmp_files=tmp_files)
                 if not md5_call:
                     logger.warn("Skipping run {} and moving on".format(
                         run.name))
                     continue
                 md5_post_encrypt = md5_out.split()[0]
                 if md5_pre_encrypt != md5_post_encrypt:
                     logger.error((
                         "md5sum did not match before {} and after {} encryption. Will remove temp files and "
                         "move on".format(md5_pre_encrypt,
                                          md5_post_encrypt)))
                     bk._clean_tmp_files(tmp_files)
                     continue
                 logger.info(
                     "Md5sum is macthing before and after encryption")
             # Encrypt and move the key file
             if bk._call_commands(cmd1="gpg -e -r {} -o {} {}".format(
                     bk.gpg_receiver, run.key_encrypted, run.key),
                                  tmp_files=tmp_files):
                 shutil.move(run.key_encrypted, run.dst_key_encrypted)
             else:
                 logger.error("Encrption of key file failed, skipping run")
                 continue
             bk._clean_tmp_files([run.zip, run.key, run.flag])
             logger.info(
                 "Encryption of run {} is successfully done, removing zipped run file"
                 .format(run.name))
Beispiel #34
0
    def demultiplex_run(self):
        """
           Demultiplex a Xten run:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep)
            - run bcl2fastq conversion
        """
        ssname = self._get_samplesheet()
        ssparser = SampleSheetParser(ssname)
        try:
            indexfile = self.CONFIG['bcl2fastq']['index_path']
        except KeyError:
            logger.error(
                "Path to index file (10X) not found in the config file")
            raise RuntimeError
        #the samplesheet needs to be positioned in the FC directory with the name SampleSheet.csv (Illumina default)
        #if this is not the case then create it and take special care of the modifications to be made to the SampleSheet
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        #Function that returns a list of which lanes contain 10X samples.
        (lanes_10X, lanes_not_10X) = look_for_lanes_with_10X_indicies(
            indexfile, ssparser)
        #check whether the samplesheet is already present; if it is, go to the next step
        if not os.path.exists(samplesheet_dest):
            try:
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(
                        _generate_clean_samplesheet(
                            ssparser,
                            indexfile,
                            fields_to_remove=['index2'],
                            rename_samples=True,
                            rename_qPCR_suffix=True,
                            fields_qPCR=[ssparser.dfield_snm]))
            except Exception as e:
                logger.error(
                    "encountered the following exception '{}'".format(e))
                return False
            logger.info(
                ("Created SampleSheet.csv for Flowcell {} in {} ".format(
                    self.id, samplesheet_dest)))
        ##SampleSheet.csv generated

        ##when demultiplexing SampleSheet.csv is the one I need to use
        ## Need to rewrite so that SampleSheet_0.csv is always used.
        self.runParserObj.samplesheet = SampleSheetParser(
            os.path.join(self.run_dir, "SampleSheet.csv"))
        #we have 10X lanes - need to split the sample sheet and build a 10X command for bcl2fastq
        Complex_run = False
        if len(lanes_10X) and len(lanes_not_10X):
            Complex_run = True

        if Complex_run:
            with chdir(self.run_dir):
                samplesheet_dest_not_10X = "SampleSheet_0.csv"
                with open(samplesheet_dest_not_10X, 'wb') as fcd:
                    fcd.write(
                        _generate_samplesheet_subset(
                            self.runParserObj.samplesheet, lanes_not_10X))
                samplesheet_dest_10X = "SampleSheet_1.csv"
                with open(samplesheet_dest_10X, 'wb') as fcd:
                    fcd.write(
                        _generate_samplesheet_subset(
                            self.runParserObj.samplesheet, lanes_10X))
        else:
            with chdir(self.run_dir):
                shutil.copy("SampleSheet.csv", "SampleSheet_0.csv")

        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([
            len(per_lane_base_masks[base_masks])
            for base_masks in per_lane_base_masks
        ])
        if max_different_base_masks > 1:
            # in a HiSeqX run I cannot have different index sizes in the SAME lane
            logger.error(
                "In FC {} found one or more lane with more than one base mask (i.e., different index sizes in \
                         in the same lane".format(self.id))
            return False
        bcl2fastq_cmd_counter = 0
        with chdir(self.run_dir):
            # create Demultiplexing dir, this changes the status to IN_PROGRESS
            if not os.path.exists("Demultiplexing"):
                os.makedirs("Demultiplexing")
        with chdir(self.run_dir):
            if lanes_not_10X:
                cmd_normal = self.generate_bcl_command(lanes_not_10X,
                                                       bcl2fastq_cmd_counter)
                misc.call_external_command_detached(
                    cmd_normal,
                    with_log_files=True,
                    prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(
                    ("BCL to FASTQ conversion and demultiplexing started for "
                     "normal run {} on {}".format(os.path.basename(self.id),
                                                  datetime.now())))
                bcl2fastq_cmd_counter += 1
            if lanes_10X:
                cmd_10X = self.generate_bcl_command(lanes_10X,
                                                    bcl2fastq_cmd_counter,
                                                    is_10X=True)
                misc.call_external_command_detached(
                    cmd_10X,
                    with_log_files=True,
                    prefix="demux_{}".format(bcl2fastq_cmd_counter))
                logger.info(
                    ("BCL to FASTQ conversion and demultiplexing started for "
                     "10X run {} on {}".format(os.path.basename(self.id),
                                               datetime.now())))
                bcl2fastq_cmd_counter += 1
        return True
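
Note: _generate_samplesheet_subset is not shown in this example. A minimal sketch of such a subset writer, assuming the parsed samplesheet can be reduced to a list of row dictionaries plus an ordered list of column names (the actual SampleSheetParser interface may differ):

def generate_samplesheet_subset(rows, columns, lanes):
    """Return a [Data] section limited to the given lanes.

    rows    -- list of dicts, one per sample line, each with a 'Lane' key
    columns -- ordered list of column names to write
    lanes   -- iterable of lane numbers (as strings or ints) to keep
    """
    lanes = set(str(lane) for lane in lanes)
    lines = ["[Data]", ",".join(columns)]
    for row in rows:
        if str(row.get("Lane")) in lanes:
            lines.append(",".join(row.get(col, "") for col in columns))
    return "\n".join(lines) + "\n"
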
Beispiel #35
0
    def compute_undetermined(self):
        """
        This function returns True if all demux steps are done and we can proceed to QC.
        For simple lanes with index: no check is done, everything needs to be in place.
        For complex lanes: no check is done, everything needs to be in place.
        For simple lanes with NoIndex: check if demux counts have been computed; if not, compute them or return while waiting for their completion.
        """
        NoIndexLanes = [lane["Lane"] for lane in self.runParserObj.samplesheet.data if "NoIndex" in lane["index"]]
        if len(NoIndexLanes) == 0:
            return True  # everything is fine I can proceed to QC
        # otherwise proceed

        NoIndex_Undetermiend = os.path.join(self.run_dir, "Demultiplexing_NoIndex")
        if not os.path.exists(NoIndex_Undetermiend):
            # for these lanes I have no undetermined reads as I demux them without index.
            # now generate the base masks per lane
            per_lane_base_masks = self._generate_per_lane_base_mask()
            # store here only the NoIndex lanes
            per_lane_base_masks_NoIndex = {}
            run_with_no_index = False  # use this flag to check that we are not in the C.Daub case
            for NoIndexLane in NoIndexLanes:
                per_lane_base_masks_NoIndex[NoIndexLane] = per_lane_base_masks[NoIndexLane]
                base_mask_key = per_lane_base_masks[NoIndexLane].keys()[0]
                new_base_mask = []
                if len(per_lane_base_masks_NoIndex[NoIndexLane][base_mask_key]["base_mask"]):
                    # C.Daub_15_01 case, only one sample per lane and no index at all
                    run_with_no_index = True
                else:
                    for baseMask_element in per_lane_base_masks_NoIndex[NoIndexLane][base_mask_key]["base_mask"]:
                        if baseMask_element.startswith("Y"):
                            new_base_mask.append(baseMask_element.replace("Y", "N"))
                        elif baseMask_element.startswith("N"):
                            new_base_mask.append(baseMask_element.replace("N", "Y"))
                    per_lane_base_masks_NoIndex[NoIndexLane][base_mask_key]["base_mask"] = new_base_mask
            if not run_with_no_index:
                os.makedirs(NoIndex_Undetermiend)
                command = self._generate_bcl2fastq_command(
                    per_lane_base_masks_NoIndex, True, "NoIndex", mask_short_adapter_reads=True
                )
                with chdir(self.run_dir):
                    misc.call_external_command_detached(command, with_log_files=True, prefix="demux_NoIndex")
                # return false, as I need to wait to finish the demux for the NoIndex case
                return False
            else:
                # in this case I do not want to start a demux for the index, because I do not have the index at all
                # I need to softlink everything else that is in Stats as I do not want to recompute it
                missingStatsFiles = glob.glob(os.path.join(self.run_dir, "Demultiplexing_0", "Stats", "*F*L*.txt"))
                destination = os.path.join(self.run_dir, self.demux_dir, "Stats")
                for source in missingStatsFiles:
                    source_file_name = os.path.basename(source)
                    if not os.path.exists(os.path.join(destination, source_file_name)):
                        os.symlink(source, os.path.join(destination, source_file_name))
                return True
        else:
            # in this case it means that I have already started to demux the NoIndex
            if not os.path.exists(
                os.path.join(self.run_dir, "Demultiplexing_NoIndex", "Stats", "DemultiplexingStats.xml")
            ):
                # demultiplexing of undetermined is still ongoing
                logger.info("Demux of NoIndex lanes ongoing")
                return False
            else:
                logger.info("Demux of NoIndex lanes done.")
                # now I need to produce the files needed in the QC
                flag_file = os.path.join(NoIndex_Undetermiend, "ongoing")
                if os.path.exists(flag_file):
                    # it means that a previous instance of TACA is running and still processing this FC
                    logger.info("Counting of undetermined indexes for NoIndex lanes ongoing")
                    return False
                # now check if the stats have been already computed
                computed = True
                for lane_id in NoIndexLanes:
                    demuxSummary_file = os.path.join(
                        self.run_dir, self.demux_dir, "Stats", "DemuxSummaryF1L{}.txt".format(lane_id)
                    )
                    if not os.path.exists(demuxSummary_file):
                        # if it does not exist and the ongoing flag is not present, then I need to set computed to False
                        computed = False
                if computed:
                    # in this case I already computed all the demux stats that I need
                    return True
                # otherwise I need to compute them
                open(flag_file, "a").close()  # create the flag file indicating I am working on this
                for lane_id in NoIndexLanes:
                    # count the index occurrences; each lane corresponds to one project, a project might have multiple lanes
                    current_lane = [lane for lane in self.runParserObj.samplesheet.data if lane_id == lane["Lane"]][0]
                    if current_lane["index"] != "NoIndex":
                        logger.error(
                            "while processing run {} NoIndex lane {}, index {} found in SampleSheet".format(
                                self.id, lane_id, current_lane["index"]
                            )
                        )
                        return False
                    index_counter = {}
                    indexes_fastq1 = glob.glob(
                        os.path.join(
                            NoIndex_Undetermiend,
                            current_lane[self.runParserObj.samplesheet.dfield_proj],
                            current_lane[self.runParserObj.samplesheet.dfield_sid],
                            "{}_S?_L00{}_R2_001.fastq.gz".format(
                                current_lane[self.runParserObj.samplesheet.dfield_snm], lane_id
                            ),
                        )
                    )[0]
                    indexes_fastq2 = glob.glob(
                        os.path.join(
                            NoIndex_Undetermiend,
                            current_lane[self.runParserObj.samplesheet.dfield_proj],
                            current_lane[self.runParserObj.samplesheet.dfield_sid],
                            "{}_S?_L00{}_R3_001.fastq.gz".format(
                                current_lane[self.runParserObj.samplesheet.dfield_snm], lane_id
                            ),
                        )
                    )[0]
                    # I assume these two files are always present, maybe it is possible to have no index with a single index...
                    logger.info("Computing undetermined indexes for NoIndex lane {}".format(lane_id))
                    zcat = subprocess.Popen(["zcat", indexes_fastq1], stdout=subprocess.PIPE)
                    # this command streams the two files, printing them line by line separated by a plus
                    awk = subprocess.Popen(
                        [
                            "awk",
                            'BEGIN {{OFS="+"}}{{  ("zcat " "{0} " ) | getline line ; print $0,line }}'.format(
                                indexes_fastq2
                            ),
                        ],
                        stdout=subprocess.PIPE,
                        stdin=zcat.stdout,
                    )
                    # now select only the 2nd line every 4 (i.e., only the index1+index2 line)
                    sed = subprocess.Popen(["sed", "-n", "2~4p"], stdout=subprocess.PIPE, stdin=awk.stdout)
                    zcat.stdout.close()
                    awk.stdout.close()
                    output = sed.communicate()[0]
                    zcat.wait()
                    awk.wait()
                    for barcode in output.split("\n")[:-1]:
                        try:
                            index_counter[barcode] += 1
                        except KeyError:
                            index_counter[barcode] = 1
                    demuxSummary_file = os.path.join(
                        self.run_dir, self.demux_dir, "Stats", "DemuxSummaryF1L{}.txt".format(lane_id)
                    )
                    with open(demuxSummary_file, "w") as demuxSummary_file_fh:
                        demuxSummary_file_fh.write("### Most Popular Unknown Index Sequences\n")
                        demuxSummary_file_fh.write("### Columns: Index_Sequence Hit_Count\n")
                        for (index, occ) in sorted(index_counter.items(), key=operator.itemgetter(1), reverse=True):
                            demuxSummary_file_fh.write("{}\t{}\n".format(index, occ))

                # I need to fill in the lane and laneBarcode html reports; when I demux with NoIndex I do not create many values
                undeterminedStats = DemuxSummaryParser(os.path.join(self.run_dir, self.demux_dir, "Stats"))
                sample_data_old = self.runParserObj.lanes.sample_data
                sample_data_new = []
                for lane in sample_data_old:
                    if lane["Lane"] in NoIndexLanes:
                        # in this case I need to fill in new values
                        PF_clusters = undeterminedStats.TOTAL[lane["Lane"]]
                        lane["% One mismatchbarcode"] = "0"
                        lane["% Perfectbarcode"] = "100"
                        lane["% of thelane"] = "100"
                        lane["PF Clusters"] = str(PF_clusters)
                    sample_data_new.append(lane)
                self.runParserObj.lanes.sample_data = sample_data_new

                demux_folder = os.path.join(self.run_dir, "Demultiplexing")
                new_html_report_lane_dir = _create_folder_structure(
                    demux_folder, ["Reports", "html", self.flowcell_id, "all", "all", "all"]
                )
                new_html_report_lane = os.path.join(new_html_report_lane_dir, "lane.html")
                _generate_lane_html(new_html_report_lane, self.runParserObj.lanes)
                # now do the same for laneBarcode
                sampleBarcode_data_old = self.runParserObj.lanebarcodes.sample_data
                sampleBarcode_data_new = []
                for sample in sampleBarcode_data_old:
                    if sample["Lane"] in NoIndexLanes:
                        # in this case I need to fill in new values
                        PF_clusters = undeterminedStats.TOTAL[sample["Lane"]]
                        sample["% One mismatchbarcode"] = "0"
                        sample["% Perfectbarcode"] = "100"
                        sample["% of thelane"] = "100"
                        sample["PF Clusters"] = str(PF_clusters)
                    sampleBarcode_data_new.append(sample)
                self.runParserObj.lanebarcodes.sample_data = sampleBarcode_data_new
                demux_folder = os.path.join(self.run_dir, "Demultiplexing")
                new_html_report_sampleBarcode_dir = _create_folder_structure(
                    demux_folder, ["Reports", "html", self.flowcell_id, "all", "all", "all"]
                )
                new_html_report_sampleBarcode = os.path.join(new_html_report_sampleBarcode_dir, "laneBarcode.html")
                _generate_lane_html(new_html_report_sampleBarcode, self.runParserObj.lanebarcodes)

                os.remove(flag_file)  # remove flag file to allow future iteration on this FC
                return True  # return true, I have done everything I was supposed to do
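
Note: the zcat | awk | sed pipeline above reads the R2 and R3 index files in lock-step, keeps only the sequence line of every 4-line FASTQ record and joins the two sequences with a plus. A pure-Python sketch of the same counting step (shown only to clarify what the pipeline computes, not what this code runs):

import gzip
from collections import Counter
from itertools import islice

def count_index_pairs(fastq_r2, fastq_r3):
    """Count index1+index2 combinations from two gzipped FASTQ files."""
    counts = Counter()
    fh1 = gzip.open(fastq_r2, 'rb')
    fh2 = gzip.open(fastq_r3, 'rb')
    try:
        # islice(fh, 1, None, 4) yields the 2nd line of every 4-line FASTQ record
        for seq1, seq2 in zip(islice(fh1, 1, None, 4), islice(fh2, 1, None, 4)):
            counts[(seq1.strip() + b"+" + seq2.strip()).decode()] += 1
    finally:
        fh1.close()
        fh2.close()
    return counts

# the DemuxSummary file written above is just these counts sorted by occurrence, highest first
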
Beispiel #36
0
    def demultiplex_run(self):
        """
        Demultiplex a HiSeq run:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - create multiple SampleSheets in case at least one lane has multiple index lengths
            - run bcl2fastq conversion
        """

        ssname = self._get_samplesheet()
        if ssname is None:
            return None
        ssparser = SampleSheetParser(ssname)
        # Copy the original samplesheet locally. Copy again if already done as there might have been changes to the samplesheet
        try:
            shutil.copy(ssname, os.path.join(self.run_dir, "{}.csv".format(self.flowcell_id)))
            ssname = os.path.join(self.run_dir, os.path.split(ssname)[1])
        except:
            raise RuntimeError("unable to copy file {} to destination {}".format(ssname, self.run_dir))

        # this sample sheet has been created by the LIMS and copied by a sequencing operator. It is not ready
        # to be used; it needs some editing
        # this will contain the samplesheet with all the renaming to be used with bcl2fastq-2.17
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        # check whether the samplesheet is already present; in that case it will be overwritten
        if os.path.exists(samplesheet_dest):
            logger.info("SampleSheet.csv found ... overwriting it")
        try:
            with open(samplesheet_dest, "wb") as fcd:
                fcd.write(self._generate_clean_samplesheet(ssparser))
        except Exception as e:
            logger.error(e)
            return False
        logger.info(("Created SampleSheet.csv for Flowcell {} in {} ".format(self.id, samplesheet_dest)))
        ##SampleSheet.csv generated
        ##when demultiplexing SampleSheet.csv is the one I need to use
        self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, "SampleSheet.csv"))
        # now generate the base masks per lane and decide how to demultiplex
        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([len(per_lane_base_masks[base_masks]) for base_masks in per_lane_base_masks])
        # if max_different is one, then I have a simple config and I can run a single command. Otherwise I need to run multiple instances
        # extract lanes with a single base masks
        simple_lanes = {}
        complex_lanes = {}
        for lane in per_lane_base_masks:
            if len(per_lane_base_masks[lane]) == 1:
                simple_lanes[lane] = per_lane_base_masks[lane]
            else:
                complex_lanes[lane] = per_lane_base_masks[lane]
        # simple_lanes contains the lanes that have a single base mask
        bcl2fastq_commands = []
        bcl2fastq_command_num = 0
        if len(simple_lanes) > 0:
            bcl2fastq_commands.append(self._generate_bcl2fastq_command(simple_lanes, True, bcl2fastq_command_num))
            bcl2fastq_command_num += 1
        # compute the different masks, there will be one bcl2fastq command per mask
        base_masks_complex = [complex_lanes[base_masks].keys() for base_masks in complex_lanes]
        different_masks = list(set([item for sublist in base_masks_complex for item in sublist]))
        for mask in different_masks:
            base_masks_complex_to_demux = {}
            for lane in complex_lanes:
                if complex_lanes[lane].has_key(mask):
                    base_masks_complex_to_demux[lane] = {}
                    base_masks_complex_to_demux[lane][mask] = complex_lanes[lane][mask]
            # at this point base_masks_complex_to_demux contains only one base mask per lane. I can build the command
            bcl2fastq_commands.append(
                self._generate_bcl2fastq_command(base_masks_complex_to_demux, True, bcl2fastq_command_num)
            )
            bcl2fastq_command_num += 1
        # now bcl2fastq_commands contains all commands to be executed. They could run in parallel, however run only one at a time to avoid overloading the machine
        with chdir(self.run_dir):
            # create Demultiplexing dir, in this way the status of this run will become IN_PROGRESS
            if not os.path.exists("Demultiplexing"):
                os.makedirs("Demultiplexing")
            execution = 0
            for bcl2fastq_command in bcl2fastq_commands:
                misc.call_external_command_detached(
                    bcl2fastq_command, with_log_files=True, prefix="demux_{}".format(execution)
                )
                execution += 1
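
Note: _generate_per_lane_base_mask is not shown; the code above assumes it returns a dict keyed by lane whose values map a base-mask identifier to its details. A toy illustration of the simple/complex lane split on such a structure (the exact nesting and keys are assumptions):

# lanes with a single base mask are "simple", lanes with several are "complex"
per_lane_base_masks = {
    '1': {'Y151I8Y151': {'base_mask': ['Y151', 'I8', 'Y151']}},
    '2': {'Y151I8Y151': {'base_mask': ['Y151', 'I8', 'Y151']},
          'Y151I6N2Y151': {'base_mask': ['Y151', 'I6N2', 'Y151']}},
}

simple_lanes = dict((lane, masks) for lane, masks in per_lane_base_masks.items() if len(masks) == 1)
complex_lanes = dict((lane, masks) for lane, masks in per_lane_base_masks.items() if len(masks) > 1)

# one bcl2fastq command covers all simple lanes; complex lanes get one extra command per distinct mask
different_masks = set(mask for masks in complex_lanes.values() for mask in masks)
print(sorted(simple_lanes), sorted(complex_lanes), sorted(different_masks))
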
Beispiel #37
0
    def demultiplex_run(self):
        """
           Demultiplex a Xten run:
            - find the samplesheet
            - make a local copy of the samplesheet and name it SampleSheet.csv
            - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep)
            - run bcl2fastq conversion
        """

        ssname = self._get_samplesheet()
        ssparser = SampleSheetParser(ssname)
        #the samplesheet needs to be placed in the FC directory with the name SampleSheet.csv (Illumina default)
        #if this is not the case then create it and take special care of the modifications to be made to the SampleSheet
        samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        #check whether the samplesheet is already present; if so, go to the next step
        if not os.path.exists(samplesheet_dest):
            try:
                with open(samplesheet_dest, 'wb') as fcd:
                    fcd.write(
                        _generate_clean_samplesheet(
                            ssparser,
                            fields_to_remove=['index2'],
                            rename_samples=True,
                            rename_qPCR_suffix=True,
                            fields_qPCR=[ssparser.dfield_snm]))
            except Exception as e:
                logger.error(e)
                return False
            logger.info(
                ("Created SampleSheet.csv for Flowcell {} in {} ".format(
                    self.id, samplesheet_dest)))
        ##SampleSheet.csv generated
        ##when demultiplexing SampleSheet.csv is the one I need to use
        self.runParserObj.samplesheet = SampleSheetParser(
            os.path.join(self.run_dir, "SampleSheet.csv"))

        per_lane_base_masks = self._generate_per_lane_base_mask()
        max_different_base_masks = max([
            len(per_lane_base_masks[base_masks])
            for base_masks in per_lane_base_masks
        ])
        if max_different_base_masks > 1:
            # in a HiSeqX run I cannot have different index sizes in the SAME lane
            logger.error(
                "In FC {} found one or more lane with more than one base mask (i.e., different index sizes in \
                         in the same lane".format(self.id))
            return False
        #I have everything to run demultiplexing now.
        logger.info('Building bcl2fastq command')

        with chdir(self.run_dir):
            cl = [self.CONFIG.get('bcl2fastq')['bin']]
            if self.CONFIG.get('bcl2fastq').has_key('options'):
                cl_options = self.CONFIG['bcl2fastq']['options']
                # Append all options that appear in the configuration file to the main command.
                for option in cl_options:
                    if isinstance(option, dict):
                        opt, val = option.items()[0]
                        cl.extend(['--{}'.format(opt), str(val)])
                    else:
                        cl.append('--{}'.format(option))
            #now add the base_mask for each lane
            for lane in sorted(per_lane_base_masks):
                #iterate through each lane and add the correct --use-bases-mask for that lane
                #there is a single basemask for each lane, I checked it a couple of lines above
                base_mask = [
                    per_lane_base_masks[lane][bm]['base_mask']
                    for bm in per_lane_base_masks[lane]
                ][0]  # get the base_mask
                base_mask_expr = "{}:".format(lane) + ",".join(base_mask)
                cl.extend(["--use-bases-mask", base_mask_expr])

            logger.info(
                ("BCL to FASTQ conversion and demultiplexing started for "
                 " run {} on {}".format(os.path.basename(self.id),
                                        datetime.now())))
            misc.call_external_command_detached(cl, with_log_files=True)
        return True
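
Note: the loop above expands the 'options' entry of the bcl2fastq config section into command-line flags. A standalone sketch of that translation, assuming the same config layout (a list mixing bare flag names and single-entry dicts):

def options_to_cli(bin_path, options):
    """Build a bcl2fastq command line from a config-style options list."""
    cl = [bin_path]
    for option in options:
        if isinstance(option, dict):
            # {'output-dir': 'Demultiplexing'} -> --output-dir Demultiplexing
            opt, val = list(option.items())[0]
            cl.extend(['--{}'.format(opt), str(val)])
        else:
            # 'no-lane-splitting' -> --no-lane-splitting
            cl.append('--{}'.format(option))
    return cl

# options_to_cli('bcl2fastq', ['no-lane-splitting', {'output-dir': 'Demultiplexing'}])
# -> ['bcl2fastq', '--no-lane-splitting', '--output-dir', 'Demultiplexing']
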
Beispiel #38
0
def cleanup_irma(days_fastq, days_analysis, only_fastq, only_analysis, status_db_config, dry_run=False):
    """Remove fastq/analysis data for projects that have been closed more than given 
    days (as days_fastq/days_analysis) from the given 'irma' cluster

    :param int days_fastq: Days to consider to remove fastq files for project
    :param int days_analysis: Days to consider to remove analysis data for project
    :param bool only_fastq: Remove only fastq files for closed projects
    :param bool only_analysis: Remove only analysis data for closed projects
    :param bool dry_run: Will summarize what is going to be done without really doing it
    
    Example format for the config file
    cleanup:
        irma:
            flowcell:
                ##this path is nothing but the incoming directory; multiple paths can be given
                root: 
                    - path/to/flowcells_dir
                relative_project_source: Demultiplexing
    
            ##this is the path where projects are organized
            data_dir: path/to/data_dir
            analysis:
                ##directory where analyses are performed for projects
                root: path/to/analysis_dir
                #should be exactly the same as the qc folder name, and the files wished to be removed
                files_to_remove:
                    piper_ngi: 
                        - "*.bam"
    """
    try:
        config = CONFIG['cleanup']['irma']
        flowcell_dir_root = config['flowcell']['root']
        flowcell_project_source = config['flowcell']['relative_project_source']
        data_dir = config['data_dir']
        analysis_dir = config['analysis']['root']
        analysis_data_to_remove = config['analysis']['files_to_remove']
    except KeyError as e:
        logger.error("Config file is missing the key {}, make sure it have all required information".format(str(e)))
        raise SystemExit
    
    # make a connection for project db #
    pcon = statusdb.ProjectSummaryConnection(conf=status_db_config)
    assert pcon, "Could not connect to project database in StatusDB"

    #compile list of projects to delete
    project_clean_list, project_processed_list = ({}, [])
    logger.info("Building initial project list for removing data..")
    if only_fastq:
        logger.info("Option 'only_fastq' is given, so will not look for analysis data")
    elif only_analysis:
        logger.info("Option 'only_analysis' is given, so will not look for fastq data")
     
    if only_analysis:
        for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \
                    not os.path.exists(os.path.join(analysis_dir, d, "cleaned"))]:
            proj_abs_path = os.path.join(analysis_dir, pid)
            proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True))
            if proj_info and proj_info['closed_days'] >= days_analysis:
                analysis_data, analysis_size = collect_analysis_data_irma(pid, analysis_dir, analysis_data_to_remove)
                proj_info['analysis_to_remove'] = analysis_data
                proj_info['analysis_size'] = analysis_size
                proj_info['fastq_to_remove'] = "not_selected"
                proj_info['fastq_size'] = 0
                project_clean_list[proj_info['name']] = proj_info
    else:
        for flowcell_dir in flowcell_dir_root:
            for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE,d)]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
                                      if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$',d) and \
                                      not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]
                    for _proj in projects_in_fc:
                        proj = re.sub(r'_+', '.', _proj, 1)
                        # if a project is already processed there is no need to fetch it again from statusdb
                        if proj in project_processed_list:
                            # if the project has been closed for more than the threshold days, collect the fastq files from the FC
                            # no need to look for analysis data as it would have been collected the first time
                            if proj in project_clean_list and project_clean_list[proj]['closed_days'] >= days_fastq:
                                fc_fq_files, fq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj))
                                project_clean_list[proj]['fastq_to_remove']['flowcells'][fc] = fc_fq_files['flowcells'][fc]
                                project_clean_list[proj]['fastq_size'] += fq_size
                            continue
                        project_processed_list.append(proj)
                        #by default assume all projects are not old enough for deletion
                        fastq_data, analysis_data = ("young", "young")
                        fastq_size, analysis_size = (0, 0)
                        proj_info = get_closed_proj_info(proj, pcon.get_entry(proj))
                        if proj_info:
                            # if the project is not old enough for fastq files and only fastq files are selected, move on to the next project
                            if proj_info['closed_days'] >= days_fastq:
                                fastq_data, fastq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj),
                                                                                 data_dir, proj_info['pid'])
                            if not only_fastq:
                                # if the project is old enough for fastq files and not 'only_fastq', try to collect analysis files
                                if proj_info['closed_days'] >= days_analysis:
                                    analysis_data, analysis_size = collect_analysis_data_irma(proj_info['pid'], analysis_dir, analysis_data_to_remove)
                                # if both fastq and analysis files are not old enough move on
                                if (analysis_data == fastq_data) or ((not analysis_data or analysis_data == "cleaned") and fastq_data == "young"):
                                    continue
                            elif fastq_data == "young":
                                continue
                            else:
                                analysis_data = "not_selected"
                            proj_info['fastq_to_remove'] = fastq_data
                            proj_info['fastq_size'] = fastq_size
                            proj_info['analysis_to_remove'] = analysis_data
                            proj_info['analysis_size'] = analysis_size
                            project_clean_list[proj] = proj_info
    
    if not project_clean_list:
        logger.info("There are no projects to clean")
        return
                    
    get_files_size_text(project_clean_list)
    logger.info("Initial list is built with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
    if  misc.query_yes_no("Interactively filter projects for cleanup ?", default="yes"):
        filtered_project, proj_count = ([], 0)
        #go through the compiled project list and remove files
        for proj, info in project_clean_list.iteritems():
            proj_count += 1
            if not misc.query_yes_no("{}Delete files for this project ({}/{})".format(get_proj_meta_info(info, days_fastq),
                   proj_count, len(project_clean_list)), default="no"):
                logger.info("Will not remove files for project {}".format(proj))
                filtered_project.append(proj)
        # remove projects that were decided not to delete
        map(project_clean_list.pop, filtered_project)
        logger.info("Removed {}/{} projects from initial list".format(len(filtered_project), proj_count))
        if not project_clean_list:
            logger.info("There are no projects to clean after filtering")
            return
        logger.info("Final list is created with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
        if not misc.query_yes_no("Proceed with cleanup ?", default="no"):
            logger.info("Aborting cleanup")
            return
    logger.info("Will start cleaning up project now")
    
    for proj, info in project_clean_list.iteritems():
        fastq_info = info.get('fastq_to_remove')
        if fastq_info and isinstance(fastq_info, dict):
            logger.info("Cleaning fastq files for project {}".format(proj))
            fastq_fc = fastq_info.get('flowcells', {})
            removed_fc = []
            for fc, fc_info in fastq_fc.iteritems():
                proj_fc_root = fc_info['proj_root']
                logger.info("Removing fastq files from {}".format(proj_fc_root))
                if not dry_run:
                    if _remove_files(fc_info['fq_files']):
                        logger.info("Removed fastq files from FC {} for project {}, marking it as cleaned".format(fc, proj))
                        _touch_cleaned(proj_fc_root)
                        removed_fc.append(fc)
            if len(fastq_fc) == len(removed_fc):
                try:
                    proj_data_root = fastq_info['proj_data']['proj_data_root']
                    logger.info("All flowcells cleaned for this project, marking it as cleaned in {}".format(proj_data_root))
                    _touch_cleaned(proj_data_root)
                except:
                    pass
            
        analysis_info = info.get('analysis_to_remove')
        if analysis_info and isinstance(analysis_info, dict):
            proj_analysis_root = analysis_info['proj_analysis_root']
            logger.info("cleaning analysis data for project {}".format(proj))
            removed_qc = []
            for qc, files in analysis_info['analysis_files'].iteritems():
                logger.info("Removing files of '{}' from {}".format(qc, proj_analysis_root))
                if not dry_run:
                    if _remove_files(files):
                        removed_qc.append(qc)
                    else:
                        logger.warn("Couldn't remove some files in qc directory '{}'".format(qc))
            map(analysis_info['analysis_files'].pop, removed_qc)
            if len(analysis_info['analysis_files']) == 0:
                logger.info("Removed analysis data for project {}, marking it cleaned".format(proj))
                _touch_cleaned(proj_analysis_root)
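
Note: the helpers _touch_cleaned and _remove_files are not shown in this example. Minimal sketches, assuming "cleaned" is simply an empty flag file and that _remove_files reports whether every listed file could be deleted:

import os

def _touch_cleaned(path):
    """Drop an empty 'cleaned' flag file in path, the marker checked when building the list."""
    open(os.path.join(path, 'cleaned'), 'w').close()

def _remove_files(files):
    """Try to delete every file in the list; return True only if all of them were removed."""
    all_removed = True
    for f in files:
        try:
            os.remove(f)
        except OSError:
            all_removed = False
    return all_removed
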
Beispiel #39
0
 def demultiplex_run(self):
     """
     Demultiplex a HiSeq run:
         - find the samplesheet
         - make a local copy of the samplesheet and name it SampleSheet.csv
         - create multiple SampleSheets in case at least one lane has multiple index lengths
         - run bcl2fastq conversion
     """
     #now generate the base masks per lane and decide how to demultiplex
     per_lane_base_masks = self._generate_per_lane_base_mask()
     max_different_base_masks = max([
         len(per_lane_base_masks[base_masks])
         for base_masks in per_lane_base_masks
     ])
     #if max_different is one, then I have a simple config and I can run a single command. Otherwise I need to run multiple instances
     #extract lanes with a single base masks
     simple_lanes = {}
     complex_lanes = {}
     for lane in per_lane_base_masks:
         if len(per_lane_base_masks[lane]) == 1:
             simple_lanes[lane] = per_lane_base_masks[lane]
         else:
             complex_lanes[lane] = per_lane_base_masks[lane]
     #simple_lanes contains the lanes that have a single base mask
     bcl2fastq_commands = []
     bcl2fastq_command_num = 0
     if len(simple_lanes) > 0:
         bcl2fastq_commands.append(
             self._generate_bcl2fastq_command(simple_lanes, True,
                                              bcl2fastq_command_num))
         bcl2fastq_command_num += 1
     #compute the different masks, there will be one bcl2fastq command per mask
     base_masks_complex = [
         complex_lanes[base_masks].keys() for base_masks in complex_lanes
     ]
     different_masks = list(
         set([item for sublist in base_masks_complex for item in sublist]))
     for mask in different_masks:
         base_masks_complex_to_demux = {}
         for lane in complex_lanes:
             if complex_lanes[lane].has_key(mask):
                 base_masks_complex_to_demux[lane] = {}
                 base_masks_complex_to_demux[lane][mask] = complex_lanes[
                     lane][mask]
         #at this point base_masks_complex_to_demux contains only one base mask per lane. I can build the command
         bcl2fastq_commands.append(
             self._generate_bcl2fastq_command(base_masks_complex_to_demux,
                                              True, bcl2fastq_command_num))
         bcl2fastq_command_num += 1
     #now bcl2fastq_commands contains all commands to be executed. They could run in parallel, however run only one at a time to avoid overloading the machine
     with chdir(self.run_dir):
         # create Demultiplexing dir, in this way the status of this run will become IN_PROGRESS
         if not os.path.exists("Demultiplexing"):
             os.makedirs("Demultiplexing")
         execution = 0
         for bcl2fastq_command in bcl2fastq_commands:
             misc.call_external_command_detached(
                 bcl2fastq_command,
                 with_log_files=True,
                 prefix="demux_{}".format(execution))
             execution += 1
Beispiel #40
0
def cleanup_irma(days_fastq, days_analysis, only_fastq, only_analysis, clean_undetermined, status_db_config, exclude_projects, list_only, date, dry_run=False):
    """Remove fastq/analysis data for projects that have been closed more than given 
    days (as days_fastq/days_analysis) from the given 'irma' cluster

    :param int days_fastq: Days to consider to remove fastq files for project
    :param int days_analysis: Days to consider to remove analysis data for project
    :param bool only_fastq: Remove only fastq files for closed projects
    :param bool only_analysis: Remove only analysis data for closed projects
    :param bool dry_run: Will summarize what is going to be done without really doing it
    
    Example format for the config file
    cleanup:
        irma:
            flowcell:
                ##this path is nothing but the incoming directory; multiple paths can be given
                root: 
                    - path/to/flowcells_dir
                relative_project_source: Demultiplexing
                undet_file_pattern: "Undetermined_*.fastq.gz"
    
            ##this is the path where projects are organized
            data_dir: path/to/data_dir
            analysis:
                ##directory where analyses are performed for projects
                root: path/to/analysis_dir
                #should be exactly the same as the qc folder name, and the files wished to be removed
                files_to_remove:
                    piper_ngi: 
                        - "*.bam"
    """
    try:
        config = CONFIG['cleanup']['irma']
        flowcell_dir_root = config['flowcell']['root']
        flowcell_project_source = config['flowcell']['relative_project_source']
        flowcell_undet_files = config['flowcell']['undet_file_pattern']
        data_dir = config['data_dir']
        analysis_dir = config['analysis']['root']
        analysis_data_to_remove = config['analysis']['files_to_remove']
        if date:
            date = datetime.strptime(date, '%Y-%m-%d')
    except KeyError as e:
        logger.error("Config file is missing the key {}, make sure it have all required information".format(str(e)))
        raise SystemExit
    except ValueError as e:
        logger.error("Date given with '--date' option is not in required format, see help for more info")
        raise SystemExit

    # make a connection for project db #
    pcon = statusdb.ProjectSummaryConnection(conf=status_db_config)
    assert pcon, "Could not connect to project database in StatusDB"
    
    # make exclude project list if provided
    exclude_list = []
    if exclude_projects:
        if os.path.isfile(exclude_projects):
            with open(exclude_projects, 'r') as in_file:
                exclude_list.extend([p.strip() for p in in_file.readlines()])
        else:
            exclude_list.extend(exclude_projects.split(','))
        # sanity check that the projects mentioned for exclusion are valid
        invalid_projects = filter(lambda p: p not in pcon.id_view.keys() and p not in pcon.name_view.keys(), exclude_list)
        if invalid_projects:
            logger.error("'--exclude_projects' was called with some invalid projects '{}', "
                         "provide valid project name/id".format(",".join(invalid_projects)))
            raise SystemExit

    #compile list of projects to delete
    project_clean_list, project_processed_list = ({}, [])
    if not list_only and not clean_undetermined:
        logger.info("Building initial project list for removing data..")
    if only_fastq:
        logger.info("Option 'only_fastq' is given, so will not look for analysis data")
    elif only_analysis:
        logger.info("Option 'only_analysis' is given, so will not look for fastq data")
    
    if clean_undetermined:
        all_undet_files = []
        for flowcell_dir in flowcell_dir_root:
            for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE,d)]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    if not os.path.exists(flowcell_project_source):
                        logger.warn("Flowcell {} do not contain a '{}' direcotry".format(fc, flowcell_project_source))
                        continue
                    projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
                                      if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$',d) and \
                                      not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]
                    # the above check looked for project directories that are not yet cleaned,
                    # so if it could not find any project it means there is no project directory at all
                    # or all project directories are already cleaned. Then we can remove the undetermined files
                    if len(projects_in_fc) > 0:
                        continue
                    fc_undet_files = glob(os.path.join(flowcell_project_source,flowcell_undet_files))
                    if fc_undet_files:
                        logger.info("All projects was cleaned for FC {}, found {} undeterminded files".format(fc,len(fc_undet_files)))
                        all_undet_files.extend(map(os.path.abspath, fc_undet_files))
        if all_undet_files:
            undet_size = _def_get_size_unit(sum(map(os.path.getsize, all_undet_files)))
            if misc.query_yes_no("In total found {} undetermined files which are {} in size, delete now ?".format(len(all_undet_files),
                                 undet_size), default="no"):
                    removed = _remove_files(all_undet_files)
        return
    elif only_analysis:
        for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \
                    not os.path.exists(os.path.join(analysis_dir, d, "cleaned"))]:
            proj_abs_path = os.path.join(analysis_dir, pid)
            proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True), date)
            if proj_info and proj_info['closed_days'] >= days_analysis:
                # move on if this project has to be excluded
                if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list:
                    continue
                analysis_data, analysis_size = collect_analysis_data_irma(pid, analysis_dir, analysis_data_to_remove)
                proj_info['analysis_to_remove'] = analysis_data
                proj_info['analysis_size'] = analysis_size
                proj_info['fastq_to_remove'] = "not_selected"
                proj_info['fastq_size'] = 0
                project_clean_list[proj_info['name']] = proj_info
    else:
        for flowcell_dir in flowcell_dir_root:
            for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE,d)]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    if not os.path.exists(flowcell_project_source):
                        logger.warn("Flowcell {} do not contain a '{}' direcotry".format(fc, flowcell_project_source))
                        continue
                    projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
                                      if re.match(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$',d) and \
                                      not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]
                    for _proj in projects_in_fc:
                        proj = re.sub(r'_+', '.', _proj, 1)
                        # if a project is already processed there is no need to fetch it again from statusdb
                        if proj in project_processed_list:
                            # if the project has been closed for more than the threshold days, collect the fastq files from the FC
                            # no need to look for analysis data as it would have been collected the first time
                            if proj in project_clean_list and project_clean_list[proj]['closed_days'] >= days_fastq:
                                fc_fq_files, fq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj))
                                project_clean_list[proj]['fastq_to_remove']['flowcells'][fc] = fc_fq_files['flowcells'][fc]
                                project_clean_list[proj]['fastq_size'] += fq_size
                            continue
                        project_processed_list.append(proj)
                        #by default assume all projects are not old enough for deletion
                        fastq_data, analysis_data = ("young", "young")
                        fastq_size, analysis_size = (0, 0)
                        proj_info = get_closed_proj_info(proj, pcon.get_entry(proj), date)
                        if proj_info:
                            # move on if this project has to be excluded
                            if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list:
                                continue
                            # if the project is not old enough for fastq files and only fastq files are selected, move on to the next project
                            if proj_info['closed_days'] >= days_fastq:
                                fastq_data, fastq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj),
                                                                                 data_dir, proj_info['pid'])
                            if not only_fastq:
                                # if the project is old enough for fastq files and not 'only_fastq', try to collect analysis files
                                if proj_info['closed_days'] >= days_analysis:
                                    analysis_data, analysis_size = collect_analysis_data_irma(proj_info['pid'], analysis_dir, analysis_data_to_remove)
                                # if both fastq and analysis files are not old enough move on
                                if (analysis_data == fastq_data) or ((not analysis_data or analysis_data == "cleaned") and fastq_data == "young"):
                                    continue
                            elif fastq_data == "young":
                                continue
                            else:
                                analysis_data = "not_selected"
                            proj_info['fastq_to_remove'] = fastq_data
                            proj_info['fastq_size'] = fastq_size
                            proj_info['analysis_to_remove'] = analysis_data
                            proj_info['analysis_size'] = analysis_size
                            project_clean_list[proj] = proj_info
    
    if not project_clean_list:
        logger.info("There are no projects to clean")
        return
    
    # only list the projects and exit if the 'list_only' option is selected
    if list_only:
        print "Project ID\tProject Name\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size"
        for p_info in sorted(project_clean_list.values(), key=lambda d: d['closed_days'], reverse=True):
            print "\t".join([p_info['name'], p_info['pid'], p_info['bioinfo_responsible'],
                             str(p_info['closed_days']), p_info['closed_date'],
                             _def_get_size_unit(p_info['fastq_size']), _def_get_size_unit(p_info['analysis_size'])])
        raise SystemExit
            
    
    logger.info("Initial list is built with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
    if  misc.query_yes_no("Interactively filter projects for cleanup ?", default="yes"):
        filtered_project, proj_count = ([], 0)
        #go through the compiled project list and remove files
        for proj, info in project_clean_list.iteritems():
            proj_count += 1
            if not misc.query_yes_no("{}Delete files for this project ({}/{})".format(get_proj_meta_info(info, days_fastq),
                   proj_count, len(project_clean_list)), default="no"):
                logger.info("Will not remove files for project {}".format(proj))
                filtered_project.append(proj)
        # remove projects that were decided not to delete
        map(project_clean_list.pop, filtered_project)
        logger.info("Removed {}/{} projects from initial list".format(len(filtered_project), proj_count))
        if not project_clean_list:
            logger.info("There are no projects to clean after filtering")
            return
        logger.info("Final list is created with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
        if not misc.query_yes_no("Proceed with cleanup ?", default="no"):
            logger.info("Aborting cleanup")
            return
    logger.info("Will start cleaning up project now")
    
    for proj, info in project_clean_list.iteritems():
        fastq_info = info.get('fastq_to_remove')
        if fastq_info and isinstance(fastq_info, dict):
            logger.info("Cleaning fastq files for project {}".format(proj))
            fastq_fc = fastq_info.get('flowcells', {})
            removed_fc = []
            for fc, fc_info in fastq_fc.iteritems():
                proj_fc_root = fc_info['proj_root']
                logger.info("Removing fastq files from {}".format(proj_fc_root))
                if not dry_run:
                    if _remove_files(fc_info['fq_files']):
                        logger.info("Removed fastq files from FC {} for project {}, marking it as cleaned".format(fc, proj))
                        _touch_cleaned(proj_fc_root)
                        removed_fc.append(fc)
            if len(fastq_fc) == len(removed_fc):
                try:
                    proj_data_root = fastq_info['proj_data']['proj_data_root']
                    logger.info("All flowcells cleaned for this project, marking it as cleaned in {}".format(proj_data_root))
                    _touch_cleaned(proj_data_root)
                except:
                    pass
            
        analysis_info = info.get('analysis_to_remove')
        if analysis_info and isinstance(analysis_info, dict):
            proj_analysis_root = analysis_info['proj_analysis_root']
            logger.info("cleaning analysis data for project {}".format(proj))
            removed_qc = []
            for qc, files in analysis_info['analysis_files'].iteritems():
                logger.info("Removing files of '{}' from {}".format(qc, proj_analysis_root))
                if not dry_run:
                    if _remove_files(files):
                        removed_qc.append(qc)
                    else:
                        logger.warn("Couldn't remove some files in qc directory '{}'".format(qc))
            map(analysis_info['analysis_files'].pop, removed_qc)
            if len(analysis_info['analysis_files']) == 0:
                logger.info("Removed analysis data for project {}, marking it cleaned".format(proj))
                _touch_cleaned(proj_analysis_root)
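
Note: the clean_undetermined branch only removes undetermined fastq files from flowcells whose project directories have all been cleaned. A compact sketch of the per-flowcell collection step, following the config layout in the docstring (relative_project_source and undet_file_pattern); the function name here is illustrative:

import glob
import os

def collect_undetermined(fc_path, project_source="Demultiplexing",
                         undet_pattern="Undetermined_*.fastq.gz"):
    """Return (files, total_bytes) for the undetermined fastq files of one flowcell."""
    files = [os.path.abspath(f)
             for f in glob.glob(os.path.join(fc_path, project_source, undet_pattern))]
    total_bytes = sum(os.path.getsize(f) for f in files)
    return files, total_bytes
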
Beispiel #41
0
 def encrypt_runs(cls, run, force):
     """Encrypt the runs that have been collected"""
     bk = cls(run)
     bk.collect_runs(ext=".tar.gz")
     logger.info("In total, found {} run(s) to be encrypted".format(len(bk.runs)))
     for run in bk.runs:
         run.flag = "{}.encrypting".format(run.name)
         run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted)
         tmp_files = [run.zip_encrypted, run.key_encrypted, run.key, run.flag]
         logger.info("Encryption of run {} is now started".format(run.name))
         # Check if there is enough space and exit if not
         bk.avail_disk_space(run.path, run.name)
         # Check if the run is demultiplexed
         if not force and bk.check_demux:
             if not misc.run_is_demuxed(run.name, bk.couch_info):
                 logger.warn("Run {} is not demultiplexed yet, so skipping it".format(run.name))
                 continue
             logger.info("Run {} is demultiplexed and proceeding with encryption".format(run.name))
         with filesystem.chdir(run.path):
             # skip run if already ongoing
             if os.path.exists(run.flag):
                 logger.warn("Run {} is already being encrypted, so skipping now".format(run.name))
                 continue
             open(run.flag, 'w').close()  # create the flag file marking this run as being encrypted
             # zip the run directory
             if os.path.exists(run.zip):
                 if os.path.isdir(run.name):
                     logger.warn("Both run source and zipped archive exist for run {}, skipping run as precaution".format(run.name))
                     bk._clean_tmp_files([run.flag])
                     continue
                 logger.info("Zipped archive already exist for run {}, so using it for encryption".format(run.name))
             else:
                 logger.info("Creating zipped archive for run {}".format(run.name))
                 if bk._call_commands(cmd1="tar -cf - {}".format(run.name), cmd2="pigz --fast -c -",
                                      out_file=run.zip, mail_failed=True, tmp_files=[run.zip, run.flag]):
                     logger.info("Run {} was successfully compressed, so removing the run source directory".format(run.name))
                     shutil.rmtree(run.name)
                 else:
                     logger.warn("Skipping run {} and moving on".format(run.name))
                     continue
             # Remove encrypted file if already exists
             if os.path.exists(run.zip_encrypted):
                 logger.warn(("Removing already existing encrypted file for run {}, this is a precaution "
                              "to make sure the file was encrypted with correct key file".format(run.name)))
                 bk._clean_tmp_files([run.zip_encrypted, run.key, run.key_encrypted, run.dst_key_encrypted])
             # Generate a random key to use as passphrase
             if not bk._call_commands(cmd1="gpg --gen-random 1 256", out_file=run.key, tmp_files=tmp_files):
                 logger.warn("Skipping run {} and moving on".format(run.name))
                 continue
             logger.info("Generated randon phrase key for run {}".format(run.name))
             # Calculate md5 sum pre encryption
             if not force:
                 logger.info("Calculating md5sum before encryption")
                 md5_call, md5_out = bk._call_commands(cmd1="md5sum {}".format(run.zip), return_out=True, tmp_files=tmp_files)
                 if not md5_call:
                     logger.warn("Skipping run {} and moving on".format(run.name))
                     continue
                 md5_pre_encrypt = md5_out.split()[0]
             # Encrypt the zipped run file
             logger.info("Encrypting the zipped run file")
             if not bk._call_commands(cmd1=("gpg --symmetric --cipher-algo aes256 --passphrase-file {} --batch --compress-algo "
                                            "none -o {} {}".format(run.key, run.zip_encrypted, run.zip)), tmp_files=tmp_files):
                 logger.warn("Skipping run {} and moving on".format(run.name))
                 continue
             # Decrypt and check for md5
             if not force:
                 logger.info("Calculating md5sum after encryption")
                 md5_call, md5_out = bk._call_commands(cmd1="gpg --decrypt --cipher-algo aes256 --passphrase-file {} --batch {}".format(run.key, run.zip_encrypted),
                                                       cmd2="md5sum", return_out=True, tmp_files=tmp_files)
                 if not md5_call:
                     logger.warn("Skipping run {} and moving on".format(run.name))
                     continue
                 md5_post_encrypt = md5_out.split()[0]
                 if md5_pre_encrypt != md5_post_encrypt:
                     logger.error(("md5sum did not match before {} and after {} encryption. Will remove temp files and "
                                   "move on".format(md5_pre_encrypt, md5_post_encrypt)))
                     bk._clean_tmp_files(tmp_files)
                     continue
                 logger.info("Md5sum is macthing before and after encryption")
             # Encrypt and move the key file
             if bk._call_commands(cmd1="gpg -e -r {} -o {} {}".format(bk.gpg_receiver, run.key_encrypted, run.key), tmp_files=tmp_files):
                 shutil.move(run.key_encrypted, run.dst_key_encrypted)
             else:
                 logger.error("Encrption of key file failed, skipping run")
                 continue
             bk._clean_tmp_files([run.zip, run.key, run.flag])
             logger.info("Encryption of run {} is successfully done, removing zipped run file".format(run.name))
Beispiel #42
0
    def demultiplex_run(self):
        """
           Demultiplex a run:
            - Make sub-samplesheet based on sample classes
            - Decide correct bcl2fastq command parameters based on sample classes
            - run bcl2fastq conversion
        """
        # Check sample types
        sample_type_list = []
        for lane, lane_contents in self.sample_table.items():
            for sample in lane_contents:
                sample_detail = sample[1]
                sample_type = sample_detail['sample_type']
                if sample_type not in sample_type_list:
                    sample_type_list.append(sample_type)

        # Go through sample_table for demultiplexing
        bcl2fastq_cmd_counter = 0
        for sample_type in sorted(sample_type_list):
            # Looking for lanes with multiple masks under the same sample type
            lane_table = dict()
            for lane, lane_contents in self.sample_table.items():
                for sample in lane_contents:
                    sample_detail = sample[1]
                    sample_type_t = sample_detail['sample_type']
                    sample_index_length = sample_detail['index_length']
                    if sample_type_t == sample_type:
                        if lane_table.get(lane):
                            if sample_index_length not in lane_table[lane]:
                                lane_table[lane].append(sample_index_length)
                        else:
                            lane_table.update({lane: [sample_index_length]})
            # Determine the number of demux needed for the same sample type
            demux_number_with_the_same_sample_type = len(
                max(lane_table.values(), key=len))
            # Prepare sub-samplesheets, masks and commands
            for i in range(0, demux_number_with_the_same_sample_type):
                # Prepare sub-samplesheet
                # A dictionary with lane and sample IDs to include
                samples_to_include = dict()
                # A dictionary with lane and index length for generating masks
                mask_table = dict()
                for lane, lane_contents in self.sample_table.items():
                    try:
                        index_length = lane_table[lane][i]
                        mask_table.update({lane: index_length})
                        for sample in lane_contents:
                            sample_name = sample[0]
                            sample_detail = sample[1]
                            sample_type_t = sample_detail['sample_type']
                            sample_index_length = sample_detail['index_length']
                            if sample_type_t == sample_type and sample_index_length == index_length:
                                if samples_to_include.get(lane):
                                    samples_to_include[lane].append(
                                        sample_name)
                                else:
                                    samples_to_include.update(
                                        {lane: [sample_name]})
                    except (KeyError, IndexError):
                        logger.info(
                            'No corresponding mask in lane {}. Skipping it.'.format(lane))
                        continue

                # Make sub-samplesheet
                with chdir(self.run_dir):
                    samplesheet_dest = 'SampleSheet_{}.csv'.format(
                        bcl2fastq_cmd_counter)
                    with open(samplesheet_dest, 'w') as fcd:
                        fcd.write(
                            _generate_samplesheet_subset(
                                self.runParserObj.samplesheet,
                                samples_to_include))

                # Prepare demultiplexing dir
                with chdir(self.run_dir):
                    # Create Demultiplexing dir, this changes the status to IN_PROGRESS
                    if not os.path.exists('Demultiplexing'):
                        os.makedirs('Demultiplexing')

                # Prepare demultiplexing command
                with chdir(self.run_dir):
                    cmd = self.generate_bcl_command(sample_type, mask_table,
                                                    bcl2fastq_cmd_counter)
                    misc.call_external_command_detached(
                        cmd,
                        with_log_files=True,
                        prefix='demux_{}'.format(bcl2fastq_cmd_counter))
                    logger.info(('BCL to FASTQ conversion and demultiplexing ' \
                    'started for run {} on {}'.format(os.path.basename(self.id),
                                                      datetime.now())))

                # Demultiplexing has been started for one mask type; the script will
                # continue with the next type, so the command counter increases by 1
                bcl2fastq_cmd_counter += 1
        return True
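
To make the nested bookkeeping above easier to follow, here is a small self-contained sketch of how a toy `sample_table` collapses into `lane_table` (index lengths per lane), `mask_table` and `samples_to_include` for one sample type and one demultiplexing round; the toy lane numbers, sample names, sample-type labels and index lengths are invented for illustration only.

# Toy sample_table: lane -> [(sample_name, {'sample_type': ..., 'index_length': ...}), ...]
sample_table = {
    '1': [('P1_101', {'sample_type': 'ordinary', 'index_length': 8}),
          ('P1_102', {'sample_type': 'ordinary', 'index_length': 8})],
    '2': [('P2_101', {'sample_type': 'ordinary', 'index_length': 6}),
          ('P2_102', {'sample_type': '10X', 'index_length': 8})],
}
sample_type = 'ordinary'

# lane_table: all distinct index lengths seen per lane for this sample type
lane_table = {}
for lane, lane_contents in sample_table.items():
    for name, detail in lane_contents:
        if detail['sample_type'] == sample_type:
            lane_table.setdefault(lane, [])
            if detail['index_length'] not in lane_table[lane]:
                lane_table[lane].append(detail['index_length'])
# -> {'1': [8], '2': [6]}

# One demultiplexing round is needed per position in the longest list
demux_rounds = len(max(lane_table.values(), key=len))  # -> 1

# For round i, pick the i-th index length of each lane and collect matching samples
i = 0
samples_to_include = {}
mask_table = {}
for lane, lengths in lane_table.items():
    index_length = lengths[i]
    mask_table[lane] = index_length
    samples_to_include[lane] = [name for name, detail in sample_table[lane]
                                if detail['sample_type'] == sample_type
                                and detail['index_length'] == index_length]
# -> mask_table == {'1': 8, '2': 6}
# -> samples_to_include == {'1': ['P1_101', 'P1_102'], '2': ['P2_101']}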
Beispiel #43
0
    def test_chdir(self):
        """Ensure start dir and end dir are the same."""
        initial_dir = os.getcwd()
        with filesystem.chdir(self.rootdir):
            pass
        final_dir = os.getcwd()
        self.assertEqual(initial_dir, final_dir)
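
For context, the `filesystem.chdir` helper exercised by this test is used as a context manager throughout the examples above. A plausible minimal implementation, shown here as an assumption rather than the project's actual code, is a generator-based context manager that restores the previous working directory on exit:

import contextlib
import os

@contextlib.contextmanager
def chdir(new_dir):
    """Change to new_dir for the duration of the with-block, then change back."""
    previous_dir = os.getcwd()
    os.chdir(new_dir)
    try:
        yield
    finally:
        # Restore the original working directory even if the block raised
        os.chdir(previous_dir)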