Example #1
def cleanup_processing(seconds):
    """Cleanup runs in processing server.

    :param int seconds: Days/hours converted to seconds to consider a run to be old
    """
    try:
        #Remove old runs from archiving dirs
        for archive_dir in CONFIG.get('storage').get('archive_dirs').values():
            logger.info('Removing old runs in {}'.format(archive_dir))
            with filesystem.chdir(archive_dir):
                for run in [r for r in os.listdir(archive_dir) if re.match(filesystem.RUN_RE, r)]:
                    rta_file = os.path.join(run, finished_run_indicator)
                    if os.path.exists(rta_file):
                        if os.stat(rta_file).st_mtime < time.time() - seconds:
                            logger.info('Removing old run {}'.format(os.path.basename(run)))
                            shutil.rmtree(run)
                        else:
                            logger.info('{} file exists but is not older than the given time, skipping run {}'.format(
                                        finished_run_indicator, run))
    except IOError:
        sbj = "Cannot archive old runs in processing server"
        msg = ("Could not find transfer.tsv file, so I cannot decide if I should "
               "archive any run or not.")
        cnt = CONFIG.get('contact', None)
        if not cnt:
            cnt = "{}@localhost".format(getpass.getuser())
        logger.error(msg)
        misc.send_mail(sbj, msg, cnt)
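
For reference, a minimal sketch of the CONFIG entries that cleanup_processing reads; only the key names come from the code above, the values are made-up placeholders.

CONFIG = {
    'storage': {
        # read with .values(), so archive_dirs is expected to be a dict
        'archive_dirs': {'hiseq': '/srv/archive/hiseq', 'miseq': '/srv/archive/miseq'},
    },
    'contact': 'ops@example.com',  # optional; falls back to <current user>@localhost
}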
Example #2
 def __init__(self,
              projectid=None,
              sampleid=None,
              pi_email=None,
              sensitive=True,
              hard_stage_only=False,
              add_user=None,
              **kwargs):
     super(GrusProjectDeliverer, self).__init__(projectid, sampleid,
                                                **kwargs)
     self.stagingpathhard = getattr(self, 'stagingpathhard', None)
     if self.stagingpathhard is None:
         raise AttributeError(
             "stagingpathhard is required when delivering to GRUS")
     self.config_snic = CONFIG.get('snic', None)
     if self.config_snic is None:
          raise AttributeError(
              "snic configuration is needed when delivering to GRUS (snic_api_url, snic_api_user, snic_api_password)"
          )
     self.config_statusdb = CONFIG.get('statusdb', None)
     if self.config_statusdb is None:
          raise AttributeError(
              "statusdb configuration is needed when delivering to GRUS (url, username, password, port)"
          )
     self.orderportal = CONFIG.get(
         'order_portal', None
     )  # do not need to raise exception here, I have already checked for this and monitoring does not need it
     if self.orderportal:
         self._set_pi_details(pi_email)  # set PI email and SNIC id
         self._set_other_member_details(
             add_user,
             CONFIG.get('add_project_owner',
                        False))  # set SNIC id for other project members
     self.sensitive = sensitive
     self.hard_stage_only = hard_stage_only
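
A sketch of the configuration sections that GrusProjectDeliverer requires, with key names taken from the error messages above; all values here are placeholders.

CONFIG = {
    'snic': {
        'snic_api_url': 'https://snic.example.org/api',
        'snic_api_user': 'api_user',
        'snic_api_password': 'secret',
    },
    'statusdb': {
        'url': 'statusdb.example.org',
        'username': 'db_user',
        'password': 'secret',
        'port': 5984,  # placeholder; CouchDB's default port
    },
    'order_portal': {},  # optional: only used to resolve PI and member details
}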
Example #3
def cleanup_processing(seconds):
    """Cleanup runs in processing server.

    :param int seconds: Days/hours converted to seconds to consider a run to be old
    """
    try:
        #Remove old runs from archiving dirs
        for archive_dir in CONFIG.get('storage').get('archive_dirs').values():
            logger.info('Removing old runs in {}'.format(archive_dir))
            with filesystem.chdir(archive_dir):
                for run in [
                        r for r in os.listdir(archive_dir)
                        if re.match(filesystem.RUN_RE, r)
                ]:
                    rta_file = os.path.join(run, finished_run_indicator)
                    if os.path.exists(rta_file):
                        if os.stat(rta_file).st_mtime < time.time() - seconds:
                            logger.info(
                                'Removing old run {}'.format(
                                    os.path.basename(run)))
                            shutil.rmtree(run)
                        else:
                            logger.info(
                                '{} file exists but is not older than the given time, skipping run {}'
                                .format(finished_run_indicator, run))
    except IOError:
        sbj = "Cannot archive old runs in processing server"
        msg = (
            "Could not find transfer.tsv file, so I cannot decide if I should "
            "archive any run or not.")
        cnt = CONFIG.get('contact', None)
        if not cnt:
            cnt = "{}@localhost".format(getpass.getuser())
        logger.error(msg)
        misc.send_mail(sbj, msg, cnt)
Example #4
def find_runs_to_process():
    """Find nanopore runs to process."""
    nanopore_data_dir = CONFIG.get('nanopore_analysis').get('data_dir')[0]
    found_run_dirs = []
    skip_dirs = CONFIG.get('nanopore_analysis').get('ignore_dirs')
    try:
        found_top_dirs = [
            os.path.join(nanopore_data_dir, top_dir)
            for top_dir in os.listdir(nanopore_data_dir)
            if os.path.isdir(os.path.join(nanopore_data_dir, top_dir))
            and top_dir not in skip_dirs
        ]
    except OSError:
        logger.warn('There was an issue locating the following directory: {}. '
                    'Please check that it exists and try again.'.format(
                        nanopore_data_dir))
        found_top_dirs = []  # avoid a NameError below if the directory could not be listed
    # Get the actual location of the run directories in /var/lib/MinKnow/data/QC_runs/USERDETERMINEDNAME/USERDETSAMPLENAME/run
    if found_top_dirs:
        for top_dir in found_top_dirs:
            if os.path.isdir(top_dir):
                for sample_dir in os.listdir(top_dir):
                    if os.path.isdir(os.path.join(top_dir, sample_dir)):
                        for run_dir in os.listdir(
                                os.path.join(top_dir, sample_dir)):
                            found_run_dirs.append(
                                os.path.join(top_dir, sample_dir, run_dir))
    else:
        logger.warn('Could not find any run directories in {}'.format(
            nanopore_data_dir))
    return found_run_dirs
Example #5
def cleanup_nas(seconds):
    """Move finished runs on NASes to the nosync directory.

    :param int seconds: Days/hours converted to seconds to consider a run to be old
    """
    couch_info = CONFIG.get('statusdb')
    mail_recipients = CONFIG.get('mail', {}).get('recipients')
    check_demux = CONFIG.get('storage', {}).get('check_demux', False)
    host_name = os.getenv('HOSTNAME', os.uname()[1]).split('.', 1)[0]
    for data_dir in CONFIG.get('storage').get('data_dirs'):
        logger.info('Moving old runs in {}'.format(data_dir))
        with filesystem.chdir(data_dir):
            for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]:
                rta_file = os.path.join(run, finished_run_indicator)
                if os.path.exists(rta_file):
                    if check_demux:
                        if misc.run_is_demuxed(run, couch_info):
                            logger.info('Moving run {} to nosync directory'.format(os.path.basename(run)))
                            shutil.move(run, 'nosync')
                        elif os.stat(rta_file).st_mtime < time.time() - seconds:
                            logger.warn('Run {} is older than the given time, but it is not demultiplexed yet'
                                        .format(run))
                            sbt = "Run not demultiplexed - {}".format(run)
                            msg = ("Run '{}' in '{}' is older than the given threshold, but it does not seem to be "
                                  "demultiplexed yet".format(os.path.join(data_dir, run), host_name))
                            misc.send_mail(sbt, msg, mail_recipients)
                    else:
                        if os.stat(rta_file).st_mtime < time.time() - seconds:
                            logger.info('Moving run {} to nosync directory'.format(os.path.basename(run)))
                            shutil.move(run, 'nosync')
                        else:
                            logger.info('{} file exists but is not older than the given time, skipping run {}'
                                        .format(finished_run_indicator, run))
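
The seconds argument is a plain age threshold, so a hypothetical caller converts days (or hours) before calling, for example:

days_old = 7  # made-up threshold
cleanup_nas(days_old * 24 * 60 * 60)  # pass the age in seconds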
Example #6
def cleanup_nas(seconds):
    """Move finished runs on NASes to the nosync directory.

    :param int seconds: Days/hours converted to seconds to consider a run to be old
    """
    couch_info = CONFIG.get('statusdb')
    mail_recipients = CONFIG.get('mail', {}).get('recipients')
    check_demux = CONFIG.get('storage', {}).get('check_demux', False)
    host_name = os.getenv('HOSTNAME', os.uname()[1]).split('.', 1)[0]
    for data_dir in CONFIG.get('storage').get('data_dirs'):
        if not os.path.exists(data_dir) or not os.path.isdir(data_dir):
            logger.warn(
                "Data directory '{}' does not exist or is not a directory".format(
                    data_dir))
            continue
        logger.info('Moving old runs in {}'.format(data_dir))
        with filesystem.chdir(data_dir):
            for run in [
                    r for r in os.listdir(data_dir)
                    if re.match(filesystem.RUN_RE, r)
            ]:
                rta_file = os.path.join(run, finished_run_indicator)
                if os.path.exists(rta_file):
                    if check_demux:
                        if misc.run_is_demuxed(run, couch_info):
                            logger.info(
                                'Moving run {} to nosync directory'.format(
                                    os.path.basename(run)))
                            shutil.move(run, 'nosync')
                        elif 'miseq' in data_dir:
                            miseq_run = MiSeq_Run(run, CONFIG)
                            if miseq_run.get_run_type() == 'NON-NGI-RUN':
                                logger.info(
                                    'Run {} is a non-platform run, so moving it to nosync directory'
                                    .format(os.path.basename(run)))
                                shutil.move(run, 'nosync')
                        elif os.stat(
                                rta_file).st_mtime < time.time() - seconds:
                            logger.warn(
                                'Run {} is older than the given time, but it is not demultiplexed yet'
                                .format(run))
                            sbt = "Run not demultiplexed - {}".format(run)
                            msg = (
                                "Run '{}' in '{}' is older than the given threshold, but it does not seem to be "
                                "demultiplexed yet".format(
                                    os.path.join(data_dir, run), host_name))
                            misc.send_mail(sbt, msg, mail_recipients)
                    else:
                        if os.stat(rta_file).st_mtime < time.time() - seconds:
                            logger.info(
                                'Moving run {} to nosync directory'.format(
                                    os.path.basename(run)))
                            shutil.move(run, 'nosync')
                        else:
                            logger.info(
                                '{} file exists but is not older than the given time, skipping run {}'
                                .format(finished_run_indicator, run))
Example #7
def cleanup_uppmax(site, days, dry_run=False):
    """Remove projects/runs that have been closed for more than 'days' days
    from the given 'site' on uppmax

    :param str site: site where the cleanup should be performed
    :param int days: number of days to check for closed projects
    """
    days = check_days(site, days, config)
    if not days:
        return
    root_dir = CONFIG.get("cleanup").get(site).get("root")
    deleted_log = CONFIG.get("cleanup").get("deleted_log")
    assert os.path.exists(os.path.join(root_dir, deleted_log)), "Log directory {} doesn't exist in {}".format(
        deleted_log, root_dir
    )
    log_file = os.path.join(root_dir, "{fl}/{fl}.log".format(fl=deleted_log))

    # make a connection for project db #
    pcon = statusdb.ProjectSummaryConnection()
    assert pcon, "Could not connect to project database in StatusDB"

    if site != "archive":
        ## work flow for cleaning up illumina/analysis ##
        projects = [p for p in os.listdir(root_dir) if re.match(filesystem.PROJECT_RE, p)]
        list_to_delete = get_closed_projects(projects, pcon, days)
    else:
        ##work flow for cleaning archive ##
        list_to_delete = []
        archived_in_swestore = filesystem.list_runs_in_swestore(
            path=CONFIG.get("cleanup").get("swestore").get("root"), no_ext=True
        )
        runs = [r for r in os.listdir(root_dir) if re.match(filesystem.RUN_RE, r)]
        with filesystem.chdir(root_dir):
            for run in runs:
                fc_date = run.split("_")[0]
                if misc.days_old(fc_date) > days:
                    if run in archived_in_swestore:
                        list_to_delete.append(run)
                    else:
                        logger.warn(
                            "Run {} is older than {} days but not in " "swestore, so SKIPPING".format(run, days)
                        )

    ## delete and log
    for item in list_to_delete:
        if dry_run:
            logger.info("Will remove {} from {}".format(item, root_dir))
            continue
        try:
            shutil.rmtree(os.path.join(root_dir, item))
            logger.info("Removed project {} from {}".format(item, root_dir))
            with open(log_file, "a") as to_log:
                to_log.write("{}\t{}\n".format(item, datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M")))
        except OSError:
            logger.warn("Could not remove path {} from {}".format(item, root_dir))
            continue
Example #8
def archive_to_swestore(seconds,
                        run=None,
                        max_runs=None,
                        force=False,
                        compress_only=False):
    """Send runs (as archives) in NAS nosync to swestore for backup

    :param int seconds: Days/hours converted as seconds to check
    :param str run: specific run to send to swestore
    :param int max_runs: number of runs to be processed simultaneously
    :param bool force: Force the archiving even if the run is not complete
    :param bool compress_only: Compress the run without sending it to swestore
    """
    # If the run is specified in the command line, check that exists and archive
    if run:
        run = os.path.basename(run)
        base_dir = os.path.dirname(run)
        if re.match(filesystem.RUN_RE, run):
            # If the parameter is not an absolute path, find the run in the archive_dirs
            if not base_dir:
                for archive_dir in CONFIG.get('storage').get('archive_dirs'):
                    if os.path.exists(os.path.join(archive_dir, run)):
                        base_dir = archive_dir
            if not os.path.exists(os.path.join(base_dir, run)):
                logger.error(("Run {} not found. Please make sure to specify "
                              "an absolute path, or a relative path while in "
                              "the correct directory.".format(run)))
            else:
                with filesystem.chdir(base_dir):
                    _archive_run((run, seconds, force, compress_only))
        else:
            logger.error(
                "The name {} doesn't look like an Illumina run".format(
                    os.path.basename(run)))
    # Otherwise find all runs in every data dir on the nosync partition
    else:
        logger.info("Archiving old runs to SWESTORE")
        for to_send_dir in CONFIG.get('storage').get('archive_dirs'):
            logger.info('Checking {} directory'.format(to_send_dir))
            with filesystem.chdir(to_send_dir):
                to_be_archived = [
                    r for r in os.listdir(to_send_dir)
                    if re.match(filesystem.RUN_RE, r) and
                    not os.path.exists("{}.archiving".format(r.split('.')[0]))
                ]
                if to_be_archived:
                    pool = Pool(processes=len(to_be_archived
                                              ) if not max_runs else max_runs)
                    pool.map_async(_archive_run,
                                   ((run, seconds, force, compress_only)
                                    for run in to_be_archived))
                    pool.close()
                    pool.join()
                else:
                    logger.info('No old runs to be archived')
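
A hypothetical direct invocation, assuming a 30-day threshold expressed in seconds as the docstring requires:

archive_to_swestore(seconds=30 * 86400, max_runs=2, compress_only=True)  # compress runs but skip the Swestore upload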
Example #9
def _archive_run((run, days, force, compress_only)):
    """ Archive a specific run to swestore

    :param str run: Run directory
    :param int days: Days to consider a run old
    :param bool force: Force the archiving even if the run is not complete
    :param bool compress_only: Only compress the run without sending it to swestore
    """

    def _send_to_swestore(f, dest, remove=True):
        """ Send file to swestore, checking adler32 on the destination and, if requested,
        removing the file from disk

        :param str f: File to send
        :param str dest: Destination directory in Swestore
        :param bool remove: If True, remove original file from source
        """
        if not filesystem.is_in_swestore(f):
            logger.info("Sending {} to swestore".format(f))
            misc.call_external_command("iput -K -P {file} {dest}".format(file=f, dest=dest), with_log_files=True)
            logger.info("Run {} sent correctly and checksum was okay.".format(f))
            if remove:
                logger.info("Removing run".format(f))
                os.remove(f)
        else:
            logger.warn("Run {} is already in Swestore, not sending it again nor removing from the disk".format(f))

    # Create state file to say that the run is being archived
    open("{}.archiving".format(run.split(".")[0]), "w").close()
    if run.endswith("bz2"):
        if os.stat(run).st_mtime < time.time() - (86400 * days):
            _send_to_swestore(run, CONFIG.get("storage").get("irods").get("irodsHome"))
        else:
            logger.info("Run {} is not {} days old yet. Not archiving".format(run, str(days)))
    else:
        rta_file = os.path.join(run, "RTAComplete.txt")
        if not os.path.exists(rta_file) and not force:
            logger.warn(
                (
                    "Run {} doesn't seem to be completed and --force option was "
                    "not enabled, not archiving the run".format(run)
                )
            )
        if force or (os.path.exists(rta_file) and os.stat(rta_file).st_mtime < time.time() - (86400 * days)):
            logger.info("Compressing run {}".format(run))
            # Compress with pbzip2
            misc.call_external_command("tar --use-compress-program=pbzip2 -cf {run}.tar.bz2 {run}".format(run=run))
            logger.info("Run {} successfully compressed! Removing from disk...".format(run))
            shutil.rmtree(run)
            if not compress_only:
                _send_to_swestore("{}.tar.bz2".format(run), CONFIG.get("storage").get("irods").get("irodsHome"))
        else:
            logger.info("Run {} is not completed or is not {} days old yet. Not archiving".format(run, str(days)))
    os.remove("{}.archiving".format(run.split(".")[0]))
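
The def _archive_run((run, days, force, compress_only)) signature relies on Python 2 tuple parameter unpacking, which was removed in Python 3 (PEP 3113). A sketch of an equivalent Python 3 signature that keeps the single-tuple calling convention used with Pool.map_async:

def _archive_run(args):
    # unpack manually; map_async still passes one (run, days, force, compress_only) tuple per run
    run, days, force, compress_only = args
    ...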
Example #10
def cleanup_uppmax(site, days, dry_run=False):
    """Remove projects/runs that have been closed for more than 'days' days
    from the given 'site' on uppmax

    :param str site: site where the cleanup should be performed
    :param int days: number of days to check for closed projects
    """
    days = check_days(site, days, config)
    if not days:
        return
    root_dir = CONFIG.get('cleanup').get(site).get('root')
    deleted_log = CONFIG.get('cleanup').get('deleted_log')
    assert os.path.exists(os.path.join(root_dir,deleted_log)), "Log directory {} doesn't exist in {}".format(deleted_log,root_dir)
    log_file = os.path.join(root_dir,"{fl}/{fl}.log".format(fl=deleted_log))

    # make a connection for project db #
    pcon = statusdb.ProjectSummaryConnection()
    assert pcon, "Could not connect to project database in StatusDB"

    if site != "archive":
        ## work flow for cleaning up illumina/analysis ##
        projects = [ p for p in os.listdir(root_dir) if re.match(filesystem.PROJECT_RE,p) ]
        list_to_delete = get_closed_projects(projects, pcon, days)
    else:
        ##work flow for cleaning archive ##
        list_to_delete = []
        archived_in_swestore = filesystem.list_runs_in_swestore(path=CONFIG.get('cleanup').get('swestore').get('root'), no_ext=True)
        runs = [ r for r in os.listdir(root_dir) if re.match(filesystem.RUN_RE,r) ]
        with filesystem.chdir(root_dir):
            for run in runs:
                fc_date = run.split('_')[0]
                if misc.days_old(fc_date) > days:
                    if run in archived_in_swestore:
                        list_to_delete.append(run)
                    else:
                        logger.warn("Run {} is older than {} days but not in "
                                    "swestore, so SKIPPING".format(run, days))

    ## delete and log
    for item in list_to_delete:
        if dry_run:
            logger.info('Will remove {} from {}'.format(item,root_dir))
            continue
        try:
            shutil.rmtree(os.path.join(root_dir,item))
            logger.info('Removed project {} from {}'.format(item,root_dir))
            with open(log_file,'a') as to_log:
                to_log.write("{}\t{}\n".format(item,datetime.strftime(datetime.now(),'%Y-%m-%d %H:%M')))
        except OSError:
            logger.warn("Could not remove path {} from {}"
                        .format(item,root_dir))
            continue
Example #11
 def fetch_config_info(self):
     """Try to fetch required info from the config file. Log and exit if any necessary info is missing."""
     try:
         self.data_dirs = CONFIG['backup']['data_dirs']
         self.archive_dirs = CONFIG['backup']['archive_dirs']
         self.keys_path = CONFIG['backup']['keys_path']
         self.gpg_receiver = CONFIG['backup']['gpg_receiver']
         self.mail_recipients = CONFIG['mail']['recipients']
         self.check_demux = CONFIG.get('backup', {}).get('check_demux', False)
         self.couch_info = CONFIG.get('statusdb')
     except KeyError as e:
         logger.error('Config file is missing the key {}, make sure it has all the required information'.format(str(e)))
         raise SystemExit
Example #12
File: backup.py Project: vezzi/TACA
 def fetch_config_info(self):
     """Try to fetch required info from the config file. Log and exit if any necessary info is missing"""
     try:
         self.data_dirs = CONFIG['backup']['data_dirs']
         self.archive_dirs = CONFIG['backup']['archive_dirs']
         self.keys_path = CONFIG['backup']['keys_path']
         self.gpg_receiver = CONFIG['backup']['gpg_receiver']
         self.mail_recipients = CONFIG['mail']['recipients']
         self.check_demux = CONFIG.get('backup', {}).get('check_demux', False)
         self.couch_info = CONFIG.get('statusdb')
     except KeyError as e:
         logger.error("Config file is missing the key {}, make sure it has all the required information".format(str(e)))
         raise SystemExit
Example #13
def _archive_run((run, seconds, force, compress_only)):
    """ Archive a specific run to swestore

    :param str run: Run directory
    :param int seconds: Days/hours converted as seconds to check
    :param bool force: Force the archiving even if the run is not complete
    :param bool compress_only: Only compress the run without sending it to swestore
    """

    def _send_to_swestore(f, dest, remove=True):
        """ Send file to swestore, checking adler32 on the destination and, if requested,
        removing the file from disk

        :param str f: File to send
        :param str dest: Destination directory in Swestore
        :param bool remove: If True, remove original file from source
        """
        if not filesystem.is_in_swestore(f):
            logger.info("Sending {} to swestore".format(f))
            misc.call_external_command('iput -R swestoreArchCacheResc -P {file} {dest}'.format(file=f, dest=dest),
                    with_log_files=True, prefix=f.replace('.tar.bz2',''), log_dir="swestore_logs")
            logger.info('Run {} sent to swestore.'.format(f))
            if remove:
                logger.info('Removing run {}'.format(f))
                os.remove(f)
        else:
            logger.warn('Run {} is already in Swestore, not sending it again nor removing from the disk'.format(f))

    # Create state file to say that the run is being archived
    open("{}.archiving".format(run.split('.')[0]), 'w').close()
    if run.endswith('bz2'):
        if os.stat(run).st_mtime < time.time() - seconds:
            _send_to_swestore(run, CONFIG.get('storage').get('irods').get('irodsHome'))
        else:
            logger.info("Run {} is not older than given time yet. Not archiving".format(run))
    else:
        rta_file = os.path.join(run, finished_run_indicator)
        if not os.path.exists(rta_file) and not force:
            logger.warn(("Run {} doesn't seem to be completed and --force option was "
                      "not enabled, not archiving the run".format(run)))
        if force or (os.path.exists(rta_file) and os.stat(rta_file).st_mtime < time.time() - seconds):
            logger.info("Compressing run {}".format(run))
            # Compress with pbzip2
            misc.call_external_command('tar --use-compress-program=pbzip2 -cf {run}.tar.bz2 {run}'.format(run=run))
            logger.info('Run {} successfully compressed! Removing from disk...'.format(run))
            shutil.rmtree(run)
            if not compress_only:
                _send_to_swestore('{}.tar.bz2'.format(run), CONFIG.get('storage').get('irods').get('irodsHome'))
        else:
            logger.info("Run {} is not completed or is not older than given time yet. Not archiving".format(run))
    os.remove("{}.archiving".format(run.split('.')[0]))
Example #14
def _archive_run((run, days, force, compress_only)):
    """ Archive a specific run to swestore

    :param str run: Run directory
    :param int days: Days to consider a run old
    :param bool force: Force the archiving even if the run is not complete
    :param bool compress_only: Only compress the run without sending it to swestore
    """

    def _send_to_swestore(f, dest, remove=True):
        """ Send file to swestore, checking adler32 on the destination and, if requested,
        removing the file from disk

        :param str f: File to send
        :param str dest: Destination directory in Swestore
        :param bool remove: If True, remove original file from source
        """
        if not filesystem.is_in_swestore(f):
            logger.info("Sending {} to swestore".format(f))
            misc.call_external_command('iput -K -P {file} {dest}'.format(file=f, dest=dest),
                    with_log_files=True)
            logger.info('Run {} sent correctly and checksum was okay.'.format(f))
            if remove:
                logger.info('Removing run {}'.format(f))
                os.remove(f)
        else:
            logger.warn('Run {} is already in Swestore, not sending it again nor removing from the disk'.format(f))

    # Create state file to say that the run is being archived
    open("{}.archiving".format(run.split('.')[0]), 'w').close()
    if run.endswith('bz2'):
        if os.stat(run).st_mtime < time.time() - (86400 * days):
            _send_to_swestore(run, CONFIG.get('storage').get('irods').get('irodsHome'))
        else:
            logger.info("Run {} is not {} days old yet. Not archiving".format(run, str(days)))
    else:
        rta_file = os.path.join(run, 'RTAComplete.txt')
        if not os.path.exists(rta_file) and not force:
            logger.warn(("Run {} doesn't seem to be completed and --force option was "
                      "not enabled, not archiving the run".format(run)))
        if force or (os.path.exists(rta_file) and os.stat(rta_file).st_mtime < time.time() - (86400 * days)):
            logger.info("Compressing run {}".format(run))
            # Compress with pbzip2
            misc.call_external_command('tar --use-compress-program=pbzip2 -cf {run}.tar.bz2 {run}'.format(run=run))
            logger.info('Run {} successfully compressed! Removing from disk...'.format(run))
            shutil.rmtree(run)
            if not compress_only:
                _send_to_swestore('{}.tar.bz2'.format(run), CONFIG.get('storage').get('irods').get('irodsHome'))
        else:
            logger.info("Run {} is not completed or is not {} days old yet. Not archiving".format(run, str(days)))
    os.remove("{}.archiving".format(run.split('.')[0]))
Example #15
def archive_to_swestore(days, run=None, max_runs=None, force=False, compress_only=False):
    """Send runs (as archives) in NAS nosync to swestore for backup

    :param int days: number of days to check against the threshold
    :param str run: specific run to send to swestore
    :param int max_runs: number of runs to be processed simultaneously
    :param bool force: Force the archiving even if the run is not complete
    :param bool compress_only: Compress the run without sending it to swestore
    """
    # If the run is specified in the command line, check that exists and archive
    if run:
        run = os.path.basename(run)
        base_dir = os.path.dirname(run)
        if re.match(filesystem.RUN_RE, run):
            # If the parameter is not an absolute path, find the run in the archive_dirs
            if not base_dir:
                for archive_dir in CONFIG.get("storage").get("archive_dirs"):
                    if os.path.exists(os.path.join(archive_dir, run)):
                        base_dir = archive_dir
            if not os.path.exists(os.path.join(base_dir, run)):
                logger.error(
                    (
                        "Run {} not found. Please make sure to specify "
                        "an absolute path, or a relative path while in "
                        "the correct directory.".format(run)
                    )
                )
            else:
                with filesystem.chdir(base_dir):
                    _archive_run((run, days, force, compress_only))
        else:
            logger.error("The name {} doesn't look like an Illumina run".format(os.path.basename(run)))
    # Otherwise find all runs in every data dir on the nosync partition
    else:
        logger.info("Archiving old runs to SWESTORE")
        for to_send_dir in CONFIG.get("storage").get("archive_dirs"):
            logger.info("Checking {} directory".format(to_send_dir))
            with filesystem.chdir(to_send_dir):
                to_be_archived = [
                    r
                    for r in os.listdir(to_send_dir)
                    if re.match(filesystem.RUN_RE, r) and not os.path.exists("{}.archiving".format(r.split(".")[0]))
                ]
                if to_be_archived:
                    pool = Pool(processes=len(to_be_archived) if not max_runs else max_runs)
                    pool.map_async(_archive_run, ((run, days, force, compress_only) for run in to_be_archived))
                    pool.close()
                    pool.join()
                else:
                    logger.info("No old runs to be archived")
Example #16
def transfer_run(run_dir, analysis):
    """ Interface for click to force a transfer of a run to uppmax
        :param str run_dir: the run to transfer
        :param bool analysis: whether or not to trigger the analysis
    """
    runObj = get_runObj(run_dir)
    mail_recipients = CONFIG.get('mail', {}).get('recipients')
    if runObj is None:
        mail_recipients = CONFIG.get('mail', {}).get('recipients')
        # Maybe throw an exception if possible?
        logger.error("Trying to force a transfer of run {} but the sequencer was not recognized.".format(run_dir))
    else:
        runObj.transfer_run(os.path.join("nosync",CONFIG['analysis']['status_dir'], 'transfer.tsv'),
                            analysis, mail_recipients) # do not start the analysis automatically if the transfer is forced
Example #17
def fail_run(runid, project):
    """Updates status of specified run or project-run to Failed"""
    username = CONFIG.get('statusdb', {}).get('username')
    password = CONFIG.get('statusdb', {}).get('password')
    url = CONFIG.get('statusdb', {}).get('url')
    port = CONFIG.get('statusdb', {}).get('port')
    status_db_url = "http://{username}:{password}@{url}:{port}".format(username=username, password=password, url=url, port=port)
    logger.info('Connecting to status db: {}:{}'.format(url, port))
    try:
        status_db = couchdb.Server(status_db_url)
    except Exception as e:
        logger.error("Can't connect to status_db: {}".format(status_db_url))
        logger.error(e)
        raise e
Example #18
def cleanup_processing(days):
    """Cleanup runs in processing server.

    :param int days: Number of days to consider a run to be old
    """
    transfer_file = os.path.join(CONFIG.get("preprocessing", {}).get("status_dir"), "transfer.tsv")
    if not days:
        days = CONFIG.get("cleanup", {}).get("processing-server", {}).get("days", 10)
    try:
        # Move finished runs to nosync
        for data_dir in CONFIG.get("storage").get("data_dirs"):
            logger.info("Moving old runs in {}".format(data_dir))
            with filesystem.chdir(data_dir):
                for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]:
                    if filesystem.is_in_file(transfer_file, run):
                        logger.info("Moving run {} to nosync directory".format(os.path.basename(run)))
                        shutil.move(run, "nosync")
                    else:
                        logger.info(
                            ("Run {} has not been transferred to the analysis " "server yet, not archiving".format(run))
                        )
        # Remove old runs from archiving dirs
        for archive_dir in CONFIG.get("storage").get("archive_dirs").values():
            logger.info("Removing old runs in {}".format(archive_dir))
            with filesystem.chdir(archive_dir):
                for run in [r for r in os.listdir(archive_dir) if re.match(filesystem.RUN_RE, r)]:
                    rta_file = os.path.join(run, "RTAComplete.txt")
                    if os.path.exists(rta_file):
                        # 1 day == 60*60*24 seconds --> 86400
                        if os.stat(rta_file).st_mtime < time.time() - (86400 * days) and filesystem.is_in_swestore(
                            "{}.tar.bz2".format(run)
                        ):
                            logger.info("Removing run {} to nosync directory".format(os.path.basename(run)))
                            shutil.rmtree(run)
                        else:
                            logger.info(
                                "RTAComplete.txt file exists but is not older than {} day(s), skipping run {}".format(
                                    str(days), run
                                )
                            )

    except IOError:
        sbj = "Cannot archive old runs in processing server"
        msg = "Could not find transfer.tsv file, so I cannot decide if I should " "archive any run or not."
        cnt = CONFIG.get("contact", None)
        if not cnt:
            cnt = "{}@localhost".format(getpass.getuser())
        logger.error(msg)
        misc.send_mail(sbj, msg, cnt)
Example #19
def transfer_run(run_dir):
    """Interface for click to force a transfer of a run to uppmax.

    :param str run_dir: the run to transfer
    """
    runObj = get_runObj(run_dir)
    mail_recipients = CONFIG.get('mail', {}).get('recipients')
    if runObj is None:
        mail_recipients = CONFIG.get('mail', {}).get('recipients')
        logger.error(
            'Trying to force a transfer of run {} but the sequencer was not recognized.'
            .format(run_dir))
    else:
        runObj.transfer_run(
            os.path.join('nosync', CONFIG['analysis']['status_dir'],
                         'transfer.tsv'), mail_recipients)
Example #20
def update_status_db(data, server_type=None):
    """ Push the data to status db.

    data can be from NASes;
    server_type should be 'nas'.
    """
    db_config = CONFIG.get('statusdb')
    if db_config is None:
        logging.error('"statusdb" must be present in the config file!')
        raise RuntimeError('"statusdb" must be present in the config file!')
    try:
        couch_connection = statusdb.StatusdbSession(db_config).connection
    except Exception as e:
        logging.error(str(e))
        raise

    db = couch_connection['server_status']
    logging.info('Connection established')
    for key in data.keys():  # data is dict of dicts
        server = data[key]  # data[key] is dictionary (the command output)
        server['name'] = key  # key is nas url
        # datetime.datetime(2015, 11, 18, 9, 54, 33, 473189) is not JSON serializable
        server['time'] = datetime.datetime.now().isoformat()
        server['server_type'] = server_type or 'unknown'

        try:
            db.save(server)
        except Exception as e:
            logging.error(str(e))
            raise
        else:
            logging.info('{}: Server status has been updated'.format(key))
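
The inline comments imply the following input shape; the NAS URL and payload below are made-up placeholders.

data = {
    'nas21.example.com': {'disk_space_used': '65%'},  # one dict of command output per NAS
}
update_status_db(data, server_type='nas')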
Example #21
def trigger_analysis(run_id):
    """ Trigger the analysis of the flowcell in the analysis server.

    :param str run_id: run/flowcell id
    """
    if not CONFIG.get('analysis', {}).get('analysis_server', {}):
        logger.warn(("No configuration found for remote analysis server. "
                     "Not triggering analysis of {}"
                     .format(os.path.basename(run_id))))
    else:
        url = ("http://{host}:{port}/flowcell_analysis/{dir}"
               .format(host=CONFIG['analysis']['analysis_server']['host'],
                       port=CONFIG['analysis']['analysis_server']['port'],
                       dir=os.path.basename(run_id)))
        params = {'path': CONFIG['analysis']['analysis_server']['sync']['data_archive']}
        try:
            r = requests.get(url, params=params)
            if r.status_code != requests.status_codes.codes.OK:
                logger.warn(("Something went wrong when triggering the "
                             "analysis of {}. Please check the logfile "
                             "and make sure to start the analysis!"
                             .format(os.path.basename(run_id))))
            else:
                logger.info('Analysis of flowcell {} triggered in {}'
                            .format(os.path.basename(run_id),
                                    CONFIG['analysis']['analysis_server']['host']))
                a_file = os.path.join(CONFIG['analysis']['status_dir'], 'analysis.tsv')
                with open(a_file, 'a') as analysis_file:
                    tsv_writer = csv.writer(analysis_file, delimiter='\t')
                    tsv_writer.writerow([os.path.basename(run_id), str(datetime.now())])
        except requests.exceptions.ConnectionError:
            logger.warn(("Something went wrong when triggering the analysis "
                         "of {}. Please check the logfile and make sure to "
                         "start the analysis!".format(os.path.basename(run_id))))
Example #22
def cleanup_processing(days):
    """Cleanup runs in processing server.

    :param int days: Number of days to consider a run to be old
    """
    transfer_file = os.path.join(CONFIG.get('preprocessing', {}).get('status_dir'), 'transfer.tsv')
    if not days:
        days = CONFIG.get('cleanup', {}).get('processing-server', {}).get('days', 10)
    try:
        #Move finished runs to nosync
        for data_dir in CONFIG.get('storage').get('data_dirs'):
            logger.info('Moving old runs in {}'.format(data_dir))
            with filesystem.chdir(data_dir):
                for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]:
                    if filesystem.is_in_file(transfer_file, run):
                        logger.info('Moving run {} to nosync directory'
                                    .format(os.path.basename(run)))
                        shutil.move(run, 'nosync')
                    else:
                        logger.info(("Run {} has not been transferred to the analysis "
                            "server yet, not archiving".format(run)))
        #Remove old runs from archiving dirs
        for archive_dir in CONFIG.get('storage').get('archive_dirs').values():
            logger.info('Removing old runs in {}'.format(archive_dir))
            with filesystem.chdir(archive_dir):
                for run in [r for r in os.listdir(archive_dir) if re.match(filesystem.RUN_RE, r)]:
                    rta_file = os.path.join(run, 'RTAComplete.txt')
                    if os.path.exists(rta_file):
                        # 1 day == 60*60*24 seconds --> 86400
                        if os.stat(rta_file).st_mtime < time.time() - (86400 * days) and \
                                filesystem.is_in_swestore("{}.tar.bz2".format(run)):
                            logger.info('Removing old run {}'
                                        .format(os.path.basename(run)))
                            shutil.rmtree(run)
                        else:
                            logger.info('RTAComplete.txt file exists but is not older than {} day(s), skipping run {}'.format(str(days), run))

    except IOError:
        sbj = "Cannot archive old runs in processing server"
        msg = ("Could not find transfer.tsv file, so I cannot decide if I should "
               "archive any run or not.")
        cnt = CONFIG.get('contact', None)
        if not cnt:
            cnt = "{}@localhost".format(getpass.getuser())
        logger.error(msg)
        misc.send_mail(sbj, msg, cnt)
Example #23
def update_cronjob_db():
    server = platform.node().split(".")[0]
    timestamp = datetime.datetime.now()
    # parse results
    result = _parse_crontab()
    # connect to db
    url = "http://{username}:{password}@{url}:{port}".format(
        url=CONFIG.get("statusdb", {}).get("url"),
        username=CONFIG.get("statusdb", {}).get("username"),
        password=CONFIG.get("statusdb", {}).get("password"),
        port=CONFIG.get("statusdb", {}).get("port"),
    )
    logging.info("Connecting to database: {}".format(CONFIG.get("statusdb", {}).get("url")))
    try:
        couch = couchdb.Server(url)
    except Exception as e:
        logging.error(str(e))
Example #24
def update_cronjob_db():
    server = platform.node().split('.')[0]
    timestamp = datetime.datetime.now()
    # parse results
    result = _parse_crontab()
    # connect to db
    url = "http://{username}:{password}@{url}:{port}".format(
        url=CONFIG.get('statusdb', {}).get('url'),
        username=CONFIG.get('statusdb', {}).get('username'),
        password=CONFIG.get('statusdb', {}).get('password'),
        port=CONFIG.get('statusdb', {}).get('port'))
    logging.info('Connecting to database: {}'.format(
        CONFIG.get('statusdb', {}).get('url')))
    try:
        couch = couchdb.Server(url)
    except Exception as e:
        logging.error(str(e))
Example #25
    def _get_pi_email(self):
        url = CONFIG.get('statusdb', {}).get('url')
        username = CONFIG.get('statusdb', {}).get('username')
        password = CONFIG.get('statusdb', {}).get('password')
        port = CONFIG.get('statusdb', {}).get('port')
        status_db_url = 'http://{}:{}@{}:{}'.format(username, password, url, port)

        status_db = couchdb.Server(status_db_url)
        orderportal_db = status_db['orderportal_ngi']
        view = orderportal_db.view('taca/project_id_to_pi_email')
        rows = view[self.projectid].rows
        if len(rows) < 1:
            raise AssertionError("Project {} not found in StatusDB: {}".format(self.projectid, url))
        if len(rows) > 1:
            raise AssertionError('Project {} has more than one entry in orderportal_db'.format(self.projectid))

        pi_email = rows[0].value
        return pi_email
Example #26
def process_promethion_run(promethion_run):
    """Process promethion runs."""
    email_recipients = CONFIG.get('mail').get('recipients')
    logger.info('Processing run {}'.format(promethion_run.run_id))

    if len(promethion_run.summary_file) and os.path.isfile(
            promethion_run.summary_file[0]):
        logger.info(
            'Sequencing done for run {}. Attempting to start processing.'.
            format(promethion_run.run_id))
        if promethion_run.is_not_transferred():
            if promethion_run.transfer_run():
                if promethion_run.update_transfer_log():
                    logger.info(
                        'Run {} has been synced to the analysis cluster.'.
                        format(promethion_run.run_id))
                else:
                    email_subject = ('Run processed with errors: {}'.format(
                        promethion_run.run_id))
                    email_message = (
                        'Run {} has been transferred, but an error occurred while updating '
                        'the transfer log').format(promethion_run.run_id)
                    send_mail(email_subject, email_message, email_recipients)

                if promethion_run.archive_run():
                    logger.info('Run {} is finished and has been archived. '
                                'Notifying operator.'.format(
                                    promethion_run.run_id))
                    email_subject = ('Run successfully processed: {}'.format(
                        promethion_run.run_id))
                    email_message = (
                        'Run {} has been transferred and archived '
                        'successfully.').format(promethion_run.run_id)
                    send_mail(email_subject, email_message, email_recipients)
                else:
                    email_subject = ('Run processed with errors: {}'.format(
                        promethion_run.run_id))
                    email_message = (
                        'Run {} has been analysed, but an error occurred during '
                        'archiving').format(promethion_run.run_id)
                    send_mail(email_subject, email_message, email_recipients)

            else:
                email_subject = ('Run processed with errors: {}'.format(
                    promethion_run.run_id))
                email_message = ('An error occurred during transfer of run {} '
                                 'to the analysis cluster.').format(
                                     promethion_run.run_id)
                send_mail(email_subject, email_message, email_recipients)

        else:
            logger.warn('The following run has already been transferred, '
                        'skipping: {}'.format(promethion_run.run_id))
    else:
        logger.info('Run {} not finished sequencing yet. Skipping.'.format(
            promethion_run.run_id))
Example #27
 def __init__(self, projectid=None, sampleid=None, pi_email=None, sensitive=True, hard_stage_only=False, **kwargs):
     super(GrusProjectDeliverer, self).__init__(
         projectid,
         sampleid,
         **kwargs
     )
     self.stagingpathhard = getattr(self, 'stagingpathhard', None)
     if self.stagingpathhard is None:
         raise AttributeError("stagingpathhard is required when delivering to GRUS")
     self.config_snic = CONFIG.get('snic',None)
     if self.config_snic is None:
          raise AttributeError("snic configuration is needed when delivering to GRUS (snic_api_url, snic_api_user, snic_api_password)")
     self.config_statusdb = CONFIG.get('statusdb',None)
     if self.config_statusdb is None:
          raise AttributeError("statusdb configuration is needed when delivering to GRUS (url, username, password, port)")
     self.orderportal = CONFIG.get('order_portal',None) # do not need to raise exception here, I have already checked for this and monitoring does not need it
     self.pi_email  = pi_email
     self.sensitive = sensitive
     self.hard_stage_only = hard_stage_only
Example #28
def update_cronjob_db():
    server = platform.node().split('.')[0]
    timestamp = datetime.datetime.now()
    # parse results
    result = _parse_crontab()
    # connect to db
    statusdb_conf = CONFIG.get('statusdb')
    logging.info('Connecting to database: {}'.format(
        CONFIG.get('statusdb', {}).get('url')))
    try:
        couch_connection = statusdb.StatusdbSession(statusdb_conf).connection
    except Exception as e:
        logging.error(str(e))
    else:
        # update document
        crontab_db = couch_connection['cronjobs']
        view = crontab_db.view('server/alias')
        # to be safe
        doc = {}
        # create doc if not exist
        if not view[server].rows:
            logging.info('Creating a document')
            doc = {
                'users': {user: cronjobs
                          for user, cronjobs in result.items()},
                'Last updated': str(timestamp),
                'server': server,
            }
        # else: get existing doc
        for row in view[server]:
            logging.info('Updating the document')
            doc = crontab_db.get(row.value)
            doc['users'].update(result)
            doc['Last updated'] = str(timestamp)
        if doc:
            try:
                crontab_db.save(doc)
            except Exception as e:
                logging.error(str(e))
            else:
                logging.info('{} has been successfully updated'.format(server))
        else:
            logging.warning('Document has not been created/updated')
Example #29
def archive_run(run_dir):
    """Move directory to nosync."""
    logger.info('Archiving run ' + run_dir)
    archive_dir = CONFIG.get('nanopore_analysis').get('finished_dir')
    top_dir = '/'.join(run_dir.split('/')[0:-2]) # Get the project folder to archive
    try:                                         # Try pathlib (pathlib.Path(run_dir).parent.parent) when running completely on python3
        shutil.move(top_dir, archive_dir)
        logger.info('Successfully archived {}'.format(run_dir))
    except shutil.Error:
        logger.warn('An error occurred when archiving {}. '
                    'Please check the logfile for more info.'.format(run_dir))
    return
Example #30
 def copy_results_for_lims(self):
     """Find results and copy to lims directory."""
     year_processed = self.run_id[0:4]
     lims_result_file = os.path.join(CONFIG.get('nanopore_analysis').get('lims_results_dir'),
                                     year_processed, 'anglerfish_stats_' + self.flowcell_id + '.txt')
     anglerfish_results = self._find_anglerfish_results()
     try:
         shutil.copyfile(anglerfish_results, lims_result_file)
         return True
     except OSError as e:
         logger.warn('An error occurred while copying the Anglerfish results for {} to lims: {}'.format(self.run_id, e))
         return False
Example #31
def copy_results_for_lims(run_dir, anglerfish_results_dir):
    """Find results and copy to lims directory."""
    run_id = os.path.basename(run_dir)
    year_processed = run_id[0:4]
    flowcell_id = run_id.split('_')[3]
    lims_result_file = os.path.join(CONFIG.get('nanopore_analysis').get('lims_results_dir'),
                                    year_processed, 'anglerfish_stats_' + flowcell_id + '.txt')
    anglerfish_results = find_anglerfish_results(anglerfish_results_dir)
    try:
        shutil.copyfile(anglerfish_results, lims_result_file)
    except OSError as e:
        logger.warn('An error occurred while copying the Anglerfish results for {} to lims: {}'.format(run_id, e))
    return
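
The slicing above assumes a run id whose first four characters are the year and whose fourth underscore-separated field is the flowcell id; a hypothetical name of that shape:

run_id = '20210415_1433_MN19414_FAO12345_a1b2c3d4'  # made-up example
year_processed = run_id[0:4]        # '2021'
flowcell_id = run_id.split('_')[3]  # 'FAO12345'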
Example #32
 def _get_original_samplesheet(self):
     """Find original lims sample sheet."""
     lims_samplesheet_dir = os.path.join(CONFIG.get('nanopore_analysis').get('samplesheets_dir'),
                                         self.year_processed)
     found_samplesheets = glob.glob(lims_samplesheet_dir + '/*' + self.flowcell_id + '*')
     if not found_samplesheets:
         logger.warn('No Lims sample sheets found for run {}. Skipping it.'.format(self.run_id))
         self.lims_samplesheet = None
     elif len(found_samplesheets) > 1:
          logger.warn('Found more than one Lims sample sheet for run {}. Skipping it.'.format(self.run_id))
         self.lims_samplesheet = None
     else:
         self.lims_samplesheet = found_samplesheets[0]
Example #33
 def __init__(self, projectid=None, sampleid=None, 
              pi_email=None, sensitive=True,
              add_user=None, fcid=None, do_release=False, 
              project_title=None, project_description=None,
              ignore_orderportal_members=False, **kwargs):
     super(DDSProjectDeliverer, self).__init__(
         projectid,
         sampleid,
         **kwargs
     )
     self.config_statusdb = CONFIG.get('statusdb', None)
     if self.config_statusdb is None and not do_release:
          raise AttributeError("statusdb configuration is needed when delivering to DDS (url, username, password, port)")
     self.orderportal = CONFIG.get('order_portal', None)
     if self.orderportal is None and not do_release:
         raise AttributeError("Order portal configuration is needed when delivering to DDS")
     if self.orderportal:
         self._set_pi_email(pi_email)
         self._set_other_member_details(add_user, CONFIG.get('add_project_owner', False), ignore_orderportal_members)
         self._set_project_details(project_title, project_description)
     self.sensitive = sensitive
     self.fcid = fcid
Example #34
def get_original_samplesheet(run_id):
    """Find original lims sample sheet."""
    year_processed = run_id[0:4]
    flowcell_id = run_id.split('_')[3]
    lims_samplesheet_dir = os.path.join(CONFIG.get('nanopore_analysis').get('samplesheets_dir'),
                                        year_processed)
    found_samplesheets = glob.glob(lims_samplesheet_dir + '/*'+ flowcell_id + '*')
    if not found_samplesheets:
        logger.warn('No Lims sample sheets found for run {}'.format(run_id))
        return
    elif len(found_samplesheets) > 1:
        logger.warn('Found more than one Lims sample sheet for run {}'.format(run_id))
        return
    return found_samplesheets[0]
Example #35
def transfer_run(run_dir):
    """rsync dir to Irma."""
    logger.info('Transferring run {} to analysis cluster'.format(run_dir))
    destination = CONFIG.get('nanopore_analysis').get('transfer').get('destination')
    rsync_opts = {'-Lav': None,
                  '--chown': ':ngi2016003',
                  '--chmod' : 'Dg+s,g+rw',
                  '-r' : None,
                  '--exclude' : 'work'}
    connection_details = CONFIG.get('nanopore_analysis').get('transfer').get('analysis_server')
    transfer_object = RsyncAgent(run_dir,
                                 dest_path=destination,
                                 remote_host=connection_details['host'],
                                 remote_user=connection_details['user'],
                                 validate=False,
                                 opts=rsync_opts)
    try:
        transfer_object.transfer()
    except RsyncError:
        logger.warn('An error occurred while transferring {} to the '
                    'analysis server. Please check the logfiles'.format(run_dir))
        return False
    return True
Example #36
def fail_run(runid, project):
    """Updates status of specified run or project-run to Failed."""
    statusdb_conf = CONFIG.get('statusdb')
    logger.info('Connecting to status db: {}:{}'.format(
        statusdb_conf.get('url'), statusdb_conf.get('port')))
    try:
        status_db = statusdb.StatusdbSession(statusdb_conf).connection
    except Exception as e:
        logger.error(
            'Can not connect to status_db: http://{}:*****@{}:{}'.format(
                statusdb_conf.get('username'), statusdb_conf.get('url'),
                statusdb_conf.get('port')))
        logger.error(e)
        raise e
    bioinfo_db = status_db['bioinfo_analysis']
    if project is not None:
        view = bioinfo_db.view('full_doc/pj_run_to_doc')
        rows = view[[project, runid]].rows
        logger.info(
            'Updating status of {} objects with flowcell_id: {} and project_id {}'
            .format(len(rows), runid, project))
    else:
        view = bioinfo_db.view('full_doc/run_id_to_doc')
        rows = view[[runid]].rows
        logger.info(
            'Updating status of {} objects with flowcell_id: {}'.format(
                len(rows), runid))

    new_timestamp = datetime.datetime.now().isoformat()
    updated = 0
    for row in rows:
        if row.value['status'] != 'Failed':
            row.value['values'][new_timestamp] = {
                'sample_status': 'Failed',
                'user': '******'
            }
            row.value['status'] = 'Failed'
        try:
            bioinfo_db.save(row.value)
            updated += 1
        except Exception as e:
            logger.error(
                'Cannot update object project-sample-run-lane: {}-{}-{}-{}'.
                format(row.value.get('project_id'), row.value.get('sample'),
                       row.value.get('run_id'), row.value.get('lane')))
            logger.error(e)
            raise e
    logger.info('Successfully updated {} objects'.format(updated))
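
A hedged usage sketch for the function above; the flowcell and project ids are placeholders.

# Mark every document for the flowcell as Failed (placeholder flowcell id).
fail_run('211011_A00621_0123_BHVNTTDRXY', project=None)

# Or restrict the update to one project on that flowcell (placeholder project id).
fail_run('211011_A00621_0123_BHVNTTDRXY', project='P12345')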
Esempio n. 37
0
def cleanup_swestore(days, dry_run=False):
    """Remove archived runs from swestore

    :param int days: Threshold days to check and remove
    """
    days = check_days('swestore', days, CONFIG)
    if not days:
        return
    runs = filesystem.list_runs_in_swestore(path=CONFIG.get('cleanup').get('swestore').get('root'))
    for run in runs:
        date = run.split('_')[0]
        if misc.days_old(date) > days:
            if dry_run:
                logger.info('Will remove file {} from swestore'.format(run))
                continue
            misc.call_external_command('irm -f {}'.format(run))
            logger.info('Removed file {} from swestore'.format(run))
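
`check_days` is called above but not defined in this listing; a minimal sketch of what such a helper might look like, assuming a `cleanup -> <site> -> days` layout in the config (the key names are an assumption).

def check_days(site, days, config):
    """Return `days` if given, otherwise a configured default for `site` (sketch)."""
    if days:
        return days
    try:
        # Assumed config layout: config['cleanup'][site]['days']
        return config['cleanup'][site]['days']
    except (KeyError, TypeError):
        logger.error('No days given and no default configured for site {}'.format(site))
        return None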
Esempio n. 39
0
def cleanup_swestore(seconds, dry_run=False):
    """Remove archived runs from swestore

    :param int seconds: Days/hours converted as seconds to check
    """
    seconds = check_default('swestore', seconds, CONFIG)
    if not seconds:
        return
    runs = filesystem.list_runs_in_swestore(path=CONFIG.get('cleanup').get('swestore').get('root'))
    for run in runs:
        date = run.split('_')[0]
        if misc.to_seconds(misc.days_old(date)) > seconds:
            if dry_run:
                logger.info('Will remove file {} from swestore'.format(run))
                continue
            misc.call_external_command('irm -f {}'.format(run))
            logger.info('Removed file {} from swestore'.format(run))
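
`misc.days_old` and `misc.to_seconds` are not shown in this listing; a rough sketch of the conversions they are assumed to perform, given that run names start with a YYMMDD date.

import datetime

def days_old(date_str, date_format='%y%m%d'):
    """Days elapsed since the YYMMDD prefix of a run name (assumed format)."""
    then = datetime.datetime.strptime(date_str, date_format)
    return (datetime.datetime.now() - then).days

def to_seconds(days):
    """Convert whole days to seconds (1 day == 86400 seconds)."""
    return days * 86400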
Esempio n. 40
0
    def start_nanoseq(self):
        """Start Nanoseq analysis."""
        flowcell_product_code = self._get_flowcell_product_code() 
        kit_id = os.path.basename(self.nanoseq_sample_sheet).split('_')[0]
        nanoseq_version = CONFIG.get('nanopore_analysis').get('nanoseq_version')
        if self._is_multiplexed():
            logger.info('Run {} is multiplexed. Starting nanoseq with --barcode_kit option'.format(self.run_dir))
            barcode_kit = self._get_barcode_kit()
            analysis_command = ('nextflow run nf-core/nanoseq'
                                + ' -r ' + nanoseq_version
                                + ' --input ' + self.nanoseq_sample_sheet
                                + ' --protocol DNA'
                                + ' --input_path ' + os.path.join(self.run_dir, 'fast5')
                                + ' --outdir ' + os.path.join(self.run_dir, 'nanoseq_output')
                                + ' --flowcell ' + flowcell_product_code
                                + ' --guppy_gpu'
                                + ' --skip_alignment'
                                + ' --skip_quantification'
                                + ' --kit ' + kit_id
                                + ' --max_cpus 6'
                                + ' --max_memory 20.GB'
                                + ' --barcode_kit ' + barcode_kit
                                + ' -profile singularity; echo $? > .exitcode_for_nanoseq')
        else:
            logger.info('Run {} is not multiplexed. Starting nanoseq without --barcode_kit option'.format(self.run_dir))
            analysis_command = ('nextflow run nf-core/nanoseq'
                                + ' -r ' + nanoseq_version
                                + ' --input ' + self.nanoseq_sample_sheet
                                + ' --protocol DNA'
                                + ' --input_path ' + os.path.join(self.run_dir, 'fast5')
                                + ' --outdir ' + os.path.join(self.run_dir, 'nanoseq_output')
                                + ' --flowcell ' + flowcell_product_code
                                + ' --guppy_gpu'
                                + ' --skip_alignment'
                                + ' --skip_quantification'
                                + ' --kit ' + kit_id
                                + ' --max_cpus 6'
                                + ' --max_memory 20.GB'
                                + ' -profile singularity; echo $? > .exitcode_for_nanoseq')

        try:
            p_handle = subprocess.Popen(analysis_command, stdout=subprocess.PIPE, shell=True, cwd=self.run_dir)
            logger.info('Started Nanoseq for run {}'.format(self.run_dir))
        except (OSError, subprocess.SubprocessError):
            logger.warn('An error occurred while starting Nanoseq for run {}. '
                        'Please check the logfile for info.'.format(self.run_dir))
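
The commands above end with `echo $? > .exitcode_for_nanoseq`, so a later check can read the exit status back from that file; a hedged sketch of such a check (the method name is illustrative, not part of the original class).

    def check_nanoseq_exit_status(self):
        """Return True if the recorded nanoseq exit code is 0 (illustrative sketch)."""
        exit_file = os.path.join(self.run_dir, '.exitcode_for_nanoseq')
        if not os.path.isfile(exit_file):
            return False  # pipeline still running or never started
        with open(exit_file) as f:
            return f.read().strip() == '0'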
Esempio n. 41
0
 def __init__(self, projectid, sampleid, **kwargs):
     """
         :param string projectid: id of project to deliver
         :param string sampleid: id of sample to deliver
         :param bool no_checksum: if True, skip the checksum computation
         :param string hash_algorithm: algorithm to use for calculating
             file checksums, defaults to sha1
     """
     # override configuration options with options given on the command line
     self.config = CONFIG.get('deliver', {})
     self.config.update(kwargs)
     # set items in the configuration as attributes
     for k, v in self.config.items():
         setattr(self, k, v)
     self.projectid = projectid
     self.sampleid = sampleid
     self.hash_algorithm = getattr(self, 'hash_algorithm', 'sha1')
     self.no_checksum = getattr(self, 'no_checksum', False)
     self.files_to_deliver = getattr(self, 'files_to_deliver', None)
     self.deliverystatuspath = getattr(self, 'deliverystatuspath', None)
     self.stagingpath = getattr(self, 'stagingpath', None)
     self.deliverypath = getattr(self, 'deliverypath', None)
     self.logpath = getattr(self, 'logpath', None)
     self.reportpath = getattr(self, 'reportpath', None)
     self.force = getattr(self, 'force', False)
     self.stage_only = getattr(self, 'stage_only', False)
     self.ignore_analysis_status = getattr(self, 'ignore_analysis_status',
                                           False)
      # Fetch the project name; it should always be available, but it is not a requirement
     try:
         self.projectname = db.project_entry(db.dbcon(), projectid)['name']
     except KeyError:
         pass
     # only set an attribute for uppnexid if it's actually given or in the db
     try:
         getattr(self, 'uppnexid')
     except AttributeError:
         try:
             self.uppnexid = db.project_entry(db.dbcon(),
                                              projectid)['uppnex_id']
         except KeyError:
             pass
     # set a custom signal handler to intercept interruptions
     signal.signal(signal.SIGINT, _signal_handler)
     signal.signal(signal.SIGTERM, _signal_handler)
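
The constructor above merges the `deliver` section of the config with command-line kwargs and exposes every key as an attribute; a stripped-down, self-contained illustration of that pattern (class name and defaults are made up).

class _ConfigBackedExample(object):
    """Toy stand-in showing the config-then-kwargs override order."""

    def __init__(self, **kwargs):
        self.config = {'stagingpath': '/srv/staging', 'force': False}  # assumed defaults
        self.config.update(kwargs)            # command-line options win over the config
        for k, v in self.config.items():
            setattr(self, k, v)               # every key becomes an attribute
        self.force = getattr(self, 'force', False)

obj = _ConfigBackedExample(force=True)
print(obj.stagingpath, obj.force)             # -> /srv/staging True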
Esempio n. 42
0
def cleanup_swestore(seconds, dry_run=False):
    """Remove archived runs from swestore

    :param int seconds: Days/hours converted as seconds to check
    """
    seconds = check_default('swestore', seconds, CONFIG)
    if not seconds:
        return
    runs = filesystem.list_runs_in_swestore(
        path=CONFIG.get('cleanup').get('swestore').get('root'))
    for run in runs:
        date = run.split('_')[0]
        if misc.to_seconds(misc.days_old(date)) > seconds:
            if dry_run:
                logger.info('Will remove file {} from swestore'.format(run))
                continue
            misc.call_external_command('irm -f {}'.format(run))
            logger.info('Removed file {} from swestore'.format(run))
Esempio n. 43
0
def cleanup_nas(days):
    """Will move the finished runs in NASes to nosync directory.

    :param int days: Number of days to consider a run to be old
    """
    for data_dir in CONFIG.get('storage').get('data_dirs'):
        logger.info('Moving old runs in {}'.format(data_dir))
        with filesystem.chdir(data_dir):
            for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]:
                rta_file = os.path.join(run, 'RTAComplete.txt')
                if os.path.exists(rta_file):
                    # 1 day == 60*60*24 seconds --> 86400
                    if os.stat(rta_file).st_mtime < time.time() - (86400 * days):
                        logger.info('Moving run {} to nosync directory'
                                    .format(os.path.basename(run)))
                        shutil.move(run, 'nosync')
                    else:
                        logger.info('RTAComplete.txt file exists but is not older than {} day(s), skipping run {}'.format(str(days), run))
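
`filesystem.chdir` is used as a context manager throughout these examples; a minimal sketch of the standard pattern such a helper typically follows (not necessarily TACA's exact implementation).

import contextlib
import os

@contextlib.contextmanager
def chdir(new_dir):
    """Temporarily change the working directory and restore it afterwards."""
    old_dir = os.getcwd()
    os.chdir(new_dir)
    try:
        yield
    finally:
        os.chdir(old_dir)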
Esempio n. 44
0
 def __init__(self, projectid, sampleid, **kwargs):
     """
         :param string projectid: id of project to deliver
         :param string sampleid: id of sample to deliver
         :param bool no_checksum: if True, skip the checksum computation
         :param string hash_algorithm: algorithm to use for calculating 
             file checksums, defaults to sha1
     """
     # override configuration options with options given on the command line
     self.config = CONFIG.get('deliver', {})
     self.config.update(kwargs)
     # set items in the configuration as attributes
     for k, v in self.config.items():
         setattr(self, k, v)
     self.projectid = projectid
     self.sampleid = sampleid
     self.hash_algorithm = getattr(self, 'hash_algorithm', 'sha1')
     self.no_checksum = getattr(self, 'no_checksum', False)
     self.files_to_deliver = getattr(self, 'files_to_deliver', None)
     self.deliverystatuspath = getattr(self, 'deliverystatuspath', None)
     self.stagingpath = getattr(self, 'stagingpath', None)
     self.deliverypath = getattr(self, 'deliverypath', None)
     self.logpath = getattr(self, 'logpath', None)
     self.reportpath = getattr(self, 'reportpath', None)
     self.force = getattr(self, 'force', False)
     self.stage_only = getattr(self, 'stage_only', False)
     self.ignore_analysis_status = getattr(self, 'ignore_analysis_status', False)
      # Fetch the project name; it should always be available, but it is not a requirement
     try:
         self.projectname = db.project_entry(db.dbcon(), projectid)['name']
     except KeyError:
         pass
     # only set an attribute for uppnexid if it's actually given or in the db
     try:
         getattr(self, 'uppnexid')
     except AttributeError:
         try:
             self.uppnexid = db.project_entry(db.dbcon(), projectid)['uppnex_id']
         except KeyError:
             pass
     # set a custom signal handler to intercept interruptions
     signal.signal(signal.SIGINT, _signal_handler)
     signal.signal(signal.SIGTERM, _signal_handler)
Esempio n. 45
0
def cleanup_nas(days):
    """Will move the finished runs in NASes to nosync directory.

    :param int days: Number of days to consider a run to be old
    """
    for data_dir in CONFIG.get('storage').get('data_dirs'):
        logger.info('Moving old runs in {}'.format(data_dir))
        with filesystem.chdir(data_dir):
            for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]:
                rta_file = os.path.join(run, finished_run_indicator)
                if os.path.exists(rta_file):
                    # 1 day == 60*60*24 seconds --> 86400
                    if os.stat(rta_file).st_mtime < time.time() - (86400 * days):
                        logger.info('Moving run {} to nosync directory'
                                    .format(os.path.basename(run)))
                        shutil.move(run, 'nosync')
                    else:
                        logger.info('{} file exists but is not older than {} day(s), skipping run {}'.format(
                                    finished_run_indicator, str(days), run))
Esempio n. 46
0
def update_status_db(data, server_type=None):
    """ Pushed the data to status db,
        data can be from nases or from uppmax
        server_type should be either 'uppmax' or 'nas'
    """
    db_config = CONFIG.get('statusdb')
    if db_config is None:
        logging.error("'statusdb' must be present in the config file!")
        raise RuntimeError("'statusdb' must be present in the config file!")

    server = "http://{username}:{password}@{url}:{port}".format(
        url=db_config['url'],
        username=db_config['username'],
        password=db_config['password'],
        port=db_config['port'])
    try:
        couch = couchdb.Server(server)
    except Exception as e:
        logging.error(e)
        raise
Esempio n. 47
0
def create(projects, ngi_config_file, fastq_1, fastq_2):
    #connect to statusdb
    couch_info = CONFIG.get('statusdb')
    if couch_info is None:
        logger.error("No statusdb field in taca configuration file")
        return 1
    if "dev" not in couch_info["url"]:
        logger.error("url for status db is {}, but dev must be specified in this case".format(couch_info["url"]))
    couch=setupServer(couch_info)
    # connect to db and to view
    projectsDB = couch["projects"]
    project_summary = projectsDB.view("project/summary")
    projects_closed_more_than_three_months = {}
    projects_closed_more_than_one_month_less_than_three = {}
    projects_closed_less_than_one_month    = {}
    projects_opened = {}
    current_date =  datetime.datetime.today()
    date_limit_one_year = current_date - relativedelta(months=6)  # named one_year, but six months is enough to be sure all the data is in the x_flowcell db
    date_limit_one_month = current_date - relativedelta(months=1)
    date_limit_three_month = current_date - relativedelta(months=3)
    for row in project_summary:
        project_id = row["key"][1]
        project_status = row["key"][0]
        if "application" not in row["value"]:
            continue
        if row["value"]["no_samples"] > 50:
            continue #skip large projects
        application = row["value"]["application"]
        if project_status == "closed":
            if "close_date" in row["value"]:
                close_date = datetime.datetime.strptime(row["value"]["close_date"], '%Y-%m-%d')
                if close_date > date_limit_one_year: #if the project has been closed after the date limit
                    if close_date >= date_limit_one_month:
                        projects_closed_less_than_one_month[project_id] = {"project_name": row["value"]["project_name"],
                                                                            "application": application, "no_samples": row["value"]["no_samples"]}
                    elif close_date < date_limit_one_month and close_date >= date_limit_three_month:
                        projects_closed_more_than_one_month_less_than_three[project_id] = {"project_name": row["value"]["project_name"],
                                                                            "application": application, "no_samples": row["value"]["no_samples"]}
                    elif close_date < date_limit_three_month:
                        projects_closed_more_than_three_months[project_id] = {"project_name": row["value"]["project_name"],
                                                                            "application": application, "no_samples": row["value"]["no_samples"]}
        elif project_status == "open":
            if "lanes_sequenced" in row["value"] and row["value"]["lanes_sequenced"] > 0:
                projects_opened[project_id] =  {"project_name": row["value"]["project_name"],
                                            "application": application, "no_samples": row["value"]["no_samples"]}
        else:
            print("status {}".format(project_status))
    ## Now parse the x_flowcell db to check what can and cannot be used;
    ## the flowcell db has been in use for less than a year, so old projects might not be present.
    whole_genome_projects = int(2*projects/3)
    projects_to_reproduce = []
    select_random_projects(projects_closed_more_than_three_months, whole_genome_projects/4+1, "WG re-seq", projects_to_reproduce, "WGreseq_tot_closed")
    select_random_projects(projects_closed_more_than_one_month_less_than_three, whole_genome_projects/4+1, "WG re-seq", projects_to_reproduce, "WGreseq_closed_clean_no_del")
    select_random_projects(projects_closed_less_than_one_month,whole_genome_projects/4+1, "WG re-seq", projects_to_reproduce, "WGreseq_closed_no_clean")
    select_random_projects(projects_opened, whole_genome_projects/4+1, "WG re-seq", projects_to_reproduce, "WGreseq_open")

    other_projects = int(projects/3)
    select_random_projects(projects_closed_more_than_three_months, other_projects/4+1, "other", projects_to_reproduce, "noWGreseq_tot_closed")
    select_random_projects(projects_closed_more_than_one_month_less_than_three, other_projects/4+1, "other", projects_to_reproduce, "noWGreseq_closed_clean_no_del")
    select_random_projects(projects_closed_less_than_one_month, other_projects/4+1, "other", projects_to_reproduce, "noWGreseq_closed_no_clean")
    select_random_projects(projects_opened, other_projects/4+1, "other", projects_to_reproduce, "noWGreseq_open")

    ### Create the ngi_pipeline environment
    print("# NGI_CONFIG variable is {}. This variable needs to be in the .bashrc file".format(ngi_config_file))
    print("NGI_CONFIG={}".format(ngi_config_file))
    try:
        ngi_config = conf.load_config(ngi_config_file)
    except IOError as e:
        print("ERROR: {}".format(e))
    #now create uppmax env
    paths = create_uppmax_env(ngi_config)


    print "#going to reproduce {} projects (if this number is different from the one you specified.... trust me... do not worry".format(len(projects_to_reproduce))
    ### At this point I scan over x_flowcell and reproduce FCs
    flowcellDB = couch["x_flowcells"]
    reproduced_projects = {}
    for fc_doc in flowcellDB:
        try:
            samplesheet_csv = flowcellDB[fc_doc]["samplesheet_csv"]
        except KeyError:
            continue #parse only FC that have a samplesheet
        # Now check if this FC contains one of the projects I need to replicate.
        projects_in_FC = set()
        if "SampleName" in samplesheet_csv[0]:
            projects_in_FC = set([line["SampleName"].split("_")[0] for line in samplesheet_csv])
        else:
            projects_in_FC = set([line["Sample_Name"].split("_")[0] for line in samplesheet_csv])
        found = False
        for project_pair in projects_to_reproduce:
            project = project_pair[0]
            if project in projects_in_FC:
                #this FC needs to be created
                if not found:
                    #I create the FC only the first time I see a project belonging to it
                    create_FC(paths["flowcell_inbox"] , flowcellDB[fc_doc]["RunInfo"]["Id"], samplesheet_csv, fastq_1, fastq_2)
                    found = True
                #but I keep track of all projects-run I need to organise
                if project not in reproduced_projects:
                    reproduced_projects[project] = []
                reproduced_projects[project].append(flowcellDB[fc_doc]["RunInfo"]["Id"])
    print "#reproduced {} project (if the numbers diffear do not worry, most likely we selected projects without runs)".format(len(reproduced_projects))
    for project in projects_to_reproduce:
        if project[0] in reproduced_projects:
            print "#  {}: {}".format(project[0], project[1])
    #need to output the command to organise
    to_be_deleted = []
    for project in reproduced_projects:
        for FC in reproduced_projects[project]:
            print "Running: ngi_pipeline_start.py organize flowcell {} -p {}".format(FC, project)
            with open("ngi_pipeline_local.logs", "w") as NGILOGS:
                return_value = subprocess.call(["ngi_pipeline_start.py", "organize", "flowcell", "{}".format(FC), "-p", "{}".format(project) ],
                            stdout=NGILOGS, stderr=NGILOGS)
            if return_value > 0:
                print "#project {} not organised: have a look to the logs, but most likely this projec is not in charon".format(project)
                if project not in to_be_deleted:
                    to_be_deleted.append(project)

    for project in to_be_deleted:
        del reproduced_projects[project]

    #at this point create ANALYSIS --
    for project in projects_to_reproduce:
        if project[0] in reproduced_projects: #only for projects that I know I have organised
            produce_analysis_qc_ngi(ngi_config, project[0])
            if project[1].startswith("WGreseq"):
                produce_analysis_piper(ngi_config, project[0])
                


    # Store the results in a file
    with open("projects.txt", "w") as PROJECTS:
        for project in projects_to_reproduce:
            if project[0] in reproduced_projects:
                PROJECTS.write("{}:{}\n".format(project[0], project[1]))
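
`select_random_projects` is called eight times above but not shown; a hedged sketch inferred from those call sites (a candidate dict, how many to pick, an application filter, an accumulator list and a label). The matching rule on the application string is an assumption.

import random

def select_random_projects(candidates, number, application_filter, accumulator, label):
    """Append up to `number` (project_id, label) pairs drawn from `candidates` (sketch)."""
    if application_filter == "WG re-seq":
        eligible = [p for p, info in candidates.items()
                    if "re-seq" in info.get("application", "").lower()]
    else:
        eligible = [p for p, info in candidates.items()
                    if "re-seq" not in info.get("application", "").lower()]
    picked = random.sample(eligible, min(int(number), len(eligible)))
    for project_id in picked:
        accumulator.append((project_id, label))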
Esempio n. 48
0
def run_preprocessing(run):
    """Run demultiplexing in all data directories

    :param str run: Process a particular run instead of looking for runs
    """

    def _process(run):
        """Process a run/flowcell and transfer to analysis server

        :param taca.illumina.Run run: Run to be processed and transferred
        """
        logger.info('Checking run {}'.format(run.id))
        if run.is_finished():
            if  run.status == 'TO_START':
                logger.info(("Starting BCL to FASTQ conversion and "
                             "demultiplexing for run {}".format(run.id)))
                # work around LIMS problem
                if prepare_sample_sheet(run.run_dir):
                    run.demultiplex()
            elif run.status == 'IN_PROGRESS':
                logger.info(("BCL conversion and demultiplexing process in "
                             "progress for run {}, skipping it"
                             .format(run.id)))
                ud.check_undetermined_status(run.run_dir, dex_status=run.status, und_tresh=CONFIG['analysis']['undetermined']['lane_treshold'],
                    q30_tresh=CONFIG['analysis']['undetermined']['q30_treshold'], freq_tresh=CONFIG['analysis']['undetermined']['highest_freq'],
                    pooled_tresh=CONFIG['analysis']['undetermined']['pooled_und_treshold'])
            elif run.status == 'COMPLETED':
                logger.info(("Preprocessing of run {} is finished, check if "
                             "run has been transferred and transfer it "
                             "otherwise".format(run.id)))

                control_fastq_filename(os.path.join(run.run_dir, CONFIG['analysis']['bcl2fastq']['options'][0]['output-dir']))
                passed_qc=ud.check_undetermined_status(run.run_dir, dex_status=run.status, und_tresh=CONFIG['analysis']['undetermined']['lane_treshold'],
                    q30_tresh=CONFIG['analysis']['undetermined']['q30_treshold'], freq_tresh=CONFIG['analysis']['undetermined']['highest_freq'],
                    pooled_tresh=CONFIG['analysis']['undetermined']['pooled_und_treshold'])
                qc_file = os.path.join(CONFIG['analysis']['status_dir'], 'qc.tsv')

                post_qc(run.run_dir, qc_file, passed_qc)
                upload_to_statusdb(run.run_dir)

                t_file = os.path.join(CONFIG['analysis']['status_dir'], 'transfer.tsv')
                transferred = is_transferred(run.run_dir, t_file)
                if passed_qc:
                    if not transferred:
                        logger.info("Run {} hasn't been transferred yet."
                                    .format(run.id))
                        logger.info('Transferring run {} to {} into {}'
                                    .format(run.id,
                            CONFIG['analysis']['analysis_server']['host'],
                            CONFIG['analysis']['analysis_server']['sync']['data_archive']))
                        transfer_run(run.run_dir)
                    else:
                        logger.info('Run {} already transferred to analysis server, skipping it'.format(run.id))
                else:
                    logger.warn('Run {} failed qc, transferring will not take place'.format(run.id))
                    r_file = os.path.join(CONFIG['analysis']['status_dir'], 'report.out')



        if not run.is_finished():
            # Check status files and say i.e Run in second read, maybe something
            # even more specific like cycle or something
            logger.info('Run {} is not finished yet'.format(run.id))

    if run:
        _process(Run(run))
    else:
        data_dirs = CONFIG.get('analysis').get('data_dirs')
        for data_dir in data_dirs:
            runs = glob.glob(os.path.join(data_dir, '1*XX'))
            for _run in runs:
                _process(Run(_run))
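
`is_transferred` consults the `transfer.tsv` status file; a hedged sketch, assuming one tab-separated record per transferred run with the run id in the first column.

import csv
import os

def is_transferred(run_dir, transfer_file):
    """Return True if the run already appears in the transfer log (sketch)."""
    run_id = os.path.basename(os.path.abspath(run_dir))
    if not os.path.isfile(transfer_file):
        return False
    with open(transfer_file) as f:
        for row in csv.reader(f, delimiter='\t'):
            if row and row[0] == run_id:
                return True
    return False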
Esempio n. 49
0
def run_preprocessing(run, force_trasfer=True, statusdb=True):
    """ Run demultiplexing in all data directories
        :param str run: Process a particular run instead of looking for runs
        :param bool force_trasfer: if set to True the FC is transferred even if it fails QC
        :param bool statusdb: True if we want to upload info to statusdb
    """
    def _process(run, force_trasfer):
        """ Process a run/flowcell and transfer to analysis server
            :param taca.illumina.Run run: Run to be processed and transferred
        """
        logger.info('Checking run {}'.format(run.id))
        t_file = os.path.join(CONFIG['analysis']['status_dir'], 'transfer.tsv')
        if run.is_transferred(t_file):
            # In this case I am either processing a run that is in transfer
            # or that has been already transferred. Do nothing.
            # From time to time this situation is due to runs that are copied back from the NAS after a reboot.
            # This check avoids failures.
            logger.info('Run {} already transferred to analysis server, skipping it'.format(run.id))
            return

        if run.get_run_status() == 'SEQUENCING':
            # Check status files and say i.e Run in second read, maybe something
            # even more specific like cycle or something
            logger.info('Run {} is not finished yet'.format(run.id))
        elif run.get_run_status() == 'TO_START':
            if run.get_run_type() == 'NON-NGI-RUN':
                # For now MiSeq specific case. Process only NGI-run, skip all the others (PhD student runs)
                logger.warn("Run {} marked as {}, "
                            "TACA will skip this and move the run to "
                            "no-sync directory".format(run.id, run.get_run_type()))
                # Archive the run if indicated in the config file
                if 'storage' in CONFIG:
                    run.archive_run(CONFIG['storage']['archive_dirs'][run.sequencer_type])
                return
            # Otherwise it is fine, process it
            logger.info(("Starting BCL to FASTQ conversion and demultiplexing for run {}".format(run.id)))
            run.demultiplex_run()
        elif run.get_run_status() == 'IN_PROGRESS':
            logger.info(("BCL conversion and demultiplexing process in "
                         "progress for run {}, skipping it".format(run.id)))
            # In the case of Xten this just returns; in the future have a look at Cycles.txt.
            # In the case of HiSeq, check that partial demuxes are done and perform aggregation if that is the case.
            run.check_run_status()

        # previous elif might change the status to COMPLETED (in HiSeq), therefore to avoid skipping
        # a cycle take the last if out of the elif
        if run.get_run_status() == 'COMPLETED':
            logger.info(("Preprocessing of run {} is finished, transferring it".format(run.id)))
            # In the case of HiSeq this function computes undetermined indexes for NoIndex lanes
            if not run.compute_undetermined():
                return
            # Otherwise I can proceed to QC
            # Check the run QC
            run_QC_status = run.check_QC()
            if run_QC_status is not None:
                # Store QC results in appropriate file and mail user if failed
                qc_file = os.path.join(CONFIG['analysis']['status_dir'], 'qc.tsv')
                # This method is implemented in Runs
                run.post_qc(qc_file, run_QC_status, log_file=CONFIG['log']['file'],
                            rcp=CONFIG['mail']['recipients'])
            # Upload to statusDB if applies
            if 'statusdb' in CONFIG:
                _upload_to_statusdb(run)

            # Copy demultiplex stats file to shared file system for LIMS purpose
            if 'mfs_path' in CONFIG['analysis']:
                try:
                    mfs_dest = os.path.join(CONFIG['analysis']['mfs_path'][run.sequencer_type.lower()],run.id)
                    logger.info('Copying demultiplex stats for run {} to {}'.format(run.id, mfs_dest))
                    if not os.path.exists(mfs_dest):
                        os.mkdir(mfs_dest)
                    demulti_stat_src = os.path.join(run.run_dir, run.demux_dir, 'Reports',
                                                    'html', run.flowcell_id, 'all', 'all', 'all', 'laneBarcode.html')
                    copyfile(demulti_stat_src, os.path.join(mfs_dest, 'laneBarcode.html'))
                except:
                    logger.warn('Could not copy demultiplex stat file for run {}'.format(run.id))

            # Transfer to analysis server if flag is True
            if run.transfer_to_analysis_server:
                logger.info('Transferring run {} to {} into {}'
                            .format(run.id,
                                    run.CONFIG['analysis_server']['host'],
                                    run.CONFIG['analysis_server']['sync']['data_archive']))
                run.transfer_run(t_file,  False) # Do not trigger analysis

            # Archive the run if indicated in the config file
            if 'storage' in CONFIG:
                run.archive_run(CONFIG['storage']['archive_dirs'][run.sequencer_type])

    if run:
        # Needs to guess what run type I have (HiSeq, MiSeq, HiSeqX, NextSeq)
        runObj = get_runObj(run)
        if not runObj:
            raise RuntimeError("Unrecognized instrument type or incorrect run folder {}".format(run))
        else:
            _process(runObj, force_trasfer)
    else:
        data_dirs = CONFIG.get('analysis').get('data_dirs')
        for data_dir in data_dirs:
            # Run folder looks like DATE_*_*_*, the last section is the FC name. See Courtesy information from illumina of 10 June 2016 (no more XX at the end of the FC)
            runs = glob.glob(os.path.join(data_dir, '[1-9]*_*_*_*'))
            for _run in runs:
                runObj = get_runObj(_run)
                if not runObj:
                    logger.warning("Unrecognized instrument type or incorrect run folder {}".format(run))
                else:
                    try:
                        _process(runObj, force_trasfer)
                    except Exception:
                        # This function might throw an exception;
                        # it is better to continue processing other runs
                        logger.warning("There was an error processing the run {}".format(_run))
Esempio n. 50
0
def cleanup_milou(site, seconds, dry_run=False):
    """Remove project/run that have been closed more than given time (as seconds)
    from the given 'site' on uppmax

    :param str site: site where the cleanup should be performed
    :param int seconds: Days/hours converted as second to consider a run to be old
    :param bool dry_run: Will summarize what is going to be done without really doing it
    """
    seconds = check_default(site, seconds, CONFIG)
    if not seconds:
        return
    root_dir = CONFIG.get('cleanup').get('milou').get(site).get('root')
    deleted_log = CONFIG.get('cleanup').get('milou').get('deleted_log')
    assert os.path.exists(os.path.join(root_dir,deleted_log)), "Log directory {} doesn't exist in {}".format(deleted_log,root_dir)
    log_file = os.path.join(root_dir,"{fl}/{fl}.log".format(fl=deleted_log))
    list_to_delete = []

    ## get glob path patterns to search and remove from root directory
    try:
        archive_config = CONFIG['cleanup']['milou']['archive']
        ## the glob path should be relative to the run folder, like "Unaligned_*/Project_*"
        config_ppath = archive_config['proj_path']
        ## Glob path should be relative to run folder, like "Unaligned_0bp/Undetermined_indices/*/*.fastq.gz"
        config_npath = archive_config['undet_noindex']
        ## Glob path should be relative to run folder, like "Unaligned_*bp/Undetermined_indices/*/*.fastq.gz"
        config_upath = archive_config['undet_all']
    except KeyError as e:
        logger.error("Config file is missing the key {}, make sure it have all required information".format(str(e)))
        raise SystemExit

    # make a connection for project db #
    pcon = statusdb.ProjectSummaryConnection()
    assert pcon, "Could not connect to project database in StatusDB"

    if site in ["analysis", "illumina"]:
        ## work flow for cleaning up illumina/analysis ##
        projects = [ p for p in os.listdir(root_dir) if re.match(filesystem.PROJECT_RE,p) ]
        list_to_delete.extend(get_closed_projects(projects, pcon, seconds))
    elif site == "archive":
        ##work flow for cleaning archive ##
        runs = [ r for r in os.listdir(root_dir) if re.match(filesystem.RUN_RE,r) ]
        for run in runs:
            with filesystem.chdir(os.path.join(root_dir, run)):
                ## Collect all project path from demultiplexed directories in the run folder
                all_proj_path = glob(config_ppath)
                all_proj_dict = {os.path.basename(pp).replace('Project_','').replace('__', '.'): pp for pp in all_proj_path}
                closed_projects = get_closed_projects(all_proj_dict.keys(), pcon, seconds)
                ## Only proceed cleaning the data for closed projects
                for closed_proj in closed_projects:
                    closed_proj_fq = glob("{}/*/*.fastq.gz".format(all_proj_dict[closed_proj]))
                    list_to_delete.extend([os.path.join(run, pfile) for pfile in closed_proj_fq])
                ## Remove the undetermined fastq files for NoIndex case always
                undetermined_fastq_files = glob(config_npath)
                ## Remove undetermined fastq files for all index lengths if all projects run on the FC are closed
                if len(all_proj_dict.keys()) == len(closed_projects):
                    undetermined_fastq_files = glob(config_upath)
                list_to_delete.extend([os.path.join(run, ufile) for ufile in undetermined_fastq_files])

    ## delete and log
    for item in list_to_delete:
        if dry_run:
            logger.info('Will remove {} from {}'.format(item,root_dir))
            continue
        try:
            to_remove = os.path.join(root_dir,item)
            if os.path.isfile(to_remove):
                os.remove(to_remove)
            elif os.path.isdir(to_remove):
                shutil.rmtree(to_remove)
            logger.info('Removed {} from {}'.format(item,root_dir))
            with open(log_file,'a') as to_log:
                to_log.write("{}\t{}\n".format(to_remove,datetime.strftime(datetime.now(),'%Y-%m-%d %H:%M')))
        except OSError:
            logger.warn("Could not remove {} from {}".format(item,root_dir))
            continue
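
`get_closed_projects` is not defined in this listing; a rough sketch of the intended filtering, assuming the project connection exposes a `get_entry(project)` method returning a dict with a `close_date` in `YYYY-MM-DD` format (both the method name and the field are assumptions).

from datetime import datetime

def get_closed_projects(projects, pcon, seconds):
    """Return the projects whose close_date is older than `seconds` (sketch)."""
    closed = []
    for proj in projects:
        entry = pcon.get_entry(proj) or {}          # assumed statusdb API
        close_date = entry.get('close_date')
        if not close_date:
            continue
        age = (datetime.now() - datetime.strptime(close_date, '%Y-%m-%d')).total_seconds()
        if age > seconds:
            closed.append(proj)
    return closed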
Esempio n. 51
0
    def _process(run, force_trasfer):
        """ Process a run/flowcell and transfer to analysis server
            :param taca.illumina.Run run: Run to be processed and transferred
        """
        logger.info('Checking run {}'.format(run.id))
        t_file = os.path.join(CONFIG['analysis']['status_dir'], 'transfer.tsv')
        if run.is_transferred(t_file):
            # In this case I am either processing a run that is in transfer
            # or that has been already transferred. Do nothing.
            # From time to time this situation is due to runs that are copied back from the NAS after a reboot.
            # This check avoids failures.
            logger.info('Run {} already transferred to analysis server, skipping it'.format(run.id))
            return

        if run.get_run_status() == 'SEQUENCING':
            # Check status files and say i.e Run in second read, maybe something
            # even more specific like cycle or something
            logger.info('Run {} is not finished yet'.format(run.id))
            # Upload to statusDB if applies
            if 'statusdb' in CONFIG:
                _upload_to_statusdb(run)
        elif run.get_run_status() == 'TO_START':
            if run.get_run_type() == 'NON-NGI-RUN':
                # For now MiSeq specific case. Process only NGI-run, skip all the others (PhD student runs)
                logger.warn("Run {} marked as {}, "
                            "TACA will skip this and move the run to "
                            "no-sync directory".format(run.id, run.get_run_type()))
                # Archive the run if indicated in the config file
                if 'storage' in CONFIG:
                    run.archive_run(CONFIG['storage']['archive_dirs'][run.sequencer_type])
                return
            # Otherwise it is fine, process it
            logger.info(("Starting BCL to FASTQ conversion and demultiplexing for run {}".format(run.id)))
            # Upload to statusDB if applies
            if 'statusdb' in CONFIG:
                _upload_to_statusdb(run)
            run.demultiplex_run()
        elif run.get_run_status() == 'IN_PROGRESS':
            logger.info(("BCL conversion and demultiplexing process in "
                         "progress for run {}, skipping it".format(run.id)))
            # Upload to statusDB if applies
            if 'statusdb' in CONFIG:
                _upload_to_statusdb(run)
            #this function checks if demux is done
            run.check_run_status()

        # previous elif might change the status to COMPLETED, therefore to avoid skipping
        # a cycle take the last if out of the elif
        if run.get_run_status() == 'COMPLETED':
            logger.info(("Preprocessing of run {} is finished, transferring it".format(run.id)))
            # Upload to statusDB if applies
            if 'statusdb' in CONFIG:
                _upload_to_statusdb(run)
                #notify with a mail run completion and stats uploaded
                msg = """The run {run} has been demultiplexed.
                The Run will be transferred to Irma for further analysis.

                The run is available at : https://genomics-status.scilifelab.se/flowcells/{run}

                """.format(run=run.id)
                run.send_mail(msg, rcp=CONFIG['mail']['recipients'])

            # Copy demultiplex stats file to shared file system for LIMS purpose
            if 'mfs_path' in CONFIG['analysis']:
                try:
                    mfs_dest = os.path.join(CONFIG['analysis']['mfs_path'][run.sequencer_type.lower()],run.id)
                    logger.info('Copying demultiplex stats for run {} to {}'.format(run.id, mfs_dest))
                    if not os.path.exists(mfs_dest):
                        os.mkdir(mfs_dest)
                    demulti_stat_src = os.path.join(run.run_dir, run.demux_dir, 'Reports',
                                                    'html', run.flowcell_id, 'all', 'all', 'all', 'laneBarcode.html')
                    copyfile(demulti_stat_src, os.path.join(mfs_dest, 'laneBarcode.html'))
                except:
                    logger.warn('Could not copy demultiplex stat file for run {}'.format(run.id))

            # Transfer to analysis server if flag is True
            if run.transfer_to_analysis_server:
                mail_recipients = CONFIG.get('mail', {}).get('recipients')
                logger.info('Transferring run {} to {} into {}'
                            .format(run.id,
                                    run.CONFIG['analysis_server']['host'],
                                    run.CONFIG['analysis_server']['sync']['data_archive']))
                run.transfer_run(t_file,  False, mail_recipients) # Do not trigger analysis


            # Archive the run if indicated in the config file
            if 'storage' in CONFIG:
                run.archive_run(CONFIG['storage']['archive_dirs'][run.sequencer_type])
Esempio n. 52
0
 user = getpass.getuser()
 timestamp = datetime.datetime.now()
 result = []
 # parse results
 for job in crontab.crons:
     result.append({'Command': job.command,
                    'Comment': job.comment,
                    'Enabled': job.enabled,
                     'Minute': str(job.minute),
                     'Hour': str(job.hour),
                     'Day of month': str(job.dom),
                     'Month': str(job.month),
                     'Day of week': str(job.dow)})
 # connect to db
 url = "http://{username}:{password}@{url}:{port}".format(
         url=CONFIG.get('statusdb', {}).get('url'),
         username=CONFIG.get('statusdb', {}).get('username'),
         password=CONFIG.get('statusdb', {}).get('password'),
         port=CONFIG.get('statusdb', {}).get('port'))
 logging.info('Connecting to database: {}'.format(CONFIG.get('statusdb', {}).get('url')))
 try:
     couch = couchdb.Server(url)
 except Exception as e:
     logging.error(e)
 else:
     # update document
     crontab_db = couch['cronjobs']
     view = crontab_db.view('server/alias')
     # to be safe
     doc = {}
     # create doc if not exist
Esempio n. 53
0
import logging
import os
import re
import shutil
import time

from datetime import datetime
from multiprocessing import Pool

from statusdb.db import connections as statusdb
from taca.utils.config import CONFIG
from taca.utils import filesystem, misc

logger = logging.getLogger(__name__)

# This is used by many of the functions in this module
finished_run_indicator = CONFIG.get('storage', {}).get('finished_run_indicator',
                                                   'RTAComplete.txt')

def cleanup_nas(days):
    """Will move the finished runs in NASes to nosync directory.

    :param int days: Number of days to consider a run to be old
    """
    for data_dir in CONFIG.get('storage').get('data_dirs'):
        logger.info('Moving old runs in {}'.format(data_dir))
        with filesystem.chdir(data_dir):
            for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]:
                rta_file = os.path.join(run, finished_run_indicator)
                if os.path.exists(rta_file):
                    # 1 day == 60*60*24 seconds --> 86400
                    if os.stat(rta_file).st_mtime < time.time() - (86400 * days):
                        logger.info('Moving run {} to nosync directory'
                                    .format(os.path.basename(run)))
                        shutil.move(run, 'nosync')
                    else:
                        logger.info('{} file exists but is not older than {} day(s), skipping run {}'.format(
                                    finished_run_indicator, str(days), run))
Esempio n. 54
0
def server_status():
    """ Monitor server status """
    if not CONFIG.get('server_status', ''):
        logging.warning("Configuration missing required entries: server_status")