Example 1
def check_days(site, days, config):
    """Check if 'days' given while running command. If not take the default threshold
    from config file (which should exist). Also when 'days' given on the command line
    raise a check to make sure it was really meant to do so

    :param str site: site to be cleaned and relevent date to pick
    :param int days: number of days to check, will be None if '-d' not used
    :param dict config: config file parsed and saved as dictionary
    """
    try:
        default_days = config["cleanup"][site]["days"]
    except KeyError:
        raise
    if not days:
        return default_days
    elif days >= default_days:
        return days
    else:
        if misc.query_yes_no(
            "Seems like the given days ({}) is less than the "
            "default ({}), are you sure you want to proceed?".format(days, default_days),
            default="no",
        ):
            return days
        else:
            return None
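
A minimal usage sketch (the config layout is taken from the lookup above; the site name "archive" and the values are hypothetical; misc.query_yes_no prompts interactively):

config = {"cleanup": {"archive": {"days": 30}}}  # hypothetical site/threshold
check_days("archive", None, config)  # -> 30, the default from the config
check_days("archive", 45, config)    # -> 45, at or above the default
check_days("archive", 10, config)    # below the default: prompts, returns 10 or None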
Example 2
def check_default(site, seconds, config):
    """Check if time(as seconds) given while running command. If not take the default threshold
    from config file (which should exist). Also when 'days' given on the command line
    raise a check to make sure it was really meant to do so

    :param str site: site to be cleaned and relevent date to pick
    :param int seconds: Days/hours converted as seconds to check
    :param dict config: config file parsed and saved as dictionary
    """
    try:
        default_days = config['cleanup']['milou'][site]['days']
        default_seconds = misc.to_seconds(days=default_days)
    except KeyError:
        raise
    if not seconds:
        return default_seconds
    elif seconds >= default_seconds:
        return seconds
    else:
        if misc.query_yes_no(
                "Seems like the given time is less than the "
                "default ({}) days, are you sure you want to proceed?".format(
                    default_days),
                default="no"):
            return seconds
        else:
            return None
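
A similar sketch for the seconds-based variant (assuming misc.to_seconds converts days to seconds, as the call above implies; site name and values hypothetical):

config = {'cleanup': {'milou': {'archive': {'days': 30}}}}  # hypothetical
check_default('archive', None, config)                      # -> 30 days as seconds
check_default('archive', misc.to_seconds(days=45), config)  # -> 45 days as seconds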
Example 3
def check_default(site, seconds, config):
    """Check if time(as seconds) given while running command. If not take the default threshold
    from config file (which should exist). Also when 'days' given on the command line
    raise a check to make sure it was really meant to do so

    :param str site: site to be cleaned and relevent date to pick
    :param int seconds: Days/hours converted as seconds to check
    :param dict config: config file parsed and saved as dictionary
    """
    try:
        default_days = config['cleanup']['milou'][site]['days']
        default_seconds = misc.to_seconds(days=default_days)
    except KeyError:
        raise
    if not seconds:
        return default_seconds
    elif seconds >= default_seconds:
        return seconds
    else:
        if misc.query_yes_no("Seems like given time is less than the "
                             " default({}) days, are you sure to proceed ?"
                             .format(default_days), default="no"):
            return seconds
        else:
            return None
Example 4
def check_days(site, days, config):
    """Check if 'days' given while running command. If not take the default threshold
    from config file (which should exist). Also when 'days' given on the command line
    raise a check to make sure it was really meant to do so

    :param str site: site to be cleaned and relevent date to pick
    :param int days: number of days to check, will be None if '-d' not used
    :param dict config: config file parsed and saved as dictionary
    """
    try:
        default_days = config['cleanup'][site]['days']
    except KeyError:
        raise
    if not days:
        return default_days
    elif days >= default_days:
        return days
    else:
        if misc.query_yes_no("Seems like given days({}) is less than the "
                             " default({}), are you sure to proceed ?"
                             .format(days,default_days), default="no"):
            return days
        else:
            return None
Example 5
def cleanup_irma(days_fastq,
                 days_analysis,
                 only_fastq,
                 only_analysis,
                 clean_undetermined,
                 status_db_config,
                 exclude_projects,
                 list_only,
                 date,
                 dry_run=False):
    """Remove fastq/analysis data for projects that have been closed more than given 
    days (as days_fastq/days_analysis) from the given 'irma' cluster

    :param int days_fastq: Days to consider to remove fastq files for project
    :param int days_analysis: Days to consider to remove analysis data for project
    :param bool only_fastq: Remove only fastq files for closed projects
    :param bool only_analysis: Remove only analysis data for closed projects
    :param bool dry_run: Will summarize what is going to be done without really doing it
    
    Example for mat for config file
    cleanup:
        irma:
            flowcell:
                ##this path is nothing but incoming directory, can given multiple paths
                root: 
                    - path/to/flowcells_dir
                relative_project_source: Demultiplexing
                undet_file_pattern: "Undetermined_*.fastq.gz"
    
            ##this is path where projects are organized
            data_dir: path/to/data_dir
            analysis:
                ##directory where analysis are perfoemed for projects
                root: path/to/analysis_dir
                #should be exactly same as the qc folder name and files wished to be removed
                files_to_remove:
                    piper_ngi: 
                        - "*.bam"
    """
    try:
        config = CONFIG['cleanup']['irma']
        flowcell_dir_root = config['flowcell']['root']
        flowcell_project_source = config['flowcell']['relative_project_source']
        flowcell_undet_files = config['flowcell']['undet_file_pattern']
        data_dir = config['data_dir']
        analysis_dir = config['analysis']['root']
        analysis_data_to_remove = config['analysis']['files_to_remove']
        if date:
            date = datetime.strptime(date, '%Y-%m-%d')
    except KeyError as e:
        logger.error(
            "Config file is missing the key {}, make sure it has all the required information"
            .format(str(e)))
        raise SystemExit
    except ValueError:
        logger.error(
            "Date given with '--date' option is not in the required format, see help for more info"
        )
        raise SystemExit

    # make a connection for project db #
    pcon = statusdb.ProjectSummaryConnection(conf=status_db_config)
    assert pcon, "Could not connect to project database in StatusDB"

    # make exclude project list if provided
    exclude_list = []
    if exclude_projects:
        if os.path.isfile(exclude_projects):
            with open(exclude_projects, 'r') as in_file:
                exclude_list.extend([p.strip() for p in in_file.readlines()])
        else:
            exclude_list.extend(exclude_projects.split(','))
        # sanity check that the projects to exclude are valid
        invalid_projects = filter(
            lambda p: p not in pcon.id_view.keys() and p not in pcon.name_view.keys(),
            exclude_list)
        if invalid_projects:
            logger.error(
                "'--exclude_projects' was called with some invalid projects '{}', "
                "provide valid project name/id".format(
                    ",".join(invalid_projects)))
            raise SystemExit

    # compile the list of projects to delete
    project_clean_list, project_processed_list = ({}, [])
    if not list_only and not clean_undetermined:
        logger.info("Building initial project list for removing data..")
    if only_fastq:
        logger.info(
            "Option 'only_fastq' is given, so will not look for analysis data")
    elif only_analysis:
        logger.info(
            "Option 'only_analysis' is given, so will not look for fastq data")

    if clean_undetermined:
        all_undet_files = []
        for flowcell_dir in flowcell_dir_root:
            for fc in [
                    d for d in os.listdir(flowcell_dir)
                    if re.match(filesystem.RUN_RE, d)
            ]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    if not os.path.exists(flowcell_project_source):
                        logger.warn(
                            "Flowcell {} does not contain a '{}' directory".format(
                                fc, flowcell_project_source))
                        continue
                    projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
                                      if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$',d) and \
                                      not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]
                    # the above check looked for project directories that are not yet cleaned,
                    # so if none were found there is either no project directory at all or
                    # every project directory is already cleaned; then the undetermined
                    # files can be removed
                    if len(projects_in_fc) > 0:
                        continue
                    fc_undet_files = glob(
                        os.path.join(flowcell_project_source,
                                     flowcell_undet_files))
                    if fc_undet_files:
                        logger.info(
                            "All projects were cleaned for FC {}, found {} undetermined files"
                            .format(fc, len(fc_undet_files)))
                        all_undet_files.extend(
                            map(os.path.abspath, fc_undet_files))
        if all_undet_files:
            undet_size = _def_get_size_unit(
                sum(map(os.path.getsize, all_undet_files)))
            if misc.query_yes_no(
                    "In total found {} undetermined files which are {} in size, delete now?"
                    .format(len(all_undet_files), undet_size),
                    default="no"):
                removed = _remove_files(all_undet_files)
        return
    elif only_analysis:
        for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \
                    not os.path.exists(os.path.join(analysis_dir, d, "cleaned"))]:
            proj_abs_path = os.path.join(analysis_dir, pid)
            proj_info = get_closed_proj_info(
                pid, pcon.get_entry(pid, use_id_view=True), date)
            if proj_info and proj_info['closed_days'] >= days_analysis:
                # move on if this project has to be excluded
                if proj_info['name'] in exclude_list or proj_info[
                        'pid'] in exclude_list:
                    continue
                analysis_data, analysis_size = collect_analysis_data_irma(
                    pid, analysis_dir, analysis_data_to_remove)
                proj_info['analysis_to_remove'] = analysis_data
                proj_info['analysis_size'] = analysis_size
                proj_info['fastq_to_remove'] = "not_selected"
                proj_info['fastq_size'] = 0
                project_clean_list[proj_info['name']] = proj_info
    else:
        for flowcell_dir in flowcell_dir_root:
            for fc in [
                    d for d in os.listdir(flowcell_dir)
                    if re.match(filesystem.RUN_RE, d)
            ]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    if not os.path.exists(flowcell_project_source):
                        logger.warn(
                            "Flowcell {} do not contain a '{}' direcotry".
                            format(fc, flowcell_project_source))
                        continue
                    projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
                                      if re.match(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$',d) and \
                                      not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]
                    for _proj in projects_in_fc:
                        proj = re.sub(r'_+', '.', _proj, 1)
                        # if a project was already processed there is no need to fetch it again from statusdb
                        if proj in project_processed_list:
                            # if the project has been closed longer than the threshold, collect the fastq files from this FC;
                            # analysis data was already collected the first time, so it is not gathered again
                            if proj in project_clean_list and project_clean_list[
                                    proj]['closed_days'] >= days_fastq:
                                fc_fq_files, fq_size = collect_fastq_data_irma(
                                    fc_abs_path,
                                    os.path.join(flowcell_project_source,
                                                 _proj))
                                project_clean_list[proj]['fastq_to_remove'][
                                    'flowcells'][fc] = fc_fq_files[
                                        'flowcells'][fc]
                                project_clean_list[proj][
                                    'fastq_size'] += fq_size
                            continue
                        project_processed_list.append(proj)
                        # by default assume all projects are not old enough for deletion
                        fastq_data, analysis_data = ("young", "young")
                        fastq_size, analysis_size = (0, 0)
                        proj_info = get_closed_proj_info(
                            proj, pcon.get_entry(proj), date)
                        if proj_info:
                            # move on if this project has to be excluded
                            if proj_info['name'] in exclude_list or proj_info[
                                    'pid'] in exclude_list:
                                continue
                            # if the project is old enough, collect its fastq files from the FC and the data dir
                            if proj_info['closed_days'] >= days_fastq:
                                fastq_data, fastq_size = collect_fastq_data_irma(
                                    fc_abs_path,
                                    os.path.join(flowcell_project_source,
                                                 _proj), data_dir,
                                    proj_info['pid'])
                            if not only_fastq:
                                # when not 'only_fastq', also try to collect analysis files if old enough
                                if proj_info['closed_days'] >= days_analysis:
                                    analysis_data, analysis_size = collect_analysis_data_irma(
                                        proj_info['pid'], analysis_dir,
                                        analysis_data_to_remove)
                                # if both fastq and analysis files are not old enough move on
                                if (analysis_data == fastq_data) or (
                                    (not analysis_data or analysis_data
                                     == "cleaned") and fastq_data == "young"):
                                    continue
                            elif fastq_data == "young":
                                continue
                            else:
                                analysis_data = "not_selected"
                            proj_info['fastq_to_remove'] = fastq_data
                            proj_info['fastq_size'] = fastq_size
                            proj_info['analysis_to_remove'] = analysis_data
                            proj_info['analysis_size'] = analysis_size
                            project_clean_list[proj] = proj_info

    if not project_clean_list:
        logger.info("There are no projects to clean")
        return

    # list only the project and exit if 'list_only' option is selected
    if list_only:
        print "Project ID\tProject Name\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size"
        for p_info in sorted(project_clean_list.values(),
                             key=lambda d: d['closed_days'],
                             reverse=True):
            print "\t".join([
                p_info['pid'], p_info['name'], p_info['bioinfo_responsible'],
                str(p_info['closed_days']), p_info['closed_date'],
                _def_get_size_unit(p_info['fastq_size']),
                _def_get_size_unit(p_info['analysis_size'])
            ])
        raise SystemExit

    logger.info("Initial list is built with {} projects {}".format(
        len(project_clean_list), get_files_size_text(project_clean_list)))
    if misc.query_yes_no("Interactively filter projects for cleanup ?",
                         default="yes"):
        filtered_project, proj_count = ([], 0)
        # go through the compiled project list and confirm each project
        for proj, info in project_clean_list.iteritems():
            proj_count += 1
            if not misc.query_yes_no(
                    "{}Delete files for this project ({}/{})".format(
                        get_proj_meta_info(info, days_fastq), proj_count,
                        len(project_clean_list)),
                    default="no"):
                logger.info(
                    "Will not remove files for project {}".format(proj))
                filtered_project.append(proj)
        # remove projects that were decided not to delete
        map(project_clean_list.pop, filtered_project)
        logger.info("Removed {}/{} projects from initial list".format(
            len(filtered_project), proj_count))
        if not project_clean_list:
            logger.info("There are no projects to clean after filtering")
            return
        logger.info("Final list is created with {} projects {}".format(
            len(project_clean_list), get_files_size_text(project_clean_list)))
        if not misc.query_yes_no("Proceed with cleanup ?", default="no"):
            logger.info("Aborting cleanup")
            return
    logger.info("Will start cleaning up project now")

    for proj, info in project_clean_list.iteritems():
        fastq_info = info.get('fastq_to_remove')
        if fastq_info and isinstance(fastq_info, dict):
            logger.info("Cleaning fastq files for project {}".format(proj))
            fastq_fc = fastq_info.get('flowcells', {})
            removed_fc = []
            for fc, fc_info in fastq_fc.iteritems():
                proj_fc_root = fc_info['proj_root']
                logger.info(
                    "Removing fastq files from {}".format(proj_fc_root))
                if not dry_run:
                    if _remove_files(fc_info['fq_files']):
                        logger.info(
                            "Removed fastq files from FC {} for project {}, marking it as cleaned"
                            .format(fc, proj))
                        _touch_cleaned(proj_fc_root)
                        removed_fc.append(fc)
            if len(fastq_fc) == len(removed_fc):
                try:
                    proj_data_root = fastq_info['proj_data']['proj_data_root']
                    logger.info(
                        "All flowcells cleaned for this project, marking it as cleaned in {}"
                        .format(proj_data_root))
                    _touch_cleaned(proj_data_root)
                except:
                    pass

        analysis_info = info.get('analysis_to_remove')
        if analysis_info and isinstance(analysis_info, dict):
            proj_analysis_root = analysis_info['proj_analysis_root']
            logger.info("cleaning analysis data for project {}".format(proj))
            removed_qc = []
            for qc, files in analysis_info['analysis_files'].iteritems():
                logger.info("Removing files of '{}' from {}".format(
                    qc, proj_analysis_root))
                if not dry_run:
                    if _remove_files(files):
                        removed_qc.append(qc)
                    else:
                        logger.warn(
                            "Couldn't remove some files in qc directory '{}'".
                            format(qc))
            map(analysis_info['analysis_files'].pop, removed_qc)
            if len(analysis_info['analysis_files']) == 0:
                logger.info(
                    "Removed analysis data for project {}, marking it cleaned".
                    format(proj))
                _touch_cleaned(proj_analysis_root)
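
The helpers _remove_files and _touch_cleaned are not shown on this page. From the way they are called (a truthy return on success, and the os.path.exists(..., "cleaned") checks above), a minimal sketch could look like this (an assumption, not the repository's actual implementation):

import os

def _remove_files(files):
    # remove each file; report success only if every removal worked
    ok = True
    for f in files:
        try:
            os.remove(f)
        except OSError:
            ok = False
    return ok

def _touch_cleaned(path):
    # drop an empty 'cleaned' marker file so the directory is skipped on later runs
    open(os.path.join(path, "cleaned"), "a").close()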
Example 6
def cleanup_irma(days_fastq, days_analysis, only_fastq, only_analysis, clean_undetermined, status_db_config, exclude_projects, list_only, date, dry_run=False):
    """Remove fastq/analysis data for projects that have been closed more than given 
    days (as days_fastq/days_analysis) from the given 'irma' cluster

    :param int days_fastq: Days to consider to remove fastq files for project
    :param int days_analysis: Days to consider to remove analysis data for project
    :param bool only_fastq: Remove only fastq files for closed projects
    :param bool only_analysis: Remove only analysis data for closed projects
    :param bool dry_run: Will summarize what is going to be done without really doing it
    
    Example for mat for config file
    cleanup:
        irma:
            flowcell:
                ##this path is nothing but incoming directory, can given multiple paths
                root: 
                    - path/to/flowcells_dir
                relative_project_source: Demultiplexing
                undet_file_pattern: "Undetermined_*.fastq.gz"
    
            ##this is path where projects are organized
            data_dir: path/to/data_dir
            analysis:
                ##directory where analysis are perfoemed for projects
                root: path/to/analysis_dir
                #should be exactly same as the qc folder name and files wished to be removed
                files_to_remove:
                    piper_ngi: 
                        - "*.bam"
    """
    try:
        config = CONFIG['cleanup']['irma']
        flowcell_dir_root = config['flowcell']['root']
        flowcell_project_source = config['flowcell']['relative_project_source']
        flowcell_undet_files = config['flowcell']['undet_file_pattern']
        data_dir = config['data_dir']
        analysis_dir = config['analysis']['root']
        analysis_data_to_remove = config['analysis']['files_to_remove']
        if date:
            date = datetime.strptime(date, '%Y-%m-%d')
    except KeyError as e:
        logger.error("Config file is missing the key {}, make sure it have all required information".format(str(e)))
        raise SystemExit
    except ValueError:
        logger.error("Date given with '--date' option is not in the required format, see help for more info")
        raise SystemExit

    # make a connection for project db #
    pcon = statusdb.ProjectSummaryConnection(conf=status_db_config)
    assert pcon, "Could not connect to project database in StatusDB"
    
    # make exclude project list if provided
    exclude_list = []
    if exclude_projects:
        if os.path.isfile(exclude_projects):
            with open(exclude_projects, 'r') as in_file:
                exclude_list.extend([p.strip() for p in in_file.readlines()])
        else:
            exclude_list.extend(exclude_projects.split(','))
        # sanity check that the projects to exclude are valid
        invalid_projects = filter(lambda p: p not in pcon.id_view.keys() and p not in pcon.name_view.keys(), exclude_list)
        if invalid_projects:
            logger.error("'--exclude_projects' was called with some invalid projects '{}', "
                         "provide valid project name/id".format(",".join(invalid_projects)))
            raise SystemExit

    # compile the list of projects to delete
    project_clean_list, project_processed_list = ({}, [])
    if not list_only and not clean_undetermined:
        logger.info("Building initial project list for removing data..")
    if only_fastq:
        logger.info("Option 'only_fastq' is given, so will not look for analysis data")
    elif only_analysis:
        logger.info("Option 'only_analysis' is given, so will not look for fastq data")
    
    if clean_undetermined:
        all_undet_files = []
        for flowcell_dir in flowcell_dir_root:
            for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE,d)]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    if not os.path.exists(flowcell_project_source):
                        logger.warn("Flowcell {} do not contain a '{}' direcotry".format(fc, flowcell_project_source))
                        continue
                    projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
                                      if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$',d) and \
                                      not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]
                    # the above check looked for project directories that are not yet cleaned,
                    # so if none were found there is either no project directory at all or
                    # every project directory is already cleaned; then the undetermined files can be removed
                    if len(projects_in_fc) > 0:
                        continue
                    fc_undet_files = glob(os.path.join(flowcell_project_source,flowcell_undet_files))
                    if fc_undet_files:
                        logger.info("All projects was cleaned for FC {}, found {} undeterminded files".format(fc,len(fc_undet_files)))
                        all_undet_files.extend(map(os.path.abspath, fc_undet_files))
        if all_undet_files:
            undet_size = _def_get_size_unit(sum(map(os.path.getsize, all_undet_files)))
            if misc.query_yes_no("In total found {} undetermined files which are {} in size, delete now ?".format(len(all_undet_files),
                                 undet_size), default="no"):
                    removed = _remove_files(all_undet_files)
        return
    elif only_analysis:
        for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \
                    not os.path.exists(os.path.join(analysis_dir, d, "cleaned"))]:
            proj_abs_path = os.path.join(analysis_dir, pid)
            proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True), date)
            if proj_info and proj_info['closed_days'] >= days_analysis:
                # move on if this project has to be excluded
                if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list:
                    continue
                analysis_data, analysis_size = collect_analysis_data_irma(pid, analysis_dir, analysis_data_to_remove)
                proj_info['analysis_to_remove'] = analysis_data
                proj_info['analysis_size'] = analysis_size
                proj_info['fastq_to_remove'] = "not_selected"
                proj_info['fastq_size'] = 0
                project_clean_list[proj_info['name']] = proj_info
    else:
        for flowcell_dir in flowcell_dir_root:
            for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE,d)]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    if not os.path.exists(flowcell_project_source):
                        logger.warn("Flowcell {} do not contain a '{}' direcotry".format(fc, flowcell_project_source))
                        continue
                    projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
                                      if re.match(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$',d) and \
                                      not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]
                    for _proj in projects_in_fc:
                        proj = re.sub(r'_+', '.', _proj, 1)
                        # if a project was already processed there is no need to fetch it again from statusdb
                        if proj in project_processed_list:
                            # if the project has been closed longer than the threshold, collect the fastq files from this FC;
                            # analysis data was already collected the first time, so it is not gathered again
                            if proj in project_clean_list and project_clean_list[proj]['closed_days'] >= days_fastq:
                                fc_fq_files, fq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj))
                                project_clean_list[proj]['fastq_to_remove']['flowcells'][fc] = fc_fq_files['flowcells'][fc]
                                project_clean_list[proj]['fastq_size'] += fq_size
                            continue
                        project_processed_list.append(proj)
                        # by default assume all projects are not old enough for deletion
                        fastq_data, analysis_data = ("young", "young")
                        fastq_size, analysis_size = (0, 0)
                        proj_info = get_closed_proj_info(proj, pcon.get_entry(proj), date)
                        if proj_info:
                            # move on if this project has to be excluded
                            if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list:
                                continue
                            # if the project is old enough, collect its fastq files from the FC and the data dir
                            if proj_info['closed_days'] >= days_fastq:
                                fastq_data, fastq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj),
                                                                                 data_dir, proj_info['pid'])
                            if not only_fastq:
                                # when not 'only_fastq', also try to collect analysis files if old enough
                                if proj_info['closed_days'] >= days_analysis:
                                    analysis_data, analysis_size = collect_analysis_data_irma(proj_info['pid'], analysis_dir, analysis_data_to_remove)
                                # if both fastq and analysis files are not old enough move on
                                if (analysis_data == fastq_data) or ((not analysis_data or analysis_data == "cleaned") and fastq_data == "young"):
                                    continue
                            elif fastq_data == "young":
                                continue
                            else:
                                analysis_data = "not_selected"
                            proj_info['fastq_to_remove'] = fastq_data
                            proj_info['fastq_size'] = fastq_size
                            proj_info['analysis_to_remove'] = analysis_data
                            proj_info['analysis_size'] = analysis_size
                            project_clean_list[proj] = proj_info
    
    if not project_clean_list:
        logger.info("There are no projects to clean")
        return
    
    # list only the project and exit if 'list_only' option is selected
    if list_only:
        print "Project ID\tProject Name\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size"
        for p_info in sorted(project_clean_list.values(), key=lambda d: d['closed_days'], reverse=True):
            print "\t".join([p_info['name'], p_info['pid'], p_info['bioinfo_responsible'],
                             str(p_info['closed_days']), p_info['closed_date'],
                             _def_get_size_unit(p_info['fastq_size']), _def_get_size_unit(p_info['analysis_size'])])
        raise SystemExit

    logger.info("Initial list is built with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
    if misc.query_yes_no("Interactively filter projects for cleanup?", default="yes"):
        filtered_project, proj_count = ([], 0)
        # go through the compiled project list and confirm each project
        for proj, info in project_clean_list.iteritems():
            proj_count += 1
            if not misc.query_yes_no("{}Delete files for this project ({}/{})".format(get_proj_meta_info(info, days_fastq),
                   proj_count, len(project_clean_list)), default="no"):
                logger.info("Will not remove files for project {}".format(proj))
                filtered_project.append(proj)
        # remove projects that were decided not to delete
        map(project_clean_list.pop, filtered_project)
        logger.info("Removed {}/{} projects from initial list".format(len(filtered_project), proj_count))
        if not project_clean_list:
            logger.info("There are no projects to clean after filtering")
            return
        logger.info("Final list is created with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
        if not misc.query_yes_no("Proceed with cleanup ?", default="no"):
            logger.info("Aborting cleanup")
            return
    logger.info("Will start cleaning up project now")
    
    for proj, info in project_clean_list.iteritems():
        fastq_info = info.get('fastq_to_remove')
        if fastq_info and isinstance(fastq_info, dict):
            logger.info("Cleaning fastq files for project {}".format(proj))
            fastq_fc = fastq_info.get('flowcells', {})
            removed_fc = []
            for fc, fc_info in fastq_fc.iteritems():
                proj_fc_root = fc_info['proj_root']
                logger.info("Removing fastq files from {}".format(proj_fc_root))
                if not dry_run:
                    if _remove_files(fc_info['fq_files']):
                        logger.info("Removed fastq files from FC {} for project {}, marking it as cleaned".format(fc, proj))
                        _touch_cleaned(proj_fc_root)
                        removed_fc.append(fc)
            if len(fastq_fc) == len(removed_fc):
                try:
                    proj_data_root = fastq_info['proj_data']['proj_data_root']
                    logger.info("All flowcells cleaned for this project, marking it as cleaned in {}".format(proj_data_root))
                    _touch_cleaned(proj_data_root)
                except:
                    pass
            
        analysis_info = info.get('analysis_to_remove')
        if analysis_info and isinstance(analysis_info, dict):
            proj_analysis_root = analysis_info['proj_analysis_root']
            logger.info("cleaning analysis data for project {}".format(proj))
            removed_qc = []
            for qc, files in analysis_info['analysis_files'].iteritems():
                logger.info("Removing files of '{}' from {}".format(qc, proj_analysis_root))
                if not dry_run:
                    if _remove_files(files):
                        removed_qc.append(qc)
                    else:
                        logger.warn("Couldn't remove some files in qc directory '{}'".format(qc))
            map(analysis_info['analysis_files'].pop, removed_qc)
            if len(analysis_info['analysis_files']) == 0:
                logger.info("Removed analysis data for project {}, marking it cleaned".format(proj))
                _touch_cleaned(proj_analysis_root)
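
Note that these examples are Python 2: print is a statement, dicts have iteritems(), and filter()/map() return lists, so the truthiness test on invalid_projects and the side-effecting map(project_clean_list.pop, filtered_project) both work. Under Python 3 the equivalents would be, for instance:

invalid_projects = [p for p in exclude_list
                    if p not in pcon.id_view and p not in pcon.name_view]
for proj in filtered_project:
    project_clean_list.pop(proj)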
Example 7
def cleanup_irma(days_fastq, days_analysis, only_fastq, only_analysis, status_db_config, dry_run=False):
    """Remove fastq/analysis data for projects that have been closed more than given 
    days (as days_fastq/days_analysis) from the given 'irma' cluster

    :param int days_fastq: Days to consider to remove fastq files for project
    :param int days_analysis: Days to consider to remove analysis data for project
    :param bool only_fastq: Remove only fastq files for closed projects
    :param bool only_analysis: Remove only analysis data for closed projects
    :param bool dry_run: Will summarize what is going to be done without really doing it
    
    Example for mat for config file
    cleanup:
        irma:
            flowcell:
                ##this path is nothing but incoming directory, can given multiple paths
                root: 
                    - path/to/flowcells_dir
                relative_project_source: Demultiplexing
    
            ##this is path where projects are organized
            data_dir: path/to/data_dir
            analysis:
                ##directory where analysis are perfoemed for projects
                root: path/to/analysis_dir
                #should be exactly same as the qc folder name and files wished to be removed
                files_to_remove:
                    piper_ngi: 
                        - "*.bam"
    """
    try:
        config = CONFIG['cleanup']['irma']
        flowcell_dir_root = config['flowcell']['root']
        flowcell_project_source = config['flowcell']['relative_project_source']
        data_dir = config['data_dir']
        analysis_dir = config['analysis']['root']
        analysis_data_to_remove = config['analysis']['files_to_remove']
    except KeyError as e:
        logger.error("Config file is missing the key {}, make sure it have all required information".format(str(e)))
        raise SystemExit
    
    # make a connection for project db #
    pcon = statusdb.ProjectSummaryConnection(conf=status_db_config)
    assert pcon, "Could not connect to project database in StatusDB"

    # compile the list of projects to delete
    project_clean_list, project_processed_list = ({}, [])
    logger.info("Building initial project list for removing data..")
    if only_fastq:
        logger.info("Option 'only_fastq' is given, so will not look for analysis data")
    elif only_analysis:
        logger.info("Option 'only_analysis' is given, so will not look for fastq data")
     
    if only_analysis:
        for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \
                    not os.path.exists(os.path.join(analysis_dir, d, "cleaned"))]:
            proj_abs_path = os.path.join(analysis_dir, pid)
            proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True))
            if proj_info and proj_info['closed_days'] >= days_analysis:
                analysis_data, analysis_size = collect_analysis_data_irma(pid, analysis_dir, analysis_data_to_remove)
                proj_info['analysis_to_remove'] = analysis_data
                proj_info['analysis_size'] = analysis_size
                proj_info['fastq_to_remove'] = "not_selected"
                proj_info['fastq_size'] = 0
                project_clean_list[proj_info['name']] = proj_info
    else:
        for flowcell_dir in flowcell_dir_root:
            for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE,d)]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
                                      if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$',d) and \
                                      not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]
                    for _proj in projects_in_fc:
                        proj = re.sub(r'_+', '.', _proj, 1)
                        # if a project was already processed there is no need to fetch it again from statusdb
                        if proj in project_processed_list:
                            # if the project has been closed longer than the threshold, collect the fastq files from this FC;
                            # analysis data was already collected the first time, so it is not gathered again
                            if proj in project_clean_list and project_clean_list[proj]['closed_days'] >= days_fastq:
                                fc_fq_files, fq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj))
                                project_clean_list[proj]['fastq_to_remove']['flowcells'][fc] = fc_fq_files['flowcells'][fc]
                                project_clean_list[proj]['fastq_size'] += fq_size
                            continue
                        project_processed_list.append(proj)
                        # by default assume all projects are not old enough for deletion
                        fastq_data, analysis_data = ("young", "young")
                        fastq_size, analysis_size = (0, 0)
                        proj_info = get_closed_proj_info(proj, pcon.get_entry(proj))
                        if proj_info:
                            # if the project is old enough, collect its fastq files from the FC and the data dir
                            if proj_info['closed_days'] >= days_fastq:
                                fastq_data, fastq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj),
                                                                                 data_dir, proj_info['pid'])
                            if not only_fastq:
                                # when not 'only_fastq', also try to collect analysis files if old enough
                                if proj_info['closed_days'] >= days_analysis:
                                    analysis_data, analysis_size = collect_analysis_data_irma(proj_info['pid'], analysis_dir, analysis_data_to_remove)
                                # if both fastq and analysis files are not old enough move on
                                if (analysis_data == fastq_data) or ((not analysis_data or analysis_data == "cleaned") and fastq_data == "young"):
                                    continue
                            elif fastq_data == "young":
                                continue
                            else:
                                analysis_data = "not_selected"
                            proj_info['fastq_to_remove'] = fastq_data
                            proj_info['fastq_size'] = fastq_size
                            proj_info['analysis_to_remove'] = analysis_data
                            proj_info['analysis_size'] = analysis_size
                            project_clean_list[proj] = proj_info
    
    if not project_clean_list:
        logger.info("There are no projects to clean")
        return
                    
    logger.info("Initial list is built with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
    if misc.query_yes_no("Interactively filter projects for cleanup?", default="yes"):
        filtered_project, proj_count = ([], 0)
        # go through the compiled project list and confirm each project
        for proj, info in project_clean_list.iteritems():
            proj_count += 1
            if not misc.query_yes_no("{}Delete files for this project ({}/{})".format(get_proj_meta_info(info, days_fastq),
                   proj_count, len(project_clean_list)), default="no"):
                logger.info("Will not remove files for project {}".format(proj))
                filtered_project.append(proj)
        # remove projects that were decided not to delete
        map(project_clean_list.pop, filtered_project)
        logger.info("Removed {}/{} projects from initial list".format(len(filtered_project), proj_count))
        if not project_clean_list:
            logger.info("There are no projects to clean after filtering")
            return
        logger.info("Final list is created with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
        if not misc.query_yes_no("Proceed with cleanup ?", default="no"):
            logger.info("Aborting cleanup")
            return
    logger.info("Will start cleaning up project now")
    
    for proj, info in project_clean_list.iteritems():
        fastq_info = info.get('fastq_to_remove')
        if fastq_info and isinstance(fastq_info, dict):
            logger.info("Cleaning fastq files for project {}".format(proj))
            fastq_fc = fastq_info.get('flowcells', {})
            removed_fc = []
            for fc, fc_info in fastq_fc.iteritems():
                proj_fc_root = fc_info['proj_root']
                logger.info("Removing fastq files from {}".format(proj_fc_root))
                if not dry_run:
                    if _remove_files(fc_info['fq_files']):
                        logger.info("Removed fastq files from FC {} for project {}, marking it as cleaned".format(fc, proj))
                        _touch_cleaned(proj_fc_root)
                        removed_fc.append(fc)
            if len(fastq_fc) == len(removed_fc):
                try:
                    proj_data_root = fastq_info['proj_data']['proj_data_root']
                    logger.info("All flowcells cleaned for this project, marking it as cleaned in {}".format(proj_data_root))
                    _touch_cleaned(proj_data_root)
                except:
                    pass
            
        analysis_info = info.get('analysis_to_remove')
        if analysis_info and isinstance(analysis_info, dict):
            proj_analysis_root = analysis_info['proj_analysis_root']
            logger.info("cleaning analysis data for project {}".format(proj))
            removed_qc = []
            for qc, files in analysis_info['analysis_files'].iteritems():
                logger.info("Removing files of '{}' from {}".format(qc, proj_analysis_root))
                if not dry_run:
                    if _remove_files(files):
                        removed_qc.append(qc)
                    else:
                        logger.warn("Couldn't remove some files in qc directory '{}'".format(qc))
            map(analysis_info['analysis_files'].pop, removed_qc)
            if len(analysis_info['analysis_files']) == 0:
                logger.info("Removed analysis data for project {}, marking it cleaned".format(proj))
                _touch_cleaned(proj_analysis_root)
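
A dry-run invocation sketch for this variant (the thresholds and the config path are hypothetical; CONFIG and StatusDB access are assumed to be set up):

cleanup_irma(days_fastq=60, days_analysis=90,
             only_fastq=False, only_analysis=False,
             status_db_config='path/to/statusdb_config.yaml',  # hypothetical path
             dry_run=True)  # summarize only; nothing is deleted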
Example 8
 # method of a unittest.TestCase subclass; the mock_raw_input argument implies a
 # decorator such as @mock.patch('__builtin__.raw_input', return_value='no')
 # was stripped from this snippet (assumed)
 def test_query_yes_no_false(self, mock_raw_input):
     """Return False from answer no."""
     response = misc.query_yes_no('Some question')
     self.assertFalse(response)
Example 9
 # as above, a patch such as @mock.patch('__builtin__.raw_input', return_value='yes')
 # is assumed to have been stripped from this snippet
 def test_query_yes_no_true(self, mock_raw_input):
     """Return True from answer yes."""
     response = misc.query_yes_no('Some question')
     self.assertTrue(response)
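
For context, misc.query_yes_no itself is not shown on this page; from its usage a minimal Python 2 sketch might be (assumed, not the actual implementation):

def query_yes_no(question, default="yes"):
    # keep asking until the answer (or the default on empty input) is yes or no
    valid = {"yes": True, "y": True, "no": False, "n": False}
    prompt = " [Y/n] " if default == "yes" else " [y/N] "
    while True:
        choice = raw_input(question + prompt).strip().lower()
        if not choice and default is not None:
            return valid[default]
        if choice in valid:
            return valid[choice]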