コード例 #1
0
ファイル: sync_run.py プロジェクト: GenomicParisCentre/aozan
def sync(run_id, conf):
    """Synchronize a run.

    Arguments:
        run_id: the run id
        conf: configuration dictionary
    """

    start_time = time.time()
    common.log('INFO', 'Sync step: Starting', conf)

    bcl_data_path = conf[BCL_DATA_PATH_KEY]
    reports_data_base_path = conf[REPORTS_DATA_PATH_KEY]
    output_path = bcl_data_path + '/' + run_id

    # check if rsync exists in PATH
    if not common.exists_in_path("rsync"):
        error("Can't find all needed commands in PATH env var",
              "Can't find all needed commands in PATH env var. Unable to find: rsync command.", conf)
        return False

    # Check if reports_data_path exists
    if not os.path.exists(reports_data_base_path):
        error("Report directory does not exist", "Report directory does not exist: " + reports_data_base_path, conf)
        return False

    # Check if enough space to store reports
    if common.df(reports_data_base_path) < 10 * 1024 * 1024 * 1024:
        error("Not enough disk space to store aozan reports for run " + run_id,
              "Not enough disk space to store aozan reports for run " + run_id +
              '.\nNeed more than 10 Gb on ' + reports_data_base_path + '.', conf)
        return False

    # Do the synchronization
    if not partial_sync(run_id, True, conf):
        return False

    # Rename partial sync directory to final run BCL directory
    if os.path.exists(output_path + '.tmp'):
        os.rename(output_path + '.tmp', output_path)

    # Check used and free space
    df_in_bytes = common.df(bcl_data_path)
    du_in_bytes = common.du(output_path)
    df = df_in_bytes / (1024 * 1024 * 1024)
    du = du_in_bytes / (1024 * 1024 * 1024)

    common.log("WARNING", "Sync step: output disk free after sync: " + str(df_in_bytes), conf)
    common.log("WARNING", "Sync step: space used by sync: " + str(du_in_bytes), conf)

    duration = time.time() - start_time

    msg = 'Ending synchronization for run ' + run_id + '.\n' + \
          'Job finished at ' + common.time_to_human_readable(time.time()) + \
          ' without error in ' + common.duration_to_human_readable(duration) + '.\n\n' + \
          'Run output files (without .cif files) can be found in the following directory:\n  ' + output_path

    # Add path to report if reports.url exists
    if common.is_conf_key_exists(REPORTS_URL_KEY, conf):
        msg += '\n\nRun reports can be found at following location:\n  ' + conf[REPORTS_URL_KEY] + '/' + run_id

    msg += '\n\nFor this task %.2f GB has been used and %.2f GB is still free.' % (du, df)

    common.send_msg('[Aozan] Ending synchronization for run ' + run_id + ' on ' +
                    common.get_instrument_name(run_id, conf), msg, False, conf)
    common.log('INFO', 'sync step: successful in ' + common.duration_to_human_readable(duration), conf)
    return True
コード例 #2
0
ファイル: aozan.py プロジェクト: GenomicParisCentre/aozan
        Common.initLogger(conf[AOZAN_LOG_PATH_KEY], conf[AOZAN_LOG_LEVEL_KEY])
    except AozanException, exp:
        common.exception_msg(exp, conf)

    # Check main path file in configuration
    if not common.check_configuration(conf, conf_file):
        common.log('SEVERE',
                   'Aozan can not be executed. Configuration is invalid or missing, some useful directories ' +
                   'may be inaccessible. ',
                   conf)
        sys.exit(1)

    # check if global program set is available in PATH
    global_program_set = {"bash", "du", "touch", "chmod", "cp", "mv", "rm", "find", "tar"}
    for program in global_program_set:
        if not common.exists_in_path(program):
            common.log('SEVERE',
                  "Can't find all needed commands in PATH env var. Unable to find: " + program + " command.", conf)
            sys.exit(1)

    # Check critical free space available
    hiseq_run.send_mail_if_critical_free_space_available(conf)

    lock_file_path = conf[LOCK_FILE_KEY]

    # Run only if there is no lock
    # if not os.path.exists(lock_file_path):
    if not lock_file_exists(lock_file_path):
        try:
            # Create lock file
            create_lock_file(lock_file_path)
コード例 #3
0
def recompress(run_id, conf):
    """Proceed to recompression of a run.

    Arguments:
        run_id: The run id
        conf: configuration dictionary
    """

    common.log('INFO', 'Recompress step: Starting', conf)

    # Check if input root fastq root data exists
    if not common.is_dir_exists(FASTQ_DATA_PATH_KEY, conf):
        error("FASTQ data directory does not exist",
              "FASTQ data directory does not exist: " + conf[FASTQ_DATA_PATH_KEY], conf)
        return False

    start_time = time.time()
    fastq_input_dir = conf[FASTQ_DATA_PATH_KEY] + '/' + run_id

    # initial du for comparing with ending disk usage
    previous_du_in_bytes = common.du(fastq_input_dir)

    # get information about compression type
    compression_type = conf[RECOMPRESS_COMPRESSION_KEY]
    compression_level = conf[RECOMPRESS_COMPRESSION_LEVEL_KEY]
    compression_info_tuple = get_info_from_file_type(compression_type, compression_level)

    if compression_info_tuple is None:
        error("Unknown compression type",
              "Unknown compression type: " + compression_type, conf)
        return False

    (compression_type_result, output_file_extension, output_compression_command, output_decompression_command,
     compression_level_argument) = compression_info_tuple

    # The following list contains the processed type of files to recompress
    types_to_recompress = ["fastq.gz", "fastq"]

    # list of program to check if exists in path before execution
    program_set = {"bash", "tee", "touch", "chmod", "md5sum", output_compression_command, output_decompression_command}

    # get list of file to process
    input_files = []
    for extension in types_to_recompress:

        input_files.extend(list_files(fastq_input_dir, extension))
        simple_extension = os.path.splitext(extension)[-1][1:]
        extension_info_tuple = get_info_from_file_type(simple_extension)

        if extension_info_tuple is None:
            error("Unknown extension type",
                  "Unknown extension type: " + extension, conf)
            return False

        program_set.add(extension_info_tuple[3])

    # actual program list check
    for program in program_set:
        if not common.exists_in_path(program):
            error("Can't find all needed commands in PATH env var",
                  "Can't find all needed commands in PATH env var. Unable to find: " + program + " command.", conf)
            return False

    # Create executor and for parallelization of processus
    executor = Executors.newFixedThreadPool(int(conf[RECOMPRESS_THREADS_KEY]))
    workers = []

    # process each fastq and fastq.gz recursively in each fastq directory
    for input_file in input_files:

        simple_extension = os.path.splitext(input_file)[-1][1:]

        # get info about the type of input file
        extension_info_tuple = get_info_from_file_type(simple_extension)
        if extension_info_tuple is None:
            error("Unknown extension type",
                  "Unknown extension type: " + simple_extension, conf)
            return False

        input_decompression_command = extension_info_tuple[3]

        # get file base name and create output_file name, if file is already .fastq its ready to be base_input_file
        base_input_file = input_file[0: input_file.index(".fastq") + 6]
        output_file = base_input_file + "." + output_file_extension

        # Skip if the output_file already exists
        if not os.path.exists(output_file):

            # Create worker then execute in thread
            worker = Worker(input_file, output_file, input_decompression_command, output_compression_command,
                            output_decompression_command,
                            compression_level_argument,
                            common.is_conf_value_equals_true(RECOMPRESS_DELETE_ORIGINAL_FASTQ_KEY, conf))
            workers.append(worker)
            executor.execute(worker)

        else:
            common.log("WARNING", "Recompress step: Omitting processing file " + input_file + ". The associated output file " + output_file + " already exists.", conf)

    # Wait for all thread to finish
    executor.shutdown()
    while not executor.isTerminated():
        time.sleep(1)

    # Check if any worker is in error
    for worker in workers:
        if not worker.is_successful():
            error(worker.get_error_message(),
                  worker.get_long_error_message(), conf)
            return False

    # check new disk usage
    df_in_bytes = common.df(fastq_input_dir)
    du_in_bytes = common.du(fastq_input_dir)
    previous_du = previous_du_in_bytes / (1024 * 1024)
    df = df_in_bytes / (1024 * 1024 * 1024)
    du = du_in_bytes / (1024 * 1024)

    common.log("WARNING", "Recompress step: output disk free after step: " + str(df_in_bytes), conf)
    common.log("WARNING", "Recompress step: space previously used: " + str(previous_du_in_bytes), conf)
    common.log("WARNING", "Recompress step: space now used by step: " + str(du_in_bytes), conf)

    duration = time.time() - start_time

    msg = 'Ending recompression for run ' + run_id + '.' + \
          '\nJob finished at ' + common.time_to_human_readable(time.time()) + \
          ' without error in ' + common.duration_to_human_readable(duration) + '. '

    msg += '\n\nAfter recompress step FASTQ folder is now %.2f MB (previously %.2f MB) and %.2f GB still free.' % (
        du, previous_du, df)

    common.send_msg('[Aozan] Ending recompress for run ' + run_id + ' on ' +
                    common.get_instrument_name(run_id, conf), msg, False, conf)
    common.log('INFO', 'Recompress step: successful in ' + common.duration_to_human_readable(duration), conf)
    return True