Example #1
import os
import sys
from site import addsitedir

import support  # SPAdes-internal helpers (error/warning, sys_call, which)


def compress_dataset_files(dataset_data, ext_python_modules_home, max_threads, log):
    log.info("\n== Compressing corrected reads (with gzip)")
    # Collect every corrected-reads file and rewrite each library's file
    # lists to point at the future .gz names.
    to_compress = []
    for reads_library in dataset_data:
        for key, value in reads_library.items():
            if key.endswith('reads'):
                compressed_reads_filenames = []
                for reads_file in value:
                    if not os.path.isfile(reads_file):
                        support.error('something went wrong and file with corrected reads (' + reads_file + ') is missing!', log)
                    to_compress.append(reads_file)
                    compressed_reads_filenames.append(reads_file + ".gz")
                reads_library[key] = compressed_reads_filenames
    if to_compress:
        # Prefer pigz (parallel gzip) if it is on PATH: one multi-threaded
        # process per file.
        pigz_path = support.which('pigz')
        if pigz_path:
            for reads_file in to_compress:
                support.sys_call([pigz_path, '-f', '-7', '-p', str(max_threads), reads_file], log)
        else:
            # No pigz: fall back to plain gzip, parallelised across files
            # with the joblib copies bundled under ext_python_modules_home.
            addsitedir(ext_python_modules_home)
            if sys.version.startswith('2.'):
                from joblib2 import Parallel, delayed
            elif sys.version.startswith('3.'):
                from joblib3 import Parallel, delayed
            n_jobs = min(len(to_compress), max_threads)
            outputs = Parallel(n_jobs=n_jobs)(delayed(support.sys_call)(['gzip', '-f', '-7', reads_file]) for reads_file in to_compress)
            for output in outputs:
                if output:
                    log.info(output)
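
For context, a minimal, hypothetical invocation (not taken from the SPAdes sources): the dataset layout, a list of library dicts whose keys ending in 'reads' map to lists of file paths, is inferred from the loop above, and the paths, thread count, and logger setup are illustrative only.

import logging

# Hypothetical corrected-reads layout inferred from the loop above: keys
# ending in 'reads' map to lists of on-disk file paths.
dataset_data = [{
    "type": "paired-end",
    "left reads": ["/tmp/corrected/lib1_R1.fastq"],
    "right reads": ["/tmp/corrected/lib1_R2.fastq"],
}]

log = logging.getLogger("spades")
log.setLevel(logging.INFO)
log.addHandler(logging.StreamHandler())

# Compresses both files and rewrites dataset_data in place to the .gz names.
compress_dataset_files(dataset_data, "/path/to/bundled/python_libs",
                       max_threads=4, log=log)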
Example #2
import os
import shutil
import sys
from site import addsitedir

import support  # SPAdes-internal helpers (error/warning, sys_call, which)


def move_dataset_files(dataset_data,
                       dst,
                       ext_python_modules_home,
                       max_threads,
                       log,
                       gzip=False):
    # Move every corrected-reads file into dst, optionally scheduling it for
    # compression, and rewrite the library entries to the new paths.
    to_compress = []
    for reads_library in dataset_data:
        for key, value in reads_library.items():
            if key.endswith('reads'):
                moved_reads_files = []
                for reads_file in value:
                    dst_filename = os.path.join(dst,
                                                os.path.basename(reads_file))
                    # TODO: fix problem with files with the same basenames in Hammer binary!
                    if not os.path.isfile(reads_file):
                        if (not gzip and os.path.isfile(dst_filename)) or (
                                gzip and os.path.isfile(dst_filename + '.gz')):
                            support.warning(
                                'file with corrected reads (' + reads_file +
                                ') is the same in several libraries', log)
                            if gzip:
                                dst_filename += '.gz'
                        else:
                            support.error(
                                'something went wrong and file with corrected reads ('
                                + reads_file + ') is missing!', log)
                    else:
                        shutil.move(reads_file, dst_filename)
                        if gzip:
                            to_compress.append(dst_filename)
                            dst_filename += '.gz'
                    moved_reads_files.append(dst_filename)
                reads_library[key] = moved_reads_files
    if to_compress:
        # Prefer pigz (parallel gzip) if it is on PATH: one multi-threaded
        # process per file.
        pigz_path = support.which('pigz')
        if pigz_path:
            for reads_file in to_compress:
                support.sys_call([
                    pigz_path, '-f', '-7', '-p',
                    str(max_threads), reads_file
                ], log)
        else:
            # No pigz: fall back to plain gzip, parallelised across files
            # with the joblib copies bundled under ext_python_modules_home.
            addsitedir(ext_python_modules_home)
            if sys.version.startswith('2.'):
                from joblib2 import Parallel, delayed
            elif sys.version.startswith('3.'):
                from joblib3 import Parallel, delayed
            n_jobs = min(len(to_compress), max_threads)
            outputs = Parallel(n_jobs=n_jobs)(
                delayed(support.sys_call)(['gzip', '-f', '-7', reads_file])
                for reads_file in to_compress)
            for output in outputs:
                if output:
                    log.info(output)
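
Again a hypothetical call, reusing the dataset_data sketch from Example #1; dst is the directory the corrected reads are moved into, and all paths are illustrative. Note the design shared by both examples: pigz compresses one file at a time using max_threads threads, while plain gzip is single-threaded per file, so the fallback parallelises across files with joblib and logs the captured output only after the workers finish.

import logging
import os

dst = "/tmp/spades_output/corrected"  # hypothetical destination directory
os.makedirs(dst, exist_ok=True)

log = logging.getLogger("spades")

# Moves the files into dst, gzips them there, and rewrites dataset_data to
# the new .gz paths.
move_dataset_files(dataset_data, dst, "/path/to/bundled/python_libs",
                   max_threads=4, log=log, gzip=True)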
Example #3
import sys
from os.path import isfile
from site import addsitedir

import support  # SPAdes-internal helpers (error reporting, sys_call, which)
# remove_not_corrected_reads is assumed to be defined elsewhere in the same
# SPAdes module.


def compress_dataset_files(input_file, ext_python_modules_home, max_threads,
                           log, not_used_yaml_file, output_dir, gzip_output):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith("2."):
        import pyyaml2 as pyyaml
        from joblib2 import Parallel, delayed
    elif sys.version.startswith("3."):
        import pyyaml3 as pyyaml
        from joblib3 import Parallel, delayed

    with open(input_file) as f:
        dataset_data = pyyaml.load(f)
    remove_not_corrected_reads(output_dir)
    is_changed = False
    # With gzip_output set, compress every corrected-reads file and rewrite
    # the YAML entries to the .gz names; prefer pigz when it is on PATH.
    if gzip_output:
        is_changed = True
        pigz_path = support.which("pigz")
        if pigz_path:
            compressor = "pigz"
        else:
            compressor = "gzip"
        log.info("\n== Compressing corrected reads (with %s)" % compressor)
        to_compress = []
        for reads_library in dataset_data:
            for key, value in reads_library.items():
                if key.endswith("reads"):
                    compressed_reads_filenames = []
                    for reads_file in value:
                        compressed_reads_filenames.append(reads_file + ".gz")
                        to_compress.append(reads_file)
                    reads_library[key] = compressed_reads_filenames

        if to_compress:
            for reads_file in to_compress:
                if not isfile(reads_file):
                    support.error(
                        "something went wrong and file with corrected reads (%s) is missing!"
                        % reads_file, log)

            if pigz_path:
                for reads_file in to_compress:
                    support.sys_call([
                        pigz_path, "-f", "-7", "-p",
                        str(max_threads), reads_file
                    ], log)
            else:
                n_jobs = min(len(to_compress), max_threads)
                outputs = Parallel(n_jobs=n_jobs)(
                    delayed(support.sys_call)(["gzip", "-f", "-7", reads_file])
                    for reads_file in to_compress)
                for output in outputs:
                    if output:
                        log.info(output)

    # Merge back any libraries that were set aside before correction.
    if not_used_yaml_file != "":
        is_changed = True
        with open(not_used_yaml_file) as f:
            not_used_dataset_data = pyyaml.load(f)
        dataset_data += not_used_dataset_data
    if is_changed:
        # Write the updated dataset description back over input_file.
        with open(input_file, 'w') as f:
            pyyaml.dump(dataset_data,
                        f,
                        default_flow_style=False,
                        default_style='"',
                        width=float("inf"))
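
Unlike Example #1, this variant owns the whole round-trip: it loads the dataset description from input_file, compresses the reads when gzip_output is set, merges back the libraries recorded in not_used_yaml_file if one was produced, and rewrites input_file. A hypothetical call with illustrative paths (an empty not_used_yaml_file means there are no set-aside libraries to merge):

compress_dataset_files("/tmp/spades_output/corrected/corrected.yaml",
                       "/path/to/bundled/python_libs",
                       4,
                       log,
                       not_used_yaml_file="",
                       output_dir="/tmp/spades_output",
                       gzip_output=True)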