Example #1
def batch_convert_multi_files_to_single(input_path, output_folder, threads,
                                        recursive):

    pool = Pool(threads)
    file_list = get_fast5_file_list(input_path, recursive)
    pbar = get_progress_bar(len(file_list))

    def update(results):
        # Callback run in the parent process for each completed conversion:
        # the first entry of the result becomes the first column of the
        # mapping row, and the remaining entries are written alongside it.
        output_file = os.path.basename(results.popleft())
        with open(os.path.join(output_folder, "filename_mapping.txt"),
                  'a') as output_table:
            for filename in results:
                output_table.write("{}\t{}\n".format(output_file, filename))
        # Advance the progress bar by one finished task.
        pbar.update(pbar.currval + 1)

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    results_array = []
    for batch_num, filename in enumerate(file_list):
        results_array.append(
            pool.apply_async(convert_multi_to_single,
                             args=(filename, output_folder, str(batch_num)),
                             callback=update))

    pool.close()
    pool.join()
    pbar.finish()
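
A minimal usage sketch for the function above; the paths are hypothetical and the function (with its helpers) is assumed to already be in scope:

# Hedged usage sketch: input/output paths are placeholders.
batch_convert_multi_files_to_single(
    input_path="/data/multi_read_fast5",
    output_folder="/data/single_read_fast5",
    threads=4,
    recursive=True)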
Example #2
def batch_convert_single_to_multi(input_path, output_folder, filename_base,
                                  batch_size, threads, recursive,
                                  follow_symlinks, target_compression):
    pool = Pool(threads)
    file_list = get_fast5_file_list(input_path, recursive, follow_symlinks)
    pbar = get_progress_bar(int(
        (len(file_list) + batch_size - 1) / batch_size))

    def update(result):
        output_file = result[1]
        with open(os.path.join(output_folder, "filename_mapping.txt"),
                  'a') as output_table:
            for filename in result[0]:
                output_table.write("{}\t{}\n".format(filename, output_file))
        pbar.update(pbar.currval + 1)

    results_array = []
    os.makedirs(output_folder, exist_ok=True)
    for batch_num, batch in enumerate(batcher(file_list, batch_size)):
        output_file = os.path.join(
            output_folder, "{}_{}.fast5".format(filename_base, batch_num))
        results_array.append(
            pool.apply_async(create_multi_read_file,
                             args=(batch, output_file, target_compression),
                             callback=update))

    pool.close()
    pool.join()
    pbar.finish()
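
A minimal usage sketch for this variant; paths and values are hypothetical, and passing target_compression=None is assumed to mean "leave reads uncompressed":

# Hedged usage sketch: placeholder paths; target_compression=None is an assumption.
batch_convert_single_to_multi(
    input_path="/data/single_read_fast5",
    output_folder="/data/multi_read_fast5",
    filename_base="batch",
    batch_size=4000,
    threads=4,
    recursive=True,
    follow_symlinks=True,
    target_compression=None)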
Example #3
def compress_batch(input_folder, output_folder, target_compression, recursive=True, threads=1, follow_symlinks=True):
    # We require an absolute input path so we can replicate the data structure relative to it later on
    input_folder = os.path.abspath(input_folder)

    file_list = get_fast5_file_list(input_folder, recursive, follow_symlinks=follow_symlinks)
    if len(file_list) == 0:
        raise ValueError("No input fast5 files found in '{}'. Recursive={}".format(input_folder, recursive))

    # Set up the process pool and the progressbar
    pool = Pool(min(threads, len(file_list)))
    pbar = get_progress_bar(len(file_list))

    def update(result):
        pbar.update(pbar.currval + 1)

    for input_file in file_list:
        input_path = os.path.join(input_folder, input_file)
        output_path = os.path.join(output_folder, os.path.relpath(input_path, input_folder))

        pool.apply_async(func=compress_file,
                         args=(input_path, output_path, target_compression),
                         callback=update)

    # Tear down the process pool and pbar. We can't use contextmanagers since we need to close() then join()
    pool.close()
    pool.join()
    pbar.finish()
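
A minimal usage sketch for compress_batch above; the paths are hypothetical, and passing "vbz" as target_compression is an assumption (the real API may expect a compression/filter object rather than a string):

# Hedged usage sketch: placeholder paths; "vbz" is an assumed compression value.
compress_batch(
    input_folder="/data/fast5_raw",
    output_folder="/data/fast5_vbz",
    target_compression="vbz",
    recursive=True,
    threads=8,
    follow_symlinks=True)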
Example #4
def batch_convert_single_to_multi(input_path, output_folder, filename_base, batch_size, threads, recursive):

    pool = Pool(threads)
    file_list = get_fast5_file_list(input_path, recursive)
    pbar = get_progress_bar(int((len(file_list) + batch_size - 1) / batch_size))

    def update(results):
        output_file = os.path.basename(results.popleft())
        # Append ('a') to the mapping file: opening with 'w' here would
        # truncate the header and the rows written for earlier batches.
        with open(os.path.join(output_folder, "filename_mapping.txt"), 'a') as output_table:
            for filename in results:
                output_table.write("{}\t{}\n".format(filename, output_file))
        pbar.update(pbar.currval + 1)

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    with open(os.path.join(output_folder, "filename_mapping.txt"), 'w') as output_table:
        output_table.write("single_read_file\tmulti_read_file\n")

    for batch_num, batch in enumerate(batcher(file_list, batch_size)):
        output_file = os.path.join(output_folder, "{}_{}.fast5".format(filename_base, batch_num))
        pool.apply_async(create_multi_read_file, args=(batch, output_file), callback=update)

    pool.close()
    pool.join()
    pbar.finish()
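
The batcher helper used by both single-to-multi variants is not shown in these examples. A minimal equivalent, assuming it simply yields successive fixed-size chunks of the file list, might look like this:

# Sketch of a chunking helper with the assumed behaviour of batcher():
# yield successive slices of at most batch_size items from a list.
def batcher_sketch(items, batch_size):
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]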
Example #5
    def __init__(self,
                 input_folder,
                 output_folder,
                 read_list_file,
                 filename_base,
                 batch_size=4000,
                 threads=1,
                 recursive=False,
                 file_list_file=None):
        assert Path(input_folder).is_dir()
        assert Path(read_list_file).is_file()
        assert isinstance(filename_base, str)
        assert isinstance(batch_size, int)
        assert isinstance(threads, int)
        assert isinstance(recursive, bool)
        self.logger = logging.getLogger(self.__class__.__name__)

        self.read_set = get_filter_reads(read_list_file)
        self.input_f5s = get_fast5_file_list(str(input_folder), recursive)

        if file_list_file:
            file_set = get_filter_reads(file_list_file)
            for file in file_set:
                assert Path(file).exists(), \
                    "{} from file list doesn't exist".format(file)
            self.input_f5s = list(file_set.intersection(self.input_f5s))

        # determine max number of workers
        self.batch_size = batch_size
        num_outputs = int(ceil(len(self.read_set) / batch_size))
        self.num_workers = min(threads, min(num_outputs, len(self.input_f5s)))

        out_basename = Path(output_folder)
        if not out_basename.exists():
            out_basename.mkdir()

        self.filename_mapping_file = out_basename / "filename_mapping.txt"
        if self.filename_mapping_file.exists():
            self.logger.info("overwriting filename mapping file {}".format(
                self.filename_mapping_file))
            self.filename_mapping_file.unlink()

        # dict where key=filename value=read_set
        self.out_files = {}

        out_file_names = []
        for i in range(num_outputs):
            filename = filename_base + str(i) + ".fast5"
            output_file_name = out_basename / filename

            if output_file_name.exists():
                self.logger.info(
                    "overwriting multiread file {}".format(output_file_name))
                output_file_name.unlink()

            self.out_files[output_file_name] = set()
            out_file_names.append(output_file_name)

        # reversing so that first item to be popped is lower idx
        self.available_out_files = out_file_names[::-1]
        self.tasks = []
        self.pool = None
        # progressbar total is number of reads in read_set plus number of input files
        # (to see progress while scanning files that don't have any relevant reads)
        self.pbar = get_progress_bar(len(self.read_set) + len(self.input_f5s))
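
A hedged instantiation sketch for the initializer above. The class name Fast5ReadFilter is purely a placeholder (the enclosing class is not shown in these examples), and all paths are hypothetical:

# Hypothetical class name and paths, for illustration only.
filter_job = Fast5ReadFilter(
    input_folder="/data/multi_read_fast5",
    output_folder="/data/filtered_fast5",
    read_list_file="/data/read_ids.txt",
    filename_base="filtered_batch",
    batch_size=4000,
    threads=4,
    recursive=True,
    file_list_file=None)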
Example #6
    def __init__(self,
                 input_folder,
                 output_folder,
                 read_list_file,
                 filename_base,
                 batch_size=4000,
                 threads=1,
                 recursive=False,
                 file_list_file=None):
        assert path.isdir(input_folder)
        assert path.isfile(read_list_file)
        assert isinstance(filename_base, str)
        assert isinstance(batch_size, int)
        assert isinstance(threads, int)
        assert isinstance(recursive, bool)
        self.logger = logging.getLogger(self.__class__.__name__)

        self.read_set = get_filter_reads(read_list_file)
        self.input_f5s = get_fast5_file_list(str(input_folder), recursive)

        if len(self.read_set) < 1:
            raise ValueError(
                "No reads in read list file {}".format(read_list_file))

        if len(self.input_f5s) < 1:
            raise ValueError(
                "No input fast5 files found in {}. Recursion is set to {}".format(
                    str(input_folder), recursive))

        if batch_size < 1:
            raise ValueError(
                "Batch size (--batch_size) must be a positive integer, not {}".format(
                    batch_size))

        if threads < 1:
            raise ValueError(
                "Max number of threads (--threads) must be a positive integer, not {}".format(
                    threads))

        if file_list_file:
            file_set = get_filter_reads(file_list_file)
            for file in file_set:
                assert path.exists(file), \
                    "{} from file list doesn't exist".format(file)
            self.input_f5s = list(file_set.intersection(self.input_f5s))

        # determine max number of workers
        self.batch_size = batch_size
        num_outputs = int(ceil(len(self.read_set) / float(batch_size)))
        self.num_workers = min(threads, min(num_outputs, len(self.input_f5s)))

        if not path.exists(output_folder):
            mkdir(output_folder)

        self.filename_mapping_file = path.join(output_folder,
                                               "filename_mapping.txt")
        if path.exists(self.filename_mapping_file):
            self.logger.info("overwriting filename mapping file {}".format(
                self.filename_mapping_file))
            unlink(self.filename_mapping_file)

        # dict where key=filename value=read_set
        self.out_files = {}

        out_file_names = []
        for i in range(num_outputs):
            filename = filename_base + str(i) + ".fast5"
            output_file_name = path.join(output_folder, filename)

            if path.exists(output_file_name):
                self.logger.info(
                    "overwriting multiread file {}".format(output_file_name))
                unlink(output_file_name)

            self.out_files[output_file_name] = set()
            out_file_names.append(output_file_name)

        # reversing so that first item to be popped is lower idx
        self.available_out_files = out_file_names[::-1]
        self.tasks = []
        self.pool = None
        # progressbar total is number of reads in read_set plus number of input files
        # (to see progress while scanning files that don't have any relevant reads)
        self.pbar = get_progress_bar(len(self.read_set) + len(self.input_f5s))