def batch_convert_single_to_multi(input_path, output_folder, filename_base,
                                  batch_size, threads, recursive,
                                  follow_symlinks, target_compression):
    pool = Pool(threads)
    file_list = get_fast5_file_list(input_path, recursive, follow_symlinks)
    pbar = get_progress_bar(int(
        (len(file_list) + batch_size - 1) / batch_size))

    def update(result):
        output_file = result[1]
        with open(os.path.join(output_folder, "filename_mapping.txt"),
                  'a') as output_table:
            for filename in result[0]:
                output_table.write("{}\t{}\n".format(filename, output_file))
        pbar.update(pbar.currval + 1)

    results_array = []
    os.makedirs(output_folder, exist_ok=True)
    for batch_num, batch in enumerate(batcher(file_list, batch_size)):
        output_file = os.path.join(
            output_folder, "{}_{}.fast5".format(filename_base, batch_num))
        results_array.append(
            pool.apply_async(create_multi_read_file,
                             args=(batch, output_file, target_compression),
                             callback=update))

    pool.close()
    pool.join()
    pbar.finish()
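
The batcher helper used above is not shown in this listing; a minimal sketch consistent with how it is called (yielding consecutive slices of at most batch_size files) could look like the following. Treat it as an assumption about the helper's behaviour, not the actual implementation.

def batcher(iterable, batch_size):
    # Assumed behaviour: yield consecutive chunks of at most batch_size items.
    for start in range(0, len(iterable), batch_size):
        yield iterable[start:start + batch_size]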
Example #2
def batch_convert_multi_files_to_single(input_path, output_folder, threads,
                                        recursive):

    pool = Pool(threads)
    file_list = get_fast5_file_list(input_path, recursive)
    pbar = get_progress_bar(len(file_list))

    def update(results):
        output_file = os.path.basename(results.popleft())
        with open(os.path.join(output_folder, "filename_mapping.txt"),
                  'a') as output_table:
            for filename in results:
                output_table.write("{}\t{}\n".format(output_file, filename))
        pbar.update(pbar.currval + 1)

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    results_array = []
    for batch_num, filename in enumerate(file_list):
        results_array.append(
            pool.apply_async(convert_multi_to_single,
                             args=(filename, output_folder, str(batch_num)),
                             callback=update))

    pool.close()
    pool.join()
    pbar.finish()
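
get_progress_bar is another external helper. Based only on how the functions above drive it (pbar.currval, pbar.update(), pbar.finish()), a plausible sketch built on the classic progressbar package might be the following; the real helper may configure different widgets.

from progressbar import Bar, Percentage, ProgressBar

def get_progress_bar(num_items):
    # Assumed sketch: return a started bar exposing currval, update() and finish().
    bar = ProgressBar(maxval=num_items, widgets=[Percentage(), ' ', Bar()])
    return bar.start()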
Example #3
def compress_batch(input_folder, output_folder, target_compression, recursive=True, threads=1, follow_symlinks=True):
    # We require an absolute input path so we can replicate the data structure relative to it later on
    input_folder = os.path.abspath(input_folder)

    file_list = get_fast5_file_list(input_folder, recursive, follow_symlinks=follow_symlinks)
    if len(file_list) == 0:
        raise ValueError("No input fast5 files found in '{}'. Recursive={}".format(input_folder, recursive))

    # Set up the process pool and the progressbar
    pool = Pool(min(threads, len(file_list)))
    pbar = get_progress_bar(len(file_list))

    def update(result):
        pbar.update(pbar.currval + 1)

    for input_file in file_list:
        input_path = os.path.join(input_folder, input_file)
        output_path = os.path.join(output_folder, os.path.relpath(input_path, input_folder))

        pool.apply_async(func=compress_file,
                         args=(input_path, output_path, target_compression),
                         callback=update)

    # Tear down the process pool and pbar. We can't use contextmanagers since we need to close() then join()
    pool.close()
    pool.join()
    pbar.finish()
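
A hypothetical call to compress_batch, assuming VBZ is imported from ont_fast5_api.compression_settings as in the test example further below; the directory names are placeholders.

from ont_fast5_api.compression_settings import VBZ

compress_batch(input_folder="runs/raw_fast5",    # placeholder input directory
               output_folder="runs/vbz_fast5",   # placeholder output directory
               target_compression=VBZ,
               threads=8)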
Example #4
def batch_convert_single_to_multi(input_path, output_folder, filename_base, batch_size, threads, recursive):

    pool = Pool(threads)
    file_list = get_fast5_file_list(input_path, recursive)
    pbar = get_progress_bar(int((len(file_list)+batch_size-1)/batch_size))

    def update(results):
        output_file = os.path.basename(results.popleft())
        with open(os.path.join(output_folder, "filename_mapping.txt"), 'w') as output_table:
            for filename in results:
                output_table.write("{}\t{}\n".format(filename, output_file))
        pbar.update(pbar.currval + 1)

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    with open(os.path.join(output_folder, "filename_mapping.txt"), 'w') as output_table:
        output_table.write("single_read_file\tmulti_read_file\n")

    for batch_num, batch in enumerate(batcher(file_list, batch_size)):
        output_file = os.path.join(output_folder, "{}_{}.fast5".format(filename_base, batch_num))
        pool.apply_async(create_multi_read_file, args=(batch, output_file), callback=update)

    pool.close()
    pool.join()
    pbar.finish()
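
A hypothetical invocation of this variant; the paths and filename base are placeholders.

batch_convert_single_to_multi(input_path="single_reads",    # placeholder input directory
                              output_folder="multi_reads",  # created if missing
                              filename_base="batch",
                              batch_size=4000,
                              threads=4,
                              recursive=True)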
Example #5
def hdf_to_sam(args):
    """Entry point for converting guppy methylcalled fast5s to sam."""
    sys.stdout.write('\t'.join(('@HD', 'VN:1.5', 'SO:unsorted')))
    sys.stdout.write('\n')
    for name, seq, _ in mappy.fastx_read(args.reference, read_comment=False):
        sys.stdout.write('@SQ\tSN:{}\tLN:{}\n'.format(name, len(seq)))

    fast5s = get_fast5_file_list(args.path, recursive=args.recursive)
    worker = functools.partial(hdf_to_sam_worker, args.reference)
    with ProcessPoolExecutor(max_workers=args.workers) as executor:
        for res in executor.map(worker, fast5s):
            for r in res:
                sys.stdout.write('{}\n'.format(r))
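
hdf_to_sam expects an argument object carrying the attributes it reads above (reference, path, recursive, workers). A hypothetical invocation using argparse.Namespace, with placeholder paths:

import argparse

args = argparse.Namespace(reference="reference.fasta",  # placeholder FASTA
                          path="guppy_fast5s",          # placeholder fast5 directory
                          recursive=True,
                          workers=4)
hdf_to_sam(args)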
Example #6
 def _run(self):
     """Iterate over input files and stores results into internal queue."""
     fast5s = get_fast5_file_list(self.path, recursive=self.recursive)
     self.total_files = len(fast5s)
     self.logger.info("Found {} files to process.".format(self.total_files))
     # spawn at most self.workers concurrent extractor processes
     with ProcessPoolExecutor(max_workers=self.workers) as executor:
         for fname in fast5s:
             while True:
                 if self.queue.qsize() < self.max_size:
                     future = executor.submit(self.extractor, fname)
                     future.add_done_callback(self._store)
                     break
                 else:
                     time.sleep(1)
     self.queue.put(None)
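
The _store callback and the queue attributes referenced above are not shown. A minimal sketch consistent with that usage, pushing each finished result onto the bounded queue that _run polls, might be the following; the real method may do additional bookkeeping or error handling.

def _store(self, future):
    # Hypothetical sketch of the callback registered above: move the finished
    # worker result onto the internal queue that _run() monitors via qsize().
    self.queue.put(future.result())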
Example #7
    def test_conversion_script_multi(self, mock_pbar):
        input_folder = os.path.join(test_data, 'multi_read')
        compress_batch(input_folder=input_folder,
                       output_folder=self.save_path,
                       target_compression=VBZ)

        count_files = 0
        count_reads = 0
        for out_file in get_fast5_file_list(self.save_path,
                                            recursive=True,
                                            follow_symlinks=True):
            count_files += 1
            with get_fast5_file(out_file) as f5:
                self.assertTrue(isinstance(f5, MultiFast5File))
                for read in f5.get_reads():
                    self.assertCompressed(read)
                    count_reads += 1
        self.assertEqual(1, count_files)
        self.assertEqual(4, count_reads)
Example #8
def batch_reverter(input_path,
                   output_folder,
                   filename_base,
                   batch_size,
                   threads,
                   recursive,
                   keys=set(
                       ('Raw', 'channel_id', 'context_tags', 'tracking_id'))):
    # make sure the output directory doesn't already exist
    if os.path.exists(output_folder):
        sys.stderr.write("Directory exists: %s\n" % output_folder)
        sys.exit(1)
    os.makedirs(output_folder)
    # get files to process - in reverse order, since fail files typically sort before pass files
    file_list = get_fast5_file_list(input_path, recursive)
    file_list = file_list[::-1]
    print("%s files to process..." % len(file_list))
    fi, ri = 0, -1
    for i, input_file in enumerate(file_list, 1):
        with MultiFast5File(input_file, 'r') as input_f5:
            for ri, read in enumerate(input_f5.get_read_ids(), ri + 1):
                if not ri % 100:
                    sys.stderr.write(" %s %s %s %s  \r" %
                                     (fi, ri, read, input_file))
                if not ri % batch_size:
                    output_f5 = MultiFast5File(
                        os.path.join(output_folder,
                                     "%s_%s.fast5" % (filename_base, fi)), 'w')
                    fi += 1
                # copy group to new file
                read_name = "read_" + read
                group = input_f5.handle[read_name]
                output_f5.handle.copy(group, read_name)
                # and remove additional info
                reverted_group = output_f5.handle[read_name]
                # materialise the key list first so deleting members does not interfere with iteration
                for k in list(reverted_group.keys()):
                    if k not in keys:
                        del reverted_group[k]
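
A hypothetical call; the input and output paths are placeholders, and note that the threads argument is accepted but not used by this function.

batch_reverter(input_path="basecalled_fast5",   # placeholder input directory
               output_folder="reverted_fast5",  # must not already exist
               filename_base="batch",
               batch_size=4000,
               threads=1,
               recursive=True)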
Example #9
    def __init__(self,
                 input_folder,
                 output_folder,
                 read_list_file,
                 filename_base,
                 batch_size=4000,
                 threads=1,
                 recursive=False,
                 file_list_file=None):
        assert Path(input_folder).is_dir()
        assert Path(read_list_file).is_file()
        assert isinstance(filename_base, str)
        assert isinstance(batch_size, int)
        assert isinstance(threads, int)
        assert isinstance(recursive, bool)
        self.logger = logging.getLogger(self.__class__.__name__)

        self.read_set = get_filter_reads(read_list_file)
        self.input_f5s = get_fast5_file_list(str(input_folder), recursive)

        if file_list_file:
            file_set = get_filter_reads(file_list_file)
            for file in file_set:
                assert Path(file).exists(), \
                    "{} from file list doesn't exist".format(file)
            self.input_f5s = list(file_set.intersection(self.input_f5s))

        # determine max number of workers
        self.batch_size = batch_size
        num_outputs = int(ceil(len(self.read_set) / batch_size))
        self.num_workers = min(threads, min(num_outputs, len(self.input_f5s)))

        out_basename = Path(output_folder)
        if not out_basename.exists():
            out_basename.mkdir()

        self.filename_mapping_file = out_basename / "filename_mapping.txt"
        if self.filename_mapping_file.exists():
            self.logger.info("overwriting filename mapping file {}".format(
                self.filename_mapping_file))
            self.filename_mapping_file.unlink()

        # dict where key=filename value=read_set
        self.out_files = {}

        out_file_names = []
        for i in range(num_outputs):
            filename = filename_base + str(i) + ".fast5"
            output_file_name = out_basename / filename

            if output_file_name.exists():
                self.logger.info(
                    "overwriting multiread file {}".format(output_file_name))
                output_file_name.unlink()

            self.out_files[output_file_name] = set()
            out_file_names.append(output_file_name)

        # reversing so that first item to be popped is lower idx
        self.available_out_files = out_file_names[::-1]
        self.tasks = []
        self.pool = None
        # progressbar total is number of reads in read_set plus number of input files
        # (to see progress while scanning files that don't have any relevant reads)
        self.pbar = get_progress_bar(len(self.read_set) + len(self.input_f5s))
Example #10
    def __init__(self,
                 input_folder,
                 output_folder,
                 read_list_file,
                 filename_base,
                 batch_size=4000,
                 threads=1,
                 recursive=False,
                 file_list_file=None):
        assert path.isdir(input_folder)
        assert path.isfile(read_list_file)
        assert isinstance(filename_base, str)
        assert isinstance(batch_size, int)
        assert isinstance(threads, int)
        assert isinstance(recursive, bool)
        self.logger = logging.getLogger(self.__class__.__name__)

        self.read_set = get_filter_reads(read_list_file)
        self.input_f5s = get_fast5_file_list(str(input_folder), recursive)

        if len(self.read_set) < 1:
            raise ValueError(
                "No reads in read list file {}".format(read_list_file))

        if len(self.input_f5s) < 1:
            raise ValueError(
                "No input fast5 files found in {}. Recursion is set to {}".
                format(str(input_folder), recursive))

        if batch_size < 1:
            raise ValueError(
                "Batch size (--batch_size) must be a positive integer, not {}".
                format(batch_size))

        if threads < 1:
            raise ValueError(
                "Max number of threads (--threads) must be a positive integer, not {}"
                .format(threads))

        if file_list_file:
            file_set = get_filter_reads(file_list_file)
            for file in file_set:
                assert path.exists(file), \
                    "{} from file list doesn't exist".format(file)
            self.input_f5s = list(file_set.intersection(self.input_f5s))

        # determine max number of workers
        self.batch_size = batch_size
        num_outputs = int(ceil(len(self.read_set) / float(batch_size)))
        self.num_workers = min(threads, min(num_outputs, len(self.input_f5s)))

        if not path.exists(output_folder):
            mkdir(output_folder)

        self.filename_mapping_file = path.join(output_folder,
                                               "filename_mapping.txt")
        if path.exists(self.filename_mapping_file):
            self.logger.info("overwriting filename mapping file {}".format(
                self.filename_mapping_file))
            unlink(self.filename_mapping_file)

        # dict where key=filename value=read_set
        self.out_files = {}

        out_file_names = []
        for i in range(num_outputs):
            filename = filename_base + str(i) + ".fast5"
            output_file_name = path.join(output_folder, filename)

            if path.exists(output_file_name):
                self.logger.info(
                    "overwriting multiread file {}".format(output_file_name))
                unlink(output_file_name)

            self.out_files[output_file_name] = set()
            out_file_names.append(output_file_name)

        # reversing so that first item to be popped is lower idx
        self.available_out_files = out_file_names[::-1]
        self.tasks = []
        self.pool = None
        # progressbar total is number of reads in read_set plus number of input files
        # (to see progress while scanning files that don't have any relevant reads)
        self.pbar = get_progress_bar(len(self.read_set) + len(self.input_f5s))
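
get_filter_reads is used above both for read-ID lists and for file lists but is not shown. A minimal sketch consistent with that usage, returning a set built from the first column of a text file, could be the following; any header or column handling in the real helper is not reproduced here.

def get_filter_reads(list_file):
    # Assumed sketch: collect the first whitespace-separated field of every
    # non-empty line into a set (read IDs or file paths, depending on the caller).
    items = set()
    with open(list_file) as handle:
        for line in handle:
            line = line.strip()
            if line:
                items.add(line.split()[0])
    return items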
Example #11
 def setUp(self):
     self.files = get_fast5_file_list(self.read_dir, recursive=False)