Example 1
def compress_file(input_file, output_file, target_compression):
    try:
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        if is_multi_read(input_file):
            with MultiFast5File(input_file, 'r') as input_f5, MultiFast5File(output_file, 'a') as output_f5:
                for read in input_f5.get_reads():
                    compress_read_from_multi(output_f5, read, target_compression)
        else:
            with Fast5File(input_file, 'r') as input_f5, \
                    EmptyFast5(output_file, 'a') as output_f5:
                compress_read_from_single(output_f5, input_f5, target_compression)
    except Exception as e:
        # Errors raised in Pool.async are lost, so we explicitly log them.
        logging.exception(e)
        raise
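The function assumes os, logging, and the Fast5 classes (MultiFast5File, Fast5File, EmptyFast5, is_multi_read) are already imported from ont_fast5_api, alongside the compress_read_from_* helpers. A minimal usage sketch, assuming the VBZ preset from ont_fast5_api.compression_settings and hypothetical file paths:

# Hedged sketch: paths are hypothetical; VBZ is the compression preset
# shipped with ont_fast5_api (the library these Fast5 classes come from).
from ont_fast5_api.compression_settings import VBZ

compress_file("reads/batch0.fast5",       # hypothetical input path
              "compressed/batch0.fast5",  # parent directory is created if missing
              target_compression=VBZ)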
Example 2
def read_generator(input_file, read_set):
    """
    Open input_file as Fast5, yield tuples (read_id, Group) for every read_id that is present in read_set
    :param input_file:
    :param read_set:
    :return:
    """

    with MultiFast5File(str(input_file), 'r') as input_f5:
        read_ids = input_f5.get_read_ids()
        if len(read_ids) == 0:
            if not is_multi_read(input_file):
                raise TypeError(
                    "Filtering from single-read Fast5 not supported")
        for read in read_set.intersection(read_ids):
            group = input_f5.handle["read_" + read]
            yield read, group
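A short usage sketch for the generator, with a hypothetical file path and read-ID set; each yielded group is the "read_<id>" h5py Group, whose datasets can be inspected or copied into another file:

# Hedged sketch: the path and read ID below are hypothetical.
wanted = {"0a1b2c3d-4e5f-6789-abcd-ef0123456789"}
for read_id, group in read_generator("reads/batch0.fast5", wanted):
    print(read_id, group.name)  # group.name is e.g. "/read_<id>"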
Example 3
def compress_file(input_file, output_file, target_compression, sanitize=False):
    try:
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        if is_multi_read(input_file):
            with MultiFast5File(input_file, 'r') as input_f5, MultiFast5File(
                    output_file, 'a') as output_f5:
                for read in input_f5.get_reads():
                    output_f5.add_existing_read(read,
                                                target_compression,
                                                sanitize=sanitize)
        else:
            with Fast5File(input_file, 'r') as input_f5, \
                    EmptyFast5(output_file, 'a') as output_f5:
                compress_single_read(output_f5,
                                     input_f5,
                                     target_compression,
                                     sanitize=sanitize)
    except Exception as e:
        # Errors raised in Pool.async are lost, so we explicitly log them.
        logging.exception(e)
        raise
    return (input_file, output_file)
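The Pool.async comment refers to multiprocessing: an exception raised in a worker submitted via Pool.apply_async only resurfaces when get() is called on the result, so the worker logs it explicitly first. A sketch of that calling pattern, with hypothetical file pairs:

# Hedged sketch of the Pool-based caller implied by the comment above.
from multiprocessing import Pool
from ont_fast5_api.compression_settings import VBZ

pairs = [("in/a.fast5", "out/a.fast5"),
         ("in/b.fast5", "out/b.fast5")]  # hypothetical input/output pairs
with Pool(processes=4) as pool:
    results = [pool.apply_async(compress_file, (src, dst, VBZ))
               for src, dst in pairs]
    for res in results:
        res.get()  # re-raises any worker exception in the parent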
Example 4
def index(input, recursive=False, output_prefix="", tmp_prefix=None):
    if tmp_prefix and not os.path.exists(tmp_prefix):
        os.makedirs(tmp_prefix)
    input_files = []
    # scan input
    if os.path.isfile(input):
        input_files.append(input)
    else:
        if recursive:
            input_files.extend([
                os.path.join(dirpath, f)
                for dirpath, _, files in os.walk(input) for f in files
                if f.endswith('.fast5') or f.endswith('.tar')
            ])
        else:
            input_files.extend(glob.glob(os.path.join(input, '*.fast5')))
            input_files.extend(glob.glob(os.path.join(input, '*.tar')))
    # index all provided files
    for input_file in input_files:
        input_relative = os.path.normpath(
            os.path.join(
                output_prefix,
                os.path.dirname(os.path.relpath(input_file, start=input)),
                os.path.basename(input_file)))
        # extract reads from packed tar archive and retrieve read IDs
        if input_file.endswith('.tar'):
            with tempfile.TemporaryDirectory(
                    prefix=tmp_prefix) as tmpdirname, tarfile.open(
                        input_file) as fp_tar:
                fp_tar.extractall(path=tmpdirname)
                f5files = [
                    os.path.join(dirpath, f)
                    for dirpath, _, files in os.walk(tmpdirname)
                    for f in files if f.endswith('.fast5')
                ]
                for f5file in f5files:
                    try:
                        ID = fast5_Index.__get_ID_single__(f5file)
                    except Exception:
                        print(
                            "[ERROR] Failed to open {f5}, skip file for indexing"
                            .format(f5=f5file),
                            file=sys.stderr)
                        continue
                    # yield the index line for the extracted read
                    yield "\t".join([
                        os.path.normpath(
                            os.path.join(
                                input_relative,
                                os.path.relpath(f5file, start=tmpdirname))),
                        ID
                    ])
        # bulk and single read fast5
        else:
            if is_multi_read(input_file):
                reads = fast5_Index.__get_ID_multi__(input_file)
                for group, ID in reads:
                    yield '\t'.join((os.path.join(input_relative, group), ID))
            else:
                try:
                    ID = fast5_Index.__get_ID_single__(input_file)
                except Exception:
                    print(
                        "[ERROR] Failed to open {f5}, skip file for indexing"
                        .format(f5=input_file),
                        file=sys.stderr)
                    continue
                yield '\t'.join([input_relative, ID])
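index is a generator of tab-separated "path<TAB>read_id" lines, so a caller typically streams it into an index file. A minimal sketch, assuming the function is reachable as shown (it sits alongside the fast5_Index helpers) and using hypothetical paths:

# Hedged sketch: directory and output name are hypothetical; the recursive
# scan picks up nested .fast5 files and packed .tar archives.
with open("reads.index", "w") as fout:
    for line in index("reads/", recursive=True):
        print(line, file=fout)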