Ejemplo n.º 1
0
def convert_multi_to_single(input_file, read_list, output_folder):
    '''
    Extract selected reads from a multi-read file into single-read files.

    Every read ID in ``read_list`` that is present in ``input_file`` is
    written to ``<output_folder>/<read_id>.fast5``. Problems are reported on
    stderr (with a traceback where one exists) instead of being raised, so a
    single bad read or unreadable input file does not abort the caller.
    Interpreter-level signals such as ``KeyboardInterrupt`` and
    ``SystemExit`` are NOT swallowed and propagate normally.

    :param input_file: path of the multi-read file opened with
        ``MultiFast5File``.
    :param read_list: iterable of read IDs to extract.
    :param output_folder: directory the single-read files are written to.
    :return: list whose first element is the basename of ``input_file``,
        followed by the basename of each output file written successfully.
    '''
    results = [os.path.basename(input_file)]
    try:
        with MultiFast5File(input_file, 'r') as multi_f5:
            # Build the set once so each membership test below is O(1).
            read_ids = set(multi_f5.get_read_ids())
            for query_read in read_list:
                if query_read not in read_ids:
                    sys.stderr.write(
                        "{}\n\tFailed to find read '{}' in {}\n".format(
                            "convert_multi_to_single", query_read, input_file))
                    continue
                try:
                    read = multi_f5.get_read(query_read)
                    output_file = os.path.join(
                        output_folder, "{}.fast5".format(query_read))
                    create_single_f5(output_file, read)
                    results.append(os.path.basename(output_file))
                except Exception:
                    # Best effort per read: report it and keep going with
                    # the remaining reads of this file.
                    traceback.print_exc()
                    sys.stderr.write(
                        "{}\n\tFailed to copy read '{}' from {}\n".format(
                            "convert_multi_to_single", query_read,
                            input_file))
    except Exception:
        # The input file could not be opened or listed; whatever was
        # collected so far is still returned below.
        traceback.print_exc()
        sys.stderr.write("{}\n\tFailed to copy files from: {}\n".format(
            "convert_multi_to_single", input_file))
    # Deliberately outside any ``finally``: returning from ``finally`` would
    # discard in-flight exceptions such as KeyboardInterrupt.
    return results
Ejemplo n.º 2
0
 def __copy_reads_to__(self, read_ids, output):
     '''
     Copy the data of the given reads into the ``output`` directory.

     Each read ID is looked up in ``self.index_dict``; IDs missing from the
     index are skipped silently. Index values are paths relative to the
     directory of ``self.index_file`` and take one of three shapes:

     * ``<path>``: a stand-alone file, copied as is.
     * ``<batch>.tar/<member>``: a member of a tar archive, extracted
       flat (without its directory part) into ``output``.
     * ``<batch>.fast5/<...>``: a read inside a multi-read file, written
       to ``<output>/<read_id>.fast5``.

     Reads living in the same batch file are processed together so every
     archive / multi-read file is opened only once.

     :param read_ids: iterable of read IDs to copy.
     :param output: destination directory; created if it does not exist.
     :raises RuntimeError: if a read cannot be extracted from its batch
         file (the underlying error is chained as ``__cause__``).
     '''
     os.makedirs(output, exist_ok=True)
     index_dir = os.path.dirname(self.index_file)

     def batch_key(entry):
         # Always a tuple: stand-alone files (no extension element) and
         # batch entries must be mutually comparable when sorted together.
         return (entry[1], entry[2] if len(entry) > 2 else '')

     # (read_id, path) for stand-alone files,
     # (read_id, batch, ext, member) for reads stored inside a batch file.
     entries = [
         tuple([read_id]
               + re.split(r'(\.fast5|\.tar)/', self.index_dict[read_id]))
         for read_id in read_ids if read_id in self.index_dict
     ]
     entries.sort(key=batch_key)
     for _, group in itertools.groupby(entries, key=batch_key):
         fofns = list(group)
         if len(fofns[0]) == 2:
             # single read fast5; several IDs pointing at the same file
             # still need only one copy
             shutil.copy(os.path.join(index_dir, fofns[0][1]), output)
             continue
         _, batch_file, batch_ext, _ = fofns[0]
         batch_path = os.path.join(index_dir, batch_file + batch_ext)
         if batch_ext == '.tar':
             # single read fast5 batch in tar archive
             wanted = {entry[3] for entry in fofns}
             with tarfile.open(batch_path) as fp_tar:
                 for tar_member in fp_tar.getmembers():
                     if not any(name in tar_member.name for name in wanted):
                         continue
                     try:
                         # Drop the directory part so the member lands
                         # directly inside ``output``.
                         tar_member.name = os.path.basename(
                             tar_member.name)
                         fp_tar.extract(tar_member, path=output)
                     except Exception as err:
                         raise RuntimeError(
                             '[ERROR] Could not extract {id} from {batch}.'
                             .format(id=tar_member.name,
                                     batch=batch_path)) from err
         elif batch_ext == '.fast5':
             # multi read fast5 batch
             target_ids = {entry[0] for entry in fofns}
             with MultiFast5File(batch_path, 'r') as multi_f5:
                 for read_id in multi_f5.get_read_ids():
                     if read_id not in target_ids:
                         continue
                     try:
                         read = multi_f5.get_read(read_id)
                         output_file = os.path.join(
                             output, "{}.fast5".format(read_id))
                         multi_to_single_fast5.create_single_f5(
                             output_file, read)
                     except Exception as err:
                         raise RuntimeError(
                             '[ERROR] Could not extract {id} from {batch}.'
                             .format(id=read_id, batch=batch_path)) from err