def __call__(self, chunk, limit=None):
    """Read one chunk of lines from the indexed gzip file.

    :param int chunk: chunk number
    :param int limit: maximum number of lines to read. May be None.
    :return list: read lines
    """
    # Clamp the requested line count to the configured chunk size;
    # no limit means "a full chunk".
    if limit is None:
        limit = self.chunk_size
    else:
        limit = min(limit, self.chunk_size)

    # Seek straight to the chunk's byte offset using the prebuilt
    # line index, then pull at most `limit` lines from there.
    offset = self.lines_index[chunk]
    with igzip.IndexedGzipFile(self.urlpath, index_file=self.igzip_index_path) as fobj:
        fobj.seek(offset)
        return [line for _, line in zip(range(limit), fobj)]
def load_image(fname):
    """Load a NIfTI image, working around nibabel's gradually-acquired
    indexed_gzip support.

    The strategy is chosen from the installed nibabel version:
    pre-2.1 needs a hand-built file map, 2.2.x needs
    ``keep_file_open='auto'``, and 2.3+ just works.
    """
    # nibabel pre-2.1 is not indexed_gzip-aware: build the image from a
    # file map whose file object is an IndexedGzipFile we open ourselves.
    if nibver <= Version('2.1.0'):
        basename = op.basename(fname)[:-7]
        fobj = igzip.IndexedGzipFile(fname)
        fmap = nib.Nifti1Image.make_file_map()
        fmap[basename].fileobj = fobj
        return nib.Nifti1Image.from_file_map(fmap)

    # nibabel 2.2.x must be told keep_file_open='auto' to use indexed_gzip.
    if Version('2.2.0') <= nibver < Version('2.3.0'):
        return nib.load(fname, keep_file_open='auto')

    # nibabel >= 2.3.x (and the 2.1.x gap) — plain load is fine.
    return nib.load(fname)
def benchmark(filename, uncompressed_size):
    """Time sequential reads through several gzip reader configurations.

    Every configuration reads the whole file at a range of chunk sizes,
    printing progress and elapsed time.  The MD5 of the decompressed
    data is asserted to be identical across configurations, so all
    timings are known to cover the same work.
    """
    mib32 = 32 * 1024 * 1024

    # (label, opener) pairs — each opener builds a fresh file object.
    configs = [
        ('GzipFile',
         lambda: gzip.GzipFile(filename, 'rb')),
        ('IndexedGzipFile(drop_handles=True, spacing=0)',
         lambda: igzip.IndexedGzipFile(filename, drop_handles=True, spacing=0)),
        ('IndexedGzipFile(drop_handles=False, spacing=0)',
         lambda: igzip.IndexedGzipFile(filename, drop_handles=False, spacing=0)),
        ('IndexedGzipFile(drop_handles=True, spacing=32 MiB)',
         lambda: igzip.IndexedGzipFile(filename, drop_handles=True, spacing=mib32)),
        ('IndexedGzipFile(drop_handles=False, spacing=32 MiB)',
         lambda: igzip.IndexedGzipFile(filename, drop_handles=False, spacing=mib32)),
        ('IndexedGzipFile(drop_handles=True, spacing=32 MiB, readbuf_size=uncompressed_size)',
         lambda: igzip.IndexedGzipFile(filename, drop_handles=True, spacing=mib32,
                                       readbuf_size=uncompressed_size)),
        ('IndexedGzipFile(drop_handles=False, spacing=32 MiB, readbuf_size=uncompressed_size)',
         lambda: igzip.IndexedGzipFile(filename, drop_handles=False, spacing=mib32,
                                       readbuf_size=uncompressed_size)),
    ]

    # Pad labels so the progress columns line up across configurations.
    width = max(len(name) for name, _ in configs) + len("Read 131072 KiB chunks")
    namefmt = '{{:<{}s}}'.format(width)

    chunksizes = [4 * 1024, 32 * 1024, 128 * 1024,
                  1024 * 1024, mib32, uncompressed_size]

    firstMd5 = None

    for name, opener in configs:
        for chunksize in chunksizes:

            def update(i):
                # \r keeps the progress report on one console line.
                label = f"Read {chunksize // 1024:6} KiB chunks from {namefmt.format(name)}"
                print(f'\r{label} {100.0 * i / uncompressed_size:6.2f} %', end='')
                sys.stdout.flush()

            with opener() as f:
                md5, time = benchmark_file(f, chunksize, update)

            print(f' {md5} {time:0.3f} s')

            # All configurations must decompress to identical bytes.
            if firstMd5 is None:
                firstMd5 = md5
            else:
                assert firstMd5 == md5
    print()
def _test_IndexedGzipFile_open_close(testfile, drop):
    """Open ``testfile``, seek and read a few bytes, then close it.

    :param testfile: path to a gzip-compressed file
    :param drop:     value passed through as ``drop_handles``
    """
    f = igzip.IndexedGzipFile(filename=testfile, drop_handles=drop)
    # try/finally so the handle is released even if seek/read raise —
    # the original leaked the file object on error.
    try:
        f.seek(10)
        f.read(10)
    finally:
        f.close()
def compute_vol_hit_stats(roi_file,vfms,bboxes,idxs,readwith='indexgzip',n_jobs=1,
                          atlas_dir=None,atlas_name=None,run_type='sharedmem',
                          joblib_cache_dir='/tmp'):
    """Compute hit statistics for an ROI against a set of 4D atlas volumes.

    :param roi_file:   path to the ROI nifti image
    :param vfms:       dataframe mapping volume idx -> ('nii_file', '4dvolind')
    :param bboxes:     bounding boxes used to pre-filter non-overlapping volumes
    :param idxs:       volume indices to evaluate, or the string 'all'
    :param readwith:   unused here; kept for interface compatibility
    :param n_jobs:     number of parallel joblib workers
    :param atlas_dir:  atlas folder; defaults to '<abd>/<atlas_name>'
    :param atlas_name: atlas name, used when atlas_dir is None
    :param run_type:   'simple' (decompress per volume in each worker) or
                       'sharedmem' (load each file once, fan out over volumes)
    :param joblib_cache_dir: temp folder handed to joblib.Parallel
    :return: DataFrame of hit stats indexed by volume idx, or None if no
             volume's bounding box overlapped the ROI
    """
    # Parenthesised print works identically under Python 2 and 3
    # (was a bare Python-2 print statement).
    print('computing hit stats for roi %s' % roi_file)

    roi_img = nib.load(roi_file)
    roi_dat = roi_img.get_data()

    # BUG FIX: was `idxs is 'all'` — identity comparison against a string
    # literal only works by interning accident; use equality.
    if idxs == 'all':
        idxs = range(vfms.shape[0])

    # Only read files with overlapping bounding boxes.
    bbox_isol,bbox_propol = compute_roi_bbox_overlaps(bboxes,roi_file)
    bbox_isol_idx = np.nonzero(bbox_isol)[0]
    idxsinbbox = [idx for idx in idxs if idx in bbox_isol_idx]

    if atlas_dir is None:  # was `== None`; `is None` is the correct idiom
        atlas_dir = '%s/%s' % (abd, atlas_name)

    # Compute hit stats for the ROI on the atlas volumes.
    if run_type == 'simple':
        hstats = Parallel(n_jobs=n_jobs, temp_folder=joblib_cache_dir)\
                 (delayed(hit_stats_for_vols)
                  (roi_dat, igzip4dnii(vfms.ix[idx]['nii_file'],
                                       vfms.ix[idx]['4dvolind'],
                                       atlas_name=atlas_name,
                                       atlas_dir=atlas_dir))
                  for idx in idxsinbbox)
        idxsused = idxsinbbox
    elif run_type == 'sharedmem':
        # Loop through each file, load it into memory once, and spawn
        # parallel jobs over the volumes it contributes.
        hstats, idxsused = [], []
        unique_fs_inbbox = sorted(np.unique(vfms.ix[idxsinbbox]['nii_file']))
        for fname in unique_fs_inbbox:
            fpath = '%s/%s' % (atlas_dir, fname)
            idxs_forthisf = vfms[vfms.nii_file == fname].index
            idxstorun = [idx for idx in idxs_forthisf if idx in idxsinbbox]
            volinds = vfms.ix[idxstorun]['4dvolind']

            # Large seek-point spacing and a 128 KiB read buffer
            # (rather than the default 16 KB buffer).
            fobj = igzip.IndexedGzipFile(filename=fpath, spacing=4194304,
                                         readbuf_size=131072)

            # Create a nibabel image using the existing file handle.
            fmap = nib.Nifti1Image.make_file_map()
            fmap['image'].fileobj = fobj
            image = nib.Nifti1Image.from_file_map(fmap)

            vols = [np.squeeze(image.dataobj[:, :, :, int(v)]) for v in volinds]

            res = Parallel(n_jobs=n_jobs, temp_folder=joblib_cache_dir)\
                  (delayed(hit_stats_for_vols)(roi_dat, vol) for vol in vols)
            hstats += res
            idxsused += list(idxstorun)
    else:
        # Previously an unknown run_type fell through to a NameError on
        # `hstats`; fail with an explicit message instead.
        raise ValueError("unrecognised run_type: %s" % run_type)

    if len(hstats) > 0:
        df_hstats = pd.DataFrame({idx: hstat
                                  for idx, hstat in zip(idxsused, hstats)}).T
        df_hstats.columns.names = ['metric']
        df_hstats.index.names = ['idx']
    else:
        df_hstats = None

    return df_hstats
ap.add_argument("-z", "--gz", help="fastq is zipped", action='store_true') ap.add_argument("-m", "--mode", help="mode of storage (shelve/sqlite)", default="sqlite") args = vars(ap.parse_args()) if (args["mode"] == "sqlite"): conn = sqlite3.connect(args["idx"]) c = conn.cursor() if (args["mode"] == "shelve"): pos_bar = shelve.open(args["idx"], writeback=False) #myfile = open(args["fastq"], "rt") if (args["gz"]): myfile = igzip.IndexedGzipFile(args["fastq"], index_file=args["fastq"] + '.gzidx') else: myfile = open(args["fastq"], "rt") bx_re = re.compile("BX:Z:([ATCG]{16})-1") for barcode in open(args['bdx']): print("barcode " + barcode, file=sys.stderr) barcode = barcode.rstrip() pos = -1 if (args["mode"] == "sqlite"): t = (barcode, ) c.execute('select pos from bx_pos where barcode=?', t) for row in c: pos = row[0] if (args["mode"] == "shelve"):
'bz2': CompressionInfo( ['bz2', 'bzip2'], ['tb2', 'tbz', 'tbz2', 'tz2'], 'indexed_bzip2', lambda x: (x.read(4)[:3] == b'BZh' and x.read(6) == (0x314159265359).to_bytes(6, 'big')), lambda x: indexed_bzip2.open(x), ), 'gz': CompressionInfo( ['gz', 'gzip'], ['taz', 'tgz'], 'indexed_gzip', lambda x: x.read(2) == b'\x1F\x8B', lambda x: indexed_gzip.IndexedGzipFile(fileobj=x), ), 'rar': CompressionInfo( ['rar'], [], 'rarfile', lambda x: x.read(6) == b'Rar!\x1A\x07', lambda x: rarfile.RarFile(x), ), 'xz': CompressionInfo( ['xz'], ['txz'], 'lzmaffi' if 'lzmaffi' in sys.modules else 'xz', lambda x: x.read(6) == b"\xFD7zXZ\x00",
def create_igz(fid):
    """Wrap an already-open file object in an ``IndexedGzipFile``."""
    wrapped = igzip.IndexedGzipFile(fid=fid)
    return wrapped