Beispiel #1
0
    def __call__(self, chunk, limit=None):
        """Return the lines of one chunk of the indexed gzip file.

        :param int chunk: key into ``self.lines_index`` selecting the chunk
        :param int limit: upper bound on the number of lines returned.
            ``None``, or any value above ``self.chunk_size``, falls back to
            ``self.chunk_size``.
        :return list: the lines read, in file order
        """
        # Seek position at which the requested chunk begins.
        offset = self.lines_index[chunk]

        # Never hand back more than one chunk's worth of lines.
        if limit is None or limit > self.chunk_size:
            limit = self.chunk_size

        with igzip.IndexedGzipFile(self.urlpath, index_file=self.igzip_index_path) as gz:
            gz.seek(offset)
            # Pairing with a bounded range stops after ``limit`` lines,
            # or earlier if the file runs out.
            return [line for _, line in zip(range(limit), gz)]
Beispiel #2
0
def load_image(fname):
    """Load the NIfTI image ``fname`` so that it is read via indexed_gzip.

    How the image is opened depends on the module-level ``nibver`` (the
    installed nibabel version):

    * ``nibver <= 2.1.0``: an ``IndexedGzipFile`` is opened by hand and
      given to nibabel through a file map.
    * ``2.2.0 <= nibver < 2.3.0``: ``nib.load`` is called with
      ``keep_file_open='auto'``.
    * anything else: plain ``nib.load``.

    NOTE(review): versions strictly between 2.1.0 and 2.2.0 (e.g. 2.2.0
    pre-releases) end up in the last branch - confirm that is intended.

    :param fname: path of the gzipped NIfTI file
    :return: the loaded nibabel image
    """

    # nibabel up to 2.1.0 is not indexed_gzip-aware, so hand it an
    # already-open IndexedGzipFile through a file map.
    if nibver <= Version('2.1.0'):
        fobj = igzip.IndexedGzipFile(fname)
        fmap = nib.Nifti1Image.make_file_map()
        # The file map of a single-file NIfTI image has exactly one entry,
        # keyed 'image'. Indexing it with the file's base name only worked
        # for files that happened to be called "image.nii.gz".
        fmap['image'].fileobj = fobj
        return nib.Nifti1Image.from_file_map(fmap)

    # nibabel 2.2.x, we have to set keep_file_open='auto'
    # to get it to use indexed_gzip
    if Version('2.2.0') <= nibver < Version('2.3.0'):
        return nib.load(fname, keep_file_open='auto')

    # nibabel >= 2.3.x uses indexed_gzip automatically
    return nib.load(fname)
def benchmark(filename, uncompressed_size):
    """Time sequential reads of ``filename`` through several gzip readers.

    Every reader configuration is run once per chunk size. For each run the
    MD5 and elapsed time reported by ``benchmark_file`` are printed, and the
    MD5 is asserted to equal the one from the very first run.

    :param filename: path of the gzip file to read
    :param uncompressed_size: size in bytes of the uncompressed data; used
        as the largest chunk size, as a read buffer size, and as the
        denominator of the progress percentage
    """
    mib32 = 32 * 1024 * 1024

    def indexed(**kwargs):
        # Defer construction so each run gets a freshly opened file.
        return lambda: igzip.IndexedGzipFile(filename, **kwargs)

    # (display name, zero-argument factory) for every reader under test.
    candidates = [
        ('GzipFile',
         lambda: gzip.GzipFile(filename, 'rb')),
        ('IndexedGzipFile(drop_handles=True, spacing=0)',
         indexed(drop_handles=True, spacing=0)),
        ('IndexedGzipFile(drop_handles=False, spacing=0)',
         indexed(drop_handles=False, spacing=0)),
        ('IndexedGzipFile(drop_handles=True, spacing=32 MiB)',
         indexed(drop_handles=True, spacing=mib32)),
        ('IndexedGzipFile(drop_handles=False, spacing=32 MiB)',
         indexed(drop_handles=False, spacing=mib32)),
        ('IndexedGzipFile(drop_handles=True, spacing=32 MiB, readbuf_size=uncompressed_size)',
         indexed(drop_handles=True, spacing=mib32, readbuf_size=uncompressed_size)),
        ('IndexedGzipFile(drop_handles=False, spacing=32 MiB, readbuf_size=uncompressed_size)',
         indexed(drop_handles=False, spacing=mib32, readbuf_size=uncompressed_size)),
    ]

    # Left-align every name in a column wide enough for the longest one.
    widest = max(len(name) for name, _ in candidates)
    namefmt = '{{:<{}s}}'.format(widest + len("Read 131072 KiB chunks"))

    chunksizes = [4 * 1024, 32 * 1024, 128 * 1024, 1024 * 1024, mib32, uncompressed_size]

    reference_md5 = None
    for name, opener in candidates:
        padded_name = namefmt.format(name)

        for chunksize in chunksizes:
            label = f"Read {chunksize // 1024:6} KiB chunks from {padded_name}"

            def update(i):
                # Redraw the progress line in place.
                print(f'\r{label} {100.0 * i / uncompressed_size:6.2f} %', end='')
                sys.stdout.flush()

            with opener() as f:
                md5, elapsed = benchmark_file(f, chunksize, update)

            print(f'  {md5} {elapsed:0.3f} s')

            # Every reader and chunk size must produce the same digest.
            if reference_md5 is None:
                reference_md5 = md5
            else:
                assert reference_md5 == md5

        print()
Beispiel #4
0
def _test_IndexedGzipFile_open_close(testfile, drop):
    """Open ``testfile``, seek and read a little, then close it explicitly.

    :param testfile: path passed as ``filename`` to ``IndexedGzipFile``
    :param drop: value passed as ``drop_handles``
    """
    offset = nbytes = 10

    handle = igzip.IndexedGzipFile(filename=testfile, drop_handles=drop)
    handle.seek(offset)
    handle.read(nbytes)
    # Closed by hand on purpose: the explicit close() is part of the test.
    handle.close()
Beispiel #5
0
def compute_vol_hit_stats(roi_file, vfms, bboxes, idxs, readwith='indexgzip',
                          n_jobs=1, atlas_dir=None, atlas_name=None,
                          run_type='sharedmem', joblib_cache_dir='/tmp'):
    """Compute hit statistics of an ROI against a set of atlas volumes.

    Only volumes selected by ``compute_roi_bbox_overlaps`` (first return
    value non-zero) are read. Each selected volume is passed, together with
    the ROI data, to ``hit_stats_for_vols``.

    :param roi_file: path of the ROI image, loaded with ``nib.load``
    :param vfms: DataFrame with at least the columns ``'nii_file'`` and
        ``'4dvolind'``, one row per atlas volume
    :param bboxes: bounding boxes, forwarded to ``compute_roi_bbox_overlaps``
    :param idxs: row labels of ``vfms`` to consider, or the string ``'all'``
        for ``range(vfms.shape[0])``
    :param readwith: unused by this function
    :param n_jobs: number of joblib workers
    :param atlas_dir: directory holding the atlas files; defaults to
        ``'<abd>/<atlas_name>'`` (``abd`` is a module-level name)
    :param atlas_name: atlas name, used for the default ``atlas_dir`` and
        forwarded to ``igzip4dnii`` in ``'simple'`` mode
    :param run_type: ``'simple'`` (each job gets its volume from
        ``igzip4dnii``) or ``'sharedmem'`` (each file is opened once in this
        process, its volumes are loaded, then jobs are spawned)
    :param joblib_cache_dir: joblib ``temp_folder``
    :return: DataFrame with one row per processed volume (index named
        ``'idx'``, columns named ``'metric'``), or ``None`` if nothing was
        processed
    :raises ValueError: if ``run_type`` is not one of the two known modes
    """
    # Fail early with a clear message; previously an unknown run_type only
    # surfaced later as a NameError.
    if run_type not in ('simple', 'sharedmem'):
        raise ValueError("unknown run_type %r (expected 'simple' or 'sharedmem')"
                         % (run_type,))

    # Single parenthesised argument: prints the same on Python 2 and 3.
    print('computing hit stats for roi %s' % roi_file)

    roi_img = nib.load(roi_file)
    roi_dat = roi_img.get_data()

    # Compare by value, and only when idxs really is a string, so that
    # array-like idxs never go through an element-wise comparison.
    if isinstance(idxs, str) and idxs == 'all':
        idxs = range(vfms.shape[0])

    # only read files with overlapping bounding boxes
    bbox_isol, bbox_propol = compute_roi_bbox_overlaps(bboxes, roi_file)
    overlapping = set(np.nonzero(bbox_isol)[0].tolist())
    idxsinbbox = [idx for idx in idxs if idx in overlapping]

    if atlas_dir is None:
        atlas_dir = '%s/%s' % (abd, atlas_name)

    # NOTE(review): `.ix` was removed in pandas 1.0. It is kept here because
    # the right replacement (.loc vs .iloc) depends on the index type of
    # `vfms`, which is not visible from this function.

    # compute hit stats for roi on atlas volumes
    if run_type == 'simple':

        hstats = Parallel(n_jobs=n_jobs, temp_folder=joblib_cache_dir)(
            delayed(hit_stats_for_vols)(
                roi_dat,
                igzip4dnii(vfms.ix[idx]['nii_file'],
                           vfms.ix[idx]['4dvolind'],
                           atlas_name=atlas_name, atlas_dir=atlas_dir))
            for idx in idxsinbbox)

        idxsused = idxsinbbox

    else:  # run_type == 'sharedmem'

        # Loop through each file, load in to memory, and spawn parallel jobs
        hstats, idxsused = [], []
        inbbox = set(idxsinbbox)

        unique_fs_inbbox = sorted(np.unique(vfms.ix[idxsinbbox]['nii_file']))

        for fname in unique_fs_inbbox:

            fpath = '%s/%s' % (atlas_dir, fname)

            idxs_forthisf = vfms[vfms.nii_file == fname].index
            idxstorun = [idx for idx in idxs_forthisf if idx in inbbox]
            volinds = vfms.ix[idxstorun]['4dvolind']

            # Open with explicit seek-point spacing (4194304) and read
            # buffer size (131072) instead of the library defaults.
            fobj = igzip.IndexedGzipFile(filename=fpath, spacing=4194304,
                                         readbuf_size=131072)
            try:
                # Create a nibabel image using
                # the existing file handle.
                fmap = nib.Nifti1Image.make_file_map()
                fmap['image'].fileobj = fobj
                image = nib.Nifti1Image.from_file_map(fmap)

                # Pull the needed volumes into memory while the file is open.
                vols = [np.squeeze(image.dataobj[:, :, :, int(v)])
                        for v in volinds]
            finally:
                # The handle is not needed once the volumes are loaded.
                fobj.close()

            res = Parallel(n_jobs=n_jobs, temp_folder=joblib_cache_dir)(
                delayed(hit_stats_for_vols)(roi_dat, vol) for vol in vols)

            hstats += res
            idxsused += list(idxstorun)

    if len(hstats) == 0:
        return None

    # One column per volume, then transpose to get one row per volume.
    df_hstats = pd.DataFrame(dict(zip(idxsused, hstats))).T
    df_hstats.columns.names = ['metric']
    df_hstats.index.names = ['idx']
    return df_hstats
Beispiel #6
0
ap.add_argument("-z", "--gz", help="fastq is zipped", action='store_true')
ap.add_argument("-m",
                "--mode",
                help="mode of storage (shelve/sqlite)",
                default="sqlite")
args = vars(ap.parse_args())

if (args["mode"] == "sqlite"):
    conn = sqlite3.connect(args["idx"])
    c = conn.cursor()
if (args["mode"] == "shelve"):
    pos_bar = shelve.open(args["idx"], writeback=False)

#myfile = open(args["fastq"], "rt")
if (args["gz"]):
    myfile = igzip.IndexedGzipFile(args["fastq"],
                                   index_file=args["fastq"] + '.gzidx')
else:
    myfile = open(args["fastq"], "rt")

bx_re = re.compile("BX:Z:([ATCG]{16})-1")
for barcode in open(args['bdx']):
    print("barcode " + barcode, file=sys.stderr)
    barcode = barcode.rstrip()
    pos = -1
    if (args["mode"] == "sqlite"):
        t = (barcode, )
        c.execute('select pos from bx_pos where barcode=?', t)
        for row in c:
            pos = row[0]

    if (args["mode"] == "shelve"):
Beispiel #7
0
 'bz2':
 CompressionInfo(
     ['bz2', 'bzip2'],
     ['tb2', 'tbz', 'tbz2', 'tz2'],
     'indexed_bzip2',
     lambda x: (x.read(4)[:3] == b'BZh' and x.read(6) ==
                (0x314159265359).to_bytes(6, 'big')),
     lambda x: indexed_bzip2.open(x),
 ),
 'gz':
 CompressionInfo(
     ['gz', 'gzip'],
     ['taz', 'tgz'],
     'indexed_gzip',
     lambda x: x.read(2) == b'\x1F\x8B',
     lambda x: indexed_gzip.IndexedGzipFile(fileobj=x),
 ),
 'rar':
 CompressionInfo(
     ['rar'],
     [],
     'rarfile',
     lambda x: x.read(6) == b'Rar!\x1A\x07',
     lambda x: rarfile.RarFile(x),
 ),
 'xz':
 CompressionInfo(
     ['xz'],
     ['txz'],
     'lzmaffi' if 'lzmaffi' in sys.modules else 'xz',
     lambda x: x.read(6) == b"\xFD7zXZ\x00",
Beispiel #8
0
def create_igz(fid):
    """Build an ``IndexedGzipFile`` from ``fid``.

    ``fid`` is forwarded unchanged as the ``fid`` keyword argument.

    NOTE(review): presumably ``fid`` is an already-open file object; other
    call sites pass file objects as ``fileobj=`` - confirm which keyword the
    installed indexed_gzip version expects.
    """
    indexed_file = igzip.IndexedGzipFile(fid=fid)
    return indexed_file