def open_db(self):
    self.terms_ldb = leveldb.LevelDB(self.terms_fl)
    self.docs_ldb = leveldb.LevelDB(self.docs_fl)
    self.doc_buffer_size = 0
    self.term_buffer_size = 0
    #self.doc_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype="S%d" % self.max_doc_size)
    self.doc_flush_buffer = [None] * self.max_doc_flush_buffer_size
    self.term_flush_buffer = np.empty(self.max_term_flush_buffer_size, dtype="S%d" % self.max_term_size)
    self.doc_id_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype=np.int64)
    self.term_id_flush_buffer = np.empty(self.max_term_flush_buffer_size, dtype=np.int64)
    if self.compression == COMPRESSION.NONE:
        self.compress = lambda string: string
        self.decompress = lambda string: string
    elif self.compression == COMPRESSION.ZLIB:
        import zlib
        self.compress = lambda string: zlib.compress(string, self.compression_level)
        self.decompress = lambda string: zlib.decompress(string)
    elif self.compression == COMPRESSION.LZMA:
        import backports.lzma as lzma
        self.compress = lambda string: lzma.compress(bytearray(string), format=lzma.FORMAT_RAW)
        self.decompress = lambda data: lzma.decompress(data, format=lzma.FORMAT_RAW)
    elif self.compression == COMPRESSION.LZ4R:
        import lz4
        self.compress = lambda string: lz4.compress(string)
        self.decompress = lambda string: lz4.decompress(string)
    elif self.compression == COMPRESSION.LZ4H:
        import lz4
        self.compress = lambda string: lz4.compressHC(string)
        self.decompress = lambda string: lz4.decompress(string)
    else:
        raise Exception("Wrong compression type %r" % self.compression)
def _read_bucket(self, doc, column_set, column_dtypes, include_symbol, include_images, columns):
    rtn = {}
    if doc[VERSION] != 3:
        raise ArcticException("Unhandled document version: %s" % doc[VERSION])
    rtn[INDEX] = np.cumsum(np.fromstring(lz4.decompress(doc[INDEX]), dtype='uint64'))
    doc_length = len(rtn[INDEX])
    rtn_length = len(rtn[INDEX])
    if include_symbol:
        rtn['SYMBOL'] = [doc[SYMBOL], ] * rtn_length
    column_set.update(doc[COLUMNS].keys())
    for c in column_set:
        try:
            coldata = doc[COLUMNS][c]
            dtype = np.dtype(coldata[DTYPE])
            values = np.fromstring(lz4.decompress(coldata[DATA]), dtype=dtype)
            self._set_or_promote_dtype(column_dtypes, c, dtype)
            rtn[c] = self._empty(rtn_length, dtype=column_dtypes[c])
            rowmask = np.unpackbits(np.fromstring(lz4.decompress(coldata[ROWMASK]),
                                                  dtype='uint8'))[:doc_length].astype('bool')
            rtn[c][rowmask] = values
        except KeyError:
            rtn[c] = None
    if include_images and doc.get(IMAGE_DOC, {}).get(IMAGE, {}):
        rtn = self._prepend_image(rtn, doc[IMAGE_DOC], rtn_length, column_dtypes, column_set, columns)
    return rtn
def get_coldata(coldata):
    """ return values and rowmask """
    dtype = np.dtype(coldata[DTYPE])
    values = np.fromstring(lz4.decompress(coldata[DATA]), dtype=dtype)
    rowmask = np.unpackbits(np.fromstring(lz4.decompress(coldata[ROWMASK]), dtype='uint8'))
    return list(values), list(rowmask)
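# A minimal round-trip sketch for get_coldata above. It assumes the legacy
# python-lz4 block API (module-level lz4.compress/lz4.decompress) and that
# DTYPE, DATA and ROWMASK are the string key constants used by the bucket
# format; the values here are illustrative only.
import lz4
import numpy as np
raw = np.array([120, 122, 3], dtype='int64')
mask = np.packbits(np.array([1, 1, 1, 0, 0, 0, 0, 0], dtype='uint8'))
coldata = {DTYPE: 'int64',
           DATA: lz4.compress(raw.tostring()),
           ROWMASK: lz4.compress(mask.tostring())}
values, rowmask = get_coldata(coldata)
# values -> [120, 122, 3]; rowmask -> [1, 1, 1, 0, 0, 0, 0, 0]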
def roundtrip(size=None):
    if size is None:
        size = struct.unpack(">I", b"\0" + os.urandom(3))[0]
    data = os.urandom(size)
    assert rustlz4.decompress(pylz4.compress(data)) == data
    assert pylz4.decompress(buffer(rustlz4.compress(data))) == data
    assert rustlz4.decompress(pylz4.compressHC(data)) == data
    assert pylz4.decompress(buffer(rustlz4.compresshc(data))) == data
def decompress_array(str_list):
    """ Decompress a list of strings """
    if ENABLE_PARALLEL:
        return clz4.decompressarr(str_list)
    return [lz4.decompress(chunk) for chunk in str_list]
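# Quick sanity check for decompress_array, assuming the legacy lz4 module
# API for the compression side; exercises whichever of the two paths
# (parallel or sequential) is enabled.
import lz4
chunks = [lz4.compress('chunk-%d' % i) for i in range(4)]
assert decompress_array(chunks) == ['chunk-%d' % i for i in range(4)]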
def loadCOCOAndOverSeg(im_set="test", detector="sf", N_SPIX=1000, fold=0):
    from pickle import dumps, loads
    try:
        import lz4, pickle
        decompress = lambda s: pickle.loads(lz4.decompress(s))
        compress = lambda o: lz4.compressHC(pickle.dumps(o))
    except ImportError:
        compress = lambda x: x
        decompress = lambda x: x
    from gop import contour, dataset, segmentation
    FILE_NAME = '/tmp/coco_%s_%s_%d_%d.dat' % (im_set, detector, N_SPIX, fold)
    try:
        with open(FILE_NAME, 'rb') as f:
            over_segs, segmentations = loads(f.read())
        over_seg = segmentation.ImageOverSegmentationVec()
        for i in over_segs:
            over_seg.append(decompress(i))
        return over_seg, [decompress(i) for i in segmentations], []
    except FileNotFoundError:
        pass
    # Load the dataset
    data = dataset.loadCOCO2014(im_set == "train", im_set == "valid", fold)
    # COCO has some pretty gray scale images (WTF!!!)
    images = [e['image'] if e['image'].C == 3 else e['image'].tileC(3) for e in data]
    try:
        segmentations = [e['segmentation'] for e in data]
    except KeyError:
        segmentations = []
    # Do the over-segmentation
    if detector == 'sf':
        detector = contour.StructuredForest()
        detector.load('../data/sf.dat')
    elif detector == "mssf":
        detector = contour.MultiScaleStructuredForest()
        detector.load("../data/sf.dat")
    elif detector == 'st':
        detector = contour.SketchTokens()
        detector.load('../data/st_full_c.dat')
    else:
        detector = contour.DirectedSobel()
    if detector is not None:
        over_segs = segmentation.generateGeodesicKMeans(detector, images, N_SPIX)
    with open(FILE_NAME, 'wb') as f:
        f.write(dumps(([compress(i) for i in over_segs],
                       [compress(i) for i in segmentations])))
    return over_segs, segmentations, []
def _filter(v):
    path, ids = v
    result_set = set(ids)
    with open(path, 'rb') as f:
        f.seek(-8, 2)
        footer_fields_size, footer_indices_size = struct.unpack('II', f.read(8))
        f.seek(-8 - footer_fields_size - footer_indices_size, 2)
        indices = cPickle.loads(zlib.decompress(f.read(footer_indices_size)))
        _fields = marshal.loads(decompress(f.read(footer_fields_size)))
        for k, v in filters.iteritems():
            result = set()
            if k not in _fields:
                raise RuntimeError('%s is not in fields!' % k)
            if k not in indices:
                raise RuntimeError('%s is not indexed' % k)
            index = indices[k]
            if isinstance(v, types.FunctionType):
                r = index.filter(v)
                if r is not None:
                    result = set(r)
                else:
                    result = result_set
            else:
                if not isinstance(v, list):
                    v = [v]
                for vv in v:
                    for _id in result_set:
                        if index.get(vv, _id):
                            result.add(_id)
    return path, result
def decompress(file):
    # Only handle files with the .lz4 suffix; write the decompressed
    # payload next to the original, minus the extension.
    if not file.endswith('.lz4'):
        print "The file type is not the expected."
        return
    with open(file, "rb") as data:
        with open(file[:-4], "wb") as decompressed_data:
            decompressed_data.write(lz4.decompress(data.read()))
def _decompress(self, data):
    # a decompression function like lrzip in spirit: lzma < bz2 < zlib < lzo < lz4;
    # each codec is attempted and skipped if the payload was not wrapped with it
    try:
        data = lzma.decompress(data)
    except:
        pass
    try:
        data = bz2.decompress(data)
    except:
        pass
    try:
        data = zlib.decompress(data)
    except:
        pass
    try:
        data = data.decode('zlib')
    except:
        pass
    try:
        data = lzo.decompress(data)
    except:
        pass
    try:
        data = lz4.decompress(data)
    except:
        pass
    if self.shuffle:
        try:
            print "unshuffling..."
            data = buff_unshuffle(data)
            print "data unshuffled..."
        except:
            pass
    return data
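# A hypothetical inverse of _decompress above, for illustration only: apply
# the same codecs in the opposite order (shuffle first, lzma last), skipping
# any codec that is unavailable or fails, so the try/except chain above can
# peel them back off. buff_shuffle is an assumed counterpart to buff_unshuffle.
def _compress(self, data):
    if self.shuffle:
        data = buff_shuffle(data)
    for codec in (lz4.compress, lzo.compress, zlib.compress,
                  bz2.compress, lzma.compress):
        try:
            data = codec(data)
        except:
            pass
    return data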
def deserialize_uint64_blocks(compressed_blocks, shape):
    """
    Reconstitute a volume that was serialized with serialize_uint64_blocks(), above.

    NOTE: If the volume is not 64-px aligned, then the output will NOT be C-contiguous.
    """
    if (np.array(shape) % 64).any():
        padding = 64 - (np.array(shape) % 64)
        aligned_shape = shape + padding
    else:
        aligned_shape = shape

    aligned_volume = np.empty(aligned_shape, dtype=np.uint64)
    block_view = view_as_blocks(aligned_volume, (64, 64, 64))
    for bi, (zi, yi, xi) in enumerate(np.ndindex(*block_view.shape[:3])):
        compressed_block = compressed_blocks[bi]

        # (See note above regarding recompression with LZ4)
        encoded_block = lz4.decompress(compressed_block)
        block = decode_label_block(encoded_block)
        block_view[zi, yi, xi] = block

    if shape == tuple(aligned_shape):
        volume = aligned_volume
    else:
        # Trim
        volume = np.asarray(aligned_volume[box_to_slicing((0, 0, 0), shape)], order='C')

    return volume
def test_tickstore_pandas_to_bucket_image():
    symbol = 'SYM'
    tz = 'UTC'
    initial_image = {'index': dt(2014, 1, 1, 0, 0, tzinfo=mktz(tz)), 'A': 123, 'B': 54.4, 'C': 'DESC'}
    data = [{'A': 120, 'D': 1}, {'A': 122, 'B': 2.0}, {'A': 3, 'B': 3.0, 'D': 1}]
    tick_index = [dt(2014, 1, 2, 0, 0, tzinfo=mktz(tz)),
                  dt(2014, 1, 3, 0, 0, tzinfo=mktz(tz)),
                  dt(2014, 1, 4, 0, 0, tzinfo=mktz(tz))]
    data = pd.DataFrame(data, index=tick_index)
    bucket, final_image = TickStore._pandas_to_bucket(data, symbol, initial_image)
    assert final_image == {'index': dt(2014, 1, 4, 0, 0, tzinfo=mktz(tz)), 'A': 3, 'B': 3.0, 'C': 'DESC', 'D': 1}
    assert IMAGE_DOC in bucket
    assert bucket[COUNT] == 3
    assert bucket[START] == dt(2014, 1, 1, 0, 0, tzinfo=mktz(tz))
    assert bucket[END] == dt(2014, 1, 4, 0, 0, tzinfo=mktz(tz))
    assert set(bucket[COLUMNS]) == set(('A', 'B', 'D'))
    assert set(bucket[COLUMNS]['A']) == set((ROWMASK, DTYPE, DATA))
    assert get_coldata(bucket[COLUMNS]['A']) == ([120, 122, 3], [1, 1, 1, 0, 0, 0, 0, 0])
    values, rowmask = get_coldata(bucket[COLUMNS]['B'])
    assert np.isnan(values[0]) and values[1:] == [2.0, 3.0]
    assert rowmask == [1, 1, 1, 0, 0, 0, 0, 0]
    values, rowmask = get_coldata(bucket[COLUMNS]['D'])
    assert np.isnan(values[1])
    assert values[0] == 1 and values[2] == 1
    assert rowmask == [1, 1, 1, 0, 0, 0, 0, 0]
    index = [dt.fromtimestamp(int(i / 1000)).replace(tzinfo=mktz(tz))
             for i in list(np.cumsum(np.fromstring(lz4.decompress(bucket[INDEX]), dtype='uint64')))]
    assert index == tick_index
    assert bucket[COLUMNS]['A'][DTYPE] == 'int64'
    assert bucket[COLUMNS]['B'][DTYPE] == 'float64'
    assert bucket[SYMBOL] == symbol
    assert bucket[IMAGE_DOC] == {IMAGE: initial_image, IMAGE_TIME: initial_image['index']}
def parsefileblob(path, decompress):
    raw = None
    f = open(path, "r")
    try:
        raw = f.read()
    finally:
        f.close()

    if decompress:
        raw = lz4.decompress(raw)

    index = raw.index('\0')
    size = int(raw[:index])
    data = raw[(index + 1):(index + 1 + size)]
    start = index + 1 + size

    firstnode = None
    mapping = {}
    while start < len(raw):
        divider = raw.index('\0', start + 80)

        currentnode = raw[start:(start + 20)]
        if not firstnode:
            firstnode = currentnode

        p1 = raw[(start + 20):(start + 40)]
        p2 = raw[(start + 40):(start + 60)]
        linknode = raw[(start + 60):(start + 80)]
        copyfrom = raw[(start + 80):divider]

        mapping[currentnode] = (p1, p2, linknode, copyfrom)
        start = divider + 1

    return size, firstnode, mapping
def test_tickstore_to_bucket_with_image():
    symbol = 'SYM'
    tz = 'UTC'
    initial_image = {'index': dt(2014, 1, 1, 0, 0, tzinfo=mktz(tz)), 'A': 123, 'B': 54.4, 'C': 'DESC'}
    data = [{'index': dt(2014, 1, 1, 0, 1, tzinfo=mktz(tz)), 'A': 124, 'D': 0},
            {'index': dt(2014, 1, 1, 0, 2, tzinfo=mktz(tz)), 'A': 125, 'B': 27.2}]
    bucket, final_image = TickStore._to_bucket(data, symbol, initial_image)
    assert bucket[COUNT] == 2
    assert bucket[END] == dt(2014, 1, 1, 0, 2, tzinfo=mktz(tz))
    assert set(bucket[COLUMNS]) == set(('A', 'B', 'D'))
    assert set(bucket[COLUMNS]['A']) == set((ROWMASK, DTYPE, DATA))
    assert get_coldata(bucket[COLUMNS]['A']) == ([124, 125], [1, 1, 0, 0, 0, 0, 0, 0])
    assert get_coldata(bucket[COLUMNS]['B']) == ([27.2], [0, 1, 0, 0, 0, 0, 0, 0])
    assert get_coldata(bucket[COLUMNS]['D']) == ([0], [1, 0, 0, 0, 0, 0, 0, 0])
    index = [dt.fromtimestamp(int(i / 1000)).replace(tzinfo=mktz(tz))
             for i in list(np.cumsum(np.fromstring(lz4.decompress(bucket[INDEX]), dtype='uint64')))]
    assert index == [i['index'] for i in data]
    assert bucket[COLUMNS]['A'][DTYPE] == 'int64'
    assert bucket[COLUMNS]['B'][DTYPE] == 'float64'
    assert bucket[SYMBOL] == symbol
    assert bucket[START] == initial_image['index']
    assert bucket[IMAGE_DOC][IMAGE] == initial_image
    assert bucket[IMAGE_DOC] == {IMAGE: initial_image, IMAGE_TIME: initial_image['index']}
    assert final_image == {'index': data[-1]['index'], 'A': 125, 'B': 27.2, 'C': 'DESC', 'D': 0}
def dump_cache(self):
    batch = leveldb.WriteBatch()
    for pattern, triple_id_pairs in self.cache.iteritems():
        try:
            pattern_triples = self.leveldb.Get(pattern)
            pattern_triples = lz4.decompress(pattern_triples)
            pattern_triples = pattern_triples.split(MERGING_INDEX_TRIPLE_LINE_DELIMITER)
        except KeyError:
            pattern_triples = []
        logging.info("Merging bin from %d to %d (%d new)." % (
            len(pattern_triples),
            len(pattern_triples) + len(triple_id_pairs),
            len(triple_id_pairs),
        ))
        for triple_id_pair in triple_id_pairs:
            pattern_triples.append(MERGING_INDEX_TRIPLE_ID_DELIMITER.join(triple_id_pair))
        pattern_triples_dump = MERGING_INDEX_TRIPLE_LINE_DELIMITER.join(pattern_triples)
        batch.Put(pattern, lz4.compressHC(pattern_triples_dump))
    self.leveldb.Write(batch)
    logging.info("Dump %d bins." % len(self.cache))
    self.cache = {}
    self.cache_size = 0
    gc.collect()
def read(self, subvolume):
    hex = hashlib.sha1(subvolume).hexdigest()
    file = ".btrfs/" + hex + ".lz4"
    with open(file, "r") as fd:
        compressed = fd.read()
    decompressed = lz4.decompress(compressed)
    return decompressed.split('\n')
def process_document(self, session, doc):
    data = doc.get_raw(session)
    new = lz4.decompress(data)
    return StringDocument(new, self.id, doc.processHistory,
                          parent=doc.parent, filename=doc.filename)
def chunk_image(chunks):
    out = ''.join(chunks)
    out = decompress(out)
    sr = dill.loads(out)
    print type(sr)
    return sr
def restore(store, bucketname, restorepath, parts, compress=True):
    bucketname = str(bucketname)
    for part in parts:
        part_content = store.get_object(bucketname, part)
        if compress:
            uncompressed_part_content = lz4.decompress(part_content)
        else:
            uncompressed_part_content = part_content
        j.sal.fs.writeFile(restorepath, uncompressed_part_content, append=True)
def loadVOCAndOverSeg(im_set="test", detector="sf", N_SPIX=1000, EVAL_DIFFICULT=False, year="2012"):
    from pickle import dumps, loads
    try:
        import lz4, pickle
        decompress = lambda s: pickle.loads(lz4.decompress(s))
        compress = lambda o: lz4.compressHC(pickle.dumps(o))
    except ImportError:
        compress = lambda x: x
        decompress = lambda x: x
    from gop import contour, dataset, segmentation
    FILE_NAME = '/tmp/%s_%s_%d_%d_%s.dat' % (im_set, detector, N_SPIX, EVAL_DIFFICULT, year)
    try:
        with open(FILE_NAME, 'rb') as f:
            over_segs, segmentations, boxes = loads(f.read())
        over_seg = segmentation.ImageOverSegmentationVec()
        for i in over_segs:
            over_seg.append(decompress(i))
        return over_seg, [decompress(i) for i in segmentations], [decompress(i) for i in boxes]
    except IOError:
        pass
    # Load the dataset
    #data = eval("dataset.loadVOC2012_small")(im_set=="train",im_set=="valid",im_set=="test")
    data = eval("dataset.loadVOC%s" % year)(im_set == "train", im_set == "valid", im_set == "test")
    images = [e['image'] for e in data]
    try:
        segmentations = [e['segmentation'] for e in data]
    except KeyError:
        segmentations = []
    boxes = [[a['bbox'] for a in e['annotation'] if not a['difficult'] or EVAL_DIFFICULT] for e in data]
    # Do the over-segmentation
    if detector == 'sf':
        detector = contour.StructuredForest()
        detector.load('../data/sf.dat')
    elif detector == "mssf":
        detector = contour.MultiScaleStructuredForest()
        detector.load("../data/sf.dat")
    elif detector == 'st':
        detector = contour.SketchTokens()
        detector.load('../data/st_full_c.dat')
    else:
        detector = contour.DirectedSobel()
    if detector is not None:
        over_segs = segmentation.generateGeodesicKMeans(detector, images, N_SPIX)
    with open(FILE_NAME, 'wb') as f:
        f.write(dumps(([compress(i) for i in over_segs],
                       [compress(i) for i in segmentations],
                       [compress(i) for i in boxes])))
    return over_segs, segmentations, boxes
def receivemissing(self, pipe, missingid):
    line = pipe.readline()[:-1]
    if not line:
        raise error.ResponseError(_("error downloading file "
                                    "contents: connection closed early\n"), '')
    size = int(line)
    data = pipe.read(size)
    self.localcache.write(missingid, lz4.decompress(data))
def main(argv):
    if len(argv) < 2:
        print "You need to specify a v.0.1.1 AOL file."
        return 1

    database = {}
    print "Opening {}".format(argv[1])
    v011_aol = open(argv[1], "rb")
    while 1:
        try:
            clen, cmd = read_command(v011_aol)
            klen, key = read_command(v011_aol)
            if cmd == 'JAR':
                ctlen, ctype = read_command(v011_aol)
                originallen, original_size = read_command(v011_aol)
                _ = v011_aol.read(1)
                char = b""
                length = b""
                while char is not None and char != b':':
                    char = v011_aol.read(1)
                    if not char:
                        raise FuckOffException()
                    if char != b':':
                        length = length + char
                compressed = v011_aol.read(int(length))
                pre_compressed = struct.pack("I", int(original_size)) + compressed
                decompressed = lz4.decompress(pre_compressed)
                database[key] = decompressed.encode("base64")
            elif cmd == 'SPOIL':
                # :5:SPOIL:30:10391merveill.espage%3D15_root:20:2014-04-28T12:44:12Z
                datesize, date = read_command(v011_aol)
                expired = datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ")
                if expired <= datetime.now():
                    try:
                        del database[key]
                    except KeyError:
                        pass
            elif cmd == 'SCOOP':
                try:
                    del database[key]
                except KeyError:
                    pass
            v011_aol.read(1)  # Newline.
        except FuckOffException:
            break
    v011_aol.close()

    output = argv[1] + ".json"
    opened_output = open(output, "w+")
    json.dump(database, opened_output, separators=(',', ':'))
    opened_output.close()
    return 0
def compute(self, split):
    with self.open_file() as f:
        f.seek(-8, 2)
        footer_fields_size, footer_indices_size = struct.unpack('II', f.read(8))
        footer_offset = self.size - 8 - footer_fields_size - footer_indices_size
        footer_fields_offset = self.size - 8 - footer_fields_size
        if split.begin >= footer_offset:
            return

        start = split.begin
        end = min(split.end, footer_offset)
        stripe_id = start / STRIPE_SIZE

        f.seek(footer_fields_offset)
        _fields = marshal.loads(decompress(f.read(footer_fields_size)))
        if self.fields is None:
            field_ids = range(len(_fields))
            field_names = _fields
        else:
            field_names = []
            field_ids = [None] * len(_fields)
            for i, v in enumerate(self.fields):
                if v in _fields:
                    index = _fields.index(v)
                    field_ids[index] = i
                    field_names.append(v)
                else:
                    raise RuntimeError('Unknown field: %s' % v)

        f.seek(start)
        header_size, = struct.unpack('I', f.read(4))
        header = marshal.loads(decompress(f.read(header_size)))
        content = [None] * len(field_names)
        for id, size in enumerate(header):
            index = field_ids[id]
            if index is not None:
                content[index] = marshal.loads(decompress(f.read(size)))
            else:
                f.seek(size, 1)
        for r in zip(*content):
            yield NamedTuple(field_names, r)
def get_bin(self, pattern):
    pattern_triples = self.leveldb.Get(pattern)
    pattern_triples = lz4.decompress(pattern_triples)
    pattern_triples = pattern_triples.split(MERGING_INDEX_TRIPLE_LINE_DELIMITER)
    triple_id_pairs = [line.split(MERGING_INDEX_TRIPLE_ID_DELIMITER)
                       for line in pattern_triples]
    return {tr_id: tr_line for tr_id, tr_line in triple_id_pairs}
def load_book_raw_data(b):
    cache_dir = "/Volumes/NewVolume/Emotional-Arcs/database/cache"
    if use_compression:
        with open(join(cache_dir, str(b.pk) + ".p.lz4"), "rb") as f:
            return pickle.loads(lz4.decompress(f.read()))
    else:
        with open(join(cache_dir, str(b.pk) + ".p"), "rb") as f:
            return pickle.load(f)
def read(self, arctic_lib, version, symbol, **kwargs):
    if 'blob' in version:
        if version['blob'] == _MAGIC_CHUNKED:
            collection = arctic_lib.get_top_level_collection()
            data = ''.join([x['data'] for x in collection.find({'symbol': symbol, 'parent': version['_id']},
                                                               sort=[('segment', pymongo.ASCENDING)])])
        else:
            data = version['blob']  # Backwards compatibility
        return cPickle.loads(lz4.decompress(data))
    return version['data']
def load_book_raw_data(b):
    cache_dir = "/Users/andyreagan/projects/2014/09-books/data/cache"
    if use_compression:
        with open(join(cache_dir, str(b.pk) + ".p.lz4"), "rb") as f:
            return pickle.loads(lz4.decompress(f.read()))
    else:
        with open(join(cache_dir, str(b.pk) + ".p"), "rb") as f:
            return pickle.load(f)
def fget(self, inst):
    if hasattr(inst, self.name + '_array'):
        return getattr(inst, self.name + '_array')
    nprow = getattr(inst, 'NumpyArrayTable__' + self.name)
    #~ print 'fget', self.name, nprow, inst.id
    if nprow is None or nprow.shape is None or nprow.dtype is None:
        return None

    if nprow.shape == '':
        shape = ()
    else:
        shape = tuple(int(v) for v in nprow.shape.split(','))
    dt = np.dtype(nprow.dtype)

    if nprow.compress == 'blosc':
        buf = blosc.decompress(nprow.blob)
    elif nprow.compress == 'zlib':
        buf = zlib.decompress(nprow.blob)
    elif nprow.compress == 'lz4':
        buf = lz4.decompress(nprow.blob)
    elif nprow.compress == 'snappy':
        buf = snappy.decompress(nprow.blob)
    elif nprow.compress is None:
        buf = nprow.blob

    if np.prod(shape) == 0:
        if len(buf) != 0:
            arr = np.frombuffer(buf, dtype=dt)
        else:
            arr = np.empty(shape, dtype=dt)
    else:
        arr = np.frombuffer(buf, dtype=dt)
        arr.flags.writeable = True
        arr = arr.reshape(shape)

    if self.arraytype == pq.Quantity:
        arr = pq.Quantity(arr, units=nprow.units, copy=False)

    # next access will be direct
    setattr(inst, self.name + '_array', arr)
    #~ delattr(inst, 'NumpyArrayTable__' + self.name)
    return arr
def test_datafeed_unicode_url(self, glet_mock, SR_mock):
    config = {}
    chassis = mock.Mock()

    chassis.request_sub_channel.return_value = None
    ochannel = mock.Mock()
    chassis.request_pub_channel.return_value = ochannel
    chassis.request_rpc_channel.return_value = None
    rpcmock = mock.Mock()
    rpcmock.get.return_value = {'error': None, 'result': 'OK'}
    chassis.send_rpc.return_value = rpcmock

    b = minemeld.ft.taxii.DataFeed(FTNAME, chassis, config)

    inputs = ['a']
    output = False

    b.connect(inputs, output)
    b.mgmtbus_initialize()
    b.start()
    # __init__ + get chkp + delete chkp
    self.assertEqual(len(SR_mock.mock_calls), 6)
    SR_mock.reset_mock()
    SR_mock.return_value.zcard.return_value = 1

    # unicast
    b.filtered_update('a', indicator=u'☃.net/påth', value={
        'type': 'URL',
        'confidence': 100,
        'share_level': 'green',
        'sources': ['test.1']
    })
    for call in SR_mock.mock_calls:
        name, args, kwargs = call
        if name == '().pipeline().__enter__().hset':
            break
    else:
        self.fail(msg='hset not found')
    self.assertEqual(args[2].startswith('lz4'), True)
    stixdict = json.loads(lz4.decompress(args[2][3:]))

    indicator = stixdict['indicators'][0]
    cyboxprops = indicator['observable']['object']['properties']
    self.assertEqual(cyboxprops['type'], 'URL')
    self.assertEqual(cyboxprops['value'], u'\u2603.net/p\xe5th')
    SR_mock.reset_mock()

    b.stop()
def read_jsonlz4(filename):
    """ Read mozilla jsonlz4 file
        Returns json
    """
    with open(filename, mode='rb') as f:
        # Check for the mozilla lz4 header
        if f.read(8) != b'mozLz40\0':
            return
        raw_data = f.read()
    uncompressed = lz4.decompress(raw_data)
    return json.loads(uncompressed.decode('utf8'))
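# Example use against a Firefox bookmark backup (the path here is
# hypothetical); files lacking the mozLz40 magic silently yield None.
bookmarks = read_jsonlz4('bookmarks.jsonlz4')
if bookmarks is not None:
    print(sorted(bookmarks.keys()))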
def read(self, mongoose_lib, version, symbol, **kwargs):
    blob = version.get("blob")
    if blob is not None:
        if blob == _MAGIC_CHUNKED:
            collection = mongoose_lib.get_top_level_collection()
            data = b''.join(x['data'] for x in collection.find({'symbol': symbol, 'parent': version['_id']},
                                                               sort=[('segment', pymongo.ASCENDING)]))
        else:
            data = blob  # Backwards compatibility
        data = lz4.decompress(data)
        return pickle_compat_load(io.BytesIO(data))
    return version['data']
def _recv_value(self, buf, flags):
    if flags & Client._FLAG_COMPRESSED:
        buf = lz4.decompress(buf)

    if flags == 0 or flags == Client._FLAG_COMPRESSED:
        # Either a bare string or a compressed string now decompressed...
        val = buf
    elif flags & Client._FLAG_INTEGER:
        val = int(buf)
    elif flags & Client._FLAG_LONG:
        val = long(buf)
    elif flags & Client._FLAG_PICKLE:
        val = pickle.loads(buf)
    return val
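# The flag constants are not shown in the snippet above; a plausible layout,
# mirroring python-memcached's convention (these exact values are an
# assumption, not the client's published constants):
class Client(object):
    _FLAG_PICKLE = 1 << 0
    _FLAG_INTEGER = 1 << 1
    _FLAG_LONG = 1 << 2
    _FLAG_COMPRESSED = 1 << 3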
def decompress(data):
    (compression,) = struct.unpack(">L", data[4:8])
    scheme = compression >> 27
    size = compression & 0x07ffffff
    if scheme == 0:
        pass
    elif scheme == 1 and lz4:
        res = lz4.decompress(struct.pack("<L", size) + data[8:])
        if len(res) != size:
            warnings.warn("Table decompression failed.")
        else:
            data = res
    else:
        warnings.warn("Table is compressed with an unsupported compression scheme")
    return (data, scheme)
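# Sketch of a scheme-1 (lz4) payload the function above accepts: the top 5
# bits of the big-endian word at offset 4 carry the scheme, the low 27 bits
# the decompressed size. The legacy python-lz4 prefixes its output with a
# little-endian size word, stripped here because decompress() re-adds it.
# The four leading version bytes are an assumption about the table layout.
import struct
import lz4
table = b'x' * 100
body = lz4.compress(table)[4:]
header = struct.pack(">L", (1 << 27) | len(table))
plain, scheme = decompress(b'\x00\x00\x00\x01' + header + body)
assert plain == table and scheme == 1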
def test_performance_sequential(n, length):
    _str = random_string(length)
    _strarr = [_str for _ in range(n)]

    now = dt.now()
    [c.decompress(y) for y in [c.compressHC(x) for x in _strarr]]
    clz4_time = (dt.now() - now).total_seconds()

    now = dt.now()
    c.decompressarr(c.compressarrHC(_strarr))
    clz4_time_p = (dt.now() - now).total_seconds()

    now = dt.now()
    [lz4.decompress(y) for y in [lz4.compressHC(x) for x in _strarr]]
    lz4_time = (dt.now() - now).total_seconds()

    print()
    print("LZ4 Test %sx len:%s" % (n, length))
    print("    Cython LZ4 %s s" % clz4_time)
    print("    Cython LZ4 Parallel %s s" % clz4_time_p)
    print("    LZ4 %s s" % lz4_time)
def __init__(self, data_dir, obj_to_terms, obj_to_str, str_to_obj):
    self.data_dir = data_dir
    self.obj_to_terms = obj_to_terms
    self.obj_to_str = obj_to_str
    self.str_to_obj = str_to_obj
    self.id_term_map = None
    self.term_id_map = None
    self.objnum = 0
    try:
        import lz4 as compressor
        self.compress = compressor.compress
        self.compressHC = compressor.compressHC
        self.decompress = compressor.decompress
    except ImportError:
        import zlib as compressor
        self.compress = lambda data: compressor.compress(data, 3)
        self.compressHC = lambda data: compressor.compress(data, 9)
        self.decompress = lambda data: compressor.decompress(data)
def msgpack_lz4_to_series(data):
    try:
        import msgpack
        import lz4
    except ImportError:
        logging.info('To load lz4-msgpacked data, '
                     'install packages "python-msgpack" and "lz4"')
        raise
    content = msgpack.loads(lz4.decompress(data))

    def series_load(d):
        # Small index values are treated as plain positions; larger ones as
        # epoch timestamps that need a DatetimeIndex.
        index = d['index'] if d['index'][-1] <= 1e9 else pd.DatetimeIndex(d['index'])
        return pd.Series(data=d['values'], index=index, name=d['id'])

    seria = list(map(series_load, content))
    return seria
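# A hypothetical producer for the format consumed above, using the legacy
# msgpack.dumps/lz4.compress APIs; the field names follow the loader's schema.
import msgpack
import lz4
payload = [{'id': 's1', 'index': [1, 2, 3], 'values': [10.0, 20.0, 30.0]}]
blob = lz4.compress(msgpack.dumps(payload))
series_list = msgpack_lz4_to_series(blob)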
def decompress(data, c=None):
    if c is None:
        return (False, None, data)
    try:
        if c == "zlib":
            import zlib
            return (True, "zlib", zlib.decompress(data))
        elif c == "lz4":
            import lz4
            return (True, "lz4", lz4.decompress(data))
        elif c == "snappy":
            import pysnappy
            return (True, "snappy", pysnappy.uncompress(data))
        else:
            import zlib
            return (True, "zlib", zlib.decompress(data))
    except:
        return (False, None, data)
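# The contract above is (handled, codec_used, payload). With the legacy lz4
# module, a round trip looks like this (the blob contents are illustrative):
import lz4
ok, codec, plain = decompress(lz4.compress('payload'), c="lz4")
assert ok and codec == "lz4" and plain == 'payload'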
def getPoints():
    print(request.json)
    zoom = int(request.json.get("zoom", 1))
    seen = request.json.get("seen", [])
    ensure(seen).is_a_list_of(str)
    points = POI.objects(
        at__geo_within_box=(request.json["SW"], request.json["NE"]),
        min_zoom=zoom,
        name__nin=request.json["seen"]
    )
    return jsonify({
        "points": [
            {
                "name": p.name,
                "lat": p.at["coordinates"][1],
                "lng": p.at["coordinates"][0],
                "abstract": lz4.decompress(p.abstract).decode() if p.abstract else "",
                "img": p.img,
            }
            for p in points
        ]
    })
def __init__(self, data_dir):
    # term = str()
    # triple = str()
    # args(triple) = (int)
    self.data_dir = data_dir
    # table: id(term) -> term
    self.term_id_map = None
    # table: id(triple) -> args(triple)
    self.triple_id_map = None
    # table: id(term) -> args(triple)
    self.arg_cache = None
    self.rel_id_map = REL_NAME_ID_MAP
    self.id_rel_map = REL_ID_NAME_MAP
    try:
        import lz4 as compressor
        self.compress = compressor.compress
        self.compressHC = compressor.compressHC
        self.decompress = compressor.decompress
    except ImportError:
        import zlib as compressor
        self.compress = lambda data: compressor.compress(data, 3)
        self.compressHC = lambda data: compressor.compress(data, 9)
        self.decompress = lambda data: compressor.decompress(data)
def _indicators_feed(feed, excbegtime, incendtime):
    if excbegtime is None:
        excbegtime = 0
    else:
        excbegtime = dt_to_millisec(excbegtime) + 1
    incendtime = dt_to_millisec(incendtime)

    cstart = 0
    while True:
        indicators = SR.zrangebyscore(
            feed, excbegtime, incendtime,
            start=cstart, num=100
        )
        if indicators is None:
            break

        for i in indicators:
            value = SR.hget(feed + '.value', i)
            if value.startswith('lz4'):
                try:
                    value = lz4.decompress(value[3:])
                    value = stix.core.STIXPackage.from_json(value)
                    value = value.to_xml(
                        ns_dict={'https://go.paloaltonetworks.com/minemeld': 'minemeld'}
                    )
                except ValueError:
                    continue
            yield value

        if len(indicators) < 100:
            break
        cstart += 100
def buildLookup(self, subvolumes):
    lookup = {}
    files = {}
    for subvolume in subvolumes:
        print "* " + subvolume
        hex = hashlib.sha1(subvolume).hexdigest()
        file = self.folder + "/" + hex + ".lz4"
        with open(file, "r") as fd:
            compressed = fd.read()
        decompressed = lz4.decompress(compressed)
        lines = decompressed.split('\n')
        for line in lines:
            entry = Entry(line)
            files[entry.sha1] = entry
            if entry.sha1 in lookup:
                lookup[entry.sha1] += 1
            else:
                lookup[entry.sha1] = 1
    return lookup, files
def logic(self, list):
    print list
    sum = 0
    lookup = {}
    for item in list:
        hex = hashlib.sha1(item).hexdigest()
        file = self.folder + "/" + hex + ".lz4"
        print "processing: " + file
        with open(file, "r") as fd:
            compressed = fd.read()
        decompressed = lz4.decompress(compressed)
        lines = decompressed.split('\n')
        for line in lines:
            file = Entry(line)
            if file.sha1 in lookup:
                continue
            lookup[file.sha1] = True
            sum += file.size
    print "Accumulated size is: ", self.printSize(sum)