def open_db(self):
        self.terms_ldb = leveldb.LevelDB(self.terms_fl)
        self.docs_ldb = leveldb.LevelDB(self.docs_fl)

        self.doc_buffer_size = 0
        self.term_buffer_size = 0

        #self.doc_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype="S%d" % self.max_doc_size)
        self.doc_flush_buffer = [None] * self.max_doc_flush_buffer_size
        self.term_flush_buffer = np.empty(self.max_term_flush_buffer_size, dtype="S%d" % self.max_term_size)
        self.doc_id_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype=np.int64)
        self.term_id_flush_buffer = np.empty(self.max_term_flush_buffer_size, dtype=np.int64)

        if self.compression == COMPRESSION.NONE:
            self.compress = lambda string: string
            self.decompress = lambda string: string
        elif self.compression == COMPRESSION.ZLIB:
            import zlib
            self.compress = lambda string: zlib.compress(string, self.compression_level)
            self.decompress = lambda string: zlib.decompress(string)
        elif self.compression == COMPRESSION.LZMA:
            import backports.lzma as lzma
            # FORMAT_RAW requires an explicit filter chain on both sides;
            # without one, compress/decompress raise ValueError.
            lzma_filters = [{'id': lzma.FILTER_LZMA2, 'preset': self.compression_level}]
            self.compress = lambda string: lzma.compress(bytearray(string), format=lzma.FORMAT_RAW, filters=lzma_filters)
            self.decompress = lambda data: lzma.decompress(data, format=lzma.FORMAT_RAW, filters=lzma_filters)
        elif self.compression == COMPRESSION.LZ4R:
            import lz4
            self.compress = lambda string: lz4.compress(string)
            self.decompress = lambda string: lz4.decompress(string)
        elif self.compression == COMPRESSION.LZ4H:
            import lz4
            self.compress = lambda string: lz4.compressHC(string)
            self.decompress = lambda string: lz4.decompress(string)
        else:
            raise Exception("Wrong compression type %r" % self.compression)
Example No. 2
    def _read_bucket(self, doc, column_set, column_dtypes, include_symbol, include_images, columns):
        rtn = {}
        if doc[VERSION] != 3:
            raise ArcticException("Unhandled document version: %s" % doc[VERSION])
        rtn[INDEX] = np.cumsum(np.fromstring(lz4.decompress(doc[INDEX]), dtype='uint64'))
        doc_length = len(rtn[INDEX])
        rtn_length = len(rtn[INDEX])
        if include_symbol:
            rtn['SYMBOL'] = [doc[SYMBOL], ] * rtn_length
        column_set.update(doc[COLUMNS].keys())
        for c in column_set:
            try:
                coldata = doc[COLUMNS][c]
                dtype = np.dtype(coldata[DTYPE])
                values = np.fromstring(lz4.decompress(coldata[DATA]), dtype=dtype)
                self._set_or_promote_dtype(column_dtypes, c, dtype)
                rtn[c] = self._empty(rtn_length, dtype=column_dtypes[c])
                rowmask = np.unpackbits(np.fromstring(lz4.decompress(coldata[ROWMASK]),
                                                      dtype='uint8'))[:doc_length].astype('bool')
                rtn[c][rowmask] = values
            except KeyError:
                rtn[c] = None

        if include_images and doc.get(IMAGE_DOC, {}).get(IMAGE, {}):
            rtn = self._prepend_image(rtn, doc[IMAGE_DOC], rtn_length, column_dtypes, column_set, columns)
        return rtn
Example No. 3
    def _read_bucket(self, doc, column_set, column_dtypes, include_symbol,
                     include_images, columns):
        rtn = {}
        if doc[VERSION] != 3:
            raise ArcticException("Unhandled document version: %s" %
                                  doc[VERSION])
        rtn[INDEX] = np.cumsum(
            np.fromstring(lz4.decompress(doc[INDEX]), dtype='uint64'))
        doc_length = len(rtn[INDEX])
        rtn_length = len(rtn[INDEX])
        if include_symbol:
            rtn['SYMBOL'] = [
                doc[SYMBOL],
            ] * rtn_length
        column_set.update(doc[COLUMNS].keys())
        for c in column_set:
            try:
                coldata = doc[COLUMNS][c]
                dtype = np.dtype(coldata[DTYPE])
                values = np.fromstring(lz4.decompress(coldata[DATA]),
                                       dtype=dtype)
                self._set_or_promote_dtype(column_dtypes, c, dtype)
                rtn[c] = self._empty(rtn_length, dtype=column_dtypes[c])
                rowmask = np.unpackbits(
                    np.fromstring(lz4.decompress(coldata[ROWMASK]),
                                  dtype='uint8'))[:doc_length].astype('bool')
                rtn[c][rowmask] = values
            except KeyError:
                rtn[c] = None

        if include_images and doc.get(IMAGE_DOC, {}).get(IMAGE, {}):
            rtn = self._prepend_image(rtn, doc[IMAGE_DOC], rtn_length,
                                      column_dtypes, column_set, columns)
        return rtn
Example No. 4
def get_coldata(coldata):
    """ return values and rowmask """
    dtype = np.dtype(coldata[DTYPE])
    values = np.fromstring(lz4.decompress(coldata[DATA]), dtype=dtype)
    rowmask = np.unpackbits(
        np.fromstring(lz4.decompress(coldata[ROWMASK]), dtype='uint8'))
    return list(values), list(rowmask)
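
For reference, a small sketch of how the packed row mask round-trips, assuming only numpy and the same legacy lz4 bindings used above (newer bindings would go through lz4.block, as noted earlier):

import lz4
import numpy as np

# A boolean mask over 3 rows is padded to a whole byte by packbits, compressed,
# and recovered by unpackbits plus truncation back to the row count.
mask = np.array([True, True, True], dtype=bool)
blob = lz4.compress(np.packbits(mask).tobytes())
unpacked = np.unpackbits(np.frombuffer(lz4.decompress(blob), dtype='uint8'))
assert (unpacked[:3].astype(bool) == mask).all()
assert list(unpacked) == [1, 1, 1, 0, 0, 0, 0, 0]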
Example No. 5
def roundtrip(size=None):
    if size is None:
        size = struct.unpack(">I", b"\0" + os.urandom(3))[0]
    data = os.urandom(size)
    assert rustlz4.decompress(pylz4.compress(data)) == data
    assert pylz4.decompress(buffer(rustlz4.compress(data))) == data
    assert rustlz4.decompress(pylz4.compressHC(data)) == data
    assert pylz4.decompress(buffer(rustlz4.compresshc(data))) == data
Example No. 6
def decompress_array(str_list):
    """
    Decompress a list of strings
    """
    if ENABLE_PARALLEL:
        return clz4.decompressarr(str_list)
    return [lz4.decompress(chunk) for chunk in str_list]
Example No. 7
def loadCOCOAndOverSeg(im_set="test", detector="sf", N_SPIX=1000, fold=0):
    from pickle import dumps, loads
    try:
        import lz4, pickle
        decompress = lambda s: pickle.loads(lz4.decompress(s))
        compress = lambda o: lz4.compressHC(pickle.dumps(o))
    except:
        compress = lambda x: x
        decompress = lambda x: x
    from gop import contour, dataset, segmentation
    FILE_NAME = '/tmp/coco_%s_%s_%d_%d.dat' % (im_set, detector, N_SPIX, fold)
    try:
        with open(FILE_NAME, 'rb') as f:
            over_segs, segmentations = loads(f.read())
            f.close()
            over_seg = segmentation.ImageOverSegmentationVec()
            for i in over_segs:
                over_seg.append(decompress(i))
            return over_seg, [decompress(i) for i in segmentations], []
            #return over_segs,segmentations,[]
    except FileNotFoundError:
        pass

    # Load the dataset
    data = dataset.loadCOCO2014(im_set == "train", im_set == "valid", fold)

    # COCO has some pretty gray scale images (WTF!!!)
    images = [
        e['image'] if e['image'].C == 3 else e['image'].tileC(3) for e in data
    ]
    try:
        segmentations = [e['segmentation'] for e in data]
    except:
        segmentations = []

    # Do the over-segmentation
    if detector == 'sf':
        detector = contour.StructuredForest()
        detector.load('../data/sf.dat')
    elif detector == "mssf":
        detector = contour.MultiScaleStructuredForest()
        detector.load("../data/sf.dat")
    elif detector == 'st':
        detector = contour.SketchTokens()
        detector.load('../data/st_full_c.dat')
    else:
        detector = contour.DirectedSobel()

    if detector != None:
        over_segs = segmentation.generateGeodesicKMeans(
            detector, images, N_SPIX)
    with open(FILE_NAME, 'wb') as f:
        #f.write( dumps( (over_segs,segmentations) ) )
        f.write(
            dumps(
                ([compress(i)
                  for i in over_segs], [compress(i) for i in segmentations])))
        f.close()

    return over_segs, segmentations, []
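
A quick round-trip check of the pickle + LZ4 helpers defined in the try block above (assuming the legacy lz4 bindings import cleanly; otherwise the identity fallbacks are used and nothing is compressed):

import lz4
import pickle

compress = lambda o: lz4.compressHC(pickle.dumps(o))
decompress = lambda s: pickle.loads(lz4.decompress(s))

obj = {'im_set': 'test', 'detector': 'mssf', 'N_SPIX': 1000}
assert decompress(compress(obj)) == obj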
Example No. 8
        def _filter(v):
            path, ids = v
            result_set = set(ids)
            with open(path, 'rb') as f:
                f.seek(-8, 2)
                footer_fields_size, footer_indices_size = struct.unpack('II', f.read(8))
                f.seek(-8 -footer_fields_size -footer_indices_size, 2)
                indices = cPickle.loads(zlib.decompress(f.read(footer_indices_size)))
                _fields = marshal.loads(decompress(f.read(footer_fields_size)))
                for k, v in filters.iteritems():
                    result = set()
                    if k not in _fields:
                        raise RuntimeError('%s is not in fields!' % k)

                    if k not in indices:
                        raise RuntimeError('%s is not indexed' % k)

                    index = indices[k]
                    if isinstance(v, types.FunctionType):
                        r = index.filter(v)
                        if r is not None:
                            result = set(r)
                        else:
                            result = result_set
                    else:
                        if not isinstance(v, list):
                            v = [v]

                        for vv in v:
                            for _id in result_set:
                                if index.get(vv, _id):
                                    result.add(_id)
            return path, result
Example No. 9
def decompress(file):
	if file.endswith('.lz4'):
		with open(file, "rb") as data, open(file[:-4], "wb") as decompressed_data:
			decompressed_data.write(lz4.decompress(data.read()))
	else:
		print "The file does not have the expected .lz4 extension."
Example No. 10
        def _filter(v):
            path, ids = v
            result_set = set(ids)
            with open(path, 'rb') as f:
                f.seek(-8, 2)
                footer_fields_size, footer_indices_size = struct.unpack(
                    'II', f.read(8))
                f.seek(-8 - footer_fields_size - footer_indices_size, 2)
                indices = cPickle.loads(
                    zlib.decompress(f.read(footer_indices_size)))
                _fields = marshal.loads(decompress(f.read(footer_fields_size)))
                for k, v in filters.iteritems():
                    result = set()
                    if k not in _fields:
                        raise RuntimeError('%s is not in fields!' % k)

                    if k not in indices:
                        raise RuntimeError('%s is not indexed' % k)

                    index = indices[k]
                    if isinstance(v, types.FunctionType):
                        r = index.filter(v)
                        if r is not None:
                            result = set(r)
                        else:
                            result = result_set
                    else:
                        if not isinstance(v, list):
                            v = [v]

                        for vv in v:
                            for _id in result_set:
                                if index.get(vv, _id):
                                    result.add(_id)
            return path, result
Example No. 11
    def _decompress(
        self, data
    ):  # a decompression function, lrzip-like in spirit: lzma < bz2 < zlib < lzo < lz4
        try:
            data = lzma.decompress(data)
        except:
            pass
        data = bz2.decompress(data)
        data = zlib.decompress(data)
        try:
            data = data.decode('zlib')
        except:
            pass
        try:
            data = lzo.decompress(data)
        except:
            pass
        try:
            data = lz4.decompress(data)
        except:
            pass

        if self.shuffle == True:
            try:
                print "unshuffling..."
                data = buff_unshuffle(data)
                print "data unshuffled..."
            except:
                pass

        return data
Example No. 12
def deserialize_uint64_blocks(compressed_blocks, shape):
    """
    Reconstitute a volume that was serialized with serialize_uint64_blocks(), above.
    
    NOTE: If the volume is not 64-px aligned, then the output will NOT be C-contiguous.
    """
    if (np.array(shape) % 64).any():
        padding = 64 - (np.array(shape) % 64)
        aligned_shape = shape + padding
    else:
        aligned_shape = shape

    aligned_volume = np.empty(aligned_shape, dtype=np.uint64)
    block_view = view_as_blocks(aligned_volume, (64, 64, 64))

    for bi, (zi, yi, xi) in enumerate(np.ndindex(*block_view.shape[:3])):
        compressed_block = compressed_blocks[bi]

        # (See note above regarding recompression with LZ4)
        encoded_block = lz4.decompress(compressed_block)
        block = decode_label_block(encoded_block)
        block_view[zi, yi, xi] = block

    if shape == tuple(aligned_shape):
        volume = aligned_volume
    else:
        # Trim
        volume = np.asarray(aligned_volume[box_to_slicing((0, 0, 0), shape)],
                            order='C')

    return volume
Example No. 14
def test_tickstore_pandas_to_bucket_image():
    symbol = 'SYM'
    tz = 'UTC'
    initial_image = {'index': dt(2014, 1, 1, 0, 0, tzinfo=mktz(tz)), 'A': 123, 'B': 54.4, 'C': 'DESC'}
    data = [{'A': 120, 'D': 1}, {'A': 122, 'B': 2.0}, {'A': 3, 'B': 3.0, 'D': 1}]
    tick_index = [dt(2014, 1, 2, 0, 0, tzinfo=mktz(tz)),
                  dt(2014, 1, 3, 0, 0, tzinfo=mktz(tz)),
                  dt(2014, 1, 4, 0, 0, tzinfo=mktz(tz))]
    data = pd.DataFrame(data, index=tick_index)
    bucket, final_image = TickStore._pandas_to_bucket(data, symbol, initial_image)
    assert final_image == {'index': dt(2014, 1, 4, 0, 0, tzinfo=mktz(tz)), 'A': 3, 'B': 3.0, 'C': 'DESC', 'D': 1}
    assert IMAGE_DOC in bucket
    assert bucket[COUNT] == 3
    assert bucket[START] == dt(2014, 1, 1, 0, 0, tzinfo=mktz(tz))
    assert bucket[END] == dt(2014, 1, 4, 0, 0, tzinfo=mktz(tz))
    assert set(bucket[COLUMNS]) == set(('A', 'B', 'D'))
    assert set(bucket[COLUMNS]['A']) == set((ROWMASK, DTYPE, DATA))
    assert get_coldata(bucket[COLUMNS]['A']) == ([120, 122, 3], [1, 1, 1, 0, 0, 0, 0, 0])
    values, rowmask = get_coldata(bucket[COLUMNS]['B'])
    assert np.isnan(values[0]) and values[1:] == [2.0, 3.0]
    assert rowmask == [1, 1, 1, 0, 0, 0, 0, 0]
    values, rowmask = get_coldata(bucket[COLUMNS]['D'])
    assert np.isnan(values[1])
    assert values[0] == 1 and values[2] == 1
    assert rowmask == [1, 1, 1, 0, 0, 0, 0, 0]
    index = [dt.fromtimestamp(int(i/1000)).replace(tzinfo=mktz(tz)) for i in
             list(np.cumsum(np.fromstring(lz4.decompress(bucket[INDEX]), dtype='uint64')))]
    assert index == tick_index
    assert bucket[COLUMNS]['A'][DTYPE] == 'int64'
    assert bucket[COLUMNS]['B'][DTYPE] == 'float64'
    assert bucket[SYMBOL] == symbol
    assert bucket[IMAGE_DOC] == {IMAGE: initial_image,
                                 IMAGE_TIME: initial_image['index']}
Example No. 15
def parsefileblob(path, decompress):
    raw = None
    f = open(path, "r")
    try:
        raw = f.read()
    finally:
        f.close()

    if decompress:
        raw = lz4.decompress(raw)

    index = raw.index('\0')
    size = int(raw[:index])
    data = raw[(index + 1):(index + 1 + size)]
    start = index + 1 + size

    firstnode = None

    mapping = {}
    while start < len(raw):
        divider = raw.index('\0', start + 80)

        currentnode = raw[start:(start + 20)]
        if not firstnode:
            firstnode = currentnode

        p1 = raw[(start + 20):(start + 40)]
        p2 = raw[(start + 40):(start + 60)]
        linknode = raw[(start + 60):(start + 80)]
        copyfrom = raw[(start + 80):divider]

        mapping[currentnode] = (p1, p2, linknode, copyfrom)
        start = divider + 1

    return size, firstnode, mapping
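
Purely for illustration, a hypothetical builder for the blob layout this parser expects: the decimal size, a NUL, the data, then records made of four fixed 20-byte fields followed by a variable-length copyfrom and a terminating NUL.

def buildfileblob(data, nodes):
    # nodes: iterable of (node, p1, p2, linknode, copyfrom); the first four
    # fields must be exactly 20 bytes each to match the offsets read above
    raw = "%d\0%s" % (len(data), data)
    for node, p1, p2, linknode, copyfrom in nodes:
        raw += node + p1 + p2 + linknode + copyfrom + "\0"
    return raw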
Example No. 16
def test_tickstore_to_bucket_with_image():
    symbol = 'SYM'
    tz = 'UTC'
    initial_image = {'index': dt(2014, 1, 1, 0, 0, tzinfo=mktz(tz)), 'A': 123, 'B': 54.4, 'C': 'DESC'}
    data = [{'index': dt(2014, 1, 1, 0, 1, tzinfo=mktz(tz)), 'A': 124, 'D': 0},
            {'index': dt(2014, 1, 1, 0, 2, tzinfo=mktz(tz)), 'A': 125, 'B': 27.2}]
    bucket, final_image = TickStore._to_bucket(data, symbol, initial_image)
    assert bucket[COUNT] == 2
    assert bucket[END] == dt(2014, 1, 1, 0, 2, tzinfo=mktz(tz))
    assert set(bucket[COLUMNS]) == set(('A', 'B', 'D'))
    assert set(bucket[COLUMNS]['A']) == set((ROWMASK, DTYPE, DATA))
    assert get_coldata(bucket[COLUMNS]['A']) == ([124, 125], [1, 1, 0, 0, 0, 0, 0, 0])
    assert get_coldata(bucket[COLUMNS]['B']) == ([27.2], [0, 1, 0, 0, 0, 0, 0, 0])
    assert get_coldata(bucket[COLUMNS]['D']) == ([0], [1, 0, 0, 0, 0, 0, 0, 0])
    index = [dt.fromtimestamp(int(i/1000)).replace(tzinfo=mktz(tz)) for i in
             list(np.cumsum(np.fromstring(lz4.decompress(bucket[INDEX]), dtype='uint64')))]
    assert index == [i['index'] for i in data]
    assert bucket[COLUMNS]['A'][DTYPE] == 'int64'
    assert bucket[COLUMNS]['B'][DTYPE] == 'float64'
    assert bucket[SYMBOL] == symbol
    assert bucket[START] == initial_image['index']
    assert bucket[IMAGE_DOC][IMAGE] == initial_image
    assert bucket[IMAGE_DOC] == {IMAGE: initial_image,
                                 IMAGE_TIME: initial_image['index']}
    assert final_image == {'index': data[-1]['index'], 'A': 125, 'B': 27.2, 'C': 'DESC', 'D': 0}
Example No. 17
 def dump_cache(self):
     batch = leveldb.WriteBatch()
     for pattern, triple_id_pairs in self.cache.iteritems():
         try:
             pattern_triples = self.leveldb.Get(pattern)
             pattern_triples = lz4.decompress(pattern_triples)
             pattern_triples = pattern_triples.split(
                 MERGING_INDEX_TRIPLE_LINE_DELIMITER)
         except KeyError:
             pattern_triples = []
         logging.info("Merging bin from %d to %d (%d new)." % (
             len(pattern_triples),
             len(pattern_triples) + len(triple_id_pairs),
             len(triple_id_pairs),
         ))
         for triple_id_pair in triple_id_pairs:
             pattern_triples.append(
                 MERGING_INDEX_TRIPLE_ID_DELIMITER.join(triple_id_pair))
         pattern_triples_dump = MERGING_INDEX_TRIPLE_LINE_DELIMITER.join(
             pattern_triples)
         batch.Put(pattern, lz4.compressHC(pattern_triples_dump))
     self.leveldb.Write(batch)
     logging.info("Dump %d bins." % len(self.cache))
     self.cache = {}
     self.cache_size = 0
     gc.collect()
Example No. 19
    def read(self, subvolume):
        hex = hashlib.sha1(subvolume).hexdigest()
        file = ".btrfs/" + hex + ".lz4"

        with open(file, "r") as fd:
            compressed = fd.read()
            decompressed = lz4.decompress(compressed)
            return decompressed.split('\n')
Example No. 20
 def process_document(self, session, doc):
     data = doc.get_raw(session)
     new = lz4.decompress(data)
     return StringDocument(new,
                           self.id,
                           doc.processHistory,
                           parent=doc.parent,
                           filename=doc.filename)
Example No. 21
def chunk_image(chunks):
	out = ''
	for chunk in chunks:
		out += chunk
	out = decompress(out)
	sr = dill.loads(out)
	print type(sr)
	return sr
Example No. 22
def restore(store, bucketname, restorepath, parts, compress=True):
    bucketname = str(bucketname)
    for part in parts:
        part_content = store.get_object(bucketname, part)
        if compress:
            uncompressed_part_content = lz4.decompress(part_content)
        else:
            uncompressed_part_content = part_content
        j.sal.fs.writeFile(restorepath, uncompressed_part_content, append=True)
Example No. 23
def loadVOCAndOverSeg( im_set="test", detector="sf", N_SPIX=1000, EVAL_DIFFICULT=False, year="2012" ):
	from pickle import dumps,loads
	try:
		import lz4, pickle
		decompress = lambda s: pickle.loads( lz4.decompress( s ) )
		compress = lambda o: lz4.compressHC( pickle.dumps( o ) )
	except:
		compress = lambda x: x
		decompress = lambda x: x
	from gop import contour,dataset,segmentation
	FILE_NAME = '/tmp/%s_%s_%d_%d_%s.dat'%(im_set,detector,N_SPIX,EVAL_DIFFICULT,year)
	try:
		with open(FILE_NAME,'rb') as f:
			over_segs,segmentations,boxes = loads( f.read() )
			f.close()
			over_seg = segmentation.ImageOverSegmentationVec()
			for i in over_segs:
				over_seg.append( decompress(i) )
			return over_seg,[decompress(i) for i in segmentations],[decompress(i) for i in boxes]
	except IOError:
		pass
	
	# Load the dataset
	#data = eval("dataset.loadVOC2012_small")(im_set=="train",im_set=="valid",im_set=="test")
	data = eval("dataset.loadVOC%s"%year)(im_set=="train",im_set=="valid",im_set=="test")

	
	images = [e['image'] for e in data]
	try:
		segmentations = [e['segmentation'] for e in data]
	except:
		segmentations = []
	boxes = [[a['bbox'] for a in e['annotation'] if not a['difficult'] or EVAL_DIFFICULT] for e in data]

	# Do the over-segmentation
	if detector=='sf':
		detector = contour.StructuredForest()
		detector.load( '../data/sf.dat' )
	elif detector == "mssf":
		detector = contour.MultiScaleStructuredForest()
		detector.load( "../data/sf.dat" )
	elif detector=='st':
		detector = contour.SketchTokens()
		detector.load( '../data/st_full_c.dat' )
	else:
		detector = contour.DirectedSobel()
	
	if detector != None:
		over_segs = segmentation.generateGeodesicKMeans( detector, images, N_SPIX )
	#try:
	with open(FILE_NAME,'wb') as f:
		f.write( dumps( ([compress(i) for i in over_segs],[compress(i) for i in segmentations],[compress(i) for i in boxes]) ) )
		f.close()
	#except FileNotFoundError:
		#pass
	
	return over_segs,segmentations,boxes
Example No. 25
    def receivemissing(self, pipe, missingid):
        line = pipe.readline()[:-1]
        if not line:
            raise error.ResponseError(_("error downloading file " +
                "contents: connection closed early\n"), '')
        size = int(line)
        data = pipe.read(size)

        self.localcache.write(missingid, lz4.decompress(data))
Example No. 26
def main(argv):
    if len(argv) < 2:
        print "You need to specify a v.0.1.1 AOL file."
        return 1
    database = {}
    print "Opening {}".format(argv[1])
    v011_aol = open(argv[1], "rb")
    while(1):
        try:
            clen, cmd = read_command(v011_aol)
            klen, key = read_command(v011_aol)
            if cmd == 'JAR':
                ctlen, ctype = read_command(v011_aol)
                originallen, original_size = read_command(v011_aol)
                _ = v011_aol.read(1)
                char = b""
                length = b""
                while char != None and char != b':':
                    char = v011_aol.read(1)
                    if not char:
                        raise FuckOffException()
                    if char != b':':
                        length = length + char
                compressed = v011_aol.read(int(length))
                pre_compressed = struct.pack("I", int(original_size)) + compressed
                decompressed = lz4.decompress(pre_compressed)
                database[key] = decompressed.encode("base64")
            elif cmd == 'SPOIL':
                # :5:SPOIL:30:10391merveill.espage%3D15_root:20:2014-04-28T12:44:12Z
                datesize, date = read_command(v011_aol)
                expired = datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ")

                if expired <= datetime.now():
                    try:
                        del database[key]
                    except KeyError:
                        pass

            elif cmd == 'SCOOP':
                try:
                    del database[key]
                except KeyError:
                    pass

            v011_aol.read(1) # Newline.
        except FuckOffException:
            break

    v011_aol.close()

    output = argv[1] + ".json"
    opened_output = open(output, "w+")
    json.dump(database, opened_output, separators=(',', ':'))
    opened_output.close()

    return 0
Example No. 27
    def compute(self, split):
        with self.open_file() as f:
            f.seek(-8, 2)
            footer_fields_size, footer_indices_size = struct.unpack(
                'II', f.read(8))
            footer_offset = self.size - 8 - footer_fields_size - footer_indices_size
            footer_fields_offset = self.size - 8 - footer_fields_size
            if split.begin >= footer_offset:
                return

            start = split.begin
            end = min(split.end, footer_offset)
            stripe_id = start / STRIPE_SIZE
            f.seek(footer_fields_offset)
            _fields = marshal.loads(decompress(f.read(footer_fields_size)))

            if self.fields is None:
                field_ids = range(len(_fields))
                field_names = _fields
            else:
                field_names = []
                field_ids = [None] * len(_fields)
                for i, v in enumerate(self.fields):
                    if v in _fields:
                        index = _fields.index(v)
                        field_ids[index] = i
                        field_names.append(v)
                    else:
                        raise RuntimeError('Unknown field: %s' % v)

            f.seek(start)
            header_size, = struct.unpack('I', f.read(4))
            header = marshal.loads(decompress(f.read(header_size)))
            content = [None] * len(field_names)
            for id, size in enumerate(header):
                index = field_ids[id]
                if index is not None:
                    content[index] = marshal.loads(decompress(f.read(size)))
                else:
                    f.seek(size, 1)

            for r in zip(*content):
                yield NamedTuple(field_names, r)
Example No. 28
    def receivemissing(self, pipe, missingid):
        line = pipe.readline()[:-1]
        if not line:
            raise error.ResponseError(
                _("error downloading file " +
                  "contents: connection closed early\n"), '')
        size = int(line)
        data = pipe.read(size)

        self.localcache.write(missingid, lz4.decompress(data))
Example No. 29
 def get_bin(self, pattern):
     pattern_triples = self.leveldb.Get(pattern)
     pattern_triples = lz4.decompress(pattern_triples)
     pattern_triples = pattern_triples.split(
         MERGING_INDEX_TRIPLE_LINE_DELIMITER)
     triple_id_pairs = [
         line.split(MERGING_INDEX_TRIPLE_ID_DELIMITER)
         for line in pattern_triples
     ]
     return {tr_id: tr_line for tr_id, tr_line in triple_id_pairs}
Example No. 30
    def compute(self, split):
        with self.open_file() as f:
            f.seek(-8,2)
            footer_fields_size, footer_indices_size = struct.unpack('II', f.read(8))
            footer_offset = self.size - 8 - footer_fields_size - footer_indices_size
            footer_fields_offset = self.size - 8 - footer_fields_size
            if split.begin >= footer_offset:
                return

            start = split.begin
            end = min(split.end, footer_offset)
            stripe_id = start / STRIPE_SIZE
            f.seek(footer_fields_offset)
            _fields = marshal.loads(decompress(f.read(footer_fields_size)))
            
            if self.fields is None:
                field_ids = range(len(_fields))
                field_names = _fields
            else:
                field_names = []
                field_ids = [None] * len(_fields)
                for i, v in enumerate(self.fields):
                    if v in _fields:
                        index = _fields.index(v)
                        field_ids[index] = i
                        field_names.append(v)
                    else:
                        raise RuntimeError('Unknown field: %s' % v)

            f.seek(start)
            header_size, = struct.unpack('I', f.read(4))
            header = marshal.loads(decompress(f.read(header_size)))
            content = [None] * len(field_names)
            for id, size in enumerate(header):
                index = field_ids[id]
                if index is not None:
                    content[index] = marshal.loads(decompress(f.read(size)))
                else:
                    f.seek(size, 1)

            for r in zip(*content):
                yield NamedTuple(field_names, r)
Example No. 31
def loadCOCOAndOverSeg( im_set="test", detector="sf", N_SPIX=1000, fold=0 ):
	from pickle import dumps,loads
	try:
		import lz4, pickle
		decompress = lambda s: pickle.loads( lz4.decompress( s ) )
		compress = lambda o: lz4.compressHC( pickle.dumps( o ) )
	except:
		compress = lambda x: x
		decompress = lambda x: x
	from gop import contour,dataset,segmentation
	FILE_NAME = '/tmp/coco_%s_%s_%d_%d.dat'%(im_set,detector,N_SPIX,fold)
	try:
		with open(FILE_NAME,'rb') as f:
			over_segs,segmentations = loads( f.read() )
			f.close()
			over_seg = segmentation.ImageOverSegmentationVec()
			for i in over_segs:
				over_seg.append( decompress(i) )
			return over_seg,[decompress(i) for i in segmentations],[]
			#return over_segs,segmentations,[]
	except FileNotFoundError:
		pass
	
	# Load the dataset
	data = dataset.loadCOCO2014( im_set=="train",im_set=="valid", fold)
	
	# COCO has some pretty gray scale images (WTF!!!)
	images = [e['image'] if e['image'].C==3 else e['image'].tileC(3)  for e in data]
	try:
		segmentations = [e['segmentation'] for e in data]
	except:
		segmentations = []
	
	# Do the over-segmentation
	if detector=='sf':
		detector = contour.StructuredForest()
		detector.load( '../data/sf.dat' )
	elif detector == "mssf":
		detector = contour.MultiScaleStructuredForest()
		detector.load( "../data/sf.dat" )
	elif detector=='st':
		detector = contour.SketchTokens()
		detector.load( '../data/st_full_c.dat' )
	else:
		detector = contour.DirectedSobel()
	
	if detector != None:
		over_segs = segmentation.generateGeodesicKMeans( detector, images, N_SPIX )
	with open(FILE_NAME,'wb') as f:
		#f.write( dumps( (over_segs,segmentations) ) )
		f.write( dumps( ([compress(i) for i in over_segs],[compress(i) for i in segmentations]) ) )
		f.close()
	
	return over_segs,segmentations,[]
Example No. 32
def load_book_raw_data(b):
    if use_compression:
        f = open(
            join("/Volumes/NewVolume/Emotional-Arcs/database/cache",
                 str(b.pk) + ".p.lz4"), "rb")
        return pickle.loads(lz4.decompress(f.read()))
    # f didn't get closed...
    else:
        return pickle.load(
            open(
                join("/Volumes/NewVolume/Emotional-Arcs/database/cache",
                     str(b.pk) + ".p"), "rb"))
Example No. 33
 def read(self, arctic_lib, version, symbol, **kwargs):
     if 'blob' in version:
         if version['blob'] == _MAGIC_CHUNKED:
             collection = arctic_lib.get_top_level_collection()
             data = ''.join([x['data'] for x in collection.find({'symbol': symbol,
                                                                 'parent': version['_id']},
                                                                 sort=[('segment', pymongo.ASCENDING)])])
         else:
             data = version['blob']
         # Backwards compatibility
         return cPickle.loads(lz4.decompress(data))
     return version['data']
Example No. 34
def load_book_raw_data(b):
    if use_compression:
        f = open(
            join("/Users/andyreagan/projects/2014/09-books/data/cache",
                 str(b.pk) + ".p.lz4"), "rb")
        return pickle.loads(lz4.decompress(f.read()))
    # f didn't get closed...
    else:
        return pickle.load(
            open(
                join("/Users/andyreagan/projects/2014/09-books/data/cache",
                     str(b.pk) + ".p"), "rb"))
Example No. 35
    def open_db(self):
        self.terms_ldb = leveldb.LevelDB(self.terms_fl)
        self.docs_ldb = leveldb.LevelDB(self.docs_fl)

        self.doc_buffer_size = 0
        self.term_buffer_size = 0

        #self.doc_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype="S%d" % self.max_doc_size)
        self.doc_flush_buffer = [None] * self.max_doc_flush_buffer_size
        self.term_flush_buffer = np.empty(self.max_term_flush_buffer_size,
                                          dtype="S%d" % self.max_term_size)
        self.doc_id_flush_buffer = np.empty(self.max_doc_flush_buffer_size,
                                            dtype=np.int64)
        self.term_id_flush_buffer = np.empty(self.max_term_flush_buffer_size,
                                             dtype=np.int64)

        if self.compression == COMPRESSION.NONE:
            self.compress = lambda string: string
            self.decompress = lambda string: string
        elif self.compression == COMPRESSION.ZLIB:
            import zlib
            self.compress = lambda string: zlib.compress(
                string, self.compression_level)
            self.decompress = lambda string: zlib.decompress(string)
        elif self.compression == COMPRESSION.LZMA:
            import backports.lzma as lzma
            # FORMAT_RAW requires an explicit filter chain on both sides;
            # without one, compress/decompress raise ValueError.
            lzma_filters = [{'id': lzma.FILTER_LZMA2,
                             'preset': self.compression_level}]
            self.compress = lambda string: lzma.compress(
                bytearray(string), format=lzma.FORMAT_RAW, filters=lzma_filters)
            self.decompress = lambda data: lzma.decompress(
                data, format=lzma.FORMAT_RAW, filters=lzma_filters)
        elif self.compression == COMPRESSION.LZ4R:
            import lz4
            self.compress = lambda string: lz4.compress(string)
            self.decompress = lambda string: lz4.decompress(string)
        elif self.compression == COMPRESSION.LZ4H:
            import lz4
            self.compress = lambda string: lz4.compressHC(string)
            self.decompress = lambda string: lz4.decompress(string)
        else:
            raise Exception("Wrong compression type %r" % self.compression)
Example No. 36
 def fget(self , inst):
     
     if hasattr(inst, self.name+'_array') :
         return getattr(inst, self.name+'_array')
     
     nprow = getattr(inst, 'NumpyArrayTable__'+self.name)
     
     
     #~ print 'fget',self.name,  nprow, inst.id
     
     
     if nprow is None or nprow.shape is None or nprow.dtype is None:
         return None
     
     if nprow.shape =='':
         shape = ()
     else:
         shape = tuple([ int(v) for v in  nprow.shape.split(',') ])
     
     dt = np.dtype(nprow.dtype)
     
     if nprow.compress == 'blosc':
         buf = blosc.decompress(nprow.blob)
     elif nprow.compress == 'zlib':
         buf = zlib.decompress(nprow.blob)
     elif nprow.compress == 'lz4':
         buf = lz4.decompress(nprow.blob)
     elif nprow.compress == 'snappy':
         buf = snappy.decompress(nprow.blob)        
     elif nprow.compress is None:
         buf = nprow.blob
         
         
     if np.prod(shape)==0:
         if len(buf) != 0:
             arr = np.frombuffer( buf , dtype = dt)
         else:
             arr= np.empty( shape, dtype = dt )
     else:
         arr = np.frombuffer( buf , dtype = dt)
         arr.flags.writeable = True
         arr = arr.reshape(shape)
     
     if self.arraytype == pq.Quantity:
         arr = pq.Quantity(arr, units = nprow.units, copy =False)
     
     # next access will be direct
     setattr(inst, self.name+'_array', arr)
     
     #~ delattr(inst, 'NumpyArrayTable__'+self.name)
     
     return arr
Example No. 37
    def test_datafeed_unicode_url(self, glet_mock, SR_mock):
        config = {}
        chassis = mock.Mock()

        chassis.request_sub_channel.return_value = None
        ochannel = mock.Mock()
        chassis.request_pub_channel.return_value = ochannel
        chassis.request_rpc_channel.return_value = None
        rpcmock = mock.Mock()
        rpcmock.get.return_value = {'error': None, 'result': 'OK'}
        chassis.send_rpc.return_value = rpcmock

        b = minemeld.ft.taxii.DataFeed(FTNAME, chassis, config)

        inputs = ['a']
        output = False

        b.connect(inputs, output)
        b.mgmtbus_initialize()

        b.start()
        # __init__ + get chkp + delete chkp
        self.assertEqual(len(SR_mock.mock_calls), 6)
        SR_mock.reset_mock()
        SR_mock.return_value.zcard.return_value = 1

        # unicast
        b.filtered_update('a',
                          indicator=u'☃.net/påth',
                          value={
                              'type': 'URL',
                              'confidence': 100,
                              'share_level': 'green',
                              'sources': ['test.1']
                          })
        for call in SR_mock.mock_calls:
            name, args, kwargs = call
            if name == '().pipeline().__enter__().hset':
                break
        else:
            self.fail(msg='hset not found')

        self.assertEqual(args[2].startswith('lz4'), True)
        stixdict = json.loads(lz4.decompress(args[2][3:]))

        indicator = stixdict['indicators'][0]
        cyboxprops = indicator['observable']['object']['properties']
        self.assertEqual(cyboxprops['type'], 'URL')
        self.assertEqual(cyboxprops['value'], u'\u2603.net/p\xe5th')
        SR_mock.reset_mock()

        b.stop()
Example No. 38
def read_jsonlz4(filename):
    """
    Read mozilla jsonlz4 file

    Returns json
    """
    with open(filename, mode='rb') as f:
        # Check for the mozilla lz4 header
        if f.read(8) != b'mozLz40\0':
            return
        raw_data = f.read()
        uncompressed = lz4.decompress(raw_data)
        return json.loads(uncompressed.decode('utf8'))
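
The mozLz4 layout is the 8-byte b'mozLz40\0' magic followed by a 4-byte little-endian decompressed size and a raw LZ4 block, which is exactly the framing the legacy decompressor understands. A sketch of the same reader with the newer lz4.block API, under that assumption:

import json
import lz4.block

def read_jsonlz4_block(filename):
    # Same logic as above, but with python-lz4 >= 0.10 where the block
    # functions live in lz4.block.
    with open(filename, mode='rb') as f:
        if f.read(8) != b'mozLz40\0':
            return
        return json.loads(lz4.block.decompress(f.read()).decode('utf8'))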
Example No. 39
def read_jsonlz4(filename):
    """
    Read mozilla jsonlz4 file

    Returns json
    """
    with open(filename, mode="rb") as f:
        # Check for the mozilla lz4 header
        if f.read(8) != b"mozLz40\0":
            return
        raw_data = f.read()
        uncompressed = lz4.decompress(raw_data)
        return json.loads(uncompressed.decode("utf8"))
Example No. 40
 def read(self, mongoose_lib, version, symbol, **kwargs):
     blob = version.get("blob")
     if blob is not None:
         if blob == _MAGIC_CHUNKED:
             collection = mongoose_lib.get_top_level_collection()
             data = b''.join(x['data'] for x in collection.find({'symbol': symbol,
                                                                'parent': version['_id']},
                                                                sort=[('segment', pymongo.ASCENDING)]))
         else:
             data = blob
         # Backwards compatibility
         data = lz4.decompress(data)
         return pickle_compat_load(io.BytesIO(data))
     return version['data']
Example No. 41
    def _recv_value(self, buf, flags):
        if flags & Client._FLAG_COMPRESSED:
            buf = lz4.decompress(buf)

        if  flags == 0 or flags == Client._FLAG_COMPRESSED:
            # Either a bare string or a compressed string now decompressed...
            val = buf
        elif flags & Client._FLAG_INTEGER:
            val = int(buf)
        elif flags & Client._FLAG_LONG:
            val = long(buf)
        elif flags & Client._FLAG_PICKLE:
            val = pickle.loads(buf)
        return val
Example No. 42
def decompress(data):
    (compression,) = struct.unpack(">L", data[4:8])
    scheme = compression >> 27
    size = compression & 0x07ffffff
    if scheme == 0:
        pass
    elif scheme == 1 and lz4:
        res = lz4.decompress(struct.pack("<L", size) + data[8:])
        if len(res) != size:
            warnings.warn("Table decompression failed.")
        else:
            data = res
    else:
        warnings.warn("Table is compressed with an unsupported compression scheme")
    return (data, scheme)
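
The struct.pack("<L", size) prefix above compensates for the legacy python-lz4 block decompressor, which reads the uncompressed length from a 4-byte little-endian header in front of the raw LZ4 stream; the matching compressor prepends that header itself. A minimal sketch of that assumption:

import struct
import lz4

payload = b'x' * 1000
raw = lz4.compress(payload)                         # header + LZ4 block
assert raw[:4] == struct.pack("<L", len(payload))   # little-endian size prefix
assert lz4.decompress(raw) == payload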
Example No. 43
 def read(self, mongoose_lib, version, symbol, **kwargs):
     blob = version.get("blob")
     if blob is not None:
         if blob == _MAGIC_CHUNKED:
             collection = mongoose_lib.get_top_level_collection()
             data = b''.join(x['data'] for x in collection.find(
                 {
                     'symbol': symbol,
                     'parent': version['_id']
                 },
                 sort=[('segment', pymongo.ASCENDING)]))
         else:
             data = blob
         # Backwards compatibility
         data = lz4.decompress(data)
         return pickle_compat_load(io.BytesIO(data))
     return version['data']
Example No. 44
def test_performance_sequential(n, length):
    _str = random_string(length)
    _strarr = [_str for _ in range(n)]
    now = dt.now()
    [c.decompress(y) for y in [c.compressHC(x) for x in _strarr]]
    clz4_time = (dt.now() - now).total_seconds()
    now = dt.now()
    c.decompressarr(c.compressarrHC(_strarr))
    clz4_time_p = (dt.now() - now).total_seconds()
    now = dt.now()
    [lz4.decompress(y) for y in [lz4.compressHC(x) for x in _strarr]]
    lz4_time = (dt.now() - now).total_seconds()
    print()
    print("LZ4 Test %sx len:%s" % (n, length))
    print("    Cython LZ4 %s s" % clz4_time)
    print("    Cython LZ4 Parallel %s s" % clz4_time_p)
    print("    LZ4 %s s" % lz4_time)
Example No. 45
 def __init__(self, data_dir, obj_to_terms, obj_to_str, str_to_obj):
     self.data_dir = data_dir
     self.obj_to_terms = obj_to_terms
     self.obj_to_str = obj_to_str
     self.str_to_obj = str_to_obj
     self.id_term_map = None
     self.term_id_map = None
     self.objnum = 0
     try:
         import lz4 as compressor
         self.compress = compressor.compress
         self.compressHC = compressor.compressHC
         self.decompress = compressor.decompress
     except ImportError:
         import zlib as compressor
         self.compress = lambda data: compressor.compress(data, 3)
         self.compressHC = lambda data: compressor.compress(data, 9)
         self.decompress = lambda data: compressor.decompress(data)
Example No. 46
def msgpack_lz4_to_series(data):
    try:
        import msgpack
        import lz4
    except ImportError:
        logging.info('To load lz4-msgpacked data, '
                     'install packages "python-msgpack" and "lz4"')
        raise
    content = msgpack.loads(lz4.decompress(data))
    series_load = lambda d: pd.Series(
        data=d['values'],
        index=d['index'] if d['index'][-1] <= 1e9 \
                         else pd.DatetimeIndex(d['index']),
        name=d['id']
    )
    seria = list(map(series_load, content))

    return seria
Example No. 48
def decompress(data, c=None):
    if c == None:
        return (False, None, data)
    try:
        if c == "zlib":
            import zlib
            return (True, "zlib", zlib.decompress(data))
        elif c == "lz4":
            import lz4
            return (True, "zlib", lz4.decompress(data))
        elif c == "snappy":
            import pysnappy
            return (True, "snappy", pysnappy.uncompress(data))
        else:
            import zlib
            return (True, "zlib", zlib.decompress(data))
    except:
        return (False, None, data)
Example No. 49
 def dump_cache(self):
     batch = leveldb.WriteBatch()
     for pattern, triple_id_pairs in self.cache.iteritems():
         try:
             pattern_triples = self.leveldb.Get(pattern)
             pattern_triples = lz4.decompress(pattern_triples)
             pattern_triples = pattern_triples.split(MERGING_INDEX_TRIPLE_LINE_DELIMITER)
         except KeyError:
             pattern_triples = []
         logging.info("Merging bin from %d to %d (%d new)." % (
             len(pattern_triples),
             len(pattern_triples) + len(triple_id_pairs),
             len(triple_id_pairs),
         ))
         for triple_id_pair in triple_id_pairs:
             pattern_triples.append(MERGING_INDEX_TRIPLE_ID_DELIMITER.join(triple_id_pair))
         pattern_triples_dump = MERGING_INDEX_TRIPLE_LINE_DELIMITER.join(pattern_triples)
         batch.Put(pattern, lz4.compressHC(pattern_triples_dump))
     self.leveldb.Write(batch)
     logging.info("Dump %d bins." % len(self.cache))
     self.cache = {}
     self.cache_size = 0
     gc.collect()
Example No. 50
    def getPoints():
        print(request.json)
        zoom = int(request.json.get("zoom", 1))
        seen = request.json.get("seen", [])
        ensure(seen).is_a_list_of(str)
        points = POI.objects(
            at__geo_within_box=(request.json["SW"], request.json["NE"]), min_zoom=zoom, name__nin=request.json["seen"]
        )

        return jsonify(
            {
                "points": [
                    {
                        "name": p.name,
                        "lat": p.at["coordinates"][1],
                        "lng": p.at["coordinates"][0],
                        "abstract": lz4.decompress(p.abstract).decode() if p.abstract else "",
                        "img": p.img,
                    }
                    for p in points
                ]
            }
        )
Example No. 51
 def __init__(self, data_dir):
     # term = str()
     # triple = str()
     # args(triple) = (int)
     self.data_dir = data_dir
     # table: id(term) -> term
     self.term_id_map = None
     # table: id(triple) -> args(triple)
     self.triple_id_map = None
     # table: id(term) -> args(triple)
     self.arg_cache = None
     self.rel_id_map = REL_NAME_ID_MAP
     self.id_rel_map = REL_ID_NAME_MAP
     try:
         import lz4 as compressor
         self.compress = compressor.compress
         self.compressHC = compressor.compressHC
         self.decompress = compressor.decompress
     except ImportError:
         import zlib as compressor
         self.compress = lambda data: compressor.compress(data, 3)
         self.compressHC = lambda data: compressor.compress(data, 9)
         self.decompress = lambda data: compressor.decompress(data)
Example No. 52
def _indicators_feed(feed, excbegtime, incendtime):
    if excbegtime is None:
        excbegtime = 0
    else:
        excbegtime = dt_to_millisec(excbegtime) + 1
    incendtime = dt_to_millisec(incendtime)

    cstart = 0
    while True:
        indicators = SR.zrangebyscore(
            feed, excbegtime, incendtime,
            start=cstart, num=100
        )
        if indicators is None:
            break

        for i in indicators:
            value = SR.hget(feed + '.value', i)

            if value.startswith('lz4'):
                try:
                    value = lz4.decompress(value[3:])
                    value = stix.core.STIXPackage.from_json(value)
                    value = value.to_xml(
                        ns_dict={'https://go.paloaltonetworks.com/minemeld': 'minemeld'}
                    )

                except ValueError:
                    continue

            yield value

        if len(indicators) < 100:
            break

        cstart += 100
Example No. 53
    def buildLookup(self, subvolumes):
        lookup = {}
        files = {}

        for subvolume in subvolumes:
            print "* " + subvolume
            hex = hashlib.sha1(subvolume).hexdigest()
            file = self.folder + "/" + hex + ".lz4"

            with open(file, "r") as fd:
                compressed = fd.read()
                decompressed = lz4.decompress(compressed)
                lines = decompressed.split('\n')

                for line in lines:
                    entry = Entry(line)
                    files[entry.sha1] = entry

                    if entry.sha1 in lookup:
                        lookup[entry.sha1] += 1
                    else:
                        lookup[entry.sha1] = 1

        return lookup, files
Example No. 54
    def logic(self, list):
        print list
        sum = 0
        lookup = {}
        for item in list:
            hex = hashlib.sha1(item).hexdigest()
            file = self.folder + "/" + hex + ".lz4"

            print "processing: " + file
            with open(file, "r") as fd:
                compressed = fd.read()
                decompressed = lz4.decompress(compressed)
                lines = decompressed.split('\n')

                for line in lines:
                    file = Entry(line)

                    if file.sha1 in lookup:
                        continue

                    lookup[file.sha1] = True
                    sum += file.size

        print "Accumulated size is: ", self.printSize(sum)