def fetch_city_name_id(city_id='', city_name='', db_name='imd_city_db'):
    '''
    City Names, IDs and corresponding links are fetched from local levelDB.

    If you pass city_id and city_name both, city_id would be chosen over
    city_name, for lookup.  Passing only city_name, would help you to find
    possible matches.  If you pass no arguments, then all available records
    will be returned back.

    Returns a dict mapping city id -> [name, link, ...] on success, or
    {'status': <message>} on error / missing record.
    '''
    resp = {}
    db_handle = None
    try:
        db_handle = DB(db_name, create_if_missing=True)
        if city_id:
            if not __validate_city_id__(city_id):
                raise Exception('city id not validated')
            # Values are stored as ';'-joined strings keyed by the city id.
            tmp = db_handle.get(city_id.encode('utf-8'), b'')
            if tmp:
                resp[city_id] = tmp.decode('utf-8').split(';')
            else:
                resp = {'status': 'record not found'}
        elif city_name:
            # Delegate fuzzy matching over all records to the helper.
            resp.update(__match_city_name__(city_name, db_handle.iterator()))
        else:
            # No filter given: return every record in the database.
            with db_handle.iterator() as itr:
                for key, value in itr:
                    resp[key.decode('utf-8')] = value.decode('utf-8').split(';')
    except Exception as e:
        # plError is an Exception subclass; the original duplicated this
        # handler for it with an identical body.
        resp = {'status': str(e)}
    finally:
        # fix: the original leaked the DB handle whenever an exception fired
        if db_handle is not None:
            db_handle.close()
    return resp
def test_destroy_db():
    # Create a database, release it, then make sure destroy_db() really
    # removes the on-disk directory.
    with tmp_db('destroy', create=False, delete=False) as name:
        database = DB(name, create_if_missing=True)
        database.put(b'foo', b'bar')
        database.close()
        del database
        plyvel.destroy_db(name)
        assert not os.path.lexists(name)
def test_repair_db():
    # A repaired database must still contain the data written before repair.
    with tmp_db('repair', create=False) as name:
        db = DB(name, create_if_missing=True)
        db.put(b'foo', b'bar')
        db.close()
        del db
        plyvel.repair_db(name)
        db = DB(name)
        assert_equal(b'bar', db.get(b'foo'))
        # fix: close the reopened handle so the LevelDB lock is released
        # before tmp_db removes the directory.
        db.close()
def tmp_db(name_prefix, create=True, delete=True):
    """Yield a fresh temporary database (or just its directory name).

    Generator used as a context manager by the tests (presumably decorated
    with @contextlib.contextmanager at the definition site — confirm).

    :param name_prefix: prefix for the temporary directory name
    :param create: when True, yield an opened DB; otherwise yield the path
    :param delete: remove the directory again on exit
    """
    name = tempfile.mkdtemp(prefix=name_prefix + '-', dir=TEST_DB_DIR)
    try:
        if create:
            db = DB(name, create_if_missing=True, error_if_exists=True)
            try:
                yield db
            finally:
                # fix: close even when the consuming test raises; the
                # original skipped close()/rmtree on failure and leaked
                # handles and temp directories.
                db.close()
        else:
            yield name
    finally:
        if delete:
            shutil.rmtree(name)
def store_city_name_id(data, db_name='imd_city_db'):
    '''
    Stores City Names, IDs and corresponding links into a local levelDB.
    City ID is used as key value.

    Returns {'status': 'success'} on success, {'status': <message>} on error.
    '''
    db_handle = None
    try:
        db_handle = DB(db_name, create_if_missing=True)
        # __format_db_entry__ turns the input into a {key_bytes: value_bytes}
        # mapping suitable for LevelDB.
        entries = __format_db_entry__(data)
        for key, value in entries.items():
            db_handle.put(key, value)
        resp = {'status': 'success'}
    except Exception as e:
        # plError is an Exception subclass; the original duplicated this
        # handler for it with an identical body.
        resp = {'status': str(e)}
    finally:
        # fix: the original leaked the DB handle whenever an exception fired
        if db_handle is not None:
            db_handle.close()
    return resp
def test_open():
    # Opening a DB in a read-only dir should not work.
    with tmp_db('read_only_dir', create=False) as name:
        os.chmod(name, stat.S_IRUSR | stat.S_IXUSR)
        assert_raises(plyvel.IOError, DB, name)

    # Non-ASCII database names are accepted.
    with tmp_db('úñîçøđê_name') as db:
        pass

    # A missing database without create_if_missing must fail.
    with tmp_db('no_create', create=False) as name:
        assert_raises(plyvel.Error, DB, name, create_if_missing=False)

    # error_if_exists must refuse an existing database.
    with tmp_db('exists', create=False) as name:
        db = DB(name, create_if_missing=True)
        db.close()
        assert_raises(plyvel.Error, DB, name, error_if_exists=True)

    # Invalid argument types and values are rejected up front.
    assert_raises(TypeError, DB, 123)
    assert_raises(TypeError, DB, 'invalid_option_types',
                  write_buffer_size='invalid')
    assert_raises(TypeError, DB, 'invalid_option_types',
                  lru_cache_size='invalid')
    assert_raises(ValueError, DB, 'invalid_compression',
                  compression='invalid', create_if_missing=True)

    # compression=None disables compression.
    with tmp_db('no_compression', create=False) as name:
        DB(name, compression=None, create_if_missing=True)

    # All tuning options can be combined.
    with tmp_db('many_options', create=False) as name:
        DB(name, create_if_missing=True, error_if_exists=False,
           paranoid_checks=True, write_buffer_size=16 * 1024 * 1024,
           max_open_files=512, lru_cache_size=64 * 1024 * 1024,
           block_size=2 * 1024, block_restart_interval=32,
           compression='snappy', bloom_filter_bits=10)
def test_open_close():
    with tmp_db('open_close', create=False) as name:
        # Options chosen to force additional internal allocations
        # (LRU cache, bloom filter) so close() has real work to do.
        db = DB(name, create_if_missing=True,
                lru_cache_size=1024 * 1024, bloom_filter_bits=10)
        db.put(b'key', b'value')

        # Derive dependent objects before closing the database.
        batch = db.write_batch()
        snap = db.snapshot()
        iterator = db.iterator()
        snapshot_iterator = snap.iterator()

        db.close()
        assert db.closed

        # Every operation on a closed database must raise RuntimeError...
        assert_raises(RuntimeError, db.get, b'key')
        assert_raises(RuntimeError, db.put, b'key', b'value')
        assert_raises(RuntimeError, db.delete, b'key')
        # ...including via previously created write batches,
        assert_raises(RuntimeError, batch.put, b'key', b'value')
        # ...snapshots,
        assert_raises(RuntimeError, db.snapshot)
        assert_raises(RuntimeError, snap.get, b'key')
        # ...iterators,
        assert_raises(RuntimeError, next, iterator)
        # ...and snapshot iterators.
        assert_raises(RuntimeError, next, snapshot_iterator)
def test_approximate_sizes():
    with tmp_db('approximate_sizes', create=False) as name:
        # Write some data to a fresh database.
        db = DB(name, create_if_missing=True, error_if_exists=True)
        value = b'a' * 100
        with db.write_batch() as wb:
            # fix: the original used ``xrange`` and ``bytes(i) * 100``,
            # which only behave as intended on Python 2 (``bytes(i)`` on
            # Python 3 yields ``i`` NUL bytes).  ``str(i).encode('ascii')``
            # produces the same repeated-digit keys on both versions.
            for i in range(1000):
                key = str(i).encode('ascii') * 100
                wb.put(key, value)

        # Close and reopen the database.
        db.close()
        del wb, db
        db = DB(name, create_if_missing=False)

        # Argument validation.
        with assert_raises(TypeError):
            db.approximate_size(1, 2)
        with assert_raises(TypeError):
            db.approximate_sizes(None)
        with assert_raises(TypeError):
            db.approximate_sizes((1, 2))

        # Test single range.
        assert_greater_equal(db.approximate_size(b'1', b'2'), 0)

        # Test multiple ranges.
        assert_list_equal([], db.approximate_sizes())
        assert_greater_equal(db.approximate_sizes((b'1', b'2'))[0], 0)
        ranges = [
            (b'1', b'3'),
            (b'', b'\xff'),
        ]
        assert_equal(len(ranges), len(db.approximate_sizes(*ranges)))

        # fix: release the handle so tmp_db can remove the directory cleanly.
        db.close()
# Evaluation driver: populate a LevelDB-backed toy filesystem with inodes and
# data blocks, tracking how many of each are inserted.
total_blocks = 1024 * 10
block_size = 1024
ev_inodes, ev_blocks = (0, 0)

# LevelDB table block size is taken from the command line (argv[1]).
db = DB('/home/cujo/nfs/db/db2', create_if_missing=True,
        block_size=int(sys.argv[1]))
_, current_blocks = update_vfs(block_size, total_blocks)

for i in range(1024):
    bytes_written = 0
    inode = INode()
    # Fill every direct-block slot except the last one with a free block.
    for k in range(len(inode.f_blocks) - 1):
        block_number = blocks_sample.pop()
        inode.f_blocks[k] = block_number
        bytes_written += populate_block(block_number)
        ev_blocks += 1
        _, current_blocks = update_vfs(block_size, current_blocks)
    inode.f_size = bytes_written
    # fix: the original key was ``b'i_' + bytes(i)``; on Python 3 that is
    # ``i`` NUL bytes rather than the digits of ``i``.  Encoding the decimal
    # representation is identical to the Python 2 behaviour.
    db.put(b'i_' + str(i).encode('ascii'), dumps(inode.__dict__))
    ev_inodes += 1
    _, current_blocks = update_vfs(block_size, current_blocks)

# NOTE(review): ``bytes(blocks_sample)`` raises ValueError on Python 3 for any
# remaining free-block number > 255 (total_blocks is 10240) — presumably this
# list should be serialized like the inodes (e.g. with ``dumps``); confirm the
# format expected by whatever reads the b'fb' key.
db.put(b'fb', bytes(blocks_sample))
ev_blocks += 1
_, current_blocks = update_vfs(block_size, current_blocks)
db.close()

# Some Stats collection for evaluation
print('Total Blocks inserted = ', ev_blocks)
print('Total INodes inserted = ', ev_inodes)
class FeatureSelector( Frontend ):
    """Mutual-information-based feature selector.

    During training, co-occurrence counts of (dimension i, dimension j,
    label y, value_i, value_j) are accumulated in a Kyoto Cabinet hash DB,
    then copied into LevelDB (for sorted prefix scans) at finalization.
    From per-dimension entropies and pairwise correlations a small set of
    "core" dimensions is kept, plus strongly correlated "satellite"
    dimensions; all other dimensions are removed from the c/b/x feature
    vectors by apply_c/apply_b/apply_x.
    """

    def __init__( self, fn, mode ):
        # fn/mode (state file name, "r" or "w") are handled by Frontend.
        Frontend.__init__( self, fn, mode );
        # Temporary Kyoto Cabinet file + handle (write mode only).
        self._kdbfn = None;
        self._kdb = None;
        # Temporary LevelDB directory + handle (write mode only).
        self._ldbdn = None;
        self._ldb = None;
        # Number of dimensions in each feature group (c/b/x).
        self._len_c = None;
        self._len_b = None;
        self._len_x = None;
        # Per-dimension and per-pair information-content statistics.
        self._ic = None;
        self._icbp = None;
        self._needs_initialization = True;
        # Dimension sets computed lazily by _init().
        self._core_dims = set();
        self._satellite_dims = set();
        self._removed_dims = set();
        # Group-local indices of removed dimensions.
        self._remove_c = set();
        self._remove_b = set();
        self._remove_x = set();
        # When set, the corresponding apply_* becomes a no-op pass-through.
        self.bypass_c = False;
        self.bypass_b = False;
        self.bypass_x = False;

    def __enter__( self ):
        """Load saved state (read mode) or set up temp databases (write mode)."""
        if self._mode == "r":
            with open( self._fn, "rb" ) as f:
                state = pickle_load( f );
            self._len_c = state[ "c" ];
            self._len_b = state[ "b" ];
            self._len_x = state[ "x" ];
            self._lenrow = self._len_c + self._len_b + self._len_x;
            self._ic = state[ "ic" ];
            self._icbp = state[ "icbp" ];
        if self._mode == "w":
            # NamedTemporaryFile is used only to reserve a unique name; the
            # file is deleted again when the with-block exits and the name
            # (plus '.kch') is reused for the Kyoto Cabinet DB.
            with NamedTemporaryFile() as tmpfn:
                self._kdbfn = tmpfn.name + '.kch';
            self._kdb = KDB();
            try:
                assert self._kdb.open( self._kdbfn, KDB.OWRITER | KDB.OCREATE );
            except:
                print( str( self._kdb.error() ) );
                raise;
            # Same unique-name trick for the LevelDB directory; it is
            # recreated by LDB(create_if_missing=True) and removed explicitly
            # in __exit__.
            with TemporaryDirectory() as tmpdirname:
                self._ldbdn = tmpdirname;
            self._ldb = LDB( self._ldbdn, create_if_missing=True );
        return self;

    def __exit__( self, exc_type, exc_value, traceback ):
        """Tear down temp databases; state persistence is done by Frontend."""
        assert Frontend.__exit__( self, exc_type, exc_value, traceback ) == False;
        if self._ldb is not None:
            # NOTE(review): presumably gives LevelDB background work a moment
            # to settle before closing — confirm whether this is required.
            sleep( 3.0 );
            self._ldb.close()
        if self._ldbdn is not None:
            rmtree( self._ldbdn );
        if self._kdb is not None:
            try:
                assert self._kdb.close();
            except:
                print( str( self._kdb.error() ) );
                raise;
        if self._kdbfn is not None:
            remove( self._kdbfn );

    def train( self, row ):
        """Accumulate co-occurrence counts for one (y, c, b, x) row.

        Returns True when the base class signals that training should stop.
        """
        ( y, c, b, x ) = row;
        # Learn the group lengths from the first row; enforce them afterwards.
        if self._len_c is None:
            self._len_c = len(c);
        assert self._len_c == len(c);
        if self._len_b is None:
            self._len_b = len(b);
        assert self._len_b == len(b);
        if self._len_x is None:
            self._len_x = len(x);
        assert self._len_x == len(x);
        row = c + b + x;
        if Frontend.train( self, row ):
            return True;
        # Key layout: (dim i, dim j, label y, value_i, value_j), big-endian.
        keyfmt = '>IIIII';
        # NOTE(review): self._lenrow is only assigned in __enter__ for mode
        # "r"; presumably Frontend.train sets it in write mode — confirm.
        for i in range( 0, self._lenrow ):
            for j in range( 0, self._lenrow ):
                # Count only the upper triangle (i < j), except that the last
                # dimension is additionally paired with every j.
                if ( i >= j ) and ( not ( i == self._lenrow-1 ) ):
                    continue;
                key = pack( keyfmt, i, j, y, row[i], row[j] );
                try:
                    assert self._kdb.increment( key, 1, 0 );
                except:
                    print( str(self._kdb.error()) );
                    raise;

    def _stats( self, cnt_by_a, cnt_by_b, cnt_by_ab ):
        """Return (normalized MI, H(a), H(b), H(a,b), MI) from count tables.

        Probabilities are counts divided by the total row count; entropies
        are in bits.
        """
        h_a = 0.0;
        h_b = 0.0;
        h_ab = 0.0;
        for ( val_a, cnt ) in cnt_by_a.items():
            p = float(cnt) / float(self._rowcount);
            if p > 0.0:
                h_a -= p * log( p, 2.0 );
        for ( val_b, cnt ) in cnt_by_b.items():
            p = float(cnt) / float(self._rowcount);
            if p > 0.0:
                h_b -= p * log( p, 2.0 );
        for( (val_a,val_b), cnt ) in cnt_by_ab.items():
            p = float(cnt) / float(self._rowcount);
            if p > 0.0:
                h_ab -= p * log( p, 2.0 );
        # NOTE(review): these degenerate cases return a bare 1.0 while the
        # normal path returns a 5-tuple; callers unpack five values, so a
        # zero-entropy dimension would raise — confirm intended.
        if h_a == 0.0:
            return 1.0;
        if h_b == 0.0:
            return 1.0;
        mi = h_a + h_b - h_ab;
        return ( mi / min( h_a, h_b ), h_a, h_b, h_ab, mi );

    def _get_info_content_by_dimension( self, i ):
        """Compute _stats for dimension i against the label y.

        Scans the first (i, j) key group in sorted order; j is fixed to
        whatever second dimension appears first.
        """
        keyfmt = '>IIIII';
        valfmt = '>Q';
        j = None;
        cnt_by_a = {};
        cnt_by_b = {};
        cnt_by_ab = {};
        total = 0;
        with self._ldb.iterator() as it:
            it.seek( pack( keyfmt, i,0,0,0,0 ) );
            for ( key, val ) in it:
                key = unpack( keyfmt, key );
                val = unpack( valfmt, val )[ 0 ];
                if not ( key[0] == i ):
                    break;
                if j is None:
                    j = key[1];
                if not ( key[1] == j ):
                    break;
                # key[2] is the y-value
                a = key[2];
                # key[3] is the value for the i-th dimension
                b = key[3];
                cnt_by_ab[ (a,b) ] = cnt_by_ab.get( (a,b), 0 ) + val;
                cnt_by_a[ a ] = cnt_by_a.get( a, 0 ) + val;
                cnt_by_b[ b ] = cnt_by_b.get( b, 0 ) + val;
                total += val;
        try:
            assert total == self._rowcount;
        except:
            print( i, j, total, self._rowcount );
            raise;
        return self._stats( cnt_by_a, cnt_by_b, cnt_by_ab );

    def _get_info_content_by_pair( self, i, j ):
        """Compute _stats for the dimension pair (i, j), conditioned on y."""
        keyfmt = '>IIIII';
        valfmt = '>Q';
        cnt_by_a = {};
        cnt_by_b = {};
        cnt_by_ab = {};
        total = 0;
        with self._ldb.iterator() as it:
            it.seek( pack( keyfmt, i,j,0,0,0 ) );
            for ( key, val ) in it:
                key = unpack( keyfmt, key );
                val = unpack( valfmt, val )[ 0 ];
                if not ( ( key[0] == i ) and ( key[1] == j ) ):
                    break;
                # key[2] is the y-value, key[3] the i-th value for the i-th dim
                a = ( key[2], key[3] );
                # key[2] is the y-value, key[4] the i-th value for the j-th dim
                b = ( key[2], key[4] );
                assert (a,b) not in cnt_by_ab;
                cnt_by_ab[ (a,b) ] = cnt_by_ab.get( (a,b), 0 ) + val;
                cnt_by_a[ a ] = cnt_by_a.get( a, 0 ) + val;
                cnt_by_b[ b ] = cnt_by_b.get( b, 0 ) + val;
                total += val;
        assert total == self._rowcount;
        return self._stats( cnt_by_a, cnt_by_b, cnt_by_ab );

    def _finalize( self ):
        """Copy counts from Kyoto Cabinet into LevelDB and compute all stats."""
        assert Frontend._finalize( self ) is None;
        if False:
            print( "unique combinations = ", self._kdb.count() );
        keyfmt = '>IIIII';
        valfmt = '>Q';
        # Walk the whole Kyoto Cabinet DB via a cursor and mirror every
        # (key, count) record into LevelDB, tallying count histograms.
        c = self._kdb.cursor();
        c.jump();
        gt2 = 0;
        gt4 = 0;
        gt8 = 0;
        gt16 = 0;
        gt32 = 0;
        while True:
            r = c.get( True );
            if not r:
                break;
            self._ldb.put( r[0], r[1] );
            key = unpack( keyfmt, r[0] );
            val = unpack( valfmt, r[1] )[ 0 ];
            if val > 2: gt2 += 1;
            if val > 4: gt4 += 1;
            if val > 8: gt8 += 1;
            if val > 16: gt16 += 1;
            if val > 32: gt32 += 1;
        if False:
            print( gt2, gt4, gt8, gt16, gt32 );
        # Per-dimension statistics.
        self._ic = {};
        for i in range( 0, self._lenrow ):
            self._ic[ i ] = self._get_info_content_by_dimension( i );
        # Pairwise statistics over the upper triangle.
        self._icbp = {};
        for i in range( 0, self._lenrow ):
            for j in range( 0, self._lenrow ):
                if i >= j:
                    continue;
                self._icbp[ (i,j) ] = self._get_info_content_by_pair( i, j );
        # State persisted by the Frontend base on exit.
        self._state \
            = { "ic": self._ic,
                "icbp": self._icbp,
                "c": self._len_c,
                "b": self._len_b,
                "x": self._len_x };

    def _fmt_dim( self, d_ ):
        """Format a flat dimension index as e.g. '7(b2)' (group + local index)."""
        d = None;
        if d_ < self._len_c:
            d = "c" + str( d_ );
        elif d_ < self._len_c + self._len_b:
            d = "b" + str( d_ - self._len_c );
        elif d_ < self._len_c + self._len_b + self._len_x:
            d = "x" + str( d_ - self._len_c - self._len_b );
        else:
            assert False;
        return "{:d}({:s})".format( d_, d );

    def _init( self ):
        """Derive core/satellite/removed dimension sets from the statistics."""
        self._needs_initialization = False;
        if False:
            for i in sorted( self._ic ):
                (corr,h_a,h_b,h_ab,mi) = self._ic[ i ];
                print( "{:s} {:1.4f} {:1.4f} {:1.4f} {:1.4f} {:1.4f}"\
                       .format( self._fmt_dim( i ), corr, h_a, h_b, h_ab, mi ) );
            for (i,j) in sorted( self._icbp ):
                (corr,h_a,h_b,h_ab,mi) = self._icbp[ (i,j) ];
                print( "{:s} {:s} {:1.4f} {:1.4f} {:1.4f} {:1.4f} {:1.4f}"\
                       .format( self._fmt_dim( i ), self._fmt_dim( j ),
                                corr, h_a, h_b, h_ab, mi ) );
        # Rank dimensions by joint entropy and by correlation with the output.
        entropy \
            = [ ( h_ab, i ) \
                for ( i, (corr,h_a,h_b,h_ab,mi) ) in self._ic.items() ];
        output_correlation \
            = [ ( corr, i ) \
                for ( i, (corr,h_a,h_b,h_ab,mi) ) in self._ic.items() ];
        # Core = top 5 by entropy plus top 3 by output correlation.
        self._core_dims = set();
        self._core_dims \
            |= { i \
                 for ( h_ab, i ) \
                 in sorted( entropy, reverse=True )[ :5 ] };
        self._core_dims \
            |= { i \
                 for ( h_ab, i ) \
                 in sorted( output_correlation, reverse=True )[ :3 ] };
        if True:
            print( "core = ",
                   " ".join([ self._fmt_dim(d) for d in self._core_dims ]) );
        # For every core dimension keep its most strongly correlated partner
        # (correlation must exceed 0.5) as a satellite dimension.
        self._satellite_dims = set();
        for core_dim in self._core_dims:
            satellite_dim = None;
            satellite_dim_c = None;
            satellite_dim_stats = None;
            for ( (i,j), (corr,h_a,h_b,h_ab,mi) ) in self._icbp.items():
                if corr <= 0.5:
                    continue;
                other_dim = None;
                if i == core_dim:
                    other_dim = j;
                elif j == core_dim:
                    other_dim = i;
                else:
                    continue;
                if ( satellite_dim_c is None ) or ( corr > satellite_dim_c ):
                    satellite_dim = other_dim;
                    satellite_dim_c = corr;
                    satellite_dim_stats = (corr,h_a,h_b,h_ab,mi);
            if satellite_dim is not None:
                self._satellite_dims.add( satellite_dim );
                if False:
                    print( '->', self._fmt_dim(core_dim),
                           self._fmt_dim(satellite_dim) );
                    print( "{:1.4f} {:1.4f} {:1.4f} {:1.4f} {:1.4f}"\
                           .format( *(corr,h_a,h_b,h_ab,mi) ) );
        if True:
            print( "satellite = ",
                   " ".join([ self._fmt_dim(d) for d in self._satellite_dims ]) );
        # Everything that is neither core nor satellite is removed.
        self._removed_dims = set();
        for i in self._ic:
            if i not in self._core_dims and i not in self._satellite_dims:
                self._removed_dims.add( i );
        if True:
            print( "removed = ",
                   " ".join([ self._fmt_dim(d) for d in self._removed_dims ]) );
        # Translate flat removed indices into group-local index sets.
        for d_ in self._removed_dims:
            if d_ < self._len_c:
                self._remove_c.add( d_ );
            elif d_ < self._len_c + self._len_b:
                self._remove_b.add( d_ - self._len_c );
            elif d_ < self._len_c + self._len_b + self._len_x:
                self._remove_x.add( d_ - self._len_c - self._len_b );
            else:
                assert False;

    def apply_c( self, c ):
        """Return c with removed dimensions filtered out (unless bypassed)."""
        if self.bypass_c:
            return c;
        if self._needs_initialization:
            self._init();
        c_ = [];
        for ( i, cval ) in enumerate( c ):
            if not i in self._remove_c:
                c_.append( cval );
        return c_;

    def apply_b( self, b ):
        """Return b with removed dimensions filtered out (unless bypassed)."""
        if self.bypass_b:
            return b;
        if self._needs_initialization:
            self._init();
        b_ = [];
        for ( i, bval ) in enumerate( b ):
            if not i in self._remove_b:
                b_.append( bval );
        return b_;

    def apply_x( self, x ):
        """Return x with removed dimensions filtered out (unless bypassed)."""
        if self.bypass_x:
            return x;
        if self._needs_initialization:
            self._init();
        x_ = [];
        for ( i, xval ) in enumerate( x ):
            if not i in self._remove_x:
                x_.append( xval );
        return x_;

    def __call__( self, row ):
        """Apply feature selection to a full (y, c, b, x) row; y is unchanged."""
        if self._needs_initialization:
            self._init();
        ( y, c, b, x ) = row;
        y_ = y;
        return \
            ( y_,
              self.apply_c( c ),
              self.apply_b( b ),
              self.apply_x( x ) );
class LevelDBStorage(object):
    """Generic database.

    Stores per-uid key/value tuples under a b'tuples' prefix and a reverse
    (key, value, uid) lookup under a b'index' prefix.  Keys are serialized
    with the module-level ``pack``/``unpack`` helpers.
    """

    def __init__(self, path):
        # Open (or create) the underlying LevelDB and carve out the two
        # prefixed sub-databases.
        self.db = DB(
            path,
            create_if_missing=True,
            lru_cache_size=10*10,
            bloom_filter_bits=64,
            block_size=10**9,
            compression=None,
        )
        self.tuples = self.db.prefixed_db(b'tuples')
        self.index = self.db.prefixed_db(b'index')

    def close(self):
        """Close the underlying LevelDB handle."""
        self.db.close()

    def ref(self, uid, key):
        """Return the value stored for (uid, key), or None when absent.

        Only the first record at the seek position is examined; the sorted
        key order guarantees an exact match appears there if it exists.
        """
        match = [uid, key]
        for db_key, db_value in self.tuples.iterator(start=pack(uid, key)):
            if unpack(db_key) == match:
                return unpack(db_value)[0]
            return None
        return None

    def get(self, uid):
        """Return all key/value properties of ``uid`` as a dict."""
        def __get():
            for db_key, db_value in self.tuples.iterator(start=pack(uid)):
                other, name = unpack(db_key)
                if other == uid:
                    yield name, unpack(db_value)[0]
                else:
                    # Sorted iteration left the uid's key range: stop.
                    break
        return dict(__get())

    def add(self, uid, **properties):
        """Store ``properties`` for ``uid`` and maintain the reverse index."""
        tuples = self.tuples.write_batch(transaction=True)
        index = self.index.write_batch(transaction=True)
        for key, value in properties.items():
            tuples.put(pack(uid, key), pack(value))
            # fix: LevelDB values must be bytes; the original passed the
            # str '' which plyvel rejects with a TypeError.
            index.put(pack(key, value, uid), b'')
        tuples.write()
        index.write()

    def delete(self, uid):
        """Remove every property of ``uid`` from both sub-databases."""
        tuples = self.tuples.write_batch(transaction=True)
        index = self.index.write_batch(transaction=True)
        for db_key, db_value in self.tuples.iterator(start=pack(uid)):
            other, name = unpack(db_key)
            if uid == other:
                tuples.delete(db_key)
                value = unpack(db_value)[0]
                index.delete(pack(name, value, uid))
            else:
                break
        tuples.write()
        index.write()

    def update(self, uid, **properties):
        """Replace all properties of ``uid`` with ``properties``."""
        self.delete(uid)
        self.add(uid, **properties)

    def debug(self):
        """Print every stored tuple (development helper)."""
        for db_key, db_value in self.tuples.iterator():
            uid, key = unpack(db_key)
            value = unpack(db_value)[0]
            print(uid, key, value)

    def query(self, key, value=''):
        """Yield unpacked index entries whose prefix matches (key[, value])."""
        match = (key, value) if value else (key,)
        iterator = self.index.iterator(start=pack(key, value))
        for db_key, db_value in iterator:
            other = unpack(db_key)
            # fix: the original reduce(...) used ``cmp``, which does not
            # exist on Python 3; an element-wise prefix comparison is
            # equivalent.
            if all(a == b for a, b in zip(match, other)):
                yield other
            else:
                break
class Snapshot(object):
    """
    use persistent method (like file, db and so on) to store (cache)
    Output of the Input, so we can bypass the known pair to save
    time/cpu/...

    Keys and values are pickled before being written to a LevelDB database.
    """

    def __init__(self, dbpath, *args, debug=False, refresh=None, **kwargs):
        """
        :param dbpath: path of the LevelDB database backing the cache
        :param debug: raise the log handler level to DEBUG
        :param refresh: ignore data in db and refresh using new value
        """
        super().__init__(*args, **kwargs)
        try:
            self.db = DB(dbpath, create_if_missing=True)
        except Exception:
            self.db = None
            # Re-raise as-is; ``raise e`` would rewrite the traceback.
            raise
        self.old_key = None
        self.upgrade = False
        if debug:
            handler.level = logging.DEBUG
        self.refresh = bool(refresh)

    def __del__(self):
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # fix: the context-manager protocol passes three exception arguments;
        # the original single-argument signature raised a TypeError on exit.
        self.close()

    def __iter__(self):
        for k, v in self.db.iterator():
            yield self.recover_bytes(k), self.recover_bytes(v)

    def __contains__(self, key):
        # raise Exception('we do NOT know which one means EXIST')
        return self.get(key, None) is not None

    def __call__(self, *args, ignore=None, redos=None):
        # fix: ignore/redos are keyword-only on snapshot(); the original
        # passed them positionally, so they landed in *_args and were
        # silently dropped.
        return self.snapshot(*args, ignore=ignore, redos=redos)

    def close(self):
        # getattr guard: __del__ may run on a partially constructed instance.
        db = getattr(self, 'db', None)
        if db:
            db.close()
            self.db = None

    @staticmethod
    def to_bytes(data):
        """
        support all basic type. but never support recursion data,
        like List[Dict].
        all data will be translated to bytes if possible.
        use pickle to save bytes so we can store any possible data.
        """
        return pickle.dumps(data)

    @staticmethod
    def recover_bytes(data):
        """Inverse of to_bytes (pickle round-trip)."""
        return pickle.loads(data)

    def get(self, key, default=None):
        """
        user shold determine the key exist or not (according to the default)
        """
        logger.debug('key: {}', key, )
        key = self.to_bytes(key)
        data = self.db.get(key, default)
        if data != default:
            logger.debug('get exist: {} -> data(type={})', key, type(data))
        return data

    def get_result(self, key):
        """
        get the value related to the key, return the result by decoding it
        from bytes (the unpickled object), or None when absent

        :param key:
        :return:
        """
        data = self.get(key)
        if data is None:
            return None
        return self.recover_bytes(data)

    def put(self, k, v):
        """Pickle both key and value and store them."""
        logger.debug('put: {} -> data(type={})', k, type(v))
        key = self.to_bytes(k)
        data = self.to_bytes(v)
        return self.db.put(key, data)

    def exist(self, key):
        return key in self

    def delete(self, k):
        key = self.to_bytes(k)
        return self.db.delete(key)

    def set_upgrade(self, *old_args):
        """Remember an old key layout so cached entries can be migrated."""
        positions, keys = self.get_key_config(*old_args)
        self.upgrade = True
        self.old_key = positions, keys

    @staticmethod
    def get_key_config(*args):
        """Split key selectors into positional indices (ints) and kwarg names
        (strs)."""
        positions, keys = [], []
        for item in args:
            if isinstance(item, int):
                positions.append(item)
            elif isinstance(item, str):
                keys.append(item)
        return positions, keys

    def get_key(self, positions, keys, *args, **kwargs):
        """Build the cache key from the selected call arguments."""
        logger.debug('get key from {} {} (positions:{} keys:{})',
                     args, kwargs, positions, keys, )
        key = []
        for p in positions:
            key.append(args[p])
        for k in keys:
            key.append(kwargs[k])
        return key

    def snapshot(self, *_args, ignore=None, redos=None,
                 ignore_callback=None, redo_callback=None):
        """
        Decorator factory caching a function's results in the database.

        the args:
            can be number: the idx/pos of given args
            can be string: the key name in kwargs
        the kwargs: some config for snapshot
        """
        logger.debug('choose as key: {}', _args)
        positions, keys = self.get_key_config(*_args)
        # will ignore some return value, aka. no snapshot for it
        _ignore = ignore
        # will redo for some return value, should be a list
        _redos = redos or []
        logger.debug('choose position args: {}', positions)
        logger.debug('choose name kwargs: {}', keys)

        def do_snapshot(func):
            def is_ignore(value):
                if value == _ignore:
                    return True
                if ignore_callback and ignore_callback(value):
                    return True
                return False

            def is_redo(value):
                if value in _redos:
                    return True
                if redo_callback and redo_callback(value):
                    return True
                return False

            def worker(*args, **kwargs):
                key = self.get_key(positions, keys, *args, **kwargs)
                if self.upgrade:
                    # Migrate a cached value from the old key layout.
                    old_key = self.get_key(
                        self.old_key[0], self.old_key[1], *args, **kwargs)
                    logger.info('will upgrade old_key: {}', old_key)
                    result = self.get(old_key)
                    if result is not None:
                        result = self.recover_bytes(result)
                        logger.info('upgrade result: {} -> {} -> {}',
                                    old_key, key, result)
                        self.delete(old_key)
                        self.put(key, result)
                        return result
                    # NOTE(review): when upgrading and the old key is absent,
                    # the cache under the new key is not consulted and the
                    # function is recomputed — confirm intended.
                else:
                    result = self.get(key)
                    if result is not None:
                        result = self.recover_bytes(result)
                        if is_redo(result):
                            logger.warning('redo result: {}', result)
                            logging.getLogger().warning('redo result')
                        elif self.refresh:
                            # refresh mode: fall through and recompute.
                            pass
                        else:
                            return result
                result = func(*args, **kwargs)
                value = result
                if is_ignore(value):
                    logger.warning('ignore result: {}', result)
                elif is_redo(value):
                    logger.warning('redo result: {}', result)
                else:
                    self.put(key, value)
                return result
            return worker
        return do_snapshot