class BKNNModel(Model):
    """Binary classifier mixing discrete log-odds with a k-nearest-neighbour term.

    Training rows are ``(y, c, b, x)`` with ``y`` in ``{0, 1}``.  After the
    feature extractors / selector have been applied:

    * ``(y, c..., b...)`` combinations are counted in the "discrete" database
      (``<name>-discrete.kch``);
    * the continuous vector ``x`` is appended, together with ``y``, to the
      "continuous" database (``<name>.kch``) under the key formed by its
      discretised counterpart ``x_``.

    On a clean exit in write mode the model is finalised: the inverse
    covariance of a sample of the continuous vectors and a decision cut-off
    are computed and pickled next to the databases
    (``-icov.pickle`` / ``-meta.pickle``).

    The class is meant to be used as a context manager.

    NOTE(review): ``self._fn``, ``self._mode``, ``self._catfe``,
    ``self._binfe``, ``self._contfe``, ``self._fdisc`` and ``self._fsel`` are
    presumably set by ``Model.__init__`` -- confirm against the base class.
    ``DB`` looks like the Kyoto Cabinet binding -- confirm against the imports.
    """

    def __init__(self, fn, mode, catfe, binfe, contfe, fdisc, fsel, kval):
        """Store configuration and derive the companion file names from *fn*.

        :param fn: path of the continuous-data database (``*.kch``).
        :param mode: ``"r"`` to load an existing model, ``"w"`` to train one.
        :param catfe, binfe, contfe: callables applied to the categorical,
            binary and continuous parts of a row.
        :param fdisc: callable discretising the continuous vector.
        :param fsel: object exposing ``apply_c`` / ``apply_b`` / ``apply_x``.
        :param kval: number of nearest positive neighbours to keep.
        """
        Model.__init__(self, fn, mode, catfe, binfe, contfe, fdisc, fsel)
        self._kval = kval

        # File layout: everything is derived from the main ``.kch`` path.
        self._fn_cdata = self._fn
        self._fn_ddata = self._fn.replace('.kch', '-discrete.kch')
        self._fn_meta = self._fn.replace('.kch', '-meta.pickle')
        self._fn_icov = self._fn.replace('.kch', '-icov.pickle')

        # Database handles; only valid between __enter__ and __exit__.
        self._cdata = None
        self._ddata = None

        # Feature vector lengths, fixed by the first training row (or loaded
        # from the meta pickle in read mode).
        self._len_c = None
        self._len_b = None
        self._len_x = None

        # Class totals, filled in by train() / _init().
        self._rowcount = None
        self._total_pos = None
        self._total_neg = None

        self._icov = None  # inverse covariance of the continuous sample
        self._co = None    # decision cut-off; None means "return raw score"

        # In-memory sample of training rows used by _finalize().
        self._sample_y = []
        self._sample_c = []
        self._sample_b = []
        self._sample_x = []
        self._sample_x_ = []

        self._needs_finalization = False
        self._needs_initialization = True

        self._dmarginals = {}   # feature index -> {(y, value): count}
        self._dscores = {}      # feature index -> {value: log2 odds}
        self._sparse_points = 0  # queries with fewer than kval positives
        self._bias = None       # log2 prior odds of the positive class

    def _open_db(self, db, path):
        """Open *db* on *path* according to ``self._mode``.

        In write mode a pre-existing file is deleted first.  On any failure
        the database error is printed and the exception re-raised.

        :raises AssertionError: if the mode is unknown or the open fails.
            The exception is raised explicitly (rather than through an
            ``assert`` statement) so that the call is still executed and
            checked under ``python -O``, while callers keep seeing the same
            exception type as before.
        """
        try:
            if self._mode == "r":
                opened = db.open(path, DB.OREADER)
            elif self._mode == "w":
                if isfile(path):
                    remove(path)
                opened = db.open(path, DB.OWRITER | DB.OCREATE)
            else:
                raise AssertionError(
                    "unsupported mode: %r" % (self._mode,))
            if not opened:
                raise AssertionError("could not open %s" % (path,))
        except Exception:
            print(str(db.error()))
            raise

    @staticmethod
    def _close_db(db):
        """Close *db* if it is not ``None``.

        :raises AssertionError: if the close call reports failure (raised
            explicitly so the close still happens under ``python -O``).
        """
        if db is None:
            return
        if not db.close():
            print(str(db.error()))
            raise AssertionError("could not close database")

    def __enter__(self):
        """Open both databases and, in read mode, load the pickled metadata.

        :returns: ``self``.
        :raises AssertionError: if a database cannot be opened.
        """
        self._cdata = DB()
        self._ddata = DB()
        ready = False
        try:
            self._open_db(self._cdata, self._fn_cdata)
            self._open_db(self._ddata, self._fn_ddata)
            if self._mode == "r":
                # NOTE: pickle must only be used on files written by this
                # class; never point it at untrusted data.
                with open(self._fn_meta, 'rb') as f:
                    r = pickle_load(f)
                self._len_c = r["c"]
                self._len_b = r["b"]
                self._len_x = r["x"]
                self._co = r["co"]
                with open(self._fn_icov, 'rb') as f:
                    self._icov = pickle_load(f)
            ready = True
        finally:
            if not ready:
                # __exit__ is not invoked when __enter__ raises, so release
                # whatever was opened here (best effort: the return value of
                # close() is irrelevant on this path).
                for db in (self._cdata, self._ddata):
                    db.close()
                self._cdata = None
                self._ddata = None
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Finalise (write mode, clean exit), close the databases, clean up.

        When the ``with`` body raised in write mode, all files belonging to
        the model are removed.  Exceptions are never suppressed.

        :returns: ``False``.
        """
        ex_w_exc = (
            exc_type is not None
            or exc_value is not None
            or traceback is not None
        )

        try:
            if (not ex_w_exc) and (self._mode == "w"):
                if self._needs_finalization:
                    self._finalize()
                with open(self._fn_meta, 'wb') as f:
                    r = {
                        "c": self._len_c,
                        "b": self._len_b,
                        "x": self._len_x,
                        "co": self._co,
                    }
                    pickle_dump(r, f)
                with open(self._fn_icov, 'wb') as f:
                    pickle_dump(self._icov, f)
        finally:
            # Always attempt to close both handles, even if finalisation or
            # the first close fails.
            try:
                self._close_db(self._cdata)
                self._cdata = None
            finally:
                self._close_db(self._ddata)
                self._ddata = None

        if ex_w_exc and (self._mode == "w"):
            # A half-written model is useless: remove every file it owns.
            for path in (self._fn_cdata, self._fn_ddata,
                         self._fn_meta, self._fn_icov):
                if isfile(path):
                    remove(path)

        return False

    def train(self, row):
        """Add one training row ``(y, c, b, x)`` to the model.

        The first row fixes the expected lengths of ``c``, ``b`` and ``x``;
        later rows are asserted to match.  Up to 50000 rows are also kept in
        memory for use by ``_finalize``.

        :returns: ``False``.
        """
        self._needs_finalization = True

        (y, c, b, x) = row
        c = self._fsel.apply_c(self._catfe(c))
        b = self._fsel.apply_b(self._binfe(b))
        x = self._contfe(x)
        x_ = self._fdisc(x)  # discretise before feature selection
        x = self._fsel.apply_x(x)
        x_ = self._fsel.apply_x(x_)

        if self._len_c is None:
            self._len_c = len(c)
        assert self._len_c == len(c)

        if self._len_b is None:
            self._len_b = len(b)
        assert self._len_b == len(b)

        if self._len_x is None:
            self._len_x = len(x)
        assert self._len_x == len(x)

        if self._rowcount is None:
            self._rowcount = 0
        self._rowcount += 1

        # Discrete part: count occurrences of the (y, c..., b...) tuple.
        dkeyfmt = '>' + ('I' * (1 + self._len_c + self._len_b))
        self._ddata.increment(pack(dkeyfmt, y, *(c + b)), 1, 0)

        # Continuous part: append (y, x...) to the record of its bin tuple.
        ckeyfmt = '>' + ('I' * len(x))
        cvalfmt = '>I' + ('f' * len(x))
        self._cdata.append(pack(ckeyfmt, *x_), pack(cvalfmt, y, *x))

        if len(self._sample_x) < 50000:
            assert len(self._sample_x) == len(self._sample_y)
            assert len(self._sample_x) == len(self._sample_c)
            assert len(self._sample_x) == len(self._sample_b)
            assert len(self._sample_x) == len(self._sample_x_)
            self._sample_y.append(y)
            self._sample_c.append(c)
            self._sample_b.append(b)
            self._sample_x.append(x)
            self._sample_x_.append(x_)

        return False

    def _init(self):
        """Derive per-feature log2 odds and the class bias from the counts.

        Walks the discrete database once, builds the marginal count of every
        ``(y, value)`` pair per feature position, cross-checks the class
        totals between features, then fills ``_dscores`` and ``_bias``.
        """
        self._needs_initialization = False

        cursor = self._ddata.cursor()
        cursor.jump()
        keyfmt = '>' + ('I' * (1 + self._len_c + self._len_b))
        valfmt = '>Q'
        while True:
            rec = cursor.get(True)
            if not rec:
                break
            dbkey = unpack(keyfmt, rec[0])
            additional_count = unpack(valfmt, rec[1])[0]
            y = dbkey[0]
            for (i, value_of_variable_i) in enumerate(dbkey[1:]):
                marginal = self._dmarginals.setdefault(i, {})
                pair = (y, value_of_variable_i)
                marginal[pair] = marginal.get(pair, 0) + additional_count

        # Every feature sees every row exactly once, so the totals derived
        # from each marginal must agree with each other.
        for count_by_val in self._dmarginals.values():
            total = 0
            total_neg = 0
            total_pos = 0
            for ((y, _val), cnt) in count_by_val.items():
                total += cnt
                if y == 0:
                    total_neg += cnt
                elif y == 1:
                    total_pos += cnt

            if self._rowcount is None:
                self._rowcount = total
            assert self._rowcount == total

            if self._total_neg is None:
                self._total_neg = total_neg
            try:
                assert self._total_neg == total_neg
            except Exception:
                print(self._total_neg, total_neg)
                raise

            if self._total_pos is None:
                self._total_pos = total_pos
            try:
                assert self._total_pos == total_pos
            except Exception:
                print(self._total_pos, total_pos)
                raise

        assert (self._total_pos + self._total_neg) == self._rowcount

        for i in self._dmarginals:
            values = set(val for (_y, val) in self._dmarginals[i])
            scores = self._dscores.setdefault(i, {})
            # The smoothed denominators do not depend on the value.
            log_den_pos = log(
                float(self._total_pos) + float(len(values)) * SMOOTHING, 2.0)
            log_den_neg = log(
                float(self._total_neg) + float(len(values)) * SMOOTHING, 2.0)
            for val in values:
                pos_cnt = self._dmarginals[i].get((1, val), 0)
                neg_cnt = self._dmarginals[i].get((0, val), 0)
                p_pos = log(float(pos_cnt) + SMOOTHING, 2.0) - log_den_pos
                p_neg = log(float(neg_cnt) + SMOOTHING, 2.0) - log_den_neg
                scores[val] = p_pos - p_neg

        p_pos = log(float(self._total_pos), 2.0) \
            - log(float(self._rowcount), 2.0)
        p_neg = log(float(self._total_neg), 2.0) \
            - log(float(self._rowcount), 2.0)
        self._bias = p_pos - p_neg

    def _apply(self, row):
        """Score one already-preprocessed row ``(c, b, x, x_)``.

        Candidate neighbours are fetched from every bin tuple whose
        coordinates lie within +/-2 of the query's bins (restricted to the
        range 0..31 -- presumably the discretiser's bin range; confirm).
        Distances use the stored inverse covariance
        (``sqrt(d^T * icov * d)``).  Stored points at distance <= 0.0001 are
        treated as the query itself and ignored.

        :returns: the log2-odds score while no cut-off is set, otherwise
            ``1`` if the score reaches the cut-off and ``0`` if not.
        """
        if self._needs_initialization:
            self._init()

        (c, b, x, x_) = row

        ckeyfmt = '>' + ('I' * len(x_))
        cvalfmt = '>I' + ('f' * len(x))
        cvalsz = calcsize(cvalfmt)

        rng = []
        for xval in x_:
            rng.append([
                xv
                for xv in [xval - 2, xval - 1, xval, xval + 1, xval + 2]
                if 0 <= xv <= 31
            ])

        x_vec = np.array(x).reshape(1, self._len_x).T

        nearest_positive = []
        all_negative = []
        found_ident = 0

        for xvals in product(*rng):
            try:
                ckey = pack(ckeyfmt, *xvals)
            except Exception:
                print(ckeyfmt, xvals)
                raise
            val = self._cdata.get(ckey)
            # A record is a concatenation of fixed-size (y, x...) entries.
            while val:
                if len(val) <= cvalsz:
                    assert len(val) == cvalsz
                val_ = val[:cvalsz]
                val = val[cvalsz:]
                pt = unpack(cvalfmt, val_)
                pt_y = pt[0]
                pt_x = pt[1:]
                pt_x_vec = np.array(pt_x).reshape(1, self._len_x).T
                diff = pt_x_vec - x_vec
                dist = np.sqrt(np.dot(np.dot(diff.T, self._icov), diff))
                if dist <= 0.0001:
                    found_ident += 1
                    continue
                if pt_y == 0:
                    all_negative.append(dist)
                    continue
                assert pt_y == 1
                # Keep only the kval closest positives seen so far.
                nearest_positive.append(dist)
                nearest_positive.sort()
                nearest_positive = nearest_positive[:self._kval]

        # Checks left disabled by the original author:
        # assert found_ident == 1;
        # assert len( nearest_positive ) == self._kval;
        if len(nearest_positive) < self._kval:
            self._sparse_points += 1

        score = self._bias

        # Negatives are counted inside the radius of the farthest kept
        # positive; with no positive at all, every negative counts.
        if len(nearest_positive) == 0:
            threshold = None
        else:
            threshold = nearest_positive[-1]
        neg_cnt = 0
        for dist in all_negative:
            if (threshold is None) or (dist <= threshold):
                neg_cnt += 1
        p_pos = log(float(len(nearest_positive)) + SMOOTHING, 2.0) \
            - log(float(self._total_pos) + 2.0 * SMOOTHING, 2.0)
        p_neg = log(float(neg_cnt) + SMOOTHING, 2.0) \
            - log(float(self._total_neg) + 2.0 * SMOOTHING, 2.0)
        score += p_pos - p_neg

        # Unseen discrete values contribute nothing.
        for (i, dval) in enumerate(c + b):
            score += self._dscores[i].get(dval, 0.0)

        if self._co is None:
            return score
        if score >= self._co:
            return 1
        return 0

    def _finalize(self):
        """Compute the inverse covariance and pick the decision cut-off.

        The in-memory sample is scored, 1000 candidate cut-offs are taken
        from the sorted scores, and the one with the highest F-score on the
        sample becomes ``self._co``.  Diagnostics are printed at the end.
        """
        self._needs_finalization = False

        covsample = np.array(self._sample_x)
        cov = np.cov(covsample.T)
        self._icov = LA.inv(cov)

        sample = zip(
            self._sample_c, self._sample_b, self._sample_x, self._sample_x_)

        # _co is still None here, so _apply returns raw scores.
        scores = []
        for (c, b, x, x_) in sample:
            scores.append(self._apply([c, b, x, x_]))
        sorted_scores = list(sorted(scores))

        cutoffs = []
        for idx in range(0, 1000):
            ratio = float(idx) / 1000.0
            cutoffs.append(
                sorted_scores[int(float(len(sorted_scores)) * ratio)])

        stats_by_co = []
        for coidx in range(0, len(cutoffs)):
            stats_by_co.append({"tp": 0, "fp": 0, "tn": 0, "fn": 0})

        for (y, score) in zip(self._sample_y, scores):
            for (coidx, co) in enumerate(cutoffs):
                if score >= co:
                    if y == 1:
                        stats_by_co[coidx]["tp"] += 1
                    else:
                        assert y == 0
                        stats_by_co[coidx]["fp"] += 1
                else:
                    if y == 0:
                        stats_by_co[coidx]["tn"] += 1
                    else:
                        assert y == 1
                        stats_by_co[coidx]["fn"] += 1

        max_fscore = None
        max_fscore_coidx = None

        for (coidx, co) in enumerate(cutoffs):
            tp = stats_by_co[coidx]["tp"]
            fp = stats_by_co[coidx]["fp"]
            fn = stats_by_co[coidx]["fn"]

            # Skip cut-offs for which precision or recall is undefined.
            if (tp + fp) <= 0:
                continue
            if (tp + fn) <= 0:
                continue

            precision = float(tp) / float(tp + fp)
            recall = float(tp) / float(tp + fn)

            if (precision + recall) <= 0.0:
                continue

            fscore = 2.0 * ((precision * recall) / (precision + recall))

            if (max_fscore is None) or (fscore > max_fscore):
                max_fscore = fscore
                max_fscore_coidx = coidx

        assert max_fscore_coidx is not None
        self._co = cutoffs[max_fscore_coidx]

        # Check left disabled by the original author:
        # assert self._sparse_points == 0;
        print(self._sparse_points)
        print(self._co)
        print(max_fscore)

    def __call__(self, row):
        """Classify (or score) one raw row ``(c, b, x)``.

        Applies the same preprocessing as ``train`` and delegates to
        ``_apply``.

        :raises AssertionError: if the processed vectors do not have the
            lengths recorded for this model.
        """
        (c, b, x) = row
        c = self._fsel.apply_c(self._catfe(c))
        b = self._fsel.apply_b(self._binfe(b))
        x = self._contfe(x)
        x_ = self._fdisc(x)
        x = self._fsel.apply_x(x)
        x_ = self._fsel.apply_x(x_)

        try:
            assert self._len_c == len(c)
            assert self._len_b == len(b)
            assert self._len_x == len(x)
            assert self._len_x == len(x_)
        except Exception:
            print(self._len_c, self._len_b, self._len_x)
            raise

        return self._apply((c, b, x, x_))
class KyotoCabinetGraph(BaseGraph):
    """Graph backend that stores its data in a single Kyoto Cabinet database.

    Regular values are serialised with msgpack.  Lists of integers use a
    dedicated text format instead -- every item is written as ``'|<item>'``,
    so ``[1, 2, 3]`` is stored as ``'|1|2|3'`` and an empty list as ``''`` --
    which lets items be appended with a plain ``DB.append`` and no
    deserialise / reserialise round trip.

    The ``txn`` argument accepted by most methods is not used here:
    transactions are driven through the database object itself.

    NOTE(review): the ``encoding`` keyword passed to ``msgpack.loads`` /
    ``msgpack.dumps`` is, as far as I know, no longer accepted by
    msgpack >= 1.0 -- confirm the pinned msgpack version.
    """

    def __init__(self, path):
        """Open (creating it if needed) the database located at *path*.

        :raises GrapheekDataKyotoCabinetInitFailureException: if the
            database cannot be opened.
        """
        # create the database object
        self._path = path
        self._db = DB()
        # open the database
        if not self._db.open(path, DB.OREADER | DB.OWRITER | DB.OCREATE):
            raise GrapheekDataKyotoCabinetInitFailureException(
                str(self._db.error()))
        super(KyotoCabinetGraph, self).__init__()
        self._ensure_prepared()
        self._closed = False

    # Start method overriding :

    def _db_close(self):
        """Close the database once; further calls are no-ops."""
        if not self._closed:
            self._db.close()
            # Remember the close so that the guard above is effective.
            self._closed = True

    def _transaction_begin(self):
        """Start a transaction and return a truthy transaction token."""
        self._db.begin_transaction()
        return True

    def _transaction_commit(self, txn):
        """Commit the current transaction."""
        self._db.end_transaction(True)

    def _transaction_rollback(self, txn):
        """Abort the current transaction."""
        self._db.end_transaction(False)

    def _has_key(self, key):
        """Return True if *key* is present in the database."""
        return self._db.check(key) >= 0

    def _get(self, txn, key):
        """Return the value stored under *key*, or ``UNDEFINED`` if absent."""
        raw_data = self._db.get(key)
        if raw_data is None:
            return UNDEFINED  # Not returning None, as None is a valid value
        return msgpack.loads(raw_data, encoding='utf8')

    def _bulk_get(self, txn, keys):
        """Return a dict ``{key: value}`` for the keys that exist."""
        result = {}
        key_raw_datas = self._db.get_bulk(keys)
        for key, raw_data in list(key_raw_datas.items()):
            if PYTHON2:  # pragma : no cover
                k = key
            else:  # pragma : no cover
                k = str(key, encoding='utf8')
            result[k] = msgpack.loads(raw_data, encoding='utf8')
        return result

    def _set(self, txn, key, value):
        """Store *value* under *key*.

        :raises GrapheekDataKyotoCabinetException: if the write fails.
        """
        res = self._db.set(key, msgpack.dumps(value, encoding='utf8'))
        if not (res):  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _bulk_set(self, txn, updates):
        """Store every ``key -> value`` pair of *updates*.

        :raises GrapheekDataKyotoCabinetException: if the write fails.
        """
        dic = {}
        for key, value in list(updates.items()):
            dic[key] = msgpack.dumps(value, encoding='utf8')
        res = self._db.set_bulk(dic)
        if res == -1:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _remove(self, txn, key):
        """Remove *key*; removing a missing key is not an error."""
        # Contrary to the LocalMemoryGraph implementation, the removal does
        # not need to be wrapped in try/except: KyotoCabinet only returns
        # False when the key does not exist, so this method is idempotent
        # (cf. the comment on LocalMemoryGraph._remove).
        self._db.remove(key)

    def _bulk_remove(self, txn, keys):
        """Remove every key of *keys*.

        :raises GrapheekDataKyotoCabinetException: if the removal fails.
        """
        res = self._db.remove_bulk(list(keys))
        if res == -1:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _remove_prefix(self, txn, prefix):
        """Remove every key starting with *prefix*."""
        keys = self._db.match_prefix(prefix)
        self._db.remove_bulk(keys)

    # overriding list methods
    # looks like a bucket of hacks, and yes indeed it is :)
    # btw, it REALLY improves performance compared to the default
    # implementation which, in the case of KyotoCabinet, would involve a
    # msgpack deserialization followed by a serialization

    @staticmethod
    def _dump_lst(values):
        """Serialise *values* as ``'|v1|v2...'`` (``''`` for no values)."""
        return ''.join('|' + str(value) for value in values)

    @staticmethod
    def _strip_lst_tokens(serialized, values):
        """Remove one occurrence of each of *values* from a serialised list.

        Items are compared as whole tokens, so removing ``1`` never touches
        ``12``.  Values that are not present are skipped.

        :returns: ``(new_serialized, number_of_items_removed)``.
        """
        tokens = serialized.split('|')
        removed = 0
        for value in values:
            try:
                # tokens[0] is the empty chunk before the leading '|'.
                idx = tokens.index(str(value), 1)
            except ValueError:
                continue
            del tokens[idx]
            removed += 1
        return '|'.join(tokens), removed

    def _init_lst(self, txn, key):
        """Create an empty list under *key*.

        :raises GrapheekDataKyotoCabinetException: if the write fails.
        """
        res = self._db.set(key, '')
        if not (res):  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _get_lst(self, txn, key):
        """Return the list of ints stored under *key*, or ``UNDEFINED``."""
        value = self._db.get(key)
        if value is None:
            return UNDEFINED
        # Every item is stored with a leading '|' (see _append_to_lst), so
        # the first chunk produced by the split is always empty: skip it.
        if PYTHON2:  # pragma : no cover
            return list(map(int, value.split('|')[1:]))
        return list(map(
            int, str(value, encoding='utf8').split('|')[1:]))  # pragma : no cover

    def _set_lst(self, txn, key, values):
        """Replace the list stored under *key* with *values*.

        An empty *values* is stored as ``''`` (same as ``_init_lst``) so
        that it can be read back by ``_get_lst``.

        :raises GrapheekDataKyotoCabinetException: if the write fails.
        """
        res = self._db.set(key, self._dump_lst(values))
        if not (res):  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _bulk_get_lst(self, txn, keys):
        """Return one list per key of *keys* (``[]`` for missing keys)."""
        dic_values = self._db.get_bulk(keys)
        results = []
        for key in keys:
            if PYTHON2:  # pragma : no cover
                values = dic_values.get(key, UNDEFINED)
            else:  # pragma : no cover
                values = dic_values.get(bytes(key, encoding='utf8'), UNDEFINED)
            if values == UNDEFINED:
                results.append([])
            else:
                if PYTHON2:  # pragma : no cover
                    results.append(list(map(int, values.split('|')[1:])))
                else:  # pragma : no cover
                    results.append(
                        list(
                            map(int, str(values, encoding='utf8').split('|')[1:])))
        return results

    def _append_to_lst(self, txn, key, value):
        """Append *value* to the list stored under *key*."""
        self._db.append(key, '|' + str(value))

    def _bulk_append_to_lst(self, txn, key, values):
        """Append every item of *values* to the list stored under *key*.

        Appending no values leaves the stored list unchanged.
        """
        self._db.append(key, self._dump_lst(values))

    def _remove_from_lst(self, txn, key, value):
        """Remove ONE occurrence of *value* from the list stored under *key*.

        :raises ValueError: if *value* is not in the list.
        :raises GrapheekDataKyotoCabinetException: if the write fails.
        """
        old = self._db.get(key)
        if not PYTHON2:  # pragma : no cover
            old = str(old, encoding='utf8')
        # Caution : only ONE occurrence is removed, and this is deliberate.
        # For instance, if the list contains neighbour nodes, the current
        # entity and a neighbour can be linked multiple times, so a single
        # removal must drop a single link.
        new, removed = self._strip_lst_tokens(old, [value])
        if not removed:
            raise ValueError("list.remove(x): x not in list")
        res = self._db.set(key, new)
        if not (res):  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _bulk_remove_from_lst(self, txn, key, values):
        """Remove one occurrence of each of *values* from the list at *key*.

        :raises ValueError: if none of *values* is in the list.
        :raises GrapheekDataKyotoCabinetException: if the write fails.
        """
        assert (len(values))
        old = self._db.get(key)
        if not PYTHON2:  # pragma : no cover
            # Decode first so that the text, not the raw bytes, is compared
            # and edited.
            old = str(old, encoding='utf8')
        new, removed = self._strip_lst_tokens(old, values)
        if not removed:  # pragma : no cover
            raise ValueError("list.remove(x): x not in list")
        res = self._db.set(key, new)
        if not (res):  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res