class BKNNModel( Model ):


  def __init__(self, fn, mode, catfe, binfe, contfe, fdisc, fsel, kval):
    """Create an unopened model; the stores are opened by ``__enter__``.

    All arguments except ``kval`` are forwarded unchanged to
    ``Model.__init__``.  ``kval`` is the number of nearest positive
    neighbours retained when a row is scored.
    """
    Model.__init__(self, fn, mode, catfe, binfe, contfe, fdisc, fsel)

    self._kval = kval

    # Companion files all derive their names from the main store path.
    base = self._fn
    self._fn_cdata = base
    self._fn_ddata = base.replace('.kch', '-discrete.kch')
    self._fn_meta = base.replace('.kch', '-meta.pickle')
    self._fn_icov = base.replace('.kch', '-icov.pickle')

    # Store handles (continuous / discrete), created on __enter__.
    self._cdata = self._ddata = None

    # Feature-vector lengths, fixed by the first trained row or loaded
    # from the metadata file.
    self._len_c = self._len_b = self._len_x = None

    # Row and class counters.
    self._rowcount = self._total_pos = self._total_neg = None

    # Inverse covariance of the continuous features and decision cutoff.
    self._icov = None
    self._co = None

    # In-memory sample of the training rows (parallel lists).
    self._sample_y, self._sample_c, self._sample_b = [], [], []
    self._sample_x, self._sample_x_ = [], []

    self._needs_finalization = False
    self._needs_initialization = True

    # Per-feature counts and log-odds scores for the discrete features.
    self._dmarginals, self._dscores = {}, {}

    # Number of scored rows that had fewer than kval positive neighbours.
    self._sparse_points = 0

    self._bias = None


  def __enter__(self):
    """Open both stores and, in read mode, load the pickled metadata.

    In mode ``"w"`` an existing store file is deleted and recreated; in
    mode ``"r"`` the stores are opened read-only and the feature lengths,
    the cutoff and the inverse covariance are loaded from the companion
    pickle files.

    Returns:
      self

    Raises:
      AssertionError: the mode is neither ``"r"`` nor ``"w"``, or a store
        could not be opened.  The store's last error is printed first.
    """
    self._cdata = DB()
    self._ddata = DB()

    try:
      stores = (
        (self._cdata, self._fn_cdata),
        (self._ddata, self._fn_ddata),
      )
      for db, path in stores:
        try:
          # The open call must not live inside an ``assert``: asserts are
          # stripped under ``python -O`` and the store would stay closed.
          if self._mode == "r":
            opened = db.open(path, DB.OREADER)
          elif self._mode == "w":
            if isfile(path):
              remove(path)
            opened = db.open(path, DB.OWRITER | DB.OCREATE)
          else:
            raise AssertionError("unsupported mode: %r" % (self._mode,))
          if not opened:
            # AssertionError is kept as the failure type for callers that
            # already handle it.
            raise AssertionError("cannot open store: %s" % (path,))
        except BaseException:
          print(str(db.error()))
          raise

      if self._mode == "r":
        # NOTE(review): pickle must only be used on trusted model files.
        with open(self._fn_meta, 'rb') as f:
          meta = pickle_load(f)
        self._len_c = meta["c"]
        self._len_b = meta["b"]
        self._len_x = meta["x"]
        self._co = meta["co"]

        with open(self._fn_icov, 'rb') as f:
          self._icov = pickle_load(f)

    except BaseException:
      # __exit__ is not called when __enter__ raises, so release whatever
      # was opened before propagating the error.
      self._cdata.close()
      self._ddata.close()
      self._cdata = None
      self._ddata = None
      raise

    return self


  def __exit__(self, exc_type, exc_value, traceback):
    """Finish the model (write mode), close the stores and clean up.

    On a clean exit in mode ``"w"`` the model is finalized if rows were
    trained, and the metadata and inverse covariance are pickled next to
    the stores.  Both stores are always closed.  If the ``with`` body, the
    finalization or the pickling failed in mode ``"w"``, all four output
    files are removed so that no partial model is left behind.

    Returns:
      False, so an exception raised in the ``with`` body is never
      suppressed.

    Raises:
      AssertionError: a store could not be closed (its error is printed).
    """
    failed = not (
      exc_type is None and exc_value is None and traceback is None
    )

    try:
      if (not failed) and self._mode == "w":

        if self._needs_finalization:
          self._finalize()

        meta = {
          "c": self._len_c,
          "b": self._len_b,
          "x": self._len_x,
          "co": self._co,
        }
        with open(self._fn_meta, 'wb') as f:
          pickle_dump(meta, f)

        with open(self._fn_icov, 'wb') as f:
          pickle_dump(self._icov, f)

    except BaseException:
      # Remember the failure so the partial output is removed below.
      failed = True
      raise

    finally:
      try:
        # close() is called outside of ``assert`` so that it still runs
        # under ``python -O``; both stores are attempted even if the first
        # one fails.
        close_ok = True

        if self._cdata is not None:
          if self._cdata.close():
            self._cdata = None
          else:
            print(str(self._cdata.error()))
            close_ok = False

        if self._ddata is not None:
          if self._ddata.close():
            self._ddata = None
          else:
            print(str(self._ddata.error()))
            close_ok = False

        if not close_ok:
          raise AssertionError("failed to close a model store")

      finally:
        if failed and self._mode == "w":
          for path in (
            self._fn_cdata,
            self._fn_ddata,
            self._fn_meta,
            self._fn_icov,
          ):
            if isfile(path):
              remove(path)

    return False


  def train(self, row):
    """Add one labelled row ``(y, c, b, x)`` to the model being built.

    The categorical, binary and continuous parts are run through their
    feature extractors and the feature selector.  The count of the
    ``(y, c, b)`` combination is incremented in the discrete store, and a
    ``(y, x)`` record is appended to the continuous store under the key
    built from the discretized ``x``.  The first 50000 rows are also kept
    in memory for use by ``_finalize``.

    Returns:
      False, always.
    """
    self._needs_finalization = True

    label, cat, binary, cont = row

    cat = self._fsel.apply_c(self._catfe(cat))
    binary = self._fsel.apply_b(self._binfe(binary))

    cont = self._contfe(cont)
    cont_disc = self._fdisc(cont)

    cont = self._fsel.apply_x(cont)
    cont_disc = self._fsel.apply_x(cont_disc)

    # The first row fixes the feature lengths; later rows must match them.
    if self._len_c is None:
      self._len_c = len(cat)
    assert self._len_c == len(cat)

    if self._len_b is None:
      self._len_b = len(binary)
    assert self._len_b == len(binary)

    if self._len_x is None:
      self._len_x = len(cont)
    assert self._len_x == len(cont)

    self._rowcount = (self._rowcount or 0) + 1

    # Discrete store: one counter per (label, categorical, binary) tuple.
    discrete_fmt = '>' + 'I' * (1 + self._len_c + self._len_b)
    discrete_key = pack(discrete_fmt, label, *(cat + binary))
    self._ddata.increment(discrete_key, 1, 0)

    # Continuous store: records are concatenated under the discretized key.
    n_cont = len(cont)
    cont_key = pack('>' + 'I' * n_cont, *cont_disc)
    cont_val = pack('>I' + 'f' * n_cont, label, *cont)
    self._cdata.append(cont_key, cont_val)

    if len(self._sample_x) >= 50000:
      return False

    # The sample lists are parallel and must stay the same length.
    for other in (
      self._sample_y,
      self._sample_c,
      self._sample_b,
      self._sample_x_,
    ):
      assert len(self._sample_x) == len(other)

    self._sample_y.append(label)
    self._sample_c.append(cat)
    self._sample_b.append(binary)
    self._sample_x.append(cont)
    self._sample_x_.append(cont_disc)

    return False


  def _init(self):
    """Build the scoring tables for the discrete features.

    Scans every record of the discrete store, whose key is the packed
    ``(y, c..., b...)`` tuple and whose value is its occurrence count, and
    derives:

    * ``_rowcount``, ``_total_pos`` and ``_total_neg``;
    * ``_dmarginals[i][(y, value)]``: count per discrete feature ``i``;
    * ``_dscores[i][value]``: smoothed log2 odds (positive vs. negative)
      of seeing ``value`` for feature ``i``;
    * ``_bias``: log2 odds of the class priors.

    Raises:
      AssertionError: the counts are inconsistent with ``_rowcount`` or a
        label other than 0/1 is present.
      ValueError: presumably, when one of the classes has no rows, since
        the bias takes the logarithm of the class total (assumes ``log``
        is ``math.log`` -- TODO confirm).
    """
    self._needs_initialization = False

    keyfmt = '>' + 'I' * (1 + self._len_c + self._len_b)
    valfmt = '>Q'

    # The class totals are accumulated straight from the records rather
    # than from the per-feature marginals, so they are also available when
    # the model has no categorical or binary feature at all.
    total = 0
    total_neg = 0
    total_pos = 0

    cursor = self._ddata.cursor()
    try:
      cursor.jump()

      while True:

        rec = cursor.get(True)
        if not rec:
          break

        dbkey = unpack(keyfmt, rec[0])
        count = unpack(valfmt, rec[1])[0]

        y = dbkey[0]

        total += count
        if y == 0:
          total_neg += count
        elif y == 1:
          total_pos += count

        for i, value in enumerate(dbkey[1:]):
          marginal = self._dmarginals.setdefault(i, {})
          marginal[(y, value)] = marginal.get((y, value), 0) + count

    finally:
      # Kyoto Cabinet cursors must be released explicitly.
      cursor.disable()

    if self._rowcount is None:
      self._rowcount = total
    assert self._rowcount == total

    if self._total_neg is None:
      self._total_neg = total_neg
    try:
      assert self._total_neg == total_neg
    except:
      print(self._total_neg, total_neg)
      raise

    if self._total_pos is None:
      self._total_pos = total_pos
    try:
      assert self._total_pos == total_pos
    except:
      print(self._total_pos, total_pos)
      raise

    assert (self._total_pos + self._total_neg) == self._rowcount

    for i, marginal in self._dmarginals.items():

      values = set(val for (_, val) in marginal)
      n_values = float(len(values))

      scores = self._dscores.setdefault(i, {})

      for val in values:

        pos_cnt = marginal.get((1, val), 0)
        neg_cnt = marginal.get((0, val), 0)

        p_pos = (
          log(float(pos_cnt) + SMOOTHING, 2.0)
          - log(float(self._total_pos) + n_values * SMOOTHING, 2.0)
        )

        p_neg = (
          log(float(neg_cnt) + SMOOTHING, 2.0)
          - log(float(self._total_neg) + n_values * SMOOTHING, 2.0)
        )

        scores[val] = p_pos - p_neg

    # Class-prior log odds (not smoothed).
    p_pos = (
      log(float(self._total_pos), 2.0)
      - log(float(self._rowcount), 2.0)
    )

    p_neg = (
      log(float(self._total_neg), 2.0)
      - log(float(self._rowcount), 2.0)
    )

    self._bias = p_pos - p_neg


  def _apply(self, row):
    """Score one preprocessed row ``(c, b, x, x_)``.

    Looks up every stored point whose discretized key differs from ``x_``
    by at most 2 in each coordinate (bins outside 0..31 are skipped) and
    measures its distance to ``x`` with the inverse covariance matrix.
    Points closer than 0.0001 are ignored.  The ``kval`` nearest positive
    points set a distance threshold; negatives within it are counted.  The
    score is the bias plus the smoothed log2 odds of those two counts plus
    the per-feature discrete scores.

    Returns:
      The raw score when no cutoff is set, otherwise 1 if the score
      reaches the cutoff and 0 if not.
    """
    if self._needs_initialization:
      self._init()

    cat, binary, cont, cont_disc = row

    key_fmt = '>' + 'I' * len(cont_disc)
    rec_fmt = '>I' + 'f' * len(cont)
    rec_size = calcsize(rec_fmt)

    # Candidate bins per coordinate: the row's own bin and two either side.
    neighbour_bins = [
      [
        center + offset
        for offset in (-2, -1, 0, 1, 2)
        if 0 <= center + offset <= 31
      ]
      for center in cont_disc
    ]

    query = np.array(cont).reshape(1, self._len_x).T

    positives = []
    negatives = []

    for bins in product(*neighbour_bins):

      try:
        key = pack(key_fmt, *bins)
      except:
        print(key_fmt, bins)
        raise

      blob = self._cdata.get(key)

      # The value is a concatenation of fixed-size records.
      while blob:

        if len(blob) <= rec_size:
          assert len(blob) == rec_size

        record, blob = blob[:rec_size], blob[rec_size:]

        fields = unpack(rec_fmt, record)
        point_y = fields[0]
        point = np.array(fields[1:]).reshape(1, self._len_x).T

        delta = point - query
        dist = np.sqrt(np.dot(np.dot(delta.T, self._icov), delta))

        if dist <= 0.0001:
          # Treated as the query point itself: not a neighbour.
          pass
        elif point_y == 0:
          negatives.append(dist)
        else:
          assert point_y == 1
          positives.append(dist)
          positives.sort()
          positives = positives[:self._kval]

    if len(positives) < self._kval:
      self._sparse_points += 1

    # Without any positive neighbour every negative is counted.
    if positives:
      threshold = positives[-1]
      neg_cnt = sum(1 for dist in negatives if dist <= threshold)
    else:
      neg_cnt = len(negatives)

    p_pos = (
      log(float(len(positives)) + SMOOTHING, 2.0)
      - log(float(self._total_pos) + 2.0 * SMOOTHING, 2.0)
    )

    p_neg = (
      log(float(neg_cnt) + SMOOTHING, 2.0)
      - log(float(self._total_neg) + 2.0 * SMOOTHING, 2.0)
    )

    score = self._bias + (p_pos - p_neg)

    for i, dval in enumerate(cat + binary):
      score += self._dscores[i].get(dval, 0.0)

    if self._co is None:
      return score

    return 1 if score >= self._co else 0


  def _finalize(self):
    """Estimate the inverse covariance and choose the decision cutoff.

    Works on the in-memory training sample:

    1. the covariance of the continuous features is inverted and stored
       in ``_icov``;
    2. every sampled row is scored with ``_apply``;
    3. the 1000 per-mille quantiles of those scores are tried as cutoffs
       and the first one with the highest F-score is stored in ``_co``.

    The sparse-point counter, the cutoff and its F-score are printed.

    Raises:
      AssertionError: a sampled label is neither 0 nor 1, or no cutoff
        yields a defined F-score.
    """
    from bisect import bisect_left

    self._needs_finalization = False

    covsample = np.array(self._sample_x)
    # np.cov collapses to a 0-d array for a single feature, which the
    # matrix inverse rejects; atleast_2d is a no-op for 2+ features.
    cov = np.atleast_2d(np.cov(covsample.T))
    self._icov = LA.inv(cov)

    sample = zip(
      self._sample_c,
      self._sample_b,
      self._sample_x,
      self._sample_x_,
    )
    scores = [self._apply([c, b, x, x_]) for (c, b, x, x_) in sample]

    sorted_scores = sorted(scores)

    cutoffs = []
    for idx in range(0, 1000):
      ratio = float(idx) / 1000.0
      cutoffs.append(
        sorted_scores[int(float(len(sorted_scores)) * ratio)]
      )

    # Split the scores by label once, so that the confusion counts of each
    # cutoff come from a binary search instead of a pass over all rows.
    pos_scores = []
    neg_scores = []
    for y, score in zip(self._sample_y, scores):
      if y == 1:
        pos_scores.append(score)
      else:
        assert y == 0
        neg_scores.append(score)
    pos_scores.sort()
    neg_scores.sort()

    max_fscore = None
    max_fscore_coidx = None

    for coidx, co in enumerate(cutoffs):

      # A row is predicted positive when score >= cutoff.
      tp = len(pos_scores) - bisect_left(pos_scores, co)
      fp = len(neg_scores) - bisect_left(neg_scores, co)
      fn = len(pos_scores) - tp

      if (tp + fp) <= 0:
        continue

      if (tp + fn) <= 0:
        continue

      precision = float(tp) / float(tp + fp)
      recall = float(tp) / float(tp + fn)

      if (precision + recall) <= 0.0:
        continue

      fscore = 2.0 * ((precision * recall) / (precision + recall))

      # Strict comparison: the first cutoff reaching the maximum wins.
      if (max_fscore is None) or (fscore > max_fscore):
        max_fscore = fscore
        max_fscore_coidx = coidx

    assert max_fscore_coidx is not None
    self._co = cutoffs[max_fscore_coidx]

    print(self._sparse_points)
    print(self._co)
    print(max_fscore)


  def __call__(self, row):
    """Preprocess an unlabelled row ``(c, b, x)`` and score it.

    The row goes through the same feature extraction, discretization and
    feature selection as in ``train``; the resulting lengths must match
    the ones recorded for the model (they are printed if they do not).

    Returns:
      Whatever ``_apply`` returns for the preprocessed row.
    """
    cat, binary, cont = row

    cat = self._fsel.apply_c(self._catfe(cat))
    binary = self._fsel.apply_b(self._binfe(binary))

    cont = self._contfe(cont)
    cont_disc = self._fdisc(cont)

    cont = self._fsel.apply_x(cont)
    cont_disc = self._fsel.apply_x(cont_disc)

    try:
      for expected, features in (
        (self._len_c, cat),
        (self._len_b, binary),
        (self._len_x, cont),
        (self._len_x, cont_disc),
      ):
        assert expected == len(features)
    except:
      print(self._len_c, self._len_b, self._len_x)
      raise

    return self._apply((cat, binary, cont, cont_disc))
# Example #2
# 0
class KyotoCabinetGraph(BaseGraph):
    """Graph storage backend that keeps its data in a Kyoto Cabinet file.

    Ordinary values are serialized with msgpack.  Lists of integers use a
    cheaper text encoding instead: each item is stored as ``'|' + str(item)``
    so that appending is a plain ``DB.append`` with no deserialization, and
    an empty list is the empty string.

    NOTE(review): the ``encoding=`` keyword passed to msgpack was removed in
    msgpack 1.0 -- confirm the version this project pins.
    """

    def __init__(self, path):
        """Open the database at ``path``, creating it if needed.

        Raises:
            GrapheekDataKyotoCabinetInitFailureException: the database
                could not be opened.
        """
        # create the database object
        self._path = path
        self._db = DB()
        # open the database
        if not self._db.open(path, DB.OREADER | DB.OWRITER | DB.OCREATE):
            raise GrapheekDataKyotoCabinetInitFailureException(
                str(self._db.error()))
        super(KyotoCabinetGraph, self).__init__()
        self._ensure_prepared()
        self._closed = False

    # Start method overriding :

    def _db_close(self):
        """Close the database; calling it again is a no-op."""
        if not self._closed:
            self._db.close()
            # Record the close so the database is not closed twice.
            self._closed = True

    def _transaction_begin(self):
        """Start a database transaction and return True as its handle."""
        self._db.begin_transaction()
        return True

    def _transaction_commit(self, txn):
        """Commit the current database transaction."""
        self._db.end_transaction(True)

    def _transaction_rollback(self, txn):
        """Abort the current database transaction."""
        self._db.end_transaction(False)

    def _has_key(self, key):
        """Return True when ``DB.check`` reports a non-negative result."""
        return self._db.check(key) >= 0

    def _get(self, txn, key):
        """Return the deserialized value of ``key``, or UNDEFINED."""
        raw_data = self._db.get(key)
        if raw_data is None:
            return UNDEFINED  # Not returning None, as None is a valid value
        return msgpack.loads(raw_data, encoding='utf8')

    def _bulk_get(self, txn, keys):
        """Return a dict of deserialized values for the ``keys`` found."""
        result = {}
        for key, raw_data in self._db.get_bulk(keys).items():
            if PYTHON2:  # pragma : no cover
                k = key
            else:  # pragma : no cover
                k = str(key, encoding='utf8')
            result[k] = msgpack.loads(raw_data, encoding='utf8')
        return result

    def _set(self, txn, key, value):
        """Serialize and store ``value`` under ``key``.

        Raises:
            GrapheekDataKyotoCabinetException: the write failed.
        """
        res = self._db.set(key, msgpack.dumps(value, encoding='utf8'))
        if not (res):  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _bulk_set(self, txn, updates):
        """Serialize and store every item of the ``updates`` dict.

        Raises:
            GrapheekDataKyotoCabinetException: the write failed.
        """
        dic = {}
        for key, value in updates.items():
            dic[key] = msgpack.dumps(value, encoding='utf8')
        res = self._db.set_bulk(dic)
        if res == -1:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _remove(self, txn, key):
        """Remove ``key``; removing a missing key is not an error."""
        # Contrary to LocalMemoryGraph implementation, it is not needed to
        # wrap key removal in try.. except because KyotoCabinet only sends
        # "False" when the key does not exist.
        # Thus the _remove method is idempotent (cf. the comment on the
        # LocalMemoryGraph _remove method).
        self._db.remove(key)

    def _bulk_remove(self, txn, keys):
        """Remove all ``keys``.

        Raises:
            GrapheekDataKyotoCabinetException: the removal failed.
        """
        res = self._db.remove_bulk(list(keys))
        if res == -1:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _remove_prefix(self, txn, prefix):
        """Remove every key that starts with ``prefix``."""
        keys = self._db.match_prefix(prefix)
        self._db.remove_bulk(keys)

    # overriding list methods
    # looks like a bucket of hacks, and yes indeed it is :)
    # btw, it REALLY improves performance compared to the default
    # implementation which, in the case of KyotoCabinet, would involve a
    # msgpack deserialization followed by a serialization

    @staticmethod
    def _kc_split_lst(raw):
        """Split a stored list string into its item tokens (text)."""
        if not PYTHON2:  # pragma : no cover
            raw = str(raw, encoding='utf8')
        # Every item is preceded by '|', so the part before the first
        # separator is dropped.  Empty tokens (a bare '|' left behind by an
        # earlier empty write) are skipped.
        return [token for token in raw.split('|')[1:] if token]

    @staticmethod
    def _kc_join_lst(items):
        """Encode ``items`` as a stored list string ('' when empty)."""
        return ''.join('|' + str(item) for item in items)

    def _init_lst(self, txn, key):
        """Create an empty list under ``key``.

        Raises:
            GrapheekDataKyotoCabinetException: the write failed.
        """
        res = self._db.set(key, '')
        if not (res):  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _get_lst(self, txn, key):
        """Return the list of ints stored under ``key``, or UNDEFINED."""
        value = self._db.get(key)
        if value is None:
            return UNDEFINED
        return [int(token) for token in self._kc_split_lst(value)]

    def _set_lst(self, txn, key, values):
        """Replace the list stored under ``key`` with ``values``.

        Raises:
            GrapheekDataKyotoCabinetException: the write failed.
        """
        # An empty list is stored as '' (not '|'), which reads back as [].
        res = self._db.set(key, self._kc_join_lst(values))
        if not (res):  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _bulk_get_lst(self, txn, keys):
        """Return one list of ints per key; missing keys give ``[]``."""
        dic_values = self._db.get_bulk(keys)
        results = []
        for key in keys:
            if PYTHON2:  # pragma : no cover
                values = dic_values.get(key, UNDEFINED)
            else:  # pragma : no cover
                values = dic_values.get(bytes(key, encoding='utf8'), UNDEFINED)
            if values == UNDEFINED:
                results.append([])
            else:
                results.append(
                    [int(token) for token in self._kc_split_lst(values)])
        return results

    def _append_to_lst(self, txn, key, value):
        """Append a single ``value`` to the list stored under ``key``."""
        self._db.append(key, '|' + str(value))

    def _bulk_append_to_lst(self, txn, key, values):
        """Append all ``values`` to the list stored under ``key``."""
        # With no values this appends '' instead of a stray '|'.
        self._db.append(key, self._kc_join_lst(values))

    def _remove_from_lst(self, txn, key, value):
        """Remove the first occurrence of ``value`` from the list at ``key``.

        Raises:
            ValueError: the key does not exist or ``value`` is not in the
                list.
            GrapheekDataKyotoCabinetException: the write failed.
        """
        old = self._db.get(key)
        if old is None:
            raise ValueError("list.remove(x): x not in list")
        items = self._kc_split_lst(old)
        token = str(value)
        # Whole items are compared: a substring replace of '|1' would also
        # hit the start of '|12' and corrupt the list.
        if token not in items:
            raise ValueError("list.remove(x): x not in list")
        # Caution : we are only removing ONE occurrence
        # This is voluntary
        # For instance, if lst contains neighbour nodes, we need to remove
        # only one occurrence because the current entity and the neighbour
        # node can be linked multiple times
        items.remove(token)
        res = self._db.set(key, self._kc_join_lst(items))
        if not (res):  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _bulk_remove_from_lst(self, txn, key, values):
        """Remove one occurrence of each of ``values`` from the list.

        Values that are not in the list are skipped.

        Raises:
            AssertionError: ``values`` is empty.
            ValueError: the key does not exist or none of ``values`` was
                in the list.
            GrapheekDataKyotoCabinetException: the write failed.
        """
        assert (len(values))
        old = self._db.get(key)
        if old is None:
            raise ValueError("list.remove(x): x not in list")
        items = self._kc_split_lst(old)
        removed = 0
        for value in values:
            token = str(value)
            if token in items:
                items.remove(token)
                removed += 1
        # Counting removals avoids comparing the new text with the raw
        # stored bytes, which never compare equal on Python 3.
        if not removed:  # pragma : no cover
            raise ValueError("list.remove(x): x not in list")
        res = self._db.set(key, self._kc_join_lst(items))
        if not (res):  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res