Example #1
0
    def _apply_weights(self, neighbor_vals, neighbor_weights):
        """Combine neighbor values into one weighted estimate per sample.

        Parameters
        ----------
        neighbor_vals : ndarray
            Either 2-D ``(n_samples, n_neighbors)`` for a single target, or
            3-D ``(n_samples, n_neighbors, n_targets)`` for several targets.
        neighbor_weights : ndarray
            Weights handed unchanged to ``np.ma.average`` / ``weighted_mode``
            along the neighbor axis (axis 1).

        Returns
        -------
        X : ndarray
            For 2-D input, whatever ``np.ma.average`` (``measure == "mean"``)
            or ``weighted_mode`` (any other measure) returns for axis 1.
            For 3-D input, a float array of shape ``(n_samples, n_targets)``.
        """
        use_mean = self.measure == "mean"

        # single target: reduce directly over the neighbor axis
        if neighbor_vals.ndim == 2:
            if use_mean:
                return np.ma.average(neighbor_vals,
                                     weights=neighbor_weights,
                                     axis=1)
            X, _ = weighted_mode(neighbor_vals, neighbor_weights, axis=1)
            return X

        # multi-target: reduce every target column independently
        n_samples, n_targets = neighbor_vals.shape[0], neighbor_vals.shape[2]
        X = np.zeros((n_samples, n_targets))
        for i in range(neighbor_vals.shape[-1]):
            target_vals = neighbor_vals[:, :, i]
            if use_mean:
                X[:, i] = np.ma.average(target_vals,
                                        weights=neighbor_weights,
                                        axis=1)
            else:
                mode, _ = weighted_mode(target_vals, neighbor_weights, axis=1)
                # the mode may come back with the reduced axis kept as a
                # length-1 dimension; flatten it so it fits the 1-D column
                X[:, i] = np.ravel(mode)

        return X
Example #2
0
    def predict(self, X):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        X : array
            A 2-D array representing the test points (a 1-D input is
            promoted to a single row).

        Returns
        -------
        labels : array
            Flat integer array of class labels, one per data sample.
        """
        X = np.atleast_2d(X)

        neigh_dist, neigh_ind = self.kneighbors(X)
        pred_labels = self._y[neigh_ind]

        weights = _get_weights(neigh_dist, self.weights)

        # no weights -> plain mode over the neighbors, otherwise weighted vote
        if weights is None:
            mode, _ = smart_mode(pred_labels, axis=1)
        else:
            mode, _ = weighted_mode(pred_labels, weights, axis=1)

        # ``np.int`` was only an alias of the builtin ``int`` and no longer
        # exists in current NumPy releases; the builtin is equivalent.
        return mode.flatten().astype(int)
def simKNNpredict(X,Y,Xtest,Ytest,L,k=3,weighted=True,method='brute'):
    """Predict labels with a similarity-based k-nearest-neighbour vote.

    Both data sets are mapped through ``L`` and the similarity of a test
    point to every training point is their inner product in that space
    (larger means closer).

    Parameters
    ----------
    X : array-like or np.matrix
        Training samples, one per row.
    Y : ndarray
        Training labels, indexable by an integer index array.
    Xtest : array-like or np.matrix
        Test samples, one per row.
    Ytest : ndarray
        Only its ``dtype`` is used, for the output buffer.
    L : matrix-like
        Linear transform applied to both data sets.
    k : int
        Number of neighbours; must not exceed the number of training samples.
    weighted : bool
        If true, votes are weighted by similarity; otherwise every neighbour
        counts equally.
    method : str
        ``'tree'`` is not implemented: a notice is printed and the brute-force
        search is used instead.

    Returns
    -------
    y_pred : ndarray
        One predicted label per test sample.
    """
    if not isinstance(X, np.matrix):
        X = np.matrix(X)
    if not isinstance(Xtest, np.matrix):
        Xtest = np.matrix(Xtest)

    # transform data; test points end up as columns
    X = X * L.T
    Xtest = L * Xtest.T
    n = Xtest.shape[1]
    y_pred = np.empty(n, dtype=Ytest.dtype)

    if method == 'tree':
        # previously this returned the uninitialised buffer; fall back to the
        # brute-force search so the caller still gets real predictions
        print ('not implemented')

    for x in range(n):
        # ravel (not squeeze) keeps a 1-D vector even with one training point
        w = np.asarray(X * Xtest[:, x]).ravel()
        # kth = k - 1 puts the k most similar points in the first k slots and
        # stays in range when k equals the number of training points
        neighb = np.argpartition(-w, k - 1)[:k]

        votes = w[neighb] if weighted else np.ones(len(neighb))
        lmode, _ = weighted_mode(Y[neighb], votes)
        y_pred[x] = np.ravel(lmode)[0]
    return y_pred
Example #4
0
    def predict_loo(self):
        """Leave-one-out class predictions for the training data.

        ``kneighbors`` is called without query points and each output column
        is decided by a (possibly weighted) vote among the returned neighbors.

        Returns
        -------
        y : ndarray of shape (n_queries,) or (n_queries, n_outputs)
            Class labels for each training data sample.
        """
        neigh_dist, neigh_ind = self.kneighbors()

        # treat the single-output case as one column so one loop serves both
        if self.outputs_2d_:
            class_sets, targets = self.classes_, self._y
        else:
            class_sets, targets = [self.classes_], self._y.reshape((-1, 1))

        weights = _get_weights(neigh_dist, self.weights)

        y_pred = np.empty((len(neigh_dist), len(class_sets)),
                          dtype=class_sets[0].dtype)
        for col, col_classes in enumerate(class_sets):
            neighbor_codes = targets[neigh_ind, col]
            if weights is not None:
                winners, _ = weighted_mode(neighbor_codes, weights, axis=1)
            else:
                winners, _ = stats.mode(neighbor_codes, axis=1)

            # winners are positions into this output's class array
            positions = np.asarray(winners.ravel(), dtype=np.intp)
            y_pred[:, col] = col_classes.take(positions)

        return y_pred if self.outputs_2d_ else y_pred.ravel()
def get_bin_indices(hits, bins, Rmax=207):
    """Bin hits into voxels and aggregate energy and labels per voxel.

    Parameters
    ----------
    hits : pandas.DataFrame
        Needs columns ``x``, ``y``, ``z``, ``energy`` and ``event_id``;
        ``segclass`` and ``binclass`` are optional and filled with ``-1``
        when missing.
    bins : tuple of three arrays
        Bin edges ``(binsX, binsY, binsZ)``.
    Rmax : number
        Hits with ``x**2 + y**2 >= Rmax**2`` are discarded.

    Returns
    -------
    pandas.DataFrame
        One row per ``(xbin, ybin, zbin, event_id)`` with the summed
        ``energy``, the energy-weighted mode of ``segclass`` and the first
        unique ``binclass`` value of the group.
    """
    segclass = 'segclass'
    binclass = 'binclass'
    binsX, binsY, binsZ = bins

    fiducial_cut = (hits.x**2 + hits.y**2) < Rmax**2
    boundary_cut = (hits.x >= binsX.min()) & (hits.x <= binsX.max())\
                 & (hits.y >= binsY.min()) & (hits.y <= binsY.max())\
                 & (hits.z >= binsZ.min()) & (hits.z <= binsZ.max())

    hits_act = hits[fiducial_cut & boundary_cut].reset_index(drop=True)

    def _bin_index(values, edges):
        # include_lowest=True: the boundary cut keeps hits lying exactly on
        # the lowest edge; without it pd.cut maps them to NaN and the int
        # cast fails
        return pd.cut(values, edges,
                      labels=np.arange(0, len(edges) - 1),
                      include_lowest=True).astype(int)

    hits_act = hits_act.assign(xbin=_bin_index(hits_act.x, binsX),
                               ybin=_bin_index(hits_act.y, binsY),
                               zbin=_bin_index(hits_act.z, binsZ))
    hits_act.event_id = hits_act.event_id.astype(np.int64)

    if segclass not in hits.columns:
        hits_act = hits_act.assign(segclass=-1)
    if binclass not in hits.columns:
        hits_act = hits_act.assign(binclass=-1)

    def _summarise(df):
        # one output row per voxel and event
        return pd.Series({
            'energy': df['energy'].sum(),
            segclass: int(weighted_mode(df[segclass], df['energy'])[0][0]),
            binclass: int(df[binclass].unique()[0]),
        })

    # outputs df with bin indices and energy, and the optional labels
    out = hits_act.groupby(['xbin', 'ybin', 'zbin', 'event_id']).apply(
        _summarise).reset_index()
    out[segclass] = out[segclass].astype(int)
    out[binclass] = out[binclass].astype(int)
    return out
Example #6
0
def infer(query,
          samples=None,
          db=None,
          sample_db_fn=None,
          depth=None,
          d_type='d1'):
    ''' Infer a query and return its AP together with a predicted class.

    arguments
    query       : a dict with three keys, see the template
                  {
                    'img': <path_to_img>,
                    'cls': <img class>,
                    'hist' <img histogram>
                  }
    samples     : a list of {
                              'img': <path_to_img>,
                              'cls': <img class>,
                              'hist' <img histogram>
                            }
    db          : an instance of class Database
    sample_db_fn: a function making samples, must be given when db is given
    depth       : number of closest results kept; only applied when it is
                  truthy and not larger than the number of results
    d_type      : distance type

    returns
    (ap, pred)  : ap is computed by AP() on the (possibly truncated) results;
                  pred is the inverse-distance weighted mode of the kept
                  classes, or None when depth was not applied
    '''

    assert samples is not None or (
        db is not None and sample_db_fn is not None
    ), "need to give either samples or db plus sample_db_fn"
    if db:
        samples = sample_db_fn(db)

    q_img, q_cls, q_hist = query['img'], query['cls'], query['hist']
    results = []
    for sample in samples:
        s_img, s_cls, s_hist = sample['img'], sample['cls'], sample['hist']
        # never match the query against itself
        if q_img == s_img:
            continue
        results.append({
            'dis': distance(q_hist, s_hist, d_type=d_type),
            'cls': s_cls,
            'img': s_img
        })

    results = sorted(results, key=lambda x: x['dis'])

    # previously unbound (UnboundLocalError at return) when depth was skipped
    pred = None
    if depth and depth <= len(results):
        results = results[:depth]
        print(q_img)
        list_im = [sub['img'] for sub in results]
        print(list_im)
        pred = [sub['cls'] for sub in results]
        # force float so integer distances do not truncate to 0 on inversion
        weig = np.asarray([sub['dis'] for sub in results], dtype=float)
        weig = np.reciprocal(weig)
        pred2 = weighted_mode(pred, weig)
        # strip the "['" and "']" that array_str puts around the value
        pred = np.array_str(pred2[0])[2:-2]

    ap = AP(q_cls, results, sort=False)

    return ap, pred
    def predict_classification(train_data, train_target, test_row,
                               num_neighbors, p, weight_mode):
        """Classify ``test_row`` by a weighted vote among its neighbors.

        ``get_neighbors`` supplies the neighbor values and their weights; the
        prediction is the weighted mode of those values, as a Python ``int``.
        """
        neighbors, neighbors_w = get_neighbors(train_data, train_target,
                                               test_row, num_neighbors, p,
                                               weight_mode)

        modes, _ = weighted_mode(neighbors, neighbors_w)
        # .item() extracts the single modal value explicitly; calling int()
        # on a 1-element array is deprecated in recent NumPy
        return int(modes.item())
Example #8
0
def test_uniform_weights():
    """With uniform weights, weighted_mode must agree with stats.mode."""
    # fixed seed so a failure is reproducible
    rng = np.random.RandomState(0)
    x = rng.randint(10, size=(10, 5))
    weights = np.ones(x.shape)

    for axis in (None, 0, 1):
        mode, score = stats.mode(x, axis=axis)
        mode2, score2 = weighted_mode(x, weights, axis=axis)

        # compare flattened values: whether the reduced axis is kept as a
        # length-1 dimension differs between implementations and versions
        assert np.array_equal(np.ravel(mode), np.ravel(mode2))
        assert np.array_equal(np.ravel(score), np.ravel(score2))
Example #9
0
 def predict(self, X):
     """Distance-weighted K-nearest-neighbour prediction for the rows of X.

     Uses the stored ``self.data_`` pair, keeps the ``self.K`` smallest
     distances per query row and returns the flattened weighted mode of the
     corresponding training labels.
     """
     train_X, train_Y = self.data_
     dist = self.pairwise_distance(train_X, X)
     assert np.all(dist >= 0)

     # column indices of the K closest training points for every query row
     nearest = np.argsort(dist, axis=1)[:, :self.K]
     row_ids = np.arange(len(X))[:, None]
     nearest_dist = dist[row_ids, nearest]
     nearest_labels = train_Y[nearest]

     vote_weights = _get_weights(nearest_dist, 'distance')  # Weighted KNN
     winners, _ = weighted_mode(nearest_labels, vote_weights, axis=1)
     return winners.reshape(-1)
def test_uniform_weights():
    """With uniform weights, weighted_mode must agree with stats.mode."""
    rng = np.random.RandomState(0)
    x = rng.randint(10, size=(10, 5))
    weights = np.ones(x.shape)

    for axis in (None, 0, 1):
        mode, score = stats.mode(x, axis=axis)
        mode2, score2 = weighted_mode(x, weights, axis=axis)

        # compare flattened values: whether the reduced axis is kept as a
        # length-1 dimension differs between implementations and versions
        assert_array_equal(np.ravel(mode), np.ravel(mode2))
        assert_array_equal(np.ravel(score), np.ravel(score2))
Example #11
0
    def predict(self, X):
        """Predict the class labels for the provided data

        Parameters
        ----------
        X : sktime-format pandas dataframe or array-like, shape (n_query,
        n_features), \
                or (n_query, n_indexed) if metric == 'precomputed'
            Test samples.

        Returns
        -------
        y : array of shape [n_samples] or [n_samples, n_outputs]
            Class labels for each data sample.
        """
        self.check_is_fitted()

        # Temporarily swap the code object of check_array (or of the function
        # it wraps) for the time-series aware version.
        patched = getattr(check_array, '__wrapped__', check_array)
        original_code = patched.__code__
        patched.__code__ = _check_array_ts.__code__

        try:
            neigh_dist, neigh_ind = self.kneighbors(X)
            classes_ = self.classes_
            _y = self._y
            if not self.outputs_2d_:
                _y = self._y.reshape((-1, 1))
                classes_ = [self.classes_]

            n_outputs = len(classes_)
            n_samples = X.shape[0]
            weights = _get_weights(neigh_dist, self.weights)

            y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype)
            for k, classes_k in enumerate(classes_):
                if weights is None:
                    mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
                else:
                    mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)

                # modes are positions into this output's class array
                mode = np.asarray(mode.ravel(), dtype=np.intp)
                y_pred[:, k] = classes_k.take(mode)

            if not self.outputs_2d_:
                y_pred = y_pred.ravel()
        finally:
            # always undo the patch, even if prediction raised; otherwise
            # check_array would stay replaced for the rest of the process
            patched.__code__ = original_code

        return y_pred
Example #12
0
def test_uniform_weights():
    """With uniform weights, weighted_mode must agree with stats.mode."""
    rng = np.random.RandomState(0)
    x = rng.randint(10, size=(10, 5))
    weights = np.ones(x.shape)

    for axis in (None, 0, 1):
        mode, score = stats.mode(x, axis=axis)
        mode2, score2 = weighted_mode(x, weights, axis=axis)

        # compare flattened values: whether the reduced axis is kept as a
        # length-1 dimension differs between implementations and versions
        assert_array_equal(np.ravel(mode), np.ravel(mode2))
        assert_array_equal(np.ravel(score), np.ravel(score2))
Example #13
0
    def predict(self, x):
        """Predict the label of the given point

        Parameters
        ----------
        x : numpy.ndarray
            Point to be predicted. Array of shape (1, m) where m is the number of features

        Returns
        -------
        predict_label : int
            Predicted label/class
        """
        x = x if isinstance(x, np.ndarray) else np.array(x)
        n_closest = self.n_neighbours

        # build the metric object, forwarding extra parameters when present
        metric_args = () if self.metric_params is None else self.metric_params
        dist_metric = DistMetric.get_metric(self.metric, *metric_args)
        dist = dist_metric.dist(self.X_train_, x)

        # None means an unweighted vote; otherwise one weight per distance
        weights = _get_weights(dist, self.weights)

        if weights is not None:
            # sort by distance, keep the closest labels and their weights
            ranked = sorted(zip(dist, self.classes_, weights))
            _, ranked_labels, ranked_weights = zip(*ranked)
            nearest_labels = np.array(ranked_labels)[:n_closest]
            nearest_weights = np.array(ranked_weights)[:n_closest]
            return weighted_mode(nearest_labels, nearest_weights)[0][0]

        # unweighted: the most common label among the closest points wins
        ranked = sorted(zip(dist, self.classes_))
        _, ranked_labels = zip(*ranked)
        nearest_labels = np.array(ranked_labels)[:n_closest]
        return Counter(nearest_labels).most_common(1)[0][0]
Example #14
0
 def predict(self, X):
     """Predict labels for X by voting among the neighbours from the index.

     X is cast to contiguous float32 (a 1-D input becomes a single row),
     ``self.index.search`` returns distances ``D`` and neighbour indices
     ``I``, and the labels of those neighbours are combined by a plain or
     weighted mode depending on ``self.weights``.
     """
     X = X.astype(np.float32)
     X = np.ascontiguousarray(X)
     if X.ndim == 1:
         X = X[np.newaxis]
     D, I = self.index.search(X, self.n_neighbors)

     outputs = np.asarray(self.labels)[I]
     # Drop only a trailing singleton label axis. A blanket np.squeeze also
     # removed the sample axis for a single query (or the neighbour axis for
     # n_neighbors == 1), which made the axis=1 reduction below fail.
     if outputs.ndim > I.ndim and outputs.shape[-1] == 1:
         outputs = outputs[..., 0]

     weights = _get_weights(D, self.weights)
     if weights is None:
         y_pred, _ = mode(outputs, axis=1)
     else:
         y_pred, _ = weighted_mode(outputs, weights, axis=1)
     return y_pred
Example #15
0
    def predict(self, X, idx=None):
        """Predict integer class labels for X from its nearest neighbors.

        ``X`` and ``idx`` are forwarded to ``self.kneighbors``; the labels of
        the returned neighbors are combined per query row by a plain mode
        (no weights) or a weighted mode.
        """
        neigh_dist, neigh_ind = self.kneighbors(X, idx)
        pred_labels = self._y[neigh_ind]

        weights = _get_weights(neigh_dist, self.weights)

        if weights is None:
            mode, _ = smart_mode(pred_labels, axis=1)
        else:
            # vote across the neighbors of each row, like the unweighted
            # branch; without axis=1 the default axis (0) would be reduced
            mode, _ = weighted_mode(pred_labels, weights, axis=1)

        # ``np.int`` was only an alias of the builtin ``int`` and no longer
        # exists in current NumPy releases
        return mode.flatten().astype(int)
Example #16
0
def test_random_weights():
    """Random weights: the boosted value must win with a predictable score."""
    # set this up so that each row should have a weighted mode of 6,
    # with a score that is easily reproduced
    mode_result = 6

    # fixed seed so a failure is reproducible
    rng = np.random.RandomState(0)
    x = rng.randint(mode_result, size=(100, 10))
    w = rng.random_sample(x.shape)

    # the first five columns hold the target value with weights above 1
    x[:, :5] = mode_result
    w[:, :5] += 1

    mode, score = weighted_mode(x, w, axis=1)

    assert np.all(mode == mode_result)
    # the score is a floating-point sum; exact equality is order-sensitive
    np.testing.assert_array_almost_equal(score.ravel(), w[:, :5].sum(1))
Example #17
0
    def predict(self, X, idx=None):
        """Predict integer class labels for X from its nearest neighbors.

        ``X`` and ``idx`` are forwarded to ``self.kneighbors``; the labels of
        the returned neighbors are combined per query row by ``stats.mode``
        (no weights) or by a weighted mode.
        """
        neigh_dist, neigh_ind = self.kneighbors(X,idx)
        pred_labels = self._y[neigh_ind]

        weights = _get_weights(neigh_dist, self.weights)

        if weights is None:
            mode, _ = stats.mode(pred_labels, axis=1)
        else:
            # Shuffle the neighbor columns, applying the same permutation to
            # labels and weights so each label keeps its weight. The original
            # referenced undefined names (n_neighbors, ind, axis) here.
            # NOTE(review): whether column order influences tie-breaking
            # depends on weighted_mode's implementation - confirm.
            perm = np.random.permutation(pred_labels.shape[1])
            mode, _ = weighted_mode(pred_labels[:, perm],
                                    weights[:, perm],
                                    axis=1)

        # ``np.int`` no longer exists in current NumPy releases
        return mode.flatten().astype(int)
Example #18
0
def test_random_weights():
    """Random weights: the boosted value must win with a predictable score."""
    # Every row gets the value 6 in its first five columns, each with a
    # weight above 1, so 6 is the weighted mode and its score is the sum of
    # those five weights.
    expected_mode = 6
    n_boosted = 5

    rng = np.random.RandomState(0)
    values = rng.randint(expected_mode, size=(100, 10))
    vote_weights = rng.random_sample(values.shape)

    values[:, :n_boosted] = expected_mode
    vote_weights[:, :n_boosted] += 1

    mode, score = weighted_mode(values, vote_weights, axis=1)
    expected_score = vote_weights[:, :n_boosted].sum(1)

    assert_array_equal(mode, expected_mode)
    assert_array_almost_equal(score.ravel(), expected_score)
Example #19
0
def test_random_weights():
    """Random weights: the boosted value must win with a predictable score."""
    # Every row gets the value 6 in its first five columns, each with a
    # weight above 1, so 6 is the weighted mode and its score is the sum of
    # those five weights.
    expected_mode = 6
    n_boosted = 5

    rng = np.random.RandomState(0)
    values = rng.randint(expected_mode, size=(100, 10))
    vote_weights = rng.random_sample(values.shape)

    values[:, :n_boosted] = expected_mode
    vote_weights[:, :n_boosted] += 1

    mode, score = weighted_mode(values, vote_weights, axis=1)
    expected_score = vote_weights[:, :n_boosted].sum(1)

    np.testing.assert_array_equal(mode, expected_mode)
    np.testing.assert_array_almost_equal(score.ravel(), expected_score)
Example #20
0
def majority_vote(scores, n_classes=2, weights=None):
    """Merge the discrete outputs of several estimators by (weighted)
    majority vote, one decision per sample.

    Parameters
    ----------
    scores : numpy array of shape (n_samples, n_estimators)
        Score matrix from multiple estimators on the same samples.

    n_classes : int, optional (default=2)
        The number of distinct values expected in the scores matrix.

    weights : numpy array of shape (1, n_estimators)
        Per-estimator voting weights. Every estimator counts equally when
        omitted.

    Returns
    -------
    combined_scores : numpy array of shape (n_samples, )
        The winning value for each sample.

    """
    scores = check_array(scores)

    # majority voting only makes sense for discrete scores
    check_classification_targets(scores)
    assert (len(np.unique(scores)) == n_classes)

    n_samples = scores.shape[0]
    n_estimators = scores.shape[1]

    if weights is None:
        # default: one equal vote per estimator
        weights = np.ones([1, n_estimators])
    else:
        assert_equal(scores.shape[1], weights.shape[1])

    vote_results = np.zeros([n_samples, ])
    for row_idx in range(n_samples):
        winners = weighted_mode(scores[row_idx, :], weights)[0]
        vote_results[row_idx] = winners[0]

    return vote_results.ravel()
Example #21
0
    def predict(self, X):
        """Predict the class labels for the provided data

        Parameters
        ----------
        X : array-like, shape (n_ts, sz, d)
            Test samples.

        Returns
        -------
        array
            The (possibly weighted) modal neighbor label for each test
            sample.
        """
        X_ = to_time_series_dataset(X)
        neigh_dist, neigh_ind = self.kneighbors(X_)

        weights = _get_weights(neigh_dist, self.weights)
        neighbor_labels = self._fit_y[neigh_ind]

        if weights is None:
            mode, _ = stats.mode(neighbor_labels, axis=1)
        else:
            mode, _ = weighted_mode(neighbor_labels, weights, axis=1)

        # Newer SciPy drops the reduced axis from stats.mode by default; in
        # that case the result is already one entry per sample and indexing
        # [:, 0] would fail or pick the wrong axis.
        if mode.ndim < neighbor_labels.ndim:
            return mode
        return mode[:, 0]
Example #22
0
    def predict(self, X, E):  #IY
        """Predict class labels for the provided data, forwarding the data
        errors to the neighbor search (chi^2 distances).

        Parameters
        ----------
        X : array-like, shape (n_query, n_features), \
                or (n_query, n_indexed) if metric == 'precomputed'
            Test samples.
        E : array-like, same shape as X, \
            Data errors
        Returns
        -------
        y : array of shape [n_samples] or [n_samples, n_outputs]
            Class labels for each data sample.
        """
        X = check_array(X, accept_sparse='csr')
        neigh_dist, neigh_ind = self.kneighbors(X,E=E)

        # treat the single-output case as one column so one loop serves both
        if self.outputs_2d_:
            class_sets, targets = self.classes_, self._y
        else:
            class_sets, targets = [self.classes_], self._y.reshape((-1, 1))

        weights = _get_weights(neigh_dist, self.weights)

        y_pred = np.empty((X.shape[0], len(class_sets)),
                          dtype=class_sets[0].dtype)
        for col, col_classes in enumerate(class_sets):
            neighbor_codes = targets[neigh_ind, col]
            if weights is not None:
                winners, _ = weighted_mode(neighbor_codes, weights, axis=1)
            else:
                winners, _ = stats.mode(neighbor_codes, axis=1)

            # winners are positions into this output's class array
            positions = np.asarray(winners.ravel(), dtype=np.intp)
            y_pred[:, col] = col_classes.take(positions)

        return y_pred if self.outputs_2d_ else y_pred.ravel()
Example #23
0
    def predict(self, X):
        """Predict the class labels for the provided data

        Parameters
        ----------
        X : array-like, shape (n_ts, sz, d)
            Test samples.

        Returns
        -------
        array
            The (possibly weighted) modal neighbor label for each test
            sample.
        """
        # with the "min_dist" metric and several variables X is used as is
        if self.metric == "min_dist" and self.variables_size > 1:
            X_ = X
        else:
            X_ = to_time_series_dataset(X,self.variables_size)

        neigh_dist, neigh_ind = self.kneighbors(X_,self.multivariate_output,None,True)

        weights = _get_weights(neigh_dist, self.weights)
        neighbor_labels = self._fit_y[neigh_ind]

        if weights is None:
            mode, _ = stats.mode(neighbor_labels, axis=1)
        else:
            mode, _ = weighted_mode(neighbor_labels, weights, axis=1)

        # Newer SciPy drops the reduced axis from stats.mode by default; in
        # that case the result is already one entry per sample and indexing
        # [:, 0] would fail or pick the wrong axis.
        if mode.ndim < neighbor_labels.ndim:
            return mode
        return mode[:, 0]
Example #24
0
def infer(query,
          samples=None,
          db=None,
          sample_db_fn=None,
          depth=None,
          d_type='d1'):
    """Rank samples by distance to a query; return its AP and a predicted class.

    Parameters
    ----------
    query : dict
        Must provide the keys ``'img'``, ``'cls'`` and ``'hist'``.
    samples : iterable of dict, optional
        Candidates with the same three keys as ``query``.
    db, sample_db_fn : optional
        When ``db`` is truthy, ``samples`` is replaced by
        ``sample_db_fn(db)``.
    depth : int, optional
        Number of closest results kept; only applied when it is truthy and
        not larger than the number of results.
    d_type : str
        Distance type forwarded to ``distance``.

    Returns
    -------
    (ap, pred)
        ``ap`` is computed by ``customAP`` on the (possibly truncated)
        results; ``pred`` is the inverse-distance weighted mode of the kept
        classes, or ``None`` when ``depth`` was not applied.
    """
    assert samples is not None or (
        db is not None and sample_db_fn
        is not None), "need to give either samples or db plus sample_db_fn"
    if db:
        samples = sample_db_fn(db)

    q_img, q_cls, q_hist = query['img'], query['cls'], query['hist']
    results = []
    for sample in samples:
        s_img, s_cls, s_hist = sample['img'], sample['cls'], sample['hist']
        # never match the query against itself
        if q_img == s_img:
            continue
        results.append({
            'dis': distance(q_hist, s_hist, d_type=d_type),
            'cls': s_cls,
            'img': s_img
        })
    results = sorted(results, key=lambda x: x['dis'])

    # previously unbound (UnboundLocalError at return) when depth was skipped
    pred = None
    if depth and depth <= len(results):
        results = results[:depth]
        pred = [sub['cls'] for sub in results]
        # force float so integer distances do not truncate to 0 on inversion
        weig = np.asarray([sub['dis'] for sub in results], dtype=float)
        weig = np.reciprocal(weig)
        pred2 = weighted_mode(pred, weig)
        # strip the "['" and "']" that array_str puts around the value
        pred = np.array_str(pred2[0])[2:-2]

    ap = customAP(q_cls, results, sort=False)

    return ap, pred
def main():
    """Evaluate an ensemble of models and save the confusion matrix.

    Command-line arguments select the data, the validation keys, the model
    types with their weight files, and how the per-model predictions are
    combined (``vote``, ``weight_vote`` or ``mean``). The confusion matrix
    accumulated over all validation batches is written with ``np.savetxt``
    to the ``--output`` path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, required=True, help='data')
    parser.add_argument('--valid-keys',
                        type=str,
                        required=True,
                        help='validation keys')
    parser.add_argument('--input-dim', default=40, type=int, help='feats dim')
    parser.add_argument('--seq-len',
                        default=500,
                        type=int,
                        help='input sequence length')
    parser.add_argument('--batch-size',
                        default=64,
                        type=int,
                        help='mini-batch size')
    parser.add_argument('--models', nargs='+', help='weights file')
    parser.add_argument('--model-types', nargs='+', help='model type')
    parser.add_argument('--output', type=str, help='output file')
    parser.add_argument('--filters',
                        type=int,
                        default=64,
                        help='number of filters')
    parser.add_argument('--groups',
                        type=int,
                        default=16,
                        help='number of groups')
    parser.add_argument('--mtl',
                        type=str,
                        default=None,
                        help='MTL training mode')
    parser.add_argument('--units',
                        type=int,
                        default=16,
                        help='number of LSTM cells')
    parser.add_argument('--lstm-depth',
                        type=int,
                        default=2,
                        help='number of LSTM layers')
    parser.add_argument('--norm', type=str, help='normalization file')
    parser.add_argument('--decision',
                        type=str,
                        default='vote',
                        help='decision method')
    # parsed as floats: the raw strings cannot be used as voting weights
    parser.add_argument('--weights',
                        nargs='+',
                        type=float,
                        help='voting weights')
    args = parser.parse_args()

    # one key per line; reading stops at the first empty line
    valid_keys = []
    with open(args.valid_keys, 'r') as f:
        for line in f:
            key = line.strip()
            if not key:
                break
            valid_keys.append(key)

    # build every requested model on a shared input and load its weights
    models = []
    inputs = Input((args.input_dim, args.seq_len, 1))
    for n, model_type in enumerate(args.model_types):
        if model_type == 'base':
            output = DCASE_CNN_2019()(inputs)
        elif model_type == 'cnn4':
            output = CNN4()(inputs)
        elif model_type == 'cnn8':
            output = CNN8()(inputs)
        elif model_type == 'gcnn':
            output = GCNN_GRU()(inputs)
        elif model_type == 'gcnn-lstm':
            output = GCNN_GRU(lstm=True)(inputs)
        elif model_type == 'vgg_fcn':
            output = VGG_FCN(init_filters=args.filters)(inputs)
        elif model_type == 'vgg_lstm':
            output = VGG_LSTM(units=args.units,
                              lstm_depth=args.lstm_depth,
                              init_filters=args.filters)(inputs)
        else:
            # previously an unknown type silently reused the last ``output``
            raise ValueError('unknown model type: {}'.format(model_type))
        model = Model(inputs, output)
        model.load_weights(args.models[n])
        models.append(model)

    if args.decision not in ('vote', 'weight_vote', 'mean'):
        parser.error('unknown decision method: {}'.format(args.decision))
    if args.decision == 'weight_vote' and (
            args.weights is None or len(args.weights) != len(models)):
        parser.error('weight_vote needs exactly one --weights value per model')

    validation_generator = ASCDataGenerator(args.data,
                                            keys=valid_keys,
                                            dim=(args.input_dim, args.seq_len),
                                            batch_size=args.batch_size,
                                            mode=args.mtl)
    norm_path = os.path.join(args.norm)
    validation_generator.load_norm(norm_path)

    # total number of batches
    per_epoch = len(validation_generator)

    csv = os.path.join(args.output)

    mat = np.zeros((ASC_CLASS, ASC_CLASS), dtype=int)
    # loop over every batch
    for n in range(per_epoch):
        # x_batch: input features, y_batch: ground-truth labels
        x_batch, y_batch = validation_generator[n]

        # stack the predictions of every model along a leading model axis
        preds = []
        for model in models:
            pred = model.predict(x_batch, x_batch.shape[0], verbose=1)
            preds.append(np.expand_dims(pred, axis=0))
        preds = np.concatenate(preds, axis=0)

        if args.decision == 'vote':
            # majority vote over each model's most probable class
            votes = np.argmax(preds, axis=2)
            y_pred, counts = stats.mode(votes, axis=0)
        elif args.decision == 'weight_vote':
            votes = np.argmax(preds, axis=2)
            # one weight per model, repeated for every sample in the batch
            vote_weights = np.broadcast_to(
                np.asarray(args.weights, dtype=float)[:, np.newaxis],
                votes.shape)
            y_pred, score = weighted_mode(votes, vote_weights)
        else:
            # 'mean': average the model outputs, then take the best class
            # (the original took argmax of the last model's ``pred`` only)
            y_pred = np.argmax(np.mean(preds, axis=0), axis=1)

        y_true = np.argmax(y_batch, axis=1)
        # flatten to one label per sample (``batch_size`` was undefined here)
        y_pred = np.reshape(y_pred, (-1, ))

        # Accumulate the matrix used to compute accuracy. Fixed labels keep
        # the shape (ASC_CLASS, ASC_CLASS) even when a batch lacks a class.
        # NOTE(review): assumes confusion_matrix accepts ``labels`` as
        # sklearn.metrics.confusion_matrix does - confirm the import.
        cmat = confusion_matrix(y_true, y_pred, labels=np.arange(ASC_CLASS))
        mat = np.add(cmat, mat)

    np.savetxt(csv, mat.astype(int))
def main(args):
    """Train one-vs-one SMO classifiers on the digits data; return test accuracy.

    For every pair of classes ``i < j`` a binary classifier is fitted with
    ``smo`` on the training samples of those two classes (``i`` is labelled
    +1, ``j`` is labelled -1). Each classifier casts one vote per test
    sample and the final prediction is the mode of the votes.

    Parameters
    ----------
    args : namespace
        Reads ``classes``, ``test_size`` and ``seed``; also passed through to
        ``smo`` and ``kernel``.

    Returns
    -------
    float
        Accuracy of the voted predictions on the test split.
    """
    # Use the digits dataset.
    data, target = sklearn.datasets.load_digits(n_class=args.classes,
                                                return_X_y=True)
    data = sklearn.preprocessing.MinMaxScaler().fit_transform(data)

    # Split the dataset into a train set and a test set.
    train_data, test_data, train_target, test_target = sklearn.model_selection.train_test_split(
        data, target, test_size=args.test_size, random_state=args.seed)

    # one list of votes (one entry per test sample) for every class pair
    vote_columns = []
    for i in range(args.classes):
        for j in range(i + 1, args.classes):
            pair_mask = (train_target == i) | (train_target == j)
            pair_x = train_data[pair_mask]
            pair_y = [1 if label == i else -1
                      for label in train_target[pair_mask]]

            s_vectors, s_weights, b, _, _ = smo(args, pair_x, pair_y,
                                                pair_x, pair_y)

            # kernel value of every test sample against every support vector
            test_kernels = np.empty((len(test_data), len(s_vectors)),
                                    dtype=float)
            for row, sample in enumerate(test_data):
                for col, vector in enumerate(s_vectors):
                    test_kernels[row, col] = kernel(args, sample, vector)[0]

            votes = []
            for kernel_row in test_kernels:
                decision = 0
                for idx in range(len(s_weights)):
                    decision += s_weights[idx] * kernel_row[idx]
                # non-negative decision value -> class i, otherwise class j
                votes.append(i if decision + b >= 0 else j)
            vote_columns.append(votes)

    if not vote_columns:
        raise ValueError("need at least two classes to build one-vs-one votes")

    # rows: test samples, columns: class pairs
    all_votes = np.column_stack(vote_columns)

    predictions = []
    for row in all_votes:
        modes, _ = weighted_mode(row, np.full(len(row), 1))
        # .item() extracts the single modal value; int() on a 1-element
        # array is deprecated in recent NumPy
        predictions.append(int(modes.item()))
    predictions = np.array(predictions)

    return sklearn.metrics.accuracy_score(test_target, predictions)
Example #27
0
    def decisionBoundary(self, X, y):
        """Plot the decision surface of every tree and their combined surface.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame with exactly two columns; they become the plot axes.
        y : pandas.Series
            Categorical labels (accessed through ``y.cat``) aligned with ``X``.
            Labels predicted by the trees must be keys of the hard-coded
            ``lookup`` table below.

        Returns
        -------
        fig1, fig2 : matplotlib figures
            ``fig1`` holds one panel per entry of ``self.trees``; ``fig2``
            shows the element-wise equal-weight mode of all tree surfaces.
            Both figures are also written to the ``figures`` directory.
        """
        assert (len(list(X.columns)) == 2)

        color = ["r", "y", "b"]
        lookup = {"Setosa": 0, "Versicolor": 1, "Virginica": 2}
        categories = list(y.cat.categories)
        n_trees = len(self.trees)

        def scatter_classes(ax):
            # One scatter call per class so each class gets a legend entry.
            for y_label in y.unique():
                idx = y == y_label
                class_pos = categories.index(y[idx].iloc[0])
                # A literal colour is given through ``c``, so no colormap is
                # passed (matplotlib would ignore it and emit a warning).
                ax.scatter(X[idx].iloc[:, 0],
                           X[idx].iloc[:, 1],
                           c=color[class_pos],
                           edgecolor='black',
                           s=30,
                           label="Class: " + str(y_label))

        # squeeze=False keeps the axes container indexable even when there is
        # a single tree (plt.subplots would otherwise return a bare Axes).
        fig1, ax1 = plt.subplots(1,
                                 n_trees,
                                 figsize=(5 * n_trees, 4),
                                 squeeze=False)
        ax1 = ax1[0]

        x_min, x_max = X.iloc[:, 0].min(), X.iloc[:, 0].max()
        y_min, y_max = X.iloc[:, 1].min(), X.iloc[:, 1].max()
        x_range = x_max - x_min
        y_range = y_max - y_min

        # The evaluation grid does not depend on the tree: build it once.
        xx, yy = np.meshgrid(
            np.arange(x_min - 0.2, x_max + 0.2, x_range / 50),
            np.arange(y_min - 0.2, y_max + 0.2, y_range / 50))

        Zs = []
        for i, tree in enumerate(self.trees):
            Z = tree.predict(
                pd.DataFrame(np.c_[xx.ravel(), yy.ravel()],
                             columns=list(X.columns))).to_numpy()
            # Map predicted label names to integer codes for contouring.
            Z = np.vectorize(lambda x: lookup[x])(Z)
            Z = Z.reshape(xx.shape)
            cs = ax1[i].contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)
            fig1.colorbar(cs, ax=ax1[i], shrink=0.9)
            ax1[i].set_ylabel("X2")
            ax1[i].set_xlabel("X1")

            Zs.append(Z)
            scatter_classes(ax1[i])
            ax1[i].set_title("Decision Surface Tree: " + str(i + 1))
            ax1[i].legend()
        fig1.tight_layout()

        fig2, ax2 = plt.subplots(1, 1, figsize=(6, 4))
        Zs = np.array(Zs)
        # Equal weights: the combined surface is the per-cell majority vote
        # taken along the tree axis (axis 0, weighted_mode's default).
        com_surface, _ = weighted_mode(Zs, np.ones(Zs.shape))
        cs = ax2.contourf(xx, yy, com_surface[0], cmap=plt.cm.RdYlBu)
        scatter_classes(ax2)
        ax2.set_ylabel("X2")
        ax2.set_xlabel("X1")
        ax2.legend()
        ax2.set_title("Common Decision Surface")
        fig2.colorbar(cs, ax=ax2, shrink=0.9)

        # Saving Figures (create the output directory if it is missing)
        os.makedirs("figures", exist_ok=True)
        fig1.savefig(os.path.join("figures", "Q7_Fig1.png"))
        fig2.savefig(os.path.join("figures", "Q7_Fig2.png"))
        return fig1, fig2
Example #28
0
    def predict(self, X):
        """Return the predicted class labels for the query samples.

        Parameters
        ----------
        X: array-like, shape (n_query, n_features), \
                or (n_query, n_indexed) if metric == 'precomputed'
            Test samples.

        Returns
        -------
        y: array of shape [n_samples] or [n_samples, n_outputs]
            Class labels for each data sample.

        Raises
        ------
        NotImplementedError
            If the stored targets are sparse and neighbor weights are in use.
        """
        X = check_array(X, accept_sparse='csr')

        neigh_dist, neigh_ind = self.kneighbors(X)
        if self.outputs_2d_:
            class_sets = self.classes_
            targets = self._y
        else:
            # Handle the single-output case as a one-column multi-output one.
            class_sets = [self.classes_]
            targets = self._y.reshape((-1, 1))

        n_outputs = len(class_sets)
        n_samples = X.shape[0]
        weights = _get_weights(neigh_dist, self.weights)

        if not issparse(self._y):
            y_pred = np.empty((n_samples, n_outputs),
                              dtype=class_sets[0].dtype)
            for col, col_classes in enumerate(class_sets):
                neighbor_labels = targets[neigh_ind, col]
                if weights is None:
                    mode, _ = stats.mode(neighbor_labels, axis=1)
                else:
                    mode, _ = weighted_mode(neighbor_labels, weights, axis=1)

                # The mode holds encoded label indices; map them to classes.
                label_idx = np.asarray(mode.ravel(), dtype=np.intp)
                y_pred[:, col] = col_classes.take(label_idx)
        else:
            # Sparse targets: results are filled one class row at a time and
            # transposed to (n_samples, n_outputs) at the end.
            y_pred = lil_matrix((n_outputs, n_samples),
                                dtype=class_sets[0].dtype)
            if weights is not None:
                raise NotImplementedError

            register_parallel_pytest_cov()
            with mp.Pool(processes=self.n_jobs) as pool:
                worker = partial(_sparse_multilabel_classification,
                                 y=targets, neigh_ind=neigh_ind)
                unordered = pool.imap_unordered(func=worker,
                                                iterable=enumerate(class_sets),
                                                chunksize=10)
                # Drain the iterator while the pool is still alive.
                k_cls = list(tqdm(unordered,
                                  disable=not self.verbose,
                                  total=len(class_sets),
                                  unit='classes',
                                  desc='Multilabel classification'))
            for row, row_values in tqdm(k_cls,
                                        desc='Collecting results',
                                        disable=not self.verbose):
                y_pred[row] = row_values
            y_pred = y_pred.tocsc().T

        if not self.outputs_2d_:
            y_pred = y_pred.ravel()

        return y_pred
Example #29
0
def resample_pointdata(source,
                       target,
                       data,
                       is_sphere=False,
                       source_mask=None,
                       target_mask=None,
                       red_func='mean',
                       k=3,
                       fill=0,
                       n_jobs=1,
                       append=False,
                       key=None):
    """Resample point data in source to target surface.

    Parameters
    ----------
    source : vtkPolyData or BSPolyData
        Source surface.
    target : vtkPolyData or BSPolyData
        Target surface.
    data : str, 1D ndarray or list of str and ndarray
        Point data in source surface to resample. Strings are looked up in
        the point data attributes of `source`.
    is_sphere : bool, optional
        If True, assume source and target are provided as spheres that are
        aligned. Default is False.
    source_mask : str or 1D ndarray, optional
        Boolean mask. If str, it must be in the point data attributes of
        `source`. Default is None. If specified, only consider points within
        the mask.
    target_mask : str or 1D ndarray, optional
        Boolean mask. If str, it must be in the point data attributes of
        `target`. Default is None. If specified, only consider points within
        the mask.
    red_func : str or list of str, optional
        Reduction function, one of {'mean', 'weighted_mean', 'mode',
        'weighted_mode'}. A single string applies to every entry of `data`;
        a list gives one function per entry. When ``is_sphere == True``,
        'mean' and 'mode' are replaced by their weighted variants.
        The caller's list is never modified. Default is 'mean'.
    k : int, optional
        Number of closest points to consider during resampling.
        Only used when ``is_sphere==False``. Default is 3.
    fill : int or float, optional
        Value used for entries out of the mask. Only used if the
        `target_mask` is provided. Default is 0.
    n_jobs : int, optional
        Number of parallel jobs. Only used when ``is_sphere==False``.
        Default is 1.
    append: bool, optional
        If True and `key` is given, also append the resampled arrays to the
        point data attributes of the target surface. Default is False.
    key : str or list of str, optional
        Array names to append to target's point data attributes. Only used if
        ``append == True``. A single string is accepted when one array is
        resampled. If None, nothing is appended. Default is None.

    Returns
    -------
    output : ndarray or list of ndarray
        Resampled point data. A list is returned when `data` is a list,
        a single ndarray otherwise.

    Raises
    ------
    ValueError
        If an entry of `red_func` is not one of the supported reductions.

    Notes
    -----
    This function is meant for the same source and target surfaces but with
    different number of points. For other types of resampling, see
    vtkResampleWithDataSet.

    """
    opt = ['mean', 'mode', 'weighted_mean', 'weighted_mode']

    is_list = isinstance(data, list)
    if not is_list:
        data = [data]

    if isinstance(red_func, str):
        red_func = [red_func] * len(data)
    else:
        # Work on a copy: the sphere branch below rewrites entries in place
        # and must not alter the caller's list.
        red_func = list(red_func)

    if isinstance(source_mask, str):
        source_mask = source.PointData[source_mask]
    if isinstance(target_mask, str):
        # The target mask lives in the target surface, not in the source.
        target_mask = target.PointData[target_mask]

    if not is_sphere:
        # Weights are only requested when a weighted reduction needs them.
        use_weights = bool(k > 1 and np.isin(red_func, opt[2:]).any())

        pids = _get_pids_naive(source,
                               target,
                               k=k,
                               source_mask=source_mask,
                               target_mask=target_mask,
                               n_jobs=n_jobs,
                               return_weights=use_weights)
        if use_weights:
            pids, w = pids
    else:
        pids, w = _get_pids_sphere(source,
                                   target,
                                   source_mask=source_mask,
                                   target_mask=target_mask)

        # k is not used for spheres; unset it so the k == 1 shortcut below
        # is skipped and the weights are always applied.
        k = None
        for i, rf in enumerate(red_func):
            if rf in ['mean', 'mode']:
                red_func[i] = 'weighted_%s' % rf

    resampled = [None] * len(data)
    for i, d in enumerate(data):
        if isinstance(d, str):
            d = source.PointData[d]

        if source_mask is not None:
            d = d[source_mask]

        if k == 1:
            # Single neighbor: nothing to reduce.
            feat = d[pids]
        elif red_func[i] == 'mean':
            feat = np.mean(d[pids], axis=1)
        elif red_func[i] == 'weighted_mean':
            feat = np.average(d[pids], weights=w, axis=1)
        elif red_func[i] == 'mode':
            feat = mode(d[pids], axis=1)[0].squeeze()
        elif red_func[i] == 'weighted_mode':
            feat = weighted_mode(d[pids], w, axis=1)[0].squeeze()
            # Cast the result back to the dtype of the input array.
            feat = feat.astype(d.dtype)
        else:
            raise ValueError('Unknown red_func: {0}'.format(red_func[i]))

        if target_mask is not None:
            feat = map_to_mask(feat, mask=target_mask, fill=fill)
        resampled[i] = feat

    if append and key is not None:
        # A bare string names a single array; indexing it directly would
        # pick individual characters.
        names = [key] if isinstance(key, str) else key
        for i, feat in enumerate(resampled):
            target.append_array(feat, name=names[i], at='p')

    return resampled if is_list else resampled[0]
Example #30
0
    def predict(self, X):
        """Predict the class labels for the provided data

        Parameters
        ----------
        X : array-like, shape (n_query, n_features), \
                or (n_query, n_indexed) if metric == 'precomputed'
            Test samples.

        Returns
        -------
        y : array of shape [n_samples] or [n_samples, n_outputs]
            Class labels for each data sample. Samples without any neighbor
            within the radius are labelled by ``self.outlier_function``.

        Raises
        ------
        ValueError
            If some samples have no neighbors and ``self.outlier_function``
            is None.
        """
        X = check_array(X, accept_sparse="csr")
        n_samples = X.shape[0]

        neigh_dist, neigh_ind = self.radius_neighbors(X)
        inliers = [i for i, nind in enumerate(neigh_ind) if len(nind) != 0]
        outliers = [i for i, nind in enumerate(neigh_ind) if len(nind) == 0]

        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            # Handle the single-output case as a one-column multi-output one.
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]
        n_outputs = len(classes_)

        if self.outlier_function is None and outliers:
            raise ValueError(
                "No neighbors found for test samples %r, "
                "you can try using larger radius, "
                "give a function for outliers, "
                "or consider removing them from your dataset." % outliers
            )

        weights = self.weight_function(neigh_dist=neigh_dist, neigh_ind=neigh_ind, target_space=self.target_space)

        y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype)
        for k, classes_k in enumerate(classes_):
            # Neighbor labels of every inlier sample, kept as a plain list so
            # equal-length neighborhoods are not merged into a 2-D object array.
            inlier_labels = [_y[neigh_ind[i], k] for i in inliers]
            if weights is None:
                # Builtin int replaces the removed ``np.int`` alias.
                mode = np.array([stats.mode(pl)[0] for pl in inlier_labels], dtype=int)
            else:
                # NOTE(review): weights are paired positionally with the
                # inlier samples only; if weight_function returns one entry
                # per query sample this misaligns when outliers exist - confirm.
                mode = np.array(
                    [weighted_mode(pl, w)[0] for (pl, w) in zip(inlier_labels, weights)], dtype=int
                )

            mode = mode.ravel()

            y_pred[inliers, k] = classes_k.take(mode)

        if outliers:
            # NOTE(review): only output column 0 is filled for outliers; with
            # several outputs the remaining columns stay uninitialised - confirm.
            for outlier in outliers:
                y_pred[outlier, 0] = self.outlier_function.predict(X[outlier])

        if not self.outputs_2d_:
            y_pred = y_pred.ravel()

        return y_pred
Example #31
0
         def fred(x, w): return weighted_mode(x, w, axis=axis)
 elif red_op == 'sum':
Example #32
0
 def _custom_weighting(self, neighbor_vals, neighbor_dist):
     """Reduce neighbor values by weighted mode with user-defined weights.

     The weights are obtained by calling ``self.weights`` on
     ``neighbor_dist``; the result of ``weighted_mode`` along axis 1 is
     returned unchanged.
     """
     dist_weights = self.weights(neighbor_dist)
     return weighted_mode(neighbor_vals, dist_weights, axis=1)
Example #33
0
 def _distance_weighting(neighbor_vals, neighbor_dist):
     """Reduce neighbor values by weighted mode with inverse-distance weights.

     Each neighbor is weighted by ``1 / neighbor_dist``; the result of
     ``weighted_mode`` along axis 1 is returned unchanged.
     """
     inverse_dist = 1 / neighbor_dist
     return weighted_mode(neighbor_vals, inverse_dist, axis=1)
Example #34
0
def infer(query,
          samples=None,
          db=None,
          sample_db_fn=None,
          depth=None,
          d_type='d1'):
    ''' 
    infer a query, return its ap and the predicted class

    arguments
      query       : a dict with three keys, see the template
                    {
                      'img': <path_to_img>,
                      'cls': <img class>,
                      'hist' <img histogram>
                    }
      samples     : a list of {
                                'img': <path_to_img>,
                                'cls': <img class>,
                                'hist' <img histogram>
                              }
      db          : an instance of class Database
      sample_db_fn: a function making samples, should be given if Database != None
      depth       : retrieved depth during inference; when it is falsy or
                    larger than the number of candidates, all candidates are
                    kept and no class prediction / preview is produced
      d_type      : distance type

    returns
      ap          : average precision computed by myAP on the kept results
      pred        : class chosen by an inverse-distance weighted vote over
                    the kept results, or None when no vote was made
  '''
    assert samples is not None or (
        db is not None and sample_db_fn is not None
    ), "need to give either samples or db plus sample_db_fn"
    if db:
        samples = sample_db_fn(db)

    q_img, q_cls, q_hist = query['img'], query['cls'], query['hist']
    results = []
    for sample in samples:
        s_img, s_cls, s_hist = sample['img'], sample['cls'], sample['hist']

        # never match the query against itself
        if q_img == s_img:
            continue

        results.append({
            'img': s_img,
            'dis': distance(q_hist, s_hist, d_type=d_type),
            'cls': s_cls
        })

    results = sorted(results, key=lambda x: x['dis'])

    # stays None when the voting branch below is skipped, so the return
    # statement never hits an unbound name
    pred = None
    if depth and depth <= len(results):
        results = results[:depth]
        print(q_img)
        list_im = [sub['img'] for sub in results]
        print(list_im)
        # preview strip: query image first, then the retrieved images
        list_im.insert(0, q_img)

        labels = [sub['cls'] for sub in results]
        # cast to float first: np.reciprocal truncates on integer input
        dists = np.asarray([sub['dis'] for sub in results], dtype=float)
        mode_vals, _ = weighted_mode(labels, np.reciprocal(dists))
        pred = str(mode_vals[0])

        imgs = [PIL.Image.open(path) for path in list_im]
        try:
            # resize everything to the smallest image (by width + height)
            min_shape = min((np.sum(im.size), im.size) for im in imgs)[1]
            # np.hstack needs a real sequence, not a generator
            imgs_comb = np.hstack(
                [np.asarray(im.resize(min_shape)) for im in imgs])
        finally:
            for im in imgs:
                im.close()
        plt.imshow(imgs_comb / 255.)
        plt.pause(0.1)
        plt.close()

    ap = myAP(q_cls, results, sort=True)

    return ap, pred
Example #35
0
    def predict(self, X):
        """Predict the class labels for the provided data

        Parameters
        ----------
        X: array-like, shape (n_query, n_features), \
                or (n_query, n_indexed) if metric == 'precomputed'
            Test samples.

        Returns
        -------
        y: array of shape [n_samples] or [n_samples, n_outputs]
            Class labels for each data sample. Samples without any neighbor
            within the radius receive ``self.outlier_label``.

        Raises
        ------
        ValueError
            If some samples have no neighbors and ``self.outlier_label``
            is None.
        """
        X = check_array(X, accept_sparse='csr')
        n_samples = X.shape[0]

        neigh_dist, neigh_ind = self.radius_neighbors(X)
        inliers = [i for i, nind in enumerate(neigh_ind) if len(nind) != 0]
        outliers = [i for i, nind in enumerate(neigh_ind) if len(nind) == 0]

        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            # Handle the single-output case as a one-column multi-output one.
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]
        n_outputs = len(classes_)

        if self.outlier_label is not None:
            # Overwrite the (empty) distance entries of outliers with a small
            # constant before the weights are computed.
            neigh_dist[outliers] = 1e-6
        elif outliers:
            raise ValueError('No neighbors found for test samples %r, '
                             'you can try using larger radius, '
                             'give a label for outliers, '
                             'or consider removing them from your dataset.' %
                             outliers)

        weights = _get_weights(neigh_dist, self.weights)

        y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype)
        for k, classes_k in enumerate(classes_):
            # Neighbor labels of every inlier sample, kept as a plain list:
            # assigning equal-length arrays into an object array can fail to
            # broadcast.
            inlier_labels = [_y[neigh_ind[i], k] for i in inliers]
            if weights is None:
                # Builtin int replaces the removed ``np.int`` alias.
                mode = np.array([stats.mode(pl)[0] for pl in inlier_labels],
                                dtype=int)
            else:
                mode = np.array([
                    weighted_mode(pl, w)[0]
                    for (pl, w) in zip(inlier_labels, weights[inliers])
                ],
                                dtype=int)

            mode = mode.ravel()

            # The mode holds encoded label indices; map them to classes.
            y_pred[inliers, k] = classes_k.take(mode)

        if outliers:
            y_pred[outliers, :] = self.outlier_label

        if not self.outputs_2d_:
            y_pred = y_pred.ravel()

        return y_pred
Example #36
0
    def update(self, v):
        # check if data exists
        if (self.target_markers
                is not None) and (self.target_markers
                                  is not None) and (self.pred_labels
                                                    is not None):
            # obtain the chunks from the target/non-target stream
            target_chunk = self.target_markers.chunks['markers']
            target_marker_time = target_chunk.block.axes[0].times
            target_marker_rec = target_chunk.block.axes[0].data

            # obtain the chunks from the Row/Col stream
            rowcol_chunk = self.rolcol_markers.chunks['markers']
            rowcol_marker_time = rowcol_chunk.block.axes[0].times
            rowcol_marker_rec = rowcol_chunk.block.axes[0].data

            # obtain the chunks from the predicted labels
            y_pred_chunk, y = extract_chunks(self.pred_labels)

            if (len(target_marker_rec) > 0) and (
                    len(rowcol_marker_rec) > 0) and (y_pred_chunk is not None):
                import numpy as np
                import sklearn.metrics as metrics
                import scipy.stats as stats
                from sklearn.utils.extmath import weighted_mode

                target_marker = []
                target_marker_num = []
                for i in range(len(target_marker_rec)):
                    curr_marker = target_marker_rec[i][0]
                    target_marker.append(curr_marker)
                    if curr_marker == 'NonTarget':
                        target_marker_num.append(0)
                    elif curr_marker == 'Target':
                        target_marker_num.append(1)
                    else:
                        target_marker_num.append(np.nan)
                target_marker_num = np.stack(target_marker_num, axis=-1)
                if np.any(np.isnan(target_marker_num)):
                    raise Exception(
                        'Faulty naming with target/non target labels')

                rowcol_marker = []
                rowcol_marker_num = []
                for i in range(len(rowcol_marker_rec)):
                    curr_marker = rowcol_marker_rec[i][0]
                    rowcol_marker.append(curr_marker)
                    if curr_marker == 'Col1':
                        rowcol_marker_num.append(1)
                    elif curr_marker == 'Col2':
                        rowcol_marker_num.append(2)
                    elif curr_marker == 'Col3':
                        rowcol_marker_num.append(3)
                    elif curr_marker == 'Col4':
                        rowcol_marker_num.append(4)
                    elif curr_marker == 'Col5':
                        rowcol_marker_num.append(5)
                    elif curr_marker == 'Col6':
                        rowcol_marker_num.append(6)
                    elif curr_marker == 'Row1':
                        rowcol_marker_num.append(7)
                    elif curr_marker == 'Row2':
                        rowcol_marker_num.append(8)
                    elif curr_marker == 'Row3':
                        rowcol_marker_num.append(9)
                    elif curr_marker == 'Row4':
                        rowcol_marker_num.append(10)
                    elif curr_marker == 'Row5':
                        rowcol_marker_num.append(11)
                    elif curr_marker == 'Row6':
                        rowcol_marker_num.append(12)
                    else:
                        rowcol_marker_num.append(np.nan)
                rowcol_marker_num = np.stack(rowcol_marker_num, axis=-1)
                if np.any(np.isnan(rowcol_marker_num)):
                    raise Exception('Faulty naming with row/column labels')

                bool_stream = y_pred_chunk.props['is_streaming']

                # obtain the predicted labels and the time stamps for the events
                y_pred_marker_time = y_pred_chunk.block.axes[0].times
                y_pred_score = y_pred_chunk.block.data
                if y_pred_score.shape[1] == 1:
                    y_pred = y_pred_score.reshape(-1)
                    y_pred_score_target = np.ones_like(y_pred)

                elif y_pred_score.shape[1] == 2:
                    y_pred = np.argmax(y_pred_score, axis=1)
                    y_pred_score_target = y_pred_score[:, 1]
                else:
                    y_pred = None
                    y_pred_score_target = None

                if not bool_stream:
                    y = y.astype(np.int)
                    if not np.all(target_marker_num == y):
                        raise Exception(
                            "Ground truth label streaming doesn't match label extracted from marker"
                        )

                    # sanity check with all the time stamps
                    if not np.all(
                            y_pred_marker_time == target_marker_time
                    ) and np.all(y_pred_marker_time == rowcol_marker_time):
                        raise Exception("Time stamps of markers don't agree")

                    # compute the accuracy, recall, precision and AUC using sklearn
                    acc = metrics.accuracy_score(y, y_pred)
                    recall = metrics.recall_score(y, y_pred)
                    precision = metrics.precision_score(y, y_pred)
                    auc = metrics.roc_auc_score(y, y_pred)

                    # display the statistics
                    logger.info('Current Accuracy: {}'.format(acc))
                    logger.info('Current Recall: {}'.format(recall))
                    logger.info('Current Precision: {}'.format(precision))
                    logger.info('Current AUC: {}'.format(auc))

                    # now try to decipher the characters
                    # case 1: ALS dataset, when there are multiples words in each file
                    n_flashes = len(y)
                    if len(self.n_char_per_word) == 1:
                        # offline version
                        # sanity check
                        if n_flashes % (self.n_char_per_word[0] *
                                        self.n_flash_per_char) != 0:
                            raise Exception(
                                "Number of flashes doesn't match number of characters"
                            )

                        n_flash_per_char = self.n_flash_per_char
                        n_char_per_word_curr = self.n_char_per_word[0]
                        n_flashes_per_word = n_char_per_word_curr * n_flash_per_char

                        # loop through the words
                        for idx_word in range(
                                int(n_flashes / n_flashes_per_word)):
                            rowcol_marker_num_word_curr = rowcol_marker_num[
                                idx_word * n_flashes_per_word:(idx_word + 1) *
                                n_flashes_per_word]
                            y_pred_word_curr = y_pred[idx_word *
                                                      n_flashes_per_word:
                                                      (idx_word + 1) *
                                                      n_flashes_per_word]
                            y_word_curr = y[idx_word *
                                            n_flashes_per_word:(idx_word + 1) *
                                            n_flashes_per_word]
                            y_pred_score_target_word_curr = y_pred_score_target[
                                idx_word * n_flashes_per_word:(idx_word + 1) *
                                n_flashes_per_word]

                            str_word_pred = ''
                            str_word = ''

                            logger.info(
                                'Predicting the {}-th word'.format(idx_word +
                                                                   1))
                            # loop through the characters
                            for idx_char in range(n_char_per_word_curr):
                                logger.info('{}-th character'.format(idx_char +
                                                                     1))
                                rowcol_marker_num_char_curr = rowcol_marker_num_word_curr[
                                    idx_char *
                                    n_flash_per_char:(idx_char + 1) *
                                    n_flash_per_char]
                                y_pred_char_curr = y_pred_word_curr[
                                    idx_char *
                                    n_flash_per_char:(idx_char + 1) *
                                    n_flash_per_char]
                                y_char_curr = y_word_curr[idx_char *
                                                          n_flash_per_char:
                                                          (idx_char + 1) *
                                                          n_flash_per_char]
                                y_pred_score_target_char_curr = y_pred_score_target_word_curr[
                                    idx_char *
                                    n_flash_per_char:(idx_char + 1) *
                                    n_flash_per_char]

                                # obtain where the flashes are targets
                                y_pred_char_hit_curr = rowcol_marker_num_char_curr[
                                    y_pred_char_curr.astype(bool)]
                                y_char_hit_curr = rowcol_marker_num_char_curr[
                                    y_char_curr.astype(bool)]

                                y_pred_score_target_char_hit_curr = y_pred_score_target_char_curr[
                                    y_pred_char_curr.astype(bool)]

                                # test for the rows and columns separately
                                y_char_hit_curr_row, _ = stats.mode(
                                    y_char_hit_curr[y_char_hit_curr >= 7])
                                y_char_hit_curr_row = y_char_hit_curr_row[0]
                                y_char_hit_curr_col, _ = stats.mode(
                                    y_char_hit_curr[y_char_hit_curr < 7])
                                y_char_hit_curr_col = y_char_hit_curr_col[0]

                                # y_pred_char_hit_curr_row, _ = stats.mode(y_pred_char_hit_curr[y_pred_char_hit_curr >= 7])
                                # if len(y_pred_char_hit_curr_row) >= 1:
                                #     y_pred_char_hit_curr_row = y_pred_char_hit_curr_row[0]
                                # else:
                                #     # random guess if not available
                                #     y_pred_char_hit_curr_row = 7
                                # y_pred_char_hit_curr_col, _ = stats.mode(y_pred_char_hit_curr[y_pred_char_hit_curr < 7])
                                # if len(y_pred_char_hit_curr_col) >= 1:
                                #     y_pred_char_hit_curr_col = y_pred_char_hit_curr_col[0]
                                # else:
                                #     # random guess if not available
                                #     y_pred_char_hit_curr_col = 1

                                if np.sum(y_pred_char_hit_curr >= 7) > 0:
                                    y_pred_char_hit_curr_row, _ = weighted_mode(
                                        y_pred_char_hit_curr[
                                            y_pred_char_hit_curr >= 7],
                                        y_pred_score_target_char_hit_curr[
                                            y_pred_char_hit_curr >= 7])
                                    y_pred_char_hit_curr_row = int(
                                        y_pred_char_hit_curr_row[0])
                                else:
                                    # random guess if not available
                                    y_pred_char_hit_curr_row = 7
                                if np.sum(y_pred_char_hit_curr < 7) > 0:
                                    y_pred_char_hit_curr_col, _ = weighted_mode(
                                        y_pred_char_hit_curr[
                                            y_pred_char_hit_curr < 7],
                                        y_pred_score_target_char_hit_curr[
                                            y_pred_char_hit_curr < 7])
                                    y_pred_char_hit_curr_col = int(
                                        y_pred_char_hit_curr_col[0])
                                else:
                                    # random guess if not available
                                    y_pred_char_hit_curr_col = 1

                                str_y_pred_char_curr = self._char_mapping(
                                    y_pred_char_hit_curr_row,
                                    y_pred_char_hit_curr_col)
                                str_y_char_curr = self._char_mapping(
                                    y_char_hit_curr_row, y_char_hit_curr_col)

                                str_word_pred += str_y_pred_char_curr
                                str_word += str_y_char_curr

                                logger.info(
                                    'Current character: {}, Predicted Character: {}'
                                    .format(str_y_char_curr,
                                            str_y_pred_char_curr))

                            logger.info(
                                'The correct word is: {}'.format(str_word))
                            logger.info('The predicted word is: {}\n'.format(
                                str_word_pred))

                            self.finished = True

                    # case 2: BCI Comp dataset, when there is a single word in each file
                    else:
                        pass

                # online version
                else:
                    y = self.pred_labels.chunks['segmented-markers'].block.data
                    y = y.astype(np.int)

                    # sanity check with all the time stamps; some markers could be
                    # missing, but the check is not skipped: a mismatch raises below
                    if not np.all(rowcol_marker_time == target_marker_time):
                        raise Exception("Time stamps of markers don't agree")

                    # case 1: ALS dataset, when there are multiple words in each file
                    if len(self.n_char_per_word) == 1:
                        if ((len(self.y_pred_buffer) + len(y_pred)) < self.n_flash_per_char * self.n_char_per_word[0]) and \
                                ((len(self.target_markers_num_buffer) + len(target_marker_num)) < self.n_flash_per_char * self.n_char_per_word[0]):
                            if len(self.y_pred_buffer) == 0:
                                self.target_markers_num_buffer = target_marker_num
                                self.target_markers_time_buffer = target_marker_time

                                self.y_pred_buffer = y_pred
                                self.y_pred_marker_time_buffer = y_pred_marker_time
                                self.y_buffer = y
                                self.y_pred_score_target_buffer = y_pred_score_target

                                self.rowcol_marker_num_buffer = rowcol_marker_num
                                self.rowcol_marker_time_buffer = rowcol_marker_time

                                self.y_buffer_full = y
                                self.y_pred_buffer_full = y_pred

                            else:
                                self.target_markers_num_buffer = np.concatenate(
                                    (self.target_markers_num_buffer,
                                     target_marker_num),
                                    axis=0)
                                self.target_markers_time_buffer = np.concatenate(
                                    (self.target_markers_time_buffer,
                                     target_marker_time),
                                    axis=0)

                                self.y_pred_buffer = np.concatenate(
                                    (self.y_pred_buffer, y_pred), axis=0)
                                self.y_pred_marker_time_buffer = np.concatenate(
                                    (self.y_pred_marker_time_buffer,
                                     y_pred_marker_time),
                                    axis=0)
                                self.y_buffer = np.concatenate(
                                    (self.y_buffer, y), axis=0)
                                self.y_pred_score_target_buffer = np.concatenate(
                                    (self.y_pred_score_target_buffer,
                                     y_pred_score_target),
                                    axis=0)

                                self.rowcol_marker_num_buffer = np.concatenate(
                                    (self.rowcol_marker_num_buffer,
                                     rowcol_marker_num),
                                    axis=0)
                                self.rowcol_marker_time_buffer = np.concatenate(
                                    (self.rowcol_marker_time_buffer,
                                     rowcol_marker_time),
                                    axis=0)

                                self.y_buffer_full = np.concatenate(
                                    (self.y_buffer_full, y), axis=0)
                                self.y_pred_buffer_full = np.concatenate(
                                    (self.y_pred_buffer_full, y_pred), axis=0)

                        else:
                            target_markers_num_curr = np.concatenate(
                                (self.target_markers_num_buffer,
                                 target_marker_num),
                                axis=0)
                            target_markers_time_curr = np.concatenate(
                                (self.target_markers_time_buffer,
                                 target_marker_time),
                                axis=0)

                            y_pred_curr = np.concatenate(
                                (self.y_pred_buffer, y_pred), axis=0)
                            y_pred_marker_time_curr = np.concatenate(
                                (self.y_pred_marker_time_buffer,
                                 y_pred_marker_time),
                                axis=0)
                            y_curr = np.concatenate((self.y_buffer, y), axis=0)
                            y_pred_score_target_curr = np.concatenate(
                                (self.y_pred_score_target_buffer,
                                 y_pred_score_target),
                                axis=0)

                            rowcol_marker_num_curr = np.concatenate(
                                (self.rowcol_marker_num_buffer,
                                 rowcol_marker_num),
                                axis=0)
                            rowcol_marker_time_curr = np.concatenate(
                                (self.rowcol_marker_time_buffer,
                                 rowcol_marker_time),
                                axis=0)

                            self.y_buffer_full = np.concatenate(
                                (self.y_buffer_full, y), axis=0)
                            self.y_pred_buffer_full = np.concatenate(
                                (self.y_pred_buffer_full, y_pred), axis=0)

                            # reset the buffers for future iterations
                            n_flash_per_char = self.n_flash_per_char
                            n_char_per_word_curr = self.n_char_per_word[0]
                            n_flashes_per_word = n_char_per_word_curr * n_flash_per_char

                            # if extra epochs are around
                            if len(self.target_markers_num_buffer) + len(
                                    target_marker_num) > n_flashes_per_word:
                                self.target_markers_num_buffer = target_markers_num_curr[
                                    n_flashes_per_word:]
                                self.target_markers_time_buffer = target_markers_time_curr[
                                    n_flashes_per_word:]

                                self.rowcol_marker_num_buffer = rowcol_marker_num_curr[
                                    n_flashes_per_word:]
                                self.rowcol_marker_time_buffer = rowcol_marker_time_curr[
                                    n_flashes_per_word:]

                            # if everything is perfectly aligned: empty the buffers
                            else:
                                self.target_markers_num_buffer = []
                                self.target_markers_time_buffer = []

                                self.rowcol_marker_num_buffer = []
                                self.rowcol_marker_time_buffer = []

                            if len(self.y_buffer) + len(
                                    y_pred) > n_flashes_per_word:
                                self.y_pred_buffer = y_pred_curr[
                                    n_flashes_per_word:]
                                self.y_pred_marker_time_buffer = y_pred_marker_time_curr[
                                    n_flashes_per_word:]
                                self.y_buffer = y_curr[n_flashes_per_word:]
                                self.y_pred_score_target_buffer = y_pred_score_target_curr[
                                    n_flashes_per_word:]
                            else:
                                self.y_pred_buffer = []
                                self.y_pred_marker_time_buffer = []
                                self.y_buffer = []
                                self.y_pred_score_target_buffer = []

                            # try to obtain all information corresponding to the current word
                            target_marker_num = target_markers_num_curr[:
                                                                        n_flashes_per_word]
                            target_marker_time = target_markers_time_curr[:
                                                                          n_flashes_per_word]

                            y_pred = y_pred_curr[:n_flashes_per_word]
                            y_pred_marker_time = y_pred_marker_time_curr[:
                                                                         n_flashes_per_word]
                            y = y_curr[:n_flashes_per_word]
                            y_pred_score_target = y_pred_score_target_curr[:
                                                                           n_flashes_per_word]

                            rowcol_marker_num = rowcol_marker_num_curr[:
                                                                       n_flashes_per_word]
                            rowcol_marker_time = rowcol_marker_time_curr[:
                                                                         n_flashes_per_word]

                            # sanity check with all the time stamps and data
                            if not np.all(target_marker_num == y):
                                raise Exception(
                                    "Ground truth label streaming doesn't match label extracted from marker"
                                )
                            if not np.all(
                                    y_pred_marker_time == target_marker_time
                            ) and np.all(
                                    y_pred_marker_time == rowcol_marker_time):
                                raise Exception(
                                    "Time stamps of markers don't agree")

                            # loop through the words
                            if self.idx_word is None:
                                self.idx_word = 0
                            else:
                                self.idx_word += 1
                            idx_word = self.idx_word

                            str_word_pred = ''
                            str_word = ''

                            logger.info(
                                'Predicting the {}-th word'.format(idx_word +
                                                                   1))
                            # loop through the characters
                            for idx_char in range(n_char_per_word_curr):
                                logger.info('{}-th character'.format(idx_char +
                                                                     1))
                                rowcol_marker_num_char_curr = rowcol_marker_num[
                                    idx_char *
                                    n_flash_per_char:(idx_char + 1) *
                                    n_flash_per_char]
                                y_pred_char_curr = y_pred[idx_char *
                                                          n_flash_per_char:
                                                          (idx_char + 1) *
                                                          n_flash_per_char]
                                y_char_curr = y[idx_char *
                                                n_flash_per_char:(idx_char +
                                                                  1) *
                                                n_flash_per_char]
                                y_pred_score_target_char_curr = y_pred_score_target[
                                    idx_char *
                                    n_flash_per_char:(idx_char + 1) *
                                    n_flash_per_char]

                                # obtain where the flashes are targets
                                y_pred_char_hit_curr = rowcol_marker_num_char_curr[
                                    y_pred_char_curr.astype(bool)]
                                y_char_hit_curr = rowcol_marker_num_char_curr[
                                    y_char_curr.astype(bool)]

                                y_pred_score_target_char_hit_curr = y_pred_score_target_char_curr[
                                    y_pred_char_curr.astype(bool)]

                                # test for the rows and columns separately
                                y_char_hit_curr_row, _ = stats.mode(
                                    y_char_hit_curr[y_char_hit_curr >= 7])
                                y_char_hit_curr_row = y_char_hit_curr_row[0]
                                y_char_hit_curr_col, _ = stats.mode(
                                    y_char_hit_curr[y_char_hit_curr < 7])
                                y_char_hit_curr_col = y_char_hit_curr_col[0]

                                if np.sum(y_pred_char_hit_curr >= 7) > 0:
                                    y_pred_char_hit_curr_row, _ = weighted_mode(
                                        y_pred_char_hit_curr[
                                            y_pred_char_hit_curr >= 7],
                                        y_pred_score_target_char_hit_curr[
                                            y_pred_char_hit_curr >= 7])
                                    y_pred_char_hit_curr_row = int(
                                        y_pred_char_hit_curr_row[0])
                                else:
                                    # random guess if not available
                                    y_pred_char_hit_curr_row = 7
                                if np.sum(y_pred_char_hit_curr < 7) > 0:
                                    y_pred_char_hit_curr_col, _ = weighted_mode(
                                        y_pred_char_hit_curr[
                                            y_pred_char_hit_curr < 7],
                                        y_pred_score_target_char_hit_curr[
                                            y_pred_char_hit_curr < 7])
                                    y_pred_char_hit_curr_col = int(
                                        y_pred_char_hit_curr_col[0])
                                else:
                                    # random guess if not available
                                    y_pred_char_hit_curr_col = 1

                                str_y_pred_char_curr = self._char_mapping(
                                    y_pred_char_hit_curr_row,
                                    y_pred_char_hit_curr_col)
                                str_y_char_curr = self._char_mapping(
                                    y_char_hit_curr_row, y_char_hit_curr_col)

                                str_word_pred += str_y_pred_char_curr
                                str_word += str_y_char_curr

                                logger.info(
                                    'Current character: {}, Predicted Character: {}'
                                    .format(str_y_char_curr,
                                            str_y_pred_char_curr))

                            logger.info(
                                'The correct word is: {}'.format(str_word))
                            logger.info('The predicted word is: {}\n'.format(
                                str_word_pred))

                        # compute the accuracy, recall, precision and AUC using sklearn
                        if self.idx_word == 3:
                            acc = metrics.accuracy_score(
                                self.y_buffer_full, self.y_pred_buffer_full)
                            recall = metrics.recall_score(
                                self.y_buffer_full, self.y_pred_buffer_full)
                            precision = metrics.precision_score(
                                self.y_buffer_full, self.y_pred_buffer_full)
                            auc = metrics.roc_auc_score(
                                self.y_buffer_full, self.y_pred_buffer_full)

                            # display the statistics
                            logger.info('\nFinal Test Set Results')
                            logger.info('Current Accuracy: {}'.format(acc))
                            logger.info('Current Recall: {}'.format(recall))
                            logger.info(
                                'Current Precision: {}'.format(precision))
                            logger.info('Current AUC: {}'.format(auc))

                    # case 2: BCI Comp dataset, when there is a single word in each file
                    else:
                        pass