Example #1
    def extend(self, df):
        # TODO: Ensure that df is consistent with existing data
        if not df.index.is_monotonic_increasing:
            df = df.sort_index(inplace=False)
        index = df.index.values
        partition_name = '--'.join([escape(index.min()), escape(index.max())])

        mkdir(self.dirname(partition_name))

        new_categories, self.categories, df = _decategorize(
            self.categories, df)
        self.append_categories(new_categories)

        # Store columns
        for col in df.columns:
            pack_file(df[col].values, self.dirname(partition_name, col))

        # Store index
        fn = self.dirname(partition_name, '.index')
        x = df.index.values
        bloscpack.pack_ndarray_file(x,
                                    fn,
                                    bloscpack_args=bp_args,
                                    blosc_args=blosc_args(x.dtype))

        if not len(self.partitions):
            self.minimum = index.min()
        self.partitions[index.max()] = partition_name
        self.flush()
Example #2
    def extend(self, df):
        # TODO: Ensure that df is consistent with existing data
        if not df.index.is_monotonic_increasing:
            df = df.sort_index(inplace=False)
        index = df.index.values
        partition_name = '--'.join([escape(index.min()), escape(index.max())])

        mkdir(self.dirname(partition_name))

        new_categories, self.categories, df = _decategorize(self.categories, df)
        self.append_categories(new_categories)

        # Store columns
        for col in df.columns:
            pack_file(df[col].values, self.dirname(partition_name, col))

        # Store index
        fn = self.dirname(partition_name, '.index')
        x = df.index.values
        bloscpack.pack_ndarray_file(x, fn)

        if not len(self.partitions):
            self.minimum = index.min()
        self.partitions[index.max()] = partition_name
        self.flush()
Example #3
def save_data(data, fp, upload_s3=True):

    create_parent_dir_if_not_exists(fp)

    if fp.endswith('.bp'):
        try:
            # ascontiguousarray is important; without it the loaded array can differ from the saved one.
            bp.pack_ndarray_file(np.ascontiguousarray(data), fp)
        except Exception:
            # Fall back to .npy if bloscpack packing fails
            fp = fp.replace('.bp', '.npy')
            np.save(fp, np.ascontiguousarray(data))
    elif fp.endswith('.npy'):
        np.save(fp, np.ascontiguousarray(data))
    elif fp.endswith('.json'):
        save_json(data, fp)
    elif fp.endswith('.pkl'):
        save_pickle(data, fp)
    elif fp.endswith('.hdf'):
        save_hdf(data, fp)
    elif fp.endswith('.stl'):
        save_mesh_stl(data, fp)
    elif fp.endswith('.txt'):
        if isinstance(data, np.ndarray):
            np.savetxt(fp, data)
        else:
            raise TypeError('.txt output requires a numpy array')
    elif fp.endswith('.dump'):  # sklearn classifiers
        joblib.dump(data, fp)
    elif fp.endswith('.png') or fp.endswith('.tif') or fp.endswith('.jpg'):
        imsave(fp, data)
    else:
        raise ValueError('unsupported file extension: %s' % fp)
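A partial counterpart for reading these files back could look like the sketch below; it covers only the array formats handled above, and the load_data name is an assumption rather than part of the original module.

import numpy as np
import bloscpack as bp

def load_data(fp):
    # Sketch of a read path for the array formats written by save_data above
    if fp.endswith('.bp'):
        return bp.unpack_ndarray_file(fp)
    elif fp.endswith('.npy'):
        return np.load(fp)
    else:
        raise ValueError('unsupported file extension: %s' % fp)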
Example #4
    def extend(self, df):
        # TODO: Ensure that df is consistent with existing data
        if not df.index.is_monotonic_increasing:
            df = df.sort_index(inplace=False)
        if len(self.partitions) and df.index[0] < self.partitions.index[0]:
            if is_trivial_index(df.index):
                df = df.copy()
                start = self.partitions.index[-1] + 1
                new_index = pd.Index(np.arange(start, start + len(df)),
                                     name = df.index.name)
                df.index = new_index
            else:
                raise ValueError("Index of new dataframe less than known data")
        index = df.index.values
        partition_name = '--'.join([escape(index.min()), escape(index.max())])

        mkdir(self.dirname(partition_name))

        new_categories, self.categories, df = _decategorize(self.categories,
                                                            df)
        self.append_categories(new_categories)

        # Store columns
        for col in df.columns:
            pack_file(df[col].values, self.dirname(partition_name, col))

        # Store index
        fn = self.dirname(partition_name, '.index')
        bloscpack.pack_ndarray_file(index, fn, bloscpack_args=bp_args,
                                    blosc_args=blosc_args(index.dtype))

        if not len(self.partitions):
            self.minimum = coerce_index(index.dtype, index.min())
        self.partitions[index.max()] = partition_name
        self.flush()
Example #5
def pack_file(x, fn, encoding='utf8'):
    """ Pack numpy array into filename

    Supports binary data with bloscpack and text data with msgpack+blosc

    >>> pack_file(np.array([1, 2, 3]), 'foo.blp')  # doctest: +SKIP

    See also:
        unpack_file
    """
    if x.dtype != 'O':
        bloscpack.pack_ndarray_file(x, fn)
    else:
        bytes = blosc.compress(msgpack.packb(x.tolist(), encoding=encoding), 1)
        with open(fn, 'wb') as f:
            f.write(bytes)
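The docstring above refers to unpack_file; a minimal sketch of that reverse operation, assuming the same layout pack_file writes, with the dtype supplied by the caller and the older msgpack encoding API used above:

import blosc
import bloscpack
import msgpack
import numpy as np

def unpack_file(fn, dtype, encoding='utf8'):
    """ Load an array written by pack_file above (sketch; dtype supplied by the caller) """
    if dtype != 'O':
        # Binary data stored as a bloscpack container
        return bloscpack.unpack_ndarray_file(fn)
    # Object/text data stored as a blosc-compressed msgpack payload
    with open(fn, 'rb') as f:
        payload = f.read()
    return np.array(msgpack.unpackb(blosc.decompress(payload), encoding=encoding),
                    dtype='O')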
Example #6
def pack_file(x, fn, encoding='utf8'):
    """ Pack numpy array into filename

    Supports binary data with bloscpack and text data with msgpack+blosc

    >>> pack_file(np.array([1, 2, 3]), 'foo.blp')  # doctest: +SKIP

    See also:
        unpack_file
    """
    if x.dtype != 'O':
        bloscpack.pack_ndarray_file(x, fn, bloscpack_args=bp_args,
                blosc_args=blosc_args(x.dtype))
    else:
        bytes = blosc.compress(msgpack.packb(x.tolist(), encoding=encoding), 1)
        with open(fn, 'wb') as f:
            f.write(bytes)
Example #7
def write_numpy(
    tile,
    bands,
    pixelbuffer=0):
    if pixelbuffer > 0:
        raise NotImplementedError(
            "pixelbuffers on NumPy output not yet supported"
        )
    if isinstance(bands, tuple):
        bp.pack_ndarray_file(
            np.stack(bands),
            tile.path
            )
    elif isinstance(bands, np.ndarray):
        bp.pack_ndarray_file(
            bands,
            tile.path
            )
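Reading a tile back is a single call; a sketch of a hypothetical read_numpy counterpart, reusing tile.path as in write_numpy above:

import bloscpack as bp

def read_numpy(tile):
    # Load the band array that write_numpy packed to tile.path (sketch)
    return bp.unpack_ndarray_file(tile.path)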
Example #8
    def extend(self, df):
        if self._readonly:
            raise IOError('File not open for writing')
        if len(df) == 0:
            return
        # TODO: Ensure that df is consistent with existing data
        if not df.index.is_monotonic_increasing:
            df = df.sort_index(inplace=False)

        new_categories, self.categories, df = _decategorize(
            self.categories, df)
        self.append_categories(new_categories)

        if len(self.partitions) and df.index[0] <= self.partitions.index[-1]:
            if is_trivial_index(df.index):
                df = df.copy()
                start = self.partitions.index[-1] + 1
                new_index = pd.Index(np.arange(start, start + len(df)),
                                     name=df.index.name)
                df.index = new_index
            else:
                raise ValueError("Index of new dataframe less than known data")

        index = df.index.values
        partition_name = '--'.join([escape(index.min()), escape(index.max())])

        mkdir(self.dirname(partition_name))

        # Store columns
        for col in df.columns:
            pack_file(df[col].values, self.dirname(partition_name, col))

        # Store index
        fn = self.dirname(partition_name, '.index')
        bloscpack.pack_ndarray_file(index,
                                    fn,
                                    bloscpack_args=bp_args,
                                    blosc_args=blosc_args(index.dtype))

        if not len(self.partitions):
            self.minimum = coerce_index(index.dtype, index.min())
        self.partitions.loc[index.max()] = partition_name
        self.flush()
Example #9
def _pickle_data_matrices(np_train_x, np_train_y, np_test_x, np_test_y,
                         s_train_x_path, s_train_y_path, s_test_x_path, s_test_y_path):
    """
    Packs the given train and test x and y matrices into the given file paths.

    :param np_train_x: numpy matrix, training data
    :param np_train_y: numpy matrix, training labels
    :param np_test_x:  numpy matrix, testing data
    :param np_test_y:  numpy matrix, testing labels
    :return: None
    """
    print "\npickling data matrices"
    # pickling training data
    bp.pack_ndarray_file(np_train_x, s_train_x_path)
    bp.pack_ndarray_file(np_train_y, s_train_y_path)
    # pickling testing data
    bp.pack_ndarray_file(np_test_x, s_test_x_path)
    bp.pack_ndarray_file(np_test_y, s_test_y_path)
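A matching loader is straightforward; the sketch below assumes the same four paths and is not part of the original module:

import bloscpack as bp

def _unpickle_data_matrices(s_train_x_path, s_train_y_path, s_test_x_path, s_test_y_path):
    """
    Loads the train and test matrices written by _pickle_data_matrices (sketch).
    """
    np_train_x = bp.unpack_ndarray_file(s_train_x_path)
    np_train_y = bp.unpack_ndarray_file(s_train_y_path)
    np_test_x = bp.unpack_ndarray_file(s_test_x_path)
    np_test_y = bp.unpack_ndarray_file(s_test_y_path)
    return np_train_x, np_train_y, np_test_x, np_test_y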
Example #10
def _nnet2file(layers, set_layer_num = -1, path="dnn.tmp", start_layer = 0, input_factor = 0.0, factor=[]):
    if os.path.exists(path):
       shutil.rmtree(path)
    os.makedirs(path)
    blosc_args=bp.BloscArgs(clevel=9) 
    n_layers = len(layers)
    nnet_dict = {}
    if set_layer_num == -1:
       set_layer_num = n_layers
    for i in range(start_layer, set_layer_num):
       layer = layers[i]
       dict_a = 'W' + str(i)
       dropout_factor = 0.0
       if i == 0:
           dropout_factor = input_factor
       if i > 0 and len(factor) > 0:
           dropout_factor = factor[i-1]

       if layer.type == 'fc':
           n = str(uuid.uuid4())+".blp"
           tmpFileName = path + "/" + n;
           nnet_dict[dict_a] = n
           bp.pack_ndarray_file((1.0 - dropout_factor) * layer.W.get_value(), tmpFileName, chunk_size='100M', blosc_args=blosc_args)
       elif layer.type == 'conv':
           filter_shape = layer.filter_shape
           for next_X in range(filter_shape[0]):
               for this_X in range(filter_shape[1]):
                   n = str(uuid.uuid4())+".blp" 
                   tmpFileName = path + "/" + n;
                   new_dict_a = dict_a + ' ' + str(next_X) + ' ' + str(this_X)
                   nnet_dict[new_dict_a] = n
                   bp.pack_ndarray_file((1.0-dropout_factor) * (layer.W.get_value())[next_X, this_X], tmpFileName, chunk_size='100M', blosc_args=blosc_args)

       n = str(uuid.uuid4())+".blp"
       tmpFileName = path + "/" + n;
       dict_a = 'b' + str(i)
       nnet_dict[dict_a] = n
       bp.pack_ndarray_file(layer.b.get_value(),tmpFileName, chunk_size='100M', blosc_args=blosc_args)

    with open(path + '/metadata.tmp', 'wb') as fp:
        pickle.dump(nnet_dict,fp,pickle.HIGHEST_PROTOCOL)
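The reverse direction would read metadata.tmp and unpack each .blp file back into the layer parameters; a sketch for fully-connected layers only, where the _file2nnet name is an assumption and set_value mirrors the get_value calls above:

import pickle
import bloscpack as bp

def _file2nnet(layers, path="dnn.tmp"):
    # Sketch of the inverse of _nnet2file, fully-connected layers only
    with open(path + '/metadata.tmp', 'rb') as fp:
        nnet_dict = pickle.load(fp)
    for i, layer in enumerate(layers):
        if layer.type == 'fc':
            W = bp.unpack_ndarray_file(path + '/' + nnet_dict['W' + str(i)])
            layer.W.set_value(W)
        b = bp.unpack_ndarray_file(path + '/' + nnet_dict['b' + str(i)])
        layer.b.set_value(b)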
Example #11
 def wrapped_f():
     if path.exists(self.filename):
         return bloscpack.unpack_ndarray_file(self.filename)
     result = f()
     bloscpack.pack_ndarray_file(result, self.filename)
     return result
Example #12
 def __call__(self):
     if path.exists(self.filename):
         return bloscpack.unpack_ndarray_file(self.filename)
     result = self.f()
     bloscpack.pack_ndarray_file(result, self.filename)
     return result
Example #13
 def to_blp(ndarray: np.array, serialized_filepath):
     bp.pack_ndarray_file(ndarray, serialized_filepath)
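A from_blp counterpart plus a quick round trip, sketched with a hypothetical path:

import numpy as np
import bloscpack as bp

def from_blp(serialized_filepath) -> np.ndarray:
    # Read back an array written by to_blp above (sketch)
    return bp.unpack_ndarray_file(serialized_filepath)

# Round trip on a throwaway file (hypothetical path):
# to_blp(np.arange(10), '/tmp/example.blp')
# assert (from_blp('/tmp/example.blp') == np.arange(10)).all()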
Example #14
for i in range(dm.n_kernel):

    sys.stderr.write('%d\n' % i)
    a = bp.unpack_ndarray_file(os.environ['GORDON_RESULT_DIR'] +
                               '/feature_%03d.bp' % i).reshape((-1, ))

    if which_part == 0:
        features.append(a[:len(a) / 2])
    else:
        features.append(a[len(a) / 2:])

features = np.asarray(features).T

sys.stderr.write('done in %f seconds\n' % (time.time() - t))

t = time.time()
sys.stderr.write('rotate features ...')

items_per_job = 100

features_rotated = Parallel(n_jobs=8)(
    delayed(rotate_features)(fs, i) for i, fs in enumerate(
        np.array_split(features, features.shape[0] / items_per_job)))
features_rotated = np.vstack(features_rotated)

print features_rotated.shape
bp.pack_ndarray_file(
    features_rotated,
    os.environ['GORDON_RESULT_DIR'] + '/featuresRotated_%d.bp' % which_part)

sys.stderr.write('done in %f seconds\n' % (time.time() - t))
Example #15
    # Final data prep, possibly dependent on the coder object, so done here
    # ------------------------------------------------------------
    
    if hasattr(coder, 'prep_data'):
        logger.info("Prepping data...")
        datas = coder.prep_data(datas, args.prep_data)
    
    if args.permute:
        if os.path.isfile(args.permute):
            perm = bp.unpack_ndarray_file(args.permute)
        else:
            logger.warning("Given permutation file does not exits; creating " +
                           args.permute)
            perm = np.random.permutation(len(datas[0]))
            bp.pack_ndarray_file(perm, args.permute)
        for d in datas:
            d[:] = d[perm]
        
    # partition to training and validation
    train_idx = int((1.0 - args.validation) * len(datas[0]))
    logger.info('Train samples {}, validation samples {}',
                train_idx, len(datas[0]) - train_idx)
        
    # now execute requested actions
    # -----------------------------

    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
    sess.run(tf.global_variables_initializer())    
    with sess.as_default():
        if args.epochs:
Example #16
        "feat.proc_exists_attending_practitioner_id.blp",
        "feat.proc_exists_referring_practitioner_id.blp",
        "feat.proc_exists_operating_practitioner_id.blp",
        "feat.surg_unuq_claim_id.blp",
        "feat.proc_num_record.blp",
        "feat.proc_uniq_claim_id.blp",
        "feat.pres_unique_claim_id.blp",
        "feat.diag_num_record.blp",
        "feat.diag_uniq_claim_id.blp",
        "feat.diag_diagnosis_date_unique_count.blp",
    ]
    for dense_name in dense_list:
        print("Generating {} ...".format(dense_name))
        function_name = dense_name.split('.')[1]
        caller = getattr(sys.modules[__name__], function_name)
        bp.pack_ndarray_file(caller(), dense_name)

    # Generating sparse features
    sparse_list = [
        "feat.pres_drug_dv.h5",
        "feat.diag_physician_specialty_description_dv.h5",
        "feat.diag_physician_state_dv.h5",
        "feat.diag_physician_cbsa_dv.h5",
        "feat.pres_drug_bb_usc_code_dv.h5",
        "feat.pres_drug_generic_name_dv.h5",
        "feat.pres_drug_manufacturer_dv.h5",
        "feat.proc_physician_specialty_description_dv.h5",
        "feat.diag_diagnosis_code_dv.h5",
        "feat.diag_diagnosis_description_1gram_dv.h5",
        "feat.diag_primary_physician_role_dv.h5",
        "feat.proc_procedure_code_dv.h5",
Example #17
 def compress(self):
     blosc_args = self.blosc_args.copy()
     blosc_args['clevel'] = self.level
     bp.pack_ndarray_file(self.ndarray, self.storage,
                          blosc_args=blosc_args,
                          bloscpack_args=self.bloscpack_args)
Example #18
 def compress(self):
     blosc_args = bp.BloscArgs(clevel=self.level, cname=self.cname)
     bp.pack_ndarray_file(self.ndarray,
                          self.storage,
                          blosc_args=blosc_args,
                          bloscpack_args=self.bloscpack_args)
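The read path for these benchmark snippets is a single call; decompress below is a hypothetical counterpart, not part of the code above:

import bloscpack as bp

def decompress(storage):
    # Unpack the ndarray that compress() packed into the given storage path (sketch)
    return bp.unpack_ndarray_file(storage)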
Example #19
    def get_incr_classificator(self, incr_datas, incr_class_label, test_datas, test_class_label, method="first"):
        """
        Process the incremental set for the incremental Bayesian classifier.
        :param incr_datas: [{"emotion-1-type": value, "sentence": {}},...]
                            (emotion-1-type and sentence are optional)
        :param incr_class_label:
        :param test_datas:
        :param test_class_label:
        :return:
        """
        def func(x, y):
            block.append(fit_incr_datas[x[3] + 1: y[3], :])
            label_block.append(incr_class_label[x[3] + 1: y[3]])
            block0.append(fit_incr_datas[y[3]:y[3] + 1, :])
            return y

        def handle(clf, method):
            if method == "zero":
                return handle_zero(clf)
            elif method == "first":
                return handle_first(clf)
            elif method == "second":
                return handle_second(clf)
            elif method == "third":
                return handle_third(clf)
            elif method == "four":
                return handle_four(clf)
            elif method == "five":
                return handle_five(clf)
            else:
                pass

        def handle_zero(clf):
            """
            寻找当前分类器下预测正确的样本
            :param clf:
            :return:
            """
            incr_pre_label = clf.predict(fit_incr_datas)
            # 选出预测正确的下标
            true_index = (incr_class_label == incr_pre_label).nonzero()

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)

            res = []
            for i0 in true_index[0]:
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = incr_pre_label[i0]

                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)

                res.append((loss0, text0, c_pred0, i0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return res

        def handle_first(clf):
            # The most basic computation of the classification loss
            # Classification loss; we keep the minimum
            loss = 9999
            # Text from the incremental set preferred for updating the classifier parameters
            text = None
            # Class label corresponding to that preferred text
            c_pred = None
            # Index corresponding to that preferred text
            index = 0

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            for i0 in range(fit_incr_datas.shape[0]):
                c_true0 = incr_class_label[i0: i0 + 1][0]
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]
                if c_true0 == c_pred0:
                    loss = 0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                    break
                else:
                    clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                    test_proba = clf.predict_max_proba(test_datas)
                    loss0 = clf.metrics_my_zero_one_loss(test_proba)
                    if loss0 < loss:
                        loss = loss0
                        text = text0
                        c_pred = c_pred0
                        index = i0

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return [(loss, text, c_pred, index)]

        def handle_second(clf):
            # An alternative computation of the classification loss
#            predict_true = handle(clf, "zero")
#            if predict_true:
#                return predict_true

            # Classification loss; we keep the minimum
            loss = 9999
            # Text from the incremental set preferred for updating the classifier parameters
            text = None
            # Class label corresponding to that preferred text
            c_pred = None
            # Index corresponding to that preferred text
            index = 0

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            for i0 in range(fit_incr_datas.shape[0]):
                c_true0 = incr_class_label[i0: i0 + 1][0]
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]

                if c_true0 == c_pred0:
                    loss = 0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                    break
                else:
                    clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                    test_proba = clf.predict_max_proba(test_datas)
                    loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                    if loss0 < loss:
                        loss = loss0
                        text = text0
                        c_pred = c_pred0
                        index = i0

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return [(loss, text, c_pred, index)]

        def handle_third(clf):
            # TODO: how to obtain a suitable threshold
            def get_fit(e0):
                # Obtain a suitable threshold
                return 20
#                while len((r >= e0).nonzero()[0]) == 0:
#                    e0 = int(e0 / 2)
#                return e0

            global e
            # Compute the class support
            proba = clf.predict_proba(fit_incr_datas)
            label = clf.predict(fit_incr_datas)
            max_proba = np.max(proba, axis=1).reshape(-1, 1)
            second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
            # Support ratio
            r = np.divide(max_proba, second_max_proba)
            # Threshold
            e = get_fit(e)
            # select
            select_indices = (r >= e).nonzero()
            return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0]) for indice in select_indices[0]]

        def handle_third_another(clf):
            # Compute the class support
            proba = clf.predict_proba(fit_incr_datas)
            label = clf.predict(fit_incr_datas)
            max_proba = np.max(proba, axis=1).reshape(-1, 1)
            leave_proba = np.sum(proba, axis=1).reshape(-1, 1) - max_proba
            # Support ratio
            r = np.divide(max_proba, leave_proba)
            # Threshold
            e = 5
            # select
            select_indices = (r >= e).nonzero()
            return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0]) for indice in select_indices[0]]

        def handle_four(clf):
            # My Own Idea
            # Store the test results
            predict_true = handle(clf, "zero")
            if predict_true:
                return predict_true

            f_res = []

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            origin_label = clf.predict(test_datas)
            for i0 in range(fit_incr_datas.shape[0]):
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                label = clf.predict(test_datas)
                # Account for the effect of the class label: a sample may have a high
                # probability for some class before the update and a high probability for
                # some class after the update, yet the two classes may differ
                smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
                np.multiply(test_proba, smooth, test_proba)

                f_test0 = pair_test(origin_proba, test_proba)
                if f_test0:
                    loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                else:
                    loss0 = -1
                f_res.append((loss0, text0, c_pred0, i0, f_test0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            res = filter(lambda x: x[4], f_res)
            return [(r[0], r[1], r[2], r[3]) for r in res]

        def handle_five(clf):
            """
            Combine class support with the no-significant-difference test.
            :param clf:
            :return:
            """
            predict_true = handle(clf, "zero")
            if predict_true:
                return predict_true

            fit_for_class_support = handle(clf, "third")
            print "The result of class-support: %d samples" % len(fit_for_class_support)

#            fit_for_class_support = filter(lambda x: x[4] > clf.bayes.class_log_prior_[np.where(clf.bayes.classes_ == x[2])[0][0]], fit_for_class_support)
#            print "The result of class-support: %d samples" % len(fit_for_class_support)
            # My Own Idea
            # Store the test results
            f_res = []

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            origin_label = clf.predict(test_datas)

            for i0 in range(len(fit_for_class_support)):
                text0 = fit_for_class_support[i0][1]
                c_pred0 = fit_for_class_support[i0][2]
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                label = clf.predict(test_datas)
                # Account for the effect of the class label: a sample may have a high
                # probability for some class before the update and a high probability for
                # some class after the update, yet the two classes may differ
                smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
                np.multiply(test_proba, smooth, test_proba)

                f_test0 = pair_test(origin_proba, test_proba)
                if f_test0:
                    loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                else:
                    loss0 = -1
                f_res.append((loss0, text0, c_pred0, i0, f_test0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            res = filter(lambda x: x[4], f_res)
            return [(r[0], r[1], r[2], r[3]) for r in res]

        method_options = ("first", "second", "third", "four", "five")
        if method not in method_options:
            raise ValueError("method has to be one of " + str(method_options))

        print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # Write / read the classifier parameters
        dir_ = os.path.join(TEXT_OUT, "bayes_args")
        FileUtil.mkdirs(dir_)

        suffix = ".blp"
        class_count_out = os.path.join(dir_, "class_count_" + method + suffix)
        class_log_prob_out = os.path.join(dir_, "class_log_prob_" + method + suffix)
        feature_count_out = os.path.join(dir_, "feature_count_" + method + suffix)
        feature_log_prob_out = os.path.join(dir_, "feature_log_prob_" + method + suffix)

        out = (class_count_out, class_log_prob_out, feature_count_out, feature_log_prob_out)

        if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
            if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
                raise ValueError("please use get_classificator() to get classificator firstly")

            fit_incr_datas = self.fit_data(incr_datas)
            incr_class_label = np.asanyarray(incr_class_label)
            # Save the data that needs to be appended to key_words.txt
            add_to_key_words = []

            i = 0
            while fit_incr_datas.nnz > 0:
                print
                print "Begin Increment Classification_%d: %s" % (i, time.strftime('%Y-%m-%d %H:%M:%S'))

                need_to_update = handle(self, method)
                # If nothing can be updated, the remaining incremental set does not suit
                # the current classifier, so it is discarded
                # The incremental set keeps shrinking as updates proceed
                block = []
                label_block = []
                # The training set keeps growing as updates proceed
                block0 = []
                if need_to_update:
                    # Sort by loss in ascending order
                    accord_to_loss = sorted(need_to_update, key=lambda x: x[0])
                    for data in accord_to_loss:
                        self.bayes.update(data[2], data[1])
                    # Sort by index
                    accord_to_index = sorted(need_to_update, key=lambda x: x[3])

#                    index = [index0[3] for index0 in accord_to_index]
#                    [add_to_key_words.append(raw_incr_datas[index0]) for index0 in index]
#                    raw_incr_datas = [raw for index0, raw in enumerate(raw_incr_datas) if index0 not in index]

                    block0.append(test_datas)
                    reduce(func, accord_to_index, (0.0, "", "", -1))
                    block.append(fit_incr_datas[accord_to_index[-1][3] + 1:, :])
                    label_block.append(incr_class_label[accord_to_index[-1][3] + 1:])
                    test_datas = sp.vstack(block0)
                    print "This times updates %d samples" % len(need_to_update)
                else:
                    block.append(fit_incr_datas[0:0, :])
                    label_block.append(incr_class_label[0:0])
                    print "Finally leaving %d samples that unnecessary added to train sets" % fit_incr_datas.shape[0]
                fit_incr_datas = sp.vstack(block)
                incr_class_label = np.concatenate(label_block)
                i += 1

            bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                          self.bayes.feature_count_, self.bayes.feature_log_prob_)
            # Save to file
            map(lambda x: bp.pack_ndarray_file(x[0], x[1]), zip(bayes_args, out))
            # Append
#            path = os.path.join(TEXT_OUT, "key_words/CHIFeature.txt")
#            FileUtil.write(path, add_to_key_words, "a")
        else:
            # speed up
            self.bayes.class_count_ = bp.unpack_ndarray_file(out[0])
            self.bayes.class_log_prior_ = bp.unpack_ndarray_file(out[1])
            self.bayes.feature_count_ = bp.unpack_ndarray_file(out[2])
            self.bayes.feature_log_prob_ = bp.unpack_ndarray_file(out[3])

#            self.bayes.class_count_ = np.loadtxt(out[0])
#            self.bayes.class_log_prior_ = np.loadtxt(out[1])
#            self.bayes.feature_count_ = np.loadtxt(out[2])
#            self.bayes.feature_log_prob_ = np.loadtxt(out[3])

        print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
        print
        return self
Example #20
#!/usr/bin/env python
import numpy as np
import pandas as pd
import bloscpack as bp

print 'loading data..'
df = pd.read_csv('mtgoxUSD.csv', names=['date', 'price', 'volume'])
a = np.array(df[['price', 'volume']]).T.copy()

base = 2**(20)
for name, size in (('small', base / 16),
                    ('mid', base * 10 / 16),
                    ('large', base * 100 /16),
                    ):
    print 'save: %s' % name
    np.save('bitcoin_%s' % name, a[:, :size])
    bp.pack_ndarray_file(a[:, :size], 'bitcoin_%s.blp' % name)
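For completeness, the packed copies can be read back and checked against the .npy files; a sketch continuing the naming scheme above:

import numpy as np
import bloscpack as bp

for name in ('small', 'mid', 'large'):
    a_npy = np.load('bitcoin_%s.npy' % name)
    a_blp = bp.unpack_ndarray_file('bitcoin_%s.blp' % name)
    # The two on-disk copies should hold identical data
    assert (a_npy == a_blp).all()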
Example #21
    def get_incr_classificator(self,
                               incr_datas,
                               incr_class_label,
                               test_datas,
                               test_class_label,
                               method="first"):
        """
        Process the incremental set for the incremental Bayesian classifier.
        :param incr_datas: [{"emotion-1-type": value, "sentence": {}},...]
                            (emotion-1-type and sentence are optional)
        :param incr_class_label:
        :param test_datas:
        :param test_class_label:
        :return:
        """
        def func(x, y):
            block.append(fit_incr_datas[x[3] + 1:y[3], :])
            label_block.append(incr_class_label[x[3] + 1:y[3]])
            block0.append(fit_incr_datas[y[3]:y[3] + 1, :])
            return y

        def handle(clf, method):
            if method == "zero":
                return handle_zero(clf)
            elif method == "first":
                return handle_first(clf)
            elif method == "second":
                return handle_second(clf)
            elif method == "third":
                return handle_third(clf)
            elif method == "four":
                return handle_four(clf)
            elif method == "five":
                return handle_five(clf)
            else:
                pass

        def handle_zero(clf):
            """
            Find the samples that the current classifier predicts correctly.
            :param clf:
            :return:
            """
            incr_pre_label = clf.predict(fit_incr_datas)
            # Select the indices of the correct predictions
            true_index = (incr_class_label == incr_pre_label).nonzero()

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)

            res = []
            for i0 in true_index[0]:
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = incr_pre_label[i0]

                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                    c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_another_zero_one_loss(
                    origin_proba, test_proba)

                res.append((loss0, text0, c_pred0, i0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return res

        def handle_first(clf):
            # The most basic computation of the classification loss
            # Classification loss; we keep the minimum
            loss = 9999
            # Text from the incremental set preferred for updating the classifier parameters
            text = None
            # Class label corresponding to that preferred text
            c_pred = None
            # Index corresponding to that preferred text
            index = 0

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            for i0 in range(fit_incr_datas.shape[0]):
                c_true0 = incr_class_label[i0:i0 + 1][0]
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]
                if c_true0 == c_pred0:
                    loss = 0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                    break
                else:
                    clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                        c_pred0, text0, copy=True)
                    test_proba = clf.predict_max_proba(test_datas)
                    loss0 = clf.metrics_my_zero_one_loss(test_proba)
                    if loss0 < loss:
                        loss = loss0
                        text = text0
                        c_pred = c_pred0
                        index = i0

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return [(loss, text, c_pred, index)]

        def handle_second(clf):
            # An alternative computation of the classification loss
            #            predict_true = handle(clf, "zero")
            #            if predict_true:
            #                return predict_true

            # Classification loss; we keep the minimum
            loss = 9999
            # Text from the incremental set preferred for updating the classifier parameters
            text = None
            # Class label corresponding to that preferred text
            c_pred = None
            # Index corresponding to that preferred text
            index = 0

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            for i0 in range(fit_incr_datas.shape[0]):
                c_true0 = incr_class_label[i0:i0 + 1][0]
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]

                if c_true0 == c_pred0:
                    loss = 0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                    break
                else:
                    clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                        c_pred0, text0, copy=True)
                    test_proba = clf.predict_max_proba(test_datas)
                    loss0 = clf.metrics_another_zero_one_loss(
                        origin_proba, test_proba)
                    if loss0 < loss:
                        loss = loss0
                        text = text0
                        c_pred = c_pred0
                        index = i0

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return [(loss, text, c_pred, index)]

        def handle_third(clf):
            # TODO: how to obtain a suitable threshold
            def get_fit(e0):
                # Obtain a suitable threshold
                return 20
#                while len((r >= e0).nonzero()[0]) == 0:
#                    e0 = int(e0 / 2)
#                return e0

            global e
            # Compute the class support
            proba = clf.predict_proba(fit_incr_datas)
            label = clf.predict(fit_incr_datas)
            max_proba = np.max(proba, axis=1).reshape(-1, 1)
            second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
            # Support ratio
            r = np.divide(max_proba, second_max_proba)
            # Threshold
            e = get_fit(e)
            # select
            select_indices = (r >= e).nonzero()
            return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice,
                     max_proba[indice][0]) for indice in select_indices[0]]

        def handle_third_another(clf):
            # Compute the class support
            proba = clf.predict_proba(fit_incr_datas)
            label = clf.predict(fit_incr_datas)
            max_proba = np.max(proba, axis=1).reshape(-1, 1)
            leave_proba = np.sum(proba, axis=1).reshape(-1, 1) - max_proba
            # Support ratio
            r = np.divide(max_proba, leave_proba)
            # Threshold
            e = 5
            # select
            select_indices = (r >= e).nonzero()
            return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice,
                     max_proba[indice][0]) for indice in select_indices[0]]

        def handle_four(clf):
            # My Own Idea
            # Store the test results
            predict_true = handle(clf, "zero")
            if predict_true:
                return predict_true

            f_res = []

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            origin_label = clf.predict(test_datas)
            for i0 in range(fit_incr_datas.shape[0]):
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                    c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                label = clf.predict(test_datas)
                # Account for the effect of the class label: a sample may have a high
                # probability for some class before the update and a high probability for
                # some class after the update, yet the two classes may differ
                smooth = np.asarray([
                    1 if origin_label[j] == label[j] else -1
                    for j in range(len(origin_label))
                ])
                np.multiply(test_proba, smooth, test_proba)

                f_test0 = pair_test(origin_proba, test_proba)
                if f_test0:
                    loss0 = clf.metrics_another_zero_one_loss(
                        origin_proba, test_proba)
                else:
                    loss0 = -1
                f_res.append((loss0, text0, c_pred0, i0, f_test0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            res = filter(lambda x: x[4], f_res)
            return [(r[0], r[1], r[2], r[3]) for r in res]

        def handle_five(clf):
            """
            Combine class support with the no-significant-difference test.
            :param clf:
            :return:
            """
            predict_true = handle(clf, "zero")
            if predict_true:
                return predict_true

            fit_for_class_support = handle(clf, "third")
            print "The result of class-support: %d samples" % len(
                fit_for_class_support)

            #            fit_for_class_support = filter(lambda x: x[4] > clf.bayes.class_log_prior_[np.where(clf.bayes.classes_ == x[2])[0][0]], fit_for_class_support)
            #            print "The result of class-support: %d samples" % len(fit_for_class_support)
            # My Own Idea
            # Store the test results
            f_res = []

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            origin_label = clf.predict(test_datas)

            for i0 in range(len(fit_for_class_support)):
                text0 = fit_for_class_support[i0][1]
                c_pred0 = fit_for_class_support[i0][2]
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                    c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                label = clf.predict(test_datas)
                # Account for the effect of the class label: a sample may have a high
                # probability for some class before the update and a high probability for
                # some class after the update, yet the two classes may differ
                smooth = np.asarray([
                    1 if origin_label[j] == label[j] else -1
                    for j in range(len(origin_label))
                ])
                np.multiply(test_proba, smooth, test_proba)

                f_test0 = pair_test(origin_proba, test_proba)
                if f_test0:
                    loss0 = clf.metrics_another_zero_one_loss(
                        origin_proba, test_proba)
                else:
                    loss0 = -1
                f_res.append((loss0, text0, c_pred0, i0, f_test0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            res = filter(lambda x: x[4], f_res)
            return [(r[0], r[1], r[2], r[3]) for r in res]

        method_options = ("first", "second", "third", "four", "five")
        if method not in method_options:
            raise ValueError("method has to be one of " + str(method_options))

        print "Begin Increment Classification: ", time.strftime(
            '%Y-%m-%d %H:%M:%S')
        # Write / read the classifier parameters
        dir_ = os.path.join(TEXT_OUT, "bayes_args")
        FileUtil.mkdirs(dir_)

        suffix = ".blp"
        class_count_out = os.path.join(dir_, "class_count_" + method + suffix)
        class_log_prob_out = os.path.join(dir_,
                                          "class_log_prob_" + method + suffix)
        feature_count_out = os.path.join(dir_,
                                         "feature_count_" + method + suffix)
        feature_log_prob_out = os.path.join(
            dir_, "feature_log_prob_" + method + suffix)

        out = (class_count_out, class_log_prob_out, feature_count_out,
               feature_log_prob_out)

        if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
            if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(
                    self.bayes, "class_log_prior_"):
                raise ValueError(
                    "please use get_classificator() to get classificator firstly"
                )

            fit_incr_datas = self.fit_data(incr_datas)
            incr_class_label = np.asanyarray(incr_class_label)
            # Save the data that needs to be appended to key_words.txt
            add_to_key_words = []

            i = 0
            while fit_incr_datas.nnz > 0:
                print
                print "Begin Increment Classification_%d: %s" % (
                    i, time.strftime('%Y-%m-%d %H:%M:%S'))

                need_to_update = handle(self, method)
                # If nothing can be updated, the remaining incremental set does not suit
                # the current classifier, so it is discarded
                # The incremental set keeps shrinking as updates proceed
                block = []
                label_block = []
                # The training set keeps growing as updates proceed
                block0 = []
                if need_to_update:
                    # Sort by loss in ascending order
                    accord_to_loss = sorted(need_to_update, key=lambda x: x[0])
                    for data in accord_to_loss:
                        self.bayes.update(data[2], data[1])
                    # Sort by index
                    accord_to_index = sorted(need_to_update,
                                             key=lambda x: x[3])

                    #                    index = [index0[3] for index0 in accord_to_index]
                    #                    [add_to_key_words.append(raw_incr_datas[index0]) for index0 in index]
                    #                    raw_incr_datas = [raw for index0, raw in enumerate(raw_incr_datas) if index0 not in index]

                    block0.append(test_datas)
                    reduce(func, accord_to_index, (0.0, "", "", -1))
                    block.append(fit_incr_datas[accord_to_index[-1][3] +
                                                1:, :])
                    label_block.append(
                        incr_class_label[accord_to_index[-1][3] + 1:])
                    test_datas = sp.vstack(block0)
                    print "This times updates %d samples" % len(need_to_update)
                else:
                    block.append(fit_incr_datas[0:0, :])
                    label_block.append(incr_class_label[0:0])
                    print "Finally leaving %d samples that unnecessary added to train sets" % fit_incr_datas.shape[
                        0]
                fit_incr_datas = sp.vstack(block)
                incr_class_label = np.concatenate(label_block)
                i += 1

            bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                          self.bayes.feature_count_,
                          self.bayes.feature_log_prob_)
            # Save to file
            map(lambda x: bp.pack_ndarray_file(x[0], x[1]),
                zip(bayes_args, out))
            # Append
#            path = os.path.join(TEXT_OUT, "key_words/CHIFeature.txt")
#            FileUtil.write(path, add_to_key_words, "a")
        else:
            # speed up
            self.bayes.class_count_ = bp.unpack_ndarray_file(out[0])
            self.bayes.class_log_prior_ = bp.unpack_ndarray_file(out[1])
            self.bayes.feature_count_ = bp.unpack_ndarray_file(out[2])
            self.bayes.feature_log_prob_ = bp.unpack_ndarray_file(out[3])


#            self.bayes.class_count_ = np.loadtxt(out[0])
#            self.bayes.class_log_prior_ = np.loadtxt(out[1])
#            self.bayes.feature_count_ = np.loadtxt(out[2])
#            self.bayes.feature_log_prob_ = np.loadtxt(out[3])

        print "Increment Classification Done: ", time.strftime(
            '%Y-%m-%d %H:%M:%S')
        print
        return self
Example #22
import bloscpack as bp
import gzip
import pickle
import sys

if __name__ == '__main__':

    f = gzip.open('mnist_py3k.pkl.gz', 'rb')
    (train_set_mat,train_set_vec), (valid_set_mat,valid_set_vec), (test_set_mat,test_set_vec) = pickle.load(f)
    blosc_args=bp.BloscArgs(clevel=9)
    bp.pack_ndarray_file(train_set_mat[0:25000,:], 'train.part0.blp', chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(train_set_vec[0:25000], 'train.part0.blp.labels', chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(valid_set_mat[0:5000,:], 'valid.part0.blp', chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(valid_set_vec[0:5000], 'valid.part0.blp.labels', chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(test_set_mat[0:5000,:], 'test.part0.blp', chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(test_set_vec[0:5000], 'test.part0.blp.labels', chunk_size='100M', blosc_args=blosc_args)

    bp.pack_ndarray_file(train_set_mat[25000:,:], 'train.part1.blp', chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(train_set_vec[25000:], 'train.part1.blp.labels', chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(valid_set_mat[5000:,:], 'valid.part1.blp', chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(valid_set_vec[5000:], 'valid.part1.blp.labels', chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(test_set_mat[5000:,:], 'test.part1.blp', chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(test_set_vec[5000:], 'test.part1.blp.labels', chunk_size='100M', blosc_args=blosc_args)
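The parts can be read back and re-joined with unpack_ndarray_file; a sketch following the file names above:

import numpy as np
import bloscpack as bp

# Reassemble the training split from its two packed parts (sketch)
train_set_mat = np.concatenate([bp.unpack_ndarray_file('train.part0.blp'),
                                bp.unpack_ndarray_file('train.part1.blp')])
train_set_vec = np.concatenate([bp.unpack_ndarray_file('train.part0.blp.labels'),
                                bp.unpack_ndarray_file('train.part1.blp.labels')])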