def extend(self, df):
    # TODO: Ensure that df is consistent with existing data
    if not df.index.is_monotonic_increasing:
        df = df.sort_index(inplace=False)

    index = df.index.values
    partition_name = '--'.join([escape(index.min()), escape(index.max())])

    mkdir(self.dirname(partition_name))

    new_categories, self.categories, df = _decategorize(self.categories, df)
    self.append_categories(new_categories)

    # Store columns
    for col in df.columns:
        pack_file(df[col].values, self.dirname(partition_name, col))

    # Store index
    fn = self.dirname(partition_name, '.index')
    x = df.index.values
    bloscpack.pack_ndarray_file(x, fn,
                                bloscpack_args=bp_args,
                                blosc_args=blosc_args(x.dtype))

    if not len(self.partitions):
        self.minimum = index.min()
    self.partitions[index.max()] = partition_name

    self.flush()
def extend(self, df):
    # TODO: Ensure that df is consistent with existing data
    if not df.index.is_monotonic_increasing:
        df = df.sort_index(inplace=False)

    index = df.index.values
    partition_name = '--'.join([escape(index.min()), escape(index.max())])

    mkdir(self.dirname(partition_name))

    new_categories, self.categories, df = _decategorize(self.categories, df)
    self.append_categories(new_categories)

    # Store columns
    for col in df.columns:
        pack_file(df[col].values, self.dirname(partition_name, col))

    # Store index
    fn = self.dirname(partition_name, '.index')
    x = df.index.values
    bloscpack.pack_ndarray_file(x, fn)

    if not len(self.partitions):
        self.minimum = index.min()
    self.partitions[index.max()] = partition_name

    self.flush()
def save_data(data, fp, upload_s3=True):
    create_parent_dir_if_not_exists(fp)
    if fp.endswith('.bp'):
        try:
            # ascontiguousarray is important; without it the loaded array can
            # sometimes differ from the saved one.
            bp.pack_ndarray_file(np.ascontiguousarray(data), fp)
        except Exception:
            fp = fp.replace('.bp', '.npy')
            np.save(fp, np.ascontiguousarray(data))
    elif fp.endswith('.npy'):
        np.save(fp, np.ascontiguousarray(data))
    elif fp.endswith('.json'):
        save_json(data, fp)
    elif fp.endswith('.pkl'):
        save_pickle(data, fp)
    elif fp.endswith('.hdf'):
        save_hdf(data, fp)
    elif fp.endswith('.stl'):
        save_mesh_stl(data, fp)
    elif fp.endswith('.txt'):
        if isinstance(data, np.ndarray):
            np.savetxt(fp, data)
        else:
            raise ValueError('Only numpy arrays can be saved as .txt')
    elif fp.endswith('.dump'):  # sklearn classifiers
        joblib.dump(data, fp)
    elif fp.endswith('.png') or fp.endswith('.tif') or fp.endswith('.jpg'):
        imsave(fp, data)
    else:
        raise ValueError('Unsupported file extension: %s' % fp)
def extend(self, df):
    # TODO: Ensure that df is consistent with existing data
    if not df.index.is_monotonic_increasing:
        df = df.sort_index(inplace=False)

    if len(self.partitions) and df.index[0] < self.partitions.index[0]:
        if is_trivial_index(df.index):
            df = df.copy()
            start = self.partitions.index[-1] + 1
            new_index = pd.Index(np.arange(start, start + len(df)),
                                 name=df.index.name)
            df.index = new_index
        else:
            raise ValueError("Index of new dataframe less than known data")

    index = df.index.values
    partition_name = '--'.join([escape(index.min()), escape(index.max())])

    mkdir(self.dirname(partition_name))

    new_categories, self.categories, df = _decategorize(self.categories, df)
    self.append_categories(new_categories)

    # Store columns
    for col in df.columns:
        pack_file(df[col].values, self.dirname(partition_name, col))

    # Store index
    fn = self.dirname(partition_name, '.index')
    bloscpack.pack_ndarray_file(index, fn,
                                bloscpack_args=bp_args,
                                blosc_args=blosc_args(index.dtype))

    if not len(self.partitions):
        self.minimum = coerce_index(index.dtype, index.min())
    self.partitions[index.max()] = partition_name

    self.flush()
def pack_file(x, fn, encoding='utf8'):
    """ Pack numpy array into filename

    Supports binary data with bloscpack and text data with msgpack+blosc

    >>> pack_file(np.array([1, 2, 3]), 'foo.blp')  # doctest: +SKIP

    See also:
        unpack_file
    """
    if x.dtype != 'O':
        bloscpack.pack_ndarray_file(x, fn)
    else:
        bytes = blosc.compress(msgpack.packb(x.tolist(), encoding=encoding), 1)
        with open(fn, 'wb') as f:
            f.write(bytes)
def pack_file(x, fn, encoding='utf8'):
    """ Pack numpy array into filename

    Supports binary data with bloscpack and text data with msgpack+blosc

    >>> pack_file(np.array([1, 2, 3]), 'foo.blp')  # doctest: +SKIP

    See also:
        unpack_file
    """
    if x.dtype != 'O':
        bloscpack.pack_ndarray_file(x, fn,
                                    bloscpack_args=bp_args,
                                    blosc_args=blosc_args(x.dtype))
    else:
        bytes = blosc.compress(msgpack.packb(x.tolist(), encoding=encoding), 1)
        with open(fn, 'wb') as f:
            f.write(bytes)
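The docstrings above reference an unpack_file counterpart that is not shown here; the following is a minimal sketch of what it might look like, not the library's actual implementation. It assumes the same layout as pack_file (bloscpack for plain dtypes, msgpack+blosc for object arrays) and falls back by catching the bloscpack error; the raw=False option depends on the installed msgpack version.

# Hypothetical counterpart to pack_file above (an assumption, not original source).
import blosc
import bloscpack
import msgpack
import numpy as np

def unpack_file(fn):
    """Inverse of pack_file: load a numpy array from filename."""
    try:
        # plain dtypes were written with bloscpack
        return bloscpack.unpack_ndarray_file(fn)
    except Exception:
        # object arrays were written as blosc-compressed msgpack
        with open(fn, 'rb') as f:
            payload = blosc.decompress(f.read())
        return np.array(msgpack.unpackb(payload, raw=False))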
def write_numpy(tile, bands, pixelbuffer=0):
    if pixelbuffer > 0:
        raise NotImplementedError(
            "pixelbuffers on NumPy output not yet supported"
        )
    if isinstance(bands, tuple):
        bp.pack_ndarray_file(np.stack(bands), tile.path)
    elif isinstance(bands, np.ndarray):
        bp.pack_ndarray_file(bands, tile.path)
def extend(self, df):
    if self._readonly:
        raise IOError('File not open for writing')

    if len(df) == 0:
        return

    # TODO: Ensure that df is consistent with existing data
    if not df.index.is_monotonic_increasing:
        df = df.sort_index(inplace=False)

    new_categories, self.categories, df = _decategorize(self.categories, df)
    self.append_categories(new_categories)

    if len(self.partitions) and df.index[0] <= self.partitions.index[-1]:
        if is_trivial_index(df.index):
            df = df.copy()
            start = self.partitions.index[-1] + 1
            new_index = pd.Index(np.arange(start, start + len(df)),
                                 name=df.index.name)
            df.index = new_index
        else:
            raise ValueError("Index of new dataframe less than known data")

    index = df.index.values
    partition_name = '--'.join([escape(index.min()), escape(index.max())])

    mkdir(self.dirname(partition_name))

    # Store columns
    for col in df.columns:
        pack_file(df[col].values, self.dirname(partition_name, col))

    # Store index
    fn = self.dirname(partition_name, '.index')
    bloscpack.pack_ndarray_file(index, fn,
                                bloscpack_args=bp_args,
                                blosc_args=blosc_args(index.dtype))

    if not len(self.partitions):
        self.minimum = coerce_index(index.dtype, index.min())
    self.partitions.loc[index.max()] = partition_name

    self.flush()
def _pickle_data_matrices(np_train_x, np_train_y, np_test_x, np_test_y,
                          s_train_x_path, s_train_y_path,
                          s_test_x_path, s_test_y_path):
    """
    Pickles the given train and test x and y matrices into the given file paths.

    :param np_train_x: numpy matrix, training data
    :param np_train_y: numpy matrix, training labels
    :param np_test_x: numpy matrix, testing data
    :param np_test_y: numpy matrix, testing labels
    :return: None
    """
    print "\npickling data matrices"

    # pickling training data
    bp.pack_ndarray_file(np_train_x, s_train_x_path)
    bp.pack_ndarray_file(np_train_y, s_train_y_path)

    # pickling testing data
    bp.pack_ndarray_file(np_test_x, s_test_x_path)
    bp.pack_ndarray_file(np_test_y, s_test_y_path)
def _nnet2file(layers, set_layer_num=-1, path="dnn.tmp", start_layer=0,
               input_factor=0.0, factor=[]):
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)

    blosc_args = bp.BloscArgs(clevel=9)
    n_layers = len(layers)
    nnet_dict = {}
    if set_layer_num == -1:
        set_layer_num = n_layers

    for i in range(start_layer, set_layer_num):
        layer = layers[i]
        dict_a = 'W' + str(i)
        dropout_factor = 0.0
        if i == 0:
            dropout_factor = input_factor
        if i > 0 and len(factor) > 0:
            dropout_factor = factor[i - 1]

        if layer.type == 'fc':
            n = str(uuid.uuid4()) + ".blp"
            tmpFileName = path + "/" + n
            nnet_dict[dict_a] = n
            bp.pack_ndarray_file((1.0 - dropout_factor) * layer.W.get_value(),
                                 tmpFileName, chunk_size='100M',
                                 blosc_args=blosc_args)
        elif layer.type == 'conv':
            filter_shape = layer.filter_shape
            for next_X in range(filter_shape[0]):
                for this_X in range(filter_shape[1]):
                    n = str(uuid.uuid4()) + ".blp"
                    tmpFileName = path + "/" + n
                    new_dict_a = dict_a + ' ' + str(next_X) + ' ' + str(this_X)
                    nnet_dict[new_dict_a] = n
                    bp.pack_ndarray_file(
                        (1.0 - dropout_factor) * (layer.W.get_value())[next_X, this_X],
                        tmpFileName, chunk_size='100M', blosc_args=blosc_args)

        n = str(uuid.uuid4()) + ".blp"
        tmpFileName = path + "/" + n
        dict_a = 'b' + str(i)
        nnet_dict[dict_a] = n
        bp.pack_ndarray_file(layer.b.get_value(), tmpFileName,
                             chunk_size='100M', blosc_args=blosc_args)

    with open(path + '/metadata.tmp', 'wb') as fp:
        pickle.dump(nnet_dict, fp, pickle.HIGHEST_PROTOCOL)
def wrapped_f():
    if path.exists(self.filename):
        return bloscpack.unpack_ndarray_file(self.filename)
    result = f()
    bloscpack.pack_ndarray_file(result, self.filename)
    return result
def __call__(self):
    if path.exists(self.filename):
        return bloscpack.unpack_ndarray_file(self.filename)
    result = self.f()
    bloscpack.pack_ndarray_file(result, self.filename)
    return result
def to_blp(ndarray: np.ndarray, serialized_filepath):
    bp.pack_ndarray_file(ndarray, serialized_filepath)
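A possible round-trip usage of to_blp is sketched below; the from_blp helper and the file path are illustrative assumptions, not part of the original source.

# Hypothetical round-trip check for to_blp (assumed, not original source).
import numpy as np
import bloscpack as bp

def from_blp(serialized_filepath) -> np.ndarray:
    # read back an array written by to_blp
    return bp.unpack_ndarray_file(serialized_filepath)

arr = np.arange(10, dtype=np.float64)
to_blp(arr, '/tmp/example.blp')
assert np.array_equal(arr, from_blp('/tmp/example.blp'))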
for i in range(dm.n_kernel):
    sys.stderr.write('%d\n' % i)
    a = bp.unpack_ndarray_file(
        os.environ['GORDON_RESULT_DIR'] + '/feature_%03d.bp' % i).reshape((-1, ))
    if which_part == 0:
        features.append(a[:len(a) / 2])
    else:
        features.append(a[len(a) / 2:])

features = np.asarray(features).T
sys.stderr.write('done in %f seconds\n' % (time.time() - t))

t = time.time()
sys.stderr.write('rotate features ...')

items_per_job = 100
features_rotated = Parallel(n_jobs=8)(
    delayed(rotate_features)(fs, i) for i, fs in enumerate(
        np.array_split(features, features.shape[0] / items_per_job)))
features_rotated = np.vstack(features_rotated)
print features_rotated.shape

bp.pack_ndarray_file(
    features_rotated,
    os.environ['GORDON_RESULT_DIR'] + '/featuresRotated_%d.bp' % which_part)
sys.stderr.write('done in %f seconds\n' % (time.time() - t))
# Final data prep, possibly dependent on coder object, so done here
# ------------------------------------------------------------------
if hasattr(coder, 'prep_data'):
    logger.info("Prepping data...")
    datas = coder.prep_data(datas, args.prep_data)

if args.permute:
    if os.path.isfile(args.permute):
        perm = bp.unpack_ndarray_file(args.permute)
    else:
        logger.warning("Given permutation file does not exist; creating " +
                       args.permute)
        perm = np.random.permutation(len(datas[0]))
        bp.pack_ndarray_file(perm, args.permute)
    for d in datas:
        d[:] = d[perm]

# partition into training and validation
train_idx = int((1.0 - args.validation) * len(datas[0]))
logger.info('Train samples {}, validation samples {}',
            train_idx, len(datas[0]) - train_idx)

# now execute requested actions
# -----------------------------
sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
sess.run(tf.global_variables_initializer())

with sess.as_default():
    if args.epochs:
"feat.proc_exists_attending_practitioner_id.blp", "feat.proc_exists_referring_practitioner_id.blp", "feat.proc_exists_operating_practitioner_id.blp", "feat.surg_unuq_claim_id.blp", "feat.proc_num_record.blp", "feat.proc_uniq_claim_id.blp", "feat.pres_unique_claim_id.blp", "feat.diag_num_record.blp", "feat.diag_uniq_claim_id.blp", "feat.diag_diagnosis_date_unique_count.blp", ] for dense_name in dense_list: print("Generating {} ...".format(dense_name)) function_name = dense_name.split('.')[1] caller = getattr(sys.modules[__name__], function_name) bp.pack_ndarray_file(caller(), dense_name) # Generating sparse features sparse_list = [ "feat.pres_drug_dv.h5", "feat.diag_physician_specialty_description_dv.h5", "feat.diag_physician_state_dv.h5", "feat.diag_physician_cbsa_dv.h5", "feat.pres_drug_bb_usc_code_dv.h5", "feat.pres_drug_generic_name_dv.h5", "feat.pres_drug_manufacturer_dv.h5", "feat.proc_physician_specialty_description_dv.h5", "feat.diag_diagnosis_code_dv.h5", "feat.diag_diagnosis_description_1gram_dv.h5", "feat.diag_primary_physician_role_dv.h5", "feat.proc_procedure_code_dv.h5",
def compress(self):
    blosc_args = self.blosc_args.copy()
    blosc_args['clevel'] = self.level
    bp.pack_ndarray_file(self.ndarray, self.storage,
                         blosc_args=blosc_args,
                         bloscpack_args=self.bloscpack_args)
def compress(self):
    blosc_args = bp.BloscArgs(clevel=self.level, cname=self.cname)
    bp.pack_ndarray_file(self.ndarray, self.storage,
                         blosc_args=blosc_args,
                         bloscpack_args=self.bloscpack_args)
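The compress variants above only write; a minimal standalone round-trip sketch using explicit BloscArgs is shown below. The file path, compression level, and codec name are illustrative assumptions, not taken from the original source.

# Hypothetical pack/unpack round trip with explicit BloscArgs (assumed values).
import numpy as np
import bloscpack as bp

data = np.random.rand(1000, 100)
blosc_args = bp.BloscArgs(clevel=7, cname='lz4')
bp.pack_ndarray_file(data, '/tmp/compressed.blp', blosc_args=blosc_args)
restored = bp.unpack_ndarray_file('/tmp/compressed.blp')
assert np.array_equal(data, restored)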
def get_incr_classificator(self, incr_datas, incr_class_label, test_datas,
                           test_class_label, method="first"):
    """
    Process the incremental part of the incremental naive Bayes classifier.
    :param incr_datas: [{"emorion-1-type": value, "sentence": {}},...]
                       (emotion-1-type and sentence are optional)
    :param incr_class_label:
    :param test_datas:
    :param test_class_label:
    :return:
    """
    def func(x, y):
        block.append(fit_incr_datas[x[3] + 1: y[3], :])
        label_block.append(incr_class_label[x[3] + 1: y[3]])
        block0.append(fit_incr_datas[y[3]: y[3] + 1, :])
        return y

    def handle(clf, method):
        if method == "zero":
            return handle_zero(clf)
        elif method == "first":
            return handle_first(clf)
        elif method == "second":
            return handle_second(clf)
        elif method == "third":
            return handle_third(clf)
        elif method == "four":
            return handle_four(clf)
        elif method == "five":
            return handle_five(clf)
        else:
            pass

    def handle_zero(clf):
        """
        Find the samples that the current classifier already predicts correctly.
        :param clf:
        :return:
        """
        incr_pre_label = clf.predict(fit_incr_datas)
        # indices of the correctly predicted samples
        true_index = (incr_class_label == incr_pre_label).nonzero()

        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)

        res = []
        for i0 in true_index[0]:
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = incr_pre_label[i0]

            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            res.append((loss0, text0, c_pred0, i0))

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return res

    def handle_first(clf):
        # the original way of computing the classification loss
        # classification loss; take the candidate with the minimum loss
        loss = 9999
        # the incremental-set text preferred for updating the classifier parameters
        text = None
        # the class of that text
        c_pred = None
        # the index of that text
        index = 0

        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        for i0 in range(fit_incr_datas.shape[0]):
            c_true0 = incr_class_label[i0: i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]

            if c_true0 == c_pred0:
                loss = 0
                text = text0
                c_pred = c_pred0
                index = i0
                break
            else:
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_my_zero_one_loss(test_proba)
                if loss0 < loss:
                    loss = loss0
                    text = text0
                    c_pred = c_pred0
                    index = i0

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return [(loss, text, c_pred, index)]

    def handle_second(clf):
        # an alternative way of computing the classification loss
        # predict_true = handle(clf, "zero")
        # if predict_true:
        #     return predict_true

        # classification loss; take the candidate with the minimum loss
        loss = 9999
        # the incremental-set text preferred for updating the classifier parameters
        text = None
        # the class of that text
        c_pred = None
        # the index of that text
        index = 0

        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        for i0 in range(fit_incr_datas.shape[0]):
            c_true0 = incr_class_label[i0: i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]

            if c_true0 == c_pred0:
                loss = 0
                text = text0
                c_pred = c_pred0
                index = i0
                break
            else:
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                if loss0 < loss:
                    loss = loss0
                    text = text0
                    c_pred = c_pred0
                    index = i0

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return [(loss, text, c_pred, index)]

    def handle_third(clf):
        # todo
        # how to obtain a suitable threshold
        def get_fit(e0):
            # obtain a suitable threshold
            return 20
            # while len((r >= e0).nonzero()[0]) == 0:
            #     e0 = int(e0 / 2)
            # return e0

        global e
        # compute the class support
        proba = clf.predict_proba(fit_incr_datas)
        label = clf.predict(fit_incr_datas)
        max_proba = np.max(proba, axis=1).reshape(-1, 1)
        second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
        # support
        r = np.divide(max_proba, second_max_proba)
        # threshold
        e = get_fit(e)
        # select
        select_indices = (r >= e).nonzero()
        return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice,
                 max_proba[indice][0]) for indice in select_indices[0]]

    def handle_third_another(clf):
        # compute the class support
        proba = clf.predict_proba(fit_incr_datas)
        label = clf.predict(fit_incr_datas)
        max_proba = np.max(proba, axis=1).reshape(-1, 1)
        leave_proba = np.sum(proba, axis=1).reshape(-1, 1) - max_proba
        # support
        r = np.divide(max_proba, leave_proba)
        # threshold
        e = 5
        # select
        select_indices = (r >= e).nonzero()
        return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice,
                 max_proba[indice][0]) for indice in select_indices[0]]

    def handle_four(clf):
        # My Own Idea
        # store the results on the test set
        predict_true = handle(clf, "zero")
        if predict_true:
            return predict_true

        f_res = []

        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        origin_label = clf.predict(test_datas)
        for i0 in range(fit_incr_datas.shape[0]):
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]

            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            label = clf.predict(test_datas)
            # take the class into account; it can happen that a sample has a high
            # probability for some class before the update and a high probability
            # after the update, but the two classes differ
            smooth = np.asarray([1 if origin_label[j] == label[j] else -1
                                 for j in range(len(origin_label))])
            np.multiply(test_proba, smooth, test_proba)

            f_test0 = pair_test(origin_proba, test_proba)
            if f_test0:
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            else:
                loss0 = -1
            f_res.append((loss0, text0, c_pred0, i0, f_test0))

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_

        res = filter(lambda x: x[4], f_res)
        return [(r[0], r[1], r[2], r[3]) for r in res]

    def handle_five(clf):
        """
        Combine class support with the absence of a significant difference.
        :param clf:
        :return:
        """
        predict_true = handle(clf, "zero")
        if predict_true:
            return predict_true

        fit_for_class_support = handle(clf, "third")
        print "The result of class-support: %d samples" % len(fit_for_class_support)

        # fit_for_class_support = filter(lambda x: x[4] > clf.bayes.class_log_prior_[np.where(clf.bayes.classes_ == x[2])[0][0]], fit_for_class_support)
        # print "The result of class-support: %d samples" % len(fit_for_class_support)

        # My Own Idea
        # store the results on the test set
        f_res = []

        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        origin_label = clf.predict(test_datas)
        for i0 in range(len(fit_for_class_support)):
            text0 = fit_for_class_support[i0][1]
            c_pred0 = fit_for_class_support[i0][2]

            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            label = clf.predict(test_datas)
            # take the class into account; it can happen that a sample has a high
            # probability for some class before the update and a high probability
            # after the update, but the two classes differ
            smooth = np.asarray([1 if origin_label[j] == label[j] else -1
                                 for j in range(len(origin_label))])
            np.multiply(test_proba, smooth, test_proba)

            f_test0 = pair_test(origin_proba, test_proba)
            if f_test0:
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            else:
                loss0 = -1
            f_res.append((loss0, text0, c_pred0, i0, f_test0))

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_

        res = filter(lambda x: x[4], f_res)
        return [(r[0], r[1], r[2], r[3]) for r in res]

    method_options = ("first", "second", "third", "four", "five")
    if method not in method_options:
        raise ValueError("method has to be one of " + str(method_options))

    print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
    # write/read the classifier parameters
    dir_ = os.path.join(TEXT_OUT, "bayes_args")
    FileUtil.mkdirs(dir_)
    suffix = ".blp"
    class_count_out = os.path.join(dir_, "class_count_" + method + suffix)
    class_log_prob_out = os.path.join(dir_, "class_log_prob_" + method + suffix)
    feature_count_out = os.path.join(dir_, "feature_count_" + method + suffix)
    feature_log_prob_out = os.path.join(dir_, "feature_log_prob_" + method + suffix)
    out = (class_count_out, class_log_prob_out,
           feature_count_out, feature_log_prob_out)

    if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
        if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
            raise ValueError("please use get_classificator() to get classificator firstly")

        fit_incr_datas = self.fit_data(incr_datas)
        incr_class_label = np.asanyarray(incr_class_label)
        # data that needs to be appended to key_words.txt
        add_to_key_words = []

        i = 0
        while fit_incr_datas.nnz > 0:
            print
            print "Begin Increment Classification_%d: %s" % (i, time.strftime('%Y-%m-%d %H:%M:%S'))

            need_to_update = handle(self, method)
            # if there is nothing to update, the remaining incremental set does not
            # suit the current classifier, so it is discarded
            # the incremental set keeps shrinking during the update
            block = []
            label_block = []
            # the training set keeps growing during the update
            block0 = []
            if need_to_update:
                # sort by loss in ascending order
                accord_to_loss = sorted(need_to_update, key=lambda x: x[0])
                for data in accord_to_loss:
                    self.bayes.update(data[2], data[1])
                # sort by index
                accord_to_index = sorted(need_to_update, key=lambda x: x[3])

                # index = [index0[3] for index0 in accord_to_index]
                # [add_to_key_words.append(raw_incr_datas[index0]) for index0 in index]
                # raw_incr_datas = [raw for index0, raw in enumerate(raw_incr_datas) if index0 not in index]

                block0.append(test_datas)
                reduce(func, accord_to_index, (0.0, "", "", -1))
                block.append(fit_incr_datas[accord_to_index[-1][3] + 1:, :])
                label_block.append(incr_class_label[accord_to_index[-1][3] + 1:])
                test_datas = sp.vstack(block0)
                print "This times updates %d samples" % len(need_to_update)
            else:
                block.append(fit_incr_datas[0:0, :])
                label_block.append(incr_class_label[0:0])
                print "Finally leaving %d samples that unnecessary added to train sets" % fit_incr_datas.shape[0]

            fit_incr_datas = sp.vstack(block)
            incr_class_label = np.concatenate(label_block)
            i += 1

        bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                      self.bayes.feature_count_, self.bayes.feature_log_prob_)
        # persist the parameters to disk
        map(lambda x: bp.pack_ndarray_file(x[0], x[1]), zip(bayes_args, out))

        # append
        # path = os.path.join(TEXT_OUT, "key_words/CHIFeature.txt")
        # FileUtil.write(path, add_to_key_words, "a")
    else:
        # speed up
        self.bayes.class_count_ = bp.unpack_ndarray_file(out[0])
        self.bayes.class_log_prior_ = bp.unpack_ndarray_file(out[1])
        self.bayes.feature_count_ = bp.unpack_ndarray_file(out[2])
        self.bayes.feature_log_prob_ = bp.unpack_ndarray_file(out[3])
        # self.bayes.class_count_ = np.loadtxt(out[0])
        # self.bayes.class_log_prior_ = np.loadtxt(out[1])
        # self.bayes.feature_count_ = np.loadtxt(out[2])
        # self.bayes.feature_log_prob_ = np.loadtxt(out[3])

    print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    print
    return self
#!/usr/bin/env python
import numpy as np
import pandas as pd
import bloscpack as bp

print 'loading data..'
df = pd.read_csv('mtgoxUSD.csv', names=['date', 'price', 'volume'])
a = np.array(df[['price', 'volume']]).T.copy()

base = 2 ** 20
for name, size in (('small', base / 16),
                   ('mid', base * 10 / 16),
                   ('large', base * 100 / 16),
                   ):
    print 'save: %s' % name
    np.save('bitcoin_%s' % name, a[:, :size])
    bp.pack_ndarray_file(a[:, :size], 'bitcoin_%s.blp' % name)
import bloscpack as bp
import gzip
import pickle
import sys

if __name__ == '__main__':
    f = gzip.open('mnist_py3k.pkl.gz', 'rb')
    ((train_set_mat, train_set_vec),
     (valid_set_mat, valid_set_vec),
     (test_set_mat, test_set_vec)) = pickle.load(f)

    blosc_args = bp.BloscArgs(clevel=9)

    bp.pack_ndarray_file(train_set_mat[0:25000, :], 'train.part0.blp',
                         chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(train_set_vec[0:25000], 'train.part0.blp.labels',
                         chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(valid_set_mat[0:5000, :], 'valid.part0.blp',
                         chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(valid_set_vec[0:5000], 'valid.part0.blp.labels',
                         chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(test_set_mat[0:5000, :], 'test.part0.blp',
                         chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(test_set_vec[0:5000], 'test.part0.blp.labels',
                         chunk_size='100M', blosc_args=blosc_args)

    bp.pack_ndarray_file(train_set_mat[25000:, :], 'train.part1.blp',
                         chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(train_set_vec[25000:], 'train.part1.blp.labels',
                         chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(valid_set_mat[5000:, :], 'valid.part1.blp',
                         chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(valid_set_vec[5000:], 'valid.part1.blp.labels',
                         chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(test_set_mat[5000:, :], 'test.part1.blp',
                         chunk_size='100M', blosc_args=blosc_args)
    bp.pack_ndarray_file(test_set_vec[5000:], 'test.part1.blp.labels',
                         chunk_size='100M', blosc_args=blosc_args)