def dumpInput(outputPath, data_spec):
    x = sorted(glob.glob(data_spec))
    tmp = bp.unpack_ndarray_file(x[0])
    dims = len(tmp[0])
    count = len(tmp)
    for i in range(1, len(x)):
        tmp = bp.unpack_ndarray_file(x[i])
        if len(tmp[0]) != dims:
            raise ValueError("Dimension "+str(len(tmp[0]))+" not same as first file "+str(dims))
        count = count + len(tmp)

    with open(outputPath+'/originalData.txt', 'w') as fout:
        c = 0
        fout.write(str(count)+" "+str(dims)+"\n")
        for inputFile in x:
            tmp = bp.unpack_ndarray_file(inputFile)
            for xi in range(0, tmp.shape[0]):
                x_vector = tmp[xi]
                xstr = numpy.char.mod('%f', x_vector)
                output = " ".join(xstr)+"\n"
                fout.write(output)
            c = c + len(tmp)

    if c != count:
        raise ValueError("Error output count ("+str(c)+") is not equal to calculated output count ("+str(count)+")")
    return (count, dims)
def _get_neural_features_dataset(reshape_style, classes):
    print("Loading neural features data...")
    # Paths to pickled data
    root = '/Users/miljan/PycharmProjects/entity-dependent-sentiment-mining/data/blp/glove/300d/' + classes + '/'
    data_path_train_x = root + 'train_x.blp'
    data_path_train_y = root + 'train_y.blp'
    data_path_test_x = root + 'test_x.blp'
    data_path_test_y = root + 'test_y.blp'

    # Load train data
    print("\nUnpickling training data...")
    np_train_x = bp.unpack_ndarray_file(data_path_train_x)
    # np_train_x = np_train_x[:np.floor(np_train_x.shape[0]/2.0), :, :]
    np_train_y = bp.unpack_ndarray_file(data_path_train_y)
    # np_train_y = np_train_y[:np.floor(np_train_y.shape[0]/2.0)]
    gc.collect()

    # Load test data
    print("\nUnpickling testing data...")
    np_test_x = bp.unpack_ndarray_file(data_path_test_x)
    np_test_y = bp.unpack_ndarray_file(data_path_test_y)
    gc.collect()

    # reshape data matrices into single row per sample matrices (3D > 2D reshaping)
    np_train_x, np_test_x = _reshape_vectors(reshape_style, np_train_x, np_test_x)

    print("\nDone")
    return np_train_x, np_train_y, np_test_x, np_test_y
def _file2nnet(layers, set_layer_num=-1, path="dnn.tmp", factor=1.0):
    n_layers = len(layers)
    nnet_dict = {}
    if set_layer_num == -1:
        set_layer_num = n_layers
        log("file2nnet set_layer_num is -1 so set it to " + str(set_layer_num))

    with open(path + '/metadata.tmp', 'rb') as fp:
        nnet_dict = pickle.load(fp)

    for i in range(set_layer_num):
        dict_a = 'W' + str(i)
        layer = layers[i]
        if layer.type == 'fc':
            mat_shape = layer.W.get_value().shape
            f = path + "/" + os.path.split(nnet_dict[dict_a])[1]
            layer.W.set_value(factor * np.asarray(bp.unpack_ndarray_file(f),
                                                  dtype=theano.config.floatX).reshape(mat_shape))
        elif layer.type == 'conv':
            filter_shape = layer.filter_shape
            W_array = layer.W.get_value()
            for next_X in range(filter_shape[0]):
                for this_X in range(filter_shape[1]):
                    new_dict_a = dict_a + ' ' + str(next_X) + ' ' + str(this_X)
                    mat_shape = W_array[next_X, this_X, :, :].shape
                    f = path + "/" + os.path.split(nnet_dict[new_dict_a])[1]
                    W_array[next_X, this_X, :, :] = factor * np.asarray(
                        bp.unpack_ndarray_file(f),
                        dtype=theano.config.floatX).reshape(mat_shape)
            layer.W.set_value(W_array)

        dict_a = 'b' + str(i)
        f = path + "/" + os.path.split(nnet_dict[dict_a])[1]
        layer.b.set_value(np.asarray(bp.unpack_ndarray_file(f), dtype=theano.config.floatX))
def _load_train_data(divisor, offset):
    '''
    Load subset of data
    :param divisor:
    :param offset:
    :return:
    '''
    start = offset / float(divisor)
    end = (1 + offset) / float(divisor)
    x_train = bp.unpack_ndarray_file(data_path_train_x)
    # slice boundaries must be ints (np.floor returns a float)
    data_temp = np.empty_like(
        x_train[int(np.floor(x_train.shape[0] * start)):int(np.floor(x_train.shape[0] * end)), :, :])
    np.copyto(
        data_temp,
        x_train[int(np.floor(x_train.shape[0] * start)):int(np.floor(x_train.shape[0] * end)), :, :])
    x_train = -1
    gc.collect()
    x_train = data_temp
    y_train = bp.unpack_ndarray_file(data_path_train_y)
    y_train = deepcopy(y_train[int(np.floor(y_train.shape[0] * start)):int(np.floor(y_train.shape[0] * end))])
    return x_train, y_train
def load_data(fp, polydata_instead_of_face_vertex_list=True, download_s3=True):
    from .vis3d_utilities import load_mesh_stl
    from .distributed_utilities import download_from_s3

    if ENABLE_DOWNLOAD_S3 and download_s3:
        download_from_s3(fp)

    if fp.endswith('.bp'):
        try:
            data = bp.unpack_ndarray_file(fp)
        except:
            fp = fp.replace('.bp', '.npy')
            data = np.load(fp)
    elif fp.endswith('.npy'):
        data = np.load(fp)
    elif fp.endswith('.json'):
        data = load_json(fp)
    elif fp.endswith('.pkl'):
        data = load_pickle(fp)
    elif fp.endswith('.stl'):
        data = load_mesh_stl(fp, return_polydata_only=polydata_instead_of_face_vertex_list)
    elif fp.endswith('.txt'):
        data = np.loadtxt(fp)
    elif fp.endswith('.png') or fp.endswith('.tif'):
        data = imread(fp)
    elif fp.endswith('.ini'):
        data = load_ini(fp)
    elif fp.endswith('.csv'):
        data = csv_to_dict(fp)
    else:
        raise Exception('Unrecognized file extension: ' + fp)

    return data
def get_total_size(self):
    if self.total_size is not None:
        return self.total_size
    self.total_size = 0
    for f in self.pfile_path_list:
        l = bp.unpack_ndarray_file(f + ".labels")
        self.total_size += len(l)
    return self.total_size
def load_Xy():
    X_np = np.hstack([bp.unpack_ndarray_file(fn) for fn in FEATURE_LIST])

    X_ss = None
    for fn in SPARSE_LIST:
        print("Loading {}".format(fn))
        if X_ss is None:
            X_ss = loadmat(fn)
            X_ss = ss.csr_matrix(X_ss)
        else:
            X_ss = ss.hstack([X_ss, loadmat(fn)])
            X_ss = ss.csr_matrix(X_ss)

    X_np = ss.csr_matrix(X_np)
    print("Concatenate X_ss and X_np")
    X = ss.hstack([X_np, X_ss])
    print("Convert X into CSC matrix")
    X = ss.csc_matrix(X)
    print("done")
    del X_np, X_ss

    y_train = bp.unpack_ndarray_file("feat.y.blp")
    return X, y_train
def make_dataset_from_name(size, data_name):
    base = 2**20
    to_load = {base: 'small',
               base*10: 'mid',
               base*100: 'large',
               base*1000: 'xlarge',
               }
    return bp.unpack_ndarray_file(os.path.join(DATASET_ROOT, '%s_%s.blp' % (data_name, to_load[size])))
def load_data(fp):
    if fp.endswith('bp'):
        data = bp.unpack_ndarray_file(fp)
    elif fp.endswith('jpg'):
        data = imread(fp)
    elif fp.endswith('hdf'):
        data = load_hdf_v2(fp).tolist()
    elif fp.endswith('pkl'):
        data = pickle.load(open(fp, 'r'))
    else:
        raise Exception('Not recognized.')
    return data
def load_next_partition(self, shared_xy):
    pfile_path = self.pfile_path_list[self.cur_pfile_index]

    if self.feat_mat is None or len(self.pfile_path_list) > 1:
        # log("Start reading partition " + pfile_path)
        self.feat_mat = bp.unpack_ndarray_file(pfile_path)
        self.label_vec = bp.unpack_ndarray_file(pfile_path + ".labels")

        shared_x, shared_y = shared_xy

        self.feat_mat, self.label_vec = \
            preprocess_feature_and_label(self.feat_mat, self.label_vec, self.read_opts)
        if self.read_opts['random']:
            shuffle_feature_and_label(self.feat_mat, self.label_vec)

        shared_x.set_value(self.feat_mat, borrow=True)
        shared_y.set_value(self.label_vec.astype(theano.config.floatX), borrow=True)
        # log("Finished reading partition " + pfile_path)

    self.cur_frame_num = len(self.feat_mat)

    self.cur_pfile_index += 1
    if self.cur_pfile_index >= len(self.pfile_path_list):  # the end of one epoch
        self.end_reading = True
        self.cur_pfile_index = 0

    return pfile_path
def dumpInput(outputPath, perplexity, data_spec):
    x = sorted(glob.glob(data_spec))
    tmp = bp.unpack_ndarray_file(x[0])
    dims = len(tmp[0])
    count = len(tmp)
    for i in range(1, len(x)):
        tmp = bp.unpack_ndarray_file(x[i])
        if len(tmp[0]) != dims:
            raise ValueError("Dimension "+str(len(tmp[0]))+" not same as first file "+str(dims))
        count = count + len(tmp)

    files = []
    for p in perplexity:
        files.append(createDataFile(outputPath, 'OriginalData', count, 2, dims, p))

    c = 0
    for inputFile in x:
        tmp = bp.unpack_ndarray_file(inputFile)
        for f in files:
            appendData(f, tmp)
        c = c + len(tmp)

    for data_file in files:
        closeDataFile(data_file)

    if c != count:
        raise ValueError("Error output count ("+str(c)+") is not equal to calculated output count ("+str(count)+")")
    return (count, dims)
def __init__(self):
    scene.SceneCanvas.__init__(self, keys='interactive', size=(960, 960), show=True,
                               bgcolor='black', title='MRI', vsync=False)

    self.unfreeze()

    self.view = self.central_widget.add_view()

    # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD589_thumbnail.npz')['arr_0']
    # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD589_scoreMap_2.npz')['arr_0']
    # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD589_scoreMap_1.npz')['arr_0']
    # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD589_scoreMap_9.npz')['arr_0']
    # self.vol_data = self.vol_data / self.vol_data.max()
    # self.vol_data[self.vol_data < .5] = 0

    self.vol_data = bp.unpack_ndarray_file('/home/yuncong/CSHL_volumes/volume_MD589_annotation.bp')
    # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD589_annotationAllClasses.npz')['arr_0']
    # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD589_labelmap.npz')['arr_0']
    # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD594_predMap.npz')['arr_0']
    # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD589_predMap.npz')['arr_0']

    self.vol_data = self.vol_data[::2, ::2, ::2].astype(np.float) / 9.
    # self.vol_data = np.flipud(np.rollaxis(self.vol_data, 1))

    # self.sectionTo = 150
    self.sectionTo = 50

    colors = np.loadtxt('/home/yuncong/Brain/visualization/100colors.txt')

    # self.volume = scene.visuals.Volume(self.vol_data[:, 0:self.sectionTo, :], parent=self.view.scene,
    #                                    cmap=get_colormap('coolwarm'))
    self.volume = scene.visuals.Volume(self.vol_data[:, 0:self.sectionTo, :], parent=self.view.scene,
                                       method='mip',
                                       cmap=Colormap([(0, 0, 0), (0, 1, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1),
                                                      (1, 1, 0), (0, 1, 1), (1, 1, 0), (1, 0.5, 0),
                                                      (0, 0.5, 0), (0, 0, 1)],
                                                     interpolation='linear'))
    self.volume.transform = scene.STTransform(translate=(0, 0, 0))

    CMAP = self.volume.cmap

    self.section2D = self.vol_data[:, self.sectionTo, :]

    self.plane = scene.visuals.Image(self.section2D, parent=self.view.scene, cmap=CMAP, relative_step_size=1.5)
    # self.plane.transform = scene.STTransform(translate=(0, self.sectionTo, 0))
    # self.plane.transform = scene.STTransform(translate=(0, 0, 0))
    self.plane.transform = MatrixTransform()
    self.plane.transform.rotate(90, (1, 0, 0))
    self.plane.transform.translate((0, self.sectionTo, 0))
    self.plane.attach(BlackToAlpha())

    self.view.camera = scene.cameras.ArcballCamera(parent=self.view.scene)
def get_train_X_y(train_filepath_prefix, location, fold_id=None):
    df_train = pd.read_csv(
        '.'.join([train_filepath_prefix, location + ".tsv"]),
        **KWARGS_READ_CSV)

    if isinstance(fold_id, int):
        crossval_index_filename = '.'.join([
            train_filepath_prefix,
            "index.crossval{f}".format(f=fold_id),
            location + ".blp"
        ])
        removal_index = pd.DatetimeIndex(
            bp.unpack_ndarray_file(crossval_index_filename))
        df_train.drop(removal_index, axis=0, inplace=True)

    df_train.dropna(axis=0, how="any", inplace=True)

    return df_train.values
def unpack_file(fn, encoding='utf8'):
    """ Unpack numpy array from filename

    Supports binary data with bloscpack and text data with msgpack+blosc

    >>> unpack_file('foo.blp')  # doctest: +SKIP
    array([1, 2, 3])

    See also:
        pack_file
    """
    try:
        return bloscpack.unpack_ndarray_file(fn)
    except ValueError:
        with open(fn, 'rb') as f:
            return np.array(msgpack.unpackb(blosc.decompress(f.read()),
                                            encoding=encoding))
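A minimal round-trip sketch for the bloscpack branch above; the array, the file name, and the module-level imports are illustrative and not taken from the original project:

import numpy as np
import bloscpack

arr = np.arange(12, dtype=np.float64).reshape(3, 4)
bloscpack.pack_ndarray_file(arr, 'example.blp')   # hypothetical file name
restored = unpack_file('example.blp')             # served by the bloscpack branch
assert np.array_equal(arr, restored)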
def get_incr_classificator(self, incr_datas, incr_class_label, test_datas, test_class_label, method="first"):
    """
    Process the incremental set for the incremental Bayes classifier.
    :param incr_datas: [{"emotion-1-type": value, "sentence": {}}, ...]
                       (emotion-1-type and sentence are optional)
    :param incr_class_label:
    :param test_datas:
    :param test_class_label:
    :return:
    """
    def func(x, y):
        block.append(fit_incr_datas[x[3] + 1: y[3], :])
        label_block.append(incr_class_label[x[3] + 1: y[3]])
        block0.append(fit_incr_datas[y[3]: y[3] + 1, :])
        return y

    def handle(clf, method):
        if method == "zero":
            return handle_zero(clf)
        elif method == "first":
            return handle_first(clf)
        elif method == "second":
            return handle_second(clf)
        elif method == "third":
            return handle_third(clf)
        elif method == "four":
            return handle_four(clf)
        elif method == "five":
            return handle_five(clf)
        else:
            pass

    def handle_zero(clf):
        """
        Find the samples that the current classifier already predicts correctly.
        :param clf:
        :return:
        """
        incr_pre_label = clf.predict(fit_incr_datas)
        # indices of the correctly predicted samples
        true_index = (incr_class_label == incr_pre_label).nonzero()

        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)

        res = []
        for i0 in true_index[0]:
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = incr_pre_label[i0]

            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            res.append((loss0, text0, c_pred0, i0))

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return res

    def handle_first(clf):
        # the original way of computing the classification loss
        # classification loss; keep the minimum
        loss = 9999
        # the incremental-set text preferred for updating the classifier parameters
        text = None
        # the class label of that text
        c_pred = None
        # the index of that text
        index = 0

        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_

        for i0 in range(fit_incr_datas.shape[0]):
            c_true0 = incr_class_label[i0: i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]

            if c_true0 == c_pred0:
                loss = 0
                text = text0
                c_pred = c_pred0
                index = i0
                break
            else:
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_my_zero_one_loss(test_proba)
                if loss0 < loss:
                    loss = loss0
                    text = text0
                    c_pred = c_pred0
                    index = i0

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return [(loss, text, c_pred, index)]

    def handle_second(clf):
        # an alternative way of computing the classification loss
        # predict_true = handle(clf, "zero")
        # if predict_true:
        #     return predict_true

        # classification loss; keep the minimum
        loss = 9999
        # the incremental-set text preferred for updating the classifier parameters
        text = None
        # the class label of that text
        c_pred = None
        # the index of that text
        index = 0

        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)

        for i0 in range(fit_incr_datas.shape[0]):
            c_true0 = incr_class_label[i0: i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]

            if c_true0 == c_pred0:
                loss = 0
                text = text0
                c_pred = c_pred0
                index = i0
                break
            else:
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                if loss0 < loss:
                    loss = loss0
                    text = text0
                    c_pred = c_pred0
                    index = i0

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return [(loss, text, c_pred, index)]

    def handle_third(clf):
        # todo
        # how to obtain a suitable threshold
        def get_fit(e0):
            # obtain a suitable threshold
            return 20
            # while len((r >= e0).nonzero()[0]) == 0:
            #     e0 = int(e0 / 2)
            # return e0

        global e
        # compute the class support
        proba = clf.predict_proba(fit_incr_datas)
        label = clf.predict(fit_incr_datas)
        max_proba = np.max(proba, axis=1).reshape(-1, 1)
        second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
        # support
        r = np.divide(max_proba, second_max_proba)
        # threshold
        e = get_fit(e)
        # select
        select_indices = (r >= e).nonzero()
        return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0])
                for indice in select_indices[0]]

    def handle_third_another(clf):
        # compute the class support
        proba = clf.predict_proba(fit_incr_datas)
        label = clf.predict(fit_incr_datas)
        max_proba = np.max(proba, axis=1).reshape(-1, 1)
        leave_proba = np.sum(proba, axis=1).reshape(-1, 1) - max_proba
        # support
        r = np.divide(max_proba, leave_proba)
        # threshold
        e = 5
        # select
        select_indices = (r >= e).nonzero()
        return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0])
                for indice in select_indices[0]]

    def handle_four(clf):
        # My Own Idea
        # holds the results on the test set
        predict_true = handle(clf, "zero")
        if predict_true:
            return predict_true

        f_res = []

        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        origin_label = clf.predict(test_datas)

        for i0 in range(fit_incr_datas.shape[0]):
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]

            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            label = clf.predict(test_datas)
            # take the predicted class into account: a sample may get a high probability for some class
            # both before and after the update, yet the two predicted classes may differ
            smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
            np.multiply(test_proba, smooth, test_proba)
            f_test0 = pair_test(origin_proba, test_proba)
            if f_test0:
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            else:
                loss0 = -1
            f_res.append((loss0, text0, c_pred0, i0, f_test0))

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        res = filter(lambda x: x[4], f_res)
        return [(r[0], r[1], r[2], r[3]) for r in res]

    def handle_five(clf):
        """
        Combine class support with the "no significant difference" test.
        :param clf:
        :return:
        """
        predict_true = handle(clf, "zero")
        if predict_true:
            return predict_true

        fit_for_class_support = handle(clf, "third")
        print "The result of class-support: %d samples" % len(fit_for_class_support)

        # fit_for_class_support = filter(lambda x: x[4] > clf.bayes.class_log_prior_[np.where(clf.bayes.classes_ == x[2])[0][0]], fit_for_class_support)
        # print "The result of class-support: %d samples" % len(fit_for_class_support)

        # My Own Idea
        # holds the results on the test set
        f_res = []

        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        origin_label = clf.predict(test_datas)

        for i0 in range(len(fit_for_class_support)):
            text0 = fit_for_class_support[i0][1]
            c_pred0 = fit_for_class_support[i0][2]

            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            label = clf.predict(test_datas)
            # take the predicted class into account: a sample may get a high probability for some class
            # both before and after the update, yet the two predicted classes may differ
            smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
            np.multiply(test_proba, smooth, test_proba)
            f_test0 = pair_test(origin_proba, test_proba)
            if f_test0:
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            else:
                loss0 = -1
            f_res.append((loss0, text0, c_pred0, i0, f_test0))

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        res = filter(lambda x: x[4], f_res)
        return [(r[0], r[1], r[2], r[3]) for r in res]

    method_options = ("first", "second", "third", "four", "five")
    if method not in method_options:
        raise ValueError("method has to be one of " + str(method_options))

    print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
    # paths for writing/reading the classifier parameters
    dir_ = os.path.join(TEXT_OUT, "bayes_args")
    FileUtil.mkdirs(dir_)
    suffix = ".blp"
    class_count_out = os.path.join(dir_, "class_count_" + method + suffix)
    class_log_prob_out = os.path.join(dir_, "class_log_prob_" + method + suffix)
    feature_count_out = os.path.join(dir_, "feature_count_" + method + suffix)
    feature_log_prob_out = os.path.join(dir_, "feature_log_prob_" + method + suffix)
    out = (class_count_out, class_log_prob_out, feature_count_out, feature_log_prob_out)

    if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
        if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
            raise ValueError("please use get_classificator() to obtain a classifier first")

        fit_incr_datas = self.fit_data(incr_datas)
        incr_class_label = np.asanyarray(incr_class_label)
        # data that should be appended to key_words.txt
        add_to_key_words = []

        i = 0
        while fit_incr_datas.nnz > 0:
            print
            print "Begin Increment Classification_%d: %s" % (i, time.strftime('%Y-%m-%d %H:%M:%S'))

            need_to_update = handle(self, method)
            # if nothing can be updated, the remaining incremental set does not suit the
            # current classifier, so it is discarded
            # the incremental set shrinks with every update
            block = []
            label_block = []
            # the training set grows with every update
            block0 = []

            if need_to_update:
                # sort by loss in ascending order
                accord_to_loss = sorted(need_to_update, key=lambda x: x[0])
                for data in accord_to_loss:
                    self.bayes.update(data[2], data[1])

                # sort by index
                accord_to_index = sorted(need_to_update, key=lambda x: x[3])

                # index = [index0[3] for index0 in accord_to_index]
                # [add_to_key_words.append(raw_incr_datas[index0]) for index0 in index]
                # raw_incr_datas = [raw for index0, raw in enumerate(raw_incr_datas) if index0 not in index]

                block0.append(test_datas)
                reduce(func, accord_to_index, (0.0, "", "", -1))
                block.append(fit_incr_datas[accord_to_index[-1][3] + 1:, :])
                label_block.append(incr_class_label[accord_to_index[-1][3] + 1:])
                test_datas = sp.vstack(block0)
                print "This round updates %d samples" % len(need_to_update)
            else:
                block.append(fit_incr_datas[0:0, :])
                label_block.append(incr_class_label[0:0])
                print "Finally leaving %d samples that need not be added to the train set" % fit_incr_datas.shape[0]

            fit_incr_datas = sp.vstack(block)
            incr_class_label = np.concatenate(label_block)
            i += 1

        bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                      self.bayes.feature_count_, self.bayes.feature_log_prob_)
        # save the parameters to disk
        map(lambda x: bp.pack_ndarray_file(x[0], x[1]), zip(bayes_args, out))

        # append
        # path = os.path.join(TEXT_OUT, "key_words/CHIFeature.txt")
        # FileUtil.write(path, add_to_key_words, "a")
    else:
        # speed up
        self.bayes.class_count_ = bp.unpack_ndarray_file(out[0])
        self.bayes.class_log_prior_ = bp.unpack_ndarray_file(out[1])
        self.bayes.feature_count_ = bp.unpack_ndarray_file(out[2])
        self.bayes.feature_log_prob_ = bp.unpack_ndarray_file(out[3])
        # self.bayes.class_count_ = np.loadtxt(out[0])
        # self.bayes.class_log_prior_ = np.loadtxt(out[1])
        # self.bayes.feature_count_ = np.loadtxt(out[2])
        # self.bayes.feature_log_prob_ = np.loadtxt(out[3])

    print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    print
    return self
def __call__(self):
    if path.exists(self.filename):
        return bloscpack.unpack_ndarray_file(self.filename)
    result = self.f()
    bloscpack.pack_ndarray_file(result, self.filename)
    return result
def wrapped_f():
    if path.exists(self.filename):
        return bloscpack.unpack_ndarray_file(self.filename)
    result = f()
    bloscpack.pack_ndarray_file(result, self.filename)
    return result
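The two fragments above (`__call__` and `wrapped_f`) read like pieces of a file-backed memoization decorator. A minimal self-contained sketch of that pattern, with an assumed class name and a parameterless target function, could look like this:

import os.path as path

import bloscpack
import numpy as np


class blp_cache(object):
    # Assumed name: caches a function's ndarray result on disk as a .blp file.
    def __init__(self, filename):
        self.filename = filename

    def __call__(self, f):
        def wrapped_f():
            # serve from the cache file when it already exists
            if path.exists(self.filename):
                return bloscpack.unpack_ndarray_file(self.filename)
            # otherwise compute, persist, and return the result
            result = f()
            bloscpack.pack_ndarray_file(result, self.filename)
            return result
        return wrapped_f


@blp_cache('expensive.blp')  # hypothetical cache file
def expensive():
    return np.random.rand(1000, 1000)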
    features_rotated = np.reshape([np.roll(features_tabular[i], -ai, axis=-1)
                                   for i, ai in enumerate(max_angle_indices)],
                                  (fs.shape[0], dm.n_freq * dm.n_angle))

    return features_rotated

t = time.time()
sys.stderr.write('load filtered values ...')

features = []
for i in range(dm.n_kernel):
    sys.stderr.write('%d\n' % i)
    a = bp.unpack_ndarray_file(os.environ['GORDON_RESULT_DIR'] + '/feature_%03d.bp' % i).reshape((-1, ))
    if which_part == 0:
        features.append(a[:len(a) / 2])
    else:
        features.append(a[len(a) / 2:])
features = np.asarray(features).T

sys.stderr.write('done in %f seconds\n' % (time.time() - t))

t = time.time()
sys.stderr.write('rotate features ...')

items_per_job = 100
def read_blp(serialized_filepath):
    return bp.unpack_ndarray_file(serialized_filepath)
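For symmetry with read_blp, a hypothetical writer (not part of the snippet above) would use bloscpack's packing call:

def write_blp(array, serialized_filepath):
    # serialize a NumPy array to a .blp file with bloscpack
    bp.pack_ndarray_file(array, serialized_filepath)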
b_conf_matrix = True
word_vector_size = 300
sentence_length = 43

# Load train data
print "\nunpickling training data..."
x_train, y_train = _load_train_data(data_ratio, 0.0)
# pdb.set_trace()
print "\nTraining matrix shape:"
print x_train.shape
print "\nTraining matrix size in bytes: "
print x_train.nbytes

# Load test data
print "\nunpickling testing data..."
x_test = bp.unpack_ndarray_file(data_path_test_x)
y_test = bp.unpack_ndarray_file(data_path_test_y)
N_test = x_test.shape[0]
print N_test

# data_info
print "\nreading info..."
N = x_train.shape[0]
dim = x_train.shape[1]
max_len = x_train.shape[2]

# reshape data to match chainer format
x_train = np.reshape(x_train, (x_train.shape[0], 1, word_vector_size, max_len))

# Hyper search params
import pylab as Plot
# from OriginalTSNE import tsne
from BHTSNEDropInReplacementTSNE import processResultFileName
import bloscpack as bp
import pandas as pd
import seaborn as sns
import glob
import numpy as Math              # the fragment uses numpy under the alias "Math"
import matplotlib.pyplot as plt   # needed for plt.subplots below

if __name__ == "__main__":
    outputLabels = '/tmp/data/testpartition*.blp.labels'
    y = sorted(glob.glob(outputLabels))
    print("Found labels " + str(y))
    files = ['resultLayer7Perplexity20.000000.dat',
             'resultLayer7Perplexity30.000000.dat',
             'resultLayer7Perplexity5.000000.dat',
             'resultLayer7Perplexity50.000000.dat']
    for f in files:
        inputData = f
        labels = bp.unpack_ndarray_file(y[0])
        for i in range(1, len(y)):
            tmpl = bp.unpack_ndarray_file(y[i])
            labels = Math.concatenate((labels, tmpl))
        Y = processResultFileName(inputData)
        if Y.shape[0] != labels.shape[0]:
            raise ValueError("X shape does not match label shape!!")
        print(Y.shape)
        print(labels.shape)
        df = pd.DataFrame(Y, columns=['x', 'y'])
        df['label'] = labels
        class1 = df.query("label == 1.0")
        class2 = df.query("label == 0.0")
        sns.set(style="darkgrid")
        f, ax = plt.subplots(figsize=(8, 8))
        ax.set_aspect("equal")
pred_file = sys.argv[1]
if '.gz' in pred_file:
    pred_mat = pickle.load(gzip.open(pred_file, 'rb'))
else:
    pred_mat = pickle.load(open(pred_file, 'rb'))

l = sorted(glob.glob(sys.argv[2]))
if len(l) == 0:
    log("ERROR in show_results. Test partitions is empty. Argument " + sys.argv[2])

subclassificationMapping = pd.read_csv("/ssd/subclassificationMapping", sep="=",
                                       names=["NUMBER", "NAME"], index_col="NUMBER")
print(subclassificationMapping.loc[121][0])

test_labels = bp.unpack_ndarray_file(l[0] + ".labels")
test_labels = test_labels.astype(numpy.int32)

# Read the subclassifications
assert l[0][-4:] == '.blp', "Invalid extension " + l[0][-4:]
fn = l[0][:-4] + ".ignored.csv.gz"
df = pd.read_csv(fn, sep=';', usecols=['SUBCLASSIFICATION'], dtype=numpy.int32)
assert df.shape[0] == test_labels.shape[0], "Shapes not equal"
# End read the subclassification

for i in range(1, len(l)):
    stop_if_stop_is_requested()
    lab = bp.unpack_ndarray_file(l[i] + ".labels")
    lab = lab.astype(numpy.int32)
    test_labels = numpy.concatenate((test_labels, lab))
def decompress(self):
    it = bp.unpack_ndarray_file(self.storage)
    pickle.dump((l_overall_sentiment, d_source_sentiment), open('./stats_july.pkl', 'wb'))


if __name__ == '__main__':
    # get the models
    ensemble_model_data = conv_net_model.get_ensemble()

    # load test data
    root_path = '/Users/miljan/PycharmProjects/entity-dependent-sentiment-mining/data/blp/word2vec/two_classes/'
    s_x_test = 'test_x.blp'
    s_y_test = 'test_y.blp'

    # Load test data
    print "\nunpickling testing data..."
    x_test = bp.unpack_ndarray_file(root_path + s_x_test)
    y_test = bp.unpack_ndarray_file(root_path + s_y_test)

    # reshape data to match chainer format
    x_test = np.reshape(x_test, (x_test.shape[0], 1, 300, 43))

    print 'Predicting'
    predictions = []
    for entry in x_test:
        predictions.append(conv_net_model.ensemble_predict(entry, ensemble_model_data))

    counter = 0
    for real, pred in zip(y_test, predictions):
        if real == pred:
    tf_parameter_dict = load_alignment_parameters_v2(stack_f=atlas_name, stack_m=stack,
                                                     warp_setting=24,
                                                     vol_type_f='annotationAsScore',
                                                     vol_type_m='annotationAsScore',
                                                     downscale=32)
    cf = tf_parameter_dict['centroid_f']
    cm = tf_parameter_dict['centroid_m']
    of = tf_parameter_dict['crop_origin_f']
    om = tf_parameter_dict['crop_origin_m']
    params = tf_parameter_dict['params']

    Rt = np.reshape(params, (3, 4))
    R = Rt[:3, :3]
    t = Rt[:3, 3]

    moving_brain_markers_raw = bp.unpack_ndarray_file(
        get_stacy_markers_filepath(stack=stack, structure='All'))

    brain_markers_aligned2atlas = np.dot(R, (moving_brain_markers_raw - om - cm).T).T + t + of + cf

    all_markers[stack] = brain_markers_aligned2atlas

#%%
thickness = 5
cut_plane_normal = (0., 0., 1.)
testplot = 0
markers_by_struct = pd.DataFrame()

for OZ in range(-440, 440, 10):
    cut_plane_origin = (0., 0., OZ)
    ######
def read_numpy(path):
    return bp.unpack_ndarray_file(path)
module = importlib.import_module(args.module)
coder_class = getattr(module, args.model)

# load data
# ---------
# Primary dataset
if args.data is None:
    # load mnist as default
    train_data = mnist_data.read_data_sets('MNIST_data').train
    datas = [train_data.images]
    if hasattr(coder_class, 'prep_mnist'):
        datas = [coder_class.prep_mnist(datas[0])]
else:
    datas = [bp.unpack_ndarray_file(fname) for fname in args.data]
    if data_fn:
        datas = data_fn(datas)

train_mode = 'recode'

# auxiliary optional datasets
if args.labels:
    labels = bp.unpack_ndarray_file(args.labels)
    datas.append(labels)
    train_mode = 'label'

if args.targets:
    targets = bp.unpack_ndarray_file(args.targets)
    if target_fn:
        targets = target_fn(targets)
    datas.append(targets)
    train_mode = 'target'