def rank_data_set(self, data_set, cv_generator):
    super().rank_data_set(data_set, cv_generator)

    data, labels = DataSets.load(data_set)
    cv = cv_generator(labels.shape[0])

    # Collect the per-fold weights of every base feature selector.
    bench_features_selection = []
    for f in self.feature_selectors:
        bench_features_selection.append(f.weight_data_set(data_set, cv_generator))
    bench_features_selection = np.array(bench_features_selection)

    cv_indices = PreComputedData.load_cv(data_set, cv)

    # Rank each fold in a worker process and gather the results in a shared
    # dict keyed by fold index.
    feature_selection = multiprocessing.Manager().dict()
    with multiprocessing.Pool(processes=self.max_parallelism) as pool:
        for i in range(bench_features_selection.shape[1]):
            pool.apply_async(
                self.run_and_set_in_results,
                kwds={
                    'results': feature_selection,
                    'result_index': i,
                    'feature_selection': bench_features_selection[:, i],
                    'data': data[:, cv_indices[i][0]],
                    'labels': labels[cv_indices[i][0]]
                }
            )
        pool.close()
        pool.join()

    # Sort by fold index so the returned array is in a deterministic fold order.
    return np.array([ranking for _, ranking in sorted(feature_selection.items())])
def main(argv=None):
    # data_set = PDataSet()
    # data_set.calc_etf('./data/out_dir/ag1606_20160104.csv')
    # train(data_set, data_set.batch_size(), data_set.train_step())
    data_sets = DataSets()
    data_sets.gf_etf('./data/out_dir')
    train(data_sets)
def test_four():
    model_file = './../../result/20171231/model/'
    pred = Prediciton()
    pred.init_model(55, model_file)
    data_sets = DataSets()
    data_sets.gf_etf('./../../data/out_dir/temp/')
    pred.prediction(data_sets)
    pred.save()
def run(self, data_sets):
    self.results = []
    self.data_sets = [data_sets] if isinstance(data_sets, str) else data_sets
    bc_name = type(self.benchmark).__name__

    for i, data_set in enumerate(self.data_sets):
        data, labels = DataSets.load(data_set)
        result = []
        for feature_selector in self.feature_selectors:
            print("{}: {} [{}]".format(bc_name, data_set, feature_selector.__name__))
            result.append(self.benchmark.run_raw_result(
                data,
                labels,
                feature_selector.rank_data_set(data_set, self.benchmark.cv)
            ))
        result = np.array(result)
        self.results.append(result)

        print("\n{}".format(data_set.upper()))
        self._print_result(result)

    print("{} done".format(bc_name))
    self.results = np.array(self.results)
    return self.results
def weight_data_set(self, data_set, cv_generator):
    super().weight_data_set(data_set, cv_generator)
    data, labels = DataSets.load(data_set)
    cv = cv_generator(labels.shape[0])

    try:
        return PreComputedData.load(data_set, cv, "weight", self)
    except FileNotFoundError:
        print("=> Generating feature {method}s of {data_set} ({cv}) with {feature_selector}".format(
            method="weight",
            data_set=data_set,
            feature_selector=self.__name__,
            cv=type(cv).__name__
        ))
        try:
            cv_indices = PreComputedData.load_cv(data_set, cv)
        except FileNotFoundError:
            mkdir(PreComputedData.cv_dir(data_set, cv))
            cv_indices = list(cv)
            np.save(PreComputedData.cv_file_name(data_set, cv), cv_indices)

        weights = self.generate(data, labels, cv_indices, "weight")
        self.__save(data_set, cv, "weight", weights)
        return weights
def __init__(self, data_set_name, n_indices=None):
    super().__init__()
    feature_probe_labels = DataSets.load_features_labels(data_set_name)
    if feature_probe_labels is None:
        self.n_significant_features = None
    else:
        self.n_significant_features = np.sum(feature_probe_labels == 1)
    self.n_indices = self.n_significant_features if n_indices is None else n_indices
def main(argv=None):
    # data_set = PDataSet()
    # data_set.calc_etf('./data/out_dir/ag1606_20160104.csv')
    # train(data_set, data_set.batch_size(), data_set.train_step())
    np.set_printoptions(threshold=np.inf)
    data_sets = DataSets()
    data_sets.gf_etf('./../fc/data/temp_train/')
    # while data_sets.is_range():
    #     data_set = data_sets.train_batch()
    #     print("filename %s" % (data_set.file_name()))
    #     batch_index, train_x, train_y = data_set.train_batch()
    #     print batch_index
    #     print '<------------------->'
    #     print train_x
    #     print '<------------------->'
    #     print train_y
    train(data_sets)
def process(full_data, permutation_index):
    original_set_size = full_data.get_original_data_set_size()
    TEST_SIZE = int(original_set_size * (conf.TEST_PERCENTAGE / 100.0))
    train_perm, val_perm, test_perm = getPermutation(
        permutation_index, full_data.labels[:original_set_size], TEST_SIZE)
    test_data = copy.deepcopy(full_data).apply_permutation(test_perm)
    validation_data = copy.deepcopy(full_data).apply_permutation(val_perm)
    train_data = filterAndCreateTrainSet(validation_data.names, test_data.names, full_data)
    data_sets = DataSets(train_data, validation_data, test_data)
    return data_sets
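# Illustrative sketch only: getPermutation() is not defined in this snippet, so the
# helper below is a hypothetical stand-in showing one way such a split could work --
# shuffle the sample indices and carve out test, validation, and train slices.
import numpy as np

def example_permutation_split(labels, test_size, seed=0):
    # Shuffle all sample indices deterministically for a given seed.
    rng = np.random.default_rng(seed)
    perm = rng.permutation(len(labels))
    # First test_size indices become the test set, the next test_size the
    # validation set, and the remainder the training set.
    test_perm = perm[:test_size]
    val_perm = perm[test_size:2 * test_size]
    train_perm = perm[2 * test_size:]
    return train_perm, val_perm, test_perm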
def generate(self, data_set, cv):
    data, labels = DataSets.load(data_set)
    weights = PreComputedData.load(data_set, cv, "weight", self.feature_selector)
    ranks = PreComputedData.load(data_set, cv, "rank", self.feature_selector)

    stats, fig_hist_and_box, _, _ = AnalyseWeights.analyse_weights(weights.T)
    # fig_pca, fig_tsne = Analyse2D.analyse_2d(data, labels, ranks, self.features_to_filter)

    self.update_weights_plots(stats, fig_hist_and_box)
    # self.update_pca_plot(fig_pca)
    # self.update_tsne_plot(fig_tsne)
    plt.show()
    print(stats)

    if self.save_to_file:
        file_name = Analysis.file_name(data_set, cv, "weight", self.feature_selector)
        AnalyseFeatureSelection.create_directory(Analysis.dir_name(data_set, cv, "weight"))
        AnalyseFeatureSelection.save_weights_data(stats, fig_hist_and_box, file_name)
def get_stimulus(self, args, logger, filename):
    X = None
    if args.cli_mode > 0:
        if args.normalize > 0:
            X = self._X_valid if args.train_only else self._X_test
    elif os.path.exists(filename):
        logger.info("Reading stimulus from %s..." % filename)
        X = np.loadtxt(filename, dtype='float', delimiter=',', skiprows=1)
        # In Python we work with the transposed matrix
        X = np.array([X]) if X.ndim == 1 else X.T
        logger.info("Loaded %d items in stimulus file." % len(X))

    # Select the input stimuli
    logger.debug("Input data shape:{}".format(X.shape))
    X = eval("X[" + args.select_data + "]")

    XN, _ = DataSets(args.valid_file if args.train_only else args.test_file, logger).get()
    XN = eval("XN[" + args.select_data + "]")

    # Extract the label information
    label_name = args.vlabel_name if args.train_only else args.ylabel_name
    ylabels = YLabels(label_name, logger)
    label_keys = ylabels.getKeys()
    label_vals = ylabels.getValues()
    L_enable_nos = label_keys[label_vals >= -1]

    # Locate the rows to keep from the data numbers of X
    X_inds = [XN_ in L_enable_nos for XN_ in XN]
    X = X[X_inds]
    logger.debug("Input data shape:{}".format(X.shape))

    # Select the features
    indices = args.select_feat
    if ',' in indices:
        indices = '(' + indices + ')'
    X = eval("X[:," + indices + "]")
    logger.debug("Input data shape:{}".format(X.shape))

    return X
def update_average(self, args, logger):
    # Load the train data (Xfile) and the corresponding labels
    N, X = DataSets(args.Xfile, logger).get()
    labels = YLabels(args.ylabel_name, logger)

    # Only labels >= 1 are treated as elements that make up an attractor
    lset = {s if s >= 1 else None for s in set(labels.getValues())}
    lset.discard(None)
    ylbl_vals = sorted(lset)

    # Compute the mean of X for each label kind and collect the means into an array
    avgs = []
    for ylbl in ylbl_vals:
        ind_list = labels.getIndices(ylbl, N)
        avg = X[ind_list].mean(axis=0)
        avgs.append(avg)
    np_avg = np.array(avgs, dtype=float).T

    # Prepend the corresponding ylabel as the first row and overwrite the averages file
    lbl_avg = np.vstack((ylbl_vals, np_avg))
    np.savetxt(args.AVGfile, lbl_avg, delimiter=',')
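# Minimal numpy-only sketch of the averaging layout produced above (DataSets and
# YLabels are assumed helpers and not reproduced here): one column per label value,
# with the label itself in the first row, as written to the averages CSV.
import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]])   # 4 samples, 2 features
y = np.array([1, 1, 2, 2])                                       # per-sample labels
ylbl_vals = sorted(v for v in set(y) if v >= 1)                   # labels >= 1 only
np_avg = np.array([X[y == v].mean(axis=0) for v in ylbl_vals]).T  # features x labels
lbl_avg = np.vstack((ylbl_vals, np_avg))                          # label row on top
# np.savetxt('averages.csv', lbl_avg, delimiter=',')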
def run(self, data_sets):
    self.results = np.zeros((len(data_sets) + 1, len(self.feature_selectors)))
    benchmark = FMeasureBenchmark(
        classifiers=self.classifiers,
        jaccard_percentage=self.jaccard_percentage,
        beta=self.beta,
    )
    len_fs = len(self.feature_selectors)
    size = len(data_sets) * len_fs

    for i, data_set in enumerate(data_sets):
        data, labels = DataSets.load(data_set)
        for j, feature_selector in enumerate(self.feature_selectors):
            print("Progress: {:.2%}".format((i * len_fs + j) / size))
            self.results[i, j] = benchmark.run(
                data,
                labels,
                robustness_features_selection=feature_selector.rank_data_set(
                    data_set,
                    benchmark.robustness_benchmark.cv,
                ),
                accuracy_features_selection=feature_selector.rank_data_set(
                    data_set,
                    benchmark.accuracy_benchmark.cv
                )
            )

    self.results[-1, :] = self.results[:-1].mean(axis=0)
    order = np.argsort(self.results[-1])[::-1]
    self.results = self.results[:, order]

    self.row_labels = data_sets + ["Mean"]
    self.col_labels = []
    for i in order:
        self.col_labels.append(self.feature_selectors[i].__name__)

    return self.results
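# Small sketch of the result post-processing above, with made-up numbers: append a
# mean row across data sets, then reorder the columns (feature selectors) by that
# mean in descending order.
import numpy as np

results = np.zeros((3, 2))                  # 2 data sets + 1 mean row, 2 selectors
results[0] = [0.6, 0.8]
results[1] = [0.4, 0.9]
results[-1, :] = results[:-1].mean(axis=0)  # mean row: [0.5, 0.85]
order = np.argsort(results[-1])[::-1]       # best selector first: [1, 0]
results = results[:, order]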
def normalize_all(self, args, logger):
    """ Fetch all observed stimuli and write normalized CSVs. """
    if args.normalize <= 0:
        return

    # Load the AVG (attractor average) data
    AVG = np.loadtxt(args.AVGfile, dtype='float', delimiter=',', skiprows=0).T
    if len(AVG.shape) == 1:
        AVG = np.array([AVG])
    AVG_i = AVG[:, 0].astype(int)
    AVG = AVG[:, 1:]
    logger.info("Loaded {} shape category patterns from {}.".format(AVG.shape, args.AVGfile))

    # Normalize the test data and AVG
    # normalize 0: no normalization, 1: reuse normalization parameters if present, 2: renormalize
    logger.info("Normalize mode %d" % args.normalize)
    normer = MinMaxNormalizer()
    # Renormalize when normalize >= 2 or the statistics file is missing.
    upd = not os.path.exists(args.min_max_all) or args.normalize > 1
    files = {"min_max_all": args.min_max_all}
    if upd and not args.train_only:
        logger.warn("no normalization parameter (Probably test without train)")
    if not upd:
        normer.load_stat(files)

    if args.train_only:
        # Normalize the TRAIN data
        N, X = DataSets(args.Xfile, logger).get()
        logger.info("Loaded {} items in stimulus from {}.".format(X.shape, args.Xfile))
        X_norm = normer.normalize_feature_val(X, ylabel_name=None, update=upd, clip=True)
        # Write the statistics file
        if upd:
            normer.save_stat(files)
        X_out = np.vstack((N.T, X_norm.T))
        np.savetxt(self.get_norm_name(args.Xfile), X_out, delimiter=",")
        self._X = X_norm

        # Normalize the validation data
        N_valid, X_valid = DataSets(args.valid_file, logger).get()
        X_valid_norm = normer.normalize_feature_val(X_valid, ylabel_name=None, update=False, clip=True)
        X_valid_out = np.vstack((N_valid.T, X_valid_norm.T))
        np.savetxt(self.get_norm_name(args.valid_file), X_valid_out, delimiter=",")
        self._X_valid = X_valid_norm
    else:
        # Normalize the TEST data (without updating the normalization parameters)
        N_test, X_test = DataSets(args.test_file, logger).get()
        logger.info("Loaded {} items in stimulus from {}.".format(X_test.shape, args.test_file))
        X_test_norm = normer.normalize_feature_val(X_test, ylabel_name=None, update=False, clip=True)
        X_test_out = np.vstack((N_test.T, X_test_norm.T))
        np.savetxt(self.get_norm_name(args.test_file), X_test_out, delimiter=",")
        self._X_test = X_test_norm

    # Normalize the AVG file
    AVG_norm = normer.normalize_attractor(AVG, clip=True)
    AVG_out = np.vstack((AVG_i.T, AVG_norm.T))
    np.savetxt(self.get_norm_name(args.AVGfile), AVG_out, delimiter=",")
    self._AVG = AVG_norm
    return
def save_results(self, args, logger, bam, Cfg, X, BAttM_res):
    # Save the state
    saveConfig(Cfg, args.pkl_name)

    logger.info("Formatting results...")
    confBAttM = list()  # holds the confidence of every category
    for i in np.arange(0, len(Cfg.state_list)):
        rows = list()
        tmp = Cfg.state_list[i]
        # Compute the confidence from estx/estP: apply the function at every step
        # and collect the results into a list.
        cnt = 0
        for res in BAttM_res:
            conf_res = bam.confidence(tmp, res.estx, res.estP)
            # Gather each list into one vector and append it to the matrix
            rows.append(conf_res)
            cnt += 1
        confBAttM.append(rows)
    confBAttM = np.array(confBAttM).T

    zBAttM = list()
    pBAttM = list()
    pBAttM.append(np.arange(1, 1 + len(Cfg.state_list)))
    for i in np.arange(0, len(X) * args.repeat_cnt):
        # Store the prediction z of the observed stimulus in the matrix
        if BAttM_res[i].estx is not None:
            zBAttM.append(BAttM_res[i].estx)
        # Sum each column to obtain the covariance sum per attractor
        p = BAttM_res[i].estP
        if p is not None:
            tmp = BAttM_res[i].estP * BAttM_res[i].estP
            res = np.sum(tmp, axis=1)
            pBAttM.append(res)

    # Write the results
    logger.info("Writing CSVs...")
    # Confidence: a matrix holding the confidence of each category at each time step
    # (the values are not restricted to the range 0-1)
    cidx = np.array([np.arange(1, 1 + len(Cfg.state_list))], dtype=float)
    cdata = np.vstack((cidx, np.array(confBAttM)))
    np.savetxt(args.conf_file, cdata, delimiter=",")
    # Position in z space (equal to the prediction of the observed stimulus)
    zidx = np.array([np.arange(1, 1 + len(zBAttM))])
    zdata = np.hstack((zidx.T, np.array(zBAttM)))
    np.savetxt(args.z_file, zdata.T, delimiter=",")
    # Covariance sum per attractor
    np.savetxt(args.p_file, np.array(pBAttM), delimiter=",")

    # Label index with the highest confidence (1-based)
    ylabels_conf = confBAttM.argmax(axis=1) + 1
    maxes = confBAttM.max(axis=1)
    ylabels_conf[maxes == 0.0] = -1
    if args.repeat_cnt > 1:
        idcs = np.arange(args.repeat_cnt - 1, len(ylabels_conf), args.repeat_cnt)
    else:
        idcs = np.arange(0, len(ylabels_conf))

    filename = args.valid_file if args.train_only else args.test_file
    N, _ = DataSets(filename, logger).get()
    feat_nos = eval('N[' + args.select_data + ']')
    # Using the feature data numbers and the label definitions, estimate the data
    # numbers that BAM inferred and put them in the header of iBAttM.csv.
    label_name = args.vlabel_name if args.train_only else args.ylabel_name
    ylabels = YLabels(label_name, logger)
    label_keys = ylabels.getKeys()
    label_keys = eval("label_keys[" + args.select_data + "]")
    label_vals = ylabels.getValues()
    label_vals = eval("label_vals[" + args.select_data + "]")
    L_enable_nos = label_keys[label_vals >= -1]
    # Locate the extracted rows from the data numbers of X
    X_inds = [no in L_enable_nos for no in feat_nos]
    feat_nos = feat_nos[X_inds]
    idxdata = np.vstack((feat_nos, ylabels_conf[idcs]))
    np.savetxt(args.idx_file, idxdata, delimiter=",", fmt="%.0f")

    # Graph image output
    self._cbam = np.array(confBAttM)
    self._zbam = np.array(zBAttM)
    logger.info("Finished.")
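# Small numpy-only illustration of the label extraction used above: take the
# 1-based index of the most confident category per time step, and mark steps
# whose maximum confidence is exactly 0.0 with -1.
import numpy as np

confBAttM = np.array([[0.2, 0.7, 0.1],
                      [0.0, 0.0, 0.0],
                      [0.5, 0.1, 0.4]])
ylabels_conf = confBAttM.argmax(axis=1) + 1      # -> [2, 1, 1]
ylabels_conf[confBAttM.max(axis=1) == 0.0] = -1  # -> [2, -1, 1]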