Example #1
    def rank_data_set(self, data_set, cv_generator):
        super().rank_data_set(data_set, cv_generator)
        bench_features_selection = []

        _, labels = DataSets.load(data_set)
        cv = cv_generator(labels.shape[0])

        for f in self.feature_selectors:
            bench_features_selection.append(f.weight_data_set(data_set, cv_generator))
        bench_features_selection = np.array(bench_features_selection)

        data, labels = DataSets.load(data_set)
        cv_indices = PreComputedData.load_cv(data_set, cv)

        feature_selection = multiprocessing.Manager().dict()

        with multiprocessing.Pool(processes=self.max_parallelism) as pool:
            for i in range(bench_features_selection.shape[1]):
                pool.apply_async(
                    self.run_and_set_in_results,
                    kwds={
                        'results': feature_selection,
                        'result_index': i,
                        'feature_selection': bench_features_selection[:, i],
                        'data': data[:, cv_indices[i][0]],
                        'labels': labels[cv_indices[i][0]]
                    }
                )
            pool.close()
            pool.join()

        # The manager dict is filled asynchronously, so sort by fold index before stacking
        return np.array([ranking for _, ranking in sorted(feature_selection.items())])
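
The workers in Example #1 write their per-fold output into the shared feature_selection dict through run_and_set_in_results, which is not shown on this page. A minimal sketch of what such a helper might look like, assuming the class exposes some combine step for the stacked selector weights (both the method body and the combine call are hypothetical):

    def run_and_set_in_results(self, results, result_index, feature_selection,
                               data, labels):
        # Hypothetical sketch: turn the stacked per-selector weights for this CV fold
        # into a single ranking and store it under the fold's index, so the caller
        # can reassemble the folds in order after pool.join().
        results[result_index] = self.combine(feature_selection, data, labels)  # combine() is assumed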
Example #2
def main(argv=None):
    # data_set = PDataSet()
    # data_set.calc_etf('./data/out_dir/ag1606_20160104.csv')
    # train(data_set, data_set.batch_size(),data_set.train_step())
    data_sets = DataSets()
    data_sets.gf_etf('./data/out_dir')
    train(data_sets)
Example #3
def test_four():
    model_file = './../../result/20171231/model/'
    pred = Prediciton()
    pred.init_model(55, model_file)
    data_sets = DataSets()
    data_sets.gf_etf('./../../data/out_dir/temp/')
    pred.prediction(data_sets)
    pred.save()
Example #4
    def run(self, data_sets):
        self.results = []
        self.data_sets = [data_sets] if isinstance(data_sets,
                                                   str) else data_sets
        bc_name = type(self.benchmark).__name__

        for i, data_set in enumerate(self.data_sets):
            data, labels = DataSets.load(data_set)
            result = []

            for feature_selector in self.feature_selectors:
                print("{}: {} [{}]".format(bc_name, data_set,
                                           feature_selector.__name__))

                result.append(
                    self.benchmark.run_raw_result(
                        data, labels,
                        feature_selector.rank_data_set(data_set,
                                                       self.benchmark.cv)))

            result = np.array(result)
            self.results.append(result)

            print("\n{}".format(data_set.upper()))
            self._print_result(result)

        print("{} done".format(bc_name))

        self.results = np.array(self.results)

        return self.results
Example #5
    def weight_data_set(self, data_set, cv_generator):
        super().weight_data_set(data_set, cv_generator)

        data, labels = DataSets.load(data_set)
        cv = cv_generator(labels.shape[0])

        try:
            return PreComputedData.load(data_set, cv, "weight", self)
        except FileNotFoundError:

            print(
                "=> Generating feature {method}s of {data_set} ({cv}) with {feature_selector}"
                .format(method="weight",
                        data_set=data_set,
                        feature_selector=self.__name__,
                        cv=type(cv).__name__))

            try:
                cv_indices = PreComputedData.load_cv(data_set, cv)
            except FileNotFoundError:
                mkdir(PreComputedData.cv_dir(data_set, cv))

                cv_indices = list(cv)
                np.save(PreComputedData.cv_file_name(data_set, cv), cv_indices)

            weights = self.generate(data, labels, cv_indices, "weight")
            self.__save(data_set, cv, "weight", weights)

            return weights
Example #6
    def run(self, data_sets):
        self.results = []
        self.data_sets = [data_sets] if isinstance(data_sets, str) else data_sets
        bc_name = type(self.benchmark).__name__

        for i, data_set in enumerate(self.data_sets):
            data, labels = DataSets.load(data_set)
            result = []

            for feature_selector in self.feature_selectors:
                print("{}: {} [{}]".format(
                    bc_name,
                    data_set,
                    feature_selector.__name__
                ))

                result.append(self.benchmark.run_raw_result(
                    data,
                    labels,
                    feature_selector.rank_data_set(data_set, self.benchmark.cv)
                ))

            result = np.array(result)
            self.results.append(result)

            print("\n{}".format(data_set.upper()))
            self._print_result(result)

        print("{} done".format(bc_name))

        self.results = np.array(self.results)

        return self.results
Example #7
    def weight_data_set(self, data_set, cv_generator):
        super().weight_data_set(data_set, cv_generator)

        data, labels = DataSets.load(data_set)
        cv = cv_generator(labels.shape[0])

        try:
            return PreComputedData.load(data_set, cv, "weight", self)
        except FileNotFoundError:

            print("=> Generating feature {method}s of {data_set} ({cv}) with {feature_selector}".format(
                method="weight",
                data_set=data_set,
                feature_selector=self.__name__,
                cv=type(cv).__name__
            ))

            try:
                cv_indices = PreComputedData.load_cv(data_set, cv)
            except FileNotFoundError:
                mkdir(PreComputedData.cv_dir(data_set, cv))

                cv_indices = list(cv)
                np.save(PreComputedData.cv_file_name(data_set, cv), cv_indices)

            weights = self.generate(data, labels, cv_indices, "weight")
            self.__save(data_set, cv, "weight", weights)

            return weights
Example #8
    def __init__(self, data_set_name, n_indices=None):
        super().__init__()
        feature_probe_labels = DataSets.load_features_labels(data_set_name)
        if feature_probe_labels is None:
            self.n_significant_features = None
        else:
            self.n_significant_features = np.sum([feature_probe_labels == 1])
        self.n_indices = self.n_significant_features if n_indices is None else n_indices
Example #9
    def __init__(self, data_set_name, n_indices=None):
        super().__init__()
        feature_probe_labels = DataSets.load_features_labels(data_set_name)
        if feature_probe_labels is None:
            self.n_significant_features = None
        else:
            self.n_significant_features = np.sum([feature_probe_labels == 1])
        self.n_indices = self.n_significant_features if n_indices is None else n_indices
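
The two __init__ snippets above count the probe features labelled as significant with np.sum([feature_probe_labels == 1]). A tiny self-contained illustration of that counting step, with made-up labels:

    import numpy as np

    feature_probe_labels = np.array([1, 0, 1, -1, 1])     # illustrative labels only
    n_significant = np.sum([feature_probe_labels == 1])   # counts entries equal to 1
    assert n_significant == 3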
Example #10
def main(argv=None):
    # data_set = PDataSet()
    # data_set.calc_etf('./data/out_dir/ag1606_20160104.csv')
    # train(data_set, data_set.batch_size(),data_set.train_step())
    np.set_printoptions(threshold=np.inf)
    data_sets = DataSets()
    data_sets.gf_etf('./../fc/data/temp_train/')

    # while data_sets.is_range():
    #    data_set = data_sets.train_batch()
    #    print("filename %s" %(data_set.file_name()))
    #    batch_index, train_x, train_y = data_set.train_batch()
    #    print batch_index
    #    print '<------------------->'
    #    print train_x
    #    print '<------------------->'
    #    print train_y
    train(data_sets)
Example #11
def process(full_data, permutation_index):
    original_set_size = full_data.get_original_data_set_size()
    TEST_SIZE = int(original_set_size * (conf.TEST_PERCENTAGE / 100.0))

    train_perm, val_perm, test_perm = getPermutation(
        permutation_index, full_data.labels[:original_set_size], TEST_SIZE)

    test_data = copy.deepcopy(full_data).apply_permutation(test_perm)
    validation_data = copy.deepcopy(full_data).apply_permutation(val_perm)
    train_data = filterAndCreateTrainSet(validation_data.names,
                                         test_data.names, full_data)

    data_sets = DataSets(train_data, validation_data, test_data)

    return data_sets
Example #12
    def generate(self, data_set, cv):
        data, labels = DataSets.load(data_set)
        weights = PreComputedData.load(data_set, cv, "weight", self.feature_selector)
        ranks = PreComputedData.load(data_set, cv, "rank", self.feature_selector)
        stats, fig_hist_and_box, _, _ = AnalyseWeights.analyse_weights(weights.T)
        #fig_pca, fig_tsne = Analyse2D.analyse_2d(data, labels, ranks, self.features_to_filter)

        self.update_weights_plots(stats, fig_hist_and_box)
        #self.update_pca_plot(fig_pca)
        #self.update_tsne_plot(fig_tsne)
        plt.show()
        print(stats)

        if self.save_to_file:
            file_name = Analysis.file_name(data_set, cv, "weight", self.feature_selector)
            AnalyseFeatureSelection.create_directory(Analysis.dir_name(data_set, cv, "weight"))
            AnalyseFeatureSelection.save_weights_data(stats, fig_hist_and_box, file_name)
Example #13
    def get_stimulus(self, args, logger, filename):
        X = None
        if args.cli_mode > 0:
            if args.normalize > 0:
                X = self._X_valid if args.train_only else self._X_test
            elif os.path.exists(filename):
                logger.info("Reading stimulus from %s..." % filename)
                X = np.loadtxt(filename,
                               dtype='float',
                               delimiter=',',
                               skiprows=1)
                # In Python, work with the transposed matrix
                X = np.array([X]) if X.ndim == 1 else X.T
                logger.info("Loaded %d items in stimulus file." % len(X))

        # Select the input stimuli
        logger.debug("Input data shape:{}".format(X.shape))
        X = eval("X[" + args.select_data + "]")

        XN, _ = DataSets(
            args.valid_file if args.train_only else args.test_file,
            logger).get()
        XN = eval("XN[" + args.select_data + "]")

        # Extract the label information
        label_name = args.vlabel_name if args.train_only else args.ylabel_name
        ylabels = YLabels(label_name, logger)
        label_keys = ylabels.getKeys()
        label_vals = ylabels.getValues()
        L_enable_nos = label_keys[label_vals >= -1]

        # Determine the extraction positions from X's data numbers
        X_inds = [XN_ in L_enable_nos for XN_ in XN]
        X = X[X_inds]

        logger.debug("Input data shape:{}".format(X.shape))

        # Select the features
        indices = args.select_feat
        if ',' in indices:
            indices = '(' + indices + ')'
        X = eval("X[:," + indices + "]")
        logger.debug("Input data shape:{}".format(X.shape))

        return X
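
get_stimulus above splices the --select_data argument directly into an indexing expression via eval. A small sketch of what that evaluates to for an assumed argument value (the string "0:100" is illustrative only):

    import numpy as np

    X = np.arange(300).reshape(150, 2)
    select_data = "0:100"                     # assumed CLI value, purely illustrative
    subset = eval("X[" + select_data + "]")   # equivalent to X[0:100]
    assert subset.shape == (100, 2)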
Example #14
    def update_average(self, args, logger):
        # Load the train data (Xfile) and the corresponding labels
        N, X = DataSets(args.Xfile, logger).get()
        labels = YLabels(args.ylabel_name, logger)
        # Treat only labels >= 1 as elements that make up attractors
        lset = {s if s >= 1 else None for s in set(labels.getValues())}
        lset.discard(None)
        ylbl_vals = sorted(lset)

        # Compute the mean of X for each label type and collect the means into an array
        avgs = []
        for ylbl in ylbl_vals:
            ind_list = labels.getIndices(ylbl, N)
            avg = X[ind_list].mean(axis=0)
            avgs.append(avg)
        np_avg = np.array(avgs, dtype=float).T

        # Prepend the corresponding ylabels as the first row and overwrite the averages file
        lbl_avg = np.vstack((ylbl_vals, np_avg))
        np.savetxt(args.AVGfile, lbl_avg, delimiter=',')
Example #15
    def run(self, data_sets):
        self.results = np.zeros((len(data_sets) + 1, len(self.feature_selectors)))

        benchmark = FMeasureBenchmark(
            classifiers=self.classifiers,
            jaccard_percentage=self.jaccard_percentage,
            beta=self.beta,
        )

        len_fs = len(self.feature_selectors)
        size = len(data_sets) * len_fs

        for i, data_set in enumerate(data_sets):
            data, labels = DataSets.load(data_set)

            for j, feature_selector in enumerate(self.feature_selectors):
                print("Progress: {:.2%}".format((i * len_fs + j)/size))
                self.results[i, j] = benchmark.run(
                    data,
                    labels,
                    robustness_features_selection=feature_selector.rank_data_set(
                        data_set,
                        benchmark.robustness_benchmark.cv,
                    ),
                    accuracy_features_selection=feature_selector.rank_data_set(
                        data_set,
                        benchmark.accuracy_benchmark.cv
                    )
                )

        self.results[-1, :] = self.results[:-1].mean(axis=0)

        order = np.argsort(self.results[-1])[::-1]
        self.results = self.results[:, order]

        self.row_labels = data_sets + ["Mean"]
        self.col_labels = []
        for i in order:
            self.col_labels.append(self.feature_selectors[i].__name__)

        return self.results
Example #16
    def run(self, data_sets):
        self.results = np.zeros(
            (len(data_sets) + 1, len(self.feature_selectors)))

        benchmark = FMeasureBenchmark(
            classifiers=self.classifiers,
            jaccard_percentage=self.jaccard_percentage,
            beta=self.beta,
        )

        len_fs = len(self.feature_selectors)
        size = len(data_sets) * len_fs

        for i, data_set in enumerate(data_sets):
            data, labels = DataSets.load(data_set)

            for j, feature_selector in enumerate(self.feature_selectors):
                print("Progress: {:.2%}".format((i * len_fs + j) / size))
                self.results[i, j] = benchmark.run(
                    data,
                    labels,
                    robustness_features_selection=feature_selector.
                    rank_data_set(
                        data_set,
                        benchmark.robustness_benchmark.cv,
                    ),
                    accuracy_features_selection=feature_selector.rank_data_set(
                        data_set, benchmark.accuracy_benchmark.cv))

        self.results[-1, :] = self.results[:-1].mean(axis=0)

        order = np.argsort(self.results[-1])[::-1]
        self.results = self.results[:, order]

        self.row_labels = data_sets + ["Mean"]
        self.col_labels = []
        for i in order:
            self.col_labels.append(self.feature_selectors[i].__name__)

        return self.results
Example #17
    def generate(self, data_set, cv):
        data, labels = DataSets.load(data_set)
        weights = PreComputedData.load(data_set, cv, "weight",
                                       self.feature_selector)
        ranks = PreComputedData.load(data_set, cv, "rank",
                                     self.feature_selector)
        stats, fig_hist_and_box, _, _ = AnalyseWeights.analyse_weights(
            weights.T)
        #fig_pca, fig_tsne = Analyse2D.analyse_2d(data, labels, ranks, self.features_to_filter)

        self.update_weights_plots(stats, fig_hist_and_box)
        #self.update_pca_plot(fig_pca)
        #self.update_tsne_plot(fig_tsne)
        plt.show()
        print(stats)

        if self.save_to_file:
            file_name = Analysis.file_name(data_set, cv, "weight",
                                           self.feature_selector)
            AnalyseFeatureSelection.create_directory(
                Analysis.dir_name(data_set, cv, "weight"))
            AnalyseFeatureSelection.save_weights_data(stats, fig_hist_and_box,
                                                      file_name)
Example #18
    def normalize_all(self, args, logger):
        """ 全観測刺激を取得し、正規化CSVを出力する """
        if args.normalize <= 0:
            return

        # Load the AVG data
        AVG = np.loadtxt(args.AVGfile,
                         dtype='float',
                         delimiter=',',
                         skiprows=0).T
        if len(AVG.shape) == 1:
            AVG = np.array([AVG])
        AVG_i = AVG[:, 0].astype(int)
        AVG = AVG[:, 1:]
        logger.info("Loaded {} shape category patterns from {}.".format(
            AVG.shape, args.AVGfile))
        # Normalize the test and AVG data

        # normalize 0: no normalization, 1: reuse normalization parameters if available, 2: recompute normalization
        logger.info("Normalize mode %d" % args.normalize)

        normer = MinMaxNormalizer()
        # Normalize from scratch when normalize >= 2 or the statistics file is missing.
        upd = not os.path.exists(args.min_max_all) or args.normalize > 1
        files = {"min_max_all": args.min_max_all}
        if upd and not args.train_only:
            logger.warn(
                "no normalization parameter (Probably test without train)")
        if not upd:
            normer.load_stat(files)

        if args.train_only:
            # Normalize the TRAIN data
            N, X = DataSets(args.Xfile, logger).get()
            logger.info("Loaded {} items in stimulus from {}.".format(
                X.shape, args.Xfile))
            X_norm = normer.normalize_feature_val(X,
                                                  ylabel_name=None,
                                                  update=upd,
                                                  clip=True)
            # Write the statistics file
            if upd:
                normer.save_stat(files)
            X_out = np.vstack((N.T, X_norm.T))
            np.savetxt(self.get_norm_name(args.Xfile), X_out, delimiter=",")
            self._X = X_norm

            # Normalize the validation data
            N_valid, X_valid = DataSets(args.valid_file, logger).get()
            X_valid_norm = normer.normalize_feature_val(X_valid,
                                                        ylabel_name=None,
                                                        update=False,
                                                        clip=True)
            X_valid_out = np.vstack((N_valid.T, X_valid_norm.T))
            np.savetxt(self.get_norm_name(args.valid_file),
                       X_valid_out,
                       delimiter=",")
            self._X_valid = X_valid_norm
        else:
            # Normalize the TEST data (without updating the normalization parameters)
            N_test, X_test = DataSets(args.test_file, logger).get()
            logger.info("Loaded {} items in stimulus from {}.".format(
                X_test.shape, args.test_file))
            X_test_norm = normer.normalize_feature_val(X_test,
                                                       ylabel_name=None,
                                                       update=False,
                                                       clip=True)
            X_test_out = np.vstack((N_test.T, X_test_norm.T))
            np.savetxt(self.get_norm_name(args.test_file),
                       X_test_out,
                       delimiter=",")
            self._X_test = X_test_norm

        # Process the AVG file
        AVG_norm = normer.normalize_attractor(AVG, clip=True)
        AVG_out = np.vstack((AVG_i.T, AVG_norm.T))
        np.savetxt(self.get_norm_name(args.AVGfile), AVG_out, delimiter=",")
        self._AVG = AVG_norm
        return
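
MinMaxNormalizer is not shown on this page. As a point of reference, a plain NumPy min-max rescaling of feature columns, roughly what such a normalizer typically computes, might look like the sketch below; the actual class may behave differently (the clipping to [0, 1] and the saved-statistics tuple are assumptions):

    import numpy as np

    def min_max_normalize(X, stats=None, clip=True):
        # Sketch only: rescale each feature column to [0, 1] using either freshly
        # computed column minima/maxima or previously saved ones.
        if stats is None:
            stats = (X.min(axis=0), X.max(axis=0))
        lo, hi = stats
        span = np.where(hi - lo == 0, 1, hi - lo)   # avoid division by zero
        X_norm = (X - lo) / span
        if clip:
            X_norm = np.clip(X_norm, 0.0, 1.0)
        return X_norm, stats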
Example #19
    def save_results(self, args, logger, bam, Cfg, X, BAttM_res):
        # Save the state
        saveConfig(Cfg, args.pkl_name)

        logger.info("Formatting results...")

        confBAttM = list()

        # Store the confidence values for all categories
        for i in np.arange(0, len(Cfg.state_list)):
            rows = list()
            tmp = Cfg.state_list[i]
            # Compute the confidence from estx/estP
            # Apply the function at each step and store the results in a list
            cnt = 0
            for res in BAttM_res:
                conf_res = bam.confidence(tmp, res.estx, res.estP)
                # Collect the list into a single vector and append it to the matrix
                rows.append(conf_res)
                cnt += 1
            confBAttM.append(rows)

        confBAttM = np.array(confBAttM).T

        zBAttM = list()

        pBAttM = list()
        pBAttM.append(np.arange(1, 1 + len(Cfg.state_list)))

        for i in np.arange(0, len(X) * args.repeat_cnt):
            #   # Store the predicted value z of the observed stimulus in the matrix
            if BAttM_res[i].estx is not None:
                zBAttM.append(BAttM_res[i].estx)
            #   # Sum each column to obtain the covariance sum per attractor
            p = BAttM_res[i].estP
            if p is not None:
                tmp = BAttM_res[i].estP * BAttM_res[i].estP
                res = np.sum(tmp, axis=1)
                pBAttM.append(res)

        # Write the results
        logger.info("Writing CSVs...")
        # Confidence: matrix of the confidence of each category at each time step
        # The values are not restricted to the range 0-1
        cidx = np.array([np.arange(1, 1 + len(Cfg.state_list))], dtype=float)
        cdata = np.vstack((cidx, np.array(confBAttM)))
        np.savetxt(args.conf_file, cdata, delimiter=",")
        # Position in z-space (equal to the predicted value of the observed stimulus)
        zidx = np.array([np.arange(1, 1 + len(zBAttM))])
        zdata = np.hstack((zidx.T, np.array(zBAttM)))
        np.savetxt(args.z_file, zdata.T, delimiter=",")
        # Covariance sum per attractor
        np.savetxt(args.p_file, np.array(pBAttM), delimiter=",")
        # Label index of the maximum confidence (1-based)
        ylabels_conf = confBAttM.argmax(axis=1) + 1
        maxes = confBAttM.max(axis=1)
        ylabels_conf[maxes == 0.0] = -1
        if args.repeat_cnt > 1:
            idcs = np.arange(args.repeat_cnt - 1, len(ylabels_conf),
                             args.repeat_cnt)
        else:
            idcs = np.arange(0, len(ylabels_conf))

        filename = args.valid_file if args.train_only else args.test_file
        N, _ = DataSets(filename, logger).get()
        feat_nos = eval('N[' + args.select_data + ']')

        # Using the feature data numbers and the label definitions,
        # estimate the data numbers inferred by BAM and set them in the header of iBAttM.csv.
        label_name = args.vlabel_name if args.train_only else args.ylabel_name
        ylabels = YLabels(label_name, logger)
        label_keys = ylabels.getKeys()
        label_keys = eval("label_keys[" + args.select_data + "]")
        label_vals = ylabels.getValues()
        label_vals = eval("label_vals[" + args.select_data + "]")
        L_enable_nos = label_keys[label_vals >= -1]

        # Identify the extracted data from X's data numbers
        X_inds = [no in L_enable_nos for no in feat_nos]
        feat_nos = feat_nos[X_inds]

        idxdata = np.vstack((feat_nos, ylabels_conf[idcs]))
        np.savetxt(args.idx_file, idxdata, delimiter=",", fmt="%.0f")

        # Output the graph images
        self._cbam = np.array(confBAttM)
        self._zbam = np.array(zBAttM)
        logger.info("Finished.")

        pass
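
The label extraction near the end of Example #19 takes, for each time step, the 1-based index of the most confident category and marks rows whose maximum confidence is exactly zero as -1. A short self-contained illustration with made-up confidence values:

    import numpy as np

    confBAttM = np.array([[0.2, 0.7, 0.1],
                          [0.0, 0.0, 0.0]])        # rows = time steps, columns = categories
    ylabels_conf = confBAttM.argmax(axis=1) + 1    # 1-based label of the most confident category
    ylabels_conf[confBAttM.max(axis=1) == 0.0] = -1
    # ylabels_conf -> array([ 2, -1])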