Code example #1
def task3(file_path):
    binning3 = preprocessing.KBinsDiscretizer(n_bins=3)
    binning6 = preprocessing.KBinsDiscretizer(n_bins=6)
    binning9 = preprocessing.KBinsDiscretizer(n_bins=9)

    data = pd.read_csv(
        file_path,
        usecols=(
            7,  # loan - classifier
            16,  # bank_arg1
            18,  # y - classifier
        ))
    labels = data.columns.values
    timer = TicToc()

    # Data clean-up
    data = data_transform(data)

    print_table(data.head())

    X_data = data.drop(columns="bank_arg1")  # positional `axis` was removed in pandas 2.0
    Y_data = data["bank_arg1"]

    # Training/testing sets
    X_train = X_data[:-2500]
    X_test = X_data[-2500:]
    Y_train = Y_data[:-2500]
    Y_test = Y_data[-2500:]

    timer.tic()
    binning3.fit(X_data)
    prediction3 = binning3.transform(X_data)
    timer.toc()
    time3 = timer.elapsed  # capture before the next tic/toc overwrites it

    timer.tic()
    binning6.fit(X_data)
    prediction6 = binning6.transform(X_data)
    timer.toc()
    time6 = timer.elapsed

    timer.tic()
    binning9.fit(X_data)
    prediction9 = binning9.transform(X_data)
    timer.toc()
    time9 = timer.elapsed

    # TODO: Fix evaluation for matrix
    acc3 = evaluate_result(prediction3, Y_data.values)
    acc6 = evaluate_result(prediction6, Y_data.values)
    acc9 = evaluate_result(prediction9, Y_data.values)

    print(f"Binning with 3 units: {acc3}% matched in {timer.elapsed}s")
    print(f"Binning with 6 units: {acc6}% matched in {timer.elapsed}s")
    print(f"Binning with 9 units: {acc9}% matched in {timer.elapsed}s")
Code example #2
    def preprocess(self, file_name):
        """
        Preprocesses the data for the Random Forest so that it is properly formatted for
        our training model.
        """

        names = [
            "age", "wrk_cls", "edu", "edu_num", "marital_sts", "occu_code",
            "relation", "race", "sex", "gain", "loss", "hours", "country",
            "income"
        ]

        original_data = pd.read_csv(file_name,
                                    dtype=object,
                                    names=names,
                                    skipinitialspace=True)

        # `verbose` was removed from SimpleImputer in scikit-learn 1.3
        imp = SimpleImputer(missing_values='?',
                            strategy='most_frequent')
        data = imp.fit_transform(original_data)

        enc = preprocessing.OrdinalEncoder()
        data = enc.fit_transform(data)

        est = preprocessing.KBinsDiscretizer(n_bins=8, encode="ordinal")
        data = est.fit_transform(data)
        return data
Code example #3
    def kbinData(self):
        print("kbinData")
        training_data = []
        with open('training_data_set.txt', 'rb') as fp:
            training_set = pickle.load(fp)
        npa = np.empty([0, 70], dtype=int)
        ts_del_list = []
        for ts in training_set:
            try:
                npa = np.append(npa, np.array(training_set[ts]["d"]), axis=0)
            except (KeyError, ValueError):  # missing "d" key or shape mismatch
                ts_del_list.append(ts)
                continue
        for tdl in ts_del_list:
            del training_set[tdl]
        est = []

        for n in npa.transpose():
            est.append(
                preprocessing.KBinsDiscretizer(n_bins=10,
                                               encode='ordinal').fit_transform(
                                                   n.reshape(-1, 1)))

        np_est = np.array(est).transpose()[0]
        self.kbinChart(np_est)
        i = 0
        for ts in training_set:
            training_set[ts]["kbinD"] = np_est[i]
            i += 1
        with open('training_data_set_kbin', 'wb') as fp:
            pickle.dump(training_set, fp, protocol=pickle.HIGHEST_PROTOCOL)
        print("done")
Code example #4
 def __init__(self, column, dataframe, settings):
     AEncoder.__init__(self, column, dataframe, settings)
     self.encoder = preprocessing.KBinsDiscretizer(
         n_bins=self.settings.get('n_bins', 5),
         encode=self.settings.get('encode', 'onehot'),
         strategy=self.settings.get('strategy', 'quantile'))
     self.pickle_process(dataframe)
Code example #5
File: discretize.py Project: alek9z/dataMole
 def execute(self, df: data.Frame) -> data.Frame:
     frame = copy.deepcopy(df)
     f = frame.getRawFrame()
     # Operation ignores nan values
     nanRows = f.iloc[:, list(self.__attributes.keys())].isnull()
     # For every column, transform every non-nan row
     columns = f.columns
     edges: Dict[int, List[float]] = dict()
     for col, k in self.__attributes.items():
         colName = columns[col]
         notNa = (~nanRows.loc[:, colName]).to_list()
         discretizer = skp.KBinsDiscretizer(n_bins=k, encode='ordinal',
                                            strategy=self.__strategy.value)
         # Discretize and convert to string (since categories are strings)
         result = discretizer.fit_transform(f.loc[notNa, colName].values.reshape(-1, 1)).astype(str)
         name: str = colName
         if self.__attributeSuffix:
             # Make a new column with all nans
             name = colName + self.__attributeSuffix
             f.loc[:, name] = np.nan
         # Assign column
         f.loc[notNa, [name]] = result
         f.loc[:, name] = f[name].astype(
             pd.CategoricalDtype(categories=[str(float(i)) for i in range(k)], ordered=True))
         edges[col] = discretizer.bin_edges_[0].tolist()
     # Log what has been done
     self.__logExecution(columns, edges)
     return data.Frame(f)
Code example #6
 def __init__(
     self,
     *,
     hyperparams: Hyperparams,
     random_seed: int = 0,
     docker_containers: typing.Optional[typing.Dict[
         str, base.DockerContainer]] = None
 ) -> None:
     super().__init__(hyperparams=hyperparams,
                      random_seed=random_seed,
                      docker_containers=docker_containers)
     self._index = None
     self._training_inputs = None
     self._training_outputs = None
     self._origin_inputs = None  #for label encoder
     self._fitted = False
     self._cate_flag = None
     self._clf = Model('nb', bayesInf=1, PointInf=0)  #classifier
     self._LEoutput = preprocessing.LabelEncoder()  #label encoder
     self._Imputer = SimpleImputer(missing_values=np.nan,
                                   strategy='most_frequent')  #imputer
     self._nbins = 10
     self._Kbins = preprocessing.KBinsDiscretizer(
         n_bins=self._nbins, encode='ordinal',
         strategy='uniform')  #KbinsDiscretizer
     self._discTrainset = None
Code example #7
 def __init__(
     self,
     *,
     hyperparams: Hyperparams,
     random_seed: int = 0,
     docker_containers: typing.Optional[typing.Dict[
         str, base.DockerContainer]] = None
 ) -> None:
     super().__init__(hyperparams=hyperparams,
                      random_seed=random_seed,
                      docker_containers=docker_containers)
     self._index = None
     self._problem_type = 'classification'
     self._training_inputs = None
     self._training_outputs = None
     self._fitted = False
     self._cate_flag = None
     self._LEoutput = preprocessing.LabelEncoder()  # label encoder
     self._Imputer = SimpleImputer(
         missing_values=np.nan,
         strategy='mean')  # TODO: use self.hyperparams['Imputer_Strategy']
     self._nbins = self.hyperparams['nbins']
     self._Kbins = preprocessing.KBinsDiscretizer(
         n_bins=self._nbins, encode='ordinal', strategy='uniform'
     )  # TODO: use self.hyperparams['Discretizer_Strategy']
Code example #8
    def __init__(self,
                 data,
                 k=4,
                 leaf_size=40,
                 lambdav=.5,
                 lap_vec=None,
                 bins=None):

        self.tree = KDTree(data, leaf_size=leaf_size)
        self.data = data

        if bins is not None:
            self.is_binned = True
            self.discretizer = preprocessing.KBinsDiscretizer(
                n_bins=bins, encode='ordinal').fit(data)
            self.data_binned = self.discretizer.inverse_transform(
                self.discretizer.transform(data))
            self.bin_tree = KDTree(self.data_binned, leaf_size=leaf_size)

        self.k = k
        self.lambdav = lambdav

        self.knn_dist_arr, self.knn_idx_arr = self.knn(data)
        self._pdist = self.pdist(data, self.knn_dist_arr, lap_vec)

        self._plof, self.nplof = self.plof(data, self.knn_idx_arr, self._pdist)
        self.loop_values = self.loop(self._plof)
Code example #9
def get_outside_feature(data):
    # `sourse` is a module-level DataFrame; '收率' ("yield") is the target column.
    est = preprocessing.KBinsDiscretizer(n_bins=5, encode='onehot-dense')
    new = est.fit_transform(sourse[['收率']])
    new = pd.DataFrame(data=new)
    new_feature = list(new.columns)
    sourse[new_feature] = new

    data = data.copy()
    len_first = len(data.columns)
    for i in data.columns:
        order_mean = sourse.groupby(i)['收率'].mean()
        order_max = sourse.groupby(i)['收率'].max()
        order_min = sourse.groupby(i)['收率'].min()
        order_std = sourse.groupby(i)['收率'].std().fillna(0)
        order_sum = sourse.groupby(i)['收率'].sum()
        order_median = sourse.groupby(i)['收率'].median()

        # Map the per-group statistics back onto each row as new features
        # (left commented out, the function returned an empty frame).
        data['new_' + i + '_mean'] = data[i].map(order_mean)
        data['new_' + i + '_max'] = data[i].map(order_max)
        data['new_' + i + '_min'] = data[i].map(order_min)
        data['new_' + i + '_std'] = data[i].map(order_std)
        data['new_' + i + '_sum'] = data[i].map(order_sum)
        data['new_' + i + '_median'] = data[i].map(order_median)

        # Alternative: per-group counts of each one-hot yield bin.
        # for j in new_feature:
        #     order_label = sourse.groupby(i)[j].count()
        #     data['new_' + i + '_' + str(j)] = data[i].map(order_label)

    # Keep only the newly created feature columns
    data = data.iloc[:, len_first:].copy()

    return data
Code example #10
    def preprocess(self, file_name):
        """
        Method to preprocess data to be in a format that is conducive
        to training.
        """
        names = ("age", "wrk_cls", "edu", "edu_num", "marital_sts",
                 "occu_code", "relation", "race", "sex", "gain", "loss",
                 "hours", "country", "income")

        original_data = pd.read_csv(file_name,
                                    dtype=object,
                                    names=names,
                                    skipinitialspace=True)

        # `verbose` was removed from SimpleImputer in scikit-learn 1.3
        imp = SimpleImputer(missing_values='?',
                            strategy='most_frequent')
        data = imp.fit_transform(original_data)

        enc = preprocessing.OrdinalEncoder()
        data = enc.fit_transform(data)

        est = preprocessing.KBinsDiscretizer(encode="ordinal")
        data = est.fit_transform(data)

        return data
Code example #11
 def __init__(
     self,
     *,
     hyperparams: Hyperparams,
     random_seed: int = 0,
     docker_containers: typing.Optional[typing.Dict[
         str, base.DockerContainer]] = None
 ) -> None:
     super().__init__(hyperparams=hyperparams,
                      random_seed=random_seed,
                      docker_containers=docker_containers)
     #parameters
     self._index = None
     self._fitted = False
     #hyperparameters
     self._nbins = self.hyperparams['nbins']
     self._strategy = self.hyperparams['strategy']
     self._method = self.hyperparams['method']
     self._thres_search_method = self.hyperparams['thres_search_method']
     self._threshold = self.hyperparams['threshold']
     self._bayes_factors = self.hyperparams['bayesfactors']
     self._problem_type = self.hyperparams['problem_type']
     #other parameters
     self._training_inputs = None
     self._training_outputs = None
     self._cate_flag = None
     self._LEoutput = preprocessing.LabelEncoder()  # label encoder
     self._Imputer = SimpleImputer(missing_values=np.nan,
                                   strategy='most_frequent')  # imputer
     self._Kbins = preprocessing.KBinsDiscretizer(
         n_bins=self._nbins, encode='ordinal', strategy=self._strategy
     )  # KBinsDiscretizer driven by the 'nbins' and 'strategy' hyperparams
Code example #12
File: centroid_oner.py Project: lrmendess/trab-ia
    def fit(self, x_train, y_train):
        self.class_  = unique_labels(y_train)
        self.x_train = x_train
        self.y_train = y_train

        self.candidates = list()
        self.predict_table = dict()
        self.predict_table_index = 0

        bins = [len(self.class_)] * len(self.x_train[0])
        bins = preprocessing.KBinsDiscretizer(
            n_bins=bins, encode='ordinal',
            strategy='uniform').fit(self.x_train).transform(self.x_train)

        iterations = len(self.x_train[0])

        for i in range(iterations):
            cross = pd.crosstab(bins[:,i], self.y_train)
            rules = self.gen_table_rules(cross)
            self.candidates.append(rules)

        self.predict_table_index = self.best_predict_table_index(bins, self.y_train, self.candidates)
        self.predict_table = self.candidates[self.predict_table_index]

        selected_column = x_train[:,self.predict_table_index]
        collection_of_classes = dict()
        self.rule_set = dict()

        for k in self.class_:
            collection_of_classes[k] = list()

        for index, element in enumerate(selected_column):
            collection_of_classes[y_train[index]].append(element)

        for key, value in collection_of_classes.items():
            self.rule_set[key] = sum(value) / len(value)
Code example #13
File: oneR_centroid.py Project: lucastassis/ai
    def fit(self, x_train, y_train):
        self.class_ = unique_labels(y_train)
        opt_acc = 0

        # Discretize
        bins = preprocessing.KBinsDiscretizer(
            n_bins=([len(self.class_)] * len(x_train[0])),
            encode='ordinal',
            strategy='uniform').fit(x_train).transform(x_train)

        # Build the rule tables
        for i in range(0, len(x_train[0])):
            table = pd.crosstab(bins[:, i],
                                y_train)  # contingency table
            rule = self.gen_rule(table)  # derive the rule from the table

            acc = self.acc_train(rule, bins[:, i], y_train)
            if acc >= opt_acc:
                opt_acc = acc  # keep the best accuracy seen so far
                self.index = i

        # Find the centroid
        selected_column = x_train[:, self.index]
        class_lists = dict()
        self.rule_set = dict()

        for val in self.class_:
            class_lists[val] = []

        for i in range(0, len(selected_column)):
            class_lists[y_train[i]].append(selected_column[i])

        for key, value in class_lists.items():
            self.rule_set[key] = sum(value) / len(value)
Code example #14
def q2():
    # Create a variable representing intervals, similar to pd.cut
    discretizer = preprocessing.KBinsDiscretizer(n_bins=10,
                                                 encode='ordinal',
                                                 strategy='quantile')
    score_bins = discretizer.fit_transform(countries[['Pop_density']])
    return int(sum(score_bins[:, 0] >= 9))
Code example #15
File: lab1.py Project: unfinity-core/ML-2020
def to_categorial(data_pd_series, n_bins=10):
    bins_transform = data_pd_series.to_numpy().reshape(-1, 1)
    discretizer = preprocessing.KBinsDiscretizer(
        n_bins=n_bins).fit(bins_transform)
    # Default encode='onehot' yields a sparse matrix; densify, then recover
    # the index of the hot bin for each row.
    tmp1 = pd.DataFrame(
        discretizer.transform(bins_transform).todense()).stack()
    return pd.Series(pd.Categorical(
        tmp1[tmp1 != 0].index.get_level_values(1))), discretizer.bin_edges_
Code example #16
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:  # inputs: m x n numpy array
        if self._fitted:
            # get discrete bins from training data
            discTrainset = RelationSet(self._training_inputs, self._training_outputs.reshape(-1, 1))
            discTrainset.impute()
            discTrainset.discretize()
            discTrainset.remove()
            bins = discTrainset.NUM_STATES
            
            # convert categorical values to numerical values in testing data
            metadata = inputs.metadata
            [m, n] = inputs.shape
            X_test = np.zeros((m, n))
            for column_index in metadata.get_elements((metadata_base.ALL_ELEMENTS,)):
                if column_index is metadata_base.ALL_ELEMENTS:
                    continue
                column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
                semantic_types = column_metadata.get('semantic_types', [])
                if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types:
                    LE = preprocessing.LabelEncoder()
                    LE = LE.fit(inputs.iloc[:, column_index])
                    X_test[:, column_index] = LE.transform(inputs.iloc[:, column_index])
                elif 'http://schema.org/Text' in semantic_types:
                    pass
                else:
                    temp = list(inputs.iloc[:, column_index].values)
                    for i in np.arange(len(temp)):
                        if bool(temp[i]):
                            X_test[i, column_index] = float(temp[i])
                        else:
                            X_test[i, column_index] = np.nan  # rather than the string 'nan'
            discTestset = RelationSet(X_test, [])
            discTestset.impute()
            X_test = discTestset.data
            index_list = np.setdiff1d(np.arange(discTrainset.num_features),
                                      np.array(discTrainset.removeIdx))
            X_test = X_test[:, index_list]
            est = preprocessing.KBinsDiscretizer(n_bins=bins,
                                                 encode='ordinal',
                                                 strategy='uniform')
            est.fit(X_test)
            X_test = est.transform(X_test)
            t = time.time()
            output = self._clf.predict(X_test)
            self._time = time.time() - t + self._time
            if min(self._training_outputs) == 1:
                output = output + 1
            # label decode
            output = self._LEoutput.inverse_transform(output)
            
            # update metadata
            output = container.DataFrame(output, generate_metadata=False, source=self)
            output.metadata = inputs.metadata.clear(source=self, for_value=output, generate_metadata=True)
            
            for column_index, column_metadata in enumerate(self._target_columns_metadata):
                output.metadata = output.metadata.update_column(column_index, column_metadata, source=self)


            return CallResult(output)
        else:
            raise ValueError('Model should be fitted first.')
Code example #17
def q2():
    est = preprocessing.KBinsDiscretizer(n_bins=10,
                                         encode='ordinal',
                                         strategy='quantile')
    pop_density_10bins = est.fit_transform(countries[['Pop_density']])
    plt.hist(pop_density_10bins)
    above_90 = np.sum(pop_density_10bins >= 9)
    # (pop_density_10bins > np.percentile(pop_density_10bins, 90)).sum()
    return above_90
Code example #18
    def __init__(self, X_train, y_train):
        """
        Creates a new SalaryPredictor trained on the given features from the
        preprocessed census data to predicted salary labels. Performs and fits
        any preprocessing methods (e.g., imputing of missing features,
        discretization of continuous variables, etc.) on the inputs, and saves
        these as attributes to later transform test inputs.
        
        :param DataFrame X_train: Pandas DataFrame consisting of the
        sample rows of attributes pertaining to each individual
        :param DataFrame y_train: Pandas DataFrame consisting of the
        sample rows of labels pertaining to each person's salary
        """
        # [!] TODO
        # Column groups
        columns_to_encode = ["work_class", "education", "marital",
                             "occupation_code", "relationship", "race",
                             "sex", "country"]
        columns_to_scale = ["age", "education_years", "hours_per_week"]
        columns_to_leave = ["capital_gain", "capital_loss"]
        self.columns_with_missing = ["work_class", "occupation_code", "country"]

        # Collect the categories of every string-typed column for the encoder
        cat = []
        for col in X_train.columns:
            if pd.api.types.is_string_dtype(X_train[col]):
                cat.append(X_train[col].unique())

        self.imp = SimpleImputer(missing_values="?", strategy="most_frequent")
        imp_columns = self.imp.fit_transform(X_train[columns_to_encode])

        self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore",
                                               categories=cat,
                                               sparse=False)
        self.le2 = preprocessing.LabelEncoder()
        self.Kbin = preprocessing.KBinsDiscretizer(n_bins=[2, 2, 2],
                                                   encode="ordinal")
        self.scale = preprocessing.StandardScaler()

        scaled_columns = self.scale.fit_transform(X_train[columns_to_leave])
        ds_columns = self.Kbin.fit_transform(X_train[columns_to_scale])
        encoded_columns = self.ohe.fit_transform(imp_columns)

        processed_data = np.concatenate(
            [ds_columns, encoded_columns, scaled_columns], axis=1)
        self.le = preprocessing.LabelEncoder()
        self.clf = LogisticRegression(max_iter=1000).fit(processed_data, y_train)
Code example #19
    def predict(self, x_test):
        predict = list()

        bins = [len(self.class_)] * len(x_test[0])
        bins = preprocessing.KBinsDiscretizer(
            n_bins=bins, encode='ordinal',
            strategy='uniform').fit(x_test).transform(x_test)

        for element in bins[:, self.predict_table_index]:
            predict.append(self.predict_table[element])

        return predict
Code example #20
 def fit(self, data):
     # uniform: equal-width bins for integer data
     # quantile: each bin contains approx. the same number of samples
     strategy = 'uniform' if util.data.is_int(
         data[self.k]) else 'quantile'
     self.est = preprocessing.KBinsDiscretizer(
         n_bins=self.n_bins, encode='onehot', strategy=strategy)
     self.est.fit(data[self.k].values.reshape(-1, 1))
     # n_bins_ is the fitted bin count; bin_edges_ has one extra boundary
     self.n_bins = int(self.est.n_bins_[0])
Code example #21
File: NASDAQTransformer.py Project: Baes20/RASPUTIN
def test_Transformer(nasdaq):
    nasdaq = apply_robust(nasdaq)
    new_nasdaq = []
    for timeseries in nasdaq:
        if not (timeseries[-1] > 0.9999 or timeseries[-1] < 0.0001):
            new_nasdaq.append(timeseries)
    nasdaq = np.array(new_nasdaq)

    X, Y = nasdaq[:, :-1], nasdaq[:, -2:]

    discretizer = preprocessing.KBinsDiscretizer(n_bins=63, encode='ordinal')
    discretizer.fit(X)
    X = discretizer.transform(X)

    Y = np.diff(Y, axis=1)
    Y = np.sign(Y)
    Y = np.where(Y <= 0, Y * 0, Y)

    print(Y.shape)
    base_winrate = np.sum(Y) / len(Y)
    print(base_winrate)

    train_size = int(0.8 * len(X))
    trainX, trainY = X[:train_size], Y[:train_size]
    validX, validY = X[train_size:], Y[train_size:]
    print(trainX.shape, trainY.shape)

    # d_model, dropout_rate, EPOCHS, CustomSchedule and Transformer_Encoder
    # are defined at module level in this project
    learning_rate = CustomSchedule(d_model)
    optimizer = tf.keras.optimizers.Adam(learning_rate,
                                         beta_1=0.9,
                                         beta_2=0.98,
                                         epsilon=1e-9)
    loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    model = Transformer_Encoder(num_layers=6,
                                d_model=64,
                                num_heads=8,
                                dff=128,
                                input_vocab_size=64,
                                target_vocab_size=1,
                                rate=dropout_rate)
    checkpoint_path = "./SavedModel/Transformer/Transformer_FXTM.h5"
    checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                    save_weights_only=True,
                                                    save_best_only=True)
    model.compile(optimizer=optimizer, loss=loss_object, metrics=['accuracy'])
    model.build(input_shape=(None, 64))
    history = model.fit(trainX,
                        trainY,
                        validation_data=(validX, validY),
                        batch_size=256,
                        epochs=EPOCHS,
                        callbacks=[checkpoint])
Code example #22
 def discretize(self):
     # Watch here: you might have to tweak n_bins a bit; I'm currently
     # looking for a way to make this process automatic. If the data is
     # standardized and rescaled, though, 10 might be a good first attempt.
     disc = preprocessing.KBinsDiscretizer(n_bins=10,
                                           encode="ordinal",
                                           strategy="uniform")
     continous_df = self.df[self.continous_vars]
     not_continous_df = self.df.drop(columns=self.continous_vars)
     # Keep the original index so the join realigns the rows correctly
     continous_df = pd.DataFrame(disc.fit_transform(continous_df),
                                 columns=continous_df.columns,
                                 index=self.df.index)
     self.df = continous_df.join(not_continous_df)
     return self
Code example #23
 def discritizeData(self, features):
     """
     Discretize the data
     Args:
         features:           Data array
     Return:
         xKBinsDiscretizer:  Discretized data array
     """
     est = preprocessing.KBinsDiscretizer(n_bins=3,
                                          encode='ordinal',
                                          strategy='uniform')
     est.fit(features)
     xKBinsDiscretizer = est.transform(features)
     return xKBinsDiscretizer
Code example #24
    def predict(self, x_test):
        predict = list()

        bins = [len(self.class_)] * len(x_test[0])
        bins = preprocessing.KBinsDiscretizer(
            n_bins=bins, encode='ordinal',
            strategy='uniform').fit(x_test).transform(x_test)

        for element in bins[:, self.predict_table_index]:
            if element in self.predict_table:
                predict.append(self.predict_table[element])
            else:
                # Unseen bin value: fall back to a random known rule
                predict.append(choice(list(self.predict_table.items()))[1])

            # Rebuild the prediction table from the original bins each step
            self.predict_table = self.gen_table_rules(self.original_bin)

        return predict
Code example #25
def discretiza_con_kmeans(dataframe, n):
    '''
    Takes a dataframe and the number of intervals into which we want to
    split it.

    Returns the dataframe split into n intervals using the k-means technique.
    '''

    discretizado = preprocessing.KBinsDiscretizer(
        n_bins=n, encode='ordinal',
        strategy="kmeans").fit_transform(dataframe)  # was "uniform", contradicting the docstring
    datas = pd.DataFrame(discretizado)
    datas.index = dataframe.index
    datas.columns = dataframe.columns
    return datas
Code example #26
    def predict(self, x_test):
        predict = []

        # Discretize
        bins = preprocessing.KBinsDiscretizer(
            n_bins=([len(self.class_)] * len(x_test[0])),
            encode='ordinal',
            strategy='uniform').fit(x_test).transform(x_test)

        x_test = bins[:, self.index]

        for i in range(0, len(x_test)):
            predict.append(self.rule_set[x_test[i]])

        return predict
Code example #27
 def kbins_discretizer(one_feature, n_bins=3, strategy='uniform'):
     '''
     strategy:
         uniform  - all bins in each feature have identical widths.
         quantile - all bins in each feature have the same number of points.
         kmeans   - values in each bin have the same nearest center of a
                    1D k-means cluster.
     '''
     kbd = preprocessing.KBinsDiscretizer(n_bins=[n_bins], encode='ordinal',
                                          strategy=strategy)
     feature_transformed = kbd.fit_transform(one_feature).reshape(-1,)
     print(kbd.n_bins_)
     print(kbd.bin_edges_)
     return feature_transformed
Code example #28
def k_bins(data, n_bins, encode="ordinal", strategy="quantile"):
    """
    Bin-based discretization.
    * encode:
        - "ordinal"
        - "onehot"
        - "onehot-dense"
    * strategy:
        - "uniform"
        - "quantile"
        - "kmeans"
    """
    # KBinsDiscretizer takes `encode`, not `encoder`
    est = preprocessing.KBinsDiscretizer(n_bins=n_bins, encode=encode,
                                         strategy=strategy)
    transformed_data = est.fit_transform(data)

    return transformed_data
Code example #29
    def descretization(self):
        try:
            # Discretize each selected column into 5 uniform bins, in place
            for i in range(len(self.x)):
                temp = self.dataset[self.x[i]].values.reshape(-1, 1)
                est = preprocessing.KBinsDiscretizer(n_bins=5,
                                                     encode='ordinal',
                                                     strategy='uniform')
                est.fit(temp)
                x_scaled = est.transform(temp)
                print(x_scaled)
                self.dataset[self.x[i]] = x_scaled.ravel()  # flatten (n, 1) to a column

            # Refresh the preview table with the first 10 rows
            heads = self.dataset.head(10).to_numpy()
            for i in range(10):
                for j in range(len(self.dataset.columns)):
                    self.table.setItem(i, j, QTableWidgetItem(str(heads[i][j])))

        except Exception as e:
            print(repr(e))
Code example #30
 def fit(self, x_train, y_train):
     discretizer = preprocessing.KBinsDiscretizer(
         n_bins=2 * len(unique_labels(y_train)),
         encode='ordinal',
         strategy='kmeans')
     discretizer.fit(x_train)
     self.discretizer = discretizer
     df_data = pd.DataFrame(data=discretizer.transform(x_train))
     df_data['classe'] = y_train
     contingency_df = {}
     best_score = float('-inf')
     best_feature = None
     for col in list(df_data.iloc[:, :-1]):
         contingency_df[col] = pd.crosstab(df_data['classe'], df_data[col])
         score_feature = contingency_df[col].agg('max').sum()
         if score_feature > best_score:
             best_score = score_feature  # keep the best score, not just the column
             best_feature = col
     self.best_feature = best_feature
     # Use the winning feature's table (not the loop variable's last value)
     self.prob_table = contingency_df[best_feature].apply(lambda x: x / sum(x)).T