def VarianceThreshold_selector(data, th):
    """Return *data* reduced to the columns that pass a variance threshold.

    Args:
        data: pandas DataFrame of features; it is handed unchanged to
            ``VarianceThreshold.fit`` and ``transform``.
        th: threshold passed to ``VarianceThreshold``. With 0.0 only
            features that have the same value in all samples are removed.

    Returns:
        A new DataFrame built from the transformed array, whose columns
        carry the original names of the retained features. The rows get a
        fresh default index (the index of *data* is not carried over).

    Raises:
        ValueError: propagated from scikit-learn, e.g. when no feature
            meets the threshold.
    """
    selector = VarianceThreshold(th)
    selector.fit(data)

    # get_support(indices=True) yields integer *positions* of the retained
    # features. They must be mapped through data.columns: using them
    # directly as labels (data[positions]) fails for named columns.
    kept_positions = selector.get_support(indices=True)
    kept_names = data.columns[kept_positions].tolist()

    reduced = pd.DataFrame(selector.transform(data))
    reduced.columns = kept_names
    return reduced
Esempio n. 2
0
        def VarianceThreshold_selector(data, threshold=.8 * (1 - .8)):
            """Return *data* reduced to the columns that pass a variance threshold.

            Args:
                data: pandas DataFrame of features.
                threshold: value passed to ``VarianceThreshold``. The default
                    is ``p * (1 - p)`` with ``p = 0.8``, i.e. the variance of
                    a Bernoulli variable that takes one value 80% of the time.

            Returns:
                A new DataFrame built from the transformed array, whose
                columns carry the original names of the retained features.
                The rows get a fresh default index.
            """
            selector = VarianceThreshold(threshold=threshold)
            selector.fit(data)

            # Integer positions of the retained features.
            kept_positions = selector.get_support(indices=True)

            # Iterating a DataFrame yields its column labels, so this list
            # maps a position to the corresponding column name.
            all_names = list(data)
            kept_names = [all_names[pos] for pos in kept_positions]

            reduced = pd.DataFrame(selector.transform(data))
            reduced.columns = kept_names
            return reduced
Esempio n. 3
0
    def variance_threshold(self: pd.DataFrame, cp, fecha, threshold=0.0):
        """Keep the key columns plus every column that passes a variance threshold.

        VarianceThreshold is a simple baseline approach to feature selection.
        It removes all features whose variance doesn't meet some threshold.
        By default, it removes all zero-variance features, i.e. features that
        have the same value in all samples. As an example, suppose that we
        have a dataset with boolean features, and we want to remove all
        features that are either one or zero (on or off) in more than 80% of
        the samples.

        Each non-key column is evaluated on its own: it is coerced to numeric
        (unparseable values become NaN), rows with NaN are dropped, and a
        ``VarianceThreshold`` is fitted on what remains. Columns for which the
        fit fails are dropped.

        Args:
            self: the DataFrame to filter.
            cp: list of additional column names to treat as key variables.
            fecha: list of additional column names to treat as key variables.
            threshold: value passed to ``VarianceThreshold``.

        Returns:
            A DataFrame with the key columns that exist in the input (in
            key-variable order) followed by the retained feature columns in
            their original order. Values are the original, uncoerced ones.
        """
        column_names = self.columns.values.tolist()
        key_variables = ['id_siniestro', 'id_poliza', 'cod_filiacion'] + cp + fecha

        # Key variables bypass the variance check; ones that are absent from
        # the frame (or listed twice) are silently ignored.
        removed_var = []
        for key in key_variables:
            try:
                column_names.remove(key)
            except ValueError:
                continue
            removed_var.append(key)

        append_names = []
        for name in column_names:
            column = self[[name]].apply(pd.to_numeric, errors='coerce')
            column = column.dropna(how='any', axis=0)
            selection = VarianceThreshold(threshold=threshold)
            try:
                selection.fit(column)
            except (ValueError, TypeError):
                # The fit rejects the column (e.g. nothing meets the
                # threshold, or no rows are left after dropping NaN):
                # treat it as not selected.
                continue
            kept = column.columns[selection.get_support(indices=True)]
            append_names.extend(kept.tolist())

        # De-duplicate while preserving order so the result's column order is
        # deterministic (a set would reorder names arbitrarily).
        append_names = list(dict.fromkeys(append_names))
        return self[removed_var + append_names]