def fit(self, y: np.ndarray) -> "ClassifierLabelEncoder":
        """Fit the estimator to the target y.

        For all targets, this transforms classes into ordinal numbers.
        If the loss function is categorical_crossentropy, the target
        will be one-hot encoded.

        Parameters
        ----------
        y : np.ndarray
            The target data to be transformed.

        Returns
        -------
        ClassifierLabelEncoder
            A reference to the current instance of ClassifierLabelEncoder.
        """
        target_type = self._type_of_target(y)
        keras_dtype = np.dtype(tf.keras.backend.floatx())
        self._y_shape = y.shape
        encoders = {
            "binary":
            make_pipeline(
                TargetReshaper(),
                OrdinalEncoder(dtype=keras_dtype, categories=self.categories),
            ),
            "multiclass":
            make_pipeline(
                TargetReshaper(),
                OrdinalEncoder(dtype=keras_dtype, categories=self.categories),
            ),
            "multiclass-multioutput":
            FunctionTransformer(),
            "multilabel-indicator":
            FunctionTransformer(),
        }
        if is_categorical_crossentropy(self.loss):
            encoders["multiclass"] = make_pipeline(
                TargetReshaper(),
                OneHotEncoder(sparse=False,
                              dtype=keras_dtype,
                              categories=self.categories),
            )
        if target_type not in encoders:
            raise ValueError(
                f"Unknown label type: {target_type}."
                "\n\nTo implement support, subclass KerasClassifier and override"
                " ``target_encoder`` with a transformer that supports this"
                " label type."
                "\n\nFor information on sklearn target types, see:"
                " * https://scikit-learn.org/stable/modules/generated/sklearn.utils.multiclass.type_of_target.html"
                " * https://scikit-learn.org/stable/modules/multiclass.html"
                "\n\nFor information on the SciKeras data transformation interface, see:"
                " * https://scikeras.readthedocs.io/en/latest/advanced.html#data-transformers"
            )
        self._final_encoder = encoders[target_type].fit(y)

        if (target_type == "multilabel-indicator" and y.min() == 0
                and (y.sum(axis=1) == 1).all()):
            target_type = "multiclass-onehot"

        self.n_outputs_ = 1
        self.n_outputs_expected_ = 1
        self._y_dtype = y.dtype
        self._target_type = target_type

        if target_type in ("binary", "multiclass"):
            self.classes_ = self._final_encoder[1].categories_[0]
            self.n_classes_ = self.classes_.size
        elif target_type in ("multiclass-onehot", "multilabel-indicator"):
            self.classes_ = np.arange(0, y.shape[1])
            self.n_classes_ = y.shape[1]
        elif target_type == "multiclass-multioutput":
            self.classes_ = None
            self.n_classes_ = None

        return self
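The docstring above turns on a single decision: binary and multiclass targets are ordinal-encoded, and only when the loss is categorical_crossentropy does the multiclass branch switch to one-hot encoding. The short sketch below (plain scikit-learn encoders on a made-up target, not the SciKeras class itself) illustrates the two encodings that fit() chooses between.

import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Toy multiclass target (hypothetical data), reshaped to a single column
# the way TargetReshaper would present it to the encoder.
y = np.array(["cat", "dog", "dog", "bird"]).reshape(-1, 1)

# Default branch: ordinal labels, one column with values 0..n_classes-1.
ordinal = OrdinalEncoder(dtype=np.float32).fit_transform(y)

# categorical_crossentropy branch: one-hot labels, one column per class.
one_hot = OneHotEncoder(dtype=np.float32).fit_transform(y).toarray()

print(ordinal.ravel())  # [1. 2. 2. 0.]  (classes sorted: bird, cat, dog)
print(one_hot.shape)    # (4, 3)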
Example #2
 def __init__(self, cast_type=None):
     self.transformer_ = FunctionTransformer(
         feature_cast, kw_args={"cast_type": cast_type}, validate=False)
Example #3
 def __init__(self):
     self.transformer_ = FunctionTransformer(to_dense, validate=False)
Example #4
import numpy as np
import pandas as pd
from sklearn.decomposition import FastICA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, Imputer
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

# Score on the training set was:0.9182509505703423
exported_pipeline = make_pipeline(
    make_union(FastICA(tol=0.75), FunctionTransformer(copy)),
    XGBClassifier(learning_rate=0.01,
                  max_depth=4,
                  min_child_weight=7,
                  n_estimators=100,
                  nthread=1,
                  subsample=0.6500000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #5
def calc_returns(closes):
    log_prices = FunctionTransformer(func=np.log).fit_transform(closes)
    returns = pd.DataFrame(log_prices).diff()
    returns.columns = closes.columns
    returns = returns.drop(returns.index[0])
    return returns
Example #6
def train():

    # 1. read jsons into a list
    data = load_data()

    # 2. clean data
    # cannot do this inside the pipeline, since doing so could drop
    # input records during inference if only part of the data is received
    # this filters out some really bad responses, e.g.
    # 'instagram': {'num_users': 1, 'users': [{'username': '******'}]}
    # iterate over a copy so that removing items does not skip elements
    for i in list(data):
        if i['instagram']['num_users'] > 0:
            if i['instagram']['users'][0].get('followers_count', -1) < 0:
                data.remove(i)

    # split the pipeline into two: preprocess and model
    def json_to_df(x):
        return pd.DataFrame(x)  # can't pickle lambdas

    def fill_na(x):
        return x.fillna(-1)

    preprocess_pipe = Pipeline([
        ('restructure_jsons', FunctionTransformer(restructure_data)),
        ('jsons_to_df', FunctionTransformer(json_to_df)),
        # not using imputer as it casts to array
        # fill with -1 to separate missing case from 0
        ('fill_na', FunctionTransformer(fill_na)),
    ])

    def cast_to_float32(x):
        return x.values.astype('float32')

    def vae_output_format(x):
        return (np.array(x)
                .reshape(x.shape[0], -1)
                .mean(axis=1)
                .__sub__(1)
                .clip(-1, 1)
                .round(2))

    # float64 is due to the fact that float32 is not json serialisable
    # https://github.com/tensorflow/tensorboard/issues/3057
    def final_format(x):
        return np.atleast_1d(x).astype('float64').round(2)

    model_pipe = Pipeline([
        ('ECDF', CustomECDF()),
        # ('prep_for_VAE', FunctionTransformer(cast_to_float32)),
        # ('VAE', VAE_numpy()),
        # ('vae_output_format', FunctionTransformer(vae_output_format))
        ('CustomScoreCombination', CustomScoreCombination()),
        ('final_format', FunctionTransformer(final_format))
    ])

    # output
    train_data = preprocess_pipe.transform(data)
    model_pipe.fit(train_data)

    # scores = model_pipe.transform(train_data)
    # print(pd.concat([train_data, scores], axis=1))
    # temp = pd.concat([train_data, pd.Series(scores.ravel())], axis=1)
    # temp.sort_values(0).round(2)

    # save
    # make sure to change the settings as follows
    # otherwise importing the saved files will be hard
    # https://github.com/uqfoundation/dill/issues/126
    dill.settings['recurse'] = True

    # to be able to save and load the models with no error
    # have to transform at least once before saving
    # otherwise it throws the following error:
    # _function_transformer.py - KeyError: '__builtins__'
    model_pipe.transform(train_data)

    print('Saving')
    preprocess_pipe_path = 'web_score/scorers/preprocess_pipe.pkl'
    model_pipe_path = 'web_score/scorers/model_pipe.pkl'
    with open(preprocess_pipe_path, 'wb') as i, open(model_pipe_path,
                                                     'wb') as j:
        dill.dump(preprocess_pipe, i)
        dill.dump(model_pipe, j)
Example #7
def column_transformer(name):
    return FunctionTransformer(partial(pd.DataFrame.__getitem__, key=name),
                               validate=False)
Example #8
from sklearn.preprocessing import FunctionTransformer

# Get the dummy encoding of the labels
dummy_labels = pd.get_dummies(df[LABELS])

# Get the columns that are features in the original df
NON_LABELS = [c for c in df.columns if c not in LABELS]

# Split into training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS],
                                                               dummy_labels,
                                                               0.2,
                                                               seed=123)

# Preprocess the text data: get_text_data
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

# Preprocess the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS],
                                       validate=False)

# Complete the pipeline: pl
pl = Pipeline([
    ('union',
     FeatureUnion(transformer_list=[
         ('numeric_features',
          Pipeline([('selector', get_numeric_data),
                    ('imputer', Imputer())])),
         ('text_features',
          Pipeline([('selector', get_text_data),
                    ('vectorizer', CountVectorizer())]))
     ])),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])

import numpy as np
import pandas as pd
import streamlit as st
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer

# data directory
DATA_hmda_acs = 'data/hmda_acs_merged.csv'
DATA_zip_tract = 'data/zip_tract_122017.xlsx'
DATA_shp = 'data/2019/tl_2019_53_tract.shp'
DATA_zipcodes = 'data/zipcodes_king.csv'
MODEL_lr_nh = 'data/lr_model_nh.sav'
MODEL_lr_hf = 'data/lr_model_hf.sav'

# dictionary of models
model_dict = {'Original': MODEL_lr_nh, 'Without Population Bias': MODEL_lr_hf}

# transform the feature vectors
transformer = FunctionTransformer(np.log1p, validate=True)
scaler = MinMaxScaler(feature_range=(0.2, 0.8))

# set the title of web app
st.title('intelliRefinder')
st.markdown(
    '''Predict optimal locations for mortgage refinance business opportunities
       using machine learning algorithms on US OpenStreetMap (OSM) data.
    ''')

# load zip codes of king county WA
zipcodes = pd.read_csv(DATA_zipcodes)['zip']
#algorithms = ('Logistic Regression', 'Random Forest')
interventions = ('Original', 'Without Population Bias')

# setting up the sidebar and loading the data
Jc.mobileReady = Jc.mobileReady.astype(int)
Jc.personalized = Jc.personalized.astype(int)

#Preparing data for modelling
X = Jc.loc[:, [
    'hasCreative', 'mobileReady', 'percentOfList', 'personalized', 'trans'
]]

y = Jc.readRatePercent

#preparing data for pipeline
from sklearn.preprocessing import FunctionTransformer

# trans -text data
getTrans = FunctionTransformer(lambda x: x['trans'], validate=False)

# Numerics
getNums = FunctionTransformer(lambda x: x[
    ['hasCreative', 'mobileReady', 'percentOfList', 'personalized']],
                              validate=False)

from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

union = FeatureUnion(
    transformer_list=[('numerics', Pipeline([('selector', getNums)])),
                      ('text',
                       Pipeline([('selector', getTrans),
                                 ('vectorizer', TfidfVectorizer())]))])
Example #11
 def __init__(self):
     # make a transformer which will load the time series and compute the
     # connectome matrix
     self.transformer_fmri = make_pipeline(
         FunctionTransformer(func=_load_fmri, validate=False),
         ConnectivityMeasure(kind='tangent', vectorize=False))
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.linear_model import RidgeCV, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MaxAbsScaler, PolynomialFeatures
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.533205704365034
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        make_union(FunctionTransformer(copy), FunctionTransformer(copy))),
    SelectPercentile(score_func=f_regression, percentile=89),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    StackingEstimator(estimator=SGDRegressor(alpha=0.0,
                                             eta0=0.01,
                                             fit_intercept=True,
                                             l1_ratio=1.0,
                                             learning_rate="constant",
                                             loss="squared_loss",
                                             penalty="elasticnet",
                                             power_t=50.0)),
    StackingEstimator(estimator=RidgeCV()), MaxAbsScaler(), MaxAbsScaler(),
    LinearSVR(C=0.5,
              dual=True,
              epsilon=1.0))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

def on_field(f: str, *vec) -> Pipeline:
    '''Select field f from the input and apply the given vectorizers in a pipeline.'''
    return make_pipeline(FunctionTransformer(itemgetter(f), validate=False),
                         *vec)
import numpy as np
np.warnings.filterwarnings('ignore')  # for suppressing warnings from numpy
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer

data = np.loadtxt('winequality-red.csv', skiprows=1,
                  delimiter=';')  #reading data file into a np array

# separating features from the input data
x = np.concatenate((data[:, 0:10], data[:, 11].reshape(-1, 1)), axis=1)

# adding the square root of each feature as a new feature, as it improves the model's accuracy
funt = FunctionTransformer(np.sqrt)
x = np.concatenate((x, funt.fit_transform(x)), axis=1)

#separating result variable from input data
y = data[:, 10]

#splitting data into train and test samples and training linear regression model
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)
linear_r = LinearRegression(normalize=True)
linear_r.fit(x_train, y_train)
cross_val_mean = cross_val_score(linear_r,
                                 x_train,
                                 y_train,
                                 scoring='neg_mean_squared_error',
                                 cv=5)

print('Predicted AC   vs   Actual AC')
for x, y in zip(linear_r.predict(x_test), y_test):
    print(x, '  vs  ', y)
 def on_field(self, f: str, *vec):
     return make_pipeline(FunctionTransformer(itemgetter(f), validate=False), *vec)
"""
Reshape the data into a 3D array to fit the RNN model.
"""


def shape_model_data(X, n_timesteps, n_features):
    return X.reshape((X.shape[0], n_timesteps, n_features))


# In[8]:
"""
Define preprocessing steps.
"""
pipeline = Pipeline([('scaler', StandardScaler()),
                     ('reshape',
                      FunctionTransformer(shape_model_data,
                                          kw_args=dict(n_timesteps=Tx,
                                                       n_features=N_FEATURES)))
                     ])

# In[10]:
"""
Transform the feature data.
"""
model_X_train = pipeline.fit_transform(X_train)
model_X_test = pipeline.transform(X_test)  # transform only: reuse the scaler fitted on the training data

# In[11]:

from keras.layers import Input, LSTM, BatchNormalization, Dense
from keras import Model
Example #17
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit_transform(['b', 'b', 'a', 'c'])
le.classes_
le.inverse_transform([0, 0, 1, 2, 2])


from sklearn.preprocessing import Binarizer
X = [[ 1., -1.,  2.],
     [ 2.,  0.,  0.],
     [ 0.,  1., -1.]]
binarizer = Binarizer()
binarizer.fit(X)
binarizer.transform(X)
binarizer = Binarizer(threshold=1.1)
binarizer.transform(X)

from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
D = [{'foo':1, 'bar':2}, {'foo':3, 'baz':1}]
X = v.fit_transform(D)
X
v.feature_names_
v.inverse_transform(X)
v.transform({'foo':4, 'unseen_feature':3})

import numpy as np
from sklearn.preprocessing import FunctionTransformer

def all_b(x):
    return x[:, 1:]
x = np.arange(12).reshape(4,3)
func = FunctionTransformer(all_b)
func.fit_transform(x)
    # Be sure that the horizontal grid coordinates of both the source and
    # target cubes have contiguous bounds.
    besure_cube_has_continuous_bounds(param_cube)
    besure_cube_has_continuous_bounds(target_cube)

    # Use the given scheme to regrid
    drv_cube = param_cube.regrid(target_cube, regrid_scheme)
    return drv_cube


topo_tgt = empty_3d_cube_tgt(
    surface_alt_tgt_data, 'surface_altitude', 'm')
topo_src = empty_3d_cube_src(
    surface_alt_src_data, 'surface_altitude', 'm')
lsm_tgt = empty_3d_cube_tgt(
    lsm_tgt_data, 'land_area_fraction', '1', )
lsm_src = empty_3d_cube_src(
    lsm_src_data, 'land_area_fraction', '1')
t_scn_src = empty_3d_cube_src(
    t_scn_src_data, 'air_temperature', 'K')
dpt_scn_src = empty_3d_cube_src(
    dpt_scn_src_data, 'dew_point_temperature', 'K')
sfc_prs_src = empty_3d_cube_src(
    sfc_prs_src_data, 'air_pressure_at_sea_level', 'Pa')
    
X = t_scn.data
transformer = FunctionTransformer(interpolate_by_scipy_linear)
y = transformer.transform(X)
 
    print(dst_target.shape)
    dst_target = raw_target  # restore raw_target
    plot_iris_projection(x_index=0, y_index=1)

    dst_data = Imputer().fit_transform(
        vstack((array([nan, nan, nan, nan]), raw_data[:149])))
    ax = plt.subplot(2, 4, 2 + 4)
    ax.set_title('Imputer()')
    plot_iris_projection(x_index=0, y_index=2)

    dst_data = PolynomialFeatures().fit_transform(raw_data)
    ax = plt.subplot(2, 4, 3 + 4)
    ax.set_title('PolynomialFeatures()')
    plot_iris_projection(x_index=0, y_index=3)

    dst_data = FunctionTransformer(log1p).fit_transform(raw_data)
    ax = plt.subplot(2, 4, 4 + 4)
    ax.set_title('FunctionTransformer()')
    plot_iris_projection(x_index=1, y_index=2)
elif method_reg == "feature-select":
    dst_data = StandardScaler().fit_transform(raw_data)
    # variance selection method
    # parameter threshold is the threshold of variance
    dst_data = VarianceThreshold(threshold=3).fit_transform(raw_data)
    ax = plt.subplot(2, 4, 1 + 4)
    ax.set_title('VarianceThreshold()')
    plot_iris_projection(x_index=0, y_index=0)
    print(dst_data.shape)

    # Chi-square test
    dst_data = SelectKBest(chi2, k=2).fit_transform(raw_data, raw_target)
Example #20
        # normalizing features after nmf
        truncated = b.get_nmf(tfidf, r_nmf) 
        truncated = preprocessing.scale(truncated, with_mean = False)
        km = a.k_means_cluster(truncated, k)
        if print_result:
            result = a.get_result(km, labels)
            a.print_result(result)
        colors = [all_colors[x] for x in km.labels_]
        first = pl.subplot(334)
        first.set_title('normalize features using nmf')
        pl.scatter(truncated[:, 0:1], truncated[:, 1:2], c = colors)
    
        # using non-linear transformation
        non_linear = b.get_nmf(tfidf, r_nmf) 
        non_linear = FunctionTransformer(np.log1p).transform(non_linear)
        km = a.k_means_cluster(non_linear, k)
        if print_result:
            result = a.get_result(km, labels)
            a.print_result(result)
        colors = [all_colors[x] for x in km.labels_]
        first = pl.subplot(335)
        first.set_title('non-linear')
        pl.scatter(truncated[:, 0:1], truncated[:, 1:2], c = colors)

        # using normalize first and then non-linear
        truncated = b.get_nmf(tfidf, r_lsi) 
        truncated = preprocessing.scale(truncated, with_mean = False)
        b_3_first = FunctionTransformer(np.log1p).transform(truncated)
        km = a.k_means_cluster(b_3_first, k)
        if print_result:
Example #21
def col2dict():
    return FunctionTransformer(
        lambda x: pd.DataFrame(x).to_dict(orient='records'), validate=False)


def get_text_length(x):
    return np.array([math.sqrt(len(t)) for t in x]).reshape(-1, 1)


#       *********Features Pipeline*******

pipeline = Pipeline([
    ('features_union',
     FeatureUnion([
         ('ngrams_feature',
          Pipeline([
              ('ngrams_vect', TfidfVectorizer(binary=False,
                                              ngram_range=(1, 2))),
          ])),
         ('length',
          Pipeline([
              ('count', FunctionTransformer(get_text_length, validate=False)),
          ]))
     ])),
    # ],
    #transformer_weights= {'words_feature': 1, 'ngrams_feature': 1,   }
    ('normalization', Normalizer(copy=False)),
    ('classifier', LinearSVC(penalty='l2'))
])

#       *********Applying preprocessing*******

reviews = compile(reviews)
#reviews = normalization(reviews)
#x_train,x_val,y_train,y_val = train_test_split(compile(reviews), target, train_size = 0.75, random_state = 42)
# x_train = get_stemmed_text(x_train,'Porter')
# x_val = get_stemmed_text(x_val,'Porter')
Example #23
def calc_log_prices(closes):
    log_prices = FunctionTransformer(func=np.log).fit_transform(closes)
    log_df = pd.DataFrame(log_prices)
    log_df.index = closes.index
    log_df.columns = closes.columns
    return log_df
Example #24
import numpy as np
from sklearn.cluster import FeatureAgglomeration
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    Nystroem(gamma=10.0, kernel="polynomial", n_components=10),
    make_union(
        VotingClassifier([("est",
                           KNeighborsClassifier(n_neighbors=4,
                                                weights="distance"))]),
        FunctionTransformer(lambda X: X)),
    make_union(
        VotingClassifier([("est",
                           ExtraTreesClassifier(criterion="entropy",
                                                max_features=1.0,
                                                n_estimators=500))]),
        FunctionTransformer(lambda X: X)),
    FeatureAgglomeration(affinity="precomputed", linkage="average"),
    GaussianNB())

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
Example #25
def test_function_transformer_frame():
    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(np.random.randn(100, 10))
    transformer = FunctionTransformer(validate=False)
    X_df_trans = transformer.fit_transform(X_df)
    assert hasattr(X_df_trans, 'loc')
Example #26
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:0.8129780700079303
exported_pipeline = make_pipeline(
    make_union(StackingEstimator(estimator=GaussianNB()),
               FunctionTransformer(copy)),
    ExtraTreesClassifier(bootstrap=False,
                         criterion="gini",
                         max_features=0.4,
                         min_samples_leaf=3,
                         min_samples_split=2,
                         n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #27
 def __init__(self, impute_val=None):
     self.transformer_ = FunctionTransformer(
         impute_null, kw_args={"impute_val": impute_val}, validate=False)
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.feature_extraction import DictVectorizer
from operator import itemgetter
import pandas as pd

class Vectorizer():
    def __init__(self):
        self.vectorizer = None

    def on_field(self, f: str, *vec):
        return make_pipeline(FunctionTransformer(itemgetter(f), validate=False), *vec)

    def to_records(self, df: pd.DataFrame):
        return df.to_dict(orient='records')
    
    def tfidf_vectorizer(self, title_feat=100000, description_feat=500000):
        self.vectorizer = make_union(
                                    self.on_field("title", Tfidf(max_features=title_feat, token_pattern=r"\w+")),
                                    self.on_field("description", Tfidf(max_features=description_feat, token_pattern=r"\w+", ngram_range=(1, 2))),
                                    self.on_field(['shipping', 'status'],
                                    FunctionTransformer(self.to_records, validate=False), DictVectorizer())
                                    )
        return self.vectorizer
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, VarianceThreshold, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8228571428571427
exported_pipeline = make_pipeline(
    make_union(VarianceThreshold(threshold=0.4), FunctionTransformer(copy)),
    StandardScaler(), SelectPercentile(score_func=f_classif, percentile=70),
    LinearSVC(C=0.001, dual=True, loss="hinge", penalty="l2", tol=1e-05))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #30
BIAS_MAX_DF = 0.60  # Max occurrence for words to be bias words

params = {
    'counts__binary': [True, False],
    'model__max_epochs': [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 100],
    'model__lr': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
    'model__batch_size': [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 100],
    'model__module__n_hidden': [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 100],
    'model__callbacks__lr_sched__patience': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}

pipeline = Pipeline([
    ('counts', TfidfVectorizer(max_features=MAX_VOCAB, binary=False)),
    ('dense',
     FunctionTransformer(lambda x: x.toarray(),
                         validate=False,
                         accept_sparse=True)),
    ('model',
     WeightedNeuralNet(module=MLP,
                       device='cuda',
                       callbacks=[
                           ('epoch_score',
                            callbacks.EpochScoring(scoring='f1',
                                                   lower_is_better=False,
                                                   name='valid_f1')),
                           ('lr_sched',
                            callbacks.LRScheduler(policy='ReduceLROnPlateau',
                                                  monitor='valid_f1',
                                                  patience=3)),
                           ('early_stop',
                            callbacks.EarlyStopping(monitor='valid_f1',