Example #1
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.9269704433497538
exported_pipeline = make_pipeline(
    MaxAbsScaler(), LogisticRegression(C=0.01, dual=False, penalty="l2"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
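A small follow-up sketch, not part of the TPOT export above: assuming the fitted pipeline and the held-out split from this example, the test-set accuracy could be checked like this.

# Hedged addition: score the exported pipeline on the held-out data.
from sklearn.metrics import accuracy_score
print(accuracy_score(testing_target, results))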
Example #2
from sklearn.preprocessing import MaxAbsScaler

X = [[1., 10., 2.], [2., 0., 0.], [5., 1., -1.]]
transformer = MaxAbsScaler().fit(X)
transformer
transformer.transform(X)
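For reference, MaxAbsScaler divides each column by its maximum absolute value, which is 5, 10 and 2 for the three columns above, so this example should produce:

# transformer.max_abs_     -> array([ 5., 10.,  2.])
# transformer.transform(X) -> array([[ 0.2,  1. ,  1. ],
#                                    [ 0.4,  0. ,  0. ],
#                                    [ 1. ,  0.1, -0.5]])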
Example #3
    args = parser.parse_args()

    # Set random seed
    seed = args.seed
    np.random.seed(seed)
    torch.manual_seed(seed)

    if torch.cuda.is_available():  # cuda device
        device = 'cuda'
        torch.cuda.set_device(args.gpu)
    else:
        device = 'cpu'
    batch_size = args.batch_size

    normalizer = MaxAbsScaler()

    adata = load_data(args.dataset, transpose=args.transpose)
    args.min_peaks = int(args.min_peaks) if args.min_peaks >= 1 else args.min_peaks
    total_cells = adata.shape[0]
    min_cells = int(args.low * total_cells)
    max_cells = int(args.high * total_cells)
    filter_features(adata, min_cells=min_cells)
    filter_features(adata, max_cells=max_cells)
    filter_cells(adata, min_peaks=args.min_peaks)

    dataset = SingleCellDataset(adata, transforms=[normalizer.fit_transform])
    trainloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    testloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=False)

    cell_num = dataset.shape[0]
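Example #4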
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(MaxAbsScaler(),
                                  LogisticRegression(penalty="l2"))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
Example #5
df = pd.concat([Train_data, Test_data])
'''Data preprocessing'''
df = df.fillna(-1)
train_num = train_Y.shape[0]
train_x = df[:train_num].values

estimator = RandomForestRegressor()
estimator.fit(train_x, train_Y)
feats = pd.Series(data=estimator.feature_importances_, index=df.columns)
feats = feats.sort_values(ascending=False)
high_feature = list(feats[:37].index)

df = df[high_feature].values

MMEncoder = MaxAbsScaler()
df = MMEncoder.fit_transform(df)
test_X = df[train_num:]

with tf.Session() as sess:

    saver = tf.train.import_meta_graph(
        "/home/rex/桌面/T-Brain/DNN/2019-06-01/test.meta")

    saver.restore(sess, "/home/rex/桌面/T-Brain/DNN/2019-06-01/test")
    graph = tf.get_default_graph()
    predict = tf.get_collection('predict')[0]
    X = graph.get_operation_by_name("X").outputs[0]

    print("模型恢復成功!")
class ParallelCoordinates(DataVisualizer):
    """
    Parallel coordinates displays each feature as a vertical axis spaced
    evenly along the horizontal, and each instance as a line drawn between
    each individual axis. This allows you to detect braids of similar instances
    and separability that suggests a good classification problem.

    Parameters
    ----------
    ax : matplotlib Axes, default: None
        The axis to plot the figure on. If None is passed in the current axes
        will be used (or generated if required).

    features : list, default: None
        a list of feature names to use
        If a DataFrame is passed to fit and features is None, feature
        names are selected as the columns of the DataFrame.

    classes : list, default: None
        a list of class names for the legend
        The class labels for each class in y, ordered by sorted class index. These
        names act as a label encoder for the legend, identifying integer classes
        or renaming string labels. If omitted, the class labels will be taken from
        the unique values in y.

        Note that the length of this list must match the number of unique values in
        y, otherwise an exception is raised.

    normalize : string or None, default: None
        specifies which normalization method to use, if any
        Current supported options are 'minmax', 'maxabs', 'standard', 'l1',
        and 'l2'.

    sample : float or int, default: 1.0
        specifies how many examples to display from the data
        If int, specifies the maximum number of samples to display.
        If float, specifies a fraction between 0 and 1 to display.

    random_state : int, RandomState instance or None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random; only used if shuffle is True and sample < 1.0

    shuffle : boolean, default: False
        specifies whether sample is drawn randomly

    colors : list or tuple, default: None
        A single color to plot all instances as or a list of colors to color each
        instance according to its class. If not enough colors per class are
        specified then the colors are treated as a cycle.

    colormap : string or cmap, default: None
        The colormap used to create the individual colors. If classes are
        specified the colormap is used to evenly space colors across each class.

    alpha : float, default: None
        Specify a transparency where 1 is completely opaque and 0 is completely
        transparent. This property makes densely clustered lines more visible.
        If None, the alpha is set to 0.5 in "fast" mode and 0.25 otherwise.

    fast : bool, default: False
        Fast mode improves the performance of the drawing time of parallel
        coordinates but produces an image that does not show the overlap of
        instances in the same class. Fast mode should be used when drawing all
        instances is too burdensome and sampling is not an option.

    vlines : boolean, default: True
        flag to determine vertical line display

    vlines_kwds : dict, default: None
        options to style or display the vertical lines, default: None

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Attributes
    ----------
    n_samples_ : int
        number of samples included in the visualization object

    features_ : ndarray, shape (n_features,)
        The names of the features discovered or used in the visualizer that
        can be used as an index to access or modify data in X. If a user passes
        feature names in, those features are used. Otherwise the columns of a
        DataFrame are used or just simply the indices of the data array.

    classes_ : ndarray, shape (n_classes,)
        The class labels that define the discrete values in the target. Only
        available if the target type is discrete. This is guaranteed to be
        strings even if the classes are a different type.

    Examples
    --------

    >>> visualizer = ParallelCoordinates()
    >>> visualizer.fit(X, y)
    >>> visualizer.transform(X)
    >>> visualizer.poof()

    Notes
    -----

    These parameters can be influenced later on in the visualization
    process, but can and should be set as early as possible.
    """

    NORMALIZERS = {
        "minmax": MinMaxScaler(),
        "maxabs": MaxAbsScaler(),
        "standard": StandardScaler(),
        "l1": Normalizer("l1"),
        "l2": Normalizer("l2"),
    }

    def __init__(
        self,
        ax=None,
        features=None,
        classes=None,
        normalize=None,
        sample=1.0,
        random_state=None,
        shuffle=False,
        colors=None,
        colormap=None,
        alpha=None,
        fast=False,
        vlines=True,
        vlines_kwds=None,
        **kwargs
    ):
        if "target_type" not in kwargs:
            kwargs["target_type"] = "discrete"
        super(ParallelCoordinates, self).__init__(
            ax=ax,
            features=features,
            classes=classes,
            colors=colors,
            colormap=colormap,
            **kwargs
        )

        # Validate 'normalize' argument
        if normalize in self.NORMALIZERS or normalize is None:
            self.normalize = normalize
        else:
            raise YellowbrickValueError(
                "'{}' is an unrecognized normalization method".format(normalize)
            )

        # Validate 'sample' argument
        if isinstance(sample, int):
            if sample < 1:
                raise YellowbrickValueError(
                    "`sample` parameter of type `int` must be greater than 1"
                )
        elif isinstance(sample, float):
            if sample <= 0 or sample > 1:
                raise YellowbrickValueError(
                    "`sample` parameter of type `float` must be between 0 and 1"
                )
        else:
            raise YellowbrickTypeError("`sample` parameter must be int or float")
        self.sample = sample

        # Set sample parameters
        if isinstance(shuffle, bool):
            self.shuffle = shuffle
        else:
            raise YellowbrickTypeError("`shuffle` parameter must be boolean")
        if self.shuffle:
            if (random_state is None) or isinstance(random_state, int):
                self._rng = RandomState(random_state)
            elif isinstance(random_state, RandomState):
                self._rng = random_state
            else:
                raise YellowbrickTypeError(
                    "`random_state` must be None, int, or np.random.RandomState"
                )
        else:
            self._rng = None

        # Visual and drawing parameters
        self.fast = fast
        self.alpha = alpha
        self.show_vlines = vlines
        self.vlines_kwds = vlines_kwds or {"linewidth": 1, "color": "black"}

        # Internal properties
        self._increments = None
        self._colors = None

    def fit(self, X, y=None, **kwargs):
        """
        The fit method is the primary drawing input for the
        visualization since it has both the X and y data required for the
        viz and the transform method does not.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer
        """
        # Determine the features, classes, and colors
        super(ParallelCoordinates, self).fit(X, y)

        # Convert from pandas data types
        if is_dataframe(X):
            X = X.values
        if is_series(y):
            y = y.values

        # Ticks for each feature specified
        self._increments = np.arange(len(self.features_))

        # Subsample instances
        X, y = self._subsample(X, y)

        # Normalize instances
        if self.normalize is not None:
            X = self.NORMALIZERS[self.normalize].fit_transform(X)

        self.draw(X, y, **kwargs)
        return self

    def draw(self, X, y, **kwargs):
        """
        Called from the fit method, this method creates the parallel
        coordinates canvas and draws each instance and vertical lines on it.

        Parameters
        ----------
        X : ndarray of shape n x m
            A matrix of n instances with m features

        y : ndarray of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method

        """
        if self.fast:
            return self.draw_classes(X, y, **kwargs)
        return self.draw_instances(X, y, **kwargs)

    def draw_instances(self, X, y, **kwargs):
        """
        Draw the instances colored by the target y such that each line is a
        single instance. This is the "slow" mode of drawing, since each
        instance has to be drawn individually. However, in so doing, the
        density of instances in braids is more apparent since lines have an
        independent alpha that is compounded in the figure.

        This is the default method of drawing.

        Parameters
        ----------
        X : ndarray of shape n x m
            A matrix of n instances with m features

        y : ndarray of length n
            An array or series of target or class values

        Notes
        -----
        This method can be used to draw additional instances onto the parallel
        coordinates before the figure is finalized.
        """
        # Get alpha from param or default
        alpha = self.alpha or 0.25

        for idx in range(len(X)):
            Xi = X[idx]
            yi = y[idx]
            color = self.get_colors([yi])[0]

            self.ax.plot(self._increments, Xi, color=color, alpha=alpha, **kwargs)

        return self.ax

    def draw_classes(self, X, y, **kwargs):
        """
        Draw the instances colored by the target y such that each line is a
        single class. This is the "fast" mode of drawing, since the number of
        lines drawn equals the number of classes, rather than the number of
        instances. However, this drawing method sacrifices inter-class density
        of points using the alpha parameter.

        Parameters
        ----------
        X : ndarray of shape n x m
            A matrix of n instances with m features

        y : ndarray of length n
            An array or series of target or class values
        """
        # Get alpha from param or default
        alpha = self.alpha or 0.5

        # Prepare to flatten data within each class:
        #   introduce separation between individual data points using None in
        #   x-values and arbitrary value (one) in y-values
        X_separated = np.hstack([X, np.ones((X.shape[0], 1))])
        increments_separated = self._increments.tolist()
        increments_separated.append(None)

        # Get the classes that exist in the dataset, y
        y_values = np.unique(y)

        # Plot each class as a single line plot
        for yi in y_values:
            color = self.get_colors([yi])[0]

            X_in_class = X_separated[y == yi, :]
            increments_in_class = increments_separated * len(X_in_class)
            if len(X_in_class) > 0:
                self.ax.plot(
                    increments_in_class,
                    X_in_class.flatten(),
                    linewidth=1,
                    color=color,
                    alpha=alpha,
                    **kwargs
                )

        return self.ax

    def finalize(self, **kwargs):
        """
        Finalize executes any subclass-specific axes finalization steps.
        The user calls poof and poof calls finalize.

        Parameters
        ----------
        kwargs: generic keyword arguments.

        """
        # Set the title
        self.set_title(
            "Parallel Coordinates for {} Features".format(len(self.features_))
        )

        # Add the vertical lines
        # TODO: Make an independent function for override!
        if self.show_vlines:
            for idx in self._increments:
                self.ax.axvline(idx, **self.vlines_kwds)

        # Set the limits
        self.ax.set_xticks(self._increments)
        self.ax.set_xticklabels(self.features_)
        self.ax.set_xlim(self._increments[0], self._increments[-1])

        # Add the legend sorting classes by name
        labels = sorted(list(self._colors.keys()))
        colors = [self._colors[lbl] for lbl in labels]
        manual_legend(self, labels, colors, loc="best", frameon=True)

        # Add the grid view
        self.ax.grid()

    def _subsample(self, X, y):

        # Choose a subset of samples
        if isinstance(self.sample, int):
            n_samples = min([self.sample, len(X)])
        elif isinstance(self.sample, float):
            n_samples = int(len(X) * self.sample)

        if (n_samples < len(X)) and self.shuffle:
            indices = self._rng.choice(len(X), n_samples, replace=False)
        else:
            indices = slice(n_samples)
        X = X[indices, :]
        y = y[indices]

        self.n_samples_ = n_samples
        return X, y
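A brief, hypothetical usage sketch for the visualizer above, wiring the 'maxabs' option (backed by MaxAbsScaler in NORMALIZERS) to the iris data; it assumes the yellowbrick package is installed and follows the fit/poof flow shown in the class docstring.

# Hypothetical usage sketch; assumes yellowbrick is installed.
from sklearn.datasets import load_iris
from yellowbrick.features import ParallelCoordinates

iris = load_iris()
viz = ParallelCoordinates(
    features=iris.feature_names,
    classes=iris.target_names.tolist(),
    normalize="maxabs",  # each feature is scaled with MaxAbsScaler before drawing
    sample=0.5, shuffle=True, random_state=0,
)
viz.fit(iris.data, iris.target)  # fit() also draws the instances
viz.poof()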
Example #7
                                                    y,
                                                    test_size=0.3,
                                                    random_state=7)

for C in np.arange(0.05, 2, 0.05):
    for gamma in np.arange(0.001, 0.1, 0.001):

        svc = SVC(C=C, gamma=gamma)
        svc.fit(X_train, y_train)
        score = svc.score(X_test, y_test)
        if score > best_score:
            best_score = score
            print "C, gamma, score", C, gamma, score

#maxabs
norm = MaxAbsScaler()
norm.fit(X)
T = norm.transform(X)

X_train, X_test, y_train, y_test = train_test_split(T,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=7)

for C in np.arange(0.05, 2, 0.05):
    for gamma in np.arange(0.001, 0.1, 0.001):

        svc = SVC(C=C, gamma=gamma)
        svc.fit(X_train, y_train)
        score = svc.score(X_test, y_test)
        if score > best_score:
Example #8
        #               'trailer',
        #               'truck_age_at_orig',
        #               'orig_amt_>150k'
    ],
    axis=1)

X = sm.add_constant(X)

#### Normalization ####

# Unit Norm
norm = Normalizer()
#X = pd.DataFrame(norm.fit_transform(X),columns=X.columns)

## Maximum absolute value
max_abs = MaxAbsScaler()
#X = pd.DataFrame(max_abs.fit_transform(X),columns=X.columns)
#
## MinMax Scaling
minmax = MinMaxScaler()
#X = pd.DataFrame(minmax.fit_transform(X),columns=X.columns)

Y = data['target']

X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=1)

names = X_train.columns
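Example #9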
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import BASE_COLORS
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler

data = pd.read_csv('iris.csv').values

X = data[:, :-1]
# X = data[:, 2].reshape(-1, 1)
y = data[:, -1]
X = MaxAbsScaler().fit(X).transform(X)
y = LabelEncoder().fit(y).transform(y)

colormap = np.array(list(BASE_COLORS.keys()))

plt.xlabel('x')
plt.ylabel('y')
for i in range(X.shape[1]):
    for j in range(i + 1, X.shape[1]):
        plt.scatter(X[:, i], X[:, j], c=colormap[y])
        plt.show()
Example #10
def main():
    

    config_file=""   
    acceptable_parameters={"shuffle" : "bool" ,                           
                           "learning_rate" : "str" ,
                           "penalty" : "str" ,                             
                           "loss" : "str" ,                            
                           "epsilon" : "float" ,  
                           "alpha" : "float" ,                            
                           "power_t" : "float" ,                           
                           "n_iter" : "int" ,  
                           "eta0" : "float" ,     
                           "l1_ratio" : "float" ,                                       
                           "random_state" : "int" ,   
                           "verbose" : "int",                              
                           }

#usedense=true
#standardize=true
#use_log1p=true   



    
    
    arguments=sys.argv
    print ("arguments: ",arguments )
    if len(arguments)!=2:
        raise Exception(" was expecting only one argument pointing to the config file... process will terminate")

    else :
        config_file=arguments[1] 
        dense,standardize,use_log1p,task_type,model_file,data_file,prediction_file,column, model_parameters=read_file_end_return_parameters(config_file, acceptable_parameters)   
        #sanity checks
        if task_type not in ["train","predict"]:
            raise Exception("task needs to be either train or predict, here it was %s ... " % (task_type))   
        if model_file=="":
            raise Exception("model file cannot be empty")       
        if data_file=="":
            raise Exception("data file file cannot be empty")    
        if not os.path.isfile(data_file):
            raise Exception(" %s data file does not exist... " % (data_file))           
        if task_type=="predict" and  prediction_file=="":
            raise Exception("prediction file  cannot be empty when task=predict")  
        if len(model_parameters)==0 and task_type=="train":
            raise Exception("model parameters cannot be empty") 
        if column<1:
            raise Exception("columns cannot be less than 1...")   
        if  "validation_fraction" in model_parameters and model_parameters["validation_fraction"]>0.0:
                 model_parameters["early_stopping"]=True   
        if  "solver" in model_parameters and model_parameters["solver"]=="sgd":
                 model_parameters["nesterovs_momentum"]=True    
     
        ################### Model training ###############
        if  task_type =="train":
            
            st=StandardScaler() 
            ab=MaxAbsScaler()    
            
            
            
            X,y=get_data(data_file, column) #load data
            model=SGDRegressor(**model_parameters) # set model parameters
            if dense: #convert to dense - useful if the data does not have high dimensionality.
            #Also sklearn models are not optimized for sparse data in tree-based algos
               X=X.toarray()
               if use_log1p :
                   X[X<0]=0
                   X=np.log1p(X)                  
               if standardize:
                   X=st.fit_transform(X)
                   
               model.fit(X,y) #fitting model
               joblib.dump((model,st) , model_file) 
                  
            else :
               if use_log1p:
                   X[X<0]=0
                   X=csr_matrix(X).log1p()               
               if standardize :
                   X=ab.fit_transform(X)  
                   
               model.fit(X,y) #fitting model
               joblib.dump((model,ab) , model_file)               
               

            if not os.path.isfile(model_file):
                raise Exception(" %s model file could not be exported - check permissions ... " % (model_file))             
                
            sys.exit(-1)# exit script
        ################### predicting ###############            
        else :
            if not os.path.isfile(model_file):
                raise Exception(" %s model file could not be imported " % (model_file))              
            X,y=get_data(data_file, column) #load data
            model,scaler=joblib.load(model_file)
            if dense: #convert to dense - useful if the data does not have high dimensionality.
            #Also sklearn models are not optimized for sparse data in tree-based algos
               X=X.toarray()
               if use_log1p :
                   X[X<0]=0
                   X=np.log1p(X)                          
            
            else :
               if use_log1p:
                   X[X<0]=0
                   X=csr_matrix(X).log1p()
                   
            if standardize:
                 X=scaler.transform(X)


            preds=model.predict(X)
            np.savetxt(prediction_file, preds, delimiter=",", fmt='%.9f')
            if not os.path.isfile(prediction_file):
                raise Exception(" %s prediction file could not be exported - check permissions ... " % (prediction_file))             
            sys.exit(-1)# exit script        
Example #11
    data = data.drop(data[data['hminus_TRACK_Type'] == 5].index)
    variables = [
        "V0_ENDVERTEX_Z", "hplus_P", "hplus_PT", "hminus_PT", "hminus_PZ",
        "Angle", "nLongTracks", "V0_ENDVERTEX_CHI2", "hplus_TRACK_GhostProb",
        "hminus_TRACK_CHI2NDOF"
    ]

#"V0_ENDVERTEX_Y""nTracks","V0_ORIVX_Y","V0_ORIVX_CHI2"
#  down==1 variables: "V0_ENDVERTEX_CHI2","V0_ENDVERTEX_Z","V0_ENDVERTEX_Y","hplus_P","hplus_PY","hminus_P","Angle","nTracks","V0_ORIVX_X","V0_ORIVX_Z","V0_ORIVX_CHI2","hplus_IP_OWNPV"
#Declaring real masses from pdg
# data= data.drop(["Track_type"],axis=1)
pionm = 139.57061
protonm = 938.272081
lambdam = 1115.683

cs = MaxAbsScaler()

# norm = data.max() - data.min()
# data = data - [data.min()[0],data.min()[1],data.min()[2],data.min()[3],data.min()[4],data.min()[5],0]
# data = data.div([norm[0],norm[1],norm[2],norm[3],norm[4],norm[5],1])

data = data.dropna()
data = data.drop(data[data.Resolution.abs() > 10].index)
datamlp = data[variables]
labelmlp = data["Resolution"].abs()

Xtrain, Xvalid, Ytrain, Yvalid = train_test_split(data[variables],
                                                  data["Resolution"].abs(),
                                                  test_size=0.3)
datamlp = cs.fit_transform(datamlp)
Xtrain = cs.transform(Xtrain)
Example #12
data_clients.CODE_GENDER = labelencoder.fit_transform(data_clients.CODE_GENDER)
data_clients.FLAG_OWN_CAR = labelencoder.fit_transform(
    data_clients.FLAG_OWN_CAR)
data_clients.FLAG_OWN_REALTY = labelencoder.fit_transform(
    data_clients.FLAG_OWN_REALTY)
data_clients.NAME_TYPE_SUITE = labelencoder.fit_transform(
    data_clients.NAME_TYPE_SUITE)
data_clients.NAME_INCOME_TYPE = labelencoder.fit_transform(
    data_clients.NAME_INCOME_TYPE)
data_clients.NAME_EDUCATION_TYPE = labelencoder.fit_transform(
    data_clients.NAME_EDUCATION_TYPE)
data_clients.NAME_FAMILY_STATUS = labelencoder.fit_transform(
    data_clients.NAME_FAMILY_STATUS)
data_clients.NAME_HOUSING_TYPE = labelencoder.fit_transform(
    data_clients.NAME_HOUSING_TYPE)
data_clients.OCCUPATION_TYPE = labelencoder.fit_transform(
    data_clients.OCCUPATION_TYPE)
data_clients.WEEKDAY_APPR_PROCESS_START = labelencoder.fit_transform(
    data_clients.WEEKDAY_APPR_PROCESS_START)
data_clients.ORGANIZATION_TYPE = labelencoder.fit_transform(
    data_clients.ORGANIZATION_TYPE)

scaler_maxabs = MaxAbsScaler()
data = scaler_maxabs.fit_transform(data_clients)
X_train = PCA(n_components=40).fit_transform(data)

resultat = model.predict(X_train)

dataset['TARGET_p'] = resultat.tolist()

dataset.to_csv('predict.csv')
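Example #13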
import numpy as np

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler, MinMaxScaler

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv("PATH/TO/DATA/FILE",
                          delimiter="COLUMN_SEPARATOR",
                          dtype=np.float64)
features = np.delete(
    tpot_data.view(np.float64).reshape(tpot_data.size, -1),
    tpot_data.dtype.names.index("class"),
    axis=1,
)
(
    training_features,
    testing_features,
    training_classes,
    testing_classes,
) = train_test_split(features, tpot_data["class"], random_state=42)

exported_pipeline = make_pipeline(
    MaxAbsScaler(), MinMaxScaler(),
    LogisticRegression(C=49.0, dual=True, penalty="l2"))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
Example #14
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.svm import LinearSVC

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(
        VotingClassifier([("est", RandomForestClassifier(n_estimators=500))]),
        FunctionTransformer(lambda X: X)), MaxAbsScaler(),
    RandomizedPCA(iterated_power=10),
    LinearSVC(C=0.79, dual=False, penalty="l1"))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
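This export targets an old scikit-learn: sklearn.cross_validation and RandomizedPCA were removed in later releases. A hedged sketch of the rough equivalents on a current version:

# Rough modern equivalents (assumption: scikit-learn >= 0.20); behaviour may differ slightly.
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
pca = PCA(iterated_power=10, svd_solver='randomized')  # stands in for RandomizedPCA(iterated_power=10)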
Example #15
from sklearn.preprocessing import MaxAbsScaler

def normaliza(dados):
    # Scale each feature to the [-1, 1] range by its maximum absolute value.
    p = MaxAbsScaler()
    p.fit(dados)
    return p.transform(dados)
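Example #16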
from sklearn.preprocessing import RobustScaler

from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose

iris = load_iris()


def _get_valid_samples_by_column(X, col):
    """Get non NaN samples in column of X"""
    return X[:, [col]][~np.isnan(X[:, col])]


@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive, omit_kwargs",
    [(MaxAbsScaler(), maxabs_scale, True, False, []),
     (MinMaxScaler(), minmax_scale, False, False, ['clip']),
     (StandardScaler(), scale, False, False, []),
     (StandardScaler(with_mean=False), scale, True, False, []),
     (PowerTransformer('yeo-johnson'), power_transform, False, False, []),
     (PowerTransformer('box-cox'), power_transform, False, True, []),
     (QuantileTransformer(n_quantiles=10), quantile_transform, True, False,
      []), (RobustScaler(), robust_scale, False, False, []),
     (RobustScaler(with_centering=False), robust_scale, True, False, [])])
def test_missing_value_handling(est, func, support_sparse, strictly_positive,
                                omit_kwargs):
    # check that the preprocessing method let pass nan
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
Example #17
def randomForest( dataFrame, targetColumn, featureNames ):
    dataFrame = dataFrame[featureNames]

    FEATURE_NAMES = list(dataFrame.columns)
    FEATURE_NAMES.remove(targetColumn)
    COLUMNS = list(dataFrame.columns)
    LABEL = targetColumn

    Y_dataFrame = dataFrame[[targetColumn]]
    Y_values = Y_dataFrame.values
    X_dataFrame = dataFrame.drop(targetColumn, axis=1)
    X_values = X_dataFrame.values

    print(X_dataFrame.describe())


    FEATURE_DEFAULTS = ((X_dataFrame.max() + X_dataFrame.min()) * 0.5).to_dict()

    # preprocessorY = MinMaxScaler()
    # preprocessorY = StandardScaler()
    preprocessorY = MaxAbsScaler()
    preprocessorY.fit(Y_values)
    preprocessorX = MinMaxScaler()
    # preprocessorX = StandardScaler()
    preprocessorX.fit(X_values)

    Y_values = preprocessorY.transform(Y_values)
    X_values = preprocessorX.transform(X_values)
    X_numpyTrainVal, X_numpyTest, Y_numpyTrainVal, Y_numpyTest = train_test_split(X_values, Y_values, test_size=0.1)
    model = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    model.fit(X_numpyTrainVal, Y_numpyTrainVal)  # training
    Y_numpyPredict = model.predict(X_numpyTest)  # prediction

    X_numpyTotal = X_values
    Y_numpyTotal = Y_values
    eps = 0.001
    Y_relErr = np.abs(Y_numpyPredict - Y_numpyTest.flatten()) / (Y_numpyTest + eps)
    for threshold in [0.025, 0.05, 0.10, 0.15]:
        bad_s = np.sum((Y_relErr > threshold))
        good_s = np.sum((Y_relErr <= threshold))
        total_s = Y_relErr.size
        print("threshold = {:5}, good = {:10}, bad = {:10}, err = {:4}".format(threshold, good_s/total_s, bad_s/total_s,
                                                                               bad_s / (good_s + bad_s)))

    Y_numpyPredict = preprocessorY.inverse_transform(Y_numpyPredict.reshape(-1, 1))
    Y_numpyTest = preprocessorY.inverse_transform(Y_numpyTest.reshape(-1, 1))
    modelPacket = dict()
    modelPacket['model'] = model
    modelPacket['preprocessorX'] = preprocessorX
    modelPacket['preprocessorY'] = preprocessorY

    modelPacket['feature_names'] = FEATURE_NAMES
    modelPacket['feature_defaults'] = FEATURE_DEFAULTS
    threshold = 10
    print()
    Y_relativeError = np.abs(Y_numpyPredict - Y_numpyTest) * 100 / Y_numpyTest

    allValues = Y_numpyTest
    mask = Y_relativeError > threshold
    badValues = Y_numpyTest[mask]
    mask = Y_relativeError <= threshold
    goodValues = Y_numpyTest[mask]

    bins = range(1, 20)
    bins = [i * 0.5e6 for i in bins]

    figure, axes = plt.subplots(3, 1)
    axes[1].axis('tight')
    axes[1].axis('off')

    resultValues = axes[0].hist([allValues, goodValues, badValues], bins=bins, histtype='bar',
                                color=['green', 'yellow', 'red'])
    allValues = resultValues[0][0]
    goodValues = resultValues[0][1]
    badValues = resultValues[0][2]

    accuracy = goodValues * 100 / (allValues + 0.01)
    col_label = ['{:5d}'.format(int((bins[i + 0] + bins[i + 1]) / 2)) for i in range(len(bins) - 1)]
    cell_text = [['{:2.1f}'.format(acc_) for acc_ in accuracy], ]

    table_ = axes[1].table(cellText=cell_text, colLabels=col_label, loc='center')
    table_.auto_set_font_size(False)
    table_.set_fontsize(8)

    Y_numpyTest_max = np.max(Y_numpyTest)
    Y_numpyTest_min = np.min(Y_numpyTest)

    # axes[2].set_position([Y_numpyTotal_min-Y_numpyTotal_width*0.1,Y_numpyTotal_min-Y_numpyTotal_width*0.1,Y_numpyTotal_width*0.2,Y_numpyTotal_width*0.2])
    axes[2].plot(Y_numpyTest, Y_numpyTest, c='blue')
    axes[2].plot(Y_numpyTest, Y_numpyTest * (1.0 + 0.1), c='red')
    axes[2].plot(Y_numpyTest, Y_numpyTest * (1.0 - 0.1), c='red')
    axes[2].scatter(Y_numpyPredict, Y_numpyTest)
    plt.show()

    # figure, axes =plt.subplots(3,1)
    # clust_data = np.random.random((10,3))
    # collabel=("col 1", "col 2", "col 3")
    # axs[0].axis('tight')
    # axs[0].axis('off')
    # the_table = axs[0].table(cellText=clust_data,colLabels=collabel,loc='center')

    # axs[1].plot(clust_data[:,0],clust_data[:,1])
    # plt.show()

    return modelPacket, (Y_numpyPredict, Y_numpyTotal)
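Example #18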
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=123)

# Average CV score on the training set was: 0.9604589764296974
exported_pipeline = make_pipeline(
    RobustScaler(), MaxAbsScaler(),
    GaussianProcessRegressor(kernel=Matern(length_scale=4.0, nu=2.5),
                             n_restarts_optimizer=185,
                             normalize_y=False))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 123)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #19
def train_ann(X_array,
              Y_array,
              scaling_input=None,
              scaling_output=None,
              reduce_pca=False,
              outfile=None,
              regressor_opts={'activation': 'logistic'}):

    print('Fitting Artificial Neural Network')

    Y_array = np.asarray(Y_array)
    X_array = np.asarray(X_array)

    pca = None
    input_scaler = None
    output_scaler = None

    # Normalize the input
    if scaling_input == 'minmax':
        input_scaler = MinMaxScaler()
        X_array = input_scaler.fit_transform(X_array)
        if outfile:
            fid = open(outfile + '_scaler_input', 'wb')
            pickle.dump(input_scaler, fid, -1)
            fid.close()
    elif scaling_input == 'maxabs':
        input_scaler = MaxAbsScaler()
        X_array = input_scaler.fit_transform(X_array)
        if outfile:
            fid = open(outfile + '_scaler_input', 'wb')
            pickle.dump(input_scaler, fid, -1)
            fid.close()
    elif scaling_input == 'normalize':
        input_scaler = StandardScaler()
        X_array = input_scaler.fit_transform(X_array)
        if outfile:
            fid = open(outfile + '_scaler_input', 'wb')
            pickle.dump(input_scaler, fid, -1)
            fid.close()

    if reduce_pca:
        # Reduce input variables using Principal Component Analysis
        pca = PCA(n_components=10)
        X_array = pca.fit_transform(X_array)
        if outfile:
            fid = open(outfile + '_PCA', 'wb')
            pickle.dump(pca, fid, -1)
            fid.close()

    # Normalize the output
    if scaling_output == 'minmax':
        output_scaler = MinMaxScaler()
        Y_array = output_scaler.fit_transform(Y_array)
        if outfile:
            fid = open(outfile + '_scaler_output', 'wb')
            pickle.dump(output_scaler, fid, -1)
            fid.close()
    elif scaling_output == 'maxabs':
        output_scaler = MaxAbsScaler()
        Y_array = output_scaler.fit_transform(Y_array)
        if outfile:
            fid = open(outfile + '_scaler_output', 'wb')
            pickle.dump(output_scaler, fid, -1)
            fid.close()
    elif scaling_output == 'normalize':
        output_scaler = StandardScaler()
        Y_array = output_scaler.fit_transform(Y_array)
        if outfile:
            fid = open(outfile + '_scaler_output', 'wb')
            pickle.dump(output_scaler, fid, -1)
            fid.close()

    # Get the number of bands to set the ANN structure
    ann = ann_sklearn.MLPRegressor(**regressor_opts)

    ANN = ann.fit(X_array, Y_array)
    if outfile:
        fid = open(outfile, 'wb')
        pickle.dump(ANN, fid, -1)
        fid.close()

    return ANN, input_scaler, output_scaler, pca
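A minimal, hypothetical call of the helper above on random data, using the 'maxabs' input-scaling branch; it assumes the snippet's own missing imports (numpy as np, the sklearn scalers, PCA, pickle and sklearn.neural_network as ann_sklearn) are already in scope.

# Hypothetical call sketch for train_ann; the names below are illustrative only.
import numpy as np
rng = np.random.default_rng(0)
X_demo = rng.random((200, 6))
y_demo = rng.random(200)
ann, in_scaler, out_scaler, pca = train_ann(
    X_demo, y_demo, scaling_input='maxabs', scaling_output=None, outfile=None)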
Example #20
import numpy as np
import pandas as pd

from sklearn.feature_selection import SelectPercentile, VarianceThreshold, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MaxAbsScaler, PolynomialFeatures
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.6334721147216316
exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
    SelectPercentile(score_func=f_regression, percentile=89),
    VarianceThreshold(threshold=0.1), MaxAbsScaler(),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    LinearSVR(C=1.0,
              dual=True,
              epsilon=1.0,
              loss="epsilon_insensitive",
              tol=0.0001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #21
from sklearn.feature_selection import SelectFwe, SelectKBest, SelectPercentile, VarianceThreshold
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, f1_score
from tpot_metrics import balanced_accuracy_score
from sklearn.pipeline import make_pipeline
import itertools
import sys

from sklearn.preprocessing import (Binarizer, MaxAbsScaler, MinMaxScaler, Normalizer,
                                   PolynomialFeatures, RobustScaler, StandardScaler)
from sklearn.decomposition import FastICA, PCA
from sklearn.kernel_approximation import RBFSampler, Nystroem
from sklearn.cluster import FeatureAgglomeration

dataset = sys.argv[1]

preprocessor_list = [
    Binarizer(),
    MaxAbsScaler(),
    MinMaxScaler(),
    Normalizer(),
    PolynomialFeatures(),
    RobustScaler(),
    StandardScaler(),
    FastICA(),
    PCA(),
    RBFSampler(),
    Nystroem(),
    FeatureAgglomeration(),
    SelectFwe(),
    SelectKBest(),
    SelectPercentile(),
    VarianceThreshold(),
    SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)),
Example #22
                                          verbosity=0)), MinMaxScaler(),
 MinMaxScaler(),
 StackingEstimator(estimator=SGDRegressor(alpha=0.01,
                                          eta0=0.01,
                                          fit_intercept=False,
                                          l1_ratio=0.0,
                                          learning_rate="constant",
                                          loss="huber",
                                          penalty="elasticnet",
                                          power_t=0.0)),
 StackingEstimator(estimator=LinearSVR(C=25.0,
                                       dual=True,
                                       epsilon=0.01,
                                       loss="epsilon_insensitive",
                                       tol=0.0001)),
 FeatureAgglomeration(affinity="l2", linkage="average"), MaxAbsScaler(),
 SelectPercentile(score_func=f_regression, percentile=6),
 StackingEstimator(
     estimator=GradientBoostingRegressor(alpha=0.9,
                                         learning_rate=0.1,
                                         loss="huber",
                                         max_depth=3,
                                         max_features=0.1,
                                         min_samples_leaf=13,
                                         min_samples_split=11,
                                         n_estimators=10,
                                         subsample=0.7000000000000001)),
 StackingEstimator(estimator=LinearSVR(C=20.0,
                                       dual=True,
                                       epsilon=1.0,
                                       loss="squared_epsilon_insensitive",
# Import functional utilities
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import FeatureUnion

# Perform preprocessing
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate pipeline: pl
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer(token_pattern= TOKENS_ALPHANUMERIC,
                                                   ngram_range=(1, 2))),
                    ('dim_red', SelectKBest(chi2, chi_k))
                ]))
             ]
        )),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])
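This exercise snippet relies on names defined earlier in the original material (combine_text_columns, NUMERIC_COLUMNS, chi_k) and on imports that are not shown. A hedged sketch of the imports it would need on a recent scikit-learn, where the removed Imputer is replaced by SimpleImputer:

# Assumed imports (hypothetical); combine_text_columns, NUMERIC_COLUMNS and chi_k
# come from earlier in the original exercise and are not reproduced here.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.impute import SimpleImputer  # modern replacement for the removed Imputer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression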
Example #24
training_end = time.perf_counter()
prediction_start = time.perf_counter()
preds = knn.predict(X_test)
prediction_end = time.perf_counter()
acc_knn = (preds == y_test).sum().astype(float) / len(preds) * 100
knn_train_time = training_end - training_start
knn_prediction_time = prediction_end - prediction_start
print("K Nearest Neighbors Classifier's prediction accuracy is: %3.2f" %
      (acc_knn))
print("Time consumed for training: %4.3f seconds" % (knn_train_time))
print("Time consumed for prediction: %6.5f seconds" % (knn_prediction_time))

###Naive Bayes

from sklearn.preprocessing import MaxAbsScaler
scaler_gnb = MaxAbsScaler()
sdss = scaler_gnb.fit_transform(sdss_df_fe.drop('Category_list', axis=1))
X_train_gnb, X_test_gnb, y_train_gnb, y_test_gnb = train_test_split(
    sdss, sdss_df_fe['Category_list'], test_size=0.33)

gnb = GaussianNB()
training_start = time.perf_counter()
gnb.fit(X_train_gnb, y_train_gnb)
training_end = time.perf_counter()
prediction_start = time.perf_counter()
preds = gnb.predict(X_test_gnb)
prediction_end = time.perf_counter()
acc_gnb = (preds == y_test_gnb).sum().astype(float) / len(preds) * 100
gnb_train_time = training_end - training_start
gnb_prediction_time = prediction_end - prediction_start
print("Gaussian Naive Bayes Classifier's prediction accuracy is: %3.2f" %
Example #25
from sklearn.preprocessing import MaxAbsScaler

def maxabsscaler(data):
    # Scale each feature to the [-1, 1] range by its maximum absolute value.
    data = MaxAbsScaler().fit_transform(data)
    return data
Example #26
}

# Take only 2 features to make visualization easier
# Feature MedInc has a long tail distribution.
# Feature AveOccup has a few but very large outliers.
features = ['MedInc', 'AveOccup']
features_idx = [feature_names.index(feature) for feature in features]
X = X_full[:, features_idx]
distributions = [
    ('Unscaled data', X),
    ('Data after standard scaling',
        StandardScaler().fit_transform(X)),
    ('Data after min-max scaling',
        MinMaxScaler().fit_transform(X)),
    ('Data after max-abs scaling',
        MaxAbsScaler().fit_transform(X)),
    ('Data after robust scaling',
        RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
    ('Data after power transformation (Yeo-Johnson)',
     PowerTransformer(method='yeo-johnson').fit_transform(X)),
    ('Data after power transformation (Box-Cox)',
     PowerTransformer(method='box-cox').fit_transform(X)),
    ('Data after quantile transformation (uniform pdf)',
        QuantileTransformer(output_distribution='uniform')
        .fit_transform(X)),
    ('Data after quantile transformation (gaussian pdf)',
        QuantileTransformer(output_distribution='normal')
        .fit_transform(X)),
    ('Data after sample-wise L2 normalizing',
        Normalizer().fit_transform(X)),
]
Example #27
@author: damian.campo
"""

from sklearn import datasets
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import MaxAbsScaler  # See Normalizer and StandardScaler

#   Load dataset
iris = datasets.load_iris()
samples = iris.data
#print(samples)

#   Preprocess the samples so that all features are between 0 and 1
scaler = MaxAbsScaler()
scaler.fit(samples)
samples = scaler.transform(samples)

model = KMeans(n_clusters=3)
model.fit(samples)
labels = model.predict(samples)

target = iris.target
plt.figure()
#   For plotting purposes, only features 2 and 3 are taken into consideration
plt.scatter(samples[:, 2], samples[:, 3], c=target)
plt.title('Ground truth clusters  (Based on 2 features)')

centroids = model.cluster_centers_
centroids_x = centroids[:, 2]
Example #28
                                              on='userId',
                                              how='left')
    columns = test_data.columns.tolist()
    df = test_data.merge(user_feature, on='userId', how='left')
    df1 = df.merge(offline_user_mer_feature,
                   on=['userId', 'merchantId'],
                   how='left')
    df2 = df1.merge(offline_mer_feature, on='merchantId', how='left')
    df2.fillna(np.nan, inplace=True)
    df2.drop(columns, axis=1, inplace=True)
    df2.to_csv('resource/train_features.csv', index=False)

    # train_features = pd.read_csv(train_feature_path).astype(float)
    columns = df2.columns.tolist()
    for col in columns:
        if col == 'userAverageDistance_y' or col == 'userAverageDistance_x':
            df2[col] = df2[col].fillna(-1)
        else:
            df2[col] = df2[col].fillna(0)

    max_abs_scaler = MaxAbsScaler()
    x_test_maxabs = max_abs_scaler.fit_transform(df2)
    clf = joblib.load('model/xgb/xgb_model.pkl')
    test_matrix = xgboost.DMatrix(df2.values, feature_names=df2.columns)

    y_pre = clf.predict(test_matrix)
    y_pre_df = pd.DataFrame(pd.Series(y_pre), columns=['Probability'])
    submit_df = test_data[['userId', 'Coupon_id',
                           'Date_received']].join(y_pre_df)
    submit_df = submit_df.rename(columns={'userId': 'User_id'})
    submit_df.to_csv('submit/submit.csv', index=False)
Example #29
from sklearn.preprocessing import RobustScaler

from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose

iris = load_iris()


def _get_valid_samples_by_column(X, col):
    """Get non NaN samples in column of X"""
    return X[:, [col]][~np.isnan(X[:, col])]


@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive",
    [(MaxAbsScaler(), maxabs_scale, True, False),
     (MinMaxScaler(), minmax_scale, False, False),
     (StandardScaler(), scale, False, False),
     (StandardScaler(with_mean=False), scale, True, False),
     (PowerTransformer('yeo-johnson'), power_transform, False, False),
     (PowerTransformer('box-cox'), power_transform, False, True),
     (QuantileTransformer(n_quantiles=10), quantile_transform, True, False),
     (RobustScaler(), robust_scale, False, False),
     (RobustScaler(with_centering=False), robust_scale, True, False)])
def test_missing_value_handling(est, func, support_sparse, strictly_positive):
    # check that the preprocessing method let pass nan
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
      rng.randint(X.shape[1], size=n_missing)] = np.nan
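Example #30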
import numpy as np
import pandas as pd

from sklearn.feature_selection import SelectFwe, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBRegressor
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was: -148.31589276097782
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(MaxAbsScaler(), RobustScaler(), ZeroCount(),
                      SelectFwe(score_func=f_regression, alpha=0.038)),
        FunctionTransformer(copy)),
    XGBRegressor(learning_rate=0.1,
                 max_depth=9,
                 min_child_weight=15,
                 n_estimators=100,
                 nthread=1,
                 subsample=1.0))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)