Example #1
class Transformer(object):
    """
    The purpose of this class is to take a dataframe and transform it into
    a numpy array compatible format.
    """

    def __init__(self, config):
        self.__config = config
        self.__mapper = None
        self.__label_encoder_adapter = TransformerAdapter(LabelEncoderMissingValuesTransformer())

    def prepare(self, dataframe):
        """
        Takes the already cleaned dataframe, splits it into train and test
        and returns the train and test as numpy arrays.
        If the problem is supervised, the target column will be the last one
        of the returned arrays.
        """
        mapping = DataFrameMapCreator().get_mapping_from_config(self.__config)
        self.__mapper = DataFrameMapper(mapping)
        train, test = split_dataframe_train_test(dataframe, self.__config.get_option_parameter("split", "train_percentage"))
        return self.__get_correct_return_parameters(train, test)

    def __get_correct_return_parameters(self, train, test):
        model = self.__config.get_data_model()

        train_transformed = self.__mapper.fit_transform(train)
        test_transformed = self.__mapper.transform(test)

        if model.has_target():
            return self.__add_target_data(train_transformed, train), \
                   self.__add_target_data(test_transformed, test)
        else:
            return train_transformed, test_transformed

    def __add_target_data(self, transformed_data, original_data):
        """
        Picks up the target data from the original_data and appends it as a
        column to the transformed_data.
        transformed_data is expected to be a numpy array; original_data is the original dataframe.
        """
        model = self.__config.get_data_model()
        target_feature = model.find_target_feature()
        name = target_feature.get_name()

        if target_feature.is_categorical():
            target_row = original_data[name]
            target = self.__label_encoder_adapter.transform(target_row)
        else:
            target = original_data[name].values.astype(type_name_to_data_type("float"))

        target = target[..., None]

        return np.hstack((transformed_data, target))

    def apply(self, dataframe):
        return self.__mapper.transform(dataframe)
Example #2
   def transform_features(self):
       totransform = []
       for index, item in enumerate(self.feat_head):
           field = item[0]
           func_name = item[1]
           transform = item[2]
           is_enable = item[3]

           if is_enable:
               if field not in self.stumble_data.get_features():
                   print('field not in features, generating: ' + field)
                   func_name(field)
               totransform.append((field, transform))

       if len(totransform):
           mapper = DataFrameMapper(totransform)
           mapper.fit(self.stumble_data.all_pd[:self.stumble_data.len_train])
           #
           X_transformed_train = mapper.transform(
               self.stumble_data.all_pd[:self.stumble_data.len_train])
           X_transformed_test = mapper.transform(
               self.stumble_data.all_pd[self.stumble_data.len_train:])

           for index, item in enumerate(self.feat_head):
               field = item[0]
               is_enable = item[3]
               if is_enable and field in self.stumble_data.get_features():
                   del self.stumble_data.all_pd[field]

           from scipy.sparse import hstack

           X_train = X_transformed_train
           X_test = X_transformed_test
           y_train = self.stumble_data.all_pd[:self.stumble_data.len_train]['label']
#            print 'Dumping train in SVMLight.'
           dump_svmlight_file(X_train, y_train, output_train_libsvm_file )

#            print 'Dumping test in SVMLight.'
#            dump_svmlight_file(X_test, pred, output_test_libsvm_file )

       else:
           X_train = X_train.as_matrix()
           X_test = X_test.as_matrix()


       return X_train, y_train, X_test
Example #3
def prepare_data(df_train, df_test, name):
    """ Define the input and output sets formatted for use with the neural network model
    # Arguments
        df_train: training set with all input variables, survival time and censoring status
        df_test: test set with all input variables, survival time and censoring status
        name: name of the model (CoxCC, CoxTime or DeepHit)
    # Returns
        x_train: input variables for the training set
        y_train: output variables for the training set
        x_test: input variables for the test set
        duration_test: survival time for the test set
        event_test: censoring indicator for the test set
        labtrans: output variables transformed for specific models (DeepHit and CoxTime)
    """
    col_list = list(df_train.columns)
    cols_standardize = [e for e in col_list if e not in ['yy', 'status']]
    standardize = [([col], StandardScaler()) for col in cols_standardize]
    x_mapper = DataFrameMapper(standardize)
    x_train = x_mapper.fit_transform(df_train).astype('float32')
    x_test = x_mapper.transform(df_test).astype('float32')
    get_target = lambda df: (df['yy'].values, df['status'].values)
    
    if name=="DeepHit" :
        num_durations = 10
        labtrans = DeepHitSingle.label_transform(num_durations)
        y_train = labtrans.fit_transform(*get_target(df_train))
    elif name=="CoxTime":
        labtrans = CoxTime.label_transform()
        y_train = labtrans.fit_transform(*get_target(df_train))
    else :
        labtrans = ""
        y_train = get_target(df_train)
    duration_test, event_test = get_target(df_test)
    
    return x_train, y_train, x_test, duration_test, event_test, labtrans
Example #4
def scale_vars(df, mapper):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n],StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper
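A minimal usage sketch for the scale_vars pattern above (hypothetical data; assumes pandas plus the snippet's imports, i.e. StandardScaler, DataFrameMapper and is_numeric_dtype, are in scope). The function standardizes the numeric columns of df in place and returns the fitted mapper, which can be passed back in to apply the same scaling to another frame.

import pandas as pd

train = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": ["x", "y", "z"]})
valid = pd.DataFrame({"a": [4.0, 5.0], "b": ["x", "y"]})

mapper = scale_vars(train, None)   # fits one StandardScaler per numeric column and scales train in place
scale_vars(valid, mapper)          # reuses the fitted mapper so valid gets the training-set scaling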
Example #5
def scale_vars(df, mapper):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n],StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper
def train_fn(args):
    print("loading data")
    train_df = pd.read_csv(args.train_data + "/train.csv", engine='python')
    test_df = pd.read_csv(args.test_data + "/test.csv", engine='python')

    TARGET = 'SeriousDlqin2yrs'
    X_train = train_df.drop(TARGET, axis=1)
    y_train = train_df[TARGET]
    X_test = test_df.drop(TARGET, axis=1)
    y_test = test_df[TARGET]

    print("Imputing missing values")
    transformer = DataFrameMapper([
        (['MonthlyIncome'], DFImputer(strategy="constant", fill_value=-1)),
        (['age'], DFImputer(strategy="median")),
        (['NumberOfDependents'], DFImputer(strategy="median")),
        (['DebtRatio'], DFImputer(strategy="median")),
        (['RevolvingUtilizationOfUnsecuredLines'
          ], DFImputer(strategy="median")),
        (['NumberRealEstateLoansOrLines'], DFImputer(strategy="median")),
        (['NumberOfOpenCreditLinesAndLoans'], DFImputer(strategy="median")),
        (['NumberOfTime30-59DaysPastDueNotWorse'
          ], DFImputer(strategy="median")),
        (['NumberOfTime60-89DaysPastDueNotWorse'
          ], DFImputer(strategy="median")),
        (['NumberOfTimes90DaysLate'], DFImputer(strategy="median")),
    ],
                                  input_df=True,
                                  df_out=True)
    transformer.fit(X_train)
    X_train = transformer.transform(X_train)
    X_test = transformer.transform(X_test)

    print("Building model...")
    model = RandomForestClassifier(n_estimators=50,
                                   max_depth=6,
                                   max_leaf_nodes=30)
    model.fit(X_train, y_train)
    explainer = shap.TreeExplainer(model)

    print("Saving artifacts...")
    model_dir = Path(args.model_dir)
    model_dir.mkdir(exist_ok=True, parents=True)

    joblib.dump(transformer, open(str(model_dir / "transformer.joblib"), "wb"))
    joblib.dump(model, open(str(model_dir / "model.joblib"), "wb"))
    joblib.dump(explainer, open(str(model_dir / "explainer.joblib"), "wb"))
Example #7
def scale_vars(df, mapper):
    # warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    # subtract the empirical mean 𝜇 from the data and divide by the standard deviation 𝛿
    if mapper is None:
        map_f = [([n], StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper
Example #8
    def execute(self,
                data,
                transforms=None,
                target=None,
                mapper=None,
                drop=None,
                default=None,
                **kwargs):
        try:

            if not data.empty:

                features, labels = data, None

                if mapper is None:

                    mapping = []
                    if transforms:
                        mapping = [(feature, transform_method(**kwargs))
                                   for feature, transform_method in transforms]

                    mapper = DataFrameMapper(mapping,
                                             df_out=True,
                                             default=default)

                    if target is not None:
                        features, labels = data.drop(target,
                                                     axis=1), data[target]

                    if labels is None:
                        features = mapper.fit_transform(features.copy())
                    else:
                        features = mapper.fit_transform(
                            features.copy(), labels.copy())
                else:
                    features = mapper.transform(features.copy())

                if drop:

                    features.drop(
                        [col for col in drop if col in features.columns],
                        axis=1,
                        inplace=True)

                data = features

                if labels is not None:
                    data[target] = labels

            else:
                raise AttributeError("No data provided")

        except Exception:
            print(traceback.format_exc())
            logging.error(traceback.format_exc())

        return data, mapper
Example #9
def test_exception_column_context_transform(simple_dataframe):
    """
    If an exception is raised when transforming a column,
    the exception includes the name of the column being transformed
    """
    class FailingTransformer(object):
        def fit(self, X):
            pass

        def transform(self, X):
            raise Exception('Some exception')

    df = simple_dataframe
    mapper = DataFrameMapper([('a', FailingTransformer())])
    mapper.fit(df)

    with pytest.raises(Exception, match='a: Some exception'):
        mapper.transform(df)
Example #11
def transform_cat_to_cont(df, cat_features, cont_features):
    feature_defs = []
    for col_name in cat_features:
        feature_defs.append((col_name, MyLabelBinarizer()))

    for col_name in cont_features:
        feature_defs.append((col_name, None))

    mapper = DataFrameMapper(feature_defs, input_df=True, df_out=True)
    mapper.fit(df)
    return mapper.transform(df)
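A small usage sketch with made-up data (assumes MyLabelBinarizer behaves like sklearn's LabelBinarizer and that the snippet's other imports are available). Categorical columns come back binarized while continuous columns pass through unchanged, returned together as one DataFrame because df_out=True.

import pandas as pd

df = pd.DataFrame({"color": ["red", "blue", "green"], "price": [1.0, 2.5, 3.0]})
out = transform_cat_to_cont(df, cat_features=["color"], cont_features=["price"])
print(out.columns.tolist())  # one indicator column per color value, plus the untouched price column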
Example #12
def scale_vars(df, mapper):
    """ Standardize numerical features by removing the mean and scaling to unit variance.
    """
    warnings.filterwarnings('ignore',
                            category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n], StandardScaler()) for n in df.columns
                 if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper
Example #13
def test_numerical_transformer_serialization(simple_dataset):
    """
    Test if you can serialize transformer
    """
    transformer = DataFrameMapper([('feat1', NumericalTransformer('log'))])

    df = simple_dataset
    transformer.fit(df)
    f = tempfile.NamedTemporaryFile(delete=True)
    joblib.dump(transformer, f.name)
    transformer2 = joblib.load(f.name)
    assert np.array_equal(transformer.transform(df), transformer2.transform(df))
    f.close()
    def pre_processing(self):
        self.__numeric_header = [i for i in self.__train_feature.columns if i not in self.__categorical_header]
        self.__train_categorical = self.__train_feature[self.__categorical_header]
        self.__train_numeric = self.__train_feature[self.__numeric_header]
        self.__test_categorical = self.__test_feature[self.__categorical_header]
        self.__test_numeric = self.__test_feature[self.__numeric_header]

        self.__train_categorical = self.__train_categorical.astype(str)
        self.__test_categorical = self.__test_categorical.astype(str)
        self.__train_categorical = self.__train_categorical.fillna("missing")
        self.__test_categorical = self.__test_categorical.fillna("missing")
        mapper = DataFrameMapper([(i, LabelEncoder()) for i in self.__train_categorical.columns])
        mapper.fit(self.__train_categorical)
        self.__train_categorical = pd.DataFrame(mapper.transform(self.__train_categorical), columns=self.__train_categorical.columns)
        self.__test_categorical = pd.DataFrame(mapper.transform(self.__test_categorical), columns=self.__test_categorical.columns)

        self.__train_numeric = self.__train_numeric.fillna(-999)
        self.__test_numeric = self.__test_numeric.fillna(-999)

        self.__train_feature = pd.concat([self.__train_numeric, self.__train_categorical], axis=1)
        self.__test_feature = pd.concat([self.__test_numeric, self.__test_categorical], axis=1)
        self.__train_feature = self.__train_feature.values
        self.__test_feature = self.__test_feature.values
Example #15
    def test_mapper(self):
        data = lib.load_titanic()

        transformation_list = [(['name'],
                                [EmbeddingVectorizer(max_sequence_length=12)])]

        mapper = DataFrameMapper(transformation_list, df_out=True)

        mapper.fit(data)

        data_transformed = mapper.transform(data)

        assert_array_equal([2, 3, 4, 5, 1, 1, 1, 1, 1, 1, 1, 1],
                           data_transformed.values[0, :])
class Preprocessor:
    mapper: DataFrameMapper

    def __init__(self):
        self.mapper = DataFrameMapper([(encoding_fields, [
            SimpleImputer(strategy="most_frequent"),
            preprocessing.OrdinalEncoder()
        ]), (scale_fields, preprocessing.StandardScaler())])

    def train(self, x: pd.DataFrame):
        self.mapper.fit(x)

    def transform(self, x: pd.DataFrame):
        return self.mapper.transform(x)
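A hedged usage sketch for the Preprocessor above; it assumes the code runs in the same module as the class and that encoding_fields and scale_fields are the module-level column lists the constructor expects (the names and data below are hypothetical).

import pandas as pd

encoding_fields = ["gender", "city"]
scale_fields = ["age", "income"]

frame = pd.DataFrame({"gender": ["m", "f", "m"], "city": ["a", "b", "a"],
                      "age": [30.0, 40.0, 50.0], "income": [1.0, 2.0, 3.0]})

pre = Preprocessor()
pre.train(frame)           # imputes and ordinal-encodes the categoricals, fits the scaler on the numerics
X = pre.transform(frame)   # numpy array: encoded categorical block followed by the scaled numeric block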
class imbalanceOversampleProcess:
    '''
    numericFeature: list of the numeric feature names
    OversampleParamDict: dict of {key: value} where key must be one of ['RandomSample', 'Smote',
                      'ADASYN', 'SMOTEENN', 'SMOTETomek'] and value is the corresponding parameter dict
    estimator: the model to be trained, with its parameters already initialized
    '''
    def __init__(self, numericFeature, OversampleParamDict, estimator):
        self.numericFeature = numericFeature
        self.OversampleParamDict = OversampleParamDict
        self.estimator = estimator
        self.dataTranformer = DataFrameMapper([(self.numericFeature,\
                        [ContinuousDomain(),SimpleImputer(strategy='mean'), StandardScaler()])])

    def _generateModel(self, key, paramDict):
        if key == 'RandomSample':
            self.model = RandomOverSampler(**paramDict)
        elif key == 'Smote':
            self.model = SMOTE(**paramDict)
        elif key == 'ADASYN':
            self.model = ADASYN(**paramDict)
        elif key == 'SMOTEENN':
            self.model = SMOTEENN(**paramDict)
        elif key == 'SMOTETomek':
            self.model = SMOTETomek(**paramDict)
        else:
            assert key in ['RandomSample', 'Smote', 'ADASYN',
                           'SMOTEENN', 'SMOTETomek'], \
                'key must be one of RandomSample, Smote, ADASYN, SMOTEENN or SMOTETomek!'

    def _fitSample(self, X, y):
        XTransform = self.dataTranformer.fit_transform(X)
        assert len(self.OversampleParamDict) == 1, 'Only a single model is supported: the dict must contain exactly one set of model parameters!'
        for key, value in self.OversampleParamDict.items():
            self._generateModel(key, value)
        X_train, y_train = self.model.fit_sample(XTransform, y)
        self.X_train_sample = pd.DataFrame(data=X_train,
                                           columns=self.numericFeature)
        self.y_train_sample = y_train

    def fit(self, X, y):
        self._fitSample(X, y)
        self.estimator.fit(self.X_train_sample, self.y_train_sample)

    def predict_proba(self, X):
        XTransformTest = self.dataTranformer.transform(X)
        X_test = pd.DataFrame(data=XTransformTest, columns=self.numericFeature)
        self.predictResult = self.estimator.predict_proba(X_test)
        return self.predictResult
Example #18
def run_pipeline(
    data, onehot_cols, ordinal_cols, batch_size, validate=True,
):
    X = data.drop(columns=["fraction_recovered"])
    y = data["fraction_recovered"]
    X_train, X_valid, y_train, y_valid = (
        train_test_split(X, y, test_size=0.2, random_state=0)
        if validate
        else (X, None, y, None)
    )

    transformer = DataFrameMapper(
        [
            (onehot_cols, OneHotEncoder(drop="if_binary")),
            (
                list(ordinal_cols.keys()),
                OrdinalEncoder(categories=list(ordinal_cols.values())),
            ),
        ],
        default=StandardScaler(),
    )

    X_train = transformer.fit_transform(X_train)
    X_valid = transformer.transform(X_valid) if validate else None

    input_nodes = X_train.shape[1]
    output_nodes = 1

    model = Sequential()
    model.add(Input((input_nodes,)))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.3, seed=0))
    model.add(Dense(32, activation="relu"))
    model.add(Dropout(0.3, seed=1))
    model.add(Dense(16, activation="relu"))
    model.add(Dropout(0.3, seed=2))
    model.add(Dense(output_nodes))
    model.compile(optimizer="adam", loss="mean_squared_error")

    history = model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=100,
        validation_data=(X_valid, y_valid) if validate else None,
        verbose=1,
    )

    return history.history, model, transformer
Example #19
def scale_vars(df: pd.DataFrame,
               mapper: DataFrameMapper = None) -> skp.DataFrameMapper:
    """
    Returns a mapper to scale variables.
    """
    warnings.filterwarnings('ignore',
                            category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        # apply standardscaler to columns
        map_f = [(
            [column], StandardScaler()
            ) for column in df.columns if is_numeric_dtype(df[column])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper
Example #20
def make_predictions(PATH_TO_DATA, *arg, **kwargs):

    try:
        df = pd.read_csv(PATH_TO_DATA, usecols=FEATURES)
    except InputError:
        print('Not valid data for predictions')

    df = feature_engineering(df)

    for n_model in MODEL_NAMES:
        mapper = DataFrameMapper(map_features(df.columns), df_out=True)
        preds = MODELS[n_model].predict(mapper.transform(df))
        df['Preds ' + n_model] = np.exp(preds)

    return df
Example #21
def normed_data(df_train, df_test):
    """ Define the structure of the neural network for a Cox-MLP (CC), CoxTime and  DeepHit
    # Arguments
        df_train: Training set of simulated data with 20 entry variables, survival status and survival time. 
        df_test: Test set of simulated data with 20 entry variables, survival status and survival time. 
    # Returns
        x_train: dataframe with the normalized explanatory variables.
        x_test: dataframe with the normalized explanatory variables.
    """
    col_list = list(df_train.columns)
    cols_standardize = [e for e in col_list if e not in ['yy', 'status']]
    standardize = [([col], StandardScaler()) for col in cols_standardize]
    x_mapper = DataFrameMapper(standardize)
    x_train = x_mapper.fit_transform(df_train).astype('float32')
    x_test = x_mapper.transform(df_test).astype('float32')
    return x_train, x_test
Example #22
def _scale(df, mapper=None):
    '''
    ===============     ====================================================================
    **Argument**        **Description**
    ---------------     --------------------------------------------------------------------
    df                  DataFrame to be scaled.
    mapper              Parameters used for scaling.
    ===============     ====================================================================

    :return: the fitted mapper (newly fitted when None was passed in)
    '''
    if mapper is None:
        map_f = [([n], StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper
Example #23
def transform(observations):
    """

     - Convert Sex to boolean male indicator
     - Create train / test split
     - Create SKLearn-Pandas mapper
     - Train SKLearn
     - Transform train and test data

    :param observations:
    :type observations: pandas.DataFrame
    :return: x_train, x_test, y_train, y_test, mapper
    """
    logging.info('Begin transform')

    # Convert Sex field into boolean male indicator
    observations['male'] = observations['Sex'] == 'male'
    logging.info('Converted Sex to binary class. Value counts: {}'.format(
        observations['male'].value_counts()))

    # Split into train / test split
    mask = numpy.random.rand(len(observations)) < 0.8
    observations_train = observations[mask]
    observations_test = observations[~mask]

    logging.info('Creating dataframe mapper')
    mapper = DataFrameMapper([(['Age'], [Imputer(),
                                         StandardScaler()]),
                              (['SibSp'], [Imputer(),
                                           StandardScaler()]),
                              (['Parch'], [Imputer(),
                                           StandardScaler()]),
                              (['male'], [Imputer(strategy='most_frequent')])])

    logging.info('Fitting and transforming training data set')
    x_train = mapper.fit_transform(observations_train)
    y_train = observations_train['Survived'].values

    logging.info('Transforming response data set')
    x_test = mapper.transform(observations_test)
    y_test = observations_test['Survived'].values

    # Archive & return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return x_train, x_test, y_train, y_test, mapper
Example #24
def test_fit_with_required_y_arg(complex_dataframe):
    """
    Transformers with a required y argument in the fit method
    are handled and perform correctly
    """
    df = complex_dataframe
    mapper = DataFrameMapper([(['feat1', 'feat2'], SelectKBest(chi2, k=1))])

    # fit, doesn't fail
    ft_arr = mapper.fit(df[['feat1', 'feat2']], df['target'])

    # fit_transform
    ft_arr = mapper.fit_transform(df[['feat1', 'feat2']], df['target'])
    assert_array_equal(ft_arr, df[['feat1']].values)

    # transform
    t_arr = mapper.transform(df[['feat1', 'feat2']])
    assert_array_equal(t_arr, df[['feat1']].values)
Example #26
def prepare_pseudobs_simu(df_train, y_train, df_test, name):
    """ Prepare the data for training
    The input data is formatted so that one line corresponds to one subject at a particular time point.
    # Arguments
        df_train: the entire dataset (input + survival times + event status)
        y_train: the pseudo-values computed according to the method chosen. 
        df_test: the entire dataset (input + survival times + event status)
    # Returns
        x_train_all: input data with all input variables + time variable and one line represents one subject at one time point.
        y_train_all: pseudo-values computed according to the method chosen. 
        x_test_all: input data with all input variables + time variable and one line represents one subject at one time point.
        y_test_all: survival time and event status.
        n_picktime: the number of time points at which the pseudo-observations are computed.
    """
    y_test_all = df_test[['yy','status']]
    n_picktime = int(y_train[['s']].apply(pd.Series.nunique))
    x_test = df_test.drop(['yy','status'], axis = 1)
    x_test_all = pd.concat([x_test]*n_picktime)
    time_test = pd.DataFrame(np.repeat(np.unique(y_train[['s']]),len(x_test)))
    x_test_all.reset_index(inplace=True, drop=True)
    x_test_all = pd.concat([x_test_all, time_test], axis = 1)

    if name!= "pseudo_discrete":
        x_train = df_train.drop(['yy','status'], axis = 1)
        x_train_all = pd.concat([x_train]*n_picktime)
        x_train_all.reset_index(inplace=True, drop=True)
        x_train_all = pd.concat([x_train_all, y_train[['s']]], axis = 1)
        y_train_all = y_train[['pseudost']]
    else:
        x_train = df_train.drop(['yy','status'], axis = 1)
        x_train['id'] = np.arange(len(x_train)) + 1
        x_train = x_train.merge(y_train, left_on='id', right_on='id')
        x_train_all = x_train.drop(['id','pseudost'], axis = 1)
        y_train_all = x_train['pseudost']
    # Data normalization
    col_list = list(x_train_all.columns)
    x_test_all.columns = col_list
    cols_standardize = [e for e in col_list]
    standardize = [([col], StandardScaler()) for col in cols_standardize]
    x_mapper = DataFrameMapper(standardize, df_out=True)
    x_train_all = x_mapper.fit_transform(x_train_all).astype('float32')
    x_test_all = x_mapper.transform(x_test_all).astype('float32')
    
    return(x_train_all, y_train_all, x_test_all, y_test_all, n_picktime)
Example #27
def data_simple_imputer(data_train,
                        numeric_feature,
                        category_feature,
                        numeric_strategy='mean',
                        category_strategy='most_frequent',
                        data_test=None):
    '''
    Simple missing-value imputation with DataFrameMapper: specify the categorical and numeric
    variables and the imputation strategy for each group.
    data_train: training set to transform
    numeric_feature: numeric variables to process
    category_feature: categorical variables to process
    numeric_strategy: imputation strategy for numeric variables, default is the mean
    category_strategy: imputation strategy for categorical variables, default is the mode
    data_test: test set to transform; optional, and skipped when not given

    return:
    X_train_imputed: imputed training data
    miss_transfer: the fitted DataFrameMapper
    X_test_imputed: imputed test data, only returned when test data is given
    '''
    print('Missing value imputation started'.center(50, '='))
    ## pull the feature lists out of the dict
    print('Number of categorical features:', len(category_feature))
    print('Number of numeric features:', len(numeric_feature))
    ## impute numeric and categorical columns with the chosen strategies
    miss_transfer = DataFrameMapper([
        (numeric_feature, [SimpleImputer(strategy=numeric_strategy)]),
        (category_feature, [SimpleImputer(strategy=category_strategy)])
    ])
    ## fit and transform
    X_train_imputed = miss_transfer.fit_transform(data_train[numeric_feature +
                                                             category_feature])
    X_train_imputed = pd.DataFrame(X_train_imputed,
                                   columns=numeric_feature + category_feature)
    print('train mapper done:', X_train_imputed.shape)
    ## if test data was given, transform it as well and return it
    if data_test is not None:
        X_test_imputed = miss_transfer.transform(data_test[numeric_feature +
                                                           category_feature])
        X_test_imputed = pd.DataFrame(X_test_imputed,
                                      columns=numeric_feature +
                                      category_feature)
        return X_train_imputed, miss_transfer, X_test_imputed
    return X_train_imputed, miss_transfer
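A minimal usage sketch for data_simple_imputer above (hypothetical column names and data; assumes pandas, numpy, SimpleImputer and DataFrameMapper are imported as in the snippet).

import numpy as np
import pandas as pd

train = pd.DataFrame({"age": [20.0, np.nan, 40.0], "city": ["a", np.nan, "a"]})
test = pd.DataFrame({"age": [np.nan], "city": [np.nan]})

X_train_imp, fitted_mapper, X_test_imp = data_simple_imputer(
    train, numeric_feature=["age"], category_feature=["city"], data_test=test)
# "age" in the test frame is filled with the training mean, "city" with the training mode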
    def readDataset(self):

        train_df = pd.read_csv(self.trainFile)
        test_df = pd.read_csv(self.testFile)

        #print(train_df.columns)
        #print(train_df.head())
        #print(test_df.columns)
        self.test_index = test_df.Id
        train_df = train_df.astype(float)
        test_df = test_df.astype(float)
        #print(train_df.iloc[0].values)
        mapper = DataFrameMapper([
            ([
                'Elevation', 'Aspect', 'Slope',
                'Horizontal_Distance_To_Hydrology',
                'Vertical_Distance_To_Hydrology',
                'Horizontal_Distance_To_Roadways', 'Hillshade_9am',
                'Hillshade_Noon', 'Hillshade_3pm',
                'Horizontal_Distance_To_Fire_Points'
            ], MinMaxScaler()),
            ([
                'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
                'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
                'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7',
                'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11',
                'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15',
                'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19',
                'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23',
                'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27',
                'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31',
                'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35',
                'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39',
                'Soil_Type40'
            ], None)
        ])

        self.X_train = mapper.fit_transform(train_df)
        # print(X_train[0:2,:])

        self.y_train = train_df.Cover_Type.values
        # print(y_train[0:10])

        self.X_test = mapper.transform(test_df)
Example #29
def test_input_df_true_next_transformers(simple_dataframe, monkeypatch):
    """
    If input_df is True, the subsequent transformers get passed pandas
    objects instead of numpy arrays (given the previous transformers
    output pandas objects as well)
    """
    df = simple_dataframe
    monkeypatch.setattr(MockTClassifier, 'fit', Mock())
    monkeypatch.setattr(MockTClassifier, 'transform',
                        Mock(return_value=pd.Series([1, 2, 3])))
    mapper = DataFrameMapper(
        [('a', [MockXTransformer(), MockTClassifier()])], input_df=True)
    mapper.fit(df)
    out = mapper.transform(df)

    args, _ = MockTClassifier().fit.call_args
    assert isinstance(args[0], pd.Series)

    assert_array_equal(out, np.array([1, 2, 3]).reshape(-1, 1))
Example #31
class scale_vars(TBStep):
    def __init__(self, features=None):
        warnings.filterwarnings(
            'ignore', category=sklearn.exceptions.DataConversionWarning)
        self.features = features

    def __repr__(self):
        return 'scale features'

    def fit(self, df):
        if self.features is None: self.features = df.columns
        self.features = [i for i in self.features if is_numeric_dtype(df[i])]
        map_f = [([n], StandardScaler()) for n in df[self.features].columns]
        self.mapper = DataFrameMapper(map_f).fit(
            df[self.features].dropna(axis=0))

    def transform(self, df):
        df = df.copy()
        df[self.mapper.transformed_names_] = self.mapper.transform(
            df[self.features])
        return df
def test_integration(none_value):

    df = pd.DataFrame({'cat': ['a', 'a', 'a', none_value, 'b'],
                       'num': [1, 2, 3, 4, 5]})

    mapper = DataFrameMapper([
        ('cat', CategoricalImputer()),
        ('num', None)
    ], df_out=True).fit(df)

    df_t = mapper.transform(df)

    assert pd.notnull(df_t).all().all()

    val_idx = pd.notnull(df['cat'])
    nan_idx = ~val_idx

    assert (df['num'] == df_t['num']).all()

    assert (df['cat'][val_idx] == df_t['cat'][val_idx]).all()
    assert (df_t['cat'][nan_idx] == df['cat'].mode().values[0]).all()
class MyMapper():
    def __init__(self):
        pass

    def fit(self, X, y=None):
        self.ncols = []
        self.scols = []
        #         print("mapping features")
        for col in X:
            if X[col].dtype == float:
                # print("numerical col: %s" % col)
                self.ncols.append([col])
            else:
                # print("categorical col: %s" % col)
                self.scols.append([col])
        nfeats = gen_features(columns=self.ncols,
                              classes=[{
                                  'class':
                                  sklearn.preprocessing.MinMaxScaler,
                              }])
        sfeats = gen_features(columns=self.scols,
                              classes=[{
                                  'class': LabelBinarizer2
                              }])
        self.mapper = DataFrameMapper(nfeats + sfeats, df_out=True)
        self.mapper.fit(X)
        #         print("features mapped")
        return self

    def transform(self, X, y=None):
        X = X.copy()
        X = self.mapper.transform(X)
        return X

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)
Example #34
        df_train = df_train.drop(df_test.index)
        df_val = df_train.sample(frac=0.2)
        df_train = df_train.drop(df_val.index)

    standardize = [([col], StandardScaler()) for col in cols_standardize]
    leave = [(col, None) for col in cols_leave]
    categorical = [(col, OrderedCategoricalLong()) for col in cols_categorical]

    x_mapper_float = DataFrameMapper(standardize + leave)
    x_mapper_long = DataFrameMapper(categorical)

    x_fit_transform = lambda df: tt.tuplefy(
        x_mapper_float.fit_transform(df).astype(np.float32),
        x_mapper_long.fit_transform(df))
    x_transform = lambda df: tt.tuplefy(
        x_mapper_float.transform(df).astype(np.float32),
        x_mapper_long.transform(df))

    x_train = x_fit_transform(df_train)
    x_val = x_transform(df_val)
    x_test = x_transform(df_test)
    num_embeddings = x_train[1].max(0) + 1
    embedding_dims = num_embeddings // 2

    get_target = lambda df: (df['duration'].values, df['event'].values)
    y_train = get_target(df_train)
    y_val = get_target(df_val)
    durations_test, events_test = get_target(df_test)
    val = x_val, y_val

    # Model preparation =============================================================
class Dataprocess(object):
    datafile = "data.csv"
    def __init__(self, datadir="/Users/shintaro/work/kaggle-kobe/data/"):
        self.datadir = datadir

    def read(self):
        self.df_orig = pd.read_csv(self.datadir + self.datafile)
        self.df = self.df_orig.copy()

    def process(self):
        self.read()
        self.preproc()
        self.set_mapper()
        self.split_df()
        train_X = self.vec_X(self.train_df)
        train_y = self.vec_y(self.train_df)
        test_X = self.mapper_X.transform(self.test_df)
        return train_X, train_y, test_X


    def preproc(self):
        self.df["time_remaining"] = self.df["minutes_remaining"] * 60 + self.df["seconds_remaining"]
        self.df['last_5_sec'] = self.df['time_remaining'] < 5
        self.df['latter_half'] = self.df['time_remaining'] < 360
        self.df['first_period'] = self.df['period'] == 1
        self.df['latter_period'] = self.df['period'] > 2
        self.df['last_period'] = self.df['period'] == 4
        self.df['last_quarter'] = self.df['time_remaining'] < 180

        threshold = 3
        anomaly = 14
        self.df['last_moment'] = self.df.apply(lambda row: row['time_remaining'] < threshold or row['time_remaining'] == anomaly, axis=1)
        self.df['away'] = self.df.matchup.str.contains('@')
        self.df['secondsFromStart'] = 60 * (11 - self.df['minutes_remaining']) + (60 - self.df['seconds_remaining'])
        self.df['secondsFromGameStart'] = (self.df['period'] <= 4).astype(int) * (self.df['period'] - 1) * 12 * 60 + (self.df['period'] > 4).astype(int) * ((self.df['period'] - 4) * 5 * 60 + 3 * 12 * 60) + self.df['secondsFromStart']
        numGaussians = 13
        gaussianMixtureModel = mixture.GMM(n_components=numGaussians, covariance_type='full', 
                                           params='wmc', init_params='wmc',
                                           random_state=1, n_init=3,  verbose=0)
        gaussianMixtureModel.fit(self.df.ix[:,['loc_x','loc_y']])
        self.df['shotLocationCluster'] = gaussianMixtureModel.predict(self.df.ix[:,['loc_x','loc_y']])
        self.df['homeGame'] = self.df['matchup'].apply(lambda x: 1 if (x.find('@') < 0) else 0)

        self.df["game_year"] = pd.Series([int(self.df["game_date"][i][:4]) for i in range(0, len(self.df))])
        self.df["game_month"] = pd.Series([int(self.df["game_date"][i][5:7]) for i in range(0, len(self.df))])
        self.df["game_day"] = pd.Series([int(self.df["game_date"][i][-2:]) for i in range(0, len(self.df))])

        action_type_list = list(set(self.df["action_type"].tolist()))
        self.df["action_type_num"] = pd.Series([action_type_list.index(self.df["action_type"][i]) for i in range(0, len(self.df))])

        combined_shot_type_list = list(set(self.df["combined_shot_type"].tolist()))
        self.df["combined_shot_type_num"] = pd.Series([combined_shot_type_list.index(self.df["combined_shot_type"][i]) for i in range(0, len(self.df))])

        opponent_list = list(set(self.df["opponent"].tolist()))
        self.df["opponent_num"] = pd.Series([opponent_list.index(self.df["opponent"][i]) for i in range(0, len(self.df))])

        game_id_list = list(set(self.df["game_id"].tolist()))
        self.df["game_id_num"] = pd.Series([game_id_list.index(self.df["game_id"][i]) for i in range(0, len(self.df))])

        season_list = list(set(self.df["season"].tolist()))
        season_list.sort()
        self.df["season_num"] = pd.Series([season_list.index(self.df["season"][i]) for i in range(0, len(self.df))])

        self.df["shot_distance"][self.df["shot_distance"] > 45] = 45

        # del self.df["team_id"], self.df["team_name"], self.df["game_event_id"], self.df["lat"], self.df["lon"]
        # return self.df


    def set_mapper(self):
        self.mapper_X = DataFrameMapper([
            (u'action_type', LabelBinarizer()),
            (u'combined_shot_type', LabelBinarizer()),
            (u'loc_x', None),
            (u'loc_y', None),
            (u'minutes_remaining', None),
            (u'period', LabelBinarizer()),

            (u'playoffs', LabelBinarizer()),
            (u'season', LabelBinarizer()),
            (u'seconds_remaining', None),
            (u'shot_distance', None),
            (u'shot_type', LabelBinarizer()),
            (u'shot_zone_area', LabelBinarizer()),
            (u'shot_zone_basic', LabelBinarizer()),
            (u'shot_zone_range', LabelBinarizer()),
            (u'matchup', LabelBinarizer()),
            (u'shot_id', None),

            (u'season_num', None),
            (u'game_year', None),
            (u'game_month', None),
            (u'game_day', None),

            (u'first_period', LabelBinarizer()),
            (u'latter_period', LabelBinarizer()),
            (u'last_period', LabelBinarizer()),
            (u'last_quarter', LabelBinarizer()),
            (u'time_remaining', None),
            (u'latter_half', LabelBinarizer()),
            (u'last_5_sec', LabelBinarizer()),
            (u'opponent_num', LabelBinarizer()),
            (u'game_id_num', LabelBinarizer()),

            (u'last_moment', LabelBinarizer()),
            (u'away', LabelBinarizer()),
            (u'secondsFromStart', None),
            (u'secondsFromGameStart', None),
            (u'shotLocationCluster', LabelBinarizer()),
            (u'homeGame', LabelBinarizer()),
            ])
        self.mapper_y = DataFrameMapper([(u'shot_made_flag', None),])
        self.mapper_X.fit(self.df)
        self.mapper_y.fit(self.df)


    def split_df(self):
        self.train_df = self.df[~np.isnan(self.df["shot_made_flag"])]
        self.test_df = self.df[np.isnan(self.df["shot_made_flag"])]


    def vec_X(self, df):
        return self.mapper_X.transform(df.copy())


    def vec_y(self, df):
        return self.mapper_y.transform(df.copy())
def create_LabelBinarized_files():
    """ Apply LabelBinarizing to the data to create:
        A file with test coupon information (and LabelBinarizing of categorical variables)
        A file which aggregates coupon_detail and user information (and LabelBinarizing of categorical variables)
        A file which aggregates coupon_visit and user information (and LabelBinarizing of categorical variables)
        These files will be used in the similarity_distance.py script
    """

    print "Create Label Binarized files"

    def get_unix_time(row):
        """Convert to unix time. Neglect time of the day
        """
        row = row.split(" ")
        row = row[0].split("-")
        y,m,d = int(row[0]), int(row[1]), int(row[2])
        return calendar.timegm(date(y,m,d).timetuple())

    #read in all the input data
    cpdtr = pd.read_csv("../Data/Data_translated/coupon_detail_train_translated.csv")
    cpltr = pd.read_csv("../Data/Data_translated/coupon_list_train_translated.csv")
    cplte = pd.read_csv("../Data/Data_translated/coupon_list_test_translated.csv")
    ulist = pd.read_csv("../Data/Data_translated/user_list_translated.csv")
    ulist["REG_DATE_UNIX"] = ulist["REG_DATE"].apply(get_unix_time)

    # List of unbinarized features
    list_col_unbin = ["COUPON_ID_hash","USER_ID_hash", "GENRE_NAME", "large_area_name", "small_area_name", "PRICE_RATE", "CATALOG_PRICE", "DISCOUNT_PRICE",
       "DISPFROM", "DISPEND", "DISPPERIOD", "VALIDFROM", "VALIDEND",
       "VALIDPERIOD", "USABLE_DATE_MON", "USABLE_DATE_TUE",
       "USABLE_DATE_WED", "USABLE_DATE_THU", "USABLE_DATE_FRI",
       "USABLE_DATE_SAT", "USABLE_DATE_SUN", "USABLE_DATE_HOLIDAY",
       "USABLE_DATE_BEFORE_HOLIDAY", "ITEM_COUNT", "AGE", "SEX_ID", "REG_DATE_UNIX"]

    #making of the train set
    train = pd.merge(cpdtr, cpltr)
    train = pd.merge(train, ulist, left_on = "USER_ID_hash", right_on = "USER_ID_hash")
    train = train[list_col_unbin]

    # Format the test set as the train set
    cplte["USER_ID_hash"] = np.array(["dummyuser"]*len(cplte))
    for col in ["ITEM_COUNT", "AGE", "SEX_ID", "REG_DATE_UNIX"] :
        cplte[col] = 0
    #Then combine test and train
    cpchar = cplte[list_col_unbin]
    train = pd.concat([train, cpchar])

    # Binarize features now
    list_to_binarize = ["GENRE_NAME", "large_area_name", "small_area_name"]
    # After binarization, we obtain more features. We store the names of those features in d_bin
    d_bin = {}
    for feat in list_to_binarize:
        if feat == "GENRE_NAME" :
            cardinal = sorted(set(train[feat].values))
            d_bin["GENRE_NAME"] = [feat + "_" + str(i) for i in cardinal]
        if feat == "large_area_name" :
            cardinal = sorted(set(train[feat].values))
            d_bin["large_area_name"] = [feat + "_" + str(i) for i in cardinal]
        if feat == "small_area_name" :
            cardinal = sorted(set(train[feat].values))
            d_bin["small_area_name"] = [feat + "_" + str(i) for i in cardinal]

    # Use a sklearn_pandas mapper for binarization
    list_mapper      = []
    # Store binarized col names in a new list
    list_col_bin = []
    for feat in list_col_unbin :
        if feat in list_to_binarize :
            list_col_bin += d_bin[feat]
            list_mapper.append((feat, preprocessing.LabelBinarizer()))
        else :
            list_col_bin.append(feat)
            list_mapper.append((feat, None))
    mapper = DataFrameMapper(list_mapper)

    # Fit binarizer on the full matrix and save
    train = mapper.fit_transform(train) 
    # Incorporate binarized feature in train
    train = pd.DataFrame(train, index = None, columns = list_col_bin )

    #separate the test from train
    test = train[train["USER_ID_hash"]=="dummyuser"]
    train = train[train["USER_ID_hash"] !="dummyuser"]

    #Save the test data
    test.to_csv("../Data/Data_translated/coupon_list_test_LB_translated.csv", index = False)
    #Free memory
    del test

    #Save the train data
    train.to_csv("../Data/Data_translated/coupon_train_aggregated_LB_translated.csv", index = False)
    #Free memory
    del train

    #Load visit data frame in chunks because it is too large 
    for index, cpvtr in enumerate(pd.read_csv("../Data/Data_translated/coupon_visit_train_translated.csv", chunksize=100000)) :
        sys.stdout.write("\rProcessing row " + str(index*100000)+" to row "+str((index+1)*100000))
        sys.stdout.flush()
        cpvtr = cpvtr[cpvtr["PURCHASE_FLG"]!=1][["VIEW_COUPON_ID_hash","USER_ID_hash"]]
        trainv = pd.merge(cpvtr, cpltr, left_on = "VIEW_COUPON_ID_hash", right_on = "COUPON_ID_hash")
        trainv = pd.merge(trainv, ulist, left_on = "USER_ID_hash", right_on = "USER_ID_hash")
        trainv["ITEM_COUNT"] = 0
        trainv = trainv[list_col_unbin]

        #Binarize
        trainv = mapper.transform(trainv) 
        trainv = pd.DataFrame(trainv, index = None, columns = list_col_bin )

        # Add trainv to trainvisit
        if index == 0:
            with open("../Data/Data_translated/coupon_trainv_aggregated_LB_translated.csv", "w") as f :
                trainv.to_csv(f, index = False) 
        else : 
            with open("../Data/Data_translated/coupon_trainv_aggregated_LB_translated.csv", "a") as f :
                trainv.to_csv(f, index = False, header=False) 

    print()
Test = Test.groupby(['borough', 'month'])['complaints'].mean().reset_index()

X_train = Train[['borough', 'month']]
y_train = Train['complaints']

X_test = Test[['borough', 'month']]
y_test = Test['complaints']

# Features
# convert the categorical variable to binary variables
mapper = DataFrameMapper([('month', None), ('borough', LabelBinarizer())],
                         df_out=True)

# preprocessing features data sets
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

model = LinearRegression(normalize=True)
model.fit(Z_train, y_train)
model.score(Z_train, y_train)

y_pred = model.predict(Z_test)

RSS = ((y_test - y_pred)**2).sum()
TSS = ((y_train.mean() - y_test)**2).sum()

R2 = 1.0 - RSS / TSS

print("Model performance R^2 = {}".format(R2))

print("Baseline model prediction {}".format(y_train.mean()))
class trainingData(object):
    def __init__(self, data):
        self.__df = data
        self.__col_list = self.__df.columns.values.tolist()
        self.__del_col_list=[]
        self.__cat_col_list=[]
        self.__final_col_list = []


    def plot_corr(self):
        corMat = DataFrame(self.__df.corr())
        plot.pcolor(corMat)
        plot.show()

    def getColNames(self):
        return self.__col_list

    def preprocess(self, predictors, target, unique=3):
        """
        :param predictors: list of column names to use as predictors
        :param target: variable to be predicted
        :param unique: tolerance for the number of unique values in categorical columns. \
        If the unique count in a categorical column is greater than this count, that column is dropped
        :return: none
        """
        assert str(target)
        for name in self.__col_list:
            if name == target:
                self.__temp = name

            if self.__df[name].dtype == 'O':
                if len(self.__df[name].unique()) > unique:
                    self.__del_col_list.append(self.__col_list.index(name))
                else:
                    self.__cat_col_list.append(name)

            if self.__df[name].dtype == 'int64':
                self.__df[name] = self.__df[name].astype(float)

        #templist = []
        for value in self.__col_list:
            if value not in predictors and self.__col_list.index(value) not in self.__del_col_list and value != target:
                self.__del_col_list.append(self.__col_list.index(value))


        # drop unwanted columns
        self.__df.drop(self.__df.columns[self.__del_col_list],axis=1,inplace=True)


        #drop null values
        self.__df.dropna(axis=1,how='any',inplace = True)

        #prepare target df
        self.__target_df= self.__df[self.__temp]
        self.__df.drop(self.__temp,axis = 1, inplace = True)



        #train test split
        self.trainX ,self.testX, self.trainY, self.testY = sklearn.cross_validation.train_test_split(self.__df,self.__target_df,test_size=0.30)

        #get final column list for mappers
        self.__final_col_list = self.__df.columns.values.tolist()
        self.__num_col_list = [item for item in self.__final_col_list if item not in self.__cat_col_list]
        #print self.num_col_list

        self.mapfunc = []
        for name in self.__final_col_list:
            if self.__df[name].dtype == "O":
                self.mapfunc.append(([name],sklearn.preprocessing.LabelBinarizer()))
            else:
                self.mapfunc.append(([name], sklearn.preprocessing.StandardScaler(copy=False)))

        #io mappers
        self.in_mapper = DataFrameMapper(self.mapfunc)
        self.out_mapper = sklearn.preprocessing.RobustScaler(with_centering=False,copy=False)


        self.trainX = np.array(self.in_mapper.fit_transform(self.trainX),np.float32)
        self.trainY = np.array(self.out_mapper.fit_transform(self.trainY.reshape(-1,1)),np.float32)

        self.testX = np.array(self.in_mapper.transform(self.testX),np.float32)
        self.testY = np.array(self.out_mapper.transform(self.testY.reshape(-1,1)),np.float32)

        self.tindex = self.trainX.shape[0]


    def expt(self,name):
        """Export train or test Files...for debugging purposes """
        if name == "trainX":
            __df = pd.DataFrame(self.trainX)
            __df.to_csv("trainX.csv")
        elif name == "trainY":
            __df = pd.DataFrame(self.trainY)
            __df.to_csv("trainY.csv")
        elif name == "testX":
            __df = pd.DataFrame(self.testX)
            __df.to_csv("testX.csv")
        elif name == "testY":
            __df = pd.DataFrame(self.testX)
            __df.to_csv("testX.csv")
        else:
            raise ValueError
Example #39
x=train[feasible_columns].drop(columns_to_drop, axis=1).fillna(0)
x_test=test[feasible_columns].drop(columns_to_drop, axis=1).fillna(0)
x_result=predict[feasible_columns].drop(columns_to_drop, axis=1).fillna(0)
x_result=x_result.loc[[i in [46,58,70,82,94,106,118,130,142] for i in x_result.time_id],:]
x_final=x_result.reset_index(drop=True)

a=x.columns
mapper=[]
for j in a:
	if j in ['district_id', 'Day', 'Weekday', 'Workday', 'Yesterday_Workday','Twoday_ago_Workday', 'time_id']:
		mapper.append((j,None))
	else:
		mapper.append((j,StandardScaler()))
b=DataFrameMapper(mapper)
b.fit(pd.concat([x, x_test, x_result]))
x=b.transform(x)
x_test=b.transform(x_test)
x_result_before = x_result
x_result=b.transform(x_result)

#Random Forest
clf = ensemble.RandomForestClassifier(n_estimators=20,max_features=min(len(feasible_columns) - len(columns_to_drop), 25))
clf.fit(x,y)
clf_predict=clf.predict(x_test)
clf_score=clf.score(x_test, y_test)


clf_predict.fill(1)

diff=clf_predict-y_test
MAPE=sum(abs(diff[y_test!=0]/y_test[y_test!=0])/len(y_test))
    def __load_coupons(self, validation_timedelta):
        train_coupon_df = pd.read_csv(path.join(self.datadir, "coupon_list_train.csv"),
                                           parse_dates=["DISPFROM","DISPEND"])
        test_coupon_df = pd.read_csv(path.join(self.datadir, "coupon_list_test.csv"))

        train_coupon_df["DISPFROM"].fillna(pd.Timestamp("19000101"), inplace=True)
        train_coupon_df = train_coupon_df.sort(columns=["DISPFROM"]).reset_index(drop=True)

        if validation_timedelta:
            max_date = train_coupon_df["DISPFROM"].max()
            valid_start = max_date - validation_timedelta
            valid_coupon_df = train_coupon_df[(train_coupon_df["DISPFROM"] > valid_start)]
            train_coupon_df = train_coupon_df[~ (train_coupon_df["DISPFROM"] > valid_start)]
        else:
            valid_coupon_df = train_coupon_df[np.zeros(len(train_coupon_df), dtype=np.bool)].copy()

        # remove outlier data from the validation-set
        if len(valid_coupon_df) > 0:
            very_low_price = valid_coupon_df[valid_coupon_df.DISCOUNT_PRICE <= 100].COUPON_ID_hash
            very_long_time_display = valid_coupon_df[valid_coupon_df.DISPPERIOD > 20].COUPON_ID_hash
            valid_coupon_df = valid_coupon_df[~valid_coupon_df.COUPON_ID_hash.isin(very_long_time_display)]
            valid_coupon_df = valid_coupon_df[~valid_coupon_df.COUPON_ID_hash.isin(very_low_price)].reset_index(drop=True)

        # remove outlier data from the training-set
        very_long_time_display = train_coupon_df[train_coupon_df.DISPPERIOD > 20].COUPON_ID_hash
        train_coupon_df = train_coupon_df[~train_coupon_df.COUPON_ID_hash.isin(very_long_time_display)].reset_index(drop=True)

        # coupon features
        coupon_mapper = DataFrameMapper([
                ('CATEGORY_NAME', LabelBinarizer()),
                ('PRICE_RATE', None),
                ('CATALOG_PRICE_LOG', None),
                ('DISCOUNT_PRICE_LOG', None),
                ('REDUCE_PRICE_LOG', None),
                ('DISPPERIOD_C', LabelBinarizer()),
                ('VALIDPERIOD_NA', LabelBinarizer()),
                ('USABLE_DATE_SUM', None),
                ('LARGE_AREA_NAME', LabelBinarizer()),
                ('PREF_NAME', LabelBinarizer()),
                ('SMALL_AREA_NAME', LabelBinarizer()),
                ])
        config = {}
        self.__coupon_preproc(train_coupon_df)
        self.__coupon_preproc(valid_coupon_df)
        self.__coupon_preproc(test_coupon_df)
        
        coupon_mapper.fit(pd.concat([train_coupon_df, valid_coupon_df, test_coupon_df]))
        
        train_coupon_vec = coupon_mapper.transform(train_coupon_df.copy())
        if len(valid_coupon_df) > 0:
            valid_coupon_vec = coupon_mapper.transform(valid_coupon_df.copy())
        else:
            valid_coupon_vec = np.array([])
        test_coupon_vec = coupon_mapper.transform(test_coupon_df.copy())

        self.train_coupon_vec = train_coupon_vec
        self.valid_coupon_vec = valid_coupon_vec
        self.test_coupon_vec = test_coupon_vec
        self.train_coupon_df = train_coupon_df
        self.valid_coupon_df = valid_coupon_df
        self.test_coupon_df = test_coupon_df
Example #41
#Types of features
binary_features = ['mouseovers', 'viewable']
cat_features = ['placement_id','browser_id','os_id','region','country','campaign','creative_asset_id']
numeric_features = ['hour', 'max_duration', 'video_length']

#Preprocess accordingly
mapper = DataFrameMapper([(binary_features, None),
	(cat_features, OneHotEncoder(handle_unknown='ignore')),
	(numeric_features, MaxAbsScaler())], sparse=True, )

#Fit to training data only
X_train = np.round(mapper.fit_transform(df_train.copy()), 2)

#Use same mapper to transform test data
X_test = np.round(mapper.transform(df_test.copy()), 2)

#Begin cross validation
cv = cross_validation.StratifiedKFold(y_train, 10)
parameters = {
    'alpha': [1e-6, 1e-5, 1e-4, 1e-3]
}
grid_search = GridSearchCV(SGDClassifier(loss='log', penalty='l1', n_iter=10, shuffle=True), parameters, cv=cv, verbose=True, scoring='f1')
grid_search.fit(X_train, y_train)

clf = grid_search.best_estimator_
print "Best parameters set:"
best_parameters = clf.get_params()
for param_name in sorted(parameters.keys()):
    print "\t%s: %r" % (param_name, best_parameters[param_name])
Example #42
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.33, random_state=42)

#Preprocess X_train
feature_def = gen_features(
     columns=[[c] for c in X_train.columns[:7]],
     classes=[MinMaxScaler]
 )

feature_def += ((pos_col, [LabelBinarizer()]),)

svc_preprocessor = DataFrameMapper(feature_def)
X_train = svc_preprocessor.fit_transform(X_train)
svc_preprocessor_fn = os.path.join('../model/tmp/svc_preprocessor.%s.pkl' % (nrows,))
joblib.dump(svc_preprocessor, open(svc_preprocessor_fn, 'wb'))
X_test = svc_preprocessor.transform(X_test)
#####

#Didn't help!!
#X_train, y_train = downsample_negatives(X_train, y_train)

for cv in [1,10,20]:
    print "Training, sample_count: %s\tcv:%s" % (nrows, cv)
    clf = svm.SVC(kernel='linear', class_weight={1:cv})
    clf.fit(X_train, y_train)  
    y_pred = clf.predict(X_test)
    measures = metrics.precision_recall_fscore_support(y_test, y_pred, average='binary')
    model_file_name = os.path.join('../model/tmp/svc_mentions_unbalanced.%s.%s.pkl' % (nrows,cv))
    joblib.dump(clf, open(model_file_name, 'wb'))
    print "measures: ", measures
    sys.stdout.flush()
Example #43
def create_mapper(df, cat_vars=list(), cont_vars=list(), date_vars=list(), no_transform_vars=list(), response_vars=list()):
    logging.info('Creating mapper')

    # TODO Add support for datetime variables

    # Reference variables
    transformation_list = list()

    # Copy df, to avoid 'inplace' transformation
    df = df.copy(deep=True)

    # TODO Check if any df variables are not listed in cat_vars or cont_vars. If so, remove them.

    # Check if any df variables are listed in cat_vars and cont_vars. If so, raise an error.
    intersection = [x for x in cont_vars if x in cat_vars]
    if len(intersection) > 0:
        raise AssertionError('Columns appear in both cat_vars and cont_vars: {}'.format(intersection))

    # Convert continuous variables to float32
    for cont_var in cont_vars + response_vars:
        logging.debug('Converting cont_var data type: {}'.format(cont_var))
        df[cont_var] = df[cont_var].astype(numpy.float32)

    for date_var in date_vars:
        logging.info('Enriching for datetime var: {}'.format(date_var))
        df, date_cat_vars, date_cont_vars = add_datetime_vars(df, date_var)
        cat_vars.extend(date_cat_vars)
        cont_vars.extend(date_cont_vars)

    # Add continuous variable transformations for cont_vars
    for cont_var in cont_vars + response_vars:
        logging.debug('Creating transformation list for cont_var: {}'.format(cont_var))
        transformations = [Imputer(strategy='mean'), StandardScaler()]
        var_tuple = ([cont_var], transformations)
        transformation_list.append(var_tuple)

    # Add categorical variable transformations for cat_vars
    for cat_var in cat_vars:
        logging.debug('Creating transformation list for cat_var: {}'.format(cat_var))
        # TODO Replace LabelEncoder with CategoricalEncoder, to better handle unseen cases
        transformations = [LabelEncoder()]
        var_tuple = (cat_var, transformations)
        transformation_list.append(var_tuple)

    for no_transform_var in no_transform_vars:
        logging.debug('Creating transformation list for cont_var: {}'.format(no_transform_var))
        transformations = [Imputer(strategy='most_frequent')]
        var_tuple = ([no_transform_var], transformations)
        transformation_list.append(var_tuple)

    # Create mapper
    logging.info('Creating mapper')
    mapper = DataFrameMapper(features=transformation_list, df_out=True)

    # Train mapper
    logging.info('Training newly created mapper')
    mapper.fit(df)

    # Throw away transformation, to set up mapper
    logging.info('Transforming data set with newly created mapper, to initialize mapper internals')
    mapper.transform(df.sample(1000))

    return mapper
Example #44
import pandas as pd
import numpy as np
import seaborn as sn
import sklearn
from sklearn_pandas import DataFrameMapper, cross_val_score
import re

df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')
feats = [key for key in df_train.keys() if re.match('.*feat.*', key)]
mapper = DataFrameMapper([(feats,sklearn.preprocessing.StandardScaler())])
data_train_scaled = mapper.fit_transform(df_train)
data_test_scaled = mapper.transform(df_test)
data_test = df_test[feats]
data_train = df_train[feats]