def preprocessing(data, target):
    global statistics_row
    print('Start preprocessing')
    data = data.drop('imports', axis='columns')
    target = target.drop('imports', axis='columns')
    labels = data['BUGFIX_count']>0
    test_labels = target['BUGFIX_count']>0
    test_loc = target['SM_file_lloc']

    print('Start Random Forest')
    predictions = random_forest(data.iloc[:, :data.columns.get_loc('BUGFIX_count')], labels,
                                target.iloc[:, :target.columns.get_loc('BUGFIX_count')])
    lb_infinite = False
    try:
        #if no defects are predicted, lower_bound raises a division-by-zero error
        lb = lower_bound(predictions, test_loc, test_labels)
    except Exception as e:
        print(e)
        #lower bound infinite
        statistics_row.append(-1)
        lb_infinite = True
        print('RF-b:', 'lb infinite')

    if not lb_infinite:
        try:
            ub = upper_bound(predictions, test_loc, test_labels)
            #scale the difference (ub - lb), preserving its sign
            if (ub - lb) < 0:
                statistics_row.append(-(abs(ub - lb) / (abs(ub - lb) + 1000)))
            else:
                statistics_row.append((ub - lb) / ((ub - lb) + 1000))
            print(lb, '< C <', ub)

        except Exception as e:
            print(e)
            #upper bound infinite
            statistics_row.append(1)
            print('RF-b:','ub infinite')

    #selecting features with random forest
    features = select_features_rf(data.iloc[:, :data.columns.get_loc('BUGFIX_count')], labels)
    #applying features to data
    test_data = target[features]
    data = data[features]
    columns = data.columns
    #sigmoid used to squash the selected features into (0, 1); the
    #1 / (1 + exp(-x)) form is equivalent to e**x / (e**x + 1) but avoids the
    #overflow/NaN the latter produces for large positive x (a try/except around
    #the constructor could never catch that warning anyway, since it would be
    #emitted during transform, not construction)
    scaler = FunctionTransformer(func=lambda x: 1 / (1 + np.exp(-x)), validate=True)
    #applying sigmoid function on training and test data
    data = pd.DataFrame(scaler.fit_transform(data), columns=columns).fillna(0)
    test_data = pd.DataFrame(scaler.transform(test_data), columns=columns).fillna(0)

    labels = pd.DataFrame(labels).reset_index(drop=True)

    data = pd.concat([data, labels], axis=1)
    data, labels = knn_data_selection(data, test_data)

    return data, labels, test_data, test_labels, test_loc
def test_function_transformer_feature_names_out_is_None():
    transformer = FunctionTransformer()
    X = np.random.rand(100, 2)
    transformer.fit_transform(X)

    msg = "This 'FunctionTransformer' has no attribute 'get_feature_names_out'"
    with pytest.raises(AttributeError, match=msg):
        transformer.get_feature_names_out()
def test_function_transformer_feature_names_out_string(feature_names_out):
    transformer = FunctionTransformer(feature_names_out=feature_names_out)
    X = np.random.rand(100, 2)
    transformer.fit_transform(X)

    msg = """must either be "one-to-one" or a callable"""
    with pytest.raises(ValueError, match=msg):
        transformer.get_feature_names_out()
def test_function_transformer_future_warning(validate, expected_warning):
    # FIXME: to be removed in 0.22
    X = np.random.randn(100, 10)
    transformer = FunctionTransformer(validate=validate)
    with pytest.warns(expected_warning) as results:
        transformer.fit_transform(X)
    if expected_warning is None:
        assert len(results) == 0
def test_function_transformer_get_feature_names_out_without_validation():
    transformer = FunctionTransformer(feature_names_out="one-to-one", validate=False)
    X = np.random.rand(100, 2)
    transformer.fit_transform(X)

    msg = "When 'feature_names_out' is 'one-to-one', either"
    with pytest.raises(ValueError, match=msg):
        transformer.get_feature_names_out()

    names = transformer.get_feature_names_out(("a", "b"))
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ("a", "b"))
def test_function_transformer_get_feature_names_out(
    X, feature_names_out, input_features, expected
):
    if isinstance(X, dict):
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X)

    transformer = FunctionTransformer(
        feature_names_out=feature_names_out, validate=True
    )
    transformer.fit_transform(X)
    names = transformer.get_feature_names_out(input_features)
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, expected)
def logarithmic_regression(input_data, cement, water, coarse_aggr, fine_aggr,
                           days):

    variables = input_data.iloc[:, :-1]
    results = input_data.iloc[:, -1]

    n = results.shape[0]
    results = results.values.reshape(
        n, 1
    )  #reshaping the values so that variables and results have the same shape

    #transforming x data with a logarithmic function
    log_regression = FunctionTransformer(np.log, validate=True)
    log_variables = log_regression.fit_transform(variables)

    #making linear model and fitting the logarithmic data into linear model
    regression = linear_model.LinearRegression()
    model = regression.fit(log_variables, results)

    input_values = [cement, water, coarse_aggr, fine_aggr, days]

    #transforming input data for prediction in logarithmic function
    input_values = log_regression.transform([input_values])

    #predicting the outcome based on the input_values
    predicted_strength = regression.predict(
        input_values)  #adding values for prediction
    predicted_strength = round(predicted_strength[0, 0], 2)

    return "Logarithmic prediction: " + str(predicted_strength)
    def transform(self, X):
        X_transformed = self.kbd_fitted.transform(X).astype(int)

        pre = FunctionTransformer(_unpack_bits,
                                  validate=False,
                                  kw_args={'nbits': self.max_bits_per_feature})
        return pre.fit_transform(X_transformed)
def gen_statistics():

    # Read weather data (used to find list of airports)
    weather_df = pd.read_csv('../data/airport_per_date/weather.csv')
    weather_df['DateOfDeparture'] = pd.to_datetime(weather_df['Date'])
    weather_df.drop(
        ['Date', 'Events', 'Max Gust SpeedKm/h', 'Precipitationmm'],
        axis=1,
        inplace=True)

    # Initialise list of airports and date range for period of interest and encode dates for easier merge
    airport_list = weather_df['AirPort'].unique()
    airport = pd.DataFrame(airport_list, columns=['AirPort'])
    date_list = pd.date_range(start='01/01/2011', end='05/03/2013')
    date_airports = pd.DataFrame(list(product(date_list, airport_list)),
                                 columns=['DateOfDeparture', 'AirPort'])

    # Merge weather information
    merge_transform = MergeTransformer(X_ext=weather_df,
                                       how='left',
                                       on=['DateOfDeparture', 'AirPort'])
    date_airports = merge_transform.fit_transform(date_airports)

    date_encoder = FunctionTransformer(_encode_dates)
    date_airports = date_encoder.fit_transform(date_airports)

    # Merge LoadFactor and Passenger statistics per airport and date
    airport_statistics = pd.read_csv(
        '../data/airport_per_date/airports_statistics.csv', sep=',')
    merge_transform = MergeTransformer(X_ext=airport_statistics,
                                       how='left',
                                       on=['year', 'month', 'AirPort'])
    date_airports = merge_transform.fit_transform(date_airports)

    websearches = pd.read_csv('../data/airport_per_date/websearches.csv',
                              sep=';',
                              index_col='DateOfDeparture')
    websearches = websearches.stack()
    websearches = pd.DataFrame(websearches).reset_index()
    websearches.rename({
        'level_1': 'AirPort',
        0: 'search_intensity'
    },
                       axis=1,
                       inplace=True)
    websearches['DateOfDeparture'] = pd.to_datetime(
        websearches['DateOfDeparture'], format='%d/%m/%Y')

    merge_transform = MergeTransformer(X_ext=websearches,
                                       how='left',
                                       on=['DateOfDeparture', 'AirPort'])
    date_airports = merge_transform.fit_transform(date_airports)

    date_airports.drop(
        ['year', 'month', 'day', 'weekday', 'week', 'n_days', 'day_nb'],
        axis=1,
        inplace=True)

    return date_airports
Example #11
def labelize(column, t):

    if t == "string":
        raise ValueError("String valued labels are not supported")
    elif t == "categorical":

        label_map = {}
        for i, k in enumerate(set(column)):
            label_map[k] = i

        vectorizer = FunctionTransformer(lambda x: label_map[x[0, 0]])

        return vectorizer.fit_transform(column), vectorizer

    else:

        vectorizer = FunctionTransformer(tryParse)

        return vectorizer.fit_transform(column), vectorizer
Example #12
def rescale_cont_vars(df, log_transform=True, dep_var=None):
    cont_vars, _ = cont_cat_split(df, dep_var=dep_var)
    if log_transform:
        log_transformer = FunctionTransformer(func=np.log1p,
                                              inverse_func=np.expm1,
                                              validate=False)
        df[cont_vars] = log_transformer.fit_transform(df[cont_vars])
    scaler = MinMaxScaler()
    df[cont_vars] = scaler.fit_transform(df[cont_vars])
    return df
def test_wrapper_func_transformer(test_func):
    """Testing if WrapperFunctionTransformer still has functionality of an underlying FunctionTransformer."""
    test_arr = np.array([1, 1, 1, 2, 3, 4, 5]).reshape(-1, 1)

    tr = FunctionTransformer(func=test_func)
    wrap_tr = WrapperFunctionTransformer("test", clone(tr))

    expected_arr = tr.fit_transform(test_arr)
    actual_arr = wrap_tr.fit_transform(test_arr)

    assert np.array_equal(actual_arr, expected_arr)
    assert str(wrap_tr) != str(tr)
Example #14
def test_check_inverse():
    X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))

    X_list = [X_dense, sparse.csr_matrix(X_dense), sparse.csc_matrix(X_dense)]

    for X in X_list:
        if sparse.issparse(X):
            accept_sparse = True
        else:
            accept_sparse = False
        trans = FunctionTransformer(
            func=np.sqrt,
            inverse_func=np.around,
            accept_sparse=accept_sparse,
            check_inverse=True,
            validate=True,
        )
        warning_message = (
            "The provided functions are not strictly"
            " inverse of each other. If you are sure you"
            " want to proceed regardless, set"
            " 'check_inverse=False'."
        )
        with pytest.warns(UserWarning, match=warning_message):
            trans.fit(X)

        trans = FunctionTransformer(
            func=np.expm1,
            inverse_func=np.log1p,
            accept_sparse=accept_sparse,
            check_inverse=True,
            validate=True,
        )
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            Xt = trans.fit_transform(X)

        assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))

    # check that we don't check inverse when one of the func or inverse is not
    # provided.
    trans = FunctionTransformer(
        func=np.expm1, inverse_func=None, check_inverse=True, validate=True
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        trans.fit(X_dense)
    trans = FunctionTransformer(
        func=None, inverse_func=np.expm1, check_inverse=True, validate=True
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        trans.fit(X_dense)
def function_transformer_example(sample_df):
    # Obtain the text data: get_text_data
    get_text_data = FunctionTransformer(lambda x: x['text'], validate=False)

    # Obtain the numeric data: get_numeric_data
    get_numeric_data = FunctionTransformer(
        lambda x: x[['numeric', 'with_missing']], validate=False)

    # Fit and transform the text data: just_text_data
    just_text_data = get_text_data.fit_transform(sample_df)

    # Fit and transform the numeric data: just_numeric_data
    just_numeric_data = get_numeric_data.fit_transform(sample_df)

    # Print head to check results
    print('Text Data')
    print(just_text_data.head())
    print('\nNumeric Data')
    print(just_numeric_data.head())

    X_train, X_test, y_train, y_test = train_test_split(
        sample_df[['numeric', 'with_missing', 'text']],
        pd.get_dummies(sample_df['label']),
        random_state=22)

    # Create a FeatureUnion with nested pipeline: process_and_join_features
    process_and_join_features = FeatureUnion(
        transformer_list=[('numeric_features',
                           Pipeline([('selector',
                                      get_numeric_data), ('imputer',
                                                          Imputer())])),
                          ('text_features',
                           Pipeline([(
                               'selector',
                               get_text_data), ('vectorizer',
                                                CountVectorizer())]))])

    # Instantiate nested pipeline: pl
    pl = Pipeline([('union', process_and_join_features),
                   ('clf', OneVsRestClassifier(LogisticRegression()))])
def test_function_transformer_feature_names_out_uses_estimator():
    def add_n_random_features(X, n):
        return np.concatenate([X, np.random.rand(len(X), n)], axis=1)

    def feature_names_out(transformer, input_features):
        n = transformer.kw_args["n"]
        return list(input_features) + [f"rnd{i}" for i in range(n)]

    transformer = FunctionTransformer(
        func=add_n_random_features,
        feature_names_out=feature_names_out,
        kw_args=dict(n=3),
        validate=True,
    )
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)})
    transformer.fit_transform(df)
    names = transformer.get_feature_names_out()

    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ("a", "b", "rnd0", "rnd1", "rnd2"))
Example #17
def log_scaler(x, convert_input_data=True, return_log_scaler=False):
    """
    :param x: unscaled data (numpy array)
    :param convert_input_data: if True, convert the input data to a 2-dimensional NumPy array or sparse matrix
    :param return_log_scaler: boolean value which enables returning (or not) the FunctionTransformer instance

    :return: scaled data (numpy array), FunctionTransformer instance (optional)
    """
    log_transformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1, validate=convert_input_data)
    x_scaled = log_transformer.fit_transform(x)
    if return_log_scaler:
        return x_scaled, log_transformer
    return x_scaled
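
# A minimal usage sketch for log_scaler above; the sample array is an
# illustrative assumption, not part of the original example.
import numpy as np

x_demo = np.array([[1.0], [10.0], [100.0]])
x_demo_scaled, demo_scaler = log_scaler(x_demo, return_log_scaler=True)
print(demo_scaler.inverse_transform(x_demo_scaled))  # recovers the original values via expm1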
Example #18
def plot_published_games_over_years(df, lb, ub, exponential_regression=True):
    """
    Plot the overall #published games over years from lb to ub

    df: dataframe 
    lb: yearpublished lower bound
    ub: yearpublished upper bound
    exponential_regression: a flag whether to plot an exponential regression line
    """

    assert isinstance(df, pd.DataFrame)
    assert isinstance(lb, int) and lb > 0
    assert isinstance(ub, int) and ub > 0
    assert isinstance(exponential_regression, bool)

    # Filter the dataframe on yearpublished lower bound and upper bound
    filtered_df = df.loc[(df["yearpublished"] >= lb)
                         & (df["yearpublished"] <= ub)]

    # Configure the pyplot setting
    fig = plt.figure(figsize=(15, 10))
    sns.set(style="ticks")

    # Draw an exponential regression line
    if exponential_regression:
        transformer = FunctionTransformer(np.log, validate=True)
        counts = filtered_df.groupby("yearpublished").count()["id"]
        x = np.arange(len(counts))[:, None]
        y = counts.values[:, None]
        # Fit exponential model
        y_trans = transformer.fit_transform(y)
        regressor = LinearRegression()
        results = regressor.fit(x, y_trans)
        model = results.predict
        y_fit = model(x)
        plt.plot(x + lb, np.exp(y_fit), "--", color="brown", linewidth=2)

    # Plot the histogram of published games
    p = sns.histplot(filtered_df["yearpublished"],
                     discrete=True,
                     stat="count",
                     color="orange",
                     edgecolor="white")
    p.set_xlabel("Year", fontsize=25, weight="bold")
    p.set_ylabel("Number of games", fontsize=25, weight="bold")
    p.tick_params(labelsize=20)
    p.set_xticks(p.get_xticks()[1:-2])
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['top'].set_visible(False)

    plt.show()
Example #19
def scale_data(df, p, train=True, save=True):
    if p.log_scale:
        df.loc[df["last_pend_time"] == 0, "last_pend_time"] = 1
        if train:
            log_scaler = FunctionTransformer(np.log2)
            df.loc[:, ["last_pend_time"]] = log_scaler.fit_transform(
                df[["last_pend_time"]])
            if save:
                joblib.dump(log_scaler, "log_scaler.save")
        else:
            log_scaler = joblib.load("log_scaler.save")
            df.loc[:, ["last_pend_time"]] = log_scaler.transform(
                df[["last_pend_time"]])

    scale_cols = ["last_pend_time"]
    if p.use_using_cores:
        scale_cols.append("using_cores")
    if p.use_spending_run_time:
        scale_cols.append("spending_run_time")
    if p.use_pending_jobs:
        scale_cols.append("pending_jobs")
    if p.use_last_pend_time_submit:
        scale_cols.append("last_pend_time_submit")
    if p.use_submit_time:
        scale_cols.append("sin_submit_time")
        scale_cols.append("cos_submit_time")
    if p.use_day_of_week:
        scale_cols.append("sin_day_of_week")
        scale_cols.append("cos_day_of_week")

    if train:
        min_max_scaler = MinMaxScaler(feature_range=(0, 1))
        df.loc[:, scale_cols] = min_max_scaler.fit_transform(df[scale_cols])
        if save:
            joblib.dump(min_max_scaler, "min_max_scaler.save")
    else:
        min_max_scaler = joblib.load("min_max_scaler.save")
        df.loc[:, scale_cols] = min_max_scaler.transform(df[scale_cols])

    if p.standard_scale:
        if train:
            standard_scaler = StandardScaler()
            df.loc[:,
                   scale_cols] = standard_scaler.fit_transform(df[scale_cols])
            if save:
                joblib.dump(standard_scaler, "standard_scaler.save")
        else:
            standard_scaler = joblib.load("standard_scaler.save")
            df.loc[:, scale_cols] = standard_scaler.transform(df[scale_cols])

    return df
    def log1p(self, **params) -> pd.DataFrame:
        """
        log1p transform
        :param params:
        :return:
        """
        print('Columns being log-transformed:')
        print(self.columns)

        transformer = FunctionTransformer(np.log1p)
        self.df[self.columns] = transformer.fit_transform(
            self.df[self.columns].values)

        return self.df
Example #21
def main(params, inputs, outputs):

    ### Read the input data ###
    x = pd.read_pickle(inputs.x)

    ### Define a function that drops the first column ###
    def all_but_first_column(X):
        # .iloc keeps this working when X is a DataFrame (plain X[:, 1:] only works on arrays)
        return X.iloc[:, 1:] if hasattr(X, 'iloc') else X[:, 1:]

    ### Fit and transform with FunctionTransformer ###
    ft = FunctionTransformer(all_but_first_column)
    x_new = ft.fit_transform(x)

    ### Write the result ###
    x_new.to_pickle(outputs.x_new)
Example #22
def impute():
    csv_data = '''A,B,C,D\n1.0,2.0,3.0,4.0\n5.0,6.0,,8.0\n10.0,11.0,12.0,'''
    df = pd.read_csv(StringIO(csv_data))
    print(df)

    # calculate column mean
    imr = SimpleImputer(missing_values=np.nan, strategy='mean')
    imr = imr.fit(df.values)
    imputed_data = imr.transform(df.values)
    print(imputed_data)

    # calculate row mean
    ftr_imr = FunctionTransformer(lambda X: imr.fit_transform(X.T).T, validate=False)
    imputed_data = ftr_imr.fit_transform(df.values)
    print(imputed_data)
Example #23
def df_to_exponential_fit(df, colX, colY, wgt=None):

    X = df[colX].values.reshape(-1, 1)  # values converts it into a numpy array
    Y = df[colY].values.reshape(
        -1,
        1)  # -1 means that calculate the dimension of rows, but have 1 column

    # Y = np.log(df[colY].values.reshape(-1, 1)) # -1 means that calculate the dimension of rows, but have 1 column
    transformer = FunctionTransformer(np.log, validate=True)
    y_trans = transformer.fit_transform(Y)

    linear_regressor = LinearRegression()  # create object for the class
    linear_regressor.fit(X, y_trans, sample_weight=wgt)  # perform linear regression

    Y_pred = linear_regressor.predict(X)  # make predictions
    coef = float(linear_regressor.coef_)

    return Y_pred, coef
def test_function_transformer_validate_inverse():
    """Test that function transformer does not reset estimator in
    `inverse_transform`."""
    def add_constant_feature(X):
        X_one = np.ones((X.shape[0], 1))
        return np.concatenate((X, X_one), axis=1)

    def inverse_add_constant(X):
        return X[:, :-1]

    X = np.array([[1, 2], [3, 4], [3, 4]])
    trans = FunctionTransformer(
        func=add_constant_feature,
        inverse_func=inverse_add_constant,
        validate=True,
    )
    X_trans = trans.fit_transform(X)
    assert trans.n_features_in_ == X.shape[1]

    trans.inverse_transform(X_trans)
    assert trans.n_features_in_ == X.shape[1]
Example #25
def clean_df(X):

    date_encoder = FunctionTransformer(_encode_dates)
    X = date_encoder.fit_transform(X)

    X.rename(
        {
            'year': 'year_departure',
            'day': 'day_departure',
            'n_days': 'n_days_departure'
        },
        axis=1,
        inplace=True)

    columns = [
        'DateOfDeparture', 'DateBooked', 'state_dep', 'state_arr', 'week',
        'month', 'weekday', 'holidays_dep', 'holidays_arr', 'Departure',
        'Arrival'
    ]
    X.drop(columns, axis=1, inplace=True)

    return X
Example #26
def test_function_transformer_frame():
    pd = pytest.importorskip("pandas")
    X_df = pd.DataFrame(np.random.randn(100, 10))
    transformer = FunctionTransformer()
    X_df_trans = transformer.fit_transform(X_df)
    assert hasattr(X_df_trans, "loc")
Example #27
from sklearn.datasets import load_iris
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np

# Build a small text/numeric demo frame from the iris data first, so the
# selectors below have something to operate on
iris = load_iris()
text = 'hi, my name is vishesh'
df_text = pd.DataFrame(iris.data, columns=iris.feature_names)
df_text['label'] = iris.target
label_names = ['hello', 'man', 'namely']
df_text['text'] = df_text['label'].apply(lambda x: 'hi my name is {}'.format(label_names[x]))
trainX, testX, trainY, testY = train_test_split(df_text, pd.get_dummies(df_text.label), test_size=.3)

# Column selectors; double brackets keep the result 2-D so Imputer accepts it
func1 = FunctionTransformer(lambda x: x[[x.columns[0]]], validate=False)
func2 = FunctionTransformer(lambda x: x[[x.columns[2]]], validate=False)

union = FeatureUnion([('func1', make_pipeline(func1, Imputer())),
                      ('func2', make_pipeline(func2, Imputer()))])
func1.fit_transform(df_text)
func2.fit_transform(df_text)
pd.DataFrame(union.fit_transform(df_text))

TOKEN = '\\S+(?=\\s+)'
countvec = CountVectorizer(ngram_range = (1,3))
countvec_HASH = HashingVectorizer(token_pattern=TOKEN, ngram_range = (1,3), norm=None, non_negative=True)
# Hash function takes a token as input and outputs a hash value; we can limit the number of these values. Thus, each
# hash value may have multiple tokens assigned to it. Interestingly, this has little effect on model accuracy.
# Some problems are memory-bound and not easily parallelizable, and hashing enforces a fixed length computation instead
# of using a mutable datatype (like a dictionary).
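
# A minimal sketch of the hashing trick described above: with only a handful
# of buckets, distinct tokens can collide into the same column, yet every
# document still maps to a fixed-width vector. The toy documents, n_features
# value, and the alternate_sign flag (recent scikit-learn) are illustrative
# assumptions, not part of the original notes.
from sklearn.feature_extraction.text import HashingVectorizer

demo_docs = ['red green blue', 'blue blue yellow green']
demo_hasher = HashingVectorizer(n_features=4, norm=None, alternate_sign=False)
print(demo_hasher.fit_transform(demo_docs).toarray())  # shape (2, 4): per-bucket token counts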
text_data = combine_text_columns(X_train)
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'
hashing_vec = HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC)
hashed_text = hashing_vec.fit_transform(text_data)


# Pipeline for numeric and categorical variables -----------------------------------------------------------------------
# Any step in the pipeline must be an object that implements the fit and transform methods. The FunctionTransformer
# creates an object with these methods out of any Python function that you pass to it.
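# A small sketch of what that means: any object exposing fit/transform can sit
# in a Pipeline. ColumnSelector below is a hypothetical stand-in, not part of
# the original notes; FunctionTransformer builds the same kind of object from
# a plain function, which is what the two selectors below do.
class ColumnSelector:
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self  # nothing to learn, the selection is fixed

    def transform(self, X):
        return X[self.column]
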
get_text_data = FunctionTransformer(lambda x: x['text_col'], validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[['numeric_col_1', 'numeric_col_2']], validate=False)
just_text_data = get_text_data.fit_transform(df)
just_numeric_data = get_numeric_data.fit_transform(df)

# FeatureUnion joins results of multiple pipelines together
process_and_join_features = FeatureUnion(
            transformer_list=[
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer())
                ]))
             ]
        )
Example #29
def predict_missing_price(preprocessed_data, one_hot=False):

    test_index = preprocessed_data.price != preprocessed_data.price

    feature_columns = [
        i for i in preprocessed_data.columns if i not in ['class_id', 'price']
    ]
    y_column = ['price']
    testX = preprocessed_data.loc[test_index, feature_columns].values

    trainX = preprocessed_data.loc[~test_index, feature_columns].values
    trainY = preprocessed_data.loc[~test_index, y_column].values

    # plt.hist(trainY)
    # plt.show()

    # after a log1p transform the sales data is closer to a normal distribution, which works better than the sqrt transform
    # trs = FunctionTransformer(func=np.sqrt, inverse_func=np.square)
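    # A quick, hedged way to check that claim (scipy is an assumed extra
    # dependency here, not used elsewhere in this function):
    # from scipy.stats import skew
    # print(skew(trainY), skew(np.log1p(trainY)), skew(np.sqrt(trainY)))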
    trs = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)
    scaler = MinMaxScaler()
    trainX = scaler.fit_transform(trainX)
    trainY = trs.fit_transform(np.reshape(trainY, (-1, 1)))

    # plt.hist(trainY)
    # plt.show()
    print(trainX.shape, trainY.shape)
    clf = xgb.XGBRegressor(seed=12)

    if one_hot:
        # ONE HOT with norm PARAMS square
        grid = [
            {
                'booster': ['gbtree'],
                'learning_rate': [0.1],
                # 'min_child_weight':[],
                'max_depth': [2],
                'gamma': [1],
                'subsample': [0.3],
                'colsample_bytree': [0.3],
                'reg_alpha': [1.0],
                'reg_lambda': [0.85],
                'scale_pos_weight': [1]
            },
        ]
    else:
        # no one hot PARAMS sqrt

        # grid = [{
        #     'booster': ['gbtree'],
        #     'learning_rate': [0.1],
        #     # 'min_child_weight':[],
        #     'max_depth': [2],
        #     'gamma': [0.7],
        #     'subsample': [0.1],
        #     'colsample_bytree': [0.3],
        #     'reg_alpha': [0.5],
        #     'reg_lambda': [0.3],
        #     'scale_pos_weight': [1]
        # },
        # ]

        # no one hot PARAMS log1p
        grid = [
            {
                'booster': ['gbtree'],
                'learning_rate': [0.25],
                # 'min_child_weight':[],
                'max_depth': [2],
                'gamma': [0.09],
                'subsample': [0.1],
                'colsample_bytree': [0.95],
                'reg_alpha': [0.5],
                'reg_lambda': [0.25],
                'scale_pos_weight': [1]
            },
        ]

    gridCV = GridSearchCV(estimator=clf,
                          param_grid=grid,
                          scoring=make_scorer(_scorer,
                                              greater_is_better=False),
                          iid=False,
                          n_jobs=-1,
                          cv=6,
                          verbose=1)

    gridCV.fit(trainX, trainY)

    print("best params:", gridCV.best_params_)
    print('best score:', gridCV.best_score_)
    testX = scaler.transform(testX)
    predY = np.reshape(gridCV.predict(testX), (-1, 1))
    preprocessed_data.loc[test_index, y_column] = trs.inverse_transform(predY)

    return preprocessed_data
'''
You are working with numeric data that needs imputation, and text data that needs to be converted into a bag-of-words. You'll create functions that separate the text from the numeric variables and see how the .fit() and .transform() methods work.

INSTRUCTIONS
Compute the selector get_text_data by using a lambda function and FunctionTransformer() to obtain all 'text' columns.
Compute the selector get_numeric_data by using a lambda function and FunctionTransformer() to obtain all the numeric columns (including missing data). These are 'numeric' and 'with_missing'.
Fit and transform get_text_data using the .fit_transform() method with sample_df as the argument.
Fit and transform get_numeric_data using the same approach as above.
'''
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

# Obtain the text data: get_text_data
get_text_data = FunctionTransformer(lambda x: x['text'], validate=False)

# Obtain the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[['numeric', 'with_missing']], validate=False)

# Fit and transform the text data: just_text_data
just_text_data = get_text_data.fit_transform(sample_df)

# Fit and transform the numeric data: just_numeric_data
just_numeric_data = get_numeric_data.fit_transform(sample_df)

# Print head to check results
print('Text Data')
print(just_text_data.head())
print('\nNumeric Data')
print(just_numeric_data.head())
Example #31
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion

# FeatureUnion is useful to combine pipelines! 
###################################################
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

# Obtain the text data: get_text_data
get_text_data = FunctionTransformer(lambda x: x['text'], validate=False)

# Obtain the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[['numeric', 'with_missing']], validate=False)

# Fit and transform the text data: just_text_data
just_text_data = get_text_data.fit_transform(sample_df)

# Fit and transform the numeric data: just_numeric_data
just_numeric_data = get_numeric_data.fit_transform(sample_df)

# # Print head to check results
# print('Text Data')
# print(just_text_data.head())
# print('\nNumeric Data')
# print(just_numeric_data.head())
# Import FeatureUnion
from sklearn.pipeline import FeatureUnion

# Split using ALL data in sample_df
X_train, X_test, y_train, y_test = train_test_split(sample_df[['numeric', 'with_missing', 'text']],
                                                    pd.get_dummies(sample_df['label']),
                                                    random_state=22)
Example #32
class MSD(IndexableDataset):
    """Assuming input datastream is a example of
    user-item interaction triplet. In this class
    mel-spectrogram and tag vector (BOW) is fetched
    based on the triplet's item index
    """
    provides_sources = ('raw')

    def __init__(self, target, which_set, config, *args, **kwargs):
        """
        """
        self.source = 'raw'
        self.axis_labels = None

        self.sr = config.hyper_parameters.sample_rate
        self.length = config.hyper_parameters.patch_length
        self.slice_dur = int(self.length * self.sr)
        self.sub_batch_sz = config.hyper_parameters.sub_batch_size

        self.n_fft = config.hyper_parameters.n_fft
        self.hop_length = config.hyper_parameters.hop_size
        self.output_norm = config.data_server.output_norm

        self.target = target
        self.which_set = which_set

        self.n_jobs = config.data_server.n_jobs

        self.config = config

        # load dataset into instance
        self._load()

    def _load(self):

        if hasattr(self.config.paths.meta_data.splits, self.target):
            split_fn = getattr(self.config.paths.meta_data.splits, self.target)
            split_fn = os.path.join(self.config.paths.meta_data.root, split_fn)

            self.internal_idx = joblib.load(split_fn)[self.which_set]
        else:
            raise IOError('[ERROR] cannot load split file!')

        if hasattr(self.config.paths.meta_data.targets, self.target):
            target_fn = getattr(self.config.paths.meta_data.targets, self.target)
            target_fn = os.path.join(self.config.paths.meta_data.root,
                                     target_fn)

            target = joblib.load(target_fn)

            target_ref = {v: k for k, v in enumerate(target['tids'])}
            self.Y = target['item_factors']

            # output standardization
            if self.output_norm:
                self.out_sclr = StandardScaler()
            else:
                self.out_sclr = FunctionTransformer(func=lambda x: x)
            self.Y = self.out_sclr.fit_transform(self.Y)

        else:
            self.Y = None

        path_to_pathmap = self.config.paths.path_map
        if (path_to_pathmap is not None) and os.path.exists(path_to_pathmap):
            self._path_map = pkl.load(open(path_to_pathmap, 'rb'))

            # filter out error entries (no data)
            incl = filter(lambda t: t in self._path_map, self.internal_idx)

            self.Y = self.Y[map(lambda x: target_ref[x], incl)]
            self.internal_idx = map(lambda x: x, incl)

            if self.Y.shape[0] != len(self.internal_idx):
                raise ValueError('length between index and targets is not consistent!')

    @property
    def num_examples(self):
        """
        """
        return len(self.internal_idx)

    def _multi_load(self, fns):
        """"""
        return pmap(partial(load_audio, sr=self.sr), fns, n_jobs=self.n_jobs)

    def _convert_index(self, request):
        """"""
        return map(
            lambda x: os.path.join(self.config.paths.audio.root,
                                   self._path_map[self.internal_idx[x]]),
            request)

    def get_data(self, state=None, request=None):
        if state is not None:
            raise ValueError

        # (batch,2,sr*length)
        try:
            batch_sz = len(request)

            if self.target != 'self':
                # convert index
                request_fn = self._convert_index(request)

                # list of (2, 128, len)
                signal = self._multi_load(request_fn)
                signal, mask = zero_pad_signals(signal)

                # fetch target
                target = map(lambda ix: self.Y[ix], request)
                data = filter(lambda y: y[1].sum() > self.slice_dur,
                              zip(signal, mask, target))
                X = map(lambda x: x[0], data)
                M = map(lambda x: x[1], data)
                Y = map(lambda x: x[2], data)

                # prepare sub batch
                X, Y = prepare_sub_batches(self.sub_batch_sz, self.slice_dur,
                                           X, M, Y)

            else:
                # get index list
                triplet = sample_matcher_idx(request, self.internal_idx)

                # make hash for batch elements
                uniq_idx = list(
                    set(
                        list(
                            chain.from_iterable(
                                map(lambda x: (x[0], x[1]), triplet)))))
                uniq_hash = {v: k for k, v in enumerate(uniq_idx)}

                # convert index into path
                uniq_paths = self._convert_index(uniq_idx)

                # list of (2, 128, len)
                signal = self._multi_load(uniq_paths)
                signal, mask = zero_pad_signals(signal)

                # list of (128,n_frames)
                data = filter(lambda x: x[1].sum() > self.slice_dur,
                              zip(signal, mask, uniq_idx))
                survivors = set(map(lambda x: x[2], data))
                data = {d[2]: (d[0], d[1]) for d in data}

                # assign databatch into original order
                Xl, Xr, Ml, Mr, Y = [], [], [], [], []
                for d in triplet:
                    if (d[0] not in survivors) or (d[1] not in survivors):
                        continue
                    else:
                        Xl.append(data[d[0]][0])
                        Xr.append(data[d[1]][0])
                        Ml.append(data[d[0]][1])
                        Mr.append(data[d[1]][1])
                        Y.append(d[2])

                # prepare sub batch
                Xl, Y = prepare_sub_batches(self.sub_batch_sz, self.slice_dur,
                                            Xl, Ml, Y)
                Xr, _ = prepare_sub_batches(self.sub_batch_sz, self.slice_dur,
                                            Xr, Mr)

                X = np.swapaxes(np.array([Xl, Xr]), 0, 1)
                y = np.eye(2)
                Y = y[Y.ravel().astype(int).tolist()]

            print(X.shape, Y.shape)

        except Exception as e:
            traceback.print_exc()
            # raise Exception
            return -1, -1, request
        else:
            return X, Y, request
def test_function_transformer_frame():
    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(np.random.randn(100, 10))
    transformer = FunctionTransformer(validate=False)
    X_df_trans = transformer.fit_transform(X_df)
    assert hasattr(X_df_trans, 'loc')