Code example #1
def test_nonexistent_columns_explicit_fail(simple_dataframe):
    """
    If a nonexistent column is selected, KeyError is raised.
    """
    mapper = DataFrameMapper(None)
    with pytest.raises(KeyError):
        mapper._get_col_subset(simple_dataframe, ["nonexistent_feature"])
Code example #2
File: structured.py Project: gil2abir/fastai
def scale_vars(df, mapper):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n],StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper
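
A minimal usage sketch for scale_vars above (df_train and df_valid are hypothetical pandas DataFrames with the same numeric columns): the mapper returned from the first call is passed back in, so the validation frame is scaled with the training statistics.

# Fit one StandardScaler per numeric column and scale df_train in place.
mapper = scale_vars(df_train, None)
# Reuse the fitted mapper so df_valid is scaled with the training statistics.
scale_vars(df_valid, mapper)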
Code example #3
File: predict.py Project: adcaes/kaggelCompetitions
def preprocess_train(train):
    train_y = train['count']
    train_y1 = train['casual']
    train_y2 = train['registered']

    preprocess_data(train)

    mapper = DataFrameMapper([
        ('hour', None),
        ('season', preprocessing.LabelBinarizer()),
        ('holiday', None),
        ('workingday', None),
        ('weather', preprocessing.LabelBinarizer()),
        ('temp', None),
        ('atemp', None),
        ('humidity', None),
        ('windspeed', None),
        ('weekday', None),
        ('is_sunday', None),
        ('bad_weather', None),
        ('year', None),
    ])

    train_X = mapper.fit_transform(train)
    return train_X, train_y, train_y1, train_y2, mapper
Code example #4
def compute_cross_correlation_score(df, clfs, preprocess_scaling=True, nFold=10):
    """
    Run stratified k-fold cross-validation for each classifier and collect the results.
    :param df: DataFrame with a 'features' dict column and an 'expected_class' column
    :param clfs: iterable of classifiers to evaluate
    :param preprocess_scaling: whether to apply feature scaling during preprocessing
    :param nFold: number of stratified folds
    :return: (scores, classification_results) accumulated over all folds and classifiers
    """

    to_sklearn_features = DataFrameMapper([('features', sklearn.feature_extraction.DictVectorizer())])

    data_X = to_sklearn_features.fit_transform(df)
    data_Y = df.expected_class

    skf = cross_validation.StratifiedKFold(data_Y, n_folds=nFold)
    classification_results = []
    scores = []
    for num, (train_index, test_index) in enumerate(skf):
        X_train, X_test = data_X[train_index], data_X[test_index]
        Y_train, Y_test = data_Y[train_index], data_Y[test_index]
        print("Len train{}, Len test{}".format(Y_train.size, Y_test.size))
        cross_valid_data = Cross_validation_split(X_train, X_test, Y_train, Y_test)
        cross_valid_data = preprocess(cross_valid_data, preprocess_scaling=preprocess_scaling, preprocess_correlation=False)

        for clf in clfs:
            score, classification = generate_score(clf, cross_valid_data, fold=num)
            scores.append(score)
            classification_results.append(classification)
    return scores, classification_results
Code example #5
def test_list_transformers_single_arg(simple_dataframe):
    """
    Multiple transformers can be specified in a list even if some of them
    only accept one X argument instead of two (X, y).
    """
    mapper = DataFrameMapper([("a", [MockXTransformer()])])
    # doesn't fail
    mapper.fit_transform(simple_dataframe)
Code example #6
def test_simple_df(simple_dataframe):
    """
    Get a dataframe from a simple mapped dataframe
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', None)], df_out=True)
    transformed = mapper.fit_transform(df)
    assert type(transformed) == pd.DataFrame
    assert len(transformed["a"]) == len(simple_dataframe["a"])
Code example #7
def test_transformed_names_complex_alias(complex_dataframe):
    """
    If we specify an alias for a multiple output column, it is used for the
    output
    """
    df = complex_dataframe
    mapper = DataFrameMapper([('target', LabelBinarizer(), {'alias': 'new'})])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['new_a', 'new_b', 'new_c']
Code example #8
def test_get_col_subset_single_column_array(simple_dataframe):
    """
    Selecting a single column should return a 1-dimensional numpy array.
    """
    mapper = DataFrameMapper(None)
    array = mapper._get_col_subset(simple_dataframe, "a")

    assert type(array) == np.ndarray
    assert array.shape == (len(simple_dataframe["a"]),)
Code example #9
def test_transformed_names_binarizer(complex_dataframe):
    """
    Get transformed names of features in the `transformed_names_` attribute
    for a transformation that multiplies the number of columns
    """
    df = complex_dataframe
    mapper = DataFrameMapper([('target', LabelBinarizer())])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['target_a', 'target_b', 'target_c']
Code example #10
def test_transformed_names_simple(simple_dataframe):
    """
    Get transformed names of features in the `transformed_names_` attribute
    for a simple transformation
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', None)])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['a']
Code example #11
File: __init__.py Project: jpmml/sklearn2pmml
	def test_mapper(self):
		domain = CategoricalDomain()
		df = DataFrame([{"X" : "2", "y" : 2}, {"X" : "1"}, {"X" : "3"}])
		mapper = DataFrameMapper([
			("X", [domain, LabelBinarizer()]),
			("y", None)
		])
		mapper.fit_transform(df)
		self.assertEqual(numpy.array(["1", "2", "3"]).tolist(), domain.data_.tolist())
Code example #12
def test_transformed_names_simple_alias(simple_dataframe):
    """
    If we specify an alias for a single output column, it is used for the
    output
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', None, {'alias': 'new_name'})])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['new_name']
Code example #13
def test_default_none_names():
    """
    If default=None, column names are returned unmodified.
    """
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
    mapper = DataFrameMapper([], default=None)

    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['a', 'b']
Code example #14
File: transformer.py Project: GMadorell/abris
class Transformer(object):
    """
    The purpose of this class is to take a dataframe and transform it into
    a numpy array compatible format.
    """

    def __init__(self, config):
        self.__config = config
        self.__mapper = None
        self.__label_encoder_adapter = TransformerAdapter(LabelEncoderMissingValuesTransformer())

    def prepare(self, dataframe):
        """
        Takes the already cleaned dataframe, splits it into train and test
        and returns the train and test as numpy arrays.
        If the problem is supervised, the target column will be that last one
        of the returned arrays.
        """
        mapping = DataFrameMapCreator().get_mapping_from_config(self.__config)
        self.__mapper = DataFrameMapper(mapping)
        train, test = split_dataframe_train_test(dataframe, self.__config.get_option_parameter("split", "train_percentage"))
        return self.__get_correct_return_parameters(train, test)

    def __get_correct_return_parameters(self, train, test):
        model = self.__config.get_data_model()

        train_transformed = self.__mapper.fit_transform(train)
        test_transformed = self.__mapper.transform(test)

        if model.has_target():
            return self.__add_target_data(train_transformed, train), \
                   self.__add_target_data(test_transformed, test)
        else:
            return train_transformed, test_transformed

    def __add_target_data(self, transformed_data, original_data):
        """
        Picks up the target data from the original_data and appends it as a
        column to the transformed_data.
        Both arguments are expected to be np.array's.
        """
        model = self.__config.get_data_model()
        target_feature = model.find_target_feature()
        name = target_feature.get_name()

        if target_feature.is_categorical():
            target_row = original_data[name]
            target = self.__label_encoder_adapter.transform(target_row)
        else:
            target = original_data[name].values.astype(type_name_to_data_type("float"))

        target = target[..., None]

        return np.hstack((transformed_data, target))

    def apply(self, dataframe):
        return self.__mapper.transform(dataframe)
Code example #15
def test_fit_with_optional_y_arg(complex_dataframe):
    """
    Transformers with an optional y argument in the fit method
    are handled correctly
    """
    df = complex_dataframe
    mapper = DataFrameMapper([(['feat1', 'feat2'], MockTClassifier())])
    # doesn't fail
    mapper.fit(df[['feat1', 'feat2']], df['target'])
Code example #16
File: __init__.py Project: jpmml/sklearn2pmml
	def test_mapper(self):
		domain = ContinuousDomain()
		df = DataFrame([{"X1" : 2.0, "X2" : 2, "y" : 2.0}, {"X1" : 1.0, "X2" : 0.5}, {"X1" : 3.0, "X2" : 3.5}])
		mapper = DataFrameMapper([
			(["X1", "X2"], [domain, StandardScaler()]),
			("y", None)
		])
		mapper.fit_transform(df)
		self.assertEqual(numpy.array([1.0, 0.5]).tolist(), domain.data_min_.tolist())
		self.assertEqual(numpy.array([3.0, 3.5]).tolist(), domain.data_max_.tolist())
Code example #17
def test_binarizer2_df():
    """
    Check level names from LabelBinarizer with just one output column
    """
    df = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'a']})
    mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True)
    transformed = mapper.fit_transform(df)
    cols = transformed.columns
    assert len(cols) == 1
    assert cols[0] == 'target'
Code example #18
def test_get_col_subset_single_column_list(simple_dataframe):
    """
    Selecting a list of columns (even if the list contains a single element)
    should return a 2-dimensional numpy array.
    """
    mapper = DataFrameMapper(None)
    array = mapper._get_col_subset(simple_dataframe, ["a"])

    assert type(array) == np.ndarray
    assert array.shape == (len(simple_dataframe["a"]), 1)
Code example #19
def test_default_transformer():
    """
    If default is a transformer, it is applied to the columns that are not
    explicitly selected.
    """
    df = pd.DataFrame({'a': [1, np.nan, 3], })
    mapper = DataFrameMapper([], default=Imputer())

    transformed = mapper.fit_transform(df)
    assert (transformed[:, 0] == np.array([1., 2., 3.])).all()
Code example #20
def test_sparse_off(simple_dataframe):
    """
    If the resulting features are sparse but the "sparse" argument
    of the mapper is False, return a non-sparse matrix.
    """
    df = simple_dataframe
    mapper = DataFrameMapper([("a", ToSparseTransformer())], sparse=False)

    dmatrix = mapper.fit_transform(df)
    assert type(dmatrix) != sparse.csr.csr_matrix
Code example #21
def test_list_transformers_old_unpickle(simple_dataframe):
    mapper = DataFrameMapper(None)
    # simulate the mapper was created with < 1.0.0 code
    mapper.features = [("a", [MockXTransformer()])]
    mapper_pickled = pickle.dumps(mapper)

    loaded_mapper = pickle.loads(mapper_pickled)
    transformer = loaded_mapper.features[0][1]
    assert isinstance(transformer, TransformerPipeline)
    assert isinstance(transformer.steps[0][1], MockXTransformer)
Code example #22
def test_sparse_features(simple_dataframe):
    """
    If any of the extracted features is sparse and "sparse" argument
    is true, the hstacked result is also sparse.
    """
    df = simple_dataframe
    mapper = DataFrameMapper([("a", ToSparseTransformer())], sparse=True)
    dmatrix = mapper.fit_transform(df)

    assert type(dmatrix) == sparse.csr.csr_matrix
Code example #23
def test_multiindex_df(multiindex_dataframe_incomplete):
    """
    Get a dataframe from a multiindex dataframe with missing data
    """
    df = multiindex_dataframe_incomplete
    mapper = DataFrameMapper([([c], Imputer()) for c in df.columns],
                             df_out=True)
    transformed = mapper.fit_transform(df)
    assert len(transformed) == len(multiindex_dataframe_incomplete)
    for c in df.columns:
        assert len(transformed[str(c)]) == len(df[c])
Code example #24
def test_transformed_names_transformers_list(complex_dataframe):
    """
    When using a list of transformers, use them in inverse order to get the
    transformed names
    """
    df = complex_dataframe
    mapper = DataFrameMapper([
        ('target', [LabelBinarizer(), MockXTransformer()])
    ])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['target_a', 'target_b', 'target_c']
Code example #25
def test_onehot_df():
    """
    Check level ids from one-hot
    """
    df = pd.DataFrame({'target': [0, 0, 1, 1, 2, 3, 0]})
    mapper = DataFrameMapper([(['target'], OneHotEncoder())], df_out=True)
    transformed = mapper.fit_transform(df)
    cols = transformed.columns
    assert len(cols) == 4
    assert cols[0] == 'target_0'
    assert cols[3] == 'target_3'
Code example #26
def test_fit_transform(simple_dataframe):
    """
    Check that custom fit_transform methods of the transformers are invoked.
    """
    df = simple_dataframe
    mock_transformer = Mock()
    # return something of measurable length but does nothing
    mock_transformer.fit_transform.return_value = np.array([1, 2, 3])
    mapper = DataFrameMapper([("a", mock_transformer)])
    mapper.fit_transform(df)
    assert mock_transformer.fit_transform.called
Code example #27
def test_default_false():
    """
    If default=False, columns that are not explicitly selected are discarded.
    """
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
    mapper = DataFrameMapper([
        ('b', None)
    ], default=False)

    transformed = mapper.fit_transform(df)
    assert transformed.shape == (3, 1)
Code example #28
def test_unselected_columns():
    """
    _unselected_columns returns a list of the columns present in the given
    dataframe but not appearing in the features of the mapper.
    """
    df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
    mapper = DataFrameMapper([
        ('a', None),
        (['a', 'b'], None)
    ])
    assert 'c' in mapper._unselected_columns(df)
Code example #29
def scale_X(X, dataset):
    if dataset == 'noYelp':
        X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)
    else:
        #use sklearn pandas data mapper to scale only non binary columns
        mapper = DataFrameMapper([(['yelp_rating'], StandardScaler()), (['yelp_reviews'], StandardScaler()), (['risk'], StandardScaler()), (['insp_badge'], StandardScaler()), (['crime_count'], StandardScaler()), (['311_count'], StandardScaler()), (['construction_count'], StandardScaler()), (['avg_high_temp'], StandardScaler()), (['time_diff'], StandardScaler()), (['prev_crit_viol'], StandardScaler()), ('Burgers', None), ('Convenience Stores', None), ('Sandwiches', None), ('Wine & Spirits', None), ('adultentertainment', None), ('afghani', None), ('african', None), ('apartments', None), ('asianfusion', None), ('bagels', None), ('bakeries', None), ('bangladeshi', None), ('bars', None), ('bbq', None), ('beerbar', None), ('beergardens', None), ('belgian', None), ('brasseries', None), ('breakfast_brunch', None), ('breweries', None), ('british', None), ('buffets', None), ('burgers', None), ('burmese', None), ('cafes', None), ('cafeteria', None), ('cajun', None), ('catering', None), ('cheesesteaks', None), ('chicken_wings', None), ('chinese', None), ('chocolate', None), ('churches', None),('cocktailbars', None), ('coffee', None), ('coffeeroasteries', None), ('comfortfood', None), ('cookingschools', None), ('creperies', None), ('cuban', None), ('cupcakes', None), ('danceclubs', None), ('delis', None), ('desserts', None), ('diners', None), ('discountstore', None), ('divebars', None), ('donuts', None), ('drugstores', None), ('ethiopian', None), ('ethnicmarkets', None), ('falafel', None), ('foodtrucks', None), ('french', None), ('gastropubs', None), ('gelato', None), ('german', None), ('gluten_free', None), ('golf', None), ('gourmet', None), ('greek', None), ('grocery', None), ('gyms', None), ('halal', None), ('healthtrainers', None), ('hookah_bars', None),  ('hotdog', None), ('hotdogs', None), ('hotels', None), ('icecream', None), ('indpak', None), ('irish', None), ('irish_pubs', None), ('italian', None), ('japanese', None),  ('jazzandblues', None), ('juicebars', None), ('korean', None), ('landmarks', None),  ('latin', None), ('lawyers', None), ('lebanese', None), ('libraries', None), ('lounges', None), ('mediterranean', None), ('mexican', None), ('mideastern', None), ('mini_golf', None), ('modern_european', None), ('musicvenues', None), ('newamerican', None), ('nonprofit', None), ('pakistani', None), ('peruvian', None), ('pianobars', None), ('pizza', None),  ('publicservicesgovt', None), ('pubs', None), ('puertorican', None), ('restaurants', None),  ('salad', None), ('salvadoran', None), ('sandwiches', None), ('seafood', None),  ('social_clubs', None), ('soulfood', None), ('soup', None), ('southern', None),  ('spanish', None), ('sports_clubs', None), ('sportsbars', None), ('steak', None), ('sushi', None), ('tapas', None), ('tapasmallplates', None), ('tea', None),  ('tex-mex', None), ('thai', None), ('tobaccoshops', None), ('tradamerican', None), ('turkish', None), ('vegetarian', None), ('venues', None), ('vietnamese', None), ('wholesale_stores', None), ('wine_bars', None)])

        X_scaled = pd.DataFrame(mapper.fit_transform(X.copy()), columns=X.columns)

    print "\n data scaled\n"
    return X_scaled
Code example #30
def test_fit_transform_equiv_mock(simple_dataframe):
    """
    Check for equivalent results for code paths fit_transform
    versus fit and transform in DataFrameMapper using the mock
    transformer which does not implement a custom fit_transform.
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', MockXTransformer())])
    transformed_combined = mapper.fit_transform(df)
    transformed_separate = mapper.fit(df).transform(df)
    assert np.all(transformed_combined == transformed_separate)
Code example #31
def main():
    """ Run script"""
    options = getArgumentParser().parse_args()

    ### Make output dir
    dir_path = os.getcwd()
    out_dir = options.outdir
    path = os.path.join(dir_path, out_dir)
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)
    os.chdir(path)

    all_data = pd.read_csv(options.infile)
    all_data = all_data[all_data['w'] > 0]
    #all_data['w'] = all_data['w'].abs()
    # Variables of interest
    var = [
        'met_tight_tst_et', 'met_tight_tst_phi', 'mT', 'ph_pt', 'dphi_mety_ll',
        'AbsPt', 'Ptll', 'mllg', 'lep1pt', 'lep2pt', 'mll', 'metsig_tst',
        'Ptllg', 'dphi_met_ph'
    ]
    varw = [
        'met_tight_tst_et', 'met_tight_tst_phi', 'mT', 'ph_pt', 'dphi_mety_ll',
        'AbsPt', 'Ptll', 'mllg', 'lep1pt', 'lep2pt', 'mll', 'metsig_tst',
        'Ptllg', 'dphi_met_ph', 'w'
    ]
    units = [
        'GeV', 'Radians', 'GeV', 'GeV', 'Radians', '', 'GeV', 'GeV', 'GeV',
        'GeV', 'GeV', r'$\sqrt{GeV}$', 'GeV', 'Radians'
    ]
    df_bkg = all_data[all_data['event'] == 0][var]
    df_sig = all_data[all_data['event'] == 1][var]
    #for i in range(0,len(var)):
    #makePlots(df_bkg,df_sig,var[i],units[i],cuts=[])

    # Split into training and testing set and multiply by weights
    X = all_data
    y = all_data['event']
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)
    X_train = X_train[varw]
    X_test = X_test[varw]
    wtrain = X_train['w']
    wtest = X_test['w']
    cols = X_train.columns
    itrain = X_train.index
    itest = X_test.index
    mapper = DataFrameMapper([(cols, StandardScaler())])
    scaled_train = mapper.fit_transform(X_train.copy())
    # reuse the scaler fitted on the training set instead of refitting it on the test set
    scaled_test = mapper.transform(X_test.copy())
    X_train = pd.DataFrame(scaled_train, index=itrain, columns=cols)
    X_test = pd.DataFrame(scaled_test, index=itest, columns=cols)
    X_train = X_train.drop(['w'], axis=1)
    X_test = X_test.drop(['w'], axis=1)
    model = MLPClassifier(max_iter=2000,
                          activation='relu',
                          alpha=0.06,
                          hidden_layer_sizes=(120, 75),
                          learning_rate='adaptive',
                          momentum=0.9,
                          solver='sgd',
                          batch_size=50,
                          learning_rate_init=0.05)
    '''
    mlp = MLPClassifier(max_iter=2000,batch_size=50,momentum=0.9)
    param_grid = {
    'hidden_layer_sizes': [(sp_randint.rvs(100,600,1),sp_randint.rvs(100,600,1),), 
                                          (sp_randint.rvs(100,600,1),)],
    'activation': ['tanh', 'relu', 'logistic'],
    'solver': ['sgd', 'adam', 'lbfgs'],
    'alpha': uniform(0.0001, 0.9),
    'learning_rate': ['constant','adaptive']}
    #{'alpha': 0.06640793542453478, 'batch_size': 50, 'hidden_layer_sizes': (117, 74), 'learning_rate': 'adaptive', 'learning_rate_init': 0.05421689357774788, 'momentum': 0.9, 'solver': 'sgd'}
    #{'alpha': 0.015786202068122347, 'hidden_layer_sizes': (197,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.010660992530318792, 'solver': 'sgd'}
    parameter_space = {
    'hidden_layer_sizes': [(sp_randint.rvs(100,600,1),sp_randint.rvs(100,600,1),),(sp_randint.rvs(100,600,1),)],
    #'momentum': [0.9,0.95,0.99],
    'solver': ['sgd', 'adam','lbfgs'],
    'alpha': uniform(0.0001,0.1),
    'learning_rate_init': uniform(0.0001,0.1),
    'learning_rate': ['constant','adaptive','invscaling']
    #'batch_size': [10,50,200]
    }
    scores = ['roc_auc']
    sys.stdout = open('model_cv.txt','wt')
    for score in scores:
        clf = RandomizedSearchCV(mlp,
                               parameter_space,
                               cv=3,
                               scoring=score,
                               n_jobs=-1,n_iter=25)
        clf.fit(X_train, y_train)
        print(score)
        print()
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()
    '''
    model = model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    probs = model.predict_proba(X_test)
    metrics.plot_roc_curve(model, X_test, y_test, sample_weight=wtest)
    plt.savefig("roc.pdf")
    metrics.plot_precision_recall_curve(model,
                                        X_test,
                                        y_test,
                                        sample_weight=wtest)
    plt.savefig("prec_recall.pdf")
    #compare_train_test(model,X_train, y_train, X_test, y_test)
    plot_probs(y_test, probs, "nn")
    # Show output BDT score plot
    '''
    fig,ax = plt.subplots(1,1)
    twoclass_output = model.decision_function(X_test)
    train_output = model.decision_function(X_train)
    class_names = ["Signal", "Background"]
    plot_colors = ['red', 'blue']
    for i, n, c in zip(range(2), class_names, plot_colors):
        ax.hist(twoclass_output[i],
             bins=50,
             range=[-5,5],
             facecolor=c,
             label='Test %s' %n,
             alpha=.5,
             edgecolor=c)
        ax.hist(train_output[i],
            bins=50,
            range=[-5,5],
            label='Train %s' %n,
            fill=False,
            linestyle='--',
            edgecolor=c)
        ax.legend(loc='upper right')
        ax.set_ylabel('Samples')
        ax.set_xlabel('Score')
        ax.set_title('Decision Scores')
        plt.savefig('bdt_train_test_output_scores.pdf') 
    '''

    sys.stdout = open('model_out.txt', 'wt')
    print('Accuracy:')
    print(metrics.accuracy_score(y_test, predictions, sample_weight=wtest))
    print("ROC:")
    print(metrics.roc_auc_score(y_test, probs[:, 1], sample_weight=wtest))
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test, predictions, sample_weight=wtest))
    print(
        metrics.classification_report(y_test, predictions,
                                      sample_weight=wtest))
Code example #32
parameters = {
    'eta': 0.3,
    'silent': True,  # option for logging
    'objective': 'multi:softprob',  # error evaluation for multiclass tasks
    'num_class': 3,  # number of classes to predict
    'max_depth': 3  # depth of the trees in the boosting process
}
num_round = 20  # the number of training iterations

model = xgb.XGBClassifier(**parameters)

# model.fit(X_train, y_train)
# preds = model.predict(X_test)

default_mapper = DataFrameMapper([(i, None) for i in feat_names])

pipeline = PMMLPipeline([('mapper', default_mapper), ("classifier", model)])

pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)
y_test_trans = np.array([_[0] for _ in y_test.values])

print(precision_score(y_test, preds, average='macro'))  # computed per class, then averaged

print(precision_score(y_test, preds, average='micro'))  # computed globally, without distinguishing between classes

sklearn2pmml(pipeline, "iris_v2.pmml", with_repr=True)
# sklearn2pmml(estimator=model, mapper=default_mapper, pmml='iris_v2.xml')
Code example #33
File: model.py Project: EvanZ/kaggle
# In[10]:

from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.linear_model import LinearRegression
import numpy as np

xtrain0 = train[[
    'season', 'hour', 'holiday', 'workingday', 'humidity', 'windspeed',
    'weather', 'temp'
]]
ytrain = train['count']
mapper = DataFrameMapper([('season', LabelBinarizer()),
                          ('hour', LabelBinarizer()),
                          ('holiday', LabelBinarizer()),
                          ('workingday', LabelBinarizer()),
                          ('humidity', StandardScaler()),
                          ('windspeed', StandardScaler()),
                          ('weather', LabelBinarizer()),
                          ('temp', StandardScaler())])
xtrain1 = mapper.fit_transform(xtrain0)
model = LinearRegression()
model.fit(xtrain1, ytrain)
print(model)
print(model.coef_)

# In[11]:

model.score(xtrain1, ytrain)

# In[12]:
Code example #34
from sklearn import preprocessing

df = pd.read_csv('data/bio_stats.csv')

df['college'].value_counts()

target = 'college'
X = df.drop(target, axis=1)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

mapper = DataFrameMapper([
    ('player_height', LabelBinarizer()),
    ('player_weight', LabelBinarizer()),
    ('country', LabelBinarizer()),
    ('draft_year', LabelBinarizer()),
    ('draft_round', LabelBinarizer()),
    ('draft_number', LabelBinarizer())],df_out=True)

Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

model = LogisticRegression(max_iter=500).fit(Z_train, y_train)
model.score(Z_train, y_train)
model.score(Z_test, y_test)

model = RandomForestClassifier().fit(Z_train, y_train)
model.score(Z_train, y_train)
model.score(Z_test, y_test)
Code example #35
    token_2 = []
    token_3 = []
    token_4 = messages['length']
    token_5 = []
    token_6 = []

    count_vect = CountVectorizer()
    x_counts = count_vect.fit(messages_data)
    x_int = count_vect.transform(messages_data)
    x_int = list(x_int)

    data = preprocessing_text()
    labels = ['message', 'f1', 'f2', 'f3', 'f4', 'f5']
    df = pd.DataFrame.from_records(data, columns=labels)
    mapper = DataFrameMapper([(['f1', 'f2', 'f3', 'f4', 'f5'], None),
                              ('message',
                               CountVectorizer(binary=True,
                                               ngram_range=(1, 2)))])
    X = mapper.fit_transform(df)
    print("X " + str(X))
    print("X " + str(X.shape))
    trainset, testset, trainlabel, testlabel = train_test_split(
        X, messages_labels, test_size=0.33, random_state=42)

    SVM = svm.SVC()
    SVM.fit(trainset, trainlabel)
    predicted_values_svm = SVM.predict(testset)
    print(predicted_values_svm)
    acurracy_SVM = accuracy_score(testlabel, predicted_values_svm)
    print("acurracy_SVM " + str(acurracy_SVM))
    confusion_matrix_SVM = confusion_matrix(testlabel,
                                            predicted_values_svm,
Code example #36
File: main.py Project: nixonjin/BigDataTraining
                ]
            )
            features_def = features_def + categorical_feature_def

    if numerical_features and len(numerical_features) > 0:
        for feature in numerical_features:
            numerical_feature_def = gen_features(
                columns=[[feature]],
                classes=[
                    {'class': SimpleImputer, 'strategy': 'mean'},
                    {'class': StandardScaler},
                ]
            )
            features_def = features_def + numerical_feature_def

    preprocess = ('Preprocess', DataFrameMapper(features_def, df_out=True))
    estimator = ('Estimator', RandomForestClassifier())

    steps = [preprocess, estimator]

    pipeline = Pipeline(steps=steps)

    model = pipeline.fit(X, y)

    y_pred = pipeline.predict(test_df)
    test_df['Survived'] = y_test
    test_df['prediction'] = y_pred


    # Inspect the prediction results
    print(f"Predictions: {test_df[['PassengerId', 'Survived', 'prediction']]}")
Code example #37
File: model.py Project: Jordan-Milne/car-pricer
y = df[target]
X = df.drop(target, axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# DataFrame Mapper
mapper = DataFrameMapper(
    [
        #     ('region', LabelBinarizer()),
        (['year'], StandardScaler()),
        # ('manufacturer',[CategoricalImputer(), LabelBinarizer()]),
        ('model', [CategoricalImputer()]),
        ('cylinders', [CategoricalImputer(),
                       LabelBinarizer()]),
        ('fuel', [CategoricalImputer(), LabelBinarizer()]),
        (['odometer'], [SimpleImputer(), StandardScaler()]),
        # ('title_status', [CategoricalImputer(), LabelBinarizer()]),
        ('transmission', [CategoricalImputer(),
                          LabelBinarizer()]),
        # (['vin'], StandardScaler()),
        # ('type', [CategoricalImputer(), LabelBinarizer()]),
        ('paint_color', [CategoricalImputer(),
                         LabelBinarizer()]),
        ('condition', [CategoricalImputer(),
                       LabelBinarizer()]),
    ],
    df_out=True)

Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

# # GridSearchCV to find best params for the pipe
Code example #38
     Pipeline(steps=[('imputer', SimpleImputer(
         strategy='median')), ('scaler', StandardScaler())]))
    for f in numerical
]

categorical_transformations = [([f],
                                OneHotEncoder(handle_unknown='ignore',
                                              sparse=False))
                               for f in categorical]

transformations_pipeline = numeric_transformations + categorical_transformations

# Append classifier algorithm to preprocessing pipeline.
# Now we have a full prediction pipeline.
model_pipeline = Pipeline(steps=[('preprocessor',
                                  DataFrameMapper(transformations_pipeline)),
                                 ('classifier',
                                  LogisticRegression(C=args.C,
                                                     solver=args.solver,
                                                     penalty=args.penalty,
                                                     l1_ratio=args.l1_ratio))])

# Check Scikit-Learn docs to see the hyper-parameters available for the LogisticRegression:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

# +

# Split data into train and test
x_train, x_test, y_train, y_test = train_test_split(attritionXData,
                                                    target,
                                                    test_size=0.2,
Code example #39
    ([f],
     Pipeline(steps=[('imputer', SimpleImputer(
         strategy='median')), ('scaler', StandardScaler())]))
    for f in numerical
]

categorical_transformations = [([f],
                                OneHotEncoder(handle_unknown='ignore',
                                              sparse=False))
                               for f in categorical]

transformations = numeric_transformations + categorical_transformations

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', DataFrameMapper(transformations)
                       ), ('classifier', LogisticRegression(solver='lbfgs'))])

# Split data into train and test
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(attritionXData,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=target)

# write x_text out as a pickle file for later visualization
x_test_pkl = 'x_test.pkl'
with open(x_test_pkl, 'wb') as file:
    joblib.dump(value=x_test, filename=os.path.join('./outputs/', x_test_pkl))
Code example #40
def get_jewellery_data():
    data = pd.read_csv(r"../../../sample/jewellery_sample.csv")
    return data


def replace_foreign_characters(s):
    return re.sub(r'[^\x00-\x7f]', r'', s)


if __name__ == '__main__':
    samples = get_jewellery_data()
    X = samples.drop(['id'], axis=1)
    X['name'] = X['name'].apply(lambda x: replace_foreign_characters(x))
    X['description'] = X['description'].apply(lambda x: replace_foreign_characters(x))
    Y = samples["id"]
    print("data done!")

    pipeline = Pipeline([
        ('mapper', DataFrameMapper([
            ('name', TfidfVectorizer(norm=None, analyzer="word", max_features=200, stop_words="english")),
            ('description', TfidfVectorizer(norm=None, analyzer="word", max_features=600, stop_words="english"))
        ])),
        ('model', SVC(max_iter=10000)),  # train on TF-IDF vectors with an SVM classifier
    ])
    print("model set done!")

    pipeline.fit(X, Y)
    print("model fit done!")

    joblib.dump(pipeline, "../../../model/model_for_jewellery_second.joblib")
    print("model to JobLib done!")
Code example #41
prod_ratings.product_id = prod_ratings.product_id.apply(
    lambda x: prodid2idx[x])
prod_ratings.user_id = prod_ratings.user_id.apply(lambda x: userid2idx[x])

n_users = prod_ratings.user_id.nunique()
n_prods = prod_ratings.product_id.nunique()
# print(n_users, n_prods)


def round_rating(number):
    """Round a number to the closest half integer"""
    return np.round(number * 2) / 2
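# A couple of hypothetical values for round_rating above (np.round is used,
# so exact .5 ties round to the nearest even value):
#   round_rating(3.3) -> 3.5, round_rating(2.2) -> 2.0, round_rating(4.7) -> 4.5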


mapper = DataFrameMapper([(['product_count'], MinMaxScaler())], df_out=True)

#apply the mapper to each user and concatenate results
dfs = [
    np.round(
        mapper.fit_transform(prod_ratings[prod_ratings.user_id == u].copy()),
        1) for u in range(n_users)
]

prod_ratings['product_score'] = pd.concat(dfs).reset_index(drop=True) * 4 + 1
prod_ratings['product_score'] = round_rating(
    prod_ratings['product_score'])  #.astype(int)
#print(prod_ratings.shape)
# print(prod_ratings.head(20))

g = prod_ratings.groupby('user_id')['product_score'].count()
Code example #42
        cols_standardize = ["x0", "x3", "x4", "x6"]
        cols_leave = ["x1", "x7"]
        cols_categorical = ["x2", "x5"]

    if len(cols_categorical) > 0:
        num_embeddings = [
            len(df_train[cat].unique()) + 1 for cat in cols_categorical
        ]
        embedding_dims = [math.ceil(n_emb / 2) for n_emb in num_embeddings]

        standardize = [([col], StandardScaler()) for col in cols_standardize]
        leave = [(col, None) for col in cols_leave]
        categorical = [(col, OrderedCategoricalLong())
                       for col in cols_categorical]

        x_mapper_float = DataFrameMapper(standardize + leave)
        x_mapper_long = DataFrameMapper(categorical)
        x_fit_transform = lambda df: tt.tuplefy(
            x_mapper_float.fit_transform(df).astype(np.float32),
            x_mapper_long.fit_transform(df))
        x_transform = lambda df: tt.tuplefy(
            x_mapper_float.transform(df).astype(np.float32),
            x_mapper_long.transform(df))
    else:
        standardize = [([col], StandardScaler()) for col in cols_standardize]
        leave = [(col, None) for col in cols_leave]
        x_mapper = DataFrameMapper(standardize + leave)

    data_file_name = os.path.join('./data/', args.dataset + '.pickle')
    if os.path.exists(data_file_name):
        with open(data_file_name, 'rb') as f:
Code example #43
                    'Sensor2_entropy_250', 'Sensor3_entropy_250', 'Sensor4_entropy_250', 'Sensor5_entropy_250',
                    'Sensor6_entropy_250', 'Sensor7_entropy_250', 'Sensor8_entropy_250', 'Sensor9_entropy_250',
                    'Sensor10_entropy_250', 'Sensor11_entropy_250', 'Sensor12_entropy_250', 'Sensor13_entropy_250',
                    'Sensor14_entropy_250', 'Sensor15_entropy_250', 'Sensor16_entropy_250', 'Sensor17_entropy_250',
                    'Sensor18_entropy_250', 'Sensor19_entropy_250', 'Sensor20_entropy_250', 'Sensor21_entropy_250']

response_column = 'RUL'

# Parsing data
training_data = training_frame[training_columns]
target_data = training_frame[response_column]
testing_data = testing_frame[training_columns]
ground_truth_data = testing_frame[response_column]

# Setting up mapper
df_mapper = DataFrameMapper([(training_columns, None), (response_column, None)])

# Train data - pandas to sklearn
data = df_mapper.fit_transform(training_frame)
# train
x = data[:, 0:108]
# response
y = data[:, 108]

# Test data - pandas to sklearn
test = df_mapper.fit_transform(testing_frame)
# test
tX = test[:, 0:108]
# ground truth
tY = test[:, 108]
Code example #44
    'texture_std_dev', 'perimeter_std_dev', 'area_std_dev',
    'smoothness_std_dev', 'compactness_std_dev', 'concavity_std_dev',
    'concave_points_std_dev', 'symmetry_std_dev', 'Worst_texture',
    'Worst_perimeter', 'Worst_area', 'Worst_smoothness', 'Worst_compactness',
    'Worst_concavity', 'Worst_concave_points', 'Worst_symmetry', 'Tumor_Size',
    'Lymph_Node_Status'
]
'''
These dropped parameters are highly correlated variables; keeping them
could introduce multicollinearity, which would hurt the accuracy of the model.
'''

featureEngineered_dataset = dataset.drop(dropped_params, axis=1)
featureEngineered_dataset.head()

mapper = DataFrameMapper([(featureEngineered_dataset.columns, StandardScaler())
                          ])
scaled_features = mapper.fit_transform(featureEngineered_dataset.copy(), 4)
scaled_features_df = pd.DataFrame(scaled_features,
                                  index=featureEngineered_dataset.index,
                                  columns=featureEngineered_dataset.columns)
'''
scaled_features_df is the dataset on which feature engineering
has been performed
'''

scaled_features_df.describe()

i = 4


def running_and_evaluating_model(x, y):
Code example #45
# Input dec csv file from the current folder
data = pd.read_csv('final_deceleration_mavg-co.csv', index_col=0)

# Removing the parameters which are not used for clustering
traindf = data.drop([
    'LA array', 'FileName', 'V2', 'T1', 'T2', 'D2-D1', 'Avg LA', 'yaw array',
    'mavg_jerk'
],
                    axis=1)

# Conversion formula is a*x+b where x is the parameter after scaling
a = traindf.max(axis=0) - traindf.min(axis=0)
b = traindf.min(axis=0)

# Scaling data using MinMaxScaler, formula = (X - Min) / (Max - Min)
mapper1 = DataFrameMapper([(traindf.columns, MinMaxScaler())])
scaled_features = mapper1.fit_transform(traindf.copy(), 4)
scnum_train = pd.DataFrame(scaled_features,
                           index=traindf.index,
                           columns=traindf.columns)
#scnum_train.describe()
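# As the comment above notes, the scaling can be undone with a*x + b, where
# a = max - min and b = min were computed before scaling (a hedged sketch).
recovered = scnum_train * a + b  # recover the original, unscaled values column-wise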

# Elbow Method for finding optimal number of clusters.
# taking average of WCSS for 15 random seed values for every cluster

wcss_avg = []

for i in range(1, 15):
    wcss_k = []
    for j in range(1, 11):
        km = KMeans(n_clusters=i,
Code example #46
mapper = DataFrameMapper(
    [
        ('SOILCLASS', None),  # Soil classification
        ('LANDCOV', None),  # Land coverage class from GlobCover
        ('ELEVATION', None),  # Terrain elevation from a DEM
        ('SLOPE_PERCENTAGE', None),  # Terrain slope from a DEM
        ('ASPECT', None),  # Terrain aspect from a DEM
        ('PROFILE_CURVATURE', None),  # Terrain profile curvature from a DEM
        ('CLEAN_ID', None),  # Unique identifier for each measurement point
        ('TIMESTRR', None),  # Date for the measurement
        ('LONWGS84_x', None),  # Longitute coodinates for measurement points
        ('LATWGS84_x', None),  # Latitude coordinates for measurement points
        ('DEPTH', None),  # Depth of the measurement
        ('UHDICM.f', None),  # Upper horizon depth
        ('LHDICM.f', None),  #  Lower horizon depth
        ('DEPTH.f', None),  # Depth of the measurement
        ('UHDICM', None),  # Upper horizon depth
        ('LHDICM', None),  # Lower horizon depth
        ('CRFVOL', None),  # Coarse fragments volumetric in %
        ('SNDPPT',
         None),  # Sand content (50-2000 micro meter) mass fraction in %
        ('SLTPPT', None),  # Silt content (2-50 micro meter) mass fraction in %
        ('CLYPPT', None),  # Clay content (0-2 micro meter) mass fraction in %
        ('BLD', None),  # Bulk density (fine earth) in kg / cubic-meter
        ('PHIHOX', None),  # Soil pH x 10 in H2O
        ('PHIKCL', None),  # pH measured in a potassium chloride (KCl) solution
        ('ORCDRC', None
         ),  # Soil organic carbon content (fine earth fraction) in permilles
        ('CECSUM', None)
    ],
    df_out=True)  #
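
Because every transformer in the list above is None and df_out=True, the mapper acts as a plain column selector that returns a DataFrame. A minimal sketch (soil_df is a hypothetical frame containing all of the listed columns):

# Returns a DataFrame restricted to the columns declared in the mapper, in that order.
selected = mapper.fit_transform(soil_df)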
Code example #47
# Create a boolean mask for categorical columns
categorical_feature_mask = X.dtypes == object


# Get list of categorical column names
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Get list of non-categorical column names
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()


# Apply numeric imputer
numeric_imputation_mapper = DataFrameMapper(
                                            [([numeric_feature], SimpleImputer(strategy="median")) for numeric_feature in non_categorical_columns],
                                            input_df=True,
                                            df_out=True
                                           )

# Apply categorical imputer
categorical_imputation_mapper = DataFrameMapper(
                                                [(category_feature, CategoricalImputer()) for category_feature in categorical_columns],
                                                input_df=True,
                                                df_out=True
                                               )



# Combine the numeric and categorical transformations
numeric_categorical_union = FeatureUnion([
                                          ("num_mapper", numeric_imputation_mapper),
Code example #48
# The Pipeline constructor takes a list of name/estimator pairs defining a sequence
# of steps. All but the last estimator must be transformers (they must have a fit_transform()
# method.)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer  # available in scikit-learn < 0.22
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn_pandas import DataFrameMapper
num_pipeline = Pipeline([
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())])

housing_num_tr = num_pipeline.fit_transform(housing_num)

mapper1 = DataFrameMapper([
    ("ocean_proximity", [LabelBinarizer()])], sparse=False)

# cat_pipeline = Pipeline([
#     ('labeler', StringIndexer()),
#     ('encoder', OneHotEncoder(handle_unknown='ignore'))])
#
# housing_cat_tr = cat_pipeline.fit_transform(housing)
# print(housing_cat_tr)
#housing_cat = housing["ocean_proximity"]
# housing_cat_encoded, housing_categories = housing_cat.factorize()
#
# encoder = OneHotEncoder()
# # transform factorized categorical data from housing set  and reshape it since
# # fit.transform expects a 2D array.
# housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
# print(housing_cat_1hot)
Code example #49
                   'description_length','num_of_features',
                   'day_created', 'manager_quality', 'building_quality',
                   'hour_created', 'day_of_week_created']


# convert target label into numerical (ordinal)
target_conversion = {'low':0,'medium':1,'high':2}
y_train = X_train.interest_level.map(target_conversion).values
y_test = X_test.interest_level.map(target_conversion).values

X_train_cut = X_train[features_to_use]
X_test_cut = X_test[features_to_use]


# mapping scaler to keep dataset in a dataframe (cannot do inverse using this function)
scaler = DataFrameMapper([(X_train_cut.columns, StandardScaler())])
#scaler = StandardScaler()

# learn scale parameters from final training set and apply to training, val, and test sets
X_train_scaled = scaler.fit_transform(X_train_cut)
X_test_scaled = scaler.transform(X_test_cut)

# turn numpy arrays back to pandas dataframes (retaining column names)
X_train_df = pd.DataFrame(X_train_scaled, index=X_train_cut.index, columns=X_train_cut.columns)
X_test_df = pd.DataFrame(X_test_scaled, index=X_test_cut.index, columns=X_test_cut.columns)


# In[19]:
#==============================================================================
# Modeling and evaluation
#==============================================================================
Code example #50
#impute missing values for continuous features
imputable_cont_features = ['Age','Fare']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_train[imputable_cont_features])
print(cont_imputer.statistics_)
titanic_train[imputable_cont_features] = cont_imputer.transform(titanic_train[imputable_cont_features])

#impute missing values for categorical features
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_train['Embarked'])
print(cat_imputer.fill_)
titanic_train['Embarked'] = cat_imputer.transform(titanic_train['Embarked'])

encodable_columns=['Sex', 'Embarked', 'Pclass']
feature_defs = [(col_name, preprocessing.LabelEncoder()) for col_name in encodable_columns]
mapper = DataFrameMapper(feature_defs)
mapper.fit(titanic_train)
titanic_train[encodable_columns] = mapper.transform(titanic_train)

titanic_train1 = titanic_train.drop(['PassengerId', 'Name', 'Cabin','Ticket','Survived'], axis=1)

one_hot_encoder = preprocessing.OneHotEncoder(categorical_features = np.array([0,1,6]))
one_hot_encoder.fit(titanic_train1)
print(one_hot_encoder.n_values_)
titanic_train2 = one_hot_encoder.transform(titanic_train1).toarray()

scaler = preprocessing.StandardScaler()
scaler.fit(titanic_train2)
X_train = scaler.transform(titanic_train2)
y_train = titanic_train[['Survived']]
Code example #51
def impute_categorical_features(df, features):
    feature_defs = []
    for col_name in features:
        feature_defs.append((col_name, CategoricalImputer()))
    mapper = DataFrameMapper(feature_defs, input_df=True, df_out=True)
    df[features] = mapper.fit_transform(df[features])
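
A hedged usage sketch for the helper above (the frame and column names are invented): CategoricalImputer fills missing values with each column's most frequent value, and the helper writes the result back into the original frame.

import pandas as pd

df = pd.DataFrame({'color': ['red', None, 'red'], 'size': ['M', 'M', None]})
impute_categorical_features(df, ['color', 'size'])
# The missing entries are now 'red' and 'M', the most frequent value of each column.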
Code example #52
File: DataFrameParser.py Project: simonway/PythonML
def pandasToSkLearn(panda_frame, training_features, response_feature):
    df_mapper = DataFrameMapper([(training_features, None),
                                 (response_feature, None)])
    parsed_frame = df_mapper.fit_transform(panda_frame)
    return parsed_frame
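
For illustration only (the column names below are hypothetical): the mapper stacks the training features and the response into one numpy array, so the response ends up as the last column.

import pandas as pd

frame = pd.DataFrame({'x1': [1.0, 2.0], 'x2': [3.0, 4.0], 'y': [0, 1]})
parsed = pandasToSkLearn(frame, ['x1', 'x2'], 'y')
X, y = parsed[:, :-1], parsed[:, -1]  # features first, response as the last column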
Code example #53
def build_dataset(dataframe, num_features, scaler, include_id=False):
    x = dataframe.iloc[:, :-1]
    y = dataframe[['redshift']]

    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.2,
                                                      random_state=42)

    if include_id:
        x_train = x_train.iloc[:, :-1]
        x_val = x_val.iloc[:, :-1]
        x_test_ids = x_test.iloc[:, -1]

    chunks = 2
    if num_features > 10:
        chunks = num_features / 5

    if 5 < num_features < 16:
        x_train_ugriz, x_train_errs, x_train_experrs, _ = ugriz_errs_split(
            x_train, chunks)
        x_val_ugriz, x_val_errs, x_val_experrs, _ = ugriz_errs_split(
            x_val, chunks)
        x_test_ugriz, x_test_errs, x_test_experrs, _ = ugriz_errs_split(
            x_test, chunks)

        if scaler != None:
            mapper = DataFrameMapper([(x_train_ugriz.columns, scaler)])
            x_train_ugriz_s = mapper.fit_transform(x_train_ugriz)
            x_train_ugriz = pd.DataFrame(x_train_ugriz_s,
                                         index=x_train_ugriz.index,
                                         columns=x_train_ugriz.columns)

            x_val_ugriz_s = mapper.transform(x_val_ugriz)
            x_val_ugriz = pd.DataFrame(x_val_ugriz_s,
                                       index=x_val_ugriz.index,
                                       columns=x_val_ugriz.columns)

            x_test_ugriz_s = mapper.transform(x_test_ugriz)
            x_test_ugriz = pd.DataFrame(x_test_ugriz_s,
                                        index=x_test_ugriz.index,
                                        columns=x_test_ugriz.columns)

            if chunks == 2:
                x_train = pd.concat([x_train_ugriz, x_train_errs], axis=1)
                x_val = pd.concat([x_val_ugriz, x_val_errs], axis=1)
                x_test = pd.concat([x_test_ugriz, x_test_errs], axis=1)
            else:
                x_train = pd.concat(
                    [x_train_ugriz, x_train_errs, x_train_experrs], axis=1)
                x_val = pd.concat([x_val_ugriz, x_val_errs, x_val_experrs],
                                  axis=1)
                x_test = pd.concat([x_test_ugriz, x_test_errs, x_test_experrs],
                                   axis=1)

    elif num_features > 15:
        x_train_ugriz, x_train_errs, x_train_experrs, x_train_expmags = ugriz_errs_split(
            x_train, chunks)
        x_val_ugriz, x_val_errs, x_val_experrs, x_val_expmags = ugriz_errs_split(
            x_val, chunks)
        x_test_ugriz, x_test_errs, x_test_experrs, x_test_expmags = ugriz_errs_split(
            x_test, chunks)

        if scaler != None:
            mapper = DataFrameMapper([(x_train_ugriz.columns, scaler)])
            x_train_ugriz_s = mapper.fit_transform(x_train_ugriz)
            x_train_ugriz = pd.DataFrame(x_train_ugriz_s,
                                         index=x_train_ugriz.index,
                                         columns=x_train_ugriz.columns)

            x_val_ugriz_s = mapper.transform(x_val_ugriz)
            x_val_ugriz = pd.DataFrame(x_val_ugriz_s,
                                       index=x_val_ugriz.index,
                                       columns=x_val_ugriz.columns)

            x_test_ugriz_s = mapper.transform(x_test_ugriz)
            x_test_ugriz = pd.DataFrame(x_test_ugriz_s,
                                        index=x_test_ugriz.index,
                                        columns=x_test_ugriz.columns)

            x_train = pd.concat([
                x_train_ugriz, x_train_errs, x_train_experrs, x_train_expmags
            ],
                                axis=1)
            x_val = pd.concat(
                [x_val_ugriz, x_val_errs, x_val_experrs, x_val_expmags],
                axis=1)
            x_test = pd.concat(
                [x_test_ugriz, x_test_errs, x_test_experrs, x_test_expmags],
                axis=1)

    else:
        if scaler != None:
            mapper = DataFrameMapper([(x_train.columns, scaler)])
            x_train_s = mapper.fit_transform(x_train)
            x_train = pd.DataFrame(x_train_s,
                                   index=x_train.index,
                                   columns=x_train.columns)

            x_val_s = mapper.transform(x_val)
            x_val = pd.DataFrame(x_val_s,
                                 index=x_val.index,
                                 columns=x_val.columns)

            x_test_s = mapper.transform(x_test)
            x_test = pd.DataFrame(x_test_s,
                                  index=x_test.index,
                                  columns=x_test.columns)

    if include_id:
        x_test = pd.concat([x_test, x_test_ids], axis=1)

    return x_train, y_train, x_test, y_test, x_val, y_val, scaler
Code example #54
features = DataFrameMapper([
    (['VocabularyRichness','Egotest','WordPerLine','WordLenght'], None),
    ('CleanLyrics',CountVectorizer(analyzer = "word",   \
                             ngram_range=(1, 1),    \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 250)),
    ('PosTag',CountVectorizer(analyzer = "word",   \
                             ngram_range=(2, 2),    \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 250)),
    ('PosWord',CountVectorizer(analyzer = "word",   \
                             ngram_range=(1, 1),    \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 250)),
    ('RidTag',CountVectorizer(analyzer = "word",   \
                             ngram_range=(2, 2),    \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 250)),
    ('RidTagOnly',CountVectorizer(analyzer = "word",   \
                             ngram_range=(4, 4),    \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 25))])
Code example #55
titanic_train.Cabin = titanic_train.Cabin.map(lambda x: x[0])

# size of families (including the passenger)
titanic_train['FamilySize'] = titanic_train.Parch + titanic_train.SibSp + 1

cat_features = ['Sex', 'Embarked', 'Pclass', 'Cabin', 'Title']
cont_features = ['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize']

feature_defs = []
for col_name in cat_features:
    feature_defs.append((col_name, MyLabelBinarizer()))

for col_name in cont_features:
    feature_defs.append((col_name, None))

mapper = DataFrameMapper(feature_defs, input_df=True, df_out=True)
mapper.fit(titanic_train)
X_train = mapper.transform(titanic_train)
y_train = titanic_train['Survived']

kfold = model_selection.StratifiedKFold(n_splits=10)
random_state = 100

rf_classifier = ensemble.RandomForestClassifier(random_state=random_state)
rf_grid = {
    'max_depth': list(range(7, 14)),
    'n_estimators': list(range(10, 100, 10)),
    'min_samples_split': list(range(4, 11)),
    'min_samples_leaf': list(range(2, 5))
}
grid_rf_classifier = model_selection.GridSearchCV(rf_classifier,
Code example #56
#print(audit_df.head(5))

audit_X = audit_df[audit_df.columns.difference(["Adjusted"])]
audit_y = audit_df["Adjusted"]

scalar_mapper = DataFrameMapper([
    ("Education",
     [CategoricalDomain(),
      LabelBinarizer(),
      SelectKBest(chi2, k=3)]),
    ("Employment",
     [CategoricalDomain(),
      LabelBinarizer(),
      SelectKBest(chi2, k=3)]),
    ("Occupation",
     [CategoricalDomain(),
      LabelBinarizer(),
      SelectKBest(chi2, k=3)]),
    ("Age", [
        ContinuousDomain(),
        CutTransformer(bins=[17, 28, 37, 47, 83],
                       labels=["q1", "q2", "q3", "q4"]),
        LabelBinarizer()
    ]), ("Hours", ContinuousDomain()), ("Income", ContinuousDomain()),
    (["Hours", "Income"],
     Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income"))
])
interaction_mapper = DataFrameMapper([
    ("Gender", [CategoricalDomain(), LabelBinarizer()]),
    ("Marital", [CategoricalDomain(), LabelBinarizer()])
])
classifier = XGBClassifier()
Code example #57
major = pd.read_csv(r"C:\Users\钟顺民\Desktop\4.csv", sep=",", encoding='ISO-8859-1') \
    .dropna().groupby('id', as_index=False, group_keys=False) \
    .apply(typicalsamling, typicalNDict_Major)

# Assign features and target
X = major.drop(['id'], axis=1)
Y = major["id"]

pipeline = PMMLPipeline([
    ('mapper',
     DataFrameMapper([('name',
                       TfidfVectorizer(norm=None,
                                       analyzer="word",
                                       max_features=200,
                                       tokenizer=Splitter())),
                      ('description',
                       TfidfVectorizer(norm=None,
                                       analyzer="word",
                                       max_features=600,
                                       tokenizer=Splitter()))])),
    ('model',
     SVC(max_iter=10000)),  # train on TF-IDF vectors with an SVM classifier
])

pipeline.fit(X, Y)

c = pd.read_csv(r"C:\Users\钟顺民\Desktop\4.csv", sep=',',
                encoding='ISO-8859-1').dropna().sample(n=200)

prediction = pipeline.predict(c.drop(['id'], axis=1))
t = c['id']
Code example #58
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline

from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer

train_set = pd.read_csv("train.csv", index_col="PassengerId")
test_set = pd.read_csv("test.csv", index_col="PassengerId")

mapper = DataFrameMapper(
    [
        ("Age", None),
        ("Fare", None),
        #("Embarked", [CategoricalImputer(), LabelBinarizer()]), # at first it didn't well
        ("Sex", LabelEncoder()),
        ("Pclass", None),
        ("SibSp", None),
    ],
    df_out=False)

pipeline = Pipeline([
    ("mapper", mapper),
    ("imputer", Imputer()),
    ("scaler", StandardScaler()),
    #("classifier", SGDClassifier(random_state = 42, n_jobs = 4)), # Stochastic Gradient Descent
    ("classifier", RandomForestClassifier(random_state=42, n_jobs=4))
])

train_set_labels = train_set["Survived"]
pipeline.fit(train_set, train_set_labels)
Code example #59
from sklearn2pmml.pipeline import PMMLPipeline

import pandas

df = pandas.read_csv("audit.csv")

cat_columns = ["Education", "Employment", "Marital", "Occupation"]
cont_columns = ["Age", "Hours", "Income"]

df_X = df[cat_columns + cont_columns]
df_y = df["Adjusted"]

mapper = DataFrameMapper(
    [(cat_column,
      [CategoricalDomain(invalid_value_treatment="as_is"),
       LabelBinarizer()]) for cat_column in cat_columns] +
    [([cont_column],
      [ContinuousDomain(invalid_value_treatment="as_is"),
       StandardScaler()]) for cont_column in cont_columns])

selector = SelectKBest()

classifier = LogisticRegression(multi_class="ovr",
                                penalty="elasticnet",
                                solver="saga",
                                max_iter=1000)

pipeline = PMMLPipeline([("mapper", mapper), ("selector", selector),
                         ("classifier", classifier)])

param_grid = {
Code example #60
# get features and thermal sensation
y = data_new['sensation']
x = data_new[['temperature', 'humidity', 'skin', 'clothing']]

# In[6]:

import sklearn.preprocessing, sklearn.decomposition, sklearn.linear_model, sklearn.pipeline, sklearn.metrics
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import datasets, svm
from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn_pandas import DataFrameMapper

# In[7]:

mapper = DataFrameMapper([(['temperature'], None), (['humidity'], None),
                          (['skin'], None), (['clothing'], None)])
mapper.fit_transform(x.copy())
# count the number of samples with thermal sensation == 3
mask = (y == 3)
len(y[mask])

# In[21]:

#clf = svm.SVC(kernel='linear')
C = 1
#clf = svm.SVC(kernel='poly',degree=3,C=C)
clf = svm.SVC(kernel='rbf', gamma=0.7, C=C)
pipe = sklearn.pipeline.Pipeline([('featurize', mapper), ('svc', clf)])
#np.round(cross_val_score(pipe, X=data_new.copy(), y=data_new.comfort, scoring='r2'), 2)
cross_val_score(pipe, X=x.copy(), y=y, scoring='r2', cv=5)