class ROCKET:
    def __init__(self, num_kernels=100):
        self.num_kernels = num_kernels

    def train(self, X_train, Y_train):
        # Warm-up call to precompile the numba-jitted kernel functions.
        _ = generate_kernels(100, 10)
        apply_kernels(np.zeros_like(X_train)[:, 1:], _)

        input_length = X_train.shape[1]
        self.kernels = generate_kernels(input_length, self.num_kernels)
        X_transform = apply_kernels(X_train, self.kernels)
        # normalize=True requires scikit-learn < 1.2.
        self.classifier = RidgeClassifierCV(alphas=10**np.linspace(-3, 3, 10),
                                            normalize=True)
        return self.classifier.fit(X_transform, Y_train)

    def test(self, X_test, Y_test):
        X_transform = apply_kernels(X_test, self.kernels)
        return self.classifier.score(X_transform, Y_test)
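A minimal usage sketch for the ROCKET wrapper above. It assumes generate_kernels and apply_kernels come from the authors' reference implementation (e.g. a rocket_functions module, an assumption) and a scikit-learn version that still supports normalize (< 1.2); the random arrays stand in for real univariate time series, for illustration only.

import numpy as np
# from rocket_functions import generate_kernels, apply_kernels  # assumed import

X_train = np.random.randn(50, 150)     # 50 series of length 150
Y_train = np.random.randint(0, 2, 50)
X_test = np.random.randn(20, 150)
Y_test = np.random.randint(0, 2, 20)

model = ROCKET(num_kernels=100)
model.train(X_train, Y_train)
print(model.test(X_test, Y_test))      # mean accuracy on the test set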
def classify(train, test):
    X, y = train.iloc[:, 0:-1], train.iloc[:, -1]
    clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True).fit(X, y)
    preds = clf.predict(test.iloc[:, 0:-1])
    # accuracy_score expects (y_true, y_pred)
    return accuracy_score(test.iloc[:, -1], preds)
def linear_models(x_train, y_train):
    from sklearn.linear_model import (LogisticRegression,
                                      PassiveAggressiveClassifier,
                                      RidgeClassifierCV, SGDClassifier,
                                      Perceptron)

    classifier1 = LogisticRegression(C=1.2, random_state=0, max_iter=1500)
    classifier1.fit(x_train, y_train)
    classifier2 = PassiveAggressiveClassifier()
    classifier2.fit(x_train, y_train)
    classifier3 = RidgeClassifierCV()
    classifier3.fit(x_train, y_train)
    classifier4 = SGDClassifier()
    classifier4.fit(x_train, y_train)
    classifier5 = Perceptron()
    classifier5.fit(x_train, y_train)

    print('LogisticRegression training accuracy: ', classifier1.score(x_train, y_train))
    print('PassiveAggressiveClassifier training accuracy: ', classifier2.score(x_train, y_train))
    print('RidgeClassifierCV training accuracy: ', classifier3.score(x_train, y_train))
    print('SGDClassifier training accuracy: ', classifier4.score(x_train, y_train))
    print('Perceptron training accuracy: ', classifier5.score(x_train, y_train))

    return classifier1, classifier2, classifier3, classifier4, classifier5
def run(train_file, test_file, num_seq, n_jobs):
    print("Load train data")
    y, s = load_data(train_file)

    print("Generate random features")
    ss = generate_features(s, num_seq)

    print("Generate automaton")
    A = ahocorasick.Automaton()
    for idx, f in enumerate(ss):
        A.add_word(f, (idx, f))
    A.make_automaton()

    print("Extract feature vectors of train data")
    fvec = create_fvec(s, A, n_jobs)

    print("Learn classifier")
    cls = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
    cls.fit(fvec, y)

    print("Load test data")
    y_test, s_test = load_data(test_file)

    print("Extract feature vectors of test data")
    fvec_test = create_fvec(s_test, A, n_jobs)

    print("Predict")
    print(cls.score(fvec_test, y_test))
def main():
    train_input = pd.read_csv('../input/train.csv')
    test_input = pd.read_csv('../input/test.csv')
    # We don't have data on whether a person is delinquent
    data = pd.concat([train_input, test_input])

    # Create our own features
    featurizer = CreditScoreFeaturizer()

    print("Transforming dataset into features...")
    # Create matrix of features from raw dataset
    X = featurizer.fit_transform(data)
    X_train = X[:len(train_input)]
    X_test = X[len(train_input):]

    # Use any model that we might find appropriate
    model = RidgeClassifierCV(alphas=[0.1, 1., 10.])
    # model = LogisticRegression(C=10)  # Can also switch to a different model

    # Set target variable y
    y = train_input.SeriousDlqin2yrs

    print("Cross validating...")
    print(np.mean(cross_val_score(model, X_train, y, scoring='roc_auc')))  # scoring metric is AUC

    print("Training final model...")
    model = model.fit(X_train, y)
class TestServinator(unittest.TestCase):

    def setUp(self):
        data = load_iris()
        x_train, x_test, y_train, y_test = train_test_split(
            data.data, data.target, test_size=0.2, random_state=67,
            stratify=data.target)
        # Note: 'logloss' is not a valid scikit-learn scorer name (the
        # built-in is 'neg_log_loss', which RidgeClassifierCV cannot use
        # anyway, since it has no predict_proba).
        self.model = RidgeClassifierCV(normalize=True, scoring='logloss',
                                       cv=3, class_weight='balanced')
        self.model.fit(x_train, y_train)
        self.test_data = x_test
        self.test_target = y_test
        model_backend = ModelBackend(self.model)
        # self.app = servinator('test', model_backend).test_client()

    def test_json_to_model_input(self):
        raw_json = '''[{"end_date": "2014-10-08T14:52:44-04:00", "location": "Louisville, KY", "pledged": "70.0", "goal": "10000.0", "category": "Food", "author": "Joe Banet", "backers": "4", "blurb": "Krazy Joe's soon to be famous kimchi and bourbon barrels which have long ago been enjoyed come together at last. Bourbon aged kimchi!", "title": "Krazy Joe's Bourbon Barrel Kimchi", "full_text": "I like kimchi. I like to make kimchi. I think I'm pretty good at it. My goal is to create a Bourbon barrel aged kimchi and share it with the world. This is just a start to company that could greatly expand and diversify into many products that all have one common denominator...kimchi. Thank you for your interest and support! ; "}]'''
        expected_data = {
            'author': {0: 'joe banet'},
            'backers': {0: '4'},
            'blurb': {0: 'krazy joes soon to be famous kimchi and bourbon barrels which have long ago been enjoyed come together at last bourbon aged kimchi'},
            'day': {0: 8},
            'dayofweek': {0: 2},
            'dayofyear': {0: 281},
            'full_text': {0: 'i like kimchi i like to make kimchi i think im pretty good at it my goal is to create a bourbon barrel aged kimchi and share it with the world this is just a start to company that could greatly expand and diversify into many products that all have one common denominator kimchi thank you for your interest and support'},
            'goal': {0: '10000.0'},
            'hour': {0: 18},
            'loc1': {0: ''},
            'loc2': {0: 'louisville'},
            'loc3': {0: 'ky'},
            'minute': {0: 52},
            'month': {0: 10},
            'pledged': {0: '70.0'},
            'title': {0: 'krazy joes bourbon barrel kimchi'},
            'weekday': {0: 2},
            'weekofyear': {0: 41},
            'year': {0: 2014}}
        df_json = _json_to_model_input(raw_json)
        df_expected = pd.DataFrame(expected_data)
        # Sort columns for comparison
        df_json = df_json.loc[:, sorted(df_json.columns.values)]
        df_expected = df_expected.loc[:, sorted(df_expected.columns.values)]
        self.assertTrue(df_expected.equals(df_json))

    def test_e2e(self):
        '''Test full load and predict of training data and assert it matches the offline result.'''
        pass
def _fit_estimator(self, rocket, X, y):
    transformed_x = rocket.fit_transform(X)
    ridge = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
    ridge.fit(transformed_x, y)
    return [
        make_pipeline(rocket, ridge),
        transformed_x if self.save_transformed_data else None,
    ]
def test_ridge_classifier_with_scoring(filter_, scoring, cv):
    # Non-regression test for #14672: check that RidgeClassifierCV works
    # with all sorts of scoring and cross-validation.
    scoring_ = make_scorer(scoring) if callable(scoring) else scoring
    clf = RidgeClassifierCV(scoring=scoring_, cv=cv)
    # Smoke test to check that fit/predict does not raise an error
    clf.fit(filter_(X_iris), y_iris).predict(filter_(X_iris))
def do_rcv(X_test, X_train, Y_train):
    # RidgeClassifierCV: least-squares loss with an l2 penalty, choosing
    # alpha by cross-validation (not hinge loss, which is SGD/SVM).
    clf = RidgeClassifierCV()
    print("starts fitting")
    print(clf.fit(X_train, Y_train))
    print("finished fitting, starts predictions")
    Y_pred = clf.predict(X_test)
    print("finished predictions")
    return Y_pred
def ridge_classification(X_train, X_test, y_train, y_test):
    X_train, X_test = preprocess(X_train, X_test)

    from sklearn.linear_model import RidgeClassifierCV
    classifier = RidgeClassifierCV()
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    y_pred = np.round(y_pred).flatten()

    plot_model(classifier, X_train, y_train, y_test, y_pred, "RidgeClassifierCV")
def __init__(self, alphas=(0.1, 1.0, 10.0), fit_intercept=True, normalize=False,
             scoring=None, cv=None, class_weight=None):
    _RidgeClassifierCV.__init__(self, alphas, fit_intercept, normalize,
                                scoring, cv, class_weight)
    BaseWrapperClf.__init__(self)
def _fit_estimator(self, rocket, X, y):
    transformed_x = rocket.fit_transform(X)
    scaler = StandardScaler(with_mean=False)
    scaler.fit(transformed_x, y)
    ridge = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
    ridge.fit(scaler.transform(transformed_x), y)
    return [
        make_pipeline(rocket, scaler, ridge),
        transformed_x if self.save_transformed_data else None,
    ]
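The scaler-plus-ridge pattern above is the usual replacement for RidgeClassifierCV(..., normalize=True), which the earlier _fit_estimator variant uses: the normalize parameter was deprecated in scikit-learn 1.0 and removed in 1.2. A minimal, self-contained sketch of the same pipeline on synthetic data (the data is illustrative only):

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifierCV

rng = np.random.default_rng(0)
X = rng.normal(size=(60, 20))            # synthetic feature matrix
y = rng.integers(0, 2, size=60)          # synthetic binary labels

pipe = make_pipeline(StandardScaler(with_mean=False),
                     RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)))
pipe.fit(X, y)
print(pipe.score(X, y))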
def ridge_classfier_cv_selected_feature():
    raw_frame = thal_data()
    x = raw_frame.drop(['sugar', 'age', 'cardiographic', 'angina', 'slope',
                        'thal', 'log_cholestoral'], axis=1)
    y = raw_frame['thal']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                        random_state=5)
    clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(x_train, y_train)
    global train_score
    train_score.append(clf.score(x_train, y_train))
    global test_score
    test_score.append(clf.score(x_test, y_test))
def ridge_classfier_cv_withlog():
    raw_frame = thal_data()
    x = raw_frame.drop(['thal', 'pressure', 'cholestoral', 'age',
                        'heart_rate'], axis=1)
    y = raw_frame['thal']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                        random_state=5)
    clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(x_train, y_train)
    global train_score
    train_score.append(clf.score(x_train, y_train))
    global test_score
    test_score.append(clf.score(x_test, y_test))
def ridge(X, X_train, X_val, y_train, y_val, X_test, y_test):
    model = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1])
    model.fit(X_train, y_train)
    sh = X.shape
    save_model(model, sh)
    tr_f1, val_f1, test_f1, tr_acc, val_acc, test_acc = model_performance(
        model, X_train, y_train, X_val, y_val, X_test, y_test)
    return tr_f1, val_f1, test_f1
def agent(path="./", dataset="", ratio=False, seg=0.75, folder="temp"): current_process().name = dataset start1 = time.time() train_x, train_y = load_from_tsfile_to_dataframe( f"{path}/{dataset}/{dataset}_TRAIN.ts") test_x, test_y = load_from_tsfile_to_dataframe( f"{path}/{dataset}/{dataset}_TEST.ts") print(f"{dataset}: Train Shape {train_x.shape}") print(f"{dataset}: Test Shape {test_x.shape}") scaler = StandardScaler() transform_time1 = time.time() mod_train = PAAStat(paa_=ratio, seg_=seg).transform(train_x.values) mod_train = scaler.fit(mod_train).transform(mod_train) mod_test = PAAStat(paa_=ratio, seg_=seg).transform(test_x.values) mod_test = scaler.transform(mod_test) transform_time2 = time.time() model = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True) train_time1 = time.time() model.fit(mod_train, train_y) preds = model.predict(mod_test) train_time2 = time.time() acc1 = accuracy_score(preds, test_y) * 100 end1 = time.time() print( f"Dataset: {dataset}, AccuracyRidge: {acc1}, Time taken: {(end1 - start1)/60}, " f"Transfrom_time: {(transform_time2-transform_time1)/60}, train_time: {(train_time2-train_time1)/60}" ) results = pd.DataFrame({ 'Dataset': dataset, 'AccuracyRidge': [acc1], 'Time (min)': [(end1 - start1) / 60], 'Transfrom_time (min)': [(transform_time2 - transform_time1) / 60], 'train_time (min)': [(train_time2 - train_time1) / 60] }) temp_path = './' + folder if not os.path.exists(temp_path): os.mkdir(temp_path) results.to_csv(os.path.join(temp_path + f'/{dataset}.csv'), index=False)
def ridge(y, pred, i, wt=1):
    # ravel() returns a new array, so the result must be assigned.
    b = y[:, i].ravel()
    c = (pred[:, i] > 0.4).ravel()
    print('score before (class {})'.format(i), f1_score(b, c))

    clf = RidgeClassifierCV(alphas=[0.001, 0.01, 0.1, 1],
                            class_weight={0: 1, 1: wt},
                            normalize=True).fit(pred, b)
    # clf = RidgeClassifierCV(alphas=[0.001, 0.01, 0.1, 1],
    #                         class_weight='balanced', cv=10).fit(pred, b)
    ri_pred = clf.predict(pred)
    print('score after (class {})'.format(i), f1_score(b, ri_pred))
    # print('score after (class {})'.format(i), clf.score(pred, b))

    f1_before.append(f1_score(b, c))
    f1_after.append(f1_score(b, ri_pred))
    return clf
def train_model(X, Y):
    print("Training LR...")
    modelLR = LogisticRegression(penalty='l1', C=100, tol=1e-10)
    modelLR.fit(X.toarray(), Y)

    print("Training RC...")
    modelRC = RidgeClassifierCV(alphas=[0.1, 1., 10.])
    modelRC.fit(X.toarray(), Y)

    print("Training GBC...")
    modelGBC = GradientBoostingClassifier(subsample=0.5, max_depth=6,
                                          n_estimators=50)
    modelGBC.fit(X.toarray(), Y)

    return modelGBC, modelRC, modelLR
def test_ridge_regression_custom_scoring(filter_, cv):
    # Check that custom scoring works as expected, and check the
    # tie-breaking strategy (keep the first alpha tried).
    def _dummy_score(y_test, y_pred):
        return 0.42

    alphas = np.logspace(-2, 2, num=5)
    clf = RidgeClassifierCV(alphas=alphas, scoring=make_scorer(_dummy_score),
                            cv=cv)
    clf.fit(filter_(X_iris), y_iris)
    assert clf.best_score_ == pytest.approx(0.42)
    # In case of a tie score, the first alpha is kept
    assert clf.alpha_ == pytest.approx(alphas[0])
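A standalone sketch of the behavior this test exercises, runnable with just scikit-learn and the iris data: a constant custom scorer makes every alpha tie, so RidgeClassifierCV keeps the first one.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import RidgeClassifierCV
from sklearn.metrics import make_scorer

X, y = load_iris(return_X_y=True)
alphas = np.logspace(-2, 2, num=5)
clf = RidgeClassifierCV(alphas=alphas,
                        scoring=make_scorer(lambda y_true, y_pred: 0.42),
                        cv=5)
clf.fit(X, y)
print(clf.best_score_)   # 0.42
print(clf.alpha_)        # alphas[0], the first of the tied candidates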
def __init__(self, num_features=10000, max_dilations_per_kernel=32,
             random_state=None, alphas=np.logspace(-3, 3, 13), normalize=True,
             memory=None, verbose=False, scoring=None, class_weight=None,
             **kwargs):
    """
    MiniRocketClassifier is recommended for up to 10k time series.
    For a larger dataset, you can use MINIROCKET (in PyTorch).
    scoring = None --> defaults to accuracy.
    """
    self.steps = [
        ('minirocketmultivariate',
         MiniRocketMultivariate(num_features=num_features,
                                max_dilations_per_kernel=max_dilations_per_kernel,
                                random_state=random_state)),
        ('ridgeclassifiercv',
         RidgeClassifierCV(alphas=alphas, normalize=normalize, scoring=scoring,
                           class_weight=class_weight, **kwargs))]
    self.num_features, self.max_dilations_per_kernel, self.random_state = \
        num_features, max_dilations_per_kernel, random_state
    self.alphas, self.normalize, self.scoring, self.class_weight, self.kwargs = \
        alphas, normalize, scoring, class_weight, kwargs
    self.memory = memory
    self.verbose = verbose
    self._validate_steps()
def agent(path, dataset, seg, folder, paa=True):
    start = time.time()
    train_x, train_y = load_from_tsfile_to_dataframe(
        f"{path}/{dataset}/{dataset}_TRAIN.ts")
    test_x, test_y = load_from_tsfile_to_dataframe(
        f"{path}/{dataset}/{dataset}_TEST.ts")
    print(f"{dataset}: Train Shape {train_x.shape}")
    print(f"{dataset}: Test Shape {test_x.shape}")

    model = Pipeline([('data_transform', PAAStat(paa_=paa, seg_=seg)),
                      ('model', RidgeClassifierCV(alphas=np.logspace(-3, 3, 10),
                                                  normalize=True,
                                                  class_weight='balanced'))])
    model.fit(train_x.values, train_y)
    preds = model.predict(test_x.values)
    # accuracy_score expects (y_true, y_pred)
    acc1 = accuracy_score(test_y, preds) * 100
    end = time.time()

    results = pd.DataFrame({
        'Dataset': dataset,
        'AccuracyRidge': [acc1],
        'Time': [end - start]
    })
    print(results)
    temp_path = './' + folder
    if not os.path.exists(temp_path):
        os.mkdir(temp_path)
    results.to_csv(os.path.join(temp_path, f'{dataset}.csv'), index=False)
def fit(self, X, y): """Build a pipeline containing the ROCKET transformer and RidgeClassifierCV. Parameters ---------- X : nested pandas DataFrame of shape [n_instances, 1] Nested dataframe with univariate time-series in cells. y : array-like, shape = [n_instances] The class labels. Returns ------- self : object """ X, y = check_X_y(X, y) self.n_classes = np.unique(y).shape[0] self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0] for index, classVal in enumerate(self.classes_): self.class_dictionary[classVal] = index self.classifier = rocket_pipeline = make_pipeline( Rocket( num_kernels=self.num_kernels, random_state=self.random_state, n_jobs=self.n_jobs, ), RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True), ) rocket_pipeline.fit(X, y) self._is_fitted = True return self
def _fit(self, X, y):
    """Build a pipeline containing the ROCKET transformer and RidgeClassifierCV.

    Parameters
    ----------
    X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
        The training data.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : Reference to self.

    Notes
    -----
    Changes state by creating a fitted model that updates attributes
    ending in "_" and sets the is_fitted flag to True.
    """
    self._pipeline = rocket_pipeline = make_pipeline(
        Rocket(
            num_kernels=self.num_kernels,
            random_state=self.random_state,
            n_jobs=self._threads_to_use,
        ),
        RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True),
    )
    rocket_pipeline.fit(X, y)
    return self
class RocketClassifier(sklearn.pipeline.Pipeline):
    def __init__(self, num_kernels=10_000, normalize_input=True,
                 random_state=None, alphas=np.logspace(-3, 3, 7),
                 normalize_features=True, memory=None, verbose=False,
                 scoring=None, class_weight=None, **kwargs):
        """
        RocketClassifier is recommended for up to 10k time series.
        For a larger dataset, you can use ROCKET (in PyTorch).
        scoring = None --> defaults to accuracy.

        Rocket args:
            num_kernels     : int, number of random convolutional kernels
                              (default 10,000)
            normalize_input : boolean, whether or not to normalise the input
                              time series per instance (default True)
            random_state    : int (ignored unless int, for compatibility with
                              Numba), random seed (optional, default None)
        """
        self.steps = [('rocket', Rocket(num_kernels=num_kernels,
                                        normalise=normalize_input,
                                        random_state=random_state)),
                      ('ridgeclassifiercv',
                       RidgeClassifierCV(alphas=alphas,
                                         normalize=normalize_features,
                                         scoring=scoring,
                                         class_weight=class_weight,
                                         **kwargs))]
        store_attr()
        self._validate_steps()
def init_classifiers(seed):
    return {
        'AdaBoostClassifier': AdaBoostClassifier(random_state=seed),
        'BaggingClassifier': BaggingClassifier(random_state=seed),
        'ExtraTreesClassifier': ExtraTreesClassifier(random_state=seed),
        'GradientBoostingClassifier': GradientBoostingClassifier(random_state=seed),
        'RandomForestClassifier': RandomForestClassifier(random_state=seed),
        'XGBClassifier': xgb.XGBClassifier(),
        'LogisticRegression': LogisticRegression(random_state=seed),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(random_state=seed),
        'RidgeClassifier': RidgeClassifier(random_state=seed),
        'RidgeClassifierCV': RidgeClassifierCV(),
        'SGDClassifier': SGDClassifier(random_state=seed),
        # 'KNeighborsClassifier': KNeighborsClassifier(),
        # 'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
        'MLPClassifier': MLPClassifier(random_state=seed),
        'DecisionTreeClassifier': DecisionTreeClassifier(random_state=seed),
        'ExtraTreeClassifier': ExtraTreeClassifier(random_state=seed)
    }
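An illustrative loop over the dictionary above, assuming the classifier imports that init_classifiers relies on are already in scope; iris is used only to show the pattern.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
for name, clf in init_classifiers(seed=0).items():
    clf.fit(X_tr, y_tr)
    print(name, clf.score(X_te, y_te))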
def get_logistic_regression_coefs_l2(self, category, clf=RidgeClassifierCV()):
    '''Computes a cross-validated, l2-penalized linear classification score
    (RidgeClassifierCV by default).

    Parameters
    ----------
    category : str
        Category name to score.
    clf : sklearn classifier, optional
        Defaults to RidgeClassifierCV().

    Returns
    -------
    (coefficient array, accuracy, majority class baseline accuracy)
    '''
    try:
        from sklearn.cross_validation import cross_val_predict
    except ImportError:
        from sklearn.model_selection import cross_val_predict

    y = self._get_mask_from_category(category)
    X = TfidfTransformer().fit_transform(self._X)
    clf.fit(X, y)
    y_hat = cross_val_predict(clf, X, y)
    acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
    return clf.coef_[0], acc, baseline
def classify_connectivity(X, y, sss, classifier_name, n_jobs=-1, subjects=None):
    """Returns 100 shuffle-split scores."""
    if classifier_name == 'logreg_l1':
        classifier = LogisticRegression(penalty='l1', dual=False,
                                        random_state=42)
    elif classifier_name == 'logreg_l2':
        classifier = LogisticRegression(penalty='l2', random_state=42)
    elif classifier_name == 'ridge':
        classifier = RidgeClassifierCV(alphas=np.logspace(-3, 3, 7))
    elif classifier_name == 'svc_l2':
        classifier = LinearSVC(penalty='l2', random_state=42)
    elif classifier_name == 'svc_l1':
        classifier = LinearSVC(penalty='l1', dual=False, random_state=42)

    p = Parallel(n_jobs=n_jobs, verbose=5)(
        delayed(train_and_test)(classifier, X, y, train, test, subjects)
        for train, test in sss)
    return np.asarray(p)
class RocketClassifier(sklearn.pipeline.Pipeline):
    """Time series classification using ROCKET features and a linear classifier."""

    def __init__(self, num_kernels=10_000, normalize_input=True,
                 random_state=None, alphas=np.logspace(-3, 3, 7),
                 normalize_features=True, memory=None, verbose=False,
                 scoring=None, class_weight=None, **kwargs):
        """
        RocketClassifier is recommended for up to 10k time series.
        For a larger dataset, you can use ROCKET (in PyTorch).
        scoring = None --> defaults to accuracy.

        Rocket args:
            num_kernels     : int, number of random convolutional kernels
                              (default 10,000)
            normalize_input : boolean, whether or not to normalise the input
                              time series per instance (default True)
            random_state    : optional random seed (default None)
        """
        try:
            from sktime.transformations.panel.rocket import Rocket
        except ImportError:
            # Raising (rather than printing) avoids a NameError on Rocket below.
            raise ImportError(
                "You need to install sktime to be able to use RocketClassifier")
        self.steps = [('rocket', Rocket(num_kernels=num_kernels,
                                        normalise=normalize_input,
                                        random_state=random_state)),
                      ('ridgeclassifiercv',
                       RidgeClassifierCV(alphas=alphas,
                                         normalize=normalize_features,
                                         scoring=scoring,
                                         class_weight=class_weight,
                                         **kwargs))]
        store_attr()
        self._validate_steps()
def _train_probas_for_estimator(self, y, idx):
    rs = 255 if self.random_state == 0 else self.random_state
    rs = (None if self.random_state is None else
          (rs * 37 * (idx + 1)) % np.iinfo(np.int32).max)
    rng = check_random_state(rs)

    indices = range(self.n_instances_)
    # Bootstrap sample; the out-of-bag instances get the probability estimates.
    subsample = rng.choice(self.n_instances_, size=self.n_instances_)
    oob = [n for n in indices if n not in subsample]

    results = np.zeros((self.n_instances_, self.n_classes_))
    if len(oob) == 0:
        return results, 1, oob

    clf = make_pipeline(
        StandardScaler(with_mean=False),
        RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)),
    )
    clf.fit(self.transformed_data_[idx].iloc[subsample], y[subsample])
    preds = clf.predict(self.transformed_data_[idx].iloc[oob])

    # Weight the out-of-bag predictions by the ridge's cross-validated score.
    weight = clf.steps[1][1].best_score_
    for n, pred in enumerate(preds):
        results[oob[n]][self._class_dictionary[pred]] += weight

    return results, weight, oob
def generate_submission(models):
    X = pd.concat([
        pd.read_csv(inp)[full_labels] for inp in
        ["../models/{}/train_meta_probs.csv".format(model) for model in models]
    ], axis=1)
    X_test = pd.concat([
        pd.read_csv(inp)[full_labels] for inp in
        ["../models/{}/test_meta_probs.csv".format(model) for model in models]
    ], axis=1)
    col_names = [
        "{}_{}".format(i, j)
        for i in ["model_{}".format(k) for k in range(len(models))]
        for j in full_labels
    ]
    X.columns, X_test.columns = col_names, col_names

    folds = get_folds()
    print("===Ridge===")
    ridge_cv = RidgeClassifierCV(
        alphas=[0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 15, 20, 30, 40, 50,
                70, 100],
        cv=folds).fit(X, y)
    print("best alpha value is: {}".format(ridge_cv.alpha_))

    ridge_model = RidgeClassifier(alpha=ridge_cv.alpha_).fit(X, y)
    print(accuracy_score(y, ridge_model.predict(X)))

    test_df['label'] = pd.Series(
        ridge_model.predict(X_test)).map(full_num_label_mapping)
    test_df['label'] = test_df['label'].map(
        lambda x: "unknown" if x not in labels else x)
    test_df.to_csv("ridge_on_{}_models.csv".format(len(models)), index=False)
def get_classifier(clf_name):
    # Note: normalize=True in RidgeClassifierCV requires scikit-learn < 1.2.
    classifiers = {
        'svm': SVC(gamma='auto'),
        'advanced_svm': SVC(gamma='auto', kernel='rbf', probability=True,
                            class_weight='balanced', C=120000000),
        'ridge': RidgeClassifierCV(alphas=np.logspace(-3, 3, 10),
                                   normalize=True),
        'random_forest': RandomForestClassifier(n_estimators=200, max_depth=7,
                                                random_state=0),
        'gradient_boosting': GradientBoostingClassifier(n_estimators=200,
                                                        max_depth=7,
                                                        random_state=0),
        'xgb': XGBClassifier(n_estimators=500, max_depth=5),
        'ada_boost': AdaBoostClassifier(),
        'lgbm': LGBMClassifier(n_estimators=1000),
    }
    return classifiers.get(clf_name)
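Hypothetical usage of get_classifier on synthetic data; as noted above, the 'ridge' entry needs a scikit-learn version that still supports normalize (< 1.2).

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, random_state=0)
clf = get_classifier('ridge')
clf.fit(X, y)
print(clf.score(X, y))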
def main():
    start_time = time.time()
    train_feats, train_labels, test_feats = get_data(TRAIN_FILE, TEST_FILE)

    # Try RidgeClassifier with manual cross-validation (k-fold).
    # (sklearn.cross_validation was removed; use sklearn.model_selection.)
    lr = RidgeClassifier().fit(train_feats, train_labels)
    cv_preds = model_selection.cross_val_predict(lr, train_feats,
                                                 train_labels, cv=10)
    print("cross validation accuracy:",
          metrics.accuracy_score(train_labels, cv_preds))

    # Try automatic RidgeClassifierCV (k-fold).
    lrcv = RidgeClassifierCV(cv=10).fit(train_feats, train_labels)
    print("built in ridge cv accuracy:", lrcv.score(train_feats, train_labels))

    # Use the cross-validated model to predict test labels.
    preds = lrcv.predict(test_feats).astype(str)
    for i in range(preds.shape[0]):
        preds[i] = 'TRUE' if preds[i] == '1' else 'FALSE'
    np.savetxt("attractiveness_predictions.csv", preds, fmt="%s", newline="\n")
    print("time taken:", time.time() - start_time, "seconds")
def main():
    train_input = pd.read_csv('train.csv')
    test_input = pd.read_csv('test.csv')
    data = pd.concat([train_input, test_input])

    featurizer = CreditScoreFeaturizer()

    print("Transforming dataset into features...")
    # Create matrix of features from raw dataset
    X = featurizer.fit_transform(data)
    X_train = X[:len(train_input)]
    X_test = X[len(train_input):]

    # Use any model that we might find appropriate
    model = RidgeClassifierCV(alphas=[0.1, 1., 10.])
    # model = LogisticRegression(C=10)

    # Set target variable y
    y = train_input.SeriousDlqin2yrs

    print("Cross validating...")
    print(np.mean(cross_val_score(model, X_train, y, scoring='roc_auc', cv=10)))

    print("Training final model...")
    model = model.fit(X_train, y)

    n_models = 5
    bag_size = 0.70
    models = [LogisticRegression(C=10) for _ in range(n_models)]
    model = Bagging(models, bag_size)

    # Fit final model
    model.fit(X_train, y)

    print("Create predictions on submission set...")
    create_submission(model, X_test, test_input)
class GClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self):
        pass

    def fit(self, X, y):
        trainx1, trainx2 = zip(*X)
        self.count_vect = CountVectorizer(analyzer='word', ngram_range=(1, 2))
        self.count_vect.fit(list(trainx1) + list(trainx2))
        X_train_counts1 = self.count_vect.transform(trainx1)
        X_train_counts2 = self.count_vect.transform(trainx2)
        X_train_counts = np.concatenate((X_train_counts1.toarray(),
                                         X_train_counts2.toarray()), axis=1)
        self.clf = RidgeClassifierCV().fit(X_train_counts, y)
        return self

    def predict(self, X):
        testx1, testx2 = zip(*X)
        X_test_counts1 = self.count_vect.transform(testx1)
        X_test_counts2 = self.count_vect.transform(testx2)
        X_test_counts = np.concatenate((X_test_counts1.toarray(),
                                        X_test_counts2.toarray()), axis=1)
        return self.clf.predict(X_test_counts)
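A toy illustration of GClassifier; the sentence pairs are invented purely to show the (text, text) input format the estimator expects.

X = [("cats purr", "dogs bark"),
     ("the sky is blue", "grass is green"),
     ("cats purr loudly", "dogs bark loudly"),
     ("sky above", "grass below")]
y = [1, 0, 1, 0]

clf = GClassifier().fit(X, y)
print(clf.predict([("cats purr", "dogs bark")]))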
def sklearn_ridge_cv(Xtrain, Ytrain, Xtest, Ytest, *args, **kwargs):
    clf = RidgeClassifierCV(fit_intercept=True)
    clf.fit(Xtrain, Ytrain)
    return clf.score(Xtest, Ytest)
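A quick illustrative check of the helper above on iris.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
Xtr, Xte, ytr, yte = train_test_split(X, y, random_state=0)
print(sklearn_ridge_cv(Xtr, ytr, Xte, yte))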
# Nicely print the confusion matrix
print(" " * 4, end="")
for label in labels:
    print(" %s" % label, end="")
print()
for i, label1 in enumerate(labels):
    print(label1, end="")
    for j, label2 in enumerate(labels):
        print("%4d" % cm[i, j], end="")
    print()

from sklearn.linear_model import RidgeClassifierCV

clf = RidgeClassifierCV().fit(X_train, y_train)
print("Accuracy = ", clf.score(X_test, y_test))
print_cm(confusion_matrix(y_test, clf.predict(X_test), labels=populations),
         populations)

# Plot coefficients
coef = np.mean(np.abs(clf.coef_), axis=0)
f, ax = plt.subplots(figsize=(10, 4))
plt.bar(range(coef.size), coef)  # matplotlib >= 3 removed the left= keyword
# ppl.bar(ax, left=range(coef.size), height=coef, xticklabels=None,
#         annotate=False)
plt.savefig("ridge.png")
plt.tight_layout()
plt.savefig(fname, format="png", bbox_extra_artists=(lgd,),
            bbox_inches="tight")
plt.close()


if __name__ == "__main__":
    # Generate some fake data, split, and scale
    X, y = make_classification(n_samples=1000, n_informative=5, n_redundant=6,
                               random_state=4)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=4)
    scaler = StandardScaler().fit(X_train)
    X_train_standard = scaler.transform(X_train)
    X_test_standard = scaler.transform(X_test)

    # Specify classifiers
    ridge = RidgeClassifierCV(alphas=np.logspace(-3, 1, 20))
    lasso = LogisticRegressionCV(Cs=np.logspace(-3, 1, num=20))
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # Train the classifiers
    ridge.fit(X_train_standard, y_train)
    lasso.fit(X_train_standard, y_train)
    forest.fit(X_train, y_train)

    # Predicted values
    ridge_preds = ridge.predict(X_test_standard)
    lasso_preds = lasso.predict(X_test_standard)
    forest_preds = forest.predict(X_test)

    # Confusion matrices
    c1 = confusion_matrix(y_test, ridge_preds)
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import GradientBoostingClassifier

gdc = GradientBoostingClassifier()
lr = LogisticRegression()
clf = svm.SVR()
et = ExtraTreesClassifier()
rgr = RadiusNeighborsRegressor()
forest = RandomForestRegressor(n_estimators=100, n_jobs=2, oob_score=True)
adaboost = AdaBoostRegressor()
nb = GaussianNB()
rd = RidgeClassifierCV()

# KFold's old (cross_validation) API took the row count and n_folds; the
# model_selection API takes n_splits and yields indices via split().
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(report):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = variables.iloc[train_index]        # .ix was removed; use .iloc
    X_test = variables.iloc[test_index]
    y_train = report['survey_participant'].iloc[train_index]
    y_test = report['survey_participant'].iloc[test_index]
    forest.fit(X_train, y_train)
    adaboost.fit(X_train, y_train)
    gdc.fit(X_train, y_train)
    rd.fit(X_train, y_train)
    rgr.fit(X_train, y_train)
    nb.fit(X_train, y_train)
    lr.fit(X_train, y_train)
    et.fit(X_train, y_train)