Ejemplo n.º 1
0
                        help='L1 or L2 penalization.')
    parser.add_argument('C',
                        help='Regularization parameter.')
    parser.add_argument('fit_intercept',
                        help='Whether to include a constant bias term in the loss function.')
    parser.add_argument('n_components_pca',
                        help='n_components for PCA.')
    parser.add_argument('model_fname',
                        help='Absolute path to pickle the fitted CvModel.')
    args = parser.parse_args()

    # Convert every command-line string up front so that a malformed value
    # fails here rather than after the training data has been loaded.
    # The literal string 'None' selects PCA's n_components=None.
    n_components_pca = None if args.n_components_pca == 'None' else int(args.n_components_pca)
    reg_strength = float(args.C)
    # bool() of any non-empty string is True, so bool('False') would silently
    # keep the intercept enabled. Interpret the usual "false" spellings
    # explicitly; every other value stays truthy, as it was before.
    fit_intercept = args.fit_intercept.strip().lower() not in (
        '', '0', 'false', 'f', 'no', 'n', 'off')

    Id, X, y = extract_training_data('/nfs/raid13/babar/dchao/KaggleCS155/data/kaggle_train_tf_idf.csv')

    # trans/clf specs
    n_folds = 5
    scaler = StandardScaler()
    pca = PCA(n_components=n_components_pca, whiten=True)
    # The scale -> PCA pipeline is fitted on the full X before it is handed
    # to CvModel.
    trans = Pipeline([('scale_center', scaler), ('pca', pca)]).fit(X)
    # NOTE(review): class_weight='auto' is the spelling used by older
    # scikit-learn releases; newer releases call it 'balanced' -- confirm
    # against the installed version before changing it.
    clf = LogisticRegression(
        penalty=args.penalty,
        C=reg_strength,
        fit_intercept=fit_intercept,
        class_weight='auto'
    )

    # Fit the cross-validated model and pickle it to the requested path.
    cv_clf = CvModel(n_folds, trans, clf)
    cv_clf.fit(X, y)
    joblib.dump(cv_clf, args.model_fname)
Ejemplo n.º 2
0
from sklearn.externals import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

if __name__ == '__main__':
    # Command-line driver: fit a CvModel wrapping a standard-scaled
    # RandomForestClassifier on the training set and pickle the result.

    import argparse
    parser = argparse.ArgumentParser()
    # type=int makes argparse reject a non-integer value immediately with a
    # usage message, instead of failing in int() after the data is loaded.
    parser.add_argument('n_estimators', type=int,
                        help='n_estimators of RandomForestClassifier.')
    parser.add_argument(
        'max_features', type=int,
        help='Number of features to split for RandomForestClassifier.')
    parser.add_argument('model_fname',
                        help='Absolute path to pickle the fitted CvModel.')
    args = parser.parse_args()

    # The first returned value is not used by this script.
    _, X, y = extract_training_data(
        '/nfs/raid13/babar/dchao/KaggleCS155/data/kaggle_train_tf_idf.csv')

    # trans/clf specs
    n_folds = 5
    scaler = StandardScaler().fit(X)
    clf = RandomForestClassifier(n_estimators=args.n_estimators,
                                 max_features=args.max_features)

    cv_clf = CvModel(n_folds, scaler, clf)
    # One value per label: 1.0 where the label is 0, 3.0 otherwise.
    # Presumably consumed as per-sample weights -- confirm against CvModel.fit.
    weights = [1. if yi == 0 else 3. for yi in y]
    cv_clf.fit(X, y, weights)
    joblib.dump(cv_clf, args.model_fname)
Ejemplo n.º 3
0
if __name__ == '__main__':
    # Command-line driver: fit a CvModel wrapping a standard-scaled
    # AdaBoostClassifier (decision-tree base estimator) on the training set
    # and pickle the result.

    import argparse
    parser = argparse.ArgumentParser()
    # type= makes argparse validate the numeric arguments at parse time, so a
    # malformed value is reported with a usage message before the training
    # data is loaded rather than as a ValueError afterwards.
    parser.add_argument('n_estimators', type=int,
                        help='n_estimators of AdaBoostClassifier.')
    parser.add_argument('max_depth', type=int,
                        help='max_depth of base estimator DecisionTreeClassifier.')
    parser.add_argument('learning_rate', type=float,
                        help='learning_rate of each tree.')
    parser.add_argument('model_fname',
                        help='Absolute path to pickle the fitted CvModel.')
    args = parser.parse_args()

    # The first returned value is not used by this script.
    _, X, y = extract_training_data('/nfs/raid13/babar/dchao/KaggleCS155/data/kaggle_train_tf_idf.csv')

    # trans/clf specs
    n_folds = 5
    scaler = StandardScaler().fit(X)
    dtclf = DecisionTreeClassifier(max_depth=args.max_depth)
    clf = AdaBoostClassifier(
        base_estimator=dtclf,
        n_estimators=args.n_estimators,
        learning_rate=args.learning_rate
    )

    # Fit the cross-validated model and pickle it to the requested path.
    cv_clf = CvModel(n_folds, scaler, clf)
    cv_clf.fit(X, y)
    joblib.dump(cv_clf, args.model_fname)
Ejemplo n.º 4
0
if __name__ == '__main__':
    # Command-line driver: scale the training data, run it through an
    # L1-penalized LinearSVC step, fit a CvModel wrapping a
    # RandomForestClassifier, and pickle the result.

    import argparse
    parser = argparse.ArgumentParser()
    # type= makes argparse validate the numeric arguments at parse time, so a
    # malformed value is reported with a usage message before the training
    # data is loaded rather than as a ValueError afterwards.
    parser.add_argument('n_estimators', type=int,
                        help='n_estimators of RandomForestClassifier.')
    parser.add_argument(
        'max_features', type=int,
        help='Number of features to split for RandomForestClassifier.')
    parser.add_argument(
        'C', type=float,
        help='Regularization parameter for L1 feature selection.')
    parser.add_argument('model_fname',
                        help='Absolute path to pickle the fitted CvModel.')
    args = parser.parse_args()

    # The first returned value is not used by this script.
    _, X, y = extract_training_data(
        '/nfs/raid13/babar/dchao/KaggleCS155/data/kaggle_train_tf_idf.csv')

    # trans/clf specs
    n_folds = 5
    scaler = StandardScaler()
    lasso = LinearSVC(C=args.C, penalty='l1', dual=False)
    # NOTE(review): fit_transform returns the pipeline's output for X, not the
    # fitted pipeline itself, so `trans` here is not a transformer object the
    # way it is when built with .fit(X). Presumably this pairs with
    # input_trans=False below -- confirm against CvModel.
    trans = Pipeline([('center_scale', scaler),
                      ('feature_selection', lasso)]).fit_transform(X, y)
    clf = RandomForestClassifier(n_estimators=args.n_estimators,
                                 max_features=args.max_features)

    # Fit the cross-validated model and pickle it to the requested path.
    cv_clf = CvModel(n_folds, trans, clf)
    cv_clf.fit(X, y, input_trans=False)
    joblib.dump(cv_clf, args.model_fname)