Code example #1
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer

import classes
from models import Repo
import utils


def get_classifier(X, y):
    # X and y are unused here; the signature just mirrors the call site below
    return RandomForestClassifier(
        n_estimators=100, max_depth=None, min_samples_split=1,
        random_state=0,  # random seed is static for comparison
        compute_importances=True,  # old scikit-learn option, removed in later versions
    )


if __name__ == '__main__':
    repos = Repo.load_sample()

    class_to_id, id_to_class = utils.create_bimap(classes.classes)

    dict_repos = []
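    # record, for every stdlib module, whether each repo imports it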
    for r in repos:
        d = {mod: False for mod in utils.stdlib_module_names()}

        for mod in r.imported_stdlib_modules:
            d[mod] = True
        dict_repos.append(d)

    vectorizer = DictVectorizer(sparse=False)

    y = np.array([class_to_id[classes.classify(r)] for r in repos])
    X = vectorizer.fit_transform(dict_repos)

    clf = get_classifier(X, y)
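
The listing stops after building clf without ever fitting it. Below is a
minimal sketch of how the classifier could be trained and inspected; this is
an added illustration, not part of the original, and it assumes the same
era's scikit-learn API that these examples already use:

from sklearn import cross_validation

scores = cross_validation.cross_val_score(clf, X, y, cv=5)
print 'mean CV accuracy: %.3f' % scores.mean()

clf.fit(X, y)
# DictVectorizer keeps one named column per stdlib module, so feature
# importances map straight back to module names
ranked = sorted(zip(vectorizer.get_feature_names(), clf.feature_importances_),
                key=lambda t: t[1], reverse=True)
for name, imp in ranked[:10]:
    print '%.3f  %s' % (imp, name)
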
Code example #2
import random

import numpy as np
import sklearn.cross_validation
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction import DictVectorizer

import classes
import utils

# ngrams, _mod_feature_name, RandomForest, AsymBaggingRFCs, get_asym_task
# and balance_weights are project-local helpers defined elsewhere


def _run(repos, features):
    """Train and run a classifier over these repos using the given features.
    The current class definitions from the classes module are used.

    :param repos: a list of Repos
    :param features: a list of feature names (strings)
    """
    class_to_id, id_to_class = utils.create_bimap(classes.classes)
    y = np.array([class_to_id[classes.classify(r)] for r in repos])

    # all features except imports are numerical;
    # imports become one-hot boolean ngrams
    use_imports = False
    if 'imported_stdlib_modules' in features:
        use_imports = True
        # mod_feature_dict = {_mod_feature_name(mods): False
        #                     for mods in ngrams(sorted_stdlib_names)}
        features = [f for f in features if f != 'imported_stdlib_modules']

    dict_repos = []
    for r in repos:
        d = {}

        if use_imports:
            # d = mod_feature_dict.copy()

            # keep only a hand-picked subset of stdlib modules
            interesting_mods = {
                'hashlib', '__future__', 'functools', 'threading',
                'warnings', 'base64', 'traceback', 'socket', 'urlparse',
                'subprocess', 'tempfile', 'json', 'unittest', 'errno',
                'StringIO', 're', 'glob', 'signal', 'inspect', 'operator',
            }
            mods = [m for m in r.imported_stdlib_modules
                    if m in interesting_mods]

            for gram in ngrams(mods):
                d[_mod_feature_name(gram)] = True
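            # (illustration, not from the original helpers: if mods were
            # ['json', 're'], ngrams could yield ('json',), ('re',) and
            # ('json', 're'); _mod_feature_name then flattens each tuple
            # into a single feature-name string)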

        for fname in features:
            d[fname] = getattr(r, fname)

        dict_repos.append(d)

    vec = DictVectorizer()
    X = vec.fit_transform(dict_repos)
    # fit_transform returns a sparse matrix; densify it for the split and
    # estimators below
    feature_names = vec.get_feature_names()

    dense_X = X.toarray()

    # model search
    X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
        dense_X, y, test_size=0.3
    )
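    # note: this split is not stratified; class imbalance is handled further
    # down via sample weights and undersampling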

    # rfc_grid = [
    #     {'max_features': [None, 'sqrt', 'log2'],
    #      'criterion': ['entropy', 'gini'],
    #      'n_estimators': [200, 500, 750],
    #      'max_depth': [None],
    #      'min_samples_split': [1, 2, 3, 5],
    #      },
    # ]

    # cv_rfc = GridSearchCV(RandomForestClassifier(),
    #                       rfc_grid, cv=3, verbose=1, n_jobs=-1).fit(X_train, y_train)

    # ada_grid = [
    #     {
    #         'n_estimators': [200, 500, 750, 1000],
    #         'algorithm': ['SAMME', 'SAMME.R']
    #     },
    # ]

    # cv_ada = GridSearchCV(AdaBoostClassifier(
    #     base_estimator=cv_rfc.best_estimator_.estimators_[0]),
    #     ada_grid, cv=3, verbose=1, n_jobs=-1).fit(X_train, y_train)


    # print 'RFC 5-fold stratified'
    # rfc = RandomForest()
    # rfc.fit(X_train, y_train)
    # pred = rfc.predict(X_test)
    # print metrics.precision_recall_fscore_support(y_test, pred)

    # #benchmark(RandomForest(), dense_X, y, feature_names)

    # print 'RFC found by:'
    # print cv_rfc.best_estimator_
    # rfc = cv_rfc.best_estimator_
    # rfc.fit(X_train, y_train)
    # pred = rfc.predict(X_test)
    # print metrics.precision_recall_fscore_support(y_test, pred)

    # print 'ABC found by:'
    # print cv_ada.best_estimator_
    # rfc = cv_ada.best_estimator_
    # rfc.fit(X_train, y_train)
    # pred = rfc.predict(X_test)
    # print metrics.precision_recall_fscore_support(y_test, pred)

    #print 'Gradient boost'
    #benchmark(GradientBoostingClassifier(n_estimators=300,
    #                                     max_depth=5,
    #                                     min_samples_split=1,
    #                                     max_features=None,
    #                                     ),
    #          dense_X, y, feature_names)


    #size = .3
    #print '5-fold strat %s' % size
    #cv = sklearn.cross_validation.StratifiedShuffleSplit(
    #    y,
    #    n_iter=5,
    #    test_size=size
    #)

    #size = .5
    #print '5-fold strat %s' % size
    #benchmark(RandomForest(), dense_X, y, feature_names,
    #          cv=sklearn.cross_validation.StratifiedShuffleSplit(
    #              y,
    #              n_iter=5,
    #              test_size=size
    #          ))

    def _attempt(clf, X_train, y_train, X_test, y_test, weighted=True):
        weights = None
        if weighted:
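            # weight samples inversely to class frequency to counter the
            # class imbalance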
            weights = balance_weights(y_train)

        clf.fit(X_train, y_train, sample_weight=weights)

        pred = clf.predict(X_test)
        print metrics.classification_report(y_test, pred, target_names=['high', 'low'])

    def attempt(clf, X_train, y_train, X_test, y_test):
        print clf
        print 'weighted:'
        _attempt(clf, X_train, y_train, X_test, y_test)

        print
        print 'weighted with undersampled test set:'
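        # trains without sample weights this time (weighted=False below) and
        # evaluates on a class-balanced, undersampled copy of the test set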
        X_u_small, X_u_large, y_u = get_asym_task(X_test, y_test)
        X_u_large = np.array(random.sample(X_u_large, len(X_u_small)))
        X_u = np.vstack((X_u_small, X_u_large))
        _attempt(clf, X_train, y_train, X_u, y_u, False)
        print
        print

    rfc = RandomForest()
    attempt(rfc, X_train, y_train, X_test, y_test)

    ada = AdaBoostClassifier(n_estimators=300)
    attempt(ada, X_train, y_train, X_test, y_test)

    #benchmark(RandomForest(), X_new, y_new, feature_names)

    asym = AsymBaggingRFCs(13,
                           n_estimators=200,
                           max_depth=None,
                           min_samples_split=1,
                           max_features=None,
                           #random_state=0,  # random seed is static for comparison
                           compute_importances=True,  # old scikit-learn option
                           n_jobs=-1,  # run on all cores
                           )
    attempt(asym, X_train, y_train, X_test, y_test)

    print
    print '============'
    print 'with undersampled training data:'
    rfc_under = RandomForest()
    X_utr_small, X_utr_large, y_utr = get_asym_task(X_train, y_train)

    X_utr_large = np.array(random.sample(X_utr_large, len(X_utr_small)))
    X_utr = np.vstack((X_utr_small, X_utr_large))
    attempt(rfc_under, X_utr, y_utr, X_test, y_test)

    ada_under = AdaBoostClassifier(n_estimators=300)
    attempt(ada_under, X_utr, y_utr, X_test, y_test)
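
A footnote on balance_weights, used inside _attempt above: it assigns each
training sample a weight inversely proportional to its class frequency,
matching the behaviour of the old sklearn.preprocessing.balance_weights
helper, if that is what is imported here; that helper has since been removed.
A minimal sketch of an equivalent against the current scikit-learn API, in
case the local helper is unavailable:

from sklearn.utils.class_weight import compute_sample_weight

def balance_weights(y):
    # one weight per sample, inversely proportional to the frequency of
    # that sample's class, so rare classes count as much as common ones
    return compute_sample_weight(class_weight='balanced', y=y)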