import random

import numpy as np
import sklearn.cross_validation
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import balance_weights  # legacy-sklearn helper

# Project-local modules; `ngrams`, `_mod_feature_name`, `get_asym_task`,
# `RandomForest`, and `AsymBaggingRFCs` are project helpers as well
# (hedged sketches at the bottom of this file). GridSearchCV and
# GradientBoostingClassifier appear only in the commented-out experiments.
import classes
import utils
from models import Repo


def get_classifier(X, y):
    # X and y are currently unused here.
    return RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=1,
        random_state=0,  # random seed is static for comparison
        compute_importances=True,  # legacy-sklearn flag
    )


if __name__ == '__main__':
    repos = Repo.load_sample()
    class_to_id, id_to_class = utils.create_bimap(classes.classes)

    # One boolean feature per stdlib module: True iff the repo imports it.
    dict_repos = []
    for r in repos:
        d = {mod: False for mod in utils.stdlib_module_names()}
        for mod in r.imported_stdlib_modules:
            d[mod] = True
        dict_repos.append(d)

    vectorizer = DictVectorizer(sparse=False)
    y = np.array([class_to_id[classes.classify(r)] for r in repos])
    X = vectorizer.fit_transform(dict_repos)

    clf = get_classifier(X, y)
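
# Illustration only, not part of the pipeline: a tiny, self-contained example
# of what the DictVectorizer step above does. Booleans become 1.0/0.0 and each
# distinct key becomes one column; the module names below are made up.
def _demo_dictvectorizer():
    v = DictVectorizer(sparse=False)
    X_demo = v.fit_transform([{'os': True, 're': False},
                              {'os': False, 're': True}])
    # X_demo == array([[1., 0.], [0., 1.]]), columns sorted by feature name,
    # i.e. v.get_feature_names() == ['os', 're']
    return X_demo
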
def _run(repos, features):
    """Train and run a classifier using features from these repos.

    Classification targets come from the current `classes` module.

    :param repos: a list of Repos
    :param features: a list of strings of feature names
    """
    class_to_id, id_to_class = utils.create_bimap(classes.classes)
    y = np.array([class_to_id[classes.classify(r)] for r in repos])

    # All features except imports are numerical;
    # imports become one-hot boolean ngrams.
    use_imports = False
    if 'imported_stdlib_modules' in features:
        use_imports = True
        # mod_feature_dict = {_mod_feature_name(mods): False
        #                     for mods in ngrams(sorted_stdlib_names)}
    features = [f for f in features if f != 'imported_stdlib_modules']

    # Only these stdlib modules are kept as import features.
    interesting_mods = set(['hashlib', '__future__', 'functools', 'threading',
                            'warnings', 'base64', 'traceback', 'socket',
                            'urlparse', 'subprocess', 'tempfile', 'json',
                            'unittest', 'errno', 'StringIO', 're', 'glob',
                            'signal', 'inspect', 'operator'])

    dict_repos = []
    for r in repos:
        d = {}
        if use_imports:
            # d = mod_feature_dict.copy()
            mods = [m for m in r.imported_stdlib_modules
                    if m in interesting_mods]
            for gram in ngrams(mods):
                d[_mod_feature_name(gram)] = True
        for fname in features:
            d[fname] = getattr(r, fname)
        dict_repos.append(d)

    vec = DictVectorizer()
    X = vec.fit_transform(dict_repos)
    #X = X.todense()
    feature_names = vec.get_feature_names()
    dense_X = X.toarray()

    # model search
    X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
        dense_X, y, test_size=0.3
    )

    # Earlier model-search experiments, kept for reference:
    # rfc_grid = [
    #     {'max_features': [None, 'sqrt', 'log2'],
    #      'criterion': ['entropy', 'gini'],
    #      'n_estimators': [200, 500, 750],
    #      'max_depth': [None],
    #      'min_samples_split': [1, 2, 3, 5],
    #      },
    # ]
    # cv_rfc = GridSearchCV(RandomForestClassifier(),
    #                       rfc_grid, cv=3, verbose=1,
    #                       n_jobs=-1).fit(X_train, y_train)
    # ada_grid = [
    #     {'n_estimators': [200, 500, 750, 1000],
    #      'algorithm': ['SAMME', 'SAMME.R'],
    #      },
    # ]
    # cv_ada = GridSearchCV(AdaBoostClassifier(
    #     base_estimator=cv_rfc.best_estimator_.estimators_[0]),
    #     ada_grid, cv=3, verbose=1, n_jobs=-1).fit(X_train, y_train)

    # print 'RFC 5-fold stratified'
    # rfc = RandomForest()
    # rfc.fit(X_train, y_train)
    # pred = rfc.predict(X_test)
    # print metrics.precision_recall_fscore_support(y_test, pred)
    # #benchmark(RandomForest(), dense_X, y, feature_names)

    # print 'RFC found by:'
    # print cv_rfc.best_estimator_
    # rfc = cv_rfc.best_estimator_
    # rfc.fit(X_train, y_train)
    # pred = rfc.predict(X_test)
    # print metrics.precision_recall_fscore_support(y_test, pred)

    # print 'ABC found by:'
    # print cv_ada.best_estimator_
    # rfc = cv_ada.best_estimator_
    # rfc.fit(X_train, y_train)
    # pred = rfc.predict(X_test)
    # print metrics.precision_recall_fscore_support(y_test, pred)

    # print 'Gradient boost'
    # benchmark(GradientBoostingClassifier(n_estimators=300,
    #                                      max_depth=5,
    #                                      min_samples_split=1,
    #                                      max_features=None,
    #                                      ),
    #           dense_X, y, feature_names)

    # size = .3
    # print '5-fold strat %s' % size
    # cv = sklearn.cross_validation.StratifiedShuffleSplit(
    #     y,
    #     n_iter=5,
    #     test_size=size
    # )
    # size = .5
    # print '5-fold strat %s' % size
    # benchmark(RandomForest(), dense_X, y, feature_names,
    #           cv=sklearn.cross_validation.StratifiedShuffleSplit(
    #               y,
    #               n_iter=5,
    #               test_size=size
    #           ))

    def _attempt(clf, X_train, y_train, X_test, y_test, weighted=True):
        weights = None
        if weighted:
            weights = balance_weights(y_train)
        clf.fit(X_train, y_train, sample_weight=weights)
        pred = clf.predict(X_test)
        print metrics.classification_report(y_test, pred,
                                            target_names=['high', 'low'])

    def attempt(clf, X_train, y_train, X_test, y_test):
        print clf
        print 'weighted:'
        _attempt(clf, X_train, y_train, X_test, y_test)
        print
        print 'weighted with undersampled test set:'
        X_u_small, X_u_large, y_u = get_asym_task(X_test, y_test)
        X_u_large = np.array(random.sample(X_u_large, len(X_u_small)))
        X_u = np.vstack((X_u_small, X_u_large))
        _attempt(clf, X_train, y_train, X_u, y_u, False)
        print
        print

    rfc = RandomForest()
    attempt(rfc, X_train, y_train, X_test, y_test)

    ada = AdaBoostClassifier(n_estimators=300)
    attempt(ada, X_train, y_train, X_test, y_test)

    # benchmark(RandomForest(), X_new, y_new, feature_names)
    asym = AsymBaggingRFCs(13,
                           n_estimators=200,
                           max_depth=None,
                           min_samples_split=1,
                           max_features=None,
                           #random_state=0,  # random seed is static for comparison
                           compute_importances=True,
                           n_jobs=-1,  # run on all cores
                           )
    attempt(asym, X_train, y_train, X_test, y_test)

    print
    print '============'
    print 'with undersampled training data:'
    rfc_under = RandomForest()
    X_utr_small, X_utr_large, y_utr = get_asym_task(X_train, y_train)
    X_utr_large = np.array(random.sample(X_utr_large, len(X_utr_small)))
    X_utr = np.vstack((X_utr_small, X_utr_large))
    attempt(rfc_under, X_utr, y_utr, X_test, y_test)

    ada_under = AdaBoostClassifier(n_estimators=300)
    attempt(ada_under, X_utr, y_utr, X_test, y_test)
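
# ---------------------------------------------------------------------------
# `ngrams` and `_mod_feature_name` are used by _run but defined elsewhere in
# the project. The sketches below are assumptions made so this file stands
# alone, not the project's real code: `ngrams` is taken to yield contiguous
# 1- and 2-grams of the filtered module list, and `_mod_feature_name` to join
# a gram into a single feature key.
def ngrams(items, max_n=2):
    """Yield every contiguous n-gram of `items` for n = 1..max_n (assumed)."""
    for n in range(1, max_n + 1):
        for i in range(len(items) - n + 1):
            yield tuple(items[i:i + n])


def _mod_feature_name(mods):
    """Join a tuple of module names into one feature key (assumed)."""
    return 'imports_' + '_'.join(mods)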
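
# `get_asym_task` is also project-local. Below is a sketch consistent with how
# it is used above, assuming binary 0/1 labels: it splits rows into minority-
# and majority-class groups and returns the label vector for the balanced set
# that the callers build by stacking the minority rows on an equal-sized
# random sample of majority rows. (`RandomForest` and `AsymBaggingRFCs` are
# likewise project-local and are not sketched here.)
def get_asym_task(X, y):
    counts = np.bincount(y)
    minority = np.argmin(counts)  # assumes both classes are present
    X_small = X[y == minority]
    X_large = X[y != minority]
    y_balanced = np.concatenate([
        np.repeat(minority, len(X_small)),
        np.repeat(1 - minority, len(X_small)),  # assumes labels are 0 and 1
    ])
    return X_small, X_large, y_balanced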