def predict(name, sp, arrsource, arrfeats, nsp, clf_scaler_feats=None,
            clf_factory=None, clffact_feats=None, clf_type='svm', norm=True,
            nfeats=100, balance_train=False):
    """
    Classify the rows of `arrsource` and return the scores bundled with the
    run's metadata in a Struct.

    Either an already-fitted (clf, scaler, feats) triple is reused, or a new
    classifier is fitted on `arrfeats` first.

    - name, sp, nsp: stored unchanged on the result (as name, species, nsp).
    - arrsource: array of data to classify, matching the training array.
    - arrfeats: labeled training examples array, from
      ppi.feature_array.arrfeats, also stored in res.cvtest result as
      result.exs.arrfeats.
    - clf_scaler_feats: optional (clf, scaler, feats) triple. When truthy, no
      training happens and the training options below are not used.
    - clf_factory: zero-argument callable returning a new classifier. When
      None, the (clf_factory, clffact_feats) pair registered under `clf_type`
      in clf_factories is used instead.
    - clffact_feats: optional zero-argument callable whose result is handed
      to feature_selection; None is passed when it is not supplied.
    - clf_type: key into clf_factories, only consulted when clf_factory is
      None.
    - norm: forwarded to ml.fit_clf.
    - nfeats: forwarded to feature_selection. A falsy value disables feature
      selection (all columns are used and feats is None), matching fold_test.
    - balance_train: when true, arrfeats is passed through fe.balance_train
      before training.

    Returns a Struct with ppis, name, species, ppi_params (str of the
    classifier), feats, nsp, arrfeats (as used for training when training
    happened here) and balance_train.

    Raises ValueError when training is needed but no classifier factory can
    be resolved.
    """
    if clf_scaler_feats:
        clf, scaler, feats = clf_scaler_feats
    else:
        if balance_train:
            arrfeats = fe.balance_train(arrfeats)
        if clf_factory is None:
            # Fail before the feature selection and fitting work instead of
            # hitting "'NoneType' object is not callable" at clf_factory().
            if clf_type not in clf_factories:
                raise ValueError(
                    "Unknown clf_type %r and no clf_factory supplied"
                    % (clf_type,))
            clf_factory, clffact_feats = clf_factories[clf_type]
        if nfeats:
            selector = clffact_feats() if clffact_feats else None
            feats = feature_selection(arrfeats, nfeats, selector)
            arrfeats = fe.keep_cols(arrfeats, feats)
        else:
            # Same convention as fold_test: no selection, feats is None.
            feats = None
        clf = clf_factory()
        scaler = ml.fit_clf(arrfeats, clf, norm=norm)
        # Single-argument form: same output under the Python 2 print statement.
        print("Classifier: %s" % (clf,))
    # feats is None when feature selection was disabled (here or in the run
    # that produced clf_scaler_feats); the source columns are then left as-is.
    if feats is not None:
        arrsource = fe.keep_cols(arrsource, feats)
    ppis = ml.classify(clf, arrsource, scaler=scaler)
    return Struct(ppis=ppis, name=name, species=sp, ppi_params=str(clf),
                  feats=feats, nsp=nsp, arrfeats=arrfeats,
                  balance_train=balance_train)
def cvtest(name, base_sp, nsp, fs, base_featstruct, kfold=2, clf_type='svm',
           nfeats=100, norm=True, ppi_output=None, train_limit=None,
           save_data=True, balance_train=False, keep_cols=None,
           clf_factory=None, clffact_feats=None, **kwargs):
    """
    Run a k-fold cross-validation over the labeled examples and return the
    pooled test predictions together with the run's metadata.

    - name, base_sp: stored on the result as name and species.
    - base_sp, fs, base_featstruct, nsp, **kwargs: forwarded to
      ppi.feature_array to build the examples, unless ppi_output is given.
    - kfold: number of folds; must be greater than 1.
    - clf_type: key into clf_factories, consulted only when clf_factory is
      None.
    - nfeats, norm, balance_train: forwarded to fold_test for every fold.
    - ppi_output: previously built examples object (needs .arrfeats and
      .ntest_pos); when not None, ppi.feature_array is not called.
    - train_limit: if truthy, number of examples to sample for train/cv. It
      is capped at the number of available examples.
    - save_data: when true, the examples object is attached as result.exs.
    - keep_cols: optional columns to restrict the examples to (passed to
      fe.keep_cols) before the folds are run.
    - clf_factory, clffact_feats: zero-argument callables forwarded to
      fold_test; replaced by the pair registered for clf_type when
      clf_factory is None.

    Returns a Struct with traincv (the id1/id2/hit columns of the examples
    used), clf, scaler and feats from the last fold, ppis (all folds' test
    predictions sorted by their third element, descending), ntest_pos
    (exs.ntest_pos scaled by the fraction of examples kept), name, species,
    ppi_params, source_feats and balance_train.

    Raises AssertionError for kfold <= 1 or zero examples, and ValueError
    when no classifier factory can be resolved.
    """
    assert kfold>1, "CV K-fold 1 not possible"
    if ppi_output is None:
        exs = ppi.feature_array(base_sp, fs, base_featstruct, nsp, **kwargs)
    else:
        exs = ppi_output
    arrfeats, ntest_pos = fe.arr_copy(exs.arrfeats), exs.ntest_pos
    n_total = len(arrfeats)
    assert n_total>0, '0 examples not supported'
    if train_limit:
        print('Sampling %s train/cv examples' % train_limit)
        # random.sample raises if asked for more items than exist; cap it.
        n_keep = min(int(train_limit), n_total)
    else:
        n_keep = n_total
    if keep_cols is not None:
        arrfeats = fe.keep_cols(arrfeats, keep_cols)
    # Sampling every index still permutes the rows, so the examples are
    # shuffled even when no limit is set. Original author's note: don't use
    # random.shuffle here.
    arrfeats = fe.keep_rows(arrfeats, random.sample(range(n_total), n_keep))
    # Scale by the population size from BEFORE sampling. The previous code
    # divided by len(arrfeats) after it had been cut to the sample size, so
    # the ratio was always 1 and ntest_pos was never adjusted.
    ntest_pos = int(ntest_pos * n_keep // n_total)
    if clf_factory is None:
        if clf_type not in clf_factories:
            raise ValueError(
                "Unknown clf_type %r and no clf_factory supplied"
                % (clf_type,))
        clf_factory, clffact_feats = clf_factories[clf_type]
    ppis = []
    for k in range(kfold):
        print('Fold %s:' % k)
        ppis_fold, clf, scaler, feats = fold_test(
            arrfeats, kfold, k, clf_factory, clffact_feats, nfeats, norm,
            balance_train)
        ppis.extend(ppis_fold)
    # list.sort is stable: shuffling first leaves items with equal keys in
    # random order rather than fold order.
    random.shuffle(ppis)
    ppis.sort(key=lambda x: x[2], reverse=True)
    result = Struct(traincv=arrfeats[['id1','id2','hit']], clf=clf,
                    scaler=scaler, ppis=ppis, ntest_pos=ntest_pos, name=name,
                    species=base_sp, ppi_params=str(clf), feats=feats,
                    source_feats=exs.arrfeats.dtype.names,
                    balance_train=balance_train)
    if save_data:
        result.exs = exs
    return result
def fold_test(arrfeats, kfold, k, clf_factory, clffact_feats, nfeats, norm,
              balance_train):
    """
    Train on all but fold `k` of `arrfeats` and classify fold `k`.

    - arrfeats, kfold, k: passed to fe.arr_kfold to obtain the train and
      test arrays for this fold.
    - clf_factory: zero-argument callable returning a new classifier.
    - clffact_feats: zero-argument callable whose result is handed to
      feature_selection, or None (None is then passed instead, as predict
      does).
    - nfeats: forwarded to feature_selection; a falsy value skips feature
      selection entirely.
    - norm: forwarded to ml.fit_clf.
    - balance_train: when true, the training array is passed through
      fe.balance_train first.

    Returns (ppis, clf, scaler, feats): the test-fold predictions (an empty
    list when ml.exist_pos_neg rejects the training array), the classifier,
    the value returned by ml.fit_clf, and the selected features (None when
    selection was skipped).
    """
    arrtrain, arrtest = fe.arr_kfold(arrfeats, kfold, k)
    if balance_train:
        arrtrain = fe.balance_train(arrtrain)
    if nfeats:
        # clffact_feats is None when the caller supplied only clf_factory;
        # calling it unconditionally raised a TypeError in that case.
        clf_feats = clffact_feats() if clffact_feats else None
        feats = feature_selection(arrtrain, nfeats, clf_feats)
        arrtrain, arrtest = [fe.keep_cols(a, feats)
                             for a in (arrtrain, arrtest)]
    else:
        feats = None
    clf = clf_factory()
    if k == 0:
        # Report the classifier once per run, not once per fold.
        # Single-argument form: same output under the Python 2 print statement.
        print("Classifier: %s" % (clf,))
    # NOTE(review): fitting happens before the exist_pos_neg check; confirm
    # that ml.fit_clf copes with a training array that check would reject.
    scaler = ml.fit_clf(arrtrain, clf, norm=norm)
    if ml.exist_pos_neg(arrtrain):
        ppis = ml.classify(clf, arrtest, scaler=scaler, do_sort=False)
    else:
        ppis = []
    return ppis, clf, scaler, feats
# Plot the feature importances of the trees and of the forest if do_plot: import pylab as pl pl.figure() pl.title("Feature importances") for tree in forest.estimators_: pl.plot(indnums, tree.feature_importances_[indices], "r") pl.plot(indnums, importances[indices], "b") pl.show() feats, weights = zip(*ranked) return list(feats), list(weights) if __name__ == '__main__': if len(sys.argv) < 4: sys.exit("usage: python ml.py train_test feats_f clf_type \ donorm kwarg1_val1-kwarg2-val2") ttf = sys.argv[1] tt = np.load(ttf) feats = ut.loadpy(sys.argv[2]) k = sys.argv[3] do_norm = sys.argv[4] kvs = sys.argv[5] kwargs = dict([tuple(kv.split('_')) for kv in kvs.split('-')]) \ if kvs else {} clf = tree(**kwargs) if k=='tree' else svm(kernel=k, **kwargs) ts = [('%s features, %s kernel, norm: %s, %s' %(n,k,do_norm, kvs), fit_and_test([fe.keep_cols(t, ut.i0(feats[:n])) for t in tt], clf, norm=do_norm)) for n in 20,30,40,50] ut.savepy(ts, 'ts_%s_%s_%s_%s' %(k,do_norm,kvs,ttf))