Beispiel #1
0
def predict(name, sp, arrsource, arrfeats, nsp, clf_scaler_feats=None,
        clf_factory=None, clffact_feats=None, clf_type='svm', norm=True,
        nfeats=100, balance_train=False):
    """
    Classify the rows of arrsource, fitting a classifier on arrfeats first
    unless an already-fitted one is supplied.

    - arrfeats: labeled training examples array, from
      ppi.feature_array.arrfeats, also stored in res.cvtest result as
      result.exs.arrfeats.
    - arrsource: array of data to classify, matching the training array
    - clf_scaler_feats: optional (clf, scaler, feats) triple.  When truthy it
      is used as-is and no balancing, feature selection or fitting happens.
    - clf_factory / clffact_feats: zero-argument callables producing the
      classifier to fit and the object handed to feature_selection.  When
      clf_factory is None and clf_type is a key of clf_factories, both are
      taken from clf_factories[clf_type].
    - norm: forwarded to ml.fit_clf.
    - nfeats: forwarded to feature_selection.
    - balance_train: when true, arrfeats goes through fe.balance_train
      before feature selection and fitting.

    Returns a Struct with fields ppis, name, species, ppi_params (str of the
    classifier), feats, nsp, arrfeats and balance_train.  The arrfeats field
    is the column-reduced training array when a classifier was fitted here,
    and the array passed in otherwise.
    """
    if clf_scaler_feats:
        # Caller supplied a ready classifier; skip training.
        clf, scaler, feats = clf_scaler_feats
    else:
        training = fe.balance_train(arrfeats) if balance_train else arrfeats
        use_registered = clf_type in clf_factories and clf_factory is None
        if use_registered:
            clf_factory, clffact_feats = clf_factories[clf_type]
        selector = clffact_feats() if clffact_feats else None
        feats = feature_selection(training, nfeats, selector)
        arrfeats = fe.keep_cols(training, feats)
        clf = clf_factory()
        scaler = ml.fit_clf(arrfeats, clf, norm=norm)
    # Single-argument print: same output as the two-item print statement.
    print("%s %s" % ("Classifier:", clf))
    source_cols = fe.keep_cols(arrsource, feats)
    ppis = ml.classify(clf, source_cols, scaler=scaler)
    return Struct(ppis=ppis, name=name, species=sp, ppi_params=str(clf),
            feats=feats, nsp=nsp, arrfeats=arrfeats,
            balance_train=balance_train)
Beispiel #2
0
def cvtest(name, base_sp, nsp, fs, base_featstruct, kfold=2, clf_type='svm',
        nfeats=100, norm=True, ppi_output=None, train_limit=None,
        save_data=True, balance_train=False, keep_cols=None, clf_factory=None,
        clffact_feats=None, **kwargs):
    """
    Run k-fold cross-validation over a labeled feature array and collect the
    per-fold classifications into one score-sorted list.

    - base_sp, fs, base_featstruct, nsp, **kwargs: forwarded to
      ppi.feature_array to build the examples, unless ppi_output is given.
    - ppi_output: previously built examples object (must expose .arrfeats
      and .ntest_pos); used instead of calling ppi.feature_array.
    - kfold: number of folds, must be greater than 1.
    - train_limit: if truthy, number of rows to sample at random for the
      cross-validation.  Values larger than the number of available rows
      are clamped to it.  Rows are always put in random order, even when
      no limit is given.
    - keep_cols: if not None, columns to retain via fe.keep_cols.
    - clf_factory / clffact_feats: zero-argument callables; when clf_factory
      is None and clf_type is a key of clf_factories, both come from
      clf_factories[clf_type].
    - nfeats, norm, balance_train: forwarded to fold_test.
    - save_data: when true the examples object is attached as result.exs.

    Returns a Struct with traincv (the id1/id2/hit columns of the sampled
    array), ppis (sorted by element [2], highest first), ntest_pos scaled by
    the sampled fraction, name, species, source_feats, balance_train, and
    the clf, scaler, feats and ppi_params of the LAST fold.
    """
    assert kfold>1, "CV K-fold 1 not possible"
    exs = ppi.feature_array(base_sp, fs, base_featstruct,
            nsp, **kwargs) if ppi_output is None else ppi_output
    arrfeats, ntest_pos = fe.arr_copy(exs.arrfeats), exs.ntest_pos
    n_total = len(arrfeats)
    assert n_total > 0, '0 examples not supported'
    if train_limit:
        print('Sampling %s train/cv examples' % train_limit)
    # Never ask random.sample for more rows than exist (it would raise).
    n_sample = min(int(train_limit), n_total) if train_limit else n_total
    if keep_cols is not None:
        arrfeats = fe.keep_cols(arrfeats, keep_cols)
    # Sampling every index is how the rows get shuffled when there is no
    # limit; random.shuffle is deliberately not used on the array itself.
    arrfeats = fe.keep_rows(arrfeats,
            random.sample(range(n_total), n_sample))
    # Scale by the fraction of the ORIGINAL rows that were kept.  The size
    # must be captured before sampling: afterwards len(arrfeats) equals
    # n_sample and the ratio would always be 1.
    ntest_pos = int(ntest_pos * n_sample / n_total)
    if clf_type in clf_factories and clf_factory is None:
        clf_factory, clffact_feats = clf_factories[clf_type]
    ppis = []
    for k in range(kfold):
        print('Fold %s:' % k)
        ppis_fold, clf, scaler, feats = fold_test(arrfeats, kfold, k,
                clf_factory, clffact_feats, nfeats, norm, balance_train)
        ppis.extend(ppis_fold)
    # list.sort is stable, so shuffling first leaves entries with equal
    # scores in random relative order instead of fold order.
    random.shuffle(ppis)
    ppis.sort(key=lambda x: x[2], reverse=True)
    result = Struct(traincv=arrfeats[['id1','id2','hit']], clf=clf,
            scaler=scaler, ppis=ppis, ntest_pos=ntest_pos, name=name,
            species=base_sp, ppi_params=str(clf), feats=feats,
            source_feats=exs.arrfeats.dtype.names, balance_train=balance_train)
    if save_data:
        result.exs = exs
    return result
Beispiel #3
0
def fold_test(arrfeats, kfold, k, clf_factory, clffact_feats, nfeats, norm,
        balance_train):
    """
    Train on all folds but the k-th and classify the k-th.

    - arrfeats, kfold, k: passed to fe.arr_kfold to obtain the train/test
      split for this fold.
    - clf_factory: zero-argument callable returning a fresh classifier.
    - clffact_feats: zero-argument callable returning the object handed to
      feature_selection, or None to pass None (same convention as predict).
    - nfeats: if truthy, feature selection is run on the training split and
      both splits are reduced to the selected columns; otherwise all
      columns are kept and the returned feats is None.
    - norm: forwarded to ml.fit_clf.
    - balance_train: when true, the training split goes through
      fe.balance_train first.

    Returns (ppis, clf, scaler, feats).  ppis is the unsorted output of
    ml.classify on the test split, or [] when ml.exist_pos_neg reports
    false for the training split.
    """
    arrtrain, arrtest = fe.arr_kfold(arrfeats, kfold, k)
    if balance_train:
        arrtrain = fe.balance_train(arrtrain)
    if nfeats:
        # A caller may supply clf_factory without clffact_feats; do not
        # call None in that case.
        clf_feats = clffact_feats() if clffact_feats else None
        feats = feature_selection(arrtrain, nfeats, clf_feats)
        arrtrain = fe.keep_cols(arrtrain, feats)
        arrtest = fe.keep_cols(arrtest, feats)
    else:
        feats = None
    clf = clf_factory()
    if k == 0:
        # Report the classifier once per cross-validation run.
        print("Classifier: %s" % (clf,))
    scaler = ml.fit_clf(arrtrain, clf, norm=norm)
    if ml.exist_pos_neg(arrtrain):
        ppis = ml.classify(clf, arrtest, scaler=scaler, do_sort=False)
    else:
        ppis = []
    return ppis, clf, scaler, feats
Beispiel #4
0
    # Plot the feature importances of the trees and of the forest
    if do_plot:
        import pylab as pl
        pl.figure()
        pl.title("Feature importances")
        for tree in forest.estimators_:
            pl.plot(indnums, tree.feature_importances_[indices], "r")
        pl.plot(indnums, importances[indices], "b")
        pl.show()
    feats, weights = zip(*ranked)
    return list(feats), list(weights)

def parse_cli_flag(text):
    """Return False for the usual "off" spellings of a command-line flag
    ('', '0', 'false', 'f', 'no', 'n', 'none', 'off'; case-insensitive)
    and True for anything else."""
    return text.strip().lower() not in (
        '', '0', 'false', 'f', 'no', 'n', 'none', 'off')


def parse_cli_kwargs(kvs):
    """Turn 'key1_val1-key2_val2' into {'key1': 'val1', 'key2': 'val2'}.

    Each pair is split on its LAST underscore so keys may themselves contain
    underscores.  Values are left as strings.  An empty kvs gives {}.
    Raises ValueError for a pair with no key or no underscore.
    """
    parsed = {}
    if not kvs:
        return parsed
    for kv in kvs.split('-'):
        key, sep, val = kv.rpartition('_')
        if not sep or not key:
            raise ValueError("expected key_value, got %r" % (kv,))
        parsed[key] = val
    return parsed


if __name__ == '__main__':
    # Four positional arguments are required; the kwargs string is optional.
    if len(sys.argv) < 5:
        sys.exit("usage: python ml.py train_test feats_f clf_type "
                 "donorm [kwarg1_val1-kwarg2_val2]")
    ttf = sys.argv[1]
    tt = np.load(ttf)
    feats = ut.loadpy(sys.argv[2])
    k = sys.argv[3]
    do_norm = sys.argv[4]
    kvs = sys.argv[5] if len(sys.argv) > 5 else ''
    # NOTE(review): values reach the classifier constructor as strings;
    # confirm tree()/svm() convert numeric options themselves.
    kwargs = parse_cli_kwargs(kvs)
    # The raw string is kept for labels and the output name; the parsed
    # boolean is what is handed to fit_and_test, so "False"/"0" disable
    # normalisation instead of counting as a truthy non-empty string.
    norm = parse_cli_flag(do_norm)
    clf = tree(**kwargs) if k == 'tree' else svm(kernel=k, **kwargs)
    ts = [('%s features, %s kernel, norm: %s, %s' % (n, k, do_norm, kvs),
           fit_and_test([fe.keep_cols(t, ut.i0(feats[:n])) for t in tt],
                        clf, norm=norm))
          for n in (20, 30, 40, 50)]
    ut.savepy(ts, 'ts_%s_%s_%s_%s' % (k, do_norm, kvs, ttf))