Beispiel #1
0
def cvtest(name, base_sp, nsp, fs, base_featstruct, kfold=2, clf_type='svm',
        nfeats=100, norm=True, ppi_output=None, train_limit=None,
        save_data=True, balance_train=False, keep_cols=None, clf_factory=None,
        clffact_feats=None, **kwargs):
    """
    Run k-fold cross-validation over a feature array and pool the
    predictions produced by every fold.

    Parameters:
    name: label stored unchanged on the result.
    base_sp, fs, base_featstruct, nsp, **kwargs: forwarded to
        ppi.feature_array to build the examples; only used when ppi_output
        is None.
    kfold: number of folds; must be greater than 1.
    clf_type: key looked up in clf_factories to pick (clf_factory,
        clffact_feats) when clf_factory is not given.
    nfeats, norm, balance_train: forwarded to fold_test.
    ppi_output: precomputed examples object exposing .arrfeats and
        .ntest_pos; when given, ppi.feature_array is not called.
    train_limit: number of rows to randomly sample for train/cv. Falsy
        means use every row; values above the number of available rows are
        clamped to it.
    save_data: when true, the examples object is attached as result.exs.
    keep_cols: if not None, forwarded to fe.keep_cols to restrict the
        columns before sampling.
    clf_factory, clffact_feats: forwarded to fold_test; override the
        clf_type lookup when clf_factory is given.

    Returns a Struct with fields traincv (the 'id1', 'id2', 'hit' columns of
    the sampled rows), clf, scaler, feats (all three taken from the LAST
    fold), ppis (pooled fold predictions sorted by element 2, descending),
    ntest_pos (scaled by the sampled fraction of rows), name, species,
    ppi_params (str(clf)), source_feats and balance_train.

    Raises AssertionError if kfold <= 1 or there are no examples.
    """
    assert kfold>1, "CV K-fold 1 not possible"
    exs = ppi.feature_array(base_sp, fs, base_featstruct,
            nsp, **kwargs) if ppi_output is None else ppi_output
    # Work on a copy so the caller's / cached feature array is not modified.
    arrfeats, ntest_pos = fe.arr_copy(exs.arrfeats), exs.ntest_pos
    assert len(arrfeats)>0, '0 examples not supported'
    # Single-argument parenthesized print: same output on Python 2, and
    # still valid syntax on Python 3.
    if train_limit: print('Sampling %s train/cv examples' % train_limit)
    arrfeats = arrfeats if keep_cols is None else fe.keep_cols(arrfeats, keep_cols)
    # Remember the row count BEFORE sampling: it is the denominator for
    # rescaling ntest_pos below. (Previously the length was read after
    # sampling, so the ratio was always 1 and ntest_pos was never scaled.)
    n_total = len(arrfeats)
    # random.sample raises ValueError when asked for more items than exist,
    # so treat train_limit as an upper bound.
    n_sample = min(int(train_limit or n_total), n_total)
    # shuffle even if not sampling. don't random.shuffle
    arrfeats = fe.keep_rows(arrfeats, random.sample(range(n_total), n_sample))
    ntest_pos = int(ntest_pos * n_sample / n_total)
    if clf_type in clf_factories and clf_factory is None:
        clf_factory, clffact_feats = clf_factories[clf_type]
    ppis = []
    for k in range(kfold):
        print('Fold %s:' % k)
        # clf, scaler and feats are overwritten each iteration; the values
        # from the final fold are what end up on the result.
        ppis_fold,clf,scaler,feats = fold_test(arrfeats, kfold, k, clf_factory,
                clffact_feats, nfeats, norm, balance_train)
        ppis += ppis_fold
    # list.sort is stable, so shuffling first leaves entries that tie on the
    # sort key in random order rather than fold order.
    random.shuffle(ppis)
    ppis.sort(key=lambda x: x[2], reverse=True)
    result = Struct(traincv=arrfeats[['id1','id2','hit']], clf=clf,
            scaler=scaler, ppis=ppis, ntest_pos=ntest_pos, name=name,
            species=base_sp, ppi_params=str(clf), feats=feats,
            source_feats=exs.arrfeats.dtype.names, balance_train=balance_train)
    if save_data:
        result.exs = exs
    return result
Beispiel #2
0
def enrichment_array_combined(sp_base, sp_dict_elutfs, cxs, func=np.average, nsp=1, scores=["poisson"], exs=None):
    """
    Build one feature array across the elution files of several species,
    then collapse each species' columns with func.

    sp_base: forwarded to ppi.feature_array as its first argument.
    sp_dict_elutfs: {'Ce': [Ce_elution_1, Ce_elution_2, ...] , ...}
    cxs: iterable of complexes; each is turned into an (index, set(members))
        pair and passed to correlation_enrichment when exs is not supplied.
    func: combining function handed to fe.merge_features (np.average by
        default).
    nsp, scores: forwarded to ppi.feature_array.
    exs: precomputed examples; when falsy they are derived from cxs.

    Returns the feature array after one fe.merge_features pass per species
    key, each using the pattern '<species>.*'.
    """
    if not exs:
        indexed_cxs = [(idx, set(members)) for idx, members in enumerate(cxs)]
        exs = correlation_enrichment(indexed_cxs)
    # Pool the per-species file lists into one flat list.
    all_elutfs = ut.flatten(list(sp_dict_elutfs.values()))
    ppio = ppi.feature_array(sp_base, all_elutfs, exs, nsp, scores=scores,
                             extdata=[], do_filter=False)
    merged = ppio.arrfeats
    for species in sp_dict_elutfs:
        pattern = "%s.*" % species
        merged = fe.merge_features(merged, pattern, func, False)
    return merged