def CrossValidateSMOTE(data, labels, clf, folds=10, runSMOTE=True):
    '''Stratified k-fold cross-validation, optionally oversampling the
    minority class of each training fold with SMOTE before fitting *clf*.
    :returns: per-fold accuracy, F1, recall and precision lists, plus the
    summed 2x2 confusion matrix over all folds.'''
    import numpy as np
    from unbalanced_dataset import SMOTE
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.metrics import confusion_matrix as confmat
    from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

    # accept pandas objects as well as numpy arrays
    if type(data) is not np.ndarray:
        data = data.as_matrix()

    if type(labels) is not np.ndarray:
        labels = labels.as_matrix().ravel()

    skf = StratifiedKFold(labels, n_folds=folds, shuffle=False)
    sets = [{'train': train, 'test': test} for train, test in skf]
    acc = []
    fmeasure = []
    recall = []
    precision = []
    cm = np.zeros((2, 2), dtype=int)  # confusion matrix accumulated over folds
    
    for fold in sets:
        data_train = data[fold['train']]
        labels_train = labels[fold['train']]

        # majority/minority ratio: how many synthetic minority samples
        # SMOTE should generate to balance this training fold
        bugs = sum(labels_train)
        ratio = float(len(labels_train) - bugs) / bugs

        data_test = data[fold['test']]
        labels_test = labels[fold['test']]
        if runSMOTE:
            # oversample only the training fold; the test fold stays untouched
            smote = SMOTE(ratio=ratio, verbose=False, kind='borderline1')
            data_train, labels_train = smote.fit_transform(data_train, labels_train)
        
        clf.fit(data_train, labels_train)
        hypot = clf.predict(data_test)
        
        # sklearn metrics expect (y_true, y_pred) in that order
        acc.append(accuracy_score(labels_test, hypot))
        fmeasure.append(f1_score(labels_test, hypot))
        recall.append(recall_score(labels_test, hypot))
        precision.append(precision_score(labels_test, hypot))

        cm += confmat(labels_test, hypot)
        
    return acc, fmeasure, recall, precision, cm
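
# Example call (a hedged sketch; *data*, *labels* and the classifier are
# defined elsewhere in this notebook, and RandomForestClassifier here is
# only an illustrative stand-in, not necessarily the model used originally):
#
#     from sklearn.ensemble import RandomForestClassifier
#     acc, f1, rec, prec, cm = CrossValidateSMOTE(data, labels,
#                                                 RandomForestClassifier(),
#                                                 folds=10, runSMOTE=True)
#     print 'mean acc: %.4f  mean F1: %.4f' % (np.mean(acc), np.mean(f1))
#     print cm
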
def doNetRun(DoPretrain, actType, numHidden, numNodes, dropOut, NumReps,
             AdjustWforDropout, L1, L2,
             LearningRate, Momentum, Algorithm, maxUpdate, BatchSize, Patience,
             MinImprovement, ValidateEveryN):
    '''Performs *NumReps* neural network runs with the given hyper-parameters,
    tracking the best run so far in the globals *bestLoss* and *bestParams*.
    :params: NN hyper-parameters (see argument names above).
    :returns: DataFrame with one row per run: the parameters plus the
    performance measures.'''

    layerDef = dict(size=numNodes, activation=actType)

    # network topology: input width (numFeatures) and output width (numCats)
    # are module-level globals set when the data was loaded
    netDef = []
    netDef.append(numFeatures)
    for _ in range(numHidden):
        netDef.append(layerDef)
    netDef.append(numCats)

    # echo the call's arguments and remember their names as result columns
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    saveCols = []
    for arg in args:
        print arg, ': ', values[arg], ',',
        saveCols.append(arg)
    print

    saveCols = saveCols + \
        ['TrLoss', 'VldLoss', 'TstLoss', 'TestAcc', 'Time', 'BestEpoch']

    hist = pd.DataFrame(columns=saveCols)

    for _ in range(NumReps):
        global kountRuns  # global run counter shared across calls
        kountRuns = kountRuns + 1

        # use a new seed for each run
        ii32 = np.iinfo(np.int32)
        seed = random.randint(0, ii32.max)
        print 'seed: ', seed
        # numpy.random.seed accepts uint32, but Theanets type-checks for int
        # and rejects a uint32 that Python stores as a long, so stay within
        # the int32 range

        net = thts.Classifier(layers=netDef, rng=seed)

        t0 = time.clock()
        if DoPretrain:
            # phase I: greedy layerwise pre-training
            print('Train phase I:')
            net.train(train, valid,
                      patience=Patience,
                      learning_rate=LearningRate,
                      momentum=Momentum,
                      min_improvement=MinImprovement,
                      validate_every=ValidateEveryN,
                      max_updates=maxUpdate,
                      input_dropout=dropOut,
                      hidden_dropout=dropOut,
                      algo='layerwise',
                      weight_l1=L1,  # L1 norm sparsity
                      weight_l2=L2,  # L2 norm weight decay
                      batch_size=BatchSize)

        # phase II: supervised training, tracking the last epoch whose
        # validation loss improved by at least MinImprovement
        print('Train phase II:')
        Epoch = 0
        lastLoss = np.Inf
        lastEpoch = 0
        for tr, vl in net.itertrain(train, valid,
                                    patience=Patience,
                                    learning_rate=LearningRate,
                                    momentum=Momentum,
                                    min_improvement=MinImprovement,
                                    validate_every=ValidateEveryN,
                                    max_updates=maxUpdate,
                                    input_dropout=dropOut,
                                    hidden_dropout=dropOut,
                                    algo=Algorithm,
                                    weight_l1=L1,  # L1 norm sparsity
                                    weight_l2=L2,  # L2 norm weight decay
                                    batch_size=BatchSize):
            Epoch = Epoch + 1
            vloss = vl['loss']

            if (lastLoss - vloss) >= MinImprovement:
                lastLoss = vloss
                lastEpoch = Epoch
                flg = ' *' + str(lastEpoch)
            else:
                flg = ''
            print Epoch, 'trLoss: %.4f' % tr['loss'], ' vlLoss: %.4f' % vloss, \
                ' vlacc: %.4f' % vl['acc'], flg

        t1 = time.clock() - t0
        print 'Time: ', t1, ' Epochs:', Epoch

        if AdjustWforDropout:
            # rescale the learned weights by the keep probability so that
            # expected activations at test time match those seen in training
            fact = 1.0 - dropOut
            for ll in net.layers:
                if ll.name != 'in':
                    w = net.find(ll.name, 'w')
                    w.set_value(w.get_value() * fact)

        # final losses on the full train / validation / test splits
        X, y = train
        trnLoss = log_loss(y, net.predict_proba(X))

        X, y = valid
        vldLoss = log_loss(y, net.predict_proba(X))

        X, y = test
        ypp = net.predict_proba(X)
        yp = net.predict(X)
        acc = net.score(X, y)
        tstLoss = log_loss(y, ypp)

        print Epoch, 'trLoss: %.4f' % trnLoss, ' vlLoss: %.4f' % vldLoss, \
            ' tstAcc: %.4f' % acc
        print 'Best Epoch: ', lastEpoch

        cf = confmat(y, yp)
        print 'Test-set confusion matrix:'
        print cf

        global bestLoss
        global bestParams

        # collect this run's arguments plus its results into one row
        dta = dict()
        for arg in args:
            dta[arg] = values[arg]

        dta['TrLoss'] = trnLoss
        dta['VldLoss'] = vldLoss
        dta['TstLoss'] = tstLoss
        dta['TestAcc'] = acc
        dta['Time'] = t1
        dta['BestEpoch'] = lastEpoch

        nr = pd.DataFrame([dta])

        if tstLoss <= bestLoss:
            # keep the model with the lowest test loss seen so far
            bestLoss = tstLoss
            net.save('bestModel')
            bestParams = nr

        hist = hist.append(nr, ignore_index=True)

        # re-order columns...
        hist = hist[saveCols]

    return hist
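
# A hedged usage sketch of doNetRun: the module-level globals it relies on
# (*train*, *valid*, *test* as (X, y) pairs, *numFeatures*, *numCats*,
# *kountRuns*, *bestLoss*, *bestParams*) must be set first; the
# hyper-parameter values below are illustrative, not the original grid:
#
#     kountRuns = 0
#     bestLoss = np.Inf
#     bestParams = None
#     numFeatures = train[0].shape[1]
#     numCats = 2
#     hist = doNetRun(DoPretrain=False, actType='relu', numHidden=2,
#                     numNodes=64, dropOut=0.5, NumReps=3,
#                     AdjustWforDropout=False, L1=0.0, L2=1e-4,
#                     LearningRate=0.01, Momentum=0.9, Algorithm='nag',
#                     maxUpdate=1000, BatchSize=32, Patience=5,
#                     MinImprovement=0.001, ValidateEveryN=1)
#     hist.to_csv('netRuns.csv', index=False)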