def CrossValidateSMOTE(data, labels, clf, folds=10, runSMOTE=True):
    """Stratified k-fold cross-validation with optional borderline-SMOTE
    oversampling of each training fold.

    :param data: feature matrix (numpy array or pandas DataFrame).
    :param labels: binary class labels; 1 marks the minority ("bug") class.
    :param clf: sklearn-style classifier exposing fit/predict.
    :param folds: number of stratified folds.
    :param runSMOTE: when True, synthesize minority samples in each
        training fold so the classes are balanced before fitting.
    :returns: tuple ``(acc, fmeasure, recall, precision, cm)`` — four lists
        of per-fold scores and the 2x2 confusion matrix summed over folds.
    """
    from unbalanced_dataset import SMOTE
    from sklearn.metrics import confusion_matrix as confmat
    from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

    # Accept DataFrames transparently; downstream indexing needs ndarrays.
    if type(data) is not np.ndarray:
        data = data.as_matrix()
    if type(labels) is not np.ndarray:
        labels = labels.as_matrix().ravel()

    skf = StratifiedKFold(labels, n_folds=folds, shuffle=False)
    sets = [{'train': train, 'test': test} for train, test in skf]

    acc = []
    fmeasure = []
    recall = []
    precision = []
    cm = np.array([0, 0, 0, 0]).reshape(2, 2)
    for fold in sets:
        data_train = data[fold['train']]
        labels_train = labels[fold['train']]
        # Majority/minority ratio tells SMOTE how many synthetic minority
        # samples to generate (assumes at least one positive per fold).
        bugs = sum(labels_train)
        ratio = float(len(labels_train) - bugs) / bugs
        data_test = data[fold['test']]
        labels_test = labels[fold['test']]
        if runSMOTE:
            smote = SMOTE(ratio=ratio, verbose=False, kind='borderline1')
            data_train, labels_train = smote.fit_transform(data_train, labels_train)
        clf.fit(data_train, labels_train)
        hypot = clf.predict(data_test)
        # BUG FIX: sklearn metrics take (y_true, y_pred). The original
        # passed (hypot, labels_test); accuracy and binary F1 are symmetric
        # under that swap, but precision and recall were exchanged.
        # Order now matches the confmat call below.
        acc.append(accuracy_score(labels_test, hypot))
        fmeasure.append(f1_score(labels_test, hypot))
        recall.append(recall_score(labels_test, hypot))
        precision.append(precision_score(labels_test, hypot))
        cm += confmat(labels_test, hypot)
    return acc, fmeasure, recall, precision, cm
def doNetRun(DoPretrain, actType, numHidden, numNodes, dropOut, NumReps, AdjustWforDropout, L1, L2, LearningRate, Momentum, Algorithm, maxUpdate, BatchSize, Patience, MinImprovement, ValidateEveryN):
    '''Performs *NumReps* neural network training runs with the given
    hyper-parameters, saving the best model seen so far (tracked via the
    module-level globals ``bestLoss`` / ``bestParams``).

    Relies on module-level state: ``numFeatures``, ``numCats`` (network
    input/output sizes), ``train``/``valid``/``test`` ((X, y) tuples),
    ``kountRuns``, ``bestLoss``, ``bestParams``, plus the imported names
    ``thts`` (theanets), ``pd``, ``np``, ``inspect``, ``random``, ``time``,
    ``log_loss`` and ``confmat``.  TODO(review): confirm these are defined
    at module scope — they are not visible in this chunk.

    :params: NN hyper-parameters (one positional argument each).
    :returns: DataFrame with one row per run — the hyper-parameter values
        plus train/validation/test loss, test accuracy, wall time, and the
        epoch of the last validation improvement.'''
    # Network topology: input size, numHidden identical hidden layers,
    # output layer with one unit per class.
    layerDef = dict(size=numNodes, activation=actType)
    netDef = []
    netDef.append(numFeatures)
    for _ in range(numHidden):
        netDef.append(layerDef)
    netDef.append(numCats)
    # Capture this call's own argument names/values via frame introspection
    # so every hyper-parameter is echoed and logged in the result frame.
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    saveCols = []
    for arg in args:
        print arg, ': ', values[arg], ',',
        saveCols.append(arg)
    print
    # Result columns = hyper-parameters + per-run performance measures.
    saveCols = saveCols + \
        ['TrLoss', 'VldLoss', 'TstLoss', 'TestAcc', 'Time', 'BestEpoch']
    hist = pd.DataFrame(columns=saveCols)
    for _ in range(NumReps):
        global kountRuns
        kountRuns = kountRuns + 1
        # use new seed for each run
        ii32 = np.iinfo(np.int32)
        seed = random.randint(0, ii32.max)
        print 'seed: ', seed
        # although numpy.random.seed should be uint32, Theanets only checks
        # for int and fails if given a uint32 stored as a long in Python
        net = thts.Classifier(layers=netDef, rng=seed)
        t0 = time.clock()
        Epoch = 0
        if DoPretrain:
            # Phase I: greedy layerwise pre-training with the same
            # regularization settings as the main run.
            print('Train phase I:')
            net.train(train, valid,
                      patience=Patience,
                      learning_rate=LearningRate,
                      momentum=Momentum,
                      min_improvement=MinImprovement,
                      validate_every=ValidateEveryN,
                      max_updates=maxUpdate,
                      input_dropout=dropOut,
                      hidden_dropout=dropOut,
                      algo='layerwise',
                      weight_l1=L1,  # L1 norm sparsity
                      weight_l2=L2,  # L2 norm weight decay
                      batch_size=BatchSize)
        # Phase II: main training loop; iterate epoch-by-epoch so we can
        # track the last epoch that improved validation loss.
        print('Train phase II:')
        Epoch = 0
        lastLoss = np.Inf
        lastEpoch = 0
        for tr, vl in net.itertrain(train, valid,
                                    patience=Patience,
                                    learning_rate=LearningRate,
                                    momentum=Momentum,
                                    min_improvement=MinImprovement,
                                    validate_every=ValidateEveryN,
                                    max_updates=maxUpdate,
                                    input_dropout=dropOut,
                                    hidden_dropout=dropOut,
                                    algo=Algorithm,
                                    weight_l1=L1,  # L1 norm sparsity
                                    weight_l2=L2,  # L2 norm weight decay
                                    batch_size=BatchSize):
            Epoch = Epoch + 1
            vloss = vl['loss']
            # Record the epoch whenever validation loss improves by at
            # least MinImprovement; flag it in the console trace.
            if (lastLoss - vloss) >= MinImprovement:
                lastLoss = vloss
                lastEpoch = Epoch
                flg = ' *' + str(lastEpoch)
            else:
                flg = ''
            print Epoch, 'trLoss: %.4f' % tr['loss'], ' vlLoss: %.4f' % vloss, \
                ' vlacc: %.4f' % vl['acc'], flg
        t1 = time.clock() - t0
        print 'Time: ', t1, ' Epochs:', Epoch
        if AdjustWforDropout:
            # Scale non-input weights by the keep probability so inference
            # compensates for dropout used during training.
            fact = 1.0 - dropOut
            for ll in net.layers:
                if (ll.name != 'in'):
                    w = net.find(ll.name, 'w')
                    w.set_value(w.get_value() * fact)
        # Final losses on all three splits (after any weight rescaling).
        X, y = train
        trnLoss = log_loss(y, net.predict_proba(X))
        X, y = valid
        vldLoss = log_loss(y, net.predict_proba(X))
        X, y = test
        ypp = net.predict_proba(X)
        yp = net.predict(X)
        acc = net.score(X, y)
        tstLoss = log_loss(y, ypp)
        print Epoch, 'trLoss: %.4f' % trnLoss, ' vlLoss: %.4f' % vldLoss, \
            ' vlacc: %.4f' % acc
        print 'Best Epoch: ', lastEpoch
        cf = confmat(y, yp)
        print 'Test-set confusion matrix:'
        print cf
        # Record this run; persist the model if it beats the best test
        # loss seen so far across all calls (global state).
        global bestLoss
        global bestParams
        dta = dict()
        for arg in args:
            dta[arg] = values[arg]
        dta['TrLoss'] = trnLoss
        dta['VldLoss'] = vldLoss
        dta['TstLoss'] = tstLoss
        dta['TestAcc'] = acc
        dta['Time'] = t1
        dta['BestEpoch'] = lastEpoch
        nr = pd.DataFrame([dta])
        if (tstLoss <= bestLoss):
            bestLoss = tstLoss
            net.save('bestModel')
            bestParams = nr
        hist = hist.append(nr, ignore_index=True)
    # re-order columns...
    hist = hist[saveCols]
    return hist