def catClassify(botData, kernelType, nTopic):
    X = botData[:, :nTopic]
    y = botData[:, nTopic]

    # Run classifier
    #    classifier = svm.SVC(kernel='linear', probability=True)

    classifier = svm.NuSVC(probability=True)

    #cross-validation
    cv = StratifiedKFold(y, k=nFold)
    #select classifier
    #classifier = svm.SVC(kernel=kernelType, probability=True)

    metricstemp = np.zeros(nFold, np.float)  # one PR-AUC value per fold
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        #fpr, tpr, thresholds = roc_curve(y[test], probas_[:,1]) #@UnusedVariable
        #roc_auc = auc(fpr, tpr)
        precision, recall, thresholds = precision_recall_curve(
            y[test], probas_[:, 1])  #@UnusedVariable
        pr_auc = auc(recall, precision)
        metricstemp[i] = pr_auc

    return [np.mean(metricstemp), np.std(metricstemp)]
def svmClassify(data,probability=True,kernelType='rbf',nFold=10,beta=1,nMetrics=3):
    X = data[:,:-1]
    y = data[:,-1]

    clfParamList = {
        'kernel': kernelType,
        'gamma': 1e-3,
        'C': 1,
        'degree': 4,
        'probability': probability,
        'shrinking': True,
        'cache_size': 10000
    }
    classifier = SVC(**clfParamList)
    
    #cross-validation
    cv = StratifiedKFold(y, k=nFold)
        
    metricstemp = np.zeros((nFold,nMetrics),np.float)
    for i, (train, test) in enumerate(cv):
        clf = classifier.fit(X[train], y[train])
        ypred = clf.predict(X[test])
        f1score = f1_score(y[test], ypred)
        if probability:
            probas_ = clf.predict_proba(X[test])
            precision, recall, thresholds = precision_recall_curve(y[test], probas_[:,1]) #@UnusedVariable
            fpr,tpr,thresholds = roc_curve(y[test],probas_[:,1]) #@UnusedVariable
            roc_auc = auc(fpr,tpr)
            pr_auc = auc(recall,precision)
        else:
            fpr,tpr,thresholds = roc_curve(y[test],ypred) #@UnusedVariable
            roc_auc = auc(fpr,tpr)
            pr_auc = 0.  # PR-AUC needs probability estimates, not available here

        metricstemp[i] = [f1score,roc_auc,pr_auc]
    
    return metricstemp
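
# Illustrative usage sketch: build a small synthetic binary data set with the
# labels stored in the last column, run svmClassify on it, and summarise the
# per-fold metrics. The demo names (Xdemo, ydemo, demoData) are placeholders;
# numpy and the classifier/metric imports used above are assumed to be in scope.
nSamplesDemo, nFeaturesDemo = 200, 10
Xdemo = np.random.randn(nSamplesDemo, nFeaturesDemo)
ydemo = (Xdemo[:, 0] + 0.5 * np.random.randn(nSamplesDemo) > 0).astype(np.int)
demoData = np.hstack((Xdemo, ydemo.reshape(-1, 1)))
foldMetrics = svmClassify(demoData, probability=True, kernelType='rbf', nFold=5)
print 'mean [f1, roc_auc, pr_auc]:', np.mean(foldMetrics, axis=0)
print 'std  [f1, roc_auc, pr_auc]:', np.std(foldMetrics, axis=0)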
def catClassify(dataPath, catname, kernelType, dataext, catmap, nTopic):
    #read the category data, which will form the positive class
    fname = dataPath + catname + dataext
    catpos = np.genfromtxt(fname, dtype=np.int)  # catpos
    catpos = catpos[:, :nTopic + 1]
    catpos[:, nTopic] = 1
    #read the category data of the remaining classes
    firstvisit = True
    for cats in catmap.keys():
        if (cats != catname):
            fname = dataPath + cats + dataext
            if (firstvisit):
                catneg = np.genfromtxt(fname, dtype=np.int)
                firstvisit = False
            else:
                catneg = np.concatenate(
                    (catneg, np.genfromtxt(fname, dtype=np.int)), axis=0)
    #sample the negative data to have equal size as the positive
    nPos = catpos.shape[0]
    nNeg = catneg.shape[0]
    catneg = catneg[np.random.randint(0, nNeg, nPos), :]  #catneg
    catneg = catneg[:, :nTopic + 1]
    catneg[:, nTopic] = 0
    #combine positive and negative data
    data = np.concatenate((catpos, catneg), axis=0)
    #shuffle the rows to aid in random selection of train and test
    np.random.shuffle(data)

    X = data[:, :nTopic]
    y = data[:, nTopic]

    #cross-validation
    cv = StratifiedKFold(y, k=nFold)
    #select classifier
    classifier = svm.SVC(kernel=kernelType, probability=True)
    metricstemp = np.zeros((nFold, nMetrics), np.float)
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:,
                                                          1])  #@UnusedVariable
        roc_auc = auc(fpr, tpr)
        precision, recall, thresholds = precision_recall_curve(
            y[test], probas_[:, 1])  #@UnusedVariable
        pr_auc = auc(recall, precision)
        metricstemp[i] = [roc_auc, pr_auc]

    return [np.mean(metricstemp, axis=0), np.std(metricstemp, axis=0)]
    def ParameterGridSearch(self, callback=None, nValidation=5):
        '''
        Grid search for the best C and gamma parameters for the RBF Kernel.
        The efficiency of the parameters is evaluated using nValidation-fold
        cross-validation of the training data.
    
        As this process is time consuming and parallelizable, a number of
        threads equal to the number of cores in the computer is used for the
        calculations
        '''
        from scikits.learn.grid_search import GridSearchCV
        from scikits.learn.metrics import precision_score
        from scikits.learn.cross_val import StratifiedKFold
        #
        # XXX: program crashes with >1 worker when running cpa.py
        #      No crash when running from classifier.py. Why?
        #
        n_workers = 1
        #try:
        #    from multiprocessing import cpu_count
        #    n_workers = cpu_count()
        #except:
        #    n_workers = 1

        # Define the parameter ranges for C and gamma and perform a grid search for the optimal setting
        parameters = {
            'C': 2**np.arange(-5, 11, 2, dtype=float),
            'gamma': 2**np.arange(3, -11, -2, dtype=float)
        }
        clf = GridSearchCV(SVC(kernel='rbf'),
                           parameters,
                           n_jobs=n_workers,
                           score_func=precision_score)
        clf.fit(self.svm_train_values,
                self.svm_train_labels,
                cv=StratifiedKFold(self.svm_train_labels, nValidation))

        # Pick the best parameters as the ones with the maximum cross-validation rate
        bestParameters = max(clf.grid_scores_, key=lambda a: a[1])
        bestC = bestParameters[0]['C']
        bestGamma = bestParameters[0]['gamma']
        logging.info('Optimal values: C=%s g=%s rate=%s' %
                     (bestC, bestGamma, bestParameters[1]))
        return bestC, bestGamma
def knnClassify(data,nFold=10,nNeighbor=10,nMetrics=2):
    X = data[:,:-1]
    y = data[:,-1]
    
    clfParamList = {'n_neighbors':nNeighbor,'algorithm':'auto'}
    classifier = NeighborsClassifier(**clfParamList)
    
    #cross-validation
    cv = StratifiedKFold(y, k=nFold)
        
    metricstemp = np.zeros((nFold,nMetrics),np.float)
    for i, (train, test) in enumerate(cv):
        clf = classifier.fit(X[train], y[train])
        ypred = clf.predict(X[test])
        f1score = f1_score(y[test],ypred)
        fpr,tpr,thresholds = roc_curve(y[test],ypred) #@UnusedVariable
        roc_auc = auc(fpr,tpr)
        metricstemp[i] = [f1score,roc_auc]
    
    return metricstemp
Example #6
        v, w = np.linalg.eigh(gmm.covars[n][:2, :2])
        u = w[0] / np.linalg.norm(w[0])
        angle = np.arctan(u[1]/u[0])
        angle = 180 * angle / np.pi # convert to degrees
        v *= 9
        ell = mpl.patches.Ellipse(gmm.means[n, :2], v[0], v[1], 180 + angle,
                                  color=color)
        ell.set_clip_box(ax.bbox)
        ell.set_alpha(0.5)
        ax.add_artist(ell)

iris = datasets.load_iris()

# Break up the dataset into non-overlapping training (75%) and testing
# (25%) sets.
skf = StratifiedKFold(iris.target, k=4)
# Only take the first fold.
train_index, test_index = skf.__iter__().next()


X_train = iris.data[train_index]
y_train = iris.target[train_index]
X_test = iris.data[test_index]
y_test = iris.target[test_index]

n_classes = len(np.unique(y_train))

# Try GMMs using different types of covariances.
classifiers = dict((x, GMM(n_states=n_classes, cvtype=x))
                    for x in ['spherical', 'diag', 'tied', 'full'])
def main():
    try:
        kernelType = sys.argv[1]
    except(IndexError):
        kernelType='linear'
    
    #catmap = getCatMap(dataset)
    #initialise output matrices
    rocauc = np.zeros((nDim,nCategory),dtype=np.float32)
    mapauc = np.zeros((nDim,nCategory),dtype=np.float32)
    
    nSamplesPerCat = int(np.round(nClusterSamples/nCategory))
    for iLDim,ldim in enumerate(ldims):
        #write the lower dimensional projections for each category
        for iCategory,catname in enumerate(catList):
            dataOuttemp = dimred(iCategory,catname,ldim)
            dataOut = np.array(np.round(dataOuttemp).astype(np.int16),dtype=np.int16)
            outFilename = tempPath+catname+'.'+dataExt
            np.savetxt(outFilename, dataOut, delimiter=' ', fmt='%d')
            if(dataOut.shape[0] <= nSamplesPerCat):
                catSample = dataOut
            else:
                rndsample = np.random.randint(0,dataOut.shape[0],nSamplesPerCat)
                catSample = dataOut[rndsample,:]
            if(iCategory==0):
                dataLower = catSample
            else:
                dataLower = np.concatenate((dataLower,catSample),axis=0)
        #cluster random sampled lower dimensional data
        # compute the code-book for the data-set
        [CodeBook,label] = kmeans2(dataLower,nCodewords,iter=nIterKmeans,minit='points',missing='warn') #@UnusedVariable
        # write code-book to file
        cbfilepath = tempPath+dataset+codebookext
        cbfile = open(cbfilepath,'w')
        np.savetxt(cbfile,CodeBook,fmt='%f', delimiter=' ',)
        cbfile.close()
        
        for iCategory,catname in enumerate(catList):
            tempFilename = tempPath+catname+'.'+dataExt
            catData = np.loadtxt(tempFilename, dtype=np.int16, delimiter=' ')
            [catLabel,catDist] = vq(catData,CodeBook) #@UnusedVariable
            catfilePath = dataPath+catname+'.'+dataExt
            catImgId = np.genfromtxt(catfilePath,dtype=np.int,usecols=[-2])
            catId = np.genfromtxt(catfilePath,dtype=np.int,usecols=[-1])[0]
            ImgId = np.unique(catImgId)
            catboffilepath = tempPath+catname+bofext
            imgcount=0
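            #build one bag-of-features row per image: the histogram of codeword
            #assignments (nCodewords counts) followed by the image id and the
            #category id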
            for imgid in ImgId:
                imgLabel = catLabel[catImgId==imgid]
                [hist,edges] = np.histogram(imgLabel,nCodewords) #@UnusedVariable
                if imgcount==0:
                    dataout = np.hstack((hist.T,imgid,catId))
                else:
                    dataout = np.vstack((dataout,np.hstack((hist.T,imgid,catId))))
                imgcount+=1
            np.savetxt(catboffilepath, dataout, fmt='%d', delimiter=' ', )
        
        #keep the nCodewords histogram columns plus the category id column,
        #skipping the image id stored at column nCodewords
        select = np.concatenate((np.arange(nCodewords),[nCodewords+1]),axis=0)
        for iCategory,catname in enumerate(catList):
            #posLabel = catmap.get(catname)
            #negLabel = 0
            #read the category data, which will form the positive class
            catboffilepath = tempPath+catname+bofext
            catpos = np.genfromtxt(catboffilepath,dtype=np.int)   
            catpos = catpos.take(select,axis=1)
            catpos[:,-1] = 1
            #posLabel = catpos[0][-1]
            catset = set(catList)
            catset.remove(catname)
            firstvisit = True
            for cat in catset:
                catboffilepath = tempPath+cat+bofext
                if(firstvisit):
                    catneg = np.genfromtxt(catboffilepath,dtype=np.int)
                    firstvisit = False
                else : 
                    catneg = np.concatenate((catneg,np.genfromtxt(catboffilepath,dtype=np.int)),axis=0)
                
            #sample the negative data to have equal size as the positive
            nPos = catpos.shape[0]
            nNeg = catneg.shape[0]
            catneg = catneg[np.random.randint(0,nNeg,nPos),:]
            catneg = catneg.take(select,axis=1)
            catneg[:,-1] = -1
            #combine positive and negative data
            data = np.concatenate((catpos,catneg),axis=0)
            
            #shuffle the rows to aid in random selection of train and test
            #np.random.shuffle(data)
            
            X = data[:,:nCodewords]
            y = data[:,nCodewords]
            #labels for cross validation
            
            #y2 = np.where(y!=posLabel,0,y)
            #y2 = np.where(y2==posLabel,1,y2)
            
            #cross-validation
            cv = StratifiedKFold(y, k=nFold)
            #select classifier
            classifier = svm.SVC(kernel=kernelType, probability=True)
            metricstemp = np.zeros((nFold,nMetrics),np.float)
            
            for i, (train, test) in enumerate(cv):
                probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
                print y[test]
                print probas_[:,1]
                try:
                    fpr, tpr, thresholds = roc_curve(y[test], probas_[:,1]) #@UnusedVariable
                    roc_auc = auc(fpr, tpr)
                except:
                    roc_auc = 0.
                try:
                    precision, recall, thresholds = precision_recall_curve(y[test], probas_[:,1]) #@UnusedVariable
                    pr_auc = auc(recall,precision)
                except:
                    pr_auc = 0.
                metricstemp[i] = [roc_auc,pr_auc]
                
            rocauc[iLDim,iCategory] = np.mean(metricstemp[:,0],axis=0)
            mapauc[iLDim,iCategory] = np.mean(metricstemp[:,1],axis=0)
            print '%s classified...' % (catname)
     
    outPath = rootDir + dataset + outDir + '%s%s%s%s'%('dimensionality',dataset,kernelType,'.svg')
    outPath1 = rootDir + dataset + outDir + '%s%s%s%s' % ('dimensionality',dataset,kernelType,'.npz') 
    plt.figure(0)
    #ax = plt.subplot(111)
    plt.errorbar(np.arange(1,nDim+1), np.mean(rocauc,axis=1), np.std(rocauc,axis=1), fmt = '-', elinewidth=1, marker = 'x', label = 'AUC-ROC')
    plt.errorbar(np.arange(1,nDim+1), np.mean(mapauc,axis=1), np.std(mapauc,axis=1), fmt = '--', elinewidth=1, marker = 'o', label = 'MAP')
    plt.xlabel('Number of dimensions')
    plt.ylabel('Performance Metric')
    plt.title('BOF Performance: %s : %s' % (dataset,kernelType))
    plt.legend(loc="lower right")
    #ax.set_xticks()
    #ax.set_xticklabels(ldim,size='small',ha='center')
    plt.savefig(outPath,format='svg')
    try:
        np.savez(outPath1,rocauc,mapauc)
    except:
        print 'unable to write file %s' % (outPath1)
    

    plt.show()
    plt.close()
##############################################################################
# Loading a dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target
n_classes = np.unique(y).size

# Some noisy data not correlated
random = np.random.RandomState(seed=0)
E = random.normal(size=(len(X), 2200))

# Add noisy data to the informative features to make the task harder
X = np.c_[X, E]

svm = SVC(kernel='linear')
cv = StratifiedKFold(y, 2)

score, permutation_scores, pvalue = permutation_test_score(svm,
                                                           X,
                                                           y,
                                                           zero_one_score,
                                                           cv=cv,
                                                           n_permutations=100,
                                                           n_jobs=1)

print "Classification score %s (pvalue : %s)" % (score, pvalue)

###############################################################################
# View histogram of permutation scores
pl.hist(permutation_scores, label='Permutation scores')
ylim = pl.ylim()
Example #9
def test_algorithm(algorithm, results, train_data, train_target, test_data):
    algorithm.fit(train_data, train_target)
    y_pred = algorithm.predict(test_data)
    results.append(precision(y_pred))

def precision(y_pred):
    # fraction of correct predictions on the current test fold (i.e. accuracy)
    prec = sum(y_pred == test_target)
    return float(prec) / len(test_target)

svmclf = svm.SVC()
logisticclf = LogisticRegression()
nnclf = neighbors.Neighbors()
svmli = []
logli = []
nnli = []

cv = StratifiedKFold(iris.target, 20)
for train_index, test_index in cv:
    train_data = iris.data[train_index]
    train_target = iris.target[train_index]
    test_data = iris.data[test_index]
    test_target = iris.target[test_index]

    #svm
    test_algorithm(svmclf, svmli, train_data, train_target, test_data)

    #logistic regression
    test_algorithm(logisticclf, logli, train_data, train_target, test_data)
    
    #NN
    test_algorithm(nnclf, nnli, train_data, train_target, test_data)
from scikits.learn.cross_val import StratifiedKFold
from scikits.learn.feature_selection import RFECV
from scikits.learn.datasets import samples_generator
from scikits.learn.metrics import zero_one

################################################################################
# Loading a dataset

X, y = samples_generator.test_dataset_classif(n_features=500, k=5, seed=0)

################################################################################
# Create the RFE object and compute a cross-validated score

svc = SVC(kernel='linear')
rfecv = RFECV(estimator=svc, n_features=2, percentage=0.1, loss_func=zero_one)
rfecv.fit(X, y, cv=StratifiedKFold(y, 2))

print 'Optimal number of features : %d' % rfecv.support_.sum()

import pylab as pl
pl.figure()
pl.semilogx(rfecv.n_features_, rfecv.cv_scores_)
pl.xlabel('Number of features selected')
pl.ylabel('Cross validation score (nb of misclassifications)')
# 15 ticks regularly spaced on a log scale
x_ticks = np.unique(
    np.logspace(
        np.log10(2),
        np.log10(rfecv.n_features_.max()),
        15,
    ).astype(np.int))
def process_speaker(db):
    TEST_FOLD = 2
    #db = connect_to_database()
    #db.add_son_manipulator(TransformToBinary())
    #db = NeoEngine('/data/neo4j')

    #Impostor ratio is ratio of impostor records in training
    #and testing population.  For ratio=N, subject is 1/(N+1),
    #impostor N/(N+1) of population
    IMPOSTOR_RATIO = 3
    LIMIT = 30
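    #e.g. with IMPOSTOR_RATIO = 3, a subject contributing 40 SVs is paired with
    #120 randomly drawn impostor SVs, so the subject makes up 1/4 and the
    #impostors 3/4 of the pooled observations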

    #Set up queue
    ctx = zmq.Context()
    q = ctx.socket(zmq.PULL)
    q.connect("tcp://127.0.0.1:5555")
    outQ = ctx.socket(zmq.PUSH)
    outQ.connect("tcp://127.0.0.1:5556")

    while True:
        speaker_name = q.recv_json()
        print 'Received %s' % speaker_name

        #time.sleep(random.randint(1,10))
        #Find all SVs for current subject
        #print 'Speaker Name: %s' % speaker.name
        #print 'Count:', db.sv.find({'speaker_name': speaker.name}).count()
        #cursor_subject = Concurrent_cursor(SV.objects(speaker_name=speaker.name))
        sv_subject = stack_SVs(db.get('sv', {'speaker_name': speaker_name}),
                               limit=LIMIT)
        num_subject = np.size(sv_subject, 0)
        print num_subject
        if num_subject < 20:
            continue

        #Get random SVs from rest of database for test population
        #cursor_impostor = db.sv.find({'speaker_name': {'$ne': speaker['name']}})
        sv_impostor = stack_SVs_random(db, speaker_name,
                                       num_subject * IMPOSTOR_RATIO)
        num_impostor = np.size(sv_impostor, 0)
        print 'Subject: %i, Impostor: %i' % (num_subject, num_impostor)

        #generate total dataset of observations X with class labels y
        X = np.vstack((sv_subject, sv_impostor))
        y = np.array([1] * num_subject + [0] * num_impostor)

        #Pick random assortment from each set to form training observations
        #Switch ensures that smaller number always used for training
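        #(the first StratifiedKFold split puts roughly 1/TEST_FOLD of the data
        # in the test part, so for TEST_FOLD >= 3 that smaller part is swapped
        # into the training role)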
        if TEST_FOLD < 3:
            train, test = iter(StratifiedKFold(y, TEST_FOLD)).next()
        else:
            test, train = iter(StratifiedKFold(y, TEST_FOLD)).next()

        #Perform crossvalidated SVM training
        #print type(X), type(y)
        #print np.shape(X[train]), np.shape(y[train])

        clf = train_svm_crossvalidated(X[train], y[train])
        #print type(clf)

        #clf_rec = {'classifier': SVMModelField(clf), 'speaker_name': speaker.name}
        #db.svm.insert(clf_rec, safe=True)

        #Collect classification statistics
        accuracy = test_svm_accuracy(X[test], y[test], clf)
        num_subject_test = np.sum(y[test])
        num_impostor_test = len(y[test]) - num_subject_test
        print 'Accuracy: %f' % (float(accuracy['correct_subject']) /
                                float(num_subject_test))
        #print 'Sub: %i/%i  Imp: %i/%i' % (accuracy['correct_subject'], num_subject_test, accuracy['correct_impostor'], num_impostor_test)
        #print 'False Neg: %i  False Pos: %i' % (accuracy['false_neg'], accuracy['false_pos'])
        msg = {
            'speaker_name': speaker_name,
            'accuracy': accuracy,
            'num_subject': num_subject,
            'num_subject_test': num_subject_test,
            'num_impostor_test': num_impostor_test
        }
        outQ.send_pyobj(msg)
Example #12
def blWord():
    (options, args) = parser.parse_args(sys.argv[1:])  #@UnusedVariable
    dataset = options.dataset

    kernelType = options.kernelType
    nFold = options.nFold
    nCodeword = options.nCodeword

    dataPath = rootDir + dataset + bofDir
    catmap = getCatMap(dataset)
    catList = catmap.keys()
    dataext = str(nCodeword) + bofext
    nCategory = len(catList)

    perfMean = np.zeros(nCategory)
    perfStd = np.zeros(nCategory)

    for iCat, catname in enumerate(catList):
        print catname
        #read the category data, which will form the positive class
        fname = dataPath + catname + dataext
        catpos = np.genfromtxt(fname, dtype=np.int)  # catpos
        catpos = catpos[:, :nCodeword + 1]
        catpos[:, nCodeword] = 1
        #read the category data of the remaining classes
        firstvisit = True
        for cats in catList:
            if (cats != catname):
                fname = dataPath + cats + dataext
                if (firstvisit):
                    catneg = np.genfromtxt(fname, dtype=np.int)
                    firstvisit = False
                else:
                    catneg = np.concatenate(
                        (catneg, np.genfromtxt(fname, dtype=np.int)), axis=0)
        #sample the negative data to have equal size as the positive
        nPos = catpos.shape[0]
        nNeg = catneg.shape[0]
        catneg = catneg[np.random.randint(0, nNeg, nPos), :]  #catneg
        catneg = catneg[:, :nCodeword + 1]
        catneg[:, nCodeword] = 0
        #combine positive and negative data
        data = np.concatenate((catpos, catneg), axis=0)
        #shuffle the rows to aid in random selection of train and test
        np.random.shuffle(data)

        X = data[:, :nCodeword]
        y = data[:, nCodeword]

        clfParamList = {
            'kernel': kernelType,
            'gamma': 1e-3,
            'C': 1,
            'degree': 4,
            'probability': True,
            'shrinking': True,
            'cache_size': 1000
        }
        classifier = SVC(**clfParamList)
        cv = StratifiedKFold(y, k=nFold)
        avgprec = np.zeros(nFold)
        for icv, (train, test) in enumerate(cv):
            clf = classifier.fit(X[train], y[train])
            probas_ = clf.predict_proba(X[test])
            precision, recall, thresholds = precision_recall_curve(
                y[test], probas_[:, 1])  #@UnusedVariable
            avgprec[icv] = auc(recall, precision)
        perfMean[iCat] = np.mean(avgprec)
        perfStd[iCat] = np.std(avgprec)
    return [perfMean, perfStd]
def test_accuracy_concurrent(worker, concurrency):
    TEST_FOLD = 2
    RUN_MAX = False
    iter_num = 0
    db = connect_to_database()
    db.add_son_manipulator(TransformToBinary())
    

    #take this out later when testing complete
    fid = open('/home/ubuntu/project/backend-search/src/spkrec/utils/hist'+str(worker)+'.csv', 'wb')
    csv_writer = csv.writer(fid)
    if worker == 0:
        csv_writer.writerow(['name', 'num speaker SVs', 'test subjects', 'test impostors', 'correct subject', 'correct impostor', 'false neg', 'false pos'])

    cursor = Concurrent_cursor(Speaker.objects())
    cursor.set_concurrency(concurrency)
    cursor.set_worker(worker)


    for speaker in cursor:
        #Impostor ratio is ratio of impostor records in training
        #and testing population.  For ratio=N, subject is 1/(N+1),
        #impostor N/(N+1) of population
        IMPOSTOR_RATIO = 3

        #Find all SVs for current subject
        print 'Speaker Name: %s' % speaker.name
        #print 'Count:', db.sv.find({'speaker_name': speaker.name}).count()
        #cursor_subject = Concurrent_cursor(SV.objects(speaker_name=speaker.name))
        sv_subject = stack_SVs(db.sv.find({'speaker_name': speaker.name}))
        num_subject = np.size(sv_subject,0)
        
        #csv_writer.writerow([speaker.name, num_subject])
        #print num_subject
        if num_subject < 20:
            continue
        
        #Get random SVs from rest of database for test population
        #cursor_impostor = db.sv.find({'speaker_name': {'$ne': speaker['name']}})
        sv_impostor = stack_SVs_random(db, speaker.name, num_subject*IMPOSTOR_RATIO)
        num_impostor = np.size(sv_impostor,0)
        print 'Subject: %i, Impostor: %i' % (num_subject, num_impostor)

        #generate total dataset of observations X with class labels y
        X = np.vstack((sv_subject, sv_impostor))
        y = np.array([1] * num_subject + [0] * num_impostor)

        #Pick random assortment from each set to form training observations
        #Switch ensures that smaller number always used for training
        if TEST_FOLD < 3:
            train, test = iter(StratifiedKFold(y, TEST_FOLD)).next()
        else:
            test, train = iter(StratifiedKFold(y, TEST_FOLD)).next()
        #print train

        #Perform crossvalidated SVM training
        #print type(X), type(y)
        #print np.shape(X[train]), np.shape(y[train])

        clf = train_svm_crossvalidated(X[train], y[train])
        #print type(clf)

        #clf_rec = {'classifier': SVMModelField(clf), 'speaker_name': speaker.name}
        #db.svm.insert(clf_rec, safe=True)

        #Collect classification statistics
        accuracy = test_svm_accuracy(X[test], y[test], clf)
        num_subject_test = np.sum(y[test])
        num_impostor_test = len(y[test]) - num_subject_test
        print 'Accuracy: %f' % (float(accuracy['correct_subject'])/float(num_subject_test))
        print 'Sub: %i/%i  Imp: %i/%i' % (accuracy['correct_subject'], num_subject_test, accuracy['correct_impostor'], num_impostor_test)
        print 'False Neg: %i  False Pos: %i' % (accuracy['false_neg'], accuracy['false_pos'])

        csv_writer.writerow([speaker.name, num_subject, num_subject_test, num_impostor_test, accuracy['correct_subject'], accuracy['correct_impostor'], accuracy['false_neg'], accuracy['false_pos']])
        iter_num = iter_num + 1

        #if RUN_MAX and iter_num >= RUN_MAX:
        #    print "I'm breaking"
        #    break
        #print num_subject, num_impostor
    fid.close()
    print "Complete"
y = digits.target

################################################################################
# Create the RFE object and compute a cross-validated score, compared to a
# univariate feature selection

rfe = RFE(estimator=SVC(kernel="linear", C=1), n_features=10, percentage=0.1)
anova_filter = UnivariateFilter(SelectKBest(k=10), f_classif)
clf = SVC(kernel="linear",C=1)

y_pred_rfe = []
y_pred_univ = []
y_true = []
for train, test in StratifiedKFold(y, 2):
    Xtrain, ytrain, Xtest, ytest = X[train], y[train], X[test], y[test]

    ### Fit and predict rfe
    support = rfe.fit(X[train], y[train]).support_
    y_pred_rfe.append(clf.fit(X[train,support],y[train]).predict(
          X[test,support]))

    ### Fit and predict univariate feature selection
    xr = anova_filter.fit(Xtrain, ytrain).transform(Xtrain)
    y_pred_univ.append(clf.fit(Xtrain[:,anova_filter.support_],ytrain).predict(
          Xtest[:,anova_filter.support_]))
    y_true.append(ytest)

y_pred_univ = np.concatenate(y_pred_univ)
y_true = np.concatenate(y_true)
Example #15
from scikits.learn.metrics import precision_score
from scikits.learn.metrics import recall_score
from scikits.learn.svm import SVC

################################################################################
# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, turning
# the data into a (samples, features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# split the dataset into two equal parts respecting label proportions
train, test = iter(StratifiedKFold(y, 2)).next()

################################################################################
# Set the parameters by cross-validation
tuned_parameters = [{
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4],
    'C': [1, 10, 100, 1000]
}, {
    'kernel': ['linear'],
    'C': [1, 10, 100, 1000]
}]

scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]
        v *= 9
        ell = mpl.patches.Ellipse(gmm.means[n, :2],
                                  v[0],
                                  v[1],
                                  180 + angle,
                                  color=color)
        ell.set_clip_box(ax.bbox)
        ell.set_alpha(0.5)
        ax.add_artist(ell)


iris = datasets.load_iris()

# Break up the dataset into non-overlapping training (75%) and testing
# (25%) sets.
skf = StratifiedKFold(iris.target, k=4)
# Only take the first fold.
train_index, test_index = skf.__iter__().next()

X_train = iris.data[train_index]
y_train = iris.target[train_index]
X_test = iris.data[test_index]
y_test = iris.target[test_index]

n_classes = len(np.unique(y_train))

# Try GMMs using different types of covariances.
classifiers = dict((x, GMM(n_states=n_classes, cvtype=x))
                   for x in ['spherical', 'diag', 'tied', 'full'])

n_classifiers = len(classifiers)
Example #17
    def draw_roc(self, fields_used=None, label_field='', pca=False):
        # Classification and ROC analysis

        # Run classifier with crossvalidation and plot ROC curves
        cv = StratifiedKFold(self.labels, k=self.folds)
        self.classifier = self.get_ml(probability=True)

        mean_tpr = 0.0
        mean_fpr = numpy.linspace(0, 1, 100)

        for i, (train, test) in enumerate(cv):
            train_data = self.data[train]
            test_data = self.data[test]
            if pca:
                reducer = GlassPCA()
                reducer.get_pca(self.data[train])
                train_data, test_data = reducer.project_data(
                    self.data[train], self.data[test])
            fitted = self.classifier.fit(train_data,
                                         self.labels[train],
                                         class_weight='auto')

            decisions = fitted.predict(test_data)
            print classification_report(self.labels[test], decisions)
            print confusion_matrix(self.labels[test], decisions)
            print mean_square_error(self.labels[test], decisions)

            if getattr(self.classifier, 'classes',
                       None) is None or self.classifier.classes.shape[0] == 2:
                probas_ = fitted.predict_proba(test_data)
                # Compute ROC curve and the area under the curve
                try:
                    fpr, tpr, thresholds = roc_curve(self.labels[test],
                                                     probas_[:, 1])
                except IndexError:
                    fpr, tpr, thresholds = roc_curve(self.labels[test],
                                                     probas_)
                mean_tpr += scipy.interp(mean_fpr, fpr, tpr)
                mean_tpr[0] = 0.0
                roc_auc = auc(fpr, tpr)
                pl.plot(fpr,
                        tpr,
                        lw=1,
                        label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

        pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

        mean_tpr /= len(cv)
        mean_tpr[-1] = 1.0
        mean_tpr = numpy.ma.fix_invalid(mean_tpr, fill_value=0)
        mean_auc = auc(mean_fpr, mean_tpr)
        pl.plot(mean_fpr,
                mean_tpr,
                'k--',
                label='Mean ROC (area = %0.2f)' % mean_auc,
                lw=2)

        pl.xlim([-0.05, 1.05])
        pl.ylim([-0.05, 1.05])
        pl.xlabel('False Positive Rate')
        pl.ylabel('True Positive Rate')
        title = '%s ROC curve: %s' % (self.class_name, label_field)
        if fields_used:
            annot_fields = [
                '%s (%.2f)' % (fields_used[i], self.classifier.coef_[0:, i])
                for i in xrange(0, len(fields_used))
            ]

            title += '\nUsing fields: ' + '\n'.join([
                ', '.join(annot_fields[x:x + 6])
                for x in xrange(0, len(annot_fields), 6)
            ])
        pl.title(title, fontsize='small')
        pl.legend(loc="lower right")
        #pl.savefig('/Users/karmel/Desktop/pic_%d.png' % random.randint(0,99999))
        pl.show()
Example #18
# the label to predict is the id of the person
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]

print "Total dataset size:"
print "n_samples: %d" % n_samples
print "n_features: %d" % n_features
print "n_classes: %d" % n_classes

################################################################################
# Split into a training set and a test set using a stratified k fold

# split into a training and testing set
train, test = iter(StratifiedKFold(y, k=4)).next()
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]

################################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150

print "Extracting the top %d eigenfaces from %d faces" % (n_components,
                                                          X_train.shape[0])
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
print "done in %0.3fs" % (time() - t0)

eigenfaces = pca.components_.T.reshape((n_components, h, w))
# import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target
X, y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape

# Add noisy features
X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]

################################################################################
# Classification and ROC analysis

# Run classifier with crossvalidation and plot ROC curves
cv = StratifiedKFold(y, k=6)
classifier = svm.SVC(kernel='linear', probability=True)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and the area under the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    pl.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))