def svmCV(k, filetype, groups, cp, ke):
    """Run k-fold cross-validation of an SVM classifier.

    For every fold ``i`` the data set is loaded with
    ``dv.createDataVoca(k, i, filetype)``, split into a held-out test slice
    and the remaining training rows, converted to feature vectors with
    ``fvec.mulFeatGen`` and used to fit and score an ``svm.SVC``.  Every
    fitted model is written with ``joblib.dump`` under ``../modelSave/``.

    Args:
        k: Number of folds (one model is trained per fold).
        filetype: Forwarded to ``dv.createDataVoca`` and embedded in the
            file name of each saved model.
        groups: Not used by this function; retained so existing callers
            keep working.
        cp: Value passed as ``C`` to ``svm.SVC``.
        ke: Value passed as ``kernel`` to ``svm.SVC``.

    Returns:
        NumPy array of length ``k`` with the score returned by
        ``fvec.accPredict`` for each fold.
    """
    predictAcc = np.zeros(k)

    for i in range(k):
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3.
        print('k: %s run: %s' % (k, i))
        result = dv.createDataVoca(k, i, filetype)
        data, voca, label = result[0], result[1], result[2]

        # Floor division: the fold size is used as a slice bound and as an
        # id offset, so it must stay an integer even when "/" means true
        # division (Python 3 or ``from __future__ import division``).
        subsetSize = len(label) // k
        lo = i * subsetSize
        hi = lo + subsetSize

        testLabel = label[lo:hi]
        trainLabel = np.append(label[:lo], label[hi:], axis=0)

        lenTestLab = len(testLabel)
        lenTrainLab = len(trainLabel)
        print('len(testLabel): %s' % (lenTestLab,))
        print('len(trainLabel): %s' % (lenTrainLab,))

        # Column 0 of ``data`` is compared against the label positions, so
        # it presumably holds a 1-based document index -- TODO confirm
        # against dv.createDataVoca.  Rows are three columns wide (see the
        # [offset, 0, 0] shifts below).
        docIds = data[:, 0]

        # Test rows: ids in (lo, hi], shifted so they start at 1 again.
        inTest = np.logical_and(docIds > lo, docIds <= hi)
        testData = data[inTest] - np.array([lo, 0, 0])

        # Training rows above the test slice are shifted down by one fold
        # to close the gap; rows below the slice are kept unchanged.
        trainDatahi = data[docIds > hi] - np.array([subsetSize, 0, 0])
        trainDatalo = data[docIds <= lo]
        trainData = np.append(trainDatahi, trainDatalo, axis=0)

        # generate features for all the reviews
        trainFeat = fvec.mulFeatGen(trainData, voca, lenTrainLab)
        testFeat = fvec.mulFeatGen(testData, voca, lenTestLab)

        clf = svm.SVC(C=cp, kernel=ke, cache_size=1000)
        print("we are training our data:")
        clf.fit(trainFeat, trainLabel)
        print("we are testing our data:")
        predictMul = clf.predict(testFeat)
        print("we are calculating accuracy:")
        predictAcc[i] = fvec.accPredict(testLabel, predictMul)

        print("accuracy: " + str(predictAcc[i]))
        joblib.dump(
            clf, '../modelSave/svmModelAll_K' + str(k) + '_Run' + str(i) +
            '_' + filetype + '.pkl')

    return predictAcc
def svmCV(k, filetype, groups, cp, ke):
    """Cross-validate an SVM over ``k`` folds and return per-fold scores.

    Each iteration loads the data via ``dv.createDataVoca(k, i, filetype)``,
    holds out the ``i``-th slice for testing, builds feature vectors with
    ``fvec.mulFeatGen``, fits an ``svm.SVC`` on the rest, scores it with
    ``fvec.accPredict`` and saves the model with ``joblib.dump`` in
    ``../modelSave/``.

    Args:
        k: Number of folds.
        filetype: Forwarded to ``dv.createDataVoca``; also part of the
            saved model's file name.
        groups: Not used by this function; kept for interface
            compatibility.
        cp: ``C`` argument for ``svm.SVC``.
        ke: ``kernel`` argument for ``svm.SVC``.

    Returns:
        NumPy array of length ``k`` containing the ``fvec.accPredict``
        result of every fold.
    """
    predictAcc = np.zeros(k)

    for i in range(k):
        # print(...) with one argument behaves the same on Python 2 and 3.
        print('k: %s run: %s' % (k, i))
        result = dv.createDataVoca(k, i, filetype)
        data = result[0]
        voca = result[1]
        label = result[2]

        # "//" keeps the fold size integral regardless of division
        # semantics; a float here would break the slices below.
        subsetSize = len(label) // k
        foldStart = i * subsetSize
        foldEnd = foldStart + subsetSize

        testLabel = label[foldStart:foldEnd]
        trainLabel = np.append(label[:foldStart], label[foldEnd:], axis=0)

        lenTestLab = len(testLabel)
        lenTrainLab = len(trainLabel)

        print('len(testLabel): %s' % (lenTestLab,))
        print('len(trainLabel): %s' % (lenTrainLab,))

        # NOTE(review): column 0 looks like a 1-based document index that
        # lines up with positions in ``label`` -- confirm with the loader.
        firstCol = data[:, 0]

        # Held-out rows, re-based so their ids restart at 1.
        mask = np.logical_and(firstCol > foldStart, firstCol <= foldEnd)
        testData = data[mask] - np.array([foldStart, 0, 0])

        # Rows after the held-out slice move down by one fold size so the
        # training ids stay contiguous; rows before it are left as they are.
        trainDatahi = data[firstCol > foldEnd] - np.array([subsetSize, 0, 0])
        trainDatalo = data[firstCol <= foldStart]
        trainData = np.append(trainDatahi, trainDatalo, axis=0)

        # generate features for all the reviews
        trainFeat = fvec.mulFeatGen(trainData, voca, lenTrainLab)
        testFeat = fvec.mulFeatGen(testData, voca, lenTestLab)

        clf = svm.SVC(C=cp, kernel=ke, cache_size=1000)
        print("we are training our data:")
        clf.fit(trainFeat, trainLabel)
        print("we are testing our data:")
        predictMul = clf.predict(testFeat)
        print("we are calculating accuracy:")
        predictAcc[i] = fvec.accPredict(testLabel, predictMul)

        print("accuracy: " + str(predictAcc[i]))
        joblib.dump(clf, '../modelSave/svmModelAll_K' + str(k) + '_Run' +
                    str(i) + '_' + filetype + '.pkl')

    return predictAcc
def navieBayesMulCV(k, filetype, groups):
    """Run k-fold cross-validation of the multinomial naive Bayes model.

    For every fold ``i`` the data set is loaded with
    ``dv.createDataVoca(k, i, filetype)``, the ``i``-th slice is held out
    for testing, the model is trained with ``navieBayesMulTrain`` on the
    remaining rows and evaluated with ``navieBayesMulTest``.

    Args:
        k: Number of folds.
        filetype: Forwarded to ``dv.createDataVoca``.
        groups: Forwarded to ``navieBayesMulTrain`` and
            ``navieBayesMulTest``.

    Returns:
        NumPy array of length ``k`` with the score returned by
        ``fvec.accPredict`` for each fold.
    """
    predictAcc = np.zeros(k)

    for i in range(k):
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3.
        print('k: %s run: %s' % (k, i))
        result = dv.createDataVoca(k, i, filetype)
        data, voca, label = result[0], result[1], result[2]

        # Floor division: the fold size is used as a slice bound and as an
        # id offset, so it must stay an integer even when "/" means true
        # division (Python 3 or ``from __future__ import division``).
        subsetSize = len(label) // k
        lo = i * subsetSize
        hi = lo + subsetSize

        testLabel = label[lo:hi]
        trainLabel = np.append(label[:lo], label[hi:], axis=0)

        print('len(trainLabel): %s' % (len(trainLabel),))
        print('len(testLabel): %s' % (len(testLabel),))

        # Column 0 of ``data`` is compared against the label positions, so
        # it presumably holds a 1-based document index -- TODO confirm
        # against dv.createDataVoca.  Rows are three columns wide (see the
        # [offset, 0, 0] shifts below).
        docIds = data[:, 0]

        # Test rows: ids in (lo, hi], shifted so they start at 1 again.
        inTest = np.logical_and(docIds > lo, docIds <= hi)
        testData = data[inTest] - np.array([lo, 0, 0])

        # Training rows above the test slice are shifted down by one fold
        # to close the gap; rows below the slice are kept unchanged.
        trainDatahi = data[docIds > hi] - np.array([subsetSize, 0, 0])
        trainDatalo = data[docIds <= lo]
        trainData = np.append(trainDatahi, trainDatalo, axis=0)

        print("we are training our data:")
        trained = navieBayesMulTrain(groups, voca, trainData, trainLabel)
        print("we are testing our data:")
        # The first two items of the training result are handed to the
        # test routine unchanged.
        predictMul = navieBayesMulTest(groups, voca, testData, testLabel,
                                       trained[0], trained[1])
        print("we are calculating accuracy:")
        predictAcc[i] = fvec.accPredict(testLabel, predictMul)

        # if i==0:
        # 	print "we are calculating the confusion matrix:"
        # 	kkmatMul = fvec.confMatrix(testLabel,predictMul)
        # 	np.savetxt('mulConfMat_k_'+str(k)+'_run_'+str(i)+'_'+filetype+'.txt', kkmatMul, fmt='%4d')

    # averageAcc = np.mean(predictAcc)

    return predictAcc
def navieBayesMulCV(k, filetype, groups):
    """Cross-validate the multinomial naive Bayes model over ``k`` folds.

    Each iteration loads the data via ``dv.createDataVoca(k, i, filetype)``,
    holds out the ``i``-th slice, trains with ``navieBayesMulTrain`` on the
    rest, predicts the held-out slice with ``navieBayesMulTest`` and scores
    the prediction with ``fvec.accPredict``.

    Args:
        k: Number of folds.
        filetype: Forwarded to ``dv.createDataVoca``.
        groups: Forwarded to the train and test routines.

    Returns:
        NumPy array of length ``k`` containing the ``fvec.accPredict``
        result of every fold.
    """
    predictAcc = np.zeros(k)

    for i in range(k):
        # print(...) with one argument behaves the same on Python 2 and 3.
        print("k: %s run: %s" % (k, i))
        result = dv.createDataVoca(k, i, filetype)
        data = result[0]
        voca = result[1]
        label = result[2]

        # "//" keeps the fold size integral regardless of division
        # semantics; a float here would break the slices below.
        subsetSize = len(label) // k
        foldStart = i * subsetSize
        foldEnd = foldStart + subsetSize

        testLabel = label[foldStart:foldEnd]
        trainLabel = np.append(label[:foldStart], label[foldEnd:], axis=0)

        print("len(trainLabel): %s" % (len(trainLabel),))
        print("len(testLabel): %s" % (len(testLabel),))

        # NOTE(review): column 0 looks like a 1-based document index that
        # lines up with positions in ``label`` -- confirm with the loader.
        firstCol = data[:, 0]

        # Held-out rows, re-based so their ids restart at 1.
        mask = np.logical_and(firstCol > foldStart, firstCol <= foldEnd)
        testData = data[mask] - np.array([foldStart, 0, 0])

        # Rows after the held-out slice move down by one fold size so the
        # training ids stay contiguous; rows before it are left as they are.
        trainDatahi = data[firstCol > foldEnd] - np.array([subsetSize, 0, 0])
        trainDatalo = data[firstCol <= foldStart]
        trainData = np.append(trainDatahi, trainDatalo, axis=0)

        print("we are training our data:")
        model = navieBayesMulTrain(groups, voca, trainData, trainLabel)
        print("we are testing our data:")
        predictMul = navieBayesMulTest(groups, voca, testData, testLabel, model[0], model[1])
        print("we are calculating accuracy:")
        predictAcc[i] = fvec.accPredict(testLabel, predictMul)

        # if i==0:
        # 	print "we are calculating the confusion matrix:"
        # 	kkmatMul = fvec.confMatrix(testLabel,predictMul)
        # 	np.savetxt('mulConfMat_k_'+str(k)+'_run_'+str(i)+'_'+filetype+'.txt', kkmatMul, fmt='%4d')

        # averageAcc = np.mean(predictAcc)

    return predictAcc
# Example no. 5
# 0
import numpy as np
import featureVector as fvec


# Stored predictions to be scored, read from a NumPy .npy file.
predictMulFeatRemove = np.load("predictMulFeatRemove.npy")

# Reference labels for the test set: space-delimited integers.
testLabel = np.loadtxt(
    "../data/test.label",
    dtype=int,
    delimiter=' ',
)

# Score the stored predictions against the reference labels.
Acc = fvec.accPredict(testLabel, predictMulFeatRemove)
# Example no. 6
# 0
import numpy as np
import featureVector as fvec
import matplotlib.pyplot as pl

# Stored prediction arrays, one .npy file per model variant.
predictBer = np.load("predictBer.npy")
predictMul = np.load("predictMul.npy")
predictMulAlp = np.load("predictMulAlp.npy")
predictMulFeatRemove = np.load("predictMulFeatRemove.npy")


# Reference labels for the test set: space-delimited integers.
testLabel = np.loadtxt("../data/test.label", delimiter=' ', dtype=int)

berAcc = fvec.accPredict(testLabel, predictBer)
mulAcc = fvec.accPredict(testLabel, predictMul)

# predictMulAlp is iterated along its first axis: each item is scored
# against the same reference labels, giving one score per item.
alpRan = predictMulAlp.shape[0]
mulAlpAcc = np.zeros(alpRan)
for index, item in enumerate(predictMulAlp):
    mulAlpAcc[index] = fvec.accPredict(testLabel, item)


# Single-argument print(...) gives the same output on Python 2 and 3.
print("berAcc = " + str(berAcc) + "; mulAcc = " + str(mulAcc))
print("mulAlpAcc = " + str(mulAlpAcc))

# Confusion matrix
kkmatBer = fvec.confMatrix(testLabel, predictBer)
kkmatMul = fvec.confMatrix(testLabel, predictMul)
np.savetxt("berConfMat.txt", kkmatBer, fmt='%4d')
np.savetxt("mulConfMat.txt", kkmatMul, fmt='%4d')