예제 #1
0
def blTopic():
    """Evaluate k-NN on per-category bag-of-features files (positive vs. rest).

    Options are parsed from ``sys.argv[1:]`` with the module-level ``parser``.
    For each category in ``getCatMap(dataset)``:

    * the category's own file is read and labelled 1 in column ``nTopic``;
    * the files of all *other* categories are read, concatenated, sampled
      (with replacement) down to the number of positive rows and labelled 0;
    * both parts are stacked, row-shuffled and handed to
      ``classify.knnClassify``; the mean and standard deviation of the
      values it returns are recorded for the category.

    Returns:
        The NumPy array of per-category mean performance. The standard
        deviations are computed (and printed when ``options.verbose`` is
        set) but are not returned.

    Raises:
        ValueError: if there is no other category to draw negatives from.
    """
    (options, args) = parser.parse_args(sys.argv[1:]) #@UnusedVariable
    dataset = options.dataset
    neighbors = options.neighbors
    nFold = options.nFold
    nTopic = options.nTopic
    beta = options.beta
    dataPath = rootDir+dataset+bofDir
    catmap = getCatMap(dataset)
    catList = catmap.keys()
    # remove if bof of 1000 words is computed for all categories
    if nTopic == 1000:
        dataext = bofext
    else:
        dataext = str(nTopic)+bofext

    nCategory = len(catList)

    perfMean = np.zeros(nCategory)
    perfStd = np.zeros(nCategory)

    for iCategory, catname in enumerate(catList):
        print(catname)
        # read the data of the current category; these rows are the positives
        # (dtype=int: ``np.int`` was only an alias of the builtin and has been
        # removed from recent NumPy releases)
        catpos = np.genfromtxt(dataPath+catname+dataext, dtype=int)
        catpos = catpos[:, :nTopic+1]
        catpos[:, nTopic] = 1

        # read the data of every remaining category; these are the negatives.
        # Each category must be read from its own file: the previous code
        # re-read the positive file and never reached its concatenate branch.
        negParts = [np.genfromtxt(dataPath+cats+dataext, dtype=int)
                    for cats in catList if cats != catname]
        if not negParts:
            raise ValueError('need at least two categories to build '
                             'negative samples for ' + str(catname))
        catneg = np.concatenate(negParts, axis=0)

        # sample the negative data to have equal size as the positive
        nPos = catpos.shape[0]
        nNeg = catneg.shape[0]
        catneg = catneg[np.random.randint(0, nNeg, nPos), :]
        catneg = catneg[:, :nTopic+1]
        catneg[:, nTopic] = 0

        # combine positive and negative data
        botData = np.concatenate((catpos, catneg), axis=0)
        # shuffle the rows to aid in random selection of train and test
        np.random.shuffle(botData)

        if options.verbose:
            print('classifying...')

        catPerfKNN = classify.knnClassify(botData, neighbors, nFold, beta, nMetrics)

        perfMean[iCategory] = np.mean(catPerfKNN)
        perfStd[iCategory] = np.std(catPerfKNN)

    if options.verbose:
        print(perfMean)
        print(perfStd)
    return perfMean
def ccTopic():
    """Co-cluster bag-of-features columns into topics and evaluate with k-NN.

    Options are parsed from ``sys.argv[1:]`` with the module-level ``parser``.
    For each category in ``getCatMap(dataset)``:

    * the category's own file is read and labelled 1 in column ``nCodeword``;
    * the files of all *other* categories are read, concatenated, sampled
      (with replacement) down to the number of positive rows and labelled 0;
    * the stacked data is passed to ``cocluster.coclust``; element ``1`` of
      its result is parsed as whitespace-separated integer column-cluster
      ids, and the columns belonging to each id are summed per row;
    * the summed features plus the label column are row-shuffled and passed
      to ``classify.knnClassify`` with 10 neighbours; the mean and standard
      deviation of the values it returns are recorded.

    Returns:
        ``[perfMean, perfStd]``: two NumPy arrays with one entry per category.

    Raises:
        ValueError: if there is no other category to draw negatives from.
    """
    (options, args) = parser.parse_args(sys.argv[1:])  #@UnusedVariable
    dataset = options.dataset
    nRowCluster = options.nRowCluster
    nTopic = options.nTopic
    ccType = options.ccType
    kernelType = options.kernelType
    nFold = options.nFold
    nCodeword = options.nCodeword
    beta = options.beta

    if options.verbose:
        # space-joined str() of each value matches the former
        # comma-separated print statement
        print(' '.join(str(v) for v in (dataset, nRowCluster, nTopic, ccType,
                                        kernelType, beta, nFold, nCodeword)))
        print(options)

    dataPath = rootDir + dataset + bofDir
    catmap = getCatMap(dataset)
    catList = catmap.keys()
    dataext = str(nCodeword) + bofext
    nCategory = len(catList)

    perfMean = np.zeros(nCategory)
    perfStd = np.zeros(nCategory)

    for iCategory, catName in enumerate(catList):
        # positives: the current category's own file
        # (dtype=int: ``np.int`` was only an alias of the builtin and has been
        # removed from recent NumPy releases)
        catpos = np.genfromtxt(dataPath + catName + dataext, dtype=int)
        if options.verbose:
            print(catName)
        catpos = catpos[:, :nCodeword + 1]
        catpos[:, nCodeword] = 1

        # negatives: the files of every remaining category. Each one must be
        # read from its own file: the previous code re-read the positive file
        # and never reached its concatenate branch.
        negParts = [np.genfromtxt(dataPath + cats + dataext, dtype=int)
                    for cats in catList if cats != catName]
        if not negParts:
            raise ValueError('need at least two categories to build '
                             'negative samples for ' + str(catName))
        catneg = np.concatenate(negParts, axis=0)

        # sample the negative data to have equal size as the positive
        nPos = catpos.shape[0]
        nNeg = catneg.shape[0]
        catneg = catneg[np.random.randint(0, nNeg, nPos), :]
        catneg = catneg[:, :nCodeword + 1]
        catneg[:, nCodeword] = 0

        # combine positive and negative data
        bofData = np.concatenate((catpos, catneg), axis=0)
        if options.verbose:
            print('co-clustering...')

        ccData = cocluster.coclust(bofData, dataset, nRowCluster, nTopic,
                                   ccType)

        # column-cluster id of every column, parsed from the second result
        ccCol = np.array([int(i) for i in ccData[1].split()])
        # per row, sum the columns that fall into the same column cluster
        tempCC = np.zeros((bofData.shape[0], nTopic))
        for j in sorted(set(ccCol)):
            tempCC[:, j] = np.sum(bofData[:, ccCol == j], axis=1)

        # append the label column (last column of bofData) to the features
        botData = np.vstack((tempCC.T, bofData[:, -1])).T
        # shuffle the rows to aid in random selection of train and test;
        # without this the rows stay ordered positives-then-negatives
        np.random.shuffle(botData)

        if options.verbose:
            print('classifying...')

        catPerfKNN = classify.knnClassify(botData, 10, nFold, beta, nMetrics)
        perfMean[iCategory] = np.mean(catPerfKNN)
        perfStd[iCategory] = np.std(catPerfKNN)

    if options.verbose:
        print(perfMean)
        print(perfStd)
    return [perfMean, perfStd]
def ccWord():
    """Build a co-clustered codebook, write BoF files, and evaluate with k-NN.

    Options are parsed from ``sys.argv[1:]`` with the module-level ``parser``.
    The function:

    1. collects sample descriptors via ``collateClusterData`` and co-clusters
       them with ``cocluster.coclustWord``; elements ``0`` and ``1`` of the
       result are parsed as whitespace-separated integer row- and
       column-cluster ids;
    2. reduces each sample to one value per column cluster (the L2 norm of
       the columns in that cluster) and averages the reduced samples of each
       row cluster to form the codebook;
    3. calls ``writebof`` with the codebook, then, for each category, reads
       that category's BoF file as positives (label 1) and the BoF files of
       all *other* categories as negatives (label 0, sampled with replacement
       to the positive count), shuffles the rows and passes them to
       ``classify.knnClassify`` with 10 neighbours.

    Returns:
        ``[perfMean, perfStd]``: two NumPy arrays with one entry per category.

    Raises:
        ValueError: if there is no other category to draw negatives from.
    """
    (options, args) = parser.parse_args(sys.argv[1:]) #@UnusedVariable
    dataset = options.dataset
    nRowCluster = options.nCodeword
    nColCluster = options.lowerDim
    ccType = options.ccType
    kernelType = options.kernelType
    beta = options.beta
    figfmt = options.figfmt
    nFold = options.nFold
    desc = options.desc
    nClusterSample = options.nClusterSample

    if options.verbose:
        # space-joined str() of each value matches the former
        # comma-separated print statement
        print(' '.join(str(v) for v in (dataset, nRowCluster, nColCluster,
                                        ccType, kernelType, beta, figfmt,
                                        nFold, nClusterSample)))

    dataPath = rootDir+dataset+dataDir
    catmap = getCatMap(dataset)
    catList = catmap.keys()
    dataext = '.'+desc
    nCategory = len(catList)
    dim = descdim.get(desc)

    # NOTE(review): with integer operands Python 2 floor-divides here before
    # np.round is applied -- confirm that is the intended sample count.
    nSamplePerCategory = int(np.round(nClusterSample/nCategory))

    if options.verbose:
        print('collating cluster data...')
    clusterData = collateClusterData(dataPath,dataext,catList,nSamplePerCategory,dim)
    if options.verbose:
        print('coclustering...')
    ccData = cocluster.coclustWord(clusterData,dataset,nRowCluster,nColCluster,ccType)

    # row- and column-cluster ids parsed from the co-clustering result
    ccRow = np.array([int(i) for i in ccData[0].split()])
    ccCol = np.array([int(i) for i in ccData[1].split()])

    cctemp = np.zeros((clusterData.shape[0],nColCluster))
    codebook = np.zeros((nRowCluster,nColCluster))
    # the cluster ids and their column masks do not depend on the row, so
    # build them once instead of once per sample
    colMasks = [(j, ccCol == j) for j in sorted(set(ccCol))]
    for i in np.arange(clusterData.shape[0]):
        for j, mask in colMasks:
            # L2 norm of the sample's columns that belong to column cluster j
            cctemp[i,j] = np.linalg.norm(clusterData[i,mask], 2)
    # a codeword is the mean reduced sample of one row cluster
    for i in sorted(set(ccRow)):
        codebook[i,:] = np.mean(cctemp[ccRow==i,:],0)

    if options.verbose:
        print('writing bof...')
    writebof(dataset,catList,codebook,ccCol,nRowCluster,desc,ccType,nRowCluster,nColCluster)

    # every category's BoF file shares this prefix and suffix
    bofPrefix = rootDir+dataset+bofDir
    bofSuffix = str(ccType)+str(nRowCluster)+str(nColCluster)+bofext

    perfMean = np.zeros(nCategory)
    perfStd = np.zeros(nCategory)
    for iCategory,catName in enumerate(catList):
        # positives: the current category's own BoF file
        # (dtype=int: ``np.int`` was only an alias of the builtin and has been
        # removed from recent NumPy releases)
        catpos = np.genfromtxt(bofPrefix+catName+bofSuffix,dtype=int)
        catpos = catpos[:,:nColCluster+1]
        catpos[:,nColCluster] = 1

        # negatives: the BoF files of every remaining category. Each one must
        # be read from its own file: the previous code built the path from
        # the positive category and never reached its concatenate branch.
        negParts = [np.genfromtxt(bofPrefix+otherName+bofSuffix,dtype=int)
                    for otherName in catList if otherName != catName]
        if not negParts:
            raise ValueError('need at least two categories to build '
                             'negative samples for ' + str(catName))
        catneg = np.concatenate(negParts,axis=0)

        # sample the negative data to have equal size as the positive
        nPos = catpos.shape[0]
        nNeg = catneg.shape[0]
        catneg = catneg[np.random.randint(0,nNeg,nPos),:]
        catneg = catneg[:,:nColCluster+1]
        catneg[:,nColCluster] = 0
        # combine positive and negative data
        catData = np.concatenate((catpos,catneg),axis=0)
        # shuffle the rows to aid in random selection of train and test
        np.random.shuffle(catData)
        catPerf = classify.knnClassify(catData, 10, nFold, beta, nMetrics)
        perfMean[iCategory] = np.mean(catPerf)
        perfStd[iCategory] = np.std(catPerf)

    if options.verbose:
        print(perfMean)
        print(perfStd)
    return [perfMean,perfStd]