def ccTopic():
    """Run one-vs-rest KNN classification on co-clustered bag-of-features data.

    Options are parsed from ``sys.argv`` with the module-level ``parser``.
    For every category returned by ``getCatMap(dataset)`` the function:

    1. loads that category's file as the positive set (label column = 1),
    2. loads the files of all *other* categories, samples (with replacement)
       as many rows as there are positives, and labels them 0,
    3. co-clusters the combined matrix with ``cocluster.coclust`` and sums
       the columns belonging to each column cluster,
    4. passes the aggregated features plus the label column to
       ``classify.knnClassify`` and records mean / std of its result.

    Returns:
        list: ``[perfMean, perfStd]``, two 1-D arrays with one entry per
        category, in the iteration order of the category list.

    Raises:
        ValueError: if there is no category other than the positive one, so
            no negative set can be built.
    """
    (options, args) = parser.parse_args(sys.argv[1:])  #@UnusedVariable
    dataset = options.dataset
    nRowCluster = options.nRowCluster
    nTopic = options.nTopic
    ccType = options.ccType
    kernelType = options.kernelType
    nFold = options.nFold
    nCodeword = options.nCodeword
    beta = options.beta

    if options.verbose:
        # Single-argument print keeps the output identical on Python 2 and 3.
        print(' '.join(str(v) for v in (dataset, nRowCluster, nTopic, ccType,
                                        kernelType, beta, nFold, nCodeword)))
        print(options)

    dataPath = rootDir + dataset + bofDir
    catList = list(getCatMap(dataset).keys())
    dataext = str(nCodeword) + bofext
    nCategory = len(catList)

    perfMean = np.zeros(nCategory)
    perfStd = np.zeros(nCategory)

    for iCategory, catName in enumerate(catList):
        # Positive samples: rows of the current category, label column set to 1.
        # atleast_2d keeps a single-row file two-dimensional.
        catpos = np.atleast_2d(
            np.genfromtxt(dataPath + catName + dataext, dtype=int))
        if options.verbose:
            print(catName)
        catpos = catpos[:, :nCodeword + 1]
        catpos[:, nCodeword] = 1

        # Negative samples: rows of every *other* category. Each category is
        # read from its own file and all of them are stacked together.
        negParts = [
            np.atleast_2d(np.genfromtxt(dataPath + otherCat + dataext,
                                        dtype=int))
            for otherCat in catList if otherCat != catName
        ]
        if not negParts:
            raise ValueError(
                'no negative categories available for %s' % (catName))
        catneg = np.concatenate(negParts, axis=0)

        # Sample the negative data to have equal size as the positive.
        nPos = catpos.shape[0]
        nNeg = catneg.shape[0]
        catneg = catneg[np.random.randint(0, nNeg, nPos), :]
        catneg = catneg[:, :nCodeword + 1]
        catneg[:, nCodeword] = 0

        # Combine positive and negative data.
        bofData = np.concatenate((catpos, catneg), axis=0)
        if options.verbose:
            print('co-clustering...')

        # NOTE(review): bofData still contains the label column when it is
        # handed to coclust -- confirm this is intended.
        ccData = cocluster.coclust(bofData, dataset, nRowCluster, nTopic,
                                   ccType)

        # ccData[1] is a whitespace-separated list of column-cluster indices;
        # sum the columns of each cluster into one feature per cluster.
        ccCol = np.array([int(i) for i in ccData[1].split()])
        tempCC = np.zeros((bofData.shape[0], nTopic))
        for j in sorted(set(ccCol)):
            tempCC[:, j] = bofData[:, ccCol == j].sum(axis=1)

        # Append the label column (last column of bofData) to the features.
        botData = np.vstack((tempCC.T, bofData[:, -1])).T

        if options.verbose:
            print('classifying...')

        # The literal 10 is forwarded unchanged; presumably the number of
        # neighbours -- verify against classify.knnClassify.
        catPerfKNN = classify.knnClassify(botData, 10, nFold, beta, nMetrics)
        perfMean[iCategory] = np.mean(catPerfKNN)
        perfStd[iCategory] = np.std(catPerfKNN)

    if options.verbose:
        print(perfMean)
        print(perfStd)
    return [perfMean, perfStd]
# Code example #2
0
def ccUniversalTopicDictionary(wordn, topicn):
    """Co-cluster the image-word matrices of all categories and save the
    column-cluster assignment.

    The remaining settings (data set, number of row clusters, co-clustering
    type, verbosity) are parsed from ``sys.argv`` with the module-level
    ``parser``. The per-category image-word files are stacked row-wise,
    passed to ``coclust`` and the column-cluster indices (``ccData[1]``) are
    written to the result file with ``np.savetxt``.

    Nothing is computed if the result file already exists, if any category
    file is missing or unreadable, or if there are no categories at all; a
    message is printed in each of these cases.

    Args:
        wordn: number of codewords; used in the input and result file names.
        topicn: number of column clusters requested from ``coclust``; also
            used in the result file name.

    Returns:
        None.
    """
    # Acquire program arguments.
    (options, args) = parser.parse_args(sys.argv[1:])  #@UnusedVariable
    dataset = options.dataset
    nRowCluster = options.nRowCluster
    ccType = options.ccType
    nCodeword = wordn
    nTopic = topicn

    # Echo arguments (single-argument print works on Python 2 and 3 alike).
    if options.verbose:
        print(' '.join(str(v) for v in (dataset, nRowCluster, nTopic, ccType,
                                        nCodeword)))
        print(options)

    # Configure data path and other parameters.
    dataPath = rootDir + dataset + imgWrdDir
    resultPath = rootDir + dataset + utdDir + ccType + dataset
    catList = list(getCatMap(dataset).keys())
    dataext = str(nCodeword) + imgWrdext
    resultext = str(nCodeword) + str(nTopic) + utdext
    resultFileName = resultPath + resultext
    if os.path.exists(resultFileName):
        print('%s already written' % (resultFileName))
        return

    # Bail out before reading anything if a category file is missing.
    for catName in catList:
        if not os.path.exists(dataPath + catName + dataext):
            print('%s missing in %s,%d' % (catName, dataset, wordn))
            return

    # Read every category and collect the pieces; they are concatenated once
    # at the end instead of re-copying the growing matrix on each iteration.
    iwmParts = []
    for catName in catList:
        iwmFileName = dataPath + catName + dataext
        try:
            iwmCatData = np.loadtxt(iwmFileName, dtype=np.int16, delimiter=' ')
        except Exception:
            # Best effort: any read/parse failure aborts the run, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            print('unable to read %s' % (iwmFileName))
            return
        if options.verbose:
            print('reading %s' % (iwmFileName))
        # atleast_2d keeps a single-row file stackable with the others.
        iwmParts.append(np.atleast_2d(iwmCatData))

    if not iwmParts:
        print('incomplete data for %s' % (resultFileName))
        return
    iwmData = np.concatenate(iwmParts, axis=0)

    if options.verbose:
        print('co-clustering...')
    ccData = coclust(iwmData, dataset, nRowCluster, nTopic, ccType)
    # The indices of the co-cluster columns.
    ccCol = np.array(ccData[1].split(), dtype=np.int16)
    if options.verbose:
        print('writing %s' % (resultFileName))
    np.savetxt(resultFileName, ccCol, fmt='%d', delimiter=' ')
# Code example #3
0
def ccWord():
    """Build a co-clustered codebook, write bag-of-features files, classify
    each category one-vs-rest and plot the results.

    Options are parsed from ``sys.argv`` with the module-level ``parser``.
    Steps:

    1. collect descriptor samples with ``collateClusterData`` and co-cluster
       them with ``cocluster.coclust``;
    2. reduce every sample to one value per column cluster (L2 norm of the
       member columns) and average the samples of each row cluster to obtain
       the codebook;
    3. write per-category files through ``writebof``;
    4. for every category, read its ``*_cc`` file as positives (label 1),
       sample an equal number of rows (with replacement) from all other
       categories as negatives (label 0), shuffle, and evaluate with
       ``classify.ccClassify``;
    5. hand per-category mean / std to ``plotresult.ccPlot``.

    Returns:
        None.

    Raises:
        ValueError: if there is no category other than the positive one, so
            no negative set can be built.
    """
    (options, args) = parser.parse_args(sys.argv[1:])  #@UnusedVariable
    dataset = options.dataset
    nRowCluster = options.nRowCluster
    nColCluster = options.nColCluster
    ccType = options.ccType
    kernelType = options.kernelType
    beta = options.beta
    figfmt = options.figfmt
    nFold = options.nFold
    desc = options.desc
    nClusterSample = options.nClusterSample

    if options.verbose:
        # Single-argument print keeps the output identical on Python 2 and 3.
        print(' '.join(str(v) for v in (dataset, nRowCluster, nColCluster,
                                        ccType, kernelType, beta, figfmt,
                                        nFold, nClusterSample)))

    dataPath = rootDir + dataset + dataDir
    catList = list(getCatMap(dataset).keys())
    dataext = '.' + desc
    nCategory = len(catList)
    dim = descdim.get(desc)

    nSamplePerCategory = int(np.round(nClusterSample / nCategory))

    if options.verbose:
        print('collating cluster data...')
    clusterData = collateClusterData(dataPath, dataext, catList,
                                     nSamplePerCategory, dim)
    if options.verbose:
        print('coclustering...')
    ccData = cocluster.coclust(clusterData, dataset, nRowCluster, nColCluster,
                               ccType)

    # ccData[0] / ccData[1] are whitespace-separated row / column cluster ids.
    ccRow = np.array([int(i) for i in ccData[0].split()])
    ccCol = np.array([int(i) for i in ccData[1].split()])

    # One value per (sample, column cluster): the L2 norm over the columns of
    # that cluster, computed for all rows at once.
    cctemp = np.zeros((clusterData.shape[0], nColCluster))
    for j in sorted(set(ccCol)):
        cctemp[:, j] = np.linalg.norm(clusterData[:, ccCol == j], 2, axis=1)

    # Codebook entry of a row cluster = mean of its members' reduced rows.
    codebook = np.zeros((nRowCluster, nColCluster))
    for i in sorted(set(ccRow)):
        codebook[i, :] = np.mean(cctemp[ccRow == i, :], 0)

    if options.verbose:
        print('writing bof...')
    writebof(dataset, catList, codebook, ccCol, nRowCluster, desc)

    bofPrefix = rootDir + dataset + bofDir
    bofSuffix = '_cc' + bofext

    perfMean = np.zeros(nCategory)
    perfStd = np.zeros(nCategory)
    for iCategory, catName in enumerate(catList):
        # Positive samples: rows of the current category, label column = 1.
        # atleast_2d keeps a single-row file two-dimensional.
        catpos = np.atleast_2d(
            np.genfromtxt(bofPrefix + catName + bofSuffix, dtype=int))
        catpos = catpos[:, :nColCluster + 1]
        catpos[:, nColCluster] = 1

        # Negative samples: rows of every other category, stacked together.
        negParts = [
            np.atleast_2d(np.genfromtxt(bofPrefix + otherCat + bofSuffix,
                                        dtype=int))
            for otherCat in catList if otherCat != catName
        ]
        if not negParts:
            raise ValueError(
                'no negative categories available for %s' % (catName))
        catneg = np.concatenate(negParts, axis=0)

        # Sample as many negatives (with replacement) as there are positives.
        nPos = catpos.shape[0]
        nNeg = catneg.shape[0]
        catneg = catneg[np.random.randint(0, nNeg, nPos), :]
        catneg = catneg[:, :nColCluster + 1]
        catneg[:, nColCluster] = 0

        # Combine positive and negative data.
        catData = np.concatenate((catpos, catneg), axis=0)
        # Shuffle the rows to aid in random selection of train and test.
        np.random.shuffle(catData)
        catPerf = classify.ccClassify(catData, kernelType, nFold, beta,
                                      nMetrics)
        perfMean[iCategory] = np.mean(catPerf)
        perfStd[iCategory] = np.std(catPerf)

    if options.verbose:
        print(perfMean)
        print(perfStd)
    plotresult.ccPlot(dataset, catList, perfMean, perfStd, figfmt, 'BoW',
                      ccType)