import os
import pickle as p

def combineClsDocs(rootDir):
    #get the class names in the mini_newsgroups folder
    clsList = getClsNames(rootDir)
        
    #get all the directory names with absolute paths
    dirList = []
    for parent, dirNames, fileNames in os.walk(rootDir):        
        for dirName in dirNames:
            dirList.append(os.path.join(parent, dirName))
    dirList.sort() #sort alphabetically, assumed to match the order of clsList

    clsWordDict = {} #class name -> class words list, 20 pairs
    
    for i in range(len(clsList)): #20 classes
        #get all the file names in a class, store them into fileList
        fileList = [] # file names with absolute path of a class
        for parent, dirNames, fileNames in os.walk(dirList[i]):        
            for fileName in fileNames:
                fileList.append(os.path.join(parent, fileName))

        #combine the files of a class into clsWordsList, each file pre-processed
        trainSize = int(len(fileList)*0.8) #80% of the documents for training
        clsWordsList = []
        for j in range(trainSize):
            clsWordsList += getDocWordsList(fileList[j], stopWordFileName)

        #build the className : classWordsList dictionary;
        #clsList[i] is the i-th class name, clsWordsList its training words
        clsWordDict[clsList[i]] = clsWordsList

    return clsWordDict
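
#a minimal usage sketch, not part of the original pipeline: the
#"mini_newsgroups" path is an assumption -- point it at wherever the
#corpus actually lives on disk
def demoCombineClsDocs():
    clsWordDict = combineClsDocs("mini_newsgroups")
    for clsName in sorted(clsWordDict):
        #report how many training words each of the 20 classes contributes
        print("%s: %d training words" % (clsName, len(clsWordDict[clsName])))
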
def getVocabulary(rootDir):
    #get all the file names (absolute paths)
    fileList = []
    for parent, dirNames, fileNames in os.walk(rootDir):        
        for fileName in fileNames:
            fileList.append(os.path.join(parent, fileName))

    #scan file by file to build the vocabulary
    tempVocab = []
    for eachFile in fileList:
        #get the file's word list (pre-processing is already applied)
        wordsList = getDocWordsList(eachFile, stopWordFileName)
        #add this file's words to the running vocabulary (list extension)
        tempVocab += wordsList
    tempVocab.sort()
    
##    #first attempt: delete the words whose count is less than 3
##    #using list.count -- O(n^2) and very time-consuming!
##    #tempVocab = [word for word in tempVocab if tempVocab.count(word) >= 3]
##
##    #then convert the list to a set to drop duplicate words, and back to
##    #a sorted list, so the vocabulary is unique and in alphabetical order
##    #Vocabulary = sorted(set(tempVocab))

    #delete the words whose count is less than 3 -- a faster method:
    #count each word once with a dictionary instead of calling list.count
    vocabCountDict = {} #word -> occurrence count in tempVocab
    for word in tempVocab:
        if word not in vocabCountDict:
            vocabCountDict[word] = 1 #first occurrence, add it to the dict
        else:
            vocabCountDict[word] += 1 #already in dict, count++

    #iterate over a snapshot of the items so entries can be deleted safely
    for word, count in list(vocabCountDict.items()):
        if count < 3:
            del vocabCountDict[word] #delete word whose count < 3
    Vocabulary = sorted(vocabCountDict) #final vocabulary, alphabetical
    
    #store the vocabulary in a file, to avoid repeating this expensive call
    vocabFile = "vocabulary.data"
    with open(vocabFile, 'wb') as f:
        p.dump(Vocabulary, f) #pickle the vocabulary list to disk

    #for mini_newsgroups the vocabulary size is 12942 after removing words
    #with count < 3; without that filter it would be about 30,000
    return Vocabulary  
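
#a minimal sketch (not in the original code) showing how to read the
#pickled vocabulary back, so the expensive getVocabulary() scan need not
#be repeated on every run; "vocabulary.data" matches the file written above
def loadVocabulary(vocabFile="vocabulary.data"):
    with open(vocabFile, 'rb') as f:
        return p.load(f)
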
def getTextFeature(docName, vocabulary):
    #get the text's word list first
    textWordsList = getDocWordsList(docName, stopWordFileName)
    textWordsList.sort()

    #compute the text feature w.r.t. the vocabulary: one count per word
    textFeatureList = []
    for eachWord in vocabulary:
        textFeatureList.append(textWordsList.count(eachWord))

    #textFeatureVector = np.array(textFeatureList) #convert list to array
    #return textFeatureVector #len == len(vocabulary)
    return textFeatureList
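
#hypothetical end-to-end sketch tying the pieces together: build (or load)
#the vocabulary, then turn one document into its count vector; the
#"mini_newsgroups" and "somedoc.txt" paths below are placeholders
def demoTextFeature():
    vocabulary = getVocabulary("mini_newsgroups")
    featureList = getTextFeature("somedoc.txt", vocabulary)
    nonZero = sum(1 for count in featureList if count > 0)
    print("feature length: %d, non-zero: %d" % (len(featureList), nonZero))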