import os
import pickle as p  #p.dump below relies on this alias

#stopWordFileName, getClsNames and getDocWordsList are pre-processing
#helpers assumed to be defined elsewhere in this project.

def combineClsDocs(rootDir):
    #get class names in the mini_newsgroups folder
    clsList = getClsNames(rootDir)

    #get all the directory names with absolute path
    dirList = []
    for parent, dirNames, fileNames in os.walk(rootDir):
        for dirName in dirNames:
            dirList.append(os.path.join(parent, dirName))
    dirList.sort()  #sort alphabetically so dirList[i] matches clsList[i]

    clsWordDict = {}  #(class name : class words list), 20 pairs
    for i in range(len(clsList)):  #20 classes
        #get all the file names in a class, store them into fileList
        fileList = []  #file names with absolute path of a class
        for parent, dirNames, fileNames in os.walk(dirList[i]):
            for fileName in fileNames:
                fileList.append(os.path.join(parent, fileName))

        #combine the files of a class into clsWordsList, each file pre-processed
        trainSize = int(len(fileList) * 0.8)  #80% of the documents for training
        clsWordsList = []
        for j in range(trainSize):
            clsWordsList += getDocWordsList(fileList[j], stopWordFileName)

        #construct the className : classWordsList dictionary;
        #clsList[i] is the i-th class name, clsWordsList its word list
        clsWordDict[clsList[i]] = clsWordsList

    return clsWordDict
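#getClsNames is not shown in this file; a minimal sketch consistent with
#how combineClsDocs uses it (sorted, so clsList[i] lines up with the
#sorted dirList[i]) might look like this -- an assumption, not the
#project's actual implementation:
#
#def getClsNames(rootDir):
#    clsNames = [name for name in os.listdir(rootDir)
#                if os.path.isdir(os.path.join(rootDir, name))]
#    clsNames.sort()  #alphabetic order, matching dirList.sort() above
#    return clsNames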
def getVocabulary(rootDir):
    #get all the file names (absolute path with names)
    fileList = []
    for parent, dirNames, fileNames in os.walk(rootDir):
        for fileName in fileNames:
            fileList.append(os.path.join(parent, fileName))

    #parse file by file to build the vocabulary
    tempVocab = []
    for eachFile in fileList:
        #get a file's word list; pre-processing is done inside
        wordsList = getDocWordsList(eachFile, stopWordFileName)
        #add this file's words into the vocabulary (list concatenation)
        tempVocab += wordsList
    tempVocab.sort()

    #delete the words whose count is less than 3.
    #Filtering with tempVocab.count(word) for every word, then deduplicating
    #through a set, is O(n^2) and very time-consuming; counting with a
    #dictionary in one pass is much faster.
    vocabCountDict = {}  #word -> count, for each word in tempVocab
    for word in tempVocab:
        vocabCountDict[word] = vocabCountDict.get(word, 0) + 1
    for word in list(vocabCountDict.keys()):  #list() so we can delete safely
        if vocabCountDict[word] < 3:
            del vocabCountDict[word]  #delete word whose count < 3
    #final vocabulary: unique words, sorted alphabetically
    Vocabulary = sorted(vocabCountDict.keys())

    #store the vocabulary in a file, to avoid repetitive function calls
    vocabFile = "vocabulary.data"
    with open(vocabFile, 'wb') as f:
        p.dump(Vocabulary, f)  #dump the object into the file

    #vocabulary size is 12942 (words with count < 3 removed) in
    #mini_newsgroups; without removing low-frequency words it would be
    #about 30,000
    return Vocabulary
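#The pickled vocabulary can be loaded back later to skip the expensive
#rebuild. A minimal sketch, assuming the same "vocabulary.data" file
#written above (this helper is illustrative, not part of the original):
#
#def loadVocabulary(vocabFile="vocabulary.data"):
#    with open(vocabFile, 'rb') as f:
#        return p.load(f)  #same pickle alias p as above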
def getTextFeature(docName, vocabulary):
    #get the text's words first
    textWordsList = getDocWordsList(docName, stopWordFileName)
    textWordsList.sort()

    #calc the text feature in terms of the vocabulary:
    #the count of each vocabulary word in the document (bag of words)
    textFeatureList = []
    for eachWord in vocabulary:
        textFeatureList.append(textWordsList.count(eachWord))

    #textFeatureVector = np.array(textFeatureList)  #convert list to array
    #return textFeatureVector
    return textFeatureList  #len == len(vocabulary)
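#Hedged end-to-end sketch: build the vocabulary, combine the training
#documents per class, then turn one document into a feature vector.
#The "mini_newsgroups" folder and the sample document path below are
#assumptions for illustration, and getClsNames / getDocWordsList /
#stopWordFileName must be provided elsewhere in this project.
if __name__ == "__main__":
    rootDir = "mini_newsgroups"            #assumed dataset folder
    vocabulary = getVocabulary(rootDir)    #also writes vocabulary.data
    print("vocabulary size:", len(vocabulary))

    clsWordDict = combineClsDocs(rootDir)  #class name -> training word list
    print("classes:", len(clsWordDict))

    #feature vector for one (hypothetical) document path:
    #docName = os.path.join(rootDir, "sci.space", "60804")
    #feature = getTextFeature(docName, vocabulary)
    #print("non-zero features:", sum(1 for c in feature if c > 0))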