def main(trainMode, binarized, testDir, resultFile,
         posDir=r'R:\masters\fall2013\COMP579A\project\aclImdb\train\pos',
         negDir=r'R:\masters\fall2013\COMP579A\project\aclImdb\train\neg'):
    """Optionally (re)build the corpus, then classify every file in testDir.

    Args:
        trainMode:  when truthy, rebuild the corpus from posDir/negDir first.
        binarized:  when truthy, use the binarized (per-document) corpus and
                    the binarized test-text extractor.
        testDir:    directory whose files are each classified.
        resultFile: path the per-file classification lines are appended to.
        posDir:     positive-class training directory (generalized from the
                    previously hard-coded path; default keeps old behavior).
        negDir:     negative-class training directory (same note as posDir).
    """
    # Corpus file name depends only on whether we use binarized counts.
    corpusName = r'corpus\binCorpus.txt' if binarized else r'corpus\corpus.txt'

    if trainMode:
        # Same builder is applied to both class directories.
        builder = buildBinarizedCorpus if binarized else buildCorpus
        builder(posDir, corpusName)
        builder(negDir, corpusName)

    dictionary = loadCorpus(corpusName)
    print('test for dir', testDir)
    for dir_entry in os.listdir(testDir):
        path = os.path.join(testDir, dir_entry)
        # Extractor must match the corpus flavor chosen above.
        testText = getBinTestText(path) if binarized else getTestText(path)
        result = test(testText, dictionary)
        # NOTE(review): 'wrtieToFile' is FileIO's (misspelled) public API name.
        FileIO.wrtieToFile(resultFile, 'a',
                           "The file " + path + " is classified as " + result + "\n")
    print("Done....")
def buildCorpus(dirPath, corpusName):
    """Append per-token frequency counts for one class directory to the corpus file.

    Every file under dirPath is tokenized (one token per line from
    Tokenizer.tokenizer) and each token occurrence is counted toward the
    class label derived from dirPath. Output format per corpus line:
    token<TAB>count<TAB>class.
    """
    print('creating corpus!!')
    tagClass = fetchLabel(dirPath)
    classCnt = FileIO.countFiles(dirPath)
    dictionary = {}
    # FIX: raw string — in a normal string "\c" is an invalid escape sequence
    # (SyntaxWarning on modern CPython); the raw literal yields the same bytes.
    FileIO.wrtieToFile(r"corpus\classCount.txt", 'a',
                       tagClass + '\t' + str(classCnt) + '\n')
    for dir_entry in os.listdir(dirPath):
        text = Tokenizer.tokenizer(FileIO.readFile(os.path.join(dirPath, dir_entry)))
        for token in text.split('\n'):
            # One nested dict per token keyed by class; count every occurrence.
            classCounts = dictionary.setdefault(token, {})
            classCounts[tagClass] = classCounts.get(tagClass, 0) + 1
    for key, value in dictionary.items():
        # Only tagClass was ever written into value within this call.
        FileIO.wrtieToFile(corpusName, 'a',
                           key + '\t' + str(value[tagClass]) + '\t' + tagClass + '\n')
    print('Corpus creation : Done..')
def buildBinarizedCorpus(dirPath, corpusName):
    """Append binarized per-token counts for one class directory to the corpus file.

    "Binarized" means each distinct token is counted at most once per
    document, so the count is the number of documents containing the token.
    Output format per corpus line: token<TAB>count<TAB>class.
    """
    print('creating binarized corpus!!')
    tagClass = fetchLabel(dirPath)
    classCnt = FileIO.countFiles(dirPath)
    # FIX: raw string — in a normal string "\c" is an invalid escape sequence
    # (SyntaxWarning on modern CPython); the raw literal yields the same bytes.
    FileIO.wrtieToFile(r"corpus\classCount.txt", 'a',
                       tagClass + '\t' + str(classCnt) + '\n')
    corpusDict = {}
    for dir_entry in os.listdir(dirPath):
        text = Tokenizer.tokenizer(FileIO.readFile(os.path.join(dirPath, dir_entry)))
        # dict.fromkeys dedups while preserving first-occurrence order, so the
        # insertion order of corpusDict (and hence output line order) is
        # identical to the original hand-rolled fileTokens dedup.
        for token in dict.fromkeys(text.split('\n')):
            classCounts = corpusDict.setdefault(token, {})
            classCounts[tagClass] = classCounts.get(tagClass, 0) + 1
    for key, value in corpusDict.items():
        # Only tagClass was ever written into value within this call.
        FileIO.wrtieToFile(corpusName, 'a',
                           key + '\t' + str(value[tagClass]) + '\t' + tagClass + '\n')
    print('binarized corpus creation done!!')