def buildCorpus(dirPath, corpusName):
    """Append raw token counts for one class directory to a corpus file.

    Every entry of ``dirPath`` is read, tokenized with
    ``Tokenizer.tokenizer`` and split on newlines; each resulting token is
    counted once per occurrence across all files.

    Two files are appended to (mode ``'a'``):

    * the class-count file gets one tab-separated ``label, file count``
      line, where the label comes from ``fetchLabel(dirPath)`` and the
      count from ``FileIO.countFiles(dirPath)``;
    * ``corpusName`` gets one tab-separated ``token, count, label`` line
      per distinct token, in order of first appearance.

    Args:
        dirPath: Directory whose entries are read as the documents of a
            single class.
        corpusName: Path of the corpus file to append to.
    """
    print('creating corpus!!')
    tagClass = fetchLabel(dirPath)
    classCnt = FileIO.countFiles(dirPath)

    # The backslash is escaped explicitly: a bare "\c" is an invalid escape
    # sequence (SyntaxWarning on recent Python versions). The resulting
    # path string is unchanged.
    FileIO.wrtieToFile("corpus\\classCount.txt", 'a',
                       tagClass + '\t' + str(classCnt) + '\n')

    # Only one label is ever handled per call, so a flat token -> count
    # mapping is sufficient.
    tokenCounts = {}
    for dir_entry in os.listdir(dirPath):
        text = FileIO.readFile(os.path.join(dirPath, dir_entry))
        text = Tokenizer.tokenizer(text)
        # NOTE(review): split('\n') yields '' for blank lines or a trailing
        # newline; such empty tokens are counted like any other - confirm
        # this is intended.
        for token in text.split('\n'):
            tokenCounts[token] = tokenCounts.get(token, 0) + 1

    lineSuffix = '\t' + tagClass + '\n'
    for token, count in tokenCounts.items():
        FileIO.wrtieToFile(corpusName, 'a', token + '\t' + str(count) + lineSuffix)
    print('Corpus creation : Done..')
def buildBinarizedCorpus(dirPath, corpusName):
    """Append per-file (binarized) token counts for one class directory.

    Every entry of ``dirPath`` is read, tokenized with
    ``Tokenizer.tokenizer`` and split on newlines. A token contributes at
    most 1 per file, so the stored value is the number of files in which
    the token appears rather than its total number of occurrences.

    Two files are appended to (mode ``'a'``):

    * the class-count file gets one tab-separated ``label, file count``
      line, where the label comes from ``fetchLabel(dirPath)`` and the
      count from ``FileIO.countFiles(dirPath)``;
    * ``corpusName`` gets one tab-separated ``token, count, label`` line
      per distinct token, in order of first appearance.

    Args:
        dirPath: Directory whose entries are read as the documents of a
            single class.
        corpusName: Path of the corpus file to append to.
    """
    print('creating binarized corpus!!')
    tagClass = fetchLabel(dirPath)
    classCnt = FileIO.countFiles(dirPath)

    # The backslash is escaped explicitly: a bare "\c" is an invalid escape
    # sequence (SyntaxWarning on recent Python versions). The resulting
    # path string is unchanged.
    FileIO.wrtieToFile("corpus\\classCount.txt", 'a',
                       tagClass + '\t' + str(classCnt) + '\n')

    # Only one label is ever handled per call, so a flat token -> file-count
    # mapping is sufficient.
    fileCounts = {}
    for dir_entry in os.listdir(dirPath):
        text = FileIO.readFile(os.path.join(dirPath, dir_entry))
        text = Tokenizer.tokenizer(text)
        # dict.fromkeys drops repeated tokens within this file while keeping
        # first-occurrence order, so the output line order stays stable.
        # NOTE(review): split('\n') yields '' for blank lines or a trailing
        # newline; such empty tokens are counted like any other - confirm
        # this is intended.
        for token in dict.fromkeys(text.split('\n')):
            fileCounts[token] = fileCounts.get(token, 0) + 1

    lineSuffix = '\t' + tagClass + '\n'
    for token, count in fileCounts.items():
        FileIO.wrtieToFile(corpusName, 'a', token + '\t' + str(count) + lineSuffix)
    print('binarized corpus creation done!!')