def _makeExample(directory, klass, fileName):
    """Builds one labelled example from the file directory/klass/fileName."""
    example = NaiveBayes.Example()
    example.words = readFile('%s/%s/%s' % (directory, klass, fileName))
    example.klass = klass
    return example


def _appendLabeledExamples(target, directory):
    """Appends every example under directory/pos/ and directory/neg/ to target."""
    # Both directories are listed before any file is read, so a missing
    # class directory is reported before any reading work is done.
    posFileNames = os.listdir('%s/pos/' % directory)
    negFileNames = os.listdir('%s/neg/' % directory)
    for klass, fileNames in (('pos', posFileNames), ('neg', negFileNames)):
        for fileName in fileNames:
            target.append(_makeExample(directory, klass, fileName))


def buildSplits(numFolds, args):
    """Builds the splits for training/testing.

    Args:
        numFolds: number of cross-validation folds; only used when a single
            directory is given.
        args: sequence of one or two directory paths. Each directory must
            contain 'pos/' and 'neg/' sub-directories holding one document
            per file.
              * one path  -> numFolds-fold cross-validation on that data set;
              * two paths -> train on args[0], test on args[1].

    Returns:
        A list of NaiveBayes.TrainSplit objects: numFolds of them for
        cross-validation, exactly one for the train/test case, and an empty
        list when more than two paths are given.

    Raises:
        IndexError: if args is empty, or (in cross-validation mode) a file
            name is shorter than three characters.
    """
    splits = []
    trainDir = args[0]
    if len(args) == 1:
        # Single-argument parenthesised print produces identical output under
        # the Python 2 print statement and the Python 3 print function.
        print('[INFO]\tPerforming %d-fold cross-validation on data set:\t%s' % (
            numFolds, trainDir))
        posTrainFileNames = os.listdir('%s/pos/' % trainDir)
        negTrainFileNames = os.listdir('%s/neg/' % trainDir)
        labeledFileNames = (('pos', posTrainFileNames),
                            ('neg', negTrainFileNames))
        for fold in range(0, numFolds):
            split = NaiveBayes.TrainSplit()
            # The fold a file belongs to is the third character of its name.
            # That is a single character, so only folds 0-9 can ever match;
            # with numFolds > 10 the extra folds get an empty test set.
            foldTag = str(fold)
            for klass, fileNames in labeledFileNames:
                for fileName in fileNames:
                    example = _makeExample(trainDir, klass, fileName)
                    if fileName[2] == foldTag:
                        split.test.append(example)
                    else:
                        split.train.append(example)
            splits.append(split)
    elif len(args) == 2:
        split = NaiveBayes.TrainSplit()
        testDir = args[1]
        print('[INFO]\tTraining on data set:\t%s testing on data set:\t%s' % (
            trainDir, testDir))
        _appendLabeledExamples(split.train, trainDir)
        _appendLabeledExamples(split.test, testDir)
        splits.append(split)
    return splits
def buildTestCorpus(ch_aux):
    """Turns a '###'-delimited string of documents into an unlabelled test split.

    ch_aux holds documents separated by the marker '###'. Each segment
    (including empty ones) becomes one NaiveBayes.Example whose klass is
    'UNK' and whose words are the whitespace-separated tokens of the segment.
    The examples are appended, in order, to the test list of a fresh
    NaiveBayes.TrainSplit, which is returned.
    """
    corpus = NaiveBayes.TrainSplit()
    for rawDocument in ch_aux.split('###'):
        # Drop leading/trailing whitespace (newlines around the marker).
        text = rawDocument.strip()
        unlabeled = NaiveBayes.Example()
        # The true label is not known at test time.
        unlabeled.klass = 'UNK'
        # str.split() with no argument tokenises on any run of whitespace.
        unlabeled.words = text.split()
        corpus.test.append(unlabeled)
    return corpus