Ejemplo n.º 1
0
def _listClassFiles(dataDir):
    """Return [(klass, fileNames)] for the 'pos' and 'neg' subdirectories of dataDir."""
    return [(klass, os.listdir('%s/%s/' % (dataDir, klass)))
            for klass in ('pos', 'neg')]


def _readExamples(dataDir, klass, fileNames):
    """Yield (fileName, example) for each listed file under dataDir/klass."""
    for fileName in fileNames:
        example = NaiveBayes.Example()
        example.words = readFile('%s/%s/%s' % (dataDir, klass, fileName))
        example.klass = klass
        yield fileName, example


def buildSplits(numFolds, args):
    """Build the train/test splits used for training and evaluation.

    Two modes are selected by the length of ``args``:

    * ``len(args) == 1``: ``args[0]`` is a data directory containing ``pos/``
      and ``neg/`` subdirectories. One split is produced per fold in
      ``range(numFolds)``; a file goes into a fold's test set when the third
      character of its file name equals ``str(fold)``, otherwise into its
      training set.
    * ``len(args) == 2``: ``args[0]`` is the training directory and
      ``args[1]`` the test directory (each with ``pos/`` and ``neg/``).
      A single split is produced; ``numFolds`` is ignored.

    Args:
        numFolds: Number of cross-validation folds (single-directory mode).
        args: Sequence of one or two directory paths.

    Returns:
        A list of ``NaiveBayes.TrainSplit`` objects. The list is empty when
        ``args`` has more than two entries.

    Raises:
        IndexError: If ``args`` is empty.
    """
    splits = []
    trainDir = args[0]
    if len(args) == 1:
        # Parenthesized single-argument print behaves the same on Python 2 and 3.
        print('[INFO]\tPerforming %d-fold cross-validation on data set:\t%s' % (
            numFolds, trainDir))
        classFiles = _listClassFiles(trainDir)
        for fold in range(numFolds):
            # NOTE(review): the fold digit is taken from a single character of
            # the file name, so folds >= 10 can never match and would get an
            # empty test set. Presumably files are named like "cvNxx_..." --
            # verify against the data set.
            foldTag = str(fold)
            split = NaiveBayes.TrainSplit()
            # Files are re-read for every fold so that each split owns its own
            # Example objects rather than sharing them across splits.
            for klass, fileNames in classFiles:
                for fileName, example in _readExamples(trainDir, klass, fileNames):
                    # Slice instead of index: names shorter than three
                    # characters fall into the training set instead of
                    # raising IndexError.
                    if fileName[2:3] == foldTag:
                        split.test.append(example)
                    else:
                        split.train.append(example)
            splits.append(split)
    elif len(args) == 2:
        split = NaiveBayes.TrainSplit()
        testDir = args[1]
        print('[INFO]\tTraining on data set:\t%s testing on data set:\t%s' % (
            trainDir, testDir))
        for klass, fileNames in _listClassFiles(trainDir):
            for _, example in _readExamples(trainDir, klass, fileNames):
                split.train.append(example)

        for klass, fileNames in _listClassFiles(testDir):
            for _, example in _readExamples(testDir, klass, fileNames):
                split.test.append(example)
        splits.append(split)
    return splits
Ejemplo n.º 2
0
def buildTestCorpus(ch_aux):
  """Turn a '###'-delimited string of documents into an unlabeled test split.

  The input is cut on every '###' marker. Each resulting piece (including
  empty ones) becomes one example whose words are the whitespace-separated
  tokens of that piece and whose klass is 'UNK', since the label is not
  known at test time. The examples are appended, in input order, to the
  test list of a fresh NaiveBayes.TrainSplit, which is returned.
  """
  corpus = NaiveBayes.TrainSplit()
  for rawDoc in ch_aux.split('###'):
    unlabeled = NaiveBayes.Example()
    unlabeled.klass = 'UNK'  # label unknown at test time
    # strip() drops surrounding newlines; split() tokenizes on whitespace
    unlabeled.words = rawDoc.strip().split()
    corpus.test.append(unlabeled)
  return corpus