Code example #1
    def queryModel(self, query, preprocess):
        """
    Preprocesses the query, encodes it into a pattern, then queries the
    classifier to infer distances to trained-on samples.
    @return       (list)          Two-tuples of sample ID and distance, sorted
                                  closest to farthest from the query.
    """
        if preprocess:
            sample = TextPreprocess().tokenize(
                query,
                ignoreCommon=100,
                removeStrings=["[identifier deleted]"],
                correctSpell=True)
        else:
            sample = TextPreprocess().tokenize(query)

        allDistances = self.infer(self.encodeSample(sample))

        # The model trains multiple times on multi-label samples, so collapse
        # repeats by keeping the minimum distance per unique sample ID.

        if len(allDistances) != len(self.sampleReference):
            raise IndexError(
                "Number of prototype distances must match number of "
                "samples trained on.")

        sampleDistances = {}
        for uniqueID in self.sampleReference:
            sampleDistances[uniqueID] = min(
                allDistances[j] for j, x in enumerate(self.sampleReference)
                if x == uniqueID)

        return sorted(sampleDistances.items(), key=operator.itemgetter(1))
Code example #2
def setupData(args):
  """ Performs data preprocessing and setup given the user-specified args.

  @param args       (Namespace)     User-provided arguments via the cmd line.
  @return           (tuple)         Tuple where the first entry is a list of
      (tokens, labels) pairs, one per sample, and the second is the list of all
      possible label strings, whose indices serve as the label IDs.
  """
  dataDict = readCSV(args.dataPath, 2, args.numLabels)

  # Collect each possible label string into a list, where the indices will be
  # their references throughout the experiment.
  labelReference = list(set(
      itertools.chain.from_iterable(dataDict.values())))

  for sample, labels in dataDict.iteritems():
    dataDict[sample] = numpy.array([labelReference.index(label)
                                    for label in labels],
                                    dtype="int8")

  texter = TextPreprocess()
  if args.textPreprocess:
    samples = [(texter.tokenize(sample,
                                ignoreCommon=100,
                                removeStrings=["[identifier deleted]"],
                                correctSpell=True),
               labels) for sample, labels in dataDict.iteritems()]
  else:
    samples = [(texter.tokenize(sample), labels)
               for sample, labels in dataDict.iteritems()]

  return samples, labelReference
Code example #3
    def testFunctionsWithoutDataFiles(self):
        """
    Ensures a TextPreprocess object can be created and tokenize when there are
    no text data files (corpus text, abbreviations, and contractions).
    """
        text = "I can't work at [identifier deleted] if you don't allw me to wfh"
        processor = TextPreprocess(corpusTxt="fake.txt",
                                   abbrCSV="not_here.csv",
                                   contrCSV="not_real.csv")

        tokens = processor.tokenize(text)
        expected_tokens = [
            "i",
            "can",
            "t",
            "work",
            "at",
            "identifier",
            "deleted",
            "if",
            "you",
            "don",
            "t",
            "allw",
            "me",
            "to",
            "wfh",
        ]

        self.assertSequenceEqual(tokens, expected_tokens)
Code example #4
 def testReadExpansionFileWithSuffixes(self):
     """Tests TextPreprocess reads csv files correctly and adds suffixes."""
     processor = TextPreprocess()
     suffixes = ["", "s", "'s"]
     abbreviations = processor.readExpansionFile("abbreviations.csv", suffixes)
     expectedAbbreviations = {"wfh": "work from home",
                              "wfhs": "work from homes",
                              "wfh's": "work from home's"}
     self.assertEqual(abbreviations, expectedAbbreviations)
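
The expected dictionary above implies that readExpansionFile() appends each suffix to both the abbreviation and its expansion. A minimal sketch of that behavior, assuming a two-column CSV whose rows are abbreviation/expansion pairs (e.g. a row reading "wfh,work from home"); this illustrates the expected mapping and is not the library's actual implementation:

import csv

def readExpansionFileSketch(csvPath, suffixes=("",)):
    """Build {abbreviation + suffix: expansion + suffix} from a two-column CSV."""
    expansions = {}
    with open(csvPath) as f:
        for row in csv.reader(f):
            if len(row) < 2:
                continue
            abbr, expansion = row[0], row[1]
            for suffix in suffixes:
                # e.g. "wfh" with suffix "s" yields {"wfhs": "work from homes"}.
                expansions[abbr + suffix] = expansion + suffix
    return expansions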
Code example #5
  def _preprocess(self, preprocess):
    """Tokenize the samples, with or without preprocessing."""
    texter = TextPreprocess()
    if preprocess:
      self.samples = {category: [(texter.tokenize(data[0],
                                                  ignoreCommon=100,
                                                  removeStrings=["identifier deleted]"],
                                                  correctSpell=True), data[1], idx)
                      for idx, data in samples.iteritems()]
                      for category, samples in self.dataDict.iteritems()}

      if self.testDict:
        self.testSamples = [(texter.tokenize(data[0],
                                            ignoreCommon=100,
                                            removeStrings=["identifier deleted]"],
                                            correctSpell=True), data[1], idx)
                            for idx, data in self.testDict.iteritems()]
    else:
      self.samples = {category: [(texter.tokenize(data[0]), data[1], idx)
                      for idx, data in samples.iteritems()]
                      for category, samples in self.dataDict.iteritems()}

      if self.testDict:
        self.testSamples = [(texter.tokenize(data[0]), data[1], idx)
                            for idx, data in self.testDict.iteritems()]
Code example #6
    def testTokenizeExpandAbbreviation(self):
        """Tests abbreviations are expanded."""
        text = "I can't work at [identifier deleted] if you don't allw me to wfh"
        processor = TextPreprocess()

        expected_tokens = [
            "i",
            "can",
            "t",
            "work",
            "at",
            "identifier",
            "deleted",
            "if",
            "you",
            "don",
            "t",
            "allw",
            "me",
            "to",
            "work",
            "from",
            "home",
        ]

        tokens = processor.tokenize(text, expandAbbr=True)
        self.assertSequenceEqual(tokens, expected_tokens)
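
The expected tokens above show the abbreviation replaced by the tokens of its expansion ("wfh" becomes "work", "from", "home"). A hypothetical sketch of that token-level substitution, assuming an expansion map like the one built by readExpansionFile(); the actual tokenize() logic may differ:

def expandAbbreviationsSketch(tokens, expansions):
    """Replace abbreviated tokens with the tokens of their expansions."""
    expanded = []
    for token in tokens:
        expanded.extend(expansions.get(token, token).split())
    return expanded

print expandAbbreviationsSketch(["me", "to", "wfh"], {"wfh": "work from home"})
# -> ['me', 'to', 'work', 'from', 'home']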
Code example #7
    def testTokenizeRemoveString(self):
        """Tests a provided string is ignored."""
        text = "I can't work at [identifier deleted] if you don't allw me to wfh"
        processor = TextPreprocess()

        expected_tokens = ["i", "can", "t", "work", "at", "if", "you", "don", "t", "allw", "me", "to", "wfh"]
        tokens = processor.tokenize(text, removeStrings=["[identifier deleted]"])
        self.assertSequenceEqual(tokens, expected_tokens)
Code example #8
  def testTokenizeExpandContraction(self):
    """Tests contractions are expanded."""
    text = "I can't work at [identifier deleted] if you don't allw me to wfh"
    processor = TextPreprocess()

    expected_tokens = ["i", "can", "not", "work", "at", "identifier", "deleted",
                       "if", "you", "do", "not", "allw", "me", "to", "wfh"]
    tokens = processor.tokenize(text, expandContr=True)
    self.assertSequenceEqual(tokens, expected_tokens)
Code example #9
  def testTokenizeRemoveString(self):
    """Tests a provided string is ignored."""
    text = "I can't work at [identifier deleted] if you don't allw me to wfh"
    processor = TextPreprocess()

    expected_tokens = ["i", "can", "t", "work", "at", "if", "you", "don",
                       "t", "allw", "me", "to", "wfh"]
    tokens = processor.tokenize(text, removeStrings=["[identifier deleted]"])
    self.assertSequenceEqual(tokens, expected_tokens)
Code example #10
  def testTokenizeNoPreprocess(self):
    """Tests none of the preprocessing methods are used."""
    text = "I can't work at [identifier deleted] if you don't allw me to wfh"
    processor = TextPreprocess()

    expected_tokens = ["i", "can", "t", "work", "at", "identifier", "deleted",
                       "if", "you", "don", "t", "allw", "me", "to", "wfh"]
    tokens = processor.tokenize(text)
    self.assertSequenceEqual(tokens, expected_tokens)
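
The expected tokens in these tests are consistent with tokenization that lowercases the text and keeps only runs of letters, so "can't" splits into "can", "t" and "[identifier deleted]" becomes "identifier", "deleted". A minimal sketch of that behavior (an illustration only, not TextPreprocess's actual implementation):

import re

def basicTokenizeSketch(text):
    """Lowercase the text and return the runs of alphabetic characters."""
    return re.findall(r"[a-z]+", text.lower())

print basicTokenizeSketch(
    "I can't work at [identifier deleted] if you don't allw me to wfh")
# -> ['i', 'can', 't', 'work', 'at', 'identifier', 'deleted', 'if', 'you',
#     'don', 't', 'allw', 'me', 'to', 'wfh']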
Code example #11
 def testReadExpansionFileWithSuffixes(self):
   """Tests TextPreprocess reads csv files correctly and adds suffixes."""
   processor = TextPreprocess()
   suffixes = ["", "s", "'s"]
   abbreviations = processor.readExpansionFile("abbreviations.csv", suffixes)
   expectedAbbreviations = {"wfh": "work from home",
                            "wfhs": "work from homes",
                            "wfh's": "work from home's"}
   self.assertEqual(abbreviations, expectedAbbreviations)
Code example #12
    def split(self,
              filePath,
              numLabels,
              textPreprocess=False,
              abbrCSV="",
              contrCSV="",
              ignoreCommon=100,
              removeStrings="[identifier deleted]",
              correctSpell=True):
        """
    Split all the comments in a file into tokens. Preprocess if necessary.
    
    @param filePath        (str)    Path to csv file
    @param numLabels       (int)    Number of columns of category labels.
    @param textPreprocess  (bool)   True will preprocess text while tokenizing.
    
    Please see TextPreprocess tokenize() for the other parameters; they're only
    used when textPreprocess is True.
    """
        dataDict = readCSV(filePath, numLabels=numLabels)
        if dataDict is None:
            raise Exception("Could not read CSV.")

        preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
        expandAbbr = (abbrCSV != "")
        expandContr = (contrCSV != "")

        for i, uniqueID in enumerate(dataDict.keys()):
            comment, categories = dataDict[uniqueID]
            # Convert the categories to a string of their IDs
            categories = string.join(
                [str(self.categoryToId[c]) for c in categories])

            if textPreprocess:
                tokens = preprocessor.tokenize(comment, ignoreCommon,
                                               removeStrings, correctSpell,
                                               expandAbbr, expandContr)
            else:
                tokens = preprocessor.tokenize(comment)

            # Write the sequence of data records for this sample.
            record = {"_categories": categories, "_sequenceID": i}
            data = []
            reset = 1
            for t in tokens:
                tokenRecord = record.copy()
                tokenRecord["_token"] = t
                tokenRecord["_reset"] = reset
                tokenRecord["ID"] = uniqueID
                reset = 0
                data.append(tokenRecord)

            self.records.append(data)
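
For each comment, the loop above emits one record per token, with "_reset" set to 1 only on the first token of the sequence. A hypothetical illustration of the records produced for a two-token comment (the ID, sequence number, and category values are invented for the example):

# Records for uniqueID 17, sequence 0, tokens ["work", "home"], categories "3":
[{"_categories": "3", "_sequenceID": 0, "_token": "work", "_reset": 1, "ID": 17},
 {"_categories": "3", "_sequenceID": 0, "_token": "home", "_reset": 0, "ID": 17}]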
Code example #13
 def _preprocess(self, preprocess):
   """Tokenize the samples, with or without preprocessing."""
   texter = TextPreprocess()
   if preprocess:
     self.samples = [(texter.tokenize(data[0],
                                      ignoreCommon=100,
                                      removeStrings=["[identifier deleted]"],
                                      correctSpell=True),
                      data[1]) for _, data in self.dataDict.iteritems()]
   else:
     self.samples = [(texter.tokenize(data[0]), data[1])
                     for _, data in self.dataDict.iteritems()]
Code example #14
File: runner.py  Project: lscheinkman/nupic.fluent
 def _preprocess(self, preprocess):
     """Tokenize the samples, with or without preprocessing."""
     texter = TextPreprocess()
     if preprocess:
         self.samples = [
             (texter.tokenize(data[0],
                              ignoreCommon=100,
                              removeStrings=["[identifier deleted]"],
                              correctSpell=True), data[1])
             for id, data in self.dataDict.iteritems()
         ]
     else:
         self.samples = [(texter.tokenize(data[0]), data[1])
                         for id, data in self.dataDict.iteritems()]
Code example #15
 def testFunctionsWithoutDataFiles(self):
   """
   Ensures a TextPreprocess object can be created and tokenize when there are
   no text data files (corpus text, abbreviations, and contractions).
   """
   text = "I can't work at [identifier deleted] if you don't allw me to wfh"
   processor = TextPreprocess(corpusTxt="fake.txt",
                              abbrCSV="not_here.csv",
                              contrCSV="not_real.csv")
   
   tokens = processor.tokenize(text)
   expected_tokens = ["i", "can", "t", "work", "at", "identifier", "deleted",
                      "if", "you", "don", "t", "allw", "me", "to", "wfh"]
   
   self.assertSequenceEqual(tokens, expected_tokens)
Code example #16
    def prepText(text, preprocess=False):
        """
    Returns a list of the text tokens.

    @param preprocess   (bool)    Whether or not to preprocess the text data.
    """
        if preprocess:
            sample = TextPreprocess().tokenize(
                text,
                ignoreCommon=100,
                removeStrings=["[identifier deleted]"],
                correctSpell=True)
        else:
            sample = TextPreprocess().tokenize(text)

        return sample
Code example #17
    def getUnionEncoding(self, text):
        """
    Encode each token of the input text, take the union, and then sparsify.

    @param  text    (str)             A non-tokenized sample of text.
    @return         (dict)            The bitmap encoding is at
                                      encoding["fingerprint"]["positions"].
    """
        tokens = TextPreprocess().tokenize(text)

        # Count the ON bits represented in the encoded tokens.
        counts = Counter()
        for t in tokens:
            bitmap = self.client.getBitmap(t)["fingerprint"]["positions"]
            counts.update(bitmap)

        positions = self.sparseUnion(counts)

        # Populate encoding
        encoding = {
            "text": text,
            "sparsity": len(positions) * 100 / float(self.n),
            "df": 0.0,
            "height": self.h,
            "width": self.w,
            "score": 0.0,
            "fingerprint": {
                "positions": sorted(positions)
            },
            "pos_types": []
        }

        return encoding
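
The sparsification step referenced above can be thought of as keeping only the most frequently occurring ON bits from the union. A minimal sketch of that idea; the maxBits parameter is hypothetical, and the real sparseUnion() implementation may choose bits differently:

from collections import Counter

def sparseUnionSketch(counts, maxBits=40):
    """Keep the maxBits most frequently seen bit positions, sorted ascending."""
    return sorted(position for position, _ in counts.most_common(maxBits))

# Example: bits 3 and 7 appear in several token bitmaps, so they survive.
print sparseUnionSketch(Counter([3, 7, 3, 12, 7, 3, 99]), maxBits=2)
# -> [3, 7]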
Code example #18
  def split(self, filePath, numLabels, textPreprocess=False, abbrCSV="",
            contrCSV="", ignoreCommon=100, removeStrings="[identifier deleted]",
            correctSpell=True):
    """
    Split all the comments in a file into tokens. Preprocess if necessary.
    
    @param filePath        (str)    Path to csv file
    @param numLabels       (int)    Number of columns of category labels.
    @param textPreprocess  (bool)   True will preprocess text while tokenizing.
    
    Please see TextPreprocess tokenize() for the other parameters; they're only
    used when textPreprocess is True.
    """
    dataDict = readCSV(filePath, numLabels=numLabels)
    if dataDict is None:
      raise Exception("Could not read CSV.")

    preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
    expandAbbr = (abbrCSV != "")
    expandContr = (contrCSV != "")

    for i, uniqueID in enumerate(dataDict.keys()):
      comment, categories = dataDict[uniqueID]
      # Convert the categories to a string of their IDs
      categories = string.join([str(self.categoryToId[c]) for c in categories])

      if textPreprocess:
        tokens = preprocessor.tokenize(
            comment, ignoreCommon, removeStrings, correctSpell, expandAbbr,
            expandContr)
      else:
        tokens = preprocessor.tokenize(comment)

      # Write the sequence of data records for this sample.
      record = {"_categories":categories,
                "_sequenceID":i}
      data = []
      reset = 1
      for t in tokens:
        tokenRecord = record.copy()
        tokenRecord["_token"] = t
        tokenRecord["_reset"] = reset
        tokenRecord["ID"] = uniqueID
        reset = 0
        data.append(tokenRecord)

      self.records.append(data)
Code example #19
def setupData(args):
    """ Performs data preprocessing and setup given the user-specified args.

  @param args       (Namespace)     User-provided arguments via the cmd line.
  @return           (tuple)         Tuple where the first entry is a list of
      (tokens, labels) pairs, one per sample, and the second is the list of all
      possible label strings, whose indices serve as the label IDs.
  """
    dataDict = readCSV(args.dataPath, 2, args.numLabels)

    # Collect each possible label string into a list, where the indices will be
    # their references throughout the experiment.
    labelReference = list(
        set(
            itertools.chain.from_iterable(
                map(lambda x: x[1], dataDict.values()))))

    for idx, data in dataDict.iteritems():
        comment, labels = data
        dataDict[idx] = (comment,
                         numpy.array(
                             [labelReference.index(label) for label in labels],
                             dtype="int8"))

    texter = TextPreprocess(abbrCSV=args.abbrCSV, contrCSV=args.contrCSV)
    expandAbbr = (args.abbrCSV != "")
    expandContr = (args.contrCSV != "")
    if args.textPreprocess:
        samples = [(texter.tokenize(data[0],
                                    ignoreCommon=100,
                                    removeStrings=["[identifier deleted]"],
                                    correctSpell=True,
                                    expandAbbr=expandAbbr,
                                    expandContr=expandContr), data[1])
                   for _, data in dataDict.iteritems()]
    else:
        samples = [(texter.tokenize(data[0]), data[1])
                   for _, data in dataDict.iteritems()]

    return samples, labelReference
Code example #20
File: multi_runner.py  Project: numenta/nupic.fluent
    def _preprocess(self, preprocess):
        """Tokenize the samples, with or without preprocessing."""
        texter = TextPreprocess()
        if preprocess:
            self.samples = {
                category:
                [(texter.tokenize(data[0],
                                  ignoreCommon=100,
                                  removeStrings=["identifier deleted]"],
                                  correctSpell=True), data[1], idx)
                 for idx, data in samples.iteritems()]
                for category, samples in self.dataDict.iteritems()
            }

            if self.testDict:
                self.testSamples = [
                    (texter.tokenize(data[0],
                                     ignoreCommon=100,
                                     removeStrings=["identifier deleted]"],
                                     correctSpell=True), data[1], idx)
                    for idx, data in self.testDict.iteritems()
                ]
        else:
            self.samples = {
                category: [(texter.tokenize(data[0]), data[1], idx)
                           for idx, data in samples.iteritems()]
                for category, samples in self.dataDict.iteritems()
            }

            if self.testDict:
                self.testSamples = [(texter.tokenize(data[0]), data[1], idx)
                                    for idx, data in self.testDict.iteritems()]
Code example #21
    def generateSequence(self, text, preprocess=False):
        """
    Return a list of lists representing the text sequence in network data 
    format. Does not preprocess the text.
    """
        # TODO: enable text preprocessing; abstract out the logic in split() into a common method.
        tokens = TextPreprocess().tokenize(text)
        cat = [-1]
        self.sequenceCount += 1
        uniqueID = "q"
        data = self._formatSequence(tokens, cat, self.sequenceCount, uniqueID)

        return data
Code example #22
def run(args):
  """
  The experiment is configured to run on question response data.

  The runner sets up the data path such that the experiment runs on a single
  data file located in the nupic.fluent/data directory.
  The data path MUST BE SPECIFIED at the cmd line, e.g. from the fluent dir:

  python experiments/random_baseline_runner.py data/sample_reviews/sample_reviews_data_training.csv

  To run k-folds cross validation, arguments must be: kFolds > 1, train = False,
  test = False. To run either training or testing, kFolds = 1.
  """
  start = time.time()

  # Setup directories.
  root = os.path.dirname(__file__)
  dataPath = os.path.abspath(os.path.join(root, '../..', args.dataFile))
  modelPath = os.path.abspath(
    os.path.join(root, args.resultsDir, args.expName, args.modelName))
  if not os.path.exists(modelPath):
    os.makedirs(modelPath)

  # Verify input params.
  if not os.path.isfile(dataPath):
    raise ValueError("Invalid data path.")
  if (not isinstance(args.kFolds, int)) or (args.kFolds < 1):
    raise ValueError("Invalid value for number of cross-validation folds.")
  if args.train and args.test:
    raise ValueError("Run training and testing independently.")
  if (args.train or args.test) and args.kFolds > 1:
    raise ValueError("Experiment runs either k-folds CV or training/testing, "
                     "not both.")

  # Load or init model.
  if args.load:
    with open(
      os.path.join(modelPath, "model.pkl"), "rb") as f:
      model = pkl.load(f)
    print "Model loaded from \'{0}\'.".format(modelPath)
  else:
    model = ClassificationModelRandomSDR(verbosity=args.verbosity)

  # Get and prep data.
  texter = TextPreprocess()
  samples, labels = readCSV(dataPath, 2, [3])  # Y data, [3] -> range(3,6)
  labelReference = list(set(labels))
  labels = numpy.array([labelReference.index(l) for l in labels], dtype=int)
  split = len(samples)/args.kFolds
  samples = [texter.tokenize(sample,
                             ignoreCommon=100,
                             removeStrings=["[identifier deleted]"],
                             correctSpell=True)
             for sample in samples]
  if args.verbosity > 1:
    for i, s in enumerate(samples): print i, s, labelReference[labels[i]]
  patterns = [[model.encodePattern(t) for t in tokens] for tokens in samples]

  # Either we train on all the data, test on all the data, or run k-fold CV.
  if args.train:
    training(model,
      [(p, labels[i]) for i, p in enumerate(patterns)])
  elif args.test:
    trialResults = testing(model,
      [(p, labels[i]) for i, p in enumerate(patterns)])
  elif args.kFolds > 1:
    intermResults = []
    predictions = []
    for k in range(args.kFolds):
      # Train the model on a subset, and hold the evaluation subset.
      model.resetModel()
      evalIndices = range(k*split, (k+1)*split)
      trainIndices = [i for i in range(len(samples)) if not i in evalIndices]

      print "Training for CV fold {0}.".format(k)
      training(model,
        [(patterns[i], labels[i]) for i in trainIndices])

      print "Evaluating for trial {0}.".format(k)
      trialResults = testing(model,
        [(patterns[i], labels[i]) for i in evalIndices])

      if args.expectationDataPath:
        # Keep the predicted labels (top prediction only) for later.
        p = [l if l else [None] for l in trialResults[0]]
        predictions.append(
          [labelReference[idx[0]] if idx[0] is not None else '(none)' for idx in p])

      print "Calculating intermediate results for this fold."
      result = model.evaluateTrialResults(
        trialResults, labelReference, evalIndices)
      intermResults.append(result)
      result[1].to_csv(os.path.join(
        modelPath, "evaluation_fold_" + str(k) + ".csv"))

    print "Calculating cumulative results for {0} trials.".format(args.kFolds)
    results = model.evaluateFinalResults(intermResults)
    results["total_cm"].to_csv(os.path.join(modelPath, "evaluation_totals.csv"))
    if args.expectationDataPath:
      computeExpectedAccuracy(list(itertools.chain.from_iterable(predictions)),
        os.path.abspath(os.path.join(root, '../..', args.expectationDataPath)))

  print "Calculating random classifier results for comparison."
  print model.classifyRandomly(labels)

  print "Saving model to \'{0}\' directory.".format(modelPath)
  with open(
    os.path.join(modelPath, "model.pkl"), "wb") as f:
    pkl.dump(model, f)
  print "Experiment complete in {0:.2f} seconds.".format(time.time() - start)
Code example #23
 def testReadExpansionFileNoSuffixes(self):
     """Tests TextPreprocess reads csv files correctly."""
     processor = TextPreprocess()
     abbreviations = processor.readExpansionFile("abbreviations.csv")
     expectedAbbreviations = {"wfh": "work from home"}
     self.assertEqual(abbreviations, expectedAbbreviations)
Code example #24
 def testReadExpansionFileNoSuffixes(self):
   """Tests TextPreprocess reads csv files correctly."""
   processor = TextPreprocess()
   abbreviations = processor.readExpansionFile("abbreviations.csv")
   expectedAbbreviations = {"wfh": "work from home"}
   self.assertEqual(abbreviations, expectedAbbreviations)
Code example #25
def run(args):
  """
  The experiment is configured to run on question response data.

  To run k-folds cross validation, arguments must be: kFolds > 1, train = False,
  test = False. To run either training or testing, kFolds = 1.
  """
  start = time.time()

  # Setup directories.
  root = os.path.dirname(__file__)
  dataPath = os.path.abspath(os.path.join(root, '../..', args.dataFile))
  modelPath = os.path.abspath(
    os.path.join(root, args.resultsDir, args.expName, args.modelName))
  if not os.path.exists(modelPath):
    os.makedirs(modelPath)

  # Verify input params.
  if not os.path.isfile(dataPath):
    raise ValueError("Invalid data path.")
  if (not isinstance(args.kFolds, int)) or (args.kFolds < 1):
    raise ValueError("Invalid value for number of cross-validation folds.")
  if args.train and args.test:
    raise ValueError("Run training and testing independently.")
  if (args.train or args.test) and args.kFolds > 1:
    raise ValueError("Experiment runs either k-folds CV or training/testing, "
                     "not both.")

  # Load or init model.
  if args.load:
    with open(
      os.path.join(modelPath, "model.pkl"), "rb") as f:
      model = pkl.load(f)
    print "Model loaded from \'{0}\'.".format(modelPath)
  else:
    try:
      module = __import__(args.modelModuleName, {}, {}, args.modelName)
      modelClass = getattr(module, args.modelName)
      model = modelClass(verbosity=args.verbosity)
    except ImportError:
      raise RuntimeError("Could not find model class \'%s\' to import."
                         % args.modelName)

  print "Reading in data and preprocessing."
  texter = TextPreprocess()
  samples, labels = readCSV(dataPath, 2, [3])  # Y data, [3] -> range(3,6)
  labelReference = list(set(labels))
  labels = numpy.array([labelReference.index(l) for l in labels], dtype="int8")
  samples = [texter.tokenize(sample,
                             ignoreCommon=100,
                             removeStrings=["[identifier deleted]"],
                             correctSpell=True)
             for sample in samples]
  if args.verbosity > 1:
    for i, s in enumerate(samples): print i, s, labelReference[labels[i]]
  patterns = [model.encodePattern(s) for s in samples]

  # Either we train on all the data, test on all the data, or run k-fold CV.
  if args.train:
    training(model, [(p, labels[i]) for i, p in enumerate(patterns)])
  elif args.test:
    results = testing(model, [(p, labels[i]) for i, p in enumerate(patterns)])
    calculateTrialResults(model, results, labelReference, xrange(len(samples)),
      os.path.join(modelPath, "test_results.csv"))
  elif args.kFolds > 1:
    # Run k-folds cross validation -- train the model on a subset, and evaluate
    # on the remaining subset.
    partitions = KFolds(args.kFolds).split(xrange(len(samples)))
    intermResults = []
    predictions = []
    for k in xrange(args.kFolds):
      print "Training and testing for CV fold {0}.".format(k)
      trialResults = runExperiment(model, patterns, labels, partitions[k])

      if args.expectationDataPath:
        # Keep the predicted labels (top prediction only) for later.
        p = [l if l else [None] for l in trialResults[0]]
        predictions.append(
          [labelReference[idx[0]] if idx[0] is not None else '(none)' for idx in p])

      print "Calculating intermediate results for this fold. Writing to CSV."
      intermResults.append(calculateTrialResults(model,
        trialResults, labelReference, partitions[k][1],
        os.path.join(modelPath, "evaluation_fold_" + str(k) + ".csv")))

    print "Calculating cumulative results for {0} trials.".format(args.kFolds)
    results = model.evaluateFinalResults(intermResults)
    results["total_cm"].to_csv(os.path.join(modelPath, "evaluation_totals.csv"))
    if args.expectationDataPath:
      computeExpectedAccuracy(list(itertools.chain.from_iterable(predictions)),
        os.path.abspath(os.path.join(root, '../..', args.expectationDataPath)))

  print "Calculating random classifier results for comparison."
  print model.classifyRandomly(labels)

  print "Saving model to \'{0}\' directory.".format(modelPath)
  with open(
    os.path.join(modelPath, "model.pkl"), "wb") as f:
    pkl.dump(model, f)
  print "Experiment complete in {0:.2f} seconds.".format(time.time() - start)