class Runner(object):
  """
  Class to run the baseline NLP experiments with the specified data, models,
  text processing, and evaluation metrics.
  """

  def __init__(self,
               dataPath,
               resultsDir,
               experimentName,
               experimentType,
               modelName,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               classifierMetric="rawOverlap",
               loadPath=None,
               numClasses=3,
               plots=0,
               orderedSplit=False,
               folds=None,
               trainSizes=None,
               verbosity=0,
               **kwargs):
    """
    @param dataPath         (str)     Path to raw data file for the experiment.
    @param resultsDir       (str)     Directory for the results metrics.
    @param experimentName   (str)     Experiment name, used for saving results.
    @param experimentType   (str)     One of 'buckets', 'incremental', or
                                      'k-folds'.
    @param modelName        (str)     Name of the NLP model subclass.
    @param retinaScaling    (float)   For scaling dimensions of Cio encoders.
    @param retina           (str)     Name of Cio retina for encodings.
    @param apiKey           (str)     Key for Cio API.
    @param classifierMetric (str)     Distance metric used by the classifier.
    @param loadPath         (str)     Path to serialized model for loading.
    @param numClasses       (int)     Number of classes (labels) per sample.
    @param plots            (int)     Specifies plotting of evaluation metrics.
    @param orderedSplit     (bool)    Indicates method for splitting train/test
                                      samples; False is random, True is ordered.
    @param folds            (int)     For k-folds experiment, number of cross
                                      validation folds.
    @param trainSizes       (list)    For incremental experiment, number of
                                      samples to use in training, per trial.
    @param verbosity        (int)     Greater value prints out more progress.
    """
    if experimentType not in ("buckets", "incremental", "k-folds"):
      raise ValueError("Experiment type not recognized.")

    self.experimentType = experimentType
    self.folds = folds
    self.trainSizes = trainSizes
    self.dataPath = dataPath
    self.resultsDir = resultsDir
    self.experimentName = experimentName
    self.loadPath = loadPath
    self.modelName = modelName
    self.numClasses = numClasses
    self.plots = plots
    self.orderedSplit = orderedSplit
    self.retinaScaling = retinaScaling
    self.retina = retina
    self.apiKey = apiKey
    self.classifierMetric = classifierMetric
    self.verbosity = verbosity

    self.modelDir = os.path.join(
      self.resultsDir, self.experimentName, self.modelName)
    if not os.path.exists(self.modelDir):
      os.makedirs(self.modelDir)

    if self.plots:
      from htmresearch.support.nlp_classification_plotting import PlotNLP
      self.plotter = PlotNLP()

    self.buckets = None
    self.dataDict = None
    self.labels = None
    self.labelRefs = None
    self.partitions = []
    self.samples = {}
    self.patterns = None
    self.results = []
    self.model = None


  def initModel(self, modelName):
    """Load or instantiate the classification model."""
    if self.loadPath:
      self.model = self.loadModel()
    else:
      self.model = self._createModel(modelName)


  def _createModel(self, modelName):
    """Return an instantiated model."""
    modelCls = _MODEL_MAPPING.get(modelName, None)

    if modelCls is None:
      raise ValueError("Could not instantiate model \'{}\'.".format(modelName))

    # TODO: remove these if blocks and just use the else; either specify the Cio
    # FP type elsewhere, or split Word and Doc into separate classes.

    if modelName == "CioWordFingerprint":
      return modelCls(verbosity=self.verbosity,
                      numLabels=self.numClasses,
                      modelDir=self.modelDir,
                      fingerprintType=EncoderTypes.word,
                      retinaScaling=self.retinaScaling,
                      retina=self.retina,
                      apiKey=self.apiKey,
                      classifierMetric=self.classifierMetric)

    elif modelName == "CioDocumentFingerprint":
      return modelCls(verbosity=self.verbosity,
                      numLabels=self.numClasses,
                      modelDir=self.modelDir,
                      fingerprintType=EncoderTypes.document,
                      retinaScaling=self.retinaScaling,
                      retina=self.retina,
                      apiKey=self.apiKey,
                      classifierMetric=self.classifierMetric)

    else:
      return modelCls(verbosity=self.verbosity,
                      numLabels=self.numClasses,
                      modelDir=self.modelDir,
                      classifierMetric=self.classifierMetric)


  def loadModel(self):
    """Load the serialized model."""
    try:
      with open(self.loadPath, "rb") as f:
        model = pkl.load(f)
      print "Model loaded from \'{}\'.".format(self.loadPath)
      return model
    except IOError as e:
      print "Could not load model from \'{}\'.".format(self.loadPath)
      raise e


  def resetModel(self, _):
    self.model.resetModel()


  def saveModel(self, trial=None):
    self.model.saveModel(trial)


  def setupData(self, preprocess=False):
    """
    Get the data from CSV and preprocess if specified. The call to readCSV()
    assumes a specific CSV format, detailed in its docstring.

    @param preprocess   (bool)    Whether or not to preprocess the data when
                                  reading in samples.
    """
    self.dataDict = readCSV(self.dataPath, numLabels=self.numClasses)

    if self.experimentType == "incremental":
      # stop now if the data won't work for the specified experiment
      if (not isinstance(self.trainSizes, list) or not
          all([0 <= size <= len(self.dataDict) for size in self.trainSizes])):
        raise ValueError("Invalid size(s) for training set(s).")

    self.labelRefs, self.dataDict = mapLabelRefs(self.dataDict)

    self.samples = self.model.prepData(self.dataDict, preprocess)

    if self.verbosity > 1:
      for i, s in self.samples.iteritems():
        print i, s


  def encodeSamples(self, writeEncodings=False):
    """
    The patterns list is in the same order as the samples in the original data
    file; the order is preserved by the OrderedDicts self.dataDict and
    self.samples, which may or may not match the samples' unique IDs.

    @param writeEncodings   (bool)    True will write the encodings to a JSON.
    """
    self.patterns = self.model.encodeSamples(self.samples, write=writeEncodings)


  def runExperiment(self, seed=42):
    """Train and test the model for each trial specified by self.splitting."""
    self.partitionIndices(seed)

    for i, _ in enumerate(self.partitions):
      self.resetModel(i)
      if self.verbosity > 0:
        print "\tTraining and testing for run {}.".format(i)
      self.training(i)
      self.testing(i, seed)


  def partitionIndices(self, seed=42):
    """
    Sets self.partitions to a list of (trainIndices, testIndices) tuples, one
    per trial.
    """
    if self.experimentType == "k-folds":
      self.partitions = KFolds(self.folds).split(
        range(len(self.samples)), randomize=(not self.orderedSplit), seed=seed)
    else:
      # TODO: use StandardSplit in data_split.py
      length = len(self.samples)
      if self.orderedSplit:
        for split in self.trainSizes:
          trainIndices = range(split)
          testIndices = range(split, length)
          self.partitions.append((trainIndices, testIndices))
      else:
        # randomly sampled, not repeated
        random.seed(seed)
        for split in self.trainSizes:
          trainIndices = random.sample(xrange(length), split)
          testIndices = [i for i in xrange(length) if i not in trainIndices]
          self.partitions.append((trainIndices, testIndices))
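
  # Illustration of partitionIndices with hypothetical numbers: given 6
  # samples, trainSizes=[2, 4], and orderedSplit=True, self.partitions becomes
  #   [([0, 1], [2, 3, 4, 5]), ([0, 1, 2, 3], [4, 5])],
  # i.e. one (trainIndices, testIndices) tuple per trial.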


  def training(self, trial):
    """
    Train the model one-by-one on each pattern specified in this trial's
    partition of indices. Models' training methods require the sample and label
    to be in a list.
    """
    if self.verbosity > 0:
      print ("\tRunner selects to train on sample(s) {}".format(
        self.partitions[trial][0]))

    for i in self.partitions[trial][0]:
      self.model.trainModel(i)


  def testing(self, trial, seed):
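    """Test the model on this trial's test partition and store the results."""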
    if self.verbosity > 0:
      print ("\tRunner selects to test on sample(s) {}".format(
        self.partitions[trial][1]))

    results = ([], [])
    for i in self.partitions[trial][1]:
      predicted = self.model.testModel(i, seed)
      results[0].append(predicted)
      results[1].append(self.patterns[i]["labels"])

    self.results.append(results)


  def writeOutClassifications(self):
    """Write the samples, actual, and predicted classes to a CSV."""
    headers = ("", "Tokenized sample", "Actual", "Predicted")

    if self.experimentType == "k-folds":
      splits = range(self.folds)
    else:
      splits = self.trainSizes

    for trial in xrange(len(splits)):
      resultsDict = defaultdict(list)
      for i, sampleNum in enumerate(self.partitions[trial][1]):
        # Loop through the indices in the test set of this trial.
        sample = self.samples.values()[sampleNum][0]
        pred = sorted([self.labelRefs[j] for j in self.results[trial][0][i]])
        actual = sorted([self.labelRefs[j] for j in self.results[trial][1][i]])
        resultsDict[sampleNum] = (sampleNum, sample, actual, pred)

      resultsPath = os.path.join(self.model.modelDir,
                                 "results_trial" + str(trial) + ".csv")
      writeFromDict(resultsDict, headers, resultsPath)


  def calculateResults(self):
    """
    Calculate evaluation metrics from the result classifications.
    """
    # TODO: pass intended CM results to plotter.plotConfusionMatrix()
    resultCalcs = []
    for i, partition in enumerate(self.partitions):
      if self.verbosity > 0:
        self.printTrialReport(i, partition[1])
      resultCalcs.append(evaluateResults(
        self.results[i], self.labelRefs))

    trainSizes = [len(x[0]) for x in self.partitions]
    self.printFinalReport(trainSizes, [r[0] for r in resultCalcs])

    if self.plots:
      trialAccuracies = self._calculateTrialAccuracies()
      classificationAccuracies = self._calculateClassificationAccuracies(
        trialAccuracies)

      self.plotter.plotCategoryAccuracies(trialAccuracies, self.trainSizes)
      self.plotter.plotCumulativeAccuracies(
        classificationAccuracies, self.trainSizes)

    return resultCalcs


  def printTrialReport(self, trial, idx):
    """Print columns for sample #, actual label, and predicted label."""
    template = "{0:<10}|{1:<55}|{2:<55}"
    print "Classification results for the trial:"
    print template.format("#", "Actual", "Predicted")
    for i in xrange(len(self.results[trial][0])):
      if len(self.results[trial][0][i]) == 0:
        # No predicted classes for this sample.
        print template.format(
          idx[i],
          [self.labelRefs[label] for label in self.results[trial][1][i]],
          "(none)")
      else:
        print template.format(
          idx[i],
          [self.labelRefs[label] for label in self.results[trial][1][i]],
          [self.labelRefs[label] for label in self.results[trial][0][i]])


  @staticmethod
  def printFinalReport(trainSizes, accuracies):
    """Prints result accuracies."""
    template = "{0:<20}|{1:<10}"
    print "---------- RESULTS ----------"
    print template.format("Size of training set", "Accuracy")
    for size, acc in itertools.izip(trainSizes, accuracies):
      print template.format(size, acc)


  def _calculateTrialAccuracies(self):
    """
    @return trialAccuracies     (defaultdict)   Items are defaultdicts, one for
        each size of the training set. The inner defaultdicts' keys are
        categories, with numpy array values that contain one accuracy value for
        each trial.
    """
    # To handle multiple trials of the same size:
    # trialSize -> (category -> list of accuracies)
    trialAccuracies = defaultdict(lambda: defaultdict(lambda: numpy.ndarray(0)))
    for result, size in itertools.izip(self.results, self.trainSizes):
      accuracies = calculateClassificationResults(result)
      for label, acc in accuracies:
        category = self.labelRefs[label]
        accList = trialAccuracies[size][category]
        trialAccuracies[size][category] = numpy.append(accList, acc)

    return trialAccuracies
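
  # Illustration of the structure returned by _calculateTrialAccuracies, with
  # hypothetical values: for trainSizes=[2, 2, 4] and labels "A" and "B",
  #   {2: {"A": array([0.5, 1.0]), "B": array([1.0, 0.5])},
  #    4: {"A": array([0.75]), "B": array([1.0])}}
  # i.e. one accuracy per trial of each training-set size, keyed by category.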


  def _calculateClassificationAccuracies(self, trialAccuracies):
    """
    @param trialAccuracies            (defaultdict)   Please see the description
        in self._calculateTrialAccuracies().

    @return classificationAccuracies  (defaultdict)   Keys are classification
        categories, with multiple numpy arrays as values -- one for each size of
        training sets, with one accuracy value for each run of that training set
        size.
    """
    # Need the accuracies to be ordered for the plot
    trials = sorted(set(self.trainSizes))
    # category -> list of list of accuracies
    classificationAccuracies = defaultdict(list)
    for trial in trials:
      accuracies = trialAccuracies[trial]
      for label, acc in accuracies.iteritems():
        classificationAccuracies[label].append(acc)

    return classificationAccuracies


  def validateExperiment(self, expectationFilePath):
    """Returns accuracy of predicted labels against expected labels."""
    dataDict = readCSV(expectationFilePath, numLabels=self.numClasses)

    accuracies = numpy.zeros((len(self.results)))
    for i, trial in enumerate(self.results):
      for j, predictionList in enumerate(trial[0]):
        predictions = [self.labelRefs[p] for p in predictionList]
        if predictions == []:
          predictions = ["(none)"]
        expected = dataDict.items()[j+self.trainSizes[i]][1]

        accuracies[i] += (float(len(set(predictions) & set(expected[1])))
                          / len(expected[1]))

      accuracies[i] = accuracies[i] / len(trial[0])

    return accuracies


  def evaluateCumulativeResults(self, intermResults):
    """
    Cumulative statistics for the outputs of evaluateTrialResults().

    @param intermResults      (list)          List of returned results from
                                              evaluateTrialResults().
    @return                   (dict)          Returns a dictionary with entries
                                              for max, mean, and min accuracies,
                                              and the summed confusion matrix.
    """
    accuracy = []
    cm = numpy.zeros((intermResults[0][1].shape))

    # Find mean, max, and min values for the metrics.
    for result in intermResults:
      accuracy.append(result[0])
      cm = numpy.add(cm, result[1])

    results = {"max_accuracy":max(accuracy),
               "mean_accuracy":sum(accuracy)/float(len(accuracy)),
               "min_accuracy":min(accuracy),
               "total_cm":cm}

    self._printCumulativeReport(results)

    return results


  @staticmethod
  def _printCumulativeReport(results):
    """
    Prints the cumulative results as returned by evaluateCumulativeResults().
    """
    print "max, mean, min accuracies = "
    print "{0:.3f}, {1:.3f}, {2:.3f}".format(
      results["max_accuracy"], results["mean_accuracy"],
      results["min_accuracy"])
    print "total confusion matrix =\n", results["total_cm"]
class HTMRunner(Runner):
  """
  Class to run the HTM NLP experiments with the specified data and evaluation
  metrics.
  """

  def __init__(self,
               dataPath,
               resultsDir,
               experimentName,
               experimentType,
               networkConfigPath=None,
               generateData=True,
               votingMethod="most",
               classificationFile="",
               seed=42,
               **kwargs):
    """
    @param networkConfigPath  (str)    Path to JSON specifying network params.
    @param generateData       (bool)   Whether or not we need to generate data.
    @param votingMethod       (str)    Classify with "last" token's score or
                                       "most" frequent of the sequence.
    @param classificationFile (str)    Path to JSON that maps labels to ids.

    See base class constructor for the other parameters.
    """
    if networkConfigPath is None:
      raise RuntimeError("Need to specify a network configuration JSON.")

    super(HTMRunner, self).__init__(
      dataPath, resultsDir, experimentName, experimentType, **kwargs)

    self.networkConfig = self._getNetworkConfig(networkConfigPath)
    self.model = None
    self.votingMethod = votingMethod
    self.dataFiles = []
    self.actualLabels = None

    if classificationFile == "" and not generateData:
      raise ValueError("Must give classificationFile if not generating data")
    self.classificationFile = classificationFile

    # Setup data now in order to init the network model. If you want to
    # specify data params, just call setupNetData() again later.
    self.setupNetData(generateData=generateData, seed=seed)


  @staticmethod
  def _getNetworkConfig(networkConfigPath):
    try:
      with open(networkConfigPath, "rb") as fin:
        return json.load(fin)
    except IOError as e:
      print "Could not find network configuration JSON at \'{}\'.".format(
        networkConfigPath)
      raise e


  def initModel(self, trial=0):
    """
    Load or instantiate the classification model. Assumes network data is
    already setup.
    """
    if self.loadPath:
      with open(self.loadPath, "rb") as f:
        self.model = pkl.load(f)
      # TODO: uncomment once we can save TPRegion; do we need this?
      # networkFile = self.model.network
      # self.model.network = Network(networkFile)
      print "Model loaded from \'{0}\'.".format(self.loadPath)
    else:
      print "Creating HTM classification model..."
      self.model = ClassificationModelHTM(self.networkConfig,
                                          self.dataFiles[trial],
                                          retinaScaling=self.retinaScaling,
                                          retina=self.retina,
                                          apiKey=self.apiKey,
                                          verbosity=self.verbosity,
                                          numLabels=self.numClasses,
                                          modelDir=self.modelDir,
                                          prepData=False)


  def setupNetData(self, generateData=False, seed=42, preprocess=False,
                   **kwargs):
    """
    Generate the data in network API format if necessary. self.dataFiles is
    populated with the paths of network data files, one for each experiment
    iteration.

    Look at runner.py (setupData) and network_text_data_generator.py (split) for
    the parameters.
    """
    # TODO: logic here is confusing (a lot of if-statements), so maybe cleanup.
    if self.experimentType == "k-folds":
      splits = self.folds
    elif self.experimentType == "incremental":
      splits = len(self.trainSizes)
    else:
      raise ValueError("HTMRunner only supports 'k-folds' and 'incremental' "
                       "experiment types.")

    if generateData:
      self.generateNetworkDataFiles(splits, seed, preprocess, **kwargs)
    else:
      # Use the input file for each trial; maintains the order of samples.
      self.dataFiles = [self.dataPath] * splits

    if self.numClasses > 0:
      # Setup labels data objects
      self.actualLabels = [self._getClassifications(i) for i in xrange(splits)]
      self.mapLabelRefs()


  def generateNetworkDataFiles(self, splits, seed, preprocess, **kwargs):
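    """Generate network-format data files, one per experiment iteration."""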
    # TODO: use model.prepData()?
    ndg = NetworkDataGenerator()
    self.dataDict = ndg.split(filePath=self.dataPath,
                              numLabels=self.numClasses,
                              textPreprocess=preprocess,
                              **kwargs)

    filename, ext = os.path.splitext(self.dataPath)
    self.classificationFile = "{}_categories.json".format(filename)

    # Generate one data file for each experiment iteration.
    if self.experimentType == "k-folds" and not self.orderedSplit:
      # only randomize the data order once for k-folds cross validation
      ndg.randomizeData(seed)
    for i in xrange(splits):
      if self.experimentType != "k-folds" and not self.orderedSplit:
        ndg.randomizeData(seed)
        seed += 1
      # e.g. ext == ".csv"
      dataFile = "{}_network_{}{}".format(filename, i, ext)
      ndg.saveData(dataFile, self.classificationFile)
      self.dataFiles.append(dataFile)

    if self.verbosity > 0:
      print "{} file(s) generated at {}".format(len(self.dataFiles),
        self.dataFiles)
      print "Classification JSON is at: {}".format(self.classificationFile)


  def _getClassifications(self, iteration):
    """
    Get the classifications for a particular iteration.
    @param iteration  (int)       Iteration of the experiment.
    @return           (list)      List of list of ids of classifications for a
                                  sample.
    """
    dataFile = self.dataFiles[iteration]
    classifications = NetworkDataGenerator.getClassifications(dataFile)
    return [[int(c) for c in classes.strip().split(" ")]
      for classes in classifications]


  def mapLabelRefs(self):
    """Get the mapping from label strings to the corresponding ints."""
    try:
      with open(self.classificationFile, "r") as f:
        labelToId = json.load(f)
    except IOError as e:
      print "Must have a valid classification JSON file"
      raise e

    # Convert the dict of strings -> ids to a list of strings ordered by id
    self.labelRefs = zip(*sorted(labelToId.iteritems(), key=lambda x: x[1]))[0]
    for recordNumber, data in self.dataDict.iteritems():
      self.dataDict[recordNumber] = (data[0], numpy.array(
        [self.labelRefs.index(label) for label in data[1]]), data[2])


  def resetModel(self, trial=0):
    """
    Load or instantiate the classification model; network API doesn't support
    resetting.
    """
    self.initModel(trial=trial)
    # TODO: change to same as Runner:
    #   self.model.resetModel()
    #   otherwise you're creating a new model instance twice each experiment


  def setupData(self, _):
    """Passthrough b/c network data generation was done upfront."""
    pass


  def encodeSamples(self, _):
    """Passthrough b/c the network encodes the samples."""
    pass


  def training(self, trial):
    """
    Train the network on all the tokens in the training set for a particular
    trial.
    @param trial      (int)       current trial number
    """
    if self.verbosity > 0:
      i = 0
      indices = []
      for numTokens in self.partitions[trial][0]:
        indices.append(i)
        i += numTokens
      print ("\tRunner selects to train on sequences starting at indices {}.".
            format(indices))

    for numTokens in self.partitions[trial][0]:
      self.model.trainModel(iterations=numTokens)


  def testing(self, trial, seed):
    """
    Test the network on the test set for a particular trial and store the
    results.
    @param trial      (int)       trial count
    """
    if self.verbosity > 0:
      i = sum(self.partitions[trial][0])
      indices = []
      for numTokens in self.partitions[trial][1]:
        indices.append(i)
        i += numTokens
      print ("\tRunner selects to test on sequences starting at indices "
             "{}".format(indices))

    results = ([], [])
    testIndex = len(self.partitions[trial][0])
    for numTokens in self.partitions[trial][1]:
      predictions = []
      activations = []
      for _ in xrange(numTokens):
        predicted, active = self.model.testModel(seed)
        activations.append(active)
        predictions.append(predicted)
      winningPredictions = self._selectWinners(predictions, activations)

      # TODO: switch to standard (expected, actual) format
      results[0].append(winningPredictions)
      results[1].append(self.actualLabels[trial][testIndex])
      testIndex += 1

    # Prepare data for writeOutClassifications
    trainIdx = range(len(self.partitions[trial][0]))
    testIdx = range(len(self.partitions[trial][0]),
      len(self.partitions[trial][0]) + len(self.partitions[trial][1]))
    self.partitions[trial] = (trainIdx, testIdx)
    self.samples = NetworkDataGenerator.getSamples(self.dataFiles[trial])

    self.results.append(results)


  def _selectWinners(self, predictions, activations):
    """
    Selects the final classifications for the predictions. Voting
    method=="last" means the predictions for the last token are used. Voting
    method=="most" means the most frequent predictions across the sequence's
    tokens are used.
    @param predictions    (list)    List of lists of possible classifications,
                                    one per token.
    @param activations    (list)    Activation values collected per token; not
                                    used by this method.
    @return               (list)    List of winning classifications.
    """
    if self.votingMethod == "last":
      return predictions[-1]
    elif self.votingMethod == "most":
      counter = Counter()
      for p in predictions:
        counter.update(p)
      return zip(*counter.most_common(self.numClasses))[0]
    else:
      raise ValueError("voting method must be either \'last\' or \'most\'")


  def partitionIndices(self, _):
    """
    Sets self.partitions for the number of tokens for each sample in the
    training and test sets.

    The order of sequences is already specified by the network data files; if
    generated by the experiment, these are in order or randomized as specified
    by the orderedSplit arg.
    """
    if self.experimentType == "k-folds":
      for fold in xrange(self.folds):
        dataFile = self.dataFiles[fold]
        numTokens = NetworkDataGenerator.getNumberOfTokens(dataFile)
        self.partitions = KFolds(self.folds).split(numTokens, randomize=False)
    else:
      for trial, split in enumerate(self.trainSizes):
        dataFile = self.dataFiles[trial]
        numTokens = NetworkDataGenerator.getNumberOfTokens(dataFile)
        self.partitions.append((numTokens[:split], numTokens[split:]))

  # TODO
  # This method is meant to partition the data according to which regions are
  # learning, as in the sequence classification experiments.
  def partitionLearning(self):
    """
    Find the number of partitions for the input data based on a specific
    networkConfig.

    @return partitions: (list of namedtuples) Region names and index at which the
      region is to begin learning. The final partition is reserved as a test set.
    """
    Partition = namedtuple("Partition", "partName index")

    # Add regions to partition list in order of learning.
    regionConfigs = ("spRegionConfig", "tmRegionConfig", "tpRegionConfig",
      "classifierRegionConfig")
    partitions = []

    return


  def writeOutClassifications(self):
    # TODO: implement this method after updating HTM network models and runner
    # per nupic.research #277
    pass
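

# A minimal usage sketch for the HTM variant (not part of the original
# module). The file paths and network config JSON name are hypothetical;
# modelName is only used for the results sub-directory, and data setup happens
# in the constructor via setupNetData().
#
#   htmRunner = HTMRunner(dataPath="data/sample_reviews.csv",
#                         resultsDir="results",
#                         experimentName="htm_demo",
#                         experimentType="k-folds",
#                         modelName="htm",
#                         networkConfigPath="network_config.json",
#                         generateData=True,
#                         folds=5)
#   htmRunner.runExperiment(seed=42)
#   htmRunner.calculateResults()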