  def testClassifyEndpointAsExpected(self):
    """
    Tests ClassificationModelEndpoint.
    
    The model trains on the first five samples of the dataset and is tested on
    the rest; its classifications should match those in the expected-classes
    data file.
    """
    runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                    resultsDir="",
                    experimentName="endpoint_test",
                    load=False,
                    modelName="ClassificationModelEndpoint",
                    modelModuleName="fluent.models.classify_endpoint",
                    numClasses=3,
                    plots=0,
                    orderedSplit=True,
                    trainSize=[5],
                    verbosity=0)
    runner.initModel()
    self.runExperiment(runner)

    expectedClasses, resultClasses = self.getExpectedClassifications(runner,
      os.path.join(DATA_DIR, "responses_expected_classes_endpoint.csv"))

    for e, r in zip(expectedClasses, resultClasses):
      self.assertEqual(sorted(e), sorted(r),
        "Endpoint model predicted classes other than what we expect.")

  def testClassifyKeywordsAsExpected(self):
    """
    Tests ClassificationModelKeywords.
    
    The model trains on the first five samples of the dataset and is tested on
    the rest; its classifications should match those in the expected-classes
    data file.
    """
    runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                    resultsDir="",
                    experimentName="keywords_test",
                    load=False,
                    modelName="ClassificationModelKeywords",
                    modelModuleName="fluent.models.classify_keywords",
                    numClasses=3,
                    plots=0,
                    orderedSplit=True,
                    trainSize=[5],
                    verbosity=0)
    runner.initModel()
    self.runExperiment(runner)

    expectedClasses, resultClasses = self.getExpectedClassifications(
      runner, os.path.join(DATA_DIR, "responses_expected_classes_keywords.csv"))

    for i, (e, r) in enumerate(zip(expectedClasses, resultClasses)):
      if i in (7, 9, 12):
        # Ties amongst winning labels are handled randomly, which affects the
        # third classification in these test samples.
        e = e[:2]
        r = r[:2]
      self.assertEqual(sorted(e), sorted(r),
        "Keywords model predicted classes other than what we expect.")
Example #3
    def testClassifyKeywordsAsExpected(self):
        """
    Tests ClassificationModelKeywords.
    
    The model trains on the first five samples of the dataset and is tested on
    the rest; its classifications should match those in the expected-classes
    data file.
    """
        modelName = "Keywords"
        runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                        resultsDir="",
                        experimentName="keywords_test",
                        loadPath=None,
                        modelName=modelName,
                        numClasses=3,
                        plots=0,
                        orderedSplit=True,
                        trainSizes=[5],
                        verbosity=0)
        runner.initModel(modelName)
        self.runExperiment(runner)

        expectedClasses, resultClasses = self.getExpectedClassifications(
            runner,
            os.path.join(DATA_DIR, "responses_expected_classes_keywords.csv"))

        for i, (e, r) in enumerate(zip(expectedClasses, resultClasses)):
            if i in (7, 9, 12):
                # Ties amongst winning labels are handled randomly, which affects the
                # third classification in these test samples.
                e = e[:2]
                r = r[:2]
            self.assertEqual(
                sorted(e), sorted(r),
                "Keywords model predicted classes other than what we expect.")
Example #4
    def testClassifyWordFingerprintsAsExpected(self):
        """
    Tests ClassificationModelFingerprint (for encoder type 'word').
    
    The model trains on the first five samples of the dataset and is tested on
    the rest; its classifications should match those in the expected-classes
    data file.
    """
        modelName = "CioWordFingerprint"
        runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                        resultsDir="",
                        experimentName="fingerprints_test",
                        loadPath=None,
                        modelName=modelName,
                        numClasses=3,
                        plots=0,
                        orderedSplit=True,
                        trainSizes=[5],
                        verbosity=0)
        runner.initModel(modelName)
        runner.model.encoder.fingerprintType = EncoderTypes.word
        self.runExperiment(runner)

        expectedClasses, resultClasses = self.getExpectedClassifications(
            runner,
            os.path.join(DATA_DIR,
                         "responses_expected_classes_fingerprint_word.csv"))
        for i, (e, r) in enumerate(zip(expectedClasses, resultClasses)):
            if sorted(e) != sorted(r):
                print i, e, r

        for e, r in zip(expectedClasses, resultClasses):
            self.assertEqual(
                sorted(e), sorted(r),
                "Fingerprint model predicted classes other than what we expect.")
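The EncoderTypes.word value set above (and EncoderTypes.document in a later example) comes from the fluent package; only these two members appear in the tests. A rough stand-in consistent with that usage, stated as an assumption rather than the real definition, would be:

class EncoderTypes(object):
    """Illustrative stand-in for fluent's EncoderTypes constants; only the two
    values referenced in these tests are assumed here."""
    word = "word"
    document = "document"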
Example #5
    def testClassifyEndpointAsExpected(self):
        """
    Tests ClassificationModelEndpoint.
    
    The model trains on the first five samples of the dataset and is tested on
    the rest; its classifications should match those in the expected-classes
    data file.
    """
        modelName = "CioEndpoint"
        runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                        resultsDir="",
                        experimentName="endpoint_test",
                        loadPath=None,
                        modelName=modelName,
                        numClasses=3,
                        plots=0,
                        orderedSplit=True,
                        trainSizes=[5],
                        verbosity=0)
        runner.initModel(modelName)
        self.runExperiment(runner)

        expectedClasses, resultClasses = self.getExpectedClassifications(
            runner,
            os.path.join(DATA_DIR, "responses_expected_classes_endpoint.csv"))

        for e, r in zip(expectedClasses, resultClasses):
            self.assertEqual(
                sorted(e), sorted(r),
                "Endpoint model predicted classes other than what we expect.")
Example #6
    def testClassifyDocumentFingerprintsAsExpected(self):
        """
    Tests ClassificationModelFingerprint (for encoder type 'document').
    
    The model trains on the first five samples of the dataset and is tested on
    the rest; its classifications should match those in the expected-classes
    data file.
    """
        runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                        resultsDir="",
                        experimentName="fingerprints_test",
                        load=False,
                        modelName="ClassificationModelFingerprint",
                        modelModuleName="fluent.models.classify_fingerprint",
                        numClasses=3,
                        plots=0,
                        orderedSplit=True,
                        trainSize=[5],
                        verbosity=0)
        runner.initModel()
        runner.model.encoder.fingerprintType = EncoderTypes.document
        self.runExperiment(runner)

        expectedClasses, resultClasses = self.getExpectedClassifications(
            runner,
            os.path.join(
                DATA_DIR,
                "responses_expected_classes_fingerprint_document.csv"))

        for e, r in zip(expectedClasses, resultClasses):
            self.assertEqual(
                sorted(e), sorted(r),
                "Fingerprint model predicted classes other than what we expect.")
Example #7
class FluentWrapper(object):
  """ Wraps nupic.fluent Model """

  def __init__(self, dataPath):
    """
    Initializes the nupic.fluent model with the given sample data.

    :param str dataPath: Path to the sample data file.
                         Must be a CSV file with 'ID' and 'Sample' columns.
    """
    g_log.info("Initialize nupic.fluent")
    # Initialize nupic.fluent model runner
    self._fluent = FluentRunner(dataPath=dataPath,
                        resultsDir="",
                        experimentName="imbu_fingerprints",
                        load=False,
                        modelName="ClassificationModelFingerprint",
                        modelModuleName="fluent.models.classify_fingerprint",
                        numClasses=1,  # must be >0 to go through training
                        plots=0,
                        orderedSplit=False,
                        trainSizes=[],
                        verbosity=0)

    # Train model with given sample data
    self._fluent.initModel()
    self._fluent.setupData()
    self._fluent.trainSize = len(self._fluent.samples)
    self._fluent.encodeSamples()
    self._fluent.resetModel(0)

    for i in range(self._fluent.trainSize):
      self._fluent.model.trainModel(i)


  def query(self, text):
    """ Queries fluent model and returns an ordered list of matching documents.

    :param str text: The text to match.

    :returns: a sequence of matching samples.

    ::
    [
        {"id": "1", "text": "sampleText", "score": "0.75"},
        ...
    ]
    """
    results = []
    if text:
      g_log.info("Query model for : %s", text)
      sampleIDs, sampleDists = self._fluent.model.queryModel(text, False)
      for sID, dist in zip (sampleIDs, sampleDists):
        results.append({"id": sID,
                        "text": self._fluent.dataDict[sID][0],
                        "score": dist.item()})

    return results
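A minimal usage sketch for the wrapper above; the data path and the query string are placeholders, and the printed fields follow the result format documented in query():

# Hypothetical usage; "samples.csv" stands in for a real CSV with
# 'ID' and 'Sample' columns.
wrapper = FluentWrapper("samples.csv")
for match in wrapper.query("how do I reset my password"):
  print "%s  %.2f  %s" % (match["id"], match["score"], match["text"])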
Example #8
  def __init__(self, dataPath):
    """
    Initializes the nupic.fluent model with the given sample data.

    :param str dataPath: Path to the sample data file.
                         Must be a CSV file with 'ID' and 'Sample' columns.
    """
    g_log.info("Initialize nupic.fluent")
    # Initialize nupic.fluent model runner
    self._fluent = FluentRunner(dataPath=dataPath,
                        resultsDir="",
                        experimentName="imbu_fingerprints",
                        load=False,
                        modelName="ClassificationModelFingerprint",
                        modelModuleName="fluent.models.classify_fingerprint",
                        numClasses=1,  # must be >0 to go through training
                        plots=0,
                        orderedSplit=False,
                        trainSizes=[],
                        verbosity=0)

    # Train model with given sample data
    self._fluent.initModel()
    self._fluent.setupData()
    self._fluent.trainSize = len(self._fluent.samples)
    self._fluent.encodeSamples()
    self._fluent.resetModel(0)

    for i in range(self._fluent.trainSize):
      self._fluent.model.trainModel(i)
Example #9
    def __init__(self, dataPath):
        """
    Initializes the nupic.fluent model with the given sample data.

    :param str dataPath: Path to the sample data file.
                         Must be a CSV file with 'ID' and 'Sample' columns.
    """
        g_log.info("Initialize nupic.fluent")
        # Initialize nupic.fluent model runner
        self._fluent = FluentRunner(
            dataPath=dataPath,
            resultsDir="",
            experimentName="imbu_fingerprints",
            load=False,
            modelName="ClassificationModelFingerprint",
            modelModuleName="fluent.models.classify_fingerprint",
            numClasses=1,  # must be >0 to go through training
            plots=0,
            orderedSplit=False,
            trainSizes=[],
            verbosity=0,
        )

        # Train model with given sample data
        self._fluent.initModel()
        self._fluent.setupData()
        self._fluent.trainSize = len(self._fluent.samples)
        self._fluent.encodeSamples()
        self._fluent.resetModel(0)

        for i in range(self._fluent.trainSize):
            self._fluent.model.trainModel(i)
Example #10
class FluentWrapper(object):
    """ Wraps nupic.fluent Model """

    def __init__(self, dataPath):
        """
    Initializes the nupic.fluent model with the given sample data.

    :param str dataPath: Path to the sample data file.
                         Must be a CSV file with 'ID' and 'Sample' columns.
    """
        g_log.info("Initialize nupic.fluent")
        # Initialize nupic.fluent model runner
        self._fluent = FluentRunner(
            dataPath=dataPath,
            resultsDir="",
            experimentName="imbu_fingerprints",
            load=False,
            modelName="ClassificationModelFingerprint",
            modelModuleName="fluent.models.classify_fingerprint",
            numClasses=1,  # must be >0 to go through training
            plots=0,
            orderedSplit=False,
            trainSizes=[],
            verbosity=0,
        )

        # Train model with given sample data
        self._fluent.initModel()
        self._fluent.setupData()
        self._fluent.trainSize = len(self._fluent.samples)
        self._fluent.encodeSamples()
        self._fluent.resetModel(0)

        for i in range(self._fluent.trainSize):
            self._fluent.model.trainModel(i)

    def query(self, text):
        """ Queries fluent model and returns an ordered list of matching documents.

    :param str text: The text to match.

    :returns: a sequence of matching samples.

    ::
    [
        {"id": "1", "text": "sampleText", "score": "0.75"},
        ...
    ]
    """
        results = []
        if text:
            g_log.info("Query model for : %s", text)
            sampleIDs, sampleDists = self._fluent.model.queryModel(text, False)
            for sID, dist in zip(sampleIDs, sampleDists):
                results.append({"id": sID, "text": self._fluent.dataDict[sID][0], "score": dist.item()})

        return results
Example #11
def run(args):
  start = time.time()

  root = os.path.dirname(os.path.realpath(__file__))
  resultsDir = os.path.join(root, args.resultsDir)

  runner = Runner(dataPath=args.dataPath,
                  resultsDir=resultsDir,
                  experimentName=args.experimentName,
                  load=args.load,
                  modelName=args.modelName,
                  modelModuleName=args.modelModuleName,
                  numClasses=args.numClasses,
                  plots=args.plots,
                  orderedSplit=args.orderedSplit,
                  trainSize=args.trainSize,
                  verbosity=args.verbosity)

  runner.initModel()

  print "Reading in data and preprocessing."
  dataTime = time.time()
  runner.setupData()
  print ("Data setup complete; elapsed time is {0:.2f} seconds.\nNow encoding "
        "the data".format(time.time() - dataTime))

  encodeTime = time.time()
  runner.encodeSamples()
  print ("Encoding complete; elapsed time is {0:.2f} seconds.\nNow running the "
         "experiment.".format(time.time() - encodeTime))

  runner.runExperiment()

  runner.calculateResults()

  runner.save()

  print "Experiment complete in {0:.2f} seconds.".format(time.time() - start)

  if args.validation:
    print "Validating experiment against expected classifications..."
    print runner.validateExperiment(args.validation)
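The args namespace consumed by run() above presumably comes from argparse. A minimal parser sketch that would supply the attributes this snippet reads is shown below; the flag names and defaults are assumptions, only the destination attribute names are taken from the code:

import argparse

def parseArgs():
  # Illustrative only: flag names and defaults are guesses, but the resulting
  # attributes match what run() accesses (args.dataPath, args.trainSize, ...).
  parser = argparse.ArgumentParser()
  parser.add_argument("--dataPath", required=True)
  parser.add_argument("--resultsDir", default="results")
  parser.add_argument("--experimentName", default="experiment")
  parser.add_argument("--load", action="store_true")
  parser.add_argument("--modelName", default="ClassificationModelKeywords")
  parser.add_argument("--modelModuleName",
                      default="fluent.models.classify_keywords")
  parser.add_argument("--numClasses", type=int, default=3)
  parser.add_argument("--plots", type=int, default=0)
  parser.add_argument("--orderedSplit", action="store_true")
  parser.add_argument("--trainSize", type=int, nargs="+", default=[5])
  parser.add_argument("--verbosity", type=int, default=0)
  parser.add_argument("--validation", default="")
  return parser.parse_args()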
Example #12
def run(args):
    start = time.time()

    root = os.path.dirname(os.path.realpath(__file__))
    resultsDir = os.path.join(root, args.resultsDir)

    if os.path.isdir(args.dataPath):
        runner = MultiRunner(dataPath=args.dataPath,
                             resultsDir=resultsDir,
                             experimentName=args.experimentName,
                             load=args.load,
                             modelName=args.modelName,
                             modelModuleName=args.modelModuleName,
                             numClasses=args.numClasses,
                             plots=args.plots,
                             orderedSplit=args.orderedSplit,
                             trainSize=args.trainSize,
                             verbosity=args.verbosity,
                             test=args.test)
    elif args.modelName == "ClassificationModelHTM":
        runner = HTMRunner(dataPath=args.dataPath,
                           resultsDir=resultsDir,
                           experimentName=args.experimentName,
                           load=args.load,
                           modelName=args.modelName,
                           modelModuleName=args.modelModuleName,
                           numClasses=args.numClasses,
                           plots=args.plots,
                           orderedSplit=args.orderedSplit,
                           trainSize=args.trainSize,
                           verbosity=args.verbosity,
                           generateData=args.generateData,
                           votingMethod=args.votingMethod,
                           classificationFile=args.classificationFile,
                           classifierType=args.classifierType)
    else:
        runner = Runner(dataPath=args.dataPath,
                        resultsDir=resultsDir,
                        experimentName=args.experimentName,
                        load=args.load,
                        modelName=args.modelName,
                        modelModuleName=args.modelModuleName,
                        numClasses=args.numClasses,
                        plots=args.plots,
                        orderedSplit=args.orderedSplit,
                        trainSize=args.trainSize,
                        verbosity=args.verbosity)

    if args.modelName != "ClassificationModelHTM":
        # The data isn't ready yet to initialize an HTM model.
        runner.initModel()

    print "Reading in data and preprocessing."
    dataTime = time.time()
    runner.setupData(args.textPreprocess)
    print(
        "Data setup complete; elapsed time is {0:.2f} seconds.\nNow encoding "
        "the data".format(time.time() - dataTime))

    encodeTime = time.time()
    runner.encodeSamples()
    print(
        "Encoding complete; elapsed time is {0:.2f} seconds.\nNow running the "
        "experiment.".format(time.time() - encodeTime))

    runner.runExperiment()

    runner.writeOutClassifications()

    runner.calculateResults()

    print "Saving..."
    runner.save()

    print "Experiment complete in {0:.2f} seconds.".format(time.time() - start)

    if args.validation:
        print "Validating experiment against expected classifications..."
        print runner.validateExperiment(args.validation)


def run(args):
    start = time.time()

    if (not isinstance(args.kFolds, int)) or (args.kFolds < 1):
        raise ValueError("Invalid value for number of cross-validation folds.")

    root = os.path.dirname(os.path.realpath(__file__))
    resultsDir = os.path.join(root, args.resultsDir)

    if args.modelName == "HTMNetwork":
        runner = HTMRunner(dataPath=args.dataPath,
                           networkConfigPath=args.networkConfigPath,
                           resultsDir=resultsDir,
                           experimentName=args.experimentName,
                           loadPath=args.loadPath,
                           modelName=args.modelName,
                           numClasses=args.numClasses,
                           plots=args.plots,
                           orderedSplit=args.orderedSplit,
                           trainSizes=[],
                           verbosity=args.verbosity,
                           generateData=args.generateData,
                           votingMethod=args.votingMethod,
                           classificationFile=args.classificationFile,
                           classifierType=args.classifierType)
    else:
        runner = Runner(dataPath=args.dataPath,
                        resultsDir=resultsDir,
                        experimentName=args.experimentName,
                        loadPath=args.loadPath,
                        modelName=args.modelName,
                        numClasses=args.numClasses,
                        plots=args.plots,
                        orderedSplit=args.orderedSplit,
                        trainSizes=[],
                        verbosity=args.verbosity)

        # HTM network data isn't ready yet to initialize the model
        runner.initModel(args.modelName)

    print "Reading in data and preprocessing."
    dataTime = time.time()
    runner.setupData(args.textPreprocess)

    # TODO: move kfolds splitting to Runner
    random = not args.orderedSplit
    runner.partitions = KFolds(args.kFolds).split(range(len(runner.samples)),
                                                  randomize=random)
    runner.trainSizes = [len(x[0]) for x in runner.partitions]
    print(
        "Data setup complete; elapsed time is {0:.2f} seconds.\nNow encoding "
        "the data".format(time.time() - dataTime))

    encodeTime = time.time()
    runner.encodeSamples()
    print(
        "Encoding complete; elapsed time is {0:.2f} seconds.\nNow running the "
        "experiment.".format(time.time() - encodeTime))

    runner.runExperiment()
    print "Experiment complete in {0:.2f} seconds.".format(time.time() - start)

    resultCalcs = runner.calculateResults()
    _ = runner.evaluateCumulativeResults(resultCalcs)

    print "Saving..."
    runner.saveModel()

    if args.validation:
        print "Validating experiment against expected classifications..."
        print runner.validateExperiment(args.validation)
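The k-fold split above relies on KFolds(k).split(indices, randomize=...) yielding one (trainIndices, testIndices) pair per fold, since runner.trainSizes is taken from len(x[0]). A simplified splitter with that interface, offered as an illustration of the assumed behavior rather than the actual fluent KFolds class:

import random

class SimpleKFolds(object):
    """Illustrative k-fold splitter matching the (train, test) tuple interface
    the snippet above relies on; not the fluent KFolds implementation."""

    def __init__(self, k):
        self.k = k

    def split(self, indices, randomize=False):
        indices = list(indices)
        if randomize:
            random.shuffle(indices)
        foldSize = len(indices) // self.k
        partitions = []
        for fold in range(self.k):
            # Remainder samples simply stay in every training split here.
            test = indices[fold * foldSize:(fold + 1) * foldSize]
            train = [i for i in indices if i not in test]
            partitions.append((train, test))
        return partitions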