Example #1
    def testClassifyKeywordsAsExpected(self):
        """
    Tests ClassificationModelKeywords.
    
    Training on the first five samples of the dataset, and testing on the rest,
    the model's classifications should match those in the expected classes
    data file.
    """
        modelName = "Keywords"
        runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                        resultsDir="",
                        experimentName="keywords_test",
                        loadPath=None,
                        modelName=modelName,
                        numClasses=3,
                        plots=0,
                        orderedSplit=True,
                        trainSizes=[5],
                        verbosity=0)
        runner.initModel(modelName)
        self.runExperiment(runner)

        expectedClasses, resultClasses = self.getExpectedClassifications(
            runner,
            os.path.join(DATA_DIR, "responses_expected_classes_keywords.csv"))

        for i, (e, r) in enumerate(zip(expectedClasses, resultClasses)):
            if i in (7, 9, 12):
                # Ties amongst winning labels are handled randomly, which affects the
                # third classification in these test samples.
                e = e[:2]
                r = r[:2]
            self.assertEqual(
                sorted(e), sorted(r),
                "Keywords model predicted classes other than what we expect.")
Example #2
    def testClassifyEndpointAsExpected(self):
        """
    Tests ClassificationModelEndpoint.
    
    Training on the first five samples of the dataset, and testing on the rest,
    the model's classifications should match those in the expected classes
    data file.
    """
        modelName = "CioEndpoint"
        runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                        resultsDir="",
                        experimentName="endpoint_test",
                        loadPath=None,
                        modelName=modelName,
                        numClasses=3,
                        plots=0,
                        orderedSplit=True,
                        trainSizes=[5],
                        verbosity=0)
        runner.initModel(modelName)
        self.runExperiment(runner)

        expectedClasses, resultClasses = self.getExpectedClassifications(
            runner,
            os.path.join(DATA_DIR, "responses_expected_classes_endpoint.csv"))

        for e, r in zip(expectedClasses, resultClasses):
            self.assertEqual(
                sorted(e), sorted(r),
                "Endpoint model predicted classes other than what we expect.")
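
Each test also relies on a runExperiment helper that isn't shown. A plausible sketch, mirroring the script-level flow in the run() functions at the end of this page (read and preprocess the data, encode it, then train and test); it assumes setupData() applies no text preprocessing when called without arguments.

    def runExperiment(self, runner):
        # Sketch only: replicate the setup -> encode -> run sequence used
        # by the command-line scripts below.
        runner.setupData()
        runner.encodeSamples()
        runner.runExperiment()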
Example #3
    def testClassifyWordFingerprintsAsExpected(self):
        """
    Tests ClassificationModelFingerprint (for encoder type 'word').
    
    Training on the first five samples of the dataset, and testing on the rest,
    the model's classifications should match those in the expected classes
    data file.
    """
        modelName = "CioWordFingerprint"
        runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                        resultsDir="",
                        experimentName="fingerprints_test",
                        loadPath=None,
                        modelName=modelName,
                        numClasses=3,
                        plots=0,
                        orderedSplit=True,
                        trainSizes=[5],
                        verbosity=0)
        runner.initModel(modelName)
        runner.model.encoder.fingerprintType = EncoderTypes.word
        self.runExperiment(runner)

        expectedClasses, resultClasses = self.getExpectedClassifications(
            runner,
            os.path.join(DATA_DIR,
                         "responses_expected_classes_fingerprint_word.csv"))

        for e, r in zip(expectedClasses, resultClasses):
            self.assertEqual(
                sorted(e), sorted(r),
                "Fingerprint model predicted classes other than what we expect.")
Example #4
    def testClassifyDocumentFingerprintsAsExpected(self):
        """
    Tests ClassificationModelFingerprint (for encoder type 'document').
    
    Training on the first five samples of the dataset, and testing on the rest,
    the model's classifications should match those in the expected classes
    data file.
    """
        runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                        resultsDir="",
                        experimentName="fingerprints_test",
                        load=False,
                        modelName="ClassificationModelFingerprint",
                        modelModuleName="fluent.models.classify_fingerprint",
                        numClasses=3,
                        plots=0,
                        orderedSplit=True,
                        trainSize=[5],
                        verbosity=0)
        runner.initModel()
        runner.model.encoder.fingerprintType = EncoderTypes.document
        self.runExperiment(runner)

        expectedClasses, resultClasses = self.getExpectedClassifications(
            runner,
            os.path.join(
                DATA_DIR,
                "responses_expected_classes_fingerprint_document.csv"))

        for e, r in zip(expectedClasses, resultClasses):
            self.assertEqual(
                sorted(e), sorted(r),
                "Fingerprint model predicted classes other than what we expect.")
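
Note that Example #4 targets an older Runner signature (load=, modelModuleName=, trainSize=, and a no-argument initModel()), the same one used by the first run() script below, while Examples #1-#3 use the newer one (loadPath=, trainSizes=, initModel(modelName)). Since the two fingerprint tests differ only in model name, encoder type, and expected-classes file, they could share one helper; a sketch assuming the newer Runner signature shown in Examples #1-#3 (the helper name is hypothetical):

    def _runFingerprintTest(self, modelName, fingerprintType,
                            expectedClassesFile):
        # Hypothetical shared helper for the two fingerprint tests above.
        runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                        resultsDir="",
                        experimentName="fingerprints_test",
                        loadPath=None,
                        modelName=modelName,
                        numClasses=3,
                        plots=0,
                        orderedSplit=True,
                        trainSizes=[5],
                        verbosity=0)
        runner.initModel(modelName)
        runner.model.encoder.fingerprintType = fingerprintType
        self.runExperiment(runner)

        expectedClasses, resultClasses = self.getExpectedClassifications(
            runner, os.path.join(DATA_DIR, expectedClassesFile))
        for e, r in zip(expectedClasses, resultClasses):
            self.assertEqual(
                sorted(e), sorted(r),
                "Fingerprint model predicted classes other than what we expect.")

    def testClassifyWordFingerprintsAsExpected(self):
        self._runFingerprintTest(
            "CioWordFingerprint",
            EncoderTypes.word,
            "responses_expected_classes_fingerprint_word.csv")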
Example #5
def run(args):
    start = time.time()

    root = os.path.dirname(os.path.realpath(__file__))
    resultsDir = os.path.join(root, args.resultsDir)

    if os.path.isdir(args.dataPath):
        runner = MultiRunner(dataPath=args.dataPath,
                             resultsDir=resultsDir,
                             experimentName=args.experimentName,
                             load=args.load,
                             modelName=args.modelName,
                             modelModuleName=args.modelModuleName,
                             numClasses=args.numClasses,
                             plots=args.plots,
                             orderedSplit=args.orderedSplit,
                             trainSize=args.trainSize,
                             verbosity=args.verbosity,
                             test=args.test)
    elif args.modelName == "ClassificationModelHTM":
        runner = HTMRunner(dataPath=args.dataPath,
                           resultsDir=resultsDir,
                           experimentName=args.experimentName,
                           load=args.load,
                           modelName=args.modelName,
                           modelModuleName=args.modelModuleName,
                           numClasses=args.numClasses,
                           plots=args.plots,
                           orderedSplit=args.orderedSplit,
                           trainSize=args.trainSize,
                           verbosity=args.verbosity,
                           generateData=args.generateData,
                           votingMethod=args.votingMethod,
                           classificationFile=args.classificationFile,
                           classifierType=args.classifierType)
    else:
        runner = Runner(dataPath=args.dataPath,
                        resultsDir=resultsDir,
                        experimentName=args.experimentName,
                        load=args.load,
                        modelName=args.modelName,
                        modelModuleName=args.modelModuleName,
                        numClasses=args.numClasses,
                        plots=args.plots,
                        orderedSplit=args.orderedSplit,
                        trainSize=args.trainSize,
                        verbosity=args.verbosity)

    if args.modelName != "ClassificationModelHTM":
        # The data isn't ready yet to initialize an HTM model.
        runner.initModel()

    print "Reading in data and preprocessing."
    dataTime = time.time()
    runner.setupData(args.textPreprocess)
    print(
        "Data setup complete; elapsed time is {0:.2f} seconds.\nNow encoding "
        "the data.".format(time.time() - dataTime))

    encodeTime = time.time()
    runner.encodeSamples()
    print(
        "Encoding complete; elapsed time is {0:.2f} seconds.\nNow running the "
        "experiment.".format(time.time() - encodeTime))

    runner.runExperiment()

    runner.writeOutClassifications()

    runner.calculateResults()

    print "Saving..."
    runner.save()

    print "Experiment complete in {0:.2f} seconds.".format(time.time() - start)

    if args.validation:
        print "Validating experiment against expected classifications..."
        print runner.validateExperiment(args.validation)
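
This run() function consumes an args namespace with many attributes. Below is a minimal sketch of an argparse front end that would supply them; every flag name and default is an assumption for illustration, not the project's actual command-line interface.

import argparse

if __name__ == "__main__":
    # Sketch only: one argument per attribute that run() reads from args.
    parser = argparse.ArgumentParser(
        description="Run a text classification experiment.")
    parser.add_argument("dataPath",
                        help="CSV file, or a directory of CSVs for MultiRunner.")
    parser.add_argument("--resultsDir", default="results")
    parser.add_argument("--experimentName", default="experiment")
    parser.add_argument("--load", action="store_true", default=False)
    parser.add_argument("--modelName", default="ClassificationModelKeywords")
    parser.add_argument("--modelModuleName",
                        default="fluent.models.classify_keywords")
    parser.add_argument("--numClasses", type=int, default=3)
    parser.add_argument("--plots", type=int, default=0)
    parser.add_argument("--orderedSplit", action="store_true", default=False)
    parser.add_argument("--trainSize", type=int, nargs="+", default=[5])
    parser.add_argument("--verbosity", type=int, default=1)
    parser.add_argument("--test", default=None)
    parser.add_argument("--generateData", action="store_true", default=False)
    parser.add_argument("--votingMethod", default="most")
    parser.add_argument("--classificationFile", default="")
    parser.add_argument("--classifierType", default="KNN")
    parser.add_argument("--textPreprocess", action="store_true", default=False)
    parser.add_argument("--validation", default=None)
    run(parser.parse_args())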

Example #6
def run(args):
    start = time.time()

    if (not isinstance(args.kFolds, int)) or (args.kFolds < 1):
        raise ValueError("Invalid value for number of cross-validation folds.")

    root = os.path.dirname(os.path.realpath(__file__))
    resultsDir = os.path.join(root, args.resultsDir)

    if args.modelName == "HTMNetwork":
        runner = HTMRunner(dataPath=args.dataPath,
                           networkConfigPath=args.networkConfigPath,
                           resultsDir=resultsDir,
                           experimentName=args.experimentName,
                           loadPath=args.loadPath,
                           modelName=args.modelName,
                           numClasses=args.numClasses,
                           plots=args.plots,
                           orderedSplit=args.orderedSplit,
                           trainSizes=[],
                           verbosity=args.verbosity,
                           generateData=args.generateData,
                           votingMethod=args.votingMethod,
                           classificationFile=args.classificationFile,
                           classifierType=args.classifierType)
    else:
        runner = Runner(dataPath=args.dataPath,
                        resultsDir=resultsDir,
                        experimentName=args.experimentName,
                        loadPath=args.loadPath,
                        modelName=args.modelName,
                        numClasses=args.numClasses,
                        plots=args.plots,
                        orderedSplit=args.orderedSplit,
                        trainSizes=[],
                        verbosity=args.verbosity)

        # HTM network data isn't ready yet, so the model is initialized here
        # only for non-HTM runs.
        runner.initModel(args.modelName)

    print "Reading in data and preprocessing."
    dataTime = time.time()
    runner.setupData(args.textPreprocess)

    # TODO: move kfolds splitting to Runner
    randomize = not args.orderedSplit
    runner.partitions = KFolds(args.kFolds).split(range(len(runner.samples)),
                                                  randomize=randomize)
    runner.trainSizes = [len(x[0]) for x in runner.partitions]
    print(
        "Data setup complete; elapsed time is {0:.2f} seconds.\nNow encoding "
        "the data.".format(time.time() - dataTime))

    encodeTime = time.time()
    runner.encodeSamples()
    print(
        "Encoding complete; elapsed time is {0:.2f} seconds.\nNow running the "
        "experiment.".format(time.time() - encodeTime))

    runner.runExperiment()
    print "Experiment complete in {0:.2f} seconds.".format(time.time() - start)

    resultCalcs = runner.calculateResults()
    _ = runner.evaluateCumulativeResults(resultCalcs)

    print "Saving..."
    runner.saveModel()

    if args.validation:
        print "Validating experiment against expected classifications..."
        print runner.validateExperiment(args.validation)
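
This variant splits the data with a KFolds helper that isn't shown on this page. Below is a minimal sketch consistent with the call above, where KFolds(k).split(indices, randomize=...) returns one (trainIndices, testIndices) pair per fold; it is an assumption about the real class, not its actual implementation.

import random

class KFolds(object):

    def __init__(self, k):
        # Sketch of the k-folds splitter used above; the real implementation
        # may differ.
        if k < 1:
            raise ValueError("Number of folds must be a positive integer.")
        self.k = k

    def split(self, indices, randomize=False):
        # Returns a list of (trainIndices, testIndices) tuples, one per fold,
        # matching how run() reads trainSizes from partition[0].
        indices = list(indices)
        if randomize:
            random.shuffle(indices)
        foldSize = len(indices) // self.k
        partitions = []
        for i in xrange(self.k):
            test = indices[i * foldSize:(i + 1) * foldSize]
            train = indices[:i * foldSize] + indices[(i + 1) * foldSize:]
            partitions.append((train, test))
        return partitions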