def runExperiment(args): """ Create model according to args, train on training data, save model, restore model, test on test data. """ (dataSet, labelRefs, documentCategoryMap, documentTextMap) = readDataAndReshuffle(args) # Train only with documents whose id's are divisible by 100 trainingData = [x for i,x in enumerate(dataSet) if x[2]%100==0] testData = [x for i,x in enumerate(dataSet) if x[2]%100!=0] print "Num training",len(trainingData),"num testing",len(testData) # Create model model = instantiateModel(args) model = trainModel(args, model, trainingData, labelRefs) model.save(args.modelDir) newmodel = ClassificationModel.load(args.modelDir) testModel(args, newmodel, trainingData, labelRefs, documentCategoryMap) testModel(args, newmodel, testData, labelRefs, documentCategoryMap) return model
def runExperiment(args): """ Create model according to args, train on training data, save model, restore model, test on test data. """ (dataSet, labelRefs, documentCategoryMap, documentTextMap) = readDataAndReshuffle(args) # Train only with documents whose id's are divisible by 100 trainingData = [x for i, x in enumerate(dataSet) if x[2] % 100 == 0] testData = [x for i, x in enumerate(dataSet) if x[2] % 100 != 0] print "Num training", len(trainingData), "num testing", len(testData) # Create model model = instantiateModel(args) model = trainModel(args, model, trainingData, labelRefs) model.save(args.modelDir) newmodel = ClassificationModel.load(args.modelDir) testModel(args, newmodel, trainingData, labelRefs, documentCategoryMap) testModel(args, newmodel, testData, labelRefs, documentCategoryMap) return model
def runExperiment(args): """ Create model according to args, train on training data, save model, restore model, test on test data. """ # Read in data file (trainingData, labelRefs, documentCategoryMap, documentTextMap) = readDataAndReshuffle( args, [8, 9, 10, 5, 6, 11, 13, 0, 1, 2, 3, 4, 7, 12, 14]) # Create model model = instantiateModel(args) model = trainModel(args, model, trainingData, labelRefs) model.save(args.modelDir) # Now query the model using some example HR complaints about managers queryModel( model, "Begin by treating the employees of the department with the " "respect they deserve. Halt the unfair practices " "that they are aware of doing. There is no compassion " "or loyalty to its senior employees", documentTextMap, labelRefs, documentCategoryMap, ) queryModel( model, "My manager is really incompetent. He has no clue how to " "properly supervise his employees and keep them motivated.", documentTextMap, labelRefs, documentCategoryMap, ) queryModel( model, "I wish I had a lot more vacation and much more flexibility " "in how I manage my own time. I should be able to choose " "when I come in as long as I manage to get all my tasks done.", documentTextMap, labelRefs, documentCategoryMap, ) # Print profile information print model.dumpProfile() return model
def runExperiment(args):
  """
  Create a two-label classification model according to args and train it on
  the training data.

  Note: unlike the sibling experiment variants, this one does not save,
  restore, or test the model.

  @param args  Namespace of experiment options; numLabels is forced to 2.
  @return model  The trained classification model (added for consistency
      with the other runExperiment variants).
  """
  # This experiment always uses exactly two labels
  args.numLabels = 2
  (trainingData, labelRefs, documentCategoryMap,
   documentTextMap) = readDataAndReshuffle(args)

  # Create model
  model = instantiateModel(args)
  model = trainModel(args, model, trainingData, labelRefs)

  return model
def runExperiment(args):
  """
  Load a previously saved model from args.modelDir and run analysis on it.

  Note: no training happens here. The data file is read only to obtain the
  document text map used by the analysis.

  @param args  Namespace of experiment options; must provide modelDir.
  @return model  The restored classification model.
  """
  # Only the document text map is needed; discard the other fields
  (_, _, _, documentTextMap) = readDataAndReshuffle(
      args, [8, 9, 10, 5, 6, 11, 13, 0, 1, 2, 3, 4, 7, 12, 14])

  model = ClassificationModel.load(args.modelDir)

  analyzeModel(args, model, documentTextMap)

  return model
def runExperiment(args): """ Create model according to args, train on training data, save model, restore model, test on test data. """ # Read in data file (trainingData, labelRefs, documentCategoryMap, documentTextMap) = readDataAndReshuffle(args, [8,9,10,5,6,11,13,0,1,2,3,4,7,12,14]) # Create model model = instantiateModel(args) model = trainModel(args, model, trainingData, labelRefs) model.save(args.modelDir) # Now query the model using some example HR complaints about managers queryModel(model, "Begin by treating the employees of the department with the " "respect they deserve. Halt the unfair practices " "that they are aware of doing. There is no compassion " "or loyalty to its senior employees", documentTextMap, labelRefs, documentCategoryMap, ) queryModel(model, "My manager is really incompetent. He has no clue how to " "properly supervise his employees and keep them motivated.", documentTextMap, labelRefs, documentCategoryMap, ) queryModel(model, "I wish I had a lot more vacation and much more flexibility " "in how I manage my own time. I should be able to choose " "when I come in as long as I manage to get all my tasks done.", documentTextMap, labelRefs, documentCategoryMap, ) # Print profile information print model.dumpProfile() return model
def setupExperiment(args):
  """
  Build a model from args, train it on the full data set, persist it to
  disk, and reload it so callers work with the restored instance.

  @return newModel (ClassificationModel) The restored NLP model.
  @return dataSet (list) Each item is a list representing a data sample,
      with the text string, list of label indices, and the sample ID.
  """
  samples, labelRefs, _, _ = readDataAndReshuffle(args)
  args.numLabels = len(labelRefs)

  # Train a fresh model, write it out, then restore it from disk so the
  # serialization round-trip is exercised.
  trainedModel = trainModel(instantiateModel(args), samples, labelRefs,
                            args.verbosity)
  trainedModel.save(args.modelDir)
  restoredModel = ClassificationModel.load(args.modelDir)

  return restoredModel, samples
def runExperiment(args): """ Create model according to args, train on training data, save model, restore model, test on test data. """ (trainingData, labelRefs, documentCategoryMap, documentTextMap) = readDataAndReshuffle( args, [8, 9, 10, 5, 6, 11, 13, 0, 1, 2, 3, 4, 7, 12, 14]) # Create model model = instantiateModel(args) model = trainModel(args, model, trainingData, labelRefs) model.save(args.modelDir) newmodel = ClassificationModel.load(args.modelDir) testModel(args, newmodel, trainingData, labelRefs, documentCategoryMap) # Print profile information print model.dumpProfile() return model
def runExperiment(args): """ Create model according to args, train on training data, save model, restore model, test on test data. """ (trainingData, labelRefs, documentCategoryMap, documentTextMap) = readDataAndReshuffle(args, [8,9,10,5,6,11,13,0,1,2,3,4,7,12,14]) # Create model model = instantiateModel(args) model = trainModel(args, model, trainingData, labelRefs) model.save(args.modelDir) newmodel = ClassificationModel.load(args.modelDir) testModel(args, newmodel, trainingData, labelRefs, documentCategoryMap) # Print profile information print model.dumpProfile() return model
def run(args):
  """
  Run the classification test. This method handles scenarios for running a
  single model or all of them. Also tests serialization by checking that a
  model's results match before and after saving/loading.

  @param args  Namespace of experiment options; mutated in the loop below
      (modelName, modelDir, and possibly networkConfigPath are reassigned
      per model).
  """
  if args.hello:
    args = _setupHelloTest(args)

  (dataset, labelRefs, documentCategoryMap, _) = readDataAndReshuffle(args)

  # Either run every known model type or just the one requested
  if args.modelName == "all":
    modelNames = NLP_MODEL_TYPES
    runningAllModels = True
  else:
    modelNames = [args.modelName]
    runningAllModels = False

  accuracies = {}
  for name in modelNames:
    # Setup args
    args.modelName = name
    args.modelDir = os.path.join(args.experimentName, name)
    if name == "htm":
      if runningAllModels:
        # Need to specify network config for htm models.
        # NOTE(review): pop() with KeyError handling suggests HTM_CONFIGS
        # is a set-like collection consumed across iterations -- confirm.
        try:
          htmModelInfo = HTM_CONFIGS.pop()
        except KeyError:
          print "Not enough HTM configs, so skipping the HTM model."
          continue
        name = htmModelInfo[0]
        args.networkConfigPath = htmModelInfo[1]
      else:
        # Get the specific model name from the config path
        for (modelName, configPath) in HTM_CONFIGS:
          if configPath == args.networkConfigPath:
            name = modelName

    # Split data for train/test (We still test on the training data!)
    if args.split:
      split = int(len(dataset) * args.split)
      trainingData = dataset[:split]
    else:
      trainingData = dataset

    # Create a model, train it, save it, reload it
    _, model = executeModelLifecycle(args, trainingData, labelRefs)

    # Test the model on the FULL dataset, keyed by resolved model name
    accuracies[name] = testModel(model, dataset, labelRefs,
                                 documentCategoryMap, args.verbosity)

    if args.verbosity > 0:
      # Print profile information
      print model.dumpProfile()

  printSummary(args.experimentName, accuracies)

  if args.hello:
    # Regression check of the expected accuracies for the hello test
    assertResults("hello_classification", accuracies)
def runExperiment(args):
  """
  Train a single-label classification model on deduplicated training data,
  hierarchically cluster the trained KNN classifier's stored patterns, and
  print per-cluster prototype documents and bucket statistics, followed by
  plot generation.

  @param args  Namespace of experiment options; networkConfig is populated
      here from networkConfigPath.
  """
  if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

  (trainingDataDup, labelRefs, documentCategoryMap,
   documentTextMap) = readDataAndReshuffle(args)

  # remove duplicates from training data (record[2] is the document ID)
  includedDocIds = set()
  trainingData = []
  for record in trainingDataDup:
    if record[2] not in includedDocIds:
      includedDocIds.add(record[2])
      trainingData.append(record)

  args.networkConfig = getNetworkConfig(args.networkConfigPath)
  model = createModel(numLabels=1, **vars(args))
  model = trainModel(args, model, trainingData, labelRefs)
  # NOTE(review): reads a private attribute of the classifier -- fragile
  numDocs = model.getClassifier()._numPatterns
  print "Model trained with %d documents" % (numDocs, )

  knn = model.getClassifier()
  hc = HierarchicalClustering(knn)

  # Complete-linkage hierarchical clustering over the KNN patterns
  hc.cluster("complete")
  protos, clusterSizes = hc.getClusterPrototypes(args.numClusters, numDocs)

  # Run test to ensure consistency with KNN
  if args.knnTest:
    knnTest(protos, knn)
    return

  # Summary statistics
  # bucketCounts[i, j] is the number of occurrences of bucket j in cluster i
  bucketCounts = numpy.zeros((args.numClusters, len(labelRefs)))

  for clusterId in xrange(len(clusterSizes)):
    print
    print "Cluster %d with %d documents" % (clusterId,
                                            clusterSizes[clusterId])
    print "==============="
    prototypeNum = 0
    for index in protos[clusterId]:
      # -1 marks an empty prototype slot
      if index != -1:
        docId = trainingData[index][2]
        prototypeNum += 1
        # Only print the first numPrototypes documents per cluster, but
        # keep counting buckets for all of them
        display = prototypeNum <= args.numPrototypes

        if display:
          print "(%d) %s" % (docId, trainingData[index][0])
          print "Buckets:"

        # The docId keys in documentCategoryMap are strings rather than ints
        if docId in documentCategoryMap:
          for bucketId in documentCategoryMap[docId]:
            bucketCounts[clusterId, bucketId] += 1
            if display:
              print " ", labelRefs[bucketId]
        elif display:
          print " <None>"

        if display:
          print "\n\n"

  createBucketClusterPlot(args, bucketCounts)
  create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn)
def run(args):
  """
  Run the 'query' test. This method handles scenarios for running a single
  model or all of them.

  @param args  Namespace of experiment options; mutated in the loop below
      (modelName, modelDir, and possibly networkConfigPath are reassigned
      per model).
  """
  (trainingData, labelRefs, _, documentTextMap) = readDataAndReshuffle(args)

  # Either run every known model type or just the one requested
  if args.modelName == "all":
    modelNames = NLP_MODEL_TYPES
    runningAllModels = True
  else:
    modelNames = [args.modelName]
    runningAllModels = False

  # NOTE(review): accuracies is never populated in this function
  accuracies = {}
  for name in modelNames:
    # Setup args
    args.modelName = name
    args.modelDir = os.path.join(args.experimentDir, name)
    if name == "htm":
      if runningAllModels:
        # Need to specify network config for htm models.
        # NOTE(review): pop() with KeyError handling suggests HTM_CONFIGS
        # is a set-like collection consumed across iterations -- confirm.
        try:
          htmModelInfo = HTM_CONFIGS.pop()
        except KeyError:
          print "Not enough HTM configs, so skipping the HTM model."
          continue
        name = htmModelInfo[0]
        args.networkConfigPath = htmModelInfo[1]
      else:
        # Get the specific model name from the config path
        for (modelName, configPath) in HTM_CONFIGS:
          if configPath == args.networkConfigPath:
            name = modelName

    # Create a model, train it, save it, reload it
    _, model = executeModelLifecycle(args, trainingData, labelRefs)

    # Now query the model using some example HR complaints about managers
    queryModel(model,
               "Begin by treating the employees of the department with the "
               "respect they deserve. Halt the unfair practices "
               "that they are aware of doing. There is no compassion "
               "or loyalty to its senior employees",
               documentTextMap)
    queryModel(model,
               "My manager is really incompetent. He has no clue how to "
               "properly supervise his employees and keep them motivated.",
               documentTextMap)
    queryModel(model,
               "I wish I had a lot more vacation and much more flexibility "
               "in how I manage my own time. I should be able to choose "
               "when I come in as long as I manage to get all my tasks done.",
               documentTextMap)

    if args.verbosity > 0:
      # Print profile information
      print model.dumpProfile()

    # Verify the query results for this model against expectations
    resultsCheck(name)
def runExperiment(args):
  """
  Train a single-label classification model on deduplicated training data,
  hierarchically cluster the trained KNN classifier's stored patterns, and
  print per-cluster prototype documents and bucket statistics, followed by
  plot generation.

  @param args  Namespace of experiment options; networkConfig is populated
      here from networkConfigPath.
  """
  if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

  (trainingDataDup, labelRefs, documentCategoryMap,
   documentTextMap) = readDataAndReshuffle(args)

  # remove duplicates from training data (record[2] is the document ID)
  includedDocIds = set()
  trainingData = []
  for record in trainingDataDup:
    if record[2] not in includedDocIds:
      includedDocIds.add(record[2])
      trainingData.append(record)

  args.networkConfig = getNetworkConfig(args.networkConfigPath)
  model = createModel(numLabels=1, **vars(args))
  model = trainModel(args, model, trainingData, labelRefs)
  # NOTE(review): reads a private attribute of the classifier -- fragile
  numDocs = model.getClassifier()._numPatterns
  print "Model trained with %d documents" % (numDocs,)

  knn = model.getClassifier()
  hc = HierarchicalClustering(knn)

  # Complete-linkage hierarchical clustering over the KNN patterns
  hc.cluster("complete")
  protos, clusterSizes = hc.getClusterPrototypes(args.numClusters, numDocs)

  # Run test to ensure consistency with KNN
  if args.knnTest:
    knnTest(protos, knn)
    return

  # Summary statistics
  # bucketCounts[i, j] is the number of occurrences of bucket j in cluster i
  bucketCounts = numpy.zeros((args.numClusters, len(labelRefs)))

  for clusterId in xrange(len(clusterSizes)):
    print
    print "Cluster %d with %d documents" % (clusterId,
                                            clusterSizes[clusterId])
    print "==============="
    prototypeNum = 0
    for index in protos[clusterId]:
      # -1 marks an empty prototype slot
      if index != -1:
        docId = trainingData[index][2]
        prototypeNum += 1
        # Only print the first numPrototypes documents per cluster, but
        # keep counting buckets for all of them
        display = prototypeNum <= args.numPrototypes

        if display:
          print "(%d) %s" % (docId, trainingData[index][0])
          print "Buckets:"

        # The docId keys in documentCategoryMap are strings rather than ints
        if docId in documentCategoryMap:
          for bucketId in documentCategoryMap[docId]:
            bucketCounts[clusterId, bucketId] += 1
            if display:
              print " ", labelRefs[bucketId]
        elif display:
          print " <None>"

        if display:
          print "\n\n"

  createBucketClusterPlot(args, bucketCounts)
  create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn)
def run(args):
  """
  Run the 'query' test. This method handles scenarios for running a single
  model or all of them.

  @param args  Namespace of experiment options; mutated in the loop below
      (modelName, modelDir, and possibly networkConfigPath are reassigned
      per model).
  """
  (trainingData, labelRefs, _, documentTextMap) = readDataAndReshuffle(args)

  # Either run every known model type or just the one requested
  if args.modelName == "all":
    modelNames = NLP_MODEL_TYPES
    runningAllModels = True
  else:
    modelNames = [args.modelName]
    runningAllModels = False

  # NOTE(review): accuracies is never populated in this function
  accuracies = {}
  for name in modelNames:
    # Setup args
    args.modelName = name
    args.modelDir = os.path.join(args.experimentDir, name)
    if name == "htm":
      if runningAllModels:
        # Need to specify network config for htm models.
        # NOTE(review): pop() with KeyError handling suggests HTM_CONFIGS
        # is a set-like collection consumed across iterations -- confirm.
        try:
          htmModelInfo = HTM_CONFIGS.pop()
        except KeyError:
          print "Not enough HTM configs, so skipping the HTM model."
          continue
        name = htmModelInfo[0]
        args.networkConfigPath = htmModelInfo[1]
      else:
        # Get the specific model name from the config path
        for (modelName, configPath) in HTM_CONFIGS:
          if configPath == args.networkConfigPath:
            name = modelName

    # Create a model, train it, save it, reload it
    _, model = executeModelLifecycle(args, trainingData, labelRefs)

    # Now query the model using some example HR complaints about managers
    queryModel(
        model,
        "Begin by treating the employees of the department with the "
        "respect they deserve. Halt the unfair practices "
        "that they are aware of doing. There is no compassion "
        "or loyalty to its senior employees",
        documentTextMap)
    queryModel(
        model,
        "My manager is really incompetent. He has no clue how to "
        "properly supervise his employees and keep them motivated.",
        documentTextMap)
    queryModel(
        model,
        "I wish I had a lot more vacation and much more flexibility "
        "in how I manage my own time. I should be able to choose "
        "when I come in as long as I manage to get all my tasks done.",
        documentTextMap)

    if args.verbosity > 0:
      # Print profile information
      print model.dumpProfile()

    # Verify the query results for this model against expectations
    resultsCheck(name)