Esempio n. 1
0
def runExperiment(args):
  """
  Create model according to args, train on training data, save model,
  restore model, test on test data.
  """

  (dataSet, labelRefs, documentCategoryMap,
   documentTextMap) = readDataAndReshuffle(args)

  # Train only with documents whose id's are divisible by 100
  trainingData = [x for i,x in enumerate(dataSet) if x[2]%100==0]
  testData = [x for i,x in enumerate(dataSet) if x[2]%100!=0]

  print "Num training",len(trainingData),"num testing",len(testData)

  # Create model
  model = instantiateModel(args)

  model = trainModel(args, model, trainingData, labelRefs)
  model.save(args.modelDir)
  newmodel = ClassificationModel.load(args.modelDir)
  testModel(args, newmodel, trainingData, labelRefs, documentCategoryMap)
  testModel(args, newmodel, testData, labelRefs, documentCategoryMap)

  return model
Esempio n. 2
0
def runExperiment(args):
    """
  Create model according to args, train on training data, save model,
  restore model, test on test data.
  """

    (dataSet, labelRefs, documentCategoryMap,
     documentTextMap) = readDataAndReshuffle(args)

    # Train only with documents whose id's are divisible by 100
    trainingData = [x for i, x in enumerate(dataSet) if x[2] % 100 == 0]
    testData = [x for i, x in enumerate(dataSet) if x[2] % 100 != 0]

    print "Num training", len(trainingData), "num testing", len(testData)

    # Create model
    model = instantiateModel(args)

    model = trainModel(args, model, trainingData, labelRefs)
    model.save(args.modelDir)
    newmodel = ClassificationModel.load(args.modelDir)
    testModel(args, newmodel, trainingData, labelRefs, documentCategoryMap)
    testModel(args, newmodel, testData, labelRefs, documentCategoryMap)

    return model
Esempio n. 3
0
def runExperiment(args):
    """
    Create a model according to args, train it on the training data, save it
    to args.modelDir, then run several example queries against the trained
    model and dump profiling information.
    """
    # Read in data file
    (trainingData, labelRefs,
     documentCategoryMap, documentTextMap) = readDataAndReshuffle(
         args, [8, 9, 10, 5, 6, 11, 13, 0, 1, 2, 3, 4, 7, 12, 14])

    # Build, train, and persist the model.
    model = trainModel(args, instantiateModel(args), trainingData, labelRefs)
    model.save(args.modelDir)

    # Example HR complaints about managers, queried in order.
    sampleQueries = (
        "Begin by treating the employees of the department with the "
        "respect they deserve. Halt the unfair practices "
        "that they are aware of doing. There is no compassion "
        "or loyalty to its senior employees",

        "My manager is really incompetent. He has no clue how to "
        "properly supervise his employees and keep them motivated.",

        "I wish I had a lot more vacation and much more flexibility "
        "in how I manage my own time. I should be able to choose "
        "when I come in as long as I manage to get all my tasks done.",
    )
    for query in sampleQueries:
        queryModel(model, query, documentTextMap, labelRefs,
                   documentCategoryMap)

    # Print profile information
    print
    model.dumpProfile()

    return model
def runExperiment(args):
    """
    Create a model according to args and train it on the two-label training
    data.

    Unlike the other experiment entry points in this file, this variant does
    not save, restore, or test the model; it just trains and returns it.
    """
    # This experiment is fixed to a binary (two-label) setup.
    args.numLabels = 2
    (trainingData, labelRefs, documentCategoryMap,
     documentTextMap) = readDataAndReshuffle(args)

    # Create model
    model = instantiateModel(args)

    model = trainModel(args, model, trainingData, labelRefs)

    # Return the trained model for consistency with the other runExperiment
    # variants in this file (previously returned None implicitly).
    return model
def runExperiment(args):
	"""
	Create a model according to args and train it on the two-label training
	data.

	Unlike the other experiment entry points in this file, this variant does
	not save, restore, or test the model; it just trains and returns it.
	"""
	# This experiment is fixed to a binary (two-label) setup.
	args.numLabels = 2
	(trainingData, labelRefs, documentCategoryMap,
	 documentTextMap) = readDataAndReshuffle(args)

	# Create model
	model = instantiateModel(args)

	model = trainModel(args, model, trainingData, labelRefs)

	# Return the trained model for consistency with the other runExperiment
	# variants in this file (previously returned None implicitly).
	return model
def runExperiment(args):
  """
  Load a previously trained model from args.modelDir and run analysis on it
  against the reshuffled data set.

  NOTE(review): the original docstring claimed this trains/saves/tests a
  model; the code only loads and analyzes one.
  """

  # The integer list fixes the category presentation order for the
  # reshuffle; only documentTextMap is used below.
  (trainingData, labelRefs, documentCategoryMap,
   documentTextMap) = readDataAndReshuffle(args,
                         [8,9,10,5,6,11,13,0,1,2,3,4,7,12,14])

  # Restore the model saved by an earlier training run.
  model = ClassificationModel.load(args.modelDir)

  analyzeModel(args, model, documentTextMap)

  return model
def runExperiment(args):
  """
  Create a model according to args, train it on the training data, save it
  to args.modelDir, then run several example queries against the trained
  model and dump profiling information.
  """
  # Read in data file
  (trainingData, labelRefs, documentCategoryMap,
   documentTextMap) = readDataAndReshuffle(args,
                         [8,9,10,5,6,11,13,0,1,2,3,4,7,12,14])

  # Build, train, and persist the model.
  model = trainModel(args, instantiateModel(args), trainingData, labelRefs)
  model.save(args.modelDir)

  # Example HR complaints about managers, queried in order.
  complaints = (
      "Begin by treating the employees of the department with the "
      "respect they deserve. Halt the unfair practices "
      "that they are aware of doing. There is no compassion "
      "or loyalty to its senior employees",

      "My manager is really incompetent. He has no clue how to "
      "properly supervise his employees and keep them motivated.",

      "I wish I had a lot more vacation and much more flexibility "
      "in how I manage my own time. I should be able to choose "
      "when I come in as long as I manage to get all my tasks done.",
  )
  for complaint in complaints:
    queryModel(model, complaint, documentTextMap, labelRefs,
               documentCategoryMap)

  # Print profile information
  print
  model.dumpProfile()

  return model
Esempio n. 8
0
def setupExperiment(args):
    """
    Build and train an NLP model from args, round-trip it through disk, and
    return the restored copy.

    @return newModel (ClassificationModel) The restored NLP model.
    @return dataSet (list) Each item is a list representing a data sample, with
        the text string, list of label indices, and the sample ID.
    """
    dataSet, labelRefs, _, _ = readDataAndReshuffle(args)
    # One output label per distinct reference label.
    args.numLabels = len(labelRefs)

    # Train a fresh model, persist it, and reload it to exercise
    # serialization.
    trainedModel = trainModel(instantiateModel(args), dataSet, labelRefs,
                              args.verbosity)
    trainedModel.save(args.modelDir)
    restoredModel = ClassificationModel.load(args.modelDir)

    return restoredModel, dataSet
def setupExperiment(args):
  """
  Train an NLP model on the reshuffled data set, write it to disk, and hand
  back the reloaded copy together with the data.

  @return newModel (ClassificationModel) The restored NLP model.
  @return dataSet (list) Each item is a list representing a data sample, with
      the text string, list of label indices, and the sample ID.
  """
  dataSet, labelRefs, _, _ = readDataAndReshuffle(args)
  # The model needs one output per reference label.
  args.numLabels = len(labelRefs)

  # Train, serialize, and deserialize to exercise the save/load round trip.
  freshModel = instantiateModel(args)
  freshModel = trainModel(freshModel, dataSet, labelRefs, args.verbosity)
  freshModel.save(args.modelDir)
  restored = ClassificationModel.load(args.modelDir)

  return restored, dataSet
Esempio n. 10
0
def runExperiment(args):
    """
    Train a model on the reshuffled data, save and restore it, test the
    restored copy on the training data, and dump profiling information.
    """
    (trainingData, labelRefs,
     documentCategoryMap, documentTextMap) = readDataAndReshuffle(
         args, [8, 9, 10, 5, 6, 11, 13, 0, 1, 2, 3, 4, 7, 12, 14])

    # Build and train the classifier.
    trainedModel = trainModel(args, instantiateModel(args), trainingData,
                              labelRefs)

    # Round-trip through disk to exercise serialization, then evaluate the
    # restored copy on the training set.
    trainedModel.save(args.modelDir)
    restoredModel = ClassificationModel.load(args.modelDir)
    testModel(args, restoredModel, trainingData, labelRefs,
              documentCategoryMap)

    # Print profile information
    print
    trainedModel.dumpProfile()

    return trainedModel
def runExperiment(args):
  """
  Train a model on the reshuffled data, save and reload it, test the
  reloaded copy on the training data, and dump profiling information.
  """
  (trainingData, labelRefs, documentCategoryMap,
   documentTextMap) = readDataAndReshuffle(args,
                         [8,9,10,5,6,11,13,0,1,2,3,4,7,12,14])

  # Build and train the classifier.
  classifier = instantiateModel(args)
  classifier = trainModel(args, classifier, trainingData, labelRefs)

  # Round-trip through disk, then evaluate the restored copy on the
  # training set.
  classifier.save(args.modelDir)
  reloaded = ClassificationModel.load(args.modelDir)
  testModel(args, reloaded, trainingData, labelRefs, documentCategoryMap)

  # Print profile information
  print
  classifier.dumpProfile()

  return classifier
Esempio n. 12
0
def run(args):
    """ Run the classification test.

    This method handles scenarios for running a single model or all of them.
    Also tests serialization by checking that a model's results match before
    and after saving/loading.
    """
    if args.hello:
        args = _setupHelloTest(args)

    (dataset, labelRefs, documentCategoryMap, _) = readDataAndReshuffle(args)

    # Run every known model type, or just the one requested.
    if args.modelName == "all":
        modelNames = NLP_MODEL_TYPES
        runningAllModels = True
    else:
        modelNames = [args.modelName]
        runningAllModels = False

    # Maps the (possibly HTM-config-specific) model name to its accuracy.
    accuracies = {}
    for name in modelNames:
        # Setup args
        args.modelName = name
        args.modelDir = os.path.join(args.experimentName, name)
        if name == "htm":
            if runningAllModels:
                # Need to specify network config for htm models
                # NOTE(review): pop() raising KeyError suggests HTM_CONFIGS
                # is a set or dict here -- confirm against its definition.
                try:
                    htmModelInfo = HTM_CONFIGS.pop()
                except KeyError:
                    print "Not enough HTM configs, so skipping the HTM model."
                    continue
                name = htmModelInfo[0]
                args.networkConfigPath = htmModelInfo[1]
            else:
                # Get the specific model name from the config path
                for (modelName, configPath) in HTM_CONFIGS:
                    if configPath == args.networkConfigPath:
                        name = modelName

        # Split data for train/test (We still test on the training data!)
        if args.split:
            split = int(len(dataset) * args.split)
            trainingData = dataset[:split]
        else:
            trainingData = dataset

        # Create a model, train it, save it, reload it
        _, model = executeModelLifecycle(args, trainingData, labelRefs)

        # Test the model (on the full dataset, including the training split).
        accuracies[name] = testModel(model, dataset, labelRefs,
                                     documentCategoryMap, args.verbosity)

        if args.verbosity > 0:
            # Print profile information
            print
            model.dumpProfile()

    printSummary(args.experimentName, accuracies)

    if args.hello:
        assertResults("hello_classification", accuracies)
def run(args):
  """ Run the classification test.

  This method handles scenarios for running a single model or all of them.
  Also tests serialization by checking that a model's results match before
  and after saving/loading.
  """
  if args.hello:
    args = _setupHelloTest(args)

  (dataset, labelRefs, documentCategoryMap, _) = readDataAndReshuffle(args)

  # Run every known model type, or just the one requested.
  if args.modelName == "all":
    modelNames = NLP_MODEL_TYPES
    runningAllModels = True
  else:
    modelNames = [args.modelName]
    runningAllModels = False

  # Maps the (possibly HTM-config-specific) model name to its accuracy.
  accuracies = {}
  for name in modelNames:
    # Setup args
    args.modelName = name
    args.modelDir = os.path.join(args.experimentName, name)
    if name == "htm":
      if runningAllModels:
        # Need to specify network config for htm models
        # NOTE(review): pop() raising KeyError suggests HTM_CONFIGS is a set
        # or dict here -- confirm against its definition.
        try:
          htmModelInfo = HTM_CONFIGS.pop()
        except KeyError:
          print "Not enough HTM configs, so skipping the HTM model."
          continue
        name = htmModelInfo[0]
        args.networkConfigPath = htmModelInfo[1]
      else:
        # Get the specific model name from the config path
        for (modelName, configPath) in HTM_CONFIGS:
          if configPath == args.networkConfigPath:
            name = modelName

    # Split data for train/test (We still test on the training data!)
    if args.split:
      split = int(len(dataset) * args.split)
      trainingData = dataset[:split]
    else:
      trainingData = dataset

    # Create a model, train it, save it, reload it
    _, model = executeModelLifecycle(args, trainingData, labelRefs)

    # Test the model (on the full dataset, including the training split).
    accuracies[name] = testModel(model,
                                 dataset,
                                 labelRefs,
                                 documentCategoryMap,
                                 args.verbosity)

    if args.verbosity > 0:
      # Print profile information
      print
      model.dumpProfile()

  printSummary(args.experimentName, accuracies)

  if args.hello:
    assertResults("hello_classification", accuracies)
Esempio n. 14
0
def runExperiment(args):
    """
    Cluster the training documents and report per-cluster prototypes.

    Trains a single-label model on the de-duplicated training data, runs
    complete-linkage hierarchical clustering over the model's KNN
    classifier, prints prototype documents for each cluster, and generates
    the bucket/cluster plots.
    """
    if not os.path.exists(SAVE_PATH):
        os.makedirs(SAVE_PATH)

    (trainingDataDup, labelRefs, documentCategoryMap,
     documentTextMap) = readDataAndReshuffle(args)

    # remove duplicates from training data (keyed on document id, record[2])
    includedDocIds = set()
    trainingData = []
    for record in trainingDataDup:
        if record[2] not in includedDocIds:
            includedDocIds.add(record[2])
            trainingData.append(record)

    args.networkConfig = getNetworkConfig(args.networkConfigPath)
    model = createModel(numLabels=1, **vars(args))
    model = trainModel(args, model, trainingData, labelRefs)

    # NOTE(review): reaches into the classifier's private _numPatterns --
    # assumed to equal the number of trained documents; confirm.
    numDocs = model.getClassifier()._numPatterns

    print "Model trained with %d documents" % (numDocs, )

    knn = model.getClassifier()
    hc = HierarchicalClustering(knn)

    hc.cluster("complete")
    protos, clusterSizes = hc.getClusterPrototypes(args.numClusters, numDocs)

    # Run test to ensure consistency with KNN
    if args.knnTest:
        knnTest(protos, knn)
        return

    # Summary statistics
    # bucketCounts[i, j] is the number of occurrences of bucket j in cluster i
    bucketCounts = numpy.zeros((args.numClusters, len(labelRefs)))

    for clusterId in xrange(len(clusterSizes)):
        print
        print "Cluster %d with %d documents" % (clusterId,
                                                clusterSizes[clusterId])
        print "==============="

        # Display at most args.numPrototypes prototypes per cluster, while
        # counting buckets for every prototype.
        prototypeNum = 0
        for index in protos[clusterId]:
            if index != -1:
                docId = trainingData[index][2]
                prototypeNum += 1
                display = prototypeNum <= args.numPrototypes

                if display:
                    print "(%d) %s" % (docId, trainingData[index][0])
                    print "Buckets:"

                # The docId keys in documentCategoryMap are strings rather than ints
                if docId in documentCategoryMap:
                    for bucketId in documentCategoryMap[docId]:
                        bucketCounts[clusterId, bucketId] += 1
                        if display:
                            print "    ", labelRefs[bucketId]
                elif display:
                    print "    <None>"
                if display:
                    print "\n\n"

    createBucketClusterPlot(args, bucketCounts)
    create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn)
def run(args):
  """ Run the 'query' test.
  This method handles scenarios for running a single model or all of them.
  """
  (trainingData, labelRefs, _, documentTextMap) = readDataAndReshuffle(args)

  if args.modelName == "all":
    modelNames = NLP_MODEL_TYPES
    runningAllModels = True
  else:
    modelNames = [args.modelName]
    runningAllModels = False

  accuracies = {}
  for name in modelNames:
    # Setup args
    args.modelName = name
    args.modelDir = os.path.join(args.experimentDir, name)
    if name == "htm":
      if runningAllModels:
        # Need to specify network config for htm models
        try:
          htmModelInfo = HTM_CONFIGS.pop()
        except KeyError:
          print "Not enough HTM configs, so skipping the HTM model."
          continue
        name = htmModelInfo[0]
        args.networkConfigPath = htmModelInfo[1]
      else:
        # Get the specific model name from the config path
        for (modelName, configPath) in HTM_CONFIGS:
          if configPath == args.networkConfigPath:
            name = modelName

    # Create a model, train it, save it, reload it
    _, model = executeModelLifecycle(args, trainingData, labelRefs)

    # Now query the model using some example HR complaints about managers
    queryModel(model,
               "Begin by treating the employees of the department with the "
               "respect they deserve. Halt the unfair practices "
               "that they are aware of doing. There is no compassion "
               "or loyalty to its senior employees",
               documentTextMap)

    queryModel(model,
               "My manager is really incompetent. He has no clue how to "
               "properly supervise his employees and keep them motivated.",
               documentTextMap)

    queryModel(model,
               "I wish I had a lot more vacation and much more flexibility "
               "in how I manage my own time. I should be able to choose "
               "when I come in as long as I manage to get all my tasks done.",
               documentTextMap)

    if args.verbosity > 0:
      # Print profile information
      print
      model.dumpProfile()

  resultsCheck(name)
def runExperiment(args):
  """
  Cluster the training documents and report per-cluster prototypes.

  Trains a single-label model on the de-duplicated training data, runs
  complete-linkage hierarchical clustering over the model's KNN classifier,
  prints prototype documents for each cluster, and generates the
  bucket/cluster plots.
  """
  if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

  (trainingDataDup, labelRefs, documentCategoryMap,
   documentTextMap) = readDataAndReshuffle(args)

  # remove duplicates from training data (keyed on document id, record[2])
  includedDocIds = set()
  trainingData = []
  for record in trainingDataDup:
    if record[2] not in includedDocIds:
      includedDocIds.add(record[2])
      trainingData.append(record)

  args.networkConfig = getNetworkConfig(args.networkConfigPath)
  model = createModel(numLabels=1, **vars(args))
  model = trainModel(args, model, trainingData, labelRefs)

  # NOTE(review): reaches into the classifier's private _numPatterns --
  # assumed to equal the number of trained documents; confirm.
  numDocs = model.getClassifier()._numPatterns

  print "Model trained with %d documents" % (numDocs,)

  knn = model.getClassifier()
  hc = HierarchicalClustering(knn)

  hc.cluster("complete")
  protos, clusterSizes = hc.getClusterPrototypes(args.numClusters,
                                                 numDocs)

  # Run test to ensure consistency with KNN
  if args.knnTest:
    knnTest(protos, knn)
    return

  # Summary statistics
  # bucketCounts[i, j] is the number of occurrences of bucket j in cluster i
  bucketCounts = numpy.zeros((args.numClusters, len(labelRefs)))

  for clusterId in xrange(len(clusterSizes)):
    print
    print "Cluster %d with %d documents" % (clusterId, clusterSizes[clusterId])
    print "==============="

    # Display at most args.numPrototypes prototypes per cluster, while
    # counting buckets for every prototype.
    prototypeNum = 0
    for index in protos[clusterId]:
      if index != -1:
        docId = trainingData[index][2]
        prototypeNum += 1
        display = prototypeNum <= args.numPrototypes

        if display:
          print "(%d) %s" % (docId, trainingData[index][0])
          print "Buckets:"

        # The docId keys in documentCategoryMap are strings rather than ints
        if docId in documentCategoryMap:
          for bucketId in documentCategoryMap[docId]:
            bucketCounts[clusterId, bucketId] += 1
            if display:
              print "    ", labelRefs[bucketId]
        elif display:
          print "    <None>"
        if display:
          print "\n\n"

  createBucketClusterPlot(args, bucketCounts)
  create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn)
Esempio n. 17
0
def run(args):
    """ Run the 'query' test.
  This method handles scenarios for running a single model or all of them.
  """
    (trainingData, labelRefs, _, documentTextMap) = readDataAndReshuffle(args)

    if args.modelName == "all":
        modelNames = NLP_MODEL_TYPES
        runningAllModels = True
    else:
        modelNames = [args.modelName]
        runningAllModels = False

    accuracies = {}
    for name in modelNames:
        # Setup args
        args.modelName = name
        args.modelDir = os.path.join(args.experimentDir, name)
        if name == "htm":
            if runningAllModels:
                # Need to specify network config for htm models
                try:
                    htmModelInfo = HTM_CONFIGS.pop()
                except KeyError:
                    print "Not enough HTM configs, so skipping the HTM model."
                    continue
                name = htmModelInfo[0]
                args.networkConfigPath = htmModelInfo[1]
            else:
                # Get the specific model name from the config path
                for (modelName, configPath) in HTM_CONFIGS:
                    if configPath == args.networkConfigPath:
                        name = modelName

        # Create a model, train it, save it, reload it
        _, model = executeModelLifecycle(args, trainingData, labelRefs)

        # Now query the model using some example HR complaints about managers
        queryModel(
            model,
            "Begin by treating the employees of the department with the "
            "respect they deserve. Halt the unfair practices "
            "that they are aware of doing. There is no compassion "
            "or loyalty to its senior employees", documentTextMap)

        queryModel(
            model, "My manager is really incompetent. He has no clue how to "
            "properly supervise his employees and keep them motivated.",
            documentTextMap)

        queryModel(
            model,
            "I wish I had a lot more vacation and much more flexibility "
            "in how I manage my own time. I should be able to choose "
            "when I come in as long as I manage to get all my tasks done.",
            documentTextMap)

        if args.verbosity > 0:
            # Print profile information
            print
            model.dumpProfile()

    resultsCheck(name)