コード例 #1
0
ファイル: description.py プロジェクト: srecioe/nupic
def getDatasets(baseDatasets, generate=False):
    """Build the dict of dataset file paths and optionally generate the files.

    Parameters:
      baseDatasets: dataset dict supplied by the experiment framework (not
        read here; kept for interface compatibility -- TODO confirm caller
        contract).
      generate: when True, also create the CSV data files on disk if they
        do not already exist.

    Returns:
      dict with key 'trainingFilename' and, when config['testSetPct'] yields
      a non-empty test set, 'testingFilename' as well.
    """
    # We're going to put datasets in data/dutyCycle/expname_<file>.csv

    # Prefix file names with the sub-experiment name, or "base" when running
    # outside a sub-experiment.
    expDir = getSubExpDir()
    if expDir is None:
        name = "base"
    else:
        name = os.path.basename(expDir)

    dataDir = "data/dutyCycle"

    trainingFilename = os.path.join(dataDir, name + "_" + config["trainingSet"])
    datasets = dict(trainingFilename=trainingFilename)

    # Size of the held-out test set: a fraction of all (A, B) combinations.
    numUnique = config["numAValues"] * config["numBValues"]
    testSetSize = int(config["testSetPct"] * numUnique)
    if testSetSize > 0:
        # NOTE(review): unlike trainingFilename, the test file is NOT prefixed
        # with the experiment name, despite the comment at the top -- verify
        # this asymmetry is intentional.
        testingFilename = os.path.join(dataDir, config["testingSet"])
        datasets["testingFilename"] = testingFilename
    else:
        testingFilename = None

    if not generate:
        return datasets

    # ========================================================================
    # Create the data files. We create a training set and a testing set. The
    #  testing set contains combinations of A and B that do not appear in the
    #  training set
    #

    if not os.path.exists(dataDir):
        os.makedirs(dataDir)

    # Only (re)generate when either required file is missing on disk.
    if (not os.path.exists(trainingFilename)) or (testingFilename is not None and not os.path.exists(testingFilename)):
        print "====================================================================="
        print "Creating data set..."

        # Create the pool of A values
        aValues = range(config["numAValues"])
        # Create the pool of B values, allowing for unequal distribution
        bValues = range(config["numBValues"])

        # Fixed seed so the generated datasets are reproducible across runs.
        random.seed(42)

        def generateSample():
            # Draw one value uniformly from each pool. Duplicate entries in
            # bValues (added below) skew the B distribution toward 0.
            a = random.sample(aValues, 1)[0]
            b = random.sample(bValues, 1)[0]
            return (a, b)

        if config["b0Likelihood"] is not None:
            print "In the B dataset, there is a %d%% chance of getting a B value of 0" % (
                int(100 * config["b0Likelihood"])
            )
            # likelihood of B0 is: (numB0) / (numB0 + numBvalues)
            # solving for numB0 = numBValues / (1 - likelihood)
            # NOTE(review): solving L = numB0/(numB0+numBValues) actually gives
            # numB0 = numBValues * L / (1 - L); the expression below drops the
            # factor L -- confirm whether the printed likelihood is achieved.
            numB0Values = int(round(len(bValues) / (1.0 - config["b0Likelihood"])))
            bValues.extend([0] * numB0Values)  # pad the pool with extra zeros to bias B toward 0
        else:
            print "All values in B are equally likely"
        print

        # -----------------------------------------------------------------------
        fields = [("fieldA", "int", ""), ("fieldB", "int", "")]
        # Generate the test set: distinct (A, B) pairs reserved for testing.
        testSet = set()
        if testSetSize > 0:
            # Keep sampling until testSetSize distinct combinations are held back.
            while len(testSet) < testSetSize:
                testSet.add(generateSample())
            testList = list(testSet)
            testList.sort()
            print "These (A,B) combinations are reserved for the test set:", testList
            print

            # Write out the test set
            print "Creating test set: %s..." % (testingFilename)
            print "Contains %d unique combinations of A and B chosen from the %d possible" % (testSetSize, numUnique)
            with File(testingFilename, fields=fields) as o:
                numSamples = 0
                # Rejection-sample: only draws that land in the reserved test
                # combinations are written, until iterationCount rows exist.
                while numSamples < config["iterationCount"]:
                    sample = generateSample()
                    if sample in testSet:
                        o.write(list(sample))
                        # print >>fd, "%d, %d" % (sample[0], sample[1])

                        numSamples += 1
            print

        # ------------------------------------------------------------------------
        # Write out the training set
        print "Creating training set: %s..." % (trainingFilename)
        if len(testSet) > 0:
            print "Contains %d samples, chosen from %d of the possible %d combinations " "that are not in the test set" % (
                config["iterationCount"],
                numUnique - testSetSize,
                numUnique,
            )
        else:
            print "Contains %d samples" % (config["iterationCount"])
        print
        # Training samples are the complement: reject anything in testSet.
        with FileRecordStream(trainingFilename, write=True, fields=fields) as o:
            numSamples = 0
            while numSamples < config["iterationCount"]:
                sample = generateSample()
                if sample in testSet:
                    continue
                # print >>fd, "%d, %d" % (sample[0], sample[1])
                o.appendRecord(list(sample))
                numSamples += 1

    return datasets
コード例 #2
0
ファイル: description.py プロジェクト: zacg/nupic
def getDatasets(baseDatasets, generate=False):
  """Build the dict of dataset file paths and optionally generate the files.

  Parameters:
    baseDatasets: dataset dict supplied by the experiment framework (not
      read here; kept for interface compatibility -- TODO confirm caller
      contract).
    generate: when True, also create the CSV data files on disk if they
      do not already exist.

  Returns:
    dict with key 'trainingFilename' and, when config['testSetPct'] yields
    a non-empty test set, 'testingFilename' as well.
  """
  # We're going to put datasets in data/dutyCycle/expname_<file>.csv

  # Prefix file names with the sub-experiment name, or "base" when running
  # outside a sub-experiment.
  expDir = getSubExpDir()
  if expDir is None:
    name = "base"
  else:
    name = os.path.basename(expDir)

  dataDir = "data/dutyCycle"

  trainingFilename = os.path.join(dataDir, name + "_" + config['trainingSet'])
  datasets = dict(trainingFilename=trainingFilename)

  # Size of the held-out test set: a fraction of all (A, B) combinations.
  numUnique = config['numAValues'] * config['numBValues']
  testSetSize = int(config['testSetPct'] * numUnique)    
  if testSetSize > 0:
    # NOTE(review): unlike trainingFilename, the test file is NOT prefixed
    # with the experiment name, despite the comment at the top -- verify
    # this asymmetry is intentional.
    testingFilename = os.path.join(dataDir, config['testingSet'])
    datasets['testingFilename'] = testingFilename
  else:
    testingFilename = None

  
  if not generate:
    return datasets

  # ========================================================================
  # Create the data files. We create a training set and a testing set. The
  #  testing set contains combinations of A and B that do not appear in the
  #  training set
  #
  
  if not os.path.exists(dataDir):
    os.makedirs(dataDir)

  # Only (re)generate when either required file is missing on disk.
  if (not os.path.exists(trainingFilename)) or \
     (testingFilename is not None and not os.path.exists(testingFilename)):
    print "====================================================================="
    print "Creating data set..."

    # Create the pool of A values
    aValues = range(config['numAValues'])
    # Create the pool of B values, allowing for unequal distribution
    bValues = range(config['numBValues'])

    # Fixed seed so the generated datasets are reproducible across runs.
    random.seed(42)
    def generateSample():
      # Draw one value uniformly from each pool. Duplicate entries in
      # bValues (added below) skew the B distribution toward 0.
      a = random.sample(aValues, 1)[0]
      b = random.sample(bValues, 1)[0]
      return (a, b)

    if config['b0Likelihood'] is not None:
      print "In the B dataset, there is a %d%% chance of getting a B value of 0" \
            % (int(100 * config['b0Likelihood']))
      # likelihood of B0 is: (numB0) / (numB0 + numBvalues)
      # solving for numB0 = numBValues / (1 - likelihood)
      # NOTE(review): solving L = numB0/(numB0+numBValues) actually gives
      # numB0 = numBValues * L / (1 - L); the expression below drops the
      # factor L -- confirm whether the printed likelihood is achieved.
      numB0Values = int(round(len(bValues) / (1.0 - config['b0Likelihood'])))
      bValues.extend([0]*numB0Values)   # pad the pool with extra zeros to bias B toward 0
    else:
      print "All values in B are equally likely"
    print

    # -----------------------------------------------------------------------
    fields = [('fieldA', 'int', ''), ('fieldB', 'int', '')]
    # Generate the test set: distinct (A, B) pairs reserved for testing.
    testSet = set()
    if testSetSize > 0:
      # Keep sampling until testSetSize distinct combinations are held back.
      while len(testSet) < testSetSize:
        testSet.add(generateSample())
      testList = list(testSet)
      testList.sort()
      print "These (A,B) combinations are reserved for the test set:", testList
      print

      # Write out the test set
      print "Creating test set: %s..." % (testingFilename)
      print "Contains %d unique combinations of A and B chosen from the %d possible" \
              % (testSetSize, numUnique)
      with File(testingFilename, fields=fields) as o:
        numSamples = 0
        # Rejection-sample: only draws that land in the reserved test
        # combinations are written, until iterationCount rows exist.
        while numSamples < config['iterationCount']:
          sample = generateSample()
          if sample in testSet:
            o.write(list(sample))
            #print >>fd, "%d, %d" % (sample[0], sample[1])

            numSamples += 1
      print

    # ------------------------------------------------------------------------
    # Write out the training set
    print "Creating training set: %s..." % (trainingFilename)
    if len(testSet) > 0:
      print "Contains %d samples, chosen from %d of the possible %d combinations " \
            "that are not in the test set" % (config['iterationCount'], 
            numUnique - testSetSize, numUnique)
    else:
      print "Contains %d samples" % (config['iterationCount'])
    print
    # Training samples are the complement: reject anything in testSet.
    with FileRecordStream(trainingFilename, write=True, fields=fields) as o:
      numSamples = 0
      while numSamples < config['iterationCount']:
        sample = generateSample()
        if sample in testSet:
          continue
        #print >>fd, "%d, %d" % (sample[0], sample[1])
        o.appendRecord(list(sample))
        numSamples += 1
 
  return datasets