def getDatasets(baseDatasets, generate=False):
  # We're going to put datasets in data/dutyCycle/expname_<file>.csv
  expDir = getSubExpDir()
  if expDir is None:
    name = "base"
  else:
    name = os.path.basename(expDir)

  dataDir = "data/dutyCycle"
  trainingFilename = os.path.join(dataDir, name + "_" + config['trainingSet'])
  datasets = dict(trainingFilename=trainingFilename)

  numUnique = config['numAValues'] * config['numBValues']
  testSetSize = int(config['testSetPct'] * numUnique)
  if testSetSize > 0:
    testingFilename = os.path.join(dataDir, config['testingSet'])
    datasets['testingFilename'] = testingFilename
  else:
    testingFilename = None

  if not generate:
    return datasets

  # ==========================================================================
  # Create the data files. We create a training set and a testing set. The
  # testing set contains combinations of A and B that do not appear in the
  # training set.
  if not os.path.exists(dataDir):
    os.makedirs(dataDir)

  if (not os.path.exists(trainingFilename)) or \
     (testingFilename is not None and not os.path.exists(testingFilename)):
    print "====================================================================="
    print "Creating data set..."

    # Create the pool of A values
    aValues = range(config['numAValues'])
    # Create the pool of B values, allowing for an unequal distribution
    bValues = range(config['numBValues'])

    # Fixed seed so the generated datasets are reproducible
    random.seed(42)

    def generateSample():
      # Pick a random A and B value from the pools
      a = random.sample(aValues, 1)[0]
      b = random.sample(bValues, 1)[0]
      return (a, b)

    if config['b0Likelihood'] is not None:
      print "In the B dataset, there is a %d%% chance of getting a B value of 0" \
            % (int(100 * config['b0Likelihood']))
      # The likelihood of drawing a 0 from the padded pool is:
      #   numB0 / (numB0 + numBValues)
      # Solving for numB0 gives:
      #   numB0 = numBValues * likelihood / (1 - likelihood)
      # so pad the pool with that many extra zeros. (This ignores the single 0
      # already in the pool, a negligible approximation for any reasonably
      # sized pool.)
      numB0Values = int(round(len(bValues) * config['b0Likelihood'] /
                              (1.0 - config['b0Likelihood'])))
      bValues.extend([0] * numB0Values)
    else:
      print "All values in B are equally likely"
    print

    # -------------------------------------------------------------------------
    fields = [('fieldA', 'int', ''), ('fieldB', 'int', '')]

    # Generate the test set: hold back testSetPct of the possible (A, B)
    # combinations
    testSet = set()
    if testSetSize > 0:
      while len(testSet) < testSetSize:
        testSet.add(generateSample())
      testList = sorted(testSet)
      print "These (A,B) combinations are reserved for the test set:", testList
      print

      # Write out the test set, keeping only samples whose (A, B) combination
      # was reserved above
      print "Creating test set: %s..." % (testingFilename)
      print "Contains %d unique combinations of A and B chosen from the %d possible" \
            % (testSetSize, numUnique)
      with File(testingFilename, fields=fields) as o:
        numSamples = 0
        while numSamples < config['iterationCount']:
          sample = generateSample()
          if sample in testSet:
            o.write(list(sample))
            numSamples += 1
      print

    # -------------------------------------------------------------------------
    # Write out the training set, skipping any sample whose (A, B) combination
    # is reserved for the test set
    print "Creating training set: %s..." % (trainingFilename)
    if len(testSet) > 0:
      print "Contains %d samples, chosen from %d of the possible %d combinations " \
            "that are not in the test set" % (config['iterationCount'],
                                              numUnique - testSetSize, numUnique)
    else:
      print "Contains %d samples" % (config['iterationCount'])
    print
    with FileRecordStream(trainingFilename, write=True, fields=fields) as o:
      numSamples = 0
      while numSamples < config['iterationCount']:
        sample = generateSample()
        if sample in testSet:
          continue
        o.appendRecord(list(sample))
        numSamples += 1

  return datasets
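# -----------------------------------------------------------------------------
# Context sketch (not part of the original experiment code): getDatasets()
# relies on module-level names defined elsewhere in the description file.
# The imports, the getSubExpDir() stub, and the config values below are
# illustrative assumptions only -- the import paths follow the (Python 2)
# NuPIC source layout, and the real experiment framework supplies its own
# config dict and sub-experiment directory.

import os
import random

from nupic.data.file import File                            # assumed path
from nupic.data.file_record_stream import FileRecordStream  # assumed path


def getSubExpDir():
  # Stub: the experiment framework normally returns the sub-experiment
  # directory, or None for the base experiment.
  return None


# Illustrative configuration; the real description file defines its own.
config = dict(
    trainingSet='training.csv',  # filename suffix for the training CSV
    testingSet='testing.csv',    # filename for the held-out test CSV
    numAValues=10,               # size of the A value pool
    numBValues=10,               # size of the B value pool
    testSetPct=0.1,              # fraction of (A,B) combinations held out
    iterationCount=1000,         # records to write per dataset
    b0Likelihood=0.9,            # desired probability that B == 0, or None
)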
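# -----------------------------------------------------------------------------
# Minimal usage sketch, assuming the context above and an installed NuPIC:
# generate the CSV files on the first call, then reuse the returned paths on
# later calls. baseDatasets is accepted for interface compatibility with the
# experiment framework but is unused by this function.
if __name__ == '__main__':
  datasets = getDatasets({}, generate=True)
  print "Training data written to:", datasets['trainingFilename']
  if 'testingFilename' in datasets:
    print "Testing data written to:", datasets['testingFilename']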