def prepare(self, testSet, rand, numSamples=20, samplePercent=50):
    """Sets up the bootstrap sample sets and one ANN per sample set.

    Draws C{numSamples} samples from C{testSet}; for each sample a new ANN
    is created and prepared on that sample with C{popSize=1}. The samples
    are stored in C{self.sampleSets}, the ANNs in C{self.anns} (same order),
    and C{self.sampleSize} is set to the size of the first sample.

    Note: with C{numSamples} of 0 no sample is drawn and reading the first
    sample's size raises IndexError.

    @param testSet: test set on which to test the ANN
    @type testSet: input.DataSet
    @param rand: source of randomness for bootstrap samples
    @type rand: random.Random
    @param numSamples: number of bootstrap samples
    @type numSamples: int
    @param samplePercent: size of each sample, in percent of the testSet
    @type samplePercent: int
    """
    # Local aliases refer to the very same lists stored on the instance.
    samples = self.sampleSets = []
    networks = self.anns = []
    for _ in range(numSamples):
        drawn = testSet.sample(samplePercent, rand)
        samples.append(drawn)
        # Each sample gets its own ANN, prepared before the next draw.
        net = ANN()
        net.prepare(drawn, popSize=1)
        networks.append(net)
    self.sampleSize = samples[0].size
def main(): import input logging.basicConfig(level=logging.INFO, stream=sys.stdout) np.set_printoptions(precision=3, edgeitems=3, threshold=20) random.seed(80085) # used by the GA randSample = random.Random(input.SAMPLE_SEED) # used for data set sampling inp = input.Input("train3-std.tsv", randSample) print "Train set:", inp.trainSet.show() print "Test set:", inp.testSet.show() n = inp.trainSet.size * 20/100 a = ANN() a.prepare(inp.trainSet, POPSIZE) tester = SampleTester() tester.prepare(inp.testSet, randSample) tester.showSampleSets() params = [] generatePop(params) for genIndex in range(TOTAL_GENS): print "Generation", genIndex, "starting." logFP("Population", params) outputValues = a.evaluate(params, returnOutputs=True) logFP("Outputs", outputValues) thresholds = a.nlargest(n) logFP("Thresholds", thresholds) lifts = a.lift(n) logFP("Lifts", lifts) taggedParams = sorted(zip(lifts, params, range(len(params))), key=lambda (l, p, i): l, reverse=True) sortedParams = [p for l, p, i in taggedParams] logFP("Sorted pop", sortedParams) testLift, _ = tester.test(sortedParams[0]) genplot.addGeneration(lifts, testLift, genIndex) mutateValue = 1.0 params = generateGeneration(sortedParams, mutateValue) args = sys.argv[1:] if len(args) == 1: open(args[0], "w").write(repr(sortedParams[0])) genplot.plot()