def main(datafile='../data_by_cookie_slim.json', outputFolder='.', iterations=10, epochmult=4):
    """Train an MLP on early game scores and report d' statistics.

    The data is loaded with ``funcs.loadData`` and filtered with
    ``funcs.filterByPercRank(data, 75)``.  Every sequence with at least 10
    entries becomes one sample: the input is the normalised first 10 values
    and the target is 1 when the sequence has 20 or more entries, else 0.
    A ``MLP(10, 10, 10, 1)`` network is trained through ``learnLoop``; its
    output is pickled to ``<outputFolder>/runMLP.p``, read back, and the
    mean / std / max of the d' values are printed.

    Args:
        datafile: Path handed to ``funcs.loadData``.
        outputFolder: Folder in which ``runMLP.p`` is written.
        iterations: Forwarded to ``network.learnLoop`` as ``iterations``.
        epochmult: Multiplier; ``epochs`` is ``epochmult * samples.size``.
    """
    filename = 'runMLP'
    outputFile = '{}/{}.p'.format(outputFolder, filename)

    data = funcs.loadData(datafile)
    # Filter away bottom 75%
    data = funcs.filterByPercRank(data, 75)

    print('iterations: {}\nMultiplier Samplesize Epochs: {}\n output file: {}'.format(iterations, epochmult, outputFile))

    # Get first 10 values and try to decide whether people will keep on
    # playing past 20 games.
    samples = np.fromiter(
        ((funcs.normalize(np.array(k[:10])), 0 if len(k) < 20 else 1)
         for k in data if len(k) >= 10),
        dtype=[('input', float, 10), ('output', float, 1)])

    print('Learning from {} samples...'.format(samples.size))
    network = MLP(10, 10, 10, 1)

    def processResults(network, results):
        """Summarise one test run as (hit rate, false-alarm rate, d', weights).

        NOTE(review): each entry of ``results`` is indexed as
        ``(t[0], target, raw_output)``; confirm against ``learnLoop``.
        """
        def stepf(x):
            # Threshold the raw network output into a binary decision.
            return 0 if x < .5 else 1

        test_data = [(t[0], t[1], stepf(t[2])) for t in results]
        # Fraction of target == 1 samples that were predicted as 1.
        percHits = np.mean([1 if t[2] == 1 else 0 for t in test_data if t[1] == 1])
        # Fraction of target == 0 samples that were predicted as 1.
        falseAlarm = np.mean([1 if t[2] == 1 else 0 for t in test_data if t[1] == 0])
        dPrime = funcs.dprime(percHits, falseAlarm)
        return (percHits, falseAlarm, dPrime, network.weights)

    # 40 million epochs for full dataset.. Too many?
    out = network.learnLoop(samples, iterations=iterations,
                            epochs=epochmult * samples.size,
                            processResults=processResults)

    # Close the file explicitly before reading it back so the data is
    # guaranteed to be flushed to disk on every interpreter.
    with open(outputFile, 'wb') as handle:
        pickle.dump(out, handle)

    with open(outputFile, 'rb') as handle:
        dprimes = pickle.load(handle)

    # Set nan (and inf) to 0 so they do not poison the summary statistics.
    dprimes = [[0 if np.isnan(i) or np.isinf(i) else i for i in k[2]] for k in dprimes]

    outputs = range(1)  # only the first d' value of each run is summarised
    print('')
    print('Results:')
    print('Mean d\' score for each quit opportunity: {}'.format([np.mean([k[i] for k in dprimes]) for i in outputs]))
    print('Std : {}'.format([np.std([k[i] for k in dprimes]) for i in outputs]))
    print('Max : {}'.format([np.max([k[i] for k in dprimes]) for i in outputs]))
    print('')
    print('')
def runObs(data, outfolder='.', rankFilter=0, preprocess=False, processX=False, processY=False):
    """Compute and pickle summary statistics for pairs of attempt windows.

    For every combination of a first window size and a second window size
    (each taken from 5, 10, 15, 20, 25), the sequences in ``data`` with at
    least ``window1 + window2`` entries are split into their first
    ``window1`` entries and the following ``window2`` entries.  ``processX``
    is applied to the first group, ``processY`` to the second, and both
    results are pickled to ``outfolder`` as
    ``save_a5_xlist<window1>,<window2>.p`` and
    ``save_a5_ylist<window1>,<window2>.p``.

    Args:
        data: Iterable of per-player score sequences (must support ``len``
            and slicing).
        outfolder: Existing folder in which the pickle files are written.
        rankFilter: When non-zero, the selected sequences are passed through
            ``funcs.filterByPercRank(big, rankFilter)``.
        preprocess: Optional callable applied once to ``data``; skipped when
            falsy (``False``/``None``).
        processX: Callable ``(plays, play_indices)`` for the first window.
            ``plays`` is the transposed window (one row per attempt, one
            column per player).  Defaults to variance along axis 0.
        processY: Same as ``processX`` for the second window.  Defaults to
            the mean along axis 0.
    """
    windowSizes1 = range(5, 30, 5)  # Sizes of attempt group 1
    windowSizes2 = range(5, 30, 5)  # Sizes of attempt group 2

    # Falsy sentinels (False, None) select the default statistics.
    if not processX:
        processX = lambda x, x_plays: np.var(x, axis=0)
    if not processY:
        processY = lambda x, x_plays: np.mean(x, axis=0)

    # Run preprocessing if passed along
    if preprocess:
        data = preprocess(data)

    for window1 in windowSizes1:
        for window2 in windowSizes2:
            total_attempts = window1 + window2
            first_plays = list(range(window1))
            second_plays = list(range(window1, total_attempts))

            # Look at the subsample of people who played at least
            # ``total_attempts`` times.
            big = [k for k in data if len(k) >= total_attempts]
            if rankFilter != 0:
                big = funcs.filterByPercRank(big, rankFilter)

            # For each player: plays [0, window1) and [window1, total).
            first = [k[0:window1] for k in big]
            second = [k[window1:total_attempts] for k in big]

            # Transpose so that each row is one attempt across all players;
            # list() keeps this a concrete sequence on Python 3 as well.
            x = processX(list(zip(*first)), first_plays)
            y = processY(list(zip(*second)), second_plays)

            suffix = str(window1) + "," + str(window2) + '.p'
            # Context managers guarantee the files are flushed and closed.
            with open(outfolder + '/save_a5_xlist' + suffix, 'wb') as handle:
                pickle.dump(x, handle)
            with open(outfolder + '/save_a5_ylist' + suffix, 'wb') as handle:
                pickle.dump(y, handle)