def find_all(): print '\nTesting that each random partitioning algorithm can discover the entire feasible set.' q = 20 n = 5 answer = 84 # there 84 partitions of 20 having 5 parts sample_size = 1000 names = ['divide_and_conquer','multiplicity','top_down','bottom_up'] for name in names: passtest = 0 feasibleset = [] D = {} while passtest < 1: partitions = parts.rand_partitions(q, n, sample_size, name, D, zeros=False) partitions = [list(x) for x in set(tuple(x) for x in partitions)] feasibleset.extend(partitions) feasibleset = [list(x) for x in set(tuple(x) for x in feasibleset)] if len(feasibleset) == answer: print 'entire feasible set found using',name,'PASS' passtest +=1 elif len(feasibleset) > answer: print 'feasible set is larger than it should be. FAIL.' break else: print 'working to randomly generate entire feasible set for q='+str(q)+' n='+str(n) return
def time_trials_sage(): """ Compare the speed of the random partitioning function of Sage to the functions developed in Locey and McGlinn (2013), for cases when zero values are and are not allowed.""" fig = plt.figure() algorithms = ['multiplicity', 'top_down', 'divide_and_conquer', 'bottom_up'] LABELS = ['Multiplicity:', 'Top down:', 'Divide and conquer:', 'Bottom up:'] COLORS = ['#00CED1', '#FF1493', 'k', 'gray'] qs = [50, 100, 150, 200] sample_size = 300 for i, q in enumerate(qs): ax = fig.add_subplot(2,2,i+1) print '\n',q step = int(q/10) ns = range(step, int(0.5*q)+step,step) Sage_times = [] for n in ns: print 'Sage', n with Timer() as t: x = rand_part_Sage(q, n, sample_size) Sage_times.append(round(t.interval,2)) print 'Sage', q, Sage_times, '\n' for ii, name in enumerate(algorithms): times = [] for n in ns: zeros=False D = {} with Timer() as t: x = parts.rand_partitions(q, n, sample_size, name, D, zeros) times.append(round(t.interval,2)) Y = [] for iii, t in enumerate(times): if Sage_times[ii] > 0.0: Y.append(float(Sage_times[iii])/t) else: Y.append(1.0) plt.plot(ns, Y, lw=3, color=COLORS[ii], label=LABELS[ii]) plt.yscale('log') if i == 0: plt.text(1.5,11,'Sage/algorithm',fontsize=10,rotation='90') legend(bbox_to_anchor=(-0.03, 1.1, 2.25, .2), loc=10, ncol=4, mode="expand",prop={'size':9})#, borderaxespad=0.) plt.tick_params(axis='both', which='major', labelsize=8) plt.xlabel('number of parts',fontsize=10) plt.savefig('time_trials_sage.png', dpi=500) #, pad_inches=0) return
def time_trials_bigQ():
    """ Compare the speed of the random partitioning functions developed in
    Locey and McGlinn (2013), for cases when zero values are and are not
    allowed. The code generates figure ? of the Appendix from Locey and
    McGlinn (2013).

    Writes 'time_trials_bigQ.png': one subplot per large total q, plotting
    seconds per sample against the number of parts for each algorithm that
    remains tractable at that q.
    """
    fig = plt.figure()
    sample_size = 1
    qs = [5000, 20000, 200000, 500000]  # at these values, this script will take >1 day to run
    for i, q in enumerate(qs):
        ax = fig.add_subplot(2, 2, i+1)
        # As q grows, drop the slower algorithms and test larger numbers of parts.
        if i == 0:
            algorithms = ['multiplicity', 'top_down', 'divide_and_conquer', 'bottom_up']
            LABELS = ['Multiplicity:', 'Top down:', 'Divide and conquer:', 'Bottom up:']
            COLORS = ['#00CED1', '#FF1493', 'k', 'gray']
            ns = [10, 20, 40, 80]
        elif i == 1:  # was a bare `if` in the original; `elif` makes the chain consistent (same behavior)
            algorithms = ['multiplicity', 'top_down', 'divide_and_conquer']
            LABELS = ['Multiplicity:', 'Top down:', 'Divide and conquer:']
            COLORS = ['#00CED1', '#FF1493', 'k']
            ns = [20, 40, 80, 160]
        elif i == 2:
            algorithms = ['multiplicity', 'top_down']
            LABELS = ['Multiplicity:', 'Top down:']
            COLORS = ['#00CED1', '#FF1493']
            ns = [40, 80, 160, 320]
        elif i == 3:
            algorithms = ['multiplicity']
            LABELS = ['Multiplicity:']
            COLORS = ['#00CED1']
            ns = [80, 160, 320, 640]
        for ii, name in enumerate(algorithms):
            times = []
            for n in ns:
                D = {}
                with Timer() as t:
                    x = parts.rand_partitions(q, n, sample_size, name, D, zeros=False)
                times.append(round(t.interval, 2))
            plt.plot(ns, times, color=COLORS[ii], lw=3, label=LABELS[ii])
        plt.xlabel('number of parts, sample size='+str(sample_size)+')', fontsize=8)
        plt.tick_params(axis='both', which='major', labelsize=8)
        if i == 0 or i == 2:
            plt.ylabel("Seconds", fontsize=10)
        if i == 0:
            legend(bbox_to_anchor=(0.0, 1.1, 2.2, .2), loc=10, ncol=4,
                   mode="expand", prop={'size': 9})
    plt.savefig('time_trials_bigQ.png', dpi=500)
    return
def get_var_for_Q_N(q, n, sample_size, t_limit, analysis): """Given q and n, returns a list of variance of length sample size with variance of each sample partitions or compositions. """ QN_var = [] try: with time_limit(t_limit): for Niter in range(sample_size): if analysis == 'partition': QN_parts = parts.rand_partitions(q, n, 1, 'bottom_up', {}, True) else: QN_parts = rand_compositions(q, n, 1, True) QN_var.append(np.var(QN_parts[0], ddof = 1)) return QN_var except TimeoutException, msg: print 'Timed out!' return QN_var
def get_all_SSADs(qnlist): # Figure X Locey and McGlinn (2013) s_size = 300 i = 1 fig = plt.figure() q = qnlist[0] nlist = qnlist[1] colors = ['#00CED1', '#FF1493', 'gray'] for ind, num in enumerate(nlist): ax = fig.add_subplot(3,3,i) n = num print i ct = 0 clr = colors[ind] sample_size = s_size zeros = True D = {} name = 'divide_and_conquer' SSADs = parts.rand_partitions(q, n, sample_size, name, D, zeros) for SSAD in SSADs: Y = vectorTohist(SSAD, zeros) while len(Y) > 40: Y.pop() while len(Y) < 40: Y.append(0) x = range(0,40) plt.bar(x,Y, color=clr, linewidth=0, align='center', alpha = 0.015) plt.bar([0],[0], color=clr, linewidth=0, align='center', label= 'q='+str(q)+', n='+str(n)) ct+=1 plt.xlim(-1,8) plt.xlabel("Abundance class",fontsize=8) plt.ylabel("Frequency",fontsize=8) plt.tick_params(axis='both', which='major', labelsize=5) plt.setp(ax, xticks=[0,1,2,3,4,5,6,7,8]) leg = plt.legend(loc=1,prop={'size':7}) leg.draw_frame(False) i+=1 ct = 0 ax = fig.add_subplot(3,3,i) print i clr = colors[ind] sample_size = s_size zeros = True D = {} name = 'divide_and_conquer' partitions = parts.rand_partitions(q, n, sample_size, name, D, zeros) skews = [] for partition in partitions: skews.append(stats.skew(partition)) D = get_kdens(skews) plt.plot(D[0],D[1],color = clr,lw=3, alpha = 0.99,label= 'n='+str(n)) plt.xlabel("Skewnness",fontsize=8) plt.tick_params(axis='both', which='major', labelsize=5) plt.ylabel("Density",fontsize=8) i+=1 ct = 0 ax = fig.add_subplot(3,3,i) print i clr = colors[ind] sample_size = s_size zeros = False D = {} name = 'divide_and_conquer' RADs = parts.rand_partitions(q, n, sample_size, name, D, zeros) ranks = range(1,n+1) max_ab = 0 varlist = [] for rad in RADs: log_rad = list(np.log(rad)) variance = np.var(rad, ddof=1) varlist.append(variance) if max(rad) > max_ab: max_ab = max(rad) plt.plot(ranks,rad, color=colors[ind], lw=1.0,alpha=0.04) print ' log(mean) vs. 
log(variance):', np.log(q/n), np.log(np.mean(varlist)) plt.tick_params(axis='both', which='major', labelsize=5) plt.yscale('log') plt.xlabel("Rank",fontsize=8) plt.ylabel("Abundance",fontsize=8) i+=1 plt.subplots_adjust(wspace=0.4, hspace=0.12) plt.savefig('SSADfig-'+str(q)+'-'+str(n)+'.png', dpi=600, bbox_inches = 'tight', pad_inches=0.01) print 'done'
def kdens_unbias():
    """ The code below compares random partitioning functions of Sage and
    Locey and McGlinn (2013) to full feasible sets. These analyses confirm
    that the algorithms are unbiased. The code uses full feasible sets, the
    random partitioning function in Sage, and the random partitioning for
    cases when 0's are or are not allowed.

    Four subplots are drawn (2x2): kernel densities of a chosen statistical
    metric across sampled partitions for each algorithm, overlaid in red with
    the same density computed from a reference distribution (full feasible
    set for subplots 1-2; Sage random samples for subplots 3-4).

    NOTE(review): relies on Sage's global `Partitions` and on the project
    modules `parts` and `mt` being in scope — confirm against the imports at
    the top of the file.
    """
    algs = ['multiplicity','top_down','divide_and_conquer','bottom_up']
    colors = ['#00CED1','#FF1493','k','gray']
    fig = plt.figure()
    nplot = 1 # a variable used to designate subplots
    sample_size = 10000 # min number of macrostates needed to safely capture
                        # distributional features across the feasible set
    metrics = ['gini', 'variance', 'median', 'skewness', 'evar']
    metric = metrics[2]  # 'median' is the metric compared in this run
    while nplot <= 4:
        ax =fig.add_subplot(2,2,nplot)
        if nplot < 3:
            q = 50 # values of q and n small enough to generate
            n = 10 # the entire feasible set
        else:
            q = 100 # values of q and n requiring random samples
            n = 20 # of feasible sets
        partitions = []
        # One density curve per algorithm; zeros excluded in subplots 1 and 3,
        # included in subplots 2 and 4.
        for i, alg in enumerate(algs):
            if nplot == 1 or nplot == 3:
                zeros = False
                D = {}
                partitions = parts.rand_partitions(q, n, sample_size, alg, D, zeros)
            else:
                D = {}
                zeros = True
                partitions = parts.rand_partitions(q, n, sample_size, alg, D, zeros)
            kdens = mt.get_kdens_obs(partitions, metric)
            plt.xlim(min(kdens[0]), max(kdens[0]))
            plt.plot(kdens[0], kdens[1], color=colors[i], lw=0.7)
        if nplot == 1: # using the full feasible set, no zero values
                       # (i.e. proper integer partitions)
            partitions = []
            numparts = parts.NrParts(q, n)
            partition = parts.first_lexical(q, n, None)
            partitions.append(partition)
            ct2 = 0
            # Enumerate the entire feasible set by lexical succession.
            while len(partitions) < numparts:
                partition = parts.next_restricted_part(partition)
                if len(partition) == n:
                    partitions.append(partition)
                else:
                    # next_restricted_part must preserve the part count
                    print 'bug in next_restricted_part()'
                    sys.exit()
            kdens = mt.get_kdens_obs(partitions, metric)
            plt.xlim(min(kdens[0]), max(kdens[0]))
            plt.plot(kdens[0], kdens[1], color='r', lw=3.0, alpha=0.5)
        elif nplot == 2: # using the full feasible set, zero values included
            partitions = []
            # Pad every partition of q having <= n parts with zeros up to length n.
            for p in Partitions(q):
                partition = list(p)
                if len(partition) == n:
                    partitions.append(partition)
                elif len(partition) < n:
                    zeros = [0]*(n-len(partition))
                    partition.extend(zeros)
                    partitions.append(partition)
            kdens = mt.get_kdens_obs(partitions, metric)
            plt.xlim(min(kdens[0]), max(kdens[0]))
            plt.plot(kdens[0], kdens[1], color='r', lw=3.0, alpha=0.5)
        elif nplot == 3:
            partitions = []
            # Use the random partition function in Sage to generate
            # partitions for q and n; a rejected draw gets a second chance
            # via its conjugate.
            while len(partitions) < sample_size:
                partition = Partitions(q).random_element()
                if len(partition) == n:
                    partitions.append(partition)
                else:
                    partition = parts.conjugate(partition)
                    if len(partition) == n:
                        partitions.append(partition)
            kdens = mt.get_kdens_obs(partitions, metric)
            plt.xlim(min(kdens[0]), max(kdens[0]))
            plt.plot(kdens[0], kdens[1], color='r', lw=3.0, alpha=0.5)
        elif nplot == 4:
            partitions = []
            # Use the random partition function in Sage, zero-padding
            # partitions with fewer than n parts.
            while len(partitions) < sample_size:
                part = list(Partitions(q).random_element())
                if len(part) == n:
                    partitions.append(part)
                elif len(part) < n:
                    zeros = [0]*(n - len(part))
                    part.extend(zeros)
                    partitions.append(part)
            kdens = mt.get_kdens_obs(partitions, metric)
            plt.xlim(min(kdens[0]), max(kdens[0]))
            plt.plot(kdens[0], kdens[1], color='r', lw=3.0, alpha=0.5)
        if nplot == 1:
            # Zero-length plots: legend entries only, nothing visible is drawn.
            plt.plot([0],[0], color='#00CED1', lw=2, label = 'Multiplicity')
            plt.plot([0],[0], color='#FF1493',lw=2, label='Top-down')
            plt.plot([0],[0], color='k',lw=2, label='Divide & Conquer')
            plt.plot([0],[0], color='gray',lw=2, label='Bottom-up')
            plt.plot([0],[0], color='r',lw=2, label='FS q='+str(q)+', n='+str(n),alpha=0.5)
            plt.legend(bbox_to_anchor=(-0.02, 1.00, 2.24, .2), loc=10, ncol=5,
                mode="expand",prop={'size':8})
        if nplot == 1 or nplot == 3:
            plt.ylabel("density", fontsize=12)
        if nplot == 3 or nplot == 4:
            plt.xlabel(metric, fontsize=12)
        print nplot
        nplot+=1
        plt.tick_params(axis='both', which='major', labelsize=8)
    plt.savefig('kdens_'+metric+'_'+str(sample_size)+'.png', dpi=500, pad_inches=0)
def bias_check(): print '\nTesting algorithms for bias across combinations of q and n, allowing or excluding zero-valued parts.' qn_combos = [[50,10],[100,20],[200,40]] for combo in qn_combos: q = combo[0] n = combo[1] sagepartitions = [] DATA = open('./testfiles/sage_zeros_q=' + str(q) + '_n='+str(n)+'.txt','r') for line in DATA: partition = eval(line) sagepartitions.append(partition) sagevars = [] for partition in sagepartitions: var = np.var(partition, ddof=1) sagevars.append(var) names = ['divide_and_conquer','multiplicity','top_down','bottom_up'] for name in names: D = {} sample_size = len(sagepartitions) passes = 0 tries = 0 while tries < 10: partitions = parts.rand_partitions(q, n, sample_size, name, D, zeros=True) #partitions = [list(x) for x in set(tuple(x) for x in partitions)] myvars = [] for partition in partitions: var = np.var(partition, ddof=1) myvars.append(var) tries+=1 ks, p = stats.ks_2samp(sagevars, myvars) #t, p = stats.ttest_ind(sagevars, myvars) if p > 0.05: passes+=1 print name,'(q =', q,' n =',n,') with zeros: PASSED', passes,'out of',tries DATA.close() sagepartitions = [] DATA = open('./testfiles/sage_q=' + str(q) + '_n='+str(n)+'.txt','r') for line in DATA: partition = eval(line) sagepartitions.append(partition) sagevars = [] for partition in sagepartitions: var = np.var(partition, ddof=1) sagevars.append(var) names = ['divide_and_conquer','multiplicity','top_down','bottom_up'] for name in names: D = {} sample_size = len(sagepartitions) passes = 0 tries = 0 while tries < 10: partitions = parts.rand_partitions(q, n, sample_size, name, D, zeros=False) #partitions = [list(x) for x in set(tuple(x) for x in partitions)] myvars = [] for partition in partitions: var = np.var(partition, ddof=1) myvars.append(var) tries+=1 ks, p = stats.ks_2samp(sagevars, myvars) #t, p = stats.ttest_ind(sagevars, myvars) if p > 0.05: passes+=1 print name,'(q =', q,' n =',n,') no zeros: PASSED', passes,'out of',tries DATA.close() return
# Timing sweep over totals q. Context: `qs`, `alg`, `zero`, `zeros`,
# `sample_size`, and the `time` and `parts` modules are defined earlier in
# the script, outside this excerpt.
for q in qs:
    # Coarser steps through n as q grows, to bound total runtime.
    if q <= 100: step = 1
    elif q <= 300: step = 5
    elif q <= 600: step = 10
    else: step = 20
    n = int(step)
    while n <= q:
        # NOTE(review): `alg` is printed twice — the second was probably
        # meant to be `n`; confirm intended progress output.
        print alg, zero, alg, q
        times = []
        D = {}
        # Wall-clock one call drawing `sample_size` random partitions.
        t0 = time.time()
        x = parts.rand_partitions(q, n, sample_size, alg, D, zeros)
        t = time.time() - t0
        times.append([n,t])
        # Append the (n, seconds) pair to a per-algorithm results file;
        # the '_zeros' suffix marks runs that allow zero-valued parts.
        if zero == True:
            OUT = open('time_files/Python_' + alg + '_zeros_q=' + str(q) + '.txt','a+')
        elif zero == False:
            OUT = open('time_files/Python_' + alg + '_q=' + str(q) + '.txt','a+')
        for i in times:
            print>>OUT, i[0], i[1]
        OUT.close()
        n+=step