def lognormal_adsolver(pts):
    """
    Fit a lognormal to pts by solving the solve_admin equations with
    opt.fsolve, starting from the logn.Lognormal.fromFit estimate.

    pts: Data points; need not be sorted (the ECDF is built with
         issorted=False)

    Returns logn.Lognormal(mu, sigma) for the solved parameters.
    Raises logn.LognormalConvergenceError, carrying the solver message and
    the last (mu, sigma) iterate, when fsolve does not report success.
    """
    ec = util.ecdf(np.array(pts), issorted=False)
    x = ec[:, 0]
    xrev = util.reverse(x)
    n = float(len(x))
    i = np.arange(len(x), dtype=float)

    # Initial values come from the direct fit of the raw points.
    l1 = logn.Lognormal.fromFit(pts)
    ivs = [l1.mu(), l1.sigma()]
    ovs = (i, x, xrev, n)

    # Positional tail is (fprime=None, full_output=1, col_deriv=0);
    # full_output is what makes fsolve return the 4-tuple below.
    (fvals, infodict, ier, mesg) = opt.fsolve(solve_admin, ivs, ovs, None, 1, 0)
    f_mu = fvals[0]
    f_sigma = fvals[1]
    if ier != 1:
        raise logn.LognormalConvergenceError(mesg, (f_mu, f_sigma))
    return logn.Lognormal(f_mu, f_sigma)
def lognormal_adsolver(pts):
    """
    Solve the solve_admin equations for a lognormal fitted to pts.

    pts: Data points; need not be sorted (the ECDF is built with
         issorted=False)

    The solver (opt.fsolve) is started from the mu/sigma of
    logn.Lognormal.fromFit(pts).

    Returns logn.Lognormal(mu, sigma) for the solved parameters.
    Raises logn.LognormalConvergenceError with the solver message and the
    last (mu, sigma) iterate when fsolve's status flag is not 1.
    """
    ec = util.ecdf(np.array(pts), issorted=False)
    x = ec[:, 0]
    xrev = util.reverse(x)
    n = float(len(x))
    i = np.arange(len(x), dtype=float)

    seed = logn.Lognormal.fromFit(pts)
    ivs = [seed.mu(), seed.sigma()]
    ovs = (i, x, xrev, n)

    # Positional tail is (fprime=None, full_output=1, col_deriv=0);
    # full_output is what makes fsolve return the 4-tuple below.
    (fvals, infodict, ier, mesg) = opt.fsolve(solve_admin, ivs, ovs, None, 1, 0)
    f_mu, f_sigma = fvals[0], fvals[1]
    if ier != 1:
        raise logn.LognormalConvergenceError(mesg, (f_mu, f_sigma))
    return logn.Lognormal(f_mu, f_sigma)
def paroptfit(x1, hi, lo, n, fit1, mt1):
    """
    Run optfit over a grid of candidate xmax values in a process pool and
    return the entry best_fit selects when comparing on tuple index 2.

    x1: Sorted array of points
    hi: Max. xmax to estimate to
    lo: Min. xmax to estimate to
    n: #. of points from hi to lo for the estimation
    fit1: Type of fit - mlefit or mmefit
    mt1: Mirror xform - True or False
    """
    global x, fit, mt, c, max_of_x

    ## Initialize globals prior to the parallel run ##
    ## They are set before the pool exists; presumably the workers pick
    ## them up when forked - confirm before running on a non-fork platform.
    x = x1
    max_of_x = max(x)
    fit = fit1
    mt = mt1
    c = util.ecdf(x)

    # Candidate grid, always including the sample maximum itself.
    l_xmax_pts = util.gen_points(lo, hi, n).tolist()
    l_xmax_pts.append(max_of_x)
    l_xmax_pts.sort()
    xmax_pts = np.array(l_xmax_pts)

    proc_pool = Pool(mp.cpu_count())
    try:
        result = proc_pool.map(optfit, xmax_pts)
    finally:
        # Always release the worker processes, even when a fit raises.
        proc_pool.terminate()
        proc_pool.join()

    FIT_COMP_IDX = 2
    return best_fit(result, FIT_COMP_IDX)
def catm_cdf():
    """
    Plot the empirical CDF of the catm session lengths together with the
    CDFs of two pre-computed ModLav fits (MLE and MME).

    Takes no arguments and returns nothing; the curves are added to the
    current pyplot figure.
    """
    try:
        plt.show()
    except Exception:
        # Best effort: a failing show() should not stop the plot from being
        # built, but Ctrl-C / interpreter exit must still get through.
        pass

    # These are already computed values.
    # The script is only to plot the data.
    # Please refer to the notes on how to re-compute.
    # xmax is the value from the optimization run for mme
    x = util.read_data("/home/gautam/research/modlav-plots/seslen/catm_ses")
    mle = ml.ModLav(0.9398, 3003.4797, 0.1488)
    mme = ml.ModLav(1.0, 3243.2566, 0.1894)
    xmax = 17123.2972  # kept for reference only; not used by the plot

    ec = util.ecdf(x)
    mleec = mle.cdf(ec[:, 0])
    mmeec = mme.cdf(ec[:, 0])

    plt.plot(ec[:, 0], ec[:, 1], 'k-', label='Data', linestyle='steps')
    plt.plot(ec[:, 0], mleec, 'k--', label='MLE fit')
    plt.plot(ec[:, 0], mmeec, 'k-.', label='MME fit')
    plt.grid()
    plt.xlabel("Session length [seconds]")
    plt.ylabel("P(X <= x)")
    plt.ylim((0.0, 1.0))
    plt.legend(loc=4)
def main():
    """
    For all users and then for each user category, write the ECDFs of the
    time and data utilization values returned by acct_util and print their
    statistics.

    Output files are tu_<cat>.dat / du_<cat>.dat ("all" for the
    uncategorized run). The uncategorized run additionally writes the
    paired, still unsorted values to tudu.dat.
    """
    rule = "------------------------------------------------"
    for cat in (None, 'catm\n', 'catd\n', 'cath\n'):
        r = acct_util(cat)

        if cat is None:
            tuf, duf = "tu_all.dat", "du_all.dat"
        else:
            tuf = "tu_" + cat.strip() + ".dat"
            duf = "du_" + cat.strip() + ".dat"

        tu, du = r[1], r[2]

        # Need unsorted values
        if cat is None:
            tudu = np.zeros((len(tu), 2))
            tudu[:, 0] = tu
            tudu[:, 1] = du
            util.write_data("tudu.dat", tudu)

        tu.sort()
        du.sort()
        ctu = util.ecdf(tu, zdisp=True)
        cdu = util.ecdf(du, zdisp=True)
        util.write_data(tuf, ctu)
        util.write_data(duf, cdu)

        pcat = "ALL" if cat is None else cat.strip()

        # Single-argument print(...) emits exactly what the multi-item
        # print statement did (items were separated by one space).
        print("CATEGORY:  " + pcat)
        print("Time utilization")
        print(rule)
        util.pstats(tu)
        print("")
        print("Data utilization")
        print(rule)
        util.pstats(du)
        print("")
        print("")
def process_datasets(tag_file_map):
    """
    Fit every data set in tag_file_map, record the fits and write the
    empirical and fitted CDF/CCDF curves next to each data file.

    tag_file_map: Mapping of tag -> data file name

    Returns a dict of tag -> (best body fit, best tail fit), which is also
    printed once all data sets are done.
    """
    def model_curve(pts, values):
        # Two-column (point, value) array as expected by util.write_data.
        return np.array([pts, values]).transpose()

    report_map = {}
    for tag, data_file in tag_file_map.items():
        x = util.read_data(data_file)
        x.sort()
        ec = util.ecdf(x)
        cc = util.ccdf(x)

        fit_map = compute_fits(x)
        insert_db_record(tag, fit_map)

        ## Figure out best fit
        bfit = best_fits(fit_map)
        report_map[tag] = (bfit["best_body"], bfit["best_tail"])

        ## Write files out to the directory
        util.write_data(data_file + "_ecdf", ec)
        util.write_data(data_file + "_ccdf", cc)

        # CCDF curves are sampled at 2000 points generated in log10 space.
        ccpts = np.power(
            10, util.gen_points(math.log10(min(x)), math.log10(max(x)), 2000))
        ecpts = ec[:, 0]

        # Resolve all three models up front so a missing key fails before
        # any model file is written.
        models = [(suffix, fit_map[key][0])
                  for key, suffix in (("MME", "lognmme"),
                                      ("MLE", "lognmle"),
                                      ("FITMIN", "lognfitmin"))]
        for suffix, model in models:
            fitted_ec = model_curve(ecpts, model.cdf(ecpts))
            fitted_cc = model_curve(ccpts, model.ccdf(ccpts))
            util.write_data(data_file + "_ecdf." + suffix, fitted_ec)
            util.write_data(data_file + "_ccdf." + suffix, fitted_cc)

    for k in report_map:
        body, tail = report_map[k][0], report_map[k][1]
        print(k + " BODY: " + body + " TAIL: " + tail)
    return report_map
def process_datasets(tag_file_map):
    """
    For each tagged data file: compute the fits, store them through
    insert_db_record, pick the best body/tail fit and write the empirical
    and model CDF/CCDF files alongside the data file.

    tag_file_map: Mapping of tag -> data file name

    Returns a dict of tag -> (best body fit, best tail fit); the same
    information is printed, one line per tag, before returning.
    """
    # (fit_map key, output file suffix), in the order the files are written.
    model_files = (("MME", "lognmme"),
                   ("MLE", "lognmle"),
                   ("FITMIN", "lognfitmin"))

    report_map = {}
    for tag, data_file in tag_file_map.items():
        x = util.read_data(data_file)
        x.sort()
        ec = util.ecdf(x)
        cc = util.ccdf(x)

        fit_map = compute_fits(x)
        insert_db_record(tag, fit_map)

        ## Figure out best fit
        bfit = best_fits(fit_map)
        report_map[tag] = (bfit["best_body"], bfit["best_tail"])

        ## Write files out to the directory
        util.write_data(data_file + "_ecdf", ec)
        util.write_data(data_file + "_ccdf", cc)

        log_lo = math.log10(min(x))
        log_hi = math.log10(max(x))
        ccpts = np.power(10, util.gen_points(log_lo, log_hi, 2000))
        ecpts = ec[:, 0]

        # Look every model up first so a missing key fails before any
        # model file is written.
        fitted = [(sfx, fit_map[key][0]) for key, sfx in model_files]
        for sfx, dist in fitted:
            dist_ec = np.array([ecpts, dist.cdf(ecpts)]).transpose()
            dist_cc = np.array([ccpts, dist.ccdf(ccpts)]).transpose()
            util.write_data(data_file + "_ecdf." + sfx, dist_ec)
            util.write_data(data_file + "_ccdf." + sfx, dist_cc)

    for k in report_map:
        best = report_map[k]
        print(k + " BODY: " + best[0] + " TAIL: " + best[1])
    return report_map
def lognormal_nrsolver(pts):
    """
    Solve the solve_admin equations for a lognormal fitted to pts using
    opt.root, starting from the logn.Lognormal.fromFit estimate.

    pts: Data points; need not be sorted (the ECDF is built with
         issorted=False)

    Returns [mu, sigma] for the solved parameters.
    Raises logn.LognormalConvergenceError, carrying the solver message and
    the last (mu, sigma) iterate, when the solver reports failure.
    """
    ec = util.ecdf(np.array(pts), issorted=False)
    x = ec[:, 0]
    xrev = util.reverse(x)
    n = float(len(x))
    i = np.arange(len(x), dtype=float)

    l1 = logn.Lognormal.fromFit(pts)
    ivs = [l1.mu(), l1.sigma()]
    ovs = (i, x, xrev, n)

    # root() returns a result object, not the solution itself: unpacking
    # it directly iterates over its field names. The solution is in .x.
    sol = opt.root(solve_admin, ivs, args=ovs)
    mu, sigma = sol.x[0], sol.x[1]
    if not sol.success:
        raise logn.LognormalConvergenceError(sol.message, (mu, sigma))
    return [mu, sigma]
def lognormal_nrsolver(pts):
    """
    Fit a lognormal to pts by handing the solve_admin equations to
    opt.root, seeded with the mu/sigma of logn.Lognormal.fromFit(pts).

    pts: Data points; need not be sorted (the ECDF is built with
         issorted=False)

    Returns [mu, sigma] for the solved parameters.
    Raises logn.LognormalConvergenceError with the solver message and the
    last (mu, sigma) iterate when the solver reports failure.
    """
    ec = util.ecdf(np.array(pts), issorted=False)
    x = ec[:, 0]
    xrev = util.reverse(x)
    n = float(len(x))
    i = np.arange(len(x), dtype=float)

    seed = logn.Lognormal.fromFit(pts)
    ivs = [seed.mu(), seed.sigma()]
    ovs = (i, x, xrev, n)

    # The return value of root() is a result object; iterating it yields
    # field names, so the parameters have to be read from its .x array.
    result = opt.root(solve_admin, ivs, args=ovs)
    mu, sigma = result.x[0], result.x[1]
    if not result.success:
        raise logn.LognormalConvergenceError(result.message, (mu, sigma))
    return [mu, sigma]
def paroptfit(x1, hi, lo, n, fit1, mt1):
    """
    Run optfit over a grid of candidate xmax values in a process pool and
    return the best entries according to two different metrics.

    x1: Sorted array of points
    hi: Max. xmax to estimate to
    lo: Min. xmax to estimate to
    n: #. of points from hi to lo for the estimation
    fit1: Type of fit - mlefit or mmefit
    mt1: Mirror xform - True or False

    Returns a dict with keys "fit" and "diff": the results best_fit picks
    when comparing on tuple index 2 and on tuple index 3 respectively.
    """
    global x, fit, mt, c, max_of_x

    ## Initialize globals prior to the parallel run ##
    ## They are set before the pool exists; presumably the workers pick
    ## them up when forked - confirm before running on a non-fork platform.
    x = x1
    max_of_x = max(x)
    fit = fit1
    mt = mt1
    c = util.ecdf(x)

    # Candidate grid, always including the sample maximum itself.
    l_xmax_pts = util.gen_points(lo, hi, n).tolist()
    l_xmax_pts.append(max_of_x)
    l_xmax_pts.sort()
    xmax_pts = np.array(l_xmax_pts)

    proc_pool = Pool(mp.cpu_count())
    try:
        result = proc_pool.map(optfit, xmax_pts)
    finally:
        # Always release the worker processes, even when a fit raises.
        proc_pool.terminate()
        proc_pool.join()

    # Positions of the compared metrics in each result tuple.
    # The K-S comparison that used to be made here has been removed.
    FIT_COMP_IDX = 2
    DIFF_COMP_IDX = 3
    return {
        "fit": best_fit(result, FIT_COMP_IDX),
        "diff": best_fit(result, DIFF_COMP_IDX)
    }
def paroptfit(x1, hi, lo, n, fit1, mt1):
    """
    Evaluate optfit for every candidate xmax in parallel and pick the best
    result under each of two metrics.

    x1: Sorted array of points
    hi: Max. xmax to estimate to
    lo: Min. xmax to estimate to
    n: #. of points from hi to lo for the estimation
    fit1: Type of fit - mlefit or mmefit
    mt1: Mirror xform - True or False

    Returns {"fit": ..., "diff": ...}: what best_fit selects from the
    results when comparing on tuple index 2 and on tuple index 3.
    """
    global x, fit, mt, c, max_of_x

    ## Initialize globals prior to the parallel run ##
    ## This must stay ahead of Pool(); presumably the workers read these
    ## after being forked - confirm before running on a non-fork platform.
    x = x1
    max_of_x = max(x)
    fit = fit1
    mt = mt1
    c = util.ecdf(x)

    # Candidate grid plus the sample maximum, in ascending order.
    candidates = util.gen_points(lo, hi, n).tolist()
    candidates.append(max_of_x)
    candidates.sort()
    xmax_pts = np.array(candidates)

    ncpus = mp.cpu_count()
    proc_pool = Pool(ncpus)
    try:
        result = proc_pool.map(optfit, xmax_pts)
    finally:
        # Do not leave worker processes behind, whether or not map() failed.
        proc_pool.terminate()
        proc_pool.join()

    # Positions of the compared metrics in each result tuple.
    # The K-S comparison that used to be made here has been removed.
    FIT_COMP_IDX = 2
    DIFF_COMP_IDX = 3
    return {"fit": best_fit(result, FIT_COMP_IDX),
            "diff": best_fit(result, DIFF_COMP_IDX)}
def solver_main(data_pts):
    """
    Drop the non-positive values from data_pts, run opt.fsolve on
    logn_solver (with jacobian as its derivative), starting from the
    Lognormal.mmefit estimate, and print the resulting u and s.

    data_pts: Data points (any sequence np.array accepts)

    Nothing is returned; a non-converged run is reported on stdout only.
    """
    ## Scrub the data points and remove 0 values
    ## Ideally we should get an np array
    ## But if not --
    dp = np.array(data_pts)
    dp_i = dp[dp > 0.0]

    cdf_i = ecdf(dp_i, issorted=False)
    x_i = cdf_i[:, 0]
    a_i = cdf_i[:, 1]

    i_u, i_s = Lognormal.mmefit(dp_i)

    # Fixed inputs first, then slots that start out empty (None).
    fixed_names = ['x', 'a', 'x2', 'a2', 'n', 'k']
    empty_names = ['F', 'dF_du', 'dF_ds', 'ddF_dsdu', 'd2F_du2', 'd2F_ds2']
    svals = namedtuple('svals', fixed_names + empty_names)

    fields = dict.fromkeys(empty_names)
    fields.update(x=x_i,
                  a=a_i,
                  x2=np.power(x_i, 2.0),
                  a2=np.power(a_i, 2.0),
                  n=float(len(x_i)),
                  k=k(a_i))
    cvals = svals(**fields)

    # NOTE(review): cvals is handed over as-is (the original "(cvals)" is
    # not a 1-tuple). A namedtuple is itself a tuple, so fsolve will spread
    # its 12 fields as separate arguments - confirm that logn_solver and
    # jacobian expect that rather than a single svals object.
    (fval, infodict, ier, msg) = opt.fsolve(logn_solver, [i_u, i_s],
                                            cvals, jacobian, 1, 0)

    if ier != 1:
        print("Failed to converge:  " + str(msg))

    print("u:  " + str(fval[0]))
    print("s:  " + str(fval[1]))
def solver(pts):
    """
    Alternate solve_beta / solve_c / solve_d until the three derivatives
    dqdb, dqdc and dqdd are all within tolerance of zero.

    pts: Array of points; it is sorted in place

    Returns the tuple (beta, c, d).
    Raises RuntimeError, with the last parameter values in the message,
    when the tolerances are not met within the iteration cap.
    """
    npts = np.array(pts)
    pts.sort()
    cdf = ecdf(pts)
    x_i = cdf[:, 0]
    a_i = cdf[:, 1]

    # Starting values.
    b = 1.0
    c = np.median(npts)
    d = c / float(npts.max())

    N = 2000      # iteration cap
    tol = 1e-8
    tolc = tolb = told = float('inf')

    n = 0
    while n < N and (tolc > tol or told > tol or tolb > tol):
        b = solve_beta(x_i, a_i, d)
        c = solve_c(x_i, a_i, b, d)
        d = solve_d(x_i, a_i, b, c)

        tolc = abs(dqdc(x_i, a_i, b, c, d))
        told = abs(dqdd(x_i, a_i, b, c, d))
        tolb = abs(dqdb(x_i, a_i, b, c, d))

        print("n:  " + str(n))
        print(" ".join(["beta: ", str(b), " c: ", str(c), "d: ", str(d)]))
        print(" ".join(["tolc: ", str(tolc), " told: ", str(told),
                        "tolb: ", str(tolb)]))

        # Without this the cap above can never trigger and a
        # non-converging input loops forever.
        n += 1

    # Judge by the tolerances, not by n, so that a run converging on the
    # very last allowed iteration is still accepted.
    if tolc > tol or told > tol or tolb > tol:
        params = {"beta": b, "c": c, "d": d}
        raise RuntimeError("Cannot converge: " + str(params))

    return (b, c, d)
def tlladmin_solver(pts):
    """
    Fit a ModLav to pts by solving the tll_admin equations with opt.fsolve.

    pts: Array of points, passed to util.ecdf with its default settings

    Starting values are beta = 1, c = median of the points and
    d = c / max of the points.

    Returns ml.ModLav(b, c, d) for the solved parameters.
    Raises ml.ModLavConvergenceError, carrying the solver message and the
    last (b, c, d) iterate, when fsolve does not report success.
    """
    ec = util.ecdf(pts)
    # Column 0 holds the sample values; column 1 is the cumulative
    # probability, whose median/max would make meaningless start values.
    # NOTE(review): this used to read column 1, unlike every other ecdf
    # consumer in this code - confirm against tll_admin's expectations.
    x = ec[:, 0]
    xrev = util.reverse(x)
    i = np.arange(len(x), dtype=float)
    n = float(len(x))

    b0 = 1.0
    c0 = float(np.median(x))
    d0 = c0 / float(x.max())
    ivs = [b0, c0, d0]
    ovs = (i, x, xrev, n)

    # Positional tail is (fprime=None, full_output=1, col_deriv=0).
    (fvals, infodict, ier, mesg) = opt.fsolve(tll_admin, ivs, ovs, None, 1, 0)
    f_b = fvals[0]
    f_c = fvals[1]
    f_d = fvals[2]
    if ier != 1:
        raise ml.ModLavConvergenceError(mesg, (f_b, f_c, f_d))
    return ml.ModLav(f_b, f_c, f_d)
def tlladmin_solver(pts):
    """
    Solve the tll_admin equations for a ModLav fitted to pts.

    pts: Array of points, passed to util.ecdf with its default settings

    opt.fsolve is started from beta = 1, c = median of the points and
    d = c / max of the points.

    Returns ml.ModLav(b, c, d) for the solved parameters.
    Raises ml.ModLavConvergenceError with the solver message and the last
    (b, c, d) iterate when fsolve's status flag is not 1.
    """
    ec = util.ecdf(pts)
    # Use the sample values (column 0), not the cumulative probabilities
    # (column 1): the start values below are a median and a maximum of
    # the data.
    # NOTE(review): this used to read column 1, unlike every other ecdf
    # consumer in this code - confirm against tll_admin's expectations.
    x = ec[:, 0]
    xrev = util.reverse(x)
    i = np.arange(len(x), dtype=float)
    n = float(len(x))

    start_b = 1.0
    start_c = float(np.median(x))
    start_d = start_c / float(x.max())
    ivs = [start_b, start_c, start_d]
    ovs = (i, x, xrev, n)

    # Positional tail is (fprime=None, full_output=1, col_deriv=0).
    (fvals, infodict, ier, mesg) = opt.fsolve(tll_admin, ivs, ovs, None, 1, 0)
    f_b, f_c, f_d = fvals[0], fvals[1], fvals[2]
    if ier != 1:
        raise ml.ModLavConvergenceError(mesg, (f_b, f_c, f_d))
    return ml.ModLav(f_b, f_c, f_d)
def gen_cdf_ccdf():
    """
    For every data set listed in files_and_analysis.db, write the CDF and
    CCDF of its stored distribution(s) next to the data file.

    Output files are named <data file>_ecdf.<ext> and <data file>_ccdf.<ext>,
    where <ext> depends on the distribution. The CDF is evaluated at the
    ECDF points of the data, the CCDF at 2000 points generated in log10
    space between the smallest and largest value.
    """
    # dist_list = ["LOGLOGISTIC", "LOGN", "TPARETO", "TRUNCLL"]
    dist_list = ["LOGN"]
    fext_map = {"LOGLOGISTIC": "ll", "LOGN": "lgn", "TPARETO": "tp", "TRUNCLL": "tll"}

    r = RunSQL("files_and_analysis.db")
    dsets = r.sqlq("select unique_id, filename from datasets")

    for dpair in dsets:
        dset_id = dpair[0]
        dset_file = dpair[1]
        print("Processing data set -  " + str(dset_id))

        x = read_data(dset_file)
        x.sort()
        ec = ecdf(x, issorted=True)
        pts = np.power(10, gen_points(math.log10(min(x)), math.log10(max(x)), 2000))

        op_dir = os.path.dirname(dset_file)
        op_base = os.path.basename(dset_file)

        for distname in dist_list:
            print("\t Getting distribution -  " + str(distname))
            dist = get_dist(dset_id, distname)
            dec = dist.cdf(ec[:, 0])
            dcc = dist.ccdf(pts)
            fdec = np.array([ec[:, 0], dec]).transpose()
            fdcc = np.array([pts, dcc]).transpose()

            # os.path.join keeps a bare file name relative; gluing with "/"
            # would have sent it to the filesystem root.
            ext = fext_map[distname]
            op_ec_file = os.path.join(op_dir, op_base + "_ecdf" + "." + ext)
            op_cc_file = os.path.join(op_dir, op_base + "_ccdf" + "." + ext)

            print("\t Writing CDF -  " + op_ec_file)
            write_data(op_ec_file, fdec)
            print("\t Writing CCF -  " + op_cc_file)
            write_data(op_cc_file, fdcc)
def fitlogn(dataf):
    """
    Fit a lognormal to the data in dataf with both estimators and print
    the one with the smaller (or equal) fit metric.

    dataf: Name of the data file

    Prints the file name, the chosen estimator (MME or MLE), the fitted
    distribution and its FIT and K-S metrics. Returns nothing.
    """
    x = np.array(read_data(dataf))
    x.sort()

    mme_fit = Lognormal.fromFit(x)
    mle_fit = Lognormal.fromFit(x, mmefit=False)

    ec = ecdf(x)
    cc = ccdf(x)  # computed as before; not used below

    mme_q = mme_fit.fitmetric(cdf=ec)
    mle_q = mle_fit.fitmetric(cdf=ec)

    print("File: " + dataf)

    # Ties go to MME.
    if mme_q <= mle_q:
        label, chosen, q = "MME", mme_fit, mme_q
    else:
        label, chosen, q = "MLE", mle_fit, mle_q

    print("Type: " + label)
    print("Lognormal: " + str(chosen))
    print("FIT:  " + str(q))
    # K-S is only evaluated for the fit that is reported.
    print("K-S:  " + str(chosen.ksmetric(cdf=ec)))
def fitlogn(dataf):
    """
    Read dataf, fit a lognormal with the MME and the MLE estimator, and
    report whichever has the lower fit metric (MME on a tie).

    dataf: Name of the data file

    Output goes to stdout: file name, estimator type, the distribution,
    its FIT metric and its K-S metric. Returns nothing.
    """
    x = np.array(read_data(dataf))
    x.sort()

    l1 = Lognormal.fromFit(x)
    l2 = Lognormal.fromFit(x, mmefit=False)

    ec = ecdf(x)
    cc = ccdf(x)  # computed as before; not used below

    q1 = l1.fitmetric(cdf=ec)
    q2 = l2.fitmetric(cdf=ec)

    use_mme = q1 <= q2
    best = l1 if use_mme else l2
    best_q = q1 if use_mme else q2

    lines = ["File: " + dataf,
             "Type: " + ("MME" if use_mme else "MLE"),
             "Lognormal: " + str(best),
             "FIT:  " + str(best_q)]
    for line in lines:
        print(line)
    # K-S is evaluated last and only for the reported fit.
    print("K-S:  " + str(best.ksmetric(cdf=ec)))
def pardd(fname):
    """
    Fit ModLav distributions to the data in fname in several ways, write
    the empirical and fitted CDF/CCDF curves, and write a text report.

    fname: Name of the data file; every output file name is derived from
           it by appending a suffix

    Files written: <fname>_ccdf and <fname>_ecdf (empirical curves),
    <fname>_<label>_ccdf and <fname>_<label>_ecdf for every fit, and
    <fname>_metric (basic statistics, per-fit metrics and the
    recommendation returned by best_fit). Returns nothing.
    """
    rule = "--------------------------------------------------------------------------"

    inpf = fname
    x1 = util.read_data(inpf)
    x1.sort()
    xmx = x1.max()

    # xmax search: 500 candidates between 0.1x and 10x the sample maximum.
    npts = 500
    lo = 0.1 * xmx
    hi = 10 * xmx

    cc = util.ccdf(x1)
    ec = util.ecdf(x1)
    util.write_data(inpf + "_ccdf", cc)
    util.write_data(inpf + "_ecdf", ec)

    def with_metrics(model):
        # Result tuple for a fit made at the sample maximum.
        return (model, xmx, model.fitmetric(cdf=ec), model.ksmetric(cdf=ec),
                model.difference(cdf=ec))

    # Plain fits first; their metrics are computed after all four exist.
    mle = ml.ModLav.fromFit(x1, fit="mlefit")
    mme = ml.ModLav.fromFit(x1, fit="mmefit")
    mle_mt = ml.ModLav.fromFit(x1, fit="mlefit", mt=True)
    mme_mt = ml.ModLav.fromFit(x1, fit="mmefit", mt=True)
    fitlist = [("MLE", with_metrics(mle)),
               ("MME", with_metrics(mme)),
               ("MLE-MT", with_metrics(mle_mt)),
               ("MME-MT", with_metrics(mme_mt))]

    # Fits with an optimized xmax.
    opt_runs = [
        ("MLE-OPT", parmlfit.paroptfit(x1, hi, lo, npts, "mlefit", False)),
        ("MLE-OPT-MT", parmlfit.paroptfit(x1, hi, lo, npts, "mlefit", True)),
        ("MME-OPT", parmlfit.paroptfit(x1, hi, lo, npts, "mmefit", False)),
        ("MME-OPT-MT", parmlfit.paroptfit(x1, hi, lo, npts, "mmefit", True)),
    ]
    # NOTE(review): this needs "fit", "ks" and "diff" in every paroptfit
    # result - confirm the parmlfit version in use still returns "ks".
    for prefix, key in (("", "fit"), ("KS-", "ks"), ("D-", "diff")):
        for lbl, res in opt_runs:
            fitlist.append((prefix + lbl, res[key]))

    size, amin, amax, mu, sigma = len(x1), x1.min(), xmx, x1.mean(), x1.std()
    cv = sigma / mu
    q = ms.mquantiles(x1, [0.1, 0.5, 0.9])

    op_str = []
    op_str.append("BASIC STATISTICS")
    op_str.append(rule)
    op_str.append("Size: " + str(size))
    op_str.append("Range: " + str(amin) + " - " + str(amax))
    op_str.append("Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) + " 90% - " + str(q[2]))
    op_str.append("Mean: " + str(mu))
    op_str.append("Sigma: " + str(sigma))
    op_str.append("CV: " + str(cv))
    op_str.append("\n")

    best_fit_map = dict()
    for f in fitlist:
        lbl = f[0]
        m = f[1][0]
        mx = f[1][1]
        fitm = f[1][2]
        ksm = f[1][3]
        diffm = f[1][4]
        best_fit_map[lbl] = (m, mx, fitm, ksm, diffm)

        op_str.append(lbl)
        op_str.append(rule)
        op_str.append("Modlav params: " + str(m))
        op_str.append("Xmax: " + str(mx))
        op_str.append("Xmax/Max: " + str(mx / xmx))
        op_str.append("FIT Metric: " + str(fitm))
        op_str.append("K-S Metric: " + str(ksm))
        op_str.append("DIFF Metric: " + str(diffm))
        op_str.append(rule)
        op_str.append("\n")

        # Fitted curves: CCDF on 2000 points generated in log10 space up
        # to this fit's xmax, CDF at the ECDF points.
        fname_pfx = inpf + "_" + lbl.lower().replace("-", "_")
        lx = util.gen_points(math.log10(x1.min()), math.log10(mx), 2000)
        ex = np.power(10, lx)
        fmcc = np.array([ex, m.ccdf(ex)]).transpose()
        fmec = np.array([ec[:, 0], m.cdf(ec[:, 0])]).transpose()
        util.write_data(fname_pfx + "_ccdf", fmcc)
        util.write_data(fname_pfx + "_ecdf", fmec)

    recom = best_fit(best_fit_map, xmx)

    op1_str = [s1 + "\n" for s1 in op_str]
    op1_str.append("RECOMMENDATIONS: " + str(recom) + "\n")

    # "with" closes the report even when the write fails.
    with open(inpf + "_metric", "w+") as txf:
        txf.writelines(op1_str)
def pardd(fname):
    """
    Fit ModLav distributions to the data in fname in several ways, write
    the empirical and fitted CDF/CCDF curves, and write a text report.

    fname: Name of the data file; every output file name is derived from
           it by appending a suffix

    Files written: <fname>_ccdf and <fname>_ecdf (empirical curves),
    <fname>_<label>_ccdf and <fname>_<label>_ecdf for every fit, and
    <fname>_metric (basic statistics, per-fit metrics and the
    recommendation returned by best_fit). Returns nothing.
    """
    rule = "--------------------------------------------------------------------------"

    inpf = fname
    x1 = util.read_data(inpf)
    x1.sort()
    xmx = x1.max()

    # xmax search: 500 candidates between 0.1x and 10x the sample maximum.
    npts = 500
    lo = 0.1 * xmx
    hi = 10 * xmx

    cc = util.ccdf(x1)
    ec = util.ecdf(x1)
    util.write_data(inpf + "_ccdf", cc)
    util.write_data(inpf + "_ecdf", ec)

    def with_metrics(model):
        # Result tuple for a fit made at the sample maximum.
        return (model, xmx, model.fitmetric(cdf=ec), model.ksmetric(cdf=ec),
                model.difference(cdf=ec))

    # Plain fits first; their metrics are computed after all four exist.
    mle = ml.ModLav.fromFit(x1, fit="mlefit")
    mme = ml.ModLav.fromFit(x1, fit="mmefit")
    mle_mt = ml.ModLav.fromFit(x1, fit="mlefit", mt=True)
    mme_mt = ml.ModLav.fromFit(x1, fit="mmefit", mt=True)
    fitlist = [("MLE", with_metrics(mle)),
               ("MME", with_metrics(mme)),
               ("MLE-MT", with_metrics(mle_mt)),
               ("MME-MT", with_metrics(mme_mt))]

    # Fits with an optimized xmax.
    opt_runs = [
        ("MLE-OPT", parmlfit.paroptfit(x1, hi, lo, npts, "mlefit", False)),
        ("MLE-OPT-MT", parmlfit.paroptfit(x1, hi, lo, npts, "mlefit", True)),
        ("MME-OPT", parmlfit.paroptfit(x1, hi, lo, npts, "mmefit", False)),
        ("MME-OPT-MT", parmlfit.paroptfit(x1, hi, lo, npts, "mmefit", True)),
    ]
    # NOTE(review): this needs "fit", "ks" and "diff" in every paroptfit
    # result - confirm the parmlfit version in use still returns "ks".
    for prefix, key in (("", "fit"), ("KS-", "ks"), ("D-", "diff")):
        for lbl, res in opt_runs:
            fitlist.append((prefix + lbl, res[key]))

    size, amin, amax, mu, sigma = len(x1), x1.min(), xmx, x1.mean(), x1.std()
    cv = sigma / mu
    q = ms.mquantiles(x1, [0.1, 0.5, 0.9])

    op_str = [
        "BASIC STATISTICS",
        rule,
        "Size: " + str(size),
        "Range: " + str(amin) + " - " + str(amax),
        "Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) + " 90% - " + str(q[2]),
        "Mean: " + str(mu),
        "Sigma: " + str(sigma),
        "CV: " + str(cv),
        "\n",
    ]

    best_fit_map = dict()
    for f in fitlist:
        lbl = f[0]
        m = f[1][0]
        mx = f[1][1]
        fitm = f[1][2]
        ksm = f[1][3]
        diffm = f[1][4]
        best_fit_map[lbl] = (m, mx, fitm, ksm, diffm)

        op_str.extend([
            lbl,
            rule,
            "Modlav params: " + str(m),
            "Xmax: " + str(mx),
            "Xmax/Max: " + str(mx / xmx),
            "FIT Metric: " + str(fitm),
            "K-S Metric: " + str(ksm),
            "DIFF Metric: " + str(diffm),
            rule,
            "\n",
        ])

        # Fitted curves: CCDF on 2000 points generated in log10 space up
        # to this fit's xmax, CDF at the ECDF points.
        fname_pfx = inpf + "_" + lbl.lower().replace("-", "_")
        lx = util.gen_points(math.log10(x1.min()), math.log10(mx), 2000)
        ex = np.power(10, lx)
        fmcc = np.array([ex, m.ccdf(ex)]).transpose()
        fmec = np.array([ec[:, 0], m.cdf(ec[:, 0])]).transpose()
        util.write_data(fname_pfx + "_ccdf", fmcc)
        util.write_data(fname_pfx + "_ecdf", fmec)

    recom = best_fit(best_fit_map, xmx)

    op1_str = [s1 + "\n" for s1 in op_str]
    op1_str.append("RECOMMENDATIONS: " + str(recom) + "\n")

    # "with" closes the report even when the write fails.
    with open(inpf + "_metric", "w+") as txf:
        txf.writelines(op1_str)
def main(dt):
    """
    Pull one kind of measurement out of azure.db, per user category and
    for all users, fit ModLav distributions to each result, and write the
    data, the empirical and fitted CDF/CCDF curves and a metrics report.

    dt: Kind of data to process - one of slen, inb, outb, to_inb,
        to_outb, tslen, tinb, toutb

    Files written per category <cat>: <cat>_<dt> (the data), <cat>_ccdf,
    <cat>_ecdf, <cat>_<label>_ccdf / <cat>_<label>_ecdf for every fit and
    <cat>_metric. Returns nothing.
    Raises NotImplementedError for an unknown dt.
    """
    rule = "----------------------------------------------------------------------"

    def per_category(head, tail):
        # One query per user category plus "all": the category filter is
        # the only part that differs between them.
        queries = {}
        for cat in ("catm", "catd", "cath"):
            queries[cat] = head + "ucat_term='" + cat + "\n' and " + tail
        queries["all"] = head + tail
        return queries

    slen = per_category("select seslen from data_log where ",
                        "seslen>0 and seslen<18000 order by seslen")
    inb = per_category("select bin from data_log where ",
                       "bin>0 and seslen>0 and seslen<18000 order by bin")
    outb = per_category("select bout from data_log where ",
                        "bout>0 and seslen>0 and seslen<18000 order by bout")
    to_inb = {"all": "select bin from data_log where seslen >= 18000 and bin>0 order by bin"}
    to_outb = {"all": "select bout from data_log where seslen >= 18000 and bout>0 order by bout"}
    tslen = per_category("select sum(seslen) t from data_log where ",
                         "seslen>0 and seslen<18000 group by user order by t")
    tinb = per_category("select sum(bin) t from data_log where ",
                        "bin > 0 and seslen>0 and seslen<18000 group by user order by t")
    toutb = per_category("select sum(bout) t from data_log where ",
                         "bout>0 and seslen>0 and seslen<18000 group by user order by t")

    dtmap = {"slen": slen, "inb": inb, "outb": outb, "to_inb": to_inb,
             "to_outb": to_outb, "tslen": tslen, "tinb": tinb, "toutb": toutb}

    if dt not in dtmap:
        raise NotImplementedError("Type - " + dt + " - is not implemented")

    qmap = dtmap[dt]
    s = sql.RunSQL("azure.db")

    for typ, q in qmap.items():
        x = np.array(s.sqlq(q))
        x.sort()  # just making sure

        cc = util.ccdf(x)
        ec = util.ecdf(x)
        util.write_data(typ + "_" + dt, x)
        util.write_data(typ + "_ccdf", cc)
        util.write_data(typ + "_ecdf", ec)

        mle = ml.ModLav.fromFit(x, fit="mlefit")
        mme = ml.ModLav.fromFit(x, fit="mmefit")
        mle_mt = ml.ModLav.fromFit(x, fit="mlefit", mt=True)
        mme_mt = ml.ModLav.fromFit(x, fit="mmefit", mt=True)

        # Fits with an optimized xmax.
        omle = ml.optfit(x, 0.1 * x.max(), 10 * x.max(), 500, mlefit=True, mt=False)
        omle_mt = ml.optfit(x, 0.1 * x.max(), 10 * x.max(), 500, mlefit=True, mt=True)
        omme = ml.optfit(x, 0.1 * x.max(), 10 * x.max(), 500, mlefit=False, mt=False)
        omme_mt = ml.optfit(x, 0.1 * x.max(), 10 * x.max(), 500, mlefit=False, mt=True)

        # (label, model, xmax) for every fit, in report order.
        yyy = [("MLE", mle, x.max()),
               ("MME", mme, x.max()),
               ("MLE-MT", mle_mt, x.max()),
               ("MME-MT", mme_mt, x.max()),
               ("MLE-OPT", omle["fit"][0], omle["fit"][1]),
               ("MLE-OPT-MT", omle_mt["fit"][0], omle_mt["fit"][1]),
               ("MME-OPT", omme["fit"][0], omme["fit"][1]),
               ("MME-OPT-MT", omme_mt["fit"][0], omme_mt["fit"][1])]

        n, amin, amax, mu, sigma = len(x), x.min(), x.max(), x.mean(), x.std()
        cv = sigma / mu
        q = ms.mquantiles(x, [0.1, 0.5, 0.9])

        op_str = []
        op_str.append("BASIC STATISTICS")
        op_str.append(rule)
        op_str.append("Size: " + str(n))
        op_str.append("Range: " + str(amin) + " - " + str(amax))
        op_str.append("Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) + " 90% - " + str(q[2]))
        op_str.append("Mean: " + str(mu))
        op_str.append("Sigma: " + str(sigma))
        op_str.append("CV: " + str(cv))
        op_str.append("\n")

        for lbl, m, xmx in yyy:
            op_str.append(lbl)
            op_str.append(rule)
            op_str.append("Modlav params: " + str(m))
            op_str.append("Xmax: " + str(xmx))
            op_str.append("Xmax/Max: " + str(xmx / amax))
            op_str.append("FIT metric: " + str(m.fitmetric(points=x)))
            op_str.append("K-S metric: " + str(m.ksmetric(points=x)))
            op_str.append(rule)
            op_str.append("\n")

            # Fitted curves: CCDF on 2000 points generated in log10 space
            # up to this fit's xmax, CDF at the ECDF points.
            fname_pfx = typ + "_" + lbl.lower().replace("-", "_")
            lx = util.gen_points(math.log10(x.min()), math.log10(xmx), 2000)
            ex = np.power(10, lx)
            fmcc = np.array([ex, m.ccdf(ex)]).transpose()
            fmec = np.array([ec[:, 0], m.cdf(ec[:, 0])]).transpose()
            util.write_data(fname_pfx + "_ccdf", fmcc)
            util.write_data(fname_pfx + "_ecdf", fmec)

        op1_str = [s1 + "\n" for s1 in op_str]

        # "with" closes the report even when the write fails.
        with open(typ + "_metric", "w+") as txf:
            txf.writelines(op1_str)