def process_data():
    """Run ``process_single_day`` for days 13-20 and store each result.

    The value returned for a day is written with ``write_data`` to
    ``fcount/day_<day>`` below the hard-coded data directory, and its length
    is reported on stdout.
    """
    base_dir = "/Users/chimpu/research/ml_data/unc/"
    for day in range(13, 21):
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print("%s %s" % ("Processing day - ", day))
        counts = process_single_day(day)
        target = base_dir + "/fcount/day_" + str(day)
        write_data(target, counts)
        print("%s %s" % ("Done writing records - ", len(counts)))
def process_data():
    """Run ``process_single_day`` for days 13-20 and store each result.

    The value returned for a day is written with ``write_data`` to
    ``fcount/day_<day>`` below the hard-coded data directory, and its length
    is reported on stdout.
    """
    base_dir = "/Users/chimpu/research/ml_data/unc/"
    for day in range(13, 21):
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print("%s %s" % ("Processing day - ", day))
        counts = process_single_day(day)
        target = base_dir + "/fcount/day_" + str(day)
        write_data(target, counts)
        print("%s %s" % ("Done writing records - ", len(counts)))
def process_data():
    """Store both per-day outputs of ``process_single_day`` for days 13-20.

    ``process_single_day(day)`` is unpacked into a pair.  The first element
    is written to ``inb/day_<day>`` and the second to ``outb/day_<day>``
    below the hard-coded data directory; the length of the first element is
    reported on stdout.
    """
    base_dir = "/Users/chimpu/research/ml_data/unc/"
    for day in range(13, 21):
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print("%s %s" % ("Processing day - ", day))
        first, second = process_single_day(day)
        inb_path = base_dir + "/inb/day_" + str(day)
        outb_path = base_dir + "/outb/day_" + str(day)
        write_data(inb_path, first)
        write_data(outb_path, second)
        print("%s %s" % ("Done writing records - ", len(first)))
def process_data():
    """Store both per-day outputs of ``process_single_day`` for days 13-20.

    ``process_single_day(day)`` is unpacked into a pair.  The first element
    is written to ``inb/day_<day>`` and the second to ``outb/day_<day>``
    below the hard-coded data directory; the length of the first element is
    reported on stdout.
    """
    base_dir = "/Users/chimpu/research/ml_data/unc/"
    for day in range(13, 21):
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print("%s %s" % ("Processing day - ", day))
        first, second = process_single_day(day)
        inb_path = base_dir + "/inb/day_" + str(day)
        outb_path = base_dir + "/outb/day_" + str(day)
        write_data(inb_path, first)
        write_data(outb_path, second)
        print("%s %s" % ("Done writing records - ", len(first)))
def create_ses_inter():
    """Write the positive gaps between consecutive ``ts`` values per day.

    For every day returned by ``get_day_list()`` the ``ts`` column of the
    ``flows`` table (rows with ``term > 1``, ordered by ``ts``) is read from
    ``unc.db``.  Successive differences are taken, only strictly positive
    ones are kept, and the result is written to
    ``<get_output_dir("interses")>/day_<day>``.
    """
    days = get_day_list()
    db = RunSQL("unc.db")
    for day in days:
        query = ("select ts from flows where day = " + str(day)
                 + " and term > 1 order by ts")
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print("%s %s" % ("Running query for day - ", str(day)))
        stamps = np.array(db.sqlq(query))
        count = len(stamps)
        # Element i is stamps[i + 1] - stamps[i].
        gaps = stamps[1:count] - stamps[:count - 1]
        positive_gaps = gaps[gaps > 0]
        out_path = get_output_dir("interses") + "/day_" + str(day)
        print("%s %s" % ("Writing to file - ", out_path))
        write_data(out_path, positive_gaps)
def main():
    """Dump the ``bytes`` column of ``flows`` to one file per day (13-20).

    Each day's query is run against ``unc.db`` and the returned rows are
    handed to ``util.write_data`` under the relative name ``flow_<day>``.
    """
    base_query = "select bytes from flows where day = "
    db = sql.RunSQL("unc.db")
    for day in range(13, 21):
        out_name = "flow_" + str(day)
        query = base_query + str(day)
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print("%s %s" % ("Running query --> SQL: ", query))
        rows = db.sqlq(query)
        print("%s %s" % ("Writing data --> file: ", out_name))
        util.write_data(out_name, rows)
        # Drop the reference before the next query runs (presumably to keep
        # only one day's rows alive at a time).
        del rows
def main():
    """Dump the ``bytes`` column of ``flows`` to one file per day (13-20).

    Each day's query is run against ``unc.db`` and the returned rows are
    handed to ``util.write_data`` under the relative name ``flow_<day>``.
    """
    base_query = "select bytes from flows where day = "
    db = sql.RunSQL("unc.db")
    for day in range(13, 21):
        out_name = "flow_" + str(day)
        query = base_query + str(day)
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print("%s %s" % ("Running query --> SQL: ", query))
        rows = db.sqlq(query)
        print("%s %s" % ("Writing data --> file: ", out_name))
        util.write_data(out_name, rows)
        # Drop the reference before the next query runs (presumably to keep
        # only one day's rows alive at a time).
        del rows
def create_ses_count_inter():
    """Write per-session flow counts and intra-session gaps for each day.

    For every day from ``get_day_list()``:

    * the ``seslen`` table of ``syslog_final.db`` supplies one
      ``(client, ap, start, end)`` row per session;
    * for each session the matching ``ts`` values are read from ``flows`` in
      ``unc.db`` (same day, client and ap, ``start <= ts <= end``, ordered
      by ``ts``);
    * sessions with at least one flow contribute their flow count to the
      count list and their strictly positive successive ``ts`` differences
      to the gap list.

    The gap list is written to ``<get_output_dir("intrases")>/day_<day>``
    and the count list to ``<get_output_dir("fcount")>/day_<day>``.
    """
    dl = get_day_list()
    flows_db = RunSQL("unc.db")
    sessions_db = RunSQL("syslog_final.db")
    for d in dl:
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print("%s %s" % ("Processing day - ", d))
        sessions = sessions_db.sqlq(
            "select client, ap, start, end from seslen where day = " + str(d))

        # Both accumulators are per-day: they restart for every day.
        fcount = []
        sesinter = []
        for client, ap, start_ts, end_ts in sessions:
            query = ("select ts from flows"
                     + " where day = " + str(d)
                     + " and ts >= " + str(start_ts)
                     + " and ts <= " + str(end_ts)
                     + " and client = " + str(client)
                     + " and ap = " + str(ap)
                     + " order by ts")
            rows = flows_db.sqlq(query)
            # Sessions without any flow contribute nothing.
            if rows is None or len(rows) == 0:
                continue
            fcount.append(len(rows))
            stamps = np.array(rows)
            gaps = stamps[1:] - stamps[:-1]
            # Extending with an empty list is a no-op, so no length check.
            sesinter.extend(gaps[gaps > 0].tolist())

        ifname = get_output_dir("intrases") + "/day_" + str(d)
        cfname = get_output_dir("fcount") + "/day_" + str(d)
        print("%s %s" % ("Writing file - ", ifname))
        write_data(ifname, sesinter)
        print("%s %s" % ("Writing file - ", cfname))
        write_data(cfname, fcount)
def create_seslen():
    """Write session length, ``bin`` and ``bout`` columns for each day.

    For every day from ``get_day_list()`` the rows of ``sessions`` in
    ``unc-proc.db`` with positive ``seslen``, ``bin`` and ``bout`` are read
    and appended column-wise to three lists, which are then written to
    ``day_<day>`` under the ``seslen``, ``inb`` and ``outb`` output
    directories.
    """
    days = get_day_list()
    db = RunSQL("unc-proc.db")
    # NOTE(review): these lists are created once and never cleared, so each
    # day's files also contain the rows of all earlier days -- confirm that
    # this accumulation is intended.
    lengths = []
    bytes_in = []
    bytes_out = []
    for day in days:
        query = ("select seslen, bin, bout from sessions where day = "
                 + str(day) + " and seslen > 0 and bin > 0 and bout > 0")
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print("%s %s" % ("Getting data for day - ", day))
        for seslen, b_in, b_out in db.sqlq(query):
            lengths.append(seslen)
            bytes_in.append(b_in)
            bytes_out.append(b_out)

        day_name = "day_" + str(day)
        seslen_path = get_output_dir("seslen") + "/" + day_name
        inb_path = get_output_dir("inb") + "/" + day_name
        outb_path = get_output_dir("outb") + "/" + day_name
        print("Writing slen - " + seslen_path)
        write_data(seslen_path, lengths)
        print("Writing inb - " + inb_path)
        write_data(inb_path, bytes_in)
        print("Writing outb - " + outb_path)
        write_data(outb_path, bytes_out)
def create_seslen():
    """Write session length, ``bin`` and ``bout`` columns for each day.

    For every day from ``get_day_list()`` the rows of ``sessions`` in
    ``unc-proc.db`` with positive ``seslen``, ``bin`` and ``bout`` are read
    and appended column-wise to three lists, which are then written to
    ``day_<day>`` under the ``seslen``, ``inb`` and ``outb`` output
    directories.
    """
    days = get_day_list()
    db = RunSQL("unc-proc.db")
    # NOTE(review): these lists are created once and never cleared, so each
    # day's files also contain the rows of all earlier days -- confirm that
    # this accumulation is intended.
    lengths = []
    bytes_in = []
    bytes_out = []
    for day in days:
        query = ("select seslen, bin, bout from sessions where day = "
                 + str(day) + " and seslen > 0 and bin > 0 and bout > 0")
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print("%s %s" % ("Getting data for day - ", day))
        for seslen, b_in, b_out in db.sqlq(query):
            lengths.append(seslen)
            bytes_in.append(b_in)
            bytes_out.append(b_out)

        day_name = "day_" + str(day)
        seslen_path = get_output_dir("seslen") + "/" + day_name
        inb_path = get_output_dir("inb") + "/" + day_name
        outb_path = get_output_dir("outb") + "/" + day_name
        print("Writing slen - " + seslen_path)
        write_data(seslen_path, lengths)
        print("Writing inb - " + inb_path)
        write_data(inb_path, bytes_in)
        print("Writing outb - " + outb_path)
        write_data(outb_path, bytes_out)
def main():
    """Write the ``acct_util`` results for all users and for each category.

    ``acct_util`` is called once with ``None`` and once per category label.
    Elements 1 and 2 of each result are written to ``tu_g.<suffix>`` and
    ``du_g.<suffix>``; the suffix is ``all`` for ``None`` and the
    whitespace-stripped label otherwise.
    """
    # The labels keep their trailing newline because that is the form passed
    # to acct_util; it is stripped only when building file names.
    for cat in (None, 'catm\n', 'catd\n', 'cath\n'):
        result = acct_util(cat)
        # Identity test: None is a singleton (PEP 8).
        suffix = "all" if cat is None else cat.strip()
        write_data("tu_g." + suffix, result[1])
        write_data("du_g." + suffix, result[2])
def main():
    """Write the ``acct_util`` results for all users and for each category.

    ``acct_util`` is called once with ``None`` and once per category label.
    Elements 1 and 2 of each result are written to ``tu_g.<suffix>`` and
    ``du_g.<suffix>``; the suffix is ``all`` for ``None`` and the
    whitespace-stripped label otherwise.
    """
    # The labels keep their trailing newline because that is the form passed
    # to acct_util; it is stripped only when building file names.
    for cat in (None, "catm\n", "catd\n", "cath\n"):
        result = acct_util(cat)
        # Identity test: None is a singleton (PEP 8).
        suffix = "all" if cat is None else cat.strip()
        write_data("tu_g." + suffix, result[1])
        write_data("du_g." + suffix, result[2])
def gen_cdf_ccdf():
    """Write model CDF/CCDF curves next to every registered data set.

    The ``datasets`` table of ``files_and_analysis.db`` supplies
    ``(unique_id, filename)`` pairs.  Each file is read and sorted, its
    empirical CDF is computed, and for every distribution in ``dist_list``
    two two-column arrays are written beside the data file:

    * ``<file>_ecdf.<ext>`` -- the distribution's CDF evaluated at the first
      column of the empirical CDF;
    * ``<file>_ccdf.<ext>`` -- the distribution's CCDF evaluated at 2000
      points produced by ``gen_points`` between ``log10(min)`` and
      ``log10(max)`` and raised to the power of ten.

    ``math.log10`` raises ``ValueError`` if a data set contains values <= 0.
    """
    # Only LOGN is generated at the moment; fext_map also defines extensions
    # for LOGLOGISTIC, TPARETO and TRUNCLL.
    dist_list = ["LOGN"]
    fext_map = {"LOGLOGISTIC": "ll", "LOGN": "lgn",
                "TPARETO": "tp", "TRUNCLL": "tll"}

    r = RunSQL("files_and_analysis.db")
    dsets = r.sqlq("select unique_id, filename from datasets")
    for dset_id, dset_file in dsets:
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print("%s %s" % ("Processing data set - ", dset_id))
        x = read_data(dset_file)
        x.sort()
        ec = ecdf(x, issorted=True)
        pts = np.power(
            10, gen_points(math.log10(min(x)), math.log10(max(x)), 2000))

        op_dir = os.path.dirname(dset_file)
        op_base = os.path.basename(dset_file)
        for distname in dist_list:
            print("%s %s" % ("\t Getting distribution - ", distname))
            dist = get_dist(dset_id, distname)
            dec = dist.cdf(ec[:, 0])
            dcc = dist.ccdf(pts)
            fdec = np.array([ec[:, 0], dec]).transpose()
            fdcc = np.array([pts, dcc]).transpose()

            # os.path.join instead of `op_dir + "/" + ...`: for a file name
            # without a directory part op_dir is "", and plain concatenation
            # would point at the filesystem root ("/name_ecdf.lgn").
            ext = fext_map[distname]
            op_ec_file = os.path.join(op_dir, op_base + "_ecdf" + "." + ext)
            op_cc_file = os.path.join(op_dir, op_base + "_ccdf" + "." + ext)
            print("%s %s" % ("\t Writing CDF - ", op_ec_file))
            write_data(op_ec_file, fdec)
            print("%s %s" % ("\t Writing CCF - ", op_cc_file))
            write_data(op_cc_file, fdcc)
def main():
    """Write ECDFs of the two ``acct_util`` series and print their stats.

    ``acct_util`` is called with ``None`` and with each category label.  For
    every call, elements 1 and 2 of the result (reported below as "Time
    utilization" and "Data utilization") are sorted in place, turned into
    ECDFs with ``util.ecdf(..., zdisp=True)`` and written to
    ``tu_<suffix>.dat`` / ``du_<suffix>.dat``, where the suffix is ``all``
    for ``None`` and the stripped label otherwise.  For the ``None`` call
    the two unsorted series are additionally written side by side to
    ``tudu.dat``.  Summary statistics are printed via ``util.pstats``.
    """
    rule = "------------------------------------------------"
    for cat in (None, 'catm\n', 'catd\n', 'cath\n'):
        r = acct_util(cat)
        # Identity test: None is a singleton (PEP 8).
        is_all = cat is None
        suffix = "all" if is_all else cat.strip()
        tuf = "tu_" + suffix + ".dat"
        duf = "du_" + suffix + ".dat"
        tu = r[1]
        du = r[2]

        # Need unsorted values: this must happen before the in-place sorts
        # below so that row i still pairs tu[i] with du[i].
        if is_all:
            tudu = np.zeros((len(tu), 2))
            tudu[:, 0] = tu
            tudu[:, 1] = du
            util.write_data("tudu.dat", tudu)

        tu.sort()
        du.sort()
        ctu = util.ecdf(tu, zdisp=True)
        cdu = util.ecdf(du, zdisp=True)
        util.write_data(tuf, ctu)
        util.write_data(duf, cdu)

        pcat = "ALL" if is_all else cat.strip()
        # A single pre-formatted argument prints the same on Python 2 and 3;
        # print("") emits the blank line of a bare Python 2 `print`.
        print("%s %s" % ("CATEGORY: ", pcat))
        print("Time utilization")
        print(rule)
        util.pstats(tu)
        print("")
        print("Data utilization")
        print(rule)
        util.pstats(du)
        print("")
        print("")
def main(dt):
    """Fit ModLav models to one data type from ``azure.db`` and report them.

    ``dt`` selects a group of SQL queries (one per user category).  For each
    query the result is loaded into an array and the following files are
    written in the current directory (``<cat>`` is the category key):

    * ``<cat>_<dt>`` -- the raw values, ``<cat>_ccdf`` / ``<cat>_ecdf`` --
      their empirical CCDF / ECDF;
    * ``<cat>_<fit>_ccdf`` / ``<cat>_<fit>_ecdf`` -- model curves for each of
      the eight fits (MLE, MME, their ``mt`` variants and the four
      ``ml.optfit`` results);
    * ``<cat>_metric`` -- a text report with basic statistics and the fit
      and K-S metric of every model.

    Only the raw-data file name contains ``dt``; all other names are shared
    between data types and are overwritten when another ``dt`` is processed
    in the same directory.

    Args:
        dt: one of ``slen``, ``inb``, ``outb``, ``to_inb``, ``to_outb``,
            ``tslen``, ``tinb``, ``toutb``.

    Raises:
        NotImplementedError: if ``dt`` is not one of the keys above.
    """
    # The category values stored in ucat_term carry a trailing newline, hence
    # the '\n' inside the SQL literals.
    slen = {
        "catm": "select seslen from data_log where ucat_term='catm\n' and seslen>0 and seslen<18000 order by seslen",
        "catd": "select seslen from data_log where ucat_term='catd\n' and seslen>0 and seslen<18000 order by seslen",
        "cath": "select seslen from data_log where ucat_term='cath\n' and seslen>0 and seslen<18000 order by seslen",
        "all": "select seslen from data_log where seslen>0 and seslen<18000 order by seslen"}
    inb = {
        "catm": "select bin from data_log where ucat_term='catm\n' and bin>0 and seslen>0 and seslen<18000 order by bin",
        "catd": "select bin from data_log where ucat_term='catd\n' and bin>0 and seslen>0 and seslen<18000 order by bin",
        "cath": "select bin from data_log where ucat_term='cath\n' and bin>0 and seslen>0 and seslen<18000 order by bin",
        "all": "select bin from data_log where bin>0 and seslen>0 and seslen<18000 order by bin"}
    outb = {
        "catm": "select bout from data_log where ucat_term='catm\n' and bout>0 and seslen>0 and seslen<18000 order by bout",
        "catd": "select bout from data_log where ucat_term='catd\n' and bout>0 and seslen>0 and seslen<18000 order by bout",
        "cath": "select bout from data_log where ucat_term='cath\n' and bout>0 and seslen>0 and seslen<18000 order by bout",
        "all": "select bout from data_log where bout>0 and seslen>0 and seslen<18000 order by bout"}
    to_inb = {"all": "select bin from data_log where seslen >= 18000 and bin>0 order by bin"}
    to_outb = {"all": "select bout from data_log where seslen >= 18000 and bout>0 order by bout"}
    tslen = {
        "catm": "select sum(seslen) t from data_log where ucat_term='catm\n' and seslen>0 and seslen<18000 group by user order by t",
        "catd": "select sum(seslen) t from data_log where ucat_term='catd\n' and seslen>0 and seslen<18000 group by user order by t",
        "cath": "select sum(seslen) t from data_log where ucat_term='cath\n' and seslen>0 and seslen<18000 group by user order by t",
        "all": "select sum(seslen) t from data_log where seslen>0 and seslen<18000 group by user order by t"}
    tinb = {
        "catm": "select sum(bin) t from data_log where ucat_term='catm\n' and bin > 0 and seslen>0 and seslen<18000 group by user order by t",
        "catd": "select sum(bin) t from data_log where ucat_term='catd\n' and bin > 0 and seslen>0 and seslen<18000 group by user order by t",
        "cath": "select sum(bin) t from data_log where ucat_term='cath\n' and bin > 0 and seslen>0 and seslen<18000 group by user order by t",
        "all": "select sum(bin) t from data_log where bin > 0 and seslen>0 and seslen<18000 group by user order by t"}
    toutb = {
        "catm": "select sum(bout) t from data_log where ucat_term='catm\n' and bout>0 and seslen>0 and seslen<18000 group by user order by t",
        "catd": "select sum(bout) t from data_log where ucat_term='catd\n' and bout>0 and seslen>0 and seslen<18000 group by user order by t",
        "cath": "select sum(bout) t from data_log where ucat_term='cath\n' and bout>0 and seslen>0 and seslen<18000 group by user order by t",
        "all": "select sum(bout) t from data_log where bout>0 and seslen>0 and seslen<18000 group by user order by t"}

    dtmap = {"slen": slen, "inb": inb, "outb": outb,
             "to_inb": to_inb, "to_outb": to_outb,
             "tslen": tslen, "tinb": tinb, "toutb": toutb}
    if dt not in dtmap:
        raise NotImplementedError("Type - " + dt + " - is not implemented")

    rule = "----------------------------------------------------------------------"
    db = sql.RunSQL("azure.db")
    for typ, query in dtmap[dt].items():
        x = np.array(db.sqlq(query))
        # just making sure
        # NOTE(review): ndarray.sort() works along the last axis; if sqlq
        # returns one-element rows, x is (n, 1) and this is a no-op, leaving
        # the SQL "order by" as the only ordering guarantee -- confirm.
        x.sort()

        cc = util.ccdf(x)
        ec = util.ecdf(x)
        util.write_data(typ + "_" + dt, x)
        util.write_data(typ + "_ccdf", cc)
        util.write_data(typ + "_ecdf", ec)

        x_max = x.max()
        # (label, fitted model, xmax used for the model's CCDF range)
        fits = [
            ("MLE", ml.ModLav.fromFit(x, fit="mlefit"), x_max),
            ("MME", ml.ModLav.fromFit(x, fit="mmefit"), x_max),
            ("MLE-MT", ml.ModLav.fromFit(x, fit="mlefit", mt=True), x_max),
            ("MME-MT", ml.ModLav.fromFit(x, fit="mmefit", mt=True), x_max),
        ]
        # optfit is run between 0.1*max and 10*max with 500 as its fourth
        # argument; its "fit" entry is indexed as (model, xmax).
        for label, use_mle, use_mt in (("MLE-OPT", True, False),
                                       ("MLE-OPT-MT", True, True),
                                       ("MME-OPT", False, False),
                                       ("MME-OPT-MT", False, True)):
            best = ml.optfit(x, 0.1 * x_max, 10 * x_max, 500,
                             mlefit=use_mle, mt=use_mt)["fit"]
            fits.append((label, best[0], best[1]))

        n, amin, amax, mu, sigma = len(x), x.min(), x_max, x.mean(), x.std()
        cv = sigma / mu
        q = ms.mquantiles(x, [0.1, 0.5, 0.9])

        report = [
            "BASIC STATISTICS",
            rule,
            "Size: " + str(n),
            "Range: " + str(amin) + " - " + str(amax),
            "Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) + " 90% - " + str(q[2]),
            "Mean: " + str(mu),
            "Sigma: " + str(sigma),
            "CV: " + str(cv),
            "\n",
        ]

        for lbl, m, xmx in fits:
            report.append(lbl)
            report.append(rule)
            report.append("Modlav params: " + str(m))
            report.append("Xmax: " + str(xmx))
            report.append("Xmax/Max: " + str(xmx / amax))
            report.append("FIT metric: " + str(m.fitmetric(points=x)))
            report.append("K-S metric: " + str(m.ksmetric(points=x)))
            report.append(rule)
            report.append("\n")

            fname_pfx = typ + "_" + lbl.lower().replace("-", "_")
            # CCDF on points from the data minimum up to the fit's own xmax
            # (exponents from gen_points); CDF at the ECDF's x positions.
            lx = util.gen_points(math.log10(x.min()), math.log10(xmx), 2000)
            ex = np.power(10, lx)
            fmcc = np.array([ex, m.ccdf(ex)]).transpose()
            fmec = np.array([ec[:, 0], m.cdf(ec[:, 0])]).transpose()
            util.write_data(fname_pfx + "_ccdf", fmcc)
            util.write_data(fname_pfx + "_ecdf", fmec)

        # Context manager so the handle is closed even if writing fails.
        with open(typ + "_metric", "w+") as txf:
            txf.writelines([line + "\n" for line in report])
def process_datasets(tag_file_map):
    """Fit every data set, record the fits and write the curve files.

    For each ``tag -> data_file`` entry the data is read and sorted, its
    ECDF and CCDF are written to ``<data_file>_ecdf`` / ``<data_file>_ccdf``,
    the result of ``compute_fits`` is stored with ``insert_db_record`` and
    the CDF/CCDF of the ``MME``, ``MLE`` and ``FITMIN`` fits are written to
    ``<data_file>_ecdf.<ext>`` / ``<data_file>_ccdf.<ext>``.

    Args:
        tag_file_map: mapping of data-set tag to data file path.

    Returns:
        dict mapping each tag to ``(best_body, best_tail)`` as reported by
        ``best_fits``; the same information is printed once all data sets
        are done.
    """
    # (fit_map key, ECDF file suffix, CCDF file suffix), in output order.
    outputs = (
        ("MME", "_ecdf.lognmme", "_ccdf.lognmme"),
        ("MLE", "_ecdf.lognmle", "_ccdf.lognmle"),
        ("FITMIN", "_ecdf.lognfitmin", "_ccdf.lognfitmin"),
    )
    report_map = {}
    for tag, data_file in tag_file_map.items():
        x = util.read_data(data_file)
        x.sort()
        ec = util.ecdf(x)
        cc = util.ccdf(x)

        fit_map = compute_fits(x)
        insert_db_record(tag, fit_map)

        ## Figure out best fit
        bfit = best_fits(fit_map)
        report_map[tag] = (bfit["best_body"], bfit["best_tail"])

        ## Write files out to the directory
        util.write_data(data_file + "_ecdf", ec)
        util.write_data(data_file + "_ccdf", cc)

        lo_exp = math.log10(min(x))
        hi_exp = math.log10(max(x))
        ccpts = np.power(10, util.gen_points(lo_exp, hi_exp, 2000))
        ecpts = ec[:, 0]

        # Look up all three models before anything is written for them.
        models = [(fit_map[key][0], ec_sfx, cc_sfx)
                  for key, ec_sfx, cc_sfx in outputs]
        for model, ec_sfx, cc_sfx in models:
            model_ec = np.array([ecpts, model.cdf(ecpts)]).transpose()
            model_cc = np.array([ccpts, model.ccdf(ccpts)]).transpose()
            util.write_data(data_file + ec_sfx, model_ec)
            util.write_data(data_file + cc_sfx, model_cc)

    for k in report_map:
        body, tail = report_map[k][0], report_map[k][1]
        print(k + " BODY: " + body + " TAIL: " + tail)
    return report_map
def pardd(fname):
    """Fit ModLav models to the data in ``fname`` and write a full report.

    The data is read with ``util.read_data`` and sorted.  Sixteen fits are
    evaluated: MLE and MME (with and without ``mt``) on the raw data, plus
    the ``fit``, ``ks`` and ``diff`` results of the four
    ``parmlfit.paroptfit`` runs.  Files written (all prefixed with
    ``fname``):

    * ``_ccdf`` / ``_ecdf`` -- empirical CCDF / ECDF;
    * ``_<label>_ccdf`` / ``_<label>_ecdf`` -- model curves for every fit,
      with the label lower-cased and ``-`` replaced by ``_``;
    * ``_metric`` -- text report with basic statistics, the metrics of each
      fit and the recommendation returned by ``best_fit``.

    Args:
        fname: path of the input data file; also the prefix of all outputs.
    """
    inpf = fname
    x1 = util.read_data(inpf)
    x1.sort()
    xmx = x1.max()
    steps = 500
    lo = 0.1 * xmx
    hi = 10 * xmx

    cc = util.ccdf(x1)
    ec = util.ecdf(x1)
    util.write_data(inpf + "_ccdf", cc)
    util.write_data(inpf + "_ecdf", ec)

    # Plain fits: create all four models first, then score each against the
    # empirical CDF.  Every entry is (model, xmax, fit, ks, diff).
    plain_models = [
        ("MLE", ml.ModLav.fromFit(x1, fit="mlefit")),
        ("MME", ml.ModLav.fromFit(x1, fit="mmefit")),
        ("MLE-MT", ml.ModLav.fromFit(x1, fit="mlefit", mt=True)),
        ("MME-MT", ml.ModLav.fromFit(x1, fit="mmefit", mt=True)),
    ]
    fitlist = [
        (lbl, (mdl, xmx, mdl.fitmetric(cdf=ec), mdl.ksmetric(cdf=ec),
               mdl.difference(cdf=ec)))
        for lbl, mdl in plain_models
    ]

    # NOTE(review): `hi` is passed before `lo` -- confirm this matches the
    # parameter order of parmlfit.paroptfit.
    opt_runs = [
        ("MLE-OPT", parmlfit.paroptfit(x1, hi, lo, steps, "mlefit", False)),
        ("MLE-OPT-MT", parmlfit.paroptfit(x1, hi, lo, steps, "mlefit", True)),
        ("MME-OPT", parmlfit.paroptfit(x1, hi, lo, steps, "mmefit", False)),
        ("MME-OPT-MT", parmlfit.paroptfit(x1, hi, lo, steps, "mmefit", True)),
    ]
    # Each run offers three candidates, stored under "fit", "ks" and "diff".
    fitlist += [(lbl, run["fit"]) for lbl, run in opt_runs]
    fitlist += [("KS-" + lbl, run["ks"]) for lbl, run in opt_runs]
    fitlist += [("D-" + lbl, run["diff"]) for lbl, run in opt_runs]

    n, amin, amax, mu, sigma = len(x1), x1.min(), xmx, x1.mean(), x1.std()
    cv = sigma / mu
    q = ms.mquantiles(x1, [0.1, 0.5, 0.9])

    rule = "--------------------------------------------------------------------------"
    op_str = [
        "BASIC STATISTICS",
        rule,
        "Size: " + str(n),
        "Range: " + str(amin) + " - " + str(amax),
        "Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) + " 90% - " + str(q[2]),
        "Mean: " + str(mu),
        "Sigma: " + str(sigma),
        "CV: " + str(cv),
        "\n",
    ]

    best_fit_map = dict()
    for lbl, entry in fitlist:
        m, mx, fitm, ksm, diffm = (entry[0], entry[1], entry[2], entry[3],
                                   entry[4])
        best_fit_map[lbl] = (m, mx, fitm, ksm, diffm)

        op_str.append(lbl)
        op_str.append(rule)
        op_str.append("Modlav params: " + str(m))
        op_str.append("Xmax: " + str(mx))
        op_str.append("Xmax/Max: " + str(mx / xmx))
        op_str.append("FIT Metric: " + str(fitm))
        op_str.append("K-S Metric: " + str(ksm))
        op_str.append("DIFF Metric: " + str(diffm))
        op_str.append(rule)
        op_str.append("\n")

        fname_pfx = inpf + "_" + lbl.lower().replace("-", "_")
        # CCDF on points from the data minimum up to this fit's own xmax
        # (exponents from gen_points); CDF at the ECDF's x positions.
        lx = util.gen_points(math.log10(x1.min()), math.log10(mx), 2000)
        ex = np.power(10, lx)
        fmcc = np.array([ex, m.ccdf(ex)]).transpose()
        fmec = np.array([ec[:, 0], m.cdf(ec[:, 0])]).transpose()
        util.write_data(fname_pfx + "_ccdf", fmcc)
        util.write_data(fname_pfx + "_ecdf", fmec)

    recom = best_fit(best_fit_map, xmx)
    op1_str = [line + "\n" for line in op_str]
    op1_str.append("RECOMMENDATIONS: " + str(recom) + "\n")

    # Context manager so the handle is closed even if writing fails.
    with open(inpf + "_metric", "w+") as txf:
        txf.writelines(op1_str)
def process_datasets(tag_file_map):
    """Fit every data set, record the fits and write the curve files.

    For each ``tag -> data_file`` entry the data is read and sorted, its
    ECDF and CCDF are written to ``<data_file>_ecdf`` / ``<data_file>_ccdf``,
    the result of ``compute_fits`` is stored with ``insert_db_record`` and
    the CDF/CCDF of the ``MME``, ``MLE`` and ``FITMIN`` fits are written to
    ``<data_file>_ecdf.<ext>`` / ``<data_file>_ccdf.<ext>``.

    Args:
        tag_file_map: mapping of data-set tag to data file path.

    Returns:
        dict mapping each tag to ``(best_body, best_tail)`` as reported by
        ``best_fits``; the same information is printed once all data sets
        are done.
    """
    # (fit_map key, ECDF file suffix, CCDF file suffix), in output order.
    outputs = (
        ("MME", "_ecdf.lognmme", "_ccdf.lognmme"),
        ("MLE", "_ecdf.lognmle", "_ccdf.lognmle"),
        ("FITMIN", "_ecdf.lognfitmin", "_ccdf.lognfitmin"),
    )
    report_map = {}
    for tag, data_file in tag_file_map.items():
        x = util.read_data(data_file)
        x.sort()
        ec = util.ecdf(x)
        cc = util.ccdf(x)

        fit_map = compute_fits(x)
        insert_db_record(tag, fit_map)

        ## Figure out best fit
        bfit = best_fits(fit_map)
        report_map[tag] = (bfit["best_body"], bfit["best_tail"])

        ## Write files out to the directory
        util.write_data(data_file + "_ecdf", ec)
        util.write_data(data_file + "_ccdf", cc)

        lo_exp = math.log10(min(x))
        hi_exp = math.log10(max(x))
        ccpts = np.power(10, util.gen_points(lo_exp, hi_exp, 2000))
        ecpts = ec[:, 0]

        # Look up all three models before anything is written for them.
        models = [(fit_map[key][0], ec_sfx, cc_sfx)
                  for key, ec_sfx, cc_sfx in outputs]
        for model, ec_sfx, cc_sfx in models:
            model_ec = np.array([ecpts, model.cdf(ecpts)]).transpose()
            model_cc = np.array([ccpts, model.ccdf(ccpts)]).transpose()
            util.write_data(data_file + ec_sfx, model_ec)
            util.write_data(data_file + cc_sfx, model_cc)

    for k in report_map:
        body, tail = report_map[k][0], report_map[k][1]
        print(k + " BODY: " + body + " TAIL: " + tail)
    return report_map
def pardd(fname):
    """Fit ModLav models to the data in ``fname`` and write a full report.

    The data is read with ``util.read_data`` and sorted.  Sixteen fits are
    evaluated: MLE and MME (with and without ``mt``) on the raw data, plus
    the ``fit``, ``ks`` and ``diff`` results of the four
    ``parmlfit.paroptfit`` runs.  Files written (all prefixed with
    ``fname``):

    * ``_ccdf`` / ``_ecdf`` -- empirical CCDF / ECDF;
    * ``_<label>_ccdf`` / ``_<label>_ecdf`` -- model curves for every fit,
      with the label lower-cased and ``-`` replaced by ``_``;
    * ``_metric`` -- text report with basic statistics, the metrics of each
      fit and the recommendation returned by ``best_fit``.

    Args:
        fname: path of the input data file; also the prefix of all outputs.
    """
    inpf = fname
    x1 = util.read_data(inpf)
    x1.sort()
    xmx = x1.max()
    steps = 500
    lo = 0.1 * xmx
    hi = 10 * xmx

    cc = util.ccdf(x1)
    ec = util.ecdf(x1)
    util.write_data(inpf + "_ccdf", cc)
    util.write_data(inpf + "_ecdf", ec)

    # Plain fits: create all four models first, then score each against the
    # empirical CDF.  Every entry is (model, xmax, fit, ks, diff).
    plain_models = [
        ("MLE", ml.ModLav.fromFit(x1, fit="mlefit")),
        ("MME", ml.ModLav.fromFit(x1, fit="mmefit")),
        ("MLE-MT", ml.ModLav.fromFit(x1, fit="mlefit", mt=True)),
        ("MME-MT", ml.ModLav.fromFit(x1, fit="mmefit", mt=True)),
    ]
    fitlist = [
        (lbl, (mdl, xmx, mdl.fitmetric(cdf=ec), mdl.ksmetric(cdf=ec),
               mdl.difference(cdf=ec)))
        for lbl, mdl in plain_models
    ]

    # NOTE(review): `hi` is passed before `lo` -- confirm this matches the
    # parameter order of parmlfit.paroptfit.
    opt_runs = [
        ("MLE-OPT", parmlfit.paroptfit(x1, hi, lo, steps, "mlefit", False)),
        ("MLE-OPT-MT", parmlfit.paroptfit(x1, hi, lo, steps, "mlefit", True)),
        ("MME-OPT", parmlfit.paroptfit(x1, hi, lo, steps, "mmefit", False)),
        ("MME-OPT-MT", parmlfit.paroptfit(x1, hi, lo, steps, "mmefit", True)),
    ]
    # Each run offers three candidates, stored under "fit", "ks" and "diff".
    fitlist += [(lbl, run["fit"]) for lbl, run in opt_runs]
    fitlist += [("KS-" + lbl, run["ks"]) for lbl, run in opt_runs]
    fitlist += [("D-" + lbl, run["diff"]) for lbl, run in opt_runs]

    n, amin, amax, mu, sigma = len(x1), x1.min(), xmx, x1.mean(), x1.std()
    cv = sigma / mu
    q = ms.mquantiles(x1, [0.1, 0.5, 0.9])

    rule = "--------------------------------------------------------------------------"
    op_str = [
        "BASIC STATISTICS",
        rule,
        "Size: " + str(n),
        "Range: " + str(amin) + " - " + str(amax),
        "Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) + " 90% - " + str(q[2]),
        "Mean: " + str(mu),
        "Sigma: " + str(sigma),
        "CV: " + str(cv),
        "\n",
    ]

    best_fit_map = dict()
    for lbl, entry in fitlist:
        m, mx, fitm, ksm, diffm = (entry[0], entry[1], entry[2], entry[3],
                                   entry[4])
        best_fit_map[lbl] = (m, mx, fitm, ksm, diffm)

        op_str.append(lbl)
        op_str.append(rule)
        op_str.append("Modlav params: " + str(m))
        op_str.append("Xmax: " + str(mx))
        op_str.append("Xmax/Max: " + str(mx / xmx))
        op_str.append("FIT Metric: " + str(fitm))
        op_str.append("K-S Metric: " + str(ksm))
        op_str.append("DIFF Metric: " + str(diffm))
        op_str.append(rule)
        op_str.append("\n")

        fname_pfx = inpf + "_" + lbl.lower().replace("-", "_")
        # CCDF on points from the data minimum up to this fit's own xmax
        # (exponents from gen_points); CDF at the ECDF's x positions.
        lx = util.gen_points(math.log10(x1.min()), math.log10(mx), 2000)
        ex = np.power(10, lx)
        fmcc = np.array([ex, m.ccdf(ex)]).transpose()
        fmec = np.array([ec[:, 0], m.cdf(ec[:, 0])]).transpose()
        util.write_data(fname_pfx + "_ccdf", fmcc)
        util.write_data(fname_pfx + "_ecdf", fmec)

    recom = best_fit(best_fit_map, xmx)
    op1_str = [line + "\n" for line in op_str]
    op1_str.append("RECOMMENDATIONS: " + str(recom) + "\n")

    # Context manager so the handle is closed even if writing fails.
    with open(inpf + "_metric", "w+") as txf:
        txf.writelines(op1_str)