def est_all_data(frequent_items, total_transactions):
    """Compare max-ent triple estimates against observed supports.

    Builds the triangle tree for ``frequent_items`` with
    ``Forward.forward_compact`` and, for every triangle whose observed
    triple support ``s123`` is at least 30, computes
    ``ent.maxent_est_rosa`` from the single and pair supports and records
    the percentage error ``abs(est - s123) / s123 * 100``.

    Side effects:
        * writes the (estimate, observed) pairs to
          '../tmp/est_all_data.json' and '../tmp/est_al_data.tsv';
        * prints the average error, the error variance and the largest
          observed support, or 'No abs errors!' when no triangle passed
          the support cut-off.

    Args:
        frequent_items: passed unchanged to ``Forward.forward_compact``.
        total_transactions: number of transactions; handed to the
            estimator as a float.

    Returns:
        None.
    """
    print('est all data2')

    triangle_start = time()
    triangle_tree, _ = Forward.forward_compact(frequent_items)
    print('Finding triangles done:  {}'.format(time() - triangle_start))

    est = []
    obs = []
    abs_errors = []
    n_transactions = float(total_transactions)

    # Walk the triangle tree. Its layout, as unpacked below, is
    #   n1 -> (s1, {n2 -> (s2, s12, {n3 -> (s3, s13, s23, s123)})})
    for s1, s2_dict in triangle_tree.values():
        for s2, s12, s3_dict in s2_dict.values():
            for s3, s13, s23, s123 in s3_dict.values():
                # Triples seen fewer than 30 times are left out of the
                # evaluation (this also keeps the division below safe).
                if s123 < 30:
                    continue
                e = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13,
                                        n_transactions, num=20)
                est.append(e)
                obs.append(s123)
                abs_errors.append(abs(e - s123) / float(s123) * 100)

    with open('../tmp/est_all_data.json', 'w') as fd:
        fd.write(json.dumps(list(zip(est, obs))))

    # NOTE(review): 'est_al_data.tsv' looks like a typo of 'est_all_data',
    # but the name is kept in case downstream scripts read this path.
    with open('../tmp/est_al_data.tsv', 'w') as fd:
        fd.write('est\tobs\tkind\n')
        for estimate, observed in zip(est, obs):
            fd.write(str(estimate) + '\t' + str(observed) + '\t' + 'est/obs\n')

    # Nothing passed the support cut-off: there is no average to report.
    if not abs_errors:
        print('No abs errors!')
        return

    avg_error = sum(abs_errors) / float(len(abs_errors))
    print('avg error: {}'.format(avg_error))
    print('error var: {}'.format(np.var(abs_errors)))
    print('max observed: {}'.format(max(obs)))
def _write_triangle_tsv(file_name, values, observations, triplets,
                        pair_triple_ratios, triangle_counts):
    """Write one TSV row per triangle: value, observation, nodes, ratio, supports."""
    with open(file_name, 'w') as fd:
        fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
        rows = zip(values, observations, triplets, pair_triple_ratios, triangle_counts)
        for value, obs, triplet, ratio, counts in rows:
            fields = [value, obs] + list(triplet) + [ratio] + list(counts)
            fd.write('\t'.join(str(field) for field in fields) + '\n')


def est_all_data_disc_version(algorithm, tab_file, min_support=-30, iterations=1, only_interesting_triples=False, restricted_triples=None, extra_id=''):
    """Evaluate max-ent triple estimates using the whole data set as the sample.

    For each iteration the external miner ``algorithm`` is run on
    ``tab_file``, the triangles are extracted with
    ``Forward.forward_compact`` and, for every triangle, the max-ent
    estimate (``ent.maxent_est_rosa``) is compared with the triple support
    ``s123``. The error measure is ``abs(obs - est) / sqrt(obs)``.

    Side effects:
        * creates the work folder '../tmp/cv_<timestamp>_<extra_id>/';
        * per iteration writes the miner output, appends to 'log.txt' and
          writes '<i>_data.json', '<i>_data.tsv' and
          '<i>_data_extrapolation.tsv' in that folder;
        * prints progress and results.

    Args:
        algorithm: path/name of the miner executable; invoked as
            ``[algorithm, tab_file, out_file, '-s<min_support>', '-n3']``.
        tab_file: transaction file, one transaction per line.
        min_support: support passed to the miner and to
            ``min_supported_trips``.
        iterations: number of times the evaluation is repeated.
        only_interesting_triples: forwarded to ``Forward.forward_compact``.
        restricted_triples: forwarded to ``Forward.forward_compact``.
        extra_id: suffix appended to the work folder id.

    Returns:
        None.
    """
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.', '') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)
    print("\n### Running cross validation on ALL DATA cv_{}###".format(_id))

    # One transaction per line; close the handle once counted.
    with open(tab_file, 'rb') as tab:
        total_transactions = sum(1 for _ in tab)
    print('Total total_transactions:  {}'.format(total_transactions))

    # The "sample" is the complete data set in this variant.
    sample_size = total_transactions

    avg_errors = []

    for index in range(iterations):
        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, tab_file, sample_freq_name, '-s' + str(min_support), '-n3']
        call(args)
        print('fpgrowth on sample data (ALL DATA) done: {} secs'.format(time() - borgelt_start))

        # Check any frequent items were found *before* trying to read them.
        if not os.path.exists(sample_freq_name):
            print('No frequent items found')
            print('args {}'.format(args))
            continue

        freq = Borgelt.read_frequent_items(sample_freq_name)

        # Create ds of all observed triplets.
        # Saved as sorted keys for lookup, and their frequency as value.
        observed = {}
        for item in freq:
            if len(item[0]) == 3:
                # * 2, horrible hack to make Forward calculate the
                # observed frequency correctly.
                observed[triple_sort(item[0])] = item[1][0] * 2
        print('Total triplets observed: {}'.format(len(observed)))

        min_support_trips = min_supported_trips(min_support, total_transactions)
        print('Forward min_support_trips set to:  {}'.format(min_support_trips))

        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(
            sample_freq_name, min_support_trips, observed,
            only_interesting_triples, restricted_triples)
        print('Found triangles done: {}'.format(time() - triangles_start))

        # Parallel per-triangle lists; all must get exactly one entry per
        # triangle so the output rows line up.
        estimates = []
        extrapolations = []
        observations = []
        triplets = []
        triangle_counts = []
        pair_triple_ratios = []
        MAPE_errors = []
        MAPE_errors_ext = []

        # Recursion for estimate to converge
        req_depth = int(math.log(total_transactions, 2)) + 1
        n_transactions = float(total_transactions)

        # DFS of the tree holding all triangles:
        #   n1 -> (s1, {n2 -> (s2, s12, {n3 -> (s3, s13, s23, s123)})})
        for n1, (s1, s2_dict) in triangle_tree.items():
            for n2, (s2, s12, s3_dict) in s2_dict.items():
                for n3, (s3, s13, s23, s123) in s3_dict.items():
                    triplets.append((n1, n2, n3))
                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))
                    pair_triple_ratios.append(s123 / float(min(s12, s13, s23)))

                    # Observed is the triple support, since sample is all data
                    obs = s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13,
                                              n_transactions, num=req_depth)

                    # Extrapolation estimate. With sample_size equal to
                    # total_transactions this is s123 up to float rounding;
                    # it is still recorded because the log and the
                    # extrapolation TSV written below report it.
                    est2 = s123 / float(sample_size) * total_transactions

                    estimates.append(est)
                    extrapolations.append(est2)
                    observations.append(obs)

                    # NOTE(review): obs == 0 would raise ZeroDivisionError
                    # here; presumably Forward's support filter excludes
                    # such triangles - confirm.
                    root_obs = math.sqrt(obs)
                    MAPE_errors.append(abs(obs - est) / root_obs)
                    MAPE_errors_ext.append(abs(obs - est2) / root_obs)

        # Free the (potentially large) tree before writing results.
        del triangle_tree
        del sample_triples

        if not MAPE_errors:
            print('No abs errors!')
            continue

        # max ent error
        avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
        avg_errors.append(avg_error)
        var_error = var(MAPE_errors)

        # extrapolation error
        avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
        var_error_ext = var(MAPE_errors_ext)

        # Confidence intervals around the average errors.
        root_n = math.sqrt(sample_size)
        std_error = math.sqrt(var_error) / root_n
        span_99 = norm.interval(0.99, avg_error, std_error)
        span_95 = norm.interval(0.95, avg_error, std_error)
        std_error_ext = math.sqrt(var_error_ext) / root_n
        span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
        span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

        res_string = "\nResult ALL DATA({}):\nSample size:{} triangles:{} test_data:{}\n".format(index, sample_size, len(estimates), sample_size)
        # log max ent result
        res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
        res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
        res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))
        # log extrapolation result
        res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
        res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
        res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))

        with open(path + 'log.txt', 'a') as log_file:
            log_file.write(res_string)
        print(res_string)

        # Write result data
        with open(path + str(index) + '_data.json', 'w') as fd:
            fd.write(json.dumps([
                (triplet, (est, obs))
                for triplet, est, obs in zip(triplets, estimates, observations)
            ]))

        _write_triangle_tsv(path + str(index) + '_data.tsv',
                            estimates, observations, triplets,
                            pair_triple_ratios, triangle_counts)
        _write_triangle_tsv(path + str(index) + '_data_extrapolation.tsv',
                            extrapolations, observations, triplets,
                            pair_triple_ratios, triangle_counts)

    print("Cross validation done!")
    print("time:  {}".format(time() - cv_start))
    if avg_errors:
        total_avg_error = sum(avg_errors) / float(len(avg_errors))
        print("Avg error:{}".format(total_avg_error))
def cross_validation_compact(transactions, sample_pct=0.50, support=-3, all_frequent_items=None):
    """Cross validation using the compact triangle representation from Forward.

    ``chunks(transactions, size)`` yields ``(chunk, index, rest)`` splits.
    For every split the triangles of ``chunk`` are found with
    ``Forward.forward_compact`` and a max-ent estimate
    (``ent.maxent_est_rosa``) is computed for each of them with
    ``len(transactions) - len(chunk)`` as the transaction count. The
    estimate is compared with the triple count found in ``rest`` as a
    percentage error ``abs(observed - est) / observed * 100``.

    Triples that do not occur in ``rest`` (observed count 0) have no
    defined percentage error and are left out of the error statistics.

    Prints per-split results and the average of the per-split average
    errors.

    Args:
        transactions: sequence of transactions (must support ``len``).
        sample_pct: fraction of ``transactions`` used as chunk size.
        support: ``supp`` argument handed to ``fpgrowth``.
        all_frequent_items: currently ignored; kept for call compatibility.

    Returns:
        None.
    """
    from fim import fpgrowth

    # init
    _id = str(time()).replace('.', '')
    cv_start = time()
    print("\n### Running cross validation {}###".format(_id))
    print("Total transactions:{}".format(len(transactions)))

    # Recursion depth for the estimate; does not depend on the split.
    req_depth = int(math.log(len(transactions), 2)) + 1

    # run results
    avg_errors = []
    var_errors = []

    # TODO insert proper sampling
    for chunk, index, rest in chunks(transactions, int(len(transactions) * sample_pct)):
        # Triples actually present in the held-out part.
        rest_frequent_items = fpgrowth(rest, supp=support, min=1, max=3)
        _, all_triples = Forward.forward_compact(rest_frequent_items)

        # Get triples for estimates
        frequent_items = fpgrowth(chunk, supp=support, min=1, max=3)
        if len(frequent_items) > 0:
            print('frequent items: {}'.format(len(frequent_items)))
        else:
            print('No frequent items in chunk: {}'.format(index))
            continue

        triangle_tree, _ = Forward.forward_compact(frequent_items)
        print('triangle roots: {}'.format(len(triangle_tree)))

        abs_errors = []
        test_size = float(len(transactions) - len(chunk))

        # DFS of the tree holding all triangles:
        #   n1 -> (s1, {n2 -> (s2, s12, {n3 -> (s3, s13, s23, s123)})})
        for n1, (s1, s2_dict) in triangle_tree.items():
            for n2, (s2, s12, s3_dict) in s2_dict.items():
                for n3, (s3, s13, s23, _s123) in s3_dict.items():
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13,
                                              test_size, num=req_depth)

                    # from all observed triples get the actual observed
                    # number of triples (0 when the triple is absent)
                    observed = all_triples.get((n1, n2, n3), 0)
                    if observed == 0:
                        # Relative error is undefined for a zero
                        # observation; skip instead of dividing by zero.
                        continue

                    # record abs error
                    abs_errors.append(abs(observed - est) / float(observed) * 100)

        if abs_errors:
            # evaluation
            min_error = min(abs_errors)
            max_error = max(abs_errors)
            avg_error = sum(abs_errors) / float(len(abs_errors))
            avg_errors.append(avg_error)

            # tvar is the sample variance; it needs at least two values
            var_error = tvar(abs_errors) if len(abs_errors) > 1 else 0
            var_errors.append(var_error)

            res_string = "\nResult:\nSample size:{} min_error:{} max_error:{} avg_error:{} var_error:{}".format(len(chunk), min_error, max_error, avg_error, var_error)
            print(res_string)
        else:
            print('No abs errors!')

    print("Cross validation done!")
    print("time:  {}".format(time() - cv_start))
    # No split produced errors -> there is no overall average to report.
    if avg_errors:
        total_avg_error = sum(avg_errors) / float(len(avg_errors))
        print("Avg error:{}".format(total_avg_error))
def cross_validation(transactions, sample_pct=0.50, support=-3, all_frequent_items=None):
    """Cross validation, 'old' version not using the compact triangle
    representation from Forward.

    ``chunks(transactions, size)`` yields ``(chunk, index, rest)`` splits.
    For every split ``Forward.forward`` returns the triangles of ``chunk``
    as 7-tuples ``(s1, s2, s3, s12, s23, s13, s123)``; index ``[1]`` of
    each element holds the occurrences in the sample and ``s123[0]`` is the
    key used to look the triple up among the triples found in ``rest``.
    A max-ent estimate (``ent.maxent_est_rosa``) with
    ``len(transactions) - len(chunk)`` as the transaction count is compared
    with that observed count as ``abs(observed - est) / observed * 100``.

    Triples that do not occur in ``rest`` (observed count 0) have no
    defined percentage error and are left out of the error statistics.

    Prints per-split results and the average of the per-split average
    errors.

    Args:
        transactions: sequence of transactions (must support ``len``).
        sample_pct: fraction of ``transactions`` used as chunk size.
        support: ``supp`` argument handed to ``fpgrowth``.
        all_frequent_items: currently ignored; kept for call compatibility.

    Returns:
        None.
    """
    from fim import fpgrowth

    # init
    _id = str(time()).replace('.', '')
    cv_start = time()
    print("\n### Running cross validation {}###".format(_id))
    print("Total transactions:{}".format(len(transactions)))

    # Recursion depth for the estimate; does not depend on the split.
    req_depth = int(math.log(len(transactions), 2)) + 1

    # run results
    avg_errors = []
    var_errors = []

    # TODO insert proper sampling
    for chunk, index, rest in chunks(transactions, int(len(transactions) * sample_pct)):
        # Triples actually present in the held-out part.
        rest_frequent_items = fpgrowth(rest, supp=support, min=1, max=3)
        _, all_triples = Forward.forward(rest_frequent_items)

        # Get triples for estimates
        frequent_items = fpgrowth(chunk, supp=support, min=1, max=3)
        if len(frequent_items) > 0:
            print('frequent items: {}'.format(len(frequent_items)))
        else:
            print('No frequent items in chunk: {}'.format(index))
            continue

        triangles, _ = Forward.forward(frequent_items)
        print('triangles: {}'.format(len(triangles)))

        abs_errors = []
        test_size = float(len(transactions) - len(chunk))

        for (s1, s2, s3, s12, s23, s13, s123) in triangles:
            # maxent estimate from the sample.
            # Index [1] of the tuples hold the occurrences in the sample.
            est = ent.maxent_est_rosa(s1[1], s2[1], s3[1],
                                      s12[1], s23[1], s13[1],
                                      test_size, num=req_depth)

            # from all observed triples get the actual observed number of
            # triples (0 when the triple is absent)
            observed = all_triples.get(s123[0], 0)
            if observed == 0:
                # Relative error is undefined for a zero observation;
                # skip instead of dividing by zero.
                continue

            # record abs error
            abs_errors.append(abs(observed - est) / float(observed) * 100)

        if abs_errors:
            # evaluation
            min_error = min(abs_errors)
            max_error = max(abs_errors)
            avg_error = sum(abs_errors) / float(len(abs_errors))
            avg_errors.append(avg_error)

            # tvar is the sample variance; it needs at least two values
            var_error = tvar(abs_errors) if len(abs_errors) > 1 else 0
            var_errors.append(var_error)

            # TODO histogram of the average errors. max-ent, extrapolation, heuristic
            # TODO print average error of the average errors to the log.
            res_string = "\nResult:\nSample size:{} min_error:{} max_error:{} avg_error:{} var_error:{}".format(len(chunk), min_error, max_error, avg_error, var_error)
            print(res_string)
        else:
            print('No abs errors!')

    print("Cross validation done!")
    print("time:  {}".format(time() - cv_start))
    # No split produced errors -> there is no overall average to report.
    if avg_errors:
        total_avg_error = sum(avg_errors) / float(len(avg_errors))
        print("Avg error:{}".format(total_avg_error))
    # `path` is never assigned in this function (it creates no work
    # folder), so there is nothing to return here.
def cross_validate_disc_version(algorithm, tab_file, min_support=-30, sample_pct=0.1, iterations=1, only_interesting_triples=False, restricted_triples=None, extra_id=''): from subprocess import call from parsers import Borgelt cv_start = time() # Create work folder _id = str(time()).replace('.','') + '_' + extra_id path = '../tmp/cv_' + _id + '/' os.mkdir(path) print "\n### Running cross validation cv_{}###".format(_id) total_transactions = 0 for line in open(tab_file, 'rb'): total_transactions += 1 print 'Total total_transactions: ', total_transactions # Get the total observed triples borgelt_start = time() observed_file_name = path + 'observed_frequent_items.out' args = [algorithm, tab_file, observed_file_name, '-s' + str(min_support), '-n3'] # pro = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid) # os.killpg(pro.pid, signal.SIGTERM) call(args) # sleep(20) print 'fpgrowth on all data done: {} secs'.format(time()-borgelt_start) freq = Borgelt.read_frequent_items(observed_file_name) # Create ds of all observed triplets # Saved as sorted keys for lookup, # and their frequency as value observed = {} count = 0 for item in freq: if len(item[0]) == 3: sorted_trip = triple_sort(item[0]) observed[sorted_trip] = item[1][0] print 'Total triplets observed:', len(observed) average_observed = sum(observed.values()) / float(len(observed)) print 'Baseline: ', average_observed del freq avg_errors = [] var_errors = [] avg_errors_ext = [] var_errors_ext = [] avg_errors_heu = [] var_errors_heu = [] avg_errors_ind = [] var_errors_ind = [] avg_errors_baseline = [] occurrences = [0 for i in range(100)] max_ent_acc_error = [0 for i in range(100)] ext_acc_error = [0 for i in range(100)] ind_acc_error = [0 for i in range(100)] heu_acc_error = [0 for i in range(100)] baseline_acc_error = [0 for i in range(100)] # Record trip counts for the best estimats max_ent_best = Counter() ext_best = Counter() ind_best = Counter() for index in range(iterations): # 
Create sample file sampling_start = time() if sample_pct > 0: sample_size= int(total_transactions*sample_pct) else: sample_size = abs(sample_pct) test_data_size = total_transactions - sample_size sample = random.sample(range(total_transactions), sample_size) assert len(sample) == sample_size, 'Sample size not equal to sample' sample.sort() sample_file_name = path + str(index) + '_sample.tab' with open(sample_file_name, 'a') as sample_file: sample_line = 0 for line_num, line in enumerate(open(tab_file, 'rb')): if line_num == sample[sample_line]: sample_file.write(line) sample_line += 1 if sample_line == sample_size: break del sample print 'Sample size: {} time: {}'.format(sample_size, time() - sampling_start) borgelt_start = time() sample_freq_name = path + str(index) + '_sample_frequent_items.out' args = [algorithm, sample_file_name, sample_freq_name, '-s-1', '-n3'] call(args) print 'fpgrowth on sample data done: {} secs'.format(time()-borgelt_start) # Check any frequent items were found if not os.path.exists(sample_freq_name): print 'No frequent items found' print 'args', args continue min_support_trips = min_supported_trips(min_support, test_data_size) print 'Forward min_support_trips set to: ', min_support_trips triangles_start = time() triangle_tree, sample_triples = Forward.forward_compact(sample_freq_name, min_support_trips, observed, only_interesting_triples, restricted_triples) print 'Found triangles done: {}'.format(time() - triangles_start) #del sample_freq estimates = [] extrapolations = [] independences = [] heurestics = [] baselines = [] observations = [] triplets = [] MAPE_errors = [] MAPE_errors_ext = [] MAPE_errors_ind = [] MAPE_errors_heu = [] MAPE_errors_baseline = [] true_errors = [] pair_triple_ratios = [] triangle_counts = [] # s1_list = [] # s2_list = [] # s3_list = [] # s12_list = [] # s13_list = [] # s23_list = [] # Recursion for estimate to converge req_depth = int(math.log(total_transactions, 2)) + 1 # DFS of the tree holding all triangles 
for n1 in triangle_tree.keys(): s1, s2_dict = triangle_tree[n1] for n2 in s2_dict.keys(): s2, s12, s3_dict = s2_dict[n2] for n3 in s3_dict.keys(): s3, s13, s23, s123 = s3_dict[n3] triangle_counts.append((s1, s2, s3, s12, s13, s23, s123)) triangle = (n1, n2, n3) pair_triple_ratio = s123 / float(min(s12, s13, s23)) pair_triple_ratios.append(pair_triple_ratio) # Get the obs (test data) frequency minus those found in the sample (training data) obs = 0 if triangle in observed: # (triples in data) - (triples in sample). Calculating the number of triples in test data. obs = observed[triangle] - s123 # maxent estimate est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(sample_size), num=req_depth) * (test_data_size / float(sample_size)) if est < 0: print 'max ent below 0' print 's1 s2 s3 s12 s13 s23 s123', (s1, s2, s3, s12, s23, s13, s123) # extrapolation estimate est2 = s123 / float(sample_size) * test_data_size # independence estimat est3 = (s1 / float(sample_size)) * (s2 / float(sample_size)) * (s3 / float(sample_size)) * test_data_size # est3 = (s1*s2*s3)/float(sample_size*sample_size) * test_data_size/float(sample_size) # heurestic, use max_ent for 0 triple in sample est4 = s123 < 5 and est or est2 # base line estimat est5 = average_observed estimates.append(est) extrapolations.append(est2) independences.append(est3) heurestics.append(est4) baselines.append(est5) observations.append(obs) triplets.append(triangle) # TODO Do why save these? They already exist in the triangle tree (and take # up shit load of space..) 
# s1_list.append(s1) # s2_list.append(s2) # s3_list.append(s3) # s12_list.append(s12) # s13_list.append(s13) # s23_list.append(s23) #end TODO # MAPE error max ent error = abs(obs-est) / math.sqrt(obs) # * 100 MAPE_errors.append(error) true_errors.append(obs-est) # MAPE error extrapolation error2 = 0 if est2 > 0: error2 = abs(obs-est2) / math.sqrt(obs) # * 100 MAPE_errors_ext.append(error2) # MAPE error independence error3 = abs(obs-est3) / math.sqrt(obs) # * 100 MAPE_errors_ind.append(error3) # MAPE error heurestic error4 = abs(obs-est4) / math.sqrt(obs) # * 100 MAPE_errors_heu.append(error4) # MAPE baseline error error5 = abs(obs-est5) / math.sqrt(obs) #* 100 MAPE_errors_baseline.append(error5) # Record error for the estimeate that performed best if error < error2 and error < error3: max_ent_best[s123] += 1 elif error2 < error and error2 < error3: ext_best[s123] += 1 else: ind_best[s123] += 1 try: occurrences[s123] += 1 max_ent_acc_error[s123] += error ext_acc_error[s123] += error2 ind_acc_error[s123] += error3 heu_acc_error[s123] += error4 baseline_acc_error[s123] += error5 except IndexError, ie: pass # print 'true errors: ', true_errors # print 'estimates: ', estimates # print 'observed: ', observed # print 'mape ', MAPE_errors del triangle_tree del sample_triples if len(MAPE_errors) > 0: #TODO handle this, probably when nothing has been found min_error = min(MAPE_errors) max_error = max(MAPE_errors) # max ent error avg_error = sum(MAPE_errors) / float(len(MAPE_errors)) avg_errors.append(avg_error) # extrapolation error avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext)) avg_errors_ext.append(avg_error_ext) # independence error avg_error_ind = sum(MAPE_errors_ind) / float(len(MAPE_errors_ind)) avg_errors_ind.append(avg_error_ind) # heurestic error avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu)) avg_errors_heu.append(avg_error_heu) # baseline error avg_error_baseline = sum(MAPE_errors_baseline) / float(len(MAPE_errors_baseline)) 
avg_errors_baseline.append(avg_error_baseline) var_error = 0 var_error_ext = 0 var_error_heu = 0 var_error_ind = 0 # variance if len(MAPE_errors) > 1: var_error = tvar(MAPE_errors) #tvar is the sample variance var_error_ext = tvar(MAPE_errors_ext) var_error_heu = tvar(MAPE_errors_heu) var_error_ind = tvar(MAPE_errors_ind) # max_ent confidence interval std_dev = math.sqrt(var_error) std_error = std_dev / math.sqrt(sample_size) span_99 = norm.interval(0.99, avg_error, std_error) span_95 = norm.interval(0.95, avg_error, std_error) # ext confidence interval std_dev_ext = math.sqrt(var_error_ext) std_error_ext = std_dev_ext / math.sqrt(sample_size) span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext) span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext) # independence confidence interval std_dev_ind = math.sqrt(var_error_ind) std_error_ind = std_dev_ind / math.sqrt(sample_size) span_99_ind = norm.interval(0.99, avg_error_ind, std_error_ind) span_95_ind = norm.interval(0.95, avg_error_ind, std_error_ind) # heurestic confidence interval std_dev_heu = math.sqrt(var_error_heu) std_error_heu = std_dev_heu / math.sqrt(sample_size) span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu) span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu) var_errors.append(var_error) var_errors_ext.append(var_error_ext) var_errors_heu.append(var_error_heu) var_errors_ind.append(var_error_ind) res_string = "\nResult ({}):\nSample size:{} triangles:{} test_data:{}\n".format(index, sample_size, len(estimates), total_transactions-sample_size) # log max ent result res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error) res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99)) res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95)) res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext) res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext)) res_string += '95% 
Confidence interval(-/+): {}\n'.format(str(span_95_ext)) res_string += 'avg_error_ind:{} var_error_ind:{}\n'.format(avg_error_ind, var_error_ind) res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ind)) res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ind)) res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu) res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu)) res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu)) res_string += 'avg_error_baseline:{}\n'.format(avg_error_baseline) with open(path + str(index) + '_log.txt', 'a') as log_file: log_file.write(res_string) print res_string # Write result data with open(path + str(index) + '_data.json', 'w') as fd: # triplet_key = ['triple' for t in estimates] # est_key = ['est' for t in estimates] # obs_key = ['obs' for t in observations] fd.write(json.dumps(zip(triplets, zip(estimates, observations)))) with open(path + str(index) + '_data.tsv', 'w') as fd: fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n') for _index, i in enumerate(estimates): fd.write(str(estimates[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n') with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd: fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n') for _index, i in enumerate(estimates): fd.write(str(extrapolations[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + 
'\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n') with open(path + str(index) + '_data_heurestic.tsv', 'w') as fd: fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n') for _index, i in enumerate(heurestics): fd.write(str(heurestics[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n') with open(path + str(index) + '_data_independece.tsv', 'w') as fd: fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n') for _index, i in enumerate(independences): fd.write(str(independences[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n') # Save the errors with open(path + str(index) + '_MAPE_errors.pickle', 'wb') as fd: pickle.dump(MAPE_errors, fd) with open(path + str(index) + '_MAPE_errors_ext.pickle', 'wb') as fd: pickle.dump(MAPE_errors_ext, fd) with open(path 
+ str(index) + '_MAPE_errors_heu.pickle', 'wb') as fd: pickle.dump(MAPE_errors_heu, fd) with open(path + str(index) + '_MAPE_errors_ind.pickle', 'wb') as fd: pickle.dump(MAPE_errors_ind, fd) with open(path + str(index) + '_MAPE_errors_baseline.pickle', 'wb') as fd: pickle.dump(MAPE_errors_baseline, fd) #saves amounts of all subsets of triples. # TODO this code does not run! # with open(path + str(index) + '_data_correlations.tsv', 'w') as fd: # fd.write('s1\ts2\ts3\ts12\ts13\ts23\n') # for _index, i in enumerate(s123): # fd.write(str(s1[_index]) + '\t' + str(s2[_index]) + '\t' + str(s3[_index]) + '\t' + str(s12[_index]) + '\t' + str(s13[_index]) + '\t'+ str(s23[_index]) + '\n') #saves independence estimate for all triples. # TODO Why s123[_index] in the denominator? # TODO What is a 'double independece estimat'? # TODO Why not calculate and save estimates in the same way as ext and max_ent? # with open(path + str(index) + '_independence_estimate.tsv', 'w') as fd: # fd.write('single independence estimate\tdouble independence estimate\n') # for _index, i in enumerate(s123): # tempVal1 = sample_size/(s1[_index]) # tempVal2=sample_size/(s2[_index]) # tempVal3=sample_size/(s3[_index]) # tempVal12=sample_size/(s12[_index]) # tempVal13=sample_size/(s13[_index]) # tempVal23=sample_size/(s23[_index]) # fd.write(str(s123[_index]/tempVal1*tempVal2*tempVal3*(total_transactions-sample_size) + '\t' + s123[_index]/tempVal12*tempVal13*tempVal23*(total_transactions-sample_size) + '\n')) del estimates del observations # remove tmp files # os.remove(sample_freq_name) # os.remove(sample_file_name) else: print 'No abs errors!'