def GetEntropy(occlusions):
    I = np.zeros((XSize, YSize))
    for coord in occlusions:
        I[coord.X, coord.Y] = 1
    entropy = Entropy(I)
    outputMatrix = entropy.MovingWindowFilter(entropy.MovingAverage, 1)
    filteredMatrices = [outputMatrix]
    profile = entropy.Profile(filteredMatrices)
    return profile
def est_all_data(frequent_items, total_transactions):
    print 'est all data2'
    start = time()
    transactions = None
    #transactions = parser.parse_csv_to_mat('/Users/ahkj/Dropbox/SAAS/data/csv/sample-big/customers.txt')
    all_frequent_items = fpgrowth(transactions, supp=-10, min=1, max=3) #-10 yields 3437
    M, triples = filter_items(all_frequent_items)
    fp_time = time() - start
    print "Finding frequent items: {}".format(fp_time)

    est_start = time()
    est = []
    obs = []
    abs_errors = []
    max_est = 0
    max_obs = 0
    i = 0
    j = 0

    triangle_start = time()
    triangle_tree, triples = Forward.forward_compact(frequent_items)
    print 'Finding triangles done: ', (time()-triangle_start)

    # DFS the triangle tree
    for n1 in triangle_tree.keys():
        s1, s2_dict = triangle_tree[n1]
        for n2 in s2_dict.keys():
            s2, s12, s3_dict = s2_dict[n2]
            for n3 in s3_dict.keys():
                s3, s13, s23, s123 = s3_dict[n3]
                if s123 < 30:
                    continue
                e = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(total_transactions), num=20)
                est.append(e)
                obs.append(s123)
                error = abs(e-s123) / float(s123) * 100
                abs_errors.append(error)
                # For plotting
                max_est = max(max_est, e)
                max_obs = max(max_obs, s123)

    with open('../tmp/est_all_data.json', 'w') as fd:
        fd.write(json.dumps(zip(est, obs)))

    with open('../tmp/est_al_data.tsv', 'w') as fd:
        fd.write('est\tobs\tkind\n')
        for index, i in enumerate(est):
            fd.write(str(est[index]) + '\t' + str(obs[index]) + '\t' + 'est/obs\n')

    # scale = 1.5
    # fig = plt.figure()
    # fig.text(0, 0, "Total running time: {} sec.".format(time()-est_start))

    avg_error = sum(abs_errors) / float(len(abs_errors))
    print 'avg error: {}'.format(avg_error)
    print 'error var: {}'.format(np.var(abs_errors))
    print 'max observed: {}'.format(max_obs)
def make_frequency_dict(self, text):
    # frequency = {}
    # for character in text:
    #     if not character in frequency:
    #         frequency[character] = 0
    #     frequency[character] += 1
    ent = Entropy(self.path)
    self.symbols_count = ent.symbols_count
    return ent.freq
def entropy_for_feature(self, feature_number):
    unique_feature_values = [0, 1]
    entropy = 0.0
    for value in unique_feature_values:
        sub_features_list, sub_labels_list = \
            DataSetSplitter(self.features_list,
                            self.labels_list,
                            feature_number,
                            value).new_data_set()
        probability = sub_features_list.shape[0] / float(
            self.data_set_entries_count)
        entropy += probability * Entropy(sub_features_list,
                                         sub_labels_list).value()
    return entropy
def __entropy_for_feature(self, feature_number):
    feature_list = [example[feature_number] for example in self.data_set]
    unique_feature_values = set(feature_list)
    entropy = 0.0
    for value in unique_feature_values:
        sub_data_set = DataSetSplitter(self.data_set,
                                       feature_number,
                                       value).new_data_set()
        probability = len(sub_data_set) / float(
            self.data_set_entries_count)
        entropy += probability * Entropy(sub_data_set).value()
    return entropy
def id3Algorithm(df, heuristic, finalAttribute, attributes, default_class=None):
    counter = Counter(x for x in df[finalAttribute])
    if len(counter) == 1:
        return next(iter(counter))
    elif df.empty or (not attributes):
        return default_class
    else:
        default_class = max(counter.keys())
        if heuristic == "entropy":
            gain = [
                Entropy.information_gain(df, attr, finalAttribute)
                for attr in attributes
            ]
        elif heuristic == "variance":
            gain = [
                Variance.variance_gain(df, attr, finalAttribute)
                for attr in attributes
            ]
        maxIndex = gain.index(max(gain))
        rootAttribute = attributes[maxIndex]
        # Create an empty tree, to be populated in a moment
        tree = {rootAttribute: {}}  # Initiate the tree with best attribute as a node
        remainingAttributes = [i for i in attributes if i != rootAttribute]
        for attr_val, data_subset in df.groupby(rootAttribute):
            subtree = id3Algorithm(data_subset, heuristic, finalAttribute,
                                   remainingAttributes, default_class)
            tree[rootAttribute][attr_val] = subtree
        return tree
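# --- Usage sketch (illustrative, not part of the original snippet) ---
# A minimal call of id3Algorithm on a toy pandas DataFrame, assuming the
# snippet's own dependencies (Counter and the Entropy/Variance helpers with
# information_gain / variance_gain) are in scope. The toy data, column names,
# and the expected tree shown below are made up for demonstration; the exact
# output depends on the repo's information_gain implementation.
import pandas as pd

toy = pd.DataFrame({
    'Outlook': ['sunny', 'sunny', 'rain', 'rain'],
    'Windy':   ['no', 'yes', 'no', 'yes'],
    'Class':   ['yes', 'no', 'yes', 'no'],
})
toy_attributes = [c for c in toy.columns if c != 'Class']
toy_tree = id3Algorithm(toy, "entropy", 'Class', toy_attributes)
# With standard information gain, 'Windy' perfectly separates the classes,
# so the result should look like: {'Windy': {'no': 'yes', 'yes': 'no'}}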
from pox.core import core  # needed for core.getLogger() below
from pox.lib.packet.ipv4 import ipv4
from pox.lib.packet.arp import arp
from pox.lib.addresses import IPAddr, EthAddr
from pox.lib.util import str_to_bool, dpid_to_str
from pox.lib.recoco import Timer
import pox.openflow.libopenflow_01 as of
from pox.lib.revent import *

import itertools
import time

from entropy import Entropy

my_dictionary = {}
my_entropy = Entropy()
set_Timer = False
defendDDOS = False

log = core.getLogger()

FLOW_IDLE_TIMEOUT = 10
ARP_TIMEOUT = 60 * 2
MAX_BUFFERED_PER_IP = 5
MAX_BUFFER_TIME = 5


class Entry(object):
    def __init__(self, port, mac):
        self.timeout = time.time() + ARP_TIMEOUT
        self.port = port
        self.mac = mac
def __init__(self, features_list, labels_list):
    self.features_list = features_list
    self.labels_list = labels_list
    self.data_set_entries_count = features_list.shape[0]
    self.number_of_features = features_list.shape[1] - 1
    self.base_entropy = Entropy(features_list, labels_list).value()
def est_all_data_disc_version(algorithm, tab_file, min_support=-30, iterations=1,
                              only_interesting_triples=False, restricted_triples=None,
                              extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.','') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)

    print "\n### Running cross validation on ALL DATA cv_{}###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total total_transactions: ', total_transactions

    sample_size = total_transactions

    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []

    for index in range(iterations):
        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, tab_file, sample_freq_name, '-s' + str(min_support), '-n3']
        call(args)
        print 'fpgrowth on sample data (ALL DATA) done: {} secs'.format(time()-borgelt_start)

        freq = Borgelt.read_frequent_items(sample_freq_name)

        # Create ds of all observed triplets.
        # Saved as sorted keys for lookup,
        # and their frequency as value.
        observed = {}
        count = 0
        for item in freq:
            if len(item[0]) == 3:
                sorted_trip = triple_sort(item[0])
                # * 2 is a horrible hack to make Forward calculate the
                # observed frequency correctly.
                observed[sorted_trip] = item[1][0] * 2
        print 'Total triplets observed:', len(observed)

        # Check any frequent items were found
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        min_support_trips = min_supported_trips(min_support, total_transactions)
        print 'Forward min_support_trips set to: ', min_support_trips

        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(sample_freq_name,
                                                                min_support_trips,
                                                                observed,
                                                                only_interesting_triples,
                                                                restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)
        #del sample_freq

        estimates = []
        extrapolations = []
        heurestics = []
        observations = []
        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        triangle_counts = []
        pair_triple_ratios = []

        # Recursion for estimate to converge
        req_depth = int(math.log(total_transactions, 2)) + 1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]
                    triangle = (n1, n2, n3)
                    triplets.append(triangle)
                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))
                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)

                    # Observed is the triple support, since the sample is all data
                    obs = s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13,
                                              float(total_transactions), num=req_depth)

                    # extrapolation estimate, does not make sense for all data
                    est2 = s123 / float(sample_size) * (total_transactions)

                    # heurestic, use max_ent for 0 triple in sample, does not make sense for all data
                    # est3 = s123 == 0 and est or est2

                    estimates.append(est)
                    # extrapolations.append(est2)
                    # heurestics.append(est3)
                    observations.append(obs)

                    # MAPE error max ent
                    error = abs(obs-est) / math.sqrt(obs)
                    MAPE_errors.append(error)

                    # MAPE error extrapolation
                    error2 = abs(obs-est2) / math.sqrt(obs)
                    MAPE_errors_ext.append(error2)

                    # MAPE error heurestic
                    # error3 = abs(obs-est3) / float(obs) * 100
                    # MAPE_errors_heu.append(error3)

        del triangle_tree
        del sample_triples

        if len(MAPE_errors) > 0:  # TODO handle this, probably when nothing has been found
            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)

            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)

            # extrapolation error
            # avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            # avg_errors_ext.append(avg_error_ext)

            # heurestic error
            # avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            # avg_errors_heu.append(avg_error_heu)

            # variance
            var_error = var(MAPE_errors)
            # var_error_ext = tvar(MAPE_errors_ext)
            # var_error_heu = tvar(MAPE_errors_heu)

            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)

            # ext confidence interval
            # std_dev_ext = math.sqrt(var_error_ext)
            # std_error_ext = std_dev_ext / math.sqrt(sample_size)
            # span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            # span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

            # heurestic confidence interval
            # std_dev_heu = math.sqrt(var_error_heu)
            # std_error_heu = std_dev_heu / math.sqrt(sample_size)
            # span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            # span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)

            var_errors.append(var_error)
            # var_errors_ext.append(var_error_ext)
            # var_errors_heu.append(var_error_heu)

            res_string = "\nResult ALL DATA({}):\nSample size:{} triangles:{} test_data:{}\n".format(index, sample_size, len(estimates), sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))
            # Extrapolation and heurestic estimates are disabled above (they do not
            # make sense when the sample is all of the data), so their logging is
            # disabled as well.
            # res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            # res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            # res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))
            # res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            # res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            # res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))

            with open(path + 'log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                # triplet_key = ['triple' for t in estimates]
                # est_key = ['est' for t in estimates]
                # obs_key = ['obs' for t in observations]
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))

            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    row = ([estimates[_index], observations[_index]] +
                           list(triplets[_index]) +
                           [pair_triple_ratios[_index]] +
                           list(triangle_counts[_index]))
                    fd.write('\t'.join(str(x) for x in row) + '\n')

            # The extrapolation dump is disabled for the same reason as above
            # (extrapolations is never populated on the all-data run).
            # with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
            #     fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
            #     for _index, i in enumerate(estimates):
            #         row = ([extrapolations[_index], observations[_index]] +
            #                list(triplets[_index]) +
            #                [pair_triple_ratios[_index]] +
            #                list(triangle_counts[_index]))
            #         fd.write('\t'.join(str(x) for x in row) + '\n')

            del estimates
            del observations

            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)
        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)

    if len(avg_errors) > 0:
        total_avg_error = sum(avg_errors)/float(len(avg_errors))
        total_res_string = "Avg error:{}".format(total_avg_error)
def img_analysis(image_path: str):
    counter = Entropy()
    with open(image_path, 'rb+') as f:
        # read the header
        id_length = one_byte(f)
        colour_map_type = one_byte(f)
        image_type = one_byte(f)
        # colour map specification
        first_entry_index = byte_list(f, 2)
        colour_map_length = byte_list(f, 2)
        colour_map_entry_size = ord(f.read(1))
        # image specification
        x_origin = byte_list(f, 2)
        y_origin = byte_list(f, 2)
        image_width = int_from_bytes(byte_list(f, 2))
        image_height = int_from_bytes(byte_list(f, 2))
        pixel_depth = one_byte(f)
        image_descriptor = one_byte(f)

        # create a two-line pixel buffer
        # the pixel on the far left is always black
        # 0 -> top row
        # 1 -> current row
        buffer = [[(0, 0, 0) for _ in range(0, image_width + 1)] for _ in [1, 2]]

        # load the first row of pixels
        for pixel in range(1, image_width + 1):
            # for every pixel load three bytes representing BGR colours
            buffer[0][pixel] = byte_list(f, 3)

        for line in range(0, image_height):
            # take the top row as the current one
            # (use switching as a hackaround, because of pointers)
            buffer[1], buffer[0] = buffer[0], buffer[1]
            # and load another row on top
            if line != image_height - 1:
                for pixel in range(1, image_width + 1):
                    buffer[0][pixel] = byte_list(f, 3)
            else:
                # if this is the last row, the top row needs to be
                # a row of black pixels
                for pixel in range(1, image_width + 1):
                    buffer[0][pixel] = (0, 0, 0)

            # loop through the loaded pixels
            for i in range(1, image_width + 1):
                pixel = buffer[1][i]
                west = buffer[1][i - 1]
                north = buffer[0][i]
                northwest = buffer[0][i - 1]

                # do all the predictions
                # \hat{X} = (0,0,0)
                counter.register_char('normal', pixel)
                # \hat{X} = W
                hat_x = west
                counter.register_char('W', subtract_pixels(pixel, hat_x))
                # \hat{X} = N
                hat_x = north
                counter.register_char('N', subtract_pixels(pixel, hat_x))
                # \hat{X} = NW
                hat_x = northwest
                counter.register_char('NW', subtract_pixels(pixel, hat_x))
                # \hat{X} = N + W - NW
                hat_x = subtract_pixels(add_pixels(north, west), northwest)
                counter.register_char('N + W - NW', subtract_pixels(pixel, hat_x))
                # \hat{X} = N + (W - NW)/2
                hat_x = add_pixels(
                    north, scale_pixel(subtract_pixels(west, northwest), 0.5))
                counter.register_char('N + (W - NW)/2', subtract_pixels(pixel, hat_x))
                # \hat{X} = W + (N - NW)/2
                hat_x = add_pixels(
                    west, scale_pixel(subtract_pixels(north, northwest), 0.5))
                counter.register_char('W + (N - NW)/2', subtract_pixels(pixel, hat_x))
                # \hat{X} = (N + W)/2
                hat_x = scale_pixel(add_pixels(north, west), 0.5)
                counter.register_char('(N + W)/2', subtract_pixels(pixel, hat_x))

                # new standard
                hat_x = [0, 0, 0]
                # perform the algorithm for every colour channel discretely
                for c in range(0, 3):
                    if northwest[c] >= max(west[c], north[c]):
                        hat_x[c] = max(west[c], north[c])
                    elif northwest[c] <= min(west[c], north[c]):
                        hat_x[c] = min(west[c], north[c])
                    else:
                        hat_x[c] = west[c] + north[c] - northwest[c]
                hat_x = tuple(hat_x)
                counter.register_char('new standard', subtract_pixels(pixel, hat_x))

    return counter
def make_frequency_dict(self, text):
    ent = Entropy(self.path)
    ent.HBA()
    return ent.pairs
def Construct_Vector(mystr, conn):
    vec = []
    removed_protocol = re.sub(r'^http(s*)://', '', mystr)  # Remove protocol from the given URL using a regex
    vec.append(len(removed_protocol))  # append length of URL to the Vector
    vec.append(Total_Dots(removed_protocol))  # append Number of Dots in URL to the Vector

    # Checking for Presence of Suspicious Words in URL
    for i in Suspicious_Words:
        if re.search(i, removed_protocol, re.IGNORECASE):
            vec.append(1)  # security sensitive word present so append 1
            break
    else:
        vec.append(0)  # security sensitive word not present so append 0

    patt = r'^[^/]*'  # pattern to extract domain from the URL
    patt_path = r'/[^/]*'  # pattern to extract path of URL
    dom = re.match(patt, removed_protocol).group(0)
    info = re.findall(patt_path, removed_protocol)
    # print('Domain Name: ',dom)
    dom_hyph_count = no_of_hyphens_in_domain(dom)
    vec.append(int(dom_hyph_count))  # Appending Number of hyphens in Domain of URL to the Vector

    domain_tokens = dom.split('.')  # split the domain by the periods
    domain_tokens = [x for x in domain_tokens if x != '']  # Removing Null Values (if Any)
    # print('Domain Length: ',len(dom))
    path_tokens = [re.sub('/', '', x) for x in info]
    if path_tokens != []:
        file_n_args = path_tokens[-1]
    else:
        file_n_args = ''
    path_tokens = path_tokens[:-1]
    info = [x for x in info if x != '']
    slashes = len(info)
    # print('Slashes:',slashes)

    dir_len = 0
    for i in path_tokens:
        dir_len += len(i)
    dir_len += slashes
    vec.append(int(dir_len))  # Appending Directory length of the URL to the Vector
    # print('Directory Length: ',dir_len)
    num_subdir = len(path_tokens)
    # print('Number of Subdirectories :',num_subdir)
    vec.append(num_subdir)  # Appending Number of Subdirectories Present in the URL to the Vector
    # print('Path Tokens : ',path_tokens)
    TLD = domain_tokens[-1]
    # print('Top Level Domain :',TLD)
    vec.append(len(dom))  # Domain Length
    vec.append(len(domain_tokens))  # Domain Token Count
    vec.append(len(path_tokens))  # Path Token Count

    # does the url contain an IP address
    has_ip = ip_presence(removed_protocol)
    vec.append(has_ip)  # Presence of ip address Yes:1, No:0

    # get the alexa page rank
    has_alexa_rank = alexa_pagerank(dom, conn)
    vec.append(has_alexa_rank)

    # does page use ssl
    uses_https = check_https(mystr)
    vec.append(uses_https)

    # get country code and domain age
    country_code, dom_age_gt_1year = get_ip_info(dom)
    vec.append(country_code)
    # domain age gt 1 year
    vec.append(dom_age_gt_1year)

    # bag of words for word occurrences
    word = bag_of_words(mystr)
    vec.append(word)

    # entropy of URL
    ent = Entropy(mystr)
    entropy = ent.H(mystr)
    vec.append(entropy)

    # count of special characters
    characters = special_chars(mystr)
    vec.append(characters)

    domain_tok_lengths = []
    for i in domain_tokens:
        domain_tok_lengths.append(len(i))
    largest_dom_token_len = max(domain_tok_lengths)
    vec.append(largest_dom_token_len)  # Largest Domain Token Length
    avg_dom_Tok_len = round((float(sum(domain_tok_lengths)) / len(domain_tok_lengths)), 2)
    vec.append(avg_dom_Tok_len)  # Average Domain Token Length

    path_tok_lengths = []
    path_tok_dots = 0
    path_tok_delims = 0
    avg_path_Tok_len = 0
    largest_path_token_len = 0
    if len(path_tokens):
        for i in path_tokens:
            path_tok_lengths.append(len(i))
            path_tok_dots = Total_Dots(i)
            path_tok_delims = Total_Delims(i)
        avg_path_Tok_len = round((float(sum(path_tok_lengths)) / len(path_tok_lengths)), 2)
        largest_path_token_len = max(path_tok_lengths)
        vec.append(largest_path_token_len)  # Largest Path Token Length
        vec.append(avg_path_Tok_len)  # Average Path Token Length
    else:
        vec.append(largest_path_token_len)  # Largest Path Token Length: 0 (No Path Tokens)
        vec.append(avg_path_Tok_len)  # Average Path Token Length: 0 (No Path Tokens)
    # print('Largest Path Token Length:',largest_path_token_len)
    # print('Path Token Total Dots:',path_tok_dots)
    # print('Path Token Delims:',path_tok_delims)

    if has_ip:
        vec.append(0)  # Ip address present so no suspicious TLD
    else:
        for i in Suspicious_TLD:
            if re.search(i, TLD, re.IGNORECASE):
                vec.append(1)  # Suspicious TLD
                break
        else:
            vec.append(0)  # Non Suspicious TLD

    if file_n_args != '':
        # Condition: file and arguments are present in the URL.
        # POST arguments are conditions passed after the ?
        # file (filenames) are items such as index.html
        tmp = file_n_args.split('?')
        file = tmp[0]
        if len(tmp) > 1:
            args = tmp[1]
        else:
            args = ''
        # print('File:',file)
        # print('Arguments:',args)
        if not file:
            vec.append(0)
        else:
            vec.append(1)
        vec.append(len(file))  # Length of file
        vec.append(Total_Dots(file))  # Total_Dots in file name
        vec.append(Total_Delims(file))  # Total_Delims in file name
        # print('Total dots in file: ',Total_Dots(file))
        # print('Total Delims in file: ',Total_Delims(file))

        if args == '':
            # No POST arguments present in the URL
            vec.append(0)  # no arguments present in url
            vec.append(0)  # Length of Argument Appended to the Vector
            vec.append(0)  # Number of Variables Appended to the Vector
            vec.append(0)  # Length of largest variable value Appended to the Vector
            vec.append(0)  # Maximum number of Delims Appended to the Vector
            # print('argument length:',0)
            # print('number of arguments:',0)
            # print('length of Largest variable value:',0)
            # print('Maximum no of delims:',0)
        else:
            # indicates Presence of POST arguments in the URL
            vec.append(1)  # arguments are present
            vec.append(len(args) + 1)  # Length of Argument Appended to the Vector
            # print('argument length:',len(args)+1)
            arb = args.split('&')
            vec.append(len(arb))  # Number of Arguments Appended to the Vector
            # print('Number of arguments',len(arb))
            len_var = []
            max_delim = []
            for i in arb:
                # Splitting POST Arguments around '=' sign
                tmp = i.split('=')
                if len(tmp) > 1:
                    len_var.append(len(tmp[1]))
                    max_delim.append(Total_Delims(tmp[0]))
                    max_delim.append(Total_Delims(tmp[1]))
                else:
                    len_var.append(0)
                    max_delim.append(0)
            vec.append(max(len_var))  # Length of Largest variable value
            # print('length of Largest variable value:',max(len_var))
            max_delim = max(max_delim)
            vec.append(max_delim)  # Maximum number of Delimiters
            # print('Maximum no of delims:',max_delim)
    else:
        # File and Arguments are not present in the URL, so
        # just append 0 for the corresponding parameters in the Vector
        vec.append(0)  # has file name in url
        vec.append(0)  # Length of file Appended to the Vector
        vec.append(0)  # Total_Dots in file name Appended to the Vector
        vec.append(0)  # Total_Delims in file name Appended to the Vector
        vec.append(0)  # has arguments appended to url
        vec.append(0)  # Length of Argument Appended to the Vector
        vec.append(0)  # Number of Variables Appended to the Vector
        vec.append(0)  # Length of largest variable value Appended to the Vector
        vec.append(0)  # Maximum number of Delims Appended to the Vector
        # print('argument length:',0)
        # print('number of arguments:',0)

    return vec
else: print("invalid --mode") exit(usage_help) if mode == "e": encoded = list( map( # offsetting the numbers by one because of universal coding limitations lambda x: x + 1, LZW.encode(input_file))) coding.encode(encoded, output_file) # print the stats print("encoded number list entropy:", Entropy.encoded_file_entropy(encoded)) print("original file entropy :", Entropy.original_file_entropy(input_file)) original_size = os.path.getsize(input_file) encoded_size = os.path.getsize(output_file) print("original file size :", original_size) print("encoded file size :", encoded_size) print("compression rate :", original_size / encoded_size) elif mode == "d": decoded = list( map( # offsetting the numbers by one because of universal coding limitations lambda x: x - 1, coding.decode(input_file)))
"Ex. >> python decision.py training_set.csv test_set.csv yes entropy" ) sys.exit() #PATH = "dataset1/" PATH = "dataset1/" trainingData = pd.read_csv(PATH + sys.argv[1]) testingData = pd.read_csv(PATH + sys.argv[2]) # Getting list of attributes except 'Class' attributes = list(trainingData.columns) attributes.remove('Class') entropy, variance = 0, 0 answer = [] total_entropy = Entropy.entropy_of_list(trainingData['Class']) tree_entropy = id3Algorithm(trainingData, "entropy", 'Class', attributes) trainingData['predicted'] = trainingData.apply(classifyDataset, axis=1, args=(tree_entropy, 0)) train_tree = id3Algorithm(trainingData, "entropy", 'Class', attributes) testingData['predicted2'] = testingData.apply(classifyDataset, axis=1, args=(train_tree, 1)) entropy = sum(testingData['Class'] == testingData['predicted2']) / ( 1.0 * len(testingData.index)) total_variance = Variance.calculate_variance(trainingData['Class']) tree_variance = id3Algorithm(trainingData, "variance", 'Class', attributes) trainingData['predicted'] = trainingData.apply(classifyDataset, axis=1,
def __init__(self, data_set):
    self.data_set = data_set
    self.data_set_entries_count = len(data_set)
    self.number_of_features = len(data_set[0]) - 1
    self.base_entropy = Entropy(data_set).value()
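# --- Illustrative sketch (assumption, not the repo's actual Entropy class) ---
# The Entropy(data_set).value() usage above, with the class label stored in the
# last column of each example, suggests plain Shannon entropy over the label
# distribution; a minimal stand-in could look like this.
import math
from collections import Counter

class EntropySketch(object):
    def __init__(self, data_set):
        # the label is assumed to be the last element of each example
        self.labels = [example[-1] for example in data_set]

    def value(self):
        total = float(len(self.labels))
        counts = Counter(self.labels)
        # Shannon entropy in bits over the label distribution
        return -sum((c / total) * math.log(c / total, 2) for c in counts.values())

# EntropySketch([[1, 0, 'yes'], [1, 1, 'no']]).value()  # -> 1.0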
#!/usr/bin/env python3
import sys

from entropy import Entropy

if len(sys.argv) != 3:
    print("Correct usage: python3 run.py </path/to/file> <N>")
else:
    file_path = sys.argv[1]
    n = int(sys.argv[2])

    elements = []
    with open(file_path, 'r') as file:
        for line in file:
            elements.extend(list(line))

    entropy = Entropy(elements)
    # If memory equals N, we need to consider N + 1 elements at a time.
    print(f"Entropy with N = {n}: {entropy.entropy(n + 1)}")
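# --- Illustrative sketch (assumption, not the repo's actual Entropy class) ---
# Given the call entropy.entropy(n + 1) above, one plausible reading is the
# Shannon entropy of sliding windows of k symbols; the real class may instead
# compute a conditional (memory-N) entropy, so treat this only as a sketch of
# the interface.
import math
from collections import Counter

class EntropySketch:
    def __init__(self, elements):
        self.elements = elements

    def entropy(self, k):
        # count all length-k windows over the symbol stream
        windows = [tuple(self.elements[i:i + k])
                   for i in range(len(self.elements) - k + 1)]
        total = float(len(windows))
        counts = Counter(windows)
        return -sum((c / total) * math.log2(c / total) for c in counts.values())

# EntropySketch(list("abab")).entropy(2)  # -> ~0.918 bits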
def cross_validation(transactions, sample_pct=0.50, support=-3, all_frequent_items=None):
    """
    Cross validation, 'old' version not using the compact triangle
    representation from Forward.
    """
    from fim import fpgrowth

    # init
    _id = str(time()).replace('.','')
    # if all_frequent_items is None:
    #     all_frequent_items = fpgrowth(transactions, supp=support, min=1, max=3)
    cv_start = time()
    print "\n### Running cross validation {}###".format(_id)
    print "Total transactions:{}".format(len(transactions))
    # print "Total frequent items:{}".format(len(all_frequent_items))

    # run results
    avg_errors = []
    var_errors = []

    # all_triangles, all_triples = filter_items(all_frequent_items)
    for chunk, index, rest in chunks(transactions, int(len(transactions) * sample_pct)):  # TODO insert proper sampling
        all_frequent_items = fpgrowth(rest, supp=support, min=1, max=3)
        all_triangles, all_triples = Forward.forward(all_frequent_items)

        # Get triples for estimates
        frequent_items = fpgrowth(chunk, supp=support, min=1, max=3)
        if len(frequent_items) > 0:
            print 'frequent items: {}'.format(len(frequent_items))
        else:
            print 'No frequent items in chunk: {}'.format(index)
            continue
        triangles, triples = Forward.forward(frequent_items)
        print 'triangles: {}'.format(len(triangles))

        estimates = []
        observations = []
        abs_errors = []
        max_est = 0
        max_obs = 0
        for (s1, s2, s3, s12, s23, s13, s123) in triangles:
            # if s123[1] != 0:
            #     continue

            # maxent estimate from the sample.
            # Index [1] of the tuples holds the
            # occurrences in the sample.
            est = ent.maxent_est_rosa(s1[1], s2[1], s3[1], s12[1], s23[1], s13[1],
                                      float(len(transactions)-len(chunk)),
                                      num=int(math.log(len(transactions), 2))+1)
            # maximum estimate seen (for plotting)
            max_est = max(max_est, est)
            # record the estimate
            estimates.append(est)

            # from all observed triples get the actual observed number of triples
            observed = 0
            if all_triples.has_key(s123[0]):
                observed = all_triples[s123[0]]
            # maximum observation of the triple (for plotting)
            max_obs = max(max_obs, observed)
            # record the observed
            observations.append(observed)

            # record abs error
            error = abs(observed-est) / float(observed) * 100
            abs_errors.append(error)

        if len(abs_errors) > 0:  # TODO handle this, probably when nothing has been found
            # evaluation
            min_error = min(abs_errors)
            max_error = max(abs_errors)
            avg_error = sum(abs_errors) / float(len(abs_errors))
            avg_errors.append(avg_error)
            var_error = 0
            if len(abs_errors) > 1:
                var_error = tvar(abs_errors)  # tvar is the sample variance
            var_errors.append(var_error)
            # TODO histogram of the average errors: max-ent, extrapolation, heurestic
            # TODO print average error of the average errors to the log.
            res_string = "\nResult:\nSample size:{} min_error:{} max_error:{} avg_error:{} var_error:{}".format(len(chunk), min_error, max_error, avg_errors[-1], var_error)
            print res_string
        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    total_avg_error = sum(avg_errors)/float(len(avg_errors))
    total_res_string = "Avg error:{}".format(total_avg_error)
    return total_res_string
def cross_validation_compact(transactions, sample_pct=0.50, support=-3, all_frequent_items=None):
    """
    Cross validation. Using the compact representation from Forward.
    """
    from fim import fpgrowth

    # init
    _id = str(time()).replace('.','')
    # if all_frequent_items is None:
    #     all_frequent_items = fpgrowth(transactions, supp=support, min=1, max=3)
    cv_start = time()
    print "\n### Running cross validation {}###".format(_id)
    print "Total transactions:{}".format(len(transactions))
    # print "Total frequent items:{}".format(len(all_frequent_items))

    # run results
    avg_errors = []
    var_errors = []

    # all_triangles, all_triples = filter_items(all_frequent_items)
    for chunk, index, rest in chunks(transactions, int(len(transactions) * sample_pct)):  # TODO insert proper sampling
        all_frequent_items = fpgrowth(rest, supp=support, min=1, max=3)
        all_triangles, all_triples = Forward.forward_compact(all_frequent_items)

        # Get triples for estimates
        frequent_items = fpgrowth(chunk, supp=support, min=1, max=3)
        if len(frequent_items) > 0:
            print 'frequent items: {}'.format(len(frequent_items))
        else:
            print 'No frequent items in chunk: {}'.format(index)
            continue
        triangle_tree, triples = Forward.forward_compact(frequent_items)
        print 'triangle roots: {}'.format(len(triangle_tree))

        estimates = []
        observations = []
        abs_errors = []
        max_est = 0
        max_obs = 0

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13,
                                              float(len(transactions)-len(chunk)),
                                              num=int(math.log(len(transactions), 2))+1)
                    # maximum estimate seen (for plotting)
                    max_est = max(max_est, est)
                    # record the estimate
                    estimates.append(est)

                    # from all observed triples get the actual observed number of triples
                    observed = 0
                    if all_triples.has_key((n1, n2, n3)):
                        observed = all_triples[(n1, n2, n3)]
                    # maximum observation of the triple (for plotting)
                    max_obs = max(max_obs, observed)
                    # record the observed
                    observations.append(observed)

                    # record abs error
                    error = abs(observed-est) / float(observed) * 100
                    abs_errors.append(error)

        if len(abs_errors) > 0:  # TODO handle this, probably when nothing has been found
            # evaluation
            min_error = min(abs_errors)
            max_error = max(abs_errors)
            avg_error = sum(abs_errors) / float(len(abs_errors))
            avg_errors.append(avg_error)
            var_error = 0
            if len(abs_errors) > 1:
                var_error = tvar(abs_errors)  # tvar is the sample variance
            var_errors.append(var_error)
            res_string = "\nResult:\nSample size:{} min_error:{} max_error:{} avg_error:{} var_error:{}".format(len(chunk), min_error, max_error, avg_errors[-1], var_error)
            print res_string
        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    total_avg_error = sum(avg_errors)/float(len(avg_errors))
    total_res_string = "Avg error:{}".format(total_avg_error)
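# --- Hypothetical helper sketch: the chunks() generator assumed above ---
# Both cross-validation loops unpack (chunk, index, rest), i.e. each chunk
# together with its position and the remaining transactions. The repo's real
# chunks() is not shown here; a minimal version compatible with that unpacking
# might look like this.
def chunks(transactions, chunk_size):
    for index, start in enumerate(range(0, len(transactions), chunk_size)):
        chunk = transactions[start:start + chunk_size]
        # everything outside the current chunk acts as the 'rest' / training part
        rest = transactions[:start] + transactions[start + chunk_size:]
        yield chunk, index, rest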
def cross_validate_disc_version(algorithm, tab_file, min_support=-30, sample_pct=0.1,
                                iterations=1, only_interesting_triples=False,
                                restricted_triples=None, extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.','') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)

    print "\n### Running cross validation cv_{}###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total total_transactions: ', total_transactions

    # Get the total observed triples
    borgelt_start = time()
    observed_file_name = path + 'observed_frequent_items.out'
    args = [algorithm, tab_file, observed_file_name, '-s' + str(min_support), '-n3']
    # pro = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
    # os.killpg(pro.pid, signal.SIGTERM)
    call(args)
    # sleep(20)
    print 'fpgrowth on all data done: {} secs'.format(time()-borgelt_start)

    freq = Borgelt.read_frequent_items(observed_file_name)

    # Create ds of all observed triplets.
    # Saved as sorted keys for lookup,
    # and their frequency as value.
    observed = {}
    count = 0
    for item in freq:
        if len(item[0]) == 3:
            sorted_trip = triple_sort(item[0])
            observed[sorted_trip] = item[1][0]
    print 'Total triplets observed:', len(observed)

    average_observed = sum(observed.values()) / float(len(observed))
    print 'Baseline: ', average_observed

    del freq

    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []
    avg_errors_ind = []
    var_errors_ind = []
    avg_errors_baseline = []

    occurrences = [0 for i in range(100)]
    max_ent_acc_error = [0 for i in range(100)]
    ext_acc_error = [0 for i in range(100)]
    ind_acc_error = [0 for i in range(100)]
    heu_acc_error = [0 for i in range(100)]
    baseline_acc_error = [0 for i in range(100)]

    # Record trip counts for the best estimates
    max_ent_best = Counter()
    ext_best = Counter()
    ind_best = Counter()

    for index in range(iterations):

        # Create sample file
        sampling_start = time()
        if sample_pct > 0:
            sample_size = int(total_transactions*sample_pct)
        else:
            sample_size = abs(sample_pct)
        test_data_size = total_transactions - sample_size
        sample = random.sample(range(total_transactions), sample_size)
        assert len(sample) == sample_size, 'Sample size not equal to sample'
        sample.sort()
        sample_file_name = path + str(index) + '_sample.tab'
        with open(sample_file_name, 'a') as sample_file:
            sample_line = 0
            for line_num, line in enumerate(open(tab_file, 'rb')):
                if line_num == sample[sample_line]:
                    sample_file.write(line)
                    sample_line += 1
                    if sample_line == sample_size:
                        break
        del sample
        print 'Sample size: {} time: {}'.format(sample_size, time() - sampling_start)

        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, sample_file_name, sample_freq_name, '-s-1', '-n3']
        call(args)
        print 'fpgrowth on sample data done: {} secs'.format(time()-borgelt_start)

        # Check any frequent items were found
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        min_support_trips = min_supported_trips(min_support, test_data_size)
        print 'Forward min_support_trips set to: ', min_support_trips

        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(sample_freq_name,
                                                                min_support_trips,
                                                                observed,
                                                                only_interesting_triples,
                                                                restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)
        #del sample_freq

        estimates = []
        extrapolations = []
        independences = []
        heurestics = []
        baselines = []
        observations = []
        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        MAPE_errors_ind = []
        MAPE_errors_heu = []
        MAPE_errors_baseline = []
        true_errors = []
        pair_triple_ratios = []
        triangle_counts = []
        # s1_list = []
        # s2_list = []
        # s3_list = []
        # s12_list = []
        # s13_list = []
        # s23_list = []

        # Recursion for estimate to converge
        req_depth = int(math.log(total_transactions, 2)) + 1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]
                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))
                    triangle = (n1, n2, n3)
                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)

                    # Get the obs (test data) frequency minus those found in the sample (training data)
                    obs = 0
                    if triangle in observed:
                        # (triples in data) - (triples in sample) = number of triples in test data
                        obs = observed[triangle] - s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13,
                                              float(sample_size),
                                              num=req_depth) * (test_data_size / float(sample_size))
                    if est < 0:
                        print 'max ent below 0'
                        print 's1 s2 s3 s12 s13 s23 s123', (s1, s2, s3, s12, s23, s13, s123)

                    # extrapolation estimate
                    est2 = s123 / float(sample_size) * test_data_size

                    # independence estimate
                    est3 = (s1 / float(sample_size)) * (s2 / float(sample_size)) * (s3 / float(sample_size)) * test_data_size
                    # est3 = (s1*s2*s3)/float(sample_size*sample_size) * test_data_size/float(sample_size)

                    # heurestic, use max_ent for 0 triple in sample
                    est4 = s123 < 5 and est or est2

                    # baseline estimate
                    est5 = average_observed

                    estimates.append(est)
                    extrapolations.append(est2)
                    independences.append(est3)
                    heurestics.append(est4)
                    baselines.append(est5)
                    observations.append(obs)
                    triplets.append(triangle)

                    # TODO Why do we save these? They already exist in the triangle tree
                    # (and take up a lot of space...)
                    # s1_list.append(s1)
                    # s2_list.append(s2)
                    # s3_list.append(s3)
                    # s12_list.append(s12)
                    # s13_list.append(s13)
                    # s23_list.append(s23)
                    # end TODO

                    # MAPE error max ent
                    error = abs(obs-est) / math.sqrt(obs)  # * 100
                    MAPE_errors.append(error)
                    true_errors.append(obs-est)

                    # MAPE error extrapolation
                    error2 = 0
                    if est2 > 0:
                        error2 = abs(obs-est2) / math.sqrt(obs)  # * 100
                    MAPE_errors_ext.append(error2)

                    # MAPE error independence
                    error3 = abs(obs-est3) / math.sqrt(obs)  # * 100
                    MAPE_errors_ind.append(error3)

                    # MAPE error heurestic
                    error4 = abs(obs-est4) / math.sqrt(obs)  # * 100
                    MAPE_errors_heu.append(error4)

                    # MAPE baseline error
                    error5 = abs(obs-est5) / math.sqrt(obs)  # * 100
                    MAPE_errors_baseline.append(error5)

                    # Record error for the estimate that performed best
                    if error < error2 and error < error3:
                        max_ent_best[s123] += 1
                    elif error2 < error and error2 < error3:
                        ext_best[s123] += 1
                    else:
                        ind_best[s123] += 1

                    try:
                        occurrences[s123] += 1
                        max_ent_acc_error[s123] += error
                        ext_acc_error[s123] += error2
                        ind_acc_error[s123] += error3
                        heu_acc_error[s123] += error4
                        baseline_acc_error[s123] += error5
                    except IndexError, ie:
                        pass

        # print 'true errors: ', true_errors
        # print 'estimates: ', estimates
        # print 'observed: ', observed
        # print 'mape ', MAPE_errors

        del triangle_tree
        del sample_triples

        if len(MAPE_errors) > 0:  # TODO handle this, probably when nothing has been found
            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)

            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)
            # extrapolation error
            avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            avg_errors_ext.append(avg_error_ext)
            # independence error
            avg_error_ind = sum(MAPE_errors_ind) / float(len(MAPE_errors_ind))
            avg_errors_ind.append(avg_error_ind)
            # heurestic error
            avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            avg_errors_heu.append(avg_error_heu)
            # baseline error
            avg_error_baseline = sum(MAPE_errors_baseline) / float(len(MAPE_errors_baseline))
            avg_errors_baseline.append(avg_error_baseline)

            var_error = 0
            var_error_ext = 0
            var_error_heu = 0
            var_error_ind = 0
            # variance
            if len(MAPE_errors) > 1:
                var_error = tvar(MAPE_errors)  # tvar is the sample variance
                var_error_ext = tvar(MAPE_errors_ext)
                var_error_heu = tvar(MAPE_errors_heu)
                var_error_ind = tvar(MAPE_errors_ind)

            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)

            # ext confidence interval
            std_dev_ext = math.sqrt(var_error_ext)
            std_error_ext = std_dev_ext / math.sqrt(sample_size)
            span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

            # independence confidence interval
            std_dev_ind = math.sqrt(var_error_ind)
            std_error_ind = std_dev_ind / math.sqrt(sample_size)
            span_99_ind = norm.interval(0.99, avg_error_ind, std_error_ind)
            span_95_ind = norm.interval(0.95, avg_error_ind, std_error_ind)

            # heurestic confidence interval
            std_dev_heu = math.sqrt(var_error_heu)
            std_error_heu = std_dev_heu / math.sqrt(sample_size)
            span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)

            var_errors.append(var_error)
            var_errors_ext.append(var_error_ext)
            var_errors_heu.append(var_error_heu)
            var_errors_ind.append(var_error_ind)

            res_string = "\nResult ({}):\nSample size:{} triangles:{} test_data:{}\n".format(index, sample_size, len(estimates), total_transactions-sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))
            res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))
            res_string += 'avg_error_ind:{} var_error_ind:{}\n'.format(avg_error_ind, var_error_ind)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ind))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ind))
            res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))
            res_string += 'avg_error_baseline:{}\n'.format(avg_error_baseline)

            with open(path + str(index) + '_log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                # triplet_key = ['triple' for t in estimates]
                # est_key = ['est' for t in estimates]
                # obs_key = ['obs' for t in observations]
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))

            tsv_header = 'est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n'

            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write(tsv_header)
                for _index, i in enumerate(estimates):
                    row = ([estimates[_index], observations[_index]] +
                           list(triplets[_index]) +
                           [pair_triple_ratios[_index]] +
                           list(triangle_counts[_index]))
                    fd.write('\t'.join(str(x) for x in row) + '\n')

            with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
                fd.write(tsv_header)
                for _index, i in enumerate(estimates):
                    row = ([extrapolations[_index], observations[_index]] +
                           list(triplets[_index]) +
                           [pair_triple_ratios[_index]] +
                           list(triangle_counts[_index]))
                    fd.write('\t'.join(str(x) for x in row) + '\n')

            with open(path + str(index) + '_data_heurestic.tsv', 'w') as fd:
                fd.write(tsv_header)
                for _index, i in enumerate(heurestics):
                    row = ([heurestics[_index], observations[_index]] +
                           list(triplets[_index]) +
                           [pair_triple_ratios[_index]] +
                           list(triangle_counts[_index]))
                    fd.write('\t'.join(str(x) for x in row) + '\n')

            with open(path + str(index) + '_data_independece.tsv', 'w') as fd:
                fd.write(tsv_header)
                for _index, i in enumerate(independences):
                    row = ([independences[_index], observations[_index]] +
                           list(triplets[_index]) +
                           [pair_triple_ratios[_index]] +
                           list(triangle_counts[_index]))
                    fd.write('\t'.join(str(x) for x in row) + '\n')

            # Save the errors
            with open(path + str(index) + '_MAPE_errors.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors, fd)
            with open(path + str(index) + '_MAPE_errors_ext.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ext, fd)
            with open(path + str(index) + '_MAPE_errors_heu.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_heu, fd)
            with open(path + str(index) + '_MAPE_errors_ind.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ind, fd)
            with open(path + str(index) + '_MAPE_errors_baseline.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_baseline, fd)

            # Saves counts of all subsets of triples.
            # TODO this code does not run!
            # with open(path + str(index) + '_data_correlations.tsv', 'w') as fd:
            #     fd.write('s1\ts2\ts3\ts12\ts13\ts23\n')
            #     for _index, i in enumerate(s123):
            #         fd.write(str(s1[_index]) + '\t' + str(s2[_index]) + '\t' + str(s3[_index]) + '\t' +
            #                  str(s12[_index]) + '\t' + str(s13[_index]) + '\t' + str(s23[_index]) + '\n')

            # Saves independence estimates for all triples.
            # TODO Why s123[_index] in the denominator?
            # TODO What is a 'double independence estimate'?
            # TODO Why not calculate and save estimates in the same way as ext and max_ent?
            # with open(path + str(index) + '_independence_estimate.tsv', 'w') as fd:
            #     fd.write('single independence estimate\tdouble independence estimate\n')
            #     for _index, i in enumerate(s123):
            #         tempVal1 = sample_size/(s1[_index])
            #         tempVal2 = sample_size/(s2[_index])
            #         tempVal3 = sample_size/(s3[_index])
            #         tempVal12 = sample_size/(s12[_index])
            #         tempVal13 = sample_size/(s13[_index])
            #         tempVal23 = sample_size/(s23[_index])
            #         fd.write(str(s123[_index]/tempVal1*tempVal2*tempVal3*(total_transactions-sample_size) + '\t' +
            #                      s123[_index]/tempVal12*tempVal13*tempVal23*(total_transactions-sample_size) + '\n'))

            del estimates
            del observations

            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)
        else:
            print 'No abs errors!'