Example #1
def GetEntropy(occlusions):
    I = np.zeros((XSize, YSize))
    for coord in occlusions:
        I[coord.X, coord.Y] = 1
    entropy = Entropy(I)
    outputMatrix = entropy.MovingWindowFilter(entropy.MovingAverage, 1)
    filteredMatrices = [outputMatrix]
    profile = entropy.Profile(filteredMatrices)

    return profile
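
The `Entropy` class used above (with `MovingWindowFilter`, `MovingAverage` and `Profile`) is external to this snippet. As a rough, hypothetical sketch of the underlying steps, assuming Shannon entropy over a binary occupancy matrix plus a simple moving-average filter (the real class may work differently):

import numpy as np

def shannon_entropy(matrix):
    # Shannon entropy of the value distribution in a binary occupancy matrix.
    _, counts = np.unique(matrix, return_counts=True)
    p = counts / float(counts.sum())
    return float(-(p * np.log2(p)).sum())

def moving_average(matrix, radius=1):
    # Naive moving-average filter over a (2*radius+1)^2 window, zero-padded at the edges.
    padded = np.pad(matrix, radius, mode='constant')
    out = np.zeros(matrix.shape)
    for x in range(matrix.shape[0]):
        for y in range(matrix.shape[1]):
            out[x, y] = padded[x:x + 2 * radius + 1, y:y + 2 * radius + 1].mean()
    return out
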
Example #2
def est_all_data(frequent_items, total_transactions):
    print 'est all data2'
    start = time()
    transactions = None
    #transactions = parser.parse_csv_to_mat('/Users/ahkj/Dropbox/SAAS/data/csv/sample-big/customers.txt')
    all_frequent_items = fpgrowth(transactions, supp=-10, min=1, max=3) #-10 yields 3437
    M, triples = filter_items(all_frequent_items)
    fp_time = time() - start
    print "Finding frequent items: {}".format(fp_time)

    est_start = time()

    est = []
    obs = []
    abs_errors = []
    max_est = 0
    max_obs = 0
    i = 0
    j = 0

    triangle_start = time()
    triangle_tree, triples = Forward.forward_compact(frequent_items)
    print 'Finding triangles done: ', (time()-triangle_start)

    # DFS the triangle tree
    for n1 in triangle_tree.keys():
        s1, s2_dict = triangle_tree[n1]
        for n2 in s2_dict.keys():
            s2, s12, s3_dict = s2_dict[n2]
            for n3 in s3_dict.keys():
                s3, s13, s23, s123 = s3_dict[n3]
                if s123 < 30:
                    continue

                e = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(total_transactions), num=20)

                est.append(e)
                obs.append(s123)
                error = abs(e-s123) / float(s123) * 100
                abs_errors.append(error)

                # For plotting
                max_est = max(max_est, e)
                max_obs = max(max_obs, s123)

    with open('../tmp/est_all_data.json', 'w') as fd:
        fd.write(json.dumps(zip(est, obs)))
    with open('../tmp/est_all_data.tsv', 'w') as fd:
        fd.write('est\tobs\tkind\n')
        for index, i in enumerate(est):
            fd.write(str(est[index]) + '\t' + str(obs[index]) + '\t' + 'est/obs\n')
    # scale = 1.5

    # fig = plt.figure()
    # fig.text(0, 0, "Total running time: {} sec.".format(time()-est_start))
    avg_error = sum(abs_errors) / float(len(abs_errors))
    print 'avg error: {}'.format(avg_error)
    print 'error var: {}'.format(np.var(abs_errors))
    print 'max observed: {}'.format(max_obs)
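
`ent.maxent_est_rosa` is an external routine; the snippet only shows that it takes the singleton supports (s1, s2, s3), the pairwise supports (s12, s23, s13) and the transaction count. A minimal sketch of the general idea it appears to implement, a maximum-entropy estimate of the triple count via iterative proportional fitting on a 2x2x2 contingency table, might look like the following (the real function may differ):

import numpy as np

def maxent_triple_estimate(s1, s2, s3, s12, s13, s23, n, iterations=20):
    # Observed 2x2 marginal tables (index 1 = item present, 0 = absent).
    m12 = np.array([[n - s1 - s2 + s12, s2 - s12], [s1 - s12, s12]], dtype=float)
    m13 = np.array([[n - s1 - s3 + s13, s3 - s13], [s1 - s13, s13]], dtype=float)
    m23 = np.array([[n - s2 - s3 + s23, s3 - s23], [s2 - s23, s23]], dtype=float)

    # Start from a uniform 2x2x2 table and repeatedly rescale it so that each
    # of its three 2-way marginals matches the observed one (IPF).
    t = np.full((2, 2, 2), n / 8.0)
    for _ in range(iterations):
        t *= (m12 / np.maximum(t.sum(axis=2), 1e-12))[:, :, None]
        t *= (m13 / np.maximum(t.sum(axis=1), 1e-12))[:, None, :]
        t *= (m23 / np.maximum(t.sum(axis=0), 1e-12))[None, :, :]
    # Cell (1, 1, 1) is the estimated number of transactions containing all three items.
    return t[1, 1, 1]

With the supports of one triangle this yields a single estimated co-occurrence count, which can then be compared against the observed s123 as the loop above does.
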
Example #3
    def make_frequency_dict(self, text):
        # frequency = {}
        # for character in text:
        #     if not character in frequency:
        #         frequency[character] = 0
        #     frequency[character] += 1
        ent = Entropy(self.path)
        self.symbols_count = ent.symbols_count
        return ent.freq
    def entropy_for_feature(self, feature_number):
        unique_feature_values = [0, 1]

        entropy = 0.0

        for value in unique_feature_values:
            sub_features_list, sub_labels_list = \
                DataSetSplitter(self.features_list, self.labels_list, feature_number, value).new_data_set()

            probability = sub_features_list.shape[0] / float(
                self.data_set_entries_count)
            entropy += probability * Entropy(sub_features_list,
                                             sub_labels_list).value()

        return entropy
    def __entropy_for_feature(self, feature_number):
        feature_list = [example[feature_number] for example in self.data_set]
        unique_feature_values = set(feature_list)

        entropy = 0.0

        for value in unique_feature_values:
            sub_data_set = DataSetSplitter(self.data_set, feature_number,
                                           value).new_data_set()

            probability = len(sub_data_set) / float(
                self.data_set_entries_count)
            entropy += probability * Entropy(sub_data_set).value()

        return entropy
Example #6
def id3Algorithm(df,
                 heuristic,
                 finalAttribute,
                 attributes,
                 default_class=None):
    counter = Counter(x for x in df[finalAttribute])

    if len(counter) == 1:
        return next(iter(counter))
    elif df.empty or (not attributes):
        return default_class
    else:
        default_class = max(counter.keys())
        if heuristic == "entropy":
            gain = [
                Entropy.information_gain(df, attr, finalAttribute)
                for attr in attributes
            ]
        elif heuristic == "variance":
            gain = [
                Variance.variance_gain(df, attr, finalAttribute)
                for attr in attributes
            ]

        maxIndex = gain.index(max(gain))
        rootAttribute = attributes[maxIndex]

        # Create an empty tree, to be populated in a moment
        tree = {
            rootAttribute: {}
        }  # Initiate the tree with best attribute as a node
        remainingAttributes = [i for i in attributes if i != rootAttribute]

        for attr_val, data_subset in df.groupby(rootAttribute):
            subtree = id3Algorithm(data_subset, heuristic, finalAttribute,
                                   remainingAttributes, default_class)
            tree[rootAttribute][attr_val] = subtree
        return tree
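
Both the `Entropy(...).value()` calls in the feature-entropy methods above and `Entropy.information_gain(df, attr, finalAttribute)` in `id3Algorithm` refer to helpers that are not shown here. A self-contained sketch of what they conventionally compute in ID3 (Shannon entropy of the class column and the resulting information gain), assuming a pandas DataFrame as in the snippet:

import math
from collections import Counter

def class_entropy(labels):
    # H(S) = -sum_i p_i * log2(p_i) over the class label distribution.
    total = float(len(labels))
    counts = Counter(labels)
    return -sum((c / total) * math.log(c / total, 2) for c in counts.values())

def information_gain(df, attribute, target):
    # Gain(S, A) = H(S) - sum_v (|S_v| / |S|) * H(S_v), where S_v is the subset
    # of rows with A == v (df.groupby gives exactly those subsets).
    base = class_entropy(df[target])
    weighted = sum((len(subset) / float(len(df))) * class_entropy(subset[target])
                   for _, subset in df.groupby(attribute))
    return base - weighted
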
Example #7
from pox.lib.packet.ipv4 import ipv4
from pox.lib.packet.arp import arp
from pox.lib.addresses import IPAddr, EthAddr
from pox.lib.util import str_to_bool, dpid_to_str
from pox.lib.recoco import Timer

from pox.core import core
import pox.openflow.libopenflow_01 as of

from pox.lib.revent import *
import itertools
import time

from entropy import Entropy

my_dictionary = {}
my_entropy = Entropy()
set_Timer = False
defendDDOS = False

log = core.getLogger()
FLOW_IDLE_TIMEOUT = 10
ARP_TIMEOUT = 60 * 2
MAX_BUFFERED_PER_IP = 5
MAX_BUFFER_TIME = 5


class Entry(object):
    def __init__(self, port, mac):
        self.timeout = time.time() + ARP_TIMEOUT
        self.port = port
        self.mac = mac
    def __init__(self, features_list, labels_list):
        self.features_list = features_list
        self.labels_list = labels_list
        self.data_set_entries_count = features_list.shape[0]
        self.number_of_features = features_list.shape[1] - 1
        self.base_entropy = Entropy(features_list, labels_list).value()
Example #9
def est_all_data_disc_version(algorithm, tab_file, min_support=-30, iterations=1, only_interesting_triples=False, restricted_triples=None, extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.','') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)
    print "\n### Running cross validation on ALL DATA cv_{}###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total total_transactions: ', total_transactions
    sample_size = total_transactions

    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []
    for index in range(iterations):

        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, tab_file, sample_freq_name, '-s' + str(min_support), '-n3']
        call(args)
        print 'fpgrowth on sample data (ALL DATA) done: {} secs'.format(time()-borgelt_start)


        freq = Borgelt.read_frequent_items(sample_freq_name)
        # Create ds of all observed triplets
        # Saved as sorted keys for lookup,
        # and their frequency as value
        observed = {}
        count = 0
        for item in freq:
            if len(item[0]) == 3:
                sorted_trip = triple_sort(item[0])
                # * 2, horrible hack to make Forward calculate the
                # observed frequency correctly.
                observed[sorted_trip] = item[1][0] * 2
        print 'Total triplets observed:', len(observed)

        # Check any frequent items were found
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        min_support_trips = min_supported_trips(min_support, total_transactions)
        print 'Forward min_support_trips set to: ', min_support_trips
        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(sample_freq_name, min_support_trips, observed, only_interesting_triples, restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)

        #del sample_freq

        estimates = []
        extrapolations = []
        heurestics = []
        observations = []
        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        triangle_counts = []
        pair_triple_ratios = []

        # Recursion for estimate to converge
        req_depth = int(math.log(total_transactions, 2))+1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]

                    triangle = (n1, n2, n3)
                    triplets.append(triangle)

                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))

                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)

                    # Observed is the triple support, since sample is all data
                    obs = s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(total_transactions), num=req_depth)

                    # extrapolation estimate, does not make sense for all data
                    est2 = s123 / float(sample_size) * (total_transactions)

                    # heuristic: use max_ent when the triple count in the sample is 0; does not make sense for all data
                    # est3 = s123 == 0 and est or est2

                    estimates.append(est)
                    extrapolations.append(est2)  # needed by the extrapolation TSV written below
                    # heurestics.append(est3)
                    observations.append(obs)

                    # MAPE error max ent
                    error = abs(obs-est) / math.sqrt(obs)
                    MAPE_errors.append(error)
                    # MAPE error extrapolation
                    error2 = abs(obs-est2) / math.sqrt(obs)
                    MAPE_errors_ext.append(error2)
                    # MAPE error heurestic
                    # error3 = abs(obs-est3) / float(obs) * 100
                    # MAPE_errors_heu.append(error3)

        
        del triangle_tree
        del sample_triples
                    
        if len(MAPE_errors) > 0: #TODO handle this, probably when nothing has been found

            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)

            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)

            # extrapolation error
            # avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            # avg_errors_ext.append(avg_error_ext)
            
            # heurestic error
            # avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            # avg_errors_heu.append(avg_error_heu)
            
            # variance
            var_error = var(MAPE_errors)
            # var_error_ext = tvar(MAPE_errors_ext)
            # var_error_heu = tvar(MAPE_errors_heu)

            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)

            # ext confidence interval
            # std_dev_ext = math.sqrt(var_error_ext)
            # std_error_ext = std_dev_ext / math.sqrt(sample_size)
            # span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            # span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

            # heurestic confidence interval
            # std_dev_heu = math.sqrt(var_error_heu)
            # std_error_heu = std_dev_heu / math.sqrt(sample_size)
            # span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            # span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)

            var_errors.append(var_error)
            # var_errors_ext.append(var_error_ext)
            # var_errors_heu.append(var_error_heu)
            
            res_string = "\nResult ALL DATA({}):\nSample size:{} triangles:{} test_data:{}\n".format(index, sample_size, len(estimates), sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))

            # Extrapolation statistics are not computed in this ALL DATA variant
            # (see the commented-out avg_error_ext / span_*_ext code above), so skip logging them.
            # res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            # res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            # res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))

            # res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            # res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            # res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))

            with open(path + 'log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                # triplet_key = ['triple' for t in estimates]
                # est_key = ['est' for t in estimates]
                # obs_key = ['obs' for t in observations]
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))
            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    fd.write('\t'.join(map(str, [estimates[_index], observations[_index],
                                                 triplets[_index][0], triplets[_index][1], triplets[_index][2],
                                                 pair_triple_ratios[_index]] + list(triangle_counts[_index]))) + '\n')
            with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    fd.write('\t'.join(map(str, [extrapolations[_index], observations[_index],
                                                 triplets[_index][0], triplets[_index][1], triplets[_index][2],
                                                 pair_triple_ratios[_index]] + list(triangle_counts[_index]))) + '\n')
            del estimates
            del observations

            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)

        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    if len(avg_errors) > 0:
        total_avg_error = sum(avg_errors)/float(len(avg_errors))
        total_res_string = "Avg error:{}".format(total_avg_error)
        print total_res_string
Example #10
def img_analysis(image_path: str):

    counter = Entropy()

    with open(image_path, 'rb+') as f:
        # read the header
        id_length = one_byte(f)
        colour_map_type = one_byte(f)
        image_type = one_byte(f)
        # colour map specification
        first_entry_index = byte_list(f, 2)
        colour_map_length = byte_list(f, 2)
        colour_map_entry_size = ord(f.read(1))
        # image specification
        x_origin = byte_list(f, 2)
        y_origin = byte_list(f, 2)
        image_width = int_from_bytes(byte_list(f, 2))
        image_height = int_from_bytes(byte_list(f, 2))
        pixel_depth = one_byte(f)
        image_descriptor = one_byte(f)

        # create a two-line pixel buffer
        # the pixel on the most left is always black
        # 0 -> top row
        # 1 -> current row
        buffer = [[(0, 0, 0) for _ in range(0, image_width + 1)]
                  for _ in [1, 2]]
        # load the first row of pixels
        for pixel in range(1, image_width + 1):
            # for every pixel load three bytes representing BGR colours
            buffer[0][pixel] = byte_list(f, 3)

        for line in range(0, image_height):
            # take the top row as the current one
            # (swap the row references instead of copying them)
            buffer[1], buffer[0] = buffer[0], buffer[1]
            # and load another row on top
            if line != image_height - 1:
                for pixel in range(1, image_width + 1):
                    buffer[0][pixel] = byte_list(f, 3)
            else:
                # if this is the last row, the top row needs to be
                # a row of black pixels
                for pixel in range(1, image_width + 1):
                    buffer[0][pixel] = (0, 0, 0)

            # loop through the loaded pixels
            for i in range(1, image_width + 1):
                pixel = buffer[1][i]
                west = buffer[1][i - 1]
                north = buffer[0][i]
                northwest = buffer[0][i - 1]

                # do all the predictions

                # \hat{X} = (0,0,0)
                counter.register_char('normal', pixel)
                # \hat{X} = W
                hat_x = west
                counter.register_char('W', subtract_pixels(pixel, hat_x))
                # \hat{X} = N
                hat_x = north
                counter.register_char('N', subtract_pixels(pixel, hat_x))
                # \hat{X} = NW
                hat_x = northwest
                counter.register_char('NW', subtract_pixels(pixel, hat_x))
                # \hat{X} = N + W - NW
                hat_x = subtract_pixels(add_pixels(north, west), northwest)
                counter.register_char('N + W - NW',
                                      subtract_pixels(pixel, hat_x))
                # \hat{X} = N + (W - NW)/2
                hat_x = add_pixels(
                    north, scale_pixel(subtract_pixels(west, northwest), 0.5))
                counter.register_char('N + (W - NW)/2',
                                      subtract_pixels(pixel, hat_x))
                # \hat{X} = W + (N - NW)/2
                hat_x = add_pixels(
                    west, scale_pixel(subtract_pixels(north, northwest), 0.5))
                counter.register_char('W + (N - NW)/2',
                                      subtract_pixels(pixel, hat_x))
                # \hat{X} = (N + W)/2
                hat_x = scale_pixel(add_pixels(north, west), 0.5)
                counter.register_char('(N + W)/2',
                                      subtract_pixels(pixel, hat_x))
                # new standard
                hat_x = [0, 0, 0]
                # perform the algorithm for every colour channel separately
                # (use c, not i, so the pixel index above is not shadowed)
                for c in range(0, 3):
                    if northwest[c] >= max(west[c], north[c]):
                        hat_x[c] = max(west[c], north[c])
                    elif northwest[c] <= min(west[c], north[c]):
                        hat_x[c] = min(west[c], north[c])
                    else:
                        hat_x[c] = west[c] + north[c] - northwest[c]
                hat_x = tuple(hat_x)
                counter.register_char('new standard',
                                      subtract_pixels(pixel, hat_x))

    return counter
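
The helpers `one_byte`, `byte_list`, `int_from_bytes`, `add_pixels`, `subtract_pixels` and `scale_pixel` are defined elsewhere in the project; the versions below are hypothetical reconstructions based only on how they are called above (TGA header fields are little-endian, pixel arithmetic is per-channel), and the real ones may differ:

def one_byte(f):
    # Read a single header byte and return it as an int.
    return ord(f.read(1))

def byte_list(f, n):
    # Read n raw bytes and return them as a tuple of ints.
    return tuple(ord(f.read(1)) for _ in range(n))

def int_from_bytes(b):
    # Interpret a 2-byte little-endian field (e.g. image width/height).
    return b[0] + (b[1] << 8)

def add_pixels(a, b):
    # Per-channel addition modulo 256.
    return tuple((x + y) % 256 for x, y in zip(a, b))

def subtract_pixels(a, b):
    # Per-channel difference modulo 256 (prediction residual).
    return tuple((x - y) % 256 for x, y in zip(a, b))

def scale_pixel(p, factor):
    # Per-channel scaling, truncated to an int.
    return tuple(int(x * factor) for x in p)
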
Example #11
    def make_frequency_dict(self, text):
        ent = Entropy(self.path)
        ent.HBA()
        return ent.pairs
Example #12
def Construct_Vector(mystr, conn):
    vec = []

    removed_protocol = re.sub(
        r'^http(s*)://', '',
        mystr)  # Removed Protocol in a given URL using Python Regex

    vec.append(len(removed_protocol))  # append length of URL to the Vector
    vec.append(Total_Dots(
        removed_protocol))  # append Number of Dots in URL to the Vector

    # Checking for Presence of Suspicious Words in URL
    for i in Suspicious_Words:
        if re.search(i, removed_protocol, re.IGNORECASE):
            vec.append(1)  # security sensitive word present so append 1
            break
    else:
        vec.append(0)  # security sensitive word not present so append 0

    patt = r'^[^/]*'  # pattern to extract domain from the URL
    patt_path = r'/[^/]*'  # pattern to extract path of URL
    dom = re.match(patt, removed_protocol).group(0)
    info = re.findall(patt_path, removed_protocol)
    # print('Domain Name: ',dom)
    dom_hyph_count = no_of_hyphens_in_domain(dom)
    vec.append(int(dom_hyph_count)
               )  # Appending Number of hyphens in Domain of URL to the Vector

    domain_tokens = dom.split('.')  # split the domain by the periods
    domain_tokens = [x for x in domain_tokens
                     if x != '']  # Removing Null Values (if Any)
    # print('Domain Length: ',len(dom))

    path_tokens = [re.sub('/', '', x) for x in info]
    if path_tokens != []:
        file_n_args = path_tokens[-1]
    else:
        file_n_args = ''
    path_tokens = path_tokens[:-1]
    info = [x for x in info if x != '']
    slashes = len(info)
    # print('Slashes:',slashes)
    dir_len = 0
    for i in path_tokens:
        dir_len += len(i)
    dir_len += slashes
    vec.append(
        int(dir_len))  # Appending Directory length to the URL to the Vector
    # print('Directory Length: ',dir_len)

    num_subdir = len(path_tokens)
    # print('Number of Subdirectories :',num_subdir)
    vec.append(
        num_subdir
    )  # Appending Number of Subdirectories Present in the URL to the Vector
    # print('Path Tokens : ',path_tokens)

    TLD = domain_tokens[-1]
    # print('Top Level Domain :',TLD)
    vec.append(len(dom))  # Domain Length
    vec.append(len(domain_tokens))  # Domain Token Count
    vec.append(len(path_tokens))  # Path Token Count

    # does the url contain an IP address
    has_ip = ip_presence(removed_protocol)
    vec.append(has_ip)  # Presence of ip address Yes:1, No:0

    # get the alexa page rank
    has_alexa_rank = alexa_pagerank(dom, conn)
    vec.append(has_alexa_rank)

    # does page use ssl
    uses_https = check_https(mystr)
    vec.append(uses_https)

    # get country code and domain age calc
    country_code, dom_age_gt_1year = get_ip_info(dom)
    vec.append(country_code)

    # domain age gt 1 year
    vec.append(dom_age_gt_1year)

    # bag of words for word occurances
    word = bag_of_words(mystr)
    vec.append(word)

    # entropy of URL
    ent = Entropy(mystr)
    entropy = ent.H(mystr)
    vec.append(entropy)

    # count of special characters
    characters = special_chars(mystr)
    vec.append(characters)

    domain_tok_lengths = []
    for i in domain_tokens:
        domain_tok_lengths.append(len(i))
    largest_dom_token_len = max(domain_tok_lengths)
    vec.append(largest_dom_token_len)  # Largest Domain Token Length

    avg_dom_Tok_len = round(
        (float(sum(domain_tok_lengths)) / len(domain_tok_lengths)), 2)

    vec.append(avg_dom_Tok_len)  # Average Domain Token Length

    path_tok_lengths = []
    path_tok_dots = 0
    path_tok_delims = 0
    avg_path_Tok_len = 0
    largest_path_token_len = 0
    if len(path_tokens):
        for i in path_tokens:
            path_tok_lengths.append(len(i))
            path_tok_dots = Total_Dots(i)
            path_tok_delims = Total_Delims(i)
        avg_path_Tok_len = round(
            (float(sum(path_tok_lengths)) / len(path_tok_lengths)), 2)
        largest_path_token_len = max(path_tok_lengths)
        vec.append(largest_path_token_len)  # Largest Path Token Length
        vec.append(avg_path_Tok_len)  # Average Path Token Length
    else:
        vec.append(largest_path_token_len
                   )  # Largest Path Token Length :0 (No, Path Tokens)
        vec.append(
            avg_path_Tok_len)  # Average Path Token Length :0 (No, Path Tokens)
    # print('Largest Path Token Length:',largest_path_token_len)
    # print('Path Token Total Dots:',path_tok_dots)
    # print('Path Token Delims:',path_tok_delims)
    if has_ip:
        vec.append(0)  # Ip address present so no suspicious TLD
    else:
        for i in Suspicious_TLD:
            if re.search(i, TLD, re.IGNORECASE):
                vec.append(1)  # Suspicious TLD
                break
        else:
            vec.append(0)  # Non Suspicious TLD
    if file_n_args != '':

        # Define Condition whether file and arguments present in the URL
        # POST arguments are conditions passed after the ?
        # file (filenames) are items such as index.html
        tmp = file_n_args.split('?')
        file = tmp[0]
        if len(tmp) > 1:
            args = tmp[1]
        else:
            args = ''
        # print('File:',file)
        # print('Arguments:',args)
        if not file:
            vec.append(0)
        else:
            vec.append(1)
        vec.append(len(file))  # Length of file
        vec.append(Total_Dots(file))  # Total_Dots in file name
        vec.append(Total_Delims(file))  # Total_Delims in file name
        # print('Total dots in file: ',Total_Dots(file))
        # print('Total Delims in file: ',Total_Delims(file))

        if args == '':
            # Checking if any POST arguments present in the URL or not
            vec.append(0)  # no arguments present in url
            vec.append(0)  # Length of Argument Appended to the Vector
            vec.append(0)  # Number of Variables Appended to the Vector
            vec.append(
                0)  # Length of largest variable value Appended to the Vector
            vec.append(0)  # Maximum number of Delims Appended to the Vector
        # print('argument length:',0)
        # print('number of arguments:',0)
        # print('length of Largest variable value:',0)
        # print('Maximun no of delims:',0)

        else:
            # indicated Presence of POST arguments in the URL
            vec.append(1)  # arguments are present
            vec.append(len(args) +
                       1)  # Length of Argument Appended to the Vector
            # print('argument length:',len(args)+1)
            arb = args.split('&')
            vec.append(len(arb))  # Number of Arguments Appended to the Vector
            # print('Number of arguments',len(arb))
            len_var = []
            max_delim = []
            for i in arb:
                # Spliting POST Arguments around '=' sign
                tmp = i.split('=')
                if len(tmp) > 1:
                    len_var.append(len(tmp[1]))
                    max_delim.append(Total_Delims(tmp[0]))
                    max_delim.append(Total_Delims(tmp[1]))
                else:
                    len_var.append(0)
                    max_delim.append(0)
            vec.append(max(len_var))  # Length of Largest variable value
            # print('length of Largest variable value:',max(len_var))
            max_delim = max(max_delim)
        vec.append(max_delim)  # Maximum number of Delimiters

        # print('Maximum no of delims:',max_delim)

    else:

        # Defines condition to the corresponding if that File and Arguments are not Present in the URL so
        # Just Append 0 to the corresponding Parameter in the Vector
        vec.append(0)  # has file name in url
        vec.append(0)  # Length of file Appended to the Vector
        vec.append(0)  # Total_Dots in file name Appended to the Vector
        vec.append(0)  # Total_Delims in file name Appended to the Vector
        vec.append(0)  # has arguments appended to url
        vec.append(0)  # Length of Argument Appended to the Vector
        vec.append(0)  # Number of Variables Appended to the Vector
        vec.append(0)  # Length of largest variable value Appended to the Vector
        vec.append(0)  # Maximum number of Delims Appended to the Vector
    # print('argument length:',0)
    # print('number of arguments:',0)

    return vec
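
`Entropy(mystr).H(mystr)` is an external class; what it most plausibly contributes here is the Shannon entropy of the URL's character distribution. A minimal stand-in (an assumption, not the project's actual implementation) would be:

import math
from collections import Counter

def url_entropy(url):
    # Shannon entropy, in bits per character, of the URL's character distribution.
    counts = Counter(url)
    total = float(len(url))
    return -sum((c / total) * math.log(c / total, 2) for c in counts.values())
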
Example #13
            else:
                print("invalid --mode")
                exit(usage_help)

    if mode == "e":
        encoded = list(
            map(
                # offsetting the numbers by one because of universal coding limitations
                lambda x: x + 1,
                LZW.encode(input_file)))

        coding.encode(encoded, output_file)

        # print the stats
        print("encoded number list entropy:",
              Entropy.encoded_file_entropy(encoded))
        print("original file entropy      :",
              Entropy.original_file_entropy(input_file))

        original_size = os.path.getsize(input_file)
        encoded_size = os.path.getsize(output_file)
        print("original file size         :", original_size)
        print("encoded file size          :", encoded_size)
        print("compression rate           :", original_size / encoded_size)

    elif mode == "d":
        decoded = list(
            map(
                # offsetting the numbers by one because of universal coding limitations
                lambda x: x - 1,
                coding.decode(input_file)))
Example #14
            "Ex. >> python decision.py training_set.csv test_set.csv yes entropy"
        )
        sys.exit()

    #PATH = "dataset1/"
    PATH = "dataset1/"
    trainingData = pd.read_csv(PATH + sys.argv[1])
    testingData = pd.read_csv(PATH + sys.argv[2])

    # Getting list of attributes except 'Class'
    attributes = list(trainingData.columns)
    attributes.remove('Class')
    entropy, variance = 0, 0
    answer = []

    total_entropy = Entropy.entropy_of_list(trainingData['Class'])
    tree_entropy = id3Algorithm(trainingData, "entropy", 'Class', attributes)
    trainingData['predicted'] = trainingData.apply(classifyDataset,
                                                   axis=1,
                                                   args=(tree_entropy, 0))
    train_tree = id3Algorithm(trainingData, "entropy", 'Class', attributes)
    testingData['predicted2'] = testingData.apply(classifyDataset,
                                                  axis=1,
                                                  args=(train_tree, 1))
    entropy = sum(testingData['Class'] == testingData['predicted2']) / (
        1.0 * len(testingData.index))

    total_variance = Variance.calculate_variance(trainingData['Class'])
    tree_variance = id3Algorithm(trainingData, "variance", 'Class', attributes)
    trainingData['predicted'] = trainingData.apply(classifyDataset,
                                                   axis=1,
    def __init__(self, data_set):
        self.data_set = data_set
        self.data_set_entries_count = len(data_set)
        self.number_of_features = len(data_set[0]) - 1
        self.base_entropy = Entropy(data_set).value()
Example #16
#!/usr/bin/env python3

import sys
from entropy import Entropy

if len(sys.argv) != 3:
    print("Correct usage: python3 run.py </path/to/file> <N>")
else:
    file_path = sys.argv[1]
    n = int(sys.argv[2])

    elements = []
    with open(file_path, 'r') as file:
        for line in file:
            elements.extend(list(line))

    entropy = Entropy(elements)
    # If memory equals N, we need to consider N + 1 elements at a time.
    print(f"Entropy with N = {n}: {entropy.entropy(n + 1)}")
def cross_validation(transactions, sample_pct=0.50, support=-3, all_frequent_items=None):
    """
    Cross validation, 'old' version not using compact
    triangle representation from Forward.
    """
    from fim import fpgrowth
    # init
    _id = str(time()).replace('.','')
    # if all_frequent_items is None:
    #     all_frequent_items = fpgrowth(transactions, supp=support, min=1, max=3)

    cv_start = time()
    print "\n### Running cross validation {}###".format(_id)
    print "Total transactions:{}".format(len(transactions))
    # print "Total frequest items:{}".format(len(all_frequent_items))

    # run results
    avg_errors = []
    var_errors = []

    # all_triangles, all_triples = filter_items(all_frequent_items)

    for chunk, index, rest in chunks(transactions, int(len(transactions) * sample_pct)):# TODO insert proper sampling

        all_frequent_items = fpgrowth(rest, supp=support, min=1, max=3)
        all_triangles, all_triples = Forward.forward(all_frequent_items)

        # Get triples for estimates
        frequent_items = fpgrowth(chunk, supp=support, min=1, max=3)
        if len(frequent_items) > 0:
            print 'frequent items: {}'.format(len(frequent_items))
        else:
            print 'No frequent items in chunk: {}'.format(index)
            continue
        triangles, triples = Forward.forward(frequent_items)
        print 'triangles: {}'.format(len(triangles))

        estimates = []
        observations = []
        abs_errors = []
        max_est = 0
        max_obs = 0

        for (s1, s2, s3, s12, s23, s13, s123) in triangles:

            # if s123[1] != 0:
            #     continue
            # maxent estimate from the sample.
            # Index [1] of the tuples holds the # of occurrences in the sample
            est = ent.maxent_est_rosa(s1[1], s2[1], s3[1], s12[1], s23[1], s13[1], float(len(transactions)-len(chunk)), num=int(math.log(len(transactions), 2))+1)

            # maximum estimate seen (for plotting)
            max_est = max(max_est, est)

            # record the estimate
            estimates.append(est)

            # from all observed triples get the actual observed number of triples
            observed = 0
            if all_triples.has_key(s123[0]):
                observed = all_triples[s123[0]]

            # maximum observation of the triple (for plotting)
            max_obs = max(max_obs, observed)

            # record the observed
            observations.append(observed)

            # record abs error
            error = abs(observed-est) / float(observed) * 100
            abs_errors.append(error)



        if len(abs_errors) > 0: #TODO handle this, probably when nothing has been found
            # evaluation
            min_error = min(abs_errors)
            max_error = max(abs_errors)
            avg_error = sum(abs_errors) / float(len(abs_errors))
            avg_errors.append(avg_error)
            var_error = 0
            if len(abs_errors) > 1:
                var_error = tvar(abs_errors) #tvar is the sample variance
            var_errors.append(var_error)

            # TODO histogram of the average errors: max-ent, extrapolation, heuristic
            # TODO print average error of the average errors to the log.

            res_string = "\nResult:\nSample size:{} min_error:{} max_error:{} avg_error:{} var_error:{}".format(len(chunk), min_error, max_error, avg_errors[-1], var_error)
            print res_string
        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    total_avg_error = sum(avg_errors)/float(len(avg_errors))
    total_res_string = "Avg error:{}".format(total_avg_error)
    print total_res_string
def cross_validation_compact(transactions, sample_pct=0.50, support=-3, all_frequent_items=None):
    """
    Cross validation. Using compact representation from
    Forward.
    """
    from fim import fpgrowth
    # init
    _id = str(time()).replace('.','')
    # if all_frequent_items is None:
    #     all_frequent_items = fpgrowth(transactions, supp=support, min=1, max=3)

    cv_start = time()
    print "\n### Running cross validation {}###".format(_id)
    print "Total transactions:{}".format(len(transactions))
    # print "Total frequest items:{}".format(len(all_frequent_items))

    # run results
    avg_errors = []
    var_errors = []

    # all_triangles, all_triples = filter_items(all_frequent_items)

    for chunk, index, rest in chunks(transactions, int(len(transactions) * sample_pct)):# TODO insert proper sampling

        all_frequent_items = fpgrowth(rest, supp=support, min=1, max=3)
        all_triangles, all_triples = Forward.forward_compact(all_frequent_items)

        # Get triples for estimates
        frequent_items = fpgrowth(chunk, supp=support, min=1, max=3)
        if len(frequent_items) > 0:
            print 'frequent items: {}'.format(len(frequent_items))
        else:
            print 'No frequent items in chunk: {}'.format(index)
            continue
        triangle_tree, triples = Forward.forward_compact(frequent_items)
        print 'triangle roots: {}'.format(len(triangle_tree))

        estimates = []
        observations = []
        abs_errors = []
        max_est = 0
        max_obs = 0

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]

                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(len(transactions)-len(chunk)), num=int(math.log(len(transactions), 2))+1)

                    # maximum estimate seen (for plotting)
                    max_est = max(max_est, est)

                    # record the estimate
                    estimates.append(est)

                    # from all observed triples get the actual observed number of triples
                    observed = 0
                    if all_triples.has_key((n1, n2, n3)):
                        observed = all_triples[(n1, n2, n3)]

                    # maximum observation of the triple (for plotting)
                    max_obs = max(max_obs, observed)

                    # record the observed
                    observations.append(observed)

                    # record abs error
                    error = abs(observed-est) / float(observed) * 100
                    abs_errors.append(error)


        if len(abs_errors) > 0: #TODO handle this, probably when nothing has been found
            # evaluation
            min_error = min(abs_errors)
            max_error = max(abs_errors)
            avg_error = sum(abs_errors) / float(len(abs_errors))
            avg_errors.append(avg_error)
            var_error = 0
            if len(abs_errors) > 1:
                var_error = tvar(abs_errors) #tvar is the sample variance
            var_errors.append(var_error)

            res_string = "\nResult:\nSample size:{} min_error:{} max_error:{} avg_error:{} var_error:{}".format(len(chunk), min_error, max_error, avg_errors[-1], var_error)
            print res_string
        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    total_avg_error = sum(avg_errors)/float(len(avg_errors))
    total_res_string = "Avg error:{}".format(total_avg_error)
    print total_res_string
def cross_validate_disc_version(algorithm, tab_file, min_support=-30, sample_pct=0.1, iterations=1, only_interesting_triples=False, restricted_triples=None, extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.','') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)
    print "\n### Running cross validation cv_{}###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total total_transactions: ', total_transactions

    # Get the total observed triples
    borgelt_start = time()
    observed_file_name = path + 'observed_frequent_items.out'
    args = [algorithm, tab_file, observed_file_name, '-s' + str(min_support), '-n3']
    # pro = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
    # os.killpg(pro.pid, signal.SIGTERM)
    call(args)
    # sleep(20)
    print 'fpgrowth on all data done: {} secs'.format(time()-borgelt_start)

    freq = Borgelt.read_frequent_items(observed_file_name)

    # Create ds of all observed triplets
    # Saved as sorted keys for lookup,
    # and their frequency as value
    observed = {}
    count = 0
    for item in freq:
        if len(item[0]) == 3:
            sorted_trip = triple_sort(item[0])
            observed[sorted_trip] = item[1][0]
    print 'Total triplets observed:', len(observed)
    average_observed = sum(observed.values()) / float(len(observed))
    print 'Baseline: ', average_observed

    del freq

    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []
    avg_errors_ind = []
    var_errors_ind = []
    avg_errors_baseline = []

    occurrences = [0 for i in range(100)]
    max_ent_acc_error = [0 for i in range(100)]
    ext_acc_error = [0 for i in range(100)]
    ind_acc_error = [0 for i in range(100)]
    heu_acc_error = [0 for i in range(100)]
    baseline_acc_error = [0 for i in range(100)]

    # Record triple counts for the best estimates
    max_ent_best = Counter()
    ext_best = Counter()
    ind_best = Counter()

    for index in range(iterations):

        # Create sample file
        sampling_start = time()
        if sample_pct > 0:
            sample_size = int(total_transactions * sample_pct)
        else:
            sample_size = abs(sample_pct)
        test_data_size = total_transactions - sample_size
        sample = random.sample(range(total_transactions), sample_size)
        assert len(sample) == sample_size, 'Sample size not equal to sample'
        sample.sort()
        sample_file_name = path + str(index) + '_sample.tab'
        with open(sample_file_name, 'a') as sample_file:
            sample_line = 0
            for line_num, line in enumerate(open(tab_file, 'rb')):
                if line_num == sample[sample_line]:
                    sample_file.write(line)
                    sample_line += 1
                    if sample_line == sample_size:
                        break

        del sample
        print 'Sample size: {} time: {}'.format(sample_size, time() - sampling_start)
        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, sample_file_name, sample_freq_name, '-s-1', '-n3']
        call(args)
        print 'fpgrowth on sample data done: {} secs'.format(time()-borgelt_start)

        # Check any frequent items were found
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        min_support_trips = min_supported_trips(min_support, test_data_size)
        print 'Forward min_support_trips set to: ', min_support_trips
        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(sample_freq_name, min_support_trips, observed, only_interesting_triples, restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)

        #del sample_freq

        estimates = []
        extrapolations = []
        independences = []
        heurestics = []
        baselines = []
        observations = []

        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        MAPE_errors_ind = []
        MAPE_errors_heu = []
        MAPE_errors_baseline = []
        true_errors = []
        pair_triple_ratios = []

        triangle_counts = []

        # s1_list = []
        # s2_list = []
        # s3_list = []
        # s12_list = []
        # s13_list = []
        # s23_list = []

        # Recursion for estimate to converge
        req_depth = int(math.log(total_transactions, 2)) + 1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]

                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))

                    triangle = (n1, n2, n3)

                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)

                    # Get the obs (test data) frequency minus those found in the sample (training data)
                    obs = 0
                    if triangle in observed:
                         # (triples in data) - (triples in sample). Calculating the number of triples in test data.
                        obs = observed[triangle] - s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(sample_size), num=req_depth) * (test_data_size / float(sample_size))

                    if est < 0:
                        print 'max ent below 0'
                        print 's1 s2 s3 s12 s13 s23 s123', (s1, s2, s3, s12, s23, s13, s123)

                    # extrapolation estimate
                    est2 = s123 / float(sample_size) * test_data_size

                    # independence estimate
                    est3 = (s1 / float(sample_size)) * (s2 / float(sample_size)) * (s3 / float(sample_size)) * test_data_size
                    # est3 = (s1*s2*s3)/float(sample_size*sample_size) * test_data_size/float(sample_size)

                    # heuristic: use the max-ent estimate when the triple is rare in the sample (s123 < 5), otherwise extrapolate
                    est4 = s123 < 5 and est or est2

                    # baseline estimate
                    est5 = average_observed

                    estimates.append(est)
                    extrapolations.append(est2)
                    independences.append(est3)
                    heurestics.append(est4)
                    baselines.append(est5)
                    observations.append(obs)
                    triplets.append(triangle)
                    # TODO Why save these? They already exist in the triangle tree
                    # (and take up a lot of space).
                    # s1_list.append(s1)
                    # s2_list.append(s2)
                    # s3_list.append(s3)
                    # s12_list.append(s12)
                    # s13_list.append(s13)
                    # s23_list.append(s23)
                    #end TODO

                    # MAPE error max ent
                    error = abs(obs-est) / math.sqrt(obs) # * 100
                    MAPE_errors.append(error)
                    true_errors.append(obs-est)

                    # MAPE error extrapolation
                    error2 = 0
                    if est2 > 0:
                        error2 = abs(obs-est2) / math.sqrt(obs) # * 100
                    MAPE_errors_ext.append(error2)

                    # MAPE error independence
                    error3 = abs(obs-est3) / math.sqrt(obs) # * 100
                    MAPE_errors_ind.append(error3)

                    # MAPE error heuristic
                    error4 = abs(obs-est4) / math.sqrt(obs) # * 100
                    MAPE_errors_heu.append(error4)

                    # MAPE baseline error
                    error5 = abs(obs-est5) / math.sqrt(obs) #* 100
                    MAPE_errors_baseline.append(error5)

                    # Record error for the estimate that performed best
                    if error < error2 and error < error3:
                        max_ent_best[s123] += 1
                    elif error2 < error and error2 < error3:
                        ext_best[s123] += 1
                    else:
                        ind_best[s123] += 1

                    try:
                        occurrences[s123] += 1
                        max_ent_acc_error[s123] += error
                        ext_acc_error[s123] += error2
                        ind_acc_error[s123] += error3
                        heu_acc_error[s123] += error4
                        baseline_acc_error[s123] += error5
                    except IndexError, ie:
                        pass


        # print 'true errors: ', true_errors
        # print 'estimates: ', estimates
        # print 'observed: ', observed
        # print 'mape ', MAPE_errors
        del triangle_tree
        del sample_triples

        if len(MAPE_errors) > 0: #TODO handle this, probably when nothing has been found

            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)

            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)

            # extrapolation error
            avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            avg_errors_ext.append(avg_error_ext)

            # independence error
            avg_error_ind = sum(MAPE_errors_ind) / float(len(MAPE_errors_ind))
            avg_errors_ind.append(avg_error_ind)

            # heurestic error
            avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            avg_errors_heu.append(avg_error_heu)

            # baseline error
            avg_error_baseline = sum(MAPE_errors_baseline) / float(len(MAPE_errors_baseline))
            avg_errors_baseline.append(avg_error_baseline)

            var_error = 0
            var_error_ext = 0
            var_error_heu = 0
            var_error_ind = 0
            # variance
            if len(MAPE_errors) > 1:
                var_error = tvar(MAPE_errors) #tvar is the sample variance
                var_error_ext = tvar(MAPE_errors_ext)
                var_error_heu = tvar(MAPE_errors_heu)
                var_error_ind = tvar(MAPE_errors_ind)


            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)

            # ext confidence interval
            std_dev_ext = math.sqrt(var_error_ext)
            std_error_ext = std_dev_ext / math.sqrt(sample_size)
            span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

            # independence confidence interval
            std_dev_ind = math.sqrt(var_error_ind)
            std_error_ind = std_dev_ind / math.sqrt(sample_size)
            span_99_ind = norm.interval(0.99, avg_error_ind, std_error_ind)
            span_95_ind = norm.interval(0.95, avg_error_ind, std_error_ind)

            # heurestic confidence interval
            std_dev_heu = math.sqrt(var_error_heu)
            std_error_heu = std_dev_heu / math.sqrt(sample_size)
            span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)

            var_errors.append(var_error)
            var_errors_ext.append(var_error_ext)
            var_errors_heu.append(var_error_heu)
            var_errors_ind.append(var_error_ind)

            res_string = "\nResult ({}):\nSample size:{} triangles:{} test_data:{}\n".format(index, sample_size, len(estimates), total_transactions-sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))

            res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))

            res_string += 'avg_error_ind:{} var_error_ind:{}\n'.format(avg_error_ind, var_error_ind)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ind))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ind))

            res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))

            res_string += 'avg_error_baseline:{}\n'.format(avg_error_baseline)

            with open(path + str(index) + '_log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                # triplet_key = ['triple' for t in estimates]
                # est_key = ['est' for t in estimates]
                # obs_key = ['obs' for t in observations]
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))
            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    fd.write('\t'.join(map(str, [estimates[_index], observations[_index],
                                                 triplets[_index][0], triplets[_index][1], triplets[_index][2],
                                                 pair_triple_ratios[_index]] + list(triangle_counts[_index]))) + '\n')
            with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    fd.write('\t'.join(map(str, [extrapolations[_index], observations[_index],
                                                 triplets[_index][0], triplets[_index][1], triplets[_index][2],
                                                 pair_triple_ratios[_index]] + list(triangle_counts[_index]))) + '\n')
            with open(path + str(index) + '_data_heurestic.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(heurestics):
                    fd.write('\t'.join(map(str, [heurestics[_index], observations[_index],
                                                 triplets[_index][0], triplets[_index][1], triplets[_index][2],
                                                 pair_triple_ratios[_index]] + list(triangle_counts[_index]))) + '\n')
            with open(path + str(index) + '_data_independence.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(independences):
                    fd.write('\t'.join(map(str, [independences[_index], observations[_index],
                                                 triplets[_index][0], triplets[_index][1], triplets[_index][2],
                                                 pair_triple_ratios[_index]] + list(triangle_counts[_index]))) + '\n')

            # Save the errors
            with open(path + str(index) + '_MAPE_errors.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors, fd)
            with open(path + str(index) + '_MAPE_errors_ext.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ext, fd)
            with open(path + str(index) + '_MAPE_errors_heu.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_heu, fd)
            with open(path + str(index) + '_MAPE_errors_ind.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ind, fd)
            with open(path + str(index) + '_MAPE_errors_baseline.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_baseline, fd)

            # Saves counts of all subsets of the triples.
            # TODO this code does not run!
            # with open(path + str(index) + '_data_correlations.tsv', 'w') as fd:
            #     fd.write('s1\ts2\ts3\ts12\ts13\ts23\n')
            #     for _index, i in enumerate(s123):
            #         fd.write(str(s1[_index]) + '\t' + str(s2[_index]) + '\t' + str(s3[_index]) + '\t' + str(s12[_index]) + '\t' + str(s13[_index]) + '\t'+ str(s23[_index]) + '\n')

            # Saves the independence estimate for all triples.
            # TODO Why s123[_index] in the denominator?
            # TODO What is a 'double independence estimate'?
            # TODO Why not calculate and save estimates the same way as ext and max_ent?
            # with open(path + str(index) + '_independence_estimate.tsv', 'w') as fd:
            #     fd.write('single independence estimate\tdouble independence estimate\n')
            #     for _index, i in enumerate(s123):
            #     	tempVal1 = sample_size/(s1[_index])
            #     	tempVal2=sample_size/(s2[_index])
            #     	tempVal3=sample_size/(s3[_index])
            #     	tempVal12=sample_size/(s12[_index])
            #     	tempVal13=sample_size/(s13[_index])
            #     	tempVal23=sample_size/(s23[_index])
            #         fd.write(str(s123[_index]/tempVal1*tempVal2*tempVal3*(total_transactions-sample_size) + '\t' + s123[_index]/tempVal12*tempVal13*tempVal23*(total_transactions-sample_size) + '\n'))


            del estimates
            del observations

            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)

        else:
            print 'No abs errors!'