def process_files(data_dir, percentiles, output_files, ip_to_use=None, replace_timeout=False, cloud=None):
    """Walk data_dir, gather per-IP latencies for each server and flush them.

    Directory handling, as implemented below:
      * A directory without files whose last path component is in
        common_module.SERVER_SET marks the start of a new server. The data
        gathered so far is handed to output_to_file and the state is reset.
      * A directory with files belongs to the server named by the
        second-to-last path component. Only files whose name starts with
        common_module.FILENAME_PREFIX are read.

    Each non-blank line is split on whitespace; the round id, IP address and
    latency are taken from the columns common_module.ROUND_COL_INDEX,
    IP_COL_INDEX and LATENCY_COL_INDEX. Latencies are stored per IP in a list
    indexed by round. Rounds from successive directories of the same server
    are appended one after another via a running offset, and when a round is
    seen more than once the minimum latency is kept.

    Args:
        data_dir: root directory handed to os.walk.
        percentiles: forwarded unchanged to output_to_file.
        output_files: forwarded unchanged to output_to_file.
        ip_to_use: forwarded unchanged to output_to_file.
        replace_timeout: forwarded unchanged to output_to_file.
        cloud: filter passed to common_module.check_cloud together with the
            server name; servers for which it returns a falsy value are
            neither read nor written.
    """
    # Filler for rounds that have no sample.
    # NOTE(review): presumably the timeout sentinel that replace_timeout
    # refers to -- confirm against output_to_file.
    timeout_latency = 10000

    probe_round_offset = 0
    server = None  # No directory with data has been seen yet.
    server_latencies = dict()  # Maps from the IP address to a list of latencies.
    for path, _, files in os.walk(data_dir):
        print('running path: ' + path)
        path_parts = path.split('/')
        if len(files) <= 0:
            if path_parts[len(path_parts) - 1] in common_module.SERVER_SET:
                print('From: ' + path)
                # Flush the previous server, if it produced any rounds.
                if probe_round_offset > 0 and common_module.check_cloud(cloud, server):
                    output_to_file(server, server_latencies, percentiles, output_files,
                                   ip_to_use=ip_to_use, replace_timeout=replace_timeout)
                # A new server, reset all the variables.
                probe_round_offset = 0
                server_latencies = dict()
                print('reseted the data structures.')
            continue

        server = path_parts[len(path_parts) - 2]
        if not common_module.check_cloud(cloud, server):
            continue

        last_round = None  # Last round read in this directory, if any.
        for filename in files:
            if not filename.startswith(common_module.FILENAME_PREFIX):
                continue
            full_path = os.path.join(path, filename)
            with open(full_path, 'rb') as input_file:
                for raw_line in input_file:
                    line = raw_line.rstrip().split()
                    if not line:
                        # Skip blank lines instead of failing on the column lookups.
                        continue
                    round_id = int(line[common_module.ROUND_COL_INDEX])
                    ip_address = line[common_module.IP_COL_INDEX]
                    latency = float(line[common_module.LATENCY_COL_INDEX])
                    actual_round = round_id + probe_round_offset
                    last_round = actual_round
                    samples = server_latencies.setdefault(ip_address, [])
                    if actual_round < len(samples):
                        # The round already exists: keep the best latency.
                        samples[actual_round] = min(samples[actual_round], latency)
                    else:
                        # Fill every missing round so that this sample lands
                        # exactly at index actual_round, the same index the
                        # "already exists" branch reads and writes.
                        samples.extend([timeout_latency] * (actual_round - len(samples)))
                        samples.append(latency)
        # Advance the offset only when this directory actually contributed a
        # round; otherwise the value would be undefined or left over from an
        # earlier directory.
        if last_round is not None:
            probe_round_offset = last_round + 1

    # Flush the last server; nothing to do when no data directory was found.
    if server is not None and common_module.check_cloud(cloud, server):
        output_to_file(server, server_latencies, percentiles, output_files,
                       ip_to_use=ip_to_use, replace_timeout=replace_timeout)
def get_servers(median_latencies, cloud=None):
    """Collect the servers and nameservers that appear as keys of median_latencies.

    Every key is indexed as key[0] (server) and key[1] (nameserver). All
    nameservers are kept; a server is kept only when
    common_module.check_cloud(cloud, server) returns a truthy value. The
    cloud argument and the sizes of both sets are printed.

    Returns:
        A (server_set, nameserver_set) tuple.
    """
    print(cloud)
    nameservers = set(key[1] for key in median_latencies)
    servers = set(
        key[0] for key in median_latencies
        if common_module.check_cloud(cloud, key[0])
    )
    summary = '# Server: ' + str(len(servers)) + ' # Nameserver: ' + str(len(nameservers))
    print(summary)
    return servers, nameservers
def find_server_cover_set(median_latencies, server_set, nameserver_set, threshold, output_filename, cloud=None, min_coverage_percent=1):
    """Greedily choose servers that cover the remaining nameservers and save them.

    In every pass, each still-unchosen server that passes
    common_module.check_cloud(cloud, server) is mapped to the set of
    still-uncovered nameservers whose latency to it is below threshold. The
    server with the largest such set is chosen, provided it covers at least
    one nameserver and more than min_coverage_percent percent of the initial
    number of nameservers. The search stops as soon as a pass chooses nothing
    or either input set becomes empty.

    The chosen servers are written to output_filename, one per line, in set
    iteration order.

    Args:
        median_latencies: dict mapping (server, nameserver) pairs to a latency.
        server_set: candidate servers. MUTATED in place: chosen servers are
            removed.
        nameserver_set: nameservers to cover. MUTATED in place: covered
            nameservers are removed.
        threshold: a nameserver counts as covered when its latency is
            strictly below this value.
        output_filename: path of the file that receives the cover set.
        cloud: filter passed to common_module.check_cloud.
        min_coverage_percent: a server is only chosen when it covers strictly
            more than this percentage of the initial nameserver count.
            Defaults to 1, the value that used to be hard-coded.
    """
    current_cover_set = set()
    total_nameservers = len(nameserver_set)
    while len(server_set) > 0 and len(nameserver_set) > 0:
        # Mapping from server --> nameservers whose latency is < threshold.
        histogram = dict()
        for (server, nameserver), latency in median_latencies.iteritems():
            # Only consider unchosen servers and uncovered nameservers.
            if server not in server_set or nameserver not in nameserver_set:
                continue
            if not common_module.check_cloud(cloud, server):
                continue
            covered = histogram.setdefault(server, set())
            if latency < threshold:
                covered.add(nameserver)

        if not histogram:
            # No candidate left, so the cover set cannot grow any further.
            break

        # max() returns the first maximal entry in iteration order, the same
        # tie-break as a strict "greater than" scan.
        best_server = max(histogram, key=lambda candidate: len(histogram[candidate]))
        nodes_covered = histogram[best_server]
        num_nodes_covered = len(nodes_covered)
        # str() keeps this working for non-string server identifiers, just
        # like the str(server) used when writing the output file.
        print('best server: ' + str(best_server) + ' num nodes covered: ' + str(num_nodes_covered))
        percentage = num_nodes_covered * 100.0 / total_nameservers
        if num_nodes_covered <= 0 or percentage <= min_coverage_percent:
            # The best remaining server does not add enough coverage.
            break

        # Commit the choice: drop the server and the nameservers it covers.
        server_set.remove(best_server)
        for node in nodes_covered:
            nameserver_set.remove(node)
        current_cover_set.add(best_server)

    # Done finding the cover set.
    with open(output_filename, 'wb') as output_file:
        for server in current_cover_set:
            output_file.write(str(server) + '\n')
    print('cover set: ' + str(current_cover_set) + ' remaining nameservers: ' + str(len(nameserver_set)) + ' remaining servers: ' + str(len(server_set)))