# GCPA experiment driver (Python 2). The helpers linear_greedy, baseline,
# better_baseline, generate, generate_hash, iterative_dfs and the classes
# Clustering, GCPA, GCPA_better are defined elsewhere in this project.
import csv
import random
import time

from igraph import Graph  # Graph.Erdos_Renyi generates the synthetic query graphs


def rt_query_process(query, clustering, gcpa_data, machines,
                     dataunit_in_machine, ctype='fast'):
    """Cover a real-time query using the precomputed GCPA part covers.

    Falls back to linear_greedy when the query matches no cluster.
    Returns (cover, elapsed_seconds).
    """
    query = set(query)
    start = time.time()
    if ctype == 'fast':
        cluster_index = clustering.insert_rt_fast(query)
    elif ctype == 'full':
        cluster_index = clustering.insert_rt_noupdate(query)
    else:
        raise ValueError('unknown ctype: ' + str(ctype))
    if cluster_index == -1:
        # no matching cluster: answer the whole query with linear greedy
        return (linear_greedy(query, machines, dataunit_in_machine)[0],
                time.time() - start)
    parts_cover = gcpa_data.partcover_by_cluster[cluster_index]
    dataunit_in_parts = gcpa_data.partindex_by_cluster[cluster_index]
    unprocessed = set(query)
    last_greedy = set()
    cover = set()
    while len(unprocessed) > 0:
        x = unprocessed.pop()
        # Depending on how rt-gcpa is written, one of these branches can be removed
        if x in dataunit_in_parts:
            # x belongs to a precomputed part: reuse that part's cover and drop
            # everything it already covers from the remaining work
            x_part = dataunit_in_parts[x]
            cover |= parts_cover[x_part]
            for machine in parts_cover[x_part]:
                unprocessed = unprocessed - machines[machine]
                last_greedy = last_greedy - machines[machine]
        else:
            # x is in no precomputed part; cover it greedily at the end
            last_greedy.add(x)
    cover |= linear_greedy(last_greedy, machines, dataunit_in_machine)[0]
    dt = time.time() - start
    # sanity check: the returned machines must cover the whole query
    query_copy = set(query)
    for c in cover:
        query_copy = query_copy - machines[c]
    if len(query_copy) > 0:
        print 'NOT COVERED'
    return cover, dt
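# A minimal usage sketch for rt_query_process. It assumes the helpers used in
# this module (generate, generate_hash, Clustering, GCPA_better) behave as they
# are called elsewhere here; the sizes and arguments are illustrative only.
def _demo_rt_query_process(historical_queries, live_query):
    ndata = 1000
    machines = generate(range(ndata), 10)                 # 10 machines over 1000 data units
    dataunit_in_machine = generate_hash(machines, ndata)  # data unit -> machines holding it
    clustering = Clustering(historical_queries, notif='loud')
    gcpa_data = GCPA_better(clustering, ndata)
    gcpa_data.process(machines, dataunit_in_machine)      # precompute per-part covers
    cover, dt = rt_query_process(live_query, clustering, gcpa_data,
                                 machines, dataunit_in_machine, ctype='fast')
    print len(cover), dt
    return cover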
def process(self, machines, dataunit_in_machine):
    """Precompute a set cover for every data part in every cluster queue.

    Fills self.covers (per-query covers), self.partindex_by_cluster and
    self.partcover_by_cluster for later real-time lookups.
    """
    queues = self.queues
    datapart_in_query_lists = self.datapart_in_query_lists
    uppersets_list = self.uppersets_list
    nqueriesinclusters = self.nqueries_in_clusters
    for index in range(len(queues)):
        self.covers[index] = [set() for i in xrange(nqueriesinclusters[index])]
    for queueindex, queue in enumerate(queues):
        dataunit_in_part = dict()
        part_cover = dict()
        covered = set()
        part_index = -1
        for datapartindex in xrange(len(queue)):
            part_index += 1
            datapart = queue[datapartindex] - covered
            if datapart:
                # map each machine to the data units of this part it holds
                # (bookkeeping only; linear_greedy below does not need it)
                machinesintersected = {}
                for dataunit in datapart:
                    dataunit_in_part[dataunit] = part_index
                    for machine in dataunit_in_machine[dataunit]:
                        if machine in machinesintersected:
                            machinesintersected[machine].add(dataunit)
                        else:
                            machinesintersected[machine] = set([dataunit])
                # cover this part with linear greedy
                cover, dt = linear_greedy(datapart, machines, dataunit_in_machine)
                part_cover[part_index] = cover
                # add the part's cover to every query that contains this part
                for queryindex in datapart_in_query_lists[queueindex][datapartindex]:
                    self.covers[queueindex][queryindex] |= cover
                # remove the newly covered data units from the later parts
                coveredinupperset = datapart.copy()
                for dataunit in uppersets_list[queueindex][datapartindex] - datapart:
                    if dataunit_in_machine[dataunit] & cover:
                        coveredinupperset.add(dataunit)
                covered |= coveredinupperset
        self.partindex_by_cluster.append(dataunit_in_part)
        self.partcover_by_cluster.append(part_cover)
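# Shape of the structures that process() fills in, on a toy cluster c with two
# data parts (the concrete values are illustrative, not from a real run):
#
#   self.partindex_by_cluster[c] == {7: 0, 12: 0, 31: 1}
#       # data unit -> index of the part (within cluster c) that contains it
#   self.partcover_by_cluster[c] == {0: set([2, 5]), 1: set([4])}
#       # part index -> set of machines chosen by linear_greedy to cover it
#   self.covers[c][q]            == set([2, 5])
#       # union of part covers over the parts appearing in query q of cluster c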
def full_clustering_procedure_comparisons(ndata=100000, N=50000, nmachines=50,
                                          min_q_len=6, max_q_len=15,
                                          number_of_clusterings=1,
                                          queryfile=None, np=.995, delim=','):
    """Offline comparison: cluster a full query log, precompute GCPA covers,
    and compare their sizes and timings against greedy and baseline covers."""
    NoNodes = ndata
    for iteration in xrange(number_of_clusterings):
        print 'ITERATION: ', iteration
        p = np / NoNodes
        output = []
        if queryfile == None:
            # generate a random Erdos-Renyi graph on NoNodes vertices
            g = Graph.Erdos_Renyi(n=NoNodes, p=p)
            print 'Graph generated'
            # build queries as DFS walks from random start nodes, keeping only
            # walks of at least min_q_len data units
            while len(output) < N:
                node = random.randint(0, NoNodes - 1)
                line = iterative_dfs(g, node, path=[])
                if len(line) >= min_q_len:
                    output.append(line)
            graphfile = ('n' + str(len(output) / 1000) + 'np' + str(np) +
                         '_' + str(iteration))
            with open(graphfile + '.csv', 'wb') as f:
                w = csv.writer(f)
                for line in output:
                    w.writerow(line)
            print 'Queries generated', len(output)
        else:
            with open(queryfile + '.csv', 'rb') as f:
                r = csv.reader(f, delimiter=delim)
                for row in r:
                    output.append(map(int, row))
            print 'Queries imported'
            graphfile = queryfile
        infile = graphfile
        test_queries = output
        N = len(test_queries)
        clustering = Clustering(test_queries, notif='loud')
        clusters = clustering.clusters
        outfile = infile + '_output_test'
        print 'Clustered'
        with open(outfile + '.csv', 'wb') as f:
            f.write(str(len(clusters)) + '\n')
            ctr = 1
            for c in clusters:
                f.write('-----------------------\n')
                f.write('Cluster ' + str(ctr) + '\n')
                f.write('# of Queries: ' + str(len(c)) + '\n')
                f.write(c.aligned_output())
                f.write('-----------------------\n')
                ctr += 1
        print 'Clusters written to file'
        machines = generate(range(ndata), nmachines)
        dataunit_in_machine = generate_hash(machines, ndata)
        # GCPA with linear-greedy part covers
        gcpa_data = GCPA(clustering, ndata)
        start = time.time()
        gcpa_data.process(machines, dataunit_in_machine)
        cover_time = time.time() - start
        average = 1.0 * cover_time / len(test_queries)
        # GCPA with the improved part-cover routine
        gcpa_better = GCPA_better(clustering, ndata)
        betterstart = time.time()
        gcpa_better.process(machines, dataunit_in_machine)
        better_dt = time.time() - betterstart
        better_average = 1.0 * better_dt / len(test_queries)
        # per-query timings for the three reference algorithms
        lg_start = time.time()
        for query in test_queries:
            cover, dt = linear_greedy(query, machines, dataunit_in_machine)
        lg_dt = time.time() - lg_start
        lg_ave = 1.0 * lg_dt / len(test_queries)
        baseline_start = time.time()
        for query in test_queries:
            cover, dt = baseline(query, machines, dataunit_in_machine)
        baseline_dt = time.time() - baseline_start
        baseline_ave = 1.0 * baseline_dt / len(test_queries)
        b_baseline_start = time.time()
        for query in test_queries:
            cover, dt = better_baseline(query, machines, dataunit_in_machine)
        b_baseline_dt = time.time() - b_baseline_start
        b_baseline_ave = 1.0 * b_baseline_dt / len(test_queries)
        print baseline_ave, b_baseline_ave, lg_ave, average, better_average
        # per-query cover sizes for all five methods
        covers = gcpa_data.covers
        better_covers = gcpa_better.covers
        to_write = []
        total = 0
        for clusterind, coverset in enumerate(covers):
            for query_ind, cover in enumerate(coverset):
                if total % 1000 == 0:
                    print total
                total += 1
                query = clustering.clusters[clusterind][query_ind]
                gcpa_fast_lin = cover
                gcpa_fast_better = better_covers[clusterind][query_ind]
                lg_cover, lg_dt = linear_greedy(query, machines,
                                                dataunit_in_machine)
                baseline_cover, baseline_dt = baseline(query, machines,
                                                       dataunit_in_machine)
                b_baseline_cover, b_baseline_dt = better_baseline(
                    query, machines, dataunit_in_machine)
                to_write.append(map(len, [baseline_cover, b_baseline_cover,
                                          lg_cover, gcpa_fast_lin,
                                          gcpa_fast_better]))
        with open(infile + 'big_comparison.csv', 'wb') as f:
            w = csv.writer(f)
            w.writerow(['Baseline', 'Better Baseline', 'N-Greedy',
                        'GCPA_G', 'GCPA_DL'])
            w.writerow([baseline_ave, b_baseline_ave, lg_ave,
                        average, better_average])
            for row in to_write:
                w.writerow(row)
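# A hedged example of driving the offline comparison above; 'queries_run1' is a
# hypothetical saved query file, and the keyword values just echo the defaults.
def _demo_offline_comparison():
    # fresh synthetic queries from an Erdos-Renyi graph
    full_clustering_procedure_comparisons(ndata=100000, N=50000, nmachines=50)
    # or replay a previously saved query file
    full_clustering_procedure_comparisons(queryfile='queries_run1', delim=',')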
def full_realtime_comparisons(precompute_fraction=.2, nqueries=50000,
                              ndataunits=100000, nmachines=50, r=3, np=.995,
                              min_q_len=6, max_q_len=15, ctype='fast',
                              gcpatype='better', queryfile=None, delim=','):
    """Real-time comparison: seed the clustering with a fraction of the query
    stream, then answer the remaining queries online with GCPA, linear greedy,
    and the two baselines, recording cover sizes and per-query times."""
    queries = []
    if queryfile == None:
        g = Graph.Erdos_Renyi(n=ndataunits, p=np / ndataunits)
        q = 0
        while q < nqueries:
            node = random.randint(0, ndataunits - 1)
            line = iterative_dfs(g, node, path=[])
            if len(line) >= min_q_len:
                queries.append(line)
                q += 1
        graphfile = ('n' + str(len(queries) / 1000) + 'np' + str(np) +
                     ctype + gcpatype + 'test')
        with open(graphfile + '.csv', 'wb') as f:
            w = csv.writer(f)
            for line in queries:
                w.writerow(line)
        print 'Queries generated', len(queries)
    else:
        with open(queryfile + '.csv', 'rb') as f:
            r = csv.reader(f, delimiter=delim)
            for row in r:
                queries.append(map(int, row))
        graphfile = queryfile
    infile = graphfile
    # the first precompute_fraction of the stream seeds the clustering;
    # the rest is processed as real-time queries
    pre_computed = queries[:int(precompute_fraction * len(queries))]
    machines = generate(range(ndataunits), nmachines)
    dataunit_in_machine = generate_hash(machines, ndataunits)
    clustering = Clustering(pre_computed, notif='loud')
    rt_queries = queries[len(pre_computed):]
    if gcpatype == 'linear':
        gcpa_data = GCPA(clustering, ndataunits)
    elif gcpatype == 'better':
        gcpa_data = GCPA_better(clustering, ndataunits)
    elif gcpatype == 'both':
        gcpa_linear = GCPA(clustering, ndataunits)
        gcpa_better = GCPA_better(clustering, ndataunits)
    if gcpatype != 'both':
        gcpa_data.process(machines, dataunit_in_machine)
    else:
        gcpa_linear.process(machines, dataunit_in_machine)
        gcpa_better.process(machines, dataunit_in_machine)
    gcpa_rt_coverlens = []
    gcpa_times = []
    lg_coverlens = []
    lg_times = []
    baseline_coverlens = []
    baseline_times = []
    b_baseline_coverlens = []
    b_baseline_times = []
    for idx, query in enumerate(rt_queries):
        if (idx % 1000) == 0:
            print 'Query: ', idx
        if ctype != 'both':
            cover, gcpa_dt = rt_query_process(query, clustering, gcpa_data,
                                              machines, dataunit_in_machine,
                                              ctype)
            gcpa_rt_coverlens.append(len(cover))
            gcpa_times.append(gcpa_dt)
        else:
            # all four GCPA variants: {linear, better} x {fast, full}
            cover_fast, gcpa_fast_dt = rt_query_process(
                query, clustering, gcpa_linear, machines,
                dataunit_in_machine, 'fast')
            cover_full, gcpa_full_dt = rt_query_process(
                query, clustering, gcpa_linear, machines,
                dataunit_in_machine, 'full')
            cover_better_fast, gcpa_better_fast_dt = rt_query_process(
                query, clustering, gcpa_better, machines,
                dataunit_in_machine, 'fast')
            cover_better_full, gcpa_better_full_dt = rt_query_process(
                query, clustering, gcpa_better, machines,
                dataunit_in_machine, 'full')
            gcpa_rt_coverlens.append(map(len, [cover_fast, cover_full,
                                               cover_better_fast,
                                               cover_better_full]))
            gcpa_times.append([gcpa_fast_dt, gcpa_full_dt,
                               gcpa_better_fast_dt, gcpa_better_full_dt])
        lg_cover, lg_dt = linear_greedy(query, machines, dataunit_in_machine)
        lg_coverlens.append(len(lg_cover))
        lg_times.append(lg_dt)
        baseline_cover, baseline_time = baseline(query, machines,
                                                 dataunit_in_machine)
        baseline_coverlens.append(len(baseline_cover))
        baseline_times.append(baseline_time)
        b_baseline_cover, b_baseline_time = better_baseline(
            query, machines, dataunit_in_machine)
        b_baseline_coverlens.append(len(b_baseline_cover))
        b_baseline_times.append(b_baseline_time)
    with open(infile + '_cover_len_comparison.csv', 'wb') as f:
        w = csv.writer(f)
        if ctype != 'both':
            w.writerow(['GCPA', 'Greedy', 'Baseline', 'Better Baseline'])
            for idx, cl in enumerate(gcpa_rt_coverlens):
                w.writerow([cl, lg_coverlens[idx], baseline_coverlens[idx],
                            b_baseline_coverlens[idx]])
        else:
            w.writerow(['GCPA_G_A', 'GCPA_G_U', 'GCPA_DL_A', 'GCPA_DL_U',
                        'Greedy', 'Baseline', 'Better Baseline'])
            for idx, cl in enumerate(gcpa_rt_coverlens):
                cl.extend([lg_coverlens[idx], baseline_coverlens[idx],
                           b_baseline_coverlens[idx]])
                w.writerow(cl)
    with open(infile + '_time_comparison.csv', 'wb') as f:
        w = csv.writer(f)
        if ctype != 'both':
            w.writerow(['GCPA', 'Greedy', 'Baseline', 'Better Baseline'])
            for idx, gcpa_dt in enumerate(gcpa_times):
                w.writerow([gcpa_dt, lg_times[idx], baseline_times[idx],
                            b_baseline_times[idx]])
        else:
            w.writerow(['GCPA_G_A', 'GCPA_G_U', 'GCPA_DL_A', 'GCPA_DL_U',
                        'Greedy', 'Baseline', 'Better Baseline'])
            for idx, gcpa_dt in enumerate(gcpa_times):
                gcpa_dt.extend([lg_times[idx], baseline_times[idx],
                                b_baseline_times[idx]])
                w.writerow(gcpa_dt)
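# A sketch of a top-level driver for the real-time experiment; the parameter
# values mirror the defaults above, and running both ctype and gcpatype
# variants writes the four-way GCPA comparison CSVs.
if __name__ == '__main__':
    full_realtime_comparisons(precompute_fraction=.2, nqueries=50000,
                              ctype='both', gcpatype='both')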