def pretrain_baseline(self):
    baseline = self.create_baseline()
    if os.path.exists(self.options.result_dir + 'baseline'):
        self.load_baseline(baseline)
    baseline_trainer = optimizers[self.options.optimizer](baseline.model)
    lr = self.options.lr  # used only for sgd
    i = 0
    lowest_valid_loss = 9999
    print('train baseline, for simplicity use the same data here')
    for epoch in range(self.options.epochs):
        sents = 0
        total_loss = 0.0
        train = self.reader.next_example(0)
        train_size = len(self.reader.data[0])
        for data in train:
            s1, s2, s3, pos, act = data[0], data[1], data[2], data[3], data[4]
            sents += 1
            loss = -baseline(s3)
            if loss is not None:
                total_loss += loss.scalar_value()
                loss.backward()
                if self.options.optimizer == 'sgd':
                    baseline_trainer.update(lr)
                else:
                    baseline_trainer.update()
            e = float(i) / train_size
            if i % self.options.print_every == 0:
                print('epoch {}: loss per sentence: {}'.format(e, total_loss / sents))
                sents = 0
                total_loss = 0.0
            if i != 0 and i % self.options.save_every == 0:
                print('computing loss on validation set...')
                total_valid_loss = 0
                valid = self.reader.next_example(1)
                valid_size = len(self.reader.data[1])
                for vdata in valid:
                    s1, s2, s3, pos, act = vdata[0], vdata[1], vdata[2], vdata[3], vdata[4]
                    valid_loss = -baseline(s3)
                    if valid_loss is not None:
                        total_valid_loss += valid_loss.scalar_value()
                total_valid_loss = total_valid_loss * 1.0 / valid_size
                if total_valid_loss < lowest_valid_loss:
                    lowest_valid_loss = total_valid_loss
                    print('saving model...')
                    baseline.Save(self.options.result_dir + 'baseline')
                else:
                    lr = lr * self.options.decay
            i += 1
def paraTune(campaign_list):
    suffix_list = ['n', 's', 'f']
    runtimes_leafSize = {}
    for campaign in campaign_list:
        runtimes_leafSize[campaign] = {}
        for mode in MODE_LIST:
            runtimes_leafSize[campaign][mode] = {}
            ################################## leafSize ######################################
            for leafSize in [0]:
                start_time = time.clock()
                info = Info()
                info.basebid = BASE_BID
                info.campaign = campaign
                info.mode = mode
                modeName = MODE_NAME_LIST[mode]
                suffix = suffix_list[mode]
                info.laplace = LAPLACE
                info.leafSize = leafSize
                # info.treeDepth = TREE_DEPTH
                # create os directory
                if not os.path.exists(OFROOT+campaign+'/'+modeName):
                    os.makedirs(OFROOT+campaign+'/'+modeName)
                if not os.path.exists(OFROOT+campaign+'/'+modeName+'/paraTune'):
                    os.makedirs(OFROOT+campaign+'/'+modeName+'/paraTune')
                if not os.path.exists(OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)):
                    os.makedirs(OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize))
                # info assignment
                info.fname_trainlog = IFROOT+campaign+'/train.log.txt'
                info.fname_testlog = IFROOT+campaign+'/test.log.txt'
                info.fname_nodeData = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/nodeData_'+campaign+suffix+'.txt'
                info.fname_nodeInfo = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/nodeInfos_'+campaign+suffix+'.txt'
                info.fname_trainbid = IFROOT+campaign+'/train_bid.txt'
                info.fname_testbid = IFROOT+campaign+'/test_bid.txt'
                info.fname_baseline = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/baseline_'+campaign+suffix+'.txt'
                info.fname_monitor = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/monitor_'+campaign+suffix+'.txt'
                info.fname_testKmeans = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/testKmeans_'+campaign+suffix+'.txt'
                info.fname_testSurvival = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/testSurvival_'+campaign+suffix+'.txt'
                info.fname_evaluation = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/evaluation_'+campaign+suffix+'.txt'
                info.fname_baseline_q = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/baseline_q_'+campaign+suffix+'.txt'
                info.fname_tree_q = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/tree_q_'+campaign+suffix+'.txt'
                info.fname_baseline_w = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/baseline_w_'+campaign+suffix+'.txt'
                info.fname_tree_w = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/tree_w_'+campaign+suffix+'.txt'
                info.fname_pruneNode = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/pruneNode_'+campaign+suffix+'.txt'
                info.fname_pruneEval = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/pruneEval_'+campaign+suffix+'.txt'
                info.fname_testwin = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/testwin_'+campaign+suffix+'.txt'
                # baseline
                print campaign, modeName, 'leafSize', leafSize, "baseline begins."
                print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                baseline(info)
                print campaign, modeName, 'leafSize', leafSize, "baseline ends."
                # getDataset
                dataset = getTrainData(info.fname_trainlog, info.fname_trainbid)
                print campaign, modeName, 'leafSize', leafSize, "decisionTree2 begins."
                print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                decisionTree2(dataset, info)
                # evaluation
                print campaign, modeName, 'leafSize', leafSize, "evaluation begins."
                print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                evaluate(info)
                # runtime
                end_time = time.clock()
                runtimes_leafSize[campaign][mode][leafSize] = end_time - start_time
                print campaign, modeName, leafSize, "run time: "+str(end_time-start_time)+" s"
    for campaign in runtimes_leafSize:
        for mode in runtimes_leafSize[campaign]:
            for leafSize in runtimes_leafSize[campaign][mode]:
                print campaign, MODE_NAME_LIST[mode], 'leafSize', leafSize, "runtime "+str(runtimes_leafSize[campaign][mode][leafSize])
def main(campaign_list):
    suffix_list = ['n', 's', 'f']
    runtimes = {}
    for campaign in campaign_list:
        for mode in MODE_LIST:  # tempt filter
            start_time = time.clock()
            info = Info()
            info.basebid = BASE_BID
            info.campaign = campaign
            info.mode = mode
            modeName = MODE_NAME_LIST[mode]
            suffix = suffix_list[mode]
            info.laplace = LAPLACE
            info.leafSize = LEAF_SIZE
            info.treeDepth = TREE_DEPTH
            # create os directory
            if not os.path.exists(OFROOT+campaign+'/'+modeName):
                os.makedirs(OFROOT+campaign+'/'+modeName)
            # info assignment
            info.fname_trainlog = IFROOT+campaign+'/train.log.txt'
            info.fname_testlog = IFROOT+campaign+'/test.log.txt'
            info.fname_nodeData = OFROOT+campaign+'/'+modeName+'/nodeData_'+campaign+suffix+'.txt'
            info.fname_nodeInfo = OFROOT+campaign+'/'+modeName+'/nodeInfos_'+campaign+suffix+'.txt'
            info.fname_trainbid = IFROOT+campaign+'/train_bid.txt'
            info.fname_testbid = IFROOT+campaign+'/test_bid.txt'
            info.fname_baseline = OFROOT+campaign+'/'+modeName+'/baseline_'+campaign+suffix+'.txt'
            info.fname_monitor = OFROOT+campaign+'/'+modeName+'/monitor_'+campaign+suffix+'.txt'
            info.fname_testKmeans = OFROOT+campaign+'/'+modeName+'/testKmeans_'+campaign+suffix+'.txt'
            info.fname_testSurvival = OFROOT+campaign+'/'+modeName+'/testSurvival_'+campaign+suffix+'.txt'
            info.fname_evaluation = OFROOT+campaign+'/'+modeName+'/evaluation_'+campaign+suffix+'.txt'
            info.fname_baseline_q = OFROOT+campaign+'/'+modeName+'/baseline_q_'+campaign+suffix+'.txt'
            info.fname_tree_q = OFROOT+campaign+'/'+modeName+'/tree_q_'+campaign+suffix+'.txt'
            info.fname_baseline_w = OFROOT+campaign+'/'+modeName+'/baseline_w_'+campaign+suffix+'.txt'
            info.fname_tree_w = OFROOT+campaign+'/'+modeName+'/tree_w_'+campaign+suffix+'.txt'
            info.fname_pruneNode = OFROOT+campaign+'/'+modeName+'/pruneNode_'+campaign+suffix+'.txt'
            info.fname_pruneEval = OFROOT+campaign+'/'+modeName+'/pruneEval_'+campaign+suffix+'.txt'
            info.fname_testwin = OFROOT+campaign+'/'+modeName+'/testwin_'+campaign+suffix+'.txt'
            step = STEP
            # baseline
            print campaign+" "+modeName+" baseline begins."
            print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            baseline(info)
            print campaign+" "+modeName+" baseline ends."
            # getDataset
            dataset = getTrainData(info.fname_trainlog, info.fname_trainbid)
            print campaign+" "+modeName+" decisionTree2 begins."
            print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            decisionTree2(dataset, info)
            # evaluation
            print "evaluation begins."
            print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            evaluate(info)
            # runtime
            end_time = time.clock()
            if not runtimes.has_key(campaign):
                runtimes[campaign] = []
            runtimes[campaign].append(end_time - start_time)
            print campaign+" run time: "+str(end_time-start_time)+" s"
    for campaign in runtimes:
        for mode in range(0, len(runtimes[campaign])):
            print campaign+" "+MODE_NAME_LIST[mode]+" runtime "+str(runtimes[campaign][mode])
def full_clustering_procedure_comparisons(ndata=100000, N=50000, nmachines=50,
                                          min_q_len=6, max_q_len=15,
                                          number_of_clusterings=1, queryfile=None,
                                          np=.995, delim=','):
    NoNodes = ndata
    for iteration in xrange(number_of_clusterings):
        print 'ITERATION: ', iteration
        # np = .993
        p = np / NoNodes
        output = []
        if queryfile == None:
            # generate a random graph on NoNodes vertices (need to set probability)
            g = Graph.Erdos_Renyi(n=NoNodes, p=p)
            print 'Graph generated'
            # take a random node from the graph
            node = random.randint(0, NoNodes - 1)
            # the DFS function; its arguments are the graph and the starting node
            output = []
            # the loop on the number of queries
            # for q in range(N):
            while len(output) < N:
                node = random.randint(0, NoNodes - 1)
                line = iterative_dfs(g, node, path=[])
                if len(line) >= min_q_len:
                    output.append(line)
            graphfile = 'n' + str(len(output) / 1000) + 'np' + str(np) + '_' + str(iteration)
            with open(graphfile + '.csv', 'wb') as f:
                w = csv.writer(f)
                for line in output:
                    w.writerow(line)
            print 'Queries generated', len(output)
        else:
            with open(queryfile + '.csv', 'rb') as f:
                r = csv.reader(f, delimiter=delim)
                for row in r:
                    output.append(map(int, row))
            print 'Queries imported'
            graphfile = queryfile
        infile = graphfile
        test_queries = output
        max_len = len(test_queries)
        N = len(test_queries)  # min(50000, len(test_queries))
        test_queries = test_queries[:max_len]
        # clusters, disjoint, cluster_tracker, data_added_count, data_in_nclusters = simple_entropy(test_queries)
        # clusters, cl_entropies = simple_entropy(test_queries)
        clustering = Clustering(test_queries, notif='loud')
        clusters = clustering.clusters
        outfile = infile + '_output_test'
        print 'Clustered'
        with open(outfile + '.csv', 'wb') as f:
            # f.write('Output from simpleROCK clustering algorithm \n')
            f.write(str(len(clusters)) + '\n')
            ctr = 1
            for c in clusters:
                f.write('-----------------------\n')
                f.write('Cluster ' + str(ctr) + '\n')
                f.write('# of Queries: ' + str(len(c)) + '\n')
                # print 1.0*c.min_query_len/len(c.span)
                # f.write('Span: ' + str(c.span) + '\n')
                f.write(c.aligned_output())
                f.write('-----------------------\n')
                ctr += 1
        print 'Clusters written to file'
        machines = generate(range(ndata), nmachines)
        dataunit_in_machine = generate_hash(machines, ndata)
        gcpa_data = GCPA(clustering, ndata)
        start = time.time()
        gcpa_data.process(machines, dataunit_in_machine)
        cover_time = time.time() - start
        average = 1.0 * cover_time / len(test_queries)
        gcpa_better = GCPA_better(clustering, ndata)
        betterstart = time.time()
        gcpa_better.process(machines, dataunit_in_machine)
        better_dt = time.time() - betterstart
        better_average = 1.0 * better_dt / len(test_queries)
        lg_start = time.time()
        for query in test_queries:
            cover, dt = linear_greedy(query, machines, dataunit_in_machine)
        lg_dt = time.time() - lg_start
        lg_ave = 1.0 * lg_dt / len(test_queries)
        baseline_start = time.time()
        for query in test_queries:
            cover, dt = baseline(query, machines, dataunit_in_machine)
        baseline_dt = time.time() - baseline_start
        baseline_ave = 1.0 * baseline_dt / len(test_queries)
        b_baseline_start = time.time()
        for query in test_queries:
            cover, dt = better_baseline(query, machines, dataunit_in_machine)
        # measure from b_baseline_start; the original subtracted baseline_start,
        # which double-counted the plain-baseline loop above
        b_baseline_dt = time.time() - b_baseline_start
        b_baseline_ave = 1.0 * b_baseline_dt / len(test_queries)
        # print average, better_average, lg_ave, baseline_ave, b_baseline_ave
        print baseline_ave, b_baseline_ave, lg_ave, average, better_average
        covers = gcpa_data.covers
        better_covers = gcpa_better.covers
        to_write = []
        total = 0
        for clusterind, coverset in enumerate(covers):
            for query_ind, cover in enumerate(coverset):
                if total % 1000 == 0:
                    print total
                total += 1
                query = clustering.clusters[clusterind][query_ind]
                gcpa_fast_lin = cover
                gcpa_fast_better = better_covers[clusterind][query_ind]
                lg_cover, lg_dt = linear_greedy(query, machines, dataunit_in_machine)
                baseline_cover, baseline_dt = baseline(query, machines, dataunit_in_machine)
                b_baseline_cover, b_baseline_dt = better_baseline(query, machines, dataunit_in_machine)
                # to_write.append(map(len, [gcpa_fast_lin, gcpa_fast_better, lg_cover, baseline_cover, b_baseline_cover]))
                to_write.append(map(len, [baseline_cover, b_baseline_cover, lg_cover, gcpa_fast_lin, gcpa_fast_better]))
        with open(infile + 'big_comparison.csv', 'wb') as f:
            w = csv.writer(f)
            w.writerow(['Baseline', 'Better Baseline', 'N-Greedy', 'GCPA_G', 'GCPA_DL'])
            w.writerow([baseline_ave, b_baseline_ave, lg_ave, average, better_average])
            for row in to_write:
                w.writerow(row)
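# The 'big_comparison.csv' written above holds one header row, one row of
# per-method average processing times, and then one row of cover-set sizes per
# query. A minimal, hypothetical post-processing sketch (not part of the
# original script; the file path and the aggregation are illustrative) that
# reads the file back and reports the mean cover size per method:
import csv

def mean_cover_sizes(path):
    with open(path) as f:
        rows = list(csv.reader(f))
    header, data = rows[0], rows[2:]  # rows[1] holds the timing averages
    sums = [0.0] * len(header)
    for row in data:
        for j, value in enumerate(row):
            sums[j] += float(value)
    for name, total in zip(header, sums):
        print('{}: mean cover size {:.2f}'.format(name, total / len(data)))

# example call (the prefix depends on the infile used above)
# mean_cover_sizes('n50np0.995_0big_comparison.csv')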
def get_data(filename):
    # function header reconstructed: the generator is consumed below as get_data(...)
    with open(filename, mode='r') as infile:
        reader = csv.reader(infile)
        next(reader, None)  # skip header
        for rows in reader:
            yield (rows[1], rows[8], rows[3])
            # results.append((rows[1], rows[5], rows[9]))
    # return results


test_data = get_data('video_characteristics_upload.csv')
counter = 1
baseline_loss_avg = 0.0
oracle_loss_avg = 0.0
for video_id, title, channel_id in test_data:
    try:
        baseline_prediction = baseline(channel_id)
        oracle_prediction = oracle(title, channel_id)
        view_count = int(youtube.videos().list(
            id=video_id,
            part='statistics',
            fields='items/statistics/viewCount',
        ).execute()['items'][0]['statistics']['viewCount'])
        baseline_loss = log10(float(baseline_prediction) / view_count) ** 2
        oracle_loss = log10(float(oracle_prediction) / view_count) ** 2
        baseline_loss_avg = ((counter - 1) * baseline_loss_avg + baseline_loss) / counter
        oracle_loss_avg = ((counter - 1) * oracle_loss_avg + oracle_loss) / counter
        print counter
        print "Baseline: %d (Loss %f), Oracle: %d (Loss %f), True Value: %d" % (
            baseline_prediction, baseline_loss, oracle_prediction, oracle_loss,
            view_count)  # final argument reconstructed from the format string
    except Exception:
        # placeholder handler: the source snippet is truncated before its except
        # clause and the counter update, so only syntactic validity is restored here
        pass
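# The evaluation loop above scores each prediction with a squared base-10
# log-ratio loss, (log10(prediction / true))**2, and keeps a running average
# over the examples seen so far. A self-contained sketch of the same arithmetic
# with made-up numbers (values are illustrative, not taken from the dataset):
from math import log10

def squared_log_ratio(prediction, true_value):
    return log10(float(prediction) / true_value) ** 2

running_avg, n_seen = 0.0, 0
for pred, true in [(12000, 10000), (900, 1500)]:  # illustrative values only
    n_seen += 1
    loss = squared_log_ratio(pred, true)
    running_avg = ((n_seen - 1) * running_avg + loss) / n_seen
print(running_avg)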
def add_all_available_baselines(self):
    for baseline in self.baselines:
        self.config.register_player(name=str(baseline), algorithm=baseline())
def full_realtime_comparisons(precompute_fraction=.2, nqueries=50000, ndataunits=100000,
                              nmachines=50, r=3, np=.995, min_q_len=6, max_q_len=15,
                              ctype='fast', gcpatype='better', queryfile=None, delim=','):
    queries = []
    if queryfile == None:
        g = Graph.Erdos_Renyi(n=ndataunits, p=np/ndataunits)
        q = 0
        while q < nqueries:
            node = random.randint(0, ndataunits-1)
            line = iterative_dfs(g, node, path=[])
            if len(line) >= min_q_len:
                queries.append(line)
                q += 1
        graphfile = 'n' + str(len(queries)/1000) + 'np' + str(np) + ctype + gcpatype + 'test'
        with open(graphfile + '.csv', 'wb') as f:
            w = csv.writer(f)
            for line in queries:
                w.writerow(line)
        print 'Queries generated', len(queries)
    else:
        with open(queryfile + '.csv', 'rb') as f:
            r = csv.reader(f, delimiter=delim)
            for row in r:
                queries.append(map(int, row))
        graphfile = queryfile
    infile = graphfile
    # max_to_process = min(nqueries, len(queries))
    # queries = queries[:max_to_process]
    pre_computed = queries[:int(precompute_fraction*len(queries))]
    machines = generate(range(ndataunits), nmachines)
    dataunit_in_machine = generate_hash(machines, ndataunits)
    clustering = Clustering(pre_computed, notif='loud')
    rt_queries = queries[len(pre_computed):]
    if gcpatype == 'linear':
        gcpa_data = GCPA(clustering, ndataunits)
    elif gcpatype == 'better':
        gcpa_data = GCPA_better(clustering, ndataunits)
    elif gcpatype == 'both':
        gcpa_linear = GCPA(clustering, ndataunits)
        gcpa_better = GCPA_better(clustering, ndataunits)
    if gcpatype != 'both':
        gcpa_data.process(machines, dataunit_in_machine)
    else:
        gcpa_linear.process(machines, dataunit_in_machine)
        gcpa_better.process(machines, dataunit_in_machine)
    gcpa_rt_coverlens = []
    gcpa_times = []
    lg_coverlens = []
    baseline_coverlens = []
    baseline_times = []
    b_baseline_coverlens = []
    b_baseline_times = []
    smaller = 0
    lg_times = []
    for idx, query in enumerate(rt_queries):
        oldlen = len(query)
        if (idx % 1000) == 0:
            print 'Query: ', idx
        if ctype != 'both':
            cover, gcpa_dt = rt_query_process(query, clustering, gcpa_data,
                                              machines, dataunit_in_machine, ctype)
            gcpa_rt_coverlens.append(len(cover))
            gcpa_times.append(gcpa_dt)
        else:
            cover_fast, gcpa_fast_dt = rt_query_process(query, clustering, gcpa_linear,
                                                        machines, dataunit_in_machine, 'fast')
            cover_full, gcpa_full_dt = rt_query_process(query, clustering, gcpa_linear,
                                                        machines, dataunit_in_machine, 'full')
            cover_better_fast, gcpa_better_fast_dt = rt_query_process(query, clustering, gcpa_better,
                                                                      machines, dataunit_in_machine, 'fast')
            cover_better_full, gcpa_better_full_dt = rt_query_process(query, clustering, gcpa_better,
                                                                      machines, dataunit_in_machine, 'full')
            gcpa_rt_coverlens.append(map(len, [cover_fast, cover_full, cover_better_fast, cover_better_full]))
            gcpa_times.append([gcpa_fast_dt, gcpa_full_dt, gcpa_better_fast_dt, gcpa_better_full_dt])
        lg_cover, lg_dt = linear_greedy(query, machines, dataunit_in_machine)
        lg_times.append(lg_dt)
        baseline_cover, baseline_time = baseline(query, machines, dataunit_in_machine)
        lg_coverlens.append(len(lg_cover))
        baseline_coverlens.append(len(baseline_cover))
        baseline_times.append(baseline_time)
        b_baseline_cover, b_baseline_time = better_baseline(query, machines, dataunit_in_machine)
        b_baseline_coverlens.append(len(b_baseline_cover))
        b_baseline_times.append(b_baseline_time)
    with open(infile + '_cover_len_comparison.csv', 'wb') as f:
        w = csv.writer(f)
        if ctype != 'both':
            w.writerow(['GCPA', 'Greedy', 'Baseline', 'Better Baseline'])
            for idx, cl in enumerate(gcpa_rt_coverlens):
                w.writerow([cl, lg_coverlens[idx], baseline_coverlens[idx], b_baseline_coverlens[idx]])
        else:
            w.writerow(['GCPA_G_A', 'GCPA_G_U', 'GCPA_DL_A', 'GCPA_DL_U', 'Greedy', 'Baseline', 'Better Baseline'])
            for idx, cl in enumerate(gcpa_rt_coverlens):
                cl.extend([lg_coverlens[idx], baseline_coverlens[idx], b_baseline_coverlens[idx]])
                w.writerow(cl)
    with open(infile + '_time_comparison.csv', 'wb') as f:
        w = csv.writer(f)
        if ctype != 'both':
            w.writerow(['GCPA', 'Greedy', 'Baseline', 'Better Baseline'])
            for idx, gcpa_dt in enumerate(gcpa_times):
                w.writerow([gcpa_dt, lg_times[idx], baseline_times[idx], b_baseline_times[idx]])
        else:
            w.writerow(['GCPA_G_A', 'GCPA_G_U', 'GCPA_DL_A', 'GCPA_DL_U', 'Greedy', 'Baseline', 'Better Baseline'])
            for idx, gcpa_dt in enumerate(gcpa_times):
                gcpa_dt.extend([lg_times[idx], baseline_times[idx], b_baseline_times[idx]])
                w.writerow(gcpa_dt)
def paraTune(campaign_list):
    suffix_list = ['n', 's', 'f']
    runtimes_leafSize = {}
    for campaign in campaign_list:
        runtimes_leafSize[campaign] = {}
        for mode in MODE_LIST:
            runtimes_leafSize[campaign][mode] = {}
            ################################## leafSize ######################################
            for leafSize in [0]:
                start_time = time.clock()
                info = Info()
                info.basebid = BASE_BID
                info.campaign = campaign
                info.mode = mode
                modeName = MODE_NAME_LIST[mode]
                suffix = suffix_list[mode]
                info.laplace = LAPLACE
                info.leafSize = leafSize
                # info.treeDepth = TREE_DEPTH
                # create os directory
                if not os.path.exists(OFROOT+campaign+'\\'+modeName):
                    os.makedirs(OFROOT+campaign+'\\'+modeName)
                if not os.path.exists(OFROOT+campaign+'\\'+modeName+'\\paraTune'):
                    os.makedirs(OFROOT+campaign+'\\'+modeName+'\\paraTune')
                if not os.path.exists(OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)):
                    os.makedirs(OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize))
                # info assignment
                info.fname_trainlog = IFROOT+campaign+'\\train.log.txt'
                info.fname_testlog = IFROOT+campaign+'\\test.log.txt'
                info.fname_nodeData = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\nodeData_'+campaign+suffix+'.txt'
                info.fname_nodeInfo = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\nodeInfos_'+campaign+suffix+'.txt'
                info.fname_trainbid = IFROOT+campaign+'\\train_bid.txt'
                info.fname_testbid = IFROOT+campaign+'\\test_bid.txt'
                info.fname_baseline = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\baseline_'+campaign+suffix+'.txt'
                info.fname_monitor = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\monitor_'+campaign+suffix+'.txt'
                info.fname_testKmeans = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\testKmeans_'+campaign+suffix+'.txt'
                info.fname_testSurvival = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\testSurvival_'+campaign+suffix+'.txt'
                info.fname_evaluation = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\evaluation_'+campaign+suffix+'.txt'
                info.fname_baseline_q = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\baseline_q_'+campaign+suffix+'.txt'
                info.fname_tree_q = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\tree_q_'+campaign+suffix+'.txt'
                info.fname_baseline_w = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\baseline_w_'+campaign+suffix+'.txt'
                info.fname_tree_w = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\tree_w_'+campaign+suffix+'.txt'
                info.fname_pruneNode = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\pruneNode_'+campaign+suffix+'.txt'
                info.fname_pruneEval = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\pruneEval_'+campaign+suffix+'.txt'
                info.fname_testwin = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\testwin_'+campaign+suffix+'.txt'
                # baseline
                print campaign, modeName, 'leafSize', leafSize, "baseline begins."
                print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                baseline(info)
                print campaign, modeName, 'leafSize', leafSize, "baseline ends."
                # getDataset
                dataset = getTrainData(info.fname_trainlog, info.fname_trainbid)
                print campaign, modeName, 'leafSize', leafSize, "decisionTree2 begins."
                print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                decisionTree2(dataset, info)
                # evaluation
                print campaign, modeName, 'leafSize', leafSize, "evaluation begins."
                print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                evaluate(info)
                # runtime
                end_time = time.clock()
                runtimes_leafSize[campaign][mode][leafSize] = end_time - start_time
                print campaign, modeName, leafSize, "run time: "+str(end_time-start_time)+" s"
    for campaign in runtimes_leafSize:
        for mode in runtimes_leafSize[campaign]:
            for leafSize in runtimes_leafSize[campaign][mode]:
                print campaign, MODE_NAME_LIST[mode], 'leafSize', leafSize, "runtime "+str(runtimes_leafSize[campaign][mode][leafSize])
def main(campaign_list):
    suffix_list = ['n', 's', 'f']
    runtimes = {}
    for campaign in campaign_list:
        for mode in MODE_LIST:  # tempt filter
            start_time = time.clock()
            info = Info()
            info.basebid = BASE_BID
            info.campaign = campaign
            info.mode = mode
            modeName = MODE_NAME_LIST[mode]
            suffix = suffix_list[mode]
            info.laplace = LAPLACE
            info.leafSize = LEAF_SIZE
            info.treeDepth = TREE_DEPTH
            # create os directory
            if not os.path.exists(OFROOT+campaign+'\\'+modeName):
                os.makedirs(OFROOT+campaign+'\\'+modeName)
            # info assignment
            info.fname_trainlog = IFROOT+campaign+'\\train.log.txt'
            info.fname_testlog = IFROOT+campaign+'\\test.log.txt'
            info.fname_nodeData = OFROOT+campaign+'\\'+modeName+'\\nodeData_'+campaign+suffix+'.txt'
            info.fname_nodeInfo = OFROOT+campaign+'\\'+modeName+'\\nodeInfos_'+campaign+suffix+'.txt'
            info.fname_trainbid = IFROOT+campaign+'\\train_bid.txt'
            info.fname_testbid = IFROOT+campaign+'\\test_bid.txt'
            info.fname_baseline = OFROOT+campaign+'\\'+modeName+'\\baseline_'+campaign+suffix+'.txt'
            info.fname_monitor = OFROOT+campaign+'\\'+modeName+'\\monitor_'+campaign+suffix+'.txt'
            info.fname_testKmeans = OFROOT+campaign+'\\'+modeName+'\\testKmeans_'+campaign+suffix+'.txt'
            info.fname_testSurvival = OFROOT+campaign+'\\'+modeName+'\\testSurvival_'+campaign+suffix+'.txt'
            info.fname_evaluation = OFROOT+campaign+'\\'+modeName+'\\evaluation_'+campaign+suffix+'.txt'
            info.fname_baseline_q = OFROOT+campaign+'\\'+modeName+'\\baseline_q_'+campaign+suffix+'.txt'
            info.fname_tree_q = OFROOT+campaign+'\\'+modeName+'\\tree_q_'+campaign+suffix+'.txt'
            info.fname_baseline_w = OFROOT+campaign+'\\'+modeName+'\\baseline_w_'+campaign+suffix+'.txt'
            info.fname_tree_w = OFROOT+campaign+'\\'+modeName+'\\tree_w_'+campaign+suffix+'.txt'
            info.fname_pruneNode = OFROOT+campaign+'\\'+modeName+'\\pruneNode_'+campaign+suffix+'.txt'
            info.fname_pruneEval = OFROOT+campaign+'\\'+modeName+'\\pruneEval_'+campaign+suffix+'.txt'
            info.fname_testwin = OFROOT+campaign+'\\'+modeName+'\\testwin_'+campaign+suffix+'.txt'
            step = STEP
            # baseline
            print campaign+" "+modeName+" baseline begins."
            print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            baseline(info)
            print campaign+" "+modeName+" baseline ends."
            # getDataset
            dataset = getTrainData(info.fname_trainlog, info.fname_trainbid)
            print campaign+" "+modeName+" decisionTree2 begins."
            print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            decisionTree2(dataset, info)
            # evaluation
            print "evaluation begins."
            print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            evaluate(info)
            # runtime
            end_time = time.clock()
            if not runtimes.has_key(campaign):
                runtimes[campaign] = []
            runtimes[campaign].append(end_time - start_time)
            print campaign+" run time: "+str(end_time-start_time)+" s"
    for campaign in runtimes:
        for mode in range(0, len(runtimes[campaign])):
            print campaign+" "+MODE_NAME_LIST[mode]+" runtime "+str(runtimes[campaign][mode])
def unsupervised_with_baseline(self):
    decoder = self.create_decoder()
    assert (os.path.exists(self.options.result_dir + 'model_dec'))
    self.load_decoder(decoder)
    encoder = self.create_encoder()
    assert (os.path.exists(self.options.result_dir + 'model_enc'))
    self.load_encoder(encoder)
    baseline = self.create_baseline()
    if os.path.exists(self.options.result_dir + 'baseline'):
        self.load_baseline(baseline)
    enc_trainer = optimizers[self.options.optimizer](encoder.model)
    dec_trainer = optimizers[self.options.optimizer](decoder.model)
    baseline_trainer = optimizers[self.options.optimizer](baseline.model)
    lr = self.options.lr  # used only for sgd
    i = 0
    lowest_valid_loss = 9999
    print('unsupervised training...')
    for epoch in range(self.options.epochs):
        sents = 0
        total_loss = 0.0
        train = self.reader.next_example(0)
        train_size = len(self.reader.data[0])
        for data in train:
            s1, s2, s3, pos, act = data[0], data[1], data[2], data[3], data[4]
            sents += 1
            # random sample
            enc_loss_act, _, act = encoder.parse(s1, s2, s3, pos, sample=True)
            _, dec_loss_act, dec_loss_word = decoder.compute_loss(s3, act)
            # save reward
            logpx = -dec_loss_word.scalar_value()
            total_loss -= logpx
            # reconstruction and regularization loss backprop to theta_d
            dec_loss_total = dec_loss_word + dec_loss_act * dy.scalarInput(self.options.dec_reg)
            dec_loss_total = dec_loss_total * dy.scalarInput(1.0 / self.options.mcsamples)
            dec_loss_total.scalar_value()
            dec_loss_total.backward()
            # update decoder
            if self.options.optimizer == 'sgd':
                dec_trainer.update(lr)
            else:
                dec_trainer.update()
            if self.options.enc_update > 0:
                # compute baseline and backprop to theta_b
                b = baseline(s3)
                logpxb = b.scalar_value()
                b_loss = dy.squared_distance(b, dy.scalarInput(logpx))
                b_loss.value()
                b_loss.backward()
                # update baseline
                if self.options.optimizer == 'sgd':
                    baseline_trainer.update(lr)
                else:
                    baseline_trainer.update()
                # policy and regularization loss backprop to theta_e
                enc_loss_act = encoder.train(s1, s2, s3, pos, act)
                enc_loss_policy = enc_loss_act * dy.scalarInput((logpx - logpxb) / len(s1))
                enc_loss_total = enc_loss_policy * dy.scalarInput(self.options.enc_update) \
                    - enc_loss_act * dy.scalarInput(self.options.enc_reg)
                enc_loss_total = enc_loss_total * dy.scalarInput(1.0 / self.options.mcsamples)
                enc_loss_total.value()
                enc_loss_total.backward()
                # update encoder
                if self.options.optimizer == 'sgd':
                    enc_trainer.update(lr)
                else:
                    enc_trainer.update()
            e = float(i) / train_size
            if i % self.options.print_every == 0:
                print('epoch {}: loss per sentence: {}'.format(e, total_loss / sents))
                sents = 0
                total_loss = 0.0
            if i != 0 and i % self.options.save_every == 0:
                print('computing loss on validation set...')
                total_valid_loss = 0
                valid = self.reader.next_example(1)
                valid_size = len(self.reader.data[1])
                for vdata in valid:
                    s1, s2, s3, pos, act = vdata[0], vdata[1], vdata[2], vdata[3], vdata[4]
                    _, _, valid_word_loss = decoder.compute_loss(s3, act)
                    if valid_word_loss is not None:
                        total_valid_loss += valid_word_loss.scalar_value()
                total_valid_loss = total_valid_loss * 1.0 / valid_size
                if total_valid_loss < lowest_valid_loss:
                    lowest_valid_loss = total_valid_loss
                    print('saving model...')
                    encoder.Save(self.options.result_dir + 'model_enc')
                    decoder.Save(self.options.result_dir + 'model_dec')
                    baseline.Save(self.options.result_dir + 'baseline')
                else:
                    lr = lr * self.options.decay
            i += 1
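# Schematic view of the update performed in unsupervised_with_baseline above
# (a plain-Python sketch of the scalar arithmetic, not the DyNet graph): the
# decoder's reconstruction log-likelihood logpx serves as the reward, the
# learned baseline prediction logpxb is regressed toward it with a squared
# error, and the encoder's policy term is scaled by the advantage
# (logpx - logpxb) / sentence length. Parameter names mirror the options used
# above (enc_update, enc_reg, mcsamples).
def encoder_loss_sketch(enc_loss_act, logpx, logpxb, sent_len,
                        enc_update, enc_reg, mcsamples):
    advantage = (logpx - logpxb) / sent_len   # reward minus baseline
    policy = enc_loss_act * advantage         # REINFORCE-style policy term
    total = policy * enc_update - enc_loss_act * enc_reg
    return total / mcsamples                  # averaged over MC samples

def baseline_loss_sketch(logpxb, logpx):
    return (logpxb - logpx) ** 2              # cf. dy.squared_distance above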