Example #1
    def pretrain_baseline(self):
        baseline = self.create_baseline()
        if os.path.exists(self.options.result_dir + 'baseline'):
            self.load_baseline(baseline)

        baseline_trainer = optimizers[self.options.optimizer](baseline.model)
        lr = self.options.lr  #used only for sgd

        i = 0
        lowest_valid_loss = 9999
        print('train baseline, for simplicity use the same data here')
        for epoch in range(self.options.epochs):
            sents = 0
            total_loss = 0.0

            train = self.reader.next_example(0)
            train_size = len(self.reader.data[0])

            for data in train:
                s1, s2, s3, pos, act = data[0], data[1], data[2], data[
                    3], data[4]
                sents += 1
                loss = -baseline(s3)

                if loss is not None:
                    total_loss += loss.scalar_value()
                    loss.backward()
                    if self.options.optimizer == 'sgd':
                        baseline_trainer.update(lr)
                    else:
                        baseline_trainer.update()

                e = float(i) / train_size
                if i % self.options.print_every == 0:
                    print('epoch {}: loss per sentence: {}'.format(
                        e, total_loss / sents))
                    sents = 0
                    total_loss = 0.0
                if i != 0 and i % self.options.save_every == 0:
                    print('computing loss on validation set...')
                    total_valid_loss = 0
                    valid = self.reader.next_example(1)
                    valid_size = len(self.reader.data[1])
                    for vdata in valid:
                        s1, s2, s3, pos, act = vdata[0], vdata[1], vdata[
                            2], vdata[3], vdata[4]
                        valid_loss = -baseline(s3)
                        if valid_loss is not None:
                            total_valid_loss += valid_loss.scalar_value()
                    total_valid_loss = total_valid_loss * 1.0 / valid_size
                    if total_valid_loss < lowest_valid_loss:
                        lowest_valid_loss = total_valid_loss
                        print('saving model...')
                        baseline.Save(self.options.result_dir + 'baseline')
                    else:
                        lr = lr * self.options.decay
                i += 1
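
The method above assumes an `optimizers` mapping from the --optimizer option string to a DyNet trainer class, plus DyNet expression methods (`scalar_value()`, `backward()`), none of which are shown in the snippet. A minimal sketch of what that mapping might look like, assuming DyNet is imported as `dy`; the exact set of keys is an assumption, not taken from the original project:

import dynet as dy

# Hypothetical mapping from option string to DyNet trainer factory; the real one
# lives elsewhere in the original project.
optimizers = {
    'sgd': dy.SimpleSGDTrainer,        # the 'sgd' branch above calls update(lr) on this trainer
    'momentum': dy.MomentumSGDTrainer,
    'adam': dy.AdamTrainer,
    'adagrad': dy.AdagradTrainer,
}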
Example #2
def paraTune(campaign_list):
    suffix_list = ['n','s','f']
    runtimes_leafSize = {}
    for campaign in campaign_list:
        runtimes_leafSize[campaign] = {}
        for mode in MODE_LIST:
            runtimes_leafSize[campaign][mode] = {}

            ##################################   leafSize   ######################################
            for leafSize in [0]:
                start_time = time.clock()

                info = Info()
                info.basebid = BASE_BID
                info.campaign = campaign
                info.mode = mode
                modeName = MODE_NAME_LIST[mode]
                suffix = suffix_list[mode]

                info.laplace = LAPLACE
                info.leafSize = leafSize  # the parameter being tuned in this loop
                info.treeDepth = TREE_DEPTH

                # create os directory
                if not os.path.exists(OFROOT+campaign+'/'+modeName):
                    os.makedirs(OFROOT+campaign+'/'+modeName)
                if not os.path.exists(OFROOT+campaign+'/'+modeName+'/paraTune'):
                    os.makedirs(OFROOT+campaign+'/'+modeName+'/paraTune')
                if not os.path.exists(OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)):
                    os.makedirs(OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize))
                # info assignment
                info.fname_trainlog = IFROOT+campaign+'/train.log.txt'
                info.fname_testlog = IFROOT+campaign+'/test.log.txt'
                info.fname_nodeData = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/nodeData_'+campaign+suffix+'.txt'
                info.fname_nodeInfo = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/nodeInfos_'+campaign+suffix+'.txt'

                info.fname_trainbid = IFROOT+campaign+'/train_bid.txt'
                info.fname_testbid = IFROOT+campaign+'/test_bid.txt'
                info.fname_baseline = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/baseline_'+campaign+suffix+'.txt'

                info.fname_monitor = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/monitor_'+campaign+suffix+'.txt'
                info.fname_testKmeans = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/testKmeans_'+campaign+suffix+'.txt'
                info.fname_testSurvival = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/testSurvival_'+campaign+suffix+'.txt'

                info.fname_evaluation = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/evaluation_'+campaign+suffix+'.txt'
                info.fname_baseline_q = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/baseline_q_'+campaign+suffix+'.txt'
                info.fname_tree_q = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/tree_q_'+campaign+suffix+'.txt'
                info.fname_baseline_w = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/baseline_w_'+campaign+suffix+'.txt'
                info.fname_tree_w = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/tree_w_'+campaign+suffix+'.txt'

                info.fname_pruneNode = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/pruneNode_'+campaign+suffix+'.txt'
                info.fname_pruneEval = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/pruneEval_'+campaign+suffix+'.txt'
                info.fname_testwin = OFROOT+campaign+'/'+modeName+'/paraTune/leafSize_'+str(leafSize)+'/testwin_'+campaign+suffix+'.txt'
                # baseline
                print campaign,modeName,'leafSize',leafSize,"baseline begins."
                print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
                baseline(info)
                print campaign,modeName,'leafSize',leafSize,"baseline ends."
                # getDataset
                dataset = getTrainData(info.fname_trainlog,info.fname_trainbid)

                print campaign,modeName,'leafSize',leafSize,"decisionTree2 begins."
                print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
                decisionTree2(dataset,info)

                #evaluation
                print campaign,modeName,'leafSize',leafSize,"evaluation begins."
                print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
                evaluate(info)

                # runtime
                end_time = time.clock()
                runtimes_leafSize[campaign][mode][leafSize] = end_time-start_time

                print campaign,modeName,leafSize,"run time: "+str(end_time-start_time)+" s"



    for campaign in runtimes_leafSize:
        for mode in runtimes_leafSize[campaign]:
            for leafSize in runtimes_leafSize[campaign][mode]:
                print campaign,MODE_NAME_LIST[mode],'leafSize',leafSize,"runtime "+str( runtimes_leafSize[campaign][mode][leafSize] )
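
`paraTune` relies on module-level constants (`IFROOT`, `OFROOT`, `MODE_LIST`, `MODE_NAME_LIST`, `BASE_BID`, `LAPLACE`, `TREE_DEPTH`) and on the `Info`, `baseline`, `getTrainData`, `decisionTree2` and `evaluate` helpers, none of which appear in this example. A hypothetical driver showing how it might be wired up; every value below is a placeholder, not taken from the original project:

# Placeholder configuration for illustration only.
IFROOT = '../data/'                    # input root: <IFROOT><campaign>/train.log.txt, ...
OFROOT = '../output/'                  # output root for baselines, trees and evaluations
MODE_LIST = [0, 1, 2]                  # indices into MODE_NAME_LIST / suffix_list
MODE_NAME_LIST = ['mode_n', 'mode_s', 'mode_f']   # placeholder mode names
BASE_BID = 0
LAPLACE = 1
TREE_DEPTH = 15

if __name__ == '__main__':
    paraTune(['1458'])                 # placeholder campaign ID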
Example #3
def main(campaign_list):
    suffix_list = ['n','s','f']
    runtimes = {}
    for campaign in campaign_list:
        for mode in MODE_LIST:
            # temporary filter
            start_time = time.clock()

            info = Info()
            info.basebid = BASE_BID
            info.campaign = campaign
            info.mode = mode
            modeName = MODE_NAME_LIST[mode]
            suffix = suffix_list[mode]

            info.laplace = LAPLACE
            info.leafSize = LEAF_SIZE
            info.treeDepth = TREE_DEPTH

            # create os directory
            if not os.path.exists(OFROOT+campaign+'/'+modeName):
                os.makedirs(OFROOT+campaign+'/'+modeName)
            # info assignment
            info.fname_trainlog = IFROOT+campaign+'/train.log.txt'
            info.fname_testlog = IFROOT+campaign+'/test.log.txt'
            info.fname_nodeData = OFROOT+campaign+'/'+modeName+'/nodeData_'+campaign+suffix+'.txt'
            info.fname_nodeInfo = OFROOT+campaign+'/'+modeName+'/nodeInfos_'+campaign+suffix+'.txt'

            info.fname_trainbid = IFROOT+campaign+'/train_bid.txt'
            info.fname_testbid = IFROOT+campaign+'/test_bid.txt'
            info.fname_baseline = OFROOT+campaign+'/'+modeName+'/baseline_'+campaign+suffix+'.txt'

            info.fname_monitor = OFROOT+campaign+'/'+modeName+'/monitor_'+campaign+suffix+'.txt'
            info.fname_testKmeans = OFROOT+campaign+'/'+modeName+'/testKmeans_'+campaign+suffix+'.txt'
            info.fname_testSurvival = OFROOT+campaign+'/'+modeName+'/testSurvival_'+campaign+suffix+'.txt'

            info.fname_evaluation = OFROOT+campaign+'/'+modeName+'/evaluation_'+campaign+suffix+'.txt'
            info.fname_baseline_q = OFROOT+campaign+'/'+modeName+'/baseline_q_'+campaign+suffix+'.txt'
            info.fname_tree_q = OFROOT+campaign+'/'+modeName+'/tree_q_'+campaign+suffix+'.txt'
            info.fname_baseline_w = OFROOT+campaign+'/'+modeName+'/baseline_w_'+campaign+suffix+'.txt'
            info.fname_tree_w = OFROOT+campaign+'/'+modeName+'/tree_w_'+campaign+suffix+'.txt'

            info.fname_pruneNode = OFROOT+campaign+'/'+modeName+'/pruneNode_'+campaign+suffix+'.txt'
            info.fname_pruneEval = OFROOT+campaign+'/'+modeName+'/pruneEval_'+campaign+suffix+'.txt'
            info.fname_testwin = OFROOT+campaign+'/'+modeName+'/testwin_'+campaign+suffix+'.txt'
            step = STEP
            # baseline
            print campaign+" "+modeName+" baseline begins."
            print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
            baseline(info)
            print campaign+" "+modeName+" baseline ends."
            # getDataset
            dataset = getTrainData(info.fname_trainlog,info.fname_trainbid)

            print campaign+" "+modeName+" decisionTree2 begins."
            print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
            decisionTree2(dataset,info)

            #evaluation
            print "evaluation begins."
            print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
            evaluate(info)

            # runtime
            end_time = time.clock()
            if campaign not in runtimes:
                runtimes[campaign] = []
            runtimes[campaign].append(end_time-start_time)

            print campaign+" run time: "+str(end_time-start_time)+" s"

    for campaign in runtimes:
        for mode in range(0,len(runtimes[campaign])):
            print campaign+" "+MODE_NAME_LIST[mode]+" runtime "+str( runtimes[campaign][mode] )
def full_clustering_procedure_comparisons(ndata=100000,
                                          N=50000,
                                          nmachines=50,
                                          min_q_len=6,
                                          max_q_len=15,
                                          number_of_clusterings=1,
                                          queryfile=None,
                                          np=.995,
                                          delim=','):
    NoNodes = ndata

    for iteration in xrange(number_of_clusterings):
        print 'ITERATION: ', iteration

        #        np = .993
        p = np / NoNodes

        output = []
        if queryfile == None:
            # we generate a random graph on NoNodes vertices (the probability needs to be set)
            g = Graph.Erdos_Renyi(n=NoNodes, p=p)
            print 'Graph generated'

            # pick a random node from the graph
            node = random.randint(0, NoNodes - 1)

            # run DFS from the chosen node; the helper takes the graph and a start node
            output = []

            # loop until N queries of sufficient length have been collected
            #    for q in range(N):
            while len(output) < N:
                node = random.randint(0, NoNodes - 1)
                line = iterative_dfs(g, node, path=[])
                if len(line) >= min_q_len:
                    output.append(line)

            graphfile = 'n' + str(
                len(output) / 1000) + 'np' + str(np) + '_' + str(iteration)
            with open(graphfile + '.csv', 'wb') as f:
                w = csv.writer(f)
                for line in output:
                    w.writerow(line)

            print 'Queries generated', len(output)
        else:
            with open(queryfile + '.csv', 'rb') as f:
                r = csv.reader(f, delimiter=delim)
                for row in r:
                    output.append(map(int, row))
            print 'Queries imported'
            graphfile = queryfile

        infile = graphfile

        test_queries = output

        max_len = len(test_queries)
        N = len(test_queries)
        #min(50000, len(test_queries))
        test_queries = test_queries[:max_len]

        #    clusters, disjoint, cluster_tracker, data_added_count, data_in_nclusters = simple_entropy(test_queries)
        #    clusters, cl_entropies = simple_entropy(test_queries)
        clustering = Clustering(test_queries, notif='loud')
        clusters = clustering.clusters

        outfile = infile + '_output_test'

        print 'Clustered'

        with open(outfile + '.csv', 'wb') as f:
            #    f.write('Output from simpleROCK clustering algorithm \n')
            f.write(str(len(clusters)) + '\n')
            ctr = 1
            for c in clusters:
                f.write('-----------------------\n')
                f.write('Cluster ' + str(ctr) + '\n')
                f.write('# of Queries: ' + str(len(c)) + '\n')
                #print 1.0*c.min_query_len/len(c.span)
                #        f.write('Span: ' + str(c.span) + '\n')
                f.write(c.aligned_output())
                f.write('-----------------------\n')
                ctr += 1
        print 'Clusters written to file'

        machines = generate(range(ndata), nmachines)
        dataunit_in_machine = generate_hash(machines, ndata)
        gcpa_data = GCPA(clustering, ndata)
        start = time.time()
        gcpa_data.process(machines, dataunit_in_machine)
        cover_time = time.time() - start
        average = 1.0 * cover_time / len(test_queries)

        gcpa_better = GCPA_better(clustering, ndata)

        betterstart = time.time()
        gcpa_better.process(machines, dataunit_in_machine)
        better_dt = time.time() - betterstart
        better_average = 1.0 * better_dt / len(test_queries)

        lg_start = time.time()
        for query in test_queries:
            cover, dt = linear_greedy(query, machines, dataunit_in_machine)

        lg_dt = time.time() - lg_start
        lg_ave = 1.0 * lg_dt / len(test_queries)

        baseline_start = time.time()
        for query in test_queries:
            cover, dt = baseline(query, machines, dataunit_in_machine)
        baseline_dt = time.time() - baseline_start
        baseline_ave = 1.0 * baseline_dt / len(test_queries)

        b_baseline_start = time.time()
        for query in test_queries:
            cover, dt = better_baseline(query, machines, dataunit_in_machine)
        b_baseline_dt = time.time() - b_baseline_start
        b_baseline_ave = 1.0 * b_baseline_dt / len(test_queries)

        #        print average, better_average, lg_ave, baseline_ave, b_baseline_ave
        print baseline_ave, b_baseline_ave, lg_ave, average, better_average

        covers = gcpa_data.covers
        better_covers = gcpa_better.covers

        to_write = []
        total = 0
        for clusterind, coverset in enumerate(covers):
            for query_ind, cover in enumerate(coverset):
                if total % 1000 == 0:
                    print total
                total += 1
                query = clustering.clusters[clusterind][query_ind]

                gcpa_fast_lin = cover
                gcpa_fast_better = better_covers[clusterind][query_ind]

                lg_cover, lg_dt = linear_greedy(query, machines,
                                                dataunit_in_machine)
                baseline_cover, baseline_dt = baseline(query, machines,
                                                       dataunit_in_machine)
                b_baseline_cover, b_baseline_dt = better_baseline(
                    query, machines, dataunit_in_machine)
                #            to_write.append(map(len, [gcpa_fast_lin, gcpa_fast_better, lg_cover, baseline_cover, b_baseline_cover]))
                to_write.append(
                    map(len, [
                        baseline_cover, b_baseline_cover, lg_cover,
                        gcpa_fast_lin, gcpa_fast_better
                    ]))

        with open(infile + 'big_comparison.csv', 'wb') as f:
            w = csv.writer(f)
            w.writerow([
                'Baseline', 'Better Baseline', 'N-Greedy', 'GCPA_G', 'GCPA_DL'
            ])
            w.writerow([
                baseline_ave, b_baseline_ave, lg_ave, average, better_average
            ])
            for row in to_write:
                w.writerow(row)
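
`full_clustering_procedure_comparisons` depends on python-igraph for the Erdos-Renyi query graph and on several project-local helpers (`iterative_dfs`, `Clustering`, `GCPA`, `GCPA_better`, `generate`, `generate_hash`, `linear_greedy`, `baseline`, `better_baseline`). A sketch of the imports it appears to expect together with a scaled-down invocation; the sizes below are illustrative, not values used by the original experiments:

import csv
import random
import time

from igraph import Graph   # provides Graph.Erdos_Renyi(n=..., p=...)

if __name__ == '__main__':
    # Small sizes so a single clustering run finishes quickly.
    full_clustering_procedure_comparisons(ndata=1000,
                                          N=500,
                                          nmachines=5,
                                          number_of_clusterings=1,
                                          queryfile=None)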
Example #5
import csv
from math import log10


def get_data(filename):
    with open(filename, mode='r') as infile:
        reader = csv.reader(infile)
        next(reader, None)  # skip header
        for rows in reader:
            yield (rows[1], rows[8], rows[3])
            # results.append((rows[1], rows[5], rows[9]))
    # return results


test_data = get_data('video_characteristics_upload.csv')
counter = 1
baseline_loss_avg = 0.0
oracle_loss_avg = 0.0
for video_id, title, channel_id in test_data:
    try:
        baseline_prediction = baseline(channel_id)
        oracle_prediction = oracle(title, channel_id)
        view_count = int(youtube.videos().list(
            id=video_id,
            part='statistics',
            fields='items/statistics/viewCount',
        ).execute()['items'][0]['statistics']['viewCount'])
        baseline_loss = log10(float(baseline_prediction) / view_count)**2
        oracle_loss = log10(float(oracle_prediction) / view_count)**2
        baseline_loss_avg = (
            (counter - 1) * baseline_loss_avg + baseline_loss) / counter
        oracle_loss_avg = (
            (counter - 1) * oracle_loss_avg + oracle_loss) / counter
        print counter
        print "Baseline: %d (Loss %f), Oracle: %d (Loss %f), True Value: %d" % (
            baseline_prediction, baseline_loss, oracle_prediction, oracle_loss,
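
The per-video loss used above is the squared base-10 log ratio between the prediction and the true view count. Pulled out as a standalone helper for clarity (this function is not part of the original snippet):

from math import log10

def squared_log10_error(prediction, true_value):
    # (log10(prediction / true_value))^2: zero for an exact prediction, and the same
    # penalty for over- and under-predicting by a given multiplicative factor.
    return log10(float(prediction) / true_value) ** 2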
Example #6
    def add_all_available_baselines(self):
        for baseline in self.baselines:
            self.config.register_player(name=str(baseline),
                                        algorithm=baseline())
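
The method registers one instance of every class in `self.baselines` under its class name. The `register_player(name=..., algorithm=...)` call follows a PyPokerEngine-style game-config API, but the pattern itself does not depend on that library; a self-contained sketch with a stand-in config (all names below are illustrative):

class DummyConfig(object):
    # Stand-in for the game config object the method above mutates.
    def __init__(self):
        self.players = []

    def register_player(self, name, algorithm):
        self.players.append((name, algorithm))


class AlwaysCallBaseline(object):
    pass


class RandomBaseline(object):
    pass


config = DummyConfig()
for baseline in [AlwaysCallBaseline, RandomBaseline]:
    # str(baseline) yields the class repr, which the original uses directly as the player name.
    config.register_player(name=str(baseline), algorithm=baseline())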
def full_realtime_comparisons(precompute_fraction=.2, nqueries=50000, ndataunits=100000, nmachines=50, r=3, np=.995,
                  min_q_len=6, max_q_len=15, ctype='fast', gcpatype='better', queryfile=None,delim=','): 
    queries = []

    if queryfile == None: 
        g = Graph.Erdos_Renyi(n=ndataunits, p = np/ndataunits)
        q = 0
        while q < nqueries: 
            node=random.randint(0, ndataunits-1)
            line = iterative_dfs(g, node, path=[])
            if len(line) >= min_q_len:
                queries.append(line)
                q += 1

        graphfile = 'n' + str(len(queries)/1000) + 'np' + str(np) +ctype + gcpatype + 'test'
        with open(graphfile + '.csv','wb') as f:
            w = csv.writer(f)
            for line in queries:
                w.writerow(line)

        print 'Queries generated', len(queries)
    else: 
        with open(queryfile + '.csv', 'rb') as f: 
            r = csv.reader(f, delimiter=delim)
            for row in r: 
                queries.append(map(int, row))
        graphfile = queryfile

    infile = graphfile
#    max_to_process = min(nqueries, len(queries))
#    queries = queries[:max_to_process]

    pre_computed = queries[:int(precompute_fraction*len(queries))]
    machines = generate(range(ndataunits), nmachines)
    dataunit_in_machine = generate_hash(machines, ndataunits)

    clustering = Clustering(pre_computed, notif='loud')

    rt_queries = queries[len(pre_computed):]
    
    if gcpatype == 'linear': 
        gcpa_data = GCPA(clustering, ndataunits)
    elif gcpatype == 'better': 
        gcpa_data = GCPA_better(clustering, ndataunits)
    elif gcpatype == 'both': 
        gcpa_linear = GCPA(clustering, ndataunits)
        gcpa_better = GCPA_better(clustering, ndataunits)
    
    if gcpatype != 'both':
        gcpa_data.process(machines, dataunit_in_machine)
    else: 
        gcpa_linear.process(machines, dataunit_in_machine)
        gcpa_better.process(machines, dataunit_in_machine)

    gcpa_rt_coverlens = []
    gcpa_times = []

    
    lg_coverlens = []
    baseline_coverlens = []
    baseline_times = []

    b_baseline_coverlens = []
    b_baseline_times = []
    smaller = 0

    lg_times = []
    for idx, query in enumerate(rt_queries):
        oldlen = len(query)
        if (idx % 1000) == 0: 
            print 'Query: ', idx

        if ctype != 'both': 
            cover, gcpa_dt = rt_query_process(query, clustering, gcpa_data, machines, dataunit_in_machine, ctype)
            gcpa_rt_coverlens.append(len(cover))
            gcpa_times.append(gcpa_dt)
        else: 
            cover_fast, gcpa_fast_dt = rt_query_process(query, clustering, gcpa_linear, machines, dataunit_in_machine, 'fast')
            cover_full, gcpa_full_dt = rt_query_process(query, clustering, gcpa_linear, machines, dataunit_in_machine, 'full')
            cover_better_fast, gcpa_better_fast_dt = rt_query_process(query, clustering, gcpa_better, machines, dataunit_in_machine, 'fast')
            cover_better_full, gcpa_better_full_dt = rt_query_process(query, clustering, gcpa_better, machines, dataunit_in_machine, 'full')
            gcpa_rt_coverlens.append(map(len,[cover_fast, cover_full, cover_better_fast, cover_better_full]))
            gcpa_times.append([gcpa_fast_dt, gcpa_full_dt, gcpa_better_fast_dt, gcpa_better_full_dt])
        
        lg_cover, lg_dt = linear_greedy(query, machines, dataunit_in_machine)
        
        lg_times.append(lg_dt)

        baseline_cover, baseline_time = baseline(query, machines, dataunit_in_machine)

        lg_coverlens.append(len(lg_cover))
        baseline_coverlens.append(len(baseline_cover))
        baseline_times.append(baseline_time)
        
        b_baseline_cover, b_baseline_time = better_baseline(query, machines, dataunit_in_machine)
        b_baseline_coverlens.append(len(b_baseline_cover))
        b_baseline_times.append(b_baseline_time)

    with open(infile +'_cover_len_comparison.csv', 'wb') as f:
        w = csv.writer(f)
        if ctype != 'both': 
            w.writerow(['GCPA', 'Greedy', 'Baseline', 'Better Baseline'])
            for idx, cl in enumerate(gcpa_rt_coverlens):
                w.writerow([cl, lg_coverlens[idx], baseline_coverlens[idx], b_baseline_coverlens[idx]])
        else: 
            w.writerow(['GCPA_G_A', 'GCPA_G_U', 'GCPA_DL_A', 'GCPA_DL_U', 'Greedy', 'Baseline', 'Better Baseline'])
            for idx, cl in enumerate(gcpa_rt_coverlens):
                cl.extend([lg_coverlens[idx], baseline_coverlens[idx], b_baseline_coverlens[idx]])
                w.writerow(cl)

    with open(infile +'_time_comparison.csv', 'wb') as f:
        w = csv.writer(f)
        if ctype != 'both':
            w.writerow(['GCPA', 'Greedy', 'Baseline', 'Better Baseline']) 
            for idx, gcpa_dt in enumerate(gcpa_times): 
                w.writerow([gcpa_dt, lg_times[idx], baseline_times[idx], b_baseline_times[idx]])
        else: 
            w.writerow(['GCPA_G_A', 'GCPA_G_U', 'GCPA_DL_A', 'GCPA_DL_U', 'Greedy', 'Baseline', 'Better Baseline'])
            for idx, gcpa_dt in enumerate(gcpa_times): 
                gcpa_dt.extend([lg_times[idx], baseline_times[idx], b_baseline_times[idx]])
                w.writerow(gcpa_dt)
def full_clustering_procedure_comparisons(ndata = 100000, N=50000, nmachines = 50, min_q_len = 6, max_q_len = 15, number_of_clusterings=1, queryfile = None, np = .995, delim=','): 
    NoNodes = ndata

    for iteration in xrange(number_of_clusterings): 
        print 'ITERATION: ', iteration

#        np = .993
        p = np/NoNodes

        output = []
        if queryfile == None: 
    # we generate a random graph on NoNodes vertices (the probability needs to be set)
            g=Graph.Erdos_Renyi(n=NoNodes, p=p)
            print 'Graph generated'

    # pick a random node from the graph
            node=random.randint(0,NoNodes-1)

    # run DFS from the chosen node; the helper takes the graph and a start node
            output = []

    # loop until N queries of sufficient length have been collected
    #    for q in range(N):
            while len(output) < N: 
                node=random.randint(0, NoNodes-1)
                line = iterative_dfs(g, node, path=[])
                if len(line) >= min_q_len:
                    output.append(line)

            graphfile = 'n' + str(len(output)/1000) + 'np' + str(np) + '_' + str(iteration)
            with open(graphfile + '.csv','wb') as f:
                w = csv.writer(f)
                for line in output:
                    w.writerow(line)

            print 'Queries generated', len(output)
        else: 
            with open(queryfile + '.csv', 'rb') as f: 
                r = csv.reader(f,delimiter=delim)
                for row in r:
                    output.append(map(int, row))
            print 'Queries imported'
            graphfile =  queryfile

        infile = graphfile

        test_queries = output

        max_len = len(test_queries)
        N = len(test_queries)
        #min(50000, len(test_queries))
        test_queries = test_queries[:max_len]

    #    clusters, disjoint, cluster_tracker, data_added_count, data_in_nclusters = simple_entropy(test_queries)
    #    clusters, cl_entropies = simple_entropy(test_queries)
        clustering = Clustering(test_queries, notif='loud')
        clusters = clustering.clusters

        outfile = infile + '_output_test'

        print 'Clustered'

        with open(outfile + '.csv', 'wb') as f:
    #    f.write('Output from simpleROCK clustering algorithm \n')
            f.write(str(len(clusters)) + '\n')
            ctr = 1
            for c in clusters: 
                f.write('-----------------------\n')
                f.write('Cluster ' + str(ctr) + '\n')
                f.write('# of Queries: ' + str(len(c)) + '\n')
            #print 1.0*c.min_query_len/len(c.span)
            #        f.write('Span: ' + str(c.span) + '\n')
                f.write(c.aligned_output())
                f.write('-----------------------\n')
                ctr += 1
        print 'Clusters written to file'

        machines = generate(range(ndata), nmachines)
        dataunit_in_machine = generate_hash(machines, ndata)
        gcpa_data = GCPA(clustering,ndata)
        start = time.time()
        gcpa_data.process(machines, dataunit_in_machine)
        cover_time = time.time() - start
        average = 1.0*cover_time/len(test_queries)

        gcpa_better = GCPA_better(clustering, ndata)

        betterstart = time.time()
        gcpa_better.process(machines, dataunit_in_machine)
        better_dt = time.time() - betterstart
        better_average = 1.0*better_dt/len(test_queries)

        lg_start = time.time()
        for query in test_queries: 
            cover, dt = linear_greedy(query, machines, dataunit_in_machine)

        lg_dt = time.time() - lg_start
        lg_ave = 1.0*lg_dt/len(test_queries)

        baseline_start = time.time()
        for query in test_queries: 
            cover, dt = baseline(query, machines, dataunit_in_machine)
        baseline_dt = time.time() - baseline_start
        baseline_ave = 1.0*baseline_dt/len(test_queries)

        b_baseline_start = time.time()
        for query in test_queries: 
            cover, dt = better_baseline(query, machines, dataunit_in_machine)
        b_baseline_dt = time.time() - b_baseline_start
        b_baseline_ave = 1.0*b_baseline_dt/len(test_queries)

#        print average, better_average, lg_ave, baseline_ave, b_baseline_ave
        print baseline_ave, b_baseline_ave, lg_ave, average, better_average

        covers = gcpa_data.covers
        better_covers = gcpa_better.covers

        to_write = []
        total = 0
        for clusterind, coverset in enumerate(covers): 
            for query_ind, cover in enumerate(coverset): 
                if total % 1000 == 0: 
                    print total 
                total +=1 
                query = clustering.clusters[clusterind][query_ind]

                gcpa_fast_lin = cover
                gcpa_fast_better = better_covers[clusterind][query_ind]

                lg_cover, lg_dt = linear_greedy(query, machines, dataunit_in_machine)
                baseline_cover, baseline_dt = baseline(query, machines, dataunit_in_machine)
                b_baseline_cover, b_baseline_dt = better_baseline(query, machines, dataunit_in_machine)
    #            to_write.append(map(len, [gcpa_fast_lin, gcpa_fast_better, lg_cover, baseline_cover, b_baseline_cover]))
                to_write.append(map(len, [baseline_cover, b_baseline_cover, lg_cover, gcpa_fast_lin, gcpa_fast_better]))

        with open(infile + 'big_comparison.csv', 'wb') as f: 
            w = csv.writer(f)
            w.writerow(['Baseline', 'Better Baseline', 'N-Greedy', 'GCPA_G', 'GCPA_DL'])
            w.writerow([baseline_ave, b_baseline_ave, lg_ave, average, better_average])
            for row in to_write: 
                w.writerow(row)
Example #9
def paraTune(campaign_list):
    suffix_list = ['n','s','f']
    runtimes_leafSize = {}
    for campaign in campaign_list:
        runtimes_leafSize[campaign] = {}
        for mode in MODE_LIST:
            runtimes_leafSize[campaign][mode] = {}

            ##################################   leafSize   ######################################
            for leafSize in [0]:
                start_time = time.clock()

                info = Info()
                info.basebid = BASE_BID
                info.campaign = campaign
                info.mode = mode
                modeName = MODE_NAME_LIST[mode]
                suffix = suffix_list[mode]

                info.laplace = LAPLACE
                info.leafSize = leafSize  # the parameter being tuned in this loop
                info.treeDepth = TREE_DEPTH

                # create os directory
                if not os.path.exists(OFROOT+campaign+'\\'+modeName):
                    os.makedirs(OFROOT+campaign+'\\'+modeName)
                if not os.path.exists(OFROOT+campaign+'\\'+modeName+'\\paraTune'):
                    os.makedirs(OFROOT+campaign+'\\'+modeName+'\\paraTune')
                if not os.path.exists(OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)):
                    os.makedirs(OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize))
                # info assignment
                info.fname_trainlog = IFROOT+campaign+'\\train.log.txt'
                info.fname_testlog = IFROOT+campaign+'\\test.log.txt'
                info.fname_nodeData = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\nodeData_'+campaign+suffix+'.txt'
                info.fname_nodeInfo = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\nodeInfos_'+campaign+suffix+'.txt'

                info.fname_trainbid = IFROOT+campaign+'\\train_bid.txt'
                info.fname_testbid = IFROOT+campaign+'\\test_bid.txt'
                info.fname_baseline = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\baseline_'+campaign+suffix+'.txt'

                info.fname_monitor = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\monitor_'+campaign+suffix+'.txt'
                info.fname_testKmeans = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\testKmeans_'+campaign+suffix+'.txt'
                info.fname_testSurvival = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\testSurvival_'+campaign+suffix+'.txt'

                info.fname_evaluation = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\evaluation_'+campaign+suffix+'.txt'
                info.fname_baseline_q = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\baseline_q_'+campaign+suffix+'.txt'
                info.fname_tree_q = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\tree_q_'+campaign+suffix+'.txt'
                info.fname_baseline_w = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\baseline_w_'+campaign+suffix+'.txt'
                info.fname_tree_w = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\tree_w_'+campaign+suffix+'.txt'

                info.fname_pruneNode = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\pruneNode_'+campaign+suffix+'.txt'
                info.fname_pruneEval = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\pruneEval_'+campaign+suffix+'.txt'
                info.fname_testwin = OFROOT+campaign+'\\'+modeName+'\\paraTune\\leafSize_'+str(leafSize)+'\\testwin_'+campaign+suffix+'.txt'
                # baseline
                print campaign,modeName,'leafSize',leafSize,"baseline begins."
                print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
                baseline(info)
                print campaign,modeName,'leafSize',leafSize,"baseline ends."
                # getDataset
                dataset = getTrainData(info.fname_trainlog,info.fname_trainbid)

                print campaign,modeName,'leafSize',leafSize,"decisionTree2 begins."
                print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
                decisionTree2(dataset,info)

                #evaluation
                print campaign,modeName,'leafSize',leafSize,"evaluation begins."
                print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
                evaluate(info)

                # runtime
                end_time = time.clock()
                runtimes_leafSize[campaign][mode][leafSize] = end_time-start_time

                print campaign,modeName,leafSize,"run time: "+str(end_time-start_time)+" s"



    for campaign in runtimes_leafSize:
        for mode in runtimes_leafSize[campaign]:
            for leafSize in runtimes_leafSize[campaign][mode]:
                print campaign,MODE_NAME_LIST[mode],'leafSize',leafSize,"runtime "+str( runtimes_leafSize[campaign][mode][leafSize] )
Example #10
def main(campaign_list):
    suffix_list = ['n','s','f']
    runtimes = {}
    for campaign in campaign_list:
        for mode in MODE_LIST:
            # temporary filter
            start_time = time.clock()

            info = Info()
            info.basebid = BASE_BID
            info.campaign = campaign
            info.mode = mode
            modeName = MODE_NAME_LIST[mode]
            suffix = suffix_list[mode]

            info.laplace = LAPLACE
            info.leafSize = LEAF_SIZE
            info.treeDepth = TREE_DEPTH

            # create os directory
            if not os.path.exists(OFROOT+campaign+'\\'+modeName):
                os.makedirs(OFROOT+campaign+'\\'+modeName)
            # info assignment
            info.fname_trainlog = IFROOT+campaign+'\\train.log.txt'
            info.fname_testlog = IFROOT+campaign+'\\test.log.txt'
            info.fname_nodeData = OFROOT+campaign+'\\'+modeName+'\\nodeData_'+campaign+suffix+'.txt'
            info.fname_nodeInfo = OFROOT+campaign+'\\'+modeName+'\\nodeInfos_'+campaign+suffix+'.txt'

            info.fname_trainbid = IFROOT+campaign+'\\train_bid.txt'
            info.fname_testbid = IFROOT+campaign+'\\test_bid.txt'
            info.fname_baseline = OFROOT+campaign+'\\'+modeName+'\\baseline_'+campaign+suffix+'.txt'

            info.fname_monitor = OFROOT+campaign+'\\'+modeName+'\\monitor_'+campaign+suffix+'.txt'
            info.fname_testKmeans = OFROOT+campaign+'\\'+modeName+'\\testKmeans_'+campaign+suffix+'.txt'
            info.fname_testSurvival = OFROOT+campaign+'\\'+modeName+'\\testSurvival_'+campaign+suffix+'.txt'

            info.fname_evaluation = OFROOT+campaign+'\\'+modeName+'\\evaluation_'+campaign+suffix+'.txt'
            info.fname_baseline_q = OFROOT+campaign+'\\'+modeName+'\\baseline_q_'+campaign+suffix+'.txt'
            info.fname_tree_q = OFROOT+campaign+'\\'+modeName+'\\tree_q_'+campaign+suffix+'.txt'
            info.fname_baseline_w = OFROOT+campaign+'\\'+modeName+'\\baseline_w_'+campaign+suffix+'.txt'
            info.fname_tree_w = OFROOT+campaign+'\\'+modeName+'\\tree_w_'+campaign+suffix+'.txt'

            info.fname_pruneNode = OFROOT+campaign+'\\'+modeName+'\\pruneNode_'+campaign+suffix+'.txt'
            info.fname_pruneEval = OFROOT+campaign+'\\'+modeName+'\\pruneEval_'+campaign+suffix+'.txt'
            info.fname_testwin = OFROOT+campaign+'\\'+modeName+'\\testwin_'+campaign+suffix+'.txt'
            step = STEP
            # baseline
            print campaign+" "+modeName+" baseline begins."
            print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
            baseline(info)
            print campaign+" "+modeName+" baseline ends."
            # getDataset
            dataset = getTrainData(info.fname_trainlog,info.fname_trainbid)

            print campaign+" "+modeName+" decisionTree2 begins."
            print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
            decisionTree2(dataset,info)

            #evaluation
            print "evaluation begins."
            print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
            evaluate(info)

            # runtime
            end_time = time.clock()
            if campaign not in runtimes:
                runtimes[campaign] = []
            runtimes[campaign].append(end_time-start_time)

            print campaign+" run time: "+str(end_time-start_time)+" s"

    for campaign in runtimes:
        for mode in range(0,len(runtimes[campaign])):
            print campaign+" "+MODE_NAME_LIST[mode]+" runtime "+str( runtimes[campaign][mode] )
Example #11
def full_realtime_comparisons(precompute_fraction=.2,
                              nqueries=50000,
                              ndataunits=100000,
                              nmachines=50,
                              r=3,
                              np=.995,
                              min_q_len=6,
                              max_q_len=15,
                              ctype='fast',
                              gcpatype='better',
                              queryfile=None,
                              delim=','):
    queries = []

    if queryfile == None:
        g = Graph.Erdos_Renyi(n=ndataunits, p=np / ndataunits)
        q = 0
        while q < nqueries:
            node = random.randint(0, ndataunits - 1)
            line = iterative_dfs(g, node, path=[])
            if len(line) >= min_q_len:
                queries.append(line)
                q += 1

        graphfile = 'n' + str(
            len(queries) / 1000) + 'np' + str(np) + ctype + gcpatype + 'test'
        with open(graphfile + '.csv', 'wb') as f:
            w = csv.writer(f)
            for line in queries:
                w.writerow(line)

        print 'Queries generated', len(queries)
    else:
        with open(queryfile + '.csv', 'rb') as f:
            r = csv.reader(f, delimiter=delim)
            for row in r:
                queries.append(map(int, row))
        graphfile = queryfile

    infile = graphfile
    #    max_to_process = min(nqueries, len(queries))
    #    queries = queries[:max_to_process]

    pre_computed = queries[:int(precompute_fraction * len(queries))]
    machines = generate(range(ndataunits), nmachines)
    dataunit_in_machine = generate_hash(machines, ndataunits)

    clustering = Clustering(pre_computed, notif='loud')

    rt_queries = queries[len(pre_computed):]

    if gcpatype == 'linear':
        gcpa_data = GCPA(clustering, ndataunits)
    elif gcpatype == 'better':
        gcpa_data = GCPA_better(clustering, ndataunits)
    elif gcpatype == 'both':
        gcpa_linear = GCPA(clustering, ndataunits)
        gcpa_better = GCPA_better(clustering, ndataunits)

    if gcpatype != 'both':
        gcpa_data.process(machines, dataunit_in_machine)
    else:
        gcpa_linear.process(machines, dataunit_in_machine)
        gcpa_better.process(machines, dataunit_in_machine)

    gcpa_rt_coverlens = []
    gcpa_times = []

    lg_coverlens = []
    baseline_coverlens = []
    baseline_times = []

    b_baseline_coverlens = []
    b_baseline_times = []
    smaller = 0

    lg_times = []
    for idx, query in enumerate(rt_queries):
        oldlen = len(query)
        if (idx % 1000) == 0:
            print 'Query: ', idx

        if ctype != 'both':
            cover, gcpa_dt = rt_query_process(query, clustering, gcpa_data,
                                              machines, dataunit_in_machine,
                                              ctype)
            gcpa_rt_coverlens.append(len(cover))
            gcpa_times.append(gcpa_dt)
        else:
            cover_fast, gcpa_fast_dt = rt_query_process(
                query, clustering, gcpa_linear, machines, dataunit_in_machine,
                'fast')
            cover_full, gcpa_full_dt = rt_query_process(
                query, clustering, gcpa_linear, machines, dataunit_in_machine,
                'full')
            cover_better_fast, gcpa_better_fast_dt = rt_query_process(
                query, clustering, gcpa_better, machines, dataunit_in_machine,
                'fast')
            cover_better_full, gcpa_better_full_dt = rt_query_process(
                query, clustering, gcpa_better, machines, dataunit_in_machine,
                'full')
            gcpa_rt_coverlens.append(
                map(len, [
                    cover_fast, cover_full, cover_better_fast,
                    cover_better_full
                ]))
            gcpa_times.append([
                gcpa_fast_dt, gcpa_full_dt, gcpa_better_fast_dt,
                gcpa_better_full_dt
            ])

        lg_cover, lg_dt = linear_greedy(query, machines, dataunit_in_machine)

        lg_times.append(lg_dt)

        baseline_cover, baseline_time = baseline(query, machines,
                                                 dataunit_in_machine)

        lg_coverlens.append(len(lg_cover))
        baseline_coverlens.append(len(baseline_cover))
        baseline_times.append(baseline_time)

        b_baseline_cover, b_baseline_time = better_baseline(
            query, machines, dataunit_in_machine)
        b_baseline_coverlens.append(len(b_baseline_cover))
        b_baseline_times.append(b_baseline_time)

    with open(infile + '_cover_len_comparison.csv', 'wb') as f:
        w = csv.writer(f)
        if ctype != 'both':
            w.writerow(['GCPA', 'Greedy', 'Baseline', 'Better Baseline'])
            for idx, cl in enumerate(gcpa_rt_coverlens):
                w.writerow([
                    cl, lg_coverlens[idx], baseline_coverlens[idx],
                    b_baseline_coverlens[idx]
                ])
        else:
            w.writerow([
                'GCPA_G_A', 'GCPA_G_U', 'GCPA_DL_A', 'GCPA_DL_U', 'Greedy',
                'Baseline', 'Better Baseline'
            ])
            for idx, cl in enumerate(gcpa_rt_coverlens):
                cl.extend([
                    lg_coverlens[idx], baseline_coverlens[idx],
                    b_baseline_coverlens[idx]
                ])
                w.writerow(cl)

    with open(infile + '_time_comparison.csv', 'wb') as f:
        w = csv.writer(f)
        if ctype != 'both':
            w.writerow(['GCPA', 'Greedy', 'Baseline', 'Better Baseline'])
            for idx, gcpa_dt in enumerate(gcpa_times):
                w.writerow([
                    gcpa_dt, lg_times[idx], baseline_times[idx],
                    b_baseline_times[idx]
                ])
        else:
            w.writerow([
                'GCPA_G_A', 'GCPA_G_U', 'GCPA_DL_A', 'GCPA_DL_U', 'Greedy',
                'Baseline', 'Better Baseline'
            ])
            for idx, gcpa_dt in enumerate(gcpa_times):
                gcpa_dt.extend([
                    lg_times[idx], baseline_times[idx], b_baseline_times[idx]
                ])
                w.writerow(gcpa_dt)
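
As with the batch comparison above, `full_realtime_comparisons` needs python-igraph plus the project's clustering and cover-computation helpers. A scaled-down, hypothetical invocation; the parameter values are illustrative only:

if __name__ == '__main__':
    # A fraction of the generated queries (precompute_fraction) is clustered up front;
    # the remaining queries are answered one at a time in the real-time loop.
    full_realtime_comparisons(precompute_fraction=.2,
                              nqueries=2000,
                              ndataunits=5000,
                              nmachines=10,
                              ctype='fast',
                              gcpatype='better',
                              queryfile=None)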
Example #12
    def unsupervised_with_baseline(self):
        decoder = self.create_decoder()
        assert (os.path.exists(self.options.result_dir + 'model_dec'))
        self.load_decoder(decoder)

        encoder = self.create_encoder()
        assert (os.path.exists(self.options.result_dir + 'model_enc'))
        self.load_encoder(encoder)

        baseline = self.create_baseline()
        if os.path.exists(self.options.result_dir + 'baseline'):
            self.load_baseline(baseline)

        enc_trainer = optimizers[self.options.optimizer](encoder.model)
        dec_trainer = optimizers[self.options.optimizer](decoder.model)
        baseline_trainer = optimizers[self.options.optimizer](baseline.model)
        lr = self.options.lr  #used only for sgd

        i = 0
        lowest_valid_loss = 9999
        print('unsupervised training...')
        for epoch in range(self.options.epochs):
            sents = 0
            total_loss = 0.0

            train = self.reader.next_example(0)
            train_size = len(self.reader.data[0])

            for data in train:
                s1, s2, s3, pos, act = data[0], data[1], data[2], data[
                    3], data[4]
                sents += 1

                # random sample
                enc_loss_act, _, act = encoder.parse(s1,
                                                     s2,
                                                     s3,
                                                     pos,
                                                     sample=True)
                _, dec_loss_act, dec_loss_word = decoder.compute_loss(s3, act)

                # save reward
                logpx = -dec_loss_word.scalar_value()
                total_loss -= logpx

                # reconstruction and regularization loss backprop to theta_d
                dec_loss_total = dec_loss_word + dec_loss_act * dy.scalarInput(
                    self.options.dec_reg)
                dec_loss_total = dec_loss_total * dy.scalarInput(
                    1.0 / self.options.mcsamples)
                dec_loss_total.scalar_value()
                dec_loss_total.backward()

                # update decoder
                if self.options.optimizer == 'sgd':
                    dec_trainer.update(lr)
                else:
                    dec_trainer.update()

                if self.options.enc_update > 0:
                    # compute baseline and backprop to theta_b
                    b = baseline(s3)
                    logpxb = b.scalar_value()
                    b_loss = dy.squared_distance(b, dy.scalarInput(logpx))
                    b_loss.value()
                    b_loss.backward()

                    # update baseline
                    if self.options.optimizer == 'sgd':
                        baseline_trainer.update(lr)
                    else:
                        baseline_trainer.update()

                    # policy and regularization loss backprop to theta_e
                    enc_loss_act = encoder.train(s1, s2, s3, pos, act)
                    enc_loss_policy = enc_loss_act * dy.scalarInput(
                        (logpx - logpxb) / len(s1))
                    enc_loss_total = enc_loss_policy * dy.scalarInput(
                        self.options.enc_update
                    ) - enc_loss_act * dy.scalarInput(self.options.enc_reg)
                    enc_loss_total = enc_loss_total * dy.scalarInput(
                        1.0 / self.options.mcsamples)
                    enc_loss_total.value()
                    enc_loss_total.backward()

                    # update encoder
                    if self.options.optimizer == 'sgd':
                        enc_trainer.update(lr)
                    else:
                        enc_trainer.update()

                e = float(i) / train_size
                if i % self.options.print_every == 0:
                    print('epoch {}: loss per sentence: {}'.format(
                        e, total_loss / sents))
                    sents = 0
                    total_loss = 0.0
                if i != 0 and i % self.options.save_every == 0:
                    print('computing loss on validation set...')
                    total_valid_loss = 0
                    valid = self.reader.next_example(1)
                    valid_size = len(self.reader.data[1])
                    for vdata in valid:
                        s1, s2, s3, pos, act = vdata[0], vdata[1], vdata[
                            2], vdata[3], vdata[4]
                        _, _, valid_word_loss = decoder.compute_loss(s3, act)
                        if valid_word_loss is not None:
                            total_valid_loss += valid_word_loss.scalar_value()
                    total_valid_loss = total_valid_loss * 1.0 / valid_size
                    if total_valid_loss < lowest_valid_loss:
                        lowest_valid_loss = total_valid_loss
                        print('saving model...')
                        encoder.Save(self.options.result_dir + 'model_enc')
                        decoder.Save(self.options.result_dir + 'model_dec')
                        baseline.Save(self.options.result_dir + 'baseline')
                    else:
                        lr = lr * self.options.decay
                i += 1
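
The unsupervised loop in Example #12 is a REINFORCE-with-baseline update: the decoder's reconstruction log-likelihood log p(x | a) serves as the reward for the encoder's sampled actions, the learned baseline b(x) is regressed onto that reward with a squared error, and the encoder's policy loss is scaled by the length-normalized advantage log p(x | a) - b(x). A stripped-down sketch of those three steps, assuming DyNet expressions as in the original; the function and argument names below are illustrative:

import dynet as dy

def reinforce_with_baseline_step(enc_loss_act, dec_loss_word, b, sent_len, enc_reg=0.1):
    # 1. Reward: reconstruction log-likelihood of the sentence under the decoder.
    logpx = -dec_loss_word.scalar_value()

    # 2. Baseline loss: pull the learned baseline b(x) toward logpx.
    b_loss = dy.squared_distance(b, dy.scalarInput(logpx))

    # 3. Policy loss: encoder action loss scaled by the length-normalized advantage,
    #    minus a regularization term on the action loss (mirroring the original).
    advantage = (logpx - b.scalar_value()) / sent_len
    enc_loss = enc_loss_act * dy.scalarInput(advantage) - enc_loss_act * dy.scalarInput(enc_reg)

    return b_loss, enc_loss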