def test_heuristics(dataset_name, min_k, max_k, k_step, nSamples, results_file, tau_scale, cores):
    """Benchmark INFEST-based vs. Vanilla (Monte-Carlo) spread heuristics.

    For every seed-set fraction k_frac in [min_k, max_k) (step k_step) and for
    nSamples random seed sets per fraction, estimates the influence spread with
    every configuration listed in the module-level tables `infest_heuristics`
    and `vanilla_heuristics`, then appends one tab-separated row per sample to
    results_file: k_frac, reference ("true") value, its extrapolated cycle
    cost, followed by (estimate, cycles) pairs for each heuristic.

    Parameters:
        dataset_name -- name of the pickled link-server dataset
        min_k, max_k, k_step -- seed-set sizes as fractions of n
        nSamples -- number of random seed sets per k value
        results_file -- output path (truncated for the header, then appended)
        tau_scale -- scale factor forwarded to the INFEST heuristic
        cores -- worker-core count forwarded to the estimators

    NOTE(review): also reads module-level globals nBFS_samples, min_nSamples
    and min_relative_error -- confirm they are defined at module scope.
    """
    n = LinkServerCP(dataset_name).getNumNodes()
    # Theoretic BFS sample budget n*log2(n); used below to extrapolate the
    # measured cycle cost of the reference run to this budget.
    nBFS_theoretic = int(n *log(n, 2))
    # Truncate results_file and write a header: dataset name plus one column
    # label per heuristic configuration.
    f = open(results_file,'w')
    f.write('# Dataset: %s\n'%dataset_name)
    f.write('\t'.join('INFEST(%d,%d)'%(init_samples, iter_samples) for init_samples, iter_samples in \
                      infest_heuristics) + '\n')
    f.write('\t'.join('Vanilla(%d)'%(vanilla_samples) for vanilla_samples in vanilla_heuristics) + '\n')
    f.close()
    for k_frac in np.arange(min_k, max_k, k_step):
        print "k_frac = ", k_frac
        # Absolute seed-set size for this fraction.
        k = int(n * k_frac)
        seeds_fname = "%s-seeds-%d.cp"%(dataset_name, k)
        for i in xrange(nSamples):
            print "sample #",i
            print "Testing heuristics for dataset: ", dataset_name
            # Draw one fresh random seed set of size k; generateSeedFiles is
            # expected to pickle it to seeds_fname.
            generateSeedFiles(k, k+1, 1, range(n), 1, dataset_name + "-seeds-")
            seeds=cp.load(open(seeds_fname,'r'))
            true_value, num_cycles_full = 0, 0
            print "taking %d samples to evaluate the true value of spread"%nBFS_samples
            # Reference value: run Vanilla until the relative standard error
            # drops below min_relative_error (at least min_nSamples samples).
            true_value, num_cycles_full = runVanilla(dataset = dataset_name, seeds = seeds_fname, nCores = cores, min_samples = min_nSamples, min_relative_std_error = min_relative_error)
            print "true value is: ", true_value
            # Extrapolate the measured cycles to the theoretic sample budget.
            num_cycles_full = int(1. * nBFS_theoretic / nBFS_samples * num_cycles_full)
            infest_results = []
            print "Running INFEST based heuristics"
            # One (estimate, cycles) pair per INFEST configuration.
            for init_samples, iter_samples in infest_heuristics:
                print "INFEST(%d,%d)"%(init_samples, iter_samples)
                approx_estimate, num_cycles_approx = runApproxHeuristic(dataset_name, seeds_fname, tau_scale, cores,\
                                                 init_samples, iter_samples)
                infest_results.append((approx_estimate, num_cycles_approx))
            
            vanilla_results = []
            print "Running Vanilla based heuristics"
            # One (estimate, cycles) pair per fixed-sample Vanilla run.
            for samples in vanilla_heuristics:
                seq_estimate, num_cycles_seq = runVanilla(dataset_name, seeds_fname, samples, cores)
                vanilla_results.append((seq_estimate, num_cycles_seq))
            
            removeFile(seeds_fname)
            # Open/close per sample so partial results survive a later crash.
            f = open(results_file, 'a')
            f.write('%.6f\t%.6f\t%d\t'%(k_frac, true_value, num_cycles_full) + '\t'.join('%d\t%d'%results for results in infest_results) + '\t' + \
                    '\t'.join('%d\t%d'%results for results in vanilla_results) + '\n')
            f.close()
 # NOTE(review): fragment -- the enclosing function's def line is outside this
 # view; n, results_dir, output, min_frac, max_frac, interval, nSamples,
 # dataset and parameters come from that missing scope.
 #n = len(V)
 print "Number of nodes: ", n
 nBFS_samples = 1000
 running_times_file = results_dir + output + '-running_times-ratios_k_min-%.3f-k_max-%.3f-samples-%d-bfs_samples-%d-large'%(min_frac,max_frac,nSamples,nBFS_samples)
 running_times_file_raw = running_times_file + "-raw"
 removeFile(running_times_file)
 f = open(running_times_file, 'w')
 f_raw = open(running_times_file_raw,'w')
 nBFS_samples = 1000
 # Theoretic sequential sample budget n*log2(n), used to extrapolate the
 # measured sequential cost below.
 nBFS_samples_theoretic = n * log(n,2)
 for nSeeds in xrange(int(min_frac * n), int(max_frac * n), int(interval * n)):
     print "k = ", nSeeds
     seeds_fname = output + "-seeds-" + str(nSeeds) + '.cp'
     runtimes_approx, runtimes_seq = [], []
     for i in xrange(nSamples):
         # Fresh random seed set of size nSeeds for every sample.
         generateSeedFiles(nSeeds, nSeeds+1, int(interval * n), range(n), 1, output + "-seeds-")
         perf_csv_fname = dataset + 'runtimes_large.csv'
         # Measure CPU cycles of the approximation run via `perf stat`
         # (-x, = CSV output, -o = output file); stdout.read() blocks until done.
         subprocess.Popen("perf stat -x, -o %s python ic_bfs_eval.py -dataset %s -res_fname %s -seeds %s -output_mode 3 -cores %d"%\
                           (perf_csv_fname, dataset, output + "-approx-" + str(nSeeds), seeds_fname, parameters.cores), \
                           shell = True, stdout = subprocess.PIPE).stdout.read()
         num_cycles_approx = getNumCycles(perf_csv_fname)
         runtimes_approx.append(num_cycles_approx)
         print "Done approximating, now running naive sequential algorithm"
         removeFile(perf_csv_fname)
         # Same measurement for the naive sequential estimator (nBFS_samples BFS runs).
         subprocess.Popen("perf stat -x, -o %s python seq_estimation.py -dataset %s -seeds_file %s -results_file %s -output_mode 3 -nSamples %d -cores %d"%(perf_csv_fname, dataset, seeds_fname, output + "-seq-" + str(nSeeds), nBFS_samples, parameters.cores), shell=True,stdout=subprocess.PIPE).stdout.read()
         runtime_seq_samples = getNumCycles(perf_csv_fname)
         # Scale the cost measured for nBFS_samples up to the theoretic budget.
         theoretic_num_cycles = 1.0 * runtime_seq_samples / nBFS_samples * nBFS_samples_theoretic
         runtimes_seq.append(theoretic_num_cycles)
         f_raw.write('%.3f\t%.3f\t%.3f\n'%(1.*nSeeds/n, num_cycles_approx, theoretic_num_cycles))
         removeFile('runtimes_large_seq.csv')
     # NOTE(review): label says runtimes_approx but runtimes_seq is printed --
     # looks like a copy/paste slip; confirm intent before fixing.
     print "runtimes_approx: ", runtimes_seq
    # NOTE(review): fragment -- enclosing function header not visible; L,
    # k_mode, k, results_dir, output, min_samples, max_samples, samples_step,
    # dataset, prob_method and parameters come from the missing scope.
    n = L.getNumNodes()
    # k is a fraction of n when k_mode == 0, otherwise an absolute count.
    if k_mode == 0:
        nSeeds = int(n * k)
    else:
        nSeeds = int(k)

    # Randomized perf output name to avoid collisions between concurrent runs.
    perf_csv_fname = "perf_out" + str(sample(range(1000),1)[0]) + ".csv"
    print "Number of nodes: ", n
    print "nSeeds = ", nSeeds

    results_file = results_dir + output + '-influence_values_samples_min-%d-samples_max-%d-k-%.3f-prob_method-%d'%(min_samples,max_samples, k, prob_method)
    removeFile(results_file)

    seeds_fname = "%s-seeds-%d.cp"%(dataset,nSeeds)
    
    # One random seed set of size nSeeds drawn from all n nodes.
    generateSeedFiles(nSeeds, nSeeds+1, 1, range(n), 1, dataset + "-seeds-")
    removeFile(results_file)
    values = []
    for samples in xrange(min_samples, max_samples+1, samples_step):
        print "number of samples = ", samples        
        output_fname = results_dir + 'nReached%d.txt'%random.randint(1,1000)
        # Run samples_step additional BFS samples; per-sample reached-node
        # counts are written to output_fname by seq_estimation.py.
        subprocess.Popen("python seq_estimation.py -dataset %s -cores %d -seeds_file %s -results_file %s -output_mode 3 -nSamples %d -get_n_reached 1 -reached_nodes_file %s"%(dataset, parameters.cores, seeds_fname, output + "-seq-" + str(nSeeds), samples_step, output_fname), shell=True,stdout=subprocess.PIPE).stdout.read()
        removeFile(output + "-seq-" + str(nSeeds))
        # Accumulate spread values across iterations (first line only).
        f_values = open(output_fname,'r')
        values += [int(v) for v in f_values.readline().strip().split()]
        f_values.close()
        removeFile(output_fname)
        print "spread values for %d samples: %s"%(samples,str(values))
        # Append cumulative sample count, std-dev, and the raw values so far.
        f = open(results_file, 'a')
        f.write('%d\t%.5f\t%s\n'%(samples, np.std(values), "\t".join(str(val) for val in values)))
        f.close()
# --- Example #4 (scraped-snippet delimiter; extraction artifact) ---
    # NOTE(review): fragment -- enclosing function header not visible.
    print "creating link-server object"
    # prob_method 0: uniform edge probability 0.2; otherwise a two-value
    # probability list (semantics defined by LinkServerCP -- confirm).
    if parameters.prob_method == 0:
        edge_prob = 0.2
    else:
        edge_prob = [0.1,0.01]
    L=LinkServerCP(dataset, edges_csv, create_new=True, prob_method=parameters.prob_method, prob=edge_prob, delim=delimiter, undirected=parameters.undirected)
    print "n = ", L.getNumNodes()
    V = LoadNodesFromFile(edges_csv, delimiter)
    n = len(V)
    print 'min_frac', min_frac
    # Seed-set sizes swept from k_min to k_max (inclusive) in steps of k_step.
    k_min = int(n * min_frac)
    k_max = int(n * max_frac) + 1
    k_step = int(n * interval)
    

    generateSeedFiles(k_min, k_max, k_step, V, nSamples, 'experiments/' + output + "-seeds-")
    
    
    mean_errors, std_errors = [], []

    for k in xrange(k_min, k_max,k_step):
        approx_fname = 'experiments/results/' + output + "-approx_errors-k_min-%d-k_max-%d-k-%d-samples-%d"%(k_min,k_max,k,nSamples)
        seq_fname = 'experiments/results/' + output + "-seq-approx-errors-k-%d-samples-%d"%(k,nSamples)
        print approx_fname
        print seq_fname
        removeFile(approx_fname)
        seeds_fname = 'experiments/' + output + "-seeds-" + str(k) + ".cp"
        print "Running approx algorithm for k=: ", k
        # Blocking subprocess call; stdout.read() waits for completion.
        subprocess.Popen("python ic_bfs_eval.py -dataset %s -res_fname %s -seeds %s -output_mode 2"%\
                        (dataset, approx_fname, seeds_fname), \
                        shell = True, stdout = subprocess.PIPE).stdout.read()
# --- Example #5 (scraped-snippet delimiter; extraction artifact) ---
    # NOTE(review): fragment -- enclosing function header not visible.
    print "creating link-server object"
    # prob_method 0: uniform edge probability 0.2; otherwise a two-value
    # probability list (semantics defined by LinkServerCP -- confirm).
    if parameters.prob_method == 0:
        edge_prob = 0.2
    else:
        edge_prob = [0.1,0.01]
    L=LinkServerCP(dataset, edges_csv, create_new=True, prob_method=parameters.prob_method, prob=edge_prob, delim=delimiter)
    print "n = ", L.getNumNodes()
    V = LoadNodesFromFile(edges_csv, delimiter)
    n = len(V)
    k = int(n * k_frac)
    print "n=", n
    print "k=", k
    seeds_fname = output + "-seeds-" + str(k) + ".cp"
    removeFile(seeds_fname)
    removeFile(dataset)
    generateSeedFiles(k, k+1, 1, V, nSamples, output + "-seeds-")

    # One approx-result file per scale factor in [min_frac, max_frac).
    approx_fnames = ['experiments/results/' + output + "-approx-k-%d-samples-%d-scale-%.5f"%(k,nSamples,scale) for scale in drange(min_frac,max_frac,interval)]
    seq_fname = 'experiments/results/' + output + "-seq-k-%d-samples-%d-eps-%.5f"%(k,nSamples,eps)

    for i, scale in enumerate(drange(min_frac, max_frac, interval)):
        removeFile(approx_fnames[i])
        print "Running approx algorithm for scale factor: ", scale
        subprocess.Popen("python ic_bfs_eval.py -dataset %s -scale %.5f -res_fname %s -seeds %s -output_mode 0"%\
                         (dataset, scale, approx_fnames[i], seeds_fname), \
                         shell = True, stdout = subprocess.PIPE).stdout.read()
    
    # Baseline: sequential estimator run once on the same seed sets.
    print "Running sequential algorithm on seed sets with eps=%.5f"%eps
    removeFile(seq_fname)
    subprocess.Popen("python seq_estimation.py -dataset %s -seeds_file %s -results_file %s -output_mode 2 -min_samples 500 -min_relative_standard_error 0.01 "%\
                     (dataset, seeds_fname, seq_fname), shell=True,stdout=subprocess.PIPE).stdout.read()
# --- Example #6 (scraped-snippet delimiter; extraction artifact) ---
    # NOTE(review): fragment -- enclosing function header not visible; n, k,
    # k_mode, output, results_dir, seed_sets, min_samples, max_samples,
    # samples_step, dataset, prob_method, parameters and sem come from the
    # missing scope.
    if k_mode == 0:
        nSeeds = int(n * k)
    else:
        nSeeds = int(k)

    # Randomized names to avoid collisions between concurrent runs.
    approx_results_fname = output + "-equal_times-approx-" + str(k) + '-' + str(sample(range(1000),1)[0])
    perf_csv_fname = "perf_out" + str(sample(range(1000),1)[0]) + ".csv"
    nBFS_samples = 1000
    print "Number of nodes: ", n
    print "nSeeds = ", nSeeds

    results_file = results_dir + output + '-influence_concentration_samples_min-%d-samples_max-%d-k-%.3f-prob_method-%d-nSeeds_sets-%d'%(min_samples,max_samples, k, prob_method, seed_sets)
    removeFile(results_file)

    seeds_fname = "%s-seeds-%d.cp"%(results_dir + output, nSeeds)
    # seed_sets independent random seed sets of size nSeeds.
    generateSeedFiles(nSeeds, nSeeds+1, 1, range(n), seed_sets, results_dir + output + "-seeds-")
    values = []
    for samples in xrange(min_samples, max_samples+1, samples_step):
        print "number of samples = ", samples        
        output_fname = results_dir + 'nReached%d.txt'%random.randint(1,1000)
        # Collect per-sample reached-node counts into output_fname.
        subprocess.Popen("python seq_estimation.py -dataset %s -cores %d -seeds_file %s -results_file %s -output_mode 3 -nSamples %d -get_n_reached 1 -reached_nodes_file %s"%(dataset, parameters.cores, seeds_fname, output + "-seq-" + str(nSeeds), samples_step, output_fname), shell=True,stdout=subprocess.PIPE).stdout.read()
        removeFile(output + "-seq-" + str(nSeeds))
        f_values = open(output_fname,'r')
        f = open(results_file, 'a')
        # One output row per line of reached-node counts: cumulative relative
        # standard error of the mean (sem/mean) over all values so far.
        for i, line in enumerate(f_values.readlines()):
            values += [int(v) for v in line.strip().split()]
            f.write('%d\t%.5f\n'%(samples, sem(values)/np.mean(values)))
        f.close()
        f_values.close()
        removeFile(output_fname)
    
   # NOTE(review): the next two lines are the tail of a filename-building
   # expression whose left-hand side was lost when this fragment was cut.
   '-approx-heuristic-prob_method-%d-k_min-%.4f-k_max-%.4f-tau_scale-%.3f-samples-%d-bfs_samples-%d-init_samples-%d-iter_samples-%d'%\
   (parameters.prob_method, min_k, max_k, tau_scale, nSamples, nBFS_samples, init_samples, iter_samples)
 removeFile(results_file)
    
 # Record time to load link-server file
 subprocess.Popen("perf stat -x, -o %s python load_link_server.py -cp %s"%\
                           (perf_csv_fname, dataset), shell = True, stdout = subprocess.PIPE).stdout.read()
 nCycles_link_server = getNumCycles(perf_csv_fname)
 print "Number of cycles for loading link server: ", nCycles_link_server
 removeFile(perf_csv_fname)
 for k in xrange(int(min_k),int(max_k + 1), int(k_step)):
     print "k = ", k
     seeds_fname = "%s-seeds-%d.cp"%(output, k)
     for i in xrange(nSamples):
         print "sample #",i
         generateSeedFiles(k, k+1, 1, range(n), 1, output + "-seeds-")
         seeds=cp.load(open(seeds_fname,'r'))
         print "Running Vanilla with %d samples"%(nBFS_samples)
         true_value, num_cycles_full = runVanilla(dataset, seeds_fname, nBFS_samples, parameters.cores)            
         print "Done. Number of cycles: %d"%num_cycles_full
         # Per-BFS cycle cost with the link-server loading time subtracted out.
         nCycles_per_bfs = 1.*(num_cycles_full - nCycles_link_server) / nBFS_samples
         print "Number of cycles per sample", nCycles_per_bfs
         print "Running approximation algorithm"
         approx_estimate, num_cycles_approx = runApproxHeuristic(dataset, seeds_fname, tau_scale, parameters.cores,\
                                              parameters.init_samples, parameters.iter_samples, nCycles_link_server)
         print "Number of cycles without link-server loading: ", num_cycles_approx
         print "Done approximating, now running naive sequential algorithm"
         
         # Give Vanilla the same cycle budget the approximation consumed.
         nVanilla_samples = int(ceil(1. * num_cycles_approx / nCycles_per_bfs))
         print "Running Vanilla for %s samples"%nVanilla_samples
         seq_estimate, num_cycles_seq = runVanilla(dataset, seeds_fname, nVanilla_samples, parameters.cores)
# --- Example #8 (scraped-snippet delimiter; extraction artifact) ---
                     create_new=True,
                     prob_method=parameters.prob_method,
                     prob=edge_prob,
                     delim=delimiter,
                     undirected=parameters.undirected)
    # NOTE(review): fragment -- the lines above are the tail of a LinkServerCP
    # call whose opening was lost; the enclosing function header is not visible.
    print "n = ", L.getNumNodes()
    V = LoadNodesFromFile(edges_csv, delimiter)
    n = len(V)

    # Seed-set sizes swept from k_min to k_max (inclusive) in steps of k_step.
    k_min = int(n * min_frac)
    k_max = int(n * max_frac) + 1
    k_step = int(n * interval)
    print "max_k = ", max_frac
    print "Minimum k value: %d, maximum k value: %d" % (k_min, k_max)
    removeFile(dataset)
    generateSeedFiles(k_min, k_max, k_step, V, nSamples,
                      'experiments/' + output + "-seeds-")
    results_fname = "experiments/results/" + output + '-approximations-nSamples-%d-k_frac-%.3f-%.3f' % (
        nSamples, min_frac, max_frac)
    removeFile(results_fname)
    mean_errors, std_errors = [], []

    for k in xrange(k_min, k_max, k_step):
        approx_fname = 'experiments/results/' + output + \
          "-approx_errors-k_min-%d-k_max-%d-k-%d-samples-%d"%(k_min,k_max,k,nSamples)
        seq_fname = 'experiments/results/' + output + "-seq-approx-errors-k-%d-samples-%d" % (
            k, nSamples)
        seeds_fname = 'experiments/' + output + "-seeds-" + str(k) + ".cp"

        removeFile(seq_fname)
        print "Running sequential algorithm for k=%d" % k
        # NOTE(review): this command string is truncated mid-literal (trailing
        # backslash) -- the remainder of the statement was lost in extraction.
        subprocess.Popen("python seq_estimation.py -dataset %s -seeds_file %s -cores 40 -results_file %s -output_mode 2 -min_samples 500\
# --- Example #9 (scraped-snippet delimiter; extraction artifact) ---
    # NOTE(review): fragment -- enclosing function header not visible; this is
    # a reformatted variant of the scale-sweep routine seen earlier in the file.
    L = LinkServerCP(dataset,
                     edges_csv,
                     create_new=True,
                     prob_method=parameters.prob_method,
                     prob=edge_prob,
                     delim=delimiter)
    print "n = ", L.getNumNodes()
    V = LoadNodesFromFile(edges_csv, delimiter)
    n = len(V)
    k = int(n * k_frac)
    print "n=", n
    print "k=", k
    seeds_fname = output + "-seeds-" + str(k) + ".cp"
    removeFile(seeds_fname)
    removeFile(dataset)
    generateSeedFiles(k, k + 1, 1, V, nSamples, output + "-seeds-")

    # One approx-result file per scale factor in [min_frac, max_frac).
    approx_fnames = [
        'experiments/results/' + output +
        "-approx-k-%d-samples-%d-scale-%.5f" % (k, nSamples, scale)
        for scale in drange(min_frac, max_frac, interval)
    ]
    seq_fname = 'experiments/results/' + output + "-seq-k-%d-samples-%d-eps-%.5f" % (
        k, nSamples, eps)

    for i, scale in enumerate(drange(min_frac, max_frac, interval)):
        removeFile(approx_fnames[i])
        print "Running approx algorithm for scale factor: ", scale
        # Blocking subprocess call; stdout.read() waits for completion.
        subprocess.Popen("python ic_bfs_eval.py -dataset %s -scale %.5f -res_fname %s -seeds %s -output_mode 0"%\
                         (dataset, scale, approx_fnames[i], seeds_fname), \
                         shell = True, stdout = subprocess.PIPE).stdout.read()
        # NOTE(review): fragment -- this block sits at deeper indentation than
        # its neighbors; the enclosing loop/function is outside this view.
        L=LinkServerCP('input/datasets/' + dataset, csv_fname, create_new=True, prob_method=parameters.prob_method, prob=[0.1,0.01], delim='\t', undirected = 1)
        # record loading time of link-server -- for interpolation
        removeFile(perf_output_fname)
        subprocess.Popen("perf stat -x, -o %s python load_link_server.py -cp %s"%\
                         (perf_output_fname, "input/datasets/" + dataset), shell = True, stdout = subprocess.PIPE).stdout.read()
        
        nCycles_link_server = getNumCycles(perf_output_fname)
        removeFile(perf_output_fname)
        
        #removeFile(csv_fname)
        V = xrange(n)
        
        for nSeeds in xrange(int(min_frac * n), int(max_frac * n), int(interval * n)):
            for i in xrange(nSamples):
                seeds_fname = output + "-seeds-%d-%d.cp"%(i, nSeeds)
                generateSeedFiles(nSeeds, nSeeds+1, int(interval * n), V, 1, output  + "-seeds-%d-"%i)
                # Measure the approximation's cycle cost with `perf stat`.
                subprocess.Popen("perf stat -x, -o %s python ic_bfs_eval.py -dataset %s -cores %d -res_fname %s -seeds %s -output_mode 3 -undirected 1"%\
                                  (perf_output_fname, 'input/datasets/' + dataset, parameters.cores, output + "-approx-" + str(nSeeds), seeds_fname), \
                                  shell = True, stdout = subprocess.PIPE).stdout.read()
                cycles_approx = getNumCycles(perf_output_fname)

                # Extrapolated sequential cost per eps value.
                cycles_seq = {}
                print "Done approximating, now running naive sequential algorithm"
                for eps in eps_list:
                    print "Running sequential for eps = ", eps
                    removeFile(perf_output_fname)                    
                    subprocess.Popen("perf stat -x, -o %s python seq_estimation.py -dataset %s -cores %d -seeds_file %s -results_file %s -output_mode 3 -nSamples %d"%(perf_output_fname, 'input/datasets/' + dataset, parameters.cores, seeds_fname, output + "-seq-" + str(nSeeds), nBFS_samples), shell=True,stdout=subprocess.PIPE).stdout.read()
                    # Scale measured cycles (minus link-server load) to the
                    # n*log2(n) budget, then add the load cost back once.
                    cycles_seq[eps] = (n*log(n,2) / nBFS_samples) * (getNumCycles(perf_output_fname) - nCycles_link_server) + nCycles_link_server
                removeFile(perf_output_fname)
                # NOTE(review): this `i` shadows the sample-loop `i` above --
                # harmless at the end of the body, but fragile under edits.
                for i, eps in enumerate(eps_list):
                    f = open(running_times_files[i],'a')