Example #1
0
def main():
    """Parse arguments, run experiments, collect results and stats, write to file.

    The betweenness estimator from ``vc_sample`` is run ``runs`` times on the
    input graph. Each run yields a pair that is used here as
    ``(run_stats, betweenness)``: element 0 is a dict of per-run statistics and
    element 1 is indexed by vertex index.

    From those runs this function computes, for every per-run statistic, the
    min / max / average / sample standard deviation, and for every vertex the
    min / max / average betweenness. A CSV summary is printed to standard
    output and the tuple ``(stats, results)`` is pickled to the output file.

    Exits with status 2 if the output file cannot be written.
    """
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.description = "TODO"
    parser.add_argument("epsilon", type=util.valid_interval_float,
            help="accuracy parameter")
    parser.add_argument("delta", type=util.valid_interval_float,
            help="confidence parameter")
    # NOTE(review): argparse ignores `default` on a required positional, so
    # "runs" must always be given on the command line. Making it optional
    # (nargs="?") would change how positionals interleave with options, so it
    # is deliberately left as is.
    parser.add_argument("runs", type=util.positive_int, default=20, help="number of runs")
    parser.add_argument("graph", help="graph file")
    parser.add_argument("output", help="output file")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-a", "--approximate", action="store_true",
            default=True, help="use approximate diameter (default)")
    group.add_argument("-d", "--diameter", type=util.positive_int, default=0,
            help="value to use for the diameter")
    group.add_argument("-e", "--exact", action="store_true", default=False,
            help="use exact diameter")
    parser.add_argument("-m", "--maxconn", action="store_true", default=False,
            help="if the graph is not weakly connected, only save the largest connected component")
    parser.add_argument("-p", "--pickle", action="store_true", default=False,
            help="use pickle reader for input file")
    parser.add_argument("-s", "--samplesize", type=util.positive_int,
            default=0, help="use specified sample size. Overrides epsilon, delta, and diameter computation")
    parser.add_argument("-t", "--timeout", type=util.positive_int, default=3600,
            help="Timeout computation after specified number of seconds (default 3600 = 1h, 0 = no timeout)")
    parser.add_argument("-u", "--undirected", action="store_true", default=False,
            help="consider the graph as undirected ")
    parser.add_argument("-v", "--verbose", action="count", default=0,
            help="increase verbosity (use multiple times for more verbosity)")
    parser.add_argument("-w", "--weightFile", default="-",
            help="random weights within the interval 0 to 1, must have as many entries as the number of edges")

    args = parser.parse_args()

    # Set the desired level of logging
    util.set_verbosity(args.verbose)

    # Read graph
    if args.pickle:
        G = util.read_graph(args.graph)
    else:
        G = converter.convert(args.graph, not args.undirected, args.maxconn)

    # "--approximate" defaults to True, so "--exact" has to switch it off.
    if args.exact:
        args.approximate = False

    # Read the weights ("-" means no weight file was given)
    weights_list = []
    if args.weightFile != "-":
        with open(args.weightFile, 'r') as weight_file:
            # Blank lines (typically a trailing one at end of file) are
            # skipped instead of crashing on float("").
            weights_list = [float(line) for line in weight_file if line.strip()]

    # The fifth argument of vc_sample.betweenness is either an explicit
    # diameter value or the boolean "use approximate diameter" flag. It does
    # not change between runs, so it is chosen once here.
    diameter_arg = args.diameter if args.diameter > 0 else args.approximate

    # Perform experiment multiple times
    results = []
    for i in range(args.runs):
        logging.info("Run #%d", i)
        # Compute betweenness
        if args.samplesize:
            results.append(vc_sample.betweenness_sample_size(G,
                args.samplesize, False, args.timeout))
        else:
            results.append(vc_sample.betweenness(G, args.epsilon, args.delta,
                    weights_list, diameter_arg, False, args.timeout))

    # Compute aggregate statistics about the experiments
    stats = dict()
    stats["graph"] = os.path.basename(args.graph)
    stats["vertices"] = G.vcount()
    stats["edges"] = G.ecount()
    stats["runs"] = args.runs
    if args.samplesize:
        # NOTE(review): in this mode "epsilon" and "delta" are not stored in
        # stats although the CSV key list below names them; what happens then
        # depends on util.dict_to_csv.
        stats["sample_size"] = args.samplesize
    else:
        stats["delta"] = args.delta
        stats["epsilon"] = args.epsilon
        # Taken from the first run's statistics.
        stats["sample_size"] = results[0][0]["sample_size"]

    stats_names = ["time", "forward_touched_edges", "backward_touched_edges"]
    if not args.samplesize:
        stats_names.append("diameter")
        stats_names.append("diameter_touched_edges")
    for stat_name in stats_names:
        values = sorted(run[0][stat_name] for run in results)
        average = sum(values) / args.runs
        stats[stat_name + "_max"] = values[-1]
        stats[stat_name + "_min"] = values[0]
        stats[stat_name + "_avg"] = average
        if args.runs > 1:
            # Sample standard deviation (divides by runs - 1).
            stats[stat_name + "_stddev"] = math.sqrt(
                sum(math.pow(value - average, 2) for value in values)
                / (args.runs - 1))
        else:
            stats[stat_name + "_stddev"] = 0.0

    # Per-vertex min / max / average betweenness across the runs
    stats["betw_min"] = [0.0] * G.vcount()
    stats["betw_max"] = [0.0] * G.vcount()
    stats["betw_avg"] = [0.0] * G.vcount()
    for i in range(G.vcount()):
        betws = sorted(run[1][i] for run in results)
        stats["betw_min"][i] = betws[0]
        stats["betw_max"][i] = betws[-1]
        stats["betw_avg"][i] = sum(betws) / args.runs

    # Build the CSV column list: the fixed columns followed by the aggregate
    # columns of every statistic. Each aggregate appears exactly once (the
    # "_min" column used to be listed twice per statistic).
    csvkeys = "graph, runs, epsilon, delta, sample_size"
    csvkeys_names = ["{0}_avg, {0}_min, {0}_stddev, {0}_max".format(stat_name)
            for stat_name in stats_names]
    csvkeys = ",".join([csvkeys] + csvkeys_names)
    print(csvkeys)
    print(util.dict_to_csv(stats, csvkeys))

    # Write stats and results to output file; the "with" block closes it.
    try:
        with open(args.output, "wb") as output:
            logging.info("Writing stats and results to %s", args.output)
            pickle.dump((stats, results), output)
    except OSError as E:
        logging.critical("Cannot write stats and results to %s: %s",
                args.output, E.strerror)
        sys.exit(2)
Example #2
0
def _add_error_stats(stats, exact_betw, approx_betw, vertex_count):
    """Store absolute-error statistics of an estimate into ``stats`` in place.

    Args:
        stats: dict that receives the keys ``err_avg``, ``err_max``,
            ``err_min``, ``err_stddev``, ``euc_dist`` and ``wrong_eps``.
        exact_betw: sequence of exact betweenness values.
        approx_betw: sequence of estimated values, paired positionally with
            ``exact_betw``.
        vertex_count: number of vertices, used as the divisor for the average
            and (minus one) for the standard deviation.
    """
    # Sorting the errors is not a problem because only aggregates are computed.
    errs = sorted(abs(a - b) for a, b in zip(exact_betw, approx_betw))
    err_avg = sum(errs) / vertex_count
    stats["err_avg"] = err_avg
    stats["err_max"] = errs[-1]
    # Smallest non-zero error; 0.0 if the estimate is exact everywhere
    # (previously that case raised IndexError).
    stats["err_min"] = next((err for err in errs if err != 0), 0.0)
    if vertex_count > 1:
        stats["err_stddev"] = math.sqrt(
            sum(math.pow(err - err_avg, 2) for err in errs) / (vertex_count - 1))
    else:
        # A single vertex has no spread; avoids dividing by zero.
        stats["err_stddev"] = 0.0
    stats["euc_dist"] = math.sqrt(
        sum(math.pow(a - b, 2) for a, b in zip(exact_betw, approx_betw)))
    # NOTE(review): the per-vertex check that counted errors above
    # epsilon * n * (n - 1) / 2 is disabled, so this is always reported as 0.
    stats["wrong_eps"] = 0


def main():
    """Parse arguments, do the comparison, write to output.

    Exact betweenness and three estimates (labelled ``vc``, ``bp`` and ``gss``
    in the output) are either computed on the input graph or, with
    ``--resultfiles``, loaded from four previously saved result files. For each
    estimate the absolute error against the exact values is summarised, and
    all statistics are printed to standard output as CSV. With ``--write`` the
    graph is also written back to a file.
    """
    parser = argparse.ArgumentParser()
    parser.description = "compare estimation of betweenness centralities to exact values"
    parser.add_argument("epsilon", type=util.valid_interval_float, help="accuracy parameter")
    parser.add_argument("delta", type=util.valid_interval_float, help="confidence parameter")
    parser.add_argument("graph", help="graph file")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-a", "--approximate", action="store_true",
            default=True, help="use approximate diameter when computing approximation of betweenness using VC-Dimension (default)")
    group.add_argument("-d", "--diameter", type=util.positive_int, default=0,
            help="value to use for the diameter")
    group.add_argument("-e", "--exact", action="store_true", default=False,
            help="use exact diameter when computing approximation of betweenness using VC-Dimension")
    parser.add_argument("-m", "--maxconn", action="store_true", default=False,
            help="if the graph is not weakly connected, only save the largest connected component")
    parser.add_argument("-p", "--pickle", action="store_true", default=False,
            help="use pickle reader for input file")
    parser.add_argument("-r", "--resultfiles", nargs=4, 
    help="Use results files rather than recomputing betweenness. Files should be specified as 'exact_res vc_res bp_res gss_res'")
    parser.add_argument("-s", "--samplesize", type=util.positive_int,
            default=0, help="use specified sample size. Overrides epsilon, delta, and diameter computation")
    parser.add_argument("-t", "--timeout", type=util.positive_int, default=3600,
            help="Timeout computation after specified number of seconds (default 3600 = 1h, 0 = no timeout)")
    parser.add_argument("-u", "--undirected", action="store_true", default=False,
            help="consider the graph as undirected ")
    parser.add_argument("-v", "--verbose", action="count", default=0,
            help="increase verbosity (use multiple times for more verbosity)")
    parser.add_argument("-w", "--write", nargs="?", default=False, const="auto",
            help="write graph (and computed attributes) to file.")
    args = parser.parse_args()

    # Set the desired level of logging
    util.set_verbosity(args.verbose)

    # Seed the random number generator
    random.seed()

    # Read graph
    if args.pickle:
        G = util.read_graph(args.graph)
    else:
        G = converter.convert(args.graph, not args.undirected, args.maxconn)

    # "--approximate" defaults to True, so "--exact" has to switch it off.
    if args.exact:
        args.approximate = False

    if not args.resultfiles:
        (exact_stats, exact_betw) = brandes_exact.betweenness(G, args.write,
                args.timeout)
        if args.samplesize:
            (vc_stats, vc_betw) = vc_sample.betweenness_sample_size(G,
                    args.samplesize, args.write, args.timeout)
            (bp_stats, bp_betw) = brandespich_sample.betweenness_sample_size(G,
                    args.samplesize, args.write, args.timeout)
            (gss_stats, gss_betw) = geisbergerss_sample.betweenness_sample_size(G,
                    args.samplesize, args.write, args.timeout)
        else:
            # The fourth argument is either an explicit diameter value or the
            # boolean "use approximate diameter" flag.
            diameter_arg = args.diameter if args.diameter > 0 else args.approximate
            (vc_stats, vc_betw) = vc_sample.betweenness(G, args.epsilon, args.delta,
                    diameter_arg, args.write, args.timeout)
            (bp_stats, bp_betw) = brandespich_sample.betweenness(G,
                    args.epsilon, args.delta, args.write, args.timeout)
            (gss_stats, gss_betw) = geisbergerss_sample.betweenness(G,
                    args.epsilon, args.delta, args.write, args.timeout)
    else:
        # argparse stores "--resultfiles" under the attribute "resultfiles";
        # the files are given in the order exact, vc, bp, gss.
        (exact_file, vc_file, bp_file, gss_file) = args.resultfiles
        (exact_stats, exact_betw) = util.read_stats_betw(exact_file)
        (vc_stats, vc_betw) = util.read_stats_betw(vc_file)
        (bp_stats, bp_betw) = util.read_stats_betw(bp_file)
        (gss_stats, gss_betw) = util.read_stats_betw(gss_file)

    # Compute useful graph statistics (mainly diameter)
    if "diam" not in G.attributes():
        diameter.diameter(G)

    # If specified, write betweenness as vertex attributes, and time and
    # diameter as graph attributes back to file
    if args.write:
        logging.info("Writing betweenness as vertex attributes and stats as graph attribute")
        if args.write == "auto":
            # NOTE(review): the directed suffix is "dir" without the leading
            # hyphen that "-undir" has; kept unchanged so existing output
            # file names stay the same.
            filename = os.path.splitext(args.graph)[0] + ("-undir" if args.undirected else "dir") + ".picklez"
            G.write(filename)
        else:
            G.write(args.write)

    # Compute error statistics of every estimate against the exact values
    logging.info("Computing error statistics")
    vertex_count = G.vcount()
    for (estimate_stats, estimate_betw) in ((vc_stats, vc_betw),
            (bp_stats, bp_betw), (gss_stats, gss_betw)):
        _add_error_stats(estimate_stats, exact_betw, estimate_betw, vertex_count)

    # Print statistics to output as CSV
    logging.info("Printing statistics")
    print("graph,nodes,edges,diam,directed,epsilon,delta,sample_size")
    print("{},{},{},{},{},{},{},{}".format(G["filename"], G.vcount(),
        G.ecount(), G["diam"], G.is_directed(), args.epsilon, args.delta,
        args.samplesize))

    csvkeys="epsilon,delta,sample_size,time,wrong_eps,err_avg,err_stddev,forward_touched_edges,backward_touched_edges,diameter_touched_edges,euc_dist,diameter,diam_type"
    print("type,", csvkeys)
    print("vc,", util.dict_to_csv(vc_stats,csvkeys))
    print("bp,", util.dict_to_csv(bp_stats,csvkeys))
    print("gss,", util.dict_to_csv(gss_stats,csvkeys))
    print("exact,", util.dict_to_csv(exact_stats,csvkeys))