Example #1

from joblib import Parallel, delayed
from tqdm import tqdm

# Config, json_paths_iter, compute_metrics, and write_dicts_to_csv are
# project-local helpers that are not shown in these examples.

def compute_subgraph_metrics(dataset, n_jobs, limit):
    print("--- Subgraph Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("---------------------------")

    # paths
    conf = Config(dataset)

    output_fpath = f"{conf.data_root}/subgraph_metrics.csv"

    json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir,
                                  limit=limit)

    # compute metrics
    print("Computing metrics ...")

    if n_jobs == 1:
        # serial path: process conversations one by one with a progress bar
        metrics = [compute_metrics(json_fpath) for json_fpath in tqdm(json_fpaths)]
    else:
        # parallel path: fan the per-conversation work out over n_jobs workers
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(compute_metrics)(json_fpath) for json_fpath in json_fpaths
        )

    print("Output:", len(metrics))

    # output to csv
    print("Outputting tree metrics to CSV ...")
    write_dicts_to_csv(metrics, output_fpath)

    print("Done!")
Example #2

from joblib import Parallel, delayed

# compute_user_conversation_stats, agg_user_stats, Config, json_paths_iter,
# and write_dicts_to_csv are, again, project-local helpers.

def compute_user_metrics(dataset, n_jobs=1, limit=None):
    print("--- User Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    # score above which a comment is treated as toxic (project-specific cutoff)
    toxicity_threshold = 0.531

    conf = Config(dataset)

    json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir,
                                  limit=limit)

    # all_user_conv_stats = [
    #     compute_user_conversation_stats(json_fpath, toxicity_threshold) \
    #     for json_fpath in json_fpaths]

    parallel = Parallel(n_jobs=n_jobs, verbose=10)
    all_user_conv_stats = parallel(
        delayed(compute_user_conversation_stats)(json_fpath, toxicity_threshold)
        for json_fpath in json_fpaths
    )

    print("Aggregating user metrics ...")
    user_stats = agg_user_stats(all_user_conv_stats)

    user_stats_csv = [{"user_id": u_id, **u_stats}
                      for u_id, u_stats in user_stats.items()]

    # out_json_fpath = f"{conf.data_root}/user_metrics.json.gz"
    # json.dump(user_stats, gzip.open(out_json_fpath, "wt"), indent=2)

    out_csv_fpath = f"{conf.data_root}/user_metrics.csv"
    write_dicts_to_csv(user_stats_csv, out_csv_fpath)

    print("Done!")
Example #3

from joblib import Parallel, delayed
from tqdm import tqdm

# Config, json_paths_iter, toxicity_metrics, and write_dicts_to_csv are
# project-local helpers that are not shown here.

def compute_toxicity_metrics(dataset, n_jobs=1, limit=None):
    print("--- Toxicity Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    # paths
    conf = Config(dataset)

    output_fpath = f"{conf.data_root}/toxicity.csv"

    # iterator
    json_fpaths = json_paths_iter(
        conf.conversations_no_embs_jsons_dir,
        limit=limit
    )

    # compute metrics
    print("Computing metrics ...")
    
    if n_jobs == 1:
        metrics = [toxicity_metrics(json_fpath) \
            for json_fpath in tqdm(json_fpaths)]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(toxicity_metrics)(json_fpath) \
                for json_fpath in json_fpaths
            )

    print("Metrics computed:", len(metrics))    

    print("Outputting metrics to CSV ...")
    write_dicts_to_csv(metrics, output_fpath)

    print("Done!")