コード例 #1
0
    # NOTE: I/O seems to be a bottleneck
    # Each call to get_intensity_exp_hawkes receives the same simulator
    # object paired with one event sequence (`p` is presumably a
    # multiprocessing pool created by the enclosing block -- confirm).
    intensities = p.starmap(get_intensity_exp_hawkes,
                            zip([simu_hawkes] * args.n_seqs, event_seqs))

    # Computing the optimal acc for predicting the event types: for every
    # event, pair the type stored at index 1 of the event with the type whose
    # intensity is the largest at that event.
    tmp = [
        (event[1], np.argmax(intensities[i][j]))
        for i, seq in enumerate(event_seqs)
        for j, event in enumerate(seq)
    ]
    n_correct = sum(x == y for x, y in tmp)
    # Report NaN rather than raising ZeroDivisionError when there is no event.
    optimal_acc = n_correct / len(tmp) if tmp else float("nan")
    # ".4f" gives four decimal places; the former "4f" only set a minimum
    # field width of 4 and kept the default six-digit precision.
    print("Optimal acc = {:.4f}".format(optimal_acc))

# Dataset identifier: <name>-<n_seqs // 1000>K-<n_types>; it also names the
# output directory under data/input/.
dataset = f"{args.name}-{args.n_seqs // 1000}K-{args.n_types}"
output_path = f"data/input/{dataset}"

# Create the output directory and store the run arguments next to the data.
makedirs([output_path])
config_file = osp.join(output_path, "config.json")
export_json(vars(args), config_file)

# Materialise the shuffled K-fold index splits over the sequence indices
# (KFold is presumably sklearn.model_selection.KFold -- confirm).
splitter = KFold(args.n_splits, shuffle=True, random_state=args.rand_seed)
sequence_indices = range(len(event_seqs))
train_test_splits = list(splitter.split(sequence_indices))

# Echo the dataset report to stdout and keep a copy in statistics.txt.
statistics_file = osp.join(output_path, "statistics.txt")
with open(statistics_file, "w") as f:
    report = get_event_seqs_report(event_seqs, args.n_types)
    print(report)
    f.writelines(report)

np.savez_compressed(
    osp.join(output_path, "data.npz"),
    event_seqs=event_seqs,
    train_test_splits=train_test_splits,
    intensities=intensities,
コード例 #2
0
                records.append(record)
                record = {}

            elif line[0] == "P":
                record["url"] = line[1]
            elif line[0] == "T":
                record["ts"] = line[1]
                record["ds"] = line[1][:10]
            elif line[0] == "Q":
                if "phrases" not in record:
                    record["phrases"] = []
                record["phrases"].append(line[1])
            elif line[0] == "L":
                if "links" not in record:
                    record["links"] = []
                record["links"].append(line[1])

    # Turn the list of parsed records into a DataFrame (one row per record).
    with Timer("Convert to DataFrame"):
        df = pd.DataFrame(records)

    # Write the frame as a parquet dataset partitioned on the "ds" column,
    # without the DataFrame index.
    with Timer("Exporting"):
        parquet_dir = osp.join(args.input_path, args.name)
        df.to_parquet(
            parquet_dir,
            index=False,
            partition_cols=["ds"],
            engine="fastparquet",
        )

# Store the run arguments as config.json inside the exported dataset directory.
config_json_path = osp.join(args.input_path, args.name, "config.json")
export_json(vars(args), config_json_path)