# NOTE: I/O seems to be a bottleneck intensities = p.starmap(get_intensity_exp_hawkes, zip([simu_hawkes] * args.n_seqs, event_seqs)) # computing the optimal acc for predicting the event types tmp = [] for i in range(len(event_seqs)): for j in range(len(event_seqs[i])): tmp.append((event_seqs[i][j][1], np.argmax(intensities[i][j]))) print("Optimal acc = {:4f}".format(sum(x == y for x, y in tmp) / len(tmp))) dataset = f"{args.name}-{args.n_seqs // 1000}K-{args.n_types}" output_path = f"data/input/{dataset}" makedirs([output_path]) export_json(vars(args), osp.join(output_path, "config.json")) train_test_splits = list( KFold(args.n_splits, shuffle=True, random_state=args.rand_seed).split(range(len(event_seqs)))) with open(osp.join(output_path, "statistics.txt"), "w") as f: report = get_event_seqs_report(event_seqs, args.n_types) print(report) f.writelines(report) np.savez_compressed( osp.join(output_path, "data.npz"), event_seqs=event_seqs, train_test_splits=train_test_splits, intensities=intensities,
# NOTE(review): this chunk continues an if/elif chain whose opening `if`
# branch (and the enclosing loop providing `line`, `records`, `record`) lies
# above this view; indentation below is reconstructed.  Each `line` appears
# to be a pre-split (tag, value) pair in a MemeTracker-style raw dump —
# TODO confirm the input format against the reading loop.
        records.append(record)
        record = {}
    elif line[0] == "P":
        # "P": page/post URL of the current record
        record["url"] = line[1]
    elif line[0] == "T":
        # "T": timestamp; "ds" keeps the first 10 characters (presumably a
        # YYYY-MM-DD date prefix — verify) used below as the partition column
        record["ts"] = line[1]
        record["ds"] = line[1][:10]
    elif line[0] == "Q":
        # "Q": quoted phrase — a record may carry several, so accumulate
        if "phrases" not in record:
            record["phrases"] = []
        record["phrases"].append(line[1])
    elif line[0] == "L":
        # "L": outgoing hyperlink — likewise accumulated into a list
        if "links" not in record:
            record["links"] = []
        record["links"].append(line[1])

with Timer("Convert to DataFrame"):
    df = pd.DataFrame(records)

with Timer("Exporting"):
    # Partitioned parquet dataset keyed by the date string "ds";
    # written under <input_path>/<name>/ with the fastparquet engine.
    df.to_parquet(
        osp.join(args.input_path, args.name),
        engine="fastparquet",
        partition_cols=["ds"],
        index=False,
    )

# Save the run configuration alongside the exported dataset for provenance.
export_json(vars(args), osp.join(args.input_path, args.name, "config.json"))