Exemple #1
0
def setup(config):
    """
    Load training and testing officer data around a configured split date.

    The split date is parsed from config["fake_today"] using the "%d%b%Y"
    format. The training label window runs from prediction_interval days
    before the split date up to the split date; the testing label window
    runs from the split date to prediction_interval days after it.

    Args:
    config: dict read for the keys "fake_today", "prediction_interval",
        "features", "accidents" and "noinvest"

    Returns:
    dict holding the values returned by the two dataset.grab_officer_data
    calls ("train_x", "train_y", "train_id", "test_x", "test_y", "test_id",
    "names") together with "train_start_date" and "test_end_date"
    """
    split_date = datetime.datetime.strptime(config["fake_today"], "%d%b%Y")
    interval = datetime.timedelta(days=config["prediction_interval"])
    window_open = split_date - interval
    window_close = split_date + interval

    # Report both label windows; the split date closes one and opens the other.
    for template, moment in (
            ("Train label window start: {}", window_open),
            ("Train label window stop: {}", split_date),
            ("Test label window start: {}", split_date),
            ("Test label window stop: {}", window_close)):
        log.info(template.format(moment))

    log.info("Loading officers and features to use as training...")
    x_train, y_train, id_train, names = dataset.grab_officer_data(
        config["features"], window_open, split_date, window_open,
        config["accidents"], config["noinvest"])

    # The "noinvest" setting from the config is applied to the testing load
    # as well. "names" is rebound here, so the returned value is the one
    # from the testing load.
    log.info("Loading officers and features to use as testing...")
    x_test, y_test, id_test, names = dataset.grab_officer_data(
        config["features"], split_date, window_close, split_date,
        config["accidents"], config["noinvest"])

    experiment = {}
    experiment["train_x"] = x_train
    experiment["train_y"] = y_train
    experiment["train_id"] = id_train
    experiment["test_x"] = x_test
    experiment["test_y"] = y_test
    experiment["test_id"] = id_test
    experiment["names"] = names
    experiment["train_start_date"] = window_open
    experiment["test_end_date"] = window_close
    return experiment
Exemple #2
0
def setup(config, today):
    """
    Sets up officer-level experiment with scaled features

    Training covers the prediction_interval days before the split date and
    testing covers the prediction_interval days after it. A scaler is fit
    on the training features only and then applied to both feature sets.

    Args:
    config: dict with config file; read for the keys "prediction_interval",
        "features", "accidents" and "noinvest"
    today: string containing the date to split on for temporal
        cross-validation, in "%d%b%Y" format

    Returns:
    dict with the scaled "train_x"/"test_x", the labels and ids from
    dataset.grab_officer_data, "names", the window boundaries
    "train_start_date"/"test_end_date", the pre-scaling "train_x_index" and
    "test_x_index", and "features" (the training column values)
    """

    split_date = datetime.datetime.strptime(today, "%d%b%Y")
    interval = datetime.timedelta(days=config["prediction_interval"])
    window_open = split_date - interval
    window_close = split_date + interval

    # Report both label windows; the split date closes one and opens the other.
    for template, moment in (
            ("Train label window start: {}", window_open),
            ("Train label window stop: {}", split_date),
            ("Test label window start: {}", split_date),
            ("Test label window stop: {}", window_close)):
        log.info(template.format(moment))

    log.info("Loading officers and features to use as training...")
    raw_train, y_train, id_train, names = dataset.grab_officer_data(
        config["features"], window_open, split_date, window_open,
        config["accidents"], config["noinvest"])

    # Testing data should include ALL officers, ignoring "noinvest" keyword.
    # "names" is rebound here, so the returned value is the one from the
    # testing load.
    log.info("Loading officers and features to use as testing...")
    raw_test, y_test, id_test, names = dataset.grab_officer_data(
        config["features"], split_date, window_close, split_date,
        config["accidents"], True)

    # Capture index and column labels before scaling replaces the feature
    # objects with the scaler's output.
    index_train = raw_train.index
    index_test = raw_test.index
    feature_labels = raw_train.columns.values

    # Feature scaling: fit on training data only, apply to both sets
    fitted = preprocessing.StandardScaler().fit(raw_train)
    scaled_train = fitted.transform(raw_train)
    scaled_test = fitted.transform(raw_test)

    experiment = {}
    experiment["train_x"] = scaled_train
    experiment["train_y"] = y_train
    experiment["train_id"] = id_train
    experiment["test_x"] = scaled_test
    experiment["test_y"] = y_test  # For pilot test_y will not be used
    experiment["test_id"] = id_test
    experiment["names"] = names
    experiment["train_start_date"] = window_open
    experiment["test_end_date"] = window_close
    experiment["train_x_index"] = index_train
    experiment["test_x_index"] = index_test
    experiment["features"] = feature_labels
    return experiment
Exemple #3
0
def setup(config):
    """
    Load training and testing officer data plus a baseline for the test set.

    The split date is parsed from config["fake_today"] using the "%d%b%Y"
    format. The training label window runs from prediction_interval days
    before the split date up to the split date; the testing label window
    runs from the split date to prediction_interval days after it.

    Args:
    config: dict read for the keys "fake_today", "prediction_interval",
        "features", "accidents" and "noinvest"

    Returns:
    dict holding the values returned by the two dataset.grab_officer_data
    calls ("train_x", "train_y", "train_id", "test_x", "test_y", "test_id",
    "names"), the window boundaries "train_start_date"/"test_end_date", and
    "eis_baseline" as returned by compute_baseline
    """
    split_date = datetime.datetime.strptime(config["fake_today"], "%d%b%Y")
    interval = datetime.timedelta(days=config["prediction_interval"])
    window_open = split_date - interval
    window_close = split_date + interval

    # Report both label windows; the split date closes one and opens the other.
    for template, moment in (
            ("Train label window start: {}", window_open),
            ("Train label window stop: {}", split_date),
            ("Test label window start: {}", split_date),
            ("Test label window stop: {}", window_close)):
        log.info(template.format(moment))

    log.info("Loading officers and features to use as training...")
    x_train, y_train, id_train, names = dataset.grab_officer_data(
        config["features"], window_open, split_date, window_open,
        config["accidents"], config["noinvest"])

    # Testing data should include ALL officers, ignoring "noinvest" keyword.
    # "names" is rebound here, so the returned value is the one from the
    # testing load.
    log.info("Loading officers and features to use as testing...")
    x_test, y_test, id_test, names = dataset.grab_officer_data(
        config["features"], split_date, window_close, split_date,
        config["accidents"], True)

    # Baseline is derived from the test ids over the testing window and then
    # combined with the test labels by compute_baseline.
    baseline_raw = dataset.get_baseline(id_test, split_date, window_close)
    baseline = compute_baseline(baseline_raw, id_test, y_test)

    experiment = {}
    experiment["train_x"] = x_train
    experiment["train_y"] = y_train
    experiment["train_id"] = id_train
    experiment["test_x"] = x_test
    experiment["test_y"] = y_test
    experiment["test_id"] = id_test
    experiment["names"] = names
    experiment["train_start_date"] = window_open
    experiment["test_end_date"] = window_close
    experiment["eis_baseline"] = baseline
    return experiment