예제 #1
0
def gen_forecast():
    """Build the learning input, score it with the pickled model and write a CSV.

    Steps, all driven by module-level configuration (``hour``,
    ``model_file_name``, ``DAT_BEGIN_VAR``, ``roi_items``):

    1. ``create_learning_input`` combines ``data/states.csv`` and
       ``data/diffs.csv`` (relative to this file) into
       ``data/learn.<hour>hr.csv``.
    2. That same file is read back and the feature matrix is taken from the
       column named ``DAT_BEGIN_VAR`` onwards (or, when that column is absent,
       from the column following the last entry of ``roi_items``).
    3. The model unpickled from ``model_file_name`` produces class
       probabilities via ``predict_proba``.
    4. One line per row is written to ``data/roi_output.pred.<hour>hr.csv``
       containing roi name, actual type, the second probability column,
       latitude and longitude.

    Returns:
        None. The result is the prediction CSV written to disk.
    """
    curdir = os.path.dirname(__file__)
    data_dir = os.path.join(curdir, "data")

    # create random forest training input file
    state_file_name = os.path.join(data_dir, "states.csv")
    diff_file_name = os.path.join(data_dir, "diffs.csv")
    learn_file = os.path.join(data_dir, "learn.%dhr.csv" % hour)
    create_learning_input(state_file_name, diff_file_name, learn_file)

    # test the model: the file that was just generated is the one scored
    testing_file = learn_file
    roi_output_file = os.path.join(data_dir, "roi_output.pred.%dhr.csv" % hour)

    df = pd.read_csv(testing_file, header=0)
    cols = list(df.columns.values)

    # NOTE(security): unpickling executes arbitrary code; only load model
    # files that come from a trusted source.
    # The file handle is only needed for the load itself, so release it
    # before doing the (potentially slow) prediction work.
    with open(model_file_name, 'rb') as f:
        rf = cPickle.load(f)

    if DAT_BEGIN_VAR in cols:
        data_begin_idx = cols.index(DAT_BEGIN_VAR)
    else:
        data_begin_idx = cols.index(roi_items[-1]) + 1

    roi_type_idx = cols.index("type")
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0;
    # both return the same ndarray.
    mat = df.values

    x = mat[:, data_begin_idx:].astype(np.float32)
    y = mat[:, roi_type_idx].astype(np.int32)
    # Latitude / longitude are read by fixed position (3rd and 4th columns).
    lats = mat[:, 2].astype(np.float32)
    lons = mat[:, 3].astype(np.float32)
    logger.info("Testing model, shape (%d, %d)" % (x.shape[0], x.shape[1]))

    preds = rf.predict_proba(x)
    roi_names = df["roi_id"]

    # `with` guarantees the output file is closed even if a write fails.
    with open(roi_output_file, 'w') as roi_ofile:
        roi_ofile.write("roi_name, actual, prediction, lat, lon\n")
        for name, actual, proba, lat, lon in zip(roi_names, y, preds, lats, lons):
            # proba[1] is the second predict_proba column (presumably the
            # positive class -- confirm against the model's classes_).
            roi_ofile.write(
                "%s, %d, %f, %f, %f\n" % (name, actual, proba[1], lat, lon))
예제 #2
0
def save_learning_files():
    """Derive and save the state and diff files for the train and test sets.

    For each of ``train_file`` and ``test_file`` (module-level paths) the
    state frame and then the diff frame are loaded and handed, together with
    their column lists, to the matching saver. A log line is emitted after
    each file is saved.
    """
    # (loader, saver, source path, destination path, log message), processed
    # strictly in this order.
    jobs = (
        (load_state_df, save_states, train_file, train_state_file,
         "Saved train state file"),
        (load_diff_df, save_diffs, train_file, train_diff_file,
         "Saved train diff file"),
        (load_state_df, save_states, test_file, test_state_file,
         "Saved test state file"),
        (load_diff_df, save_diffs, test_file, test_diff_file,
         "Saved test diff file"),
    )

    for loader, saver, source, destination, done_msg in jobs:
        frame, columns = loader(source)
        saver(frame, columns, destination)
        logger.info(done_msg)
예제 #3
0
def clean(file_name, ofilename, max_waterbody_count=-1):
    """Filter and repair a CSV file and write the result to ``ofilename``.

    Args:
        file_name: path of the input CSV; its first line is the header.
        ofilename: path the cleaned CSV is written to (without the index).
        max_waterbody_count: when greater than zero, only rows whose
            ``WATERBODY_count`` is strictly below this value are written;
            otherwise no such restriction is applied.
    """
    logger.info('processing %s' % file_name)
    raw = pd.read_csv(file_name, header=0)

    logger.info('checking data, %d rows' % len(raw))
    keep = good_data(raw)

    # Work on a copy so the repair step never touches the frame read above.
    result = raw.loc[keep].copy()
    logger.info('replacing bad data, %d good rows' % len(result))
    replace_data(result)  # return value unused; presumably edits in place

    if max_waterbody_count > 0:
        logger.info('restricting rows by WATERBODY_count')
        result = result[result["WATERBODY_count"] < max_waterbody_count]

    result.to_csv(ofilename, index=False)
예제 #4
0
def save_learning_files():
    """Clean the input file, then save its state and diff files.

    ``clean_output`` turns ``ifile`` into ``ofilename``; the state frame and
    then the diff frame are loaded from ``ofilename`` and written to
    ``state_file`` and ``diff_file`` respectively. All paths are
    module-level names.
    """
    logger.info("cleaning output file %s" % ifile)
    clean_output(ifile, ofilename)

    # (loader, saver, destination path, log message), processed in order.
    jobs = (
        (load_state_df, save_states, state_file, "Saved state file"),
        (load_diff_df, save_diffs, diff_file, "Saved diff file"),
    )

    for loader, saver, destination, done_msg in jobs:
        frame, columns = loader(ofilename)
        saver(frame, columns, destination)
        logger.info(done_msg)
예제 #5
0
def split_test_train(create_combo=False, clean=False, mode=SHUFFLED):
    """Split the cleaned output CSV into a test file and a train file.

    Reads ``ofilename`` and writes ``test_file`` and ``train_file`` (all
    module-level paths), using the module-level ``split_factor``.

    Args:
        create_combo: when true, first concatenate every file in ``ifiles``
            into ``cfile``. Lines containing ``"roi_id"`` are treated as
            header lines and only the first one encountered is kept.
        clean: when true, run ``clean_output(cfile, ofilename)`` before
            splitting.
        mode: ``SHUFFLED`` splits by ROI: the distinct ``roi_id`` values are
            shuffled, the first ``split_factor`` fraction goes to the test
            set and all remaining ones to the train set. ``TEMPORAL`` splits
            by time: rows whose ``starttime`` is at or before the value found
            at the ``1 - split_factor`` position of the sorted start times go
            to train, later rows go to test.

    Raises:
        ValueError: if ``mode`` is neither ``SHUFFLED`` nor ``TEMPORAL``.
            (``ValueError`` is an ``Exception`` subclass, so existing
            ``except Exception`` handlers keep working.)
    """
    if create_combo:
        logger.info("creating july/aug combined output file %s" % cfile)
        header_written = False
        with open(cfile, 'w') as of:
            for f in ifiles:
                logger.info("processing %s" % f)
                with open(f, 'r') as fin:
                    for line in fin:
                        if "roi_id" in line:
                            # Header line: keep only the very first one so the
                            # combined file has a single header.
                            if header_written:
                                continue
                            header_written = True
                        of.write(line)

    if clean:
        logger.info("cleaning output file %s" % cfile)
        clean_output(cfile, ofilename)

    logger.info("Shuffling data for test and train")
    df = pd.read_csv(ofilename, header=0)

    if mode == SHUFFLED:
        # Materialise the keys as a list: on Python 3 ``dict.keys()`` is a
        # view, which np.random.shuffle cannot shuffle in place.
        roi_ids = list(df.groupby("roi_id").indices.keys())
        np.random.shuffle(roi_ids)
        split_idx = int(split_factor * len(roi_ids))
        test_roi_ids = roi_ids[:split_idx]
        # Slice to the end of the list; the previous ``[split_idx:-1]``
        # dropped the last roi_id from both the train and the test set.
        train_roi_ids = roi_ids[split_idx:]

        test_df = df.loc[df["roi_id"].isin(test_roi_ids)]
        train_df = df.loc[df["roi_id"].isin(train_roi_ids)]

    elif mode == TEMPORAL:
        starttimes = sorted(df["starttime"].values)
        split_time = starttimes[int(len(starttimes) * (1 - split_factor))]

        train_df = df[df["starttime"] <= split_time]
        test_df = df[df["starttime"] > split_time]

    else:
        raise ValueError("mode incorrect")

    test_df.to_csv(test_file, index=False)
    logger.info("Saved test file")

    train_df.to_csv(train_file, index=False)
    logger.info("Saved train file")