Example #1
def main():
    flexp.setup("experiments/", "exp01", False)

    flog.setup("debug", path=flexp.get_file_path("experiment.log.txt"))  # , disable_stderr=not cfg.SHORT_RUN)
    log.debug('Starting.')

    data = FlowData()
    data.id = "a"

    # debug level 2 - all details will be printed
    data_chain = PickleCache("cached_pkls", "id", chain=[TestModule(12, 14, 18)], debug_level=2)
    data_chain.process(data)

    # the hash of this chain and the previous one is the same
    data_chain = PickleCache("cached_pkls", "id", chain=[TestModule(12, 14, 20)], debug_level=1)
    data_chain.process(data)
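
TestModule itself is not shown in this listing. A minimal sketch of what such a chainable module could look like, assuming flow modules expose a process(data) method the same way PickleCache.process() is called above (the class body below is hypothetical):

class TestModule:
    """Toy module for the chains above (hypothetical sketch)."""

    def __init__(self, a, b, c):
        # Constructor parameters (e.g. 12, 14, 18 above) configure the module.
        self.a, self.b, self.c = a, b, c

    def process(self, data):
        # Store a value derived from the parameters on the flow data object.
        data.result = self.a + self.b + self.c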
Example #2
def main(exp):
    flexp.setup("./experiments",
                exp,
                with_date=False,
                loglevel=logging.INFO,
                log_filename="experiment.log.txt")

    # Load
    logging.info("Loading data")
    df_train = pd.read_csv("data/data_clean_train.csv")
    df_dev = pd.read_csv("data/data_clean_dev.csv")

    # Preprocess
    logging.info("Preprocessing")
    df_x_train, y_train = xy_split(df_train)
    df_x_dev, y_dev = xy_split(df_dev)

    feature_transformer = FeatureTransformer()
    x_train = feature_transformer.fit_transform(df_x_train)
    x_dev = feature_transformer.transform(df_x_dev)
    feature_names = feature_transformer.get_feature_names()

    imputer = SimpleImputer(strategy="median")
    x_train = imputer.fit_transform(x_train)
    x_dev = imputer.transform(x_dev)

    features_to_scale = ["city mpg__", "Year__", "Engine HP__"]
    scaler = FeatureScaler(StandardScaler(), feature_names, features_to_scale)
    x_train = scaler.fit_transform(x_train)
    x_dev = scaler.transform(x_dev)

    # Fit
    logging.info("Fitting")
    # model = RandomForestRegressor(n_estimators=10)
    model = Ridge(fit_intercept=False, normalize=False, alpha=1.)
    model.fit(x_train, y_train)

    # Eval
    logging.info("Evaluating")
    y_train_pred = model.predict(x_train)
    y_dev_pred = model.predict(x_dev)

    eval_rmse(y_train, y_train_pred, y_dev, y_dev_pred)
    # eval_feature_importance(model, feature_names)
    plot_histograms(x_train, feature_names)
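
FeatureScaler is a project helper rather than part of scikit-learn. A minimal sketch of a column-selective scaler, assuming x_train/x_dev are dense numpy arrays and feature_names is a list matched against features_to_scale:

import numpy as np


class FeatureScaler:
    """Apply a scikit-learn scaler to a named subset of columns (sketch)."""

    def __init__(self, scaler, feature_names, features_to_scale):
        self.scaler = scaler
        # Column indices of the features that should be scaled.
        self.columns = [feature_names.index(name) for name in features_to_scale]

    def fit_transform(self, x):
        x = np.asarray(x, dtype=float).copy()
        x[:, self.columns] = self.scaler.fit_transform(x[:, self.columns])
        return x

    def transform(self, x):
        x = np.asarray(x, dtype=float).copy()
        x[:, self.columns] = self.scaler.transform(x[:, self.columns])
        return x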
Example #3
def main():
    flexp.setup("./experiments", "tf-idf", with_date=True)

    data = FlowData()

    my_chain = CachingChain([
        PrintIdModule(),
        PickleCache("cached_pkl", "id", [TestModule(12, 14, 18)]),  # id updated by PickleChain hash
        PrintIdModule(),
        PickleCache("cached_pkl", "id", [TestModule(12, 16, 18)]),  # id updated by PickleChain hash
        PrintIdModule(),
        TestModule(12, 16, 20),  # id updated
        PrintIdModule(),
        PrintIdModule(),

    ], update_data_id='id')

    my_chain.process(data)
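
PrintIdModule is not shown in the listing either. Assuming the same process(data) protocol, a hypothetical sketch that just prints the current data.id so the hash updates along the chain can be observed:

class PrintIdModule:
    """Print the current data id to follow how the chain updates it (sketch)."""

    def process(self, data):
        # data.id is the attribute the CachingChain above keeps updating.
        print(data.id)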
Example #4
def test_override():
    expdir = path.join("tests/data/", "exp01")

    # Remove the experiment dir if it exists
    if os.path.exists(expdir):
        shutil.rmtree(expdir)

    # We have to reset the _eh to make flexp stop complaining about calling setup twice.
    flexp.core._eh = {}
    flexp.setup("tests/data/", "exp01", False, override_dir=False)

    assert path.isdir(expdir), "flexp didn't create experiment dir with override_dir=False"

    # Test that it fails to create the directory; there should already be a log file in it.
    with pytest.raises(FileExistsError):
        flexp.core._eh = {}
        flexp.setup("tests/data/", "exp01", False, override_dir=False)

    # This should be ok
    flexp.core._eh = {}
    flexp.setup("tests/data/", "exp01", False, override_dir=True)

    # Disable logging to be able to delete the experiment directory.
    flexp.disable()

    # Remove the experiment dir
    if os.path.exists(expdir):
        shutil.rmtree(expdir)
Example #5
def test_working():
    flexp.setup("tests/data/", "exp01", False)

    expdir = path.join("tests/data/", "exp01")
    assert path.isdir(expdir), "flexp didn't create experiment dir"

    with io.open("tests/testfile.txt", "wt") as x:
        x.write(u"hello")

    flexp.backup_files(["tests/testfile.txt"])
    assert path.exists(path.join("tests/data/", "exp01", "testfile.txt"))

    # Prevent flexp from creating metadata on exit
    if hasattr(atexit, "unregister"):
        atexit.unregister(flexp.core._eh["experiment"]._write_metadata)

    os.unlink("tests/testfile.txt")
    os.unlink(path.join(expdir, "testfile.txt"))
    if not hasattr(atexit, "unregister") and path.exists(
            path.join(expdir, "flexp_info.txt")):
        os.unlink(path.join(expdir, "flexp_info.txt"))
    flexp.disable()
    shutil.rmtree(expdir)
Example #6
def test_close():
    exp_root_dir = "tests/data/"
    expdir1 = path.join(exp_root_dir, "exp01")
    expdir2 = path.join(exp_root_dir, "exp02")

    # Remove the experiment dir if it exists
    if os.path.exists(expdir1):
        shutil.rmtree(expdir1)
    if os.path.exists(expdir2):
        shutil.rmtree(expdir2)

    # We have to reset the _eh to make flexp stop complaining about calling setup twice.
    flexp.core._eh = {}
    flexp.setup(exp_root_dir, "exp01", with_date=False)
    flexp.close()

    assert path.isfile(os.path.join(expdir1, "flexp_info.txt")), \
        "flexp didn't create flexp_info.txt after calling flexp.close()"

    flexp.setup(exp_root_dir, "exp02", with_date=False)
    flexp.close()

    # Ensure the log files don't contain the same rows
    log1 = load_log_without_timestamp(os.path.join(expdir1, "log.txt"))
    log2 = load_log_without_timestamp(os.path.join(expdir2, "log.txt"))
    assert len(set(log1) & set(log2)) == 0, \
        "Log files contain the same rows"

    # Ensure it is not possible to call flexp.setup() twice
    with pytest.raises(Exception):
        flexp.setup(exp_root_dir, "exp01", with_date=False, override_dir=True)
        flexp.setup(exp_root_dir, "exp02", with_date=False, override_dir=True)

    # Disable logging to be able to delete the experiment directory.
    flexp.disable()

    # Remove the experiment dir
    if os.path.exists(expdir1):
        shutil.rmtree(expdir1)

    if os.path.exists(expdir2):
        shutil.rmtree(expdir2)
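
load_log_without_timestamp is a test helper, not part of flexp. A minimal sketch, assuming each log row starts with a date and a time followed by the message:

def load_log_without_timestamp(log_path):
    """Return log rows with the leading timestamp stripped (sketch)."""
    rows = []
    with open(log_path) as fh:
        for line in fh:
            parts = line.rstrip("\n").split(" ", 2)
            # Keep everything after the date and time fields.
            rows.append(parts[2] if len(parts) == 3 else line.rstrip("\n"))
    return rows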
Example #7
def run(exp, override, dataset_size, model, vectorizer):
    exp = "{}-{}".format(exp, dataset_size)
    flexp.setup("./experiments",
                exp,
                with_date=False,
                loglevel=logging.INFO,
                override_dir=override)
    flexp.describe(model.__class__.__name__)

    # Load
    logging.info("Loading")
    df_train, df_dev = load_data(dataset_size, dataset_types=("train", "dev"))

    # Preprocess
    logging.info("Preprocessing")
    ids_train, titles_train, y_train = xy_split(df_train)
    ids_dev, titles_dev, y_dev = xy_split(df_dev)

    # Vectorize the titles (assumed to be the source of x_train/x_dev)
    logging.info("Vectorizing")
    x_train = vectorizer.fit_transform(titles_train)
    x_dev = vectorizer.transform(titles_dev)

    # Fit
    logging.info("Fitting")
    model.fit(x_train, y_train)

    # Predict
    logging.info("Predicting")
    with Timer() as t:
        y_train_pred_proba = model.predict_proba(x_train)
        y_train_pred, y_train_pred_proba = best_predictions(y_train_pred_proba,
                                                            model.classes_,
                                                            n=10)

        y_dev_pred_proba = model.predict_proba(x_dev)
        y_dev_pred, y_dev_pred_proba = best_predictions(y_dev_pred_proba,
                                                        model.classes_,
                                                        n=10)

    # Store
    logging.info("Storing")
    store_model_params(model, flexp.get_file("model_params.json"))
    # store_model(model, flexp.get_file("model_dict.pkl"))
    store_predictions(ids_dev, y_dev_pred, y_dev_pred_proba, dataset_size,
                      flexp.get_file("predictions_dev.npz"))

    # Eval
    logging.info("Evaluating accuracy")
    main_acc = eval_accuracy(y_train, y_train_pred, y_dev, y_dev_pred,
                             flexp.get_file("accuracy.csv"))

    logging.info("Evaluating classes")
    eval_classes(y_dev, y_dev_pred, model.classes_,
                 flexp.get_file("eval_classes.csv"))

    logging.info("Evaluating time")
    n_examples = y_train_pred_proba.shape[0] + y_dev_pred_proba.shape[0]
    main_time = eval_time(t.duration, n_examples, model.n_jobs)

    # logging.info("Evaluating confusion matrix")
    # eval_confusion_matrix(y_dev, y_dev_pred[:, 0], model.classes_, flexp.get_file("cm_dev.png"))

    logging.info("Evaluating feature importance")
    eval_feature_importance(model, vectorizer.get_feature_names(),
                            flexp.get_file("feature_importance.csv"))

    logging.info("Evaluating model size")
    main_size = eval_model_size(model)

    logging.info("Evaluating main metrics")
    metric_names, metric_values = zip(main_acc, main_time, main_size)
    eval_main_metrics(metric_names, metric_values,
                      flexp.get_file("metrics.csv"))

    flexp.close()  # Necessary to use it in a queue
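
best_predictions is another project helper. A minimal sketch, assuming it picks the n most probable classes per example from the predict_proba matrix and returns them together with their probabilities:

import numpy as np


def best_predictions(pred_proba, classes, n=10):
    """Return the top-n classes and their probabilities per example (sketch)."""
    # Indices of the n highest probabilities, best first.
    top_idx = np.argsort(pred_proba, axis=1)[:, ::-1][:, :n]
    top_classes = np.asarray(classes)[top_idx]
    top_proba = np.take_along_axis(pred_proba, top_idx, axis=1)
    return top_classes, top_proba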
Example #8
def main(exp):
    flexp.setup("./experiments",
                exp,
                with_date=False,
                loglevel=logging.INFO,
                override_dir=True)

    # Load
    logging.info("Loading data")
    df_train = pd.read_csv("data/data_clean_train.csv")
    df_dev = pd.read_csv("data/data_clean_dev.csv")

    # Preprocess - split data to x and y, keep x as pd.DataFrame
    logging.info("Preprocessing")
    df_x_train, y_train = xy_split(df_train)
    df_x_dev, y_dev = xy_split(df_dev)

    # Features
    feature_transformer = FeatureTransformer()
    # Fit parameters of transformers and transform x_train
    x_train = feature_transformer.fit_transform(df_x_train)
    # Transformers already fitted, just transform x_dev
    x_dev = feature_transformer.transform(df_x_dev)
    # x_train and x_dev are np.arrays now; we still want to know
    # the names of the features
    feature_names = feature_transformer.get_feature_names()

    # Impute - fill missing values with median of the column
    imputer = SimpleImputer(strategy="median")
    x_train = imputer.fit_transform(x_train)
    x_dev = imputer.transform(x_dev)

    # Scale - transform columns to be centered around 0.
    # For some methods this means easier training and better results, for others worse.
    features_to_scale = [
        "city mpg trans__", "Year__", "Number of Doors__", "Engine HP__"
    ]
    scaler = FeatureScaler(StandardScaler(), feature_names, features_to_scale)
    x_train = scaler.fit_transform(x_train)
    x_dev = scaler.transform(x_dev)

    # Logging
    # It is useful to log almost everything for easy debugging
    logging.info("x_train.shape={} y_train.shape={}".format(
        x_train.shape, y_train.shape))
    logging.info("x_dev.shape={} y_dev.shape={}".format(
        x_dev.shape, y_dev.shape))

    # Fit
    logging.info("Fitting")
    model = RandomForestRegressor(n_estimators=10)
    model.fit(x_train, y_train)

    # Eval
    logging.info("Evaluating")
    y_train_pred = model.predict(x_train)
    y_dev_pred = model.predict(x_dev)

    eval_rmse(y_train, y_train_pred, y_dev, y_dev_pred)
    eval_feature_importance(model, feature_names, x_dev, y_dev)
    eval_pdp(model, x_dev, feature_names)
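
eval_rmse is a small project evaluation helper used in Example #2 and Example #8. A minimal sketch, assuming it only needs to log the train and dev root-mean-squared error:

import logging

import numpy as np


def eval_rmse(y_train, y_train_pred, y_dev, y_dev_pred):
    """Log RMSE on the train and dev sets (sketch)."""
    def rmse(y, y_pred):
        return float(np.sqrt(np.mean((np.asarray(y) - np.asarray(y_pred)) ** 2)))

    logging.info("RMSE train=%.4f dev=%.4f",
                 rmse(y_train, y_train_pred), rmse(y_dev, y_dev_pred))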
Example #9
import sys
import logging as log

from flexp.flow import Chain
from flexp import flexp

from simple_modules import (
    LoadData, Lowercase, TrainTestSplit, TfIdf, Train, Eval
)

# All the experiments are stored in the "experiments" directory.
# The current one is in experiments/tf-idf-20180518-10-35-25/
flexp.setup("./experiments", "tf-idf", with_date=True)

# Store source code: the running program and its modules
flexp.backup_files([sys.argv[0], "simple_modules.py"])

# Store all files in these directories into a zip file
flexp.backup_sources(["../flexp/"])

flexp.describe("Query parameter prediction with TF-IDF and linear regression")

# Log that setup is complete
log.debug("flow setup complete.")

# Create the chain to load the data, lowercase and tokenize it.
file_name = "example_queries.tsv"
data_chain = Chain([
    LoadData(file_name),
    Lowercase(),
Example #10

args = parse_args()
num_test_positions = args.num_test_positions
num_train_positions = args.num_train_positions
test_positions = (list(range(num_test_positions))
                  if num_test_positions else None)
train_positions = (list(range(num_train_positions))
                   if num_train_positions else None)

train_days = ["201808{:02d}".format(i) for i in range(13, 32)]
test_days = ["201809{:02d}".format(i) for i in range(1, 22)]

flexp.setup(args.results_dir,
            "linear_pos_vowpal_num_train_pos_{}_num_test_pos_{}_cb_{}".format(
                num_train_positions, num_test_positions,
                "_".join(args.cb_types)),
            with_date=True)
flexp.describe("save_all_input_vowpal {}, train on {}, test on {}, "
               "num_train_positions {}, num_test_positions {}".format(
                   "_".join(args.cb_types), train_days, test_days,
                   num_train_positions, num_test_positions))

logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s %(levelname)s %(message)s")

np.random.seed(25)

logging.info("Reading dataset")
train = DatasetReader([
    os.path.join(args.dataset_dir, str(train_day)) for train_day in train_days