def main(): flexp.setup("experiments/", "exp01", False) flog.setup("debug", path=flexp.get_file_path("experiment.log.txt")) # , disable_stderr=not cfg.SHORT_RUN) log.debug('Starting.') data = FlowData() data.id = "a" # debug level 2 - all detailes will be printed data_chain = PickleCache("cached_pkls", "id", chain=[TestModule(12, 14, 18)], debug_level=2) data_chain.process(data) # hash of this and previous are same data_chain = PickleCache("cached_pkls", "id", chain=[TestModule(12, 14, 20)], debug_level=1) data_chain.process(data)
def main(exp): flexp.setup("./experiments", exp, with_date=False, loglevel=logging.INFO, log_filename="experiment.log.txt") # Load logging.info("Loading data") df_train = pd.read_csv("data/data_clean_train.csv") df_dev = pd.read_csv("data/data_clean_dev.csv") # Preprocess logging.info("Preprocessing") df_x_train, y_train = xy_split(df_train) df_x_dev, y_dev = xy_split(df_dev) feature_transformer = FeatureTransformer() x_train = feature_transformer.fit_transform(df_x_train) x_dev = feature_transformer.transform(df_x_dev) feature_names = feature_transformer.get_feature_names() imputer = SimpleImputer(strategy="median") x_train = imputer.fit_transform(x_train) x_dev = imputer.transform(x_dev) features_to_scale = ["city mpg__", "Year__", "Engine HP__"] scaler = FeatureScaler(StandardScaler(), feature_names, features_to_scale) x_train = scaler.fit_transform(x_train) x_dev = scaler.transform(x_dev) # Fit logging.info("Fitting") # model = RandomForestRegressor(n_estimators=10) model = Ridge(fit_intercept=False, normalize=False, alpha=1.) model.fit(x_train, y_train) # Eval logging.info("Evaluating") y_train_pred = model.predict(x_train) y_dev_pred = model.predict(x_dev) eval_rmse(y_train, y_train_pred, y_dev, y_dev_pred) # eval_feature_importance(model, feature_names) plot_histograms(x_train, feature_names)
def main(): flexp.setup("./experiments", "tf-idf", with_date=True) data = FlowData() my_chain = CachingChain([ PrintIdModule(), PickleCache("cached_pkl", "id", [TestModule(12, 14, 18)]), # id updated by PickleChain hash PrintIdModule(), PickleCache("cached_pkl", "id", [TestModule(12, 16, 18)]), # id updated by PickleChain hash PrintIdModule(), TestModule(12, 16, 20), # id updated PrintIdModule(), PrintIdModule(), ], update_data_id='id') my_chain.process(data)
def test_override():
    expdir = path.join("tests/data/", "exp01")

    # Remove the experiment dir if it exists
    if os.path.exists(expdir):
        shutil.rmtree(expdir)

    # We have to reset _eh to make flexp stop complaining about calling setup twice.
    flexp.core._eh = {}
    flexp.setup("tests/data/", "exp01", False, override_dir=False)
    assert path.isdir(expdir), "flexp didn't create experiment dir with override_dir=False"

    # Test that it fails to create the directory; a logging file should already be there.
    with pytest.raises(FileExistsError):
        flexp.core._eh = {}
        flexp.setup("tests/data/", "exp01", False, override_dir=False)

    # This should be ok
    flexp.core._eh = {}
    flexp.setup("tests/data/", "exp01", False, override_dir=True)

    # Disable logging to be able to delete the experiment directory.
    flexp.disable()

    # Remove the experiment dir
    if os.path.exists(expdir):
        shutil.rmtree(expdir)
def test_working():
    flexp.setup("tests/data/", "exp01", False)
    expdir = path.join("tests/data/", "exp01")
    assert path.isdir(expdir), "flexp didn't create experiment dir"

    with io.open("tests/testfile.txt", "wt") as x:
        x.write(u"hello")
    flexp.backup_files(["tests/testfile.txt"])
    assert path.exists(path.join("tests/data/", "exp01", "testfile.txt"))

    # Forbid flexp from creating metadata on exit
    if hasattr(atexit, "unregister"):
        getattr(atexit, "unregister")(flexp.core._eh["experiment"]._write_metadata)

    os.unlink("tests/testfile.txt")
    os.unlink(path.join(expdir, "testfile.txt"))
    if not hasattr(atexit, "unregister") and path.exists(
            path.join(expdir, "flexp_info.txt")):
        os.unlink(path.join(expdir, "flexp_info.txt"))

    flexp.disable()
    shutil.rmtree(expdir)
def test_close():
    exp_root_dir = "tests/data/"
    expdir1 = path.join(exp_root_dir, "exp01")
    expdir2 = path.join(exp_root_dir, "exp02")

    # Remove the experiment dirs if they exist
    if os.path.exists(expdir1):
        shutil.rmtree(expdir1)
    if os.path.exists(expdir2):
        shutil.rmtree(expdir2)

    # We have to reset _eh to make flexp stop complaining about calling setup twice.
    flexp.core._eh = {}
    flexp.setup(exp_root_dir, "exp01", with_date=False)
    flexp.close()
    assert path.isfile(os.path.join(expdir1, "flexp_info.txt")), \
        "flexp didn't create flexp_info.txt after calling flexp.close()"

    flexp.setup(exp_root_dir, "exp02", with_date=False)
    flexp.close()

    # Ensure the log files don't contain the same rows
    log1 = load_log_without_timestamp(os.path.join(expdir1, "log.txt"))
    log2 = load_log_without_timestamp(os.path.join(expdir2, "log.txt"))
    assert len(set(log1) & set(log2)) == 0, \
        "Log files contain the same rows"

    # Ensure it is not possible to call flexp.setup() twice
    with pytest.raises(Exception):
        flexp.setup(exp_root_dir, "exp01", with_date=False, override_dir=True)
        flexp.setup(exp_root_dir, "exp02", with_date=False, override_dir=True)

    # Disable logging to be able to delete the experiment directories.
    flexp.disable()

    # Remove the experiment dirs
    if os.path.exists(expdir1):
        shutil.rmtree(expdir1)
    if os.path.exists(expdir2):
        shutil.rmtree(expdir2)
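# A possible implementation of the load_log_without_timestamp() helper used in
# test_close() above. It assumes each log line starts with a date and a time
# (e.g. the "%(asctime)s %(levelname)s %(message)s" format); this is only a
# sketch, not necessarily the project's actual helper.
def load_log_without_timestamp(log_path):
    """Return log lines with the leading timestamp stripped off."""
    lines = []
    with open(log_path) as f:
        for line in f:
            # Drop the first two whitespace-separated fields (date and time).
            lines.append(line.rstrip("\n").split(" ", 2)[-1])
    return lines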
def run(exp, override, dataset_size, model, vectorizer):
    exp = "{}-{}".format(exp, dataset_size)
    flexp.setup("./experiments", exp, with_date=False, loglevel=logging.INFO,
                override_dir=override)
    flexp.describe(model.__class__.__name__)

    # Load
    logging.info("Loading")
    df_train, df_dev = load_data(dataset_size, dataset_types=("train", "dev"))

    # Preprocess
    logging.info("Preprocessing")
    ids_train, titles_train, y_train = xy_split(df_train)
    ids_dev, titles_dev, y_dev = xy_split(df_dev)

    # Vectorize the titles into feature matrices
    x_train = vectorizer.fit_transform(titles_train)
    x_dev = vectorizer.transform(titles_dev)

    # Fit
    logging.info("Fitting")
    model.fit(x_train, y_train)

    # Predict
    logging.info("Predicting")
    with Timer() as t:
        y_train_pred_proba = model.predict_proba(x_train)
        y_train_pred, y_train_pred_proba = best_predictions(
            y_train_pred_proba, model.classes_, n=10)
        y_dev_pred_proba = model.predict_proba(x_dev)
        y_dev_pred, y_dev_pred_proba = best_predictions(
            y_dev_pred_proba, model.classes_, n=10)

    # Store
    logging.info("Storing")
    store_model_params(model, flexp.get_file("model_params.json"))
    # store_model(model, flexp.get_file("model_dict.pkl"))
    store_predictions(ids_dev, y_dev_pred, y_dev_pred_proba, dataset_size,
                      flexp.get_file("predictions_dev.npz"))

    # Eval
    logging.info("Evaluating accuracy")
    main_acc = eval_accuracy(y_train, y_train_pred, y_dev, y_dev_pred,
                             flexp.get_file("accuracy.csv"))

    logging.info("Evaluating classes")
    eval_classes(y_dev, y_dev_pred, model.classes_, flexp.get_file("eval_classes.csv"))

    logging.info("Evaluating time")
    n_examples = y_train_pred_proba.shape[0] + y_dev_pred_proba.shape[0]
    main_time = eval_time(t.duration, n_examples, model.n_jobs)

    # logging.info("Evaluating confusion matrix")
    # eval_confusion_matrix(y_dev, y_dev_pred[:, 0], model.classes_, flexp.get_file("cm_dev.png"))

    logging.info("Evaluating feature importance")
    eval_feature_importance(model, vectorizer.get_feature_names(),
                            flexp.get_file("feature_importance.csv"))

    logging.info("Evaluating model size")
    main_size = eval_model_size(model)

    logging.info("Evaluating main metrics")
    metric_names, metric_values = zip(main_acc, main_time, main_size)
    eval_main_metrics(metric_names, metric_values, flexp.get_file("metrics.csv"))

    flexp.close()  # Necessary to use it in a queue
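# A minimal sketch of how run() might be driven from a simple experiment queue;
# flexp.close() above releases the experiment so the next run() can call
# flexp.setup() again. The model list, dataset sizes and vectorizer below are
# illustrative assumptions, not part of the original script.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

if __name__ == "__main__":
    queue = [
        ("logreg", LogisticRegression(max_iter=1000, n_jobs=1)),
        ("forest", RandomForestClassifier(n_estimators=100, n_jobs=1)),
    ]
    for dataset_size in ("small", "full"):
        for name, model in queue:
            # Each call sets up its own flexp experiment directory and closes it.
            run(name, override=True, dataset_size=dataset_size,
                model=model, vectorizer=CountVectorizer())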
def main(exp): flexp.setup("./experiments", exp, with_date=False, loglevel=logging.INFO, override_dir=True) # Load logging.info("Loading data") df_train = pd.read_csv("data/data_clean_train.csv") df_dev = pd.read_csv("data/data_clean_dev.csv") # Preprocess - split data to x and y, keep x as pd.DataFrame logging.info("Preprocessing") df_x_train, y_train = xy_split(df_train) df_x_dev, y_dev = xy_split(df_dev) # Features feature_transformer = FeatureTransformer() # Fit parameters of transformers and transform x_train x_train = feature_transformer.fit_transform(df_x_train) # Transformers already fitted, just transform x_dev x_dev = feature_transformer.transform(df_x_dev) # x_train, x_dev is np.array now, we still want to know # the names of features feature_names = feature_transformer.get_feature_names() # Impute - fill missing values with median of the column imputer = SimpleImputer(strategy="median") x_train = imputer.fit_transform(x_train) x_dev = imputer.transform(x_dev) # Scale - transforms columns to be around 0 # For some methods easier training, better results, for some methods worse features_to_scale = [ "city mpg trans__", "Year__", "Number of Doors__", "Engine HP__" ] scaler = FeatureScaler(StandardScaler(), feature_names, features_to_scale) x_train = scaler.fit_transform(x_train) x_dev = scaler.transform(x_dev) # Logging # It is useful to log almost everything for easy debugging logging.info("x_train.shape={} y_train.shape={}".format( x_train.shape, y_train.shape)) logging.info("x_dev.shape={} y_dev.shape={}".format( x_dev.shape, y_dev.shape)) # Fit logging.info("Fitting") model = RandomForestRegressor(n_estimators=10) model.fit(x_train, y_train) # Eval logging.info("Evaluating") y_train_pred = model.predict(x_train) y_dev_pred = model.predict(x_dev) eval_rmse(y_train, y_train_pred, y_dev, y_dev_pred) eval_feature_importance(model, feature_names, x_dev, y_dev) eval_pdp(model, x_dev, feature_names)
import sys
import logging as log

from flexp.flow import Chain
from flexp import flexp

from simple_modules import (
    LoadData, Lowercase, TrainTestSplit, TfIdf, Train, Eval
)

# All the experiments are stored in the "experiments" directory.
# This particular run lives in experiments/tf-idf-20180518-10-35-25/
flexp.setup("./experiments", "tf-idf", with_date=True)

# Store source codes
# Store the running program
flexp.backup_files([sys.argv[0], "simple_modules.py"])
# Store all files in these directories into a zip file
flexp.backup_sources(["../flexp/"])

flexp.describe("Query parameter prediction with TF-IDF and linear regression")

# Setup logging
log.debug("flow setup complete.")

# Create the chain to load the data, lowercase and tokenize it.
file_name = "example_queries.tsv"
data_chain = Chain([
    LoadData(file_name),
    Lowercase(),
args = parse_args()

num_test_positions = args.num_test_positions
num_train_positions = args.num_train_positions
test_positions = (list(range(num_test_positions))
                  if num_test_positions else None)
train_positions = (list(range(num_train_positions))
                   if num_train_positions else None)

train_days = ["201808{:02d}".format(i) for i in range(13, 32)]
test_days = ["201809{:02d}".format(i) for i in range(1, 22)]

flexp.setup(
    args.results_dir,
    "linear_pos_vowpal_num_train_pos_{}_num_test_pos_{}_cb_{}".format(
        num_train_positions, num_test_positions, "_".join(args.cb_types)),
    with_date=True)
flexp.describe(
    "save_all_input_vowpal {}, train on {}, test on {}, "
    "num_train_positions {}, num_test_positions {}".format(
        "_".join(args.cb_types), train_days, test_days,
        num_train_positions, num_test_positions))

logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s %(levelname)s %(message)s")

np.random.seed(25)

logging.info("Reading dataset")
train = DatasetReader([
    os.path.join(args.dataset_dir, str(train_day))
    for train_day in train_days