# Path of the cached featurized CSV for the dataset named by `data_name`.
feature_output_file = os.path.join(
    feature_output_path,
    "{}_all_featurized_data.csv".format(data_name),
)

if not os.path.exists(feature_output_file):
    # No cached table on disk: featurize from scratch while profiling.
    df_init = load_glass_ternary_landolt()

    prof = Profile()
    prof.enable()

    # Profiled section: featurize every formula, then preprocess the result.
    featzer = Featurize()
    df_feats = featzer.featurize_formula(df_init, featurizers="all")
    prep = PreProcess(max_colnull=0.1)
    df = prep.preprocess(df_feats)

    # Snapshot the collected profile and print the 5 entries with the
    # largest internal time.
    prof.create_stats()
    print("featurize time:\n")
    (pstats.Stats(prof)
           .strip_dirs()
           .sort_stats("time")
           .print_stats(5))

    # Make sure the output directory exists before dumping the raw profile.
    if not os.path.exists(feature_output_path):
        os.makedirs(feature_output_path)
        print("create output path: {} successful!".format(feature_output_path))
    else:
        print("output path: {} exists!".format(feature_output_path))

    prof.dump_stats(os.path.join(
        feature_output_path,
        "cProfile_for_featurize_{}.log".format(data_name),
    ))
else:
    # Cached table found: reuse it, taking the first CSV column as the index.
    df = pd.read_csv(feature_output_file, index_col=0)
# actual pipeline:
# Load the data from load_castelli_perovskites(), optionally subsample it to
# LIMIT rows, featurize, preprocess and split into train/test sets.
df_init = load_castelli_perovskites()
if LIMIT and LIMIT < len(df_init):
    # Draw the subsample from a generator seeded with RS (instead of the
    # global, unseeded numpy state) so repeated runs pick the same rows.
    subsample_rows = np.random.RandomState(RS).choice(len(df_init), LIMIT,
                                                      replace=False)
    df_init = df_init.iloc[subsample_rows]

featzer = Featurize(ignore_cols=IGNORE_THESE_COLUMNS,
                    exclude=EXCLUDED_FEATURIZERS,
                    multiindex=MULTIINDEX,
                    drop_featurized_col=True)

df = featzer.auto_featurize(df_init,
                            input_cols=FEATUREIZE_THESE_COLUMNS,
                            guess_oxidstates=True)

prep = PreProcess(target=TARGET)
df = prep.preprocess(df)

# Seed the split with the same RS that is handed to the model below;
# otherwise the train/test partition differs on every run and the fixed
# model seed does not make results reproducible.
X_train, X_test, y_train, y_test = train_test_split(df.drop(TARGET, axis=1),
                                                    df[TARGET],
                                                    random_state=RS)
# Announce and record the start time before building the AutoML object.
# NOTE(review): `time` is presumably `time.time` imported at file level —
# confirm.
print('start timing...')
start_time = time()
# Build the TPOT-based AutoML wrapper from the module-level settings.
# NOTE(review): this argument list is not closed within the visible source;
# the remaining arguments / closing parenthesis are missing here — confirm
# against the full file before editing.
tpot = TPOTAutoML(mode=MODE,
                  max_time_mins=TIMEOUT_MINS,
                  generations=GENERATIONS,
                  population_size=POPULATION_SIZE,
                  scoring=SCORING,
                  random_state=RS,
                  # Feature names are the preprocessed columns minus TARGET.
                  feature_names=df.drop(TARGET, axis=1).columns,
                  n_jobs=1,
# Run settings: target column name, random seed, task mode and whether the
# featurized dataframe uses multiindex columns.
target = 'gap expt'
RS = 24
mode = 'regression'
MULTIINDEX = True
if MULTIINDEX:
    # With multiindex columns the target is addressed by a two-level key.
    target = ('Input Data', target)

# Load the data and featurize the formulas with all featurizers except the
# two excluded by name.
df_init = load_expt_gap()
featzer = Featurize(exclude=['CohesiveEnergy', 'AtomicPackingEfficiency'],
                    multiindex=MULTIINDEX)

df = featzer.featurize_formula(df_init,
                               featurizers='all',
                               guess_oxidstates=False)

prep = PreProcess(target=target)
df = prep.preprocess(df)

# Show the first rows and write the preprocessed table to test.csv.
print(df.head())
df.to_csv('test.csv')

# Seed the split with RS as well: the model below is seeded, but with an
# unseeded split the train/test partition (and thus every score) would still
# change from run to run.
X_train, X_test, y_train, y_test = train_test_split(df.drop(target, axis=1),
                                                    df[target],
                                                    random_state=RS)

# Random forest with fixed hyperparameters; bootstrap is disabled and each
# split considers 80% of the features.
model = RandomForestRegressor(n_estimators=100,
                              bootstrap=False,
                              max_features=0.8,
                              min_samples_leaf=1,
                              min_samples_split=4,
                              random_state=RS)