from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

from dabl import EasyPreprocessor, SimpleClassifier, clean, explain
from dabl.datasets import load_titanic


def test_explain_smoke_titanic():
    titanic = load_titanic()
    titanic_clean = clean(titanic)

    # explain a fitted dabl SimpleClassifier directly
    sc = SimpleClassifier().fit(titanic_clean, target_col='survived')
    explain(sc)

    # explain a bare estimator, passing feature names explicitly
    X, y = titanic_clean.drop("survived", axis=1), titanic_clean.survived
    ep = EasyPreprocessor()
    preprocessed = ep.fit_transform(X)
    tree = DecisionTreeClassifier().fit(preprocessed, y)
    explain(tree, feature_names=ep.get_feature_names())

    # explain a sklearn pipeline that wraps the preprocessor
    pipe = make_pipeline(EasyPreprocessor(), LogisticRegression())
    pipe.fit(X, y)
    explain(pipe, feature_names=pipe[0].get_feature_names())
def test_explain_titanic_val(model):
    # add multi-class
    # add regression
    titanic = load_titanic()
    titanic_clean = clean(titanic)
    X, y = titanic_clean.drop("survived", axis=1), titanic_clean.survived
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, stratify=y, random_state=42)
    pipe = make_pipeline(EasyPreprocessor(), model)
    pipe.fit(X_train, y_train)
    # without validation set
    explain(pipe, feature_names=X.columns)
    # with validation set
    explain(pipe, X_val, y_val, feature_names=X.columns)
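# `model` is presumably supplied via pytest parametrization; a minimal
# sketch of such a decorator, with an assumed estimator list:
#
# import pytest
#
# @pytest.mark.parametrize(
#     "model",
#     [LogisticRegression(C=0.1), DecisionTreeClassifier(max_depth=3)],
# )
# def test_explain_titanic_val(model):
#     ...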
from datetime import date

import dabl
import numpy as np
import pandas as pd
from tqdm import tqdm

today = date.today().isoformat()  # assumed definition; `today` is referenced in run() below


def preprocess_data(df, sample=20000):
    # subsample large files so that dabl.clean stays tractable
    data = df.sample(n=min(sample, len(df)), random_state=42)
    data_clean, data_types = dabl.clean(data, return_types=True, verbose=3)
    return data_clean, data_types
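# Hypothetical usage (the file name is illustrative only):
# df = pd.read_csv("example.csv", sep=";")
# data_clean, data_types = preprocess_data(df)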
def run(csv_metadata, list_files, output_folder):
    csv_id, csv_metadata = csv_metadata
    tqdm.write(f"\nTreating {csv_id} file")
    # find the full path for the csv_id
    mask = [csv_id in str(f) for f in list_files]
    csv_file_path = list_files[mask.index(True)]
    dabl_analysis_path = (output_folder / (csv_id + '_dabl')).with_suffix('.csv')
    # disabled: skip files that were already analyzed
    # if dabl_analysis_path.exists():
    #     tqdm.write(f"File {csv_id} already analyzed: "
    #                f"{dabl_analysis_path} already exists")
    #     return dabl_analysis_path
    result_list = []
    if csv_metadata and len(csv_metadata) > 1:
        encoding = csv_metadata["encoding"]
        sep = csv_metadata["separator"]
    else:
        encoding = "latin-1"  # because why not
        sep = ";"
    csv_detective_columns = []
    if "columns" in csv_metadata:
        # keep columns that are not boolean
        csv_detective_columns = [
            k.strip('"') for k, v in csv_metadata['columns'].items()
            if "booleen" not in v
        ]
    try:
        # error_bad_lines was removed in pandas 2.0; use on_bad_lines="skip" there
        data: pd.DataFrame = pd.read_csv(csv_file_path.as_posix(),
                                         encoding=encoding, sep=sep,
                                         error_bad_lines=False)
        # remove csv_detective columns
        # data = data.drop(csv_detective_columns, axis=1)
        # TODO change this as now the columns are not in the same order
        data_clean, data_types = dabl.clean(data, return_types=True,
                                            verbose=3)  # dabl.detect_types(data)
        money_variables = csv_metadata['columns']['money']
        for target_col in money_variables:
            try:
                data_clean_no_nan = data_clean[data_clean[target_col].notna()]
                if len(data_clean_no_nan) < 100:
                    # fewer than 100 examples is too few to fit a model
                    continue
                print(f"Building models with target variable: {target_col}")
                sc = dabl.SimpleRegressor(random_state=42).fit(
                    data_clean_no_nan, target_col=target_col)
                features_names = sc.est_.steps[0][1].get_feature_names()
                inner_dict = {
                    "csv_id": csv_id,
                    "task": "regression",
                    "algorithm": sc.current_best_.name,
                    "target_col": target_col,
                    "nb_features": len(features_names),
                    "features_names": "|".join(features_names),
                    "nb_classes": len(data[target_col].unique()),
                    "nb_lines": data_clean_no_nan.shape[0],
                    "date": today,
                }
                inner_dict.update(sc.current_best_.to_dict())
                inner_dict.update({
                    "avg_scores": np.mean(
                        list(sc.current_best_.to_dict().values()))
                })
                result_list.append(inner_dict)
            except Exception as e:
                tqdm.write(f"Could not analyze file {csv_id} with target "
                           f"col {target_col}. Error: {e}")
    except Exception as e:
        tqdm.write(f"Could not analyze file {csv_id}. Error: {e}")
        return None
    if not result_list:
        return None
    result_df = pd.DataFrame(result_list)
    with open(dabl_analysis_path, "w") as filo:
        result_df.to_csv(filo, header=True, index=False)
    return dabl_analysis_path
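# A minimal driver sketch, assuming csv_detective-style metadata stored as
# a JSON mapping of csv_id -> metadata; the paths used here
# (csv_metadata.json, csv_dumps/, dabl_output/) are hypothetical, not
# taken from the original script.
if __name__ == "__main__":
    import json
    from pathlib import Path

    output_folder = Path("dabl_output")
    output_folder.mkdir(exist_ok=True)
    list_files = list(Path("csv_dumps").glob("**/*.csv"))
    all_metadata = json.loads(Path("csv_metadata.json").read_text())
    # run() expects a (csv_id, metadata) tuple, exactly what .items() yields
    for item in tqdm(all_metadata.items(), total=len(all_metadata)):
        run(item, list_files, output_folder)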
bdf.save_within_48.value_counts()


# ## Extract and do DABL

# In[ ]:


import dabl


# In[ ]:


feature_df = bdf[final_feature_list]


# In[ ]:


dabl_data = dabl.clean(feature_df)


# In[ ]:


dabl.plot(dabl_data, target_col='save_within_48')


# In[ ]:


X = dabl_data.drop("save_within_48", axis=1)
Y = dabl_data.save_within_48


# In[ ]:


preprocessor = dabl.EasyPreprocessor()
X_trans = preprocessor.fit_transform(X)
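# In[ ]:


# A possible next cell (not in the original notebook): cross-validate a
# baseline classifier on the preprocessed features; LogisticRegression is
# an assumed choice here.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

clf = LogisticRegression(max_iter=1000)
cross_val_score(clf, X_trans, Y, cv=5)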