def upload_dataset():
    if "file" not in request.files:
        abort(400)
    file = request.files["file"]
    if file.filename == "":
        abort(400)
    filename = secure_filename(file.filename)
    path = DATASETS_DIRECTORY / filename
    file.save(path)
    Dataset.create_from_path(path).save()
    return {"status": "ok"}, 201
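# A minimal sketch of exercising the upload handler with Flask's test client.
# The "/datasets" route is an assumption for illustration; the actual URL rule
# is not shown in this file.
import io

def test_upload_dataset():
    payload = {"file": (io.BytesIO(b"a,b\n1,2\n"), "example.csv")}
    response = app.test_client().post(
        "/datasets", data=payload, content_type="multipart/form-data"
    )
    assert response.status_code == 201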
def export_confusion_matrix(id):
    model, _, _ = Dataset.model_from_id(id)
    if model.status != "done":
        return {"error": "Model is not trained"}, 409
    if not model.confusion_matrix_path:
        return {"error": "No confusion matrix available"}, 404
    return send_file(model.confusion_matrix_path, as_attachment=True)
def delete_config(id):
    app.logger.info(f"Removing config {id}")
    config, dataset = Dataset.config_from_id(id)
    config.delete_data()
    dataset.configs = [c for c in dataset.configs if c.id != config.id]
    dataset.save()
    return jsonify({})
def create_model(id):
    config, dataset = Dataset.config_from_id(id)
    app.logger.info(f"Creating model with config: {request.json}")
    model = DatasetModel(model_config=request.json)
    config.models.append(model)
    dataset.save()
    return model.to_json(), 201
def delete_model(id):
    app.logger.info(f"Removing model {id}")
    model, config, dataset = Dataset.model_from_id(id)
    model.delete_data()
    config.models = [m for m in config.models if m.id != model.id]
    dataset.save()
    return jsonify({})
def run(self):
    dataset_file = 'CSVdataset - Sheet1.csv'
    df = pd.read_csv(dataset_file)
    # Insert one Dataset row per CSV line, with a fixed 'nocontext' context
    for _, row in df.iterrows():
        dt = Dataset(row['paragraph'], row['intent'], 'nocontext')
        self.db.session.add(dt)
    # Commit once all rows are staged so the seed data actually persists
    self.db.session.commit()
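# The seeder above only reads the `paragraph` and `intent` columns; any other
# columns are ignored and context is hard-coded to 'nocontext'. A hypothetical
# example of the expected CSV layout:
#
#   paragraph,intent
#   "how do I reset my password?",account_help
#   "what are your opening hours?",opening_hours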
def set_dataset_config(id):
    dataset = Dataset.from_id(id)
    columns = request.json.get("columns")
    label = request.json.get("label")
    model_type = request.json.get("model_type")
    config = DatasetConfig(columns=columns, label=label, model_type=model_type)
    dataset.configs.append(config)
    app.logger.info(f"Inserting config {request.json}")
    app.logger.info(dataset.configs)
    dataset.save()
    return config.to_json(), 201
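# Example request body for this endpoint (column names are hypothetical).
# `columns` maps each column name to whether it is used as a feature, `label`
# names the target column, and `model_type` selects between classification and
# regression, matching how the config is consumed during training and prediction:
#
#   {
#     "columns": {"age": true, "job": true, "row_id": false},
#     "label": "outcome",
#     "model_type": "classification"
#   }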
def load_all_datasets(datasets_directory):
    """Load all unknown datasets into the database."""
    datasets_already_loaded = [Path(d.path) for d in Dataset.objects]
    for path in map(Path, os.listdir(datasets_directory)):
        path = (datasets_directory / path).resolve()
        if path.suffix == ".csv" and path not in datasets_already_loaded:
            log.info(f"Loading {path}")
            d = Dataset.create_from_path(path).save()
            log.info(f"Created entry for dataset {path}: {d.to_json()}")
        else:
            log.info(f"Not loading {path}")
def get_dataset_visualization(path: Path, dataset: Dataset, config: DatasetConfig = None):
    """Get or generate the SweetViz visualisation."""
    if config is not None:
        viz_name = f"{path.name}-{config.id}-sweetviz.html"
    else:
        viz_name = f"{path.name}-sweetviz.html"
    viz_path = path.with_name(viz_name)
    log.info(f"Searching viz file {viz_path}")
    if viz_path.exists():
        log.info("Viz found")
    else:
        # Generate the report, targeting the label column when a config is given
        if config is not None:
            options = {
                "target_feat": config.label,
                "feat_cfg": sv.FeatureConfig(force_num=[config.label]),
            }
            config.visualization_path = str(viz_path)
        else:
            options = {}
            dataset.visualization_path = str(viz_path)
        generate_visulisation(path, viz_path, options).compute()
        dataset.save()
    log.info("Returning viz")
    return viz_path
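# `generate_visulisation` is defined elsewhere; a minimal sketch of what it
# plausibly does, assuming it wraps the SweetViz analysis in a Dask delayed
# task (which would explain the `.compute()` call above). This is an
# assumption about the helper, not its actual implementation.
import dask
import pandas as pd
import sweetviz as sv

@dask.delayed
def generate_visulisation(path, viz_path, options):
    # sep=None lets pandas sniff the delimiter; drop stray unnamed index columns
    data = pd.read_csv(path, sep=None)
    data.drop(data.filter(regex="Unname"), axis=1, inplace=True)
    report = sv.analyze(data, **options)
    report.show_html(filepath=str(viz_path), open_browser=False)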
def dataset_status(id):
    model: DatasetModel
    model, _, _ = Dataset.model_from_id(id)
    reply = {"status": model.status}
    if model.log_path:
        try:
            with open(model.log_path) as f:
                reply["logs"] = f.read()
        except FileNotFoundError:
            pass
    return reply
def store():
    intent = request.json['intent']
    paragraph = request.json['paragraph']
    context = request.json.get('context', 'nocontext')
    dataset = Dataset(context=context, intent=intent, paragraph=paragraph)
    db.session.add(dataset)
    db.session.commit()
    data = singleTransform(dataset)
    return responses.created(data, 'Dataset successfully created')
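# Example request body for the endpoint above; `context` is optional and
# defaults to 'nocontext' (values are hypothetical):
#
#   {"intent": "greeting", "paragraph": "hello there", "context": "smalltalk"}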
def predict_result(id):
    dataset: Dataset
    config: DatasetConfig
    model, config, dataset = Dataset.model_from_id(id)

    # Check if the model is trained
    if model.status != "done":
        return {"error": "Model is not trained"}, 409

    app.logger.info(f"Predicting for dataset {dataset.name}")
    app.logger.info(f"Found configuration {config}")

    data = request.json
    app.logger.info(f"Got data {data}")

    # Re-encode categorical values through the stored column mapping;
    # everything else is parsed as a float
    mapping = column_mapping.decode_mapping(dataset.column_mapping)
    for line in data:
        for k in line.keys():
            if k in mapping:
                line[k] = mapping[k][line[k]]
            else:
                line[k] = float(line[k])
    app.logger.info(f"Decoded data {data}")

    # Keep only the enabled feature columns, in the dataset's column order
    columns_order = [
        col for col in dataset.columns
        if col in config.columns and config.columns[col] and col != config.label
    ]
    app.logger.info(f"Columns order {columns_order}")

    data = np.array([[line[col] for col in columns_order] for line in data])
    app.logger.info(f"Sorted data {data}")

    with open(model.pickled_model_path, "rb") as f:
        pipeline = pickle.load(f)
    app.logger.info("Loaded pipeline")

    result = pipeline.predict(data).tolist()
    app.logger.info(f"Predicted {result}")

    # Map encoded label values back to their original representation
    if config.label in mapping:
        result = [
            column_mapping.reconvert_one_value(config.label, value, mapping)
            for value in result
        ]
    return jsonify([{config.label: value} for value in result])
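# Example request body for the prediction endpoint: a JSON list with one
# object per row to predict, keyed by column name and excluding the label
# column. Categorical values are sent as their original strings and re-encoded
# through the stored column mapping; everything else must parse as a float.
# Column names below are hypothetical:
#
#   [
#     {"age": "42", "job": "engineer"},
#     {"age": "27", "job": "teacher"}
#   ]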
def train_model(id):
    model, config, dataset = Dataset.model_from_id(id)

    # Refuse to start if training is already done or in progress
    if model.status == "done":
        return {"error": "Model is already trained"}, 409
    if model.status not in ["not started", "error"]:
        return {"error": "Model is currently training"}, 409

    app.logger.info(f"Starting training dataset {dataset.name}")
    app.logger.info(f"config: {config.to_json()}")
    app.logger.info(f"model: {model.to_json()}")
    app.logger.info(f"Found configuration {config}")

    # Update the status before handing the work off to a Dask worker
    model.status = "starting"
    dataset.save()

    fut = client.submit(training.train_model, id)
    fire_and_forget(fut)
    return {"status": model.status}, 202
def lint_config_from_request(id):
    config = request.json
    dataset = Dataset.from_id(id)
    return lint_config(config, dataset)
def train_model(model_id):
    config: DatasetConfig
    model, config, dataset = Dataset.model_from_id(model_id)

    def set_status(status):
        logger.info(f"Setting status of {model.id} to: {status}")
        model.status = status
        dataset.save()

    try:
        # Create the different asset paths
        dataset_path = Path(dataset.path)
        model_dir = dataset_path.parent / f"{dataset.name}-model-{model.id}"
        model_dir.mkdir(exist_ok=True)
        log_path = model_dir / "training.log"
        pickled_model_path = model_dir / "pipeline.pickle"
        exported_model_path = model_dir / "pipeline.py"
        shap_model_path = model_dir / "save.png"
        confusion_matrix_path = model_dir / "confusion_matrix.png"

        model.log_path = str(log_path)
        set_status("started")

        # Load the dataset
        mapping = column_mapping.decode_mapping(dataset.column_mapping)
        X, y = get_dataset(dataset_path, config, mapping)
        logger.info(f"Loaded dataset: {X} {y}")
        logger.info(f"Mapping: {mapping}")

        # Keep the labelled frames around: SHAP and the confusion matrix
        # need the column names
        copy_X = X
        copy_y = y

        # Convert to types TPOT understands
        X = X.to_numpy().astype(np.float64)
        y = y.to_numpy().astype(np.float64)

        # Split into training and testing data, with column names and without;
        # sharing the random_state keeps both splits on the same rows
        _, X_test_col, _, y_test_col = train_test_split(
            copy_X, copy_y, test_size=0.2, random_state=42)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)
        logger.info(config.to_json())

        # Train the model
        classifier = tpot_training(X_train, y_train, model.model_config,
                                   log_file=log_path, model_type=config.model_type)

        # Save the best pipeline
        save_res = save_pipeline(classifier, pickled_model_path)
        # Export the best pipeline's code
        export_res = export_pipeline_code(classifier, exported_model_path)
        # Save the SHAP image
        image_res = save_shap(classifier, shap_model_path, copy_X, copy_y, mapping)
        # Compute metrics on the generated pipeline
        analysis_res = analyse_model(classifier, X_train, y_train, X_test, y_test)
        # Create the confusion matrix (classification only)
        if config.model_type == "classification":
            matrix_res = create_confusion_matrix(classifier, X_test_col,
                                                 y_test_col, confusion_matrix_path)
        else:
            matrix_res = dask.delayed(None)

        # Run the saving, exporting, and analysis tasks
        _, _, analysis, *_ = dask.compute(save_res, export_res, analysis_res,
                                          matrix_res, image_res)

        # Update the model with the exported paths and mark it as done
        logger.info(f"Matrix path: {confusion_matrix_path}, SHAP path: {shap_model_path}")
        model.pickled_model_path = str(pickled_model_path)
        model.exported_model_path = str(exported_model_path)
        if config.model_type == "classification":
            model.confusion_matrix_path = str(confusion_matrix_path)
        model.shap_model_path = str(shap_model_path)
        model.analysis = analysis
        model.status = "done"
        dataset.save()
    except Exception as e:
        logger.error(f"Got error while training: {e}")
        traceback.print_exc()
        set_status("error")
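# Note: this function runs on a Dask worker. The API submits it with
# client.submit(training.train_model, id) followed by fire_and_forget (see the
# train_model route above), so progress is reported through model.status and
# the training log rather than through a return value.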
def delete_dataset(id):
    dataset = Dataset.from_id(id)
    dataset.delete_data()
    dataset.delete()
    return {}
def route_get_dataset(id):
    dataset = Dataset.from_id(id)
    return dataset.to_json()
def get_dataset_config(id):
    config, _ = Dataset.config_from_id(id)
    return config.to_json()
def get_dataset_visualization(id):
    d = Dataset.from_id(id)
    path = dataset.get_dataset_visualization(Path(d.path), d)
    return send_file(path)
def lint_config_from_db(id):
    config, dataset = Dataset.config_from_id(id)
    return lint_config(config, dataset)
def get_config_visualization(id):
    config, d = Dataset.config_from_id(id)
    path = dataset.get_dataset_visualization(Path(d.path), d, config)
    return send_file(path)
def export_shap_value(id):
    model, _, _ = Dataset.model_from_id(id)
    if model.status != "done":
        return {"error": "Model is not trained"}, 409
    app.logger.info(f"SHAP image path: {model.shap_model_path}")
    return send_file(model.shap_model_path, as_attachment=True)
def get_model(id):
    model, _, _ = Dataset.model_from_id(id)
    return model.to_json()
def export_pickle(id):
    model, _, _ = Dataset.model_from_id(id)
    if model.status != "done":
        return {"error": "Model is not trained"}, 409
    return send_file(model.pickled_model_path, as_attachment=True)