def tde_export():
    print "Start export to TDE"
    input_name = get_input_names_for_role('input')[0]
    input_dataset = dataiku.Dataset(input_name)
    input_schema = input_dataset.read_schema()
    partitions = input_dataset.list_partitions(raise_if_empty=False)
    if partitions not in [[], [u'NP']]:
        raise Exception(
            "Due to the current APIs, this plugin cannot support partitioned input "
            "(and it seems the input dataset " + input_name + " is partitioned). "
            "A workaround is to first run a sync recipe "
            "from " + input_name + " into a non partitioned dataset, "
            "then take the latter as input for tde export.")
    output_name = get_output_names_for_role('output_folder')[0]
    output_folder = dataiku.Folder(output_name)
    output_path = output_folder.get_path()
    os.chdir(output_path)
    # Clean output dir. We assume there is no subfolder
    # (because this recipe never creates one; if there is, better to fail
    # than remove someone else's data).
    for file in os.listdir(output_path):
        os.remove(file)
    ExtractAPI.initialize()
    with Extract(output_filename()) as extract:
        assert (not extract.hasTable('Extract'))
        tableDef = makeTableDefinition(input_schema)
        table = extract.addTable('Extract', tableDef)
        insertData(input_dataset, table)
        extract.close()
    ExtractAPI.cleanup()
    print "End export to TDE"

def get_output_folder(config, client, project_key):
    output_managed_id = config.get('output_managed_folder', None)
    output_new_folder_name = config.get('output_new_folder_name', None)
    project = client.get_project(project_key)
    if output_managed_id == "create_new_folder":
        if output_new_folder_name:
            project_managed_folders = project.list_managed_folders()
            managed_folders = {
                mf["name"]: mf["id"] for mf in project_managed_folders
            }
            if output_new_folder_name in managed_folders:
                output_folder_dss = project.get_managed_folder(
                    managed_folders[output_new_folder_name])
                # raise ValueError("The managed folder '{}' already exists. Please rename it.".format(output_new_folder_name))
            else:
                output_folder_dss = project.create_managed_folder(
                    output_new_folder_name)
        else:
            raise ValueError("The name for the output folder is missing.")
    elif output_managed_id:
        output_folder_dss = project.get_managed_folder(output_managed_id)
    else:
        raise ValueError(
            "The output folder parameter is missing. Create or select one "
            "from the settings of the web app.")
    output_folder = dataiku.Folder(output_folder_dss.get_definition()['name'],
                                   project_key=project_key)
    return output_folder

def get_tokens(x_train, maxlen):
    tok = None
    handle = dataiku.Folder("glove")
    tokenizerPath = handle.get_path()
    try:
        print(" [INFO] :: LOADING TOKENIZER.")
        with open(os.path.join(tokenizerPath, "tokenizer_001.pickle"), 'rb') as f:
            tok = pickle.load(f)
    except Exception:
        print(" [INFO] :: TOKENIZER NOT FOUND...")
        tok = Tokenizer()
        print(" [INFO] :: FITTING NEW TOKENIZER...")
        tok.fit_on_texts(list(x_train))
        print(" [INFO] :: SAVING TOKENIZER...")
        # saving tokenizer for future use
        with open(os.path.join(tokenizerPath, "tokenizer_001.pickle"), 'wb') as f:
            pickle.dump(tok, f, protocol=pickle.HIGHEST_PROTOCOL)
    vocab_size = len(tok.word_index) + 1
    x_train = tok.texts_to_sequences(x_train)
    x_train = pad_sequences(x_train, maxlen=maxlen, padding='post')
    return x_train, vocab_size, tok

def copy_plugin_to_dss_folder(plugin_id, root_path, folder_id, project_key,
                              force_copy=False):
    """
    Copy python-lib from a plugin to a managed folder
    """
    plugin_path = get_plugin_path(root_path, plugin_id)
    plugin_lib_path = os.path.join(plugin_path, 'python-lib')
    folder_path = dataiku.Folder(folder_id, project_key=project_key).get_path()
    lib_folder_path = os.path.join(folder_path, 'python-lib')
    if os.path.isdir(lib_folder_path) and force_copy:
        shutil.rmtree(lib_folder_path)
    if not os.path.isdir(lib_folder_path):
        os.mkdir(lib_folder_path)
        sys.path.append(lib_folder_path)
        for item in os.listdir(plugin_lib_path):
            s = os.path.join(plugin_lib_path, item)
            d = os.path.join(lib_folder_path, item)
            if os.path.isdir(s):
                shutil.copytree(s, d, symlinks=False, ignore=None)
            else:
                shutil.copy2(s, d)
    else:
        logger.info('python-lib already exists in folder')

def get_folder_parameters(folder_ref: str, filename: str):
    """Extracts sample sizes from the managed folder

    :param str filename: name of the json containing the experiment parameters
    :raises: :class:`ValueError`: Missing folder or filename.
    :returns: sample sizes
    :rtype: tuple
    """
    folder = dataiku.Folder(folder_ref)
    paths = folder.list_paths_in_partition()
    if len(paths) == 0:
        raise ValueError("The input folder is empty")
    if not filename:
        raise ValueError(
            "The parameters' filename is missing. It should point to a json file "
            "created from the 'AB testing design' web app, a component of the same plugin.")
    if filename not in paths:
        raise ValueError(
            "The parameters file is not in the managed folder. It should be a json file "
            "created from the 'AB testing design' web app, a component of the same plugin.")
    tracking = folder.read_json(filename)
    size_A = int(tracking[Parameters.SIZE_A.value])
    size_B = int(tracking[Parameters.SIZE_B.value])
    return size_A, size_B

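# A minimal usage sketch for get_folder_parameters, assuming a managed folder
# referenced as "params_folder" that holds a "parameters.json" written by the
# 'AB testing design' web app (both names are hypothetical; the exact path
# format depends on how the folder lists its contents):
size_A, size_B = get_folder_parameters("params_folder", "parameters.json")
print("Group sizes: A={}, B={}".format(size_A, size_B))
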
def get_glove(tok, vocab_size, dim_length):
    print(" [INFO] :: Loading GLOVE...")
    handle = dataiku.Folder("glove")
    embeddingPath = handle.get_path()
    embeddings_index = dict()
    try:
        with open(os.path.join(embeddingPath, "glove.42B.300d.txt"), 'r',
                  errors='ignore', encoding='utf8') as f:
            for line in f:
                values = line.split()
                # the last 300 tokens are the vector; everything before is the word
                word = ''.join(values[:-300])
                coefs = np.asarray(values[-300:], dtype='float32')
                embeddings_index[word] = coefs
    except Exception as e:
        print(' [ERROR] :: GLOVE path not found...', e)
        exit(1)
    print(' [INFO] :: Loaded %s word vectors.' % len(embeddings_index))
    # create a weight matrix for words in training docs
    embedding_matrix = np.zeros((vocab_size, dim_length))
    for word, i in tok.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    # free memory
    del embeddings_index
    return embedding_matrix

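# A minimal sketch of wiring the returned matrix into a Keras Embedding layer,
# assuming the 300-dimensional GloVe vectors loaded above; maxlen=100 is a
# hypothetical value, not from the original source:
from keras.layers import Embedding

embedding_matrix = get_glove(tok, vocab_size, dim_length=300)
embedding_layer = Embedding(vocab_size, 300,
                            weights=[embedding_matrix],
                            input_length=100,
                            trainable=False)  # keep the pre-trained vectors frozen
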
def validate_input_params(self) -> Dict:
    """Validate input parameters"""
    input_params = {}
    input_folder_names = get_input_names_for_role("input_folder")
    if len(input_folder_names) == 0:
        raise PluginParamValidationError("Please specify input folder")
    input_params["input_folder"] = dataiku.Folder(input_folder_names[0])
    if self.recipe_id == RecipeID.DOCUMENT_TEXT_DETECTION:
        file_extensions = GoogleCloudVisionAPIWrapper.SUPPORTED_DOCUMENT_FORMATS
        self.batch_support = True
    else:
        file_extensions = GoogleCloudVisionAPIWrapper.SUPPORTED_IMAGE_FORMATS
    input_params["input_df"] = generate_path_df(
        folder=input_params["input_folder"],
        file_extensions=file_extensions,
        path_column=PATH_COLUMN)
    input_folder_type = input_params["input_folder"].get_info().get("type", "")
    input_params["input_folder_is_gcs"] = input_folder_type == "GCS"
    if input_params["input_folder_is_gcs"]:
        self.batch_support = True
        input_folder_access_info = input_params["input_folder"].get_info().get(
            "accessInfo", {})
        input_params["input_folder_bucket"] = input_folder_access_info.get("bucket")
        input_params["input_folder_root_path"] = str(
            input_folder_access_info.get("root", ""))[1:]
        logging.info("Input folder is stored on GCS, enabling Batch API feature")
    else:
        logging.info(
            f"Input folder is not stored on GCS ({input_folder_type}), "
            "disabling Batch API feature")
    return input_params

def validate_input_params(self) -> Dict:
    """Validate input parameters"""
    input_params_dict = {}
    input_folder_names = get_input_names_for_role("input_folder")
    if len(input_folder_names) == 0:
        raise PluginParamValidationError("Please specify input folder")
    input_params_dict["input_folder"] = dataiku.Folder(input_folder_names[0])
    image_path_list = [
        p for p in generate_path_list(input_params_dict["input_folder"])
        if p.split(".")[-1].lower() in {"jpeg", "jpg", "png"}
    ]
    if len(image_path_list) == 0:
        raise PluginParamValidationError(
            "No images of supported format (PNG or JPG) were found in input folder")
    input_params_dict["input_df"] = pd.DataFrame(image_path_list,
                                                 columns=[IMAGE_PATH_COLUMN])
    input_params_dict["input_folder_is_s3"] = \
        input_params_dict["input_folder"].get_info().get("type", "") == "S3"
    if input_params_dict["input_folder_is_s3"]:
        input_folder_access_info = \
            input_params_dict["input_folder"].get_info().get("accessInfo", {})
        input_params_dict["input_folder_bucket"] = input_folder_access_info.get("bucket")
        input_params_dict["input_folder_root_path"] = str(
            input_folder_access_info.get("root", ""))[1:]
        logging.info(
            "Input folder is on Amazon S3 with bucket: {} and root path: {}".format(
                input_params_dict["input_folder_bucket"],
                input_params_dict["input_folder_root_path"]))
    return input_params_dict

def validate_output_params(self) -> Dict:
    """Validate output parameters"""
    output_params = {}
    # Output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify output dataset")
    output_params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])
    # Output folder
    output_folder_names = get_output_names_for_role("output_folder")
    if self.recipe_id == RecipeID.DOCUMENT_TEXT_DETECTION or self.recipe_id == RecipeID.CROPPING:
        if len(output_folder_names) == 0:
            raise PluginParamValidationError("Please specify output folder")
        output_params["output_folder"] = dataiku.Folder(output_folder_names[0])
        output_folder_type = output_params["output_folder"].get_info().get("type", "")
        output_params["output_folder_is_gcs"] = output_folder_type == "GCS"
        if output_params["output_folder_is_gcs"]:
            output_folder_access_info = \
                output_params["output_folder"].get_info().get("accessInfo", {})
            output_params["output_folder_bucket"] = output_folder_access_info.get("bucket")
            output_params["output_folder_root_path"] = str(
                output_folder_access_info.get("root", ""))[1:]
            logging.info("Output folder is stored on GCS")
        else:
            logging.info(f"Output folder is stored on {output_folder_type}")
    return output_params

def save_model(rfc_models, dss_folder_id, model_name):
    """
    Save models in a DSS folder

    Parameters
    ----------
    rfc_models : dict where keys are class labels and values are trained models
    dss_folder_id : id of the folder to save the models into
    model_name : name to save the model file under

    Returns
    -------
    None; prints the path of the saved, compressed model file
    """
    models = dataiku.Folder(dss_folder_id)
    path_to_folder = models.get_path()
    current_time = datetime.today().strftime('%Y-%m-%d_%H-%M-%S')
    path_to_save_model = os.path.join(path_to_folder,
                                      current_time + '_' + model_name)
    path_to_save_compressed_model = path_to_save_model + '_compressed_v01'
    model_save_path = joblib.dump(rfc_models,
                                  path_to_save_compressed_model,
                                  compress=True)
    print(model_save_path)

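# A minimal usage sketch for save_model, assuming a managed folder with id
# "models" and toy training data (all names and data are hypothetical, not
# from the original source):
from sklearn.ensemble import RandomForestClassifier

X, y = [[0, 1], [1, 0], [1, 1]], [0, 1, 1]
rfc_models = {"class_a": RandomForestClassifier(n_estimators=10).fit(X, y)}
save_model(rfc_models, "models", "rfc")
# prints the path of the timestamped, compressed file, which can later be
# reloaded with joblib.load()
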
def do(payload, config, plugin_config, inputs):
    if "method" not in payload:
        return {}
    client = dataiku.api_client()
    if payload["method"] == "get-valid-csv-filenames":
        required_columns = ["id", "className"]
        sep = ","
        # Retrieving model folder
        model_folder_full_name = [
            inp for inp in inputs if inp["role"] == "modelFolder"
        ][0]["fullName"]
        model_folder = dataiku.Folder(model_folder_full_name).get_path()
        csv_files_root_mf = glob.glob(model_folder + "/*.csv")
        # Filtering out files without required columns
        csv_valid_filenames = []
        for f in csv_files_root_mf:
            schema = retrieve_schema_from_pandas_compatible_csv_file(f, sep)
            if len([col for col in required_columns if col not in schema]) == 0:
                valid_file = {"path": f, "name": os.path.basename(f)}
                csv_valid_filenames.append(valid_file)
        return {"csv_valid_filenames": csv_valid_filenames}

def copy_plugin_to_dss_folder(plugin_id, folder_id, project_key,
                              force_copy=False):
    """
    Copy python-lib from a plugin to a managed folder
    """
    root_path = dataiku.get_custom_variables(project_key=project_key)['dip.home']
    # TODO change this to plugins/installed/...
    plugin_lib_path = os.path.join(root_path, 'plugins', 'installed', plugin_id,
                                   'python-lib')
    folder_path = dataiku.Folder(folder_id, project_key=project_key).get_path()
    lib_folder_path = os.path.join(folder_path, 'python-lib')
    if os.path.isdir(lib_folder_path) and force_copy:
        shutil.rmtree(lib_folder_path)
    if not os.path.isdir(lib_folder_path):
        os.mkdir(lib_folder_path)
        sys.path.append(lib_folder_path)
        for item in os.listdir(plugin_lib_path):
            s = os.path.join(plugin_lib_path, item)
            d = os.path.join(lib_folder_path, item)
            if os.path.isdir(s):
                shutil.copytree(s, d, symlinks=False, ignore=None)
            else:
                shutil.copy2(s, d)
    else:
        logger.info('python-lib already exists in folder')

def load_models(territory_filename, folder):
    print(territory_filename)
    model_folder = dataiku.Folder(folder)
    folder_info = model_folder.get_info()
    path = model_folder.get_path()
    xgb_filename = os.path.join(
        path, '{}_xgb_model.joblib.compressed'.format(territory_filename))
    xgb_model = joblib.load(xgb_filename)
    return xgb_model

def _get_managed_folder(self):
    if self.managed_folder is None:
        # Hack to access a managed folder containing files without declaring
        # it as an input of the recipe, when used in the context of a recipe
        # (Scoring, Evaluation)
        self.managed_folder = dataiku.Folder(self.managed_folder_id,
                                             ignore_flow=True)
    return self.managed_folder

def get_input_output():
    if len(get_input_names_for_role("input_dataset")) == 0:
        raise ValueError("No input dataset.")
    input_dataset_name = get_input_names_for_role("input_dataset")[0]
    input_dataset = dataiku.Dataset(input_dataset_name)
    output_folder_name = get_output_names_for_role("output_folder")[0]
    output_folder = dataiku.Folder(output_folder_name)
    return (input_dataset, output_folder)

def get_dimension_value_from_flow_variables(dku_flow_variables, input_id, dimension):
    if input_id:
        # input folder: there can be multiple read partitions
        dimension_value = dku_flow_variables.get(f"DKU_SRC_{input_id}_{dimension}")
        dimension_values = dku_flow_variables.get(
            f"DKU_SRC_{input_id}_{dimension}_VALUES")
        if not dimension_value and dimension_values:
            check_only_one_read_partition(dimension_values,
                                          dataiku.Folder(input_id))
    else:
        # output folder: there can be only one write partition
        dimension_value = dku_flow_variables.get(f"DKU_DST_{dimension}")
    return dimension_value

def save_models(rfc_models, dss_folder_id, model_name):
    models = dataiku.Folder(dss_folder_id)
    path_to_folder = models.get_path()
    current_time = dt.today().strftime('%Y-%m-%d_%H-%M-%S')
    path_to_save_model = os.path.join(path_to_folder,
                                      current_time + '_' + model_name)
    path_to_save_compressed_model = path_to_save_model + '_compressed_v01'
    model_save_path = joblib.dump(rfc_models,
                                  path_to_save_compressed_model,
                                  compress=True)
    print(model_save_path)

def run(self, progress_callback):
    # Retrieving parameters
    output_folder_name = self.config.get('outputName', '')

    # Creating new Managed Folder if needed
    project = self.client.get_project(self.project_key)
    output_folder_found = False
    for folder in project.list_managed_folders():
        if output_folder_name == folder['name']:
            output_folder = project.get_managed_folder(folder['id'])
            output_folder_found = True
            break
    if not output_folder_found:
        if output_folder_name == '':
            raise ValueError('The model folder name is empty, please name the folder')
        output_folder = project.create_managed_folder(output_folder_name)

    output_folder_path = dataiku.Folder(
        output_folder.get_definition()["id"],
        project_key=self.project_key).get_path()
    os.chmod(output_folder_path, 0o755)
    for root, dirs, files in os.walk(output_folder_path):
        for momo in dirs:
            # directories need the execute bit to remain traversable
            os.chmod(os.path.join(root, momo), 0o755)
        for momo in files:
            os.chmod(os.path.join(root, momo), 0o644)

    def download_weights(folder, weights_url, name):
        r = requests.get(weights_url, allow_redirects=True)
        folder.put_file(name, r.content)

    model_urls = [
        'https://github.com/matterport/Mask_RCNN/releases/download/v2.0/mask_rcnn_coco.h5',
        'https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
    ]
    for url in model_urls:
        name = url.split('/')[-1]
        download_weights(output_folder, url, name)
    return "<span>DONE</span>"

def train_agent(agent):
    # Get the parameters (the common ones)
    environment_var = get_recipe_config()['environment']
    agent_var = get_recipe_config()['agent']
    policy_var = get_recipe_config()['policy']
    gamma_var = get_recipe_config()['gamma']
    lr_var = get_recipe_config()['dqn_learning_rate']
    training_episodes_var = 5000

    # Create the JSON file and dump it into the output folder
    training_infos = {
        'name': environment_var,
        'agent': agent_var,
        'type': 'OpenAI Gym',
        'num_episodes': training_episodes_var,
        'lr': lr_var,
        'gamma': gamma_var,
        'policy': policy_var,
        'training_date': str(datetime.datetime.now())
    }
    saved_models = dataiku.Folder(get_output_names_for_role('main_output')[0])
    saved_models_info = saved_models.get_info()
    saved_models_path = saved_models.get_path()
    with open(saved_models_path + '/training_infos.json', 'w') as fp:
        json.dump(training_infos, fp)

    # Choose the agent
    if agent == "dqn":
        from stable_baselines.common.vec_env import DummyVecEnv
        from stable_baselines.deepq.policies import MlpPolicy
        from stable_baselines.deepq.policies import CnnPolicy
        from stable_baselines import DQN

        model = DQN(policy=policy_var,
                    env=environment_var,
                    gamma=gamma_var,
                    learning_rate=lr_var)

        # Start the training and dump the model into the output folder
        print("========================== Start Training ==========================")
        model.learn(training_episodes_var)  # stable_baselines counts total timesteps here
        model_name = agent_var + "_" + environment_var
        print("Model Saved")
        model.save(saved_models_path + "/" + model_name)

def dataframe_to_xlsx(input_dataframe, folder_name, output_file_name):
    folder = dataiku.Folder(folder_name)
    folder_infos = folder.get_info()
    if folder_infos["type"] == "S3":
        # No local filesystem path for an S3 folder: write the dataframe as a
        # pickle through the folder's writer instead of an xlsx file
        pickle_bytes = io.BytesIO()
        pickle.dump(input_dataframe, pickle_bytes)
        with folder.get_writer("input_dataframe.p") as w:
            w.write(pickle_bytes.getvalue())
    else:
        folder_path = folder.get_path()
        folder_path = folder_path + '/' + output_file_name + '.xlsx'
        writer = pd.ExcelWriter(folder_path, engine='openpyxl')
        input_dataframe.to_excel(writer, index=False, encoding='utf-8')
        writer.save()

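# A minimal usage sketch for dataframe_to_xlsx, assuming a local (non-S3)
# managed folder named "exports" (hypothetical, not from the original source):
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
dataframe_to_xlsx(df, "exports", "my_report")  # writes my_report.xlsx into the folder
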
def load_data(input_dataset):
    # DSS entities loading
    logging.info("Reading samples from {0}".format(input_dataset))
    input_is_folder = False
    try:
        input_df = pd.DataFrame(dataiku.Dataset(input_dataset).get_dataframe())
        logging.info("{0} input is a dataset".format(input_dataset))
    except Exception as e:
        logging.info("{0} input is a folder".format(input_dataset))
        input_samples = dataiku.Folder(input_dataset)
        input_df = pd.DataFrame(input_samples.list_paths_in_partition(),
                                columns=["path"])
        input_is_folder = True
    return input_df, input_is_folder

def load_predict_config():
    """Utility function to load, resolve and validate all predict recipe config
    into a clean `params` dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}
    recipe_config = get_recipe_config()

    # model folder
    model_folder = dataiku.Folder(get_input_names_for_role("model_folder")[0])
    params["model_folder"] = model_folder
    params["partition_root"] = get_folder_partition_root(params["model_folder"], is_input=True)

    params["external_features_future_dataset"] = None
    external_features_future_dataset_names = get_input_names_for_role("external_features_future_dataset")
    if len(external_features_future_dataset_names) > 0:
        params["external_features_future_dataset"] = dataiku.Dataset(external_features_future_dataset_names[0])

    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify Forecast dataset in the 'Input / Output' tab of the recipe")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    check_only_one_read_partition(params["partition_root"], params["model_folder"])
    check_only_one_read_partition(params["partition_root"], params["external_features_future_dataset"])

    params["manual_selection"] = recipe_config.get("model_selection_mode") == "manual"
    params["performance_metric"] = recipe_config.get("performance_metric")
    params["selected_session"] = recipe_config.get("manually_selected_session", "latest_session")
    params["selected_model_label"] = recipe_config.get("manually_selected_model_label")
    params["prediction_length"] = recipe_config.get("prediction_length", -1)
    params["confidence_interval"] = recipe_config.get("confidence_interval", 95)
    params["quantiles"] = convert_confidence_interval_to_quantiles(params["confidence_interval"])
    params["include_history"] = recipe_config.get("include_history", False)

    params["sampling_method"] = recipe_config.get("sampling_method", "last_records")
    params["history_length_limit"] = None
    if params["sampling_method"] == "last_records":
        params["history_length_limit"] = recipe_config.get("number_records", 1000)
        if params["history_length_limit"] < 1:
            raise PluginParamValidationError("Number of historical records must be higher than 1")

    printable_params = {param: value for param, value in params.items()
                        if "dataset" not in param and "folder" not in param}
    logger.info(f"Recipe parameters: {printable_params}")
    return params

def validate_output_params(self) -> Dict:
    """Validate output parameters"""
    output_params_dict = {}
    # Mandatory output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify output dataset")
    output_params_dict["output_dataset"] = dataiku.Dataset(output_dataset_names[0])
    # Optional output folder
    output_folder_names = get_output_names_for_role("output_folder")
    output_params_dict["output_folder"] = None
    if len(output_folder_names) != 0:
        output_params_dict["output_folder"] = dataiku.Folder(output_folder_names[0])
    return output_params_dict

def __get_logs_path(self):
    # Retrieve the path of the managed folder holding the model
    folder_found = False
    project = self.client.get_project(self.project_key)
    for folder in project.list_managed_folders():
        if self.folder_name == folder['name']:
            folder_path = dataiku.Folder(
                folder['id'], project_key=self.project_key).get_path()
            folder_found = True
            break
    if not folder_found:
        raise DataikuException(
            "The folder '{}' (in project '{}') cannot be found".format(
                self.folder_name, self.project_key))
    log_path = os.path.join(folder_path, constants.TENSORBOARD_LOGS)
    return log_path

def build_model(input_shapes, n_classes=None):
    #### DEFINING INPUT AND BASE ARCHITECTURE
    # You need to modify the name and shape of the "image_input"
    # according to the preprocessing and name of your initial feature.
    # This feature should be preprocessed as an "Image", with a
    # custom preprocessing.
    image_shape = (197, 197, 3)
    image_input_name = "name_of_your_image_input_preprocessed"
    image_input = Input(shape=image_shape, name=image_input_name)
    base_model = VGG16(include_top=False, weights=None, input_tensor=image_input)

    #### LOADING WEIGHTS OF PRE-TRAINED MODEL
    # To leverage this architecture, it is better to use weights
    # computed on a previous training on a large dataset (ImageNet).
    # To do so, you need to download the file containing the weights
    # and load them into your model.
    # You can do it by using the macro "Download pre-trained model"
    # of the "Deep Learning image" plugin (CPU or GPU version depending
    # on your setup) available in the plugin store. For this architecture,
    # you need to select:
    #     "VGG16 trained on Imagenet"
    # This will download the weights and put them into a managed folder.
    folder = dataiku.Folder("name_of_folder_containing_vgg16_weights")
    weights_path = "vgg16_imagenet_weights_notop.h5"
    base_model.load_weights(os.path.join(folder.get_path(), weights_path),
                            by_name=True, skip_mismatch=True)

    #### ADDING FULLY CONNECTED CLASSIFICATION LAYER
    x = base_model.layers[-1].output
    x = Flatten()(x)
    predictions = Dense(n_classes, activation="softmax")(x)

    model = Model(inputs=base_model.input, outputs=predictions)
    return model

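# A minimal sketch of compiling the model returned by build_model for training;
# the optimizer, loss and class count are hypothetical choices, not from the
# original source:
model = build_model(input_shapes=None, n_classes=10)
model.compile(optimizer="adam",
              loss="categorical_crossentropy",
              metrics=["accuracy"])
model.summary()
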
def do(payload, config, plugin_config, inputs):
    """
    Retrieve a list of past training session timestamps
    and the label of all the trained models.
    """
    choices = []
    input_folder_name = [
        input["fullName"] for input in inputs
        if input["type"] == "MANAGED_FOLDER"
    ][0]
    input_folder = dataiku.Folder(input_folder_name)
    sessions = []
    for child in input_folder.get_path_details(path="/")["children"]:
        if re.match(TIMESTAMP_REGEX_PATTERN, child["name"]):
            sessions += [child["name"]]
    if payload.get("parameterName") == "manually_selected_session":
        choices = [{"label": "Latest available", "value": "latest_session"}]
        if len(sessions) > 0:  # not a partitioned folder
            for i, session in enumerate(sorted(sessions, reverse=True)):
                choices += [{"label": session, "value": session}]
    elif payload.get("parameterName") == "manually_selected_model_label":
        all_paths = input_folder.list_paths_in_partition()
        for model_label in list_available_models_labels():
            for path in all_paths:
                if bool(re.search(f"({model_label})(/model.pk.gz)", path)):
                    choices += [{"label": model_label, "value": model_label}]
                    break
    elif payload.get("parameterName") == "model_selection_mode":
        choices = [{
            "label": "Automatic",
            "value": "auto"
        }, {
            "label": "Manual",
            "value": "manual"
        }]
    return {"choices": choices}

def get_avg_side(inputs, n_first=3000):
    """Min side is first quartile, max side is third quartile."""
    image_folder_full_name = get_input_name_from_role(inputs, 'images')
    image_folder = dataiku.Folder(image_folder_full_name)
    folder_path = image_folder.get_path()
    paths = image_folder.list_paths_in_partition()[:n_first]
    sides = []
    for path in paths:
        path = os.path.join(folder_path, path[1:])  # strip the leading '/'
        # PIL does not load the raster data at this point, so it's fast.
        with Image.open(path) as img:
            w, h = img.size
            sides.append(w)
            sides.append(h)
    sides = np.array(sides)
    return {
        'min_side': int(np.percentile(sides, 25)),
        'max_side': int(np.percentile(sides, 75))
    }

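# A minimal usage sketch for get_avg_side, assuming the `inputs` list passed to
# a plugin backend `do()` function (as in the helpers above):
sides = get_avg_side(inputs, n_first=1000)
print("Suggested resize range: {min_side}px - {max_side}px".format(**sides))
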
def train(model, x, y, batch_size, epochs):
    # save trained model checkpoints
    handle = dataiku.Folder("glove")
    checkpointPath = handle.get_path()
    # checkpoint = ModelCheckpoint(os.path.join(checkpointPath, 'weights.{epoch:03d}-{val_acc:.4f}.hdf5'),
    #                              monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
    # tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    # Train Model
    print(" [INFO] :: TRAINING MODEL...")
    # model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1,
    #           callbacks=[checkpoint, tensorboard], validation_split=0.2)
    model.fit(x, y, batch_size=batch_size, epochs=epochs, verbose=1,
              validation_split=0.2)
    model.save(os.path.join(checkpointPath, 'my_model.h5'))

def do(payload, config, plugin_config, inputs):
    inputDataSets = []
    connection = {}
    project = ''
    inputFolderLocation = None
    for input in inputs:
        if input.get('role') == 'main':
            inputtablename = input['fullName'].split('.')[1]
            project = input['fullName'].split('.')[0]
            inputDataSets.append(inputtablename)
            if not connection:
                inputdataset = dataiku.Dataset(inputtablename)
                connection = getConnectionParamsFromDataset(inputdataset).get(
                    'connectionParams', {})
        else:
            inputfoldername = input['fullName'].split('.')[1]
            inputFolderLocation = dataiku.Folder(inputfoldername)
    folderpath = inputFolderLocation.get_path() if inputFolderLocation else ''
    fileList = os.listdir(folderpath) if folderpath else []
    DATA_DIR = os.environ["DIP_HOME"]
    PYNBDIR = "config/ipython_notebooks/"
    pypath = os.path.join(DATA_DIR, PYNBDIR, project)
    pynbList = [f for f in os.listdir(pypath) if not f.startswith('.')] \
        if os.path.exists(pypath) else []
    return {
        'inputfolder': folderpath,
        'fileList': fileList,
        'nbList': pynbList,
        'connection': connection,
        'inputDataSets': inputDataSets,
        'inputs': inputs
    }

def run(self, progress_callback):
    # Retrieving parameters
    output_folder_name = self.config['folder_name']
    model = self.config['model']
    architecture, trained_on = model.split('_')

    # Creating new Managed Folder if needed
    project = self.client.get_project(self.project_key)
    for folder in project.list_managed_folders():
        if output_folder_name == folder['name']:
            output_folder = project.get_managed_folder(folder['id'])
            break
    else:
        output_folder = project.create_managed_folder(output_folder_name)
    output_folder_path = dataiku.Folder(
        output_folder.get_definition()["id"],
        project_key=self.project_key).get_path()

    # Building config file
    config = {"architecture": architecture, "trained_on": trained_on}
    dl_utils.download_labels(trained_on,
                             op.join(output_folder_path, constants.LABELS_FILE))
    # Download weights from s3 (dataiku-labs-public).
    dl_utils.download_model(architecture, trained_on,
                            op.join(output_folder_path, constants.WEIGHTS_FILE),
                            progress_callback)
    output_folder.put_file(constants.CONFIG_FILE, json.dumps(config))
    return "<span>DONE</span>"