def tde_export():
    print "Start export to TDE"
    input_name = get_input_names_for_role('input')[0]
    input_dataset = dataiku.Dataset(input_name)
    input_schema = input_dataset.read_schema()
    partitions = input_dataset.list_partitions(raise_if_empty=False)
    if partitions not in [[], [u'NP']]:
        raise Exception(
            "Due to the current APIs, this plugin cannot support partitioned input "
            "(and it seems the input dataset " + input_name +
            " is partitioned). "
            "A workaround is to first run a sync recipe "
            "from " + input_name + " into a non partitioned dataset, "
            "then take the latter as input for tde export.")
    output_name = get_output_names_for_role('output_folder')[0]
    output_folder = dataiku.Folder(output_name)
    output_path = output_folder.get_path()

    os.chdir(output_path)

    # Clean output dir. We assume there is no subfolder.
    # (because this recipe never creates one. If there is, better fail than remove someone else's data)
    for f in os.listdir(output_path):
        os.remove(os.path.join(output_path, f))

    ExtractAPI.initialize()

    with Extract(output_filename()) as extract:
        assert (not extract.hasTable('Extract'))
        tableDef = makeTableDefinition(input_schema)
        table = extract.addTable('Extract', tableDef)
        insertData(input_dataset, table)
        extract.close()
        ExtractAPI.cleanup()
    print "End export to TDE"
Example 2
def get_output_folder(config, client, project_key):
    output_managed_id = config.get('output_managed_folder', None)
    output_new_folder_name = config.get('output_new_folder_name', None)
    project = client.get_project(project_key)
    if output_managed_id == "create_new_folder":
        if output_new_folder_name:
            project_managed_folders = client.get_project(
                project_key).list_managed_folders()
            managed_folders = {
                mf["name"]: mf["id"]
                for mf in project_managed_folders
            }
            if output_new_folder_name in managed_folders:
                output_folder_dss = project.get_managed_folder(
                    managed_folders[output_new_folder_name])
                #raise ValueError("The managed folder '{}' already exists. Please rename it.".format(output_new_folder_name))
            else:
                output_folder_dss = project.create_managed_folder(
                    output_new_folder_name)
        else:
            raise ValueError("The name for the input folder is missing.")
    elif output_managed_id:
        output_folder_dss = project.get_managed_folder(output_managed_id)
    else:
        raise ValueError(
            "The output folder parameter is missing. Create or select one from the setting of the web app"
        )
    output_folder = dataiku.Folder(output_folder_dss.get_definition()['name'],
                                   project_key=project_key)
    return output_folder
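For reference, a minimal call of the helper above; the config keys mirror the ones it reads, and the folder name is hypothetical.

import dataiku

client = dataiku.api_client()
project_key = dataiku.default_project_key()
config = {
    "output_managed_folder": "create_new_folder",
    "output_new_folder_name": "exports",  # hypothetical folder name
}
output_folder = get_output_folder(config, client, project_key)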
Example 3
def get_tokens(x_train, maxlen):
    tok = None
    handle = dataiku.Folder("glove")
    tokenizerPath = handle.get_path()

    try:
        print(" [INFO] :: LOADING TOKENIZER.")
        with open(os.path.join(tokenizerPath, "tokenizer_001.pickle"),
                  'rb') as f:
            tok = pickle.load(f)
    except Exception:  # tokenizer file missing or unreadable: fit a new one
        print(" [INFO] :: TOKENIZER NOT FOUND...")
        tok = Tokenizer()
        print(" [INFO] :: FITTING NEW TOKENIZER...")
        tok.fit_on_texts(list(x_train))
        print(" [INFO] :: SAVING TOKENIZER...")

        # saving tokenizer for future use
        with open(os.path.join(tokenizerPath, "tokenizer_001.pickle"),
                  'wb') as f:
            pickle.dump(tok, f, protocol=pickle.HIGHEST_PROTOCOL)

    vocab_size = len(tok.word_index) + 1
    x_train = tok.texts_to_sequences(x_train)
    x_train = pad_sequences(x_train, maxlen=maxlen, padding='post')

    return x_train, vocab_size, tok
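A minimal usage sketch for get_tokens, assuming train_texts is a list of raw strings and that Tokenizer / pad_sequences from Keras are imported as in the snippet above.

# Hypothetical call: tokenize and pad a small list of training texts.
train_texts = ["first training document", "second training document"]
x_train_padded, vocab_size, tok = get_tokens(train_texts, maxlen=200)
print(x_train_padded.shape)  # (2, 200)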
def copy_plugin_to_dss_folder(plugin_id,
                              root_path,
                              folder_id,
                              project_key,
                              force_copy=False):
    """
    Copy python-lib from a plugin to a managed folder
    """
    plugin_path = get_plugin_path(root_path, plugin_id)
    plugin_lib_path = os.path.join(plugin_path, 'python-lib')

    folder_path = dataiku.Folder(folder_id, project_key=project_key).get_path()
    lib_folder_path = os.path.join(folder_path, 'python-lib')

    if os.path.isdir(lib_folder_path) and force_copy:
        shutil.rmtree(lib_folder_path)

    if not os.path.isdir(lib_folder_path):
        os.mkdir(lib_folder_path)
        sys.path.append(lib_folder_path)

        for item in os.listdir(plugin_lib_path):
            s = os.path.join(plugin_lib_path, item)
            d = os.path.join(lib_folder_path, item)
            if os.path.isdir(s):
                shutil.copytree(s, d, symlinks=False, ignore=None)
            else:
                shutil.copy2(s, d)
    else:
        logger.info('python-lib already exists in folder')
Example 5
def get_folder_parameters(folder_ref: str, filename: str):
    """Extracts sample sizes from the managed folder

    :param str folder_ref: reference (name or id) of the managed folder holding the parameters file
    :param str filename: name of the json containing the experiment parameters
    :raises: :class:`ValueError`: Missing folder or filename.

    :returns: sample sizes
    :rtype: tuple
    """
    folder = dataiku.Folder(folder_ref)
    paths = folder.list_paths_in_partition()
    if len(paths) == 0:
        raise ValueError("The input folder is empty")
    else:
        if filename:
            if filename in paths:
                tracking = folder.read_json(filename)
                size_A = int(tracking[Parameters.SIZE_A.value])
                size_B = int(tracking[Parameters.SIZE_B.value])
                return size_A, size_B
            else:
                raise ValueError(
                    "The parameter's file is not in the managed folder. It should be a json file created from the Web App 'AB testing design'. This web app is a component of the same plugin"
                )
        else:
            raise ValueError(
                "The parameters' filename is missing. It should point to a json file created from the Web App 'AB testing design'. This web app is a component of the same plugin"
            )
Example 6
def get_glove(tok, vocab_size, dim_length):
    print(" [INFO] :: Loading GLOVE...")
    handle = dataiku.Folder("glove")
    embeddingPath = handle.get_path()
    embeddings_index = dict()

    try:
        with open(os.path.join(embeddingPath, "glove.42B.300d.txt"),
                  'r',
                  errors='ignore',
                  encoding='utf8') as f:
            for line in f:
                values = line.split()
                word = ''.join(values[:-300])
                coefs = np.asarray(values[-300:], dtype='float32')
                embeddings_index[word] = coefs

    except Exception as e:
        print(' [ERROR] :: GLOVE path not found...', e)
        exit(1)

    print(' [INFO] :: Loaded %s word vectors.' % len(embeddings_index))

    # create a weight matrix for words in training docs
    embedding_matrix = zeros((vocab_size, dim_length))
    for word, i in tok.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # free memory
    del embeddings_index

    return embedding_matrix
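The matrix returned by get_glove is typically plugged into a frozen Keras Embedding layer; a minimal sketch, assuming 300-dimensional GloVe vectors and the same maxlen used for padding in get_tokens.

# Hypothetical downstream use of the embedding matrix.
from keras.layers import Embedding

embedding_matrix = get_glove(tok, vocab_size, dim_length=300)
embedding_layer = Embedding(vocab_size, 300,
                            weights=[embedding_matrix],
                            input_length=maxlen,  # same maxlen as in get_tokens
                            trainable=False)      # keep the pre-trained vectors frozen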
 def validate_input_params(self) -> Dict:
     """Validate input parameters"""
     input_params = {}
     input_folder_names = get_input_names_for_role("input_folder")
     if len(input_folder_names) == 0:
         raise PluginParamValidationError("Please specify input folder")
     input_params["input_folder"] = dataiku.Folder(input_folder_names[0])
     if self.recipe_id == RecipeID.DOCUMENT_TEXT_DETECTION:
         file_extensions = GoogleCloudVisionAPIWrapper.SUPPORTED_DOCUMENT_FORMATS
         self.batch_support = True
     else:
         file_extensions = GoogleCloudVisionAPIWrapper.SUPPORTED_IMAGE_FORMATS
     input_params["input_df"] = generate_path_df(
         folder=input_params["input_folder"],
         file_extensions=file_extensions,
         path_column=PATH_COLUMN)
     input_folder_type = input_params["input_folder"].get_info().get(
         "type", "")
     input_params["input_folder_is_gcs"] = input_folder_type == "GCS"
     if input_params["input_folder_is_gcs"]:
         self.batch_support = True
         input_folder_access_info = input_params["input_folder"].get_info(
         ).get("accessInfo", {})
         input_params["input_folder_bucket"] = input_folder_access_info.get(
             "bucket")
         input_params["input_folder_root_path"] = str(
             input_folder_access_info.get("root", ""))[1:]
         logging.info(
             "Input folder is stored on GCS, enabling Batch API feature")
     else:
         logging.info(
             f"Input folder is not stored on GCS ({input_folder_type}), disabling Batch API feature"
         )
     return input_params
Example 8
 def validate_input_params(self) -> Dict:
     """Validate input parameters"""
     input_params_dict = {}
     input_folder_names = get_input_names_for_role("input_folder")
     if len(input_folder_names) == 0:
         raise PluginParamValidationError("Please specify input folder")
     input_params_dict["input_folder"] = dataiku.Folder(
         input_folder_names[0])
     image_path_list = [
         p for p in generate_path_list(input_params_dict["input_folder"])
         if p.split(".")[-1].lower() in {"jpeg", "jpg", "png"}
     ]
     if len(image_path_list) == 0:
         raise PluginParamValidationError(
             "No images of supported format (PNG or JPG) were found in input folder"
         )
     input_params_dict["input_df"] = pd.DataFrame(
         image_path_list, columns=[IMAGE_PATH_COLUMN])
     input_params_dict["input_folder_is_s3"] = input_params_dict[
         "input_folder"].get_info().get("type", "") == "S3"
     if input_params_dict["input_folder_is_s3"]:
         input_folder_access_info = input_params_dict[
             "input_folder"].get_info().get("accessInfo", {})
         input_params_dict[
             "input_folder_bucket"] = input_folder_access_info.get("bucket")
         input_params_dict["input_folder_root_path"] = str(
             input_folder_access_info.get("root", ""))[1:]
         logging.info(
             "Input folder is on Amazon S3 with bucket: {} and root path: {}"
             .format(input_params_dict["input_folder_bucket"],
                     input_params_dict["input_folder_root_path"]))
     return input_params_dict
 def validate_output_params(self) -> Dict:
     """Validate output parameters"""
     output_params = {}
     # Output dataset
     output_dataset_names = get_output_names_for_role("output_dataset")
     if len(output_dataset_names) == 0:
         raise PluginParamValidationError("Please specify output dataset")
     output_params["output_dataset"] = dataiku.Dataset(
         output_dataset_names[0])
     # Output folder
     output_folder_names = get_output_names_for_role("output_folder")
     if self.recipe_id == RecipeID.DOCUMENT_TEXT_DETECTION or self.recipe_id == RecipeID.CROPPING:
         if len(output_folder_names) == 0:
             raise PluginParamValidationError(
                 "Please specify output folder")
         output_params["output_folder"] = dataiku.Folder(
             output_folder_names[0])
         output_folder_type = output_params["output_folder"].get_info().get(
             "type", "")
         output_params["output_folder_is_gcs"] = output_folder_type == "GCS"
         if output_params["output_folder_is_gcs"]:
             output_folder_access_info = output_params[
                 "output_folder"].get_info().get("accessInfo", {})
             output_params[
                 "output_folder_bucket"] = output_folder_access_info.get(
                     "bucket")
             output_params["output_folder_root_path"] = str(
                 output_folder_access_info.get("root", ""))[1:]
             logging.info("Output folder is stored on GCS")
         else:
             logging.info(
                 f"Output folder is stored on {output_folder_type}")
     return output_params
Example 10
def save_model(rfc_models, dss_folder_id, model_name):
    """
    Save model in DSS folder

    Parameters
    ----------
    rfc_models :  dictionary with all models
    dss_folder_id : id of folder to save model into
    model_name : Name of model to be saved as

    Returns
    -------
    None. The models dictionary is dumped (compressed) into the folder
    under a timestamped filename.
    """

    models = dataiku.Folder(dss_folder_id)
    path_to_folder = models.get_path()
    current_time = datetime.today().strftime('%Y-%m-%d_%H-%M-%S')
    path_to_save_model = os.path.join(path_to_folder,
                                      current_time + '_' + model_name)

    path_to_save_compressed_model = path_to_save_model + '_compressed_v01'
    model_save_path = joblib.dump(rfc_models,
                                  path_to_save_compressed_model,
                                  compress=True)
    print(model_save_path)
Example 11
def do(payload, config, plugin_config, inputs):
    if "method" not in payload:
        return {}

    client = dataiku.api_client()

    if payload["method"] == "get-valid-csv-filenames":

        required_columns = ["id", "className"]
        sep = ","

        # Retrieving model folder
        model_folder_full_name = [
            inp for inp in inputs if inp["role"] == "modelFolder"
        ][0]["fullName"]
        model_folder = dataiku.Folder(model_folder_full_name).get_path()

        csv_files_root_mf = glob.glob(model_folder + "/*.csv")

        # Filtering out files without required columns
        csv_valid_filenames = []
        for f in csv_files_root_mf:
            schema = retrieve_schema_from_pandas_compatible_csv_file(f, sep)
            if len([col for col in required_columns
                    if col not in schema]) == 0:
                valid_file = {"path": f, "name": os.path.basename(f)}
                csv_valid_filenames.append(valid_file)

    return {"csv_valid_filenames": csv_valid_filenames}
def copy_plugin_to_dss_folder(plugin_id,
                              folder_id,
                              project_key,
                              force_copy=False):
    """
    Copy python-lib from a plugin to a managed folder
    """

    root_path = dataiku.get_custom_variables(
        project_key=project_key)['dip.home']
    # TODO change this to plugins/installed/...
    plugin_lib_path = os.path.join(root_path, 'plugins', 'installed',
                                   plugin_id, 'python-lib')

    folder_path = dataiku.Folder(folder_id, project_key=project_key).get_path()
    lib_folder_path = os.path.join(folder_path, 'python-lib')

    if os.path.isdir(lib_folder_path) and force_copy:
        shutil.rmtree(lib_folder_path)

    if not os.path.isdir(lib_folder_path):
        os.mkdir(lib_folder_path)
        sys.path.append(lib_folder_path)

        for item in os.listdir(plugin_lib_path):
            s = os.path.join(plugin_lib_path, item)
            d = os.path.join(lib_folder_path, item)
            if os.path.isdir(s):
                shutil.copytree(s, d, symlinks=False, ignore=None)
            else:
                shutil.copy2(s, d)
    else:
        logger.info('python-lib already exists in folder')
Example 13
def load_models(territory_filename, folder):
    print(territory_filename)
    model_folder = dataiku.Folder(folder)
    folder_info = model_folder.get_info()
    path = model_folder.get_path()
    xgb_filename = os.path.join(path,'{}_xgb_model.joblib.compressed'.format(territory_filename))
    xgb_model = joblib.load(xgb_filename)
    return xgb_model
Example 14
    def _get_managed_folder(self):

        if self.managed_folder is None:
            # Hack to access managed folder containing files without declaring it as input of recipe
            # when used in the context of a recipe (Scoring, Evaluation)
            self.managed_folder = dataiku.Folder(self.managed_folder_id,
                                                 ignore_flow=True)

        return self.managed_folder
Example 15
def get_input_output():
    if len(get_input_names_for_role("input_dataset")) == 0:
        raise ValueError("No input dataset.")
    input_dataset_name = get_input_names_for_role("input_dataset")[0]
    input_dataset = dataiku.Dataset(input_dataset_name)

    output_folder_name = get_output_names_for_role("output_folder")[0]
    output_folder = dataiku.Folder(output_folder_name)
    return (input_dataset, output_folder)
def get_dimension_value_from_flow_variables(dku_flow_variables, input_id,
                                            dimension):
    if input_id:  # input folder, there can be multiple read partitions
        dimension_value = dku_flow_variables.get(
            f"DKU_SRC_{input_id}_{dimension}")
        dimension_values = dku_flow_variables.get(
            f"DKU_SRC_{input_id}_{dimension}_VALUES")
        if not dimension_value and dimension_values:
            check_only_one_read_partition(dimension_values,
                                          dataiku.Folder(input_id))
    else:  # output folder, there can be only one write partition
        dimension_value = dku_flow_variables.get(f"DKU_DST_{dimension}")
    return dimension_value
Example 17
def save_models(rfc_models, dss_folder_id, model_name):

    models = dataiku.Folder(dss_folder_id)
    path_to_folder = models.get_path()
    current_time = dt.today().strftime('%Y-%m-%d_%H-%M-%S')
    path_to_save_model = os.path.join(path_to_folder,
                                      current_time + '_' + model_name)

    path_to_save_compressed_model = path_to_save_model + '_compressed_v01'
    model_save_path = joblib.dump(rfc_models,
                                  path_to_save_compressed_model,
                                  compress=True)
    print(model_save_path)
Example 18
    def run(self, progress_callback):
        # Retrieving parameters
        output_folder_name = self.config.get('outputName', '')
        
        # Creating new Managed Folder if needed
        project = self.client.get_project(self.project_key)
        output_folder_found = False
        
        for folder in project.list_managed_folders():
            if output_folder_name == folder['name']:
                output_folder = project.get_managed_folder(folder['id'])
                output_folder_found = True
                break
        
        if not output_folder_found:
            if output_folder_name == '':
                raise ValueError('The model folder name is empty, please name the folder')
            
            output_folder = project.create_managed_folder(output_folder_name)

        output_folder_path = dataiku.Folder(output_folder.get_definition()["id"],
                                            project_key=self.project_key).get_path()
        
        os.chmod(output_folder_path,
                 0o755)
        
        for root, dirs, files in os.walk(output_folder_path):
            for momo in dirs:
                # directories keep the execute bit so they remain traversable
                os.chmod(os.path.join(root, momo), 0o755)
            for momo in files:
                os.chmod(os.path.join(root, momo), 0o644)

        def download_weights(folder, weights_url, name):
            r = requests.get(weights_url, allow_redirects=True)
            folder.put_file(name, r.content)

        model_urls = [
            'https://github.com/matterport/Mask_RCNN/releases/download/v2.0/mask_rcnn_coco.h5',
            'https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
        ]

        for url in model_urls:
            name = url.split('/')[-1]
            download_weights(output_folder, url, name)

        return "<span>DONE</span>"
        
Example 19
def train_agent(agent):
    
    # Get the parameters (the common ones)
    recipe_config = get_recipe_config()
    environment_var = recipe_config['environment']
    agent_var = recipe_config['agent']
    policy_var = recipe_config['policy']
    gamma_var = recipe_config['gamma']
    lr_var = recipe_config['dqn_learning_rate']
    training_episodes_var = 5000

    
    # Create the JSON FILE and dump it into the output folder
    training_infos = {
        'name': environment_var,
        'agent': agent_var,
        'type': 'OpenAI Gym',
        'num_episodes': training_episodes_var,
        'lr': lr_var,
        'gamma': gamma_var,
        'policy': policy_var,
        'training_date': str(datetime.datetime.now())
    }
    
    saved_models = dataiku.Folder(get_output_names_for_role('main_output')[0])
    saved_models_info = saved_models.get_info()
    saved_models_path = saved_models.get_path()
    
    with open(saved_models_path + '/training_infos.json', 'w') as fp:
        json.dump(training_infos, fp)

    # Choose the agent
    if agent == "dqn":
        from stable_baselines.common.vec_env import DummyVecEnv
        from stable_baselines.deepq.policies import MlpPolicy
        from stable_baselines.deepq.policies import CnnPolicy
        from stable_baselines import DQN
        
        model = DQN(policy=policy_var, env=environment_var, gamma=gamma_var, learning_rate=lr_var)
    else:
        raise ValueError("Unsupported agent: {}".format(agent))

    # Start the training and dump the model into the output folder
    print("========================== Start Training ==========================")
    model.learn(training_episodes_var)
    model_name = agent_var + "_" + environment_var
    model.save(saved_models_path + "/" + model_name)
    print("Model Saved")
Example 20
def dataframe_to_xlsx(input_dataframe, folder_name, output_file_name):
    folder = dataiku.Folder(folder_name)
    folder_infos = folder.get_info()
    if folder_infos["type"] == "S3":
        pickle_bytes = io.BytesIO()
        pickle.dump(input_dataframe, pickle_bytes)
        with folder.get_writer("input_dataframe.p") as w:
            w.write(pickle_bytes.getvalue())
    else:
        folder_path = folder.get_path()
        output_path = os.path.join(folder_path, output_file_name + '.xlsx')
        writer = pd.ExcelWriter(output_path, engine='openpyxl')
        input_dataframe.to_excel(writer, index=False, encoding='utf-8')

        writer.save()
def load_data(input_dataset):
    # DSS entities loading
    logging.info("Reading samples from {0}".format(input_dataset))
    input_is_folder = False
    try:
        input_df = pd.DataFrame(dataiku.Dataset(input_dataset).get_dataframe())
        logging.info("{0} input is a dataset".format(input_dataset))
    except Exception as e:
        logging.info("{0} input is a folder".format(input_dataset))
        input_samples = dataiku.Folder(input_dataset)
        input_df = pd.DataFrame(input_samples.list_paths_in_partition(),
                                columns=["path"])
        input_is_folder = True

    return input_df, input_is_folder
Example 22
def load_predict_config():
    """Utility function to load, resolve and validate all predict recipe config into a clean `params` dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}
    recipe_config = get_recipe_config()

    # model folder
    model_folder = dataiku.Folder(get_input_names_for_role("model_folder")[0])
    params["model_folder"] = model_folder
    params["partition_root"] = get_folder_partition_root(params["model_folder"], is_input=True)

    params["external_features_future_dataset"] = None
    external_features_future_dataset_names = get_input_names_for_role("external_features_future_dataset")
    if len(external_features_future_dataset_names) > 0:
        params["external_features_future_dataset"] = dataiku.Dataset(external_features_future_dataset_names[0])

    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify Forecast dataset in the 'Input / Output' tab of the recipe")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])
    check_only_one_read_partition(params["partition_root"], params["model_folder"])
    check_only_one_read_partition(params["partition_root"], params["external_features_future_dataset"])

    params["manual_selection"] = True if recipe_config.get("model_selection_mode") == "manual" else False

    params["performance_metric"] = recipe_config.get("performance_metric")
    params["selected_session"] = recipe_config.get("manually_selected_session", "latest_session")
    params["selected_model_label"] = recipe_config.get("manually_selected_model_label")

    params["prediction_length"] = recipe_config.get("prediction_length", -1)
    params["confidence_interval"] = recipe_config.get("confidence_interval", 95)
    params["quantiles"] = convert_confidence_interval_to_quantiles(params["confidence_interval"])
    params["include_history"] = recipe_config.get("include_history", False)

    params["sampling_method"] = recipe_config.get("sampling_method", "last_records")
    params["history_length_limit"] = None
    if params["sampling_method"] == "last_records":
        params["history_length_limit"] = recipe_config.get("number_records", 1000)
        if params["history_length_limit"] < 1:
            raise PluginParamValidationError("Number of historical records must be higher than 1")

    printable_params = {param: value for param, value in params.items() if "dataset" not in param and "folder" not in param}
    logger.info(f"Recipe parameters: {printable_params}")
    return params
Example 23
 def validate_output_params(self) -> Dict:
     """Validate output parameters"""
     output_params_dict = {}
     # Mandatory output dataset
     output_dataset_names = get_output_names_for_role("output_dataset")
     if len(output_dataset_names) == 0:
         raise PluginParamValidationError("Please specify output folder")
     output_params_dict["output_dataset"] = dataiku.Dataset(
         output_dataset_names[0])
     # Optional output folder
     output_folder_names = get_output_names_for_role("output_folder")
     output_params_dict["output_folder"] = None
     if len(output_folder_names) != 0:
         output_params_dict["output_folder"] = dataiku.Folder(
             output_folder_names[0])
     return output_params_dict
    def __get_logs_path(self):
        # Retrieve model managed-folder path
        folder_found = False
        project = self.client.get_project(self.project_key)
        for folder in project.list_managed_folders():
            if self.folder_name == folder['name']:
                folder_path = dataiku.Folder(
                    folder['id'], project_key=self.project_key).get_path()
                folder_found = True
                break

        if not folder_found:
            raise DataikuException(
                "The folder '{}' (in project '{}' cannot be found".format(
                    self.folder_name, self.project_key))

        log_path = os.path.join(folder_path, constants.TENSORBOARD_LOGS)
        return log_path
Example 25
def build_model(input_shapes, n_classes=None):

    #### DEFINING INPUT AND BASE ARCHITECTURE
    # You need to modify the name and shape of the "image_input"
    # according to the preprocessing and name of your
    # initial feature.
    # This feature should be preprocessed as an "Image", with a
    # custom preprocessing.
    image_shape = (197, 197, 3)
    image_input_name = "name_of_your_image_input_preprocessed"
    image_input = Input(shape=image_shape, name=image_input_name)

    base_model = VGG16(include_top=False,
                       weights=None,
                       input_tensor=image_input)

    #### LOADING WEIGHTS OF PRE-TRAINED MODEL
    # To leverage this architecture, it is better to use weights
    # pre-trained on a large dataset (ImageNet).
    # To do so, you need to download the file containing the weights
    # and load them into your model.
    # You can do it by using the macro "Download pre-trained model"
    # of the "Deep Learning image" plugin (CPU or GPU version depending
    # on your setup) available in the plugin store. For this architecture,
    # you need to select:
    #    "VGG16 trained on Imagenet"
    # This will download the weights and put them into a managed folder
    folder = dataiku.Folder("name_of_folder_containing_vgg16_weights")
    weights_path = "vgg16_imagenet_weights_notop.h5"

    base_model.load_weights(os.path.join(folder.get_path(), weights_path),
                            by_name=True,
                            skip_mismatch=True)

    #### ADDING FULLY CONNECTED CLASSIFICATION LAYER
    x = base_model.layers[-1].output
    x = Flatten()(x)
    predictions = Dense(n_classes, activation="softmax")(x)

    model = Model(inputs=base_model.input, outputs=predictions)
    return model
def do(payload, config, plugin_config, inputs):
    """ Retrieve a list of past training session timestamps and the label of all the trained models. """
    choices = []

    input_folder_name = [
        inp["fullName"] for inp in inputs
        if inp["type"] == "MANAGED_FOLDER"
    ][0]

    input_folder = dataiku.Folder(input_folder_name)

    sessions = []
    for child in input_folder.get_path_details(path="/")["children"]:
        if re.match(TIMESTAMP_REGEX_PATTERN, child["name"]):
            sessions += [child["name"]]

    if payload.get("parameterName") == "manually_selected_session":
        choices = [{"label": "Latest available", "value": "latest_session"}]
        if len(sessions) > 0:  # not partitioned folder
            for i, session in enumerate(sorted(sessions, reverse=True)):
                choices += [{"label": session, "value": session}]

    elif payload.get("parameterName") == "manually_selected_model_label":
        all_paths = input_folder.list_paths_in_partition()
        for model_label in list_available_models_labels():
            for path in all_paths:
                if bool(re.search(f"({model_label})(/model.pk.gz)", path)):
                    choices += [{"label": model_label, "value": model_label}]
                    break

    elif payload.get("parameterName") == "model_selection_mode":
        choices = [{
            "label": "Automatic",
            "value": "auto"
        }, {
            "label": "Manual",
            "value": "manual"
        }]

    return {"choices": choices}
def get_avg_side(inputs, n_first=3000):
    """Min side is first quartile, max side is third quartile."""
    image_folder_full_name = get_input_name_from_role(inputs, 'images')
    image_folder = dataiku.Folder(image_folder_full_name)
    folder_path = image_folder.get_path()

    paths = image_folder.list_paths_in_partition()[:n_first]
    sides = []
    for path in paths:
        path = os.path.join(folder_path, path[1:])
        # PIL does not load the raster data at this point, so it's fast.
        with Image.open(path) as img:
            w, h = img.size
        sides.append(w)
        sides.append(h)
    sides = np.array(sides)

    return {
        'min_side': int(np.percentile(sides, 25)),
        'max_side': int(np.percentile(sides, 75))
    }
Example 28
def train(model, x, y, batch_size, epochs):
    #save trained model checkpoints
    handle = dataiku.Folder("glove")
    checkpointPath = handle.get_path()
    #checkpoint = ModelCheckpoint(os.path.join(checkpointPath,'/weights.{epoch:03d}-{val_acc:.4f}.hdf5'), monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
    #tensorboard = TensorBoard(log_dir="logs/{}".format(time()))

    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Train Model
    print(" [INFO] :: TRAINING MODEL...")
    #model.fit(x_train, y_train, batch_size=batch_size, epochs=epoches, verbose=1, callbacks=[checkpoint,tensorboard], validation_split=0.2)
    model.fit(x,
              y,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_split=0.2)

    model.save(os.path.join(checkpointPath, 'my_model.h5'))
def do(payload, config, plugin_config, inputs):
    inputDataSets = []
    # print(inputtablename)
    connection = {}
    project = ''
    inputFolderLocation = None
    for input in inputs:
        if (input.get('role') == 'main'):
            inputtablename = input['fullName'].split('.')[1]
            project = input['fullName'].split('.')[0]
            inputDataSets.append(inputtablename)
            if not connection:
                inputdataset = dataiku.Dataset(inputtablename)
                connection = getConnectionParamsFromDataset(inputdataset).get(
                    'connectionParams', {})
        else:
            inputfoldername = input['fullName'].split('.')[1]
            inputFolderLocation = dataiku.Folder(inputfoldername)

    folderpath = inputFolderLocation.get_path() if inputFolderLocation else ''

    fileList = os.listdir(folderpath) if folderpath else []
    DATA_DIR = os.environ["DIP_HOME"]
    PYNBDIR = "config/ipython_notebooks/"
    pypath = os.path.join(DATA_DIR, PYNBDIR, project)
    pynbList = [f for f in os.listdir(pypath) if not f.startswith('.')] \
        if os.path.exists(pypath) else []

    return {
        'inputfolder': folderpath,
        'fileList': fileList,
        'nbList': pynbList,
        'connection': connection,
        'inputDataSets': inputDataSets,
        'inputs': inputs
    }
Example 30
    def run(self, progress_callback):
        # Retrieving parameters
        output_folder_name = self.config['folder_name']
        model = self.config['model']

        architecture, trained_on = model.split('_')

        # Creating new Managed Folder if needed
        project = self.client.get_project(self.project_key)

        for folder in project.list_managed_folders():
            if output_folder_name == folder['name']:
                output_folder = project.get_managed_folder(folder['id'])
                break
        else:
            output_folder = project.create_managed_folder(output_folder_name)

        output_folder_path = dataiku.Folder(
            output_folder.get_definition()["id"],
            project_key=self.project_key).get_path()

        # Building config file
        config = {"architecture": architecture, "trained_on": trained_on}

        dl_utils.download_labels(
            trained_on, op.join(output_folder_path, constants.LABELS_FILE))

        # Download weights from s3 (dataiku-labs-public).
        dl_utils.download_model(
            architecture, trained_on,
            op.join(output_folder_path, constants.WEIGHTS_FILE),
            progress_callback)

        output_folder.put_file(constants.CONFIG_FILE, json.dumps(config))

        return "<span>DONE</span>"