def custom_corrections_checker(
        custom_corrections_dataset: dataiku.Dataset) -> Dict:
    """Utility function to check the content of the optional custom corrections dataset

    Args:
        custom_corrections_dataset: Dataset instance with the first column for words
            and the second one for their correction

    Returns:
        Dictionary of words (key) and their custom correction (value)

    """
    dataset_schema = custom_corrections_dataset.get_config()["schema"]
    columns = dataset_schema["columns"]
    if len(columns) != 2:
        raise PluginParamValidationError(
            "Custom corrections dataset must have only two columns")

    (word_column, correction_column) = (columns[0], columns[1])
    if word_column["type"] != "string" or correction_column["type"] != "string":
        raise PluginParamValidationError(
            "Columns of custom corrections dataset must be of string type")

    df = custom_corrections_dataset.get_dataframe(infer_with_pandas=False)
    df = clean_text_df(df, dropna_columns=[word_column["name"]]).fillna("").astype(str)
    custom_corrections_dict = {
        row[0]: row[1]
        for row in df.itertuples(index=False)
    }
    return custom_corrections_dict
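# Usage sketch (assumptions: this runs inside a DSS Python recipe, and
# "custom_corrections" is a hypothetical two-column string dataset declared as an input).
import dataiku

corrections_dataset = dataiku.Dataset("custom_corrections")  # hypothetical dataset name
corrections = custom_corrections_checker(corrections_dataset)
# The resulting dict maps each word to its replacement, e.g. {"teh": "the", "recieve": "receive"}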
Example #2
def set_column_descriptions(output_dataset: dataiku.Dataset,
                            column_descriptions: Dict,
                            input_dataset: dataiku.Dataset = None) -> None:
    """Set column descriptions of the output dataset based on a dictionary of column descriptions

    Retain the column descriptions from the input dataset if the column name matches.

    Args:
        output_dataset: Output dataiku.Dataset instance
        column_descriptions: Dictionary holding column descriptions (value) by column name (key)
        input_dataset: Optional input dataiku.Dataset instance
            in case you want to retain input column descriptions

    """
    output_dataset_schema = output_dataset.read_schema()
    input_dataset_schema = []
    input_columns_names = []
    if input_dataset is not None:
        input_dataset_schema = input_dataset.read_schema()
        input_columns_names = [col["name"] for col in input_dataset_schema]
    for output_col_info in output_dataset_schema:
        output_col_name = output_col_info.get("name", "")
        output_col_info["comment"] = column_descriptions.get(output_col_name)
        if output_col_name in input_columns_names:
            matched_comment = [
                input_col_info.get("comment", "")
                for input_col_info in input_dataset_schema
                if input_col_info.get("name") == output_col_name
            ]
            if len(matched_comment) != 0:
                output_col_info["comment"] = matched_comment[0]
    output_dataset.write_schema(output_dataset_schema)
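# Usage sketch (assumptions: "my_input" and "my_output" are hypothetical dataset names
# and the output schema has already been written by the recipe).
import dataiku

column_descriptions = {"cluster_labels": "Cluster assigned to each row"}  # illustrative content
set_column_descriptions(
    output_dataset=dataiku.Dataset("my_output"),
    column_descriptions=column_descriptions,
    input_dataset=dataiku.Dataset("my_input"),  # optional: keeps matching input descriptions
)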
def custom_vocabulary_checker(
        custom_vocabulary_dataset: dataiku.Dataset) -> Set:
    """Utility function to check the content of the optional custom vocabulary dataset

    Args:
        custom_vocabulary_dataset: Dataset with a single column for words that should not be corrected

    Returns:
        Set of words in the custom vocabulary

    """
    dataset_schema = custom_vocabulary_dataset.get_config()["schema"]
    columns = dataset_schema["columns"]
    if len(columns) != 1:
        raise PluginParamValidationError(
            "Custom vocabulary dataset must have only one column")

    col_name = columns[0]["name"]
    col_type = columns[0]["type"]
    if col_type != "string":
        raise PluginParamValidationError(
            "Column of custom vocabulary dataset must be of string type")

    df = clean_text_df(
        custom_vocabulary_dataset.get_dataframe(infer_with_pandas=False))
    custom_vocabulary = set(df[col_name].astype(str).tolist())
    return custom_vocabulary
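# Usage sketch (assumption: "custom_vocabulary" is a hypothetical single-column string dataset).
import dataiku

vocabulary = custom_vocabulary_checker(dataiku.Dataset("custom_vocabulary"))
# Words in this set are excluded from correction, e.g. {"dataiku", "dss"}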
def count_records(dataset: dataiku.Dataset) -> int:
    """
    Count the number of records of a dataset using the Dataiku dataset metrics API
    """
    metric_id = "records:COUNT_RECORDS"
    dataset_name = dataset.name.split(".")[1]
    partitions = dataset.read_partitions
    client = dataiku.api_client()
    project = client.get_project(dataiku.default_project_key())
    logging.info("Counting records of dataset: {}".format(dataset_name))
    if partitions is None or len(partitions) == 0:
        project.get_dataset(dataset_name).compute_metrics(metric_ids=[metric_id])
        metric = dataset.get_last_metric_values()
        record_count = dataiku.ComputedMetrics.get_value_from_data(metric.get_global_data(metric_id=metric_id))
        logging.info("Dataset contains {:d} records and is not partitioned".format(record_count))
    else:
        record_count = 0
        for partition in partitions:
            project.get_dataset(dataset_name).compute_metrics(partition=partition, metric_ids=[metric_id])
            metric = dataset.get_last_metric_values()
            record_count += dataiku.ComputedMetrics.get_value_from_data(
                metric.get_partition_data(partition=partition, metric_id=metric_id)
            )
        logging.info("Dataset contains {:d} records in partition(s) {}".format(record_count, partitions))
    return record_count
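# Usage sketch (assumption: "PROJECT.transactions" is a hypothetical fully qualified dataset
# name; this variant derives the short name via dataset.name.split(".")[1], so the name is
# given with its project prefix here).
import dataiku

n_records = count_records(dataiku.Dataset("PROJECT.transactions"))
print("Record count:", n_records)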
def set_column_description(
    input_dataset: dataiku.Dataset,
    output_dataset: dataiku.Dataset,
    column_description_dict: Dict,
) -> None:
    """
    Sets column descriptions of the output dataset based on a dictionary of column descriptions,
    and retains the column descriptions from the input dataset if the column name matches
    """
    input_dataset_schema = input_dataset.read_schema()
    output_dataset_schema = output_dataset.read_schema()
    input_columns_names = [col["name"] for col in input_dataset_schema]
    for output_col_info in output_dataset_schema:
        output_col_name = output_col_info.get("name", "")
        output_col_info["comment"] = column_description_dict.get(
            output_col_name)
        if output_col_name in input_columns_names:
            matched_comment = [
                input_col_info.get("comment", "")
                for input_col_info in input_dataset_schema
                if input_col_info.get("name") == output_col_name
            ]
            if len(matched_comment) != 0:
                output_col_info["comment"] = matched_comment[0]
    output_dataset.write_schema(output_dataset_schema)
def load_input_output(config):
    if not get_input_names_for_role("input_dataset"):
        raise ValueError("No input dataset.")
    input_dataset_name = get_input_names_for_role("input_dataset")[0]
    config.input_dataset = Dataset(input_dataset_name)

    output_dataset_name = get_output_names_for_role("output_dataset")[0]
    config.output_dataset = Dataset(output_dataset_name)
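# Usage sketch (assumptions: called from a plugin recipe; `config` can be any mutable object,
# here a SimpleNamespace built from the recipe configuration).
from types import SimpleNamespace
from dataiku.customrecipe import get_recipe_config

config = SimpleNamespace(**get_recipe_config())
load_input_output(config)
input_df = config.input_dataset.get_dataframe()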
Example #7
    def __init__(self,
                 dataset=None,
                 database=None,
                 connection=None,
                 vtype=None):
        if (connection and dataset) or (connection and database) or (database and dataset):
            raise ValueError(
                "only one of connection, database or dataset should be given")

        if dataset:
            if isinstance(dataset, Dataset):
                self._vconn = dataset.full_name
            else:
                self._vconn = Dataset(dataset).full_name
            self._find_connection_from_dataset = True
        elif connection:
            self._vconn = "@virtual(%s):connection:%s" % (vtype, connection)
            self._find_connection_from_dataset = False
        elif database:
            self._vconn = "@virtual(%s):%s" % (vtype, database)
            self._find_connection_from_dataset = False
        else:
            self._vconn = None
            self._find_connection_from_dataset = None

        print("Vconn = %s find=%s" %
              (self._vconn, self._find_connection_from_dataset))
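        # Illustrative values of self._vconn depending on the arguments (names hypothetical):
        #   dataset="PROJECT.my_dataset"             -> "PROJECT.my_dataset" (connection resolved from the dataset)
        #   connection="my_hive_conn", vtype="hive"  -> "@virtual(hive):connection:my_hive_conn"
        #   database="default", vtype="hive"         -> "@virtual(hive):default"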
Example #8
    def _add_output_dataset(self):
        output_dataset_name = get_output_names_for_role("tagged_documents")[0]
        self.dku_config.add_param(
            name="output_dataset",
            value=Dataset(output_dataset_name),
            required=True,
        )
Example #9
    def __init__(self):
        """Instanciate class with DkuConfigLoading and add input datasets to dku_config"""

        super().__init__()
        text_input = get_input_names_for_role("document_dataset")[0]
        self.dku_config.add_param(
            name="text_input", value=Dataset(text_input), required=True
        )
        ontology_input = get_input_names_for_role("ontology_dataset")[0]
        self.dku_config.add_param(
            name="ontology_input", value=Dataset(ontology_input), required=True
        )
        self.document_dataset_columns = [
            p["name"] for p in self.dku_config.text_input.read_schema()
        ]
        self.ontology_dataset_columns = [
            p["name"] for p in self.dku_config.ontology_input.read_schema()
        ]
Example #10
def process_dataset_chunks(input_dataset: dataiku.Dataset,
                           output_dataset: dataiku.Dataset,
                           func: Callable,
                           chunksize: float = 1000,
                           **kwargs) -> None:
    """Read a dataset by chunks, process each dataframe chunk with a function and write back to another dataset.

    Passes keyword arguments to the function, adds a tqdm progress bar and generic logging.
    Writes chunks directly to the output_dataset, so that only one chunk needs to be held in memory at a time.

    Args:
        input_dataset: Input dataiku.Dataset instance
        output_dataset: Output dataiku.Dataset instance
        func: The function to apply to the `input_dataset` by chunks of pandas.DataFrame
            This function must take a pandas.DataFrame as first input argument,
            and output another pandas.DataFrame
        chunksize: Number of rows of each chunk of pandas.DataFrame fed to `func`
        **kwargs: Optional keyword arguments fed to `func`

    Raises:
        ValueError: If the input dataset is empty or if pandas cannot read it without type inference

    """
    input_count_records = count_records(input_dataset)
    if input_count_records == 0:
        raise ValueError("Input dataset has no records")
    logging.info(
        f"Processing dataset {input_dataset.name} of {input_count_records} rows by chunks of {chunksize}..."
    )
    start = perf_counter()
    # First, initialize output schema if not present. Required to show the real error if `iter_dataframes` fails.
    if not output_dataset.read_schema(raise_if_empty=False):
        df = input_dataset.get_dataframe(limit=5, infer_with_pandas=False)
        output_df = func(df=df, **kwargs)
        output_dataset.write_schema_from_dataframe(output_df)
    with output_dataset.get_writer() as writer:
        df_iterator = input_dataset.iter_dataframes(chunksize=chunksize,
                                                    infer_with_pandas=False)
        len_iterator = math.ceil(input_count_records / chunksize)
        for i, df in tqdm(enumerate(df_iterator),
                          total=len_iterator,
                          unit="chunk",
                          mininterval=1.0):
            output_df = func(df=df, **kwargs)
            if i == 0:
                output_dataset.write_schema_from_dataframe(
                    output_df,
                    dropAndCreate=bool(not output_dataset.writePartition))
            writer.write_dataframe(output_df)
    logging.info(
        f"Processing dataset {input_dataset.name} of {input_count_records} rows: "
        + f"Done in {perf_counter() - start:.2f} seconds.")
Example #11
    def output_generator():
        logging.info("Start output generator ...")

        (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            preparation_output_schema["columns"],
            parse_dates=True,
            infer_with_pandas=False)
        logging.info("Reading with INITIAL dtypes: %s" % dtypes)
        dtypes = utils.ml_dtypes_from_dss_schema(
            preparation_output_schema, preprocessing_params["per_feature"])
        logging.info("Reading with dtypes: %s" % dtypes)

        for input_df in input_dataset.iter_dataframes_forced_types(
                names, dtypes, parse_date_columns, chunksize=100000):
            input_df.index = range(input_df.shape[0])
            input_df_orig = input_df.copy()
            if recipe_desc.get("filterInputColumns", False):
                input_df_orig = input_df_orig[recipe_desc["keptInputColumns"]]

            logging.info("Got a dataframe : %s" % str(input_df.shape))
            normalize_dataframe(input_df, preprocessing_params['per_feature'])

            for col in input_df:
                logging.info("NORMALIZED: %s -> %s" %
                             (col, input_df[col].dtype))

            logging.info("Processing it")
            transformed = pipeline.process(input_df)
            logging.info("Applying it")

            (labels_arr,
             additional_columns) = clustering_predict(modeling_params, clf,
                                                      transformed)
            cluster_labels = pd.Series(labels_arr,
                                       name="cluster_labels").map(naming)
            cluster_labels.index = transformed["TRAIN"].index

            final_df = pd.concat(
                [input_df_orig.join(cluster_labels, how='left'), additional_columns],
                axis=1)

            if preprocessing_params["outliers"]["method"] == "CLUSTER":
                outliers_cluster_name = cluster_name_map.get(
                    constants.CLUSTER_OUTLIERS, constants.CLUSTER_OUTLIERS)
                final_df['cluster_labels'].fillna(outliers_cluster_name,
                                                  inplace=True)

            logging.info("Done predicting it")

            yield final_df
Example #12
    def __init__(self, connection=None, dataset=None):
        if connection and dataset:
            raise ValueError(
                "only one of connection or dataset should be given")

        if dataset:
            if isinstance(dataset, Dataset):
                self._iconn = dataset.full_name
            else:
                self._iconn = Dataset(dataset).full_name
            self._find_connection_from_dataset = True
        else:
            self._iconn = connection
            self._find_connection_from_dataset = False
Example #13
def count_records(dataset: dataiku.Dataset) -> int:
    """Count the number of records of a dataset using the Dataiku dataset metrics API

    Args:
        dataset: dataiku.Dataset instance

    Returns:
        Number of records

    """
    metric_id = "records:COUNT_RECORDS"
    partitions = dataset.read_partitions
    client = dataiku.api_client()
    project = client.get_project(dataset.project_key)
    record_count = 0
    logging.info(f"Counting records of dataset: {dataset.name}...")
    if partitions is None or len(partitions) == 0:
        project.get_dataset(
            dataset.short_name).compute_metrics(metric_ids=[metric_id])
        metric = dataset.get_last_metric_values()
        record_count = dataiku.ComputedMetrics.get_value_from_data(
            metric.get_global_data(metric_id=metric_id))
        logging.info(
            f"Dataset {dataset.name} contains {record_count:d} records and is not partitioned"
        )
    else:
        for partition in partitions:
            project.get_dataset(dataset.short_name).compute_metrics(
                partition=partition, metric_ids=[metric_id])
            metric = dataset.get_last_metric_values()
            record_count += dataiku.ComputedMetrics.get_value_from_data(
                metric.get_partition_data(partition=partition,
                                          metric_id=metric_id))
        logging.info(
            f"Dataset {dataset.name} contains {record_count:d} records in partition(s) {partitions}"
        )
    return record_count
Example #14
def df_from_split_desc_no_normalization(split_desc,
                                        split,
                                        feature_params,
                                        prediction_type=None):
    if split_desc["format"] != "csv1":
        raise Exception("Unsupported format")
    (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
        split_desc["schema"]["columns"],
        parse_dates=True,
        infer_with_pandas=True)

    if split == "full":
        f = split_desc["fullPath"]
    else:
        f = split == "train" and split_desc["trainPath"] or split_desc[
            "testPath"]

    # We infer everything with pandas, EXCEPT booleans, because pandas loses the
    # original syntax: if the target is true/false and we let pandas infer, it becomes
    # True/False, so when we remap with true/false we end up with no target at all.
    # for col in split_desc["schema"]["columns"]:
    #     if col["type"] == "boolean":
    #         if dtypes is None:
    #             dtypes = {}
    #         dtypes[col["name"]] = "str"

    logging.info("Reading with dtypes: %s" % dtypes)
    dtypes = utils.ml_dtypes_from_dss_schema(split_desc["schema"],
                                             feature_params,
                                             prediction_type=prediction_type)

    logging.info("Reading with FIXED dtypes: %s" % dtypes)
    df = pd.read_table(f,
                       names=names,
                       dtype=dtypes,
                       header=None,
                       sep='\t',
                       doublequote=True,
                       quotechar='"',
                       parse_dates=parse_date_columns,
                       float_precision="round_trip")
    logging.info("Loaded table")

    return df
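# Illustrative shape of `split_desc` as consumed above (paths and columns hypothetical):
# {
#     "format": "csv1",
#     "schema": {"columns": [{"name": "age", "type": "bigint"}, ...]},
#     "fullPath": ".../full.csv.gz",    # used when split == "full"
#     "trainPath": ".../train.csv.gz",  # used when split == "train"
#     "testPath": ".../test.csv.gz",    # used otherwise
# }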
    def run(self, progress_callback):
        clobber = self.config.get("clobber", False)
        prefix = self.config.get("prefix")

        connections = set()

        done = 0
        for project_key in self.project_keys:
            project = self.client.get_project(project_key)

            for dataset_name in Dataset.list(project_key=project_key):
                d = project.get_dataset(dataset_name)
                connection_name = d.get_definition().get('params', {}).get(
                    'connection', None)
                if connection_name is not None:
                    connections.add(connection_name)

            sql_notebooks = intercom.backend_json_call(
                "sql-notebooks/list/", data={"projectKey": project_key})
            for sql_notebook in sql_notebooks:
                connection_name = sql_notebook.get('connection', None)
                if connection_name is not None:
                    m = re.search(r'@virtual\(([^\)]+)\):(.*)', connection_name)
                    if m is not None:
                        connection_name = 'hive-%s' % m.group(2)

                    connections.add(connection_name)

            meta = project.get_metadata()

            # Update tags list
            if clobber:
                tags = [x for x in meta["tags"] if not x.startswith(prefix)]
            else:
                tags = meta["tags"]
            tags.extend([
                "%s%s" % (prefix, connection)
                for connection in list(connections)
            ])

            meta["tags"] = tags
            project.set_metadata(meta)

            done += 1
            progress_callback(done)
def process_dataset_chunks(
    input_dataset: dataiku.Dataset, output_dataset: dataiku.Dataset, func: Callable, chunksize: float = 10000, **kwargs
) -> None:
    """
    Read a dataset by chunks, process each dataframe chunk with a function and write back to another dataset.
    Automatically adds a tqdm progress bar and generic logging.
    """
    logging.info("Processing dataframe chunks of size {:d})...".format(chunksize))
    with output_dataset.get_writer() as writer:
        df_iterator = input_dataset.iter_dataframes(chunksize=chunksize, infer_with_pandas=False)
        len_iterator = math.ceil(count_records(input_dataset) / chunksize)
        for i, df in tqdm(enumerate(df_iterator), total=len_iterator):
            output_df = func(df=df, **kwargs)
            if i == 0:
                if output_dataset.writePartition is None or output_dataset.writePartition == "":
                    output_dataset.write_schema_from_dataframe(output_df, dropAndCreate=True)
                else:
                    output_dataset.write_schema_from_dataframe(output_df)
            writer.write_dataframe(output_df)
    logging.info("Processing dataframe chunks: Done!")
Example #17
def _streamed_query_to_df(connection,
                          query,
                          pre_queries,
                          post_queries,
                          find_connection_from_dataset,
                          db_type,
                          extra_conf={},
                          infer_from_schema=False,
                          parse_dates=True,
                          bool_as_str=False,
                          dtypes=None,
                          script_steps=None,
                          script_input_schema=None,
                          script_output_schema=None):
    import pandas as pd
    data = {
        "connection": connection,
        "query": query,
        "preQueries": json.dumps(pre_queries),
        "postQueries": json.dumps(post_queries),
        "findConnectionFromDataset": find_connection_from_dataset,
        "dbType": db_type,
        "extraConf": json.dumps(extra_conf),
        "scriptSteps": json.dumps(script_steps) if script_steps is not None else None,
        "scriptInputSchema": json.dumps(script_input_schema) if script_input_schema is not None else None,
        "scriptOutputSchema": json.dumps(script_output_schema) if script_output_schema is not None else None,
    }

    logging.info("Starting SQL query reader")
    # initiate the streaming (blocks until the database says it's ready to return values)
    streamingSession = backend_json_call("sql-queries/start-streaming",
                                         data=data)

    logging.info("Got initial SQL query response")

    queryId = streamingSession['queryId']

    # handle the special case of 'nothing to stream'
    if not streamingSession['hasResults']:
        return pd.DataFrame()

    parse_date_columns = None
    if infer_from_schema and "schema" in streamingSession:
        schema_columns = streamingSession["schema"]
        (inferred_names, inferred_dtypes,
         inferred_parse_date_columns) = Dataset.get_dataframe_schema_st(
             schema_columns, parse_dates=parse_dates, bool_as_str=bool_as_str)
        dtypes = inferred_dtypes
        parse_date_columns = inferred_parse_date_columns

    # fetch the data...
    resp_stream = backend_stream_call("sql-queries/stream",
                                      data={"queryId": queryId},
                                      err_msg="Query failed")
    # ... and stuff it (streamed) in a dataframe
    results = pd.read_table(resp_stream,
                            sep='\t',
                            doublequote=True,
                            quotechar='"',
                            dtype=dtypes,
                            parse_dates=parse_date_columns)

    # query seems to have run fine. 'Seems'. Verify that.
    # note to self: this call has to be made after the dataframe creation, because it is streamed and the call
    # returns before the query is actually done
    backend_void_call("sql-queries/verify",
                      data={"queryId": queryId},
                      err_msg="Query failed")

    return results
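# The function above is the low-level streaming reader. From user code, the same capability
# is usually reached through the public SQLExecutor2 wrapper (also used in the Snowflake
# export example below). Minimal sketch, assuming a SQL dataset named "orders" (hypothetical):
from dataiku import SQLExecutor2

executor = SQLExecutor2(dataset="orders")
df = executor.query_to_df("SELECT COUNT(*) AS n_orders FROM orders")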
Example #18
    def run(self, progress_callback):
        """
        If `run` succeeds, we use the method success() to return an HTML message.
        In case of an error, we do not return it in an HTML message; we raise an exception instead,
        so that the step is considered failed when called from a scenario.
        """
        if 'input_dataset' in self.config:
            # the macro is run from the flow
            dataset_name = self.config.get('input_dataset')
        else:
            # the macro is run from a scenario
            dataset_name = self.config.get('dataset')

        if not dataset_name:
            logging.error(
                'The mandatory param `dataset` is missing or invalid to export dataset to Snowflake stage'
            )
            raise ValueError(
                "The mandatory parameter `Dataset to export` is invalid")

        # We use the API `dataiku.core.dataset.Dataset.get_location_info` rather than `dataikuapi.dss.dataset.DSSDataset.get_settings().get_raw_params()`
        # because it expands variables if any in the connection settings (see https://doc.dataiku.com/dss/latest/variables/index.html)
        dataset_connection_info = Dataset(
            dataset_name).get_location_info()["info"]

        if dataset_connection_info.get("databaseType") != 'Snowflake':
            logging.error(
                'Cannot export non-Snowflake dataset `%s.%s` to Snowflake stage',
                self.project_key, dataset_name)
            raise ValueError(f"'{dataset_name}' is not a Snowflake dataset")

        mandatory_params = [{"name": "Snowflake stage", "id": "stage"}]

        for param in mandatory_params:
            if param['id'] not in self.config or not self.config.get(
                    param['id']):
                logging.error(
                    'The mandatory param `%s` is missing or invalid to export dataset `%s.%s` to Snowflake stage',
                    param['name'], self.project_key, dataset_name)
                raise ValueError(f"The parameter '{param['name']}' is invalid")

        fully_qualified_stage_name = self.config.get('stage')

        output_path = (self.config.get('path')
                       or self.project_key).strip(' ').strip('/')
        destination = os.path.join(output_path, dataset_name)

        file_format_param = self.config.get('file_format') or 'default'
        file_format = '' if file_format_param == 'default' else f"FILE_FORMAT = (FORMAT_NAME = {file_format_param})"

        overwrite = 'OVERWRITE = TRUE' if self.config.get("overwrite") else ''

        sql_copy_query = f"COPY INTO @{fully_qualified_stage_name}/{destination} FROM {resolve_table_name(dataset_connection_info)} {file_format} {overwrite}"

        logging.info("Exporting dataset `%s.%s` with the copy command: `%s`",
                     self.project_key, dataset_name, sql_copy_query)

        executor = SQLExecutor2(dataset=dataset_name)
        executor.query_to_df(sql_copy_query)

        logging.info(
            f"Successfully exported dataset `{self.project_key}.{dataset_name}` in Snowflake stage `{fully_qualified_stage_name}` to `{destination}`"
        )

        return success(
            'The dataset has been successfully exported in stage <strong>%s</strong> to <strong>%s_*</strong>'
            % (fully_qualified_stage_name.replace('"', ''), destination))
Example #19
def main(model_folder,
         input_dataset_smartname,
         output_dataset_smartname,
         metrics_dataset_smartname,
         recipe_desc,
         script,
         preparation_output_schema,
         cond_outputs=None):
    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" %
                 preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"],
                                        preparation_output_schema)

    core_params = dkujson.load_from_filepath(
        osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rmodeling_params.json"))
    collector_data = dkujson.load_from_filepath(
        osp.join(model_folder, "collector_data.json"))

    preprocessing_handler = PreprocessingHandler.build(core_params,
                                                       preprocessing_params,
                                                       model_folder)
    preprocessing_handler.collector_data = collector_data

    pipeline = preprocessing_handler.build_preprocessing_pipeline(
        with_target=True)

    with open(osp.join(model_folder, "clf.pkl"), "rb") as f:
        clf = pickle.load(f)

    logging.info("Scoring data")

    (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
        preparation_output_schema["columns"],
        parse_dates=True,
        infer_with_pandas=False)
    logging.info("Reading with INITIAL dtypes: %s" % dtypes)
    dtypes = utils.ml_dtypes_from_dss_schema(
        preparation_output_schema,
        preprocessing_params["per_feature"],
        prediction_type=core_params["prediction_type"])
    logging.info("Reading with dtypes: %s" % dtypes)

    for i in range(0, len(names)):
        logging.info("Column %s = %s (dtype=%s)" %
                     (i, names[i], dtypes.get(names[i], None)))

    with input_dataset._stream(infer_with_pandas=True,
                               sampling='head',
                               sampling_column=None,
                               limit=None,
                               ratio=None,
                               columns=names) as stream:
        input_df = pd.read_table(stream,
                                 names=names,
                                 dtype=dtypes,
                                 header=None,
                                 sep='\t',
                                 doublequote=True,
                                 quotechar='"',
                                 parse_dates=parse_date_columns,
                                 float_precision="round_trip")

    input_df_orig = input_df.copy()
    logging.info("Got a dataframe : %s" % str(input_df.shape))
    normalize_dataframe(input_df, preprocessing_params['per_feature'])

    for col in input_df:
        logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

    logging.info("Processing it")
    transformed = pipeline.process(input_df)
    logging.info("Predicting it")

    if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
        pred_df = binary_classification_predict(
            clf,
            pipeline,
            modeling_params,
            preprocessing_params,
            preprocessing_handler.target_map,
            recipe_desc["forcedClassifierThreshold"],
            input_df,
            output_probas=recipe_desc["outputProbabilities"],
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)

        # Probability percentile & Conditional outputs
        has_cond_output = recipe_desc["outputProbabilities"] and cond_outputs
        has_percentiles = recipe_desc["outputProbaPercentiles"] or (
            has_cond_output and len([
                co for co in cond_outputs if co["input"] == "proba_percentile"
            ]))
        if has_percentiles:
            model_perf = dkujson.load_from_filepath(
                osp.join(model_folder, "perf.json"))
            if model_perf.get("probaPercentiles"):
                percentile = pd.Series(model_perf["probaPercentiles"])
                proba_1 = "proba_" + str(
                    next(k for k, v in preprocessing_handler.target_map.items()
                         if v == 1))
                pred_df["proba_percentile"] = pred_df[proba_1].apply(
                    lambda p: percentile.where(percentile <= p).count() + 1)
            else:
                raise Exception(
                    "Probability percentiles are missing from model.")
        if has_cond_output:
            for co in cond_outputs:
                inp = pred_df[co["input"]]
                acc = inp.notnull()  # condition accumulator
                for r in co["rules"]:
                    if r["operation"] == 'GT':
                        cond = inp > r["operand"]
                    elif r["operation"] == 'GE':
                        cond = inp >= r["operand"]
                    elif r["operation"] == 'LT':
                        cond = inp < r["operand"]
                    elif r["operation"] == 'LE':
                        cond = inp <= r["operand"]
                    pred_df.loc[acc & cond, co["name"]] = r["output"]
                    acc = acc & (~cond)
                pred_df.loc[acc, co["name"]] = co.get("defaultOutput", "")
        if has_percentiles and not recipe_desc[
                "outputProbaPercentiles"]:  # was only for conditional outputs
            pred_df.drop("proba_percentile", axis=1, inplace=True)

    elif core_params["prediction_type"] == constants.MULTICLASS:
        pred_df = multiclass_predict(
            clf,
            pipeline,
            modeling_params,
            preprocessing_params,
            preprocessing_handler.target_map,
            input_df,
            output_probas=recipe_desc["outputProbabilities"],
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)

    elif core_params["prediction_type"] == constants.REGRESSION:
        pred_df = regression_predict(
            clf,
            pipeline,
            modeling_params,
            input_df,
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)
    else:
        raise ValueError("bad prediction type %s" %
                         core_params["prediction_type"])

    # add error information to pred_df
    y = transformed["target"]
    target_mapping = {}
    if core_params["prediction_type"] in [
            constants.BINARY_CLASSIFICATION, constants.MULTICLASS
    ]:
        target_mapping = {
            label: int(class_id)
            for label, class_id in preprocessing_handler.target_map.items()
        }

    pred_df = add_evaluation_columns(core_params["prediction_type"], pred_df,
                                     y, target_mapping)

    logging.info("Done predicting it")
    if recipe_desc.get("filterInputColumns", False):
        clean_kept_columns = [
            c for c in recipe_desc["keptInputColumns"]
            if c not in pred_df.columns
        ]
    else:
        clean_kept_columns = [
            c for c in input_df_orig.columns if c not in pred_df.columns
        ]
    output_df = pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)

    # write scored data
    output_dataset = dataiku.Dataset(output_dataset_smartname)
    #logging.info("writing scored schema")
    #output_dataset.write_schema_from_dataframe(output_df)  # backend should do this
    logging.info("writing scored data")
    output_dataset.write_from_dataframe(output_df)

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {
        "SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"
    }
    if with_sample_weight:
        sample_weight = transformed["weight"]
    else:
        sample_weight = None

    metrics_df = compute_metrics_df(core_params["prediction_type"],
                                    target_mapping, modeling_params, output_df,
                                    recipe_desc, y, transformed["UNPROCESSED"],
                                    sample_weight)

    # write metrics dataset
    if metrics_dataset_smartname:
        metrics_dataset = dataiku.Dataset(metrics_dataset_smartname)
        #logging.info("writing metrics schema")
        #metrics_dataset.write_schema_from_dataframe(metrics_df)  # backend should maybe do this ?
        logging.info("writing metrics data")
        metrics_dataset.write_from_dataframe(metrics_df)
Example #20
def scrape(page_num):
    url = LISTING + str(page_num).strip()
    # load page content
    page = urllib.request.urlopen(url)
    # find the interesting data with html
    soup = BeautifulSoup(page)
    result = soup.find('div', {'id': 'places'}).find('form')['data-results']
    listing = json.loads(result)
    # keep the data in RESULTS
    data = listing['results']
    for e in data:
        RESULTS.append(e)


for p in range(1, NPAGES + 1):
    print("Crawling page", p)
    scrape(p)

print("Crawled %i places" % len(RESULTS))

# Write in a dataset
site_data = Dataset("__FIRST_OUTPUT__")

schema = [{'name': 'key', 'type': 'int'}, {'name': 'data', 'type': 'string'}]

site_data.write_schema(schema)

# Use the writer as a context manager so it is closed and the rows are committed
with site_data.get_writer() as writer:
    for i, e in enumerate(RESULTS):
        writer.write_tuple([i, json.dumps(e)])
Example #21
    def output_generator():
        logging.info("Start output generator ...")

        (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            preparation_output_schema["columns"],
            parse_dates=True,
            infer_with_pandas=False)
        logging.info("Reading with INITIAL dtypes: %s" % dtypes)
        dtypes = utils.ml_dtypes_from_dss_schema(
            preparation_output_schema,
            preprocessing_params["per_feature"],
            prediction_type=core_params["prediction_type"])
        logging.info("Reading with dtypes: %s" % dtypes)

        for i in range(0, len(names)):
            logging.info("Column %s = %s (dtype=%s)" %
                         (i, names[i], dtypes.get(names[i], None)))

        for input_df in input_dataset.iter_dataframes_forced_types(
                names,
                dtypes,
                parse_date_columns,
                chunksize=batch_size,
                float_precision="round_trip"):
            input_df.index = range(input_df.shape[0])
            input_df_orig = input_df.copy()
            logging.info("Got a dataframe : %s" % str(input_df.shape))
            normalize_dataframe(input_df, preprocessing_params['per_feature'])

            for col in input_df:
                logging.info("NORMALIZED: %s -> %s" %
                             (col, input_df[col].dtype))

            logging.info("Processing it")
            logging.info("Predicting it")

            if core_params[
                    "prediction_type"] == constants.BINARY_CLASSIFICATION:
                pred_df = binary_classification_predict(
                    clf,
                    pipeline,
                    modeling_params,
                    preprocessing_params,
                    preprocessing_handler.target_map,
                    recipe_desc["forcedClassifierThreshold"],
                    input_df,
                    output_probas=recipe_desc["outputProbabilities"])
                # Probability percentile & Conditional outputs
                pred_df = binary_classif_scoring_add_percentile_and_cond_outputs(
                    pred_df, recipe_desc, model_folder, cond_outputs,
                    preprocessing_handler.target_map)

            elif core_params["prediction_type"] == constants.MULTICLASS:
                pred_df = multiclass_predict(
                    clf,
                    pipeline,
                    modeling_params,
                    preprocessing_params,
                    preprocessing_handler.target_map,
                    input_df,
                    output_probas=recipe_desc["outputProbabilities"])

            elif core_params["prediction_type"] == constants.REGRESSION:
                pred_df = regression_predict(clf, pipeline, modeling_params,
                                             input_df)

            else:
                raise ValueError("bad prediction type %s" %
                                 core_params["prediction_type"])

            logging.info("pred df debug :")
            logging.info(pred_df)

            logging.info("Done predicting it")
            if recipe_desc.get("filterInputColumns", False):
                clean_kept_columns = [
                    c for c in recipe_desc["keptInputColumns"]
                    if c not in pred_df.columns
                ]
            else:
                clean_kept_columns = [
                    c for c in input_df_orig.columns
                    if c not in pred_df.columns
                ]
            yield pd.concat([input_df_orig[clean_kept_columns], pred_df],
                            axis=1)