Example #1
def stub(stage, test_df=None):
    # type: (str, pd.DataFrame) -> None
    # if random.randint(0, 1):
    #     raise Exception("brrrr")

    log_metric(stage, utcnow())
    log_dataframe("df_" + stage, test_df)
def train_model_spark(
    test_set: parameter(log_histograms=True)[spark.DataFrame],
    training_set: spark.DataFrame,
    alpha: float = 1.0,
    l1_ratio: float = 0.5,
    saved_model=model_output_parameter,
) -> str:
    transform = VectorAssembler(inputCols=SELECTED_FEATURES,
                                outputCol="features")
    lr = LogisticRegression(
        featuresCol="features",
        labelCol=LABEL_COLUMN,
        regParam=l1_ratio,
        elasticNetParam=alpha,
        family="multinomial",
        maxIter=1,
    )
    ppl = Pipeline(stages=[transform, lr])

    # Fit the pipeline to training documents.
    model = ppl.fit(training_set)

    prediction = model.transform(test_set)
    evaluation = prediction.withColumn(
        "label", prediction["score_label"].cast(DoubleType())
    ).select(["label", "prediction"])
    evaluation.show()
    metrics = RegressionMetrics(evaluation.rdd)

    log_metric("r2", metrics.r2)
    log_metric("alpha", alpha)

    path = str(saved_model)
    model.write().save(path)
    return path
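
For context, a minimal sketch (not taken from the examples above) of how functions like these are typically declared and composed with dbnd's @task and @pipeline decorators; the task names and data below are hypothetical.

import pandas as pd
from dbnd import log_metric, pipeline, task


@task
def ingest() -> pd.DataFrame:
    # hypothetical toy data standing in for a real input
    df = pd.DataFrame({"feature": [1, 2, 3], "target": [0, 1, 0]})
    log_metric("rows", len(df))
    return df


@task
def summarize(data: pd.DataFrame) -> float:
    mean_target = float(data["target"].mean())
    log_metric("target.mean", mean_target)
    return mean_target


@pipeline
def demo_pipeline():
    data = ingest()
    return summarize(data)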
Example #3
def bash_script(
    script=None,
    check_retcode=0,
    cwd=None,
    env=None,
    dbnd_env=True,
    output_encoding="utf-8",
    popen_kwargs=None,
):
    # type: (str, Optional[int], str, Dict[str, str], bool, str, Dict[str, Any]) -> int

    # we need a working folder to create bash script
    task_run = try_get_current_task_run()
    if task_run:
        script_dir = str(task_run.local_task_run_root)
    else:
        script_dir = None

    bash_script_path = os.path.join(script_dir, "bash_cmd.sh")
    with open(bash_script_path, "wb") as bs:
        bs.write(bytes(script, "utf_8"))

    log_metric("bash_script", bash_script_path)

    logger.info("Bash script location: %s", bash_script_path)
    args = ["bash", bash_script_path]
    return bash_cmd.func(
        args=args,
        check_retcode=check_retcode,
        cwd=cwd or script_dir,
        env=env,
        dbnd_env=dbnd_env,
        output_encoding=output_encoding,
        popen_kwargs=popen_kwargs,
    )
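
A minimal usage sketch of the helper above; the command is hypothetical, and it assumes the call happens inside an active dbnd task run (otherwise script_dir stays None and the os.path.join call fails).

# hypothetical command; run inside a dbnd task so a working folder exists
retcode = bash_script(script="echo 'hello from dbnd'\n", check_retcode=0)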
Example #4
def convert_sparse_matrix(df, nb_rows, nb_customer, nb_products):
    # dataframe to array
    df_val = df.values

    # determine feature size
    nb_cols = nb_customer + nb_products
    log_metric("# of rows", nb_rows)
    log_metric("# of cols", nb_cols)

    # extract customers and ratings
    df_X = df_val[:, 0:2]
    # Features are one-hot encoded in a sparse matrix
    X = lil_matrix((nb_rows, nb_cols)).astype("float32")
    df_X[:, 1] = nb_customer + df_X[:, 1]
    coords = df_X[:, 0:2]
    X[np.arange(nb_rows), coords[:, 0]] = 1
    X[np.arange(nb_rows), coords[:, 1]] = 1

    # create label with ratings
    Y = df_val[:, 2].astype("float32")

    # validate size and shape

    logger.info("Shape of X: %s ", str(X.shape))
    logger.info("Shape of Y: %s", str(Y.shape))
    assert X.shape == (nb_rows, nb_cols)
    assert Y.shape == (nb_rows, )

    return X, Y
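
A tiny worked example with hypothetical data: each row gets a 1 at its customer index and a 1 at nb_customer + product index, and the rating becomes the label.

import pandas as pd

# hypothetical ratings table: columns must be (customer, product, rating)
demo = pd.DataFrame(
    {"customer": [0, 1, 2], "product": [0, 1, 0], "rating": [5, 3, 4]}
)
X, Y = convert_sparse_matrix(demo, nb_rows=3, nb_customer=3, nb_products=2)
print(X.toarray())  # 3 x 5 sparse matrix with two 1s per row
print(Y)            # array([5., 3., 4.], dtype=float32)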
Example #5
def compare_metrics(**context):
    '''
    Compare metrics passed through XCom. This task logs the following metrics:
    - largest bucket by number of objects
    - largest bucket by memory size
    '''
    bucket_names = context['bucket_names']
    task_ids = ["{}_monitor".format(bucket_name) for bucket_name in bucket_names]
    log_metric("task_ids", task_ids)

    # extract the metric dictionaries from xcom and join them by bucket name
    aggregated_metrics = {"buckets": bucket_names}
    for task_id in task_ids:
        task_metric = context["ti"].xcom_pull(task_ids=task_id)
        for task_metric_name, task_metric_value in task_metric.items():
            metric_name = task_metric_name.split('-')[-1]
            if metric_name in aggregated_metrics:
                aggregated_metrics[metric_name].append(task_metric_value)
            else:
                aggregated_metrics[metric_name] = [task_metric_value]

    # log interesting metrics
    largest_num_objs = max(aggregated_metrics['number_of_objects'])
    largest_bucket_by_obj = aggregated_metrics['buckets'][aggregated_metrics['number_of_objects'].index(largest_num_objs)]
    log_metric("largest_obj_count", largest_num_objs)
    log_metric("largest_bucket_by_obj_count", largest_bucket_by_obj)

    largest_bucket_mem = max(aggregated_metrics['total_bucket_size'])
    largest_bucket_by_mem = aggregated_metrics['buckets'][aggregated_metrics['total_bucket_size'].index(largest_bucket_mem)]
    log_metric("largest_bucket_by_memory", largest_bucket_by_mem)
    log_metric("largest_memory_consumed", largest_bucket_mem)
def prepare_data() -> Tuple[DataFrame, DataFrame]:
    """load dataset from sklearn. split into training and testing sets"""
    raw_data = datasets.load_diabetes()

    # create a pandas DataFrame from sklearn dataset
    df = DataFrame(raw_data["data"], columns=raw_data["feature_names"])
    df["target"] = Series(raw_data["target"])

    # split the data into training and testing sets
    training_data, testing_data = train_test_split(df, test_size=0.25)

    # use DBND logging features to log DataFrames with histograms
    log_dataframe(
        "training data",
        training_data,
        with_histograms=True,
        with_schema=True,
        with_stats=True,
    )
    log_dataframe("testing_data", testing_data)

    # use DBND logging features to log the mean of s1
    log_metric("mean s1", training_data["s1"].mean())

    return training_data, testing_data
Example #7
from contextlib import contextmanager


@contextmanager
def target_timeit_log(target, operation):
    from dbnd import log_metric

    log_level = get_target_logging_level()

    path = target
    if hasattr(target, "target"):
        path = target.target
    if hasattr(target, "path"):
        path = target.path

    name = target.source.name if target.source else path

    start_time = time.time()
    try:
        yield
    finally:
        end_time = time.time()
        delta_ms = (end_time - start_time) * 1000
        logger.log(
            level=log_level,
            msg="Total {} time for target {} is {} milliseconds".format(
                operation, path, delta_ms),
        )
        log_metric("marshalling_{}".format(name), delta_ms, source="system")
Example #8
def train_model_spark(
    test_set: spark.DataFrame,
    training_set: spark.DataFrame,
    alpha: float = 1.0,
    l1_ratio: float = 0.5,
    saved_model=parameter.output.folder_data.with_flag(None)[PathStr],
) -> str:

    transform = VectorAssembler(inputCols=["0", "1", "2"],
                                outputCol="features")
    lr = LogisticRegression(
        featuresCol="features",
        labelCol="target",
        regParam=l1_ratio,
        elasticNetParam=alpha,
        family="multinomial",
        maxIter=1,
    )
    ppl = Pipeline(stages=[transform, lr])

    # Fit the pipeline to training documents.
    model = ppl.fit(training_set)

    prediction = model.transform(test_set)
    evaluation = prediction.withColumn(
        "label", prediction["target"].cast(DoubleType())
    ).select(["label", "prediction"])
    evaluation.show()
    metrics = RegressionMetrics(evaluation.rdd)

    log_metric("r2", metrics.r2)
    log_metric("alpha", alpha)

    model.write().save(str(saved_model))
    return "ok"
Example #9
def unit_imputation(raw_data: DataFrame,
                    columns_to_impute=["10"],
                    value=0) -> DataFrame:
    counter = int(raw_data.describe().first().phone)
    noise = randint(-counter, counter)
    log_metric("Replaced NaNs", counter + noise)
    return raw_data.na.fill(value, columns_to_impute)
Example #10
def create_report(data: pd.DataFrame) -> pd.DataFrame:
    avg_score = int(data["score_label"].sum() +
                    randint(-2 * len(data.columns), 2 * len(data.columns)))
    log_metric("Column Count", len(data.columns))
    log_metric("Avg Score", avg_score)
    log_dataframe("ready_data", data, with_histograms=True)
    return pd.DataFrame(data=[[avg_score]], columns=["avg_score"])
Example #11
def split_data_spark(
    raw_data: spark.DataFrame
) -> Tuple[spark.DataFrame, spark.DataFrame, spark.DataFrame]:

    columns_to_remove = set(["id", "0_norm", "10_norm"])
    if columns_to_remove.issubset(raw_data.schema.names):
        # Spark's drop() takes column names as separate arguments
        raw_data = raw_data.drop(*columns_to_remove)

    (train, test) = raw_data.randomSplit([0.8, 0.2])
    # split the held-out data into test and validation halves
    (test, validation) = test.randomSplit([0.5, 0.5])

    target_stats = raw_data.describe(["target"])

    log_metric(
        "target.mean",
        target_stats.filter(target_stats["summary"] == "mean")
        .collect()[0]
        .asDict()["target"],
    )
    log_metric(
        "target.std",
        target_stats.filter(target_stats["summary"] == "stddev")
        .collect()[0]
        .asDict()["target"],
    )

    return train, test, validation
Example #12
def unit_imputations(raw_data: DataFrame, value: int) -> DataFrame:
    counter = int(raw_data.describe().first().phone)
    noise = random.randint(-counter, counter)

    log_metric("Replaced NaNs", counter + noise)

    return raw_data.na.fill(value)
Example #13
def log_operator_result(task_run, result, operator, track_xcom):
    _log_result(task_run, result)

    # after airflow runs the operator it xcom_push the result, so we log it
    if track_xcom and operator.do_xcom_push and result is not None:
        from airflow.models import XCOM_RETURN_KEY

        log_metric(key=XCOM_RETURN_KEY, value=result)
Example #14
def word_count(text, factor=1):
    # type: (DataList[str], int) -> int
    log_metric("input", len(text))
    logger.info("Factor: %s", factor)

    result = Counter()
    for line in text:
        result.update(line.split() * factor)
    return sum(result.values())
Example #15
def generate_report():
    spark = SparkSession.builder.appName("benchmark-pyspark").getOrCreate()

    df = spark.read.csv("./datasets/backblaze-data-01gb",
                        inferSchema=True,
                        header=True,
                        sep=",")
    df.select("serial_number").show(10)

    log_metric("records_count", 1)
Example #16
def t_f_a(t_input, t_param, t_default="d1"):
    # type: (DataList[str], str, str) -> DataList[str]
    # adds dressing
    assert t_default == "d1"
    assert t_param == "d2"

    log_metric("t_input", len(t_input))

    logger.info("Got string: %s", t_input)
    return t_input[:2]
Example #17
def train_model(training_data: DataFrame) -> LinearRegression:
    """ train a linear regression model """
    model = LinearRegression()

    # train a linear regression model
    model.fit(training_data.drop("target", axis=1), training_data["target"])

    # use DBND log crucial details about the regression model with log_metric:
    log_metric("model intercept", model.intercept_)  # logging a numeric
    log_metric("coefficients", model.coef_)  # logging an np array
    return model
Example #18
def create_report(data: DataFrame) -> DataFrame:
    log_metric("Column Count", len(data.columns))
    log_metric(
        "Avg Score",
        int(
            data.agg({"score": "sum"}).collect()[0][0]
            + randint(-2 * len(data.columns), 2 * len(data.columns))
        ),
    )
    return data
Example #19
def fake_task_inside_dag():
    log_metric("Testing", "Metric")
    run = try_get_databand_run()
    assert run is not None, "Task should run in databand run, check airflow tracking!"
    root_task = run.root_task

    # Validate regular subdag properties
    assert run.job_name == "%s.%s" % (PARENT_DAG, CHILD_DAG)
    # this test is problematic because the airflow_inplace task is named after the script that ran it
    assert root_task.task_name

    return "Regular test"
Example #20
def fake_task_inside_dag():
    log_metric("Testing", "Metric")
    run = try_get_databand_run()
    assert run is not None, "Task should run in databand run, check airflow tracking!"
    root_task = run.root_task

    # Validate regular subdag properties
    assert run.job_name == "%s.%s.fake_task_inside_dag" % (PARENT_DAG,
                                                           CHILD_DAG)
    assert root_task.task_name == "fake_task_inside_dag__execute"

    return "Regular test"
Example #21
def task_pass_through_default(data, dt, expect_pass_through):
    # type: (pd.DataFrame, datetime.datetime, bool) -> str
    # print needed to test that log is sent
    print("hello task_pass_through_default")
    if expect_pass_through:
        assert isinstance(data, str)
        assert isinstance(dt, str)
    else:
        assert isinstance(data, pd.DataFrame)
        assert isinstance(dt, datetime.datetime)
    log_metric("data", data)
    return str(data)
Example #22
def dbnd_status():
    report = DoctorStatusReportBuilder("Databand Status")

    report.log("env.DBND_HOME", os.environ.get("DBND_HOME"))
    dc = try_get_databand_context()
    report.log("DatabandContext", dc)
    if dc:
        report.log("initialized", dc)

    # log a couple of sample metrics
    log_metric("metric_check", "OK")
    log_metric("metric_random_value", random.random())
    return report.get_status_str()
def prepare_data(data, output_file):
    spark = SparkSession.builder.appName("PythonWordCount").getOrCreate()

    lines = spark.read.text(data).rdd.map(lambda r: r[0])
    counts = (
        lines.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1)).reduceByKey(add)
    )
    counts.saveAsTextFile(output_file)
    output = counts.collect()
    for (word, count) in output:
        print("%s: %i" % (word, count))
    log_metric("counts", len(output))
    spark.sparkContext.stop()
Example #24
def test_model(model: LinearRegression, testing_data: DataFrame) -> str:
    """ test the model, output mean squared error and r2 score """
    testing_x = testing_data.drop("target", axis=1)
    testing_y = testing_data["target"]
    predictions = model.predict(testing_x)
    mse = mean_squared_error(testing_y, predictions)
    r2_score = model.score(testing_x, testing_y)

    # use DBND log_metric to capture important model details:
    log_metric("mean squared error:", mse)
    log_metric("r2 score", r2_score)

    return f"MSE: {mse}, R2: {r2_score}"
Example #25
def clean_pii(data: pd.DataFrame,
              pii_columns: List[str],
              target_date: datetime.date = None) -> pd.DataFrame:
    # I am not sure about this code, but this might help
    if target_date and target_date >= datetime.date(2020, 7, 12):
        if "10" not in data.columns:
            log_metric("Fixed columns", ["10"])
            data["10"] = 0
    data[pii_columns] = data[pii_columns].apply(
        lambda x: x.apply(get_hash_for_obj), axis=1)
    log_metric("PII items removed:", len(pii_columns) * data.shape[0])
    log_dataframe("pii_clean", data)
    return data
def train_model(
    test_set: DataFrame,
    training_set: DataFrame,
    alpha: float = 0.5,
    l1_ratio: float = 0.5,
) -> ElasticNet:
    """ Train wine prediction model """
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    lr.fit(training_set.drop(["quality"], axis=1), training_set[["quality"]])
    prediction = lr.predict(test_set.drop(["quality"], axis=1))

    (rmse, mae, r2) = calculate_metrics(test_set[["quality"]], prediction)

    log_metric("alpha", alpha)
    log_metric("rmse", rmse)
    log_metric("mae", mae)
    log_metric("r2", r2)

    logging.info(
        "Elasticnet model (alpha=%f, l1_ratio=%f): rmse = %f, mae = %f, r2 = %f",
        alpha,
        l1_ratio,
        rmse,
        mae,
        r2,
    )
    return lr
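
calculate_metrics is referenced above but not shown; a plausible implementation, assuming the standard sklearn regression metrics (an assumption, not the original helper).

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def calculate_metrics(actual, predicted):
    # root mean squared error, mean absolute error and R^2 for a regression
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    return rmse, mae, r2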
Example #27
    def run(self):
        """
        Generates bogus data and writes it into the :py:meth:`~.Streams.output` target.
        """
        logger.warning("Hey, this is streams task!")

        with self.output().open("w") as output:
            for _ in range(1000):
                output.write("{} {} {}\n".format(
                    random.randint(0, 999),
                    get_random_name(),
                    random.randint(0, 999),
                ))
            log_metric("lines", 1000)
Example #28
    def run(self):
        from dbnd import log_metric

        logger.warning("Hey, this is top artists task!")
        top_10 = nlargest(10, self._input_iterator())
        log_metric(key="Top10Artists", value=str(top_10))
        with self.output().open("w") as out_file:
            for streams, artist in top_10:
                out_line = "\t".join([
                    str(self.date_interval.date_a),
                    str(self.date_interval.date_b),
                    artist,
                    str(streams),
                ])
                out_file.write((out_line + "\n"))
Example #29
def split_data(
    raw_data: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    columns_to_remove = set(["id", "0_norm", "10_norm"])
    if columns_to_remove.issubset(raw_data.columns):
        raw_data.drop(columns_to_remove, axis=1, inplace=True)

    train_df, test_df = train_test_split(raw_data)
    test_df, validation_df = train_test_split(test_df, test_size=0.5)

    log_dataframe("raw", raw_data)
    log_metric("target.mean", raw_data["target"].mean())
    log_metric("target.std", raw_data["target"].std())

    return train_df, test_df, validation_df
Example #30
def prepare(
    train: pd.DataFrame,
    test: pd.DataFrame,
    validation: pd.DataFrame,
    train_out=output[Target],
    test_out=output.folder_data.with_flag(None)[Target],
    validation_out=output[Target],
):
    """Prepare data for training with Sagemaker algorithms

    - Read preprocessed data and converts to ProtoBuf format to prepare for
      training with Sagemaker algorithms

    Args:
        test - dataframe with test data
        train  - dataframe with train data
        validation - dataframe with validation data
        train_out - output path for train data in protobuf format
        test_out - output path for test data in protobuf format
        validation_out - output path for validation data in protobuf format
    Returns:
        s3 url with key to the prepared data
    """
    all_df = pd.concat([train, validation, test])
    nb_customer = np.unique(all_df["customer"].values).shape[0]
    nb_products = np.unique(all_df["product"].values).shape[0]
    feature_dim = nb_customer + nb_products
    log_metric("customers x products x feature_dim",
               (nb_customer, nb_products, feature_dim))

    train_X, train_Y = convert_sparse_matrix(train, train.shape[0],
                                             nb_customer, nb_products)
    validate_X, validate_Y = convert_sparse_matrix(validation,
                                                   validation.shape[0],
                                                   nb_customer, nb_products)
    test_X, test_Y = convert_sparse_matrix(test, test.shape[0], nb_customer,
                                           nb_products)

    save_as_protobuf(train_X, train_Y, train_out)
    save_as_protobuf(validate_X, validate_Y, validation_out)

    test_x_chunks = chunk(test_X, 10000)
    test_y_chunks = chunk(test_Y, 10000)
    N = len(test_x_chunks)
    for i in range(N):
        save_as_protobuf(test_x_chunks[i], test_y_chunks[i],
                         test_out.partition())
    return "OK"