Example No. 1
def create_report(data: pd.DataFrame) -> pd.DataFrame:
    avg_score = int(data["score_label"].sum() +
                    randint(-2 * len(data.columns), 2 * len(data.columns)))
    log_metric("Column Count", len(data.columns))
    log_metric("Avg Score", avg_score)
    log_dataframe("ready_data", data, with_histograms=True)
    return pd.DataFrame(data=[[avg_score]], columns=["avg_score"])
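
A minimal, self-contained sketch of the dbnd logging calls that all of the examples in this section rely on; the toy DataFrame, column names and metric names are made up for illustration.

# Toy illustration of dbnd's log_metric / log_dataframe (requires the dbnd package).
import pandas as pd
from dbnd import log_dataframe, log_metric

toy = pd.DataFrame({"score_label": [3, 5, 4], "partner": [1, 1, 2]})
log_metric("row count", toy.shape[0])
log_dataframe("toy_data", toy, with_histograms=True)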
Example No. 2
def monitor_redshift_table(**op_kwarg):
    """Redshift table monitor collects the following metrics:
    - record count
    - duplicate records
    - Null/NaN record counts in each column
    - mean, median, min, max, std of each numeric column
    """

    hook = PostgresHook(REDSHIFT_CONNECTION_ID)
    data = hook.get_pandas_df(SELECT_DATA,
                              parameters=[REDSHIFT_MONITOR_TABLE_LIMIT])

    log_dataframe(
        "{}".format(REDSHIFT_TABLE),
        data,
        with_histograms=True,
        with_stats=True,
        with_schema=True,
    )

    log_metric("record count", data.shape[0])
    log_metric("Duplicate records",
               data.shape[0] - data.drop_duplicates().shape[0])
    for column in data.columns:
        log_metric("{} null record count".format(column),
                   int(data[column].isna().sum()))

        if issubdtype(data[column].dtype, number):
            log_metric("{} mean".format(column), round(data[column].mean(), 2))
            log_metric("{} median".format(column), data[column].median())
            log_metric("{} min".format(column), data[column].min())
            log_metric("{} max".format(column), data[column].max())
            log_metric("{} std".format(column), round(data[column].std(), 2))
Example No. 3
def stub(stage, test_df=None):
    # type: (str, pd.DataFrame) -> None
    # if random.randint(0, 1):
    #     raise Exception("brrrr")

    log_metric(stage, utcnow())
    log_dataframe("df_" + stage, test_df)
Example No. 4
def validate_model_for_customer(
    model: ElasticNet,
    validation_dataset: pd.DataFrame,
    threshold=0.2,
    target_date: datetime.date = None,
) -> Tuple[str, figure.Figure]:
    log_dataframe("validation", validation_dataset)

    # support for py3 parquet
    validation_dataset = validation_dataset.rename(str, axis="columns")
    validation_x = validation_dataset.drop([TARGET_LABEL], axis=1)
    validation_y = validation_dataset[[TARGET_LABEL]]

    prediction = model.predict(validation_x)
    (rmse, mae, r2) = calculate_metrics(validation_y,
                                        prediction,
                                        target_date=target_date,
                                        additional_name="_validate")

    fig = create_scatter_plot(validation_y, prediction)
    # if r2 < threshold:
    #     raise Exception(
    #         "Model quality is below threshold. Got R2 equal to %s, expect at least %s"
    #         % (r2, threshold)
    #     )

    return "%s,%s,%s" % (rmse, mae, r2), fig
Example No. 5
def prepare_data() -> Tuple[DataFrame, DataFrame]:
    """load dataset from sklearn. split into training and testing sets"""
    raw_data = datasets.load_diabetes()

    # create a pandas DataFrame from sklearn dataset
    df = DataFrame(raw_data["data"], columns=raw_data["feature_names"])
    df["target"] = Series(raw_data["target"])

    # split the data into training and testing sets
    training_data, testing_data = train_test_split(df, test_size=0.25)

    # use DBND logging features to log DataFrames with histograms
    log_dataframe(
        "training data",
        training_data,
        with_histograms=True,
        with_schema=True,
        with_stats=True,
    )
    log_dataframe("testing_data", testing_data)

    # use DBND logging features to log the mean of s1
    log_metric("mean s1", training_data["s1"].mean())

    return training_data, testing_data
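
This example likewise omits its imports; a sketch of the header it appears to assume (the exact module layout is a guess):

# Imports this example appears to rely on.
from typing import Tuple

from pandas import DataFrame, Series
from sklearn import datasets
from sklearn.model_selection import train_test_split

from dbnd import log_dataframe, log_metric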
Example No. 6
def histogram_test(input_file, app_name, stats):
    execution_time = None
    if stats:
        app_name += "_with_stats"
    app_name += "-" + os.path.basename(input_file)
    spark = SparkSession.builder.appName(app_name).getOrCreate()

    try:
        if input_file.endswith(".csv"):
            df = spark.read.csv(input_file,
                                inferSchema=True,
                                header=True,
                                sep=",")
        elif input_file.endswith(".parquet"):
            df = spark.read.parquet(input_file)
        else:
            print("not supported file type: {}".format(input_file))
            return

        start_time = time.time()
        log_dataframe("df", df, with_histograms=True, with_stats=stats)
        execution_time = time.time() - start_time
    finally:
        spark.stop()
        create_test_report(input_file, app_name, execution_time)
Example No. 7
def run_create_report(input_path, output_path):
    data = pd.read_csv(input_path)
    log_dataframe(
        "data",
        data,
        path=input_path,
        with_histograms=True,
        operation_type=DbndTargetOperationType.write,
    )
    create_report(data).to_csv(output_path, index=False)
    return output_path
Example No. 8
def split_data(
    raw_data: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    raw_data.drop(["id", "1_norm", "10_norm"],
                  axis=1,
                  inplace=True,
                  errors="ignore")
    log_dataframe("raw", raw_data, with_histograms=True)
    train_df, test_df = train_test_split(raw_data)
    test_df, validation_df = train_test_split(test_df, test_size=0.5)

    return train_df, test_df, validation_df
Example No. 9
def clean_pii(data: pd.DataFrame,
              pii_columns: List[str],
              target_date: datetime.date = None) -> pd.DataFrame:
    # I am not sure about this code, but this might help
    if target_date and target_date >= datetime.date(2020, 7, 12):
        if "10" not in data.columns:
            log_metric("Fixed columns", ["10"])
            data["10"] = 0
    data[pii_columns] = data[pii_columns].apply(
        lambda x: x.apply(get_hash_for_obj), axis=1)
    log_metric("PII items removed:", len(pii_columns) * data.shape[0])
    log_dataframe("pii_clean", data)
    return data
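
clean_pii calls a get_hash_for_obj helper that is not shown. A minimal stand-in, purely for illustration and not the original implementation, could be:

# Hypothetical stand-in for get_hash_for_obj: hash the value's string form.
import hashlib

def get_hash_for_obj(obj):
    return hashlib.sha256(str(obj).encode("utf-8")).hexdigest()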
Example No. 10
def run_clean_piis(input_path, output_path, pii_columns, target_date_str=None):
    target_date = datetime.datetime.strptime(target_date_str, "%Y-%m-%d").date()
    data = pd.read_csv(input_path)
    log_dataframe(
        "data",
        data,
        path=input_path,
        with_histograms=True,
        operation_type=DbndTargetOperationType.read,
    )
    clean_pii(data=data, pii_columns=pii_columns, target_date=target_date).to_csv(
        output_path, index=False
    )
    return output_path
Example No. 11
def prepare_data(
        raw_data: DataFrame) -> Tuple[DataFrame, DataFrame, DataFrame]:
    """ Split data into train, test and validation """
    train_df, test_df = train_test_split(raw_data)
    test_df, validation_df = train_test_split(test_df, test_size=0.5)

    sys.stderr.write(
        "Running Prepare Data! You'll see this message in task log \n")
    print("..and this one..\n")
    logger.info("..and this one for sure!")

    log_dataframe("raw", raw_data)

    return train_df, test_df, validation_df
Example No. 12
def split_data(
    raw_data: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    columns_to_remove = set(["id", "0_norm", "10_norm"])
    if columns_to_remove.issubset(raw_data.columns):
        raw_data.drop(columns_to_remove, axis=1, inplace=True)

    train_df, test_df = train_test_split(raw_data)
    test_df, validation_df = train_test_split(test_df, test_size=0.5)

    log_dataframe("raw", raw_data)
    log_metric("target.mean", raw_data["target"].mean())
    log_metric("target.std", raw_data["target"].std())

    return train_df, test_df, validation_df
Example No. 13
def validate_model(model: ElasticNet, validation_dataset: DataFrame) -> str:
    log_dataframe("validation", validation_dataset)

    validation_x = validation_dataset.drop(["quality"], axis=1)
    validation_y = validation_dataset[["quality"]]

    prediction = model.predict(validation_x)
    rmse = np.sqrt(mean_squared_error(validation_y, prediction))
    mae = mean_absolute_error(validation_y, prediction)
    r2 = r2_score(validation_y, prediction)

    log_metric("rmse", rmse)
    log_metric("mae", rmse)
    log_metric("r2", r2)

    return "%s,%s,%s" % (rmse, mae, r2)
Example No. 14
def validate_model(model: ElasticNet, validation_dataset: DataFrame) -> str:
    """Calculates metrics of wine prediction model"""
    log_dataframe("validation", validation_dataset)
    # support for py3 parquet
    validation_dataset = validation_dataset.rename(str, axis="columns")
    validation_x = validation_dataset.drop(["quality"], axis=1)
    validation_y = validation_dataset[["quality"]]

    prediction = model.predict(validation_x)
    (rmse, mae, r2) = calculate_metrics(validation_y, prediction)

    log_metric("rmse", rmse)
    log_metric("mae", rmse)
    log_metric("r2", r2)

    return "%s,%s,%s" % (rmse, mae, r2)
Example No. 15
def word_count_inline(text=parameter.csv[spark.DataFrame],
                      counters=output.txt.data):
    # type:  (spark.DataFrame, Target) -> spark.DataFrame
    from operator import add
    from dbnd_spark.spark import get_spark_session

    lines = text.rdd.map(lambda r: r[0])
    counts = (lines.flatMap(lambda x: x.split(" ")).map(
        lambda x: (x, 1)).reduceByKey(add))
    counts.saveAsTextFile(str(counters))
    output = counts.collect()
    for (word, count) in output:
        print("%s: %i" % (word, count))

    counts_df = get_spark_session().createDataFrame(counts)
    log_dataframe("counts_df", counts_df)
    log_metric("test", 1)

    return counts_df
Example No. 16
    def run(self):
        validation = self.validation_dataset
        self.log_metric("test_size", len(validation))

        actual_model = self.model.read_pickle()
        logger.info("%s", validation.shape)

        validation_x = validation.drop(["quality"], axis=1)
        validation_y = validation[["quality"]]

        prediction = actual_model.predict(validation_x)
        (rmse, mae, r2) = calculate_metrics(validation_y, prediction)

        log_dataframe("validation", validation)
        log_metric("rmse", rmse)
        log_metric("mae", rmse)
        log_metric("r2", r2)

        self.model_metrics.write("%s,%s,%s" % (rmse, mae, r2))
Example No. 17
def word_count(input_path, output_path):
    spark = SparkSession.builder.appName("PythonWordCount").getOrCreate()

    lines = spark.read.text(input_path)

    check = Check(spark, CheckLevel.Warning, "Review Check")

    # the check should run before the analysis runner because it spins up the Java Gateway server
    check_result = (VerificationSuite(spark).onData(lines).addCheck(
        check.hasSize(lambda x: x >= 3)).run())

    # "name" will be used as prefix in metric key
    result_key = ResultKey(spark, ResultKey.current_milli_time(),
                           {"name": "words_df"})
    AnalysisRunner(spark).onData(lines).addAnalyzer(
        ApproxCountDistinct("value")).useRepository(
            DbndMetricsRepository(spark)).saveOrAppendResult(result_key).run()

    log_dataframe("lines", lines)
    lines = lines.rdd.map(lambda r: r[0])

    log_dataframe("lines_rdd", lines)
    counts = (lines.flatMap(lambda x: x.split(" ")).map(
        lambda x: (x, 1)).reduceByKey(add))
    # counts.saveAsTextFile(output_path)
    output = counts.collect()
    log_dataframe("output", output)
    for (word, count) in output:
        print("%s: %i" % (word, count))
    # this causes trouble on job submit on Databricks!
    # spark.close()
    # The Java gateway must be closed; otherwise the process won't quit.
    spark.sparkContext._gateway.close()
Example No. 18
def track_database():
    engine = create_engine(DB_CONNECTION)
    log_metric("query executed", QUERY)

    with engine.connect() as connection:
        result = connection.execute(QUERY)
        header = list(result.keys())
        data = result.fetchall()

    df = pd.DataFrame(data, columns=header)

    log_dataframe("DataFrame",
                  df,
                  with_histograms=True,
                  with_schema=True,
                  with_size=True,
                  with_stats=True,
                  with_preview=True)
    log_metric("row_count", df.shape[0])
    log_metric("column_count", df.shape[1])
Example No. 19
def validate_model_for_customer(model: ElasticNet,
                                validation_dataset: pd.DataFrame,
                                threshold=0.2) -> Tuple[str, figure.Figure]:
    log_dataframe("validation", validation_dataset)
    # support for py3 parquet
    validation_dataset = validation_dataset.rename(str, axis="columns")
    validation_x = validation_dataset.drop(["target"], axis=1)
    validation_y = validation_dataset[["target"]]

    prediction = model.predict(validation_x)
    (rmse, mae, r2) = calculate_metrics(validation_y, prediction)

    log_metric("rmse", rmse)
    log_metric("mae", mae)
    log_metric("r2", r2)
    fig = _create_scatter_plot(validation_y, prediction)
    if r2 < threshold:
        raise Exception(
            "Model quality is below threshold. Got R2 equal to %s, expect at least %s"
            % (r2, threshold))

    return "%s,%s,%s" % (rmse, mae, r2), fig
Example No. 20
def dedup_records(
    data: DataFrame,
    key_columns: list,
    to_pandas: bool,
    with_histograms: bool,
    sampling_type: str,
    sampling_fraction: float,
) -> Tuple[DataFrame, tuple]:
    data = data.dropDuplicates(key_columns)

    if sampling_type is not None:
        if sampling_type == "random":
            data = data.sample(False, sampling_fraction)
        if sampling_type == "first":
            data = data.limit(int(data.count() * sampling_fraction))

    inputs_shape = (data.count(), len(data.columns))

    if to_pandas:
        log_dataframe("data", data.toPandas(), with_histograms=with_histograms)
    else:
        log_dataframe("data", data, with_histograms=with_histograms)

    return data, inputs_shape
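
A hedged call-site sketch for the deduplication task above; the toy Spark DataFrame and argument values are invented, and dedup_records is assumed to be in scope from the example.

# Hypothetical usage of dedup_records; assumes an active SparkSession.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("dedup-example").getOrCreate()
df = spark.createDataFrame([("a", 1), ("a", 1), ("b", 2)], ["name", "value"])
deduped, shape = dedup_records(
    df,
    key_columns=["name"],
    to_pandas=True,
    with_histograms=False,
    sampling_type=None,
    sampling_fraction=0.0,
)
print(shape)  # (2, 2) after dropping the duplicate "a" row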
Example No. 21
def word_count(input_path, output_path):
    spark = SparkSession.builder.appName("PythonWordCount").getOrCreate()
    lines = spark.read.text(input_path)

    log_dataframe("lines", lines)
    lines = lines.rdd.map(lambda r: r[0])

    log_dataframe("lines_rdd", lines)
    counts = (lines.flatMap(lambda x: x.split(" ")).map(
        lambda x: (x, 1)).reduceByKey(add))
    counts.saveAsTextFile(output_path)
    output = counts.collect()
    log_dataframe("output", output)
    for (word, count) in output:
        print("%s: %i" % (word, count))
Example No. 22
def preprocess(
    raw_data: pd.DataFrame = sagemaker_data_repo.amazon_reviews_raw_data,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Preprocesses data based on business logic
    - Reads delimited file passed as s3_url and preprocess data by filtering
    long tail in the customer ratings data i.e. keep customers who have rated 5
    or more videos, and videos that have been rated by 9+ customers
    - Preprocessed data is then written to output

    """
    # limit dataframe to customer_id, product_id, and star_rating
    # `product_title` will be useful validating recommendations
    df = raw_data[[
        "customer_id", "product_id", "star_rating", "product_title"
    ]]

    # clean out the long tail because most people haven't seen most videos,
    # and people rate fewer videos than they actually watch
    customers = df["customer_id"].value_counts()
    products = df["product_id"].value_counts()

    # based on data exploration only about 5% of customers have rated 5 or
    # more videos, and only 25% of videos have been rated by 9+ customers
    customers = customers[customers >= 5]
    products = products[products >= 10]
    log_dataframe("Original data shape", df)

    reduced_df = df.merge(pd.DataFrame(
        {"customer_id":
         customers.index})).merge(pd.DataFrame({"product_id": products.index}))

    log_dataframe("Shape after removing long tail", reduced_df)

    reduced_df = reduced_df.drop_duplicates(["customer_id", "product_id"])
    log_dataframe("Shape after removing duplicates", reduced_df)

    # recreate customer and product lists since there are customers with
    # more than 5 reviews, but all of their reviews are on products with
    # less than 5 reviews (and vice versa)
    customers = reduced_df["customer_id"].value_counts()
    products = reduced_df["product_id"].value_counts()

    # sequentially index each user and item to hold the sparse format where
    # the indices indicate the row and column in our ratings matrix
    customer_index = pd.DataFrame({
        "customer_id": customers.index,
        "customer": np.arange(customers.shape[0])
    })
    product_index = pd.DataFrame({
        "product_id": products.index,
        "product": np.arange(products.shape[0])
    })
    reduced_df = reduced_df.merge(customer_index).merge(product_index)

    nb_customer = reduced_df["customer"].max() + 1
    nb_products = reduced_df["product"].max() + 1
    feature_dim = nb_customer + nb_products

    log_metric("features(customer,product,total)",
               (nb_customer, nb_products, feature_dim))
    # print(nb_customer, nb_products, feature_dim)

    product_df = reduced_df[["customer", "product", "star_rating"]]

    # split into train, validation and test data sets
    train_df, validate_df, test_df = np.split(
        product_df.sample(frac=1),
        [int(0.6 * len(product_df)),
         int(0.8 * len(product_df))],
    )

    log_metric("# of rows train", train_df.shape[0])
    log_metric("# of rows test", test_df.shape[0])
    log_metric("# of rows validation", validate_df.shape[0])

    # select columns required for training the model
    # excluding columns "customer_id", "product_id", "product_title" to
    # keep files small
    cols = ["customer", "product", "star_rating"]
    train_df = train_df[cols]
    validate_df = validate_df[cols]
    test_df = test_df[cols]

    return train_df, test_df, validate_df
Example No. 23

from __future__ import print_function

import sys

from operator import add

from pyspark.sql import SparkSession

from dbnd import log_dataframe, log_metric

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: wordcount <file> <output>")
        sys.exit(-1)

    spark = SparkSession.builder.appName("PythonWordCount").getOrCreate()

    lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
    counts = (lines.flatMap(lambda x: x.split(" ")).map(
        lambda x: (x, 1)).reduceByKey(add))
    counts.saveAsTextFile(sys.argv[2])
    output = counts.collect()

    log_dataframe("output", output)
    for (word, count) in output:
        print("%s: %i" % (word, count))

    log_metric("output_len", len(output))
Example No. 24
def filter_partner(data, partner):
    # type: (DataFrame, int) -> DataFrame
    partner_data = data[data["partner"] == partner]
    log_dataframe("partner_%s" % partner, partner_data)
    return partner_data
Example No. 25
def monitor_S3_bucket(**context):
    '''
    This S3 monitor takes a naive approach and is not suitable for large buckets.
    The monitor will log metrics for the target bucket, collecting the following:
    - Total bucket size (GB)
    - Largest key name
    - Largest key size (MB)
    - Pandas DataFrame with the following metrics for each object inside the bucket:
        - key
        - size (MB)
        - last modified timestamp
    '''
    MB = 1048576 
    GB = 1073741824
    bucket_name = context['bucket_name']

    s3_hook = S3Hook(aws_conn_id=AWS_CONN_ID)
    bucket = s3_hook.get_bucket(bucket_name)

    bucket_info = {
        "{}-key".format(bucket_name): [], 
        "{}-size(MB)".format(bucket_name): [], 
        "{}-last_modified".format(bucket_name): []
    }

    bucket_size = 0

    # WARNING: bucket.objects.all() returns objects recursively and can affect performance on large buckets
    for s3_object in bucket.objects.all():
        bucket_size += s3_object.size
        bucket_info["{}-key".format(bucket_name)].append(s3_object.key)
        bucket_info["{}-size(MB)".format(bucket_name)].append(s3_object.size/MB)
        bucket_info["{}-last_modified".format(bucket_name)].append(s3_object.last_modified)

    bucket_info_df = pd.DataFrame(bucket_info)

    num_objects = bucket_info_df.shape[0]
    largest_key_size = max(bucket_info["{}-size(MB)".format(bucket_name)])
    largest_key_size_idx = bucket_info["{}-size(MB)".format(bucket_name)].index(largest_key_size)
    largest_key_name = bucket_info["{}-key".format(bucket_name)][largest_key_size_idx]
    
    log_metric("{}-largest_key_size_(MB)".format(bucket_name), largest_key_size)
    log_metric("{}-largest_key_name".format(bucket_name), largest_key_name)
    log_dataframe(
        "{}-full_bucket_information".format(bucket_name),
        bucket_info_df,
        with_histograms=True,
        with_stats=True,
        with_schema=True,
        path="s3://{}".format(bucket_name),
    )
    log_metric("{}-total_bucket_size(GB)".format(bucket_name), bucket_size/GB)
    log_metric("{}-number_of_objects".format(bucket_name), num_objects)

    key_metrics = {
        "{}-largest_key_size(MB)".format(bucket_name): largest_key_size,
        "{}-largest_key_name".format(bucket_name): largest_key_name, 
        "{}-total_bucket_size".format(bucket_name): bucket_size/GB,
        "{}-number_of_objects".format(bucket_name): num_objects
    }
    return key_metrics
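
Because the monitor reads bucket_name from its keyword arguments, it can be wired into an Airflow DAG roughly as follows; the DAG id, schedule and bucket name are placeholders, and the operator import path shown is for Airflow 2.x.

# Hypothetical DAG wiring for monitor_S3_bucket; all values are placeholders.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(
    "s3_bucket_monitor",
    start_date=datetime(2021, 1, 1),
    schedule_interval="@daily",
    catchup=False,
) as dag:
    monitor = PythonOperator(
        task_id="monitor_s3_bucket",
        python_callable=monitor_S3_bucket,
        op_kwargs={"bucket_name": "my-example-bucket"},
    )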
Example No. 26
def filter_by_id(data, partner_id):
    # type: (DataFrame, int) -> DataFrame
    partner_data = data[data["partner"] == partner_id]
    log_dataframe("partner_%s" % partner_id, partner_data)
    return partner_data
Example No. 27
def join_data(raw_data: List[pd.DataFrame]) -> pd.DataFrame:
    result = raw_data.pop(0)
    for d in raw_data:
        result = result.merge(d, on="id")
    log_dataframe("joined_data", result)  # log_dataframe expects a (key, value) pair; the key name here is illustrative
    return result
Example No. 28
def dedup_records(data: DataFrame, key_columns=["name"]) -> DataFrame:
    data = data.dropDuplicates(key_columns)
    log_dataframe("data", data, with_histograms=True)
    return data
Example No. 29
def calculate_features(data: pd.DataFrame,
                       selected_features: List[str] = None,
                       data_path: PathStr = None) -> pd.DataFrame:
    log_dataframe("data_path", data, with_histograms=True, path=data_path)
    data = data[selected_features]
    return data