コード例 #1
0
def skew_drift_validator(mode, gcp_bucket, control_set_path,
                         treatment_set_path, feature_list_str, Linf_value):
    """Check a treatment dataset for skew or drift against a control dataset.

    Both datasets are read as CSV files from ``gs://<gcp_bucket>/<path>``.
    TFDV statistics are generated for each of them and a schema is inferred
    from the control statistics. For every listed feature the L-infinity
    threshold of the skew or drift comparator is set on that schema, and the
    control statistics are then validated against it together with the
    treatment statistics.

    Args:
        mode: ``"skew"`` or ``"drift"``.
        gcp_bucket: Name of the bucket that holds both CSV files.
        control_set_path: Path of the control CSV inside the bucket.
        treatment_set_path: Path of the treatment CSV inside the bucket.
        feature_list_str: String that evaluates to an iterable of feature
            names (for example ``"['a', 'b']"``).
        Linf_value: Threshold assigned to ``infinity_norm.threshold`` of the
            selected comparator for every listed feature.

    Returns:
        The TFDV anomalies object when ``anomaly_info`` is non-empty,
        otherwise ``None``.

    Exits:
        Calls ``sys.exit(1)`` when ``mode`` is unsupported, or when a feature
        without a schema domain is listed in ``"skew"`` mode.
    """
    logging.basicConfig(level=logging.INFO)
    logging.info('Starting skew drift validator ..')
    logging.info('Input data:')
    logging.info('mode:%s', mode)
    logging.info('gcp_bucket:%s', gcp_bucket)
    logging.info('control_set_path:%s', control_set_path)
    logging.info('treatment_set_path:%s', treatment_set_path)
    logging.info('Linf_value:%s', Linf_value)

    # Fail fast: reject an unsupported mode before any data is downloaded,
    # and regardless of whether the feature list is empty.
    if mode not in ("skew", "drift"):
        logging.critical("mode: %s not supported", mode)
        sys.exit(1)

    # SECURITY NOTE(review): eval() executes arbitrary code. If
    # feature_list_str can come from an untrusted source, switch to
    # ast.literal_eval (or JSON) after confirming callers only pass literals.
    feature_list = eval(feature_list_str)
    control_set_df = pd.read_csv("gs://" + gcp_bucket + "/" + control_set_path,
                                 sep=',')
    treat_set_df = pd.read_csv("gs://" + gcp_bucket + "/" + treatment_set_path,
                               sep=',')
    control_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=control_set_df)
    treat_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=treat_set_df)
    control_schema = tfdv.infer_schema(control_stats)

    for feature in feature_list:
        # get_feature returns the feature entry of the schema, so setting the
        # threshold here modifies control_schema itself.
        feature_schema = tfdv.get_feature(control_schema, feature)
        if mode == "skew":
            # A feature that has a domain is treated as categorical; skew
            # checks are refused for anything else.
            if not feature_schema.domain:
                logging.critical("feature: %s is not categorical", feature)
                sys.exit(1)
            feature_schema.skew_comparator.infinity_norm.threshold = Linf_value
        else:
            feature_schema.drift_comparator.infinity_norm.threshold = Linf_value

    if mode == "skew":
        anomalies = tfdv.validate_statistics(statistics=control_stats,
                                             schema=control_schema,
                                             serving_statistics=treat_stats)
    else:
        # The drift comparator is evaluated against previous_statistics, not
        # serving_statistics. The L-infinity distance is symmetric, so the
        # control statistics stay in the `statistics` slot as before.
        anomalies = tfdv.validate_statistics(statistics=control_stats,
                                             schema=control_schema,
                                             previous_statistics=treat_stats)

    if anomalies.anomaly_info:
        logging.info("Data-%s detected: %s", mode, anomalies)
        return anomalies

    logging.info("No data-%s detected", mode)
    return None
コード例 #2
0
def Load_TFDV(df):
    """Assign an integer code to every column of ``df`` from its TFDV schema.

    Each column is analysed on its own: TFDV statistics are generated for a
    single-column frame (with semantic-domain statistics enabled) and a schema
    is inferred from them. The code for a column is:

    * ``0`` by default,
    * ``1`` when ``get_categorical_features`` yields at least one feature,
    * ``3`` when the text form of a schema feature contains
      ``natural_language_domain``,
    * ``2`` when it contains ``time_domain`` (checked last, so it overrides
      the other codes).

    The code of each column is printed as it is computed.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        List of integer codes, one per column, in column order.
    """
    column_codes = [0 for _ in df.columns]

    for position, column_name in enumerate(df.columns):
        single_column_df = df[[column_name]]
        options = tfdv.StatsOptions(enable_semantic_domain_stats=True)
        column_stats = tfdv.generate_statistics_from_dataframe(
            single_column_df, stats_options=options)
        column_schema = tfdv.infer_schema(statistics=column_stats)

        # Any categorical feature at all marks the column as code 1.
        if any(True for _ in get_categorical_features(column_schema)):
            column_codes[position] = 1

        for schema_feature in column_schema.feature:
            feature_text = str(schema_feature)
            if 'natural_language_domain' in feature_text:
                column_codes[position] = 3
            if 'time_domain' in feature_text:
                column_codes[position] = 2

        print(column_codes[position])

    return column_codes
コード例 #3
0
 def compute_training_stats(self, stats_path=None):
     """Generate TFDV statistics for the training dataset, optionally saving them.

     A dataset is created from ``self.train_feature_set`` for the fixed range
     "2009-01-01" to "2016-01-01", downloaded into a DataFrame via
     ``self.STAGING_LOCATION``, and passed to TFDV.

     Args:
         stats_path: Destination for the statistics proto. When falsy the
             statistics are not saved.

     Returns:
         The generated TFDV statistics.
     """
     train_dataset = self.fs.create_dataset(
         self.train_feature_set, "2009-01-01", "2016-01-01")
     train_df = self.fs.download_dataset_to_df(
         train_dataset, self.STAGING_LOCATION)
     stats = tfdv.generate_statistics_from_dataframe(train_df)

     if not stats_path:
         logging.info("No stats_path provided; not saving stats")
         return stats

     logging.info("Saving training stats to %s", stats_path)
     demo_util.save_proto(stats, stats_path)
     return stats
コード例 #4
0
def feature_stats_dataset_agg(client, feature_stats_feature_set):
    """Ingest two small batches and build the statistics expected for both.

    The first batch (5 rows) is stamped ten days before the current UTC time
    and the second (3 rows) one day after that. Expected statistics are
    generated by TFDV over the "strings", "ints" and "floats" columns of the
    two batches combined.

    Args:
        client: Client whose ``ingest`` method is used to load the batches.
        feature_stats_feature_set: Feature set the batches are ingested into.

    Returns:
        Dict with the two ingestion ids ("ids"), the window bounds truncated
        to midnight UTC ("start_date", "end_date"), and the expected
        statistics ("stats").
    """
    now_utc = datetime.utcnow().replace(tzinfo=pytz.utc)
    start_date = now_utc - timedelta(days=10)
    end_date = now_utc - timedelta(days=7)

    first_batch = pd.DataFrame({
        "datetime": [start_date] * 5,
        "entity_id": list(range(5)),
        "strings": ["a", "b", "b", "b", "a"],
        "ints": [4, 3, 2, 6, 3],
        "floats": [2.1, 5.2, 4.3, 0.6, 0.1],
    })
    ingestion_id_1 = client.ingest(feature_stats_feature_set, first_batch)

    second_batch = pd.DataFrame({
        "datetime": [start_date + timedelta(days=1)] * 3,
        "entity_id": list(range(3)),
        "strings": ["a", "b", "c"],
        "ints": [2, 6, 7],
        "floats": [1.6, 2.4, 2],
    })
    ingestion_id_2 = client.ingest(feature_stats_feature_set, second_batch)

    stat_columns = ["strings", "ints", "floats"]
    combined_df = pd.concat([first_batch, second_batch])[stat_columns]
    expected_stats = tfdv.generate_statistics_from_dataframe(combined_df)
    clear_unsupported_agg_fields(expected_stats)

    # TFDV reports the population std dev (per the original author's note),
    # so it is overwritten with the value from pandas' Series.std().
    numeric_features = (
        candidate for candidate in expected_stats.datasets[0].features
        if candidate.HasField("num_stats"))
    for numeric_feature in numeric_features:
        column_name = numeric_feature.path.step[0]
        numeric_feature.num_stats.std_dev = combined_df[column_name].std()

    # Fixed pause after ingestion (presumably to let the rows land - confirm).
    time.sleep(10)

    def _midnight_utc(moment):
        # Drop the time-of-day part and re-attach the UTC timezone.
        return datetime(moment.year, moment.month,
                        moment.day).replace(tzinfo=pytz.utc)

    return {
        "ids": [ingestion_id_1, ingestion_id_2],
        "start_date": _midnight_utc(start_date),
        "end_date": _midnight_utc(end_date),
        "stats": expected_stats,
    }
コード例 #5
0
def tfdv_and_additional_anomalies(
        df: pd.DataFrame,
        tfdv_statistics: DatasetFeatureStatisticsList) -> Tuple[bool, Dict]:
    """
    Collect TFDV anomalies and, when enabled, numeric-interval anomalies.

    Statistics are generated for ``df`` and compared with ``tfdv_statistics``
    through ``tfdv_statistics_anomalies``. When the environment variable
    ``CHECK_NUMERIC_INTERVALS_ON_PREDICT`` equals ``'true'``,
    ``data_intervals_anomalies`` is evaluated as well and its result is merged
    in (interval entries win on duplicate keys).

    Args:
        df: dataframe to check
        tfdv_statistics: reference TFDV statistics

    Returns:
        Tuple[bool, Dict]:
            True if any anomaly is detected, otherwise False, and a dictionary
            keyed by column name. Per the original documentation each value
            has the structure:
            {
                'description': <description>,
                'severity': 'ERROR',
                'shortDescription': <short description>,
                'reason': [{'type': <error_type>,
                            'shortDescription': <short description>,
                            'description': <description>}],
                'path': {'step': [<column_name>]}
            }
    """
    current_statistics = tfdv.generate_statistics_from_dataframe(df)
    tfdv_flag, tfdv_anomalies = tfdv_statistics_anomalies(
        tfdv_statistics, current_statistics)

    # A non-empty anomalies mapping always counts as "detected", whatever
    # flag the helper returned.
    tfdv_flag = True if len(tfdv_anomalies) > 0 else tfdv_flag

    interval_flag, interval_anomalies = False, {}
    if os.getenv('CHECK_NUMERIC_INTERVALS_ON_PREDICT') == 'true':
        interval_flag, interval_anomalies = data_intervals_anomalies(
            df, tfdv_statistics)

    merged_anomalies = dict(tfdv_anomalies)
    merged_anomalies.update(interval_anomalies)

    return tfdv_flag or interval_flag, merged_anomalies
コード例 #6
0
    def evaluate(self, model: BaseEstimator, num_repetitions: int,
                 *corruptions: DataCorruption):
        """Measure schema anomalies and model scores under each corruption.

        A schema is obtained from ``self.schema_from_train_data()`` and a
        baseline score is computed on the untouched test data. Every
        corruption is then applied ``num_repetitions`` times to a fresh deep
        copy of the test data; each corrupted copy is validated against the
        schema with TFDV and scored with the model.

        Args:
            model: Estimator exposing ``predict_proba``.
            num_repetitions: Number of times each corruption is applied.
            *corruptions: Corruptions exposing ``transform(dataframe)``.

        Returns:
            List with one ``SchemaValidationResult`` per corruption, holding
            the corruption, the per-repetition anomaly info, the baseline
            score and the per-repetition corrupted scores.
        """
        schema = self.schema_from_train_data()

        baseline_score = self._task.score_on_test_data(
            model.predict_proba(self._task.test_data))

        results = []
        for corruption in corruptions:
            anomalies_per_run = []
            scores_per_run = []

            for _ in range(num_repetitions):
                # Corrupt a deep copy so the task's test data stays intact.
                corrupted_data = corruption.transform(
                    self._task.test_data.copy(deep=True))

                # Does TFDV flag the corrupted data against the schema?
                stats = tfdv.generate_statistics_from_dataframe(corrupted_data)
                validation = tfdv.validate_statistics(statistics=stats,
                                                      schema=schema)
                run_anomalies = validation.anomaly_info

                # How does the model score on the corrupted data?
                run_score = self._task.score_on_test_data(
                    model.predict_proba(corrupted_data))

                anomalies_per_run.append(run_anomalies)
                scores_per_run.append(run_score)

            results.append(
                SchemaValidationResult(corruption, anomalies_per_run,
                                       baseline_score, scores_per_run))

        return results
コード例 #7
0
def feature_stats_dataset_basic(client, feature_stats_feature_set):
    """Ingest one 20-row batch and build the statistics expected for it.

    All rows share the current UTC time as their timestamp. Expected
    statistics are generated by TFDV over the "strings", "ints" and "floats"
    columns before the batch is ingested.

    Args:
        client: Client whose ``ingest`` method is used to load the batch.
        feature_stats_feature_set: Feature set the batch is ingested into.

    Returns:
        Dict with the ingested frame ("df"), the ingestion id ("id"), the
        timestamp truncated to midnight UTC ("date") and the expected
        statistics ("stats").
    """
    row_count = 20
    ingestion_time = datetime.utcnow().replace(tzinfo=pytz.utc)

    df = pd.DataFrame({
        "datetime": [ingestion_time] * row_count,
        "entity_id": list(range(row_count)),
        "strings": ["a", "b"] * (row_count // 2),
        "ints": list(range(row_count)),
        "floats": [10.5 - offset for offset in range(row_count)],
    })

    stat_columns = ["strings", "ints", "floats"]
    expected_stats = tfdv.generate_statistics_from_dataframe(df[stat_columns])
    clear_unsupported_fields(expected_stats)

    # TFDV reports the population std dev (per the original author's note),
    # so it is overwritten with the value from pandas' Series.std().
    numeric_features = (
        candidate for candidate in expected_stats.datasets[0].features
        if candidate.HasField("num_stats"))
    for numeric_feature in numeric_features:
        column_name = numeric_feature.path.step[0]
        numeric_feature.num_stats.std_dev = df[column_name].std()

    ingestion_id = client.ingest(feature_stats_feature_set, df)
    # Fixed pause after ingestion (presumably to let the rows land - confirm).
    time.sleep(10)

    ingestion_day = datetime(ingestion_time.year, ingestion_time.month,
                             ingestion_time.day).replace(tzinfo=pytz.utc)
    return {
        "df": df,
        "id": ingestion_id,
        "date": ingestion_day,
        "stats": expected_stats,
    }
コード例 #8
0
def test_feature_stats_force_refresh(client, feature_stats_dataset_basic,
                                     feature_stats_feature_set):
    """Statistics requested with force_refresh must include newly ingested rows.

    One extra row, stamped like the first row of the basic dataset, is
    ingested. Statistics for the dataset's day are then requested with
    ``force_refresh=True`` and compared with TFDV statistics generated over
    the original frame plus the extra row.
    """
    base_df = feature_stats_dataset_basic["df"]
    base_date = feature_stats_dataset_basic["date"]

    extra_row_df = pd.DataFrame({
        "datetime": [base_df.iloc[0].datetime],
        "entity_id": [10],
        "strings": ["c"],
        "ints": [2],
        "floats": [1.3],
    })
    client.ingest(feature_stats_feature_set, extra_row_df)
    # Fixed pause after ingestion (presumably to let the row land - confirm).
    time.sleep(10)

    actual_stats = client.get_statistics(
        "feature_stats",
        features=["strings", "ints", "floats"],
        store="historical",
        start_date=base_date,
        end_date=base_date + timedelta(days=1),
        force_refresh=True,
    )

    combined_df = pd.concat([base_df, extra_row_df])
    expected_stats = tfdv.generate_statistics_from_dataframe(combined_df)
    clear_unsupported_fields(expected_stats)

    # TFDV reports the population std dev (per the original author's note),
    # so it is overwritten with the value from pandas' Series.std().
    numeric_features = (
        candidate for candidate in expected_stats.datasets[0].features
        if candidate.HasField("num_stats"))
    for numeric_feature in numeric_features:
        column_name = numeric_feature.path.step[0]
        numeric_feature.num_stats.std_dev = combined_df[column_name].std()

    assert_stats_equal(expected_stats, actual_stats)
# MAGIC %md
# MAGIC ## Use the TensorFlow Data Validation (TFDV) library
# MAGIC - compare the schema between the training and serving time periods
# MAGIC - check for data drift and skew between training and serving data

# COMMAND ----------

from sklearn.model_selection import train_test_split
import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils.display_util import get_statistics_html
import warnings

# Silence FutureWarning messages whose text starts with "Passing".
warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)

# Generate TFDV statistics for the training frame (train_df) and the serving
# frame (fdf). Both are converted with toPandas() first, since TFDV is fed a
# pandas DataFrame here (presumably they are Spark DataFrames - confirm).
stats_train = tfdv.generate_statistics_from_dataframe(
    dataframe=train_df.toPandas())
stats_serve = tfdv.generate_statistics_from_dataframe(dataframe=fdf.toPandas())

# Infer the schema from the training statistics only, then display it.
schema = tfdv.infer_schema(statistics=stats_train)
tfdv.display_schema(schema=schema)

# COMMAND ----------

# Compare serving data with training data: serving statistics are passed as
# the left-hand side and training statistics as the right-hand side.
displayHTML(
    get_statistics_html(lhs_statistics=stats_serve,
                        rhs_statistics=stats_train,
                        lhs_name='SERVE_DATASET',
                        rhs_name='TRAIN_DATASET'))

# COMMAND ----------
コード例 #10
0
we are using Tensorflow Data Validation since Azure Data validation seems to reside in a separate service
'''

# %%
import pyarrow
import apache_beam as beam
import apache_beam.io.iobase
import tensorflow
import tensorflow_data_validation as tfdv


# %%
#train_stats = tfdv.generate_statistics_from_csv(data_location=full_path_to_train)

# %%
# Generate TFDV statistics directly from the in-memory `train` object
# (presumably a pandas DataFrame defined in an earlier cell - confirm).
train_stats = tfdv.generate_statistics_from_dataframe(train)

# %%


# %%
'''
## visualize statistics of train data
'''

# %%
# Visualize the statistics generated above.
tfdv.visualize_statistics(train_stats)

# %%

コード例 #11
0
ファイル: deployments.py プロジェクト: mlrepa/mlpanel
def get_validation_report(deployment_id: int,
                          timestamp_from: float,
                          timestamp_to: float) -> JSONResponse:
    """Build a data-validation report for a deployment over a time window.

    The deployment's model URI is looked up in the deployments table and the
    incoming data batches recorded for the deployment between
    ``timestamp_from`` and ``timestamp_to`` (both inclusive) are loaded. TFDV
    statistics generated from those batches are compared with the statistics
    stored next to the model, and numeric-interval checks are applied.

    Args:
        deployment_id: ID of the deployment to report on.
        timestamp_from: Lower bound of the time window (inclusive).
        timestamp_to: Upper bound of the time window (inclusive).

    Returns:
        JSONResponse with the window bounds, the model statistics, the
        incoming-data statistics, the "anomalies_detected" flag and the
        anomaly details. An empty JSON object is returned when the model has
        no statistics file or when no incoming data falls into the window.

    Raises:
        DeploymentNotFoundError: If no deployment has the given ID.
    """
    conf = Config()
    connection = psycopg2.connect(
        database=conf.get('PROJECTS_DB_NAME'),
        host=conf.get('DB_HOST'),
        port=conf.get('DB_PORT'),
        user=conf.get('DB_USER'),
        password=conf.get('DB_PASSWORD')
    )

    # try/finally releases the connection on every exit path (early return,
    # missing deployment, database error).
    try:
        with connection.cursor() as cursor:
            # Request values are passed as query parameters; only the table
            # names, which come from DeployDbSchema, are interpolated.
            cursor.execute(
                f'SELECT model_uri FROM {DeployDbSchema.DEPLOYMENTS_TABLE} '
                'WHERE id = %s',
                (deployment_id,)
            )
            deployment_row = cursor.fetchone()
            if deployment_row is None:
                raise DeploymentNotFoundError(
                    f'Deployment with ID {deployment_id} not found')
            model_uri = deployment_row[0]

            # Without stored model statistics there is nothing to compare
            # against, so the incoming-data query is skipped entirely.
            schema_file_path = get_schema_file_path(model_uri)
            if not schema_file_exists(schema_file_path):
                return JSONResponse({})

            cursor.execute(
                f'SELECT incoming_data FROM {DeployDbSchema.INCOMING_DATA_TABLE} '
                'WHERE deployment_id = %s AND '
                '     timestamp >= %s AND timestamp <= %s',
                (deployment_id, timestamp_from, timestamp_to)
            )
            data_batches = cursor.fetchall()
    finally:
        connection.close()

    tfdv_statistics = read_tfdv_statistics(schema_file_path)
    tfdv_statistics_dict = tfdv_object_to_dict(tfdv_statistics)

    dataframes = [load_data(batch[0]) for batch in data_batches]
    if not dataframes:
        return JSONResponse({})

    incoming_data_df = pd.concat(dataframes, ignore_index=False)

    # Convert object columns to string to avoid errors when generating TFDV
    # statistics. A column holding both string and numeric values has dtype
    # object but still contains values of different types, and TFDV then
    # tries to convert the string values to numbers, which fails.
    object_columns = incoming_data_df.select_dtypes(include='object').columns.tolist()
    if object_columns:
        incoming_data_df[object_columns] = incoming_data_df[object_columns].astype('str')

    incoming_data_statistics = tfdv.generate_statistics_from_dataframe(incoming_data_df)
    incoming_data_statistics_dict = tfdv_object_to_dict(incoming_data_statistics)

    tfdv_anomalies_detected, tfdv_anomalies = tfdv_statistics_anomalies(
        tfdv_statistics, incoming_data_statistics
    )
    interval_anomalies_detected, interval_anomalies = data_intervals_anomalies(
        incoming_data_df, tfdv_statistics
    )
    anomalies_detected = tfdv_anomalies_detected or interval_anomalies_detected
    # Interval anomalies win when both checks report the same key.
    anomalies = {**tfdv_anomalies, **interval_anomalies}

    return JSONResponse({
        'timestamp_from': timestamp_from,
        'timestamp_to': timestamp_to,
        'model_statistics': tfdv_statistics_dict,
        'incoming_data_statistics': incoming_data_statistics_dict,
        'anomalies_detected': anomalies_detected,
        'anomalies_info': anomalies
    })
コード例 #12
0
def test_batch_dataset_statistics(client):
    """Statistics of a historical retrieval job must match TFDV's statistics.

    Rows are ingested into "feature_set_1" and "feature_set_2", the test
    waits until both ingestions report the full row count, and a historical
    retrieval job is run with ``compute_statistics=True``. The job's
    statistics are compared with statistics TFDV generates from the job's
    output frame.

    Raises:
        TimeoutError: If the ingested row counts do not reach the expected
            values within the polling window.
    """
    fs1 = client.get_feature_set(name="feature_set_1")
    fs2 = client.get_feature_set(name="feature_set_2")
    id_offset = 20

    n_rows = 21
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    entity_ids = [id_offset + i for i in range(n_rows)]

    features_1_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "entity_id": list(entity_ids),
        "feature_value6": ["a"] * n_rows,
    })
    ingestion_id1 = client.ingest(fs1, features_1_df)

    features_2_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "other_entity_id": list(entity_ids),
        "other_feature_value7": [i % 10 for i in range(n_rows)],
    })
    ingestion_id2 = client.ingest(fs2, features_2_df)

    entity_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "entity_id": list(entity_ids),
        "other_entity_id": list(entity_ids),
    })

    time.sleep(15)  # wait for rows to get written to bq

    # Poll until both ingestions report every row, but give up after a
    # bounded wait so a failed ingestion cannot hang the test run forever.
    max_wait_sec = 600
    poll_interval_sec = 30
    deadline = time.monotonic() + max_wait_sec
    while True:
        rows_ingested1 = get_rows_ingested(client, fs1, ingestion_id1)
        rows_ingested2 = get_rows_ingested(client, fs2, ingestion_id2)
        if rows_ingested1 == len(features_1_df) and rows_ingested2 == len(
                features_2_df):
            print(
                f"Number of rows successfully ingested: {rows_ingested1}, {rows_ingested2}. Continuing."
            )
            break
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Ingestion incomplete after {max_wait_sec}s: "
                f"{rows_ingested1}/{len(features_1_df)} and "
                f"{rows_ingested2}/{len(features_2_df)} rows ingested")
        time.sleep(poll_interval_sec)

    feature_retrieval_job = client.get_historical_features(
        entity_rows=entity_df,
        feature_refs=["feature_value6", "feature_set_2:other_feature_value7"],
        project=PROJECT_NAME,
        compute_statistics=True,
    )
    output = feature_retrieval_job.to_dataframe(timeout_sec=180)
    print(output.head(10))
    stats = feature_retrieval_job.statistics(timeout_sec=180)
    clear_unsupported_fields(stats)

    expected_stats = tfdv.generate_statistics_from_dataframe(
        output[["feature_value6", "feature_set_2__other_feature_value7"]])
    clear_unsupported_fields(expected_stats)

    # TFDV reports the population std dev (per the original author's note),
    # so it is overwritten with the value from pandas' Series.std().
    for feature in expected_stats.datasets[0].features:
        if feature.HasField("num_stats"):
            name = feature.path.step[0]
            feature.num_stats.std_dev = output[name].std()

    assert_stats_equal(expected_stats, stats)
    clean_up_remote_files(feature_retrieval_job.get_avro_files())
コード例 #13
0
import pandas as pd
import tensorflow as tf
import tensorflow_data_validation as tfdv

# Simple dataset analysis
dataset = pd.read_csv("data/pollution-small.csv")
print(dataset.shape)

# The first 1600 rows form the training split; the remainder is the test split.
training_data = dataset[:1600]
print(training_data.describe())

test_set = dataset[1600:]
print(test_set.describe())

# Generate training data statistics from the training split only. Using the
# full dataset here would let the test rows shape the inferred schema and
# make the validation below unable to flag them.
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=training_data)
schema = tfdv.infer_schema(statistics=train_stats)
# display_schema renders its output itself and returns None, so it is not
# wrapped in print().
tfdv.display_schema(schema)

test_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set)

# Compare test statistics with the Schema
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)
# Displaying all detected anomalies. Kinds of anomalies listed in the
# original notes:
# Integer larger than 10
# STRING type when expected INT type
# FLOAT type when expected INT type
# Integer smaller than 0
# display_anomalies also returns None, so it is called directly.
tfdv.display_anomalies(anomalies)

# New data WITH anomalies
# Display the result of summary() for the six selected weather columns
# (weatherDF is presumably a Spark DataFrame; it is converted with
# toPandas() further down - confirm).
display(weatherDF['avg_temp_f', 'tot_precip_mm', 'avg_wnd_mps', 'avg_vis_m',
                  'avg_slp_hpa', 'avg_dewpt_f'].summary())

# COMMAND ----------

# MAGIC %md Visualize statistics using the TensorFlow Data Validation library

# COMMAND ----------

import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils.display_util import get_statistics_html
import warnings
from sklearn.model_selection import train_test_split
# Silence FutureWarning messages whose text starts with "Passing".
warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)

# Generate TFDV statistics from the pandas copy of weatherDF, then show them
# twice: through TFDV's own visualizer and as HTML via displayHTML.
stats = tfdv.generate_statistics_from_dataframe(dataframe=weatherDF.toPandas())
tfdv.visualize_statistics(stats)
# Author's observations on the output: tot_precip_mm is about 90% zeros,
# avg_wnd_mps about 16% (presumably zeros as well - confirm); no missing data.
displayHTML(
    get_statistics_html(stats)
)

# COMMAND ----------

# MAGIC %md infer schema

# COMMAND ----------

# Infer a schema from the weather statistics and display it.
weather_data_schema = tfdv.infer_schema(statistics=stats)
tfdv.display_schema(schema=weather_data_schema)

# COMMAND ----------
コード例 #15
0
ファイル: example3.py プロジェクト: mlrepa/mlpanel
    args = parser.parse_args()

    experiment_name = 'IrisSVC'
    mlflow.set_experiment(experiment_name)

    DATASET = 'data/iris.csv'
    TARGET_LABELED_DATASET = 'data/labeled_iris.csv'
    TARGET_COLUMN = 'species'
    IRIS_STATISTICS = '/tmp/stats.tfdv'

    with mlflow.start_run() as run:
        dataset = pd.read_csv(DATASET)
        dataset[TARGET_COLUMN] = LabelEncoder().fit_transform(dataset[TARGET_COLUMN])
        dataset.to_csv(TARGET_LABELED_DATASET, index=False)

        statistics = tfdv.generate_statistics_from_dataframe(dataset.drop(TARGET_COLUMN, axis=1))

        with open(IRIS_STATISTICS, 'wb') as out_stats:
            out_stats.write(statistics.SerializeToString())

        mlflow.log_artifact(DATASET)
        mlflow.log_artifact(TARGET_LABELED_DATASET)
        mlflow.log_artifact(IRIS_STATISTICS)

        train, test = train_test_split(dataset, test_size=0.2, random_state=42)

        X_train = train.drop(TARGET_COLUMN, axis=1).astype('float32')
        y_train = train[TARGET_COLUMN].astype('int32')

        X_test = test.drop(TARGET_COLUMN, axis=1).astype('float32')
        y_test = test[TARGET_COLUMN].astype('int32')
コード例 #16
0
    parser.add_argument('-o',
                        '--output_dir',
                        type=str,
                        required=False,
                        default='./output',
                        help='Path to a where stats must be saved')

    parser.add_argument('-f',
                        '--file_name',
                        type=str,
                        required=False,
                        default='stats.txt',
                        help='Name of the stats file')

    args = parser.parse_args()

    # tfdv doesnt support generating stats directly from parquet
    # so read through pandas parquet reader
    # Ideally, this should be be an accelerated parquet reader and stats
    # computation should happen via GPU
    df = pd.read_parquet(args.data_dir)

    stats = tfdv.generate_statistics_from_dataframe(df)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    output_path = os.path.join(args.output_dir, args.file_name)

    tfdv.write_stats_text(stats, output_path=output_path)
コード例 #17
0
    def run(self, task, model, schema, num_corruptions, performance_threshold):
        """Stress-test a schema with random corruptions of the test data.

        ``num_corruptions`` random corruptions are drawn and collected in a
        set, so corruptions that compare equal are applied only once. Each one
        is applied to a deep copy of ``task.test_data``; the corrupted data is
        validated against ``schema`` with TFDV and scored with ``model``.

        A corruption has a negative impact when the relative score drop
        ``(baseline - corrupted) / baseline`` exceeds
        ``performance_threshold``, or when scoring the corrupted data raises.
        Combined with whether TFDV reported anomalies, this gives the status:
        'TP' (anomaly, negative impact), 'FP' (anomaly, no impact),
        'TN' (no anomaly, no impact) or 'FN' (no anomaly, negative impact).

        Args:
            task: Task exposing ``test_data``, ``numerical_columns``,
                ``categorical_columns``, ``text_columns`` and
                ``score_on_test_data``.
            model: Model exposing ``predict_proba``.
            schema: TFDV schema to validate the corrupted data against.
            num_corruptions: Number of random corruptions to draw.
            performance_threshold: Relative score drop above which a
                corruption counts as harmful.

        Returns:
            pandas DataFrame with the columns 'corruption', 'status',
            'anomalies', 'baseline_score' and 'corrupted_score'.

        Raises:
            AssertionError: If the schema already reports anomalies on the
                clean test data.
        """
        # Make sure the schema works on the clean test data
        assert (not self.has_anomaly(self.validate(schema, task.test_data)))

        baseline_predictions = model.predict_proba(task.test_data)
        baseline_score = task.score_on_test_data(baseline_predictions)

        random_corruptions = set()
        for _ in range(num_corruptions):
            random_corruptions.add(self._draw_random_corruption(task))

        outcome = {
            'corruption': [],
            'status': [],
            'anomalies': [],
            'baseline_score': [],
            'corrupted_score': []
        }

        for corruption in random_corruptions:
            print(corruption)
            test_data_copy = task.test_data.copy(deep=True)
            corrupted_data = corruption.transform(test_data_copy)

            corrupted_data_stats = tfdv.generate_statistics_from_dataframe(
                corrupted_data)
            tfdv_anomalies = tfdv.validate_statistics(
                statistics=corrupted_data_stats, schema=schema)

            schema_anomalies = tfdv_anomalies.anomaly_info

            try:
                corrupted_predictions = model.predict_proba(corrupted_data)
                corrupted_score = task.score_on_test_data(
                    corrupted_predictions)

                performance_drop = (baseline_score -
                                    corrupted_score) / baseline_score

                has_negative_impact = performance_drop > performance_threshold
            except Exception:
                # Deliberate best effort: a model that cannot score the
                # corrupted data counts as negatively impacted. Exception
                # (rather than a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                corrupted_score = None
                has_negative_impact = True

            has_anomalies = len(schema_anomalies) != 0

            if has_anomalies:
                status = 'TP' if has_negative_impact else 'FP'
            else:
                status = 'FN' if has_negative_impact else 'TN'

            outcome['corruption'].append(str(corruption))
            outcome['status'].append(status)
            outcome['anomalies'].append(str(schema_anomalies))
            outcome['baseline_score'].append(baseline_score)
            outcome['corrupted_score'].append(corrupted_score)

        return pd.DataFrame.from_dict(outcome)

    @staticmethod
    def _draw_random_corruption(task):
        """Draw one random corruption for a randomly chosen column type."""
        column_types = ['numerical', 'categorical', 'text']
        columns_by_type = {
            'numerical': task.numerical_columns,
            'categorical': task.categorical_columns,
            'text': task.text_columns,
        }

        # A column type is picked with probability proportional to its number
        # of columns, so a type without columns is never selected.
        num_columns = sum(len(columns_by_type[name]) for name in column_types)
        probabilities = [
            float(len(columns_by_type[name])) / num_columns
            for name in column_types
        ]
        column_type = str(
            np.random.choice(column_types, 1, p=probabilities)[0])
        columns = columns_by_type[column_type]

        fraction = float(np.random.randint(100)) / 100

        # With at least two columns of the type, swap two of them 10% of the
        # time. replace=False guarantees two distinct columns; sampling with
        # replacement could pair a column with itself.
        if len(columns) >= 2 and np.random.uniform() < 0.1:
            first_column, second_column = np.random.choice(
                columns, 2, replace=False)
            return SwappedValues(first_column, second_column, fraction)

        if column_type == 'numerical':
            corruption_type = np.random.choice(
                ['missing', 'noise', 'scaling'])
            na_value = np.nan
        else:
            corruption_type = np.random.choice(['missing', 'encoding'])
            na_value = ''

        if corruption_type == 'missing':
            missingness = np.random.choice(['MCAR', 'MAR', 'MNAR'])
            affected_column = np.random.choice(columns)
            return MissingValues(affected_column,
                                 fraction,
                                 na_value=na_value,
                                 missingness=missingness)

        affected_column = np.random.choice(columns)
        if corruption_type == 'noise':
            return GaussianNoise(affected_column, fraction)
        if corruption_type == 'scaling':
            return Scaling(affected_column, fraction)
        return BrokenCharacters(affected_column, fraction)
コード例 #18
0
 def validate(self, schema, data):
     """Validate TFDV statistics generated from ``data`` against ``schema``.

     Returns whatever ``tfdv.validate_statistics`` returns.
     """
     return tfdv.validate_statistics(
         statistics=tfdv.generate_statistics_from_dataframe(data),
         schema=schema)
コード例 #19
0
 def schema_from_train_data(self):
     """Infer a TFDV schema from statistics of the task's training data."""
     stats = tfdv.generate_statistics_from_dataframe(self._task.train_data)
     return tfdv.infer_schema(statistics=stats)