Code example #1
    def _execute_session(self):
        dataset = self.dataset_run.dataset
        data_df = get_data(dataset)
        column_types = get_schema(data_df)

        # Collect the overall profile; per-column details are filled in below.
        profile = {
            "column": {},
            "row_count": len(data_df.index),
            "column_count": len(list(data_df))
        }

        dataset_profile = ActualDatasetProfile.objects.create(
            dataset_run=self.dataset_run,
            row_count=profile["row_count"],
            column_count=profile["column_count"])

        # Profile each column individually and store the result under its name.
        columns = list(data_df)
        for index, column in enumerate(columns):
            analyzer = AnalyzerActualColumn(data_df, column_types,
                                            dataset_profile, column,
                                            profile["row_count"], index)
            profile["column"][column] = analyzer.execute()

        # Hash the completed profile and persist it on the profile record.
        profile["hash_sum"] = calculate_hash_sum(profile)

        dataset_profile.hash_sum = profile["hash_sum"]
        dataset_profile.save()

        return dataset_profile, profile
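
The helper calculate_hash_sum is not shown in these examples. Purely as an illustrative sketch (not necessarily the project's actual implementation), a deterministic hash over the profile dictionary could be computed like this:

import hashlib
import json

def calculate_hash_sum(profile):
    # Illustrative assumption: serialise the profile deterministically and hash it.
    serialised = json.dumps(profile, sort_keys=True, default=str)
    return hashlib.md5(serialised.encode("utf-8")).hexdigest()
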
Code example #2
def save_dataset_to_datalake(dataset):
    """Save a snapshot of the given dataset instance to the datalake. Saves the data, preview and schema."""
    datastore = dataset.connection.datastore
    data_df = datastore.retrieve_data(dataset)
    schema = get_schema(data_df)

    save_data_to_datalake(data_df, schema, dataset.get_datalake_path())
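
get_schema is likewise not shown. Assuming it simply maps each column name to its pandas dtype name, a minimal sketch could look like this (the real implementation may return richer type information):

def get_schema(data_df):
    # Illustrative assumption: map each column name to its pandas dtype name.
    return {column: str(dtype) for column, dtype in data_df.dtypes.items()}
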
Code example #3
    @classmethod
    def setUpTestData(cls):
        super().setUpTestData()

        cls.dataset = baker.make("datasets.Dataset",
                                 name="biostats",
                                 type="TABLE")
        cls.user = baker.make("users.User")

        # Load the test data from CSV and save it to the temporary test datalake.
        cls.df = pd.read_csv("database/data/test/biostats.csv")
        save_data_to_datalake(cls.df, get_schema(cls.df),
                              cls.dataset.get_datalake_path())
Code example #4
    def test_delete_outdated_datalake_files(self):
        now = datetime.now()
        data_folder = "test_datalake/saef/landing/uploads/biostats/data"

        # Save two more versions of the dataset, backdated by one and two hours
        # (setUpTestData already saved the current version, giving three files in total).
        for i in range(1, 3):
            time = now - timedelta(hours=i)
            save_data_to_datalake(self.df,
                                  get_schema(self.df),
                                  self.dataset.get_datalake_path(),
                                  time=time)

        self.assertEqual(len(os.listdir(data_folder)), 3)

        # Deleting files older than 60 minutes should remove the two backdated versions, leaving only the current one.
        delete_outdated_datalake_files(60)

        self.assertEqual(len(os.listdir(data_folder)), 1)
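
delete_outdated_datalake_files is also not shown. Since the test passes an age threshold in minutes, a simplified sketch might look as follows; the root directory parameter and the reliance on filesystem modification times are assumptions, and the real code may instead read the timestamp stored with each version:

import os
import time

def delete_outdated_datalake_files(max_age_minutes, root="test_datalake"):
    # Illustrative sketch: delete every datalake file whose modification time
    # is older than the given threshold in minutes.
    cutoff = time.time() - max_age_minutes * 60
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            path = os.path.join(dirpath, name)
            if os.path.getmtime(path) < cutoff:
                os.remove(path)
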
Code example #5
    def test_profile_dataset(self):
        # Do the first profile run with the original biostats data.
        first_run = baker.make("datasets.DatasetRun",
                               dataset=self.dataset,
                               task_name="Profile dataset")
        task_profile_dataset(dataset_run=first_run)

        # Load changed biostats data, with the "Weight(lbs)" column removed, into the datalake.
        df = self.df.drop(labels="Weight(lbs)", axis=1)
        save_data_to_datalake(df, get_schema(df),
                              self.dataset.get_datalake_path())

        # Do the second run with the changed data, resulting in a non-zero degree of change.
        second_run = baker.make("datasets.DatasetRun",
                                dataset=self.dataset,
                                task_name="Profile dataset")
        task_result = task_profile_dataset(dataset_run=second_run)

        self.assertEqual(task_result["degree_of_change"], 0.015873015873015872)
Code example #6
def task_extract_metadata(**kwargs):
    """Extract current dataset metadata, including column count, row count and column names and types."""
    dataset_run = kwargs["dataset_run"]
    dataset = dataset_run.dataset

    try:
        data_df, timestamp = get_data(dataset, get_timestamp=True)
        column_types = get_schema(data_df)

        result = {
            "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "columns": column_types,
            "column_count": len(list(data_df)),
            "row_count": len(data_df.index)
        }
        dataset_run.status = DatasetRun.Status.SUCCEEDED

        return result
    except Exception as e:
        logger.error(f"Error while extracting metadata from {dataset}: {e}")
        return {"error": type(e).__name__, "message": str(e)}
Code example #7
def data_overview(dataset):
    """
    Return information used to get an overview of the datasets data (column types and data preview). Also return the
    timestamp of when this information is from.
    """

    try:
        data_preview, timestamp = get_data(dataset,
                                           preview=True,
                                           get_timestamp=True)
        column_types = get_schema(data_preview)

        # Handle null values and convert the dataframe into a list of tuples.
        data_preview.fillna("[null]", inplace=True)
        data_preview = list(data_preview.itertuples(index=False, name=None))

        timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        # If the data cannot be retrieved from either the datalake or the datastore, return an empty
        # preview and None values to alert the user.
        data_preview, column_types, timestamp = [], None, None

    return timestamp, column_types, data_preview
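
A hypothetical caller (for example, a view preparing template context) could consume the three return values and use the None fallback to detect failure; the names below are illustrative only:

timestamp, column_types, data_preview = data_overview(dataset)
if column_types is None:
    # Neither the datalake nor the datastore could provide the data.
    print("Data overview unavailable")
else:
    print(f"Preview of {len(data_preview)} rows as of {timestamp}")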