def setup_df(self):
    df = DFMock(count=100)
    df.columns = {
        "grouped_col": {"option_count": 4, "option_type": "string"},
        "text_col": "string",
        "int_col": "integer",
        "float_col": "float"
    }
    df.generate_dataframe()
    return tuple(df.columns.keys()), df.dataframe
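# Minimal usage sketch for setup_df (hypothetical caller; assumes this helper
# lives on a unittest-style test class, as the `self` parameter suggests):
#
#     column_names, frame = self.setup_df()
#     assert len(frame) == 100            # matches DFMock(count=100) above
#     assert "grouped_col" in column_names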
def setup_grouped_dataframe(self, count):
    df = DFMock()
    df.count = count
    df.columns = {
        "string_col": {"option_count": 3, "option_type": "string"},
        "int_col": {"option_count": 3, "option_type": "int"},
        "float_col": {"option_count": 3, "option_type": "float"},
        "bool_col": {"option_count": 3, "option_type": "bool"},
        "datetime_col": {"option_count": 3, "option_type": "datetime"},
        "metrics": "int"
    }
    df.generate_dataframe()
    return df.dataframe
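# Illustrative call site (hypothetical; assumes the same unittest-style test
# class). Each grouped column yields 3 distinct values, so grouping should
# never produce more than 3 groups per column:
#
#     frame = self.setup_grouped_dataframe(count=50)
#     assert frame.shape[0] == 50
#     assert frame["string_col"].nunique() <= 3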
def test_parquet_sizes():
    bucket = "testbucket"
    key = "testdataset"
    s3_client = boto3.client('s3')
    s3_client.create_bucket(Bucket=bucket)
    df = DFMock(count=1000)
    df.columns = {
        "int_col": "int",
        "str_col": "string",
        "grouped_col": {"option_count": 4, "option_type": "string"}
    }
    df.generate_dataframe()
    df.grow_dataframe_to_size(250)
    pub_parq.publish(
        dataframe=df.dataframe,
        key=key,
        bucket=bucket,
        partitions=['grouped_col'])
    # every published parquet part file must stay at or below 61MB (61 * 2^20 bytes)
    for obj in s3_client.list_objects(Bucket=bucket)['Contents']:
        if obj['Key'].endswith(".parquet"):
            assert float(obj['Size']) <= 61 * float(1 << 20)
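# test_parquet_sizes talks to S3 through boto3, so it is presumably run against
# a mocked S3 backend rather than real AWS. A hedged sketch using moto (the
# decorator is mock_s3 in moto < 5 and mock_aws in moto >= 5; the version and
# fixture choice are assumptions, not taken from this repo):
#
#     from moto import mock_s3
#
#     @mock_s3
#     def test_parquet_sizes():
#         ...  # body as above; create_bucket then runs against moto's fake S3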
def mock_publish(self,
                 partition_types: Dict[str, str],
                 bucket="safebucketname",
                 key='safekeyprefixname/safedatasetname'):
    mocker = MockHelper(count=100, s3=True)
    partitions = list(partition_types.keys())
    dfmock = DFMock()
    dfmock.count = 10
    # add partition columns
    columns = {col: {"option_count": 3, "option_type": col_type}
               for col, col_type in partition_types.items()}
    # add one actual data column, called metrics
    columns["metrics"] = "int"
    dfmock.columns = columns
    # generate the dataframe we will write
    dfmock.generate_dataframe()
    df = dfmock.dataframe
    bucket = mocker.s3_bucket
    published_files = publish(bucket=bucket,
                              key=key,
                              dataframe=df,
                              partitions=partitions)
    return bucket, df, partitions, published_files
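# Hypothetical call site for mock_publish; partition_types maps partition
# column names to dfmock option types (the column names below are illustrative
# only, not part of the suite):
#
#     bucket, df, partitions, files = self.mock_publish(
#         partition_types={"region": "string", "year": "int"})
#     assert partitions == ["region", "year"]
#     assert "metrics" in df.columns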
def setup_grouped_dataframe(count: int = 100, columns: Dict = None):
    """ Creates a mock dataframe using dfmock.

    Args:
        count (int): Number of rows to generate in the dataframe
        columns (Dict): Dictionary of column names and types, following the
            dfmock column conventions

    Returns:
        A freshly mocked dataframe
    """
    df = DFMock()
    df.count = count
    if not columns:
        columns = {
            "string_col": {"option_count": 3, "option_type": "string"},
            "int_col": {"option_count": 3, "option_type": "int"},
            "float_col": {"option_count": 3, "option_type": "float"},
            "bool_col": {"option_count": 3, "option_type": "bool"},
            "datetime_col": {"option_count": 3, "option_type": "datetime"},
            "text_col": "string",
            "metrics": "int"
        }
    df.columns = columns
    df.generate_dataframe()
    return df.dataframe
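# A hedged sketch of overriding the default column layout. The function below
# is illustrative only (it is not called anywhere in the suite); the column
# names follow the dfmock column-dict convention used above.
def example_custom_columns_dataframe():
    return setup_grouped_dataframe(
        count=10,
        columns={
            "city_col": {"option_count": 2, "option_type": "string"},
            "metrics": "int"
        })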