Example no. 1
0
    def setup_df(self):
        """Build a small 100-row mock dataframe with one grouped partition column.

        Returns:
            tuple: ``(column_names, dataframe)`` where ``column_names`` is a
            tuple of the configured column names and ``dataframe`` is the
            generated mock dataframe.
        """
        df = DFMock(count=100)
        # "grouped_col" cycles through 4 string options; the rest are plain typed columns.
        df.columns = {"grouped_col": {"option_count": 4, "option_type": "string"},
                      "text_col": "string",
                      "int_col": "integer",
                      "float_col": "float"}
        df.generate_dataframe()
        # NOTE(review): removed a bare `df.dataframe` expression statement here —
        # its value was discarded, so it was dead code.

        return tuple(df.columns.keys()), df.dataframe
Example no. 2
0
 def setup_grouped_dataframe(self, count):
     """Generate a mock dataframe of *count* rows.

     Five grouped columns (string/int/float/bool/datetime, 3 options each)
     plus a plain integer "metrics" column.

     Args:
         count: number of rows to generate.

     Returns:
         The generated mock dataframe.
     """
     grouped_types = ("string", "int", "float", "bool", "datetime")
     spec = {
         f"{kind}_col": {"option_count": 3, "option_type": kind}
         for kind in grouped_types
     }
     spec["metrics"] = "int"

     mock = DFMock()
     mock.count = count
     mock.columns = spec
     mock.generate_dataframe()
     return mock.dataframe
Example no. 3
0
def test_parquet_sizes():
    """Publish a grown, partitioned dataframe to (mocked) S3 and verify that
    every resulting parquet file stays under the ~61 MiB size cap."""
    bucket = "testbucket"
    key = "testdataset"
    client = boto3.client('s3')
    client.create_bucket(Bucket=bucket)

    frame = DFMock(count=1000)
    frame.columns = {
        "int_col": "int",
        "str_col": "string",
        "grouped_col": {"option_count": 4, "option_type": "string"},
    }
    frame.generate_dataframe()
    frame.grow_dataframe_to_size(250)

    pub_parq.publish(
        dataframe=frame.dataframe, key=key, bucket=bucket, partitions=['grouped_col'])

    size_cap = 61 * float(1 << 20)  # 61 MiB in bytes
    contents = client.list_objects(Bucket=bucket)['Contents']
    parquet_objects = [o for o in contents if o['Key'].endswith(".parquet")]
    for obj in parquet_objects:
        assert float(obj['Size']) <= size_cap
Example no. 4
0
    def mock_publish(self,
                     partition_types: Dict[str, str],
                     bucket="safebucketname",
                     key='safekeyprefixname/safedatasetname'):
        """Generate a small mock dataframe and publish it to a mocked S3 bucket.

        Args:
            partition_types: mapping of partition column name -> dfmock option
                type; each becomes a grouped column with 3 options.
            bucket: nominal bucket name; NOTE it is overridden below by the
                bucket that ``MockHelper`` creates.
            key: S3 key prefix to publish under.

        Returns:
            tuple: (bucket, dataframe, partition column names, published files)
        """
        mocker = MockHelper(count=100, s3=True)
        partitions = list(partition_types.keys())

        dfmock = DFMock()
        dfmock.count = 10

        # Partition columns: 3 options each of the requested type.
        # (Fixed: dropped the redundant dict() wrapper around the comprehension
        # and renamed the loop variables so they no longer shadow `key`.)
        columns = {
            col_name: {
                "option_count": 3,
                "option_type": col_type
            }
            for col_name, col_type in partition_types.items()
        }

        # add one actual data column, called metrics
        columns["metrics"] = "int"

        dfmock.columns = columns
        dfmock.generate_dataframe()

        # generate dataframe we will write
        # (Fixed: removed an earlier `df = mocker.dataframe` that was
        # immediately overwritten here and never used.)
        df = dfmock.dataframe

        # use the bucket that MockHelper actually created in mocked S3
        bucket = mocker.s3_bucket

        # (Fixed: removed the unused `defaults` dict that duplicated these
        # keyword arguments without ever being referenced.)
        published_files = publish(bucket=bucket,
                                  key=key,
                                  dataframe=df,
                                  partitions=partitions)

        return bucket, df, partitions, published_files
Example no. 5
0
def setup_grouped_dataframe(count: int = 100, columns: Dict = None):
    """Create a mock dataframe using dfmock.

    Args:
        count (int): Row length to generate on the dataframe
        columns (Dict): dictionary of columns and types, following dfmock
            guides; when falsy, a default spec of five grouped columns plus
            "text_col" and "metrics" is used.

    Returns:
        A freshly mocked dataframe
    """
    if not columns:
        # Default spec: one grouped column per basic type, 3 options each,
        # plus a free-text column and an integer metrics column.
        option_types = ("string", "int", "float", "bool", "datetime")
        columns = {
            f"{opt}_col": {"option_count": 3, "option_type": opt}
            for opt in option_types
        }
        columns["text_col"] = "string"
        columns["metrics"] = "int"

    mock = DFMock()
    mock.count = count
    mock.columns = columns
    mock.generate_dataframe()
    return mock.dataframe