Example #1
0
    def address(cls,
                *,
                locale=Locales.EN,
                calling_code=None,
                city=None,
                country=None,
                country_code=None,
                latitude=None,
                longitude=None,
                postal_code=None,
                state=None,
                street_name=None,
                street_number=None,
                street_suffix=None):
        '''
            Create an Address Data Entity object.

            All individual fields are automatically randomly generated based on locale. If provided, the corresponding values are overridden.

            Note:
                All individual fields are randomly generated. Don't expect correct correlation e.g. correct postal code for the generated city.

                Any value other than None counts as "provided", including falsy ones such as 0, 0.0 or an empty string; only None triggers random generation.

            Keyword Arguments:
                locale: Appropriate Random.locale.<local_name> object. Default is Random.locale.EN
                calling_code: Calling Code
                city: City
                country: Country Name
                country_code: Country Code
                latitude: Latitude
                longitude: Longitude
                postal_code: Postal Code
                state: State
                street_name: Street Name
                street_number: Street Number
                street_suffix: Street Suffix

            Returns:
                The Address entity built from the provided and/or generated field values.
        '''
        address = Address(locale=locale)
        from arjuna.engine.data.entity.address import Address as ArjAddress

        def pick(value, generate):
            # Explicit None test instead of the `x and x or default` idiom:
            # that idiom discards falsy overrides (e.g. latitude=0) and, as
            # previously written for `city`, yielded the boolean True rather
            # than the supplied value. The generator is only called when
            # no override was given.
            return value if value is not None else generate()

        return ArjAddress(
            calling_code=pick(calling_code, address.calling_code),
            city=pick(city, address.city),
            country=pick(country, address.country),
            country_code=pick(country_code, address.country_code),
            latitude=pick(latitude, address.latitude),
            longitude=pick(longitude, address.longitude),
            postal_code=pick(postal_code, address.postal_code),
            state=pick(state, address.state),
            street_name=pick(street_name, address.street_name),
            street_number=pick(street_number, address.street_number),
            street_suffix=pick(street_suffix, address.street_suffix),
        )
Example #2
0
    def gen_data_change_column_name(self, data_path, partition_date, num_rows,
                                    file_format):
        """
        Input
        - data_path: path where the partition will be created (string)
        - partition_date: partition date to be created (date); stored in the
          'date' column, which is also the partitioning column
        - num_rows: number of rows to be generated (integer)
        - file_format: format of file to be generated (parquet or avro)

        This function creates a data sample changing column name: the nested
        street coordinates are emitted as 'lat'/'long' and the person title
        as 'title_name' (see the '#column renamed' markers in the schema).

        The generated rows are written to data_path as a single file per
        partition (overwrite mode), then the partition path, row count and
        schema are printed. Returns None.
        """

        person = Person('en')
        address = Address('en')

        # Create schema
        schema_street = StructType([
            StructField('street_name', StringType(), True),
            StructField('lat', FloatType(), True),  #column renamed
            StructField('long', FloatType(), True)  #column renamed
        ])

        schema_address_details = StructType([
            StructField('street', schema_street, True),
            StructField('number', IntegerType(), True)
        ])

        schema_address = StructType([
            StructField('address_details', schema_address_details, True),
            StructField('city', StringType(), True),
            StructField('country', StringType(), True),
            StructField('country_code', StringType(), True),
            StructField('state', StringType(), True),
            StructField('postal_code', IntegerType(), True)
        ])

        schema_df = StructType([
            StructField('identifier', StringType(), True),
            StructField('first_name', StringType(), True),
            StructField('last_name', StringType(), True),
            StructField('occupation', StringType(), True),
            StructField('age', IntegerType(), True),
            StructField('address', schema_address, True),
            StructField('title_name', StringType(), True),  #column renamed
            StructField('date', DateType(), True)
        ])

        # Generate data
        # Build every row first and create the DataFrame once. This avoids
        # one DataFrame + union per row, and avoids using a bare `except:`
        # on an unbound variable to bootstrap the first DataFrame.
        rows = [
            [
                person.identifier(),
                person.first_name(),
                person.last_name(),
                person.occupation(),
                person.age(),
                # Nested lists mirror the nested struct layout of
                # schema_address -> address_details -> street.
                [
                    [
                        [
                            address.street_name(),
                            float(address.latitude()),
                            float(address.longitude()),
                        ],
                        int(address.street_number()),
                    ],
                    address.city(),
                    address.country(),
                    address.country_code(),
                    address.state(),
                    int(address.postal_code()),
                ],
                person.title(),
                partition_date,
            ]
            for _ in range(num_rows)
        ]

        # With an explicit schema, an empty row list (num_rows == 0) yields an
        # empty DataFrame instead of failing on an unbound `df`.
        df = self.spark.createDataFrame(rows, schema_df)

        df.coalesce(1).write.partitionBy('date').mode('overwrite').format(
            file_format).save(data_path)

        print('Partition created: {data_path}/date={date}'.format(
            data_path=data_path, date=partition_date))
        print('# Rows:', df.count())
        print('Schema:')
        df.printSchema()
        print('\n')