def test_create_connection():
    """
    Test case for method create_connection in SparkHelper.

    Checks that the helper returns a SparkSession and that the session
    configuration carries 'spark.some.config.option' == 'some-value'.
    """
    session = S.create_connection()
    assert isinstance(session, SparkSession)

    # The helper is called a second time on purpose: the configuration is
    # read from a freshly requested connection, not from `session` above.
    option_value = S.create_connection().conf.get('spark.some.config.option')
    assert option_value == 'some-value'
def test_read_parquet():
    """
    Test case for method read_parquet in SparkHelper.

    Reading a folder that does not exist must raise; reading the bundled
    fixture must give a DataFrame that has an 'event_label' column.
    """
    connection = S.create_connection()
    fixture_path = 'test/resources/result'

    # Any exception type is accepted for the missing-folder case.
    with pytest.raises(Exception):
        S.read_parquet(connection, 'folder_is_not_exists')

    # The fixture is read once per assertion, each on a fresh read.
    loaded = S.read_parquet(connection, fixture_path)
    assert isinstance(loaded, DataFrame)

    column_names = S.read_parquet(connection, fixture_path).columns
    assert 'event_label' in column_names
def main(self):
    """
    Main method for transforming raw snowplow data into the tracker dataset.

    Builds the event-field UDFs and the IP-to-locality UDF, loads the raw
    snowplow records through get_snowplow_data() and displays the selected
    columns with ``show(20, False)``.
    """
    parse_event = self.parser.parse_event

    # One UDF per snowplow field key, all backed by the same parser callback.
    # NOTE(review): every column using these UDFs is disabled in the column
    # list below, so they are currently built but never applied.
    event_label_udf = SparkHelper.get_udf('se_la', parse_event)
    event_aid_udf = SparkHelper.get_udf('aid', parse_event)
    event_url_udf = SparkHelper.get_udf('url', parse_event)
    event_refr_udf = SparkHelper.get_udf('refr', parse_event)
    event_tz_udf = SparkHelper.get_udf('tz', parse_event)
    event_category_udf = SparkHelper.get_udf('se_ca', parse_event)
    event_action_udf = SparkHelper.get_udf('se_ac', parse_event)
    event_sessionid_udf = SparkHelper.get_udf('sid', parse_event)
    event_visitorid_udf = SparkHelper.get_udf('fp', parse_event)
    event_platform_type_udf = SparkHelper.get_udf('p', parse_event)

    # db = self.db_geoip
    event_ip_to_locality = udf(self.geoip.get_locality_from_ip, StringType())

    snowplow_df = self.get_snowplow_data()
    preview_columns = [
        event_ip_to_locality(snowplow_df.ip).alias('event_ip_to_locality'),
        snowplow_df.ip.alias('event_ip'),
        snowplow_df.time.alias('event_time'),
        # Disabled columns, kept for reference:
        # event_category_udf('path').alias('event_category'),
        # event_tz_udf('path').alias('event_tz'),
        # event_action_udf('path').alias('event_action'),
        # event_label_udf('path').alias('event_label'),
        # event_sessionid_udf('path').alias('event_sessionID'),
        # event_visitorid_udf('path').alias('event_visitorID'),
        # event_platform_type_udf('path').alias('event_platform_type'),
        # event_aid_udf('path').alias('event_aid'),
        # event_url_udf('path').alias('event_url'),
        # event_refr_udf('path').alias('event_refr')
    ]
    snowplow_df.select(*preview_columns).show(20, False)
def get_parquet(self):
    """
    Get the tracker parquet.

    Returns whatever SparkHelper.read_parquet yields for this object's
    connection and the source looked up with
    ``self.config.get('data_source', 'result')``.
    """
    connection = self.connection
    parquet_source = self.config.get('data_source', 'result')
    return SparkHelper.read_parquet(connection, parquet_source)
def __init__(self):
    """
    Set up the collaborators used by this job: the Spark connection, the
    event parser, the configuration (plus its 'data_source'/'result'
    entry) and the geoip helper with the handle from getConnection().
    """
    self.connection = SparkHelper.create_connection()
    self.parser = Parser()

    settings = get_config()
    self.config = settings
    self.result = settings.get('data_source', 'result')

    geoip_helper = geoip()
    self.geoip = geoip_helper
    self.db_geoip = geoip_helper.getConnection()
def __init__(self):
    """
    Set up the collaborators used by this job: the Spark connection, the
    event parser and the configuration.
    """
    # Built in the same order as before: connection, parser, configuration.
    spark_connection = SparkHelper.create_connection()
    event_parser = Parser()
    settings = get_config()

    self.connection = spark_connection
    self.parser = event_parser
    self.config = settings
def main(self):
    """
    Main method for transforming the tracker dataset into the search dataset.

    Keeps the tracker rows whose ``event_url`` contains 'cari' or 'search',
    derives one column per search parameter from ``event_url`` and displays
    the result with ``show()``.
    """
    # (search parameter key, output column name), in output column order.
    # The bedroom/bathroom column names are reproduced exactly as they were.
    search_params = (
        ('listing_type', 'event_search_param_listing_type'),
        ('property_type', 'event_search_param_property_type'),
        ('rent_type', 'event_search_param_rent_type'),
        ('price_evaluation', 'event_search_param_price_evaluation'),
        ('certification_type', 'event_search_param_certification_type'),
        ('min_price', 'event_search_param_min_price'),
        ('max_price', 'event_search_param_max_price'),
        ('keyword', 'event_search_param_keyword'),
        ('min_land_size', 'event_search_param_min_land_size'),
        ('max_land_size', 'event_search_param_max_land_size'),
        ('min_building_size', 'event_search_param_min_building_size'),
        ('max_building_size', 'event_search_param_max_building_size'),
        ('min_numbers_bedroom',
         'event_search_param_min_bedrooms_building_size'),
        ('max_numbers_bedroom',
         'event_search_param_max_bedrooms_building_size'),
        ('min_numbers_bathroom',
         'event_search_param_min_bathrooms_building_size'),
        ('max_numbers_bathroom',
         'event_search_param_max_bathrooms_building_size'),
        ('sort_type', 'event_search_param_sort_type'),
        ('pagination', 'event_search_param_pagination'),
    )

    # All UDFs are created before the parquet data is loaded.
    parse_search = self.parser.parse_event_search
    param_udfs = [
        (SparkHelper.get_udf(param_key, parse_search), column_name)
        for param_key, column_name in search_params
    ]

    search_events = self.get_parquet().filter(
        "event_url LIKE '%cari%' OR event_url LIKE '%search%'"
    )
    search_columns = [
        param_udf('event_url').alias(column_name)
        for param_udf, column_name in param_udfs
    ]
    search_events.select(*search_columns).show()
def get_snowplow_data(self):
    """
    Get the raw snowplow data.

    Returns whatever SparkHelper.read_json yields for this object's
    connection and the source looked up with
    ``self.config.get('data_source', 'tracker')``. According to the
    original note, that source is the S3 bucket snowplow.log.urbanindo.com.
    """
    connection = self.connection
    tracker_source = self.config.get('data_source', 'tracker')
    return SparkHelper.read_json(connection, tracker_source)