コード例 #1
0
def test_create_connection():
    """
    Check that S.create_connection returns a SparkSession whose
    'spark.some.config.option' setting equals 'some-value'.
    """

    session = S.create_connection()
    assert isinstance(session, SparkSession)

    # Ask for a connection a second time and inspect its configuration.
    option_value = S.create_connection().conf.get('spark.some.config.option')
    assert option_value == 'some-value'
コード例 #2
0
def test_read_parquet():
    """
    Check S.read_parquet against a missing folder and a known fixture.

    A non-existent folder must raise; the fixture under
    'test/resources/result' must load as a DataFrame that exposes an
    'event_label' column.
    """

    spark = S.create_connection()
    fixture_path = 'test/resources/result'

    # Reading a folder that does not exist is expected to fail.
    with pytest.raises(Exception):
        S.read_parquet(spark, 'folder_is_not_exists')

    loaded = S.read_parquet(spark, fixture_path)
    assert isinstance(loaded, DataFrame)

    column_names = S.read_parquet(spark, fixture_path).columns
    assert 'event_label' in column_names
コード例 #3
0
    def main(self):
        """
        Preview the raw snowplow tracker data with an IP-to-locality column.

        Builds one UDF per event field key through SparkHelper.get_udf
        (all backed by self.parser.parse_event), wraps
        self.geoip.get_locality_from_ip in a StringType UDF, loads the
        tracker data with get_snowplow_data and calls show(20, False) on
        the locality, IP and time columns.
        """

        # (output alias, event field key) pairs, listed in creation order.
        path_fields = (
            ('event_label', 'se_la'),
            ('event_aid', 'aid'),
            ('event_url', 'url'),
            ('event_refr', 'refr'),
            ('event_tz', 'tz'),
            ('event_category', 'se_ca'),
            ('event_action', 'se_ac'),
            ('event_sessionID', 'sid'),
            ('event_visitorID', 'fp'),
            ('event_platform_type', 'p'),
        )

        # NOTE: these UDFs are built but not selected below; the columns
        # derived from 'path' are currently switched off. To bring one back,
        # add `path_udfs[alias]('path').alias(alias)` to preview_columns.
        path_udfs = {
            alias: SparkHelper.get_udf(field, self.parser.parse_event)
            for alias, field in path_fields
        }

        locality_udf = udf(self.geoip.get_locality_from_ip, StringType())

        tracker = self.get_snowplow_data()
        preview_columns = [
            locality_udf(tracker.ip).alias('event_ip_to_locality'),
            tracker.ip.alias('event_ip'),
            tracker.time.alias('event_time'),
        ]
        tracker.select(*preview_columns).show(20, False)
コード例 #4
0
ファイル: agent.py プロジェクト: ramadhanjanuar/etl-data
 def get_parquet(self):
     """
     Read the tracker parquet dataset.

     The location comes from the 'result' option of the 'data_source'
     config section; the read goes through SparkHelper.read_parquet on
     this object's connection.
     """
     spark = self.connection
     parquet_location = self.config.get('data_source', 'result')
     return SparkHelper.read_parquet(spark, parquet_location)
コード例 #5
0
 def __init__(self):
     """
     Set up the Spark connection, event parser, configuration and geoip
     helper used by this object.
     """
     self.connection = SparkHelper.create_connection()
     self.parser = Parser()

     # Configuration, plus the 'result' entry of its 'data_source' section.
     settings = get_config()
     self.config = settings
     self.result = settings.get('data_source', 'result')

     # geoip helper and the connection obtained from it.
     locator = geoip()
     self.geoip = locator
     self.db_geoip = locator.getConnection()
コード例 #6
0
ファイル: agent.py プロジェクト: ramadhanjanuar/etl-data
 def __init__(self):
     """
     Set up the Spark connection, event parser and configuration used by
     this object.
     """
     # Build the collaborators in order, then bind them to the instance.
     spark = SparkHelper.create_connection()
     event_parser = Parser()
     settings = get_config()
     self.connection, self.parser, self.config = spark, event_parser, settings
コード例 #7
0
ファイル: search.py プロジェクト: ramadhanjanuar/etl-data
    def main(self):
        """
        Show the search-parameter columns derived from the tracker dataset.

        Builds one UDF per search parameter through SparkHelper.get_udf
        (all backed by self.parser.parse_event_search), keeps only the
        tracker rows whose event_url contains 'cari' or 'search', applies
        each UDF to event_url under its own alias and calls show() on the
        result.
        """

        # (parse_event_search key, output column alias), in output order.
        search_params = (
            ('listing_type', 'event_search_param_listing_type'),
            ('property_type', 'event_search_param_property_type'),
            ('rent_type', 'event_search_param_rent_type'),
            ('price_evaluation', 'event_search_param_price_evaluation'),
            ('certification_type', 'event_search_param_certification_type'),
            ('min_price', 'event_search_param_min_price'),
            ('max_price', 'event_search_param_max_price'),
            ('keyword', 'event_search_param_keyword'),
            ('min_land_size', 'event_search_param_min_land_size'),
            ('max_land_size', 'event_search_param_max_land_size'),
            ('min_building_size', 'event_search_param_min_building_size'),
            ('max_building_size', 'event_search_param_max_building_size'),
            ('min_numbers_bedroom',
             'event_search_param_min_bedrooms_building_size'),
            ('max_numbers_bedroom',
             'event_search_param_max_bedrooms_building_size'),
            ('min_numbers_bathroom',
             'event_search_param_min_bathrooms_building_size'),
            ('max_numbers_bathroom',
             'event_search_param_max_bathrooms_building_size'),
            ('sort_type', 'event_search_param_sort_type'),
            ('pagination', 'event_search_param_pagination'),
        )

        # Every UDF is created up front, before the dataset is loaded.
        aliased_udfs = [
            (SparkHelper.get_udf(key, self.parser.parse_event_search), alias)
            for key, alias in search_params
        ]

        search_events = self.get_parquet().filter(
            "event_url LIKE '%cari%' OR event_url LIKE '%search%'"
        )

        search_columns = [
            param_udf('event_url').alias(alias)
            for param_udf, alias in aliased_udfs
        ]
        search_events.select(*search_columns).show()
コード例 #8
0
 def get_snowplow_data(self):
     """
     Read the raw snowplow tracker data as JSON.

     The location comes from the 'tracker' option of the 'data_source'
     config section (per the original note, the S3 bucket
     snowplow.log.urbanindo.com); the read goes through
     SparkHelper.read_json on this object's connection.
     """
     spark = self.connection
     tracker_location = self.config.get('data_source', 'tracker')
     return SparkHelper.read_json(spark, tracker_location)