Beispiel #1
0
    def test_result_iterator(self):
        """The ``sort`` option decides which iterator a rule exposes."""
        # (constructor keyword arguments, iterator expected on the rule)
        scenarios = (
            ({}, sorted_iterator),               # sort left at its default
            ({'sort': True}, sorted_iterator),   # sort switched on explicitly
            ({'sort': False}, result_iterator),  # sort switched off
        )
        for options, expected_iterator in scenarios:
            eq_(InfernoRule(**options).result_iterator, expected_iterator)
Beispiel #2
0
    def setUp(self):
        """Create a rule and a settings object with deliberately different options.

        Both objects receive day_range, day_offset, day_start, source_tags and
        result_tag, each with distinct values, so that a test can tell which of
        the two a resolved option came from. The values the tests compare
        against are stored on the instance as well.
        """
        self.rule = InfernoRule(
            day_range=3,
            day_offset=1,
            # written as ``2`` rather than ``02``: a leading zero is an octal
            # literal in Python 2 and a syntax error in Python 3
            day_start=date(2012, 12, 2),
            source_tags=['tag1', 'tag2'],
            result_tag='result_tag_rule')
        self.settings = InfernoSettings(
            day_range=4,
            day_offset=2,
            day_start=date(2011, 12, 2),
            source_tags=['tag3', 'tag4'],
            result_tag='result_tag_settings')

        # expected results
        self.result_tag_from_rule = 'result_tag_rule'
        self.result_tag_from_settings = 'result_tag_settings'
        # every rule tag suffixed with each of the three days that precede the
        # rule's day_start, newest day first
        self.tags_from_rule = [
                'tag1:2012-12-01',
                'tag1:2012-11-30',
                'tag1:2012-11-29',
                'tag2:2012-12-01',
                'tag2:2012-11-30',
                'tag2:2012-11-29']
        # the settings tags are expected back without a date suffix
        self.tags_from_settings = ['tag3', 'tag4']
Beispiel #3
0
    def test_source_tags(self):
        """``source_tags`` always comes back as a list, whatever was passed in."""
        # (value handed to the constructor, value expected on the rule)
        scenarios = (
            (['tag1', 'tag2'], ['tag1', 'tag2']),  # list
            ([], []),                               # empty list
            ('tag1', ['tag1']),                     # one tag (string)
            (None, []),                             # none tag
        )
        for given, expected in scenarios:
            eq_(InfernoRule(source_tags=given).source_tags, expected)
Beispiel #4
0
 def test_map_serialization(self):
     """Map output serializes key parts but leaves value parts untouched."""
     # Key parts are cast to str and JSON serialized, value parts are not:
     # the same 'date' field shows up as text in the key and as a date object
     # in the values.
     rule = InfernoRule(key_parts=['date'], value_parts=['date'])
     serialized_key = '["_default","2012-12-01"]'
     untouched_values = [datetime.date(2012, 12, 1)]
     self._assert_map(self.data, rule, [(serialized_key, untouched_values)])
Beispiel #5
0
 def test_tags_from_settings_and_rule_mix(self):
     """Tags combine the rule's source tag with the day given in the settings."""
     rule = InfernoRule(source_tags=['tag5'], day_range=2)
     # written as ``1`` rather than ``01``: a leading zero is an octal literal
     # in Python 2 and a syntax error in Python 3
     settings = InfernoSettings(day_start=date(2011, 12, 1))
     actual = JobOptions(rule, settings).tags
     # even though day_range=2, we only expect 1 day, as the settings will override
     # the rule
     expected = ['tag5:2011-12-01']
     eq_(actual, expected)
Beispiel #6
0
    def test_parts_preprocess(self):
        """A rule-level ``parts_preprocess`` list is kept on ``rule.params``."""
        def add_bar(parts, params):
            parts['bar'] = 1
            yield parts

        rule = InfernoRule(parts_preprocess=[add_bar])
        registered = rule.params.parts_preprocess
        eq_(registered, [add_bar])

        # the stored callable is still the generator defined above
        produced = list(registered[0]({'hello': 'world'}, None))
        eq_(produced, [{'bar': 1, 'hello': 'world'}])
Beispiel #7
0
 def setUp(self):
     """Prepare one input record and a default rule for the map tests.

     ``self.data`` is a single record; ``self.rule`` keys on country and
     city and carries population and size as its values.
     """
     self.data = {
         'city': 'toronto',
         'country': 'canada',
         'population': 100,
         'size': 1000,
         # written as ``1`` rather than ``01``: a leading zero is an octal
         # literal in Python 2 and a syntax error in Python 3
         'date': datetime.date(2012, 12, 1)
     }
     self.rule = InfernoRule(key_parts=['country', 'city'],
                             value_parts=['population', 'size'])
Beispiel #8
0
 def setUp(self):
     """Build an archiving job and attach fresh Disco and DDFS objects to it."""
     settings = InfernoSettings(day_range=2, day_start=date(2011, 11, 12))
     rule_options = dict(
         archive=True,
         max_blobs=self.MAX_BLOBS,
         name='some_rule_name',
         archive_tag_prefix='archived',
         source_tags=['incoming:data:chunk'],
     )
     self.job = InfernoJob(InfernoRule(**rule_options), settings)
     # NOTE(review): Disco and DDFS are presumably test doubles; confirm
     # against this module's imports.
     self.job.disco = Disco()
     self.job.ddfs = DDFS()
Beispiel #9
0
    def test_keyset_parts_preprocess(self):
        """A ``parts_preprocess`` list given on a Keyset is kept on that keyset."""
        def add_bar(parts, params):
            parts['bar'] = 1
            yield parts

        keyset_definitions = {'keyset1': Keyset(parts_preprocess=[add_bar])}
        rule = InfernoRule(keysets=keyset_definitions)

        registered = rule.params.keysets['keyset1']['parts_preprocess']
        eq_(registered, [add_bar])

        # the stored callable is still the generator defined above
        produced = list(registered[0]({'hello': 'world'}, None))
        eq_(produced, [{'bar': 1, 'hello': 'world'}])
Beispiel #10
0
    def test_field_transforms(self):
        """Registered field transforms show up in the mapped key."""
        def shout(val):
            return val.upper()

        transforms = {'city': shout, 'country': shout}
        rule = InfernoRule(
            key_parts=['country', 'city'],
            value_parts=['population', 'size'],
            field_transforms=transforms,
        )
        # both key fields are upper-cased, the values are passed through
        mapped_key = '["_default","CANADA","TORONTO"]'
        self._assert_map(self.data, rule, [(mapped_key, [100, 1000])])
Beispiel #11
0
    def test_parts_preprocess_that_yields_multiple_parts(self):
        """A preprocess step may yield several records for one input record."""
        def fan_out_languages(parts, params):
            # one copy of the record per language, in this order
            for name in ('french', 'english'):
                enriched = parts.copy()
                enriched['language'] = name
                yield enriched

        rule = InfernoRule(
            key_parts=['country'],
            value_parts=['language'],
            parts_preprocess=[fan_out_languages],
        )
        mapped_key = '["_default","canada"]'
        expected = [(mapped_key, [name]) for name in ('french', 'english')]
        self._assert_map(self.data, rule, expected)
Beispiel #12
0
    def test_field_transforms_happen_after_parts_preprocess(self):
        """Field transforms see the fields added by ``parts_preprocess``."""
        def fan_out_languages(parts, params):
            # one copy of the record per language, in this order
            for name in ('french', 'english'):
                enriched = parts.copy()
                enriched['language'] = name
                yield enriched

        def shout(val):
            return val.upper()

        rule = InfernoRule(
            key_parts=['country'],
            value_parts=['language'],
            parts_preprocess=[fan_out_languages],
            field_transforms={'language': shout},
        )
        # 'language' only exists after preprocessing, yet it arrives upper-cased
        mapped_key = '["_default","canada"]'
        expected = [(mapped_key, [name]) for name in ('FRENCH', 'ENGLISH')]
        self._assert_map(self.data, rule, expected)
Beispiel #13
0
 def test_tags_from_settings(self):
     """With an empty rule, the tags are the ones configured in the settings."""
     options = JobOptions(InfernoRule(), self.settings)
     eq_(options.tags, self.tags_from_settings)
Beispiel #14
0
 def test_kwargs(self):
     """Extra keyword arguments are readable as attributes of ``rule.params``."""
     extra = {'some_extra_param': 'some_extra_value'}
     params = InfernoRule(**extra).params
     eq_(params.some_extra_param, 'some_extra_value')
Beispiel #15
0
 def test_str(self):
     """``str(rule)`` renders the class name together with the rule name."""
     rendered = str(InfernoRule(name='some_rule_name'))
     eq_(rendered, '<InfernoRule: some_rule_name>')
Beispiel #16
0
    def test_keysets(self):
        """``rule.params.keysets`` has the same shape for one or many keysets."""
        def normalised(key, value, mapping, table):
            # Shape every keyset is expected to have on rule.params: '_keyset'
            # is prepended to the key parts and both processing lists are empty.
            return {
                'column_mappings': mapping,
                'table': table,
                'value_parts': [value],
                'key_parts': ['_keyset', key],
                'parts_preprocess': [],
                'parts_postprocess': [],
            }

        # NOTE: a "no key sets" case (InfernoRule() giving params.keysets == {})
        # is disabled in this test.

        # one key set: rule-level options end up under the '_default' keyset
        single = InfernoRule(
            key_parts=['id'],
            value_parts=['count'],
            table='some_table',
            column_mappings={'id': 'some_id'},
        )
        expected_single = {
            '_default': normalised('id', 'count', {'id': 'some_id'},
                                   'some_table'),
        }
        eq_(single.params.keysets, expected_single)

        # many key sets: each named keyset keeps its own options
        first = Keyset(
            key_parts=['id1'],
            value_parts=['count1'],
            column_mappings={'id1': 'some_id1'},
            table='some_table1',
        )
        second = Keyset(
            key_parts=['id2'],
            value_parts=['count2'],
            column_mappings={'id2': 'some_id2'},
            table='some_table2',
        )
        many = InfernoRule(keysets={'keyset1': first, 'keyset2': second})
        expected_many = {
            'keyset1': normalised('id1', 'count1', {'id1': 'some_id1'},
                                  'some_table1'),
            'keyset2': normalised('id2', 'count2', {'id2': 'some_id2'},
                                  'some_table2'),
        }
        eq_(many.params.keysets, expected_many)
Beispiel #17
0
                yield cparts
    except:
        print "Error parsing tiles: %s" % str(tiles)


def filter_all(parts, params, **kwargs):
    """Yield ``parts`` only when it matches every column/value pair given.

    Each keyword argument names a column of ``parts`` and the value that
    column must hold. Pairs with an empty column name are ignored. Nothing
    is yielded as soon as one column holds a different value.

    ``params`` is accepted but not used.
    """
    has_mismatch = any(
        column and parts[column] != wanted
        for column, wanted in kwargs.items()
    )
    if not has_mismatch:
        yield parts


def filter_clicks(keys, vals, params, threshold=1):
    """Yield ``(keys, vals)`` only when the first value exceeds ``threshold``.

    A first value equal to the threshold is filtered out.
    ``params`` is accepted but not used.
    """
    first_value = vals[0]
    if not first_value > threshold:
        return
    yield keys, vals

# Rules exported by this module (a single rule: 'ip_click_counter').
RULES = [
    InfernoRule(
        name='ip_click_counter',
        source_tags=['incoming:impression'],
        map_input_stream=chunk_json_stream,
        # filter_all is bound to tile_id=504, so it only passes on parts whose
        # 'tile_id' equals 504
        parts_preprocess=[clean_data, parse_tiles, partial(filter_all, tile_id=504), count],
        partitions=32,
        sort_buffer_size='25%',
        combiner_function=combiner,
        key_parts=['ip'],
        value_parts=['count'],
        # filter_clicks is bound to threshold=5, so it only yields results
        # whose first value is greater than 5
        parts_postprocess=[partial(filter_clicks, threshold=5)],
    ),
]
Beispiel #18
0
 def setUp(self):
     """Create default settings, a temp pid dir, a named job and its pid dir."""
     self.settings = InfernoSettings()
     self._make_temp_pid_dir()
     named_rule = InfernoRule(name='some_rule_name')
     self.job = InfernoJob(named_rule, {}, Params())
     # pid directory as resolved from the settings created above
     self.pid_dir = pid.pid_dir(self.settings)
Beispiel #19
0
 def test_empty_rule_and_empty_settings(self):
     """Default rule plus default settings give no tags and no result tag."""
     options = JobOptions(InfernoRule(), InfernoSettings())
     for attribute, expected in (('tags', []), ('result_tag', None)):
         eq_(getattr(options, attribute), expected)
Beispiel #20
0
 # Rule 'enhanced_stats': reads the 'incoming:impression' tag with day_range=1
 # and defines a single keyset, 'impression_stats'.
 # NOTE(review): the helpers referenced below (clean_data, parse_*,
 # filter_enhanced, impression_stats_init, combiner, GEOIP) are defined outside
 # this fragment; their behavior is not visible from here.
 InfernoRule(
     name='enhanced_stats',
     source_tags=['incoming:impression'],
     day_range=1,
     map_input_stream=chunk_json_stream,
     map_init_function=impression_stats_init,
     # preprocessing steps, listed in the order they are given to the rule
     parts_preprocess=[
         clean_data, parse_date, parse_locale, parse_ip, parse_ua,
         parse_tiles, filter_enhanced
     ],
     geoip_file=GEOIP,
     partitions=32,
     sort_buffer_size='25%',
     # set of lower-case locale codes
     # NOTE(review): presumably consulted by one of the preprocessing steps to
     # drop other locales; confirm against those helpers
     locale_whitelist={
         'ach', 'af', 'an', 'ar', 'as', 'ast', 'az', 'be', 'bg', 'bn-bd',
         'bn-in', 'br', 'bs', 'ca', 'cs', 'csb', 'cy', 'da', 'de', 'el',
         'en-gb', 'en-us', 'en-za', 'eo', 'es-ar', 'es-cl', 'es-es',
         'es-mx', 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy-nl', 'ga-ie',
         'gd', 'gl', 'gu-in', 'he', 'hi-in', 'hr', 'hu', 'hsb', 'hy-am',
         'id', 'is', 'it', 'ja', 'ja-jp-mac', 'ka', 'kk', 'km', 'kn', 'ko',
         'ku', 'lij', 'lt', 'lv', 'mai', 'mk', 'ml', 'mr', 'ms', 'my',
         'nb-no', 'nl', 'nn-no', 'oc', 'or', 'pa-in', 'pl', 'pt-br',
         'pt-pt', 'rm', 'ro', 'ru', 'si', 'sk', 'sl', 'son', 'sq', 'sr',
         'sv-se', 'sw', 'ta', 'te', 'th', 'tr', 'uk', 'ur', 'vi', 'xh',
         'zh-cn', 'zh-tw', 'zu'
     },
     combiner_function=combiner,
     keysets={
         # keyed by date, locale, tile and country; six counters as values
         'impression_stats':
         Keyset(
             key_parts=['date', 'locale', 'tile_id', 'country_code'],
             value_parts=[
                 'impressions', 'clicks', 'pinned', 'blocked', 'sponsored',
                 'sponsored_link'
             ],
         ),
     },
 ),
Beispiel #21
0
 def test_explict_tags(self):
     """With default settings, the rule's source tags come back unchanged."""
     options = JobOptions(
         InfernoRule(source_tags=['tag:foo', 'tag:bar']),
         InfernoSettings(),
     )
     eq_(options.tags, ['tag:foo', 'tag:bar'])
Beispiel #22
0
 def test_result_tag_from_settings(self):
     """With an empty rule, the result tag is the one from the settings."""
     options = JobOptions(InfernoRule(), self.settings)
     eq_(options.result_tag, self.result_tag_from_settings)
Beispiel #23
0
from inferno.lib.rule import InfernoRule
from inferno.lib.rule import Keyset


# Rules exported by this module: one rule that declares two keysets.
RULES = [
    InfernoRule(
        name='manual_rule_4',
        keysets={
            'keyset_1': Keyset(key_parts=['key_1'], value_parts=['value_1']),
            'keyset_2': Keyset(key_parts=['key_2'], value_parts=['value_2']),
        },
    ),
]
Beispiel #24
0
 def test_explict_tags_despite_day_range_on_the_rule(self):
     """A settings ``day_range`` of 0 keeps the tags free of date suffixes."""
     options = JobOptions(
         InfernoRule(source_tags=['tag:foo', 'tag:bar'], day_range=2),
         InfernoSettings(day_range=0),
     )
     # the rule asks for two days, yet the tags come back exactly as given
     eq_(options.tags, ['tag:foo', 'tag:bar'])
Beispiel #25
0
    yield parts


# Example of a keyset-level parts_preprocess step: it is attached to one
# specific keyset rather than to the whole rule.
def count_again(parts, params):
    """Add one to ``parts['count']`` in place, then yield ``parts``.

    ``parts`` must already hold a 'count' entry. ``params`` is not used.
    """
    previous = parts['count']
    parts['count'] = previous + 1
    yield parts


RULES = [
    InfernoRule(name='last_names_json',
                source_tags=['example:chunk:users'],
                map_input_stream=chunk_json_stream,
                parts_preprocess=[count],
                partitions=2,
                keysets={
                    'last_name_keyset':
                    Keyset(key_parts=['last'],
                           value_parts=['count'],
                           parts_preprocess=[count_again])
                }),
    InfernoRule(
        name='last_names_csv',
        source_tags=['example:chunk:users'],
        map_input_stream=chunk_csv_stream,
        csv_fields=('first', 'last'),
        csv_dialect='excel',
        parts_preprocess=[count],
        partitions=2,
        key_parts=['last'],
        value_parts=['count'],
 # Rule 'impression_stats': reads the 'incoming:impression' tag, archives its
 # input (archive=True) and defines three keysets, each with its own table.
 # NOTE(review): the helpers and constants referenced below are defined
 # outside this fragment; their behavior is not visible from here.
 InfernoRule(
     name='impression_stats',
     source_tags=['incoming:impression'],
     min_blobs=IMPRESSION_MIN_BLOBS,
     max_blobs=IMPRESSION_MAX_BLOBS,
     archive=True,
     rule_cleanup=report_rule_stats,
     map_input_stream=chunk_json_stream,
     map_init_function=impression_stats_init,
     # preprocessing steps, listed in the order they are given to the rule
     parts_preprocess=[clean_data, parse_date, parse_locale, check_locale_whitelist, parse_ip, parse_ua, parse_tiles],
     geoip_file=GEOIP,
     partitions=32,
     sort_buffer_size='25%',
     locale_whitelist=LOCALE_WHITELIST,
     # insert_redshift with its connection settings bound from the RS_* constants
     result_processor=partial(insert_redshift,
                              host=RS_HOST,
                              port=RS_PORT,
                              database=RS_DB,
                              user=RS_USER,
                              password=RS_PASSWORD,
                              bucket_name=RS_BUCKET),
     combiner_function=combiner,
     keysets={
         # per-tile counters -> table 'impression_stats_daily'
         'impression_stats': Keyset(
             key_parts=['date', 'position', 'locale', 'tile_id', 'country_code', 'os', 'browser',
                        'version', 'device', 'year', 'month', 'week', 'enhanced', 'blacklisted'],
             value_parts=['impressions', 'clicks', 'pinned', 'blocked', 'sponsored', 'sponsored_link'],
             table='impression_stats_daily'),
         # per-url counters -> table 'site_stats_daily'
         'site_stats': Keyset(
             key_parts=['date', 'locale', 'country_code', 'os', 'browser', 'version', 'device', 'year',
                        'month', 'week', 'url'],
             value_parts=['impressions', 'clicks', 'pinned', 'blocked', 'sponsored', 'sponsored_link'],
             table='site_stats_daily',
         ),
         # 'newtabs' counter -> table 'newtab_stats_daily'
         'newtab_stats': Keyset(
             key_parts=['date', 'locale', 'country_code', 'os', 'browser', 'version', 'device', 'year',
                        'month', 'week'],
             value_parts=['newtabs'],
             table='newtab_stats_daily')
     }
 ),
Beispiel #27
0
                print "pushed remote: %s" % url
        except Exception as e:
            print "failed: %s %s" % (e, url)
            yield unicode('["_default", "%s", "%s", "%s"]' %
                          (e, gethostname(), url)).encode('ascii',
                                                          'ignore'), [1]


RULES = [
    # this rule loads data into a cluster from s3
    InfernoRule(
        name='bulk_load',
        source_urls=partial(
            get_keys_for_pattern,
            bucket='tiles-incoming-prod-us-west-2',
            pattern=r'.+-([^-]*)-(2015\.01\.(05|06|07|08|09|10|11|12|13))',
            tag_expr=["processed:", 1, ":2015-01-", 3]),
        map_input_stream=(disco.schemes.scheme_raw.input_stream, ),
        map_init_function=init,
        map_function=s3_import_map,
    ),
    # this rule copies tags from one Disco cluster to another
    InfernoRule(
        name='copy_tags',
        source_tags=[],
        target_disco_master='disco://localhost',
        target_tag='',
        chunk=False,
        map_input_stream=(task_input_stream, filename_input_stream),
        map_function=copy_tags_map,
    ),
Beispiel #28
0
from inferno.lib.rule import chunk_json_stream
from inferno.lib.rule import InfernoRule
from infernyx.rules import combiner

# NOTE(review): presumably tells Inferno not to run this module's rules
# automatically (manual invocation only); confirm against the Inferno docs.
AUTO_RUN = False


def count(parts, params):
    """Set ``parts['count']`` to 1 in place, then yield ``parts``.

    Any existing 'count' entry is overwritten. ``params`` is not used.
    """
    initial_count = 1
    parts['count'] = initial_count
    yield parts


# Rules exported by this module (a single rule: 'busiest_ips').
RULES = [
    InfernoRule(
        name='busiest_ips',
        source_tags=['processed:impression'],
        day_range=1,
        map_input_stream=chunk_json_stream,
        # count() stamps every record with count=1
        parts_preprocess=[count],
        partitions=32,
        sort_buffer_size='25%',
        combiner_function=combiner,
        # keyed by 'ip', with 'count' as the only value
        key_parts=['ip'],
        value_parts=['count'],
    ),
]
Beispiel #29
0
    def test_field_transforms(self):
        """``field_transforms`` is stored unchanged on ``rule.params``."""
        def shout(val):
            return val.upper()

        transforms = {'hello': shout}
        stored = InfernoRule(field_transforms=transforms).params.field_transforms
        eq_(stored, {'hello': shout})
Beispiel #30
0
def filter_site(parts, params):
    """Yield ``parts`` when either of its two urls is the configured site.

    The site is read from ``params.filter_for_site``. 'url_a' is compared
    first, and 'url_b' is only looked at when 'url_a' does not match.
    """
    wanted_site = params.filter_for_site
    if any(parts[field] == wanted_site for field in ('url_a', 'url_b')):
        yield parts


# Rules exported by this module (a single rule: 'analyze_tuples').
RULES = [
    InfernoRule(
        name='analyze_tuples',

        # from the command line - override the input tags with the "-t" option
        source_tags=['incoming:site_tuples'],
        # chain_stream extended with a kv_reader bound to the 'tuples' keyset
        # and to the key/value field names listed below
        map_input_stream=chain_stream +
        (partial(kv_reader,
                 keyset='tuples',
                 keys=('keyset', 'date', 'locale', 'country_code', 'url_a',
                       'url_b'),
                 values=('count', )), ),
        key_parts=['date'],
        value_parts=['count'],
        # filter_site keeps only records whose 'url_a' or 'url_b' equals
        # params.filter_for_site
        parts_preprocess=[filter_site],

        # override this on the command line with:
        #   -P 'filter_for_site: override.org'
        filter_for_site='booking.com',
        partitions=32,
        sort_buffer_size='35%',
    ),
]