Beispiel #1
0
    def test_should_reduce_items(self):
        batch = [
            BaseRecord({
                'name': 'item1',
                'country_code': 'es'
            }),
            BaseRecord({
                'name': 'item2',
                'country_code': 'uk'
            }),
            BaseRecord({
                'name': 'item3',
                'something': 'else'
            }),
        ]

        reduce_code = """
def reduce_function(item, accumulator=None):
    from collections import Counter
    accumulator = accumulator or Counter()
    for key in item:
        accumulator[key] += 1
    return accumulator
"""
        writer = ReduceWriter({"options": {"code": reduce_code}}, meta())
        writer.write_batch(batch)
        writer.write_batch(batch)
        expected = {'country_code': 4, 'name': 6, 'something': 2}
        self.assertEquals(expected, dict(writer.reduced_result))
        writer.close()
Beispiel #2
0
    def test_should_push_items_to_hubstorage(self, mock_col):
        """Each record in the batch results in one write on the mocked writer.

        ``mock_col`` is the injected mock; the writer inspected here is the
        object returned by ``mock_col.return_value.create_writer()``.
        """
        mock_writer = mock_col.return_value.create_writer.return_value

        # given:
        records = [
            BaseRecord(fields) for fields in (
                {'name': 'item1', 'country_code': 'es'},
                {'name': 'item2', 'country_code': 'uk'},
                {'name': 'item3', 'something': 'else'},
            )
        ]
        writer_options = {
            "project_id": "10804",
            "collection_name": "test_collection",
            "key_field": "name",
            'apikey': 'fakeapikey',
            'size_per_buffer_write': 2,
        }

        # when:
        writer = HubstorageWriter({"options": writer_options}, meta())
        with closing(writer):
            writer.write_batch(records)
            writer.flush()

        # then:
        write_calls = mock_writer.write.mock_calls
        self.assertEqual(3, len(write_calls))
        self.assertEqual(3, writer.get_metadata('items_count'))

        # Every write is expected to carry the record plus a ``_key`` copied
        # from its ``name`` field.
        expected_calls = [
            mock.call(dict(record, _key=record['name']))
            for record in records
        ]
        self.assertEqual(expected_calls, write_calls)
 def test_filter_batch_no_op(self):
     """The filter under test returns a value equal to the batch it is given."""
     items = [{'name': 'item1', 'value': 'value1'}, {'name': 'item2', 'value': 'value2'}]
     batch = []
     for item in items:
         record = BaseRecord()
         record.record = item
         batch.append(record)
     # assertEqual reports both operands on failure, whereas
     # assertTrue(a == b) only says "False is not true".
     self.assertEqual(self.filter.filter_batch(batch), batch)
Beispiel #4
0
 def sample_batch(self):
     """Return a two-record batch with ``name`` and ``country`` fields."""
     rows = (
         ('item1', 'es'),
         ('item2', 'uk'),
     )
     return [
         BaseRecord({'name': name, 'country': country})
         for name, country in rows
     ]
 def test_filter_batch_with_python_expression(self):
     """Only the record whose ``country_code`` equals 'uk' passes the filter."""
     records = [
         BaseRecord(fields) for fields in (
             {'name': 'item1', 'country_code': 'es'},
             {'name': 'item2', 'country_code': 'uk'},
         )
     ]
     expression = "item['country_code']=='uk'"
     filter_config = {'options': {'python_expression': expression}}
     python_filter = PythonexpFilter(filter_config, meta())

     kept = list(python_filter.filter_batch(records))

     self.assertEqual(1, len(kept))
     self.assertEqual('uk', dict(kept[0])['country_code'])
Beispiel #6
0
    def setUp(self):
        """Prepare the exporter options and a two-record batch for each test."""
        exporter_options = {
            'log_level': 'DEBUG',
            'logger_name': 'export-pipeline',
        }
        self.options = {'exporter_options': exporter_options}

        rows = (('item1', 'es'), ('item2', 'uk'))
        self.batch = [
            BaseRecord({'name': name, 'country_code': code})
            for name, code in rows
        ]
Beispiel #7
0
 def test_filter_batch_no_op(self):
     """The filter under test returns a value equal to the batch it is given."""
     items = [{
         'name': 'item1',
         'value': 'value1'
     }, {
         'name': 'item2',
         'value': 'value2'
     }]
     batch = []
     for item in items:
         record = BaseRecord()
         record.record = item
         batch.append(record)
     # assertEqual reports both operands on failure, whereas
     # assertTrue(a == b) only says "False is not true".
     self.assertEqual(self.filter.filter_batch(batch), batch)
 def test_filter_with_datetime(self):
     """Records updated no earlier than yesterday's date pass the expression."""
     now = datetime.datetime.now()
     # item1 is two days old, item2 one day old, item3 is current.
     batch = [
         BaseRecord({
             'name': 'item%d' % number,
             'updated': str(now - datetime.timedelta(days=age)),
         })
         for number, age in enumerate((2, 1, 0), 1)
     ]
     # NOTE(review): the expression calls datetime.now() again when it is
     # evaluated, so a run that straddles midnight may see another cutoff.
     expr = (
         "item.get('updated') and item['updated'] >= "
         "str(datetime.datetime.now() - datetime.timedelta(days=1))[:10]"
     )
     filter_config = {'options': {'python_expression': expr}}
     python_filter = PythonexpFilter(filter_config, meta())

     result = list(python_filter.filter_batch(batch))

     kept_names = [d['name'] for d in result]
     self.assertEqual(['item2', 'item3'], kept_names)
Beispiel #9
0
 def setUp(self):
     """Create a three-record batch with numbered ``key1``/``key2`` values."""
     self.batch = [
         BaseRecord({
             u'key1': u'value1{}'.format(suffix),
             u'key2': u'value2{}'.format(suffix),
         })
         for suffix in (1, 2, 3)
     ]
Beispiel #10
0
    def test_filter_with_in_key_value(self):
        """With the 'contains' operator, two of the three records are kept."""
        filter_keys = [{
            'name': 'country_code',
            'value': 'es',
            'operator': 'contains',
        }]
        rows = [
            {'name': 'item1', 'country_code': ['es', 'us']},
            {'name': 'item2', 'country_code': ['es', 'us']},
            {'name': 'item3', 'country_code': ['uk']},
        ]
        records = [BaseRecord(row) for row in rows]
        key_value_filter = KeyValueFilter(
            {'options': {'keys': filter_keys}}, meta())

        kept = list(key_value_filter.filter_batch(records))

        self.assertEqual(2, len(kept))
Beispiel #11
0
    def test_filter_batch_with_key_value_regex(self):
        """Records whose ``country`` matches 'e[sg]' are kept, in input order."""
        # given:
        countries = [u'es', u'egypt', u'uk', u'españa']
        batch = [
            BaseRecord({'name': 'item%d' % number, 'country': country})
            for number, country in enumerate(countries, 1)
        ]
        pattern_keys = [{'name': 'country', 'value': 'e[sg]'}]
        regex_filter = KeyValueRegexFilter(
            {'options': {'keys': pattern_keys}}, meta())

        # when:
        kept = list(regex_filter.filter_batch(batch))

        # then:
        kept_countries = [record['country'] for record in kept]
        self.assertEqual(['es', 'egypt', u'españa'], kept_countries)
Beispiel #12
0
    def test_write_reservoir_sample_s3(self):
        """A reservoir-sampling buffer writes exactly ``sample_size`` items.

        100 records are written through an S3Writer configured with the
        reservoir sampling write buffer; a single 'tests/0.jl' key is expected
        whose content has ``sample_size`` lines that are not simply the first
        ``sample_size`` records of the input.
        """
        import json

        # given
        sample_size = 10
        items_to_write = [
            BaseRecord({
                u'key1': u'value1{}'.format(i),
                u'key2': u'value2{}'.format(i)
            }) for i in range(100)
        ]
        options = self.get_writer_config()
        options['options'].update({
            'compression': 'none',
            'write_buffer': RESERVOIR_SAMPLING_BUFFER_CLASS,
            'write_buffer_options': {
                'sample_size': sample_size
            }
        })

        # when:
        writer = S3Writer(options, meta())
        try:
            writer.write_batch(items_to_write)
            writer.flush()
        finally:
            writer.close()

        # then:
        bucket = self.s3_conn.get_bucket('fake_bucket')
        saved_keys = list(bucket.list())
        # assertEqual/assertNotEqual replace the deprecated assertEquals and
        # assertNotEquals aliases, which no longer exist in Python 3.12.
        self.assertEqual(1, len(saved_keys))
        self.assertEqual(saved_keys[0].name, 'tests/0.jl')
        content = saved_keys[0].get_contents_as_string()
        written_lines = content.strip().splitlines()
        self.assertEqual(len(written_lines), sample_size)

        # The original compared raw text lines with BaseRecord objects, which
        # can never be equal, so the check was vacuous. Compare decoded
        # objects instead.
        # NOTE(review): assumes the .jl output holds one JSON object per
        # line -- confirm against the writer's export format.
        written_items = [
            json.loads(line.decode('utf-8') if isinstance(line, bytes) else line)
            for line in written_lines
        ]
        first_items = [dict(item) for item in items_to_write[:sample_size]]
        self.assertNotEqual(written_items, first_items)
Beispiel #13
0
 def setUp(self):
     """Store the sample size and build a 100-record batch for each test."""
     self.sample_size = 10
     records = []
     for index in range(100):
         fields = {
             u'key1': u'value1{}'.format(index),
             u'key2': u'value2{}'.format(index),
         }
         records.append(BaseRecord(fields))
     self.batch = records
Beispiel #14
0
    def test_should_reduce_and_push_accumulator_to_hubstorage(self):
        # given:
        batch = [
            BaseRecord({
                'name': 'item1',
                'country_code': 'es'
            }),
            BaseRecord({
                'name': 'item2',
                'country_code': 'uk'
            }),
            BaseRecord({
                'name': 'item3',
                'something': 'else'
            }),
        ]
        reduce_code = """
def reduce_function(item, accumulator=None):
    accumulator = 0 if accumulator is None else accumulator
    return accumulator + len(item)
        """
        collection_url = "%s/p/10804/collections/s/test_collection" % DASH_URL
        options = {
            "code": reduce_code,
            "collection_url": collection_url,
            "key": "0004",
            'apikey': 'fakeapikey'
        }
        writer = HubstorageReduceWriter({"options": options}, meta())

        # when:
        writer.write_batch(batch)

        # then:
        self.assertEqual(6, writer.reduced_result)
        self.assertEqual({
            'value': 6,
            '_finished': False
        }, writer.collection.get("0004"))
        writer.finish_writing()
        self.assertEqual({
            'value': 6,
            '_finished': True
        }, writer.collection.get("0004"))
        writer.close()
Beispiel #15
0
    def test_writer_with_grouped_data(self):
        """Grouped records end up in one directory per country/city group.

        With two items per buffer write, the three Paris records are expected
        to be split across two files.
        """
        # given:
        monuments = [
            (u'Madrid', u'ES', 'Royal Palace'),
            (u'Valencia', u'ES', 'Torres de Serranos'),
            (u'Paris', u'FR', 'Eiffel Tour'),
            (u'Paris', u'FR', 'Champ de Mars'),
            (u'Paris', u'FR', 'Arc de Triomphe'),
        ]
        batch = [
            BaseRecord(city=city, country=country, monument=monument)
            for city, country, monument in monuments
        ]
        group_expressions = ["item['country']", "item['city']"]
        grouped_batch = self._build_grouped_batch(
            batch, python_expressions=group_expressions)

        options = self.get_writer_config()
        filebase = os.path.join(self.tmp_dir, '{groups[0]}/{groups[1]}/file')
        options['options']['filebase'] = filebase
        options['options']['items_per_buffer_write'] = 2
        writer = FSWriter(options=options, metadata=meta())

        # when:
        with closing(writer) as w:
            w.write_batch(grouped_batch)
            w.flush()
            w.finish_writing()

        # then:
        relative_paths = (
            'ES/Madrid/file0000.jl.gz',
            'ES/Valencia/file0000.jl.gz',
            'FR/Paris/file0000.jl.gz',
            'FR/Paris/file0001.jl.gz',
        )
        expected = [os.path.join(self.tmp_dir, rel) for rel in relative_paths]

        # Collect every file found anywhere below the temporary directory.
        found = []
        for dirpath, _, filenames in os.walk(self.tmp_dir):
            for filename in filenames:
                found.append(os.path.join(dirpath, filename))

        self.assertEqual(sorted(expected), sorted(found))
 def test_filter_with_fuzzywuzzy(self):
     """An expression can use a module supplied through the 'imports' option."""
     teams = (
         ('Bilbao Falcons', 'es'),
         ('New York Jets', 'us'),
         ('Madrid Cabbaleros', 'es'),
         ('New York Giants', 'us'),
     )
     batch = [
         BaseRecord({'name': name, 'country_code': code})
         for name, code in teams
     ]
     filter_options = {
         'python_expression': "fuzz.ratio('New York', item.get('name')) > 50",
         'imports': {'fuzz': 'fuzzywuzzy.fuzz'},
     }
     python_filter = PythonexpFilter({'options': filter_options}, meta())

     result = list(python_filter.filter_batch(batch))

     matched_names = [d['name'] for d in result]
     self.assertEqual(['New York Jets', 'New York Giants'], matched_names)
Beispiel #17
0
    def test_write_console(self):
        """Writing ten records sets the writer's 'items_count' metadata to 10."""
        records = []
        for key in range(10):
            record = BaseRecord()
            record['key'] = key
            record['value'] = random.randint(0, 10000)
            records.append(record)

        self.writer.write_batch(records)

        written_count = self.writer.get_metadata('items_count')
        self.assertEqual(written_count, 10)
Beispiel #18
0
def get_batch(batch_size=1000):
    """Return a list of ``batch_size`` records filled with random sample data.

    Each record gets a sequential ``key`` plus a random ``country_code``,
    ``state``, ``city`` and ``value`` (an integer between 0 and 10000).
    """
    def build_record(index):
        # Field order matches the order of the random draws.
        record = BaseRecord()
        record['key'] = index
        record['country_code'] = random.choice(country_codes)
        record['state'] = random.choice(states)
        record['city'] = random.choice(cities)
        record['value'] = random.randint(0, 10000)
        return record

    return [build_record(index) for index in range(batch_size)]
Beispiel #19
0
 def get_items_to_write(self):
     """Return two records, each with a ``name`` and a ``birthday`` field."""
     people = (
         ('Roberto', '12/05/1987'),
         ('Claudia', '21/12/1985'),
     )
     return [
         BaseRecord({'name': name, 'birthday': birthday})
         for name, birthday in people
     ]
Beispiel #20
0
    def transform_batch(self, batch):
        """Yield a BaseRecord for every item that the jq program lets through.

        Items for which ``self.jq_program.transform`` raises StopIteration are
        skipped. A non-dict result is parsed with ``yaml.safe_load`` before it
        is wrapped in a BaseRecord.
        """
        for source_item in batch:
            try:
                result = self.jq_program.transform(source_item)
            except StopIteration:
                # transform() signals a filtered-out item this way.
                continue

            if isinstance(result, dict):
                record_data = result
            else:
                record_data = yaml.safe_load(result)
            yield BaseRecord(record_data)
        self.logger.debug('Transformed items')
Beispiel #21
0
    def get_next_batch(self):
        """
        Yield one BaseRecord per message returned by ``get_from_kafka()``.

        The read counter is increased for every message. Once the messages
        are consumed, the consumer's offsets are stored in ``last_position``.
        """
        # A falsy result (None or empty) simply produces no records.
        for message in self.get_from_kafka() or ():
            record = BaseRecord(message)
            self.increase_read()
            yield record

        self.logger.debug('Done reading batch')
        self.last_position = self.consumer.offsets
Beispiel #22
0
 def get_next_batch(self):
     """
     Yield one BaseRecord per message returned by ``get_from_kafka()``.

     This method is called from the manager. If fetching or iterating the
     messages raises an error, ``finished`` is set to True and the generator
     ends normally.
     """
     try:
         batch = self.get_from_kafka()
         for message in batch:
             item = BaseRecord(message)
             self.increase_read()
             yield item
     except Exception:
         # Not a bare ``except``: that would also catch GeneratorExit
         # (raised at the ``yield`` when the consumer closes this generator
         # early), KeyboardInterrupt and SystemExit, and wrongly mark the
         # reader as finished.
         self.finished = True
     self.logger.debug('Done reading batch')
 def get_next_batch(self):
     """
     Yield one BaseRecord per item of the collection scanner's next batch.

     While the scanner is enabled, each yielded item also updates
     ``last_position['last_key']`` with the item's ``_key``. When the scanner
     is no longer enabled, ``finished`` is set to True and nothing is yielded.
     """
     scanner = self.collection_scanner
     if not scanner.is_enabled:
         self.logger.debug('No more batches')
         self.finished = True
         return

     for raw_item in scanner.get_new_batch():
         record = BaseRecord(raw_item)
         self.increase_read()
         self.last_position['last_key'] = raw_item['_key']
         yield record
     self.logger.debug('Done reading batch')
    def test_write_aggregated(self):
        """The writer reports occurrences and coverage for every field seen."""
        # given
        rows = [
            {'name': 'Roberto', 'birthday': '12/05/1987'},
            {'name': 'Claudia', 'birthday': '21/12/1985'},
            {'name': 'Bob', 'birthday': '21/12/1985'},
            {'name': 'Claude', 'last_login': '******'},
        ]
        items_to_write = [BaseRecord(row) for row in rows]
        options = self.get_writer_config()

        # when:
        writer = AggregationStatsWriter(options, meta())
        writer.write_batch(items_to_write)
        writer.close()

        # then:
        def stats(occurrences, coverage):
            return {'occurrences': occurrences, 'coverage': coverage}

        expected_info = {
            'birthday': stats(3, 75.0),
            'last_login': stats(1, 25.0),
            'name': stats(4, 100.0),
        }
        self.assertEqual(expected_info, writer._get_aggregated_info())
Beispiel #25
0
    def test_filter_duplicates_items_without_keys_dont_get_filtered(self):
        """With default options, all three records pass the DupeFilter."""
        rows = (
            ('item1', 'es'),
            ('item2', 'us'),
            ('item3', 'uk'),
        )
        records = [
            BaseRecord({'name': name, 'country_code': code})
            for name, code in rows
        ]
        dupe_filter = DupeFilter({'options': {}}, meta())

        kept = list(dupe_filter.filter_batch(records))

        self.assertEqual(3, len(kept))
Beispiel #26
0
    def test_filter_duplicates_with_custom_key(self):
        """Only the first record for each ``custom_key`` value is kept.

        Six records share three distinct keys (two records per key); the
        filter is expected to keep item1, item3 and item5.
        """
        keys = ['8062219f00c79c88', '1859834d918981df', 'e2abb7b480edf910']
        items = [{
            'custom_key': keys[0],
            'name': 'item1',
            'country_code': 'es'
        }, {
            'custom_key': keys[0],
            'name': 'item2',
            'country_code': 'es'
        }, {
            'custom_key': keys[1],
            'name': 'item3',
            'country_code': 'us'
        }, {
            'custom_key': keys[1],
            'name': 'item4',
            'country_code': 'us'
        }, {
            'custom_key': keys[2],
            'name': 'item5',
            'country_code': 'uk'
        }, {
            'custom_key': keys[2],
            'name': 'item6',
            'country_code': 'uk'
        }]

        batch = [BaseRecord(item) for item in items]
        # Named dupe_filter so the builtin ``filter`` is not shadowed.
        dupe_filter = DupeFilter({'options': {'key_field': 'custom_key'}}, meta())

        batch = list(dupe_filter.filter_batch(batch))

        # assertEqual replaces the deprecated assertEquals alias, which no
        # longer exists in Python 3.12.
        self.assertEqual(3, len(batch))
        self.assertEqual(set(keys),
                         {item['custom_key'] for item in batch})
        self.assertEqual({'item1', 'item3', 'item5'},
                         {item['name'] for item in batch})
Beispiel #27
0
    def setUp(self):
        """Prepare options, filter keys, a two-record batch and the filter."""
        exporter_options = {
            'log_level': 'DEBUG',
            'logger_name': 'export-pipeline',
        }
        self.options = {'exporter_options': exporter_options}
        self.keys = [{'name': 'country_code', 'value': 'es'}]

        rows = (('item1', 'es'), ('item2', 'uk'))
        self.batch = [
            BaseRecord({'name': name, 'country_code': code})
            for name, code in rows
        ]
        filter_config = {'options': {'keys': self.keys}}
        self.filter = KeyValueFilter(filter_config, meta())
Beispiel #28
0
    def test_filter_with_non_existing_op(self):
        """Creating a KeyValueFilter with an unknown operator raises InvalidOperator.

        The error is expected at construction time, so no batch is needed;
        the unused records built by the earlier version were removed.
        """
        keys = [{
            'name': 'country_code',
            'value': ['es', 'us'],
            'operator': 'not_an_operator'
        }]

        # assertRaisesRegexp no longer exists in Python 3.12, while older
        # unittest versions only have that spelling. Prefer the modern name
        # and fall back to the alias only when it is missing.
        assert_raises_regex = (getattr(self, 'assertRaisesRegex', None)
                               or self.assertRaisesRegexp)
        with assert_raises_regex(InvalidOperator, 'operator not valid'):
            KeyValueFilter({'options': {'keys': keys}}, meta())
Beispiel #29
0
 def get_next_batch(self):
     """
     Yield up to ``self.batch_size`` records filled with random values.

     Keys continue from ``self.last_read``. When the next key would reach the
     ``number_of_items`` option, ``finished`` is set to True and the batch
     ends early.
     """
     total_items = self.read_option('number_of_items')
     for _ in range(self.batch_size):
         next_key = self.last_read + 1
         if next_key >= total_items:
             self.finished = True
             break

         record = BaseRecord()
         self.last_read = next_key
         record['key'] = self.last_read
         record['country_code'] = random.choice(self.country_codes)
         record['state'] = random.choice(self.states)
         record['city'] = random.choice(self.cities)
         record['value'] = random.randint(0, 10000)
         self.increase_read()
         self.last_position['last_read'] = self.last_read
         yield record
     self.logger.debug('Done reading batch')
Beispiel #30
0
 def deserialize(self, stream):
     """Yield one BaseRecord per CSV row read from ``stream``.

     The stream's ``mode`` is set to "lines" before it is handed to
     ``csv.DictReader``.
     """
     stream.mode = "lines"
     for row in csv.DictReader(stream):
         yield BaseRecord(row)
Beispiel #31
0
 def deserialize(self, stream):
     """Yield one BaseRecord per line of ``stream``, parsing each line as JSON."""
     for raw_line in stream.iterlines():
         parsed = json.loads(raw_line)
         yield BaseRecord(parsed)