def test_should_reduce_items(self):
    batch = [
        BaseRecord({'name': 'item1', 'country_code': 'es'}),
        BaseRecord({'name': 'item2', 'country_code': 'uk'}),
        BaseRecord({'name': 'item3', 'something': 'else'}),
    ]
    reduce_code = """
def reduce_function(item, accumulator=None):
    from collections import Counter
    accumulator = accumulator or Counter()
    for key in item:
        accumulator[key] += 1
    return accumulator
"""
    writer = ReduceWriter({"options": {"code": reduce_code}}, meta())
    writer.write_batch(batch)
    writer.write_batch(batch)
    expected = {'country_code': 4, 'name': 6, 'something': 2}
    self.assertEqual(expected, dict(writer.reduced_result))
    writer.close()
def test_should_push_items_to_hubstorage(self, mock_col):
    mock_writer = mock_col.return_value.create_writer.return_value
    # given:
    batch = [
        BaseRecord({'name': 'item1', 'country_code': 'es'}),
        BaseRecord({'name': 'item2', 'country_code': 'uk'}),
        BaseRecord({'name': 'item3', 'something': 'else'}),
    ]
    options = {
        "project_id": "10804",
        "collection_name": "test_collection",
        "key_field": "name",
        'apikey': 'fakeapikey',
        'size_per_buffer_write': 2,
    }
    # when:
    writer = HubstorageWriter({"options": options}, meta())
    with closing(writer):
        writer.write_batch(batch)
        writer.flush()
    # then:
    self.assertEqual(3, len(mock_writer.write.mock_calls))
    self.assertEqual(3, writer.get_metadata('items_count'))
    expected_calls = [mock.call(dict(it, _key=it['name'])) for it in batch]
    self.assertEqual(expected_calls, mock_writer.write.mock_calls)
def test_filter_batch_no_op(self):
    items = [{'name': 'item1', 'value': 'value1'},
             {'name': 'item2', 'value': 'value2'}]
    batch = []
    for item in items:
        record = BaseRecord()
        record.record = item
        batch.append(record)
    self.assertEqual(self.filter.filter_batch(batch), batch)
def sample_batch(self):
    return [
        BaseRecord({'name': 'item1', 'country': 'es'}),
        BaseRecord({'name': 'item2', 'country': 'uk'}),
    ]
def test_filter_batch_with_python_expression(self):
    batch = [
        BaseRecord({'name': 'item1', 'country_code': 'es'}),
        BaseRecord({'name': 'item2', 'country_code': 'uk'}),
    ]
    python_filter = PythonexpFilter(
        {'options': {'python_expression': "item['country_code'] == 'uk'"}},
        meta()
    )
    result = list(python_filter.filter_batch(batch))
    self.assertEqual(1, len(result))
    self.assertEqual('uk', dict(result[0])['country_code'])
def setUp(self):
    self.options = {
        'exporter_options': {
            'log_level': 'DEBUG',
            'logger_name': 'export-pipeline'
        },
    }
    self.batch = [
        BaseRecord({'name': 'item1', 'country_code': 'es'}),
        BaseRecord({'name': 'item2', 'country_code': 'uk'}),
    ]
def test_filter_batch_no_op(self):
    items = [{'name': 'item1', 'value': 'value1'},
             {'name': 'item2', 'value': 'value2'}]
    batch = []
    for item in items:
        record = BaseRecord()
        record.record = item
        batch.append(record)
    self.assertEqual(self.filter.filter_batch(batch), batch)
def test_filter_with_datetime(self):
    now = datetime.datetime.now()
    batch = [
        BaseRecord({'name': 'item1', 'updated': str(now - datetime.timedelta(days=2))}),
        BaseRecord({'name': 'item2', 'updated': str(now - datetime.timedelta(days=1))}),
        BaseRecord({'name': 'item3', 'updated': str(now)}),
    ]
    expr = ("item.get('updated') and item['updated'] >= "
            "str(datetime.datetime.now() - datetime.timedelta(days=1))[:10]")
    python_filter = PythonexpFilter(
        {'options': {'python_expression': expr}}, meta())
    result = list(python_filter.filter_batch(batch))
    self.assertEqual(['item2', 'item3'], [d['name'] for d in result])
def setUp(self):
    self.batch = [
        BaseRecord({u'key1': u'value11', u'key2': u'value21'}),
        BaseRecord({u'key1': u'value12', u'key2': u'value22'}),
        BaseRecord({u'key1': u'value13', u'key2': u'value23'}),
    ]
def test_filter_with_in_key_value(self):
    keys = [{'name': 'country_code', 'value': 'es', 'operator': 'contains'}]
    items = [
        {'name': 'item1', 'country_code': ['es', 'us']},
        {'name': 'item2', 'country_code': ['es', 'us']},
        {'name': 'item3', 'country_code': ['uk']},
    ]
    batch = [BaseRecord(item) for item in items]
    key_value_filter = KeyValueFilter({'options': {'keys': keys}}, meta())
    result = list(key_value_filter.filter_batch(batch))
    self.assertEqual(2, len(result))
def test_filter_batch_with_key_value_regex(self):
    # given:
    items = [
        {'name': 'item1', 'country': u'es'},
        {'name': 'item2', 'country': u'egypt'},
        {'name': 'item3', 'country': u'uk'},
        {'name': 'item4', 'country': u'españa'},
    ]
    batch = [BaseRecord(it) for it in items]
    keys = [{'name': 'country', 'value': 'e[sg]'}]
    regex_filter = KeyValueRegexFilter({'options': {'keys': keys}}, meta())
    # when:
    result = list(regex_filter.filter_batch(batch))
    # then:
    self.assertEqual(['es', 'egypt', u'españa'],
                     [d['country'] for d in result])
def test_write_reservoir_sample_s3(self):
    # given:
    sample_size = 10
    items_to_write = [
        BaseRecord({
            u'key1': u'value1{}'.format(i),
            u'key2': u'value2{}'.format(i)
        })
        for i in range(100)
    ]
    options = self.get_writer_config()
    options['options'].update({
        'compression': 'none',
        'write_buffer': RESERVOIR_SAMPLING_BUFFER_CLASS,
        'write_buffer_options': {'sample_size': sample_size}
    })
    # when:
    writer = S3Writer(options, meta())
    try:
        writer.write_batch(items_to_write)
        writer.flush()
    finally:
        writer.close()
    # then:
    bucket = self.s3_conn.get_bucket('fake_bucket')
    saved_keys = list(bucket.list())
    self.assertEqual(1, len(saved_keys))
    self.assertEqual(saved_keys[0].name, 'tests/0.jl')
    content = saved_keys[0].get_contents_as_string()
    self.assertEqual(len(content.strip().splitlines()), sample_size)
    self.assertNotEqual(content.strip().splitlines(),
                        items_to_write[:sample_size])
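# For context, a minimal sketch of classic reservoir sampling (Algorithm R),
# the technique the RESERVOIR_SAMPLING_BUFFER_CLASS write buffer is named
# after; this is an illustration, not the buffer's actual implementation.
import random

def reservoir_sample(iterable, sample_size):
    sample = []
    for i, item in enumerate(iterable):
        if i < sample_size:
            sample.append(item)
        else:
            # Replace an existing entry with decreasing probability, so every
            # item seen so far is kept with probability sample_size / (i + 1).
            j = random.randint(0, i)
            if j < sample_size:
                sample[j] = item
    return sample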
def setUp(self):
    self.sample_size = 10
    self.batch = [
        BaseRecord({
            u'key1': u'value1{}'.format(i),
            u'key2': u'value2{}'.format(i)
        })
        for i in range(100)
    ]
def test_should_reduce_and_push_accumulator_to_hubstorage(self):
    # given:
    batch = [
        BaseRecord({'name': 'item1', 'country_code': 'es'}),
        BaseRecord({'name': 'item2', 'country_code': 'uk'}),
        BaseRecord({'name': 'item3', 'something': 'else'}),
    ]
    reduce_code = """
def reduce_function(item, accumulator=None):
    accumulator = 0 if accumulator is None else accumulator
    return accumulator + len(item)
"""
    collection_url = "%s/p/10804/collections/s/test_collection" % DASH_URL
    options = {
        "code": reduce_code,
        "collection_url": collection_url,
        "key": "0004",
        'apikey': 'fakeapikey'
    }
    writer = HubstorageReduceWriter({"options": options}, meta())
    # when:
    writer.write_batch(batch)
    # then:
    self.assertEqual(6, writer.reduced_result)
    self.assertEqual({'value': 6, '_finished': False},
                     writer.collection.get("0004"))
    writer.finish_writing()
    self.assertEqual({'value': 6, '_finished': True},
                     writer.collection.get("0004"))
    writer.close()
def test_writer_with_grouped_data(self):
    # given:
    batch = [
        BaseRecord(city=u'Madrid', country=u'ES', monument='Royal Palace'),
        BaseRecord(city=u'Valencia', country=u'ES', monument='Torres de Serranos'),
        BaseRecord(city=u'Paris', country=u'FR', monument='Eiffel Tour'),
        BaseRecord(city=u'Paris', country=u'FR', monument='Champ de Mars'),
        BaseRecord(city=u'Paris', country=u'FR', monument='Arc de Triomphe'),
    ]
    grouped_batch = self._build_grouped_batch(
        batch, python_expressions=["item['country']", "item['city']"])
    options = self.get_writer_config()
    options['options']['filebase'] = os.path.join(
        self.tmp_dir, '{groups[0]}/{groups[1]}/file')
    options['options']['items_per_buffer_write'] = 2
    writer = FSWriter(options=options, metadata=meta())
    # when:
    with closing(writer) as w:
        w.write_batch(grouped_batch)
        w.flush()
        w.finish_writing()
    # then:
    expected_files = [
        'ES/Madrid/file0000.jl.gz',
        'ES/Valencia/file0000.jl.gz',
        'FR/Paris/file0000.jl.gz',
        'FR/Paris/file0001.jl.gz',
    ]
    expected = [os.path.join(self.tmp_dir, f) for f in expected_files]

    def listdir_recursive(path):
        return [os.path.join(d, f)
                for d, _, fnames in os.walk(path)
                for f in fnames]

    self.assertEqual(sorted(expected),
                     sorted(listdir_recursive(self.tmp_dir)))
def test_filter_with_fuzzywuzzy(self):
    batch = [
        BaseRecord({'name': 'Bilbao Falcons', 'country_code': 'es'}),
        BaseRecord({'name': 'New York Jets', 'country_code': 'us'}),
        BaseRecord({'name': 'Madrid Cabbaleros', 'country_code': 'es'}),
        BaseRecord({'name': 'New York Giants', 'country_code': 'us'}),
    ]
    expr = "fuzz.ratio('New York', item.get('name')) > 50"
    python_filter = PythonexpFilter(
        {'options': {
            'python_expression': expr,
            'imports': {'fuzz': 'fuzzywuzzy.fuzz'}
        }},
        meta()
    )
    result = list(python_filter.filter_batch(batch))
    self.assertEqual(['New York Jets', 'New York Giants'],
                     [d['name'] for d in result])
def test_write_console(self):
    items_to_write = []
    for i in range(10):
        item = BaseRecord()
        item['key'] = i
        item['value'] = random.randint(0, 10000)
        items_to_write.append(item)
    self.writer.write_batch(items_to_write)
    self.assertEqual(self.writer.get_metadata('items_count'), 10)
def get_batch(batch_size=1000):
    batch = []
    for i in range(batch_size):
        item = BaseRecord()
        item['key'] = i
        item['country_code'] = random.choice(country_codes)
        item['state'] = random.choice(states)
        item['city'] = random.choice(cities)
        item['value'] = random.randint(0, 10000)
        batch.append(item)
    return batch
def get_items_to_write(self):
    data = [
        {'name': 'Roberto', 'birthday': '12/05/1987'},
        {'name': 'Claudia', 'birthday': '21/12/1985'},
    ]
    return [BaseRecord(d) for d in data]
def transform_batch(self, batch):
    for item in batch:
        try:
            transformed_item = self.jq_program.transform(item)
        except StopIteration:
            # jq.transform() raises StopIteration for filtered-out items
            continue
        if not isinstance(transformed_item, dict):
            transformed_item = yaml.safe_load(transformed_item)
        yield BaseRecord(transformed_item)
    self.logger.debug('Transformed items')
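# A minimal, standalone sketch of the behaviour the except-clause above relies
# on, assuming the "jq" PyPI bindings (jq.compile / .input / .first); the
# program string is illustrative, not this transformer's actual configuration.
import jq

# 'select' re-emits the input only when the condition holds; for any other
# input the compiled program produces no output at all.
es_only = jq.compile('select(.country_code == "es")')

es_only.input({'country_code': 'es'}).first()  # -> {'country_code': 'es'}
try:
    es_only.input({'country_code': 'uk'}).first()  # no output to take
except StopIteration:
    pass  # the item was filtered out, mirroring the 'continue' above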
def get_next_batch(self):
    """
    This method is called from the manager. It must return a list or a
    generator of BaseRecord objects. When it has nothing else to read,
    it must set the class variable "finished" to True.
    """
    messages = self.get_from_kafka()
    if messages:
        for message in messages:
            item = BaseRecord(message)
            self.increase_read()
            yield item
    self.logger.debug('Done reading batch')
    self.last_position = self.consumer.offsets
def get_next_batch(self):
    """
    This method is called from the manager. It must return a list or a
    generator of BaseRecord objects. When it has nothing else to read,
    it must set the class variable "finished" to True.
    """
    try:
        batch = self.get_from_kafka()
        for message in batch:
            item = BaseRecord(message)
            self.increase_read()
            yield item
    except Exception:
        # A bare except here would also swallow GeneratorExit; catch only
        # real errors and treat them as end-of-input.
        self.finished = True
    self.logger.debug('Done reading batch')
def get_next_batch(self):
    """
    This method is called from the manager. It must return a list or a
    generator of BaseRecord objects. When it has nothing else to read,
    it must set the class variable "finished" to True.
    """
    if self.collection_scanner.is_enabled:
        batch = self.collection_scanner.get_new_batch()
        for item in batch:
            base_item = BaseRecord(item)
            self.increase_read()
            self.last_position['last_key'] = item['_key']
            yield base_item
        self.logger.debug('Done reading batch')
    else:
        self.logger.debug('No more batches')
        self.finished = True
def test_write_aggregated(self):
    # given:
    data = [
        {'name': 'Roberto', 'birthday': '12/05/1987'},
        {'name': 'Claudia', 'birthday': '21/12/1985'},
        {'name': 'Bob', 'birthday': '21/12/1985'},
        {'name': 'Claude', 'last_login': '******'},
    ]
    items_to_write = [BaseRecord(d) for d in data]
    options = self.get_writer_config()
    # when:
    writer = AggregationStatsWriter(options, meta())
    writer.write_batch(items_to_write)
    writer.close()
    # then:
    expected_info = {
        'birthday': {'occurrences': 3, 'coverage': 75.0},
        'last_login': {'occurrences': 1, 'coverage': 25.0},
        'name': {'occurrences': 4, 'coverage': 100.0},
    }
    self.assertEqual(expected_info, writer._get_aggregated_info())
def test_filter_duplicates_items_without_keys_dont_get_filtered(self):
    items = [
        {'name': 'item1', 'country_code': 'es'},
        {'name': 'item2', 'country_code': 'us'},
        {'name': 'item3', 'country_code': 'uk'},
    ]
    batch = [BaseRecord(item) for item in items]
    dupe_filter = DupeFilter({'options': {}}, meta())
    result = list(dupe_filter.filter_batch(batch))
    self.assertEqual(3, len(result))
def test_filter_duplicates_with_custom_key(self):
    keys = ['8062219f00c79c88', '1859834d918981df', 'e2abb7b480edf910']
    items = [
        {'custom_key': keys[0], 'name': 'item1', 'country_code': 'es'},
        {'custom_key': keys[0], 'name': 'item2', 'country_code': 'es'},
        {'custom_key': keys[1], 'name': 'item3', 'country_code': 'us'},
        {'custom_key': keys[1], 'name': 'item4', 'country_code': 'us'},
        {'custom_key': keys[2], 'name': 'item5', 'country_code': 'uk'},
        {'custom_key': keys[2], 'name': 'item6', 'country_code': 'uk'},
    ]
    batch = [BaseRecord(item) for item in items]
    dupe_filter = DupeFilter({'options': {'key_field': 'custom_key'}}, meta())
    result = list(dupe_filter.filter_batch(batch))
    self.assertEqual(3, len(result))
    self.assertEqual(set(keys),
                     set(item['custom_key'] for item in result))
    self.assertEqual(set(['item1', 'item3', 'item5']),
                     set(item['name'] for item in result))
def setUp(self):
    self.options = {
        'exporter_options': {
            'log_level': 'DEBUG',
            'logger_name': 'export-pipeline'
        }
    }
    self.keys = [{'name': 'country_code', 'value': 'es'}]
    items = [
        {'name': 'item1', 'country_code': 'es'},
        {'name': 'item2', 'country_code': 'uk'},
    ]
    self.batch = [BaseRecord(item) for item in items]
    self.filter = KeyValueFilter({'options': {'keys': self.keys}}, meta())
def test_filter_with_non_existing_op(self):
    keys = [{
        'name': 'country_code',
        'value': ['es', 'us'],
        'operator': 'not_an_operator'
    }]
    items = [
        {'name': 'item1', 'country_code': 'es'},
        {'name': 'item2', 'country_code': 'us'},
        {'name': 'item3', 'country_code': 'uk'},
    ]
    batch = [BaseRecord(item) for item in items]
    with self.assertRaisesRegexp(InvalidOperator, 'operator not valid'):
        KeyValueFilter({'options': {'keys': keys}}, meta())
def get_next_batch(self):
    """
    This method is called from the manager. It must return a list or a
    generator of BaseRecord objects. When it has nothing else to read,
    it must set the class variable "finished" to True.
    """
    number_of_items = self.read_option('number_of_items')
    for i in range(self.batch_size):
        to_read = self.last_read + 1
        if to_read >= number_of_items:
            self.finished = True
            break
        item = BaseRecord()
        self.last_read = to_read
        item['key'] = self.last_read
        item['country_code'] = random.choice(self.country_codes)
        item['state'] = random.choice(self.states)
        item['city'] = random.choice(self.cities)
        item['value'] = random.randint(0, 10000)
        self.increase_read()
        self.last_position['last_read'] = self.last_read
        yield item
    self.logger.debug('Done reading batch')
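# A minimal sketch of the consuming side of the contract described in the
# docstrings above ("drain" and "reader" are hypothetical names, not the
# real manager API): keep requesting batches until the reader flips its
# "finished" flag to True.
def drain(reader):
    records = []
    while not reader.finished:
        for record in reader.get_next_batch():
            records.append(record)
    return records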
def deserialize(self, stream):
    stream.mode = "lines"
    reader = csv.DictReader(stream)
    for item in reader:
        yield BaseRecord(item)
def deserialize(self, stream):
    for line in stream.iterlines():
        yield BaseRecord(json.loads(line))
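# A quick, isolated way to exercise the JSON-lines deserializer above.
# FakeStream is a hypothetical stand-in: only the iterlines() method used by
# deserialize() is assumed; the real stream type is not shown in this snippet.
class FakeStream(object):
    def __init__(self, text):
        self.text = text

    def iterlines(self):
        # Yield one JSON document per line, as a real stream would.
        return iter(self.text.splitlines())

# Example: each input line becomes one BaseRecord.
#   records = list(deserializer.deserialize(FakeStream('{"a": 1}\n{"a": 2}')))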