def test_get_file_number(self):
    """The file counter starts at zero: the first output is *_0000.jl.gz."""
    config = self.get_writer_config()
    writer = FSWriter(config, meta())
    try:
        writer.write_batch(self.get_batch())
        writer.flush()
    finally:
        # Always release the writer's resources, even if the write fails.
        writer.close()
    expected_file = '{}/exporter_test0000.jl.gz'.format(self.tmp_dir)
    self.assertTrue(expected_file in writer.written_files)
def test_writer_md5_generation(self):
    """With ``generate_md5`` enabled, finishing the writer must leave an
    md5checksum.md5 file in the output directory.

    Fix: the assertion message read "Didn't found ..."; corrected the
    grammar to "Didn't find ...".
    """
    # given:
    options = self.get_writer_config()
    options['options']['generate_md5'] = True

    # when:
    writer = FSWriter(options, meta())
    with closing(writer) as w:
        w.write_batch(self.get_batch())
        w.flush()
        w.finish_writing()

    # then:
    self.assertTrue(
        os.path.isfile(os.path.join(self.tmp_dir, 'md5checksum.md5')),
        "Didn't find an expected md5checksum.md5 file")
def test_no_compression(self):
    """With compression set to 'none' the writer emits a plain .jl file
    whose JSON lines round-trip back to the original batch."""
    config = self.get_writer_config()
    config['options'].update({'compression': 'none'})
    writer = FSWriter(config, meta())
    try:
        writer.write_batch(self.get_batch())
        writer.flush()
    finally:
        writer.close()

    expected_file = '{}/exporter_test0000.jl'.format(self.tmp_dir)
    self.assertTrue(expected_file in writer.written_files)

    # Read the uncompressed output back and compare with what was written.
    with open(expected_file, 'r') as fin:
        written = [json.loads(line) for line in fin]
    self.assertEqual(written, self.get_batch())
def test_writer_with_grouped_data(self):
    """Grouped records must be routed into per-group directories derived
    from the {groups[...]} filebase placeholders, rolling to a new file
    once items_per_buffer_write records have been buffered."""
    # given:
    batch = [
        BaseRecord(city=u'Madrid', country=u'ES', monument='Royal Palace'),
        BaseRecord(city=u'Valencia', country=u'ES', monument='Torres de Serranos'),
        BaseRecord(city=u'Paris', country=u'FR', monument='Eiffel Tour'),
        BaseRecord(city=u'Paris', country=u'FR', monument='Champ de Mars'),
        BaseRecord(city=u'Paris', country=u'FR', monument='Arc de Triomphe'),
    ]
    grouped_batch = self._build_grouped_batch(
        batch, python_expressions=["item['country']", "item['city']"])

    options = self.get_writer_config()
    options['options']['filebase'] = os.path.join(
        self.tmp_dir, '{groups[0]}/{groups[1]}/file')
    options['options']['items_per_buffer_write'] = 2
    writer = FSWriter(options=options, metadata=meta())

    # when:
    with closing(writer) as w:
        w.write_batch(grouped_batch)
        w.flush()
        w.finish_writing()

    # then: three Paris records at 2 items/buffer yield two FR/Paris files.
    expected_files = [
        'ES/Madrid/file0000.jl.gz',
        'ES/Valencia/file0000.jl.gz',
        'FR/Paris/file0000.jl.gz',
        'FR/Paris/file0001.jl.gz',
    ]
    expected = [os.path.join(self.tmp_dir, name) for name in expected_files]

    def walk_files(root):
        # Every file path under *root*, recursively.
        return [os.path.join(dirpath, fname)
                for dirpath, _, fnames in os.walk(root)
                for fname in fnames]

    self.assertEqual(sorted(expected), sorted(walk_files(self.tmp_dir)))
def test_compression_zip_format(self):
    """With compression set to 'zip' the writer produces a .jl.zip archive
    containing the JSON-lines member, which must round-trip to the batch."""
    import zipfile  # hoisted to the top of the function for readability

    config = self.get_writer_config()
    config['options'].update({'compression': 'zip'})
    writer = FSWriter(config, meta())
    try:
        writer.write_batch(self.get_batch())
        writer.flush()
    finally:
        writer.close()

    expected_file = '{}/exporter_test0000.jl.zip'.format(self.tmp_dir)
    self.assertTrue(expected_file in writer.written_files)

    # Open the archive member and decode it line by line.
    written = []
    with zipfile.ZipFile(expected_file) as archive:
        with archive.open('exporter_test0000.jl') as member:
            written = [json.loads(line) for line in member]
    self.assertEqual(written, self.get_batch())
def test_get_file_number_with_date(self):
    """strftime patterns in the filebase must be expanded with the current
    date, and {file_number} with the configured start_file_count.

    NOTE(review): a method with this exact name is defined again later in
    this class and shadows this one, so this copy never runs under test
    discovery — confirm and remove one of the duplicates.
    """
    file_path = '/tmp/%Y%m%d/'
    file_name = '{file_number}_exporter_test_%m%d%y'
    start_file_count = 1

    config = self.get_writer_config()
    config.update({'options': {
        'filebase': file_path + file_name,
        'start_file_count': start_file_count
    }})
    writer = FSWriter(config, meta())
    try:
        writer.write_batch(self.get_batch())
        writer.flush()
    finally:
        writer.close()

    # Reproduce the expansion the writer is expected to perform.
    file_path = datetime.datetime.now().strftime(file_path).format(
        file_number=start_file_count)
    file_name = datetime.datetime.now().strftime(file_name).format(
        file_number=start_file_count)
    self.assertIn(file_path + file_name + '.jl.gz', writer.written_files)
def test_get_file_number_with_date(self):
    """strftime patterns in the filebase must be expanded with the current
    date, and {file_number} with the configured start_file_count.

    Fixes: ``datetime.datetime.now()`` was called twice, so the expected
    path and name could disagree if the two calls straddled a date
    boundary — a single timestamp is now used for both expansions.

    NOTE(review): an earlier method in this class has this exact name and
    is shadowed by this one — confirm and remove one of the duplicates.
    """
    file_path = '/tmp/%Y%m%d/'
    file_name = '{file_number}_exporter_test_%m%d%y'
    start_file_count = 1
    writer_config = self.get_writer_config()
    writer_config.update({
        'options': {
            'filebase': file_path + file_name,
            'start_file_count': start_file_count
        }
    })
    writer = FSWriter(writer_config, meta())
    try:
        writer.write_batch(self.get_batch())
        writer.flush()
    finally:
        writer.close()

    # One timestamp for both expansions keeps them mutually consistent.
    now = datetime.datetime.now()
    file_path = now.strftime(file_path).format(file_number=start_file_count)
    file_name = now.strftime(file_name).format(file_number=start_file_count)
    self.assertIn(file_path + file_name + '.jl.gz', writer.written_files)
def test_check_writer_consistency(self):
    """With check_consistency enabled, finish_writing() must detect both a
    tampered (wrong-size) output file and a missing output file.

    NOTE(review): ``assertRaisesRegexp`` is a deprecated Python 3 alias of
    ``assertRaisesRegex`` — kept as-is in case this suite still runs on
    Python 2; confirm before renaming.
    """
    # given:
    options = self.get_writer_config()
    options['options']['check_consistency'] = True

    # when:
    writer = FSWriter(options, meta())
    try:
        writer.write_batch(self.get_batch())
        writer.flush()
    finally:
        writer.close()

    # then: the untouched output passes the consistency check
    writer.finish_writing()

    target = os.path.join(self.tmp_dir, 'exporter_test0000.jl.gz')

    # Truncate the file so its size no longer matches the recorded state.
    with open(target, 'w'):
        with self.assertRaisesRegexp(InconsistentWriteState,
                                     'Wrong size for file'):
            writer.finish_writing()

    # Remove the file entirely so the presence check fails.
    os.remove(target)
    with self.assertRaisesRegexp(InconsistentWriteState,
                                 'file is not present at destination'):
        writer.finish_writing()