def test_header_only_input_one_output_file_with_header(self):
    """A header-only input still produces one chunk containing the header."""
    rw = ReaderWriter()
    rw.writerow(u'a b'.split())
    m.split(rw, prefix='split.', chunk_size=1)
    self.assertEqual(u'a,b', header(u'split.0'))
def test_input_contain_zip_field_exception(self):
    """unzip() rejects input whose header already contains the zip field."""
    source = self.csv_header_a_b_c()
    out_spec = ReaderWriter()
    out_unspec = ReaderWriter()
    with self.assertRaises(m.DuplicateFieldError):
        m.unzip(source, ['a'], out_spec, out_unspec, zip_field='a')
def test_zip_id_defaults_to_id(self):
    """When zip_field is omitted the unspec output header uses 'id'."""
    source = self.csv_header_a_b_c()
    out_spec, out_unspec = ReaderWriter(), ReaderWriter()
    m.unzip(source, ['a'], out_spec, out_unspec)
    self.assertListEqual([['id', 'b', 'c']], out_unspec.rows)
def test_output_header(self):
    """Extracted columns are removed; the id column is appended to the header."""
    source = ReaderWriter()
    source.rows = [('aa', 'bb', 'cc', 'dd')]
    sink = ReaderWriter()
    m.ExtractMap('b=bb,c=cc', 'a=id').process(source, sink)
    self.assertEqual([('aa', 'dd', 'id')], sink.rows)
def test_multiple_output_files_have_same_header(self):
    """Every chunk written by split() repeats the input header."""
    rw = ReaderWriter()
    for row in (u'a b'.split(), [1, 2], [3, 4]):
        rw.writerow(row)
    m.split(rw, prefix='split.', chunk_size=1)
    for chunk in (u'split.0', u'split.1'):
        self.assertEqual(u'a,b', header(chunk))
def test_custom_zip_id_in_out_unspec(self):
    """An explicit zip_field name appears in the unspec output header."""
    source = self.csv_header_a_b_c()
    out_spec, out_unspec = ReaderWriter(), ReaderWriter()
    m.unzip(source, ['a'], out_spec, out_unspec, zip_field='zip_id')
    self.assertListEqual([['zip_id', 'b', 'c']], out_unspec.rows)
def test_ids_are_converted_to_string(self):
    """Ids stored as text are parsed into ints when the map is read back."""
    source = ReaderWriter()
    source.rows = [('id', 'value'), ('1', 'one')]
    value_map = make_map('value', 'id')
    value_map.read(source)
    self.assertEqual({('one',): 1}, value_map.values)
def test_less_data_rows_than_chunk_size_one_file_created(self):
    """Input smaller than chunk_size yields exactly one output file."""
    rw = ReaderWriter()
    for row in (u'a b'.split(), [1, 2], [3, 4]):
        rw.writerow(row)
    m.split(rw, prefix='split.', chunk_size=3)
    self.assertTrue(os.path.exists(u'split.0'))
    self.assertFalse(os.path.exists(u'split.1'))
def test_header_is_output_field_names(self):
    """Transformer writes its output_field_names as the first row."""
    source = ReaderWriter()
    source.rows = [('a', 'b')]
    sink = ReaderWriter()
    transformer = m.Transformer()
    transformer.output_field_names = sentinel.output_field_names
    transformer.process(source, sink)
    self.assertEqual([sentinel.output_field_names], sink.rows)
def test_content_is_produced_by_process(self):
    """Each data row goes through transform(); results follow the header."""
    source = ReaderWriter()
    source.rows = [('a', 'b'), (1, 2), (1, 2)]
    sink = ReaderWriter()
    transformer = m.Transformer()
    transformer.transform = mock.Mock(
        transformer.transform, return_value=sentinel.output)
    transformer.process(source, sink)
    self.assertEqual([sentinel.output, sentinel.output], sink.rows[1:])
def __init__(self):
    """Build the fixture: fake in/out streams and a configured EntityExtractor."""
    self.appender = ReaderWriter()
    self.mapper_appender = ReaderWriter()
    self.mapper_reader = self._mapper_reader()
    self.reader = self._reader()
    # ref_field_map: entity-mapper field 'id' appears as 'ab_id' in the in/output.
    # fields_map:    in/output fields 'a', 'b' map to entity-mapper 'a', 'other'.
    self.extractor = m.EntityExtractor(
        ref_field_map=FieldsMap.parse('id=ab_id'),
        fields_map=FieldsMap.parse('a,other=b'),
        keep_fields=True)
def test_keep_fields(self):
    """keep_fields=True keeps the original columns and appends the mapped id."""
    with open('map.csv', 'w') as f:
        f.write('id,a\n5,a')
    source = ReaderWriter()
    source.rows = [('a', 'b'), ('a', 'b'), ('c', 'd')]
    sink = ReaderWriter()
    m.extract_map(source, sink, 'map.csv', 'a', 'id', keep_fields=True)
    expected = [('a', 'b', 'id'), ('a', 'b', 5), ('c', 'd', 6)]
    self.assertEqual(expected, sink.rows)
def test_existing_map_used(self):
    """Ids in an existing map file are reused; new values get fresh ids."""
    with open('map.csv', 'w') as f:
        f.write('id,a\n5,a')
    source = ReaderWriter()
    source.rows = [('a', 'b'), ('a', 'b'), ('c', 'd')]
    sink = ReaderWriter()
    m.extract_map(source, sink, 'map.csv', 'a', 'id')
    self.assertEqual(sorted([('b', 5), ('d', 6)]), sorted(sink.rows[1:]))
def test_11_data_rows_chunk_size_1_11_files_created(self):
    """chunk_size=1 turns 11 data rows into exactly 11 chunk files.

    The original only spot-checked chunks 0, 1 and 10 (with a ``# ...``
    placeholder for the rest); looping checks every chunk at no extra cost.
    """
    rows = ReaderWriter()
    rows.writerow(u'a b'.split())
    for i in range(11):
        rows.writerow([i, i + 1])
    m.split(rows, prefix='split.', chunk_size=1)
    for i in range(11):
        self.assertTrue(os.path.exists(u'split.%d' % i))
    self.assertFalse(os.path.exists(u'split.11'))
def test_output_file_contains_rows_from_input(self):
    """A chunk file holds the header plus the data rows, CSV-encoded."""
    rw = ReaderWriter()
    for row in (u'a b'.split(), [1, 2], [3, 4]):
        rw.writerow(row)
    m.split(rw, prefix='split.', chunk_size=2)
    with codecs.open('split.0', encoding='utf8') as f:
        content = list(csv.reader(f))
    self.assertEqual([[u'a', u'b'], [u'1', u'2'], [u'3', u'4']], content)
def test_changed_map_is_written_out(self):
    """Newly assigned mappings are persisted back to the map file.

    Fix: the original compared a sorted list against a bare ``map`` object;
    under Python 3 ``map`` returns an iterator, so that assertEqual can
    never pass. Materialize the file rows as tuples before comparing.
    """
    with open('map.csv', 'w') as f:
        f.write('id,a\n5,a')
    reader = ReaderWriter()
    reader.rows = [('a', 'b'), ('a', 'b'), ('c', 'd')]
    writer = ReaderWriter()
    m.extract_map(reader, writer, 'map.csv', 'a', 'id')
    with open('map.csv') as f:
        items = [tuple(row) for row in csv.reader(f)]
    self.assertEqual(
        sorted([('id', 'a'), ('5', 'a'), ('6', 'c')]),
        sorted(items))
def test_map_header(self):
    """The persisted map header uses the renamed fields with the id first."""
    source = ReaderWriter()
    source.rows = [('aa', 'bb', 'cc', 'dd')]
    sink = ReaderWriter()
    extractor = m.ExtractMap('b=bb,c=cc', 'a=id')
    extractor.process(source, sink)
    map_sink = ReaderWriter()
    extractor.map.write(map_sink)
    # all fields renamed, id comes first - for sorting?
    self.assertEqual([('a', 'b', 'c')], map_sink.rows)
def test(self):
    """RemoveFields drops the named columns from the header and all rows."""
    source = ReaderWriter()
    source.rows = [
        ('aa', 'bb', 'cc', 'dd'),
        (sentinel.aa1, sentinel.bb1, sentinel.cc1, sentinel.dd1),
        (sentinel.aa2, sentinel.bb2, sentinel.cc2, sentinel.dd2),
    ]
    sink = ReaderWriter()
    m.RemoveFields(['cc', 'aa']).process(source, sink)
    expected = [
        ('bb', 'dd'),
        (sentinel.bb1, sentinel.dd1),
        (sentinel.bb2, sentinel.dd2),
    ]
    self.assertEqual(expected, sink.rows)
def test_new_map_can_be_used(self):
    """A freshly created Mapper assigns ids from 1 and appends the new row."""
    sink = ReaderWriter()
    mapper = m.Mapper.new('id', ['a', 'b'], appender=sink)
    self.assertEqual(1, mapper.map(('aa', 'bb')))
    self.assertEqual(2, len(sink.rows))
    self.assertListEqual([1, 'aa', 'bb'], sink.rows[1])
def test_output_values(self):
    """Repeated extracted values receive the same id in the output rows."""
    source = ReaderWriter()
    source.rows = [
        ('aa', 'bb', 'cc', 'dd'),
        (sentinel.aa1, sentinel.bb1, sentinel.cc1, sentinel.dd1),
        (sentinel.aa2, sentinel.bb2, sentinel.cc2, sentinel.dd2),
        (sentinel.aa3, sentinel.bb1, sentinel.cc1, sentinel.dd3),
    ]
    sink = ReaderWriter()
    m.ExtractMap('b=bb,c=cc', 'a=id').process(source, sink)
    expected = [
        (sentinel.aa1, sentinel.dd1, 0),
        (sentinel.aa2, sentinel.dd2, 1),
        (sentinel.aa3, sentinel.dd3, 0),
    ]
    self.assertEqual(sorted(expected), sorted(sink.rows[1:]))
def test_id_field_is_not_in_output(self):
    """By default csvzip drops the shared 'id' column from the output."""
    left = csv_reader('a,b,id')
    right = csv_reader('c,d,id')
    sink = ReaderWriter()
    m.csvzip(left, right, sink)
    self.assertNotIn('id', sink.rows[0])
def test(self):
    """Removing fields strips them from every row, header included."""
    in_rows = ReaderWriter()
    in_rows.rows = [
        ('aa', 'bb', 'cc', 'dd'),
        (sentinel.aa1, sentinel.bb1, sentinel.cc1, sentinel.dd1),
        (sentinel.aa2, sentinel.bb2, sentinel.cc2, sentinel.dd2),
    ]
    out_rows = ReaderWriter()
    m.RemoveFields(['cc', 'aa']).process(in_rows, out_rows)
    self.assertEqual(
        [('bb', 'dd'),
         (sentinel.bb1, sentinel.dd1),
         (sentinel.bb2, sentinel.dd2)],
        out_rows.rows)
def test_map_new_value(self):
    """An unseen value gets the next free id and is appended to storage."""
    sink = ReaderWriter()
    mapper = m.Mapper('id', ['a', 'b'], self.map_reader(), sink)
    self.assertEqual(2, mapper.map(('aaa', 'bbb')))
    self.assertEqual(1, len(sink.rows))
    self.assertListEqual([2, 'aaa', 'bbb'], sink.rows[0])
def test_keep_fields(self):
    """keep_fields=True leaves the original columns in place, id appended."""
    source = ReaderWriter()
    source.rows = [
        ('aa', 'bb', 'cc', 'dd'),
        (sentinel.aa1, sentinel.bb1, sentinel.cc1, sentinel.dd1),
        (sentinel.aa2, sentinel.bb2, sentinel.cc2, sentinel.dd2),
        (sentinel.aa3, sentinel.bb1, sentinel.cc1, sentinel.dd3),
    ]
    sink = ReaderWriter()
    extractor = m.ExtractMap('b=bb,c=cc', 'a=id', keep_fields=True)
    extractor.process(source, sink)
    expected = [
        ('aa', 'bb', 'cc', 'dd', 'id'),
        (sentinel.aa1, sentinel.bb1, sentinel.cc1, sentinel.dd1, 0),
        (sentinel.aa2, sentinel.bb2, sentinel.cc2, sentinel.dd2, 1),
        (sentinel.aa3, sentinel.bb1, sentinel.cc1, sentinel.dd3, 0),
    ]
    self.assertEqual(expected, sink.rows)
def test_map_with_different_field_order_read_in_properly(self):
    """Stored column order does not matter when loading an existing map."""
    stored = csv_reader('b,id,a\nb,1,a\n')
    sink = ReaderWriter()
    mapper = m.Mapper('id', ['a', 'b'], stored, sink)
    self.assertEqual(1, mapper.map(('a', 'b')))
def test_map_content(self):
    """The persisted map pairs each id with its extracted field values."""
    source = ReaderWriter()
    source.rows = [
        ('aa', 'bb', 'cc', 'dd'),
        (sentinel.aa1, sentinel.bb1, sentinel.cc1, sentinel.dd1),
        (sentinel.aa2, sentinel.bb2, sentinel.cc2, sentinel.dd2),
        (sentinel.aa3, sentinel.bb1, sentinel.cc1, sentinel.dd3),
    ]
    sink = ReaderWriter()
    extractor = m.ExtractMap('b=bb,c=cc', 'a=id')
    extractor.process(source, sink)
    map_sink = ReaderWriter()
    extractor.map.write(map_sink)
    # all fields renamed, id comes first - for sorting?
    expected = [
        (0, sentinel.bb1, sentinel.cc1),
        (1, sentinel.bb2, sentinel.cc2),
    ]
    self.assertEqual(sorted(expected), sorted(map_sink.rows[1:]))
def test_unsorted_map_with_gaps_works_correctly(self):
    """The next id is one past the maximum stored id, regardless of order."""
    stored = csv_reader('id,a,b\n5,aaa,bbb\n1,aa,bb\n')
    sink = ReaderWriter()
    mapper = m.Mapper('id', ['a', 'b'], stored, sink)
    self.assertEqual(6, mapper.map(('a3', 'b3')))
def test_map_with_different_field_order_is_written_properly(self):
    """New rows are appended using the column order found in storage."""
    stored = csv_reader('b,id,a\nb,1,a\n')
    sink = ReaderWriter()
    mapper = m.Mapper('id', ['a', 'b'], stored, sink)
    self.assertEqual(2, mapper.map(('aa', 'bb')))
    self.assertListEqual([['bb', 2, 'aa']], sink.rows)
def test_out_spec(self):
    """The spec output carries the zip id plus the selected fields."""
    source = ReaderWriter()
    for line in ('a b c', 'a1 b1 c1', 'a2 b2 c2'):
        source.writerow(line.split())
    out_spec, out_unspec = ReaderWriter(), ReaderWriter()
    m.unzip(source, ['a'], out_spec, out_unspec)
    self.assertListEqual(
        [['id', 'a'], ['0', 'a1'], ['1', 'a2']],
        out_spec.rows)
def test_output_file_contains_rows_from_input(self):
    """Chunk 0 carries the header and the input data rows as CSV text."""
    rw = ReaderWriter()
    for row in (u'a b'.split(), [1, 2], [3, 4]):
        rw.writerow(row)
    m.split(rw, prefix='split.', chunk_size=2)
    with codecs.open('split.0', encoding='utf8') as f:
        read_back = list(csv.reader(f))
    self.assertEqual(
        [[u'a', u'b'], [u'1', u'2'], [u'3', u'4']], read_back)
def test_mismatch_in_id_values_raises_error(self):
    """Diverging id values abort the zip; rows written so far are kept."""
    left = csv_reader('a,b,id\na,b,1\naa,bb,2')
    right = csv_reader('c,d,id\nc,d,1\ncc,dd,3')
    sink = ReaderWriter()
    with self.assertRaises(m.IdMismatch):
        m.csvzip(left, right, sink)
    self.assertEqual(2, len(sink.rows))
    self.assertEqual(['a', 'b', 'c', 'd'], sink.rows[0])
    self.assertEqual(['a', 'b', 'c', 'd'], sink.rows[1])
def test_normal_case(self):
    """Rows with matching ids are joined and the id column is dropped."""
    left = csv_reader('a,b,id\na,b,1\naa,bb,2')
    right = csv_reader('c,d,id\nc,d,1\ncc,dd,2')
    sink = ReaderWriter()
    m.csvzip(left, right, sink)
    self.assertEqual(
        [['a', 'b', 'c', 'd'],
         ['a', 'b', 'c', 'd'],
         ['aa', 'bb', 'cc', 'dd']],
        sink.rows)
def test_keep_id_id_field_is_in_output(self):
    """keep_id=True puts the shared id column first in every output row."""
    left = csv_reader('a,b,id\na,b,1\naa,bb,2')
    right = csv_reader('c,d,id\nc,d,1\ncc,dd,2')
    sink = ReaderWriter()
    m.csvzip(left, right, sink, keep_id=True)
    self.assertEqual(
        [['id', 'a', 'b', 'c', 'd'],
         ['1', 'a', 'b', 'c', 'd'],
         ['2', 'aa', 'bb', 'cc', 'dd']],
        sink.rows)
def csv_header_a_b_c(self):
    """Return a ReaderWriter primed with the header row ['a', 'b', 'c']."""
    source = ReaderWriter()
    source.writerow(['a', 'b', 'c'])
    return source
def test_valuess_not_unique_dies(self):
    """Reading a map whose value columns repeat must raise.

    Fixes: local ``map`` shadowed the builtin (renamed), and the
    ``assertRaises(..., lambda: ...)`` form is replaced by the clearer
    context-manager form. NOTE(review): 'valuess' in the name looks like a
    typo for 'values'; kept unchanged to preserve the public test name.
    """
    reader = ReaderWriter()
    reader.rows = [self.header, (1, 1, 1), (2, 1, 1)]
    value_map = make_map('aa,bb', 'id')
    with self.assertRaises(Exception):
        value_map.read(reader)
def test_two_common_fields_zip_raises_error(self):
    """Inputs that share more than the id column are rejected."""
    left = csv_reader('a,b')
    right = csv_reader('a,b')
    sink = ReaderWriter()
    with self.assertRaises(m.BadInput):
        m.csvzip(left, right, sink)
def test_missing_value_field(self):
    """Reading a map whose header lacks a declared value field must raise.

    Fixes: local ``map`` shadowed the builtin (renamed), and the
    ``assertRaises(..., lambda: ...)`` form is replaced by the clearer
    context-manager form.
    """
    reader = ReaderWriter()
    reader.rows = [('id', 'bb')]
    value_map = make_map('aa,bb', 'id')
    with self.assertRaises(Exception):
        value_map.read(reader)
def test_calls_bind_before_transform(self):
    """Smoke test: processing a row with BindCheckerTransformer succeeds.

    NOTE(review): presumably the checker raises internally if transform()
    runs before bind() — behavior lives in BindCheckerTransformer itself.
    """
    source = ReaderWriter()
    source.rows = [('a', 'b'), (1, 2)]
    sink = ReaderWriter()
    BindCheckerTransformer().process(source, sink)
def test_new_creates_header(self):
    """Mapper.new writes a single header row: the id field, then the fields."""
    sink = ReaderWriter()
    m.Mapper.new('id', ['a', 'b'], appender=sink)
    self.assertEqual(1, len(sink.rows))
    self.assertListEqual(['id', 'a', 'b'], sink.rows[0])