def test_row_intuit(self):
    """Check the row intuiter's guesses for sources with non-standard headers.

    Every source listed in 'sources-non-std-headers.csv' is loaded and fully
    read, a RowIntuiter is run over its rows, and the guessed header lines and
    start line are compared with the ``expect_headers`` and ``expect_start``
    values of the source spec.
    """
    from ambry_sources.intuit import RowIntuiter

    cache_fs = fsopendir('temp://')

    sources = self.load_sources('sources-non-std-headers.csv')

    for source_name, spec in sources.items():
        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        rows = list(source)
        n_rows = len(rows)

        # the files are short, so the head and tail overlap
        head_rows = rows[:int(n_rows * .75)]
        tail_rows = rows[int(n_rows * .25):]

        ri = RowIntuiter(debug=False).run(head_rows, tail_rows, n_rows)

        # A single pre-formatted argument prints the same text whether
        # ``print`` is a statement (Python 2) or a function (Python 3).
        print('{} {} {}'.format(source_name, ri.start_line, ri.header_lines))

        self.assertEqual(
            spec.expect_headers,
            ','.join(str(e) for e in ri.header_lines),
            'Headers of {} source does not match to row intuiter'.format(spec.name))

        self.assertEqual(
            spec.expect_start,
            ri.start_line,
            'Start line of {} source does not match to row intuiter start line.'
            .format(spec.name))
def run_row_intuiter(self):
    """Intuit the row structure of the file and hand the result to the writer.

    Up to ``RowIntuiter.N_TEST_ROWS`` raw rows are taken from the start of the
    reader and from its end, a ``RowIntuiter`` is run over both samples, and
    the result is passed to the writer's ``set_row_spec``. Nothing is written
    when the reader reports zero rows. ``self._process`` is 'intuit_rows'
    while this runs and is always reset to 'none' on the way out.
    """
    from itertools import islice

    from .intuit import RowIntuiter

    try:
        self._process = 'intuit_rows'
        self._start_time = time.time()

        sample_size = RowIntuiter.N_TEST_ROWS

        with self.reader as reader:
            if reader.n_rows == 0:
                return

            head_rows = list(islice(reader.raw, sample_size))
            total_rows = reader.n_rows

        # Re-enter the reader to reset the iterator before reading the tail.
        with self.reader as reader:
            # Clamp at zero so a short file yields all of its rows as the tail.
            tail_start = max(reader.n_rows - sample_size, 0)
            tail_rows = list(islice(reader.raw, tail_start, reader.n_rows))

        intuiter = RowIntuiter().run(head_rows, tail_rows, total_rows)

        with self.writer as writer:
            writer.set_row_spec(intuiter)

    finally:
        self._process = 'none'
def test_generator(self):
    """Load a generator-backed source into an HDFPartition and read it back.

    The generator yields one header row ('a'..'e') followed by ten rows of
    five consecutive integers. The test checks the stored headers, the number
    of selected rows, and the first and last rows.
    """
    from ambry_sources import head, tail
    from ambry_sources.sources import GeneratorSource, SourceSpec

    cache_fs = fsopendir(self.setup_temp_dir())

    def gen():
        yield list('abcde')
        for first in range(10):
            yield [first + offset for offset in range(5)]

    def new_source():
        # Every call wraps a brand new generator.
        return GeneratorSource(SourceSpec('foobar'), gen())

    partition = HDFPartition(cache_fs, 'foobar')

    source = new_source()
    intuiter = RowIntuiter().run(head(source, 100), tail(source, 100))
    row_spec = self._row_intuiter_to_dict(intuiter)
    type_intuiter = TypeIntuiter().process_header(intuiter.headers).run(new_source())

    with partition.writer as writer:
        writer.set_row_spec(row_spec, intuiter.headers)
        writer.set_types(type_intuiter)

    partition.load_rows(new_source())

    self.assertEqual(partition.headers, list('abcde'))

    selected = [row.dict for row in partition.select()]

    self.assertEqual(len(selected), 10)
    self.assertEqual(selected[0], {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4})
    self.assertEqual(selected[-1], {'a': 9, 'b': 10, 'c': 11, 'd': 12, 'e': 13})
def test_converts_tuples(self):
    """Header rows given as tuples are coalesced into one string per column."""
    header_rows = [
        ('Header-row0', ''),
        ('Header-row1', ''),
        ('Header-row2-1', 'Header-row2-2'),
    ]

    coalesced = RowIntuiter.coalesce_headers(header_rows)

    self.assertEqual(len(coalesced), 2)

    first_column = coalesced[0]
    self.assertEqual(first_column, 'Header-row0 Header-row1 Header-row2-1')

    second_column = coalesced[1]
    self.assertEqual(second_column, 'Header-row0 Header-row1 Header-row2-2')
def test_converts_tuples(self):
    """Coalescing three tuple header rows yields two combined column headers."""
    expected_headers = (
        'Header-row0 Header-row1 Header-row2-1',
        'Header-row0 Header-row1 Header-row2-2',
    )

    ret = RowIntuiter.coalesce_headers([
        ('Header-row0', ''),
        ('Header-row1', ''),
        ('Header-row2-1', 'Header-row2-2'),
    ])

    self.assertEqual(len(ret), 2)

    # Compare column by column, in order.
    for position, expected in enumerate(expected_headers):
        self.assertEqual(ret[position], expected)
def _get_headers(self, source, spec):
    """Return the coalesced headers named by ``spec.header_lines``.

    Rows are read from ``source`` up to and including the highest header
    line, the listed rows are picked out, and they are combined with
    ``RowIntuiter.coalesce_headers``. An empty list is returned when the spec
    lists no header lines.
    """
    wanted_lines = spec.header_lines
    if not wanted_lines:
        return []

    last_header_line = max(wanted_lines)
    leading_rows = list(islice(source, last_header_line + 1))

    picked = itemgetter(*wanted_lines)(leading_rows)

    # With a single index itemgetter returns the row itself rather than a
    # tuple of rows; wrap it so the coalescer always receives a list of rows.
    if not isinstance(picked[0], (list, tuple)):
        picked = [picked]

    if not picked:
        return []

    return list(RowIntuiter.coalesce_headers(picked))
def test_header_coalesce(self):
    """Four comma-separated header rows coalesce into seven column headers."""
    from ambry_sources.intuit import RowIntuiter

    raw_header_lines = [
        'a1,,a3,,a5,,a7',
        'b1,,b3,,b5,,b7',
        ',c2,,c4,,c6,',
        'd1,d2,d3,d4,d5,d6,d7'
    ]

    expected = [
        u'a1 b1 d1',
        u'a1 b1 c2 d2',
        u'a3 b3 c2 d3',
        u'a3 b3 c4 d4',
        u'a5 b5 c4 d5',
        u'a5 b5 c6 d6',
        u'a7 b7 c6 d7'
    ]

    # Split each line into its cells before handing the rows to the coalescer.
    header_rows = [line.split(',') for line in raw_header_lines]

    self.assertEqual(expected, RowIntuiter.coalesce_headers(header_rows))
def test_fixed(self):
    """Load the 'simple_fixed' source into an HDFPartition and check its headers."""
    from ambry_sources import head, tail

    cache_fs = fsopendir(self.setup_temp_dir())

    spec = self.sources['simple_fixed']
    assert spec.has_rowspec is False

    source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    # prepare HDFPartition.
    partition = HDFPartition(cache_fs, spec.name)

    intuiter = RowIntuiter().run(head(source, 100), tail(source, 100))
    row_spec = self._row_intuiter_to_dict(intuiter)
    type_intuiter = TypeIntuiter().process_header(intuiter.headers).run(source)

    with partition.writer as writer:
        writer.set_row_spec(row_spec, intuiter.headers)
        writer.set_types(type_intuiter)

    partition.load_rows(source)

    expected_headers = ['id', 'uuid', 'int', 'float']
    self.assertEqual(partition.headers, expected_headers)
def test_stats(self):
    """Check the column statistics saved when rows are loaded with run_stats=True.

    The 'simple_stats' source is loaded into an HDFPartition, then for every
    column the saved (stat_count, min, mean rounded to one decimal, max,
    nuniques) tuple is compared with the expected values.
    """
    from ambry_sources import head, tail

    cache_fs = fsopendir('temp://')

    source = get_source(self.sources['simple_stats'], cache_fs, callback=lambda x, y: (x, y))

    f = HDFPartition(cache_fs, source.spec.name)

    with f.writer as w:
        ri = RowIntuiter().run(head(source, 100), tail(source, 100))
        row_spec = self._row_intuiter_to_dict(ri)
        ti = TypeIntuiter().process_header(ri.headers).run(source)
        w.set_row_spec(row_spec, ri.headers)
        w.set_types(ti)

    f.load_rows(source, run_stats=True)

    # Per column: (stat_count, min, mean, max, nuniques)
    expected = {
        u('str_a'): (30, None, None, None, 10),
        u('str_b'): (30, None, None, None, 10),
        u('float_a'): (30, 1.0, 5.5, 10.0, 10),
        u('float_b'): (30, 1.1, 5.5, 9.9, 10),
        u('float_c'): (30, 1.1, 5.5, 9.9, 10),
        u('int_b'): (30, 1.0, 5.0, 9.0, 10),
        u('int_a'): (30, 1.0, 5.5, 10.0, 10)}

    with f.reader as r:
        for col in r.columns:
            stats = (
                col.stat_count,
                col.min,
                round(col.mean, 1) if col.mean else None,
                col.max,
                col.nuniques)

            for expected_value, saved_value in zip(expected[col.name], stats):
                # The message names the saved value first and the expected
                # value second, so the format arguments follow that order.
                self.assertEqual(
                    expected_value, saved_value,
                    'Saved stat ({}) does not match to expected ({}) for {}'.format(
                        saved_value, expected_value, col.name))
def test_header_coalesce(self):
    """Multi-row headers with blank cells are coalesced column by column."""
    from ambry_sources.intuit import RowIntuiter

    def split_cells(lines):
        """Turn comma-separated lines into lists of cells."""
        rows = []
        for line in lines:
            rows.append(line.split(','))
        return rows

    header_lines = [
        'a1,,a3,,a5,,a7',
        'b1,,b3,,b5,,b7',
        ',c2,,c4,,c6,',
        'd1,d2,d3,d4,d5,d6,d7'
    ]

    expected_headers = [
        u'a1 b1 d1', u'a1 b1 c2 d2', u'a3 b3 c2 d3', u'a3 b3 c4 d4',
        u'a5 b5 c4 d5', u'a5 b5 c6 d6', u'a7 b7 c6 d7']

    coalesced = RowIntuiter.coalesce_headers(split_cells(header_lines))

    self.assertEqual(expected_headers, coalesced)
def set_row_spec(self, ri_or_ss):
    """Set the row spec and schema from a RowIntuiter object or a SourceSpec.

    With a RowIntuiter, its start/end lines, header lines, comment lines and
    data pattern source are copied into the writer, and its headers become the
    column descriptions and (after ``self.header_mangler``) the headers.

    With any other object (treated as a SourceSpec), the start/end lines and
    header lines are copied from it. When it lists header lines, those raw
    rows are read from the reader and coalesced with
    ``RowIntuiter.coalesce_headers`` to produce descriptions and headers.

    :param ri_or_ss: a RowIntuiter instance, or an object with
        ``header_lines``, ``start_line`` and ``end_line`` attributes.
    """
    from itertools import islice
    from operator import itemgetter

    from ambry_sources.intuit import RowIntuiter

    def set_descriptions(w, descriptions):
        """Store each description on the matching column, with line breaks as spaces."""
        for c, d in zip(w.columns, descriptions):
            col = w.column(c.name)
            d = d.replace('\n', ' ').replace('\r', ' ')
            col.description = d

    if isinstance(ri_or_ss, RowIntuiter):
        ri = ri_or_ss

        with self.parent.writer as w:
            w.data_start_row = ri.start_line
            w.data_end_row = ri.end_line if ri.end_line else None

            w.meta['row_spec']['header_rows'] = ri.header_lines
            w.meta['row_spec']['comment_rows'] = ri.comment_lines
            w.meta['row_spec']['start_row'] = ri.start_line
            w.meta['row_spec']['end_row'] = ri.end_line
            w.meta['row_spec']['data_pattern'] = ri.data_pattern_source

            headers = list(ri.headers)
            set_descriptions(w, headers)
            w.headers = [self.header_mangler(h) for h in headers]

    else:
        ss = ri_or_ss

        with self.parent.reader as r:
            # If the header lines are specified, we need to also coalesce
            # them and set the header.
            if ss.header_lines:
                max_header_line = max(ss.header_lines)
                rows = list(islice(r.raw, max_header_line + 1))

                header_lines = itemgetter(*ss.header_lines)(rows)

                # With a single index itemgetter returns the row itself, not
                # a tuple of rows; wrap it so it is always a list of rows.
                if not isinstance(header_lines[0], (list, tuple)):
                    header_lines = [header_lines]
            else:
                header_lines = None

        with self.parent.writer as w:
            w.data_start_row = ss.start_line
            w.data_end_row = ss.end_line if ss.end_line else None

            w.meta['row_spec']['header_rows'] = ss.header_lines
            w.meta['row_spec']['comment_rows'] = None
            w.meta['row_spec']['start_row'] = ss.start_line
            w.meta['row_spec']['end_row'] = ss.end_line
            w.meta['row_spec']['data_pattern'] = None

            if header_lines:
                # Coalesce once and reuse the result for both the
                # descriptions and the mangled headers.
                headers = list(RowIntuiter.coalesce_headers(header_lines))
                set_descriptions(w, headers)
                w.headers = [self.header_mangler(h) for h in headers]

    # FIXME: Maybe later, look for the end line. The earlier sketch (disabled
    # behind ``if False:``) took the last 100 raw rows, not starting before
    # the data start row, and passed them to the intuiter's ``find_end``.
def set_row_spec(self, ri_or_ss):
    """Set the row spec and schema from a RowIntuiter object or a SourceSpec.

    For a RowIntuiter the row positions, comment lines, data pattern source
    and headers all come from the intuiter. For anything else (treated as a
    SourceSpec) the row positions come from the spec, comment rows and data
    pattern are stored as None, and headers are built by coalescing the raw
    rows named in ``header_lines`` (when there are any).

    :param ri_or_ss: a RowIntuiter instance, or an object with
        ``header_lines``, ``start_line`` and ``end_line`` attributes.
    """
    from itertools import islice
    from operator import itemgetter

    from ambry_sources.intuit import RowIntuiter

    def store_row_spec(w, start_line, end_line, header_rows, comment_rows, data_pattern):
        """Write the row positions and the row_spec metadata to the writer."""
        w.data_start_row = start_line
        w.data_end_row = end_line if end_line else None

        w.meta['row_spec']['header_rows'] = header_rows
        w.meta['row_spec']['comment_rows'] = comment_rows
        w.meta['row_spec']['start_row'] = start_line
        w.meta['row_spec']['end_row'] = end_line
        w.meta['row_spec']['data_pattern'] = data_pattern

    def store_headers(w, headers):
        """Use the headers as column descriptions, then set the mangled headers."""
        for column, description in zip(w.columns, headers):
            # Descriptions are kept on one line.
            w.column(column.name).description = (
                description.replace('\n', ' ').replace('\r', ' '))

        w.headers = [self.header_mangler(h) for h in headers]

    if isinstance(ri_or_ss, RowIntuiter):
        ri = ri_or_ss

        with self.parent.writer as w:
            store_row_spec(
                w, ri.start_line, ri.end_line, ri.header_lines,
                ri.comment_lines, ri.data_pattern_source)
            store_headers(w, list(ri.headers))

    else:
        ss = ri_or_ss

        header_lines = None

        with self.parent.reader as r:
            # If the header lines are specified, we need to also coalesce
            # them and set the header.
            if ss.header_lines:
                max_header_line = max(ss.header_lines)
                rows = list(islice(r.raw, max_header_line + 1))

                header_lines = itemgetter(*ss.header_lines)(rows)

                # With a single index itemgetter returns the row itself, not
                # a tuple of rows; wrap it so it is always a list of rows.
                if not isinstance(header_lines[0], (list, tuple)):
                    header_lines = [header_lines]

        with self.parent.writer as w:
            store_row_spec(w, ss.start_line, ss.end_line, ss.header_lines, None, None)

            if header_lines:
                # Coalesce once; the same list feeds descriptions and headers.
                store_headers(w, list(RowIntuiter.coalesce_headers(header_lines)))

    # FIXME: Maybe later, look for the end line. The earlier sketch (disabled
    # behind ``if False:``) took the last 100 raw rows, not starting before
    # the data start row, and passed them to the intuiter's ``find_end``.
def test_load_and_headers(self):
    """Load every source into an HDFPartition and check the stored headers.

    Sources that carry a row spec use it directly; for the others the row
    spec is intuited from the first and last 20 rows. Headers are compared
    only for the sources listed in ``source_headers``.
    """
    from ambry_sources import head, tail

    cache_fs = fsopendir('temp://')

    simple_headers = [u('id'), u('uuid'), u('int'), u('float')]
    rent_headers = [
        u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
        u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')]

    source_headers = {
        'mz_with_zip_xl': rent_headers,
        'mz_no_zip': simple_headers,
        'namesu8': [u('origin_english'), u('name_english'), u('origin_native'), u('name_native')],
        'sf_zip': simple_headers,
        'simple': simple_headers,
        'csv_no_csv': simple_headers,
        'mz_with_zip': simple_headers,
        'rpeople': [u('name'), u('size')],
        'rent07': rent_headers,
        'simple_fixed': simple_headers,
        'altname': [u('id'), u('foo'), u('bar'), u('baz')],
        'rentcsv': rent_headers,
        'renttab': rent_headers,
        'multiexcel': rent_headers,
        'rent97': rent_headers,
    }

    for spec in self.sources.values():
        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        partition = HDFPartition(cache_fs, spec.name)
        if partition.exists:
            partition.remove()

        # FIXME: This is really complicated setup for HDFPartition file. Try to simplify.
        with partition.writer as writer:
            if spec.has_rowspec:
                row_spec = self._spec_to_dict(spec)
                headers = self._get_headers(source, spec)
            else:
                intuiter = RowIntuiter().run(head(source, 20), tail(source, 20), writer.n_rows)
                row_spec = self._row_intuiter_to_dict(intuiter)
                headers = intuiter.headers

            # Both branches finish the same way once the headers are known.
            type_intuiter = TypeIntuiter().process_header(headers).run(source)
            writer.set_row_spec(row_spec, headers)
            writer.set_types(type_intuiter)

        partition.load_rows(source)

        with partition.reader as reader:
            if spec.name in source_headers:
                self.assertEqual(source_headers[spec.name], reader.headers)