def test_converts_tuples(self):
    # Header rows given as tuples: the blank second cell of the first two
    # rows is expected to pick up the value of the cell to its left, so both
    # columns share the 'Header-row0 Header-row1' prefix.
    header_rows = [
        ('Header-row0', ''),
        ('Header-row1', ''),
        ('Header-row2-1', 'Header-row2-2'),
    ]
    expected = (
        'Header-row0 Header-row1 Header-row2-1',
        'Header-row0 Header-row1 Header-row2-2',
    )

    coalesced = RowIntuiter.coalesce_headers(header_rows)

    # One coalesced header per column, checked position by position.
    self.assertEqual(len(coalesced), 2)
    for position, header in enumerate(expected):
        self.assertEqual(coalesced[position], header)
def test_converts_tuples(self):
    # Three header rows supplied as tuples; only the last row fills both
    # columns.
    top_row = ('Header-row0', '')
    middle_row = ('Header-row1', '')
    bottom_row = ('Header-row2-1', 'Header-row2-2')

    result = RowIntuiter.coalesce_headers([top_row, middle_row, bottom_row])

    # Exactly two column headers come back, each joined across the rows.
    self.assertEqual(len(result), 2)
    self.assertEqual(result[0], 'Header-row0 Header-row1 Header-row2-1')
    self.assertEqual(result[1], 'Header-row0 Header-row1 Header-row2-2')
def _get_headers(self, source, spec):
    """Collect the coalesced headers described by a source spec.

    Args:
        source: iterable of rows. Only the first
            ``max(spec.header_lines) + 1`` rows are consumed.
        spec: object with a ``header_lines`` attribute holding the indexes
            of the header rows within ``source``.

    Returns:
        list: the headers produced by ``RowIntuiter.coalesce_headers`` for
        the selected rows, or an empty list when the spec names no header
        rows.

    Raises:
        IndexError: if ``source`` has fewer rows than the highest header
            line index requires.
    """
    if not spec.header_lines:
        return []

    # Read just far enough to reach the last header row.
    last_header_line = max(spec.header_lines)
    rows = list(islice(source, last_header_line + 1))

    # Index the rows directly so the result is always a list of rows.
    # itemgetter() hands back a bare row for a single index and a tuple of
    # rows for several, which required sniffing the type of the first cell
    # and raised IndexError when the only header row was empty.
    header_rows = [rows[index] for index in spec.header_lines]

    return list(RowIntuiter.coalesce_headers(header_rows))
def test_header_coalesce(self):
    from ambry_sources.intuit import RowIntuiter

    # Four comma-separated header rows; consecutive commas are blank cells.
    raw_rows = [
        'a1,,a3,,a5,,a7',
        'b1,,b3,,b5,,b7',
        ',c2,,c4,,c6,',
        'd1,d2,d3,d4,d5,d6,d7',
    ]
    header_rows = [line.split(',') for line in raw_rows]

    # One coalesced header per column (seven in total).
    expected = [
        u'a1 b1 d1',
        u'a1 b1 c2 d2',
        u'a3 b3 c2 d3',
        u'a3 b3 c4 d4',
        u'a5 b5 c4 d5',
        u'a5 b5 c6 d6',
        u'a7 b7 c6 d7',
    ]

    self.assertEqual(expected, RowIntuiter.coalesce_headers(header_rows))
def test_header_coalesce(self):
    from ambry_sources.intuit import RowIntuiter

    def split_cells(lines):
        # Turn each comma-separated line into a list of cell strings.
        return [line.split(',') for line in lines]

    # Header rows with gaps (empty cells) in the first three lines.
    lines = [
        'a1,,a3,,a5,,a7',
        'b1,,b3,,b5,,b7',
        ',c2,,c4,,c6,',
        'd1,d2,d3,d4,d5,d6,d7',
    ]

    coalesced = RowIntuiter.coalesce_headers(split_cells(lines))

    # Expected value goes first, exactly as the comparison was written before.
    self.assertEqual(
        [
            u'a1 b1 d1',
            u'a1 b1 c2 d2',
            u'a3 b3 c2 d3',
            u'a3 b3 c4 d4',
            u'a5 b5 c4 d5',
            u'a5 b5 c6 d6',
            u'a7 b7 c6 d7',
        ],
        coalesced,
    )
def set_row_spec(self, ri_or_ss):
    """Set the row spec and schema from a RowIntuiter object or a SourceSpec.

    Writes the data start/end rows, the ``row_spec`` meta entries, the column
    descriptions and the mangled headers through ``self.parent.writer``.

    Args:
        ri_or_ss: either a ``RowIntuiter``, whose line numbers, data pattern
            and headers are copied over, or any other object, which is
            treated as a source spec providing ``header_lines``,
            ``start_line`` and ``end_line``. For a spec, the header rows are
            read from ``self.parent.reader`` and coalesced into headers.

    Raises:
        IndexError: in the spec case, if the reader yields fewer raw rows
            than the highest header line index requires.
    """
    from itertools import islice

    from ambry_sources.intuit import RowIntuiter

    def set_descriptions(w, descriptions):
        # Pair columns with descriptions positionally; zip() stops at the
        # shorter of the two. Line breaks are flattened to spaces.
        for c, d in zip(w.columns, descriptions):
            col = w.column(c.name)
            col.description = d.replace('\n', ' ').replace('\r', ' ')

    if isinstance(ri_or_ss, RowIntuiter):
        ri = ri_or_ss

        with self.parent.writer as w:
            w.data_start_row = ri.start_line
            w.data_end_row = ri.end_line if ri.end_line else None

            w.meta['row_spec']['header_rows'] = ri.header_lines
            w.meta['row_spec']['comment_rows'] = ri.comment_lines
            w.meta['row_spec']['start_row'] = ri.start_line
            w.meta['row_spec']['end_row'] = ri.end_line
            w.meta['row_spec']['data_pattern'] = ri.data_pattern_source

            set_descriptions(w, list(ri.headers))

            w.headers = [self.header_mangler(h) for h in ri.headers]

    else:
        ss = ri_or_ss

        with self.parent.reader as r:
            # If the header lines are specified, we need to also coalesce
            # them and set the header.
            if ss.header_lines:
                max_header_line = max(ss.header_lines)
                rows = list(islice(r.raw, max_header_line + 1))

                # Index the rows directly so this is always a list of rows.
                # itemgetter() returns a bare row for one index and a tuple
                # of rows for several, which required sniffing the type of
                # the first cell and failed on an empty header row.
                header_lines = [rows[i] for i in ss.header_lines]
            else:
                header_lines = None

        with self.parent.writer as w:
            w.data_start_row = ss.start_line
            w.data_end_row = ss.end_line if ss.end_line else None

            w.meta['row_spec']['header_rows'] = ss.header_lines
            w.meta['row_spec']['comment_rows'] = None
            w.meta['row_spec']['start_row'] = ss.start_line
            w.meta['row_spec']['end_row'] = ss.end_line
            w.meta['row_spec']['data_pattern'] = None

            if header_lines:
                # Coalesce once and reuse the result for both the
                # descriptions and the mangled headers.
                headers = list(RowIntuiter.coalesce_headers(header_lines))
                set_descriptions(w, headers)
                w.headers = [self.header_mangler(h) for h in headers]

    # Now, look for the end line.
    # Deliberately disabled placeholder: this branch never runs. Note that
    # ``ri`` is only bound when a RowIntuiter was passed in.
    if False:
        # FIXME: Maybe later ...
        r = self.parent.reader
        # Look at the last 100 rows, but don't start before the start row.
        test_rows = 100
        start = max(r.data_start_row, r.data_end_row - test_rows)

        end_rows = list(islice(r.raw, start, None))

        ri.find_end(end_rows)
def set_row_spec(self, ri_or_ss):
    """Set the row spec and schema from a RowIntuiter object or a SourceSpec.

    Writes the data start/end rows, the ``row_spec`` meta entries, the column
    descriptions and the mangled headers through ``self.parent.writer``.

    Args:
        ri_or_ss: either a ``RowIntuiter``, whose line numbers, data pattern
            and headers are copied over, or any other object, which is
            treated as a source spec providing ``header_lines``,
            ``start_line`` and ``end_line``. For a spec, the header rows are
            read from ``self.parent.reader`` and coalesced into headers.

    Raises:
        IndexError: in the spec case, if the reader yields fewer raw rows
            than the highest header line index requires.
    """
    from itertools import islice

    from ambry_sources.intuit import RowIntuiter

    def set_descriptions(w, descriptions):
        # Pair columns with descriptions positionally; zip() stops at the
        # shorter of the two. Line breaks are flattened to spaces.
        for c, d in zip(w.columns, descriptions):
            col = w.column(c.name)
            col.description = d.replace('\n', ' ').replace('\r', ' ')

    if isinstance(ri_or_ss, RowIntuiter):
        ri = ri_or_ss

        with self.parent.writer as w:
            w.data_start_row = ri.start_line
            w.data_end_row = ri.end_line if ri.end_line else None

            w.meta['row_spec']['header_rows'] = ri.header_lines
            w.meta['row_spec']['comment_rows'] = ri.comment_lines
            w.meta['row_spec']['start_row'] = ri.start_line
            w.meta['row_spec']['end_row'] = ri.end_line
            w.meta['row_spec']['data_pattern'] = ri.data_pattern_source

            set_descriptions(w, list(ri.headers))

            w.headers = [self.header_mangler(h) for h in ri.headers]

    else:
        ss = ri_or_ss

        with self.parent.reader as r:
            # If the header lines are specified, we need to also coalesce
            # them and set the header.
            if ss.header_lines:
                max_header_line = max(ss.header_lines)
                rows = list(islice(r.raw, max_header_line + 1))

                # Index the rows directly so this is always a list of rows.
                # itemgetter() returns a bare row for one index and a tuple
                # of rows for several, which required sniffing the type of
                # the first cell and failed on an empty header row.
                header_lines = [rows[i] for i in ss.header_lines]
            else:
                header_lines = None

        with self.parent.writer as w:
            w.data_start_row = ss.start_line
            w.data_end_row = ss.end_line if ss.end_line else None

            w.meta['row_spec']['header_rows'] = ss.header_lines
            w.meta['row_spec']['comment_rows'] = None
            w.meta['row_spec']['start_row'] = ss.start_line
            w.meta['row_spec']['end_row'] = ss.end_line
            w.meta['row_spec']['data_pattern'] = None

            if header_lines:
                # Coalesce once and reuse the result for both the
                # descriptions and the mangled headers.
                headers = list(RowIntuiter.coalesce_headers(header_lines))
                set_descriptions(w, headers)
                w.headers = [self.header_mangler(h) for h in headers]

    # Now, look for the end line.
    # Deliberately disabled placeholder: this branch never runs. Note that
    # ``ri`` is only bound when a RowIntuiter was passed in.
    if False:
        # FIXME: Maybe later ...
        r = self.parent.reader
        # Look at the last 100 rows, but don't start before the start row.
        test_rows = 100
        start = max(r.data_start_row, r.data_end_row - test_rows)

        end_rows = list(islice(r.raw, start, None))

        ri.find_end(end_rows)