예제 #1
0
 def test_converts_tuples(self):
     """Header rows given as tuples are coalesced into one header per column.

     The expected values show each column's header as the space-joined
     cells of all three rows, with the blank cells of the upper rows taking
     the value found to their left.
     """
     header_rows = [
         ('Header-row0', ''),
         ('Header-row1', ''),
         ('Header-row2-1', 'Header-row2-2'),
     ]
     expected = [
         'Header-row0 Header-row1 Header-row2-1',
         'Header-row0 Header-row1 Header-row2-2',
     ]

     coalesced = RowIntuiter.coalesce_headers(header_rows)

     self.assertEqual(len(coalesced), 2)
     # Compare column by column so a failure names the offending header.
     for position, wanted in enumerate(expected):
         self.assertEqual(coalesced[position], wanted)
예제 #2
0
 def test_converts_tuples(self):
     """Check that ``coalesce_headers`` accepts rows supplied as tuples.

     Three header rows over two columns must collapse into two headers,
     each one the space-joined text of that column across the rows.
     """
     top_row = ('Header-row0', '')
     middle_row = ('Header-row1', '')
     bottom_row = ('Header-row2-1', 'Header-row2-2')

     merged = RowIntuiter.coalesce_headers([top_row, middle_row, bottom_row])

     first_column = 'Header-row0 Header-row1 Header-row2-1'
     second_column = 'Header-row0 Header-row1 Header-row2-2'

     self.assertEqual(len(merged), 2)
     self.assertEqual(merged[0], first_column)
     self.assertEqual(merged[1], second_column)
예제 #3
0
    def _get_headers(self, source, spec):
        """Collect the header rows named by ``spec`` and return coalesced headers.

        Args:
            source: iterable of rows. It is consumed up to and including the
                row at the largest index in ``spec.header_lines``.
            spec: object with a ``header_lines`` attribute holding the
                zero-based positions of the header rows within ``source``.

        Returns:
            list: the headers produced by ``RowIntuiter.coalesce_headers`` for
            the selected rows, or an empty list when ``spec.header_lines`` is
            empty.

        Raises:
            IndexError: if ``source`` ends before the last header row.
        """
        if not spec.header_lines:
            return []

        # Read no further than the last header row.
        rows = list(islice(source, max(spec.header_lines) + 1))

        # Index the rows explicitly so the result is always a list of rows.
        # itemgetter() returns a bare row for one index and a tuple of rows for
        # several, and telling those apart by inspecting the first cell fails
        # for an empty header row or for rows that are not lists/tuples.
        header_rows = [rows[line_no] for line_no in spec.header_lines]

        return list(RowIntuiter.coalesce_headers(header_rows))
예제 #4
0
    def test_header_coalesce(self):
        """Coalesce four sparse header rows into seven column headers.

        Each input line is a comma-separated header row. The expected values
        show blank cells taking the value to their left in the same row, and
        each column's cells being joined top to bottom with spaces.
        """
        from ambry_sources.intuit import RowIntuiter

        raw_lines = [
            'a1,,a3,,a5,,a7',
            'b1,,b3,,b5,,b7',
            ',c2,,c4,,c6,',
            'd1,d2,d3,d4,d5,d6,d7',
        ]

        # Turn every comma-separated line into a list of cells.
        header_rows = [line.split(',') for line in raw_lines]

        expected = [
            u'a1 b1 d1',
            u'a1 b1 c2 d2',
            u'a3 b3 c2 d3',
            u'a3 b3 c4 d4',
            u'a5 b5 c4 d5',
            u'a5 b5 c6 d6',
            u'a7 b7 c6 d7',
        ]

        actual = RowIntuiter.coalesce_headers(header_rows)
        self.assertEqual(expected, actual)
예제 #5
0
    def test_header_coalesce(self):
        """Verify ``RowIntuiter.coalesce_headers`` on a four-row header block.

        The rows are written as comma-separated strings and split into cells
        before being coalesced; the result must equal the seven expected
        space-joined column headers.
        """
        from ambry_sources.intuit import RowIntuiter

        def to_cells(lines):
            # One list of cells per comma-separated line.
            cells = []
            for line in lines:
                cells.append(line.split(','))
            return cells

        header_text = ['a1,,a3,,a5,,a7', 'b1,,b3,,b5,,b7',
                       ',c2,,c4,,c6,', 'd1,d2,d3,d4,d5,d6,d7']

        wanted = [
            u'a1 b1 d1', u'a1 b1 c2 d2', u'a3 b3 c2 d3', u'a3 b3 c4 d4',
            u'a5 b5 c4 d5', u'a5 b5 c6 d6', u'a7 b7 c6 d7',
        ]

        self.assertEqual(wanted, RowIntuiter.coalesce_headers(to_cells(header_text)))
예제 #6
0
    def set_row_spec(self, ri_or_ss):
        """Set the row spec and schema from a RowIntuiter object or a SourceSpec.

        With a ``RowIntuiter`` the start/end rows, header rows, comment rows,
        data pattern and headers are all copied from it. With any other object
        (expected to be a SourceSpec) the start/end rows and header row numbers
        are copied from it, and the header rows themselves are read from
        ``self.parent.reader`` and coalesced into headers.

        Args:
            ri_or_ss: a ``RowIntuiter``, or an object exposing ``header_lines``,
                ``start_line`` and ``end_line``.

        Raises:
            IndexError: if the reader yields fewer rows than the largest index
                in ``header_lines`` requires.
        """

        from itertools import islice
        from ambry_sources.intuit import RowIntuiter

        def set_descriptions(w, descriptions):
            # Pair columns with descriptions positionally; zip() stops at the
            # shorter of the two.
            for c, d in zip(w.columns, descriptions):
                col = w.column(c.name)
                # Descriptions are stored on a single line.
                col.description = d.replace('\n', ' ').replace('\r', ' ')

        if isinstance(ri_or_ss, RowIntuiter):
            ri = ri_or_ss

            with self.parent.writer as w:

                w.data_start_row = ri.start_line
                w.data_end_row = ri.end_line or None

                row_spec = w.meta['row_spec']
                row_spec['header_rows'] = ri.header_lines
                row_spec['comment_rows'] = ri.comment_lines
                row_spec['start_row'] = ri.start_line
                row_spec['end_row'] = ri.end_line
                row_spec['data_pattern'] = ri.data_pattern_source

                # Read the headers once and use them for both the column
                # descriptions and the mangled header names.
                headers = list(ri.headers)
                set_descriptions(w, headers)
                w.headers = [self.header_mangler(h) for h in headers]

        else:
            ss = ri_or_ss

            with self.parent.reader as r:
                # If the header lines are specified, we also need to coalesce
                # them and set the header.
                if ss.header_lines:
                    # Read no further than the last header row.
                    rows = list(islice(r.raw, max(ss.header_lines) + 1))

                    # Index the rows explicitly so this is always a list of
                    # rows. itemgetter() returns a bare row for one index and a
                    # tuple of rows for several, and telling those apart by
                    # inspecting the first cell fails for an empty header row
                    # or for rows that are not lists/tuples.
                    header_lines = [rows[i] for i in ss.header_lines]
                else:
                    header_lines = None

            with self.parent.writer as w:

                w.data_start_row = ss.start_line
                w.data_end_row = ss.end_line or None

                row_spec = w.meta['row_spec']
                row_spec['header_rows'] = ss.header_lines
                row_spec['comment_rows'] = None
                row_spec['start_row'] = ss.start_line
                row_spec['end_row'] = ss.end_line
                row_spec['data_pattern'] = None

                if header_lines:
                    # Coalesce once; the same headers feed both outputs.
                    headers = list(RowIntuiter.coalesce_headers(header_lines))
                    set_descriptions(w, headers)
                    w.headers = [self.header_mangler(h) for h in headers]

        # Now, look for the end line.
        # NOTE(review): this branch is disabled and never runs. If it is ever
        # enabled, `ri` is only bound when a RowIntuiter was passed in.
        if False:
            # FIXME: Maybe later ...
            r = self.parent.reader
            # Look at the last 100 rows, but don't start before the start row.
            test_rows = 100
            start = max(r.data_start_row, r.data_end_row - test_rows)

            end_rows = list(islice(r.raw, start, None))

            ri.find_end(end_rows)
예제 #7
0
    def set_row_spec(self, ri_or_ss):
        """Set the row spec and schema from a RowIntuiter object or a SourceSpec.

        A ``RowIntuiter`` supplies everything directly: start/end rows, header
        and comment row numbers, the data pattern and the headers. Any other
        argument is treated as a SourceSpec: its start/end rows and header row
        numbers are recorded, the comment rows and data pattern are cleared,
        and the headers are built by reading the header rows from
        ``self.parent.reader`` and coalescing them.

        Args:
            ri_or_ss: a ``RowIntuiter``, or an object exposing ``header_lines``,
                ``start_line`` and ``end_line``.

        Raises:
            IndexError: if the reader yields fewer rows than the largest index
                in ``header_lines`` requires.
        """

        from itertools import islice
        from ambry_sources.intuit import RowIntuiter

        def set_descriptions(w, descriptions):
            # Columns and descriptions are matched by position; zip() stops at
            # the shorter of the two.
            for c, d in zip(w.columns, descriptions):
                col = w.column(c.name)
                # Keep each description on one line.
                col.description = d.replace('\n', ' ').replace('\r', ' ')

        def apply_headers(w, headers):
            # The same header list drives both the column descriptions and the
            # mangled header names, so it is computed once by the caller.
            set_descriptions(w, headers)
            w.headers = [self.header_mangler(h) for h in headers]

        if isinstance(ri_or_ss, RowIntuiter):
            ri = ri_or_ss

            with self.parent.writer as w:

                w.data_start_row = ri.start_line
                w.data_end_row = ri.end_line or None

                w.meta['row_spec']['header_rows'] = ri.header_lines
                w.meta['row_spec']['comment_rows'] = ri.comment_lines
                w.meta['row_spec']['start_row'] = ri.start_line
                w.meta['row_spec']['end_row'] = ri.end_line
                w.meta['row_spec']['data_pattern'] = ri.data_pattern_source

                apply_headers(w, list(ri.headers))

        else:
            ss = ri_or_ss
            header_lines = None

            with self.parent.reader as r:
                # If the header lines are specified, we also need to coalesce
                # them and set the header.
                if ss.header_lines:
                    # Stop reading at the last header row.
                    rows = list(islice(r.raw, max(ss.header_lines) + 1))

                    # Explicit indexing always gives a list of rows. The
                    # itemgetter() form returns a bare row for one index and a
                    # tuple of rows for several; distinguishing the two by the
                    # type of the first cell fails for an empty header row or
                    # for rows that are not lists/tuples.
                    header_lines = [rows[i] for i in ss.header_lines]

            with self.parent.writer as w:

                w.data_start_row = ss.start_line
                w.data_end_row = ss.end_line or None

                w.meta['row_spec']['header_rows'] = ss.header_lines
                w.meta['row_spec']['comment_rows'] = None
                w.meta['row_spec']['start_row'] = ss.start_line
                w.meta['row_spec']['end_row'] = ss.end_line
                w.meta['row_spec']['data_pattern'] = None

                if header_lines:
                    apply_headers(w, list(RowIntuiter.coalesce_headers(header_lines)))

        # Now, look for the end line.
        # NOTE(review): this branch is disabled and never runs. If it is ever
        # enabled, `ri` is only bound when a RowIntuiter was passed in.
        if False:
            # FIXME: Maybe later ...
            r = self.parent.reader
            # Look at the last 100 rows, but don't start before the start row.
            test_rows = 100
            start = max(r.data_start_row, r.data_end_row - test_rows)

            end_rows = list(islice(r.raw, start, None))

            ri.find_end(end_rows)