Example #1
    def test_row_intuit(self):
        """Check that the sources can be loaded and analyzed without exceptions and that the
        guesses for headers and start are as expected"""

        from ambry_sources.intuit import RowIntuiter

        cache_fs = fsopendir('temp://')
        # cache_fs = fsopendir('/tmp/ritest/')

        sources = self.load_sources('sources-non-std-headers.csv')

        for source_name, spec in sources.items():

            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            rows = list(s)
            n = len(rows)

            # The files are short, so the head and tail overlap.
            ri = RowIntuiter(debug=False).run(rows[:int(n * .75)],
                                              rows[int(n * .25):], n)

            print(source_name, ri.start_line, ri.header_lines)

            self.assertEqual(
                spec.expect_headers, ','.join(str(e) for e in ri.header_lines),
                'Header lines of {} source do not match the row intuiter guess.'.format(
                    spec.name))

            self.assertEqual(
                spec.expect_start, ri.start_line,
                'Start line of {} source does not match the row intuiter start line.'
                .format(spec.name))
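
The same head/tail split can be exercised directly on an in-memory list of rows. The sketch below is not part of the test suite; the rows are synthetic, and the guessed positions are only printed because they depend on RowIntuiter's heuristics.

    # Minimal sketch (illustrative only): intuit rows from a synthetic table.
    from ambry_sources.intuit import RowIntuiter

    rows = [['Widget inventory', ''],      # comment/title line
            ['id', 'name']]                # header line
    rows += [[i, 'name-{}'.format(i)] for i in range(50)]  # data lines

    n = len(rows)
    # Overlapping head and tail, as in the test above.
    ri = RowIntuiter().run(rows[:int(n * .75)], rows[int(n * .25):], n)
    print(ri.start_line, ri.header_lines, ri.headers)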
Example #2
    def run_row_intuiter(self):
        """Run the row intuiter and store the results back into the metadata"""
        from .intuit import RowIntuiter
        from itertools import islice

        try:
            self._process = 'intuit_rows'
            self._start_time = time.time()

            with self.reader as r:
                if r.n_rows == 0:
                    return

                head = list(islice(r.raw, RowIntuiter.N_TEST_ROWS))
                n_rows = r.n_rows

            with self.reader as r:
                # Reset the iterator to get the tail
                if RowIntuiter.N_TEST_ROWS < r.n_rows:
                    tail = list(
                        islice(r.raw, r.n_rows - RowIntuiter.N_TEST_ROWS,
                               r.n_rows))
                else:
                    tail = list(islice(r.raw, 0, r.n_rows))

            ri = RowIntuiter().run(head, tail, n_rows)

            with self.writer as w:
                w.set_row_spec(ri)

        finally:
            self._process = 'none'
Example #3
    def test_generator(self):
        from ambry_sources.sources import GeneratorSource, SourceSpec
        from ambry_sources import head, tail
        cache_fs = fsopendir(self.setup_temp_dir())

        def gen():

            yield list('abcde')

            for i in range(10):
                yield [i, i + 1, i + 2, i + 3, i + 4]

        f = HDFPartition(cache_fs, 'foobar')

        s = GeneratorSource(SourceSpec('foobar'), gen())

        ri = RowIntuiter().run(head(s, 100), tail(s, 100))
        row_spec = self._row_intuiter_to_dict(ri)
        ti = TypeIntuiter().process_header(ri.headers).run(GeneratorSource(SourceSpec('foobar'), gen()))
        with f.writer as w:
            w.set_row_spec(row_spec, ri.headers)
            w.set_types(ti)

        f.load_rows(GeneratorSource(SourceSpec('foobar'), gen()))

        self.assertEqual(f.headers, list('abcde'))
        rows = []

        for row in f.select():
            rows.append(row.dict)
        self.assertEqual(len(rows), 10)
        self.assertEqual(rows[0], {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4})
        self.assertEqual(rows[-1], {'a': 9, 'b': 10, 'c': 11, 'd': 12, 'e': 13})
Example #4
    def test_converts_tuples(self):
        ret = RowIntuiter.coalesce_headers([('Header-row0', ''),
                                            ('Header-row1', ''),
                                            ('Header-row2-1', 'Header-row2-2')])
        self.assertEqual(len(ret), 2)
        self.assertEqual(ret[0], 'Header-row0 Header-row1 Header-row2-1')
        self.assertEqual(ret[1], 'Header-row0 Header-row1 Header-row2-2')
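
Judging from the expected strings above (and from the larger coalesce test below), coalesce_headers appears to fill an empty header cell from the nearest non-empty cell to its left in the same row, then join each column's cells top to bottom with spaces, dropping cells that remain empty. A tiny sketch under that assumption; the input rows are made up:

    # Inferred behavior, not taken from library documentation.
    from ambry_sources.intuit import RowIntuiter

    merged = RowIntuiter.coalesce_headers([['Region', '', ''],
                                           ['id', 'name', 'size']])
    # If the inference holds: ['Region id', 'Region name', 'Region size']
    print(merged)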
Example #5
    def _get_headers(self, source, spec):
        """ Collects headers from spec and returns them. """
        if spec.header_lines:
            max_header_line = max(spec.header_lines)
            rows = list(islice(source, max_header_line + 1))
            header_lines = itemgetter(*spec.header_lines)(rows)
            if not isinstance(header_lines[0], (list, tuple)):
                header_lines = [header_lines]
        else:
            header_lines = None

        if header_lines:
            return [h for h in RowIntuiter.coalesce_headers(header_lines)]
        return []
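
The itemgetter call above is what pulls the spec'd header rows out of the sliced rows. Note that with a single index itemgetter returns the row itself rather than a tuple of rows, which is why the isinstance check re-wraps the result. A small sketch with made-up rows and header line numbers:

    # Illustrative only; the rows and header line numbers are not test fixtures.
    from operator import itemgetter

    rows = [['Survey', ''], ['id', 'name'], [1, 'a'], [2, 'b']]

    two = itemgetter(*[0, 1])(rows)   # -> (['Survey', ''], ['id', 'name'])
    one = itemgetter(*[0])(rows)      # -> ['Survey', ''] (a single row, not a tuple)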
Example #6
    def test_header_coalesce(self):
        from ambry_sources.intuit import RowIntuiter

        def csplit(h):
            return [r.split(',') for r in h]

        h = [
            'a1,,a3,,a5,,a7', 'b1,,b3,,b5,,b7', ',c2,,c4,,c6,',
            'd1,d2,d3,d4,d5,d6,d7'
        ]

        hc = [
            u'a1 b1 d1', u'a1 b1 c2 d2', u'a3 b3 c2 d3', u'a3 b3 c4 d4',
            u'a5 b5 c4 d5', u'a5 b5 c6 d6', u'a7 b7 c6 d7'
        ]

        self.assertEqual(hc, RowIntuiter.coalesce_headers(csplit(h)))
Example #7
    def test_fixed(self):
        from ambry_sources import head, tail
        cache_fs = fsopendir(self.setup_temp_dir())
        spec = self.sources['simple_fixed']
        assert spec.has_rowspec is False
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        # prepare HDFPartition.
        f = HDFPartition(cache_fs, spec.name)

        ri = RowIntuiter().run(head(s, 100), tail(s, 100))
        row_spec = self._row_intuiter_to_dict(ri)
        ti = TypeIntuiter().process_header(ri.headers).run(s)
        with f.writer as w:
            w.set_row_spec(row_spec, ri.headers)
            w.set_types(ti)
        f.load_rows(s)
        self.assertEqual(f.headers, ['id', 'uuid', 'int', 'float'])
Example #8
    def test_stats(self):
        """Check that the sources can be loaded and analyzed without exceptions and that the
        guesses for headers and start are as expected"""
        from ambry_sources import head, tail

        cache_fs = fsopendir('temp://')

        source = get_source(self.sources['simple_stats'], cache_fs, callback=lambda x, y: (x, y))

        f = HDFPartition(cache_fs, source.spec.name)

        with f.writer as w:
            ri = RowIntuiter().run(head(source, 100), tail(source, 100))
            row_spec = self._row_intuiter_to_dict(ri)
            ti = TypeIntuiter().process_header(ri.headers).run(source)
            w.set_row_spec(row_spec, ri.headers)
            w.set_types(ti)

        f.load_rows(source, run_stats=True)

        expected = {
            u('str_a'):   (30, None, None, None, 10),
            u('str_b'):   (30, None, None, None, 10),
            u('float_a'): (30, 1.0, 5.5, 10.0, 10),
            u('float_b'): (30, 1.1, 5.5, 9.9, 10),
            u('float_c'): (30, 1.1, 5.5, 9.9, 10),
            u('int_b'):   (30, 1.0, 5.0, 9.0, 10),
            u('int_a'):   (30, 1.0, 5.5, 10.0, 10)}

        with f.reader as r:

            for col in r.columns:
                stats = (col.stat_count, col.min, round(col.mean, 1) if col.mean else None,
                         col.max,
                         col.nuniques)
                for a, b in zip(expected[col.name], stats):
                    self.assertEqual(
                        a, b,
                        'Expected stat ({}) does not match the saved value ({}) for {}'.format(a, b, col.name))
예제 #10
0
    def test_header_coalesce(self):
        from ambry_sources.intuit import RowIntuiter

        def csplit(h):
            return [r.split(',') for r in h]

        h = [
            'a1,,a3,,a5,,a7',
            'b1,,b3,,b5,,b7',
            ',c2,,c4,,c6,',
            'd1,d2,d3,d4,d5,d6,d7'
        ]

        hc = [u'a1 b1 d1',
              u'a1 b1 c2 d2',
              u'a3 b3 c2 d3',
              u'a3 b3 c4 d4',
              u'a5 b5 c4 d5',
              u'a5 b5 c6 d6',
              u'a7 b7 c6 d7']

        self.assertEqual(hc, RowIntuiter.coalesce_headers(csplit(h)))
Example #9
    def set_row_spec(self, ri_or_ss):
        """Set the row spec and schema from a RowIntuiter object or a SourceSpec"""

        from itertools import islice
        from operator import itemgetter
        from ambry_sources.intuit import RowIntuiter

        def set_descriptions(w, descriptions):

            for c, d in zip(w.columns, descriptions):
                col = w.column(c.name)
                d = d.replace('\n', ' ').replace('\r', ' ')
                col.description = d

        if isinstance(ri_or_ss, RowIntuiter):
            ri = ri_or_ss

            with self.parent.writer as w:

                w.data_start_row = ri.start_line
                w.data_end_row = ri.end_line if ri.end_line else None

                w.meta['row_spec']['header_rows'] = ri.header_lines
                w.meta['row_spec']['comment_rows'] = ri.comment_lines
                w.meta['row_spec']['start_row'] = ri.start_line
                w.meta['row_spec']['end_row'] = ri.end_line
                w.meta['row_spec']['data_pattern'] = ri.data_pattern_source

                set_descriptions(w, [h for h in ri.headers])

                w.headers = [self.header_mangler(h) for h in ri.headers]

        else:
            ss = ri_or_ss

            with self.parent.reader as r:
                # If the header lines are specified, we also need to coalesce them and
                # set the header.
                if ss.header_lines:

                    max_header_line = max(ss.header_lines)
                    rows = list(islice(r.raw, max_header_line + 1))

                    header_lines = itemgetter(*ss.header_lines)(rows)

                    if not isinstance(header_lines[0], (list, tuple)):
                        header_lines = [header_lines]

                else:
                    header_lines = None

            with self.parent.writer as w:

                w.data_start_row = ss.start_line
                w.data_end_row = ss.end_line if ss.end_line else None

                w.meta['row_spec']['header_rows'] = ss.header_lines
                w.meta['row_spec']['comment_rows'] = None
                w.meta['row_spec']['start_row'] = ss.start_line
                w.meta['row_spec']['end_row'] = ss.end_line
                w.meta['row_spec']['data_pattern'] = None

                if header_lines:
                    set_descriptions(w, [
                        h for h in RowIntuiter.coalesce_headers(header_lines)
                    ])
                    w.headers = [
                        self.header_mangler(h)
                        for h in RowIntuiter.coalesce_headers(header_lines)
                    ]

        # Now, look for the end line.
        if False:
            # FIXME: Maybe later ...
            r = self.parent.reader
            # Look at the last 100 rows, but don't start before the start row.
            test_rows = 100
            start = max(r.data_start_row, r.data_end_row - test_rows)

            end_rows = list(islice(r.raw, start, None))

            ri.find_end(end_rows)
Example #10
    def test_load_and_headers(self):
        """ Just checks that all of the sources can be loaded without exceptions. """
        from ambry_sources import head, tail

        cache_fs = fsopendir('temp://')

        source_headers = {
            'mz_with_zip_xl': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'mz_no_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'namesu8': [u('origin_english'), u('name_english'), u('origin_native'), u('name_native')],
            'sf_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'simple': [u('id'), u('uuid'), u('int'), u('float')],
            'csv_no_csv': [u('id'), u('uuid'), u('int'), u('float')],
            'mz_with_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'rpeople': [u('name'), u('size')],
            'rent07': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'simple_fixed': [u('id'), u('uuid'), u('int'), u('float')],
            'altname': [u('id'), u('foo'), u('bar'), u('baz')],
            'rentcsv': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'renttab': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'multiexcel': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'rent97': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')]
        }

        for source_name, spec in self.sources.items():
            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            f = HDFPartition(cache_fs, spec.name)
            if f.exists:
                f.remove()

            # FIXME: This is really complicated setup for HDFPartition file. Try to simplify.
            with f.writer as w:
                if spec.has_rowspec:
                    row_spec = self._spec_to_dict(spec)
                    headers = self._get_headers(s, spec)
                    ti = TypeIntuiter().process_header(headers).run(s)
                    w.set_row_spec(row_spec, headers)
                    w.set_types(ti)
                else:
                    ri = RowIntuiter().run(head(s, 20), tail(s, 20), w.n_rows)
                    row_spec = self._row_intuiter_to_dict(ri)
                    ti = TypeIntuiter().process_header(ri.headers).run(s)
                    w.set_row_spec(row_spec, ri.headers)
                    w.set_types(ti)
            f.load_rows(s)

            with f.reader as r:
                if spec.name in source_headers:
                    self.assertEqual(source_headers[spec.name], r.headers)