Example #1
    def run_row_intuiter(self):
        """Run the row intuiter and store the results back into the metadata"""
        from .intuit import RowIntuiter
        from itertools import islice
        import time

        try:
            self._process = 'intuit_rows'
            self._start_time = time.time()

            with self.reader as r:
                if r.n_rows == 0:
                    return

                # Take the first N_TEST_ROWS rows as the head sample
                head = list(islice(r.raw, RowIntuiter.N_TEST_ROWS))
                n_rows = r.n_rows

            with self.reader as r:
                # Reset the iterator to get the tail
                if RowIntuiter.N_TEST_ROWS < r.n_rows:
                    tail = list(
                        islice(r.raw, r.n_rows - RowIntuiter.N_TEST_ROWS,
                               r.n_rows))
                else:
                    tail = list(islice(r.raw, 0, r.n_rows))

            ri = RowIntuiter().run(head, tail, n_rows)

            with self.writer as w:
                w.set_row_spec(ri)

        finally:
            self._process = 'none'
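
For reference, the head/tail sampling above can be reproduced on its own; the following is a minimal sketch using a plain list in place of r.raw (the names rows and N_TEST_ROWS here are stand-ins for illustration, not part of the API):

    from itertools import islice

    N_TEST_ROWS = 20                            # stand-in for RowIntuiter.N_TEST_ROWS
    rows = [[i, i * 2] for i in range(100)]     # stand-in for the reader's raw rows (r.raw)
    n_rows = len(rows)

    # The first N_TEST_ROWS rows form the head sample
    head = list(islice(iter(rows), N_TEST_ROWS))

    # The last N_TEST_ROWS rows form the tail sample; a short file is taken whole
    if N_TEST_ROWS < n_rows:
        tail = list(islice(iter(rows), n_rows - N_TEST_ROWS, n_rows))
    else:
        tail = list(islice(iter(rows), 0, n_rows))

    assert len(head) == N_TEST_ROWS and len(tail) == N_TEST_ROWS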
Example #2
    def test_generator(self):
        from ambry_sources.sources import GeneratorSource, SourceSpec
        from ambry_sources import head, tail
        cache_fs = fsopendir(self.setup_temp_dir())

        def gen():
            # Header row, followed by ten data rows
            yield list('abcde')

            for i in range(10):
                yield [i, i + 1, i + 2, i + 3, i + 4]

        f = HDFPartition(cache_fs, 'foobar')

        s = GeneratorSource(SourceSpec('foobar'), gen())

        ri = RowIntuiter().run(head(s, 100), tail(s, 100))
        row_spec = self._row_intuiter_to_dict(ri)
        ti = TypeIntuiter().process_header(ri.headers).run(GeneratorSource(SourceSpec('foobar'), gen()))
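        # Register the intuited row spec and column types with the writer before loading rows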
        with f.writer as w:
            w.set_row_spec(row_spec, ri.headers)
            w.set_types(ti)

        f.load_rows(GeneratorSource(SourceSpec('foobar'), gen()))

        self.assertEqual(f.headers, list('abcde'))
        rows = []

        for row in f.select():
            rows.append(row.dict)
        self.assertEqual(len(rows), 10)
        self.assertEqual(rows[0], {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4})
        self.assertEqual(rows[-1], {'a': 9, 'b': 10, 'c': 11, 'd': 12, 'e': 13})
Example #3
    def test_row_intuit(self):
        """Check that the sources can be loaded and analyzed without exceptions and that the
        guesses for headers and start are as expected"""

        from ambry_sources.intuit import RowIntuiter

        cache_fs = fsopendir('temp://')
        # cache_fs = fsopendir('/tmp/ritest/')

        sources = self.load_sources('sources-non-std-headers.csv')

        for source_name, spec in sources.items():

            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            rows = list(s)
            n = len(rows)

            # The files are short, so the head and tail overlap
            ri = RowIntuiter(debug=False).run(rows[:int(n * .75)],
                                              rows[int(n * .25):], len(rows))

            print(source_name, ri.start_line, ri.header_lines)

            self.assertEqual(
                spec.expect_headers, ','.join(str(e) for e in ri.header_lines),
                'Headers of {} source do not match the row intuiter'.format(
                    spec.name))

            self.assertEqual(
                spec.expect_start, ri.start_line,
                'Start line of {} source does not match the row intuiter start line.'
                .format(spec.name))
Example #4
    def test_fixed(self):
        from ambry_sources import head, tail
        cache_fs = fsopendir(self.setup_temp_dir())
        spec = self.sources['simple_fixed']
        assert spec.has_rowspec is False
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        # prepare HDFPartition.
        f = HDFPartition(cache_fs, spec.name)

        ri = RowIntuiter().run(head(s, 100), tail(s, 100))
        row_spec = self._row_intuiter_to_dict(ri)
        ti = TypeIntuiter().process_header(ri.headers).run(s)
        with f.writer as w:
            w.set_row_spec(row_spec, ri.headers)
            w.set_types(ti)
        f.load_rows(s)
        self.assertEqual(f.headers, ['id', 'uuid', 'int', 'float'])
Example #5
    def test_stats(self):
        """Check that the sources can be loaded and analyzed without exceptions and that the
        guesses for headers and start are as expected"""
        from ambry_sources import head, tail

        cache_fs = fsopendir('temp://')

        source = get_source(self.sources['simple_stats'], cache_fs, callback=lambda x, y: (x, y))

        f = HDFPartition(cache_fs, source.spec.name)

        with f.writer as w:
            ri = RowIntuiter().run(head(source, 100), tail(source, 100))
            row_spec = self._row_intuiter_to_dict(ri)
            ti = TypeIntuiter().process_header(ri.headers).run(source)
            w.set_row_spec(row_spec, ri.headers)
            w.set_types(ti)

        f.load_rows(source, run_stats=True)

        # Expected per-column stats: (count, min, mean, max, nuniques)
        expected = {
            u('str_a'):   (30, None, None, None, 10),
            u('str_b'):   (30, None, None, None, 10),
            u('float_a'): (30, 1.0, 5.5, 10.0, 10),
            u('float_b'): (30, 1.1, 5.5, 9.9, 10),
            u('float_c'): (30, 1.1, 5.5, 9.9, 10),
            u('int_b'):   (30, 1.0, 5.0, 9.0, 10),
            u('int_a'):   (30, 1.0, 5.5, 10.0, 10)}

        with f.reader as r:

            for col in r.columns:
                stats = (col.stat_count, col.min, round(col.mean, 1) if col.mean else None,
                         col.max,
                         col.nuniques)
                for a, b in zip(expected[col.name], stats):
                    self.assertEqual(
                        a, b,
                        'Expected stat ({}) does not match the saved stat ({}) for {}'.format(a, b, col.name))
Example #6
    def test_load_and_headers(self):
        """ Just checks that all of the sources can be loaded without exceptions. """
        from ambry_sources import head, tail

        cache_fs = fsopendir('temp://')

        source_headers = {
            'mz_with_zip_xl': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'mz_no_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'namesu8': [u('origin_english'), u('name_english'), u('origin_native'), u('name_native')],
            'sf_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'simple': [u('id'), u('uuid'), u('int'), u('float')],
            'csv_no_csv': [u('id'), u('uuid'), u('int'), u('float')],
            'mz_with_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'rpeople': [u('name'), u('size')],
            'rent07': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'simple_fixed': [u('id'), u('uuid'), u('int'), u('float')],
            'altname': [u('id'), u('foo'), u('bar'), u('baz')],
            'rentcsv': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'renttab': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'multiexcel': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'rent97': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')]
        }

        # Load each source into an HDFPartition and verify its headers where expected values are known
        for source_name, spec in self.sources.items():
            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            f = HDFPartition(cache_fs, spec.name)
            if f.exists:
                f.remove()

            # FIXME: This is really complicated setup for HDFPartition file. Try to simplify.
            with f.writer as w:
                if spec.has_rowspec:
                    row_spec = self._spec_to_dict(spec)
                    headers = self._get_headers(s, spec)
                    ti = TypeIntuiter().process_header(headers).run(s)
                    w.set_row_spec(row_spec, headers)
                    w.set_types(ti)
                else:
                    ri = RowIntuiter().run(head(s, 20), tail(s, 20), w.n_rows)
                    row_spec = self._row_intuiter_to_dict(ri)
                    ti = TypeIntuiter().process_header(ri.headers).run(s)
                    w.set_row_spec(row_spec, ri.headers)
                    w.set_types(ti)
            f.load_rows(s)

            with f.reader as r:
                if spec.name in source_headers:
                    self.assertEqual(source_headers[spec.name], r.headers)
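
The row/type intuit step used throughout these tests can also be exercised outside of the test class, i.e. without the _row_intuiter_to_dict and _get_headers helpers. The following is a minimal sketch based on Example #2; the import location of TypeIntuiter is assumed to match RowIntuiter's, and the spec name 'example' is arbitrary:

    from ambry_sources import head, tail
    from ambry_sources.sources import GeneratorSource, SourceSpec
    from ambry_sources.intuit import RowIntuiter, TypeIntuiter  # TypeIntuiter location assumed

    def gen():
        yield list('abcde')                        # header row
        for i in range(10):
            yield [i, i + 1, i + 2, i + 3, i + 4]  # data rows

    s = GeneratorSource(SourceSpec('example'), gen())

    # Guess where the header and data rows start from head/tail samples
    ri = RowIntuiter().run(head(s, 100), tail(s, 100))
    print(ri.headers, ri.start_line, ri.header_lines)

    # Guess column types from a fresh pass over the source
    ti = TypeIntuiter().process_header(ri.headers).run(GeneratorSource(SourceSpec('example'), gen()))
    # ti can then be passed to w.set_types(ti), as in the examples above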