Ejemplo n.º 1
0
    def test_creates_virtual_tables_for_partition_with_segment_without_errors(
            self):
        """add_partition() on a simple two-row MPR file must not raise, and
        the resulting virtual table must expose exactly the loaded rows."""

        fs = fsopendir('temp://')

        def gen():
            # generate header
            yield ['col1', 'col2']

            # generate rows
            yield [0, 0]
            yield [1, 1]

        mprows = MPRowsFile(fs, 'example.com/simple-0.1.3/1.mpr')
        mprows.load_rows(GeneratorSource(SourceSpec('foobar'), gen()))

        # create virtual tables. This should not raise an error.
        #
        connection = apsw.Connection(':memory:')
        try:
            add_partition(connection, mprows, 'vid1')
        except Exception as exc:
            # NOTE: fixed typo in the failure message
            # ('unexpectadly' -> 'unexpectedly').
            raise AssertionError(
                'partition adding unexpectedly failed with {} error.'.format(
                    exc))

        # check selected rows
        #
        cursor = connection.cursor()
        result = cursor.execute('SELECT * FROM {}'.format('vid1')).fetchall()
        self.assertEqual(result, [(0, 0), (1, 1)])
Ejemplo n.º 2
0
    def test_creates_virtual_tables_for_partition_with_segment_without_errors(self):
        """add_partition() on a simple two-row MPR file must not raise, and
        the resulting virtual table must expose exactly the loaded rows."""

        fs = fsopendir('temp://')

        def gen():
            # generate header
            yield ['col1', 'col2']

            # generate rows
            yield [0, 0]
            yield [1, 1]

        mprows = MPRowsFile(fs, 'example.com/simple-0.1.3/1.mpr')
        mprows.load_rows(GeneratorSource(SourceSpec('foobar'), gen()))

        # create virtual tables. This should not raise an error.
        #
        connection = apsw.Connection(':memory:')
        try:
            add_partition(connection, mprows, 'vid1')
        except Exception as exc:
            # NOTE: fixed typo in the failure message
            # ('unexpectadly' -> 'unexpectedly').
            raise AssertionError('partition adding unexpectedly failed with {} error.'.format(exc))

        # check selected rows
        #
        cursor = connection.cursor()
        result = cursor.execute('SELECT * FROM {}'.format('vid1')).fetchall()
        self.assertEqual(result, [(0, 0), (1, 1)])
Ejemplo n.º 3
0
    def test_load_check_headers(self):
        """Just check that all of the sources can be loaded without exceptions"""

        cache_fs = fsopendir('temp://')

        # Two header layouts are shared by most fixture sources; name them
        # once instead of repeating the literals per source.
        rent_headers = [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')]
        simple_headers = [u('id'), u('uuid'), u('int'), u('float')]

        headers = {
            'mz_with_zip_xl': rent_headers,
            'mz_no_zip': simple_headers,
            'namesu8': [u('origin_english'), u('name_english'), u('origin_native'), u('name_native')],
            'sf_zip': simple_headers,
            'simple': simple_headers,
            'csv_no_csv': simple_headers,
            'mz_with_zip': simple_headers,
            'rpeople': [u('name'), u('size')],
            'rent07': rent_headers,
            'simple_fixed': simple_headers,
            'altname': [u('id'), u('foo'), u('bar'), u('baz')],
            'rentcsv': rent_headers,
            'renttab': rent_headers,
            'multiexcel': rent_headers,
            'rent97': rent_headers,
        }

        for source_name, spec in self.sources.items():
            print(source_name)
            source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            mpr = MPRowsFile(cache_fs, spec.name)
            if mpr.exists:
                mpr.remove()

            mpr.load_rows(source)

            # Sources with a known layout must expose exactly those headers.
            with mpr.reader as reader:
                if spec.name in headers:
                    self.assertEqual(headers[spec.name], reader.headers)
Ejemplo n.º 4
0
    def test_row_load_intuit(self):
        """Check that the sources can be loaded and analyzed without exceptions and that the
        guesses for headers and start are as expected"""

        from itertools import islice

        cache_fs = fsopendir('temp://')
        cache_fs.makedir('/mpr')

        sources = self.load_sources('sources-non-std-headers.csv')

        for source_name, spec in sources.items():
            source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            mpr = MPRowsFile(cache_fs, '/mpr/{}'.format(source_name))
            if mpr.exists:
                mpr.remove()

            mpr.load_rows(source, intuit_type=False, run_stats=False, limit=500)

            self.assertEqual(mpr.info['data_start_row'], spec.expect_start)

            # First row whose metadata marks it as a data ('D') row.
            with mpr.reader as reader:
                _, first_data_row = next(
                    six.moves.filter(lambda e: e[0][2] == 'D', reader.meta_raw))

            # First row from the plain row iterator.
            with mpr.reader as reader:
                first_row = next(reader.rows)

            # First row via the row-proxy interface.
            with mpr.reader as reader:
                first_proxy_row = next(iter(reader)).row

            # All three access paths must agree on the first data row.
            self.assertEqual(first_data_row, first_row)
            self.assertEqual(first_data_row, first_proxy_row)

            with mpr.reader as reader:
                raw_rows = list(islice(reader.raw, None, 40))

            self.assertEqual(first_row, raw_rows[mpr.info['data_start_row']])
Ejemplo n.º 5
0
    def test_row_load_intuit(self):
        """Check that the sources can be loaded and analyzed without exceptions and that the
        guesses for headers and start are as expected"""

        from itertools import islice

        cache_fs = fsopendir('temp://')
        cache_fs.makedir('/mpr')

        sources = self.load_sources('sources-non-std-headers.csv')

        for source_name, spec in sources.items():
            source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            mpr = MPRowsFile(cache_fs, '/mpr/' + source_name)
            if mpr.exists:
                mpr.remove()

            mpr.load_rows(source, intuit_type=False, run_stats=False, limit=500)

            self.assertEqual(mpr.info['data_start_row'], spec.expect_start)

            # First row whose metadata marks it as a data ('D') row.
            with mpr.reader as reader:
                _, meta_row = next(six.moves.filter(lambda e: e[0][2] == 'D', reader.meta_raw))

            # First row from the plain row iterator.
            with mpr.reader as reader:
                plain_row = next(reader.rows)

            # First row via the row-proxy interface.
            with mpr.reader as reader:
                proxy_row = next(iter(reader)).row

            # All three access paths must agree on the first data row.
            self.assertEqual(meta_row, plain_row)
            self.assertEqual(meta_row, proxy_row)

            with mpr.reader as reader:
                raw_rows = list(islice(reader.raw, None, 40))

            self.assertEqual(plain_row, raw_rows[mpr.info['data_start_row']])
Ejemplo n.º 6
0
    def test_full_load(self):
        """Just check that all of the sources can be loaded without exceptions"""

        cache_fs = fsopendir('temp://')

        for source_name, spec in self.sources.items():
            source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            mpr = MPRowsFile(cache_fs, spec.name)
            if mpr.exists:
                mpr.remove()

            mpr.load_rows(source)

            # A successful load must produce at least one header column.
            with mpr.reader as reader:
                self.assertTrue(len(reader.headers) > 0)
Ejemplo n.º 7
0
    def test_full_load(self):
        """Just check that all of the sources can be loaded without exceptions"""

        cache_fs = fsopendir('temp://')

        for source_name, spec in self.sources.items():
            source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            mpr_file = MPRowsFile(cache_fs, spec.name)
            if mpr_file.exists:
                mpr_file.remove()

            mpr_file.load_rows(source)

            # A successful load must produce at least one header column.
            with mpr_file.reader as reader:
                self.assertTrue(len(reader.headers) > 0)
Ejemplo n.º 8
0
    def test_bad_row_intuition(self):
        """Loading a remote CSV must intuit the expected data end row."""
        from ambry_sources.mpf import MPRowsFile
        from ambry_sources.sources.spec import SourceSpec

        cache_fs = fsopendir('temp://')

        url = ('http://public.source.civicknowledge.com/example.com/sources/'
               'simple-example.csv')
        spec = SourceSpec(url, name='simple')

        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        mpr = MPRowsFile(cache_fs, spec.name)
        if mpr.exists:
            mpr.remove()

        mpr.load_rows(source)

        self.assertEqual(10001, mpr.reader.info['data_end_row'])
Ejemplo n.º 9
0
    def test_selects_correct_rows_from_many_mprows(self):
        """Each of several partitions attached to one connection must return
        exactly its own rows."""

        fs = fsopendir('temp://')
        header = ['col1', 'col2']

        # table name -> rows loaded into that mprows file.
        datasets = {
            'vid1': [(0, 0), (1, 1)],
            'vid2': [(2, 2), (3, 3)],
            'vid3': [(4, 4), (5, 5)],
        }

        connection = apsw.Connection(':memory:')

        # create one mprows file and one virtual table per dataset.
        #
        for vid, rows in sorted(datasets.items()):
            mprows = MPRowsFile(fs, vid)
            mprows.load_rows(self._get_generator_source(header, rows))
            add_partition(connection, mprows, vid)

        # check rows of all added mprows.
        #
        cursor = connection.cursor()
        query_tmpl = 'SELECT * FROM {};'

        for vid, rows in sorted(datasets.items()):
            result = cursor.execute(query_tmpl.format(vid)).fetchall()
            self.assertEqual(result, rows)
Ejemplo n.º 10
0
    def test_selects_correct_rows_from_many_mprows(self):
        """Each of several partitions attached to one connection must return
        exactly its own rows."""

        fs = fsopendir('temp://')
        header = ['col1', 'col2']

        # (table name, rows) for the three mprows files under test.
        datasets = [
            ('vid1', [(0, 0), (1, 1)]),
            ('vid2', [(2, 2), (3, 3)]),
            ('vid3', [(4, 4), (5, 5)]),
        ]

        connection = apsw.Connection(':memory:')

        # create one mprows file and one virtual table per dataset.
        #
        for vid, rows in datasets:
            mprows = MPRowsFile(fs, vid)
            mprows.load_rows(self._get_generator_source(header, rows))
            add_partition(connection, mprows, vid)

        # check rows of all added mprows.
        #
        cursor = connection.cursor()
        query_tmpl = 'SELECT * FROM {};'

        for vid, rows in datasets:
            result = cursor.execute(query_tmpl.format(vid)).fetchall()
            self.assertEqual(result, rows)
Ejemplo n.º 11
0
    def test_bad_row_intuition(self):
        """Loading a fixed-width census geofile must intuit the expected
        data end row."""
        from ambry_sources.mpf import MPRowsFile
        from ambry_sources.sources.spec import SourceSpec, ColumnSpec

        cache_fs = fsopendir('temp://')

        spec = SourceSpec(
            'http://www2.census.gov/acs2009_1yr/summaryfile/Entire_States/Arizona.zip',
            # Raw string: '\.' is an invalid escape in a plain literal
            # (SyntaxWarning on modern Python); the regex is unchanged.
            file=r'g2009.*\.txt',
            filetype='fixed',
            name='geofile',
            encoding='latin1',
        )

        # Fixed-width column layout of the ACS geography file
        # (1-based start positions, widths in characters).
        spec.columns = [
            ColumnSpec(position=1, width=6, name='fileid', start=1),
            ColumnSpec(position=2, width=2, name='stusab', start=7),
            ColumnSpec(position=3, width=3, name='sumlevel', start=9),
            ColumnSpec(position=4, width=2, name='component', start=12),
            ColumnSpec(position=5, width=7, name='logrecno', start=14),
            ColumnSpec(position=6, width=1, name='us', start=21),
            ColumnSpec(position=7, width=1, name='region', start=22),
            ColumnSpec(position=8, width=1, name='division', start=23),
            ColumnSpec(position=9, width=2, name='statece', start=24),
            ColumnSpec(position=10, width=2, name='state', start=26),
            ColumnSpec(position=11, width=3, name='county', start=28),
            ColumnSpec(position=12, width=5, name='cousub', start=31),
            ColumnSpec(position=13, width=5, name='place', start=36),
            ColumnSpec(position=14, width=6, name='tract', start=41),
            ColumnSpec(position=15, width=1, name='blkgrp', start=47),
            ColumnSpec(position=16, width=5, name='concit', start=48),
            ColumnSpec(position=17, width=4, name='aianhh', start=53),
            ColumnSpec(position=18, width=5, name='aianhhfp', start=57),
            ColumnSpec(position=19, width=1, name='aihhtli', start=62),
            ColumnSpec(position=20, width=3, name='aitsce', start=63),
            ColumnSpec(position=21, width=5, name='aits', start=66),
            ColumnSpec(position=22, width=5, name='anrc', start=71),
            ColumnSpec(position=23, width=5, name='cbsa', start=76),
            ColumnSpec(position=24, width=3, name='csa', start=81),
            ColumnSpec(position=25, width=5, name='metdiv', start=84),
            ColumnSpec(position=26, width=1, name='macc', start=89),
            ColumnSpec(position=27, width=1, name='memi', start=90),
            ColumnSpec(position=28, width=5, name='necta', start=91),
            ColumnSpec(position=29, width=3, name='cnecta', start=96),
            ColumnSpec(position=30, width=5, name='nectadiv', start=99),
            ColumnSpec(position=31, width=5, name='ua', start=104),
            ColumnSpec(position=33, width=2, name='cdcurr', start=114),
            ColumnSpec(position=34, width=3, name='sldu', start=116),
            ColumnSpec(position=35, width=3, name='sldl', start=119),
            ColumnSpec(position=39, width=5, name='submcd', start=136),
            ColumnSpec(position=40, width=5, name='sdelm', start=141),
            ColumnSpec(position=41, width=5, name='sdsec', start=146),
            ColumnSpec(position=42, width=5, name='sduni', start=151),
            ColumnSpec(position=43, width=1, name='ur', start=156),
            ColumnSpec(position=44, width=1, name='pci', start=157),
            ColumnSpec(position=47, width=5, name='puma5', start=169),
            ColumnSpec(position=49, width=40, name='geoid', start=179),
            ColumnSpec(position=50, width=200, name='name', start=219)
        ]

        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        f = MPRowsFile(cache_fs, spec.name)

        if f.exists:
            f.remove()

        f.load_rows(s)

        self.assertEqual(119, f.reader.info['data_end_row'])
Ejemplo n.º 12
0
    def test_load_check_headers(self):
        """Just check that all of the sources can be loaded without exceptions"""

        cache_fs = fsopendir('temp://')

        # Two header layouts are shared by most fixture sources; name them
        # once instead of repeating the literals per source.
        rent_headers = [
            u('id'),
            u('gvid'),
            u('renter_cost_gt_30'),
            u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'),
            u('owner_cost_gt_30_pct_cv'),
        ]
        simple_headers = [u('id'), u('uuid'), u('int'), u('float')]

        headers = {
            'mz_with_zip_xl': rent_headers,
            'mz_no_zip': simple_headers,
            'namesu8': [
                u('origin_english'),
                u('name_english'),
                u('origin_native'),
                u('name_native'),
            ],
            'sf_zip': simple_headers,
            'simple': simple_headers,
            'csv_no_csv': simple_headers,
            'mz_with_zip': simple_headers,
            'rpeople': [u('name'), u('size')],
            'rent07': rent_headers,
            'simple_fixed': simple_headers,
            'altname': [u('id'), u('foo'), u('bar'), u('baz')],
            'rentcsv': rent_headers,
            'renttab': rent_headers,
            'multiexcel': rent_headers,
            'rent97': rent_headers,
        }

        for source_name, spec in self.sources.items():
            print(source_name)
            source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            mpr = MPRowsFile(cache_fs, spec.name)
            if mpr.exists:
                mpr.remove()

            mpr.load_rows(source)

            # Sources with a known layout must expose exactly those headers.
            with mpr.reader as reader:
                if spec.name in headers:
                    self.assertEqual(headers[spec.name], reader.headers)
Ejemplo n.º 13
0
    def test_bad_row_intuition(self):
        """Loading a fixed-width census geofile must intuit the expected
        data end row."""
        from ambry_sources.mpf import MPRowsFile
        from ambry_sources.sources.spec import SourceSpec, ColumnSpec

        cache_fs = fsopendir('temp://')

        spec = SourceSpec('http://www2.census.gov/acs2009_1yr/summaryfile/Entire_States/Arizona.zip',
                          # Raw string: '\.' is an invalid escape in a plain
                          # literal (SyntaxWarning on modern Python); the
                          # regex is unchanged.
                          file=r'g2009.*\.txt',
                          filetype='fixed',
                          name='geofile',
                          encoding='latin1',
                          )

        # Fixed-width column layout of the ACS geography file
        # (1-based start positions, widths in characters).
        spec.columns = [ColumnSpec(position=1, width=6, name='fileid', start=1),
                        ColumnSpec(position=2, width=2, name='stusab', start=7),
                        ColumnSpec(position=3, width=3, name='sumlevel', start=9),
                        ColumnSpec(position=4, width=2, name='component', start=12),
                        ColumnSpec(position=5, width=7, name='logrecno', start=14),
                        ColumnSpec(position=6, width=1, name='us', start=21),
                        ColumnSpec(position=7, width=1, name='region', start=22),
                        ColumnSpec(position=8, width=1, name='division', start=23),
                        ColumnSpec(position=9, width=2, name='statece', start=24),
                        ColumnSpec(position=10, width=2, name='state', start=26),
                        ColumnSpec(position=11, width=3, name='county', start=28),
                        ColumnSpec(position=12, width=5, name='cousub', start=31),
                        ColumnSpec(position=13, width=5, name='place', start=36),
                        ColumnSpec(position=14, width=6, name='tract', start=41),
                        ColumnSpec(position=15, width=1, name='blkgrp', start=47),
                        ColumnSpec(position=16, width=5, name='concit', start=48),
                        ColumnSpec(position=17, width=4, name='aianhh', start=53),
                        ColumnSpec(position=18, width=5, name='aianhhfp', start=57),
                        ColumnSpec(position=19, width=1, name='aihhtli', start=62),
                        ColumnSpec(position=20, width=3, name='aitsce', start=63),
                        ColumnSpec(position=21, width=5, name='aits', start=66),
                        ColumnSpec(position=22, width=5, name='anrc', start=71),
                        ColumnSpec(position=23, width=5, name='cbsa', start=76),
                        ColumnSpec(position=24, width=3, name='csa', start=81),
                        ColumnSpec(position=25, width=5, name='metdiv', start=84),
                        ColumnSpec(position=26, width=1, name='macc', start=89),
                        ColumnSpec(position=27, width=1, name='memi', start=90),
                        ColumnSpec(position=28, width=5, name='necta', start=91),
                        ColumnSpec(position=29, width=3, name='cnecta', start=96),
                        ColumnSpec(position=30, width=5, name='nectadiv', start=99),
                        ColumnSpec(position=31, width=5, name='ua', start=104),
                        ColumnSpec(position=33, width=2, name='cdcurr', start=114),
                        ColumnSpec(position=34, width=3, name='sldu', start=116),
                        ColumnSpec(position=35, width=3, name='sldl', start=119),
                        ColumnSpec(position=39, width=5, name='submcd', start=136),
                        ColumnSpec(position=40, width=5, name='sdelm', start=141),
                        ColumnSpec(position=41, width=5, name='sdsec', start=146),
                        ColumnSpec(position=42, width=5, name='sduni', start=151),
                        ColumnSpec(position=43, width=1, name='ur', start=156),
                        ColumnSpec(position=44, width=1, name='pci', start=157),
                        ColumnSpec(position=47, width=5, name='puma5', start=169),
                        ColumnSpec(position=49, width=40, name='geoid', start=179),
                        ColumnSpec(position=50, width=200, name='name', start=219)]

        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        f = MPRowsFile(cache_fs, spec.name)

        if f.exists:
            f.remove()

        f.load_rows(s)

        self.assertEqual(119, f.reader.info['data_end_row'])