Example #1
    def test_generator(self):
        from ambry_sources.sources import GeneratorSource, SourceSpec

        cache_fs = fsopendir(self.setup_temp_dir())

        def gen():

            yield list('abcde')

            for i in range(10):
                yield [i, i + 1, i + 2, i + 3, i + 4]

        f = MPRowsFile(cache_fs, 'foobar').load_rows(
            GeneratorSource(SourceSpec('foobar'), gen()))

        self.assertEqual(1, f.info['data_start_row'])
        self.assertEqual(11, f.info['data_end_row'])
        self.assertEqual([0], f.info['header_rows'])

        self.assertEqual(f.headers, list('abcde'))
        rows = list(f.select())
        self.assertEqual(len(rows), 10)
        self.assertEqual(sorted(rows[0].keys()), sorted(list('abcde')))

        self.assertTrue(f.is_finalized)
    def test_creates_virtual_tables_for_partition_with_segment_without_errors(
            self):

        fs = fsopendir('temp://')

        def gen():
            # generate header
            yield ['col1', 'col2']

            # generate rows
            yield [0, 0]
            yield [1, 1]

        mprows = MPRowsFile(fs, 'example.com/simple-0.1.3/1.mpr')
        mprows.load_rows(GeneratorSource(SourceSpec('foobar'), gen()))

        # create virtual tables. This should not raise an error.
        #
        connection = apsw.Connection(':memory:')
        try:
            add_partition(connection, mprows, 'vid1')
        except Exception as exc:
            raise AssertionError(
                'partition adding unexpectedly failed with {} error.'.format(
                    exc))

        # check selected rows
        #
        cursor = connection.cursor()
        result = cursor.execute('SELECT * FROM {}'.format('vid1')).fetchall()
        self.assertEqual(result, [(0, 0), (1, 1)])
Example #4
    def test_spec_load(self):
        """Test that setting a SourceSpec propertly sets the header_lines data start position"""

        from ambry_sources.sources import SourceSpec
        import string

        rs = string.ascii_letters

        n = 500

        rows, headers = self.generate_rows(n)

        blank = ['' for e in rows[0]]

        # Append a complex header, to give the RowIntuiter something to do.
        rows = [
            ['Dataset Title'] + blank[1:],
            blank,
            blank,
            [rs[i] for i, e in enumerate(rows[0])],
            [rs[i + 1] for i, e in enumerate(rows[0])],
            [rs[i + 2] for i, e in enumerate(rows[0])],
        ] + rows

        f = MPRowsFile('mem://frh').load_rows(rows)

        d = f.info

        self.assertEqual(6, d['data_start_row'])
        self.assertEqual(506, d['data_end_row'])
        self.assertEqual([3, 4, 5], d['header_rows'])
        self.assertEqual([
            u('a_b_c'),
            u('b_c_d'),
            u('c_d_e'),
            u('d_e_f'),
            u('e_f_g'),
            u('f_g_h')
        ], d['headers'])

        class Rows(object):
            spec = SourceSpec(None, header_lines=(3, 4), start_line=5)

            def __iter__(self):
                return iter(rows)

        f = MPRowsFile('mem://frh').load_rows(Rows())

        d = f.info

        self.assertEqual(5, d['data_start_row'])
        self.assertEqual(506, d['data_end_row'])
        self.assertEqual([3, 4], d['header_rows'])
        self.assertEqual(
            [u('a_b'),
             u('b_c'),
             u('c_d'),
             u('d_e'),
             u('e_f'),
             u('f_g')], d['headers'])
Example #6
        def write_large_blocks():

            df = MPRowsFile(fs, 'foobar')

            if df.exists:
                df.remove()

            with Timer() as t, df.writer as w:
                w.headers = headers
                w.insert_rows(rows)

            print('MSGPack write L', float(N) / t.elapsed, w.n_rows)
Example #8
        def write_small_blocks():
            df = MPRowsFile(fs, 'foobar')

            if df.exists:
                df.remove()

            with Timer() as t, df.writer as w:

                for i in range(N):
                    w.headers = headers
                    w.insert_row(rows[i])

            print('MSGPack write S', float(N) / t.elapsed, w.n_rows)
Example #10
    def test_creates_virtual_table_for_simple_fixed_mpr(self):
        # build rows reader
        cache_fs = fsopendir(self.setup_temp_dir())
        sources = self.load_sources()
        spec = sources['simple_fixed']
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        mprows = MPRowsFile(cache_fs, spec.name).load_rows(s)

        # First make sure the file has not changed.
        expected_names = ['id', 'uuid', 'int', 'float']
        expected_types = ['int', binary_type.__name__, 'int', 'float']
        self.assertEqual([x['name'] for x in mprows.reader.columns],
                         expected_names)
        self.assertEqual([x['type'] for x in mprows.reader.columns],
                         expected_types)

        connection = apsw.Connection(':memory:')
        table = 'table1'
        add_partition(connection, mprows, table)

        # check all columns and some rows.
        cursor = connection.cursor()
        query = 'SELECT count(*) FROM {};'.format(table)
        result = cursor.execute(query).fetchall()
        self.assertEqual(result, [(10000, )])

        with mprows.reader as r:
            expected_first_row = next(iter(r)).row

        # query by columns.
        query = 'SELECT id, uuid, int, float FROM {} LIMIT 1;'.format(table)
        result = cursor.execute(query).fetchall()
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0], expected_first_row)
Example #11
        def Create(
                self,
                db,
                modulename,
                dbname,
                tablename,  # These args are required by APSW
                mpr_url,
                *args):  # These are our args.

            mprows = MPRowsFile(mpr_url)

            columns_types = []
            column_names = []

            for column in sorted(mprows.reader.columns,
                                 key=lambda x: x['pos']):
                sqlite_type = TYPE_MAP.get(column['type'])
                if not sqlite_type:
                    raise Exception(
                        'Do not know how to convert {} to sql column.'.format(
                            column['type']))
                columns_types.append('"{}" {}'.format(column['name'],
                                                      sqlite_type))
                column_names.append(column['name'])

            columns_types_str = ',\n'.join(columns_types)
            schema = 'CREATE TABLE {}({});'.format(tablename,
                                                   columns_types_str)

            return schema, Table(column_names, mprows)
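
The TYPE_MAP used in Create() maps MPR column types to SQLite column types; it is defined elsewhere in the module. A minimal sketch of what such a mapping could look like, assuming the type names that appear in the other examples ('int', 'float', 'str'); the real mapping may differ:

# Hypothetical sketch; ambry_sources defines its own TYPE_MAP.
TYPE_MAP = {
    'int': 'INTEGER',
    'float': 'REAL',
    'str': 'TEXT',
    'bytes': 'BLOB',
    'date': 'DATE',          # accepted by SQLite via type affinity
    'datetime': 'TIMESTAMP',
    'time': 'TIME',
}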
Example #12
        def first_row_header(data_start_row=None, data_end_row=None):

            # Normal Headers
            f = MPRowsFile('mem://frh')
            w = f.writer

            w.columns = headers

            for row in rows:
                w.insert_row(row)

            if data_start_row is not None:
                w.data_start_row = data_start_row

            if data_end_row is not None:
                w.data_end_row = data_end_row

            w.close()

            self.assertEqual((u('a'), u('b'), u('c'), u('d'), u('e'), u('f')),
                             tuple(w.parent.reader.headers))

            w.parent.reader.close()

            return f
Example #13
    def test_ctor(self):

        d = '/tmp/socrata'

        from os import makedirs
        from os.path import exists
        from shutil import rmtree

        if exists(d):
            print "Make", d
            rmtree(d)

        makedirs(d)

        cache_fs = fsopendir(d)  # fsopendir(self.setup_temp_dir())

        sources = self.load_sources(file_name='sources.csv')
        spec = sources['facilities']
        source = get_source(spec, cache_fs)

        def cb(*args):
            print(args)

        mpr = MPRowsFile(cache_fs, spec.name).load_rows(source,
                                                        callback=cb,
                                                        limit=10)
Example #14
    def test_type_intuit(self):
        from ambry_sources.intuit import TypeIntuiter

        cache_fs = fsopendir(self.setup_temp_dir())
        spec = self.sources['simple_fixed']
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        f = MPRowsFile(cache_fs, spec.name)

        with f.writer as w:
            w.load_rows(s)

        with f.reader as r:
            ti = TypeIntuiter().process_header(r.headers).run(r.rows, r.n_rows)

        with f.writer as w:
            w.set_types(ti)

        columns = []
        with f.reader as w:
            for col in w.columns:
                columns.append((col.pos, col.name, col.type))
        expected_columns = [(1, u'id', u'int'), (2, u'uuid', u'str'),
                            (3, u'int', u'int'), (4, u'float', u'float')]
        self.assertEqual(columns, expected_columns)
Example #15
        def schema_header(data_start_row=None, data_end_row=None):
            # Set the schema
            f = MPRowsFile('mem://sh')
            w = f.writer

            w.headers = ['x' + str(e) for e in range(len(headers))]

            for row in rows:
                w.insert_row(row)

            if data_start_row is not None:
                w.data_start_row = data_start_row

            if data_end_row is not None:
                w.data_end_row = data_end_row

            w.close()

            self.assertEqual(
                (u('x0'), u('x1'), u('x2'), u('x3'), u('x4'), u('x5')),
                tuple(w.parent.reader.headers))

            w.parent.reader.close()

            return f
    def test_all(self):
        """ Test all sources from geo_sources.csv """
        cache_fs = fsopendir(self.setup_temp_dir())

        sources = self.load_sources(file_name='geo_sources.csv')
        for name, spec in sources.items():
            if name == 'highways':
                # it is already tested. Skip.
                continue

            source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            # now check its load to MPRows
            mpr = MPRowsFile(cache_fs, spec.name).load_rows(source)
            first_row = next(iter(mpr.reader))

            # Are columns recognized properly?

            NAME_INDEX = 1  # which element of the column description contains name.
            # Collect all names from column descriptors. Skip first elem of the schema because
            # it's descriptor of column descriptor elements.
            columns = [x[NAME_INDEX] for x in mpr.meta['schema'][1:]]
            self.assertIn('id', columns)
            self.assertIn('geometry', columns)

            # Is first row valid?
            self.assertEqual(len(columns), len(first_row))
    def test_selects_correct_rows_from_many_mprows(self):

        fs = fsopendir('temp://')
        header = ['col1', 'col2']

        # create 3 mprows files.
        #
        rows1 = [(0, 0), (1, 1)]
        mprows1 = MPRowsFile(fs, 'vid1')
        mprows1.load_rows(self._get_generator_source(header, rows1))

        rows2 = [(2, 2), (3, 3)]
        mprows2 = MPRowsFile(fs, 'vid2')
        mprows2.load_rows(self._get_generator_source(header, rows2))

        rows3 = [(4, 4), (5, 5)]
        mprows3 = MPRowsFile(fs, 'vid3')
        mprows3.load_rows(self._get_generator_source(header, rows3))

        # create virtual tables for all mprows
        #
        connection = apsw.Connection(':memory:')

        add_partition(connection, mprows1, 'vid1')
        add_partition(connection, mprows2, 'vid2')
        add_partition(connection, mprows3, 'vid3')

        # check rows of all added mprows.
        #

        cursor = connection.cursor()
        query_tmpl = 'SELECT * FROM {};'

        # check rows of the first file.
        #
        query = query_tmpl.format('vid1')
        result = cursor.execute(query).fetchall()
        self.assertEqual(result, rows1)

        # check rows of the second mprows file.
        #
        query = query_tmpl.format('vid2')
        result = cursor.execute(query).fetchall()
        self.assertEqual(result, rows2)

        # check rows of the third mprows file.
        #
        query = query_tmpl.format('vid3')
        result = cursor.execute(query).fetchall()
        self.assertEqual(result, rows3)
Example #20
    def test_creates_foreign_data_table_for_simple_fixed_mpr(
            self, fake_shares):
        fake_shares.return_value = True
        # build rows reader
        cache_fs = fsopendir(self.setup_temp_dir())
        sources = self.load_sources()
        spec = sources['simple_fixed']
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        mprows = MPRowsFile(cache_fs, spec.name).load_rows(s)

        # first make sure file was not changed.
        expected_names = ['id', 'uuid', 'int', 'float']
        expected_types = ['int', binary_type.__name__, 'int', 'float']
        self.assertEqual(sorted([x['name'] for x in mprows.reader.columns]),
                         sorted(expected_names))
        self.assertEqual(sorted([x['type'] for x in mprows.reader.columns]),
                         sorted(expected_types))

        try:
            # create foreign data table
            PostgreSQLTestBase._create_postgres_test_db()
            conn = psycopg2.connect(**PostgreSQLTestBase.pg_test_db_data)

            try:
                with conn.cursor() as cursor:
                    # we have to close opened transaction.
                    cursor.execute('COMMIT;')
                    add_partition(cursor, mprows, 'table1')

                # try to query just added partition foreign data table.
                with conn.cursor() as cursor:
                    table = 'table1'

                    # count all rows
                    query = 'SELECT count(*) FROM {}.{};'.format(
                        POSTGRES_PARTITION_SCHEMA_NAME, table)
                    cursor.execute(query)
                    result = cursor.fetchall()
                    self.assertEqual(result, [(10000, )])

                    # check first row
                    cursor.execute(
                        'SELECT id, uuid, int, float FROM {}.{} LIMIT 1;'.
                        format(POSTGRES_PARTITION_SCHEMA_NAME, table))
                    result = cursor.fetchall()
                    self.assertEqual(len(result), 1)
                    expected_first_row = (1,
                                          'eb385c36-9298-4427-8925-fe09294dbd',
                                          30, Decimal('99.734691532'))
                    self.assertEqual(result[0], expected_first_row)

            finally:
                conn.close()
        finally:
            PostgreSQLTestBase._drop_postgres_test_db()
Example #21
    def test_headers(self):

        fs = fsopendir('mem://')

        df = MPRowsFile(fs, 'foobar')

        with df.writer as w:

            schema = lambda row, col: w.meta['schema'][row][col]

            w.headers = list('abcdefghi')

            self.assertEqual('a', schema(1, 1))
            self.assertEqual('e', schema(5, 1))
            self.assertEqual('i', schema(9, 1))

            for h in w.columns:
                h.description = "{}-{}".format(h.pos, h.name)

            self.assertEqual('1-a', schema(1, 3))
            self.assertEqual('5-e', schema(5, 3))
            self.assertEqual('9-i', schema(9, 3))

            w.column(1).description = 'one'
            w.column(2).description = 'two'
            w.column('c').description = 'C'
            w.column('d')['description'] = 'D'

            self.assertEqual('one', schema(1, 3))
            self.assertEqual('two', schema(2, 3))
            self.assertEqual('C', schema(3, 3))
            self.assertEqual('D', schema(4, 3))

        with df.reader as r:
            schema = lambda row, col: r.meta['schema'][row][col]

            self.assertEqual([
                u('a'),
                u('b'),
                u('c'),
                u('d'),
                u('e'),
                u('f'),
                u('g'),
                u('h'),
                u('i')
            ], r.headers)

            self.assertEqual('one', schema(1, 3))
            self.assertEqual('two', schema(2, 3))
            self.assertEqual('C', schema(3, 3))
            self.assertEqual('D', schema(4, 3))
Example #22
    def test_intuit_headers(self):
        sources = self.load_sources(file_name='sources.csv')

        for source_name in ['headers4', 'headers3', 'headers2', 'headers1']:
            cache_fs = fsopendir(self.setup_temp_dir())

            spec = sources[source_name]
            f = MPRowsFile(cache_fs, spec.name) \
                .load_rows(get_source(spec, cache_fs, callback=lambda x, y: (x, y)))

            self.assertEqual(spec.expect_start, f.info['data_start_row'])
            self.assertEqual([int(e) for e in spec.expect_headers.split(',')],
                             f.info['header_rows'])
Example #24
    def test_intuit_footer(self):
        sources = self.load_sources(file_name='sources.csv')

        for source_name in ['headers4', 'headers3', 'headers2', 'headers1']:
            cache_fs = fsopendir(self.setup_temp_dir())

            spec = sources[source_name]
            f = MPRowsFile(cache_fs, spec.name) \
                .load_rows(get_source(spec, cache_fs, callback=lambda x, y: (x, y)))

            with f.reader as r:
                last = list(r.rows)[-1]  # islice isn't working on the reader.
                print(source_name, last)
                self.assertEqual(11999, int(last[0]))
                self.assertEqual('2q080z003Cg2', last[1])
    def test_highways(self):
        # FIXME: Optimize to use local file instead of downloading it all the time.
        cache_fs = fsopendir(self.setup_temp_dir())

        sources = self.load_sources(file_name='geo_sources.csv')
        spec = sources['highways']
        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        # First check that it is converted properly.
        row_gen = source._get_row_gen()
        first_row = next(row_gen)

        # generates valid first row
        self.assertEqual(len(first_row), 68)
        self.assertEqual(first_row[0], 0)
        # last element is wkt.
        self.assertIn('LINESTRING', first_row[-1])

        # header is valid
        self.assertEqual(len(source._headers), 68)
        self.assertEqual(source._headers[0], 'id')
        self.assertEqual(source._headers[-1], 'geometry')

        # now check its load to MPRows
        mpr = MPRowsFile(cache_fs, spec.name).load_rows(source)

        # Are columns recognized properly?
        NAME_INDEX = 1  # index of the name element in a column descriptor.
        # Collect all names from the column descriptors. Skip the first element of the
        # schema because it describes the column-descriptor format itself.
        columns = [x[NAME_INDEX] for x in mpr.meta['schema'][1:]]
        self.assertIn('id', columns)
        self.assertIn('geometry', columns)
        self.assertIn('length', columns)  # column from shape file.

        # Is first row valid?
        first_row = next(iter(mpr.reader))
        self.assertEqual(len(first_row), 68)
        self.assertEqual(first_row['id'], 0)
        self.assertIn('LINESTRING', first_row['geometry'])

        # NOTE: early return; the spec.columns assertions below do not run.
        return

        # spec columns are properly populated
        self.assertEqual(len(spec.columns), 68)
        self.assertEqual(spec.columns[0]['name'], 'id')
        self.assertEqual(spec.columns[-1]['name'], 'geometry')
Example #26
    def __init__(self, options, columns):
        """

        Args:
            options (dict): requires 'filesystem' (the root directory, str) and
                'path' (the file name relative to that root).
                Example: {
                    'filesystem': '/tmp/my-root',
                    'path': '/dir1/file1.mpr'
                }
        """

        super(MPRForeignDataWrapper, self).__init__(options, columns)
        self.columns = columns
        if 'path' not in options:
            log_to_postgres(
                'Filename is required option of the partition msgpack fdw.',
                ERROR,
                hint='Try to add the `path` option to the table creation statement')
            raise RuntimeError(
                '`path` is required option of the MPR (Message Pack Rows) fdw.')

        if 'filesystem' not in options:
            log_to_postgres(
                'filesystem is required option of the partition msgpack fdw.',
                ERROR,
                hint='Try to add the `filesystem` option to the table creation statement')
            raise RuntimeError(
                '`filesystem` is required option of the MPR (Message Pack Rows) fdw.')
        self.filesystem = fsopendir(options['filesystem'])
        self.path = options['path']

        if logger.level == logging.DEBUG:
            current_user = getpass.getuser()
            log_to_postgres(
                'Initializing Foreign Data Wrapper: user: {}, filesystem: {}, path: {}'
                .format(current_user, options['filesystem'],
                        options['path']), DEBUG)
        self._mp_rows = MPRowsFile(self.filesystem, self.path)
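
The constructor above follows the options/columns interface of a PostgreSQL foreign data wrapper (the log_to_postgres calls suggest multicorn). A minimal instantiation sketch mirroring the docstring's example; the options and column list here are illustrative, and in practice the FDW machinery builds the instance from the foreign table's OPTIONS:

# Illustrative only: these values would normally come from the
# CREATE FOREIGN TABLE ... OPTIONS (...) statement.
options = {
    'filesystem': '/tmp/my-root',
    'path': '/dir1/file1.mpr',
}
columns = ['id', 'uuid', 'int', 'float']  # hypothetical column names

fdw = MPRForeignDataWrapper(options, columns)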
Example #27
def import_source(spec, cache_fs, file_path=None, account_accessor=None):
    """Download a source and load it into an MPR file."""

    s = get_source(spec, cache_fs, account_accessor)

    if not file_path:
        file_path = spec.name

    f = MPRowsFile(cache_fs, file_path)
    w = f.writer

    w.set_spec(spec)

    for row in s:
        w.insert_row(row)

    w.close()

    return f
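
A minimal usage sketch for import_source(), reusing the simple-example spec from the tests above and a throwaway temp:// cache:

from fs.opener import fsopendir
from ambry_sources.sources import SourceSpec

cache_fs = fsopendir('temp://')
spec = SourceSpec(
    'http://public.source.civicknowledge.com/example.com/sources/simple-example.csv',
    name='simple')

# Download the source and write it to an MPR file named after the spec.
f = import_source(spec, cache_fs)

with f.reader as r:
    print(r.headers)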
Example #28
    def test_stats(self):
        """Check that the sources can be loaded and analyzed without exceptions and that the
        guesses for headers and start are as expected"""

        #cache_fs = fsopendir('temp://')
        from shutil import rmtree
        from os import makedirs

        tp = '/tmp/mpr-test'
        rmtree(tp, ignore_errors=True)
        makedirs(tp)
        cache_fs = fsopendir(tp)

        s = get_source(self.sources['simple_stats'],
                       cache_fs,
                       callback=lambda x, y: (x, y))

        f = MPRowsFile(cache_fs, s.spec.name).load_rows(s, run_stats=True)

        stat_names = ('count', 'min', 'mean', 'max', 'nuniques')

        vals = {
            u('str_a'): (30, None, None, None, 10),
            u('str_b'): (30, None, None, None, 10),
            u('float_a'): (30, 1.0, 5.5, 10.0, 10),
            u('float_b'): (30, 1.1, 5.5, 9.9, 10),
            u('float_c'): (30, None, None, None, 10),
            u('int_b'): (30, None, None, None, 10),
            u('int_a'): (30, 1.0, 5.5, 10.0, 10)
        }

        with f.reader as r:

            for col in r.columns:
                stats = (col.stat_count, col.min,
                         round(col.mean, 1) if col.mean else None, col.max,
                         col.nuniques)

                for a, b, stat_name in zip(vals[col.name], stats, stat_names):
                    self.assertEqual(
                        a, b, "{} failed for stat {}: {} != {}".format(
                            col.name, stat_name, a, b))
    def test_creates_virtual_table_for_source_with_header_containing_sql_reserved_words(
            self):
        # build rows reader
        cache_fs = fsopendir(self.setup_temp_dir())

        spec = SourceSpec('foobar')

        def gen():

            # yield header
            yield ['create', 'index', 'where', 'select', 'distinct']

            # yield rows
            for i in range(10):
                yield [i, i + 1, i + 2, i + 3, i + 4]

        s = GeneratorSource(spec, gen())
        mprows = MPRowsFile(cache_fs, spec.name).load_rows(s)

        connection = apsw.Connection(':memory:')
        table = 'table1'
        add_partition(connection, mprows, table)

        # check all columns and some rows.
        cursor = connection.cursor()
        query = 'SELECT count(*) FROM {};'.format(table)
        result = cursor.execute(query).fetchall()
        self.assertEqual(result, [(10, )])

        with mprows.reader as r:
            expected_first_row = next(iter(r)).row

        # query by columns.
        query = 'SELECT "create", "index", "where", "select", "distinct" FROM {} LIMIT 1;'.format(
            table)
        result = cursor.execute(query).fetchall()
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0], expected_first_row)
Example #30
    def test_row_load_intuit(self):
        """Check that the sources can be loaded and analyzed without exceptions and that the
        guesses for headers and start are as expected"""

        from itertools import islice

        cache_fs = fsopendir('temp://')
        cache_fs.makedir('/mpr')
        # cache_fs = fsopendir('/tmp/ritest/')

        sources = self.load_sources('sources-non-std-headers.csv')

        for source_name, spec in sources.items():

            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            f = MPRowsFile(cache_fs, '/mpr/' + source_name)

            if f.exists:
                f.remove()

            f.load_rows(s, intuit_type=False, run_stats=False, limit=500)

            self.assertEqual(f.info['data_start_row'], spec.expect_start)

            with f.reader as r:
                # First row whose metadata marks it as a data row ('D')
                m1, row1 = next(
                    six.moves.filter(lambda e: e[0][2] == 'D', r.meta_raw))

            with f.reader as r:
                # First row
                row2 = next(r.rows)

            with f.reader as r:
                # First row proxy
                row3 = next(iter(r)).row

            self.assertEqual(row1, row2)
            self.assertEqual(row1, row3)

            with f.reader as r:
                raw_rows = list(islice(r.raw, None, 40))

            self.assertEqual(row2, raw_rows[f.info['data_start_row']])
Example #31
        def no_header(data_start_row=None, data_end_row=None):

            # No header, column labels.
            f = MPRowsFile('mem://nh')
            w = f.writer

            for row in rows:
                w.insert_row(row)

            if data_start_row is not None:
                w.data_start_row = data_start_row

            if data_end_row is not None:
                w.data_end_row = data_end_row

            w.close()

            self.assertEqual(['col1', 'col2', 'col3', 'col4', 'col5', 'col6'],
                             w.parent.reader.headers)

            w.parent.reader.close()

            return f
Example #32
    def test_full_load(self):
        """Just check that all of the sources can be loaded without exceptions"""

        cache_fs = fsopendir('temp://')

        for source_name, spec in self.sources.items():

            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            f = MPRowsFile(cache_fs, spec.name)

            if f.exists:
                f.remove()

            f.load_rows(s)

            with f.reader as r:
                self.assertTrue(len(r.headers) > 0)
    def test_bad_row_intuition(self):
        from ambry_sources.mpf import MPRowsFile
        from ambry_sources.sources.spec import SourceSpec

        cache_fs = fsopendir('temp://')

        spec = SourceSpec('http://public.source.civicknowledge.com/example.com/sources/simple-example.csv',
                          name='simple')

        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        f = MPRowsFile(cache_fs, spec.name)

        if f.exists:
            f.remove()

        f.load_rows(s)

        self.assertEqual(10001, f.reader.info['data_end_row'])
Example #36
    def test_fixed(self):
        cache_fs = fsopendir(self.setup_temp_dir())
        spec = self.sources['simple_fixed']
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        f = MPRowsFile(cache_fs, spec.name).load_rows(s)
        self.assertEqual(f.headers, ['id', 'uuid', 'int', 'float'])
Example #37
    def test_bad_row_intuition(self):
        from ambry_sources.mpf import MPRowsFile
        from ambry_sources.sources.spec import SourceSpec, ColumnSpec

        cache_fs = fsopendir('temp://')

        spec = SourceSpec(
            'http://www2.census.gov/acs2009_1yr/summaryfile/Entire_States/Arizona.zip',
            file=r'g2009.*\.txt',
            filetype='fixed',
            name='geofile',
            encoding='latin1',
        )

        spec.columns = [
            ColumnSpec(position=1, width=6, name='fileid', start=1),
            ColumnSpec(position=2, width=2, name='stusab', start=7),
            ColumnSpec(position=3, width=3, name='sumlevel', start=9),
            ColumnSpec(position=4, width=2, name='component', start=12),
            ColumnSpec(position=5, width=7, name='logrecno', start=14),
            ColumnSpec(position=6, width=1, name='us', start=21),
            ColumnSpec(position=7, width=1, name='region', start=22),
            ColumnSpec(position=8, width=1, name='division', start=23),
            ColumnSpec(position=9, width=2, name='statece', start=24),
            ColumnSpec(position=10, width=2, name='state', start=26),
            ColumnSpec(position=11, width=3, name='county', start=28),
            ColumnSpec(position=12, width=5, name='cousub', start=31),
            ColumnSpec(position=13, width=5, name='place', start=36),
            ColumnSpec(position=14, width=6, name='tract', start=41),
            ColumnSpec(position=15, width=1, name='blkgrp', start=47),
            ColumnSpec(position=16, width=5, name='concit', start=48),
            ColumnSpec(position=17, width=4, name='aianhh', start=53),
            ColumnSpec(position=18, width=5, name='aianhhfp', start=57),
            ColumnSpec(position=19, width=1, name='aihhtli', start=62),
            ColumnSpec(position=20, width=3, name='aitsce', start=63),
            ColumnSpec(position=21, width=5, name='aits', start=66),
            ColumnSpec(position=22, width=5, name='anrc', start=71),
            ColumnSpec(position=23, width=5, name='cbsa', start=76),
            ColumnSpec(position=24, width=3, name='csa', start=81),
            ColumnSpec(position=25, width=5, name='metdiv', start=84),
            ColumnSpec(position=26, width=1, name='macc', start=89),
            ColumnSpec(position=27, width=1, name='memi', start=90),
            ColumnSpec(position=28, width=5, name='necta', start=91),
            ColumnSpec(position=29, width=3, name='cnecta', start=96),
            ColumnSpec(position=30, width=5, name='nectadiv', start=99),
            ColumnSpec(position=31, width=5, name='ua', start=104),
            ColumnSpec(position=33, width=2, name='cdcurr', start=114),
            ColumnSpec(position=34, width=3, name='sldu', start=116),
            ColumnSpec(position=35, width=3, name='sldl', start=119),
            ColumnSpec(position=39, width=5, name='submcd', start=136),
            ColumnSpec(position=40, width=5, name='sdelm', start=141),
            ColumnSpec(position=41, width=5, name='sdsec', start=146),
            ColumnSpec(position=42, width=5, name='sduni', start=151),
            ColumnSpec(position=43, width=1, name='ur', start=156),
            ColumnSpec(position=44, width=1, name='pci', start=157),
            ColumnSpec(position=47, width=5, name='puma5', start=169),
            ColumnSpec(position=49, width=40, name='geoid', start=179),
            ColumnSpec(position=50, width=200, name='name', start=219)
        ]

        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        f = MPRowsFile(cache_fs, spec.name)

        if f.exists:
            f.remove()

        f.load_rows(s)

        self.assertEqual(119, f.reader.info['data_end_row'])
Example #38
    def test_datafile_read_write(self):
        from fs.opener import fsopendir
        import datetime
        from random import randint, random
        from contexttimer import Timer
        from uuid import uuid4

        fs = fsopendir('mem://')

        # fs = fsopendir('/tmp/pmpf')

        N = 50000

        # Basic read/ write tests.

        def rand_date():
            return datetime.date(randint(2000, 2015), randint(1, 12), 10)

        def rand_datetime():
            return datetime.datetime(randint(2000, 2015), randint(1, 12), 10)

        def rand_time():
            return datetime.time(randint(0, 23), randint(0, 59), 10)

        row = lambda: (None, 1, random(), str(uuid4()), rand_date(),
                       rand_datetime(), rand_time())

        headers = list('abcdefghi')[:len(row())]

        rows = [row() for i in range(N)]

        def write_large_blocks():

            df = MPRowsFile(fs, 'foobar')

            if df.exists:
                df.remove()

            with Timer() as t, df.writer as w:
                w.headers = headers
                w.insert_rows(rows)

            print('MSGPack write L', float(N) / t.elapsed, w.n_rows)

        def write_small_blocks():
            df = MPRowsFile(fs, 'foobar')

            if df.exists:
                df.remove()

            with Timer() as t, df.writer as w:

                for i in range(N):
                    w.headers = headers
                    w.insert_row(rows[i])

            print('MSGPack write S', float(N) / t.elapsed, w.n_rows)

        print()
        # Write the whole file with insert_rows() which writes all of the rows at once.
        write_large_blocks()

        # Write the file in small blocks, with insert_row() collecting rows into a
        # cache and then writing the cached blocks.
        write_small_blocks()

        df = MPRowsFile(fs, 'foobar')

        with Timer() as t:
            count = 0
            i = 0
            s = 0

            r = df.reader

            for i, row in enumerate(r):
                count += 1
            r.close()

        print('MSGPack read   ', float(N) / t.elapsed, i, count, s)

        with Timer() as t:
            count = 0

            r = df.reader

            for row in r.rows:

                count += 1

            r.close()

        print('MSGPack rows   ', float(N) / t.elapsed)

        with Timer() as t:
            count = 0

            r = df.reader

            for row in r.raw:
                count += 1

            r.close()

        print('MSGPack raw    ', float(N) / t.elapsed)
Example #39
    def test_load_check_headers(self):
        """Just check that all of the sources can be loaded without exceptions"""

        cache_fs = fsopendir('temp://')

        headers = {
            'mz_with_zip_xl': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'mz_no_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'namesu8': [u('origin_english'), u('name_english'), u('origin_native'), u('name_native')],
            'sf_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'simple': [u('id'), u('uuid'), u('int'), u('float')],
            'csv_no_csv': [u('id'), u('uuid'), u('int'), u('float')],
            'mz_with_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'rpeople': [u('name'), u('size')],
            'rent07': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'simple_fixed': [u('id'), u('uuid'), u('int'), u('float')],
            'altname': [u('id'), u('foo'), u('bar'), u('baz')],
            'rentcsv': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'renttab': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'multiexcel': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'rent97': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')]
        }

        for source_name, spec in self.sources.items():
            print(source_name)
            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            f = MPRowsFile(cache_fs, spec.name)

            if f.exists:
                f.remove()

            f.load_rows(s)

            with f.reader as r:
                if spec.name in headers:
                    self.assertEqual(headers[spec.name], r.headers)