Example #1
    def test_creates_virtual_table_for_simple_fixed_mpr(self):
        # build rows reader
        cache_fs = fsopendir(self.setup_temp_dir())
        sources = self.load_sources()
        spec = sources['simple_fixed']
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        mprows = MPRowsFile(cache_fs, spec.name).load_rows(s)

        # First, make sure the source file has not changed.
        expected_names = ['id', 'uuid', 'int', 'float']
        expected_types = ['int', binary_type.__name__, 'int', 'float']
        self.assertEqual([x['name'] for x in mprows.reader.columns],
                         expected_names)
        self.assertEqual([x['type'] for x in mprows.reader.columns],
                         expected_types)

        connection = apsw.Connection(':memory:')
        table = 'table1'
        add_partition(connection, mprows, table)

        # check all columns and some rows.
        cursor = connection.cursor()
        query = 'SELECT count(*) FROM {};'.format(table)
        result = cursor.execute(query).fetchall()
        self.assertEqual(result, [(10000, )])

        with mprows.reader as r:
            expected_first_row = next(iter(r)).row

        # query by columns.
        query = 'SELECT id, uuid, int, float FROM {} LIMIT 1;'.format(table)
        result = cursor.execute(query).fetchall()
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0], expected_first_row)
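
Condensed, Example #1 is a three-step pattern: materialize the source into an MPR file, attach that file to an in-memory SQLite database as a virtual table, then query it with plain SQL. The sketch below restates just that skeleton, reusing the test's own calls; spec and cache_fs are assumed to come from the same fixtures, so treat it as illustrative rather than standalone.

    # Minimal sketch of the virtual-table pattern (fixtures and imports as in the test).
    mprows = MPRowsFile(cache_fs, spec.name).load_rows(
        get_source(spec, cache_fs, callback=lambda x, y: (x, y)))

    connection = apsw.Connection(':memory:')
    add_partition(connection, mprows, 'table1')  # register the MPR file as a virtual table

    cursor = connection.cursor()
    print(cursor.execute('SELECT count(*) FROM table1;').fetchall())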
Example #2
    def test_row_intuit(self):
        """Check that the sources can be loaded and analyzed without exceptions and that the
        guesses for headers and start are as expected"""

        from ambry_sources.intuit import RowIntuiter

        cache_fs = fsopendir('temp://')
        # cache_fs = fsopendir('/tmp/ritest/')

        sources = self.load_sources('sources-non-std-headers.csv')

        for source_name, spec in sources.items():

            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            rows = list(s)
            n = len(rows)

            # the files are short, so the head and tail overlap
            ri = RowIntuiter(debug=False).run(rows[:int(n * .75)],
                                              rows[int(n * .25):], n)

            print(source_name, ri.start_line, ri.header_lines)

            self.assertEqual(
                spec.expect_headers, ','.join(str(e) for e in ri.header_lines),
                'Headers of the {} source do not match the row intuiter'.format(
                    spec.name))

            self.assertEqual(
                spec.expect_start, ri.start_line,
                'Start line of the {} source does not match the row intuiter start line.'
                .format(spec.name))
Example #3
    def test_created_source_has_zip_filesystem(self):
        # FIXME: Optimize to use local file instead of downloading it all the time.
        cache_fs = fsopendir(self.setup_temp_dir())
        sources = self.load_sources(file_name='geo_sources.csv')
        spec = sources['community_plan']
        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        self.assertIsInstance(source._fstor._fs, ZipFS)
Example #5
    def __iter__(self):
        
        import unicodecsv as csv
        from contextlib import closing
        from ambry_sources import get_source
        from ambry.bundle.process import call_interval
        
        @call_interval(5)
        def progress(read_len, total_len):
            self._bundle.log('Downloading {}: {}'.format(self._source.url, total_len))

        spec = self._source.spec
        spec.urltype = 'zip'

        s = get_source(spec, self._bundle.library.download_cache,
                       account_accessor=self._bundle.library.account_accessor,
                       callback=progress)

        encoding = self._source.spec.encoding or 'utf8'

        header = None
        header_len = None

        for i, row in enumerate(s):
            
            if i == 0:
                header = row
                header_len = len(row)
            
            if len(row) > header_len:
                head = list(row)[:header_len - 1]
                tail = [str(e) for e in row[header_len - 1:]]
                row = head + [','.join(tail)]
            
            yield row
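
The loop above repairs ragged CSV rows: any row longer than the header has its surplus trailing fields folded back into the last column. That slicing is easy to misread, so here is a dependency-free sketch of the same rule with a tiny self-check (normalize_row is a hypothetical name, not part of ambry_sources):

    def normalize_row(row, header_len):
        """Collapse extra trailing fields into the final column."""
        if len(row) <= header_len:
            return list(row)
        head = list(row)[:header_len - 1]
        tail = [str(e) for e in row[header_len - 1:]]
        return head + [','.join(tail)]

    assert normalize_row(['a', 'b', 'c'], 3) == ['a', 'b', 'c']
    assert normalize_row(['a', 'b', 'c', 'd'], 3) == ['a', 'b', 'c,d']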
    def test_all(self):
        """ Test all sources from geo_sources.csv """
        cache_fs = fsopendir(self.setup_temp_dir())

        sources = self.load_sources(file_name='geo_sources.csv')
        for name, spec in sources.items():
            if name == 'highways':
                # it is already tested. Skip.
                continue

            source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            # now check that it loads into MPRows
            mpr = MPRowsFile(cache_fs, spec.name).load_rows(source)
            first_row = next(iter(mpr.reader))

            # Are columns recognized properly?

            NAME_INDEX = 1  # which element of the column description contains name.
            # Collect all names from column descriptors. Skip first elem of the schema because
            # it's descriptor of column descriptor elements.
            columns = [x[NAME_INDEX] for x in mpr.meta['schema'][1:]]
            self.assertIn('id', columns)
            self.assertIn('geometry', columns)

            # Is first row valid?
            self.assertEqual(len(columns), len(first_row))
Example #7
    def test_type_intuit(self):
        from ambry_sources.intuit import TypeIntuiter

        cache_fs = fsopendir(self.setup_temp_dir())
        spec = self.sources['simple_fixed']
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        f = MPRowsFile(cache_fs, spec.name)

        with f.writer as w:
            w.load_rows(s)

        with f.reader as r:
            ti = TypeIntuiter().process_header(r.headers).run(r.rows, r.n_rows)

        with f.writer as w:
            w.set_types(ti)

        columns = []
        with f.reader as r:
            for col in r.columns:
                columns.append((col.pos, col.name, col.type))
        expected_columns = [(1, u'id', u'int'), (2, u'uuid', u'str'),
                            (3, u'int', u'int'), (4, u'float', u'float')]
        self.assertEqual(columns, expected_columns)
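
The choreography here is the point: rows are written once, types are intuited on a read pass, and the result is written back before the final check. Other examples (Example #36 for headers, Example #1 for types) get the same outcome from a bare MPRowsFile.load_rows call; the explicit version is the one to reach for when the intuited types need inspection or overriding first. A compressed restatement, assuming the same f and s as the test:

    with f.writer as w:
        w.load_rows(s)                    # pass 1: write the raw rows

    with f.reader as r:                   # pass 2: intuit types from what was written
        ti = TypeIntuiter().process_header(r.headers).run(r.rows, r.n_rows)

    with f.writer as w:
        w.set_types(ti)                   # pass 3: persist the intuited types

    with f.reader as r:
        print([(c.pos, c.name, c.type) for c in r.columns])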
Example #12
    def test_ctor(self):

        d = '/tmp/socrata'

        from os import makedirs
        from os.path import exists
        from shutil import rmtree

        if exists(d):
            print('Removing', d)
            rmtree(d)

        makedirs(d)

        cache_fs = fsopendir(d)  # fsopendir(self.setup_temp_dir())

        sources = self.load_sources(file_name='sources.csv')
        spec = sources['facilities']
        source = get_source(spec, cache_fs)

        def cb(*args):
            print(args)

        mpr = MPRowsFile(cache_fs, spec.name).load_rows(source,
                                                        callback=cb,
                                                        limit=10)
Example #13
    def test_creates_foreign_data_table_for_simple_fixed_mpr(
            self, fake_shares):
        fake_shares.return_value = True
        # build rows reader
        cache_fs = fsopendir(self.setup_temp_dir())
        sources = self.load_sources()
        spec = sources['simple_fixed']
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        mprows = MPRowsFile(cache_fs, spec.name).load_rows(s)

        # First, make sure the file was not changed.
        expected_names = ['id', 'uuid', 'int', 'float']
        expected_types = ['int', binary_type.__name__, 'int', 'float']
        self.assertEqual(sorted([x['name'] for x in mprows.reader.columns]),
                         sorted(expected_names))
        self.assertEqual(sorted([x['type'] for x in mprows.reader.columns]),
                         sorted(expected_types))

        try:
            # create foreign data table
            PostgreSQLTestBase._create_postgres_test_db()
            conn = psycopg2.connect(**PostgreSQLTestBase.pg_test_db_data)

            try:
                with conn.cursor() as cursor:
                    # we have to close the open transaction.
                    cursor.execute('COMMIT;')
                    add_partition(cursor, mprows, 'table1')

                # try to query the just-added partition's foreign data table.
                with conn.cursor() as cursor:
                    table = 'table1'

                    # count all rows
                    query = 'SELECT count(*) FROM {}.{};'.format(
                        POSTGRES_PARTITION_SCHEMA_NAME, table)
                    cursor.execute(query)
                    result = cursor.fetchall()
                    self.assertEqual(result, [(10000, )])

                    # check first row
                    cursor.execute(
                        'SELECT id, uuid, int, float FROM {}.{} LIMIT 1;'.
                        format(POSTGRES_PARTITION_SCHEMA_NAME, table))
                    result = cursor.fetchall()
                    self.assertEqual(len(result), 1)
                    expected_first_row = (1,
                                          'eb385c36-9298-4427-8925-fe09294dbd',
                                          30, Decimal('99.734691532'))
                    self.assertEqual(result[0], expected_first_row)

            finally:
                conn.close()
        finally:
            PostgreSQLTestBase._drop_postgres_test_db()
Example #14
    def test_intuit_headers(self):
        sources = self.load_sources(file_name='sources.csv')

        for source_name in ['headers4', 'headers3', 'headers2', 'headers1']:
            cache_fs = fsopendir(self.setup_temp_dir())

            spec = sources[source_name]
            f = MPRowsFile(cache_fs, spec.name) \
                .load_rows(get_source(spec, cache_fs, callback=lambda x, y: (x, y)))

            self.assertEqual(spec.expect_start, f.info['data_start_row'])
            self.assertEqual([int(e) for e in spec.expect_headers.split(',')],
                             f.info['header_rows'])
Example #18
    def test_intuit_footer(self):
        sources = self.load_sources(file_name='sources.csv')

        for source_name in ['headers4', 'headers3', 'headers2', 'headers1']:
            cache_fs = fsopendir(self.setup_temp_dir())

            spec = sources[source_name]
            f = MPRowsFile(cache_fs, spec.name) \
                .load_rows(get_source(spec, cache_fs, callback=lambda x, y: (x, y)))

            with f.reader as r:
                last = list(r.rows)[-1]  # islice isn't working on the reader.
                print(source_name, last)
                self.assertEqual(11999, int(last[0]))
                self.assertEqual('2q080z003Cg2', last[1])
Example #21
    def test_just_download(self):
        """Just check that all of the sources can be downloaded without exceptions"""

        cache_fs = fsopendir('temp://')

        for source_name, spec in self.sources.items():
            try:
                s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

                for i, row in enumerate(s):
                    if i > 10:
                        break
            except Exception as exc:
                raise AssertionError(
                    'Failed to download {} source because of {} error.'.format(
                        s.url, exc))
    def test_highways(self):
        # FIXME: Optimize to use local file instead of downloading it all the time.
        cache_fs = fsopendir(self.setup_temp_dir())

        sources = self.load_sources(file_name='geo_sources.csv')
        spec = sources['highways']
        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        # first check whether it was converted properly.
        row_gen = source._get_row_gen()
        first_row = next(row_gen)

        # generates a valid first row
        self.assertEqual(len(first_row), 68)
        self.assertEqual(first_row[0], 0)
        # last element is WKT.
        self.assertIn('LINESTRING', first_row[-1])

        # header is valid
        self.assertEqual(len(source._headers), 68)
        self.assertEqual(source._headers[0], 'id')
        self.assertEqual(source._headers[-1], 'geometry')

        # now check that it loads into MPRows
        mpr = MPRowsFile(cache_fs, spec.name).load_rows(source)

        # Are columns recognized properly?
        NAME_INDEX = 1  # which element of the column description contains name.
        # Collect all names from column descriptors. Skip first elem of the schema because
        # it's descriptor of column descriptor elements.
        columns = [x[NAME_INDEX] for x in mpr.meta['schema'][1:]]
        self.assertIn('id', columns)
        self.assertIn('geometry', columns)
        self.assertIn('length', columns)  # column from shape file.

        # Is first row valid?
        first_row = next(iter(mpr.reader))
        self.assertEqual(len(first_row), 68)
        self.assertEqual(first_row['id'], 0)
        self.assertIn('LINESTRING', first_row['geometry'])

        # NOTE: the remaining checks are intentionally disabled.
        return

        # spec columns are properly populated
        self.assertEqual(len(spec.columns), 68)
        self.assertEqual(spec.columns[0]['name'], 'id')
        self.assertEqual(spec.columns[-1]['name'], 'geometry')
Example #25
    def test_row_load_intuit(self):
        """Check that the sources can be loaded and analyzed without exceptions and that the
        guesses for headers and start are as expected"""

        from itertools import islice

        cache_fs = fsopendir('temp://')
        cache_fs.makedir('/mpr')
        # cache_fs = fsopendir('/tmp/ritest/')

        sources = self.load_sources('sources-non-std-headers.csv')

        for source_name, spec in sources.items():

            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            f = MPRowsFile(cache_fs, '/mpr/' + source_name)

            if f.exists:
                f.remove()

            f.load_rows(s, intuit_type=False, run_stats=False, limit=500)

            self.assertEqual(f.info['data_start_row'], spec.expect_start)

            with f.reader as r:
                # First row whose metadata marks it as a data row ('D')
                m1, row1 = next(
                    six.moves.filter(lambda e: e[0][2] == 'D', r.meta_raw))

            with f.reader as r:
                # First row
                row2 = next(r.rows)

            with f.reader as r:
                # First row proxy
                row3 = next(iter(r)).row

            self.assertEqual(row1, row2)
            self.assertEqual(row1, row3)

            with f.reader as r:
                raw_rows = list(islice(r.raw, None, 40))

            self.assertEqual(row2, raw_rows[f.info['data_start_row']])
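
The three reads above fetch the same first data row through three different reader interfaces, which is exactly what the pair of assertEqual calls verifies. In sketch form, for an already-loaded MPRowsFile f:

    with f.reader as r:
        # metadata stream of (meta, row) pairs; meta[2] == 'D' flags data rows
        _, via_meta = next(six.moves.filter(lambda e: e[0][2] == 'D', r.meta_raw))
    with f.reader as r:
        via_rows = next(r.rows)           # plain row iterator
    with f.reader as r:
        via_proxy = next(iter(r)).row     # row-proxy iterator
    assert via_meta == via_rows == via_proxy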
Example #27
    def test_full_load(self):
        """Just check that all of the sources can be loaded without exceptions"""

        cache_fs = fsopendir('temp://')

        for source_name, spec in self.sources.items():

            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            f = MPRowsFile(cache_fs, spec.name)

            if f.exists:
                f.remove()

            f.load_rows(s)

            with f.reader as r:
                self.assertTrue(len(r.headers) > 0)
Example #28
    def test_fixed(self):
        from ambry_sources import head, tail
        cache_fs = fsopendir(self.setup_temp_dir())
        spec = self.sources['simple_fixed']
        assert spec.has_rowspec is False
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        # prepare HDFPartition.
        f = HDFPartition(cache_fs, spec.name)

        ri = RowIntuiter().run(head(s, 100), tail(s, 100))
        row_spec = self._row_intuiter_to_dict(ri)
        ti = TypeIntuiter().process_header(ri.headers).run(s)
        with f.writer as w:
            w.set_row_spec(row_spec, ri.headers)
            w.set_types(ti)
        f.load_rows(s)
        self.assertEqual(f.headers, ['id', 'uuid', 'int', 'float'])
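
Note the contrast with MPRowsFile: every HDFPartition example here primes the writer with an explicit row spec and type spec before load_rows, rather than relying on load_rows to intuit them. A minimal restatement of that preparation, assuming the test's imports (head, tail) and its _row_intuiter_to_dict helper:

    f = HDFPartition(cache_fs, spec.name)

    ri = RowIntuiter().run(head(s, 100), tail(s, 100))
    ti = TypeIntuiter().process_header(ri.headers).run(s)

    with f.writer as w:
        w.set_row_spec(_row_intuiter_to_dict(ri), ri.headers)  # test-class helper
        w.set_types(ti)

    f.load_rows(s)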
    def test_bad_row_intuition(self):
        from ambry_sources.mpf import MPRowsFile
        from ambry_sources.sources.spec import SourceSpec

        cache_fs = fsopendir('temp://')

        spec = SourceSpec('http://public.source.civicknowledge.com/example.com/sources/simple-example.csv',
                          name='simple')

        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        f = MPRowsFile(cache_fs, spec.name)

        if f.exists:
            f.remove()

        f.load_rows(s)

        self.assertEqual(10001, f.reader.info['data_end_row'])
Example #30
    def test_stats(self):
        """Check that the sources can be loaded and analyzed without exceptions and that the
        guesses for headers and start are as expected"""

        #cache_fs = fsopendir('temp://')
        from shutil import rmtree
        from os import makedirs

        tp = '/tmp/mpr-test'
        rmtree(tp, ignore_errors=True)
        makedirs(tp)
        cache_fs = fsopendir(tp)

        s = get_source(self.sources['simple_stats'],
                       cache_fs,
                       callback=lambda x, y: (x, y))

        f = MPRowsFile(cache_fs, s.spec.name).load_rows(s, run_stats=True)

        stat_names = ('count', 'min', 'mean', 'max', 'nuniques')

        vals = {
            u('str_a'): (30, None, None, None, 10),
            u('str_b'): (30, None, None, None, 10),
            u('float_a'): (30, 1.0, 5.5, 10.0, 10),
            u('float_b'): (30, 1.1, 5.5, 9.9, 10),
            u('float_c'): (30, None, None, None, 10),
            u('int_b'): (30, None, None, None, 10),
            u('int_a'): (30, 1.0, 5.5, 10.0, 10)
        }

        with f.reader as r:

            for col in r.columns:
                stats = (col.stat_count, col.min,
                         round(col.mean, 1) if col.mean else None, col.max,
                         col.nuniques)

                for a, b, stat_name in zip(vals[col.name], stats, stat_names):
                    self.assertEqual(
                        a, b, "{} failed for stat {}: {} != {}".format(
                            col.name, stat_name, a, b))
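
Reading the statistics back is plain attribute access on the reader's column descriptors; count, min, mean, max, and nuniques are the stats these tests exercise (via stat_count, min, mean, max, nuniques). A minimal dump, assuming the same f loaded with run_stats=True:

    with f.reader as r:
        for col in r.columns:
            print(col.name, col.stat_count, col.min, col.mean, col.max, col.nuniques)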
Example #31
    def test_stats(self):
        """Check that the sources can be loaded and analyzed without exceptions and that the
        guesses for headers and start are as expected"""
        from ambry_sources import head, tail

        cache_fs = fsopendir('temp://')

        source = get_source(self.sources['simple_stats'], cache_fs, callback=lambda x, y: (x, y))

        f = HDFPartition(cache_fs, source.spec.name)

        with f.writer as w:
            ri = RowIntuiter().run(head(source, 100), tail(source, 100))
            row_spec = self._row_intuiter_to_dict(ri)
            ti = TypeIntuiter().process_header(ri.headers).run(source)
            w.set_row_spec(row_spec, ri.headers)
            w.set_types(ti)

        f.load_rows(source, run_stats=True)

        expected = {
            u('str_a'):   (30, None, None, None, 10),
            u('str_b'):   (30, None, None, None, 10),
            u('float_a'): (30, 1.0, 5.5, 10.0, 10),
            u('float_b'): (30, 1.1, 5.5, 9.9, 10),
            u('float_c'): (30, 1.1, 5.5, 9.9, 10),
            u('int_b'):   (30, 1.0, 5.0, 9.0, 10),
            u('int_a'):   (30, 1.0, 5.5, 10.0, 10)}

        with f.reader as r:

            for col in r.columns:
                stats = (col.stat_count, col.min, round(col.mean, 1) if col.mean else None,
                         col.max,
                         col.nuniques)
                for a, b in zip(expected[col.name], stats):
                    self.assertEqual(
                        a, b,
                        'Saved stat ({}) does not match the expected value ({}) for {}'.format(b, a, col.name))
Example #34
    def __iter__(self):

        from ambry_sources import get_source
        from itertools import izip, chain
        from ambry.etl import Slice
        from ambry.orm import Column
        from ambry.bundle.process import CallInterval

        table = self.source.dest_table

        if isinstance(table, str):
            table = self.table(table)

        start = int(table.data["start"])
        length = int(table.data["length"])

        slca_str = ",".join(str(e[4]) for e in self.header_cols)
        slcb_str = "{}:{}".format(start - 1, start + length - 1)

        # Slice for the stusab, logrecno, etc.
        slca, slc_code = Slice.make_slicer(slca_str)
        # Slice for the data columns
        slcb, slc_code = Slice.make_slicer(slcb_str)

        columns = [c.name for c in table.columns]

        # Columns before the first data column, found by removing the
        # data columns, which are presumed to all be at the end.
        preamble_cols = columns[: -2 * len(slcb(range(1, 300)))]
        data_columns = columns[len(preamble_cols) :]

        header_cols = [e[0] for e in self.header_cols]

        # A few sanity checks
        assert preamble_cols[-1] == "jam_flags"
        assert data_columns[0][-3:] == "001"
        assert data_columns[1][-3:] == "m90"

        all_cols = [Column.mangle_name(c) for c in header_cols + data_columns]

        yield all_cols

        def progress(read_len, total_len, source_name):
            self.bundle.log("Downloading {}; {} bytes".format(source_name, total_len))

        cache = self.library.download_cache

        row_n = 0
        for spec1, spec2 in self.generate_source_specs():

            s1 = get_source(spec1, cache, callback=CallInterval(progress, 10, source_name=spec1.url))
            s2 = get_source(spec2, cache, callback=CallInterval(progress, 10, source_name=spec1.url))

            for i, (row1, row2) in enumerate(izip(s1, s2)):
                # Interleave the slices of the data rows, prepending
                # the stusab, logrecno, etc.

                row_n += 1
                if self.limited_run and row_n > 10000:
                    return

                yield slca(row1) + tuple(chain(*zip(slcb(row1), slcb(row2))))
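
The final yield is the crux: both rows are sliced down to their data columns and interleaved, so each estimate is immediately followed by its 90% margin, with the stusab/logrecno header columns prepended. A dependency-free sketch with hypothetical two-column slicers standing in for slca and slcb:

    from itertools import chain

    slca = lambda row: tuple(row[:2])   # header columns (hypothetical slice)
    slcb = lambda row: row[2:]          # data columns (hypothetical slice)

    row1 = ['az', 1, 10, 20]            # estimates
    row2 = ['az', 1, 3, 5]              # 90% margins
    merged = slca(row1) + tuple(chain(*zip(slcb(row1), slcb(row2))))
    assert merged == ('az', 1, 10, 3, 20, 5)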
Example #36
    def test_fixed(self):
        cache_fs = fsopendir(self.setup_temp_dir())
        spec = self.sources['simple_fixed']
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        f = MPRowsFile(cache_fs, spec.name).load_rows(s)
        self.assertEqual(f.headers, ['id', 'uuid', 'int', 'float'])
Example #37
    def test_load_and_headers(self):
        """ Just checks that all of the sources can be loaded without exceptions. """
        from ambry_sources import head, tail

        cache_fs = fsopendir('temp://')

        source_headers = {
            'mz_with_zip_xl': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'mz_no_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'namesu8': [u('origin_english'), u('name_english'), u('origin_native'), u('name_native')],
            'sf_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'simple': [u('id'), u('uuid'), u('int'), u('float')],
            'csv_no_csv': [u('id'), u('uuid'), u('int'), u('float')],
            'mz_with_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'rpeople': [u('name'), u('size')],
            'rent07': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'simple_fixed': [u('id'), u('uuid'), u('int'), u('float')],
            'altname': [u('id'), u('foo'), u('bar'), u('baz')],
            'rentcsv': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'renttab': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'multiexcel': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'rent97': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')]
        }

        for source_name, spec in self.sources.items():
            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            f = HDFPartition(cache_fs, spec.name)
            if f.exists:
                f.remove()

            # FIXME: This is really complicated setup for HDFPartition file. Try to simplify.
            with f.writer as w:
                if spec.has_rowspec:
                    row_spec = self._spec_to_dict(spec)
                    headers = self._get_headers(s, spec)
                    ti = TypeIntuiter().process_header(headers).run(s)
                    w.set_row_spec(row_spec, headers)
                    w.set_types(ti)
                else:
                    ri = RowIntuiter().run(head(s, 20), tail(s, 20), w.n_rows)
                    row_spec = self._row_intuiter_to_dict(ri)
                    ti = TypeIntuiter().process_header(ri.headers).run(s)
                    w.set_row_spec(row_spec, ri.headers)
                    w.set_types(ti)
            f.load_rows(s)

            with f.reader as r:
                if spec.name in source_headers:
                    self.assertEqual(source_headers[spec.name], r.headers)
Example #38
    def test_bad_row_intuition(self):
        from ambry_sources.mpf import MPRowsFile
        from ambry_sources.sources.spec import SourceSpec, ColumnSpec

        cache_fs = fsopendir('temp://')

        spec = SourceSpec(
            'http://www2.census.gov/acs2009_1yr/summaryfile/Entire_States/Arizona.zip',
            file=r'g2009.*\.txt',
            filetype='fixed',
            name='geofile',
            encoding='latin1',
        )

        spec.columns = [
            ColumnSpec(position=1, width=6, name='fileid', start=1),
            ColumnSpec(position=2, width=2, name='stusab', start=7),
            ColumnSpec(position=3, width=3, name='sumlevel', start=9),
            ColumnSpec(position=4, width=2, name='component', start=12),
            ColumnSpec(position=5, width=7, name='logrecno', start=14),
            ColumnSpec(position=6, width=1, name='us', start=21),
            ColumnSpec(position=7, width=1, name='region', start=22),
            ColumnSpec(position=8, width=1, name='division', start=23),
            ColumnSpec(position=9, width=2, name='statece', start=24),
            ColumnSpec(position=10, width=2, name='state', start=26),
            ColumnSpec(position=11, width=3, name='county', start=28),
            ColumnSpec(position=12, width=5, name='cousub', start=31),
            ColumnSpec(position=13, width=5, name='place', start=36),
            ColumnSpec(position=14, width=6, name='tract', start=41),
            ColumnSpec(position=15, width=1, name='blkgrp', start=47),
            ColumnSpec(position=16, width=5, name='concit', start=48),
            ColumnSpec(position=17, width=4, name='aianhh', start=53),
            ColumnSpec(position=18, width=5, name='aianhhfp', start=57),
            ColumnSpec(position=19, width=1, name='aihhtli', start=62),
            ColumnSpec(position=20, width=3, name='aitsce', start=63),
            ColumnSpec(position=21, width=5, name='aits', start=66),
            ColumnSpec(position=22, width=5, name='anrc', start=71),
            ColumnSpec(position=23, width=5, name='cbsa', start=76),
            ColumnSpec(position=24, width=3, name='csa', start=81),
            ColumnSpec(position=25, width=5, name='metdiv', start=84),
            ColumnSpec(position=26, width=1, name='macc', start=89),
            ColumnSpec(position=27, width=1, name='memi', start=90),
            ColumnSpec(position=28, width=5, name='necta', start=91),
            ColumnSpec(position=29, width=3, name='cnecta', start=96),
            ColumnSpec(position=30, width=5, name='nectadiv', start=99),
            ColumnSpec(position=31, width=5, name='ua', start=104),
            ColumnSpec(position=33, width=2, name='cdcurr', start=114),
            ColumnSpec(position=34, width=3, name='sldu', start=116),
            ColumnSpec(position=35, width=3, name='sldl', start=119),
            ColumnSpec(position=39, width=5, name='submcd', start=136),
            ColumnSpec(position=40, width=5, name='sdelm', start=141),
            ColumnSpec(position=41, width=5, name='sdsec', start=146),
            ColumnSpec(position=42, width=5, name='sduni', start=151),
            ColumnSpec(position=43, width=1, name='ur', start=156),
            ColumnSpec(position=44, width=1, name='pci', start=157),
            ColumnSpec(position=47, width=5, name='puma5', start=169),
            ColumnSpec(position=49, width=40, name='geoid', start=179),
            ColumnSpec(position=50, width=200, name='name', start=219)
        ]

        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        f = MPRowsFile(cache_fs, spec.name)

        if f.exists:
            f.remove()

        f.load_rows(s)

        self.assertEqual(119, f.reader.info['data_end_row'])
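
The ColumnSpec positions above read as 1-based and contiguous: fileid occupies columns 1-6, stusab starts at column 7, sumlevel at 9, and so on. Under that assumption, a fixed-width field maps onto an ordinary string slice; the helper below is a hypothetical illustration of the arithmetic, not the library's implementation:

    def cut(line, start, width):
        # assumes 1-based start, as the contiguous specs above imply
        return line[start - 1:start - 1 + width]

    line = 'ACSSF AZ'
    assert cut(line, 1, 6) == 'ACSSF '  # fileid
    assert cut(line, 7, 2) == 'AZ'      # stusab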
Example #39
    def test_load_check_headers(self):
        """Just check that all of the sources can be loaded without exceptions"""

        cache_fs = fsopendir('temp://')

        headers = {
            'mz_with_zip_xl': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'mz_no_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'namesu8': [u('origin_english'), u('name_english'), u('origin_native'), u('name_native')],
            'sf_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'simple': [u('id'), u('uuid'), u('int'), u('float')],
            'csv_no_csv': [u('id'), u('uuid'), u('int'), u('float')],
            'mz_with_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'rpeople': [u('name'), u('size')],
            'rent07': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'simple_fixed': [u('id'), u('uuid'), u('int'), u('float')],
            'altname': [u('id'), u('foo'), u('bar'), u('baz')],
            'rentcsv': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'renttab': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'multiexcel': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'rent97': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')]
        }

        for source_name, spec in self.sources.items():
            print(source_name)
            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            f = MPRowsFile(cache_fs, spec.name)

            if f.exists:
                f.remove()

            f.load_rows(s)

            with f.reader as r:
                if spec.name in headers:
                    self.assertEqual(headers[spec.name], r.headers)