    def import_data(self, **kwargs):
        """
            Imports data from an external source to create the test data
            :return: a two-item tuple containing the region that was imported and a list of the imported projects
        """

        # Calculate a sample lat/lon box of the config_entity
        config_entity = self.config_entity
        if self.test:
            bounds = chop_geom(config_entity.bounds, 0.90)
            logger.info(u"Creating subselection with extents: {0}. This will be used to crop any table that doesn't have a sample version".format(bounds))

        conn = psycopg2.connect(**pg_connection_parameters(settings.DATABASES['default']))
        conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
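        # Autocommit is required so that maintenance statements like VACUUM (run below)
        # can execute outside a transaction block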
        cursor = conn.cursor()

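        # Each DbEntity is loaded either from a SQL file on disk (file url) or from another
        # Postgres database (db url), then reprojected to 4326, clustered, and analyzed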
        for db_entity in self.db_entities:

            # This is the index on wkb_geometry.
            spatial_index_name = '{schema}_{key}_geom_idx'.format(schema=db_entity.schema, key=db_entity.key)

            table = db_entity.table

            if db_entity.has_file_url:
                # Remove any table of the same name from the import schema. This is unlikely,
                # since imported tables have timestamps
                drop_table('"%s"."%s"' % (settings.IMPORT_SCHEMA, db_entity.key))
                sql_file_path = file_url_to_path(db_entity.url)
                # Default the SRID to 4326 if none is set, then verify it
                db_entity.srid = db_entity.srid or '4326'
                logger.info("verifying SRID {0}".format(db_entity.srid))
                verify_srid(db_entity.srid)

                # Create the import schema if needed
                PGNamespace.objects.create_schema(settings.IMPORT_SCHEMA)

                # Import the table
                import_sql_command = '/usr/bin/psql {0} -f {1}'.format(self.target_database_connection, sql_file_path)
                stdin = "{0}\n{1}".format(self.arguments.get('password', None), self.target_database.get('PASSWORD', None))
                results = self.command_execution.run(import_sql_command, stdin=stdin)
                if results.returncode:
                    raise Exception(results.stderr.text)

                # We expect a table in the import schema with a name based on db_entity.key.
                # Move the table from the import schema to the db_entity schema
                move_to_schema = "alter table {0}.{1} set schema {2};".format(settings.IMPORT_SCHEMA, db_entity.key, db_entity.schema)
                logger.info("Moving import file table to schema: %s" % move_to_schema)
                cursor.execute(move_to_schema)
                # Drop the constraint that enforces the srid of the wkb_geometry if one exists
                drop_constraint = '''alter table {0}.{1} drop constraint if exists enforce_srid_wkb_geometry'''.format(db_entity.schema, db_entity.key)
                logger.info("Dropping constraint on wkb_geometry: %s" % drop_constraint)
                cursor.execute(drop_constraint)

                # Note we're not creating an index on wkb_geometry
                # here because imported files already have an index
                # created.

            elif db_entity.has_db_url:
                # The import database currently stores tables as
                # public.[config_entity.key]_[feature_class._meta.db_table (with schema removed)][_sample (for samples)]
                #
                # We always use the table name without the word sample for the target table name
                if settings.USE_SAMPLE_DATA_SETS or self.test:
                    source_table = "{0}_{1}_{2}".format(
                        config_entity.import_key or config_entity.key, db_entity.table, 'sample')
                else:
                    source_table = "{0}_{1}".format(config_entity.import_key or config_entity.key, db_entity.table)

                connection_dict = postgres_url_to_connection_dict(db_entity.url)
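                # Copy the source table from the import database into the DbEntity's schema
                # under the canonical (non-sample) table name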
                self._dump_tables_to_target(
                    '-t %s' % source_table,
                    source_schema='public',
                    target_schema=db_entity.schema,
                    source_table=source_table,
                    target_table=table,
                    connection_dict=connection_dict)

                # Create a spatial index
                spatial_index = '''create index {index_name} on {schema}.{key} using GIST (wkb_geometry);'''.format(
                    index_name=spatial_index_name,
                    schema=db_entity.schema, key=db_entity.key)
                cursor.execute(spatial_index)

            # Whether the table comes from our server or an upload, we want to transform the SRID to 4326
            transform_to_4326 = 'ALTER TABLE {schema}.{table} ALTER COLUMN wkb_geometry ' \
                                'TYPE Geometry(geometry, 4326) ' \
                                'USING ST_Transform(ST_Force_2d(wkb_geometry), 4326);'.format
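            # transform_to_4326 is the bound str.format method of the SQL template above;
            # calling it with schema/table keywords yields the final ALTER TABLE statement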
            logger.info("Transforming to 4326: %s" % transform_to_4326(schema=db_entity.schema, table=db_entity.table))

            cursor.execute(transform_to_4326(schema=db_entity.schema, table=db_entity.table))

            # Now cluster the data and vacuum so that future joins are faster:
            # * CLUSTER rewrites the data on disk so that rows that are spatially near each
            #   other are also near each other on disk
            # * VACUUM cleans up disk space, removing sparse holes on disk.
            # * ANALYZE regenerates statistics about wkb_geometry so that the query planner can make
            #   better decisions.

            logger.info('Clustering %s.%s to optimize spatial joins', db_entity.schema, table)
            cluster = 'CLUSTER {index_name} ON {target_schema}.{target_table};'.format(
                index_name=spatial_index_name,
                target_schema=db_entity.schema,
                target_table=table)
            cursor.execute(cluster)

            logger.info('Vacuuming and analyzing %s.%s.', db_entity.schema, table)
            analyze = 'VACUUM ANALYZE {target_schema}.{target_table};'.format(
                target_schema=db_entity.schema,
                target_table=table)

            cursor.execute(analyze)

            logger.info("Finished importing data for DbEntity table {0}.{1}".format(db_entity.schema, db_entity.key))
    def importer(self, config_entity, db_entity, **kwargs):
        """
            Replaces the normal ImportProcessor importer with one that imports a SQL file from disk
        """
        if InformationSchema.objects.table_exists(db_entity.schema, db_entity.table):
            # The table already exists. Skip the import and log a warning
            logger.warn("The target table for the feature table import already exists. Skipping table import.")
        else:
            # We don't store the upload_id alone, so pull it off the url
            upload_id = db_entity.url.replace('file:///tmp/', '').replace('.sql.zip', '')
            # Unpack the zipfile and return the path the sql file was placed at
            if db_entity.url.startswith('file://'):
                file_path = db_entity.url[len('file://'):]

            logger.warn(file_path)
            path = unpack_zipfile(file_path, upload_id)
            # The file name is always the name of the table defined therein
            table_name = path.split('/')[-1].split('.')[0].lower()
            # Update the db_entity.url from the zip file url to the unpacked file path.
            # This lets ImportData find it.
            db_entity.url = 'file://%s' % path
            logger.info("Url of DbEntity is %s" % db_entity.url)
            db_entity.save()

            # Perform some sed updates to get the sql file ready for import
            regex_substitutions = []
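            # Each entry is a (pattern, replacement) pair, optionally followed by a
            # (first_line, last_line) range limiting the substitution to those lines
            # (see the 'only line N' notes below)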
            sql_file_path = file_url_to_path(db_entity.url)

            # Add IF EXISTS to the DROP TABLE to prevent an error if the table doesn't exist yet
            regex_substitutions.append((r'DROP TABLE (?!IF EXISTS)', r'DROP TABLE IF EXISTS'))

            # TODO: temporary workaround for an AC bug. Using a capitalized column name seems to be problematic.
            # The suggested solution is to double-quote it, but quotes cause other problems, so we simply lowercase it
            regex_substitutions.append((r' OGC_FID ', ' ogc_fid ', (4, 4)))  # only line 4
            regex_substitutions.append((r'PRIMARY KEY \(ogc_fid\)', 'PRIMARY KEY (ogc_fid)', (4, 4)))  # only line 4
            # TODO end temp fix

            # Update the index name to include the schema. This format matches that created for preconfigured feature
            # tables (see import_data.py)
            spatial_index_name = '{schema}_{key}_geom_idx'.format(schema=db_entity.schema, key=db_entity.key)
            regex_substitutions.append((r'CREATE INDEX ".*" ON', 'CREATE INDEX "%s" ON' % spatial_index_name, (6, 6)))  # only line 6

            # Remove the reference to the geometry_columns, since we use a materialized view
            regex_substitutions.append((r'^DELETE FROM geometry_columns', '--DELETE FROM geometry_columns', (2, 2)))

            # Update the sql to have a unique table name which matches the DbEntity key
            # Also change public to our import schema to keep it from causing trouble in the public schema
            # Otherwise we run into all kinds of trouble trying to get the SQL into the system
            regex_substitutions.append((r'"public"."%s"' % table_name, '"import"."%s"' % db_entity.key))

            regex_substitutions.append((r"'%s'" % table_name, "'%s'" % db_entity.key, (2, 5)))

            regex_substitutions.append((r'"%s_pk"' % table_name, '"%s_pk"' % db_entity.key, (4, 4)))

            # Update public to the import schema
            regex_substitutions.append((r"AddGeometryColumn\('public'", "AddGeometryColumn('%s'" % settings.IMPORT_SCHEMA, (5, 5)))

            regex_substitutions.append((r'"%s_wkb_geometry_geom_idx"' % table_name, '"%s_wkb_geometry_geom_idx"' % db_entity.key, (6, 6)))

            for command in regex_substitutions:
                logger.info("Applying the following substitution %s" % ', '.join(command[0:2]))
            apply_regexes_to_file(sql_file_path, regex_substitutions)

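            # Run the standard import; it locates the rewritten SQL file through db_entity.url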
            ImportData(config_entity=config_entity, db_entity_key=db_entity.key).run()

        # Add our normal primary key in the id column if needed
        add_primary_key_if_needed(db_entity)

        feature_class_creator = FeatureClassCreator(config_entity, db_entity)
        # Inspect the imported table to create the feature_class_configuration
        feature_class_configuration = feature_class_creator.feature_class_configuration_from_introspection()

        # Merge the created feature_class_configuration with the one already defined for the db_entity
        feature_class_creator.update_db_entity(feature_class_configuration)
        logger.info("Finished import for DbEntity: %s, feature_class_configuration: %s" % (db_entity, db_entity.feature_class_configuration))

        # Create association classes and tables and populate them with data
        create_and_populate_relations(config_entity, feature_class_creator.db_entity)