Example No. 1
    def __init__(self, meta, source_path=None):
        """
        :param meta: record from MetaTable
        :param source_path: path of source file on local filesystem
                            if None, look for data at a remote URL instead
        """
        # Just the info about column names we usually need
        self.dataset = meta.meta_tuple()
        self.name = 's_' + self.dataset.name

        # Get the Columns to construct our table
        try:
            # Can we just grab columns from an existing table?
            self.cols = self._from_ingested(meta.column_info())
            # If we can, record these column names and types on the meta record.
            # Users normally fill in column names themselves, but recording them
            # here helps in test cases and backend submissions.
            if meta.column_names is None:
                cols = {col.name: str(col.type) for col in self.cols
                        if col.name not in {'geom', 'point_date', 'hash'}}
                meta.column_names = json.dumps(cols)
        except NoSuchTableError:
            self.cols = None

        # Retrieve the source file
        try:
            if source_path:  # Local ingest
                self.file_helper = ETLFile(source_path=source_path)
            else:  # Remote ingest
                self.file_helper = ETLFile(source_url=meta.source_url)
        except Exception as e:
            raise PlenarioETLError(e)
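For orientation, a minimal usage sketch of this constructor; the class name StagingTable and the MetaTable lookup are assumptions, since the snippet shows only the method body.

    # Hypothetical usage; 'StagingTable' and the MetaTable query are assumed
    # from the docstring, not shown in the snippet itself.
    meta = session.query(MetaTable).filter_by(dataset_name='crimes').first()
    local = StagingTable(meta, source_path='/tmp/crimes.csv')  # local ingest
    remote = StagingTable(meta)  # falls back to meta.source_url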
Example No. 2
    def __init__(self, meta, source_path=None):
        """
        :param meta: record from MetaTable
        :param source_path: path of source file on local filesystem
                            if None, look for data at a remote URL instead
        """
        logger.info('Begin.')
        logger.info('meta: %s', meta)
        logger.info('source_path: %s', source_path)

        # Just the info about column names we usually need
        self.dataset = meta.meta_tuple()
        self.name = 's_' + self.dataset.name

        # Get the Columns to construct our table
        try:
            # Can we just grab columns from an existing table?
            self.cols = self._from_ingested(meta.column_info())
        except NoSuchTableError:
            self.cols = None

        # Retrieve the source file
        try:
            if source_path:  # Local ingest
                self.file_helper = ETLFile(source_path=source_path)
            else:  # Remote ingest
                self.file_helper = ETLFile(source_url=meta.source_url)
        except Exception as e:
            raise PlenarioETLError(e)
Example No. 3
    def _make_table(self, f):
        """
        Create a table and fill it with CSV data.
        :param f: Open file handle pointing to start of CSV
        :return: populated table
        """
        # Persist an empty table eagerly
        # so that we can access it when we drop down to a raw connection.

        # Be paranoid and remove the table if one by this name already exists.
        table = Table(self.name, MetaData(), *self.cols, extend_existing=True)
        self._drop()
        table.create(bind=postgres_engine)

        # Fill in the columns we expect from the CSV.
        names = ['"' + c.name + '"' for c in self.cols]
        copy_st = "COPY {t_name} ({cols}) FROM STDIN " \
                  "WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',')".\
            format(t_name=self.name, cols=', '.join(names))

        # In order to issue a COPY, we need to drop down to the psycopg2 DBAPI.
        conn = postgres_engine.raw_connection()
        try:
            with conn.cursor() as cursor:
                f.seek(0)
                cursor.copy_expert(copy_st, f)
                conn.commit()
                return table
        except Exception as e:
            # When the bulk copy fails on _any_ row,
            # roll back the entire operation.
            raise PlenarioETLError(e)
        finally:
            conn.close()
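The COPY mechanics above can be exercised in isolation. A minimal sketch, assuming a reachable Postgres database named plenario_test and a preexisting table t(a, b); copy_expert is the same psycopg2 call used above.

    import io
    import psycopg2

    # Minimal COPY ... FROM STDIN sketch; the DSN and table t(a, b) are assumptions.
    conn = psycopg2.connect('dbname=plenario_test')
    csv_data = io.StringIO('a,b\n1,2\n3,4\n')
    try:
        with conn.cursor() as cursor:
            cursor.copy_expert(
                "COPY t (a, b) FROM STDIN WITH (FORMAT CSV, HEADER TRUE)",
                csv_data)
        conn.commit()
    finally:
        conn.close()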
Example No. 4
    def __enter__(self):
        """
        Add a table (prefixed with n_) to the database
        with one record for each record found in the staging table
        with a hash not present in the existing table.
        If there are no such records, do nothing.
        """

        # create n_table with point_date, geom, and id columns
        s = self.staging
        e = self.existing
        d = self.dataset

        derived_dates = func.cast(s.c[d.date], TIMESTAMP).label('point_date')
        derived_geoms = self._geom_col()
        cols_to_insert = [s.c['hash'], derived_dates, derived_geoms]

        # Select the hash and the columns we're deriving from the staging table.
        sel = select(cols_to_insert)
        # And limit our results to records
        # whose hashes aren't already present in the existing table.
        sel = sel.select_from(s.outerjoin(e, s.c['hash'] == e.c['hash'])).\
            where(e.c['hash'] == None)

        # Drop the table first out of healthy paranoia
        self._drop()
        try:
            self.table.create(bind=postgres_engine)
        except Exception as e:
            raise PlenarioETLError(repr(e) +
                                   '\nCould not create table n_' + d.name)

        ins = self.table.insert().from_select(cols_to_insert, sel)
        # Populate it with records from our select statement.
        try:
            postgres_engine.execute(ins)
        except Exception as e:
            raise PlenarioETLError(repr(e) + '\n' + str(sel))
        else:
            # Would be nice to check if we have new records or not right here.
            return self
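The LEFT OUTER JOIN plus IS NULL filter above is a standard anti-join. A self-contained sketch of the same pattern, with illustrative table and column names:

    from sqlalchemy import Column, MetaData, Table, Text, select

    # Anti-join: staging rows whose hash is absent from the existing table.
    md = MetaData()
    staging = Table('s_demo', md, Column('hash', Text))
    existing = Table('demo', md, Column('hash', Text))

    sel = select([staging.c.hash]) \
        .select_from(staging.outerjoin(existing,
                                       staging.c.hash == existing.c.hash)) \
        .where(existing.c.hash == None)  # compiles to IS NULL
    # SELECT s_demo.hash FROM s_demo
    # LEFT OUTER JOIN demo ON s_demo.hash = demo.hash
    # WHERE demo.hash IS NULL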
Example No. 5
    def ingest(self):
        """
        Create the table. The caller is responsible for cleanup.
        :return: SQLAlchemy Table
        """

        with ETLFile(source_url=self.url,
                     source_path=self.path) as file_helper:

            # Attempt insertion
            try:
                with zipfile.ZipFile(file_helper.handle) as shapefile_zip:
                    import_shapefile(shapefile_zip=shapefile_zip,
                                     table_name=self.name)
            except zipfile.BadZipfile:
                raise PlenarioETLError("Source file was not a valid .zip")
            except ShapefileError as e:
                raise PlenarioETLError(
                    "Failed to import shapefile.\n{}".format(repr(e)))

        add_unique_hash(self.name)
        return reflect(self.name)
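Since the docstring leaves cleanup to the caller, a typical call site mirrors the drop-on-failure pattern of Example No. 7; shape_etl and its drop() method are assumptions here.

    # Hypothetical caller; 'shape_etl' and 'drop()' are assumed.
    try:
        table = shape_etl.ingest()
    except PlenarioETLError:
        shape_etl.drop()  # leave no partial table behind
        raise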
Example No. 6
    def insert(self):
        """
        Join with the staging table
        to insert complete records into existing table.
        """
        derived_cols = [c for c in self.table.c
                        if c.name in {'geom', 'point_date'}]
        staging_cols = list(self.staging.c)
        sel_cols = staging_cols + derived_cols

        sel = select(sel_cols).where(self.staging.c.hash == self.table.c.hash)
        ins = self.existing.insert().from_select(sel_cols, sel)

        try:
            postgres_engine.execute(ins)
        except Exception as e:
            raise PlenarioETLError(repr(e) +
                                   '\n Failed on statement: ' + str(ins))
        try:
            _null_malformed_geoms(self.existing)
        except Exception as e:
            raise PlenarioETLError(repr(e) +
                        '\n Failed to null out geoms with (0,0) geocoding')
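_null_malformed_geoms is referenced but not shown in these examples. Going by the error message, a hedged sketch of what it might do, treating a (0, 0) point as a geocoding-failure sentinel:

    # Hedged sketch only; the real _null_malformed_geoms is not shown here.
    def _null_malformed_geoms(existing):
        upd = '''
        UPDATE "{table}" SET geom = NULL
        WHERE geom = ST_SetSRID(ST_MakePoint(0, 0), 4326);
        '''.format(table=existing.name)
        postgres_engine.execute(upd)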
Example No. 7
    def add(self):
        if self.meta.is_ingested:
            raise PlenarioETLError(
                "Table {} has already been ingested.".format(self.table_name))

        new = HashedShape(self.table_name, self.source_url, self.source_path)
        try:
            new.ingest()
            self.meta.update_after_ingest()
            session.commit()
        except:
            # In case ingestion failed partway through,
            # be sure to leave no trace.
            new.drop()
            raise
Example No. 8
    def _hash_update(staging, existing):
        delete_absent_hashes(staging.name, existing.name)

        # Select all records from staging
        # whose hashes are not present in existing.
        join_condition = staging.c['hash'] == existing.c['hash']
        sel = staging.select() \
            .select_from(staging.outerjoin(existing, join_condition)) \
            .where(existing.c['hash'] == None)

        # Insert those into existing
        col_names = [col.name for col in existing.columns]
        ins = existing.insert().from_select(col_names, sel)
        try:
            engine.execute(ins)
        except Exception as e:
            raise PlenarioETLError(repr(e) + '\n' + str(sel))
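delete_absent_hashes is likewise called but not shown. A hedged sketch consistent with its name: drop rows whose hashes no longer appear in the fresh staging data.

    # Hedged sketch only; the real delete_absent_hashes is not shown here.
    def delete_absent_hashes(staging_name, existing_name):
        del_st = '''
        DELETE FROM "{existing}"
        WHERE hash NOT IN (SELECT hash FROM "{staging}");
        '''.format(existing=existing_name, staging=staging_name)
        engine.execute(del_st)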
Example No. 9
    def __enter__(self):
        """
        Create the staging table. Will be named s_[dataset_name]
        """
        with self.file_helper as helper:
            if not self.cols:
                # We couldn't get the column metadata from an existing table
                self.cols = self._from_inference(helper.handle)

            # Grab the handle to build a table from the CSV
            try:
                self.table = self._make_table(helper.handle)
                add_unique_hash(self.table.name)
                self.table = Table(self.name, MetaData(),
                                   autoload_with=engine, extend_existing=True)
                return self
            except Exception as e:
                raise PlenarioETLError(e)
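Because the staging table implements __enter__, the natural call site is a with block. This sketch assumes the class is the one whose constructor appears in Example No. 1 and that __exit__ (not shown) drops the s_ table:

    # Hypothetical usage; the class name and __exit__ behavior are assumptions.
    with StagingTable(meta, source_path='/tmp/crimes.csv') as staging:
        print(staging.table.name)  # 's_crimes', now with a unique hash column
    # __exit__ is expected to drop the staging table on the way out.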
Example No. 10
    def __enter__(self):
        """Create the staging table. Will be named s_[dataset_name]"""

        logger.info('Begin.')
        with self.file_helper as helper:
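            # Reopen the downloaded file in text mode so downstream CSV
            # parsing receives str rather than bytes.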
            text_handle = open(helper.handle.name, "rt", encoding='utf-8')
            self.cols = self._from_inference(text_handle)

            # Grab the handle to build a table from the CSV
            try:
                self.table = self._make_table(text_handle)
                add_unique_hash(self.table.name)
                self.table = Table(
                    self.name,
                    postgres_base.metadata,
                    autoload_with=postgres_engine,
                    extend_existing=True
                )
                logger.info('End.')
                return self
            except Exception as e:
                raise PlenarioETLError(e)
Example No. 11
    def _geom_col(self):
        """
        Derive selectable with a PostGIS point in 4326 projection
        derived from either the latitude and longitude columns
        or single location column
        """
        t = self.staging
        d = self.dataset

        if d.lat and d.lon:
            # Assume latitude and longitude columns are both numeric types.
            geom_col = func.ST_SetSRID(func.ST_Point(t.c[d.lon], t.c[d.lat]),
                                       4326).label('geom')

        elif d.loc:
            geom_col = func.point_from_loc(t.c[d.loc]).label('geom')

        else:
            msg = 'Staging table does not have geometry information.'
            raise PlenarioETLError(msg)

        return geom_col
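For reference, the lat/lon branch builds a conventional PostGIS expression. A small runnable sketch with literal coordinates standing in for the staging columns:

    from sqlalchemy import func

    # In a SELECT this renders as ST_SetSRID(ST_Point(-87.6, 41.8), 4326) AS geom
    geom = func.ST_SetSRID(func.ST_Point(-87.6, 41.8), 4326).label('geom')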
Example No. 12
    def _add_unique_hash(table_name):
        """
        Adds an md5 hash column of the preexisting columns
        and removes duplicate rows from a table.
        :param table_name: Name of table to add hash to.
        """
        add_hash = '''
        DROP TABLE IF EXISTS temp;
        CREATE TABLE temp AS
          SELECT DISTINCT *,
                 md5(CAST(("{table_name}".*) AS text))
                    AS hash FROM "{table_name}";
        DROP TABLE "{table_name}";
        ALTER TABLE temp RENAME TO "{table_name}";
        ALTER TABLE "{table_name}" ADD PRIMARY KEY (hash);
        '''.format(table_name=table_name)

        try:
            engine.execute(add_hash)
        except Exception as e:
            raise PlenarioETLError(
                repr(e) + '\n Failed to deduplicate with ' + add_hash)
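A usage note: run after the raw CSV COPY of Example No. 3, this helper both deduplicates the table and keys it on the md5 hash, which is what makes the staging-versus-existing hash joins of Examples No. 4, 6, and 8 possible.

    # Hypothetical table name; the md5 of each full row becomes the primary key.
    _add_unique_hash('s_crimes')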