Example 1
    def create_index(self, columns, name=None):
        """
        Create an index to speed up queries on a table. If no ``name`` is given a random name is created.
        ::

            table.create_index(['name', 'country'])
        """
        self._check_dropped()
        if not name:
            sig = '||'.join(columns)

            # This is a work-around for a bug in <=0.6.1 which would create
            # indexes based on hash() rather than a proper hash.
            key = abs(hash(sig))
            name = 'ix_%s_%s' % (self.table.name, key)
            if name in self.indexes:
                return self.indexes[name]

            key = sha1(sig.encode('utf-8')).hexdigest()[:16]
            name = 'ix_%s_%s' % (self.table.name, key)

        if name in self.indexes:
            return self.indexes[name]
        try:
            self.database._acquire()
            columns = [self.table.c[c] for c in columns]
            idx = Index(name, *columns)
            idx.create(self.database.engine)
        except:
            idx = None
        finally:
            self.database._release()
        self.indexes[name] = idx
        return idx
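
For context, a minimal usage sketch for this kind of method, assuming the dataset library is installed; the database URL, table and column names below are illustrative only:

# Minimal sketch: create a table via the dataset library, then index two columns.
import dataset

db = dataset.connect('sqlite:///:memory:')
table = db['people']
table.insert(dict(name='Ada', country='UK'))

# With no explicit name, an 'ix_<table>_<hash>' name is derived as shown above.
table.create_index(['name', 'country'])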
Example 2
    def create_index(self, columns, name=None):
        """
        Create an index to speed up queries on a table. If no ``name`` is given a random name is created.
        ::

            table.create_index(['name', 'country'])
        """
        self._check_dropped()
        if not name:
            sig = '||'.join(columns)

            # This is a work-around for a bug in <=0.6.1 which would create
            # indexes based on hash() rather than a proper hash.
            key = abs(hash(sig))
            name = 'ix_%s_%s' % (self.table.name, key)
            if name in self.indexes:
                return self.indexes[name]

            key = sha1(sig.encode('utf-8')).hexdigest()[:16]
            name = 'ix_%s_%s' % (self.table.name, key)

        if name in self.indexes:
            return self.indexes[name]
        try:
            self.database._acquire()
            columns = [self.table.c[c] for c in columns]
            idx = Index(name, *columns)
            idx.create(self.database.engine)
        except:
            idx = None
        finally:
            self.database._release()
        self.indexes[name] = idx
        return idx
Example 3
    def create_index(self, columns, name=None, **kw):
        """Create an index to speed up queries on a table.

        If no ``name`` is given a random name is created.
        ::

            table.create_index(['name', 'country'])
        """
        columns = [self._get_column_name(c) for c in ensure_list(columns)]
        with self.db.lock:
            if not self.exists:
                raise DatasetException("Table has not been created yet.")

            for column in columns:
                if not self.has_column(column):
                    return

            if not self.has_index(columns):
                self._threading_warn()
                name = name or index_name(self.name, columns)
                columns = [self.table.c[c] for c in columns]

                # MySQL crashes out if you try to index very long text fields,
                # apparently. This defines (a somewhat random) prefix that
                # will be captured by the index, after which I assume the engine
                # conducts a more linear scan:
                mysql_length = {}
                for col in columns:
                    if isinstance(col.type, MYSQL_LENGTH_TYPES):
                        mysql_length[col.name] = 10
                kw["mysql_length"] = mysql_length

                idx = Index(name, *columns, **kw)
                idx.create(self.db.executable)
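
All of these wrappers reduce to the same SQLAlchemy Core pattern; here is a self-contained sketch of that pattern, assuming SQLAlchemy 1.4+ and illustrative table and column names:

# Core pattern the wrappers above build on: construct an Index against table
# columns, then emit CREATE INDEX with Index.create(bind).
from sqlalchemy import (Column, Index, Integer, MetaData, String, Table,
                        create_engine)

engine = create_engine('sqlite:///:memory:')
metadata = MetaData()
people = Table('people', metadata,
               Column('id', Integer, primary_key=True),
               Column('name', String(100)),
               Column('country', String(100)))
metadata.create_all(engine)

idx = Index('ix_people_name_country', people.c.name, people.c.country)
idx.create(engine)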
Example 4
 def create_index(self, columns, name=None, index_type="btree"):
     """
     Create an index to speed up queries on a table.
     If no ``name`` is given a random name is created.
     ::
         table.create_index(['name', 'country'])
     """
     self._check_dropped()
     if not name:
         sig = "||".join(columns + [index_type])
         # This is a work-around for a bug in <=0.6.1 which would create
         # indexes based on hash() rather than a proper hash.
         key = abs(hash(sig))
         name = "ix_%s_%s" % (self.table.name, key)
         if name in self.indexes:
             return self.indexes[name]
         key = sha1(sig.encode("utf-8")).hexdigest()[:16]
         name = "ix_%s_%s" % (self.table.name, key)
     if name in self.indexes:
         return self.indexes[name]
     # self.db._acquire()
     columns = [self.table.c[col] for col in columns]
     idx = Index(name, *columns, postgresql_using=index_type)
     idx.create(self.engine)
     # finally:
     #    self.db._release()
     self.indexes[name] = idx
     return idx
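
The postgresql_using keyword in this variant selects the PostgreSQL index access method. A compile-only sketch of the DDL it produces (no database connection needed, only the PostgreSQL dialect compiler; table and column names are illustrative):

# Compile-only sketch showing the effect of postgresql_using on the DDL.
from sqlalchemy import Column, Index, MetaData, Table
from sqlalchemy.dialects import postgresql
from sqlalchemy.schema import CreateIndex

metadata = MetaData()
docs = Table('docs', metadata, Column('tags', postgresql.JSONB))
idx = Index('ix_docs_tags', docs.c.tags, postgresql_using='gin')

print(CreateIndex(idx).compile(dialect=postgresql.dialect()))
# -> CREATE INDEX ix_docs_tags ON docs USING gin (tags)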
Example 5
 def create_index(self, columns, name=None, index_type="btree"):
     """
     Create an index to speed up queries on a table.
     If no ``name`` is given a random name is created.
     ::
         table.create_index(['name', 'country'])
     """
     self._check_dropped()
     if not name:
         sig = "||".join(columns + [index_type])
         # This is a work-around for a bug in <=0.6.1 which would create
         # indexes based on hash() rather than a proper hash.
         key = abs(hash(sig))
         name = "ix_%s_%s" % (self.table.name, key)
         if name in self.indexes:
             return self.indexes[name]
         key = sha1(sig.encode("utf-8")).hexdigest()[:16]
         name = "ix_%s_%s" % (self.table.name, key)
     if name in self.indexes:
         return self.indexes[name]
     # self.db._acquire()
     columns = [self.table.c[col] for col in columns]
     idx = Index(name, *columns, postgresql_using=index_type)
     idx.create(self.engine)
     # finally:
     #    self.db._release()
     self.indexes[name] = idx
     return idx
Example 6
 def generate_key_index(self):
     for index in self.key.table.indexes:
         if len(index.columns) == 1:
             for col in index.columns:
                 if col == self.key:
                     return
     index = Index(self.index_name, self.key)
     index.create(self.config.engine)
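
This variant consults table.indexes, which only reflects indexes the metadata already knows about. An alternative, sketched here under the assumption of SQLAlchemy 1.4+ and a placeholder database URL, is to ask the database itself through the runtime inspector:

# Sketch: check for an existing single-column index via runtime inspection
# instead of table.indexes (engine URL and names are placeholders).
from sqlalchemy import create_engine, inspect

engine = create_engine('sqlite:///example.db')
inspector = inspect(engine)

def has_single_column_index(table_name, column_name):
    return any(ix['column_names'] == [column_name]
               for ix in inspector.get_indexes(table_name))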
Example 7
def upgrade(migrate_engine):
    LOG.info(_LI("Adding boolean column delayed_notify to table 'zones'"))
    meta.bind = migrate_engine
    zones_table = Table('zones', meta, autoload=True)
    col = Column('delayed_notify', Boolean(), default=False)
    col.create(zones_table)
    index = Index('delayed_notify', zones_table.c.delayed_notify)
    index.create(migrate_engine)
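
This migration, like several of the ones that follow, appears to use the legacy sqlalchemy-migrate API (col.create(table), Table(..., autoload=True)). A rough Alembic equivalent is sketched below; it is not the project's actual migration script, and the revision boilerplate is omitted:

# Hypothetical Alembic version of the same schema change.
import sqlalchemy as sa
from alembic import op

def upgrade():
    op.add_column('zones',
                  sa.Column('delayed_notify', sa.Boolean(), default=False))
    op.create_index('delayed_notify', 'zones', ['delayed_notify'])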
Example 8
def upgrade(migrate_engine):
    meta = MetaData()
    meta.bind = migrate_engine

    service_statuses = Table('service_statuses', meta, autoload=True)
    idx = Index("service_statuses_instance_id", service_statuses.c.instance_id)

    try:
        idx.create()
    except OperationalError as e:
        logger.info(e)
Example 9
def upgrade(migrate_engine):
    meta.bind = migrate_engine
    Table('datastores', meta, autoload=True)
    Table('datastore_versions', meta, autoload=True)
    instances = Table('instances', meta, autoload=True)
    create_tables([clusters])
    instances.create_column(
        Column('cluster_id', String(36), ForeignKey("clusters.id")))
    instances.create_column(Column('shard_id', String(36)))
    instances.create_column(Column('type', String(64)))
    cluster_id_idx = Index("instances_cluster_id", instances.c.cluster_id)
    cluster_id_idx.create()
Example 10
def upgrade(migrate_engine):
    meta.bind = migrate_engine
    Table('datastores', meta, autoload=True)
    Table('datastore_versions', meta, autoload=True)
    instances = Table('instances', meta, autoload=True)
    create_tables([clusters])
    instances.create_column(Column('cluster_id', String(36),
                                   ForeignKey("clusters.id")))
    instances.create_column(Column('shard_id', String(36)))
    instances.create_column(Column('type', String(64)))
    cluster_id_idx = Index("instances_cluster_id", instances.c.cluster_id)
    cluster_id_idx.create()
Example 11
    def generate_key_index(self):
        key = self.key
        table = key.table
        if isinstance(table, Alias):
            table = table.original
            key = table.c[key.name]

        for index in table.indexes:
            if len(index.columns) == 1:
                for col in index.columns:
                    if col == key:
                        return
        index = Index(self.index_name, key)
        index.create(self.config.engine)
Example 12
def create_index(engine, table, columns, name=None):
    with lock:
        if not name:
            sig = abs(hash('||'.join(columns)))
            name = 'ix_%s_%s' % (table.name, sig)
        if name in INDEXES:
            return INDEXES[name]
        try:
            columns = [table.c[c] for c in columns]
            idx = Index(name, *columns)
            idx.create(engine)
        except:
            idx = None
        INDEXES[name] = idx
        return idx
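
Naming the index from abs(hash(...)) is unstable across interpreter runs, since Python string hashing is randomized; that is exactly what the sha1-based work-around in the earlier examples addresses. A small sketch of a deterministic naming helper in the same spirit:

# Deterministic, process-independent index name (mirrors the sha1 approach above).
from hashlib import sha1

def stable_index_name(table_name, columns):
    key = sha1('||'.join(columns).encode('utf-8')).hexdigest()[:16]
    return 'ix_%s_%s' % (table_name, key)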
Example 13
def add_dataset_key(request, table):
    """
    Add a key to a dataset
    """
    # If POST add the key
    if request.method == 'POST':
        # Get the POST parameter
        post_data = dict(request.POST)

        key_name = post_data['dataset_key_name'][0]

        dataset_columns = post_data['dataset_columns']

        # Get the table
        t = getattr(m.Base.classes, table)
        # Get the column objects for each selected column in the POST parameter
        column_objects = []
        for col in dataset_columns:
            column_objects.append(getattr(t.__table__.columns, col))

        # Build up a standard name for the index
        index_name = key_name

        # Create an sqlalchemy Index object
        index = Index(index_name, *column_objects)
        index.create(m.engine)

        # Create an entry in dataset_keys
        session = m.get_session()
        dataset_key = m.DATASET_KEYS(dataset_uuid=table,
                                     index_name=index_name,
                                     dataset_columns=dataset_columns)
        session.add(dataset_key)
        session.commit()
        session.close()

        # Redirect to the manage_dataset page
        return redirect('/manage/' + table)
    else:
        # Get the columns in the table and add them to the dropdown in the form
        columns = [
            str(x).split('.')[1]
            for x in getattr(m.Base.classes, table).__table__.columns
        ]
        form = AddDatasetKey(zip(columns, columns))
        # Return the form

        return render(request, 'add_dataset_key.html', {'form': form})
Example 14
def create_index(engine, table, columns, name=None):
    table = get_table(engine, table)
    with lock:
        if not name:
            sig = abs(hash("||".join(columns)))
            name = "ix_%s_%s" % (table.name, sig)
        if name in engine._indexes:
            return engine._indexes[name]
        try:
            columns = [table.c[c] for c in columns]
            idx = Index(name, *columns)
            idx.create(engine)
        except:
            idx = None
        engine._indexes[name] = idx
        return idx
Example 15
def create_index(engine, table, columns, name=None):
    table = get_table(engine, table)
    with lock:
        if not name:
            sig = abs(hash('||'.join(columns)))
            name = 'ix_%s_%s' % (table.name, sig)
        if name in engine._indexes:
            return engine._indexes[name]
        try:
            columns = [table.c[c] for c in columns]
            idx = Index(name, *columns)
            idx.create(engine)
        except:
            idx = None
        engine._indexes[name] = idx
        return idx
Example 16
    def add_index(cls, *cols, **kwargs):
        """Add an index to the table.
        """
        # Make slug from column names.
        col_names = '_'.join([c.name for c in cols])

        # Build the index name.
        name = 'idx_{}_{}'.format(cls.__tablename__, col_names)

        idx = Index(name, *cols, **kwargs)

        # Render the index.
        try:
            idx.create(bind=engine)
        except Exception as e:
            print(e)

        print(col_names)
Example 17
def upgrade(migrate_engine):
    meta = MetaData()
    meta.bind = migrate_engine

    backups = Table('backups', meta, autoload=True)
    backups_instance_id_idx = Index("backups_instance_id",
                                    backups.c.instance_id)
    backups_deleted_idx = Index("backups_deleted", backups.c.deleted)

    try:
        backups_instance_id_idx.create()
    except OperationalError as e:
        logger.info(e)

    try:
        backups_deleted_idx.create()
    except OperationalError as e:
        logger.info(e)
Example 18
def upgrade(migrate_engine):
    meta = MetaData()
    meta.bind = migrate_engine

    instances = Table('instances', meta, autoload=True)

    tenant_id_idx = Index("instances_tenant_id", instances.c.tenant_id)

    try:
        tenant_id_idx.create()
    except OperationalError as e:
        logger.info(e)

    deleted_idx = Index("instances_deleted", instances.c.deleted)
    try:
        deleted_idx.create()
    except OperationalError as e:
        logger.info(e)
Example 19
async def _create_index(meta: MetaData, engine: Engine, table: Table, column: str):
    logger.info(f'Creating index on {table.schema}.{table.name}')
    ts = time.time()
    idx_name = f'{table.name}_{column}_idx'
    index = Index(idx_name, table.columns[column])
    loop = asyncio.get_event_loop()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        await loop.run_in_executor(executor, lambda: index.create(bind=engine))
    te = time.time()
    logger.info(f'Index {idx_name} created after {(te - ts):.2f} seconds')
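Running the blocking DDL call in an executor keeps the event loop responsive. A hedged driver sketch follows (SQLAlchemy 1.4+ reflection style; the engine URL, table and column names are placeholders); on Python 3.9+, asyncio.to_thread(index.create, engine) is a shorter route to the same effect.

# Hypothetical driver for the coroutine above (placeholder URL and names).
import asyncio
from sqlalchemy import MetaData, Table, create_engine

def main():
    engine = create_engine('postgresql://user:pass@localhost/mydb')
    meta = MetaData()
    table = Table('events', meta, autoload_with=engine)
    asyncio.run(_create_index(meta, engine, table, 'user_id'))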
Example 20
    def create_index(self, columns, name=None, **kw):
        """Create an index to speed up queries on a table.

        If no ``name`` is given a random name is created.
        ::

            table.create_index(['name', 'country'])
        """
        columns = [normalize_column_name(c) for c in ensure_tuple(columns)]
        with self.db.lock:
            if not self.exists:
                raise DatasetException("Table has not been created yet.")

            if not self.has_index(columns):
                self._threading_warn()
                name = name or index_name(self.name, columns)
                columns = [self.table.c[c] for c in columns]
                idx = Index(name, *columns, **kw)
                idx.create(self.db.executable)
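
Because the extra keyword arguments are forwarded unchanged to Index, options such as unique=True can be requested through the same high-level call. A usage sketch, assuming the dataset library and illustrative names:

# Sketch: **kw passthrough lets callers request a unique index directly.
import dataset

db = dataset.connect('sqlite:///:memory:')
users = db['users']
users.insert(dict(email='ada@example.org'))
users.create_index(['email'], unique=True)   # forwarded to sqlalchemy.Index(...)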
Example 21
def upgrade(migrate_engine):
    meta.bind = migrate_engine
    Table('datastores', meta, autoload=True)
    Table('datastore_versions', meta, autoload=True)
    instances = Table('instances', meta, autoload=True)

    # since the downgrade is a no-op, an upgrade after a downgrade will
    # cause an exception because the tables already exist
    # we will catch that case and log an info message
    try:
        create_tables([clusters])

        instances.create_column(
            Column('cluster_id', String(36), ForeignKey("clusters.id")))
        instances.create_column(Column('shard_id', String(36)))
        instances.create_column(Column('type', String(64)))

        cluster_id_idx = Index("instances_cluster_id", instances.c.cluster_id)
        cluster_id_idx.create()
    except OperationalError as e:
        logger.info(e)
Example 22
def upgrade(migrate_engine):
    meta.bind = migrate_engine
    Table('datastores', meta, autoload=True)
    Table('datastore_versions', meta, autoload=True)
    instances = Table('instances', meta, autoload=True)

    # since the downgrade is a no-op, an upgrade after a downgrade will
    # cause an exception because the tables already exist
    # we will catch that case and log an info message
    try:
        create_tables([clusters])

        instances.create_column(Column('cluster_id', String(36),
                                       ForeignKey("clusters.id")))
        instances.create_column(Column('shard_id', String(36)))
        instances.create_column(Column('type', String(64)))

        cluster_id_idx = Index("instances_cluster_id", instances.c.cluster_id)
        cluster_id_idx.create()
    except OperationalError as e:
        logger.info(e)
Example 23
    def create_index(self, columns, name=None):
        """
        Create an index to speed up queries on a table. If no ``name`` is given a random name is created.
        ::

            table.create_index(['name', 'country'])
        """
        self._check_dropped()
        with self.database.lock:
            if not name:
                sig = abs(hash('||'.join(columns)))
                name = 'ix_%s_%s' % (self.table.name, sig)
            if name in self.indexes:
                return self.indexes[name]
            try:
                columns = [self.table.c[c] for c in columns]
                idx = Index(name, *columns)
                idx.create(self.database.engine)
            except:
                idx = None
            self.indexes[name] = idx
            return idx
Example 24
    def create_index(self, columns, name=None):
        """
        Create an index to speed up queries on a table. If no ``name`` is given a random name is created.
        ::

            table.create_index(['name', 'country'])
        """
        self._check_dropped()
        with self.database.lock:
            if not name:
                sig = abs(hash('||'.join(columns)))
                name = 'ix_%s_%s' % (self.table.name, sig)
            if name in self.indexes:
                return self.indexes[name]
            try:
                columns = [self.table.c[c] for c in columns]
                idx = Index(name, *columns)
                idx.create(self.database.engine)
            except:
                idx = None
            self.indexes[name] = idx
            return idx
Example 25
    def visit_column(self, column):
        """Create a column (table already exists).

        :param column: column object
        :type column: :class:`sqlalchemy.Column` instance
        """
        if column.default is not None:
            self.traverse_single(column.default)

        table = self.start_alter_table(column)
        self.append("ADD ")
        self.append(self.get_column_specification(column))

        for cons in column.constraints:
            self.traverse_single(cons)
        self.execute()

        # ALTER TABLE STATEMENTS

        # add indexes and unique constraints
        if column.index_name:
            ix = Index(column.index_name,
                       column,
                       unique=bool(column.index_name or column.index))
            ix.create()
        elif column.unique_name:
            constraint.UniqueConstraint(column,
                                        name=column.unique_name).create()

        # SA bounds FK constraints to table, add manually
        for fk in column.foreign_keys:
            self.add_foreignkey(fk.constraint)

        # add primary key constraint if needed
        if column.primary_key_name:
            cons = constraint.PrimaryKeyConstraint(column,
                                                   name=column.primary_key_name)
            cons.create()
Example 26
def wipe_opt_out_patients(report_every: int = 1000,
                          chunksize: int = 10000) -> None:
    """
    Delete any data from patients that have opted out (after their data was
    processed on a previous occasion).
    (Slightly complicated by the fact that the destination database can't
    necessarily 'see' the mapping database, so we need to cache the RID keys in
    the destination database temporarily.)
    """
    start = "wipe_opt_out_patients"
    log.info(start)

    adminsession = config.admindb.session
    metadata = MetaData()  # operate in isolation!
    destengine = config.destdb.engine
    destsession = config.destdb.session
    ridfield = config.research_id_fieldname

    # Drop/create temporary table
    pkfield = 'rid'
    temptable = Table(
        config.temporary_tablename, metadata,
        Column(pkfield, config.SqlTypeEncryptedPid, primary_key=True),
        **TABLE_KWARGS)
    log.debug(start + ": 1. dropping temporary table")
    temptable.drop(destengine, checkfirst=True)  # use engine, not session
    log.debug(start + ": 2. making temporary table")
    temptable.create(destengine, checkfirst=True)  # use engine, not session

    log.debug(start + ": 3. populating temporary table with RIDs")

    def insert(records_):
        # records_: a list of dictionaries
        # http://docs.sqlalchemy.org/en/latest/core/tutorial.html
        log.debug(start + "... inserting {} records".format(len(records_)))
        destsession.execute(temptable.insert(), records_)

    i = 0
    records = []  # type: List[Dict[str, Any]]
    for rid in gen_optout_rids():
        i += 1
        if report_every and i % report_every == 0:
            log.debug(start + "... src row# {}".format(i))
        records.append({pkfield: rid})  # a row is a dict of values
        if i % chunksize == 0:
            insert(records)
            records = []  # type: List[Dict[str, Any]]
    if records:  # remainder
        insert(records)
    commit_destdb()

    log.debug(start + ": 4. creating index on temporary table")
    index = Index('_temptable_idx', temptable.columns[pkfield])
    index.create(destengine)  # use engine, not session

    # 5. For each patient destination table,
    #    DELETE FROM desttable WHERE rid IN (SELECT rid FROM temptable)
    log.debug(start + ": 5. deleting from destination table by opt-out RID")
    for dest_table_name in config.dd.get_dest_tables_with_patient_info():
        log.debug(start + ": ... {}".format(dest_table_name))
        dest_table = config.dd.get_dest_sqla_table(dest_table_name)
        query = dest_table.delete().where(
            column(ridfield).in_(select([temptable.columns[pkfield]])))
        destengine.execute(query)
        commit_destdb()

    log.debug(start + ": 6. dropping temporary table")
    temptable.drop(destengine, checkfirst=True)  # use engine, not session
    commit_destdb()

    log.debug(start + ": 7. deleting opt-out patients from mapping table")
    adminsession.query(PatientInfo).filter(
        or_(PatientInfo.pid.in_(adminsession.query(OptOutPid.pid)),
            PatientInfo.mpid.in_(adminsession.query(
                OptOutMpid.mpid)))).delete(synchronize_session=False)
    commit_admindb()
Example 27
    def create_cube_aggregate(self, browser, table_name=None, dimensions=None,
                              dimension_links=None, schema=None,
                              replace=False):
        """Creates an aggregate table. If dimensions is `None` then all cube's
        dimensions are considered.

        Arguments:

        * `dimensions`: list of dimensions to use in the aggregated cuboid, if
          `None` then all cube dimensions are used
        * `dimension_links`: list of dimensions that are required for each
          aggregation (for example a date dimension in most of the cases). The
          list should be a subset of `dimensions`.
        * `aggregates_prefix`: aggregated table prefix
        * `aggregates_schema`: schema where aggregates are stored

        """

        if browser.store != self:
            raise ArgumentError("Can create aggregate table only within "
                                "the same store")

        schema = schema or self.options.get("aggregates_schema", self.schema)
        # Just a shortcut
        cube = browser.cube

        prefix = self.options.get("aggregates_prefix", "")
        table_name = table_name or "%s_%s" % (prefix, cube.name)

        if dimensions:
            dimensions = [cube.dimension(dimension) for dimension in dimensions]
        else:
            dimensions = cube.dimensions

        builder = QueryBuilder(browser)

        if builder.snowflake.fact_name == table_name and builder.snowflake.schema == schema:
            raise ArgumentError("target is the same as source fact table")

        drilldown = []
        keys = None
        for dimension in dimensions:
            levels = dimension.hierarchy().levels
            drilldown.append((dimension, dimension.hierarchy(), levels[-1]))
            keys = [l.key for l in levels]

        cell = Cell(cube)
        drilldown = Drilldown(drilldown, cell)

        # Create statement of all dimension level keys for
        # getting structure for table creation
        statement = builder.aggregation_statement(
            cell,
            drilldown=drilldown,
            aggregates=cube.aggregates,
            attributes=keys
        )

        # Create table
        table = self.create_table_from_statement(
            table_name,
            statement,
            schema=schema,
            replace=replace,
            insert=False
        )

        self.logger.info("Inserting...")

        with self.connectable.begin() as connection:

            insert = InsertIntoAsSelect(table, statement,
                                        columns=statement.columns)

            connection.execute(insert)
        self.logger.info("Done")

        self.logger.info("Creating indexes...")

        aggregated_columns = [a.name for a in cube.aggregates]
        for column in table.columns:
            if column.name in aggregated_columns:
                continue

            name = "%s_%s_idx" % (table_name, column)
            self.logger.info("creating index: %s" % name)
            index = Index(name, column)
            index.create(self.connectable)

        self.logger.info("Done")
Example 28
def init_db():
    movies_in_db = db.session.query(Movie).count()
    genres_in_db = db.session.query(Genre).count()
    tags_in_db = db.session.query(Tag).count()
    tagweights_in_db = db.session.query(TagWeight).count()

    print("finding a good basis set...")
    raw_movies = get_movies()
    print("narrowing basis set down to movies with tags...")
    movie_tag_dictionary = get_tags_and_relevancy(raw_movies)
    movies = []
    for movie in raw_movies:
        if movie_tag_dictionary.get(movie[0]):
            movies.append(movie)

    if movies_in_db == 0 or genres_in_db == 0:
        print("populating genres table...")
        genres = get_genres(movies)
        if genres_in_db == 0:
            for i in range(0, len(genres)):
                genre = genres[i]
                new_genre = Genre(genre_id=i, name=genre)
                db.session.add(new_genre)
        db.session.commit()
        print("populating movies table...")
        if movies_in_db == 0:
            for movie in movies:
                new_movie = Movie(movie_id=movie[0],
                                  name=movie[1],
                                  rating=int(movie[4] * 10000),
                                  year_released=movie[2])
                db.session.add(new_movie)
                for genre in movie[3]:
                    stored_genre = Genre.query.filter_by(name=genre).first()
                    new_movie.genres.append(stored_genre)
        print("there are " + str(len(movies)) + " movies in the database")
        db.session.commit()

    if tags_in_db == 0:
        print("populating tags table...")
        tag_dict = get_scored_tags()
        for tag_id in tag_dict.keys():
            new_tag = Tag(tag_id=tag_id, name=tag_dict[tag_id])
            db.session.add(new_tag)
        db.session.commit()

    if tagweights_in_db == 0:
        print("connecting tagweights to movies and tags...")
        movie_count = 1
        start = time.time()
        tag_dict = get_scored_tags()
        for movie_id in movie_tag_dictionary.keys():
            count = 1
            movie = Movie.query.filter_by(movie_id=movie_id).first()
            tagweight_lst = []
            for tag_relevancy in movie_tag_dictionary[movie_id]:
                tagweight = TagWeight(movie_id=movie_id,
                                      tag_id=count,
                                      weight=tag_relevancy)
                tagweight_lst.append(tagweight)
                count += 1
            db.session.add_all(tagweight_lst)
            movie_count += 1
            if movie_count % 50 == 0:
                print("processing tags for movie " + str(movie_count) + "...")
                print(time.time() - start)
                start = time.time()
        db.session.commit()

    print("adding index")
    try:
        index_tag_movie = Index('index_tag_movie', TagWeight.tag_id,
                                TagWeight.movie_id)
        index_tag_movie.create(bind=engine)
    except Exception as e:
        print("index could not be created, probably because it already exists")
Example 29
    def create_cube_aggregate(self,
                              cube,
                              table_name=None,
                              dimensions=None,
                              replace=False,
                              create_index=False,
                              schema=None):
        """Creates an aggregate table. If dimensions is `None` then all cube's
        dimensions are considered.

        Arguments:

        * `dimensions`: list of dimensions to use in the aggregated cuboid, if
          `None` then all cube dimensions are used
        """

        browser = SQLBrowser(cube, self, schema=schema)

        if browser.safe_labels:
            raise ConfigurationError("Aggregation does not work with "
                                     "safe_labels turned on")

        schema = schema or self.naming.aggregate_schema \
                    or self.naming.schema

        # TODO: this is very similar to the denormalization prep.
        table_name = table_name or self.naming.aggregate_table_name(cube.name)
        fact_name = cube.fact or self.naming.fact_table_name(cube.name)

        dimensions = dimensions or [dim.name for dim in cube.dimensions]

        if fact_name == table_name and schema == self.naming.schema:
            raise StoreError("Aggregation target is the same as fact")

        drilldown = []
        keys = []
        for dimref in dimensions:
            (dimname, hiername, level) = string_to_dimension_level(dimref)
            dimension = cube.dimension(dimname)
            hierarchy = dimension.hierarchy(hiername)
            levels = hierarchy.levels
            drilldown.append((dimension, hierarchy, levels[-1]))
            keys += [l.key for l in levels]

        cell = Cell(cube)
        drilldown = Drilldown(drilldown, cell)

        # Create statement of all dimension level keys for
        # getting structure for table creation
        (statement,
         _) = browser.aggregation_statement(cell,
                                            drilldown=drilldown,
                                            aggregates=cube.aggregates)

        # Create table
        table = self.create_table_from_statement(table_name,
                                                 statement,
                                                 schema=schema,
                                                 replace=replace,
                                                 insert=False)

        self.logger.info("Inserting...")

        insert = table.insert().from_select(statement.columns, statement)
        self.execute(insert)

        self.logger.info("Done")

        if create_index:
            self.logger.info("Creating indexes...")
            aggregated_columns = [a.name for a in cube.aggregates]
            for column in table.columns:
                if column.name in aggregated_columns:
                    continue

                name = "%s_%s_idx" % (table_name, column)
                self.logger.info("creating index: %s" % name)
                index = Index(name, column)
                index.create(self.connectable)

        self.logger.info("Done")
Example 30
    def create_cube_aggregate(self, cube, table_name=None, dimensions=None,
                                 replace=False, create_index=False,
                                 schema=None):
        """Creates an aggregate table. If dimensions is `None` then all cube's
        dimensions are considered.

        Arguments:

        * `dimensions`: list of dimensions to use in the aggregated cuboid, if
          `None` then all cube dimensions are used
        """

        browser = SQLBrowser(cube, self, schema=schema)

        if browser.safe_labels:
            raise ConfigurationError("Aggregation does not work with "
                                     "safe_labels turned on")

        schema = schema or self.naming.aggregate_schema \
                    or self.naming.schema

        # TODO: this is very similar to the denormalization prep.
        table_name = table_name or self.naming.aggregate_table_name(cube.name)
        fact_name = cube.fact or self.naming.fact_table_name(cube.name)

        dimensions = dimensions or [dim.name for dim in cube.dimensions]

        if fact_name == table_name and schema == self.naming.schema:
            raise StoreError("Aggregation target is the same as fact")

        drilldown = []
        keys = []
        for dimref in dimensions:
            (dimname, hiername, level) = string_to_dimension_level(dimref)
            dimension = cube.dimension(dimname)
            hierarchy = dimension.hierarchy(hiername)
            levels = hierarchy.levels
            drilldown.append((dimension, hierarchy, levels[-1]))
            keys += [l.key for l in levels]

        cell = Cell(cube)
        drilldown = Drilldown(drilldown, cell)

        # Create statement of all dimension level keys for
        # getting structure for table creation
        (statement, _) = browser.aggregation_statement(
            cell,
            drilldown=drilldown,
            aggregates=cube.aggregates
        )

        # Create table
        table = self.create_table_from_statement(
            table_name,
            statement,
            schema=schema,
            replace=replace,
            insert=False
        )

        self.logger.info("Inserting...")

        insert = table.insert().from_select(statement.columns, statement)
        self.execute(insert)

        self.logger.info("Done")

        if create_index:
            self.logger.info("Creating indexes...")
            aggregated_columns = [a.name for a in cube.aggregates]
            for column in table.columns:
                if column.name in aggregated_columns:
                    continue

                name = "%s_%s_idx" % (table_name, column)
                self.logger.info("creating index: %s" % name)
                index = Index(name, column)
                index.create(self.connectable)

        self.logger.info("Done")
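
The insert().from_select() step above copies the aggregation query's result into the new table server-side. A minimal compile-only sketch of that construct (SQLAlchemy 1.4+ select style; illustrative tables, no actual aggregation):

# Compile-only sketch of INSERT ... SELECT with SQLAlchemy Core.
from sqlalchemy import Column, Integer, MetaData, String, Table, select

metadata = MetaData()
fact = Table('fact', metadata, Column('day', String(10)), Column('amount', Integer))
agg = Table('fact_agg', metadata, Column('day', String(10)), Column('amount', Integer))

stmt = select(fact.c.day, fact.c.amount)
ins = agg.insert().from_select(['day', 'amount'], stmt)
print(ins)
# -> INSERT INTO fact_agg (day, amount) SELECT fact.day, fact.amount FROM fact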
Example 31
def add_index(engine: Engine,
              sqla_column: Column = None,
              multiple_sqla_columns: List[Column] = None,
              unique: bool = False,
              fulltext: bool = False,
              length: int = None) -> None:
    """
    Adds an index to a database column (or, in restricted circumstances,
    several columns).

    The table name is worked out from the :class:`Column` object.

    Args:
        engine: SQLAlchemy :class:`Engine` object
        sqla_column: single column to index
        multiple_sqla_columns: multiple columns to index (see below)
        unique: make a ``UNIQUE`` index?
        fulltext: make a ``FULLTEXT`` index?
        length: index length to use (default ``None``)

    Restrictions:

    - Specify either ``sqla_column`` or ``multiple_sqla_columns``, not both.
    - The normal method is ``sqla_column``.
    - ``multiple_sqla_columns`` is only used for Microsoft SQL Server full-text
      indexing (as this database permits only one full-text index per table,
      though that index can be on multiple columns).

    """
    # We used to process a table as a unit; this makes index creation faster
    # (using ALTER TABLE).
    # http://dev.mysql.com/doc/innodb/1.1/en/innodb-create-index-examples.html  # noqa
    # ... ignored in transition to SQLAlchemy

    def quote(identifier: str) -> str:
        return quote_identifier(identifier, engine)

    is_mssql = engine.dialect.name == SqlaDialectName.MSSQL
    is_mysql = engine.dialect.name == SqlaDialectName.MYSQL

    multiple_sqla_columns = multiple_sqla_columns or []  # type: List[Column]
    if multiple_sqla_columns and not (fulltext and is_mssql):
        raise ValueError("add_index: Use multiple_sqla_columns only for mssql "
                         "(Microsoft SQL Server) full-text indexing")
    if bool(multiple_sqla_columns) == (sqla_column is not None):
        raise ValueError(
            f"add_index: Use either sqla_column or multiple_sqla_columns, "
            f"not both (sqla_column = {sqla_column!r}, "
            f"multiple_sqla_columns = {multiple_sqla_columns!r}"
        )
    if sqla_column is not None:
        colnames = [sqla_column.name]
        sqla_table = sqla_column.table
        tablename = sqla_table.name
    else:
        colnames = [c.name for c in multiple_sqla_columns]
        sqla_table = multiple_sqla_columns[0].table
        tablename = sqla_table.name
        if any(c.table.name != tablename for c in multiple_sqla_columns[1:]):
            raise ValueError(
                f"add_index: tablenames are inconsistent in "
                f"multiple_sqla_columns = {multiple_sqla_columns!r}")

    if fulltext:
        if is_mssql:
            idxname = ''  # they are unnamed
        else:
            idxname = "_idxft_{}".format("_".join(colnames))
    else:
        idxname = "_idx_{}".format("_".join(colnames))
    if idxname and index_exists(engine, tablename, idxname):
        log.info(f"Skipping creation of index {idxname} on "
                 f"table {tablename}; already exists")
        return
        # because it will crash if you add it again!
    log.info(
        "Creating{ft} index {i} on table {t}, column(s) {c}",
        ft=" full-text" if fulltext else "",
        i=idxname or "<unnamed>",
        t=tablename,
        c=", ".join(colnames),
    )

    if fulltext:
        if is_mysql:
            log.info('OK to ignore this warning, if it follows next: '
                     '"InnoDB rebuilding table to add column FTS_DOC_ID"')
            # https://dev.mysql.com/doc/refman/5.6/en/innodb-fulltext-index.html
            sql = (
                "ALTER TABLE {tablename} "
                "ADD FULLTEXT INDEX {idxname} ({colnames})".format(
                    tablename=quote(tablename),
                    idxname=quote(idxname),
                    colnames=", ".join(quote(c) for c in colnames),
                )
            )
            # DDL(sql, bind=engine).execute_if(dialect=SqlaDialectName.MYSQL)
            DDL(sql, bind=engine).execute()

        elif is_mssql:  # Microsoft SQL Server
            # https://msdn.microsoft.com/library/ms187317(SQL.130).aspx
            # Argh! Complex.
            # Note that the database must also have had a
            #   CREATE FULLTEXT CATALOG somename AS DEFAULT;
            # statement executed on it beforehand.
            schemaname = engine.schema_for_object(
                sqla_table) or MSSQL_DEFAULT_SCHEMA  # noqa
            if mssql_table_has_ft_index(engine=engine,
                                        tablename=tablename,
                                        schemaname=schemaname):
                log.info(
                    f"... skipping creation of full-text index on table "
                    f"{tablename}; a full-text index already exists for that "
                    f"table; you can have only one full-text index per table, "
                    f"though it can be on multiple columns")
                return
            pk_index_name = mssql_get_pk_index_name(
                engine=engine, tablename=tablename, schemaname=schemaname)
            if not pk_index_name:
                raise ValueError(
                    f"To make a FULLTEXT index under SQL Server, we need to "
                    f"know the name of the PK index, but couldn't find one "
                    f"from get_pk_index_name() for table {tablename!r}")
            # We don't name the FULLTEXT index itself, but it has to relate
            # to an existing unique index.
            sql = (
                "CREATE FULLTEXT INDEX ON {tablename} ({colnames}) "
                "KEY INDEX {keyidxname} ".format(
                    tablename=quote(tablename),
                    keyidxname=quote(pk_index_name),
                    colnames=", ".join(quote(c) for c in colnames),
                )
            )
            # SQL Server won't let you do this inside a transaction:
            # "CREATE FULLTEXT INDEX statement cannot be used inside a user
            # transaction."
            # https://msdn.microsoft.com/nl-nl/library/ms191544(v=sql.105).aspx
            # So let's ensure any preceding transactions are completed, and
            # run the SQL in a raw way:
            # engine.execute(sql).execution_options(autocommit=False)
            # http://docs.sqlalchemy.org/en/latest/core/connections.html#understanding-autocommit
            #
            # ... lots of faff with this (see test code in no_transactions.py)
            # ... ended up using explicit "autocommit=True" parameter (for
            #     pyodbc); see create_indexes()
            transaction_count = mssql_transaction_count(engine)
            if transaction_count != 0:
                log.critical(f"SQL Server transaction count (should be 0): "
                             f"{transaction_count}")
                # Executing serial COMMITs or a ROLLBACK won't help here if
                # this transaction is due to Python DBAPI default behaviour.
            DDL(sql, bind=engine).execute()

            # The reversal procedure is DROP FULLTEXT INDEX ON tablename;

        else:
            log.error(f"Don't know how to make full text index on dialect "
                      f"{engine.dialect.name}")

    else:
        index = Index(idxname, sqla_column, unique=unique, mysql_length=length)
        index.create(engine)
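
For the MySQL branch, SQLAlchemy can also express a FULLTEXT index declaratively through the dialect-specific mysql_prefix option, rather than hand-built DDL. A compile-only sketch with illustrative names:

# Compile-only sketch of a declarative MySQL FULLTEXT index.
from sqlalchemy import Column, Index, MetaData, Table, Text
from sqlalchemy.dialects import mysql
from sqlalchemy.schema import CreateIndex

metadata = MetaData()
notes = Table('notes', metadata, Column('body', Text))
ft = Index('_idxft_body', notes.c.body, mysql_prefix='FULLTEXT')

print(CreateIndex(ft).compile(dialect=mysql.dialect()))
# -> CREATE FULLTEXT INDEX _idxft_body ON notes (body)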
Example 32
def delete_where_no_source(nlpdef: NlpDefinition,
                           ifconfig: InputFieldConfig,
                           report_every: int = DEFAULT_REPORT_EVERY,
                           chunksize: int = DEFAULT_CHUNKSIZE) -> None:
    """
    Delete destination records where source records no longer exist.

    - Can't do this in a single SQL command, since the engine can't necessarily
      see both databases.
    - Can't use a single temporary table, since the progress database isn't
      necessarily the same as any of the destination database(s).
    - Can't do this in a multiprocess way, because we're trying to do a
      DELETE WHERE NOT IN.
    - So we fetch all source PKs (which, by definition, do exist), stash them
      keep them in memory, and do a DELETE WHERE NOT IN based on those
      specified values (or, if there are no PKs in the source, delete
      everything from the destination).

    Problems:
    - This is IMPERFECT if we have string source PKs and there are hash
      collisions (e.g. PKs for records X and Y both hash to the same thing;
      record X is deleted; then its processed version might not be).
    - With massive tables, we might run out of memory or (much more likely)
      SQL parameter slots. -- This is now happening; error looks like:
      pyodbc.ProgrammingError: ('The SQL contains 30807 parameter markers, but
      2717783 parameters were supplied', 'HY000')

    A better way might be:
    - for each table, make a temporary table in the same database
    - populate that table with (source PK integer/hash, source PK string) pairs
    - delete where pairs don't match -- is that portable SQL?
      http://stackoverflow.com/questions/7356108/sql-query-for-deleting-rows-with-not-in-using-2-columns  # noqa
    - More efficient would be to make one table per destination database.

    On the "delete where multiple fields don't match":
    - Single field syntax is
        DELETE FROM a WHERE a1 NOT IN (SELECT b1 FROM b)
    - Multiple field syntax is
        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND a.a2 = b.b2
        )
    - In SQLAlchemy, exists():
        http://stackoverflow.com/questions/14600619
        http://docs.sqlalchemy.org/en/latest/core/selectable.html
    - Furthermore, in SQL NULL = NULL is false, and NULL <> NULL is also false,
      so we have to do an explicit null check.
      You do that with "field == None" (and suppress any linter warning about
      comparison to None). See http://stackoverflow.com/questions/21668606
      We're aiming, therefore, for:
        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND (
                a.a2 = b.b2
                OR (a.a2 IS NULL AND b.b2 IS NULL)
            )
        )
    """

    # -------------------------------------------------------------------------
    # Sub-functions
    # -------------------------------------------------------------------------

    def insert(records_):
        n_rows = len(records_)
        log.debug("... inserting {} records".format(n_rows))
        for db in databases:
            session_ = db['session']
            temptable_ = db['temptable']  # type: Table
            session_.execute(temptable_.insert(), records_)
            nlpdef.notify_transaction(session_,
                                      n_rows=n_rows,
                                      n_bytes=sys.getsizeof(records_))

    def commit():
        for db in databases:
            nlpdef.commit(db['session'])

    # -------------------------------------------------------------------------
    # Main code
    # -------------------------------------------------------------------------
    # Use info log level, otherwise it looks like our code hangs with very
    # large databases.

    log.info("delete_where_no_source: examining source table {}.{}; "
             "MAY BE SLOW".format(ifconfig.get_srcdb(),
                                  ifconfig.get_srctable()))

    # Start our list with the progress database
    databases = [{
        'session': nlpdef.get_progdb_session(),
        'engine': nlpdef.get_progdb_engine(),
        'metadata': nlpdef.get_progdb_metadata(),
        'temptable': None,  # type: Table
    }]

    # Add the processors' destination databases
    for processor in nlpdef.get_processors():  # of type BaseNlpParser
        session = processor.get_session()
        if any(x['session'] == session for x in databases):
            continue  # already exists
        databases.append({
            'session': session,
            'engine': processor.get_engine(),
            'metadata': processor.get_metadata(),
        })

    # Make a temporary table in each database (note: the Table objects become
    # affiliated to their engine, I think, so make separate ones for each).
    log.info("... using {n} destination database(s)".format(n=len(databases)))
    log.info("... dropping (if exists) and creating temporary table(s)")
    for database in databases:
        engine = database['engine']
        temptable = Table(
            nlpdef.get_temporary_tablename(),
            database['metadata'],
            Column(FN_SRCPKVAL, BigInteger),  # not PK, as may be a hash
            Column(FN_SRCPKSTR, String(MAX_STRING_PK_LENGTH)),
            **TABLE_KWARGS)
        temptable.drop(engine, checkfirst=True)
        temptable.create(engine, checkfirst=True)
        database['temptable'] = temptable

    # Insert PKs into temporary tables

    n = count_star(ifconfig.get_source_session(), ifconfig.get_srctable())
    log.info("... populating temporary table(s): {} records to go; working in "
             "chunks of {}".format(n, chunksize))
    i = 0
    records = []  # type: List[Dict[str, Any]]
    for pkval, pkstr in ifconfig.gen_src_pks():
        i += 1
        if report_every and i % report_every == 0:
            log.info("... src row# {} / {}".format(i, n))
        records.append({FN_SRCPKVAL: pkval, FN_SRCPKSTR: pkstr})
        if i % chunksize == 0:
            insert(records)
            records = []  # type: List[Dict[str, Any]]
    if records:  # remainder
        insert(records)

    # Commit
    commit()

    # Index, for speed
    log.info("... creating index(es) on temporary table(s)")
    for database in databases:
        temptable = database['temptable']  # type: Table
        index = Index('_temptable_idx', temptable.columns[FN_SRCPKVAL])
        index.create(database['engine'])

    # DELETE FROM desttable WHERE destpk NOT IN (SELECT srcpk FROM temptable)
    log.info("... deleting from progress/destination DBs where appropriate")

    # Delete from progress database
    prog_db = databases[0]
    prog_temptable = prog_db['temptable']
    ifconfig.delete_progress_records_where_srcpk_not(prog_temptable)

    # Delete from others
    for processor in nlpdef.get_processors():
        database = [
            x for x in databases if x['session'] == processor.get_session()
        ][0]
        temptable = database['temptable']
        processor.delete_where_srcpk_not(ifconfig, temptable)

    # Drop temporary tables
    log.info("... dropping temporary table(s)")
    for database in databases:
        database['temptable'].drop(database['engine'], checkfirst=True)

    # Commit
    commit()
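
The NULL-safe "delete where no matching source row exists" SQL described in the docstring translates fairly directly into SQLAlchemy Core. A sketch with stand-in tables and column names (SQLAlchemy 1.4+ select style):

# Sketch of the NOT EXISTS delete with an explicit NULL check, per the docstring.
from sqlalchemy import (BigInteger, Column, MetaData, String, Table, and_,
                        or_, select)

metadata = MetaData()
dest = Table('dest', metadata,
             Column('srcpkval', BigInteger), Column('srcpkstr', String(64)))
temp = Table('temptable', metadata,
             Column('srcpkval', BigInteger), Column('srcpkstr', String(64)))

match = and_(
    dest.c.srcpkval == temp.c.srcpkval,
    or_(dest.c.srcpkstr == temp.c.srcpkstr,
        and_(dest.c.srcpkstr.is_(None), temp.c.srcpkstr.is_(None))),
)
stmt = dest.delete().where(~select(temp.c.srcpkval).where(match).exists())
print(stmt)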