Ejemplo n.º 1
0
    def filter_query(self, tbls, *args):
        "Query a table and filter results."
        self.grab_session()
        ok_classes = [type(self.Base), InstrumentedAttribute]
        if _isiterable(tbls) and not isinstance(tbls, dict):
            if all([any([isinstance(tbl, ok_class) for ok_class in ok_classes])
                    for tbl in tbls]):
                query_args = tbls
            elif isinstance(tbls[0], str):
                query_args = [self.tables[tbl] for tbl in tbls]
            else:
                raise IndraDbException(
                    'Unrecognized table specification type: %s.' %
                    type(tbls[0])
                    )
        else:
            if any([isinstance(tbls, ok_class) for ok_class in ok_classes]):
                query_args = [tbls]
            elif isinstance(tbls, str):
                query_args = [self.tables[tbls]]
            else:
                raise IndraDbException(
                    'Unrecognized table specification type: %s.' %
                    type(tbls)
                    )

        return self.session.query(*query_args).filter(*args)
Ejemplo n.º 2
0
    def select_sample_from_table(self, number, table, *args, **kwargs):
        """Select a number of random samples from the given table.

        Parameters
        ----------
        number : int
            The number of samples to return
        table : str, table class, or column attribute of table class
            The table or table column to be sampled.
        *args, **kwargs :
            All other arguments are passed to `select_all`, including any and
            all filtering clauses.

        Returns
        -------
        A list of sqlalchemy orm objects
        """
        # Get the base set of tables needed.
        if isinstance(table, str):
            # This should be the string name for a table or view.
            if table in self.tables.keys() or table in self.views.keys():
                true_table = getattr(self, table)
            else:
                raise IndraDbException("Invalid table name: %s." % table)
        elif hasattr(table, 'class_'):
            # This is technically an attribute of a table.
            true_table = table.class_
        elif table in self.tables.values() or table in self.views.values():
            # This is an actual table object
            true_table = table
        else:
            raise IndraDbException("Unrecognized table: %s of type %s"
                                   % (table, type(table)))

        # Get all ids for this table given query filters
        logger.info("Getting all relevant ids.")
        pk = self.get_primary_key(true_table)
        id_tuples = self.select_all(getattr(true_table, pk.name), *args,
                                    **kwargs)
        id_list = list({entry_id for entry_id, in id_tuples})

        # Sample from the list of ids
        logger.info("Getting sample of %d from %d members."
                    % (number, len(id_list)))
        id_sample = random.sample(id_list, number)
        if hasattr(table, 'key') and table.key == 'id':
            return [(entry_id,) for entry_id in id_sample]

        return self.select_all(table, getattr(table, pk.name).in_(id_sample))
Ejemplo n.º 3
0
def get_statements_without_agents(db, prefix, *other_stmt_clauses, **kwargs):
    """Get a generator for orm statement objects which do not have agents."""
    num_per_yield = kwargs.pop('num_per_yield', 100)
    verbose = kwargs.pop('verbose', False)

    # Get the objects for either raw or pa statements.
    stmt_tbl_obj = db.tables[prefix + '_statements']
    agent_tbl_obj = db.tables[prefix + '_agents']

    # Build a dict mapping stmt UUIDs to statement IDs
    logger.info("Getting %s that lack %s in the database."
                % (stmt_tbl_obj.__tablename__, agent_tbl_obj.__tablename__))
    if prefix == 'pa':
        agents_link = (stmt_tbl_obj.mk_hash == agent_tbl_obj.stmt_mk_hash)
    elif prefix == 'raw':
        agents_link = (stmt_tbl_obj.id == agent_tbl_obj.stmt_id)
    else:
        raise IndraDbException("Unrecognized prefix: %s." % prefix)
    stmts_wo_agents_q = (db.session
                           .query(stmt_tbl_obj)
                           .filter(*other_stmt_clauses)
                           .filter(~exists().where(agents_link)))

    # Start printing some data
    if verbose:
        num_stmts = stmts_wo_agents_q.count()
        print("Adding agents for %d statements." % num_stmts)
    else:
        num_stmts = None

    # Get the iterator
    return stmts_wo_agents_q.yield_per(num_per_yield), num_stmts
Ejemplo n.º 4
0
    def link(self, table_1, table_2):
        """Get the joining clause between two tables, if one exists.

        If no link exists, an exception will be raised. Note that this only
        works for directly links.
        """
        table_name_1 = table_1.__tablename__
        table_name_2 = table_2.__tablename__
        if WITH_NX:
            fk_path = nx.shortest_path(self.__foreign_key_graph, table_name_1,
                                       table_name_2)
        else:
            fk_path = [table_name_1, table_name_2]

        links = []
        for i in range(len(fk_path) - 1):
            link = self._get_foreign_key_constraint(fk_path[i], fk_path[i+1])
            if link is None:
                link = self._get_foreign_key_constraint(fk_path[i+1],
                                                        fk_path[i])
            if link is None:
                raise IndraDbException("There is no foreign key in %s "
                                       "pointing to %s."
                                       % (table_name_1, table_name_2))
            links.append(link)
        return links
Ejemplo n.º 5
0
def extract_agent_data(stmt, stmt_id):
    """Create the tuples for copying agents into the database."""
    # Figure out how the agents are structured and assign roles.
    ag_list = stmt.agent_list(deep_sorted=True)
    nary_stmt_types = [
        Complex, SelfModification, ActiveForm, Conversion, Translocation
    ]
    if any([isinstance(stmt, tp) for tp in nary_stmt_types]):
        agents = {('OTHER', ag, i) for i, ag in enumerate(ag_list)}
    elif len(ag_list) == 2:
        agents = {(role, ag_list[i], i)
                  for i, role in enumerate(['SUBJECT', 'OBJECT'])}
    else:
        raise IndraDbException("Unhandled agent structure for stmt %s "
                               "with agents: %s." %
                               (str(stmt), str(stmt.agent_list())))

    def all_agent_refs(ag):
        """Smooth out the iteration over agents and their refs."""
        for ns, ag_id in ag.db_refs.items():
            if isinstance(ag_id, list):
                for sub_id in ag_id:
                    yield ns, sub_id
            else:
                yield ns, ag_id
        yield 'NAME', ag.name

    # Prep the agents for copy into the database.
    ref_data = []
    mod_data = []
    mut_data = []
    warnings = set()
    for role, ag, idx in agents:
        # If no agent, or no db_refs for the agent, skip the insert
        # that follows.
        if ag is None or ag.db_refs is None:
            continue

        # Get the db refs data.
        for ns, ag_id in all_agent_refs(ag):
            if ag_id is not None:
                ref_data.append(
                    (stmt_id, idx, ns, regularize_agent_id(ag_id, ns), role))
            else:
                if ns not in warnings:
                    warnings.add(ns)
                    logger.warning("Found agent for %s with None value." % ns)

        # Get the modification data
        for mod in ag.mods:
            mod_data.append((stmt_id, idx, mod.mod_type, mod.position,
                             mod.residue, mod.is_modified))

        # Get the mutation data
        for mut in ag.mutations:
            mut_data.append(
                (stmt_id, idx, mut.position, mut.residue_from, mut.residue_to))

    return ref_data, mod_data, mut_data
Ejemplo n.º 6
0
    def _prep_copy(self, tbl_name, data, cols):

        # If cols is not specified, use all the cols in the table, else check
        # to make sure the names are valid.
        if cols is None:
            cols = self.get_column_names(tbl_name)
        else:
            db_cols = self.get_column_names(tbl_name)
            assert all([col in db_cols for col in cols]), \
                "Do not recognize one of the columns in %s for table %s." % \
                (cols, tbl_name)

        # Check for automatic timestamps which won't be applied by the
        # database when using copy, and manually insert them.
        auto_timestamp_type = type(func.now())
        for col in self.get_column_objects(tbl_name):
            if col.default is not None:
                if isinstance(col.default.arg, auto_timestamp_type) \
                        and col.name not in cols:
                    logger.info("Applying timestamps to %s." % col.name)
                    now = datetime.utcnow()
                    cols += (col.name,)
                    data = [datum + (now,) for datum in data]

        # Format the data for the copy.
        data_bts = []
        n_cols = len(cols)
        for entry in data:
            # Make sure that the number of columns matches the number of columns in
            # the data.
            if n_cols != len(entry):
                raise ValueError("Number of columns does not match number of "
                                 "columns in data.")

            # Convert the entry to bytes
            new_entry = []
            for element in entry:
                if isinstance(element, str):
                    new_entry.append(element.encode('utf8'))
                elif (isinstance(element, bytes)
                      or element is None
                      or isinstance(element, Number)
                      or isinstance(element, datetime)):
                    new_entry.append(element)
                else:
                    raise IndraDbException(
                        "Don't know what to do with element of type %s."
                        "Should be str, bytes, datetime, None, or a "
                        "number." % type(element)
                    )
            data_bts.append(tuple(new_entry))

        # Prep the connection.
        if self._conn is None:
            self._conn = self.engine.raw_connection()
            self._conn.rollback()

        return cols, data_bts
Ejemplo n.º 7
0
 def grab_session(self):
     "Get an active session with the database."
     if self.session is None or not self.session.is_active:
         logger.debug('Attempting to get session...')
         DBSession = sessionmaker(bind=self.engine)
         logger.debug('Got session.')
         self.session = DBSession()
         if self.session is None:
             raise IndraDbException("Failed to grab session.")
Ejemplo n.º 8
0
    def copy(self, tbl_name, data, cols=None):
        "Use pg_copy to copy over a large amount of data."
        logger.info("Received request to copy %d entries into %s." %
                    (len(data), tbl_name))
        if len(data) is 0:
            return  # Nothing to do....

        # If cols is not specified, use all the cols in the table, else check
        # to make sure the names are valid.
        if cols is None:
            cols = self.get_column_names(tbl_name)
        else:
            db_cols = self.get_column_names(tbl_name)
            assert all([col in db_cols for col in cols]),\
                "Do not recognize one of the columns in %s for table %s." % \
                (cols, tbl_name)

        # Do the copy. Use pgcopy if available.
        if self.sqltype == sqltypes.POSTGRESQL and CAN_COPY:
            # Check for automatic timestamps which won't be applied by the
            # database when using copy, and manually insert them.
            auto_timestamp_type = type(func.now())
            for col in self.get_column_objects(tbl_name):
                if col.default is not None:
                    if isinstance(col.default.arg, auto_timestamp_type) \
                       and col.name not in cols:
                        logger.info("Applying timestamps to %s." % col.name)
                        now = datetime.utcnow()
                        cols += (col.name, )
                        data = [datum + (now, ) for datum in data]

            # Now actually do the copy
            conn = self.engine.raw_connection()
            mngr = CopyManager(conn, tbl_name, cols)
            data_bts = []
            for entry in data:
                new_entry = []
                for element in entry:
                    if isinstance(element, str):
                        new_entry.append(element.encode('utf8'))
                    elif (isinstance(element, bytes) or element is None
                          or isinstance(element, Number)
                          or isinstance(element, datetime)):
                        new_entry.append(element)
                    else:
                        raise IndraDbException(
                            "Don't know what to do with element of type %s."
                            "Should be str, bytes, datetime, None, or a "
                            "number." % type(element))
                data_bts.append(tuple(new_entry))
            mngr.copy(data_bts, BytesIO)
            conn.commit()
        else:
            # TODO: use bulk insert mappings?
            logger.warning("You are not using postgresql or do not have "
                           "pgcopy, so this will likely be very slow.")
            self.insert_many(tbl_name, [dict(zip(cols, ro)) for ro in data])
Ejemplo n.º 9
0
def get_primary_db(force_new=False):
    """Get a DatabaseManager instance for the primary database host.

    The primary database host is defined in the defaults.txt file, or in a file
    given by the environment variable DEFAULTS_FILE. Alternatively, it may be
    defined by the INDRADBPRIMARY environment variable. If none of the above
    are specified, this function will raise an exception.

    Note: by default, calling this function twice will return the same
    `DatabaseManager` instance. In other words::

        db1 = get_primary_db()
        db2 = get_primary_db()
        db1 is db2

    This means also that, for example `db1.select_one(db2.TextRef)` will work,
    in the above context.

    It is still recommended that when creating a script or function, or other
    general application, you should not rely on this feature to get your access
    to the database, as it can make substituting a different database host both
    complicated and messy. Rather, a database instance should be explicitly
    passed between different users as is done in `get_statements_by_gene_role_type`
    function's call to `get_statements` in `indra.db.query_db_stmts`.

    Parameters
    ----------
    force_new : bool
        If true, a new instance will be created and returned, regardless of
        whether there is an existing instance or not. Default is False, so that
        if this function has been called before within the global scope, a the
        instance that was first created will be returned.

    Returns
    -------
    primary_db : :py:class:`DatabaseManager`
        An instance of the database manager that is attached to the primary
        database.
    """
    logger.warning("DEPRECATION WARNING: This function is being deprecated.")
    defaults = get_databases()
    if 'primary' in defaults.keys():
        primary_host = defaults['primary']
    else:
        raise IndraDbException("No primary host available in defaults file.")

    global __PRIMARY_DB
    if __PRIMARY_DB is None or force_new:
        __PRIMARY_DB = PrincipalDatabaseManager(primary_host, label='primary')
        __PRIMARY_DB.grab_session()
    return __PRIMARY_DB
Ejemplo n.º 10
0
def run_preassembly(mode, project_name):
    """Construct a submitter and begin submitting jobs to Batch for preassembly.

    This function will determine which statement types need to be updated and
    how far back they go, and will create the appropriate
    :class:`PreassemblySubmitter
    <indra_db.preassembly.submitter.PreassemblySubmitter>`
    instance, and run the jobs with pre-set parameters on statement types that
    need updating.

    Parameters
    ----------
    project_name : str
        This name is used to gag the various AWS resources used for accounting
        purposes.
    """
    from indra_db.preassembly.submitter import VALID_STATEMENTS, \
        PreassemblySubmitter
    db = get_db('primary')
    if mode == 'update':
        # Find the latest update for each statement type.
        last_updates = list_last_updates(db)

        # Get the most recent raw statement datetimes
        latest_raw_stmts = list_latest_raw_stmts(db)

        # Only include statements types that have new raw statements.
        need_to_update = [
            s_type for s_type, last_upd in last_updates.items()
            if s_type in latest_raw_stmts.keys()
            and latest_raw_stmts[s_type] > last_upd
        ]
    else:
        # Make sure the pa_statements table is truly empty.
        if db.select_one(db.PAStatements):
            raise IndraDbException("Please clear the pa_statements table "
                                   "before running create. If you want to run "
                                   "an incremental update, please run with "
                                   "mode 'update'.")

        # Just run them all.
        need_to_update = VALID_STATEMENTS[:]

    # Create the submitter, and run it.
    basename = datetime.utcnow().strftime('%Y%m%d_%H%M%S')
    ps = PreassemblySubmitter(basename, mode, project_name=project_name)
    ps.set_max_jobs(4)
    ps.run(need_to_update, 100000, True, stagger=600, poll_interval=120)
Ejemplo n.º 11
0
    def load_dump(self, dump_file, force_clear=True):
        """Load from a dump of the readonly schema on s3."""

        # Make sure the database is clear.
        if 'readonly' in self.get_schemas():
            if force_clear:
                # For some reason, dropping tables does not work.
                self.drop_schema('readonly')
            else:
                raise IndraDbException("Tables already exist and force_clear "
                                       "is False.")

        # Do the restore
        self.pg_restore(dump_file)

        # Run Vacuuming
        logger.info("Running vacuuming.")
        self.vacuum()

        return
Ejemplo n.º 12
0
def _get_agent_tuples(stmt, stmt_id):
    """Create the tuples for copying agents into the database."""
    # Figure out how the agents are structured and assign roles.
    ag_list = stmt.agent_list()
    nary_stmt_types = [Complex, SelfModification, ActiveForm, Conversion,
                       Translocation]
    if any([isinstance(stmt, tp) for tp in nary_stmt_types]):
        agents = {('OTHER', ag) for ag in ag_list}
    elif len(ag_list) == 2:
        agents = {('SUBJECT', ag_list[0]), ('OBJECT', ag_list[1])}
    else:
        raise IndraDbException("Unhandled agent structure for stmt %s "
                                 "with agents: %s."
                               % (str(stmt), str(stmt.agent_list())))

    def all_agent_refs(agents):
        """Smooth out the iteration over agents and their refs."""
        for role, ag in agents:
            # If no agent, or no db_refs for the agent, skip the insert
            # that follows.
            if ag is None or ag.db_refs is None:
                continue
            for ns, ag_id in ag.db_refs.items():
                if isinstance(ag_id, list):
                    for sub_id in ag_id:
                        yield ns, sub_id, role
                else:
                    yield ns, ag_id, role

    # Prep the agents for copy into the database.
    agent_data = []
    for ns, ag_id, role in all_agent_refs(agents):
        if ag_id is not None:
            agent_data.append((stmt_id, ns, regularize_agent_id(ag_id, ns),
                               role))
        else:
            logger.warning("Found agent for %s with None value." % ns)
    return agent_data