def _run(
    self,
    predicates: List["ReactionComponentPredicate"],
    cursor: psycopg2.extensions.cursor,
    limit: Optional[int] = None,
) -> List[Result]:
    """Executes one SELECT per predicate and intersects the results.

    Each predicate contributes its own `SELECT DISTINCT ... WHERE <predicate>`
    statement; the statements are combined with INTERSECT so that only rows
    produced by every predicate are returned.

    Args:
        predicates: Predicates to combine. An empty list short-circuits to an
            empty result before anything is sent to the database.
        cursor: psycopg2 cursor used to render and execute the query.
        limit: Optional row cap appended as a LIMIT clause. Any falsy value
            (None or 0) leaves the query unlimited.

    Returns:
        List of Result instances as produced by fetch_results.
    """
    if not predicates:
        return []
    self._setup(predicates, cursor)
    params = []
    subqueries = []
    for predicate in predicates:
        pieces = [
            sql.SQL(" SELECT DISTINCT dataset_id, reaction_id, serialized FROM reactions "),
            *self._get_tables(),
            sql.SQL(" WHERE "),
        ]
        where_clause, where_params = predicate.get()
        pieces.append(where_clause)
        params.extend(where_params)
        subqueries.append(sql.Composed(pieces))
    statement_parts = [sql.Composed(subqueries).join(" INTERSECT ")]
    if limit:
        # The LIMIT applies to the intersected result, not to each subquery.
        statement_parts.append(sql.SQL(" LIMIT %s"))
        params.append(limit)
    statement = sql.Composed(statement_parts).join("")
    rendered = cursor.mogrify(statement, params).decode()
    logger.info("Running SQL command:%s", rendered)
    cursor.execute(statement, params)
    return fetch_results(cursor)
def run(self, cursor: psycopg2.extensions.cursor, limit: Optional[int] = None) -> List[Result]:
    """Fetches the reactions whose DOI is one of `self._dois`.

    Args:
        cursor: psycopg2 cursor used to render and execute the query.
        limit: Integer maximum number of matches. Any falsy value (None, the
            default, or 0) leaves the query unlimited.

    Returns:
        List of Result instances.
    """
    use_limit = bool(limit)
    select = sql.SQL(" SELECT DISTINCT dataset_id, reaction_id, serialized FROM reactions WHERE doi = ANY (%s)")
    parts = [select, sql.SQL(" LIMIT %s")] if use_limit else [select]
    params = [self._dois, limit] if use_limit else [self._dois]
    statement = sql.Composed(parts).join("")
    rendered = cursor.mogrify(statement, params).decode()
    logger.info("Running SQL command:%s", rendered)
    cursor.execute(statement, params)
    return fetch_results(cursor)
def generate_insert_queries(
    curs: psycopg2.extensions.cursor,
    insert_table: str,
    df: pd.DataFrame,
    *,
    chunksize: int = 10000,
) -> Coroutine:
    """Generator that helps insert_pandas_into.

    Yields one multi-row ``INSERT INTO ... VALUES ...`` statement per chunk of
    ``df``. Column names must match the schema of the insert table. Because
    this is a generator, argument validation runs when iteration starts, not
    when the function is called.

    Args:
        curs (psycopg2.extensions.cursor): Cursor whose ``mogrify`` is used to
            render each row's values; nothing is executed here.
        insert_table (str): Target table in database. It is interpolated into
            the statement verbatim, so it must come from a trusted source.
        df (pd.DataFrame): Pandas dataframe that will be inserted.
        chunksize (int, optional): How many rows to write per insert. Must be
            positive. Defaults to 10000.

    Yields:
        str: A complete INSERT statement covering up to ``chunksize`` rows.

    Raises:
        TypeError: If ``insert_table``, ``df`` or ``chunksize`` has the wrong
            type.
        ValueError: If ``chunksize`` is not positive.
    """
    # TODO: assert cursor here
    if not isinstance(insert_table, str):
        raise TypeError("insert_table must be a str")
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")
    if not isinstance(chunksize, int):
        raise TypeError("chunksize must be an int")
    if chunksize <= 0:
        # A negative step used to produce no statements at all, silently
        # dropping every row; fail loudly instead.
        raise ValueError("chunksize must be a positive integer")

    # Quote identifiers and double any embedded quote so that a column name
    # containing '"' cannot terminate the identifier early.
    quoted_colnames = ['"' + str(col).replace('"', '""') + '"' for col in df.columns.tolist()]

    # Built by concatenation rather than str.format so that braces in the
    # table or column names are emitted literally.
    header = "\n".join(
        [
            f"INSERT INTO {insert_table} ",
            "(",
            # indent the first colname
            " " + ",\n ".join(quoted_colnames),
            ")",
            "VALUES\n",
            "",
        ]
    )

    all_values = df.values.tolist()
    row_template = "(" + ", ".join(["%s"] * len(quoted_colnames)) + ")"  # e.g. '(%s, %s, %s)'

    for start in range(0, len(all_values), chunksize):
        chunk = all_values[start : start + chunksize]
        # as of 2018 Dec 7, you can only use mogrify with a cursor object
        query_values = " " + ",\n ".join(
            curs.mogrify(row_template, row).decode() for row in chunk
        )
        # cleanup values
        # NOTE: these are textual replacements over the whole rendered VALUES
        # text, so they also affect any value whose rendering contains them.
        query_values = query_values.replace("'NaT'::timestamp", "NULL")
        query_values = query_values.replace("'NaN'::float", "NULL")
        query_values = query_values.replace("'None'", "NULL")
        yield header + query_values
def _setup(self, predicates: List["ReactionComponentPredicate"], cursor: psycopg2.extensions.cursor) -> None:
    """Applies the session settings this query depends on.

    Issues two SET commands on the cursor: `rdkit.do_chiral_sss` from
    `self._do_chiral_sss`, then `rdkit.tanimoto_threshold`. The threshold is
    `self._tanimoto_threshold` unless at least one predicate uses EXACT match
    mode, in which case 1.0 is used.

    Args:
        predicates: Predicates included in this query.
        cursor: psycopg.cursor instance.
    """

    def _log_and_execute(statement, params):
        # Log the fully rendered statement before sending it.
        logger.info("Running SQL command: %s", cursor.mogrify(statement, params).decode())
        cursor.execute(statement, params)

    _log_and_execute(sql.SQL("SET rdkit.do_chiral_sss=%s"), [self._do_chiral_sss])

    threshold = self._tanimoto_threshold
    exact_mode = ReactionComponentPredicate.MatchMode.EXACT
    if any(predicate.mode == exact_mode for predicate in predicates):
        threshold = 1.0
    _log_and_execute(sql.SQL("SET rdkit.tanimoto_threshold=%s"), [threshold])
def run(self, cursor: psycopg2.extensions.cursor, limit: Optional[int] = None) -> List[Result]:
    """Fetches the reactions whose reaction_id is one of `self._reaction_ids`.

    Args:
        cursor: psycopg.cursor instance.
        limit: Not used; present for compatibility.

    Returns:
        List of Result instances.
    """
    del limit  # Unused.
    params = [self._reaction_ids]
    statement = sql.SQL(
        " SELECT DISTINCT dataset_id, reaction_id, serialized FROM reactions WHERE reaction_id = ANY (%s)"
    )
    rendered = cursor.mogrify(statement, params).decode()
    logger.info("Running SQL command:%s", rendered)
    cursor.execute(statement, params)
    return fetch_results(cursor)
def run(self, cursor: psycopg2.extensions.cursor, limit: Optional[int] = None) -> List[Result]:
    """Fetches a sample of reactions via TABLESAMPLE SYSTEM_ROWS.

    The sample size passed to SYSTEM_ROWS is `self._num_rows`.

    Args:
        cursor: psycopg.cursor instance.
        limit: Ignored by this query; it is discarded immediately and the
            sample size comes from `self._num_rows` instead.

    Returns:
        List of Result instances.
    """
    del limit  # Unused.
    params = [self._num_rows]
    statement = sql.SQL(
        " SELECT DISTINCT dataset_id, reaction_id, serialized FROM reactions TABLESAMPLE SYSTEM_ROWS (%s)"
    )
    rendered = cursor.mogrify(statement, params).decode()
    logger.info("Running SQL command:%s", rendered)
    cursor.execute(statement, params)
    return fetch_results(cursor)