def insert_events(data: pandas.DataFrame, mapping: dict, db_cursor: DictCursor,
                  batch_size: int = 1000) -> pandas.DataFrame:
    """
    Inserts sampling events, returning the given dataframe with updated
    `pid`'s from the database.

    The rows of `data` are inserted in batches of at most `batch_size`; each
    INSERT ends with RETURNING pid so the database-assigned pids can be
    collected. On any database error the whole import is aborted via
    sys.exit(1).
    """
    base_query, field_mapping = get_base_query(mapping['event'])
    total = len(data.values)
    start = 0
    end = min(total, batch_size)
    pids = []
    while start < total:
        logging.info(" * inserting %s to %s", start, end)
        values = format_values(data, field_mapping, start, end)
        query = f"{base_query} VALUES {', '.join(values)} RETURNING pid;"
        try:
            db_cursor.execute(query)
            # collect the pid of every row inserted in this batch
            pids += [r['pid'] for r in db_cursor.fetchall()]
        except psycopg2.Error as err:
            logging.error(err)
            logging.error("No data were imported.")
            sys.exit(1)
        start = end
        end = min(total, end + batch_size)
    # assign pids to data for future joins
    return data.assign(pid=pids)
def insert_common(data: pandas.DataFrame, mapping: dict, db_cursor: DictCursor,
                  batch_size: int = 1000):
    """
    Inserts `data` into the database based on the given `mapping`.

    Rows are written in batches of at most `batch_size`; a database error
    aborts the import via sys.exit(1).
    """
    base_query, field_mapping = get_base_query(mapping)
    row_count = len(data.values)
    for offset in range(0, row_count, batch_size):
        upper = min(row_count, offset + batch_size)
        logging.info(" * inserting %s to %s", offset, upper)
        batch = format_values(data, field_mapping, offset, upper)
        statement = f"{base_query} VALUES {', '.join(batch)};"
        try:
            db_cursor.execute(statement)
        except psycopg2.Error as err:
            logging.error(err)
            logging.error("No data were imported.")
            sys.exit(1)
def list_datasets_in_bioatlas(cursor: DictCursor) -> list:
    """
    Returns a list of all datasets available in the database where
    `in_bioatlas` is `true`.

    Each entry is a plain dict with the keys `pid` and `dataset_id`.
    """
    query = "SELECT pid, dataset_id FROM public.dataset WHERE in_bioatlas;"
    cursor.execute(query)
    # convert DictRow results into plain dicts
    return [dict(row) for row in cursor.fetchall()]
def create_input_fasta(cursor: DictCursor, filename: str = 'blast_input'):
    """
    Uses the given database `cursor` to extract sequences to be placed in
    `filename` (with a `.fasta` extension appended).
    """
    logging.info("Creating input fasta file: %s.fasta", filename)
    # FIX: the output file name was hard-coded (the f-string had no
    # placeholder), silently ignoring the `filename` parameter that the
    # log message above advertises.
    with open(f'{filename}.fasta', 'w') as fasta:
        cursor.execute("SELECT * FROM api.app_asvs_for_blastdb;")
        for asv_id, taxonomy, sequence in cursor.fetchall():
            # one FASTA record per ASV: ">id-taxonomy" header, then sequence
            fasta.write('>%s-%s\n%s\n' % (asv_id, taxonomy, sequence))
def create_tables(cur: DictCursor, conn: DictConnection):
    """
    Run every statement in `CREATE_TABLE_QUERIES`, committing after each one.

    Parameters
    ----------
    cur
    conn
    """
    for create_query in CREATE_TABLE_QUERIES:
        cur.execute(create_query)
        conn.commit()
def load_staging_tables(cur: DictCursor, conn: DictConnection):
    """
    Copy data from the source S3 bucket into the staging Redshift tables by
    running every statement in `COPY_TABLE_QUERIES`, committing after each.

    Parameters
    ----------
    cur
    conn
    """
    for copy_query in COPY_TABLE_QUERIES:
        cur.execute(copy_query)
        conn.commit()
def insert_tables(cur: DictCursor, conn: DictConnection):
    """
    Populate the Redshift analytical tables from the staging tables by
    running every statement in `INSERT_TABLE_QUERIES`, committing after each.

    Parameters
    ----------
    cur
    conn
    """
    for insert_query in INSERT_TABLE_QUERIES:
        cur.execute(insert_query)
        conn.commit()
def drop_tables(cur: DictCursor, conn: DictConnection):
    """
    Drop each table listed in `DROP_TABLES`, building the DROP statement
    from the `DROP_TABLE_FORMAT` string format and committing after each.

    Parameters
    ----------
    cur
    conn
    """
    for table_name in DROP_TABLES:
        cur.execute(DROP_TABLE_FORMAT.format(table=table_name))
        conn.commit()
def insert_dataset(data: pandas.DataFrame, mapping: dict, db_cursor: DictCursor) -> int:
    """
    Inserts a single dataset into the database, and returns the database
    `pid`.

    Exits the process if `data` does not contain exactly one row or if the
    insert fails.
    """
    base_query, field_mapping = get_base_query(mapping['dataset'])
    if len(data.values) != 1:
        logging.error("There must be exactly one dataset to insert")
        sys.exit(1)
    row_values = format_values(data, field_mapping, 0, 1)
    insert_query = f"{base_query} VALUES {', '.join(row_values)} RETURNING pid;"
    try:
        db_cursor.execute(insert_query)
    except psycopg2.Error as err:
        logging.error(err)
        logging.error("No data were imported.")
        sys.exit(1)
    return db_cursor.fetchall()[0]['pid']
def insert_asvs(data: pandas.DataFrame, mapping: dict, db_cursor: DictCursor,
                batch_size: int = 1000) -> (pandas.DataFrame, int):
    """
    Inserts asv's into the database, returning the database `pid`'s. Unlike
    the other categories asv conflicts returns the id of the previously
    registered entry.
    """
    base_query, field_mapping = get_base_query(mapping['asv'])
    row_count = len(data.values)
    # get max asv_id before insert (this helps us figure out which asv's were
    # already in the database).
    db_cursor.execute("SELECT MAX(pid) FROM asv;")
    previous_max_pid = db_cursor.fetchone()[0]
    pids = []
    for offset in range(0, row_count, batch_size):
        upper = min(row_count, offset + batch_size)
        logging.info(" * inserting %s to %s", offset, upper)
        batch = format_values(data, field_mapping, offset, upper)
        statement = (f"{base_query} VALUES {', '.join(batch)} "
                     "ON CONFLICT (asv_sequence) DO UPDATE SET pid = asv.pid "
                     "RETURNING pid;")
        try:
            db_cursor.execute(statement)
            pids.extend(row['pid'] for row in db_cursor.fetchall())
        except psycopg2.Error as err:
            logging.error(err)
            logging.error("No data were imported.")
            sys.exit(1)
    # assign pids to data for future joins
    return data.assign(pid=pids), previous_max_pid or 0
def next(self):
    """Advance the wrapped cursor one row, logging the row before returning it."""
    row = _cursor.next(self)
    self.log(row)
    return row
def fetchall(self):
    """Fetch all remaining rows via the wrapped cursor, logging each one."""
    rows = _cursor.fetchall(self)
    for record in rows:
        self.log(record)
    return rows
def fetchone(self):
    """Fetch the next row via the wrapped cursor, logging it before returning."""
    row = _cursor.fetchone(self)
    self.log(row)
    return row
def __execute(self, cursor: DictCursor, query_args: tuple, double_percentage_sign_marker: str,
              print_warnings: bool) -> None:
    """
    Execute statement, set up cursor to results.

    :param cursor: open cursor the statement is executed on; stored in
        ``self.__cursor`` afterwards so callers can fetch the results.
    :param query_args: tuple of the query string followed by its optional
        parameters (tuple or dict).
    :param double_percentage_sign_marker: placeholder substring produced by
        quote()d strings for a literal '%'; replaced with '%%' here.
    :param print_warnings: if True, psycopg2 warnings are logged at WARNING
        level, otherwise at DEBUG.
    :raises McDatabaseResultException: when the query is empty/invalid or
        any psycopg2 error occurs.
    """
    # MC_REWRITE_TO_PYTHON: 'query_args' should be decoded from 'bytes' at this point

    # Basic sanity checks before touching the database.
    if len(query_args) == 0:
        raise McDatabaseResultException('No query or its parameters.')
    if len(query_args[0]) == 0:
        raise McDatabaseResultException('Query is empty or undefined.')

    try:
        if len(query_args) == 1:
            # If only a query without any parameters (tuple or dictionary) are passed, psycopg2 is happy to operate
            # on a single literal '%' because it doesn't even try to do its own interpolation. However, with some
            # parameters present (e.g. a dictionary) psycopg2 then tries to do the interpolation and expects literal
            # '%' to be duplicated ('%%'). To unify the behavior, we always pass a parameter (even if it's empty)
            # to execute().
            query_args = (query_args[0], {},)

        query = query_args[0]

        # Duplicate '%' everywhere except for psycopg2 parameter placeholders ('%s' and '%(...)s')
        query = re.sub(r'%(?!(s|\(.*?\)s?))', '%%', query)

        # Replace percentage signs coming from quote()d strings with double percentage signs
        query = query.replace(double_percentage_sign_marker, '%%')

        # Rebuild the args tuple with the rewritten query string in slot 0.
        query_args_list = list(query_args)
        query_args_list[0] = query
        query_args = tuple(query_args_list)

        log.debug("Running query: %s" % str(query_args))

        # Time the query so slow ones (>= 1 second) can be reported.
        t = time.time()
        cursor.execute(*query_args)
        query_time = time.time() - t
        if (query_time >= 1):
            # Shorten both query text and parameters to keep the log line readable.
            query_text = textwrap.shorten(str(query_args[0]), width=80)
            query_params = textwrap.shorten(str(query_args[1:]), width=80)
            log.info("Slow query (%d seconds): %s, %s" % (query_time, query_text, query_params))

    except psycopg2.Warning as ex:
        # Warnings are non-fatal; severity of the log line depends on the caller's choice.
        if print_warnings:
            log.warning('Warning while running query: %s' % str(ex))
        else:
            log.debug('Warning while running query: %s' % str(ex))

    except psycopg2.ProgrammingError as ex:
        raise McDatabaseResultException(
            'Invalid query: %(exception)s; query: %(query)s' % {
                'exception': str(ex),
                'query': str(query_args),
            })

    except psycopg2.Error as ex:
        # Try to include the mogrified (fully interpolated) query in the error for easier debugging.
        try:
            mogrified_query = cursor.mogrify(*query_args)
        except Exception as ex:
            # Can't mogrify
            raise McDatabaseResultException(
                'Query failed: %(exception)s; query: %(query)s' % {
                    'exception': str(ex),
                    'query': str(query_args),
                })
        else:
            raise McDatabaseResultException(
                'Query failed: %(exception)s; query: %(query)s; mogrified query: %(mogrified_query)s' % {
                    'exception': str(ex),
                    'query': str(query_args),
                    'mogrified_query': str(mogrified_query),
                })

    except Exception as ex:
        raise McDatabaseResultException(
            'Invalid query (DBD::Pg -> psycopg2 query conversion?): %(exception)s; query: %(query)s' % {
                'exception': str(ex),
                'query': str(query_args),
            })

    self.__cursor = cursor  # Cursor now holds results
def __execute(self, cursor: DictCursor, query_args: tuple, double_percentage_sign_marker: str,
              print_warnings: bool) -> None:
    """
    Execute statement, set up cursor to results.

    :param cursor: open cursor the statement is executed on; stored in
        ``self.__cursor`` afterwards so callers can fetch the results.
    :param query_args: tuple of the query string followed by its optional
        parameters (tuple or dict).
    :param double_percentage_sign_marker: placeholder substring produced by
        quote()d strings for a literal '%'; replaced with '%%' here.
    :param print_warnings: if True, psycopg2 warnings are logged at WARNING
        level, otherwise at DEBUG.
    :raises McDatabaseResultException: for invalid queries / generic errors.
    :raises McTransactionRollbackError: for transaction rollbacks (retryable).
    :raises McTupleAlreadyMovedError: for the partition-move variant of a
        transaction rollback.
    """
    # MC_REWRITE_TO_PYTHON: 'query_args' should be decoded from 'bytes' at this point

    # Basic sanity checks before touching the database.
    if len(query_args) == 0:
        raise McDatabaseResultException('No query or its parameters.')
    if len(query_args[0]) == 0:
        raise McDatabaseResultException('Query is empty or undefined.')

    try:
        if len(query_args) == 1:
            # If only a query without any parameters (tuple or dictionary) are passed, psycopg2 is happy to operate
            # on a single literal '%' because it doesn't even try to do its own interpolation. However, with some
            # parameters present (e.g. a dictionary) psycopg2 then tries to do the interpolation and expects literal
            # '%' to be duplicated ('%%'). To unify the behavior, we always pass a parameter (even if it's empty)
            # to execute().
            query_args = (query_args[0], {},)

        query = query_args[0]

        # Duplicate '%' everywhere except for psycopg2 parameter placeholders ('%s' and '%(...)s')
        query = re.sub(r'%(?!(s|\(.*?\)s?))', '%%', query)

        # Replace percentage signs coming from quote()d strings with double percentage signs
        query = query.replace(double_percentage_sign_marker, '%%')

        # Rebuild the args tuple with the rewritten query string in slot 0.
        query_args_list = list(query_args)
        query_args_list[0] = query
        query_args = tuple(query_args_list)

        log.debug("Running query: %s" % str(query_args))

        # Time the query so slow ones (>= 1 second) can be reported.
        t = time.time()
        cursor.execute(*query_args)
        query_time = time.time() - t
        if query_time >= 1:
            # Shorten both query text and parameters to keep the log line readable.
            query_text = textwrap.shorten(str(query_args[0]), width=80)
            query_params = textwrap.shorten(str(query_args[1:]), width=80)
            log.info("Slow query (%d seconds): %s, %s" % (query_time, query_text, query_params))

    except psycopg2.Warning as ex:
        # Warnings are non-fatal; severity of the log line depends on the caller's choice.
        if print_warnings:
            log.warning('Warning while running query: %s' % str(ex))
        else:
            log.debug('Warning while running query: %s' % str(ex))

    except psycopg2.ProgrammingError as ex:
        # Truncate the query text to 1024 characters so the exception message stays manageable.
        message = (query_args[0][0:1024], query_args[1:])
        raise McDatabaseResultException(
            'Invalid query: %(exception)s; query: %(query)s' % {
                'exception': str(ex),
                'query': str(message),
            })

    except psycopg2.Error as ex:
        # Pick the most specific exception class so callers can retry rollbacks.
        # NOTE(review): nesting of the partition-move check inside the rollback
        # check is reconstructed from the collapsed source — confirm against the
        # original file.
        ex_class = McDatabaseResultException
        if isinstance(ex, TransactionRollbackError):
            ex_class = McTransactionRollbackError

            if 'tuple to be locked was already moved to another partition due to concurrent update' in str(ex):
                ex_class = McTupleAlreadyMovedError

        # Try to include the mogrified (fully interpolated) query in the error for easier debugging.
        try:
            mogrified_query = cursor.mogrify(*query_args)
        except Exception as ex:
            # Can't mogrify
            raise ex_class(
                'Query failed: %(exception)s; query: %(query)s' % {
                    'exception': str(ex),
                    'query': str(query_args),
                })
        else:
            raise ex_class(
                'Query failed: %(exception)s; query: %(query)s; mogrified query: %(mogrified_query)s' % {
                    'exception': str(ex),
                    'query': str(query_args),
                    'mogrified_query': str(mogrified_query),
                })

    except Exception as ex:
        # Truncate the query text to 1024 characters so the exception message stays manageable.
        message = (query_args[0][0:1024], query_args[1:])
        raise McDatabaseResultException(
            'Invalid query (DBD::Pg -> psycopg2 query conversion?): %(exception)s; query: %(query)s' % {
                'exception': str(ex),
                'query': str(message),
            })

    self.__cursor = cursor  # Cursor now holds results
def __execute(self, cursor: DictCursor, query_args: tuple, double_percentage_sign_marker: str,
              print_warnings: bool) -> None:
    """
    Execute statement, set up cursor to results.

    :param cursor: open cursor the statement is executed on; stored in
        ``self.__cursor`` afterwards so callers can fetch the results.
    :param query_args: tuple of the query string followed by its optional
        parameters (tuple or dict).
    :param double_percentage_sign_marker: placeholder substring produced by
        quote()d strings for a literal '%'; replaced with '%%' here.
    :param print_warnings: if True, psycopg2 warnings are logged at WARNING
        level, otherwise at DEBUG.
    :raises McDatabaseResultException: when the query is empty/invalid or
        any psycopg2 error occurs.
    """
    # MC_REWRITE_TO_PYTHON: 'query_args' should be decoded from 'bytes' at this point

    if len(query_args) == 0:
        raise McDatabaseResultException('No query or its parameters.')
    if len(query_args[0]) == 0:
        raise McDatabaseResultException('Query is empty or undefined.')

    try:
        if len(query_args) == 1:
            # If only a query without any parameters (tuple or dictionary) are passed, psycopg2 is happy to operate
            # on a single literal '%' because it doesn't even try to do its own interpolation. However, with some
            # parameters present (e.g. a dictionary) psycopg2 then tries to do the interpolation and expects literal
            # '%' to be duplicated ('%%'). To unify the behavior, we always pass a parameter (even if it's empty)
            # to execute().
            query_args = (query_args[0], {},)

        query = query_args[0]

        # Duplicate '%' everywhere except for psycopg2 parameter placeholders ('%s' and '%(...)s').
        # FIX: use a raw string for the pattern — the original non-raw string contained the
        # invalid escape sequence '\(' (a DeprecationWarning today, a SyntaxError in future
        # Python versions); the sibling implementations of this method already use r'...'.
        query = re.sub(r'%(?!(s|\(.*?\)s?))', '%%', query)

        # Replace percentage signs coming from quote()d strings with double percentage signs
        query = query.replace(double_percentage_sign_marker, '%%')

        # Rebuild the args tuple with the rewritten query string in slot 0.
        query_args_list = list(query_args)
        query_args_list[0] = query
        query_args = tuple(query_args_list)

        log.debug("Running query: %s" % str(query_args))

        cursor.execute(*query_args)

    except psycopg2.Warning as ex:
        # Warnings are non-fatal; severity of the log line depends on the caller's choice.
        if print_warnings:
            log.warning('Warning while running query: %s' % str(ex))
        else:
            log.debug('Warning while running query: %s' % str(ex))

    except psycopg2.ProgrammingError as ex:
        raise McDatabaseResultException(
            'Invalid query: %(exception)s; query: %(query)s' % {
                'exception': str(ex),
                'query': str(query_args),
            })

    except psycopg2.Error as ex:
        # Try to include the mogrified (fully interpolated) query in the error for easier debugging.
        try:
            mogrified_query = cursor.mogrify(*query_args)
        except Exception as ex:
            # Can't mogrify
            raise McDatabaseResultException(
                'Query failed: %(exception)s; query: %(query)s' % {
                    'exception': str(ex),
                    'query': str(query_args),
                })
        else:
            raise McDatabaseResultException(
                'Query failed: %(exception)s; query: %(query)s; mogrified query: %(mogrified_query)s' % {
                    'exception': str(ex),
                    'query': str(query_args),
                    'mogrified_query': str(mogrified_query),
                })

    except Exception as ex:
        raise McDatabaseResultException(
            'Invalid query (DBD::Pg -> psycopg2 query conversion?): %(exception)s; query: %(query)s' % {
                'exception': str(ex),
                'query': str(query_args),
            })

    self.__cursor = cursor  # Cursor now holds results
def fetchmany(self, size=None):
    """
    Fetch up to `size` rows from the wrapped cursor.

    FIX: forward the caller's `size` argument — the original hard-coded
    `size=None`, silently ignoring the requested batch size (psycopg2 then
    falls back to `arraysize` for every call).
    """
    res = _cursor.fetchmany(self, size=size)
    return res