def build_and_then_import_data(connection, table, primary_key, columns,
                               excludes, search, total_count, chunk_size,
                               verbose=False, dry_run=False):
    """
    Select all data from a table, anonymize it in batches and import the
    result back into the original table via a temporary table.

    :param connection: A database connection instance.
    :param str table: Name of the table to retrieve the data.
    :param str primary_key: Table primary key.
    :param list columns: A list of table fields.
    :param list[dict] excludes: A list of exclude definitions.
    :param str search: A SQL WHERE (search_condition) to filter and keep only
        the searched rows.
    :param int total_count: The amount of rows for the current table.
    :param int chunk_size: Number of data rows to fetch with the cursor.
    :param bool verbose: Display logging information and a progress bar.
    :param bool dry_run: Script is running in dry-run mode, no commit expected.
    """
    column_names = get_column_names(columns)
    sql_columns = SQL(', ').join(
        [Identifier(column_name) for column_name in [primary_key] + column_names])
    sql_select = SQL('SELECT {columns} FROM {table}').format(
        table=Identifier(table), columns=sql_columns)
    if search:
        # NOTE: the search condition is interpolated as raw SQL, so it must
        # come from a trusted source.
        sql_select = Composed([
            sql_select,
            SQL(" WHERE {search_condition}".format(search_condition=search))])
    if dry_run:
        sql_select = Composed([sql_select, SQL(" LIMIT 100")])
    cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor,
                               name='fetch_large_result')
    cursor.execute(sql_select.as_string(connection))
    temp_table = 'tmp_{table}'.format(table=table)
    create_temporary_table(connection, columns, table, temp_table, primary_key)
    batches = int(math.ceil((1.0 * total_count) / (1.0 * chunk_size)))
    for i in trange(batches,
                    desc="Processing {} batches for {}".format(batches, table),
                    disable=not verbose):
        records = cursor.fetchmany(size=chunk_size)
        if records:
            data = parmap.map(process_row, records, columns, excludes,
                              pm_pbar=verbose)
            import_data(connection, temp_table, [primary_key] + column_names,
                        filter(None, data))
    apply_anonymized_data(connection, temp_table, table, primary_key, columns)
    cursor.close()
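# A minimal, self-contained sketch of the pattern used above: compose the SELECT
# with SQL/Identifier, stream it through a named (server-side) cursor, and fetch
# in chunks. The DSN, table, and column names are illustrative assumptions, not
# part of the original code.
import psycopg2
import psycopg2.extras
from psycopg2.sql import SQL, Identifier

conn = psycopg2.connect('dbname=demo')  # hypothetical DSN
sql_select = SQL('SELECT {columns} FROM {table}').format(
    columns=SQL(', ').join([Identifier('id'), Identifier('email')]),
    table=Identifier('customer'))
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                     name='fetch_large_result')
cursor.execute(sql_select.as_string(conn))
while True:
    records = cursor.fetchmany(size=2000)
    if not records:
        break
    for record in records:
        pass  # process/anonymize each row here
cursor.close()
conn.close()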
def load(self):
    with self.conn.cursor() as cursor:
        headers = self.headers.keys()
        stmt = SQL("COPY {} ({}) FROM STDIN WITH CSV HEADER NULL AS ''")
        columns = Composed([Identifier(c) for c in headers])
        columns = columns.join(', ')
        stmt = stmt.format(Identifier(self.table), columns)
        # print stmt.as_string(cursor)
        cursor.copy_expert(stmt, self.proxy)
    self.conn.commit()
def load(self, fh, delimiter):
    with self.conn.cursor() as cursor:
        headers = list(self.headers.keys())
        stmt = SQL("COPY {} ({}) FROM STDIN "
                   "WITH CSV HEADER DELIMITER AS {} NULL AS ''")
        columns = Composed([Identifier(c) for c in headers])
        columns = columns.join(', ')
        stmt = stmt.format(Identifier(self.table), columns, Literal(delimiter))
        print(stmt.as_string(cursor))
        cursor.copy_expert(stmt, fh)
    self.conn.commit()
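# The same COPY composition in isolation, with concrete table and column names
# assumed for illustration. Composed.join() interleaves the separator between
# the wrapped identifiers, and Literal() safely quotes the delimiter value.
from psycopg2.sql import SQL, Composed, Identifier, Literal

stmt = SQL("COPY {} ({}) FROM STDIN WITH CSV HEADER DELIMITER AS {} NULL AS ''")
columns = Composed([Identifier(c) for c in ('id', 'name', 'email')]).join(', ')
stmt = stmt.format(Identifier('person'), columns, Literal(';'))
# stmt.as_string(conn) renders (given an open connection `conn`):
#   COPY "person" ("id", "name", "email") FROM STDIN
#   WITH CSV HEADER DELIMITER AS ';' NULL AS ''
# As in the loaders above, copy_expert() accepts the Composable directly:
#   cursor.copy_expert(stmt, open('people.csv'))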
def sql_with_cte(table: Table, cte_deps: List[Table], **kwargs) -> Composed:
    """Build Table query with CTEs. cte_deps must be topologically sorted."""
    names = {t: Identifier("_cte%d" % i) for i, t in enumerate(cte_deps)}
    return Composed([
        SQL("WITH "),
        SQL("\n , ").join(Composed(
            Composed([n, SQL(" AS ("),
                      sql_with_named_deps(t, names, **kwargs), SQL(")")])
            for t, n in names.items())),
        SQL("\n"),
        sql_with_named_deps(table, names, **kwargs)
    ])
def sql_with_named_deps(table: Table, deps: Mapping[Table, Identifier],
                        **kwargs) -> Composed:
    """Build Table query relying on given Identifiers for dependencies."""
    return table.sql.format(
        **{s: Composed([deps[t], SQL(" AS "), Identifier(s)])
           for s, t in table.source_tables.items()},
        **kwargs)
def sql_with_subqueries(table: Table, **kwargs) -> Composed:
    """Recursively build Table query with dependencies as sub-queries."""
    return table.sql.format(
        **{s: Composed([SQL("("), sql_with_subqueries(t, **kwargs),
                        SQL(") AS "), Identifier(s)])
           for s, t in table.source_tables.items()},
        **kwargs)
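# A toy illustration of the three builders above. The real Table class is not
# shown here, so this stand-in is an assumption: it only needs a `sql` Composable
# and a `source_tables` mapping of placeholder name -> Table.
from psycopg2.sql import SQL

class Table:
    def __init__(self, sql, source_tables=None):
        self.sql = SQL(sql)
        self.source_tables = source_tables or {}

base = Table("SELECT id FROM raw_events")
agg = Table("SELECT count(*) AS n FROM {src}", {"src": base})

# Rendered with an open connection `conn` (cte_deps topologically sorted):
print(sql_with_cte(agg, [base]).as_string(conn))
#   WITH "_cte0" AS (SELECT id FROM raw_events)
#   SELECT count(*) AS n FROM "_cte0" AS "src"
print(sql_with_subqueries(agg).as_string(conn))
#   SELECT count(*) AS n FROM (SELECT id FROM raw_events) AS "src"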
def index_action(self, columns, action='add', where='', table=None):
    if isinstance(columns, str):
        columns = [columns]

    def stringify(column):
        if isinstance(column, str):
            return column
        else:
            return ('_'.join(map(str, (column,) + column.params))
                    .replace('"', '').replace('%s', '__'))

    name = [table or self.table_name]
    name.append('_'.join(map(stringify, columns)))
    if where:
        name.append('+where')
        name.append(stringify(where))
    name.append('index')
    index_name = self.convert_name('_'.join(name))
    with Transaction().connection.cursor() as cursor:
        if action == 'add':
            if index_name in self._indexes:
                return
            params = sum(
                (c.params for c in columns if hasattr(c, 'params')), ())
            if where:
                params += where.params
                where = ' WHERE %s' % where
            cursor.execute(
                Composed([
                    SQL('CREATE INDEX {name} ON {table} ({columns})').format(
                        name=Identifier(index_name),
                        table=Identifier(self.table_name),
                        columns=SQL(',').join(
                            (Identifier(c) if isinstance(c, str)
                             else SQL(str(c)))
                            for c in columns)),
                    SQL(where)
                ]),
                params)
            self._update_definitions(indexes=True)
        elif action == 'remove':
            if len(columns) == 1 and isinstance(columns[0], str):
                if self._field2module.get(
                        columns[0], self.module_name) != self.module_name:
                    return
            if index_name in self._indexes:
                cursor.execute(
                    SQL('DROP INDEX {}').format(Identifier(index_name)))
                self._update_definitions(indexes=True)
        else:
            raise Exception('Index action not supported!')
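# A trimmed sketch of the CREATE INDEX composition above, with concrete names
# assumed for illustration. Note the two formatting layers: the WHERE fragment
# is interpolated with Python's % first, and any %s markers left inside the
# expression itself are then bound as query parameters at execute() time.
from psycopg2.sql import SQL, Composed, Identifier

where_expr = '"active" = %s'  # hypothetical partial-index condition
params = (True,)
stmt = Composed([
    SQL('CREATE INDEX {name} ON {table} ({columns})').format(
        name=Identifier('party_name_index'),
        table=Identifier('party'),
        columns=SQL(',').join([Identifier('name')])),
    SQL(' WHERE %s' % where_expr),
])
# cursor.execute(stmt, params) binds True into the remaining %s marker.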
def update(self, an_id: int = None, where_key: str = None, name: str = None,
           data=None, notes: str = None, modified_by: str = None,
           created_by: str = None, my_conn: Optional[dict] = None,
           t_log: Optional[TimeLogger] = None, verbose: bool = None):
    """Update an existing record in grouped_physics_object."""
    if my_conn is None:
        my_conn = self.my_conn
    else:
        self.my_conn = my_conn

    if verbose is True and t_log is None:
        t_log = TimeLogger()

    my_conn = my_connect(my_conn=my_conn, t_log=t_log, verbose=verbose)
    conn = my_conn['conn']
    db_params = my_conn['db_params']

    if where_key is None:
        where_key = self.id_name()

    if an_id is None:
        warn("No Record ID Specified", NoRecordIDError)
    else:
        if data is None:
            data = {}

        data.update(add_field('name', name))
        data.update(add_field('notes', notes))
        data.update(add_field('created_by', created_by))

        # If there is no data, then skip. Of course one could still change
        # modified_by:
        if len(data) > 0 or modified_by is not None:
            # Always require a modified_by, because data can change without a
            # modifier being specified. We don't check this before the previous
            # if, since we don't want to set modified_by when neither data nor
            # modified_by was given.
            if modified_by is None:
                modified_by = db_params['user']
            data.update(modified_by=modified_by)

            fields = data.keys()
            sql = "UPDATE {table} SET {fields} WHERE {pkey} = {a_value}"

            if verbose:
                print('Data:\n', data)
                print('\nFields:\n', fields)

            query = SQL(sql).format(
                table=Identifier(self.table_name),
                fields=SQL(', ').join(
                    Composed([Identifier(k), SQL(' = '), Placeholder(k)])
                    for k in fields),
                pkey=Identifier(where_key),
                a_value=Placeholder('where_key'))

            data.update(where_key=an_id)
            cur = conn.cursor(cursor_factory=NamedTupleCursor)

            if verbose:
                print(query.as_string(conn))
                print(cur.mogrify(query, data))

            try:
                cur.execute(query, data)
            except OperationalError as error:
                print(error)

            conn.commit()
            cur.close()
            self.pull_data()
def update(self, an_id: int = None, where_key: str = None, name: str = None,
           data=None, latex: str = None, notes: str = None,
           unit_id: int = None, image: bytes = None, template_id: int = None,
           dimensions: int = None, modified_by: str = None,
           created_by: str = None, verbose: bool = None):
    """Update an existing record in math_object."""
    if where_key is None:
        where_key = self.id_name

    if an_id is None:
        warn("No Record ID Specified", NoRecordIDError)
    else:
        if data is None:
            data = {}

        db_params = config()

        data.update(self._add_field('name', name))
        data.update(self._add_field('notes', notes))
        data.update(self._process_latex(latex, image, template_id, verbose))
        data.update(self._add_field('dimensions', dimensions))
        data.update(self._add_field('unit_id', unit_id))
        data.update(self._add_field('created_by', created_by))

        # If there is no data, then skip. Of course one could still change
        # modified_by:
        if len(data) > 0 or modified_by is not None:
            # Always require a modified_by, because data can change without a
            # modifier being specified. We don't check this before the previous
            # if, since we don't want to set modified_by when neither data nor
            # modified_by was given.
            if modified_by is None:
                modified_by = db_params['user']
            data.update(modified_by=modified_by)

            fields = data.keys()
            sql = "UPDATE {table} SET {fields} WHERE {pkey} = {a_value} RETURNING *"

            query = SQL(sql).format(
                table=Identifier(self.table),
                fields=SQL(', ').join(
                    Composed([Identifier(k), SQL(' = '), Placeholder(k)])
                    for k in fields),
                pkey=Identifier(where_key),
                a_value=Placeholder('where_key'))

            data.update(where_key=an_id)
            conn = connect(**db_params)
            cur = conn.cursor(cursor_factory=NamedTupleCursor)

            if verbose:
                print(query.as_string(conn))
                print(cur.mogrify(query, data))

            try:
                cur.execute(query, data)
            except OperationalError as error:
                print(error)

            new_record = cur.fetchall()
            conn.commit()
            cur.close()
            conn.close()

            self.last_inserted = self.as_columns(new_record)
            self.records = self.all_records()
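# The named-Placeholder UPDATE pattern shared by both update() methods, in
# isolation. Table and column names are assumptions; `conn` is an open psycopg2
# connection. With named placeholders, execute() takes a dict of parameters.
from psycopg2.sql import SQL, Composed, Identifier, Placeholder

data = {'name': 'velocity', 'notes': 'renamed for clarity'}
query = SQL("UPDATE {table} SET {fields} WHERE {pkey} = {a_value}").format(
    table=Identifier('math_object'),
    fields=SQL(', ').join(
        Composed([Identifier(k), SQL(' = '), Placeholder(k)]) for k in data),
    pkey=Identifier('id'),
    a_value=Placeholder('where_key'))
# query.as_string(conn):
#   UPDATE "math_object" SET "name" = %(name)s, "notes" = %(notes)s
#   WHERE "id" = %(where_key)s
data.update(where_key=42)
with conn.cursor() as cur:
    cur.execute(query, data)
conn.commit()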