def assert_no_duplicates_in_column(client,
                                   col: Union[str, List[str]],
                                   product,
                                   stats=False):
    """Assert there are no duplicates in a column (or group of columns).

    If there are duplicates, raises an AssertionError with an error message
    showing some of the duplicated values and a sample of the offending rows.

    Parameters
    ----------
    client
        Database client; must expose a DB-API ``connection`` attribute
    col : str or list
        Column (or list of columns) to check for duplicates
    product
        Table to check
    stats : bool, default=False
        Whether to show duplicates stats in the error message or not

    Raises
    ------
    AssertionError
        If duplicated values exist in the given column(s)
    """
    duplicates_query = _duplicates_query(col, product)

    sql = Template("""
    {{query}}
    LIMIT 10
    """).render(query=duplicates_query)

    cur = client.connection.cursor()
    cur.execute(sql)
    output = cur.fetchall()

    if len(output):
        names = [t[0] for t in cur.description]
        table = tabulate(output, headers=names)
        cols = ','.join(_make_iterable(col))

        sql_sample_rows = Template("""
        WITH duplicated AS (
            {{sql}}
        )
        SELECT t.*
        FROM {{product}} AS t
        JOIN duplicated
        USING ({{cols}})
        ORDER BY {{cols}}
        """).render(sql=sql, product=product, cols=cols)

        cur = client.connection.cursor()
        cur.execute(sql_sample_rows)
        output = cur.fetchall()
        # read column names *before* closing the cursor: DB-API drivers
        # (e.g. sqlite3) invalidate .description once the cursor is closed,
        # so reading it afterwards fails
        names = [t[0] for t in cur.description]
        cur.close()

        table_sample = tabulate(output, headers=names)

        msg = f'Duplicates found.\n\n{table}\n\n{table_sample}'

        if stats:
            n_rows, n_unique, n_duplicates = duplicates_stats(
                client, col, product)
            msg += (f'\n\nNumber of rows: {n_rows:,}\n'
                    f'Number of unique values: {n_unique:,}\n'
                    f'Number of duplicates: {n_duplicates:,}')

        raise AssertionError(msg)
def to_task(self, dag):
    """Converts the spec to a Task instance and adds it to the dag

    Parameters
    ----------
    dag
        The DAG to add the task to

    Returns
    -------
    tuple
        ``(task, upstream)`` where ``upstream`` is an iterable with the
        task's upstream dependencies (empty if no 'upstream' key)
    """
    task_dict = copy(self.data)
    upstream = _make_iterable(task_dict.pop('upstream'))
    class_ = task_dict.pop('class')

    product = init_product(task_dict, self.meta, class_, self.project_root)

    _init_client(task_dict)

    source = task_dict.pop('source')
    name = task_dict.pop('name', None)

    on_finish = task_dict.pop('on_finish', None)
    on_render = task_dict.pop('on_render', None)
    on_failure = task_dict.pop('on_failure', None)

    if 'serializer' in task_dict:
        task_dict['serializer'] = dotted_path.load_callable_dotted_path(
            task_dict['serializer'])

    if 'unserializer' in task_dict:
        task_dict['unserializer'] = dotted_path.load_callable_dotted_path(
            task_dict['unserializer'])

    # edge case: if using lazy_import, we should not check if the kernel
    # is installed. this is used when exporting to Argo/Airflow using
    # soopervisor, since the exporting process should not require to have
    # the ir kernel installed. The same applies when Airflow has to convert
    # the DAG, the Airflow environment shouldn't require the ir kernel
    if (class_ == tasks.NotebookRunner and self.lazy_import
            and 'check_if_kernel_installed' not in task_dict):
        task_dict['check_if_kernel_installed'] = False

    try:
        task = class_(source=source,
                      product=product,
                      name=name,
                      dag=dag,
                      **task_dict)
    except Exception as e:
        # e.args may be empty (e.g. ``raise ValueError()``); indexing it
        # unconditionally would raise IndexError and mask the real error
        detail = e.args[0] if e.args else e
        msg = f'Error initializing Task from {self!r}. Error: {detail}'
        e.args = (msg, )
        raise

    if on_finish:
        task.on_finish = dotted_path.load_callable_dotted_path(on_finish)

    if on_render:
        task.on_render = dotted_path.load_callable_dotted_path(on_render)

    if on_failure:
        task.on_failure = dotted_path.load_callable_dotted_path(on_failure)

    return task, upstream
def _duplicates_query(col, product):
    """Build SQL that counts the number of duplicates per value

    ``col`` may be a single column name or a list of column names;
    ``product`` is the table to inspect.
    """
    grouped_cols = ','.join(_make_iterable(col))

    template = Template("""
    SELECT {{cols}}, COUNT(*) - 1 AS n_duplicates
    FROM {{product}}
    GROUP BY {{cols}}
    HAVING COUNT(*) > 1
    """)

    return template.render(cols=grouped_cols, product=product)
def to_task(self, dag):
    """
    Convert the spec to a Task or TaskGroup and add it to the dag.
    Returns a (task, upstream) tuple with the Task instance and list of
    upstream dependencies (as described in the 'upstream' key, if any,
    empty if no 'upstream' key). If the spec has a 'grid' key, a TaskGroup
    instance instead

    Parameters
    ----------
    dag
        The DAG to add the task(s) to
    """
    data = copy(self.data)
    upstream = _make_iterable(data.pop('upstream'))

    # no grid: delegate to the regular single-task initializer
    if 'grid' not in data:
        task = _init_task(data=data,
                          meta=self.meta,
                          project_root=self.project_root,
                          lazy_import=self.lazy_import,
                          dag=dag)
        return task, upstream

    if 'name' not in data:
        raise KeyError(f'Error initializing task with spec {data!r}: '
                       'tasks with \'grid\' must have a \'name\'')

    # extract everything TaskGroup.from_grid needs; the remaining keys
    # are forwarded as task keyword arguments
    task_class = data.pop('class')
    product_class = _find_product_class(task_class, data, self.meta)
    product = data.pop('product')
    name = data.pop('name')
    grid = data.pop('grid')

    # TODO: support for hooks
    group = TaskGroup.from_grid(task_class=task_class,
                                product_class=product_class,
                                product_primitive=product,
                                task_kwargs=data,
                                dag=dag,
                                name=name,
                                grid=grid,
                                resolve_relative_to=self.project_root)

    return group, upstream
def duplicates_stats(client, col: Union[str, List[str]], product):
    """Compute stats on rows with duplicated values

    Returns
    -------
    n_rows
        Number of rows in product
    n_unique
        Number of unique values (for selected columns) in product
    n_duplicates
        Number of rows with duplicated values (this is equal as the number
        of rows we'd have to drop to remove duplicates)
    """
    cols = ','.join(_make_iterable(col))

    # build the three queries up front, then run them in order
    query_n_rows = Template('SELECT COUNT(*) FROM {{product}}').render(
        product=product)

    query_n_unique = Template(
        'SELECT COUNT(DISTINCT({{cols}})) FROM {{product}}').render(
            product=product, cols=cols)

    query_n_duplicates = Template("""
    WITH duplicated AS (
        {{sql}}
    )
    SELECT SUM(n_duplicates) FROM duplicated
    """).render(sql=_duplicates_query(col, product),
                product=product,
                cols=cols)

    # total number of rows in product
    n_rows = _query(client, query_n_rows)

    # number of unique values (using the selected columns)
    n_unique = _query(client, query_n_unique)

    # number of duplicated rows (rows we would have to drop to remove all
    # duplicates)
    n_duplicates = _query(client, query_n_duplicates)

    return n_rows, n_unique, n_duplicates
def to_task(self, dag):
    """
    Convert the spec to a Task or TaskGroup and add it to the dag.
    Returns a (task, upstream) tuple with the Task instance and list of
    upstream dependencies (as described in the 'upstream' key, if any,
    empty if no 'upstream' key). If the spec has a 'grid' key, a TaskGroup
    instance instead

    Parameters
    ----------
    dag
        The DAG to add the task(s) to
    """
    data = copy(self.data)
    upstream = _make_iterable(data.pop('upstream'))

    # no grid: delegate to the regular single-task initializer
    if 'grid' not in data:
        return _init_task(data=data,
                          meta=self.meta,
                          project_root=self.project_root,
                          lazy_import=self.lazy_import,
                          dag=dag), upstream

    # human-readable identifier for error messages: the callable's
    # __name__ when the source is a callable, its string form otherwise
    source_raw = data["source"]
    data_source = str(source_raw.__name__ if hasattr(
        source_raw, '__name__') else source_raw)

    if 'params' in data:
        raise DAGSpecInitializationError(
            'Error initializing task with '
            f'source {data_source!r}: '
            '\'params\' is not allowed when using \'grid\'')

    if 'name' not in data:
        raise DAGSpecInitializationError(
            f'Error initializing task with '
            f'source {data_source!r}: '
            'tasks with \'grid\' must have a \'name\'')

    task_class = data.pop('class')
    product_class = _find_product_class(task_class, data, self.meta)
    product = data.pop('product')
    name = data.pop('name')
    grid = data.pop('grid')

    # load hooks; a falsy/missing spec is passed through unchanged
    hooks = {}
    for hook_name in ('on_render', 'on_finish', 'on_failure'):
        spec = data.pop(hook_name, None)
        if spec:
            spec = dotted_path.DottedPath(spec, lazy_load=self.lazy_import)
        hooks[hook_name] = spec

    return TaskGroup.from_grid(task_class=task_class,
                               product_class=product_class,
                               product_primitive=product,
                               task_kwargs=data,
                               dag=dag,
                               name=name,
                               grid=grid,
                               resolve_relative_to=self.project_root,
                               **hooks), upstream