Code example #1 (score: 0)
File: duplicated.py — Project: ploomber/ploomber
def assert_no_duplicates_in_column(client,
                                   col: Union[str, List[str]],
                                   product,
                                   stats=False):
    """
    Assert there are no duplicates in a column (or group of columns). If there
    are duplicates, it raises an AssertionError with an error message showing
    some of the duplicated values

    Parameters
    ----------
    client
        Database client; must expose a DB-API ``connection`` attribute
    col : str or list of str
        Column (or group of columns) to check for duplicates
    product
        Table to check (rendered verbatim into the generated SQL)
    stats : bool, default=False
        Whether to show duplicates stats in the error message or not

    Raises
    ------
    AssertionError
        If duplicated values exist for the selected column(s)
    """
    duplicates_query = _duplicates_query(col, product)

    # preview at most 10 duplicated groups
    sql = Template("""
    {{query}}
    LIMIT 10
    """).render(query=duplicates_query)

    cur = client.connection.cursor()

    try:
        cur.execute(sql)
        output = cur.fetchall()

        if not len(output):
            # no duplicates: nothing to report
            return

        # read .description before closing: accessing it after close is
        # undefined behavior per PEP 249
        names = [t[0] for t in cur.description]
    finally:
        # BUG FIX: the original never closed this cursor (leak)
        cur.close()

    table = tabulate(output, headers=names)
    cols = ','.join(_make_iterable(col))

    # fetch the full rows belonging to the duplicated groups so the error
    # message shows concrete examples
    sql_sample_rows = Template("""
    WITH duplicated AS (
        {{sql}}
    )
    SELECT t.*
    FROM {{product}} AS t
    JOIN duplicated
    USING ({{cols}})
    ORDER BY {{cols}}
    """).render(sql=sql, product=product, cols=cols)

    cur = client.connection.cursor()

    try:
        cur.execute(sql_sample_rows)
        output = cur.fetchall()
        # BUG FIX: the original called cur.close() *before* reading
        # cur.description, which PEP 249 leaves undefined
        names = [t[0] for t in cur.description]
    finally:
        cur.close()

    table_sample = tabulate(output, headers=names)

    msg = f'Duplicates found.\n\n{table}\n\n{table_sample}'

    if stats:
        n_rows, n_unique, n_duplicates = duplicates_stats(
            client, col, product)

        msg += (f'\n\nNumber of rows: {n_rows:,}\n'
                f'Number of unique values: {n_unique:,}\n'
                f'Number of duplicates: {n_duplicates:,}')

    raise AssertionError(msg)
Code example #2 (score: 0)
File: TaskSpec.py — Project: israelrico007/ploomber
    def to_task(self, dag):
        """Converts the spec to a Task instance and adds it to the dag
        """
        kwargs = copy(self.data)
        upstream = _make_iterable(kwargs.pop('upstream'))
        class_ = kwargs.pop('class')

        product = init_product(kwargs, self.meta, class_, self.project_root)

        _init_client(kwargs)

        source = kwargs.pop('source')
        name = kwargs.pop('name', None)

        # hooks are attached to the task instance after construction
        hooks = {
            key: kwargs.pop(key, None)
            for key in ('on_finish', 'on_render', 'on_failure')
        }

        # replace dotted path strings with the callables they point to
        for key in ('serializer', 'unserializer'):
            if key in kwargs:
                kwargs[key] = dotted_path.load_callable_dotted_path(
                    kwargs[key])

        # edge case: if using lazy_import, we should not check if the kernel
        # is installed. this is used when exporting to Argo/Airflow using
        # soopervisor, since the exporting process should not require to have
        # the ir kernel installed. The same applies when Airflow has to convert
        # the DAG, the Airflow environment shouldn't require the ir kernel
        if (class_ == tasks.NotebookRunner and self.lazy_import
                and 'check_if_kernel_installed' not in kwargs):
            kwargs['check_if_kernel_installed'] = False

        try:
            task = class_(source=source,
                          product=product,
                          name=name,
                          dag=dag,
                          **kwargs)
        except Exception as e:
            msg = f'Error initializing Task from {self!r}. Error: {e.args[0]}'
            e.args = (msg, )
            raise

        for attr, value in hooks.items():
            if value:
                setattr(task, attr,
                        dotted_path.load_callable_dotted_path(value))

        return task, upstream
Code example #3 (score: 0)
File: duplicated.py — Project: ploomber/ploomber
def _duplicates_query(col, product):
    """Build a SQL query that returns one row per duplicated value (or group
    of values) along with how many extra copies of it exist (n_duplicates)
    """
    grouping = ','.join(_make_iterable(col))

    template = """
    SELECT {{cols}}, COUNT(*) - 1 AS n_duplicates
    FROM {{product}}
    GROUP BY {{cols}}
    HAVING COUNT(*) > 1
    """

    return Template(template).render(cols=grouping, product=product)
Code example #4 (score: 0)
File: taskspec.py — Project: ploomber/ploomber
    def to_task(self, dag):
        """
        Convert the spec to a Task or TaskGroup and add it to the dag.
        Returns a (task, upstream) tuple with the Task instance and list of
        upstream dependencies (as described in the 'upstream' key, if any,
        empty if no 'upstream' key). If the spec has a 'grid' key, a TaskGroup
        instance instead

        Parameters
        ----------
        dag
            The DAG to add the task(s) to
        """
        data = copy(self.data)
        upstream = _make_iterable(data.pop('upstream'))

        # plain (non-grid) spec: build a single Task
        if 'grid' not in data:
            task = _init_task(data=data,
                              meta=self.meta,
                              project_root=self.project_root,
                              lazy_import=self.lazy_import,
                              dag=dag)
            return task, upstream

        if 'name' not in data:
            raise KeyError(f'Error initializing task with spec {data!r}: '
                           "tasks with 'grid' must have a 'name'")

        task_class = data.pop('class')
        product_class = _find_product_class(task_class, data, self.meta)
        product = data.pop('product')
        name = data.pop('name')
        grid = data.pop('grid')

        # TODO: support for hooks
        group = TaskGroup.from_grid(task_class=task_class,
                                    product_class=product_class,
                                    product_primitive=product,
                                    task_kwargs=data,
                                    dag=dag,
                                    name=name,
                                    grid=grid,
                                    resolve_relative_to=self.project_root)
        return group, upstream
Code example #5 (score: 0)
File: duplicated.py — Project: ploomber/ploomber
def duplicates_stats(client, col: Union[str, List[str]], product):
    """Get stats on rows with duplicated values

    Parameters
    ----------
    client
        Database client used to run the queries
    col : str or list of str
        Column (or group of columns) used to detect duplicates
    product
        Table to check (rendered verbatim into the generated SQL)

    Returns
    -------
    n_rows
        Number of rows in product
    n_unique
        Number of unique values (for selected columns) in product
    n_duplicates
        Number of rows with duplicated values (this is equal as the number
        of rows we'd have to drop to remove duplicates)
    """
    cols = ','.join(_make_iterable(col))

    # num of rows in product
    n_rows = _query(
        client,
        Template('SELECT COUNT(*) FROM {{product}}').render(product=product))

    # num of unique values (using all columns)
    n_unique = _query(
        client,
        Template('SELECT COUNT(DISTINCT({{cols}})) FROM {{product}}').render(
            product=product, cols=cols))

    # FIX: dropped the unused product= and cols= render arguments - this
    # template only references {{sql}}
    sql_n_duplicates = Template("""
        WITH duplicated AS (
            {{sql}}
        )
        SELECT SUM(n_duplicates) FROM duplicated
        """).render(sql=_duplicates_query(col, product))

    # num of duplicated rows (number of rows we have to drop to remove all
    # duplicates)
    n_duplicates = _query(client, sql_n_duplicates)

    return n_rows, n_unique, n_duplicates
Code example #6 (score: 0)
File: taskspec.py — Project: cxz/ploomber
    def to_task(self, dag):
        """
        Convert the spec to a Task or TaskGroup and add it to the dag.
        Returns a (task, upstream) tuple with the Task instance and list of
        upstream dependencies (as described in the 'upstream' key, if any,
        empty if no 'upstream' key). If the spec has a 'grid' key, a TaskGroup
        instance instead

        Parameters
        ----------
        dag
            The DAG to add the task(s) to
        """
        data = copy(self.data)
        upstream = _make_iterable(data.pop('upstream'))

        # plain (non-grid) spec: build a single Task
        if 'grid' not in data:
            return _init_task(data=data,
                              meta=self.meta,
                              project_root=self.project_root,
                              lazy_import=self.lazy_import,
                              dag=dag), upstream

        # human-readable identifier for error messages: prefer the callable's
        # __name__ when the source is a function
        raw_source = data["source"]
        data_source = str(getattr(raw_source, '__name__', raw_source))

        if 'params' in data:
            raise DAGSpecInitializationError(
                'Error initializing task with '
                f'source {data_source!r}: '
                "'params' is not allowed when using 'grid'")

        if 'name' not in data:
            raise DAGSpecInitializationError(
                f'Error initializing task with '
                f'source {data_source!r}: '
                "tasks with 'grid' must have a 'name'")

        task_class = data.pop('class')
        product_class = _find_product_class(task_class, data, self.meta)
        product = data.pop('product')
        name = data.pop('name')
        grid = data.pop('grid')

        # wrap each declared hook in a DottedPath (None when absent)
        hooks = {}

        for hook_name in ('on_render', 'on_finish', 'on_failure'):
            hook = data.pop(hook_name, None)

            if hook:
                hook = dotted_path.DottedPath(hook,
                                              lazy_load=self.lazy_import)

            hooks[hook_name] = hook

        group = TaskGroup.from_grid(task_class=task_class,
                                    product_class=product_class,
                                    product_primitive=product,
                                    task_kwargs=data,
                                    dag=dag,
                                    name=name,
                                    grid=grid,
                                    resolve_relative_to=self.project_root,
                                    **hooks)
        return group, upstream