Beispiel #1
0
def mutual_info(when, then, df=None):
    """
    If a df is passed, the mutual_info metric will be run and result returned
    as a list of scores, otherwise an instance of the Task class containing this
    metric wil be returned, to be later run (possibly after adding to it other tasks/metrics).

    :param when: First column on which to compute MI.
    :type when: str/int
    :param then: Second column on which to compute MI.
    :type then: str/int
    :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
        this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
        run later.
    :rtype: list/Task
    """
    # make a dict representing the parameters
    params = {"metric": "mutual_info", "when": when, "then": then}
    # create tak containing parameters
    t = task.Task([params])
    if df is None:
        return t
    else:
        return t.run(df)[0]["scores"]
Beispiel #2
0
def grouprule(columns, having, conditions=None, df=None):
    """
    If a df is passed, the groupRule metric will be run and result returned
    as a list of scores, otherwise an instance of the Task class containing this
    metric wil be returned, to be later run (possibly after adding to it other tasks/metrics).

    :param columns: Columns on which to run the metric, grouping data.
    :param conditions: Conditions on which to run the metric, filtering data before grouping, can be None.
    :type conditions: list
    :param having: Conditions to apply to groups.
    :type having: list
    :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
        this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
        run later.
    :rtype: list/Task
    """
    # make a dict representing the parameters
    params = {"metric": "groupRule", "columns": columns, "having": having}
    if conditions is not None:
        params["conditions"] = conditions
    t = task.Task([params])
    if df is None:
        return t
    else:
        return t.run(df)[0]["scores"]
Beispiel #3
0
def deduplication(columns=None, df=None):
    """
    If a df is passed, the deduplication metric will be run and result returned
    as a list of scores, otherwise an instance of the Task class containing this
    metric wil be returned, to be later run (possibly after adding to it other tasks/metrics).

    :param columns: Columns on which to run the metric, None to run the deduplication
        metric on the whole table (deduplication on rows).
    :type columns: list
    :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
        this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
        run later.
    :rtype: list/Task
    """
    # make a dict representing the parameters
    params = {"metric": "deduplication"}
    if not (columns is None):
        params["columns"] = columns
    t = task.Task([params])
    if df is None:
        return t
    else:
        return t.run(df)[0]["scores"]
Beispiel #4
0
def constraint(when, then, conditions=None, df=None):
    """
    If a df is passed, the constraint metric will be run and result returned
    as a list of scores, otherwise an instance of the Task class containing this
    metric wil be returned, to be later run (possibly after adding to it other tasks/metrics).

    :param when: A list of columns in the df to use as the precondition of a functional constraint. No column
        should be in both when and then.
    :type when: list
    :param then: A list of columns in the df to use as the postcondition of a functional constraint. No column
        should be in both when and then.
    :type then: list
    :param conditions: Conditions on which to filter data before applying the metric.
    :type conditions: list
    :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
        this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
        run later.
    :rtype: list/Task
    """
    # make a dict representing the parameters
    params = {"metric": "constraint", "when": when, "then": then}
    if conditions:
        params["conditions"] = conditions
    t = task.Task([params])
    if df is None:
        return t
    else:
        return t.run(df)[0]["scores"]
Beispiel #5
0
def deduplication_approximated(columns, df=None):
    """
    If a df is passed, the deduplication_approximated metric will be run and result returned
    as a list of scores, otherwise an instance of the Task class containing this
    metric wil be returned, to be later run (possibly after adding to it other tasks/metrics).
    Differently from deduplication, here columns must be specified (deduplication_approximated does not
    work on a whole row level).

    :param columns: Columns on which to run the metric.
    :type columns: list
        :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
    this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
        run later.
    :rtype: list/Task
    """
    # make a dict representing the parameters
    params = {"metric": "deduplication_approximated"}
    if not (columns is None):
        params["columns"] = columns
    t = task.Task([params])
    if df is None:
        return t
    else:
        return t.run(df)[0]["scores"]
Beispiel #6
0
def timeliness(columns, value, df=None, dateFormat=None, timeFormat=None):
    """
    If a df is passed, the timeliness metric will be run and result returned
    as a list of scores, otherwise an instance of the Task class containing this
    metric wil be returned, to be later run (possibly after adding to it other tasks/metrics).
    Use https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html directives to express formats.

    :param columns: Columns on which to run the metric, columns of type string will be casted to timestamp
        using the dateFormat or timeFormat argument.
    :type columns: list
    :param value: Value used to run the metric, confronting values in the specified columns against it.
    :type value: str
    :param dateFormat: Format in which the value (and values in columns, if they are of string type) are; used
        to cast columns if they contain dates as strings. Either dateFormat
        or timeFormat must be passed, but not both. Use https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
        directives to express formats.
    :type dateFormat: str
    :param timeFormat: Format in which the value (and values in columns, if they are of string type) are; used
        to cast columns if they contain dates as strings. Either dateFormat
        or timeFormat must be passed, but not both. Use https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
        directives to express formats.
    :type timeFormat: str
    :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
        this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
        run later.
    :rtype: list/Task
    """
    assert (dateFormat is None or timeFormat is None) and (
        not dateFormat is None or not timeFormat is None
    ), "Pass either a dateFormat or a timeFormat, not both."
    # make a dict representing the parameters
    params = {"metric": "timeliness", "columns": columns, "value": value}
    if dateFormat:
        params["dateFormat"] = dateFormat
    elif timeFormat:
        params["timeFormat"] = timeFormat
    t = task.Task([params])
    if df is None:
        return t
    else:
        return t.run(df)[0]["scores"]
Beispiel #7
0
def rule(conditions, df=None):
    """
    If a df is passed, the rule metric will be run and result returned
    as a list of scores, otherwise an instance of the Task class containing this
    metric wil be returned, to be later run (possibly after adding to it other tasks/metrics).

    :param conditions: Conditions on which to run the metric.
    :type conditions: list
    :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
        this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
    :rtype: list/Task
    """
    # make a dict representing the parameters
    params = {"metric": "rule", "conditions": conditions}
    t = task.Task([params])
    if df is None:
        return t
    else:
        return t.run(df)[0]["scores"]
Beispiel #8
0
def freshness(columns, df=None, dateFormat=None, timeFormat=None):
    """
    If a df is passed, the freshness metric will be run and result returned
    as a list of scores, otherwise an instance of the Task class containing this
    metric wil be returned, to be later run (possibly after adding to it other tasks/metrics).
    Use https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html directives to express formats.

    :param columns: Columns on which to run the metric, columns of type string will be casted to timestamp
        using the dateFormat or timeFormat argument.
    :type columns: list
    :param dateFormat: Format in which the values in columns are if those columns are of type string; otherwise they must
        be of type date or timestamp. Use this parameter if you are interested in a result in terms of days.
        Either dateFormat or timeFormat must be passed, but not both.
        Use https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html directives to express formats.
    :type dateFormat: str
    :param timeFormat: Format in which the values in columns are if those columns are of type string; otherwise they must
        be of type timestamp. Use this parameter if you are interested in results in terms of seconds.
        Either dateFormat or timeFormat must be passed, but not both.
        Use https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html directives to express formats.
    :type timeFormat: str
    :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
        this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
        run later.
    :rtype: list/Task
    """
    # make a dict representing the parameters
    params = {"metric": "freshness", "columns": columns}
    if dateFormat:
        params["dateFormat"] = dateFormat
    elif timeFormat:
        params["timeFormat"] = timeFormat
    t = task.Task([params])
    if df is None:
        return t
    else:
        return t.run(df)[0]["scores"]