コード例 #1
0
ファイル: distributions.py プロジェクト: georg-un/DataSciTK
def is_not_normally_distributed(data,
                                alpha=0.05,
                                alternative='two-sided',
                                mode='approx',
                                verbose=False):
    """
    Run a Kolmogorov-Smirnov test of the data against scipy's 'norm' distribution and report whether the
    p-value falls below the significance level.

    NOTE(review): no location/scale arguments are passed to the test, so the data is presumably compared
    against the standard normal distribution -- confirm against the scipy.stats.kstest documentation.

    :param data:                1-dimensional numpy array or pandas DataFrame of shape (m, 1).
    :param alpha:               Float. Significance level; must lie strictly between 0 and 1.
    :param alternative:         String. Either 'two-sided', 'less' or 'greater'. Passed on to scipy.stats.kstest.
    :param mode:                String. Either 'approx' or 'asymp'. Passed on to scipy.stats.kstest.
    :param verbose:             True or False. If True, the decision and the p-value are printed.

    :return:                    True if the p-value is smaller than alpha, False if it is greater or equal.

    :raises TypeError:          If alpha, alternative or mode hold a value outside of the allowed ones.
    :raises IOError:            If the p-value compares neither smaller nor greater-or-equal to alpha.

    """

    allowed_alternatives = ['two-sided', 'less', 'greater']
    allowed_modes = ['approx', 'asymp']

    # Validate the data container
    check_numpy_array_pandas_dataframe_series_1d(data, 'data')

    # Validate the significance level
    check_float(alpha, 'alpha')
    if alpha >= 1 or alpha <= 0.0:
        message = "Value for 'alpha' is {0}, but must be a value between 0 and 1."
        raise TypeError(message.format(alpha))

    # Validate the options which are handed over to scipy
    check_string(alternative, 'alternative')
    if alternative not in allowed_alternatives:
        raise TypeError(
            "Value for parameter 'alternative' must be either 'two-sided', 'less' or 'greater'."
        )

    check_string(mode, 'mode')
    if mode not in allowed_modes:
        raise TypeError(
            "Value for parameter 'mode' must be either 'approx' or 'asymp'.")

    check_boolean(verbose, 'verbose')

    # Run the test and keep the p-value
    test_result = st.kstest(data, 'norm', alternative=alternative, mode=mode)
    p_value = test_result.pvalue

    # Decide by comparing the p-value with alpha
    if p_value < alpha:
        if verbose:
            print("Not normally distributed with a p-value of {0}.".format(
                p_value))
        return True

    if p_value >= alpha:
        if verbose:
            print("Normally distributed with a p-value of {0}.".format(
                p_value))
        return False

    # Neither comparison held, so no usable p-value has been returned
    raise IOError("Did not get a p-value for the Kolmogorov-Smirnov-Test.")
コード例 #2
0
ファイル: distributions.py プロジェクト: georg-un/DataSciTK
    def __init__(self, data, n_bins=200, verbose=False):
        """
        Validate the constructor arguments and store them on the instance.

        :param data:        1-dimensional numpy array, pandas Series or pandas Dataframe.
                            A numpy array is wrapped into a pandas Series before it is stored.
        :param n_bins:      Integer (at least 10). Number of bins; capped at the number of entries in data.
        :param verbose:     True or False. Stored as the verbosity flag of the object.

        :raises TypeError:  If n_bins is smaller than 10.

        """

        # Type validation of all arguments
        check_numpy_array_pandas_dataframe_series_1d(data, 'data')
        check_integer(n_bins, 'n_bins')
        check_boolean(verbose, 'verbose')

        # Plain numpy arrays are stored as pandas Series
        stored_data = pd.Series(data) if type(data) is np.ndarray else data

        # Enforce the lower bound of n_bins
        if n_bins < 10:
            raise TypeError(
                "Argument for parameter 'n_bins' must be at least 10.")

        # Keep the inputs; never use more bins than there are entries in the data
        self.data = stored_data
        self.n_bins = min(n_bins, len(stored_data))
        self.verbose = verbose
コード例 #3
0
ファイル: type_ops.py プロジェクト: georg-un/DataSciTK
def get_contained_types(data, unique=True, as_string=True):
    """
    Collect the type of every element of the input.

    :param data:            1-dimensional numpy array or pandas Series
    :param unique:          True or False. If True, the collected types are reduced with numpy.unique.
                            Requires as_string to be True.
    :param as_string:       True or False. If True, each type is returned as its name string
                            (e.g. 'int'), otherwise as the type object itself.

    :return:                Result of numpy.unique over the type names if unique is True,
                            otherwise a list with one entry per element of data.

    :raises TypeError:      If unique is True while as_string is False.

    """

    # Validate the arguments
    check_numpy_array_pandas_series_1d(data, 'data')
    check_boolean(unique, 'unique')
    check_boolean(as_string, 'as_string')

    if not as_string and unique:
        raise TypeError(
            "Parameter 'as_string' cannot be False as long as parameter 'unique' is True."
        )

    # Walk over the elements and record their types
    types_found = []
    for element in data:
        element_type = type(element)
        if as_string:
            # str(type) reads like "<class 'int'>": drop the 8 leading and the 2 trailing characters
            types_found.append(str(element_type)[8:-2])
        else:
            types_found.append(element_type)

    return np.unique(types_found) if unique else types_found
コード例 #4
0
ファイル: type_ops.py プロジェクト: georg-un/DataSciTK
def is_type_homogeneous(input_array, verbose=True):
    """
    Tell whether every value of the input array is of one and the same type.

    :param input_array:     1-dimensional numpy array or pandas Series
    :param verbose:         True or False (default True). If True, the types which were found are printed.

    :return:                True if exactly one type was found, False if more than one type was found.

    :raises IOError:        If no type was found at all.

    """

    # Validate the arguments
    check_numpy_array_pandas_series_1d(input_array, 'input_array')
    check_boolean(verbose, 'verbose')

    # Unique type names of the input
    types_found = get_contained_types(input_array, unique=True, as_string=True)
    number_of_types = types_found.size

    # Exactly one type: homogeneous
    if number_of_types == 1:
        if verbose:
            print('Input array contains the following type: {0}.'.format(
                types_found))
        return True

    # Several types: not homogeneous
    if number_of_types > 1:
        if verbose:
            print('Input array contains the following types {0}.:'.format(
                types_found))
        return False

    raise IOError('No types have been found.')
コード例 #5
0
ファイル: distributions.py プロジェクト: georg-un/DataSciTK
    def plot(self, x_label, title='default', y_label='Frequency', legend=True):
        """
        Plot a histogram of the data and the probability density function of the fitted distribution.

        The histogram is drawn as a density (area normalised to 1) so that it shares the scale of the
        probability density function which is drawn on top of it.

        :param x_label:         String. Title of the x-axis.
        :param title:           String. Title of the plot. If 'default', the default title will be used.
        :param y_label:         String. Title of the y-axis.
        :param legend:          Boolean. Defines if a legend will be shown.

        """

        # Check if input types are valid
        check_string(x_label, 'x_label')
        check_string(y_label, 'y_label')
        check_string(title, 'title')
        check_boolean(legend, 'legend')

        # Get string of additional parameters (rounded, without the list brackets)
        if len(self.arg) > 0:
            parameters = str([round(x, 2) for x in self.arg])[1:-1]
        else:
            parameters = 'None'

        # Set default title
        if title == 'default':
            title = "Histogram of {0} with the theoretical distribution {1}.\nSD: {2}, Mean: {3}, Additional parameters: {4}.".format(
                x_label, self.distribution.name.capitalize(),
                round(self.standard_deviation, 2), round(self.mean, 2),
                parameters)

        # Create main plot.
        # 'density=True' replaces the former 'normed=True', which matplotlib deprecated in 2.1 and
        # removed in 3.1; passing 'normed' fails on current matplotlib versions.
        plt.figure(figsize=(12, 8))
        ax = self.data.plot(kind='hist',
                            bins=50,
                            density=True,
                            alpha=0.5,
                            label='Data',
                            legend=legend)

        # Remember the axis limits of the histogram (with 20% head room on top) before the PDF is drawn
        y_min, y_max = ax.get_ylim()
        y_lim = (y_min, y_max * 1.2)
        x_lim = ax.get_xlim()

        # Get probability density function and plot it
        pdf = _get_pdf(distribution=self.distribution,
                       parameters=self.parameters)
        pdf.plot(lw=2,
                 label=self.distribution.name.capitalize(),
                 legend=legend,
                 ax=ax)

        # Set focus on histogram: restore the limits which the PDF plot may have changed
        plt.ylim(y_lim)
        plt.xlim(x_lim)

        # Set title and labels
        ax.set_title(title)
        ax.set_xlabel(xlabel=x_label)
        ax.set_ylabel(ylabel=y_label)
コード例 #6
0
ファイル: distributions.py プロジェクト: georg-un/DataSciTK
    def plot(self, x_label, title='default', y_label='Frequency', legend=True):
        """
        Plot a histogram of the data and the probability density functions of the n best fitting distributions.

        The histogram is drawn as a density (area normalised to 1) so that it shares the scale of the
        probability density functions which are drawn on top of it.

        :param x_label:         String. Title of the x-axis.
        :param title:           String. Title of the plot. If 'default', the default title will be used.
        :param y_label:         String. Title of the y-axis.
        :param legend:          Boolean. Defines if a legend will be shown.

        """

        # Check if input types are valid
        check_string(x_label, 'x_label')
        check_string(y_label, 'y_label')
        check_string(title, 'title')
        check_boolean(legend, 'legend')

        # Set default title
        if title == 'default':
            title = "Comparison between the best {0} fitting distributions.".format(
                len(self.distributions))

        # Create main plot.
        # 'density=True' replaces the former 'normed=True', which matplotlib deprecated in 2.1 and
        # removed in 3.1; passing 'normed' fails on current matplotlib versions.
        plt.figure(figsize=(12, 8))
        ax = self.data.plot(kind='hist',
                            bins=50,
                            density=True,
                            alpha=0.5,
                            label='Data',
                            legend=legend)

        # Remember the axis limits of the histogram (with 20% head room on top) before the PDFs are drawn
        y_min, y_max = ax.get_ylim()
        y_lim = (y_min, y_max * 1.2)
        x_lim = ax.get_xlim()

        # Plot the best n distributions.
        # Positional access is kept on purpose, since the container type of self.distributions
        # is not visible here.
        for index in range(len(self.distributions)):
            fitted = self.distributions[index]

            # Get PDF of the fitted distribution and plot it into the same axes
            pdf = _get_pdf(distribution=fitted.distribution,
                           parameters=fitted.parameters)
            pdf.plot(lw=2,
                     label=fitted.distribution.name.capitalize(),
                     legend=legend,
                     ax=ax)

        # Set focus on histogram: restore the limits which the PDF plots may have changed
        plt.ylim(y_lim)
        plt.xlim(x_lim)

        # Set title and labels
        ax.set_title(title)
        ax.set_xlabel(xlabel=x_label)
        ax.set_ylabel(ylabel=y_label)
コード例 #7
0
ファイル: type_ops.py プロジェクト: georg-un/DataSciTK
def contains_types(data, types, exclusively=False, verbose=True):
    """
    Check if the input array contains certain types. If exclusively is set to True, check if the input array
    contains ONLY the specified types.

    :param data:                1-dimensional numpy array or pandas Series
    :param types:               string or list of strings. Specifies the types (e.g. ['str', 'int'])
    :param exclusively:         True or False. If set to True, check if ONLY the specified types are present
    :param verbose:             True or False. Set to true for verbose output.

    :return:                    True if all specified types are present (and, if exclusively is True,
                                no other types are present). False otherwise.

    """

    # Make sure parameter 'types' is a list
    if type(types) is not list:
        types = [types]

    # Check if inputs are valid
    check_numpy_array_pandas_series_1d(data, 'data')

    check_list_of_strings(types, 'types')

    check_boolean(exclusively, 'exclusively')

    check_boolean(verbose, 'verbose')

    # Get types in input array
    contained_types = get_contained_types(data, unique=True, as_string=True)

    # Check if all requested types can be found
    missing_types = [wanted for wanted in types if wanted not in contained_types]
    result = not missing_types
    if verbose:
        for missing_type in missing_types:
            print("Type '{0}' has not been found.".format(missing_type))

    # Check if additional types are present if exclusively is set to True.
    # The filtering is done with plain membership tests instead of numpy.invert: inverting an empty
    # list fails, because numpy turns it into a float array which cannot be inverted.
    if exclusively:
        additional_types = [
            found for found in contained_types if found not in types
        ]
        if additional_types:
            if verbose:
                for additional_type in additional_types:
                    print("Additional type '{0}' has been found.".format(
                        additional_type))
            result = False

    return result
コード例 #8
0
ファイル: category_ops.py プロジェクト: georg-un/DataSciTK
def count_elements_with_category(data, categories, verbose=False):
    """
    Counts all observations in 'data' which match one of the given categories. Returns the total count.

    :param data:            1-dimensional numpy array
    :param categories:      List or single value. Must match the type of the values in 'data'.
    :param verbose:         True or False. True for verbose output.

    :return:                Integer. Number of found occurrences, summed over all categories.

    :raises TypeError:      If 'data' is not type homogeneous or if the type of a category does not match
                            the type of the values in 'data'.

    """

    check_numpy_array_1d(data, 'data')

    check_boolean(verbose, 'verbose')

    # Convert category to a list, if it is not already one
    if type(categories) is not list:
        categories = [categories]

    # Check for type homogeneity
    if not is_type_homogeneous(data, verbose=False):
        raise TypeError(
            "Argument for 'data' contains values with different types {0}. Please use only type homogeneous arrays."
            .format(get_contained_types(data, unique=True, as_string=True)))

    # Check if types of data and category-argument match
    for category in categories:
        if not isinstance(category, type(data[0])):
            raise TypeError(
                "Type of 'category' ({0}) does not match type of values in 'data' ({1})."
                .format(type(category), type(data[0]))
            )  # TODO: maybe add automatic conversion in the future

    # Count the matches of each category and add the counts of all categories together.
    # The boolean mask itself is counted; summing data[mask] would add up the matching VALUES
    # (and fail for non-numeric data) instead of counting the occurrences.
    sum_found_observations = 0
    for category in categories:
        found_observations = int(np.count_nonzero(data == category))
        if verbose:
            print("Found {0} observations of the category '{1}'.".format(
                found_observations, category))
        sum_found_observations += found_observations

    if verbose:
        print("Found {0} matching observations in total.".format(
            sum_found_observations))

    return sum_found_observations
コード例 #9
0
ファイル: category_ops.py プロジェクト: georg-un/DataSciTK
def recode_binary_by_categories(data, to_0, to_1, verbose=False):
    """
    Map every value of the input to 0 or 1, depending on which of the two category lists contains it.

    :param data:                1-dimensional numpy array or pandas Series
    :param to_0:                List. Categories which are coded to 0.
    :param to_1:                List. Categories which are coded to 1.
    :param verbose:             True or False. If True, an info line is printed for each listed category
                                which does not occur in data.

    :return:                    Copy of data in which each entry has been overwritten with 0 or 1.

    :raises TypeError:          If data holds a category which is listed in neither to_0 nor to_1, or if a
                                category is listed in both.
    :raises IOError:            If a value is met during recoding which is in neither list.

    """

    # Validate the arguments
    check_numpy_array_pandas_series_1d(data, 'data')
    check_boolean(verbose, 'verbose')
    check_list_numpy_array(to_0, 'to_0')
    check_list_numpy_array(to_1, 'to_1')

    # Categories which actually occur in the data
    present_categories = get_contained_categories(data)

    # Every occurring category needs a target code
    for category in present_categories:
        if not (category in to_0 or category in to_1):
            raise TypeError(
                "Argument for 'data' contains the category '{0}' which is neither defined in to_0 or to_1."
                .format(category))

    # The two lists must not overlap
    for category in to_0:
        if category in to_1:
            raise TypeError(
                "Category '{0}' is defined in to_0 and to_1. A category must not be contained in both lists."
                .format(category))

    # Report listed categories which do not occur in the data
    if verbose:
        absent_from_to_0 = [c for c in to_0 if c not in present_categories]
        for category in absent_from_to_0:
            print(
                "Info: Category '{0}' from to_0 has not been found in data."
                .format(category))
        absent_from_to_1 = [c for c in to_1 if c not in present_categories]
        for category in absent_from_to_1:
            print(
                "Info: Category '{0}' from to_1 has not been found in data."
                .format(category))

    # Work on a copy and overwrite entry by entry
    binary_array = data.copy()
    for index, value in enumerate(data):
        if value in to_0:
            code = 0
        elif value in to_1:
            code = 1
        else:
            raise IOError(
                "Value '{0}' was neither in to_0 nor in to_1.".format(value))
        binary_array[index] = code

    return binary_array
コード例 #10
0
ファイル: category_ops.py プロジェクト: georg-un/DataSciTK
def contains_category(data, categories, exclusively=False, verbose=True):
    """
    Check if all specified categories are present in the data. If exclusively is set to True (the default is
    False), additionally check that ONLY the specified categories are present in the data and return False if
    additional categories are found.

    :param data:            1-dimensional numpy array or pandas Series.
    :param categories:      Single value or list. Must be of the same type as the values in the data.
    :param exclusively:     True or False. Checks if the data contains exclusively the specified categories.
    :param verbose:         True or False. Prints additional information to console if True.

    :return:                True or False.

    :raises TypeError:      If the type of a category does not match the type of the values in 'data' or if
                            'data' is not type homogeneous.

    """

    # Transform categories parameter to list if it is not already one
    if type(categories) is not list:
        categories = [categories]

    # Check if inputs are valid
    check_numpy_array_pandas_series_1d(data, 'data')

    # The type of every category is compared with the type of the first value in data
    for category in categories:
        data_is_float = is_float(data[0])
        data_is_integer = is_integer(data[0])
        type_mismatch = (
            (data_is_float and not is_float(category))
            or (data_is_integer and not is_integer(category))
            or (not data_is_float and not data_is_integer
                and not isinstance(data[0], type(category))))
        if type_mismatch:
            raise TypeError(
                "Type of category '{0}' ({1}) must match type of the values for 'data' ({2})."
                .format(category, type_as_string(category),
                        type_as_string(data[0])))

    check_boolean(exclusively, 'exclusively')

    # Validate 'verbose' as well, like the other functions with this parameter do
    check_boolean(verbose, 'verbose')

    if not is_type_homogeneous(data, verbose=False):
        raise TypeError(
            "Argument for 'data' must be type homogeneous but contains values with different types: {0}."
            .format(get_contained_types(data, unique=True, as_string=True)))

    # Get all unique categories in data
    input_array_categories = get_contained_categories(data)

    # Check if all categories can be found
    missing_categories = [
        x for x in categories if x not in input_array_categories
    ]
    result = not missing_categories
    if verbose:
        for missing_category in missing_categories:
            print("Category '{0}' has not been found.".format(
                missing_category))

    # Check if additional categories are present if exclusively is set to True.
    # Plain membership tests are used instead of numpy.invert, which fails on an empty list.
    if exclusively:
        additional_categories = [
            x for x in input_array_categories if x not in categories
        ]
        if additional_categories:
            if verbose:
                for additional_category in additional_categories:
                    print(
                        "Additional category '{0}' has been found.".format(
                            additional_category))
            result = False

    return result
コード例 #11
0
ファイル: regressions.py プロジェクト: georg-un/DataSciTK
    def __init__(self,
                 y,
                 X,
                 variables,
                 regression_method,
                 benchmark_criterion='aic',
                 max_exponent=3,
                 max_root=3,
                 include_log=True,
                 include_interactions=True,
                 verbose=False):
        """
        Validate the configuration, store it on the instance and trigger the preparation steps
        (_concat_y_X and _generate_all_variables).

        :param y:                           1-dimensional numpy array, pandas Series or pandas DataFrame.
                                            Holds the dependent variable.

        :param X:                           n-dimensional pandas DataFrame.
                                            Holds the independent variables.

        :param variables:                   List of strings (at least 3 entries).
                                            Column names which take part in the checks.

        :param regression_method:           String.
                                            Name of the regression method. One of:
                                            'logit', 'poisson', 'glm', 'gls', 'glsar', 'mnlogit', 'negativebinomial',
                                            'ols', 'probit', 'rlm', 'wls'.

        :param benchmark_criterion:         String.
                                            Name of the benchmark criterion. One of: 'aic', 'bic'.

        :param max_exponent:                Integer (at least 1).
                                            Highest exponent to be included in the checks.

        :param max_root:                    Integer (at least 1).
                                            Highest root to be included in the checks.

        :param include_log:                 Boolean.
                                            Whether the logarithm is included in the checks.

        :param include_interactions:        Boolean.
                                            Whether multiplicative interactions between the variables are
                                            included in the checks.

        :param verbose:                     Boolean.
                                            Verbosity flag of the object.

        :raises TypeError:                  If one of the additional restrictions on the values is violated.

        """

        # Allowed option values
        REGRESSION_TYPES = [
            'logit', 'poisson', 'glm', 'gls', 'glsar', 'mnlogit',
            'negativebinomial', 'ols', 'probit', 'rlm', 'wls'
        ]
        BENCHMARK_CRITERIA = ['aic', 'bic']

        # Type validation of all arguments
        check_numpy_array_pandas_dataframe_series_1d(y, 'y')
        check_pandas_dataframe_nd(X, 'X')
        check_list_numpy_array(variables, 'variables')
        check_string(regression_method, 'regression_method')
        check_string(benchmark_criterion, 'benchmark_criterion')
        check_integer(max_exponent, 'max_exponent')
        check_integer(max_root, 'max_root')
        check_boolean(include_log, 'include_log')
        check_boolean(include_interactions, 'include_interactions')
        check_boolean(verbose, 'verbose')

        # Value restrictions
        number_of_variables = len(variables)
        if number_of_variables < 3:
            raise TypeError("Number of variables has to be at least 3.")

        if regression_method not in REGRESSION_TYPES:
            method_message = "Regression method '{0}' is not available. Please use one of the following: {1}."
            raise TypeError(
                method_message.format(regression_method, REGRESSION_TYPES))

        if benchmark_criterion not in BENCHMARK_CRITERIA:
            criterion_message = "Benchmark criterion '{0}' is not available. Please use one of the following: {1}."
            raise TypeError(
                criterion_message.format(benchmark_criterion,
                                         BENCHMARK_CRITERIA))

        if max_exponent < 1:
            raise TypeError(
                "Argument for parameter 'max_exponent' must be at least 1.")

        if max_root < 1:
            raise TypeError(
                "Argument for parameter 'max_root' must be at least 1.")

        # Keep the data
        self.y = y
        self.X = X
        self.variables = variables

        # Keep the configuration
        self.regression_method = regression_method
        self.benchmark_criterion = benchmark_criterion
        self.max_exponent = max_exponent
        self.max_root = max_root
        self.include_log = include_log
        self.include_interactions = include_interactions
        self.verbose = verbose

        # Preparation steps; both rely on the attributes assigned above
        self._concat_y_X()
        self._generate_all_variables()