Esempio n. 1
0
def scattermatrix(df,
                  figsize: tuple = (14.4, 9),
                  title: str = None,
                  save: bool = False,
                  savepath: str = '.\\scattermatrix.png',
                  show: bool = False,
                  close: bool = False):
    """Draw a scatter matrix of the numeric columns of ``df``.

    Args:
        df: DataFrame to plot. Only the columns reported by
            ``dfutl.numericColumns`` are used.
        figsize: Figure size in inches, forwarded to pandas.
        title: Optional figure-level title.
        save: When True, write the figure to ``savepath`` as PNG.
        savepath: Target file. If it ends with a backslash it is treated as
            a directory and ``scattermatrix.png`` is appended.
        show: When True, call ``plt.show()``.
        close: When True, close the figure created by this call.
    """
    dfTemp = df.loc[:, dfutl.numericColumns(df)]

    # Let pandas create the figure itself. Handing it a pre-made Axes makes
    # it clear that Axes' figure (with a UserWarning) and build new subplots,
    # so anything attached to the original Axes (e.g. a title) is lost.
    axes = pd.plotting.scatter_matrix(dfTemp, figsize=figsize)
    fig = axes[0, 0].get_figure()

    # The matrix has no single "main" Axes, so the title goes on the figure.
    if title is not None:
        fig.suptitle(title)

    # format x and y axis labels of every cell
    for cell in axes.flat:
        cell.xaxis.label.set_rotation(30)
        cell.yaxis.label.set_rotation(0)
        cell.yaxis.labelpad = 50

    if save:
        if savepath is not None and savepath[-1:] == '\\':
            savepath = savepath + 'scattermatrix.png'
        # Save this figure explicitly rather than whichever one is current.
        fig.savefig(savepath, format='png')

    if show:
        plt.show()

    if close:
        plt.close(fig)
Esempio n. 2
0
    sigma2X = ((mpdX.index.values.astype(float) - Ex)**2 * mpdX.values).sum()
    sigma2Y = ((mpdY.index.values.astype(float) - Ey)**2 * mpdY.values).sum()
    sigmaX = math.sqrt(sigma2X)
    sigmaY = math.sqrt(sigma2Y)
    corrXY = sigmaXY / (sigmaX * sigmaY)

    print('Given a joint probability distribution:')
    print(df)
    print('')
    print('cov(X, Y) = {0:.8}'.format(sigmaXY))
    print('corr(X, Y) = {0:.8}'.format(corrXY))
    print('\n\n\n\n')

    with open('.\\iris.pkl', 'rb') as fl:
        df = pk.load(fl)
        cols = dfutl.numericColumns(df)
        df = df.loc[:, cols]

    # Given a data set of observations, the covariance and correlation
    # can be calculated in the following manner.

    # Calculate covariances.
    cov = df.cov()
    cov2 = np.cov(np.transpose(df.values))
    assert (np.all(abs(cov.values - cov2) < 1e-8))

    # Calculate correlations.
    corr = df.corr()
    corr2 = np.corrcoef(np.transpose(df.values))
    assert (np.all(abs(corr.values - corr2) < 1e-8))
Esempio n. 3
0
def _printSection(heading: str) -> None:
    """Print ``heading`` (indented, with a trailing colon) between two rules."""
    rule = '----------------------------------------------------------------------'
    print(rule)
    print('{0}{1}:'.format('    ', heading))
    print(rule)


def eda(filepath: str,
        features=None,
        targets=None,
        removeOutliers: bool = False,
        datasetname: str = ''):
    """Run a basic exploratory data analysis on a pickled DataFrame.

    The function loads the DataFrame, prints its shape, column names,
    head/tail and ``describe()`` summary, standardizes the numeric columns
    (subtract mean, divide by standard deviation), reports (and optionally
    removes) rows with any |z-score| > 3, and finally saves a box plot,
    histogram, scatter matrix and correlation heat map via ``plots``.

    Args:
        filepath: Path of the pickle file holding the DataFrame.
        features: Feature column names. When None, defaults to every column
            that is not a target. Not otherwise used by this function yet.
        targets: Target column name(s); a single string or a sequence of
            strings. None means "no targets".
        removeOutliers: When True, repeatedly drop outlier rows and re-run
            the z-score test until a pass finds none. When False, only the
            first pass is reported and nothing is dropped.
        datasetname: When non-empty, plots are saved under
            ``.\\png\\<datasetname>\\eda\\``; otherwise under ``.\\png\\``.

    Returns:
        The standardized DataFrame, without the removed outlier rows when
        ``removeOutliers`` is True.
    """
    # load the data
    # NOTE(security): unpickling can execute arbitrary code; only use this
    # on files from a trusted source.
    with open(filepath, 'rb') as fl:
        df = pk.load(fl)

    # process inputs: accept None, a single column name, or a sequence
    if targets is None:
        targets = []
    elif isinstance(targets, str):
        targets = [targets]
    if features is None:
        features = list(set(df.columns) - set(targets))

    # examine the data
    _printSection('Shape of dataset')
    print('{0}Number of Rows: {1}'.format('    ', df.shape[0]))
    print('{0}Number of Columns: {1}'.format('    ', df.shape[1]))
    print('', end='\n\n\n')

    _printSection('Column names')
    for col in df.columns:
        print('{0}{1}'.format('    ', col))
    print('', end='\n\n\n')

    _printSection('First 10 rows')
    print(df.head(10))
    print('', end='\n\n\n')

    _printSection('Last 10 rows')
    print(df.tail(10))
    print('', end='\n\n\n')

    _printSection('Statistical Summary')
    print(df.describe())
    print('', end='\n\n\n')

    # ----------------------------------------------------------------------
    # infer data types of the input DataFrame
    # ----------------------------------------------------------------------
    colNumeric = dfutl.numericColumns(df)

    # ----------------------------------------------------------------------
    # mean centering and scaling: standardize
    # ----------------------------------------------------------------------
    dfNumeric = df.loc[:, colNumeric]
    df.loc[:, colNumeric] = (dfNumeric - dfNumeric.mean()) / dfNumeric.std()
    dfNumeric = df.loc[:, colNumeric]

    # ----------------------------------------------------------------------
    # outlier detection
    # ----------------------------------------------------------------------
    # use z-score filtering: a row is an outlier when any of its numeric
    # values lies more than 3 standard deviations away from the mean
    _printSection('Outlier Detection')
    numouttotal = 0
    numout = 1
    passNum = 0
    while numout > 0:
        # z-scores are recomputed on the rows that survived earlier passes
        zscores = stats.zscore(dfNumeric)
        idx = np.logical_not(np.logical_or(zscores < -3, zscores > 3))
        idxrows = np.all(idx, axis=1)
        numout = len(idxrows) - len(idxrows[idxrows])

        print('{0}Pass {1} detected {2} outliers'.format(
            '    ', passNum, numout))
        if not removeOutliers:
            # report-only mode: a single pass, nothing is dropped
            break

        # remove outliers and continue with the next pass
        if numout > 0:
            df = df.loc[idxrows, :]
            dfNumeric = df.loc[:, colNumeric]

        numouttotal = numouttotal + numout
        passNum = passNum + 1
    if removeOutliers:
        print('{0}Total number of outliers: {1}'.format('    ', numouttotal))
    print('', end='\n\n\n')

    # ----------------------------------------------------------------------
    # visualization
    # ----------------------------------------------------------------------
    plt.close('all')

    save = True
    if datasetname:
        savepath = '.\\png\\{0}\\eda\\'.format(datasetname)
    else:
        savepath = '.\\png\\'
    # Make sure the output directory exists on both paths; the plot helpers
    # only append a file name to it.
    os.makedirs(savepath, exist_ok=True)

    plots.boxplot(dfNumeric, save=save, savepath=savepath)
    plots.histogram(df, tightLayout=True, save=save, savepath=savepath)
    plots.scattermatrix(dfNumeric, save=save, savepath=savepath)
    plots.heatmap(dfNumeric, correlation=0.5, save=save, savepath=savepath)

    plt.close('all')

    return df
Esempio n. 4
0
def probplot(df,
             fig=None,
             figsize: tuple = (14.4, 9),
             ax=None,
             title: str = None,
             save: bool = False,
             savepath: str = '.\\probplot.png',
             show: bool = False,
             close: bool = False):
    """Draw a normal probability plot for each numeric column of ``df``.

    Each plot shows the sorted observations against the standard-normal
    quantiles of their plotting positions ``(i + 0.5) / n``, together with a
    reference line fitted through the middle half of the points.

    Args:
        df: DataFrame to plot. Only the columns reported by
            ``dfutl.numericColumns`` are used; nothing happens if there
            are none.
        fig: Existing figure to draw into. When None, the figure of ``ax``
            is used if ``ax`` is given, otherwise a new figure is created.
        figsize: Currently overridden: a new figure is always created
            14.4 in wide and 3 in tall per row of subplots.
        ax: Existing Axes. When given, only the first numeric column is
            plotted, onto this Axes.
        title: Currently not used by this function.
        save: When True, write the figure to ``savepath`` as PNG.
        savepath: Target file. If it ends with a backslash it is treated as
            a directory and ``probplot.png`` is appended.
        show: When True, call ``plt.show()``.
        close: When True, close the figure used by this call.
    """
    colNumeric = dfutl.numericColumns(df)
    numVar = len(colNumeric)
    if numVar == 0:
        return
    df = df.loc[:, colNumeric]

    # determine the number of rows and columns of subplots
    # cap number of columns at 3
    ncols = min(int(math.ceil(math.sqrt(numVar))), 3)
    nrows = int(math.ceil(numVar / ncols))

    # Modify figsize. Every 3 rows of plots = 9 in in height.
    figsize = (14.4, int(nrows * 3))

    # A caller-supplied Axes means "plot a single variable onto it".
    plotOne = ax is not None
    if fig is None:
        # Reuse the figure that owns the supplied Axes so that layout and
        # saving act on the figure actually drawn into.
        fig = ax.get_figure() if plotOne else plt.figure(figsize=figsize)

    # loop through all variables and plot them on the corresponding axes
    for cntAx in range(numVar):

        # sorted observations of the variable being plotted
        x = df.iloc[:, cntAx].sort_values().reset_index(drop=True)
        n = len(x.index)

        # plotting positions and their standard-normal quantiles
        j = ((pd.Series(x.index) + 1) - 0.5) / n
        z = stats.norm.ppf(j)

        # use values between the 25th and 75th percentile to fit a line
        idx = list(range(math.ceil(0.25 * n), math.floor(0.75 * n) + 1))
        m, b, _, _, _ = stats.linregress(x.iloc[idx], z[idx])
        yreg = (m * x) + b

        # Each variable gets its own subplot; a separate name keeps the
        # ``ax`` argument from being overwritten and reused for every
        # subsequent variable.
        curAx = ax if plotOne else fig.add_subplot(nrows, ncols, cntAx + 1)
        curAx.plot(x, z, marker='o', linewidth=0)
        curAx.plot(x, yreg, color=RED, linewidth=2)

        formatxticklabels(curAx)
        curAx.set_xlabel(colNumeric[cntAx])
        curAx.set_ylabel('z', rotation=0)

        if plotOne:
            break

    # lay out once, after all subplots exist
    fig.tight_layout()

    if save:
        if savepath is not None and savepath[-1:] == '\\':
            savepath = savepath + 'probplot.png'
        # Save this figure explicitly rather than whichever one is current.
        fig.savefig(savepath, format='png')

    if show:
        plt.show()

    if close:
        plt.close(fig)
Esempio n. 5
0
def stemleaf(df,
             numBins: int = 20,
             title: str = None,
             save: bool = False,
             savepath: str = '.\\stemleaf.txt',
             show: bool = False):
    """Build a text stem-and-leaf display for each numeric column of ``df``.

    Values of a column are multiplied by 10 until their integer range spans
    at least ``numBins``; they are then truncated to integers, grouped into
    bins, and the last digit of each value becomes a leaf on the line of
    its bin.

    Args:
        df: DataFrame to summarise. Only the columns reported by
            ``dfutl.numericColumns`` are used.
        numBins: Target number of stems (bins).
        title: Optional text; when given, each column's display is preceded
            by a ``"<title> <column name>"`` line.
        save: When True, write the result to ``savepath``.
        savepath: Target file. If it ends with a backslash it is treated as
            a directory and ``stemleaf.txt`` is appended.
        show: When True, print the result.

    Returns:
        The displays of all columns joined into one string, or None as soon
        as a column whose minimum equals its maximum is encountered.
    """
    retall = list()

    for col in dfutl.numericColumns(df):
        vals = df.loc[:, col]

        # A constant column cannot be binned.
        # NOTE(review): this aborts the whole call (earlier columns are
        # discarded, nothing is saved or shown) instead of skipping the
        # column; kept as-is because callers may rely on the None result.
        if vals.min() == vals.max():
            return None

        # keep multiplying by 10 until the integer range reaches numBins
        valmin = vals.min()
        valmax = vals.max()
        while math.ceil(valmax) - math.floor(valmin) < numBins:
            vals = vals * 10
            valmin = vals.min()
            valmax = vals.max()

        # infer the bin width from min and max values
        currNBins = math.ceil(valmax) - math.floor(valmin)
        binw = math.ceil(currNBins / numBins)

        # infer value to start binning at
        valstart = math.floor(valmin)

        # determine the bin (lower edge) of each value
        bins = [
            int((((val - valstart) // binw) * binw)) + valstart
            for val in vals
        ]

        # group the integer-truncated values by bin; the leaves of a bin are
        # the sorted last characters of its values
        srs = pd.Series(vals.astype(int))
        aggregated = srs.groupby(bins).apply(
            lambda x: sorted([str(val)[-1] for val in x]))

        # stem label: the bin edge without its last character, or '0' for
        # single-character edges
        stems = [
            str(idx)[0:-1] if len(str(idx)) > 1 else '0'
            for idx in aggregated.index
        ]

        # Width of the stem column. log10 is only defined for a positive
        # largest bin; otherwise (and as a lower bound) fall back to the
        # longest stem label so the '|' separators always line up.
        maxbin = aggregated.index.max()
        width = math.ceil(math.log10(maxbin)) if maxbin > 0 else 0
        width = max(width, max(len(stem) for stem in stems))
        line = '{0: <' + str(width) + '}| '

        # one output line per bin: padded stem followed by its leaves
        leaves = [''.join(chars) for chars in aggregated]
        ret = [line.format(stem) + leaf for stem, leaf in zip(stems, leaves)]

        # prepend the legend line and, if requested, the title line
        ret = [line.format('x') + format('y', '<30') + 'x.y'] + ret
        if title is not None:
            ret = [title + ' ' + col] + ret

        # add to overall list of strings
        retall = retall + ret
        retall = retall + ['\n\n']

    # join each line in the list with a newline
    retall = '\n'.join(retall)

    # save if applicable
    if save:
        # [-1:] (not [-1]) so an empty path does not raise IndexError
        if savepath is not None and savepath[-1:] == '\\':
            savepath = savepath + 'stemleaf.txt'
        with open(savepath, 'w') as fl:
            fl.write(retall)

    # show if applicable
    if show:
        print(retall)

    return retall
Esempio n. 6
0
def heatmap(df,
            figsize: tuple = (14.4, 9),
            correlation: float = None,
            xcolumns: list = None,
            ycolumns: list = None,
            title: str = None,
            save: bool = False,
            savepath: str = '.\\heatmap.png',
            show: bool = False,
            close: bool = False):
    """Draw an annotated heat map of pairwise column correlations.

    Args:
        df: DataFrame whose columns are correlated.
        figsize: Figure size in inches.
        correlation: Optional threshold. Correlations whose absolute value
            is less than or equal to it are displayed as 0.
        xcolumns: Columns used as the rows of the correlation matrix; only
            those reported as numeric by ``dfutl.numericColumns`` are kept.
            None means all numeric columns of ``df``.
        ycolumns: Same as ``xcolumns``, for the columns of the matrix.
        title: Axes title. When None and ``correlation`` is given, a title
            stating the threshold is generated.
        save: When True, write the figure to ``savepath`` as PNG.
        savepath: Target file. If it ends with a backslash it is treated as
            a directory and ``heatmap.png`` is appended.
        show: When True, call ``plt.show()``.
        close: When True, close the figure created by this call.
    """
    # prepare variables for rows and columns
    if xcolumns is None:
        xcolumns = dfutl.numericColumns(df)
    else:
        xcolumns = dfutl.numericColumns(df.loc[:, xcolumns])
    if ycolumns is None:
        ycolumns = dfutl.numericColumns(df)
    else:
        ycolumns = dfutl.numericColumns(df.loc[:, ycolumns])

    # Correlate only the columns that are actually displayed. Pairwise
    # correlations do not depend on the other columns, and this keeps
    # non-numeric columns of ``df`` out of ``corr()``.
    # (assumes dfutl.numericColumns yields column labels pandas can
    # correlate -- confirm against its definition)
    needed = list(dict.fromkeys(list(xcolumns) + list(ycolumns)))
    dfCorr = df.loc[:, needed].corr()
    dfCorr = dfCorr.loc[xcolumns, ycolumns]

    # zero out correlations whose magnitude does not exceed the threshold
    if correlation is not None:
        dfCorrMask = dfCorr.mask(dfCorr.abs() <= correlation, 0)
    else:
        dfCorrMask = dfCorr

    # heat map of correlations, drawn explicitly on the axes created here
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(1, 1, 1)
    ax = sns.heatmap(data=dfCorrMask,
                     vmin=-1,
                     vmax=1,
                     annot=True,
                     annot_kws={'fontsize': 6},
                     ax=ax)

    # centre one tick per cell and label it with the column name
    ax.set_yticks([x + 0.5 for x in range(0, len(dfCorrMask.index))])
    ax.set_yticklabels(dfCorrMask.index)
    ax.set_xticks([x + 0.5 for x in range(0, len(dfCorrMask.columns))])
    ax.set_xticklabels(dfCorrMask.columns)

    formatxticklabels(ax)

    if title is None and correlation is not None:
        title = 'Correlation Threshold = {0:.3f}'.format(correlation)
    if title is not None:
        ax.set_title(title)

    if save:
        if savepath is not None and savepath[-1:] == '\\':
            savepath = savepath + 'heatmap.png'
        # Save this figure explicitly rather than whichever one is current.
        fig.savefig(savepath, format='png')

    if show:
        plt.show()

    if close:
        plt.close(fig)