Esempio n. 1
0
def distribution_summary_pretty(_value_df, col, figsize=None, date_flag=False):
    """
    Draw pretty distribution graph for a single column

    The graph is displayed with plt.show(); nothing is returned. A count
    plot is drawn when the column has at most 10 distinct non-null values,
    otherwise a density curve annotated through _draw_texts.

    Parameters
    ----------
    _value_df: pandas dataframe
        the dataframe that contains the values of 'col' (it is copied,
        never modified)
    col: name of the column
    figsize: (width, height), default=None
        Size of the figure; (10, 6) is used when None
    date_flag: boolean
        Whether the column is date type. If True the column
        '<col>_numeric' is plotted; when that column is missing it is
        derived from today's date minus the parsed date.
    """

    # colors for graph
    DIS_LINE = "#F79646"

    # copy the raw dataframe so the derived column never leaks to the caller
    value_df = _value_df.copy()

    if date_flag:
        numeric_col = '%s_numeric' % (col)
        if numeric_col not in value_df.columns.values:
            snapshot_date_now = str(datetime.datetime.now().date())
            # NOTE(review): the 'timedelta64[M]' cast with errors='ignore'
            # depends on the installed pandas version - confirm it still
            # yields a month count
            value_df[numeric_col] = (
                pd.to_datetime(snapshot_date_now) -
                pd.to_datetime(value_df[col], errors='coerce')).astype(
                    'timedelta64[M]', errors='ignore')
    else:
        numeric_col = col

    # get min, mean, median, max
    value_min = value_df[numeric_col].min()
    value_mean = value_df[numeric_col].mean()
    value_median = value_df[numeric_col].median()
    value_max = value_df[numeric_col].max()

    if date_flag:
        # parse once; unparseable entries become NaT and are skipped by min/max
        parsed_dates = pd.to_datetime(value_df[col], errors='coerce')
        date_min = parsed_dates.min()
        date_max = parsed_dates.max()

    num_uni = value_df[col].dropna().nunique()
    value_dropna = value_df[numeric_col].dropna().values

    # get distribution
    scale_flg = 0
    draw_values = value_dropna
    draw_value_4 = [value_min, value_mean, value_median, value_max]
    # values of magnitude 10^6 or more are rescaled before drawing
    # (the title labels this as a log10 scale)
    if np.max([abs(value_min), abs(value_max)]) >= pow(10, 6):
        scale_flg = 1
        draw_values, draw_value_4 = _get_scale_draw_values(
            draw_values, draw_value_4)

    # draw the distribution graph
    plt.clf()
    if figsize is not None:
        # figsize must be passed by keyword: the first positional argument
        # of plt.figure is the figure identifier, not its size
        plt.figure(figsize=figsize)
    else:
        plt.figure(figsize=(10, 6))

    if scale_flg:
        plt.title('%s (log10 scale)' % (col))
    else:
        plt.title('%s' % (col))

    # if there are at most 10 unique levels, draw countplot instead
    if num_uni <= 10:
        temp_df = pd.DataFrame(draw_values, columns=['value'])
        sns.countplot(temp_df['value'], color=DIS_LINE)
        if num_uni > 5:
            # many tick labels: rotate them so they do not overlap
            plt.xticks(rotation=90)
    else:
        ax = sns.distplot(draw_values,
                          color=DIS_LINE,
                          norm_hist=True,
                          hist=False)
        y_low, y_up = ax.get_ylim()

        if date_flag:
            _draw_texts(text_values=[date_min, date_max],
                        draw_value_4=draw_value_4,
                        mark=1,
                        y_low=y_low,
                        y_up=y_up,
                        date_flag=True)
        else:
            _draw_texts(
                text_values=[value_min, value_mean, value_median, value_max],
                draw_value_4=draw_value_4,
                mark=1,
                y_low=y_low,
                y_up=y_up)
    plt.show()
Esempio n. 2
0
def _check_numeric(col, _value_df, img_dir, date_flag=False):
    """
    Summarize numeric feature

    Computes basic statistics for the column, draws its distribution and
    saves the graph as '<col>.png' inside img_dir.

    Parameters
    ----------
    col: name of the column
    _value_df: pandas dataframe
        the dataframe that contains the values of 'col' (it is copied,
        never modified)
    img_dir: root directory for the generated images
    date_flag: boolean
        Whether the column is date type. If True, the original date
        column is looked up under the name of 'col' with '_numeric'
        removed, and its min / max dates are added to the result.

    Returns
    -------
    Dictionary:
        {'column': col, 'result_df': pandas dataframe of the statistics}
        on success, or {'column': col, 'error_msg': message} when the
        column has no usable (non-nan) value, which includes an empty
        dataframe.
    """

    value_df = _value_df.copy()

    # ensure all values are numeric (non-numeric entries become nan)
    value_df[col] = pd.to_numeric(value_df[col], errors='coerce')

    # an empty frame has no usable value either: report it the same way as
    # an all-nan column instead of dividing by zero below
    total_rows = value_df.shape[0]
    if total_rows == 0:
        return {'column': col, 'error_msg': 'all values are nan'}

    # percentage of nan
    nan_rate = value_df[value_df[col].isnull()].shape[0] * 1.0 / total_rows

    # if all values are nan
    if nan_rate == 1:
        return {'column': col, 'error_msg': 'all values are nan'}

    # unique_level
    num_uni = value_df[col].dropna().nunique()

    # get clean values
    value_dropna = value_df[col].dropna().values

    # get sample value
    sample_value = np.random.choice(value_dropna, 1)[0]

    # get min, mean, median, max
    value_min = value_df[col].min()
    value_mean = value_df[col].mean()
    value_median = value_df[col].median()
    value_max = value_df[col].max()
    if date_flag:
        # parse the original date column once; bad entries become NaT
        parsed_dates = pd.to_datetime(value_df[col.replace('_numeric', '')],
                                      errors='coerce')
        date_min = parsed_dates.min()
        date_max = parsed_dates.max()

    # get distribution
    scale_flg = 0
    draw_values = value_dropna
    draw_value_4 = [value_min, value_mean, value_median, value_max]
    # values of magnitude 10^6 or more are rescaled before drawing
    # (the title labels this as a log10 scale)
    if np.max([abs(value_min), abs(value_max)]) >= pow(10, 6):
        scale_flg = 1
        draw_values, draw_value_4 = _get_scale_draw_values(
            draw_values, draw_value_4)

    # draw and save distribution graph
    if date_flag:
        plt.figure(figsize=(9, 5.5))
    else:
        plt.figure(figsize=(9, 4.5))
    if scale_flg:
        plt.title('%s (log10 scale)' % (col))
    else:
        plt.title('%s' % (col))

    # if there are at most 10 unique levels, draw countplot instead
    if num_uni <= 10:
        temp_df = pd.DataFrame(draw_values, columns=['value'])
        sns.countplot(temp_df['value'], color=DIS_LINE)
        if num_uni > 5:
            # many tick labels: rotate them so they do not overlap
            plt.xticks(rotation=90)
    else:
        ax = sns.distplot(draw_values,
                          color=DIS_LINE,
                          norm_hist=True,
                          hist=False)
        y_low, y_up = ax.get_ylim()

        if date_flag:
            _draw_texts(text_values=[date_min, date_max],
                        draw_value_4=draw_value_4,
                        mark=1,
                        y_low=y_low,
                        y_up=y_up,
                        date_flag=True)
        else:
            _draw_texts(
                text_values=[value_min, value_mean, value_median, value_max],
                draw_value_4=draw_value_4,
                mark=1,
                y_low=y_low,
                y_up=y_up)

    # save the graphs
    plt.savefig(os.path.join(img_dir, col + '.png'), transparent=True)

    # only the first row carries the 'graph' key
    output = [
        {
            'feature': 'column',
            'value': col,
            'graph': 'Distribution'
        },
        {
            'feature': 'sample_value',
            'value': sample_value
        },
        {
            'feature': 'nan_rate',
            'value': nan_rate
        },
        {
            'feature': 'num_uni',
            'value': '%d/%d' % (num_uni, len(value_dropna))
        },
        {
            'feature': 'value_min',
            'value': value_min
        },
        {
            'feature': 'value_mean',
            'value': value_mean
        },
        {
            'feature': 'value_median',
            'value': value_median
        },
        {
            'feature': 'value_max',
            'value': value_max
        },
    ]

    if date_flag:
        output.append({'feature': 'date_min', 'value': date_min})
        output.append({'feature': 'date_max', 'value': date_max})

    return {'column': col, 'result_df': pd.DataFrame(output)}
Esempio n. 3
0
def _compare_numeric(col, _df1, _df2, img_dir, date_flag=False):
    """
    Compare two numeric type values

    Collects the statistics of 'col' in both tables, measures how similar
    the two distributions are (Spearman correlation of the binned / counted
    frequencies), draws both distributions on one graph and saves it as
    '<col>.png' inside img_dir.

    Parameters
    ----------
    col: string
        name of column to check
    _df1: pandas DataFrame
        slice of table1 containing enough information to check
    _df2: pandas DataFrame
        slice of table2 containing enough information to check
    img_dir: root directory for the generated images
    date_flag: boolean
        Whether the column is date type. If True, the original date column
        is looked up under the name of 'col' with '_numeric' removed.

    Returns
    -------
    Dictionary contains the output result:
        {'column', 'result_df', 'corr': {'column', 'corr'}} on success, or
        {'column', 'error_msg'} when the column is all nan in at least one
        of the tables.
    """

    # sampling
    df1_sample = _df1.copy()
    df2_sample = _df2.copy()

    # every entry of stat_output used below is indexed [table1, table2]
    stat_output = _simple_stats(col, df1_sample, df2_sample, 'numeric')

    nan_rate1, nan_rate2 = stat_output['nan_rate']
    if (nan_rate1 == 1) or (nan_rate2 == 1):
        if (nan_rate1 == 1) and (nan_rate2 == 1):
            error_msg = 'all nan in both table'
        elif nan_rate1 == 1:
            error_msg = 'all nan in table1'
        else:
            error_msg = 'all nan in table2'
        return {'column': col, 'error_msg': error_msg}

    def _rounded_lines(key):
        # one line per table, each value rounded to 3 decimals
        return '\n'.join([str(round(v, 3)) for v in stat_output[key]])

    # generate the output
    output = [{
        'feature': 'column',
        'value': col,
        'graph': 'Distribution'
    }, {
        'feature': 'sample_value',
        'value': '\n'.join([str(v) for v in stat_output['sample_value']])
    }, {
        'feature': 'nan_rate',
        'value': _rounded_lines('nan_rate')
    }, {
        'feature':
        'num_uni',
        'value':
        '%s/%s\n%s/%s' %
        (str(stat_output['num_uni'][0]), str(df1_sample.dropna().shape[0]),
         str(stat_output['num_uni'][1]), str(df2_sample.dropna().shape[0]))
    }, {
        'feature': 'value_min',
        'value': _rounded_lines('value_min')
    }, {
        'feature': 'value_mean',
        'value': _rounded_lines('value_mean')
    }, {
        'feature': 'value_median',
        'value': _rounded_lines('value_median')
    }, {
        'feature': 'value_max',
        'value': _rounded_lines('value_max')
    }]

    # largest magnitude over both tables, decides whether to rescale
    both_value_max = np.max([abs(v) for v in stat_output['value_max']] +
                            [abs(v) for v in stat_output['value_min']])

    # get clean values
    df1_sample_dropna_values = df1_sample[col].dropna().values
    df2_sample_dropna_values = df2_sample[col].dropna().values

    if date_flag:
        dt1 = pd.to_datetime(df1_sample[col.replace('_numeric', '')],
                             errors='coerce')
        dt2 = pd.to_datetime(df2_sample[col.replace('_numeric', '')],
                             errors='coerce')
        date_min1, date_max1 = dt1.min(), dt1.max()
        date_min2, date_max2 = dt2.min(), dt2.max()

    # get distribution
    scale_flg = 0
    df1_draw_values = df1_sample_dropna_values
    df1_draw_value_4 = [
        stat_output['value_min'][0], stat_output['value_mean'][0],
        stat_output['value_median'][0], stat_output['value_max'][0]
    ]

    df2_draw_values = df2_sample_dropna_values
    df2_draw_value_4 = [
        stat_output['value_min'][1], stat_output['value_mean'][1],
        stat_output['value_median'][1], stat_output['value_max'][1]
    ]

    # values of magnitude 10^6 or more are rescaled before drawing
    # (the title labels this as a log10 scale)
    if both_value_max >= pow(10, 6):
        scale_flg = 1
        df1_draw_values, df1_draw_value_4 = _get_scale_draw_values(
            df1_draw_values, df1_draw_value_4)
        df2_draw_values, df2_draw_value_4 = _get_scale_draw_values(
            df2_draw_values, df2_draw_value_4)

    # calculate correlation between two distributions
    if np.max(stat_output['num_uni']) <= 100:
        # few distinct values: compare the relative frequency of each value
        # (assumes _value_counts_df returns 'value' and 'count' columns,
        # which the merge suffixes into count_x / count_y - TODO confirm)
        vc1 = _value_counts_df(df1_draw_values)
        vc2 = _value_counts_df(df2_draw_values)
        vc = vc1.merge(vc2, on='value', how='outer').fillna(0)
        obs1 = vc['count_x'].values * 1.0 / vc['count_x'].sum()
        obs2 = vc['count_y'].values * 1.0 / vc['count_y'].sum()
    else:
        # many distinct values: compare 100-bin histograms on a shared range
        both_min = np.min([np.min(df1_draw_values), np.min(df2_draw_values)])
        both_max = np.max([np.max(df1_draw_values), np.max(df2_draw_values)])
        # the removed 'normed' keyword is no longer passed (it raises
        # TypeError on recent NumPy); density=False still returns counts
        hist1 = np.histogram(df1_draw_values,
                             bins=100,
                             range=(both_min, both_max),
                             density=False)
        hist2 = np.histogram(df2_draw_values,
                             bins=100,
                             range=(both_min, both_max),
                             density=False)
        obs1 = hist1[0] / (np.sum(hist1[0]) * 1.0)
        obs2 = hist2[0] / (np.sum(hist2[0]) * 1.0)

    if len(obs1) == 1:
        # a single level: fall back to the ratio of the non-nan rates
        corr = np.min([1. - nan_rate1, 1. - nan_rate2]) * 1.0 / np.max(
            [1. - nan_rate1, 1. - nan_rate2])
    elif list(obs1) == list(obs2):
        corr = 1.0
    else:
        corr = spearmanr(obs1, obs2)[0]

    # draw and save distribution graph
    if date_flag:
        plt.figure(figsize=(9, 8))
    else:
        plt.figure(figsize=(9, 6))
    if scale_flg:
        plt.title('%s (log10 scale)' % (col))
    else:
        plt.title('%s' % (col))

    # if there are at most 10 unique levels, draw countplot instead
    both_num_uni = np.max(stat_output['num_uni'])
    if both_num_uni <= 10:
        df1_temp = pd.DataFrame(df1_sample_dropna_values, columns=['value'])
        df1_temp['type'] = 'table1'
        df2_temp = pd.DataFrame(df2_sample_dropna_values, columns=['value'])
        df2_temp['type'] = 'table2'
        full_temp = pd.concat([df1_temp, df2_temp], axis=0)
        sns.countplot(full_temp['value'],
                      hue=full_temp['type'],
                      palette=sns.color_palette([TABLE1_DARK, TABLE2_DARK]))
        if both_num_uni > 5:
            # many tick labels: rotate them so they do not overlap
            plt.xticks(rotation=90)
        plt.legend(loc=1)
    else:
        ax1 = sns.distplot(df1_draw_values,
                           color=TABLE1_DARK,
                           hist=False,
                           label='table1')
        ax2 = sns.distplot(df2_draw_values,
                           color=TABLE2_DARK,
                           hist=False,
                           label='table2')
        # use one common y range covering both curves
        y_low_1, y_up_1 = ax1.get_ylim()
        y_low_2, y_up_2 = ax2.get_ylim()
        y_low, y_up = np.min([y_low_1, y_low_2]), np.max([y_up_1, y_up_2])
        plt.ylim((y_low, y_up))

        if date_flag:
            _draw_texts(text_values=[date_min1, date_max1],
                        draw_value_4=df1_draw_value_4,
                        mark=1,
                        y_low=y_low,
                        y_up=y_up,
                        date_flag=True)
            _draw_texts(text_values=[date_min2, date_max2],
                        draw_value_4=df2_draw_value_4,
                        mark=2,
                        y_low=y_low,
                        y_up=y_up,
                        date_flag=True)
        else:
            _draw_texts(text_values=[
                stat_output['value_min'][0], stat_output['value_mean'][0],
                stat_output['value_median'][0], stat_output['value_max'][0]
            ],
                        draw_value_4=df1_draw_value_4,
                        mark=1,
                        y_low=y_low,
                        y_up=y_up)
            _draw_texts(text_values=[
                stat_output['value_min'][1], stat_output['value_mean'][1],
                stat_output['value_median'][1], stat_output['value_max'][1]
            ],
                        draw_value_4=df2_draw_value_4,
                        mark=2,
                        y_low=y_low,
                        y_up=y_up)

    # save the graphs
    plt.savefig(os.path.join(img_dir, col + '.png'), transparent=True)

    if date_flag:
        output.append({
            'feature': 'date_min',
            'value': '%s\n%s' % (date_min1, date_min2)
        })
        output.append({
            'feature': 'date_max',
            'value': '%s\n%s' % (date_max1, date_max2)
        })
    output.append({'feature': 'corr', 'value': round(corr, 3)})

    return {
        'column': col,
        'result_df': pd.DataFrame(output),
        'corr': {
            'column': col,
            'corr': round(corr, 3)
        }
    }
Esempio n. 4
0
def distribution_compare_pretty(_df1,
                                _df2,
                                col,
                                figsize=None,
                                date_flag=False):
    """
    Draw pretty distribution graph for data compare

    Both tables are drawn on the same graph, which is displayed with
    plt.show(); nothing is returned. A grouped count plot is drawn when
    neither table has more than 10 distinct non-null values, otherwise two
    density curves annotated through _draw_texts.

    Parameters
    ----------
    _df1: pandas DataFrame
        slice of table1 containing enough information to check
    _df2: pandas DataFrame
        slice of table2 containing enough information to check
    col: string
        name of column to check
    figsize: tuple, default=None
        figure size; (10, 5) is used when None
    date_flag: bool, default=False
        whether it is checking date features. If True the column
        '<col>_numeric' is plotted; for each table where it is missing it
        is derived from today's date minus the parsed date.
    """

    # color values for graph
    TABLE1_DARK = "#4BACC6"
    TABLE2_DARK = "#F79646"

    # work on copies so the derived columns never leak to the caller
    df1, df2 = _df1.copy(), _df2.copy()

    if date_flag:
        numeric_col = '%s_numeric' % (col)
        for frame in (df1, df2):
            if numeric_col not in frame.columns.values:
                snapshot_date_now = str(datetime.datetime.now().date())
                # NOTE(review): the 'timedelta64[M]' cast with
                # errors='ignore' depends on the installed pandas version -
                # confirm it still yields a month count
                frame[numeric_col] = (
                    pd.to_datetime(snapshot_date_now) -
                    pd.to_datetime(frame[col], errors='coerce')).astype(
                        'timedelta64[M]', errors='ignore')
    else:
        numeric_col = col

    # statistics are kept as [table1, table2] pairs
    value_mins = [df1[numeric_col].min(), df2[numeric_col].min()]
    value_means = [df1[numeric_col].mean(), df2[numeric_col].mean()]
    value_medians = [df1[numeric_col].median(), df2[numeric_col].median()]
    value_maxs = [df1[numeric_col].max(), df2[numeric_col].max()]

    if date_flag:
        # parse each date column once; bad entries become NaT
        parsed1 = pd.to_datetime(df1[col], errors='coerce')
        parsed2 = pd.to_datetime(df2[col], errors='coerce')
        date_mins = [parsed1.min(), parsed2.min()]
        date_maxs = [parsed1.max(), parsed2.max()]

    # largest magnitude over both tables, decides whether to rescale
    both_value_max = np.max([abs(v) for v in value_maxs] +
                            [abs(v) for v in value_mins])

    # get clean values
    df1_sample_dropna_values = df1[numeric_col].dropna().values
    df2_sample_dropna_values = df2[numeric_col].dropna().values

    # get distribution
    scale_flg = 0
    df1_draw_values = df1_sample_dropna_values
    df1_draw_value_4 = [
        value_mins[0], value_means[0], value_medians[0], value_maxs[0]
    ]

    df2_draw_values = df2_sample_dropna_values
    df2_draw_value_4 = [
        value_mins[1], value_means[1], value_medians[1], value_maxs[1]
    ]

    # values of magnitude 10^6 or more are rescaled before drawing
    # (the title labels this as a log10 scale)
    if both_value_max >= pow(10, 6):
        scale_flg = 1
        df1_draw_values, df1_draw_value_4 = _get_scale_draw_values(
            df1_draw_values, df1_draw_value_4)
        df2_draw_values, df2_draw_value_4 = _get_scale_draw_values(
            df2_draw_values, df2_draw_value_4)

    # draw the graph
    plt.clf()
    if figsize is not None:
        # figsize must be passed by keyword: the first positional argument
        # of plt.figure is the figure identifier, not its size
        plt.figure(figsize=figsize)
    else:
        plt.figure(figsize=(10, 5))

    if scale_flg:
        plt.title('%s (log10 scale)' % (col))
    else:
        plt.title('%s' % (col))

    # if there are at most 10 unique levels, draw countplot instead
    both_num_uni = np.max(
        [df1[col].dropna().nunique(), df2[col].dropna().nunique()])
    if both_num_uni <= 10:
        df1_temp = pd.DataFrame(df1_sample_dropna_values, columns=['value'])
        df1_temp['type'] = 'table1'
        df2_temp = pd.DataFrame(df2_sample_dropna_values, columns=['value'])
        df2_temp['type'] = 'table2'
        full_temp = pd.concat([df1_temp, df2_temp], axis=0)
        sns.countplot(full_temp['value'],
                      hue=full_temp['type'],
                      palette=sns.color_palette([TABLE1_DARK, TABLE2_DARK]))
        if both_num_uni > 5:
            # many tick labels: rotate them so they do not overlap
            plt.xticks(rotation=90)
        plt.legend(loc=1)
    else:
        ax1 = sns.distplot(df1_draw_values,
                           color=TABLE1_DARK,
                           hist=False,
                           label='table1')
        ax2 = sns.distplot(df2_draw_values,
                           color=TABLE2_DARK,
                           hist=False,
                           label='table2')
        # use one common y range covering both curves
        y_low_1, y_up_1 = ax1.get_ylim()
        y_low_2, y_up_2 = ax2.get_ylim()
        y_low, y_up = np.min([y_low_1, y_low_2]), np.max([y_up_1, y_up_2])
        plt.ylim((y_low, y_up))

        if date_flag:
            _draw_texts(text_values=[date_mins[0], date_maxs[0]],
                        draw_value_4=df1_draw_value_4,
                        mark=1,
                        y_low=y_low,
                        y_up=y_up,
                        date_flag=True)
            _draw_texts(text_values=[date_mins[1], date_maxs[1]],
                        draw_value_4=df2_draw_value_4,
                        mark=2,
                        y_low=y_low,
                        y_up=y_up,
                        date_flag=True)
        else:
            _draw_texts(text_values=[
                value_mins[0], value_means[0], value_medians[0], value_maxs[0]
            ],
                        draw_value_4=df1_draw_value_4,
                        mark=1,
                        y_low=y_low,
                        y_up=y_up)
            _draw_texts(text_values=[
                value_mins[1], value_means[1], value_medians[1], value_maxs[1]
            ],
                        draw_value_4=df2_draw_value_4,
                        mark=2,
                        y_low=y_low,
                        y_up=y_up)

    plt.show()
Esempio n. 5
0
def numeric_consist_pretty(_df1,
                           _df2,
                           _key1,
                           _key2,
                           col,
                           figsize=None,
                           date_flag=False):
    """
    Draw pretty distribution graph for checking data consistency

    The two tables are inner-joined on their keys. The left panel is a
    scatter plot of table1 values against table2 values with a dashed
    y = x reference line; the right panel is the distribution of the
    differences (table2 - table1). The graph is displayed with plt.show();
    nothing is returned.

    Parameters
    ----------
    _df1: pandas DataFrame
        slice of table1 containing enough information to check
    _df2: pandas DataFrame
        slice of table2 containing enough information to check
    _key1: string
        key for table1
    _key2: string
        key for table2
    col: string
        name of column to check; it has to exist in both tables so that
        the merge produces '<col>_x' and '<col>_y'
    figsize: tuple, default=None
        figure size; (9, 4) is used when None
    date_flag: bool, default=False
        whether it is checking date features (only changes the title of
        the right panel)
    """

    # color values for graph
    TABLE1_DARK = "#4BACC6"
    TABLE2_DARK = "#F79646"

    x_col, y_col = '%s_x' % (col), '%s_y' % (col)

    df = _df1.merge(_df2, left_on=_key1, right_on=_key2, how="inner")
    df['diff_temp'] = df[y_col] - df[x_col]
    draw_values = df['diff_temp'].dropna().values
    origin_value_4 = [
        np.min(draw_values),
        np.mean(draw_values),
        np.median(draw_values),
        np.max(draw_values)
    ]

    # get distribution
    scale_flg = 0
    draw_value_4 = origin_value_4
    # differences of magnitude 10^6 or more are rescaled before drawing
    # (the title labels this as a log10 scale)
    if np.max([abs(origin_value_4[0]), abs(origin_value_4[3])]) >= pow(10, 6):
        scale_flg = 1
        draw_values, draw_value_4 = _get_scale_draw_values(
            draw_values, draw_value_4)

    plt.clf()
    if figsize is not None:
        # figsize must be passed by keyword: the first positional argument
        # of plt.figure is the figure identifier, not its size
        plt.figure(figsize=figsize)
    else:
        plt.figure(figsize=(9, 4))

    # shared axis limits so that the y = x line runs corner to corner
    both_min = np.min([df[x_col].min(), df[y_col].min()])
    both_max = np.max([df[x_col].max(), df[y_col].max()])

    plt.subplot(121)
    plt.title('Scatter plot for values')
    plt.scatter(df[x_col].values,
                df[y_col].values,
                c=TABLE1_DARK,
                s=5)
    plt.plot([both_min, both_max], [both_min, both_max], '--', c='#bbbbbb')

    plt.xlim(both_min, both_max)
    plt.ylim(both_min, both_max)

    ax2 = plt.subplot(122)
    # if there are at most 10 distinct differences, draw countplot instead
    num_levels = len(np.unique(draw_values))
    if num_levels <= 10:
        sns.countplot(draw_values, palette=sns.color_palette([TABLE2_DARK]))
        if num_levels > 5:
            # many tick labels: rotate them so they do not overlap
            plt.xticks(rotation=90)
    else:
        sns.distplot(draw_values, color=TABLE2_DARK)
        y_low, y_up = ax2.get_ylim()
        _draw_texts(text_values=origin_value_4,
                    draw_value_4=draw_value_4,
                    mark=1,
                    y_low=y_low,
                    y_up=y_up)

    # the date title takes precedence over the log10 one
    if date_flag:
        plt.title('Distribution of differences (in months)')
    elif scale_flg:
        plt.title('Distribution of differences (log10 scale)')
    else:
        plt.title('Distribution of differences')

    plt.show()
Esempio n. 6
0
def _consist_numeric(col, _df1, _df2, _key1, _key2, img_dir, date_flag=False):
    """
    Check consistency for numeric type column

    The two tables are inner-joined on their keys; rows where either side
    of 'col' is nan are dropped. The Spearman correlation between both
    sides and the statistics of the differences (table2 - table1) are
    reported, and a two-panel graph (scatter plot / distribution of
    differences) is saved as '<col>.png' inside img_dir, with any '/'
    removed from the file name.

    Parameters
    ----------
    col: string
        name of column to check
    _df1: pandas DataFrame
        slice of table1 containing enough information to check
    _df2: pandas DataFrame
        slice of table2 containing enough information to check
    _key1: column to merge on for table1
    _key2: column to merge on for table2
    img_dir: root directory for the generated images
    date_flag: boolean
        Whether the column is date type (only changes the title of the
        distribution panel)

    Returns
    -------
    Dictionary contains the output result:
        {'column', 'result_df', 'corr': {'column', 'corr'}} on success, or
        {'column', 'error_msg'} when one side has no non-nan value after
        the merge.
    """

    # names given by the merge to the table1 / table2 versions of the column
    left_name = '%s_x' % (col)
    right_name = '%s_y' % (col)

    merged = pd.merge(_df1.copy(),
                      _df2.copy(),
                      left_on=_key1,
                      right_on=_key2,
                      how="inner")

    # bail out early when a side has nothing to compare
    left_empty = merged[left_name].dropna().shape[0] == 0
    right_empty = merged[right_name].dropna().shape[0] == 0
    if left_empty and right_empty:
        return {'column': col, 'error_msg': 'all nan in both table'}
    if left_empty:
        return {'column': col, 'error_msg': 'all nan in table1'}
    if right_empty:
        return {'column': col, 'error_msg': 'all nan in table2'}

    # keep only the rows where both sides are known
    merged = merged.dropna(how='any', subset=[left_name, right_name])
    merged = merged.reset_index(drop=True)
    merged['diff_temp'] = merged[right_name] - merged[left_name]

    rank_corr = spearmanr(merged[left_name].values,
                          merged[right_name].values)[0]
    corr = round(rank_corr, 3)

    # first two rows of the report, then one row per diff statistic
    diff_series = merged['diff_temp']
    output = [
        {'feature': 'column', 'value': col, 'graph': 'consistency check'},
        {'feature': 'corr', 'value': corr},
    ]
    diff_stats = (
        ('min diff', diff_series.min()),
        ('mean diff', diff_series.mean()),
        ('median diff', diff_series.median()),
        ('max diff', diff_series.max()),
    )
    for label, stat in diff_stats:
        output.append({'feature': label, 'value': round(stat, 3)})

    draw_values = diff_series.dropna().values
    origin_value_4 = [
        np.min(draw_values),
        np.mean(draw_values),
        np.median(draw_values),
        np.max(draw_values)
    ]

    # differences of magnitude 10^6 or more are rescaled before drawing
    largest_abs = np.max([abs(origin_value_4[0]), abs(origin_value_4[3])])
    scale_flg = 1 if largest_abs >= pow(10, 6) else 0
    if scale_flg:
        draw_values, draw_value_4 = _get_scale_draw_values(
            draw_values, origin_value_4)
    else:
        draw_value_4 = origin_value_4

    # shared axis limits so that the y = x line runs corner to corner
    both_min = np.min([merged[left_name].min(), merged[right_name].min()])
    both_max = np.max([merged[left_name].max(), merged[right_name].max()])

    # left panel: scatter plot of table1 against table2
    dpi = 72
    plt.figure(figsize=(635. / dpi, 635. / (9. / 4.) / dpi), dpi=dpi)
    plt.subplot(121)
    plt.title('Scatter plot for values')
    plt.scatter(merged[left_name].values,
                merged[right_name].values,
                c=TABLE1_DARK,
                s=5)
    plt.plot([both_min, both_max], [both_min, both_max], '--', c='#bbbbbb')

    plt.xlim(both_min, both_max)
    plt.ylim(both_min, both_max)

    # right panel: distribution of the differences
    ax2 = plt.subplot(122)
    level_count = len(np.unique(draw_values))
    if level_count > 10:
        sns.distplot(draw_values, color=TABLE2_DARK)
        y_low, y_up = ax2.get_ylim()
        _draw_texts(text_values=origin_value_4,
                    draw_value_4=draw_value_4,
                    mark=1,
                    y_low=y_low,
                    y_up=y_up)
    else:
        sns.countplot(draw_values, palette=sns.color_palette([TABLE2_DARK]))
        if level_count > 5:
            plt.xticks(rotation=90)

    # the date title takes precedence over the log10 one
    if date_flag:
        panel_title = 'Distribution of differences (in months)'
    elif scale_flg:
        panel_title = 'Distribution of differences (log10 scale)'
    else:
        panel_title = 'Distribution of differences'
    plt.title(panel_title)

    # save the graphs; '/' is stripped so the name stays a single file
    graph_name = col.replace('/', '')
    plt.savefig(os.path.join(img_dir, graph_name + '.png'),
                transparent=True,
                dpi=dpi)

    return {
        'column': col,
        'result_df': pd.DataFrame(output),
        'corr': {'column': col, 'corr': corr}
    }