Example #1
0
def _box_plot(df, name, target, category, section_number, sub_section, display_name):
    """Emit the "box plot by category" sub-section for a column.

    Writes a header through ``util.header`` and then asks
    ``dskc_graphs.box_plot`` for a grouped, flipped box plot aggregated with
    ``"mean"``. Plotting is best-effort: if it fails, whatever figure is
    pending is shown and a "Not available." notice is printed instead.

    :param df: dataframe holding the columns
    :param name: name of the column being analysed
    :param target: name of the target column
    :param category: name of the categorical column to break the plot down by
    :param section_number: number of the enclosing section
    :param sub_section: current sub-section value, forwarded to ``util.header``
    :param display_name: display text of ``name`` used in the header
    :return: whatever ``util.header`` returns (presumably the next
        sub-section counter -- confirm against ``util.header``)
    """
    display_target = get_display_text(target)
    display_category = get_display_text(category)

    sub_section = util.header(section_number, sub_section,
                              "{} Box Plot With {} by {}".format(display_name, display_target, display_category))

    try:
        dskc_graphs.box_plot(df, category, target, name, flip=True, grouped=True, agg_func="mean")
    except Exception:
        # Deliberate best-effort: a failing plot must not abort the report.
        # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
        plt.show()
        print("\nNot available.\n")

    return sub_section
def boolean_col(df, name, target, target_true=False, section_number=1):
    """Draw the report graphs for one boolean column.

    Always draws the bar chart (``_bars``), starting at sub-section 1. When
    ``target`` is truthy, the proportion bar chart (``_bars_proportion``) is
    drawn as well, using the sub-section value returned by ``_bars``.

    :param df: dataframe holding the columns
    :param name: name of the column to plot
    :param target: name of the target column; falsy skips the proportion chart
    :param target_true: forwarded unchanged to ``_bars_proportion``
    :param section_number: number of the enclosing section
    :return: None
    """
    label = get_display_text(name)
    column = df[name]

    # plain bars come first and hand back the sub-section to continue from
    following_sub_section = _bars(column, section_number, 1, label)

    # nothing more to draw without a target
    if not target:
        return

    _bars_proportion(df, name, target, target_true, section_number,
                     following_sub_section, label, get_display_text(target))
Example #3
0
def date_col(df, name, target, target_true=False, section_number=1):
    """Draw the report graphs for one date-derived column.

    Columns whose dtype name contains ``"date"`` are skipped entirely (the
    line graph for them is not implemented yet). For every other column a bar
    chart is drawn and, when ``target`` is truthy, a proportion bar chart.
    Columns whose name ends in ``_weekday`` (case-insensitive) use the labels
    from ``get_weekdays()`` as x ticks; all others pass ``False``.

    :param df: dataframe holding the columns
    :param name: name of the column to plot
    :param target: name of the target column; falsy skips the proportion chart
    :param target_true: forwarded unchanged to ``_bars_proportion``
    :param section_number: number of the enclosing section
    :return: None
    """
    # set names
    display_name = get_display_text(name)
    sub_section = 1

    # set series
    series = df[name]

    # TODO: line graph for real datetime columns. Until it exists nothing is
    # drawn for them (the old unreachable header call after the return was
    # dropped).
    if "date" in str(series.dtype):
        return

    xticks = get_weekdays() if name.lower().endswith("_weekday") else False

    # bars
    sub_section = _bars(series, xticks, section_number, sub_section,
                        display_name)

    # bars proportion
    if target:
        _bars_proportion(df, name, xticks, target, target_true, section_number,
                         sub_section, display_name)
Example #4
0
def number_col(df, name, target, section_number=1, categories=None):
    """Draw the report graphs for one numeric column.

    In order: correlation, histogram, density plot, histogram without
    outliers, density plot without outliers and, only when ``target`` is
    truthy, one box plot per entry of ``categories`` (the target itself is
    skipped). The density plots use their ``*_target`` variants when
    ``target`` is truthy.

    :param df: pandas dataframe
    :param name: name of the column
    :param target: target column variable; falsy disables the target variants
        and the box plots
    :param section_number: number of the enclosing section
    :param categories: iterable of categorical column names to draw box plots
        for; ``None`` (the default) means no box plots
    :return: None
    """
    # ``None`` sentinel instead of a shared mutable ``[]`` default
    if categories is None:
        categories = []

    # set names
    display_name = get_display_text(name)
    sub_section = 1

    # set series
    series = df[name]

    # correlation
    sub_section = _correlation(df, name, section_number, sub_section, display_name)

    # histogram
    sub_section = _hist(series, section_number, sub_section, display_name)

    # density plot
    if target:
        sub_section = _dns_plot_target(df, name, target, section_number, sub_section, display_name)
    else:
        sub_section = _dns_plot(df, name, section_number, sub_section, display_name)

    # histogram no outliers
    sub_section = _hist_no_outliers(series, section_number, sub_section, display_name)

    # density plot no outliers
    # NOTE(review): unlike the calls above, the result is not stored back in
    # ``sub_section``, so the first box plot below receives the same
    # sub-section value as this plot -- confirm whether that is intended.
    if target:
        _dns_plot_target_no_outliers(df, name, target, section_number, sub_section, display_name)
    else:
        _dns_plot_no_outliers(df, name, section_number, sub_section, display_name)

    # box plots only make sense against a target
    if not target:
        return

    for category in categories:
        if category == target:
            continue

        sub_section = _box_plot(df, name, target, category, section_number, sub_section, display_name)
Example #5
0
def text_col(df,
             name,
             target=None,
             target_true=False,
             section_number=1,
             top_words=15,
             stop_words=None):
    """Draw the report graphs for one free-text column.

    In order: word cloud, top-words chart and, when ``target`` is not
    ``None``, the text proportion chart against the target column. The
    proportion chart is best-effort: if it fails, whatever figure is pending
    is shown and a "Not available." notice is printed.

    :param df: dataframe holding the columns
    :param name: name of the text column
    :param target: name of the target column, or ``None`` to skip the
        proportion chart
    :param target_true: forwarded unchanged to ``_text_proportion_succcess``
    :param section_number: number of the enclosing section
    :param top_words: forwarded to ``_top_words`` and
        ``_text_proportion_succcess``
    :param stop_words: stop-word list forwarded to ``_wordcloud`` and
        ``get_text_from``; ``None`` (the default) means an empty list
    :return: None
    """
    # Fresh list per call: the helpers receive this object and a shared
    # mutable default could leak state between calls.
    stop_words = [] if stop_words is None else stop_words

    # get names
    display_name = get_display_text(name)
    sub_section = 1

    # set series
    series = df[name]

    # wordcloud
    sub_section = _wordcloud(series, section_number, sub_section, display_name,
                             stop_words)

    # bars graph
    text = get_text_from(series, stop_words=stop_words)

    # set word series
    words = text.split(" ")
    words_series = pd.Series(words)

    # top n words
    sub_section = _top_words(words_series, top_words, section_number,
                             sub_section, display_name)

    # text proportion graphs
    if target is not None:
        try:
            _text_proportion_succcess(series, words_series, df[target],
                                      target_true, top_words, section_number,
                                      sub_section, display_name)
        except Exception:
            # Deliberate best-effort; ``Exception`` (not a bare except) lets
            # Ctrl-C / SystemExit through.
            plt.show()
            print("\nNot available.\n")
Example #6
0
def box_plot(df,
             x_col,
             fill_col,
             agg_col,
             agg_func='count',
             filter_col=None,
             filterby=None,
             no_outliers=False,
             title=None,
             ylabel=None,
             grouped=True,
             flip=True,
             dots=True):
    """Draw and show a ggplot box plot of a column by category.

    :param df: dataframe to plot
    :param x_col: column placed on the x axis
    :param fill_col: column used to colour the jittered dots; its display
        text is the legend name
    :param agg_col: column the aggregation is applied to; also the plotted
        value column when ``grouped`` is False and the source of the default
        y label
    :param agg_func: aggregation forwarded to ``_get_grouped_aux`` (only used
        when ``grouped`` is True)
    :param filter_col: forwarded to ``_get_grouped_aux``
    :param filterby: forwarded to ``_get_grouped_aux``; ``None`` (the
        default) means ``[""]``
    :param no_outliers: when True, limit the value axis to the range from the
        minimum to the 75th percentile of the plotted values
    :param title: optional title, rendered after the ``"Box plot:"`` prefix
    :param ylabel: value-axis label; falsy falls back to the display text of
        ``agg_col``
    :param grouped: when True plot the frame returned by
        ``_get_grouped_aux``, otherwise plot ``df`` as is
    :param flip: when True apply ``coord_flip()``
    :param dots: when True overlay jittered points filled by ``fill_col``
    :return: None; the graph is drawn and shown
    """
    # ``None`` sentinel instead of a shared mutable ``[""]`` default
    if filterby is None:
        filterby = [""]

    # set display texts
    fill_label = get_display_text(fill_col)

    if grouped:
        aux, aux_str = _get_grouped_aux(df, x_col, fill_col, agg_col, agg_func,
                                        filter_col, filterby)
    else:
        aux = df
        aux_str = agg_col

    # sort x labels
    aux = aux.sort_values(by=[x_col])

    # x limits as strings, in sorted order
    xcol_list = aux.astype(str)[x_col].unique().tolist()

    # graph creation
    graph = (ggplot(aux) + geom_boxplot(aes(x=x_col, y=aux_str)) + theme_bw() +
             theme(axis_line_x=element_line(color='gray'),
                   axis_line_y=element_line(color='gray'),
                   line=element_line(color='white')) +
             scale_fill_manual(values=colors.OFICIAL_COLORS, name=fill_label) +
             scale_x_discrete(limits=xcol_list) +
             scale_y_continuous(labels=custom_format('{:,.0f}')))

    # dots
    if dots:
        graph += geom_jitter(aes(x=x_col, y=aux_str, fill=fill_col))

    # no outliers: the summary statistics are only needed in this branch
    if no_outliers:
        aux_describe = aux[aux_str].describe().loc
        graph += ylim(aux_describe["min"], aux_describe["75%"])

    # title
    if title is not None:
        graph += ggtitle("Box plot:" + str(title))

    # flip
    if flip:
        graph += coord_flip()

    # set y label
    if not ylabel:
        ylabel = get_display_text(agg_col)
    graph += ylab(ylabel)

    # show
    graph.draw()
    plt.show()