def barchart(dataframe, numerical_column, is_percent=False):
    """Draw a horizontal bar chart for one numeric column of a frequency table.

    Args:
        dataframe: Frequency table. Its index supplies the bar labels and
            its index name is used as the y-axis label.
        numerical_column: Name of the column that holds the bar lengths.
        is_percent: When true, the x-axis is pinned to the range 0-100.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    rcParams['figure.figsize'] = 8, 5

    # Both modes draw the same bars, in ascending order of the plotted value;
    # percentage mode only differs by fixing the x-axis limits afterwards.
    ordered_values = dataframe.sort_values(
        numerical_column, ascending=True
    )[numerical_column]
    ordered_values.plot.barh(color="#337ab7")
    if is_percent:
        plt.xlim(0, 100)

    plt.xticks(fontsize=12, rotation=0)
    plt.yticks(fontsize=12, rotation=0)
    plt.xlabel(numerical_column, fontsize=15)
    plt.ylabel(dataframe.index.name, fontsize=15)

    plt.tight_layout(rect=(0.1, 0.1, 0.9, 0.9))
    return plot_360_n0sc0pe(plt)
def stackbarchart(df, cat_a, cat_b):
    """Plot two categorical columns against each other as stacked bars.

    The left panel stacks the raw counts of ``cat_b`` within every ``cat_a``
    category; the right panel stacks the same data normalised per ``cat_a``
    category. When a column has more than 10 distinct values, only its 9
    most frequent values are kept and everything else is relabelled
    ``"Others"``.

    Args:
        df: Source DataFrame; it is not modified.
        cat_a: Column shown on the x-axis.
        cat_b: Column used for the stacked segments and the legend.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    rcParams['figure.figsize'] = (10, 4)

    # Explicit copy: the "Others" relabelling below writes through ``.loc``.
    # Doing that on a bare column slice triggers pandas' chained-assignment
    # warning and leaves it ambiguous whether the caller's frame is touched.
    temp_df = df[[cat_a, cat_b]].copy()

    # x-axis order: categories of cat_a by descending frequency.
    sorted_order = list(temp_df[cat_a].value_counts().index)
    if df[cat_a].nunique() > 10:
        temp_df.loc[~temp_df[cat_a].isin(sorted_order[0:9]), cat_a] = "Others"
        sorted_order = sorted_order[0:9] + ["Others"]

    if df[cat_b].nunique() > 10:
        top_categories = list(temp_df[cat_b].value_counts().index[0:9])
        temp_df.loc[~temp_df[cat_b].isin(top_categories), cat_b] = "Others"

    fig, (ax1, ax2) = plt.subplots(1, 2)

    # Left panel: absolute counts.
    grouped_df = temp_df.groupby([cat_a, cat_b]).size()
    grouped_df.unstack().reindex(index=sorted_order).plot(
        kind='bar', stacked=True, ax=ax1, legend=False)
    ax1.set_title("Bar Chart", weight="bold").set_fontsize('12')

    # Right panel: each row divided by its own total, so every bar sums to 1.
    cross_df = pd.crosstab(temp_df[cat_a], temp_df[cat_b]).apply(
        lambda r: r / r.sum(), axis=1)
    cross_df.reindex(index=sorted_order).plot(kind='bar', stacked=True, ax=ax2)
    ax2.set_title("Stacked Chart", weight="bold").set_fontsize('12')
    ax2.legend(title=cat_b, loc='center left', bbox_to_anchor=(1, 0.5))

    plt.tight_layout(rect=(0.1, 0.05, 0.9, 0.95))
    return plot_360_n0sc0pe(plt)
def missing_count_row_wise(df):
    """Plot how many rows have 0, 1, 2, ... missing cells.

    Each bar is labelled ``"<missing cells> (<percent of columns>%)"`` and
    its length is the number of rows with that many missing cells.

    Args:
        df: DataFrame to inspect; it is not modified.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    rcParams['figure.figsize'] = 10, 4
    column_count = len(df.columns)

    # Index: number of missing cells in a row; values: rows with that count.
    rows_per_missing_count = df.isnull().sum(axis=1).value_counts()

    # Build the frame from explicit columns so the result does not depend on
    # whatever name pandas assigns to the value_counts() result.
    missing_row_df = pd.DataFrame({
        "index": rows_per_missing_count.index,
        "Number of rows": rows_per_missing_count.values,
    })

    n_missing = missing_row_df["index"]
    if column_count:
        # Scale to a percentage *before* rounding. Rounding the fraction to
        # two decimals and then truncating to int turns e.g.
        # 100 * 0.29 == 28.999999999999996 into 28.
        percent = (100 * n_missing / column_count).round().astype(int)
    else:
        # A frame without columns has no cells that could be missing.
        percent = n_missing * 0
    missing_row_df["percent"] = percent
    missing_row_df["Missing cells %"] = (
        n_missing.astype(str) + " (" + percent.astype(str) + "%)"
    )

    missing_row_df = missing_row_df.sort_values("index", ascending=False)

    sns.barplot(x="Number of rows", y="Missing cells %", data=missing_row_df,
                orient="h", color="#337ab7")
    plt.xlabel('Number of rows', fontsize='12')
    plt.ylabel('Number of missing cells', fontsize='12')

    plt.tight_layout(rect=(0.1, 0, 0.9, 1))
    return plot_360_n0sc0pe(plt)
def scatterplot(df, num_a, num_b):
    """Plot the distributions of two numeric columns and their relationship.

    The left panel overlays the distribution of both columns (missing values
    dropped); the right panel is a regression plot of ``num_b`` against
    ``num_a``, titled with their correlation rounded to two decimals.

    Args:
        df: Source DataFrame.
        num_a: Numeric column plotted on the x-axis of the regression plot.
        num_b: Numeric column plotted on the y-axis of the regression plot.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2)

    # NOTE(review): distplot is marked deprecated in recent seaborn releases;
    # kept because switching to another function would change the rendering.
    sns.distplot(df[num_a].dropna(), color="skyblue", label=num_a, ax=ax1)
    sns.distplot(df[num_b].dropna(), color="red", label=num_b, ax=ax1)
    ax1.set_title("", weight="bold").set_fontsize('12')

    # x and y are passed by keyword: as positional arguments, newer seaborn
    # versions would interpret the first one as ``data``.
    sns.regplot(x=df[num_a], y=df[num_b], ax=ax2)
    reg_coeff = round(df[num_a].corr(df[num_b]), 2)
    ax2.set_title("R: " + str(reg_coeff), weight="bold").set_fontsize('12')

    return plot_360_n0sc0pe(plt)
def wordcloud(series):
    """Render a word cloud from the text values of a pandas Series.

    Args:
        series: Series of text values. Missing values are ignored and every
            remaining value is converted with ``str``.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    # Join the actual values. ``str(series.values)`` would produce the NumPy
    # repr instead, which abbreviates long arrays with "..." and would drop
    # most of the text from the cloud.
    text = " ".join(series.dropna().astype(str))

    cloud = WordCloud(
        background_color='white',
        stopwords=set(STOPWORDS),
        max_words=200,
        max_font_size=40,
        random_state=42,  # fixed seed so repeated runs give the same layout
    ).generate(text)

    plt.figure(1)
    plt.imshow(cloud)
    plt.axis('off')
    return plot_360_n0sc0pe(plt)
def missing_matrix(data: pd.DataFrame) -> str:
    """Draw the missingno matrix of missing values for a DataFrame.

    Args:
        data: Pandas DataFrame whose missing values are visualised.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``; annotated as the encoded
        plot string.
    """
    matrix_options = dict(
        figsize=(10, 4),
        color=hex_to_rgb("#337ab7"),
        fontsize=10,
        sparkline=False,
        labels=True,
    )
    missingno.matrix(data, **matrix_options)

    # Leave room above and below the matrix for the column labels.
    plt.subplots_adjust(left=0.1, right=0.9, top=0.7, bottom=0.2)
    return plot_360_n0sc0pe(plt)
def missing_count_column_wise(df):
    """Plot the percentage of missing values in every column.

    A full-height (100 %) bar is drawn for each column and overlaid with a
    red bar whose height is the percentage of missing values. Columns are
    ordered by descending missing percentage.

    Args:
        df: DataFrame to inspect; it is not modified.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    rcParams['figure.figsize'] = 10, 4
    row_count = df.shape[0]

    missing_column_df = df.isnull().sum().to_frame("Missing values")
    missing_column_df["values"] = row_count

    # Scale to a percentage *before* rounding. Rounding the fraction to two
    # decimals and then truncating to int turns e.g.
    # 100 * 0.29 == 28.999999999999996 into 28.
    # fillna(0) covers an empty frame, where 0 / 0 would otherwise leave NaN
    # and make the integer cast fail.
    missing_column_df = (
        (100 * missing_column_df / row_count).round().fillna(0).astype(int)
    )
    missing_column_df = missing_column_df.sort_values(
        "Missing values", ascending=False)

    sns.set_style("darkgrid")

    # Background series: every column at 100 %.
    sns.barplot(x=missing_column_df.index, y="values", data=missing_column_df,
                color="#337ab7", label="Values")
    # Overlay: the missing share drawn on top of the background bars.
    sns.barplot(x=missing_column_df.index, y="Missing values",
                data=missing_column_df, color="red", label="Missing")

    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    # Numeric rotation: matplotlib documents a number or the keywords
    # 'vertical' / 'horizontal'; the string "90" is neither.
    plt.xticks(rotation=90)
    plt.ylabel('(%)', fontsize='12')
    plt.title("Missing values % in each column", weight="bold").set_fontsize('12')

    plt.tight_layout()
    return plot_360_n0sc0pe(plt)
def boxplot(df, num, cat):
    """Draw box plots of a numeric column for each category of another column.

    Categories are ordered by descending mean of ``num``. When there are
    more than 10 categories, only the 9 with the highest mean are kept and
    all remaining rows are pooled into an ``"Others"`` box.

    Args:
        df: Source DataFrame; it is not modified.
        num: Numeric column shown on the y-axis.
        cat: Categorical column shown on the x-axis.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    rcParams['figure.figsize'] = 10, 5

    # Explicit copy: the "Others" relabelling below writes through ``.loc``.
    # Doing that on a bare column slice triggers pandas' chained-assignment
    # warning and leaves it ambiguous whether the caller's frame is touched.
    temp_df = df[[num, cat]].copy()

    sorted_order = (
        temp_df.groupby(cat, as_index=False)[num]
        .mean()
        .sort_values(num, ascending=False)
    )

    if sorted_order.shape[0] > 10:
        top_categories = list(sorted_order[cat][0:9])
        temp_df.loc[~temp_df[cat].isin(top_categories), cat] = "Others"
        box_order = top_categories + ["Others"]
    else:
        box_order = sorted_order[cat]

    sns.boxplot(x=cat, y=num, data=temp_df, order=box_order)

    plt.xticks(fontsize=12, rotation=0)
    plt.yticks(fontsize=12, rotation=0)
    plt.xlabel(cat, fontsize=15)
    plt.ylabel(num, fontsize=15)

    plt.tight_layout(rect=(0.1, 0.1, 0.9, 0.9))
    return plot_360_n0sc0pe(plt)
def histogram(series: pd.Series, col_name):
    """Draw a box plot stacked above a distribution plot of a numeric Series.

    Both panels share the x-axis; the box plot takes 15 % of the figure
    height and the distribution plot the remaining 85 %.

    Args:
        series: Numeric values to plot; missing values are dropped.
        col_name: Not used by this function; kept so existing callers
            continue to work.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    rcParams['figure.figsize'] = 10, 5

    x = np.array(series.dropna())

    f, (ax_box, ax_hist) = plt.subplots(
        2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})

    # ``x`` is passed by keyword so the box is laid out along the shared
    # x-axis; as a positional argument, newer seaborn versions would
    # interpret it as ``data``.
    sns.boxplot(x=x, ax=ax_box, color="#337ab7")
    sns.distplot(x, ax=ax_hist, color="#337ab7")

    # The box plot has no meaningful y-axis: drop its ticks and left spine.
    ax_box.set(yticks=[])
    sns.despine(ax=ax_hist)
    sns.despine(ax=ax_box, left=True)

    plt.xticks(fontsize=12, rotation=0)
    plt.yticks(fontsize=12, rotation=0)

    plt.tight_layout()
    return plot_360_n0sc0pe(plt)
def networkplot(column_types, associations):
    """Draw the column-association network.

    Every column is a node and every association with an absolute value of
    at least 0.01 is an edge. Node size is the summed absolute association
    of the column as ``col_a``, scaled so the largest node is 1; node colour
    follows the column type; edge width is proportional to the absolute
    association.

    Args:
        column_types: Mapping from column name to its type label. ``"BOOL"``,
            ``"CAT"`` and ``"NUM"`` have dedicated colours; any other label
            is drawn blue.
        associations: DataFrame with the columns ``col_a``, ``col_b``,
            ``association`` and ``type_``; it is not modified.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    type_colors = {"BOOL": "#c03d3e", "CAT": "#3a923a", "NUM": "#337ab7"}

    # Work with absolute association strength and drop negligible edges.
    assn_df = associations.copy()
    assn_df["assn"] = abs(assn_df["association"])
    assn_df = assn_df[assn_df["assn"] >= .01]

    G = nx.from_pandas_edgelist(
        assn_df, source='col_a', target='col_b',
        edge_attr=["assn", "type_"], create_using=nx.DiGraph())

    # Node size: total outgoing strength, normalised by the largest total.
    strength = assn_df.groupby("col_a")["assn"].sum()
    relative_size = (strength / strength.max()).to_dict()

    node_color = []
    node_size = []
    for node in G:
        node_color.append(type_colors.get(column_types[node], "blue"))
        # A node that only ever appears as ``col_b`` has no outgoing total;
        # give it size 0 instead of failing on the missing entry.
        node_size.append(1000 * relative_size.get(node, 0.0))

    edge_width = [20 * G[u][v]['assn'] for u, v in G.edges()]

    plt.figure(figsize=(10, 4))
    pos = nx.spring_layout(G, iterations=50)

    # ``with_labels`` belongs to nx.draw / nx.draw_networkx, not to
    # draw_networkx_labels, so it is not passed here.
    nx.draw_networkx_labels(
        G, pos, font_size=15, font_family='sans-serif',
        font_color="#000000", font_weight="bold")
    nx.draw_networkx_nodes(
        G, pos, node_color=node_color, node_size=node_size,
        node_shape="o", alpha=0.9, linewidths=10)
    nx.draw_networkx_edges(
        G, pos, width=edge_width, alpha=0.15, style='solid',
        edge_color="grey", arrows=False)

    plt.axis('off')
    plt.tight_layout()
    return plot_360_n0sc0pe(plt)