def plot_Gmeta(array1, array2, array3, title, ylabel, density):
    """Draw a grouped bar chart of three series on a logarithmic y axis.

    The three arrays are drawn side by side as the 'Corpus', 'k-core' and
    'k-truss' series. Every bar is annotated with its value, the x ticks are
    labelled with the names returned by ``get_names()``, and the figure is
    written to a PNG file (path built by ``clean_filename`` from ``title``)
    before being shown.

    Args:
        array1: Values of the 'Corpus' series, one per group.
        array2: Values of the 'k-core' series, one per group.
        array3: Values of the 'k-truss' series, one per group.
        title: Figure title; also used to build the output file name.
        ylabel: Label of the y axis.
        density: If truthy, bar values are annotated with three decimals;
            otherwise they are annotated as whole numbers.
    """
    bar_width = .3
    colors = ['tan', 'yellowgreen', 'midnightblue']
    labels = ['Corpus', 'k-core', 'k-truss']
    names = get_names()
    series = [array1, array2, array3]

    _, ax = plt.subplots()

    # x coordinate of every bar centre: each series is shifted by one bar
    # width relative to the previous one.
    positions = [np.arange(len(series[0])) + bar_width / 2]
    for _ in series[1:]:
        positions.append(positions[-1] + bar_width)

    # Make the plot
    for xs, values, color, label in zip(positions, series, colors, labels):
        ax.bar(xs, values, color=color, width=bar_width,
               edgecolor='white', label=label)

    # Annotate every bar with its value. Iterating over the data itself
    # (instead of a fixed range(3)) keeps this correct for any group count.
    text_x_offset = .1
    text_y_offset = 1.5
    for xs, values in zip(positions, series):
        for x, value in zip(xs, values):
            x_pos = x - bar_width / 3 + text_x_offset
            if density:
                # Densities are small: lift the text by a tiny amount only.
                y_pos = value + np.log(1.0001)
                text = f'{value:.3f}'
            else:
                y_pos = value + np.exp(text_y_offset)
                text = f'{value:.0f}'
            ax.text(x_pos, y_pos, text, horizontalalignment='center',
                    rotation=0, color='black', fontsize=10)

    # Put the ticks in the middle of each group of bars, i.e. half-way
    # between the centres of the first and the last series.
    group_centres = positions[0] + bar_width * (len(series) - 1) / 2
    ax.set_ylabel(ylabel, fontweight='normal')
    ax.set_xticks(group_centres)
    ax.set_xticklabels(names)
    ax.set_yscale('log')
    ax.set_title(title)

    # Create legend, save to file & show graphic
    ax.legend(loc='best', fontsize='small')
    plt.savefig(clean_filename(title, 'png', plot_directory))
    plt.show()
def ROC_plot(fpr, tpr, roc_auc, n_classes, ix, gmean, algo, method, save=True):
    '''Plot one ROC curve per class together with its selected threshold.

    For each class index i in range(n_classes) the curve (fpr[i], tpr[i]) is
    drawn and the point at position ix[i] on that curve is marked in black.
    A dashed 'No Skill' diagonal is added for reference.

    fpr, tpr : per-class false / true positive rates, indexable by class
    roc_auc  : per-class area under the curve, shown in the legend
    ix       : per-class index of the point to highlight on the curve
    gmean    : average g-mean, shown once in the legend
    algo, method : used to build the title 'ROC {method}+{algo}'
    save     : when true the figure is also written to a png file
    '''
    palette = get_color_dict()
    class_names = get_names()
    account_handles = get_handles()
    line_width = 1
    side = 6
    title = f'ROC {method}+{algo}'

    # Square figure; curve colours repeat over the first three handles.
    plt.figure(figsize=(side, side))
    curve_colors = cycle([palette[account_handles[k]] for k in range(3)])

    for cls in range(n_classes):
        plt.plot(fpr[cls], tpr[cls],
                 color=next(curve_colors),
                 linewidth=line_width,
                 label=f'ROC {class_names[cls]} (auc = {roc_auc[cls]:0.2f})')

        # Only the first marker carries a legend entry, so 'Best' is
        # listed a single time.
        marker_options = {'marker': 'o', 'color': 'black'}
        if cls == 0:
            marker_options['label'] = f'Best (gmeans={gmean:.3f})'
        best = ix[cls]
        plt.scatter(fpr[cls][best], tpr[cls][best], **marker_options)

    # Diagonal of a classifier without skill.
    plt.plot([0, 1], [0, 1], 'k--', linewidth=line_width, label='No Skill')

    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")

    if save:
        plt.savefig(clean_filename(title, 'png', plot_directory))
    plt.show()
def plot_confusion_matrix(y_pred, y_test, score, handles, algo, method,
                          normalize, save=True):
    """Plot a confusion matrix with predictions on the y axis.

    The matrix is obtained from
    ``confusion_matrix(y_pred, y_test, labels=handles)``, so rows are
    predicted classes and columns are true classes.
    Note: this argument order is the opposite of that suggested in
    scikit-learn.

    Args:
        y_pred: Predicted class of every sample.
        y_test: True class of every sample.
        score: Score of the classifier; only printed.
        handles: Class identifiers, fixing the row / column order.
        algo: Classifier name, used in the log output and the title.
        method: Feature method name, used in the title ('{method}+{algo}').
        normalize: If true, every row is divided by its sum and rounded to
            two decimals before plotting.
        save: If true, the figure is also written to a PNG file.
    """
    figsize_x = 6.5
    figsize_y = 6

    cm = confusion_matrix(y_pred, y_test, labels=handles)
    print(cm)
    print(f'[confusion_matrix_wrapper] {algo} score = {score}\n')
    labels = get_names()

    # Accuracy is computed on the raw counts, before any normalisation.
    accuracy = np.trace(cm) / float(np.sum(cm))
    print(f'[plot_confusion_matrix] Accuracy={accuracy}')
    print(f'[plot_confusion_matrix] confusion matrix:\n{cm}')

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm = np.round(cm, 2)

    fig = plt.figure(figsize=(figsize_x, figsize_y))
    ax = fig.add_subplot(111)
    ax.matshow(cm)
    title = f'{method}+{algo}'
    plt.title(title, y=1.1)
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)

    # Pin one tick per class before labelling it; labelling whatever ticks
    # the automatic locator produced (with a leading '' as padding) breaks
    # as soon as the locator picks different positions.
    tick_positions = np.arange(len(labels))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(labels)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(labels)
    plt.ylabel('Predicted')
    plt.xlabel(f'True\naccuracy={accuracy:0.3f}')

    # Write the value of every cell, whatever the size of the matrix.
    for (row, col), value in np.ndenumerate(cm):
        plt.text(col, row, str(value))

    if save:
        plt.savefig(clean_filename(title, 'png', plot_directory))
    plt.show()
def build_base_barplot(handles, array, x_pos, title, xlabel, ylabel, color_dict):
    '''Create a labelled vertical bar chart with one coloured bar per handle.

    Bars of height `array` are placed at `x_pos`; bar i takes the colour
    color_dict[handles[i]] and the x ticks show the names from get_names().
    Returns the (figure, axes) pair so callers can decorate it further.
    '''
    bar_width = .75
    tick_names = get_names()

    figure, axes = plt.subplots()
    bars = axes.bar(x_pos, array, bar_width)

    # Recolour each bar according to the handle it belongs to.
    for position, account in enumerate(handles):
        bars[position].set_color(color_dict[account])

    axes.set_xticks(x_pos)
    axes.set_xticklabels(tick_names)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_title(title)

    return figure, axes
def tweeting_period_plot(handles, start, end, color_dict):
    '''Draw a horizontal bar per handle showing how long it has been tweeting.

    Bar lengths come from get_ndays(start, end). Each bar is coloured with
    color_dict[handle] and annotated with its start and end date strings.
    The figure is saved as a png file and then shown.

    handles    : account identifiers, one bar each
    start, end : date strings, indexed like handles
    color_dict : maps a handle to its bar colour
    '''
    annotation_size = 14
    account_names = get_names()
    durations = get_ndays(start, end)

    fig, ax = plt.subplots(figsize=(18, 4.875))
    rows = np.arange(len(handles))
    bars = ax.barh(rows, durations, .75)

    # Annotation anchors in axes-fraction coordinates; they are tuned by
    # hand for three bars.
    end_x = .89
    start_x = [.05, .275, .55]
    y_val = [.815, .485, .15]

    for k, account in enumerate(handles):
        bars[k].set_color(color_dict[account])
    for k in range(len(handles)):
        ax.annotate('<-- ' + start[k],
                    xy=(start_x[k], y_val[k]),
                    xycoords='axes fraction',
                    fontsize=annotation_size)
        ax.annotate(end[k] + ' -->',
                    xy=(end_x, y_val[k]),
                    xycoords='axes fraction',
                    fontsize=annotation_size)

    ax.invert_xaxis()  # time runs right to left
    ax.set_yticks(rows)
    ax.set_yticklabels(account_names)
    ax.invert_yaxis()  # first handle on top
    ax.set_xlabel('# of days')

    title = 'Tweeting time period'
    ax.set_title(title, fontsize=18)

    plt.savefig(clean_filename(title, 'png', plot_directory))
    plt.show()
def build_set(tweets):
    '''Return the set of distinct whitespace-separated words in the tweets.

    tweets : iterable of strings; each one is split on whitespace
    '''
    return {token for text in tweets for token in text.split()}


if __name__ == '__main__':
    t0 = time.time()
    name_dict = get_name_dict()
    color_dict = get_color_dict()
    handles = get_handles()
    names = get_names()
    start_dates = ['05/01/2018', '03/07/2018', '02/02/2019']
    end_dates = ['25/01/2020', '26/01/2020', '25/01/2020']
    eda_plots.tweeting_period_plot(handles, start_dates, end_dates, color_dict)
    nraw = list()  # number of raw tweets for each handle
    ntweets = list()  # number of processed tweets for each handle
    word_counts = list()  # list of word counts
    char_counts = list()  # list of character counts
    unique_words = list()  # list of # of unique words for each handle
    unique_all = set()  # set of unique words overall
    wc_arrays = list()  # list of arrays of word_counts for sig. test
    for handle in handles:  # Loop over handles
        print(name_dict[handle])