def plot_links_heatmap(self):
    """Plot a heatmap of how many times each (source, destination) link was used.

    For every packet in ``self.packets`` the hop list ``pkt.hop_info`` is
    walked: each hop's ``'addr'`` is the link source and the next hop's
    ``'addr'`` is the destination. The final hop of a packet is counted as a
    link towards address 1 (presumably the sink/root mote -- confirm).
    Addresses are used as 1-based indices into a square count matrix sized by
    the module-level ``gl_mote_range``.

    When the module-level ``gl_save`` flag is truthy the figure is written to
    ``gl_image_path`` as ``<log file stem>_link_load.png``.

    :return: None
    """
    motes = list(gl_mote_range)
    links = [[0] * len(motes) for _ in motes]

    for pkt in self.packets:
        hops = pkt.hop_info
        last = len(hops) - 1
        for idx, hop in enumerate(hops):
            src = hop['addr']
            # The last hop has no successor in the list; it is attributed to
            # address 1.
            dst = 1 if idx == last else hops[idx + 1]['addr']
            links[src - 1][dst - 1] += 1

    plt.figure()
    heatmap(data=links, xticklabels=motes, yticklabels=motes)

    if gl_save:
        log_name = self.filename.split('/')[-1]
        # Strip the ".log" suffix; fall back to the bare file name instead of
        # raising IndexError when the file is not a ".log" file.
        match = re.search(r"(.+?)\.log", log_name)
        stem = match.group(1) if match else log_name
        # The savefig keyword is ``bbox_inches``; ``bbox`` is not a savefig
        # option, so the tight bounding box was never applied.
        plt.savefig(gl_image_path + stem + '_link_load.png',
                    format='png', bbox_inches='tight')
def plot_sig_bins(self, binwise_gc, savename='sigbins.png', show=False):
    """Plot the per-strand signal of significant bins above a GC-content strip.

    Two line plots are drawn from ``self.chip.binnedNormSignal[0]`` and
    ``[1]`` (labelled "(+) strand" and "(-) strand"). A value is kept only
    when its position in ``self.binSpace`` is contained in the matching entry
    of ``self.sigBinIndices``; every other bin is plotted as 0. Both plots
    share the same y-range, starting at 0. Below them ``binwise_gc`` is shown
    as a two-row heatmap strip with its colorbar in the last grid column.

    :param binwise_gc: sequence of per-bin GC values drawn as the heatmap strip
    :param savename: path the figure is saved to (dpi=200)
    :param show: when exactly ``True``, also call ``plt.show()``
    :return: always ``True``
    """
    def masked_signal(strand):
        # Built as a real list: a lazy ``map`` object (Python 3) cannot be
        # handed to ``Axes.plot``.
        significant = self.sigBinIndices[strand]
        return [
            value if position in significant else 0
            for value, position in zip(self.chip.binnedNormSignal[strand],
                                       self.binSpace)
        ]

    plt.figure(figsize=(12, 4))
    gs = gridspec.GridSpec(10, 15)

    ax1 = plt.subplot(gs[:4, :-1])
    ax1.plot(self.binSpace, masked_signal(0), lw=0.5, c='darkorange')
    ax1.yaxis.set_major_locator(MaxNLocator(5))
    ax1.xaxis.set_major_locator(MaxNLocator(15))

    ax2 = plt.subplot(gs[4:8, :-1])
    ax2.yaxis.set_major_locator(MaxNLocator(5))
    # NOTE(review): 'darksage' is not a colour name in every matplotlib
    # release -- confirm it resolves with the version in use.
    ax2.plot(self.binSpace, masked_signal(1), lw=0.5, c='darksage')
    ax2.xaxis.set_major_locator(MaxNLocator(15))

    # Give both strands the same y-range so their heights are comparable.
    shared_top = max(ax1.get_ylim()[1], ax2.get_ylim()[1])
    ax1.set_ylim((0, shared_top))
    ax2.set_ylim((0, ax1.get_ylim()[1]))
    ax1.set_ylabel("ChiP RPM\n(+) strand")
    ax2.set_ylabel("ChiP RPM\n(-) strand")

    ax3 = plt.subplot(gs[6:, -1:])  # colorbar axis
    ax4 = plt.subplot(gs[8:, :-1])  # GC strip
    sns.heatmap([binwise_gc, binwise_gc], cmap='seismic', ax=ax4,
                cbar_ax=ax3, xticklabels=False, yticklabels=False)
    ax4.set_xlabel('GC percentage')

    plt.tight_layout()
    plt.savefig(savename, dpi=200)
    if show is True:
        plt.show()
    return True
def calculate_phi(list1, list2):
    """Draw a heatmap of the joint density divided by the marginal densities.

    Both inputs are binned on ``[0, 1]`` with bin width 0.1. The 2-D density
    of ``(list1, list2)`` is divided element-wise by the 1-D densities of
    ``list1`` and ``list2``, the rows are reversed, and the result is drawn
    with a blue-white-red colormap centred on 1.1 (range 0.0 to 2.2).

    Bins whose marginal density is zero produce ``inf``/``nan`` cells (NumPy
    emits a RuntimeWarning), exactly as before.

    :param list1: sequence of values, expected in [0, 1]
    :param list2: sequence of values, expected in [0, 1]
    :return: None (the heatmap is drawn on a new 6x6 inch figure)
    """
    delta_x = 0.1
    bins = np.arange(0, 1 + delta_x, delta_x)

    # ``density=True`` replaces ``normed='sample_count'``: the string was only
    # ever used as a truthy flag, and ``normed`` has been removed from NumPy.
    joint, x_edges, y_edges = np.histogram2d(list1, list2, bins=bins,
                                             density=True)
    hist_x, x = np.histogram(list1, bins=x_edges, density=True)
    hist_y, y = np.histogram(list2, bins=y_edges, density=True)

    # Element [i][j] is divided by hist_x[j] and hist_y[i], as in the original
    # nested loops.
    # NOTE(review): np.histogram2d indexes its result as [x_bin][y_bin], so
    # this pairs row i with hist_y and column j with hist_x -- confirm whether
    # the joint histogram was meant to be transposed first.
    ratio = joint / hist_x[np.newaxis, :] / hist_y[:, np.newaxis]
    reverse_h = ratio[::-1]  # first row ends up at the bottom of the plot

    fig = plt.figure(figsize=(6, 6))
    fig.add_axes([0.3, 0.3, 0.5, 0.5])
    sns.heatmap(reverse_h, cmap="bwr", vmin=0.0, vmax=2.2, center=1.1,
                cbar=True, xticklabels=x[:-1],
                yticklabels=sorted(y[:-1], reverse=True))
def render(variable1, variable2, name):
    """Show two heatmaps side by side and display the figure.

    The left panel is titled ``"<name> values"`` and shows ``variable1``;
    the right panel is titled ``"<name> errors"`` and shows ``variable2``.

    :param variable1: 2-D data for the left heatmap
    :param variable2: 2-D data for the right heatmap
    :param name: prefix used in both panel titles
    :return: None
    """
    panels = ((variable1, ' values'), (variable2, ' errors'))
    for position, (matrix, suffix) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.title(name + suffix)
        sns.heatmap(matrix)
    plt.show()
def graph_correl_features(dataset, df):
    """
    generates the graph of correlated features (heatmap matrix)

    Categorical columns listed in ``dataset.cat_cols`` are label-encoded
    in place in ``df`` before the correlation matrix is computed. The heatmap
    is rendered twice, once per theme (dark and light); the figure size grows
    with ``dataset.n_cols``.

    Any exception is caught and logged; nothing is raised to the caller.

    :param dataset: dataset object (uses cat_cols, n_cols, dataset_id)
    :param df: data (as a dataframe); modified in place by the encoding step
    :return: None
    """
    try:
        # convert categorical to numerical
        for col in dataset.cat_cols:
            df[col] = LabelEncoder().fit_transform(df[col].map(str))

        # create correlation matrix with pandas
        corr = df.corr()

        # figure side length (inches) grows with the number of columns
        if dataset.n_cols > 50:
            side = 10
        elif dataset.n_cols > 20:
            side = 8
        elif dataset.n_cols > 10:
            side = 7
        else:
            side = 6

        # display heatmap
        for dark, theme in [(True, 'dark_background'), (False, 'seaborn-whitegrid')]:
            with plt.style.context(theme, after_reset=True):
                plt.figure(figsize=(side, side))
                # ``np.bool`` was removed from NumPy; the builtin ``bool`` is
                # the equivalent dtype. The all-False mask hides nothing.
                sns.heatmap(corr,
                            mask=np.zeros_like(corr, dtype=bool),
                            cmap=sns.diverging_palette(220, 10, as_cmap=True),
                            square=True)
                plt.title('correlation map of the features')
                plt.xticks(rotation=90)
                plt.yticks(rotation=0)
                plt.savefig(get_dataset_folder(dataset.dataset_id) + '/graphs/_correl.png',
                            transparent=TRANSPARENT)
                __save_fig(dataset.dataset_id, '_correl', dark)
    except Exception:
        # Best-effort graph: failures are logged, not propagated. Catching
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit through.
        log.error('error in graph_correl_features with dataset_id %s' % dataset.dataset_id)
def plot_activity_matrix(df, cmap, normalized=False, annotate=True, out_path='', title=''):
    """
    Plot activity matrix showing area of land transitioning between land-use types

    The matrix is drawn as an annotated heatmap (rows labelled FROM, columns
    labelled TO), saved to ``out_path`` and the figure is closed. Afterwards
    the seaborn/matplotlib settings are restored through ``sns.reset_orig()``,
    ``set_matplotlib_params()`` and ``get_colors(palette='tableau')``.

    :param df: 2-D data to plot
    :param cmap: colormap passed to seaborn.heatmap
    :param normalized: if True, values are multiplied by 100 and the colorbar
        uses plain (non-scientific) tick labels
    :param annotate: if True, write the cell values into the heatmap
    :param out_path: file path the figure is saved to
    :param title: axes title
    :return: None
    """
    logger.info('Plot activity matrix')
    sns.set(font_scale=0.8)

    formatter = tkr.ScalarFormatter(useMathText=True)
    # normalized scale is from 0 - 100, does not need scientific scale
    if not normalized:
        formatter.set_scientific(True)
        formatter.set_powerlimits((-2, 2))

    df = df * 100.0 if normalized else df * 1.0

    # Colour limits are the data range rounded outwards to whole numbers.
    # The lower bound uses floor: with ceil, every value below the rounded-up
    # minimum was clipped to the first colour.
    vmin = math.floor(np.nanmin(df))
    vmax = math.ceil(np.nanmax(df))  # maximum value on colorbar

    ax = sns.heatmap(df, cbar_kws={'format': formatter}, cmap=cmap,
                     linewidths=.5, linecolor='lightgray',
                     annot=annotate, fmt='.2g', annot_kws={'size': 6},
                     vmin=vmin, vmax=vmax)
    # for annotation of heat map cells, use: annot=True, fmt='g', annot_kws={'size': 6}
    # ax.invert_yaxis()
    ax.set_ylabel('FROM')
    ax.set_xlabel('TO')
    ax.set_title(title)

    # keep tick labels horizontal on both axes
    plt.setp(plt.xticks()[1], rotation=0)
    plt.setp(plt.yticks()[1], rotation=0)

    plt.savefig(out_path, dpi=constants.DPI)
    plt.close()

    # revert matplotlib params
    sns.reset_orig()
    set_matplotlib_params()
    get_colors(palette='tableau')
def graph_classification_categorical(dataset_id, df, col, target):
    """
    display a heatmap of col in x axis and target in y axis

    Each cell holds the number of rows of ``df`` with that (target, col)
    combination. The graph is rendered once per theme (dark and light) and
    stored through ``__save_fig``. Any exception is caught and logged.

    :param dataset_id: id of the dataset
    :param df: dataframe, with col and target values (not modified)
    :param col: name of column
    :param target: name of target column
    :return: None
    """
    try:
        # Count on a copy so the caller's dataframe does not silently gain a
        # 'count' column.
        counts = df[[col, target]].assign(count=1)
        data = pd.pivot_table(counts, values='count', index=target,
                              columns=col, aggfunc='sum')

        # Tick labels are the sorted distinct values of each column, i.e. the
        # classes a LabelEncoder maps to 0..n-1.
        x_labels = LabelEncoder().fit(df[col].values).classes_
        y_labels = LabelEncoder().fit(df[target].values).classes_
        # +0.5 centres each tick on its heatmap cell
        x_ticks = [i + 0.5 for i in range(len(x_labels))]
        y_ticks = [i + 0.5 for i in range(len(y_labels))]

        for dark, theme in [(True, 'dark_background'), (False, 'seaborn-whitegrid')]:
            with plt.style.context(theme, after_reset=True):
                plt.figure(figsize=(8, 7))
                sns.heatmap(data=data,
                            cmap=sns.diverging_palette(220, 10, as_cmap=True),
                            square=True)
                plt.xticks(x_ticks, x_labels, rotation=90)
                plt.yticks(y_ticks, y_labels, rotation=0)
                __save_fig(dataset_id, '_col_' + col, dark)
    except Exception:
        # Best-effort graph: failures are logged, not propagated. Catching
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit through.
        log.error('error in classification_categorical with dataset_id %s' % dataset_id)
def plot_links_heatmap(self):
    """Plot a heatmap of how many times each (source, destination) link was used.

    For every packet in ``self.packets`` the hop list ``pkt.hop_info`` is
    walked: each hop's ``'addr'`` is the link source and the next hop's
    ``'addr'`` is the destination. The final hop of a packet is counted as a
    link towards address 1 (presumably the sink/root mote -- confirm).
    Addresses are used as 1-based indices into a square count matrix sized by
    the module-level ``gl_mote_range``.

    When the module-level ``gl_save`` flag is truthy the figure is written to
    ``gl_image_path`` as ``<log file stem>_link_load.png``.

    :return: None
    """
    motes = list(gl_mote_range)
    links = [[0] * len(motes) for _ in motes]

    for pkt in self.packets:
        hops = pkt.hop_info
        last = len(hops) - 1
        for idx, hop in enumerate(hops):
            src = hop['addr']
            # The last hop has no successor in the list; it is attributed to
            # address 1.
            dst = 1 if idx == last else hops[idx + 1]['addr']
            links[src - 1][dst - 1] += 1

    plt.figure()
    heatmap(data=links, xticklabels=motes, yticklabels=motes)

    if gl_save:
        log_name = self.filename.split('/')[-1]
        # Strip the ".log" suffix; fall back to the bare file name instead of
        # raising IndexError when the file is not a ".log" file.
        match = re.search(r"(.+?)\.log", log_name)
        stem = match.group(1) if match else log_name
        # The savefig keyword is ``bbox_inches``; ``bbox`` is not a savefig
        # option, so the tight bounding box was never applied.
        plt.savefig(gl_image_path + stem + '_link_load.png',
                    format='png', bbox_inches='tight')
def heatmap_overlay(data, overlay_image=None, cmap='jet', cbar=False,
                    show_axis=False, alpha=0.5, **kwargs):
    """Draw ``data`` as a heatmap, optionally blended over an image.

    :param data: 2-D array (must expose ``.shape`` when an overlay is given)
    :param overlay_image: image passed to ``imshow`` in the same axes, or
        ``None`` for a plain heatmap
    :param cmap: colormap of the heatmap
    :param cbar: draw a colorbar; also widens the figure from 4x4 to 5x4
    :param show_axis: re-enable the axes and add margins for them
    :param alpha: heatmap opacity; forced to 1.0 when there is no overlay
    :param kwargs: forwarded to ``sns.heatmap``
    :return: the figure
    """
    fig, ax = tfplot.subplots(figsize=(5, 4) if cbar else (4, 4))
    fig.subplots_adjust(0, 0, 1, 1)  # use tight layout (no margins)
    ax.axis('off')

    if overlay_image is None:
        # nothing to blend with, so draw the heatmap fully opaque
        alpha = 1.0

    sns.heatmap(data, ax=ax, alpha=alpha, cmap=cmap, cbar=cbar, **kwargs)

    if overlay_image is not None:
        h, w = data.shape
        # extent is (left, right, bottom, top): the heatmap spans ``w``
        # columns along x and ``h`` rows along y. The previous [0, h, 0, w]
        # swapped the two and misaligned the overlay for non-square data.
        # NOTE(review): recent seaborn versions invert the heatmap y-axis;
        # confirm the overlay is not drawn upside down with the version in use.
        ax.imshow(overlay_image, extent=[0, w, 0, h])

    if show_axis:
        ax.axis('on')
        fig.subplots_adjust(left=0.1, bottom=0.1, right=0.95, top=0.95)
    return fig
def plot_correlations(data, save=False, savepath="", **kwds):
    """Calculate pairwise correlation between features and plot it as a heatmap.

    Extra arguments are passed on to DataFrame.corr()

    :param data: DataFrame whose columns are correlated
    :param save: when True, a timestamped ``correlations-<date>.pdf`` path is
        built and handed to ``figure_handler``; otherwise ``figure_handler``
        is called without a path
    :param savepath: directory for the saved figure; when empty, the
        module-level ``savepath_base`` is used if it exists
    :param kwds: keyword arguments forwarded to ``DataFrame.corr``
    :return: None
    """
    try:
        import seaborn.apionly as sns
    except ImportError:
        # ``seaborn.apionly`` is absent from newer seaborn releases.
        import seaborn as sns

    plt.clf()

    # simply call df.corr() to get a table of
    # correlation values if you do not need
    # the fancy plotting
    corrmat = data.corr(**kwds)  # kwds were documented but previously dropped
    labels = list(corrmat.columns.values)

    opts = {'cmap': 'YlGnBu_r', 'vmin': -1, 'vmax': 1}
    ax1 = sns.heatmap(corrmat, linewidths=.5, **opts)
    ax1.set_title('Correlations')

    # +0.5 centres each tick on its heatmap cell
    ticks = np.arange(len(labels)) + .5
    ax1.set_xticks(ticks, minor=False)
    ax1.set_yticks(ticks, minor=False)
    ax1.set_xticklabels(labels, minor=False, rotation=70, ha='right')
    ax1.set_yticklabels(labels, minor=False, rotation='horizontal')

    plt.tight_layout()

    if save:
        filename = 'correlations-' + datetime.datetime.now().strftime(
            '%Y-%m-%d_%H-%M-%S') + '.pdf'
        if savepath:
            base_dir = savepath
        else:
            # Keep the old behaviour for callers that rely on the module-level
            # ``savepath_base``; fall back to the working directory otherwise.
            try:
                base_dir = savepath_base
            except NameError:
                base_dir = ""
        fig_path = os.path.join(base_dir, filename)
        figure_handler(save, fig_path)
    else:
        figure_handler(save)
def corr_heatmap(x, mask_half=True, cmap='RdYlGn_r', vmin=-1, vmax=1,
                 linewidths=0.5, square=True, figsize=(10,10), **kwargs):
    """Wrapper around seaborn.heatmap for visualizing correlation matrix.

    Parameters
    ==========
    x : DataFrame
        Underlying data (not a correlation matrix)
    mask_half : bool, default True
        If True, mask (whiteout) the upper right triangle of the matrix
        (the diagonal is masked as well)
    figsize : tuple
        Accepted for interface compatibility; currently not used by this
        function.

    All other parameters passed to seaborn.heatmap:
    https://seaborn.pydata.org/generated/seaborn.heatmap.html

    Returns
    =======
    The value returned by seaborn.heatmap.

    Example
    =======
    %matplotlib inline

    # Generate some correlated data
    k = 10
    size = 400
    mu = np.random.randint(0, 10, k).astype(float)
    r = np.random.ranf(k ** 2).reshape((k, k)) * 5
    df = pd.DataFrame(np.random.multivariate_normal(mu, r, size=size))
    corr_heatmap(df)
    """
    # Compute the correlation matrix once; it was previously recomputed for
    # the mask and again for the plot.
    corr = x.corr()

    mask = None
    if mask_half:
        # True marks a hidden cell: upper triangle including the diagonal.
        mask = np.triu(np.ones(corr.shape, dtype=bool))

    with sns.axes_style('white'):
        return sns.heatmap(corr, cmap=cmap, vmin=vmin, vmax=vmax,
                           linewidths=linewidths, square=square, mask=mask,
                           **kwargs)
for l in range(1, a + 1): listaIndex.append(i + l) break else: continue Means = Means.ix[listaIndex] Beh = Beh.ix[listaIndex] Means.index = range(-a, a) Means2 = Means2.ix[listaIndex] Means2.index = range(-a, a) ''' Heat Map for select neurons that change in significant way ''' fig, ax = plt.subplots(figsize=(10, 20)) ax.vlines([49 + 0.5], 0, 1, transform=ax.get_xaxis_transform(), colors='k') sns.heatmap(Means.transpose(), annot=False, xticklabels=1) ''' Correlation between mean1,mean32,mean13,mean4 and all others neurons. Save only neurons that have high positive correlation Select and graphs only neurons that are present more than 1 time ''' listMean = ('Mean(13)', 'Mean(32)', 'Mean(4)') listCorr = [] for el in listMean: for column in Means: if column != el: p = Means[el].corr(Means[column]) tupla = (p.round(1), column) listCorr.append(tupla) highpositivecorr = [] highpositiveMean = []
# training dataset X_train = X.loc[lambda df: df['age'] > train_cutoff] y_train = y.loc[X_train.index] # holdout dataset X_holdout = X.loc[lambda df: df['age'] <= holdout_cutoff] y_holdout = y.loc[X_holdout.index] # fit/predict clf = LogisticRegression().fit(X_train, y_train) y_pred = clf.predict_proba(X_holdout)[:, 1] auc = roc_auc_score(y_holdout, y_pred) # cache auc auc_df.at[train_cutoff, holdout_cutoff] = auc plt.figure(figsize=(6, 3)) sns.heatmap(auc_df.fillna(0).loc[auc_df.index[::-1]], ) plt.ylabel('Training set > age') plt.xlabel('Testing set < age') plt.title('AUCs of Models Trained on\nVarious dataset splits') plt.savefig(output_dir / '2018-08-11__aucs-of-age-based-dataset-splits.png') # ### Generate AUC scores for vanilla vs. hardweighted models def cross_val_scores_weighted(model, X, y, weights, cv=5, metrics=[sklearn.metrics.roc_auc_score]): kf = KFold(n_splits=cv) kf.get_n_splits(X)
linestyles="-.", color=colors[1], linewidths=2) _ = ax.set_title("Posterior predictive distribution of AUC measurement \n \ Classifier: {0} ".format(name)) _ = ax.legend(loc="upper left") # Create an empty dataframe ab_dist_df = pd.DataFrame(index=range(m1), columns=range(m2), dtype=np.float) # TODO!!!!!!!! def prob_score_higher(s_a, s_b): return (s_a - s_b) # populate each cell in dataframe with persona_less_personb() for a, b in itertools.product(range(m1), range(m2)): ab_dist_df.ix[a, b] = prob_score_higher(pos_score[a], neg_score[b]) ax = plt.subplot(l, 2, i) i += 1 cmap = plt.get_cmap("Spectral") _ = sns.heatmap(ab_dist_df, square=True, cmap=cmap, linecolor='white') _ = ax.set_title("Positive samples score higher than negative samples \n \ Classifier: {0} ".format(name)) _ = ax.set_ylabel("positive samples") _ = ax.set_xlabel("negative samples") figure.subplots_adjust(left=.04, right=.96, bottom=.05, top=.95) plt.show()
def test_edge_imputation():
    # Generates `samples` graphs per random-graph generator, rebuilds each one
    # from a configuration-model graph with impute_edge_algorithm, predicts
    # the generator type of the result, and plots accuracy@k and a confusion
    # matrix over all generators. (Python 2 code: print statement, xrange.)
    constraints = {'edge_count': (1000, 1100)}
    # accuracy_at_k[j] counts samples whose true type is ranked at position
    # <= j; it is turned into a fraction before plotting.
    accuracy_at_k = [0] * 5
    # confusion_matrix[actual][predicted]
    confusion_matrix = [[0 for i in xrange(5)] for j in xrange(5)]
    samples = 100
    index = [
        'Watts Strogatz', 'Geometric', 'Erdos Renyi', 'Barabasi Albert',
        'Planted Partition Model'
    ]
    constraints_enforced = False
    # Same order as `index`: position in this list is the "actual" class id.
    rgs = [
        structural_identities.watts_strogatz_generator,
        structural_identities.geometric_generator,
        structural_identities.erdos_renyi_generator,
        structural_identities.barabasi_albert_generator,
        structural_identities.planted_partition_generator
    ]
    for uni, rg in enumerate(rgs):
        title = index[uni]
        actual = uni
        created_graphs = []
        for i in xrange(samples):
            G = structural_identities.constrained_generation(rg, constraints)
            # Start from a configuration-model graph with one stub per node
            # of G, then let the imputation algorithm add edges using G.
            degree_sequence = [1] * G.number_of_nodes()
            new_G = random_graphs.configuration_model(degree_sequence)
            new_G = impute_edge_algorithm(new_G, G)
            created_graphs.append(new_G)
            cluster, types = predict_structure(new_G, 2, constraints_enforced)
            # The predicted type is the one with the smallest cluster value.
            predicted = cluster.index(min(cluster))
            print title, types[predicted]
            confusion_matrix[actual][predicted] += 1
            # Double argsort turns the cluster values into ranks
            # (0 = smallest value).
            array = np.array(cluster)
            order = array.argsort()
            ranks = order.argsort().tolist()
            # k = rank at which the true type appears; stays -1 if `title`
            # is not among `types`.
            k = -1
            for i in xrange(len(cluster)):  # 5 types of rg
                if title == types[ranks.index(i)]:
                    k = i
                    break
            # Credit every cutoff j >= k.
            # NOTE(review): if k stays -1 the loop also touches index -1,
            # i.e. the last entry is incremented twice -- confirm this case
            # cannot occur.
            j = len(cluster) - 1
            while j >= k:
                accuracy_at_k[j] += 1
                j -= 1
        # HERE we plot distros
        observed_metrics, dic = structural_identities.analyze_structural_identity_graphs(
            created_graphs, uni)
        predict_metrics, dic = structural_identities.analyze_structural_identity(
            rg, samples, uni)  # constraints=None):
        structural_identities.graph_created_distributions(
            uni, observed_metrics, predict_metrics, dic)
    small_index = ['WS', 'Geo', 'ER', 'BA', 'PPM']
    plt.figure(10)
    # Convert the counts into fractions of all generated samples.
    for i in xrange(len(accuracy_at_k)):
        accuracy_at_k[i] /= (samples * 1.0 * len(rgs))
    if constraints_enforced:
        plt.plot([i for i in xrange(1, 6)], accuracy_at_k, marker='o', color='red')
    else:
        plt.plot([i for i in xrange(1, 6)], accuracy_at_k, marker='o')
    plt.xlabel('k (top k labels)')
    plt.ylim((0, 1.1))
    plt.ylabel('Accuracy @ k')
    plt.title('Prediction Accuracy for Uniformly Sampled Random Graphs')
    plt.show()
    # Confusion matrix: rows are the true generator, columns the prediction.
    sns.set()
    ax = plt.axes()
    sns.heatmap(confusion_matrix, ax=ax, cmap="YlGnBu", yticklabels=index,
                xticklabels=small_index)
    ax.set_title('Confusion Matrix for Uniformly Sampled Random Graphs')
    plt.tight_layout()
    plt.show()
def perform_edge_imputation():
    # For each edge-removal probability p: generate graphs with every
    # generator, remove edges with probability p, re-impute them, predict the
    # generator type, and save a confusion matrix and an accuracy@k plot.
    # Finally plots top-1 accuracy against p. (Python 2 code.)
    accuracy_at_removed = []
    remove_probability = [0.1 * i for i in xrange(10)]
    constraints = {'edge_count': (1000, 1100)}
    samples = 2
    index = [
        'Watts Strogatz', 'Geometric', 'Erdos Renyi', 'Barabasi Albert',
        'Planted Partition Model'
    ]
    constraints_enforced = False
    # Same order as `index`: position in this list is the "actual" class id.
    rgs = [
        structural_identities.watts_strogatz_generator,
        structural_identities.geometric_generator,
        structural_identities.erdos_renyi_generator,
        structural_identities.barabasi_albert_generator,
        structural_identities.planted_partition_generator
    ]
    for p in remove_probability:
        # Per-p accumulators.
        correct = 0.0
        accuracy_at_k = [0] * 5
        confusion_matrix = [[0 for i in xrange(5)] for j in xrange(5)]
        for uni, rg in enumerate(rgs):
            title = index[uni]
            actual = uni
            for i in xrange(samples):
                G = structural_identities.constrained_generation(
                    rg, constraints)
                # Work on a copy so the original G stays available to the
                # imputation step.
                new_G = deepcopy(G)
                new_G = remove_edges(new_G, p)
                new_G = impute_edge_algorithm(new_G, G)
                cluster, types = predict_structure(new_G, 1,
                                                   constraints_enforced)
                # The predicted type is the one with the smallest cluster value.
                predicted = cluster.index(min(cluster))
                print title, types[predicted]
                if actual == predicted:
                    correct += 1
                confusion_matrix[actual][predicted] += 1
                # Double argsort turns the cluster values into ranks
                # (0 = smallest value).
                array = np.array(cluster)
                order = array.argsort()
                ranks = order.argsort().tolist()
                # k = rank at which the true type appears; stays -1 if
                # `title` is not among `types`.
                k = -1
                for i in xrange(len(cluster)):  # 5 types of rg
                    if title == types[ranks.index(i)]:
                        k = i
                        break
                # Credit every cutoff j >= k.
                # NOTE(review): if k stays -1 the loop also touches index -1
                # (the last entry is incremented twice) -- confirm.
                j = len(cluster) - 1
                while j >= k:
                    accuracy_at_k[j] += 1
                    j -= 1
        small_index = ['WS', 'Geo', 'ER', 'BA', 'PPM']
        plt.figure(10)
        sns.set()
        ax = plt.axes()
        sns.heatmap(confusion_matrix, ax=ax, cmap="YlGnBu", yticklabels=index,
                    xticklabels=small_index)
        ax.set_title('Confusion Matrix for Edge Imputed Graphs (' +
                     str((p) * 100) + ' percent removed)')
        plt.tight_layout()
        # NOTE(review): output paths are hard-coded to one user's machine.
        plt.savefig(
            '/Users/Brennan/Desktop/Networks/networks-project/pictures/CM_' +
            str((p) * 100) + '_removed.png')
        plt.close()
        # Undo the seaborn styling before drawing the plain line plot by
        # resetting defaults and reloading the plotting modules.
        sns.reset_defaults()
        imp.reload(mpl)
        imp.reload(plt)
        imp.reload(sns)
        # import matplotlib as mpl
        # import matplotlib.pyplot as plt
        # Convert the counts into fractions of all generated samples.
        for i in xrange(len(accuracy_at_k)):
            accuracy_at_k[i] /= (samples * 1.0 * len(rgs))
        if constraints_enforced:
            plt.plot([i for i in xrange(1, 6)], accuracy_at_k, marker='o',
                     color='red')
        else:
            plt.plot([i for i in xrange(1, 6)], accuracy_at_k, marker='o')
        plt.xlabel('k (top k labels)')
        plt.ylim((0, 1.1))
        plt.ylabel('Accuracy @ k')
        plt.title('Prediction Accuracy for Edge Imputed Graphs (' +
                  str((p) * 100) + ' percent removed)')
        # NOTE(review): the figure is saved before tight_layout() is applied,
        # so the saved file does not get the tightened layout.
        plt.savefig(
            '/Users/Brennan/Desktop/Networks/networks-project/pictures/PA_' +
            str((p) * 100) + '_removed.png')
        plt.tight_layout()
        plt.close()
        # Top-1 accuracy for this removal probability.
        accuracy_at_removed.append(correct / (len(rgs) * samples))
    plt.plot(remove_probability, accuracy_at_removed, marker='o')
    plt.xlabel('Percent of Edges Removed')
    plt.ylim((0, 1.1))
    plt.ylabel('Accuracy @ 1')
    plt.title('Prediction Accuracy for Graph Recovery (Edge Imputation)')
    plt.savefig(
        '/Users/Brennan/Desktop/Networks/networks-project/pictures/graph_imputation_forall_p.png'
    )
    plt.clf()
plt.savefig(outfile)

d = pd.DataFrame(df, columns=feature_list)

# Compute the correlation matrix
corr = d.corr()

# Generate a mask for the upper triangle (diagonal included).
# ``np.bool`` was removed from NumPy; the builtin ``bool`` is the same dtype.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

fig, ax = plt.subplots()
# NOTE(review): the plotted matrix is the correlation matrix from
# ``DataFrame.corr()``, while the colorbar label and the file name below say
# "covariance" -- confirm which wording is intended.
sns.heatmap(corr, mask=mask, cmap='RdBu_r', center=0, square=True,
            xticklabels=feature_labels, yticklabels=feature_labels,
            linewidths=.5, cbar_kws={'label': 'Covariance'}, annot=True,
            ax=ax)
outfile = args.outdir + '/feature_covariance.png'
plt.savefig(outfile)

print('=' * 30)
print(name)
# test_predictions = pipeline.predict(X_test_std)
# test_acc = accuracy_score(y_test, test_predictions)
# print('Test accuracy: {:.4%}'.format(test_acc))
# train_predictions = pipeline.predict(X_train_std)
# train_acc = accuracy_score(y_train, train_predictions)
height=4, diag_kind='kde', plot_kws=dict(color=colors[8]), diag_kws=dict(shade=True, alpha=.7, color=colors[0])) plt.show() # - # Visualizzazione della matrice di correlazione. Alla posizione $(i,j)$ il coefficiente di correlazione (lineare) tra le feature $i$ e $j$. Valore in $[-1,1]$: $1$ correlazione perfetta, $-1$ correlazione inversa perfetta, $0$ assenza di correlazione cm = np.corrcoef(df[cols].values.T) plt.figure(figsize=(12, 4)) hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols, xticklabels=cols, cmap=cmap) plt.tight_layout() plt.show() # ### Regressione di MEDV rispetto a una sola feature print("Feature utilizzabili: {0}".format(', '.join(map(str, df.columns[:-1])))) mi = mutual_info_regression(df[df.columns[:-1]], df[df.columns[-1]]) dmi = pd.DataFrame(mi, index=df.columns[:-1], columns=['mi']).sort_values(by='mi', ascending=False) dmi.head(20)
serieB = serieB.append(serieMT.loc[i, -len(serieMT.transpose()) / 2:-1]) serieA = serieA.append(serieMT.loc[i, 0:len(serieMT.transpose())] / 2) return (serieB, serieA) meanTupla = DataFrameAandB(MeansS) meanBefore = meanTupla[0] meanAfter = meanTupla[1] ''' Function for normalization ''' def Normalization(dfA, dfB): for index, row in dfB.iterrows(): m = dfB.loc[index, :].mean() dfA.loc[index, :] = dfA.loc[index, :] / m dfB.loc[index, :] = dfB.loc[index, :] / m return (dfA, dfB) DataA, DataB = Normalization(meanAfter, meanBefore) Data = DataB.join(DataA) ''' HM ''' fig, ax = plt.subplots(figsize=(20, 5)) sns.heatmap(Data, annot=False, xticklabels=1, vmin=-15, vmax=15, cmap="BuPu")
corr = df.corr()
print(corr)

# Generate a mask for the upper triangle (diagonal included).
# ``np.bool`` was removed from NumPy; the builtin ``bool`` is the same dtype.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
fig, ax = plt.subplots()

# Draw the heatmap with the mask and correct aspect ratio.
# The colour scale is symmetric around 0, sized by the largest absolute
# correlation among the visible (unmasked) cells.
vmax = np.abs(corr.values[~mask]).max()
sns.heatmap(corr, mask=mask, cmap=plt.cm.PuOr, vmin=-vmax, vmax=vmax,
            square=True, linecolor="lightgray", linewidths=1, ax=ax)

# The masked half is used for text: the column name on the diagonal and the
# numeric correlation in each upper-triangle cell.
# NOTE(review): the y positions ``len(corr) - (i + 0.5)`` assume the first row
# is drawn at the top of a non-inverted y-axis -- confirm with the seaborn
# version in use.
for i in range(len(corr)):
    ax.text(i + 0.5, len(corr) - (i + 0.5), corr.columns[i],
            ha="center", va="center", rotation=45)
    for j in range(i + 1, len(corr)):
        s = "{:.3f}".format(corr.values[i, j])
        ax.text(j + 0.5, len(corr) - (i + 0.5), s,
                ha="center", va="center")
ax.axis("off")
for index, row in dfB.iterrows(): m = dfB.loc[index, :].mean() dfA.loc[index, :] = dfA.loc[index, :] / m dfB.loc[index, :] = dfB.loc[index, :] / m return (dfA, dfB) DataA, DataB = Normalization(meanAfter, meanBefore) Data = DataB.join(DataA) ''' Kmeans and HM ''' km = KMeans(n_clusters=2, init='k-means++', n_init=20) km.fit(Data) x = km.fit_predict(Data) Data['Cluster'] = x Data = Data.sort_values(by=['Cluster']) df2 = Data.drop('Cluster', 1) fig, ax = plt.subplots(figsize=(20, 5)) #ax.vlines([20],0,1, transform=ax.get_xaxis_transform(), colors='k') df2 = df2.drop('Mean(34)', 0) df2 = df2.drop('Mean(20)', 0) df2 = df2.drop('Mean(11)', 0) df2 = df2.drop('Mean(2)', 0) df2 = df2.drop('Mean(10)', 0) df2 = df2.drop('Mean(29)', 0) df2 = df2.drop('Mean(32)', 0) df2 = df2.drop('Mean(36)', 0) df2 = df2.drop('Mean(17)', 0) sns.heatmap(df2, annot=False, xticklabels=1, vmin=-10, vmax=10, cmap="BuPu")
def run_predict_structure(generator=None, title=None):
    # Measures how well predict_structure recovers the generator of randomly
    # generated graphs. With both `generator` and `title` given, only that
    # generator is sampled and an accuracy@k curve is shown. With no
    # generator, all five generators are sampled and a confusion matrix is
    # shown as well. (Python 2 code: print statement, xrange.)
    # NOTE(review): a generator passed without a title matches neither
    # branch, so nothing is run -- confirm this is intended.
    constraints = {'edge_count': (1000, 1100)}
    # accuracy_at_k[j] counts samples whose true type is ranked at position
    # <= j; it is turned into a fraction before plotting.
    accuracy_at_k = [0] * 5
    if generator != None and title != None:
        samples = 100
        for sample in xrange(samples):
            G = structural_identities.constrained_generation(
                generator, constraints)
            cluster, types = predict_structure(G, trials=20)
            # The predicted type is the one with the smallest cluster value.
            print sample, types[cluster.index(min(cluster))]
            # Double argsort turns the cluster values into ranks
            # (0 = smallest value).
            array = np.array(cluster)
            order = array.argsort()
            ranks = order.argsort().tolist()
            # k = rank at which the true type appears; stays -1 if `title`
            # is not among `types`.
            k = -1
            for i in xrange(len(cluster)):  # 5 types of rg
                if title == types[ranks.index(i)]:
                    k = i
                    break
            # Credit every cutoff j >= k.
            j = len(cluster) - 1
            while j >= k:
                accuracy_at_k[j] += 1
                j -= 1
        plt.figure(1)
        for i in xrange(len(accuracy_at_k)):
            accuracy_at_k[i] /= (samples * 1.0)
        plt.plot([i for i in xrange(1, 6)], accuracy_at_k, marker='o')
        plt.xlabel('k (top k labels)')
        plt.ylim((0, 1.1))
        plt.ylabel('Accuracy @ k')
        plt.title('Prediction Accuracy for ' + title + ' Random Graphs')
        plt.show()
    # Uniformly sample across rg
    elif generator == None:
        # confusion_matrix[actual][predicted]
        confusion_matrix = [[0 for i in xrange(5)] for j in xrange(5)]
        samples = 100
        index = [
            'Watts Strogatz', 'Geometric', 'Erdos Renyi', 'Barabasi Albert',
            'Planted Partition Model'
        ]
        constraints_enforced = True
        # Same order as `index`: position in this list is the "actual" class id.
        rgs = [
            structural_identities.watts_strogatz_generator,
            structural_identities.geometric_generator,
            structural_identities.erdos_renyi_generator,
            structural_identities.barabasi_albert_generator,
            structural_identities.planted_partition_generator
        ]
        for j, rg in enumerate(rgs):
            # `title` and `actual` are captured here because `j` is reused as
            # a counter further down inside the sample loop.
            title = index[j]
            actual = j
            for i in xrange(samples):
                G = structural_identities.constrained_generation(
                    rg, constraints)
                cluster, types = predict_structure(G, 5, constraints_enforced)
                predicted = cluster.index(min(cluster))
                print title, types[predicted]
                confusion_matrix[actual][predicted] += 1
                array = np.array(cluster)
                order = array.argsort()
                ranks = order.argsort().tolist()
                k = -1
                for i in xrange(len(cluster)):  # 5 types of rg
                    if title == types[ranks.index(i)]:
                        k = i
                        break
                j = len(cluster) - 1
                while j >= k:
                    accuracy_at_k[j] += 1
                    j -= 1
        small_index = ['WS', 'Geo', 'ER', 'BA', 'PPM']
        # Convert the counts into fractions of all generated samples.
        for i in xrange(len(accuracy_at_k)):
            accuracy_at_k[i] /= (samples * 1.0 * len(rgs))
        print accuracy_at_k
        if constraints_enforced:
            plt.plot([i for i in xrange(1, 6)], accuracy_at_k, marker='o',
                     color='red')
        else:
            plt.plot([i for i in xrange(1, 6)], accuracy_at_k, marker='o')
        plt.xlabel('k (top k labels)')
        plt.ylim((0, 1.1))
        plt.ylabel('Accuracy @ k')
        plt.title('Prediction Accuracy for Uniformly Sampled Random Graphs')
        plt.show()
        # Confusion matrix: rows are the true generator, columns the prediction.
        sns.set()
        ax = plt.axes()
        sns.heatmap(confusion_matrix, ax=ax, cmap="YlGnBu", yticklabels=index,
                    xticklabels=small_index)
        ax.set_title('Confusion Matrix for Uniformly Sampled Random Graphs')
        plt.tight_layout()
        plt.show()
# Remove rows that are excluded from the clustering. ``axis`` is passed by
# keyword: pandas 2 no longer accepts it positionally in ``drop``.
Data = Data.drop('Mean(8)', axis=0)
Data = Data.drop('Mean(28)', axis=0)
Data = Data.drop('Mean(2)', axis=0)
'''
Kmeans and HM
'''
km = KMeans(n_clusters=3, init='k-means++', n_init=20)
# fit_predict fits the model itself, so the separate km.fit(Data) call that
# used to precede it only repeated the work.
x = km.fit_predict(Data)
Data['Cluster'] = x

# Order the rows by cluster label so members of a cluster are adjacent in the
# heatmap, then drop the helper column again.
Data = Data.sort_values(by=['Cluster'])
df2 = Data.drop('Cluster', axis=1)
fig, ax = plt.subplots(figsize=(20, 10))
#ax.vlines([50],0,1, transform=ax.get_xaxis_transform(), colors='k')
ax.hlines([30], 0, 1, transform=ax.get_yaxis_transform(), colors='r')
#ax.hlines([34],0,1, transform=ax.get_yaxis_transform(), colors='k')
#df2 = df2.drop ('Mean(11)',0)
#df2 = df2.drop ('Mean(2)', 0)
#df2 = df2.drop ('Mean(10)',0)
#df2 = df2.drop ('Mean(29)',0)
#df2 = df2.drop ('Mean(32)',0)
#df2 = df2.drop ('Mean(36)',0)
#df2 = df2.drop ('Mean(17)',0)
sns.heatmap(
    df2,
    annot=False,
    xticklabels=1,
    yticklabels=1,
    cmap="BuPu",
)
# normalize confusion matrices normalizedAvgCM = np.zeros((numClasses, numClasses)) for i in range(len(confusionMatrices)): cm = confusionMatrices[i] normalizedAvgCM += cm / cm.astype(np.float).sum(axis=1) normalizedAvgCM = normalizedAvgCM / nfold # plot one time prediction confusion matrix df_cm = pd.DataFrame(normalizedAvgCM, index=classNames, columns=classNames) plt.figure(figsize=(9.6, 4.1)) # 5.7 sns.set(font_scale=1.4) # for label size ax = sns.heatmap( df_cm, cbar_kws={'ticks': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}, vmin=0, vmax=1.0, annot=True, annot_kws={"size": 18}, fmt='2.2f', cmap="Blues") # font size bottom, top = ax.get_ylim() ax.set_ylim(bottom + 0.5, top - 0.5) ax.set_ylim(sorted(ax.get_xlim(), reverse=True)) ax.set_yticklabels(classNames, rotation=0, fontsize="16", va="center") ax.set_xticklabels(classNames, rotation=0, fontsize="16", ha="center") plt.tight_layout() plt.savefig('FIGURES/normCM_cnn_10fold_4_classes_old.pdf') with plt.style.context("default"): plt.figure(figsize=(10, 5)) for iplt in range(2):
# Remove rows that are excluded from the clustering. ``axis`` is passed by
# keyword: pandas 2 no longer accepts it positionally in ``drop``.
d = d.drop('Mean(34)', axis=0)
d = d.drop('Mean(35)', axis=0)
d = d.drop('Mean(23)', axis=0)
d = d.drop('Mean(1)', axis=0)
'''
Kmeans and HM
'''
km = KMeans(n_clusters=6, init='k-means++', n_init=100)
# fit_predict fits the model itself, so the separate km.fit(d) call that used
# to precede it only repeated the (expensive, n_init=100) work.
x = km.fit_predict(d)
d['Cluster'] = x

# Order the rows by cluster label so members of a cluster are adjacent in the
# heatmap, then drop the helper column again.
d = d.sort_values(by=['Cluster'])
t = d.drop('Cluster', axis=1)
fig, ax = plt.subplots(figsize=(25, 10))
# vertical marker at column position a + 0.5
ax.vlines([a + 0.5], 0, 1, transform=ax.get_xaxis_transform(), colors='k')
#ax.hlines([3],1,0, transform=ax.get_yaxis_transform(), colors='k')
#ax.hlines([10],1,0, transform=ax.get_yaxis_transform(), colors='k')
#ax.hlines([12],1,0, transform=ax.get_yaxis_transform(), colors='k')
#ax.hlines([21],1,0, transform=ax.get_yaxis_transform(), colors='k')
#ax.hlines([25],1,0, transform=ax.get_yaxis_transform(), colors='k')
##ax.hlines([22],1,0, transform=ax.get_yaxis_transform(), colors='k')
##ax.hlines([15],1,0, transform=ax.get_yaxis_transform(), colors='k')
##ax.hlines([20],1,0, transform=ax.get_yaxis_transform(), colors='k')
##ax.hlines([21],1,0, transform=ax.get_yaxis_transform(), colors='k')
##ax.hlines([24],1,0, transform=ax.get_yaxis_transform(), colors='k')
sns.heatmap(t, annot=False, xticklabels=1, yticklabels=1, cmap="YlGnBu")
# fare and Survived
fig.add_subplot(3, 3, 9)
sns.violinplot(x="Survived", y="Fare", alpha=.7, data=df, saturation=.7)
plt.show()

# In[425]:

# Correlation matrix of every column except the passenger id.
cm = df.drop(['PassengerId'], axis=1).corr()
# Hide the upper triangle (diagonal included); the matrix is symmetric.
# ``np.bool`` was removed from NumPy; the builtin ``bool`` is the same dtype.
mask = np.zeros_like(cm, dtype=bool)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(16, 8))
hm = sns.heatmap(cm,
                 mask=mask,
                 annot=True,
                 fmt='.2f',
                 cmap=sns.diverging_palette(220, 10, as_cmap=True),
                 cbar_kws={"shrink": .5})
plt.tight_layout()
plt.show()

# ### Let us evaluate some conditional probabilities of surviving

# In[353]:


def cond_prob(feature):
    r = df[df[feature].notnull()][feature].unique()
    p = []
    for val in r:
        joint = df[(df[feature] == val) & (df['Survived'] == 1)].shape[0]
def occurrence_count(keywords):
    """Aggregate keyword occurrences per 30 minutes and plot them week by week.

    Reads ``final_count.csv``, removes duplicate tweets, converts the
    millisecond ``date`` column to datetimes, drops the identifying columns
    and sums the remaining columns in 30-minute bins. For every calendar
    week the columns with a standard deviation above 0.1 are kept, a ``match``
    flag column is added (1 inside the hard-coded windows of weeks 1 and 2),
    the week is written to ``<first timestamp>.csv``, plotted (line plot,
    heatmap, correlation heatmap) and handed to ``model_train``. After each
    week the function waits for ENTER. Finally the full 30-minute table is
    written to ``occurrences.csv``.

    :param keywords: currently unused
    :return: None
    """
    # Remove duplicates. drop_duplicates returns a new frame; the result was
    # previously discarded, so no duplicate was ever removed.
    final_count_df = pd.read_csv('final_count.csv')
    final_count_df = final_count_df.drop_duplicates(subset=['tweetID'],
                                                    keep='first')
    del final_count_df['i']
    ocurrences_df = final_count_df.copy()

    # 'date' holds epoch milliseconds; convert to local-time datetimes.
    for index, row in ocurrences_df.iterrows():
        ocurrences_df.loc[index, "date"] = datetime.fromtimestamp(
            float(row['date']) / 1000.0)
    del ocurrences_df['tweetID']
    del ocurrences_df['lat']
    del ocurrences_df['lon']
    del ocurrences_df['userID']

    ocurrences_df.index = pd.DatetimeIndex(ocurrences_df['date'].values)
    dfrs = ocurrences_df.resample('30min').apply(sum)
    dfrs.fillna(0, inplace=True)
    print(dfrs.head())

    # pd.TimeGrouper no longer exists; pd.Grouper(freq=...) is its replacement.
    weeks = [g for n, g in dfrs.groupby(pd.Grouper(freq='W'))]

    # Row windows (positional) flagged as a match, keyed by week number.
    match_windows = {1: slice(329, 334), 2: slice(136, 140)}

    w = 0
    for week_data in weeks:
        # Copy so that adding/flagging 'match' works on an independent frame
        # instead of a view of week_data.
        high_var_df = week_data.loc[:, week_data.std() > .1].copy()
        high_var_df['match'] = 0
        if w in match_windows:
            # Single positional assignment; the former chained
            # ``df[a:b]['match'] = 1`` wrote to a temporary copy.
            match_col = high_var_df.columns.get_loc('match')
            high_var_df.iloc[match_windows[w], match_col] = 1

        high_var_df.to_csv(str(high_var_df.index[0]) + '.csv')

        plt.figure()
        # DataFrame.iteritems was removed in pandas 2; items() is equivalent.
        for name, series in high_var_df.items():
            plt.plot(series, high_var_df.index)

        plt.figure()
        ax1 = sns.heatmap(high_var_df)
        plt.setp(ax1.xaxis.get_majorticklabels(), rotation=90)
        plt.tight_layout()

        plt.figure()
        ax2 = sns.heatmap(high_var_df.corr())
        plt.setp(ax2.xaxis.get_majorticklabels(), rotation=90)
        plt.tight_layout()

        model_train(high_var_df, w, 'linear_regression')
        w += 1

        # NOTE(review): the source had lost its indentation; the pause/prompt
        # is assumed to run once per week -- confirm.
        plt.pause(0.001)
        input('ENTER')
        plt.close()

    dfrs.to_csv('occurrences.csv', index=True)
plt.ylabel("frequency") # In[15]: data_mean.corr() # The most correlated features are texture:radius and perimeter:fractal dimension. It is not good to have correlated features because they can be redundant and slow down the program. They also can increase bias. Based on this knowledge, several features which are highly correlated may be removed. For this project, I will keep only the mean values. # In[50]: # Create a heat map of the correlations between the mean values. Red means highly correlated and blue is uncorrelated. plt.figure(figsize=(10,10)) sns.heatmap(data_mean.corr(),annot=True,square=True,cmap='coolwarm') # In[53]: # This makes a new datafram which removes the column that tells whether it is malignant or benign. y = data.type data_p=data.drop(columns="type") #create a new data array data_p.head() # In[57]: data_pair = data_mean.drop(columns=["ID"])