def display(data, color, names):
    """Show a scatter-plot matrix of the first ten features named in ``names``.

    Parameters
    ----------
    data
        Container indexed by feature name (``data[name]`` supplies the values
        for one matrix dimension).
    color
        Passed straight through as the marker colour.
    names
        Sequence of feature names; exactly the entries at positions 0-9 are
        plotted.

    Any exception raised while the figure is being assembled (for example
    because ``names`` holds fewer than ten entries) is printed and the process
    exits with status 1. On success the figure is shown and nothing is
    returned.
    """
    try:
        # Deliberately a fixed count of ten: a shorter ``names`` must fail
        # here so that it is reported by the handler below.
        splom_dims = [
            dict(label=names[idx], values=data[names[idx]])
            for idx in range(10)
        ]
        trace = go.Splom(
            dimensions=splom_dims,
            marker=dict(color=color, size=5),
        )
        fig = go.Figure(data=trace)
        fig.update_layout(
            title='Features scatter plot matrix',
            template='seaborn',
            title_font_size=20,
            height=1500,
        )
    except Exception as err:
        print("Error: " + str(err))
        sys.exit(1)
    fig.show()
def scatter_matrix():
    """Build a state-level scatter-plot matrix of COVID-19 indicators.

    Reads four frames from the module-level ``df_dict``
    (``'covid-us-state'``, ``'state-population'``, ``'state-area'`` and
    ``'mask-use-by-county'``), joins them on ``state`` and plots four derived
    columns against each other:

    * ``CFR`` - ``deaths / cases`` (fatality rate)
    * ``IR``  - ``cases / total`` (infection rate)
    * ``PD``  - ``total / area`` (population density)
    * ``WMP`` - per-state mean of the county mask-wearing score

    Side effects: as before, the ``fips`` column of the case frame and the
    ``countyfp``, ``wear_mask_prob``, ``state_code`` and ``county`` columns of
    the mask-use frame are written back onto the shared frames in ``df_dict``.

    Returns
    -------
    plotly.graph_objects.Figure
        Lower-triangle scatter matrix with the state name as hover text.
    """
    # --- case counts: keep only the most recent snapshot -------------------
    df = df_dict['covid-us-state']
    df.fips = df.fips.apply(lambda x: str(x).zfill(2))
    df = df[df.date == df.date.max()]
    df = df.drop(columns='date').reset_index(drop=True)

    state_pop = df_dict['state-population']
    state_area = df_dict['state-area']

    # --- mask usage: county survey shares -> one score per state -----------
    mask_use = df_dict['mask-use-by-county']
    mask_use.countyfp = mask_use.countyfp.apply(lambda x: str(x).zfill(5))
    mask_use['wear_mask_prob'] = (
        0.25 * mask_use['rarely']
        + 0.5 * mask_use['sometimes']
        + 0.75 * mask_use['frequently']
        + 1.0 * mask_use['always']
    )
    mask_use['state_code'] = mask_use['countyfp'].apply(fip_to_state)
    mask_use['county'] = mask_use['countyfp'].apply(fip_to_county)

    # Average only the score column. Aggregating the whole frame would also
    # try to take the mean of the string columns (countyfp, county), which
    # raises TypeError on pandas >= 2.0.
    df_agg = mask_use.groupby('state_code', as_index=False)['wear_mask_prob'].mean()
    # Filter instead of an in-place drop so we never mutate a slice.
    df_agg = df_agg[~df_agg['state_code'].isin(['N/A', 'DC'])].copy()
    df_agg['state'] = df_agg['state_code'].apply(lambda code: state_map_dict[code])
    df_agg = df_agg[['state', 'wear_mask_prob']]

    # --- join everything on the state name ----------------------------------
    data_frames = [df, state_pop, state_area, df_agg]
    df_merged = reduce(
        lambda left, right: pd.merge(left, right, on=['state'], how='inner'),
        data_frames,
    )

    df_merged['CFR'] = df_merged['deaths'] / df_merged['cases']
    df_merged['IR'] = df_merged['cases'] / df_merged['total']
    df_merged['PD'] = df_merged['total'] / df_merged['area']
    df_merged['WMP'] = df_merged['wear_mask_prob']

    metrics = ['CFR', 'IR', 'PD', 'WMP']
    df_ana = df_merged[['state'] + metrics].copy()
    df_ana[metrics] = df_ana[metrics].round(3)

    fig = go.Figure(data=go.Splom(
        dimensions=[
            dict(label='CFR', values=df_ana['CFR']),  # fatality rate
            dict(label='IR', values=df_ana['IR']),    # infection rate
            dict(label='PD', values=df_ana['PD']),    # population density
            dict(label='WMP', values=df_ana['WMP']),  # wear-mask probability
        ],
        text=df_ana['state'],
        marker=dict(showscale=False, line_color='white', line_width=0.5),
        showupperhalf=False,
    ))

    fig.update_layout(
        title='Scatter Matrix',
        dragmode='select',
        width=600,
        height=600,
        hovermode='closest',
    )

    return fig
def scatter_matrix(df):
    """Plot a scatter matrix of the numeric columns, coloured by ``Churn``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Churn``, ``tenure``, ``MonthlyCharges``
        and ``TotalCharges``.

    The frame is sorted by ``Churn``; every distinct ``Churn`` value gets an
    integer colour code in order of first appearance and is also used as the
    hover text of its point. The figure is displayed through ``py.iplot``;
    nothing is returned.
    """
    df = df.sort_values(by="Churn", ascending=True)
    churn = df["Churn"]

    # One colour code per class, however many classes there are.
    classes = churn.unique().tolist()
    class_code = {label: code for code, label in enumerate(classes)}
    color_vals = [class_code[label] for label in churn]

    # Hover text must follow the *sorted* row order. Looking rows up by index
    # label (0..n-1) would return them in the pre-sort order and attach the
    # wrong class to each point.
    text = churn.tolist()

    trace = go.Splom(
        dimensions=[
            dict(label="tenure", values=df["tenure"]),
            dict(label="MonthlyCharges", values=df["MonthlyCharges"]),
            dict(label="TotalCharges", values=df["TotalCharges"]),
        ],
        text=text,
        marker=dict(
            color=color_vals,
            colorscale="Viridis",
            size=3,
            showscale=False,
            line=dict(width=0.1, color="rgb(230,230,230)"),
        ),
    )

    # Same axis styling for each of the three x/y axis pairs.
    axis = dict(showline=True, zeroline=False, gridcolor="#fff", ticklen=4)
    axes = {
        "{}axis{}".format(direction, number): dict(axis)
        for number in (1, 2, 3)
        for direction in ("x", "y")
    }

    layout = go.Layout(
        title="Scatter plot matrix for Numerical columns for customer attrition",
        autosize=False,
        height=800,
        width=800,
        dragmode="select",
        hovermode="closest",
        plot_bgcolor="rgba(240,240,240, 0.95)",
        **axes
    )

    fig = go.Figure(data=[trace], layout=layout)
    py.iplot(fig)
def mp(selected_patient, selector, values):
    """Return a figure holding one scatter-matrix trace per ``multiplot_soz`` item.

    The three arguments are forwarded unchanged to ``multiplot_soz``. Each
    item it yields is indexed by key: the entry under key ``0`` becomes the
    trace name and every other key becomes one matrix dimension (label = key,
    values = ``item[key]``).
    """
    def _to_trace(entry):
        # Key 0 is reserved for the trace name, so it is not a dimension.
        dims = [dict(label=key, values=entry[key]) for key in entry if key != 0]
        return go.Splom(
            dimensions=dims,
            name=entry[0],
            marker=dict(size=4),
            diagonal=dict(visible=False),
        )

    traces = [
        _to_trace(entry)
        for entry in multiplot_soz(selected_patient, selector, values)
    ]
    figure_layout = go.Layout(
        title="Multiplot prove",
        dragmode='select',
        hovermode='closest',
        showlegend=True,
    )
    return go.Figure(data=traces, layout=figure_layout)
def pairwise_plot(df: pd.DataFrame, cols: list = None):
    """Return a pair plot (scatter matrix) for the features listed in ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    cols : list, optional
        Column names to plot. ``None`` (the default) plots every column of
        ``df``; a single column name given as a string is accepted as well.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure titled "Pairwise Plot" with one dimension per selected column.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if cols is None:
        cols = list(df.columns)
    elif isinstance(cols, str):
        # Iterating a bare string would yield single characters.
        cols = [cols]
    else:
        cols = list(cols)

    # Select with the flat list; wrapping it in another list is not a valid
    # column selection.
    df = df[cols]

    fig = go.Figure(
        data=go.Splom(
            dimensions=[{'label': name, 'values': df[name]} for name in cols],
        ),
        layout=go.Layout(
            title={
                'text': 'Pairwise Plot',
                'x': 0.5,
                'xanchor': 'center',
                'yanchor': 'top',
            },
        ),
    )
    return fig
def UnderSample(df, _class, method = 'cc', strategy = 'auto', n_jobs = 1, ratio = None, transform = None, offline = None):
    """
    Under-sample ``df`` with the chosen sampler and show the result as a
    scatter matrix.

    Supported ``method`` values (case-insensitive):

    nearmiss  - NearMiss: select values which are closest to the minority class.
    tomelinks - TomekLinks (``'tomeklinks'`` is accepted too): uses connected
                sets between class borders which are closest; points with no
                closer neighbour are assumed to be noise or borderline and
                are removed.
    ncl       - NeighbourhoodCleaningRule: uses edited nearest neighbours to
                remove majority samples; neighbours that are all correctly
                labelled are kept.
    cc        - ClusterCentroids: clusters the majority samples with K-means
                and keeps the cluster centroids as the new majority sample.

    Parameters
    ----------
    df : pandas.DataFrame
        Features plus the class column.
    _class : str
        Name of the class (target) column in ``df``.
    strategy
        Passed to the sampler as ``sampling_strategy``.
    n_jobs : int
        Passed to the sampler when its constructor has an ``n_jobs`` parameter.
    ratio
        Legacy spelling of the sampling strategy; when not ``None`` it takes
        precedence over ``strategy``.
        NOTE(review): assumes ``ratio`` was only ever meant as the older name
        of ``sampling_strategy`` - confirm against callers.
    transform
        If truthy, the resampled frame is returned; otherwise ``None`` is.
    offline
        Unused; kept for interface compatibility.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    #https://towardsdatascience.com/sampling-techniques-for-extremely-imbalanced-data-part-i-under-sampling-a8dbc3d8d6d8
    import inspect

    Y = df[_class]
    X = df.drop(_class, axis = 1)

    method_key = method.lower()
    if method_key == 'nearmiss':
        sampler_cls = NearMiss
    elif method_key in ('tomelinks', 'tomeklinks'):
        sampler_cls = TomekLinks
    elif method_key == 'ncl':
        sampler_cls = NeighbourhoodCleaningRule
    elif method_key == 'cc':
        sampler_cls = ClusterCentroids
    else:
        raise ValueError("{} is not a valid method for UnderSampling".format(method))

    # The keyword used to be misspelled, so no sampler could be constructed.
    # Only forward the options this sampler's constructor actually declares,
    # because not every sampler takes ``n_jobs``.
    requested = {
        'sampling_strategy': strategy if ratio is None else ratio,
        'n_jobs': n_jobs,
    }
    accepted = inspect.signature(sampler_cls).parameters
    sampler = sampler_cls(**{name: value for name, value in requested.items() if name in accepted})
    x, y = sampler.fit_resample(X, Y)

    # Rebuild a frame with the feature columns followed by the class column.
    # ``to_numpy`` drops any index on ``y`` so rows pair up by position.
    df = pd.DataFrame(x, columns = list(X.columns))
    df[_class] = pd.Series(y).to_numpy()

    fig = go.Figure()
    fig.add_trace(
        go.Splom(
            dimensions = [dict(label = column, values = df[column]) for column in df.columns],
            marker = dict(color = df[_class])
        )
    )
    fig.show()

    if transform:
        return df
    return
def scatter_go(df,dimensions=['SEXO','EDAD'],for_text='MUNICIPIO'):
    """Return a 600x600 scatter matrix of ``dimensions`` coloured by ``for_text``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain every name in ``dimensions`` and the
        ``for_text`` column.
    dimensions : list of str
        Columns to plot against each other.
    for_text : str
        Column used both as hover text and, through its category codes, as
        the marker colour.
    """
    labels = df[for_text]
    # Integer code per distinct label -> one colour per category.
    colour_codes = labels.astype('category').cat.codes

    splom_dims = []
    for column_name in dimensions:
        splom_dims.append(dict(label=column_name, values=df[column_name]))

    trace = go.Splom(
        dimensions=splom_dims,
        text=labels,
        marker=dict(
            color=colour_codes,
            showscale=False,  # colors encode categorical variables
            line_color='white',
            line_width=0.5,
        ),
    )

    fig = go.Figure(trace)
    fig.update_layout(
        title='Iris Data set',
        dragmode='select',
        width=600,
        height=600,
        hovermode='closest',
    )
    return fig
def scatter_matrix_plotly(data, columns):
    """
    Return an upper-triangle scatter matrix of ``columns`` coloured by epoch.

    ``data`` is anything ``pandas.DataFrame`` accepts and must provide an
    ``epoch`` field; ``epoch`` is used as hover text and, via its category
    codes, as the marker colour.

    Examples
    --------
    >>> columns = ['a', 'b', 'c']
    >>> data = [
    ...     dict(a=1, b=2, c=3, epoch=1),
    ...     dict(a=2, b=1, c=1, epoch=2),
    ...     dict(a=3, b=3, c=2, epoch=3),
    ... ]
    >>> chart = scatter_matrix_plotly(data, columns)
    """
    # Looks ugly
    import plotly.graph_objects as go
    import pandas as pd

    frame = pd.DataFrame(data)
    epoch = frame['epoch']
    epoch_codes = epoch.astype('category').cat.codes

    splom_dims = [dict(label=name, values=frame[name]) for name in columns]
    marker_style = dict(
        color=epoch_codes,
        showscale=False,
        line_color='white',
        line_width=0.5,
    )
    trace = go.Splom(
        showlowerhalf=False,
        diagonal_visible=False,
        text=epoch,
        dimensions=splom_dims,
        marker=marker_style,
    )

    fig = go.Figure(data=trace)
    fig.update_layout(template='plotly_dark')
    fig.update_layout(showlegend=True, width=600, height=600)
    return fig
def cluster(df: pd.DataFrame, k_min=2, k_max=10, multivariate=True) -> dict:
    """Run K-means for every cluster count in ``[k_min, k_max]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; every column is used as a clustering feature.
    k_min, k_max : int
        Smallest and largest number of clusters to try (both inclusive).
    multivariate : bool
        ``True`` clusters on all columns at once. The pairwise mode is not
        implemented yet; ``False`` yields an empty dict.

    Returns
    -------
    dict
        Maps each ``k`` to a dict with the keys ``'centroids'``, ``'labels'``,
        ``'inertia'``, ``'df'`` (input plus a ``cluster`` column) and
        ``'figure'`` (lower-triangle scatter matrix coloured by cluster).
    """
    results = {}
    if not multivariate:
        # Pairwise clustering is still to be developed.
        return results

    for k in range(k_min, k_max + 1):
        # NOTE(review): scikit-learn 1.1 renamed algorithm 'full' to 'lloyd'
        # and later dropped the old name - confirm the pinned version before
        # upgrading.
        cl = KMeans(n_clusters=k, random_state=44, algorithm='full',
                    n_init=5, init='k-means++').fit(df)

        # Attach labels by position. Concatenating a fresh Series would align
        # on index labels and produce NaN rows whenever ``df`` does not carry
        # a default 0..n-1 index.
        out_df = df.assign(cluster=cl.labels_)
        feature_labels = [col for col in out_df.columns if col != 'cluster']

        figure = go.Figure(go.Splom(
            dimensions=[{'label': lab, 'values': out_df[lab]} for lab in feature_labels],
            showupperhalf=False,
            marker={'color': out_df['cluster'],
                    'showscale': False,
                    'colorscale': 'inferno'},
            opacity=.8,
            diagonal_visible=False,
        ))

        results[k] = {
            'centroids': cl.cluster_centers_,
            'labels': cl.labels_,
            'inertia': cl.inertia_,
            'df': out_df,
            'figure': figure,
        }
    return results
def chart_pairs(df, title="Time Series Pairs Plot", **kwargs):
    """
    Pairwise scatter matrix plot for timeseries

    Parameters
    ----------
    df : DataFrame
        pandas DataFrame with a datetime index and columns representing the
        futures contract, ordered by most recent expiry
    title : str, optional
        Chart title, by default "Time Series Pairs Plot"
    **kwargs
        keyword arguments to pass to plotly.graph_objects.Figure.update_layout
        function

    Returns
    -------
    plotly.graph_objects.Figure
        1000x1000 lower-triangle scatter matrix. The former index is included
        as the first dimension and, converted to integers, drives the marker
        colour.
    """
    # reset_index() returns a new frame with the former index as its first
    # column. Taking the name from there also works when the index is unnamed.
    df = df.reset_index()
    dt_col = df.columns[0]

    dims = [dict(label=col, values=df[col]) for col in df.columns]

    fig = go.Figure()
    fig.add_trace(
        go.Splom(
            dimensions=dims,
            showupperhalf=False,
            # Explicit 64-bit width: plain ``int`` is platform-dependent.
            marker=dict(color=df[dt_col].astype("int64"), colorscale="Portland"),
            diagonal_visible=False,
        ))
    fig.update_layout(width=1000, height=1000, title=title)
    if kwargs:
        fig.update_layout(**kwargs)
    return fig
def display_selected_data(selectedAreaMap, selectedAreaDropdown, selectedAttr):
    """Return a 3x3 scatter matrix of ``selectedAttr`` for the selected areas.

    Parameters
    ----------
    selectedAreaMap
        ``None`` or a selection payload with a ``"points"`` list. The third
        ``<br>``-separated piece of each point's ``"text"`` is matched against
        the ``geoid`` column of the module-level ``df`` to filter rows.
    selectedAreaDropdown
        Not used by this function.
    selectedAttr
        Attribute (column) names; the first three become the matrix
        dimensions.

    For every pair of attributes the Pearson coefficient and p-value are
    computed. When every pair has at least two rows, the first three results
    are written as annotations into the off-diagonal cells; otherwise no
    annotations are added.
    """
    df_selected = df
    font_ann = dict(size=10, color=colors['text'])

    if selectedAreaMap is not None:
        area_names = [
            str(point["text"].split("<br>")[2])
            for point in selectedAreaMap["points"]
        ]
        df_selected = df_selected[df_selected['geoid'].isin(area_names)]

    index_vals = df_selected['boro_name'].astype('category').cat.codes

    # find pearson coeff and p_value for each pair of attributes
    coef_list = []
    flag = True
    for first, second in combinations(selectedAttr, 2):
        if len(df_selected[first]) < 2 or len(df_selected[second]) < 2:
            flag = False
            continue
        coef_list.append(pearsonr(df_selected[first], df_selected[second]))

    ann = []
    if flag:
        # (xref, yref, index into coef_list): each coefficient is written into
        # both mirrored cells of its attribute pair.
        cells = (
            ("x2", "y1", 0), ("x1", "y2", 0),
            ("x3", "y1", 1), ("x1", "y3", 1),
            ("x3", "y2", 2), ("x2", "y3", 2),
        )
        for xref, yref, pair_idx in cells:
            ann.append(dict(
                x=1,
                y=1,
                xref=xref,
                yref=yref,
                font=font_ann,
                text="PCC: " + str(round(coef_list[pair_idx][0], 2)) + "<br>p: " +
                     ('{:0.1e}'.format(coef_list[pair_idx][1])),
                showarrow=False,
            ))

    axisd = dict(showline=True, zeroline=False, gridcolor='#cecece', showticklabels=True)
    axis_settings = {}
    for number in (1, 2, 3, 4):
        axis_settings['xaxis' + str(number)] = dict(axisd)
        axis_settings['yaxis' + str(number)] = dict(axisd)

    # here we build a scatter matrix, and add annotations for each subgraph
    layout = go.Layout(
        dragmode='select',
        margin=dict(l=0, r=0, b=0, t=0, pad=0),
        autosize=False,
        hovermode='closest',
        font=dict(color=colors['text2'], size=12),
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background'],
        annotations=ann,
        **axis_settings
    )

    splom_dims = [
        dict(label=selectedAttr[pos], values=df_selected[selectedAttr[pos]])
        for pos in range(3)
    ]
    trace = go.Splom(
        dimensions=splom_dims,
        text=df_selected['boro_name'] + ', ' + df_selected['ntaname'],
        hoverinfo="x+y+text",
        marker=dict(
            color=index_vals,
            showscale=False,  # colors encode categorical variables
            line_color='black',
            line_width=0.4,
        ),
        diagonal=dict(visible=True),
    )

    return go.Figure(data=trace, layout=layout)
def create_sol_multiview():
    """Cluster the library's audio features and show them as a scatter matrix.

    Loads one feature dict per song through ``analyze.loadAudioFeatures()``,
    keeps the eight features listed in ``keys``, runs K-means with seven
    clusters on the standardised values and draws a scatter matrix of the raw
    values coloured by cluster label. The figure is shown and returned.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    dataOrig = analyze.loadAudioFeatures()
    # The return value is not needed here; the call is kept in case loading
    # has side effects. NOTE(review): confirm and drop if it has none.
    analyze.loadLibraryFromFiles()

    keys = ['danceability', 'energy', 'key', 'loudness', 'valence',
            'speechiness', 'tempo', 'time_signature']

    # One row per audio feature, built in ``keys`` order so that row i really
    # is the feature labelled keys[i]. Walking the song dict's own key order
    # instead attaches labels to whichever feature happens to come i-th there.
    dataArray = np.array([[song[key] for song in dataOrig] for key in keys])

    # NOTE(review): mean and std are taken over the whole array, not per
    # feature, so large-valued features dominate the distances. Kept as is to
    # leave the clustering unchanged - confirm whether per-feature scaling
    # was intended.
    allsongsstandardized = (dataArray - np.mean(dataArray)) / np.std(dataArray)

    # One row per song. The row order is reversed, exactly as the former
    # np.flip(np.rot90(a, 3)) produced; both arrays share that order, so the
    # cluster labels line up with the displayed rows.
    X_train_norm = allsongsstandardized.T[::-1]
    dataToDisplay = dataArray.T[::-1]

    kmeans = KMeans(n_clusters=7)
    kmeans.fit(X_train_norm)
    cs2 = kmeans.labels_.astype(float)

    fig = go.Figure(data=go.Splom(
        dimensions=[
            dict(label=label, values=dataToDisplay[:, column])
            for column, label in enumerate(keys)
        ],
        marker=dict(color=cs2,
                    showscale=False,  # colors encode categorical variables
                    line_color='white', line_width=0.5)
    ))
    fig.show()
    return fig
def create_figure_backup():
    """Cluster the library's audio features, plot them, and return the figure.

    Steps, as written:

    1. Load the per-song feature dicts (``analyze.loadAudioFeatures``) and the
       library (``analyze.loadLibraryFromFiles``).
    2. Collect the features named in ``keys`` into ``dataArray`` (one row per
       feature), standardise with the global mean/std and turn the result
       into one row per song.
    3. Fit K-means with seven clusters and show a scatter matrix coloured by
       cluster label.
    4. Add one line trace per cluster centre and show the figure again.
    5. Group the library tracks by cluster in ``clusteredSongs`` (built but
       not returned).

    Returns the plotly figure.
    """
    dataOrig = analyze.loadAudioFeatures()
    fullLib = analyze.loadLibraryFromFiles()
    # Author's note on the data: a list of 3799 dicts with 18 entries each,
    # e.g. {'danceability': 0.469, 'energy': 0.625, 'key': 4, 'loudness': -5.381,
    #       'mode': 0, 'speechiness': 0.0306, ..., 'valence': 0.325, 'tempo': 76.785,
    #       'type': 'audio_features', 'id': '...', ..., 'time_signature': 4}

    # Not referenced again in this function.
    dtype = [('danceability', '<f8'), ('energy', '<f8'), ('key', '<f8'), ('loudness', '<f8'), ('mode', '<f8'),
             ('speechiness', '<f8'), ('acousticness', '<f8'), ('instrumentalness', '<f8'), ('liveness', '<f8'),
             ('valence', '<f8'), ('tempo', '<f8'), ('type', '<f8'), ('id', '<f8'), ('duration_ms', '<f8'),
             ('time_signature', '<f8'), ]
    # The first assignment is immediately overwritten by the second.
    keys = ['danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness']
    keys = ['danceability', 'energy', 'key', 'loudness', 'valence', 'speechiness', 'tempo', 'time_signature']

    # NOTE(review): rows are appended in the key order of the first song dict,
    # not in the order of ``keys``, while the plot below labels column i with
    # keys[i]. If the two orders differ (the sample above lists 'speechiness'
    # before 'valence'), labels end up on the wrong feature - confirm.
    dataArray = []
    for key in dataOrig[0]:
        if key in keys:
            dataArray.append([li[key] for li in dataOrig])

    # Author's note: dataArray is 8 x 3799, one row per audio feature.
    dataArray = np.array(dataArray)

    # NOTE(review): mean/std are taken over the whole array rather than per
    # feature - confirm this is intended.
    dataArrayMean = np.mean(dataArray)
    dataArrayStd = np.std(dataArray)
    allsongsstandardized = (dataArray - dataArrayMean) / dataArrayStd

    # rot90(..., 3) followed by a flip over all axes turns the feature-major
    # array into one row per song (author's note: 3799 x 8).
    X_train_norm = allsongsstandardized
    X_train_norm = np.flip(np.rot90(X_train_norm, 3))
    dataToDisplay = np.flip(np.rot90(dataArray, 3))

    kmeans = KMeans(n_clusters=7)
    kmeans.fit(X_train_norm)
    # predict, centroids, correct, X2, nCols and nRows are assigned but not
    # read later in this function.
    predict = kmeans.predict(X_train_norm)

    centroids = kmeans.cluster_centers_

    correct = 0

    X2 = dataArray[0]
    nCols = len(X2)
    nRows = dataArray.shape[0]

    # Cluster label per song, used as the marker colour.
    cs2 = kmeans.labels_.astype(float)

    # This empty figure is replaced by the assignment that follows.
    fig = go.Figure()

    fig = go.Figure(data=go.Splom(
        dimensions=[dict(label=keys[0], values=dataToDisplay[:, 0]),
                    dict(label=keys[1], values=dataToDisplay[:, 1]),
                    dict(label=keys[2], values=dataToDisplay[:, 2]),
                    dict(label=keys[3], values=dataToDisplay[:, 3]),
                    dict(label=keys[4], values=dataToDisplay[:, 4]),
                    dict(label=keys[5], values=dataToDisplay[:, 5]),
                    dict(label=keys[6], values=dataToDisplay[:, 6]),
                    dict(label=keys[7], values=dataToDisplay[:, 7])
                    ],
        marker=dict(color=cs2,
                    showscale=False,  # colors encode categorical variables
                    line_color='white', line_width=0.5)
    ))
    fig.show()

    # One extra line trace per cluster centre; ``center`` and ``k`` are not
    # used inside the loop.
    for i, center in enumerate(kmeans.cluster_centers_):
        j = i % len(X_train_norm[0])
        k = (i + 1) % len(X_train_norm[0])
        # NOTE(review): x has one value per song while y is an arange between
        # the column's min and max, so the two generally differ in length -
        # looks like leftover experimentation; confirm.
        fig.add_trace(go.Scatter(x=X_train_norm[:, j], y=np.arange(min(X_train_norm[:, j]), max(X_train_norm[:, j])), mode='lines'))
    # NOTE(review): confirm this is meant to run once after the loop rather
    # than once per iteration.
    fig.show()

    # Bucket library tracks by cluster: each song's feature dict is merged
    # with the library entry whose track id matches. The buckets are neither
    # returned nor stored.
    clusteredSongs = [[] for i in range(kmeans.n_clusters)]
    for i, cluster in enumerate(cs2):
        songCluster = clusteredSongs[int(cluster)]
        track = next((item for item in fullLib['tracks'] if item['track']['id'] == dataOrig[i]['id']), None)
        if (track is not None):
            songCluster.append({**track, **dataOrig[i]})

    return fig
mode="lines+markers+text")) fig.update_layout(title="Линейная зависимость") fig.write_html("E:\\7 семестр\\ПЭОЭД\\Лаб7\\line.html", auto_open=True) #Гистограма trace = go.Bar(x=signs_df["skill1"], y=signs_df["value"], text=signs_df["skill2"], textposition='auto') fig = go.Figure(data=trace, layout=go.Layout(barmode='stack')) fig.update_layout(title="Гистограма") fig.write_html("E:\\7 семестр\\ПЭОЭД\\Лаб7\\gist.html", auto_open=True) #Матрица рассеевания fig = go.Figure(data=go.Splom( dimensions=[ dict(label='skill1', values=signs_df['skill1']), dict(label='skill2', values=signs_df['skill2']), dict(label='value', values=signs_df['value']) ], text=signs_df['skill2'], )) fig.update_layout(title="Матрица рассеяния") fig.write_html("E:\\7 семестр\\ПЭОЭД\\Лаб7\\matrix.html", auto_open=True) #Облако text = "" for skills in df["key_skills"].values: for el in skills.split(';'): text += el + ' ' wordcloud = WordCloud(width=3000, height=2000, background_color='black').generate(str(text)) wordcloud.to_file("E:\\7 семестр\\ПЭОЭД\\Лаб7\\cloud.png") #размах выбросов print(len(my.FindVibros(df, "min_salary")))
x, y = SVMSMOTE(stratey = strategy, k_neighbors = knn, n_jobs = n_jobs, ratio = ratio, out_step = stepsize).fit_resample(X, Y) elif method.lower() == 'kmeans': x, y = KmeansSMOTE(stratey = strategy, k_neighbors = knn, n_jobs = n_jobs, ratio = ratio, ).fit_resample(X, Y) else: raise Exception("{} is not a valid method for OverSampling".format(method)) df = pd.DataFrame([x, y], columns = list(df.columns) + [_class]) fig = go.Figure() fig.add_trace( go.Splom( dimensions = [ dict(label = column, values = df[column]) for column in df.columns ], marker = dict( color = df[_class] ) ) ) fig.show() if transform: return df return def UnderSample(df, _class, method = 'cc', strategy = 'auto', n_jobs = 1, ratio = None, transform = None, offline = None): """
def display_selected_data(selectedArea, choiceNB):
    """Return a scatter matrix of tree density, land price and property density.

    Parameters
    ----------
    selectedArea
        ``None`` or a selection payload with a ``"points"`` list. The part of
        each point's ``"text"`` before the first ``"<br"`` is matched against
        the key column to filter rows.
    choiceNB
        ``'boroughs'`` uses ``df_trees_properties_boro`` keyed by ``borough``;
        any other value uses ``df_trees_properties`` keyed by ``ntaname``.

    Pearson coefficient and p-value are computed for the three column pairs.
    When each pair has at least two rows they are written as annotations into
    the off-diagonal cells; otherwise no annotations are added.
    """
    if choiceNB == 'boroughs':
        df_selected, key = df_trees_properties_boro, 'borough'
    else:
        df_selected, key = df_trees_properties, 'ntaname'

    font_ann = dict(size=10, color=colors['text'])

    if selectedArea is not None:
        area_names = [
            str(point["text"].split("<br")[0])
            for point in selectedArea["points"]
        ]
        df_selected = df_selected[df_selected[key].isin(area_names)]

    index_vals = df_selected['borough'].astype('category').cat.codes

    # find pearson coeff and p_value for each pair of attributes
    pairs = [['trees/sq.mile', 'avg.landprice_thous$/acre'],
             ['trees/sq.mile', 'properties/sq.mile'],
             ['avg.landprice_thous$/acre', 'properties/sq.mile']]
    coef_list = []
    flag = True
    for first, second in pairs:
        if len(df_selected[first]) < 2 or len(df_selected[second]) < 2:
            flag = False
            continue
        coef_list.append(pearsonr(df_selected[first], df_selected[second]))

    ann = []
    if flag:
        # (x, y, xref, yref, index into coef_list): each coefficient is shown
        # in both mirrored cells of its pair.
        cells = (
            (5000, 6000, "x2", "y1", 0),
            (6000, 5000, "x1", "y2", 0),
            (14000, 6000, "x3", "y1", 1),
            (6000, 14000, "x1", "y3", 1),
            (14000, 6000, "x3", "y2", 2),
            (6000, 14000, "x2", "y3", 2),
        )
        for x_pos, y_pos, xref, yref, pair_idx in cells:
            ann.append(dict(
                x=x_pos,
                y=y_pos,
                xref=xref,
                yref=yref,
                font=font_ann,
                text="PCC: " + str(round(coef_list[pair_idx][0], 2)) + "<br>p: " +
                     ('{:0.1e}'.format(coef_list[pair_idx][1])),
                showarrow=False,
            ))

    axisd = dict(showline=True, zeroline=False, gridcolor='#104752', showticklabels=True)
    axis_settings = {}
    for number in (1, 2, 3, 4):
        axis_settings['xaxis' + str(number)] = dict(axisd)
        axis_settings['yaxis' + str(number)] = dict(axisd)

    # here we build a scatter matrix, and add annotations for each subgraph
    layout = go.Layout(
        dragmode='select',
        margin=dict(l=0, r=0, b=0, t=0, pad=0),
        autosize=False,
        hovermode='closest',
        font=dict(color=colors['text'], size=12),
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background'],
        annotations=ann,
        **axis_settings
    )

    if key == 'ntaname':
        hover_text = df_selected[key] + ': ' + df_selected['borough']
    else:
        hover_text = df_selected[key]

    trace = go.Splom(
        dimensions=[
            dict(label='trees/sq.mile',
                 values=df_selected['trees/sq.mile']),
            dict(label='avg.landprice($K/A)',
                 values=df_selected['avg.landprice_thous$/acre']),
            dict(label='properties/sq.mile',
                 values=df_selected['properties/sq.mile']),
        ],
        text=hover_text,
        hoverinfo="x+y+text",
        marker=dict(
            color=index_vals,
            showscale=False,  # colors encode categorical variables
            line_color='white',
            line_width=0.4,
        ),
        diagonal=dict(visible=True),
    )

    return go.Figure(data=trace, layout=layout)