# NOTE(review): this line is a flattened concatenation of several notebook cells.
# It begins MID-STATEMENT (the tail of a plotly/requirejs configuration snippet,
# presumably the end of a `configure_plotly()` HTML template) and ends MID-FUNCTION
# (`def vector(...)` is truncated after `measure.find_contours`), so the code is
# left byte-identical here.
#
# What the visible, complete portion does:
#  * initialises cufflinks/plotly offline mode and a pandas float format;
#  * downloads the Zillow single-family-residence ZHVI CSV and — caution — binds
#    it to the name `re`, shadowing the stdlib `re` module for the rest of the file;
#  * builds `re_median` with median price series for Los Angeles, San Francisco
#    and the whole USA (`.iloc[3:]` drops the three non-date summary rows —
#    presumably RegionID/RegionName/SizeRank; TODO confirm);
#  * plots the three series with cufflinks (`shape=(4,1)` subplots, filled);
#  * `style.set_precision(1)` — NOTE(review): deprecated in modern pandas in
#    favour of `style.format(precision=1)`; verify the pinned pandas version;
#  * imports skimage and begins `vector(...)`, which fetches a decor PNG,
#    converts it to grayscale and extracts contours at `level` — the remainder
#    of that function is outside this chunk.
{paths:{base:'/static/base', plotly:'https://cdn.plot.ly/plotly-1.5.1.min.js?noext'}}); </script>""")) cf.go_offline(); configure_plotly() init_notebook_mode(connected=False) pd.set_option('display.float_format',lambda x:'%.6f'%x) url='https://raw.githubusercontent.com/noahgift/'+\ 'real_estate_ml/master/data/'+\ 'Zip_Zhvi_SingleFamilyResidence.csv' re=pd.read_csv(url).dropna().astype({'RegionID':'int'})\ .astype({'RegionName':'int'}).astype({'SizeRank':'int'}) re_median=pd.concat([re[re['CountyName']=='Los Angeles'].median(), re[re['CountyName']=='San Francisco'].median(), re.median()],axis=1,sort=False).iloc[3:] re_median.columns=['Los Angeles','San Francisco','Median USA'] layout=cf.Layout(height=500,width=800) re_median.iplot(title='Median Single Family Home Prices 1996-2017', xTitle='Year',yTitle='Sales Price', shape=(4,1),fill=True,layout=layout) re_median.T.iloc[:,:7].style.set_precision(1) import pylab; from skimage import io,color,measure cmaps=['ocean','cool','gnuplot2','terrain', 'winter','spring','summer','autumn'] pylab.style.use('ggplot') def vector(file_num,cm,level=.85): file_path='https://olgabelitskaya.gitlab.io/data/decors/' file_name='00_00_%03d'%(file_num)+'.png' img=io.imread(file_path+file_name) gray_img=color.colorconv.rgb2grey(img) contours=measure.find_contours(gray_img,level)
# --- Tail of the per-category distribution plot from the previous cell ---
ax2.set_title('Loan status')
plt.xticks(rotation=90)
ax2.set_xlabel(col_name)
plt.tight_layout()

# ### Feature correlations

# In[23]:
# Spearman rank correlation: robust to monotone non-linear relationships
# and to outliers, unlike plain Pearson.
corr = df_selected.corr(method='spearman')

# In[24]:
layout = cf.Layout(height=600, width=600)
corr.abs().iplot(kind='heatmap', layout=layout.to_plotly_json(), colorscale='RdBu')

# In[25]:
import scipy
import scipy.cluster.hierarchy as sch

# Reorder the feature columns via complete-linkage hierarchical clustering so
# that strongly-correlated features appear adjacent in the heatmap.
X = df_selected.corr(method='spearman').values
d = sch.distance.pdist(X)  # condensed vector of (n choose 2) pairwise distances
L = sch.linkage(d, method='complete')
ind = sch.fcluster(L, 0.25 * d.max(), 'distance')  # flat clusters at 25% of max distance
columns = [df_selected.columns.tolist()[i] for i in np.argsort(ind)]
# BUG FIX: DataFrame.reindex_axis() was deprecated in pandas 0.21 and removed
# in pandas 1.0; reindex(labels, axis=1) is the supported equivalent.
df_selected_new = df_selected.reindex(columns, axis=1)
# Pie chart of sessions per region, drawn on a dark background.
trace_1 = go.Pie(
    labels=grouped.region,
    values=grouped.session_id,
    name='session_id',
    showlegend=False,
)
layout2 = go.Layout(
    plot_bgcolor='rgb(26,26,26)',
    paper_bgcolor='rgb(26,26,26)',
    font_color='white',
    hovermode='closest',
)
fig2 = go.Figure(data=[trace_1], layout=layout2)

# Step 3. Create a plotly figure
# Shared cufflinks layout: horizontal legend above the plot, dark theme.
layout = cf.Layout(
    legend=dict(
        orientation='h',
        y=1,
        xanchor='center',
        x=0.4,
    ),
    margin={'t': 0},
    plot_bgcolor='rgb(26,26,26)',
    paper_bgcolor='rgb(26,26,26)',
    font_color='white',
    font_size=11,
)

# aggregating the data for plotting
# Users per region -> pie figure (iplot with asFigure=True returns the figure
# instead of rendering it).
regions = user_info.groupby('region').count()['user_id'].reset_index()
regions = regions.iplot(
    kind='pie',
    labels='region',
    values='user_id',
    asFigure=True,
    legend=False,
    theme='space',
    title='% of Users by Region',
)

# Users per (region, age range) -> grouped bar figure.
user_age = user_info.groupby(['region', 'age_range']).count()['user_id'].unstack()
user_age = user_age.iplot(
    kind='bar',
    theme='space',
    asFigure=True,
    xTitle='age range',
    yTitle='number of users',
    title='Age Demographic by Region',
)
def plot_correlation(
    df,
    cluster=False,
    layered_cluster=False,
    iplot=False,
    triangle=False,
    like=None,
    sort_by_carrier=False,
    figsize=(35, 30),
    title="",
):
    """Plot a correlation heatmap of the columns of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Data whose column-wise correlations are plotted. A MultiIndex header
        is flattened to single space-joined labels when ``iplot`` is used.
    cluster : bool
        Reorder columns via ``cluster_correlations`` before correlating.
    layered_cluster : bool
        Passed through to ``cluster_correlations(layered=...)``.
    iplot : bool
        If True render an interactive cufflinks heatmap, otherwise a static
        seaborn heatmap on a new matplotlib figure.
    triangle : bool
        Show only the lower triangle of the correlation matrix.
    like : str, optional
        If given, keep only correlation columns whose label contains ``like``.
    sort_by_carrier : bool
        Group columns by technology prefix (wind/solar/gas/storage/lines)
        before correlating.
    figsize : tuple
        Figure size for the static (seaborn) variant.
    title : str
        Axis title for the static variant.
    """
    # Plotly needs flat string labels; join MultiIndex levels with spaces.
    if iplot and isinstance(df.columns, pd.MultiIndex):
        df.columns = [" ".join(col).strip() for col in df.columns.values]
    if cluster:
        df = cluster_correlations(df, layered=layered_cluster)
    if sort_by_carrier:
        # Fixed display order: renewables, conventionals, storage, links/lines.
        columns = []
        for carrier in [
            "offwind-dc",
            "offwind-ac",
            "onwind",
            "solar",
            "CCGT",
            "OCGT",
            "H2",
            "battery",
            "LK",
            "LN",
        ]:
            columns.extend(df.filter(like=carrier, axis=1).columns)
        df = df.reindex(columns, axis="columns")
    df = df.rename(columns=opts["nice_names"])
    corr = df.corr()
    if like is not None:
        corr = corr.filter(like=like)
    if iplot:
        if triangle:
            # Mask the upper triangle. BUG FIX: the deprecated alias np.bool
            # was removed in NumPy 1.24; the builtin bool is the documented
            # replacement and is behaviorally identical here.
            corr = corr.where(np.tril(np.ones(corr.shape)).astype(bool))
        lt = cf.Layout(height=1000, width=1000)
        corr.iplot(kind="heatmap", colorscale="PiYG", zmin=-1, zmax=1, layout=lt)
    else:
        f, ax = plt.subplots(figsize=figsize)
        # Same np.bool -> bool fix as above.
        mask = np.zeros_like(corr, dtype=bool)
        if triangle:
            mask[np.triu_indices_from(mask)] = True
        sns.heatmap(
            corr,
            mask=mask,
            vmin=-1.0,
            vmax=1.0,
            cmap=sns.diverging_palette(230, 10, as_cmap=True),
            square=True,
            ax=ax,
        )
        ax.set_ylabel("")
        ax.set_xlabel("")
        ax.set_title(title)
def createEarningsTables(columnName, columnNameMethod):
    """Print earnings tables per song and per `columnName` period, then plot
    earnings normalized to each song's best period and shifted so songs line
    up from their release year.

    Reads the module-level `dfgSource` DataFrame (and mutates its
    'distribution_year' column to int) and the `nrOfSongs` count.
    `columnNameMethod` maps a column position to a period label.
    """
    # Pivot: one row per song, one column per period, summed payable amounts.
    earnings = dfgSource.groupby(['track_title_id', columnName])['payable_amount'].sum().unstack(columnName)
    earnings = earnings.rename(columns=lambda columnNameYearQuarter: toDate(columnNameYearQuarter))

    # Per-song maximum over all periods (axis=1 = across columns).
    row_max = earnings.max(axis=1)
    general_info = row_max.to_frame()
    general_info.columns = ['Highest earnings in a year']

    # min() below needs an integer year.
    dfgSource['distribution_year'] = dfgSource['distribution_year'].astype(int)
    general_info['start year'] = dfgSource.groupby(['track_title_id'])['distribution_year'].min()
    # first() takes the first value of each group.
    general_info['song_release_year'] = dfgSource.groupby(['track_title_id'])['song_release_year'].first()
    # Missing release year: fall back to the first distribution year.
    general_info.loc[general_info["song_release_year"] == '', 'song_release_year'] = general_info["start year"]
    general_info["song_release_year"] = general_info["song_release_year"].astype(int)
    general_info['music_style'] = dfgSource.groupby(['track_title_id'])['music_style'].first()
    general_info['sheet'] = dfgSource.groupby(['track_title_id'])['sheet'].first()

    print("The " + str(nrOfSongs) + " highest annual earning songs:\n", general_info.to_string())
    print("\n\nPayable amount per " + columnName + ":\n", earnings.to_string())

    # Normalize every row by its own maximum so songs become comparable.
    earnings = earnings.div(row_max, axis=0)

    shifted = earnings.copy()
    shifted['song_release_year'] = pd.to_datetime(general_info["song_release_year"].astype(int), format="%Y")
    print(shifted.to_string())

    # Align all songs on a common timeline starting at the earliest release.
    earliest_release = shifted['song_release_year'].min()
    shifted = shifted.apply(
        lambda row: shiftRowBasedOnFirstIndexLargerThenZeroAndSongReleaseYear(row, earliest_release),
        axis=1, result_type='expand', raw=True)
    # Relabel columns: positional index -> period label via columnNameMethod.
    shifted = shifted.rename(columns=lambda x: columnNameMethod(shifted.columns.get_loc(x)))

    # Work on a copy so the summary rows don't feed back into mean/std/etc.
    shifted_stats = shifted.copy()
    shifted_stats.loc['count'] = shifted.count()
    shifted_stats.loc['mean'] = shifted.mean()
    shifted_stats.loc['standar dev.'] = shifted.std()
    shifted_stats.loc['min'] = shifted.min()
    shifted_stats.loc['max'] = shifted.max()

    setFloatFormat('{:.3f}')  # 3-digit display for the stats table
    import IPython  # kept from the original; presumably for notebook display
    print(shifted_stats.to_string())
    setNormalFloatFormat()

    shifted.transpose().iplot(
        kind="scatter", xTitle='Payable amount', yTitle='Payable amount',
        layout=cf.Layout(height=1000, width=1800))
increase_frameheight()
enable_plotly_in_cell()

# Top 150 songs by their best single-year earnings.
dfg = df
top_songs = (
    dfg.groupby(['track_title_id', 'distribution_year'])['payable_amount']
    .sum()
    .unstack('distribution_year')
    .max(axis=1)
    .sort_values(ascending=False)
    .head(150)
    .index.tolist()
)

companies = ['SPOTIFY', 'APPLE MUSIC']
print('Checking the nr of plays done with: ', companies)

# Restrict to the streaming services, then pivot plays per song per quarter.
dfg = dfg[dfg['music_user'].isin(companies)]
dfg = (
    dfg[dfg['track_title_id'].isin(top_songs)]
    .groupby(['track_title_id', 'distribution_period'])['number_of_plays']
    .sum()
    .unstack('distribution_period')
)
dfg = dfg.rename(columns=lambda columnNameYearQuarter: toDate(columnNameYearQuarter))

dfgMaxPlaysPerSong = dfg.max(axis=1).sort_values(ascending=False).to_frame()
dfgMaxPlaysPerSong.columns = ['Maximum plays in a quarter']
print(dfgMaxPlaysPerSong.to_string())
print("\n\nNr of plays per quarter:\n", dfg.to_string())
dfg.transpose().iplot(
    kind="scatter", xTitle='Distribution period', yTitle='Number of plays',
    layout=cf.Layout(height=1000, width=1800))

# Normalize each song by its own best quarter, then shift every row so the
# first non-zero quarter lines up at position one.
dfg = dfg.div(dfgMaxPlaysPerSong['Maximum plays in a quarter'], axis=0)
dfgShifted = dfg.copy()
dfgShifted = dfgShifted.apply(
    lambda row: shiftRowBasedOnFirstIndexLargerThenZero(row),
    axis=1, result_type='expand', raw=True)
dfgShifted = dfgShifted.rename(columns=lambda x: toYearMonth(dfgShifted.columns.get_loc(x) + 1))
print('Number of plays historically\n', dfgShifted.to_string())
dfgShifted.transpose().iplot(
    kind="scatter", xTitle='Period since receiving revenue', yTitle='Number of plays',
    layout=cf.Layout(height=1000, width=1800))

increase_frameheight()
enable_plotly_in_cell()

# (Translated from Dutch)
# Songs whose %streaming revenue is at least 50% of (total revenue minus
# international sources); likewise at least 75% and at least 90% — looking for
# the sweet spot between still having a decent number of songs and actually
# seeing a correlation.
unitedStatesOnlyCriterion = df['region'] == 'domestic'