**Model:** [Neural Collaborative Filtering](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF)
**Framework:** PyTorch
""")

path_to_file = "training/PyTorch/Recommendation/NCF/results.csv"
df1 = load_data(path_to_file)
cols1 = list(df1.columns.values)

if st.checkbox("Show data benchmark 1"):
    st.table(df1)

dfp1 = df1.loc[:, [cols1[0], cols1[1], cols1[4], cols1[6]]]
dfp1 = dfp1.rename(columns={cols1[4]: 'Time to train (s)'})
dfp1 = dfp1.rename(columns={cols1[6]: 'Throughput (samples/s)'})
dfp1['Training type'] = 'FP32'

dfp2 = df1.loc[:, [cols1[0], cols1[1], cols1[5], cols1[7]]]
dfp2 = dfp2.rename(columns={cols1[5]: 'Time to train (s)'})
dfp2 = dfp2.rename(columns={cols1[7]: 'Throughput (samples/s)'})
dfp2['Training type'] = 'Mixed precision'

dff = pd.concat([dfp1, dfp2])
# st.table(dff)
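# `load_data` is not defined in this excerpt; a minimal sketch, assuming it simply
# reads the exported benchmark CSV (the `st.cache` decorator is an assumption to
# avoid re-reading the file on every rerun).
import pandas as pd
import streamlit as st


@st.cache
def load_data(path):
    # Read the NCF benchmark results CSV into a DataFrame
    return pd.read_csv(path)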
def app():
    # THE DATA
    data_load = st.text('Loading data...')
    df = get_cached_data()
    data_load.text(' ')

    # # Control charts
    # csfont = {'fontname': 'Comic Sans MS'}
    # hfont = {'fontname': 'Helvetica'}
    # # plt.rcParams['font.sans-serif'] = 'Arial'
    # # plt.rcParams['font.family'] = 'sans-serif'
    # plt.rcParams['text.color'] = '#404040'
    # plt.rcParams['axes.labelcolor'] = '#404040'
    # plt.rcParams['xtick.color'] = '#402020'
    # plt.rcParams['ytick.color'] = '#402020'
    # plt.rcParams['font.size'] = 12

    st.markdown('# SURVEY - Characteristics of the respondents')
    st.write('''#''')

    ##########
    # GENDER #
    ##########
    st.markdown('## Gender of respondents: percentage distribution')
    st.markdown('#')
    st.markdown("""
Respondents were 72.08% female and 26.89% male. The remaining respondents
answered 'other' or did not provide an answer.
""")

    col1, col2, col3, col4 = st.beta_columns((2, 0.5, 1.2, 0.8))

    with col1:
        color_palette_list = ['#CDF1AE', '#90DACC', '#F7E9D1',
                              '#F4D2B5', '#EAB7B7', '#B89ACF']
        # GENDER PIE CHART
        values = list(df['Dem_gender'].value_counts(normalize=True))
        names = list(df['Dem_gender'].value_counts().index)
        fig = px.pie(df, values=values, names=names)
        fig.update_traces(textposition='outside',
                          textinfo='percent+label',
                          marker=dict(colors=color_palette_list))
        st.plotly_chart(fig, use_container_width=True)

    with col3:
        st.markdown('''
#
#
#### Gender of respondents percentage
''')
        # GENDER TABLE
        df_gender = pd.DataFrame(df['Dem_gender'].value_counts(normalize=True))
        df_gender.columns = ['%']
        st.table(df_gender)

    st.write('''---''')

    ##############
    # EMPLOYMENT #
    ##############
    st.markdown('## Employment status of respondents: percentage distribution')
    st.markdown('#')
    st.markdown('''
The majority of respondents (58.83%) were in full-time or part-time work or
self-employed, 15.69% were either unemployed or retired, and 16.6% were students.
''')

    col1, col2, col3, col4 = st.beta_columns((2, 0.5, 1.2, 0.8))

    with col1:
        # EMPLOYMENT PIE CHART
        values = list(df['Dem_employment'].value_counts(normalize=True))
        names = list(df['Dem_employment'].value_counts().index)
        fig = px.pie(df, values=values, names=names,
                     title='Employment Status of respondents')
        fig.update_traces(textposition='inside',
                          textinfo='percent+label',
                          marker=dict(colors=color_palette_list))
        st.plotly_chart(fig, use_container_width=True)

    with col3:
        st.markdown('''
#
### Employment Status of respondents percentage
''')
        # EMPLOYMENT TABLE
        df_employment = pd.DataFrame(
            df['Dem_employment'].value_counts(normalize=True) * 100)
        df_employment.columns = ['%']
        st.table(df_employment)

    st.write('''---''')

    #######
    # AGE #
    #######
    st.write('## Age of respondents')
    st.markdown('#')

    col1, col2, col3, col4 = st.beta_columns((2, 0.25, 0.75, 1))

    # AGE HISTOGRAM
    with col1:
        st.markdown(
            'The age of the respondents ranged from 18 to 110, with a median age of 38.89.')
        st.markdown('#')
        # FIG 1
        counts, bins = np.histogram(df.Dem_age, bins=range(18, 115, 5))
        bins = 0.5 * (bins[:-1] + bins[1:])
        fig = px.bar(x=bins, y=counts, labels={'x': 'Age', 'y': 'Count'})
        st.plotly_chart(fig, use_container_width=True)

        # FIG 2
        # fig = px.histogram(df, x="Dem_age", nbins=45, histnorm='percent',
        #                    title='Age distribution of respondents', opacity=0.5)
        # st.plotly_chart(fig, use_container_width=True)

        # FIG 3
        # st.bar_chart(np.histogram(df['Dem_age'], bins=25)[0])

    with col3:
        # st.markdown('#')
        st.markdown(f"""
Maximum age of a respondent: `{df['Dem_age'].max()}` \n
Minimum age of a respondent: `{df['Dem_age'].min()}` \n
Mean age: `{df['Dem_age'].mean().round(2)}`
""")
        st.table(df[['Dem_age']].describe())
for i in range(100):
    time.sleep(0.01)
    bar.progress(i + 1)

# display data
# single line code
st.code('import pandas as pd')

# section code
with st.echo():
    import pandas as pd
    import numpy as np
    import datetime as dt
    import matplotlib.pyplot as plt
    import seaborn as sns

# plots
arr = np.random.normal(1, 1, size=100)
plt.hist(arr, bins=20)
st.pyplot()

# dataframe
df = pd.DataFrame(np.random.randn(5, 5),
                  columns=('col_%d' % i for i in range(5)))
st.dataframe(df)

# table()
st.table(df)
def run():
    # Fetch fundamental data
    earning_q, earning_a = getData.get_fundamental_data_by_Json(input_ticker, "EARNINGS")
    income_q, income_a = getData.get_fundamental_data_by_Json(input_ticker, "INCOME_STATEMENT")
    balance_q, balance_a = getData.get_fundamental_data_by_Json(input_ticker, "BALANCE_SHEET")
    cash_q, cash_a = getData.get_fundamental_data_by_Json(input_ticker, "CASH_FLOW")

    # Fetch summary data
    description_df, ratio_df, return_df, profit_df, dividend_df, volume_df, price_data, valuation_df = getData.get_overview(input_ticker)
    st.table(description_df)
    st.table(price_data)
    st.table(volume_df)
    st.table(return_df)
    st.table(dividend_df)
    st.table(ratio_df)
    st.table(valuation_df)
    # st.dataframe(income_q)
    # st.dataframe(balance_q)
    # st.dataframe(cash_a)
    # st.dataframe(earning_q)

    # Growth ratios
    growth_df = makeData.make_growthRatio(earning_a, earning_q, income_a, cash_a, balance_a)
    st.table(growth_df)

    # Close-price data
    price_data = getData.get_close_data(input_ticker, earning_q.iloc[0, 0], earning_q.iloc[-1, 0])
    price_df = getData.get_close_data(input_ticker, before, today)
    # st.dataframe(price_df)

    # Draw chart
    chart.earning_chart(input_ticker, earning_q, earning_a, price_data)

    fig = go.Figure(
        go.Indicator(mode="gauge+number+delta",
                     value=valuation_df.at['RIM', 'Valuation'],
                     delta={'reference': valuation_df.at['Price', 'Valuation'],
                            'relative': True},
                     title={'text': "RIM-Price(r=12%)"},
                     domain={'x': [0, 1], 'y': [0, 1]}))
    st.plotly_chart(fig)

    if valuation_df.at['Earnings Yield', 'Valuation'] == '-%':
        valuation_df.at['Earnings Yield', 'Valuation'] = '0%'
    fig = go.Figure(
        go.Indicator(
            mode="number+delta",
            value=float(valuation_df.at['Earnings Yield', 'Valuation'].replace("%", "")),
            title={"text": "Earnings Yield<br><span style='font-size:0.8em;color:gray'>Demand Yield(15%)</span>"},
            domain={'x': [0, 1], 'y': [0, 1]},
            delta={'reference': 15}))
    st.plotly_chart(fig)

    # Draw charts
    chart.price_chart(input_ticker, price_df)
    chart.income_chart(input_ticker, income_q, income_a)
    chart.income_margin_chart(input_ticker, income_q)
    chart.balance_chart(input_ticker, balance_q)
    chart.cashflow_chart(input_ticker, cash_q, income_q)

    # The data source requires waiting one minute between lookups
    st.warning('Please wait one minute before searching the next company!')
    my_bar = st.progress(0)
    for percent_complete in range(100):
        time.sleep(0.6)
        my_bar.progress(percent_complete + 1)
                                   min_value=0,
                                   max_value=90,
                                   value=30,
                                   step=1)

if st.sidebar.button('查詢'):
    if querybond.startswith('A'):
        govtcondition = bondfromcsv.ID.map(lambda x: x.startswith('A'))
        inbond = twgovbondlist[twgovbondlist['ID'] == querybond]
    else:
        govtcondition = bondfromcsv.ID.map(lambda x: not x.startswith('A'))
        inbond = twcorpbondlist[twcorpbondlist['ID'] == querybond]
    st.subheader("Bloomberg 公平市價")
    st.table(inbond)

    first_record_date = np.datetime64(
        datetime.date.today() - datetime.timedelta(start_date))
    # target = bondfromcsv[bondfromcsv['ID']==querybond & bondfromcsv['Trade_date']>=(np.datetime64((datetime.date.today()-datetime.timedelta(start_date))))]
    target = bondfromcsv[(bondfromcsv['ID'] == querybond)
                         & (bondfromcsv['Trade_date'] >= first_record_date)]
    if target.size == 0:
        st.write("No trade record")
    else:
        st.subheader("本券近期成交記錄")
        show_data(
            target[['ID', 'Name', 'Duration', 'YTM', 'VolumeE', 'Trade_date']],
            showtype)
    if querybond.startswith('A') & (inbond.size > 0):
def main(): st.image('logo.png', width=200) st.title('Análise de dados Pulseira MiBand') st.markdown(""" Análise dos dados extraídos do monitoramento da pulseira de marca MiBand. \n Para obter teus dados, acesse essa [url](https://user.huami.com/hm_account/2.0.0/index.html#/threeLogin) com sua conta, e selecione a opção Exportar dados. """) file = st.file_uploader('Escolha a base de dados que deseja analisar (.csv)', type='csv') if file is not None: df = pd.read_csv(file) if 'date' in df.columns: st.sidebar.title('Filtrar por data:') df = check_data(df) if st.sidebar.checkbox("Mostrar tabela?"): st.header("Raw Data") st.write(df) st.sidebar.info("Foram carregadas {} linhas".format(df.shape[0])) st.subheader('Estatística descritiva univariada') aux = pd.DataFrame({"colunas": df.columns, 'tipos': df.dtypes}) colunas_numericas = list(aux[aux['tipos'] != 'object']['colunas']) colunas_object = list(aux[aux['tipos'] == 'object']['colunas']) colunas = list(df.columns) col = st.selectbox('Selecione a coluna :', colunas_numericas) if col is not None: st.markdown('Selecione o que deseja analisar :') mean = st.checkbox('Média') if mean: st.markdown(df[col].mean()) median = st.checkbox('Mediana') if median: st.markdown(df[col].median()) desvio_pad = st.checkbox('Desvio padrão') if desvio_pad: st.markdown(df[col].std()) kurtosis = st.checkbox('Kurtosis') if kurtosis: st.markdown(df[col].kurtosis()) skewness = st.checkbox('Skewness') if skewness: st.markdown(df[col].skew()) describe = st.checkbox('Describe') if describe: st.table(df[colunas_numericas].describe().transpose()) st.subheader('Visualização dos dados') st.markdown('Selecione a visualizacao') histograma = st.checkbox('Histograma') if histograma: col_num = st.selectbox('Selecione a Coluna Numerica: ', colunas_numericas,key = 'unique') st.markdown('Histograma da coluna : ' + str(col_num)) st.write(criar_histograma(col_num, df)) if colunas_object: barras = st.checkbox('Gráfico de barras') if barras: col_num_barras = st.selectbox('Selecione a coluna numerica: ', colunas_numericas, key = 'unique') col_cat_barras = st.selectbox('Selecione uma coluna categorica : ', colunas_object, key = 'unique') st.markdown('Gráfico de barras da coluna ' + str(col_cat_barras) + ' pela coluna ' + col_num_barras) st.write(criar_barras(col_num_barras, col_cat_barras, df)) if colunas_object: boxplot = st.checkbox('Boxplot') if boxplot: col_num_box = st.selectbox('Selecione a Coluna Numerica:', colunas_numericas, key = 'unique' ) col_cat_box = st.selectbox('Selecione uma coluna categorica : ', colunas_object, key = 'unique') st.markdown('Boxplot ' + str(col_cat_box) + ' pela coluna ' + col_num_box) st.write(criar_boxplot(col_num_box, col_cat_box, df)) if colunas_object: scatter = st.checkbox('Scatterplot') if scatter: col_num_x = st.selectbox('Selecione o valor de x ', colunas_numericas, key = 'unique') col_num_y = st.selectbox('Selecione o valor de y ', colunas_numericas, key = 'unique') col_color = st.selectbox('Selecione a coluna para cor', colunas) st.markdown('Selecione os valores de x e y') st.write(criar_scatterplot(col_num_x, col_num_y, col_color, df)) correlacao = st.checkbox('Correlacao') if correlacao: st.markdown('Gráfico de correlação das colunas númericas') st.write(cria_correlationplot(df, colunas_numericas))
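# The helpers `criar_histograma`, `criar_barras`, `criar_boxplot`, `criar_scatterplot`
# and `cria_correlationplot` are referenced above but not included in this excerpt.
# A minimal sketch of `criar_histograma` using Plotly Express, assuming the other
# helpers follow the same pattern of returning a figure for `st.write`.
import plotly.express as px


def criar_histograma(coluna, df):
    # Histogram of a single numeric column; the caller renders it with st.write(fig)
    fig = px.histogram(df, x=coluna, nbins=30, title=f'Histograma de {coluna}')
    fig.update_layout(bargap=0.05)
    return fig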
def main(): def correlacao(data): plt.figure(figsize=(23, 8)) sns.heatmap(train_df.corr(), annot=True, fmt='.2f') option = st.sidebar.selectbox("Menu: ", ['Análise', 'Predição', 'Sobre']) st.sidebar.markdown( '* [email protected] \n' '* [LinkedIn](https://www.linkedin.com/in/nilsoncunhan/) - ' '[Portfólio](https://nilsoncunha.github.io/portfolioweb/) - ' '[GitHub](https://github.com/nilsoncunha/portfolio)') if option == 'Análise': train = load_train() test = load_test() st.image( 'https://raw.githubusercontent.com/nilsoncunha/portfolioweb/master/assets/img/posts/enem.jpg', use_column_width=True) st.title('Prevendo as notas de matemática do ENEM do ano de **2016**') # Minha apresentação st.markdown( '>*Análise baseada em uma das atividades propostas pelo programa de aceleração da **Codenation** que ' 'participei no final de 2019, **Acelera Dev - Data Science**, em Belo Horizonte.*' ) st.markdown( '>*Iniciei novamente a aceleração, que agora está sendo online, através do convite da própria ' '**Codenation** com o intuito de auxiliar os participantes nos desafios, códigos e também ' 'passar para eles a experiência que tive no presencial. Dessa vez vou apenas refazer a análise já feita ' 'anteriormente (se quiser verificar é só ' '[clicar aqui](https://nilsoncunha.github.io/portfolioweb/prevendo-nota-de-matematica-do-enem-2016/)) ' 'utilizando uma ferramenta apresentada pelo [Túlio Vieira](https://www.linkedin.com/in/tuliovieira/) ' '(instrutor da aceleração), que é o [Streamlit](https://docs.streamlit.io/index.html)*.' ) # Apresentação da biblioteca st.markdown( '>*Vou fazer aqui uma breve apresentação dessa biblioteca, que merece muitos aplaudos, antes de iniciar. ' 'Trazendo a definição do próprio Stramlit que se apresenta assim: ' '"O Streamlit é uma biblioteca Python de código aberto que **facilita** (e muito, esse por minha conta) a ' 'criação de aplicativos da Web personalizados e bonitos para aprendizado de máquina e ciência de dados...".' ' Com essa facilidade não precisamos ficar preocupado em utilizar html, css, javascript, etc., para montar ' 'uma interface ou ter que utilizar PowerPoit, ou outra coisa para apresentarmos ao negócio nossa análise. ' 'Se antes realizávamos toda a documentação no próprio notebook, com o ' '[Streamlit](https://docs.streamlit.io/index.html) conseguiremos fazer a documentação e deixar muito mais ' 'apresentável para as outras pessoas*') st.markdown( '> *Podemos ver a diferença no qual fiz o [deploy](https://portfolio-enem.herokuapp.com/) no Heroku ' 'utilizando html e css.*') st.markdown( 'Então, fazendo o resumo da análise e demonstrando um pouco da ferramenta, Vamos lá! 
' 'Bases utilizadas de [treino](https://dl.dropbox.com/s/7vexlzohz7j3qem/train.csv?dl=0) e de ' '[teste](https://dl.dropbox.com/s/dsgzaemaau9g5z0/test.csv?dl=0).') st.markdown( 'Com essa ferramenta conseguimos definir quantas linhas queremos visualizar em nosso dataframe, ' 'podemos definir um *"slider"* e passar como parâmetro do "*head()*"' ) number_df = st.slider('Quantidade de linhas a serem exibidas:', min_value=5, max_value=15) st.dataframe(train.head(number_df)) st.markdown('Tipo de dados da base (exibindo com o *"table"*): ') base = pd.DataFrame({ 'treino': train.dtypes.value_counts(), 'teste': test.dtypes.value_counts() }) st.table(base) # Copiando o dataframe de treino e adicionando somente as colunas do dataframe de teste train_df = train.copy() train_df = train_df[test.columns] # salvando o index dos dados train_idx = train_df.shape[0] # Criando o dataframe com as features da variável train features_train = pd.DataFrame({ 'TP_PRESENCA_MT': train['TP_PRESENCA_MT'], 'NU_NOTA_MT': train['NU_NOTA_MT'] }) train_df = pd.concat([train_df, features_train], axis=1) train_df = pd.concat(objs=[train_df, test], axis=0, sort=False).reset_index(drop=True) # Excluindo algumas features que não utilizaremos. train_df.drop([ 'NU_INSCRICAO', 'CO_PROVA_CN', 'CO_PROVA_CH', 'CO_PROVA_LC', 'CO_PROVA_MT', 'IN_BAIXA_VISAO', 'IN_CEGUEIRA', 'IN_DISCALCULIA', 'IN_DISLEXIA', 'IN_GESTANTE', 'IN_IDOSO', 'IN_SABATISTA', 'IN_SURDEZ', 'Q024', 'Q026', 'Q027' ], axis=1, inplace=True) base = pd.DataFrame({ 'tipo': train_df.dtypes, 'nulos': train_df.isnull().mean(), 'size': (train_df.shape[0] - train_df.isnull().sum()), 'unicos': train_df.nunique() }) base.index.name = 'coluna' base = base.reset_index() train_df.drop(['TP_DEPENDENCIA_ADM_ESC', 'TP_ENSINO'], axis=1, inplace=True) base.drop([10, 12], inplace=True) st.header('Analisando a base') st.markdown( "Conseguimos ver que o estado de São Paulo teve o maior número de candidados, seguido por Ceará e " "Minas Gerais *e também podemos utilizar o plotly facilmente*") data = [ go.Bar(x=train_df['SG_UF_RESIDENCIA'].value_counts().index, y=train_df['SG_UF_RESIDENCIA'].value_counts()) ] layout = go.Layout(title='Candidatos por estado') fig = go.Figure(data=data, layout=layout) st.plotly_chart(fig) st.markdown( "Fazendo a verificação por sexo, conseguimos observar que as mulheres tiveram uma maior " "participação na prova. *Nos gráficos, imagens, etc., o " "[Streamlit](https://docs.streamlit.io/index.html) nos dá a opção de expandi-lo, colocando o " "ponteiro do mouse em cima é exibido uma seta no canto superior direito*" ) sns.catplot(x='SG_UF_RESIDENCIA', col='TP_SEXO', kind='count', height=6, aspect=1.2, data=train) st.pyplot() st.markdown( "Observamos abaixo a distribuição de idade dos participantes. *Apenas adicionamos 'st.pyplot()' " "depois de montarmos o gráfico*") sns.distplot(train['NU_IDADE']) plt.xlabel('') plt.title("Distribuição por idade", {'fontsize': 20}) st.pyplot() train['NU_NOTA_PROVAS'] = (train['NU_NOTA_CH'] + train['NU_NOTA_CN'] + train['NU_NOTA_LC'] + train['NU_NOTA_MT']) / 4 st.markdown( 'Na redação temos alguns pontos que são observados no caso de fugir ao tema, for anulada, entre outros. 
' '*Tabela gerada com o "st.table()"*') redacao_index = train_df['TP_STATUS_REDACAO'].value_counts().index redacao_values = train_df['TP_STATUS_REDACAO'].value_counts().values redacao = pd.DataFrame({ 'tipo': redacao_index.astype(int), 'valores': redacao_values }) redacao['descricao'] = redacao['tipo'].map({ 1: 'Sem problemas', 2: 'Anulada', 3: 'Cópia texto motivador', 4: 'Em branco', 5: 'Fere direitos autorais', 6: 'Fuga ao tema', 7: 'Não atendimento ao tipo', 8: 'Texto insuficiente', 9: 'Parte desconectada' }) st.table(redacao[['valores', 'descricao']]) data = [go.Bar(y=redacao['valores'], x=redacao['descricao'])] layout = go.Layout(title='Situação da Redação') fig = go.Figure(data=data, layout=layout) st.plotly_chart(fig) st.markdown( 'Visualizando agora as notas das provas por estado. *Utilizando o plotly novamente que fica muito ' 'mais fácil para identificar os valores.*') data = [go.Box(x=train['SG_UF_RESIDENCIA'], y=train['NU_NOTA_PROVAS'])] layout = go.Layout(title='Nota das provas por estado') fig = go.Figure(data=data, layout=layout) st.plotly_chart(fig) data = go.Box(x=train_df['SG_UF_RESIDENCIA'], y=train_df['NU_NOTA_REDACAO']) layout = go.Layout(title='Nota de redação por estado') fig = go.Figure(data=data, layout=layout) st.plotly_chart(fig) st.markdown( "Descrevendo agora o questionário socioeconômico. O título do gráfico corresponde as perguntas realizadas. " "*Colocamos o 'plt.figure(figsize=(x, y))' antes de iniciar a construção do gráfico, com isso conseguimos " "alterar o tamanho da imagem*") plt.figure(figsize=(18, 10)) sns.boxplot(data=train, y='Q001', x='NU_NOTA_PROVAS', order='ABCDEFGH') plt.title( 'Até que série seu pai, ou o homem responsável por você, estudou?', {'fontsize': 15}) plt.yticks(ticks=[0, 1, 2, 3, 4, 5, 6, 7], labels=[ 'Nunca Estudou', 'Não Completou 4ª/5ª série', 'Não completou 8ª série', 'Não completou Ensino Médio', 'Completou Ensino Médio', 'Completou Faculdade', 'Completou Pós-Graduação', 'Não sei' ]) plt.xlabel("NOTA PROVA") plt.ylabel('') st.pyplot() sns.boxplot(data=train, y='Q002', x='NU_NOTA_PROVAS', order='ABCDEFGH') plt.title( 'Até que série sua mãe, ou a mulher responsável por você, estudou?', {'fontsize': 15}) plt.yticks(ticks=[0, 1, 2, 3, 4, 5, 6, 7], labels=[ 'Nunca Estudou', 'Não Completou 4ª/5ª série', 'Não completou 8ª série', 'Não completou Ensino Médio', 'Completou Ensino Médio', 'Completou Faculdade', 'Completou Pós-Graduação', 'Não sei' ]) plt.xlabel("NOTA PROVA") plt.ylabel('') st.pyplot() st.subheader("Tratando os dados e realizando a previsão") st.markdown( "Depois dessas análises, chegou a hora de prepar os dados para a previsão. " "Primeiro realizei o tratamento imputando o valor 0 (zero) na prova daqueles " "candidatos que estavam com com status diferente de “1 = Presente na prova”. *Para exibir o código " "que escrevi eu usei o st.echo() que insere uma notação e ao mesmo tempo executa o código. Bem " "simples né!? 
(Exibindo apenas algumas linhas)*") with st.echo(): train_df.loc[train_df['TP_PRESENCA_CH'] != 1, 'NU_NOTA_CH'] = train_df.loc[ train_df['TP_PRESENCA_CH'] != 1, 'NU_NOTA_CH'].fillna(0) train_df.loc[train_df['TP_PRESENCA_CN'] != 1, 'NU_NOTA_CN'] = train_df.loc[ train_df['TP_PRESENCA_CN'] != 1, 'NU_NOTA_CN'].fillna(0) train_df.loc[train_df['TP_PRESENCA_MT'] != 1, 'NU_NOTA_MT'] = train_df.loc[ train_df['TP_PRESENCA_MT'] != 1, 'NU_NOTA_MT'].fillna(0) base[base.nulos > 0].sort_values(['nulos', 'coluna']) # imputando o valor 0 para os candidatos que estão com o status diferente de "1 = Presente na prova" train_df.loc[train_df['TP_PRESENCA_CH'] != 1, 'NU_NOTA_CH'] = train_df.loc[ train_df['TP_PRESENCA_CH'] != 1, 'NU_NOTA_CH'].fillna(0) train_df.loc[train_df['TP_PRESENCA_CN'] != 1, 'NU_NOTA_CN'] = train_df.loc[ train_df['TP_PRESENCA_CN'] != 1, 'NU_NOTA_CN'].fillna(0) train_df.loc[train_df['TP_PRESENCA_MT'] != 1, 'NU_NOTA_MT'] = train_df.loc[ train_df['TP_PRESENCA_MT'] != 1, 'NU_NOTA_MT'].fillna(0) train_df.loc[train_df['TP_PRESENCA_LC'] != 1, 'NU_NOTA_LC'] = train_df.loc[ train_df['TP_PRESENCA_LC'] != 1, 'NU_NOTA_LC'].fillna(0) train_df.loc[train_df['TP_PRESENCA_LC'] != 1, 'NU_NOTA_REDACAO'] = train_df.loc[ train_df['TP_PRESENCA_LC'] != 1, 'NU_NOTA_REDACAO'].fillna(0) train_df.loc[train_df['TP_PRESENCA_LC'] != 1, 'NU_NOTA_COMP1'] = train_df.loc[ train_df['TP_PRESENCA_LC'] != 1, 'NU_NOTA_COMP1'].fillna(0) train_df.loc[train_df['TP_PRESENCA_LC'] != 1, 'NU_NOTA_COMP2'] = train_df.loc[ train_df['TP_PRESENCA_LC'] != 1, 'NU_NOTA_COMP2'].fillna(0) train_df.loc[train_df['TP_PRESENCA_LC'] != 1, 'NU_NOTA_COMP3'] = train_df.loc[ train_df['TP_PRESENCA_LC'] != 1, 'NU_NOTA_COMP3'].fillna(0) train_df.loc[train_df['TP_PRESENCA_LC'] != 1, 'NU_NOTA_COMP4'] = train_df.loc[ train_df['TP_PRESENCA_LC'] != 1, 'NU_NOTA_COMP4'].fillna(0) train_df.loc[train_df['TP_PRESENCA_LC'] != 1, 'NU_NOTA_COMP5'] = train_df.loc[ train_df['TP_PRESENCA_LC'] != 1, 'NU_NOTA_COMP5'].fillna(0) # alterando o variável TP_SEXO train_df['TP_SEXO'] = train_df['TP_SEXO'].map({'M': 1, 'F': 0}) label_encoder = LabelEncoder() train_df['Q001'] = label_encoder.fit_transform(train_df['Q001']) train_df['Q002'] = label_encoder.fit_transform(train_df['Q002']) train_df['Q006'] = label_encoder.fit_transform(train_df['Q006']) train_df['Q025'] = label_encoder.fit_transform(train_df['Q025']) train_df['Q047'] = label_encoder.fit_transform(train_df['Q047']) st.markdown( "Exibindo o mapa de correlação novamente, temos agora novas _features_ com forte correlação." ) st.pyplot(fig=correlacao(train_df)) # train_df.drop(['SG_UF_RESIDENCIA', 'TP_SEXO', 'TP_COR_RACA', 'TP_NACIONALIDADE', 'TP_ST_CONCLUSAO', # 'TP_ANO_CONCLUIU', 'TP_LINGUA', 'TP_STATUS_REDACAO'], axis=1, inplace=True) # Features que foram removidas para utilizar no modelo de deploy train_df.drop([ 'SG_UF_RESIDENCIA', 'TP_SEXO', 'TP_COR_RACA', 'TP_NACIONALIDADE', 'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_LINGUA', 'TP_STATUS_REDACAO', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'Q001', 'Q002', 'Q006', 'Q025', 'Q047' ], axis=1, inplace=True) # Recuperando nosso dataset train = train_df.iloc[:train_idx] test = train_df.iloc[train_idx:] st.markdown( 'No desafio tinhamos que submeter um arquivo csv com a resposta do modelo treinado. ' 'Nesse caso criei o pŕopio dataset de validação baseado nos dados de treino ' 'e verifiquei como está a performance do modelo. 
*Utilizei st.echo() para exibir o código ' 'e já executa-lo*') # Salvando a variável para realizar o treino futuramente e dividindo o dado de treino para validação with st.echo(): target = train['NU_NOTA_MT'] train.drop(['NU_NOTA_MT', 'TP_PRESENCA_MT'], axis=1, inplace=True) X_train, X_test, y_train, y_test = train_test_split( train, target, test_size=0.25, random_state=42) test.drop(['NU_NOTA_MT', 'TP_PRESENCA_MT'], axis=1, inplace=True) # Trabalhando com os modelos de Machine Learning st.subheader( "Utilizando *Linear Regression* e *Random Forest Regressor*.") st.markdown("___Linear Regression___") lr = LinearRegression() lr_model = lr.fit(X_train, y_train) # y_pred = lr_model.predict(X_test) st.write( f'Acurácia do modelo: {round(lr_model.score(X_test, y_test), 3)}%') st.markdown("___Random Forest Regressor___") rf = RandomForestRegressor(n_jobs=-1) rf_model = rf.fit(X_train, y_train) # y_pred_rf = rf_model.predict(X_test) st.write( f'Acurácia do modelo: {round(rf_model.score(X_test, y_test), 3)}%') # Salvando o modelo do Random Forest, com esse método o arquivo fica cerca de 62MB # pickle.dump(rf_model, open('rf_model.pkl', 'wb')) # Salvando o modelo do Random Forest com o gzip para compacta-lo, dessa forma o arquivo fica cerca de 15MB # gzip.GzipFile('rf_model.pkl', 'wb').write(pickle.dumps(rf_model, protocol=0)) elif option == 'Predição': # Carregando o modelo para fazer a predição # rf_model = pickle.load(open('rf_model.pkl', 'rb')) # Carregando o modelo caso utilizar o gzip para compactar o arquivo. @st.cache(allow_output_mutation=True) def load_modelo(): zip = gzip.GzipFile('rf_model.pkl', 'rb') rf_model = pickle.load(zip) return rf_model rf_model = load_modelo() st.header('Realizando a predição da nota:') st.subheader( 'Como o melhor modelo foi o *Random Forest* vou utilizar ele para fazer a predição' ) # Gerando um dicionário dos estados e cada um com seu respectivo número do dataframe estados_op = { 0: '---', 12: 'Acre', 27: "Alagoas", 16: "Amapá", 13: "Amazonas", 29: "Bahia", 23: "Ceará", 53: "Distrito Federal", 32: "Espirito Santo", 52: "Goiás", 21: "Maranhão", 51: "Mato Grosso", 50: "Mato Grosso do Sul", 31: "Minas Gerais", 15: "Pará", 25: "Paraíba", 41: "Paraná", 26: "Pernambuco", 22: "Piauí", 33: "Rio de Janeiro", 24: "Rio Grande do Norte", 43: "Rio Grande do Sul", 11: "Rondônia", 14: "Roraima", 42: "Santa Catarina", 35: "São Paulo", 28: "Sergipe", 17: "Tocantins", } # options -> É a chave/valor/código que está sendo retornado. # format_func -> lambda que retorna na página o nome da opção selecionada. estados = st.selectbox('Estado', options=list(estados_op.keys()), format_func=lambda x: estados_op[x]) idade = st.number_input('Idade', min_value=13, max_value=80, step=1, value=18, key='idade') # Em 'options' estamos informando que queremos as chaves do dicionário escola_op = { 1: 'Não respondeu', 2: 'Pública', 3: 'Privada', 4: 'Exterior' } escola = st.selectbox('Escola', index=1, key='escola', options=list(escola_op.keys()), format_func=lambda x: escola_op[x]) # Não é necessário ser apenas dicionário, podemos ter uma lista e pegar os valores dela. 
treino_op = ('Não', 'Sim') treino = st.selectbox('Fez somente para treino?', index=0, key='treino', options=list(range(len(treino_op))), format_func=lambda x: treino_op[x]) st.subheader('Presença e Nota nas provas') st.markdown('Ciências da Natureza') pr_prova_cn_op = ('Não', 'Sim') pr_prova_cn = st.selectbox('Presença', index=1, key='pr_prova_cn', options=list(range(len(pr_prova_cn_op))), format_func=lambda x: pr_prova_cn_op[x]) if pr_prova_cn == 0: # se não estiver presente, a nota será 0 nt_cn = st.number_input('Nota', value=0.0, min_value=0.0, max_value=0.0, key='nt_cn') else: nt_cn = st.number_input('Nota', min_value=0.0, max_value=1000.0, value=0.0, step=5.0, key='nt_cn') st.markdown('---') st.markdown('Ciências Humanas') pr_prova_ch_op = ('Não', 'Sim') pr_prova_ch = st.selectbox('Presença', index=1, key='pr_prova_ch', options=list(range(len(pr_prova_ch_op))), format_func=lambda x: pr_prova_ch_op[x]) if pr_prova_ch == 0: # se não estiver presente, a nota será 0 nt_ch = st.number_input('Nota', value=0.0, min_value=0.0, max_value=0.0, key='nt_ch') else: nt_ch = st.number_input('Nota', min_value=0.0, max_value=1000.0, value=0.0, step=5.0, key='nt_ch') st.markdown('---') st.markdown('Linguagem e Códigos') pr_prova_lc_op = ('Não', 'Sim') pr_prova_lc = st.selectbox('Presença', index=1, key='pr_prova_lc', options=list(range(len(pr_prova_lc_op))), format_func=lambda x: pr_prova_lc_op[x]) if pr_prova_lc == 0: # se não estiver presente, a nota será 0 nt_lc = st.number_input('Nota prova', value=0.0, min_value=0.0, max_value=0.0, key='nt_lc') nt_redacao = st.number_input('Nota redação', value=0.0, min_value=0.0, max_value=0.0, key='nt_redacao') else: nt_lc = st.number_input('Nota prova', min_value=0.0, max_value=1000.0, value=0.0, step=5.0, key='nt_lc') nt_redacao = st.number_input('Nota redação', min_value=0.0, max_value=1000.0, value=0.0, step=5.0, key='nt_redacao') st.markdown('') st.markdown('') if st.button('Fazer previsão:'): int_features = [ estados, idade, escola, treino, pr_prova_cn, pr_prova_ch, pr_prova_lc, nt_cn, nt_ch, nt_lc, nt_redacao ] final_features = [np.array(int_features)] prediction = rf_model.predict(final_features) output = round(prediction[0], 2) st.write('Nota prevista: ', output) else: st.subheader('Sobre mim:') st.markdown( 'Uma pessoa que gostou de trabalhar com dados e viu que pode ser gerado muito valor através deles. ' 'Foi com esse intuito que comecei a fazer cursos e iniciar a pós-graduação em Ciência de Dados e Big Data.' ) st.markdown( '* Pós-Graduando em Ciência de Dados e Big Data pela PUC Minas. _(10/2020)_ \n' '* Acelera Dev Data Science - Codenation _(12/2019)_ \n' '* Data Science de A a Z - Udemy _(07/2019)_ \n' '* Graduação em Sistemas de informação pela Faculdades Promove. _(12/2016)_ *trancado* \n' '* Graduação Tecnológica em Redes de computadores pela Faculdades Promove. _(06/2014)_' )
if idx:
    input_masked = tokenized

if input_masked:
    st.markdown('#### Input:')
    ids = tokenized['input_ids'].tolist()[0]
    subwords = unmasker.tokenizer.convert_ids_to_tokens(ids)
    st.markdown(f'<p dir="rtl">{display_input}</p>', unsafe_allow_html=True)

    st.markdown('#### Outputs:')
    res = unmasker(input_masked,
                   tokenized=masking_level == 'SubWords',
                   top_k=n_res)
    if res:
        res = [{'Prediction': r['token_str'],
                'Completed Sentence': r['sequence'].replace('[SEP]', '').replace('[CLS]', ''),
                'Score': r['score']}
               for r in res]
        res_table = pd.DataFrame(res)
        st.table(res_table)

# cols = st.beta_columns(len(tokens))
# genre = st.radio('Select token to mask:', tokens)
# for col, token in zip(cols, reversed(tokens)):
#     col.text(token)
# st.text(tokens)
# res = unmasker(input_text)
# res_table = pd.DataFrame(res)
# st.table(res_table)
# st.text(res)
# Remove the URL field, since we do not need it
df.drop(['URL'], axis=1, inplace=True)

# Correlation analysis between attributes in the data set
df_corr = df.corr().stack().reset_index().rename(columns={
    0: 'correlation',
    'level_0': 'Y',
    'level_1': 'X'
})
df_corr['correlation_label'] = df_corr['correlation'].map('{:.3f}'.format)

if st.checkbox('Show correlation sample'):
    '''
    The pairwise correlation of all attributes in the data set.
    '''
    st.table(df_corr.head())

# Visualize the correlation using a heat map
base = alt.Chart(df_corr).encode(x='X:O', y='Y:O')

# Text layer with correlation labels.
# Colors are chosen for easier readability.
text = base.mark_text().encode(text='correlation_label',
                               color=alt.condition(alt.datum.correlation > 0.5,
                                                   alt.value('white'),
                                                   alt.value('black')))

'''
Visualization of the correlation of features using a heat map.
The magnitude of correlation between the attributes is strong.
'''

# The correlation heatmap itself
cor_plot = base.mark_rect().encode(color='correlation:Q')
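# The snippet stops before the layers are combined; a plausible completion, assuming
# the label layer is drawn on top of the heatmap and rendered with `st.altair_chart`
# (the author's exact rendering call is not shown in this excerpt).
st.altair_chart(cor_plot + text, use_container_width=True)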
    s_contract, s_timetable, s_rng_years, s_rng_adjust, s_axs_target,
    s_year_ini, s_year_end
]


def color_change(val):
    """
    Takes a scalar and returns a string with a css `color` property.
    Currently every cell is styled blue.
    """
    color = 'blue'
    return 'color: %s' % color


st.table(my_t0.style.applymap(color_change))

################################################################
# FROM SELECTION
# Upload the file with my data already preprocessed
min_tic = 0.25
axs_limit = 1
zero = '0'
key_config = s_contract + s_timetable[0:2] + s_timetable[3:5] + s_timetable[6:8] + s_timetable[9:]
key_config = key_config + str(s_rng_years) + str(s_rng_adjust)
tmp = subprocess.run([
    '../RangingOperating',
    s_contract,
    s_timetable,
setup_2_3_shims(globals())

import streamlit as st
import numpy as np
import pandas as pd
from datetime import datetime

st.title('Apocrypha')
st.write('The crypt of top secret _undocumented_ Streamlit API calls.')

st.header('Tables')
with st.echo():
    arrays = [
        np.array(['bar', 'bar', 'baz', 'baz', 'foo', None, 'qux', 'qux']),
        np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])
    ]
    df = pd.DataFrame(np.random.randn(8, 4),
                      index=arrays,
                      columns=[datetime(2012, 5, 1), datetime(2012, 5, 2),
                               datetime(2012, 5, 3), datetime(2012, 5, 4)])
    st.subheader("A table")
    st.table(df)
    st.subheader("...and its transpose")
    st.table(df.T)

st.header('Maps')
st.warning('TODO: Need to document the st.map() API here.')

st.balloons()
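# For the st.map() TODO above: a minimal usage sketch (not from the original file).
# st.map expects a DataFrame with latitude/longitude columns named `lat`/`lon`;
# the random points scattered around San Francisco are purely illustrative.
with st.echo():
    map_df = pd.DataFrame(
        np.random.randn(100, 2) / [50, 50] + [37.76, -122.4],
        columns=['lat', 'lon'])
    st.map(map_df)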
def calcular_almancenamiento_picking(datos_sku, datos_embarques): """ docstring """ datos_por_embarque = pd.merge(datos_sku, datos_embarques, on='ID del Producto', how='outer') datos_por_embarque['Fecha de Embarque'] = datos_por_embarque[ 'Fecha de Embarque'].dt.strftime('%Y-%m-%d') datos_por_embarque.fillna(value=0, inplace=True) datos_por_embarque['Cajas Picking'] = datos_por_embarque[ 'Cajas Embarcadas'].mod(datos_por_embarque['Cajas x Tarima']) datos_por_embarque['Tarimas Picking'] = datos_por_embarque[ 'Cajas Picking'] / datos_por_embarque['Cajas x Tarima'] # st.write('Obtuve Cajas Picking y Tarimas Picking') # st.write(datos_por_embarque) ########################################### suma_tarimas_picking = datos_por_embarque.groupby( ['ID del Producto'], as_index=False)['Tarimas Picking'].agg({'Suma Tarimas Picking': 'sum'}) frequencia_pickeo = datos_por_embarque.groupby( ['ID del Producto', 'Fecha de Embarque'])['Cajas Embarcadas'].size() frequencia_pickeo = frequencia_pickeo.reset_index() frequencia_pickeo = frequencia_pickeo['ID del Producto'].value_counts() frequencia_pickeo = frequencia_pickeo.reset_index() frequencia_pickeo.columns = ['ID del Producto', 'Frecuencia Pickeo'] datos_por_sku = pd.merge(suma_tarimas_picking, frequencia_pickeo, on='ID del Producto', how='outer') datos_por_sku.fillna(value=0, inplace=True) datos_por_sku['Media de Tarimas Picking Diarias'] = datos_por_sku[ 'Suma Tarimas Picking'] / datos_por_sku['Frecuencia Pickeo'] # st.write('Obtuve Suma de Tarimas Picking, Frecuencia de Pickeo y Media de Tarimas Picking Diarias') # st.write(datos_por_sku) # ########################################### datos_por_embarque = pd.merge( datos_por_embarque, datos_por_sku[['ID del Producto', 'Media de Tarimas Picking Diarias']], on='ID del Producto', how='outer') datos_por_embarque['Diferencia Media Cuadrada'] = ( datos_por_embarque['Tarimas Picking'] - datos_por_embarque['Media de Tarimas Picking Diarias'])**2 # st.write('Obtuve Diferencia Media Cuadrada') # st.write(datos_por_embarque) ########################################### suma_dif_media_cuadrada = datos_por_embarque.groupby( ['ID del Producto'], as_index=False)['Diferencia Media Cuadrada'].agg( {'Suma Diferencia Media Cuadrada': 'sum'}) datos_por_sku = pd.merge(datos_por_sku, suma_dif_media_cuadrada, on='ID del Producto', how='outer') # st.write('Obtuve Suma Diferencia Media Cuadrada') # st.write(datos_por_sku) ########################################### datos_por_sku['Media de Tarimas Picking Diarias Cuadrada'] = datos_por_sku[ 'Media de Tarimas Picking Diarias']**2 datos_por_sku['Dias sin Ventas'] = datos_por_embarque[ 'Fecha de Embarque'].nunique() - datos_por_sku['Frecuencia Pickeo'] datos_por_sku['Media de los Dias sin Ventas'] = datos_por_sku[ 'Dias sin Ventas'] * datos_por_sku[ 'Media de Tarimas Picking Diarias Cuadrada'] datos_por_sku['Desviacion Estandar de los Dias sin Ventas'] = ( (datos_por_sku['Media de los Dias sin Ventas'] + datos_por_sku['Suma Diferencia Media Cuadrada']) / (datos_por_embarque['Fecha de Embarque'].nunique()))**0.5 datos_por_sku[ 'Proteccion 99%'] = datos_por_sku['Media de los Dias sin Ventas'] + ( 3 * datos_por_sku['Desviacion Estandar de los Dias sin Ventas']) datos_por_sku['Rounded Proteccion 99%'] = datos_por_sku[ 'Proteccion 99%'].round() # # st.write('Obtuve Media de Tarimas Picking Diarias Cuadrada') # st.write(datos_por_embarque) # datos_por_sku = datos_por_sku[['ID del Producto', 'Media de los Dias sin Ventas', 'Desviacion Estandar de los Dias sin Ventas', 'Proteccion 99%', 
'Rounded Proteccion 99%']] # datos_por_sku.columns = ['ID del Producto', 'Picking Diario de Tarimas Promedio', 'Desviacion Estandar', 'Proteccion 99%', 'Proteccion 99% Redondeado'] datos_por_sku = datos_por_sku[[ 'ID del Producto', 'Media de los Dias sin Ventas' ]] datos_por_sku.columns = [ 'ID del Producto', 'Picking Diario de Tarimas Promedio' ] # st.write(datos_por_sku) # promedio = 220.33 # desviacion_estandar = 192.13 promedio = datos_por_sku['Picking Diario de Tarimas Promedio'].mean() desviacion_estandar = datos_por_sku[ 'Picking Diario de Tarimas Promedio'].std() dict_resumen = { 'Promedio de Tarimas por Dia': [promedio], 'Desviación Estándar': [desviacion_estandar], 'Proteccion 99%': [NormalDist(mu=promedio, sigma=desviacion_estandar).inv_cdf(0.99)], 'Proteccion 95%': [NormalDist(mu=promedio, sigma=desviacion_estandar).inv_cdf(0.95)], 'Proteccion 90%': [NormalDist(mu=promedio, sigma=desviacion_estandar).inv_cdf(0.90)] } tabla_resumen = pd.DataFrame.from_dict(dict_resumen).fillna(0) st.table(tabla_resumen.style.format('{:,.1f}')) with pd.ExcelWriter("./data/cedis_almacenamiento_picking.xlsx") as writer: tabla_resumen.to_excel(writer, sheet_name="Resumen de Tarimas para Picking", index=False) datos_por_sku.to_excel(writer, sheet_name="Tarimas para Picking Diario", index=False)
def main(): st.image('logo.png', width=200) st.title('AceleraDev Data Science') st.subheader('Semana 3 - Análise de dados exploratória') st.image('https://media.giphy.com/media/R8bcfuGTZONyw/giphy.gif', width=200) file = st.file_uploader( 'Escolha a base de dados que deseja analisar (.csv)', type='csv') if file is not None: st.subheader('Estatística descritiva univariada') df = pd.read_csv(file) aux = pd.DataFrame({"colunas": df.columns, 'tipos': df.dtypes}) num_columns = list(aux[aux['tipos'] != 'object']['colunas']) cat_columns = list(aux[aux['tipos'] == 'object']['colunas']) columns = list(df.columns) col = st.selectbox('Selecione a coluna :', num_columns) if col is not None: st.markdown('Selecione o que deseja analisar :') is_mean = st.checkbox('Média') if is_mean: st.markdown(df[col].mean()) is_median = st.checkbox('Mediana') if is_median: st.markdown(df[col].median()) is_std = st.checkbox('Desvio padrão') if is_std: st.markdown(df[col].std()) is_kurtosis = st.checkbox('Kurtosis') if is_kurtosis: st.markdown(df[col].kurtosis()) is_skewness = st.checkbox('Skewness') if is_skewness: st.markdown(df[col].skew()) is_describe = st.checkbox('Describe') if is_describe: st.table(df[num_columns].describe().transpose()) st.subheader('Visualização dos dados') st.image( 'https://media.giphy.com/media/Rkoat5KMaw2aOHDduz/giphy.gif', width=200) st.markdown('Selecione a visualizacao') is_hist = st.checkbox('Histograma') if is_hist: col_num = st.selectbox( 'Selecione a Coluna Numerica: ', num_columns, key='unique') st.markdown('Histograma da coluna : ' + str(col_num)) st.write(histogram(col_num, df)) is_bars = st.checkbox('Gráfico de barras') if is_bars: col_num_bars = st.selectbox( 'Selecione a coluna numerica: ', num_columns, key='unique') col_cat_bars = st.selectbox( 'Selecione uma coluna categorica : ', cat_columns, key='unique') st.markdown('Gráfico de barras da coluna ' + str(col_cat_bars) + ' pela coluna ' + col_num_bars) st.write(barplot(col_num_bars, col_cat_bars, df)) is_boxplot = st.checkbox('Boxplot') if is_boxplot: col_num_box = st.selectbox( 'Selecione a Coluna Numerica:', num_columns, key='unique') col_cat_box = st.selectbox( 'Selecione uma coluna categorica : ', cat_columns, key='unique') st.markdown('Boxplot ' + str(col_cat_box) + ' pela coluna ' + col_num_box) st.write(boxplot(col_num_box, col_cat_box, df)) is_scatter = st.checkbox('Scatterplot') if is_scatter: col_num_x = st.selectbox( 'Selecione o valor de x ', num_columns, key='unique') col_num_y = st.selectbox( 'Selecione o valor de y ', num_columns, key='unique') col_color = st.selectbox('Selecione a coluna para cor', columns) st.markdown('Selecione os valores de x e y') st.write(scatterplot(col_num_x, col_num_y, col_color, df)) is_correlation = st.checkbox('Correlacao') if is_correlation: st.markdown('Gráfico de correlação das colunas númericas') st.write(correlationplot(df, num_columns))
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index,
                            columns=data.columns)


# Create a table to be styled in various ways
np.random.seed(24)
df = pd.DataFrame({'A': np.linspace(1, 5, 5)})
df = pd.concat(
    [df, pd.DataFrame(np.random.randn(5, 4), columns=list('BCDE'))], axis=1)
df.iloc[0, 2] = np.nan

# Unstyled
st.table(df)

# Custom formatting
st.table(df.style.format('{:.2%}'))

# Colors
st.table(
    df.style.applymap(color_negative_red).apply(highlight_max,
                                                color='darkorange',
                                                axis=0))

# Add rows
x = st.table(
    df.style.set_properties(**{
        'background-color': 'black',
        'color': 'lawngreen',
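# `color_negative_red` and the Series branch of `highlight_max` are not part of
# this excerpt; the sketch below shows the conventional pandas-Styler helpers that
# the `.applymap(...)` / `.apply(...)` calls above appear to assume.
import numpy as np
import pandas as pd


def color_negative_red(val):
    # Element-wise styler: render negative numbers in red text
    color = 'red' if isinstance(val, (int, float)) and val < 0 else 'black'
    return 'color: %s' % color


def highlight_max(data, color='yellow'):
    # Highlight the maximum value; `attr` is applied per column (axis=0),
    # per row (axis=1) or over the whole frame (axis=None)
    attr = 'background-color: {}'.format(color)
    if data.ndim == 1:  # from .apply(axis=0) or .apply(axis=1)
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None) -- matches the fragment shown above
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)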
def stockmarket(tickertxt): movers = ya.get_day_most_active() st.table(movers.head()) # Right away we notice that stocks with negative price changes are also included in our results. A filter to get only stocks with a positive % change is applied to get our desired stocks # In[58]: movers = movers[movers['% Change'] >= 0] st.table(movers.head()) # Excellent! We have successfully scraped the data using the yahoo_fin python module. it is often a good idea to see if those stocks are also generating attention, and what kind of attention it is to avoid getting into false rallies. We will scrap some sentiment data courtesty of [sentdex](http://www.sentdex.com/financial-analysis/). Sometimes sentiments may lag due to source e.g Newsarticle published an hour after event, so we will also utilize [tradefollowers](https://www.tradefollowers.com/strength/twitter_strongest.jsp?tf=1d) for their twitter sentiment data. We will process both lists independently and combine them. For both the sentdex and tradefollowers data we use a 30 day time period. Using a single day might be great for day trading but increases probability of jumping on false rallies. # # NOTE: Sentdex only has stocks which belong to the S&P 500 # In[59]: res = requests.get('http://www.sentdex.com/financial-analysis/?tf=30d') soup = BeautifulSoup(res.text) table = soup.find_all('tr') # In[60]: stock = [] sentiment = [] mentions = [] sentiment_trend = [] for ticker in table: ticker_info = ticker.find_all('td') try: stock.append(ticker_info[0].get_text()) except: stock.append(None) try: sentiment.append(ticker_info[3].get_text()) except: sentiment.append(None) try: mentions.append(ticker_info[2].get_text()) except: mentions.append(None) try: if (ticker_info[4].find( 'span', {"class": "glyphicon glyphicon-chevron-up"})): sentiment_trend.append('up') else: sentiment_trend.append('down') except: sentiment_trend.append(None) company_info = pd.DataFrame( data={ 'Symbol': stock, 'Sentiment': sentiment, 'direction': sentiment_trend, 'Mentions': mentions }) st.table(company_info.head(50)) # We then combine these results with our results from the biggest movers on a given day. This done using a left join of this data frame with the original movers data frame # In[61]: top_stocks = movers.merge(company_info, on='Symbol', how='left') top_stocks.drop(['Market Cap', 'PE Ratio (TTM)'], axis=1, inplace=True) st.table(top_stocks.head(50)) # A couple of stocks pop up with both very good sentiments and an upwards trend in favourability. ZNGA, TWTR and AES for instance stood out as potentially good picks. Note, the mentions here refer to the number of times the stock was referenced according to the internal metrics used by [sentdex](sentdex.com). Let's attempt supplimenting this information with some data based on twitter. 
We get stocks that showed the strongest twitter sentiments with a time period of 1 month # In[62]: res = requests.get( "https://www.tradefollowers.com/strength/twitter_strongest.jsp?tf=1m") soup = BeautifulSoup(res.text) stock_twitter = soup.find_all('tr') # In[63]: twit_stock = [] sector = [] twit_score = [] for stock in stock_twitter: try: score = stock.find_all("td", {"class": "datalistcolumn"}) twit_stock.append(score[0].get_text().replace('$', '').strip()) sector.append(score[2].get_text().replace('\n', '').strip()) twit_score.append(score[4].get_text().replace('\n', '').strip()) except: twit_stock.append(np.nan) sector.append(np.nan) twit_score.append(np.nan) twitter_df = pd.DataFrame({ 'Symbol': twit_stock, 'Sector': sector, 'Twit_Bull_score': twit_score }) # Remove NA values twitter_df.dropna(inplace=True) twitter_df.drop_duplicates(subset="Symbol", keep='first', inplace=True) twitter_df.reset_index(drop=True, inplace=True) st.table(twitter_df.head()) # Twit_Bull_score refers to the internally scoring used at [tradefollowers](tradefollowers.com) to rank stocks based on twitter sentiments, and can range from 1 to as high as 10,000 or greater. With the twitter sentiments obtains, we combine it with our sentiment data to get an overall idea of the data. # In[64]: st.text("Final List") Final_list = top_stocks.merge(twitter_df, on='Symbol', how='left') st.table(Final_list) # Finally, we include a twitter momentum score. # In[65]: res2 = requests.get( "https://www.tradefollowers.com/active/twitter_active.jsp?tf=1m") soup2 = BeautifulSoup(res2.text) stock_twitter2 = soup2.find_all('tr') # In[66]: twit_stock2 = [] sector2 = [] twit_score2 = [] for stock in stock_twitter2: try: score2 = stock.find_all("td", {"class": "datalistcolumn"}) twit_stock2.append(score2[0].get_text().replace('$', '').strip()) sector2.append(score2[2].get_text().replace('\n', '').strip()) twit_score2.append(score2[4].get_text().replace('\n', '').strip()) except: twit_stock2.append(np.nan) sector2.append(np.nan) twit_score2.append(np.nan) twitter_df2 = pd.DataFrame({ 'Symbol': twit_stock2, 'Sector': sector2, 'Twit_mom': twit_score2 }) # Remove NA values st.text("Final List mit twitter") twitter_df2.dropna(inplace=True) twitter_df2.drop_duplicates(subset="Symbol", keep='first', inplace=True) twitter_df2.reset_index(drop=True, inplace=True) st.table(twitter_df2.head(50)) # We again combine the dataframes to earlier concatanated dataframes. This will form our recommender list # In[67]: st.text("Final List Recommandet") Recommender_list = Final_list.merge(twitter_df2, on='Symbol', how='left') Recommender_list.drop(['Volume', 'Avg Vol (3 month)'], axis=1, inplace=True) st.table(Recommender_list.head(50)) # Our list now contains even more informationt to help us with our trades. Stocks which it suggests might generate positive returns include TSLA, ZNGA and TWTR. There is also the posibility that we do not get a stock that falls in all our generated lists, so usage of, for instance, the price information and the twitter data could still give us a good idea of what to expect in terms of performance. As an added measure, we can also obtain information on the sectors to see how they've performed. Again, we will use a one month time period for comparison. The aforementioned stocks belong to the Technology and consumer staples sectors. 
# In[68]: sp = SectorPerformances(key='ZQ5ATHRTMUO7YUKR', output_format='pandas') time.sleep(10) plt.figure(figsize=(8, 8)) data, meta_data = sp.get_sector() st.text(meta_data) data['Rank D: Month Performance'].plot(kind='bar') plt.title('One Month Performance (%) per Sector') plt.tight_layout() plt.grid() st.pyplot(plt, use_container_width=True) #plt.show() # The industrials sector appears to be the best performing in this time period. Consumer staples appears to be doing better than IT, but overall they are up which bodes well for potential investors. Please note that this analysis is only a guide to find potentially positive return generating stocks. It is still up to the investor to do the research. # ## Part 2: Forecasting using an LSTM # # In this section, we will atetmpt to apply deep learning to a stock of our chosing to predict future prices. At the time this project was conceived, the stock AMD was selected as it experienced really high gains at the time. # First we obtain stock data for our chosen stock. Data from 2014 data up till August of 2020 was obtained for our analysis. Our data will be obtained from yahoo # In[69]: from datetime import datetime from datetime import date today = date.today() #today.replace("-",",") #print(today) # In[70]: start = datetime(2014, 12, 31) end = datetime(2021, 6, 3) #print(end) # In[71]: stock_dt = web.DataReader('AMD', 'yahoo', start, end) stock_dt.reset_index(inplace=True) st.table(stock_dt.head()) # In[72]: st.table(stock_dt.tail()) # ### Feature selection/engineering # # We add additional data that might potentially increase prediction accuracy. Here we use technical indicators. # In[73]: # Technical Indicators # RSI t_rsi = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas') time.sleep(15) data_rsi, meta_data_rsi = t_rsi.get_rsi(symbol='AMD', interval='daily', time_period=9, series_type='open') # SMA t_sma = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas') time.sleep(15) data_sma, meta_data_sma = t_sma.get_sma(symbol='AMD', interval='daily', time_period=9, series_type='open') #EMA t_ema = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas') time.sleep(15) data_ema, meta_data_ema = t_ema.get_ema(symbol='AMD', interval='daily', time_period=9, series_type='open') # In[74]: #On Balance volume t_obv = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas') time.sleep(15) data_obv, meta_data_obv = t_obv.get_obv(symbol='AMD', interval='daily') # Bollinger bands t_bbands = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas') time.sleep(15) data_bbands, meta_data_bb = t_bbands.get_bbands(symbol='AMD', interval='daily', series_type='open', time_period=9) # To learn more about technical indicators and how they are useful in stock analysis, I welcome you to explore [investopedia](https://www.investopedia.com/). 
Let's combine these indicators into a dataframe # In[75]: t_ind = pd.concat([data_ema, data_sma, data_rsi, data_obv, data_bbands], axis=1) t_ind # We then extract the values for the time interval of choice # In[76]: t_ind = t_ind.loc[start:end].reset_index() # Now we combine them with our original dataframe containing price and volume information # In[77]: df_updated = pd.concat([stock_dt, t_ind], axis=1) df_updated.set_index('Date', drop=True, inplace=True) st.table(df_updated.tail(20)) # Before we begin, it is often a good idea to visually inspect the stock data to have an idea of the price trend and volume information # In[78]: # In[79]: mpf.plot(df_updated.loc[datetime(2021, 5, 1):datetime(2021, 6, 3)], type='candle', style='yahoo', figsize=(8, 6), volume=True) # in the month of July, AMD experienced a massive price surge. Let's have a look at the data with the indicators included # In[80]: fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(12, 12)) ax[0].plot( df_updated['Open'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)], 'k', lw=2, label='Close') ax[0].plot( df_updated['EMA'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)], 'r', lw=1.5, label='EMA') ax[0].plot( df_updated['SMA'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)], 'b', lw=1.5, label='SMA') ax[0].plot(df_updated['Real Upper Band']. loc[datetime(2021, 5, 1):datetime(2021, 6, 11)], 'g', lw=1.5, label='Boolinger band (upper)') ax[0].plot(df_updated['Real Lower Band']. loc[datetime(2021, 5, 1):datetime(2021, 6, 11)], 'y', lw=1.5, label='Boolinger band (lower)') ax[0].set_ylabel('Closing price') ax[0].legend() temp = len( df_updated['RSI'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)]) ax[1].plot( df_updated['RSI'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)], 'g', lw=2, label='RSI') ax[1].plot( df_updated['RSI'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)]. index, 70 * np.ones((temp, 1)).flatten(), 'k') ax[1].plot( df_updated['RSI'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)]. index, 30 * np.ones((temp, 1)).flatten(), 'k') ax[1].set_ylabel('RSI') #ax[1].legend() ax[2].plot( df_updated['OBV'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)], 'y', lw=2, label='OBV') ax[2].set_ylabel('On balance Volume') #ax[2].legend() ax[2].set_xlabel('Date') st.pyplot(fig) # Indicators give us an idea of the direction of future prices. For instance, the Exponential moving average (EMA) crossing the Simple moving average (SMA) might indicate a positive uptrend in price. RSI gives us an idea of how much the stock is being bought or sold. An RSI of 70 for instance might indicate an overbought stock, and tells us the price is very likely to go down in the future, while an RSI of 30 indicates an oversold stock and could potentially be a good buy point for a stock. On balance volume gives us the relative changes in volume, and can potentially identify true rallies or breakouts. Bollinger bands provide an idea of the volatility of the stock. # # We also want to take into account relative changes between trading days as they tend to be less volatile, and therefore a bit more stationary. We will take the difference between two consecutive days in this case. 
# In[81]: df_updated['Diff_Open'] = df_updated['Open'] - df_updated['Open'].shift(1) df_updated['Diff_Close'] = df_updated['Close'] - df_updated['Close'].shift( 1) df_updated[ 'Diff-Volume'] = df_updated['Volume'] - df_updated['Volume'].shift(1) df_updated['Diff-High'] = df_updated['High'] - df_updated['High'].shift(1) df_updated['Diff-Low'] = df_updated['Low'] - df_updated['Low'].shift(1) df_updated['Diff-Close (forward)'] = np.where( df_updated['Close'].shift(-1) > df_updated['Close'], 1, -1) df_updated['High-Low'] = df_updated['High'] - df_updated['Low'].shift(1) df_updated['Open-Close'] = df_updated['Open'] - df_updated['Close'].shift( 1) df_updated['Returns'] = df_updated['Open'].pct_change(1) # In[82]: st.table(df_updated.head()) # The next step is to visualize how the features relate to each other. We employ a correlation matrix for this purpose # In[83]: df_updated.drop(['date', 'Real Middle Band', 'Adj Close'], axis=1, inplace=True) # In[84]: plt.figure(figsize=(12, 8)) sns.heatmap(df_updated.corr()) # The closing price has very strong correlations with some of the other price informations such as opening price, highs and lows. # On the other hands, the differential prices arn't as correlated. We want to limit the amount of colinearity in our system before running any machine learning routine. So feature selection is a must. # ### Feature Selection # # We utilize two means of feature selection in this section. Random forests and mutual information gain. Random forests are # very popular due to their relatively good accuracy, robustness as well as simplicity in terms of utilization. They can directly measure the impact of each feature on accuracy of the model and in essence give them a rank. Information gain on the other hand, calculates the reduction in entropy from transforming a dataset in some way. Mutual information gain essentially evaluates the gain of each variable in the context of the target variable. # In[85]: # ### Random forest regressor # In[88]: # Seperate the target variable from the features y = df_updated['Close'].iloc[1:].dropna() X = df_updated.drop(['Close'], axis=1).iloc[1:].dropna() #print("y-Band: ",y.count) #print("x-band: ",X.count) # In[89]: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # In[90]: X_train.shape, y_train.shape # In[92]: feat = SelectFromModel( RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)) feat.fit(X_train, y_train) feat.get_support() # In[93]: X_train.columns[feat.get_support()] # The regressor essentially selected the features that displayed good correlation with the Close price. However, although it selected the most important we would like information on the information gain from each variable. An issue with using random forests is it tends to diminsh the importance of other correlated variables and may lead to incorrect interpretation. However, it does help reduce overfitting # ### Mutual information gain # In[94]: # In[96]: mi = mutual_info_regression(X_train, y_train) mi = pd.Series(mi) mi.index = X_train.columns mi.sort_values(ascending=False, inplace=True) # In[97]: st.table(mi.head(50)) # The results validate the results using the random forest regressor, but it appears some of the other variables also contribute # a decent amount of information. We will select values greater than 2 for our analysis. 
# In[98]:

sel = SelectKBest(mutual_info_regression, k=8).fit(X_train, y_train)

Features = X_train.columns[sel.get_support()]
Features.values

# ### Preprocessing
#
# In order to construct a long short-term memory neural network (LSTM), we need to understand its structure. Below is the design of a typical LSTM unit. Data source: [Researchgate](https://www.researchgate.net/publication/334268507_Application_of_Long_Short-Term_Memory_LSTM_Neural_Network_for_Flood_Forecasting)

# ![LSTM_structure.jpg](LSTM_structure.jpg)

# As mentioned earlier, LSTMs are a special type of recurrent neural network (RNN). In an RNN, the output of a layer is fed back to the input layer multiple times so that the network can learn from past data; it is essentially trying to learn data that follows a sequence. However, because RNNs rely on past data, they can become computationally expensive due to storing large amounts of data in memory. The LSTM mitigates this issue using gates. It has a cell state and three gates: the forget, input and output gates (a short illustrative sketch of these gate equations appears at the end of this subsection).
#
# The cell state is essentially the memory of the network; it carries information throughout the processing of the sequence. Information is added to or removed from this cell state using the gates. At the forget gate, information from the previous hidden state and the current input are combined and passed through a sigmoid function, which determines what to keep and what to forget. The transformed values are then multiplied by the current cell state.
#
# Next, the information from the previous hidden state combined with the input is passed through a sigmoid function to again pick out important information, and also through a tanh function that squashes the data between -1 and 1. This transformation helps with the stability of the network and with the vanishing/exploding gradient problem. These two outputs are multiplied together, and the result is added to the current cell state (after the forget gate has acted on it) to give us the new cell state for the next time step.
#
# Finally, the previous hidden state and the current input are combined and passed through a sigmoid function. The new cell state is passed through a tanh function, and the two outputs are multiplied to give the new hidden state for the next time step.
#
# Now that we have an idea of how the LSTM works, let's construct one. First we split our data into training and test sets

# In[99]:

df_updated.reset_index(drop=True, inplace=True)

train_size = int(len(df_updated) * 0.8)
test_size = len(df_updated) - train_size

# Make sure to omit the first row, it contains NaNs
train = df_updated.iloc[1:train_size]
test = df_updated.iloc[train_size:]

# In[100]:

train.shape, test.shape

# In[102]:

# Extract the features
total_features = list(Features.values)
total_features.append('Close')
total_features

train = train[total_features]
test = test[total_features]
train.shape, test.shape

# Before we proceed, it is important to scale the data. Scaling ensures that no one set of features dominates simply because of its magnitude. In addition, having values between 0 and 1 helps the neural network converge faster, if it converges at all. We fit the scalers on the training data only and apply the same transformation to the test data, to avoid leaking information into our model.
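# Aside: the gate mechanics described above can be written out compactly. The snippet
# below is a purely illustrative, stand-alone NumPy sketch of a single LSTM step (it is
# not used by the Keras model built later); W, U and b are assumed weight matrices and
# biases for the forget (f), input (i), candidate (g) and output (o) transformations.

def _sigmoid(z):
    return 1 / (1 + np.exp(-z))

def lstm_step(x_t, h_prev, c_prev, W, U, b):
    f = _sigmoid(W['f'] @ x_t + U['f'] @ h_prev + b['f'])   # forget gate
    i = _sigmoid(W['i'] @ x_t + U['i'] @ h_prev + b['i'])   # input gate
    g = np.tanh(W['g'] @ x_t + U['g'] @ h_prev + b['g'])    # candidate cell state
    o = _sigmoid(W['o'] @ x_t + U['o'] @ h_prev + b['o'])   # output gate
    c_t = f * c_prev + i * g                                 # new cell state
    h_t = o * np.tanh(c_t)                                   # new hidden state
    return h_t, c_t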
# In[103]:

# Scale both features and target variables
f_transformer = MinMaxScaler()     # Feature scaler
targ_transformer = MinMaxScaler()  # Target scaler

f_transformer = f_transformer.fit(train[Features].to_numpy())
targ_transformer = targ_transformer.fit(train[['Close']])

train.loc[:, Features] = f_transformer.transform(train[Features].to_numpy())
train['Close'] = targ_transformer.transform(train[['Close']].to_numpy())

test.loc[:, Features] = f_transformer.transform(test[Features].to_numpy())
test['Close'] = targ_transformer.transform(test[['Close']].to_numpy())

# In[104]:

train.shape, test.shape

# The figure below shows how the sequential data for an LSTM is constructed to be fed into the network. Data source: [Althelaya et al, 2018](https://ieeexplore.ieee.org/document/8355458)

# ![LSTM_data_arrangement.PNG](attachment:LSTM_data_arrangement.PNG)

# Basically, for data at time t with a window size of N, the target will be the data point at time t, and the features will be the data points [t-N, t-1]. We then move forward in time sequentially using this approach, so we need to format our data that way.

# In[105]:

# In[106]:

time_steps = 10

X_train_lstm, y_train_lstm = create_dataset(train.drop(['Close'], axis=1),
                                            train['Close'], time_steps)
X_test_lstm, y_test_lstm = create_dataset(test.drop(['Close'], axis=1),
                                          test['Close'], time_steps)

# In[108]:

X_train_lstm.shape, y_train_lstm.shape

# In[109]:

X_test_lstm.shape, y_test_lstm.shape

# ### Building LSTM model
#
# The Keras API in TensorFlow 2.0 has made the implementation of deep learning models much easier than in previous releases. We will use a bidirectional LSTM, as these have been shown to be more effective in certain applications (see [Althelaya et al, 2018](https://ieeexplore.ieee.org/document/8355458)). This is because the network learns from both past and future data using two layers, each processing the sequence in the reverse time order of the other. The loss function in this case will be the mean squared error, and the Adam optimizer with the default learning rate is applied.

# In[110]:

# In[111]:

model = keras.Sequential()
model.add(
    keras.layers.Bidirectional(
        keras.layers.LSTM(units=32,
                          input_shape=(X_train_lstm.shape[1],
                                       X_train_lstm.shape[2]))))
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(units=1))

# In[112]:

model.compile(optimizer='adam', loss='mean_squared_error')

# In[114]:

history = model.fit(X_train_lstm, y_train_lstm,
                    epochs=90,
                    batch_size=40,
                    validation_split=0.2,
                    shuffle=False,
                    verbose=1)

# In[115]:

test_loss = model.evaluate(X_test_lstm, y_test_lstm)

# In[116]:

# In[117]:

plot_learningCurve(history, 90)

# With each epoch, the validation loss decreases, but in a somewhat stochastic manner. The training loss is fairly consistent throughout. There may be some overfitting in there, but you can always tune the model parameters and explore the data further. Let's make some predictions on the test data just to see what's happening

# In[118]:

y_pred = model.predict(X_test_lstm)

# We need to apply some inverse scaling to get back our original results.
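# Note: the helpers create_dataset and plot_learningCurve are called above but their
# definitions are not shown in this excerpt. The sketches below are plausible
# implementations consistent with the windowing scheme and the training call above;
# treat them as assumptions rather than the original code.

def create_dataset(X, y, time_steps=1):
    # Sliding window: features are the previous `time_steps` rows of X,
    # the target is the value of y right after that window.
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X.iloc[i:(i + time_steps)].to_numpy())
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

def plot_learningCurve(history, epochs):
    # Training vs validation loss per epoch, rendered like the other figures here.
    fig = plt.figure(figsize=(8, 5))
    plt.plot(range(1, epochs + 1), history.history['loss'], label='Training loss')
    plt.plot(range(1, epochs + 1), history.history['val_loss'], label='Validation loss')
    plt.xlabel('Epoch')
    plt.ylabel('Mean squared error')
    plt.legend()
    st.pyplot(fig)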
# In[119]:

y_train_inv = targ_transformer.inverse_transform(y_train_lstm.reshape(1, -1))
y_test_inv = targ_transformer.inverse_transform(y_test_lstm.reshape(1, -1))
y_pred_inv = targ_transformer.inverse_transform(y_pred)

# In[120]:

plt.figure(figsize=(10, 10))
plt.plot(np.arange(0, len(y_train_lstm)),
         y_train_inv.flatten(), 'g', label="history")
plt.plot(np.arange(len(y_train_lstm), len(y_train_lstm) + len(y_test_lstm)),
         y_test_inv.flatten(), marker='.', label="true")
plt.plot(np.arange(len(y_train_lstm), len(y_train_lstm) + len(y_test_lstm)),
         y_pred_inv.flatten(), 'r', label="prediction")
plt.ylabel('Close Price')
plt.xlabel('Time step')
plt.legend()
st.pyplot(plt, use_container_width=True)
#plt.show();

# At first glance we can see that our predictions are not great; we could adjust our model parameters some more. However, they appear to follow the trends pretty well. Let's take a closer look

# In[121]:

plt.figure(figsize=(10, 10))
plt.plot(np.arange(len(y_train_lstm[0:500]),
                   len(y_train_lstm[0:500]) + len(y_test_lstm[0:500])),
         y_test_inv.flatten()[0:500], label="true")
plt.plot(np.arange(len(y_train_lstm[0:500]),
                   len(y_train_lstm[0:500]) + len(y_test_lstm[0:500])),
         y_pred_inv.flatten()[0:500], 'r', label="prediction")
plt.ylabel('Close Price')
plt.xlabel('Time Step')
plt.legend()
st.pyplot(plt, use_container_width=True)
#plt.show();

# Now it becomes apparent why I did not use a large number of epochs to train the model. At first glance, we notice the LSTM has some implicit autocorrelation in its results, since its predictions for a given day are very similar to those of the previous day. It essentially lags: the best guess of the model is simply something very close to the previous value. This should not be a surprising result. The stock market is influenced by a number of factors such as news, earnings reports, mergers, etc. It is therefore too chaotic and stochastic to be accurately modelled, because it depends on so many factors, some of which can be sporadic, i.e. positive or negative news. In my opinion, this may not be the best way to predict stock prices. Of course, with major advances in AI there might actually be a way, but I don't think the hedge funds will be sharing their methods anytime soon.

# ## Part 3: Regression analysis

# We could still make an attempt to get an idea of what the possible price movements might be. In this case I will use the differential prices, as there is less volatility compared to using absolute prices.
# Let's explore these relationships

# In[122]:

fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(10, 10))

ax[0, 0].scatter(df_updated['Open-Close'], df_updated['Diff_Close'], c='k')
ax[0, 0].legend(['Open-Close'])
ax[0, 0].set_ylabel('Diff-Close')

ax[0, 1].scatter(df_updated['High-Low'], df_updated['Diff_Close'], c='k')
ax[0, 1].legend(['High-Low'])
ax[0, 1].set_ylabel('Diff-Close')

ax[1, 0].scatter(df_updated['Diff_Open'], df_updated['Diff_Close'], c='k')
ax[1, 0].legend(['Diff-Open'])
ax[1, 0].set_ylabel('Diff-Close')

ax[1, 1].scatter(df_updated['Diff-Low'], df_updated['Diff_Close'], c='k')
ax[1, 1].legend(['Diff-Low'])
ax[1, 1].set_ylabel('Diff-Close')

ax[2, 0].scatter(df_updated['Diff-High'], df_updated['Diff_Close'], c='k')
ax[2, 0].legend(['Diff-High'])
ax[2, 0].set_ylabel('Diff-Close')

ax[2, 1].scatter(df_updated['Open'], df_updated['Diff_Close'], c='k')
ax[2, 1].legend(['Open'])
ax[2, 1].set_ylabel('Diff-Close')

st.pyplot(fig)

# Above is a series of plots showing the relationship between the different differential price measurements and the differential close. In this study, the difference refers to the change between a value at time t and the previous day's value at time t-1. The differential high, differential low, high-low and open-close all appear to have a linear relationship with the differential close. However, only the differential open-close would be useful in an analysis: on a given day (time t) we cannot know the highs or lows beforehand, since they are only known once the day ends, whereas we do know the open value at the start of the trading period.

# Let's separate the data features and target variables. We will use Ridge regression in this case to make our model more generalizable

# In[123]:

# In[124]:

X_reg = df_updated[['Open-Close']]
y_reg = df_updated['Diff_Close']

# In[125]:

X_reg = X_reg.loc[1:, :]
y_reg = y_reg.iloc[1:]

# In[126]:

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=0)

# We will perform a grid search with cross validation to determine the optimal parameters for our regression model

# In[127]:

ridge = Ridge()
alphas = [
    1e-15, 1e-8, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0, 1, 5, 10, 20, 30, 40,
    45, 50, 55, 100
]
params = {'alpha': alphas}

# In[129]:

ridge_regressor = GridSearchCV(ridge,
                               params,
                               scoring='neg_mean_squared_error',
                               cv=10)
ridge_regressor.fit(X_reg, y_reg)

# In[130]:

st.text(ridge_regressor.best_score_)
st.text(ridge_regressor.best_params_)

# Finally, let's produce a plot and see how it fits

# In[131]:

np.shape(X_test_reg)

# In[133]:

regr = Ridge(alpha=1e-15)
regr.fit(X_train_reg, y_train_reg)

y_pred = regr.predict(X_test_reg)
y_pred_train = regr.predict(X_train_reg)

st.text(f'R^2 value for test set is {regr.score(X_test_reg,y_test_reg)}')
st.text(f'Mean squared error is {mean_squared_error(y_test_reg,y_pred)}')

plt.scatter(df_updated['Open-Close'][1:], df_updated['Diff_Close'][1:], c='k')
plt.plot(df_updated['Open-Close'][1:],
         (regr.coef_[0] * df_updated['Open-Close'][1:] + regr.intercept_),
         c='r')
plt.xlabel('Open-Close')
plt.ylabel('Diff-Close')
st.pyplot(plt, use_container_width=True)

# We obtained a mean squared error of 0.58, which is fairly moderate. The R^2 value says that about 54% of the variance in the
# differential close price is explained by the differential open-close price. Not bad so far, but to be truly useful we need to make use of statistics. Specifically, let's define a confidence interval around our predictions, i.e. prediction intervals.
# Prediction intervals give a range around a prediction that accounts for the modeling error. They are most commonly used when making predictions or forecasts with a regression model, where a quantity is being predicted. We use a 95% prediction interval in this example, so that the actual values should fall within this range roughly 95% of the time. For an in-depth overview and explanation please see [machinelearningmastery](https://machinelearningmastery.com/prediction-intervals-for-machine-learning/)

# In[135]:

# In[136]:

lower, upper, interval = predict_range(X_reg, y_reg, regr)

# In[138]:

plt.scatter(X_reg, df_updated['Diff_Close'][1:], c='k')
plt.plot(X_reg, lower, c='b')
plt.plot(X_reg, (regr.coef_[0] * df_updated['Open-Close'][1:] + regr.intercept_), c='r')
plt.plot(X_reg, upper, c='g')
#plt.errorbar(X_reg, (regr.coef_[0] * df_updated['Open-Close'][1:] + regr.intercept_), yerr=interval)

plt.xlabel('Open-Close')
plt.ylabel('Diff-Close')
plt.legend(['Lower bound', 'Model', 'Upper bound'])
st.pyplot(plt, use_container_width=True)
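# Note: predict_range is called above but its definition is not shown in this excerpt.
# The sketch below is one plausible implementation consistent with the 95% interval
# described above (a fixed-width interval based on the standard deviation of the
# residuals, assuming roughly Gaussian errors); treat it as an assumption rather than
# the original code.

def predict_range(X, y, model, z=1.96):
    preds = model.predict(X)
    residuals = y - preds
    interval = z * np.std(residuals)   # half-width of the ~95% interval
    lower = preds - interval
    upper = preds + interval
    return lower, upper, interval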
return patients

dropcols = ['PatNum', 'FName', 'Recency']

# load data from source: create the original df and the feature data used for predictions
link_data, test = load_data(link_data=for_model, drop_columns=dropcols)

# load pretrained model and make predictions
model = load_model('bestLRmodel.pkl')
predict_probas = model.predict_proba(test)

# let the user choose how many patients to contact
num_patients = st.text_input(label='# of Patients to Contact',
                             value=10,
                             max_chars=None,
                             key=1,
                             type='default')
df = priority_list(link_data, predict_probas, num_patients=int(num_patients))
#st.title(f'Total # of Patients: {len(df)}')

# display patient data on screen
st.table(df)

st.title('Prioritized Contact List')
num_patients2 = st.text_input(label='# of Patients to Contact',
                              value=10,
                              max_chars=None,
                              key=2,
                              type='default')
contact_df = t.contact_transform(contact)
contact_df.index = contact_df.reset_index(drop=True).index + 1
st.table(contact_df.iloc[:int(num_patients2)])

st.title('Conclusions')
st.title('* Proactive Action:')
st.markdown('**Identifying potential churners through non-time based characteristics**')
st.title('* Reactive Action:')
st.markdown('**Providing a prioritized contact list for identifying the “best” patients to contact**')
if final_df is not None: final_df = pd.concat([final_df, df]) else: final_df = deepcopy(df) else: st.error("No rows in the data Extracted from the API") if len(final_df): final_df.drop_duplicates(inplace=True) final_df.rename(columns=rename_mapping, inplace=True) final_df = filter_column(final_df, "Minimum Age Limit", 18) final_df = filter_in_stock(final_df, "Available Capacity") table = deepcopy(final_df) table.reset_index(inplace=True, drop=True) st.table(table) else: st.error("No Data Found") if final_df is not None: if (len(final_df) > 0): hospitals = [] [ hospitals.append(x) for x in final_df["Hospital Name"] if x not in hospitals ] sms_text = str( "Cowin notification : Run for vaccine at {0}".format(hospitals)) # To send SMS via Twilio account_sid = 'YOUR_TWILIO_ACCOUNT_SID'
def main():
    st.image('logo.png', width=200)
    st.title('AceleraDev Data Science')
    st.subheader('Semana 2 - Pré-processamento de Dados em Python')
    st.image('https://media.giphy.com/media/KyBX9ektgXWve/giphy.gif', width=200)
    file = st.file_uploader(
        'Escolha a base de dados que deseja analisar (.csv)', type='csv')
    if file is not None:
        st.subheader('Analisando os dados')
        df = pd.read_csv(file)
        st.markdown('**Número de linhas:**')
        st.markdown(df.shape[0])
        st.markdown('**Número de colunas:**')
        st.markdown(df.shape[1])
        st.markdown('**Visualizando o dataframe**')
        number = st.slider('Escolha o numero de colunas que deseja ver',
                           min_value=1,
                           max_value=20)
        st.dataframe(df.head(number))
        st.markdown('**Nome das colunas:**')
        st.markdown(list(df.columns))
        exploracao = pd.DataFrame({
            'nomes': df.columns,
            'tipos': df.dtypes,
            'NA #': df.isna().sum(),
            'NA %': (df.isna().sum() / df.shape[0]) * 100
        })
        st.markdown('**Contagem dos tipos de dados:**')
        st.write(exploracao.tipos.value_counts())
        st.markdown('**Nomes das colunas do tipo int64:**')
        st.markdown(list(exploracao[exploracao['tipos'] == 'int64']['nomes']))
        st.markdown('**Nomes das colunas do tipo float64:**')
        st.markdown(list(
            exploracao[exploracao['tipos'] == 'float64']['nomes']))
        st.markdown('**Nomes das colunas do tipo object:**')
        st.markdown(list(exploracao[exploracao['tipos'] == 'object']['nomes']))
        st.markdown('**Tabela com coluna e percentual de dados faltantes :**')
        st.table(exploracao[exploracao['NA #'] != 0][['tipos', 'NA %']])
        st.subheader('Inputaçao de dados númericos :')
        percentual = st.slider(
            'Escolha o limite de percentual faltante limite para as colunas vocë deseja inputar os dados',
            min_value=0,
            max_value=100)
        lista_colunas = list(
            exploracao[exploracao['NA %'] < percentual]['nomes'])
        select_method = st.radio('Escolha um metodo abaixo :',
                                 ('Média', 'Mediana'))
        st.markdown('Você selecionou : ' + str(select_method))
        if select_method == 'Média':
            df_inputado = df[lista_colunas].fillna(df[lista_colunas].mean())
            exploracao_inputado = pd.DataFrame({
                'nomes': df_inputado.columns,
                'tipos': df_inputado.dtypes,
                'NA #': df_inputado.isna().sum(),
                'NA %': (df_inputado.isna().sum() / df_inputado.shape[0]) * 100
            })
            st.table(exploracao_inputado[
                exploracao_inputado['tipos'] != 'object']['NA %'])
            st.subheader('Dados Inputados faça download abaixo : ')
            st.markdown(get_table_download_link(df_inputado),
                        unsafe_allow_html=True)
        if select_method == 'Mediana':
            # fill with the median (not the mean) when 'Mediana' is selected
            df_inputado = df[lista_colunas].fillna(df[lista_colunas].median())
            exploracao_inputado = pd.DataFrame({
                'nomes': df_inputado.columns,
                'tipos': df_inputado.dtypes,
                'NA #': df_inputado.isna().sum(),
                'NA %': (df_inputado.isna().sum() / df_inputado.shape[0]) * 100
            })
            st.table(exploracao_inputado[
                exploracao_inputado['tipos'] != 'object']['NA %'])
            st.subheader('Dados Inputados faça download abaixo : ')
            st.markdown(get_table_download_link(df_inputado),
                        unsafe_allow_html=True)
def main():
    """Provide Customer and Company Affiliate sections. Partition reviews by sentiment
    and priority respectively, and return review outputs based on the user's choice."""
    st.title('Ecommerce Reviews based on Topics')
    st.subheader(
        "Over 600 reviews of three popular ecommerce companies in Berlin, Germany (Amazon, Zalando and Outfittery) were scraped from TrustPilot, cleaned, grouped into topic themes and given a sentiment evaluation using AWS services"
    )
    st.image(img, width=600, caption="Visuals")
    if st.checkbox("Start by Clicking"):
        status = st.radio(
            " Are you using the app as a Customer or a Company Affiliate",
            ("Customer", "Company Affiliate"))
        if status == 'Customer':
            st.write('**Welcome Customer**')
            topics = st.selectbox(
                "Which of the topics would you be interested in?", [
                    "About Product/Services", "Experience/Expectation",
                    "Transaction Journey"
                ])
            if st.checkbox('Next'):
                status = st.radio(
                    "What type of reviews would you like to view?",
                    ("POSITIVE", "NEGATIVE"))
                st.text('See ' + status + ' Reviews')
                title, body, sentiment = get_review_sentiments(
                    topics, status, 'Sentiment')
                for i, j in zip(title, body):
                    st.write('**Review Title:** ' + i)
                    st.warning(j)
                    st.write('\n')
        if status == 'Company Affiliate':
            st.write('**Welcome Company Affiliate.**')
            st.write(
                'Check out the summary stats of reviews based on their department topics'
            )
            df = summary_statistics()
            st.table(df)
            topics = st.selectbox(
                "Which of the Departments would you be interested in?", [
                    "About Product/Services", "Experience/Expectation",
                    "Transactional Journey"
                ])
            if st.checkbox('Check sentiment analytics chart'):
                st.text('See the analytics')
                new_data = data[data['topic'] == topics]
                data_new = new_data.iloc[:, -2].value_counts()
                st.write(data_new.plot.bar())
                st.pyplot()
            status = st.radio(
                "What type of reviews would you like to view?",
                ("High-priority", "Moderate-priority", "Low-priority"))
            st.text('See ' + status)
            title, body, sentiment = get_review_sentiments(
                topics, status, 'priority')
            for i, j, k in zip(title, body, sentiment):
                st.write('**Review Title:** ' + i)
                st.warning(j)
                st.text('Sentiment: ' + k)
                st.write('\n')
# display leaderboard with lbCol1: # read csv # make a copy of the dataframe as df athlete_data = pd.read_csv('player_stats.csv') df = athlete_data.copy() # grab Player and Wins columns, rename, # remove index and convert win ratio to percent dfLeaderBoard = df[['Rank', 'Player', 'Win %']] # dfLeaderBoard.columns = ['Rank', 'Player', 'Win %'] dfLeaderBoard.index = [""] * len(dfLeaderBoard) dfLeaderBoard['Win %'] = (dfLeaderBoard['Win %'] * 100).astype(str) + "%" # dfLeaderBoard st.table(dfLeaderBoard) # display leaderboard data in bar chart with lbCol2: st.write('#') st.write('#') leaderBoard = pd.DataFrame({ 'Win %': df['Win %'], 'Rank': df["Rank"] }, ) chart = alt.Chart(leaderBoard).mark_bar().encode( x=alt.X('Rank:O'), y=alt.Y('Win %:Q', axis=alt.Axis(format="%", tickCount=leaderBoard.shape[0], grid=False)),
### Results ---------- ## Table with top 3 topics and score ud_bow_vector = dictionary.doc2bow(ud_tokens) results = pd.DataFrame(lda_model[ud_bow_vector]) results.columns = ['Topic', 'Proximity Score'] results.sort_values(['Proximity Score'], ascending=False, inplace=True) results_3 = (results.nlargest(3, ['Proximity Score'])) # Get topic numbers first_choice = int(results_3.iloc[0]['Topic']) second_choice = int(results_3.iloc[1]['Topic']) third_choice = int(results_3.iloc[2]['Topic']) # Display table st.subheader("Here are the topics most related to your research proposal:") st.table(results_3.assign(hack='').set_index('hack')) ### Words per topic ---------- ## Create table with top words in the topics above n_words = 20 topic_words = pd.DataFrame({}) for i, topic in enumerate(lda_model.get_topics()): top_feature_ids = topic.argsort()[-n_words:][::-1] feature_values = topic[top_feature_ids] words = [dictionary[id] for id in top_feature_ids] topic_df = pd.DataFrame({ 'value': feature_values, 'word': words, 'topic': i })
) pie_plot_age.update_layout({ 'paper_bgcolor': 'rgba(0,0,0,0)', 'plot_bgcolor': 'rgba(0,0,0,0)', }) pie_plot_age.update_layout(legend_font_size=10) # pie_plot_age.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20, marker=dict(colors=age_range, line=dict(color='#000000', width=2))) col2.plotly_chart(pie_plot_age) #---Expanders---# with st.beta_expander('Claim amounts by state'): state_amount_df = cache_data.groupby('state', as_index= True).agg({'amount':'sum'})\ .sort_values(by='amount', ascending = False) state_amount_df.columns = ['Total amount claims'] state_amount_df_rank = state_amount_df['Total amount claims'][:10] st.table(state_amount_df_rank) bar_plot_state = px.bar(state_amount_df_rank, labels={ "value": "Total amount claims", "state": "State" }) bar_plot_state.update_layout({ 'paper_bgcolor': 'rgba(0,0,0,0)', 'plot_bgcolor': 'rgba(0,0,0,0)', }) bar_plot_state.update_layout( width=1000, height=500, showlegend=False, ) bar_plot_state.update_traces(marker_color='darkblue')
def import_traffic(year: int, traffic_type: int): """ Replaces the table traffic_source with new data. Truncates first all data, then loads the all data from data.bs.ch in a dataframe and filters for rows with year > currentyear -2. this is to reduce the amount of data replaced in the target table miv. """ def transfer_staging2fact(): ok = True tools.log('Copy miv traffic records from staging to miv_traffic table') cmd = qry['traffic_staging_fact'] ok = db.execute_non_query(cmd, db.conn) if ok: cmd = qry[update_info_query[traffic_type]] ok = db.execute_non_query(cmd, db.conn) if ok: cmd = qry['reset_station_flags'] ok = db.execute_non_query(cmd, db.conn) if ok: cmd = qry['update_station_flags'].format('miv_flag', 1) ok = db.execute_non_query(cmd, db.conn) cmd = qry['update_station_flags'].format('velo_data_flag', 2) ok = db.execute_non_query(cmd, db.conn) cmd = qry['update_station_flags'].format('fuss_data_flag', 3) ok = db.execute_non_query(cmd, db.conn) if ok: st.info('Statement executed.') else: st.error('Statement failed.') return ok # cmd = qry['last_miv_observation'] # result = get_single_value(cmd, conn, 'max_dat') # st.info('Most recent observation in miv_traffic: ' + result) ### Main ok = True update_info_query = {1: 'update_miv_info', 2: 'update_slow_info'} source_table = {1: 'miv_traffic_source', 2: 'slow_traffic_source'} source_file = { 1: cn.source_miv_file_name.format(year), 2: cn.source_slow_file_name.format(year) } source_fields = { 1: [ 'SiteCode', 'SiteName', 'DirectionName', 'LaneCode', 'LaneName', 'Date', 'TimeFrom', 'TimeTo', 'ValuesApproved', 'ValuesEdited', 'TrafficType', 'Total', 'MR', 'PW', 'PW+', 'Lief', 'Lief+', 'Lief+Aufl.', 'LW', 'LW+', 'Sattelzug', 'Bus', 'andere', 'Year', 'Month', 'Weekday', 'HourFrom' ], 2: [ 'SiteCode', 'SiteName', 'DirectionName', 'LaneCode', 'LaneName', 'Date', 'TimeFrom', 'TimeTo', 'ValuesApproved', 'ValuesEdited', 'TrafficType', 'Total', 'Year', 'Month', 'Weekday', 'HourFrom' ] } source_staging_transfer_query = { 1: 'miv_traffic_source_staging', 2: 'slow_traffic_source_staging' } traffic_type_criteria = {1: 'traffic_type = 1', 2: traffic_type > 1} row_count_start = db.count_rows("select * from traffic_fact", db.conn) # delete all records from the miv_traffic_source table if ok: cmd = all_qry.qry['truncate_table'].format(source_table[traffic_type]) ok = db.execute_non_query(cmd, db.conn) if ok: st.info(f'Table {source_table[traffic_type]} was initialized.') else: st.error( f'Table {source_table[traffic_type]} could not be deleted.') if ok: df, ok = read_source_file(source_file[traffic_type]) if ok: ok = save_db_table(source_table[traffic_type], df, source_fields[traffic_type]) # delete all rows from the staging table if ok: cmd = all_qry.qry['truncate_table'].format('traffic_staging') ok = db.execute_non_query(cmd, db.conn) if ok: st.info(f'Table {"traffic_staging"} was initialized.') else: st.error(f'Table {"traffic_staging"} could not be deleted.') # copy the source data to the staging table, some fields are removed and counts are simplified, e.g. pw and pw with anhänger are summed # there is a new count for pw and lieferwagen and for lastwagen, lastwagen with anhänger and sattelschlepper so light and heavy traffic can be easily # distinguished. 
if ok: ok = transfer_source2staging( source_staging_transfer_query[traffic_type]) # get the station_id from the station table if ok: cmd = qry['traffic_update_station_id'] ok = db.execute_non_query(cmd, db.conn) # append new direction names to the lookup table if ok: cmd = qry['traffic_update_direction_codes'] ok = db.execute_non_query(cmd, db.conn) # update direction id field in traffic_staging table if ok: cmd = qry['traffic_update_direction_id'] ok = db.execute_non_query(cmd, db.conn) # update time fields if ok: cmd = qry['update_traffic_time_columns'] ok = db.execute_non_query(cmd, db.conn) ok = True if ok: cmd = all_qry.qry['delete_rows_where'].format( 'traffic_fact', f'{traffic_type_criteria[traffic_type]} and year = {year}') st.write(cmd) ok = db.execute_non_query(cmd, db.conn) if ok: st.info( f'Table {"traffic_fact"} was initialized for year and traffic type.' ) else: st.error( f'Table {"traffic_staging"} could not be initialized for year and traffic type.' ) if ok: ok = transfer_staging2fact() if ok: row_count_end = db.count_rows("select * from traffic_fact", db.conn) st.info( f'{row_count_end - row_count_start} rows where successfully imported' ) df = db.execute_query(qry['import_result_summary'], db.conn) st.write("Summary") st.table(df) else: st.error( "The import could not be completed, check log above for error messages" )
deasing = covid[covid["Country"] == cty]['New deaths'].max() recsing = covid[covid["Country"] == cty]['New recovered'].max() tab = { "Category": [ "Total Confirmed Cases", "Total Recovered", "Total Active Cases", "Total Deaths", "Maximum Cases on a Single Day", "Maximum Deaths on a Single Day", "Maximum Recoveries on a Single Day" ], "Total Count": [tot, reco, act, dths, infsing, deasing, recsing] } stat = pd.DataFrame(tab) st.table(stat) st.header(f"Daily New Cases and Total Cases for Selected Countries") options = st.multiselect('Select Multiple Countries', covid["Country"][:186]) fire = alt.Chart(covid[covid["Country"].isin(options)], width=500, height=300).mark_circle().encode( x="Date", y="Country", tooltip=["Date", "Country", "New cases"], color="Country", size="New cases").interactive() bar1 = alt.Chart(covid[covid["Country"].isin(options)]).mark_bar().encode( y="sum(New cases)",
pipe.fit(X_train, y_train) mod_results = pd.DataFrame( { 'Train Size': X_train.shape[0], 'Validation Size': X_val.shape[0], 'Boosting Rounds': num_rounds, 'Tree Depth': tree_depth, 'Learning Rate': learning_rate, 'Training Score': pipe.score(X_train, y_train), 'Validation Score': pipe.score(X_val, y_val) }, index=['Values']) st.subheader("Model Results") st.table(mod_results) st.subheader("Real vs Predicted Validation Values") chart = sns.regplot(x=pipe.predict(X_val), y=y_val) st.pyplot(chart.figure) if page == 'Causal Impact': pipe.fit(X_train, y_train) # transform the dataset for PDPBox -- necessary step dataset = pipe[0].fit_transform(X_train) columns = dataset.columns.tolist() pipe.fit(X_train, y_train)
train_data = pd.read_csv(train_data) test_size = st.slider("精度の検証用データに分割するデータのサイズを選択してください(単位:%)", 10, 90, 30, 10) / 100 st.write("教師データのレコード数 : " + str(round(len(train_data) * (1 - test_size)))) st.write("検証データのレコード数 : " + str(round(len(train_data) * (test_size)))) df_train, df_test = train_test_split(train_data, test_size=test_size, random_state=111) pred_data = st.file_uploader("テストデータを読み込んでください", type="csv") df_pred = pd.read_csv(pred_data) # 読み込んだデータのサマリー st.dataframe(df_train.head()) label = st.selectbox("目的変数を選択してください", list(df_train.columns)) st.write("Summary of target variable") st.table(df_train[label].describe()) ########################################################################################## # Test # ########################################################################################## ''' *** ''' st.write("※実行中の計算内容は右下の[Manage app]ボタンをクリックすることで確認できます") st.write("(計算時間:サンプルデータを用いた場合で約5分です)") run_pred = st.checkbox("AutoML/AutoGluonの実行") if run_pred == True: save_path = 'agModels-predictClass' # specifies folder to store trained models predictor = TabularPredictor(label=label, path=save_path).fit(df_train,
def xplore_sidebar(state, sqobjs: dict):
    '''Draw appropriate sidebar for the page'''

    stime = state.start_time
    etime = state.end_time

    table_vals = sorted(list(sqobjs.keys()))

    if state.table:
        if isinstance(state.table, list):
            tblidx = table_vals.index(state.table[0])
        else:
            tblidx = table_vals.index(state.table)
    else:
        tblidx = table_vals.index('device')  # Default starting table
    assert_val = state.assert_clicked
    view_idx = 1 if state.view == 'all' else 0

    devdf = gui_get_df(sqobjs['device'], columns=['namespace', 'hostname'])
    if devdf.empty:
        st.error('Unable to retrieve any namespace info')
        st.stop()

    namespaces = [""]
    namespaces.extend(sorted(devdf.namespace.unique().tolist()))
    if state.namespace:
        nsidx = namespaces.index(state.namespace)
    else:
        nsidx = 0
    namespace = st.sidebar.selectbox('Namespace', namespaces, index=nsidx)

    if namespace != state.namespace:
        state.hostname = None
        state.namespace = namespace

    hostnames = [""]
    if state.namespace:
        hostlist = devdf.query(f'namespace=="{state.namespace}"') \
                        .hostname.unique().tolist()
    else:
        hostlist = devdf.hostname.unique().tolist()
    hostnames.extend(sorted(hostlist))
    if state.hostname:
        hostidx = hostnames.index(state.hostname)
    else:
        hostidx = 0
    state.hostname = st.sidebar.selectbox('Hostname', hostnames, index=hostidx)

    state.start_time = st.sidebar.text_input('Start time', value=stime, key='stime')
    state.end_time = st.sidebar.text_input('End time', value=etime, key='etime')

    table = st.sidebar.selectbox(
        'Select Table to View', tuple(table_vals), index=tblidx)

    if table != state.table:
        # We need to reset the specific variables
        state.query = ''
        state.assert_clicked = False
        state.uniq_clicked = 0
        state.table = table
        state.columns = 'default'

    view_vals = ('latest', 'all')
    if state.start_time and state.end_time:
        # We show everything that has happened when both times are specified
        view_idx = 1
    state.view = st.sidebar.radio("View of Data", view_vals, index=view_idx)

    fields = TablesObj().describe(table=state.table)
    if state.table != 'tables':
        colist = sorted((filter(lambda x: x not in ['index', 'sqvers'],
                                fields.name.tolist())))
        columns = st.sidebar.multiselect('Pick columns',
                                         ['default', 'all'] + colist,
                                         default=state.columns)
        if ('default' in columns or 'all' in columns) and len(columns) == 1:
            col_sel_val = True
        else:
            col_sel_val = False

        col_ok = st.sidebar.checkbox('Column Selection Done',
                                     value=col_sel_val)
        if not col_ok:
            columns = ['default']
    else:
        col_ok = True
        columns = ['default']

    if not columns:
        columns = ['default']

    state.columns = columns
    if state.table in ['interfaces', 'ospf', 'bgp', 'evpnVni']:
        state.assert_clicked = st.sidebar.checkbox(
            'Run Assert', value=assert_val)
    else:
        state.assert_clicked = False

    if not col_ok:
        st.experimental_set_query_params(**asdict(state))
        st.stop()

    if ('default' in columns or 'all' in columns) and len(columns) != 1:
        st.error('Cannot select default/all with any other columns')
        st.experimental_set_query_params(**asdict(state))
        st.stop()
    elif not columns:
        st.error('Columns cannot be empty')
        st.experimental_set_query_params(**asdict(state))
        st.stop()

    state.query = st.sidebar.text_input(
        'Filter results with pandas query', value=state.query, key=state.table)
    st.sidebar.markdown(
        "[query syntax help](https://suzieq.readthedocs.io/en/latest/pandas-query-examples/)")

    if columns == ['all']:
        columns = ['*']
    if state.table != "tables":
        col_expander = st.sidebar.beta_expander('Column Names', expanded=False)
        with col_expander:
            st.subheader(f'{state.table} column names')
            st.table(TablesObj().describe(table=state.table)
                     .query('name != "sqvers"')
                     .reset_index(drop=True).style)
def bar_plot(self, data_row): empty_plot1 = st.empty() empty_plot2 = st.empty() num_mat = len(data_row['material'].unique().tolist()) i = 0 xx = pd.DataFrame({'radius_nm': data_row['radius_nm'].unique()}) xx = xx['radius_nm'].to_numpy() yy = np.zeros(len(xx)) for material in data_row['material'].unique().tolist(): data = data_row[data_row['material'] == material] data_medie = pd.DataFrame({'radius_nm': data['radius_nm'].unique()}) raggi = data['radius_nm'].unique().tolist() media_quality = [] media_speed = [] media_error_q = [] media_error_s = [] media_material = [] for raggio in raggi: media_quality.append(data['normalized_signal_quality'][data['radius_nm'] == raggio].mean()) media_speed.append(data['signal_speed'][data['radius_nm'] == raggio].mean()) media_error_q.append(data['normalized_signal_quality'][data['radius_nm'] == raggio].std()) media_error_s.append(data['signal_speed'][data['radius_nm'] == raggio].std()) media_material.append(material) data_medie['normalized_signal_quality'] = media_quality data_medie['signal_speed'] = media_speed data_medie['normalized_signal_quality_err'] = media_error_q data_medie['signal_speed_err'] = media_error_s data_medie['material'] = media_material data_medie = data_medie.fillna(0) st.write(material) st.table(data) delta_delay = (12)/num_mat delay = -3 + delta_delay*i ds().nuova_fig(30) ds().titoli(titolo='Normalized Signal Intensity', xtag='radius[nm]', ytag='counts') ds().dati(x = data_medie['radius_nm'].to_numpy(), y = data_medie['normalized_signal_quality'].to_numpy(), scat_plot = 'bar', delay = delay, width = 3, descrizione=material) ds().dati(x = data_medie['radius_nm'].to_numpy()+delay/2, y = data_medie['normalized_signal_quality'].to_numpy(), y_error=data_medie['normalized_signal_quality_err'].to_numpy()/2, scat_plot = 'err', colore='black') ds().dati(x = data['radius_nm']+delay/2, y = data['normalized_signal_quality'], scat_plot ='scat', colore="blue", larghezza_riga =12, layer = 2) ds().dati(x = xx, y = yy, scat_plot ='bar', width = 3, delay = 0) ds().legenda() ds().nuova_fig(31) ds().titoli(titolo='Slope (C)', xtag='radius[nm]', ytag='T/I [k/uW]') ds().dati(x = data_medie['radius_nm'].to_numpy(), y = data_medie['signal_speed'].to_numpy(), scat_plot = 'bar', delay = delay, width = 3, descrizione=material) ds().dati(x = data_medie['radius_nm'].to_numpy()+delay/2, y = data_medie['signal_speed'].to_numpy(), y_error=data_medie['signal_speed_err'].to_numpy()/2, scat_plot = 'err', colore='black') ds().dati(x = data['radius_nm']+delay/2, y = data['signal_speed'], scat_plot ='scat', colore="blue", larghezza_riga =12, layer = 2) ds().dati(x = xx, y = yy, scat_plot ='bar', width = 3, delay = 0) ds().legenda() i = i+1 ds().nuova_fig(30) empty_plot1.pyplot() ds().nuova_fig(31) empty_plot2.pyplot()
st.sidebar.markdown(f'##### Period: {option1}')
st.sidebar.markdown(f'##### RSI Range: [{option2_1} - {option2_2}]')
st.sidebar.markdown(f'##### VolRatio: {option3}')
st.sidebar.markdown(f'')

form_data = {
    'rsi_buy_sell': f'{option2_1},{option2_2}',
    'vol_ratio_trigger': option3,
    'period': option1
}

# st.write(f'Period : {option1} RSI Range : [{option2_1},{option2_2}]\tVolume Ratio Trigger : {option3}')
# my_bar = st.progress(0)
# for percent_complete in range(100):
#     time.sleep(0.1)
#     my_bar.progress(percent_complete + 1)

if st.sidebar.button('Calculate', key='run'):
    with st.spinner('Wait for it...'):
        # st.success('Done!')
        companies, timeofevent, paramets = get_table(form_data)
        st.markdown(f"<h5 style='color:red'>Time of Request : {timeofevent}</h5>",
                    unsafe_allow_html=True)
        st.table(companies)
else:
    st.markdown("""
    #### Period : The duration of stock data considered when calculating the RSI
    #### RSI Range : The RSI values at which to buy or sell the stock
    #### Volume Change Ratio : The ratio of volume change that should trigger a Buy/Sell
    """)
# -*- coding: utf-8 -*-
import streamlit as st
import numpy as np
import pandas as pd
import cv2
import time
import requests
from bs4 import BeautifulSoup
import csv
from PIL import Image

df = pd.read_csv("offers.csv")

name = st.multiselect('Тип квартиры', df['Тип'].unique())
room = st.multiselect('Количество комнат', df['Количество комнат'].unique())

new_df = df[(df['Тип'].isin(name)) & (df['Количество комнат'].isin(room))]
st.table(new_df)

uploaded_file1 = st.file_uploader("Choose an image file 1", type="jpg")
if uploaded_file1 is not None:
    image1 = Image.open(uploaded_file1)
    st.image(image1, caption='Photo 1', use_column_width=True)
    img_array1 = np.array(image1)
    resized1 = cv2.resize(img_array1, (8, 8), interpolation=cv2.INTER_AREA)
    gray_image1 = cv2.cvtColor(resized1, cv2.COLOR_BGR2GRAY)
    avg = gray_image1.mean()
    ret, threshold_image1 = cv2.threshold(gray_image1, avg, 255, 0)
    st.image(threshold_image1)
    hash1 = ""
    for x in range(8):
        for y in range(8):
            val = threshold_image1[x, y]
            if val == 255:
                hash1 = hash1 + "1"