# alternative preprocessing for the 1.6M-tweet corpus (Sentiment140 schema),
# kept commented out for reference; the file has no header row, so header=None
# stops pandas from swallowing the first tweet as column names
'''tweets_df = pd.read_csv('data/tweets_1600000.csv', encoding='latin-1', header=None)
tweets_df.columns = ['sentiment', 'id', 'time', 'query', 'name', 'tweet']
tweets_df = tweets_df[['tweet', 'sentiment']]
tweets_df['clean_tweet'] = clean.clean(tweets_df['tweet'])
tweets_df['clean_tweet'] = tweets_df['clean_tweet'].apply(lambda x: clean.tokenize(x))
docs2 = tweets_df['clean_tweet']
t2 = Tokenizer()
t2.fit_on_texts(docs2)
vocab_size2 = len(t2.word_index) + 1
# encode the documents as sequences of word indices
encoded_docs2 = t2.texts_to_sequences(docs2)'''

# preprocessing for the binary-sentiment training set: a DataFrame `tweets`
# with 'text' and 'sentiment' columns
clean = CleanText()

# clean() removes URLs, emoticons, and hashtags
tweets['text'] = clean.clean(tweets['text'])
# tokenize() removes punctuation and stopwords, lemmatizes, and splits each tweet into tokens
tweets['text'] = tweets['text'].apply(lambda x: clean.tokenize(x))

docs = tweets['text']
labels = tweets['sentiment']
le = LabelEncoder()
labels_en = le.fit_transform(labels)  #Negative: 0, Positive: 1
labels_en = keras.utils.to_categorical(np.asarray(labels_en))
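# to_categorical one-hot encodes the integer labels; a quick illustration of
# the two steps on a toy pair of labels:
#   le.fit_transform(['Positive', 'Negative'])  -> [1, 0]  (classes sorted alphabetically)
#   keras.utils.to_categorical([1, 0])          -> [[0., 1.], [1., 0.]]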

#tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
#encode the documents
encoded_docs = t.texts_to_sequences(docs)
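
# the integer sequences still have varying lengths; before they can feed an
# Embedding layer they need padding to a fixed size. A minimal sketch, assuming
# the same maxlen of 40 that the app below uses:
padded_docs = pad_sequences(encoded_docs, maxlen=40, padding='post')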
import datetime

import numpy as np
import pandas as pd
import streamlit as st
from sklearn.preprocessing import LabelEncoder
from twitterscraper import query_tweets

import keras
from keras.models import Model, Sequential
from keras.layers import LSTM
from keras.layers import Flatten, Dense, Dropout, Activation, Input, BatchNormalization
from keras.optimizers import Adam
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# CleanText is the author's custom cleaning helper; its import path is not
# shown in the source

tweets = pd.read_csv('Tweets.csv')
tweets = tweets[['text', 'airline_sentiment']]

clean = CleanText()

tweets['text'] = tweets['text'].apply(lambda x: clean.clean(x))

docs = tweets['text']
labels = tweets['airline_sentiment']
le = LabelEncoder()
labels_en = le.fit_transform(labels)  # LabelEncoder sorts classes alphabetically: Negative: 0, Neutral: 1, Positive: 2
labels_en = keras.utils.to_categorical(np.asarray(labels_en))

#tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
#encode the documents
encoded_docs = t.texts_to_sequences(docs)
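
# the Streamlit app below calls model_sentiment.predict() on sequences padded
# to length 40, but the model itself is not defined in this section; a minimal
# sketch of a compatible architecture (the layer sizes and epoch count are
# assumptions, not the author's exact model):
padded_docs = pad_sequences(encoded_docs, maxlen=40, padding='post')

model_sentiment = Sequential()
model_sentiment.add(Embedding(vocab_size, 100, input_length=40))
model_sentiment.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model_sentiment.add(Dense(3, activation='softmax'))  # negative / neutral / positive
model_sentiment.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
model_sentiment.fit(padded_docs, labels_en, epochs=5, validation_split=0.2)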

 st.title("Sentiment Analysis of Tweets")    
 date = st.sidebar.date_input('Enter Date Range:',[datetime.date(2019, 7, 6), datetime.date(2019, 7, 8)])
 limit = st.sidebar.slider('Enter number of Tweets to scrape:',0,1000)
 lang = 'english'
 
 
 if st.button('Scrape Tweets'):
     with st.spinner('Scraping Tweets...'):
         tweets = query_tweets('videogames', begindate = date[0], enddate = date[1], limit = limit, lang = lang)
     
     
    # each scraped Tweet object exposes its fields via __dict__; use a loop
    # variable that does not shadow the tokenizer `t`
    df = pd.DataFrame(tweet.__dict__ for tweet in tweets)
    df = df[['timestamp', 'text', 'likes', 'retweets']]
    # drop duplicate tweets (the scraper can return the same tweet more than once)
    df = df.drop_duplicates(subset=['text'])
    clean = CleanText()
    df['clean_text'] = clean.clean(df['text'])
    df['clean_text'] = df['clean_text'].apply(lambda x: clean.tokenize(x))

    docs = df['clean_text']

    # encode with the tokenizer fitted on the training corpus: fitting a fresh
    # Tokenizer here would assign word indices that do not match the embedding
    # layer model_sentiment was trained with
    encoded_docs = t.texts_to_sequences(docs)

    # pad docs to the same max length used in training
    padded_docs = pad_sequences(encoded_docs, maxlen=40, padding='post')
    labels_categorical = model_sentiment.predict(padded_docs)
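    # map the class probabilities back to sentiment strings and show them; a
    # minimal sketch, assuming the LabelEncoder `le` fitted on the training
    # labels is still in scope
    df['sentiment'] = le.inverse_transform(np.argmax(labels_categorical, axis=1))
    st.write(df[['timestamp', 'text', 'sentiment']])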