def clean_And_Parse_tweets(tweets):
    # Parse each tweet's text and attach its bag of words as tweet_words.
    for tweet in tweets:
        tweet['tweet_words'] = tweet_tokenizer(tweet['tweet_text'])

    # Build a word-count vector for each tweet; if the tweet links to a
    # webpage, fetch the page and fold its words into the same vector.
    for tweet in tweets:
        wordvec = {}
        for word in tweet['tweet_words']:
            wordvec[word] = wordvec.get(word, 0) + 1

        if tweet['tweet_urls'] != "":
            webpage = Words_In_Webpage(tweet['tweet_urls'])
            tweet['tweet_webpage_words'] = webcontent_tokenizer(webpage)
            for word in tweet['tweet_webpage_words']:
                wordvec[word] = wordvec.get(word, 0) + 1

        tweet['word_vector'] = wordvec

    return tweets
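
# The helpers used above (tweet_tokenizer, Words_In_Webpage,
# webcontent_tokenizer) are not defined in this snippet. Below is a
# minimal sketch of what they might look like, assuming tweet text is a
# plain string, pages are fetched with urllib, and HTML is stripped with
# a simple regex. These are assumptions for illustration, not the
# original implementations.

import re
import urllib.request

def tweet_tokenizer(text):
    # Lowercase the text and keep only alphanumeric word tokens.
    return re.findall(r'[a-z0-9]+', text.lower())

def Words_In_Webpage(url):
    # Fetch the raw HTML of the linked page; return "" on any failure
    # so a dead link doesn't abort the whole parsing pass.
    try:
        with urllib.request.urlopen(url, timeout=10) as resp:
            return resp.read().decode('utf-8', errors='ignore')
    except Exception:
        return ""

def webcontent_tokenizer(html):
    # Crudely strip tags, then tokenize the remaining text the same
    # way as tweet text so both feed one consistent word vector.
    text = re.sub(r'<[^>]+>', ' ', html)
    return tweet_tokenizer(text)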