def update_tweet_hashtags(self):
    """Extract entities for up to 1000 unprocessed tweets and persist them.

    Selects tweets flagged ``entities_extracted = 0``, runs DataCleaner
    entity extraction on each, and writes the entities back with the flag
    set.  Errors are logged; the module-level connection is closed in all
    cases.
    """
    logger = logging.getLogger(__name__)
    cleaner = DataCleaner()
    # Fetch id AND tweet text in one query: the original issued a separate
    # SELECT per row (N+1 pattern) to retrieve the text it already could
    # have selected here.
    sql_list = ("SELECT TOP 1000 id, tweet FROM tweets_tweet "
                "WHERE entities_extracted = 0;")
    df_tweets = pd.read_sql(sql_list, connection)
    try:
        with connection.cursor() as cursor:
            for _, row in df_tweets.iterrows():
                tweet_id = row["id"]  # don't shadow the builtin `id`
                ents = cleaner.extract_entities(row["tweet"])
                # Parameterized UPDATE (matching upsert_coocmatrix's style):
                # no manual "'" -> "''" escaping, and not injectable via
                # entity text.
                sql_upd = ("UPDATE tweets_tweet SET entities = %s, "
                           "entities_extracted = 1 WHERE id = %s;")
                cursor.execute(sql_upd, (ents, tweet_id))
                # Commit per row, as the original did, so progress survives
                # a mid-batch failure.
                connection.commit()
    except Exception as error:
        print(error)
        logger.error(error)
    finally:
        if connection is not None:
            connection.close()
def frequency(self, df, count, only_entities=False, excluded_words=None):
    """Return word (or entity) frequencies across a DataFrame of tweets.

    Args:
        df: DataFrame with a 'tweet' column (and 'entities' when
            ``only_entities`` is True).
        count: maximum number of rows to return; falls back to
            DEF_RESULT_SIZE_FREQUENCY when None.
        only_entities: count comma-separated values from 'entities'
            instead of whitespace-separated words from 'tweet'.
        excluded_words: iterable of words to omit from the tally
            (default: none).

    Returns:
        DataFrame with columns ['word', 'count'], most frequent first.
    """
    # None instead of a mutable [] default: a shared list default would
    # persist mutations across calls.
    if excluded_words is None:
        excluded_words = []
    if only_entities:
        col, sep = 'entities', ','
    else:
        col, sep = 'tweet', ' '
    cleaner = DataCleaner()
    df = cleaner.clean(df)
    if count is None:
        count = DEF_RESULT_SIZE_FREQUENCY
    excluded = set(excluded_words)  # O(1) membership test per word
    nested_word_lists = []
    for text in df[col]:
        # strip() once per word; drop empties and excluded words.
        stripped = (word.strip() for word in text.lower().split(sep))
        nested_word_lists.append(
            [word for word in stripped if word and word not in excluded])
    counter = collections.Counter(
        itertools.chain.from_iterable(nested_word_lists))
    return pd.DataFrame(counter.most_common(count),
                        columns=['word', 'count'])
def upsert_coocmatrix(self, search_term, df):
    """Insert or update the stored co-occurrence matrix for a search term.

    Serializes ``df`` to JSON and either updates the existing
    tweets_coocmatrix row for ``search_term`` or inserts a new one.
    Errors are logged; the module-level connection is closed in all cases.
    """
    logger = logging.getLogger(__name__)
    try:
        with connection.cursor() as cursor:
            # Parameterized lookup: the original concatenated search_term
            # directly into the SQL string, which is injection-prone and
            # inconsistent with the parameterized statements below.
            sql_get = ("SELECT * FROM tweets_coocmatrix "
                       "WHERE search_term = %s;")
            df_coocmatrix = pd.read_sql(sql_get, connection,
                                        params=(search_term,))
            matrix = df.to_json()
            if not df_coocmatrix.empty:
                sql = ("UPDATE tweets_coocmatrix SET matrix = %s "
                       "WHERE search_term = %s;")
                cursor.execute(sql, (matrix, search_term))
            else:
                sql = ("INSERT INTO tweets_coocmatrix "
                       "(search_term, matrix, created_at) "
                       "VALUES (%s, %s, %s);")
                cursor.execute(sql, (search_term, matrix, dt.now()))
            connection.commit()
    except Exception as error:
        print(error)
        logger.error(error)
    finally:
        if connection is not None:
            connection.close()
def bigram(request, search_term=None):
    """Render a bigram co-occurrence network for tweets matching a term.

    Reads optional POST filters (start_date, end_date, tweet_count,
    result_size), builds the entity co-occurrence network, and renders
    tweets/bigram.html.
    """
    start = datetime.now()
    search_term = extract_search_term(request, search_term)
    start_date = request.POST.get('start_date', None)
    end_date = request.POST.get('end_date', None)
    tweet_count = request.POST.get('tweet_count', None)
    if tweet_count:
        tweet_count = int(tweet_count.replace(',', ''))
    result_size = request.POST.get('result_size', None)
    if result_size:
        result_size = int(result_size.replace(',', ''))
    db = Database()
    vis = Visualization()
    cleaner = DataCleaner()
    analysis = Analysis()
    tweets = pd.DataFrame()
    if search_term or request.method == "POST":
        # NOTE(review): tweet_count is passed twice here — confirm against
        # Database.get_tweet_entities; the last argument may have been
        # intended as result_size.
        tweets = db.get_tweet_entities(search_term, tweet_count, start_date,
                                       end_date, tweet_count)
    html = ''
    first_tweet_date = ''
    last_tweet_date = ''
    if tweets is not None and not tweets.empty:
        excluded_terms = [search_term]
        excluded_terms += cleaner.excluded_tokens
        df_coocmatrix, df_cooc = analysis.cooccurrence(
            tweets, 'entities', result_size, excluded_terms, ngram=(2, 2))
        html = vis.network_pyvis(df_cooc)
        tweet_count = len(tweets.index)
        result_size = len(df_cooc.index)
        # Aggregate only the datetime column: DataFrame-wide min()/max()
        # would reduce every column and then index the result, which is
        # fragile with mixed dtypes.
        first_tweet_date = datetime.strftime(
            tweets['created_at'].min(), '%m/%d/%Y %H:%M')
        last_tweet_date = datetime.strftime(
            tweets['created_at'].max(), '%m/%d/%Y %H:%M')
    context = {
        'search_term': search_term if search_term else '-',
        'graphic': html,
        'tweet_count': tweet_count,
        'result_size': result_size,
        'first_tweet_date': first_tweet_date,
        'last_tweet_date': last_tweet_date,
        'time_elapsed': (datetime.now() - start).seconds,
    }
    return render(request, 'tweets/bigram.html', context)
def frequency(request, search_term=None):
    """Render a word-frequency chart for tweets matching a search term.

    Reads optional POST filters (start_date, end_date, tweet_count,
    result_size), computes word frequencies, and renders
    tweets/frequency.html.
    """
    start = datetime.now()
    search_term = extract_search_term(request, search_term)
    start_date = request.POST.get('start_date', None)
    end_date = request.POST.get('end_date', None)
    tweet_count = request.POST.get('tweet_count', None)
    if tweet_count:
        tweet_count = int(tweet_count.replace(',', ''))
    result_size = request.POST.get('result_size', None)
    if result_size:
        result_size = int(result_size.replace(',', ''))
    db = Database()
    analysis = Analysis()
    vis = Visualization()
    cleaner = DataCleaner()
    tweets = pd.DataFrame()
    if search_term or request.method == "POST":
        tweets = db.get_tweets(search_term, tweet_count, start_date, end_date)
    script = ''
    div = ''
    first_tweet_date = ''
    last_tweet_date = ''
    if tweets is not None and not tweets.empty:
        excluded_words = [search_term]
        excluded_words += cleaner.excluded_tokens
        df = analysis.frequency(df=tweets, count=result_size,
                                excluded_words=excluded_words)
        script, div = vis.frequency(df, search_term)
        tweet_count = len(tweets.index)
        result_size = len(df.index)
        # Aggregate only the datetime column: DataFrame-wide min()/max()
        # reduces every column first, which can fail on mixed dtypes.
        first_tweet_date = datetime.strftime(
            tweets['created_at'].min(), '%m/%d/%Y %H:%M')
        last_tweet_date = datetime.strftime(
            tweets['created_at'].max(), '%m/%d/%Y %H:%M')
    context = {
        'search_term': search_term if search_term else '-',
        'script': script,
        'div': div,
        'tweet_count': tweet_count,
        'result_size': result_size,
        'first_tweet_date': first_tweet_date if first_tweet_date else '-',
        'last_tweet_date': last_tweet_date if last_tweet_date else '-',
        'time_elapsed': (datetime.now() - start).seconds,
    }
    return render(request, 'tweets/frequency.html', context)
def hashtag_network(request, search_term=None):
    """Render the hashtag co-occurrence network view.

    Reads optional POST filters (start_date, end_date, tweet_count,
    hashtag_count), builds and persists the hashtag co-occurrence data,
    and renders tweets/hashtag.html.
    """
    start = datetime.now()
    search_term = extract_search_term(request, search_term)
    start_date = request.POST.get('start_date', None)
    end_date = request.POST.get('end_date', None)
    tweet_count = request.POST.get('tweet_count', None)
    if tweet_count:
        tweet_count = int(tweet_count.replace(',', ''))
    hashtag_count = request.POST.get('hashtag_count', None)
    if hashtag_count:
        hashtag_count = int(hashtag_count.replace(',', ''))
    db = Database()
    analysis = Analysis()
    vis = Visualization()
    tweets = pd.DataFrame()
    if search_term or request.method == "POST":
        tweets = db.get_tweets(search_term, tweet_count, start_date, end_date)
    script = ''
    div = ''
    # BUG FIX: the original guarded on `df.empty` where `df` was a fresh
    # empty DataFrame and the query result was assigned to `tweets`, so
    # the analysis branch could never execute.
    if tweets is not None and not tweets.empty:
        df_coocmatrix, df_cooc = analysis.cooccurrence(
            tweets, 'hashtag', hashtag_count, excluded_terms=None)
        db.upsert_coocmatrix(search_term, df_coocmatrix)
        db.upsert_cooc(search_term, df_cooc)
        html = vis.network_pyvis(df_cooc)
        # NOTE(review): `html` is computed but never added to the context
        # (the template gets only script/div) — confirm whether the
        # template expects a 'graphic' key like the bigram view.
    context = {
        'search_term': search_term if search_term else '-',
        'script': script,
        'div': div,
        'hashtag_count': hashtag_count,
        'first_tweet_date': start_date if start_date else '-',
        'last_tweet_date': end_date if end_date else '-',
        'time_elapsed': (datetime.now() - start).seconds,
    }
    return render(request, 'tweets/hashtag.html', context)
def hashtag(request, search_term=None):
    """Render the top-hashtag frequency chart for a search term.

    Reads optional POST filters (start_date, end_date, hashtag_count),
    fetches the hashtag counts from the database, and renders
    tweets/hashtag.html.
    """
    started_at = datetime.now()
    search_term = extract_search_term(request, search_term)

    start_date = request.POST.get('start_date', None)
    end_date = request.POST.get('end_date', None)

    raw_count = request.POST.get('hashtag_count', None)
    hashtag_count = int(raw_count.replace(',', '')) if raw_count else raw_count

    db = Database()
    analysis = Analysis()
    vis = Visualization()
    cleaner = DataCleaner()

    df = pd.DataFrame()
    if search_term or request.method == "POST":
        df = db.get_hashtags(search_term, max=hashtag_count)

    script, div = '', ''
    if not df.empty:
        script, div = vis.frequency(df, search_term)
        hashtag_count = len(df.index)

    context = {
        'search_term': search_term if search_term else '-',
        'script': script,
        'div': div,
        'hashtag_count': hashtag_count,
        'first_tweet_date': start_date if start_date else '-',
        'last_tweet_date': end_date if end_date else '-',
        'time_elapsed': (datetime.now() - started_at).seconds,
    }
    return render(request, 'tweets/hashtag.html', context)