def load_data_2(limit=250, split=0.7):
    """Load keyword-labelled tweets from MySQL and split into train/eval sets.

    Tweets are pulled from per-stock tables in the ``tweets`` schema and
    labelled by leading keyword: 'up%'/'buy%' -> bullish,
    'down%'/'sell%'/'crash%' -> bearish, plus AMZN rows whose CATEGORY is
    'neutral%' -> neutral.

    Args:
        limit: rows sampled per category (with replacement, so a sparse
            category can still fill its quota).
        split: fraction of the balanced sample used for training.

    Returns:
        ((train_texts, train_cats), (eval_texts, eval_cats)) where each
        ``cats`` entry is a one-hot dict over Neutral/Bullish/Bearish,
        in the spaCy textcat format.
    """
    mysql_con = mysqlClient.mysql_connection()

    tables = ['AMZN', 'AMD', 'AAPL', 'GOOG', 'FB', 'MSFT']
    bullish_selects = [
        f"SELECT contents COLLATE utf8mb4_unicode_ci, 'bullish' "
        f"FROM tweets.{t} "
        f"where contents like 'up%' or contents like 'buy%'"
        for t in tables
    ]
    # NOTE(review): the original hand-written query used
    # 'short%'/'bear%'/'crash%' for the MSFT bearish branch only — presumed
    # copy-paste drift; made uniform with the other five stocks here.
    bearish_selects = [
        f"SELECT contents COLLATE utf8mb4_unicode_ci, 'bearish' "
        f"FROM tweets.{t} "
        f"where contents like 'down%' or contents like 'sell%' "
        f"or contents like 'crash%'"
        for t in tables
    ]
    neutral_selects = [
        "SELECT contents COLLATE utf8mb4_unicode_ci, 'neutral' "
        "FROM tweets.AMZN where CATEGORY like 'neutral%'"
    ]
    get_tweets_query = ' union all '.join(
        bullish_selects + bearish_selects + neutral_selects)

    tweets = mysql_con.select_query(get_tweets_query)
    tweets_df = pd.DataFrame(list(tweets), columns=['contents', 'category'])

    # Balance classes: exactly `limit` rows per category.
    sampled = [
        tweets_df.query(f"category == '{cat}'").sample(limit, replace=True)
        for cat in ('bullish', 'bearish', 'neutral')
    ]
    train_data = pd.concat(sampled).values.tolist()

    texts, labels = zip(*train_data)
    cats = [{
        'Neutral': float(y == 'neutral'),
        'Bullish': float(y == 'bullish'),
        'Bearish': float(y == 'bearish'),
    } for y in labels]

    # Partition off part of the data for evaluation.
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])
import pandas as pd
import datetime
import numpy as np


def in_trading_hour(x):
    """Return True if time *x* lies in the trading window [14:30, 21:00].

    The wrap-around branch (start > end) supports windows that cross
    midnight, although the hard-coded window here does not wrap.
    """
    start = datetime.time(14, 30)
    end = datetime.time(21, 0)
    if start <= end:
        return start <= x <= end
    return start <= x or x <= end


# NOTE(review): `mysqlClient` is not imported in this chunk — presumably
# imported elsewhere in the original file; confirm before running standalone.
mysql_con = mysqlClient.mysql_connection()
mysql_con.init_engine('tweets_features')
engine = mysql_con.engine

# Pull each stock's classified-tweet table into a DataFrame indexed by time.
for stocks in ['AAPL', 'AMD', 'AMZN', 'FB', 'GOOG', 'MSFT']:
    print(f'getting {stocks} data...')
    data = mysql_con.select_query(
        f"SELECT * from tweets_features.{stocks}_CLASSIFIED;")
    data_df = pd.DataFrame(list(data), columns=[
        'GUID', 'DATETIME', 'URL', 'CONTENTS', 'AUTHOR', 'NAME', 'COUNTRY',
        'STATE/REGION', 'CITY/URBAN_AREA', 'CATEGORY', 'EMOTION', 'SOURCE',
        'GENDER', 'POSTS', 'FOLLOWERS', 'FOLLOWING', 'POST_TITLES',
        'POST_TYPE', 'IMAGE_URL', 'BRAND', 'BULLISH', 'BEARISH', 'NEUTRAL'])
    data_df.set_index('DATETIME', inplace=True)
    data_df.drop(columns=['GUID'], inplace=True)
    # TODO: Factorize location data and sum the count for each location as a feature
import json

import mysqlClient

db_helper = mysqlClient.mysql_connection()

# Load AlphaVantage-style 5-minute price JSON for each stock and upsert the
# rows into stock_prices.<STOCK> via a single REPLACE INTO statement.
for stock in ['AAPL', 'AMD', 'AMZN', 'FB', 'GOOG', 'MSFT']:
    try:
        file_path = f'stockDataCollector/5m_data/{stock}2.json'
        with open(file_path, 'r') as f:
            data_unload = json.load(f)
        prices = data_unload['Time Series (5min)']

        # NOTE(review): values are interpolated straight into the SQL text.
        # Acceptable for this trusted local JSON, but a parameterized insert
        # would be safer if the data source ever changes.
        rows = [
            f"('{ts}', {v['1. open']}, {v['2. high']}, {v['3. low']}, "
            f"{v['4. close']}, {v['5. volume']})"
            for ts, v in prices.items()
        ]
        insert_query = (
            f"replace into stock_prices.{stock} values " + ','.join(rows) + ';'
        )
        print(insert_query)
        db_helper.commit_query(insert_query)
    except Exception as err:
        # Best-effort per stock: keep going for the remaining symbols, but
        # report the failure instead of silently swallowing it (the original
        # used a bare ``except: pass``).
        print(f'failed to load {stock}: {err}')