def getStocksList_CHN(root_path):
    """Return the cached CHN stock list, building it via tushare if absent.

    Parameters
    ----------
    root_path : str
        Project root used to locate the stock-list database.

    Returns
    -------
    pandas.DataFrame
        Stock list; when non-empty its index is the 6-digit, zero-padded
        symbol string.
    """
    try:
        df = queryStockList(root_path, "DB_STOCK", "SHEET_CHN")
        # CHN symbols are 6-digit codes; restore leading zeros lost in storage.
        df.index = df.index.astype(str).str.zfill(6)
    except Exception:
        df = pd.DataFrame()
    if not df.empty:
        return df

    # Cache miss: pull the basics table from tushare and seed the update
    # bookkeeping columns with a sentinel date so every sheet looks stale.
    stock_info = ts.get_stock_basics()
    listData = pd.DataFrame(stock_info)
    for col in ('daily_update', 'weekly_update', 'monthly_update', 'news_update'):
        listData[col] = '1970-07-01'
    listData.index.name = 'symbol'
    listData = listData.reset_index()

    storeStockList(root_path, "DB_STOCK", "SHEET_CHN", listData)

    df = queryStockList(root_path, "DB_STOCK", "SHEET_CHN")
    if not df.empty:
        df.index = df.index.astype(str).str.zfill(6)
    return df
def getStocksList_US(root_path):
    """Return the cached US stock list, downloading it from NASDAQ if absent.

    Builds the list from the NASDAQ/NYSE company-by-industry CSV exports,
    keeps companies with market cap above 100M, appends SPY and ^VIX rows,
    stores the result, and returns the freshly stored list.

    Parameters
    ----------
    root_path : str
        Project root used to locate the stock-list database.

    Returns
    -------
    pandas.DataFrame
        The stored stock list (columns: symbol, name, market_cap, sector,
        industry plus the *_update bookkeeping columns).
    """
    try:
        df = queryStockList(root_path, "DB_STOCK", "SHEET_US")
    except Exception:
        df = pd.DataFrame()
    if not df.empty:
        return df

    for exchange in ["NASDAQ", "NYSE"]:
        print("fetching " + exchange + " stocklist...")
        url = "http://www.nasdaq.com/screening/companies-by-industry.aspx?exchange=%s&render=download" % exchange
        repeat_times = 1  # repeat downloading in case of http error
        for _ in range(repeat_times):
            try:
                urlData = requests.get(url, timeout=15).content
                frame = pd.read_csv(io.StringIO(urlData.decode('utf-8')))
                df = frame if df.empty else pd.concat([df, frame])
                break
            except Exception as e:
                print("exception in getStocks:" + exchange, str(e))
                continue

    # Robustness fix: if every download failed, df has no columns and the
    # 'MarketCap' filter below would raise KeyError; return the empty frame.
    if df.empty:
        return df

    df = df[(df['MarketCap'] > 100000000)]
    df = df.drop_duplicates(subset=['Symbol'], keep='first')
    df.sort_index(ascending=True, inplace=True)

    listData = df[['Symbol', 'Name', 'MarketCap', 'Sector', 'Industry']].copy()
    listData = listData.rename(
        columns={
            'Symbol': 'symbol',
            'Name': 'name',
            'MarketCap': 'market_cap',
            'Sector': 'sector',
            'Industry': 'industry'
        })

    # NOTE(review): .loc[len(listData)] appends only when the index is a
    # contiguous 0..n-1 range; after filtering/dedup it may not be — the
    # original used the same pattern, preserved here; verify against stored
    # output.
    listData.loc[len(listData)] = [
        'SPY', 'SPDR S&P 500 ETF Trust', 0.0, '', ''
    ]
    listData.loc[len(listData)] = ['^VIX', 'VOLATILITY S&P 500', 0.0, '', '']
    listData['symbol'] = listData['symbol'].str.strip()
    # Sentinel date marks every sheet as never updated.
    for col in ('daily_update', 'weekly_update', 'monthly_update', 'news_update'):
        listData[col] = '1970-07-01'
    listData = listData.reset_index(drop=True)

    storeStockList(root_path, "DB_STOCK", "SHEET_US", listData)
    return queryStockList(root_path, "DB_STOCK", "SHEET_US")
def getStocksList(root_path):
    """Return the cached CHN daily stock list, building it via tushare if absent.

    Parameters
    ----------
    root_path : str
        Project root used to locate the stock-list database.

    Returns
    -------
    pandas.DataFrame
        Stock list; when non-empty its index is the 6-digit, zero-padded
        symbol string.
    """
    try:
        df = queryStockList(root_path, "DB_STOCK", "SHEET_CHN_DAILY")
        # CHN symbols are 6-digit codes; restore leading zeros lost in storage.
        df.index = df.index.astype(str).str.zfill(6)
    except Exception:
        df = pd.DataFrame()
    if not df.empty:
        return df

    # Cache miss: seed the sheet from the tushare basics table.
    stock_info = ts.get_stock_basics()
    listData = pd.DataFrame(stock_info)

    storeStockList(root_path, "DB_STOCK", "SHEET_CHN_DAILY", listData)

    df = queryStockList(root_path, "DB_STOCK", "SHEET_CHN_DAILY")
    # Consistency fix: guard the zfill like getStocksList_CHN does instead of
    # assuming the fresh query succeeded.
    if not df.empty:
        df.index = df.index.astype(str).str.zfill(6)
    return df
def updateStockTwitterData(root_path, from_date, till_date, storeType):
    """Fetch twitter data for each US symbol, reporting per-symbol timing.

    Only runs when storeType is 1 or 2; otherwise just opens and closes the
    progress bar.
    """
    stock_list = queryStockList(root_path, "DB_STOCK", "SHEET_US_DAILY")
    symbols = stock_list['symbol'].values.tolist()
    progress = tqdm(total=len(symbols))
    if storeType in (1, 2):
        for sym in symbols:
            begin = updateSingleStockTwitterData(root_path, sym, from_date,
                                                 till_date)
            elapsed = time.time() - begin
            progress.set_description('%-*s fetched in: %.4s seconds' %
                                     (6, sym, elapsed))
            progress.update(1)
    progress.close()
def updateStockTwitterData(root_path, from_date, till_date, storeType):
    """Fetch twitter data for every US stock symbol, reporting progress.

    Only runs when storeType is 1 or 2; otherwise just opens and closes the
    progress bar.

    Cleanup: a ~25-line commented-out ThreadPoolExecutor variant was deleted
    (dead code; recoverable from version control).
    NOTE(review): this duplicates an identically named function defined
    earlier in the file — the later definition wins at import time;
    consolidate to one definition.
    """
    df = queryStockList(root_path, "DB_STOCK", "SHEET_US_DAILY")
    symbols = df['symbol'].values.tolist()
    pbar = tqdm(total=len(symbols))
    if storeType == 1 or storeType == 2:
        for symbol in symbols:
            startTime = updateSingleStockTwitterData(root_path, symbol,
                                                     from_date, till_date)
            outMessage = '%-*s fetched in: %.4s seconds' % (
                6, symbol, (time.time() - startTime))
            pbar.set_description(outMessage)
            pbar.update(1)
    pbar.close()
def process_all_stocks_data(root_path, window=1):
    """Process every CHN stock and print day/week/month selection overlaps.

    Parameters
    ----------
    root_path : str
        Project root used to locate the stock database.
    window : int
        Processing window forwarded to processing_stock_data().
    """
    df = queryStockList(root_path, "DB_STOCK", "SHEET_CHN_DAILY")
    df.index = df.index.astype(str).str.zfill(6)
    symbols = df.index.values.tolist()

    pbar = tqdm(total=len(symbols))
    day_selection = []
    week_selection = []
    month_selection = []

    # BUG FIX: the original reused `startTime` as both the overall timer and
    # the per-symbol timer returned by processing_stock_data(), so the final
    # "total processing" line reported only the last symbol's duration.
    totalStartTime = time.time()
    for symbol in symbols:
        startTime = processing_stock_data(root_path, symbol, window,
                                          day_selection, week_selection,
                                          month_selection)
        outMessage = '%-*s processed in: %.4s seconds' % (
            6, symbol, (time.time() - startTime))
        pbar.set_description(outMessage)
        pbar.update(1)
    pbar.close()  # release the progress bar (missing in the original)

    print('total processing in: %.4s seconds' % ((time.time() - totalStartTime)))

    # Symbols selected by more than one timeframe.
    day_week_selection = list(set(day_selection) & set(week_selection))
    week_month_selection = list(set(week_selection) & set(month_selection))
    day_month_selection = list(set(day_selection) & set(month_selection))
    all_selection = list(set(day_week_selection) & set(week_month_selection))

    print("all_selection", len(all_selection), sorted(all_selection))
    print("day_week_selection", len(day_week_selection),
          sorted(day_week_selection))
    print("week_month_selection", len(week_month_selection),
          sorted(week_month_selection))
    print("day_month_selection", len(day_month_selection),
          sorted(day_month_selection))
    # BUG FIX: "/n" was a typo for the newline escape "\n".
    print("\n ------------------------ \n")
    print("day_selection", len(day_selection), sorted(day_selection))
    print("week_selection", len(week_selection), sorted(week_selection))
    print("month_selection", len(month_selection), sorted(month_selection))
def updateSingleStockData(root_path, symbol, from_date, till_date, force_check):
    """Bring one US symbol's weekly sheet up to date.

    Downloads missing history before the stored range (unless the stock's
    publish/IPO day says there is nothing earlier) and after it, stores the
    merged data, and maintains the per-symbol update bookkeeping.

    Parameters
    ----------
    root_path : str
        Project root used to locate the stock database.
    symbol : str
        Ticker symbol; an empty string is a no-op.
    from_date, till_date : str
        Requested date range, "YYYY-MM-DD".
    force_check : bool
        Re-check for post-range data even if already updated today.

    Returns
    -------
    (float, str)
        Start timestamp of this call and a human-readable summary message
        (empty when nothing was done).
    """
    startTime = time.time()
    message = ""

    if len(symbol) == 0:
        return startTime, message

    now_date = pd.Timestamp((datetime.datetime.now()).strftime("%Y-%m-%d"))
    start_date = pd.Timestamp(from_date)
    end_date = pd.Timestamp(till_date)
    # Today's bar may be incomplete; never treat "today" as the range end.
    if end_date == now_date:
        end_date = end_date - datetime.timedelta(days=1)

    stockData, lastUpdateTime = queryStock(root_path, "DB_STOCK", "SHEET_US",
                                           "_WEEKLY", symbol, "weekly_update")

    # First-time fetch: nothing stored yet, download the whole range.
    if stockData.empty:
        stockData, message = getSingleStock(symbol, from_date, till_date)
        if not stockData.empty:
            storeStock(root_path, "DB_STOCK", "SHEET_US", "_WEEKLY", symbol,
                       stockData, "weekly_update")
            first_date = pd.Timestamp(stockData.index[0])
            to_date = (first_date - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
            # If no earlier data can exist, remember the publish (IPO) day so
            # future runs skip the pre-download check.
            if judgeNeedPreDownload(root_path, symbol, first_date, from_date,
                                    to_date):
                storePublishDay(root_path, "DB_STOCK", "SHEET_US", symbol,
                                first_date.strftime("%Y-%m-%d"))
            message = message + ", database updated"
        else:
            print("get stock from network failed", symbol)
        return startTime, message

    modified = False
    savePublishDay = False
    first_date = pd.Timestamp(stockData.index[0])
    last_date = pd.Timestamp(stockData.index[-1])

    # Fill in history BEFORE the stored range, if the caller asked for it.
    if start_date < first_date:
        to_date = (first_date - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        if judgeNeedPreDownload(root_path, symbol, first_date, from_date, to_date):
            message = message + ", download pre data from " + from_date + " to " + to_date
            moreStockData, tempMessage = getSingleStock(
                symbol, from_date, to_date)
            message = message + tempMessage
            if len(moreStockData) > 0:
                if isinstance(moreStockData.index, pd.DatetimeIndex):
                    moreStockData.index = moreStockData.index.strftime(
                        "%Y-%m-%d")
                modified = True
                stockData = pd.concat([moreStockData, stockData])
                stockData.index.name = 'date'
        else:
            savePublishDay = True
            storePublishDay(root_path, "DB_STOCK", "SHEET_US", symbol,
                            first_date.strftime("%Y-%m-%d"))
            message = message + ", save stock publish(IPO) day, next time won't check it again"

    # Fill in history AFTER the stored range (at most once a day unless forced).
    updateOnce = now_date > lastUpdateTime
    if (end_date > last_date) and (updateOnce or force_check):
        to_date = (last_date + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        if judgeNeedPostDownload(now_date, to_date, till_date):
            message = message + ", download post data from " + to_date + " to " + till_date
            moreStockData, tempMessage = getSingleStock(
                symbol, to_date, till_date)
            message = message + tempMessage
            if len(moreStockData) > 0:
                if isinstance(moreStockData.index, pd.DatetimeIndex):
                    moreStockData.index = moreStockData.index.strftime(
                        "%Y-%m-%d")
                modified = True
                stockData = pd.concat([stockData, moreStockData])
                stockData.index.name = 'date'

    if modified:
        stockData = stockData[~stockData.index.duplicated(keep='first')]
        storeStock(root_path, "DB_STOCK", "SHEET_US", "_WEEKLY", symbol,
                   stockData, "weekly_update")
    elif updateOnce:
        now_date = datetime.datetime.now().strftime("%Y-%m-%d")
        stockList = queryStockList(root_path, "DB_STOCK", "SHEET_US")
        # FIX: the original chained indexing ...['daily_update'][0] relied on
        # the deprecated positional fallback of Series.__getitem__; use an
        # explicit .loc mask + .iloc[0].
        # NOTE(review): the comparison reads 'daily_update' even though this
        # function maintains 'weekly_update' — preserved as-is, but it looks
        # like a copy/paste slip; confirm intent.
        last_update = stockList.loc[stockList.index == symbol,
                                    'daily_update'].iloc[0]
        if last_update != now_date:
            # FIX: DataFrame.set_value was removed in pandas 1.0; .loc is the
            # supported scalar setter.
            stockList.loc[symbol, 'weekly_update'] = now_date
            storeStockList(root_path, "DB_STOCK", "SHEET_US", stockList, symbol)
    elif not savePublishDay:
        message = ""

    return startTime, message
def process_all_stocks_data(root_path, window=1):
    """Process every CHN stock and print day/week/month selection overlaps.

    Cleanup: deleted the large commented-out windowed-selection and
    ThreadPoolExecutor variants (dead code; recoverable from version
    control).
    NOTE(review): this duplicates an identically named function defined
    earlier in the file — the later definition wins at import time;
    consolidate to one definition.

    Parameters
    ----------
    root_path : str
        Project root used to locate the stock database.
    window : int
        Processing window forwarded to processing_stock_data().
    """
    df = queryStockList(root_path, "DB_STOCK", "SHEET_CHN_DAILY")
    df.index = df.index.astype(str).str.zfill(6)
    symbols = df.index.values.tolist()

    pbar = tqdm(total=len(symbols))
    day_selection = []
    week_selection = []
    month_selection = []

    # BUG FIX: the original reused `startTime` as both the overall timer and
    # the per-symbol timer returned by processing_stock_data(), so the final
    # "total processing" line reported only the last symbol's duration.
    totalStartTime = time.time()
    for symbol in symbols:
        startTime = processing_stock_data(root_path, symbol, window,
                                          day_selection, week_selection,
                                          month_selection)
        outMessage = '%-*s processed in: %.4s seconds' % (
            6, symbol, (time.time() - startTime))
        pbar.set_description(outMessage)
        pbar.update(1)
    pbar.close()  # release the progress bar (missing in the original)

    print('total processing in: %.4s seconds' % ((time.time() - totalStartTime)))

    # Symbols selected by more than one timeframe.
    day_week_selection = list(set(day_selection) & set(week_selection))
    week_month_selection = list(set(week_selection) & set(month_selection))
    day_month_selection = list(set(day_selection) & set(month_selection))
    all_selection = list(set(day_week_selection) & set(week_month_selection))

    print("all_selection", len(all_selection), sorted(all_selection))
    print("day_week_selection", len(day_week_selection),
          sorted(day_week_selection))
    print("week_month_selection", len(week_month_selection),
          sorted(week_month_selection))
    print("day_month_selection", len(day_month_selection),
          sorted(day_month_selection))
    # BUG FIX: "/n" was a typo for the newline escape "\n".
    print("\n ------------------------ \n")
    print("day_selection", len(day_selection), sorted(day_selection))
    print("week_selection", len(week_selection), sorted(week_selection))
    print("month_selection", len(month_selection), sorted(month_selection))
def get_all_stocks_correlation(root_path, dates_range):
    """Return pairwise return-correlations for all CHN stocks, cached.

    On cache miss: fetch each stock's 'Return' series, build the correlation
    matrix, flatten its upper triangle into (company1, company2, correlation)
    rows sorted by correlation descending, store and return them.

    Parameters
    ----------
    root_path : str
        Project root used to locate the stock database.
    dates_range :
        Date range forwarded to get_single_stock_data().

    Returns
    -------
    pandas.DataFrame
        Columns: company1, company2, correlation.
    """
    df = queryCorrelation(root_path, "DB_STOCK", "SHEET_CHN_RELA")
    if not df.empty:
        return df

    df = queryStockList(root_path, "DB_STOCK", "SHEET_CHN_DAILY")
    symbols = df.index.values.tolist()

    pbar = tqdm(total=len(symbols))
    stockData = []
    stockList = []
    print("get stock data...")
    for symbol in symbols:
        startTime = time.time()
        data = get_single_stock_data(root_path, symbol, dates_range)
        if data.empty:
            continue
        stockData.append(data['Return'])
        stockList.append(symbol)
        outMessage = '%-*s fetched in: %.4s seconds' % (
            12, symbol, (time.time() - startTime))
        pbar.set_description(outMessage)
        pbar.update(1)
    pbar.close()

    print("merge stock data...")
    startTime = time.time()
    df_returns = pd.concat(stockData, axis=1).fillna(0)
    df_returns.columns = stockList
    df_correlations = df_returns.corr()
    print('total processing in: %.4s seconds' % ((time.time() - startTime)))

    print("cal correlationship...")
    startTime = time.time()
    stockCount = len(stockList)
    # PERF FIX: the original used chained df_correlations.iloc[i][j], which
    # materializes a whole row Series per scalar lookup; .iat is a direct
    # scalar accessor.  Iterating j from i+1 also fixes the progress-bar
    # total, which previously counted the full n*n matrix while only the
    # upper triangle produced updates.  Pair order matches combinations().
    pairwise_correlations = []
    pbar = tqdm(total=stockCount * (stockCount - 1) // 2)
    for i in range(stockCount):
        for j in range(i + 1, stockCount):
            pairwise_correlations.append(df_correlations.iat[i, j])
            pbar.set_description(str(i) + " " + str(j))
            pbar.update(1)

    print("arrange matrix...")
    CHN_company_pairs = combinations(stockList, 2)
    df_CHN_company_pairs = pd.DataFrame(list(CHN_company_pairs))
    df_CHN_company_pairs.columns = ['company1', 'company2']
    df_CHN_company_pairs.loc[:, 'correlation'] = pd.Series(
        pairwise_correlations).T
    df_CHN_company_pairs = df_CHN_company_pairs.sort_values(
        ['correlation'], ascending=[False]).reset_index(drop=True)

    storeCorrelation(root_path, "DB_STOCK", "SHEET_CHN_RELA",
                     df_CHN_company_pairs)
    print('total processing in: %.4s seconds' % ((time.time() - startTime)))
    pbar.close()
    return df_CHN_company_pairs
if __name__ == "__main__": if len(sys.argv) != 4: print( "please input Stock symbol and start date, end date after python file" ) exit() pd.set_option('precision', 3) pd.set_option('display.width', 1000) warnings.filterwarnings('ignore', category=pd.io.pytables.PerformanceWarning) symbol = str(sys.argv[1]) stocklist = queryStockList(root_path, "DB_STOCK", "SHEET_US_DAILY") result = stocklist[stocklist.index == symbol] if result.empty: print("symbol not exist.") exit() start_date = str(sys.argv[2]) end_date = str(sys.argv[3]) now = datetime.datetime.now().strftime("%Y-%m-%d") config = configparser.ConfigParser() config.read(root_path + "/" + "config.ini") storeType = int(config.get('Setting', 'StoreType'))