def forward():
    """
    Creates the forward sample by parsing the current data html files that we downloaded
    in check_yahoo().

    :return: a pandas dataframe containing all of the current data for each ticker.
    """
    # In addition to the features, we need some index variables
    # (date, unix timestamp, ticker), and of course the dependent variables (prices).
    df_columns = [
        "Date",
        "Unix",
        "Ticker",
        "Price",
        "stock_p_change",
        "SP500",
        "SP500_p_change",
    ] + features

    tickerfile_list = os.listdir("forward/")

    # Required in macOS to remove the hidden index file.
    if ".DS_Store" in tickerfile_list:
        tickerfile_list.remove(".DS_Store")

    # Accumulate rows in a plain list and build the DataFrame once at the end:
    # DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0, and
    # appending row-by-row is quadratic anyway.
    rows = []

    # This is the actual parsing. This needs to be fixed every time yahoo changes their UI.
    for tickerfile in tqdm(tickerfile_list, desc="Parsing progress:", unit="tickers"):
        ticker = tickerfile.split(".html")[0].upper()

        # Use a context manager so the file handle is closed deterministically
        # (the original open(...).read() leaked the handle).
        with open(f"forward/{tickerfile}") as html_file:
            source = html_file.read()

        # Remove commas from the html to make parsing easier.
        source = source.replace(",", "")

        # Regex search for the different variables in the html file, then append to value_list
        value_list = []
        for variable in features:
            try:
                # Basically, look for the first number present after an occurrence of the variable
                regex = (r">" + re.escape(variable) +
                         r".*?(\-?\d+\.*\d*K?M?B?|N/A[\\n|\s]*|>0|NaN)%?"
                         r"(</td>|</span>)")
                value = re.search(regex, source, flags=re.DOTALL).group(1)

                # Dealing with number formatting
                value_list.append(data_string_to_float(value))

            # The data may not be present. Process accordingly.
            except AttributeError:
                value_list.append("N/A")

        # Append the ticker and the features to the row list; the index and
        # price columns are zero placeholders for the forward sample.
        rows.append(dict(zip(df_columns, [0, 0, ticker, 0, 0, 0, 0] + value_list)))

    df = pd.DataFrame(rows, columns=df_columns)
    return df.replace("N/A", np.nan)
def forward():
    """
    Build the forward sample by parsing the current-data html snapshots in
    ``forward/``.

    :return: a pandas dataframe of the current feature values for each ticker,
             with missing values as NaN.
    """
    df_columns = [
        'Date', 'Unix', 'Ticker', 'Price', 'stock_p_change', 'SP500',
        'SP500_p_change'
    ] + features

    tickerfile_list = os.listdir('forward/')

    # fix .ds_store issue on mac
    if '.DS_Store' in tickerfile_list:
        tickerfile_list.remove('.DS_Store')

    # Collect one dict per ticker and construct the DataFrame once:
    # DataFrame.append() was removed in pandas 2.0 and was O(n^2) anyway.
    rows = []

    for tickerfile in tqdm(tickerfile_list, desc="Parsing progress:", unit="tickers"):
        ticker = tickerfile.split('.html')[0].upper()

        # Context manager closes the handle (open(...).read() leaked it).
        with open(f"forward/{tickerfile}") as html_file:
            source = html_file.read()

        # Strip commas so the number regex below stays simple.
        source = source.replace(',', '')

        value_list = []
        for variable in features:
            try:
                # First number (signed/decimal, K/M/B suffix), 'N/A', '>0' or
                # 'NaN' appearing after the variable name in a td/span cell.
                regex = r'>' + re.escape(variable) + r'.*?(\-?\d+\.*\d*K?M?B?|N/A[\\n|\s]*|>0|NaN)%?' \
                    r'(</td>|</span>)'
                value = re.search(regex, source, flags=re.DOTALL).group(1)
                value_list.append(data_string_to_float(value))
            except AttributeError:
                # Variable absent from this snapshot.
                value_list.append('N/A')

        # Index/price columns are zero placeholders in the forward sample.
        rows.append(dict(zip(df_columns, [0, 0, ticker, 0, 0, 0, 0] + value_list)))

    df = pd.DataFrame(rows, columns=df_columns)
    return df.replace('N/A', np.nan)
def parse_keystats(sp500_df, stock_df):
    """
    We have downloaded a large number of html files, which are snapshots of a ticker
    at different times, containing the fundamental data (our features).

    To extract the key statistics, we use regex.

    For supervised machine learning, we also need the data that will form our dependent
    variable, the performance of the stock compared to the SP500.

    :sp500_df: dataframe containing SP500 prices
    :stock_df: dataframe containing stock prices
    :return: a dataframe of training data (i.e. features and the components of our
             dependent variable); also written to 'keystats.csv'.
    """
    # The tickers whose data is to be parsed; os.walk yields the root directory
    # first, which we drop.
    stock_list = [x[0] for x in os.walk(statspath)][1:]

    df_columns = [
        'Date', 'Unix', 'Ticker', 'Price', 'stock_p_change', 'SP500',
        'SP500_p_change'
    ] + features

    # Accumulate rows in a list and build the DataFrame once at the end:
    # DataFrame.append() was removed in pandas 2.0 and was O(n^2) anyway.
    rows = []

    # tqdm is a simple progress bar
    for stock_directory in tqdm(stock_list, desc="Parsing progress:", unit="tickers"):
        keystats_html_files = os.listdir(stock_directory)

        # Snippet to get rid of the .DS_Store file in macOS
        if '.DS_Store' in keystats_html_files:
            keystats_html_files.remove('.DS_Store')

        ticker = stock_directory.split(statspath)[1]

        for file in keystats_html_files:
            # Convert the datetime format of our file to unix time
            date_stamp = datetime.strptime(file, '%Y%m%d%H%M%S.html')
            unix_time = time.mktime(date_stamp.timetuple())

            # Read in the html file as a string; remove commas to make the
            # number-parsing regex simpler.
            full_file_path = stock_directory + '/' + file
            with open(full_file_path, 'r') as fh:
                source = fh.read().replace(',', '')

            # Regex-extract every feature value from this snapshot.
            value_list = _extract_feature_values(source)

            # We need the stock price and SP500 price now and one year from now.
            # Convert from unix time to YYYY-MM-DD, so we can look for the price
            # in the dataframe, then calculate the percentage change.
            current_date = datetime.fromtimestamp(unix_time).strftime('%Y-%m-%d')
            one_year_later = datetime.fromtimestamp(
                unix_time + 31536000).strftime('%Y-%m-%d')

            # SP500 prices now and one year later, and the percentage change
            sp500_price = float(sp500_df.loc[current_date, 'Adj Close'])
            sp500_1y_price = float(sp500_df.loc[one_year_later, 'Adj Close'])
            sp500_p_change = round(
                ((sp500_1y_price - sp500_price) / sp500_price * 100), 2)

            # Stock prices now and one year later. Some price data is missing;
            # in that case we must skip this datapoint entirely.
            try:
                stock_price = float(stock_df.loc[current_date, ticker.upper()])
                stock_1y_price = float(stock_df.loc[one_year_later, ticker.upper()])
            except KeyError:
                # print(f"PRICE RETRIEVAL ERROR for {ticker}")
                continue

            stock_p_change = round(
                ((stock_1y_price - stock_price) / stock_price * 100), 2)

            # Append all our data to the row list.
            rows.append(dict(zip(df_columns, [
                date_stamp, unix_time, ticker, stock_price, stock_p_change,
                sp500_price, sp500_p_change
            ] + value_list)))

    df = pd.DataFrame(rows, columns=df_columns)

    # Remove rows with missing stock price data
    df.dropna(axis=0, subset=['Price', 'stock_p_change'], inplace=True)

    # Output the CSV, and also return the dataframe so callers need not
    # re-read it (the original returned None despite its docstring; returning
    # df is backward-compatible).
    df.to_csv('keystats.csv', index=False)
    return df


def _extract_feature_values(source):
    """
    Regex-extract each variable in ``features`` from one html snapshot string.

    :source: the comma-stripped html of a single keystats snapshot
    :return: a list of parsed values aligned with ``features``; values that
             cannot be found become the string 'N/A'.
    """
    value_list = []
    for variable in features:
        # Search for the table entry adjacent to the variable name.
        try:
            regex = (r'>' + re.escape(variable) +
                     r'.*?(\-?\d+\.*\d*K?M?B?|N/A[\\n|\s]*|>0|NaN)%?'
                     r'(</td>|</span>)')
            value = re.search(regex, source, flags=re.DOTALL).group(1)

            # Dealing with number formatting
            value_list.append(data_string_to_float(value))

        # The data may not be present. Process accordingly.
        except AttributeError:
            # In the past, 'Avg Vol' was instead named 'Average Volume'.
            # If 'Avg Vol' fails, search for 'Average Volume'.
            if variable == 'Avg Vol (3 month)':
                try:
                    new_variable = '>Average Volume (3 month)'
                    regex = (re.escape(new_variable) +
                             r'.*?(\-?\d+\.*\d*K?M?B?|N/A[\\n|\s]*|>0)%?'
                             r'(</td>|</span>)')
                    value = re.search(regex, source, flags=re.DOTALL).group(1)
                    value_list.append(data_string_to_float(value))
                except AttributeError:
                    value_list.append('N/A')
            else:
                value_list.append('N/A')
    return value_list
def test_data_string_to_float():
    """
    data_string_to_float() is a function that needs to meet lots of empirical
    requirements owing to the idiosyncrasies of Yahoo Finance's HTML.
    The main jobs are parsing negatives and abbreviations of big numbers.
    """
    # (raw string, expected result) pairs covering NaN/N-A handling, the bare
    # '>0' cell, signs, decimals, and the K/M/B magnitude suffixes.
    cases = [
        ("asdfNaN", "N/A"),
        (">N/A\n</", "N/A"),
        (">0", 0),
        ("-3", -3),
        ("4K", 4000),
        ("2M", 2000000),
        ("0.07B", 70000000),
        ("-100.1K", -100100),
        ("-0.1M", -100000),
        ("-0.02B", -20000000),
        ("-0.00", 0),
        ("0.00", 0),
        ("0M", 0),
        ("010K", 10000),
    ]
    for raw, expected in cases:
        assert utils.data_string_to_float(raw) == expected

    # Malformed strings must raise rather than parse silently.
    for malformed in (">0x", "10k", "2KB"):
        with pytest.raises(ValueError):
            utils.data_string_to_float(malformed)
def test_data_string_to_float():
    """Regression checks for utils.data_string_to_float's parsing rules."""
    # Mapping of raw Yahoo-style strings to their expected parsed values.
    expectations = {
        "asdfNaN": "N/A",
        ">N/A\n</": "N/A",
        ">0": 0,
        "-3": -3,
        "4K": 4000,
        "2M": 2000000,
        "0.07B": 70000000,
        "-100.1K": -100100,
        "-0.1M": -100000,
        "-0.02B": -20000000,
        "-0.00": 0,
        "0.00": 0,
        "0M": 0,
        "010K": 10000,
    }
    for text, expected in expectations.items():
        assert utils.data_string_to_float(text) == expected

    # These inputs are invalid and must raise ValueError.
    for invalid in [">0x", "10k", "2KB"]:
        with pytest.raises(ValueError):
            utils.data_string_to_float(invalid)
with open(full_file_path, 'r') as source: source = source.read() # Remove commas from the html to make parsing easier. source = source.replace(',', '') # Regex search for the different variables in the html file, then append to value_list for variable in features: # Search for the table entry adjacent to the variable name. try: regex = r'>' + re.escape(variable) + r'.*?(\-?\d+\.*\d*K?M?B?|N/A[\\n|\s]*|>0|NaN)%?' \ r'(</td>|</span>)' value = re.search( regex, source, flags=re.DOTALL).group(1) # Dealing with number formatting value_list.append(data_string_to_float(value)) # The data may not be present. Process accordingly except AttributeError: # In the past, 'Avg Vol' was instead named 'Average Volume' # If 'Avg Vol' fails, search for 'Average Volume'. if variable == 'Avg Vol (3 month)': try: new_variable = '>Average Volume (3 month)' regex = re.escape(new_variable) + r'.*?(\-?\d+\.*\d*K?M?B?|N/A[\\n|\s]*|>0)%?' \ r'(</td>|</span>)' value = re.search( regex, source, flags=re.DOTALL).group(1) value_list.append(data_string_to_float(value)) except AttributeError: value_list.append('N/A')