def forward():
    """
    Creates the forward sample by parsing the current data html files that we downloaded in check_yahoo().

    Every entry of the "forward/" directory (except the macOS ".DS_Store" index file) is read as one
    ticker's page. For each name in `features`, the first value following the feature label is
    extracted by regex and converted with data_string_to_float(); a feature whose pattern does not
    match is recorded as missing. The index and price columns (Date, Unix, Price, stock_p_change,
    SP500, SP500_p_change) are filled with 0 placeholders.

    :return: a pandas dataframe containing all of the current data for each ticker (one row per file),
             with "N/A" entries replaced by NaN.
    """
    # In addition to the features, we need some index variables (date, unix timestamp, ticker),
    # and of course the dependent variables (prices).
    df_columns = [
        "Date",
        "Unix",
        "Ticker",
        "Price",
        "stock_p_change",
        "SP500",
        "SP500_p_change",
    ] + features

    # The patterns only depend on the feature names, so compile them once instead of once per file.
    # Each one looks for the first number (or N/A / NaN marker) after an occurrence of the label.
    # This needs to be fixed every time yahoo changes their UI.
    value_pattern = (r".*?(\-?\d+\.*\d*K?M?B?|N/A[\\n|\s]*|>0|NaN)%?"
                     r"(</td>|</span>)")
    feature_patterns = [
        re.compile(r">" + re.escape(variable) + value_pattern, flags=re.DOTALL)
        for variable in features
    ]

    tickerfile_list = os.listdir("forward/")

    # Required in macOS to remove the hidden index file.
    if ".DS_Store" in tickerfile_list:
        tickerfile_list.remove(".DS_Store")

    # Rows are collected in a list and turned into a dataframe once at the end:
    # DataFrame.append copied the whole frame per row and no longer exists in pandas >= 2.0.
    rows = []
    for tickerfile in tqdm(tickerfile_list,
                           desc="Parsing progress:",
                           unit="tickers"):
        ticker = tickerfile.split(".html")[0].upper()

        # The context manager closes the file even if reading fails.
        with open(f"forward/{tickerfile}") as html_file:
            # Remove commas from the html to make parsing easier.
            source = html_file.read().replace(",", "")

        value_list = []
        for pattern in feature_patterns:
            match = pattern.search(source)
            if match is None:
                # The data may not be present. Process accordingly.
                value_list.append("N/A")
            else:
                # Dealing with number formatting
                value_list.append(data_string_to_float(match.group(1)))

        # Placeholder index/price values, then the ticker's features.
        new_df_row = [0, 0, ticker, 0, 0, 0, 0] + value_list
        rows.append(dict(zip(df_columns, new_df_row)))

    df = pd.DataFrame(rows, columns=df_columns)
    return df.replace("N/A", np.nan)
Esempio n. 2
0
def forward():
    """
    Build the forward sample from the html files stored in the 'forward/' directory.

    Each directory entry (except the macOS '.DS_Store' file) is treated as one ticker's page: the
    ticker is the upper-cased file name up to '.html'. For every name in `features`, the first
    value following the feature label is extracted by regex and converted with
    data_string_to_float(); features whose pattern does not match are recorded as missing.
    The Date, Unix, Price, stock_p_change, SP500 and SP500_p_change columns are filled with 0
    placeholders.

    :return: a pandas dataframe with one row per file, where 'N/A' entries are replaced by NaN.
    """
    df_columns = [
        'Date', 'Unix', 'Ticker', 'Price', 'stock_p_change', 'SP500',
        'SP500_p_change'
    ] + features

    # The patterns depend only on the feature names: compile them once, not once per file.
    value_pattern = r'.*?(\-?\d+\.*\d*K?M?B?|N/A[\\n|\s]*|>0|NaN)%?' \
                    r'(</td>|</span>)'
    compiled_patterns = [
        re.compile(r'>' + re.escape(variable) + value_pattern, flags=re.DOTALL)
        for variable in features
    ]

    tickerfile_list = os.listdir('forward/')

    # fix .ds_store issue on mac
    if '.DS_Store' in tickerfile_list:
        tickerfile_list.remove('.DS_Store')

    # Accumulate plain dicts and build the dataframe once at the end; DataFrame.append
    # re-copied the frame for every row and was removed in pandas 2.0.
    records = []
    for tickerfile in tqdm(tickerfile_list,
                           desc="Parsing progress:",
                           unit="tickers"):
        ticker = tickerfile.split('.html')[0].upper()

        # Use a context manager so the file handle is always closed.
        with open(f"forward/{tickerfile}") as page:
            # Commas are stripped so that numbers such as 1,234 parse as one token.
            source = page.read().replace(',', '')

        value_list = []
        for pattern in compiled_patterns:
            found = pattern.search(source)
            if found is None:
                # Feature not present in this page.
                value_list.append('N/A')
            else:
                value_list.append(data_string_to_float(found.group(1)))

        new_df_row = [0, 0, ticker, 0, 0, 0, 0] + value_list
        records.append(dict(zip(df_columns, new_df_row)))

    df = pd.DataFrame(records, columns=df_columns)
    return df.replace('N/A', np.nan)
Esempio n. 3
0
def _extract_keystats_values(source, feature_patterns, avg_volume_fallback):
    """Return one parsed value (or 'N/A' when nothing matches) per feature, in feature order."""
    value_list = []
    for variable, pattern in feature_patterns:
        match = pattern.search(source)
        # In the past, 'Avg Vol' was instead named 'Average Volume'.
        # If 'Avg Vol' fails, search for 'Average Volume'.
        if match is None and variable == 'Avg Vol (3 month)':
            match = avg_volume_fallback.search(source)

        if match is None:
            # The data may not be present. Process accordingly.
            value_list.append('N/A')
        else:
            # Dealing with number formatting
            value_list.append(data_string_to_float(match.group(1)))
    return value_list


def parse_keystats(sp500_df, stock_df):
    """
    We have downloaded a large number of html files, which are snapshots of a ticker at different times,
    containing the fundamental data (our features). To extract the key statistics, we use regex.
    For supervised machine learning, we also need the data that will form our dependent variable,
    the performance of the stock compared to the SP500.

    Every sub-directory found under `statspath` is treated as one ticker, and every file in it must be
    named '%Y%m%d%H%M%S.html' (the snapshot time). For each snapshot the features are parsed, and the
    stock and SP500 prices at the snapshot date and 365 days later are looked up to compute the
    percentage changes. Snapshots whose stock price lookup raises KeyError are skipped.

    :sp500_df: dataframe containing SP500 prices; read via .loc['YYYY-MM-DD', 'Adj Close'].
               These lookups are not guarded, so a missing date raises KeyError.
    :stock_df: dataframe containing stock prices; read via .loc['YYYY-MM-DD', TICKER].
    :return: nothing. The training data (i.e features and the components of our dependent variable)
             is written to 'keystats.csv' in the current working directory.
    """
    # Number of seconds in 365 days, used to find the date one year after a snapshot.
    seconds_per_year = 31536000

    # The tickers whose data is to be parsed. The first os.walk entry is statspath itself.
    stock_list = [x[0] for x in os.walk(statspath)]
    stock_list = stock_list[1:]

    df_columns = [
        'Date', 'Unix', 'Ticker', 'Price', 'stock_p_change', 'SP500',
        'SP500_p_change'
    ] + features

    # The patterns depend only on the feature names, so compile them once up front.
    # Each searches for the table entry adjacent to the variable name.
    feature_patterns = [
        (variable,
         re.compile(r'>' + re.escape(variable) + r'.*?(\-?\d+\.*\d*K?M?B?|N/A[\\n|\s]*|>0|NaN)%?'
                    r'(</td>|</span>)', flags=re.DOTALL))
        for variable in features
    ]
    avg_volume_fallback = re.compile(
        re.escape('>Average Volume (3 month)') + r'.*?(\-?\d+\.*\d*K?M?B?|N/A[\\n|\s]*|>0)%?'
        r'(</td>|</span>)', flags=re.DOTALL)

    # Rows are collected here and converted to a dataframe once at the end: DataFrame.append
    # copied the whole frame for every row and was removed in pandas 2.0.
    rows = []

    # tqdm is a simple progress bar
    for stock_directory in tqdm(stock_list,
                                desc="Parsing progress:",
                                unit="tickers"):
        keystats_html_files = os.listdir(stock_directory)

        # Snippet to get rid of the .DS_Store file in macOS
        if '.DS_Store' in keystats_html_files:
            keystats_html_files.remove('.DS_Store')

        ticker = stock_directory.split(statspath)[1]

        for filename in keystats_html_files:
            # Convert the datetime format of our file to unix time
            date_stamp = datetime.strptime(filename, '%Y%m%d%H%M%S.html')
            unix_time = time.mktime(date_stamp.timetuple())

            # Read in the html file as a string.
            full_file_path = stock_directory + '/' + filename
            with open(full_file_path, 'r') as html_file:
                # Remove commas from the html to make parsing easier.
                source = html_file.read().replace(',', '')

            value_list = _extract_keystats_values(source, feature_patterns,
                                                  avg_volume_fallback)

            # We need the stock price and SP500 price now and one year from now.
            # Convert from unix time to YYYY-MM-DD, so we can look for the price in the dataframe
            # then calculate the percentage change.
            current_date = datetime.fromtimestamp(unix_time).strftime(
                '%Y-%m-%d')
            one_year_later = datetime.fromtimestamp(
                unix_time + seconds_per_year).strftime('%Y-%m-%d')

            # SP500 prices now and one year later, and the percentage change
            sp500_price = float(sp500_df.loc[current_date, 'Adj Close'])
            sp500_1y_price = float(sp500_df.loc[one_year_later, 'Adj Close'])
            sp500_p_change = round(
                ((sp500_1y_price - sp500_price) / sp500_price * 100), 2)

            # Stock prices now and one year later. We need a try/except because some data is missing
            try:
                stock_price = float(stock_df.loc[current_date, ticker.upper()])
                stock_1y_price = float(stock_df.loc[one_year_later,
                                                    ticker.upper()])
            except KeyError:
                # If stock data is missing, we must skip this datapoint
                continue

            stock_p_change = round(
                ((stock_1y_price - stock_price) / stock_price * 100), 2)

            new_df_row = [
                date_stamp, unix_time, ticker, stock_price, stock_p_change,
                sp500_price, sp500_p_change
            ] + value_list
            rows.append(dict(zip(df_columns, new_df_row)))

    df = pd.DataFrame(rows, columns=df_columns)

    # Remove rows with missing stock price data (e.g. NaN prices in stock_df)
    df.dropna(axis=0, subset=['Price', 'stock_p_change'], inplace=True)
    # Output the CSV
    df.to_csv('keystats.csv', index=False)
Esempio n. 4
0
def test_data_string_to_float():
    """
    Check utils.data_string_to_float() against the quirks of Yahoo Finance's HTML:
    missing-value markers, negative numbers, and the K/M/B abbreviations of big numbers.
    Malformed suffixes are expected to raise ValueError.
    """
    convert = utils.data_string_to_float

    # (raw string scraped from the page, expected parsed value)
    expected_results = [
        ("asdfNaN", "N/A"),
        (">N/A\n</", "N/A"),
        (">0", 0),
        ("-3", -3),
        ("4K", 4000),
        ("2M", 2000000),
        ("0.07B", 70000000),
        ("-100.1K", -100100),
        ("-0.1M", -100000),
        ("-0.02B", -20000000),
        ("-0.00", 0),
        ("0.00", 0),
        ("0M", 0),
        ("010K", 10000),
    ]
    for raw, expected in expected_results:
        assert convert(raw) == expected

    # Inputs that must be rejected rather than silently converted.
    for malformed in (">0x", "10k", "2KB"):
        with pytest.raises(ValueError):
            convert(malformed)
Esempio n. 5
0
def test_data_string_to_float():
    """
    Exercise utils.data_string_to_float() on missing-value markers, signed numbers and
    K/M/B-suffixed numbers, and check that malformed inputs raise ValueError.
    """
    # Raw input string -> value the parser is expected to return.
    valid_cases = {
        "asdfNaN": "N/A",
        ">N/A\n</": "N/A",
        ">0": 0,
        "-3": -3,
        "4K": 4000,
        "2M": 2000000,
        "0.07B": 70000000,
        "-100.1K": -100100,
        "-0.1M": -100000,
        "-0.02B": -20000000,
        "-0.00": 0,
        "0.00": 0,
        "0M": 0,
        "010K": 10000,
    }
    for text, wanted in valid_cases.items():
        assert utils.data_string_to_float(text) == wanted

    # Each of these must raise instead of returning a number.
    invalid_cases = [">0x", "10k", "2KB"]
    for text in invalid_cases:
        with pytest.raises(ValueError):
            utils.data_string_to_float(text)
Esempio n. 6
0
            with open(full_file_path, 'r') as source:
                source = source.read()
                # Remove commas from the html to make parsing easier.
                source = source.replace(',', '')

                # Regex search for the different variables in the html file, then append to value_list
                for variable in features:
                    # Search for the table entry adjacent to the variable name.
                    try:
                        regex = r'>' + re.escape(variable) + r'.*?(\-?\d+\.*\d*K?M?B?|N/A[\\n|\s]*|>0|NaN)%?' \
                            r'(</td>|</span>)'
                        value = re.search(
                            regex, source, flags=re.DOTALL).group(1)

                        # Dealing with number formatting
                        value_list.append(data_string_to_float(value))

                    # The data may not be present. Process accordingly
                    except AttributeError:
                        # In the past, 'Avg Vol' was instead named 'Average Volume'
                        # If 'Avg Vol' fails, search for 'Average Volume'.
                        if variable == 'Avg Vol (3 month)':
                            try:
                                new_variable = '>Average Volume (3 month)'
                                regex = re.escape(new_variable) + r'.*?(\-?\d+\.*\d*K?M?B?|N/A[\\n|\s]*|>0)%?' \
                                    r'(</td>|</span>)'
                                value = re.search(
                                    regex, source, flags=re.DOTALL).group(1)
                                value_list.append(data_string_to_float(value))
                            except AttributeError:
                                value_list.append('N/A')