def scrape_historical_column_data(self, fund_symbol, url, page):
    """Scrape the annual-total-return history columns from a returns page.

    Args:
        fund_symbol: Ticker symbol, used only in error messages.
        url: Source URL (unused here; kept for interface compatibility).
        page: requests.Response whose ``content`` is the returns-page HTML.

    Returns:
        List of lxml column elements, excluding the placeholder column and
        the current-year column.

    Raises:
        FundException.SymbolDoesNotExistError: the site redirected to its
            "Symbols similar to ..." suggestion page.
        FundException.UIChangedError: the expected markup was not found.
    """
    # Build lxml tree from webpage.
    tree = html.fromstring(page.content)
    # Find the H3 tag that says "Annual Total Return (%) History".
    h3_span_text = tree.xpath(
        './/span[text()="Annual Total Return (%) History"]')
    if len(h3_span_text) > 0:
        # The table we want is the div tag that is a sibling of the h3;
        # both live under one overarching div. Get the h3's sibling.
        h3 = h3_span_text[0].getparent()
        table = h3.getnext()
        # Children are in document order; the first two columns (placeholder
        # value column + current year) are unwanted, so slice them off.
        return list(table)[2:]
    else:
        redirected_to_error_page = tree.xpath(
            './/span[contains(text(),"Symbols similar to ")]')
        if len(redirected_to_error_page) > 0:
            raise FundException.SymbolDoesNotExistError(
                f"Error while retrieving data for historical returns: Symbol does not exist: {fund_symbol}"
            )
        else:
            raise FundException.UIChangedError(
                f"Error while retrieving data for historical returns: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
def get_trailing_returns(self, fund_symbol):
    """Scrape trailing returns for the fund, keyed by time period.

    Args:
        fund_symbol: Ticker symbol whose table row is matched by header text.

    Returns:
        Dict mapping timespan label (e.g. "1-Month") to the trailing return
        for that period. Empty if the fund's row is not found in the table.

    Raises:
        FundException.UIChangedError: the returns table is missing.
        FundException.SymbolDoesNotExistError: non-200 response or empty body.
    """
    timespans = [
        "1-Month", "3-Month", "6-Month", "YTD", "1-Year", "3-Year",
        "5-Year", "10-Year", "15-Year"
    ]
    response = {}
    url = Util.build_url(Section.TRAILING, fund_symbol)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        soup = BeautifulSoup(raw.text, 'html.parser')
        # The row whose header cell equals the symbol holds the return values.
        table = soup.find("table")
        if table is not None:
            rows = table.findAll(lambda tag: tag.name == 'tr')
            for row in rows:
                row_header = row.find("th")
                if row_header is not None and row_header.text == fund_symbol:
                    quarterly_returns = [
                        col.text for col in row.findAll("td")
                    ]
                    response = dict(zip(timespans, quarterly_returns))
        else:
            raise FundException.UIChangedError(
                f"Error while retrieving data for trailing returns: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for trailing returns: Symbol does not exist: {fund_symbol}"
        )
    return response
def get_morningstar_overall_rating(self, fund_symbol):
    """Gets the overall Morningstar rating.

    Process:
        1. Use lxml to find the fund's performanceId, located in
           head > meta name = performanceId.
        2. Hit Morningstar's security-identifier API with that id, which
           returns the number of stars in the "starRating" field.
           Ex: FSDAX's performanceId is 0P00002PPP.

    Returns:
        Dict with a single "starRating" key.

    Raises:
        FundException.UIChangedError: "starRating" missing from the payload.
        FundException.SymbolDoesNotExistError: non-200 response or empty body.
    """
    performanceId = self.extract_performance_id(fund_symbol)
    response = {}
    url = Util.build_url(Section.OVERALL_RATING, fund_symbol, 0, performanceId)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        data = raw.json()
        if "starRating" in data:
            response["starRating"] = data["starRating"]
        else:
            # Fixed copy-pasted message: this is the overall rating, not
            # trailing returns.
            raise FundException.UIChangedError(
                f"Error while retrieving data for overall rating: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for overall rating: Symbol does not exist: {fund_symbol}"
        )
    return response
def parseData(self, data, fund_symbol):
    """Parse the holdings HTML into per-stock stat dictionaries.

    Args:
        data: Raw HTML for the holdings page.
        fund_symbol: Ticker symbol, used only in error messages.

    Returns:
        Dict mapping stock name -> merged stats dict built from both the
        equity-view and equity-prices tabs.

    Raises:
        FundException.UIChangedError: one of the expected tab tables is
            missing from the page.
    """
    response = {}
    soup = BeautifulSoup(data, 'html.parser')
    tabs = ["equity_holding_tab", "equityPrice_holding_tab"]
    for tab in tabs:
        table = soup.find("table", id=tab)
        if table is not None:
            rows = table.findAll(lambda tag: tag.name == 'tr')
            for row in rows:
                # Extract stock name from the row header.
                row_header = row.find("th")
                if row_header is not None:
                    stock_name = row_header.text
                    # Extract details for that stock, skipping blank cells.
                    stats = [
                        col.text.strip() for col in row.findAll("td")
                        if col.text.strip() != ""
                    ]
                    if len(stats) > 1:
                        statsDict = self.buildStatsDict(tab, stats)
                        if stock_name not in response:
                            response[stock_name] = statsDict
                        else:
                            # Merge stats from the second tab into the entry
                            # created by the first.
                            response[stock_name] = {
                                **response[stock_name], **statsDict
                            }
        else:
            raise FundException.UIChangedError(
                f"Error while retrieving data for holdings data: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
    return response
def get_holdings_stats(self, fund_symbol):
    """Gets the top 25 companies in the fund's portfolio, plus their stats:

    1. Name
    2. % portfolio weight
    3. YTD return
    4. Shares owned
    5. Shares changed
    6. P/E
    7. Price
    8. G/L % (gain/loss percent for the day)

    First get the 25 most weighted companies from the portfolio (desc).
    For each:
        1. Equity view tab (table id = equity_holding_tab)
           - Name, % portfolio weight, shares owned, shares changed
           - YTD return and P/E (positive, negative, float, or blank "-")
        2. Equity prices tab (table id = equityPrice_holding_tab)
           - Price, G/L % (gain/loss percent)

    Comparisons between 2+ mutual funds will compare Name and
    % portfolio weight only.

    Raises:
        FundException.ImproperSymbolFormatError, SymbolDoesNotExistError,
        UIChangedError, SourceEndpointChangedError: propagated from
        validation and extraction.
    """
    fund_symbol = fund_symbol.upper()
    # Let FundException errors propagate as-is; re-wrapping them in new
    # instances of the same type only obscured the original traceback.
    Util.validate_format(fund_symbol)
    url = Util.build_url(Section.HOLDINGS_PAGE_TOP_25, fund_symbol)
    return self.extractHoldings(url, fund_symbol)
def get_capture_ratios(self, fund_symbol, timespan=None):
    """Gets upside and downside capture ratios for 3/5/10/15 years.

    Args:
        fund_symbol: Ticker symbol whose table row is matched by header text.
        timespan: Unused; defaulted so existing one-argument callers
            (e.g. get_risk_stats) work. Kept for interface compatibility.

    Returns:
        Dict mapping timespan label -> {"upside_ratio", "downside_ratio"}.
        Empty if the fund's row is not found.

    Raises:
        FundException.UIChangedError: the ratios table is missing.
        FundException.SymbolDoesNotExistError: non-200 response or empty body.
    """
    timespans = ["3-Year", "5-Year", "10-Year", "15-Year"]
    response = {}
    url = Util.build_url(Section.CAPTURE_RATIOS, fund_symbol)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        soup = BeautifulSoup(raw.text, 'html.parser')
        table = soup.find("table")
        if table is not None:
            rows = table.findAll(lambda tag: tag.name == 'tr')
            for row in rows:
                row_header = row.find("th")
                if row_header is not None and row_header.text == fund_symbol:
                    stats = []
                    for col in row.findAll("td"):
                        # Values are stuck together.
                        # Ex: "145.9576.71" -> "145.95", "76.71"
                        # (assumes two decimal places per value — TODO confirm)
                        cell_text = col.text
                        first_dot = cell_text.find(".")
                        stats.append({
                            "upside_ratio": cell_text[:first_dot + 3],
                            "downside_ratio": cell_text[first_dot + 3:]
                        })
                    # Delete 1-Year for consistency, since other stats only
                    # have 3-, 5-, 10-, 15-year values.
                    del stats[0]
                    response = dict(zip(timespans, stats))
        else:
            raise FundException.UIChangedError(
                f"Error while retrieving data for risk capture ratios: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for risk capture ratios: Symbol does not exist: {fund_symbol}"
        )
    return response
def get_asset_allocation_data(self, fund_symbol):
    """Gets the asset allocation data necessary for the pie chart.

    Mimics Morningstar's asset allocation pie chart on the quotes page.
    Note: on Morningstar there are 2 possible layouts:
        1. Pie chart: 7 rows in response (1 blank, 6 with 2 columns each:
           field name and value). Ex: PRHSX.
        2. Table: 8 rows (2 irrelevant, 6 with 4 columns each: field name,
           net, short, long). We only use field name and net, to match the
           pie-chart scenario; contains the phrase "Note: Contains
           derivatives or short positions". Ex: FSDAX.

    Returns:
        Dict mapping recognized field name -> its (net) value.

    Raises:
        FundException.UIChangedError: the allocation table is missing.
        FundException.SymbolDoesNotExistError: non-200 response or empty body.
    """
    response = {}
    url = Util.build_url(Section.ASSET_ALLOCATION, fund_symbol)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        soup = BeautifulSoup(raw.text, 'html.parser')
        # Both singular and plural spellings appear across layouts.
        fields = [
            "Cash", "US Stock", "US Stocks", "Non US Stock",
            "Non US Stocks", "Bond", "Bonds", "Other"
        ]
        table = soup.find("table")
        if table is not None:
            rows = table.findAll(lambda tag: tag.name == 'tr')
            for row in rows:
                rowData = [
                    col.text for col in row.findAll("td") if col.text != ""
                ]
                if len(rowData) > 0:
                    fieldEntry = rowData[0]
                    if fieldEntry in fields:
                        response[fieldEntry] = rowData[1]
        else:
            # Fixed copy-pasted message: this is asset allocation, not
            # trailing returns.
            raise FundException.UIChangedError(
                f"Error while retrieving data for asset allocation: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for asset allocation: Symbol does not exist: {fund_symbol}"
        )
    return response
def get_10000_growth(self, fund_symbol):
    """Gets the expected growth of a $10,000 investment over time.

    Returns:
        Dict mapping date string "YYYY-MM-DD" -> expected dollar value.

    Raises:
        FundException.SymbolDoesNotExistError: non-200 response, empty body,
            or a body that is not valid JSON.
        FundException.UIChangedError: chart config present but empty of data.
    """
    response = {}
    url = Util.build_url(Section.GROWTH, fund_symbol)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        try:
            raw_json = raw.json()
        except Exception:
            raise FundException.SymbolDoesNotExistError(
                f"Error while retrieving data for $10000 growth: Symbol does not exist: {fund_symbol}"
            )
        # Extract the JSON config embedded in
        # <div data-mod-config=... class="mod-ui-chart--dynamic">.
        # Renamed from `html` to avoid shadowing the module-level lxml
        # `html` import used by other scrapers in this class.
        page_html = raw_json["html"]
        soup = BeautifulSoup(page_html, 'html.parser')
        data_mod_config_div = soup.find(
            "div", {"class": "mod-ui-chart--dynamic"})["data-mod-config"]
        if data_mod_config_div != "":
            # Convert the dictionary-in-string-form to an actual dictionary.
            growth_json = ast.literal_eval(data_mod_config_div)
            internal_data = growth_json["data"]
            if len(internal_data) >= 1:
                # First element in the dict is the list of values.
                growths = next(iter(internal_data.values()))
                # key = date (YYYY-MM-DD, trailing "T00:00:00" stripped),
                # value = expected dollar value that year.
                response = {
                    year["date"][:-9]: year["value"] for year in growths
                }
            else:
                raise FundException.UIChangedError(
                    f"Error while retrieving data for $10000 growth: UI changed for symbol name: {fund_symbol}; thus, we cannot scrape"
                )
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for $10000 growth: Symbol does not exist: {fund_symbol}"
        )
    return response
def get_mpt_stats(self, fund_symbol, timespan=None):
    """Retrieves alpha, beta, R-squared, Treynor ratio for all timespans.

    Args:
        fund_symbol: Ticker symbol whose table row is matched by header text.
        timespan: Unused — the method always iterates all timespans (the
            original parameter was silently shadowed by the loop variable),
            so it is defaulted for interface compatibility.

    Returns:
        Dict mapping timespan label -> dict of MPT fields.

    Raises:
        FundException.UIChangedError: the stats table is missing.
        FundException.SymbolDoesNotExistError: non-200 response or empty body.
    """
    timespans = ["3-Year", "5-Year", "10-Year", "15-Year"]
    fields = [
        "Category Index", "R-Squared", "Beta", "Alpha", "Treynor Ratio",
        "Currency"
    ]
    response = {}
    for span in timespans:
        year = span.split("-")[0]
        url = Util.build_url(Section.RISK_MPT, fund_symbol, year)
        raw = requests.get(url)
        if raw.status_code == 200 and raw.text != "":
            soup = BeautifulSoup(raw.text, 'html.parser')
            # Only take the first matching row per timespan.
            dataNotFoundYet = True
            table = soup.find("table")
            if table is not None:
                rows = table.findAll(lambda tag: tag.name == 'tr')
                for row in rows:
                    row_header = row.find("th")
                    if dataNotFoundYet and row_header is not None and row_header.text == fund_symbol:
                        dataNotFoundYet = False
                        stats = [
                            col.text.strip() for col in row.findAll("td")
                        ]
                        response[span] = dict(zip(fields, stats))
            else:
                raise FundException.UIChangedError(
                    f"Error while retrieving data for risk mpt: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
                )
        else:
            raise FundException.SymbolDoesNotExistError(
                f"Error while retrieving data for risk mpt: Symbol does not exist: {fund_symbol}"
            )
    return response
def get_risk_stats(self, fund_symbol):
    """Grabs risk stats for 4 time periods (3/5/10/15 year):

    1. Alpha
    2. Beta
    3. R-squared
    4. Standard deviation
    5. Sharpe ratio
    6. Sortino ratio
    7. Treynor ratio
    8. Capture ratios

    Returns:
        Dict mapping timespan label -> merged dict of all risk stats.

    Raises:
        FundException.ImproperSymbolFormatError, SymbolDoesNotExistError,
        UIChangedError, SourceEndpointChangedError: propagated from
        validation and scraping.
    """
    fund_symbol = fund_symbol.upper()
    # Let FundException errors propagate as-is; re-wrapping them in new
    # instances of the same type only obscured the original traceback.
    Util.validate_format(fund_symbol)
    # Capture ratios come from a single GET; MPT and volatility need one
    # GET per timespan, so start from the capture-ratio dict and merge in.
    response = self.get_capture_ratios(fund_symbol)
    timespans = ["3-Year", "5-Year", "10-Year", "15-Year"]
    for timespan in timespans:
        # Extract and aggregate data for MPT stats and volatility stats,
        # then fold them into the current timespan's dict.
        mpt_and_volatility = self.get_mpt_and_volatility_data(
            fund_symbol, timespan)
        response[timespan] = {**response[timespan], **mpt_and_volatility}
    return response
def get_volatility_stats(self, fund_symbol, timespan=None):
    """Retrieves standard deviation, return, sharpe ratio, sortino ratio.

    Args:
        fund_symbol: Ticker symbol whose table row is matched by header text.
        timespan: Unused — the method always iterates all timespans (the
            original parameter was silently shadowed by the loop variable),
            so it is defaulted for interface compatibility.

    Returns:
        Dict mapping timespan label -> dict of volatility fields.

    Raises:
        FundException.UIChangedError: the stats table is missing.
        FundException.SymbolDoesNotExistError: non-200 response or empty body.
    """
    timespans = ["3-Year", "5-Year", "10-Year", "15-Year"]
    fields = [
        "Standard Deviation", "Return", "Sharpe Ratio", "Sortino Ratio"
    ]
    response = {}
    for span in timespans:
        year = span.split("-")[0]
        url = Util.build_url(Section.RISK_VOLATILITY, fund_symbol, year)
        raw = requests.get(url)
        if raw.status_code == 200 and raw.text != "":
            soup = BeautifulSoup(raw.text, 'html.parser')
            table = soup.find("table")
            if table is not None:
                rows = table.findAll(lambda tag: tag.name == 'tr')
                for row in rows:
                    row_header = row.find("th")
                    if row_header is not None and row_header.text == fund_symbol:
                        stats = [
                            col.text.strip() for col in row.findAll("td")
                        ]
                        # Last cell is an unnecessary value.
                        stats.pop()
                        response[span] = dict(zip(fields, stats))
            else:
                raise FundException.UIChangedError(
                    f"Error while retrieving data for risk volatility statistics: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
                )
        else:
            raise FundException.SymbolDoesNotExistError(
                f"Error while retrieving data for risk volatility statistics: Symbol does not exist: {fund_symbol}"
            )
    return response
def get_mpt_and_volatility_data(self, fund_symbol, timespan):
    """For a given timespan, gets ALL the MPT + volatility data.

    Args:
        fund_symbol: Ticker symbol whose table row is matched by header text.
        timespan: Label like "3-Year"; the leading number selects the year
            parameter for the URL.

    Returns:
        Dict merging the stats extracted from both the MPT and volatility
        sections for that timespan.

    Raises:
        FundException.UIChangedError: a section's table is missing.
        FundException.SymbolDoesNotExistError: non-200 response or empty body.
    """
    timespan_dict = {}
    year = timespan.split("-")[0]
    sections = [Section.RISK_MPT, Section.RISK_VOLATILITY]
    for section in sections:
        section_dict = {}
        url = Util.build_url(section, fund_symbol, year)
        raw = requests.get(url)
        if raw.status_code == 200 and raw.text != "":
            soup = BeautifulSoup(raw.text, 'html.parser')
            # Only take the first matching row per section.
            dataNotFoundYet = True
            table = soup.find("table")
            if table is not None:
                rows = table.findAll(lambda tag: tag.name == 'tr')
                for row in rows:
                    row_header = row.find("th")
                    if dataNotFoundYet and row_header is not None and row_header.text == fund_symbol:
                        dataNotFoundYet = False
                        section_dict = self.extract_column_data(row, section)
                # Accumulate section_dict's key-value pairs into timespan_dict.
                timespan_dict = {**timespan_dict, **section_dict}
            else:
                raise FundException.UIChangedError(
                    f"Error while retrieving data for risk mpt: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
                )
        else:
            raise FundException.SymbolDoesNotExistError(
                f"Error while retrieving data for risk mpt: Symbol does not exist: {fund_symbol}"
            )
    return timespan_dict
def get_performance_stats(self, fund_symbol):
    """Aggregates trailing returns, historical returns, and $10,000 growth.

    Returns:
        Dict with keys "trailing_returns", "historical_returns",
        "10000_growth_data".

    Raises:
        FundException.ImproperSymbolFormatError, SymbolDoesNotExistError,
        UIChangedError, SourceEndpointChangedError: propagated from
        validation and scraping.
    """
    fund_symbol = fund_symbol.upper()
    # Let FundException errors propagate as-is; re-wrapping them in new
    # instances of the same type only obscured the original traceback.
    Util.validate_format(fund_symbol)
    stats = {}
    stats["trailing_returns"] = self.get_trailing_returns(fund_symbol)
    stats["historical_returns"] = self.get_fund_historical_returns(
        fund_symbol)
    stats["10000_growth_data"] = self.get_10000_growth(fund_symbol)
    return stats
def get_general_stats(self, fund_symbol):
    """Grabs general stats of the mutual fund:

    1. Price (NAV)
    2. Min. initial investment
    3. Expense ratio
    4. Asset allocation pie chart data (Morningstar's pie chart: Cash,
       US stock, Non-US stock, bonds, etc.)
    5. Morningstar overall rating
    6. Morningstar risk vs category
    7. Morningstar return vs category
    8. Morningstar category
    9. Turnover ratio

    Source = Morningstar, quotes page.

    Returns:
        Dict mapping str(section) -> that section's scraped data.

    Raises:
        FundException.ImproperSymbolFormatError, SymbolDoesNotExistError,
        UIChangedError, SourceEndpointChangedError: propagated from
        validation and scraping.
    """
    fund_symbol = fund_symbol.upper()
    # Let FundException errors propagate as-is; re-wrapping them in new
    # instances of the same type only obscured the original traceback.
    Util.validate_format(fund_symbol)
    response = {}
    sections = [
        Section.GENERAL_STATS, Section.ASSET_ALLOCATION,
        Section.RISK_RETURN_VS_CATEGORY, Section.OVERALL_RATING
    ]
    for section in sections:
        response[str(section)] = self.get_section_data(section, fund_symbol)
    return response
def get_risk_return_vs_category(self, fund_symbol):
    """Gets the:

    1. overall risk compared to its category, as judged by Morningstar
    2. overall return compared to its category, as judged by Morningstar

    Found on the quotes page.

    Returns:
        Dict mapping field label -> value for the matched fields.

    Raises:
        FundException.UIChangedError: the table is missing.
        FundException.SymbolDoesNotExistError: non-200 response or empty body.
    """
    response = {}
    url = Util.build_url(Section.RISK_RETURN_VS_CATEGORY, fund_symbol)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        soup = BeautifulSoup(raw.text, 'html.parser')
        fields = ["Risk vs.Category", "Return vs.Category"]
        table = soup.find("table")
        if table is not None:
            rows = table.findAll(lambda tag: tag.name == 'tr')
            for row in rows:
                rowData = [
                    col.text.strip() for col in row.findAll("td")
                    if col.text.strip() != ""
                ]
                if len(rowData) > 0:
                    fieldEntry = rowData[0]
                    # Substring match: the page may append extra text to the
                    # field label.
                    for field in fields:
                        if fieldEntry.find(field) != -1:
                            response[field] = rowData[1]
        else:
            # Fixed copy-pasted message: this is risk/return vs category,
            # not trailing returns.
            raise FundException.UIChangedError(
                f"Error while retrieving data for risk/return vs category: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for risk/return vs category: Symbol does not exist: {fund_symbol}"
        )
    return response
def get_general_details(self, fund_symbol):
    """Gets the following:

    1. Price / NAV
    2. Minimum investment
    3. Expense ratio (in percentage, ex: .77%)
    4. Turnover ratio (in percentage, ex: .77%)
    5. Morningstar Category

    Returns:
        Dict mapping each vkey (e.g. "NAV") to its stripped span text.

    Raises:
        FundException.UIChangedError: any expected span is missing (raised
            on the first missing key).
        FundException.SymbolDoesNotExistError: non-200 response or empty body.
    """
    response = {}
    url = Util.build_url(Section.GENERAL_STATS, fund_symbol)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        soup = BeautifulSoup(raw.text, 'html.parser')
        keys = [
            "NAV", "MinInvestment", "ExpenseRatio", "Turnover",
            "MorningstarCategory"
        ]
        for key in keys:
            spans = soup.findAll("span", attrs={"vkey": key})
            if len(spans) > 0:
                response[key] = spans[0].text.strip()
            else:
                # Fixed copy-pasted message: this is general details, not
                # trailing returns.
                raise FundException.UIChangedError(
                    f"Error while retrieving data for general details: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
                )
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for general details: Symbol does not exist: {fund_symbol}"
        )
    return response
def extract_column_data(self, section, soup, raw, fund_symbol=""):
    """Extracts a section's data from an already-fetched quotes-page response.

    Args:
        section: Section enum value selecting the extraction strategy.
        soup: BeautifulSoup of the section's HTML (unused for OVERALL_RATING).
        raw: requests.Response; only OVERALL_RATING reads its JSON body.
        fund_symbol: Ticker used in error messages. Added with a default
            because the original body referenced an undefined `fund_symbol`
            and raised NameError on every error path.

    Returns:
        Dict of the extracted field -> value pairs for the section.

    Raises:
        FundException.UIChangedError: expected markup/field not found.
    """
    response = {}
    if section == Section.OVERALL_RATING:
        data = raw.json()
        if "starRating" in data:
            response["starRating"] = data["starRating"]
        else:
            raise FundException.UIChangedError(
                f"Error while retrieving data for General stats, section {section}: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
    elif section == Section.GENERAL_STATS:
        keys = [
            "NAV", "MinInvestment", "ExpenseRatio", "Turnover",
            "MorningstarCategory"
        ]
        for key in keys:
            spans = soup.findAll("span", attrs={"vkey": key})
            if len(spans) > 0:
                response[key] = spans[0].text.strip()
    elif section == Section.ASSET_ALLOCATION:
        # Both singular and plural spellings appear across layouts.
        fields = [
            "Cash", "US Stock", "US Stocks", "Non US Stock",
            "Non US Stocks", "Bond", "Bonds", "Other"
        ]
        table = soup.find("table")
        if table is not None:
            rows = table.findAll(lambda tag: tag.name == 'tr')
            for row in rows:
                rowData = [
                    col.text for col in row.findAll("td") if col.text != ""
                ]
                if len(rowData) > 0:
                    fieldEntry = rowData[0]
                    if fieldEntry in fields:
                        response[fieldEntry] = rowData[1]
        else:
            raise FundException.UIChangedError(
                f"Error while retrieving data for General stats, section {section}: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
    else:
        # Default: risk/return vs category layout.
        fields = ["Risk vs.Category", "Return vs.Category"]
        table = soup.find("table")
        if table is not None:
            rows = table.findAll(lambda tag: tag.name == 'tr')
            for row in rows:
                rowData = [
                    col.text.strip() for col in row.findAll("td")
                    if col.text.strip() != ""
                ]
                if len(rowData) > 0:
                    fieldEntry = rowData[0]
                    # Substring match: the page may append extra text to the
                    # field label.
                    for field in fields:
                        if fieldEntry.find(field) != -1:
                            response[field] = rowData[1]
        else:
            raise FundException.UIChangedError(
                f"Error while retrieving data for General stats, section {section}: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
    return response