def get_morningstar_overall_rating(self, fund_symbol):
    """
    Get the overall Morningstar star rating for a fund.

    Process:
    1. Extract the fund's performanceId from the quotes page (held in a
       <meta name="performanceId"> tag). Ex: FSDAX's performanceId is 0P00002PPP.
    2. Hit Morningstar's security-identifier API with that id; the JSON
       response carries the star count in its "starRating" field.

    Returns:
        dict with a single key "starRating".
    Raises:
        FundException.UIChangedError: the JSON no longer has "starRating".
        FundException.SymbolDoesNotExistError: non-200 or empty response.
    """
    performanceId = self.extract_performance_id(fund_symbol)
    response = {}
    url = Util.build_url(Section.OVERALL_RATING, fund_symbol, 0, performanceId)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        data = raw.json()
        if "starRating" in data:
            response["starRating"] = data["starRating"]
        else:
            # Message previously said "trailing returns" (copy-paste error); fixed.
            raise FundException.UIChangedError(
                f"Error while retrieving data for overall rating: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for overall rating: Symbol does not exist: {fund_symbol}"
        )
    return response
def get_trailing_returns(self, fund_symbol):
    """
    Build a dict mapping time period -> trailing return for that period.

    Scrapes the first HTML table on the trailing-returns page and reads the
    <td> values from the row whose <th> matches the fund symbol.

    Returns:
        dict keyed by the timespan labels below; empty if no row matched.
    Raises:
        FundException.UIChangedError: page has no table to scrape.
        FundException.SymbolDoesNotExistError: non-200 or empty response.
    """
    timespans = [
        "1-Month", "3-Month", "6-Month", "YTD", "1-Year", "3-Year",
        "5-Year", "10-Year", "15-Year"
    ]
    response = {}
    url = Util.build_url(Section.TRAILING, fund_symbol)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        soup = BeautifulSoup(raw.text, 'html.parser')
        # Find corresponding column values of trailing returns. These become
        # the values of the dict.
        table = soup.find("table")
        if table is not None:
            for row in table.findAll("tr"):
                row_header = row.find("th")
                if row_header is not None and row_header.text == fund_symbol:
                    quarterly_returns = [col.text for col in row.findAll("td")]
                    response = dict(zip(timespans, quarterly_returns))
        else:
            raise FundException.UIChangedError(
                f"Error while retrieving data for trailing returns: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for trailing returns: Symbol does not exist: {fund_symbol}"
        )
    return response
def get_fund_historical_returns(self, fund_symbol):
    """
    Fetch the historical-returns page for a fund and delegate parsing to
    retrieve_historical_returns().

    Raises:
        FundException.SymbolDoesNotExistError: non-200 or empty response.
    """
    url = Util.build_url(Section.HISTORICAL, fund_symbol)
    raw = requests.get(url)
    # Guard clause: fail fast on a bad fetch, then hand off the good one.
    if raw.status_code != 200 or raw.text == "":
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for historical returns: Symbol does not exist: {fund_symbol}"
        )
    return self.retrieve_historical_returns(fund_symbol, url, raw)
def get_section_data(self, section, fund_symbol):
    """
    Fetch a section page for the fund and extract its column data.

    OVERALL_RATING needs the fund's performanceId baked into the URL; every
    other section builds its URL from the symbol alone.

    Returns:
        Whatever extract_column_data() produces for this section.
    Raises:
        FundException.SymbolDoesNotExistError: non-200 or empty response.
    """
    if section == Section.OVERALL_RATING:
        performanceId = self.extract_performance_id(fund_symbol)
        url = Util.build_url(section, fund_symbol, 0, performanceId)
    else:
        url = Util.build_url(section, fund_symbol)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        soup = BeautifulSoup(raw.text, 'html.parser')
        return self.extract_column_data(section, soup, raw)
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for General stats, section {section}: Symbol does not exist: {fund_symbol}"
        )
def get_holdings_stats(self, fund_symbol):
    """
    Get the top 25 companies in the fund's portfolio, with these stats:
    1. Name  2. % portfolio weight  3. YTD return  4. Shares owned
    5. Shares changed  6. P/E  7. Price  8. G/L % (gain/loss for the day)

    First gets the 25 most-weighted companies from the portfolio (desc).
    Each tab on the page is an HTML table:
      - equity view tab: id = equity_holding_tab (tbody id holding_epage0)
        -> name, % portfolio weight, shares owned/changed, YTD return, P/E
           (YTD and P/E may be positive, negative, float, or blank "-")
      - equity prices tab: id = equityPrice_holding_tab -> price, G/L %
    Comparisons between 2+ mutual funds compare Name and % portfolio weight only.

    NOTE(review): a second get_holdings_stats is defined later in this class
    and shadows this one at class-creation time — confirm which is intended.

    Raises:
        FundException.ImproperSymbolFormatError, SymbolDoesNotExistError,
        UIChangedError, SourceEndpointChangedError (propagated from helpers).
    """
    fund_symbol = fund_symbol.upper()
    response = {}
    try:
        Util.validate_format(fund_symbol)
        url = Util.build_url(Section.HOLDINGS_PAGE_TOP_25, fund_symbol)
        response = self.extractHoldings(url, fund_symbol)
    except (FundException.ImproperSymbolFormatError,
            FundException.SymbolDoesNotExistError,
            FundException.UIChangedError,
            FundException.SourceEndpointChangedError):
        # Bare re-raise: the original wrapped each exception in a new instance
        # of its own type (raise X(e)), which mangled the message and reset
        # the traceback. Callers still catch the same types.
        raise
    return response
def extract_performance_id(self, fund_symbol):
    """
    Extract the Morningstar performanceId from the fund's quotes page, so
    get_morningstar_overall_rating() can build a URL for the star rating.

    The id lives in <meta name="performanceId" content="..."> in the page head.

    Returns:
        str: the performanceId content value.
    Raises:
        FundException.UIChangedError: the meta tag is missing.
        FundException.SymbolDoesNotExistError: non-200 or empty response.
    """
    url = Util.build_url(Section.QUOTES_PAGE, fund_symbol)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        # Build lxml tree from webpage.
        tree = html.fromstring(raw.content)
        # Find the meta tag named "performanceId" and read its content field.
        tags = tree.xpath('.//meta[@name="performanceId"]')
        if not tags:
            # Previously an empty result crashed with IndexError on tags[0].
            raise FundException.UIChangedError(
                f"Error while retrieving performanceId: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
        return tags[0].get("content")
    # Previously fell through and returned None, which made the caller build
    # a bogus URL; fail loudly instead, consistent with the sibling methods.
    raise FundException.SymbolDoesNotExistError(
        f"Error while retrieving performanceId: Symbol does not exist: {fund_symbol}"
    )
def get_capture_ratios(self, fund_symbol, timespan):
    """
    Get upside and downside capture ratios for the 3, 5, 10 and 15 year spans.

    Note: the `timespan` parameter is unused; kept for interface compatibility.

    Returns:
        dict: timespan label -> {"upside_ratio": ..., "downside_ratio": ...};
        empty if no row matched the symbol.
    Raises:
        FundException.UIChangedError: page has no table to scrape.
        FundException.SymbolDoesNotExistError: non-200 or empty response.
    """
    timespans = ["3-Year", "5-Year", "10-Year", "15-Year"]
    response = {}
    url = Util.build_url(Section.CAPTURE_RATIOS, fund_symbol)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        soup = BeautifulSoup(raw.text, 'html.parser')
        # Find corresponding column values of the risk stats row for this fund.
        table = soup.find("table")
        if table is not None:
            for row in table.findAll("tr"):
                row_header = row.find("th")
                if row_header is not None and row_header.text == fund_symbol:
                    stats = []
                    for col in row.findAll("td"):
                        # Values are stuck together, e.g. "145.9576.71" ->
                        # upside "145.95", downside "76.71". Assumes two
                        # decimal places on the upside value — TODO confirm.
                        cell = col.text  # renamed: previously rebound `raw`, shadowing the HTTP response
                        first_dot = cell.find(".")
                        stats.append({
                            "upside_ratio": cell[:first_dot + 3],
                            "downside_ratio": cell[first_dot + 3:],
                        })
                    # Drop 1-Year for consistency: other risk stats only
                    # cover 3/5/10/15 years.
                    del stats[0]
                    response = dict(zip(timespans, stats))
        else:
            raise FundException.UIChangedError(
                f"Error while retrieving data for risk capture ratios: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for risk capture ratios: Symbol does not exist: {fund_symbol}"
        )
    return response
def get_asset_allocation_data(self, fund_symbol):
    """
    Get the asset allocation data backing Morningstar's quotes-page pie chart.

    Two possible source layouts:
    1. Pie chart (ex: PRHSX): 7 rows (1 blank, 6 with field name + value).
    2. Table (ex: FSDAX): 8 rows (2 irrelevant, 6 with field name, net,
       short, long). Only field name and net are used, matching layout 1.
       This layout contains the phrase
       "Note: Contains derivatives or short positions".

    Returns:
        dict: field name -> value, for the recognized field names below.
    Raises:
        FundException.UIChangedError: page has no table to scrape.
        FundException.SymbolDoesNotExistError: non-200 or empty response.
    """
    response = {}
    url = Util.build_url(Section.ASSET_ALLOCATION, fund_symbol)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        soup = BeautifulSoup(raw.text, 'html.parser')
        # Both singular and plural labels appear depending on layout.
        fields = [
            "Cash", "US Stock", "US Stocks", "Non US Stock",
            "Non US Stocks", "Bond", "Bonds", "Other"
        ]
        table = soup.find("table")
        if table is not None:
            for row in table.findAll("tr"):
                rowData = [
                    col.text for col in row.findAll("td") if col.text != ""
                ]
                if len(rowData) > 0:
                    fieldEntry = rowData[0]
                    if fieldEntry in fields:
                        response[fieldEntry] = rowData[1]
        else:
            # Message previously said "trailing returns" (copy-paste error); fixed.
            raise FundException.UIChangedError(
                f"Error while retrieving data for asset allocation: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for asset allocation: Symbol does not exist: {fund_symbol}"
        )
    return response
def get_10000_growth(self, fund_symbol):
    """
    Get the growth of a hypothetical $10,000 investment over time.

    The endpoint returns JSON whose "html" field embeds an HTML fragment;
    the chart data lives in the data-mod-config attribute of
    <div class="mod-ui-chart--dynamic">, as a dict-shaped string.

    Returns:
        dict: date "YYYY-MM-DD" -> expected dollar value.
    Raises:
        FundException.UIChangedError: chart data present but empty.
        FundException.SymbolDoesNotExistError: bad response or non-JSON body.
    """
    response = {}
    url = Util.build_url(Section.GROWTH, fund_symbol)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        try:
            raw_json = raw.json()
        except Exception:
            raise FundException.SymbolDoesNotExistError(
                f"Error while retrieving data for $10000 growth: Symbol does not exist: {fund_symbol}"
            )
        # Renamed from `html` to avoid shadowing the lxml `html` module used
        # elsewhere in this class.
        page_html = raw_json["html"]
        soup = BeautifulSoup(page_html, 'html.parser')
        data_mod_config_div = soup.find(
            "div", {"class": "mod-ui-chart--dynamic"})["data-mod-config"]
        if data_mod_config_div != "":
            # literal_eval safely parses the dict-shaped string (no code execution).
            growth_json = ast.literal_eval(data_mod_config_div)
            internal_data = growth_json["data"]
            if len(internal_data) >= 1:
                # First element in the dict is the list of point values.
                growths = next(iter(internal_data.values()))
                # Strip the trailing "T00:00:00" (9 chars) from each date.
                response = {
                    point["date"][:-9]: point["value"] for point in growths
                }
            else:
                raise FundException.UIChangedError(
                    f"Error while retrieving data for $10000 growth: UI changed for symbol name: {fund_symbol}; thus, we cannot scrape"
                )
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for $10000 growth: Symbol does not exist: {fund_symbol}"
        )
    return response
def get_mpt_stats(self, fund_symbol, timespan):
    """
    Retrieve MPT stats (alpha, beta, R-squared, Treynor ratio) for each of
    the 3/5/10/15-year spans.

    Note: the `timespan` parameter is unused (immediately shadowed by the
    loop variable); kept for interface compatibility.

    Returns:
        dict: timespan label -> {field name -> value}.
    Raises:
        FundException.UIChangedError: a span's page has no table to scrape.
        FundException.SymbolDoesNotExistError: non-200 or empty response.
    """
    timespans = ["3-Year", "5-Year", "10-Year", "15-Year"]
    fields = [
        "Category Index", "R-Squared", "Beta", "Alpha", "Treynor Ratio",
        "Currency"
    ]
    response = {}
    for timespan in timespans:
        year = timespan.split("-")[0]
        url = Util.build_url(Section.RISK_MPT, fund_symbol, year)
        raw = requests.get(url)
        if raw.status_code == 200 and raw.text != "":
            soup = BeautifulSoup(raw.text, 'html.parser')
            table = soup.find("table")
            if table is not None:
                # Only the first row matching the symbol is used; `break`
                # replaces the original dataNotFoundYet flag.
                for row in table.findAll("tr"):
                    row_header = row.find("th")
                    if row_header is not None and row_header.text == fund_symbol:
                        stats = [
                            col.text.strip() for col in row.findAll("td")
                        ]
                        response[timespan] = dict(zip(fields, stats))
                        break
            else:
                raise FundException.UIChangedError(
                    f"Error while retrieving data for risk mpt: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
                )
        else:
            raise FundException.SymbolDoesNotExistError(
                f"Error while retrieving data for risk mpt: Symbol does not exist: {fund_symbol}"
            )
    return response
def get_mpt_and_volatility_data(self, fund_symbol, timespan):
    """
    For a given timespan, get ALL the MPT + Volatility data.

    Builds a dict accumulating the stats from both the RISK_MPT and
    RISK_VOLATILITY sections for that timespan.

    Returns:
        dict: merged key/value pairs from both sections.
    Raises:
        FundException.UIChangedError: a section's page has no table.
        FundException.SymbolDoesNotExistError: non-200 or empty response.
    """
    timespan_dict = {}
    year = timespan.split("-")[0]
    sections = [Section.RISK_MPT, Section.RISK_VOLATILITY]
    for section in sections:
        section_dict = {}
        url = Util.build_url(section, fund_symbol, year)
        raw = requests.get(url)
        if raw.status_code == 200 and raw.text != "":
            soup = BeautifulSoup(raw.text, 'html.parser')
            table = soup.find("table")
            if table is not None:
                # Only the first row matching the symbol is used; `break`
                # replaces the original dataNotFoundYet flag.
                for row in table.findAll("tr"):
                    row_header = row.find("th")
                    if row_header is not None and row_header.text == fund_symbol:
                        section_dict = self.extract_column_data(row, section)
                        break
                # Accumulate this section's pairs into the result.
                timespan_dict = {**timespan_dict, **section_dict}
            else:
                raise FundException.UIChangedError(
                    f"Error while retrieving data for risk mpt: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
                )
        else:
            raise FundException.SymbolDoesNotExistError(
                f"Error while retrieving data for risk mpt: Symbol does not exist: {fund_symbol}"
            )
    return timespan_dict
def get_volatility_stats(self, fund_symbol, timespan):
    """
    Retrieve standard deviation, return, Sharpe ratio, Sortino ratio for
    each of the 3/5/10/15-year spans.

    Note: the `timespan` parameter is unused (immediately shadowed by the
    loop variable); kept for interface compatibility.

    Returns:
        dict: timespan label -> {field name -> value}.
    Raises:
        FundException.UIChangedError: a span's page has no table to scrape.
        FundException.SymbolDoesNotExistError: non-200 or empty response.
    """
    timespans = ["3-Year", "5-Year", "10-Year", "15-Year"]
    fields = [
        "Standard Deviation", "Return", "Sharpe Ratio", "Sortino Ratio"
    ]
    response = {}
    for timespan in timespans:
        year = timespan.split("-")[0]
        url = Util.build_url(Section.RISK_VOLATILITY, fund_symbol, year)
        raw = requests.get(url)
        if raw.status_code == 200 and raw.text != "":
            soup = BeautifulSoup(raw.text, 'html.parser')
            table = soup.find("table")
            if table is not None:
                for row in table.findAll("tr"):
                    row_header = row.find("th")
                    if row_header is not None and row_header.text == fund_symbol:
                        stats = [
                            col.text.strip() for col in row.findAll("td")
                        ]
                        stats.pop()  # drop trailing unneeded value
                        response[timespan] = dict(zip(fields, stats))
            else:
                raise FundException.UIChangedError(
                    f"Error while retrieving data for risk volatility statistics: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
                )
        else:
            raise FundException.SymbolDoesNotExistError(
                f"Error while retrieving data for risk volatility statistics: Symbol does not exist: {fund_symbol}"
            )
    return response
def get_risk_return_vs_category(self, fund_symbol):
    """
    Get, from the quotes page:
    1. overall risk compared to its category, as judged by Morningstar
    2. overall return compared to its category, as judged by Morningstar

    Returns:
        dict: "Risk vs.Category" / "Return vs.Category" -> value.
    Raises:
        FundException.UIChangedError: page has no table to scrape.
        FundException.SymbolDoesNotExistError: non-200 or empty response.
    """
    response = {}
    url = Util.build_url(Section.RISK_RETURN_VS_CATEGORY, fund_symbol)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        soup = BeautifulSoup(raw.text, 'html.parser')
        # Field labels as they appear on the page (no space after "vs.").
        fields = ["Risk vs.Category", "Return vs.Category"]
        table = soup.find("table")
        if table is not None:
            for row in table.findAll("tr"):
                rowData = [
                    col.text.strip() for col in row.findAll("td")
                    if col.text.strip() != ""
                ]
                if len(rowData) > 0:
                    fieldEntry = rowData[0]
                    for field in fields:
                        # Substring match: the cell may carry extra text.
                        if fieldEntry.find(field) != -1:
                            response[field] = rowData[1]
        else:
            # Message previously said "trailing returns" (copy-paste error); fixed.
            raise FundException.UIChangedError(
                f"Error while retrieving data for risk/return vs category: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
            )
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for risk/return vs category: Symbol does not exist: {fund_symbol}"
        )
    return response
def get_general_details(self, fund_symbol):
    """
    Get the following general stats:
    1. Price / NAV
    2. Minimum investment
    3. Expense ratio (percentage, ex: .77%)
    4. Turnover ratio (percentage, ex: .77%)
    5. Morningstar Category

    Each value lives in a <span vkey="..."> element on the page.

    Returns:
        dict: vkey -> stripped text value.
    Raises:
        FundException.UIChangedError: an expected vkey span is missing.
        FundException.SymbolDoesNotExistError: non-200 or empty response.
    """
    response = {}
    url = Util.build_url(Section.GENERAL_STATS, fund_symbol)
    raw = requests.get(url)
    if raw.status_code == 200 and raw.text != "":
        soup = BeautifulSoup(raw.text, 'html.parser')
        keys = [
            "NAV", "MinInvestment", "ExpenseRatio", "Turnover",
            "MorningstarCategory"
        ]
        for key in keys:
            spans = soup.findAll("span", attrs={"vkey": key})
            if len(spans) > 0:
                response[key] = spans[0].text.strip()
            else:
                # Message previously said "trailing returns" (copy-paste error); fixed.
                raise FundException.UIChangedError(
                    f"Error while retrieving data for general details: UI for source website of this symbol has changed, so we can't scrape the data: {fund_symbol}"
                )
    else:
        raise FundException.SymbolDoesNotExistError(
            f"Error while retrieving data for general details: Symbol does not exist: {fund_symbol}"
        )
    return response
def get_holdings_stats(self, fund_symbol):
    """
    Get holdings stats from the bottom-25 holdings page
    (Section.HOLDINGS_PAGE_BOTTOM_25).

    The endpoint returns JSON whose "htmlStr" field embeds an HTML fragment
    containing two tables:
      - equity view tab (id = equity_holding_tab): per stock, retains
        % portfolio weight, shares owned, country, YTD return, P/E ratio
        (YTD and P/E may be positive, negative, float, or blank "-")
      - equity prices tab (id = equityPrice_holding_tab): per stock, retains
        currency, price, gain/loss %
    Entries from the prices tab overwrite same-named entries from the view tab.

    NOTE(review): this re-defines get_holdings_stats declared earlier in the
    class; at class-creation time this definition wins — confirm intent.

    Returns:
        dict: stock name -> {field -> value}, plus key "data" holding the
        cleaned raw HTML string.
    """
    section = Section.HOLDINGS_PAGE_BOTTOM_25
    url = Util.build_url(section, fund_symbol)
    raw = requests.get(url)
    raw_data = raw.json()
    # Strip whitespace noise from the embedded HTML fragment.
    data = raw_data["htmlStr"].strip().replace("\n", "").replace("\t", "")
    soup = BeautifulSoup(data, 'html.parser')
    # BUG FIX: `response` was never initialized, so the first
    # `response[stock_name] = ...` below raised NameError.
    response = {}

    # Equity view tab
    table = soup.find("table", id="equity_holding_tab")
    if table is not None:
        for row in table.findAll("tr"):
            row_header = row.find("th")
            if row_header is not None:
                stock_name = row_header.text
                stats = [
                    col.text.strip() for col in row.findAll("td")
                    if col.text.strip() != ""
                ]
                if len(stats) > 1:
                    # Drop positions 2..4 (slice 2:5); they don't pertain to
                    # the fields we retain.
                    del stats[2:5]
                    fields = [
                        "% portfolio weight", "Shares Owned", "Country",
                        "YTD Return", "P/E ratio"
                    ]
                    response[stock_name] = dict(zip(fields, stats))

    # Equity prices tab
    table = soup.find("table", id="equityPrice_holding_tab")
    if table is not None:
        for row in table.findAll("tr"):
            row_header = row.find("th")
            if row_header is not None:
                stock_name = row_header.text
                stats = [
                    col.text.strip() for col in row.findAll("td")
                    if col.text.strip() != ""
                ]
                if len(stats) > 1:
                    # Only retain positions 2,3,4 (currency, price, gain/loss %).
                    stats = stats[2:5]
                    fields = ["Currency", "Price", "Gain/Loss %"]
                    response[stock_name] = dict(zip(fields, stats))

    # NOTE(review): returning the raw HTML alongside parsed stats looks like a
    # debug leftover; kept for backward compatibility.
    response["data"] = data
    return response