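# Expected `config.ini` for the StockScraper below (a sketch: the section and
# key names match what __init__ reads; the values are placeholders):
#
#   [Credentials]
#   morningstar_username = your_username
#   morningstar_password = your_password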
import configparser

# Project-local imports (assumed module layout; adjust to the real package):
from scrapers import GoogleScraper, AsxScraper, MorningStarScraper
from exceptions import FieldMissingException


class StockScraper(object):
    """Used to scrape stock data from a variety of sources on the web.

    This class should be used by the client code instead of the specific
    scrapers.
    """

    def __init__(self):
        config = configparser.ConfigParser()
        config.read('config.ini')
        ms_username = config.get('Credentials', 'morningstar_username')
        ms_password = config.get('Credentials', 'morningstar_password')

        # We only want stocks with a market cap above 50,000,000.
        # TODO: add this value to the config file!
        self._google_scraper = GoogleScraper(50000000)
        self._asx_scraper = AsxScraper()
        self._ms_scraper = MorningStarScraper(ms_username, ms_password)
        self._ms_scraper.login()

    def scrape_stock_profiles(self):
        """Scrapes and returns a list of stock profiles from various sources
        on the web.
        """
        stock_profiles = []

        # Using Google stock data as our base, populate all the stock profiles.
        for stock in self._google_scraper.scrape_stock_profiles():
            try:
                stock.sector = self._asx_scraper.sector(stock.symbol)
            except KeyError:
                # The Google list sometimes has delisted companies, so if a
                # sector can't be found, it's probably been delisted.
                continue

            # TODO: add these values into the config file?
            if 'Utilities' in stock.sector \
                    or 'Financ' in stock.sector \
                    or 'Banks' in stock.sector \
                    or 'Real Estate' in stock.sector:
                continue

            # Merge in stock data from MorningStar.
            # Note that we keep Google's 'market cap', as it's more up to date.
            try:
                ms_stock = self._ms_scraper.scrape_stock_profile(stock.symbol)
            except FieldMissingException as e:
                # `ms_stock` is unbound when the scrape fails, so report the
                # symbol from the Google profile instead.
                print(stock.symbol, str(e))
                # If an attribute can't be found, we can't really do anything
                # other than just continue. Some companies don't have 'return
                # on capital' available.
                continue

            stock.return_on_capital = ms_stock.return_on_capital
            stock.ebit = ms_stock.ebit
            stock.total_debt = ms_stock.total_debt
            stock.cash = ms_stock.cash
            stock_profiles.append(stock)

        return stock_profiles
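# Usage sketch (illustrative, not part of the original module). Assumes a
# valid `config.ini` with working MorningStar credentials sits in the
# working directory.
if __name__ == '__main__':
    scraper = StockScraper()
    for profile in scraper.scrape_stock_profiles():
        print(profile.symbol, profile.sector, profile.return_on_capital)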
import argparse
import os

# Project-local imports (assumed module layout; adjust to the real package):
from scrapers import GoogleScraper, BigStockScraper, ShutterStockScraper


# Method of the CLI command class (the enclosing class is elided here).
def execute(self, args):
    binary_path = os.getenv("SCRAPER_CHROME_BINARY_PATH")
    driver_path = os.getenv("SCRAPER_CHROME_DRIVER_PATH")
    if binary_path is None or driver_path is None:
        print(
            "Error: SCRAPER_CHROME_BINARY_PATH and SCRAPER_CHROME_DRIVER_PATH "
            "environment variables must be set to the Selenium Chrome driver "
            "settings."
        )
        return

    # Parse arguments. The trailing positionals take nargs='?' so the
    # documented defaults actually apply when they are omitted.
    parser = argparse.ArgumentParser(
        description="Scrape a website for images using search terms.")
    parser.add_argument(
        "site",
        type=str,
        help="The site to search in. Can be 'bigstock', 'google' or "
             "'shutterstock'.",
    )
    parser.add_argument(
        "searchterm", type=str, help="The terms to search for.")
    parser.add_argument(
        "pagecount",
        type=int,
        nargs="?",
        default=100,
        help="The total number of pages to scrape. Default is 100.",
    )
    parser.add_argument(
        "start_page",
        type=int,
        nargs="?",
        default=1,
        help="The page to start the search on. Default is 1.",
    )
    parser.add_argument(
        "image_size",
        type=str,
        nargs="?",
        default="regular",
        help="The image size that should be downloaded. Can be 'small', "
             "'regular' or 'large'. Default is 'regular'.",
    )
    arguments = parser.parse_args()

    self.site = arguments.site
    self.searchterm = arguments.searchterm
    self.pagecount = arguments.pagecount
    self.start_page = arguments.start_page
    self.image_size = arguments.image_size

    search_options = {
        "searchterm": self.searchterm,
        "pagecount": self.pagecount,
        "start_page": self.start_page,
        "image_size": self.image_size,
    }
    webdriver_options = {
        "chrome_binary_path": binary_path,
        "chrome_driver_path": driver_path,
    }

    # Select the proper scraper based on the received args.
    scraper_classes = {
        "google": GoogleScraper,
        "bigstock": BigStockScraper,
        "shutterstock": ShutterStockScraper,
    }
    scraper_class = scraper_classes.get(self.site)
    if scraper_class is None:
        print(f"Error: no scraper found for website '{self.site}'.")
        return
    self.scraper = scraper_class(self)
    self.scraper.run(search_options, webdriver_options)
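# Usage sketch (the script name and paths are hypothetical; the positional
# arguments mirror the argparse definitions above):
#
#   export SCRAPER_CHROME_BINARY_PATH=/usr/bin/google-chrome
#   export SCRAPER_CHROME_DRIVER_PATH=/usr/local/bin/chromedriver
#   python scrape.py google "sunset beach" 10 1 regular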