def downloadEverything(downloader, tickerType, insecure, sleeptime, pandantic):
    loop = 0
    while not downloader.isDone():
        symbols = downloader.nextRequest(insecure, pandantic)
        print("Got " + str(len(symbols)) + " downloaded " + downloader.type + " symbols:")
        if len(symbols) > 2:
            try:
                print(" " + text(symbols[0]))
                print(" " + text(symbols[1]))
                print(" etc...")
            except:
                print(" Could not display some ticker symbols due to char encoding")
        downloader.printProgress()

        # Save the download state occasionally, in case this long-running
        # process is suddenly interrupted.
        loop = loop + 1
        if loop % 200 == 0:
            print("Saving downloader to disk...")
            saveDownloader(downloader, tickerType)
            print("Downloader successfully saved.")
            print("")

        if not downloader.isDone():
            sleep(sleeptime)  # So we don't overload the server.

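# saveDownloader/loadDownloader are called throughout but not defined in this
# section. A minimal sketch, assuming the ".pickle file" messages in main()
# describe a plain pickle round-trip keyed on the ticker type; the real
# helpers may differ.
import pickle

def saveDownloader(downloader, tickerType):
    # Persist the whole downloader (queries, offsets, collected symbols)
    # so an interrupted run can resume where it left off.
    with open(tickerType + '.pickle', 'wb') as f:
        pickle.dump(downloader, f, pickle.HIGHEST_PROTOCOL)

def loadDownloader(tickerType):
    # Raises (e.g. FileNotFoundError) when no previous session exists,
    # which main() catches in order to start a new session.
    with open(tickerType + '.pickle', 'rb') as f:
        return pickle.load(f)
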
def _fetch(self, insecure, market):
    params = {
        'm': market,
        'b': text(self.current_q_item_offset),
        's': self.current_q,
        't': self.type[0].upper(),
        'p': 1,
    }
    query_string = {
        'device': 'console',
        'returnMeta': 'true',
    }
    protocol = 'http' if insecure else 'https'
    req = requests.Request('GET',
                           protocol + '://finance.yahoo.com/_finance_doubledown/api/resource/finance.yfinlist.symbol_lookup' + self._encodeParams(params),
                           headers={'User-agent': user_agent},
                           params=query_string)
    req = req.prepare()
    print("req " + req.url)
    resp = self.rsession.send(req, timeout=(12, 12))
    resp.raise_for_status()
    if self.current_q_item_offset > 2000:
        # Y! stops returning symbols at offset > 2000.
        # Workaround: add a finer-grained search query.
        self._add_queries(self.current_q)
    return resp.json()

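# For illustration: the URL _fetch builds combines matrix-style parameters
# (";key=value" appended to the path by _encodeParams) with an ordinary query
# string. A standalone sketch of the same composition; the parameter values
# here are examples, not taken from a real run.
from urllib.parse import quote, urlencode

params = {'m': 'all', 'b': '0', 's': 'A', 't': 'S', 'p': 1}
matrix = ''.join(';' + quote(k) + '=' + quote(str(v)) for k, v in params.items())
base = 'https://finance.yahoo.com/_finance_doubledown/api/resource/finance.yfinlist.symbol_lookup'
url = base + matrix + '?' + urlencode({'device': 'console', 'returnMeta': 'true'})
print(url)
# https://finance.yahoo.com/_finance_doubledown/api/resource/finance.yfinlist.symbol_lookup;m=all;b=0;s=A;t=S;p=1?device=console&returnMeta=true
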
def _fetch_processor(self):
    while True:
        (current_query, json, msg) = self.fetch_returns.get()
        (symbols, count) = self.decodeSymbolsContainer(json)

        for symbol in symbols:
            self.symbols[symbol.ticker] = symbol
            # Record the symbols returned for this query.
            current_query.results.append(symbol.ticker)

        if count > 10:
            # This should never happen with this API; it always returns at most 10 items.
            raise Exception("Funny things are happening: count " + text(count) + " > 10. Content:\n" + repr(json))

        # There is no pagination with this API.
        # If we receive X results, we assume there are more than X and
        # add another layer of queries to narrow the search further.
        # In the past, X was known to be 10. Now it is some number 1 < X <= 10.
        if self.result_count_action[count] is None:
            # The action for this number of results is unknown,
            # so assume search narrowing is required.
            self._add_queries(current_query, general_search_characters)
        elif self.result_count_action[count]:
            # This number of results is known to require search narrowing.
            self._add_queries(current_query, general_search_characters)
        else:
            # Tell the query it's done.
            current_query.done()

        print(msg)
        self.status_print(symbols)
        self.completed_queries.append(current_query)
        self.fetch_returns.task_done()

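# result_count_action is used above as a lookup table from "number of results
# returned" to "does this query need narrowing?". Its initialization is not
# shown in this section; the following is a plausible sketch (an assumption,
# not the actual code): None = behaviour unknown, True = narrow further,
# False = query is done.
result_count_action = [None] * 11   # index = result count, 0..10
result_count_action[0] = False      # no results: nothing to narrow
result_count_action[10] = True      # a full page: assume more exist, narrow
# Counts 1..9 stay None until observed behaviour tells us which way to go.
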
def exportFile(data, downloader, file_format):
    # Defer the export calls with lambdas so only the requested format is
    # rendered, and so a failure is caught by the try/except below.
    exporting_function = {
        'xlsx': lambda: data.xlsx,
        'json': lambda: data.json.encode('UTF-8'),
        'yaml': lambda: data.yaml.encode('UTF-8'),
    }
    if file_format == 'csv':
        with io.open(downloader.type + '.csv', 'w', encoding='utf-8') as f:
            f.write(text.join(u',', data.headers) + '\n')
            writer = csv.writer(f)
            for i in range(0, len(data)):
                row = [text(y) if y is not None else u"" for y in data[i]]
                writer.writerow(row)
    elif file_format == 'sqlite':
        db = sqlite3.connect(f'{downloader.type}.{file_format}')
        df = data.export('df')
        df.to_sql('YAHOO_TICKERS', db, if_exists='replace')
        db.commit()
        db.close()
    elif file_format in [item for item in formats if item != 'csv']:
        try:
            with open(f'{downloader.type}.{file_format}', 'wb') as f:
                f.write(exporting_function[file_format]())
        except:
            logger.warning(f"Could not export .{file_format} due to an internal error")
    else:
        logger.error('Unknown output format')

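# A minimal usage sketch for exportFile, assuming a tablib.Dataset (which
# provides the .xlsx/.json/.yaml export properties used above), plus the
# module globals it relies on: a `formats` list and a `logger`. The
# downloader only needs a .type attribute here; _Downloader is a
# hypothetical stand-in for this example.
import logging
import tablib

formats = ['csv', 'xlsx', 'json', 'yaml', 'sqlite']
logger = logging.getLogger(__name__)

class _Downloader:
    type = 'stocks'

data = tablib.Dataset()
data.headers = ['Ticker', 'Name', 'Exchange']
data.append(['AAPL', 'Apple Inc.', 'NMS'])

exportFile(data, _Downloader(), 'json')   # writes stocks.json
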
def nextRequest(self, insecure=False, pandantic=False):
    if not self.IsFirstRequest:
        self._nextQuery()
    else:
        self.IsFirstRequest = False

    success = False
    retryCount = 0
    json = None
    # Exponential back-off algorithm:
    # attempt up to 10 more times, sleeping 2, 4, 8, 16, 32, 64, 128, 256,
    # 512 and 1024 seconds respectively.
    maxRetries = 10
    while not success:
        try:
            json = self._fetch(insecure)
            success = True
        except (requests.HTTPError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError) as ex:
            if retryCount < maxRetries:
                attempt = retryCount + 1
                sleepAmt = int(math.pow(2, attempt))
                print("Retry attempt: " + str(attempt) + " of " + str(maxRetries) + "."
                      " Sleep period: " + str(sleepAmt) + " seconds.")
                sleep(sleepAmt)
                retryCount = attempt
            else:
                raise

    (symbols, count) = self.decodeSymbolsContainer(json)

    for symbol in symbols:
        self.symbols[symbol.ticker] = symbol

    # There is no pagination with this API.
    # If we receive 10 results, we assume there are more than 10 and add
    # another layer of queries to narrow the search further.
    if count == 10:
        self._add_queries(self.current_q)
    elif count > 10:
        # This should never happen with this API; it always returns at most 10 items.
        raise Exception("Funny things are happening: count " + text(count) + " > 10. "
                        + "Content:" + "\n" + repr(json))

    if self._getQueryIndex() + 1 >= len(self.queries):
        self.done = True
    else:
        self.done = False

    return symbols

def nextRequest(self, pbar, insecure=False, pandantic=False):
    self._nextQuery()
    success = False
    retryCount = 0
    json = None
    # Exponential back-off algorithm:
    # attempt up to 5 more times, sleeping x, x^2, x^3, x^4 and x^5 seconds
    # respectively.
    maxRetries = 5
    firstSleep = 5  # seconds
    while not success:
        try:
            json = self._fetch(insecure)
            success = True
        except (requests.HTTPError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError) as ex:
            if retryCount < maxRetries:
                retryCount += 1
                sleepAmt = int(math.pow(firstSleep, retryCount))
                pbar.write("Retry attempt: " + str(retryCount) + " of " + str(maxRetries) + "."
                           " Sleep period: " + str(sleepAmt) + " seconds.")
                sleep(sleepAmt)
                # Recreate the session after sleeping.
                self.rsession = requests.Session()
            else:
                raise ex

    (symbols, count) = self.decodeSymbolsContainer(json)

    for symbol in symbols:
        self.symbols[symbol.ticker] = symbol

    # There is no pagination with this API.
    # If we receive 10 results, add another layer of queries by expanding
    # the query to narrow the search further.
    if count == 10:
        self._add_queries(self.queries[self.idx])
    elif count > 10:
        # This should never happen with this API; it always returns at most 10 items.
        raise Exception("Funny things are happening: count " + text(count) + " > 10. "
                        + "Content:" + "\n" + repr(json))

    if self.idx + 1 >= len(self.queries):
        self.done = True
    else:
        self.done = False

    return symbols

def nextRequest(self, insecure=False, pandantic=False):
    self._nextQuery()
    success = False
    retryCount = 0
    json = None
    # Exponential back-off algorithm:
    # attempt up to 5 more times, sleeping 5, 25, 125, 625 and 3125 seconds
    # respectively.
    maxRetries = 5
    while not success:
        try:
            json = self._fetch(insecure)
            success = True
        except (requests.HTTPError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError) as ex:
            if retryCount < maxRetries:
                attempt = retryCount + 1
                sleepAmt = int(math.pow(5, attempt))
                print("Retry attempt: " + str(attempt) + " of " + str(maxRetries) + "."
                      " Sleep period: " + str(sleepAmt) + " seconds.")
                sleep(sleepAmt)
                retryCount = attempt
            else:
                raise

    (symbols, count) = self.decodeSymbolsContainer(json)

    for symbol in symbols:
        self.symbols[symbol.ticker] = symbol

    # There is no pagination with this API.
    # If we receive 10 results, we assume there are more than 10 and add
    # another layer of queries to narrow the search further.
    # The original test was on == 10, but Yahoo sometimes stops earlier,
    # so it was replaced with > 7 (tunable).
    if 7 < count <= 10:
        self._add_queries(self.current_q)
    elif count > 10:
        # This should never happen with this API; it always returns at most 10 items.
        raise Exception("Funny things are happening: count " + text(count) + " > 10. "
                        + "Content:" + "\n" + repr(json))

    # Test if queries[0] has been searched before signalling done.
    if self._getQueryIndex() + 1 >= len(self.queries):
        self.done = True
    else:
        self.done = False

    return symbols

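# The three nextRequest variants above all inline the same retry loop, only
# with different bases and limits. A sketch of how that pattern could be
# factored out; this helper is not part of the original code, and the
# semantics (initial try plus up to max_retries retries, sleeping
# base**retry_count between attempts) mirror the loops above.
import math
from time import sleep

import requests

RETRIABLE = (requests.HTTPError,
             requests.exceptions.ChunkedEncodingError,
             requests.exceptions.ReadTimeout,
             requests.exceptions.ConnectionError)

def with_retries(fetch, max_retries=5, base=5):
    retry_count = 0
    while True:
        try:
            return fetch()
        except RETRIABLE:
            if retry_count >= max_retries:
                raise
            retry_count += 1
            sleep_amt = int(math.pow(base, retry_count))
            print("Retry attempt: %d of %d. Sleep period: %d seconds."
                  % (retry_count, max_retries, sleep_amt))
            sleep(sleep_amt)

# e.g. json = with_retries(lambda: self._fetch(insecure), max_retries=10, base=2)
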
def main():
    downloader = None

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--insecure", help="use HTTP instead of HTTPS", action="store_true")
    parser.add_argument("-e", "--export", help="export immediately without downloading (only useful if you already downloaded something to the .pickle file)", action="store_true")
    parser.add_argument('-E', '--Exchange', help='Only export ticker symbols from this exchange (the filtering is done during the export phase)')
    parser.add_argument('type', nargs='?', default='tiger', help='The type to download, this can be: ' + " ".join(list(options.keys())))
    parser.add_argument("-s", "--sleep", help="The time to sleep in seconds between requests", type=float, default=0)
    parser.add_argument("-p", "--pandantic", help="Stop and warn the user if some rare assertion fails", action="store_true")
    args = parser.parse_args()

    protocol = 'http' if args.insecure else 'https'

    if args.insecure:
        print("Using insecure connection")

    if args.export:
        print("Exporting pickle file")

    tickerType = args.type = args.type.lower()

    print("Checking if we can resume an old download session")
    try:
        downloader = loadDownloader(tickerType)
        print("Downloader found on disk, resuming")
    except:
        print("No old downloader found on disk")
        print("Starting a new session")
        if tickerType not in options:
            print("Error: " + tickerType + " is not a valid type option. See --help")
            exit(1)
        else:
            downloader = options[tickerType]

    rp = robotparser.RobotFileParser()
    rp.set_url(protocol + '://finance.yahoo.com/robots.txt')
    rp.read()
    try:
        if not args.export:
            if not rp.can_fetch(user_agent, protocol + '://finance.yahoo.com/_finance_doubledown/api/resource/searchassist'):
                print('Execution of script halted due to robots.txt')
                return 1

            if not downloader.isDone():
                print("Downloading " + downloader.type)
                print("")
                downloadEverything(downloader, tickerType, args.insecure, args.sleep, args.pandantic)
                print("Saving downloader to disk...")
                saveDownloader(downloader, tickerType)
                print("Downloader successfully saved.")
                print("")
            else:
                print("The downloader has already finished downloading everything")
                print("")

    except Exception as ex:
        print("An exception occurred while downloading. Suspending downloader to disk")
        saveDownloader(downloader, tickerType)
        print("Successfully saved download state")
        print("Try removing {type}.pickle file if this error persists")
        print("Issues can be reported on https://github.com/Benny-/Yahoo-ticker-symbol-downloader/issues")
        print("")
        raise
    except KeyboardInterrupt as ex:
        print("Suspending downloader to disk as .pickle file")
        saveDownloader(downloader, tickerType)

    if downloader.isDone() or args.export:
        print("Exporting " + downloader.type + " symbols")

        data = tablib.Dataset()
        data.headers = downloader.getRowHeader()

        for symbol in downloader.getCollectedSymbols():
            if args.Exchange is None:
                data.append(symbol.getRow())
            elif symbol.exchange == args.Exchange:
                data.append(symbol.getRow())

        with io.open(downloader.type + '.csv', 'w', encoding='utf-8') as f:
            f.write(text.join(u',', data.headers) + '\n')
            writer = csv.writer(f)
            for i in range(0, len(data)):
                row = [text(y) if y is not None else u"" for y in data[i]]
                writer.writerow(row)

        try:
            with open(downloader.type + '.xlsx', 'wb') as f:
                f.write(data.xlsx)
        except:
            print("Could not export .xlsx due to an internal error")

        try:
            with open(downloader.type + '.json', 'wb') as f:
                f.write(data.json.encode('UTF-8'))
        except:
            print("Could not export .json due to an internal error")

        try:
            with open(downloader.type + '.yaml', 'wb') as f:
                f.write(data.yaml.encode('UTF-8'))
        except:
            print("Could not export .yaml due to an internal error")

def _encodeParams(self, params):
    encoded = ''
    for key, value in params.items():
        encoded += ';' + quote(key) + '=' + quote(text(value))
    return encoded

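# A standalone copy of the encoding above for a quick sanity check, assuming
# quote comes from urllib.parse and text is the str/unicode alias used
# elsewhere in this module:
from urllib.parse import quote

def encode_params(params):
    # Matrix-style parameters: ";key=value" pairs appended to the URL path.
    return ''.join(';' + quote(key) + '=' + quote(str(value))
                   for key, value in params.items())

assert encode_params({'s': 'A&B', 'p': 1}) == ';s=A%26B;p=1'
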
def nextRequest(self, insecure=False, pandantic=False, market='all'):
    # You would expect query_done to be a boolean.
    # But unfortunately we can't depend on Yahoo telling us if there
    # are any more entries. Only if Yahoo tells us x amount of times in
    # succession that they are done will we actually go on to the next query.
    if self.query_done >= self.query_done_max:
        self._nextQuery()

    success = False
    retryCount = 0
    json = None
    # Exponential back-off algorithm:
    # attempt up to 3 more times, sleeping 5, 25 and 125 seconds respectively.
    while not success:
        try:
            json = self._fetch(insecure, market)
            success = True
        except (requests.HTTPError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError) as ex:
            if retryCount < 3:
                attempt = retryCount + 1
                sleepAmt = int(math.pow(5, attempt))
                print("Retry attempt: " + str(attempt) + "."
                      " Sleep period: " + str(sleepAmt) + " seconds.")
                sleep(sleepAmt)
                retryCount = attempt
            else:
                raise

    (symbols, count) = self.decodeSymbolsContainer(json)

    for symbol in symbols:
        self.symbols[symbol.ticker] = symbol

    current_q_item_offset = self.current_q_item_offset + len(symbols)
    current_q_total_items = count

    if current_q_item_offset == current_q_total_items:
        self.query_done += 1
    elif current_q_item_offset > current_q_total_items and pandantic:
        # This should never happen now that we are using the JSON API.
        raise Exception("Funny things are happening: current_q_item_offset "
                        + text(current_q_item_offset) + " > "
                        + text(self.current_q_total_items)
                        + " current_q_total_items. Content:" + "\n" + repr(json))
    else:
        self.query_done = 0

    self.current_q_item_offset = current_q_item_offset
    self.current_q_total_items = current_q_total_items

    if len(symbols) == 0:
        self.current_page_retries += 1
        # Related to issue #4, see
        # https://github.com/Benny-/Yahoo-ticker-symbol-downloader/issues/4#issuecomment-51718922
        # Yahoo sometimes gives a "bad" page. There is no way we can determine
        # if we are at the end of pagination or if we happen to get a bad page
        # a few times in a row. So we simply request the page a lot of times.
        # At some point we are fairly certain we are at the end of pagination.
        if self.current_page_retries > 20:
            self.query_done = self.query_done + self.query_done_max
    else:
        self.current_page_retries = 0

    if self.query_done >= self.query_done_max:
        if self._getQueryIndex() + 1 >= len(self.queries):
            self.done = True
        else:
            self.done = False

    return symbols

def nextRequest(self, insecure=False, pandantic=False):
    # You would expect query_done to be a boolean.
    # But unfortunately we can't depend on Yahoo telling us if there
    # are any more entries. Only if Yahoo tells us x amount of times in
    # succession that they are done will we actually go on to the next query.
    if self.query_done >= self.query_done_max:
        self._nextQuery()

    success = False
    retryCount = 0
    html = ""
    # _fetchHtml may raise an exception based on the response status or
    # if the request caused a transport error.
    # At this point we try a simple exponential back-off algorithm:
    # attempt up to 3 more times, sleeping 5, 25 and 125 seconds respectively.
    while not success:
        try:
            html = self._fetchHtml(insecure)
            success = True
        except (requests.HTTPError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError) as ex:
            if retryCount < 3:
                attempt = retryCount + 1
                sleepAmt = int(math.pow(5, attempt))
                print("Retry attempt: " + str(attempt) + "."
                      " Sleep period: " + str(sleepAmt) + " seconds.")
                sleep(sleepAmt)
                retryCount = attempt
            else:
                raise

    soup = BeautifulSoup(html, "html.parser")

    symbols = None
    try:
        # An exception is thrown here for the following reasons:
        # 1. Yahoo does not include a table (or any results!) if you
        #    request items at offset 2020 or more.
        # 2. Yahoo randomly screws an http request up and the table is
        #    missing (a bad page). A successive http request might not
        #    result in an exception here.
        # 3. A TypeError is raised. This is disabled for now.
        #    TypeError should be thrown in the different downloaders like
        #    MutualFundDownloader.py. It should be a sanity check to make
        #    sure we download the correct type. But for some reason Yahoo
        #    consistently gives back some incorrect types. Search for
        #    Mutual Fund and 1 out of the 20 are ETF's. I am not sure what
        #    is going on. At the moment the sanity checks have been disabled
        #    in the different downloaders.
        symbolsContainer = soup.find("table", {"class": "yui-dt"}).tbody
        symbols = self.decodeSymbolsContainer(symbolsContainer)
    except KeyboardInterrupt as ex:
        raise
    except TypeError as ex:
        raise
    except:
        symbols = []

    for symbol in symbols:
        self.symbols[symbol.ticker] = symbol

    current_q_item_offset = self.current_q_item_offset + len(symbols)
    current_q_total_items = self._getTotalItemsFromSoup(soup)

    if current_q_total_items != 'Unknown':
        if current_q_item_offset == current_q_total_items:
            self.query_done = self.query_done + 1
        elif current_q_item_offset > current_q_total_items and pandantic:
            # This happens rarely for multiple requests to the same url.
            # Output is guaranteed to be inconsistent between runs.
            raise Exception("Funny things are happening: current_q_item_offset "
                            + text(current_q_item_offset) + " > "
                            + text(self.current_q_total_items)
                            + " current_q_total_items. HTML:" + "\n" + text(html))
        else:
            self.query_done = 0

    self.current_q_item_offset = current_q_item_offset
    self.current_q_total_items = current_q_total_items

    if len(symbols) == 0:
        self.current_page_retries += 1
        # Related to issue #4, see
        # https://github.com/Benny-/Yahoo-ticker-symbol-downloader/issues/4#issuecomment-51718922
        # Yahoo sometimes gives a "bad" page. There is no way we can determine
        # if we are at the end of pagination or if we happen to get a bad page
        # a few times in a row. So we simply request the page a lot of times.
        # At some point we are fairly certain we are at the end of pagination.
        if self.current_page_retries > 20:
            self.query_done = self.query_done + self.query_done_max
    else:
        self.current_page_retries = 0

    if self.query_done >= self.query_done_max:
        if self._getQueryIndex() + 1 >= len(self.queries):
            self.done = True
        else:
            self.done = False

    return symbols

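# The query_done counter above acts as a debounce: only after query_done_max
# consecutive "looks finished" signals does the downloader move on to the
# next query. A tiny standalone demonstration of that behaviour, with
# illustrative values only:
query_done, query_done_max = 0, 2

for looks_finished in [True, False, True, True]:
    if looks_finished:
        query_done += 1
    else:
        query_done = 0   # any contradicting page resets the count
    print(query_done >= query_done_max)
# -> False, False, False, True: one bad page in between restarts the count.
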
def main():
    downloader = None

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--insecure", help="use HTTP instead of HTTPS", action="store_true")
    parser.add_argument("-e", "--export", help="export immediately without downloading (only useful if you already downloaded something to the .pickle file)", action="store_true")
    parser.add_argument('-E', '--Exchange', help='Only export ticker symbols from this exchange (the filtering is done during the export phase)')
    parser.add_argument('type', help='The type to download, this can be: ' + " ".join(list(options.keys())))
    parser.add_argument("-s", "--sleep", help="The time to sleep in seconds between requests", type=float, default=0)
    parser.add_argument("-p", "--pandantic", help="Stop and warn the user if some rare assertion fails", action="store_true")
    parser.add_argument("-m", "--market", help="Specify the region of queried exchanges (us = USA+Canada, dr = Germany, fr = France, hk = Hong Kong, gb = United Kingdom, default = all)", default="all")
    args = parser.parse_args()

    if args.insecure:
        print("Using insecure connection")

    if args.export:
        print("Exporting pickle file")

    tickerType = args.type = args.type.lower()
    market = args.market = args.market.lower()

    print("Checking if we can resume an old download session")
    try:
        downloader = loadDownloader(tickerType)
        print("Downloader found on disk, resuming")
    except:
        print("No old downloader found on disk")
        print("Starting a new session")
        if tickerType not in options:
            print("Error: " + tickerType + " is not a valid type option. See --help")
            exit(1)
        else:
            downloader = options[tickerType]

    try:
        if not args.export:
            if not downloader.isDone():
                print("Downloading " + downloader.type)
                print("")
                downloadEverything(downloader, tickerType, args.insecure, args.sleep, args.pandantic, market)
                print("Saving downloader to disk...")
                saveDownloader(downloader, tickerType)
                print("Downloader successfully saved.")
                print("")
            else:
                print("The downloader has already finished downloading everything")
                print("")

    except Exception as ex:
        print("An exception occurred while downloading. Suspending downloader to disk")
        saveDownloader(downloader, tickerType)
        print("Successfully saved download state")
        print("Try removing {type}.pickle file if this error persists")
        print("Issues can be reported on https://github.com/Benny-/Yahoo-ticker-symbol-downloader/issues")
        print("")
        raise
    except KeyboardInterrupt as ex:
        print("Suspending downloader to disk as .pickle file")
        saveDownloader(downloader, tickerType)

    if downloader.isDone() or args.export:
        print("Exporting " + downloader.type + " symbols")

        data = tablib.Dataset()
        data.headers = downloader.getRowHeader()

        for symbol in downloader.getCollectedSymbols():
            if args.Exchange is None:
                data.append(symbol.getRow())
            elif symbol.exchange == args.Exchange:
                data.append(symbol.getRow())

        with io.open(downloader.type + '.csv', 'w', encoding='utf-8') as f:
            f.write(text.join(u',', data.headers) + '\n')
            writer = csv.writer(f)
            for i in range(0, len(data)):
                row = [text(y) if y is not None else u"" for y in data[i]]
                writer.writerow(row)

        with open(downloader.type + '.xlsx', 'wb') as f:
            f.write(data.xlsx)

        with open(downloader.type + '.json', 'wb') as f:
            f.write(data.json.encode('UTF-8'))

        with open(downloader.type + '.yaml', 'wb') as f:
            f.write(data.yaml.encode('UTF-8'))

def print_symbol(symbol):
    try:
        print(" " + text(symbol))
    except:
        print(" Could not display some ticker symbols due to char encoding")

def main():
    downloader = None

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--insecure", help="use HTTP instead of HTTPS", action="store_true")
    parser.add_argument("-e", "--export", help="export immediately without downloading (only useful if you already downloaded something to the .pickle file)", action="store_true")
    parser.add_argument('-E', '--Exchange', help='Only export ticker symbols from this exchange (the filtering is done during the export phase)')
    parser.add_argument('type', nargs='?', default='generic', help='The type to download, this can be: ' + " ".join(list(options.keys())))
    parser.add_argument("-s", "--sleep", help="The time to sleep in seconds between requests", type=float, default=0)
    parser.add_argument("-p", "--pandantic", help="Stop and warn the user if some rare assertion fails", action="store_true")
    args = parser.parse_args()

    protocol = 'http' if args.insecure else 'https'

    if args.insecure:
        print("Using insecure connection")

    if args.export:
        print("Exporting pickle file")

    tickerType = args.type = args.type.lower()

    print("Checking if we can resume an old download session")
    try:
        downloader = loadDownloader(tickerType)
        print("Downloader found on disk, resuming")
    except:
        print("No old downloader found on disk")
        print("Starting a new session")
        if tickerType not in options:
            print("Error: " + tickerType + " is not a valid type option. See --help")
            exit(1)
        else:
            downloader = options[tickerType]

    rp = robotparser.RobotFileParser()
    rp.set_url(protocol + '://finance.yahoo.com/robots.txt')
    rp.read()
    try:
        if not args.export:
            if not rp.can_fetch(user_agent, protocol + '://finance.yahoo.com/_finance_doubledown/api/resource/searchassist'):
                print('Execution of script halted due to robots.txt')
                return 1

            if not downloader.isDone():
                print("Downloading " + downloader.type)
                print("")
                downloadEverything(downloader, tickerType, args.insecure, args.sleep, args.pandantic)
                print("Saving downloader to disk...")
                saveDownloader(downloader, tickerType)
                print("Downloader successfully saved.")
                print("")
            else:
                print("The downloader has already finished downloading everything")
                print("")

    except Exception as ex:
        print("An exception occurred while downloading. Suspending downloader to disk")
        saveDownloader(downloader, tickerType)
        print("Successfully saved download state")
        print("Try removing {type}.pickle file if this error persists")
        print("Issues can be reported on https://github.com/Benny-/Yahoo-ticker-symbol-downloader/issues")
        print("")
        raise
    except KeyboardInterrupt as ex:
        print("Suspending downloader to disk as .pickle file")
        saveDownloader(downloader, tickerType)

    if downloader.isDone() or args.export:
        print("Exporting " + downloader.type + " symbols")

        data = tablib.Dataset()
        data.headers = downloader.getRowHeader()

        for symbol in downloader.getCollectedSymbols():
            if args.Exchange is None:
                data.append(symbol.getRow())
            elif symbol.exchange == args.Exchange:
                data.append(symbol.getRow())

        with io.open(downloader.type + '.csv', 'w', encoding='utf-8') as f:
            f.write(text.join(u',', data.headers) + '\n')
            writer = csv.writer(f)
            for i in range(0, len(data)):
                row = [text(y) if y is not None else u"" for y in data[i]]
                writer.writerow(row)

        try:
            with open(downloader.type + '.xlsx', 'wb') as f:
                f.write(data.xlsx)
        except:
            print("Could not export .xlsx due to an internal error")

        try:
            with open(downloader.type + '.json', 'wb') as f:
                f.write(data.json.encode('UTF-8'))
        except:
            print("Could not export .json due to an internal error")

        try:
            with open(downloader.type + '.yaml', 'wb') as f:
                f.write(data.yaml.encode('UTF-8'))
        except:
            print("Could not export .yaml due to an internal error")