def parse_user_args(command_line=None):
    # root parser
    parser = ArgumentParser(
        prog='tws_equities',
        description='A Python CLI built to download bar-data for Japanese Equities from '
                    'the TWS API.',
        epilog='All optional arguments work like a toggle switch; the user need not pass an '
               'explicit value to them.',
    )
    parser.add_argument(
        '--verbose', '-v',
        default=False,
        action='store_true',
        help='Use this option to enable console logging; the default behavior is to display '
             'error messages only. Pair this option with the "--debug / -d" option to view '
             'more detailed messages.')
    parser.add_argument(
        '--debug', '-d',
        default=False,
        action='store_true',
        help='This option not only enables console logging but also raises hidden errors; '
             'built specifically for developers trying to debug a problem.')

    # add & build sub-parser for supported commands
    # refer to CLI_CONFIG for available commands
    sub_parser = parser.add_subparsers(dest='command')
    for name, config in CLI_CONFIG.items():
        _build_command(sub_parser, name=name, **config)

    args = parser.parse_args(command_line)

    # user did not choose a command to run
    if args.command is None:
        write_to_console('User should specify which command to run, please choose from the '
                         'given options.\n', verbose=True)
        parser.print_help()
        exit(0)

    # user did not specify tickers
    if hasattr(args, 'tickers') and args.tickers is None:
        write_to_console('User did not specify target tickers, loading from default input file.\n',
                         verbose=True)
        args.tickers = get_default_tickers()

    return vars(args)
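# Illustrative sketch (not part of the CLI): parse_user_args() can be exercised programmatically
# by passing an argv-style list. The 'extract' sub-command name below is an assumption; actual
# command names come from CLI_CONFIG.
def _example_parse_user_args():
    user_args = parse_user_args(['--verbose', 'extract'])
    print(user_args['command'], user_args['verbose'])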
def _cleanup(success_files, success_directory, failure_files, failure_directory, verbose=False):
    message = 'Post-extraction cleanup initiated...'
    write_to_console(message, verbose=verbose)

    # delete duplicate files
    # TODO: this operation should not be required
    duplicate_files = list(set(success_files).intersection(failure_files))
    for file in duplicate_files:
        delete_file(failure_directory, file)

    message = f'Cleaned {len(duplicate_files)} duplicate files...'
    write_to_console(message, pointer='->', indent=1, verbose=verbose)
def create_csv_dump(target_date, end_time='15:01:00', bar_size='1 min', verbose=False):
    """
        Creates CSV files from JSON files for a given date.
        Raises an error if the directory for the given date is not present.
        CSV files are saved at the historical data storage location as:
        'success.csv' & 'failure.csv'
    """
    logger.info('Generating final CSV dump')
    storage_dir = _setup_storage_directories(target_date, bar_size=bar_size)
    _date = f'{target_date[:4]}/{target_date[4:6]}/{target_date[6:]}'
    write_to_console(f'{"-"*30} CSV Conversion: {_date} {"-"*31}', verbose=True)

    target_directory = join(CACHE_DIR, bar_size.replace(' ', ''), target_date,
                            end_time.replace(':', '_'))
    if not isdir(target_directory):
        raise NotADirectoryError(f'Could not find a data storage directory at: {target_directory}')

    success_directory = join(target_directory, 'success')
    failure_directory = join(target_directory, 'failure')

    if isdir(success_directory):
        path = join(storage_dir, 'success.csv')
        success = generate_success_dataframe(success_directory, bar_title='Success', verbose=verbose)
        success.to_csv(path, index=False)
        logger.debug(f'Success file saved at: {path}')

    if isdir(failure_directory):
        path = join(storage_dir, 'failure.csv')
        failure = generate_failure_dataframe(failure_directory, bar_title='Failure', verbose=verbose)
        failure.to_csv(path, index=False)
        logger.debug(f'Failure file saved at: {path}')
def create_csv_dump(target_date, end_time='15:01:00', verbose=False):
    """
        Creates CSV files from JSON files for a given date.
        Raises an error if the directory for the given date is not present.
        Created CSV files are saved at the same location under the names:
        'success.csv' & 'failure.csv'
    """
    logger.info('Generating final CSV dump')
    _date = f'{target_date[:4]}/{target_date[4:6]}/{target_date[6:]}'
    write_to_console(f'{"-"*30} CSV Conversion: {_date} {"-"*31}', verbose=True)

    target_directory = join(_HISTORICAL_DATA_STORAGE, target_date, end_time.replace(':', '_'))
    if not isdir(target_directory):
        raise NotADirectoryError(f'Could not find a data storage directory for date: {target_date}')

    success_directory = join(target_directory, '.success')
    failure_directory = join(target_directory, '.failure')

    if isdir(success_directory):
        path = join(target_directory, 'success.csv')
        success = generate_success_dataframe(success_directory, bar_title='Success', verbose=verbose)
        success.to_csv(path, index=False)
        logger.debug(f'Success file saved at: {path}')

    if isdir(failure_directory):
        path = join(target_directory, 'failure.csv')
        failure = generate_failure_dataframe(failure_directory, bar_title='Failure', verbose=verbose)
        failure.to_csv(path, index=False)
        logger.debug(f'Failure file saved at: {path}')
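# Illustrative sketch: converting the cached JSON for a given trading day into CSV files; the date
# and end time values below are placeholders, and the cache layout for that day must already exist.
def _example_create_csv_dump():
    create_csv_dump('20210312', end_time='15:01:00', verbose=True)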
def generate_success_dataframe(target_directory, bar_title=None, verbose=False):
    """
        Creates a pandas data frame from JSON files present at the given success location.
        Assumes that all these JSON files have valid bar data.
        :param target_directory: location to read JSON files from
        :param bar_title: message to show in front of the progress bar
        :param verbose: set to True to see info messages on console
    """
    if bar_title is not None:
        _BAR_CONFIG['title'] = bar_title

    def _get_ticker_id(file_name):
        return int(file_name.split(sep)[-1].split('.')[0])

    # create a placeholder dataframe
    expected_columns = ['time_stamp', 'ecode', 'session', 'high', 'low', 'close',
                        'volume', 'average', 'count']

    # create temporary directory to store smaller CSV files
    temp_directory = '.temp'
    make_dirs(temp_directory)

    # extract all json files from target directory
    success_files = get_files_by_type(target_directory)
    success_tickers = list(map(_get_ticker_id, success_files))
    total = len(success_tickers)
    data = pd.DataFrame(columns=expected_columns)

    if bool(total):
        write_to_console('=> Generating dataframe for success tickers...', verbose=verbose)
        json_generator = map(read_json_file, success_files)
        counter = 0  # to count temp files
        with alive_bar(total=total, **_BAR_CONFIG) as bar:
            for i in range(total):
                ticker = success_tickers[i]
                ticker_data = next(json_generator)  # load data into a dictionary
                bar_data, meta_data = ticker_data['bar_data'], ticker_data['meta_data']
                temp_data = pd.DataFrame(bar_data)
                temp_data['ecode'] = ticker
                data = data.append(temp_data)
                # periodically flush the accumulated frame to a temp CSV to keep memory usage low
                _time_to_cache = ((i > 0) and (i % 100 == 0)) or (i + 1 == total)
                if _time_to_cache:
                    if data.shape[0] > 0:
                        temp_file = join(temp_directory, f'success_{counter}.csv')
                        data.to_csv(temp_file)
                        data = pd.DataFrame(columns=expected_columns)
                        counter += 1
                bar()

    # merge all CSV files into a single dataframe
    # delete all temp files
    temp_files = get_files_by_type(temp_directory, file_type='csv')
    if bool(temp_files):
        data = pd.concat(map(read_csv, temp_files))
        data.sort_values(by=['ecode', 'time_stamp'], inplace=True, ignore_index=True)
        data = data[expected_columns]
    delete_directory(temp_directory)

    return data
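# Illustrative sketch: building the success dataframe directly from a directory of per-ticker JSON
# files; the path below is a placeholder for an actual cache directory.
def _example_generate_success_dataframe():
    frame = generate_success_dataframe('/path/to/cache/success', bar_title='Success', verbose=True)
    print(frame.shape)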
def generate_extraction_metrics(target_date, end_time='15:01:00', input_tickers=None, verbose=False):
    """
        Generates metrics about success & failure tickers.
        Metrics are saved into a new file called 'metrics.json'.
        :param target_date: date for which metrics are needed
        :param end_time: end time for which metrics are to be generated
        :param input_tickers: tickers for which metrics are to be generated
        :param verbose: set to True to see info messages on console
    """
    logger.info('Generating final extraction metrics')
    _date = f'{target_date[:4]}/{target_date[4:6]}/{target_date[6:]}'
    write_to_console(f'{"-"*30} Metrics Generation: {_date} {"-"*31}', verbose=True)

    expected_metrics = ['total_tickers', 'total_extracted', 'total_extraction_ratio',
                        'extraction_successful', 'extraction_failure', 'success_ratio',
                        'failure_ratio', 'n_225_input_ratio', 'n_225_success_ratio',
                        'n_225_failure_ratio', 'topix_input_ratio', 'topix_success_ratio',
                        'topix_failure_ratio', 'jasdaq_20_input_ratio', 'jasdaq_20_success_ratio',
                        'jasdaq_20_failure_ratio', 'missing_tickers_ratio', 'missing_tickers']
    metrics = dict(zip(expected_metrics, [0.0] * len(expected_metrics)))

    target_directory = join(_HISTORICAL_DATA_STORAGE, target_date, end_time.replace(':', '_'))
    if not isdir(target_directory):
        raise NotADirectoryError(f'Data storage directory for {target_date} not found at '
                                 f'{_HISTORICAL_DATA_STORAGE}')

    success_file = join(target_directory, 'success.csv')
    failure_file = join(target_directory, 'failure.csv')
    if not isfile(success_file):
        raise FileNotFoundError(f'Can not find success file: {success_file}')
    if not isfile(failure_file):
        raise FileNotFoundError(f'Can not find failure file: {failure_file}')

    input_tickers_file = join(target_directory, 'input_tickers.json')
    if input_tickers is None:
        if not isfile(input_tickers_file):
            raise FileNotFoundError(f'Can not find input tickers file: {input_tickers_file}')
        input_tickers = read_json_file(input_tickers_file)

    japan_indices = get_japan_indices()
    _n_225_tickers = japan_indices[japan_indices.n_225.str.contains('T')].n_225.unique().tolist()
    n_225_tickers = list(map(lambda x: int(x.split('.')[0]), _n_225_tickers))
    _topix_tickers = japan_indices[japan_indices.topix.str.contains('T')].topix.unique().tolist()
    topix_tickers = list(map(lambda x: int(x.split('.')[0]), _topix_tickers))
    _jasdaq_20_tickers = japan_indices[japan_indices.jasdaq_20.str.contains('T')].jasdaq_20.unique().tolist()
    jasdaq_20_tickers = list(map(lambda x: int(x.split('.')[0]), _jasdaq_20_tickers))

    success = read_csv(success_file)
    failure = read_csv(failure_file)
    success_tickers = success.ecode.unique().tolist()
    failure_tickers = failure.ecode.unique().tolist()

    total_tickers = len(input_tickers)
    if total_tickers == 0:
        raise ValueError(f'Can not find any input tickers in file {input_tickers_file}')

    extraction_successful = len(success_tickers)
    extraction_failure = len(failure_tickers)
    total_extracted = extraction_successful + extraction_failure
    total_extraction_ratio = round(total_extracted / total_tickers, 3)
    success_ratio = round(extraction_successful / total_tickers, 3)
    failure_ratio = round(extraction_failure / total_tickers, 3)
    logger.debug(f'Updated over-all extraction ratio: {success_ratio}')
    write_to_console(f'Over-all Extraction: {_get_marker(success_ratio)}',
                     pointer='->', indent=2, verbose=True)
    write_to_console(f'Over-all Success Ratio: {success_ratio}',
                     pointer='-', indent=4, verbose=verbose)

    n_225_input = list(set(input_tickers).intersection(n_225_tickers))
    if bool(n_225_input):
        n_225_input_ratio = round(len(n_225_input) / len(n_225_tickers), 3)
        n_225_success = list(set(success_tickers).intersection(n_225_input))
        n_225_failure = list(set(failure_tickers).intersection(n_225_input))
        n_225_success_ratio = round(len(n_225_success) / len(n_225_input), 3)
        n_225_failure_ratio = round(len(n_225_failure) / len(n_225_input), 3)
        logger.debug(f'Updated N225 extraction ratio: {n_225_success_ratio}')
        write_to_console(f'N225 Extraction: {_get_marker(n_225_success_ratio)}',
                         pointer='->', indent=2, verbose=True)
        write_to_console(f'N225 Success Ratio: {n_225_success_ratio}',
                         pointer='-', indent=4, verbose=verbose)
    else:
        logger.debug('Could not find any N225 tickers in the given input')

    topix_input = list(set(input_tickers).intersection(topix_tickers))
    if bool(topix_input):
        topix_input_ratio = round(len(topix_input) / len(topix_tickers), 3)
        topix_success = list(set(success_tickers).intersection(topix_input))
        topix_failure = list(set(failure_tickers).intersection(topix_input))
        topix_success_ratio = round(len(topix_success) / len(topix_input), 3)
        topix_failure_ratio = round(len(topix_failure) / len(topix_input), 3)
        logger.debug(f'Updated Topix extraction ratio: {topix_success_ratio}')
        write_to_console(f'Topix Extraction: {_get_marker(topix_success_ratio)}',
                         pointer='->', indent=2, verbose=True)
        write_to_console(f'Topix Success Ratio: {topix_success_ratio}',
                         pointer='-', indent=4, verbose=verbose)
    else:
        logger.debug('Could not find any Topix tickers in the given input')

    jasdaq_20_input = list(set(input_tickers).intersection(jasdaq_20_tickers))
    if bool(jasdaq_20_input):
        jasdaq_20_input_ratio = round(len(jasdaq_20_input) / len(jasdaq_20_tickers), 3)
        jasdaq_20_success = list(set(success_tickers).intersection(jasdaq_20_input))
        jasdaq_20_failure = list(set(failure_tickers).intersection(jasdaq_20_input))
        jasdaq_20_success_ratio = round(len(jasdaq_20_success) / len(jasdaq_20_input), 3)
        jasdaq_20_failure_ratio = round(len(jasdaq_20_failure) / len(jasdaq_20_input), 3)
        logger.debug(f'Updated JASDAQ 20 extraction ratio: {jasdaq_20_success_ratio}')
        write_to_console(f'JASDAQ 20 Extraction: {_get_marker(jasdaq_20_success_ratio)}',
                         pointer='->', indent=2, verbose=True)
        write_to_console(f'JASDAQ 20 Success Ratio: {jasdaq_20_success_ratio}',
                         pointer='-', indent=4, verbose=verbose)
    else:
        logger.debug('Could not find any JASDAQ 20 tickers in the given input')

    missing_tickers = list(set(input_tickers).difference(success_tickers + failure_tickers))
    missing_tickers_ratio = round(len(missing_tickers) / total_tickers, 3)
    logger.debug(f'Updated missing tickers ratio: {missing_tickers_ratio}')

    # collect whichever of the expected metrics were computed above
    all_vars = vars()
    for key in all_vars:
        if key in expected_metrics:
            metrics[key] = all_vars[key]

    metrics_file = join(target_directory, 'metrics.json')
    save_data_as_json(metrics, metrics_file)
    logger.debug(f'Metrics saved at: {metrics_file}')
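# Illustrative sketch: metrics can be generated either from an explicit ticker list or, when
# input_tickers is omitted, from the cached 'input_tickers.json'; the date and ticker IDs below
# are placeholders.
def _example_generate_extraction_metrics():
    generate_extraction_metrics('20210312', end_time='15:01:00',
                                input_tickers=[1301, 1305], verbose=True)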
def generate_failure_dataframe(target_directory, bar_title=None, verbose=False):
    """
        Creates a pandas data frame from JSON files present at the given failure location.
        Assumes that all these JSON files have valid error stacks.
        :param target_directory: location to read JSON files from
        :param bar_title: message to show in front of the progress bar
        :param verbose: set to True to see info messages on console
    """
    if bar_title is not None:
        _BAR_CONFIG['title'] = bar_title

    def _get_ticker_id(file_name):
        return int(file_name.split(sep)[-1].split('.')[0])

    # create a placeholder dataframe
    expected_columns = ['ecode', 'code', 'message']
    data = pd.DataFrame(columns=expected_columns)

    # create temporary directory to store smaller CSV files
    temp_directory = '.temp'
    make_dirs(temp_directory)

    # extract all json files from target directory
    file_pattern = join(target_directory, '*.json')  # TODO: can be modified to match digital values
    failure_files = glob(file_pattern)
    total = len(failure_files)

    if bool(total):
        write_to_console('=> Generating dataframe for failure tickers...', verbose=verbose)
        json_generator = map(read_json_file, failure_files)
        counter = 0  # to count temp CSV files
        with alive_bar(total=total, **_BAR_CONFIG) as bar:
            for i in range(total):
                ticker_data = next(json_generator)
                meta = ticker_data['meta_data']
                error_stack = meta['_error_stack']
                ecode = meta.get('ecode', _get_ticker_id(failure_files[i]))
                temp_data = pd.DataFrame(error_stack, columns=expected_columns)
                temp_data['ecode'] = ecode
                # if error stack is empty, then create a dummy row
                if temp_data.shape[0] == 0:  # fixme: find a way to control this in the TWS Client
                    dummy_row = {'ecode': ecode, 'code': 'unknown', 'message': 'not available'}
                    temp_data = temp_data.append(dummy_row, ignore_index=True)
                data = data.append(temp_data)
                # periodically flush the accumulated frame to a temp CSV to keep memory usage low
                _time_to_cache = (i + 1 == total) or ((i > 0) and (i % 100 == 0))
                if _time_to_cache:
                    if data.shape[0] > 0:
                        temp_file = join(temp_directory, f'failure_{counter}.csv')
                        data.to_csv(temp_file)
                        data = pd.DataFrame(columns=expected_columns)
                        counter += 1
                bar()

    # merge all CSV files into a single dataframe
    # delete all temp files
    temp_files = get_files_by_type(temp_directory, file_type='csv')
    if bool(temp_files):  # guard against an empty temp directory (mirrors the success variant)
        data = pd.concat(map(read_csv, temp_files))
        data.sort_values(by=['ecode'], ignore_index=True, inplace=True)
        data = data[expected_columns]
    delete_directory(temp_directory)

    return data
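# Illustrative sketch: the failure dataframe carries one row per error-stack entry (or a dummy row
# when the stack is empty); the path below is a placeholder for an actual cache directory.
def _example_generate_failure_dataframe():
    frame = generate_failure_dataframe('/path/to/cache/failure', bar_title='Failure', verbose=True)
    print(frame[['ecode', 'code', 'message']].head())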
def metrics_generator(date, bar_size, tickers):
    """
        Generates extraction metrics for daily downloaded data.
        Writes data to two new files:
            - metrics.csv: day-wise metrics (success, failed, missed vs. total stocks)
            - status.csv: extraction status of each input ticker for a specific day
        Parameters:
        -----------
        - date(str): target date in 'YYYYMMDD' format
        - bar_size(str): bar size used for the download (ex: '1 min')
        - tickers(list | str): list of input tickers, or full path to the input CSV file
    """
    logger.info('Generating final extraction metrics')
    display_date = f'{date[:4]}/{date[4:6]}/{date[6:]}'
    write_to_console(f'{"-"*30} Metrics Generation: {display_date} {"-"*31}', verbose=True)
    try:
        data_location = join(HISTORICAL_DATA_STORAGE, bar_size.replace(' ', ''),
                             date[:4], MONTH_MAP[int(date[4:6])], date)
        # read success, failure & input files
        success = pd.read_csv(join(data_location, 'success.csv'))
        failure = pd.read_csv(join(data_location, 'failure.csv'))
        if type(tickers) is list:
            pass  # TODO: simple metrics generation
        else:
            # assuming that input is a file path
            input_ = pd.read_csv(tickers)
            # filter out relevant input --> active tickers
            relevant_input = input_[input_.status == 'A']
            # get extraction metrics
            metrics = compute_extraction_metrics(success, failure, relevant_input)
            write_to_console(f'Over-all Extraction: {_get_marker(metrics["extraction_ratio"])}',
                             pointer='->', indent=2, verbose=True)
            write_to_console(f'Topix Extraction: {_get_marker(metrics["extraction_ratio_topix"])}',
                             pointer='->', indent=2, verbose=True)
            write_to_console(f'Nikkei 225 Extraction: '
                             f'{_get_marker(metrics["extraction_ratio_nikkei225"])}',
                             pointer='->', indent=2, verbose=True)
            write_to_console(f'JASDAQ 20 Extraction: '
                             f'{_get_marker(metrics["extraction_ratio_jasdaq20"])}',
                             pointer='->', indent=2, verbose=True)
            write_to_console(f'First Section Extraction: '
                             f'{_get_marker(metrics["extraction_ratio_first_section"])}',
                             pointer='->', indent=2, verbose=True)
            write_to_console(f'Second Section Extraction: '
                             f'{_get_marker(metrics["extraction_ratio_second_section"])}',
                             pointer='->', indent=2, verbose=True)
            write_to_console(f'Mothers Extraction: '
                             f'{_get_marker(metrics["extraction_ratio_mothers"])}',
                             pointer='->', indent=2, verbose=True)
            write_to_console(f'JASDAQ Growth Extraction: '
                             f'{_get_marker(metrics["extraction_ratio_jasdaq_growth"])}',
                             pointer='->', indent=2, verbose=True)
            write_to_console(f'JASDAQ Standard Extraction: '
                             f'{_get_marker(metrics["extraction_ratio_jasdaq_standard"])}',
                             pointer='->', indent=2, verbose=True)
            write_to_console(f'Market Capital Above ¥10B Extraction: '
                             f'{_get_marker(metrics["extraction_ratio_mcap_above_10b"])}',
                             pointer='->', indent=2, verbose=True)
            write_to_console(f'Price x 3 Month\'s Trading Volume ¥85MM Extraction: '
                             f'{_get_marker(metrics["extraction_ratio_pv_above_85m"])}',
                             pointer='->', indent=2, verbose=True)
            # generate / update metrics sheet
            _date = f'{date[:4]}-{date[4:6]}-{date[6:]}'
            update_metrics_sheet(_date, metrics)
            # generate daily extraction status sheet
            generate_daily_extraction_status_sheet(success, input_, data_location, date)
    except Exception as e:
        logger.critical(f'Metrics generation failed: {e}')
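# Illustrative sketch: metrics_generator() accepts either a ticker list or a path to an input CSV
# containing a 'status' column; both the date and the file path below are placeholders.
def _example_metrics_generator():
    metrics_generator('20210312', '1 min', '/path/to/input_tickers.csv')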
def extract_historical_data(tickers=None, end_date=None, end_time=None, duration='1 D',
                            bar_size='1 min', what_to_show='TRADES', use_rth=0, date_format=1,
                            keep_upto_date=False, chart_options=(), batch_size=_BATCH_SIZE,
                            max_attempts=3, run_counter=1, verbose=False):
    """
        A wrapper function around HistoricalDataExtractor that pulls data from TWS for the
        given tickers.
        :param tickers: ticker IDs (ex: 1301)
        :param end_date: end date (ex: '20210101')
        :param end_time: end time (ex: '15:00:01')
        :param duration: the amount of time to go back from end_date_time (ex: '1 D')
        :param bar_size: valid bar size or granularity of data (ex: '1 min')
        :param what_to_show: the type of data to retrieve (ex: 'TRADES')
        :param use_rth: set to 1 to retrieve data within regular trading hours only, 0 to include all data
        :param date_format: format for bar data, 1 means yyyyMMdd, 2 means epoch time
        :param keep_upto_date: setting to True will continue to return unfinished bar data
        :param chart_options: to be documented
        :param batch_size: size of each batch as an integer, default=30
        :param max_attempts: maximum number of times to retry failed tickers
        :param run_counter: counts the number of attempts performed, not to be used from outside
        :param verbose: set to True to display messages on console
    """
    logger.info(f'Running extractor, attempt: {run_counter} | max attempts: {max_attempts}')

    # let the user know that data extraction has been initiated
    if run_counter == 1:
        _date_formatted = f'{end_date[:4]}/{end_date[4:6]}/{end_date[6:]}'
        message = f'{"-" * 30} Data Extraction: {_date_formatted} {"-" * 30}'
        write_to_console(message, verbose=True)

    # additional info, if user asks for it
    message = 'Setting things up for data-extraction...'
    write_to_console(message, indent=2, verbose=verbose)
    tickers, cache_success, cache_failure = _prep_for_extraction(tickers, end_date, end_time, bar_size)
    write_to_console('Refreshed cache directories...', indent=4, pointer='->', verbose=verbose)
    write_to_console('Removed already cached tickers...', indent=4, pointer='->', verbose=verbose)
    write_to_console('Reset failed tickers...', indent=4, pointer='->', verbose=verbose)

    write_to_console('Generating ticker batches...', indent=2, verbose=verbose)
    batches = create_batches(tickers, batch_size)
    write_to_console(f'Total Tickers: {len(tickers)}', indent=4, verbose=verbose, pointer='->')
    write_to_console(f'Total Batches: {len(batches)}', indent=4, verbose=verbose, pointer='->')
    write_to_console(f'Batch Size: {batch_size}', indent=4, verbose=verbose, pointer='->')

    # core processing section
    bar_title = f'=> Attempt: {run_counter}'
    message = 'Batch-wise extraction in progress, this can take some time. Please be patient...'
    write_to_console(message, indent=2, verbose=verbose)
    success_files, failure_files = _run_extractor(batches, end_date, end_time, duration, bar_size,
                                                  what_to_show, use_rth, date_format, keep_upto_date,
                                                  chart_options, cache_success, cache_failure,
                                                  bar_title=bar_title)
    run_counter += 1

    # feedback loop: retry failed or missing tickers until we hit the max attempt threshold
    if tickers != list(map(_get_ticker_id, success_files)):
        if run_counter <= max_attempts:
            # TODO: optimize
            unprocessed_tickers = set(tickers).difference(map(_get_ticker_id, success_files))
            batch_size = 10
            extract_historical_data(tickers=unprocessed_tickers, end_date=end_date, end_time=end_time,
                                    duration=duration, bar_size=bar_size, what_to_show=what_to_show,
                                    use_rth=use_rth, date_format=date_format,
                                    keep_upto_date=keep_upto_date, chart_options=chart_options,
                                    batch_size=batch_size, run_counter=run_counter)

    _cleanup(success_files, cache_success, failure_files, cache_failure, verbose=verbose)
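# Illustrative sketch: a minimal end-of-day extraction run; the ticker IDs, date and time below are
# placeholders, and a reachable TWS session is required for the call to do anything useful.
def _example_extract_historical_data():
    extract_historical_data(tickers=[1301, 1305], end_date='20210312', end_time='15:01:00',
                            duration='1 D', bar_size='1 min', verbose=True)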