Exemple #1
0
def _prep_for_extraction(tickers, target_date, end_time):
    """
        Prepares the cache layout for an extraction run.
        - creates '.success' & '.failure' directories under the cache
          directory of the given date
        - stores the input tickers as 'input_tickers.json', unless that file
          is already present
        - replaces the ticker list with the result of
          `_get_unprocessed_tickers`
        :param tickers: tickers requested for extraction
        :param target_date: date, used as name of the cache directory
        :param end_time: passed through to `_get_unprocessed_tickers`
        :return: tuple of (tickers left to process, success directory,
                 failure directory)
    """
    date_cache = join(_HISTORICAL_DATA_STORAGE, target_date)

    # one sub-directory per extraction outcome
    success_dir = join(date_cache, '.success')
    make_dirs(success_dir)
    failure_dir = join(date_cache, '.failure')
    make_dirs(failure_dir)

    # keep a copy of the requested tickers, an existing copy is never overwritten
    # todo: find a better way to do this
    tickers_snapshot = join(date_cache, 'input_tickers.json')
    snapshot_missing = not isfile(tickers_snapshot)
    if snapshot_missing:
        save_data_as_json(tickers, tickers_snapshot, indent=0)

    pending = _get_unprocessed_tickers(tickers, success_dir, failure_dir,
                                       target_date, end_time)

    return pending, success_dir, failure_dir
Exemple #2
0
def _setup_storage_directories(date, bar_size='1 min'):
    """
        Creates (through `make_dirs`) the storage directory for a date.
        Layout: <HISTORICAL_DATA_STORAGE>/<bar size without spaces>/<year>/
                <MONTH_MAP entry of the month number>/<date>
        :param date: date string, characters 0-3 are read as year and
                     characters 4-5 as month number
        :param bar_size: bar size label, spaces are removed to form the
                         directory name
        :return: path of the storage directory
    """
    year = date[:4]
    month_text = date[4:6]
    bar_directory = bar_size.replace(' ', '')
    month_directory = MONTH_MAP[int(month_text)]

    target = join(HISTORICAL_DATA_STORAGE, bar_directory, year,
                  month_directory, date)
    make_dirs(target)
    return target
Exemple #3
0
def _cache_data(data, cache_success, cache_failure):
    """
        Saves every ticker payload as '<ticker>.json'.
        The payload goes to the success directory when its
        ['meta_data']['status'] value is truthy, to the failure directory
        otherwise.
        :param data: mapping of ticker -> payload to be saved
        :param cache_success: directory for payloads with a truthy status
        :param cache_failure: directory for payloads with a falsy status
    """
    for ticker, payload in data.items():
        succeeded = payload['meta_data']['status']
        destination = cache_success if succeeded else cache_failure
        save_data_as_json(payload, join(destination, f'{ticker}.json'))
Exemple #4
0
def generate_daily_extraction_status_sheet(data, input_, location, date):
    """
        Adds an 'extraction_status' column to the input sheet and writes the
        sheet to 'status_<date>.csv' at the given location.
        Status of a row is:
        - 'N/A' when its 'status' value is 'D'
        - True / False depending on whether its 'code' value is among the
          unique 'ecode' values of the extracted data
        Note: the column is added to `input_` itself.
        :param data: data frame with an 'ecode' column
        :param input_: data frame with 'status' & 'code' columns
        :param location: directory to write the status file to
        :param date: date used in the name of the status file
    """
    extracted = data.ecode.unique()

    def _row_status(row):
        if row.status == 'D':
            return 'N/A'
        return row.code in extracted

    input_['extraction_status'] = input_.apply(_row_status, axis=1)
    input_.to_csv(join(location, f'status_{date}.csv'), index=False)
Exemple #5
0
def create_csv_dump(target_date,
                    end_time='15:01:00',
                    bar_size='1 min',
                    verbose=False):
    """
        Creates CSV files from the cached JSON files of a given date.
        JSON files are read from the 'success' & 'failure' directories under
        <CACHE_DIR>/<bar size without spaces>/<target_date>/<end_time with
        ':' replaced by '_'>; a missing 'success' or 'failure' directory is
        skipped.
        CSV files are saved at the historical data storage location as
        'success.csv' & 'failure.csv'.
        :param target_date: date to convert, read as YYYYMMDD for display
        :param end_time: end time of the extraction, part of the cache path
        :param bar_size: bar size label, part of cache & storage path
        :param verbose: passed on to the dataframe generators
        :raises NotADirectoryError: when the cache directory is not present
    """
    logger.info('Generating final CSV dump')
    storage_dir = _setup_storage_directories(target_date, bar_size=bar_size)

    year, month, day = target_date[:4], target_date[4:6], target_date[6:]
    banner = f'{"-"*30} CSV Conversion: {year}/{month}/{day} {"-"*31}'
    write_to_console(banner, verbose=True)

    target_directory = join(CACHE_DIR, bar_size.replace(' ', ''), target_date,
                            end_time.replace(':', '_'))
    if not isdir(target_directory):
        raise NotADirectoryError(
            f'Could not find a data storage directory for date: {target_directory}'
        )

    # (cache sub-directory / csv name, title, dataframe generator)
    conversions = (
        ('success', 'Success', generate_success_dataframe),
        ('failure', 'Failure', generate_failure_dataframe),
    )
    for name, title, build_dataframe in conversions:
        json_directory = join(target_directory, name)
        if not isdir(json_directory):
            continue
        path = join(storage_dir, f'{name}.csv')
        frame = build_dataframe(json_directory,
                                bar_title=title,
                                verbose=verbose)
        frame.to_csv(path, index=False)
        logger.debug(f'{title} file saved at: {path}')
Exemple #6
0
def _prep_for_extraction(tickers, end_date, end_time, bar_size):
    """
        Prepares the cache layout for an extraction run.
        - cache directory: <CACHE_DIR>/<bar size without spaces>/<end_date>/
          <end_time with ':' replaced by '_'>
        - creates 'success' & 'failure' directories under it
        - stores the input tickers as 'input_tickers.json', unless that file
          is already present
        - replaces the ticker list with the result of
          `_get_unprocessed_tickers`
        - deletes '<ticker>.json' from the failure directory for every
          remaining ticker that has a file there
        :return: tuple of (tickers left to process, success directory,
                 failure directory)
    """
    run_cache = join(CACHE_DIR, bar_size.replace(' ', ''), end_date,
                     end_time.replace(':', '_'))

    # one sub-directory per extraction outcome
    cache_success = join(run_cache, 'success')
    make_dirs(cache_success)
    cache_failure = join(run_cache, 'failure')
    make_dirs(cache_failure)

    # keep a copy of the requested tickers, an existing copy is never overwritten
    # todo: find a better way to do this
    tickers_file = join(run_cache, 'input_tickers.json')
    if not isfile(tickers_file):
        save_data_as_json(tickers, tickers_file, indent=1, sort_keys=True)

    pending = _get_unprocessed_tickers(tickers, cache_success)

    # tickers that are going to be processed again must not keep an old failure file
    previously_failed = [
        _get_ticker_id(path) for path in get_files_by_type(cache_failure)
    ]
    for ticker in set(pending).intersection(previously_failed):
        delete_file(cache_failure, f'{ticker}.json')

    return pending, cache_success, cache_failure
def create_csv_dump(target_date, end_time='15:01:00', verbose=False):
    """
        Creates CSV files from the cached JSON files of a given date.
        JSON files are read from the '.success' & '.failure' directories
        under <_HISTORICAL_DATA_STORAGE>/<target_date>/<end_time with ':'
        replaced by '_'>; a missing '.success' or '.failure' directory is
        skipped.
        CSV files are saved in that same directory as 'success.csv' &
        'failure.csv'.
        :param target_date: date to convert, read as YYYYMMDD for display
        :param end_time: end time of the extraction, part of the path
        :param verbose: passed on to the dataframe generators
        :raises NotADirectoryError: when the data directory is not present
    """
    logger.info('Generating final CSV dump')
    year, month, day = target_date[:4], target_date[4:6], target_date[6:]
    write_to_console(
        f'{"-"*30} CSV Conversion: {year}/{month}/{day} {"-"*31}',
        verbose=True)

    target_directory = join(_HISTORICAL_DATA_STORAGE, target_date,
                            end_time.replace(':', '_'))
    if not isdir(target_directory):
        raise NotADirectoryError(
            f'Could not find a data storage directory for date: {target_date}')

    def _dump(json_directory, build_dataframe, title, csv_name):
        # nothing to convert when the JSON directory was never created
        if not isdir(json_directory):
            return
        frame = build_dataframe(json_directory,
                                bar_title=title,
                                verbose=verbose)
        path = join(target_directory, csv_name)
        frame.to_csv(path, index=False)
        logger.debug(f'{title} file saved at: {path}')

    _dump(join(target_directory, '.success'), generate_success_dataframe,
          'Success', 'success.csv')
    _dump(join(target_directory, '.failure'), generate_failure_dataframe,
          'Failure', 'failure.csv')
Exemple #8
0
def _get_successfully_extracted_tickers(target_directory, end_date, end_time):
    """
        Get tickers for which data has already been extracted for the given
        parameters.
        Reads every entry of the target directory accepted by `_filter_file`
        and collects its meta data 'ecode' when the meta data 'end' value
        starts with `end_date` and ends with `end_time`. Files without
        'meta_data' are ignored.
        :param target_directory: directory holding the extracted JSON files
        :param end_date: expected prefix of the 'end' value
        :param end_time: expected suffix of the 'end' value
        :return: list of matching 'ecode' values
    """
    extracted = []
    for file_name in listdir(target_directory):
        if not _filter_file(file_name):
            continue
        content = read_json_file(join(target_directory, file_name))
        if 'meta_data' not in content:
            continue
        meta = content['meta_data']
        window_end = meta['end']
        if window_end.startswith(end_date) and window_end.endswith(end_time):
            extracted.append(meta['ecode'])
    return extracted
def generate_success_dataframe(target_directory,
                               bar_title=None,
                               verbose=False):
    """
        Creates a pandas data frame from JSON files present at the given success location.
        Assumes that all these JSON files have valid bar data.
        Bar data is written to temporary CSV files in batches and merged
        back into a single data frame at the end; the temporary directory is
        removed even when the conversion fails.
        :param target_directory: location to read JSON files from
        :param bar_title: message to show in front of the progress bar
        :param verbose: set to true to see info messages on console
        :return: data frame limited to the expected columns, sorted by
                 'ecode' & 'time_stamp'; an empty data frame with the
                 expected columns when no bar data was found
    """
    if bar_title is not None:
        _BAR_CONFIG['title'] = bar_title

    def _get_ticker_id(file_name):
        return int(file_name.split(sep)[-1].split('.')[0])

    # create a place holder dataframe
    expected_columns = [
        'time_stamp', 'ecode', 'session', 'high', 'low', 'close', 'volume',
        'average', 'count'
    ]
    data = pd.DataFrame(columns=expected_columns)

    # create temporary directory to store smaller CSV files
    temp_directory = '.temp'
    make_dirs(temp_directory)

    # try / finally: left-over temp files would be merged into the next run
    try:
        # extract all json files from target directory
        success_files = get_files_by_type(target_directory)
        success_tickers = [_get_ticker_id(name) for name in success_files]
        total = len(success_tickers)

        if total:
            write_to_console(f'=> Generating dataframe for success tickers...',
                             verbose=verbose)
            batch = []  # non-empty frames waiting to be written
            counter = 0  # to count temp files
            with alive_bar(total=total, **_BAR_CONFIG) as bar:
                for i, (ticker, file_name) in enumerate(
                        zip(success_tickers, success_files)):
                    ticker_data = read_json_file(file_name)
                    temp_data = pd.DataFrame(ticker_data['bar_data'])
                    temp_data['ecode'] = ticker
                    if temp_data.shape[0] > 0:
                        batch.append(temp_data)

                    _time_to_cache = ((i > 0) and
                                      (i % 100 == 0)) or (i + 1 == total)
                    if _time_to_cache and batch:
                        # DataFrame.append is gone in pandas 2.x, concat once per batch;
                        # reindex keeps every expected column, missing ones are left empty
                        merged = pd.concat(batch).reindex(
                            columns=expected_columns)
                        temp_file = join(temp_directory,
                                         f'success_{counter}.csv')
                        merged.to_csv(temp_file, index=False)
                        batch = []
                        counter += 1
                    bar()

            # merge all CSV files into a single dataframe
            temp_files = get_files_by_type(temp_directory, file_type='csv')
            if temp_files:
                data = pd.concat(map(read_csv, temp_files))
                data.sort_values(by=['ecode', 'time_stamp'],
                                 inplace=True,
                                 ignore_index=True)
                data = data[expected_columns]
    finally:
        # delete all temp files
        delete_directory(temp_directory)

    return data
def generate_extraction_metrics(target_date,
                                end_time='15:01:00',
                                input_tickers=None,
                                verbose=False):
    """
        Generates metrics about success & failure tickers.
        Metrics are saved into a new file called 'metrics.json', next to the
        'success.csv' & 'failure.csv' files they are computed from.
        Index specific metrics (N225, Topix, JASDAQ 20) stay 0.0 when none
        of the input tickers belongs to that index.
        :param target_date: date for which metrics are needed
        :param end_time: end time for which metrics are to be generated
        :param input_tickers: tickers for which metrics are to be generated,
                              read from 'input_tickers.json' when not given
        :param verbose: set to true to see the ratio details on console
        :raises NotADirectoryError: when the data directory is not present
        :raises FileNotFoundError: when 'success.csv', 'failure.csv' or (if
                                   needed) 'input_tickers.json' is missing
        :raises ValueError: when there are no input tickers
    """
    logger.info('Generating final extraction metrics')
    _date = f'{target_date[:4]}/{target_date[4:6]}/{target_date[6:]}'
    write_to_console(f'{"-"*30} Metrics Generation: {_date} {"-"*31}',
                     verbose=True)
    expected_metrics = [
        'total_tickers', 'total_extracted', 'total_extraction_ratio',
        'extraction_successful', 'extraction_failure', 'success_ratio',
        'failure_ratio', 'n_225_input_ratio', 'n_225_success_ratio',
        'n_225_failure_ratio', 'topix_input_ratio', 'topix_success_ratio',
        'topix_failure_ratio', 'jasdaq_20_input_ratio',
        'jasdaq_20_success_ratio', 'jasdaq_20_failure_ratio',
        'missing_tickers_ratio', 'missing_tickers'
    ]
    metrics = dict.fromkeys(expected_metrics, 0.0)

    target_directory = join(_HISTORICAL_DATA_STORAGE, target_date,
                            end_time.replace(':', '_'))
    if not isdir(target_directory):
        raise NotADirectoryError(
            f'Data storage directory for {target_date} not found at '
            f'{_HISTORICAL_DATA_STORAGE}')

    success_file = join(target_directory, 'success.csv')
    failure_file = join(target_directory, 'failure.csv')

    if not isfile(success_file):
        raise FileNotFoundError(f'Can not find success file: {success_file}')

    if not isfile(failure_file):
        raise FileNotFoundError(f'Can not find failure file: {failure_file}')

    input_tickers_file = join(target_directory, 'input_tickers.json')
    if input_tickers is None:
        if not isfile(input_tickers_file):
            raise FileNotFoundError(
                f'Can not find input tickers file: {input_tickers_file}')
        input_tickers = read_json_file(input_tickers_file)

    # (metric prefix & indices column, label in messages, label when nothing matches)
    index_labels = (
        ('n_225', 'N225', 'N 225'),
        ('topix', 'Topix', 'Topix'),
        ('jasdaq_20', 'JASDAQ 20', 'JASDAQ 20'),
    )
    japan_indices = get_japan_indices()
    index_tickers = {
        prefix: _index_member_tickers(japan_indices, prefix)
        for prefix, _, _ in index_labels
    }

    success = read_csv(success_file)
    failure = read_csv(failure_file)

    success_tickers = success.ecode.unique().tolist()
    failure_tickers = failure.ecode.unique().tolist()

    total_tickers = len(input_tickers)
    if total_tickers == 0:
        raise ValueError(
            f'Can not find any input tickers in file {input_tickers_file}')

    extraction_successful = len(success_tickers)
    extraction_failure = len(failure_tickers)
    total_extracted = extraction_successful + extraction_failure
    success_ratio = round(extraction_successful / total_tickers, 3)

    metrics['total_tickers'] = total_tickers
    metrics['total_extracted'] = total_extracted
    metrics['total_extraction_ratio'] = round(total_extracted / total_tickers,
                                              3)
    metrics['extraction_successful'] = extraction_successful
    metrics['extraction_failure'] = extraction_failure
    metrics['success_ratio'] = success_ratio
    metrics['failure_ratio'] = round(extraction_failure / total_tickers, 3)

    logger.debug(f'Updated over-all extraction ratio: {success_ratio}')
    write_to_console(f'Over-all Extraction: {_get_marker(success_ratio)}',
                     pointer='->',
                     indent=2,
                     verbose=True)
    write_to_console(f'Over-all Success Ratio: {success_ratio}',
                     pointer='-',
                     indent=4,
                     verbose=verbose)

    input_set = set(input_tickers)
    for prefix, label, missing_label in index_labels:
        members = index_tickers[prefix]
        index_input = input_set.intersection(members)
        if not index_input:
            logger.debug(
                f'Could not find any {missing_label} tickers in the given input'
            )
            continue

        # ratios are relative to the input tickers that belong to the index
        index_success_ratio = round(
            len(index_input.intersection(success_tickers)) / len(index_input),
            3)
        index_failure_ratio = round(
            len(index_input.intersection(failure_tickers)) / len(index_input),
            3)
        metrics[f'{prefix}_input_ratio'] = round(
            len(index_input) / len(members), 3)
        metrics[f'{prefix}_success_ratio'] = index_success_ratio
        metrics[f'{prefix}_failure_ratio'] = index_failure_ratio

        logger.debug(
            f'Updated {label} extraction ratio: {index_success_ratio}')
        write_to_console(
            f'{label} Extraction: {_get_marker(index_success_ratio)}',
            pointer='->',
            indent=2,
            verbose=True)
        write_to_console(f'{label} Success Ratio: {index_success_ratio}',
                         pointer='-',
                         indent=4,
                         verbose=verbose)

    missing_tickers = list(
        input_set.difference(success_tickers + failure_tickers))
    missing_tickers_ratio = round(len(missing_tickers) / total_tickers, 3)
    logger.debug(f'Updated missing tickers ratio: {missing_tickers_ratio}')
    metrics['missing_tickers_ratio'] = missing_tickers_ratio
    metrics['missing_tickers'] = missing_tickers

    metrics_file = join(target_directory, 'metrics.json')
    save_data_as_json(metrics, metrics_file)
    logger.debug(f'Metrics saved at: {metrics_file}')


def _index_member_tickers(japan_indices, column):
    """
        Returns the integer ticker ids listed in one column of the Japan
        indices table.
        Only values containing 'T' are used; the part before the first '.'
        of such a value is converted to the ticker id.
    """
    listed = japan_indices[column]
    # na=False: empty cells are left out instead of breaking the boolean mask
    members = listed[listed.str.contains('T', na=False)].unique().tolist()
    return [int(member.split('.')[0]) for member in members]
def generate_failure_dataframe(target_directory,
                               bar_title=None,
                               verbose=False):
    """
        Creates a pandas data frame from JSON files present at the given failure location.
        Assumes that all these JSON files have valid error stacks.
        A ticker with an empty error stack is represented by a single dummy
        row (code 'unknown', message 'not available').
        Rows are written to temporary CSV files in batches and merged back
        into a single data frame at the end; the temporary directory is
        removed even when the conversion fails.
        :param target_directory: location to read JSON files from
        :param bar_title: message to show in front of the progress bar
        :param verbose: set to true to see info messages on console
        :return: data frame with columns 'ecode', 'code' & 'message', sorted
                 by 'ecode'; empty when no JSON file was found
    """
    if bar_title is not None:
        _BAR_CONFIG['title'] = bar_title

    def _get_ticker_id(file_name):
        return int(file_name.split(sep)[-1].split('.')[0])

    # create a place holder dataframe
    expected_columns = ['ecode', 'code', 'message']
    data = pd.DataFrame(columns=expected_columns)

    # create temporary directory to store smaller CSV files
    temp_directory = '.temp'
    make_dirs(temp_directory)

    # try / finally: left-over temp files would be merged into the next run
    try:
        # extract all json files from target directory
        file_pattern = join(
            target_directory,
            '*.json')  # TODO: can be modified to match digital values
        failure_files = glob(file_pattern)
        total = len(failure_files)

        if total:
            write_to_console(f'=> Generting dataframe for failure tickers...',
                             verbose=verbose)
            batch = []  # frames waiting to be written
            counter = 0  # to count temp CSV files
            with alive_bar(total=total, **_BAR_CONFIG) as bar:
                for i, file_name in enumerate(failure_files):
                    meta = read_json_file(file_name)['meta_data']
                    error_stack = meta['_error_stack']
                    # file name is only parsed when meta data carries no ecode
                    if 'ecode' in meta:
                        ecode = meta['ecode']
                    else:
                        ecode = _get_ticker_id(file_name)

                    temp_data = pd.DataFrame(error_stack,
                                             columns=expected_columns)
                    temp_data['ecode'] = ecode
                    # if error stack is empty, then create a dummy row
                    if temp_data.shape[
                            0] == 0:  # fixme: find a way to control this in the TWS Client
                        dummy_row = {
                            'ecode': ecode,
                            'code': 'unknown',
                            'message': 'not available'
                        }
                        temp_data = pd.DataFrame([dummy_row],
                                                 columns=expected_columns)
                    batch.append(temp_data)

                    _time_to_cache = (i + 1 == total) or ((i > 0) and
                                                          (i % 100 == 0))
                    if _time_to_cache:
                        # DataFrame.append is gone in pandas 2.x, concat once per batch;
                        # every frame holds at least one row, so the batch is never empty
                        temp_file = join(temp_directory,
                                         f'failure_{counter}.csv')
                        pd.concat(batch).to_csv(temp_file, index=False)
                        batch = []
                        counter += 1
                    bar()

            # merge all CSV files into a single dataframe
            temp_files = get_files_by_type(temp_directory, file_type='csv')
            if temp_files:
                data = pd.concat(map(read_csv, temp_files))
                data.sort_values(by=['ecode'], ignore_index=True, inplace=True)
                data = data[expected_columns]
    finally:
        # delete all temp files
        delete_directory(temp_directory)

    return data
Exemple #12
0
def metrics_generator(date, bar_size, tickers):
    """
        Generate extraction metrics for daily downloaded data.
        Reads 'success.csv' & 'failure.csv' from the storage directory of
        the given date & bar size. When `tickers` is a path to an input CSV
        file, it then:
        - prints the extraction ratios computed by
          `compute_extraction_metrics` for the rows with status 'A'
        - passes the metrics to `update_metrics_sheet`
        - calls `generate_daily_extraction_status_sheet` for the storage
          directory
        When `tickers` is a list nothing is generated (not implemented yet).
        Any exception is logged as critical and not raised to the caller.

        - Parameters:
        -------------
        - date(str): date of the extraction, read as YYYYMMDD
        - bar_size(str): bar size label, spaces are removed to form the path
        - tickers(list | str): list of tickers or full path to input file
    """
    logger.info('Generating final extraction metrics')
    year, month, day = date[:4], date[4:6], date[6:]
    write_to_console(
        f'{"-"*30} Metrics Generation: {year}/{month}/{day} {"-"*31}',
        verbose=True)

    # (label shown on console, key of the ratio in the computed metrics)
    report_lines = (
        ('Over-all', 'extraction_ratio'),
        ('Topix', 'extraction_ratio_topix'),
        ('Nikkei 225', 'extraction_ratio_nikkei225'),
        ('JASDAQ 20', 'extraction_ratio_jasdaq20'),
        ('First Section', 'extraction_ratio_first_section'),
        ('Second Section', 'extraction_ratio_second_section'),
        ('Mothers', 'extraction_ratio_mothers'),
        ('JASDAQ Growth', 'extraction_ratio_jasdaq_growth'),
        ('JASDAQ Standard', 'extraction_ratio_jasdaq_standard'),
        ('Market Capital Above ¥10B', 'extraction_ratio_mcap_above_10b'),
        ('Price x 3 Month\'s Trading Volume ¥85MM',
         'extraction_ratio_pv_above_85m'),
    )

    try:
        data_location = join(HISTORICAL_DATA_STORAGE,
                             bar_size.replace(' ', ''), year,
                             MONTH_MAP[int(month)], date)
        # read success & failure files
        success = pd.read_csv(join(data_location, 'success.csv'))
        failure = pd.read_csv(join(data_location, 'failure.csv'))

        if type(tickers) is list:
            return  # TODO: simple metrics generation

        # assuming that input is a file path
        input_ = pd.read_csv(tickers)
        # filter out relevant input --> active tickers
        active_input = input_[input_.status == 'A']

        # get extraction metrics
        metrics = compute_extraction_metrics(success, failure, active_input)
        for label, key in report_lines:
            write_to_console(
                f'{label} Extraction: {_get_marker(metrics[key])}',
                pointer='->',
                indent=2,
                verbose=True)

        # generate / update metrics sheet
        update_metrics_sheet(f'{year}-{month}-{day}', metrics)

        # generate daily extraction status sheet
        generate_daily_extraction_status_sheet(success, input_, data_location,
                                               date)
    except Exception as e:
        logger.critical(f'Metrics generation failed: {e}')